gkamradt/LLMTest_NeedleInAHaystack

Can I use local LLM as the evaluator and provider?

hijkzzz opened this issue · 6 comments

Can I use local LLM as the evaluator and provider?

found this repo today and will be attempting as well

@hijkzzz @disperaller @RahulSinghalChicago hey guys, I have some findings, but I found the score may be a bit too high. Do you have a better idea?

Based on the OpenAI provider, you can add your own provider like this (namely local_llama.py in this case):

--- ./providers/openai.py
+++ ./providers/local_llama.py
@@ -1,16 +1,19 @@
 import os
 from operator import itemgetter
 from typing import Optional
+import torch
 
-from openai import AsyncOpenAI
-from langchain_openai import ChatOpenAI  
+from langchain_openai import ChatOpenAI
 from langchain.prompts import PromptTemplate
+
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
 import tiktoken
 
 from .model import ModelProvider
 
 
-class OpenAI(ModelProvider):
+class LocalLlama(ModelProvider):
     """
     A wrapper class for interacting with OpenAI's API, providing methods to encode text, generate prompts,
     evaluate models, and create LangChain runnables for language model interactions.
@@ -25,7 +28,7 @@
                                       temperature = 0)
 
     def __init__(self,
-                 model_name: str = "gpt-3.5-turbo-0125",
+                 model_name: str = "meta/llama-2-7b-chat-hf",
                  model_kwargs: dict = DEFAULT_MODEL_KWARGS):
         """
         Initializes the OpenAI model provider with a specific model.
@@ -37,15 +40,18 @@
         Raises:
             ValueError: If NIAH_MODEL_API_KEY is not found in the environment.
         """
-        api_key = os.getenv('NIAH_MODEL_API_KEY')
-        if (not api_key):
-            raise ValueError("NIAH_MODEL_API_KEY must be in env.")
-
-        self.model_name = model_name
+        self.model_or_path = model_name
+        self.model_name = model_name.split("/")[-1]
         self.model_kwargs = model_kwargs
-        self.api_key = api_key
-        self.model = AsyncOpenAI(api_key=self.api_key)
-        self.tokenizer = tiktoken.encoding_for_model(self.model_name)
+
+        self.model = AutoModelForCausalLM.from_pretrained(
+                self.model_or_path,
+                device_map="auto",
+                torch_dtype=torch.bfloat16,
+                attn_implementation="flash_attention_2",
+            )
+
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_or_path)
     
     async def evaluate_model(self, prompt: str) -> str:
         """
@@ -57,12 +63,18 @@
         Returns:
             str: The content of the model's response to the prompt.
         """
-        response = await self.model.chat.completions.create(
-                model=self.model_name,
-                messages=prompt,
-                **self.model_kwargs
-            )
-        return response.choices[0].message.content
+        MAX_GEN_LENGTH = 128
+        tokenized_prompts = self.tokenizer(prompt, return_tensors="pt")
+        input_ids = tokenized_prompts.input_ids.cuda()
+
+        generation_output = self.model.generate(
+            input_ids,
+            max_new_tokens=MAX_GEN_LENGTH,
+            use_cache=True,
+            return_dict_in_generate=True)
+
+        output = self.tokenizer.decode(generation_output.sequences[:,input_ids.shape[1]:][0])
+        return output
     
     def generate_prompt(self, context: str, retrieval_question: str) -> str | list[dict[str, str]]:
         """
@@ -75,19 +87,16 @@
         Returns:
             list[dict[str, str]]: A list of dictionaries representing the structured prompt, including roles and content for system and user messages.
         """
-        return [{
-                "role": "system",
-                "content": "You are a helpful AI bot that answers questions for a user. Keep your response short and direct"
-            },
-            {
-                "role": "user",
-                "content": context
-            },
-            {
-                "role": "user",
-                "content": f"{retrieval_question} Don't give information outside the document or repeat your findings"
-            }]
-    
+        return f"""
+<s>[INST] <<SYS>>
+You are a helpful AI bot that answers questions for a user. Keep your response short and direct
+<</SYS>>
+{ context }
+
+{retrieval_question} Don't give information outside the document or repeat your findings
+[/INST]</s>
+"""
+
     def encode_text_to_tokens(self, text: str) -> list[int]:
         """
         Encodes a given text string to a sequence of tokens using the model's tokenizer.

Add an entry in run.py like this:

diff --git a/needlehaystack/run.py b/needlehaystack/run.py
index 8edbccb..f5b6783 100644
--- a/needlehaystack/run.py
+++ b/needlehaystack/run.py
@@ -6,7 +6,7 @@ from jsonargparse import CLI
 
 from . import LLMNeedleHaystackTester, LLMMultiNeedleHaystackTester
 from .evaluators import Evaluator, LangSmithEvaluator, OpenAIEvaluator
-from .providers import Anthropic, ModelProvider, OpenAI, Cohere
+from .providers import Anthropic, ModelProvider, OpenAI, Cohere, LocalLlama
 
 load_dotenv()
 
@@ -65,6 +65,8 @@ def get_model_to_test(args: CommandArgs) -> ModelProvider:
             return Anthropic(model_name=args.model_name)
         case "cohere":
             return Cohere(model_name=args.model_name)
+        case "local":
+            return LocalLlama(model_name=args.model_name)
         case _:
             raise ValueError(f"Invalid provider: {args.provider}")

@hijkzzz @disperaller @RahulSinghalChicago hey guys, I have some findings, but I found the score may be a bit too high. Do you have a better idea?

Based on the OpenAI provider, you can add your own provider like this (namely local_llama.py in this case):

--- ./providers/openai.py
+++ ./providers/local_llama.py
@@ -1,16 +1,19 @@
 import os
 from operator import itemgetter
 from typing import Optional
+import torch
 
-from openai import AsyncOpenAI
-from langchain_openai import ChatOpenAI  
+from langchain_openai import ChatOpenAI
 from langchain.prompts import PromptTemplate
+
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
 import tiktoken
 
 from .model import ModelProvider
 
 
-class OpenAI(ModelProvider):
+class LocalLlama(ModelProvider):
     """
     A wrapper class for interacting with OpenAI's API, providing methods to encode text, generate prompts,
     evaluate models, and create LangChain runnables for language model interactions.
@@ -25,7 +28,7 @@
                                       temperature = 0)
 
     def __init__(self,
-                 model_name: str = "gpt-3.5-turbo-0125",
+                 model_name: str = "meta/llama-2-7b-chat-hf",
                  model_kwargs: dict = DEFAULT_MODEL_KWARGS):
         """
         Initializes the OpenAI model provider with a specific model.
@@ -37,15 +40,18 @@
         Raises:
             ValueError: If NIAH_MODEL_API_KEY is not found in the environment.
         """
-        api_key = os.getenv('NIAH_MODEL_API_KEY')
-        if (not api_key):
-            raise ValueError("NIAH_MODEL_API_KEY must be in env.")
-
-        self.model_name = model_name
+        self.model_or_path = model_name
+        self.model_name = model_name.split("/")[-1]
         self.model_kwargs = model_kwargs
-        self.api_key = api_key
-        self.model = AsyncOpenAI(api_key=self.api_key)
-        self.tokenizer = tiktoken.encoding_for_model(self.model_name)
+
+        self.model = AutoModelForCausalLM.from_pretrained(
+                self.model_or_path,
+                device_map="auto",
+                torch_dtype=torch.bfloat16,
+                attn_implementation="flash_attention_2",
+            )
+
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_or_path)
     
     async def evaluate_model(self, prompt: str) -> str:
         """
@@ -57,12 +63,18 @@
         Returns:
             str: The content of the model's response to the prompt.
         """
-        response = await self.model.chat.completions.create(
-                model=self.model_name,
-                messages=prompt,
-                **self.model_kwargs
-            )
-        return response.choices[0].message.content
+        MAX_GEN_LENGTH = 128
+        tokenized_prompts = self.tokenizer(prompt, return_tensors="pt")
+        input_ids = tokenized_prompts.input_ids.cuda()
+
+        generation_output = self.model.generate(
+            input_ids,
+            max_new_tokens=MAX_GEN_LENGTH,
+            use_cache=True,
+            return_dict_in_generate=True)
+
+        output = self.tokenizer.decode(generation_output.sequences[:,input_ids.shape[1]:][0])
+        return output
     
     def generate_prompt(self, context: str, retrieval_question: str) -> str | list[dict[str, str]]:
         """
@@ -75,19 +87,16 @@
         Returns:
             list[dict[str, str]]: A list of dictionaries representing the structured prompt, including roles and content for system and user messages.
         """
-        return [{
-                "role": "system",
-                "content": "You are a helpful AI bot that answers questions for a user. Keep your response short and direct"
-            },
-            {
-                "role": "user",
-                "content": context
-            },
-            {
-                "role": "user",
-                "content": f"{retrieval_question} Don't give information outside the document or repeat your findings"
-            }]
-    
+        return f"""
+<s>[INST] <<SYS>>
+You are a helpful AI bot that answers questions for a user. Keep your response short and direct
+<</SYS>>
+{ context }
+
+{retrieval_question} Don't give information outside the document or repeat your findings
+[/INST]</s>
+"""
+
     def encode_text_to_tokens(self, text: str) -> list[int]:
         """
         Encodes a given text string to a sequence of tokens using the model's tokenizer.

Add an entry in run.py like this:

diff --git a/needlehaystack/run.py b/needlehaystack/run.py
index 8edbccb..f5b6783 100644
--- a/needlehaystack/run.py
+++ b/needlehaystack/run.py
@@ -6,7 +6,7 @@ from jsonargparse import CLI
 
 from . import LLMNeedleHaystackTester, LLMMultiNeedleHaystackTester
 from .evaluators import Evaluator, LangSmithEvaluator, OpenAIEvaluator
-from .providers import Anthropic, ModelProvider, OpenAI, Cohere
+from .providers import Anthropic, ModelProvider, OpenAI, Cohere, LocalLlama
 
 load_dotenv()
 
@@ -65,6 +65,8 @@ def get_model_to_test(args: CommandArgs) -> ModelProvider:
             return Anthropic(model_name=args.model_name)
         case "cohere":
             return Cohere(model_name=args.model_name)
+        case "local":
+            return LocalLlama(model_name=args.model_name)
         case _:
             raise ValueError(f"Invalid provider: {args.provider}")

What about the evaluator.py? :)

@AnaRhisT94 Use the OpenAI evaluator.

But I recently found an offline version of NeedleInAHaystack, https://github.com/66RING/LLMTest_NeedleInAHaystack-Local, which is based on this repo.

@AnaRhisT94 Use the OpenAI evaluator.

But I recently found an offline version of NeedleInAHaystack, https://github.com/66RING/LLMTest_NeedleInAHaystack-Local, which is based on this repo.

Thanks, I already found this local repo in your profile and am looking at it.
I'm not sure I can use the OpenAIEvaluator class.
I need to change the self.evaluator = ChatOpenAI.. to something else, still not sure to what.

Anyways, I'm going to test the local version.