Can I use local LLM as the evaluator and provider?
hijkzzz opened this issue · 6 comments
found this repo today and will be attempting as well
+1
@hijkzzz @disperaller @RahulSinghalChicago hey guys, I have some findings. But I found the score may be a bit too high. Do you have a better idea?
Based on the OpenAI provider, you can add your own provider like this (namely local_llama.py in this case):
--- ./providers/openai.py
+++ ./providers/local_llama.py
@@ -1,16 +1,19 @@
 import os
 from operator import itemgetter
 from typing import Optional
+import torch
-from openai import AsyncOpenAI
-from langchain_openai import ChatOpenAI
+from langchain_openai import ChatOpenAI
 from langchain.prompts import PromptTemplate
+
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
 import tiktoken
 from .model import ModelProvider
-class OpenAI(ModelProvider):
+class LocalLlama(ModelProvider):
     """
     A wrapper class for interacting with OpenAI's API, providing methods to encode text, generate prompts,
     evaluate models, and create LangChain runnables for language model interactions.
@@ -25,7 +28,7 @@
                                       temperature = 0)
     def __init__(self,
-                 model_name: str = "gpt-3.5-turbo-0125",
+                 model_name: str = "meta-llama/Llama-2-7b-chat-hf",
                  model_kwargs: dict = DEFAULT_MODEL_KWARGS):
         """
         Initializes the OpenAI model provider with a specific model.
@@ -37,15 +40,18 @@
         Raises:
             ValueError: If NIAH_MODEL_API_KEY is not found in the environment.
         """
-        api_key = os.getenv('NIAH_MODEL_API_KEY')
-        if (not api_key):
-            raise ValueError("NIAH_MODEL_API_KEY must be in env.")
-
-        self.model_name = model_name
+        self.model_or_path = model_name
+        self.model_name = model_name.split("/")[-1]
         self.model_kwargs = model_kwargs
-        self.api_key = api_key
-        self.model = AsyncOpenAI(api_key=self.api_key)
-        self.tokenizer = tiktoken.encoding_for_model(self.model_name)
+
+        self.model = AutoModelForCausalLM.from_pretrained(
+            self.model_or_path,
+            device_map="auto",
+            torch_dtype=torch.bfloat16,
+            attn_implementation="flash_attention_2",
+        )
+
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_or_path)
     async def evaluate_model(self, prompt: str) -> str:
         """
@@ -57,12 +63,18 @@
         Returns:
             str: The content of the model's response to the prompt.
         """
-        response = await self.model.chat.completions.create(
-            model=self.model_name,
-            messages=prompt,
-            **self.model_kwargs
-        )
-        return response.choices[0].message.content
+        MAX_GEN_LENGTH = 128
+        tokenized_prompts = self.tokenizer(prompt, return_tensors="pt")
+        input_ids = tokenized_prompts.input_ids.cuda()
+
+        generation_output = self.model.generate(
+            input_ids,
+            max_new_tokens=MAX_GEN_LENGTH,
+            use_cache=True,
+            return_dict_in_generate=True)
+
+        output = self.tokenizer.decode(generation_output.sequences[:, input_ids.shape[1]:][0], skip_special_tokens=True)
+        return output
     def generate_prompt(self, context: str, retrieval_question: str) -> str | list[dict[str, str]]:
         """
@@ -75,19 +87,16 @@
         Returns:
             list[dict[str, str]]: A list of dictionaries representing the structured prompt, including roles and content for system and user messages.
         """
-        return [{
-                "role": "system",
-                "content": "You are a helpful AI bot that answers questions for a user. Keep your response short and direct"
-            },
-            {
-                "role": "user",
-                "content": context
-            },
-            {
-                "role": "user",
-                "content": f"{retrieval_question} Don't give information outside the document or repeat your findings"
-            }]
-
+        return f"""
+[INST] <<SYS>>
+You are a helpful AI bot that answers questions for a user. Keep your response short and direct
+<</SYS>>
+{context}
+
+{retrieval_question} Don't give information outside the document or repeat your findings
+[/INST]
+"""
+
     def encode_text_to_tokens(self, text: str) -> list[int]:
         """
         Encodes a given text string to a sequence of tokens using the model's tokenizer.
Add an entry in run.py like this:
diff --git a/needlehaystack/run.py b/needlehaystack/run.py
index 8edbccb..f5b6783 100644
--- a/needlehaystack/run.py
+++ b/needlehaystack/run.py
@@ -6,7 +6,7 @@ from jsonargparse import CLI
 from . import LLMNeedleHaystackTester, LLMMultiNeedleHaystackTester
 from .evaluators import Evaluator, LangSmithEvaluator, OpenAIEvaluator
-from .providers import Anthropic, ModelProvider, OpenAI, Cohere
+from .providers import Anthropic, ModelProvider, OpenAI, Cohere, LocalLlama
 load_dotenv()
@@ -65,6 +65,8 @@ def get_model_to_test(args: CommandArgs) -> ModelProvider:
             return Anthropic(model_name=args.model_name)
         case "cohere":
             return Cohere(model_name=args.model_name)
+        case "local":
+            return LocalLlama(model_name=args.model_name)
         case _:
             raise ValueError(f"Invalid provider: {args.provider}")
What about the evaluator.py? :)
@AnaRhisT94 Use the OpenAI evaluator.
But I recently found an offline version of NeedleInAHaystack, https://github.com/66RING/LLMTest_NeedleInAHaystack-Local, which is based on this repo.
Thanks, I already found this local repo in your profile and am looking at it.
I'm not sure I can use the OpenAIEvaluator class.
I need to change the self.evaluator = ChatOpenAI.. to something else, but I'm still not sure what.
Anyways, I'm going to test the local version.
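One untested option, if the judge should be local too: keep OpenAIEvaluator's logic and only repoint its ChatOpenAI instance at an OpenAI-compatible local server (for example one started with vLLM or llama.cpp's server). This is only a sketch; the helper name, endpoint URL, port, and model name below are placeholders, not values from this repo.

from langchain_openai import ChatOpenAI

def build_local_evaluator_model() -> ChatOpenAI:
    """Hypothetical drop-in replacement for the ChatOpenAI built in OpenAIEvaluator.__init__."""
    return ChatOpenAI(
        model="meta-llama/Llama-2-7b-chat-hf",  # whatever model the local server exposes
        base_url="http://localhost:8000/v1",    # OpenAI-compatible endpoint, placeholder
        api_key="not-needed",                   # most local servers ignore the key
        temperature=0,
    )

The grading prompt and scoring logic stay exactly as in OpenAIEvaluator; whether a smaller local judge scores as strictly as a hosted model is worth checking separately, since a lenient judge would also push scores up.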