pytorch/ao

Update Eval scripts

Opened this issue · 1 comment

Summary

Currently we have two "eval" scripts for measuring the quality (accuracy/perplexity) of LLMs after quantization:
https://github.com/pytorch/ao/blob/main/torchao/_models/llama/eval.py
https://github.com/pytorch/ao/blob/main/scripts/hf_eval.py

The default task we have is wikitext. We should create a "large" eval option that evaluates on more tasks.

A good "medium" task list would be to run the MMLU tasks without gsm8k, which is quite slow.

@mobicham provided a good list of tasks we should add:

import copy
import lm_eval
import numpy as np

# `model` and `tokenizer` are assumed to be an already-loaded Hugging Face
# model/tokenizer pair (e.g. from AutoModelForCausalLM / AutoTokenizer)
model.eval()
model.config.use_cache = False

# older lm_eval versions require explicit task initialization; newer ones removed this API
try:
    lm_eval.tasks.initialize_tasks()
except Exception:
    pass

model_eval = lm_eval.models.huggingface.HFLM(pretrained=model, tokenizer=tokenizer)
eval_batch_size = 1  # increase (e.g. to 8) if memory allows

results = {}
############################################
# (task, num_fewshot) pairs; the few-shot counts match the Hugging Face Open LLM Leaderboard settings
for tag, fewshot in [
    ("truthfulqa_mc2", 0),
    ("winogrande", 5),
    ("arc_challenge", 25),
    ("hellaswag", 10),
    ("gsm8k", 5),  # slowest of the set; skip it for a "medium" run
]:
    results[tag] = lm_eval.evaluator.simple_evaluate(
        model_eval, tasks=[tag], num_fewshot=fewshot, batch_size=eval_batch_size
    )["results"]
    print(tag, results[tag])
#############################################
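
Each entry in results is itself a dict keyed by task name, so pulling out a headline number looks roughly like the line below; note that the metric keys ("acc,none", "acc_norm,none", etc.) vary per task and lm_eval version, so treat the exact key as an example rather than a guarantee.

# e.g. winogrande accuracy; hellaswag/arc_challenge also report "acc_norm,none"
print("winogrande acc:", results["winogrande"]["winogrande"]["acc,none"])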

We could also add "MMLU":

# keep a copy of the non-MMLU results before adding the MMLU run
results_1 = copy.deepcopy(results)

# MMLU (5-shot); lm_eval reports the 57 subtasks individually
results_mmlu = {}
for tag, fewshot in [("mmlu", 5)]:
    results_mmlu[tag] = lm_eval.evaluator.simple_evaluate(
        model_eval, tasks=[tag], num_fewshot=fewshot, batch_size=eval_batch_size
    )["results"]
    print(tag, results_mmlu[tag])

# the 57 MMLU subtask names, written with the old "hendrycksTest-" harness prefix,
# which is stripped below to match the current lm_eval subtask naming
mmlu_list    = "hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions"
mmlu_list    = [l.replace('hendrycksTest-', '') for l in mmlu_list.split(',')]
results_mmlu = results_mmlu['mmlu']

# average the per-subtask accuracies into a single MMLU score
k = []
for r in results_mmlu:
    if np.any([(l in r) for l in mmlu_list]):
        k.append(results_mmlu[r]['acc,none'])

assert len(k) == 57
print('MMLU avg acc', np.mean(k))

results['mmlu'] = np.mean(k)
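
With everything collected in results, dumping it to disk makes it easy to diff a quantized run against a baseline run (a minimal sketch; the file name is arbitrary):

import json

# default=str guards against any non-JSON-serializable values in the raw lm_eval output
with open("eval_results.json", "w") as f:
    json.dump(results, f, indent=2, default=str)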

I saw that this file introduces a TransformerEvalWrapper, which seems specific to the _models/llama model. Which of the two scripts would you prefer to keep? Will it also support models from the Hugging Face Transformers library?