# Evaluation run #1: MCTS dialogue evaluation scored with Llama
# (--evaluation_mode=None falls back to the llama evaluator).
# Flags are gathered in an array and expanded as "${run_args[@]}" so each
# argument stays a single word regardless of future edits.
run_args=(
  --evaluation_data=evaluation_starters_love.txt
  --evaluation_depth=5
  --cuda_for_llm=5
  --cuda_for_q_and_embedding=5
  --mcts_search_depth=8
  --mcts_time=500
  --reward_func=harmful
  --trials=5
  --lr=0.0001
  --pretrained_q_function=trained_q_function_daily_dialogueFULL
  --embedding=llama
  --agent=semantic_online
  --result_file=output
  --evaluation_mode=None
)
python3 -u evaluation/run_evaluation.py "${run_args[@]}"
# Evaluation run #2: same pipeline, but scored with ChatGPT (--evaluation_mode=gpt).
#
# BUG FIX: the original placed "### ..." comments *after* the backslash line
# continuations. A backslash followed by a space escapes the space, not the
# newline, so the command was cut short at the first commented line and every
# following "--flag \" line ran as its own (failing) command. Collecting the
# flags in a Bash array lets each option keep its explanatory comment without
# breaking the invocation.
gpt_eval_args=(
  --evaluation_data=evaluation_starters_love.txt                # conversation-starter file
  --evaluation_depth=5                                          # evaluation decision steps: the LLM makes 5
                                                                # decisions, i.e. a 10-turn conversation
  --cuda_for_llm=5                                              # CUDA device for create_human_and_llm() and
                                                                # Llama_2_Guard_Reward()
  --cuda_for_q_and_embedding=5                                  # CUDA device for the Q function and
                                                                # embedding_model_llama()
                                                                # NOTE(review): embedding_model_llama() reportedly
                                                                # also loads Llama Guard, so the model may be
                                                                # initialized twice, wasting GPU memory — confirm.
  --mcts_search_depth=8                                         # conversation turns searched during MCTS; counts
                                                                # human turns too, so the LLM makes 3-4 decisions
  --mcts_time=500                                               # seconds of MCTS to run
  --reward_func=harmful                                         # either "harmful" or "length"
  --trials=5                                                    # number of repeated trials
  --lr=0.00001                                                  # Q-function learning rate; 0.00001 worked for
                                                                # "length", a good value for "harmful" is unknown
  --pretrained_q_function=trained_q_function_daily_dialogueFULL # offline-trained Q-function init; only used when
                                                                # agent is "offline_online_mixed"
  --embedding=llama                                             # embedding model; use llama for now
  --agent=semantic_online                                       # greedy | random | pure_online (vanilla MCTS) |
                                                                # offline_online_mixed | semantic_online (ours);
                                                                # some earlier args are ignored for some agents
  --result_file=output                                          # CSV created in this directory when finished
  --evaluation_mode=gpt                                         # optional: "gpt" uses ChatGPT for evaluation;
                                                                # "None" defaults to llama
)
python3 -u evaluation/run_evaluation.py "${gpt_eval_args[@]}"