Evaluation datasets needed
Mr-lonely0 commented
Thank you for your great work!
While running the evaluation, I encountered a "datasets not found" issue. Could you please let me know how I can obtain the evaluation datasets?
I appreciate your assistance and look forward to your response.
Here is my eval.sh:
#!/bin/bash -l
#SBATCH -J eval_llama
#SBATCH -N 1 -n 1
#SBATCH --output=slurm/%x-%j.out
#SBATCH --gres=gpu:a100:1 --constraint gpu80
#SBATCH --cpus-per-task=32
#SBATCH --mem=128G
#SBATCH -t 0-3:00:00
# nvidia-smi
# conda activate auto
base_model=${BASE:-"opt-125m"}
# run_name=${NAME:-"ac_Llama-2-7b-hf_sub2_seg2_sum50_lr4e-4_bsz32_rand_accu/checkpoint-65000"} # use llama-2-7b-hf for base model
run_name=${NAME:-"ac_opt-125m_sub2_seg2_sum50_lr2e-5_bsz16_rand_accu/checkpoint-1000"} # use llama-2-7b-hf for base model
block_size=${BLOCK:-8192}
total=${BATCH:-16} # total batch size
bs=${SEQ:-1} # batch size per device
lr=${LR:-2e-5}
warmup_steps=${WU:-1000}
save_steps=${SAVE:-1000}
num_gpus=${NUM_GPUS:-1}
segments_per_substep=${SEG:-2}
training_substeps=${SUB:-2}
summary_length=${SUM:-50}
summary_accumulation=${ACC:-true}
randomize_substeps=${RAND:-true}
segment_lengths=${SEGLEN:-"2048 2048"}
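# With the defaults above this presumably splits each block into
# 2 substeps x (2048 + 2048) tokens = 8192 tokens, matching block_size=8192 set above.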
mode=${MODE:-CP} #CP for compression, FA for full-attention
rope_theta=${ROPE:-10000}
segment_gradient_checkpointing=${CHECK:-false}
num_train_epochs=1
max_eval_samples=${MAXEVAL:-500}
num_nodes=${NUM_NODES:-1}
node=${NODE:-"localhost"}
################################
# num_gpus=$(jq -n "[$CUDA_VISIBLE_DEVICES] | length")
num_gpus=1
total_per_device=$((${total}/${num_gpus}))
accu=$(( ${total_per_device} / ${bs} ))
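# Example with the defaults (total=16, num_gpus=1, bs=1):
#   total_per_device = 16 / 1 = 16
#   accu             = 16 / 1 = 16 gradient-accumulation steps,
# so one optimizer step still covers 16 sequences in total.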
run_name_suffix="sub${training_substeps}_seg${segments_per_substep}_sum${summary_length}_lr${lr}_bsz${total}"
if [[ ${randomize_substeps} == true ]]; then
    run_name_suffix+="_rand"
fi
if [[ ${summary_accumulation} == true ]]; then
    run_name_suffix+="_accu"
fi
if [[ ${segment_gradient_checkpointing} == true ]]; then
    run_name_suffix+="_check"  # append to the suffix so the flag survives the run_name assignment below
fi
run_name="ac_${base_model}_${run_name_suffix}"
echo "Run: ${run_name}"
cache_dir=./.cache
out_dir=checkpoints/$run_name
mkdir -p $out_dir
wandb disabled
export OMP_NUM_THREADS=8
header="torchrun --standalone \
--nnodes=1 \
--nproc_per_node=$num_gpus \
train.py "
model_url="/data3/lk/llm/model/opt-125m"
arguments=(
--report_to wandb
--config_name $model_url
--tokenizer_name $model_url
--model_name_or_path $model_url
--gradient_accumulation_steps $accu
--per_device_eval_batch_size $bs
--per_device_train_batch_size $bs
--learning_rate $lr
--warmup_steps $warmup_steps
--do_eval
--max_eval_samples $max_eval_samples
--logging_steps 1
--save_steps $save_steps
--preprocessing_num_workers 6
--dataloader_num_workers 6
--cache_dir $cache_dir
--add_special_tokens false
--num_train_epochs ${num_train_epochs}
--disable_tqdm true
--resume_from_checkpoint true
--log_level info
--output_dir $out_dir
--use_fast_tokenizer false
--summary_length $summary_length
--accumulate_summary $summary_accumulation
--remove_unused_columns false
--segments_per_substep $segments_per_substep
--training_substeps $training_substeps
--randomize_substeps $randomize_substeps
--segment_lengths $segment_lengths
--segment_gradient_checkpointing $segment_gradient_checkpointing
--bf16
--run_name $run_name
# --rope_theta ${rope_theta}
$@
)
echo "Evaluating on ${block_size} token sequences"
data="preprocessed_redpajama-weighted-disjoint_${block_size}"
arguments+=(--preprocessed_validation_datasets \
${data}/arxiv \
${data}/book \
${data}/c4 \
${data}/github \
${data}/stack_exchange \
${data}/wiki \
${data}/cc/2019-30-head-en \
${data}/cc/2019-30-middle-en \
${data}/cc/2020-05-head-en \
${data}/cc/2020-05-middle-en \
${data}/cc/2021-04-head-en \
${data}/cc/2021-04-middle-en \
${data}/cc/2022-05-head-en \
${data}/cc/2022-05-middle-en \
${data}/cc/2023-06-head-en \
${data}/cc/2023-06-middle-en \
)
if [[ $run_name == ac_Llama* ]]; then # note: run_name has no "checkpoints/" prefix, so match the name directly
arguments+=(
--lora
--lora_path $out_dir # trained LoRA checkpoint directory; adjust to a specific checkpoint-* subdirectory if needed
--lora_r 16
--lora_alpha 16
--lora_dropout 0.05
--lora_target_modules q_proj v_proj o_proj k_proj
--lora_modules_to_save embed_summary
)
fi
#################
echo "Training ${base_model} with lr ${lr} on ${dataset}"
echo Outputting to $out_dir
echo "command: $header ${arguments[@]}"
$header ${arguments[@]} 2>&1 | tee -a $out_dir/log-resume.out
And here are the outputs:
FileNotFoundError: Couldn't find a dataset script at /data3/lk/AutoCompressors/preprocessed_redpajama-weighted-disjoint_8192/arxiv/arxiv.py or any data file in the same directory. Couldn't find 'preprocessed_redpajama-weighted-disjoint_8192/arxiv' on the Hugging Face Hub either: FileNotFoundError: Dataset 'preprocessed_redpajama-weighted-disjoint_8192/arxiv' doesn't exist on the Hub
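From the traceback, train.py seems to resolve each entry of --preprocessed_validation_datasets with the Hugging Face datasets library, first as a local directory and then as a dataset on the Hub, so the preprocessed RedPajama splits apparently need to exist on disk before evaluation. In case it clarifies what I am looking for, here is a minimal sketch of how I imagine one such directory (the arxiv split) could be produced, assuming blocks of block_size token ids in an input_ids column saved with save_to_disk; the source file, column names, and on-disk format are my guesses, so please correct me if the expected preprocessing is different or if the datasets can simply be downloaded from somewhere:

# Hypothetical sketch, not the repository's actual preprocessing pipeline.
from datasets import load_dataset
from transformers import AutoTokenizer

block_size = 8192
tokenizer = AutoTokenizer.from_pretrained("/data3/lk/llm/model/opt-125m", use_fast=False)

# Raw text for one RedPajama domain; a .jsonl file with a "text" field is assumed.
raw = load_dataset("json", data_files="arxiv_sample.jsonl", split="train")

def tokenize_and_chunk(batch):
    # Concatenate all token ids in the batch, then cut them into block_size pieces.
    ids = sum(tokenizer(batch["text"])["input_ids"], [])
    blocks = [ids[i:i + block_size] for i in range(0, len(ids), block_size)]
    blocks = [b for b in blocks if len(b) == block_size]  # drop the ragged tail
    return {"input_ids": blocks}

chunked = raw.map(tokenize_and_chunk, batched=True, remove_columns=raw.column_names)

# Save under the path eval.sh constructs for this domain.
chunked.save_to_disk("preprocessed_redpajama-weighted-disjoint_8192/arxiv")

# Note: if train.py reads these paths with datasets.load_dataset() rather than
# load_from_disk(), plain data files (e.g. .jsonl or .arrow) in the directory
# may be needed instead of a save_to_disk() layout.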