princeton-nlp/AutoCompressors

Evaluation datasets needed


Thank you for your great work!

While running the evaluation, I encountered a "datasets not found" issue. Could you please let me know how I can obtain the evaluation datasets?

I appreciate your assistance and look forward to your response.

Here is my eval.sh:

#!/bin/bash -l
#SBATCH -J eval_llama
#SBATCH -N 1 -n 1
#SBATCH --output=slurm/%x-%j.out
#SBATCH --gres=gpu:a100:1 --constraint gpu80
#SBATCH --cpus-per-task=32
#SBATCH --mem=128G
#SBATCH -t 0-3:00:00




# nvidia-smi
# conda activate auto
base_model=${BASE:-"opt-125m"}
# run_name=${NAME:-"ac_Llama-2-7b-hf_sub2_seg2_sum50_lr4e-4_bsz32_rand_accu/checkpoint-65000"}    # use llama-2-7b-hf for base model
run_name=${NAME:-"ac_opt-125m_sub2_seg2_sum50_lr2e-5_bsz16_rand_accu/checkpoint-1000"}    # use llama-2-7b-hf for base model
block_size=${BLOCK:-8192}

total=${BATCH:-16}      # total batch size
bs=${SEQ:-1}            # batch size per device
lr=${LR:-2e-5}
warmup_steps=${WU:-1000}
save_steps=${SAVE:-1000}
num_gpus=${NUM_GPUS:-1}
segments_per_substep=${SEG:-2}
training_substeps=${SUB:-2}
summary_length=${SUM:-50}
summary_accumulation=${ACC:-true}
randomize_substeps=${RAND:-true}
segment_lengths=${SEGLEN:-"2048 2048"}
mode=${MODE:-CP}        #CP for compression, FA for full-attention
rope_theta=${ROPE:-10000}
segment_gradient_checkpointing=${CHECK:-false}
num_train_epochs=1

max_eval_samples=${MAXEVAL:-500}
num_nodes=${NUM_NODES:-1}
node=${NODE:-"localhost"}
################################

# num_gpus=$(jq -n "[$CUDA_VISIBLE_DEVICES] | length")
num_gpus=1
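# derive the per-device batch size and gradient-accumulation steps from the total batch size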
total_per_device=$((${total}/${num_gpus}))
accu=$(( ${total_per_device} / ${bs} ))

run_name_suffix="sub${training_substeps}_seg${segments_per_substep}_sum${summary_length}_lr${lr}_bsz${total}"
if [[ ${randomize_substeps} == true ]]; then
    run_name_suffix+="_rand"
fi
if [[ $summary_accumulation == true ]]; then
    run_name_suffix+="_accu"
fi
if [[ ${segment_gradient_checkpointing} == true ]]; then
    run_name_suffix+="_check"
fi
run_name="ac_${base_model}_${run_name_suffix}"

echo "Run: ${run_name}"

cache_dir=./.cache
out_dir=checkpoints/$run_name
mkdir -p $out_dir

wandb disabled

export OMP_NUM_THREADS=8
header="torchrun --standalone \
--nnodes=1 \
--nproc_per_node=$num_gpus \
train.py "


model_url="/data3/lk/llm/model/opt-125m"
arguments=(
    --report_to wandb
    --config_name $model_url
    --tokenizer_name $model_url
    --model_name_or_path $model_url
    --gradient_accumulation_steps $accu
    --per_device_eval_batch_size $bs
    --per_device_train_batch_size $bs
    --learning_rate $lr
    --warmup_steps $warmup_steps
    --do_eval
    --max_eval_samples $max_eval_samples
    --logging_steps 1
    --save_steps $save_steps
    --preprocessing_num_workers 6
    --dataloader_num_workers 6
    --cache_dir $cache_dir
    --add_special_tokens false
    --num_train_epochs ${num_train_epochs}
    --disable_tqdm true
    --resume_from_checkpoint true
    --log_level info
    --output_dir $out_dir
    --use_fast_tokenizer false
    --summary_length $summary_length
    --accumulate_summary $summary_accumulation
    --remove_unused_columns false
    --segments_per_substep $segments_per_substep
    --training_substeps $training_substeps
    --randomize_substeps $randomize_substeps
    --segment_lengths $segment_lengths
    --segment_gradient_checkpointing $segment_gradient_checkpointing
    --bf16
    --run_name $run_name
    # --rope_theta ${rope_theta}
    "$@"
)

echo "Evaluating on ${block_size} token sequences"
data="preprocessed_redpajama-weighted-disjoint_${block_size}"
arguments+=(--preprocessed_validation_datasets \
                    ${data}/arxiv \
                    ${data}/book \
                    ${data}/c4 \
                    ${data}/github \
                    ${data}/stack_exchange \
                    ${data}/wiki \
                    ${data}/cc/2019-30-head-en \
                    ${data}/cc/2019-30-middle-en \
                    ${data}/cc/2020-05-head-en \
                    ${data}/cc/2020-05-middle-en \
                    ${data}/cc/2021-04-head-en \
                    ${data}/cc/2021-04-middle-en \
                    ${data}/cc/2022-05-head-en \
                    ${data}/cc/2022-05-middle-en \
                    ${data}/cc/2023-06-head-en \
                    ${data}/cc/2023-06-middle-en \
                    )
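# NOTE: none of these preprocessed_* directories exist in my checkout, which is where the
# FileNotFoundError below comes from.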

if [[ $run_name == ac_Llama* ]]; then
    arguments+=(
    --lora
    --lora_path checkpoints/$run_name
    --lora_r 16
    --lora_alpha 16
    --lora_dropout 0.05
    --lora_target_modules q_proj v_proj o_proj k_proj
    --lora_modules_to_save embed_summary
    )
fi

#################

echo "Training ${base_model} with lr ${lr} on ${dataset}"
echo Outputting to $out_dir

echo "command: $header ${arguments[@]}"
$header ${arguments[@]} 2>&1 | tee -a $out_dir/log-resume.out

And here are the outputs:

FileNotFoundError: Couldn't find a dataset script at /data3/lk/AutoCompressors/preprocessed_redpajama-weighted-disjoint_8192/arxiv/arxiv.py or any data file in the same directory. Couldn't find 'preprocessed_redpajama-weighted-disjoint_8192/arxiv' on the Hugging Face Hub either: FileNotFoundError: Dataset 'preprocessed_redpajama-weighted-disjoint_8192/arxiv' doesn't exist on the Hub
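
In case it helps pinpoint what I'm missing: the traceback suggests that train.py resolves each --preprocessed_validation_datasets path with datasets.load_dataset(), so my guess is that each listed directory (e.g. preprocessed_redpajama-weighted-disjoint_8192/arxiv) should contain data files of pre-tokenized, fixed-length blocks. Below is a minimal sketch of how I imagine such a directory could be built; the "input_ids" column name, the parquet format, and the file raw_arxiv_sample.jsonl are my assumptions, not something I found in the repo.

# A guess at producing one of the expected directories; not the authors' preprocessing script.
# Assumptions: train.py calls datasets.load_dataset(path) on each directory and expects
# fixed-length "input_ids" blocks of block_size tokens.
import os
from datasets import load_dataset
from transformers import AutoTokenizer

block_size = 8192
out_dir = "preprocessed_redpajama-weighted-disjoint_8192/arxiv"  # one path from eval.sh
os.makedirs(out_dir, exist_ok=True)

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m", use_fast=False)

# Placeholder raw text source; raw_arxiv_sample.jsonl is hypothetical.
raw = load_dataset("json", data_files="raw_arxiv_sample.jsonl", split="train")

def tokenize_and_chunk(batch):
    ids = []
    for text in batch["text"]:
        ids.extend(tokenizer(text)["input_ids"])
    # Keep only whole blocks so every row is exactly block_size tokens long.
    n_blocks = len(ids) // block_size
    return {"input_ids": [ids[i * block_size:(i + 1) * block_size] for i in range(n_blocks)]}

chunks = raw.map(tokenize_and_chunk, batched=True, remove_columns=raw.column_names)

# Parquet files inside the directory should be discoverable by datasets.load_dataset(out_dir).
chunks.to_parquet(os.path.join(out_dir, "data.parquet"))

If the code instead expects datasets written with Dataset.save_to_disk() (and loaded via load_from_disk), or different column names, please correct me; a pointer to the intended preprocessing script or a download link for the preprocessed RedPajama splits would be perfect.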