/gill-cc3m

Primary Language: Python

gill-cc3m

Prepare data

Download from ConceptualCaptions:

  • raw/Train_GCC-training.tsv
  • raw/Validation_GCC-1.1.0-Validation.tsv

Add header:

mkdir -p header

# Prepend a "caption<TAB>url" header row to each raw TSV.
# printf + cat is used instead of `sed -i ''` because the empty-suffix form of
# -i only works with BSD/macOS sed, and \t / \n escapes in a sed replacement
# are not portable; this form behaves the same on Linux and macOS.
{ printf 'caption\turl\n'; cat raw/Train_GCC-training.tsv; } > header/training_00.tsv

{ printf 'caption\turl\n'; cat raw/Validation_GCC-1.1.0-Validation.tsv; } > header/validation_00.tsv

Create a small dataset to test with first:

# Keep only lines 1-99 of the training file (the header row plus 98 samples).
# `head -n` replaces `cp` + `sed -i '' -e '100,$d'`, which only works with
# BSD/macOS sed; the resulting file is the same.
head -n 99 header/training_00.tsv > header/training_01.tsv

# Keep only lines 1-9 of the validation file (the header row plus 8 samples).
head -n 9 header/validation_00.tsv > header/validation_01.tsv

Download

Requires img2dataset and wandb:

# Create a virtual environment (activated below from .venv)
uv venv
# Install the two required packages
uv pip install img2dataset wandb
# Activate the environment so the installed commands are on PATH
source .venv/bin/activate

# Create a wandb account first, then log in from this shell
wandb login

# Download both small splits with identical settings; only the input TSV
# (header/<split>_01.tsv) and the output folder (output/<split>_01) differ
# between the two runs.
for split in training validation; do
    img2dataset \
        --url_list "header/${split}_01.tsv" \
        --input_format "tsv" \
        --url_col "url" \
        --caption_col "caption" \
        --output_format webdataset \
        --output_folder "output/${split}_01" \
        --processes_count 1 \
        --thread_count 4 \
        --image_size 256 \
        --retries 1 \
        --enable_wandb True \
        --wandb_project "gill-cc3m"
done

Process

# Each extraction runs in a subshell so the working directory is back at the
# project root afterwards. Without this, the first `cd` would make every later
# relative path (scripts/, output/validation_01, processed) resolve against
# output/training_01 and fail.
cp scripts/untar.sh output/training_01/
(cd output/training_01 && ./untar.sh)

cp scripts/untar.sh output/validation_01/
(cd output/validation_01 && ./untar.sh)

mkdir -p processed

python3 scripts/gen_train_val_tsv.py

References