openai/CLIP

CLIP Training Code

vinson2233 opened this issue · 223 comments

Not really an issue; I just want to share my training code since some people still have difficulties writing the training code. Just modify the code to suit your usage.
Feel free to ask or point out any mistakes in my code.

# Latest Update : 18 July 2022, 09:55 GMT+7

# TO ADD :
# Gradient Checkpointing
# Filter out bias from weight decay
# Decaying learning rate with cosine schedule
# Half-precision Adam statistics
# Half-precision stochastically rounded text encoder weights were used

# BATCH_SIZE and EPOCH are constants you define yourself; BATCH_SIZE must be larger than 1

import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import clip

device = "cuda:0" if torch.cuda.is_available() else "cpu" # If using GPU then use mixed precision training.
model, preprocess = clip.load("ViT-B/32",device=device,jit=False) #Must set jit=False for training

class image_title_dataset(Dataset):
    def __init__(self, list_image_path,list_txt):

        self.image_path = list_image_path
        self.title  = clip.tokenize(list_txt) #you can tokenize everything at once in here(slow at the beginning), or tokenize it in the training loop.

    def __len__(self):
        return len(self.title)

    def __getitem__(self, idx):
        image = preprocess(Image.open(self.image_path[idx])) # Image from PIL module
        title = self.title[idx]
        return image,title

# use your own data
list_image_path = ['folder/image1.jpg','folder2/image2.jpg'] 
list_txt = ['description for image1.jpg' , 'description for image2.jpg']
dataset = image_title_dataset(list_image_path,list_txt)
train_dataloader = DataLoader(dataset,batch_size = BATCH_SIZE) #Define your own dataloader

#https://github.com/openai/CLIP/issues/57
def convert_models_to_fp32(model): 
    for p in model.parameters(): 
        p.data = p.data.float() 
        p.grad.data = p.grad.data.float() 


if device == "cpu":
  model.float()
else :
  clip.model.convert_weights(model) # Actually this line is unnecessary since clip by default already on float16

loss_img = nn.CrossEntropyLoss()
loss_txt = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=5e-5,betas=(0.9,0.98),eps=1e-6,weight_decay=0.2) #Params used from paper, the lr is smaller, more safe for fine tuning to new dataset

# add your own code to track the training progress.
for epoch in range(EPOCH):
  for batch in train_dataloader :
      optimizer.zero_grad()

      images,texts = batch 
    
      images= images.to(device)
      texts = texts.to(device)
    
      logits_per_image, logits_per_text = model(images, texts)

      ground_truth = torch.arange(len(images),dtype=torch.long,device=device)

      total_loss = (loss_img(logits_per_image,ground_truth) + loss_txt(logits_per_text,ground_truth))/2
      total_loss.backward()
      if device == "cpu":
         optimizer.step()
      else : 
        convert_models_to_fp32(model)
        optimizer.step()
        clip.model.convert_weights(model)
NOTE :
  • For inference purposes, the conversion step from fp16 to fp32 is not needed; just use the model in full fp16.
  • For multi-GPU training, see my comment on #111 (comment)
  • I'm not the author of this model nor do I have any relationship with the author. I'm just a random guy who is interested in CLIP.
  • For training image-image or text-text, please refer to this principle : #83 (comment)
  • What is the difference between image loss and text loss? Isn't one just a transposed version of the other? Read this: #83 (comment)
  • Why is the ground truth torch.arange? #83 (comment)

Code to save the model :

torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': total_loss,
        }, f"model_checkpoint/model_10.pt") #just change to your preferred folder/filename

Code to load the saved model :

model, preprocess = clip.load("ViT-B/32",device=device,jit=False) #Must set jit=False for training
checkpoint = torch.load("model_checkpoint/model_10.pt")

# Use these 3 lines if you use the default model settings (not custom training settings) of CLIP. For example, if you set context_length to 100 because your strings are very long during training, then assign 100 to checkpoint['model_state_dict']["context_length"]
checkpoint['model_state_dict']["input_resolution"] = model.input_resolution #default is 224
checkpoint['model_state_dict']["context_length"] = model.context_length # default is 77
checkpoint['model_state_dict']["vocab_size"] = model.vocab_size 

model.load_state_dict(checkpoint['model_state_dict'])
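For completeness, a minimal inference sketch after loading the fine-tuned weights; the image path and prompts are just the placeholder names used above, substitute your own data:

# Minimal inference sketch (placeholder paths/prompts, assumes the variables above are defined)
image = preprocess(Image.open("folder/image1.jpg")).unsqueeze(0).to(device)
texts = clip.tokenize(["description for image1.jpg", "description for image2.jpg"]).to(device)

with torch.no_grad():
    logits_per_image, logits_per_text = model(image, texts)
    probs = logits_per_image.softmax(dim=-1)  # how well each caption matches the image

print(probs)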

Alternative training code : @Zasder3 has created a PyTorch Lightning version at https://github.com/Zasder3/train-CLIP, and there is also the open_clip implementation from mlfoundations at https://github.com/mlfoundations/open_clip

Very helpful. Thank you

train_dataloader = DataLoader(...,batch_size = BATCH_SIZE) #Define your own dataloader

#https://github.com/openai/CLIP/issues/57
def convert_models_to_fp32(model): 
    for p in model.parameters(): 
        p.data = p.data.float() 
        p.grad.data = p.grad.data.float() 

device = "cuda:0" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32",device=device,jit=False) #Must set jit=False for training
clip.model.convert_weights(model)

loss_img = nn.CrossEntropyLoss()
loss_txt = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=5e-5,betas=(0.9,0.98),eps=1e-6,weight_decay=0.2) #Params from paper

for batch in train_dataloader :
    optimizer.zero_grad()

    list_image,list_txt = batch #list_images is list of image in numpy array(np.uint8)
    
    images= torch.stack([preprocess(Image.fromarray(img)) for img in list_image],dim=0)
    texts = clip.tokenize(list_txt)
    
    logits_per_image, logits_per_text = model(images, texts)

    ground_truth = torch.arange(BATCH_SIZE).to(device)
    total_loss = (loss_img(logits_per_image,ground_truth) + loss_txt(logits_per_text,ground_truth))/2
    total_loss.backward()

    convert_models_to_fp32(model)
    optimizer.step()
    clip.model.convert_weights(model)

Hi, thank you for this training code.
I have a dataset where I want to check image similarity, and I want to use CLIP. But I don't know how to prepare a dataset (image_size, embedding_size, transforms, etc.) to feed this training code. Can you please provide the dataset class if possible?

@vkmavani sure. The preprocess object from CLIP takes care of all of the preprocessing steps for the image part, so you don't need to worry about image_size or transforms (see https://github.com/openai/CLIP/blob/main/clip/clip.py line 58).
For example, maybe your data looks like this :

| image | caption  |
|-------|----------|
| url1  | caption1 |
| url2  | caption2 |

where the URL is the path to the image and the caption is the caption string.

Here's the dataset class definition for image-text similarity :

from PIL import Image
from torch.utils.data import Dataset, DataLoader

class image_caption_dataset(Dataset):
    def __init__(self, df):

        self.images = df["image"].tolist()
        self.caption = df["caption"].tolist()

    def __len__(self):
        return len(self.caption)

    def __getitem__(self, idx):
        
        images = preprocess(Image.open(self.images[idx])) #preprocess from clip.load
        caption = self.caption[idx]
        return images,caption

dataset = image_caption_dataset(df)
train_dataloader = DataLoader(dataset,batch_size = BATCH_SIZE) #Define your own dataloader

With this dataset definition, you can omit the Image.fromarray() and the preprocess step after loading the batch, since the actual data is already in tensor format.

If you are interested in doing image-image similarity, just modify the dataset to return pairs of images, and adjust the training code accordingly; the big change happens in the part that creates the logits. Change the forward call logits_per_image, logits_per_text = model(images, texts) according to https://github.com/openai/CLIP/blob/main/clip/model.py, line 354 (a sketch is shown below).
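In case it helps, here is a rough sketch of what that image-image forward pass could look like, mirroring the normalization and logit_scale usage in CLIP's forward(); this is my own adaptation, not code from the repo:

def image_image_logits(model, images_a, images_b):
    # encode both image batches with the same visual encoder
    feat_a = model.encode_image(images_a)
    feat_b = model.encode_image(images_b)

    # normalize the embeddings, as CLIP does for image/text features
    feat_a = feat_a / feat_a.norm(dim=1, keepdim=True)
    feat_b = feat_b / feat_b.norm(dim=1, keepdim=True)

    # cosine similarity scaled by the learned temperature
    logit_scale = model.logit_scale.exp()
    logits_a = logit_scale * feat_a @ feat_b.t()
    return logits_a, logits_a.t()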

What does clip.model.convert_weights mean? And can you provide a complete training example if possible?

@lonngxiang For more information, read #57. clip.model.convert_weights basically converts the CLIP model weights to float16. This will help accelerate training and reduce memory usage.
The definition of clip.model.convert_weights can be found at https://github.com/openai/CLIP/blob/main/clip/model.py line 371

I can't give a fully working example since I'm using a private dataset, but I believe the training code and dataset code that I provided are sufficient.

Thank you for your kind reply

There is an error when running this training code:
TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'PIL.JpegImagePlugin.JpegImageFile'>

Thank you very much. It really helps a lot.

@lonngxiang oh, you are correct. Pardon me, I have edited my code above. The dataset should return something that can be put into a PyTorch tensor.

One more thing: when you use preprocess inside the image_caption_dataset class, is the preprocess in the torch.stack line still needed?

I still have an error in images= torch.stack([preprocess(Image.fromarray(img)) for img in list_image],dim=0):

AttributeError: 'Tensor' object has no attribute '__array_interface__'

Yeah, if you already use preprocess inside the class, the result from the batch can be fed directly to CLIP, so that line can be changed to: images = list_image

Then I get another error:
RuntimeError: "unfolded2d_copy" not implemented for 'Half'

Hmmmm, that error is new to me. Did the error occur when calculating the loss?

Yes, the error occurred in this line:
logits_per_image, logits_per_text = model(images, texts)

Using model(images.float(), texts.float()) still gives the error:
RuntimeError: "unfolded2d_copy" not implemented for 'Half'

Are you using the CPU by any chance? Mixed precision training usually doesn't work on the CPU.

Yes, I use it on the CPU.

@lonngxiang I have updated the code again. Basically, remove all code related to mixed-precision training when using CPU instead of GPU

OK, so kind of you; thank you for your patience.

Running it on the CPU, there's still a problem: the total_loss is always 0.


How do I set BATCH_SIZE to get the ground_truth labels?

@lonngxiang Hmmmm, I don't have the faintest idea why the loss is 0.

BATCH_SIZE is just an integer that you set. Since the images and texts are in pairs, the first image will correspond to the first text, so the ground truth for the first image is 0; the second image will correspond to the second text, so its ground truth is 1.
This pattern keeps repeating until the last image-text pair.
So the ground truth is a torch tensor like this : torch.tensor([0,1,2,3,...,BATCH_SIZE-1]).
Since the pre-trained CLIP uses a massive batch size, just try to use the largest BATCH_SIZE your system can take.

You can read more about cross-entropy loss at https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html, especially about the target. Also see the CLIP paper, page 5, the upper left part.

Thanks for your reply; so if I have five pairs, my BATCH_SIZE is five, right?

Your BATCH_SIZE determines the number of pairs in each batch.

For example, if you have 1000 pairs and set BATCH_SIZE = 20,
then for each iteration of for batch in train_dataloader, the variable batch will give you 20 pairs. The loop will repeat 50 times to cover all the data for 1 epoch.

Yes, but when I set BATCH_SIZE = 1 the total_loss is always 0. Is this right? What's wrong with it?

Yes, that's the problem. BATCH_SIZE must be greater than 1.
The reason is that your prediction returns the cosine similarity for that image and that text.
CrossEntropyLoss is a combination of softmax and log loss.
Since one row only has 1 prediction (because BATCH_SIZE=1), the softmax will return probability=1 for that entry (it doesn't matter whether the logit is high or low), which automatically corresponds to the correct ground truth.
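To see why, here is a tiny self-contained check (plain PyTorch, nothing CLIP-specific):

import torch
import torch.nn as nn

loss_fn = nn.CrossEntropyLoss()

# BATCH_SIZE = 1: one row with a single logit, so softmax is always 1 and the loss is always 0
print(loss_fn(torch.tensor([[25.3]]), torch.arange(1)))  # tensor(0.)

# BATCH_SIZE = 4: the loss is only small when the diagonal entries dominate
print(loss_fn(torch.randn(4, 4), torch.arange(4)))       # non-zero in general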

Thank you for helping me a lot and learning a lot

  1. Don't we need to do clip.load_state_dict after clip.load?
  2. Are we not doing model.encode_image and model.encode_text and then doing norm before training?
  3. Can you please add demo code for early stopping, saving the model (.pt) and metrics as well
  4. Are we fine-tuning only ViT and not the text part? How did this impact performance on custom dataset?

@dmoham1476

  1. No. See this code https://github.com/openai/CLIP/blob/main/clip/clip.py line 114. The weights are already loaded when we call clip.load. Only use torch load_state_dict to continue training.
  2. Yes, that all happens inside the forward function. See this code https://github.com/openai/CLIP/blob/main/clip/model.py line 354. If you want to train image-text similarity with one-to-one pairs, the forward already takes care of encode_image, encode_text and the normalization.
  3. Here's a rough sketch for early stopping and saving the model :

EARLYSTOP_PATIENCE = 10 # Define your own number
best_loss = np.Inf
best_iter = 0
for epoch in range(EPOCH):
  for batch in train_dataloader :
      <do training>
      if device == "cpu":
         optimizer.step()
      else : 
        convert_models_to_fp32(model)
        optimizer.step()
        clip.model.convert_weights(model)

  # EVALUATION ON VALIDATION DATASET
  for batch in validation_dataloader :
    <do forward prop on batch validation data>
    val_loss = <calculate loss>

  if val_loss < best_loss :
    best_iter = epoch+1
    best_loss = val_loss

  torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': total_loss,
        }, f"save_dir")

  if ((epoch+1)-best_iter) > EARLYSTOP_PATIENCE:
    print("Early stop achieved at", epoch+1)
    break
      
  4. After loading CLIP, try printing the model. It will show a long list of layers. You can access the components like this : model.transformer, model.visual.transformer. The text part only uses a transformer, while the visual part also uses a transformer (it's model.visual.transformer). Loading CLIP lets you train all the parts by default. You can freeze some components, for example like this :

for k in model.visual.transformer.parameters():  
  k.requires_grad=False

This code will freeze the whole visual transformer.
I encourage you to inspect the components of CLIP.
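If it helps, a quick way to list the top-level sub-modules before deciding what to freeze (this is just a generic PyTorch inspection sketch):

# List CLIP's top-level sub-modules
for name, module in model.named_children():
    print(name, type(module).__name__)

# Example: freeze the text transformer instead of the visual one
for p in model.transformer.parameters():
    p.requires_grad = False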

Hi, Vinson! Thank you for your code, it helps me a lot! But I met a problem when fine-tuning CLIP on my own data with your code. The task is a 6-class classification problem, so I set batch_size=6. After fine-tuning, the model outputs the same feature for every image. Is this a problem of the small batch size, the fixed order of the 6 classes, or perhaps something else?

@uplusv If you want to modify CLIP into a classifier (single label, multi class), here are some modifications you can make :

  1. Change ground_truth = torch.arange(BATCH_SIZE).to(device) to an integer vector that specifies which class each image belongs to (for example torch.tensor([0,1,2,1,2,3,4,5])). With this you can set your batch size to an arbitrary value.
  2. One image should match 1 label, but 1 label can match multiple images. You can therefore omit the loss_txt and change total_loss = (loss_img(logits_per_image,ground_truth) + loss_txt(logits_per_text,ground_truth))/2 to total_loss = loss_img(logits_per_image,ground_truth). A sketch combining both changes is shown below.

I'm not sure what you meant by "after fine-tuning, the model outputs the same feature for every image".
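Putting those two changes together, a minimal sketch could look like this (class_names, images and labels are placeholders for your own data, not anything from the repo):

# Hypothetical classification-style fine-tuning sketch
class_names = ["class_0", "class_1", "class_2", "class_3", "class_4", "class_5"]
text_tokens = clip.tokenize([f"a photo of a {c}" for c in class_names]).to(device)

loss_img = nn.CrossEntropyLoss()

def classification_loss(images, labels):
    # logits_per_image has shape (batch_size, num_classes)
    logits_per_image, _ = model(images, text_tokens)
    return loss_img(logits_per_image, labels)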

Thank you for your reply and advice, I will try it soon! By "after fine-tuning, the model outputs the same feature for every image", I mean that with image_features = model.encode_image(image_input) I print image_features and get: image_features: tensor([
[ 0.0098, 0.0047, 0.0057, ..., 0.0018, 0.0056, -0.0039],
[ 0.0098, 0.0047, 0.0057, ..., 0.0018, 0.0056, -0.0039],
[ 0.0098, 0.0047, 0.0057, ..., 0.0018, 0.0056, -0.0039],
...,
[ 0.0098, 0.0047, 0.0057, ..., 0.0018, 0.0056, -0.0039],
[ 0.0098, 0.0047, 0.0057, ..., 0.0018, 0.0056, -0.0039],
[ 0.0098, 0.0047, 0.0057, ..., 0.0018, 0.0056, -0.0039]])
while the original model outputs:
image_features: tensor([
[ 0.0304, -0.0169, -0.0383, ..., 0.0927, 0.0261, 0.0203],
[ 0.0013, -0.0067, -0.0524, ..., 0.1029, 0.0028, 0.0169],
[ 0.0115, -0.0006, -0.0392, ..., 0.0616, 0.0317, 0.0171],
...,
[ 0.0173, -0.0152, -0.0431, ..., 0.0836, 0.0405, 0.0268],
[ 0.0287, -0.0236, -0.0401, ..., 0.0856, 0.0119, 0.0287],
[ 0.0150, 0.0013, -0.0537, ..., 0.0792, 0.0104, 0.0062]])
After fine-tuning, the features become identical and smaller, so I get identical and large logits (like 99.8856) for every image 😢.

Hmmm, I don't know what caused the model to produce the same value. Maybe something broke inside your data loader.
Whatever the cause is, I hope you can find your solution.

For those looking here in the future, I've made use of @vinson2233's code to create an easy-to-use PyTorch Lightning repo for training your own CLIP model from scratch.

Hey, I tried to train it from scratch, but I found that the model is hard to train. The loss stays flat after some iterations. Did you run into the same problem?

@Zasder3 awesome, thanks for the effort 👍. It would be a blast if we could recreate every configuration from the paper, since my code still lacks several features.
@ChawDoe For me, the training went quite smoothly. I used a batch size of 512 (with 4 GPUs), 1 million pairs of data, and gradient accumulation for 8 steps. The first several steps gave me a loss around 2; at 20 epochs my average training loss is 0.14.

@vinson2233 Do you use fp16 training here? I think the problem may be due to my fp16 training and I set lr to 5e-5, which may lead to invalid gradients.

@ChawDoe I'm using fp16 for the forward pass and gradient calculation (backward), and fp32 for the parameter update (step), just like the code I posted. Using full fp32 gives slower training and a smaller batch size, while using full fp16 gives NaN gradients because of underflow. The gradients might differ slightly between fp16 and fp32, but it shouldn't affect training to the point of taking a wrong step direction.

I use this code to train and save .pt or .pkl files, but how do I load and reuse them later?

@lonngxiang I have updated the code for saving and loading. Basically, to load the model use this code :

model, preprocess = clip.load("ViT-B/32",device=device,jit=False) 
checkpoint = torch.load("model_checkpoint/model_10.pt")

# Use these 3 lines if you use default model setting(not training setting) of the clip. For example, if you set context_length to 100 since your string is very long during training, then assign 100 to checkpoint['model_state_dict']["context_length"] 
checkpoint['model_state_dict']["input_resolution"] = model.input_resolution 
checkpoint['model_state_dict']["context_length"] = model.context_length
checkpoint['model_state_dict']["vocab_size"] = model.vocab_size 

model.load_state_dict(checkpoint['model_state_dict'])

Just modify the dict keys to match the keys you used when saving the .pt file.

I see, let me try; thanks.

@lonngxiang Actually, you don't need to copy the entire message to reply to a specific message, especially for long messages. You can use the copy link function at the top right of the message to produce a URL that points to the specific message, like this: #83 (comment). Just to keep things shorter.

For your question: yes, k is epoch and loss is total_loss. I just copy-pasted from my actual code and forgot to change the variable names; I will fix that right away.

@vinson2233 Thanks, #83 (comment). Is k equal to EPOCH or epoch? I see you wrote epoch now.

@lonngxiang I save my model every epoch, so I use the epoch variable. Also, if the model training completes, then epoch will equal EPOCH-1.
You can also change it to torch.save({'epoch': epoch+1,...} so the saving starts from 1 and the final save will have an epoch key equal to EPOCH.
Note that this epoch key will not affect the model behavior after loading; it only stores metadata for the model.
The same goes for total_loss.

What about the effect of fine-tuning? It seems to degrade the previously normal results; the effect is not good.

Some scenarios for saving the model :

  1. Train and inference : the important thing is to save only the model state.
  2. Train, pause and resume : save the model, epoch and optimizer state. Your epoch counter in the loop will be the continuation of the last epoch. You also need to load the optimizer state from the previous training (Adam needs info about the running gradients); you can use the load_state_dict method on the optimizer to load checkpoint['optimizer_state_dict']. A sketch is shown below.
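A minimal resume sketch, assuming the checkpoint was saved with the keys used earlier in this thread:

checkpoint = torch.load("model_checkpoint/model_10.pt")
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
start_epoch = checkpoint['epoch'] + 1  # continue counting from the saved epoch

for epoch in range(start_epoch, EPOCH):
    ...  # same training loop as above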

How do I load it? Like this?
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

@lonngxiang That should be correct. Note that I have never tried loading the optimizer since I never pause training.

Thanks, let me try it.

What is it that doesn't work? Does it raise any error? I set jit=False when loading the model with clip.load.

Again, it didn't work out well

There were no mistakes, but the results were bad

Hi @vinson2233 ,

First of all, thanks for the contribution.

I'm implementing a package using part of the code you published here. The add-ons I have are more related to creating the custom dataset, adding unit tests, dockerising the whole thing, and also offering a service so other people can quickly get inference running on their own data, plus some plots.

However, when trying to train on my RTX 2080, I'm getting this:

Traceback (most recent call last):
  File "/home/wilderrodrigues/.conda/envs/clip-mania/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/wilderrodrigues/.conda/envs/clip-mania/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/home/wilderrodrigues/clip-mania/clip_mania/application/train.py", line 37, in <module>
    app.run(main)
  File "/home/wilderrodrigues/.conda/envs/clip-mania/lib/python3.8/site-packages/absl/app.py", line 303, in run
    _run_main(main, args)
  File "/home/wilderrodrigues/.conda/envs/clip-mania/lib/python3.8/site-packages/absl/app.py", line 251, in _run_main
    sys.exit(main(argv))
  File "/home/wilderrodrigues/clip-mania/clip_mania/application/train.py", line 28, in main
    model, preprocess = executor.train(dataset_path, epochs=epochs)
  File "/home/wilderrodrigues/clip-mania/clip_mania/core/executor.py", line 67, in train
    logits_per_image, logits_per_text = clip_model(images, prompts)
  File "/home/wilderrodrigues/.conda/envs/clip-mania/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/wilderrodrigues/.conda/envs/clip-mania/lib/python3.8/site-packages/clip/model.py", line 355, in forward
    image_features = self.encode_image(image)
  File "/home/wilderrodrigues/.conda/envs/clip-mania/lib/python3.8/site-packages/clip/model.py", line 337, in encode_image
    return self.visual(image.type(self.dtype))
  File "/home/wilderrodrigues/.conda/envs/clip-mania/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/wilderrodrigues/.conda/envs/clip-mania/lib/python3.8/site-packages/clip/model.py", line 220, in forward
    x = self.conv1(x)  # shape = [*, width, grid, grid]
  File "/home/wilderrodrigues/.conda/envs/clip-mania/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/wilderrodrigues/.conda/envs/clip-mania/lib/python3.8/site-packages/torch/nn/modules/conv.py", line 423, in forward
    return self._conv_forward(input, self.weight)
  File "/home/wilderrodrigues/.conda/envs/clip-mania/lib/python3.8/site-packages/torch/nn/modules/conv.py", line 419, in _conv_forward
    return F.conv2d(input, weight, self.bias, self.stride,
RuntimeError: Expected object of device type cuda but got device type cpu for argument #1 'self' in call to _thnn_conv2d_forward

This is very weird! The code is basically the same. I was using the latest PyTorch, but just now decided to downgrade it and use the same version as the CLIP repo. It still fails. I also tried this:

clip_model, preprocess = clip.load(name=model_name, device=device, jit=False)
clip_model = clip_model.cuda()

No effect at all.

And yes, the GPU is available. I can see it when I try this:

>>> import torch
>>> torch.cuda.is_available()
True

Thanks in advance!

Just another point:

I checked the parameters of both the clip.visual.transformer and clip.transformer blocks and all the tensors are already on the right device.

...
-7.6884e-01,  2.5686e-01, -6.0447e-01], device='cuda:0',
       requires_grad=True)
...

I'm also trying to change the Transformer conv1, allocating it to the CUDA device, but that is not helping yet.

The forward pass in clip's model.py is not happy:

    def forward(self, x: torch.Tensor):
        x = self.conv1(x)  # shape = [*, width, grid, grid]

Will keep digging.

I found the problem, finally! :) For example, this:

      images= torch.stack([preprocess(Image.fromarray(img)) for img in list_image],dim=0)
      texts = clip.tokenize(list_txt)

Should be like this:

      images= torch.stack([preprocess(Image.fromarray(img)) for img in list_image],dim=0).cuda()
      texts = clip.tokenize(list_txt).cuda()

Or, of course, if one prefers, one can have .to(device) instead.

Just another quick note: the way the ground_truth is being taken as a range from BATCH_SIZE is not right for classification. If one has 10 classes and a batch size of 2, it will always give classes (ground_truth) of 0 and 1, so it biases the training into saying that all the images are 0s and 1s.

I changed it and I'm now mapping my prompts (the sentences) to classes in a dictionary: the prompt is the key and an integer is the class. It works as expected, with BATCH_SIZE=1, on my MacBook and my GPU. I got 3 novel pictures of airplanes, not in the original dataset, and ran the training for 2 epochs to get this result:

test_executor.py::TestModelExecutor::test_instance 
test_executor.py::TestModelExecutor::test_train 

============================== 2 passed in 19.44s ==============================

Process finished with exit code 0
PASSED                [ 50%]PASSED                   [100%]
Expected 'an airplane' and  got 'an airplane'
Probability for the expected prompt was '0.9883'
Highest probability was '0.9883'
100%|██████████| 2/2 [00:13<00:00,  6.93s/it]

Before my changes were applied, the loss was always the same and the classifier was not working.

DRSY commented

Has anyone tried fine-tuning CLIP on the MS COCO image-text retrieval task? How is the performance compared with other state-of-the-art models?

Everything gets worse when I fine-tune CLIP.

DRSY commented

From my experiment, the zero-shot image retrieval performance is R@1 25.4, R@5 48.7 and R@10 59.9 on the MS COCO
5k test set. After fine-tuning, it slightly improves to R@1 33.6, R@5 62.2 and R@10 73.8. Still lags behind SOTA non-transformer-based models(e.g., VSRN).

Are you using this issue's code to fine-tune (https://github.com/openai/CLIP/issues/83)? Did you change anything?

@wilderrodrigues I forgot to include to(device) in my code, thanks for the catch.
Also, regarding the ground truth: this ground_truth is designed for image-title embedding to utilize the concept of the n-pair loss, not for image-title classification. I have mentioned this somewhere in this long thread about modifying CLIP for a classification task (#83 (comment)).

DRSY commented

I used the PyTorch Lightning code, but modified the learning rate for ViT-B/32 from 5e-4 to 5e-5. If I use a 5e-4 learning rate, after the first several steps the model degrades to near-zero accuracy. But if I use a 5e-5 learning rate, the model steadily improves to the results I posted.

Hi, thanks for sharing.
I was wondering what the difference is between your approach (cast to fp16 and set back to fp32) and automatic mixed precision (e.g. torch.cuda.amp)? In my code I use torch.cuda.amp, but it sometimes produces NaN during training.

@yangydeng Good question. Your question was discussed in #57 (comment).
The author manually specifies which operations are done in FP16 and which in FP32; this can be seen in this part of the code:

CLIP/clip/model.py

Lines 371 to 392 in cfcffb9

def convert_weights(model: nn.Module):
    """Convert applicable model parameters to fp16"""

    def _convert_weights_to_fp16(l):
        if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
            l.weight.data = l.weight.data.half()
            if l.bias is not None:
                l.bias.data = l.bias.data.half()

        if isinstance(l, nn.MultiheadAttention):
            for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
                tensor = getattr(l, attr)
                if tensor is not None:
                    tensor.data = tensor.data.half()

        for name in ["text_projection", "proj"]:
            if hasattr(l, name):
                attr = getattr(l, name)
                if attr is not None:
                    attr.data = attr.data.half()

    model.apply(_convert_weights_to_fp16)

That's why torch.cuda.amp gives a different result: amp converts everything (I think), while the author filters which operations are done in FP16.
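For comparison, the usual torch.cuda.amp recipe keeps the weights in fp32 and lets autocast pick the per-op precision. A generic sketch (not CLIP-specific, and it assumes you skip convert_weights so the model stays in fp32):

scaler = torch.cuda.amp.GradScaler()

for images, texts in train_dataloader:
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
        logits_per_image, logits_per_text = model(images.to(device), texts.to(device))
        ground_truth = torch.arange(len(images), dtype=torch.long, device=device)
        loss = (loss_img(logits_per_image, ground_truth) + loss_txt(logits_per_text, ground_truth)) / 2
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()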

Thanks for your reply, I'll try that later.
I also noticed you adopt quite a large weight decay (0.2) by default. Have you tried any other values?

@yangydeng That's the value used by the authors (see the paper, Appendix F, table 18 on page 48). I haven't tried a weight decay lower than that, but I have tried AdamW with weight decay 0.1 and it gives a worse result.

Cool, many details can be found in that table.

@vinson2233 Thanks for sharing the code. I am getting the following error while using the code. Have you seen this during your training?

RuntimeError: Expected object of scalar type Long but got scalar type Half for argument #2 'target' in call to _thnn_nll_loss_forward

@ThakurRajAnand Is the error raised on the loss_img(logits_per_image,ground_truth) part? Are you working on CPU or GPU? The error indicates that the target is Half (which only happens on the non-CPU path of my code), while the loss expects a Long target. It might be more helpful if you could print the data types at each step of your training.

Hi, I am facing the same issue. I verified that my device is 'cuda:0', so I am not sure where I am going wrong. Can you please point out why it's giving the error:

RuntimeError: Expected object of scalar type Long but got scalar type Half for argument #2 'target' in call to _thnn_nll_loss_forward

@ThakurRajAnand @uahsan3 Thanks for the catch. The mistake is in my code: the ground truth should always be long regardless of the device. I've already edited the code. This is the new definition of the ground truth:

ground_truth = torch.arange(BATCH_SIZE,dtype=torch.long,device=device)

@vinson2233 Thanks. I got busy and didn't get a chance to reply that I was able to fix it the same day by making the same change in your code.

I had another question. Can we use the model to predict text from the model I trained on my own data? I tried, but it was asking for both image and text.

@ThakurRajAnand The forward method defined on the CLIP model is basically a composition of model.encode_text(text), model.encode_image(img) and a cosine similarity calculation between the two embeddings. If you want to use just the text, then encode_text is the method you are looking for.
See this part of the code for details:

CLIP/clip/model.py

Lines 354 to 356 in cfcffb9

def forward(self, image, text):
    image_features = self.encode_image(image)
    text_features = self.encode_text(text)

@vinson2233 I think I didn't explain clearly. I am interested in predicting text by providing only a new image. Is that possible? The text I trained on is long, e.g. "this is a geometry question and we have circles in it".

@ThakurRajAnand
Ohh I see. Regarding text generation, there are 2 types of techniques. The first is when you already have a set of predefined texts and fetch the most suitable one (text-retrieval style). The second is generating new text from the image (generative). I think CLIP cannot do the latter, but you can use CLIP for the text-retrieval style.
I think what you are looking for is something like https://github.com/pzzhang/VinVL or https://github.com/microsoft/Oscar
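For the text-retrieval style, a rough sketch (the candidate texts and image path are placeholders for your own data):

# Rank a set of predefined texts for a single image and pick the best match
candidate_texts = ["this is a geometry question and we have circles in it",
                   "this is an algebra question"]  # your own predefined texts
text_tokens = clip.tokenize(candidate_texts).to(device)
image = preprocess(Image.open("new_image.jpg")).unsqueeze(0).to(device)

with torch.no_grad():
    logits_per_image, _ = model(image, text_tokens)
    best = logits_per_image.softmax(dim=-1).argmax(dim=-1).item()

print(candidate_texts[best])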

If I only want to save the encode_text part of the model, does anyone know how to do it?

@lonngxiang Someone has posted this question in this thread: #113. I also want to know the answer.

Thanks

@vinson2233
Hey vinson, Thank you for sharing, it was a great help. But I encountered a problem during training.

Traceback (most recent call last):
File "/CLIP-main/fine_tune.py", line 88, in
logits_per_image, logits_per_text = model(images, texts)
File "/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/CLIP-main/clip/model.py", line 357, in forward
image_features = self.encode_image(image)
File "/CLIP-main/clip/model.py", line 339, in encode_image
return self.visual(image.type(self.dtype))
File "/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/CLIP-main/clip/model.py", line 226, in forward
x = x + self.positional_embedding.to(x.dtype)
RuntimeError: The size of tensor a (1711) must match the size of tensor b (50) at non-singleton dimension 1

I further tested and finally found that the error occurred in model.py line 226:

def forward(self, x: torch.Tensor):
    x = self.conv1(x)  # shape = [*, width, grid, grid]
    x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
    x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
    x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1)  # shape = [*, grid ** 2 + 1, width]
    x = x + self.positional_embedding.to(x.dtype)  # here the size of tensor x is 1711 but the other is 50
    x = self.ln_pre(x)

I'm not sure what went wrong. I used pandas to load my CSV dataset, and some texts are very long. Maybe the text is too long to be preprocessed, but I am not sure.

@lr19960813 First, your error occurred during the image_features = self.encode_image(image) step, so the error is caused by the image. Have you made sure to apply the preprocess transform to your images before feeding them to the network? You can put it inside the dataloader when fetching the images.

Second, regarding the long text: yes, CLIP's text encoder only accepts a token length of 77 (1 token represents roughly 1-3 characters). It's better to trim your text first.
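A crude way to guard against over-long captions is to trim the raw strings before tokenizing (recent versions of clip.tokenize also expose a truncate argument, but I'm not sure which version you have). The helper below is just an illustration; max_chars is a rough heuristic you would tune for your data:

def safe_tokenize(texts, max_chars=200):
    # cut each raw caption so it stays under CLIP's 77-token context window
    return clip.tokenize([t[:max_chars] for t in texts])

texts = safe_tokenize(list_txt).to(device)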

Thank you for your reply! It is very helpful! I have finished that experiment. By the way, how can I train a CLIP model on my own dataset from scratch? It seems that this code just fine-tunes the CLIP model using its default parameters.

@lr19960813 Then you need to create the model from scratch without loading the trained weights.
Here's the code; you can also set context_length to a greater number if you have really long text.

from clip.model import CLIP
model = CLIP(
        embed_dim,
        image_resolution, vision_layers, vision_width, vision_patch_size,
        context_length, vocab_size, transformer_width, transformer_heads, transformer_layers
    )

If you are confused about how to fill in a value (for example embed_dim), you can refer to an existing model by inspecting its state_dict.

model, preprocess = clip.load("ViT-B/32",device=device,jit=False) #Best model use ViT-B/32
model_state_dict = model.state_dict()
embed_dim = model_state_dict['text_projection'].shape[1]

To see other params, you can refer to this code :

CLIP/clip/model.py

Lines 395 to 424 in cfcffb9

def build_model(state_dict: dict):
    vit = "visual.proj" in state_dict

    if vit:
        vision_width = state_dict["visual.conv1.weight"].shape[0]
        vision_layers = len([k for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")])
        vision_patch_size = state_dict["visual.conv1.weight"].shape[-1]
        grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5)
        image_resolution = vision_patch_size * grid_size
    else:
        counts: list = [len(set(k.split(".")[2] for k in state_dict if k.startswith(f"visual.layer{b}"))) for b in [1, 2, 3, 4]]
        vision_layers = tuple(counts)
        vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0]
        output_width = round((state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5)
        vision_patch_size = None
        assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0]
        image_resolution = output_width * 32

    embed_dim = state_dict["text_projection"].shape[1]
    context_length = state_dict["positional_embedding"].shape[0]
    vocab_size = state_dict["token_embedding.weight"].shape[0]
    transformer_width = state_dict["ln_final.weight"].shape[0]
    transformer_heads = transformer_width // 64
    transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith(f"transformer.resblocks")))

    model = CLIP(
        embed_dim,
        image_resolution, vision_layers, vision_width, vision_patch_size,
        context_length, vocab_size, transformer_width, transformer_heads, transformer_layers
    )

Hello, this is Yong, a researcher from South Korea.

I appreciate your work; it has been very helpful as I try to run a fine-tuning task.
From what I've studied, CLIP carries a potential risk of overfitting when fine-tuned with a small amount of data.

Thus, I wonder how much data is necessary, at a minimum, for fine-tuning.
In my case, I have 4 classes and each class includes at most 10 pictures.
In such a scenario, can I carry out the fine-tuning task, or should I just use the original CLIP?

I am looking forward to your answer and I appreciate your work again.
Thank you.

Dear Rui,

I really appreciate your help.
As described in CLIP-Art (https://github.com/KeremTurgutlu/clip_art), they used 140,000 images for fine-tuning.
So, I think you may be able to solve your problem through data augmentation.

However, I cannot acquire more data as it is tied to a specific domain.
For this reason, I am trying to conduct fine-tuning with a tiny number of images.

I hope you overcome this issue.

Best regards,

Dear Yong, this is Li Rui, a doctor from Japan. I think you are right; I met the same problem as you. I tried another fine-tuning task using 80,000+ data points and it could not converge. I am trying a smaller learning rate and a bigger batch size. It performs better than before but still not very well. I think you can try that. Li Rui


@gunwooYong I might not be the person you are looking for, since I'm not the author nor do I have any affiliation with OpenAI. I think this question is better discussed in a new issue on the GitHub page so the real authors have more chance to see it.

Regardless, I did train CLIP as an image classification model, where I have 19 categories with around 100k images, with a modification where the loss only uses the first logits (dim: number_of_images x num_classes), since 1 image only has 1 category but 1 category can be owned by multiple images in the batch. The performance is still way behind a CNN classification model.

In case it is helpful, posting a link to our implementation of CLIP-training code https://github.com/mlfoundations/open_clip

@mitchellnw Nice, thanks for sharing, I'll put it on #83 (comment) so it is easier for people to reach your work.

Thanks for providing this @vinson2233. I was wondering how come the loss does not include the cosine similarity of the encoded image and text, following "Figure 3" from the original CLIP paper. Or is it included and I am missing something? 🤔

@sarahESL Actually, the code already includes that, but it is not explicit about it.

logits_per_image, logits_per_text = model(images, texts)

This line of code produces the cosine similarity of the encoded image and text, not the embeddings. I'm using the term 'logits' just to follow the paper and keep it consistent with the example shown in the README (actually I don't like the word logits).

So if we have 10 texts and 5 images, the logits_per_image dimension is 5 x 10 and logits_per_text is 10 x 5, where each entry is the cosine similarity times the temperature parameter (you can divide by 100, or by the temperature value itself, to get the usual cosine similarity).
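So recovering the plain cosine similarity from the model output is just a division by the learned temperature, for example:

with torch.no_grad():
    logits_per_image, logits_per_text = model(images, texts)
    cosine_sim = logits_per_image / model.logit_scale.exp()  # roughly logits / 100 for the released models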

DRSY commented

Hi, I just want to share my work on deploying CLIP on iOS for cross-modal text-to-image retrieval/search: https://github.com/DRSY/MTIS. I exported the CLIP image encoder and text encoder in TorchScript format, which can then be loaded via torch's C++ libtorch frontend.

Hi, I want to revise the structure of CLIP's model. How do I load some specific layers and keep them constant while training on a new dataset? If you reply soon, I will be greatly thankful.

I'm not sure what you mean by revise, but have you seen this link? https://discuss.pytorch.org/t/how-to-replace-a-layer-or-module-in-a-pretrained-network/60068
Basically, you just access a specific part of CLIP and replace it.

To keep a layer constant, I'm assuming you mean not updating that specific layer during training. Maybe this forum thread matches your interest: https://discuss.pytorch.org/t/how-the-pytorch-freeze-network-in-some-layers-only-the-rest-of-the-training/7088. Basically, just set requires_grad = False for the layers whose names match your custom layer.

Thanks for providing this. I am a bit confused: how do I verify that the revised net is an improvement? Which training set is best to train and verify on?

@NingYuanxiang If I were in your position, I would create a new image-title dataset that has not been used to train CLIP itself (so don't use COCO). This dataset should be totally new and never seen by CLIP before.
Once you have such a dataset, fine-tune the original CLIP on it, also fine-tune the revised CLIP on it, and then compare the performance.
Of course this approach is only based on empirical results, which means you need to do extensive experiments to validate your modification of CLIP.

Dear @vinson2233, am I correct that at least the train split of MS-COCO (COCO Captions) was used during training of the checkpoints provided in this repo?

@4sunshine From reading the paper, I assume the checkpoint model in this repo was trained on every dataset mentioned in the paper, for example MS-COCO (Lin et al., 2014), Visual Genome (Krishna et al., 2017), and YFCC100M (Thomee et al., 2016).
I don't know if the COCO part they used is only the train split, or maybe they just did evaluation on COCO. We would need to clarify that with the authors (and again, I'm not the author).

Thank you @vinson2233 for the quick reply. I want to know the answer because of the evaluation setup used in the paper https://arxiv.org/abs/2012.04329. That paper is about scene-text aware text-to-image retrieval, and its test samples partly belong to the MS-COCO train split.

Is this demo code meant for fine-tuning on a small dataset?

I want to know what the typical loss is during training with your code. My loss drops from above 2 to just above 1. Is this normal?