salesforce/LAVIS

BLIP3 inference error

josephzpng opened this issue · 2 comments

Great job! I encountered the following problem when running the inference example. Looking forward to your reply!

Environment:

torch==2.0.1
transformers==4.41.1

Model Response:

The second image is a close-up of a cat's face, focusing on its eyes and nose. The cat appears to be looking directly at the camera, and its eyes are wide open, giving it a curious and alert expression. The fur is predominantly white with black markings around the eyes and nose. The background is blurred, emphasizing the cat's face. The image contains person (in the center), person (to the left of the center), person (to the right of the center), person (to the right of the center), person (to the left of the center), person (in the center), person (in the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), 
person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the 
center), person (to the right of the center), person (to the left of the center), person (to the right of the center), person (to the left of the center), person (to the

Code

import os

from omegaconf import OmegaConf
from functools import partial
from PIL import Image
import torch

from open_flamingo import create_model_and_transforms
from open_flamingo.train.any_res_data_utils import process_images

# Local path of the pretrained xgen-mm (BLIP-3) checkpoint to load below.
model_ckpt = "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/checkpoint/blip3/xgen-mm-phi3-mini-base-r-v1.5.pt"

# Every model/inference setting lives in one OmegaConf config so it can be
# passed around as a single object (e.g. to the image pre-processing helper).
cfg = OmegaConf.create(
    {
        "model_family": "xgenmm_v1",
        "lm_path": "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/checkpoint/Phi-3-mini-4k-instruct",
        "vision_encoder_path": "/mnt/dolphinfs/hdd_pool/docker/user/hadoop-vacv/checkpoint/siglip-so400m-patch14-384",
        "vision_encoder_pretrained": "google",
        "num_vision_tokens": 128,
        "image_aspect_ratio": "anyres",
        "anyres_patch_sampling": True,
        "anyres_grids": [(1, 2), (2, 1), (2, 2), (3, 1), (1, 3)],
        "ckpt_pth": model_ckpt,
    }
)

# Subset of the config that is forwarded verbatim to the model factory.
additional_kwargs = {
    key: cfg[key]
    for key in ("num_vision_tokens", "image_aspect_ratio", "anyres_patch_sampling")
}

# Build the model together with its image processor and tokenizer; the
# language-model path doubles as the tokenizer path.
model, image_processor, tokenizer = create_model_and_transforms(
    clip_vision_encoder_path=cfg.vision_encoder_path,
    clip_vision_encoder_pretrained=cfg.vision_encoder_pretrained,
    lang_model_path=cfg.lm_path,
    tokenizer_path=cfg.lm_path,
    model_family=cfg.model_family,
    **additional_kwargs,
)

# strict=True: fail loudly if the checkpoint keys do not match the model
# exactly, rather than silently running with partially initialised weights.
ckpt = torch.load(cfg.ckpt_pth)
model.load_state_dict(ckpt, strict=True)
torch.cuda.empty_cache()

# Switch to inference mode, then move the model to the GPU.
model.eval()
model = model.cuda()

# Scale every (m, n) grid multiplier from the config by the model's base
# image size and hand the resulting list to the model.
# The pasted snippet read `base_img_sizem, base_img_sizen`: the `*` operators
# had been swallowed by markdown rendering, which would raise NameError.
base_img_size = model.base_img_size
anyres_grids = [
    [base_img_size * m, base_img_size * n]
    for m, n in cfg.anyres_grids
]
model.anyres_grids = anyres_grids

# Pre-bind the image processor and config so callers only pass the images.
image_proc = partial(process_images, image_processor=image_processor, model_cfg=cfg)

def apply_prompt_template(prompt, cfg):
    """Wrap a raw user prompt in the chat template of the configured LM.

    Args:
        prompt: The user message to embed in the template.
        cfg: Config object exposing ``lm_path``. The template is chosen by
            checking whether the substring ``'Phi-3'`` occurs in that path.

    Returns:
        The prompt surrounded by the Phi-3 system / user / assistant markers,
        ending with the assistant marker so generation continues from there.

    Raises:
        NotImplementedError: If ``cfg.lm_path`` does not contain ``'Phi-3'``;
            no other template is defined.
    """
    # Guard clause: only the Phi-3 chat format is supported.
    if 'Phi-3' not in cfg.lm_path:
        raise NotImplementedError(
            f"No prompt template defined for language model at {cfg.lm_path!r}"
        )
    return (
        '<|system|>\nA chat between a curious user and an artificial intelligence assistant. '
        "The assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n"
        f'<|user|>\n{prompt}<|end|>\n<|assistant|>\n'
    )

# Two example images that the prompt refers to, in order.
image_path_1 = 'example_images/image-1.jpeg'
image_path_2 = 'example_images/image-2.jpeg'

image_1 = Image.open(image_path_1).convert('RGB')
image_2 = Image.open(image_path_2).convert('RGB')
images = [image_1, image_2]

# Original image sizes are passed to generate() alongside the processed
# images. Both lists get an extra outer level of nesting
# (presumably the batch dimension, batch size 1 -- TODO confirm).
image_size = [image_1.size, image_2.size]
image_size = [image_size]
vision_x = [image_proc([img]) for img in images]
vision_x = [vision_x]

# NOTE(review): the pasted prompt read "this image and this image ." -- the
# image placeholders had almost certainly been stripped as HTML tags when the
# issue was rendered. Restored as `<image>`, one per input image; confirm this
# matches the placeholder token the model/tokenizer expects.
prompt = "Look at this image <image> and this image <image>. What is in the second image?"
prompt = apply_prompt_template(prompt, cfg)
lang_x = tokenizer([prompt], return_tensors="pt")

# Greedy decoding: no sampling, single beam, up to 1024 new tokens.
kwargs_default = dict(do_sample=False, temperature=0, max_new_tokens=1024, top_p=None, num_beams=1)

# Token ids and attention mask must live on the same GPU as the model.
device = torch.device('cuda:0')
generated_text = model.generate(
    vision_x=vision_x,
    lang_x=lang_x['input_ids'].to(device),
    image_size=image_size,
    attention_mask=lang_x['attention_mask'].to(device),
    **kwargs_default,
)

# Decode the first (only) sequence of the batch back to text.
generated_text = tokenizer.decode(generated_text[0], skip_special_tokens=True)
if 'Phi-3' in cfg.lm_path:
    # Keep only the text before the first end-of-turn marker, if any remains
    # after decoding.
    text = generated_text.split('<|end|>')[0]
else:
    text = generated_text

print(text)

@josephzpng Hi, I'm currently facing the same issue. May I ask how you solved this problem?