Will bertviz work for vision transformer?
nahidalam opened this issue · 0 comments
nahidalam commented
Since both the model_view
and head_view
methods require tokens
, I wonder if this will work for vision transformers. Here is a sample code I tried that fails because ValueError: 'tokens' is required
from transformers import AutoTokenizer, AutoModel, utils
from bertviz import model_view
utils.logging.set_verbosity_error() # Suppress standard warnings
from transformers import MobileViTFeatureExtractor, MobileViTForImageClassification
from PIL import Image
import requests
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
feature_extractor = MobileViTFeatureExtractor.from_pretrained("apple/mobilevit-small")
model = MobileViTForImageClassification.from_pretrained("apple/mobilevit-small")
inputs = feature_extractor(images=image, return_tensors="pt")
outputs = model(**inputs)
attention = outputs[-1] # Retrieve attention from model outputs
#tokens = tokenizer.convert_ids_to_tokens(inputs[0]) # Convert input ids to token strings
#model_view(attention, tokens) # Display model view
model_view(attention)