Erlangshen embedding模型推理使用
Congcong-Song opened this issue · 0 comments
Congcong-Song commented
# Compute the cosine similarity between the mean-pooled last-layer hidden
# states of two sentences using the Erlangshen TCBert sentence-embedding model.
# Expects `torch`, `BertTokenizer` and `BertForMaskedLM` to already be in scope.
MODEL_PATH = "/home/inspur/nas_data/pretrain/Erlangshen-TCBert-330M-Sentence-Embedding-Chinese"

# NOTE(review): the original author noted a text length of 512 here —
# presumably the model's maximum input length; confirm against the model config.
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)
model = BertForMaskedLM.from_pretrained(MODEL_PATH).cuda().eval()

# Both representations are 1-D vectors after pooling, so compare along dim 0.
cos = torch.nn.CosineSimilarity(dim=0, eps=1e-8)

with torch.no_grad():
    # Extract the sentence representation for the training sentence.
    # The inputs are moved to the model's device: the model lives on the GPU,
    # and leaving the tokenized tensors on the CPU raises a device mismatch.
    training_input = tokenizer("怎样的房子才算户型方正?", return_tensors="pt").to(model.device)
    print(f"training_input {training_input}")
    # Call the loaded model instance (not the BertForMaskedLM class) on the
    # tokenized inputs.
    training_output = model(**training_input, output_hidden_states=True)
    # Drop only the batch axis, then average over the token axis.
    training_representation = torch.mean(training_output.hidden_states[-1].squeeze(0), dim=0)

    # Extract the sentence representation for the test sentence.
    test_input = tokenizer("下面是一则关于[MASK][MASK]的新闻:股票放量下趺,大资金出逃谁在接盘?", return_tensors="pt").to(model.device)
    test_output = model(**test_input, output_hidden_states=True)
    # Pool the *test* output; pooling the training output here would make the
    # similarity below compare the training sentence with itself.
    test_representation = torch.mean(test_output.hidden_states[-1].squeeze(0), dim=0)

    similarity_score = cos(training_representation, test_representation)
这段是 Hugging Face 模型页面上的示例代码,但其中的 `token_text` 和 `training_outputs` 没有定义。