Morizeyao/GPT2-Chinese

Ways to separate two generated sentences

liangcaihua opened this issue · 1 comment

Method 1 (a manual top-k sampling loop):

import numpy as np
import torch
import torch.nn.functional as F
#torch.manual_seed(2)

n_ctx = model.config.n_ctx
temperature = 2
repetition_penalty = 1.7

model.eval()
with torch.no_grad():
    for generated in '家无铜臭孔方兄':  # one poem per seed character

        for _ in range(120):
            inputs = generated
            eval_inputs = full_tokenizer(inputs, add_special_tokens=False, truncation=True, return_tensors='pt')
            eval_inputs = eval_inputs.to(device)
            #print(eval_inputs.input_ids.shape)
            outputs = model(**eval_inputs)
            next_token_logits = outputs.logits[0, -1, :]
            #print('outputs:', outputs.logits.shape, next_token_logits.shape)
            # For poetry, keep the model from repeating tokens it has already emitted:
            # penalize every token in the recent context (the output will most likely
            # differ from the input anyway). This has limited effect in practice; it
            # just discourages reusing the input tokens.
            for token_id in set(full_tokenizer.encode(generated[-17:].replace(',', '').replace('。', ''))):
                next_token_logits[token_id] /= repetition_penalty
            #next_token_logits = next_token_logits / temperature  # lower the overall temperature

            # Mask out special tokens so they can never be sampled.
            next_token_logits[full_tokenizer.convert_tokens_to_ids(['[PAD]', '[UNK]'])] = -float('Inf')

            # Top-k sampling: keep the 10 most likely tokens and sample among them.
            values, indices = torch.topk(next_token_logits, 10)
            p = F.softmax(values, dim=-1)
            #if _ > 490: print(p)
            next_id = np.random.choice(indices.cpu().numpy(), 1, p=p.cpu().numpy())

            #print(full_tokenizer.decode(next_id))
            generated += full_tokenizer.decode(next_id)

        text = generated.replace(' ', '')
        print(text.replace('[CLS]', '\n'), '\n')
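Since the issue is about separating the generated sentences: the first method's output delimits the pieces with '[CLS]', so instead of merely replacing it with a newline you can split on it. A minimal sketch, assuming '[CLS]' is the only delimiter the model emits:

# Split the decoded text on '[CLS]' to get each generated piece as its own string.
poems = [p for p in text.split('[CLS]') if p]
for poem in poems:
    print(poem)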

Method 2 (using model.generate):
eval_inputs = full_tokenizer('[CLS]', return_tensors='pt', add_special_tokens=False)
eval_inputs = eval_inputs.to(device)

print(eval_inputs)
out = model.generate(eval_inputs.input_ids, max_length=100,
                     temperature=1.0,
                     top_k=8,
                     top_p=0.3,
                     repetition_penalty=1.0,
                     do_sample=True,
                     num_return_sequences=1)

# Remove spaces, then drop the leading '[CLS]' (5 characters) from each decoded sequence.
out.shape, [full_tokenizer.decode(c).replace(' ', '')[5:] + '\n' for c in out]
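If the goal is to stop at the first sentence boundary rather than split the text afterwards, `generate` also accepts an `eos_token_id`. A minimal sketch, assuming the [SEP] token is what separates sentences here (as in the sample output below):

# Stop as soon as the model emits '[SEP]' by treating it as the end-of-sequence
# token (assumption: '[SEP]' marks the end of a sentence in this model's output).
out = model.generate(eval_inputs.input_ids, max_length=100,
                     do_sample=True, top_k=8, top_p=0.3,
                     eos_token_id=full_tokenizer.sep_token_id)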

Output generated with the second method:

'一片清光万里同,几人曾见月如弓。只今犹有中秋月,曾照当时照大东。[SEP]脚亭亭似水哉,天风吹下玉楼台。夜深环佩归何处,更向桥边礼斗来。[SEP]人无限好楼台,楼上笙歌次第催。一片笙歌人并醉,月中倒载舞衣回
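To actually separate the sentences generated above, split the decoded string on '[SEP]'. A minimal sketch, assuming the output format shown in the sample:

# Decode the first returned sequence, drop spaces and the '[CLS]' prompt,
# then split into individual poems on the '[SEP]' separator.
decoded = full_tokenizer.decode(out[0]).replace(' ', '').replace('[CLS]', '')
poems = [p for p in decoded.split('[SEP]') if p]
print(poems)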