[ERROR] 出現 >> [Gensim] 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte
o55665516 opened this issue · 11 comments
請問大神,這個錯誤該怎麼解決,拜託幫忙了~~感覺差一點點可以完成
console檔程式碼如下
==========================================================
import os
import random
import sys

import jieba
import jieba.analyse

import RuleMatcher.rulebase as rulebase
class Console(object):
    """
    Build some nlp function as an package.

    Wraps jieba word segmentation and a word2vec-backed rule base, and
    exposes a small interactive console (``listen``) for manual testing.
    """

    def __init__(self, model_path="model/ch-corpus-3sg.bin",
                 rule_path="RuleMatcher/rule/",
                 stopword="jieba_dict/stopword.txt",
                 jieba_dic="jieba_dict/dict.txt.big",
                 jieba_user_dic="jieba_dict/userdict.txt"):
        """
        Args:
            model_path: path of the word-embedding (word2vec) model file.
            rule_path: directory containing the pre-defined rule files.
            stopword: stop-word list, one word per line, utf-8.
            jieba_dic: main dictionary for jieba segmentation.
            jieba_user_dic: user-defined dictionary for jieba.
        """
        print("[Console] Building a console...")
        # All resource paths above are relative to this source file, so
        # temporarily switch the working directory while loading them.
        cur_dir = os.getcwd()
        cur_path = os.path.dirname(__file__)
        os.chdir(cur_path)

        # jieba custom setting.
        self.init_jieba(jieba_dic, jieba_user_dic)
        self.stopword = self.load_stopword(stopword)

        # build the rulebase.
        self.rb = rulebase.RuleBase()
        print("[Console] Loading the word embedding model...")
        try:
            self.rb.load_model(model_path)
        except FileNotFoundError as e:
            print("[Console] 請確定詞向量模型有正確配置")
            print(e)
            sys.exit()
        except Exception as e:
            # gensim raises various errors on a corrupted/incompatible
            # model file (e.g. UnicodeDecodeError) — report and stop.
            print("[Gensim]")
            print(e)
            sys.exit()

        print("[Console] Loading pre-defined rules.")
        self.rb.load_rules_from_dic(rule_path)
        print("[Console] Initialized successfully :>")
        os.chdir(cur_dir)

    def listen(self):
        """Run the interactive console loop; blocks until 'b' is chosen."""
        while True:
            self.show_information()
            choice = input('Your choice is: ').lower()
            if choice == 'e':
                res = self.jieba_tf_idf()
                for tag, weight in res:
                    print('%s %s' % (tag, weight))
            elif choice == 'g':
                res = self.jieba_textrank()
                for tag, weight in res:
                    print('%s %s' % (tag, weight))
            elif choice == 'p':
                print(self.rb)
            elif choice == 'r':
                self.rb.load_rules('RuleMatcher/rule/', reload=True)
            elif choice == 'd':
                self.test_speech()
            elif choice == 'm':
                speech = input('Input a sentence:')
                res, path = self.rule_match(speech)
                self.write_output(speech, res, path)
            elif choice == 'b':
                sys.exit()
            elif choice == 's':
                rule_id = input('Input a rule id:')
                res = self.get_response(rule_id)
                if res is not None:
                    print(res)
            elif choice == 'o':
                self.rb.output_as_json()
            else:
                print('[Opps!] No such choice: ' + choice + '.')

    def jieba_textrank(self):
        """
        Use textrank in jieba to extract keywords in a sentence.

        Returns a list of (keyword, weight) tuples, at most 20 entries.
        """
        speech = input('Input a sentence: ')
        return jieba.analyse.textrank(speech, withWeight=True, topK=20)

    def jieba_tf_idf(self):
        """
        Use tf/idf in jieba to extract keywords in a sentence.

        Returns a list of (keyword, weight) tuples, at most 20 entries.
        """
        speech = input('Input a sentence: ')
        return jieba.analyse.extract_tags(speech, topK=20, withWeight=True)

    def show_information(self):
        """Print the menu of available console commands."""
        print('Here is chatbot backend, enter your choice.')
        print('- D)emo the data in speech.txt.')
        print('- E)xtract the name entity.')
        print('- G)ive me the TextRank.')
        print('- M)atch a sentence with rules.')
        print('- P)rint all rules in the rulebase.')
        print('- R)eload the base rule.')
        print('- O)utput all rules to rule.json.')
        print('- S)how me a random response of a rule')
        print('- B)ye.')

    def init_jieba(self, seg_dic, userdic):
        """
        jieba custom setting: load the dictionaries and boost the
        frequency of every user-defined word so jieba keeps it whole.
        """
        jieba.load_userdict(userdic)
        jieba.set_dictionary(seg_dic)
        # 'fin' instead of 'input' — the original shadowed the builtin.
        with open(userdic, 'r', encoding='utf-8') as fin:
            for word in fin:
                jieba.suggest_freq(word.strip('\n'), True)

    def load_stopword(self, path):
        """Read the stop-word file (one word per line) into a set."""
        with open(path, 'r', encoding='utf-8') as fin:
            return {line.strip('\n') for line in fin}

    def word_segment(self, sentence):
        """Segment a sentence with jieba and drop stop words."""
        words = jieba.cut(sentence, HMM=False)
        return [word for word in words if word not in self.stopword]

    def rule_match(self, sentence, best_only=False, search_from=None, segmented=False):
        """
        Match the sentence with rules.
        Args:
            - sentence : the string you want to match with rules.
            - best_only : if True, only return the best matched rule.
            - search_from : a domain name; if given, the rule match starts
              searching from that domain instead of the forest roots.
            - segmented : the sentence is segmented or not.
        Return:
            - a list of candidate rules
            - the travel path of classification tree.
        """
        if segmented:
            keyword = sentence
        else:
            keyword = self.word_segment(sentence)

        if search_from is None:  # use for classification (rule matching).
            result_list, path = self.rb.match(keyword, threshold=0.1)
        else:  # use for reasoning.
            result_list, path = self.rb.match(keyword, threshold=0.1, root=search_from)

        if best_only:
            return [result_list[0], path]
        return [result_list, path]

    def get_response(self, rule_id):
        """
        Get a random response from the given rule's response list.

        Returns None when the rule has no responses.
        """
        rule = self.rb.rules[rule_id]
        res_num = rule.has_response()
        if res_num == 0:
            return None
        return rule.response[random.randrange(0, res_num)]

    def test_speech(self):
        """
        Try matching every sentence in 'example/speech.txt' and write the
        results to 'example/output.txt'.
        """
        # Context managers close both files even on error — the original
        # version leaked the output file handle.
        with open('example/output.txt', 'w', encoding='utf-8') as output:
            with open('example/speech.txt', 'r', encoding='utf-8') as fin:
                for speech in fin:
                    speech = speech.strip('\n')
                    result, path = self.rule_match(speech)
                    self.write_output(speech, result, path, output)

    def write_output(self, org_speech, result, path, output=None):
        """
        Show the matching result.
        Args:
            - org_speech: the original input string.
            - result: a sorted array, refer match() in rulebase.py.
            - path: the travel path in classification tree.
            - output: expect as a file writer, if none, print
              the result to stdio.
        """
        result_information = ''
        result_information += "Case# " + str(org_speech) + '\n'
        result_information += "------------------\n"
        for similarity, rule, matchee in result:
            str_sim = '%.4f' % similarity
            result_information += str_sim + '\t' + path + rule + '\t\t' + matchee + '\n'
        result_information += "------------------\n"
        if output is None:
            print(result_information)
        else:
            output.write(result_information)
# NOTE(review): the pasted snippet lost the dunder underscores (GitHub
# Markdown rendered `__name__`/`__main__` as bold text). Restored here;
# `main()` is expected to be defined elsewhere in the original file.
if __name__ == '__main__':
    main()
這種編碼錯誤是 gensim 在載入詞向量失敗時所導致,我猜有兩個可能性。
一種是載入方法不對,
不過如果您用的是該 repo 的最新版本,應該是不會發生這種錯誤。
建議您也將 gensim 更新至最新版本,我剛剛看了一下文件,目前使用到的功能應該都有兼容。
第二種是詞向量本身有問題,
您可以試著重新訓練一份詞向量並儲存,
看看是否能夠正確載入。
祝順利
哈囉大大,先謝謝你提供訓練好的模型,後來可以成功運作了,而且我自己有再用其他語料集訓練出模型,也可以被載入。
不過我有個問題,是不是執行 demo_chatbot.py 有預設的對話集!?怎麼感覺我用你的載點那個model,跟我自己後來訓練的model的機器人回應都差不多??(我確定w2v_model_path有更改過)
感覺是去讀取QuestionAnswering這個資料夾裡面的語錄~~~!?
再麻煩你給予指教了~~謝謝
對,是這樣沒錯。
不過資料集是需要下載的,並非預設。
如果我單純想用自己的訓練的model回應就好呢~該怎麼設定
(應該是我有下載你的PTT回應那個資料夾,他都會把這個當預設)
似乎沒有寫這種功能,不過調用 listen 時設定
listen(..., qa_threshold=100, qa_block_threshold=100)
應該就可以了。
我有時間會重新調整接口,感謝告知。
調整成這樣是指 主要以自己訓練的model來進行回應嗎
是的,這樣調整是基於 word2vec 的規則比對,不會去撈 ptt 的 dataset
大大我照你的方式調整完,他都只會出現預設的 "是嗎?", "我不太明白你的意思", "原來如此"這三個詞....
是不是還有哪邊需要調整!?
中文的也已经下载不了了,能麻烦您再传一次么