zake7749/Chatbot

[ERROR] 出現 >> [Gensim] 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte

o55665516 opened this issue · 11 comments

請問大神,這個錯誤該怎麼解決,拜託幫忙了~~感覺差一點點可以完成
console檔程式碼如下

==========================================================
import os
import random
import sys

import jieba
import jieba.analyse

import RuleMatcher.rulebase as rulebase

class Console(object):

"""
Interactive backend console that bundles the chatbot's NLP helpers:
jieba segmentation/keyword extraction, stopword filtering, and
rule-based matching backed by a word-embedding model.
"""

def __init__(self,model_path="model/ch-corpus-3sg.bin",
             rule_path="RuleMatcher/rule/",
             stopword="jieba_dict/stopword.txt",
             jieba_dic="jieba_dict/dict.txt.big",
             jieba_user_dic="jieba_dict/userdict.txt"):

    """
    Build the console: configure jieba, load the stopword list,
    the word-embedding model and the pre-defined rules.

    Args:
        - model_path     : path to the word-embedding model binary.
        - rule_path      : directory holding the rule definition files.
        - stopword       : stopword list, one word per line (UTF-8).
        - jieba_dic      : jieba main segmentation dictionary.
        - jieba_user_dic : jieba user dictionary.

    Exits the process if the embedding model is missing or unreadable.
    """

    print("[Console] Building a console...")

    # All default paths are relative to this source file, so switch the
    # working directory while loading.  abspath() guards against
    # dirname(__file__) being '' when run from the file's own directory.
    cur_dir = os.getcwd()
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    try:
        # jieba custom setting.
        self.init_jieba(jieba_dic, jieba_user_dic)
        self.stopword = self.load_stopword(stopword)

        # build the rulebase.
        self.rb = rulebase.RuleBase()

        print("[Console] Loading the word embedding model...")

        try:
            self.rb.load_model(model_path)
        except FileNotFoundError as e:
            print("[Console] 請確定詞向量模型有正確配置")
            print(e)
            # non-zero exit code: this is an error path.
            sys.exit(1)
        except Exception as e:
            print("[Gensim]")
            print(e)
            sys.exit(1)

        print("[Console] Loading pre-defined rules.")
        self.rb.load_rules_from_dic(rule_path)

        print("[Console] Initialized successfully :>")
    finally:
        # Always restore the caller's working directory, even when
        # loading fails (the original left it changed on error).
        os.chdir(cur_dir)


def listen(self):
    """Run the interactive backend loop until the user chooses 'b'."""
    while True:
        self.show_information()
        choice = input('Your choice is: ').lower()

        if choice in ('e', 'g'):
            # Both commands print (keyword, weight) pairs; only the
            # extractor differs.
            extractor = self.jieba_tf_idf if choice == 'e' else self.jieba_textrank
            for tag, weight in extractor():
                print('%s %s' % (tag, weight))
        elif choice == 'p':
            print(self.rb)
        elif choice == 'r':
            self.rb.load_rules('RuleMatcher/rule/',reload=True)
        elif choice == 'd':
            self.test_speech()
        elif choice == 'm':
            speech = input('Input a sentence:')
            matched, path = self.rule_match(speech)
            self.write_output(speech, matched, path)
        elif choice == 'b':
            exit()
        elif choice == 's':
            rule_id = input('Input a rule id:')
            response = self.get_response(rule_id)
            if response is not None:
                print(response)
        elif choice == 'o':
            self.rb.output_as_json()
        else:
            print('[Opps!] No such choice: ' + choice + '.')

def jieba_textrank(self):

    """
    Prompt for a sentence and return its top-20 TextRank keywords
    as (keyword, weight) pairs.
    """

    sentence = input('Input a sentence: ')
    keywords = jieba.analyse.textrank(sentence, withWeight=True, topK=20)
    return keywords

def jieba_tf_idf(self):

    """
    Prompt for a sentence and return its top-20 TF-IDF keywords
    as (keyword, weight) pairs.
    """

    sentence = input('Input a sentence: ')
    keywords = jieba.analyse.extract_tags(sentence, topK=20, withWeight=True)
    return keywords

def show_information(self):
    """Print the backend console menu, one command per line."""
    menu = (
        'Here is chatbot backend, enter your choice.',
        '- D)emo the data in speech.txt.',
        '- E)xtract the name entity.',
        '- G)ive me the TextRank.',
        '- M)atch a sentence with rules.',
        '- P)rint all rules in the rulebase.',
        '- R)eload the base rule.',
        '- O)utput all rules to rule.json.',
        '- S)how me a random response of a rule',
        '- B)ye.',
    )
    for line in menu:
        print(line)

def init_jieba(self, seg_dic, userdic):

    """
    Apply jieba custom settings.

    Args:
        - seg_dic : path of the main segmentation dictionary.
        - userdic : path of the user dictionary, one word per line.

    Also boosts the frequency of every user-defined word so the
    segmenter keeps them intact.
    """

    jieba.load_userdict(userdic)
    jieba.set_dictionary(seg_dic)
    # 'fh' rather than shadowing the builtin 'input'.
    with open(userdic, 'r', encoding='utf-8') as fh:
        for word in fh:
            jieba.suggest_freq(word.strip('\n'), True)

def load_stopword(self, path):
    """Read the stopword file at *path* (one word per line, UTF-8)
    and return the words as a set."""
    with open(path, 'r', encoding='utf-8') as fh:
        return {line.strip('\n') for line in fh}

def word_segment(self, sentence):
    """Segment *sentence* with jieba (HMM disabled) and return the
    tokens that are not in the stopword set."""
    tokens = jieba.cut(sentence, HMM=False)
    return [tok for tok in tokens if tok not in self.stopword]

def rule_match(self, sentence, best_only=False, search_from=None, segmented=False):

    """
    Match the sentence with rules.

    Args:
        - sentence    : the string you want to match with rules.
        - best_only   : if True, only return the best matched rule.
        - search_from : a domain name; matching starts from that domain
                        instead of the forest roots (used for reasoning).
        - segmented   : whether *sentence* is already a token list.
    Return:
        - a list of candidate rules (or only the best one).
        - the travel path of the classification tree.
    """
    keyword = sentence if segmented else self.word_segment(sentence)

    match_kwargs = {'threshold': 0.1}
    if search_from is not None:
        # reasoning mode: search below the given domain, not the roots.
        match_kwargs['root'] = search_from
    result_list, path = self.rb.match(keyword, **match_kwargs)

    return [result_list[0] if best_only else result_list, path]


def get_response(self, rule_id):

    """
    Return a random entry from the given rule's response list,
    or None when the rule has no responses.
    """
    rule = self.rb.rules[rule_id]
    count = rule.has_response()
    if not count:
        return None
    return rule.response[random.randrange(0, count)]

def test_speech(self):

    """
    Match every sentence in 'example/speech.txt' and write the
    results to 'example/output.txt'.
    """

    # Context managers ensure both handles are closed even on error;
    # the original never closed the output file.
    with open('example/output.txt', 'w', encoding='utf-8') as output, \
         open('example/speech.txt', 'r', encoding='utf-8') as source:
        for speech in source:
            speech = speech.strip('\n')
            result, path = self.rule_match(speech)
            self.write_output(speech, result, path, output)

def write_output(self, org_speech, result, path, output = None):

    """
    Render a matching result as text and either print it or write it
    to a file.

        Args:
            - org_speech: the original input string.
            - result: a sorted array, refer match() in rulebase.py.
            - path: the travel path in classification tree.
            - output: expect as a file writer, if none, print
              the result to stdio.
    """
    lines = ["Case# " + str(org_speech) + '\n', "------------------\n"]
    for similarity, rule, matchee in result:
        lines.append('%.4f' % similarity + '\t' + path + rule + '\t\t' + matchee + '\n')
    lines.append("------------------\n")
    report = ''.join(lines)

    if output is None:
        print(report)
    else:
        output.write(report)

# The paste lost the dunder underscores; restore the standard guard.
if __name__ == '__main__':
    main()

這種編碼錯誤是 gensim 在載入詞向量失敗時所導致,我猜有兩個可能性。

一種是載入方法不對,
不過如果您用的是該 repo 的最新版本,應該是不會發生這種錯誤。
建議您也將 gensim 更新至最新版本,我剛剛看了一下文件,目前使用到的功能應該都有兼容。

第二種是詞向量本身有問題,
您可以試著重新訓練一份詞向量並儲存,
看看是否能夠正確載入。

祝順利

@zake7749 那版主大大,您有辦法提供你訓練好的model來讓我做使用嗎,十萬分感謝

中文的可以使用這個

哈囉大大,先謝謝你提供訓練好的模型,後來可以成功運作了,而且我自己有再用其他語料集訓練出模型,也可以被載入。
不過我有個問題,是不是執行 demo_chatbot.py 有預設的對話集!?怎麼感覺我用你的載點那個model,跟我自己後來訓練的model的機器人回應都差不多??(我確定w2v_model_path有更改過)
感覺是去讀取QuestionAnswering這個資料夾裡面的語錄~~~!?
再麻煩你給予指教了~~謝謝

對,是這樣沒錯。
不過資料集是需要下載的,並非預設。

如果我單純想用自己的訓練的model回應就好呢~該怎麼設定
(應該是我有下載你的PTT回應那個資料夾,他都會把這個當預設)

似乎沒有寫這種功能,不過調用 listen 時設定
listen(..., qa_threshold=100, qa_block_threshold=100) 應該就可以了。
我有時間會重新調整接口,感謝告知。

調整成這樣是指 主要以自己訓練的model來進行回應嗎

是的,這樣調整是基於 word2vec 的規則比對,不會去撈 ptt 的 dataset

大大我照你的方式調整完,他都只會出現預設的 "是嗎?", "我不太明白你的意思", "原來如此"這三個詞....
是不是還有哪邊需要調整!?

中文的也已经下载不了了,能麻烦您再传一次么