[ERROR] 出現 >> [Gensim] 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte
o55665516 opened this issue · 11 comments
請問大神,這個錯誤該怎麼解決,拜託幫忙了~~感覺差一點點可以完成
console檔程式碼如下
==========================================================
import os
import random
import sys

import jieba
import jieba.analyse

import RuleMatcher.rulebase as rulebase
class Console(object):
    """
    Build some nlp function as an package.

    Wraps jieba word segmentation and a word2vec-backed rule base, and
    exposes a small interactive console (``listen``) for manual testing.
    """

    def __init__(self, model_path="model/ch-corpus-3sg.bin",
                 rule_path="RuleMatcher/rule/",
                 stopword="jieba_dict/stopword.txt",
                 jieba_dic="jieba_dict/dict.txt.big",
                 jieba_user_dic="jieba_dict/userdict.txt"):
        """
        Args:
            model_path: path of the word-embedding (word2vec) model file.
            rule_path: directory containing the pre-defined rule files.
            stopword: stop-word list, one word per line, utf-8.
            jieba_dic: main dictionary for jieba segmentation.
            jieba_user_dic: user-defined dictionary for jieba.
        """
        print("[Console] Building a console...")
        # All resource paths above are relative to this source file, so
        # temporarily switch the working directory while loading them.
        cur_dir = os.getcwd()
        cur_path = os.path.dirname(__file__)
        os.chdir(cur_path)

        # jieba custom setting.
        self.init_jieba(jieba_dic, jieba_user_dic)
        self.stopword = self.load_stopword(stopword)

        # build the rulebase.
        self.rb = rulebase.RuleBase()
        print("[Console] Loading the word embedding model...")
        try:
            self.rb.load_model(model_path)
        except FileNotFoundError as e:
            print("[Console] 請確定詞向量模型有正確配置")
            print(e)
            sys.exit()
        except Exception as e:
            # gensim raises various errors on a corrupted/incompatible
            # model file (e.g. UnicodeDecodeError) — report and stop.
            print("[Gensim]")
            print(e)
            sys.exit()

        print("[Console] Loading pre-defined rules.")
        self.rb.load_rules_from_dic(rule_path)
        print("[Console] Initialized successfully :>")
        os.chdir(cur_dir)

    def listen(self):
        """Run the interactive console loop; blocks until 'b' is chosen."""
        while True:
            self.show_information()
            choice = input('Your choice is: ').lower()
            if choice == 'e':
                res = self.jieba_tf_idf()
                for tag, weight in res:
                    print('%s %s' % (tag, weight))
            elif choice == 'g':
                res = self.jieba_textrank()
                for tag, weight in res:
                    print('%s %s' % (tag, weight))
            elif choice == 'p':
                print(self.rb)
            elif choice == 'r':
                self.rb.load_rules('RuleMatcher/rule/', reload=True)
            elif choice == 'd':
                self.test_speech()
            elif choice == 'm':
                speech = input('Input a sentence:')
                res, path = self.rule_match(speech)
                self.write_output(speech, res, path)
            elif choice == 'b':
                sys.exit()
            elif choice == 's':
                rule_id = input('Input a rule id:')
                res = self.get_response(rule_id)
                if res is not None:
                    print(res)
            elif choice == 'o':
                self.rb.output_as_json()
            else:
                print('[Opps!] No such choice: ' + choice + '.')

    def jieba_textrank(self):
        """
        Use textrank in jieba to extract keywords in a sentence.

        Returns a list of (keyword, weight) tuples, at most 20 entries.
        """
        speech = input('Input a sentence: ')
        return jieba.analyse.textrank(speech, withWeight=True, topK=20)

    def jieba_tf_idf(self):
        """
        Use tf/idf in jieba to extract keywords in a sentence.

        Returns a list of (keyword, weight) tuples, at most 20 entries.
        """
        speech = input('Input a sentence: ')
        return jieba.analyse.extract_tags(speech, topK=20, withWeight=True)

    def show_information(self):
        """Print the menu of available console commands."""
        print('Here is chatbot backend, enter your choice.')
        print('- D)emo the data in speech.txt.')
        print('- E)xtract the name entity.')
        print('- G)ive me the TextRank.')
        print('- M)atch a sentence with rules.')
        print('- P)rint all rules in the rulebase.')
        print('- R)eload the base rule.')
        print('- O)utput all rules to rule.json.')
        print('- S)how me a random response of a rule')
        print('- B)ye.')

    def init_jieba(self, seg_dic, userdic):
        """
        jieba custom setting: load the dictionaries and boost the
        frequency of every user-defined word so jieba keeps it whole.
        """
        jieba.load_userdict(userdic)
        jieba.set_dictionary(seg_dic)
        # 'fin' instead of 'input' — the original shadowed the builtin.
        with open(userdic, 'r', encoding='utf-8') as fin:
            for word in fin:
                jieba.suggest_freq(word.strip('\n'), True)

    def load_stopword(self, path):
        """Read the stop-word file (one word per line) into a set."""
        with open(path, 'r', encoding='utf-8') as fin:
            return {line.strip('\n') for line in fin}

    def word_segment(self, sentence):
        """Segment a sentence with jieba and drop stop words."""
        words = jieba.cut(sentence, HMM=False)
        return [word for word in words if word not in self.stopword]

    def rule_match(self, sentence, best_only=False, search_from=None, segmented=False):
        """
        Match the sentence with rules.
        Args:
            - sentence : the string you want to match with rules.
            - best_only : if True, only return the best matched rule.
            - search_from : a domain name; if given, the rule match starts
              searching from that domain instead of the forest roots.
            - segmented : the sentence is segmented or not.
        Return:
            - a list of candidate rules
            - the travel path of classification tree.
        """
        if segmented:
            keyword = sentence
        else:
            keyword = self.word_segment(sentence)

        if search_from is None:  # use for classification (rule matching).
            result_list, path = self.rb.match(keyword, threshold=0.1)
        else:  # use for reasoning.
            result_list, path = self.rb.match(keyword, threshold=0.1, root=search_from)

        if best_only:
            return [result_list[0], path]
        return [result_list, path]

    def get_response(self, rule_id):
        """
        Get a random response from the given rule's response list.

        Returns None when the rule has no responses.
        """
        rule = self.rb.rules[rule_id]
        res_num = rule.has_response()
        if res_num == 0:
            return None
        return rule.response[random.randrange(0, res_num)]

    def test_speech(self):
        """
        Try matching every sentence in 'example/speech.txt' and write the
        results to 'example/output.txt'.
        """
        # Context managers close both files even on error — the original
        # version leaked the output file handle.
        with open('example/output.txt', 'w', encoding='utf-8') as output:
            with open('example/speech.txt', 'r', encoding='utf-8') as fin:
                for speech in fin:
                    speech = speech.strip('\n')
                    result, path = self.rule_match(speech)
                    self.write_output(speech, result, path, output)

    def write_output(self, org_speech, result, path, output=None):
        """
        Show the matching result.
        Args:
            - org_speech: the original input string.
            - result: a sorted array, refer match() in rulebase.py.
            - path: the travel path in classification tree.
            - output: expect as a file writer, if none, print
              the result to stdio.
        """
        result_information = ''
        result_information += "Case# " + str(org_speech) + '\n'
        result_information += "------------------\n"
        for similarity, rule, matchee in result:
            str_sim = '%.4f' % similarity
            result_information += str_sim + '\t' + path + rule + '\t\t' + matchee + '\n'
        result_information += "------------------\n"
        if output is None:
            print(result_information)
        else:
            output.write(result_information)
# NOTE(review): the pasted snippet lost the dunder underscores (GitHub
# Markdown rendered `__name__`/`__main__` as bold text). Restored here;
# `main()` is expected to be defined elsewhere in the original file.
if __name__ == '__main__':
    main()
這種編碼錯誤是 gensim 在載入詞向量失敗時所導致,我猜有兩個可能性。
一種是載入方法不對,
不過如果您用的是該 repo 的最新版本,應該是不會發生這種錯誤。
建議您也將 gensim 更新至最新版本,我剛剛看了一下文件,目前使用到的功能應該都有兼容。
第二種是詞向量本身有問題,
您可以試著重新訓練一份詞向量並儲存,
看看是否能夠正確載入。
祝順利
哈囉大大,先謝謝你提供訓練好的模型,後來可以成功運作了,而且我自己有再用其他語料集訓練出模型,也可以被載入。
不過我有個問題,是不是執行 demo_chatbot.py 有預設的對話集!?怎麼感覺我用你的載點那個model,跟我自己後來訓練的model的機器人回應都差不多??(我確定w2v_model_path有更改過)
感覺是去讀取QuestionAnswering這個資料夾裡面的語錄~~~!?
再麻煩你給予指教了~~謝謝
對,是這樣沒錯。
不過資料集是需要下載的,並非預設。
如果我單純想用自己的訓練的model回應就好呢~該怎麼設定
(應該是我有下載你的PTT回應那個資料夾,他都會把這個當預設)
似乎沒有寫這種功能,不過調用 listen 時設定
listen(..., qa_threshold=100, qa_block_threshold=100)
應該就可以了。
我有時間會重新調整接口,感謝告知。
調整成這樣是指 主要以自己訓練的model來進行回應嗎
是的,這樣調整是基於 word2vec 的規則比對,不會去撈 ptt 的 dataset
大大我照你的方式調整完,他都只會出現預設的 "是嗎?", "我不太明白你的意思", "原來如此"這三個詞....
是不是還有哪邊需要調整!?
中文的也已经下载不了了,能麻烦您再传一次么