Can't find the correct match result for special (Chinese) words
BigFishDreamWater opened this issue · 0 comments
BigFishDreamWater commented
I love this wonderful project and am trying to use it in my work.
It works nicely with English words,
but when I try to match Chinese words, a bug appears.
Here is the code:
import ahocorasick
def make_AC(AC, word_set):
    """Register every word of ``word_set`` in the automaton ``AC``.

    Each word is stored as both the key and its associated value, so a later
    search yields the matched word itself as the value.

    Args:
        AC: Object exposing ``add_word(key, value)``.
        word_set: Iterable of words to register.

    Returns:
        The same ``AC`` object, so callers may write ``ac = make_AC(ac, words)``.
    """
    for word in word_set:
        print(word)
        AC.add_word(word, word)
    # Return the automaton: callers rebind their variable to this result, and
    # an implicit ``None`` would make the following method call fail.
    return AC
def issue_ahocorasick():
    """Reproduce the reported matching behaviour with Chinese keywords.

    Builds an automaton from four keywords, then searches two test strings and
    prints, for each one, the longest keyword value reported by the automaton
    (or a "no match" message).
    """
    key_list = [
        "心甘情愿这个词勇敢又卑微(just for test)",
        "Test心甘情愿这个词真的又卑微又勇敢",
        "notetsgdsdf心甘情愿这个词 卑微又勇敢",
        "心甘情愿这个词sdgdsfdfdg满是卑微又勇敢",
    ]
    AC_KEY = ahocorasick.Automaton()
    # make_AC fills the automaton in place; do not rebind AC_KEY to its
    # result, so this works even if make_AC returns nothing.
    make_AC(AC_KEY, set(key_list))
    AC_KEY.make_automaton()

    # test_str_list[0] is a substring of test_str_list[1].
    # Reported observation: test_str_list[1] matches, but test_str_list[0]
    # does not, which confused the reporter.
    # NOTE(review): test_str_list[0] is only a prefix of the first key, not a
    # key itself; presumably iter() reports keys found inside the searched
    # string, in which case "no match" is expected here -- confirm against
    # the pyahocorasick documentation.
    test_str_list = ["心甘情愿这个词", "心甘情愿这个词勇敢又卑微(just for test)"]
    print("beigin >>>>>>>")
    for content in test_str_list:
        best_match = None
        match_length = 0
        for item in AC_KEY.iter(content):
            print("item:", item)
            # item[1] is the value stored by make_AC (the keyword itself);
            # keep only the longest one seen so far.
            if len(item[1]) > match_length:
                best_match = item[1]
                match_length = len(item[1])
        # Print the longest match if there was one.
        if best_match:
            print("【{}】--->match:{}".format(content, best_match))
        else:
            print("{}:there is no match".format(content))
# Run the reproduction only when executed as a script.
if __name__ == "__main__":
    issue_ahocorasick()