text similarity algorithm

Levenshtein distance JavaScript version
Cosine similarity Python version
SimHash
tokenVector
Vector space model

Levenshtein distance JavaScript version

1317 questions and 1317 standards. Correctly matched 541 questions. Matching accuracy: 0.41078208048595294

/**
 * Compute the Levenshtein (edit) distance between two sequences.
 *
 * Builds the full (s.length + 1) x (t.length + 1) dynamic-programming table,
 * where cell [r][c] is the minimum number of single-element insertions,
 * deletions and substitutions needed to turn the first r elements of `s`
 * into the first c elements of `t`. Elements are compared with `===` via
 * index access.
 *
 * @param {string} s - Source sequence.
 * @param {string} t - Target sequence.
 * @returns {number} The edit distance between `s` and `t`.
 */
function lenvenshteinDistance(s, t) {
    const rows = s.length;
    const cols = t.length;
    // Row 0: turning an empty prefix of s into t[0..c) takes c insertions.
    const table = [Array.from({ length: cols + 1 }, (_, c) => c)];
    for (let r = 1; r <= rows; r++) {
        const above = table[r - 1];
        // Column 0: turning s[0..r) into an empty string takes r deletions.
        const current = [r];
        for (let c = 1; c <= cols; c++) {
            const cost = s[r - 1] === t[c - 1] ? 0 : 1;
            current[c] = Math.min(
                above[c] + 1,        // deletion
                current[c - 1] + 1,  // insertion
                above[c - 1] + cost  // match or substitution
            );
        }
        table.push(current);
    }
    return table[rows][cols];
}

Cosine similarity Python version

Correctly matched 730 questions. Matching accuracy was: 0.554290053151

def getCosineSimilarity(str1, str2):
    """Return the cosine similarity of the word-count vectors of two texts.

    Both texts are segmented with ``jieba.lcut``; each text is represented
    by how many times each segmented word occurs in it, and the cosine of
    the angle between the two count vectors is returned.

    Args:
        str1: First text.
        str2: Second text.

    Returns:
        The cosine similarity, or ``0.0`` when either text yields a
        zero-length count vector (e.g. no tokens), where the cosine is
        undefined and the plain formula would evaluate 0/0.
    """
    counts1 = Counter(jieba.lcut(str1))
    counts2 = Counter(jieba.lcut(str2))

    # A Counter returns 0 for a missing key, so words absent from the second
    # text add nothing to the dot product and iterating one side is enough.
    dot_product = sum(count * counts2[word] for word, count in counts1.items())
    squared_norm1 = sum(count * count for count in counts1.values())
    squared_norm2 = sum(count * count for count in counts2.values())

    denominator = numpy.sqrt(squared_norm1 * squared_norm2)
    if denominator == 0:
        return 0.0
    return dot_product / denominator

SimHash

Correctly matched 710 questions. Correct rate is: 0.5391040242976461

    class SimHaming:
        """Text similarity based on the Hamming distance of 64-bit SimHash codes."""

        # Leading part-of-speech flag characters whose words are dropped
        # before hashing. The original note says the flags are used to remove
        # function words and other irrelevant tokens.
        # NOTE(review): presumably these follow the jieba tag set (auxiliary,
        # punctuation, preposition, conjunction, numeral, ...) - confirm.
        _EXCLUDED_FLAG_PREFIXES = ('u', 'x', 'w', 'o', 'p', 'c', 'm', 'q')

        def haming_distance(self, code_s1, code_s2):
            """Return the number of differing bits in the low 64 bits of two codes."""
            # Mask after XOR so only the low 64 bits are compared.
            differing_bits = (code_s1 ^ code_s2) & ((1 << 64) - 1)
            return bin(differing_bits).count("1")

        def get_similarity(self, a, b):
            """Return the ratio of the smaller of ``a`` and ``b`` to the larger.

            Equal values give ``1.0``; this also covers ``a == b == 0``, which
            would otherwise divide by zero.
            """
            if a == b:
                return 1.0
            if a > b:
                return b / a
            return a / b

        def get_features(self, string):
            """Segment ``string`` with ``pseg.cut`` and return the kept words.

            Words whose part-of-speech flag starts with one of
            ``_EXCLUDED_FLAG_PREFIXES`` are filtered out.
            """
            return [
                token.word
                for token in pseg.cut(string)
                if token.flag[0] not in self._EXCLUDED_FLAG_PREFIXES
            ]

        def get_distance(self, code_s1, code_s2):
            """Return the Hamming distance between two text codes."""
            return self.haming_distance(code_s1, code_s2)

        def get_code(self, string):
            """Return the ``Simhash`` value computed from the features of ``string``."""
            return Simhash(self.get_features(string)).value

        def distance(self, s1, s2):
            """Return the similarity of two texts derived from their code distance.

            Despite the name, the result is a similarity: the Hamming distance
            is expressed as a percentage of 64 bits and subtracted from 100%,
            so identical codes give ``1.0`` and codes differing in all 64 bits
            give ``0.0``.
            """
            code_s1 = self.get_code(s1)
            code_s2 = self.get_code(s2)
            similarity = (100 - self.haming_distance(code_s1, code_s2) * 100 / 64) / 100
            return similarity

tokenVector

word vector: sgns.financial.bigram-char test result: Correctly matched 794 questions, Correct rate is: 0.6028853454821564

word vector: token_vector.bin test result: correctly matched 900 questions, Correct rate is: 0.683371298405467

After removing stop words, the number of correctly matched questions rose to 964, Correct rate is: 0.7319665907365224

class SimTokenVec:
    """Sentence similarity from averaged per-character embedding vectors."""

    # Length of the vectors used for sentence sums and for the zero vector
    # returned on a failed lookup; it has to match the loaded embeddings.
    VECTOR_SIZE = 200

    def __init__(self):
        """Load the embedding model from ``./model/token_vector.bin``."""
        self.embedding_path = './model/token_vector.bin'
        # NOTE(review): the file is named *.bin but is loaded with
        # binary=False (text format) - confirm this matches the actual file.
        self.model = gensim.models.KeyedVectors.load_word2vec_format(self.embedding_path, binary=False)

    def get_wordvector(self, word):
        """Return the embedding of ``word``, or a zero vector if it is unknown."""
        try:
            return self.model[word]
        except KeyError:
            # Out-of-vocabulary token: contribute nothing to the sentence sum.
            return np.zeros(self.VECTOR_SIZE)

    def deleteStopWord(self, text):
        """Return the items of ``text`` that are not in the stop-word collection.

        The stop words come from ``getStopWord()``, which is defined outside
        this class.
        """
        stop_words = getStopWord()
        return [token for token in text if token not in stop_words]

    def _mean_vector(self, word_list):
        """Return the average embedding of a non-empty list of tokens."""
        total = np.zeros(self.VECTOR_SIZE)
        for word in word_list:
            total += self.get_wordvector(word)
        return total / len(word_list)

    def similarity_cosine(self, word_list1, word_list2):
        """Return the cosine similarity of two token lists.

        Each list is represented by the mean of its token vectors. Returns
        ``0.0`` when either list is empty or when either mean vector has zero
        length (for example, every token is unknown), because the cosine is
        undefined there and the plain formula would yield ``nan``.
        """
        if len(word_list1) == 0 or len(word_list2) == 0:
            return 0.0
        vector1 = self._mean_vector(word_list1)
        vector2 = self._mean_vector(word_list2)

        norm_product = np.sqrt(np.sum(vector1 ** 2)) * np.sqrt(np.sum(vector2 ** 2))
        if norm_product == 0:
            return 0.0
        return np.sum(vector1 * vector2) / float(norm_product)

    def distance(self, text1, text2):
        """Return the similarity of two texts (main entry point).

        Each text is split into its individual characters, stop words are
        removed, and the remaining characters are compared with
        ``similarity_cosine``.
        """
        word_list1 = self.deleteStopWord(list(text1))
        word_list2 = self.deleteStopWord(list(text2))
        return self.similarity_cosine(word_list1, word_list2)

Vector space model

Correctly matched 710 questions. Correct rate is: 0.5391040242976461

class SimVsm:
    """Vector-space-model similarity: TF-IDF sentence vectors compared by cosine."""

    def _content_words(self, text):
        """Segment ``text`` and drop words whose flag starts with 'u', 'x' or 'w'."""
        # NOTE(review): `pesg` is presumably the alias under which
        # jieba.posseg is imported in this file (commonly spelled `pseg`) -
        # confirm against the import block.
        return [word.word for word in pesg.cut(text) if word.flag[0] not in ('u', 'x', 'w')]

    def distance(self, text1, text2):
        """Return the cosine similarity of the TF-IDF vectors of two texts."""
        words1 = self._content_words(text1)
        words2 = self._content_words(text2)
        tfidf_reps = self.tfidf_rep([words1, words2])
        return self.cosine_sim(np.array(tfidf_reps[0]), np.array(tfidf_reps[1]))

    def tfidf_rep(self, sents):
        """Return one TF-IDF vector per sentence over the shared vocabulary.

        Args:
            sents: List of sentences, each a list of words.

        Returns:
            A list with one list of floats per sentence. Columns follow the
            order in which words first appear across ``sents``. Each entry is
            ``tf * idf`` where ``tf`` is the word's count divided by the
            sentence length and ``idf`` is ``log(len(sents) / df + 1)`` with
            ``df`` the number of sentences containing the word.
        """
        tf_list = []
        doc_freq = {}
        for sent in sents:
            counts = {}
            for word in sent:
                counts[word] = counts.get(word, 0) + 1
            # The counts sum to the sentence length; compute it once.
            sent_len = float(len(sent))
            tf_list.append({word: count / sent_len for word, count in counts.items()})
            # `counts` holds each distinct word once, so this is the
            # per-sentence document-frequency update.
            for word in counts:
                doc_freq[word] = doc_freq.get(word, 0) + 1

        n_sents = float(len(sents))
        idf = {word: math.log(n_sents / df + 1) for word, df in doc_freq.items()}

        # Apply the IDF weight to every term frequency so the result is a
        # TF-IDF vector rather than a plain TF vector.
        tfidf_list = []
        for tf in tf_list:
            tfidf_list.append([tf.get(word, 0) * weight for word, weight in idf.items()])
        return tfidf_list

    def cosine_sim(self, vector1, vector2):
        """Return the cosine similarity of two numpy vectors.

        Returns ``0.0`` when either vector has zero length (including empty
        vectors), where the cosine is undefined and the plain formula would
        yield ``nan``.
        """
        norm_product = np.sqrt(np.sum(vector1 ** 2)) * np.sqrt(np.sum(vector2 ** 2))
        if norm_product == 0:
            return 0.0
        return np.sum(vector1 * vector2) / float(norm_product)

personalized-news/text-similarity

text similarity algorithm

Levenshtein distance JavaScript version

Cosine similarity Python version

SimHash

tokenVector

Vector space model