chapmanjacobd/library

Explore using log-likelihood instead of TF-IDF

Opened this issue · 3 comments

Is your feature request related to a problem? Please describe.

This is a solution looking for a problem but the results might be interesting.

Describe the solution you'd like

https://wordhoard.northwestern.edu/userman/analysis-comparewords.html#loglike
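
For reference, a minimal sketch of the Dunning log-likelihood (G²) keyness score that page describes, built from the usual 2×2 contingency counts. The function and argument names here are illustrative only, not anything that exists in this repo:

def log_likelihood(a, b, c, d):
    """Log-likelihood (G2) keyness of one word.

    a: occurrences of the word in the target document
    b: occurrences of the word in the reference corpus
    c: total word count of the target document
    d: total word count of the reference corpus
    """
    import math

    e1 = c * (a + b) / (c + d)  # expected count in the target document
    e2 = d * (a + b) / (c + d)  # expected count in the reference corpus
    g2 = 0.0
    if a > 0:
        g2 += a * math.log(a / e1)
    if b > 0:
        g2 += b * math.log(b / e2)
    # a high score means the word's frequency in the target differs
    # strongly from what the reference corpus would predict
    return 2 * g2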

Describe alternatives you've considered

I'm not an expert in this space so any other algorithms, suggestions, or explorations are welcome!

Similarly, it may be worth experimenting with other libraries like gensim, etc.:

def find_clusters_gensim(n_clusters, sentence_strings):
    from gensim import corpora
    from gensim.models import LdaModel
    from gensim.parsing.preprocessing import STOPWORDS
    from gensim.utils import simple_preprocess
    import numpy as np

    def preprocess(text):
        # lowercase, tokenize, and drop stopwords
        return [token for token in simple_preprocess(text, max_len=32) if token not in STOPWORDS]

    processed_docs = [preprocess(doc) for doc in sentence_strings]
    dictionary = corpora.Dictionary(processed_docs)
    dictionary.filter_extremes(no_below=2, no_above=0.5)
    corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
    lda_model = LdaModel(
        corpus,
        num_topics=n_clusters or int(np.sqrt(len(corpus))),
        id2word=dictionary,
        passes=10,
        random_state=0,
    )

    # assign each document to its highest-probability topic; documents that
    # end up empty after filtering get -1 so the output stays aligned with
    # sentence_strings (the previous `if doc` filter silently dropped them)
    clusters = [
        max(lda_model[doc], key=lambda item: item[1])[0] if doc else -1
        for doc in corpus
    ]
    return clusters

def find_clusters_fasttext(n_clusters, sentence_strings):
    from gensim.models import FastText
    from sklearn.cluster import KMeans
    import numpy as np

    def preprocess(text):
        return text.lower().split()

    tokenized_sentences = [preprocess(sentence) for sentence in sentence_strings]

    # train a small FastText model on the input sentences themselves
    fasttext_model = FastText(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, epochs=10)

    def sentence_embedding(sentence):
        # average the word vectors; fall back to a zero vector for empty sentences
        embeddings = [fasttext_model.wv[word] for word in sentence if word in fasttext_model.wv]
        if embeddings:
            return np.mean(embeddings, axis=0)
        return np.zeros(fasttext_model.vector_size)

    sentence_embeddings = np.array([sentence_embedding(sentence) for sentence in tokenized_sentences])

    n_clusters = n_clusters or int(np.sqrt(len(tokenized_sentences)))
    kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init=10).fit(sentence_embeddings)

    return kmeans.labels_
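
A hypothetical usage sketch (the sample sentences below are made up, just to show the expected interface of one cluster label per input sentence):

sentences = [
    "the quick brown fox jumps over the lazy dog",
    "the quick red fox jumps over the lazy cat",
    "stock prices fell after the earnings report",
    "stock prices rose before the earnings report",
]
lda_labels = find_clusters_gensim(2, sentences)
fasttext_labels = find_clusters_fasttext(2, sentences)
print(lda_labels, fasttext_labels)  # one label per input sentence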

I tried these:

def find_clusters_gensim(n_clusters, sentence_strings):
    from gensim import corpora, matutils
    from gensim.models import LsiModel
    import numpy as np
    from sklearn.cluster import KMeans

    processed_docs = [s.split() for s in sentence_strings]
    n_clusters = n_clusters or int(np.sqrt(len(processed_docs)))

    dictionary = corpora.Dictionary(processed_docs)
    dictionary.filter_extremes(no_below=2, no_above=0.5)
    corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
    lsi_model = LsiModel(
        corpus,
        num_topics=n_clusters,
        id2word=dictionary,
    )

    # Transform the corpus to LSI space
    corpus_lsi = lsi_model[corpus]
    lsi_matrix = matutils.corpus2dense(corpus_lsi, num_docs=len(processed_docs), num_terms=n_clusters).T

    kmeans_model = KMeans(n_clusters=n_clusters, random_state=0, n_init=10)
    kmeans_model.fit(lsi_matrix)
    return kmeans_model.labels_

def find_clusters_gensim2(n_clusters, sentence_strings):
    from gensim import corpora
    from gensim.matutils import corpus2dense
    from gensim.models import TfidfModel
    from sklearn.cluster import KMeans
    import numpy as np

    tokenized_sentences = [s.split() for s in sentence_strings]
    n_clusters = n_clusters or int(np.sqrt(len(tokenized_sentences)))

    dictionary = corpora.Dictionary(tokenized_sentences)
    corpus = [dictionary.doc2bow(text) for text in tokenized_sentences]

    tfidf = TfidfModel(corpus, normalize=True)
    corpus_tfidf = tfidf[corpus]

    # dense documents-by-terms matrix of TF-IDF weights
    X = corpus2dense(corpus_tfidf, num_terms=len(dictionary.token2id), num_docs=len(tokenized_sentences)).T

    clusterizer = KMeans(n_clusters=n_clusters, random_state=0, n_init=10).fit(X)
    return clusterizer.labels_

but they seem slower and lower quality than the current TF-IDF clustering.