Explore using log-likelihood instead of TF-IDF
chapmanjacobd commented
Is your feature request related to a problem? Please describe.
This is a solution looking for a problem, but the results might be interesting.
Describe the solution you'd like
https://wordhoard.northwestern.edu/userman/analysis-comparewords.html#loglike
Describe alternatives you've considered
I'm not an expert in this space so any other algorithms, suggestions, or explorations are welcome!
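For reference, here is a minimal sketch of the Dunning log-likelihood (G²) statistic described on that page; the function name and the example counts below are just placeholders:

```python
import math

def log_likelihood(count_a, count_b, total_a, total_b):
    """Dunning log-likelihood (G2) for one word observed in two corpora.

    count_a / count_b: occurrences of the word in corpus A / corpus B
    total_a / total_b: total token counts of corpus A / corpus B
    """
    # Expected counts under the null hypothesis that the word is equally
    # frequent (relative to corpus size) in both corpora.
    expected_a = total_a * (count_a + count_b) / (total_a + total_b)
    expected_b = total_b * (count_a + count_b) / (total_a + total_b)

    g2 = 0.0
    if count_a > 0:
        g2 += count_a * math.log(count_a / expected_a)
    if count_b > 0:
        g2 += count_b * math.log(count_b / expected_b)
    return 2.0 * g2

# e.g. a word seen 30 times in a 10,000-token corpus vs. 10 times in a 20,000-token one
print(log_likelihood(30, 10, 10_000, 20_000))
```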
chapmanjacobd commented
Similarly, experiment with other libraries like gensim, etc.:
```python
def find_clusters_gensim(n_clusters, sentence_strings):
    from gensim import corpora
    from gensim.models import LdaModel
    from gensim.parsing.preprocessing import STOPWORDS
    from gensim.utils import simple_preprocess
    import numpy as np

    def preprocess(text):
        result = []
        for token in simple_preprocess(text, max_len=32):
            if token not in STOPWORDS:
                result.append(token)
        return result

    processed_docs = [preprocess(doc) for doc in sentence_strings]

    dictionary = corpora.Dictionary(processed_docs)
    dictionary.filter_extremes(no_below=2, no_above=0.5)
    corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

    lda_model = LdaModel(
        corpus,
        num_topics=n_clusters or int(np.sqrt(len(corpus))),
        id2word=dictionary,
        passes=10,
        random_state=0,
    )

    # Assign each document to its highest-probability topic; documents that end
    # up empty after filtering fall back to topic 0 so the label list stays
    # aligned with sentence_strings.
    clusters = [
        max(lda_model[doc], key=lambda item: item[1])[0] if doc else 0
        for doc in corpus
    ]
    return clusters
```
```python
def find_clusters_fasttext(n_clusters, sentence_strings):
    from gensim.models import FastText
    from sklearn.cluster import KMeans
    import numpy as np

    def preprocess(text):
        return text.lower().split()

    tokenized_sentences = [preprocess(sentence) for sentence in sentence_strings]
    fasttext_model = FastText(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, epochs=10)

    def sentence_embedding(sentence):
        # Average the word vectors of a sentence; zero vector for empty sentences
        embeddings = [fasttext_model.wv[word] for word in sentence if word in fasttext_model.wv]
        if embeddings:
            return np.mean(embeddings, axis=0)
        else:
            return np.zeros(fasttext_model.vector_size)

    sentence_embeddings = np.array([sentence_embedding(sentence) for sentence in tokenized_sentences])

    n_clusters = n_clusters or int(np.sqrt(len(tokenized_sentences)))
    kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init=10).fit(sentence_embeddings)
    return kmeans.labels_
```
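A quick (untested) smoke test for either implementation; the toy sentences below are made up and just need enough repeated vocabulary to survive `filter_extremes`:

```python
sentences = [
    "cats chase mice",
    "dogs chase cats",
    "mice fear cats",
    "stock prices fell",
    "stock market rallied",
    "prices rose on the stock market",
]

# Both functions return one cluster label per input sentence.
print(find_clusters_gensim(2, sentences))
print(find_clusters_fasttext(2, sentences))
```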
chapmanjacobd commented
I tried these:
```python
def find_clusters_gensim(n_clusters, sentence_strings):
    from gensim import corpora, matutils
    from gensim.models import LsiModel
    import numpy as np
    from sklearn.cluster import KMeans

    processed_docs = [s.split() for s in sentence_strings]
    n_clusters = n_clusters or int(np.sqrt(len(processed_docs)))

    dictionary = corpora.Dictionary(processed_docs)
    dictionary.filter_extremes(no_below=2, no_above=0.5)
    corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

    lsi_model = LsiModel(
        corpus,
        num_topics=n_clusters,
        id2word=dictionary,
    )

    # Transform the corpus to LSI space and cluster the dense vectors
    corpus_lsi = lsi_model[corpus]
    lsi_matrix = matutils.corpus2dense(corpus_lsi, num_docs=len(processed_docs), num_terms=n_clusters).T

    kmeans_model = KMeans(n_clusters=n_clusters, random_state=0, n_init=10)
    kmeans_model.fit(lsi_matrix)
    return kmeans_model.labels_
```
```python
def find_clusters_gensim2(n_clusters, sentence_strings):
    from gensim import corpora
    from gensim.models import TfidfModel
    from gensim.matutils import corpus2dense
    from sklearn.cluster import KMeans
    import numpy as np

    tokenized_sentences = [s.split() for s in sentence_strings]
    n_clusters = n_clusters or int(np.sqrt(len(tokenized_sentences)))

    dictionary = corpora.Dictionary(tokenized_sentences)
    corpus = [dictionary.doc2bow(text) for text in tokenized_sentences]

    tfidf = TfidfModel(corpus, normalize=True)
    corpus_tfidf = tfidf[corpus]
    X = corpus2dense(corpus_tfidf, num_terms=len(dictionary.token2id), num_docs=len(tokenized_sentences)).T

    clusterizer = KMeans(n_clusters=n_clusters, random_state=0, n_init=10).fit(X)
    return clusterizer.labels_
```
But they seem slower and produce lower-quality clusters.