Question 1: Computing the TF-IDF Matrix using NumPy
Task: Write a Python function to compute the TF-IDF matrix for the given set of documents using only NumPy.
import numpy as np

def compute_tf_idf(documents, vocabulary):
    """Compute the TF-IDF matrix for a list of documents.

    Args:
        documents: list of raw text strings; each is tokenized by
            lowercasing and splitting on whitespace.
        vocabulary: list of terms; column j of the result corresponds
            to vocabulary[j].

    Returns:
        An (N, V) NumPy array where entry (i, j) is the length-normalized
        term frequency of vocabulary[j] in documents[i] times the smoothed
        inverse document frequency log(N / (df + 1)).
    """
    N = len(documents)
    V = len(vocabulary)

    # Hoist term -> column lookup into a dict: O(1) per token instead of
    # an O(V) list.index() scan.  setdefault keeps the FIRST index for a
    # duplicated term, matching list.index semantics.
    term_index = {}
    for j, term in enumerate(vocabulary):
        term_index.setdefault(term, j)

    # Term frequency: raw counts normalized by document length.
    tf = np.zeros((N, V))
    for i, doc in enumerate(documents):
        words = doc.lower().split()
        for word in words:
            j = term_index.get(word)
            if j is not None:
                tf[i, j] += 1
        if words:  # guard: an empty document must not divide by zero
            tf[i] = tf[i] / len(words)

    # Document frequency: how many documents contain each term.
    df = np.zeros(V)
    for j, term in enumerate(vocabulary):
        df[j] = sum(1 for doc in documents if term in doc.lower().split())

    # Smoothed IDF; the +1 in the denominator avoids division by zero
    # for terms that appear in no document.
    idf = np.log(N / (df + 1))
    return tf * idf

documents = [
    "cat sat on the mat",
    "dog sat on the log",
    "cat and dog played together"
]
vocabulary = list(set(" ".join(documents).lower().split()))
tf_idf_matrix = compute_tf_idf(documents, vocabulary)
print("Vocabulary:", vocabulary)
print("TF-IDF Matrix:\n", tf_idf_matrix)
Question 2: Generating n-grams for a Sentence
Task: Write a Python function to generate n-grams for a given sentence.
def generate_ngrams(sentence, n):
    """Return every contiguous n-gram of *sentence* as a tuple of lowercase tokens.

    Tokenization is whitespace splitting after lowercasing; a sentence with
    fewer than n tokens yields an empty list.
    """
    tokens = sentence.lower().split()
    window_count = len(tokens) - n + 1
    return [tuple(tokens[start:start + n]) for start in range(window_count)]

sentence = "The quick brown fox jumps over the lazy dog."
n = 3
ngrams = generate_ngrams(sentence, n)
print(f"{n}-grams:")
for gram in ngrams:
    print(gram)
Question 3: Computing a 3-gram Language Model
Task: Write a Python function to compute a 3-gram language model.
def compute_trigram_language_model(documents):
    """Build a trigram model over *documents*.

    Each document is lowercased and whitespace-tokenized; every trigram's
    probability is its count divided by the total number of trigrams seen
    across all documents.  Returns an empty dict when no trigrams exist.
    """
    from collections import defaultdict

    counts = defaultdict(int)
    total = 0
    for text in documents:
        tokens = text.lower().split()
        for start in range(len(tokens) - 2):
            counts[tuple(tokens[start:start + 3])] += 1
            total += 1
    # Insertion order of `counts` matches first occurrence, so the
    # comprehension preserves the same key order as an explicit loop.
    return {gram: freq / total for gram, freq in counts.items()}

documents = [
    "The quick brown fox jumps over the lazy dog",
    "The quick blue fox jumps over the lazy cat",
    "The lazy dog sleeps under the blue sky"
]
trigram_model = compute_trigram_language_model(documents)
print("Trigram Probabilities:")
for trigram, prob in trigram_model.items():
    print(f"{trigram}: {prob}")
Question 4: Creating a Word Embedding Matrix
1. Implement the function create_embedding_matrix(corpus, embedding_dim).
2. Test the function and get_word_vector with the given corpus and embedding_dim=3.
import numpy as np

def create_embedding_matrix(corpus, embedding_dim):
    """Create a random embedding matrix for the vocabulary of *corpus*.

    Tokens are lowercased and indexed in first-seen order.  Returns the
    (V, embedding_dim) matrix E, the token -> index mapping, and a lookup
    function that returns a zero vector for out-of-vocabulary words.
    """
    vocabulary = {}
    for sentence in corpus:
        for token in sentence.lower().split():
            # setdefault assigns the next free index only on first sight.
            vocabulary.setdefault(token, len(vocabulary))

    E = np.random.rand(len(vocabulary), embedding_dim)

    def get_word_vector(word):
        """Return the embedding row for *word* (case-insensitive); zeros if unseen."""
        row = vocabulary.get(word.lower())
        return np.zeros(embedding_dim) if row is None else E[row]

    return E, vocabulary, get_word_vector

corpus = [
    "I love machine learning",
    "Machine learning is amazing",
    "I love learning new things"
]
embedding_dim = 3
E, vocabulary, get_word_vector = create_embedding_matrix(corpus, embedding_dim)
print("Vocabulary:", vocabulary)
print("Embedding Matrix E:\n", E)
word = "learning"
vector = get_word_vector(word)
print(f"Embedding for '{word}':", vector)
word = "unknown"
vector = get_word_vector(word)
print(f"Embedding for '{word}':", vector)
Question 5: Creating a Word Embedding Matrix with Pre-trained Embeddings
1. Implement the function create_embedding_matrix_with_pretrained(corpus, pretrained_embeddings, embedding_dim).
2. Test the function with the given corpus and pre-trained embeddings.