Importing required packages
import pandas as pd
import pickle
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# note: the nltk 'stopwords' and 'words' corpora must be downloaded once, e.g. via nltk.download()
Loading dataset
df = pd.read_csv('spam.csv', encoding='latin-1')
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True, axis=1)
df.groupby('class').describe()
        message
class   count   unique   top                                                  freq
ham      4825    4516    Sorry, I'll call later                                 30
spam      747     653    Please call our customer service representativ...       4
      class   message
5567  spam    This is the 2nd time we have tried 2 contact u...
5568  ham     Will Ì_ b going to esplanade fr home?
5569  ham     Pity, * was in mood for that. So...any other s...
5570  ham     The guy did some bitching but I acted like i'd...
5571  ham     Rofl. Its true to its name
#df = df.sample(frac=1).reset_index(drop=True)  # optional: shuffle the rows
Data Preprocessing
df['class'] = df['class'].map({'ham': 0, 'spam': 1})
    class   message
0   0       Go until jurong point, crazy.. Available only ...
1   0       Ok lar... Joking wif u oni...
2   1       Free entry in 2 a wkly comp to win FA Cup fina...
3   0       U dun say so early hor... U c already then say...
4   0       Nah I don't think he goes to usf, he lives aro...
Stop word removal and stemming
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))  # build the stop word set once instead of per message
stemmed_messages = []
for i in range(0, len(df)):
    x = re.sub('[^a-zA-Z]', ' ', df['message'][i])  # keep only alphabetic characters
    x = x.lower()                                   # convert everything to lowercase
    x = x.split()
    x = [ps.stem(word) for word in x if word not in stop_words]  # drop stop words, stem the rest
    x = ' '.join(x)
    stemmed_messages.append(x)
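The same cleaning steps are needed for any new message at prediction time, so it is worth having them in one place; a small helper, shown here as a sketch (the preprocess name is ours, not part of the original notebook), keeps training and inference identical:
def preprocess(message):
    # mirror the training-time cleaning: letters only, lowercase, stop word removal, stemming
    stop_words = set(stopwords.words('english'))
    tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    return ' '.join(ps.stem(w) for w in tokens if w not in stop_words)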
cv = CountVectorizer()
X = cv.fit_transform(stemmed_messages).toarray()
y = df['class']
pickle.dump(cv, open('transform.pkl', 'wb'))  # save the fitted vectorizer for reuse at prediction time
Splitting the dataset and fitting the model with the optimal smoothing parameter (alpha)
from sklearn.model_selection import train_test_split as TTS
X_train, X_test, y_train, y_test = TTS(X, y, test_size=0.5)
#Multinomial Naive Bayes: alpha is the additive (Laplace/Lidstone) smoothing parameter, not a learning rate
for i in [0.0001, 0.001, 0.01, 0.1, 1, 1.2, 1.3]:
    clf = MultinomialNB(alpha=i)
    clf.fit(X_train, y_train)
    x = clf.score(X_test, y_test)
    print('For alpha=' + str(i) + ' we get score ' + str(x))
clf = MultinomialNB(alpha=1)
clf.fit(X, y)  # retrain on the whole dataset with the best alpha
For alpha=0.0001 we get score 0.9752333094041636
For alpha=0.001 we get score 0.9759511844938981
For alpha=0.01 we get score 0.9777458722182341
For alpha=0.1 we get score 0.9798994974874372
For alpha=1 we get score 0.9827709978463748
For alpha=1.2 we get score 0.9816941852117731
For alpha=1.3 we get score 0.9813352476669059
MultinomialNB(alpha=1)
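Since ham heavily outnumbers spam (4825 vs. 747 messages), accuracy alone can mask poor recall on the spam class; a quick per-class check on the held-out split, as a sketch (the eval_clf name is ours, reusing the split and alpha=1 from above):
from sklearn.metrics import classification_report
eval_clf = MultinomialNB(alpha=1).fit(X_train, y_train)  # refit on the training split only, since clf above was retrained on all data
print(classification_report(y_test, eval_clf.predict(X_test), target_names=['ham', 'spam']))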
pickle.dump(clf, open('nlp_model.pkl', 'wb'))  # saving the trained model for future direct use
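For completeness, this is how app.py could load the two pickles back and classify an incoming message; a minimal sketch under the assumption that both files sit next to the script (the classify helper is ours, not part of the project):
loaded_cv = pickle.load(open('transform.pkl', 'rb'))    # fitted CountVectorizer
loaded_clf = pickle.load(open('nlp_model.pkl', 'rb'))   # trained MultinomialNB
def classify(message):
    vec = loaded_cv.transform([message]).toarray()
    return 'spam' if loaded_clf.predict(vec)[0] == 1 else 'ham'
Applying the same cleaning used before fitting the vectorizer (letters only, lowercasing, stop word removal, stemming) to the incoming message first would keep inference consistent with training.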
Testing
data1 = "Hey there yo, I saw a live red-handed briber"
data2 = "Hey you won a voucher of Rs 10000. Send your credit card number and claim it"
#testing
sent1 = data1
sent2 = data2
vect1 = cv.transform([sent1]).toarray()
vect2 = cv.transform([sent2]).toarray()
pred1 = clf.predict(vect1)
pred2 = clf.predict(vect2)
print(pred1)  # 0 = ham, 1 = spam (per the mapping above)
print(pred2)
The model is ready
Additional hard-coded rules in app.py filter out random gibberish and some abusive words.
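Those rules are not reproduced here; purely as an illustration (the function, threshold, and use of the nltk words corpus are assumptions, not the project's actual code), a gibberish check could flag messages whose tokens are mostly absent from an English vocabulary:
english_vocab = set(w.lower() for w in words.words())  # requires nltk.download('words')
def looks_like_gibberish(message, min_known_ratio=0.3):
    tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    if not tokens:
        return True
    known = sum(1 for t in tokens if t in english_vocab)
    return known / len(tokens) < min_known_ratio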