/Lung-Cancer

Lung Cancer Data Set

Primary LanguageHTMLMIT LicenseMIT

# Author : Amir Shokri
# github link : https://github.com/amirshnll/Lung-Cancer
# dataset link : http://archive.ics.uci.edu/ml/datasets/Lung+Cancer
# email : amirsh.nll@gmail.com
import numpy as np, matplotlib.pyplot as plt
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
import seaborn as sns
sns.set()
def Read_Data(address, Name='*.csv', Sperator=';'):
    Data = pd.read_csv(address+Name, sep=Sperator, header=None)
#    Data = Data.dropna()
    X = Data.drop([0], axis=1)
    Y = Data.iloc[:,0]
    return X, Y
def KNN_Plot(X, Y, n1, n2, knn_title):
    
    '''
    n1 and n2 are Neighbours
    '''
    
    neighbors = np.arange(n1, n2)
    train_acc = np.empty(len(neighbors))
    test_acc = np.empty(len(neighbors))
    
    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=.2,
                                                        random_state=42,
                                                        stratify=Y)
    
    for i, k in enumerate(neighbors):
        knn_model = KNeighborsClassifier(n_neighbors=k, weights='distance',
                                         algorithm='auto', p=2)
        knn_model.fit(x_train, y_train)
        TAcc = knn_model.score(x_train, y_train)
        TAcc *= 100
        TAcc = float(format(TAcc,'.2f'))
        train_acc[i] = TAcc
        
        pred = knn_model.predict(x_test)
        Test_acc = accuracy_score(y_test, pred)
        Test_acc *= 100
        Test_acc = float(format(Test_acc,'.2f'))
        test_acc[i] = Test_acc
        
    plt.plot(neighbors, train_acc, label='Train Accuracy')
    plt.plot(neighbors, test_acc, label='Test Accuracy')
    plt.legend(loc='best')
    plt.title(knn_title)
    plt.xlabel('Neighbors')
    plt.ylabel('Accuracy (%)')
    plt.xticks(neighbors)
    plt.show()
    
    return knn_model
def NB(x, y):
   
    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                        test_size=.2,
                                                        random_state=42,
                                                        stratify=y)
    
    
    nb_clf = GaussianNB()
    nb_clf.fit(x_train, y_train)
    Predict_nb_clf = nb_clf.predict(x_test)
    Accuracy = accuracy_score(y_test, Predict_nb_clf)
    
    Accuracy *= 100
    print('GaussianNB Accuracy: ')
    print(float(format(Accuracy,'.2f')))
    print('---------------------------------\n')
    
    return Accuracy
def Tree(X, Y):
    
    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=.2,
                                                        random_state=42,
                                                        stratify=Y)
    
    clf = tree.DecisionTreeClassifier(random_state=80)
    clf.fit(x_train, y_train)
    
    Predict = clf.predict(x_test)
    Accuracy = accuracy_score(y_test, Predict)

    Accuracy *= 100
    print('DecisionTree Accuracy: ')
    print(float(format(Accuracy,'.2f')))
    print('---------------------------------\n')
    
    return Accuracy
def MLP(X, Y):
    
    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=.2,
                                                        random_state=42,
                                                        stratify=Y)
    
    mlp = MLPClassifier(hidden_layer_sizes=(800,), max_iter=1000, random_state=50)
    mlp.fit(x_train, y_train)

    Predict = mlp.predict(x_test)
    Accuracy = accuracy_score(y_test, Predict)

    Accuracy *= 100
    print('MLP Accuracy: ')
    print(float(format(Accuracy,'.2f')))
    print('---------------------------------\n')
    
    return Accuracy
def LogisticRegressionClf(X, Y):
    
    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=.2,
                                                        random_state=50,
                                                        stratify=Y)
    
    clf = LogisticRegression(random_state=50, solver='lbfgs', max_iter=200)
    clf.fit(x_train, y_train)
    
    Predict = clf.predict(x_test)
    Accuracy = accuracy_score(y_test, Predict)

    Accuracy *= 100
    print('LogisticRegression Accuracy: ')
    print(float(format(Accuracy,'.2f')))
    print('---------------------------------\n')
    
    return Accuracy
address = 'C:/'
X, Y = Read_Data(address, Name='lc.csv', Sperator=';')
print(X,Y)
    1   2   3   4   5   6   7   8   9   10  ...  47  48  49  50  51  52  53  \
0    0   3   3   1   0   3   1   3   1   1  ...   2   2   2   2   2   2   2   
1    0   3   3   2   0   3   3   3   1   1  ...   2   2   2   2   2   2   2   
2    0   2   3   2   1   3   3   3   1   2  ...   2   2   2   2   2   2   2   
3    0   3   2   1   1   3   3   3   2   2  ...   2   2   2   2   2   2   2   
4    0   3   3   2   0   3   3   3   1   2  ...   2   2   2   2   2   2   2   
5    0   3   2   1   0   3   3   3   1   2  ...   2   2   2   2   1   2   2   
6    0   2   2   1   0   3   1   3   3   3  ...   2   2   1   2   2   2   2   
7    0   3   1   1   0   3   1   3   1   1  ...   2   2   2   2   2   2   2   
8    0   2   3   2   0   2   2   2   1   2  ...   2   2   2   1   3   2   1   
9    0   2   2   0   0   3   2   3   1   1  ...   2   2   2   2   2   2   2   
10   0   2   3   2   0   1   2   1   1   2  ...   2   2   2   2   2   1   1   
11   0   2   1   1   0   1   2   2   1   2  ...   2   2   2   2   2   2   2   
12   0   2   2   1   1   2   3   3   1   1  ...   2   2   2   2   2   1   1   
13   0   3   2   2   1   2   2   2   1   1  ...   2   2   2   2   2   2   2   
14   0   3   2   2   0   1   1   3   1   1  ...   2   2   2   2   2   2   2   
15   0   2   1   1   0   2   1   3   1   1  ...   2   2   2   2   2   1   1   
16   0   1   2   1   0   3   3   3   1   2  ...   2   2   2   2   2   1   1   
17   0   3   3   2   0   2   1   3   1   1  ...   2   2   1   2   2   2   2   
18   0   2   3   1   1   2   2   1   1   1  ...   3   3   3   3   1   3   3   
19   0   2   3   1   1   1   2   1   1   1  ...   2   2   2   2   2   2   2   
20   0   3   3   1   0   3   3   1   1   1  ...   2   2   2   2   3   2   2   
21   0   2   2   2   0   2   1   2   1   1  ...   2   2   2   2   2   2   2   
22   0   2   2   1   0   2   2   2   1   1  ...   3   3   2   2   3   2   2   
23   0   3   2   2   0   2   2   2   1   1  ...   2   2   2   3   1   2   2   
24   0   2   1   1   0   2   2   1   1   1  ...   2   2   3   2   2   2   2   
25   0   2   3   2   1   2   2   3   1   1  ...   2   2   2   2   2   2   2   
26   0   2   3   1   0   2   3   3   1   1  ...   2   2   2   2   2   2   2   

    54  55  56  
0    1   2   2  
1    2   1   2  
2    2   2   2  
3    1   2   2  
4    2   1   2  
5    2   1   2  
6    1   2   2  
7    1   2   2  
8    1   2   2  
9    2   2   2  
10   2   2   1  
11   1   2   2  
12   1   2   2  
13   2   2   2  
14   1   2   2  
15   1   2   2  
16   2   2   1  
17   2   1   2  
18   2   2   1  
19   2   2   1  
20   2   2   1  
21   1   2   1  
22   2   2   1  
23   2   2   2  
24   2   2   1  
25   1   2   2  
26   2   2   2  

[27 rows x 56 columns] 0     1
1     1
2     1
3     1
4     1
5     1
6     1
7     1
8     2
9     2
10    2
11    2
12    2
13    2
14    2
15    2
16    2
17    2
18    3
19    3
20    3
21    3
22    3
23    3
24    3
25    3
26    3
Name: 0, dtype: int64
n1 = 1
n2 = 12
knn_title = 'lung cancer knn Classifier'
KNN_Plot(X, Y, n1, n2, knn_title)

png

KNeighborsClassifier(n_neighbors=11, weights='distance')
Accuracy = NB(X, Y)
GaussianNB Accuracy: 
66.67
---------------------------------
Accuracy = Tree(X, Y)
DecisionTree Accuracy: 
83.33
---------------------------------
Accuracy = MLP(X, Y)
MLP Accuracy: 
16.67
---------------------------------
LGR_Accuraccy = LogisticRegressionClf(X, Y)
LogisticRegression Accuracy: 
50.0
---------------------------------