Predictive Maintenance of Air Quality Data
# Load libraries
import pandas
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# Read the two CSV inputs. Column names come from each file's header row:
# the sensor file carries weight / humidity / temperature / prod_id and the
# quality-control file carries prod_id / quality.
sensor_file = "./data/sensor_data.csv"
sensor_data = pandas.read_csv(filepath_or_buffer=sensor_file)

quality_file = "./data/quality_control_data.csv"
quality_data = pandas.read_csv(filepath_or_buffer=quality_file)
|
weight |
humidity |
temperature |
prod_id |
0 |
1030.871118 |
29.687881 |
71.995808 |
1 |
1 |
1044.961148 |
28.862453 |
68.468664 |
2 |
2 |
972.710479 |
37.951588 |
65.121344 |
3 |
3 |
1010.182509 |
25.076383 |
67.821336 |
4 |
4 |
970.039236 |
27.137886 |
72.931800 |
5 |
5 |
990.154359 |
32.422428 |
71.406207 |
6 |
6 |
965.660243 |
42.603619 |
65.876158 |
7 |
7 |
969.221212 |
31.655071 |
74.430054 |
8 |
8 |
976.495532 |
26.499721 |
69.866121 |
9 |
9 |
974.993517 |
38.644055 |
69.891709 |
10 |
|
prod_id |
quality |
0 |
1 |
good |
1 |
2 |
good |
2 |
3 |
good |
3 |
4 |
good |
4 |
5 |
good |
5 |
6 |
good |
6 |
7 |
poor |
7 |
8 |
good |
8 |
9 |
good |
9 |
10 |
good |
# Attach the quality label to each sensor reading by joining on the shared
# prod_id key (pandas' default inner join).
rawdataset = pandas.merge(sensor_data, quality_data, on="prod_id")
|
weight |
humidity |
temperature |
prod_id |
quality |
0 |
1030.871118 |
29.687881 |
71.995808 |
1 |
good |
1 |
1044.961148 |
28.862453 |
68.468664 |
2 |
good |
2 |
972.710479 |
37.951588 |
65.121344 |
3 |
good |
3 |
1010.182509 |
25.076383 |
67.821336 |
4 |
good |
4 |
970.039236 |
27.137886 |
72.931800 |
5 |
good |
# prod_id was only needed as the join key, so leave it out of the modelling
# frame, then preview the first ten rows.
dataset = rawdataset.drop(labels='prod_id', axis='columns')
dataset.head(n=10)
|
weight |
humidity |
temperature |
quality |
0 |
1030.871118 |
29.687881 |
71.995808 |
good |
1 |
1044.961148 |
28.862453 |
68.468664 |
good |
2 |
972.710479 |
37.951588 |
65.121344 |
good |
3 |
1010.182509 |
25.076383 |
67.821336 |
good |
4 |
970.039236 |
27.137886 |
72.931800 |
good |
5 |
990.154359 |
32.422428 |
71.406207 |
good |
6 |
965.660243 |
42.603619 |
65.876158 |
poor |
7 |
969.221212 |
31.655071 |
74.430054 |
good |
8 |
976.495532 |
26.499721 |
69.866121 |
good |
9 |
974.993517 |
38.644055 |
69.891709 |
good |
# Dimensions of the modelling frame as (rows, columns)
frame_shape = dataset.shape
print(frame_shape)
# Summary statistics (count, mean, std, min, quartiles, max) per column
summary = dataset.describe()
print(summary)
weight humidity temperature
count 3000.000000 3000.000000 3000.000000
mean 999.940363 34.863581 69.962969
std 28.765904 5.755869 2.857898
min 950.017007 25.008023 65.000514
25% 975.552942 29.783650 67.522238
50% 998.875197 34.825848 69.890808
75% 1025.649219 39.887405 72.414522
max 1049.954013 44.986735 74.999312
# Number of rows carrying each quality label
per_class = dataset.groupby('quality')
print(per_class.size())
quality
good 2907
poor 93
dtype: int64
# Box-and-whisker plot per column, each in its own subplot with its own axes
dataset.plot.box(subplots=True, layout=(2, 2), sharex=False, sharey=False)
plt.show()

# Histogram of each column
dataset.hist()
plt.show()

# Pairwise scatter plots between the columns
scatter_matrix(frame=dataset)
plt.show()
# Hold back 20% of the rows as a validation set for the final model check
array = dataset.values
# The first three columns are the features, the fourth is the quality label
X, Y = array[:, :3], array[:, 3]
validation_size = 0.20
seed = 8
split = model_selection.train_test_split(
    X, Y, test_size=validation_size, random_state=seed
)
X_train, X_validation, Y_train, Y_validation = split
# Cross-validation settings
seed = 7
scoring = 'accuracy'
# Candidate classifiers to spot-check, each paired with a short display label
models = [
    ('LR', LogisticRegression(solver='lbfgs')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC(gamma='auto')),
]
# Evaluate each candidate model with 10-fold cross-validation on the
# training split, collecting the per-fold scores for the comparison plot.
#
# The splitter is built once because it does not depend on the model.
# random_state is deliberately not passed: without shuffle=True it has no
# effect on the folds, and scikit-learn >= 0.24 raises ValueError for that
# combination. To use seeded, shuffled folds instead, pass shuffle=True and
# random_state=seed together.
kfold = model_selection.KFold(n_splits=10)
results = []
names = []
for name, model in models:
    cv_results = model_selection.cross_val_score(
        model, X_train, Y_train, cv=kfold, scoring=scoring
    )
    results.append(cv_results)
    names.append(name)
    # Report the mean score and its standard deviation across the folds
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
LR: 0.976667 (0.008375)
LDA: 0.973750 (0.007229)
KNN: 0.992083 (0.005417)
CART: 0.998750 (0.002668)
NB: 0.994167 (0.003333)
SVM: 0.985417 (0.005966)
# Box plot of the per-fold cross-validation scores, one box per model
fig, ax = plt.subplots()
fig.suptitle('Comparison of ML Models')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()
# Fit a decision tree on the training split and score it on the held-out
# validation split. (KNeighborsClassifier() is an alternative that could be
# swapped in here.)
CART = DecisionTreeClassifier().fit(X_train, Y_train)
predictions = CART.predict(X_validation)
# Print accuracy, then the confusion matrix, then the per-class report
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(Y_validation, predictions))
0.9983333333333333
[[581 0]
[ 1 18]]
precision recall f1-score support
good 1.00 1.00 1.00 581
poor 1.00 0.95 0.97 19
accuracy 1.00 600
macro avg 1.00 0.97 0.99 600
weighted avg 1.00 1.00 1.00 600
Now test some values of your own
# Classify one hand-picked reading with the fitted tree. The feature order
# must match the training columns: weight, humidity, temperature.
# NOTE(review): weight 1200 and humidity 60 lie outside the min/max that
# describe() printed for the dataset, so this prediction is an extrapolation.
testWeight, testHumidity, testTemperature = 1200, 60, 65
sample = [[testWeight, testHumidity, testTemperature]]
testPrediction = CART.predict(sample)
print(testPrediction)