/K-Means-Cluster-from-Scratch

Clustering data using K-Means cluster method with Elbow method for getting optimum cluster numbers.

Primary LanguageJupyter Notebook

K-means Cluster

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
dataset = pd.read_csv('Mall_Customers.csv')
dataset.describe()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
CustomerID Age Annual Income (k$) Spending Score (1-100)
count 200.000000 200.000000 200.000000 200.000000
mean 100.500000 38.850000 60.560000 50.200000
std 57.879185 13.969007 26.264721 25.823522
min 1.000000 18.000000 15.000000 1.000000
25% 50.750000 28.750000 41.500000 34.750000
50% 100.500000 36.000000 61.500000 50.000000
75% 150.250000 49.000000 78.000000 73.000000
max 200.000000 70.000000 137.000000 99.000000
X = dataset.iloc[:, [3, 4]].values
m = X.shape[0] # number of training examples
n = X.shape[1] # number of features. n = 2

n_iter = 100
K = 5 # number of clusters considered to be 5
Centroids = np.array([]).reshape(n, 0)
idx = np.random.randint(0, m-1, size=K)
Centroids = X[idx, :]
print(Centroids)
[[42 52]
 [38 35]
 [73 73]
 [76 87]
 [75  5]]
for _ in range(n_iter):
    distance = np.array([]).reshape(m, 0)
    for k in range(K):
        tempDist = np.sum((X-Centroids[k, :])**2, axis=1)
        distance = np.c_[distance, tempDist]

    C = np.argmin(distance, axis=1)+1

    Y = {}
    for k in range(K):
        Y[k+1] = np.array([]).reshape(n, 0)
    for i in range(m):
        Y[C[i]] = np.c_[Y[C[i]], X[i]]

        # Regrouping to nearby centroids
    for k in range(K):
        Y[k+1] = Y[k+1].T

    for k in range(K):
        Centroids[k,:] = np.mean(Y[k+1], axis=0)
    Output = Y
plt.scatter(X[:,0], X[:,1], c='black',label='unclustered data')
plt.xlabel('Income')
plt.ylabel('# transactions')
plt.legend()
plt.title('Plot of data points')
plt.show()

png

color = ['red', 'blue', 'green', 'cyan', 'magenta']
labels = ['cluster' + str(i+1) for i in range(K)]
for k in range(K):
    plt.scatter(Output[k+1][:,0], Output[k+1][:,1], c=color[k], label=labels[k])
plt.scatter(Centroids[:,0], Centroids[:,1],s=30, c='yellow',label='Centroids')
plt.xlabel('Income')
plt.ylabel('# transactions')
plt.legend()
plt.show()

png

def Kmeans(X, K, n_iter):
    Centroids = np.array([]).reshape(n, 0)
    idx = np.random.randint(0, m-1, size=K)
    Centroids = X[idx, :]
    for _ in range(n_iter):
        distance = np.array([]).reshape(m, 0)
        for k in range(K):
            tempDist = np.sum((X-Centroids[k, :])**2, axis=1)
            distance = np.c_[distance, tempDist]

        C = np.argmin(distance, axis=1)+1

        Y = {}
        for k in range(K):
            Y[k+1] = np.array([]).reshape(n, 0)
        for i in range(m):
            Y[C[i]] = np.c_[Y[C[i]], X[i]]

            # Regrouping to nearby centroids
        for k in range(K):
            Y[k+1] = Y[k+1].T

        for k in range(K):
            Centroids[k,:] = np.mean(Y[k+1], axis=0)
        Output = Y
    return Output, Centroids
WCSS_array = []
for K in range(1,11):
    Output, Centroids = Kmeans(X,K,n_iter)
    wcss = 0
    for k in range(K):
        wcss += np.sum((Output[k+1] - Centroids[k,:])**2)
    WCSS_array.append(wcss)
K_array = np.arange(1,11,1)
plt.plot(K_array, WCSS_array)
plt.xlabel('Number of clusters')
plt.ylabel('WCSS value')
plt.title('Elbow method for optimum # clusters')
plt.show()

png

Optimum number of clusters is 5