K-means Cluster

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

dataset = pd.read_csv('Mall_Customers.csv')
dataset.describe()

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	CustomerID	Age	Annual Income (k$)	Spending Score (1-100)
count	200.000000	200.000000	200.000000	200.000000
mean	100.500000	38.850000	60.560000	50.200000
std	57.879185	13.969007	26.264721	25.823522
min	1.000000	18.000000	15.000000	1.000000
25%	50.750000	28.750000	41.500000	34.750000
50%	100.500000	36.000000	61.500000	50.000000
75%	150.250000	49.000000	78.000000	73.000000
max	200.000000	70.000000	137.000000	99.000000

X = dataset.iloc[:, [3, 4]].values

m = X.shape[0] # number of training examples
n = X.shape[1] # number of features. n = 2

n_iter = 100

K = 5 # number of clusters considered to be 5
Centroids = np.array([]).reshape(n, 0)

idx = np.random.randint(0, m-1, size=K)
Centroids = X[idx, :]
print(Centroids)

[[42 52]
 [38 35]
 [73 73]
 [76 87]
 [75  5]]

for _ in range(n_iter):
    distance = np.array([]).reshape(m, 0)
    for k in range(K):
        tempDist = np.sum((X-Centroids[k, :])**2, axis=1)
        distance = np.c_[distance, tempDist]

    C = np.argmin(distance, axis=1)+1

    Y = {}
    for k in range(K):
        Y[k+1] = np.array([]).reshape(n, 0)
    for i in range(m):
        Y[C[i]] = np.c_[Y[C[i]], X[i]]

        # Regrouping to nearby centroids
    for k in range(K):
        Y[k+1] = Y[k+1].T

    for k in range(K):
        Centroids[k,:] = np.mean(Y[k+1], axis=0)
    Output = Y

plt.scatter(X[:,0], X[:,1], c='black',label='unclustered data')
plt.xlabel('Income')
plt.ylabel('# transactions')
plt.legend()
plt.title('Plot of data points')
plt.show()

color = ['red', 'blue', 'green', 'cyan', 'magenta']
labels = ['cluster' + str(i+1) for i in range(K)]
for k in range(K):
    plt.scatter(Output[k+1][:,0], Output[k+1][:,1], c=color[k], label=labels[k])
plt.scatter(Centroids[:,0], Centroids[:,1],s=30, c='yellow',label='Centroids')
plt.xlabel('Income')
plt.ylabel('# transactions')
plt.legend()
plt.show()

def Kmeans(X, K, n_iter):
    Centroids = np.array([]).reshape(n, 0)
    idx = np.random.randint(0, m-1, size=K)
    Centroids = X[idx, :]
    for _ in range(n_iter):
        distance = np.array([]).reshape(m, 0)
        for k in range(K):
            tempDist = np.sum((X-Centroids[k, :])**2, axis=1)
            distance = np.c_[distance, tempDist]

        C = np.argmin(distance, axis=1)+1

        Y = {}
        for k in range(K):
            Y[k+1] = np.array([]).reshape(n, 0)
        for i in range(m):
            Y[C[i]] = np.c_[Y[C[i]], X[i]]

            # Regrouping to nearby centroids
        for k in range(K):
            Y[k+1] = Y[k+1].T

        for k in range(K):
            Centroids[k,:] = np.mean(Y[k+1], axis=0)
        Output = Y
    return Output, Centroids

WCSS_array = []
for K in range(1,11):
    Output, Centroids = Kmeans(X,K,n_iter)
    wcss = 0
    for k in range(K):
        wcss += np.sum((Output[k+1] - Centroids[k,:])**2)
    WCSS_array.append(wcss)

K_array = np.arange(1,11,1)
plt.plot(K_array, WCSS_array)
plt.xlabel('Number of clusters')
plt.ylabel('WCSS value')
plt.title('Elbow method for optimum # clusters')
plt.show()

Optimum number of clusters is 5

matriculus/K-Means-Cluster-from-Scratch

K-means Cluster