
Desafio proposto pela B2W, no qual deve ser feita uma clusterização dos dados.

Primary Language: Jupyter Notebook


Desafio proposto pela B2W, no qual deve ser feita uma clusterização dos dados. Resolvido pela aplicação de K-Means. Localizado em https://github.com/B2W-BIT/We-are-Hiring/tree/master/desafios-ia-front/clusters-sapatos

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import preprocessing
from sklearn.cluster import KMeans
# Remote copy of the shoes dataset. It is read as JSON lines, i.e. every line
# of the file is parsed as one record.
file_url = (
    "https://raw.githubusercontent.com/B2W-BIT/We-are-Hiring/master/"
    "desafios-ia-front/clusters-sapatos/shoes.json"
)
data = pd.read_json(file_url, lines=True, orient="records")

Primeira etapa - Análise dos dados

brand categories colors condition id isSale price
0 Naturalizer [Clothing, Heels, All Women's Shoes, Shoes, Sa... [Silver, Cream] USED AVpfEf_hLJeJML431ueH False 55.990
1 MUK LUKS [Clothing, All Women's Shoes, Women's Casual S... [Grey] NEW AVpi74XfLJeJML43qZAc True 41.125
2 MUK LUKS [Clothing, All Women's Shoes, Women's Casual S... [Grey] NEW AVpi74XfLJeJML43qZAc False 35.250
3 MUK LUKS [Clothing, All Women's Shoes, Shoes, Women's S... [Black] NEW AVpjXyCc1cnluZ0-V-Gj False 24.750
4 MUK LUKS [Clothing, All Women's Shoes, Shoes, Women's S... [Grey] NEW AVphGKLPilAPnD_x1Nrm True 31.695
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
brand         10000 non-null object
categories    10000 non-null object
colors        10000 non-null object
condition     10000 non-null object
id            10000 non-null object
isSale        10000 non-null bool
price         10000 non-null float64
dtypes: bool(1), float64(1), object(5)
memory usage: 478.6+ KB
# Discard every row that has a missing value in any column.
data = data.dropna()

# Distinct product ids and the number of rows each one owns
# (value_counts() lists the most frequent value first).
_id_freq = data['id'].value_counts()
product_id = _id_freq.index.tolist()
product_counts = _id_freq.tolist()

# Distinct values of the 'colors' column and how many rows carry each of them.
_colors_freq = data['colors'].value_counts()
colors_names = _colors_freq.index.tolist()
colors_counts = _colors_freq.tolist()
# Collect every distinct colour name found inside the entries of colors_names.
# NOTE(review): the body of the innermost `if` is missing from this copy of the
# notebook, so the loop is not valid Python as shown. Presumably it appended
# color_name to colors_list -- restore it from the original notebook.
colors_list = []

for colors in colors_names:
    for color_name in colors:
        if color_name not in colors_list:
# Distinct values of the 'categories' column and how many rows carry each of
# them (most frequent first).
_categories_freq = data['categories'].value_counts()
categories_names = _categories_freq.index.tolist()
categories_counts = _categories_freq.tolist()
# Collect every distinct category name found inside the entries of
# categories_names.
# NOTE(review): the body of the innermost `if` is missing from this copy of the
# notebook, so the loop is not valid Python as shown. Presumably it appended
# category_name to categories_list -- restore it from the original notebook.
categories_list = []

for categories in categories_names:
    for category_name in categories:
        if category_name not in categories_list:

Repetindo a análise para as condições:

# Distinct values of the 'condition' column and how many rows carry each of
# them (most frequent first).
_condition_freq = data['condition'].value_counts()
condition_names = _condition_freq.index.tolist()
condition_counts = _condition_freq.tolist()

# No flattening step here: the list of distinct conditions is reused as-is
# (condition_list is the same list object as condition_names).
condition_list = condition_names

As condições são bem mais simples e só possuem duas alternativas, novo ou usado.

Segunda Etapa - Condicionamento das entradas

Agora precisamos criar apenas uma entrada para cada id, combinando as cores, condições e categorias. Como muitos dados possuem cores e categorias repetidas, primeiro esses dados serão filtrados para que depois sejam codificados.

# For every product id, walk the colour lists of all rows that share that id
# and look for colour names not yet recorded for the product.
# NOTE(review): the loop is cut off after the innermost `if` in this copy of
# the notebook; the statements that fill product_color, product_color_list and
# all_color_list are not visible. Restore them from the original notebook.
product_color_list = []
all_color_list = []
for product in product_id:
    product_color = []
    for colors in data[data['id'] == str(product)]['colors']:
        for name in colors:
            if name not in product_color:
# For every product id, walk the category lists of all rows that share that id
# and look for category names not yet recorded for the product.
# NOTE(review): the loop is cut off after the innermost `if` in this copy of
# the notebook; the statements that fill product_category,
# product_category_list and all_category_list are not visible. Restore them
# from the original notebook.
product_category_list = []
all_category_list = []
for product in product_id:
    product_category = []
    for categories in data[data['id'] == str(product)]['categories']:
        for name in categories:
            if name not in product_category:

# Frequency of every category name collected in all_category_list
# (value_counts() lists the most frequent name first).
# The original cell built this frame and both lists twice with identical
# statements; they are now computed once.
all_category_listDF = pd.DataFrame(data=all_category_list, columns=['categories'])
_separated_freq = all_category_listDF['categories'].value_counts()
categories_names_separeted = _separated_freq.index.tolist()
categories_counts_separated = _separated_freq.tolist()

# Reorder the names so the most frequent one sits in the middle: the
# even-position names in reverse order, followed by the odd-position names.
# The previous expression, names[-1::-2] + names[1::2], produced this ordering
# only when the number of names was odd. With an even number it listed the
# odd-position names twice and dropped the even-position ones. The form below
# gives the same result for odd lengths and a proper permutation for even ones.
# NOTE(review): the "peak in the middle" intent is inferred from the slicing
# pattern -- confirm against wherever categoriesOrdered is consumed.
categoriesOrdered = categories_names_separeted[::2][::-1] + categories_names_separeted[1::2]
# For every product id, walk the condition values of all rows that share that
# id and look for conditions not yet recorded for the product.
# NOTE(review): the loop is cut off after the `if` in this copy of the
# notebook; the statements that fill product_condition and
# product_condition_list are not visible. Restore them from the original
# notebook.
product_condition_list = []
for product in product_id:
    product_condition = []
    for condition in data[data['id'] == str(product)]['condition']:
        if condition not in product_condition:
# One row per product id, pairing the id with the entries found at the same
# position in the three per-product lists.
_cluster_columns = ['id', 'colors', 'categories', 'conditions']
_cluster_rows = list(
    zip(
        product_id,
        product_color_list,
        product_category_list,
        product_condition_list,
    )
)
cluster_data = pd.DataFrame(data=_cluster_rows, columns=_cluster_columns)
id colors categories conditions
0 AV46uYe4glJLPUi8VW57 [Pure, Blackchrome-Aurora] [Clothing, Women's Athletic Shoes, Womens, Wom... [USED]
1 AV2cZ7SYGV-KLJ3al3p6 [Stone, Dark, Black, Luxe, Silver] [Heels, Pumps, Clothing, LifeStride, All Women... [USED]
2 AVpfn5s3LJeJML43ArJE [White] [Clothing, Womens, All Women's Shoes, Women's ... [USED]
3 AV-nbnIEKZqtpbFMSzYa [NA] [Heels, Pumps, Shoes, Simply Vera Vera Wang, W... [USED]
4 AVpidVDz1cnluZ0-Nd4_ [Brown] [Clothing, All Women's Shoes, Women's Shoes, S... [USED]

Agora que os dados estão prontos, será feita a codificação.

def create_col_labels(df, col_name):
    """Add an integer ``<col_name>_labels`` column to *df*, in place.

    Every cell of ``df[col_name]`` is first turned into a hashable key:
    cells whose length is greater than one become tuples, all remaining
    cells are joined into one string with ``', '``. The keys are then
    ranked by how often they occur and that rank is stored as the label,
    so the most frequent key gets label 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that receives the new column. ``df[col_name]`` itself is not
        modified, because the work is done on a copy.
    col_name : str
        Name of the column to encode. Its cells are assumed to be lists of
        strings (the join step requires string items).

    Returns
    -------
    None
        The result is written to ``df[col_name + '_labels']``.
    """
    keys = df[col_name].copy()

    # Converting a cell to a tuple keeps its length, so one mask serves
    # both replacement passes.
    has_many = keys.str.len() > 1
    keys.update(keys[has_many].apply(tuple))
    keys.update(keys[~has_many].apply(', '.join))

    # Category order = descending frequency; with sort=True the factorized
    # codes follow that category order.
    by_frequency = keys.value_counts().index.tolist()
    ranked = pd.Categorical(keys, categories=by_frequency, ordered=True)
    codes = pd.factorize(ranked, sort=True)[0]

    df[col_name + '_labels'] = codes
id colors categories conditions categories_labels colors_labels conditions_labels
0 AV46uYe4glJLPUi8VW57 [Pure, Blackchrome-Aurora] [Clothing, Women's Athletic Shoes, Womens, Wom... [USED] 108 39 0
1 AV2cZ7SYGV-KLJ3al3p6 [Stone, Dark, Black, Luxe, Silver] [Heels, Pumps, Clothing, LifeStride, All Women... [USED] 189 189 0
2 AVpfn5s3LJeJML43ArJE [White] [Clothing, Womens, All Women's Shoes, Women's ... [USED] 162 4 0
3 AV-nbnIEKZqtpbFMSzYa [NA] [Heels, Pumps, Shoes, Simply Vera Vera Wang, W... [USED] 83 0 0
4 AVpidVDz1cnluZ0-Nd4_ [Brown] [Clothing, All Women's Shoes, Women's Shoes, S... [USED] 150 3 0

Terceira Etapa - Clusterização

Por fim, iremos tentar agrupar os nossos dados a fim de encontrar 4 clusters de acordo com as semelhanças que forem encontradas.

# Fit K-Means on the three encoded feature columns. random_state is fixed so
# that repeated runs start from the same initialisation.
_feature_columns = ['categories_labels', 'colors_labels', 'conditions_labels']
kmeans = KMeans(n_clusters=num_clusters, random_state=0)
kmeans.fit(cluster_data[_feature_columns])

# Keep the cluster assigned to each product next to its features, and the
# centroid coordinates for reporting and plotting.
cluster_data['KMeans_labels'] = kmeans.labels_
cluster_centers = kmeans.cluster_centers_
# Print, for every cluster, its index and the three centroid coordinates
# (labelled x, y, z), followed by a line with the mean and standard deviation
# of the prices.
# NOTE(review): the last print() call is cut off after `.format(` in this copy
# of the notebook; its arguments (the price mean and standard deviation) are
# not visible. Restore them from the original notebook.
for n_cluster in range(num_clusters):
    print('Para o Cluster {}:'.format(n_cluster))
    print('Centro em x: {:.3f}, y: {:.3f}, z: {:.3f}'.format(cluster_centers[n_cluster,0],cluster_centers[n_cluster,1],cluster_centers[n_cluster,2]))
    print('Média dos preços é de {:.2f}, com o desvio padrão de {:.2f}\n'.format(
Para o Cluster 0:
Centro em x: 145.000, y: 122.462, z: 0.231
Média dos preços é de 60.28, com o desvio padrão de 31.29

Para o Cluster 1:
Centro em x: 18.074, y: 4.811, z: 0.200
Média dos preços é de 58.80, com o desvio padrão de 15.32

Para o Cluster 2:
Centro em x: 131.366, y: 6.149, z: 0.050
Média dos preços é de 60.14, com o desvio padrão de 16.16

Para o Cluster 3:
Centro em x: 12.224, y: 137.935, z: 0.916
Média dos preços é de 74.47, com o desvio padrão de 34.37
# Draw the clustered products in 3D -- one axis per encoded feature
# (categories, colours, conditions) -- together with each cluster's centroid.
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')

plot_color = ['r', 'b', 'g', 'm']
plot_marker = ['D', 'o', 's', 'X']

# NOTE: zip() stops at the shortest input, so at most four clusters (the
# number of colours/markers listed above) are drawn.
for c, m, cluster_label in zip(plot_color, plot_marker, range(num_clusters)):
    # Ids of the products assigned to this cluster. Computed once per
    # iteration instead of being rebuilt for every coordinate and statistic.
    cluster_ids = cluster_data[cluster_data['KMeans_labels'] == cluster_label]['id']

    members = cluster_data[cluster_data['id'].isin(cluster_ids)]
    xs = members['categories_labels']
    ys = members['colors_labels']
    zs = members['conditions_labels']

    # Price statistics come from the raw rows in `data`, matched by id.
    cluster_prices = data[data['id'].isin(cluster_ids)]['price']
    mean_price = cluster_prices.mean()
    std_price = cluster_prices.std()

    # Raw strings: '\m' and '\d' are not valid escape sequences, so the
    # non-raw literals triggered an invalid-escape warning. The resulting
    # legend text is unchanged.
    points_label = (
        'Cluster ' + str(cluster_label)
        + r' (Preço: $\mu = $' + str(round(mean_price, 2))
        + r' e $\delta = $' + str(round(std_price, 2))
        + ')'
    )
    ax.scatter(xs, ys, zs, c=c, marker=m, label=points_label)

    # Centroid of the cluster: same colour, larger and semi-transparent.
    ax.scatter(
        cluster_centers[cluster_label, 0],
        cluster_centers[cluster_label, 1],
        cluster_centers[cluster_label, 2],
        c=c,
        s=200,
        alpha=0.5,
        label='Centro do cluster ' + str(cluster_label),
    )

ax.set_title('Women Shoes Cluster Plot')
ax.legend(bbox_to_anchor=(1.05, .7), loc='upper left', borderaxespad=0.)

