KMeans Clustering Bidimensional Daniel Ames Camayo



June 7, 2024

1 LAB GUIDE
1.1 SURNAMES, First names: Ames Camayo Daniel Vides
Date: July 7, 2024

2 STANDARD LIBRARIES:
[ ]: import pandas as pd
import matplotlib.pyplot as plt

3 CUSTOMIZED LIBRARIES:
[ ]: from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

4 DATA EXTRACTION:
[ ]: pd.Timestamp.today().strftime('%Y-%m-%d %H:%M:%S')  # Capture the current date and time

[ ]: '2024-06-07 05:59:10'

[ ]: from google.colab import files


uploaded = files.upload()

<IPython.core.display.HTML object>
Saving ClusteringBidimensional.csv to ClusteringBidimensional (4).csv

[ ]: # Load the dataset
df = pd.read_csv('ClusteringBidimensional.csv')
df

[ ]: Index Semilla Risk Return Cluster
0 1 0.5835 99 0.0374 2
1 2 0.1800 10 0.1173 1
2 3 0.0119 28 0.1439 1
3 4 0.5362 76 0.2878 0
4 5 0.8096 14 0.1641 1
5 6 0.5323 35 0.1502 1
6 7 0.7870 90 0.0360 2
7 8 0.9125 31 0.1158 1
8 9 0.6307 91 0.0601 2
9 10 0.8131 22 0.0595 1
10 11 0.7874 17 0.1261 1
11 12 0.0579 70 0.0996 2
12 13 0.3477 37 0.0843 1
13 14 0.1617 24 0.0644 1
14 15 0.9018 81 0.3518 0
15 16 0.8550 32 0.1218 1
16 17 0.9065 22 0.1765 1
17 18 0.2867 63 0.0383 2
18 19 0.7480 28 0.1585 1
19 20 0.7381 99 0.0357 2
20 21 0.4254 22 0.1685 1
21 22 0.0550 70 0.1002 2
22 23 0.7066 19 0.0931 1
23 24 0.1206 35 0.0981 1
24 25 0.5469 75 0.3965 0
25 26 0.7344 18 0.0561 1
26 27 0.8721 17 0.1319 1
27 28 0.2066 82 0.0496 2
28 29 0.1000 28 0.1406 1
29 30 0.5946 98 0.0678 2
30 31 0.1220 32 0.1686 1
31 32 0.5110 82 0.4803 0
32 33 0.2753 86 0.4155 0
33 34 0.7862 80 0.4860 0
34 35 0.1394 94 0.4062 0
35 36 0.6563 70 0.4168 0
36 37 0.4410 74 0.3937 0
37 38 0.4346 85 0.0789 2
38 39 0.8532 74 0.3001 0
39 40 0.0828 69 0.0394 2

5 NumPy ARRAY
[ ]: import numpy as np
array = np.array([1, 2, 3, 4, 5])
print(array)

[1 2 3 4 5]
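The same idea applies to the clustering below: scikit-learn's KMeans ultimately works on a NumPy array extracted from the DataFrame. A minimal sketch (the expected shape assumes the 40-row dataset loaded above):

# Build the feature matrix KMeans will consume: one row per observation,
# one column per feature ('Risk' and 'Return').
X = df[['Risk', 'Return']].to_numpy()
print(X.shape)  # expected: (40, 2) for the dataset loaded above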

6 E.D.A - Exploratory Data Analysis: Descriptive Statistical Analysis
[ ]: df.describe()

[ ]: Index Semilla Risk Return Cluster


count 40.000000 40.000000 40.00000 40.000000 40.000000
mean 20.500000 0.506290 54.47500 0.172925 1.025000
std 11.690452 0.296183 30.08619 0.138622 0.733362
min 1.000000 0.011900 10.00000 0.035700 0.000000
25% 10.750000 0.199950 27.00000 0.066950 0.750000
50% 20.500000 0.541550 66.00000 0.123950 1.000000
75% 30.250000 0.786400 81.25000 0.204325 2.000000
max 40.000000 0.912500 99.00000 0.486000 2.000000

7 Quantitative Univariate Analysis: Analysis of each quantitative variable independently
[ ]: import seaborn as sns
# Overlay the distributions of 'Risk' and 'Return' on the same axes
sns.histplot(df['Risk'])
sns.histplot(df['Return'])

[ ]: <Axes: xlabel='Risk', ylabel='Count'>

8 Qualitative Univariate Analysis: Analysis of each nominal and ordinal categorical variable independently
[ ]: sns.countplot(x='Risk', data=df)  # 'Risk' is numeric but is treated as categorical here

[ ]: <Axes: xlabel='Risk', ylabel='count'>
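Because 'Risk' is numeric with many distinct values, the count plot above is crowded. An optional alternative (not part of the original lab; the 4-bin split is an assumption) is to bin the column first with pd.cut:

# Bin the numeric 'Risk' column into intervals so the count plot
# behaves like a true categorical variable.
risk_bins = pd.cut(df['Risk'], bins=4)  # bins=4 chosen only for illustration
sns.countplot(x=risk_bins)
plt.show()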

9 Bivariate and Multivariate Analysis: Analysis of each variable “x(i)” versus the target variable “y”
[ ]: sns.scatterplot(x='Risk', y='Return', data=df)

[ ]: <Axes: xlabel='Risk', ylabel='Return'>

10 TRANSFORMATION AND/OR CLEANING - PREPROCESSING: ETL
[ ]: # Keep an unscaled copy, then rescale 'Risk' and 'Return' to the [0, 1] range
df_no_normalizado = df.copy()
scaler = MinMaxScaler(feature_range=(0, 1))
df[['Risk', 'Return']] = scaler.fit_transform(df[['Risk', 'Return']])
df

[ ]: Index Semilla Risk Return Cluster
0 1 0.5835 1.000000 0.003775 2
1 2 0.1800 0.000000 0.181213 1
2 3 0.0119 0.202247 0.240284 1
3 4 0.5362 0.741573 0.559849 0
4 5 0.8096 0.044944 0.285143 1
5 6 0.5323 0.280899 0.254275 1
6 7 0.7870 0.898876 0.000666 2
7 8 0.9125 0.235955 0.177881 1
8 9 0.6307 0.910112 0.054186 2
9 10 0.8131 0.134831 0.052854 1
10 11 0.7874 0.078652 0.200755 1
11 12 0.0579 0.674157 0.141905 2
12 13 0.3477 0.303371 0.107928 1
13 14 0.1617 0.157303 0.063735 1
14 15 0.9018 0.797753 0.701976 0
15 16 0.8550 0.247191 0.191206 1
16 17 0.9065 0.134831 0.312680 1
17 18 0.2867 0.595506 0.005774 2
18 19 0.7480 0.202247 0.272707 1
19 20 0.7381 1.000000 0.000000 2
20 21 0.4254 0.134831 0.294915 1
21 22 0.0550 0.674157 0.143238 2
22 23 0.7066 0.101124 0.127471 1
23 24 0.1206 0.280899 0.138574 1
24 25 0.5469 0.730337 0.801244 0
25 26 0.7344 0.089888 0.045303 1
26 27 0.8721 0.078652 0.213635 1
27 28 0.2066 0.808989 0.030868 2
28 29 0.1000 0.202247 0.232956 1
29 30 0.5946 0.988764 0.071286 2
30 31 0.1220 0.247191 0.295137 1
31 32 0.5110 0.808989 0.987342 0
32 33 0.2753 0.853933 0.843438 0
33 34 0.7862 0.786517 1.000000 0
34 35 0.1394 0.943820 0.822785 0
35 36 0.6563 0.674157 0.846325 0
36 37 0.4410 0.719101 0.795026 0
37 38 0.4346 0.842697 0.095936 2
38 39 0.8532 0.719101 0.587164 0
39 40 0.0828 0.662921 0.008217 2
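As a sanity check on the scaling step, min-max normalization maps each value to (x - min) / (max - min). The short sketch below recomputes the scaled 'Risk' column by hand from the unscaled copy df_no_normalizado and should match the values shown above:

# Reproduce MinMaxScaler by hand for the 'Risk' column using the unscaled copy.
risk = df_no_normalizado['Risk']
risk_manual = (risk - risk.min()) / (risk.max() - risk.min())
print(risk_manual.head())   # should match df['Risk'].head() after scaling
print(df['Risk'].head())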

11 MODEL(S): Modeling: Create the model, FIT (adjust/train), PREDICT (generate fitted values and/or predictions)
[ ]: num_clusters = 3
kmeans = KMeans(n_clusters=num_clusters)
kmeans.fit(df[['Risk', 'Return']])

# Get the cluster labels
labels = kmeans.labels_
df['Cluster_identificado'] = labels

# Get the cluster centers
centroids = kmeans.cluster_centers_

/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870:
FutureWarning: The default value of `n_init` will change from 10 to 'auto' in
1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
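The FutureWarning comes from the default n_init. As the message suggests, passing it explicitly keeps the behavior stable across scikit-learn versions; the sketch below uses n_init=10 (the pre-1.4 default) and a hypothetical variable name so it does not overwrite the model fitted above:

# Equivalent fit with n_init set explicitly to silence the FutureWarning.
# 'kmeans_explicit' is a hypothetical name used only for this sketch.
kmeans_explicit = KMeans(n_clusters=num_clusters, n_init=10)
kmeans_explicit.fit(df[['Risk', 'Return']])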

12 EVALUATE MODELS: Metrics


[ ]: from sklearn.metrics import silhouette_score
score = silhouette_score(df[['Risk', 'Return']], labels)
print(f'Silhouette Score: {score}')

Silhouette Score: 0.7281645881252201
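The silhouette score evaluates only the single choice num_clusters = 3. A common complement, not included in the original run, is the elbow method: fit KMeans for several values of k and plot the inertia. A minimal sketch (the k range 2 to 7 and n_init=10 are assumptions):

# Elbow method: inertia (within-cluster sum of squares) for several k.
inertias = []
k_values = range(2, 8)  # illustrative range
for k in k_values:
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km.fit(df[['Risk', 'Return']])
    inertias.append(km.inertia_)

plt.plot(list(k_values), inertias, marker='o')
plt.xlabel('Number of clusters k')
plt.ylabel('Inertia')
plt.title('Elbow Method')
plt.show()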

13 INTERPRETATION: Of the results
[ ]: # Plot the clusters on a scatter plot
plt.scatter(df['Risk'], df['Return'], c=df['Cluster_identificado'], cmap='viridis')
plt.scatter(centroids[:, 0], centroids[:, 1], marker='o', s=200, c='red')


plt.xlabel('Risk')
plt.ylabel('Return')
plt.title('Clustering Result')
plt.show()
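KMeans assigns cluster numbers arbitrarily, so the labels in 'Cluster_identificado' need not use the same IDs as the original 'Cluster' column. A quick way to check the correspondence (a sketch, not in the original run) is a contingency table:

# Cross-tabulate the original labels against the labels found by KMeans.
# Agreement shows up as one dominant count per row, not necessarily on the diagonal.
print(pd.crosstab(df['Cluster'], df['Cluster_identificado']))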

[ ]: from scipy.spatial import ConvexHull
from matplotlib.patches import Polygon

[ ]: # Create a figure and axes


fig, ax = plt.subplots()

# Plot the clusters on a scatter plot


scatter = ax.scatter(df['Risk'], df['Return'], c=df['Cluster_identificado'], cmap='viridis')

# Plot the cluster centers as red circles


ax.scatter(centroids[:, 0], centroids[:, 1], marker='o', s=200, c='red')

# Plot the geometric polygon for each cluster


for cluster in range(num_clusters):
    cluster_points = df[df['Cluster_identificado'] == cluster][['Risk', 'Return']].values
    hull = ConvexHull(cluster_points)
    polygon = plt.Polygon(cluster_points[hull.vertices], edgecolor='blue', linewidth=1, fill=None)
    ax.add_patch(polygon)

# Set the labels and title


plt.xlabel('Risk')
plt.ylabel('Return')
plt.title('Clustering Result')

# Create a legend
legend_elements = scatter.legend_elements()[0]
legend_labels = ['Cluster {}'.format(i) for i in range(num_clusters)]
ax.legend(legend_elements, legend_labels, loc='upper left')

# Show the plot


plt.show()

[ ]: # Create a figure and axes


fig, ax = plt.subplots()

# Plot the clusters on a scatter plot


scatter = ax.scatter(df['Risk'], df['Return'], c=df['Cluster'], cmap='viridis')

# Plot the cluster centers as red circles
ax.scatter(centroids[:, 0], centroids[:, 1], marker='o', s=200, c='red')

# Plot the geometric polygon for each cluster


for cluster in range(num_clusters):
    cluster_points = df[df['Cluster'] == cluster][['Risk', 'Return']].values
    hull = ConvexHull(cluster_points)
    polygon = plt.Polygon(cluster_points[hull.vertices], edgecolor='blue', linewidth=1, fill=None)
    ax.add_patch(polygon)

# Set the labels and title


plt.xlabel('Risk')
plt.ylabel('Return')
plt.title('Clustering Result')

# Create a legend
legend_elements = scatter.legend_elements()[0]
legend_labels = ['Cluster {}'.format(i) for i in range(num_clusters)]
ax.legend(legend_elements, legend_labels, loc='upper left')

# Show the plot


plt.show()

[ ]: %%time
# Perform clustering using K-means on the unscaled data
num_clusters = 3
kmeans2 = KMeans(n_clusters=num_clusters)
kmeans2.fit(df_no_normalizado[['Risk', 'Return']])

# Add the cluster labels to the DataFrame


df_no_normalizado['Cluster_identificado'] = kmeans2.labels_

# Get the cluster centers


centroids = kmeans2.cluster_centers_

# Plot the clusters on a scatter plot


plt.scatter(df_no_normalizado['Risk'], df_no_normalizado['Return'],
c=df_no_normalizado['Cluster_identificado'], cmap='viridis')
plt.scatter(centroids[:, 0], centroids[:, 1], marker='o', s=200, c='red')
plt.xlabel('Risk')
plt.ylabel('Return')
plt.title('Clustering Result')
plt.show()

/usr/local/lib/python3.10/dist-packages/sklearn/cluster/_kmeans.py:870:
FutureWarning: The default value of `n_init` will change from 10 to 'auto' in
1.4. Set the value of `n_init` explicitly to suppress the warning
warnings.warn(

CPU times: user 572 ms, sys: 45.8 ms, total: 618 ms
Wall time: 644 ms
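To compare this unnormalized run with the normalized one on equal footing, the same silhouette metric can be computed for kmeans2 (a sketch; this score does not appear in the original output):

# Silhouette score on the unscaled features, for comparison with the
# 0.728 obtained above on the normalized data.
score_no_norm = silhouette_score(df_no_normalizado[['Risk', 'Return']], kmeans2.labels_)
print(f'Silhouette Score (no normalization): {score_no_norm}')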

[ ]: # Create a figure and axes


fig, ax = plt.subplots()

# Plot the clusters on a scatter plot


scatter = ax.scatter(df_no_normalizado['Risk'], df_no_normalizado['Return'],
                     c=df_no_normalizado['Cluster_identificado'], cmap='viridis')

# Plot the cluster centers as red circles


ax.scatter(centroids[:, 0], centroids[:, 1], marker='o', s=200, c='red')

# Plot the geometric polygon for each cluster

for cluster in range(num_clusters):
    cluster_points = df_no_normalizado[df_no_normalizado['Cluster_identificado'] == cluster][['Risk', 'Return']].values
    hull = ConvexHull(cluster_points)
    polygon = plt.Polygon(cluster_points[hull.vertices], edgecolor='blue', linewidth=1, fill=None)
    ax.add_patch(polygon)

# Set the labels and title


plt.xlabel('Risk')
plt.ylabel('Return')
plt.title('Clustering Result')

# Create a legend
legend_elements = scatter.legend_elements()[0]
legend_labels = ['Cluster {}'.format(i) for i in range(num_clusters)]
ax.legend(legend_elements, legend_labels, loc='upper left')

# Show the plot


plt.show()

