PCA File
PCA File
PCA File
Modules Needed:
# Environment check: import the core libraries used in this notebook and
# report the interpreter and library versions, so results can be reproduced.
# (`import` must be lowercase; `Import` is a syntax error.)
import sys

import matplotlib
import numpy as np
import pandas as pd
import sklearn

print('Python : {}'.format(sys.version))
print('Pandas : {}'.format(pd.__version__))
print('Numpy : {}'.format(np.__version__))
print('Scikit_learn : {}'.format(sklearn.__version__))
print('Matplotlib : {}'.format(matplotlib.__version__))
output:
Python : 3.7.4 (default, Aug 9 2019, 18:34:13) [MSC v.1915 64 bit (AMD64)]
Pandas : 0.25.1
Numpy : 1.16.5
Scikit_learn : 0.22.1
Matplotlib : 3.1.1
In [6]:
import pandas as pd
from sklearn import datasets

# Load the Iris data set: `features` holds the measurements and `target`
# holds the class label of each sample.
iris = datasets.load_iris()
features = iris.data
target = iris.target

# Wrap the feature matrix in a DataFrame with named columns; `df` was used
# below without ever being defined.
df = pd.DataFrame(features, columns=iris.feature_names)

# Show the table dimensions and the first 20 rows.
print(df.shape)
print(df.head(20))
(150, 4)
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
5 5.4 3.9 1.7 0.4
6 4.6 3.4 1.4 0.3
7 5.0 3.4 1.5 0.2
8 4.4 2.9 1.4 0.2
9 4.9 3.1 1.5 0.1
10 5.4 3.7 1.5 0.2
11 4.8 3.4 1.6 0.2
12 4.8 3.0 1.4 0.1
13 4.3 3.0 1.1 0.1
14 5.8 4.0 1.2 0.2
15 5.7 4.4 1.5 0.4
16 5.4 3.9 1.3 0.4
17 5.1 3.5 1.4 0.3
18 5.7 3.8 1.7 0.3
19 5.1 3.8 1.5 0.3
In [10]:
# Print pandas' summary statistics for every column of df.
print(df.describe())
output:
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

# Draw pairwise scatter plots of all feature columns (histograms on the
# diagonal) to inspect how the features relate to each other.
scatter_matrix(df)
plt.show()
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Elbow method: fit K-Means for k = 1..30 on the full feature table and
# record the average within-cluster sum of squares (AWCSS) for each k.
X = []  # number of clusters tried
Y = []  # AWCSS obtained for that number of clusters
for i in range(1, 31):
    kmeans = KMeans(n_clusters=i)
    kmeans.fit(df)
    X.append(i)
    # inertia_ is the total within-cluster sum of squares; dividing by the
    # number of rows gives the per-sample average.
    awcss = kmeans.inertia_ / df.shape[0]
    Y.append(awcss)

# Plot AWCSS against k; the "elbow" of the curve suggests a suitable k.
plt.plot(X, Y, 'bo-')
plt.xlim((1, 30))
plt.xlabel('Number of Clusters')
plt.ylabel('Average Within-Cluster Sum of Squares')
plt.show()
output:
from sklearn.decomposition import PCA

# Fit a two-component PCA on the feature table and project every sample
# onto those two principal components.
pca = PCA(n_components=2)
pc = pca.fit_transform(df)

# Report the shape of the projected data and preview its first ten rows.
print(pc.shape)
print(pc[:10])
output:
(150, 2)
[[-2.68412563 0.31939725]
[-2.71414169 -0.17700123]
[-2.88899057 -0.14494943]
[-2.74534286 -0.31829898]
[-2.72871654 0.32675451]
[-2.28085963 0.74133045]
[-2.82053775 -0.08946138]
[-2.62614497 0.16338496]
[-2.88638273 -0.57831175]
[-2.6727558 -0.11377425]]
from sklearn.cluster import KMeans

# Cluster the PCA-reduced samples into three groups.
kmeans = KMeans(n_clusters=3)
kmeans.fit(pc)
output:
import matplotlib.pyplot as plt
import numpy as np

# Build a dense grid over the PCA plane and let the fitted model label every
# grid point, so the cluster regions can be drawn as a background image.
# NOTE(review): Z, xx, yy and the x/y limits were referenced but never defined
# in this file; the grid step (0.02) and the 1-unit margin are reconstructed
# choices -- confirm against the original notebook.
h = 0.02
x_min, x_max = pc[:, 0].min() - 1, pc[:, 0].max() + 1
y_min, y_max = pc[:, 1].min() - 1, pc[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

# Background: one colour per predicted cluster region.
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.tab20c,
           aspect='auto', origin='lower')

# Foreground: the samples, coloured by their true class label.
# Each class is plotted at (pc1, pc2); the original plotted class 0 at
# (pc1, pc1), which placed those points on the diagonal.
for label, style in ((0, 'g.'), (1, 'b.'), (2, 'r.')):
    members = pc[target == label]
    plt.plot(members[:, 0], members[:, 1], style, markersize=10)

# Mark the cluster centres with white crosses drawn above everything else.
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=250, linewidth=4,
            color='w', zorder=10)

plt.title('K-Means Clustering on PCA-Reduced Iris Data Set')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xlabel('pca1')
plt.ylabel('pca2')
# Hide tick marks: the principal-component scales are not meaningful here.
plt.xticks(())
plt.yticks(())
plt.show()
output:
from sklearn import metrics
from sklearn.cluster import KMeans

# Fit one model on the original features and one on the PCA-reduced data so
# the two clusterings can be compared against the true class labels.
# `kmeans1` was referenced below but never created.
kmeans1 = KMeans(n_clusters=3)
kmeans1.fit(features)
kmeans2 = KMeans(n_clusters=3)
kmeans2.fit(pc)

# Scores for the clustering of the original (non-reduced) features.
print('Non Reduced Data')
print('Homogeneity : {}'. format(metrics.homogeneity_score(target, kmeans1.labels_)))
print('Completeness : {}'. format(metrics.completeness_score(target, kmeans1.labels_)))
# V-measure has its own scorer; the original printed homogeneity again.
print('V-measure : {}'. format(metrics.v_measure_score(target, kmeans1.labels_)))

# Scores for the clustering of the PCA-reduced data; this section must use
# kmeans2, otherwise it just repeats the non-reduced numbers.
print('Reduced Data')
print('Homogeneity : {}'. format(metrics.homogeneity_score(target, kmeans2.labels_)))
print('Completeness : {}'. format(metrics.completeness_score(target, kmeans2.labels_)))
print('V-measure: {}'. format(metrics.v_measure_score(target, kmeans2.labels_)))
Output: