PCA File

Download as docx, pdf, or txt
Download as docx, pdf, or txt
You are on page 1of 7

Principal Component Analysis(PCA)

Principal Component Analysis (PCA) is a statistical procedure that uses an


orthogonal transformation which converts a set of correlated variables to a set of
uncorrelated variables. PCA is a most widely used tool in exploratory data analysis and
in machine learning for predictive models. Moreover, PCA is an unsupervised statistical
technique used to examine the interrelations among a set of variables. It is also known
as a general factor analysis where regression determines a line of best fit.

Module Needed:
Import sys
Import pandas as pd
Import numpy as np
Import matplotlib

print('Python : {}'.format(sys.version))

print('Pandas : {}'.format(pd. __version__))

print('Numpy : {}'.format(np. __version__))

print('Scikit_learn : {}'.format(sklearn. __version__))

print('Matplotlib : {}'.format(matplotlib. __version__))

output:

Python : 3.7.4 (default, Aug 9 2019, 18:34:13) [MSC v.1915 64 bit (AMD64)]
Pandas : 0.25.1
Numpy : 1.16.5
Scikit_learn : 0.22.1
Matplotlib : 3.1.1
In [6]:
from sklearn import datasets

iris = datasets.load_iris()

features = iris.data

target = iris.target

print(df.shape)

print(df.head(20))

(150, 4)
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
5 5.4 3.9 1.7 0.4
6 4.6 3.4 1.4 0.3
7 5.0 3.4 1.5 0.2
8 4.4 2.9 1.4 0.2
9 4.9 3.1 1.5 0.1
10 5.4 3.7 1.5 0.2
11 4.8 3.4 1.6 0.2
12 4.8 3.0 1.4 0.1
13 4.3 3.0 1.1 0.1
14 5.8 4.0 1.2 0.2
15 5.7 4.4 1.5 0.4
16 5.4 3.9 1.3 0.4
17 5.1 3.5 1.4 0.3
18 5.7 3.8 1.7 0.3
19 5.1 3.8 1.5 0.3
In [10]:
print(df.describe())

output:

sepal length (cm) sepal width (cm) petal length (cm) \


count 150.000000 150.000000 150.000000
mean 5.843333 3.057333 3.758000
std 0.828066 0.435866 1.765298
min 4.300000 2.000000 1.000000
25% 5.100000 2.800000 1.600000
50% 5.800000 3.000000 4.350000
75% 6.400000 3.300000 5.100000
max 7.900000 4.400000 6.900000

petal width (cm)


count 150.000000
mean 1.199333
std 0.762238
min 0.100000
25% 0.300000
50% 1.300000
75% 1.800000
max 2.500000
from pandas.plotting import scatter_matrix

import matplotlib.pyplot as plt

scatter_matrix(df)

plt.show()

<Figure size 640x480 with 16 Axes>

from sklearn.cluster import KMeans

X = []

Y = []

for i in range(1,31):

kmeans = KMeans(n_clusters = i)

kmeans.fit(df)

X.append(i)

awcss = kmeans.inertia_/df.shape[0]

Y.append(awcss)

import matplotlib.pyplot as plt

plt.plot(X,Y,'bo-')

plt.xlim((1,30))

plt.xlabel('Number of Clusters')

plt.ylabel('Average Within_Cluster Sum of Squares')

plt.title('K-Means Clustering Elbow Method')

plt.show()

output:
from sklearn.decomposition import PCA

from sklearn import preprocessing

pca = PCA(n_components = 2)

pc = pca.fit_transform(df)

print(pc.shape)

print(pc[:10])

output:

(150, 2)
[[-2.68412563 0.31939725]
[-2.71414169 -0.17700123]
[-2.88899057 -0.14494943]
[-2.74534286 -0.31829898]
[-2.72871654 0.32675451]
[-2.28085963 0.74133045]
[-2.82053775 -0.08946138]
[-2.62614497 0.16338496]
[-2.88638273 -0.57831175]
[-2.6727558 -0.11377425]]

kmeans = KMeans(n_clusters = 3)
kmeans.fit(pc)

output:

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,


n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
random_state=None, tol=0.0001, verbose=0)
h = 0.02
x_min, x_max = pc[:,0].min()-1,pc[:,1].max()+1
y_min, y_max = pc[:,1].min()-1,pc[:,1].max()+1
xx,yy = np.meshgrid(np.arange(x_min,x_max, h), np.arange(y_min,y_max, h))
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.figure(figsize=(12,12))
plt.clf()

plt.imshow(Z,interpolation='nearest',
extent = (xx.min(),xx.max(),yy.min(),yy.max()),
cmap=plt.cm.tab20c,
aspect = 'auto', origin = 'lower')
for i, point in enumerate(pc):
if target[i]==0:
plt.plot(point[0],point[0],'g.',markersize = 10)
if target[i]==1:
plt.plot(point[0],point[1],'b.',markersize = 10)
if target[i]==2:
plt.plot(point[0],point[1],'r.',markersize = 10)
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:,0], centroids[:,1], marker = 'x', s = 250, linewidth = 4,
color = 'w',zorder = 10)
plt.title('K-Means Clustering on PCA-Reduced Iris Data Set')
plt.xlim(x_min,x_max)
plt.ylim(y_min,y_max)
plt.xlabel('pca1')
plt.ylabel('pca2')
plt.xticks(())
plt.yticks(())
plt.show()
output:

from sklearn import metrics


kmeans1 = KMeans(n_clusters=3)
kmeans1.fit(features)

kmeans2 = KMeans(n_clusters = 3)
kmeans2.fit(pc)
print('Non Reduced Data')
print('Homogeneity : {}'. format(metrics.homogeneity_score(target,kmeans1.labels_)))
print('Completeness : {}'. format(metrics.completeness_score(target,kmeans1.labels_)))
print('V-measure : {}'. format(metrics.homogeneity_score(target,kmeans1.labels_)))

print('Reduced Data')
print('Homogeneity : {}'. format(metrics.homogeneity_score(target,kmeans1.labels_)))
print('Completeness : {}'. format(metrics.completeness_score(target,kmeans1.labels_)))
print('V-measure: {}'. format(metrics.homogeneity_score(target,kmeans1.labels_)))
Output:

Non Reduced Data


Homogeneity : 0.7514854021988339
Completeness : 0.7649861514489816
V-measure : 0.7514854021988339
Reduced Data
Homogeneity : 0.7514854021988339
Completeness : 0.7649861514489816
V-measure: 0.7514854021988339

You might also like