Unit 4
Unit 4
Unit 4
using Python
1
• Data Warehouse: A central repository of integrated data from
different sources.
2
Creating Data Frame from an Excelsheet
• read_excel('filepath', 'sheetnumber')
Example
import pandas as pd
import xlrd
df=pd.read_excel(" C:\\Users\\admin\\Desktop\\Pythonprograms.Empdata.xlsx")
print(df)
3
Creating Data Frame from .csv Files
Example:
import pandas as pd
df=pd.read_csv("C:\\python\...\empdata.csv", "Sheet1")
df
4
Creating Data Frame from python Dictionary
Example:
import pandas as pd
empdata={"empid":[1001,1002,1003,1004,1005,1006],
"empname":["Sheela", "Akila", "Shwetha", "Suman", "Arun", "Sujit"],
"salary":[10000,70000,30000,50000,25000,80000],
"doj":["10-10-2000", "23-2-2002", "2-3-2002", "9-10-2000", "10-8-2000", "9-9-1999"]}
df=pd.DataFrame(empdata)
print(df)
6
Operations on Data Frames
import pandas as pd
import xlrd
df=pd.read_excel('C:\\Users\\admin\\Desktop\\Pythonprograms\\Empdata.xlsx')
print(df.shape) (6, 4)
r,c=df.shape
print(df.head())
8
Operations on Data Frames- Cont.
print(df.columns)
Index(['empid', 'empname', 'salary', 'doj'], dtype='object')
print(df.empid)
print(df['empid'])
0 1001
1 1002
2 1003
3 1004
4 1005
5 1006
Name: empid, dtype: int64
9
print(df[['empid', 'empname']])
empid empname
0 1001 Sheela
1 1002 Akila
2 1003 Shwetha
3 1004 Suman
4 1005 Arun
5 1006 Sujit
print(df['salary'].max())
80000
print(df['salary'].min())
10000
10
print(df.describe())
empid salary
count 6.000000 6.000000
Mean 1003.500000 44166.666667
std 1.870829 27279.418371
min 1001.000000 10000.000000
25% 1002.250000 26250.000000
50% 1003.500000 40000.000000
75% 1004.750000 65000.000000
max 1006.000000 80000.000000
11
print(df[df.salary>10000])
empid empname salary doj
1 1002 Akila 70000 2000-10-11
2 1003 Shwetha 30000 2020-10-15
3 1004 Suman 50000 1999-10-13
4 1005 Arun 25000 2000-10-14
5 1006 Sujit 80000 2000-11-15
print(df[df.salary == df.salary.max()])
empid empname salary doj
5 1006 Sujit 80000 2000-11-15
12
print(df[['empid', 'empname']][df.salary>10000])
empid empname
1 1002 Akila
2 1003 Shwetha
3 1004 Suman
4 1005 Arun
5 1006 Sujit
13
Operations on Data Frames- Cont.
print(df.index)
RangeIndex(start=0, stop=6, step=1)
print(df.set_index('empid'))
empname salary doj
empid
1001 Sheela 10000 2000-11-10
1002 Akila 70000 2000-10-11
1003 Shwetha 30000 2020-10-15
1004 Suman 50000 1999-10-13
1005 Arun 25000 2000-10-14
1006 Sujit 80000 2000-11-15
14
print(df.set_index('empid',inplace=True))
print(df.loc[1004])
empname Suman
salary 50000
doj 1999-10-13 00:00:00
Name: 1004, dtype: object
print(df.reset_index(inplace=True))
print(df.sort_values('doj'))
print(df.sort_values('doj', ascending=False))
15
Handling of missing Data(Data cleansing)
import pandas as pd
df=pd.read_csv("C:\\python\...\empdata.csv", "Sheet1")
df
df1=df.fillna(0)
df1=df.fillna({'ename':'Name missing', 'sal':'0', 'doj':'00-00-00'})
df1=df.dropna()
16
Data Visualization
import matplotlib.pyplot as plt
import pandas as pd
import xlrd
df=pd.read_excel('C:\\Users\\admin\\Desktop\\Pythonprograms\\Empdata.xlsx')
x=df['empid']
y=df['salary']
plt.bar(x,y,label='Employee Data', color='green')
plt.xlabel('Employee ids')
plt.ylabel('Employee salaries')
plt.title('XYZ COMPANY')
plt.legend()
plt.show()
Note: Try Bar graph to represent data coming from multiple sources 17
import matplotlib.pyplot as plt
semester=[1,2,3,4,5,6]
sem=[1.3,2.3,3.3,4.3,5.3,6.3]
no_of_boys=[25,40,32,30,28,35]
no_of_girls=[30,30,40,20,35,40]
plt.bar(semester,no_of_boys,label='Boys', color='green',width=0.3)
plt.bar(sem,no_of_girls,label='Girls', color='blue',width=0.3)
plt.xlabel('Semester')
plt.ylabel('No. of students')
plt.title('Student Details')
plt.legend()
plt.show()
18
Data Visualization-Histogram
import matplotlib.pyplot as plt
emp_ages=[22,45,30,59,57,58,56,45,40,43,43,50,40,34,33,25,19]
bins = [0,10,20,30,40,50,60]
plt.hist(emp_ages,bins,histtype='bar',rwidth=0.8, color='blue') #stacked bar, stack step
plt.xlabel('Employee ages')
plt.ylabel('No. of employees')
plt.title('XYZ COMPANY')
plt.legend()
plt.show()
19
Data Visualization-Histogram
import matplotlib.pyplot as plt
emp_ages=[22,45,30,59,57,58,56,45,40,43,43,50,40,34,33,25,19]
bins = [0,10,20,30,40,50,60]
plt.hist(emp_ages,bins,histtype='step',rwidth=0.8, color='blue')
plt.xlabel('Employee ages')
plt.ylabel('No. of employees')
plt.title('XYZ COMPANY')
plt.legend()
plt.show()
20
Data Visualization-Pie Chart
import matplotlib.pyplot as plt
slices=[50,20,15,15]
depts=['Sales', 'Production', 'HR', 'Finance']
cols=['magenta', 'cyan', 'brown', 'gold']
plt.pie(slices,labels=depts,colours=cols,startangle=90,explode=(0,0,0.2,0),
shadow=True,autopct='%.1f%%')
plt.title('XYZ COMPANY')
plt.legend()
plt.show()
21
Data Visualization-Pie Chart
import matplotlib.pyplot as plt
slices=[50,20,15,15]
depts=['Sales', 'Production', 'HR', 'Finance']
cols=['magenta', 'cyan', 'brown', 'gold']
plt.pie(slices,labels=depts,colours=cols,startangle=90,explode=(0,0.4,0,0),
shadow=True,autopct='%.1f%%')
plt.title('XYZ COMPANY')
plt.legend()
plt.show()
22
Data Visualization-Pie Chart
import matplotlib.pyplot as plt
slices=[50,20,15,15]
depts=['Sales', 'Production', 'HR', 'Finance']
cols=['magenta', 'cyan', 'brown', 'gold']
plt.pie(slices,labels=depts,colours=cols,startangle=0,explode=(0,0.4,0,0),
shadow=True,autopct='%.1f%%')
plt.title('XYZ COMPANY')
plt.legend()
plt.show()
23
Data Visualization-Line Graph
import matplotlib.pyplot as plt
years=['2012', '2013', '2014', '2015', '2016', '2017']
profits = [9,10,10.5,8.8,10.9,9.75]
plt.plot(years,profits, 'blue')
plt.xlabel('Years')
plt.ylabel('Profits in million Rs.')
plt.title('XYZ COMPANY')
plt.show()
24
import matplotlib.pyplot as plt
years=['2012', '2013', '2014', '2015', '2016', '2017']
profits1 = [9,10,10.5,8.8,10.9,9.75]
profits2=[8,9,12,12.5,11.5,12]
plt.plot(years,profits1, 'blue',label='XYZ Profit',marker='*')
plt.plot(years,profits2, 'red',label='abc Profit',marker='o')
plt.xlabel('Years')
plt.ylabel('Profits in million Rs.')
plt.title('Profit Data')
plt.legend()
plt.show()
25