Unit 4

Download as pdf or txt
Download as pdf or txt
You are on page 1of 25

Introduction to Data Science

using Python

1
• Data Warehouse: A central repository of integrated data from
different sources.

• Data Science: A term used for techniques to extract information from


the data warehouse, analyse them and present the necessary data to
the business organization in order to arrive at important conclusions
and decisions.

• Data Frame: An object that is useful in representing data in form of


rows and columns.

2
Creating Data Frame from an Excelsheet
• read_excel('filepath', 'sheetnumber')
Example
import pandas as pd
import xlrd
df=pd.read_excel(" C:\\Users\\admin\\Desktop\\Pythonprograms.Empdata.xlsx")
print(df)

empid empname salary doj


0 1001 Sheela 10000 2000-11-10
1 1002 Akila 70000 2000-10-11
2 1003 Shwetha 30000 2020-10-15
3 1004 Suman 50000 1999-10-13
4 1005 Arun 25000 2000-10-14
5 1006 Sujit 80000 2000-11-15

3
Creating Data Frame from .csv Files
Example:
import pandas as pd
df=pd.read_csv("C:\\python\...\empdata.csv", "Sheet1")
df

4
Creating Data Frame from python Dictionary
Example:
import pandas as pd
empdata={"empid":[1001,1002,1003,1004,1005,1006],
"empname":["Sheela", "Akila", "Shwetha", "Suman", "Arun", "Sujit"],
"salary":[10000,70000,30000,50000,25000,80000],
"doj":["10-10-2000", "23-2-2002", "2-3-2002", "9-10-2000", "10-8-2000", "9-9-1999"]}
df=pd.DataFrame(empdata)
print(df)

empid empname salary doj


0 1001 Sheela 10000 10-10-2000
1 1002 Akila 70000 23-2-2002
2 1003 Shwetha 30000 2-3-2002
3 1004 Suman 50000 9-10-2000
4 1005 Arun 25000 10-8-2000
5 1006 Sujit 80000 9-9-1999
5
Creating Data Frame from python List of Tuples
Example:
empid empname salary doj
import pandas as pd 0 1001 Sheela 10000 10-10-2000
empdata=[(1001, "Sheela", 10000, 10-10-2000"), 1 1002 Akila 70000 23-2-2002
(1002, "Akila", 70000, "3-20-2002"), 2 1003 Shwetha 30000 2-3-2002
3 1004 Suman 50000 9-10-2000
(1003, "Shwetha", 30000, "2-3-2002"), 4 1005 Arun 25000 10-8-2000
(1004, "Suman", 50000, "9-10-2000"), 5 1006 Sujit 80000 9-9-1999
(1005, "Arun", 25000, "10-8-2000"),
(1006, "Sujit", 80000, "9-9-1999")]
df=pd.DataFrame(empdata, columns=["eno", "ename", "salary", "doj"])
print(df)

6
Operations on Data Frames
import pandas as pd
import xlrd
df=pd.read_excel('C:\\Users\\admin\\Desktop\\Pythonprograms\\Empdata.xlsx')
print(df.shape) (6, 4)
r,c=df.shape
print(df.head())

empid empname salary doj


0 1001 Sheela 10000 2000-11-10
1 1002 Akila 70000 2000-10-11
2 1003 Shwetha 30000 2020-10-15
3 1004 Suman 50000 1999-10-13
4 1005 Arun 25000 2000-10-14 7
print(df.tail())
print(df.head(2))
print(df.tail(2))
print(df[2:5])
print(df[0::2])
print(df[5:0:-1])

8
Operations on Data Frames- Cont.
print(df.columns)
Index(['empid', 'empname', 'salary', 'doj'], dtype='object')
print(df.empid)
print(df['empid'])

0 1001
1 1002
2 1003
3 1004
4 1005
5 1006
Name: empid, dtype: int64
9
print(df[['empid', 'empname']])
empid empname
0 1001 Sheela
1 1002 Akila
2 1003 Shwetha
3 1004 Suman
4 1005 Arun
5 1006 Sujit

print(df['salary'].max())
80000

print(df['salary'].min())
10000

10
print(df.describe())

empid salary
count 6.000000 6.000000
Mean 1003.500000 44166.666667
std 1.870829 27279.418371
min 1001.000000 10000.000000
25% 1002.250000 26250.000000
50% 1003.500000 40000.000000
75% 1004.750000 65000.000000
max 1006.000000 80000.000000

11
print(df[df.salary>10000])
empid empname salary doj
1 1002 Akila 70000 2000-10-11
2 1003 Shwetha 30000 2020-10-15
3 1004 Suman 50000 1999-10-13
4 1005 Arun 25000 2000-10-14
5 1006 Sujit 80000 2000-11-15

print(df[df.salary == df.salary.max()])
empid empname salary doj
5 1006 Sujit 80000 2000-11-15

12
print(df[['empid', 'empname']][df.salary>10000])
empid empname
1 1002 Akila
2 1003 Shwetha
3 1004 Suman
4 1005 Arun
5 1006 Sujit

13
Operations on Data Frames- Cont.
print(df.index)
RangeIndex(start=0, stop=6, step=1)
print(df.set_index('empid'))
empname salary doj
empid
1001 Sheela 10000 2000-11-10
1002 Akila 70000 2000-10-11
1003 Shwetha 30000 2020-10-15
1004 Suman 50000 1999-10-13
1005 Arun 25000 2000-10-14
1006 Sujit 80000 2000-11-15
14
print(df.set_index('empid',inplace=True))
print(df.loc[1004])
empname Suman
salary 50000
doj 1999-10-13 00:00:00
Name: 1004, dtype: object

print(df.reset_index(inplace=True))
print(df.sort_values('doj'))
print(df.sort_values('doj', ascending=False))

15
Handling of missing Data(Data cleansing)
import pandas as pd
df=pd.read_csv("C:\\python\...\empdata.csv", "Sheet1")
df
df1=df.fillna(0)
df1=df.fillna({'ename':'Name missing', 'sal':'0', 'doj':'00-00-00'})
df1=df.dropna()

16
Data Visualization
import matplotlib.pyplot as plt
import pandas as pd
import xlrd
df=pd.read_excel('C:\\Users\\admin\\Desktop\\Pythonprograms\\Empdata.xlsx')
x=df['empid']
y=df['salary']
plt.bar(x,y,label='Employee Data', color='green')
plt.xlabel('Employee ids')
plt.ylabel('Employee salaries')
plt.title('XYZ COMPANY')
plt.legend()
plt.show()

Note: Try Bar graph to represent data coming from multiple sources 17
import matplotlib.pyplot as plt
semester=[1,2,3,4,5,6]
sem=[1.3,2.3,3.3,4.3,5.3,6.3]
no_of_boys=[25,40,32,30,28,35]
no_of_girls=[30,30,40,20,35,40]
plt.bar(semester,no_of_boys,label='Boys', color='green',width=0.3)
plt.bar(sem,no_of_girls,label='Girls', color='blue',width=0.3)
plt.xlabel('Semester')
plt.ylabel('No. of students')
plt.title('Student Details')
plt.legend()
plt.show()

18
Data Visualization-Histogram
import matplotlib.pyplot as plt
emp_ages=[22,45,30,59,57,58,56,45,40,43,43,50,40,34,33,25,19]
bins = [0,10,20,30,40,50,60]
plt.hist(emp_ages,bins,histtype='bar',rwidth=0.8, color='blue') #stacked bar, stack step
plt.xlabel('Employee ages')
plt.ylabel('No. of employees')
plt.title('XYZ COMPANY')
plt.legend()
plt.show()

bar', 'barstacked', 'step', 'stepfilled'

19
Data Visualization-Histogram
import matplotlib.pyplot as plt
emp_ages=[22,45,30,59,57,58,56,45,40,43,43,50,40,34,33,25,19]
bins = [0,10,20,30,40,50,60]
plt.hist(emp_ages,bins,histtype='step',rwidth=0.8, color='blue')
plt.xlabel('Employee ages')
plt.ylabel('No. of employees')
plt.title('XYZ COMPANY')
plt.legend()
plt.show()

‘bar', 'barstacked', 'step', 'stepfilled'

20
Data Visualization-Pie Chart
import matplotlib.pyplot as plt
slices=[50,20,15,15]
depts=['Sales', 'Production', 'HR', 'Finance']
cols=['magenta', 'cyan', 'brown', 'gold']
plt.pie(slices,labels=depts,colours=cols,startangle=90,explode=(0,0,0.2,0),
shadow=True,autopct='%.1f%%')
plt.title('XYZ COMPANY')
plt.legend()
plt.show()

21
Data Visualization-Pie Chart
import matplotlib.pyplot as plt
slices=[50,20,15,15]
depts=['Sales', 'Production', 'HR', 'Finance']
cols=['magenta', 'cyan', 'brown', 'gold']
plt.pie(slices,labels=depts,colours=cols,startangle=90,explode=(0,0.4,0,0),
shadow=True,autopct='%.1f%%')
plt.title('XYZ COMPANY')
plt.legend()
plt.show()

22
Data Visualization-Pie Chart
import matplotlib.pyplot as plt
slices=[50,20,15,15]
depts=['Sales', 'Production', 'HR', 'Finance']
cols=['magenta', 'cyan', 'brown', 'gold']
plt.pie(slices,labels=depts,colours=cols,startangle=0,explode=(0,0.4,0,0),
shadow=True,autopct='%.1f%%')
plt.title('XYZ COMPANY')
plt.legend()
plt.show()

23
Data Visualization-Line Graph
import matplotlib.pyplot as plt
years=['2012', '2013', '2014', '2015', '2016', '2017']
profits = [9,10,10.5,8.8,10.9,9.75]
plt.plot(years,profits, 'blue')
plt.xlabel('Years')
plt.ylabel('Profits in million Rs.')
plt.title('XYZ COMPANY')
plt.show()

24
import matplotlib.pyplot as plt
years=['2012', '2013', '2014', '2015', '2016', '2017']
profits1 = [9,10,10.5,8.8,10.9,9.75]
profits2=[8,9,12,12.5,11.5,12]
plt.plot(years,profits1, 'blue',label='XYZ Profit',marker='*')
plt.plot(years,profits2, 'red',label='abc Profit',marker='o')
plt.xlabel('Years')
plt.ylabel('Profits in million Rs.')
plt.title('Profit Data')
plt.legend()
plt.show()

25

You might also like