EDA Project

In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt # data visulization libraray
%matplotlib inline
import seaborn as sns
In [54]:
df = pd.read_csv(r'D:\Python_Diwali_Sales_Analysis\Diwali Sales Data.csv', encoding = 'unicode_escape')
# to avoid encoder error , use unicode_escape.
A UnicodeDecodeError in Python, particularly when working with pandas, usually occurs when trying to read a file or data that contains non-UTF-8
encoded characters without specifying the correct encoding.
In [55]:
df.shape
Out [55]: (11251, 15)
In [56]: df.head()
Out [56]:
Age
User_ID Cust_name Product_ID Gender Age Marital_Status State Zone Occupation Product_Category Orders Am
Group
0 1002903 Sanskriti P00125942 F 26-35 28 0 Maharashtra Western Healthcare Auto 1 239
1 1000732 Kartik P00110942 F 26-35 35 1 Andhra Pradesh Southern Govt Auto 3 239
2 1001990 Bindu P00118542 F 26-35 35 1 Uttar Pradesh Central Automobile Auto 3 239
3 1001425 Sudevi P00237842 M 0-17 16 0 Karnataka Southern Construction Auto 2 239

Food
4 1000588 Joni P00057942 M 26-35 28 1 Gujarat Western Auto 2 238
Processing
In [57]: df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11251 entries, 0 to 11250
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 User_ID 11251 non-null int64
1 Cust_name 11251 non-null object
2 Product_ID 11251 non-null object
3 Gender 11251 non-null object
4 Age Group 11251 non-null object
5 Age 11251 non-null int64
6 Marital_Status 11251 non-null int64
7 State 11251 non-null object
8 Zone 11251 non-null object
9 Occupation 11251 non-null object
10 Product_Category 11251 non-null object
11 Orders 11251 non-null int64
12 Amount 11239 non-null float64
13 Status 0 non-null float64
14 unnamed1 0 non-null float64
dtypes: float64(3), int64(4), object(8)
memory usage: 1.3+ MB
In [58]:
#drop unrelated / blank columns
df.drop(['Status', 'unnamed1'], axis = 1 , inplace = True)
In [59]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11251 entries, 0 to 11250
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 User_ID 11251 non-null int64
1 Cust_name 11251 non-null object
2 Product_ID 11251 non-null object
3 Gender 11251 non-null object
4 Age Group 11251 non-null object
5 Age 11251 non-null int64
6 Marital_Status 11251 non-null int64
7 State 11251 non-null object
8 Zone 11251 non-null object
9 Occupation 11251 non-null object
10 Product_Category 11251 non-null object
11 Orders 11251 non-null int64
12 Amount 11239 non-null float64
dtypes: float64(1), int64(4), object(8)
memory usage: 1.1+ MB
In [60]: pd.isnull(df)
Out [60]:
Age
User_ID Cust_name Product_ID Gender Age Marital_Status State Zone Occupation Product_Category Orders Amount
Group
0 False False False False False False False False False False False False False
Age
User_ID Cust_name Product_ID Gender Age Marital_Status State Zone Occupation Product_Category Orders Amount
Group
... ... ... ... ... ... ... ... ... ... ... ... ... ...
11251 rows × 13 columns
here we are finding null values it most important part of data cleaning process in python but lokking above data we cant understand. false indicating
that there are some values it is not null.
In [61]: pd.isnull(df).sum()
Out [61]: User_ID 0

Cust_name 0
Product_ID 0
Gender 0
Age Group 0
Age 0
Marital_Status 0
State 0
Zone 0
Occupation 0
Product_Category 0
Orders 0
Amount 12
dtype: int64
In [62]:
df.shape
Out [62]: (11251, 13)
In [63]: #drop the null values

df.dropna(inplace = True)
In [64]:
df.shape
Out [64]: (11239, 13)
Amount column is in float(decimal) so we want to change the data type so we did it using .astype('datatype')
In [65]:
#change the data type
df['Amount'] = df['Amount'].astype('int')
In [66]: df['Amount'].dtypes
Out [66]: dtype('int32')
In [67]:
df.columns
Out [67]: Index(['User_ID', 'Cust_name', 'Product_ID', 'Gender', 'Age Group', 'Age',

'Marital_Status', 'State', 'Zone', 'Occupation', 'Product_Category',
'Orders', 'Amount'],
dtype='object')
In [68]: #rename the column

df.rename(columns = {'Marital_Status':'Shaadi'})
Out [68]:
Age
User_ID Cust_name Product_ID Gender Age Shaadi State Zone Occupation Product_Category Orders Amo
Group
0 1002903 Sanskriti P00125942 F 26-35 28 0 Maharashtra Western Healthcare Auto 1 2395
1 1000732 Kartik P00110942 F 26-35 35 1 Andhra Pradesh Southern Govt Auto 3 2393
2 1001990 Bindu P00118542 F 26-35 35 1 Uttar Pradesh Central Automobile Auto 3 2392
3 1001425 Sudevi P00237842 M 0-17 16 0 Karnataka Southern Construction Auto 2 2391

Food
4 1000588 Joni P00057942 M 26-35 28 1 Gujarat Western Auto 2 2387
Processing
... ... ... ... ... ... ... ... ... ... ... ... ... ...
11246 1000695 Manning P00296942 M 18-25 19 1 Maharashtra Western Chemical Office 4 370
11247 1004089 Reichenbach P00171342 M 26-35 33 0 Haryana Northern Healthcare Veterinary 3 367
Madhya
11248 1001209 Oshin P00201342 F 36-45 40 0 Central Textile Office 4 213
Pradesh
11249 1004023 Noonan P00059442 M 36-45 37 0 Karnataka Southern Agriculture Office 3 206
11250 1002744 Brumley P00281742 F 18-25 19 0 Maharashtra Western Healthcare Office 3 188
11239 rows × 13 columns
In [69]:
# describe method returns description of the data in the dataframe(i.e, mean std, count)
df.describe()
Out [69]:
User_ID Age Marital_Status Orders Amount
count 1.123900e+04 11239.000000 11239.000000 11239.000000 11239.000000

mean 1.003004e+06 35.410357 0.420055 2.489634 9453.610553
std 1.716039e+03 12.753866 0.493589 1.114967 5222.355168

min 1.000001e+06 12.000000 0.000000 1.000000 188.000000
25% 1.001492e+06 27.000000 0.000000 2.000000 5443.000000

50% 1.003064e+06 33.000000 0.000000 2.000000 8109.000000
75% 1.004426e+06 43.000000 1.000000 3.000000 12675.000000

max 1.006040e+06 92.000000 1.000000 4.000000 23952.000000
exploratory data anlaysis
Gender
In [70]:
df.columns

dtype='object')
In [71]:
# using seaborn for data visulation here we can see that gender column has displyed using countplot
# normaly we can put we can use seaborn like this
sns.countplot(x = 'Gender', data = df, hue='Gender')
Out [71]: <Axes: xlabel='Gender', ylabel='count'>
In [72]: ax = sns.countplot(x = 'Gender', data = df, hue='Gender')
for bars in ax.containers:

ax.bar_label(bars)
# here we just craete containers where it gives me total amount about male and female.
In [73]:
df.groupby(['Gender'], as_index = False)['Amount'].sum().sort_values(by='Amount', ascending = False)
Out [73]:
Gender Amount
0 F 74335853
1 M 31913276
In [74]:
sales_gen = df.groupby(['Gender'], as_index = False)['Amount'].sum().sort_values(by='Amount', ascending = False
sns.barplot(x = 'Gender', y = 'Amount', data = sales_gen, hue = 'Gender')
Out [74]: <Axes: xlabel='Gender', ylabel='Amount'>

From above graph we can understand that female purchasing power is more than males. that means most of the buyers are Female.
Age group
In [75]:
ax = sns.countplot(x = 'Age Group', data = df, hue='Gender')

ax.bar_label(bars)
# anlayzing the age-group of buyers in the basis of Gender.
In [76]: # total amount vs age group

sales_age = df.groupby(['Age Group'], as_index = False)['Amount'].sum().sort_values(by='Amount', ascending = Fa
sns.barplot(x = 'Age Group', y = 'Amount', data = sales_age, hue = 'Age Group')
Out [76]: <Axes: xlabel='Age Group', ylabel='Amount'>
from above graph we can see that most of the buyers are Females in between the 26-35 Age-group
state
In [77]:
# top 10 sales among the states
sales_states = df.groupby(['State'], as_index=False)['Orders'].sum().sort_values(by='Orders', ascending=False)
# Plotting the data

sns.set(rc={'figure.figsize': (17, 5)})
sns.barplot(x='State', y='Orders', data=sales_states, hue='State')
Out [77]: <Axes: xlabel='State', ylabel='Orders'>

In [78]:
#total amount/sales from top - 10
sales_states = df.groupby(['State'], as_index = False)['Amount'].sum().sort_values(by='Amount', ascending = Fal
sns.set(rc={'figure.figsize':(17,5)})
sns.barplot(data = sales_states, x = 'State', y = 'Amount', hue = 'State')
Out [78]: <Axes: xlabel='State', ylabel='Amount'>
from aboove graphs we can see that most of the ordes are form uttar pradesh, Maharashtra , karnataka respectviley.
Martial status
In [79]:
df.columns

dtype='object')
In [80]:
ax = sns.countplot(data = df, x = 'Marital_Status', hue = 'Marital_Status')
ax.bar_label(bars)
In [81]: #identiying from married couple who spent the money based on gender
sales_state = df.groupby(['Marital_Status', 'Gender'], as_index=False)['Amount'].sum().sort_values(by='Amount'
sns.barplot(data=sales_state, x='Marital_Status', y='Amount', hue='Gender')
Out [81]: <Axes: xlabel='Marital_Status', ylabel='Amount'>

breakdown of spending by gender within each marital status group from above graph we can see that most of the buyers are married(women) and
they have high purchasing power.
Occupation
In [82]:
df.columns

dtype='object')
In [83]:
ax = sns.countplot(data = df, x = 'Occupation', hue = 'Occupation')

ax.bar_label(bars)
In [84]:
sales_state = df.groupby(['Occupation'], as_index=False)['Amount'].sum().sort_values(by='Amount', ascending=Fal
sns.barplot(data=sales_state, x ='Occupation', y='Amount', hue = 'Occupation')
Out [84]: <Axes: xlabel='Occupation', ylabel='Amount'>
from above graph most of the buyers are from IT sector, Helathcare, Aviation.
product catogory
In [85]:
ax = sns.countplot(data = df, x = 'Product_Category', hue = 'Product_Category')
ax.bar_label(bars)
In [86]:
sales_state = df.groupby(['Product_Category'], as_index=False)['Amount'].sum().sort_values(by='Amount', ascendi
sns.barplot(data=sales_state, x ='Product_Category', y='Amount', hue = 'Product_Category')
Out [86]: <Axes: xlabel='Product_Category', ylabel='Amount'>
from above graphs we can see that most of the sold products are food, clothing and electronics category
In [87]:
sales_state = df.groupby(['Product_ID'], as_index=False)['Orders'].sum().sort_values(by='Orders', ascending=Fal
sns.barplot(data=sales_state, x ='Product_ID', y='Orders', hue = 'Product_ID')
Out [87]: <Axes: xlabel='Product_ID', ylabel='Orders'>
married women age group 26-25 yrs from UP, Maharaeshtra and karnataka working in It, Healthcare and Aviation are more likley to buy products from
food, clothing and Electronics category.

EDA Project

Uploaded by

Copyright:

Available Formats

EDA Project

Uploaded by

Document Information

Original Description:

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

EDA Project

Uploaded by

Copyright:

Available Formats

In [53]:

Out [55]: (11251, 15)

0 1002903 Sanskriti P00125942 F 26-35 28 0 Maharashtra Western Healthcare Auto 1 239

3 1001425 Sudevi P00237842 M 0-17 16 0 Karnataka Southern Construction Auto 2 239

11251 rows × 13 columns

Out [61]: User_ID 0

Out [62]: (11251, 13)

In [63]: #drop the null values

Out [64]: (11239, 13)

Out [66]: dtype('int32')

Out [67]: Index(['User_ID', 'Cust_name', 'Product_ID', 'Gender', 'Age Group', 'Age',

In [68]: #rename the column

0 1002903 Sanskriti P00125942 F 26-35 28 0 Maharashtra Western Healthcare Auto 1 2395

3 1001425 Sudevi P00237842 M 0-17 16 0 Karnataka Southern Construction Auto 2 2391

count 1.123900e+04 11239.000000 11239.000000 11239.000000 11239.000000

std 1.716039e+03 12.753866 0.493589 1.114967 5222.355168

25% 1.001492e+06 27.000000 0.000000 2.000000 5443.000000

75% 1.004426e+06 43.000000 1.000000 3.000000 12675.000000

exploratory data anlaysis

Out [70]: Index(['User_ID', 'Cust_name', 'Product_ID', 'Gender', 'Age Group', 'Age',

Out [71]: <Axes: xlabel='Gender', ylabel='count'>

In [72]: ax = sns.countplot(x = 'Gender', data = df, hue='Gender')

for bars in ax.containers:

Out [74]: <Axes: xlabel='Gender', ylabel='Amount'>

for bars in ax.containers:

In [76]: # total amount vs age group

Out [76]: <Axes: xlabel='Age Group', ylabel='Amount'>

# Plotting the data

Out [77]: <Axes: xlabel='State', ylabel='Orders'>

Out [78]: <Axes: xlabel='State', ylabel='Amount'>

Out [79]: Index(['User_ID', 'Cust_name', 'Product_ID', 'Gender', 'Age Group', 'Age',

Out [81]: <Axes: xlabel='Marital_Status', ylabel='Amount'>

Out [82]: Index(['User_ID', 'Cust_name', 'Product_ID', 'Gender', 'Age Group', 'Age',

for bars in ax.containers:

Out [84]: <Axes: xlabel='Occupation', ylabel='Amount'>

Out [86]: <Axes: xlabel='Product_Category', ylabel='Amount'>

Out [87]: <Axes: xlabel='Product_ID', ylabel='Orders'>

You might also like