Statisitics Project 7
Statisitics Project 7
Statisitics Project 7
In [20]:
%matplotlib inline
In [21]:
# Numerical libraries
import numpy as np
import pandas as pd
import matplotlib.style
plt.style.use('classic')
In [22]:
mpg_df = pd.read_csv("car-mpg.csv")
In [23]:
mpg_df.head(50)
Out[23]:
In [24]:
In [25]:
mpg_df
Out[25]:
... ... ... ... ... ... ... ... ... ...
In [26]:
# Replace the numbers in categorical variables with the actual country name in the orig
in col
In [27]:
mpg_df
Out[27]:
... ... ... ... ... ... ... ... ... ...
In [28]:
# This is also kown as one hot coding. The column names will be A merica, Europe and As
ia... with one hot coding
In [29]:
mpg_df
Out[29]:
... ... ... ... ... ... ... ... ... ... ...
In [31]:
mpg_df.describe().transpose()
Out[31]:
In [32]:
mpg_df.dtypes
Out[32]:
mpg float64
cyl int64
disp float64
hp object
wt int64
acc float64
yr int64
car_type int64
origin_america uint8
origin_asia uint8
origin_europe uint8
dtype: object
In [33]:
# Note: HP column is missing the describe output. That indicates something is not righ
t with that column
In [34]:
# run the "isdigit() check on 'hp' column of the mpg_df dataframe. Result will be True
or False for every row
# capture the result in temp dataframe and dow a frequency count using value_counts()
# There are six records with non digit values in 'hp' column
# in temp dataframe
temp[temp['hp'] == False] # from temp take only those rows where hp has false
Out[34]:
hp
32 False
126 False
330 False
336 False
354 False
374 False
In [36]:
# On inspecting records number 32, 126 etc, we find "?" in the columns. Replace them wi
th "nan"
#Replace them with nan and remove the records from the data frame that have "nan"
In [37]:
mpg_df[mpg_df.isnull().any(axis=1)]
Out[37]:
In [38]:
# There are various ways to handle missing values. Drop the rows, replace missing value
s with median values etc.
In [39]:
#of the 398 rows 6 have NAN in the hp column. We will drop those 6 rows. Not a good ide
a under all situations
#mpg_df = mpg_df.dropna()
In [40]:
#instead of dropping the rows, lets replace the missing values with median value
mpg_df.median
Out[40]:
origin_asia origin_europe
0 0 0
1 0 0
2 0 0
3 0 0
4 0 0
.. ... ...
393 0 0
394 0 1
395 0 0
396 0 0
397 0 0
In [41]:
# replace the missing values in 'hp' with median value of 'hp' :Note, we do not need to
specify the column names
# every column's missing value is replaced with that column's median respectively (axi
s =0 means columnwise)
#mpg_df = mpg_df.fillna(mpg_df.median())
In [42]:
mpg_df.dtypes
Out[42]:
mpg float64
cyl int64
disp float64
hp object
wt int64
acc float64
yr int64
car_type int64
origin_america uint8
origin_asia uint8
origin_europe uint8
dtype: object
In [43]:
In [45]:
mpg_df.describe()
Out[45]:
In [46]:
# Let us do a correlation analysis among the different dimensions and also each dimensi
on with the dependent dimension
# This is done using scatter matrix function which creates a dashboard reflecting usefu
l information about the dimensions
# The result can be stored as a .png file and opened in say, paint to get a larger view
#axes = pd.plotting.scatter_matrix(mpg_df_attr)
#plt.tight_layout()
#plt.savefig('d:\greatlakes\mpg_pairpanel.png')
Out[46]:
<seaborn.axisgrid.PairGrid at 0x1817e50cfd0>
In [47]:
#The data distribution across various dimensions except 'Acc' do not look normal
#Close observation between 'mpg' and other attributes indicate the relationship is not
really linear
#relation between 'mpg' and 'hp' show hetroscedacity... which will impact model accurac
y
In [50]:
# Copy all the predictor variables into X dataframe. Since 'mpg' is dependent variable
drop it
X = mpg_df.drop('mpg', axis=1)
# Copy the 'mpg' column alone into the y dataframe. This is the dependent variable
y = mpg_df[['mpg']]
In [51]:
#Let us break the X and y dataframes into training set and test set. For this we will u
se
In [52]:
In [53]:
# invoke the LinearRegression function and find the bestfit model on training data
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)
Out[53]:
LinearRegression()
In [54]:
In [55]:
intercept = regression_model.intercept_[0]
In [56]:
regression_model.score(X_train, y_train)
Out[56]:
0.825809118133759
In [57]:
regression_model.score(X_test, y_test)
Out[57]:
0.8404952015294237
In [58]:
# R^2 is not a reliable metric as it always increases with addition of more attributes
even if the attributes have no
# influence on the predicted variable. Instead we use adjusted R^2 which removes the st
atistical chance that improves R^2
data_train.head()
Out[58]:
In [59]:
lm1.params
Out[59]:
Intercept -26.693360
cyl 1.863718
disp 0.010066
hp -0.039229
wt -0.006415
acc 0.011724
yr 0.758818
car_type 6.626521
dtype: float64
In [60]:
Df Model: 7
Covariance Type: nonrobust
==========================================================================
====
--------------------------------------------------------------------------
----
==========================================================================
====
==========================================================================
====
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is cor
rectly specified.
[2] The condition number is large, 8.64e+04. This might indicate that ther
e are
In [61]:
# Let us check the sum of squared errors by predicting value of y for test cases and
mse = np.mean((regression_model.predict(X_test)-y_test)**2)
In [62]:
import math
math.sqrt(mse)
Out[62]:
3.0538103653849573
In [63]:
# so there is avg of 3.0 (roundoff) mpg difference from real mpg on an avg
In [64]:
# R^2=1–RSS / TSS
regression_model.score(X_test, y_test)
Out[64]:
0.8404952015294237
In [65]:
# predict mileage (mpg) for a set of attributes not in the training or test set
y_pred = regression_model.predict(X_test)
In [66]:
# Since this is regression, plot the predicted y value vs actual y values for the test
data
# A good model's prediction will be close to actual leading to high R and R2 values
#plt.rcParams['figure.dpi'] = 500
plt.scatter(y_test['mpg'], y_pred)
Out[66]:
<matplotlib.collections.PathCollection at 0x18104b9ddc0>
In [67]:
# It is always a good practice to scale all the dimensions using z scores or someother
methode to address the problem of different scales
In [68]:
X_train_scaled = X_train.apply(zscore)
X_test_scaled = X_test.apply(zscore)
y_train_scaled = y_train.apply(zscore)
y_test_scaled = y_test.apply(zscore)
In [69]:
# invoke the LinearRegression function and find the bestfit model on training data
regression_model = LinearRegression()
regression_model.fit(X_train_scaled, y_train_scaled)
Out[69]:
LinearRegression()
In [70]:
Out[70]:
LinearRegression()
In [71]:
In [72]:
intercept = regression_model.intercept_[0]
In [73]:
# R^2=1–RSS / TSS
regression_model.score(X_test_scaled, y_test_scaled)
Out[73]:
0.8460575288663481
In [74]:
# Let us check the sum of squared errors by predicting value of y for training cases an
d
mse = np.mean((regression_model.predict(X_test_scaled)-y_test_scaled)**2)
In [75]:
import math
math.sqrt(mse)
Out[75]:
0.39235503199736316
In [76]:
# predict mileage (mpg) for a set of attributes not in the training or test set
y_pred = regression_model.predict(X_test_scaled)
In [77]:
# Since this is regression, plot the predicted y value vs actual y values for the test
data
# A good model's prediction will be close to actual leading to high R and R2 values
plt.scatter(y_test_scaled['mpg'], y_pred)
Out[77]:
<matplotlib.collections.PathCollection at 0x18104bfc880>
In [78]:
In [79]:
In [80]:
i=0
if i < 11:
i = i+1
hp ---> 71.23983108333236
wt ---> 139.1665144189037
yr ---> 166.95012233353933
In [ ]: