House Price Prediction Models
House Price Prediction Models
House Price Prediction Models
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.cluster import KMeans
import warnings
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
# Ignore FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# Load dataset
houseprice_df = pd.read_excel('HousePrice_Dataset.xlsx')
houseprice_df.head()
(18565, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18565 entries, 0 to 18564
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 longitude 18565 non-null float64
1 latitude 18565 non-null float64
2 housing_median_age 18565 non-null int64
3 total_rooms 18565 non-null int64
4 total_bedrooms 18376 non-null float64
5 population 18565 non-null int64
6 households 18565 non-null int64
7 median_income 18565 non-null float64
8 median_house_value 18565 non-null int64
9 ocean_proximity 18565 non-null object
dtypes: float64(4), int64(5), object(1)
memory usage: 1.4+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 18376 entries, 0 to 18564
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 longitude 18376 non-null float64
1 latitude 18376 non-null float64
2 housing_median_age 18376 non-null int64
3 total_rooms 18376 non-null int64
4 total_bedrooms 18376 non-null float64
5 population 18376 non-null int64
6 households 18376 non-null int64
7 median_income 18376 non-null float64
8 median_house_value 18376 non-null int64
9 ocean_proximity 18376 non-null object
dtypes: float64(4), int64(5), object(1)
memory usage: 1.5+ MB
0
Out[6]:
# df.head()
ocean_proximity
<1H OCEAN 8096.0 239973.465415 106101.457273 17500.0 164175.0 214750.0 289425.0 500001.0
NEAR BAY 2034.0 258756.622911 122646.084078 22500.0 162500.0 231800.0 345525.0 500001.0
NEAR OCEAN 2372.0 249858.342327 122701.540906 22500.0 150000.0 229800.0 323825.0 500001.0
<AxesSubplot:xlabel='ocean_proximity', ylabel='median_house_value'>
Out[15]:
<AxesSubplot:xlabel='median_income', ylabel='median_house_value'>
Out[16]:
In [17]: # Convert categorical variable 'ocean_proximity' into dummy/indicator variables
df = pd.get_dummies(df, columns=['ocean_proximity'])
In [18]: #Renaing feature name as it should not contain charaters like <>= etc...This will give problem
df.rename(columns ={'ocean_proximity_<1H OCEAN' : 'ocean_proximity_LT1H OCEAN'}, inplace = Tr
Out[19]: ocean_proximity
longitude latitude housing_median_age households median_income median_house_value
O
<AxesSubplot:xlabel='median_house_value'>
Out[21]:
In [24]: sns.distplot(np.log(df['median_house_value']))
<AxesSubplot:xlabel='median_house_value', ylabel='Density'>
Out[24]:
Index([], dtype='object')
Index(['longitude', 'latitude', 'housing_median_age', 'households',
'median_income', 'median_house_value', 'ocean_proximity_LT1H OCEAN',
'ocean_proximity_INLAND', 'ocean_proximity_ISLAND',
'ocean_proximity_NEAR BAY', 'ocean_proximity_NEAR OCEAN'],
dtype='object')
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
(17419, 10)
(17419,)
(13935, 10)
(3484, 10)
(13935,)
(3484,)
Out[27]: ocean_proximity
longitude latitude housing_median_age households median_income median_house_value
O
Out[31]: ▾ LinearRegression
LinearRegression()
Out[35]: ▾ DecisionTreeRegressor
DecisionTreeRegressor(max_depth=8, min_samples_leaf=10, min_samples_split=10)
Out[38]: ▾ RandomForestRegressor
RandomForestRegressor(max_depth=10, min_samples_split=12, n_estimators=300)
y_pred_RF_2 = Rand_Forest_2.predict(x_test)
RF_df_2 = model_instantiation(Rand_Forest_2 ,x_train,x_test,y_train,y_test,y_pred_RF_2 , 'RF2
RF_df_2
y_pred_ada = ada.predict(x_test)
xgb_model = XGBRegressor()
# # # Create GridSearchCV
# grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='neg_mean_sq
xgb_model.fit(x_train,y_train)
y_pred_xgboost = xgb_model.predict(x_test)
In [ ]:
lasso = Lasso(alpha=1.0)
lasso.fit(x_train, y_train)
y_pred_lasso = lasso.predict(x_test)
C:\Users\sumit\anaconda3\lib\site-packages\sklearn\linear_model\_coordinate_descent.py:628: C
onvergenceWarning: Objective did not converge. You might want to increase the number of itera
tions, check the scale of the features or consider increasing regularisation. Duality gap: 4.
345e+12, tolerance: 1.269e+10
model = cd_fast.enet_coordinate_descent(
Out[45]: Train_R2 Test_R2 Test_MSE Test_RMSE Test_MAE