House Price Prediction Models

In [1]: import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.cluster import KMeans
import warnings
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
# Ignore FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# Load dataset
houseprice_df = pd.read_excel('HousePrice_Dataset.xlsx')
houseprice_df.head()
Out[1]: longitude latitude housing_median_age total_rooms total_bedrooms population households median_inco
0 -122.23 37.88 41 880 129.0 322 126 8.3
1 -122.22 37.86 21 7099 1106.0 2401 1138 8.3
2 -122.25 37.85 52 1627 280.0 565 259 3.8
3 -122.25 37.85 52 919 213.0 413 193 4.0
4 -122.25 37.84 52 2535 489.0 1094 514 3.6
In [2]: # Checking input data shape & info

df = houseprice_df.copy()
print(df.shape)
print(df.info())
(18565, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18565 entries, 0 to 18564
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 longitude 18565 non-null float64
1 latitude 18565 non-null float64
2 housing_median_age 18565 non-null int64
3 total_rooms 18565 non-null int64
4 total_bedrooms 18376 non-null float64
5 population 18565 non-null int64
6 households 18565 non-null int64
7 median_income 18565 non-null float64
8 median_house_value 18565 non-null int64
9 ocean_proximity 18565 non-null object
dtypes: float64(4), int64(5), object(1)
memory usage: 1.4+ MB
None
In [3]: #Checking for NULL values in Data

df.isnull().sum()
longitude 0
Out[3]:
latitude 0
housing_median_age 0
total_rooms 0
total_bedrooms 189
population 0
households 0
median_income 0
median_house_value 0
ocean_proximity 0
dtype: int64
In [4]: #Dropping NULL Values

df = df.dropna()
In [5]: #Confirming dropna operation

df.info()
Int64Index: 18376 entries, 0 to 18564
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 longitude 18376 non-null float64
1 latitude 18376 non-null float64
2 housing_median_age 18376 non-null int64
3 total_rooms 18376 non-null int64
4 total_bedrooms 18376 non-null float64
5 population 18376 non-null int64
6 households 18376 non-null int64
7 median_income 18376 non-null float64
8 median_house_value 18376 non-null int64
9 ocean_proximity 18376 non-null object
dtypes: float64(4), int64(5), object(1)
memory usage: 1.5+ MB
In [6]: #Checking for any duplicates

df.duplicated().sum()
0
Out[6]:
In [7]: #Checking data composition for numerical features

df.describe()
Out[7]: longitude latitude housing_median_age total_rooms total_bedrooms population househ
count 18376.000000 18376.000000 18376.000000 18376.000000 18376.000000 18376.000000 18376.000
mean -119.571095 35.635164 28.605736 2635.302188 537.711199 1425.810786 499.37
std 2.003042 2.137485 12.570789 2200.534974 424.125849 1143.481721 384.514
min -124.350000 32.540000 1.000000 2.000000 2.000000 3.000000 2.000
25% -121.800000 33.930000 18.000000 1444.000000 295.000000 786.000000 280.000
50% -118.500000 34.260000 29.000000 2123.000000 434.000000 1165.500000 408.000
75% -118.010000 37.720000 37.000000 3137.000000 646.000000 1722.000000 603.000
max -114.310000 41.950000 52.000000 39320.000000 6445.000000 35682.000000 6082.000
In [8]: #Standardizing the numerical feilds.

# df[['housing_median_age']] = preprocessing.scale(df[['housing_median_age']]).astype('float64
# df[['total_rooms']] = preprocessing.scale(df[['total_rooms']]).astype('float64')
# df[['total_bedrooms']] = preprocessing.scale(df[['total_bedrooms']]).astype('float64')
# df[['population']] = preprocessing.scale(df[['population']]).astype('float64')
# df[['households']] = preprocessing.scale(df[['households']]).astype('float64')
# df[['median_income']] = preprocessing.scale(df[['median_income']]).astype('float64')
# df[['median_house_value']] = preprocessing.scale(df[['median_house_value']]).astype('float64
# df.head()
In [9]: # Correlation matrix

figure1, a = plt.subplots(figsize=(12, 10))
sns.heatmap(df.corr(), annot=True)
plt.show()
In [10]: df=df.drop(['total_rooms','total_bedrooms','population'], axis=1)
In [11]: # Correlation matrix

figure1, a = plt.subplots(figsize=(12, 10))
sns.heatmap(df.corr(), annot=True)
plt.show()
In [12]: #Checking unique values in categorical column
df['ocean_proximity'].unique()
array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],

Out[12]:
dtype=object)
In [13]: #House price based on categorical value.

df.groupby(['ocean_proximity'])['median_house_value'].describe()
Out[13]: count mean std min 25% 50% 75% max
ocean_proximity
<1H OCEAN 8096.0 239973.465415 106101.457273 17500.0 164175.0 214750.0 289425.0 500001.0
INLAND 5869.0 124937.335492 70783.931014 14999.0 77500.0 108300.0 148600.0 500001.0
ISLAND 5.0 380440.000000 80559.561816 287500.0 300000.0 414700.0 450000.0 450000.0
NEAR BAY 2034.0 258756.622911 122646.084078 22500.0 162500.0 231800.0 345525.0 500001.0
NEAR OCEAN 2372.0 249858.342327 122701.540906 22500.0 150000.0 229800.0 323825.0 500001.0
In [14]: #Same information using a bar chart.

df['ocean_proximity'].value_counts().plot(kind='bar')
plt.show()
In [15]: figure2, ax = plt.subplots(figsize = (12,8))
sns.barplot(x = 'ocean_proximity', y = 'median_house_value' , data = df)
<AxesSubplot:xlabel='ocean_proximity', ylabel='median_house_value'>
Out[15]:
In [16]: figure3, ax = plt.subplots(figsize = (12,8))

sns.scatterplot(x = 'median_income', y = 'median_house_value' , data = df)
<AxesSubplot:xlabel='median_income', ylabel='median_house_value'>
Out[16]:
In [17]: # Convert categorical variable 'ocean_proximity' into dummy/indicator variables
df = pd.get_dummies(df, columns=['ocean_proximity'])
In [18]: #Renaing feature name as it should not contain charaters like <>= etc...This will give problem
df.rename(columns ={'ocean_proximity_<1H OCEAN' : 'ocean_proximity_LT1H OCEAN'}, inplace = Tr
In [19]: #Checking for dummies action & feature name change

df.head()
Out[19]: ocean_proximity
longitude latitude housing_median_age households median_income median_house_value
O
0 -122.23 37.88 41 126 8.3252 452600
1 -122.22 37.86 21 1138 8.3014 358500
2 -122.25 37.85 52 259 3.8462 342200
3 -122.25 37.85 52 193 4.0368 269700
4 -122.25 37.84 52 514 3.6591 299200
In [20]: #Distribution of median house value which we are predicting

#Data shows right skewed and some outliers
sns.distplot(df['median_house_value'])
plt.show()
In [21]: #Confirming via box plot
sns.boxplot(df['median_house_value'])
<AxesSubplot:xlabel='median_house_value'>
Out[21]:
In [22]: #Trying to identify outliers and remove them

# IQR
# Calculate the upper and lower limits
Q1 = df['median_house_value'].quantile(0.25)
Q3 = df['median_house_value'].quantile(0.75)
IQR = Q3 - Q1
# Set the threshold for outliers

lower_threshold = Q1 - 1.5 * IQR
upper_threshold = Q3 + 1.5 * IQR
# Identify outlier rows using boolean masks

outliers_lower = df['median_house_value'] < lower_threshold
outliers_upper = df['median_house_value'] > upper_threshold
# Display the number of outliers

print(f"Number of lower outliers: {outliers_lower.sum()}")
print(f"Number of upper outliers: {outliers_upper.sum()}")
# Remove the outliers

df = df[~(outliers_lower | outliers_upper)].copy()
# Display the shape of the cleaned DataFrame

print(f"Shape of the cleaned DataFrame: {df.shape}")
Number of lower outliers: 0
Number of upper outliers: 957
Shape of the cleaned DataFrame: (17419, 11)
In [23]: #Confirming outliers removal

sns.distplot(df['median_house_value'])
plt.show()
In [24]: sns.distplot(np.log(df['median_house_value']))
<AxesSubplot:xlabel='median_house_value', ylabel='Density'>
Out[24]:
In [25]: #Checking if any column belongs to object

category_columns = df.dtypes[df.dtypes=='object'].index
numerical_columns = df.dtypes[df.dtypes!='object'].index
print(category_columns)
print(numerical_columns)
Index([], dtype='object')
Index(['longitude', 'latitude', 'housing_median_age', 'households',
'median_income', 'median_house_value', 'ocean_proximity_LT1H OCEAN',
'ocean_proximity_INLAND', 'ocean_proximity_ISLAND',
'ocean_proximity_NEAR BAY', 'ocean_proximity_NEAR OCEAN'],
dtype='object')
In [26]: #Seperating input features and labels

X = df.drop('median_house_value', axis = 1)
Y = np.log(df['median_house_value'])
print(type(X))
print(type(Y))
print(X.shape)
print(Y.shape)
<class 'pandas.core.series.Series'>
(17419, 10)
(17419,)
In [27]: # Split the dataset into training and testing sets

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)
df.head()
(13935, 10)
(3484, 10)
(13935,)
(3484,)
Out[27]: ocean_proximity
longitude latitude housing_median_age households median_income median_house_value
O
0 -122.23 37.88 41 126 8.3252 452600
1 -122.22 37.86 21 1138 8.3014 358500
2 -122.25 37.85 52 259 3.8462 342200
3 -122.25 37.85 52 193 4.0368 269700
4 -122.25 37.84 52 514 3.6591 299200
In [28]: # # Standardize the features using StandardScaler

# scaler = StandardScaler()
# X_train = scaler.fit_transform(x_train)
# X_test = scaler.transform(x_test)
In [29]: #Defining functions to calculate R2 and error rate

def model_evaluation(y_test, y_pred):
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
# r2_scr = r2_score(y_test, y_pred)
return {'mae': mae, 'mse': mse, 'rmse': rmse}
def model_instantiation(model, x_train, x_test, y_train, y_test, y_pred, model_name):

training_r2 = model.score(x_train, y_train)
testing_r2 = model.score(x_test, y_test)
eval_return = model_evaluation(y_test, y_pred)
result_metric = {
'Train_R2': training_r2,
'Test_R2': testing_r2,
'Test_MSE': eval_return['mse'],
'Test_RMSE': eval_return['rmse'],
'Test_MAE': eval_return['mae']
}
result = pd.DataFrame(result_metric, index=[model_name])
return result
In [30]: from sklearn.linear_model import LinearRegression
In [31]: #Instantiating Linear reg model

linear_reg = LinearRegression()
linear_reg.fit(x_train, y_train)
Out[31]: ▾ LinearRegression
LinearRegression()
In [32]: #Predicting via LR

y_pred_LR = linear_reg.predict(x_test)
In [33]: LR_df = model_instantiation(linear_reg ,x_train,x_test,y_train,y_test,y_pred_LR , 'Linear Reg

LR_df
Out[33]: Train_R2 Test_R2 Test_MSE Test_RMSE Test_MAE
Linear Regression 0.630962 0.624499 0.10712 0.327293 0.251028
In [34]: from sklearn.tree import DecisionTreeRegressor

from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
In [35]: #Decision Tree

Decs_Tree = DecisionTreeRegressor(max_depth = 8 , min_samples_leaf =10 , min_samples_split =
Decs_Tree.fit(x_train, y_train)
Out[35]: ▾ DecisionTreeRegressor
DecisionTreeRegressor(max_depth=8, min_samples_leaf=10, min_samples_split=10)
In [36]: y_pred_DT = Decs_Tree.predict(x_test)
In [37]: DT_df = model_instantiation(Decs_Tree ,x_train,x_test,y_train,y_test,y_pred_DT , 'DTree_Regre

DT_df
DTree_Regression 0.763267 0.717443 0.080606 0.283912 0.208418
In [38]: #Randon forest

Rand_Forest = RandomForestRegressor(n_estimators=300, max_depth = 10, min_samples_split=12)
Rand_Forest.fit(x_train, y_train)
Out[38]: ▾ RandomForestRegressor
RandomForestRegressor(max_depth=10, min_samples_split=12, n_estimators=300)
In [39]: y_pred_RF = Rand_Forest.predict(x_test)
In [40]: RF_df = model_instantiation(Rand_Forest ,x_train,x_test,y_train,y_test,y_pred_RF , 'RF_Regres

RF_df
RF_Regression 0.850595 0.774959 0.064198 0.253374 0.180186
In [41]: Rand_Forest_2 = RandomForestRegressor(n_estimators=300, max_depth = 9, min_samples_split=4)

Rand_Forest_2.fit(x_train, y_train)
y_pred_RF_2 = Rand_Forest_2.predict(x_test)
RF_df_2 = model_instantiation(Rand_Forest_2 ,x_train,x_test,y_train,y_test,y_pred_RF_2 , 'RF2
RF_df_2
RF2_Regression 0.830124 0.763076 0.067588 0.259977 0.186851
In [42]: ada =AdaBoostRegressor(n_estimators = 300, random_state = 10)

ada.fit(x_train,y_train)
y_pred_ada = ada.predict(x_test)
ada_df = model_instantiation(ada ,x_train,x_test,y_train,y_test,y_pred_ada , 'ADA_Regression'

ada_df
ADA_Regression 0.599874 0.586475 0.117968 0.343464 0.271343
In [43]: from xgboost.sklearn import XGBRegressor
xgb_model = XGBRegressor()
# Define the parameter grid to search

# param_grid = {
# 'n_estimators': [100, 200, 300],
# 'learning_rate': [0.01, 0.1, 0.2],
# 'max_depth': [3, 4, 5],
# 'subsample': [0.8, 0.9, 1.0],
# 'colsample_bytree': [0.8, 0.9, 1.0],
# 'reg_alpha': [0, 0.1, 0.5], # Regularization term on weights (L1)
# 'reg_lambda': [0, 0.1, 0.5] # Regularization term on weights (L2)
# }
# # # Create GridSearchCV
# grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='neg_mean_sq
# grid_search.fit(x_train,y_train,early_stopping_rounds=10,eval_set=[(x_test, y_test)], verbo

# XGB_Best_Model = grid_search.best_estimator_
# y_pred_xgboost = XGB_Best_Model.predict(x_test)
xgb_model.fit(x_train,y_train)
y_pred_xgboost = xgb_model.predict(x_test)
xgboost_df = model_instantiation(xgb_model, x_train, x_test, y_train, y_test, y_pred_xgboost,

xgboost_df
XGB_Regression 0.923149 0.801669 0.056579 0.237862 0.163638
In [ ]:
In [44]: import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.pipeline import make_pipeline
from tensorflow.keras.regularizers import l2
# # Separate features (X) and target variable (y)

X = df.drop('median_house_value', axis=1)
y = df['median_house_value']
# # Split the data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# # Create a pipeline with StandardScaler and PolynomialFeatures

pipeline = make_pipeline(StandardScaler(), PolynomialFeatures(degree=2, include_bias=False))
# # Fit and transform the training data

X_train_poly = pipeline.fit_transform(X_train)
# # Transform the testing data

X_test_poly = pipeline.transform(X_test)
# Build a multi-layer neural network model

model = Sequential()
model.add(Dense(128, input_dim=X_train_poly.shape[1], activation='relu', kernel_regularizer=l
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.001)))
model.add(BatchNormalization())
model.add(Dense(1, activation='relu'))
# Define early stopping callback

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
# Compile the model with an adjusted learning rate

optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
model.compile(optimizer=optimizer, loss='mean_squared_error')
# Train the model

model.fit(X_train_poly, y_train, epochs=50, batch_size=32, validation_split=0.2, callbacks=[e
# Make predictions on the test set

predictions = model.predict(X_test_poly)
# Evaluate the model

mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
print(f'Mean Squared Error: {mse}')

print(f'R-squared: {r2}')
# xgboost_df = model_instantiation(model, x_train, x_test, y_train, y_test, predictions, 'Ten
# xgboost_df
Epoch 1/50
349/349 [==============================] - 3s 4ms/step - loss: 45521571840.0000 - val_loss: 4
5659099136.0000
Epoch 2/50
5651877888.0000
Epoch 3/50
5618933760.0000
Epoch 4/50
5588664320.0000
Epoch 5/50
5505753088.0000
Epoch 6/50
5382037504.0000
Epoch 7/50
5313585152.0000
Epoch 8/50
5178540032.0000
Epoch 9/50
5011771392.0000
Epoch 10/50
4739723264.0000
Epoch 11/50
4404719616.0000
Epoch 12/50
4059979776.0000
Epoch 13/50
3515187200.0000
Epoch 14/50
3261763584.0000
Epoch 15/50
2615050240.0000
Epoch 16/50
1998520320.0000
Epoch 17/50
1106513920.0000
Epoch 18/50
0114827264.0000
Epoch 19/50
9349116928.0000
Epoch 20/50
8629601280.0000
Epoch 21/50
8276018176.0000
Epoch 22/50
6098195456.0000
Epoch 23/50
5949268992.0000
Epoch 24/50
4343340032.0000
Epoch 25/50
2776638464.0000
Epoch 26/50
1230275584.0000
Epoch 27/50
0732843008.0000
Epoch 28/50
8998916096.0000
Epoch 29/50
7636432896.0000
Epoch 30/50
5972879360.0000
Epoch 31/50
5463404544.0000
Epoch 32/50
4184819712.0000
Epoch 33/50
1572216832.0000
Epoch 34/50
2135050240.0000
Epoch 35/50
834973696.0000
Epoch 36/50
097423872.0000
Epoch 37/50
530541056.0000
Epoch 38/50
991861760.0000
Epoch 39/50
955921408.0000
Epoch 40/50
321756672.0000
Epoch 41/50
848336896.0000
Epoch 42/50
548585984.0000
Epoch 43/50
736945664.0000
Epoch 44/50
284270080.0000
Epoch 45/50
758976512.0000
Epoch 46/50
697150464.0000
Epoch 47/50
694658560.0000
Epoch 48/50
71138048.0000
Epoch 49/50
03393024.0000
Epoch 50/50
682987520.0000
109/109 [==============================] - 1s 1ms/step
Mean Squared Error: 10499794813.146599
R-squared: -0.1518082573703612
In [45]: from sklearn.linear_model import Ridge, Lasso
lasso = Lasso(alpha=1.0)
lasso.fit(x_train, y_train)
y_pred_lasso = lasso.predict(x_test)
lasso_df = model_instantiation(lasso, x_train, x_test, y_train, y_test, y_pred_lasso, 'Lasso'

lasso_df
C:\Users\sumit\anaconda3\lib\site-packages\sklearn\linear_model\_coordinate_descent.py:628: C
onvergenceWarning: Objective did not converge. You might want to increase the number of itera
tions, check the scale of the features or consider increasing regularisation. Duality gap: 4.
345e+12, tolerance: 1.269e+10
model = cd_fast.enet_coordinate_descent(
Lasso 0.585009 0.588948 3.747114e+09 61213.678646 46199.161283
In [46]: ridge = Ridge(alpha=1.0)

ridge.fit(x_train, y_train)
y_pred_ridge = ridge.predict(x_test)
ridge_df = model_instantiation(ridge, x_train, x_test, y_train, y_test, y_pred_ridge, 'Ridge'

ridge_df
Ridge 0.584982 0.588941 3.747182e+09 61214.231846 46202.274281
In [47]: all_results = pd.concat([LR_df,DT_df,RF_df,RF_df_2,ada_df,xgboost_df,lasso_df,ridge_df])

all_results
Linear Regression 0.630962 0.624499 1.071205e-01 0.327293 0.251028
DTree_Regression 0.763267 0.717443 8.060598e-02 0.283912 0.208418
RF_Regression 0.850595 0.774959 6.419829e-02 0.253374 0.180186
RF2_Regression 0.830124 0.763076 6.758802e-02 0.259977 0.186851
ADA_Regression 0.599874 0.586475 1.179678e-01 0.343464 0.271343
XGB_Regression 0.923149 0.801669 5.657852e-02 0.237862 0.163638
Lasso 0.585009 0.588948 3.747114e+09 61213.678646 46199.161283
Ridge 0.584982 0.588941 3.747182e+09 61214.231846 46202.274281

House Price Prediction Models

Uploaded by

Copyright:

Available Formats

House Price Prediction Models

Uploaded by

Document Information

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

House Price Prediction Models

Uploaded by

Copyright:

Available Formats

In [1]: import pandas as pd

Out[1]: longitude latitude housing_median_age total_rooms total_bedrooms population households median_inco

0 -122.23 37.88 41 880 129.0 322 126 8.3

1 -122.22 37.86 21 7099 1106.0 2401 1138 8.3

2 -122.25 37.85 52 1627 280.0 565 259 3.8

3 -122.25 37.85 52 919 213.0 413 193 4.0

4 -122.25 37.84 52 2535 489.0 1094 514 3.6

In [2]: # Checking input data shape & info

In [3]: #Checking for NULL values in Data

In [4]: #Dropping NULL Values

In [5]: #Confirming dropna operation

In [6]: #Checking for any duplicates

In [7]: #Checking data composition for numerical features

Out[7]: longitude latitude housing_median_age total_rooms total_bedrooms population househ

count 18376.000000 18376.000000 18376.000000 18376.000000 18376.000000 18376.000000 18376.000

mean -119.571095 35.635164 28.605736 2635.302188 537.711199 1425.810786 499.37

std 2.003042 2.137485 12.570789 2200.534974 424.125849 1143.481721 384.514

min -124.350000 32.540000 1.000000 2.000000 2.000000 3.000000 2.000

25% -121.800000 33.930000 18.000000 1444.000000 295.000000 786.000000 280.000

50% -118.500000 34.260000 29.000000 2123.000000 434.000000 1165.500000 408.000

75% -118.010000 37.720000 37.000000 3137.000000 646.000000 1722.000000 603.000

max -114.310000 41.950000 52.000000 39320.000000 6445.000000 35682.000000 6082.000

In [8]: #Standardizing the numerical feilds.

In [9]: # Correlation matrix

In [10]: df=df.drop(['total_rooms','total_bedrooms','population'], axis=1)

In [11]: # Correlation matrix

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],

In [13]: #House price based on categorical value.

Out[13]: count mean std min 25% 50% 75% max

INLAND 5869.0 124937.335492 70783.931014 14999.0 77500.0 108300.0 148600.0 500001.0

ISLAND 5.0 380440.000000 80559.561816 287500.0 300000.0 414700.0 450000.0 450000.0

In [14]: #Same information using a bar chart.

In [16]: figure3, ax = plt.subplots(figsize = (12,8))

In [19]: #Checking for dummies action & feature name change

0 -122.23 37.88 41 126 8.3252 452600

1 -122.22 37.86 21 1138 8.3014 358500

2 -122.25 37.85 52 259 3.8462 342200

3 -122.25 37.85 52 193 4.0368 269700

4 -122.25 37.84 52 514 3.6591 299200

In [20]: #Distribution of median house value which we are predicting

In [22]: #Trying to identify outliers and remove them

# Set the threshold for outliers

# Identify outlier rows using boolean masks

# Display the number of outliers

# Remove the outliers

# Display the shape of the cleaned DataFrame

In [23]: #Confirming outliers removal

In [25]: #Checking if any column belongs to object

In [26]: #Seperating input features and labels

In [27]: # Split the dataset into training and testing sets

0 -122.23 37.88 41 126 8.3252 452600

1 -122.22 37.86 21 1138 8.3014 358500

2 -122.25 37.85 52 259 3.8462 342200

3 -122.25 37.85 52 193 4.0368 269700

4 -122.25 37.84 52 514 3.6591 299200

In [28]: # # Standardize the features using StandardScaler

In [29]: #Defining functions to calculate R2 and error rate

def model_instantiation(model, x_train, x_test, y_train, y_test, y_pred, model_name):

In [30]: from sklearn.linear_model import LinearRegression

In [31]: #Instantiating Linear reg model