# predicting_house_prices.py

# -*- coding: utf-8 -*-
"""Predicting_house_prices.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1IGlFxkg2yt_kwlumAQav1UMq1qK1IUVa
"""

import numpy as np
import pandas as pd 
import cv2
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the raw housing table.
prices = pd.read_csv('house prices.csv')

# Notebook-style preview; the result is not printed when run as a script.
prices.head()

# Locate every column that contains at least one missing value.
# `r` collects the positional indices of those columns and `nullity_sum`
# the matching NA counts, in the same order.
find_nullity = prices.isnull().sum()
r = []
nullity_sum = []
# Walk the counts by position. `find_nullity` is indexed by column name, and
# indexing it with a bare integer (`find_nullity[i]`) relies on a positional
# fallback that pandas has deprecated.
for position, na_count in enumerate(find_nullity.to_numpy()):
    if na_count != 0:
        r.append(position)
        nullity_sum.append(na_count)

# Summary 1: the positions of all affected columns on a single line.
print('columns number:', end='')
for position in r:
    print(position, end=' ')
print("are missing")

# Summary 2: column name next to its NA count.
for position, na_count in zip(r, nullity_sum):
    print(prices.columns[position], na_count)

# Summary 3: column position next to its NA count.
for position, na_count in zip(r, nullity_sum):
    print('column', position, 'has', na_count, 'NA items')

# Fill the gaps in every column that has fewer than 100 NAs. What a missing
# cell means depends on the kind of column:
#   1. Numeric column: the value is simply unknown, so it is replaced by the
#      mean of that column's known values.
#   2. Text column: NA is not missing data; it marks that the house does not
#      have the feature (e.g. NA in GarageCond means the house has no garage),
#      so it is replaced by the explicit category 'none'.
l1 = LabelEncoder()
from sklearn.impute import SimpleImputer

# NOTE(review): `imputer` is created here but not used by the loop below.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')


for position, na_count in zip(r, nullity_sum):
    if na_count >= 100:
        # Columns with 100 or more NAs are left untouched by this step.
        continue
    name = prices.columns[position]
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on `prices[name]`: the in-place call on that intermediate object is
    # chained assignment and does not reliably update `prices` itself.
    # The numeric test (rather than `dtype == 'object'`) keeps text columns
    # on the 'none' branch even when they use a dedicated string dtype.
    if pd.api.types.is_numeric_dtype(prices[name]):
        prices[name] = prices[name].fillna(prices[name].mean())
    else:
        prices[name] = prices[name].fillna('none')

# Drop the columns that have more than 100 NAs (listed by hand).
prices.drop(
    columns=[
        'LotFrontage',
        'Alley',
        'FireplaceQu',
        'PoolQC',
        'Fence',
        'MiscFeature',
    ],
    inplace=True,
)

# Notebook-style checks of the remaining NA counts and of the table itself;
# neither result is printed when run as a script.
prices.isnull().sum()

prices

# Replace the descriptive word in every text (non-numeric) column by an
# integer class label. The encoder is refitted for each column.
l1 = LabelEncoder()
for c in prices.columns:
    # Test for "not numeric" rather than `dtype == 'object'` so that text
    # columns stored with a dedicated string dtype are encoded as well.
    if not pd.api.types.is_numeric_dtype(prices[c]):
        prices[c] = l1.fit_transform(prices[c])

# Notebook-style preview; not printed when run as a script.
prices

# Min-max scale every column of the table (inputs and target alike).
# NOTE(review): the scaler is fitted on the whole table before the
# train/test split below, so the test rows take part in the scaling.
minmaxscaler = preprocessing.MinMaxScaler()
X = prices.values
p = minmaxscaler.fit_transform(X)
p_data = pd.DataFrame(p)

# Notebook-style preview; not printed when run as a script.
p_data

# Column 0 is left out, columns 1..73 are the model inputs and column 74 is
# the value to predict.
X_houses = p_data.iloc[:, 1:74]
y_houses = p_data.iloc[:, 74]

X1 = X_houses.to_numpy()
y1 = y_houses.to_numpy()

# Hold out 20% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X1,
    y1,
    test_size=0.2,
    random_state=42,
)

# Report the shapes of both partitions.
print('Training data shape : ', X_train.shape, y_train.shape)
print('Testing data shape : ', X_test.shape, y_test.shape)

# Count the distinct target values that occur in the training labels.
classes = np.unique(y_train)
nClasses = classes.size
print('Total number of outputs : ', nClasses)

from keras.models import Sequential
from keras.utils import np_utils
from keras.layers.core import Dense

# First experiment: three stacked linear Dense layers, optimised on the
# mean squared error.
model = Sequential([
    Dense(200, activation='linear', input_shape=(73,)),  # hidden layer 1
    Dense(100, activation='linear'),                     # hidden layer 2
    Dense(1, activation='linear'),                       # single output unit
])
model.summary()

# MAE and MSE are both tracked as metrics so the two experiments can be
# plotted the same way.
model.compile(
    loss='mean_squared_error',
    optimizer='adam',
    metrics=['mean_absolute_error', 'mean_squared_error'],
)

# 20% of the training rows are set aside for validation during fitting.
history = model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=10,
    validation_split=0.2,
)

# Training curves of the first model: one figure each for the mean absolute
# error, the mean squared error and the loss. Training values are drawn in
# red, validation values in blue.
for curve, legend_loc in (
    ('mean_absolute_error', 'lower right'),
    ('mean_squared_error', 'lower right'),
    ('loss', 'upper right'),
):
    fig = plt.figure()
    plt.plot(history.history[curve], 'r')
    plt.plot(history.history['val_' + curve], 'b')
    plt.title('model ' + curve)
    plt.ylabel(curve)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc=legend_loc)
    plt.grid()

# Predict the held-out test rows with the first model.
Y_pred = model.predict(X_test)

# Mean squared error on the test rows, computed in one vectorised step.
# Y_pred is flattened first so that it lines up element by element with the
# 1-D y_test (the model ends in a single-unit layer, so Y_pred presumably has
# shape (n, 1)). This yields a plain scalar and no longer rebinds the
# built-in `sum`.
MSE = np.mean((Y_pred.ravel() - y_test) ** 2)
print('the result of our mean squre error is', MSE)

# Start a new figure; without it the two lines would be drawn on top of the
# most recently created figure.
fig = plt.figure()
plt.plot(Y_pred, 'r')
plt.plot(y_test, 'b')
plt.title('model prediction and real values')
plt.legend(['Y_pred', 'Y_test'], loc='lower right')
plt.grid()

# Second experiment: the same three-layer linear stack, this time optimised
# on the mean absolute error.
model = Sequential([
    Dense(200, activation='linear', input_shape=(73,)),  # hidden layer 1
    Dense(100, activation='linear'),                     # hidden layer 2
    Dense(1, activation='linear'),                       # single output unit
])
model.summary()

model.compile(
    loss='mean_absolute_error',
    optimizer='adam',
    metrics=['mean_absolute_error', 'mean_squared_error'],
)

# 20% of the training rows are set aside for validation during fitting.
history = model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=10,
    validation_split=0.2,
)

# Training curves of the second model: one figure each for the mean absolute
# error, the mean squared error and the loss. Training values are drawn in
# red, validation values in blue.
for curve, legend_loc in (
    ('mean_absolute_error', 'lower right'),
    ('mean_squared_error', 'lower right'),
    ('loss', 'upper right'),
):
    fig = plt.figure()
    plt.plot(history.history[curve], 'r')
    plt.plot(history.history['val_' + curve], 'b')
    plt.title('model ' + curve)
    plt.ylabel(curve)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc=legend_loc)
    plt.grid()

# Predict the held-out test rows with the second model.
Y_pred = model.predict(X_test)

# Start a new figure; without it the two lines would be drawn on top of the
# most recently created figure.
fig = plt.figure()
plt.plot(Y_pred, 'r')
plt.plot(y_test, 'b')
plt.title('model prediction and real values')
plt.legend(['Y_pred', 'Y_test'], loc='lower right')
plt.grid()

# Mean squared error on the test rows, computed in one vectorised step.
# Y_pred is flattened first so that it lines up element by element with the
# 1-D y_test (the model ends in a single-unit layer, so Y_pred presumably has
# shape (n, 1)). This yields a plain scalar and no longer rebinds the
# built-in `sum`.
MSE = np.mean((Y_pred.ravel() - y_test) ** 2)
print('the result of our mean squre error is', MSE)