This is a more involved Keras example covering regression. It uses a good-sized dataset from Kaggle, and requires a bit of data cleansing before we can build out the model. The model we end up with isn't perfect and would benefit from more tuning or further dataset alterations, but it's a good example nonetheless. More information below.

## Keras / Tensorflow Regression - Example¶

Here we're going to attempt to utilize Keras/Tensorflow to predict the price of homes based upon a set of features.

The data being used comes from Kaggle.

### Imports¶

```
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
```

### Data Exploration and Cleansing¶

```
df = pd.read_csv('DATA/kc_house_data.csv')
```

Since we're going to predict prices, we can start with a quick distribution plot. The vast majority of prices sit around 500k, with a small number of outliers stretching all the way out past 7m.

```
plt.figure(figsize=(15,6))
sns.histplot(df['price'], kde=True)  #distplot is deprecated in recent seaborn releases
```

One thing we may want to do is get rid of these outliers (at least to an extent). These will certainly affect our model and may skew results. Sorting by price (and from the visual above) we can see that ~3.5m may be a logical cutoff for keeping data.

```
df.sort_values('price',ascending=False).head(20)
```

If we take this concept, we can find out what percentage of homes sit above the 3.5m mark.

```
#Original
df.shape
```

```
#At or below the 3.5m cutoff
df[df['price'] <= 3500000].shape
```

The homes above this cutoff account for less than one percent of the dataset, so we can drop them entirely.
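The arithmetic behind that claim, sketched on a toy frame (in the notebook, the two `.shape` calls above give the real counts):

```python
import pandas as pd

# Toy stand-in for the real df loaded from kc_house_data.csv
df = pd.DataFrame({'price': [300_000, 500_000, 450_000, 4_000_000, 600_000]})

total = len(df)
above_cutoff = len(df[df['price'] > 3_500_000])
pct_above = 100 * above_cutoff / total
print(pct_above)  # 20.0 on this toy frame; roughly 1% on the full dataset
```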

```
#Kick out outliers
df = df[df['price'] <= 3500000]
```

We do have a date column in the DataFrame, but it'll probably make more sense to convert it into month/year columns to allow for better analysis.

```
#Convert string to date
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].apply(lambda date : date.year)
df['month'] = df['date'].apply(lambda date : date.month)
```
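The `.apply` calls above work fine, but pandas can do the same extraction vectorized through the `.dt` accessor. A small sketch with made-up dates in the same `20141013T000000` format the CSV uses:

```python
import pandas as pd

# Toy dates; the real column comes from kc_house_data.csv
dates = pd.to_datetime(pd.Series(['20141013T000000', '20150225T000000']))

# .dt gives vectorized access to datetime components, no lambda needed
years = dates.dt.year
months = dates.dt.month
print(list(years), list(months))  # [2014, 2015] [10, 2]
```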

```
#Drop since we no longer need the original field
df = df.drop('date',axis=1)
```

We can also drop a few other fields we won't need. ID serves no purpose in our model and can be thrown away. Zipcode isn't in a useful format for predictive analysis and would need to be transformed in some way; in reading up on this dataset, folks seem to agree that the zipcode data is bad or incorrect, so I'm going to cut my losses and toss it as well.

```
df = df.drop('id',axis=1)
df = df.drop('zipcode',axis=1)
```

```
df.head()
```

### Build Model¶

```
#Separate features from label
X = df.drop('price',axis=1)
y = df['price']
```

```
from sklearn.model_selection import train_test_split
```

```
#Perform splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
```

#### Perform Scaling¶

Since the value ranges vary widely across fields, it's best to scale the data first.

```
from sklearn.preprocessing import MinMaxScaler
```

```
scaler = MinMaxScaler()
```

```
#Fit on the training set only, then transform both sets
#(fitting on the test set would leak test information into the scaling)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
```
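A tiny sketch of why the fit/transform split matters: the scaler learns min/max from the training data only, and the test set is mapped with those same statistics (toy values here, not the housing data):

```python
import numpy as np
from sklearn.preprocessing import MinMaxScaler

X_train = np.array([[1.0], [3.0], [5.0]])
X_test = np.array([[2.0], [6.0]])

scaler = MinMaxScaler()
X_train_s = scaler.fit_transform(X_train)  # learns min=1, max=5 from train
X_test_s = scaler.transform(X_test)        # reuses the train min/max
print(X_test_s.ravel())  # [0.25 1.25] -- test values can fall outside [0, 1]
```

Had we refit on the test set, both sets would be squeezed into [0, 1] independently and the two feature spaces would no longer be comparable.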

#### Fit to Neural Network¶

```
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
```

```
#Base the network size on the number of feature columns
X_train.shape
```

```
model = Sequential()
#Make it 19 neurons since we have 19 cols
model.add(Dense(19,activation='relu'))
model.add(Dropout(0.25)) #Randomly disable 25% of neurons each pass (value between 0 and 1, where 1 = 100%) to help avoid overfitting
model.add(Dense(19,activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')
```

```
from tensorflow.keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='val_loss',mode='min',verbose=1,patience=25)
```

To see how our model actually performs (and whether we're overfitting), we pass the test values via the validation_data parameter so we can compare predictions against actuals at each epoch. A batch_size of 128 keeps the updates manageable (smaller batches generally help reduce overfitting). We've also applied the 25% dropout on the hidden layers, and an early stop in case overfitting is detected.

```
model.fit(x=X_train, y=y_train.values,
          validation_data=(X_test, y_test.values),
          batch_size=128, epochs=400, callbacks=[early_stop])
```

Compare the losses of the training vs. test dataset to see if we're overfitting.

If you see the orange line (val_loss) climb toward the end, it means you're overfitting: the loss on your validation data is growing while the training loss keeps dropping.

With the plot below, we do see val_loss start to rise slightly after roughly 200 epochs, which could indicate a little bit of overfitting. The early stopping callback caught this and kept us from running the full 400 epochs.

```
losses = pd.DataFrame(model.history.history)
losses.plot()
```

#### Predict and Evaluate¶

```
from sklearn.metrics import mean_squared_error,mean_absolute_error,explained_variance_score
```

```
#Grab predictions from the testing set
predictions = model.predict(X_test)
```

```
#MAE
mean_absolute_error(y_test,predictions)
```

```
#Mean price
df['price'].mean()
```

The Mean Absolute Error indicates we're off by an average of ~158k against an average home price of ~536k, meaning we're off by a little under 30%. I'm guessing we're still seeing too much interference from the expensive outliers that remain below the 3.5m cutoff, so let's continue.
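The rough arithmetic behind that 30% figure, using approximate values pulled from the outputs above:

```python
mae = 158_000          # approximate mean absolute error from above
mean_price = 536_000   # approximate mean sale price from above
error_pct = 100 * mae / mean_price
print(round(error_pct, 1))  # 29.5, i.e. a little under 30%
```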

```
#RMSE (square root of the MSE, so it's in price units)
np.sqrt(mean_squared_error(y_test,predictions))
```

```
#Best possible score is 1.0
#How much variance is explained by our model?
explained_variance_score(y_test,predictions)
```

You can see we're only explaining approximately 64% of the variance in the data. We can plot the predictions against the actual values to see visually how well we're doing.

```
plt.figure(figsize=(12,6))
plt.scatter(y_test,predictions)
plt.plot(y_test,y_test,'r')
```

We're actually doing a good job on predictions for prices below 1.5m or so. The problem seems to be with the more expensive houses.
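One way to quantify that observation is to bucket the test homes by price and compare the mean absolute error per band. A sketch on toy stand-ins for `y_test` and `predictions` (the real arrays come from the model above):

```python
import numpy as np
import pandas as pd

# Toy stand-ins, chosen so the expensive band errs more -- mirroring
# what the scatter plot above suggests about the real data
y_test = pd.Series([300_000, 900_000, 1_600_000, 2_500_000])
predictions = np.array([310_000, 950_000, 1_300_000, 1_800_000])

errors = pd.DataFrame({'actual': y_test,
                       'abs_err': (y_test - predictions).abs()})
errors['band'] = pd.cut(errors['actual'],
                        bins=[0, 1_500_000, 4_000_000],
                        labels=['<=1.5m', '>1.5m'])
band_mae = errors.groupby('band', observed=True)['abs_err'].mean()
print(band_mae)  # much larger mean error in the >1.5m band
```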

### Predict New Home Prices¶

So how can we predict the price of a brand-new home? In the example below we take the first house in the DataFrame and remove its price. Then we scale the data and predict using our trained model, so we can compare the predicted price against the actual.

```
#Assign the first house in the DataFrame to a new DataFrame
single_house = df.drop('price',axis=1).iloc[0]
```

```
#Scale the values AND reshape the results (so the shape matches the expected shape for the model)
single_house = scaler.transform(single_house.values.reshape(-1,19))
```

```
#Predict the price
model.predict(single_house)
```

```
#Check the real price
df.head(1)
```

We're overshooting the price a bit here: we predict ~280k, but the house actually sold for 222k. We'd need to spend some more time tweaking the model to get things to come out better. It takes a while to get this right; this result came after trying 4-5 different models, and we're getting much closer to the actual price.
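For reference, the size of that overshoot as a percentage of the actual price (approximate figures from above):

```python
predicted = 280_000   # approximate model prediction from above
actual = 222_000      # actual sale price of the first house
overshoot_pct = 100 * (predicted - actual) / actual
print(round(overshoot_pct, 1))  # 26.1, i.e. ~26% over the actual price
```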