I’ve already worked with this dataset briefly in my Logistic Regression tutorial, but never in its entirety. I decided to revisit it using a Random Forest and submit the results to Kaggle. The data consists of a training and a test set of Titanic passengers, and the goal is to predict whether each passenger survived (1) or not (0).
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [2]:
#Import training data
df_test = pd.read_csv('test.csv')
df_train = pd.read_csv('train.csv')
In [3]:
df_test.head()
Out[3]:
In [4]:
df_train.head()
Out[4]:
In [5]:
#We have some missing data, so map out where the NaNs are
#Age looks recoverable, but Cabin is probably too sparse to salvage
sns.heatmap(df_train.isnull(),yticklabels=False,cbar=False,cmap='viridis')
Out[5]:
In [6]:
#Test dataset
sns.heatmap(df_test.isnull(),yticklabels=False,cbar=False,cmap='viridis')
Out[6]:
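Besides the heatmaps, a quick numeric summary makes it easy to confirm which columns are worth repairing. A minimal sketch, assuming df_train and df_test are loaded as above:
#Count missing values per column in both dataframes
print(df_train.isnull().sum().sort_values(ascending=False).head())
print(df_test.isnull().sum().sort_values(ascending=False).head())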
In [7]:
#Look at the age distribution by passenger class
#so we have a sensible per-class value to fill the missing ages with
sns.set_style('whitegrid')
plt.figure(figsize=(10,7))
sns.boxplot(x='Pclass',y='Age',data=df_train)
Out[7]:
In [8]:
#Fill out the Age column with the average ages of passengers per class
def impute_age(cols):
    Age = cols[0]
    Pclass = cols[1]
    if pd.isnull(Age):
        if Pclass == 1:
            return 37 #Return the avg age of passengers in the 1st class
        elif Pclass == 2:
            return 29 #2nd class
        else:
            return 24 #3rd class
    else:
        return Age
#Apply the function to the Age column
df_train['Age'] = df_train[['Age','Pclass']].apply(impute_age,axis=1)
df_test['Age'] = df_test[['Age','Pclass']].apply(impute_age,axis=1)
#Drop the cabin data
df_train.drop('Cabin',axis=1,inplace=True)
df_test.drop('Cabin',axis=1,inplace=True)
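The 37/29/24 values above are rounded class means read off the boxplot. If you'd rather not hardcode them, the per-class means can be computed from the training data (before imputing) and mapped onto the missing rows; a minimal sketch of that alternative:
#Compute the mean age per passenger class from the training data
class_means = df_train.groupby('Pclass')['Age'].mean()
#Fill missing ages with the mean age of each row's class
df_train['Age'] = df_train['Age'].fillna(df_train['Pclass'].map(class_means))
df_test['Age'] = df_test['Age'].fillna(df_test['Pclass'].map(class_means))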
In [9]:
#Recheck the heatmap
#No more problems with Age
sns.heatmap(df_train.isnull(),yticklabels=False,cbar=False,cmap='viridis')
Out[9]:
In [10]:
sns.heatmap(df_test.isnull(),yticklabels=False,cbar=False,cmap='viridis')
Out[10]:
In [11]:
df_test[df_test['Fare'].isnull()]
Out[11]:
In [12]:
df_test['Fare'][df_test['Pclass'] == 3].mean()
Out[12]:
In [14]:
#Fill the single missing Fare with the 3rd-class mean; use .loc to avoid chained assignment
df_test.loc[df_test['PassengerId'] == 1044, 'Fare'] = 12.45
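The 12.45 used here is just the rounded 3rd-class mean printed in the previous cell. The same fill can be done without hardcoding the number; a minimal sketch of that alternative:
#Fill the missing test Fare with the mean 3rd-class fare
mean_fare_3rd = df_test.loc[df_test['Pclass'] == 3, 'Fare'].mean()
df_test.loc[df_test['Fare'].isnull(), 'Fare'] = mean_fare_3rd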
In [15]:
#Determine where most 1st-class passengers embarked
#so we have a sensible value to fill the missing Embarked entries with
sns.set_style('whitegrid')
plt.figure(figsize=(10,7))
sns.countplot(x="Embarked", data=df_train[df_train['Pclass'] == 1])
Out[15]:
In [16]:
df_train[df_train['Embarked'].isnull()]
Out[16]:
In [20]:
#Fill the missing Embarked values with 'S'; use .loc to avoid chained assignment
df_train.loc[df_train['Embarked'].isnull(), 'Embarked'] = 'S'
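Equivalently, the fill value can be derived from the data instead of hardcoding 'S'; a minimal sketch that takes the most common port among 1st-class passengers (which is also 'S' here):
#Fill missing Embarked values with the most frequent 1st-class port
most_common_port = df_train.loc[df_train['Pclass'] == 1, 'Embarked'].mode()[0]
df_train['Embarked'] = df_train['Embarked'].fillna(most_common_port)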
In [21]:
df_train[df_train['Embarked'].isnull()]
Out[21]:
In [23]:
#Convert Sex into a single numeric dummy column - female is dropped because the male column already carries the same information (keeping both would be perfectly collinear)
#Convert Embarked into numeric dummy columns Q/S, with C dropped as the baseline
sex_train = pd.get_dummies(df_train['Sex'],drop_first=True)
embark_train = pd.get_dummies(df_train['Embarked'],drop_first=True)
#Repeat for test dataset
sex_test = pd.get_dummies(df_test['Sex'],drop_first=True)
embark_test = pd.get_dummies(df_test['Embarked'],drop_first=True)
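One thing to watch when calling get_dummies on the train and test frames separately is that the resulting columns only line up if both frames contain the same categories. They do here, but a quick check (and an optional re-alignment) costs little; a minimal sketch:
#Check that the train and test dummy columns match before concatenating
print(list(sex_train.columns), list(sex_test.columns))
print(list(embark_train.columns), list(embark_test.columns))
#If the test set were ever missing a category, align it to the training columns
embark_test = embark_test.reindex(columns=embark_train.columns, fill_value=0)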
In [24]:
#Add the new columns to the dataset
df_train = pd.concat([df_train,sex_train,embark_train],axis=1)
df_test = pd.concat([df_test,sex_test,embark_test],axis=1)
In [25]:
#Drop the old Sex/Embarked columns, along with the other columns we can't use for predictions
#Save passenger IDs for the submission file
pass_id = df_test['PassengerId']
df_train.drop(['PassengerId','Sex','Embarked','Name','Ticket'],axis=1,inplace=True)
df_test.drop(['PassengerId','Sex','Embarked','Name','Ticket'],axis=1,inplace=True)
In [26]:
df_train.head()
Out[26]:
In [27]:
df_test.head()
Out[27]:
In [28]:
#Assign variables
X = df_train.drop('Survived', axis=1)
y = df_train['Survived']
In [29]:
from sklearn.model_selection import train_test_split
In [30]:
#Split the training data into train/validation sets
#test_size = fraction of the data held out for evaluation (0.3 = 30%)
#random_state = seed that makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
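Since more passengers died than survived, an optional variant is to stratify the split so both sides keep the same survival ratio. This isn't what was run here, but the change is one argument:
#Stratified variant: keeps the Survived proportions equal in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101, stratify=y)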
In [31]:
from sklearn.ensemble import RandomForestClassifier
In [32]:
#Instantiate the classifier; try 200 trees to start
rfc = RandomForestClassifier(n_estimators=200)
In [33]:
#Fit model
rfc.fit(X_train,y_train)
Out[33]:
In [34]:
#Form predictions
predictions = rfc.predict(X_test)
In [35]:
from sklearn.metrics import classification_report, confusion_matrix
In [36]:
#Print reports
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
In [37]:
#Imports
from sklearn.model_selection import RandomizedSearchCV
In [38]:
#Provide a dictionary of these values to test
param_grid = {'bootstrap': [True, False],
              'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
              'max_features': ['auto', 'sqrt'],
              'min_samples_leaf': [1, 2, 4],
              'min_samples_split': [2, 5, 10],
              'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}
In [39]:
#Instantiate object
grid = RandomizedSearchCV(estimator=RandomForestClassifier(), param_distributions=param_grid,
                          n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
In [40]:
#Fit to find the best combo of params
grid.fit(X_train,y_train)
Out[40]:
In [41]:
#Show the best params to use
grid.best_params_
Out[41]:
In [42]:
#New predictions
grid_predictions = grid.predict(X_test)
In [43]:
#Print reports
print(confusion_matrix(y_test,grid_predictions))
print(classification_report(y_test,grid_predictions))
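The next cell re-creates the classifier with the best parameters typed in by hand. Since the fitted search object already stores them, an alternative is to reuse it directly; a minimal sketch, assuming grid has been fitted as above:
#Rebuild the model from the search results and refit on all the training data
rfc_best = RandomForestClassifier(**grid.best_params_)
rfc_best.fit(X, y)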
In [50]:
#Rebuild the classifier with the best parameters found by the search
rfc_full = RandomForestClassifier(n_estimators=2000,
                                  min_samples_split=2,
                                  min_samples_leaf=2,
                                  max_features='auto',
                                  max_depth=10,
                                  bootstrap=False)
#Fit model
rfc_full.fit(X,y)
Out[50]:
In [51]:
#Run on the test dataframe
full_predictions = rfc_full.predict(df_test)
In [52]:
full_predictions
Out[52]:
In [53]:
#Merge into final dataframe
pred_df = pd.DataFrame(full_predictions)
#Reset the indexes first (mismatched indexes can produce NaN rows when concatenating)
pred_df.reset_index(drop=True, inplace=True)
pass_id.reset_index(drop=True, inplace=True)
submission = pd.concat([pass_id.iloc[0:],pred_df], axis=1)
submission.columns=['PassengerId','Survived']
submission.to_csv('submission.csv', index = False)
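Before uploading, it's worth a quick sanity check that the file has one row per test passenger and no missing predictions:
#Sanity check: one prediction per test passenger, no missing values
print(submission.shape)           #the Kaggle test set has 418 rows, so expect (418, 2)
print(submission.isnull().sum())  #expect zeros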
In [47]:
submission.head()
Out[47]: