Titanic: Machine Learning from Disaster

4 minute read

Analysis and predictions on the classic Kaggle Titanic dataset.

The data has been downloaded from https://www.kaggle.com/c/titanic.

This is a Jupyter notebook aimed at providing insight into the major classification algorithms used in data analysis.

# This Python 3 environment comes with many helpful analytics libraries installed.
# For example, here are several helpful packages to load in:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_columns', 100, 'display.max_rows', 500)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


# importing all models

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier


import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Loading the test data

test_data = pd.read_csv("test.csv")
test_data.head()
|   | PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 892 | 3 | Kelly, Mr. James | male | 34.5 | 0 | 0 | 330911 | 7.8292 | NaN | Q |
| 1 | 893 | 3 | Wilkes, Mrs. James (Ellen Needs) | female | 47.0 | 1 | 0 | 363272 | 7.0000 | NaN | S |
| 2 | 894 | 2 | Myles, Mr. Thomas Francis | male | 62.0 | 0 | 0 | 240276 | 9.6875 | NaN | Q |
| 3 | 895 | 3 | Wirz, Mr. Albert | male | 27.0 | 0 | 0 | 315154 | 8.6625 | NaN | S |
| 4 | 896 | 3 | Hirvonen, Mrs. Alexander (Helga E Lindqvist) | female | 22.0 | 1 | 1 | 3101298 | 12.2875 | NaN | S |
# Loading the training data
 
train_data = pd.read_csv("train.csv")
train_data.head()
|   | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# Dropping unnecessary columns

train_df = train_data.drop(['Ticket', 'Cabin'], axis=1)
test_df = test_data.drop(['Ticket', 'Cabin'], axis=1)

train_df = train_df.drop(['Name', 'PassengerId'], axis=1)
test_df = test_df.drop(['Name','PassengerId'], axis=1)

combine = [train_df, test_df]
# Encoding the categorical variable 'Sex'

for dataset in combine:
    dataset['Sex'] = dataset['Sex'].map( {'female': 1, 'male': 0} ).astype(int)
# Checking for missing values in the raw training data
train_data.isnull().sum()
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
# Handling NaN values in the Age column

train_df['Age'] = train_df['Age'].fillna(train_df['Age'].mean())
test_df['Age'] = test_df['Age'].fillna(train_df['Age'].mean())  # filled with the training mean
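Note that the test set is filled with the training-set mean rather than its own, which avoids leaking test statistics into the features. The same imputation can be expressed with scikit-learn's SimpleImputer, which learns the statistic on the training data and reuses it at prediction time. A minimal sketch, not part of the notebook's pipeline:

from sklearn.impute import SimpleImputer

age_imputer = SimpleImputer(strategy='mean')  # same statistic as fillna(mean) above
train_df[['Age']] = age_imputer.fit_transform(train_df[['Age']])
test_df[['Age']] = age_imputer.transform(test_df[['Age']])  # reuses the training mean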
# Handling NaN values in the Embarked column

freq_port = train_df.Embarked.dropna().mode()[0]  # the most frequent port of embarkation

for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)
    
train_df[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean().sort_values(by='Survived', ascending=False)
|   | Embarked | Survived |
|---|---|---|
| 0 | C | 0.553571 |
| 1 | Q | 0.389610 |
| 2 | S | 0.339009 |
# Encoding values in the Embarked column

combine = [train_df, test_df]

for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

train_df.head()
|   | Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked |
|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | 0 | 22.0 | 1 | 0 | 7.2500 | 0 |
| 1 | 1 | 1 | 1 | 38.0 | 1 | 0 | 71.2833 | 1 |
| 2 | 1 | 3 | 1 | 26.0 | 0 | 0 | 7.9250 | 0 |
| 3 | 1 | 1 | 1 | 35.0 | 1 | 0 | 53.1000 | 0 |
| 4 | 0 | 3 | 0 | 35.0 | 0 | 0 | 8.0500 | 0 |
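Mapping S/C/Q to 0/1/2 imposes an artificial ordering on the ports. One-hot encoding is the usual alternative; a hedged sketch of how that would look if applied instead of the map step above (not what this notebook does):

# One-hot encode Embarked instead of the ordinal 0/1/2 mapping
train_ohe = pd.get_dummies(train_df, columns=['Embarked'], prefix='Port')
test_ohe = pd.get_dummies(test_df, columns=['Embarked'], prefix='Port')

The choice matters most for the linear and distance-based models used later; tree-based models are largely indifferent to it.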


features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

X_train = pd.get_dummies(train_df[features])
Y_train = train_df["Survived"]
X_test  = pd.get_dummies(test_df[features])
X_train.shape, Y_train.shape, X_test.shape
((891, 7), (891,), (418, 7))
# The test set still has a missing Fare value; fill it before predicting
X_test['Fare'] = X_test['Fare'].fillna(X_test['Fare'].mean())
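Note that pd.get_dummies above is effectively a no-op here: every column in features is already numeric after the encoding steps, so X_train is just train_df[features]. Also, every accuracy reported below is measured on the training data itself. A held-out validation split gives a more honest estimate; a minimal sketch (variable names are illustrative):

from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, Y_train, test_size=0.2, random_state=1, stratify=Y_train)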
# Random Forest Classifier

random_forest = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
random_forest.fit(X_train, Y_train)
predictions = random_forest.predict(X_test)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest

#output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})

85.41
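The commented-out line in the cell above builds the submission frame; writing it to disk is all that is left for a Kaggle entry. A short sketch (the file name is just the usual convention, not something the notebook fixes):

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)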
# Logistic Regression

logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
acc_log
80.02
# Support Vector Machines

svc = SVC()
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
acc_svc
89.23
# k-Nearest Neighbors

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
acc_knn
83.16
# Gaussian Naive Bayes

gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)
acc_gaussian
79.35
# Perceptron

perceptron = Perceptron()
perceptron.fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)
acc_perceptron
61.73
# Stochastic Gradient Descent

sgd = SGDClassifier()
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
acc_sgd
71.04
# Decision Tree

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree
98.2
# The fits above can all be done in a single loop. Note that every score
# here is a training-set accuracy, which is why the unconstrained decision
# tree looks nearly perfect: it is overfitting.


algolist = [LogisticRegression(), SVC(), RandomForestClassifier(), KNeighborsClassifier(),
            GaussianNB(), Perceptron(), DecisionTreeClassifier()]

for algo in algolist:
    algo.fit(X_train, Y_train)
    predictions = algo.predict(X_test)
    accuracy = round(algo.score(X_train, Y_train) * 100, 2)
    print(f"The accuracy score using {type(algo).__name__} is {accuracy} %")
The accuracy score using LogisticRegression is 80.02 %
The accuracy score using SVC is 89.23 %
The accuracy score using RandomForestClassifier is 97.19 %
The accuracy score using KNeighborsClassifier is 80.58 %
The accuracy score using GaussianNB is 79.35 %
The accuracy score using Perceptron is 61.73 %
The accuracy score using DecisionTreeClassifier is 98.2 %
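Because every score in the loop is a training accuracy, high-capacity models such as the unconstrained decision tree dominate the ranking. Cross-validation ranks the models on data they have not seen; a minimal sketch using 5-fold cross_val_score:

from sklearn.model_selection import cross_val_score

for algo in algolist:
    scores = cross_val_score(algo, X_train, Y_train, cv=5)
    print(f"{type(algo).__name__}: {scores.mean() * 100:.2f} % (+/- {scores.std() * 100:.2f})")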