Titanic: Machine Learning from Disaster
Analysis and predictions on the Titanic dataset, one of the most popular datasets on Kaggle.
The data was downloaded from https://www.kaggle.com/c/titanic.
This is a Jupyter notebook aimed at providing insight into the major algorithms used in data analysis.
# This Python 3 environment comes with many helpful analytics libraries installed
# For example, here are several helpful packages to load in
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_columns', 100, 'display.max_rows', 500)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# importing all models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
#Loading Test data
test_data = pd.read_csv("test.csv")
test_data.head()
| | PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 892 | 3 | Kelly, Mr. James | male | 34.5 | 0 | 0 | 330911 | 7.8292 | NaN | Q |
| 1 | 893 | 3 | Wilkes, Mrs. James (Ellen Needs) | female | 47.0 | 1 | 0 | 363272 | 7.0000 | NaN | S |
| 2 | 894 | 2 | Myles, Mr. Thomas Francis | male | 62.0 | 0 | 0 | 240276 | 9.6875 | NaN | Q |
| 3 | 895 | 3 | Wirz, Mr. Albert | male | 27.0 | 0 | 0 | 315154 | 8.6625 | NaN | S |
| 4 | 896 | 3 | Hirvonen, Mrs. Alexander (Helga E Lindqvist) | female | 22.0 | 1 | 1 | 3101298 | 12.2875 | NaN | S |
#Loading Training data
train_data = pd.read_csv("train.csv")
train_data.head()
| | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
#Dropping unnecessary columns
train_df = train_data.drop(['Ticket', 'Cabin'], axis=1)
test_df = test_data.drop(['Ticket', 'Cabin'], axis=1)
train_df = train_df.drop(['Name', 'PassengerId'], axis=1)
test_df = test_df.drop(['Name','PassengerId'], axis=1)
combine = [train_df, test_df]
#Encoding categorical variable 'Sex'
for dataset in combine:
    dataset['Sex'] = dataset['Sex'].map( {'female': 1, 'male': 0} ).astype(int)
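As a quick sanity check (a sketch, not part of the original run), the encoded column can be inspected:
#Sketch: verify the encoding took effect; Sex should now hold only 0 (male) and 1 (female)
print(train_df['Sex'].value_counts())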
train_data.isnull().sum()
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
#Handling NaN values in Age column
train_df['Age']=train_df['Age'].fillna(train_df['Age'].mean())
test_df['Age']=test_df['Age'].fillna(train_df['Age'].mean())
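Filling with the global mean is simple but ignores structure in the data. A hedged alternative (a sketch; the (Pclass, Sex) grouping is an assumption, and this notebook uses the mean fill above instead) is to impute the median age per group, computed on the training set:
#Sketch of an alternative: group-wise median imputation instead of a global mean.
#The (Pclass, Sex) grouping is an assumption, not what this notebook actually uses.
medians = train_df.groupby(['Pclass', 'Sex'])['Age'].median()
for df in (train_df, test_df):
    mask = df['Age'].isna()
    df.loc[mask, 'Age'] = [medians[(p, s)] for p, s in zip(df.loc[mask, 'Pclass'], df.loc[mask, 'Sex'])]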
#Handling NaN values in Embarked column
freq_port = train_df.Embarked.dropna().mode()[0]
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)
train_df[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean().sort_values(by='Survived', ascending=False)
| | Embarked | Survived |
|---|---|---|
| 0 | C | 0.553571 |
| 1 | Q | 0.389610 |
| 2 | S | 0.339009 |
#Encoding values in Embarked column
combine = [train_df, test_df]
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
train_df.head()
| | Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked |
|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | 0 | 22.0 | 1 | 0 | 7.2500 | 0 |
| 1 | 1 | 1 | 1 | 38.0 | 1 | 0 | 71.2833 | 1 |
| 2 | 1 | 3 | 1 | 26.0 | 0 | 0 | 7.9250 | 0 |
| 3 | 1 | 1 | 1 | 35.0 | 1 | 0 | 53.1000 | 0 |
| 4 | 0 | 3 | 0 | 35.0 | 0 | 0 | 8.0500 | 0 |
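A caveat on the 0/1/2 mapping: it imposes an artificial ordering between the ports. One-hot encoding is a common alternative (shown below as a sketch on the raw data, not what this notebook does):
#Sketch: one-hot encode Embarked instead of the ordinal 0/1/2 mapping.
#This operates on the raw 'S'/'C'/'Q' strings in train_data, before any mapping is applied.
embarked_ohe = pd.get_dummies(train_data['Embarked'], prefix='Embarked')
#yields indicator columns Embarked_C, Embarked_Q, Embarked_S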
features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
#All of the selected columns are already numeric, so get_dummies leaves them unchanged here
X_train = pd.get_dummies(train_df[features])
Y_train = train_df["Survived"]
X_test = pd.get_dummies(test_df[features])
X_train.shape, Y_train.shape, X_test.shape
((891, 7), (891,), (418, 7))
X_test['Fare'] = X_test['Fare'].fillna(X_test['Fare'].mean())
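One thing to keep in mind: the Kaggle test set has no labels, so every accuracy reported below is measured on the training data itself. A held-out validation split (a sketch, not part of the original notebook) gives a better estimate of generalization:
#Sketch: hold out 20% of the training data to estimate generalization
from sklearn.model_selection import train_test_split
X_tr, X_val, y_tr, y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=1, stratify=Y_train)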
#Random Forest Classifier
random_forest = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
random_forest.fit(X_train, Y_train)
predictions = random_forest.predict(X_test)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest
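With the forest fitted, its feature_importances_ attribute shows which inputs drive the predictions (the inspection below is a sketch, not output from the original run):
#Inspect which features the fitted forest relies on most
importances = pd.Series(random_forest.feature_importances_, index=X_train.columns)
print(importances.sort_values(ascending=False))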
#Preparing the Kaggle submission file from the random forest predictions
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)  # the filename is arbitrary
#Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
acc_log
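Logistic regression is also easy to interpret; its coefficients indicate each feature's direction of influence (a sketch, not part of the original run):
#Sketch: per-feature coefficients of the fitted logistic regression
coeffs = pd.Series(logreg.coef_[0], index=X_train.columns)
print(coeffs.sort_values(ascending=False))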
# Support Vector Machines
svc = SVC()
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
acc_svc
# k-Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
acc_knn
# Gaussian Naive Bayes
gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)
acc_gaussian
# Perceptron
perceptron = Perceptron()
perceptron.fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)
acc_perceptron
# Stochastic Gradient Descent
sgd = SGDClassifier()
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
acc_sgd
# Decision Tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree
#The above iterations could also be done in a single snippet
algolist = [LogisticRegression(), SVC(), RandomForestClassifier(), KNeighborsClassifier(), GaussianNB(),
            Perceptron(), DecisionTreeClassifier()]
for algo in algolist:
    algo.fit(X_train, Y_train)
    predictions = algo.predict(X_test)
    accuracy = round(algo.score(X_train, Y_train) * 100, 2)
    print("The accuracy score using " + type(algo).__name__ + " is " + str(accuracy) + " %")
The accuracy score using LogisticRegression is 80.02 %
The accuracy score using SVC is 89.23 %
The accuracy score using RandomForestClassifier is 97.19 %
The accuracy score using KNeighborsClassifier is 80.58 %
The accuracy score using GaussianNB is 79.35 %
The accuracy score using Perceptron is 61.73 %
The accuracy score using DecisionTreeClassifier is 98.2 %
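Since these figures are training accuracies, the near-perfect tree scores largely reflect overfitting rather than real predictive power. A fairer comparison (a sketch using scikit-learn's cross_val_score, not part of the original run) averages accuracy over cross-validation folds:
#Sketch: 5-fold cross-validation gives an honest estimate for each model
from sklearn.model_selection import cross_val_score
for algo in algolist:
    scores = cross_val_score(algo, X_train, Y_train, cv=5, scoring='accuracy')
    print(type(algo).__name__ + ": " + str(round(scores.mean() * 100, 2)) + " %")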