Titanic - Machine Learning from Disaster
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn.functional as functional
import torch.nn as nn
from torch.autograd import Variable
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
import csv
# Print the full path of every file found under the Kaggle input directory.
# The generator walks the tree lazily, so paths are printed in os.walk order.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv
/kaggle/input/titanic-model-02-07-2021/model_02_07_2021_epoch_200.pt
# Load the labelled training set and preview its first rows.
# NOTE: the name `train` is rebound later in this file by the `train()` function
# definition, so the DataFrame is only reachable under this name until then.
train = pd.read_csv("/kaggle/input/titanic/train.csv")
train.head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# Load the test set (the rows to predict) and preview its first rows.
test = pd.read_csv("/kaggle/input/titanic/test.csv")
test.head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 892 | 3 | Kelly, Mr. James | male | 34.5 | 0 | 0 | 330911 | 7.8292 | NaN | Q |
1 | 893 | 3 | Wilkes, Mrs. James (Ellen Needs) | female | 47.0 | 1 | 0 | 363272 | 7.0000 | NaN | S |
2 | 894 | 2 | Myles, Mr. Thomas Francis | male | 62.0 | 0 | 0 | 240276 | 9.6875 | NaN | Q |
3 | 895 | 3 | Wirz, Mr. Albert | male | 27.0 | 0 | 0 | 315154 | 8.6625 | NaN | S |
4 | 896 | 3 | Hirvonen, Mrs. Alexander (Helga E Lindqvist) | female | 22.0 | 1 | 1 | 3101298 | 12.2875 | NaN | S |
The test set has no 'Survived' column: it is the target we have to predict.
def display_all(df):
    """Show *df* with the pandas row and column display limits raised to 1000.

    The limits are applied through ``pd.option_context`` and therefore only
    hold for the duration of this call.
    """
    unrestricted = ("display.max_rows", 1000, "display.max_columns", 1000)
    with pd.option_context(*unrestricted):
        # `display` is not defined in this file; presumably the helper that
        # the notebook environment injects -- confirm before running as a script.
        display(df)
Encode categorical variables
# Stack train and test so every encoding/imputation step uses the same
# categories and statistics. Test rows end up with Survived = NaN.
# ignore_index avoids duplicate row labels (both frames start at index 0).
df = pd.concat([train, test], axis=0, sort=True, ignore_index=True)

# Encode Sex as integer category codes (female -> 0, male -> 1).
df['Sex'] = df['Sex'].astype('category').cat.codes

# Impute missing values. The result is assigned back instead of calling
# fillna(inplace=True) on a column selection, which can act on a temporary
# object and leave `df` unchanged on newer pandas versions.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Fare'] = df['Fare'].fillna(df['Fare'].median())

# One-hot encode the port of embarkation. The dtype is pinned so the columns
# stay numeric 0/1 regardless of the pandas default.
embarked_dummies = pd.get_dummies(df['Embarked'], prefix='Embarked', dtype='uint8')
df = pd.concat([df, embarked_dummies], axis=1)
del df['Embarked']

# Keep the test passenger ids for the submission before the column is dropped.
X_test_passengers = test['PassengerId']
df.drop(['Name', 'Cabin', 'Ticket', 'PassengerId'], axis=1, inplace=True)
df.head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
Age | Fare | Parch | Pclass | Sex | SibSp | Survived | Embarked_C | Embarked_Q | Embarked_S | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 22.0 | 7.2500 | 0 | 3 | 1 | 1 | 0.0 | 0 | 0 | 1 |
1 | 38.0 | 71.2833 | 0 | 1 | 0 | 1 | 1.0 | 1 | 0 | 0 |
2 | 26.0 | 7.9250 | 0 | 3 | 0 | 0 | 1.0 | 0 | 0 | 1 |
3 | 35.0 | 53.1000 | 0 | 1 | 0 | 1 | 1.0 | 0 | 0 | 1 |
4 | 35.0 | 8.0500 | 0 | 3 | 1 | 0 | 0.0 | 0 | 0 | 1 |
Scale continuous variables
# Standardise each numeric column on its own: cast to float32, reshape to a
# single-feature 2-D array for the scaler, and write the scaled values back.
continuous = ['Age', 'Fare', 'Parch', 'Pclass', 'SibSp']
scaler = StandardScaler()
for var in continuous:
    column = df[var].astype('float32').to_numpy().reshape(-1, 1)
    df[var] = scaler.fit_transform(column)

# Summary statistics of every column, one row per column.
display_all(df.describe(include='all').T)
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
Age | 1309.0 | 2.914207e-09 | 1.000382 | -2.273836 | -0.581628 | -0.116523 | 0.426099 | 3.914388 |
Fare | 1309.0 | -2.914207e-09 | 1.000382 | -0.643464 | -0.490805 | -0.364003 | -0.038786 | 9.262028 |
Parch | 1309.0 | 1.602814e-08 | 1.000382 | -0.444999 | -0.444999 | -0.444999 | -0.444999 | 9.956863 |
Pclass | 1309.0 | 2.185655e-08 | 1.000382 | -1.546098 | -0.352091 | 0.841916 | 0.841916 | 0.841916 |
Sex | 1309.0 | 6.440031e-01 | 0.478997 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 |
SibSp | 1309.0 | -3.497049e-08 | 1.000382 | -0.479087 | -0.479087 | -0.479087 | 0.481288 | 7.203909 |
Survived | 891.0 | 3.838384e-01 | 0.486592 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
Embarked_C | 1309.0 | 2.062643e-01 | 0.404777 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
Embarked_Q | 1309.0 | 9.396486e-02 | 0.291891 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
Embarked_S | 1309.0 | 6.997708e-01 | 0.458533 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 |
Neural Network
Separate the combined data back into train and test sets
# Rows that came from the training file are exactly those with a Survived
# value; the remaining rows (Survived is NaN) are the test rows.
has_label = df['Survived'].notna()
X_train = df[has_label].drop(columns=['Survived'])
y_train = df.loc[has_label, 'Survived']
X_test = df[~has_label].drop(columns=['Survived'])
Convert to tensors
# Convert to float32 explicitly: the linear layers hold float32 weights, so the
# feature tensors must not depend on whatever dtype pandas infers for the
# mixed-type frame.
X_train_t = torch.tensor(X_train.values.astype('float32'))
y_train_t = torch.tensor(y_train.values).float()
X_test_t = torch.tensor(X_test.values.astype('float32'))
Create a hold-out validation set
train_test_size = 0.8
seed_random = 123
# Split features and labels in a single call so both are indexed by the same
# permutation; alignment no longer relies on two calls sharing a seed.
X_train_t, X_val_t, y_train_t, y_val_t = train_test_split(
    X_train_t,
    y_train_t,
    random_state=seed_random,
    train_size=train_test_size,
    shuffle=True,
)
print(X_train_t.shape, X_val_t.shape)
print(y_train_t.shape, y_val_t.shape)
print(X_test_t.shape)
torch.Size([712, 9]) torch.Size([179, 9])
torch.Size([712]) torch.Size([179])
torch.Size([418, 9])
class TitanicDataset(Dataset):
    """Pairs a feature tensor with its labels for use with a ``DataLoader``.

    Labels are stored as float32 with an extra trailing dimension, i.e. a
    label tensor of shape ``(n,)`` is kept as ``(n, 1)``.
    """

    def __init__(self, features, labels):
        self.features = features
        # Cast and add the trailing dimension in one step.
        self.labels = labels.to(dtype=torch.float32).unsqueeze(1)

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return self.labels.shape[0]

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at *index*."""
        sample = self.features[index]
        target = self.labels[index]
        return sample, target
# Wrap the train/validation tensors in datasets and batch them.
# Both loaders use batches of 64 samples and reshuffle on every pass.
loader_options = dict(batch_size=64, shuffle=True)
dataset_train = TitanicDataset(X_train_t, y_train_t)
dataset_val = TitanicDataset(X_val_t, y_val_t)
dataloader_train = DataLoader(dataset_train, **loader_options)
dataloader_val = DataLoader(dataset_val, **loader_options)
class Network(nn.Module):
    """Binary classifier with one hidden layer of 250 ReLU units.

    Args:
        channels_in: Number of input features per sample.
    """

    def __init__(self, channels_in):
        super().__init__()
        self.fc1 = nn.Linear(channels_in, 250)
        self.fc2 = nn.Linear(250, 1)

    def forward(self, x):
        """Map a ``(batch, channels_in)`` input to ``(batch, 1)`` probabilities."""
        hidden = functional.relu(self.fc1(x))
        logits = self.fc2(hidden)
        # The sigmoid is applied to the raw logits. A ReLU in front of it would
        # clamp every probability to >= 0.5, so the model could never express
        # confidence in the negative class and the BCE loss could not drop
        # below ln(2) on negative samples.
        # torch.sigmoid replaces the deprecated nn.functional.sigmoid.
        return torch.sigmoid(logits)
# Instantiate the model with one input unit per feature column of the training tensor.
network = Network(X_train_t.shape[1])
def train(network, device, dataloader_train, dataloader_val, loss_function, optimizer, epochs):
    """Train ``network`` and evaluate it on the validation loader every epoch.

    Args:
        network: Model to optimise; it is moved to ``device`` in place.
        device: Device on which the model and every batch are placed.
        dataloader_train: Loader yielding ``(features, labels)`` training batches.
        dataloader_val: Loader yielding ``(features, labels)`` validation batches.
        loss_function: Callable ``loss_function(outputs, labels)`` returning a
            scalar tensor (assumed to be the batch mean -- TODO confirm when a
            loss other than the default-reduction one is passed).
        optimizer: Optimizer already bound to ``network``'s parameters.
        epochs: Number of passes over the training data.

    Returns:
        Tuple ``(train_loss, val_loss)`` with the per-epoch average loss of
        each phase.
    """
    network.to(device=device)
    train_loss, val_loss = [], []
    for epoch in range(epochs):
        for phase in ['train', 'val']:
            if phase == 'train':
                network.train(True)
                dataloader = dataloader_train
            else:
                network.train(False)
                dataloader = dataloader_val

            actual_loss = 0.0
            actual_acc = 0.0
            for features, labels in dataloader:
                features = features.to(device=device)
                labels = labels.to(device=device)
                if phase == 'train':
                    outputs = network(features)
                    loss = loss_function(outputs, labels)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                else:
                    with torch.no_grad():
                        outputs = network(features)
                        loss = loss_function(outputs, labels)
                actual_acc += accuracy_metric(outputs, labels)
                # Weight the batch loss by the number of samples actually in
                # this batch. Using dataloader.batch_size over-counts the
                # last, smaller batch and inflates the epoch average.
                actual_loss += loss.item() * labels.size(0)

            sample_count = len(dataloader.dataset)
            actual_acc /= sample_count
            epoch_loss = actual_loss / sample_count

            if epoch % 10 == 0:
                print('Phase: ' + str(phase) + ', epoch loss: ' + str(epoch_loss))
                if phase == 'val':
                    print('Accuracy: ' + str(actual_acc))
                    # NOTE(review): the export lost indentation; the checkpoint
                    # is written together with the validation report (every
                    # 10th epoch) -- confirm against the original notebook.
                    torch.save(network.state_dict(), "/kaggle/working/model_02_07_2021_epoch_" + str(epoch) + '.pt')

            if phase == 'train':
                train_loss.append(epoch_loss)
            else:
                val_loss.append(epoch_loss)
    print('Training complete')
    return train_loss, val_loss
# Binary cross-entropy on probabilities; Network.forward ends in a sigmoid, so its outputs lie in [0, 1].
loss_function = nn.BCELoss()
# Adam over all parameters of `network` with a learning rate of 5e-4.
optimizer = torch.optim.Adam(network.parameters(), lr = 0.0005)
def accuracy_metric(outputs, labels):
    """Count the predictions that agree with ``labels`` at a 0.5 threshold.

    A sample counts as correct when its output is above 0.5 and its label is
    1, or its output is at most 0.5 and its label is 0.

    Args:
        outputs: Tensor of predicted probabilities, one value per sample.
        labels: Tensor of 0/1 targets with the same number of elements.

    Returns:
        The number of correct predictions as a Python ``int`` (a count, not a
        ratio; the caller divides by the dataset size).
    """
    # Flatten both tensors so (batch, 1) and (batch,) layouts compare
    # element-wise instead of broadcasting against each other.
    scores = outputs.reshape(-1)
    targets = labels.reshape(-1)
    # Vectorised comparison replaces the per-element Python loop.
    hit_positive = (scores > 0.5) & (targets == 1)
    hit_negative = (scores <= 0.5) & (targets == 0)
    return int((hit_positive | hit_negative).sum().item())
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Training on device: ' + str(device))

# Run 250 epochs and keep the per-epoch loss curves of both phases.
train_loss, val_loss = train(
    network,
    device,
    dataloader_train,
    dataloader_val,
    loss_function,
    optimizer,
    epochs=250,
)
Training on device: cpu
Phase: train, epoch loss: 0.7500095206700014
Phase: val, epoch loss: 0.746956148627084
Accuracy: 0.6759776536312849
/opt/conda/lib/python3.7/site-packages/torch/nn/functional.py:1639: UserWarning: nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.
warnings.warn("nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.")
Phase: train, epoch loss: 0.6989852819549903
Phase: val, epoch loss: 0.688111523676185
Accuracy: 0.7374301675977654
Phase: train, epoch loss: 0.6610210236538662
Phase: val, epoch loss: 0.6522676478551087
Accuracy: 0.8044692737430168
Phase: train, epoch loss: 0.6394790317235368
Phase: val, epoch loss: 0.6450501553839145
Accuracy: 0.8156424581005587
Phase: train, epoch loss: 0.6441745650902223
Phase: val, epoch loss: 0.6406888695402518
Accuracy: 0.8156424581005587
Phase: train, epoch loss: 0.6182442140043451
Phase: val, epoch loss: 0.6365754324630652
Accuracy: 0.8212290502793296
Phase: train, epoch loss: 0.6528599503334989
Phase: val, epoch loss: 0.6293146357190009
Accuracy: 0.8435754189944135
Phase: train, epoch loss: 0.6156364719519455
Phase: val, epoch loss: 0.6348543646615311
Accuracy: 0.8324022346368715
Phase: train, epoch loss: 0.6249435939145892
Phase: val, epoch loss: 0.6385784895060449
Accuracy: 0.8379888268156425
Phase: train, epoch loss: 0.6158574190032616
Phase: val, epoch loss: 0.6353226773565708
Accuracy: 0.8324022346368715
Phase: train, epoch loss: 0.6223965387665824
Phase: val, epoch loss: 0.6355847406653719
Accuracy: 0.8324022346368715
Phase: train, epoch loss: 0.6187425516964344
Phase: val, epoch loss: 0.6331084267387177
Accuracy: 0.8379888268156425
Phase: train, epoch loss: 0.6244893877693777
Phase: val, epoch loss: 0.6418670995275402
Accuracy: 0.8379888268156425
Phase: train, epoch loss: 0.6148248468891958
Phase: val, epoch loss: 0.6355740637752597
Accuracy: 0.8379888268156425
Phase: train, epoch loss: 0.61067996935898
Phase: val, epoch loss: 0.6341138019242101
Accuracy: 0.8379888268156425
Phase: train, epoch loss: 0.6284844259197793
Phase: val, epoch loss: 0.6336266714767371
Accuracy: 0.8324022346368715
Phase: train, epoch loss: 0.6137268730763639
Phase: val, epoch loss: 0.6372238670647478
Accuracy: 0.8435754189944135
Phase: train, epoch loss: 0.6046084902259741
Phase: val, epoch loss: 0.6455482823888683
Accuracy: 0.8324022346368715
Phase: train, epoch loss: 0.6147312876883517
Phase: val, epoch loss: 0.6365692309161138
Accuracy: 0.8324022346368715
Phase: train, epoch loss: 0.6180874310182721
Phase: val, epoch loss: 0.6368886212396888
Accuracy: 0.8379888268156425
Phase: train, epoch loss: 0.6117992776163509
Phase: val, epoch loss: 0.6310592310388661
Accuracy: 0.8379888268156425
Phase: train, epoch loss: 0.6172718305266305
Phase: val, epoch loss: 0.6451516604290328
Accuracy: 0.8379888268156425
Phase: train, epoch loss: 0.6047140453638655
Phase: val, epoch loss: 0.6597522330683703
Accuracy: 0.8435754189944135
Phase: train, epoch loss: 0.5996088499433538
Phase: val, epoch loss: 0.6491866511339582
Accuracy: 0.8435754189944135
Phase: train, epoch loss: 0.5983408167121116
Phase: val, epoch loss: 0.6487724687800062
Accuracy: 0.8435754189944135
Training complete
Evaluate
# Rebuild the architecture and restore the weights saved at epoch 200.
network = Network(X_train_t.shape[1])
checkpoint_path = '../input/titanic-model-02-07-2021/model_02_07_2021_epoch_200.pt'
# map_location keeps the load working on a CPU-only host even when the
# checkpoint was written from a model living on a CUDA device. The inference
# code that follows runs on CPU tensors.
network.load_state_dict(torch.load(checkpoint_path, map_location=torch.device('cpu')))
# Switch to evaluation mode before running inference.
network.eval()
Network(
(fc1): Linear(in_features=9, out_features=250, bias=True)
(fc2): Linear(in_features=250, out_features=1, bias=True)
)
# Predict on the test features. Gradients are not needed for inference, so the
# deprecated Variable(..., requires_grad=True) wrapper is dropped.
with torch.no_grad():
    output_test = network(X_test_t.float())

# Threshold at 0.5, the same rule accuracy_metric uses. For probabilities in
# [0, 1] this matches torch.round, which rounds exactly 0.5 down to 0.
survived = (output_test > 0.5).reshape(-1).int().tolist()

# Header row followed by one (PassengerId, Survived) row per test passenger.
submission = [['PassengerId', 'Survived']]
submission.extend(
    [passenger_id, prediction]
    for passenger_id, prediction in zip(X_test_passengers.tolist(), survived)
)

# newline='' is what the csv module requires of files it writes to; without
# it, platforms that translate line endings emit blank rows.
with open('/kaggle/working/submission.csv', 'w', newline='') as f:
    csv.writer(f).writerows(submission)
print('Writing complete')
Writing complete