/Kaggle-Titanic

Kaggle Competition: Titanic - Machine Learning from Disaster

Primary Language: Jupyter Notebook

Titanic - Machine Learning from Disaster

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn.functional as functional
import torch.nn as nn
from torch.autograd import Variable

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import csv
# Walk the Kaggle input directory and print the full path of every file found.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv
/kaggle/input/titanic-model-02-07-2021/model_02_07_2021_epoch_200.pt
# Load the training split (it includes the 'Survived' target column) and
# preview its first rows in the notebook.
train = pd.read_csv("/kaggle/input/titanic/train.csv")
train.head()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
# Load the test split, for which predictions are submitted, and preview its
# first rows in the notebook.
test = pd.read_csv("/kaggle/input/titanic/test.csv")
test.head()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S

Note that the test set has no 'Survived' column: it is the target we have to predict.

def display_all(df):
    """Display ``df`` with the pandas row and column display limits raised to 1000.

    The limits are only changed for the duration of the call; ``display`` is
    the notebook-provided rendering function.
    """
    wide_options = ("display.max_rows", 1000, "display.max_columns", 1000)
    with pd.option_context(*wide_options):
        display(df)

Encode categorical variables

# Stack train and test so encoding and imputation are applied identically to
# both. The test rows have no 'Survived' value, so it is NaN for them.
# NOTE(review): the fill values below are computed over train+test combined,
# which lets test-set statistics influence training; confirm this is intended.
df = pd.concat([train, test], axis=0, sort=True)

# Encode 'Sex' as integer category codes.
df['Sex'] = df['Sex'].astype('category').cat.codes

# Impute missing values. The result is assigned back instead of calling
# fillna(..., inplace=True) on a column selection: that chained in-place form
# is deprecated in recent pandas and is not guaranteed to update `df`.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Fare'] = df['Fare'].fillna(df['Fare'].median())

# One-hot encode 'Embarked'. The explicit integer dtype keeps the dummy columns
# numeric on every pandas version (newer versions default to bool), so that
# df.values stays a plain numeric array for the tensor conversion later on.
embarked_dummies = pd.get_dummies(df['Embarked'], prefix='Embarked', dtype='uint8')
df = pd.concat([df, embarked_dummies], axis=1)
del df['Embarked']

# Keep the test passenger ids for the submission file before dropping the
# columns that are not used as model features.
X_test_passengers = test['PassengerId']
df.drop(['Name', 'Cabin', 'Ticket', 'PassengerId'], axis=1, inplace=True)
df.head()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
Age Fare Parch Pclass Sex SibSp Survived Embarked_C Embarked_Q Embarked_S
0 22.0 7.2500 0 3 1 1 0.0 0 0 1
1 38.0 71.2833 0 1 0 1 1.0 1 0 0
2 26.0 7.9250 0 3 0 0 1.0 0 0 1
3 35.0 53.1000 0 1 0 1 1.0 0 0 1
4 35.0 8.0500 0 3 1 0 0.0 0 0 1

Scale continuous variables

# Columns that get standardised to zero mean and unit variance.
continuous = ['Age', 'Fare', 'Parch', 'Pclass', 'SibSp']

scaler = StandardScaler()

# The scaler is re-fitted for every column, so after the loop it only holds
# the statistics of the last column processed.
for column in continuous:
    column_values = df[column].astype('float32').values.reshape(-1, 1)
    df[column] = scaler.fit_transform(column_values)

# Summary statistics of every column, shown without display truncation.
display_all(df.describe(include='all').T)
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
count mean std min 25% 50% 75% max
Age 1309.0 2.914207e-09 1.000382 -2.273836 -0.581628 -0.116523 0.426099 3.914388
Fare 1309.0 -2.914207e-09 1.000382 -0.643464 -0.490805 -0.364003 -0.038786 9.262028
Parch 1309.0 1.602814e-08 1.000382 -0.444999 -0.444999 -0.444999 -0.444999 9.956863
Pclass 1309.0 2.185655e-08 1.000382 -1.546098 -0.352091 0.841916 0.841916 0.841916
Sex 1309.0 6.440031e-01 0.478997 0.000000 0.000000 1.000000 1.000000 1.000000
SibSp 1309.0 -3.497049e-08 1.000382 -0.479087 -0.479087 -0.479087 0.481288 7.203909
Survived 891.0 3.838384e-01 0.486592 0.000000 0.000000 0.000000 1.000000 1.000000
Embarked_C 1309.0 2.062643e-01 0.404777 0.000000 0.000000 0.000000 0.000000 1.000000
Embarked_Q 1309.0 9.396486e-02 0.291891 0.000000 0.000000 0.000000 0.000000 1.000000
Embarked_S 1309.0 6.997708e-01 0.458533 0.000000 0.000000 1.000000 1.000000 1.000000

Neural Network

Separate the data back into train and test sets

# Rows that came from the training file are the ones with a 'Survived' value;
# the test rows carry NaN in that column after the earlier concatenation.
has_label = df['Survived'].notnull()
labelled_rows = df[has_label]

X_train = labelled_rows.drop(columns=['Survived'])
y_train = labelled_rows['Survived']
X_test = df[~has_label].drop(columns=['Survived'])

Convert to tensors

# Cast the feature matrices to float32 explicitly instead of relying on the
# dtype pandas happens to infer for the mixed-type frame: nn.Linear expects
# float32 inputs, and an object or float64 array would fail downstream.
X_train_t = torch.tensor(X_train.values.astype('float32'))
y_train_t = torch.tensor(y_train.values).float()
X_test_t = torch.tensor(X_test.values.astype('float32'))

Create a hold-out validation set

# Fraction of the labelled data kept for training; the rest is held out for validation.
train_test_size = 0.8

seed_random = 123

# Split features and labels in ONE call so each feature row is guaranteed to
# stay paired with its label. Two separate calls only stay aligned as long as
# both use the same seed and the same number of rows.
X_train_t, X_val_t, y_train_t, y_val_t = train_test_split(
    X_train_t,
    y_train_t,
    random_state=seed_random,
    train_size=train_test_size,
    shuffle=True,
)
print(X_train_t.shape, X_val_t.shape)
print(y_train_t.shape, y_val_t.shape)
print(X_test_t.shape)
torch.Size([712, 9]) torch.Size([179, 9])
torch.Size([712]) torch.Size([179])
torch.Size([418, 9])
class TitanicDataset(Dataset):
    """Map-style dataset that pairs a feature tensor with its labels.

    Labels are converted to float32 and given a trailing singleton dimension,
    so each item is a ``(features_row, label)`` pair whose label has shape ``(1,)``.
    """

    def __init__(self, features, labels):
        self.features = features
        # float32 labels with an extra trailing axis of size one.
        self.labels = labels.to(torch.float32).unsqueeze(1)

    def __len__(self):
        # Number of samples is the number of stored labels.
        return self.labels.shape[0]

    def __getitem__(self, index):
        sample = self.features[index]
        target = self.labels[index]
        return sample, target
dataset_train = TitanicDataset(features=X_train_t, labels=y_train_t)
dataset_val = TitanicDataset(features=X_val_t, labels=y_val_t)

# Only the training loader is shuffled. Evaluation does not depend on sample
# order, so the validation loader iterates deterministically.
dataloader_train = DataLoader(dataset=dataset_train, batch_size=64, shuffle=True)
dataloader_val = DataLoader(dataset=dataset_val, batch_size=64, shuffle=False)
class Network(nn.Module):
    """Binary classifier with a single hidden layer.

    Architecture: ``Linear(channels_in, 250) -> ReLU -> Linear(250, 1) -> sigmoid``.
    The output is a probability in (0, 1) with shape ``(batch, 1)``.

    Args:
        channels_in: number of input features per sample.
    """

    def __init__(self, channels_in):
        super().__init__()

        self.fc1 = nn.Linear(channels_in, 250)
        self.fc2 = nn.Linear(250, 1)

    def forward(self, x):
        """Return the predicted probability for each row of ``x``."""
        hidden = functional.relu(self.fc1(x))
        logits = self.fc2(hidden)
        # No ReLU on the logits: clamping them to >= 0 before the sigmoid would
        # confine the output to [0.5, 1), so the model could never express a
        # probability below 0.5 and negative logits would receive no gradient.
        # Thresholding at 0.5 gives the same decision either way, and the
        # state-dict layout (fc1, fc2) is unchanged, so old checkpoints still load.
        # torch.sigmoid replaces the deprecated nn.functional.sigmoid.
        return torch.sigmoid(logits)
# Size the input layer from the number of feature columns in the training tensor.
network = Network(X_train_t.shape[1])
def train(network, device, dataloader_train, dataloader_val, loss_function, optimizer, epochs):
    """Train ``network`` and evaluate it on the validation loader after every epoch.

    Relies on the module-level ``accuracy_metric(outputs, labels)`` helper,
    which must return the number of correct predictions in a batch.

    Args:
        network: the ``nn.Module`` to optimise; it is moved to ``device``.
        device: device on which the model and every batch are placed.
        dataloader_train: loader yielding ``(features, labels)`` training batches.
        dataloader_val: loader yielding ``(features, labels)`` validation batches.
        loss_function: callable ``(outputs, labels) -> scalar loss tensor``.
        optimizer: optimizer used to update the network's parameters.
        epochs: number of passes over the training data.

    Returns:
        ``(train_loss, val_loss)``: two lists holding one epoch loss per epoch.
        Each epoch loss is the per-batch loss weighted by the batch's real
        size and divided by the dataset size (a true per-sample mean when
        ``loss_function`` returns the batch mean).

    Side effects:
        Every 10th epoch (starting with epoch 0) the losses and the validation
        accuracy are printed and the model's state dict is written to
        ``/kaggle/working/model_02_07_2021_epoch_<epoch>.pt``.
    """
    network.to(device=device)
    train_loss, val_loss = [], []

    for epoch in range(epochs):
        for phase in ['train', 'val']:
            is_training = phase == 'train'
            network.train(is_training)
            dataloader = dataloader_train if is_training else dataloader_val

            actual_loss = 0.0
            actual_acc = 0.0

            for features, labels in dataloader:
                features = features.to(device=device)
                labels = labels.to(device=device)

                if is_training:
                    outputs = network(features)
                    loss = loss_function(outputs, labels)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                else:
                    with torch.no_grad():
                        outputs = network(features)
                        loss = loss_function(outputs, labels)
                        actual_acc += accuracy_metric(outputs, labels)

                # Weight by the number of samples actually in this batch. The
                # last batch is usually smaller than dataloader.batch_size, so
                # using the nominal size would overstate the epoch loss.
                actual_loss += loss.item() * features.size(0)

            n_samples = len(dataloader.dataset)
            actual_acc /= n_samples
            epoch_loss = actual_loss / n_samples

            if epoch % 10 == 0:
                print(f'Phase: {phase}, epoch loss: {epoch_loss}')
                if is_training:
                    # The weights only change during the training phase, so one
                    # checkpoint per logging epoch is enough.
                    torch.save(
                        network.state_dict(),
                        f'/kaggle/working/model_02_07_2021_epoch_{epoch}.pt',
                    )
                else:
                    print(f'Accuracy: {actual_acc}')

            if is_training:
                train_loss.append(epoch_loss)
            else:
                val_loss.append(epoch_loss)

    print('Training complete')
    return train_loss, val_loss
# Binary cross-entropy on probabilities: the network's forward pass already
# ends in a sigmoid, so its outputs are passed to the loss directly.
loss_function = nn.BCELoss()
# Adam over all network parameters with a learning rate of 5e-4.
optimizer = torch.optim.Adam(network.parameters(), lr = 0.0005)

def accuracy_metric(outputs, labels):
    """Count the predictions in a batch that agree with their labels.

    A prediction is positive when its probability is strictly greater than
    0.5. A sample counts as correct when it is predicted positive and its
    label equals 1, or predicted non-positive and its label equals 0.

    Args:
        outputs: tensor of predicted probabilities, one per sample.
        labels: tensor of target values with the same number of elements.

    Returns:
        The number of correct predictions as a Python ``int``. This is a
        count, not a ratio; the caller divides by the dataset size.
    """
    # Flatten both tensors so (N, 1) and (N,) inputs are compared element-wise
    # instead of being broadcast against each other.
    probs = outputs.reshape(-1)
    targets = labels.reshape(-1)

    # Vectorised form of the per-sample check; avoids one Python-level
    # tensor-to-bool conversion per element.
    true_positive = (probs > 0.5) & (targets == 1)
    true_negative = (probs <= 0.5) & (targets == 0)
    return int((true_positive | true_negative).sum().item())
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Training on device: {}'.format(device))

# Run 250 epochs and keep the per-epoch loss curves for both phases.
train_loss, val_loss = train(
    network,
    device,
    dataloader_train,
    dataloader_val,
    loss_function,
    optimizer,
    epochs=250,
)
Training on device: cpu
Phase: train, epoch loss: 0.7500095206700014
Phase: val, epoch loss: 0.746956148627084
Accuracy: 0.6759776536312849


/opt/conda/lib/python3.7/site-packages/torch/nn/functional.py:1639: UserWarning: nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.
  warnings.warn("nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.")


Phase: train, epoch loss: 0.6989852819549903
Phase: val, epoch loss: 0.688111523676185
Accuracy: 0.7374301675977654
Phase: train, epoch loss: 0.6610210236538662
Phase: val, epoch loss: 0.6522676478551087
Accuracy: 0.8044692737430168
Phase: train, epoch loss: 0.6394790317235368
Phase: val, epoch loss: 0.6450501553839145
Accuracy: 0.8156424581005587
Phase: train, epoch loss: 0.6441745650902223
Phase: val, epoch loss: 0.6406888695402518
Accuracy: 0.8156424581005587
Phase: train, epoch loss: 0.6182442140043451
Phase: val, epoch loss: 0.6365754324630652
Accuracy: 0.8212290502793296
Phase: train, epoch loss: 0.6528599503334989
Phase: val, epoch loss: 0.6293146357190009
Accuracy: 0.8435754189944135
Phase: train, epoch loss: 0.6156364719519455
Phase: val, epoch loss: 0.6348543646615311
Accuracy: 0.8324022346368715
Phase: train, epoch loss: 0.6249435939145892
Phase: val, epoch loss: 0.6385784895060449
Accuracy: 0.8379888268156425
Phase: train, epoch loss: 0.6158574190032616
Phase: val, epoch loss: 0.6353226773565708
Accuracy: 0.8324022346368715
Phase: train, epoch loss: 0.6223965387665824
Phase: val, epoch loss: 0.6355847406653719
Accuracy: 0.8324022346368715
Phase: train, epoch loss: 0.6187425516964344
Phase: val, epoch loss: 0.6331084267387177
Accuracy: 0.8379888268156425
Phase: train, epoch loss: 0.6244893877693777
Phase: val, epoch loss: 0.6418670995275402
Accuracy: 0.8379888268156425
Phase: train, epoch loss: 0.6148248468891958
Phase: val, epoch loss: 0.6355740637752597
Accuracy: 0.8379888268156425
Phase: train, epoch loss: 0.61067996935898
Phase: val, epoch loss: 0.6341138019242101
Accuracy: 0.8379888268156425
Phase: train, epoch loss: 0.6284844259197793
Phase: val, epoch loss: 0.6336266714767371
Accuracy: 0.8324022346368715
Phase: train, epoch loss: 0.6137268730763639
Phase: val, epoch loss: 0.6372238670647478
Accuracy: 0.8435754189944135
Phase: train, epoch loss: 0.6046084902259741
Phase: val, epoch loss: 0.6455482823888683
Accuracy: 0.8324022346368715
Phase: train, epoch loss: 0.6147312876883517
Phase: val, epoch loss: 0.6365692309161138
Accuracy: 0.8324022346368715
Phase: train, epoch loss: 0.6180874310182721
Phase: val, epoch loss: 0.6368886212396888
Accuracy: 0.8379888268156425
Phase: train, epoch loss: 0.6117992776163509
Phase: val, epoch loss: 0.6310592310388661
Accuracy: 0.8379888268156425
Phase: train, epoch loss: 0.6172718305266305
Phase: val, epoch loss: 0.6451516604290328
Accuracy: 0.8379888268156425
Phase: train, epoch loss: 0.6047140453638655
Phase: val, epoch loss: 0.6597522330683703
Accuracy: 0.8435754189944135
Phase: train, epoch loss: 0.5996088499433538
Phase: val, epoch loss: 0.6491866511339582
Accuracy: 0.8435754189944135
Phase: train, epoch loss: 0.5983408167121116
Phase: val, epoch loss: 0.6487724687800062
Accuracy: 0.8435754189944135
Training complete

Evaluate

# Build a fresh model and restore saved weights for inference.
# NOTE(review): this loads a checkpoint from the attached input dataset, not
# one of the files written by the training run above; confirm that is intended.
network = Network(X_train_t.shape[1])
# map_location='cpu' lets the checkpoint load on a machine without a GPU even
# if it was saved from a CUDA run.
state_dict = torch.load(
    '../input/titanic-model-02-07-2021/model_02_07_2021_epoch_200.pt',
    map_location='cpu',
)
network.load_state_dict(state_dict)
# Switch to inference mode; as the last expression of the cell this also shows
# the module summary in the notebook.
network.eval()
Network(
  (fc1): Linear(in_features=9, out_features=250, bias=True)
  (fc2): Linear(in_features=250, out_features=1, bias=True)
)
# X_test_t is already a tensor, so only the dtype is normalised to float32.
# The deprecated Variable wrapper and requires_grad=True are dropped: no
# gradients are computed inside torch.no_grad().
test_var = X_test_t.float()
with torch.no_grad():
    output_test = network(test_var)
# Same decision rule as the validation accuracy: strictly above 0.5 means
# survived (1.0), anything else means did not survive (0.0).
survived = (output_test > 0.5).float().numpy()

# Header row followed by one [PassengerId, Survived] row per test passenger.
# zip pairs ids and predictions by position, so it does not depend on the
# index labels of X_test_passengers.
submission = [['PassengerId', 'Survived']]
submission.extend(
    [passenger_id, int(prediction[0])]
    for passenger_id, prediction in zip(X_test_passengers, survived)
)

# newline='' is what the csv module requires of files handed to csv.writer;
# without it, extra blank lines appear on platforms that translate '\n'.
with open('/kaggle/working/submission.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(submission)

print('Writing complete')
Writing complete