## The Titanic Survival Example Using PyTorch

A well-known example of a binary classification problem is the Titanic survival dataset. The raw data has 1309 rows and 14 columns: pclass, survived, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked, boat, body, dest. To predict if someone survived, the three variables that make the most sense to use are: pclass (passenger class – 1st = 100, 2nd = 010, 3rd = 001), sex (-1 = male, +1 = female), and age (divided by 100). So, I created a training file using those three predictor variables and the survived dependent variable. After removing 263 lines of data that had missing age values, the training file had 1046 rows. The resulting data looks like:

```# survived, pclass (one-hot: 3 cols), sex, age
1,1,0,0,+1,0.2900
1,1,0,0,-1,0.0092
0,1,0,0,+1,0.0200
0,1,0,0,-1,0.3000
. . .
```

See jamesmccaffrey.wordpress.com/2023/01/17/preparing-the-titanic-dataset-for-pytorch/.

I created a PyTorch binary classifier for the Titanic dataset. I used an architecture of 5-(10-10)-1 with xavier_uniform() initialization, tanh hidden activation, and logistic sigmoid output activation. For training, I used a batch size of 10, Binary Cross Entropy loss, stochastic gradient descent optimization, and 600 epochs.

The model achieved 80.50% accuracy (842 out of 1046 correct).

I did a Google search for other results on the Titanic dataset. Almost all of them got about 80% accuracy, regardless of technique. I did see some outrageously exaggerated claims that were clearly bogus. Ultimately, the Titanic dataset is a bit too simple for the prediction technique to make much difference.

Good fun. In the 1930s, flying boats were the titans of the air. I always thought they had a certain kind of beauty. Left: The German Dornier Do X had a crew of 12 and could carry about 80 passengers. Right: The Boeing 314 Clipper had a crew of 11 and could carry about 68 passengers, or 36 passengers with sleeping compartments.

Demo code. The publishing system mangles comparison operators, so replace “lt”, “gt”, “lte”, “gte” with the corresponding comparison operator symbols (<, >, <=, >=).

```# titanic_survival.py
# binary classification
# PyTorch 1.12.1-CPU Anaconda3-2020.02  Python 3.7.6
# Windows 10/11

import numpy as np
import torch as T
device = T.device('cpu')  # apply to Tensor or Module

class PassengerDataset(T.utils.data.Dataset):
  """Titanic survival data as a PyTorch Dataset.

  Source file is comma-delimited text, lines starting with '#' ignored:
  # surv p-class  sex   age
  #  0    1 0 0   -1  0.3500
  #  1    0 0 1   +1  0.2500
  #
  # surv: 0 = no, 1 = yes
  # p-class: one-hot encoded 1st, 2nd, 3rd
  # sex: -1 = male, +1 = female
  # age: actual age divided by 100
  """

  def __init__(self, src_file):
    # BUG FIX: the posted code referenced all_data without ever
    # loading it -- read all 6 columns from the text file here.
    all_data = np.loadtxt(src_file, usecols=range(0, 6),
      delimiter=",", comments="#", dtype=np.float32)

    self.x_data = T.tensor(all_data[:, 1:6],
      dtype=T.float32).to(device)   # 5 predictor columns
    self.y_data = T.tensor(all_data[:, 0],
      dtype=T.float32).to(device)   # float32 required by BCELoss

    self.y_data = self.y_data.reshape(-1, 1)  # 2-D required

  def __len__(self):
    return len(self.x_data)

  def __getitem__(self, idx):
    predictors = self.x_data[idx, :]  # idx row, all 5 cols
    surv = self.y_data[idx, :]        # idx row, the only col
    return predictors, surv           # as a Tuple

# ---------------------------------------------------------

class Net(T.nn.Module):
  """5-(10-10)-1 binary classifier: tanh hidden layers, sigmoid output.

  Weights use Glorot (xavier) uniform initialization, biases zero.
  The sigmoid output pairs with BCELoss() during training.
  """

  def __init__(self):
    super(Net, self).__init__()
    self.hid1 = T.nn.Linear(5, 10)  # 5-(10-10)-1
    self.hid2 = T.nn.Linear(10, 10)
    self.oupt = T.nn.Linear(10, 1)

    T.nn.init.xavier_uniform_(self.hid1.weight)
    T.nn.init.zeros_(self.hid1.bias)
    T.nn.init.xavier_uniform_(self.hid2.weight)
    T.nn.init.zeros_(self.hid2.bias)
    T.nn.init.xavier_uniform_(self.oupt.weight)
    T.nn.init.zeros_(self.oupt.bias)

  def forward(self, x):
    z = T.tanh(self.hid1(x))
    z = T.tanh(self.hid2(z))
    z = T.sigmoid(self.oupt(z))  # pseudo-probability for BCELoss()
    return z

# ---------------------------------------------------------

def metrics(model, ds, thresh=0.5):
  """Compute (accuracy, precision, recall, F1) for a binary classifier.

  model(x) must return a pseudo-probability of class 1; ds[i] must
  return a (predictors, target) tuple where target is [0.0] or [1.0].
  Items with p >= thresh are predicted as class 1.

  Note: precision/recall divide by tp+fp / tp+fn -- caller should
  ensure the model is not degenerate (all-0 or all-1 predictions).
  """
  # N = total number of items = TP + FP + TN + FN
  tp = 0; tn = 0; fp = 0; fn = 0
  for i in range(len(ds)):
    # BUG FIX: the posted code assigned the whole (inputs, target)
    # tuple to both variables instead of unpacking it.
    (inpts, target) = ds[i]  # target: float32 [0.0] or [1.0]
    target = target.int()    # 0 or 1

    p = model(inpts).item()  # between 0.0 and 1.0

    # most common def FN, FP
    if target == 1 and p >= thresh:    # TP
      tp += 1
    elif target == 1 and p < thresh:   # FN
      fn += 1
    elif target == 0 and p < thresh:   # TN
      tn += 1
    elif target == 0 and p >= thresh:  # FP
      fp += 1

  N = tp + fp + tn + fn
  if N != len(ds):
    print("FATAL LOGIC ERROR in metrics()")

  accuracy = (tp + tn) / (N * 1.0)
  precision = (1.0 * tp) / (tp + fp)  # check tp+fp != 0
  recall = (1.0 * tp) / (tp + fn)     # check tp+fn != 0
  f1 = 2.0 / ((1.0 / precision) + (1.0 / recall))
  return (accuracy, precision, recall, f1)  # as a Tuple

# ---------------------------------------------------------

def main():
  """Train, evaluate, save, and exercise the Titanic survival model."""
  # 0. get started
  print("\nTitanic survival using PyTorch ")
  T.manual_seed(1)
  np.random.seed(1)

  # 1. create Dataset and DataLoader objects
  print("\nCreating Passenger train Datasets ")
  train_file = ".\\Data\\titanic_train.txt"   # 1046 items
  train_ds = PassengerDataset(train_file)

  bat_size = 10
  # BUG FIX: the posted code dropped the DataLoader construction;
  # L141 was a dangling keyword-argument fragment.
  train_ldr = T.utils.data.DataLoader(train_ds,
    batch_size=bat_size, shuffle=True)

  # 2. create neural network
  print("\nCreating 5-(10-10)-1 binary NN classifier \n")
  net = Net().to(device)
  net.train()  # set training mode

  # 3. train network
  lrn_rate = 0.05
  loss_func = T.nn.BCELoss()  # binary cross entropy
  optimizer = T.optim.SGD(net.parameters(), lr=lrn_rate)
  max_epochs = 600
  ep_log_interval = 100

  print("Loss function: " + str(loss_func))
  print("Optimizer: " + str(optimizer.__class__.__name__))
  print("Learn rate: " + "%0.3f" % lrn_rate)
  print("Batch size: " + str(bat_size))
  print("Max epochs: " + str(max_epochs))

  print("\nStarting training")
  for epoch in range(0, max_epochs):
    epoch_loss = 0.0                 # for one full epoch
    for (batch_idx, batch) in enumerate(train_ldr):
      # BUG FIX: the posted code assigned the whole batch to both
      # X and Y instead of unpacking (inputs, targets).
      (X, Y) = batch                 # [bs,5] inputs, [bs,1] targets
      optimizer.zero_grad()          # BUG FIX: reset gradients each batch
      oupt = net(X)                  # [bs,1] computeds

      loss_val = loss_func(oupt, Y)  # a tensor
      epoch_loss += loss_val.item()  # accumulate
      loss_val.backward()            # BUG FIX: compute gradients
      optimizer.step()               # update all weights

    if epoch % ep_log_interval == 0:
      print("epoch = %4d   loss = %8.4f" % \
        (epoch, epoch_loss))
  print("Done ")

  # 4. evaluate model
  net.eval()
  metrics_train = metrics(net, train_ds, thresh=0.5)
  print("\nMetrics for train data: ")
  # BUG FIX: metrics() returns a 4-tuple; "%0.4f" % tuple raises
  # TypeError -- index into the tuple.
  print("accuracy  = %0.4f " % metrics_train[0])
  # print("precision = %0.4f " % metrics_train[1])
  # print("recall    = %0.4f " % metrics_train[2])
  print("F1        = %0.4f " % metrics_train[3])

  # 5. save model
  print("\nSaving trained model state_dict ")
  path = ".\\Models\\titanic_model.pt"
  T.save(net.state_dict(), path)

  # 6. make a prediction
  print("\nSetting p-class = 2, sex = M, age = 45 ")
  inpt = np.array([[0, 1, 0, -1, 0.4500]],
    dtype=np.float32)
  inpt = T.tensor(inpt, dtype=T.float32).to(device)

  net.eval()
  oupt = net(inpt)         # a Tensor
  pred_prob = oupt.item()  # scalar, [0.0, 1.0]
  print("Computed output: ", end="")
  print("%0.4f" % pred_prob)

  if pred_prob < 0.5:
    print("Prediction = died")
  else:
    print("Prediction = survived")

  print("\nEnd Titanic survival demo ")

if __name__ == "__main__":
  main()
```
This entry was posted in PyTorch. Bookmark the permalink.