A well-known example of a binary classification problem is the Titanic survival dataset. The raw data has 1309 rows and 14 columns: pclass, survived, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked, boat, body, dest. To predict if someone survived, the three variables that make the most sense to use are: pclass (passenger class – 1st = 100, 2nd = 010, 3rd = 001), sex (-1 = male, +1 = female), and age (divided by 100). So, I created a training file using those three predictor variables and the survived dependent variable.
After removing 263 lines of data that had missing age values, the training file had 1046 rows. The resulting data looks like:
# survived, pclass, sex, age 1,1,0,0,+1,0.2900 1,1,0,0,-1,0.0092 0,1,0,0,+1,0.0200 0,1,0,0,-1,0.3000 . . .
See jamesmccaffrey.wordpress.com/2023/01/17/preparing-the-titanic-dataset-for-pytorch/.
I created a PyTorch binary classifier for the Titanic dataset. I used an architecture of 5-(10-10)-1 with xavier_uniform() initialization, tanh hidden activation, and logistic sigmoid output activation. For training, I used a batch size of 10, Binary Cross Entropy loss, stochastic gradient descent optimization, and 600 epochs.
The model achieved 80.50% accuracy (842 out of 1046 correct).
I did a Google search for other results on the Titanic dataset. Almost all of them got about 80% accuracy, regardless of technique. I did see some outrageously exaggerated claims that were clearly bogus. Ultimately, the Titanic dataset is a bit too simple for the prediction technique to make much difference.
Good fun.
In the 1930s, flying boats were the titans of the air. I always thought they had a certain kind of beauty. Left: The German Dornier Do X had a crew of 12 and could carry about 80 passengers. Right: The Boeing 314 Clipper had a crew of 11 and could carry about 68 passengers, or 36 passengers with sleeping compartments.
Demo code. If the placeholders “lt”, “gt”, “lte”, or “gte” appear in the listing, replace them with the corresponding comparison operator symbols (<, >, <=, >=).
# titanic_survival.py
# binary classification
# PyTorch 1.12.1-CPU Anaconda3-2020.02  Python 3.7.6
# Windows 10/11

import numpy as np
import torch as T

device = T.device('cpu')  # apply to Tensor or Module


class PassengerDataset(T.utils.data.Dataset):
    """In-memory Titanic passenger data loaded from a comma-delimited file.

    Each data row holds six values; lines starting with "#" are comments:

        surv  p-class  sex  age
        0     1 0 0    -1   0.3500
        1     0 0 1    +1   0.2500

    surv:    0 = no, 1 = yes (the target)
    p-class: 1st, 2nd, 3rd as three one-hot columns
    sex:     -1 = male, +1 = female
    age:     divided by 100
    """

    def __init__(self, src_file):
        """Read src_file and split it into predictor and target tensors."""
        # ndmin=2 keeps the array 2-D even when the file has a single data
        # row; without it the column slicing below fails on a 1-D array.
        all_data = np.loadtxt(src_file, usecols=range(0, 6),
                              delimiter=",", comments="#",
                              dtype=np.float32, ndmin=2)

        self.x_data = T.tensor(all_data[:, 1:6],
                               dtype=T.float32).to(device)
        # BCELoss needs float32 targets with the same 2-D shape as the
        # network output, hence the reshape to [n, 1].
        self.y_data = T.tensor(all_data[:, 0],
                               dtype=T.float32).to(device)
        self.y_data = self.y_data.reshape(-1, 1)

    def __len__(self):
        """Return the number of passengers (rows)."""
        return len(self.x_data)

    def __getitem__(self, idx):
        """Return (predictors, surv) for row idx as a tuple of tensors."""
        predictors = self.x_data[idx, :]  # idx row, all 5 cols
        surv = self.y_data[idx, :]        # idx row, the only col
        return predictors, surv

# ---------------------------------------------------------


class Net(T.nn.Module):
    """5-(10-10)-1 binary classifier: tanh hidden layers, sigmoid output."""

    def __init__(self):
        super().__init__()
        self.hid1 = T.nn.Linear(5, 10)  # 5-(10-10)-1
        self.hid2 = T.nn.Linear(10, 10)
        self.oupt = T.nn.Linear(10, 1)

        # Xavier-uniform weights and zero biases on every layer.
        # (Alternative tried in the original demo: uniform_ weights in
        # [-0.01, +0.01] with zero biases.)
        for layer in (self.hid1, self.hid2, self.oupt):
            T.nn.init.xavier_uniform_(layer.weight)
            T.nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Map predictors x to a value in (0, 1) via a sigmoid output."""
        z = T.tanh(self.hid1(x))
        z = T.tanh(self.hid2(z))
        z = T.sigmoid(self.oupt(z))  # for BCELoss()
        return z

# ---------------------------------------------------------


def metrics(model, ds, thresh=0.5):
    """Compute (accuracy, precision, recall, F1) of model over ds.

    With N = total number of items = TP + FP + TN + FN:
      accuracy  = (TP + TN) / N
      precision = TP / (TP + FP)
      recall    = TP / (TP + FN)
      F1        = 2 / [(1 / precision) + (1 / recall)]

    model:  callable mapping one predictor tensor to a one-element tensor
    ds:     indexable, sized collection of (predictors, target) pairs where
            target is a one-element tensor holding 0.0 or 1.0
    thresh: an output >= thresh counts as a predicted positive

    Any metric whose denominator is zero (an empty ds, no predicted
    positives, no actual positives, or zero precision/recall for F1) is
    reported as 0.0 instead of raising ZeroDivisionError.
    """
    tp = tn = fp = fn = 0
    # Explicit index loop: only __len__ and __getitem__ are required of ds.
    for i in range(len(ds)):
        inpts, target = ds[i]
        target = int(target.item())  # float32 [0.0] or [1.0] -> 0 or 1
        with T.no_grad():
            p = model(inpts).item()  # between 0.0 and 1.0

        # most common def FN, FP
        if target == 1 and p >= thresh:    # TP
            tp += 1
        elif target == 1 and p < thresh:   # FN
            fn += 1
        elif target == 0 and p < thresh:   # TN
            tn += 1
        elif target == 0 and p >= thresh:  # FP
            fp += 1

    N = tp + fp + tn + fn
    if N != len(ds):
        # Only reachable when some target is neither 0 nor 1.
        print("FATAL LOGIC ERROR in metrics()")

    accuracy = (tp + tn) / (N * 1.0) if N > 0 else 0.0
    precision = (1.0 * tp) / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = (1.0 * tp) / (tp + fn) if (tp + fn) > 0 else 0.0
    if precision > 0.0 and recall > 0.0:
        f1 = 2.0 / ((1.0 / precision) + (1.0 / recall))
    else:
        f1 = 0.0
    return (accuracy, precision, recall, f1)

# ---------------------------------------------------------


def main():
    """Train the classifier on the Titanic file, evaluate, save, predict."""
    # 0. get started
    print("\nTitanic survival using PyTorch ")
    T.manual_seed(1)
    np.random.seed(1)

    # 1. create Dataset and DataLoader objects
    print("\nCreating Passenger train Datasets ")

    train_file = ".\\Data\\titanic_train.txt"  # 1046 items
    train_ds = PassengerDataset(train_file)

    bat_size = 10
    train_ldr = T.utils.data.DataLoader(train_ds,
                                        batch_size=bat_size, shuffle=True)

    # 2. create neural network
    print("\nCreating 5-(10-10)-1 binary NN classifier \n")
    net = Net().to(device)
    net.train()  # set training mode

    # 3. train network
    lrn_rate = 0.05
    loss_func = T.nn.BCELoss()  # binary cross entropy
    # (alternative loss: T.nn.MSELoss())
    optimizer = T.optim.SGD(net.parameters(), lr=lrn_rate)
    # (alternative optimizer: T.optim.Adam(net.parameters(), lr=lrn_rate))
    max_epochs = 600
    ep_log_interval = 100

    print("Loss function: " + str(loss_func))
    print("Optimizer: " + str(optimizer.__class__.__name__))
    print("Learn rate: " + "%0.3f" % lrn_rate)
    print("Batch size: " + str(bat_size))
    print("Max epochs: " + str(max_epochs))

    print("\nStarting training")
    for epoch in range(0, max_epochs):
        epoch_loss = 0.0  # sum of batch losses for one full epoch
        for (batch_idx, batch) in enumerate(train_ldr):
            X = batch[0]     # [bs,5] inputs
            Y = batch[1]     # [bs,1] targets
            oupt = net(X)    # [bs,1] computeds

            loss_val = loss_func(oupt, Y)    # a tensor
            epoch_loss += loss_val.item()    # accumulate
            optimizer.zero_grad()            # reset all gradients
            loss_val.backward()              # compute new gradients
            optimizer.step()                 # update all weights

        if epoch % ep_log_interval == 0:
            print("epoch = %4d loss = %8.4f" % (epoch, epoch_loss))
    print("Done ")

    # ---------------------------------------------------------

    # 4. evaluate model
    net.eval()
    metrics_train = metrics(net, train_ds, thresh=0.5)
    print("\nMetrics for train data: ")
    print("accuracy = %0.4f " % metrics_train[0])
    # precision is metrics_train[1], recall is metrics_train[2]
    print("F1 = %0.4f " % metrics_train[3])

    # 5. save model
    print("\nSaving trained model state_dict ")
    net.eval()
    path = ".\\Models\\titanic_model.pt"
    T.save(net.state_dict(), path)

    # 6. make a prediction
    print("\nSetting p-class = 2, sex = M, age = 45 ")
    inpt = np.array([[0, 1, 0, -1, 0.4500]], dtype=np.float32)
    inpt = T.tensor(inpt, dtype=T.float32).to(device)

    net.eval()
    with T.no_grad():
        oupt = net(inpt)      # a Tensor
    pred_prob = oupt.item()   # scalar, [0.0, 1.0]
    print("Computed output: ", end="")
    print("%0.4f" % pred_prob)

    if pred_prob < 0.5:
        print("Prediction = died")
    else:
        print("Prediction = survived")

    print("\nEnd Titanic survival demo ")


if __name__ == "__main__":
    main()
You must be logged in to post a comment.