When working with PyTorch (or Keras) neural networks, a surprisingly tricky task is dealing with training data that has IDs. Data IDs are useful when analyzing a model to diagnose items that are incorrectly predicted.
You need to store the ID information, but you don’t want to feed the ID information to the neural network.
There are dozens of design choices, but my preferred technique is to design a Dataset object that returns items with three fields: predictor values, target values, and ID values. The technique is best understood by examining a concrete example.
Suppose the training data looks like:
train_0001, 5.1, 3.5, 1.4, 0.2, 0
train_0002, 4.9, 3.0, 1.4, 0.2, 0
. . .
train_0120, 6.9, 3.1, 5.4, 2.1, 2
This is the Iris dataset. The first column is a data ID that I added, the next four columns are predictor values (sepal length and width, petal length and width), and the last column is the species class label (0 = setosa, 1 = versicolor, 2 = virginica).
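The ID column is easy to generate programmatically. Here is a minimal sketch that prepends sequential IDs to an existing comma-delimited file; the file names and the "train_" prefix are illustrative assumptions, not part of the demo:

# prepend a sequential ID column to each line of a CSV file
# file names and "train_" prefix are assumed for illustration
with open("iris_train.txt", "r") as fin, \
     open("iris_train_with_ids.txt", "w") as fout:
  for (i, line) in enumerate(fin):
    fout.write("train_%04d, %s" % (i+1, line))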
A PyTorch Dataset definition for the data is:
class IrisDataset(T.utils.data.Dataset):
  def __init__(self, src_file, num_rows=None):
    # 'train_0001', 5.0, 3.5, 1.3, 0.3, 0
    tmp_all = np.loadtxt(src_file, max_rows=num_rows,
      usecols=range(0,6), delimiter=",", skiprows=0,
      comments="#", dtype=str)  # IDs are str

    tmp_x = tmp_all[:,1:5].astype(np.float32)  # cols 1,2,3,4
    tmp_y = tmp_all[:,5].astype(np.int64)      # col 5
    self.x_data = T.tensor(tmp_x, dtype=T.float32)
    self.y_data = T.tensor(tmp_y, dtype=T.int64)
    self.id_data = tmp_all[:,0].astype(str)  # IDs stay NumPy

  def __len__(self):
    return len(self.x_data)

  def __getitem__(self, idx):
    preds = self.x_data[idx]
    spcs = self.y_data[idx]
    id = self.id_data[idx]  # str
    sample = { 'predictors' : preds,
               'species' : spcs,
               'id' : id }
    return sample
The data is read into memory as NumPy arrays, and the predictor and label values are converted to PyTorch tensors. A data item is returned as a Dictionary object with keys 'predictors', 'species', and 'id'. You could return a data item as a tuple, but using string keys like batch['predictors'] and batch['id'] is less error-prone than index values like batch[0] and batch[2].
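For comparison, here is a minimal sketch of the tuple-based alternative (not the approach used in this demo); notice how the calling code must remember which position holds which field:

  # tuple alternative: caller must remember the positions
  # batch[0] = predictors, batch[1] = species, batch[2] = id
  def __getitem__(self, idx):
    return (self.x_data[idx], self.y_data[idx],
            self.id_data[idx])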
Accessing the data looks like this:
for (b_ix, batch) in enumerate(dataldr):
  X = batch['predictors']
  Y = batch['species']
  id = batch['id']
  with T.no_grad():
    oupt = model(X)  # logits form

  print("ID = ", end=""); print(id)
  print("X = ", end=""); print(X)
  print("Y = ", end=""); print(Y)
  print("oupt = ", end=""); print(oupt)
  . . .
The ID information is attached to each data item but isn’t fed to the network.
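Because the IDs travel with each batch, finding the items a trained model predicts incorrectly is straightforward. One point to be aware of: PyTorch's default collate function gathers string IDs into a Python list rather than a tensor, so with a batch size of 1 the ID is batch['id'][0]. A minimal sketch, assuming the trained net and the test_ldr DataLoader from the demo code below:

# collect the IDs of misclassified test items (batch_size=1)
wrong_ids = []
net.eval()
for batch in test_ldr:
  X = batch['predictors']
  Y = batch['species']
  with T.no_grad():
    oupt = net(X)  # logits
  if T.argmax(oupt) != Y:
    wrong_ids.append(batch['id'][0])  # ID is in a list of str
print("Misclassified items: " + str(wrong_ids))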
Learning how to use neural networks is a long journey that has many small conceptual sub-voyages.
The Matson Navigation Company was founded in 1882. The SS Mariposa and SS Monterey passenger ships were launched in 1931 and were famous as the most elegant way to travel to Hawaii and the South Pacific in the days before jet air travel. Reading about Matson ships and seeing them in old movies inspired me to want to work on a cruise ship, which I eventually did after I graduated from college (on the Royal Viking Line as an assistant cruise director).
Demo code:
# iris_ids.py
# iris example dealing with data IDs
# PyTorch 1.9.0-CPU Anaconda3-2020.02 Python 3.7.6
# Windows 10

import numpy as np
import torch as T
device = T.device("cpu")  # apply to Tensor or Module

# -----------------------------------------------------------

class IrisDataset(T.utils.data.Dataset):
  def __init__(self, src_file, num_rows=None):
    # 'train_0001', 5.0, 3.5, 1.3, 0.3, 0
    tmp_all = np.loadtxt(src_file, max_rows=num_rows,
      usecols=range(0,6), delimiter=",", skiprows=0,
      comments="#", dtype=str)  # IDs are str

    tmp_x = tmp_all[:,1:5].astype(np.float32)  # cols 1,2,3,4
    tmp_y = tmp_all[:,5].astype(np.int64)      # col 5
    self.x_data = T.tensor(tmp_x, dtype=T.float32)
    self.y_data = T.tensor(tmp_y, dtype=T.int64)
    self.id_data = tmp_all[:,0].astype(str)  # IDs stay NumPy

  def __len__(self):
    return len(self.x_data)

  def __getitem__(self, idx):
    preds = self.x_data[idx]
    spcs = self.y_data[idx]
    id = self.id_data[idx]  # str
    sample = { 'predictors' : preds,
               'species' : spcs,
               'id' : id }
    return sample

# -----------------------------------------------------------

class Net(T.nn.Module):
  def __init__(self):
    super(Net, self).__init__()
    self.hid1 = T.nn.Linear(4, 7)  # 4-7-3
    self.oupt = T.nn.Linear(7, 3)

    T.nn.init.xavier_uniform_(self.hid1.weight)
    T.nn.init.zeros_(self.hid1.bias)
    T.nn.init.xavier_uniform_(self.oupt.weight)
    T.nn.init.zeros_(self.oupt.bias)

  def forward(self, x):
    z = T.tanh(self.hid1(x))
    z = self.oupt(z)  # no softmax: CrossEntropyLoss()
    return z

# -----------------------------------------------------------

def accuracy(model, dataset):
  # assumes model.eval() mode
  dataldr = T.utils.data.DataLoader(dataset, batch_size=1,
    shuffle=False)
  n_correct = 0; n_wrong = 0
  for (_, batch) in enumerate(dataldr):
    X = batch['predictors']
    Y = batch['species']  # already flattened by Dataset
    id = batch['id']
    with T.no_grad():
      oupt = model(X)  # logits form

    print("ID = ", end=""); print(id)
    print("X = ", end=""); print(X)
    print("Y = ", end=""); print(Y)
    print("oupt = ", end=""); print(oupt)
    input()  # pause after each item

    big_idx = T.argmax(oupt)
    # if big_idx.item() == Y.item():
    if big_idx == Y:
      n_correct += 1
    else:
      n_wrong += 1

  acc = (n_correct * 1.0) / (n_correct + n_wrong)
  return acc

# -----------------------------------------------------------

def main():
  # 0. get started
  print("\nBegin PyTorch Iris dataset with IDs demo \n")
  T.manual_seed(1)
  np.random.seed(1)

  # 1. create DataLoader objects
  print("Creating Iris train and test DataLoader ")

  train_file = ".\\Data\\iris_train_with_ids.txt"
  test_file = ".\\Data\\iris_test_with_ids.txt"

  train_ds = IrisDataset(train_file, num_rows=120)
  test_ds = IrisDataset(test_file)  # 30 rows

  bat_size = 4
  train_ldr = T.utils.data.DataLoader(train_ds,
    batch_size=bat_size, shuffle=True)
  test_ldr = T.utils.data.DataLoader(test_ds,
    batch_size=1, shuffle=False)

  # 2. create network
  net = Net().to(device)

  # 3. train model
  max_epochs = 12
  ep_log_interval = 2
  lrn_rate = 0.05

  loss_func = T.nn.CrossEntropyLoss()  # applies softmax()
  opt = T.optim.SGD(net.parameters(), lr=lrn_rate)

  print("\nbat_size = %3d " % bat_size)
  print("loss = " + str(loss_func))
  print("optimizer = SGD")
  print("max_epochs = %3d " % max_epochs)
  print("lrn_rate = %0.3f " % lrn_rate)

  print("\nStarting training")
  net.train()  # set the mode
  for epoch in range(0, max_epochs):
    epoch_loss = 0  # for one full epoch
    for (batch_idx, batch) in enumerate(train_ldr):
      X = batch['predictors']  # [bat_size,4]
      Y = batch['species']     # OK; already flattened
      # do not use IDs during training
      opt.zero_grad()
      oupt = net(X)
      loss_obj = loss_func(oupt, Y)  # a tensor
      epoch_loss += loss_obj.item()  # accumulate
      loss_obj.backward()
      opt.step()
    if epoch % ep_log_interval == 0:
      print("epoch = %4d  loss = %0.4f" % (epoch, epoch_loss))
  print("Done ")

  # 4. evaluate model accuracy
  print("\nComputing accuracy item-by-item \n")
  net.eval()
  acc = accuracy(net, train_ds)  # item-by-item
  print("Accuracy on train data = %0.4f" % acc)

  # 5. make a prediction
  print("\nPredicting species for [6.1, 3.1, 5.1, 1.1]: ")
  unk = np.array([[6.1, 3.1, 5.1, 1.1]], dtype=np.float32)
  unk = T.tensor(unk, dtype=T.float32).to(device)
  with T.no_grad():
    logits = net(unk).to(device)  # do not sum to 1.0
  probs = T.softmax(logits, dim=1)  # to device
  T.set_printoptions(precision=4)
  print(probs)

  # 6. save model (state_dict approach)
  print("\nSaving trained model state")
  fn = ".\\Models\\iris_model.pt"
  T.save(net.state_dict(), fn)
  # saved_model = Net()
  # saved_model.load_state_dict(T.load(fn))
  # use saved_model to make prediction(s)

  print("\nEnd Iris with IDs demo")

if __name__ == "__main__":
  main()