Using PyTorch to Generate Synthetic Data for a Regression Problem

There are a lot of good datasets for experimenting with machine learning classification. But there are very few datasets for ML regression experiemnts. Creating a synthetic dataset for regression is relatively easy and effective.

In high-level pseudo-code:

create PyTorch network with random weights
loop n_items times
  generate random input X
  compute y = net(X)
  add a bit of random noise to y
  write X values and y to file
end-loop

Here’s a program that generates 200 training items and 40 test items using a PyTorch neural network. Each input item has 6 predictor values, where each value is between -1.0 and +1.0. The target output value is between 0.0 and 1.0.

The devil is in the details. The resulting data looks like:

-0.1660  0.4406  -0.9998  -0.3953  -0.7065  -0.8153  0.7022
-0.2065  0.0776  -0.1616   0.3704  -0.5911   0.7562  0.5666
-0.9452  0.3409  -0.1654   0.1174  -0.7192  -0.6038  0.8186
 0.7528  0.7892  -0.8299  -0.9219  -0.6603   0.7563  0.3687
. . .
# make_synthetic.py
# create synthetic train and test datasets for regression
# PyTorch 2.0.1-CPU  Anaconda3-2022.10  Python 3.9.13
# Windows 10/11 

import numpy as np
import torch as T  # non-standard alias
device = T.device('cpu')  # apply to Tensor or Module

# -----------------------------------------------------------

class Net(T.nn.Module):
  def __init__(self, n_in):
    super(Net, self).__init__()
    self.hid1 = T.nn.Linear(n_in, 10)  # n-(10)-1
    self.oupt = T.nn.Linear(10, 1)
  
    lim = 0.80
    T.nn.init.uniform_(self.hid1.weight, -lim, lim) 
    T.nn.init.uniform_(self.hid1.bias, -lim, lim)
    T.nn.init.uniform_(self.oupt.weight, -lim, lim) 
    T.nn.init.uniform_(self.oupt.bias, -lim, lim)

  def forward(self, x):
    z = T.tanh(self.hid1(x))
    z = T.sigmoid(self.oupt(z))  # oupt in [0.0, 1.0]
    return z

# -----------------------------------------------------------

def create_data_file(net, n_in, fn, n_items):
  f = open(fn, "w")
  x_lo = -1.0; x_hi = 1.0
  for i in range(n_items):
    s = ""
    X = (x_hi - x_lo) * np.random.random(size=(1,n_in)) + x_lo
    for j in range(n_in):
      s += ("%0.4f" % X[0][j]) + "\t"
    X = T.tensor(X, dtype=T.float32)

    with T.no_grad():
      y = net(X).item()
    y += np.random.normal(loc=0.0, scale=0.01)  # could be neg
    if y "lt" 0.0: y += 0.01 * np.random.random() * 0.01  # pos
    s += ("%0.4f" % y) + "\n"

    f.write(s)
  f.close()

# -----------------------------------------------------------

def main():
  # 0. get started
  print("\nCreating synthetic datasets for regression ")
  np.random.seed(1)
  T.manual_seed(1) 

  # 1. create neural generator model
  n_in = 6
  print("\nCreating 6-(10)-1 regression model ")
  net = Net(n_in).to(device)

  # 2. use model to create data
  print("\nCreating data files ")
  create_data_file(net, n_in, ".\\synthetic_train.txt", 200)
  create_data_file(net, n_in, ".\\synthetic_test.txt", 40)

  print("\nEnd create synthetic data ")

# -----------------------------------------------------------

if __name__=="__main__":
  main()

The synthetic data generation program can be easily modified in several ways. To test the data, I wote a PyTorch regression network. The network created a good model quickly and scored 97.50% accuracy on the 40-item test data, where a correct prediction is one that’s within 10% of the true target value.

Good fun.



“Synthetic Men of Mars” (1939) by Edgar Rice Burroughs is the ninth book in the Mars series. In this novel, John Carter and Vor Daj (a member of Carter’s personal guard) seek scientist Ras Thavas who is the only one who can save queen Dejah Thoris. But Thavas has been experimenting with clones and those experiments have not gone well.

The Mars series had a huge influence on nearly every modern science fiction author. Here are three different covers. Left: By artist Robert Abbett. Center: By artist Gino D’Achille. Right: Unattributed but I believe this might be the work of artist Richard Clifton-Dey.


Demo program.

# synthetic_dnn.py
# synthetic dataset regression
# PyTorch 2.0.1-CPU  Anaconda3-2022.10  Python 3.9.13
# Windows 10/11 

import numpy as np
import torch as T  # non-standard alias

device = T.device('cpu')  # apply to Tensor or Module

# -----------------------------------------------------------

class SynthDataset(T.utils.data.Dataset):
  def __init__(self, src_file):
    tmp_x = np.loadtxt(src_file, delimiter="\t",
      usecols=[0,1,2,3,4,5], dtype=np.float32)
    tmp_y = np.loadtxt(src_file, usecols=6, delimiter="\t",
      dtype=np.float32)
    tmp_y = tmp_y.reshape(-1,1)  # 2D required

    self.x_data = T.tensor(tmp_x, dtype=T.float32).to(device)
    self.y_data = T.tensor(tmp_y, dtype=T.float32).to(device)

  def __len__(self):
    return len(self.x_data)

  def __getitem__(self, idx):
    preds = self.x_data[idx]
    trgts = self.y_data[idx] 
    return (preds, trgts)  # as a tuple

# -----------------------------------------------------------

def accuracy(model, ds, pct_close):
  # assumes model.eval()
  # correct within pct of true income
  n_correct = 0; n_wrong = 0

  for i in range(len(ds)):
    X = ds[i][0]   # 2-d
    Y = ds[i][1]   # 2-d
    with T.no_grad():
      oupt = model(X)    # computed median price

    if T.abs(oupt - Y) "lt" T.abs(pct_close * Y):  # less-than
      n_correct += 1
    else:
      n_wrong += 1
  acc = (n_correct * 1.0) / (n_correct + n_wrong)
  return acc

# -----------------------------------------------------------

class Net(T.nn.Module):
  def __init__(self):
    super(Net, self).__init__()
    self.hid1 = T.nn.Linear(6, 10)  # 6-(10-10)-1
    self.hid2 = T.nn.Linear(10, 10)
    self.oupt = T.nn.Linear(10, 1)

    T.nn.init.xavier_uniform_(self.hid1.weight)  # glorot
    T.nn.init.zeros_(self.hid1.bias)
    T.nn.init.xavier_uniform_(self.hid2.weight)
    T.nn.init.zeros_(self.hid2.bias)
    T.nn.init.xavier_uniform_(self.oupt.weight)
    T.nn.init.zeros_(self.oupt.bias)

  def forward(self, x):
    z = T.tanh(self.hid1(x))  # or T.nn.Tanh()
    z = T.tanh(self.hid2(z))
    z = self.oupt(z)  # no activation, aka Identity()
    return z

# -----------------------------------------------------------

def train(model, ds, bs, lr, me, le):
  # dataset, bat_size, lrn_rate, max_epochs, log interval
  train_ldr = T.utils.data.DataLoader(ds, batch_size=bs,
    shuffle=True)
  loss_func = T.nn.MSELoss()
  optimizer = T.optim.Adam(model.parameters(), lr=lr)
  # optimizer = T.optim.SGD(model.parameters(), lr=lr)

  for epoch in range(0, me):
    epoch_loss = 0.0  # for one full epoch
    for (b_idx, batch) in enumerate(train_ldr):
      X = batch[0]  # predictors
      y = batch[1]  # target house price
      optimizer.zero_grad()
      oupt = model(X)
      loss_val = loss_func(oupt, y)  # a tensor
      epoch_loss += loss_val.item()  # accumulate
      loss_val.backward()  # compute gradients
      optimizer.step()     # update weights

    if epoch % le == 0:
      print("epoch = %4d  |  loss = %0.4f" % \
        (epoch, epoch_loss)) 

# -----------------------------------------------------------

def main():
  # 0. get started
  print("\nSynthetic data regression using PyTorch 2.0 ")
  np.random.seed(1)
  T.manual_seed(1) 

  # 1. load data
  print("\nLoading synthetic data into memory ")
  train_file = ".\\Data\\synthetic_train.txt"
  train_ds = SynthDataset(train_file)  # 200 rows
  test_file = ".\\Data\\synthetic_test.txt"
  test_ds = SynthDataset(test_file)    # 40 rows

  # 2. create model
  print("\nCreating 6-(10-10)-1 regression model ")
  net = Net().to(device)

  # 3. train model
  print("\nbat_size = 10 ")
  print("loss = MSELoss() ")
  print("optimizer = Adam ")
  print("lrn_rate = 0.01 ")

  print("\nStarting training")
  net.train()
  train(net, train_ds, bs=10, lr=0.01, me=100, le=10)
  print("Done ")

# -----------------------------------------------------------

  # 4. evaluate model accuracy
  net.eval()
  print("\nComputing model accuracy (within 0.10 of true) ")
  acc_train = accuracy(net, train_ds, 0.10)  # item-by-item
  print("Accuracy on train data = %0.4f" % acc_train)

  acc_test = accuracy(net, test_ds, 0.10) 
  print("Accuracy on test data = %0.4f" % acc_test)

# -----------------------------------------------------------

  # 5. make a prediction
  print("\nPredicting target for dummy inputs: ")
  x = np.array([[0.5, 0.5, 0.5, 0.5, 0.5, 0.5]], dtype=np.float32)
  x = T.tensor(x, dtype=T.float32).to(device) 

  with T.no_grad():
    y = net(x)
  pred_raw = y.item()  # scalar
  print("%0.4f" % pred_raw)  

# -----------------------------------------------------------

  # 6. TODO: save model (state_dict approach)

if __name__=="__main__":
  main()
Posted in PyTorch | Leave a comment

Refactoring My PyTorch Hyperparameter Search Using Evolutionary Optimization Demo

One of the advantages that experienced developers have compared to early-career developers is subjective intuition. Several days ago, I implemented a program that searches for PyTorch neural network hyperparameter values (number of hidden nodes, batch size, and so on) using an evolutionary optimization algorithm. The demo worked but my intuition told me the demo needed refactoring.

The refactoring effort took me about 12 hours but I was satisfied with the resulting demo. I tidied up a lot of details but the primary change was using a completely object oriented design. This makes the main() function look very simple because all the details are hidden:

  # 1. create train_ds and test_ds datasets

  # 2. find best model, save to Models directory
  print("Creating EO Searcher object ")
  scr = Searcher(train_ds, test_ds, pop_sz=10, dim=6,
    max_gen=100, p_mutate=0.5, seed=0)
  print("Searching for best hyperparams ")
  scr.search()

  # 3. display results

An encoded solution is an array of 6 integers, each a value from 0 to 9. For example, the best solution found in my demo is [4 5 6 2 7 5]. Each integer represents a hyperparameter value, in this case:

4 : num hidden nodes = 12
5 : hidden activation = relu
6 : batch size = 12
2 : learn rate = 0.0080
7 : max epochs = 800
5 : optimizer = adam

I used one of my standard synthetic datasets where the goal is to predict a person’s political leaning (conservative = 0, moderate = 1, liberal = 2) from sex, age, State of residence, and annual income.

The resulting model had 90.50% accuracy on the 200-item training data, and 80.00% accuracy on the 40-item test dataset.

My motivation for hyperparameter search using evolutionary optimization is to apply it to complex neural systems that use a Transformer component.



A home remodeling project is analogous to code refactoring. Here are two examples of shower remodels that I’d rate as not entirely successful.


Demo code. Replace “lt” (less-than) etc. with Boolean operator symbols. The data is at https://jamesmccaffrey.wordpress.com/2022/09/01/multi-class-classification-using-pytorch-1-12-1-on-windows-10-11/.

# people_evo_hyperparameter_2.py

# PyTorch 2.0.0-CPU Anaconda3-2022.10  Python 3.9.13
# Windows 10/11 

import numpy as np
import torch as T
import pickle
from datetime import datetime

device = T.device('cpu')  # apply to Tensor or Module

# -----------------------------------------------------------

class PeopleDataset(T.utils.data.Dataset):
  # sex  age    state    income   politics
  # -1   0.27   0  1  0   0.7610   2
  # +1   0.19   0  0  1   0.6550   0
  # sex: -1 = male, +1 = female
  # state: michigan, nebraska, oklahoma
  # politics: conservative, moderate, liberal

  def __init__(self, src_file):
    all_xy = np.loadtxt(src_file, usecols=range(0,7),
      delimiter="\t", comments="#", dtype=np.float32)
    tmp_x = all_xy[:,0:6]   # cols [0,6) = [0,5]
    tmp_y = all_xy[:,6]     # 1-D

    self.x_data = T.tensor(tmp_x, 
      dtype=T.float32).to(device)
    self.y_data = T.tensor(tmp_y,
      dtype=T.int64).to(device)  # 1-D

  def __len__(self):
    return len(self.x_data)

  def __getitem__(self, idx):
    preds = self.x_data[idx]
    trgts = self.y_data[idx] 
    return preds, trgts  # as a Tuple

# -----------------------------------------------------------

class Net(T.nn.Module):
  def __init__(self, n_hid, activ='tanh'):
    super(Net, self).__init__()
    self.hid1 = T.nn.Linear(6, n_hid)  # 6-(nh-nh)-3
    self.hid2 = T.nn.Linear(n_hid, n_hid)
    self.oupt = T.nn.Linear(n_hid, 3)

    if activ == 'tanh':
      self.activ = T.nn.Tanh()
    elif activ == 'relu':
      self.activ = T.nn.ReLU()

    # use default weight init

  def forward(self, x):
    z = self.activ(self.hid1(x))
    z = self.activ(self.hid2(z)) 
    z = T.log_softmax(self.oupt(z), dim=1)  # NLLLoss() 
    return z

# -----------------------------------------------------------

def train(net, ds, bs, lr, me, opt, verbose=False):
  # dataset, bat_size, lrn_rate, max_epochs, optimizer
  v = verbose
  train_ldr = T.utils.data.DataLoader(ds, batch_size=bs,
    shuffle=True)
  loss_func = T.nn.NLLLoss()  # log_softmax() activation
  if opt == 'sgd':
    optimizer = T.optim.SGD(net.parameters(), lr=lr)
  elif opt == 'adam':
    optimizer = T.optim.Adam(net.parameters(), lr=lr)  

  if v: print("\nStarting training ")
  le = me // 4  # log interval: 4 log prints
  for epoch in range(0, me):
    epoch_loss = 0.0  # for one full epoch
    for (batch_idx, batch) in enumerate(train_ldr):
      X = batch[0]  # inputs
      Y = batch[1]  # correct class/label/politics

      optimizer.zero_grad()
      oupt = net(X)
      loss_val = loss_func(oupt, Y)  # a tensor
      epoch_loss += loss_val.item()  # accumulate
      loss_val.backward()
      optimizer.step()

    if v:
      if epoch % le == 0:
        print("epoch = %5d  |  loss = %10.4f" % \
          (epoch, epoch_loss)) 
  if v: print("Done ") 

# -----------------------------------------------------------

def accuracy_q(model, dataset):
  # assumes model.eval()
  X = dataset[0:len(dataset)][0]
  Y = dataset[0:len(dataset)][1]
  with T.no_grad():
    oupt = model(X)  #  [40,3]  logits
  arg_maxs = T.argmax(oupt, dim=1)  # argmax() is new
  num_correct = T.sum(Y==arg_maxs)
  acc = (num_correct * 1.0 / len(dataset))
  return acc.item()

# -----------------------------------------------------------

class Searcher():
  # assumes Net(), train(), accuracy_q() exist

  def __init__(self, trn_ds, tst_ds, pop_sz, dim, max_gen,
    p_mutate, seed):
    self.train_ds = trn_ds
    self.test_ds = tst_ds
    self.pop_size = pop_sz
    self.dim = dim  # 6
    self.max_gen = max_gen
    self.p_mutate = p_mutate
    self.rnd = np.random.RandomState(seed)

    self.pop = []
    self.used = {}  # avoid duplicating a solution

    self.best_soln = np.array([0,0,0,0,0,0], dtype=int)
    self.best_err = 10.0
    self.best_train_acc = 0.0
    self.best_test_acc = 0.0

  # ---------------------------------------------------------

  def make_rnd_soln(self):
    soln = self.rnd.randint(low=0, high=10, size=self.dim,
      dtype=int)
    soln_key = "".join(str(x) for x in soln)

    while soln_key in self.used:
      soln = self.rnd.randint(low=0, high=10, size=self.dim,
        dtype=int)
      soln_key = "".join(str(x) for x in soln)

    self.used[soln_key] = 1
    return soln  # not used before

  # ---------------------------------------------------------

  def make_child(self, parent_idxs):
    i = parent_idxs[0]
    j = parent_idxs[1]
    child_soln = np.zeros(self.dim, dtype=int)
    parent1 = self.pop[i][0]
    parent2 = self.pop[j][0]
    for k in range(0, self.dim // 2):  # left half
      child_soln[k] = parent1[k]
    for k in range(self.dim // 2, self.dim):  # right half
      child_soln[k] = parent2[k]
    return child_soln  # possible dup -- mutate() will handle

  # ---------------------------------------------------------

  def mutate(self, child_soln):
    for k in range(self.dim):
      q = self.rnd.random()  # [0.0, 1.0] 
      if q "lt" self.p_mutate:
        child_soln[k] = self.rnd.randint(0, 10, size=1,
          dtype=int)
    child_key = "".join(str(x) for x in child_soln)

    while child_key in self.used:
      for k in range(self.dim):  # mutate again
        q = self.rnd.random()  # [0.0, 1.0] 
        if q "lt" self.p_mutate:
          child_soln[k] = self.rnd.randint(0, 10, size=1, 
            dtype=int)
      child_key = "".join(str(x) for x in child_soln)

    self.used[child_key] = 1
    return  # in-place modification

  # ---------------------------------------------------------

  def evaluate(self, soln, verbose=False):
    # [n_hid, activ, bs, lr, me, opt]
    #   [0]    [1]   [2] [3] [4] [5]
    v = verbose

    # hard-coded. modify as needed
    n_hids = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
    activs = ['tanh', 'tanh','tanh','tanh','tanh',
      'relu', 'relu', 'relu', 'relu', 'relu']
    b_szs = [1, 2, 4, 6, 8, 10, 12, 14, 16, 20]
    rates = [0.001, 0.005, 0.008, 0.01, 0.02, 0.03, 0.05,
      0.08, 0.10, 0.12]
    max_eps = [50, 100, 200, 300, 400, 500, 600, 700,
      800, 1000]
    opts = ['sgd', 'sgd', 'sgd', 'sgd', 'sgd',
      'adam', 'adam', 'adam', 'adam', 'adam']

    n_hid = n_hids[soln[0]]
    activ = activs[soln[1]]
    bs = b_szs[soln[2]]
    lr = rates[soln[3]]
    me = max_eps[soln[4]]
    opt = opts[soln[5]]

    T.manual_seed(1)  # controls weight init, not EO
    np.random.seed(1)

    net = Net(n_hid, activ).to(device)  # create NN
    net.train()
    if v: print("\nsoln: " + str(soln))
    train(net, self.train_ds, bs, lr, me, opt, verbose) 

    net.eval()
    acc_train = accuracy_q(net, self.train_ds)
    acc_test = accuracy_q(net, self.test_ds) 
    acc_weighted = ((1 * acc_train) + (3 * acc_test)) / 4
    error = 1.0 - acc_weighted  # [0.0, 1.0]
    if v: print("train acc = %0.4f " % acc_train)
    if v: print("test_acc = %0.4f " % acc_test)
    return (acc_train, acc_test, error)

  # ---------------------------------------------------------

  def save_info(self):
    # as date_time_soln_trainAcc_testAcc.txt
    dt = datetime.now().strftime('%Y-%m-%d_%H-%M')
    ss = "".join(str(x) for x in self.best_soln)  # soln str
    trna = str("%0.4f" % self.best_train_acc) 
    tsta = str("%0.4f" % self.best_test_acc) 
    fn = ".\\Models\\" + dt + "_" + ss + "_" + \
      trna + "_" + tsta + ".txt"
    f = open(fn, "w")
    f.write("soln = " + ss + "\n")
    f.write("train acc = " + trna + "\n")
    f.write("test acc = " + tsta + "\n")
    f.close()

  # ---------------------------------------------------------

  def create_pop(self):
    for i in range(self.pop_size):
      soln = self.make_rnd_soln()  # unique soln, not yet used
      trn_acc, tst_acc, err = self.evaluate(soln, verbose=True)
      self.pop.append( (soln,err) )
      if err "lt" self.best_err:
        self.best_err = err
        self.best_soln = soln.copy()
        self.best_train_acc = trn_acc
        self.best_test_acc = tst_acc

    self.pop = sorted(self.pop, key=lambda tup:tup[1])  # by err
    self.save_info()

  # ---------------------------------------------------------

  def search(self):
    print("\nCreating size = " + \
      str(self.pop_size) + " initial population ")
    self.create_pop()

    for gen in range(self.max_gen):
      print("\ngeneration = " + str(gen))

      # 4a. pick two parents
      first = \
        self.rnd.randint(0, self.pop_size // 2)  # good one
      second = \
        self.rnd.randint(self.pop_size // 2, self.pop_size) 
      flip = self.rnd.randint(2)  # 0 or 1
      if flip == 0:
        parent_idxs = (first, second)
      else:
        parent_idxs = (second, first)

      # 4b. make a child
      child_soln = self.make_child(parent_idxs)

      # 4c. mutate child (and avoid duplicate)
      self.mutate(child_soln)

      # 4d. evaluate child soln
      (trn_acc, tst_acc, child_err) = \
        self.evaluate(child_soln, verbose=True)
      if child_err "lt" self.best_err:
        print("New best solution found in gen " + str(gen))
        self.best_soln = child_soln.copy()
        self.best_err = child_err
        self.best_train_acc = trn_acc
        self.best_test_acc = tst_acc
        self.save_info()
      else:
        pass  # could print a message here

      # 4e. replace weak pop soln with child
      idx = self.rnd.randint(self.pop_size // 2, \
        self.pop_size)
      self.pop[idx] = (child_soln, child_err)  # Tuple
      self.pop = sorted(self.pop, key=lambda tup:tup[1])
    
    print("\nEnd evolution ")

  # ---------------------------------------------------------    

# -----------------------------------------------------------

def show_soln_to_hyperparams(soln):
  # hard-coded. modify as needed
  n_hids = [4, 6, 8, 10, 12, 14, 16, 18, 20, 24]
  activs = ['tanh', 'tanh','tanh','tanh','tanh',
    'relu', 'relu', 'relu', 'relu', 'relu']
  b_szs = [1, 2, 4, 6, 8, 10, 12, 14, 16, 20]
  rates = [0.001, 0.005, 0.008, 0.01, 0.02, 0.03, 0.05,
    0.08, 0.10, 0.12]
  max_eps = [100, 200, 300, 400, 500, 600, 700, 800,
    900, 1000]
  opts = ['sgd', 'sgd', 'sgd', 'sgd', 'sgd',
    'adam', 'adam', 'adam', 'adam', 'adam']

  n_hid = n_hids[soln[0]]
  activ = activs[soln[1]]
  bs = b_szs[soln[2]]
  lr = rates[soln[3]]
  me = max_eps[soln[4]]
  opt = opts[soln[5]]

  print("num hidden nodes = " + str(n_hid))
  print("hidden activation = " + str(activ))
  print("batch size = " + str(bs))
  print("learn rate = %0.4f " % lr)
  print("max epochs = " + str(me))
  print("optimizer = " + str(opt))

# -----------------------------------------------------------

def main():
  # 0. get started
  print("\nBegin People politics EO parameter search ")
  T.manual_seed(1)  # is reset in evaluate()
  np.random.seed(1)  
  
  # 1. create Dataset objects
  print("\nCreating People train and test Datasets ")
  train_file = ".\\Data\\people_train.txt"
  train_ds = PeopleDataset(train_file)  # 200 rows

  test_file = ".\\Data\\people_test.txt"
  test_ds = PeopleDataset(test_file)    # 40 rows

  # 2. find best model, save to Models directory
  print("\nCreating EO Searcher object ")
  scr = Searcher(train_ds, test_ds, pop_sz=10, dim=6,
    max_gen=100, p_mutate=0.5, seed=0)
  print("\nSearching for best hyperparams ")
  scr.search()

  # 3. display results
  print("\nBest solution found = " + \
    str(scr.best_soln))
  print("Best train accuracy = %0.4f " % scr.best_train_acc)
  print("Best test accuracy = %0.4f " % scr.best_test_acc)
  print("\nHyperparameters are: \n ")
  show_soln_to_hyperparams(scr.best_soln)

  print("\nEnd evolutionary parameter search ")

if __name__ == "__main__":
  main()
Posted in PyTorch | Leave a comment

Why Isn’t PyTorch Training Code Placed Inside the Network Class Definition?

I’ve been using PyTorch as my neural code library of choice for the past several years. Every PyTorch example I’ve ever seen has a program structure where there’s a class that defines the network, a class that defines a Dataset, a separate function that has training code, and a seperate function that computes model accuracy:

import numpy as np
import torch as T
device = T.device('cpu')  # apply to Tensor or Module

class PeopleDataset(T.utils.data.Dataset): . . .
class Net(T.nn.Module): . . .
def train(net, ds, bs, lr, opt, me): . . .
def accuracy(model, ds): . . .

def main():
  # load Dataset
  net = Net().to(device)
  # set up training parameters
  net.train()  # set mode
  train(net, train_ds, bat_sz, lrn_rt, opt, max_epochs)  # train
  acc = accuracy(net, train_ds)

if __name__ == "main":
  main()

I always wondered why the training code isn’t placed inside a method in the class that defines the Network:

import numpy as np
import torch as T
device = T.device('cpu')  # apply to Tensor or Module

class PeopleDataset(T.utils.data.Dataset): . . .  
class Net(T.nn.Module): . . .  # has a training code method
def accuracy(model, ds): . . .

def main():
  # load Dataset
  net = Net().to(device)
  # set up training parameters
  net.train()  # set mode
  net.do_train(train_ds, bat_sz, lrn_rt, opt, max_epochs)  # train
  acc = accuracy(net, train_ds)

if __name__ == "main":
  main()

I put together a demo to explore. My conclusion: Placing training code in a method inside the Network class definition instead of a standalone training function works fine, but there’s no significant advantage for that design. Therefore, because the train-code-in-method design is never used by the thousands of examples on the Internet, it’s probably better to use the standalone training code function design.

Notice that I couldn’t name the training method train() because there’s already a net.train() inherited from the torch.nn.Module class, which is used to set the network object into training mode. I called my training code method do_train() instead.

For my demo, I used one of my standard synthetic datasets. The goal is to predict a person’s political leaning (conservative = 0, moderate = 1, liberal = 2) from sex (male = -1, female = 1), age (divided by 100), State (Michigan = 100, Nebraska = 010, Oklahoma = 001), and income (divided by $100,000).

The moral of this blog post is that deciding when and how to integrate software components is often more difficult than writing the code. Interesting stuff.



While I was a university professor in Hawaii, I taught programming classes to military personnel, including classes on the Pearl Harbor Navy base. The Aegis Combat System for ships was developed in the 1970s. It used a programming language called CMS-2. Aegis hardware is so big that for repairs, it was often necessary to cut a hole in the ship.

Left: The Aegis system integrates a lot of components. Right: Aegis is used on over 100 ships, including the guided-missile destroyer USS Arleigh Burke.


Demo code below. Training and test data are at https://jamesmccaffrey.wordpress.com/2022/09/01/multi-class-classification-using-pytorch-1-12-1-on-windows-10-11/.

# people_politics_internal_train.py
# predict politics type from sex, age, state, income
# PyTorch 2.0.0-CPU Anaconda3-2022.10  Python 3.9.13
# Windows 10/11 

# training code is in a do_train() method, not standalone

import numpy as np
import torch as T
device = T.device('cpu')  # apply to Tensor or Module

# -----------------------------------------------------------

class PeopleDataset(T.utils.data.Dataset):
  # sex  age    state    income   politics
  # -1   0.27   0  1  0   0.7610   2
  # +1   0.19   0  0  1   0.6550   0
  # sex: -1 = male, +1 = female
  # state: michigan, nebraska, oklahoma
  # politics: conservative, moderate, liberal

  def __init__(self, src_file):
    all_xy = np.loadtxt(src_file, usecols=range(0,7),
      delimiter="\t", comments="#", dtype=np.float32)
    tmp_x = all_xy[:,0:6]   # cols [0,6) = [0,5]
    tmp_y = all_xy[:,6]     # 1-D

    self.x_data = T.tensor(tmp_x, 
      dtype=T.float32).to(device)
    self.y_data = T.tensor(tmp_y,
      dtype=T.int64).to(device)  # 1-D

  def __len__(self):
    return len(self.x_data)

  def __getitem__(self, idx):
    preds = self.x_data[idx]
    trgts = self.y_data[idx] 
    return preds, trgts  # as a Tuple

# -----------------------------------------------------------

class Net(T.nn.Module):
  def __init__(self):
    super(Net, self).__init__()
    self.hid1 = T.nn.Linear(6, 10)  # 6-(10-10)-3
    self.hid2 = T.nn.Linear(10, 10)
    self.oupt = T.nn.Linear(10, 3)

    # use default init

  def forward(self, x):
    z = T.tanh(self.hid1(x))
    z = T.tanh(self.hid2(z))
    z = T.log_softmax(self.oupt(z), dim=1)  # NLLLoss() 
    return z

  def do_train(self, ds, bs, lr, opt, me):
    # net, dataset, bat_size, lrn_rate, optimizer, max_eps

    train_ldr = T.utils.data.DataLoader(ds, batch_size=bs,
      shuffle=True)
    loss_func = T.nn.NLLLoss()  # log_softmax() activation
    if opt == 'sgd':
      optimizer = T.optim.SGD(self.parameters(), lr=lr)
    elif opt == 'adam':
      optimizer = T.optim.Adam(self.parameters(), lr=lr)  

    le = me // 5  # log interval: 5 log prints
    for epoch in range(0, me):
      epoch_loss = 0.0  # for one full epoch
      for (batch_idx, batch) in enumerate(train_ldr):
        X = batch[0]  # inputs
        Y = batch[1]  # correct class/label/politics

        optimizer.zero_grad()
        oupt = self(X)
        loss_val = loss_func(oupt, Y)  # a tensor
        epoch_loss += loss_val.item()  # accumulate
        loss_val.backward()
        optimizer.step()

      if epoch % le == 0:
        print("epoch = %5d  |  loss = %10.4f" % \
          (epoch, epoch_loss)) 

# -----------------------------------------------------------

def accuracy_q(model, dataset):  # could also be a method
  # assumes model.eval()
  X = dataset[0:len(dataset)][0]
  # Y = T.flatten(dataset[0:len(dataset)][1])
  Y = dataset[0:len(dataset)][1]
  with T.no_grad():
    oupt = model(X)  #  [40,3]  logits

  arg_maxs = T.argmax(oupt, dim=1)  # argmax() is new
  num_correct = T.sum(Y==arg_maxs)
  acc = (num_correct * 1.0 / len(dataset))
  return acc.item()

# -----------------------------------------------------------

def main():
  # 0. get started
  print("\nBegin People predict politics type ")
  T.manual_seed(1)
  np.random.seed(1)
  
  # 1. create Dataset objects
  print("\nCreating People Datasets ")

  train_file = ".\\Data\\people_train.txt"
  train_ds = PeopleDataset(train_file)  # 200 rows

  test_file = ".\\Data\\people_test.txt"
  test_ds = PeopleDataset(test_file)    # 40 rows

# -----------------------------------------------------------

  # 2. create network
  print("\nCreating 6-(10-10)-3 neural network ")
  net = Net().to(device)

# -----------------------------------------------------------

  # 3. train model
  bat_size = 10
  max_epochs = 1000
  lrn_rate = 0.01

  print("\nStarting training")
  net.train()  # set mode
  net.do_train(train_ds, bat_size, lrn_rate, 'sgd', max_epochs)
  print("Done ")

# -----------------------------------------------------------

  # 4. evaluate model accuracy
  print("\nComputing model accuracy")
  net.eval()
  acc_train = accuracy_q(net, train_ds)  # item-by-item
  print("Accuracy on training data = %0.4f" % acc_train)
  acc_test = accuracy_q(net, test_ds) 
  print("Accuracy on test data = %0.4f" % acc_test)

  print("\nEnd People predict politics demo")

if __name__ == "__main__":
  main()
Posted in PyTorch | Leave a comment

Example of Principal Component Regression Using the scikit Library

The goal of a regression problem is to predict a single numeric value. For example, you might want to predict the selling price of a house based on its square footage, number of bedrooms, age, and so on. The simplest regression technique is linear regression. Enhanced variations of linear regression include kernel ridge regression and lasso regression. More powerful regression techniques include Gaussian process regression and neural network regression. Each technique has pros and cons.

A relatively obscure regression technique is called principal component regression (PCR). Before I go any further, let me point out that PCR is obscure because in most problem scenarios other regression techniques are superior. Briefly, PCR is a variation of linear regression that simultaneously attempts to make the data more linear (for a better prediction model) and removes some data as a form of regularization to prevent model overfitting. PCR is loosely analogous to the way adding dropout to a neural network deters model overfitting.

Spoiler alert: the principal component regression technique doesn’t seem to work very well.

In a nutshell, the PCR technique starts with source training and test data. The training data is transformed using principal component analysis to find important predictors. The important predictors are used to create a standard linear regression model. Because some unimportant predictors have been stripped away, the resulting linear regression model is less prone to overfitting. At least in theory.

I put together a complete demo. I started with synthetic data that has 6 predictor values, all between -1.0 and +1.0. The target values to predict are all between 0.0 and 1.0. There are 200 training items and 40 test items. This synthetic data corresponds to realistic data that has been normalized in some way (z-score, min-max, divide-by-k, etc.) The data looks like:

-0.9452  0.3409  -0.1654   0.1174  -0.7192  -0.6038  0.8186
 0.7528  0.7892  -0.8299  -0.9219  -0.6603   0.7563  0.3687
. . .

The demo program code begins with:

# synthetic_prin_comp_regression.py
# principal component regression on synthetic dataset

# Anaconda3-2022.10  Python 3.9.13
# scikit 1.0.2  Windows 10/11 

import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

Next, the demo defines a program-defined accuracy() function:

def accuracy(model, data_X, data_y, pct_close):
  # correct within pct of true income
  n_correct = 0; n_wrong = 0

  for i in range(len(data_X)):
    X = data_X[i].reshape(1, -1)  # one-item batch
    y = data_y[i]
    pred = model.predict(X)       # predicted target

    if np.abs(pred - y) < np.abs(pct_close * y):
      n_correct += 1
    else:
      n_wrong += 1
  acc = (n_correct * 1.0) / (n_correct + n_wrong)
  return acc

A correct prediction is one that’s within pct_close of the true target value. The demo is organized using a main() function that starts with:

def main():
  # 0. prepare
  print("\nBegin principal component regression demo ")
  np.random.seed(1)
  np.set_printoptions(precision=4, suppress=True)
. . .

Next, the demo loads training and test data:

  # 1. load data
  print("\nLoading (synthetic) train and test data ")
  train_file = ".\\Data\\synthetic_train.txt"
  train_X = np.loadtxt(train_file, delimiter="\t", 
    usecols=[0,1,2,3,4,5], comments="#", dtype=np.float32)
  train_y = np.loadtxt(train_file, delimiter="\t", 
    usecols=6, comments="#", dtype=np.float32) 

  test_file = ".\\Data\\synthetic_test.txt"
  test_X = np.loadtxt(test_file, delimiter="\t",
    usecols=[0,1,2,3,4,5], comments="#", dtype=np.float32)
  test_y = np.loadtxt(test_file, delimiter="\t",
    usecols=6, comments="#", dtype=np.float32) 
  print("Done ")

The code assumes the data files are in a sub-directory named Data. I use np.loadtxt() but there are several alternatives, including Pandas. The demo displays the first four rows of training data to make sure the data was read correctly:

  print("\nPredictor X data: ")
  print(train_X[0:4][:])
  print(". . .")
  print("\nTarget y values: ")
  print(train_y[0:4])
  print(". . .")

Although it’s not essential, the demo creates a linear regression model on the source data, and computes the accuracy of the model (within 15% of correct target):

  # 1a. establish a baseline LR model
  base_model = LinearRegression()
  base_model.fit(train_X, train_y)  # train on source data
  base_acc_train = accuracy(base_model, train_X, train_y, 0.15)
  print("\nBase accuracy on train data = %0.4f " % base_acc_train)
  base_acc_test = accuracy(base_model, test_X, test_y, 0.15)
  print("Base accuracy on test data = %0.4f " % base_acc_test)

For the demo data, the accuracy on the training and test data is 0.8900 and 0.9250 — quite strong. Next, the demo uses the source predictors to compute the underlying principal components. Because there are 6 predictor variables, there are 6 principal components.

  # 2. compute principal components and transform predictors
  print("\nComputing principal components ")
  pca = PCA()       # all 6 components
  pca.fit(train_X)  # compute them
  print("Done. Variance explained: ")
  print(pca.explained_variance_ratio_)

The percent of variance explained by the principal components are: [0.211, 0.186, 0.171, 0.152, 0.140, 0.138]. The values are ordered from most explained variance to least explained. For the demo data, all the explained values are roughly equal. Next, the source predictor values are transformed:

  # 3. transform the source X data
  print("\nComputing transformed X data ")
  transformed_X = pca.transform(train_X)
  print(transformed_X)

  # print("\nComputing original X from transformed X ")
  # orig_X = pca.inverse_transform(transformed_X)
  # print(orig_X)

The important idea here is that no new information is being added. The transformed data is just a different representation ordered by most important to least important. The commented code shows how the original data can be reconstructed using the inverse_transform() method. Next, the demo creates a subset of the transformed data, using 5 of the 6 variables. This removes a chunk of information. Then a standard linear regression model is created using the subset of transformed data:

  # 4. create linear regression model using transformed X
  n = 5  # number components to use
  print("\nTraining model on first " + str(n) + " components ")
  subset_trans_X = transformed_X[:,0:n]
  model = LinearRegression()
  model.fit(subset_trans_X, train_y)  # train

The demo computes the accuracy of the principal component regression model like so:

  # 5. compute model accuracy
  print("\nComputing model accuracy within 15% of true target ")
  acc_train = accuracy(model, subset_trans_X, train_y, 0.15)
  print("Accuracy on train data = %0.4f " % acc_train)

  transformed_test_X = pca.transform(test_X)
  subset_trans_test_X = transformed_test_X[:,0:n]
  acc_test = accuracy(model, subset_trans_test_X, test_y, 0.15)
  print("Accuracy on test data = %0.4f " % acc_test)

The accuracy of the PCR model on the training and test data is just 0.40 (160 out of 200 correct) and 0.25 (10 out of 40 correct). Terrible! By removing information, the linear regression model isn’t as nearly good as the base model that uses all source data. This happened because all 6 components are important. The intended effect of dropping less important data to prevent model overfitting did not happen.

If the linear regression model had been created using all 6 components instead of just 5, the accuracy of the model would be identical to the baseline model created using all source data. Again — applying principal component analysis does not add any new information.

The demo concludes by showing how to use the principal component regression model to make a prediction on a new, previously unseen data item:

. . .
  # 6. use model
  print("\nPredicting for [0.5, 0.5, 0.5, 0.5, 0.5, 0.5] ")
  x = np.array([[0.5, 0.5, 0.5, 0.5, 0.5, 0.5]],
    dtype=np.float32)
  x_trans = pca.transform(x)
  x_trans_subset = x_trans[:,0:n]  # first n values
  predicted = model.predict(x_trans_subset)
  print("y = %0.4f " % predicted)

  print("\nEnd principal component regression demo ")
if __name__ == "__main__":
  main()

In essence, principal component regression as described in this blog post is a just type of linear regression where relatively unimportant predictor data is removed in order to prevent model overfitting. An interesting idea that I haven’t seen tried before is to use principal component analysis followed by Gaussian process regression (GPR) instead of linear regression. GPR is powerful but highly susceptible to overfitting so a “principal component Gaussian process regression” technique might be useful. However, GPR already has two ways to add noise: the alpha parameter of the model constructor, and the WhiteKernel() module for the model kernel parameter. But using PCA + GPR is something for me to try when I get a chance.



In poster art, I think a key to success is how much detail to include and how much to remove. And in burlesque, a key to success is arguably how much costume to include and how much to remove. The Folies Bergere is a music hall in Paris. It opened in 1869. The name loosely means “variety show near Bergere (Shepherd) Street”. The Paris Folies Bergere inspired a Las Vegas Folies Bergere that ran in the Tropicana Hotel from 1959 to 2009. I saw that show when I was in Vegas for a conference — the show was good but I prefer modern show like those from Cirque du Soleil.


Demo training data. Replace comma characters with tabs or modify data-reading code.

# synthetic_train.txt
# six predictors followed by target y
#
-0.1660,0.4406,-0.9998,-0.3953,-0.7065,-0.8153,0.7022
-0.2065,0.0776,-0.1616,0.3704,-0.5911,0.7562,0.5666
-0.9452,0.3409,-0.1654,0.1174,-0.7192,-0.6038,0.8186
0.7528,0.7892,-0.8299,-0.9219,-0.6603,0.7563,0.3687
-0.8033,-0.1578,0.9158,0.0663,0.3838,-0.3690,0.7535
-0.9634,0.5003,0.9777,0.4963,-0.4391,0.5786,0.7076
-0.7935,-0.1042,0.8172,-0.4128,-0.4244,-0.7399,0.8454
-0.0169,-0.8933,0.1482,-0.7065,0.1786,0.3995,0.7302
-0.7953,-0.1719,0.3888,-0.1716,-0.9001,0.0718,0.8692
0.8892,0.1731,0.8068,-0.7251,-0.7214,0.6148,0.4740
-0.2046,-0.6693,0.8550,-0.3045,0.5016,0.4520,0.6714
0.5019,-0.3022,-0.4601,0.7918,-0.1438,0.9297,0.4331
0.3269,0.2434,-0.7705,0.8990,-0.1002,0.1568,0.3716
0.8068,0.1474,-0.9943,0.2343,-0.3467,0.0541,0.3829
0.7719,-0.2855,0.8171,0.2467,-0.9684,0.8589,0.4700
0.8652,0.3936,-0.8680,0.5109,0.5078,0.8460,0.2648
0.4230,-0.7515,-0.9602,-0.9476,-0.9434,-0.5076,0.8059
0.1056,0.6841,-0.7517,-0.4416,0.1715,0.9392,0.3512
0.1221,-0.9627,0.6013,-0.5341,0.6142,-0.2243,0.6840
0.1125,-0.7271,-0.8802,-0.7573,-0.9109,-0.7850,0.8640
-0.5486,0.4260,0.1194,-0.9749,-0.8561,0.9346,0.6109
-0.4953,0.4877,-0.6091,0.1627,0.9400,0.6937,0.3382
-0.5203,-0.0125,0.2399,0.6580,-0.6864,-0.9628,0.7400
0.2127,0.1377,-0.3653,0.9772,0.1595,-0.2397,0.4081
0.1019,0.4907,0.3385,-0.4702,-0.8673,-0.2598,0.6582
0.5055,-0.8669,-0.4794,0.6095,-0.6131,0.2789,0.6644
0.0493,0.8496,-0.4734,-0.8681,0.4701,0.5444,0.3214
0.9004,0.1133,0.8312,0.2831,-0.2200,-0.0280,0.3149
0.2086,0.0991,0.8524,0.8375,-0.2102,0.9265,0.3619
-0.7298,0.0113,-0.9570,0.8959,0.6542,-0.9700,0.6451
-0.6476,-0.3359,-0.7380,0.6190,-0.3105,0.8802,0.6606
0.6895,0.8108,-0.0802,0.0927,0.5972,-0.4286,0.2427
-0.0195,0.1982,-0.9689,0.1870,-0.1326,0.6147,0.4773
0.1557,-0.6320,0.5759,0.2241,-0.8922,-0.1596,0.7581
0.3581,0.8372,-0.9992,0.9535,-0.2468,0.9476,0.2962
0.1494,0.2562,-0.4288,0.1737,0.5000,0.7166,0.3513
0.5102,0.3961,0.7290,-0.3546,0.3416,-0.0983,0.3153
-0.1970,-0.3652,0.2438,-0.1395,0.9476,0.3556,0.4719
-0.6029,-0.1466,-0.3133,0.5953,0.7600,0.8077,0.3875
-0.4953,0.7098,0.0554,0.6043,0.1450,0.4663,0.4739
0.0380,0.5418,0.1377,-0.0686,-0.3146,-0.8636,0.6048
0.9656,-0.6368,0.6237,0.7499,0.3768,0.1390,0.3705
-0.6781,-0.0662,-0.3097,-0.5499,0.1850,-0.3755,0.7668
-0.6141,-0.0008,0.4572,-0.5836,-0.5039,0.7033,0.7301
-0.1683,0.2334,-0.5327,-0.7961,0.0317,-0.0457,0.5777
0.0880,0.3083,-0.7109,0.5031,-0.5559,0.0387,0.5118
0.5706,-0.9553,-0.3513,0.7458,0.6894,0.0769,0.4329
-0.8025,0.3026,0.4070,0.2205,0.5992,-0.9309,0.7098
0.5405,0.4635,-0.4806,-0.4859,0.2646,-0.3094,0.3566
0.5655,0.9809,-0.3995,-0.7140,0.8026,0.0831,0.2551
0.9495,0.2732,0.9878,0.0921,0.0529,-0.7291,0.3074
-0.6792,0.4913,-0.9392,-0.2669,0.7247,0.3854,0.4362
0.3819,-0.6227,-0.1162,0.1632,0.9795,-0.5922,0.4435
0.5003,-0.0860,-0.8861,0.0170,-0.5761,0.5972,0.5136
-0.4053,-0.9448,0.1869,0.6877,-0.2380,0.4997,0.7859
0.9189,0.6079,-0.9354,0.4188,-0.0700,0.8951,0.2696
-0.5571,-0.4659,-0.8371,-0.1428,-0.7820,0.2676,0.8566
0.5324,-0.3151,0.6917,-0.1425,0.6480,0.2530,0.4252
-0.7132,-0.8432,-0.9633,-0.8666,-0.0828,-0.7733,0.9217
-0.0952,-0.0998,-0.0439,-0.0520,0.6063,-0.1952,0.5140
0.8094,-0.9259,0.5477,-0.7487,0.2370,-0.9793,0.5562
0.9024,0.8108,0.5919,0.8305,-0.7089,-0.6845,0.2993
-0.6247,0.2450,0.8116,0.9799,0.4222,0.4636,0.4619
-0.5003,-0.6531,-0.7611,0.6252,-0.7064,-0.4714,0.8452
0.6382,-0.3788,0.9648,-0.4667,0.0673,-0.3711,0.5070
-0.1328,0.0246,0.8778,-0.9381,0.4338,0.7820,0.5680
-0.9454,0.0441,-0.3480,0.7190,0.1170,0.3805,0.6562
-0.4198,-0.9813,0.1535,-0.3771,0.0345,0.8328,0.7707
-0.1471,-0.5052,-0.2574,0.8637,0.8737,0.6887,0.3436
-0.3712,-0.6505,0.2142,-0.1728,0.6327,-0.6297,0.7430
0.4038,-0.5193,0.1484,-0.3020,-0.8861,-0.5424,0.7499
0.0380,-0.6506,0.1414,0.9935,0.6337,0.1887,0.4509
0.9520,0.8031,0.1912,-0.9351,-0.8128,-0.8693,0.5336
0.9507,-0.6640,0.9456,0.5349,0.6485,0.2652,0.3616
0.3375,-0.0462,-0.9737,-0.2940,-0.0159,0.4602,0.4840
-0.7247,-0.9782,0.5166,-0.3601,0.9688,-0.5595,0.7751
-0.3226,0.0478,0.5098,-0.0723,-0.7504,-0.3750,0.8025
0.5403,-0.7393,-0.9542,0.0382,0.6200,-0.9748,0.5359
0.3449,0.3736,-0.1015,0.8296,0.2887,-0.9895,0.4390
0.6608,0.2983,0.3474,0.1570,-0.4518,0.1211,0.3624
0.3435,-0.2951,0.7117,-0.6099,0.4946,-0.4208,0.5283
0.6154,-0.2929,-0.5726,0.5346,-0.3827,0.4665,0.4907
0.4889,-0.5572,-0.5718,-0.6021,-0.7150,-0.2458,0.7202
-0.8389,-0.5366,-0.5847,0.8347,0.4226,0.1078,0.6391
-0.3910,0.6697,-0.1294,0.8469,0.4121,-0.0439,0.4693
-0.1376,-0.1916,-0.7065,0.4586,-0.6225,0.2878,0.6695
0.5086,-0.5785,0.2019,0.4979,0.2764,0.1943,0.4666
0.8906,-0.1489,0.5644,-0.8877,0.6705,-0.6155,0.3480
-0.2098,-0.3998,-0.8398,0.8093,-0.2597,0.0614,0.6341
-0.5871,-0.8476,0.0158,-0.4769,-0.2859,-0.7839,0.9006
0.5751,-0.7868,0.9714,-0.6457,0.1448,-0.9103,0.6049
0.0558,0.4802,-0.7001,0.1022,-0.5668,0.5184,0.4612
0.4458,-0.6469,0.7239,-0.9604,0.7205,0.1178,0.5941
0.4339,0.9747,-0.4438,-0.9924,0.8678,0.7158,0.2627
0.4577,0.0334,0.4139,0.5611,-0.2502,0.5406,0.3847
-0.1963,0.3946,-0.9938,0.5498,0.7928,-0.5214,0.5025
-0.7585,-0.5594,-0.3958,0.7661,0.0863,-0.4266,0.7481
0.2277,-0.3517,-0.0853,-0.1118,0.6563,-0.1473,0.4798
-0.3086,0.3499,-0.5570,-0.0655,-0.3705,0.2537,0.5768
0.5689,-0.0861,0.3125,-0.7363,-0.1340,0.8186,0.5035
0.2110,0.5335,0.0094,-0.0039,0.6858,-0.8644,0.4243
0.0357,-0.6111,0.6959,-0.4967,0.4015,0.0805,0.6611
0.8977,0.2487,0.6760,-0.9841,0.9787,-0.8446,0.2873
-0.9821,0.6455,0.7224,-0.1203,-0.4885,0.6054,0.6908
-0.0443,-0.7313,0.8557,0.7919,-0.0169,0.7134,0.6039
-0.2040,0.0115,-0.6209,0.9300,-0.4116,-0.7931,0.6495
-0.7114,-0.9718,0.4319,0.1290,0.5892,0.0142,0.7675
0.5557,-0.1870,0.2955,-0.6404,-0.3564,-0.6548,0.6295
-0.1827,-0.5172,-0.1862,0.9504,-0.3594,0.9650,0.5685
0.7150,0.2392,-0.4959,0.5857,-0.1341,-0.2850,0.3585
-0.3394,0.3947,-0.4627,0.6166,-0.4094,0.0882,0.5962
0.7768,-0.6312,0.1707,0.7964,-0.1078,0.8437,0.4243
-0.4420,0.2177,0.3649,-0.5436,-0.9725,-0.1666,0.8086
0.5595,-0.6505,-0.3161,-0.7108,0.4335,0.3986,0.5846
0.3770,-0.4932,0.3847,-0.5454,-0.1507,-0.2562,0.6335
0.2633,0.4146,0.2272,0.2966,-0.6601,-0.7011,0.5653
0.0284,0.7507,-0.6321,-0.0743,-0.1421,-0.0054,0.4219
-0.4762,0.6891,0.6007,-0.1467,0.2140,-0.7091,0.6098
0.0192,-0.4061,0.7193,0.3432,0.2669,-0.7505,0.6549
0.8966,0.2902,-0.6966,0.2783,0.1313,-0.0627,0.2876
-0.1439,0.1985,0.6999,0.5022,0.1587,0.8494,0.3872
0.2473,-0.9040,-0.4308,-0.8779,0.4070,0.3369,0.6825
-0.2428,-0.6236,0.4940,-0.3192,0.5906,-0.0242,0.6770
0.2885,-0.2987,-0.5416,-0.1322,-0.2351,-0.0604,0.6106
0.9590,-0.2712,0.5488,0.1055,0.7783,-0.2901,0.2956
-0.9129,0.9015,0.1128,-0.2473,0.9901,-0.8833,0.6500
0.0334,-0.9378,0.1424,-0.6391,0.2619,0.9618,0.7033
0.4169,0.5549,-0.0103,0.0571,-0.6984,-0.2612,0.4935
-0.7156,0.4538,-0.0460,-0.1022,0.7720,0.0552,0.4983
-0.8560,-0.1637,-0.9485,-0.4177,0.0070,0.9319,0.6445
-0.7812,0.3461,-0.0001,0.5542,-0.7128,-0.8336,0.7720
-0.6166,0.5356,-0.4194,-0.5662,-0.9666,-0.2027,0.7401
-0.2378,0.3187,-0.8582,-0.6948,-0.9668,-0.7724,0.7670
-0.3579,0.1158,0.9869,0.6690,0.3992,0.8365,0.4184
-0.9205,-0.8593,-0.0520,-0.3017,0.8745,-0.0209,0.7723
-0.1067,0.7541,-0.4928,-0.4524,-0.3433,0.0951,0.4645
-0.5597,0.3429,-0.7144,-0.8118,0.7404,-0.5263,0.6117
0.0516,-0.8480,0.7483,0.9023,0.6250,-0.4324,0.5987
0.0557,-0.3212,0.1093,0.9488,-0.3766,0.3376,0.5739
-0.3484,0.7797,0.5034,0.5253,-0.0610,-0.5785,0.5365
-0.9170,-0.3563,-0.9258,0.3877,0.3407,-0.1391,0.7131
-0.9203,-0.7304,-0.6132,-0.3287,-0.8954,0.2102,0.9329
0.0241,0.2349,-0.1353,0.6954,-0.0919,-0.9692,0.5744
0.6460,0.9036,-0.8982,-0.5299,-0.8733,-0.1567,0.4425
0.7277,-0.8368,-0.0538,-0.7489,0.5458,0.6828,0.5848
-0.5212,0.9049,0.8878,0.2279,0.9470,-0.3103,0.4255
0.7957,-0.1308,-0.5284,0.8817,0.3684,-0.8702,0.3969
0.2099,0.4647,-0.4931,0.2010,0.6292,-0.8918,0.4620
-0.7390,0.6849,0.2367,0.0626,-0.5034,-0.4098,0.7137
-0.8711,0.7940,-0.5932,0.6525,0.7635,-0.0265,0.5705
0.1969,0.0545,0.2496,0.7101,-0.4357,0.7675,0.4242
-0.5460,0.1920,-0.5211,-0.7372,-0.6763,0.6897,0.6769
0.2044,0.9271,-0.3086,0.1913,0.1980,0.2314,0.2998
-0.6149,0.5059,-0.9854,-0.3435,0.8352,0.1767,0.4497
0.7104,0.2093,0.6452,0.7590,-0.3580,-0.7541,0.4076
-0.7465,0.1796,-0.9279,-0.5996,0.5766,-0.9758,0.7713
-0.3933,-0.9572,0.9950,0.1641,-0.4132,0.8579,0.7421
0.1757,-0.4717,-0.3894,-0.2567,-0.5111,0.1691,0.7088
0.3917,-0.8561,0.9422,0.5061,0.6123,0.5033,0.4824
-0.1087,0.3449,-0.1025,0.4086,0.3633,0.3943,0.3760
0.2372,-0.6980,0.5216,0.5621,0.8082,-0.5325,0.5297
-0.3589,0.6310,0.2271,0.5200,-0.1447,-0.8011,0.5903
-0.7699,-0.2532,-0.6123,0.6415,0.1993,0.3777,0.6039
-0.5298,-0.0768,-0.6028,-0.9490,0.4588,0.4498,0.6159
-0.3392,0.6870,-0.1431,0.7294,0.3141,0.1621,0.4501
0.7889,-0.3900,0.7419,0.8175,-0.3403,0.3661,0.4087
0.7984,-0.8486,0.7572,-0.6183,0.6995,0.3342,0.5025
0.2707,0.6956,0.6437,0.2565,0.9126,0.1798,0.2331
-0.6043,-0.1413,-0.3265,0.9839,-0.2395,0.9854,0.5444
-0.8509,-0.2594,-0.7532,0.2690,-0.1722,0.9818,0.6516
0.8599,-0.7015,-0.2102,-0.0768,0.1219,0.5607,0.4747
-0.4760,0.8216,-0.9555,0.6422,-0.6231,0.3715,0.5485
-0.2896,0.9484,-0.7545,-0.6249,0.7789,0.1668,0.3415
-0.5931,0.7926,0.7462,0.4006,-0.0590,0.6543,0.4781
-0.0083,-0.2730,-0.4488,0.8495,-0.2260,-0.0142,0.5854
-0.2335,-0.4049,0.4352,-0.6183,-0.7636,0.6740,0.7596
0.4883,0.1810,-0.5142,0.2465,0.2767,-0.3449,0.3995
-0.4922,0.1828,-0.1424,-0.2358,-0.7466,-0.5115,0.7968
-0.8413,-0.3943,0.4834,0.2300,0.3448,-0.9832,0.7989
-0.5382,-0.6502,-0.6300,0.6885,0.9652,0.8275,0.4353
-0.3053,0.5604,0.0929,0.6329,-0.0325,0.1799,0.4848
0.0740,-0.2680,0.2086,0.9176,-0.2144,-0.2141,0.5856
0.5813,0.2902,-0.2122,0.3779,-0.1920,-0.7278,0.4079
-0.5641,0.8515,0.3793,0.1976,0.4933,0.0839,0.4716
0.4011,0.8611,0.7252,-0.6651,-0.4737,-0.8568,0.5708
-0.5785,0.0056,-0.7901,-0.2223,0.0760,-0.3216,0.7252
0.1118,0.0735,-0.2188,0.3925,0.3570,0.3746,0.3688
0.2262,0.8715,0.1938,0.9592,-0.1180,0.4792,0.2952
-0.9248,0.5295,0.0366,-0.9894,-0.4456,0.0697,0.7335
0.2992,0.8629,-0.8505,-0.4464,0.8385,0.5300,0.2702
0.1995,0.6659,0.7921,0.9454,0.9970,-0.7207,0.2996
-0.3066,-0.2927,-0.4923,0.8220,0.4513,-0.9481,0.6617
-0.0770,-0.4374,-0.9421,0.7694,0.5420,-0.3405,0.5131
-0.3842,0.8562,0.9538,0.0471,0.9039,0.7760,0.3215
0.0361,-0.2545,0.4207,-0.0887,0.2104,0.9808,0.5202
-0.8220,-0.6302,0.0537,-0.1658,0.6013,0.8664,0.6598
-0.6443,0.7201,0.9148,0.9189,-0.9243,-0.8848,0.6095
-0.2880,0.9074,-0.0461,-0.4435,0.0060,0.2867,0.4025
-0.7775,0.5161,0.7039,0.6885,0.7810,-0.2363,0.5234
-0.5484,0.9426,-0.4308,0.8148,0.7811,0.8450,0.3479

Demo test data.

# synthetic_test.txt
# six predictors followed by target y
#
-0.6877,0.7594,0.2640,-0.5787,-0.3098,-0.6802,0.7071
-0.6694,-0.6056,0.3821,0.1476,0.7466,-0.5107,0.7282
0.2592,-0.9311,0.0324,0.7265,0.9683,-0.9803,0.5832
-0.9049,-0.9797,-0.0196,-0.9090,-0.4433,0.2799,0.9018
-0.4106,-0.4607,0.1811,-0.2389,0.4050,-0.0078,0.6916
-0.4259,-0.7336,0.8742,0.6097,0.8761,-0.6292,0.6728
0.8663,0.8715,-0.4329,-0.4507,0.1029,-0.6294,0.2936
0.8948,-0.0124,0.9278,0.2899,-0.0314,0.9354,0.3160
-0.7136,0.2647,0.3238,-0.1323,-0.8813,-0.0146,0.8133
-0.4867,-0.2171,-0.5197,0.3729,0.9798,-0.6451,0.5820
0.6429,-0.5380,-0.8840,-0.7224,0.8703,0.7771,0.5777
0.6999,-0.1307,-0.0639,0.2597,-0.6839,-0.9704,0.5796
-0.4690,-0.9691,0.3490,0.1029,-0.3567,0.5604,0.8151
-0.4154,-0.6081,-0.8241,0.7400,-0.8236,0.3674,0.7881
-0.7592,-0.9786,0.1145,0.8142,0.7209,-0.3231,0.6968
0.3393,0.6156,0.7950,-0.0923,0.1157,0.0123,0.3229
0.3840,0.3658,0.0406,0.6569,0.0116,0.6497,0.2879
0.9397,0.4839,-0.4804,0.1625,0.9105,-0.8385,0.2410
-0.8329,0.2383,-0.5510,0.5304,0.1363,0.3324,0.5862
-0.8255,-0.2579,0.3443,-0.6208,0.7915,0.8997,0.6109
0.9231,0.4602,-0.1874,0.4875,-0.4240,-0.3712,0.3165
0.7573,-0.4908,0.5324,0.8820,-0.9979,-0.0478,0.6093
0.3141,0.6866,-0.6325,0.7123,-0.2713,0.7845,0.3050
-0.1647,-0.6616,0.2998,-0.9260,-0.3768,-0.3530,0.8315
0.2149,0.3017,0.6921,0.8552,0.3209,0.1563,0.3157
-0.6918,0.7902,-0.3780,0.0970,0.3641,-0.5271,0.6323
-0.6645,0.0170,0.5837,0.3848,-0.7621,0.8015,0.7440
0.1069,-0.8304,-0.5951,0.7085,0.4119,0.7899,0.4998
-0.3417,0.0560,0.3008,0.1886,-0.5371,-0.1464,0.7339
0.9734,-0.8669,0.4279,-0.3398,0.2509,-0.4837,0.4665
0.3020,-0.2577,-0.4104,0.8235,0.8850,0.2271,0.3066
-0.5766,0.6603,-0.5198,0.2632,0.4215,0.4848,0.4478
-0.2195,0.5197,0.8059,0.1748,-0.8192,-0.7420,0.6740
-0.9212,-0.5169,0.7581,0.9470,0.2108,0.9525,0.6180
-0.9131,0.8971,-0.3774,0.5979,0.6213,0.7200,0.4642
-0.4842,0.8689,0.2382,0.9709,-0.9347,0.4503,0.5662
0.1311,-0.0152,-0.4816,-0.3463,-0.5011,-0.5615,0.6979
-0.8336,0.5540,0.0673,0.4788,0.0308,-0.2001,0.6917
0.9725,-0.9435,0.8655,0.8617,-0.2182,-0.5711,0.6021
0.6064,-0.4921,-0.4184,0.8318,0.8058,0.0708,0.3221
Posted in Scikit | Leave a comment

Hyperparameter Search Using Evolutionary Optimization for a PyTorch Binary Classifier

Bear with me for a moment — it’s difficult to explain the topic of this blog post. I’ll work backwards from the output of a demo program:

. . .
End evolution

Final best soln found:
[9 6 4 9 5 4]

Final best hyperparameters found:
num hidden nodes = 20
hidden activation = relu
batch size = 8
learn rate = 0.1200
max epochs = 500
optimizer = sgd

Final best error = 0.1300
Final best weighted acc = 0.8700

Final best train acc = 0.9300
Final best test acc = 0.8500

End evolutionary parameter search

The problem is to programmatically find a good set of hyperparameters (number of hidden nodes, learning rate, etc.) for a neural network. The demo program used evolutionary optimization, and found values that give 93% accuracy on a set of test data and 85% accuracy on a set of training data — very good results.

For simple neural networks and datasets, it’s usually possible to manually search for a good set of hyperparameter values. But for complex neural networks (especially those with a Transformer component) and/or with large datasets, it’s usually necessary to programmatically search. Grid search and random search sometimes work well but I prefer using evolutionary optimization (EO):

create a population of N random solutions (hyperparam values)
loop max_gen generations
  select two parent solutions
  make a child solution
  mutate child slightly
  replace a weak soln in population with new child
end-loop
return best solution found

There are many, many design decisions. My latest design incorporates functionality to prevent duplicate solutions from being evaluated (which, although conceptually simple, required an annoyingly large amount of code). A solution (set of hyperparameter values) looks like [9, 0, 2, 2, 4, 3]. For each solution, I created a string key like “902243” and added to a Dictionary collection with a dummy value of 1.

Also, I weighted solution goodness by using 3 times accuracy on test data compared to accuracy on training data (to avoid situations with near 100% accuracy on training data but only so-so accuracy on test data). For example, the best solution found in the demo had 0.85 accuracy on training data and 0.93 accuracy on test data so the weighted accuracy is ((3 * 0.85) + (1 * 0.93)) / 4 = 0.87.

My demo uses one of my standard synthetic datasets. The goal is to predict a person’s sex (male = 0, female = 1) from age, State of residence (Michigan, Nebraska, Oklahoma), annual income, and political leaning (conservative, moderate, liberal).

My demo code works quite well but it’s complex. Sometimes difficult problems require complex solutions.



There’s a huge amount of research that examines the many big differences between males and females. Women tend to be more sociable, sensitive, warm, compassionate, polite, anxious, self-doubting, and more open to aesthetics. Men tend to be more dominant, assertive, risk-prone, thrill-seeking, tough-minded, emotionally stable, utilitarian, and open to abstract ideas. In short, men prefer working mostly alone with things and abstract ideas (like computer programming) and women prefer working in groups with verbal communication (like administrative assistants). These sex differences appear starting at early childhood and are firmly entrenched by teen years. See blogs.scientificamerican.com/beautiful-minds/taking-sex-differences-in-personality-seriously/ for a nice summary of the research.


Demo code below. Replace “lt” (less-than) etc. with Boolean operator symbols. The training and test data are at https://jamesmccaffrey.wordpress.com/2022/09/23/binary-classification-using-pytorch-1-12-1-on-windows-10-11/.

# people_gender_evo_hyperparams.py
# binary classification, evolutionary hyperparam search
# PyTorch 2.0.0-CPU Anaconda3-2022.10  Python 3.9.13
# Windows 10/11 

import numpy as np
import torch as T
device = T.device('cpu')  # apply to Tensor or Module

class PeopleDataset(T.utils.data.Dataset):
  # sex age   state  income  politics
  #  0  0.27  0 1 0  0.7610  0 0 1
  #  1  0.19  0 0 1  0.6550  1 0 0
  # sex: 0 = male, 1 = female
  # state: michigan, nebraska, oklahoma
  # politics: conservative, moderate, liberal

  def __init__(self, src_file):
    all_data = np.loadtxt(src_file, usecols=range(0,9),
      delimiter="\t", comments="#", dtype=np.float32) 

    self.x_data = T.tensor(all_data[:,1:9],
      dtype=T.float32).to(device)
    self.y_data = T.tensor(all_data[:,0],
      dtype=T.float32).to(device)  # float32 required

    self.y_data = self.y_data.reshape(-1,1)  # 2-D required

  def __len__(self):
    return len(self.x_data)

  def __getitem__(self, idx):
    feats = self.x_data[idx,:]  # idx row, all 8 cols
    sex = self.y_data[idx,:]    # idx row, the only col
    return feats, sex  # as a Tuple

# -----------------------------------------------------------

class Net(T.nn.Module):
  def __init__(self, n_hid, activ):
    super(Net, self).__init__()
    self.hid1 = T.nn.Linear(8, n_hid, activ)  # like 8-(10-10)-1
    self.hid2 = T.nn.Linear(n_hid, n_hid)
    self.oupt = T.nn.Linear(n_hid, 1)

    if activ == 'tanh':
      self.activ = T.nn.Tanh()
    elif activ == 'relu':
      self.activ = T.nn.ReLU()

    T.nn.init.xavier_uniform_(self.hid1.weight) 
    T.nn.init.zeros_(self.hid1.bias)
    T.nn.init.xavier_uniform_(self.hid2.weight) 
    T.nn.init.zeros_(self.hid2.bias)
    T.nn.init.xavier_uniform_(self.oupt.weight) 
    T.nn.init.zeros_(self.oupt.bias)

  def forward(self, x):
    z = self.activ(self.hid1(x))
    z = self.activ(self.hid2(z))
    z = T.sigmoid(self.oupt(z))  # for BCELoss()
    return z

# -----------------------------------------------------------

def accuracy_q(model, ds):
  inpts = ds[:][0]      # all input rows
  targets = ds[:][1]    # all targets 0s and 1s
  with T.no_grad():
    oupts = model(inpts)       # all computed ouputs
  pred_y = oupts "gte" 0.5        # tensor of 0s and 1s
  num_correct = T.sum(targets==pred_y)
  acc = (num_correct.item() * 1.0 / len(ds))  # scalar
  return acc

# -----------------------------------------------------------

def metrics(model, ds, thresh=0.5):
  # note: N = total number of items = TP + FP + TN + FN
  # accuracy  = (TP + TN)  / N
  # precision = TP / (TP + FP)
  # recall    = TP / (TP + FN)
  # F1        = 2 / [(1 / precision) + (1 / recall)]

  tp = 0; tn = 0; fp = 0; fn = 0
  for i in range(len(ds)):
    inpts = ds[i][0]         # dictionary style
    target = ds[i][1]        # float32  [0.0] or [1.0]
    target = target.int()    # int 0 or 1
    with T.no_grad():
      p = model(inpts)       # between 0.0 and 1.0

    # should really avoid 'target == 1.0'
    if target == 1 and p "gte" thresh:    # TP
      tp += 1
    elif target == 1 and p "lt" thresh:   # FP
      fn += 1
    elif target == 0 and p "lt" thresh:   # TN
      tn += 1
    elif target == 0 and p "gte" thresh:  # FN
      fp += 1

  N = tp + fp + tn + fn
  if N != len(ds):
    print("FATAL LOGIC ERROR in metrics()")

  accuracy = (tp + tn) / (N * 1.0)
  precision = (1.0 * tp) / (tp + fp)  # tp + fp != 0
  recall = (1.0 * tp) / (tp + fn)     # tp + fn != 0
  f1 = 2.0 / ((1.0 / precision) + (1.0 / recall))
  return (accuracy, precision, recall, f1)  # as a Tuple

# -----------------------------------------------------------

def train(net, ds, bs, lr, me, opt='sgd', verbose=False):
  # dataset, bat_size, lrn_rate, max_epochs, optimizer
  v = verbose
  train_ldr = T.utils.data.DataLoader(ds, batch_size=bs,
    shuffle=True)
  loss_func = T.nn.BCELoss()  # sigmoid activation
  if opt == 'sgd':
    optimizer = T.optim.SGD(net.parameters(), lr=lr)
  elif opt == 'adam':
    optimizer = T.optim.Adam(net.parameters(), lr=lr)  

  if v: print("\nStarting training ")
  le = me // 4  # log interval: n log prints
  for epoch in range(0, me):
    epoch_loss = 0.0  # for one full epoch
    for (batch_idx, batch) in enumerate(train_ldr):
      X = batch[0]  # inputs
      y = batch[1]  # correct class/label/politics

      optimizer.zero_grad()
      oupt = net(X)
      loss_val = loss_func(oupt, y)  # a tensor
      epoch_loss += loss_val.item()  # accumulate
      loss_val.backward()
      optimizer.step()

    if v:
      if epoch % le == 0:
        print("epoch = %5d  |  loss = %10.4f" % \
          (epoch, epoch_loss)) 
  if v: print("Done ") 

# -----------------------------------------------------------

def evaluate(soln, trn_ds, tst_ds, verbose=False):
  # compute the meta error of a soln
  # [n_hid, activ, bs, lr, me, opt]
  #   [0]    [1]   [2] [3] [4] [5]
  v = verbose

  n_hids = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
  activs = ['tanh', 'tanh','tanh','tanh','tanh',
    'relu', 'relu', 'relu', 'relu', 'relu']
  b_szs = [1, 2, 4, 6, 8, 10, 12, 14, 16, 20]
  rates = [0.001, 0.005, 0.008, 0.01, 0.02, 0.03, 0.05,
    0.08, 0.10, 0.12]
  max_eps = [50, 100, 200, 300, 400, 500, 600, 700, 800, 1000]
  opts = ['sgd', 'sgd', 'sgd', 'sgd', 'sgd',
    'adam', 'adam', 'adam', 'adam', 'adam']

  n_hid = n_hids[soln[0]]
  activ = activs[soln[1]]
  bs = b_szs[soln[2]]
  lr = rates[soln[3]]
  me = max_eps[soln[4]]
  opt = opts[soln[5]]

  T.manual_seed(1)  # prepare
  np.random.seed(1)

  net = Net(n_hid, activ).to(device)  # create NN

  net.train()
  train(net, trn_ds, bs, lr, me, opt, verbose)  # train NN

  net.eval()
  acc_train = accuracy_q(net, trn_ds)  # evaluate NN accuracy
  acc_test = accuracy_q(net, tst_ds) 
  acc_weighted = ((1 * acc_train) + (3 * acc_test)) / 4
  error = 1.0 - acc_weighted  # [0.0, 1.0]
  if v: print("train acc = %0.4f " % acc_train)
  if v: print("test_acc = %0.4f " % acc_test)
  return (acc_train, acc_test, error)

# -----------------------------------------------------------

def show_soln_to_hyperparams(soln):
  n_hids = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
  activs = ['tanh', 'tanh','tanh','tanh','tanh',
    'relu', 'relu', 'relu', 'relu', 'relu']
  b_szs = [1, 2, 4, 6, 8, 10, 12, 14, 16, 20]
  rates = [0.001, 0.005, 0.008, 0.01, 0.02, 0.03, 0.05,
    0.08, 0.10, 0.12]
  max_eps = [50, 100, 200, 300, 400, 500, 600, 700, 800, 1000]
  opts = ['sgd', 'sgd', 'sgd', 'sgd', 'sgd',
    'adam', 'adam', 'adam', 'adam', 'adam']

  n_hid = n_hids[soln[0]]
  activ = activs[soln[1]]
  bs = b_szs[soln[2]]
  lr = rates[soln[3]]
  me = max_eps[soln[4]]
  opt = opts[soln[5]]

  print("num hidden nodes = " + str(n_hid))
  print("hidden activation = " + str(activ))
  print("batch size = " + str(bs))
  print("learn rate = %0.4f " % lr)
  print("max epochs = " + str(me))
  print("optimizer = " + str(opt))

# -----------------------------------------------------------

def make_random_soln(dim, rnd):
  soln = rnd.randint(low=0, high=10, size=dim, dtype=int)
  return soln

def mutate(child_soln, mutate_prob, rnd):
  for k in range(len(child_soln)):
    q = rnd.random()  # [0.0, 1.0] 
    if q "lt" mutate_prob:
      child_soln[k] = rnd.randint(0, 10, size=1, dtype=int)
  return

# -----------------------------------------------------------

def main():
  # 0. get started
  print("\nPeople gender evolutionary hyperparam search ")
  T.manual_seed(1)
  np.random.seed(1)
  rnd = np.random.RandomState(1)  # controls initial pop

  # 1. create Dataset objects
  print("\nCreating People train and test Datasets ")

  train_file = ".\\Data\\people_train.txt"
  test_file = ".\\Data\\people_test.txt"

  train_ds = PeopleDataset(train_file)  # 200 rows
  test_ds = PeopleDataset(test_file)    # 40 rows

  # 2. create pop. of possible solutions (hyperparams)
  pop_size = 10
  dim = 6  # number of hyperparameters to examine
  max_gen = 12
  used = {}  # set of hyperparams that have been evaluated

  print("\nCreating population of " + \
    str(pop_size) + " random possible solns ")
  pop = []  # list of tuples, tuple is (np arr, float)

  for i in range(pop_size):  # each set of hyperparams
    soln = make_random_soln(dim, rnd)
    soln_key = "".join(str(x) for x in soln)
    while soln_key in used:
      soln = make_random_soln(dim, rnd)
      soln_key = "".join(str(x) for x in soln)
    used[soln_key] = 1

    trn_acc, tst_acc, err = evaluate(soln, train_ds, test_ds,
      verbose=True)
    pop.append((soln, err))
  pop = sorted(pop, key=lambda tup:tup[1])  # low err to hi

  # 3. find best set of initial hyperparams
  best_soln = pop[0][0].copy()
  best_err = pop[0][1]

  print("\nBest initial soln: ")
  print(best_soln)
  print("Best initial weighted error = %0.4f " % best_err)
  print("Best initial weighted acc = %0.4f " % (1-best_err))

# -----------------------------------------------------------

  # 4. evolve
  print("\nBegin evolution ")
  for gen in range(max_gen):
    print("\ngeneration = " + str(gen))
    # 4a. pick two parents
    first = rnd.randint(0, pop_size // 2)  # good one
    second = rnd.randint(pop_size // 2, pop_size)  # weaker
    flip = rnd.randint(2)  # 0 or 1
    if flip == 0:
      parent_idxs = (first, second)
    else:
      parent_idxs = (second, first)

    # 4b. create child
    child_soln = np.zeros(dim, dtype=int)
    i = parent_idxs[0]; j = parent_idxs[1]
    parent1 = pop[i][0]
    parent2 = pop[j][0]
    for k in range(0, dim // 2):  # left half
      child_soln[k] = parent1[k]
    for k in range(dim // 2, dim):  # right half
      child_soln[k] = parent2[k]

    # 4c. mutate child, avoid duplicate
    mutate_prob = 0.5
    mutate(child_soln, mutate_prob, rnd)
    child_soln_key = "".join(str(x) for x in child_soln)
    while child_soln_key in used:
      mutate(child_soln, mutate_prob, rnd)
      child_soln_key = "".join(str(x) for x in child_soln)
    used[child_soln_key] = 1

    trn_acc, tst_acc, child_err = evaluate(child_soln,
      train_ds, test_ds, verbose=True)
    print(child_soln)
    print("child err = %0.4f " % child_err)

    # 4d. is child new best soln?
    if child_err "lt" best_err: 
      print("New best soln found at gen " + str(gen))
      best_soln = child_soln.copy()
      best_err = child_err
    else:
      # print("No improvement at gen " + str(gen))
      pass

    # 4e. replace weak pop soln with child
    idx = rnd.randint(pop_size // 2, pop_size)
    pop[idx] = (child_soln, child_err)  # Tuple

    # 4f. sort solns from best (low error) to worst
    pop = sorted(pop, key=lambda tup:tup[1]) 

  print("\nEnd evolution ")

# -----------------------------------------------------------

  # 5. show best hyperparameters found
  print("\nFinal best soln found: ")
  print(best_soln)
  print("\nFinal best hyperparameters found: ")
  show_soln_to_hyperparams(best_soln)

  print("\nFinal best error = %0.4f " % best_err)
  print("Final best weighted acc = %0.4f " % (1-best_err))

  train_acc, test_acc, _ = evaluate(best_soln, train_ds, test_ds)
  print("\nFinal best train acc = %0.4f " % train_acc)
  print("Final best test acc = %0.4f " % test_acc)

  print("\nEnd evolutionary parameter search ")

# -----------------------------------------------------------

  # 5. TODO: save model

if __name__== "__main__":
  main()
Posted in PyTorch | Leave a comment

“Researchers Create a Computer Program that has Superhuman Skill at Stratego” on the Pure AI Web Site

I contributed to an article titled “Researchers Create a Computer Program that has Superhuman Skill at Stratego” on the May 2023 edition of the Pure AI web site. See https://pureai.com/articles/2023/05/01/stratego.aspx.

Researchers at Google have demonstrated a new algorithm that achieves superhuman performance on the Stratego board game. The technique is explained in the 2022 research paper “Mastering the Game of Stratego with Model-Free Multiagent Reinforcement Learning” by J. Perolat et al.

Stratego is extraordinarily difficult for computers. The two key challenges for a software program are that the game has imperfect information (opponent pieces are hidden, which allows bluffing) and sheer size — the number of possible positions vastly exceeds the number of atoms in the known universe.

I’m quoted in the article. McCaffrey commented, “The DeepNash program is impressive and represents a significant step forward on the path to mastering complex strategy games with imperfect information.”

McCaffrey added, “The key question however, is the extent to which the ideas used programs like DeepNash can generalize. Can the fundamental ideas be applied to useful applications, or are such systems strictly one-off applications that don’t readily generalize?”

McCaffrey concluded, “Either way, the Stratego program is an interesting new development in deep reinforcement learning.”



I played Stratego when I was growing up. Stratego is published by the Milton Bradley game company. As a young man, I also loved playing the MB Dogfight (World War I aircraft) and Battle-Cry (Civil War) war strategy games. I suspect that my young brain was predisposed to liking strategy games which in turn influenced my love of computer science.


Posted in Machine Learning | Leave a comment

Hyperparameter Search Using Evolutionary Optimization for a PyTorch Multi-Class Classifier

Neural networks can create powerful prediction systems. But NNs have a lot of hyperparameters that must be specified and the values of the hyperparameters have a huge effect on the effectiveness of a NN. Architecture hyperparameters include number of hidden layers, number of nodes in each hidden layer, the presence or absence of dropout layers (and the associated dropout rates), the hidden layer activation function, the weight and bias initialization algorithm, and so on. Training hyperparameters include batch size, optimization algorithm, learning rate, and so on.

The number of combinations of hyperparameter values is, quite literally, infinite. When a machine learning engineer develops a neural system, he starts by manually experimenting with hyperparameter values, using experience and intuition as a guide. But in a production environment, searching for good hyperparameter values is often done programmatically. The two most common techniques are grid search and random search.

But I’ve always been a fan of using evolutionary optimization to search for good hyperparameter values. I put together a demo. I used one of my standard synthetic datasets where the goal is to predict a person’s political leaning (conservative = 0, moderate = 1, liberal = 2) based on sex (male = -1, female = +1), age (divided by 100), State (Michigan = 100, Nebraska = 010, Oklahoma =001), and income (divided by $100,000).

It’s not feasible to parameterize everything in a NN. For architecture, I set the number of hidden layers to 2 and parameterized the number of nodes (2 to 38), and the activation to tanh() or relu(). For training, I parameterized batch size, learning rate, maximum epochs, and optimizer (SGD or ReLU).

I encoded a set of hyperparameter values as an integer array with 6 cells, where each cell has value 0 to 9. For example, in my demo, a possible set of encoded hyperparameters looks like: [5 1 9 3 9 7]. This represents 22 hidden nodes, tanh() activation, batch size = 20, learning rate = 0.01, 1000 max epochs, and Adam optimization.

In very high level pseudo-code, evolutionary optimization is:

create a population of n random possible solutions
loop g generations
  select 2 parents from population
  create a new child solution from parents
  mutate child solution
  check if child is new best solution
  replace a weak soln in population with child
end-loop
return best solution found

There are many design choices for evolutionary optimization, such as how to select 2 parents, how to combine parents to create a child, how to mutate, and so on. One obvious enhancement is to maintain a Dictionary object of potential solutions/hyperparameters that have been used, in order to prevent duplicated effort. Another easy design option to explore is the child creation mechanism by selecting a random crossover index instead of always using the middle index.

Hyperparameter search using evolutionary optimization isn’t magic, but for the type of work I do, EO is often highly effective. And super interesting!



I’ve always been fascinated by the history of the evolution of submarines. As a young man I built many scale models and submarines were one of my favorite things to draw.

Left: The USS Plunger (SS-2, “submersible ship #2”) was commissioned in 1903. It was arguably the first (barely) practical, modern submarine. It was 64 feet long, had a crew of 7, a single torpedo tube, and could submerge to approximately 75 feet.

Right: The USS Plunger (SSN-595) was commissioned in 1990. It was a nuclear powered attack submarine with fearsome capabilities. It was 278 feet long, had a crew of 100, four torpedo tubes, and could submerge to approximately 1,300 feet.


Demo code below. Replace “lt” and “lte” with Boolean operator symbols. Not thoroughly tested; do not use for production. Training and test data at https://jamesmccaffrey.wordpress.com/2022/09/01/multi-class-classification-using-pytorch-1-12-1-on-windows-10-11/.

# people_evo_hyperparameter.py

# PyTorch 2.0.1-CPU Anaconda3-2022.10  Python 3.9.13
# Windows 10/11 

import numpy as np
import torch as T
device = T.device('cpu')  # apply to Tensor or Module

# -----------------------------------------------------------

class PeopleDataset(T.utils.data.Dataset):
  # sex  age    state    income   politics
  # -1   0.27   0  1  0   0.7610   2
  # +1   0.19   0  0  1   0.6550   0
  # sex: -1 = male, +1 = female
  # state: michigan, nebraska, oklahoma
  # politics: conservative, moderate, liberal

  def __init__(self, src_file):
    all_xy = np.loadtxt(src_file, usecols=range(0,7),
      delimiter="\t", comments="#", dtype=np.float32)
    tmp_x = all_xy[:,0:6]   # cols [0,6) = [0,5]
    tmp_y = all_xy[:,6]     # 1-D

    self.x_data = T.tensor(tmp_x, 
      dtype=T.float32).to(device)
    self.y_data = T.tensor(tmp_y,
      dtype=T.int64).to(device)  # 1-D

  def __len__(self):
    return len(self.x_data)

  def __getitem__(self, idx):
    preds = self.x_data[idx]
    trgts = self.y_data[idx] 
    return preds, trgts  # as a Tuple

# -----------------------------------------------------------

class Net(T.nn.Module):
  def __init__(self, n_hid, activ='tanh'):
    super(Net, self).__init__()
    self.hid1 = T.nn.Linear(6, n_hid)  # 6-(nh-nh)-3
    self.hid2 = T.nn.Linear(n_hid, n_hid)
    self.oupt = T.nn.Linear(n_hid, 3)

    if activ == 'tanh':
      self.activ = T.nn.Tanh()
    elif activ == 'relu':
      self.activ = T.nn.ReLU()

    # use default weight init

  def forward(self, x):
    z = self.activ(self.hid1(x))
    z = self.activ(self.hid2(z)) 
    z = T.log_softmax(self.oupt(z), dim=1)  # NLLLoss() 
    return z

# -----------------------------------------------------------

def accuracy_quick(model, dataset):
  # assumes model.eval()
  X = dataset[0:len(dataset)][0]
  Y = dataset[0:len(dataset)][1]
  with T.no_grad():
    oupt = model(X)  #  [40,3]  logits
  arg_maxs = T.argmax(oupt, dim=1)  # argmax() is new
  num_correct = T.sum(Y==arg_maxs)
  acc = (num_correct * 1.0 / len(dataset))
  return acc.item()

# -----------------------------------------------------------

def train(net, ds, bs, lr, me, opt='sgd', verbose=False):
  # dataset, bat_size, lrn_rate, max_epochs, optimizer
  v = verbose
  train_ldr = T.utils.data.DataLoader(ds, batch_size=bs,
    shuffle=True)
  loss_func = T.nn.NLLLoss()  # log_softmax() activation
  if opt == 'sgd':
    optimizer = T.optim.SGD(net.parameters(), lr=lr)
  elif opt == 'adam':
    optimizer = T.optim.Adam(net.parameters(), lr=lr)  

  if v: print("\nStarting training ")
  le = me // 4  # log interval: 4 log prints
  for epoch in range(0, me):
    epoch_loss = 0.0  # for one full epoch
    for (batch_idx, batch) in enumerate(train_ldr):
      X = batch[0]  # inputs
      Y = batch[1]  # correct class/label/politics

      optimizer.zero_grad()
      oupt = net(X)
      loss_val = loss_func(oupt, Y)  # a tensor
      epoch_loss += loss_val.item()  # accumulate
      loss_val.backward()
      optimizer.step()

    if v:
      if epoch % le == 0:
        print("epoch = %5d  |  loss = %10.4f" % \
          (epoch, epoch_loss)) 
  if v: print("Done ") 

# -----------------------------------------------------------

def evaluate(soln, trn_ds, tst_ds, verbose=False):
  # [n_hid, activ, bs, lr, me, opt]
  #   [0]    [1]   [2] [3] [4] [5]

  v = verbose

  # map soln cell values to hyperparameter values
  n_hid = (soln[0] * 4) + 2  # 2, 6, . . 38

  if soln[1] "lte" 4: activ = 'tanh'
  else: activ = 'relu'

  bs = int( (2 * soln[2]) + 2 )  # 2, 4, . . 20

  rates = [0.001, 0.005, 0.008, 0.01, 0.02, 0.03, 0.05,
    0.08, 0.10, 0.12]
  lr = rates[soln[3]]

  me = (100 * soln[4]) + 100  # 100, 200, . . 1000

  if soln[5] "lte" 4: opt = 'sgd'
  else: opt = 'adam'

  T.manual_seed(1)  # prepare
  np.random.seed(1)

  net = Net(n_hid, activ).to(device)  # create

  net.train()
  train(net, trn_ds, bs, lr, me, opt, verbose)  # train

  net.eval()
  acc_train = accuracy_quick(net, trn_ds)  # evaluate
  acc_test = accuracy_quick(net, tst_ds) 
  acc_avg = (acc_train + acc_test) / 2
  error = 1.0 - acc_avg  # [0.0, 1.0]
  if v: print("train acc = %0.4f " % acc_train)
  if v: print("test_acc = %0.4f " % acc_test)
  return error

# -----------------------------------------------------------

def show_soln_to_hyperparams(soln):
  n_hid = (soln[0] * 4) + 2
  if soln[1] "lte" 4: activ = 'tanh'
  else: activ = 'relu'
  bs = int( (2 * soln[2]) + 2 )  # 2, 4, . . 20
  rates = [0.001, 0.005, 0.008, 0.01, 0.02, 0.03, 0.05,
    0.08, 0.10, 0.12]
  lr = rates[soln[3]]
  me = (100 * soln[4]) + 100  # 100, 200, . . 1000
  if soln[5] "lte" 4: opt = 'sgd'
  else: opt = 'adam'

  print("num hidden nodes = " + str(n_hid))
  print("hidden activation = " + str(activ))
  print("batch size = " + str(bs))
  print("learn rate = %0.4f " % lr)
  print("max epochs = " + str(me))
  print("optimizer = " + str(opt))

# -----------------------------------------------------------

def main():
  # 0. get started
  print("\nBegin People politics evolutionary parameter search ")
  T.manual_seed(1)  # is reset in evaluate()
  np.random.seed(1)  
  rnd = np.random.RandomState(1)  # controls initial population
  
  # 1. create Dataset objects
  print("\nCreating People Datasets ")

  train_file = ".\\Data\\people_train.txt"
  train_ds = PeopleDataset(train_file)  # 200 rows

  test_file = ".\\Data\\people_test.txt"
  test_ds = PeopleDataset(test_file)    # 40 rows

  # 2. create population of possible solutions (hyperparams)
  pop_size = 8
  dim = 6
  print("\nCreating population of " + \
    str(pop_size) + " possible solns ")
  pop = []  # list of tuples, tuple is (np arr, float)
  for i in range(pop_size):  # soln-err / set of hyperparams
    soln = rnd.randint(low=0, high=10, size=dim, dtype=int)
    err = evaluate(soln, train_ds, test_ds, verbose=False)
    pop.append((soln, err))
  pop = sorted(pop, key=lambda tup:tup[1])  # low to hi

  # 3. find best set of hyperparams
  best_soln = pop[0][0].copy()
  best_err = pop[0][1]

  print("\nBest initial soln: ")
  print(best_soln)
  print("Best initial error = %0.4f " % best_err)
  print("Best initial avg acc = %0.4f " % (1-best_err))

# -----------------------------------------------------------

  # 4. evolve
  print("\nBegin evolution ")
  max_gen = 7
  for gen in range(max_gen):
    print("\ngeneration = " + str(gen))
    # 4a. pick two parents and make a child
    first = rnd.randint(0, pop_size // 2)  # good one
    second = rnd.randint(pop_size // 2, pop_size)  # weaker
    flip = rnd.randint(2)  # 0 or 1
    if flip == 0:
      parent_idxs = (first, second)
    else:
      parent_idxs = (second, first)

    # 4b. create child
    child_soln = np.zeros(dim, dtype=int)
    i = parent_idxs[0]; j = parent_idxs[1]
    parent1 = pop[i][0]
    parent2 = pop[j][0]
    for k in range(0, dim // 2):  # left half
      child_soln[k] = parent1[k]
    for k in range(dim // 2, dim):  # right half
      child_soln[k] = parent2[k]

    # 4c. mutate child
    mutate_prob = 0.5
    for k in range(dim):
      q = rnd.random()  # [0.0, 1.0] 
      if q "lt" mutate_prob:
        child_soln[k] = rnd.randint(0, 10, size=1,
          dtype=int)
    child_err = evaluate(child_soln, train_ds, test_ds,
      verbose=True)
    print(child_soln)
    print("%0.4f " % child_err)

# -----------------------------------------------------------

    # 4d. is child new best soln?
    if child_err "lt" best_err: 
      print("New best soln found at gen " + str(gen))
      best_soln = child_soln.copy()
      best_err = child_err
    else:
      # print("No improvement at gen " + str(gen))
      pass

    # 4e. replace weak pop soln with child
    idx = rnd.randint(pop_size // 2, pop_size)
    pop[idx] = (child_soln, child_err)  # Tuple

    # 4f. sort solns from best to worst
    pop = sorted(pop, key=lambda tup:tup[1]) 

  print("\nEnd evolution ")

# -----------------------------------------------------------

  # 5. show best hyperparameters found
  # best_soln = pop[0][0].copy()
  # best_err = pop[0][1]

  print("\nFinal best soln found: ")
  print(best_soln)
  print("\nFinal best hyperparameters found: ")
  show_soln_to_hyperparams(best_soln)

  print("\nFinal best error = %0.4f " % best_err)
  print("Final best avg acc = %0.4f " % (1-best_err))

  print("\nEnd evolutionary parameter search ")

if __name__ == "__main__":
  main()
Posted in PyTorch | Leave a comment

Understanding the Name of the scikit-learn Python Machine Learning Library

I was presenting a talk recently. One of my examples used the scikit-learn code library for machine learning. One of the attendees asked about the difference between “scikit” and “scikit-learn”.

Briefly, scikit-learn is a Python language code library for machine learning. The scikit-learn library is heavily dependent on the Python NumPy (numerical Python) and SciPy (scientific Python). There are other libraries based on SciPy such as scikit-sound and scikit-image, but scikit-learn is by far the most widely used and so scikit-learn is often referred to as just scikit.

Years ago, the developers of SciPy realized that open source contributors would create code libraries that use SciPy, so the SciPy people suggested that related libraries be named scikit-xyz where xyz describes the purpose of the library. There are many hundreds of Python libraries that use SciPy but most of these libraries just created their own name and did not use the scikit-xyz name format.

There is a Web site that used to have an index of all the scikit-xyz libraries, but that Web site (http://scikits.appspot.com/about) appears to be abandoned. Most Python libraries are maintained by http://www.pypi.org (Python package index) site.

Here are a few of the scientific Python libraries that actually use the scikit-xyz name format. I’ve included the current version number.

scikit-hep 5.1.1 - high energy particle physics tools.

scikit-gstat 1.0.10 - geostatistical expansion.

scikit-sound 0.2.11 - working with sound signals.

scikit-rf 0.26.0 - radio frequency / microwave engineering.

scikit-network 0.29.0 - analysis of large graphs.

scikit-decide 0.9.6 - framework for reinforcement learning.

scikit-survival 0.20.0 - survival analysis with scikit-learn.

scikit-mol 0.1.2 - scikit-learn for molecular vectorization.

scikit-na 0.1.1 - missing data (NA) analysis. 

scikit-image 0.20.0 - image processing.

There’s no moral to this blog post. But naming anything is important for communication within a scientific field. On the other hand, there are some situations where naming takes on a life of its own. One of my problems with some of the psychology classes I took in college was that there was a tremendous amount of vocabulary that was just slapping labels on things without any real purpose. Fortunately, machine learning doesn’t suffer from this naming-without-purpose except in very rare scenarios.



Three restaurant names that may not convey what the owners intended.


Posted in Scikit | Leave a comment

Using PairwiseKernel in scikit GaussianProcessRegressor for Polynomial Kernel

Briefly, I put together a demo of a scikit Gaussian process regression model that uses the polynomial kernel from the sklearn.metrics.pairwise module via the PairwiseKernel class. Whew, what a mouthful.

A Gaussian process regressor model predicts a single numeric value using a very complex mathematical algorithm. To use the scikit GaussianProcessRegressor module, you must specify a kernel function, for example:

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

print("Creating GPR with radial basis function kernel ")
krnl = RBF(2.0)  # this 2.0 parameter will be optimized in fit()
model = GaussianProcessRegressor(kernel=krnl, \
  normalize_y=True, random_state=0, alpha=0.05)

print("Training model ")
model.fit(train_X, train_y)

The scikit documentation for Gaussian process kernels is very weak (like virtually non-existent), but the source code at https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/gaussian_process/kernels.py indicates there are the following six built-in kernels that can be used with GPR: ConstantKernel, RBF, DotProduct, WhiteKernel, ExpSineSquared, RationalQuadratic. Note: These built-in kernels can be combined using addition or multiplication.

When I first looked at GPR kernels, I was surprised that I didn’t see a polynomial kernel because it’s one of the most common. But after a bit of poking around, I discovered that scikit has a PairwiseKernel class that is a wrapper for the large collection of kernels in the sklearn.metrics.pairwise module. These include: “linear”, “additive_chi2”, “chi2”, “poly”, “polynomial”, “rbf”, “laplacian”, “sigmoid”, “cosine”.

So I realized that the intent of the design is to pass a metrics.pairwise kernel function such as ‘poly’ to the GPR PairwiseKernel class. But there were no examples. After a bit of futzing around . . . well, after a lot of futzing around . . . I put together a demo of a scikit Gaussian process regressor that uses the polynomial kernel from the sklearn.metrics.pairwise module via the PairwiseKernel class:

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import PairwiseKernel

# sklearn.metrics.pairwise.polynomial_kernel(X, Y=None,
#  degree=3, gamma=None, coef0=1)

krnl = PairwiseKernel(metric='poly', \
  pairwise_kernels_kwargs={'degree':4})
model = GaussianProcessRegressor(kernel=krnl, normalize_y=True,
  random_state=0, alpha=0.05)

It took me quite a bit of time to figure out how to pass the degree=4 argument to PairwiseKernel() using the pairwise_kernels_kwargs parameter.

For my demo, I used the Boston Area House Prices dataset.

This was an interesting experiment and I learned a lot of new tricks and techniques related to kernel functions and scikit library design.



In Gaussian process regression, a kernel function is used to estimate the covariance matrix. More generally, a kernel function is a type of similarity/dissimilarity function. The connection between the two ways of thinking about kernel functions is not obvious.

Biological twins are similar. Identical twins are very similar. Conjoined twins are very, very similar. There are more movies that feature conjoined twins than you might guess.

Left: In the comedy “Stuck on You” (2003), conjoined twins Bob (actor Greg Kinnear) and Walt (actor Matt Damon) Tenor own a diner in New England. They go to Hollywood so Bob can pursue his dream of being an actor. The movie is hilarious and it’s one of my all-time favorites. Instead of virtue signaling about diversity and inclusion, it uses a simple “treat others as you’d wish to be treated” approach. Grade = solid A.

Center: In the fantasy-drama “Big Fish” (2003), Edward Bloom (actor Ewan McGregor) tells tall tales to his son. In one such tale, Bloom meets conjoined sisters Ping and Jing during the Korean War. The movie examines loyalty, the meaning of truth, friendship, and father-son relationships. Grade = A-.

Right: In the 1995 French-American fantasy movie “The City of Lost Children” (aka La Cite des Enfants Perdus), evil conjoined sisters called The Octopus kidnap children so their dreams can be harvested. Like many thing French, this movie had a lot of style and is fun to look at, but there was no logic and the plot made little sense. Grade = C+.


Demo code. Replace lt” with Boolean operator symbol. See my “Gaussian Process Regression on the Boston Housing Dataset Using the scikit Library” on this site for the data.

# boston_gauss_process.py
# Gaussian process regression

# Anaconda3-2022.10  Python 3.9.13
# scikit 1.0.2
# Windows 10/11 

import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import PairwiseKernel

# -----------------------------------------------------------

def accuracy(model, data_X, data_y, pct_close):
  # correct within pct of true income
  n_correct = 0; n_wrong = 0

  for i in range(len(data_X)):
    X = data_X[i].reshape(1, -1)  # one-item batch
    y = data_y[i]
    pred = model.predict(X)       # predicted income

    if np.abs(pred - y) "lt" np.abs(pct_close * y):
      n_correct += 1
    else:
      n_wrong += 1
  acc = (n_correct * 1.0) / (n_correct + n_wrong)
  return acc

# -----------------------------------------------------------

def main():
  # 0. prepare
  print("\nBegin scikit Gaussian process regression ")
  print("Predict Boston area house median price ")
  np.random.seed(1)
  np.set_printoptions(precision=4, suppress=True)

# -----------------------------------------------------------

  # 1. load data
  print("\nLoading train and test data ")
  train_file = ".\\Data\\boston_train.txt"
  train_X = np.loadtxt(train_file, delimiter="\t", 
    usecols=[0,1,2,3,4,5,6,7,8,9,10,11,12],
    comments="#", dtype=np.float32)
  train_y = np.loadtxt(train_file, delimiter="\t", 
    usecols=13, comments="#", dtype=np.float32) 

  test_file = ".\\Data\\boston_test.txt"
  test_X = np.loadtxt(test_file, delimiter="\t",
    usecols=[0,1,2,3,4,5,6,7,8,9,10,11,12],
    comments="#", dtype=np.float32)
  test_y = np.loadtxt(test_file, delimiter="\t",
    usecols=13, comments="#", dtype=np.float32) 
  print("Done ")

  print("\nData: ")
  print(train_X[0:4][:])
  print(". . .")
  print("\nActual prices: ")
  print(train_y[0:4])
  print(". . .")

# -----------------------------------------------------------

  # 2. create and train GPR model
  print("\nCreating GPR model with poly(4) kernel ")

  # sklearn.metrics.pairwise.polynomial_kernel(X, Y=None,
  #  degree=3, gamma=None, coef0=1)

  krnl = PairwiseKernel(metric='poly', \
    pairwise_kernels_kwargs={'degree':4})
  model = GaussianProcessRegressor(kernel=krnl,
    normalize_y=True, random_state=0, alpha=0.05)

  print("\nTraining model ")
  model.fit(train_X, train_y)
  print("Done ")

  # 3. compute model accuracy
  print("\nComputing accuracy (within 0.15 of true price) ")
  acc_train = accuracy(model, train_X, train_y, 0.15)
  print("\nAccuracy on train data = %0.4f " % acc_train)
  acc_test = accuracy(model, test_X, test_y, 0.15)
  print("Accuracy on test data = %0.4f " % acc_test)

  print("\nEnd GPR demo ")

if __name__ == "__main__":
  main()
Posted in Scikit | Leave a comment

Mapping an Integer Vector to PyTorch Hyperparameter Values

I’ve been investigating the idea of using evolutionary optimization to find good values for the hyperparameters of a PyTorch neural network. Evolutionary optimization sets up a population of possible solutions and then:

create poulation of possible solutions
loop several times
  pick two parent solutions
  use parents to create child solution
  mutate child slightly
  evaluate child
  replace a weak solution in pop with child
end-loop
return best possible solution found

There are many design options, including how to encode a possible solution. In my experiments, I encode a possible solution as an integer array with six cells, where each cell holds a value between 0 and 9. Each integer value maps to a hyperparameter value: number hidden layer nodes, hidden layer activation function, training batch size, learning rate, max epochs, training optimizer.

The key mapping code looks like:

def evaluate(soln, trn_ds, tst_ds):
  # [n_hid, activ, bs, lr, me, opt]
  #   [0]    [1]   [2] [3] [4] [5]

  # map soln cell values to hyperparameter values
  if soln[0] "lte" 2: n_hid = 8
  elif soln[0] "lte" 5: n_hid = 10
  elif soln[0] "lte" 9: n_hid = 14
  else: n_hid = 20

  if soln[1] "lte" 4: activ = 'tanh'
  else: activ = 'relu'

  bs = (2 * soln[2]) + 2

  if soln[3] "lte"= 2: lr = 0.005
  elif soln[3] "lte" 5: lr = 0.01
  elif soln[3] "lte" 9: lr = 0.05
  else: lr = 0.10

  me = (100 * soln[4]) + 100

  if soln[5] "lte" 4: opt = 'sgd'
  else: opt = 'adam'

  T.manual_seed(1)  # prepare
  np.random.seed(1)

  net = Net(n_hid, activ).to(device)  # create

  net.train()
  train(net, trn_ds, bs, lr, me, opt)  # train

  net.eval()
  acc_train = accuracy_quick(net, trn_ds)  # evaluate
  acc_test = accuracy_quick(net, tst_ds) 
  return (acc_train + acc_test) / 2

In my demo, the number of hidden nodes can be 8, 10, or 14. The activation can be tanh() or relu(). And so on. There are a total of 3 * 2 * 10 * 3 * 10 * 2 = 3,600 possible combinations of hyperparameter values so a grid search might be feasible but for a non-demo problem, grid search likely wouldn’t be feasible.

I tested the demo like so:

  parent1 = [1, 0, 8,   2, 2, 5]  
  print("parent1 soln: ")
  print(parent1) 
  f = evaluate(parent1, train_ds, test_ds)
  print("fitness = %0.4f " % f)

  parent2 = [4, 4, 6,   8, 5, 3]
  print("parent2 soln: ")
  print(parent2) 
  f = evaluate(parent2, train_ds, test_ds)
  print("fitness = %0.4f " % f)

  child = [1, 0, 8,   8, 5, 3]
  print("child soln: ")
  print(child) 
  f = evaluate(child, train_ds, test_ds)
  print("fitness = %0.4f " % f)

I hard-coded the child solution by taking the left half of parent1 and the right half of parent2.

My design doesn’t feel entirely “right”. Sensing when a design can be improved is something experienced engineers can do but new engineers cannot. But I’m one step closer to implementing a complete system for hyperparameter tuning using evolutionary optimization.



I visited London recently. The city is incredibly complicated to navigate, even with maps and sophisticated mobile device applications. I’ve always found maps to be beautiful works of art. Here’s a map of London from about 1650 AD when the city already had about 400,000 residents. By an incredible coincidence, my good friend Ed Koolish (we were college roommates) was in London the exact same dates that I was there so I got to visit with him.

Notice the original London Bridge spanning the Thames River. It was built in 1209 AD.

Click to enlarge.


Demo program below. Replace “lte” (less-than-or-equal”) with Boolean operator symbols. The data can be found at https://jamesmccaffrey.wordpress.com/2022/09/01/multi-class-classification-using-pytorch-1-12-1-on-windows-10-11/.

# people_politics_encoded_2.py
# predict politics type from sex, age, state, income
# PyTorch 2.0.1-CPU Anaconda3-2022.10  Python 3.9.13
# Windows 10/11 

# experiemnt for hyperparameter evolutionary optimization
# map integer vector to hyperparameter values

import numpy as np
import torch as T
device = T.device('cpu')  # apply to Tensor or Module

# -----------------------------------------------------------

class PeopleDataset(T.utils.data.Dataset):
  # sex  age    state    income   politics
  # -1   0.27   0  1  0   0.7610   2
  # +1   0.19   0  0  1   0.6550   0
  # sex: -1 = male, +1 = female
  # state: michigan, nebraska, oklahoma
  # politics: conservative, moderate, liberal

  def __init__(self, src_file):
    all_xy = np.loadtxt(src_file, usecols=range(0,7),
      delimiter="\t", comments="#", dtype=np.float32)
    tmp_x = all_xy[:,0:6]   # cols [0,6) = [0,5]
    tmp_y = all_xy[:,6]     # 1-D

    self.x_data = T.tensor(tmp_x, 
      dtype=T.float32).to(device)
    self.y_data = T.tensor(tmp_y,
      dtype=T.int64).to(device)  # 1-D

  def __len__(self):
    return len(self.x_data)

  def __getitem__(self, idx):
    preds = self.x_data[idx]
    trgts = self.y_data[idx] 
    return preds, trgts  # as a Tuple

# -----------------------------------------------------------

class Net(T.nn.Module):
  def __init__(self, n_hid, activ='tanh'):
    super(Net, self).__init__()
    self.hid1 = T.nn.Linear(6, n_hid)  # 6-(nh-nh)-3
    self.hid2 = T.nn.Linear(n_hid, n_hid)
    self.oupt = T.nn.Linear(n_hid, 3)

    if activ == 'tanh':
      self.activ = T.nn.Tanh()
    elif activ == 'relu':
      self.activ = T.nn.ReLU()

    # use default weight init

  def forward(self, x):
    z = self.activ(self.hid1(x))
    z = self.activ(self.hid2(z)) 
    z = T.log_softmax(self.oupt(z), dim=1)  # NLLLoss() 
    return z

# -----------------------------------------------------------

def accuracy_quick(model, dataset):
  # assumes model.eval()
  X = dataset[0:len(dataset)][0]
  Y = dataset[0:len(dataset)][1]
  with T.no_grad():
    oupt = model(X)  #  [40,3]  logits
  arg_maxs = T.argmax(oupt, dim=1)  # argmax() is new
  num_correct = T.sum(Y==arg_maxs)
  acc = (num_correct * 1.0 / len(dataset))
  return acc.item()

# -----------------------------------------------------------

def train(net, ds, bs, lr, me, opt='sgd'):
  # dataset, bat_size, lrn_rate, max_epochs, optimizer
  train_ldr = T.utils.data.DataLoader(ds, batch_size=bs,
    shuffle=True)
  loss_func = T.nn.NLLLoss()
  if opt == 'sgd':
    optimizer = T.optim.SGD(net.parameters(), lr=lr)
  elif opt == 'adam':
    optimizer = T.optim.Adam(net.parameters(), lr=lr)  

  print("\nStarting training ")
  le = me // 5  # log interval: 5 log prints
  for epoch in range(0, me):
    epoch_loss = 0.0  # for one full epoch
    for (batch_idx, batch) in enumerate(train_ldr):
     X = batch[0]  # inputs
      Y = batch[1]  # correct class/label/politics

      optimizer.zero_grad()
      oupt = net(X)
      loss_val = loss_func(oupt, Y)  # a tensor
      epoch_loss += loss_val.item()  # accumulate
      loss_val.backward()
      optimizer.step()

    if epoch % le == 0:
      print("epoch = %5d  |  loss = %10.4f" % \
        (epoch, epoch_loss)) 
  print("Done ") 

# -----------------------------------------------------------

def evaluate(soln, trn_ds, tst_ds):
  # [n_hid, activ, bs, lr, me, opt]
  #   [0]    [1]   [2] [3] [4] [5]

  # map soln cell values to hyperparameter values
  if soln[0] "lte" 2: n_hid = 8
  elif soln[0] "lte" 5: n_hid = 10
  elif soln[0] "lte" 9: n_hid = 14
  else: n_hid = 20

  if soln[1] "lte" 4: activ = 'tanh'
  else: activ = 'relu'

  bs = (2 * soln[2]) + 2

  if soln[3] "lte" 2: lr = 0.005
  elif soln[3] "lte" 5: lr = 0.01
  elif soln[3] "lte" 9: lr = 0.05
  else: lr = 0.10

  me = (100 * soln[4]) + 100

  if soln[5] "lte" 4: opt = 'sgd'
  else: opt = 'adam'

  T.manual_seed(1)  # prepare
  np.random.seed(1)

  net = Net(n_hid, activ).to(device)  # create

  net.train()
  train(net, trn_ds, bs, lr, me, opt)  # train

  net.eval()
  acc_train = accuracy_quick(net, trn_ds)  # evaluate
  acc_test = accuracy_quick(net, tst_ds) 
  return (acc_train + acc_test) / 2

# -----------------------------------------------------------

def main():
  # 0. get started
  print("\nBegin People predict politics type ")
  
  # 1. create Dataset objects
  print("\nCreating People Datasets ")

  train_file = ".\\Data\\people_train.txt"
  train_ds = PeopleDataset(train_file)  # 200 rows

  test_file = ".\\Data\\people_test.txt"
  test_ds = PeopleDataset(test_file)    # 40 rows

  # 2. compute fitness using soln vector of ints
  # n_hid, activ, bs, lr, me, opt
  parent1 = [1, 0, 8,   2, 2, 5]  
  print("\nparent1 soln: ")
  print(parent1) 
  f = evaluate(parent1, train_ds, test_ds)
  print("fitness = %0.4f " % f)

  parent2 = [4, 4, 6,   8, 5, 3]
  print("\nparent2 soln: ")
  print(parent2) 
  f = evaluate(parent2, train_ds, test_ds)
  print("fitness = %0.4f " % f)

  child = [1, 0, 8,   8, 5, 3]
  print("\nchild soln: ")
  print(child) 
  f = evaluate(child, train_ds, test_ds)
  print("fitness = %0.4f " % f)

  print("\nEnd People politics parameter encoding demo ")

if __name__ == "__main__":
  main()
Posted in PyTorch | 1 Comment