There are a lot of good datasets for experimenting with machine learning classification. But there are very few datasets for ML regression experiments. Creating a synthetic dataset for regression is relatively easy and effective.

In high-level pseudo-code:

create PyTorch network with random weights
loop n_items times
  generate random input X
  compute y = net(X)
  add a bit of random noise to y
  write X values and y to file
end-loop

Here’s a program that generates 200 training items and 40 test items using a PyTorch neural network. Each input item has 6 predictor values, where each value is between -1.0 and +1.0. The target output value is between 0.0 and 1.0.

The devil is in the details. The resulting data looks like:

-0.1660 0.4406 -0.9998 -0.3953 -0.7065 -0.8153 0.7022 -0.2065 0.0776 -0.1616 0.3704 -0.5911 0.7562 0.5666 -0.9452 0.3409 -0.1654 0.1174 -0.7192 -0.6038 0.8186 0.7528 0.7892 -0.8299 -0.9219 -0.6603 0.7563 0.3687 . . .

# make_synthetic.py
# create synthetic train and test datasets for regression
# PyTorch 2.0.1-CPU Anaconda3-2022.10 Python 3.9.13
# Windows 10/11

import numpy as np
import torch as T  # non-standard alias

device = T.device('cpu')  # apply to Tensor or Module

# -----------------------------------------------------------

class Net(T.nn.Module):
  """Randomly-weighted n_in-(10)-1 generator network.

  The network is never trained: its random weights define an
  arbitrary smooth mapping from predictors in [-1, +1] to a
  sigmoid target in (0, 1), which is what makes the generated
  data usable for regression experiments.
  """

  def __init__(self, n_in):
    super(Net, self).__init__()
    self.hid1 = T.nn.Linear(n_in, 10)  # n-(10)-1
    self.oupt = T.nn.Linear(10, 1)

    # uniform weights in [-0.80, +0.80] keep the tanh/sigmoid
    # activations away from hard saturation so targets vary
    lim = 0.80
    T.nn.init.uniform_(self.hid1.weight, -lim, lim)
    T.nn.init.uniform_(self.hid1.bias, -lim, lim)
    T.nn.init.uniform_(self.oupt.weight, -lim, lim)
    T.nn.init.uniform_(self.oupt.bias, -lim, lim)

  def forward(self, x):
    z = T.tanh(self.hid1(x))
    z = T.sigmoid(self.oupt(z))  # oupt in [0.0, 1.0]
    return z

# -----------------------------------------------------------

def create_data_file(net, n_in, fn, n_items):
  """Write n_items tab-separated rows to text file fn.

  Each row holds n_in random predictors in [-1.0, +1.0]
  followed by the noisy network output, all formatted "%0.4f".
  """
  x_lo = -1.0; x_hi = 1.0
  with open(fn, "w") as f:  # 'with' guarantees file is closed
    for i in range(n_items):
      s = ""
      X = (x_hi - x_lo) * np.random.random(size=(1,n_in)) + x_lo
      for j in range(n_in):
        s += ("%0.4f" % X[0][j]) + "\t"
      X = T.tensor(X, dtype=T.float32)
      with T.no_grad():
        y = net(X).item()
      y += np.random.normal(loc=0.0, scale=0.01)  # could be neg
      if y < 0.0:
        # noise pushed the target below 0.0 -- replace with a
        # small positive value so targets stay in [0.0, 1.0]
        y = 0.01 * np.random.random()
      s += ("%0.4f" % y) + "\n"
      f.write(s)

# -----------------------------------------------------------

def main():
  # 0. get started
  print("\nCreating synthetic datasets for regression ")
  np.random.seed(1)
  T.manual_seed(1)

  # 1. create neural generator model
  n_in = 6
  print("\nCreating 6-(10)-1 regression model ")
  net = Net(n_in).to(device)

  # 2. use model to create data
  print("\nCreating data files ")
  create_data_file(net, n_in, ".\\synthetic_train.txt", 200)
  create_data_file(net, n_in, ".\\synthetic_test.txt", 40)

  print("\nEnd create synthetic data ")

# -----------------------------------------------------------

if __name__=="__main__":
  main()

The synthetic data generation program can be easily modified in several ways. To test the data, I wrote a PyTorch regression network. The network created a good model quickly and scored 97.50% accuracy on the 40-item test data, where a correct prediction is one that’s within 10% of the true target value.

Good fun.

*“Synthetic Men of Mars” (1939) by Edgar Rice Burroughs is the ninth book in the Mars series. In this novel, John Carter and Vor Daj (a member of Carter’s personal guard) seek scientist Ras Thavas who is the only one who can save queen Dejah Thoris. But Thavas has been experimenting with clones and those experiments have not gone well.*

*The Mars series had a huge influence on nearly every modern science fiction author. Here are three different covers. Left: By artist Robert Abbett. Center: By artist Gino D’Achille. Right: Unattributed but I believe this might be the work of artist Richard Clifton-Dey.*

Demo program.

# synthetic_dnn.py
# synthetic dataset regression
# PyTorch 2.0.1-CPU Anaconda3-2022.10 Python 3.9.13
# Windows 10/11

import numpy as np
import torch as T  # non-standard alias

device = T.device('cpu')  # apply to Tensor or Module

# -----------------------------------------------------------

class SynthDataset(T.utils.data.Dataset):
  """Tab-delimited rows: six float32 predictors then one target."""

  def __init__(self, src_file):
    tmp_x = np.loadtxt(src_file, delimiter="\t",
      usecols=[0,1,2,3,4,5], dtype=np.float32)
    tmp_y = np.loadtxt(src_file, usecols=6,
      delimiter="\t", dtype=np.float32)
    tmp_y = tmp_y.reshape(-1,1)  # 2D required
    self.x_data = T.tensor(tmp_x, dtype=T.float32).to(device)
    self.y_data = T.tensor(tmp_y, dtype=T.float32).to(device)

  def __len__(self):
    return len(self.x_data)

  def __getitem__(self, idx):
    preds = self.x_data[idx]
    trgts = self.y_data[idx]
    return (preds, trgts)  # as a tuple

# -----------------------------------------------------------

def accuracy(model, ds, pct_close):
  # assumes model.eval()
  # a prediction is correct when within pct_close of true target
  n_correct = 0; n_wrong = 0
  for i in range(len(ds)):
    X = ds[i][0]  # predictors
    Y = ds[i][1]  # true target
    with T.no_grad():
      oupt = model(X)  # computed target

    if T.abs(oupt - Y) < T.abs(pct_close * Y):  # less-than
      n_correct += 1
    else:
      n_wrong += 1
  acc = (n_correct * 1.0) / (n_correct + n_wrong)
  return acc

# -----------------------------------------------------------

class Net(T.nn.Module):
  """6-(10-10)-1 regression network: tanh hidden, identity output."""

  def __init__(self):
    super(Net, self).__init__()
    self.hid1 = T.nn.Linear(6, 10)  # 6-(10-10)-1
    self.hid2 = T.nn.Linear(10, 10)
    self.oupt = T.nn.Linear(10, 1)

    T.nn.init.xavier_uniform_(self.hid1.weight)  # glorot
    T.nn.init.zeros_(self.hid1.bias)
    T.nn.init.xavier_uniform_(self.hid2.weight)
    T.nn.init.zeros_(self.hid2.bias)
    T.nn.init.xavier_uniform_(self.oupt.weight)
    T.nn.init.zeros_(self.oupt.bias)

  def forward(self, x):
    z = T.tanh(self.hid1(x))  # or T.nn.Tanh()
    z = T.tanh(self.hid2(z))
    z = self.oupt(z)  # no activation, aka Identity()
    return z

# -----------------------------------------------------------

def train(model, ds, bs, lr, me, le):
  # dataset, bat_size, lrn_rate, max_epochs, log interval
  train_ldr = T.utils.data.DataLoader(ds,
    batch_size=bs, shuffle=True)
  loss_func = T.nn.MSELoss()
  optimizer = T.optim.Adam(model.parameters(), lr=lr)
  # optimizer = T.optim.SGD(model.parameters(), lr=lr)

  for epoch in range(0, me):
    epoch_loss = 0.0  # for one full epoch
    for (b_idx, batch) in enumerate(train_ldr):
      X = batch[0]  # predictors
      y = batch[1]  # targets
      optimizer.zero_grad()
      oupt = model(X)
      loss_val = loss_func(oupt, y)  # a tensor
      epoch_loss += loss_val.item()  # accumulate
      loss_val.backward()  # compute gradients
      optimizer.step()  # update weights

    if epoch % le == 0:
      print("epoch = %4d | loss = %0.4f" % \
        (epoch, epoch_loss))

# -----------------------------------------------------------

def main():
  # 0. get started
  print("\nSynthetic data regression using PyTorch 2.0 ")
  np.random.seed(1)
  T.manual_seed(1)

  # 1. load data
  print("\nLoading synthetic data into memory ")
  train_file = ".\\Data\\synthetic_train.txt"
  train_ds = SynthDataset(train_file)  # 200 rows
  test_file = ".\\Data\\synthetic_test.txt"
  test_ds = SynthDataset(test_file)  # 40 rows

  # 2. create model
  print("\nCreating 6-(10-10)-1 regression model ")
  net = Net().to(device)

  # 3. train model
  print("\nbat_size = 10 ")
  print("loss = MSELoss() ")
  print("optimizer = Adam ")
  print("lrn_rate = 0.01 ")
  print("\nStarting training")
  net.train()
  train(net, train_ds, bs=10, lr=0.01, me=100, le=10)
  print("Done ")

  # 4. evaluate model accuracy
  net.eval()
  print("\nComputing model accuracy (within 0.10 of true) ")
  acc_train = accuracy(net, train_ds, 0.10)  # item-by-item
  print("Accuracy on train data = %0.4f" % acc_train)
  acc_test = accuracy(net, test_ds, 0.10)
  print("Accuracy on test data = %0.4f" % acc_test)

  # 5. make a prediction
  print("\nPredicting target for dummy inputs: ")
  x = np.array([[0.5, 0.5, 0.5, 0.5, 0.5, 0.5]],
    dtype=np.float32)
  x = T.tensor(x, dtype=T.float32).to(device)
  with T.no_grad():
    y = net(x)
  pred_raw = y.item()  # scalar
  print("%0.4f" % pred_raw)

  # 6. TODO: save model (state_dict approach)

if __name__=="__main__":
  main()

You must be logged in to post a comment.