Simulating the NumPy loadtxt() Function in JavaScript

When I write Python/PyTorch code and need to load numeric data from a text file into memory, I usually use the NumPy loadtxt() function. For example:

all_xy = np.loadtxt(src_file, usecols=range(0,7),
  delimiter="\t", comments="#", dtype=np.float32)

I have an inexplicable liking for the JavaScript language. I’ve implemented neural networks completely from scratch using JavaScript. To load data into memory, I had written a JavaScript loadTxt() function that mimics the Python loadtxt() function. However, my JavaScript loadTxt() function didn’t handle comment lines in the source text file because doing so is surprisingly tricky.

So, one weekend evening I decided to enhance my JavaScript loadTxt() function to handle comment lines. Along the way, I ran into a common theme: I could use a relatively simple algorithm that is not efficient because it duplicates the data in memory, or I could use a more complex algorithm that is memory efficient.

The simple version is:

/**
 * Load selected numeric columns of a delimited text file into a matrix,
 * in the spirit of NumPy's loadtxt().
 *
 * Simple version: the non-comment lines are first copied into a second
 * array, so the file's lines are briefly held in memory twice.
 *
 * @param {string} fn - Path of the text file to read (decoded as UTF-8).
 * @param {string} delimit - Separator between the values on a line.
 * @param {number[]} usecols - Zero-based indices of the columns to keep,
 *   in output order.
 * @param {string} comment - Marker that flags a whole line as a comment when
 *   the line starts with it. An empty or non-string marker disables comment
 *   detection instead of matching every line.
 * @returns {*} The matrix produced by matMake(rows, usecols.length, 0.0) with
 *   one row per data line; each cell is the parseFloat() of the selected
 *   token (NaN when the token is missing or not numeric).
 */
function loadTxt2(fn, delimit, usecols, comment) {
  // simple but doubles in-memory usage
  const all = FS.readFileSync(fn, "utf8").trim();  // giant string, outer whitespace stripped
  const lines = all.split(/\r?\n/);  // array of lines; tolerates CRLF files

  // startsWith("") is true for every string, so an empty marker must not
  // be allowed to classify all lines as comments.
  const hasMarker = typeof comment === "string" && comment.length > 0;

  // Keep only data lines. Blank lines are dropped too; otherwise an empty
  // file or a stray empty line would turn into a row of NaN values.
  const validLines = lines.filter(
    (line) => line.trim() !== "" && !(hasMarker && line.startsWith(comment))
  );

  const rows = validLines.length;
  const cols = usecols.length;
  const result = matMake(rows, cols, 0.0);
  for (let i = 0; i < rows; ++i) {  // each data line
    const tokens = validLines[i].split(delimit);
    for (let j = 0; j < cols; ++j) {
      result[i][j] = parseFloat(tokens[usecols[j]]);
    }
  }
  return result;
}

My test data file is:

// people_train_4.txt
// sex (M = -1, F = +1), age (div by 100)
// state (Michigan = 100, Nebraska = 010, Oklahoma = 001)
// income (div by $100,000)
// politics (conservative = 0, moderate = 1, liberal = 2)
//
 1	0.24	1	0	0	0.2950	2
-1	0.39	0	0	1	0.5120	1
 1	0.63	0	1	0	0.7580	0
-1	0.36	1	0	0	0.4450	1
// end data

The program to call the function is:

// test_loadTxt.js

let U = require("../../Utilities/utilities_lib.js");
let FS = require("fs");

// ----------------------------------------------------------

/**
 * Demo driver: loads the same training file with both loader variants from
 * the utilities library and prints each resulting matrix.
 */
function main() {
  // raw data looks like:   M   32   michigan  52,000.00  liberal
  // norm data looks like: -1  0.32   1 0 0     0.5250     2
  const dataFile = ".\\Data\\people_train_4.txt";
  const delimiter = "\t";
  const commentMarker = "//";

  // Print a blank separator line, then the matrix itself.
  const showMatrix = (mat) => {
    console.log("");
    U.matShow(mat, 4, 12);
  };

  console.log("\nBegin test loadTxt() with JavaScript ");

  // memory inefficient but simple
  const viaSimple = U.loadTxt2(dataFile, delimiter, [0,1,2,3,4,5], commentMarker);
  showMatrix(viaSimple);

  // memory efficient but complicated
  const viaEfficient = U.loadTxt3(dataFile, delimiter, [0,1,2,3,4,5], commentMarker);
  showMatrix(viaEfficient);

  console.log("\nEnd demo ");
}

main();

I put the loadTxt2() and loadTxt3() functions in a Utility library.

The more-efficient but less-simple version is:

/**
 * Load selected numeric columns of a delimited text file into a matrix,
 * in the spirit of NumPy's loadtxt().
 *
 * Two-pass version: the data lines are counted first so the result can be
 * sized up front, then the lines are walked again to fill it. No second
 * array of filtered lines is built.
 *
 * @param {string} fn - Path of the text file to read (decoded as UTF-8).
 * @param {string} delimit - Separator between the values on a line.
 * @param {number[]} usecols - Zero-based indices of the columns to keep,
 *   in output order.
 * @param {string} comment - Marker that flags a whole line as a comment when
 *   the line starts with it. An empty or non-string marker disables comment
 *   detection instead of matching every line.
 * @returns {*} The matrix produced by matMake(nRows, usecols.length, 0.0)
 *   with one row per data line; each cell is the parseFloat() of the
 *   selected token (NaN when the token is missing or not numeric).
 */
function loadTxt3(fn, delimit, usecols, comment) {
  // efficient but complicated
  const all = FS.readFileSync(fn, "utf8").trim();  // giant string, outer whitespace stripped
  const lines = all.split(/\r?\n/);  // array of lines; tolerates CRLF files

  // startsWith("") is true for every string, so an empty marker must not
  // be allowed to classify all lines as comments.
  const hasMarker = typeof comment === "string" && comment.length > 0;

  // A data line is neither blank nor a comment. Blank lines are excluded so
  // an empty file or a stray empty line does not become a row of NaN values.
  const isDataLine = (line) =>
    line.trim() !== "" && !(hasMarker && line.startsWith(comment));

  // pass 1: count the data lines
  let nRows = 0;
  for (const line of lines) {
    if (isDataLine(line)) {
      ++nRows;
    }
  }
  // Declared locally: a bare assignment here would create an implicit global
  // and throws a ReferenceError in strict mode / ES modules.
  const nCols = usecols.length;
  const result = matMake(nRows, nCols, 0.0);

  // pass 2: fill result; i indexes result rows and advances only on data lines
  let i = 0;
  for (const line of lines) {
    if (!isDataLine(line)) {
      continue;
    }
    const tokens = line.split(delimit);
    for (let j = 0; j < nCols; ++j) {
      result[i][j] = parseFloat(tokens[usecols[j]]);
    }
    ++i;
  }

  return result;
}

Good fun.



There are several traditional tradeoff themes in computer science, such as performance vs. simplicity. Here are three examples of traditional Eurasian clothing that trade off attractive complexity vs. functional simplicity.


This entry was posted in JavaScript. Bookmark the permalink.

Leave a Reply

Please log in using one of these methods to post your comment:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s