When I write Python/PyTorch code and need to load numeric data from a text file into memory, I usually use the NumPy loadtxt() function. For example:
all_xy = np.loadtxt(src_file, usecols=range(0,7), delimiter="\t", comments="#", dtype=np.float32)
I have an inexplicable liking for the JavaScript language. I’ve implemented neural networks completely from scratch using JavaScript. To load data into memory, I had written a JavaScript loadTxt() function that mimics the Python loadtxt() function. However, my JavaScript loadTxt() function didn’t handle comment lines in the source text file because doing so is surprisingly tricky.
So, one weekend evening I decided to enhance my JavaScript loadTxt() function to handle comment lines. Along the way, I ran into a common theme: I could use a relatively simple algorithm that is not efficient because it duplicates the data in memory, or I could use a more complex algorithm that is memory efficient.
The simple version is:
/**
 * Load selected numeric columns of a delimited text file into a matrix.
 *
 * Simple version: the non-comment lines are first copied into a second
 * array, so the file's lines are briefly held in memory twice.
 *
 * Blank (whitespace-only) lines are skipped, so an empty file or a stray
 * empty line does not produce a row of NaN values.
 *
 * @param {string} fn - Path of the text file to read (read as UTF-8).
 * @param {string} delimit - Field separator used to split each line, e.g. "\t".
 * @param {number[]} usecols - Zero-based indices of the columns to keep.
 * @param {string} [comment] - Lines starting with this prefix are skipped.
 *   When omitted or empty, no comment filtering is done.
 * @returns {*} The matrix produced by matMake(rows, usecols.length, 0.0),
 *   filled row by row with the parseFloat() of each selected token. A token
 *   that is missing or not numeric yields NaN.
 */
function loadTxt2(fn, delimit, usecols, comment) {
  // simple but doubles in-memory usage
  const all = FS.readFileSync(fn, "utf8").trim(); // giant string, final newline stripped
  const lines = all.split("\n"); // array of lines

  // An empty prefix would match every line via startsWith(""), so only
  // filter when a real comment marker was supplied.
  const hasComment = typeof comment === "string" && comment.length > 0;

  const validLines = [];
  for (const line of lines) {
    if (line.trim() === "") continue; // blank line (also a lone "\r" from CRLF files)
    if (hasComment && line.startsWith(comment)) continue;
    validLines.push(line);
  }

  const rows = validLines.length;
  const cols = usecols.length;
  // matMake is defined elsewhere in this library; it is assumed to return
  // a rows-by-cols structure indexable as result[i][j].
  const result = matMake(rows, cols, 0.0);
  for (let i = 0; i < rows; ++i) { // each line
    const tokens = validLines[i].split(delimit);
    for (let j = 0; j < cols; ++j) {
      // parseFloat ignores trailing characters, so a trailing "\r" is harmless
      result[i][j] = parseFloat(tokens[usecols[j]]);
    }
  }
  return result;
}
My test data file is:
// people_train_4.txt
// sex (M = -1, F = +1), age (div by 100)
// state (Michigan = 100, Nebraska = 010, Oklahoma = 001)
// income (div by $100,000)
// politics (conservative = 0, moderate = 1, liberal = 2)
//
1	0.24	1	0	0	0.2950	2
-1	0.39	0	0	1	0.5120	1
1	0.63	0	1	0	0.7580	0
-1	0.36	1	0	0	0.4450	1
// end data
The program to call the function is:
// test_loadTxt.js

const U = require("../../Utilities/utilities_lib.js");
const FS = require("fs");

// ----------------------------------------------------------

/**
 * Demo driver: loads the same data file with each loader in turn and
 * prints the resulting matrix after a blank line.
 */
function main() {
  console.log("\nBegin test loadTxt() with JavaScript ");

  // raw data looks like:   M   32   michigan  52,000.00  liberal
  // norm data looks like: -1  0.32  1 0 0     0.5250     2

  const dataFile = ".\\Data\\people_train_4.txt";
  const delimiter = "\t";
  const predictorCols = [0, 1, 2, 3, 4, 5];
  const commentMarker = "//";

  // first loader: memory inefficient but simple
  // second loader: memory efficient but complicated
  const loaderNames = ["loadTxt2", "loadTxt3"];
  for (const loaderName of loaderNames) {
    // invoked as a member of U so the call form matches U.loadTxtN(...)
    const trainX = U[loaderName](dataFile, delimiter, predictorCols, commentMarker);
    console.log("");
    U.matShow(trainX, 4, 12);
  }

  console.log("\nEnd demo ");
}

main();
I put the loadTxt2() and loadTxt3() functions in a Utility library.
The more-efficient but less-simple version is:
/**
 * Load selected numeric columns of a delimited text file into a matrix.
 *
 * Two-pass version: the data lines are counted first so the result can be
 * allocated up front, then the lines are walked again and parsed directly
 * into the result, without building a second array of filtered lines.
 *
 * Blank (whitespace-only) lines are skipped, so an empty file or a stray
 * empty line does not produce a row of NaN values.
 *
 * @param {string} fn - Path of the text file to read (read as UTF-8).
 * @param {string} delimit - Field separator used to split each line, e.g. "\t".
 * @param {number[]} usecols - Zero-based indices of the columns to keep.
 * @param {string} [comment] - Lines starting with this prefix are skipped.
 *   When omitted or empty, no comment filtering is done.
 * @returns {*} The matrix produced by matMake(nRows, usecols.length, 0.0),
 *   filled row by row with the parseFloat() of each selected token. A token
 *   that is missing or not numeric yields NaN.
 */
function loadTxt3(fn, delimit, usecols, comment) {
  // efficient but complicated
  const all = FS.readFileSync(fn, "utf8").trim(); // giant string, final newline stripped
  const lines = all.split("\n"); // array of lines

  // An empty prefix would match every line via startsWith(""), so only
  // filter when a real comment marker was supplied.
  const hasComment = typeof comment === "string" && comment.length > 0;
  const isDataLine = (line) =>
    line.trim() !== "" && !(hasComment && line.startsWith(comment));

  // count number of data (non-comment, non-blank) lines
  let nRows = 0;
  for (const line of lines) {
    if (isDataLine(line)) ++nRows;
  }

  // Declared locally: assigning to an undeclared name throws in strict
  // mode / ES modules and leaks a global otherwise.
  const nCols = usecols.length;
  // matMake is defined elsewhere in this library; it is assumed to return
  // an nRows-by-nCols structure indexable as result[i][j].
  const result = matMake(nRows, nCols, 0.0);

  let i = 0; // next row of result[][] to fill
  for (const line of lines) {
    if (!isDataLine(line)) continue; // same predicate as the counting pass
    const tokens = line.split(delimit);
    for (let j = 0; j < nCols; ++j) {
      // parseFloat ignores trailing characters, so a trailing "\r" is harmless
      result[i][j] = parseFloat(tokens[usecols[j]]);
    }
    ++i;
  }
  return result;
}
Good fun.
There are several traditional tradeoff themes in computer science, such as performance vs. simplicity. Here are three examples of traditional Eurasian clothing that trade off attractive complexity vs. functional simplicity.
You must be logged in to post a comment.