Skip to content

Commit

Permalink
feat: Support flexible column selections for Arrow import.
Browse files Browse the repository at this point in the history
  • Loading branch information
jheer committed Dec 17, 2020
1 parent 811eff9 commit 89d3319
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 9 deletions.
3 changes: 2 additions & 1 deletion docs/api/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,14 +91,15 @@ The *unpack* option determines if Arrow data should be "unpacked" from binary fo
* *arrowTable*: An [Apache Arrow](https://arrow.apache.org/docs/js/) data table.
* *options*: An Arrow import options object:
* *columns*: Ordered list of column names to include.
* *columns*: An ordered set of columns to import. The input may consist of: column name strings, column integer indices, objects with current column names as keys and new column names as values (for renaming), or a selection helper function such as [all](#all), [not](#not), or [range](#range).
* *unpack*: A boolean flag (default `false`) to unpack binary-encoded Arrow data to standard JavaScript values. Unpacking can incur an upfront time and memory cost to extract data to new arrays, but can speed up later query processing by enabling faster data access.
*Examples*
```js
// requires that Apache Arrow has been imported as the 'arrow' variable
// load an Arrow file from 'url' and create a corresponding Arquero table
const arrow = require('apache-arrow');
aq.fromArrow(await arrow.Table.from(fetch(url)))
```
Expand Down
33 changes: 26 additions & 7 deletions src/format/from-arrow.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import error from '../util/error';
import toString from '../util/to-string';
import unroll from '../util/unroll';
import unroll2 from '../util/unroll2';
import resolve, { all } from '../verbs/expr/selection';

// Hardwire Arrow type ids to avoid explicit dependency
// https://github.com/apache/arrow/blob/master/js/src/enum.ts
Expand All @@ -12,7 +13,12 @@ export const STRUCT = 13;
/**
* Options for Apache Arrow import.
* @typedef {object} ArrowOptions
* @property {string[]} [columns] Ordered list of column names to import.
* @property {string|string[]|number|number[]|object|Function} [columns]
* An ordered set of columns to import. The input may consist of:
* - column name strings,
* - column integer indices,
* - objects with current column names as keys and new column names as values (for renaming), or
* - selection helper functions such as {@link all}, {@link not}, or {@link range}.
* @property {boolean} [unpack=false] Flag to unpack binary-encoded Arrow
* data to standard JavaScript values. Unpacking can incur an upfront time
* and memory cost to extract data to new arrays, but can speed up later
Expand All @@ -26,21 +32,34 @@ export const STRUCT = 13;
* @return {ColumnTable} A new table containing the imported values.
*/
export default function(arrowTable, options = {}) {
  // Resolve the requested selection into parallel arrays: the output column
  // names and the Arrow columns that back them. This replaces the earlier
  // name-only lookup, whose leftover declarations of `names` and `cols`
  // duplicated these bindings.
  const { names, cols } = resolveColumns(arrowTable, options.columns);
  const count = arrowTable.count();

  // When the reported count equals the physical length, columns are collected
  // directly (optionally unpacked); otherwise the filtered path is used.
  // NOTE(review): presumably count() is smaller than length only when the
  // Arrow table carries a filter predicate; confirm against the Arrow API.
  const data = arrowTable.length === count
    ? collect(names, cols, !!options.unpack)
    : collectFiltered(arrowTable, names, cols, count);

  return new ColumnTable(data, null, null, null, null, names);
}

/**
 * Resolve a column selection against the schema of an Apache Arrow table.
 * @param {object} arrowTable An Apache Arrow data table. Only its
 *  schema.fields names and its getColumn method are used here.
 * @param {string|string[]|number|number[]|object|Function} [selection]
 *  The columns to import. If omitted, all columns are selected.
 * @return {{ names: string[], cols: object[] }} Parallel arrays holding the
 *  output column names and the corresponding Arrow columns.
 * @throws If a selected column does not exist in the Arrow table
 *  (assuming the imported error helper throws; it is used the same way
 *  elsewhere in this file).
 */
function resolveColumns(arrowTable, selection) {
  const fields = arrowTable.schema.fields.map(f => f.name);

  // Integer index 0 is a legitimate selection, so it must not be treated
  // as "no selection" by the falsy fallback to all().
  const sel = (selection || selection === 0) ? selection : all();

  // Minimal table-like facade over the Arrow schema for the selection
  // resolver. columnName is exposed so that integer indices can be mapped
  // to names. NOTE(review): assumes resolve consults columnName for numeric
  // selections; confirm against the selection module.
  const map = resolve({
    columnNames: filter => filter ? fields.filter(filter) : fields.slice(),
    columnIndex: name => fields.indexOf(name),
    columnName: index => fields[index]
  }, sel);

  const names = [];
  const cols = [];

  // Map keys are the current Arrow column names; values are output names.
  map.forEach((name, key) => {
    names.push(name);
    cols.push(
      arrowTable.getColumn(key) ||
      error(`Arrow column does not exist: ${toString(key)}`)
    );
  });

  return { names, cols };
}

function collect(names, cols, unpack) {
const data = {};

Expand Down
2 changes: 1 addition & 1 deletion src/verbs/expr/selection.js
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ export function range(start, end) {
export function matches(pattern) {
  // Select the columns whose names match the pattern. String patterns are
  // escaped and matched literally; RegExp patterns are used as given.
  if (isString(pattern)) pattern = RegExp(escapeRegExp(pattern));
  return decorate(
    // The name filter is passed to columnNames instead of filtering its
    // result afterwards.
    table => table.columnNames(name => {
      // RegExp.prototype.test is stateful for global/sticky patterns;
      // reset lastIndex so every name is tested from its start.
      pattern.lastIndex = 0;
      return pattern.test(name);
    }),
    // Serializable description of this selection.
    () => ({ matches: [pattern.source, pattern.flags] })
  );
}
Expand Down
30 changes: 30 additions & 0 deletions test/format/arrow-test.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import tape from 'tape';
import fromArrow, { LIST, STRUCT } from '../../src/format/from-arrow';
import { not } from '../../src/verbs';

// test stubs for Arrow Column API
function arrowColumn(data, nullCount = 0) {
Expand Down Expand Up @@ -125,6 +126,35 @@ tape('fromArrow can unpack Apache Arrow tables', t => {
t.end();
});

// Verifies that fromArrow honors the `columns` option for each supported
// selection form: a single name, a list of names, a selection helper, and
// a rename object.
tape('fromArrow can select Apache Arrow columns', t => {
  const u = arrowColumn([1, 2, 3, 4, 5]);
  const v = arrowColumn(['a', 'b', null, 'd', 'e'], 1);
  const x = arrowDictionary(['cc', 'dd', 'cc', 'dd', 'cc']);
  const y = arrowDictionary(['aa', 'aa', null, 'bb', 'bb']);
  const at = arrowTable({ u, v, x, y });

  // single column name
  const s1 = fromArrow(at, { columns: 'x' });
  t.deepEqual(s1.columnNames(), ['x'], 'select by column name');
  t.equal(s1.column('x'), x, 'correct column selected');

  // array of column names
  const s2 = fromArrow(at, { columns: ['u', 'y'] });
  t.deepEqual(s2.columnNames(), ['u', 'y'], 'select by column names');
  t.equal(s2.column('u'), u, 'correct column selected');
  t.equal(s2.column('y'), y, 'correct column selected');

  // selection helper function
  const s3 = fromArrow(at, { columns: not('u', 'y') });
  t.deepEqual(s3.columnNames(), ['v', 'x'], 'select by helper');
  t.equal(s3.column('v'), v, 'correct column selected');
  t.equal(s3.column('x'), x, 'correct column selected');

  // rename object: keys are current names, values are new names
  const s4 = fromArrow(at, { columns: { u: 'a', x: 'b'} });
  t.deepEqual(s4.columnNames(), ['a', 'b'], 'select by rename object');
  t.equal(s4.column('a'), u, 'correct column selected');
  t.equal(s4.column('b'), x, 'correct column selected');

  t.end();
});

tape('fromArrow can read Apache Arrow lists', t => {
const d = [[1, 2, 3], null, [4, 5]];
const l = arrowListColumn(d);
Expand Down

0 comments on commit 89d3319

Please sign in to comment.