diff --git a/docs/api/index.md b/docs/api/index.md index 6f358743..a91be176 100644 --- a/docs/api/index.md +++ b/docs/api/index.md @@ -91,7 +91,7 @@ The *unpack* option determines if Arrow data should be "unpacked" from binary fo * *arrowTable*: An [Apache Arrow](https://arrow.apache.org/docs/js/) data table. * *options*: An Arrow import options object: - * *columns*: Ordered list of column names to include. + * *columns*: An ordered set of columns to import. The input may consist of: column name strings, column integer indices, objects with current column names as keys and new column names as values (for renaming), or a selection helper function such as [all](#all), [not](#not), or [range](#range). * *unpack*: A boolean flag (default `false`) to unpack binary-encoded Arrow data to standard JavaScript values. Unpacking can incur an upfront time and memory cost to extract data to new arrays, but can speed up later query processing by enabling faster data access. *Examples* @@ -99,6 +99,7 @@ The *unpack* option determines if Arrow data should be "unpacked" from binary fo ```js // requires that Apache Arrow has been imported as the 'arrow' variable // load an Arrow file from 'url' and create a corresponding Arquero table +const arrow = require('apache-arrow'); aq.fromArrow(await arrow.Table.from(fetch(url))) ``` diff --git a/src/format/from-arrow.js b/src/format/from-arrow.js index e65a8302..4d516595 100644 --- a/src/format/from-arrow.js +++ b/src/format/from-arrow.js @@ -3,6 +3,7 @@ import error from '../util/error'; import toString from '../util/to-string'; import unroll from '../util/unroll'; import unroll2 from '../util/unroll2'; +import resolve, { all } from '../verbs/expr/selection'; // Hardwire Arrow type ids to avoid explicit dependency // https://github.com/apache/arrow/blob/master/js/src/enum.ts @@ -12,7 +13,12 @@ export const STRUCT = 13; /** * Options for Apache Arrow import. 
* @typedef {object} ArrowOptions - * @property {string[]} [columns] Ordered list of column names to import. + * @property {string|string[]|number|number[]|object|Function} [columns] + * An ordered set of columns to import. The input may consist of: + * - column name strings, + * - column integer indices, + * - objects with current column names as keys and new column names as values (for renaming), or + * - selection helper functions such as {@link all}, {@link not}, or {@link range}. * @property {boolean} [unpack=false] Flag to unpack binary-encoded Arrow * data to standard JavaScript values. Unpacking can incur an upfront time * and memory cost to extract data to new arrays, but can speed up later @@ -26,14 +32,9 @@ export const STRUCT = 13; * @param {ColumnTable} table A new table containing the imported values. */ export default function(arrowTable, options = {}) { - const names = options.columns || arrowTable.schema.fields.map(f => f.name); + const { names, cols } = resolveColumns(arrowTable, options.columns); const count = arrowTable.count(); - const cols = names.map(name => - arrowTable.getColumn(name) || - error(`Arrow column does not exist: ${toString(name)}`) - ); - const data = arrowTable.length === count ? collect(names, cols, !!options.unpack) : collectFiltered(arrowTable, names, cols, count); @@ -41,6 +42,24 @@ export default function(arrowTable, options = {}) { return new ColumnTable(data, null, null, null, null, names); } +function resolveColumns(arrowTable, selection) { + const fields = arrowTable.schema.fields.map(f => f.name); + const map = resolve({ + columnNames: filter => filter ? 
fields.filter(filter) : fields.slice(), + columnIndex: name => fields.indexOf(name) + }, selection || all()); + + const names = []; + const cols = []; + + map.forEach((name, key) => { + names.push(name); + cols.push(arrowTable.getColumn(key)); + }); + + return { names, cols }; +} + function collect(names, cols, unpack) { const data = {}; diff --git a/src/verbs/expr/selection.js b/src/verbs/expr/selection.js index 7fab3591..72cd8b0e 100644 --- a/src/verbs/expr/selection.js +++ b/src/verbs/expr/selection.js @@ -93,7 +93,7 @@ export function range(start, end) { export function matches(pattern) { if (isString(pattern)) pattern = RegExp(escapeRegExp(pattern)); return decorate( - table => table.columnNames().filter(name => pattern.test(name)), + table => table.columnNames(name => pattern.test(name)), () => ({ matches: [pattern.source, pattern.flags] }) ); } diff --git a/test/format/arrow-test.js b/test/format/arrow-test.js index 15dc2265..29838812 100644 --- a/test/format/arrow-test.js +++ b/test/format/arrow-test.js @@ -1,5 +1,6 @@ import tape from 'tape'; import fromArrow, { LIST, STRUCT } from '../../src/format/from-arrow'; +import { not } from '../../src/verbs'; // test stubs for Arrow Column API function arrowColumn(data, nullCount = 0) { @@ -125,6 +126,35 @@ tape('fromArrow can unpack Apache Arrow tables', t => { t.end(); }); +tape('fromArrow can select Apache Arrow columns', t => { + const u = arrowColumn([1, 2, 3, 4, 5]); + const v = arrowColumn(['a', 'b', null, 'd', 'e'], 1); + const x = arrowDictionary(['cc', 'dd', 'cc', 'dd', 'cc']); + const y = arrowDictionary(['aa', 'aa', null, 'bb', 'bb']); + const at = arrowTable({ u, v, x, y }); + + const s1 = fromArrow(at, { columns: 'x' }); + t.deepEqual(s1.columnNames(), ['x'], 'select by column name'); + t.equal(s1.column('x'), x, 'correct column selected'); + + const s2 = fromArrow(at, { columns: ['u', 'y'] }); + t.deepEqual(s2.columnNames(), ['u', 'y'], 'select by column names'); + t.equal(s2.column('u'), u, 
'correct column selected'); + t.equal(s2.column('y'), y, 'correct column selected'); + + const s3 = fromArrow(at, { columns: not('u', 'y') }); + t.deepEqual(s3.columnNames(), ['v', 'x'], 'select by helper'); + t.equal(s3.column('v'), v, 'correct column selected'); + t.equal(s3.column('x'), x, 'correct column selected'); + + const s4 = fromArrow(at, { columns: { u: 'a', x: 'b'} }); + t.deepEqual(s4.columnNames(), ['a', 'b'], 'select by helper'); + t.equal(s4.column('a'), u, 'correct column selected'); + t.equal(s4.column('b'), x, 'correct column selected'); + + t.end(); +}); + tape('fromArrow can read Apache Arrow lists', t => { const d = [[1, 2, 3], null, [4, 5]]; const l = arrowListColumn(d);