From bdb22f372753259aca1eb40b75f4355e76607aa3 Mon Sep 17 00:00:00 2001 From: jheer Date: Wed, 9 Dec 2020 12:50:59 +0100 Subject: [PATCH] fix: Fix fromArrow nested null value extraction. --- src/format/from-arrow.js | 29 +++++++++++++---------------- test/format/arrow-test.js | 26 ++++++++++++++------------ 2 files changed, 27 insertions(+), 28 deletions(-) diff --git a/src/format/from-arrow.js b/src/format/from-arrow.js index ebb6294e..da870936 100644 --- a/src/format/from-arrow.js +++ b/src/format/from-arrow.js @@ -44,35 +44,32 @@ export default function(arrowTable, options = {}) { } function arrayFromNested(vector) { - const create = vector.typeId === LIST ? listExtractor(vector) + const extract = vector.typeId === LIST ? i => arrayExtractor(vector.get(i)) : vector.typeId === STRUCT ? structExtractor(vector) : error(`Unsupported Arrow type: ${toString(vector.VectorName)}`); // generate and return objects for each nested value - return Array.from({ length: vector.length }, create); + return Array.from( + { length: vector.length }, + (_, i) => vector.isValid(i) ? extract(i) : null + ); } -function listExtractor(vector) { - // extract a list value. recurse if nested, otherwise convert to array - return (_, i) => { - const v = vector.get(i); - return v.numChildren ? arrayFromNested(v) : arrayFromVector(v); - }; +function arrayExtractor(vector) { + // extract an array, recurse if nested. + return vector.numChildren + ? arrayFromNested(vector) + : arrayFromVector(vector); } function structExtractor(vector) { - // extract struct field names + // extract struct field names and values const names = vector.type.children.map(field => field.name); - - // extract struct field values into parallel arrays - const data = names.map((_, i) => { - const v = vector.getChildAt(i); - return v.numChildren ? arrayFromNested(v) : arrayFromVector(v); - }); + const values = names.map((_, i) => arrayExtractor(vector.getChildAt(i))); // function to generate objects with field name properties return unroll( - data, '_,i', + values, 'i', '({' + names.map((_, d) => `${toString(_)}:_${d}[i]`) + '})' ); } diff --git a/test/format/arrow-test.js b/test/format/arrow-test.js index f5b5ba49..f528aaaa 100644 --- a/test/format/arrow-test.js +++ b/test/format/arrow-test.js @@ -7,6 +7,7 @@ function arrowColumn(data, nullCount = 0) { length: data.length, get: row => data[row], toArray: () => data, + isValid: row => data[row] != null, [Symbol.iterator]: () => data[Symbol.iterator](), nullCount, _data: data @@ -51,19 +52,20 @@ function arrowDictionary(data) { } function arrowListColumn(data) { - const c = arrowColumn(data.map(d => arrowColumn(d))); + const c = arrowColumn(data.map(d => d ? arrowColumn(d) : null)); c.typeId = LIST; c.numChildren = 1; return c; } -function arrowStructColumn(names, children) { +function arrowStructColumn(valid, names, children) { return { type: { children: names.map(name => ({ name })) }, typeId: STRUCT, - length: children[0].length, + length: valid.length, numChildren: names.length, - getChildAt: i => children[i] + getChildAt: i => children[i], + isValid: row => !!valid[row] }; } @@ -104,7 +106,7 @@ tape('fromArrow can unpack Apache Arrow tables', t => { }); tape('fromArrow can read Apache Arrow lists', t => { - const d = [[1, 2, 3], [4, 5]]; + const d = [[1, 2, 3], null, [4, 5]]; const l = arrowListColumn(d); const at = arrowTable({ l }); const dt = fromArrow(at); @@ -114,10 +116,10 @@ tape('fromArrow can read Apache Arrow lists', t => { }); tape('fromArrow can read Apache Arrow structs', t => { - const d = [{ foo: 1, bar: [2, 3] }, { foo: 2, bar: [4] }]; - const s = arrowStructColumn(Object.keys(d[0]), [ - arrowColumn(d.map(v => v.foo)), - arrowListColumn(d.map(v => v.bar)) + const d = [{ foo: 1, bar: [2, 3] }, null, { foo: 2, bar: [4] }]; + const s = arrowStructColumn(d, Object.keys(d[0]), [ + arrowColumn(d.map(v => v ? v.foo : null)), + arrowListColumn(d.map(v => v ? v.bar : null)) ]); const at = arrowTable({ s }); const dt = fromArrow(at); @@ -128,9 +130,9 @@ tape('fromArrow can read Apache Arrow structs', t => { tape('fromArrow can read nested Apache Arrow structs', t => { const d = [{ foo: 1, bar: { bop: 2 } }, { foo: 2, bar: { bop: 3 } }]; - const s = arrowStructColumn(Object.keys(d[0]), [ - arrowColumn(d.map(v => v.foo)), - arrowStructColumn(['bop'], [ arrowColumn([2, 3]) ]) + const s = arrowStructColumn(d, Object.keys(d[0]), [ + arrowColumn(d.map(v => v ? v.foo : null)), + arrowStructColumn([1, 1], ['bop'], [ arrowColumn([2, 3]) ]) ]); const at = arrowTable({ s }); const dt = fromArrow(at);