From 8bcb953bf8f773e0ca0db031f7f868bfb22c046a Mon Sep 17 00:00:00 2001 From: david dali susanibar arce Date: Wed, 16 Aug 2023 04:10:25 -0500 Subject: [PATCH 1/7] feat: add recipe for vector schema root appender cases --- java/source/data.rst | 79 ++++++++++++++++++++++++++++++++++++++++++ java/source/flight.rst | 6 ++-- 2 files changed, 82 insertions(+), 3 deletions(-) diff --git a/java/source/data.rst b/java/source/data.rst index 929f9cb2..5b8714d1 100644 --- a/java/source/data.rst +++ b/java/source/data.rst @@ -23,6 +23,85 @@ Recipes related to compare, filtering or transforming data. .. contents:: +Append VectorSchemaRoots +======================== + +In some cases, VectorSchemaRoot needs to be modeled as a container. To accomplish +this, you can use ``VectorSchemaRootAppender.append``. The following code reads a +Parquet file with three row groups, gets the three vectors separately, and then +appends the three vectors together: + +.. testcode:: + + import java.io.IOException; + + import org.apache.arrow.dataset.file.FileFormat; + import org.apache.arrow.dataset.file.FileSystemDatasetFactory; + import org.apache.arrow.dataset.jni.NativeMemoryPool; + import org.apache.arrow.dataset.scanner.ScanOptions; + import org.apache.arrow.dataset.scanner.Scanner; + import org.apache.arrow.dataset.source.Dataset; + import org.apache.arrow.dataset.source.DatasetFactory; + import org.apache.arrow.memory.BufferAllocator; + import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.VectorSchemaRoot; + import org.apache.arrow.vector.ipc.ArrowReader; + import org.apache.arrow.vector.util.VectorSchemaRootAppender; + + VectorSchemaRoot appendVectorSchemaRootAsOne(BufferAllocator allocator) { + VectorSchemaRoot result = null; + final ScanOptions options = new ScanOptions(/*batchSize*/ 32768); + final String uri = "file:" + System.getProperty("user.dir") + "/thirdpartydeps/parquetfiles/data4_3rg_gzip.parquet"; + try ( + final DatasetFactory datasetFactory = new FileSystemDatasetFactory(allocator, + NativeMemoryPool.getDefault(), FileFormat.PARQUET, uri); + final Dataset dataset = datasetFactory.finish(); + final Scanner scanner = dataset.newScan(options); + final ArrowReader reader = scanner.scanBatches() + ) { + int rowgroup = 1; + while (reader.loadNextBatch()) { + try (VectorSchemaRoot root = reader.getVectorSchemaRoot()) { + if(result == null) { + result = VectorSchemaRoot.create(root.getSchema(), allocator); + result.allocateNew(); // allocate each vector before append data for memory purposes + } + System.out.println("Loading VectorSchemaRoot: " + rowgroup++ + ", Records to append: " + root.getRowCount()); + VectorSchemaRootAppender.append(result, root); + } + } + return result; + } catch (IOException e) { + throw new RuntimeException(e); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + // reading final VectorSchemaRoot + try (final VectorSchemaRoot root = appendVectorSchemaRootAsOne(new RootAllocator())) { + System.out.println("Total Rowcount: " + root.getRowCount()); + System.out.println(root.contentToTSVString()); + } + +.. testoutput:: + + Loading VectorSchemaRoot: 1, Records to append: 4 + Loading VectorSchemaRoot: 2, Records to append: 4 + Loading VectorSchemaRoot: 3, Records to append: 3 + Total Rowcount: 11 + age name + 10 Jean + 10 Lu + 10 Kei + 10 Sophia + 10 Mara + 20 Arit + 20 Neil + 20 Jason + 20 John + 20 Peter + 20 Ismael + Compare Vectors for Field Equality ================================== diff --git a/java/source/flight.rst b/java/source/flight.rst index 53017b24..fba29e71 100644 --- a/java/source/flight.rst +++ b/java/source/flight.rst @@ -287,7 +287,7 @@ Flight Client and Server S1: Server (Location): Listening on port 33333 C1: Client (Location): Connected to grpc+tcp://0.0.0.0:33333 C2: Client (Populate Data): Wrote 2 batches with 3 rows each - C3: Client (Get Metadata): FlightInfo{schema=Schema, descriptor=profiles, endpoints=[FlightEndpoint{locations=[Location{uri=grpc+tcp://0.0.0.0:33333}], ticket=org.apache.arrow.flight.Ticket@58871b0a}], bytes=-1, records=6, ordered=false} + C3: Client (Get Metadata): FlightInfo{schema=Schema, descriptor=profiles, endpoints=[FlightEndpoint{locations=[Location{uri=grpc+tcp://0.0.0.0:33333}], ticket=org.apache.arrow.flight.Ticket@58871b0a, expirationTime=(none)}], bytes=-1, records=6, ordered=false} C4: Client (Get Stream): Client Received batch #1, Data: name @@ -299,7 +299,7 @@ Flight Client and Server Manuel Felipe JJ - C5: Client (List Flights Info): FlightInfo{schema=Schema, descriptor=profiles, endpoints=[FlightEndpoint{locations=[Location{uri=grpc+tcp://0.0.0.0:33333}], ticket=org.apache.arrow.flight.Ticket@58871b0a}], bytes=-1, records=6, ordered=false} + C5: Client (List Flights Info): FlightInfo{schema=Schema, descriptor=profiles, endpoints=[FlightEndpoint{locations=[Location{uri=grpc+tcp://0.0.0.0:33333}], ticket=org.apache.arrow.flight.Ticket@58871b0a, expirationTime=(none)}], bytes=-1, records=6, ordered=false} C6: Client (Do Delete Action): Delete completed C7: Client (List Flights Info): After delete - No records C8: Server shut down successfully @@ -421,7 +421,7 @@ Once we do so, we can retrieve the metadata for that dataset. .. code-block:: shell - C3: Client (Get Metadata): FlightInfo{schema=Schema, descriptor=profiles, endpoints=[FlightEndpoint{locations=[Location{uri=grpc+tcp://0.0.0.0:33333}], ticket=org.apache.arrow.flight.Ticket@58871b0a}], bytes=-1, records=6} + C3: Client (Get Metadata): FlightInfo{schema=Schema, descriptor=profiles, endpoints=[FlightEndpoint{locations=[Location{uri=grpc+tcp://0.0.0.0:33333}], ticket=org.apache.arrow.flight.Ticket@58871b0a, expirationTime=(none)}], bytes=-1, records=6} Get Data ******** From 9e0c9bcdf867cbf0d2a00329c0e6c4fb6ab57a58 Mon Sep 17 00:00:00 2001 From: david dali susanibar arce Date: Wed, 16 Aug 2023 04:15:44 -0500 Subject: [PATCH 2/7] fix: clean testoutput --- java/source/data.rst | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/java/source/data.rst b/java/source/data.rst index 5b8714d1..a330d56d 100644 --- a/java/source/data.rst +++ b/java/source/data.rst @@ -89,18 +89,18 @@ appends the three vectors together: Loading VectorSchemaRoot: 2, Records to append: 4 Loading VectorSchemaRoot: 3, Records to append: 3 Total Rowcount: 11 - age name - 10 Jean - 10 Lu - 10 Kei - 10 Sophia - 10 Mara - 20 Arit - 20 Neil - 20 Jason - 20 John - 20 Peter - 20 Ismael + age name + 10 Jean + 10 Lu + 10 Kei + 10 Sophia + 10 Mara + 20 Arit + 20 Neil + 20 Jason + 20 John + 20 Peter + 20 Ismael Compare Vectors for Field Equality ================================== From 459d3a0ba086dc0c933dc56fbabf18ae39292863 Mon Sep 17 00:00:00 2001 From: david dali susanibar arce Date: Wed, 16 Aug 2023 04:19:46 -0500 Subject: [PATCH 3/7] fix: clean testoutput --- java/source/data.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/source/data.rst b/java/source/data.rst index a330d56d..c48da9af 100644 --- a/java/source/data.rst +++ b/java/source/data.rst @@ -80,7 +80,7 @@ appends the three vectors together: // reading final VectorSchemaRoot try (final VectorSchemaRoot root = appendVectorSchemaRootAsOne(new RootAllocator())) { System.out.println("Total Rowcount: " + root.getRowCount()); - System.out.println(root.contentToTSVString()); + System.out.print(root.contentToTSVString()); } .. testoutput:: From d6edf868e2b7784e4db75119dcf7bc74f5305ddb Mon Sep 17 00:00:00 2001 From: david dali susanibar arce Date: Fri, 25 Aug 2023 12:22:11 -0500 Subject: [PATCH 4/7] fix: move from dataset to in memory data --- java/source/data.rst | 98 ++++++++++++++++++-------------------------- 1 file changed, 41 insertions(+), 57 deletions(-) diff --git a/java/source/data.rst b/java/source/data.rst index c48da9af..f23e04e8 100644 --- a/java/source/data.rst +++ b/java/source/data.rst @@ -31,76 +31,60 @@ this, you can use ``VectorSchemaRootAppender.append``. The following code reads Parquet file with three row groups, gets the three vectors separately, and then appends the three vectors together: -.. testcode:: +There will be another cases of integration w - import java.io.IOException; +.. testcode:: - import org.apache.arrow.dataset.file.FileFormat; - import org.apache.arrow.dataset.file.FileSystemDatasetFactory; - import org.apache.arrow.dataset.jni.NativeMemoryPool; - import org.apache.arrow.dataset.scanner.ScanOptions; - import org.apache.arrow.dataset.scanner.Scanner; - import org.apache.arrow.dataset.source.Dataset; - import org.apache.arrow.dataset.source.DatasetFactory; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; + import org.apache.arrow.vector.IntVector; import org.apache.arrow.vector.VectorSchemaRoot; - import org.apache.arrow.vector.ipc.ArrowReader; import org.apache.arrow.vector.util.VectorSchemaRootAppender; - VectorSchemaRoot appendVectorSchemaRootAsOne(BufferAllocator allocator) { - VectorSchemaRoot result = null; - final ScanOptions options = new ScanOptions(/*batchSize*/ 32768); - final String uri = "file:" + System.getProperty("user.dir") + "/thirdpartydeps/parquetfiles/data4_3rg_gzip.parquet"; - try ( - final DatasetFactory datasetFactory = new FileSystemDatasetFactory(allocator, - NativeMemoryPool.getDefault(), FileFormat.PARQUET, uri); - final Dataset dataset = datasetFactory.finish(); - final Scanner scanner = dataset.newScan(options); - final ArrowReader reader = scanner.scanBatches() - ) { - int rowgroup = 1; - while (reader.loadNextBatch()) { - try (VectorSchemaRoot root = reader.getVectorSchemaRoot()) { - if(result == null) { - result = VectorSchemaRoot.create(root.getSchema(), allocator); - result.allocateNew(); // allocate each vector before append data for memory purposes + try ( + BufferAllocator allocator = new RootAllocator() + ) { + VectorSchemaRoot result = null; + try ( + IntVector appenderOne = new IntVector("column", + allocator); + IntVector appenderTwo = new IntVector("column", + allocator) + ) { + appenderOne.allocateNew(2); + appenderOne.set(0, 100); + appenderOne.set(1, 20); + appenderOne.setValueCount(2); + appenderTwo.allocateNew(2); + appenderTwo.set(0, 34); + appenderTwo.set(1, 75); + appenderTwo.setValueCount(2); + try ( + final VectorSchemaRoot rootOne = new VectorSchemaRoot( + Collections.singletonList(appenderOne)); + final VectorSchemaRoot rootTwo = new VectorSchemaRoot( + Collections.singletonList(appenderTwo)) + ) { + if (result == null) { + result = VectorSchemaRoot.create(rootOne.getSchema(), + allocator); + result.allocateNew(); + } + VectorSchemaRootAppender.append(result, rootOne, rootTwo); + + System.out.print(result.contentToTSVString()); } - System.out.println("Loading VectorSchemaRoot: " + rowgroup++ + ", Records to append: " + root.getRowCount()); - VectorSchemaRootAppender.append(result, root); - } } - return result; - } catch (IOException e) { - throw new RuntimeException(e); - } catch (Exception e) { - throw new RuntimeException(e); - } - } - // reading final VectorSchemaRoot - try (final VectorSchemaRoot root = appendVectorSchemaRootAsOne(new RootAllocator())) { - System.out.println("Total Rowcount: " + root.getRowCount()); - System.out.print(root.contentToTSVString()); + result.close(); } .. testoutput:: - Loading VectorSchemaRoot: 1, Records to append: 4 - Loading VectorSchemaRoot: 2, Records to append: 4 - Loading VectorSchemaRoot: 3, Records to append: 3 - Total Rowcount: 11 - age name - 10 Jean - 10 Lu - 10 Kei - 10 Sophia - 10 Mara - 20 Arit - 20 Neil - 20 Jason - 20 John - 20 Peter - 20 Ismael + column + 100 + 20 + 34 + 75 Compare Vectors for Field Equality ================================== From 601a39ae9ead856e73923f0208ed875a16e38055 Mon Sep 17 00:00:00 2001 From: david dali susanibar arce Date: Tue, 29 Aug 2023 08:59:26 -0500 Subject: [PATCH 5/7] fix: code review --- java/source/data.rst | 56 +++++++++++++++++--------------------------- 1 file changed, 22 insertions(+), 34 deletions(-) diff --git a/java/source/data.rst b/java/source/data.rst index f23e04e8..fa0e2103 100644 --- a/java/source/data.rst +++ b/java/source/data.rst @@ -27,11 +27,8 @@ Append VectorSchemaRoots ======================== In some cases, VectorSchemaRoot needs to be modeled as a container. To accomplish -this, you can use ``VectorSchemaRootAppender.append``. The following code reads a -Parquet file with three row groups, gets the three vectors separately, and then -appends the three vectors together: - -There will be another cases of integration w +this, you can use ``VectorSchemaRootAppender.append``. The following code gets +the three vectors separately, and then appends the three vectors together: .. testcode:: @@ -41,39 +38,30 @@ There will be another cases of integration w import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.util.VectorSchemaRootAppender; - try ( - BufferAllocator allocator = new RootAllocator() - ) { + try (BufferAllocator allocator = new RootAllocator()) { VectorSchemaRoot result = null; + IntVector appenderOne = new IntVector("column", allocator); + IntVector appenderTwo = new IntVector("column", allocator); + appenderOne.allocateNew(2); + appenderOne.set(0, 100); + appenderOne.set(1, 20); + appenderOne.setValueCount(2); + appenderTwo.allocateNew(2); + appenderTwo.set(0, 34); + appenderTwo.set(1, 75); + appenderTwo.setValueCount(2); try ( - IntVector appenderOne = new IntVector("column", - allocator); - IntVector appenderTwo = new IntVector("column", - allocator) + final VectorSchemaRoot rootOne = new VectorSchemaRoot( + Collections.singletonList(appenderOne)); + final VectorSchemaRoot rootTwo = new VectorSchemaRoot( + Collections.singletonList(appenderTwo)) ) { - appenderOne.allocateNew(2); - appenderOne.set(0, 100); - appenderOne.set(1, 20); - appenderOne.setValueCount(2); - appenderTwo.allocateNew(2); - appenderTwo.set(0, 34); - appenderTwo.set(1, 75); - appenderTwo.setValueCount(2); - try ( - final VectorSchemaRoot rootOne = new VectorSchemaRoot( - Collections.singletonList(appenderOne)); - final VectorSchemaRoot rootTwo = new VectorSchemaRoot( - Collections.singletonList(appenderTwo)) - ) { - if (result == null) { - result = VectorSchemaRoot.create(rootOne.getSchema(), - allocator); - result.allocateNew(); - } - VectorSchemaRootAppender.append(result, rootOne, rootTwo); - - System.out.print(result.contentToTSVString()); + if (result == null) { + result = VectorSchemaRoot.create(rootOne.getSchema(), allocator); + result.allocateNew(); } + VectorSchemaRootAppender.append(result, rootOne, rootTwo); + System.out.print(result.contentToTSVString()); } result.close(); } From 3b1151a95f73070bfd15ee5bb8397618b930c1ca Mon Sep 17 00:00:00 2001 From: david dali susanibar arce Date: Tue, 29 Aug 2023 09:47:25 -0500 Subject: [PATCH 6/7] fix: code review --- java/source/data.rst | 47 ++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/java/source/data.rst b/java/source/data.rst index fa0e2103..f73ad21b 100644 --- a/java/source/data.rst +++ b/java/source/data.rst @@ -36,39 +36,40 @@ the three vectors separately, and then appends the three vectors together: import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.IntVector; import org.apache.arrow.vector.VectorSchemaRoot; + import org.apache.arrow.vector.types.pojo.ArrowType; + import org.apache.arrow.vector.types.pojo.Field; + import org.apache.arrow.vector.types.pojo.FieldType; + import org.apache.arrow.vector.types.pojo.Schema; import org.apache.arrow.vector.util.VectorSchemaRootAppender; - try (BufferAllocator allocator = new RootAllocator()) { - VectorSchemaRoot result = null; - IntVector appenderOne = new IntVector("column", allocator); - IntVector appenderTwo = new IntVector("column", allocator); - appenderOne.allocateNew(2); + import static java.util.Arrays.asList; + + Field column_one = new Field("column-one", FieldType.nullable(new ArrowType.Int(32, true)), null); + Schema schema = new Schema(asList(column_one)); + try ( + BufferAllocator allocator = new RootAllocator(); + VectorSchemaRoot rootOne = VectorSchemaRoot.create(schema, allocator); + VectorSchemaRoot rootTwo = VectorSchemaRoot.create(schema, allocator); + VectorSchemaRoot result = VectorSchemaRoot.create(schema, allocator); + ) { + IntVector appenderOne = (IntVector) rootOne.getVector(0); + rootOne.allocateNew(); appenderOne.set(0, 100); appenderOne.set(1, 20); - appenderOne.setValueCount(2); - appenderTwo.allocateNew(2); + rootOne.setRowCount(2); + IntVector appenderTwo = (IntVector) rootTwo.getVector(0); + rootTwo.allocateNew(); appenderTwo.set(0, 34); appenderTwo.set(1, 75); - appenderTwo.setValueCount(2); - try ( - final VectorSchemaRoot rootOne = new VectorSchemaRoot( - Collections.singletonList(appenderOne)); - final VectorSchemaRoot rootTwo = new VectorSchemaRoot( - Collections.singletonList(appenderTwo)) - ) { - if (result == null) { - result = VectorSchemaRoot.create(rootOne.getSchema(), allocator); - result.allocateNew(); - } - VectorSchemaRootAppender.append(result, rootOne, rootTwo); - System.out.print(result.contentToTSVString()); - } - result.close(); + rootTwo.setRowCount(2); + result.allocateNew(); + VectorSchemaRootAppender.append(result, rootOne, rootTwo); + System.out.print(result.contentToTSVString()); } .. testoutput:: - column + column-one 100 20 34 From 2067de5c2852633cabfa6a3b5fd469c3e75f2be8 Mon Sep 17 00:00:00 2001 From: David Li Date: Mon, 4 Sep 2023 15:50:35 -0400 Subject: [PATCH 7/7] Apply suggestions from code review --- java/source/data.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/java/source/data.rst b/java/source/data.rst index f73ad21b..04c7ce02 100644 --- a/java/source/data.rst +++ b/java/source/data.rst @@ -23,12 +23,12 @@ Recipes related to compare, filtering or transforming data. .. contents:: -Append VectorSchemaRoots -======================== +Concatenate VectorSchemaRoots +============================= In some cases, VectorSchemaRoot needs to be modeled as a container. To accomplish -this, you can use ``VectorSchemaRootAppender.append``. The following code gets -the three vectors separately, and then appends the three vectors together: +this, you can use ``VectorSchemaRootAppender.append``. The following code +creates two roots, then concatenates them together: .. testcode::