Skip to content

Commit

Permalink
Parquet Reader
Browse files Browse the repository at this point in the history
Added reading MAP logical types

Added reading LIST type

Fixed nullable lists handling

First implementation of Dremel encoding

Added logging to Dremel algorithm

Reconstruction of nested data structures based on schema definition

Performance optimization

Attempt to read map with values as lists

Rebuilding structures

Extracted rebuilding columns logic to ColumnBuilder class

Reading nested structures

Restored usage of flow array functions

Added dremel/parquet to test suite

Updated github workflows

Avoid calculating remaining length/current position in BinaryReader on the fly

Make DataSize value object mutable

Move reading multiple values from Buffer into BufferReader

Allow to read from stream

Retrieve column chunks as generator

Moved reading flat columns into generics

Read parquet files struct columns through generators

Fixed reading column chunks

Reduced number of iterations over generators

Keep stream offset to avoid generators overlapping

Read all column chunks from a row group at once to avoid dealing with rows split between pages

Added notes for performance optimizations

Added PageByPage ChunkReader implementation

Fixed reading bytes of array when it's not a string

Adjusted schema ddl generation

Allow limiting the number of returned rows

Fixed limit when there is more than one column chunk

Adjusted composer.json files in all subrepos

Added Parquet Reader options - handling INT96 as DateTime, reading byte arrays as strings, convert nanos to micros timestamps

Marked codename parquet extractor as deprecated

Added snappy extension detection

Converted testsuite fixtures into gzip from snappy

Fixed issues related to missing snappy_uncompress function

Added python scripts used to generate test/fixtures data for reader

Added resources folder into gitattributes as export-ignore
  • Loading branch information
norberttech committed Oct 13, 2023
1 parent aa58423 commit e559f6f
Show file tree
Hide file tree
Showing 169 changed files with 14,490 additions and 512 deletions.
6 changes: 6 additions & 0 deletions .github/labeler.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ lib-array-dot:
- any: ["src/lib/array-dot/**/*"]
lib-doctrine-dbal-bulk:
- any: ["src/lib/doctrine-dbal-bulk/**/*"]
lib-parquet:
- any: ["src/lib/parquet/**/*"]
lib-dremel:
- any: ["src/lib/dremel/**/*"]

adapter-amphp:
- any: ["src/adapter/etl-adapter-amphp/**/*"]
Expand All @@ -26,6 +30,8 @@ adapter-doctrine:
- any: ["src/adapter/etl-adapter-doctrine/**/*"]
adapter-elasticsearch:
- any: ["src/adapter/etl-adapter-elasticsearch/**/*"]
adapter-meilisearch:
- any: ["src/adapter/etl-adapter-meilisearch/**/*"]
adapter-google-sheet:
- any: ["src/adapter/etl-adapter-google-sheet/**/*"]
adapter-http:
Expand Down
6 changes: 5 additions & 1 deletion .github/workflows/monorepo-split.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ jobs:
split_repository: 'array-dot'
- local_path: 'src/lib/doctrine-dbal-bulk'
split_repository: 'doctrine-dbal-bulk'
- local_path: 'src/lib/parquet'
split_repository: 'parquet'
- local_path: 'src/lib/dremel'
split_repository: 'dremel'

- local_path: 'src/adapter/etl-adapter-amphp'
split_repository: 'etl-adapter-amphp'
Expand All @@ -40,7 +44,7 @@ jobs:
- local_path: 'src/adapter/etl-adapter-elasticsearch'
split_repository: 'etl-adapter-elasticsearch'
- local_path: 'src/adapter/etl-adapter-meilisearch'
split_repository: 'etl-adapter-meilisearch'
split_repository: 'etl-adapter-meilisearch'
- local_path: 'src/adapter/etl-adapter-google-sheet'
split_repository: 'etl-adapter-google-sheet'
- local_path: 'src/adapter/etl-adapter-http'
Expand Down
22 changes: 15 additions & 7 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
"license": "MIT",
"require": {
"php": "~8.1 || ~8.2",
"ext-bcmath": "*",
"ext-dom": "*",
"ext-hash": "*",
"ext-json": "*",
Expand All @@ -25,7 +26,9 @@
"google/apiclient": "^2.13",
"halaxa/json-machine": "^1.0",
"league/flysystem": "^3.0",
"meilisearch/meilisearch-php": "^1.1",
"monolog/monolog": "^3.0",
"packaged/thrift": "^0.15.0",
"psr/http-client": "^1.0",
"psr/log": "^2.0 || ^3.0",
"psr/simple-cache": "^1.0 || ^2.0 || ^3.0",
Expand All @@ -41,7 +44,6 @@
"laravel/serializable-closure": "^1.1",
"league/flysystem-aws-s3-v3": "^3.0",
"league/flysystem-azure-blob-storage": "^3.0",
"meilisearch/meilisearch-php": "^1.1",
"moneyphp/money": "^4",
"nyholm/psr7": "^1.4",
"php-http/curl-client": "^2.2",
Expand All @@ -56,7 +58,8 @@
"files": [
"build/version.php",
"src/core/etl/src/Flow/ETL/DSL/functions.php",
"src/lib/array-dot/src/Flow/ArrayDot/array_dot.php"
"src/lib/array-dot/src/Flow/ArrayDot/array_dot.php",
"src/lib/parquet/src/Flow/Parquet/functions.php"
],
"psr-4": {
"Flow\\": [
Expand All @@ -77,7 +80,9 @@
"src/adapter/etl-adapter-xml/src/Flow",
"src/core/etl/src/Flow",
"src/lib/array-dot/src/Flow",
"src/lib/doctrine-dbal-bulk/src/Flow"
"src/lib/doctrine-dbal-bulk/src/Flow",
"src/lib/dremel/src/Flow",
"src/lib/parquet/src/Flow"
],
"Flow\\Doctrine\\Bulk\\": [
"src/lib/doctrine-dbal-bulk/src/Flow/Doctrine/Bulk"
Expand All @@ -92,8 +97,8 @@
"Flow\\": [
"src/adapter/etl-adapter-amphp/tests/Flow",
"src/adapter/etl-adapter-avro/tests/Flow",
"src/adapter/etl-adapter-csv/tests/Flow",
"src/adapter/etl-adapter-chartjs/tests/Flow",
"src/adapter/etl-adapter-csv/tests/Flow",
"src/adapter/etl-adapter-doctrine/tests/Flow",
"src/adapter/etl-adapter-elasticsearch/tests/Flow",
"src/adapter/etl-adapter-google-sheet/tests/Flow",
Expand All @@ -107,7 +112,9 @@
"src/adapter/etl-adapter-xml/tests/Flow",
"src/core/etl/tests/Flow",
"src/lib/array-dot/tests/Flow",
"src/lib/doctrine-dbal-bulk/tests/Flow"
"src/lib/doctrine-dbal-bulk/tests/Flow",
"src/lib/dremel/tests/Flow",
"src/lib/parquet/tests/Flow"
],
"Flow\\Doctrine\\Bulk\\Tests\\": [
"src/lib/doctrine-dbal-bulk/tests/Flow/Doctrine/Bulk/Tests"
Expand All @@ -126,11 +133,12 @@
"flow-php/array-dot": "self.version",
"flow-php/doctrine-dbal-bulk": "self.version",
"flow-php/doctrine-dbal-bulk-tools": "self.version",
"flow-php/dremel": "self.version",
"flow-php/etl": "self.version",
"flow-php/etl-adapter-amphp": "self.version",
"flow-php/etl-adapter-avro": "self.version",
"flow-php/etl-adapter-csv": "self.version",
"flow-php/etl-adapter-chartjs": "self.version",
"flow-php/etl-adapter-csv": "self.version",
"flow-php/etl-adapter-dbal-tools": "self.version",
"flow-php/etl-adapter-doctrine": "self.version",
"flow-php/etl-adapter-elasticsearch": "self.version",
Expand All @@ -144,7 +152,7 @@
"flow-php/etl-adapter-reactphp": "self.version",
"flow-php/etl-adapter-text": "self.version",
"flow-php/etl-adapter-xml": "self.version",
"flow-php/etl-tools": "self.version"
"flow-php/parquet": "self.version"
},
"scripts": {
"build": [
Expand Down
Loading

0 comments on commit e559f6f

Please sign in to comment.