Skip to content

Commit

Permalink
Parquet - Change the way of how dictionary page header is detected wh…
Browse files Browse the repository at this point in the history
…ile reading column chunks (#1005)

* Parquet - Change how the dictionary page header is detected while reading column chunks

* Fixed reading column statistics
  • Loading branch information
norberttech authored Mar 6, 2024
1 parent 85cbeb0 commit 8da52fd
Show file tree
Hide file tree
Showing 5 changed files with 38 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -33,22 +33,23 @@ public function read(ColumnChunk $columnChunk, FlatColumn $column, $stream) : \G

\fseek($stream, $offset);

if ($columnChunk->dictionaryPageOffset()) {
$dictionaryHeader = $this->readHeader($stream, $offset);
$firstHeader = $this->readHeader($stream, $offset);

if ($dictionaryHeader === null) {
throw new RuntimeException('Dictionary page header not found in column chunk under offset: ' . $offset);
}
if ($firstHeader === null) {
throw new RuntimeException('Cannot read first page header');
}

if ($firstHeader->type()->isDictionaryPage()) {
$dictionary = $this->pageReader->readDictionary(
$column,
$dictionaryHeader,
$firstHeader,
$columnChunk->codec(),
$stream
);
$offset = \ftell($stream);
} else {
$dictionary = null;
\fseek($stream, $offset);
}

$columnData = ColumnData::initialize($column);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,9 @@ public function isDataPage() : bool
{
return $this->value === self::DATA_PAGE->value || $this->value === self::DATA_PAGE_V2->value;
}

/**
 * Tells whether this page type is the dictionary page type.
 *
 * Enum cases are singletons, so comparing the case itself is equivalent
 * to comparing the backing values.
 */
public function isDictionaryPage() : bool
{
    return $this === self::DICTIONARY_PAGE;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

use Flow\Parquet\BinaryReader\BinaryBufferReader;
use Flow\Parquet\ParquetFile\Data\PlainValueUnpacker;
use Flow\Parquet\ParquetFile\Schema\ColumnPrimitiveType;
use Flow\Parquet\ParquetFile\Schema\FlatColumn;
use Flow\Parquet\ParquetFile\Statistics;

Expand All @@ -24,6 +25,10 @@ public function max(FlatColumn $column) : mixed
return null;
}

if (ColumnPrimitiveType::isString($column) && \mb_check_encoding($this->statistics->max, 'UTF-8')) {
return $this->statistics->max;
}

return (new PlainValueUnpacker((new BinaryBufferReader($this->statistics->max))))->unpack($column, 1)[0];
}

Expand All @@ -33,6 +38,10 @@ public function maxValue(FlatColumn $column) : mixed
return null;
}

if (ColumnPrimitiveType::isString($column) && \mb_check_encoding($this->statistics->maxValue, 'UTF-8')) {
return $this->statistics->maxValue;
}

return (new PlainValueUnpacker((new BinaryBufferReader($this->statistics->maxValue))))->unpack($column, 1)[0];
}

Expand All @@ -42,6 +51,10 @@ public function min(FlatColumn $column) : mixed
return null;
}

if (ColumnPrimitiveType::isString($column) && \mb_check_encoding($this->statistics->min, 'UTF-8')) {
return $this->statistics->min;
}

return (new PlainValueUnpacker((new BinaryBufferReader($this->statistics->min))))->unpack($column, 1)[0];
}

Expand All @@ -51,6 +64,10 @@ public function minValue(FlatColumn $column) : mixed
return null;
}

if (ColumnPrimitiveType::isString($column) && \mb_check_encoding($this->statistics->minValue, 'UTF-8')) {
return $this->statistics->minValue;
}

return (new PlainValueUnpacker((new BinaryBufferReader($this->statistics->minValue))))->unpack($column, 1)[0];
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ public static function isString(FlatColumn $column) : bool
$logicalType = $column->logicalType();

if ($logicalType === null) {
if ($column->convertedType() === ConvertedType::UTF8) {
return true;
}

return false;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,11 @@ public static function uuid(string $string, Repetition $repetition = Repetition:
return new self($string, PhysicalType::BYTE_ARRAY, null, LogicalType::uuid(), $repetition);
}

/**
 * Returns the value of this column's convertedType property, which may be null.
 */
public function convertedType() : ?ConvertedType
{
return $this->convertedType;
}

/**
* @psalm-suppress PossiblyNullOperand
*/
Expand Down

0 comments on commit 8da52fd

Please sign in to comment.