Parquet: Add readers and writers for the internal object model #11904
Merged
Changes from all commits (9 commits):

- b0658f9 Parquet: Refactor BaseParquetWriter (ajantha-bhat)
- 9dfca7d Parquet: Refactor BaseParquetReaders (ajantha-bhat)
- 868cc50 Parquet: Add internal writer and reader (ajantha-bhat)
- cb68af4 Address comments (ajantha-bhat)
- 2f210d5 Address new comments (ajantha-bhat)
- 0951235 Address latest comments (ajantha-bhat)
- d5fb4ca Handle remaining comments (ajantha-bhat)
- 3b7ba50 Fix type params and time reader. (rdblue)
- 20f7c26 Apply spotless (ajantha-bhat)
File: BaseParquetReaders.java (package org.apache.iceberg.data.parquet)

```diff
@@ -18,19 +18,9 @@
  */
 package org.apache.iceberg.data.parquet;
 
-import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
-import java.time.Instant;
-import java.time.LocalDate;
-import java.time.LocalDateTime;
-import java.time.LocalTime;
-import java.time.OffsetDateTime;
-import java.time.ZoneOffset;
-import java.time.temporal.ChronoUnit;
 import java.util.List;
 import java.util.Map;
 import java.util.Optional;
-import java.util.concurrent.TimeUnit;
 import org.apache.iceberg.MetadataColumns;
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.parquet.ParquetSchemaUtil;
```
```diff
@@ -50,6 +40,10 @@
 import org.apache.parquet.schema.PrimitiveType;
 import org.apache.parquet.schema.Type;
 
+/**
+ * @deprecated since 1.8.0, will be made package-private in 1.9.0
+ */
+@Deprecated
 public abstract class BaseParquetReaders<T> {
   protected BaseParquetReaders() {}
 
```
```diff
@@ -76,6 +70,46 @@ protected ParquetValueReader<T> createReader(
   protected abstract ParquetValueReader<T> createStructReader(
       List<Type> types, List<ParquetValueReader<?>> fieldReaders, Types.StructType structType);
 
+  protected ParquetValueReader<?> fixedReader(ColumnDescriptor desc) {
+    return new GenericParquetReaders.FixedReader(desc);
+  }
+
+  protected ParquetValueReader<?> dateReader(ColumnDescriptor desc) {
+    return new GenericParquetReaders.DateReader(desc);
+  }
+
+  protected ParquetValueReader<?> timeReader(
+      ColumnDescriptor desc, LogicalTypeAnnotation.TimeUnit unit) {
+    switch (unit) {
+      case MICROS:
+        return new GenericParquetReaders.TimeReader(desc);
+      case MILLIS:
+        return new GenericParquetReaders.TimeMillisReader(desc);
+      default:
+        throw new UnsupportedOperationException("Unsupported unit for time: " + unit);
+    }
+  }
+
+  protected ParquetValueReader<?> timestampReader(
+      ColumnDescriptor desc, LogicalTypeAnnotation.TimeUnit unit, boolean isAdjustedToUTC) {
+    if (desc.getPrimitiveType().getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.INT96) {
+      return new GenericParquetReaders.TimestampInt96Reader(desc);
+    }
+
+    switch (unit) {
+      case MICROS:
+        return isAdjustedToUTC
+            ? new GenericParquetReaders.TimestamptzReader(desc)
+            : new GenericParquetReaders.TimestampReader(desc);
+      case MILLIS:
+        return isAdjustedToUTC
+            ? new GenericParquetReaders.TimestamptzMillisReader(desc)
+            : new GenericParquetReaders.TimestampMillisReader(desc);
+      default:
+        throw new UnsupportedOperationException("Unsupported unit for timestamp: " + unit);
+    }
+  }
+
+  protected Object convertConstant(org.apache.iceberg.types.Type type, Object value) {
+    return value;
+  }
```
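These protected hooks are the point of the refactor: the generic reader keeps its java.time-based defaults (now delegated to GenericParquetReaders), while a reader for a different object model can override individual factories without re-implementing the schema visitor. Below is a minimal sketch of such a subclass; the class name `MicrosTimestampReaders` is hypothetical and not part of this PR, while `ParquetValueReaders.UnboxedReader` and the overridden signatures are taken from the existing code and this diff.

```java
import java.util.List;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.parquet.ParquetValueReader;
import org.apache.iceberg.parquet.ParquetValueReaders;
import org.apache.iceberg.types.Types;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.parquet.schema.Type;

// Hypothetical subclass, for illustration only: overrides one factory hook so
// MICROS timestamps come back as raw long values instead of java.time objects.
public class MicrosTimestampReaders extends BaseParquetReaders<Record> {

  @Override
  protected ParquetValueReader<Record> createStructReader(
      List<Type> types, List<ParquetValueReader<?>> fieldReaders, Types.StructType structType) {
    // Struct materialization is model-specific and omitted from this sketch.
    throw new UnsupportedOperationException("sketch only");
  }

  @Override
  protected ParquetValueReader<?> timestampReader(
      ColumnDescriptor desc, LogicalTypeAnnotation.TimeUnit unit, boolean isAdjustedToUTC) {
    if (unit == LogicalTypeAnnotation.TimeUnit.MICROS) {
      // Read the column's long (microseconds from the epoch) directly.
      return new ParquetValueReaders.UnboxedReader<>(desc);
    }
    // Fall back to the defaults for MILLIS and INT96 columns.
    return super.timestampReader(desc, unit, isAdjustedToUTC);
  }
}
```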
```diff
@@ -164,37 +198,23 @@ public Optional<ParquetValueReader<?>> visit(DecimalLogicalTypeAnnotation decima
     @Override
     public Optional<ParquetValueReader<?>> visit(
         LogicalTypeAnnotation.DateLogicalTypeAnnotation dateLogicalType) {
-      return Optional.of(new DateReader(desc));
+      return Optional.of(dateReader(desc));
     }
 
     @Override
     public Optional<ParquetValueReader<?>> visit(
         LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) {
-      if (timeLogicalType.getUnit() == LogicalTypeAnnotation.TimeUnit.MICROS) {
-        return Optional.of(new TimeReader(desc));
-      } else if (timeLogicalType.getUnit() == LogicalTypeAnnotation.TimeUnit.MILLIS) {
-        return Optional.of(new TimeMillisReader(desc));
-      }
-
-      return Optional.empty();
+      return Optional.of(timeReader(desc, timeLogicalType.getUnit()));
     }
 
     @Override
     public Optional<ParquetValueReader<?>> visit(
         LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) {
-      if (timestampLogicalType.getUnit() == LogicalTypeAnnotation.TimeUnit.MICROS) {
-        Types.TimestampType tsMicrosType = (Types.TimestampType) expected;
-        return tsMicrosType.shouldAdjustToUTC()
-            ? Optional.of(new TimestamptzReader(desc))
-            : Optional.of(new TimestampReader(desc));
-      } else if (timestampLogicalType.getUnit() == LogicalTypeAnnotation.TimeUnit.MILLIS) {
-        Types.TimestampType tsMillisType = (Types.TimestampType) expected;
-        return tsMillisType.shouldAdjustToUTC()
-            ? Optional.of(new TimestamptzMillisReader(desc))
-            : Optional.of(new TimestampMillisReader(desc));
-      }
-
-      return LogicalTypeAnnotation.LogicalTypeAnnotationVisitor.super.visit(timestampLogicalType);
+      return Optional.of(
+          timestampReader(
+              desc,
+              timestampLogicalType.getUnit(),
+              ((Types.TimestampType) expected).shouldAdjustToUTC()));
     }
 
     @Override
```
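One behavioral consequence of this simplification is worth calling out explicitly, since it is exactly the kind of change the review comment further down asks authors to flag: the old TIME visitor fell back to `Optional.empty()` for units other than MICROS and MILLIS, whereas `timeReader` (and `timestampReader`, for an unknown unit) now throws `UnsupportedOperationException`, so unsupported units fail fast instead of falling through to the raw primitive-type switch.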
```diff
@@ -219,6 +239,12 @@ public Optional<ParquetValueReader<?>> visit(
         LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonLogicalType) {
       return Optional.of(new ParquetValueReaders.BytesReader(desc));
     }
+
+    @Override
+    public Optional<ParquetValueReader<?>> visit(
+        LogicalTypeAnnotation.UUIDLogicalTypeAnnotation uuidLogicalType) {
+      return Optional.of(ParquetValueReaders.uuids(desc));
+    }
   }
 
   private class ReadBuilder extends TypeWithSchemaVisitor<ParquetValueReader<?>> {
```
```diff
@@ -359,7 +385,7 @@ public ParquetValueReader<?> primitive(
 
       ColumnDescriptor desc = type.getColumnDescription(currentPath());
 
-      if (primitive.getOriginalType() != null) {
+      if (primitive.getLogicalTypeAnnotation() != null) {
        return primitive
            .getLogicalTypeAnnotation()
            .accept(new LogicalTypeAnnotationParquetValueReaderVisitor(desc, expected, primitive))
```

Review comment on the changed condition: I agree with this change, but please point these kinds of changes out for reviewers. The old version worked because all of the supported logical type annotations had an equivalent …
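The reviewer's point can be made concrete: the legacy "original type" (ConvertedType) system has no entry for some newer logical types, so gating on `getOriginalType()` silently skipped them. A small illustration, assuming Parquet's standard schema builder (`org.apache.parquet.schema.Types`) and its UUID annotation, which is exactly the case the new `UUIDLogicalTypeAnnotation` visitor above handles:

```java
import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Types;

public class UuidAnnotationDemo {
  public static void main(String[] args) {
    // A 16-byte fixed column annotated as UUID.
    PrimitiveType uuidCol =
        Types.required(PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY)
            .length(16)
            .as(LogicalTypeAnnotation.uuidType())
            .named("uuid_col");

    // UUID has no legacy ConvertedType equivalent, so the old
    // getOriginalType() != null check would have skipped this column.
    System.out.println(uuidCol.getOriginalType());          // null
    System.out.println(uuidCol.getLogicalTypeAnnotation()); // UUID annotation
  }
}
```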
```diff
@@ -371,7 +397,7 @@ public ParquetValueReader<?> primitive(
 
       switch (primitive.getPrimitiveTypeName()) {
         case FIXED_LEN_BYTE_ARRAY:
-          return new FixedReader(desc);
+          return fixedReader(desc);
         case BINARY:
           if (expected.typeId() == org.apache.iceberg.types.Type.TypeID.STRING) {
             return new ParquetValueReaders.StringReader(desc);
```
```diff
@@ -397,7 +423,7 @@ public ParquetValueReader<?> primitive(
         case INT96:
           // Impala & Spark used to write timestamps as INT96 without a logical type. For backwards
          // compatibility we try to read INT96 as timestamps.
-          return new TimestampInt96Reader(desc);
+          return timestampReader(desc, LogicalTypeAnnotation.TimeUnit.NANOS, true);
         default:
           throw new UnsupportedOperationException("Unsupported type: " + primitive);
       }
```
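A subtlety in this hunk: the NANOS unit passed here is effectively a placeholder, because `timestampReader` (shown earlier) checks the column's physical type first and returns the INT96 reader before ever reaching the unit switch, which would otherwise throw for NANOS.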
```diff
@@ -407,124 +433,4 @@ MessageType type() {
       return type;
     }
   }
-
-  private static final OffsetDateTime EPOCH = Instant.ofEpochSecond(0).atOffset(ZoneOffset.UTC);
-  private static final LocalDate EPOCH_DAY = EPOCH.toLocalDate();
-
-  private static class DateReader extends ParquetValueReaders.PrimitiveReader<LocalDate> {
-    private DateReader(ColumnDescriptor desc) {
-      super(desc);
-    }
-
-    @Override
-    public LocalDate read(LocalDate reuse) {
-      return EPOCH_DAY.plusDays(column.nextInteger());
-    }
-  }
-
-  private static class TimestampReader extends ParquetValueReaders.PrimitiveReader<LocalDateTime> {
-    private TimestampReader(ColumnDescriptor desc) {
-      super(desc);
-    }
-
-    @Override
-    public LocalDateTime read(LocalDateTime reuse) {
-      return EPOCH.plus(column.nextLong(), ChronoUnit.MICROS).toLocalDateTime();
-    }
-  }
-
-  private static class TimestampMillisReader
-      extends ParquetValueReaders.PrimitiveReader<LocalDateTime> {
-    private TimestampMillisReader(ColumnDescriptor desc) {
-      super(desc);
-    }
-
-    @Override
-    public LocalDateTime read(LocalDateTime reuse) {
-      return EPOCH.plus(column.nextLong() * 1000, ChronoUnit.MICROS).toLocalDateTime();
-    }
-  }
-
-  private static class TimestampInt96Reader
-      extends ParquetValueReaders.PrimitiveReader<OffsetDateTime> {
-    private static final long UNIX_EPOCH_JULIAN = 2_440_588L;
-
-    private TimestampInt96Reader(ColumnDescriptor desc) {
-      super(desc);
-    }
-
-    @Override
-    public OffsetDateTime read(OffsetDateTime reuse) {
-      final ByteBuffer byteBuffer =
-          column.nextBinary().toByteBuffer().order(ByteOrder.LITTLE_ENDIAN);
-      final long timeOfDayNanos = byteBuffer.getLong();
-      final int julianDay = byteBuffer.getInt();
-
-      return Instant.ofEpochMilli(TimeUnit.DAYS.toMillis(julianDay - UNIX_EPOCH_JULIAN))
-          .plusNanos(timeOfDayNanos)
-          .atOffset(ZoneOffset.UTC);
-    }
-  }
-
-  private static class TimestamptzReader
-      extends ParquetValueReaders.PrimitiveReader<OffsetDateTime> {
-    private TimestamptzReader(ColumnDescriptor desc) {
-      super(desc);
-    }
-
-    @Override
-    public OffsetDateTime read(OffsetDateTime reuse) {
-      return EPOCH.plus(column.nextLong(), ChronoUnit.MICROS);
-    }
-  }
-
-  private static class TimestamptzMillisReader
-      extends ParquetValueReaders.PrimitiveReader<OffsetDateTime> {
-    private TimestamptzMillisReader(ColumnDescriptor desc) {
-      super(desc);
-    }
-
-    @Override
-    public OffsetDateTime read(OffsetDateTime reuse) {
-      return EPOCH.plus(column.nextLong() * 1000, ChronoUnit.MICROS);
-    }
-  }
-
-  private static class TimeMillisReader extends ParquetValueReaders.PrimitiveReader<LocalTime> {
-    private TimeMillisReader(ColumnDescriptor desc) {
-      super(desc);
-    }
-
-    @Override
-    public LocalTime read(LocalTime reuse) {
-      return LocalTime.ofNanoOfDay(column.nextLong() * 1000000L);
-    }
-  }
-
-  private static class TimeReader extends ParquetValueReaders.PrimitiveReader<LocalTime> {
-    private TimeReader(ColumnDescriptor desc) {
-      super(desc);
-    }
-
-    @Override
-    public LocalTime read(LocalTime reuse) {
-      return LocalTime.ofNanoOfDay(column.nextLong() * 1000L);
-    }
-  }
-
-  private static class FixedReader extends ParquetValueReaders.PrimitiveReader<byte[]> {
-    private FixedReader(ColumnDescriptor desc) {
-      super(desc);
-    }
-
-    @Override
-    public byte[] read(byte[] reuse) {
-      if (reuse != null) {
-        column.nextBinary().toByteBuffer().duplicate().get(reuse);
-        return reuse;
-      } else {
-        return column.nextBinary().getBytes();
-      }
-    }
-  }
 }
```
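For reference, the INT96 decoding removed above (now delegated to GenericParquetReaders.TimestampInt96Reader) packs a timestamp as nanos-of-day followed by a Julian day number. A small self-contained example of the same arithmetic, with illustrative values chosen for this sketch:

```java
import java.time.Instant;
import java.time.OffsetDateTime;
import java.time.ZoneOffset;
import java.util.concurrent.TimeUnit;

public class Int96Decode {
  // Julian day number of 1970-01-01, as in the removed reader.
  private static final long UNIX_EPOCH_JULIAN = 2_440_588L;

  public static void main(String[] args) {
    // An INT96 value stores 8 bytes of nanos-of-day followed by a 4-byte
    // Julian day, both little-endian; assume both have already been read.
    long timeOfDayNanos = 45_296_000_000_000L; // 12:34:56 as nanos of day
    int julianDay = 2_459_581;                 // 2022-01-01 (18,993 days after epoch)

    OffsetDateTime ts =
        Instant.ofEpochMilli(TimeUnit.DAYS.toMillis(julianDay - UNIX_EPOCH_JULIAN))
            .plusNanos(timeOfDayNanos)
            .atOffset(ZoneOffset.UTC);

    System.out.println(ts); // 2022-01-01T12:34:56Z
  }
}
```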
Review comment: Addressing the nit from #11919
Review comment: How about `elementSupplier` instead? I think that's more accurate than `elements`, for two reasons. First, the argument is not an element; it is a function that produces an element. Second, there is only one supplier, so the name shouldn't be plural.
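The suggestion is a general naming point: when a parameter is a function that produces values rather than the values themselves, name it for the function. A hypothetical sketch (none of these names are from the PR):

```java
import java.util.function.Supplier;

// Hypothetical illustration of the suggested rename: the field holds a
// function that produces an element, and there is exactly one of it,
// so "elementSupplier" fits where the plural "elements" misleads.
class RepeatedValue<E> {
  private final Supplier<E> elementSupplier; // was: elements

  RepeatedValue(Supplier<E> elementSupplier) {
    this.elementSupplier = elementSupplier;
  }

  E next() {
    return elementSupplier.get(); // invoked once per element produced
  }
}
```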