ORC: Add configurable write properties (apache#3810)
hililiwei authored Mar 3, 2022
1 parent 3dc245b commit 79c8978
Showing 5 changed files with 212 additions and 0 deletions.
1 change: 1 addition & 0 deletions build.gradle
@@ -472,6 +472,7 @@ project(':iceberg-orc') {
}

testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts')
testImplementation project(':iceberg-common')
}
}

8 changes: 8 additions & 0 deletions core/src/main/java/org/apache/iceberg/TableProperties.java
@@ -135,6 +135,14 @@ private TableProperties() {
public static final String DELETE_AVRO_COMPRESSION_LEVEL = "write.delete.avro.compression-level";
public static final String AVRO_COMPRESSION_LEVEL_DEFAULT = null;

public static final String ORC_STRIPE_SIZE_BYTES = "write.orc.stripe-size-bytes";
public static final String DELETE_ORC_STRIPE_SIZE_BYTES = "write.delete.orc.stripe-size-bytes";
public static final long ORC_STRIPE_SIZE_BYTES_DEFAULT = 64L * 1024 * 1024; // 64 MB

public static final String ORC_BLOCK_SIZE_BYTES = "write.orc.block-size-bytes";
public static final String DELETE_ORC_BLOCK_SIZE_BYTES = "write.delete.orc.block-size-bytes";
public static final long ORC_BLOCK_SIZE_BYTES_DEFAULT = 256L * 1024 * 1024; // 256 MB

public static final String SPLIT_SIZE = "read.split.target-size";
public static final long SPLIT_SIZE_DEFAULT = 128 * 1024 * 1024; // 128 MB

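The new constants mirror the existing Parquet and Avro write options. A minimal sketch (the 32 MB / 128 MB values are illustrative, not the defaults) of setting them on an existing table through the standard UpdateProperties API:

// Illustrative only: override the ORC stripe and block sizes for a table.
table.updateProperties()
    .set(TableProperties.ORC_STRIPE_SIZE_BYTES, String.valueOf(32L * 1024 * 1024))   // 32 MB stripes
    .set(TableProperties.ORC_BLOCK_SIZE_BYTES, String.valueOf(128L * 1024 * 1024))   // 128 MB blocks
    .commit();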
2 changes: 2 additions & 0 deletions docs/versioned/tables/configuration.md
@@ -52,6 +52,8 @@ Iceberg tables support table properties to configure table behavior, like the de
| write.parquet.compression-level | null | Parquet compression level |
| write.avro.compression-codec | gzip | Avro compression codec: gzip (deflate with level 9), snappy, uncompressed |
| write.avro.compression-level | null | Avro compression level |
| write.orc.stripe-size-bytes | 67108864 (64 MB) | Define the default ORC stripe size, in bytes |
| write.orc.block-size-bytes | 268435456 (256 MB) | Define the default file system block size for ORC files |
| write.location-provider.impl | null | Optional custom implementation for LocationProvider |
| write.metadata.compression-codec | none | Metadata compression codec; none or gzip |
| write.metadata.metrics.default | truncate(16) | Default metrics mode for all columns in the table; none, counts, truncate(length), or full |
71 changes: 71 additions & 0 deletions orc/src/main/java/org/apache/iceberg/orc/ORC.java
@@ -37,6 +37,7 @@
import org.apache.iceberg.SortOrder;
import org.apache.iceberg.StructLike;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.data.orc.GenericOrcWriter;
import org.apache.iceberg.data.orc.GenericOrcWriters;
import org.apache.iceberg.deletes.EqualityDeleteWriter;
@@ -82,6 +83,7 @@ public static class WriteBuilder {
private BiFunction<Schema, TypeDescription, OrcRowWriter<?>> createWriterFunc;
private Map<String, byte[]> metadata = Maps.newHashMap();
private MetricsConfig metricsConfig;
private Function<Configuration, Context> createContextFunc = Context::dataContext;

private WriteBuilder(OutputFile file) {
this.file = file;
@@ -92,6 +94,13 @@ private WriteBuilder(OutputFile file) {
}
}

public WriteBuilder forTable(Table table) {
schema(table.schema());
setAll(table.properties());
metricsConfig(MetricsConfig.forTable(table));
return this;
}
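
The forTable(Table) convenience method bundles the three calls a writer normally needs. A usage sketch (assuming an already-loaded Table and a local output file, mirroring the new test below):

// Sketch: forTable pulls the schema, table properties, and metrics config in one call.
FileAppender<Record> appender = ORC.write(Files.localOutput(outputFile))
    .forTable(table)
    .createWriterFunc(GenericOrcWriter::buildWriter)
    .build();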

public WriteBuilder metadata(String property, String value) {
metadata.put(property, value.getBytes(StandardCharsets.UTF_8));
return this;
@@ -143,12 +152,71 @@ public WriteBuilder metricsConfig(MetricsConfig newMetricsConfig) {
return this;
}

// intended to always stay private; used strictly by the data and delete write builders
private WriteBuilder createContextFunc(Function<Configuration, Context> newCreateContextFunc) {
this.createContextFunc = newCreateContextFunc;
return this;
}

public <D> FileAppender<D> build() {
Preconditions.checkNotNull(schema, "Schema is required");

// Map Iceberg properties to pass down to the ORC writer
Context context = createContextFunc.apply(conf);
long stripeSize = context.stripeSize();
long blockSize = context.blockSize();

conf.setLong(OrcConf.STRIPE_SIZE.getAttribute(), stripeSize);
conf.setLong(OrcConf.BLOCK_SIZE.getAttribute(), blockSize);

return new OrcFileAppender<>(schema,
this.file, createWriterFunc, conf, metadata,
conf.getInt(VECTOR_ROW_BATCH_SIZE, VectorizedRowBatch.DEFAULT_SIZE), metricsConfig);
}

private static class Context {
private final long stripeSize;
private final long blockSize;

public long stripeSize() {
return stripeSize;
}

public long blockSize() {
return blockSize;
}

private Context(long stripeSize, long blockSize) {
this.stripeSize = stripeSize;
this.blockSize = blockSize;
}

static Context dataContext(Configuration conf) {
long stripeSize = TableProperties.ORC_STRIPE_SIZE_BYTES_DEFAULT;
stripeSize = conf.getLong(OrcConf.STRIPE_SIZE.getAttribute(), stripeSize);
stripeSize = conf.getLong(OrcConf.STRIPE_SIZE.getHiveConfName(), stripeSize);
stripeSize = conf.getLong(TableProperties.ORC_STRIPE_SIZE_BYTES, stripeSize);

long blockSize = TableProperties.ORC_BLOCK_SIZE_BYTES_DEFAULT;
blockSize = conf.getLong(OrcConf.BLOCK_SIZE.getAttribute(), blockSize);
blockSize = conf.getLong(OrcConf.BLOCK_SIZE.getHiveConfName(), blockSize);
blockSize = conf.getLong(TableProperties.ORC_BLOCK_SIZE_BYTES, blockSize);

return new Context(stripeSize, blockSize);
}

static Context deleteContext(Configuration conf) {
Context dataContext = dataContext(conf);

long stripeSize =
conf.getLong(TableProperties.DELETE_ORC_STRIPE_SIZE_BYTES, dataContext.stripeSize());

long blockSize =
conf.getLong(TableProperties.DELETE_ORC_BLOCK_SIZE_BYTES, dataContext.blockSize());

return new Context(stripeSize, blockSize);
}
}
}
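
Note the resolution order in Context.dataContext: each conf.getLong call feeds the previous result in as the default, so the Iceberg table property (read last) overrides the Hive-style name, which overrides the native ORC attribute, which overrides the built-in 64 MB / 256 MB defaults. A hedged illustration with made-up values:

// Illustrative precedence check; all values here are hypothetical.
Configuration conf = new Configuration();
conf.setLong(OrcConf.STRIPE_SIZE.getAttribute(), 16L * 1024 * 1024);     // native ORC attribute
conf.setLong(OrcConf.STRIPE_SIZE.getHiveConfName(), 24L * 1024 * 1024);  // legacy Hive name
conf.setLong(TableProperties.ORC_STRIPE_SIZE_BYTES, 32L * 1024 * 1024);  // Iceberg property
// dataContext(conf).stripeSize() resolves to 32 MB: the Iceberg property wins.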

public static DataWriteBuilder writeData(OutputFile file) {
@@ -365,6 +433,7 @@ public <T> EqualityDeleteWriter<T> buildEqualityWriter() {
// the appender uses the row schema without extra columns
appenderBuilder.schema(rowSchema);
appenderBuilder.createWriterFunc(createWriterFunc);
appenderBuilder.createContextFunc(WriteBuilder.Context::deleteContext);

return new EqualityDeleteWriter<>(
appenderBuilder.build(), FileFormat.ORC, location, spec, partition, keyMetadata,
@@ -395,6 +464,8 @@ public <T> PositionDeleteWriter<T> buildPositionWriter() {
GenericOrcWriter.buildWriter(schema, typeDescription), Function.identity()));
}

appenderBuilder.createContextFunc(WriteBuilder.Context::deleteContext);

return new PositionDeleteWriter<>(
appenderBuilder.build(), FileFormat.ORC, location, spec, partition, keyMetadata);
}
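Both delete writers switch the appender builder to Context::deleteContext, so the write.delete.orc.* properties apply to delete files and fall back to the resolved data-file values when unset. A sketch (illustrative value) of tuning delete files independently:

// Illustrative: smaller stripes for typically-small delete files;
// block size is left unset, so it falls back to the data-file block size.
table.updateProperties()
    .set(TableProperties.DELETE_ORC_STRIPE_SIZE_BYTES, String.valueOf(16L * 1024 * 1024))
    .commit();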
130 changes: 130 additions & 0 deletions orc/src/test/java/org/apache/iceberg/orc/TestTableProperties.java
@@ -0,0 +1,130 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.iceberg.orc;

import java.io.File;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.Files;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.common.DynFields;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.data.orc.GenericOrcWriter;
import org.apache.iceberg.deletes.EqualityDeleteWriter;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.io.FileAppender;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.types.Types;
import org.apache.orc.OrcConf;
import org.junit.Assert;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;

public class TestTableProperties {

public static final Schema SCHEMA = new Schema(
Types.NestedField.optional(1, "id", Types.IntegerType.get()),
Types.NestedField.optional(2, "data", Types.StringType.get())
);

@ClassRule
public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder();

@Test
public void testOrcTableProperties() throws Exception {
Long stripeSizeBytes = 32L * 1024 * 1024;
Long blockSizeBytes = 128L * 1024 * 1024;

ImmutableMap<String, String> properties = ImmutableMap.of(
TableProperties.ORC_STRIPE_SIZE_BYTES, String.valueOf(stripeSizeBytes),
TableProperties.ORC_BLOCK_SIZE_BYTES, String.valueOf(blockSizeBytes),
TableProperties.DEFAULT_FILE_FORMAT, FileFormat.ORC.name());

File folder = TEMPORARY_FOLDER.newFolder();

String warehouse = folder.getAbsolutePath();
String tablePath = warehouse.concat("/test");
Assert.assertTrue("Should create the table path correctly.", new File(tablePath).mkdir());

PartitionSpec spec = PartitionSpec.unpartitioned();
Table table = new HadoopTables().create(SCHEMA, spec, properties, tablePath);

File testFile = TEMPORARY_FOLDER.newFile();
Assert.assertTrue(testFile.delete());

FileAppender<Record> writer = ORC.write(Files.localOutput(testFile))
.forTable(table)
.createWriterFunc(GenericOrcWriter::buildWriter)
.build();

DynFields.BoundField<Configuration> confField =
DynFields.builder().hiddenImpl(writer.getClass(), "conf").build(writer);

Configuration configuration = confField.get();
Assert.assertEquals(String.valueOf(blockSizeBytes), configuration.get(OrcConf.BLOCK_SIZE.getAttribute()));
Assert.assertEquals(String.valueOf(stripeSizeBytes), configuration.get(OrcConf.STRIPE_SIZE.getAttribute()));
Assert.assertEquals(FileFormat.ORC.name(), configuration.get(TableProperties.DEFAULT_FILE_FORMAT));
}

@Test
public void testOrcTableDeleteProperties() throws Exception {
Long stripeSizeBytes = 32L * 1024 * 1024;
Long blockSizeBytes = 128L * 1024 * 1024;

ImmutableMap<String, String> properties = ImmutableMap.of(
TableProperties.DELETE_ORC_STRIPE_SIZE_BYTES, String.valueOf(stripeSizeBytes),
TableProperties.DELETE_ORC_BLOCK_SIZE_BYTES, String.valueOf(blockSizeBytes),
TableProperties.DEFAULT_FILE_FORMAT, FileFormat.ORC.name());

File folder = TEMPORARY_FOLDER.newFolder();

String warehouse = folder.getAbsolutePath();
String tablePath = warehouse.concat("/test");
Assert.assertTrue("Should create the table path correctly.", new File(tablePath).mkdir());

PartitionSpec spec = PartitionSpec.unpartitioned();
Table table = new HadoopTables().create(SCHEMA, spec, properties, tablePath);

File testFile = TEMPORARY_FOLDER.newFile();
Assert.assertTrue(testFile.delete());

EqualityDeleteWriter<Object> deleteWriter = ORC.writeDeletes(Files.localOutput(testFile))
.forTable(table)
.equalityFieldIds(1)
.createWriterFunc(GenericOrcWriter::buildWriter)
.buildEqualityWriter();

DynFields.BoundField<OrcFileAppender<Record>> writer =
DynFields.builder().hiddenImpl(deleteWriter.getClass(), "appender").build(deleteWriter);

OrcFileAppender<Record> orcFileAppender = writer.get();
DynFields.BoundField<Configuration> confField =
DynFields.builder().hiddenImpl(orcFileAppender.getClass(), "conf").build(orcFileAppender);

Configuration configuration = confField.get();
Assert.assertEquals(String.valueOf(blockSizeBytes), configuration.get(OrcConf.BLOCK_SIZE.getAttribute()));
Assert.assertEquals(String.valueOf(stripeSizeBytes), configuration.get(OrcConf.STRIPE_SIZE.getAttribute()));
Assert.assertEquals(FileFormat.ORC.name(), configuration.get(TableProperties.DEFAULT_FILE_FORMAT));
}
}
