Skip to content

Commit

Permalink
add bestEffortDeduplication config option
Browse files Browse the repository at this point in the history
The connector is currently adding an insertId per row, which is used by
BigQuery to dedupe rows that have the same insertId (in a 1 minute
window). Using insertIds throttles the ingestion rate to a maximum of
100k rows per second & 100 MB/s.

Insertions without an insertId disable best effort de-duplication [1],
which increases the ingestion quota to a maximum of 1 GB/s. For high
throughput applications, it's desirable to disable dedupe, handling
duplication on the query side.

[1] https://cloud.google.com/bigquery/streaming-data-into-bigquery#disabling_best_effort_de-duplication

Signed-off-by: Alejandro del Castillo <[email protected]>
  • Loading branch information
Alejandro del Castillo committed Jun 15, 2020
1 parent 2797a48 commit 37bcaa2
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,11 @@ private RowToInsert getRecordRow(SinkRecord record) {
if (config.getBoolean(config.SANITIZE_FIELD_NAME_CONFIG)) {
convertedRecord = FieldNameSanitizer.replaceInvalidKeys(convertedRecord);
}
return RowToInsert.of(getRowId(record), convertedRecord);
if (config.getBoolean(config.BEST_EFFORT_DEDUPLICATION_CONFIG)) {
return RowToInsert.of(getRowId(record), convertedRecord);
} else {
return RowToInsert.of(convertedRecord);
}
}

private String getRowId(SinkRecord record) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,15 @@ public class BigQuerySinkConfig extends AbstractConfig {
private static final String TABLE_CREATE_DOC =
"Automatically create BigQuery tables if they don't already exist";

// Settings for the "bestEffortDeduplication" connector option.
// Property name as it appears in the connector configuration.
public static final String BEST_EFFORT_DEDUPLICATION_CONFIG = "bestEffortDeduplication";
// The option is parsed as a boolean.
private static final ConfigDef.Type BEST_EFFORT_DEDUPLICATION_TYPE = ConfigDef.Type.BOOLEAN;
// Defaults to true, i.e. de-duplication stays enabled unless explicitly turned off.
public static final Boolean BEST_EFFORT_DEDUPLICATION_DEFAULT = true;
private static final ConfigDef.Importance BEST_EFFORT_DEDUPLICATION_IMPORTANCE =
ConfigDef.Importance.MEDIUM;
// Help text passed to the ConfigDef definition of this option.
private static final String BEST_EFFORT_DEDUPLICATION_DOC =
"If false, Big Query best effort de-duplication will be disabled, which increases "
+ "the streaming ingest quota, at the expense of not checking for duplicates";

static {
config = new ConfigDef()
.define(
Expand Down Expand Up @@ -365,7 +374,13 @@ public class BigQuerySinkConfig extends AbstractConfig {
TABLE_CREATE_DEFAULT,
TABLE_CREATE_IMPORTANCE,
TABLE_CREATE_DOC
);
).define(
BEST_EFFORT_DEDUPLICATION_CONFIG,
BEST_EFFORT_DEDUPLICATION_TYPE,
BEST_EFFORT_DEDUPLICATION_DEFAULT,
BEST_EFFORT_DEDUPLICATION_IMPORTANCE,
BEST_EFFORT_DEDUPLICATION_DOC
);
}
/**
* Throw an exception if the passed-in properties do not constitute a valid sink.
Expand Down

0 comments on commit 37bcaa2

Please sign in to comment.