Skip to content

Commit

Permalink
fix(ingest/bigquery): use correct row count in null count profiling calculation (datahub-project#9123)
Browse files Browse the repository at this point in the history

Co-authored-by: Harshal Sheth <[email protected]>
Co-authored-by: Aseem Bansal <[email protected]>
  • Loading branch information
3 people authored Nov 2, 2023
1 parent bab9d1c commit 12b4171
Showing 1 changed file with 11 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -659,7 +659,16 @@ def generate_dataset_profile( # noqa: C901 (complexity)
self.query_combiner.flush()

assert profile.rowCount is not None
row_count: int = profile.rowCount
row_count: int # used for null counts calculation
if profile.partitionSpec and "SAMPLE" in profile.partitionSpec.partition:
# Alternatively, `self._get_dataset_rows(profile)` could be used to get the
# exact number of rows in the sample, since the number of rows actually
# sampled may differ slightly (more or less) from the configured `sample_size`.
# We do not do so for now, because it adds the overhead of another query,
# and approximate metrics should suffice for sampling-based profiling.
row_count = self.config.sample_size
else:
row_count = profile.rowCount

for column_spec in columns_profiling_queue:
column = column_spec.column
Expand Down Expand Up @@ -811,7 +820,7 @@ def update_dataset_batch_use_sampling(self, profile: DatasetProfileClass) -> Non
sample_pc = 100 * self.config.sample_size / profile.rowCount
sql = (
f"SELECT * FROM {str(self.dataset._table)} "
+ f"TABLESAMPLE SYSTEM ({sample_pc:.3f} percent)"
+ f"TABLESAMPLE SYSTEM ({sample_pc:.8f} percent)"
)
temp_table_name = create_bigquery_temp_table(
self,
Expand Down

0 comments on commit 12b4171

Please sign in to comment.