Skip to content

Commit

Permalink
update only local row count assignment
Browse files Browse the repository at this point in the history
  • Loading branch information
mayurinehate committed Oct 30, 2023
1 parent 0ce4d1e commit f3cd6d4
Showing 1 changed file with 10 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -629,7 +629,16 @@ def generate_dataset_profile( # noqa: C901 (complexity)
self.query_combiner.flush()

assert profile.rowCount is not None
row_count: int = profile.rowCount
row_count: int # used for null counts calculation
if profile.partitionSpec and "SAMPLE" in profile.partitionSpec.partition:
# Alternatively, `self._get_dataset_rows(profile)` would give the exact
# number of rows in the sample, since the actual sample may contain slightly
# more or fewer rows than the configured `sample_size`. We do not do that
# for now: it adds the overhead of another query, and approximate metrics
# should be sufficient for sampling-based profiling.
row_count = self.config.sample_size
else:
row_count = profile.rowCount

for column_spec in columns_profiling_queue:
column = column_spec.column
Expand Down Expand Up @@ -793,13 +802,6 @@ def update_dataset_batch_use_sampling(self, profile: DatasetProfileClass) -> Non
self.dataset._table = sa.text(temp_table_name)
logger.debug(f"Setting table name to be {self.dataset._table}")

# Alternatively, `self._get_dataset_rows(profile)` would give the exact
# number of rows in the sample, since the actual sample may contain slightly
# more or fewer rows than the configured `sample_size`. We do not do that
# for now: it adds the overhead of another query, and approximate metrics
# should be sufficient for sampling-based profiling.
profile.rowCount = self.config.sample_size

if (
profile.partitionSpec
and profile.partitionSpec.type == PartitionTypeClass.FULL_TABLE
Expand Down

0 comments on commit f3cd6d4

Please sign in to comment.