Skip to content

Commit

Permalink
increase default sample size, use fraction based sampling
Browse files Browse the repository at this point in the history
  • Loading branch information
mayurinehate committed Sep 26, 2023
1 parent 10d7fe9 commit 1d7376b
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ class GEProfilingConfig(ConfigModel):
)

sample_size: int = Field(
default=1000,
default=10000,
description="Number of rows to be sampled from table for column level profiling."
"Applicable only if `use_sampling` is set to True.",
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,12 @@ def get_batch_kwargs(
and table.rows_count
and table.rows_count > self.config.profiling.sample_size
):
custom_sql = f'select * from "{db_name}"."{schema_name}"."{table.name}" TABLESAMPLE ({self.config.profiling.sample_size} rows)'
# GX creates a temporary table from the query if a query is passed as batch kwargs.
# We use fraction-based sampling here instead of fixed-size sampling because
# fixed-size sampling can be slower than equivalent fraction-based sampling,
# as per https://docs.snowflake.com/en/sql-reference/constructs/sample#performance-considerations
sample_pc = 100 * self.config.profiling.sample_size / table.rows_count
custom_sql = f'select * from "{db_name}"."{schema_name}"."{table.name}" TABLESAMPLE ({sample_pc:.3f})'
return {
**super().get_batch_kwargs(table, schema_name, db_name),
# Lowercase/Mixedcase table names in Snowflake do not work by default.
Expand Down

0 comments on commit 1d7376b

Please sign in to comment.