From 1d7376b8161949bdbafa7d49dc3608c3ad6f8172 Mon Sep 17 00:00:00 2001 From: Mayuri N Date: Tue, 26 Sep 2023 18:10:25 +0530 Subject: [PATCH] increase default sample size, use fraction based sampling --- .../src/datahub/ingestion/source/ge_profiling_config.py | 2 +- .../ingestion/source/snowflake/snowflake_profiler.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py index 6e0f0583858e7..24a3e520d8caf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py @@ -162,7 +162,7 @@ class GEProfilingConfig(ConfigModel): ) sample_size: int = Field( - default=1000, + default=10000, description="Number of rows to be sampled from table for column level profiling." "Applicable only if `use_sampling` is set to True.", ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py index 3abf8ca1fef97..24275dcdff34d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py @@ -81,7 +81,12 @@ def get_batch_kwargs( and table.rows_count and table.rows_count > self.config.profiling.sample_size ): - custom_sql = f'select * from "{db_name}"."{schema_name}"."{table.name}" TABLESAMPLE ({self.config.profiling.sample_size} rows)' + # GX creates a temporary table from query if query is passed as batch kwargs. 
+ # We use fraction-based sampling here instead of fixed-size sampling, because + # fixed-size sampling can be slower than equivalent fraction-based sampling, + # as per https://docs.snowflake.com/en/sql-reference/constructs/sample#performance-considerations + sample_pc = 100 * self.config.profiling.sample_size / table.rows_count + custom_sql = f'select * from "{db_name}"."{schema_name}"."{table.name}" TABLESAMPLE ({sample_pc:.3f})' return { **super().get_batch_kwargs(table, schema_name, db_name), # Lowercase/Mixedcase table names in Snowflake do not work by default.