From 0efc4dae6765e8d3383d1d9c21d9b0a97b88470d Mon Sep 17 00:00:00 2001 From: Mike Alfare Date: Wed, 15 May 2024 19:29:13 -0400 Subject: [PATCH 1/8] pull out the call to show objects --- dbt/include/snowflake/macros/adapters.sql | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/dbt/include/snowflake/macros/adapters.sql b/dbt/include/snowflake/macros/adapters.sql index 157738187..b8e5f5db0 100644 --- a/dbt/include/snowflake/macros/adapters.sql +++ b/dbt/include/snowflake/macros/adapters.sql @@ -73,7 +73,7 @@ {% for _ in range(0, max_iter) %} {%- set paginated_sql -%} - show terse objects in {{ schema_relation.database }}.{{ schema_relation.schema }} limit {{ max_results_per_iter }} from '{{ watermark.table_name }}' + {{ snowflake__get_show_objects_sql(schema_relation, max_results_per_iter) }} from '{{ watermark.table_name }}' {%- endset -%} {%- set paginated_result = run_query(paginated_sql) %} @@ -119,12 +119,16 @@ {% endmacro %} +{% macro snowflake__get_show_objects_sql(schema, results_per_iteration) %} + show objects in {{ schema.database }}.{{ schema.schema }} limit {{ results_per_iteration }} +{% endmacro %} + {% macro snowflake__list_relations_without_caching(schema_relation, max_iter=10, max_results_per_iter=10000) %} {%- set max_total_results = max_results_per_iter * max_iter -%} {%- set sql -%} - show terse objects in {{ schema_relation.database }}.{{ schema_relation.schema }} limit {{ max_results_per_iter }} + {{ snowflake__get_show_objects_sql(schema_relation, max_results_per_iter) }} {%- endset -%} {%- set result = run_query(sql) -%} From fbaad5dca64e1b12876d3897d9e623d42aa3f8b7 Mon Sep 17 00:00:00 2001 From: Mike Alfare Date: Wed, 15 May 2024 19:30:08 -0400 Subject: [PATCH 2/8] create a test that overrides the show objects calls with show terse objects and show objects --- tests/functional/test_show_objects.py | 131 ++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 tests/functional/test_show_objects.py diff --git a/tests/functional/test_show_objects.py b/tests/functional/test_show_objects.py new file mode 100644 index 000000000..04cdfbc6a --- /dev/null +++ b/tests/functional/test_show_objects.py @@ -0,0 +1,131 @@ +from datetime import datetime, timedelta +import os +from statistics import mean +from typing import List, Tuple + +import pytest + +from dbt.adapters.factory import get_adapter_by_type +from dbt.adapters.snowflake import SnowflakeRelation + +from dbt.tests.util import run_dbt, get_connection + + +MY_SEED = """ +id,value +0,red +1,yellow +2,blue +""".strip() + + +VIEW = """ +select * from {{ ref('my_seed') }} +""" + + +TABLE = """ +{{ config(materialized='table') }} +select * from {{ ref('my_seed') }} +""" + + +DYNAMIC_TABLE = ( + """ +{{ config( + materialized='dynamic_table', + target_lag='1 minute', + snowflake_warehouse='""" + + os.getenv("SNOWFLAKE_TEST_WAREHOUSE") + + """', +) }} +select * from {{ ref('my_seed') }} +""" +) + + +SHOW_OBJECTS = """ +{% macro snowflake__get_show_objects_sql(schema, results_per_iteration) %} + show objects in {{ schema.database }}.{{ schema.schema }} limit {{ results_per_iteration }} +{% endmacro %} +""" + + +SHOW_TERSE_OBJECTS = """ +{% macro snowflake__get_show_objects_sql(schema, results_per_iteration) %} + show terse objects in {{ schema.database }}.{{ schema.schema }} limit {{ results_per_iteration }} +{% endmacro %} +""" + + +class ListRelations: + views: int = 100 + tables: int = 100 + dynamic_tables: int = 100 + iterations: int = 10 + expected_duration: float + + @pytest.fixture(scope="class") + def seeds(self): + yield {"my_seed.csv": MY_SEED} + + @pytest.fixture(scope="class") + def models(self): + models = {} + models.update({f"my_view_{i}.sql": VIEW for i in range(self.views)}) + models.update({f"my_table_{i}.sql": TABLE for i in range(self.tables)}) + models.update( + {f"my_dynamic_table_{i}.sql": DYNAMIC_TABLE for i in range(self.dynamic_tables)} + ) + yield models + + @pytest.fixture(scope="class", autouse=True) + def setup(self, project): + run_dbt(["seed"]) + run_dbt(["run"]) + + def list_relations(self, project) -> Tuple[List[SnowflakeRelation], timedelta]: + my_adapter = get_adapter_by_type("snowflake") + schema = my_adapter.Relation.create( + database=project.database, schema=project.test_schema, identifier="" + ) + + start = datetime.utcnow() + with get_connection(my_adapter) as conn: + relations = my_adapter.list_relations_without_caching(schema) + end = datetime.utcnow() + duration = end - start + return relations, duration + + def test_show_terse_objects(self, project): + durations = [] + for i in range(self.iterations): + relations, duration = self.list_relations(project) + durations.append(duration.total_seconds()) + assert len([relation for relation in relations if relation.is_view]) == self.views + assert ( + len([relation for relation in relations if relation.is_table]) == self.tables + 1 + ) # add the seed + assert ( + len([relation for relation in relations if relation.is_dynamic_table]) + == self.dynamic_tables + ) + assert mean(durations) < self.expected_duration + + +class TestShowObjects(ListRelations): + expected_duration = timedelta( + seconds=1, microseconds=100_000 + ).total_seconds() # allows 10% error + + @pytest.fixture(scope="class") + def macros(self): + yield {"snowflake__get_show_objects_sql.sql": SHOW_OBJECTS} + + +class TestShowTerseObjects(ListRelations): + expected_duration = timedelta(seconds=1, microseconds=0).total_seconds() # allows 10% error + + @pytest.fixture(scope="class") + def macros(self): + yield {"snowflake__get_show_objects_sql.sql": SHOW_TERSE_OBJECTS} From 7d13d13d139adc6e77071aeb0db712142906938f Mon Sep 17 00:00:00 2001 From: Mike Alfare Date: Thu, 16 May 2024 14:53:51 -0400 Subject: [PATCH 3/8] allow for both versions of show objects and show terse objects in the list_relations_without_caching macro --- dbt/adapters/snowflake/impl.py | 47 ++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/dbt/adapters/snowflake/impl.py b/dbt/adapters/snowflake/impl.py index ebb15f753..caa5ed3ea 100644 --- a/dbt/adapters/snowflake/impl.py +++ b/dbt/adapters/snowflake/impl.py @@ -146,21 +146,40 @@ def list_relations_without_caching( relations = [] quote_policy = {"database": True, "schema": True, "identifier": True} - columns = ["database_name", "schema_name", "name", "kind"] - for _database, _schema, _identifier, _type in results.select(columns): - try: - _type = self.Relation.get_relation_type(_type.lower()) - except ValueError: - _type = self.Relation.External - relations.append( - self.Relation.create( - database=_database, - schema=_schema, - identifier=_identifier, - quote_policy=quote_policy, - type=_type, + if "is_dynamic" in results.column_names: + columns = ["database_name", "schema_name", "name", "kind", "is_dynamic"] + for _database, _schema, _identifier, _type, is_dynamic in results.select(columns): + try: + _type = self.Relation.get_relation_type(_type.lower()) + if _type == self.Relation.Table and is_dynamic == "Y": + _type = self.Relation.DynamicTable + except ValueError: + _type = self.Relation.External + relations.append( + self.Relation.create( + database=_database, + schema=_schema, + identifier=_identifier, + quote_policy=quote_policy, + type=_type, + ) + ) + else: + columns = ["database_name", "schema_name", "name", "kind"] + for _database, _schema, _identifier, _type in results.select(columns): + try: + _type = self.Relation.get_relation_type(_type.lower()) + except ValueError: + _type = self.Relation.External + relations.append( + self.Relation.create( + database=_database, + schema=_schema, + identifier=_identifier, + quote_policy=quote_policy, + type=_type, + ) ) - ) return relations From e5e7ad332b4f256d76eee81fbbbbc083b10f131f Mon Sep 17 00:00:00 2001 From: Mike Alfare Date: Thu, 16 May 2024 14:54:34 -0400 Subject: [PATCH 4/8] skip performance tests unless explicitly turned on --- test.env.example | 2 ++ tests/performance/conftest.py | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+) create mode 100644 tests/performance/conftest.py diff --git a/test.env.example b/test.env.example index bdf5d68e1..dd77d1895 100644 --- a/test.env.example +++ b/test.env.example @@ -33,3 +33,5 @@ SNOWFLAKE_TEST_WAREHOUSE=my_warehouse_name DBT_TEST_USER_1=dbt_test_role_1 DBT_TEST_USER_2=dbt_test_role_2 DBT_TEST_USER_3=dbt_test_role_3 + +DBT_PERFORMANCE_TESTING=0 diff --git a/tests/performance/conftest.py b/tests/performance/conftest.py new file mode 100644 index 000000000..710fe492c --- /dev/null +++ b/tests/performance/conftest.py @@ -0,0 +1,19 @@ +import os + +import pytest + +my_check = os.environ.get("DBT_PERFORMANCE_TESTING", 0) + + +def _get_setting(environment_variable: str) -> bool: + raw_value = os.environ.get(environment_variable, False) + return raw_value in [True, "True", "TRUE", 1, "1"] + + +performance_test = pytest.mark.skipif( + not _get_setting("DBT_PERFORMANCE_TESTING"), + reason=( + "Performance test skipped, to turn on performance testing, " + "please set the environment variable `DBT_PERFORMANCE_TESTING`" + ), +) From 66f10f01fc21c4ea41707771fbe8aff6f8ac5ad9 Mon Sep 17 00:00:00 2001 From: Mike Alfare Date: Thu, 16 May 2024 14:55:01 -0400 Subject: [PATCH 5/8] create and document performance tests --- .../list_relations_tests/README.md | 51 +++++++++++++++++++ .../test_show_terse_objects.py | 48 +++++++++++++++++ 2 files changed, 99 insertions(+) create mode 100644 tests/performance/list_relations_tests/README.md create mode 100644 tests/performance/list_relations_tests/test_show_terse_objects.py diff --git a/tests/performance/list_relations_tests/README.md b/tests/performance/list_relations_tests/README.md new file mode 100644 index 000000000..5094eaba3 --- /dev/null +++ b/tests/performance/list_relations_tests/README.md @@ -0,0 +1,51 @@ +Performance tests were run using both `show objects` and `show terse objects` at three scales. +With `2024_03` turned off, both methods are able to correctly identify a dynamic table. +However, when `2024_03` is turned on, only `show objects` is able to correctly identify +a dynamic table. This is done by inspecting the new column `is_dynamic` since both a table +and a dynamic table show up with a `kind` of table. +In order to properly compare the two methods, an additional scenario was added that does not +create dynamic tables, and instead splits those objects evenly between views and tables. + +Let's take the small scale as an example. The small scale creates 30 objects. +There is a run that creates 10 of each object, resulting in 30 objects. +This is successful for `show objects` whether `2024_03` is turned on or off. +It is also successful for `show terse objects` when `2024_03` is turned off. +There is another scenario that creates 15 views and 15 table, but no dynamic tables. +This scenario still creates 30 objects, and both methods return the correct types +regardless of setting for `2024_03`. +These scenarios can be combined to compare `show terse objects` with `2024_03` off +to `show objects` with `2024_03` turned on. +This comparison represents the change that will happen when `2024_03` becomes a mandatory bundle. + +### 30 Objects + +| 2024_03 | method | mean time | mean time - no DTs | +|:-------:|--------------------|----------:|-------------------:| +| NO | show terse objects | 1.02 s | -- | +| YES | show objects | 0.91 s | 0.92 s | +| YES | show terse objects | -- | 0.94 s | + +- 11% improved run time of `list_relations_without_caching` when turning on `2024_03` +- similar performance of `show objects` and `show terse objects` in `2024_03` + +### 300 Objects + +| 2024_03 | method | mean time | mean time - no DTs | +|:-------:|--------------------|----------:|-------------------:| +| NO | show terse objects | 0.96 s | -- | +| YES | show objects | 1.19 s | 1.37 s | +| YES | show terse objects | -- | 0.92 s | + +- 24% longer run time of `list_relations_without_caching` when turning on `2024_03` +- 49% longer run time of `show objects` than `show terse objects` in `2024_03` + +### 3000 Objects + +| 2024_03 | method | mean time | mean time - no DTs | +|:-------:|--------------------|----------:|-------------------:| +| NO | show terse objects | 2.00 s | -- | +| YES | show objects | 3.05 s | 3.22 s | +| YES | show terse objects | -- | 2.33 s | + +- 53% longer run time of `list_relations_without_caching` when turning on `2024_03` +- 38% longer run time of `show objects` than `show terse objects` in `2024_03` diff --git a/tests/performance/list_relations_tests/test_show_terse_objects.py b/tests/performance/list_relations_tests/test_show_terse_objects.py new file mode 100644 index 000000000..fcd98d3e5 --- /dev/null +++ b/tests/performance/list_relations_tests/test_show_terse_objects.py @@ -0,0 +1,48 @@ +from datetime import timedelta + +import pytest + +from tests.performance.list_relations_tests.list_relations import BaseConfig, Scenario + + +SHOW_TERSE_OBJECTS_MACRO = """ +{% macro snowflake__get_show_objects_sql(schema, results_per_iteration) %} + show terse objects in {{ schema.database }}.{{ schema.schema }} limit {{ results_per_iteration }} +{% endmacro %} +""" + + +class ShowTerseObjects(BaseConfig): + @pytest.fixture(scope="class") + def macros(self): + yield {"snowflake__get_show_objects_sql.sql": SHOW_TERSE_OBJECTS_MACRO} + + +class TestShowTerseObjects10View10Table10Dynamic(ShowTerseObjects): + scenario = Scenario(10, 10, 10) + expected_duration = timedelta(seconds=1, microseconds=20_000).total_seconds() + + +class TestShowTerseObjects15View15Table0Dynamic(ShowTerseObjects): + scenario = Scenario(15, 15, 0) + expected_duration = timedelta(seconds=1, microseconds=20_000).total_seconds() + + +class TestShowTerseObjects100View100Table100Dynamic(ShowTerseObjects): + scenario = Scenario(100, 100, 100) + expected_duration = timedelta(seconds=0, microseconds=960_000).total_seconds() + + +class TestShowTerseObjects150View150Table0Dynamic(ShowTerseObjects): + scenario = Scenario(150, 150, 0) + expected_duration = timedelta(seconds=0, microseconds=960_000).total_seconds() + + +class TestShowTerseObjects1000View1000Table1000Dynamic(ShowTerseObjects): + scenario = Scenario(1000, 1000, 1000) + expected_duration = timedelta(seconds=2, microseconds=330_000).total_seconds() + + +class TestShowTerseObjects1500View1500Table0Dynamic(ShowTerseObjects): + scenario = Scenario(1500, 1500, 0) + expected_duration = timedelta(seconds=2, microseconds=330_000).total_seconds() From fd1ab4fab5676f21a74f6234a860714cadd514d0 Mon Sep 17 00:00:00 2001 From: Mike Alfare Date: Thu, 16 May 2024 14:55:06 -0400 Subject: [PATCH 6/8] create and document performance tests --- .../list_relations_tests/list_relations.py} | 75 +++++++------------ .../list_relations_tests/test_show_objects.py | 48 ++++++++++++ 2 files changed, 77 insertions(+), 46 deletions(-) rename tests/{functional/test_show_objects.py => performance/list_relations_tests/list_relations.py} (57%) create mode 100644 tests/performance/list_relations_tests/test_show_objects.py diff --git a/tests/functional/test_show_objects.py b/tests/performance/list_relations_tests/list_relations.py similarity index 57% rename from tests/functional/test_show_objects.py rename to tests/performance/list_relations_tests/list_relations.py index 04cdfbc6a..a55305ebb 100644 --- a/tests/functional/test_show_objects.py +++ b/tests/performance/list_relations_tests/list_relations.py @@ -1,3 +1,4 @@ +from dataclasses import dataclass from datetime import datetime, timedelta import os from statistics import mean @@ -9,6 +10,7 @@ from dbt.adapters.snowflake import SnowflakeRelation from dbt.tests.util import run_dbt, get_connection +from tests.performance.conftest import performance_test MY_SEED = """ @@ -34,7 +36,7 @@ """ {{ config( materialized='dynamic_table', - target_lag='1 minute', + target_lag='1 day', snowflake_warehouse='""" + os.getenv("SNOWFLAKE_TEST_WAREHOUSE") + """', @@ -44,26 +46,17 @@ ) -SHOW_OBJECTS = """ -{% macro snowflake__get_show_objects_sql(schema, results_per_iteration) %} - show objects in {{ schema.database }}.{{ schema.schema }} limit {{ results_per_iteration }} -{% endmacro %} -""" - +@dataclass +class Scenario: + views: int + tables: int + dynamic_tables: int -SHOW_TERSE_OBJECTS = """ -{% macro snowflake__get_show_objects_sql(schema, results_per_iteration) %} - show terse objects in {{ schema.database }}.{{ schema.schema }} limit {{ results_per_iteration }} -{% endmacro %} -""" - -class ListRelations: - views: int = 100 - tables: int = 100 - dynamic_tables: int = 100 - iterations: int = 10 +class BaseConfig: + scenario: Scenario expected_duration: float + iterations: int = 10 @pytest.fixture(scope="class") def seeds(self): @@ -72,10 +65,13 @@ def seeds(self): @pytest.fixture(scope="class") def models(self): models = {} - models.update({f"my_view_{i}.sql": VIEW for i in range(self.views)}) - models.update({f"my_table_{i}.sql": TABLE for i in range(self.tables)}) + models.update({f"my_view_{i}.sql": VIEW for i in range(self.scenario.views)}) + models.update({f"my_table_{i}.sql": TABLE for i in range(self.scenario.tables)}) models.update( - {f"my_dynamic_table_{i}.sql": DYNAMIC_TABLE for i in range(self.dynamic_tables)} + { + f"my_dynamic_table_{i}.sql": DYNAMIC_TABLE + for i in range(self.scenario.dynamic_tables) + } ) yield models @@ -91,41 +87,28 @@ def list_relations(self, project) -> Tuple[List[SnowflakeRelation], timedelta]: ) start = datetime.utcnow() - with get_connection(my_adapter) as conn: + with get_connection(my_adapter): relations = my_adapter.list_relations_without_caching(schema) end = datetime.utcnow() duration = end - start return relations, duration - def test_show_terse_objects(self, project): + @performance_test + def test_list_relations(self, project): durations = [] for i in range(self.iterations): relations, duration = self.list_relations(project) durations.append(duration.total_seconds()) - assert len([relation for relation in relations if relation.is_view]) == self.views assert ( - len([relation for relation in relations if relation.is_table]) == self.tables + 1 - ) # add the seed + len([relation for relation in relations if relation.is_view]) + == self.scenario.views + ) + assert ( + len([relation for relation in relations if relation.is_table]) + == self.scenario.tables + 1 # add the seed + ) assert ( len([relation for relation in relations if relation.is_dynamic_table]) - == self.dynamic_tables + == self.scenario.dynamic_tables ) - assert mean(durations) < self.expected_duration - - -class TestShowObjects(ListRelations): - expected_duration = timedelta( - seconds=1, microseconds=100_000 - ).total_seconds() # allows 10% error - - @pytest.fixture(scope="class") - def macros(self): - yield {"snowflake__get_show_objects_sql.sql": SHOW_OBJECTS} - - -class TestShowTerseObjects(ListRelations): - expected_duration = timedelta(seconds=1, microseconds=0).total_seconds() # allows 10% error - - @pytest.fixture(scope="class") - def macros(self): - yield {"snowflake__get_show_objects_sql.sql": SHOW_TERSE_OBJECTS} + assert mean(durations) < self.expected_duration * 1.10 # allow for 10% error diff --git a/tests/performance/list_relations_tests/test_show_objects.py b/tests/performance/list_relations_tests/test_show_objects.py new file mode 100644 index 000000000..1d395aff9 --- /dev/null +++ b/tests/performance/list_relations_tests/test_show_objects.py @@ -0,0 +1,48 @@ +from datetime import timedelta + +import pytest + +from tests.performance.list_relations_tests.list_relations import BaseConfig, Scenario + + +SHOW_OBJECTS_MACRO = """ +{% macro snowflake__get_show_objects_sql(schema, results_per_iteration) %} + show objects in {{ schema.database }}.{{ schema.schema }} limit {{ results_per_iteration }} +{% endmacro %} +""" + + +class ShowObjects(BaseConfig): + @pytest.fixture(scope="class") + def macros(self): + yield {"snowflake__get_show_objects_sql.sql": SHOW_OBJECTS_MACRO} + + +class TestShowObjects10View10Table10Dynamic(ShowObjects): + scenario = Scenario(10, 10, 10) + expected_duration = timedelta(seconds=0, microseconds=920_000).total_seconds() + + +class TestShowObjects15View15Table0Dynamic(ShowObjects): + scenario = Scenario(15, 15, 0) + expected_duration = timedelta(seconds=0, microseconds=920_000).total_seconds() + + +class TestShowObjects100View100Table100Dynamic(ShowObjects): + scenario = Scenario(100, 100, 100) + expected_duration = timedelta(seconds=1, microseconds=370_000).total_seconds() + + +class TestShowObjects150View150Table0Dynamic(ShowObjects): + scenario = Scenario(150, 150, 0) + expected_duration = timedelta(seconds=1, microseconds=370_000).total_seconds() + + +class TestShowObjects1000View1000Table1000Dynamic(ShowObjects): + scenario = Scenario(1000, 1000, 1000) + expected_duration = timedelta(seconds=3, microseconds=400_000).total_seconds() + + +class TestShowObjects1500View1500Table0Dynamic(ShowObjects): + scenario = Scenario(1500, 1500, 0) + expected_duration = timedelta(seconds=3, microseconds=400_000).total_seconds() From 3d3b2a4955018d8db4e3419a6fe85cea8bc17f48 Mon Sep 17 00:00:00 2001 From: Mike Alfare Date: Thu, 16 May 2024 15:29:12 -0400 Subject: [PATCH 7/8] naming --- tests/performance/list_relations_tests/list_relations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/performance/list_relations_tests/list_relations.py b/tests/performance/list_relations_tests/list_relations.py index a55305ebb..61d35542d 100644 --- a/tests/performance/list_relations_tests/list_relations.py +++ b/tests/performance/list_relations_tests/list_relations.py @@ -13,7 +13,7 @@ from tests.performance.conftest import performance_test -MY_SEED = """ +SEED = """ id,value 0,red 1,yellow @@ -60,7 +60,7 @@ class BaseConfig: @pytest.fixture(scope="class") def seeds(self): - yield {"my_seed.csv": MY_SEED} + yield {"my_seed.csv": SEED} @pytest.fixture(scope="class") def models(self): From 95167239544e6b2792b2c1b3ade638bad1001be7 Mon Sep 17 00:00:00 2001 From: Mike Alfare Date: Thu, 16 May 2024 15:30:54 -0400 Subject: [PATCH 8/8] remove unused code --- tests/performance/conftest.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/performance/conftest.py b/tests/performance/conftest.py index 710fe492c..3056dd06c 100644 --- a/tests/performance/conftest.py +++ b/tests/performance/conftest.py @@ -2,8 +2,6 @@ import pytest -my_check = os.environ.get("DBT_PERFORMANCE_TESTING", 0) - def _get_setting(environment_variable: str) -> bool: raw_value = os.environ.get(environment_variable, False)