diff --git a/dbt/adapters/snowflake/impl.py b/dbt/adapters/snowflake/impl.py index ebb15f753..caa5ed3ea 100644 --- a/dbt/adapters/snowflake/impl.py +++ b/dbt/adapters/snowflake/impl.py @@ -146,21 +146,40 @@ def list_relations_without_caching( relations = [] quote_policy = {"database": True, "schema": True, "identifier": True} - columns = ["database_name", "schema_name", "name", "kind"] - for _database, _schema, _identifier, _type in results.select(columns): - try: - _type = self.Relation.get_relation_type(_type.lower()) - except ValueError: - _type = self.Relation.External - relations.append( - self.Relation.create( - database=_database, - schema=_schema, - identifier=_identifier, - quote_policy=quote_policy, - type=_type, + if "is_dynamic" in results.column_names: + columns = ["database_name", "schema_name", "name", "kind", "is_dynamic"] + for _database, _schema, _identifier, _type, is_dynamic in results.select(columns): + try: + _type = self.Relation.get_relation_type(_type.lower()) + if _type == self.Relation.Table and is_dynamic == "Y": + _type = self.Relation.DynamicTable + except ValueError: + _type = self.Relation.External + relations.append( + self.Relation.create( + database=_database, + schema=_schema, + identifier=_identifier, + quote_policy=quote_policy, + type=_type, + ) + ) + else: + columns = ["database_name", "schema_name", "name", "kind"] + for _database, _schema, _identifier, _type in results.select(columns): + try: + _type = self.Relation.get_relation_type(_type.lower()) + except ValueError: + _type = self.Relation.External + relations.append( + self.Relation.create( + database=_database, + schema=_schema, + identifier=_identifier, + quote_policy=quote_policy, + type=_type, + ) ) - ) return relations diff --git a/dbt/include/snowflake/macros/adapters.sql b/dbt/include/snowflake/macros/adapters.sql index 157738187..b8e5f5db0 100644 --- a/dbt/include/snowflake/macros/adapters.sql +++ b/dbt/include/snowflake/macros/adapters.sql @@ -73,7 +73,7 @@ {% 
for _ in range(0, max_iter) %} {%- set paginated_sql -%} - show terse objects in {{ schema_relation.database }}.{{ schema_relation.schema }} limit {{ max_results_per_iter }} from '{{ watermark.table_name }}' + {{ snowflake__get_show_objects_sql(schema_relation, max_results_per_iter) }} from '{{ watermark.table_name }}' {%- endset -%} {%- set paginated_result = run_query(paginated_sql) %} @@ -119,12 +119,16 @@ {% endmacro %} +{% macro snowflake__get_show_objects_sql(schema, results_per_iteration) %} + show objects in {{ schema.database }}.{{ schema.schema }} limit {{ results_per_iteration }} +{% endmacro %} + {% macro snowflake__list_relations_without_caching(schema_relation, max_iter=10, max_results_per_iter=10000) %} {%- set max_total_results = max_results_per_iter * max_iter -%} {%- set sql -%} - show terse objects in {{ schema_relation.database }}.{{ schema_relation.schema }} limit {{ max_results_per_iter }} + {{ snowflake__get_show_objects_sql(schema_relation, max_results_per_iter) }} {%- endset -%} {%- set result = run_query(sql) -%} diff --git a/test.env.example b/test.env.example index bdf5d68e1..dd77d1895 100644 --- a/test.env.example +++ b/test.env.example @@ -33,3 +33,5 @@ SNOWFLAKE_TEST_WAREHOUSE=my_warehouse_name DBT_TEST_USER_1=dbt_test_role_1 DBT_TEST_USER_2=dbt_test_role_2 DBT_TEST_USER_3=dbt_test_role_3 + +DBT_PERFORMANCE_TESTING=0 diff --git a/tests/performance/conftest.py b/tests/performance/conftest.py new file mode 100644 index 000000000..3056dd06c --- /dev/null +++ b/tests/performance/conftest.py @@ -0,0 +1,18 @@ +import os + +import pytest + + +def _get_setting(environment_variable: str) -> bool: + """Return True when the variable is set to "true" or "1" (case-insensitive, surrounding whitespace ignored); False when unset or anything else.""" + raw_value = os.environ.get(environment_variable, "") + return raw_value.strip().lower() in ("true", "1") + + +performance_test = pytest.mark.skipif( + not _get_setting("DBT_PERFORMANCE_TESTING"), + reason=( + "Performance test skipped, to turn on performance testing, " + "please set the environment variable `DBT_PERFORMANCE_TESTING`" + ), +) 
diff --git a/tests/performance/list_relations_tests/README.md b/tests/performance/list_relations_tests/README.md new file mode 100644 index 000000000..5094eaba3 --- /dev/null +++ b/tests/performance/list_relations_tests/README.md @@ -0,0 +1,51 @@ +Performance tests were run using both `show objects` and `show terse objects` at three scales. +With the `2024_03` bundle turned off, both methods are able to correctly identify a dynamic table. +However, when `2024_03` is turned on, only `show objects` is able to correctly identify +a dynamic table. This is done by inspecting the new column `is_dynamic` since both a table +and a dynamic table show up with a `kind` of table. +In order to properly compare the two methods, an additional scenario was added that does not +create dynamic tables, and instead splits those objects evenly between views and tables. + +Let's take the small scale as an example. The small scale creates 30 objects. +There is a run that creates 10 of each object type (views, tables, and dynamic tables), resulting in 30 objects. +This is successful for `show objects` whether `2024_03` is turned on or off. +It is also successful for `show terse objects` when `2024_03` is turned off. +There is another scenario that creates 15 views and 15 tables, but no dynamic tables. +This scenario still creates 30 objects, and both methods return the correct types +regardless of the setting for `2024_03`. +These scenarios can be combined to compare `show terse objects` with `2024_03` off +to `show objects` with `2024_03` turned on. +This comparison represents the change that will happen when `2024_03` becomes a mandatory bundle. 
+ +### 30 Objects + +| 2024_03 | method | mean time | mean time - no DTs | +|:-------:|--------------------|----------:|-------------------:| +| NO | show terse objects | 1.02 s | -- | +| YES | show objects | 0.91 s | 0.92 s | +| YES | show terse objects | -- | 0.94 s | + +- 11% improved run time of `list_relations_without_caching` when turning on `2024_03` +- similar performance of `show objects` and `show terse objects` in `2024_03` + +### 300 Objects + +| 2024_03 | method | mean time | mean time - no DTs | +|:-------:|--------------------|----------:|-------------------:| +| NO | show terse objects | 0.96 s | -- | +| YES | show objects | 1.19 s | 1.37 s | +| YES | show terse objects | -- | 0.92 s | + +- 24% longer run time of `list_relations_without_caching` when turning on `2024_03` +- 49% longer run time of `show objects` than `show terse objects` in `2024_03` + +### 3000 Objects + +| 2024_03 | method | mean time | mean time - no DTs | +|:-------:|--------------------|----------:|-------------------:| +| NO | show terse objects | 2.00 s | -- | +| YES | show objects | 3.05 s | 3.22 s | +| YES | show terse objects | -- | 2.33 s | + +- 53% longer run time of `list_relations_without_caching` when turning on `2024_03` +- 38% longer run time of `show objects` than `show terse objects` in `2024_03` diff --git a/tests/performance/list_relations_tests/list_relations.py b/tests/performance/list_relations_tests/list_relations.py new file mode 100644 index 000000000..61d35542d --- /dev/null +++ b/tests/performance/list_relations_tests/list_relations.py @@ -0,0 +1,114 @@ +from dataclasses import dataclass +from datetime import datetime, timedelta +import os +from statistics import mean +from typing import List, Tuple + +import pytest + +from dbt.adapters.factory import get_adapter_by_type +from dbt.adapters.snowflake import SnowflakeRelation + +from dbt.tests.util import run_dbt, get_connection +from tests.performance.conftest import performance_test + + +SEED = """ 
+id,value +0,red +1,yellow +2,blue +""".strip() + + +VIEW = """ +select * from {{ ref('my_seed') }} +""" + + +TABLE = """ +{{ config(materialized='table') }} +select * from {{ ref('my_seed') }} +""" + + +DYNAMIC_TABLE = ( + """ +{{ config( + materialized='dynamic_table', + target_lag='1 day', + snowflake_warehouse='""" + + os.getenv("SNOWFLAKE_TEST_WAREHOUSE", "") + + """', +) }} +select * from {{ ref('my_seed') }} +""" +) + + +@dataclass +class Scenario: + views: int + tables: int + dynamic_tables: int + + +class BaseConfig: + scenario: Scenario + expected_duration: float + iterations: int = 10 + + @pytest.fixture(scope="class") + def seeds(self): + yield {"my_seed.csv": SEED} + + @pytest.fixture(scope="class") + def models(self): + models = {} + models.update({f"my_view_{i}.sql": VIEW for i in range(self.scenario.views)}) + models.update({f"my_table_{i}.sql": TABLE for i in range(self.scenario.tables)}) + models.update( + { + f"my_dynamic_table_{i}.sql": DYNAMIC_TABLE + for i in range(self.scenario.dynamic_tables) + } + ) + yield models + + @pytest.fixture(scope="class", autouse=True) + def setup(self, project): + run_dbt(["seed"]) + run_dbt(["run"]) + + def list_relations(self, project) -> Tuple[List[SnowflakeRelation], timedelta]: + my_adapter = get_adapter_by_type("snowflake") + schema = my_adapter.Relation.create( + database=project.database, schema=project.test_schema, identifier="" + ) + + start = datetime.utcnow() + with get_connection(my_adapter): + relations = my_adapter.list_relations_without_caching(schema) + end = datetime.utcnow() + duration = end - start + return relations, duration + + @performance_test + def test_list_relations(self, project): + durations = [] + for i in range(self.iterations): + relations, duration = self.list_relations(project) + durations.append(duration.total_seconds()) + assert ( + len([relation for relation in relations if 
relation.is_table]) + == self.scenario.tables + 1 # add the seed + ) + assert ( + len([relation for relation in relations if relation.is_dynamic_table]) + == self.scenario.dynamic_tables + ) + assert mean(durations) < self.expected_duration * 1.10 # allow for 10% error diff --git a/tests/performance/list_relations_tests/test_show_objects.py b/tests/performance/list_relations_tests/test_show_objects.py new file mode 100644 index 000000000..1d395aff9 --- /dev/null +++ b/tests/performance/list_relations_tests/test_show_objects.py @@ -0,0 +1,48 @@ +from datetime import timedelta + +import pytest + +from tests.performance.list_relations_tests.list_relations import BaseConfig, Scenario + + +SHOW_OBJECTS_MACRO = """ +{% macro snowflake__get_show_objects_sql(schema, results_per_iteration) %} + show objects in {{ schema.database }}.{{ schema.schema }} limit {{ results_per_iteration }} +{% endmacro %} +""" + + +class ShowObjects(BaseConfig): + @pytest.fixture(scope="class") + def macros(self): + yield {"snowflake__get_show_objects_sql.sql": SHOW_OBJECTS_MACRO} + + +class TestShowObjects10View10Table10Dynamic(ShowObjects): + scenario = Scenario(10, 10, 10) + expected_duration = timedelta(seconds=0, microseconds=920_000).total_seconds() + + +class TestShowObjects15View15Table0Dynamic(ShowObjects): + scenario = Scenario(15, 15, 0) + expected_duration = timedelta(seconds=0, microseconds=920_000).total_seconds() + + +class TestShowObjects100View100Table100Dynamic(ShowObjects): + scenario = Scenario(100, 100, 100) + expected_duration = timedelta(seconds=1, microseconds=370_000).total_seconds() + + +class TestShowObjects150View150Table0Dynamic(ShowObjects): + scenario = Scenario(150, 150, 0) + expected_duration = timedelta(seconds=1, microseconds=370_000).total_seconds() + + +class TestShowObjects1000View1000Table1000Dynamic(ShowObjects): + scenario = Scenario(1000, 1000, 1000) + expected_duration = timedelta(seconds=3, microseconds=400_000).total_seconds() + + +class 
TestShowObjects1500View1500Table0Dynamic(ShowObjects): + scenario = Scenario(1500, 1500, 0) + expected_duration = timedelta(seconds=3, microseconds=400_000).total_seconds() diff --git a/tests/performance/list_relations_tests/test_show_terse_objects.py b/tests/performance/list_relations_tests/test_show_terse_objects.py new file mode 100644 index 000000000..fcd98d3e5 --- /dev/null +++ b/tests/performance/list_relations_tests/test_show_terse_objects.py @@ -0,0 +1,48 @@ +from datetime import timedelta + +import pytest + +from tests.performance.list_relations_tests.list_relations import BaseConfig, Scenario + + +SHOW_TERSE_OBJECTS_MACRO = """ +{% macro snowflake__get_show_objects_sql(schema, results_per_iteration) %} + show terse objects in {{ schema.database }}.{{ schema.schema }} limit {{ results_per_iteration }} +{% endmacro %} +""" + + +class ShowTerseObjects(BaseConfig): + @pytest.fixture(scope="class") + def macros(self): + yield {"snowflake__get_show_objects_sql.sql": SHOW_TERSE_OBJECTS_MACRO} + + +class TestShowTerseObjects10View10Table10Dynamic(ShowTerseObjects): + scenario = Scenario(10, 10, 10) + expected_duration = timedelta(seconds=1, microseconds=20_000).total_seconds() + + +class TestShowTerseObjects15View15Table0Dynamic(ShowTerseObjects): + scenario = Scenario(15, 15, 0) + expected_duration = timedelta(seconds=1, microseconds=20_000).total_seconds() + + +class TestShowTerseObjects100View100Table100Dynamic(ShowTerseObjects): + scenario = Scenario(100, 100, 100) + expected_duration = timedelta(seconds=0, microseconds=960_000).total_seconds() + + +class TestShowTerseObjects150View150Table0Dynamic(ShowTerseObjects): + scenario = Scenario(150, 150, 0) + expected_duration = timedelta(seconds=0, microseconds=960_000).total_seconds() + + +class TestShowTerseObjects1000View1000Table1000Dynamic(ShowTerseObjects): + scenario = Scenario(1000, 1000, 1000) + expected_duration = timedelta(seconds=2, microseconds=330_000).total_seconds() + + +class 
TestShowTerseObjects1500View1500Table0Dynamic(ShowTerseObjects): + scenario = Scenario(1500, 1500, 0) + expected_duration = timedelta(seconds=2, microseconds=330_000).total_seconds()