Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Performance test show objects vs show terse objects #1046

Draft
wants to merge 9 commits into
base: main
Choose a base branch
from
47 changes: 33 additions & 14 deletions dbt/adapters/snowflake/impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,21 +146,40 @@ def list_relations_without_caching(
relations = []
quote_policy = {"database": True, "schema": True, "identifier": True}

columns = ["database_name", "schema_name", "name", "kind"]
for _database, _schema, _identifier, _type in results.select(columns):
try:
_type = self.Relation.get_relation_type(_type.lower())
except ValueError:
_type = self.Relation.External
relations.append(
self.Relation.create(
database=_database,
schema=_schema,
identifier=_identifier,
quote_policy=quote_policy,
type=_type,
if "is_dynamic" in results.column_names:
columns = ["database_name", "schema_name", "name", "kind", "is_dynamic"]
for _database, _schema, _identifier, _type, is_dynamic in results.select(columns):
try:
_type = self.Relation.get_relation_type(_type.lower())
if _type == self.Relation.Table and is_dynamic == "Y":
_type = self.Relation.DynamicTable
except ValueError:
_type = self.Relation.External
relations.append(
self.Relation.create(
database=_database,
schema=_schema,
identifier=_identifier,
quote_policy=quote_policy,
type=_type,
)
)
else:
columns = ["database_name", "schema_name", "name", "kind"]
for _database, _schema, _identifier, _type in results.select(columns):
try:
_type = self.Relation.get_relation_type(_type.lower())
except ValueError:
_type = self.Relation.External
relations.append(
self.Relation.create(
database=_database,
schema=_schema,
identifier=_identifier,
quote_policy=quote_policy,
type=_type,
)
)
)

return relations

Expand Down
8 changes: 6 additions & 2 deletions dbt/include/snowflake/macros/adapters.sql
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@
{% for _ in range(0, max_iter) %}

{%- set paginated_sql -%}
show terse objects in {{ schema_relation.database }}.{{ schema_relation.schema }} limit {{ max_results_per_iter }} from '{{ watermark.table_name }}'
{{ snowflake__get_show_objects_sql(schema_relation, max_results_per_iter) }} from '{{ watermark.table_name }}'
{%- endset -%}

{%- set paginated_result = run_query(paginated_sql) %}
Expand Down Expand Up @@ -119,12 +119,16 @@

{% endmacro %}

{#
    Render the `show objects` statement that lists the objects in a schema.

    Args:
        schema: object exposing `database` and `schema` attributes that identify the schema to list
        results_per_iteration: value rendered into the `limit` clause

    Callers that paginate append their own `from '<name>'` clause after this output.
#}
{% macro snowflake__get_show_objects_sql(schema, results_per_iteration) %}
show objects in {{ schema.database }}.{{ schema.schema }} limit {{ results_per_iteration }}
{% endmacro %}

{% macro snowflake__list_relations_without_caching(schema_relation, max_iter=10, max_results_per_iter=10000) %}

{%- set max_total_results = max_results_per_iter * max_iter -%}

{%- set sql -%}
show terse objects in {{ schema_relation.database }}.{{ schema_relation.schema }} limit {{ max_results_per_iter }}
{{ snowflake__get_show_objects_sql(schema_relation, max_results_per_iter) }}
{%- endset -%}

{%- set result = run_query(sql) -%}
Expand Down
2 changes: 2 additions & 0 deletions test.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,5 @@ SNOWFLAKE_TEST_WAREHOUSE=my_warehouse_name
DBT_TEST_USER_1=dbt_test_role_1
DBT_TEST_USER_2=dbt_test_role_2
DBT_TEST_USER_3=dbt_test_role_3

DBT_PERFORMANCE_TESTING=0
17 changes: 17 additions & 0 deletions tests/performance/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import os

import pytest


def _get_setting(environment_variable: str) -> bool:
    """Return whether a boolean-style environment variable is switched on.

    Environment values are always strings, so the comparison is done on the
    stripped, lower-cased text. ``"1"`` and any casing of ``"true"`` count as
    enabled; everything else, including an unset variable, counts as disabled.

    Args:
        environment_variable: name of the environment variable to read.

    Returns:
        ``True`` when the variable is set to an enabling value, else ``False``.
    """
    raw_value = os.environ.get(environment_variable)
    if raw_value is None:
        return False
    return raw_value.strip().lower() in {"true", "1"}


# Decorator for performance tests: the test is skipped unless the
# `DBT_PERFORMANCE_TESTING` environment variable is set to an enabling value.
# The condition is evaluated once, when this module is imported.
performance_test = pytest.mark.skipif(
    not _get_setting("DBT_PERFORMANCE_TESTING"),
    reason="Performance test skipped, to turn on performance testing, "
    "please set the environment variable `DBT_PERFORMANCE_TESTING`",
)
51 changes: 51 additions & 0 deletions tests/performance/list_relations_tests/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
Performance tests were run using both `show objects` and `show terse objects` at three scales.
With `2024_03` turned off, both methods are able to correctly identify a dynamic table.
However, when `2024_03` is turned on, only `show objects` is able to correctly identify
a dynamic table. This is done by inspecting the new column `is_dynamic` since both a table
and a dynamic table show up with a `kind` of table.
In order to properly compare the two methods, an additional scenario was added that does not
create dynamic tables, and instead splits those objects evenly between views and tables.

Let's take the small scale, which creates 30 objects, as an example.
One scenario creates 10 views, 10 tables, and 10 dynamic tables, resulting in 30 objects.
This is successful for `show objects` whether `2024_03` is turned on or off.
It is also successful for `show terse objects` when `2024_03` is turned off.
There is another scenario that creates 15 views and 15 tables, but no dynamic tables.
This scenario still creates 30 objects, and both methods return the correct types
regardless of setting for `2024_03`.
These scenarios can be combined to compare `show terse objects` with `2024_03` off
to `show objects` with `2024_03` turned on.
This comparison represents the change that will happen when `2024_03` becomes a mandatory bundle.

### 30 Objects

| 2024_03 | method | mean time | mean time - no DTs |
|:-------:|--------------------|----------:|-------------------:|
| NO | show terse objects | 1.02 s | -- |
| YES | show objects | 0.91 s | 0.92 s |
| YES | show terse objects | -- | 0.94 s |

- 11% improved run time of `list_relations_without_caching` when turning on `2024_03`
- similar performance of `show objects` and `show terse objects` in `2024_03`

### 300 Objects

| 2024_03 | method | mean time | mean time - no DTs |
|:-------:|--------------------|----------:|-------------------:|
| NO | show terse objects | 0.96 s | -- |
| YES | show objects | 1.19 s | 1.37 s |
| YES | show terse objects | -- | 0.92 s |

- 24% longer run time of `list_relations_without_caching` when turning on `2024_03`
- 49% longer run time of `show objects` than `show terse objects` in `2024_03`

### 3000 Objects

| 2024_03 | method | mean time | mean time - no DTs |
|:-------:|--------------------|----------:|-------------------:|
| NO | show terse objects | 2.00 s | -- |
| YES | show objects | 3.05 s | 3.22 s |
| YES | show terse objects | -- | 2.33 s |

- 53% longer run time of `list_relations_without_caching` when turning on `2024_03`
- 38% longer run time of `show objects` than `show terse objects` in `2024_03`
114 changes: 114 additions & 0 deletions tests/performance/list_relations_tests/list_relations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
from dataclasses import dataclass
from datetime import datetime, timedelta
import os
from statistics import mean
from typing import List, Tuple

import pytest

from dbt.adapters.factory import get_adapter_by_type
from dbt.adapters.snowflake import SnowflakeRelation

from dbt.tests.util import run_dbt, get_connection
from tests.performance.conftest import performance_test


# CSV content of the seed (`my_seed.csv`) that every generated model selects from.
SEED = """
id,value
0,red
1,yellow
2,blue
""".strip()


# Model body used for each `my_view_<i>.sql` file; it sets no materialization explicitly.
VIEW = """
select * from {{ ref('my_seed') }}
"""


# Model body used for each `my_table_<i>.sql` file; explicitly materialized as a table.
TABLE = """
{{ config(materialized='table') }}
select * from {{ ref('my_seed') }}
"""


# Model body used for each `my_dynamic_table_<i>.sql` file.
# The warehouse name is read from `SNOWFLAKE_TEST_WAREHOUSE` when this module is
# imported. An empty-string default keeps the import from raising `TypeError`
# (str + None) when the variable is unset, e.g. while collecting tests that
# will be skipped anyway.
DYNAMIC_TABLE = (
    """
{{ config(
materialized='dynamic_table',
target_lag='1 day',
snowflake_warehouse='"""
    + os.getenv("SNOWFLAKE_TEST_WAREHOUSE", "")
    + """',
) }}
select * from {{ ref('my_seed') }}
"""
)


@dataclass
class Scenario:
    """How many objects of each kind a performance run should create."""

    # number of view models to generate
    views: int
    # number of table models to generate
    tables: int
    # number of dynamic table models to generate
    dynamic_tables: int


class BaseConfig:
    """Shared project setup and timing harness for the list-relations performance tests.

    Subclasses set `scenario` (how many objects to build) and
    `expected_duration` (baseline mean in seconds); `iterations` controls how
    many times the listing call is timed.
    """

    # object counts for this run; provided by subclasses
    scenario: Scenario
    # baseline mean duration in seconds; provided by subclasses
    expected_duration: float
    # number of timed calls to `list_relations_without_caching`
    iterations: int = 10

    @pytest.fixture(scope="class")
    def seeds(self):
        """Provide the single seed that every generated model selects from."""
        yield {"my_seed.csv": SEED}

    @pytest.fixture(scope="class")
    def models(self):
        """Generate one model file per object requested by the scenario."""
        generated = {}
        for prefix, sql, count in (
            ("my_view", VIEW, self.scenario.views),
            ("my_table", TABLE, self.scenario.tables),
            ("my_dynamic_table", DYNAMIC_TABLE, self.scenario.dynamic_tables),
        ):
            generated.update({f"{prefix}_{i}.sql": sql for i in range(count)})
        yield generated

    @pytest.fixture(scope="class", autouse=True)
    def setup(self, project):
        """Build the seed and all models once per test class."""
        run_dbt(["seed"])
        run_dbt(["run"])

    def list_relations(self, project) -> Tuple[List[SnowflakeRelation], timedelta]:
        """List the relations in the test schema and time the call.

        Args:
            project: the test project; its `database` and `test_schema` are read.

        Returns:
            The relations returned by the adapter and the elapsed time, which
            includes entering and leaving the connection context.
        """
        # Local import keeps this block self-contained; a monotonic,
        # high-resolution clock is the right tool for measuring durations
        # (`datetime.utcnow()` is wall-clock and deprecated since Python 3.12).
        from time import perf_counter

        my_adapter = get_adapter_by_type("snowflake")
        schema = my_adapter.Relation.create(
            database=project.database, schema=project.test_schema, identifier=""
        )

        start = perf_counter()
        with get_connection(my_adapter):
            relations = my_adapter.list_relations_without_caching(schema)
        elapsed = perf_counter() - start
        return relations, timedelta(seconds=elapsed)

    @performance_test
    def test_list_relations(self, project):
        """Check relation types on every timed call and the mean duration overall."""
        durations = []
        for _ in range(self.iterations):
            relations, duration = self.list_relations(project)
            durations.append(duration.total_seconds())

            views = sum(1 for relation in relations if relation.is_view)
            tables = sum(1 for relation in relations if relation.is_table)
            dynamic_tables = sum(1 for relation in relations if relation.is_dynamic_table)

            assert views == self.scenario.views
            assert tables == self.scenario.tables + 1  # add the seed
            assert dynamic_tables == self.scenario.dynamic_tables

        assert mean(durations) < self.expected_duration * 1.10  # allow for 10% error
48 changes: 48 additions & 0 deletions tests/performance/list_relations_tests/test_show_objects.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from datetime import timedelta

import pytest

from tests.performance.list_relations_tests.list_relations import BaseConfig, Scenario


# Macro source that renders a `show objects` statement; supplied to the test
# project through the `macros` fixture below.
SHOW_OBJECTS_MACRO = """
{% macro snowflake__get_show_objects_sql(schema, results_per_iteration) %}
show objects in {{ schema.database }}.{{ schema.schema }} limit {{ results_per_iteration }}
{% endmacro %}
"""


class ShowObjects(BaseConfig):
    """Base for scenarios whose project macro renders `show objects`."""

    @pytest.fixture(scope="class")
    def macros(self):
        """Yield the macro files for the test project, keyed by file name."""
        project_macros = {"snowflake__get_show_objects_sql.sql": SHOW_OBJECTS_MACRO}
        yield project_macros


class TestShowObjects10View10Table10Dynamic(ShowObjects):
    """`show objects` with 10 views, 10 tables and 10 dynamic tables."""

    scenario = Scenario(views=10, tables=10, dynamic_tables=10)
    # baseline mean in seconds (0.92)
    expected_duration = timedelta(milliseconds=920).total_seconds()


class TestShowObjects15View15Table0Dynamic(ShowObjects):
    """`show objects` with 15 views, 15 tables and no dynamic tables."""

    scenario = Scenario(views=15, tables=15, dynamic_tables=0)
    # baseline mean in seconds (0.92)
    expected_duration = timedelta(milliseconds=920).total_seconds()


class TestShowObjects100View100Table100Dynamic(ShowObjects):
    """`show objects` with 100 views, 100 tables and 100 dynamic tables."""

    scenario = Scenario(views=100, tables=100, dynamic_tables=100)
    # baseline mean in seconds (1.37)
    expected_duration = timedelta(seconds=1, milliseconds=370).total_seconds()


class TestShowObjects150View150Table0Dynamic(ShowObjects):
    """`show objects` with 150 views, 150 tables and no dynamic tables."""

    scenario = Scenario(views=150, tables=150, dynamic_tables=0)
    # baseline mean in seconds (1.37)
    expected_duration = timedelta(seconds=1, milliseconds=370).total_seconds()


class TestShowObjects1000View1000Table1000Dynamic(ShowObjects):
    """`show objects` with 1000 views, 1000 tables and 1000 dynamic tables."""

    scenario = Scenario(views=1000, tables=1000, dynamic_tables=1000)
    # baseline mean in seconds (3.40)
    expected_duration = timedelta(seconds=3, milliseconds=400).total_seconds()


class TestShowObjects1500View1500Table0Dynamic(ShowObjects):
    """`show objects` with 1500 views, 1500 tables and no dynamic tables."""

    scenario = Scenario(views=1500, tables=1500, dynamic_tables=0)
    # baseline mean in seconds (3.40)
    expected_duration = timedelta(seconds=3, milliseconds=400).total_seconds()
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from datetime import timedelta

import pytest

from tests.performance.list_relations_tests.list_relations import BaseConfig, Scenario


# Macro source that renders a `show terse objects` statement; supplied to the
# test project through the `macros` fixture below.
SHOW_TERSE_OBJECTS_MACRO = """
{% macro snowflake__get_show_objects_sql(schema, results_per_iteration) %}
show terse objects in {{ schema.database }}.{{ schema.schema }} limit {{ results_per_iteration }}
{% endmacro %}
"""


class ShowTerseObjects(BaseConfig):
    """Base for scenarios whose project macro renders `show terse objects`."""

    @pytest.fixture(scope="class")
    def macros(self):
        """Yield the macro files for the test project, keyed by file name."""
        project_macros = {"snowflake__get_show_objects_sql.sql": SHOW_TERSE_OBJECTS_MACRO}
        yield project_macros


class TestShowTerseObjects10View10Table10Dynamic(ShowTerseObjects):
    """`show terse objects` with 10 views, 10 tables and 10 dynamic tables."""

    scenario = Scenario(views=10, tables=10, dynamic_tables=10)
    # baseline mean in seconds (1.02)
    expected_duration = timedelta(seconds=1, milliseconds=20).total_seconds()


class TestShowTerseObjects15View15Table0Dynamic(ShowTerseObjects):
    """`show terse objects` with 15 views, 15 tables and no dynamic tables."""

    scenario = Scenario(views=15, tables=15, dynamic_tables=0)
    # baseline mean in seconds (1.02)
    expected_duration = timedelta(seconds=1, milliseconds=20).total_seconds()


class TestShowTerseObjects100View100Table100Dynamic(ShowTerseObjects):
    """`show terse objects` with 100 views, 100 tables and 100 dynamic tables."""

    scenario = Scenario(views=100, tables=100, dynamic_tables=100)
    # baseline mean in seconds (0.96)
    expected_duration = timedelta(milliseconds=960).total_seconds()


class TestShowTerseObjects150View150Table0Dynamic(ShowTerseObjects):
    """`show terse objects` with 150 views, 150 tables and no dynamic tables."""

    scenario = Scenario(views=150, tables=150, dynamic_tables=0)
    # baseline mean in seconds (0.96)
    expected_duration = timedelta(milliseconds=960).total_seconds()


class TestShowTerseObjects1000View1000Table1000Dynamic(ShowTerseObjects):
    """`show terse objects` with 1000 views, 1000 tables and 1000 dynamic tables."""

    scenario = Scenario(views=1000, tables=1000, dynamic_tables=1000)
    # baseline mean in seconds (2.33)
    expected_duration = timedelta(seconds=2, milliseconds=330).total_seconds()


class TestShowTerseObjects1500View1500Table0Dynamic(ShowTerseObjects):
    """`show terse objects` with 1500 views, 1500 tables and no dynamic tables."""

    scenario = Scenario(views=1500, tables=1500, dynamic_tables=0)
    # baseline mean in seconds (2.33)
    expected_duration = timedelta(seconds=2, milliseconds=330).total_seconds()
Loading