Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Bug] Fix scenario where dbt attempts to add existing columns to relations when using the SDK for column metadata #919

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .changes/unreleased/Fixes-20240927-171725.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
kind: Fixes
body: Fix scenario where dbt attempts to add existing columns to relations when using the SDK for column metadata
time: 2024-09-27T17:17:25.584838-04:00
custom:
Author: mikealfare
Issue: "914"
9 changes: 9 additions & 0 deletions dbt/adapters/redshift/connections.py
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,15 @@ def _parse_column_results(record: Tuple[Any, ...]) -> Dict[str, Any]:
char_dtypes = [1, 12]
num_dtypes = [2, 3, 4, 5, 6, 7, 8, -5, 2003]

# the results from `get_columns` vary slightly from the pg_catalog tables for dtype names
dtype_alias = {
"bool": "boolean",
"int4": "integer",
"timestamp": "timestamp without time zone",
"varchar": "character varying",
}
dtype_name = dtype_alias.get(dtype_name, dtype_name)

if dtype_code in char_dtypes:
return {"column": column_name, "dtype": dtype_name, "char_size": column_size}
elif dtype_code in num_dtypes:
Expand Down
2 changes: 1 addition & 1 deletion dbt/adapters/redshift/impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def _behavior_flags(self) -> List[BehaviorFlag]:
return [
{
"name": "restrict_direct_pg_catalog_access",
"default": False,
"default": True,
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This allows us to run all tests as if the flag were enabled without adjusting the tests. This should be reverted prior to merging.

"description": (
"The dbt-redshift adapter is migrating from using pg_ tables "
"to using Redshift Metadata API and information_schema tables "
Expand Down
Empty file.
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,7 @@ def models(self):
def setup(self, project):
run_dbt(["run"])

@pytest.fixture(scope="class")
def expected_columns(self):
return []

def test_columns_in_relation(self, project, expected_columns):
def test_columns_in_relation(self, project):
my_relation = RedshiftRelation.create(
database=project.database,
schema=project.test_schema,
Expand All @@ -28,6 +24,10 @@ def test_columns_in_relation(self, project, expected_columns):
)
with project.adapter.connection_named("_test"):
actual_columns = project.adapter.get_columns_in_relation(my_relation)
expected_columns = [
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we are aliasing dtypes back to the original names, we don't need to specify the expected column dtypes based on whether the flag is enabled.

Column(column="my_num", dtype="numeric", numeric_precision=3, numeric_scale=2),
Column(column="my_char", dtype="character varying", char_size=1),
]
assert actual_columns == expected_columns


Expand All @@ -36,24 +36,8 @@ class TestColumnsInRelationBehaviorFlagOff(ColumnsInRelation):
def project_config_update(self):
return {"flags": {}}

@pytest.fixture(scope="class")
def expected_columns(self):
# the SDK query returns "varchar" whereas our custom query returns "character varying"
return [
Column(column="my_num", dtype="numeric", numeric_precision=3, numeric_scale=2),
Column(column="my_char", dtype="character varying", char_size=1),
]


class TestColumnsInRelationBehaviorFlagOn(ColumnsInRelation):
@pytest.fixture(scope="class")
def project_config_update(self):
return {"flags": {"restrict_direct_pg_catalog_access": True}}

@pytest.fixture(scope="class")
def expected_columns(self):
# the SDK query returns "varchar" whereas our custom query returns "character varying"
return [
Column(column="my_num", dtype="numeric", numeric_precision=3, numeric_scale=2),
Column(column="my_char", dtype="varchar", char_size=1),
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
from dbt.tests.util import run_dbt
import pytest

from tests.functional.utils import update_model


# Seed data for the incremental scenario. Rows that share an ``id`` but carry
# a later ``occurred_at`` represent follow-up records picked up by the
# date-windowed incremental runs below.
# NOTE(review): the single quotes around the dates (e.g. '2024-01-01') are
# part of the CSV field value; the models cast via ``::timestamptz`` —
# confirm the seed loads these as intended.
SEED = """
id,col7,col6,occurred_at
1,Cheetara,thunder,'2024-01-01'
2,Tygra,thunder,'2024-01-01'
2,Tygra,THUNDER,'2024-02-01'
3,Lion-O,thunder,'2024-01-01'
3,Lion-O,THUNDER,'2024-02-01'
3,Lion-O,THUNDERCATS,'2024-03-01'
""".strip()


# Initial build of the incremental model: January window only, and
# deliberately WITHOUT ``col7`` so a later run has a new column to append.
MODEL_INITIAL = """
{{ config(
materialized='incremental',
dist='col6',
on_schema_change='append_new_columns',
) }}
select
id::bigint as id,
col6::varchar(128) as col6,
occurred_at::timestamptz as occurred_at
from {{ ref('my_seed') }}
where occurred_at::timestamptz >= '2024-01-01'::timestamptz
and occurred_at::timestamptz < '2024-02-01'::timestamptz
"""


# Updated model: February window, now selecting ``col7`` as well, which
# ``on_schema_change='append_new_columns'`` must add to the existing relation.
MODEL_UPDATE = """
{{ config(
materialized='incremental',
dist='col6',
on_schema_change='append_new_columns',
) }}
select
id::bigint as id,
col6::varchar(128) as col6,
occurred_at::timestamptz as occurred_at,
col7::varchar(56) as col7
from {{ ref('my_seed') }}
where occurred_at::timestamptz >= '2024-02-01'::timestamptz
and occurred_at::timestamptz < '2024-03-01'::timestamptz
"""


class TestIncrementalOnSchemaChange:
    """
    Regression test: an incremental run with ``on_schema_change='append_new_columns'``
    must not attempt to re-add columns that already exist on the relation.

    This addresses: https://github.com/dbt-labs/dbt-redshift/issues/914
    """

    @pytest.fixture(scope="class")
    def project_config_update(self):
        # Pin the behavior flag so this class exercises a single, known
        # column-metadata code path regardless of the adapter's default.
        # NOTE(review): confirm False (the direct pg_catalog path) is the
        # intended value here versus the SDK-backed path.
        return {"flags": {"restrict_direct_pg_catalog_access": False}}

    @pytest.fixture(scope="class")
    def seeds(self):
        return {"my_seed.csv": SEED}

    @pytest.fixture(scope="class")
    def models(self):
        # Start from the narrower model; the test swaps in MODEL_UPDATE later.
        return {"my_model.sql": MODEL_INITIAL}

    def test_columns_in_relation(self, project):
        # Build the initial model, then widen it with ``col7`` and run again;
        # the second run must append only the genuinely new column.
        run_dbt(["seed"])
        run_dbt(["run"])
        update_model(project, "my_model", MODEL_UPDATE)
        run_dbt(["run"])
        # a successful run is a pass
8 changes: 8 additions & 0 deletions tests/functional/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from dbt.tests.util import get_model_file, relation_from_name, set_model_file


def update_model(project, name: str, model: str) -> str:
    """Overwrite the model file for *name* in *project* with *model*.

    Returns the file's previous contents so the caller can restore them.
    """
    target = relation_from_name(project.adapter, name)
    previous_contents = get_model_file(project, target)
    set_model_file(project, target, model)
    return previous_contents
Loading