Skip to content

Commit

Permalink
Feat: Handle special chars in column and table name normalization (#239)
Browse files Browse the repository at this point in the history
  • Loading branch information
aaronsteers authored May 23, 2024
1 parent bd8ee6d commit ad850f0
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 6 deletions.
6 changes: 2 additions & 4 deletions airbyte/_processors/sql/snowflake.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,7 @@ def path_str(path: Path) -> str:
]
files_list = ", ".join([f"'{f.name}'" for f in files])
columns_list_str: str = indent("\n, ".join(columns_list), " " * 12)
variant_cols_str: str = ("\n" + " " * 21 + ", ").join(
[f"$1:{self.normalizer.normalize(col)}" for col in columns_list]
)
variant_cols_str: str = ("\n" + " " * 21 + ", ").join([f"$1:{col}" for col in columns_list])
copy_statement = dedent(
f"""
COPY INTO {temp_table_name}
Expand All @@ -143,7 +141,7 @@ def path_str(path: Path) -> str:
FROM {internal_sf_stage_name}
)
FILES = ( {files_list} )
FILE_FORMAT = ( TYPE = JSON )
FILE_FORMAT = ( TYPE = JSON, COMPRESSION = GZIP )
;
"""
)
Expand Down
39 changes: 37 additions & 2 deletions airbyte/_util/name_normalizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@
from __future__ import annotations

import abc
import re
from typing import TYPE_CHECKING

from airbyte import exceptions as exc


if TYPE_CHECKING:
from collections.abc import Iterable
Expand Down Expand Up @@ -46,8 +49,40 @@ class LowerCaseNormalizer(NameNormalizerBase):

@staticmethod
def normalize(name: str) -> str:
    """Return the normalized form of a column or table name.

    The name is lower-cased, then:

    - Every non-alphanumeric character is replaced with an underscore, one
      underscore per character. Runs are not collapsed and leading/trailing
      underscores are not stripped.
    - Any name that starts with a digit ("1", "2", "123", "1b" etc.) is
      prefixed with an underscore ("_1", "_2", "_123", "_1b" etc.).

    Examples:
    - "Hello World!" -> "hello_world_"
    - "Hello, World!" -> "hello__world_"
    - "Hello - World" -> "hello___world"
    - "___Hello, World___" -> "___hello__world___"
    - "Average Sales (%)" -> "average_sales____"
    - "Average Sales (#)" -> "average_sales____"
    - "+1" -> "_1"
    - "-1" -> "_1"

    Raises:
        PyAirbyteNameNormalizationError: If the result is empty or consists
            only of underscores (e.g. for "", "+" or "!@$").
    """
    # Replace all non-alphanumeric characters with underscores.
    result = re.sub(r"[^A-Za-z0-9]", "_", name.lower())

    # Most databases do not allow identifiers to start with a number, so
    # prepend "_" when the name starts with a digit. Slicing keeps the
    # check safe for an empty string.
    if result[:1].isdigit():
        result = f"_{result}"

    # A name made up solely of underscores carries no information.
    if not result.strip("_"):
        raise exc.PyAirbyteNameNormalizationError(
            message="Name cannot be empty after normalization.",
            raw_name=name,
            normalization_result=result,
        )

    return result


__all__ = [
Expand Down
17 changes: 17 additions & 0 deletions airbyte/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,23 @@ class PyAirbyteNoStreamsSelectedError(PyAirbyteInputError):
available_streams: list[str] | None = None


# Normalization Errors


@dataclass
class PyAirbyteNameNormalizationError(PyAirbyteError, ValueError):
    """Raised when a table or column name cannot be normalized."""

    # Class-level overrides (unannotated, so they are not dataclass fields).
    guidance = (
        "Please consider renaming the source object if possible, "
        "or raise an issue in GitHub if not."
    )
    help_url = NEW_ISSUE_URL

    # The name as it was received, before normalization.
    raw_name: str | None = None
    # The result the normalizer produced for `raw_name`.
    normalization_result: str | None = None


# PyAirbyte Cache Errors


Expand Down
39 changes: 39 additions & 0 deletions tests/unit_tests/test_text_normalization.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import pytest
from airbyte import exceptions as exc
from airbyte._util.name_normalizers import LowerCaseNormalizer
from airbyte.constants import AB_INTERNAL_COLUMNS
from airbyte.records import StreamRecord

Expand Down Expand Up @@ -116,3 +118,40 @@ def test_case_insensitive_w_pretty_keys() -> None:
# Assert case insensitivity when comparing natively to a dict
assert cid == {"UPPER": 1, "lower": 2, "other": None}
assert cid == {"upper": 1, "lower": 2, "other": None}


@pytest.mark.parametrize(
    "raw_value, expected_result, should_raise",
    [
        ("_airbyte_meta", "_airbyte_meta", False),
        ("Test_String", "test_string", False),
        ("ANOTHER-TEST", "another_test", False),
        ("another.test", "another_test", False),
        ("sales(%)", "sales___", False),
        ("something_-_-_-_else", "something_______else", False),
        ("sales (%)", "sales____", False),
        ("sales-%", "sales__", False),
        ("sales(#)", "sales___", False),
        ("sales (#)", "sales____", False),
        ("sales--(#)", "sales_____", False),
        ("sales-#", "sales__", False),
        ("+1", "_1", False),
        ("1", "_1", False),
        ("2", "_2", False),
        ("3", "_3", False),
        ("-1", "_1", False),
        ("+#$", "", True),
        ("+", "", True),
        ("", "", True),
        ("*", "", True),
        ("!@$", "", True),
    ],
)
def test_lower_case_normalizer(
    raw_value: str,
    expected_result: str,
    should_raise: bool,
) -> None:
    """Check `LowerCaseNormalizer.normalize` against known inputs.

    For rows with `should_raise=True` the call must raise
    `PyAirbyteNameNormalizationError`; `expected_result` is unused for
    those rows because the call never returns a value.
    """
    normalizer = LowerCaseNormalizer()

    if should_raise:
        # Keep only the raising call inside the context manager: anything
        # after it would never run when the test passes.
        with pytest.raises(exc.PyAirbyteNameNormalizationError):
            normalizer.normalize(raw_value)
        return

    assert normalizer.normalize(raw_value) == expected_result

0 comments on commit ad850f0

Please sign in to comment.