Feat: Handle special chars in column and table name normalization (#239)

airbytehq · May 23, 2024 · ad850f0 · ad850f0
1 parent bd8ee6d
commit ad850f0
Show file tree

Hide file tree

Showing 4 changed files with 95 additions and 6 deletions.
diff --git a/airbyte/_processors/sql/snowflake.py b/airbyte/_processors/sql/snowflake.py
@@ -129,9 +129,7 @@ def path_str(path: Path) -> str:
         ]
         files_list = ", ".join([f"'{f.name}'" for f in files])
         columns_list_str: str = indent("\n, ".join(columns_list), " " * 12)
-        variant_cols_str: str = ("\n" + " " * 21 + ", ").join(
-            [f"$1:{self.normalizer.normalize(col)}" for col in columns_list]
-        )
+        variant_cols_str: str = ("\n" + " " * 21 + ", ").join([f"$1:{col}" for col in columns_list])
         copy_statement = dedent(
             f"""
             COPY INTO {temp_table_name}
@@ -143,7 +141,7 @@ def path_str(path: Path) -> str:
                 FROM {internal_sf_stage_name}
             )
             FILES = ( {files_list} )
-            FILE_FORMAT = ( TYPE = JSON )
+            FILE_FORMAT = ( TYPE = JSON, COMPRESSION = GZIP )
             ;
             """
         )

diff --git a/airbyte/_util/name_normalizers.py b/airbyte/_util/name_normalizers.py
@@ -4,8 +4,11 @@
 from __future__ import annotations
 
 import abc
+import re
 from typing import TYPE_CHECKING
 
+from airbyte import exceptions as exc
+
 
 if TYPE_CHECKING:
     from collections.abc import Iterable
@@ -46,8 +49,40 @@ class LowerCaseNormalizer(NameNormalizerBase):
 
     @staticmethod
     def normalize(name: str) -> str:
-        """Return the normalized name."""
-        return name.lower().replace(" ", "_").replace("-", "_")
+        """Return the normalized name.
+
+        - All non-alphanumeric characters are replaced with underscores.
+        - Any names that start with a numeric ("1", "2", "123", "1b" etc.) are prefixed
+          with an underscore ("_1", "_2", "_123", "_1b" etc.)
+
+        Examples:
+        - "Hello World!" -> "hello_world_"
+        - "Hello, World!" -> "hello__world_"
+        - "Hello - World" -> "hello___world"
+        - "___Hello, World___" -> "___hello__world___"
+        - "Average Sales (%)" -> "average_sales____"
+        - "Average Sales (#)" -> "average_sales____"
+        - "+1" -> "_1"
+        - "-1" -> "_1"
+        """
+        result = name
+
+        # Replace all non-alphanumeric characters with underscores.
+        result = re.sub("[^A-Za-z0-9]", "_", result.lower())
+
+        # Check if name starts with a number and prepend "_" if it does.
+        if result and result[0].isdigit():
+            # Most databases do not allow identifiers to start with a number.
+            result = f"_{result}"
+
+        if not result.replace("_", ""):
+            raise exc.PyAirbyteNameNormalizationError(
+                message="Name cannot be empty after normalization.",
+                raw_name=name,
+                normalization_result=result,
+            )
+
+        return result
 
 
 __all__ = [

diff --git a/airbyte/exceptions.py b/airbyte/exceptions.py
@@ -172,6 +172,23 @@ class PyAirbyteNoStreamsSelectedError(PyAirbyteInputError):
     available_streams: list[str] | None = None
 
 
+# Normalization Errors
+
+
+@dataclass
+class PyAirbyteNameNormalizationError(PyAirbyteError, ValueError):
+    """Error occurred while normalizing a table or column name."""
+
+    guidance = (
+        "Please consider renaming the source object if possible, or "
+        "raise an issue in GitHub if not."
+    )
+    help_url = NEW_ISSUE_URL
+
+    raw_name: str | None = None
+    normalization_result: str | None = None
+
+
 # PyAirbyte Cache Errors
 
 

diff --git a/tests/unit_tests/test_text_normalization.py b/tests/unit_tests/test_text_normalization.py
@@ -1,4 +1,6 @@
 import pytest
+from airbyte import exceptions as exc
+from airbyte._util.name_normalizers import LowerCaseNormalizer
 from airbyte.constants import AB_INTERNAL_COLUMNS
 from airbyte.records import StreamRecord
 
@@ -116,3 +118,40 @@ def test_case_insensitive_w_pretty_keys() -> None:
     # Assert case insensitivity when comparing natively to a dict
     assert cid == {"UPPER": 1, "lower": 2, "other": None}
     assert cid == {"upper": 1, "lower": 2, "other": None}
+
+
+@pytest.mark.parametrize(
+    "raw_value, expected_result, should_raise",
+    [
+        ("_airbyte_meta", "_airbyte_meta", False),
+        ("Test_String", "test_string", False),
+        ("ANOTHER-TEST", "another_test", False),
+        ("another.test", "another_test", False),
+        ("sales(%)", "sales___", False),
+        ("something_-_-_-_else", "something_______else", False),
+        ("sales (%)", "sales____", False),
+        ("sales-%", "sales__", False),
+        ("sales(#)", "sales___", False),
+        ("sales (#)", "sales____", False),
+        ("sales--(#)", "sales_____", False),
+        ("sales-#", "sales__", False),
+        ("+1", "_1", False),
+        ("1", "_1", False),
+        ("2", "_2", False),
+        ("3", "_3", False),
+        ("-1", "_1", False),
+        ("+#$", "", True),
+        ("+", "", True),
+        ("", "", True),
+        ("*", "", True),
+        ("!@$", "", True),
+    ],
+)
+def test_lower_case_normalizer(raw_value, expected_result, should_raise):
+    normalizer = LowerCaseNormalizer()
+
+    if should_raise:
+        with pytest.raises(exc.PyAirbyteNameNormalizationError):
+            assert normalizer.normalize(raw_value) == expected_result
+    else:
+        assert normalizer.normalize(raw_value) == expected_result