Merge pull request #278 from capitalone/develop

v0.5.9
capitalone · Jun 20, 2024 · d6d098a · d6d098a
2 parents 90c8c8a + e280467
commit d6d098a
Show file tree

Hide file tree

Showing 5 changed files with 68 additions and 20 deletions.
diff --git a/locopy/_version.py b/locopy/_version.py
@@ -14,4 +14,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.5.8"
+__version__ = "0.5.9"
diff --git a/locopy/utility.py b/locopy/utility.py
@@ -251,14 +251,14 @@ def find_column_type(dataframe, warehouse_type: str):
 
     Following is the list of pandas data types that the function checks and their mapping in sql:
 
-        - bool -> boolean
-        - datetime64[ns] -> timestamp
+        - bool/pd.BooleanDtype -> boolean
+        - datetime64[ns, <tz>] -> timestamp
         - M8[ns] -> timestamp
-        - int -> int
-        - float -> float
+        - int/pd.Int64Dtype -> int
+        - float/pd.Float64Dtype -> float
         - float object -> float
         - datetime object -> timestamp
-        - object -> varchar
+        - object/pd.StringDtype -> varchar
 
     For all other data types, the column will be mapped to varchar type.
 
@@ -313,19 +313,19 @@ def validate_float_object(column):
         data = dataframe[column].dropna().reset_index(drop=True)
         if data.size == 0:
             column_type.append("varchar")
-        elif data.dtype in ["datetime64[ns]", "M8[ns]"]:
+        elif (data.dtype in ["datetime64[ns]", "M8[ns]"]) or (re.match("(datetime64\[ns\,\W)([a-zA-Z]+)(\])",str(data.dtype))):
             column_type.append("timestamp")
-        elif data.dtype == "bool":
+        elif str(data.dtype).lower().startswith("bool"):
             column_type.append("boolean")
         elif str(data.dtype).startswith("object"):
             data_type = validate_float_object(data) or validate_date_object(data)
             if not data_type:
                 column_type.append("varchar")
             else:
                 column_type.append(data_type)
-        elif str(data.dtype).startswith("int"):
+        elif str(data.dtype).lower().startswith("int"):
             column_type.append("int")
-        elif str(data.dtype).startswith("float"):
+        elif str(data.dtype).lower().startswith("float"):
             column_type.append("float")
         else:
             column_type.append("varchar")

diff --git a/pyproject.toml b/pyproject.toml
@@ -6,7 +6,7 @@ authors = [
   { name="Faisal Dosani", email="[email protected]" },
 ]
 license = {text = "Apache Software License"}
-dependencies = ["boto3<=1.34.83,>=1.9.92", "PyYAML<=6.0.1,>=5.1", "pandas<=2.2.2,>=0.25.2", "numpy<=1.26.4,>=1.22.0"]
+dependencies = ["boto3<=1.34.126,>=1.9.92", "PyYAML<=6.0.1,>=5.1", "pandas<=2.2.2,>=0.25.2", "numpy<=1.26.4,>=1.22.0"]
 
 requires-python = ">=3.8.0"
 classifiers = [

diff --git a/requirements.txt b/requirements.txt
@@ -4,35 +4,36 @@
 #
 #    pip-compile --output-file=requirements.txt pyproject.toml
 #
-boto3==1.28.63
+
+boto3==1.34.126
     # via locopy (pyproject.toml)
-botocore==1.31.67
+botocore==1.34.130
     # via
     #   boto3
     #   s3transfer
 jmespath==1.0.1
     # via
     #   boto3
     #   botocore
-numpy==1.26.0
+numpy==1.26.4
     # via
     #   locopy (pyproject.toml)
     #   pandas
-pandas==2.1.1
+pandas==2.2.2
     # via locopy (pyproject.toml)
-python-dateutil==2.8.2
+python-dateutil==2.9.0.post0
     # via
     #   botocore
     #   pandas
-pytz==2023.3.post1
+pytz==2024.1
     # via pandas
 pyyaml==6.0.1
     # via locopy (pyproject.toml)
-s3transfer==0.7.0
+s3transfer==0.10.1
     # via boto3
 six==1.16.0
     # via python-dateutil
-tzdata==2023.3
+tzdata==2024.1
     # via pandas
-urllib3==2.0.7
+urllib3==2.2.2
     # via botocore
diff --git a/tests/test_utility.py b/tests/test_utility.py
@@ -26,6 +26,7 @@
 from itertools import cycle
 from pathlib import Path
 from unittest import mock
+import datetime
 
 import pytest
 
@@ -340,6 +341,52 @@ def test_find_column_type():
     assert find_column_type(input_text, "snowflake") == output_text_snowflake
     assert find_column_type(input_text, "redshift") == output_text_redshift
 
+def test_find_column_type_new():
+    """Check that pandas extension and tz-aware dtypes map to SQL types.
+
+    Builds a one-row frame, casts its columns to the nullable ``Int64``,
+    ``Float64``, ``string`` and ``boolean`` extension dtypes plus a
+    timezone-aware ``datetime64[ns, UTC]`` dtype, and verifies that
+    ``find_column_type`` reports int / float / varchar / boolean /
+    timestamp for both the "snowflake" and "redshift" warehouse types.
+    """
+    import pandas as pd
+
+    input_text = pd.DataFrame.from_dict(
+        {
+            "a": [1],
+            "b": [pd.Timestamp("2017-01-01T12+0")],
+            "c": [1.2],
+            "d": ["a"],
+            "e": [True],
+        }
+    )
+
+    # Cast each column to the extension / tz-aware dtype under test.
+    input_text = input_text.astype(
+        dtype={
+            "a": pd.Int64Dtype(),
+            "b": pd.DatetimeTZDtype(tz=datetime.timezone.utc),
+            "c": pd.Float64Dtype(),
+            "d": pd.StringDtype(),
+            "e": pd.BooleanDtype(),
+        }
+    )
+
+    # Both warehouse types are expected to yield the same mapping, so a
+    # single expected dict is shared by the two assertions below.
+    expected = {
+        "a": "int",
+        "b": "timestamp",
+        "c": "float",
+        "d": "varchar",
+        "e": "boolean",
+    }
+
+    assert find_column_type(input_text, "snowflake") == expected
+    assert find_column_type(input_text, "redshift") == expected
+
+
 
 def test_get_ignoreheader_number():
     assert (