
Commit

Update hive.py
acrylJonny committed Nov 12, 2024
1 parent 4f3b64d commit 5f7af5b
Showing 1 changed file with 14 additions and 6 deletions.
metadata-ingestion/src/datahub/ingestion/source/sql/hive.py
20 changes: 14 additions & 6 deletions
@@ -58,6 +58,7 @@
     StringTypeClass,
     BytesTypeClass,
     BooleanTypeClass,
+    SchemaFieldDataTypeClass,
 )
 
 from datahub.utilities import config_clean
@@ -530,7 +531,7 @@ def _schema_from_spark(
             version=0,
             fields=fields,
             hash="",
-            platformSchema={"name": "spark", "version": "1.0"}
+            platformSchema=OtherSchemaClass(rawSchema="")
         )
 
     def _schema_from_arrow(
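
Why this hunk matters: platformSchema expects one of DataHub's typed platform-schema classes (here OtherSchemaClass), not a plain dict like {"name": "spark", "version": "1.0"}. A minimal sketch of a construction consistent with the fixed line; all field values below are illustrative and not taken from this commit:

from datahub.metadata.schema_classes import (
    OtherSchemaClass,
    SchemaMetadataClass,
)

# Hypothetical example values, for illustration only.
schema_metadata = SchemaMetadataClass(
    schemaName="example_table",                     # hypothetical table name
    platform="urn:li:dataPlatform:hive",
    version=0,
    fields=[],                                      # SchemaFieldClass entries would go here
    hash="",
    platformSchema=OtherSchemaClass(rawSchema=""),  # typed class, not a raw dict
)
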
@@ -559,9 +560,16 @@ def _schema_from_arrow(
             platformSchema=OtherSchemaClass(rawSchema="")
         )
 
-    def _get_field_type(self, data_type: Any) -> SchemaFieldClass.typeClass:
-        """Map storage data types to DataHub types"""
-        # Map Spark/Arrow types to DataHub types
+    def _get_field_type(self, data_type: Any) -> SchemaFieldDataTypeClass:
+        """
+        Map storage data types to DataHub types
+
+        Args:
+            data_type: The source data type (from Spark/Arrow/etc.)
+
+        Returns:
+            DataHub SchemaFieldDataTypeClass instance
+        """
         type_mapping = {
             "string": StringTypeClass,
             "binary": BytesTypeClass,
@@ -580,9 +588,9 @@ def _get_field_type(self, data_type: Any) -> SchemaFieldClass.typeClass:
         type_str = str(data_type).lower()
         for key, type_class in type_mapping.items():
             if key in type_str:
-                return type_class()
+                return SchemaFieldDataTypeClass(type=type_class())
 
-        return NullTypeClass()
+        return SchemaFieldDataTypeClass(type=NullTypeClass())
 
     def get_storage_dataset_mcp(
         self,
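
The two return-statement changes above exist because SchemaFieldClass.type takes a SchemaFieldDataTypeClass wrapper, not a bare type instance such as StringTypeClass(). A short usage sketch, assuming a hypothetical string column; names are illustrative and not from this commit:

from datahub.metadata.schema_classes import (
    SchemaFieldClass,
    SchemaFieldDataTypeClass,
    StringTypeClass,
)

# What the fixed _get_field_type now returns for a "string" native type.
field_type = SchemaFieldDataTypeClass(type=StringTypeClass())

# The wrapper is what SchemaFieldClass expects for its type attribute.
field = SchemaFieldClass(
    fieldPath="example_column",   # hypothetical column name
    type=field_type,
    nativeDataType="string",      # the source system's own type string
)

Note that the lookup uses substring containment (key in type_str), so a source type whose lowercased string form is, say, "stringtype()" still matches the "string" key.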
