
fix: Handle more cases in the Hive parser for nested column support (#1755)

* Handle non-alpha-only cases for the Hive parser

Signed-off-by: Kristen Armes <[email protected]>

* Default to scalar type if the type string cannot be parsed

Signed-off-by: Kristen Armes <[email protected]>

* Add logging and comments

Signed-off-by: Kristen Armes <[email protected]>
kristenarmes authored Mar 10, 2022
1 parent 573219d commit 25fa0c4
Showing 7 changed files with 207 additions and 65 deletions.
37 changes: 27 additions & 10 deletions databuilder/databuilder/models/type_metadata.py
@@ -15,8 +15,10 @@

 class TypeMetadata(abc.ABC, GraphSerializable):
     NODE_LABEL = 'Type_Metadata'
-    RELATION_TYPE = 'TYPE_METADATA'
-    INVERSE_RELATION_TYPE = 'TYPE_METADATA_OF'
+    COL_TM_RELATION_TYPE = 'TYPE_METADATA'
+    TM_COL_RELATION_TYPE = 'TYPE_METADATA_OF'
+    SUBTYPE_RELATION_TYPE = 'SUBTYPE'
+    INVERSE_SUBTYPE_RELATION_TYPE = 'SUBTYPE_OF'
     KIND = 'kind'
     NAME = 'name'
     DATA_TYPE = 'data_type'
@@ -85,6 +87,16 @@ def description_key(self) -> Optional[str]:
return f"{self.key()}/{description_id}"
return None

def relation_type(self) -> str:
if isinstance(self.parent, ColumnMetadata):
return TypeMetadata.COL_TM_RELATION_TYPE
return TypeMetadata.SUBTYPE_RELATION_TYPE

def inverse_relation_type(self) -> str:
if isinstance(self.parent, ColumnMetadata):
return TypeMetadata.TM_COL_RELATION_TYPE
return TypeMetadata.INVERSE_SUBTYPE_RELATION_TYPE

def parent_key(self) -> str:
if isinstance(self.parent, ColumnMetadata):
column_key = self.parent.get_column_key()
@@ -152,8 +164,8 @@ def create_relation_iterator(self) -> Iterator[GraphRelationship]:
             start_key=self.parent_key(),
             end_label=TypeMetadata.NODE_LABEL,
             end_key=self.key(),
-            type=TypeMetadata.RELATION_TYPE,
-            reverse_type=TypeMetadata.INVERSE_RELATION_TYPE,
+            type=self.relation_type(),
+            reverse_type=self.inverse_relation_type(),
             attributes={}
         )

@@ -226,8 +238,8 @@ def create_relation_iterator(self) -> Iterator[GraphRelationship]:
             start_key=self.parent_key(),
             end_label=TypeMetadata.NODE_LABEL,
             end_key=self.key(),
-            type=TypeMetadata.RELATION_TYPE,
-            reverse_type=TypeMetadata.INVERSE_RELATION_TYPE,
+            type=self.relation_type(),
+            reverse_type=self.inverse_relation_type(),
             attributes={}
         )

@@ -248,6 +260,11 @@ def create_relation_iterator(self) -> Iterator[GraphRelationship]:


 class ScalarTypeMetadata(TypeMetadata):
+    """
+    ScalarTypeMetadata represents any non-complex type that does not
+    require special handling. It is also used as the default TypeMetadata
+    class when a type string cannot be parsed.
+    """
     kind = 'scalar'
 
     def __init__(self, *args: Any, **kwargs: Any) -> None:
@@ -292,8 +309,8 @@ def create_relation_iterator(self) -> Iterator[GraphRelationship]:
             start_key=self.parent_key(),
             end_label=TypeMetadata.NODE_LABEL,
             end_key=self.key(),
-            type=TypeMetadata.RELATION_TYPE,
-            reverse_type=TypeMetadata.INVERSE_RELATION_TYPE,
+            type=self.relation_type(),
+            reverse_type=self.inverse_relation_type(),
             attributes={}
         )

@@ -360,8 +377,8 @@ def create_relation_iterator(self) -> Iterator[GraphRelationship]:
             start_key=self.parent_key(),
             end_label=TypeMetadata.NODE_LABEL,
             end_key=self.key(),
-            type=TypeMetadata.RELATION_TYPE,
-            reverse_type=TypeMetadata.INVERSE_RELATION_TYPE,
+            type=self.relation_type(),
+            reverse_type=self.inverse_relation_type(),
             attributes={}
         )

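The net effect of the model changes above: a type node attached directly to a Column keeps the original TYPE_METADATA/TYPE_METADATA_OF relation labels, while type nodes nested inside another type node are now linked with SUBTYPE/SUBTYPE_OF. A minimal sketch of the selection logic (the ColumnMetadata constructor arguments are assumed here, not shown in this diff):

from databuilder.models.table_metadata import ColumnMetadata
from databuilder.models.type_metadata import ArrayTypeMetadata, ScalarTypeMetadata

# Assumed ColumnMetadata signature: name, description, type string, sort order
column = ColumnMetadata('has_nested_type', None, 'array<int>', 0)

# Type node attached directly to the column -> column/type-metadata labels
array_tm = ArrayTypeMetadata(name='type', parent=column, type_str='array<int>')
# Nested type node whose parent is another TypeMetadata -> subtype labels
inner_tm = ScalarTypeMetadata(name='_inner_', parent=array_tm, type_str='int')

assert array_tm.relation_type() == 'TYPE_METADATA'       # COL_TM_RELATION_TYPE
assert inner_tm.relation_type() == 'SUBTYPE'             # SUBTYPE_RELATION_TYPE
assert inner_tm.inverse_relation_type() == 'SUBTYPE_OF'  # INVERSE_SUBTYPE_RELATION_TYPE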
30 changes: 25 additions & 5 deletions databuilder/databuilder/utils/hive_complex_type_parser.py
@@ -1,29 +1,42 @@
 # Copyright Contributors to the Amundsen project.
 # SPDX-License-Identifier: Apache-2.0
 
+import logging
 from typing import Union
 
 from pyparsing import (
-    Forward, Group, Keyword, Word, alphanums, alphas, delimitedList, nestedExpr, originalTextFor,
+    Forward, Group, Keyword, OneOrMore, Optional, ParseException, Word, alphanums, delimitedList, nestedExpr, nums,
+    originalTextFor,
 )
 
 from databuilder.models.table_metadata import ColumnMetadata
 from databuilder.models.type_metadata import (
     ArrayTypeMetadata, MapTypeMetadata, ScalarTypeMetadata, StructTypeMetadata, TypeMetadata,
 )
 
+LOGGER = logging.getLogger(__name__)
+
 array_keyword = Keyword("array")
 map_keyword = Keyword("map")
 struct_keyword = Keyword("struct")
+union_keyword = Keyword("uniontype")
 
 field_name = Word(alphanums + "_")
 field_type = Forward()
 
+# Scalar types
+union_list = delimitedList(field_type)
+union_type = nestedExpr(
+    opener=union_keyword + "<", closer=">", content=union_list, ignoreExpr=None
+)
+scalar_quantifier = "(" + Word(nums) + Optional(")" | "," + Word(nums) + ")")
+scalar_type = union_type | OneOrMore(Word(alphanums + "_")) + Optional(scalar_quantifier)
+
+# Complex types
 array_field = "<" + field_type("type")
-map_field = field_name("key") + "," + field_type("type")
+map_field = originalTextFor(scalar_type)("key") + "," + field_type("type")
 struct_field = field_name("name") + ":" + field_type("type")
 struct_list = delimitedList(Group(struct_field))
 
-scalar_type = Word(alphas)
 array_type = nestedExpr(
     opener=array_keyword, closer=">", content=array_field, ignoreExpr=None
 )
@@ -42,7 +55,14 @@

 def parse_hive_type(type_str: str, name: str, parent: Union[ColumnMetadata, TypeMetadata]) -> TypeMetadata:
     type_str = type_str.lower()
-    parsed_type = complex_type.parseString(type_str, parseAll=True)
+    try:
+        parsed_type = complex_type.parseString(type_str, parseAll=True)
+    except ParseException:
+        # Default to scalar type if the type string cannot be parsed
+        LOGGER.warning(f"Could not parse type string, so defaulting to scalar value for type: {type_str}")
+        return ScalarTypeMetadata(name=name,
+                                  parent=parent,
+                                  type_str=type_str)
 
     if parsed_type.scalar_type:
         return ScalarTypeMetadata(name=name,
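With the widened grammar, scalar types with quantifiers (e.g. varchar(10), decimal(10,2)), type names containing digits or underscores, and uniontype keys now parse, and anything the grammar still rejects degrades to a plain scalar instead of raising. A short usage sketch under the same assumed ColumnMetadata signature as above:

from databuilder.models.table_metadata import ColumnMetadata
from databuilder.models.type_metadata import MapTypeMetadata, ScalarTypeMetadata
from databuilder.utils.hive_complex_type_parser import parse_hive_type

# Assumed ColumnMetadata signature: name, description, type string, sort order
column = ColumnMetadata('prices', None, 'map<decimal(10,2),string>', 0)

# Quantified scalar map keys such as decimal(10,2) are matched by scalar_quantifier
parsed = parse_hive_type('map<decimal(10,2),string>', 'prices', column)
assert isinstance(parsed, MapTypeMetadata)

# A malformed type string now logs a warning and falls back to ScalarTypeMetadata
# instead of raising ParseException
fallback = parse_hive_type('array<', 'broken', column)
assert isinstance(fallback, ScalarTypeMetadata)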
2 changes: 1 addition & 1 deletion databuilder/setup.py
@@ -4,7 +4,7 @@

 from setuptools import find_packages, setup
 
-__version__ = '6.7.1'
+__version__ = '6.7.2'
 
 requirements_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'requirements.txt')
 with open(requirements_path) as requirements_file:
@@ -764,20 +764,20 @@
                 '/type/has_nested_type',
             to_vertex_id='Type_Metadata:hive://gold.test_schema1/test_table1/has_nested_type'
                          '/type/has_nested_type/_inner_',
-            label='TYPE_METADATA'
+            label='SUBTYPE'
         ),
         METADATA_KEY_PROPERTY_NAME_BULK_LOADER_FORMAT: "{label}:{from_vertex_id}_{to_vertex_id}".format(
             from_vertex_id='Type_Metadata:hive://gold.test_schema1/test_table1/has_nested_type'
                            '/type/has_nested_type',
             to_vertex_id='Type_Metadata:hive://gold.test_schema1/test_table1/has_nested_type'
                          '/type/has_nested_type/_inner_',
-            label='TYPE_METADATA'
+            label='SUBTYPE'
         ),
         NEPTUNE_RELATIONSHIP_HEADER_FROM:
             'Type_Metadata:hive://gold.test_schema1/test_table1/has_nested_type/type/has_nested_type',
         NEPTUNE_RELATIONSHIP_HEADER_TO:
             'Type_Metadata:hive://gold.test_schema1/test_table1/has_nested_type/type/has_nested_type/_inner_',
-        NEPTUNE_HEADER_LABEL: 'TYPE_METADATA',
+        NEPTUNE_HEADER_LABEL: 'SUBTYPE',
         NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
         NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB
     },
@@ -786,19 +786,19 @@
             from_vertex_id='Type_Metadata:hive://gold.test_schema1/test_table1/has_nested_type'
                            '/type/has_nested_type/_inner_',
             to_vertex_id='Type_Metadata:hive://gold.test_schema1/test_table1/has_nested_type/type/has_nested_type',
-            label='TYPE_METADATA_OF'
+            label='SUBTYPE_OF'
         ),
         METADATA_KEY_PROPERTY_NAME_BULK_LOADER_FORMAT: "{label}:{from_vertex_id}_{to_vertex_id}".format(
             from_vertex_id='Type_Metadata:hive://gold.test_schema1/test_table1/has_nested_type'
                            '/type/has_nested_type/_inner_',
             to_vertex_id='Type_Metadata:hive://gold.test_schema1/test_table1/has_nested_type/type/has_nested_type',
-            label='TYPE_METADATA_OF'
+            label='SUBTYPE_OF'
         ),
         NEPTUNE_RELATIONSHIP_HEADER_FROM:
             'Type_Metadata:hive://gold.test_schema1/test_table1/has_nested_type/type/has_nested_type/_inner_',
         NEPTUNE_RELATIONSHIP_HEADER_TO:
             'Type_Metadata:hive://gold.test_schema1/test_table1/has_nested_type/type/has_nested_type',
-        NEPTUNE_HEADER_LABEL: 'TYPE_METADATA_OF',
+        NEPTUNE_HEADER_LABEL: 'SUBTYPE_OF',
         NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
         NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB
     }
@@ -810,20 +810,20 @@
                 '/type/has_nested_type/_inner_',
             to_vertex_id='Type_Metadata:hive://gold.test_schema1/test_table1/has_nested_type'
                          '/type/has_nested_type/_inner_/_inner_',
-            label='TYPE_METADATA'
+            label='SUBTYPE'
         ),
         METADATA_KEY_PROPERTY_NAME_BULK_LOADER_FORMAT: "{label}:{from_vertex_id}_{to_vertex_id}".format(
             from_vertex_id='Type_Metadata:hive://gold.test_schema1/test_table1/has_nested_type'
                            '/type/has_nested_type/_inner_',
             to_vertex_id='Type_Metadata:hive://gold.test_schema1/test_table1/has_nested_type'
                          '/type/has_nested_type/_inner_/_inner_',
-            label='TYPE_METADATA'
+            label='SUBTYPE'
         ),
         NEPTUNE_RELATIONSHIP_HEADER_FROM:
             'Type_Metadata:hive://gold.test_schema1/test_table1/has_nested_type/type/has_nested_type/_inner_',
         NEPTUNE_RELATIONSHIP_HEADER_TO: 'Type_Metadata:hive://gold.test_schema1/test_table1/has_nested_type'
                                         '/type/has_nested_type/_inner_/_inner_',
-        NEPTUNE_HEADER_LABEL: 'TYPE_METADATA',
+        NEPTUNE_HEADER_LABEL: 'SUBTYPE',
         NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
         NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB
     },
@@ -833,20 +833,20 @@
                 '/type/has_nested_type/_inner_/_inner_',
             to_vertex_id='Type_Metadata:hive://gold.test_schema1/test_table1/has_nested_type'
                          '/type/has_nested_type/_inner_',
-            label='TYPE_METADATA_OF'
+            label='SUBTYPE_OF'
         ),
         METADATA_KEY_PROPERTY_NAME_BULK_LOADER_FORMAT: "{label}:{from_vertex_id}_{to_vertex_id}".format(
             from_vertex_id='Type_Metadata:hive://gold.test_schema1/test_table1/has_nested_type'
                            '/type/has_nested_type/_inner_/_inner_',
             to_vertex_id='Type_Metadata:hive://gold.test_schema1/test_table1/has_nested_type'
                          '/type/has_nested_type/_inner_',
-            label='TYPE_METADATA_OF'
+            label='SUBTYPE_OF'
         ),
         NEPTUNE_RELATIONSHIP_HEADER_FROM: 'Type_Metadata:hive://gold.test_schema1/test_table1/has_nested_type'
                                           '/type/has_nested_type/_inner_/_inner_',
         NEPTUNE_RELATIONSHIP_HEADER_TO:
             'Type_Metadata:hive://gold.test_schema1/test_table1/has_nested_type/type/has_nested_type/_inner_',
-        NEPTUNE_HEADER_LABEL: 'TYPE_METADATA_OF',
+        NEPTUNE_HEADER_LABEL: 'SUBTYPE_OF',
         NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
         NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB
     }
8 changes: 4 additions & 4 deletions databuilder/tests/unit/models/test_table_metadata.py
@@ -175,12 +175,12 @@ def test_serialize(self) -> None:
              'START_LABEL': 'Column', 'TYPE': 'TYPE_METADATA', 'REVERSE_TYPE': 'TYPE_METADATA_OF'},
             {'END_KEY': 'hive://gold.test_schema1/test_table1/has_nested_type/type/has_nested_type/_inner_',
              'START_KEY': 'hive://gold.test_schema1/test_table1/has_nested_type/type/has_nested_type',
-             'END_LABEL': 'Type_Metadata', 'START_LABEL': 'Type_Metadata', 'TYPE': 'TYPE_METADATA',
-             'REVERSE_TYPE': 'TYPE_METADATA_OF'},
+             'END_LABEL': 'Type_Metadata', 'START_LABEL': 'Type_Metadata', 'TYPE': 'SUBTYPE',
+             'REVERSE_TYPE': 'SUBTYPE_OF'},
             {'END_KEY': 'hive://gold.test_schema1/test_table1/has_nested_type/type/has_nested_type/_inner_/_inner_',
              'START_KEY': 'hive://gold.test_schema1/test_table1/has_nested_type/type/has_nested_type/_inner_',
-             'END_LABEL': 'Type_Metadata', 'START_LABEL': 'Type_Metadata', 'TYPE': 'TYPE_METADATA',
-             'REVERSE_TYPE': 'TYPE_METADATA_OF'}
+             'END_LABEL': 'Type_Metadata', 'START_LABEL': 'Type_Metadata', 'TYPE': 'SUBTYPE',
+             'REVERSE_TYPE': 'SUBTYPE_OF'}
         ]

self.expected_rels = copy.deepcopy(self.expected_rels_deduped)