diff --git a/CHANGES.md b/CHANGES.md index e947712..2cad8c2 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -3,6 +3,7 @@ ## Unreleased - MongoDB: Rename columns with leading underscores to use double leading underscores +- MongoDB: Add support for UUID types ## 2024/09/02 v0.0.21 - DynamoDB: Add special decoding for varied lists. diff --git a/cratedb_toolkit/io/mongodb/export.py b/cratedb_toolkit/io/mongodb/export.py index 87cafc1..bc301fc 100644 --- a/cratedb_toolkit/io/mongodb/export.py +++ b/cratedb_toolkit/io/mongodb/export.py @@ -24,8 +24,10 @@ Export the documents from a MongoDB collection as JSON, to be ingested into CrateDB. """ +import base64 import calendar import typing as t +from uuid import UUID import bsonjs import dateutil.parser as dateparser @@ -65,6 +67,9 @@ def extract_value(value, parent_type=None): """ if isinstance(value, dict): if len(value) == 1: + if "$binary" in value and value["$binary"]["subType"] in ["03", "04"]: + decoded = UUID(bytes=base64.b64decode(value["$binary"]["base64"])) + return extract_value(decoded, parent_type) for k, v in value.items(): if k.startswith("$"): return extract_value(v, k.lstrip("$")) diff --git a/cratedb_toolkit/io/mongodb/extract.py b/cratedb_toolkit/io/mongodb/extract.py index 74e58e8..9872a50 100644 --- a/cratedb_toolkit/io/mongodb/extract.py +++ b/cratedb_toolkit/io/mongodb/extract.py @@ -67,6 +67,7 @@ import typing as t import bson +from bson import OLD_UUID_SUBTYPE, UUID_SUBTYPE from pymongo.collection import Collection from rich import progress from rich.console import Console @@ -197,4 +198,7 @@ def get_type(value): return "INTEGER" else: return "INT64" + if type_ is bson.binary.Binary: + if value.subtype in [OLD_UUID_SUBTYPE, UUID_SUBTYPE]: + return "UUID" return TYPES_MAP.get(type_, "UNKNOWN") diff --git a/cratedb_toolkit/io/mongodb/translate.py b/cratedb_toolkit/io/mongodb/translate.py index 6d7dcf4..ce1c42d 100644 --- a/cratedb_toolkit/io/mongodb/translate.py +++ b/cratedb_toolkit/io/mongodb/translate.py @@ -37,6 +37,7 @@ TYPES = { "OID": "TEXT", + "UUID": "TEXT", "DATETIME": "TIMESTAMP WITH TIME ZONE", "TIMESTAMP": "TIMESTAMP WITHOUT TIME ZONE", "INT64": "BIGINT",