diff --git a/CHANGES.md b/CHANGES.md index 4b1aec1..2e0ab34 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -6,6 +6,8 @@ - MongoDB: Add support for UUID types - MongoDB: Improve reading timestamps in previous BSON formats - MongoDB: Fix processing empty arrays/lists. By default, assume `TEXT` as inner type. +- MongoDB: For `ctk load table`, use "partial" scan for inferring the collection schema, + based on the first 10,000 documents. ## 2024/09/02 v0.0.21 - DynamoDB: Add special decoding for varied lists. diff --git a/cratedb_toolkit/io/mongodb/api.py b/cratedb_toolkit/io/mongodb/api.py index c4b2d20..7ae679b 100644 --- a/cratedb_toolkit/io/mongodb/api.py +++ b/cratedb_toolkit/io/mongodb/api.py @@ -40,7 +40,7 @@ def mongodb_copy(source_url, target_url, transformation: Path = None, limit: int url=str(mongodb_uri), database=mongodb_database, collection=mongodb_collection, - scan="full", + scan="partial", transformation=transformation, limit=limit, ) diff --git a/cratedb_toolkit/io/mongodb/extract.py b/cratedb_toolkit/io/mongodb/extract.py index 9872a50..f7f735e 100644 --- a/cratedb_toolkit/io/mongodb/extract.py +++ b/cratedb_toolkit/io/mongodb/extract.py @@ -85,6 +85,10 @@ ) +# TODO: Make configurable. +PARTIAL_SCAN_COUNT = 10_000 + + def extract_schema_from_collection(collection: Collection, partial: bool, limit: int = 0) -> t.Dict[str, t.Any]: """ Extract a schema definition from a collection. @@ -95,7 +99,7 @@ def extract_schema_from_collection(collection: Collection, partial: bool, limit: schema: dict = {"count": 0, "document": {}} if partial: - count = 1 + count = PARTIAL_SCAN_COUNT else: count = collection.estimated_document_count() with progressbar: @@ -105,7 +109,7 @@ def extract_schema_from_collection(collection: Collection, partial: bool, limit: schema["count"] += 1 schema["document"] = extract_schema_from_document(document, schema["document"]) progressbar.update(task, advance=1) - if partial: + if partial and schema["count"] >= PARTIAL_SCAN_COUNT: break except KeyboardInterrupt: return schema