From bf29e2385779de120288926902c683101a811bda Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Wed, 21 Aug 2024 02:16:15 +0200 Subject: [PATCH] MongoDB: Improve software tests. Naming things. - Add utility functions `get_types` and `trim_schema` for streamlined "type extractor" unit tests - Do not use `t` as variable, because it's used as an alias for `typing` --- cratedb_toolkit/io/mongodb/extract.py | 30 +++++++-------- tests/io/mongodb/test_extract.py | 53 ++++++++++++++++----------- 2 files changed, 46 insertions(+), 37 deletions(-) diff --git a/cratedb_toolkit/io/mongodb/extract.py b/cratedb_toolkit/io/mongodb/extract.py index e11b99c8..a0495ca9 100644 --- a/cratedb_toolkit/io/mongodb/extract.py +++ b/cratedb_toolkit/io/mongodb/extract.py @@ -98,12 +98,12 @@ def extract_schema_from_collection(collection: Collection, partial: bool, limit: else: count = collection.estimated_document_count() with progressbar: - t = progressbar.add_task(collection.name, total=count) + task = progressbar.add_task(collection.name, total=count) try: for document in collection.find().limit(limit=limit): schema["count"] += 1 schema["document"] = extract_schema_from_document(document, schema["document"]) - progressbar.update(t, advance=1) + progressbar.update(task, advance=1) if partial: break except KeyboardInterrupt: @@ -148,20 +148,20 @@ def extract_schema_from_array(array: list, schema: dict): """ for item in array: - t = get_type(item) - if t not in schema: - if t == "OBJECT": - schema[t] = {"count": 0, "document": {}} - elif t == "ARRAY": - schema[t] = {"count": 0, "types": {}} + type_ = get_type(item) + if type_ not in schema: + if type_ == "OBJECT": + schema[type_] = {"count": 0, "document": {}} + elif type_ == "ARRAY": + schema[type_] = {"count": 0, "types": {}} else: - schema[t] = {"count": 0} + schema[type_] = {"count": 0} - schema[t]["count"] += 1 - if t == "OBJECT": - schema[t]["document"] = extract_schema_from_document(item, schema[t]["document"]) - elif t == "ARRAY": - schema[t]["types"] = extract_schema_from_array(item, schema[t]["types"]) + schema[type_]["count"] += 1 + if type_ == "OBJECT": + schema[type_]["document"] = extract_schema_from_document(item, schema[type_]["document"]) + elif type_ == "ARRAY": + schema[type_]["types"] = extract_schema_from_array(item, schema[type_]["types"]) return schema @@ -176,7 +176,7 @@ def extract_schema_from_array(array: list, schema: dict): bool: "BOOLEAN", int: "INTEGER", float: "FLOAT", - # collection types + # container types list: "ARRAY", dict: "OBJECT", } diff --git a/tests/io/mongodb/test_extract.py b/tests/io/mongodb/test_extract.py index 8d161a7b..a5f82f7a 100644 --- a/tests/io/mongodb/test_extract.py +++ b/tests/io/mongodb/test_extract.py @@ -1,5 +1,7 @@ # ruff: noqa: E402 +import typing as t import unittest +from collections import OrderedDict import pytest @@ -16,54 +18,49 @@ class TestExtractTypes(unittest.TestCase): def test_primitive_types(self): - i = {"a": "a", "b": True, "c": 3, "d": 4.4} + data = {"a": "a", "b": True, "c": 3, "d": 4.4} expected = {"a": "STRING", "b": "BOOLEAN", "c": "INTEGER", "d": "FLOAT"} - s = extract.extract_schema_from_document(i, {}) - for key, value in expected.items(): - types = list(s[key]["types"].keys()) - self.assertListEqual([value], types) + schema = trim_schema(extract.extract_schema_from_document(data, {})) + self.assertDictEqual(schema, expected) def test_bson_types(self): - i = { + data = { "a": bson.ObjectId("55153a8014829a865bbf700d"), "b": bson.datetime.datetime.now(), "c": bson.Timestamp(0, 0), } expected = {"a": "OID", "b": "DATETIME", "c": "TIMESTAMP"} - s = extract.extract_schema_from_document(i, {}) - for key, value in expected.items(): - types = list(s[key]["types"].keys()) - self.assertListEqual([value], types) + schema = trim_schema(extract.extract_schema_from_document(data, {})) + self.assertDictEqual(schema, expected) def test_collection_types(self): - i = {"a": [1, 2, 3], "b": {"a": "hello world"}} + data = {"a": [1, 2, 3], "b": {"a": "hello world"}} expected = {"a": "ARRAY", "b": "OBJECT"} - s = extract.extract_schema_from_document(i, {}) - for key, value in expected.items(): - types = list(s[key]["types"].keys()) - self.assertListEqual([value], types) + schema = trim_schema(extract.extract_schema_from_document(data, {})) + self.assertDictEqual(schema, expected) def test_list_subtypes(self): - i = { + data = { "a": ["a", "b", 3], "b": [[1, 2, 3]], "c": [{"a": "a"}, {"a": "b"}], } - subtypes = extract.extract_schema_from_array(i["a"], {}) + subtypes = extract.extract_schema_from_array(data["a"], {}) self.assertListEqual(["STRING", "INTEGER"], list(subtypes.keys())) - subtypes = extract.extract_schema_from_array(i["b"], {}) + subtypes = extract.extract_schema_from_array(data["b"], {}) self.assertListEqual(["ARRAY"], list(subtypes.keys())) self.assertListEqual(["INTEGER"], list(subtypes["ARRAY"]["types"].keys())) - subtypes = extract.extract_schema_from_array(i["c"], {}) + subtypes = extract.extract_schema_from_array(data["c"], {}) self.assertListEqual(["OBJECT"], list(subtypes.keys())) def test_object_type(self): - i = {"a": {"b": "c"}} - s = extract.extract_schema_from_document(i, {}) - self.assertListEqual(["OBJECT"], list(s["a"]["types"].keys())) + data = {"a": {"b": "c"}} + expected = {"a": "OBJECT"} + schema = trim_schema(extract.extract_schema_from_document(data, {})) + self.assertDictEqual(schema, expected) class TestTypeCount(unittest.TestCase): @@ -84,3 +81,15 @@ def test_multiple_of_different_type(self): self.assertEqual(s["a"]["types"]["INTEGER"]["count"], 1) self.assertEqual(s["a"]["types"]["STRING"]["count"], 1) self.assertEqual(s["a"]["types"]["BOOLEAN"]["count"], 1) + + +def get_types(schema_item) -> t.List[str]: + return list(schema_item["types"].keys()) + + +def trim_schema(schema) -> t.Dict[str, t.Any]: + result = OrderedDict() + for key, value in schema.items(): + types = get_types(value) + result[key] = types[0] + return result