From bf29e2385779de120288926902c683101a811bda Mon Sep 17 00:00:00 2001
From: Andreas Motl <andreas.motl@crate.io>
Date: Wed, 21 Aug 2024 02:16:15 +0200
Subject: [PATCH] MongoDB: Improve software tests. Naming things.

- Add utility functions `get_types` and `trim_schema` for streamlined
  "type extractor" unit tests
- Do not use `t` as a variable name, because it is used as an alias for `typing`
---
 cratedb_toolkit/io/mongodb/extract.py | 30 +++++++--------
 tests/io/mongodb/test_extract.py      | 53 ++++++++++++++++-----------
 2 files changed, 46 insertions(+), 37 deletions(-)

diff --git a/cratedb_toolkit/io/mongodb/extract.py b/cratedb_toolkit/io/mongodb/extract.py
index e11b99c8..a0495ca9 100644
--- a/cratedb_toolkit/io/mongodb/extract.py
+++ b/cratedb_toolkit/io/mongodb/extract.py
@@ -98,12 +98,12 @@ def extract_schema_from_collection(collection: Collection, partial: bool, limit:
     else:
         count = collection.estimated_document_count()
     with progressbar:
-        t = progressbar.add_task(collection.name, total=count)
+        task = progressbar.add_task(collection.name, total=count)
         try:
             for document in collection.find().limit(limit=limit):
                 schema["count"] += 1
                 schema["document"] = extract_schema_from_document(document, schema["document"])
-                progressbar.update(t, advance=1)
+                progressbar.update(task, advance=1)
                 if partial:
                     break
         except KeyboardInterrupt:
@@ -148,20 +148,20 @@ def extract_schema_from_array(array: list, schema: dict):
     """
 
     for item in array:
-        t = get_type(item)
-        if t not in schema:
-            if t == "OBJECT":
-                schema[t] = {"count": 0, "document": {}}
-            elif t == "ARRAY":
-                schema[t] = {"count": 0, "types": {}}
+        type_ = get_type(item)
+        if type_ not in schema:
+            if type_ == "OBJECT":
+                schema[type_] = {"count": 0, "document": {}}
+            elif type_ == "ARRAY":
+                schema[type_] = {"count": 0, "types": {}}
             else:
-                schema[t] = {"count": 0}
+                schema[type_] = {"count": 0}
 
-        schema[t]["count"] += 1
-        if t == "OBJECT":
-            schema[t]["document"] = extract_schema_from_document(item, schema[t]["document"])
-        elif t == "ARRAY":
-            schema[t]["types"] = extract_schema_from_array(item, schema[t]["types"])
+        schema[type_]["count"] += 1
+        if type_ == "OBJECT":
+            schema[type_]["document"] = extract_schema_from_document(item, schema[type_]["document"])
+        elif type_ == "ARRAY":
+            schema[type_]["types"] = extract_schema_from_array(item, schema[type_]["types"])
     return schema
 
 
@@ -176,7 +176,7 @@ def extract_schema_from_array(array: list, schema: dict):
     bool: "BOOLEAN",
     int: "INTEGER",
     float: "FLOAT",
-    # collection types
+    # container types
     list: "ARRAY",
     dict: "OBJECT",
 }
diff --git a/tests/io/mongodb/test_extract.py b/tests/io/mongodb/test_extract.py
index 8d161a7b..a5f82f7a 100644
--- a/tests/io/mongodb/test_extract.py
+++ b/tests/io/mongodb/test_extract.py
@@ -1,5 +1,7 @@
 # ruff: noqa: E402
+import typing as t
 import unittest
+from collections import OrderedDict
 
 import pytest
 
@@ -16,54 +18,49 @@
 
 class TestExtractTypes(unittest.TestCase):
     def test_primitive_types(self):
-        i = {"a": "a", "b": True, "c": 3, "d": 4.4}
+        data = {"a": "a", "b": True, "c": 3, "d": 4.4}
         expected = {"a": "STRING", "b": "BOOLEAN", "c": "INTEGER", "d": "FLOAT"}
-        s = extract.extract_schema_from_document(i, {})
-        for key, value in expected.items():
-            types = list(s[key]["types"].keys())
-            self.assertListEqual([value], types)
+        schema = trim_schema(extract.extract_schema_from_document(data, {}))
+        self.assertDictEqual(schema, expected)
 
     def test_bson_types(self):
-        i = {
+        data = {
             "a": bson.ObjectId("55153a8014829a865bbf700d"),
             "b": bson.datetime.datetime.now(),
             "c": bson.Timestamp(0, 0),
         }
         expected = {"a": "OID", "b": "DATETIME", "c": "TIMESTAMP"}
-        s = extract.extract_schema_from_document(i, {})
-        for key, value in expected.items():
-            types = list(s[key]["types"].keys())
-            self.assertListEqual([value], types)
+        schema = trim_schema(extract.extract_schema_from_document(data, {}))
+        self.assertDictEqual(schema, expected)
 
     def test_collection_types(self):
-        i = {"a": [1, 2, 3], "b": {"a": "hello world"}}
+        data = {"a": [1, 2, 3], "b": {"a": "hello world"}}
         expected = {"a": "ARRAY", "b": "OBJECT"}
-        s = extract.extract_schema_from_document(i, {})
-        for key, value in expected.items():
-            types = list(s[key]["types"].keys())
-            self.assertListEqual([value], types)
+        schema = trim_schema(extract.extract_schema_from_document(data, {}))
+        self.assertDictEqual(schema, expected)
 
     def test_list_subtypes(self):
-        i = {
+        data = {
             "a": ["a", "b", 3],
             "b": [[1, 2, 3]],
             "c": [{"a": "a"}, {"a": "b"}],
         }
 
-        subtypes = extract.extract_schema_from_array(i["a"], {})
+        subtypes = extract.extract_schema_from_array(data["a"], {})
         self.assertListEqual(["STRING", "INTEGER"], list(subtypes.keys()))
 
-        subtypes = extract.extract_schema_from_array(i["b"], {})
+        subtypes = extract.extract_schema_from_array(data["b"], {})
         self.assertListEqual(["ARRAY"], list(subtypes.keys()))
         self.assertListEqual(["INTEGER"], list(subtypes["ARRAY"]["types"].keys()))
 
-        subtypes = extract.extract_schema_from_array(i["c"], {})
+        subtypes = extract.extract_schema_from_array(data["c"], {})
         self.assertListEqual(["OBJECT"], list(subtypes.keys()))
 
     def test_object_type(self):
-        i = {"a": {"b": "c"}}
-        s = extract.extract_schema_from_document(i, {})
-        self.assertListEqual(["OBJECT"], list(s["a"]["types"].keys()))
+        data = {"a": {"b": "c"}}
+        expected = {"a": "OBJECT"}
+        schema = trim_schema(extract.extract_schema_from_document(data, {}))
+        self.assertDictEqual(schema, expected)
 
 
 class TestTypeCount(unittest.TestCase):
@@ -84,3 +81,15 @@ def test_multiple_of_different_type(self):
         self.assertEqual(s["a"]["types"]["INTEGER"]["count"], 1)
         self.assertEqual(s["a"]["types"]["STRING"]["count"], 1)
         self.assertEqual(s["a"]["types"]["BOOLEAN"]["count"], 1)
+
+
+def get_types(schema_item) -> t.List[str]:  # Type names recorded for one schema entry.
+    return list(schema_item["types"].keys())  # Keys of the entry's "types" mapping, in dict order.
+
+
+def trim_schema(schema) -> t.Dict[str, t.Any]:  # Collapse each schema entry to a single type name.
+    result = OrderedDict()
+    for key, value in schema.items():
+        types = get_types(value)
+        result[key] = types[0]  # Keeps only the first type; raises IndexError if an entry has none.
+    return result