Skip to content

Commit

Permalink
MongoDB: Improve software tests. Naming things.
Browse files Browse the repository at this point in the history
- Add utility functions `get_types` and `trim_schema` for streamlined
  "type extractor" unit tests
- Do not use `t` as a variable name, because it is already used as an alias for `typing`
  • Loading branch information
amotl committed Aug 21, 2024
1 parent 27716f7 commit bf29e23
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 37 deletions.
30 changes: 15 additions & 15 deletions cratedb_toolkit/io/mongodb/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,12 +98,12 @@ def extract_schema_from_collection(collection: Collection, partial: bool, limit:
else:
count = collection.estimated_document_count()
with progressbar:
t = progressbar.add_task(collection.name, total=count)
task = progressbar.add_task(collection.name, total=count)
try:
for document in collection.find().limit(limit=limit):
schema["count"] += 1
schema["document"] = extract_schema_from_document(document, schema["document"])
progressbar.update(t, advance=1)
progressbar.update(task, advance=1)
if partial:
break
except KeyboardInterrupt:
Expand Down Expand Up @@ -148,20 +148,20 @@ def extract_schema_from_array(array: list, schema: dict):
"""

for item in array:
t = get_type(item)
if t not in schema:
if t == "OBJECT":
schema[t] = {"count": 0, "document": {}}
elif t == "ARRAY":
schema[t] = {"count": 0, "types": {}}
type_ = get_type(item)
if type_ not in schema:
if type_ == "OBJECT":
schema[type_] = {"count": 0, "document": {}}
elif type_ == "ARRAY":
schema[type_] = {"count": 0, "types": {}}
else:
schema[t] = {"count": 0}
schema[type_] = {"count": 0}

schema[t]["count"] += 1
if t == "OBJECT":
schema[t]["document"] = extract_schema_from_document(item, schema[t]["document"])
elif t == "ARRAY":
schema[t]["types"] = extract_schema_from_array(item, schema[t]["types"])
schema[type_]["count"] += 1
if type_ == "OBJECT":
schema[type_]["document"] = extract_schema_from_document(item, schema[type_]["document"])
elif type_ == "ARRAY":
schema[type_]["types"] = extract_schema_from_array(item, schema[type_]["types"])
return schema


Expand All @@ -176,7 +176,7 @@ def extract_schema_from_array(array: list, schema: dict):
bool: "BOOLEAN",
int: "INTEGER",
float: "FLOAT",
# collection types
# container types
list: "ARRAY",
dict: "OBJECT",
}
Expand Down
53 changes: 31 additions & 22 deletions tests/io/mongodb/test_extract.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# ruff: noqa: E402
import typing as t
import unittest
from collections import OrderedDict

import pytest

Expand All @@ -16,54 +18,49 @@

class TestExtractTypes(unittest.TestCase):
def test_primitive_types(self):
i = {"a": "a", "b": True, "c": 3, "d": 4.4}
data = {"a": "a", "b": True, "c": 3, "d": 4.4}
expected = {"a": "STRING", "b": "BOOLEAN", "c": "INTEGER", "d": "FLOAT"}
s = extract.extract_schema_from_document(i, {})
for key, value in expected.items():
types = list(s[key]["types"].keys())
self.assertListEqual([value], types)
schema = trim_schema(extract.extract_schema_from_document(data, {}))
self.assertDictEqual(schema, expected)

def test_bson_types(self):
i = {
data = {
"a": bson.ObjectId("55153a8014829a865bbf700d"),
"b": bson.datetime.datetime.now(),
"c": bson.Timestamp(0, 0),
}
expected = {"a": "OID", "b": "DATETIME", "c": "TIMESTAMP"}
s = extract.extract_schema_from_document(i, {})
for key, value in expected.items():
types = list(s[key]["types"].keys())
self.assertListEqual([value], types)
schema = trim_schema(extract.extract_schema_from_document(data, {}))
self.assertDictEqual(schema, expected)

def test_collection_types(self):
i = {"a": [1, 2, 3], "b": {"a": "hello world"}}
data = {"a": [1, 2, 3], "b": {"a": "hello world"}}
expected = {"a": "ARRAY", "b": "OBJECT"}
s = extract.extract_schema_from_document(i, {})
for key, value in expected.items():
types = list(s[key]["types"].keys())
self.assertListEqual([value], types)
schema = trim_schema(extract.extract_schema_from_document(data, {}))
self.assertDictEqual(schema, expected)

def test_list_subtypes(self):
i = {
data = {
"a": ["a", "b", 3],
"b": [[1, 2, 3]],
"c": [{"a": "a"}, {"a": "b"}],
}

subtypes = extract.extract_schema_from_array(i["a"], {})
subtypes = extract.extract_schema_from_array(data["a"], {})
self.assertListEqual(["STRING", "INTEGER"], list(subtypes.keys()))

subtypes = extract.extract_schema_from_array(i["b"], {})
subtypes = extract.extract_schema_from_array(data["b"], {})
self.assertListEqual(["ARRAY"], list(subtypes.keys()))
self.assertListEqual(["INTEGER"], list(subtypes["ARRAY"]["types"].keys()))

subtypes = extract.extract_schema_from_array(i["c"], {})
subtypes = extract.extract_schema_from_array(data["c"], {})
self.assertListEqual(["OBJECT"], list(subtypes.keys()))

def test_object_type(self):
i = {"a": {"b": "c"}}
s = extract.extract_schema_from_document(i, {})
self.assertListEqual(["OBJECT"], list(s["a"]["types"].keys()))
data = {"a": {"b": "c"}}
expected = {"a": "OBJECT"}
schema = trim_schema(extract.extract_schema_from_document(data, {}))
self.assertDictEqual(schema, expected)


class TestTypeCount(unittest.TestCase):
Expand All @@ -84,3 +81,15 @@ def test_multiple_of_different_type(self):
self.assertEqual(s["a"]["types"]["INTEGER"]["count"], 1)
self.assertEqual(s["a"]["types"]["STRING"]["count"], 1)
self.assertEqual(s["a"]["types"]["BOOLEAN"]["count"], 1)


def get_types(schema_item) -> t.List[str]:
    """Return the type names recorded for a single schema item.

    Looks up the ``"types"`` entry of ``schema_item`` and returns its keys
    as a list, in the entry's own iteration order.
    """
    type_names = schema_item["types"].keys()
    return list(type_names)


def trim_schema(schema) -> t.Dict[str, t.Any]:
    """Reduce a schema to a mapping of each key to its first type name.

    For every ``(key, value)`` pair in ``schema``, the type names of
    ``value`` are obtained through ``get_types`` and only the first one is
    kept. Keys appear in the result in the order ``schema`` yields them.

    Raises ``IndexError`` when an item has no types at all, because the
    first element of an empty list is requested.
    """
    return OrderedDict(
        (field, get_types(item)[0])
        for field, item in schema.items()
    )

0 comments on commit bf29e23

Please sign in to comment.