Skip to content

Commit

Permalink
chore: update tests for new taxonomy parser
Browse files Browse the repository at this point in the history
  • Loading branch information
eric-nguyen-cs committed Dec 21, 2023
1 parent 6674876 commit cd1e288
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 58 deletions.
1 change: 1 addition & 0 deletions parser/openfoodfacts_taxonomy_parser/parser/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from .parser import Parser
from .taxonomy_parser import TaxonomyParser
2 changes: 1 addition & 1 deletion parser/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def neo4j():
"""waiting for neo4j to be ready"""
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
driver = GraphDatabase.driver(uri)
session = driver.session()
session = driver.session(database="neo4j")
connected = False
while not connected:
try:
Expand Down
18 changes: 9 additions & 9 deletions parser/tests/integration/test_parse_unparse_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,21 +11,21 @@
@pytest.fixture(autouse=True)
def test_setup(neo4j):
# delete all the nodes, relations and search indexes in the database
query = "MATCH (n:p_test_branch:t_test:b_branch) DETACH DELETE n"
query = "MATCH (n:p_test_branch) DETACH DELETE n"
neo4j.session().run(query)
query = "DROP INDEX p_test_branch_SearchIds IF EXISTS"
neo4j.session().run(query)
query = "DROP INDEX p_test_branch_SearchTags IF EXISTS"
neo4j.session().run(query)

query1 = "MATCH (n:p_test_branch1:t_test:b_branch1) DETACH DELETE n"
query1 = "MATCH (n:p_test_branch1) DETACH DELETE n"
neo4j.session().run(query1)
query1 = "DROP INDEX p_test_branch1_SearchIds IF EXISTS"
neo4j.session().run(query1)
query1 = "DROP INDEX p_test_branch1_SearchTags IF EXISTS"
neo4j.session().run(query1)

query2 = "MATCH (n:p_test_branch2:t_test:b_branch2) DETACH DELETE n"
query2 = "MATCH (n:p_test_branch2) DETACH DELETE n"
neo4j.session().run(query2)
query2 = "DROP INDEX p_test_branch2_SearchIds IF EXISTS"
neo4j.session().run(query2)
Expand All @@ -41,14 +41,14 @@ def test_round_trip(neo4j):
# parse taxonomy
test_parser(TEST_TAXONOMY_TXT, "branch", "test")
# just quick check it runs ok with total number of nodes
query = "MATCH (n:p_test_branch:t_test:b_branch) RETURN COUNT(*)"
query = "MATCH (n:p_test_branch) RETURN COUNT(*)"
result = session.run(query)
number_of_nodes = result.value()[0]
assert number_of_nodes == 14

# dump taxonomy back
test_dumper = unparser.WriteTaxonomy(session)
lines = list(test_dumper.iter_lines("p_test_branch:t_test:b_branch"))
lines = list(test_dumper.iter_lines("p_test_branch"))

original_lines = [line.rstrip("\n") for line in open(TEST_TAXONOMY_TXT)]
# expected result is close to original file with a few tweaks
Expand Down Expand Up @@ -82,20 +82,20 @@ def test_two_branch_round_trip(neo4j):
test_parser(TEST_TAXONOMY_TXT, "branch2", "test")

# just quick check it runs ok with total number of nodes
query = "MATCH (n:p_test_branch1:t_test:b_branch1) RETURN COUNT(*)"
query = "MATCH (n:p_test_branch1) RETURN COUNT(*)"
result = session.run(query)
number_of_nodes = result.value()[0]
assert number_of_nodes == 14

query = "MATCH (n:p_test_branch2:t_test:b_branch2) RETURN COUNT(*)"
query = "MATCH (n:p_test_branch2) RETURN COUNT(*)"
result = session.run(query)
number_of_nodes = result.value()[0]
assert number_of_nodes == 14

# dump taxonomy back
test_dumper = unparser.WriteTaxonomy(session)
lines_branch1 = list(test_dumper.iter_lines("p_test_branch1:t_test:b_branch1"))
lines_branch2 = list(test_dumper.iter_lines("p_test_branch2:t_test:b_branch2"))
lines_branch1 = list(test_dumper.iter_lines("p_test_branch1"))
lines_branch2 = list(test_dumper.iter_lines("p_test_branch2"))

original_lines = [line.rstrip("\n") for line in open(TEST_TAXONOMY_TXT)]
# expected result is close to original file with a few tweaks
Expand Down
32 changes: 14 additions & 18 deletions parser/tests/integration/test_parser_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
@pytest.fixture(autouse=True)
def test_setup(neo4j):
# delete all the nodes and relations in the database
query = "MATCH (n:p_test_branch:t_test:b_branch) DETACH DELETE n"
query = "MATCH (n:p_test_branch) DETACH DELETE n"
neo4j.session().run(query)
query = "DROP INDEX p_test_branch_id_index IF EXISTS"
neo4j.session().run(query)
query = "DROP INDEX p_test_branch_SearchIds IF EXISTS"
neo4j.session().run(query)
Expand All @@ -24,24 +26,22 @@ def test_setup(neo4j):
def test_calling(neo4j):
with neo4j.session() as session:
test_parser = parser.Parser(session)
test_parser(TEST_TAXONOMY_TXT, "branch", "test")

# Create node test
test_parser.create_nodes(TEST_TAXONOMY_TXT, "p_test_branch:t_test:b_branch")

# total number of nodes
query = "MATCH (n:p_test_branch:t_test:b_branch) RETURN COUNT(*)"
# total number of nodes (TEXT, ENTRY, SYNONYMS, STOPWORDS) + 1 ERROR node
query = "MATCH (n:p_test_branch) RETURN COUNT(*)"
result = session.run(query)
number_of_nodes = result.value()[0]
assert number_of_nodes == 13
assert number_of_nodes == 14

# header correctly added
query = "MATCH (n:p_test_branch:t_test:b_branch) WHERE n.id = '__header__' RETURN n.preceding_lines"
query = "MATCH (n:p_test_branch) WHERE n.id = '__header__' RETURN n.preceding_lines"
result = session.run(query)
header = result.value()[0]
assert header == ["# test taxonomy"]

# synonyms correctly added
query = "MATCH (n:p_test_branch:t_test:b_branch:SYNONYMS) RETURN n ORDER BY n.src_position"
query = "MATCH (n:p_test_branch:SYNONYMS) RETURN n ORDER BY n.src_position"
results = session.run(query)
expected_synonyms = [
{
Expand All @@ -65,7 +65,7 @@ def test_calling(neo4j):
assert node[key] == expected_synonyms[i][key]

# stopwords correctly added
query = "MATCH (n:p_test_branch:t_test:b_branch:STOPWORDS) RETURN n"
query = "MATCH (n:p_test_branch:STOPWORDS) RETURN n"
results = session.run(query)
expected_stopwords = {
"id": "stopwords:0",
Expand All @@ -80,7 +80,7 @@ def test_calling(neo4j):
# entries correctly added
# check for two of them
query = """
MATCH (n:p_test_branch:t_test:b_branch:ENTRY)
MATCH (n:p_test_branch:ENTRY)
WHERE n.id='en:banana-yogurts'
OR n.id='en:meat'
RETURN n
Expand Down Expand Up @@ -108,10 +108,8 @@ def test_calling(neo4j):
for key in expected_entries[i]:
assert node[key] == expected_entries[i][key]

# Child link test
test_parser.create_child_link("p_test_branch:t_test:b_branch") # nodes already added
query = """
MATCH (c:p_test_branch:t_test:b_branch)-[:is_child_of]->(p:p_test_branch:t_test:b_branch)
MATCH (c:p_test_branch)-[:is_child_of]->(p:p_test_branch)
RETURN c.id, p.id
"""
results = session.run(query)
Expand All @@ -133,10 +131,8 @@ def test_calling(neo4j):
for pair in created_pairs:
assert pair in expected_pairs

# Order link test
test_parser.create_previous_link("p_test_branch:t_test:b_branch")
query = """
MATCH (n:p_test_branch:t_test:b_branch)-[:is_before]->(p:p_test_branch:t_test:b_branch)
MATCH (n:p_test_branch)-[:is_before]->(p:p_test_branch)
RETURN n.id, p.id
"""
results = session.run(query)
Expand Down Expand Up @@ -193,7 +189,7 @@ def test_error_log(neo4j, tmp_path, caplog):
test_parser(str(taxonomy_path), "branch", "test")

# only the 2 nodes imported, not the duplicate
query = "MATCH (n:p_test_branch:t_test:b_branch:ENTRY) RETURN COUNT(*)"
query = "MATCH (n:p_test_branch:ENTRY) RETURN COUNT(*)"
result = session.run(query)
number_of_nodes = result.value()[0]
assert number_of_nodes == 2
Expand Down
62 changes: 32 additions & 30 deletions parser/tests/unit/test_parser_unit.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,43 @@
import pathlib

import pytest

from openfoodfacts_taxonomy_parser import normalizer, parser

# taxonomy in text format : test.txt
TEST_TAXONOMY_TXT = str(pathlib.Path(__file__).parent.parent / "data" / "test.txt")


def test_normalized_filename(neo4j):
with neo4j.session() as session:
x = parser.Parser(session)
normalizer = x.normalized_filename
name = normalizer("test")
assert name == "test.txt"
name = normalizer("test.txt")
assert name == "test.txt"
name = normalizer("t")
assert name == "t.txt"
@pytest.mark.parametrize(
    "raw_name, expected",
    [
        # a bare taxonomy name gets the .txt suffix appended
        ("test", "test.txt"),
        # an already-suffixed name is returned unchanged
        ("test.txt", "test.txt"),
        # single-character names follow the same rule
        ("t", "t.txt"),
    ],
)
def test_normalized_filename(raw_name: str, expected: str):
    """_normalized_filename resolves a taxonomy name to its .txt filename."""
    result = parser.TaxonomyParser()._normalized_filename(raw_name)
    assert result == expected


def test_fileiter(neo4j):
with neo4j.session() as session:
x = parser.Parser(session)
file = x.file_iter(TEST_TAXONOMY_TXT)

for counter, (_, line) in enumerate(file):
assert line == "" or line[0] == "#" or ":" in line
if counter == 26:
assert line == "carbon_footprint_fr_foodges_value:fr:10"
assert counter == 37


def test_normalizing():
text = "Numéro #1, n°1 des ¾ des Français*"
text = normalizer.normalizing(text, "fr")
assert text == "numero-1-n-1-des-des-francais"
text = "Randôm Languäge wìth àccénts"
normal_text = normalizer.normalizing(text, "fr")
assert normal_text == "random-language-with-accents"
normal_text = normalizer.normalizing(text, "de")
assert normal_text == "randôm-languäge-wìth-àccénts"
taxonomy_parser = parser.TaxonomyParser()
file_iterator = taxonomy_parser._file_iter(TEST_TAXONOMY_TXT)
for counter, (_, line) in enumerate(file_iterator):
assert line == "" or line[0] == "#" or ":" in line
if counter == 26:
assert line == "carbon_footprint_fr_foodges_value:fr:10"
assert counter == 37


@pytest.mark.parametrize(
    "raw_text, expected_id, language",
    [
        # French: accents, symbols and special glyphs are stripped
        ("Numéro #1, n°1 des ¾ des Français*", "numero-1-n-1-des-des-francais", "fr"),
        ("Randôm Languäge wìth àccénts", "random-language-with-accents", "fr"),
        # German: accented characters are preserved, only lowercased/hyphenated
        ("Randôm Languäge wìth àccénts", "randôm-languäge-wìth-àccénts", "de"),
    ],
)
def test_normalizing(raw_text: str, expected_id: str, language: str):
    """normalizing() turns display text into a language-aware slug id."""
    slug = normalizer.normalizing(raw_text, language)
    assert slug == expected_id

0 comments on commit cd1e288

Please sign in to comment.