diff --git a/parser/openfoodfacts_taxonomy_parser/parser/__init__.py b/parser/openfoodfacts_taxonomy_parser/parser/__init__.py
index 2a3855a1..9da82bc2 100644
--- a/parser/openfoodfacts_taxonomy_parser/parser/__init__.py
+++ b/parser/openfoodfacts_taxonomy_parser/parser/__init__.py
@@ -1 +1,2 @@
 from .parser import Parser
+from .taxonomy_parser import TaxonomyParser
diff --git a/parser/tests/conftest.py b/parser/tests/conftest.py
index cf53d30b..8e8a87d5 100644
--- a/parser/tests/conftest.py
+++ b/parser/tests/conftest.py
@@ -11,7 +11,7 @@ def neo4j():
     """waiting for neo4j to be ready"""
     uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
     driver = GraphDatabase.driver(uri)
-    session = driver.session()
+    session = driver.session(database="neo4j")
     connected = False
     while not connected:
         try:
diff --git a/parser/tests/integration/test_parse_unparse_integration.py b/parser/tests/integration/test_parse_unparse_integration.py
index 3a12a956..6bf91bb8 100644
--- a/parser/tests/integration/test_parse_unparse_integration.py
+++ b/parser/tests/integration/test_parse_unparse_integration.py
@@ -11,21 +11,21 @@
 @pytest.fixture(autouse=True)
 def test_setup(neo4j):
     # delete all the nodes, relations and search indexes in the database
-    query = "MATCH (n:p_test_branch:t_test:b_branch) DETACH DELETE n"
+    query = "MATCH (n:p_test_branch) DETACH DELETE n"
     neo4j.session().run(query)
     query = "DROP INDEX p_test_branch_SearchIds IF EXISTS"
     neo4j.session().run(query)
     query = "DROP INDEX p_test_branch_SearchTags IF EXISTS"
     neo4j.session().run(query)

-    query1 = "MATCH (n:p_test_branch1:t_test:b_branch1) DETACH DELETE n"
+    query1 = "MATCH (n:p_test_branch1) DETACH DELETE n"
     neo4j.session().run(query1)
     query1 = "DROP INDEX p_test_branch1_SearchIds IF EXISTS"
     neo4j.session().run(query1)
     query1 = "DROP INDEX p_test_branch1_SearchTags IF EXISTS"
     neo4j.session().run(query1)

-    query2 = "MATCH (n:p_test_branch2:t_test:b_branch2) DETACH DELETE n"
+    query2 = "MATCH (n:p_test_branch2) DETACH DELETE n"
     neo4j.session().run(query2)
     query2 = "DROP INDEX p_test_branch2_SearchIds IF EXISTS"
     neo4j.session().run(query2)
@@ -41,14 +41,14 @@ def test_round_trip(neo4j):
         # parse taxonomy
         test_parser(TEST_TAXONOMY_TXT, "branch", "test")
         # just quick check it runs ok with total number of nodes
-        query = "MATCH (n:p_test_branch:t_test:b_branch) RETURN COUNT(*)"
+        query = "MATCH (n:p_test_branch) RETURN COUNT(*)"
         result = session.run(query)
         number_of_nodes = result.value()[0]
         assert number_of_nodes == 14

         # dump taxonomy back
         test_dumper = unparser.WriteTaxonomy(session)
-        lines = list(test_dumper.iter_lines("p_test_branch:t_test:b_branch"))
+        lines = list(test_dumper.iter_lines("p_test_branch"))

         original_lines = [line.rstrip("\n") for line in open(TEST_TAXONOMY_TXT)]
         # expected result is close to original file with a few tweaks
@@ -82,20 +82,20 @@ def test_two_branch_round_trip(neo4j):
         test_parser(TEST_TAXONOMY_TXT, "branch2", "test")

         # just quick check it runs ok with total number of nodes
-        query = "MATCH (n:p_test_branch1:t_test:b_branch1) RETURN COUNT(*)"
+        query = "MATCH (n:p_test_branch1) RETURN COUNT(*)"
         result = session.run(query)
         number_of_nodes = result.value()[0]
         assert number_of_nodes == 14

-        query = "MATCH (n:p_test_branch2:t_test:b_branch2) RETURN COUNT(*)"
+        query = "MATCH (n:p_test_branch2) RETURN COUNT(*)"
         result = session.run(query)
         number_of_nodes = result.value()[0]
         assert number_of_nodes == 14

         # dump taxonomy back
         test_dumper = unparser.WriteTaxonomy(session)
-        lines_branch1 = list(test_dumper.iter_lines("p_test_branch1:t_test:b_branch1"))
-        lines_branch2 = list(test_dumper.iter_lines("p_test_branch2:t_test:b_branch2"))
+        lines_branch1 = list(test_dumper.iter_lines("p_test_branch1"))
+        lines_branch2 = list(test_dumper.iter_lines("p_test_branch2"))

         original_lines = [line.rstrip("\n") for line in open(TEST_TAXONOMY_TXT)]
         # expected result is close to original file with a few tweaks
diff --git a/parser/tests/integration/test_parser_integration.py b/parser/tests/integration/test_parser_integration.py
index 1cbcb1a7..5dd4e090 100644
--- a/parser/tests/integration/test_parser_integration.py
+++ b/parser/tests/integration/test_parser_integration.py
@@ -13,7 +13,9 @@
 @pytest.fixture(autouse=True)
 def test_setup(neo4j):
     # delete all the nodes and relations in the database
-    query = "MATCH (n:p_test_branch:t_test:b_branch) DETACH DELETE n"
+    query = "MATCH (n:p_test_branch) DETACH DELETE n"
+    neo4j.session().run(query)
+    query = "DROP INDEX p_test_branch_id_index IF EXISTS"
     neo4j.session().run(query)
     query = "DROP INDEX p_test_branch_SearchIds IF EXISTS"
     neo4j.session().run(query)
@@ -24,24 +26,22 @@
 def test_calling(neo4j):
     with neo4j.session() as session:
         test_parser = parser.Parser(session)
+        test_parser(TEST_TAXONOMY_TXT, "branch", "test")

-        # Create node test
-        test_parser.create_nodes(TEST_TAXONOMY_TXT, "p_test_branch:t_test:b_branch")
-
-        # total number of nodes
-        query = "MATCH (n:p_test_branch:t_test:b_branch) RETURN COUNT(*)"
+        # total number of nodes (TEXT, ENTRY, SYNONYMS, STOPWORDS) + 1 ERROR node
+        query = "MATCH (n:p_test_branch) RETURN COUNT(*)"
         result = session.run(query)
         number_of_nodes = result.value()[0]
-        assert number_of_nodes == 13
+        assert number_of_nodes == 14

         # header correctly added
-        query = "MATCH (n:p_test_branch:t_test:b_branch) WHERE n.id = '__header__' RETURN n.preceding_lines"
+        query = "MATCH (n:p_test_branch) WHERE n.id = '__header__' RETURN n.preceding_lines"
         result = session.run(query)
         header = result.value()[0]
         assert header == ["# test taxonomy"]

         # synonyms correctly added
-        query = "MATCH (n:p_test_branch:t_test:b_branch:SYNONYMS) RETURN n ORDER BY n.src_position"
+        query = "MATCH (n:p_test_branch:SYNONYMS) RETURN n ORDER BY n.src_position"
         results = session.run(query)
         expected_synonyms = [
             {
@@ -65,7 +65,7 @@
             assert node[key] == expected_synonyms[i][key]

         # stopwords correctly added
-        query = "MATCH (n:p_test_branch:t_test:b_branch:STOPWORDS) RETURN n"
+        query = "MATCH (n:p_test_branch:STOPWORDS) RETURN n"
         results = session.run(query)
         expected_stopwords = {
             "id": "stopwords:0",
@@ -80,7 +80,7 @@
         # entries correctly added
         # check for two of them
         query = """
-            MATCH (n:p_test_branch:t_test:b_branch:ENTRY)
+            MATCH (n:p_test_branch:ENTRY)
             WHERE n.id='en:banana-yogurts'
             OR n.id='en:meat'
             RETURN n
@@ -108,10 +108,8 @@
             for key in expected_entries[i]:
                 assert node[key] == expected_entries[i][key]

-        # Child link test
-        test_parser.create_child_link("p_test_branch:t_test:b_branch")  # nodes already added
         query = """
-            MATCH (c:p_test_branch:t_test:b_branch)-[:is_child_of]->(p:p_test_branch:t_test:b_branch)
+            MATCH (c:p_test_branch)-[:is_child_of]->(p:p_test_branch)
             RETURN c.id, p.id
         """
         results = session.run(query)
@@ -133,10 +131,8 @@
         for pair in created_pairs:
             assert pair in expected_pairs

-        # Order link test
-        test_parser.create_previous_link("p_test_branch:t_test:b_branch")
         query = """
-            MATCH (n:p_test_branch:t_test:b_branch)-[:is_before]->(p:p_test_branch:t_test:b_branch)
+            MATCH (n:p_test_branch)-[:is_before]->(p:p_test_branch)
             RETURN n.id, p.id
         """
         results = session.run(query)
@@ -193,7 +189,7 @@ def test_error_log(neo4j, tmp_path, caplog):
         test_parser(str(taxonomy_path), "branch", "test")

         # only the 2 nodes imported, not the duplicate
-        query = "MATCH (n:p_test_branch:t_test:b_branch:ENTRY) RETURN COUNT(*)"
+        query = "MATCH (n:p_test_branch:ENTRY) RETURN COUNT(*)"
         result = session.run(query)
         number_of_nodes = result.value()[0]
         assert number_of_nodes == 2
diff --git a/parser/tests/unit/test_parser_unit.py b/parser/tests/unit/test_parser_unit.py
index 52125d89..86badc1b 100644
--- a/parser/tests/unit/test_parser_unit.py
+++ b/parser/tests/unit/test_parser_unit.py
@@ -1,41 +1,43 @@
 import pathlib

+import pytest
+
 from openfoodfacts_taxonomy_parser import normalizer, parser

 # taxonomy in text format : test.txt
 TEST_TAXONOMY_TXT = str(pathlib.Path(__file__).parent.parent / "data" / "test.txt")


-def test_normalized_filename(neo4j):
-    with neo4j.session() as session:
-        x = parser.Parser(session)
-        normalizer = x.normalized_filename
-        name = normalizer("test")
-        assert name == "test.txt"
-        name = normalizer("test.txt")
-        assert name == "test.txt"
-        name = normalizer("t")
-        assert name == "t.txt"
+@pytest.mark.parametrize(
+    "filename, normalized_name",
+    [
+        ("test", "test.txt"),
+        ("test.txt", "test.txt"),
+        ("t", "t.txt"),
+    ],
+)
+def test_normalized_filename(filename: str, normalized_name: str):
+    taxonomy_parser = parser.TaxonomyParser()
+    assert taxonomy_parser._normalized_filename(filename) == normalized_name


 def test_fileiter(neo4j):
-    with neo4j.session() as session:
-        x = parser.Parser(session)
-        file = x.file_iter(TEST_TAXONOMY_TXT)
-
-        for counter, (_, line) in enumerate(file):
-            assert line == "" or line[0] == "#" or ":" in line
-            if counter == 26:
-                assert line == "carbon_footprint_fr_foodges_value:fr:10"
-        assert counter == 37
-
-
-def test_normalizing():
-    text = "Numéro #1, n°1 des ¾ des Français*"
-    text = normalizer.normalizing(text, "fr")
-    assert text == "numero-1-n-1-des-des-francais"
-    text = "Randôm Languäge wìth àccénts"
-    normal_text = normalizer.normalizing(text, "fr")
-    assert normal_text == "random-language-with-accents"
-    normal_text = normalizer.normalizing(text, "de")
-    assert normal_text == "randôm-languäge-wìth-àccénts"
+    taxonomy_parser = parser.TaxonomyParser()
+    file_iterator = taxonomy_parser._file_iter(TEST_TAXONOMY_TXT)
+    for counter, (_, line) in enumerate(file_iterator):
+        assert line == "" or line[0] == "#" or ":" in line
+        if counter == 26:
+            assert line == "carbon_footprint_fr_foodges_value:fr:10"
+    assert counter == 37
+
+
+@pytest.mark.parametrize(
+    "text, normalized_text, lang",
+    [
+        ("Numéro #1, n°1 des ¾ des Français*", "numero-1-n-1-des-des-francais", "fr"),
+        ("Randôm Languäge wìth àccénts", "random-language-with-accents", "fr"),
+        ("Randôm Languäge wìth àccénts", "randôm-languäge-wìth-àccénts", "de"),
+    ],
+)
+def test_normalizing(text: str, normalized_text: str, lang: str):
+    assert normalizer.normalizing(text, lang) == normalized_text
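For reviewers who want to try the refactored API locally, here is a minimal usage sketch of what these tests now exercise. It assumes a Neo4j instance reachable at `NEO4J_URI` (as in `conftest.py`); the taxonomy file path is illustrative, and `_normalized_filename` / `_file_iter` are private helpers shown only to mirror the unit tests. Only calls that appear in this diff are used.

```python
import os

from neo4j import GraphDatabase

from openfoodfacts_taxonomy_parser import parser

# Stateless text-level helpers now live on TaxonomyParser: no Neo4j session needed.
taxonomy_parser = parser.TaxonomyParser()
assert taxonomy_parser._normalized_filename("test") == "test.txt"

# The graph-side Parser is driven through a single call
# (filename, branch name, taxonomy name) instead of the old
# create_nodes / create_child_link / create_previous_link sequence.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
driver = GraphDatabase.driver(uri)
with driver.session(database="neo4j") as session:
    test_parser = parser.Parser(session)
    test_parser("tests/data/test.txt", "branch", "test")  # illustrative path

    # Nodes are matched via the single project label (p_test_branch)
    # rather than the old compound p_test_branch:t_test:b_branch label set.
    count = session.run("MATCH (n:p_test_branch) RETURN COUNT(*)").value()[0]
    print(count)  # 14 for the test taxonomy, including the ERROR node
```

Judging from the tests, the project label is derived as `p_<taxonomy>_<branch>` (e.g. `p_test_branch` for taxonomy "test" on branch "branch", `p_test_branch1` for "branch1"), which is why a single label is now enough to scope every query.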