Skip to content

Commit

Permalink
chore: update tests for new taxonomy parser
Browse files Browse the repository at this point in the history
  • Loading branch information
eric-nguyen-cs committed Dec 21, 2023
1 parent 6674876 commit cd1e288
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 58 deletions.
1 change: 1 addition & 0 deletions parser/openfoodfacts_taxonomy_parser/parser/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from .parser import Parser
from .taxonomy_parser import TaxonomyParser
2 changes: 1 addition & 1 deletion parser/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def neo4j():
"""waiting for neo4j to be ready"""
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
driver = GraphDatabase.driver(uri)
session = driver.session()
session = driver.session(database="neo4j")
connected = False
while not connected:
try:
Expand Down
18 changes: 9 additions & 9 deletions parser/tests/integration/test_parse_unparse_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,21 +11,21 @@
@pytest.fixture(autouse=True)
def test_setup(neo4j):
# delete all the nodes, relations and search indexes in the database
query = "MATCH (n:p_test_branch:t_test:b_branch) DETACH DELETE n"
query = "MATCH (n:p_test_branch) DETACH DELETE n"
neo4j.session().run(query)
query = "DROP INDEX p_test_branch_SearchIds IF EXISTS"
neo4j.session().run(query)
query = "DROP INDEX p_test_branch_SearchTags IF EXISTS"
neo4j.session().run(query)

query1 = "MATCH (n:p_test_branch1:t_test:b_branch1) DETACH DELETE n"
query1 = "MATCH (n:p_test_branch1) DETACH DELETE n"
neo4j.session().run(query1)
query1 = "DROP INDEX p_test_branch1_SearchIds IF EXISTS"
neo4j.session().run(query1)
query1 = "DROP INDEX p_test_branch1_SearchTags IF EXISTS"
neo4j.session().run(query1)

query2 = "MATCH (n:p_test_branch2:t_test:b_branch2) DETACH DELETE n"
query2 = "MATCH (n:p_test_branch2) DETACH DELETE n"
neo4j.session().run(query2)
query2 = "DROP INDEX p_test_branch2_SearchIds IF EXISTS"
neo4j.session().run(query2)
Expand All @@ -41,14 +41,14 @@ def test_round_trip(neo4j):
# parse taxonomy
test_parser(TEST_TAXONOMY_TXT, "branch", "test")
# just quick check it runs ok with total number of nodes
query = "MATCH (n:p_test_branch:t_test:b_branch) RETURN COUNT(*)"
query = "MATCH (n:p_test_branch) RETURN COUNT(*)"
result = session.run(query)
number_of_nodes = result.value()[0]
assert number_of_nodes == 14

# dump taxonomy back
test_dumper = unparser.WriteTaxonomy(session)
lines = list(test_dumper.iter_lines("p_test_branch:t_test:b_branch"))
lines = list(test_dumper.iter_lines("p_test_branch"))

original_lines = [line.rstrip("\n") for line in open(TEST_TAXONOMY_TXT)]
# expected result is close to original file with a few tweaks
Expand Down Expand Up @@ -82,20 +82,20 @@ def test_two_branch_round_trip(neo4j):
test_parser(TEST_TAXONOMY_TXT, "branch2", "test")

# just quick check it runs ok with total number of nodes
query = "MATCH (n:p_test_branch1:t_test:b_branch1) RETURN COUNT(*)"
query = "MATCH (n:p_test_branch1) RETURN COUNT(*)"
result = session.run(query)
number_of_nodes = result.value()[0]
assert number_of_nodes == 14

query = "MATCH (n:p_test_branch2:t_test:b_branch2) RETURN COUNT(*)"
query = "MATCH (n:p_test_branch2) RETURN COUNT(*)"
result = session.run(query)
number_of_nodes = result.value()[0]
assert number_of_nodes == 14

# dump taxonomy back
test_dumper = unparser.WriteTaxonomy(session)
lines_branch1 = list(test_dumper.iter_lines("p_test_branch1:t_test:b_branch1"))
lines_branch2 = list(test_dumper.iter_lines("p_test_branch2:t_test:b_branch2"))
lines_branch1 = list(test_dumper.iter_lines("p_test_branch1"))
lines_branch2 = list(test_dumper.iter_lines("p_test_branch2"))

original_lines = [line.rstrip("\n") for line in open(TEST_TAXONOMY_TXT)]
# expected result is close to original file with a few tweaks
Expand Down
32 changes: 14 additions & 18 deletions parser/tests/integration/test_parser_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
@pytest.fixture(autouse=True)
def test_setup(neo4j):
# delete all the nodes and relations in the database
query = "MATCH (n:p_test_branch:t_test:b_branch) DETACH DELETE n"
query = "MATCH (n:p_test_branch) DETACH DELETE n"
neo4j.session().run(query)
query = "DROP INDEX p_test_branch_id_index IF EXISTS"
neo4j.session().run(query)
query = "DROP INDEX p_test_branch_SearchIds IF EXISTS"
neo4j.session().run(query)
Expand All @@ -24,24 +26,22 @@ def test_setup(neo4j):
def test_calling(neo4j):
with neo4j.session() as session:
test_parser = parser.Parser(session)
test_parser(TEST_TAXONOMY_TXT, "branch", "test")

# Create node test
test_parser.create_nodes(TEST_TAXONOMY_TXT, "p_test_branch:t_test:b_branch")

# total number of nodes
query = "MATCH (n:p_test_branch:t_test:b_branch) RETURN COUNT(*)"
# total number of nodes (TEXT, ENTRY, SYNONYMS, STOPWORDS) + 1 ERROR node
query = "MATCH (n:p_test_branch) RETURN COUNT(*)"
result = session.run(query)
number_of_nodes = result.value()[0]
assert number_of_nodes == 13
assert number_of_nodes == 14

# header correctly added
query = "MATCH (n:p_test_branch:t_test:b_branch) WHERE n.id = '__header__' RETURN n.preceding_lines"
query = "MATCH (n:p_test_branch) WHERE n.id = '__header__' RETURN n.preceding_lines"
result = session.run(query)
header = result.value()[0]
assert header == ["# test taxonomy"]

# synonyms correctly added
query = "MATCH (n:p_test_branch:t_test:b_branch:SYNONYMS) RETURN n ORDER BY n.src_position"
query = "MATCH (n:p_test_branch:SYNONYMS) RETURN n ORDER BY n.src_position"
results = session.run(query)
expected_synonyms = [
{
Expand All @@ -65,7 +65,7 @@ def test_calling(neo4j):
assert node[key] == expected_synonyms[i][key]

# stopwords correctly added
query = "MATCH (n:p_test_branch:t_test:b_branch:STOPWORDS) RETURN n"
query = "MATCH (n:p_test_branch:STOPWORDS) RETURN n"
results = session.run(query)
expected_stopwords = {
"id": "stopwords:0",
Expand All @@ -80,7 +80,7 @@ def test_calling(neo4j):
# entries correctly added
# check for two of them
query = """
MATCH (n:p_test_branch:t_test:b_branch:ENTRY)
MATCH (n:p_test_branch:ENTRY)
WHERE n.id='en:banana-yogurts'
OR n.id='en:meat'
RETURN n
Expand Down Expand Up @@ -108,10 +108,8 @@ def test_calling(neo4j):
for key in expected_entries[i]:
assert node[key] == expected_entries[i][key]

# Child link test
test_parser.create_child_link("p_test_branch:t_test:b_branch") # nodes already added
query = """
MATCH (c:p_test_branch:t_test:b_branch)-[:is_child_of]->(p:p_test_branch:t_test:b_branch)
MATCH (c:p_test_branch)-[:is_child_of]->(p:p_test_branch)
RETURN c.id, p.id
"""
results = session.run(query)
Expand All @@ -133,10 +131,8 @@ def test_calling(neo4j):
for pair in created_pairs:
assert pair in expected_pairs

# Order link test
test_parser.create_previous_link("p_test_branch:t_test:b_branch")
query = """
MATCH (n:p_test_branch:t_test:b_branch)-[:is_before]->(p:p_test_branch:t_test:b_branch)
MATCH (n:p_test_branch)-[:is_before]->(p:p_test_branch)
RETURN n.id, p.id
"""
results = session.run(query)
Expand Down Expand Up @@ -193,7 +189,7 @@ def test_error_log(neo4j, tmp_path, caplog):
test_parser(str(taxonomy_path), "branch", "test")

# only the 2 nodes imported, not the duplicate
query = "MATCH (n:p_test_branch:t_test:b_branch:ENTRY) RETURN COUNT(*)"
query = "MATCH (n:p_test_branch:ENTRY) RETURN COUNT(*)"
result = session.run(query)
number_of_nodes = result.value()[0]
assert number_of_nodes == 2
Expand Down
62 changes: 32 additions & 30 deletions parser/tests/unit/test_parser_unit.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,43 @@
import pathlib

import pytest

from openfoodfacts_taxonomy_parser import normalizer, parser

# taxonomy in text format : test.txt
TEST_TAXONOMY_TXT = str(pathlib.Path(__file__).parent.parent / "data" / "test.txt")


def test_normalized_filename(neo4j):
with neo4j.session() as session:
x = parser.Parser(session)
normalizer = x.normalized_filename
name = normalizer("test")
assert name == "test.txt"
name = normalizer("test.txt")
assert name == "test.txt"
name = normalizer("t")
assert name == "t.txt"
@pytest.mark.parametrize(
    "raw_name, expected",
    [
        # a bare taxonomy name gets the .txt suffix appended
        ("test", "test.txt"),
        # an already-suffixed name is returned unchanged
        ("test.txt", "test.txt"),
        # single-character names follow the same rule
        ("t", "t.txt"),
    ],
)
def test_normalized_filename(raw_name: str, expected: str):
    """_normalized_filename resolves a taxonomy name to its .txt filename."""
    result = parser.TaxonomyParser()._normalized_filename(raw_name)
    assert result == expected


def test_fileiter(neo4j):
with neo4j.session() as session:
x = parser.Parser(session)
file = x.file_iter(TEST_TAXONOMY_TXT)

for counter, (_, line) in enumerate(file):
assert line == "" or line[0] == "#" or ":" in line
if counter == 26:
assert line == "carbon_footprint_fr_foodges_value:fr:10"
assert counter == 37


def test_normalizing():
text = "Numéro #1, n°1 des ¾ des Français*"
text = normalizer.normalizing(text, "fr")
assert text == "numero-1-n-1-des-des-francais"
text = "Randôm Languäge wìth àccénts"
normal_text = normalizer.normalizing(text, "fr")
assert normal_text == "random-language-with-accents"
normal_text = normalizer.normalizing(text, "de")
assert normal_text == "randôm-languäge-wìth-àccénts"
taxonomy_parser = parser.TaxonomyParser()
file_iterator = taxonomy_parser._file_iter(TEST_TAXONOMY_TXT)
for counter, (_, line) in enumerate(file_iterator):
assert line == "" or line[0] == "#" or ":" in line
if counter == 26:
assert line == "carbon_footprint_fr_foodges_value:fr:10"
assert counter == 37


@pytest.mark.parametrize(
    "raw_text, expected_id, language",
    [
        # French: accents, symbols and special glyphs are stripped
        ("Numéro #1, n°1 des ¾ des Français*", "numero-1-n-1-des-des-francais", "fr"),
        ("Randôm Languäge wìth àccénts", "random-language-with-accents", "fr"),
        # German: accented characters are preserved, only lowercased/hyphenated
        ("Randôm Languäge wìth àccénts", "randôm-languäge-wìth-àccénts", "de"),
    ],
)
def test_normalizing(raw_text: str, expected_id: str, language: str):
    """normalizing() turns display text into a language-aware slug id."""
    slug = normalizer.normalizing(raw_text, language)
    assert slug == expected_id

0 comments on commit cd1e288

Please sign in to comment.