diff --git a/docker-compose.yml b/docker-compose.yml index d1857a2d..93808b77 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -3,7 +3,7 @@ version: "3.9" services: neo4j: restart: ${RESTART_POLICY:-no} - image: neo4j:5.3.0-community + image: neo4j:5.14.0-community ports: # admin console - "${NEO4J_ADMIN_EXPOSE:-127.0.0.1:7474}:7474" diff --git a/parser/Makefile b/parser/Makefile index 83a0b6d6..dc6e744e 100644 --- a/parser/Makefile +++ b/parser/Makefile @@ -13,7 +13,7 @@ quality: tests: cd .. && docker compose up -d neo4j - pytest . + poetry run pytest . # we do not shutdown neo4j checks: quality tests \ No newline at end of file diff --git a/parser/openfoodfacts_taxonomy_parser/normalizer.py b/parser/openfoodfacts_taxonomy_parser/normalizer.py index 68e1e635..7a16b663 100644 --- a/parser/openfoodfacts_taxonomy_parser/normalizer.py +++ b/parser/openfoodfacts_taxonomy_parser/normalizer.py @@ -7,7 +7,7 @@ import unidecode -def normalizing(line, lang="default", char="-"): +def normalizing(line: str, lang="default", char="-"): """Normalize a string depending on the language code""" line = unicodedata.normalize("NFC", line) diff --git a/parser/openfoodfacts_taxonomy_parser/parser.py b/parser/openfoodfacts_taxonomy_parser/parser.py deleted file mode 100644 index cd346928..00000000 --- a/parser/openfoodfacts_taxonomy_parser/parser.py +++ /dev/null @@ -1,495 +0,0 @@ -import logging -import os -import re -import sys - -import iso639 -from neo4j import GraphDatabase - -from .exception import DuplicateIDError -from .normalizer import normalizing - - -def ellipsis(text, max=20): - """Cut a text adding eventual ellipsis if we do not display it fully""" - return text[:max] + ("..." if len(text) > max else "") - - -class ParserConsoleLogger: - def __init__(self): - self.parsing_warnings = [] # Stores all warning logs - self.parsing_errors = [] # Stores all error logs - - def info(self, msg, *args, **kwargs): - """Stores all parsing info logs""" - logging.info(msg, *args, **kwargs) - - def warning(self, msg, *args, **kwargs): - """Stores all parsing warning logs""" - self.parsing_warnings.append(msg % args) - logging.warning(msg, *args, **kwargs) - - def error(self, msg, *args, **kwargs): - """Stores all parsing error logs""" - self.parsing_errors.append(msg % args) - logging.error(msg, *args, **kwargs) - - -class Parser: - """Parse a taxonomy file and build a neo4j graph""" - - def __init__(self, session): - self.session = session - self.parser_logger = ParserConsoleLogger() - - def create_headernode(self, header, multi_label): - """Create the node for the header""" - query = f""" - CREATE (n:{multi_label}:TEXT) - SET n.id = '__header__' - SET n.preceding_lines= $header - SET n.src_position= 1 - """ - self.session.run(query, header=header) - - def create_node(self, data, multi_label): - """Run the query to create the node with data dictionary""" - position_query = """ - SET n.id = $id - SET n.is_before = $is_before - SET n.preceding_lines = $preceding_lines - SET n.src_position = $src_position - """ - entry_query = "" - if data["id"] == "__footer__": - id_query = f" CREATE (n:{multi_label}:TEXT) \n " - elif data["id"].startswith("synonyms"): - id_query = f" CREATE (n:{multi_label}:SYNONYMS) \n " - elif data["id"].startswith("stopwords"): - id_query = f" CREATE (n:{multi_label}:STOPWORDS) \n " - else: - id_query = f" CREATE (n:{multi_label}:ENTRY) \n " - position_query += " SET n.main_language = $main_language " - if data["parent_tag"]: - entry_query += " SET n.parents = $parent_tag \n" - for 
key in data: - if key.startswith("prop_"): - entry_query += " SET n." + key + " = $" + key + "\n" - - for key in data: - if key.startswith("tags_"): - entry_query += " SET n." + key + " = $" + key + "\n" - - query = id_query + entry_query + position_query - self.session.run(query, data) - - def normalized_filename(self, filename): - """Add the .txt extension if it is missing in the filename""" - return filename + (".txt" if (len(filename) < 4 or filename[-4:] != ".txt") else "") - - def get_project_name(self, taxonomy_name, branch_name): - """Create a project name for given branch and taxonomy""" - return "p_" + taxonomy_name + "_" + branch_name - - def create_multi_label(self, taxonomy_name, branch_name): - """Create a combined label with taxonomy name and branch name""" - project_name = self.get_project_name(taxonomy_name, branch_name) - return project_name + ":" + ("t_" + taxonomy_name) + ":" + ("b_" + branch_name) - - def file_iter(self, filename, start=0): - """Generator to get the file line by line""" - with open(filename, "r", encoding="utf8") as file: - for line_number, line in enumerate(file): - if line_number < start: - continue - # sanitizing - # remove any space characters at end of line - line = line.rstrip() - # replace ’ (typographique quote) to simple quote ' - line = line.replace("’", "'") - # replace commas that have no space around by a lower comma character - # and do the same for escaped comma (preceded by a \) - # (to distinguish them from commas acting as tags separators) - line = re.sub(r"(\d),(\d)", r"\1‚\2", line) - line = re.sub(r"\\,", "\\‚", line) - # removes parenthesis for roman numeral - line = re.sub(r"\(([ivx]+)\)", r"\1", line, flags=re.I) - yield line_number, line - yield line_number, "" # to end the last entry if not ended - - def remove_stopwords(self, lc, words): - """Remove the stopwords that were read at the beginning of the file""" - # First check if this language has stopwords - if lc in self.stopwords: - words_to_remove = self.stopwords[lc] - new_words = [] - for word in words.split("-"): - if word not in words_to_remove: - new_words.append(word) - return ("-").join(new_words) - else: - return words - - def add_line(self, line): - """ - Get a normalized string but keeping the language code "lc:", - used for id and parent tag - """ - lc, line = line.split(":", 1) - new_line = lc + ":" - new_line += self.remove_stopwords(lc, normalizing(line, lc)) - return new_line - - def get_lc_value(self, line): - """Get the language code "lc" and a list of normalized values""" - lc, line = line.split(":", 1) - new_line = [] - for word in line.split(","): - new_line.append(self.remove_stopwords(lc, normalizing(word, lc))) - return lc, new_line - - def new_node_data(self, is_before): - """To create an empty dictionary that will be used to create node""" - data = { - "id": "", - "main_language": "", - "preceding_lines": [], - "parent_tag": [], - "src_position": None, - "is_before": is_before, - } - return data - - def set_data_id(self, data, id, line_number): - if not data["id"]: - data["id"] = id - else: - raise DuplicateIDError(line_number) - return data - - def header_harvest(self, filename): - """ - Harvest the header (comment with #), - it has its own function because some header has multiple blocks - """ - h = 0 - header = [] - for _, line in self.file_iter(filename): - if not (line) or line[0] == "#": - header.append(line) - else: - break - h += 1 - - # we don't want to eat the comments of the next block - # and it removes the last separating line - for i 
in range(len(header)): - if header.pop(): - h -= 1 - else: - break - - return header, h - - def entry_end(self, line, data): - """Return True if the block ended""" - # stopwords and synonyms are one-liner, entries are separated by a blank line - if line.startswith("stopwords") or line.startswith("synonyms") or not line: - # can be the end of an block or just additional line separator, - # file_iter() always end with '' - if data["id"]: # to be sure that it's an end - return True - return False - - def remove_separating_line(self, data): - """ - To remove the one separating line that is always there, - between synonyms part and stopwords part and before each entry - """ - is_before = data["is_before"] - # first, check if there is at least one preceding line - if data["preceding_lines"] and not data["preceding_lines"][0]: - if data["id"].startswith("synonyms"): - # it's a synonyms block, - # if the previous block is a stopwords block, - # there is at least one separating line - if "stopwords" in is_before: - data["preceding_lines"].pop(0) - - elif data["id"].startswith("stopwords"): - # it's a stopwords block, - # if the previous block is a synonyms block, - # there is at least one separating line - if "synonyms" in is_before: - data["preceding_lines"].pop(0) - - else: - # it's an entry block, there is always a separating line - data["preceding_lines"].pop(0) - return data - - def harvest(self, filename): - """Transform data from file to dictionary - """ - saved_nodes = [] - index_stopwords = 0 - index_synonyms = 0 - language_code_prefix = re.compile( - r"[a-zA-Z][a-zA-Z][a-zA-Z]?([-_][a-zA-Z][a-zA-Z][a-zA-Z]?)?:" - ) - # Check if it is correctly written - correctly_written = re.compile(r"\w+\Z") - # stopwords will contain a list of stopwords with their language code as key - self.stopwords = {} - - # header - header, next_line = self.header_harvest(filename) - yield header - - # the other entries - data = self.new_node_data(is_before="__header__") - data["is_before"] = "__header__" - for line_number, line in self.file_iter(filename, next_line): - # yield data if block ended - if self.entry_end(line, data): - if data["id"] in saved_nodes: - msg = ( - "Entry with same id %s already created, " - "duplicate id in file at line %s. " - "Node creation cancelled." - ) - self.parser_logger.error(msg, data['id'], data['src_position']) - else: - data = self.remove_separating_line(data) - yield data # another function will use this dictionary to create a node - saved_nodes.append(data["id"]) - data = self.new_node_data(is_before=data["id"]) - - # harvest the line - if not (line) or line[0] == "#": - # comment or blank - data["preceding_lines"].append(line) - else: - line = line.rstrip(",") - if not data["src_position"]: - data["src_position"] = line_number + 1 - if line.startswith("stopwords"): - # general stopwords definition for a language - id = "stopwords:" + str(index_stopwords) - data = self.set_data_id(data, id, line_number) - index_stopwords += 1 - try: - lc, value = self.get_lc_value(line[10:]) - except ValueError: - self.parser_logger.error( - "Missing language code at line %d ? 
'%s'", - line_number + 1, - ellipsis(line), - ) - else: - data["tags_" + lc] = value - # add the list with its lc - self.stopwords[lc] = value - elif line.startswith("synonyms"): - # general synonyms definition for a language - id = "synonyms:" + str(index_synonyms) - data = self.set_data_id(data, id, line_number) - index_synonyms += 1 - line = line[9:] - tags = [words.strip() for words in line[3:].split(",")] - try: - lc, value = self.get_lc_value(line) - except ValueError: - self.parser_logger.error( - "Missing language code at line %d ? '%s'", - line_number + 1, - ellipsis(line), - ) - else: - data["tags_" + lc] = tags - data["tags_ids_" + lc] = value - elif line[0] == "<": - # parent definition - data["parent_tag"].append(self.add_line(line[1:])) - elif language_code_prefix.match(line): - # synonyms definition - if not data["id"]: - data["id"] = self.add_line(line.split(",", 1)[0]) - # first 2-3 characters before ":" are the language code - data["main_language"] = data["id"].split(":", 1)[0] - # add tags and tagsid - lang, line = line.split(":", 1) - # to transform '-' from language code to '_' - lang = lang.strip().replace("-", "_") - tags_list = [] - tagsids_list = [] - for word in line.split(","): - tags_list.append(word.strip()) - word_normalized = self.remove_stopwords(lang, normalizing(word, lang)) - if word_normalized not in tagsids_list: - # in case 2 normalized synonyms are the same - tagsids_list.append(word_normalized) - data["tags_" + lang] = tags_list - data["tags_ids_" + lang] = tagsids_list - else: - # property definition - property_name = None - try: - property_name, lc, property_value = line.split(":", 2) - except ValueError: - self.parser_logger.error( - "Reading error at line %d, unexpected format: '%s'", - line_number + 1, - ellipsis(line), - ) - else: - # in case there is space before or after the colons - property_name = property_name.strip() - lc = lc.strip().replace("-", "_") - if not ( - correctly_written.match(property_name) and correctly_written.match(lc) - ): - self.parser_logger.error( - "Reading error at line %d, unexpected format: '%s'", - line_number + 1, - ellipsis(line), - ) - if property_name: - data["prop_" + property_name + "_" + lc] = property_value - - data["id"] = "__footer__" - data["preceding_lines"].pop(0) - data["src_position"] = line_number + 1 - len(data["preceding_lines"]) - yield data - - def create_nodes(self, filename, multi_label): - """Adding nodes to database""" - self.parser_logger.info("Creating nodes") - harvested_data = self.harvest(filename) - self.create_headernode(next(harvested_data), multi_label) - for entry in harvested_data: - self.create_node(entry, multi_label) - - def create_previous_link(self, multi_label): - self.parser_logger.info("Creating 'is_before' links") - query = f"MATCH(n:{multi_label}) WHERE n.is_before IS NOT NULL return n.id, n.is_before" - results = self.session.run(query) - for result in results: - id = result["n.id"] - id_previous = result["n.is_before"] - - query = f""" - MATCH(n:{multi_label}) WHERE n.id = $id - MATCH(p:{multi_label}) WHERE p.id= $id_previous - CREATE (p)-[r:is_before]->(n) - RETURN r - """ - results = self.session.run(query, id=id, id_previous=id_previous) - relation = results.values() - if len(relation) > 1: - self.parser_logger.error( - "2 or more 'is_before' links created for ids %s and %s, " - "one of the ids isn't unique", - id, - id_previous, - ) - elif not relation[0]: - self.parser_logger.error("link not created between %s and %s", id, id_previous) - - def parent_search(self, 
multi_label): - """Get the parent and the child to link""" - query = f"MATCH (n:{multi_label}:ENTRY) WHERE SIZE(n.parents)>0 RETURN n.id, n.parents" - results = self.session.run(query) - for result in results: - id = result["n.id"] - parent_list = result["n.parents"] - for parent in parent_list: - yield parent, id - - def create_child_link(self, multi_label): - """Create the relations between nodes""" - self.parser_logger.info("Creating 'is_child_of' links") - for parent, child_id in self.parent_search(multi_label): - lc, parent_id = parent.split(":") - query = f""" MATCH (p:{multi_label}:ENTRY) WHERE $parent_id IN p.tags_ids_""" + lc - query += f""" - MATCH (c:{multi_label}) WHERE c.id= $child_id - CREATE (c)-[r:is_child_of]->(p) - RETURN r - """ - result = self.session.run(query, parent_id=parent_id, child_id=child_id) - if not result.value(): - self.parser_logger.warning( - f"parent not found for child {child_id} with parent {parent_id}" - ) - - def delete_used_properties(self): - query = "MATCH (n) SET n.is_before = null, n.parents = null" - self.session.run(query) - - def create_fulltext_index(self, taxonomy_name, branch_name): - """Create indexes for search""" - project_name = self.get_project_name(taxonomy_name, branch_name) - query = [ - f"""CREATE FULLTEXT INDEX {project_name+'_SearchIds'} IF NOT EXISTS - FOR (n:{project_name}) ON EACH [n.id]\n""" - ] - query.append("""OPTIONS {indexConfig: {`fulltext.analyzer`: 'keyword'}}""") - self.session.run("".join(query)) - - language_codes = [lang.alpha2 for lang in list(iso639.languages) if lang.alpha2 != ""] - tags_prefixed_lc = ["n.tags_" + lc for lc in language_codes] - tags_prefixed_lc = ", ".join(tags_prefixed_lc) - query = f"""CREATE FULLTEXT INDEX {project_name+'_SearchTags'} IF NOT EXISTS - FOR (n:{project_name}) ON EACH [{tags_prefixed_lc}]""" - self.session.run(query) - - def create_parsing_errors_node(self, taxonomy_name, branch_name): - """Create node to list parsing errors""" - multi_label = self.create_multi_label(taxonomy_name, branch_name) - query = f""" - CREATE (n:{multi_label}:ERRORS) - SET n.id = $project_name - SET n.branch_name = $branch_name - SET n.taxonomy_name = $taxonomy_name - SET n.created_at = datetime() - SET n.warnings = $warnings_list - SET n.errors = $errors_list - """ - params = { - "project_name": self.get_project_name(taxonomy_name, branch_name), - "branch_name": branch_name, - "taxonomy_name": taxonomy_name, - "warnings_list": self.parser_logger.parsing_warnings, - "errors_list": self.parser_logger.parsing_errors, - } - self.session.run(query, params) - - def __call__(self, filename, branch_name, taxonomy_name): - """Process the file""" - filename = self.normalized_filename(filename) - branch_name = normalizing(branch_name, char="_") - multi_label = self.create_multi_label(taxonomy_name, branch_name) - self.create_nodes(filename, multi_label) - self.create_child_link(multi_label) - self.create_previous_link(multi_label) - self.create_fulltext_index(taxonomy_name, branch_name) - self.create_parsing_errors_node(taxonomy_name, branch_name) - # self.delete_used_properties() - - -if __name__ == "__main__": - # Setup logs - logging.basicConfig(handlers=[logging.StreamHandler()], level=logging.INFO) - filename = sys.argv[1] if len(sys.argv) > 1 else "test" - branch_name = sys.argv[2] if len(sys.argv) > 1 else "branch" - taxonomy_name = sys.argv[3] if len(sys.argv) > 1 else filename.rsplit(".", 1)[0] - - # Initialize neo4j - uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687") - driver = 
GraphDatabase.driver(uri)
-    session = driver.session()
-
-    # Pass session variable to parser object
-    parse = Parser(session)
-    parse(filename, branch_name, taxonomy_name)
diff --git a/parser/openfoodfacts_taxonomy_parser/parser/__init__.py b/parser/openfoodfacts_taxonomy_parser/parser/__init__.py
new file mode 100644
index 00000000..9da82bc2
--- /dev/null
+++ b/parser/openfoodfacts_taxonomy_parser/parser/__init__.py
@@ -0,0 +1,2 @@
+from .parser import Parser
+from .taxonomy_parser import TaxonomyParser
diff --git a/parser/openfoodfacts_taxonomy_parser/exception.py b/parser/openfoodfacts_taxonomy_parser/parser/exception.py
similarity index 100%
rename from parser/openfoodfacts_taxonomy_parser/exception.py
rename to parser/openfoodfacts_taxonomy_parser/parser/exception.py
diff --git a/parser/openfoodfacts_taxonomy_parser/parser/logger.py b/parser/openfoodfacts_taxonomy_parser/parser/logger.py
new file mode 100644
index 00000000..555c5bb8
--- /dev/null
+++ b/parser/openfoodfacts_taxonomy_parser/parser/logger.py
@@ -0,0 +1,21 @@
+import logging
+
+
+class ParserConsoleLogger:
+    def __init__(self):
+        self.parsing_warnings = []  # Stores all warning logs
+        self.parsing_errors = []  # Stores all error logs
+
+    def info(self, msg, *args, **kwargs):
+        """Stores all parsing info logs"""
+        logging.info(msg, *args, **kwargs)
+
+    def warning(self, msg, *args, **kwargs):
+        """Stores all parsing warning logs"""
+        self.parsing_warnings.append(msg % args)
+        logging.warning(msg, *args, **kwargs)
+
+    def error(self, msg, *args, **kwargs):
+        """Stores all parsing error logs"""
+        self.parsing_errors.append(msg % args)
+        logging.error(msg, *args, **kwargs)
diff --git a/parser/openfoodfacts_taxonomy_parser/parser/parser.py b/parser/openfoodfacts_taxonomy_parser/parser/parser.py
new file mode 100644
index 00000000..d0af686d
--- /dev/null
+++ b/parser/openfoodfacts_taxonomy_parser/parser/parser.py
@@ -0,0 +1,199 @@
+import logging
+import os
+import sys
+
+import iso639
+from neo4j import GraphDatabase, Session
+
+from .logger import ParserConsoleLogger
+from ..normalizer import normalizing
+from .taxonomy_parser import (
+    NodeType,
+    PreviousLink,
+    TaxonomyParser,
+    NodeData,
+    ChildLink,
+)
+
+
+def ellipsis(text, max=20):
+    """Cut the text, adding an ellipsis if it is not displayed in full"""
+    return text[:max] + ("..." if len(text) > max else "")
+
+
+class Parser:
+    """Parse a taxonomy file and build a neo4j graph"""
+
+    def __init__(self, session: Session):
+        self.session = session
+        self.parser_logger = ParserConsoleLogger()
+
+    def _create_headernode(self, header: list[str], multi_label: str):
+        """Create the node for the header"""
+        query = f"""
+            CREATE (n:{multi_label}:TEXT)
+            SET n.id = '__header__'
+            SET n.preceding_lines= $header
+            SET n.src_position= 1
+        """
+        self.session.run(query, header=header)
+
+    def _create_node(self, node_data: NodeData, multi_label: str):
+        """Run the query to create the node with its data dictionary"""
+        position_query = """
+            SET n.id = $id
+            SET n.is_before = $is_before
+            SET n.preceding_lines = $preceding_lines
+            SET n.src_position = $src_position
+        """
+        entry_query = ""
+        if node_data.get_node_type() == NodeType.TEXT:
+            id_query = f" CREATE (n:{multi_label}:TEXT) \n "
+        elif node_data.get_node_type() == NodeType.SYNONYMS:
+            id_query = f" CREATE (n:{multi_label}:SYNONYMS) \n "
+        elif node_data.get_node_type() == NodeType.STOPWORDS:
+            id_query = f" CREATE (n:{multi_label}:STOPWORDS) \n "
+        else:
+            id_query = f" CREATE (n:{multi_label}:ENTRY) \n "
+            position_query += " SET n.main_language = $main_language "
+            if node_data.parent_tag:
+                entry_query += " SET n.parents = $parent_tag \n"
+            for key in node_data.properties:
+                if key.startswith("prop_"):
+                    entry_query += " SET n." + key + " = $" + key + "\n"
+
+        for key in node_data.tags:
+            if key.startswith("tags_"):
+                entry_query += " SET n." + key + " = $" + key + "\n"
+
+        query = id_query + entry_query + position_query
+        self.session.run(query, node_data.to_dict())
+
+    def _get_project_name(self, taxonomy_name: str, branch_name: str):
+        """Create a project name for the given branch and taxonomy"""
+        return "p_" + taxonomy_name + "_" + branch_name
+
+    def _create_multi_label(self, taxonomy_name: str, branch_name: str) -> str:
+        """Create a combined label with taxonomy name and branch name"""
+        project_name = self._get_project_name(taxonomy_name, branch_name)
+        return project_name + ":" + ("t_" + taxonomy_name) + ":" + ("b_" + branch_name)
+
+    def create_nodes(self, nodes: list[NodeData], multi_label: str):
+        """Add the nodes to the database"""
+        self.parser_logger.info("Creating nodes")
+        for node in nodes:
+            if node.id == "__header__":
+                self._create_headernode(node.preceding_lines, multi_label)
+            else:
+                self._create_node(node, multi_label)
+
+    def create_previous_link(self, previous_links: list[PreviousLink], multi_label: str):
+        self.parser_logger.info("Creating 'is_before' links")
+        for previous_link in previous_links:
+            id = previous_link["id"]
+            before_id = previous_link["before_id"]
+
+            query = f"""
+                MATCH(n:{multi_label}) WHERE n.id = $id
+                MATCH(p:{multi_label}) WHERE p.id= $before_id
+                CREATE (p)-[r:is_before]->(n)
+                RETURN r
+            """
+            results = self.session.run(query, id=id, before_id=before_id)
+            relation = results.values()
+            if len(relation) > 1:
+                self.parser_logger.error(
+                    "2 or more 'is_before' links created for ids %s and %s, "
+                    "one of the ids isn't unique",
+                    id,
+                    before_id,
+                )
+            elif not relation[0]:
+                self.parser_logger.error("link not created between %s and %s", id, before_id)
+
+    def create_child_link(self, child_links: list[ChildLink], multi_label: str):
+        """Create the relations between nodes"""
+        self.parser_logger.info("Creating 'is_child_of' links")
+        for child_link in child_links:
+            child_id = child_link["id"]
+            parent = child_link["parent_id"]
+            lc, parent_id = parent.split(":")
+            query = f""" MATCH (p:{multi_label}:ENTRY) WHERE $parent_id IN p.tags_ids_""" + lc
+            query += f"""
+                MATCH (c:{multi_label}) WHERE c.id= $child_id
+                CREATE (c)-[r:is_child_of]->(p)
+                RETURN r
+            """
+            result = self.session.run(query, parent_id=parent_id, child_id=child_id)
+            if not result.value():
+                self.parser_logger.warning(
+                    f"parent not found for child {child_id} with parent {parent_id}"
+                )
+
+    def _create_fulltext_index(self, taxonomy_name: str, branch_name: str):
+        """Create indexes for search"""
+        project_name = self._get_project_name(taxonomy_name, branch_name)
+        query = (
+            f"""CREATE FULLTEXT INDEX {project_name+'_SearchIds'} IF NOT EXISTS
+            FOR (n:{project_name}) ON EACH [n.id]\n"""
+            + """
+            OPTIONS {indexConfig: {`fulltext.analyzer`: 'keyword'}}"""
+        )
+        self.session.run(query)
+
+        language_codes = [lang.alpha2 for lang in list(iso639.languages) if lang.alpha2 != ""]
+        tags_prefixed_lc = ["n.tags_" + lc for lc in language_codes]
+        tags_prefixed_lc = ", ".join(tags_prefixed_lc)
+        query = f"""CREATE FULLTEXT INDEX {project_name+'_SearchTags'} IF NOT EXISTS
+            FOR (n:{project_name}) ON EACH [{tags_prefixed_lc}]"""
+        self.session.run(query)
+
+    def _create_parsing_errors_node(self, taxonomy_name: str, branch_name: str):
+        """Create node to list parsing errors"""
+        multi_label = self._create_multi_label(taxonomy_name, branch_name)
+        query = f"""
+            CREATE (n:{multi_label}:ERRORS)
+            SET n.id = $project_name
+            SET n.branch_name = $branch_name
+            SET n.taxonomy_name = $taxonomy_name
+            SET n.created_at = datetime()
+            SET n.warnings = $warnings_list
+            SET n.errors = $errors_list
+        """
+        params = {
+            "project_name": self._get_project_name(taxonomy_name, branch_name),
+            "branch_name": branch_name,
+            "taxonomy_name": taxonomy_name,
+            "warnings_list": self.parser_logger.parsing_warnings,
+            "errors_list": self.parser_logger.parsing_errors,
+        }
+        self.session.run(query, params)
+
+    def __call__(self, filename: str, branch_name: str, taxonomy_name: str):
+        """Process the file"""
+        branch_name = normalizing(branch_name, char="_")
+        multi_label = self._create_multi_label(taxonomy_name, branch_name)
+        taxonomy_parser = TaxonomyParser()
+        taxonomy = taxonomy_parser.parse_file(filename, self.parser_logger)
+        self.create_nodes([*taxonomy.entry_nodes, *taxonomy.other_nodes], multi_label)
+        self.create_child_link(taxonomy.child_links, multi_label)
+        self.create_previous_link(taxonomy.previous_links, multi_label)
+        self._create_fulltext_index(taxonomy_name, branch_name)
+        self._create_parsing_errors_node(taxonomy_name, branch_name)
+
+
+if __name__ == "__main__":
+    # Setup logs
+    logging.basicConfig(handlers=[logging.StreamHandler()], level=logging.INFO)
+    filename = sys.argv[1] if len(sys.argv) > 1 else "test"
+    branch_name = sys.argv[2] if len(sys.argv) > 1 else "branch"
+    taxonomy_name = sys.argv[3] if len(sys.argv) > 1 else filename.rsplit(".", 1)[0]
+
+    # Initialize neo4j
+    uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
+    driver = GraphDatabase.driver(uri)
+    session = driver.session()
+
+    # Pass session variable to parser object
+    parse = Parser(session)
+    parse(filename, branch_name, taxonomy_name)
diff --git a/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py b/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
new file mode 100644
index 00000000..d46a52fa
--- /dev/null
+++ b/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
@@ -0,0 +1,393 @@
+import logging
+import re
+import sys
+import timeit
+from enum import Enum
+from dataclasses import dataclass, field
+from typing import Iterator, TypedDict
+
+from .logger import ParserConsoleLogger
+from .exception import DuplicateIDError
+from ..normalizer import normalizing
+
+
+def ellipsis(text, max=20):
+    """Cut the text, adding an ellipsis if it is not displayed in full"""
+    return text[:max] + ("..." if len(text) > max else "")
+
+
+class NodeType(str, Enum):
+    TEXT = "TEXT"
+    SYNONYMS = "SYNONYMS"
+    STOPWORDS = "STOPWORDS"
+    ENTRY = "ENTRY"
+
+
+@dataclass(slots=True)
+class NodeData:
+    id: str = ""
+    is_before: str | None = None
+    main_language: str | None = None
+    preceding_lines: list[str] = field(default_factory=list)
+    parent_tag: list[str] = field(default_factory=list)
+    src_position: int | None = None
+    properties: dict[str, str] = field(default_factory=dict)
+    tags: dict[str, list[str]] = field(default_factory=dict)
+
+    def to_dict(self):
+        return {
+            "id": self.id,
+            "is_before": self.is_before,
+            "main_language": self.main_language,
+            "preceding_lines": self.preceding_lines,
+            "parent_tag": self.parent_tag,
+            "src_position": self.src_position,
+            **self.properties,
+            **self.tags,
+        }
+
+    def get_node_type(self):
+        if self.id in ["__header__", "__footer__"]:
+            return NodeType.TEXT
+        elif self.id.startswith("synonyms"):
+            return NodeType.SYNONYMS
+        elif self.id.startswith("stopwords"):
+            return NodeType.STOPWORDS
+        else:
+            return NodeType.ENTRY
+
+
+class PreviousLink(TypedDict):
+    before_id: str
+    id: str
+
+
+class ChildLink(TypedDict):
+    parent_id: str
+    id: str
+
+
+@dataclass(slots=True)
+class Taxonomy:
+    entry_nodes: list[NodeData]
+    other_nodes: list[NodeData]
+    previous_links: list[PreviousLink]
+    child_links: list[ChildLink]
+
+
+class TaxonomyParser:
+    """Parse a taxonomy file"""
+
+    def __init__(self):
+        self.parser_logger = ParserConsoleLogger()
+
+    def _normalized_filename(self, filename: str) -> str:
+        """Add the .txt extension if it is missing in the filename"""
+        return filename + (".txt" if (len(filename) < 4 or filename[-4:] != ".txt") else "")
+
+    def _file_iter(self, filename: str, start: int = 0) -> Iterator[tuple[int, str]]:
+        """Generator to get the file line by line"""
+        with open(filename, "r", encoding="utf8") as file:
+            line_count = 0
+            for line_number, line in enumerate(file):
+                if line_number < start:
+                    continue
+                # sanitizing
+                # remove any space characters at end of line
+                line = line.rstrip()
+                # replace ’ (typographic quote) with simple quote '
+                line = line.replace("’", "'")
+                # replace commas that have no space around them with a low comma character
+                # and do the same for escaped commas (preceded by a \)
+                # (to distinguish them from commas acting as tags separators)
+                line = re.sub(r"(\d),(\d)", r"\1‚\2", line)
+                line = re.sub(r"\\,", "\\‚", line)
+                # remove parentheses around roman numerals
+                line = re.sub(r"\(([ivx]+)\)", r"\1", line, flags=re.I)
+                yield line_number, line
+                line_count += 1
+            yield line_count, ""  # to end the last entry if not ended
+
+    def _remove_stopwords(self, lc: str, words: str) -> str:
+        """Remove the stopwords that were read at the beginning of the file"""
+        # First check if this language has stopwords
+        if lc in self.stopwords:
+            words_to_remove = self.stopwords[lc]
+            new_words = []
+            for word in words.split("-"):
+                if word not in words_to_remove:
+                    new_words.append(word)
+            return ("-").join(new_words)
+        else:
+            return words
+
+    def _add_line(self, line: str) -> str:
+        """
+        Get a normalized string but keeping the language code "lc:",
+        used for id and parent tag
+        """
+        lc, line = line.split(":", 1)
+        new_line = lc + ":"
+        new_line += self._remove_stopwords(lc, normalizing(line, lc))
+        return new_line
+
+    def _get_lc_value(self, line: str) -> tuple[str, list[str]]:
+        """Get the language code "lc" and a list of normalized values"""
+        lc, line = line.split(":", 1)
+        new_line: list[str] = []
+        for word in line.split(","):
+            new_line.append(self._remove_stopwords(lc, normalizing(word, lc)))
+        return lc, new_line
+
+    def _set_data_id(self, data: NodeData, id: str, line_number: int) -> NodeData:
+        if not data.id:
+            data.id = id
+        else:
+            raise DuplicateIDError(line_number)
+        return data
+
+    def _header_harvest(self, filename: str) -> tuple[list[str], int]:
+        """
+        Harvest the header (comment with #),
+        it has its own function because some headers have multiple blocks
+        """
+        h = 0
+        header: list[str] = []
+        for _, line in self._file_iter(filename):
+            if not (line) or line[0] == "#":
+                header.append(line)
+            else:
+                break
+            h += 1
+
+        # we don't want to eat the comments of the next block
+        # and this removes the last separating line
+        for i in range(len(header)):
+            if header.pop():
+                h -= 1
+            else:
+                break
+
+        return header, h
+
+    def _entry_end(self, line: str, data: NodeData) -> bool:
+        """Return True if the block ended"""
+        # stopwords and synonyms are one-liners, entries are separated by a blank line
+        if line.startswith("stopwords") or line.startswith("synonyms") or not line:
+            # can be the end of a block or just an additional line separator,
+            # file_iter() always ends with ''
+            if data.id:  # to be sure that it's an end
+                return True
+        return False
+
+    def _remove_separating_line(self, data: NodeData) -> NodeData:
+        """
+        To remove the one separating line that is always there,
+        between synonyms part and stopwords part and before each entry
+        """
+        is_before = data.is_before
+        # first, check if there is at least one preceding line
+        if data.preceding_lines and not data.preceding_lines[0]:
+            if data.id.startswith("synonyms"):
+                # it's a synonyms block,
+                # if the previous block is a stopwords block,
+                # there is at least one separating line
+                if is_before and "stopwords" in is_before:
+                    data.preceding_lines.pop(0)
+
+            elif data.id.startswith("stopwords"):
+                # it's a stopwords block,
+                # if the previous block is a synonyms block,
+                # there is at least one separating line
+                if is_before and "synonyms" in is_before:
+                    data.preceding_lines.pop(0)
+
+            else:
+                # it's an entry block, there is always a separating line
+                data.preceding_lines.pop(0)
+        return data
+
+    def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
+        """Transform the file content into NodeData objects"""
+        saved_nodes = []
+        index_stopwords = 0
+        index_synonyms = 0
+        language_code_prefix = re.compile(
+            r"[a-zA-Z][a-zA-Z][a-zA-Z]?([-_][a-zA-Z][a-zA-Z][a-zA-Z]?)?:"
+        )
+        # Check if it is correctly written
+        correctly_written = re.compile(r"\w+\Z")
+        # stopwords will contain a list of stopwords with their language code as key
+        self.stopwords = {}
+        # the other entries
+        data = NodeData(is_before="__header__")
+        line_number = (
+            entries_start_line  # if the iterator is empty, line_number will not be unbound
+        )
+        for line_number, line in self._file_iter(filename, entries_start_line):
+            # yield data if block ended
+            if self._entry_end(line, data):
+                if data.id in saved_nodes:
+                    msg = (
+                        "Entry with same id %s already created, "
+                        "duplicate id in file at line %s. "
+                        "Node creation cancelled."
+                    )
+                    self.parser_logger.error(msg, data.id, data.src_position)
+                else:
+                    data = self._remove_separating_line(data)
+                    yield data  # another function will use this NodeData to create a node
+                    saved_nodes.append(data.id)
+                data = NodeData(is_before=data.id)
+
+            # harvest the line
+            if not (line) or line[0] == "#":
+                # comment or blank line
+                data.preceding_lines.append(line)
+            else:
+                line = line.rstrip(",")
+                if not data.src_position:
+                    data.src_position = line_number + 1
+                if line.startswith("stopwords"):
+                    # general stopwords definition for a language
+                    id = "stopwords:" + str(index_stopwords)
+                    data = self._set_data_id(data, id, line_number)
+                    index_stopwords += 1
+                    try:
+                        lc, value = self._get_lc_value(line[10:])
+                    except ValueError:
+                        self.parser_logger.error(
+                            "Missing language code at line %d ? '%s'",
+                            line_number + 1,
+                            ellipsis(line),
+                        )
+                    else:
+                        data.tags["tags_" + lc] = value
+                        # add the list with its lc
+                        self.stopwords[lc] = value
+                elif line.startswith("synonyms"):
+                    # general synonyms definition for a language
+                    id = "synonyms:" + str(index_synonyms)
+                    data = self._set_data_id(data, id, line_number)
+                    index_synonyms += 1
+                    line = line[9:]
+                    tags = [words.strip() for words in line[3:].split(",")]
+                    try:
+                        lc, value = self._get_lc_value(line)
+                    except ValueError:
+                        self.parser_logger.error(
+                            "Missing language code at line %d ? '%s'",
+                            line_number + 1,
+                            ellipsis(line),
+                        )
+                    else:
+                        data.tags["tags_" + lc] = tags
+                        data.tags["tags_ids_" + lc] = value
+                elif line[0] == "<":
+                    # parent definition
+                    data.parent_tag.append(self._add_line(line[1:]))
+                elif language_code_prefix.match(line):
+                    # synonyms definition
+                    if not data.id:
+                        data.id = self._add_line(line.split(",", 1)[0])
+                        # first 2-3 characters before ":" are the language code
+                        data.main_language = data.id.split(":", 1)[0]
+                    # add tags and tag ids
+                    lang, line = line.split(":", 1)
+                    # transform '-' in the language code to '_'
+                    lang = lang.strip().replace("-", "_")
+                    tags_list = []
+                    tagsids_list = []
+                    for word in line.split(","):
+                        tags_list.append(word.strip())
+                        word_normalized = self._remove_stopwords(lang, normalizing(word, lang))
+                        if word_normalized not in tagsids_list:
+                            # in case 2 normalized synonyms are the same
+                            tagsids_list.append(word_normalized)
+                    data.tags["tags_" + lang] = tags_list
+                    data.tags["tags_ids_" + lang] = tagsids_list
+                else:
+                    # property definition
+                    property_name = None
+                    try:
+                        property_name, lc, property_value = line.split(":", 2)
+                    except ValueError:
+                        self.parser_logger.error(
+                            "Reading error at line %d, unexpected format: '%s'",
+                            line_number + 1,
+                            ellipsis(line),
+                        )
+                    else:
+                        # in case there is space before or after the colons
+                        property_name = property_name.strip()
+                        lc = lc.strip().replace("-", "_")
+                        if not (
+                            correctly_written.match(property_name) and correctly_written.match(lc)
+                        ):
+                            self.parser_logger.error(
+                                "Reading error at line %d, unexpected format: '%s'",
+                                line_number + 1,
+                                ellipsis(line),
+                            )
+                        if property_name:
+                            data.properties["prop_" + property_name + "_" + lc] = property_value
+
+        data.id = "__footer__"
+        data.preceding_lines.pop(0)
+        data.src_position = line_number + 1 - len(data.preceding_lines)
+        yield data
+
+    def _create_taxonomy(self, filename: str) -> Taxonomy:
+        """Create the taxonomy from the file"""
+        self.parser_logger.info("Parsing taxonomy file %s", filename)
+        harvested_header_data, entries_start_line = self._header_harvest(filename)
+        entry_nodes: list[NodeData] = []
+        other_nodes = [
+            NodeData(id="__header__", preceding_lines=harvested_header_data, src_position=1)
+        ]
+        previous_links: list[PreviousLink] = []
+        child_links: list[ChildLink] = []
+        harvested_data = self._harvest_entries(filename, entries_start_line)
+        for entry in harvested_data:
+            if entry.get_node_type() == NodeType.ENTRY:
+                entry_nodes.append(entry)
+            else:
+                other_nodes.append(entry)
+            if entry.is_before:
+                previous_links.append(PreviousLink(before_id=entry.is_before, id=entry.id))
+            if entry.parent_tag:
+                for parent in entry.parent_tag:
+                    child_links.append(ChildLink(parent_id=parent, id=entry.id))
+        return Taxonomy(
+            entry_nodes=entry_nodes,
+            other_nodes=other_nodes,
+            previous_links=previous_links,
+            child_links=child_links,
+        )
+
+    def parse_file(self, filename: str, logger: ParserConsoleLogger | None = None) -> Taxonomy:
+        """Process the file into a Taxonomy object"""
+        if logger:
+            self.parser_logger = logger
+        start_time = timeit.default_timer()
+        filename = self._normalized_filename(filename)
+        taxonomy = self._create_taxonomy(filename)
+        end_time = timeit.default_timer()
+        self.parser_logger.info("Parsing done in %s seconds", end_time - start_time)
+        self.parser_logger.info(
+            "Found %d nodes", len(taxonomy.entry_nodes) + len(taxonomy.other_nodes)
+        )
+        self.parser_logger.info("Found %d previous links", len(taxonomy.previous_links))
+        self.parser_logger.info("Found %d child links", len(taxonomy.child_links))
+
+        return taxonomy
+
+
+if __name__ == "__main__":
+    # Setup logs
+    logging.basicConfig(handlers=[logging.StreamHandler()], level=logging.INFO)
+    filename = sys.argv[1] if len(sys.argv) > 1 else "test"
+
+    # Parse the given file
+    parse = TaxonomyParser()
+    parse.parse_file(filename)
diff --git a/parser/tests/conftest.py b/parser/tests/conftest.py
index cf53d30b..8e8a87d5 100644
--- a/parser/tests/conftest.py
+++ b/parser/tests/conftest.py
@@ -11,7 +11,7 @@ def neo4j():
     """waiting for neo4j to be ready"""
     uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
     driver = GraphDatabase.driver(uri)
-    session = driver.session()
+    session = driver.session(database="neo4j")
     connected = False
     while not connected:
         try:
diff --git a/parser/tests/integration/test_parse_unparse_integration.py b/parser/tests/integration/test_parse_unparse_integration.py
index a21a7456..6bf91bb8 100644
--- a/parser/tests/integration/test_parse_unparse_integration.py
+++ b/parser/tests/integration/test_parse_unparse_integration.py
@@ -11,21 +11,21 @@
 @pytest.fixture(autouse=True)
 def test_setup(neo4j):
     # delete all the nodes, relations and search indexes in the database
-    query = "MATCH (n:p_test_branch:t_test:b_branch) DETACH DELETE n"
+    query = "MATCH (n:p_test_branch) DETACH DELETE n"
     neo4j.session().run(query)
     query = "DROP INDEX p_test_branch_SearchIds IF EXISTS"
     neo4j.session().run(query)
     query = "DROP INDEX p_test_branch_SearchTags IF EXISTS"
     neo4j.session().run(query)
 
-    query1 = "MATCH (n:p_test_branch1:t_test:b_branch1) DETACH DELETE n"
+    query1 = "MATCH (n:p_test_branch1) DETACH DELETE n"
     neo4j.session().run(query1)
     query1 = "DROP INDEX p_test_branch1_SearchIds IF EXISTS"
     neo4j.session().run(query1)
     query1 = "DROP INDEX p_test_branch1_SearchTags IF EXISTS"
     neo4j.session().run(query1)
 
-    query2 = "MATCH (n:p_test_branch2:t_test:b_branch2) DETACH DELETE n"
+    query2 = "MATCH (n:p_test_branch2) DETACH DELETE n"
     neo4j.session().run(query2)
     query2 = "DROP INDEX p_test_branch2_SearchIds IF EXISTS"
     neo4j.session().run(query2)
@@ -35,22 +35,20 @@ def test_setup(neo4j):
 
 def test_round_trip(neo4j):
     """test parsing and
dumping back a taxonomy""" - session = neo4j.session() - test_parser = parser.Parser(session) + with neo4j.session() as session: + test_parser = parser.Parser(session) - # parse taxonomy - test_parser(TEST_TAXONOMY_TXT, "branch", "test") - # just quick check it runs ok with total number of nodes - query = "MATCH (n:p_test_branch:t_test:b_branch) RETURN COUNT(*)" - result = session.run(query) - number_of_nodes = result.value()[0] - assert number_of_nodes == 14 + # parse taxonomy + test_parser(TEST_TAXONOMY_TXT, "branch", "test") + # just quick check it runs ok with total number of nodes + query = "MATCH (n:p_test_branch) RETURN COUNT(*)" + result = session.run(query) + number_of_nodes = result.value()[0] + assert number_of_nodes == 14 - # dump taxonomy back - test_dumper = unparser.WriteTaxonomy(session) - lines = list(test_dumper.iter_lines("p_test_branch:t_test:b_branch")) - - session.close() + # dump taxonomy back + test_dumper = unparser.WriteTaxonomy(session) + lines = list(test_dumper.iter_lines("p_test_branch")) original_lines = [line.rstrip("\n") for line in open(TEST_TAXONOMY_TXT)] # expected result is close to original file with a few tweaks @@ -75,32 +73,29 @@ def test_round_trip(neo4j): def test_two_branch_round_trip(neo4j): """test parsing and dumping the same taxonomy with two different branches""" - session = neo4j.session() - - test_parser = parser.Parser(session) - - # parse taxonomy with branch1 - test_parser(TEST_TAXONOMY_TXT, "branch1", "test") - # parse taxonomy with branch2 - test_parser(TEST_TAXONOMY_TXT, "branch2", "test") - - # just quick check it runs ok with total number of nodes - query = "MATCH (n:p_test_branch1:t_test:b_branch1) RETURN COUNT(*)" - result = session.run(query) - number_of_nodes = result.value()[0] - assert number_of_nodes == 14 - - query = "MATCH (n:p_test_branch2:t_test:b_branch2) RETURN COUNT(*)" - result = session.run(query) - number_of_nodes = result.value()[0] - assert number_of_nodes == 14 - - # dump taxonomy back - test_dumper = unparser.WriteTaxonomy(session) - lines_branch1 = list(test_dumper.iter_lines("p_test_branch1:t_test:b_branch1")) - lines_branch2 = list(test_dumper.iter_lines("p_test_branch2:t_test:b_branch2")) - - session.close() + with neo4j.session() as session: + test_parser = parser.Parser(session) + + # parse taxonomy with branch1 + test_parser(TEST_TAXONOMY_TXT, "branch1", "test") + # parse taxonomy with branch2 + test_parser(TEST_TAXONOMY_TXT, "branch2", "test") + + # just quick check it runs ok with total number of nodes + query = "MATCH (n:p_test_branch1) RETURN COUNT(*)" + result = session.run(query) + number_of_nodes = result.value()[0] + assert number_of_nodes == 14 + + query = "MATCH (n:p_test_branch2) RETURN COUNT(*)" + result = session.run(query) + number_of_nodes = result.value()[0] + assert number_of_nodes == 14 + + # dump taxonomy back + test_dumper = unparser.WriteTaxonomy(session) + lines_branch1 = list(test_dumper.iter_lines("p_test_branch1")) + lines_branch2 = list(test_dumper.iter_lines("p_test_branch2")) original_lines = [line.rstrip("\n") for line in open(TEST_TAXONOMY_TXT)] # expected result is close to original file with a few tweaks diff --git a/parser/tests/integration/test_parser_integration.py b/parser/tests/integration/test_parser_integration.py index c5223107..5dd4e090 100644 --- a/parser/tests/integration/test_parser_integration.py +++ b/parser/tests/integration/test_parser_integration.py @@ -13,7 +13,9 @@ @pytest.fixture(autouse=True) def test_setup(neo4j): # delete all the nodes and relations 
in the database - query = "MATCH (n:p_test_branch:t_test:b_branch) DETACH DELETE n" + query = "MATCH (n:p_test_branch) DETACH DELETE n" + neo4j.session().run(query) + query = "DROP INDEX p_test_branch_id_index IF EXISTS" neo4j.session().run(query) query = "DROP INDEX p_test_branch_SearchIds IF EXISTS" neo4j.session().run(query) @@ -22,192 +24,185 @@ def test_setup(neo4j): def test_calling(neo4j): - session = neo4j.session() - test_parser = parser.Parser(session) - - # Create node test - test_parser.create_nodes(TEST_TAXONOMY_TXT, "p_test_branch:t_test:b_branch") - - # total number of nodes - query = "MATCH (n:p_test_branch:t_test:b_branch) RETURN COUNT(*)" - result = session.run(query) - number_of_nodes = result.value()[0] - assert number_of_nodes == 13 - - # header correctly added - query = ( - "MATCH (n:p_test_branch:t_test:b_branch) WHERE n.id = '__header__' RETURN n.preceding_lines" - ) - result = session.run(query) - header = result.value()[0] - assert header == ["# test taxonomy"] - - # synonyms correctly added - query = "MATCH (n:p_test_branch:t_test:b_branch:SYNONYMS) RETURN n ORDER BY n.src_position" - results = session.run(query) - expected_synonyms = [ - { - "id": "synonyms:0", - "tags_en": ["passion fruit", "passionfruit"], - "tags_ids_en": ["passion-fruit", "passionfruit"], - "preceding_lines": [], - "src_position": 5, - }, - { - "id": "synonyms:1", - "tags_fr": ["fruit de la passion", "maracuja", "passion"], - "tags_ids_fr": ["fruit-passion", "maracuja", "passion"], - "preceding_lines": [""], - "src_position": 7, - }, - ] - for i, result in enumerate(results): - node = result.value() - for key in expected_synonyms[i]: - assert node[key] == expected_synonyms[i][key] - - # stopwords correctly added - query = "MATCH (n:p_test_branch:t_test:b_branch:STOPWORDS) RETURN n" - results = session.run(query) - expected_stopwords = { - "id": "stopwords:0", - "tags_fr": ["aux", "au", "de", "le", "du", "la", "a", "et"], - "preceding_lines": [], - } - for result in results: - node = result.value() - for key in expected_stopwords: - assert node[key] == expected_stopwords[key] - - # entries correctly added - # check for two of them - query = """ - MATCH (n:p_test_branch:t_test:b_branch:ENTRY) - WHERE n.id='en:banana-yogurts' - OR n.id='en:meat' - RETURN n - ORDER BY n.src_position - """ - results = session.run(query) - expected_entries = [ - { - "tags_en": ["banana yogurts"], - "tags_ids_en": ["banana-yogurts"], - "tags_fr": ["yaourts à la banane"], - "tags_ids_fr": ["yaourts-banane"], + with neo4j.session() as session: + test_parser = parser.Parser(session) + test_parser(TEST_TAXONOMY_TXT, "branch", "test") + + # total number of nodes (TEXT, ENTRY, SYNONYMS, STOPWORDS) + 1 ERROR node + query = "MATCH (n:p_test_branch) RETURN COUNT(*)" + result = session.run(query) + number_of_nodes = result.value()[0] + assert number_of_nodes == 14 + + # header correctly added + query = "MATCH (n:p_test_branch) WHERE n.id = '__header__' RETURN n.preceding_lines" + result = session.run(query) + header = result.value()[0] + assert header == ["# test taxonomy"] + + # synonyms correctly added + query = "MATCH (n:p_test_branch:SYNONYMS) RETURN n ORDER BY n.src_position" + results = session.run(query) + expected_synonyms = [ + { + "id": "synonyms:0", + "tags_en": ["passion fruit", "passionfruit"], + "tags_ids_en": ["passion-fruit", "passionfruit"], + "preceding_lines": [], + "src_position": 5, + }, + { + "id": "synonyms:1", + "tags_fr": ["fruit de la passion", "maracuja", "passion"], + "tags_ids_fr": 
["fruit-passion", "maracuja", "passion"], + "preceding_lines": [""], + "src_position": 7, + }, + ] + for i, result in enumerate(results): + node = result.value() + for key in expected_synonyms[i]: + assert node[key] == expected_synonyms[i][key] + + # stopwords correctly added + query = "MATCH (n:p_test_branch:STOPWORDS) RETURN n" + results = session.run(query) + expected_stopwords = { + "id": "stopwords:0", + "tags_fr": ["aux", "au", "de", "le", "du", "la", "a", "et"], "preceding_lines": [], - }, - { - "tags_en": ["meat"], - "tags_ids_en": ["meat"], - "preceding_lines": ["# meat", ""], - "prop_vegan_en": "no", - "prop_carbon_footprint_fr_foodges_value_fr": "10", - }, - ] - for i, result in enumerate(results): - node = result.value() - for key in expected_entries[i]: - assert node[key] == expected_entries[i][key] - - # Child link test - test_parser.create_child_link("p_test_branch:t_test:b_branch") # nodes already added - query = """ - MATCH (c:p_test_branch:t_test:b_branch)-[:is_child_of]->(p:p_test_branch:t_test:b_branch) - RETURN c.id, p.id - """ - results = session.run(query) - created_pairs = results.values() - - # correct number of links - number_of_links = len(created_pairs) - assert number_of_links == 6 - - # correctly linked - expected_pairs = [ - ["en:banana-yogurts", "en:yogurts"], - ["en:passion-fruit-yogurts", "en:yogurts"], - ["fr:yaourts-fruit-passion-alleges", "en:passion-fruit-yogurts"], - ["en:fake-meat", "en:meat"], - ["en:fake-duck-meat", "en:fake-meat"], - ["en:fake-duck-meat", "en:fake-stuff"], - ] - for pair in created_pairs: - assert pair in expected_pairs - - # Order link test - test_parser.create_previous_link("p_test_branch:t_test:b_branch") - query = """ - MATCH (n:p_test_branch:t_test:b_branch)-[:is_before]->(p:p_test_branch:t_test:b_branch) - RETURN n.id, p.id - """ - results = session.run(query) - created_pairs = results.values() - - # correct number of links - number_of_links = len(created_pairs) - assert number_of_links == 12 - - # correctly linked - expected_pairs = [ - ["__header__", "stopwords:0"], - ["stopwords:0", "synonyms:0"], - ["synonyms:0", "synonyms:1"], - ["synonyms:1", "en:yogurts"], - ["en:yogurts", "en:banana-yogurts"], - ["en:banana-yogurts", "en:passion-fruit-yogurts"], - ["en:passion-fruit-yogurts", "fr:yaourts-fruit-passion-alleges"], - ["fr:yaourts-fruit-passion-alleges", "en:meat"], - ["en:meat", "en:fake-meat"], - ["en:fake-meat", "en:fake-stuff"], - ["en:fake-stuff", "en:fake-duck-meat"], - ["en:fake-duck-meat", "__footer__"], - ] - for pair in created_pairs: - assert pair in expected_pairs - session.close() + } + for result in results: + node = result.value() + for key in expected_stopwords: + assert node[key] == expected_stopwords[key] + + # entries correctly added + # check for two of them + query = """ + MATCH (n:p_test_branch:ENTRY) + WHERE n.id='en:banana-yogurts' + OR n.id='en:meat' + RETURN n + ORDER BY n.src_position + """ + results = session.run(query) + expected_entries = [ + { + "tags_en": ["banana yogurts"], + "tags_ids_en": ["banana-yogurts"], + "tags_fr": ["yaourts à la banane"], + "tags_ids_fr": ["yaourts-banane"], + "preceding_lines": [], + }, + { + "tags_en": ["meat"], + "tags_ids_en": ["meat"], + "preceding_lines": ["# meat", ""], + "prop_vegan_en": "no", + "prop_carbon_footprint_fr_foodges_value_fr": "10", + }, + ] + for i, result in enumerate(results): + node = result.value() + for key in expected_entries[i]: + assert node[key] == expected_entries[i][key] + + query = """ + MATCH 
(c:p_test_branch)-[:is_child_of]->(p:p_test_branch) + RETURN c.id, p.id + """ + results = session.run(query) + created_pairs = results.values() + + # correct number of links + number_of_links = len(created_pairs) + assert number_of_links == 6 + + # correctly linked + expected_pairs = [ + ["en:banana-yogurts", "en:yogurts"], + ["en:passion-fruit-yogurts", "en:yogurts"], + ["fr:yaourts-fruit-passion-alleges", "en:passion-fruit-yogurts"], + ["en:fake-meat", "en:meat"], + ["en:fake-duck-meat", "en:fake-meat"], + ["en:fake-duck-meat", "en:fake-stuff"], + ] + for pair in created_pairs: + assert pair in expected_pairs + + query = """ + MATCH (n:p_test_branch)-[:is_before]->(p:p_test_branch) + RETURN n.id, p.id + """ + results = session.run(query) + created_pairs = results.values() + + # correct number of links + number_of_links = len(created_pairs) + assert number_of_links == 12 + + # correctly linked + expected_pairs = [ + ["__header__", "stopwords:0"], + ["stopwords:0", "synonyms:0"], + ["synonyms:0", "synonyms:1"], + ["synonyms:1", "en:yogurts"], + ["en:yogurts", "en:banana-yogurts"], + ["en:banana-yogurts", "en:passion-fruit-yogurts"], + ["en:passion-fruit-yogurts", "fr:yaourts-fruit-passion-alleges"], + ["fr:yaourts-fruit-passion-alleges", "en:meat"], + ["en:meat", "en:fake-meat"], + ["en:fake-meat", "en:fake-stuff"], + ["en:fake-stuff", "en:fake-duck-meat"], + ["en:fake-duck-meat", "__footer__"], + ] + for pair in created_pairs: + assert pair in expected_pairs def test_error_log(neo4j, tmp_path, caplog): # error entries with same id - session = neo4j.session() - test_parser = parser.Parser(session) - - taxonomy_txt = textwrap.dedent(""" - # a fake taxonomy - stopwords:fr: aux,au,de,le,du,la,a,et - - # meat - en:meat - -