From 1fe888e5240f71da376a3feb45738d43d2814404 Mon Sep 17 00:00:00 2001 From: Alex Garel Date: Tue, 23 Jul 2024 15:28:13 +0200 Subject: [PATCH 1/7] fix: less transformations on comment lines and others --- Makefile | 4 +- .../parser/parser.py | 39 ++++-- .../parser/taxonomy_parser.py | 121 +++++++++++------- .../openfoodfacts_taxonomy_parser/unparser.py | 12 +- parser/openfoodfacts_taxonomy_parser/utils.py | 11 +- parser/tests/data/test.txt | 16 +-- .../test_parse_unparse_integration.py | 32 +++-- .../integration/test_parser_integration.py | 84 ++++++++++-- parser/tests/unit/test_parser_unit.py | 1 - parser/tests/unit/test_unparser_unit.py | 38 ++++++ 10 files changed, 263 insertions(+), 95 deletions(-) create mode 100644 parser/tests/unit/test_unparser_unit.py diff --git a/Makefile b/Makefile index cf31b416..611b4fbd 100644 --- a/Makefile +++ b/Makefile @@ -127,8 +127,8 @@ lint: backend_lint frontend_lint config_lint ## Run all linters backend_lint: ## Run lint on backend code @echo "🍜 Linting python code" - ${DOCKER_COMPOSE} run --rm taxonomy_api isort . - ${DOCKER_COMPOSE} run --rm taxonomy_api black . + ${DOCKER_COMPOSE} run --rm taxonomy_api isort . /parser + ${DOCKER_COMPOSE} run --rm taxonomy_api black . /parser frontend_lint: ## Run lint on frontend code @echo "🍜 Linting react code" diff --git a/parser/openfoodfacts_taxonomy_parser/parser/parser.py b/parser/openfoodfacts_taxonomy_parser/parser/parser.py index e2f051a3..f9dca464 100644 --- a/parser/openfoodfacts_taxonomy_parser/parser/parser.py +++ b/parser/openfoodfacts_taxonomy_parser/parser/parser.py @@ -9,7 +9,14 @@ from ..utils import get_project_name, normalize_text from .logger import ParserConsoleLogger -from .taxonomy_parser import ChildLink, NodeData, NodeType, PreviousLink, Taxonomy, TaxonomyParser +from .taxonomy_parser import ( + ChildLink, + NodeData, + NodeType, + PreviousLink, + Taxonomy, + TaxonomyParser, +) class Parser: @@ -19,7 +26,9 @@ def __init__(self, session: Session): self.session = session self.parser_logger = ParserConsoleLogger() - def _create_other_node(self, tx: Transaction, node_data: NodeData, project_label: str): + def _create_other_node( + self, tx: Transaction, node_data: NodeData, project_label: str + ): """Create a TEXT, SYNONYMS or STOPWORDS node""" if node_data.get_node_type() == NodeType.TEXT: type_label = "TEXT" @@ -88,20 +97,26 @@ def _create_entry_nodes(self, entry_nodes: list[NodeData], project_label: str): original_taxonomy: entry_node.original_taxonomy """ - properties_query = ",\n".join([base_properties_query, *additional_properties_queries]) + properties_query = ",\n".join( + [base_properties_query, *additional_properties_queries] + ) query = f""" WITH $entry_nodes as entry_nodes UNWIND entry_nodes as entry_node CREATE (n:{project_label}:ENTRY {{ {properties_query} }}) """ - self.session.run(query, entry_nodes=[entry_node.to_dict() for entry_node in entry_nodes]) + self.session.run( + query, entry_nodes=[entry_node.to_dict() for entry_node in entry_nodes] + ) self.parser_logger.info( f"Created {len(entry_nodes)} ENTRY nodes in {timeit.default_timer() - start_time} seconds" ) - def _create_previous_links(self, previous_links: list[PreviousLink], project_label: str): + def _create_previous_links( + self, previous_links: list[PreviousLink], project_label: str + ): """Create the 'is_before' relations between nodes""" self.parser_logger.info("Creating 'is_before' links") start_time = timeit.default_timer() @@ -142,9 +157,9 @@ def _create_child_links(self, child_links: list[ChildLink], 
project_label: str): MATCH (p:{project_label}) USING INDEX p:{project_label}(id) WHERE p.id = child_link.parent_id MATCH (c:{project_label}) USING INDEX c:{project_label}(id) + WHERE c.id = child_link.id """ + """ - WHERE c.id = child_link.id CREATE (c)-[relations:is_child_of {position: child_link.position}]->(p) WITH relations UNWIND relations AS relation @@ -210,7 +225,9 @@ def _create_node_fulltext_index(self, project_label: str): ) self.session.run(query) - language_codes = [lang.alpha2 for lang in list(iso639.languages) if lang.alpha2 != ""] + language_codes = [ + lang.alpha2 for lang in list(iso639.languages) if lang.alpha2 != "" + ] tags_prefixed_lc = ["n.tags_ids_" + lc for lc in language_codes] tags_prefixed_lc = ", ".join(tags_prefixed_lc) query = f"""CREATE FULLTEXT INDEX {project_label+'_SearchTagsIds'} IF NOT EXISTS @@ -225,9 +242,13 @@ def _create_node_indexes(self, project_label: str): self._create_node_id_index(project_label) self._create_node_fulltext_index(project_label) - self.parser_logger.info(f"Created indexes in {timeit.default_timer() - start_time} seconds") + self.parser_logger.info( + f"Created indexes in {timeit.default_timer() - start_time} seconds" + ) - def _write_to_database(self, taxonomy: Taxonomy, taxonomy_name: str, branch_name: str): + def _write_to_database( + self, taxonomy: Taxonomy, taxonomy_name: str, branch_name: str + ): project_label = get_project_name(taxonomy_name, branch_name) # First create nodes, then create node indexes to accelerate relationship creation, then create relationships self._create_other_nodes(taxonomy.other_nodes, project_label) diff --git a/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py b/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py index 9eb4fa78..84eeb11f 100644 --- a/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py +++ b/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py @@ -96,15 +96,7 @@ def _file_iter(self, filename: str, start: int = 0) -> Iterator[tuple[int, str]] for line_number, line in enumerate(file): if line_number < start: continue - # sanitizing - # remove any space characters at end of line - line = line.rstrip() - # replace commas between digits and that have no space around by a lower comma character - # and do the same for escaped comma (preceded by a \) - # (to distinguish them from commas acting as tags separators) - line = re.sub(r"(\d),(\d)", r"\1‚\2", line) - line = re.sub(r"\\,", "\\‚", line) - yield line_number, line + yield line_number, line.rstrip("\n") line_count += 1 yield line_count, "" # to end the last entry if not ended @@ -113,13 +105,30 @@ def _normalize_entry_id(self, raw_id: str) -> str: Get a normalized string but keeping the language code "lc:", used for id and parent tag """ + raw_id = raw_id.strip() lc, main_tag = raw_id.split(":", 1) normalized_main_tag = normalize_text(main_tag, lc, stopwords=self.stopwords) normalized_id = f"{lc}:{normalized_main_tag}" return normalized_id + def prepare_line(self, line: str) -> str: + """prepare line for parsing + + This is different from `normalize_text` which is to compute ids + """ + line = line.strip() + # sanitizing + # remove any space or commas characters at end of line + line = re.sub(r"[\s,]+$", "", line) + # replace commas between digits and that have no space around by a lower comma character + # and do the same for escaped comma (preceded by a \) + # (to distinguish them from commas acting as tags separators) + line = re.sub(r"(\d),(\d)", r"\1‚\2", line) + line = re.sub(r"\\,", 
"\\‚", line) + return line + def undo_normalize_text(self, text: str) -> str: - """Undo some normalizations made in `_file_iter`""" + """Undo some normalizations made in `prepare_line`""" # restore commas from lower comma characters text = re.sub(r"(\d)‚(\d)", r"\1,\2", text) text = re.sub(r"\\‚", "\\,", text) @@ -148,7 +157,7 @@ def _header_harvest(self, filename: str) -> tuple[list[str], int]: h = 0 header: list[str] = [] for _, line in self._file_iter(filename): - if not (line) or line[0] == "#": + if not (line.strip()) or line[0] == "#": header.append(line) else: break @@ -157,7 +166,7 @@ def _header_harvest(self, filename: str) -> tuple[list[str], int]: # we don't want to eat the comments of the next block # and it removes the last separating line for i in range(len(header)): - if header.pop(): + if header.pop().strip(): h -= 1 else: break @@ -169,7 +178,7 @@ def _entry_end(self, line: str, data: NodeData) -> bool: if data.id.startswith("stopwords") or data.id.startswith("synonyms"): # stopwords and synonyms are one-liners; if the id is set, it's the end return True - if not line and data.id: + if not line.strip() and data.id: # entries are separated by a blank line return True return False @@ -181,7 +190,7 @@ def _remove_separating_line(self, data: NodeData) -> NodeData: """ is_before = data.is_before # first, check if there is at least one preceding line - if data.preceding_lines and not data.preceding_lines[0]: + if data.preceding_lines and not data.preceding_lines[0].strip(): if data.id.startswith("synonyms"): # it's a synonyms block, # if the previous block is a stopwords block, @@ -211,7 +220,9 @@ def _get_node_data_with_comments_above_key( # Get comments just above the given line comments_above = [] current_line = line_number - 1 - while new_data.comments_stack and new_data.comments_stack[-1][0] == current_line: + while ( + new_data.comments_stack and new_data.comments_stack[-1][0] == current_line + ): comments_above.append(new_data.comments_stack.pop()[1]) current_line -= 1 if comments_above: @@ -246,12 +257,12 @@ def is_entry_synonyms_line(self, line): matching_prefix = self._language_code_prefix.match(line) if matching_prefix: # verify it's not a property, that is a name followed by a colon and a language - return not ( - self._language_code_prefix.match(line[matching_prefix.end():]) - ) + return not (self._language_code_prefix.match(line[matching_prefix.end() :])) return False - def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]: + def _harvest_entries( + self, filename: str, entries_start_line: int + ) -> Iterator[NodeData]: """Transform data from file to dictionary""" saved_nodes = [] index_stopwords = 0 @@ -263,12 +274,10 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N self.stopwords = {} # the first entry is after __header__ which was created before data = NodeData(is_before="__header__") - line_number = ( - entries_start_line # if the iterator is empty, line_number will not be unbound - ) - for line_number, line in self._file_iter(filename, entries_start_line): + line_number = entries_start_line # if the iterator is empty, line_number will not be unbound + for line_number, raw_line in self._file_iter(filename, entries_start_line): # yield data if block ended - if self._entry_end(line, data): + if self._entry_end(raw_line, data): data = self._remove_separating_line(data) if data.get_node_type() == NodeType.ENTRY: data = self._get_node_data_with_parent_and_end_comments(data) @@ -289,16 +298,16 @@ 
def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N data = NodeData(is_before=is_before) # harvest the line - if not (line) or line[0] == "#": + if not (raw_line.strip()) or raw_line[0] == "#": # comment or blank line if data.id: # we are within the node definition - data.comments_stack.append((line_number, line)) + data.comments_stack.append((line_number, raw_line)) else: # we are before the actual node - data.preceding_lines.append(line) + data.preceding_lines.append(raw_line) else: - line = line.rstrip(",") + line = self.prepare_line(raw_line) if not data.src_position: data.src_position = line_number + 1 if line.startswith("stopwords"): @@ -309,7 +318,9 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N # remove "stopwords:" part line = line[10:] try: - lc, tags, tags_ids = self._get_lc_value(line, remove_stopwords=False) + lc, tags, tags_ids = self._get_lc_value( + line, remove_stopwords=False + ) except ValueError: self.parser_logger.error( f"Missing language code at line {line_number + 1} ? '{self.parser_logger.ellipsis(line)}'" @@ -337,7 +348,9 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N data.tags["tags_ids_" + lc] = tags_ids elif line[0] == "<": # parent definition - data.parent_tags.append((self._normalize_entry_id(line[1:]), line_number + 1)) + data.parent_tags.append( + (self._normalize_entry_id(line[1:]), line_number + 1) + ) elif self.is_entry_synonyms_line(line): # synonyms definition if not data.id: @@ -353,7 +366,9 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N tagsids_list = [] for word in line.split(","): tags_list.append(self.undo_normalize_text(word.strip())) - word_normalized = normalize_text(word, lang, stopwords=self.stopwords) + word_normalized = normalize_text( + word, lang, stopwords=self.stopwords + ) if word_normalized not in tagsids_list: # in case 2 normalized synonyms are the same tagsids_list.append(word_normalized) @@ -376,14 +391,17 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N property_name = property_name.strip() lc = lc.strip().replace("-", "_") if not ( - correctly_written.match(property_name) and correctly_written.match(lc) + correctly_written.match(property_name) + and correctly_written.match(lc) ): self.parser_logger.error( f"Reading error at line {line_number + 1}, unexpected format: '{self.parser_logger.ellipsis(line)}'" ) if property_name: prop_key = "prop_" + property_name + "_" + lc - data.properties[prop_key] = self.undo_normalize_text(property_value.strip()) + data.properties[prop_key] = self.undo_normalize_text( + property_value.strip() + ) data = self._get_node_data_with_comments_above_key( data, line_number, prop_key ) @@ -409,7 +427,9 @@ def _normalise_and_validate_child_links( # we collect all the tags_ids in a certain language tags_ids = {} for node in entry_nodes: - node_tags_ids = {tag_id: node.id for tag_id in node.tags.get(f"tags_ids_{lc}", [])} + node_tags_ids = { + tag_id: node.id for tag_id in node.tags.get(f"tags_ids_{lc}", []) + } tags_ids.update(node_tags_ids) # we check if the parent_id exists in the tags_ids @@ -418,7 +438,9 @@ def _normalise_and_validate_child_links( if parent_id not in tags_ids: missing_child_links.append(child_link) else: - child_link["parent_id"] = tags_ids[parent_id] # normalise the parent_id + child_link["parent_id"] = tags_ids[ + parent_id + ] # normalise the parent_id normalised_child_links.append(child_link) return 
normalised_child_links, missing_child_links @@ -444,8 +466,10 @@ def _get_valid_child_links( ] # Normalise and validate the unnormalised links - normalised_child_links, missing_child_links = self._normalise_and_validate_child_links( - entry_nodes, child_links_to_normalise + normalised_child_links, missing_child_links = ( + self._normalise_and_validate_child_links( + entry_nodes, child_links_to_normalise + ) ) valid_child_links.extend(normalised_child_links) @@ -459,7 +483,9 @@ def _get_valid_child_links( return valid_child_links - def _remove_duplicate_child_links(self, child_links: list[ChildLink]) -> list[ChildLink]: + def _remove_duplicate_child_links( + self, child_links: list[ChildLink] + ) -> list[ChildLink]: """Remove duplicate child links (i.e child links with the same parent_id and id)""" unique_child_links = [] children_to_parents = collections.defaultdict(set) @@ -470,10 +496,13 @@ def _remove_duplicate_child_links(self, child_links: list[ChildLink]) -> list[Ch unique_child_links.append(child_link) return unique_child_links - def _merge_duplicate_entry_nodes(self, entry_nodes: list[NodeData]) -> list[NodeData]: + def _merge_duplicate_entry_nodes( + self, entry_nodes: list[NodeData] + ) -> list[NodeData]: """Merge entry nodes with the same id: - merge their tags (union) - - merge their properties (union, and in case of conflict, keep the last value)""" + - merge their properties (union, and in case of conflict, keep the last value) + """ unique_entry_nodes = [] ids_to_nodes = dict() for node in entry_nodes: @@ -539,7 +568,9 @@ def _create_taxonomy( entry_nodes: list[NodeData] = [] entry_nodes.extend(external_entry_nodes) other_nodes = [ - NodeData(id="__header__", preceding_lines=harvested_header_data, src_position=1) + NodeData( + id="__header__", preceding_lines=harvested_header_data, src_position=1 + ) ] previous_links: list[PreviousLink] = [] raw_child_links: list[ChildLink] = [] @@ -551,7 +582,9 @@ def _create_taxonomy( else: other_nodes.append(entry) if entry.is_before: - previous_links.append(PreviousLink(before_id=entry.is_before, id=entry.id)) + previous_links.append( + PreviousLink(before_id=entry.is_before, id=entry.id) + ) if entry.parent_tags: for position, (parent, line_position) in enumerate(entry.parent_tags): raw_child_links.append( @@ -584,7 +617,9 @@ def parse_file( start_time = timeit.default_timer() filename = normalize_filename(filename) taxonomy = self._create_taxonomy(filename, external_filenames) - self.parser_logger.info(f"Parsing done in {timeit.default_timer() - start_time} seconds.") + self.parser_logger.info( + f"Parsing done in {timeit.default_timer() - start_time} seconds." 
+ ) self.parser_logger.info( f"Found {len(taxonomy.entry_nodes) + len(taxonomy.other_nodes)} nodes" ) diff --git a/parser/openfoodfacts_taxonomy_parser/unparser.py b/parser/openfoodfacts_taxonomy_parser/unparser.py index 158c2578..4a4770aa 100644 --- a/parser/openfoodfacts_taxonomy_parser/unparser.py +++ b/parser/openfoodfacts_taxonomy_parser/unparser.py @@ -67,7 +67,11 @@ def list_property_and_lc(self, node): """return an ordered list of properties with their language code (lc)""" # there is no rule for the order of properties # properties will be arranged in alphabetical order - values = [property[5:] for property in node if property.startswith("prop_") and not property.endswith("_comments")] + values = [ + property[5:] + for property in node + if property.startswith("prop_") and not property.endswith("_comments") + ] # note: using the fact that we are sure to find language code after the first underscore return sorted(values, key=self.property_sort_key) @@ -83,7 +87,7 @@ def get_parents_lines(self, parents): parent = dict(parent) lc = parent["main_language"] parent_id = parent["tags_" + lc][0] - yield "<" + lc + ": " + parent_id + yield "< " + lc + ":" + parent_id def iter_lines(self, project_label): previous_block_id = "" @@ -91,9 +95,9 @@ def iter_lines(self, project_label): node = dict(node) has_content = node["id"] not in ["__header__", "__footer__"] # eventually add a blank line but in specific case - following_synonyms = node["id"].startswith("synonyms") and previous_block_id.startswith( + following_synonyms = node["id"].startswith( "synonyms" - ) + ) and previous_block_id.startswith("synonyms") following_stopwords = node["id"].startswith( "stopwords" ) and previous_block_id.startswith("stopwords") diff --git a/parser/openfoodfacts_taxonomy_parser/utils.py b/parser/openfoodfacts_taxonomy_parser/utils.py index 36225921..fa8a61d6 100644 --- a/parser/openfoodfacts_taxonomy_parser/utils.py +++ b/parser/openfoodfacts_taxonomy_parser/utils.py @@ -4,7 +4,12 @@ import unidecode -def normalize_text(line: str, lang: str = "default", char: str = "-", stopwords: dict[str, list[str]] | None = None) -> str: +def normalize_text( + line: str, + lang: str = "default", + char: str = "-", + stopwords: dict[str, list[str]] | None = None, +) -> str: """Normalize a string depending on the language code""" if stopwords is None: stopwords = {} @@ -43,7 +48,9 @@ def normalize_text(line: str, lang: str = "default", char: str = "-", stopwords: stopwords = stopwords[lang] line_surrounded_by_char = char + line + char for stopword in stopwords: - line_surrounded_by_char = line_surrounded_by_char.replace(char + stopword + char, char) + line_surrounded_by_char = line_surrounded_by_char.replace( + char + stopword + char, char + ) line = line_surrounded_by_char[1:-1] return line diff --git a/parser/tests/data/test.txt b/parser/tests/data/test.txt index 92f38f86..c501a4b5 100644 --- a/parser/tests/data/test.txt +++ b/parser/tests/data/test.txt @@ -6,33 +6,33 @@ synonyms:en: passion fruit, passionfruit synonyms:fr: fruit de la passion, maracuja, passion - Date: Tue, 23 Jul 2024 17:32:50 +0200 Subject: [PATCH 2/7] fix: fix properties orders in unparse --- .../openfoodfacts_taxonomy_parser/unparser.py | 2 +- parser/tests/unit/test_unparser_unit.py | 22 +++++++++++++------ 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/parser/openfoodfacts_taxonomy_parser/unparser.py b/parser/openfoodfacts_taxonomy_parser/unparser.py index 4a4770aa..994407e0 100644 --- a/parser/openfoodfacts_taxonomy_parser/unparser.py 
+++ b/parser/openfoodfacts_taxonomy_parser/unparser.py @@ -58,7 +58,7 @@ def get_tags_line(self, node, lc): @staticmethod def property_sort_key(property): - name, lang_code, *_ = property.split("_", 2) + name, lang_code = property.rsplit("_", 1) # give priority to xx and en language codes priority = {"en": 1, "xx": 0} return (name, priority.get(lang_code, 100), lang_code) diff --git a/parser/tests/unit/test_unparser_unit.py b/parser/tests/unit/test_unparser_unit.py index 9764dd0d..249abbdf 100644 --- a/parser/tests/unit/test_unparser_unit.py +++ b/parser/tests/unit/test_unparser_unit.py @@ -12,16 +12,18 @@ """ vegan:en: yes ciqual_food_name:fr: Lécithine de soja + ciqual_food_name:ar: الكيكلية البيضية ciqual_food_code:en: 42200 vegetarian:en: yes ciqual_food_name:en: Soy lecithin """, [ - "ciqual_food_code:en", - "ciqual_food_name:en", - "ciqual_food_name:fr", - "vegan:en", - "vegetarian:en", + "ciqual_food_code_en", + "ciqual_food_name_en", + "ciqual_food_name_ar", + "ciqual_food_name_fr", + "vegan_en", + "vegetarian_en", ], ), ], @@ -33,6 +35,12 @@ def test_list_property_and_lc(properties_txt, expected): ] name_lc_values = [prop.split(":", 2) for prop in properties] node = {f"prop_{name}_{lc}": value for name, lc, value in name_lc_values} - # node = {f"prop_{name}_{lc}": value for prop in properties for name, lc, value in prop.split(":", 2)} result = unparser.WriteTaxonomy(None).list_property_and_lc(node) - assert result, expected + assert result == expected + +def test_list_property_and_lc_with_comment(): + node = {"prop_test_ar": "a", "prop_test_ar_comments": "# a", "prop_test_en": "b", "prop_test_comments_en": "legit"} + result = unparser.WriteTaxonomy(None).list_property_and_lc(node) + assert result == ["test_en", "test_ar", "test_comments_en"] + + From b7d5a0f2e85b1faa2c0c563e2987e9a14c970061 Mon Sep 17 00:00:00 2001 From: Alex Garel Date: Tue, 23 Jul 2024 17:47:06 +0200 Subject: [PATCH 3/7] chore: flake8 --- parser/tests/unit/test_unparser_unit.py | 1 - 1 file changed, 1 deletion(-) diff --git a/parser/tests/unit/test_unparser_unit.py b/parser/tests/unit/test_unparser_unit.py index bffa685e..a904705b 100644 --- a/parser/tests/unit/test_unparser_unit.py +++ b/parser/tests/unit/test_unparser_unit.py @@ -1,5 +1,4 @@ import inspect -import pathlib import pytest From d8d62fe49e625b4b4dbe7407a46bcabbf9585007 Mon Sep 17 00:00:00 2001 From: Alex Garel Date: Tue, 23 Jul 2024 17:52:39 +0200 Subject: [PATCH 4/7] chore: reformat comment --- parser/openfoodfacts_taxonomy_parser/parser/parser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/parser/openfoodfacts_taxonomy_parser/parser/parser.py b/parser/openfoodfacts_taxonomy_parser/parser/parser.py index cdeacca0..633ff92a 100644 --- a/parser/openfoodfacts_taxonomy_parser/parser/parser.py +++ b/parser/openfoodfacts_taxonomy_parser/parser/parser.py @@ -230,7 +230,8 @@ def _create_node_indexes(self, project_label: str): def _write_to_database(self, taxonomy: Taxonomy, taxonomy_name: str, branch_name: str): project_label = get_project_name(taxonomy_name, branch_name) - # First create nodes, then create node indexes to accelerate relationship creation, + # First create nodes, + # then create node indexes to accelerate relationship creation, # then create relationships self._create_other_nodes(taxonomy.other_nodes, project_label) self._create_entry_nodes(taxonomy.entry_nodes, project_label) From f08082016403ab7710c2d63f953d730fdac56d9a Mon Sep 17 00:00:00 2001 From: Alex Garel Date: Tue, 23 Jul 2024 23:11:06 +0200 Subject: 
[PATCH 5/7] refactor: simplify comment handling in parser

---
 .../parser/taxonomy_parser.py | 97 +++++++------------
 1 file changed, 33 insertions(+), 64 deletions(-)

diff --git a/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py b/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
index 81b8e824..6e7ce166 100644
--- a/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
+++ b/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
@@ -1,5 +1,4 @@
 import collections
-import copy
 import logging
 import re
 import sys
@@ -31,11 +30,6 @@ class NodeData:
     properties: dict[str, str] = field(default_factory=dict)
     tags: dict[str, list[str]] = field(default_factory=dict)
     comments: dict[str, list[str]] = field(default_factory=dict)
-    # comments_stack is a list of tuples (line_number, comment),
-    # to keep track of comments just above the current line
-    # during parsing of an entry, to be able to add them
-    # to the right property or tag when possible
-    comments_stack: list[tuple[int, str]] = field(default_factory=list)
     is_external: bool = False  # True if the node comes from another taxonomy
     original_taxonomy: str | None = None  # the name of the taxonomy the node comes from
@@ -210,42 +204,13 @@ def _remove_separating_line(self, data: NodeData) -> NodeData:
             data.preceding_lines.pop(0)
         return data
 
-    def _get_node_data_with_comments_above_key(
-        self, data: NodeData, line_number: int, key: str
-    ) -> NodeData:
+    def _add_comments(self, data: NodeData, comments: list[str], key: str) -> None:
         """Returns the updated node data with comments above the given
         key stored in the {key}_comments property."""
-        new_data = copy.deepcopy(data)
-
-        # Get comments just above the given line
-        comments_above = []
-        current_line = line_number - 1
-        while new_data.comments_stack and new_data.comments_stack[-1][0] == current_line:
-            comments_above.append(new_data.comments_stack.pop()[1])
-            current_line -= 1
-        if comments_above:
-            new_data.comments[key + "_comments"] = comments_above[::-1]
-
-        return new_data
-
-    def _get_node_data_with_parent_and_end_comments(self, data: NodeData) -> NodeData:
-        """Returns the updated node data with parent and end comments"""
-        new_data = copy.deepcopy(data)
-
-        # Get parent comments (part of an entry block and just above/between the parents lines)
-        parent_comments = []
-        while new_data.preceding_lines and new_data.preceding_lines[-1] != "":
-            parent_comments.append(new_data.preceding_lines.pop())
-        if parent_comments:
-            new_data.comments["parent_comments"] = parent_comments[::-1]
-
-        # Get end comments (part of an entry block after the last tag/prop
-        # and before the separating blank line)
-        end_comments = [comment[1] for comment in new_data.comments_stack]
-        if end_comments:
-            new_data.comments["end_comments"] = end_comments
-
-        return new_data
+        if comments:
+            data.comments.setdefault(f"{key}_comments", []).extend(comments)
+            # reset the comments list
+            comments[:] = []
 
     _language_code_prefix = re.compile(
         r"[a-zA-Z][a-zA-Z][a-zA-Z]?([-_][a-zA-Z][a-zA-Z][a-zA-Z]?)?:"
     )
@@ -260,11 +225,30 @@ def is_entry_synonyms_line(self, line):
         )
         return False
 
+    def finalize_data(self, data, comments, saved_nodes):
+        data = self._remove_separating_line(data)
+        if data.get_node_type() == NodeType.ENTRY:
+            self._add_comments(data, comments, "end")
+        if data.id in saved_nodes:
+            # this duplicate node will be merged with the first one
+            data.is_before = None
+            msg = (
+                f"WARNING: Entry with same id {data.id} already exists, "
+                f"duplicate id in file at line
{data.src_position}. " + "The two nodes will be merged, keeping the last " + "values in case of conflicts." + ) + self.parser_logger.error(msg) + else: + saved_nodes.append(data.id) + return data + def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]: """Transform data from file to dictionary""" saved_nodes = [] index_stopwords = 0 index_synonyms = 0 + comments = [] # Check if it is correctly written correctly_written = re.compile(r"\w+\Z") @@ -278,23 +262,11 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N for line_number, raw_line in self._file_iter(filename, entries_start_line): # yield data if block ended if self._entry_end(raw_line, data): - data = self._remove_separating_line(data) - if data.get_node_type() == NodeType.ENTRY: - data = self._get_node_data_with_parent_and_end_comments(data) - if data.id in saved_nodes: - # this duplicate node will be merged with the first one - data.is_before = None - msg = ( - f"WARNING: Entry with same id {data.id} already exists, " - f"duplicate id in file at line {data.src_position}. " - "The two nodes will be merged, keeping the last " - "values in case of conflicts." - ) - self.parser_logger.error(msg) - else: - saved_nodes.append(data.id) - is_before = data.id - yield data # another function will use this dictionary to create a node + is_before = data.is_before + # another function will use data to create a node + yield self.finalize_data(data, comments, saved_nodes) + # if data was a duplicate (is_before is None) reuse same is_before + is_before = data.id if data.is_before else is_before data = NodeData(is_before=is_before) # harvest the line @@ -302,7 +274,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N # comment or blank line if data.id: # we are within the node definition - data.comments_stack.append((line_number, raw_line)) + comments.append(raw_line) else: # we are before the actual node data.preceding_lines.append(raw_line) @@ -349,6 +321,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N elif line[0] == "<": # parent definition data.parent_tags.append((self._normalize_entry_id(line[1:]), line_number + 1)) + self._add_comments(data, comments, "parent") elif self.is_entry_synonyms_line(line): # synonyms definition if not data.id: @@ -370,9 +343,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N tagsids_list.append(word_normalized) data.tags["tags_" + lang] = tags_list data.tags["tags_ids_" + lang] = tagsids_list - data = self._get_node_data_with_comments_above_key( - data, line_number, "tags_" + lang - ) + self._add_comments(data, comments, "tags_" + lang) else: # property definition property_name = None @@ -399,9 +370,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N data.properties[prop_key] = self.undo_normalize_text( property_value.strip() ) - data = self._get_node_data_with_comments_above_key( - data, line_number, prop_key - ) + self._add_comments(data, comments, prop_key) data.id = "__footer__" data.preceding_lines.pop(0) From cac7585ddd6db2f0e853633ad2edfe3742233c94 Mon Sep 17 00:00:00 2001 From: Alex Garel Date: Tue, 23 Jul 2024 23:44:39 +0200 Subject: [PATCH 6/7] fix: comment below parents in parser --- .../parser/taxonomy_parser.py | 6 +++++- parser/tests/integration/test_parser_integration.py | 13 +++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git 
a/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py b/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
index 6e7ce166..0cbad078 100644
--- a/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
+++ b/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
@@ -56,6 +56,10 @@ def get_node_type(self):
         else:
             return NodeType.ENTRY
 
+    @property
+    def is_started(self):
+        return self.id or self.parent_tags or self.tags or self.properties
+
 
 class PreviousLink(TypedDict):
     before_id: str
@@ -272,7 +276,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N
             # harvest the line
             if not (raw_line.strip()) or raw_line[0] == "#":
                 # comment or blank line
-                if data.id:
+                if data.is_started:
                     # we are within the node definition
                     comments.append(raw_line)
                 else:
diff --git a/parser/tests/integration/test_parser_integration.py b/parser/tests/integration/test_parser_integration.py
index c75687e9..ffc73191 100644
--- a/parser/tests/integration/test_parser_integration.py
+++ b/parser/tests/integration/test_parser_integration.py
@@ -411,3 +411,16 @@ def test_properties_confused_lang(neo4j, tmp_path):
     node = result.value()[0]
     # "web:en" was not confused with a language prefix "web:"
     assert "prop_web_en" in node.keys()
+
+
+def test_comment_below_parent(neo4j, tmp_path):
+    """Test that if a comment is below a parent, it is not added to preceding_lines"""
+    with neo4j.session() as session:
+        test_parser = parser.Parser(session)
+        fpath = str(pathlib.Path(__file__).parent.parent / "data" / "test_comment_below_parent.txt")
+        test_parser(fpath, None, "branch", "test")
+        query = "MATCH (n:p_test_branch) WHERE n.id = 'en:cow-milk' RETURN n"
+        result = session.run(query)
+        node = result.value()[0]
+        assert node["preceding_lines"] == ["# a comment above the parent"]
+        assert node["tags_en_comments"] == ["# a comment below the parent"]

From c9e7e9bc020768bb298734804153a6f4cca6b4f0 Mon Sep 17 00:00:00 2001
From: Alex Garel
Date: Wed, 24 Jul 2024 10:10:27 +0200
Subject: [PATCH 7/7] test: add file for test

---
 parser/tests/data/test_comment_below_parent.txt | 9 +++++++++
 1 file changed, 9 insertions(+)
 create mode 100644 parser/tests/data/test_comment_below_parent.txt

diff --git a/parser/tests/data/test_comment_below_parent.txt b/parser/tests/data/test_comment_below_parent.txt
new file mode 100644
index 00000000..aa972dc4
--- /dev/null
+++ b/parser/tests/data/test_comment_below_parent.txt
@@ -0,0 +1,9 @@
+# test a bug found with ingr taxonomy
+
+en:milk
+
+# a comment above the parent
+<en:milk
+# a comment below the parent
+en: cow milk
+
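
For reference, the comma protection introduced in PATCH 1/7 can be exercised on its own. The sketch below is a minimal, standalone copy of the regexes used by prepare_line and undo_normalize_text; the sample taxonomy line is invented for illustration, and the real methods live on TaxonomyParser:

import re

def prepare_line(line: str) -> str:
    # same sanitizations as TaxonomyParser.prepare_line in PATCH 1/7
    line = line.strip()
    line = re.sub(r"[\s,]+$", "", line)  # drop trailing spaces and commas
    # protect decimal commas ("1,5") and escaped commas ("\,") with a
    # lower comma character (U+201A) so they are not read as tag separators
    line = re.sub(r"(\d),(\d)", r"\1‚\2", line)
    line = re.sub(r"\\,", "\\‚", line)
    return line

def undo_normalize_text(text: str) -> str:
    # inverse transformation, as in TaxonomyParser.undo_normalize_text
    text = re.sub(r"(\d)‚(\d)", r"\1,\2", text)
    text = re.sub(r"\\‚", "\\,", text)
    return text

line = prepare_line("fr: boisson 1,5 litres, jus\\, nectar,  ")
assert line == "fr: boisson 1‚5 litres, jus\\‚ nectar"
# splitting on "," now only splits on real tag separators,
# and undoing the protection restores the original commas
tags = [undo_normalize_text(tag.strip()) for tag in line[3:].split(",")]
assert tags == ["boisson 1,5 litres", "jus\\, nectar"]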