fix: less transformations on comment lines and others #523

Merged: 8 commits, Jul 24, 2024
2 changes: 1 addition & 1 deletion parser/openfoodfacts_taxonomy_parser/parser/parser.py
@@ -143,9 +143,9 @@ def _create_child_links(self, child_links: list[ChildLink], project_label: str):
             MATCH (p:{project_label}) USING INDEX p:{project_label}(id)
             WHERE p.id = child_link.parent_id
             MATCH (c:{project_label}) USING INDEX c:{project_label}(id)
-            WHERE c.id = child_link.id
             """
             + """
+            WHERE c.id = child_link.id
             CREATE (c)-[relations:is_child_of {position: child_link.position}]->(p)
             WITH relations
             UNWIND relations AS relation
154 changes: 67 additions & 87 deletions parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
@@ -1,5 +1,4 @@
 import collections
-import copy
 import logging
 import re
 import sys
@@ -31,11 +30,6 @@ class NodeData:
     properties: dict[str, str] = field(default_factory=dict)
     tags: dict[str, list[str]] = field(default_factory=dict)
     comments: dict[str, list[str]] = field(default_factory=dict)
-    # comments_stack is a list of tuples (line_number, comment),
-    # to keep track of comments just above the current line
-    # during parsing of an entry, to be able to add them
-    # to the right property or tag when possible
-    comments_stack: list[tuple[int, str]] = field(default_factory=list)
     is_external: bool = False  # True if the node comes from another taxonomy
     original_taxonomy: str | None = None  # the name of the taxonomy the node comes from
 
@@ -62,6 +56,10 @@ def get_node_type(self):
         else:
             return NodeType.ENTRY
 
+    @property
+    def is_started(self):
+        return self.id or self.parent_tags or self.tags or self.properties
+
 
 class PreviousLink(TypedDict):
     before_id: str
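Note (illustrative, not part of the diff): the new is_started property broadens the old `data.id` test, so a node counts as "started" as soon as it has an id, a parent, a tag, or a property. A minimal sketch, assuming the other NodeData fields default to empty:

    node = NodeData(is_before="en:previous")
    bool(node.is_started)  # False: nothing harvested yet, comments still go to preceding_lines
    node.parent_tags.append(("en:milk", 12))
    bool(node.is_started)  # True: comment lines from here on belong to this entry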
@@ -96,16 +94,7 @@ def _file_iter(self, filename: str, start: int = 0) -> Iterator[tuple[int, str]]
             for line_number, line in enumerate(file):
                 if line_number < start:
                     continue
-                # sanitizing
-                # remove any space characters at end of line
-                line = line.rstrip()
-                # replace commas between digits
-                # and that have no space around by a lower comma character
-                # and do the same for escaped comma (preceded by a \)
-                # (to distinguish them from commas acting as tags separators)
-                line = re.sub(r"(\d),(\d)", r"\1‚\2", line)
-                line = re.sub(r"\\,", "\\‚", line)
-                yield line_number, line
+                yield line_number, line.rstrip("\n")
                 line_count += 1
         yield line_count, ""  # to end the last entry if not ended

@@ -114,13 +103,30 @@ def _normalize_entry_id(self, raw_id: str) -> str:
         Get a normalized string but keeping the language code "lc:",
         used for id and parent tag
         """
+        raw_id = raw_id.strip()
         lc, main_tag = raw_id.split(":", 1)
         normalized_main_tag = normalize_text(main_tag, lc, stopwords=self.stopwords)
         normalized_id = f"{lc}:{normalized_main_tag}"
         return normalized_id
 
+    def prepare_line(self, line: str) -> str:
+        """prepare line for parsing
+
+        This is different from `normalize_text` which is to compute ids
+        """
+        line = line.strip()
+        # sanitizing
+        # remove any space or commas characters at end of line
+        line = re.sub(r"[\s,]+$", "", line)
+        # replace commas between digits and that have no space around by a lower comma character
+        # and do the same for escaped comma (preceded by a \)
+        # (to distinguish them from commas acting as tags separators)
+        line = re.sub(r"(\d),(\d)", r"\1‚\2", line)
+        line = re.sub(r"\\,", "\\‚", line)
+        return line
+
     def undo_normalize_text(self, text: str) -> str:
-        """Undo some normalizations made in `_file_iter`"""
+        """Undo some normalizations made in `prepare_line`"""
         # restore commas from lower comma characters
         text = re.sub(r"(\d)‚(\d)", r"\1,\2", text)
         text = re.sub(r"\\‚", "\\,", text)
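Illustrative sketch (not part of the diff): sanitizing now happens per tag/property line in prepare_line instead of for every line in _file_iter, so comment lines are kept verbatim. Assuming a TaxonomyParser instance named parser, the expected round trip is roughly:

    parser.prepare_line("en: jam 1,5% sugar, jam\, fruit,  ")
    # -> "en: jam 1‚5% sugar, jam\‚ fruit" (trailing spaces/commas dropped, protected commas lowered)
    parser.undo_normalize_text("jam 1‚5% sugar")
    # -> "jam 1,5% sugar" (real commas restored before storing values)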
@@ -149,7 +155,7 @@ def _header_harvest(self, filename: str) -> tuple[list[str], int]:
         h = 0
         header: list[str] = []
         for _, line in self._file_iter(filename):
-            if not (line) or line[0] == "#":
+            if not (line.strip()) or line[0] == "#":
                 header.append(line)
             else:
                 break
@@ -158,7 +164,7 @@ def _header_harvest(self, filename: str) -> tuple[list[str], int]:
         # we don't want to eat the comments of the next block
         # and it removes the last separating line
         for i in range(len(header)):
-            if header.pop():
+            if header.pop().strip():
                 h -= 1
             else:
                 break
@@ -170,7 +176,7 @@ def _entry_end(self, line: str, data: NodeData) -> bool:
         if data.id.startswith("stopwords") or data.id.startswith("synonyms"):
             # stopwords and synonyms are one-liners; if the id is set, it's the end
             return True
-        if not line and data.id:
+        if not line.strip() and data.id:
             # entries are separated by a blank line
             return True
         return False
@@ -182,7 +188,7 @@ def _remove_separating_line(self, data: NodeData) -> NodeData:
         """
         is_before = data.is_before
         # first, check if there is at least one preceding line
-        if data.preceding_lines and not data.preceding_lines[0]:
+        if data.preceding_lines and not data.preceding_lines[0].strip():
             if data.id.startswith("synonyms"):
                 # it's a synonyms block,
                 # if the previous block is a stopwords block,
@@ -202,42 +208,13 @@ def _remove_separating_line(self, data: NodeData) -> NodeData:
                     data.preceding_lines.pop(0)
         return data
 
-    def _get_node_data_with_comments_above_key(
-        self, data: NodeData, line_number: int, key: str
-    ) -> NodeData:
+    def _add_comments(self, data: NodeData, comments: list[str], key: str) -> NodeData:
         """Returns the updated node data with comments above the given
         key stored in the {key}_comments property."""
-        new_data = copy.deepcopy(data)
-
-        # Get comments just above the given line
-        comments_above = []
-        current_line = line_number - 1
-        while new_data.comments_stack and new_data.comments_stack[-1][0] == current_line:
-            comments_above.append(new_data.comments_stack.pop()[1])
-            current_line -= 1
-        if comments_above:
-            new_data.comments[key + "_comments"] = comments_above[::-1]
-
-        return new_data
-
-    def _get_node_data_with_parent_and_end_comments(self, data: NodeData) -> NodeData:
-        """Returns the updated node data with parent and end comments"""
-        new_data = copy.deepcopy(data)
-
-        # Get parent comments (part of an entry block and just above/between the parents lines)
-        parent_comments = []
-        while new_data.preceding_lines and new_data.preceding_lines[-1] != "":
-            parent_comments.append(new_data.preceding_lines.pop())
-        if parent_comments:
-            new_data.comments["parent_comments"] = parent_comments[::-1]
-
-        # Get end comments (part of an entry block after the last tag/prop
-        # and before the separating blank line)
-        end_comments = [comment[1] for comment in new_data.comments_stack]
-        if end_comments:
-            new_data.comments["end_comments"] = end_comments
-
-        return new_data
+        if comments:
+            data.comments.setdefault(f"{key}_comments", []).extend(comments)
+            # reset the comments list
+            comments[:] = []
 
     _language_code_prefix = re.compile(
         r"[a-zA-Z][a-zA-Z][a-zA-Z]?([-_][a-zA-Z][a-zA-Z][a-zA-Z]?)?:"
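Illustrative sketch (not part of the diff): _add_comments flushes the shared comments buffer into the node under a per-key "{key}_comments" slot and empties the buffer in place, so callers can keep reusing one list. Assuming a parser instance and empty NodeData defaults:

    comments = ["# a comment above the parent"]
    data = NodeData(is_before="")
    parser._add_comments(data, comments, "parent")
    data.comments  # {'parent_comments': ['# a comment above the parent']}
    comments       # [] (reset in place by comments[:] = [])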
@@ -247,17 +224,35 @@ def is_entry_synonyms_line(self, line):
         matching_prefix = self._language_code_prefix.match(line)
         if matching_prefix:
             # verify it's not a property, that is a name followed by a colon and a language
             # we need no-qa this because of black vs flake8 opinion
             return not (
                 self._language_code_prefix.match(line[matching_prefix.end() :])  # noqa: E203
             )
         return False
 
+    def finalize_data(self, data, comments, saved_nodes):
+        data = self._remove_separating_line(data)
+        if data.get_node_type() == NodeType.ENTRY:
+            self._add_comments(data, comments, "end")
+            if data.id in saved_nodes:
+                # this duplicate node will be merged with the first one
+                data.is_before = None
+                msg = (
+                    f"WARNING: Entry with same id {data.id} already exists, "
+                    f"duplicate id in file at line {data.src_position}. "
+                    "The two nodes will be merged, keeping the last "
+                    "values in case of conflicts."
+                )
+                self.parser_logger.error(msg)
+            else:
+                saved_nodes.append(data.id)
+        return data
+
     def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
         """Transform data from file to dictionary"""
         saved_nodes = []
         index_stopwords = 0
         index_synonyms = 0
+        comments = []
 
         # Check if it is correctly written
         correctly_written = re.compile(r"\w+\Z")
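For context (illustration only, not part of the diff): is_entry_synonyms_line, shown unchanged above, is how _harvest_entries below tells tag lines from property lines. A line is a tags line when it starts with a 2-3 letter language code whose remainder is not itself language-prefixed:

    parser.is_entry_synonyms_line("fr: fruit de la passion")      # True: tags for "fr"
    parser.is_entry_synonyms_line("web:en: https://example.org")  # False: a (hypothetical) property name followed by a language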
@@ -268,39 +263,27 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
         line_number = (
             entries_start_line  # if the iterator is empty, line_number will not be unbound
         )
-        for line_number, line in self._file_iter(filename, entries_start_line):
+        for line_number, raw_line in self._file_iter(filename, entries_start_line):
             # yield data if block ended
-            if self._entry_end(line, data):
-                data = self._remove_separating_line(data)
-                if data.get_node_type() == NodeType.ENTRY:
-                    data = self._get_node_data_with_parent_and_end_comments(data)
-                    if data.id in saved_nodes:
-                        # this duplicate node will be merged with the first one
-                        data.is_before = None
-                        msg = (
-                            f"WARNING: Entry with same id {data.id} already exists, "
-                            f"duplicate id in file at line {data.src_position}. "
-                            "The two nodes will be merged, keeping the last "
-                            "values in case of conflicts."
-                        )
-                        self.parser_logger.error(msg)
-                    else:
-                        saved_nodes.append(data.id)
-                is_before = data.id
-                yield data  # another function will use this dictionary to create a node
+            if self._entry_end(raw_line, data):
+                is_before = data.is_before
+                # another function will use data to create a node
+                yield self.finalize_data(data, comments, saved_nodes)
+                # if data was a duplicate (is_before is None) reuse same is_before
+                is_before = data.id if data.is_before else is_before
                 data = NodeData(is_before=is_before)
 
             # harvest the line
-            if not (line) or line[0] == "#":
+            if not (raw_line.strip()) or raw_line[0] == "#":
                 # comment or blank line
-                if data.id:
+                if data.is_started:
                     # we are within the node definition
-                    data.comments_stack.append((line_number, line))
+                    comments.append(raw_line)
                 else:
                     # we are before the actual node
-                    data.preceding_lines.append(line)
+                    data.preceding_lines.append(raw_line)
             else:
-                line = line.rstrip(",")
+                line = self.prepare_line(raw_line)
                 if not data.src_position:
                     data.src_position = line_number + 1
                 if line.startswith("stopwords"):
@@ -342,6 +325,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
                 elif line[0] == "<":
                     # parent definition
                     data.parent_tags.append((self._normalize_entry_id(line[1:]), line_number + 1))
+                    self._add_comments(data, comments, "parent")
                 elif self.is_entry_synonyms_line(line):
                     # synonyms definition
                     if not data.id:
@@ -363,9 +347,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
                         tagsids_list.append(word_normalized)
                     data.tags["tags_" + lang] = tags_list
                     data.tags["tags_ids_" + lang] = tagsids_list
-                    data = self._get_node_data_with_comments_above_key(
-                        data, line_number, "tags_" + lang
-                    )
+                    self._add_comments(data, comments, "tags_" + lang)
                 else:
                     # property definition
                     property_name = None
@@ -384,17 +366,15 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
                         correctly_written.match(property_name) and correctly_written.match(lc)
                     ):
                         self.parser_logger.error(
-                            f"Reading error at line {line_number + 1},"
+                            f"Reading error at line {line_number + 1}, "
                             f"unexpected format: '{self.parser_logger.ellipsis(line)}'"
                         )
                     if property_name:
                         prop_key = "prop_" + property_name + "_" + lc
                         data.properties[prop_key] = self.undo_normalize_text(
                             property_value.strip()
                         )
-                        data = self._get_node_data_with_comments_above_key(
-                            data, line_number, prop_key
-                        )
+                        self._add_comments(data, comments, prop_key)
 
         data.id = "__footer__"
         data.preceding_lines.pop(0)
4 changes: 2 additions & 2 deletions parser/openfoodfacts_taxonomy_parser/unparser.py
@@ -59,7 +59,7 @@ def get_tags_line(self, node, lc):

     @staticmethod
     def property_sort_key(property):
-        name, lang_code, *_ = property.split("_", 2)
+        name, lang_code = property.rsplit("_", 1)
         # give priority to xx and en language codes
         priority = {"en": 1, "xx": 0}
         return (name, priority.get(lang_code, 100), lang_code)
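Illustration (not part of the diff) of why the sort key now uses rsplit: property names can contain underscores, so splitting from the left mistakes part of the name for the language code:

    "carbon_footprint_fr_foodges_value_fr".split("_", 2)
    # ['carbon', 'footprint', 'fr_foodges_value_fr'] -> sorted on "footprint" as if it were a language
    "carbon_footprint_fr_foodges_value_fr".rsplit("_", 1)
    # ['carbon_footprint_fr_foodges_value', 'fr'] -> full name plus the real language code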
@@ -88,7 +88,7 @@ def get_parents_lines(self, parents):
             parent = dict(parent)
             lc = parent["main_language"]
             parent_id = parent["tags_" + lc][0]
-            yield "<" + lc + ": " + parent_id
+            yield "< " + lc + ":" + parent_id
 
     def iter_lines(self, project_label):
         previous_block_id = ""
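Illustration (not part of the diff): the unparser now writes parents as "< en:milk" instead of "<en: milk". Both spellings normalize to the same id, since _harvest_entries strips the leading "<" and the new raw_id.strip() in _normalize_entry_id drops the surrounding whitespace. Assuming a parser instance:

    parser._normalize_entry_id(" en:milk")  # from "< en:milk" -> "en:milk"
    parser._normalize_entry_id("en: milk")  # from "<en: milk" -> "en:milk"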
16 changes: 8 additions & 8 deletions parser/tests/data/test.txt
@@ -6,33 +6,33 @@ synonyms:en: passion fruit, passionfruit

 synonyms:fr: fruit de la passion, maracuja, passion
 
-<en: milk
+< en:milk
 en: yogurts, yoghurts
 fr: yaourts, yoghourts, yogourts
 
-<en: yogurts
+< en:yogurts
 en: banana yogurts
 fr: yaourts à la banane
 
-<en: yogurts
+< en:yogurts
 en: Passion fruit yogurts
 fr: yaourts au fruit de la passion
 
-<fr: yaourts fruit de la passion
+< fr:yaourts fruit de la passion
 fr: yaourts au fruit de la passion allégés
 
-# meat
+# meat
 
 en: meat
 carbon_footprint_fr_foodges_value:fr: 10
 vegan:en: no
 
-<en: meat
+< en:meat
 en: fake-meat
 vegan:en: yes
 
 en: fake-stuff
 
-<en: fake-stuff
-<en: fake-meat
+< en:fake-stuff
+< en:fake-meat
 en: fake-duck-meat
9 changes: 9 additions & 0 deletions parser/tests/data/test_comment_below_parent.txt
@@ -0,0 +1,9 @@
+# test a bug found with ingr taxonomy
+
+en:milk
+
+# a comment above the parent
+<en: milk
+# a comment below the parent
+en: Cow milk
+fr: lait de vache
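With the reworked comment handling, the expectation for this file (an illustrative reading of the new flow, not output copied from the test suite) is roughly:

    node = ...  # the parsed "en: Cow milk" entry
    node.preceding_lines  # ['# a comment above the parent'] (comes before any parent line, so the node is not yet "started")
    node.comments         # {'tags_en_comments': ['# a comment below the parent']}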