From 1fe888e5240f71da376a3feb45738d43d2814404 Mon Sep 17 00:00:00 2001 From: Alex Garel Date: Tue, 23 Jul 2024 15:28:13 +0200 Subject: [PATCH 1/7] fix: less transformations on comment lines and others --- Makefile | 4 +- .../parser/parser.py | 39 ++++-- .../parser/taxonomy_parser.py | 121 +++++++++++------- .../openfoodfacts_taxonomy_parser/unparser.py | 12 +- parser/openfoodfacts_taxonomy_parser/utils.py | 11 +- parser/tests/data/test.txt | 16 +-- .../test_parse_unparse_integration.py | 32 +++-- .../integration/test_parser_integration.py | 84 ++++++++++-- parser/tests/unit/test_parser_unit.py | 1 - parser/tests/unit/test_unparser_unit.py | 38 ++++++ 10 files changed, 263 insertions(+), 95 deletions(-) create mode 100644 parser/tests/unit/test_unparser_unit.py diff --git a/Makefile b/Makefile index cf31b416..611b4fbd 100644 --- a/Makefile +++ b/Makefile @@ -127,8 +127,8 @@ lint: backend_lint frontend_lint config_lint ## Run all linters backend_lint: ## Run lint on backend code @echo "🍜 Linting python code" - ${DOCKER_COMPOSE} run --rm taxonomy_api isort . - ${DOCKER_COMPOSE} run --rm taxonomy_api black . + ${DOCKER_COMPOSE} run --rm taxonomy_api isort . /parser + ${DOCKER_COMPOSE} run --rm taxonomy_api black . /parser frontend_lint: ## Run lint on frontend code @echo "🍜 Linting react code" diff --git a/parser/openfoodfacts_taxonomy_parser/parser/parser.py b/parser/openfoodfacts_taxonomy_parser/parser/parser.py index e2f051a3..f9dca464 100644 --- a/parser/openfoodfacts_taxonomy_parser/parser/parser.py +++ b/parser/openfoodfacts_taxonomy_parser/parser/parser.py @@ -9,7 +9,14 @@ from ..utils import get_project_name, normalize_text from .logger import ParserConsoleLogger -from .taxonomy_parser import ChildLink, NodeData, NodeType, PreviousLink, Taxonomy, TaxonomyParser +from .taxonomy_parser import ( + ChildLink, + NodeData, + NodeType, + PreviousLink, + Taxonomy, + TaxonomyParser, +) class Parser: @@ -19,7 +26,9 @@ def __init__(self, session: Session): self.session = session self.parser_logger = ParserConsoleLogger() - def _create_other_node(self, tx: Transaction, node_data: NodeData, project_label: str): + def _create_other_node( + self, tx: Transaction, node_data: NodeData, project_label: str + ): """Create a TEXT, SYNONYMS or STOPWORDS node""" if node_data.get_node_type() == NodeType.TEXT: type_label = "TEXT" @@ -88,20 +97,26 @@ def _create_entry_nodes(self, entry_nodes: list[NodeData], project_label: str): original_taxonomy: entry_node.original_taxonomy """ - properties_query = ",\n".join([base_properties_query, *additional_properties_queries]) + properties_query = ",\n".join( + [base_properties_query, *additional_properties_queries] + ) query = f""" WITH $entry_nodes as entry_nodes UNWIND entry_nodes as entry_node CREATE (n:{project_label}:ENTRY {{ {properties_query} }}) """ - self.session.run(query, entry_nodes=[entry_node.to_dict() for entry_node in entry_nodes]) + self.session.run( + query, entry_nodes=[entry_node.to_dict() for entry_node in entry_nodes] + ) self.parser_logger.info( f"Created {len(entry_nodes)} ENTRY nodes in {timeit.default_timer() - start_time} seconds" ) - def _create_previous_links(self, previous_links: list[PreviousLink], project_label: str): + def _create_previous_links( + self, previous_links: list[PreviousLink], project_label: str + ): """Create the 'is_before' relations between nodes""" self.parser_logger.info("Creating 'is_before' links") start_time = timeit.default_timer() @@ -142,9 +157,9 @@ def _create_child_links(self, child_links: list[ChildLink], 
project_label: str): MATCH (p:{project_label}) USING INDEX p:{project_label}(id) WHERE p.id = child_link.parent_id MATCH (c:{project_label}) USING INDEX c:{project_label}(id) + WHERE c.id = child_link.id """ + """ - WHERE c.id = child_link.id CREATE (c)-[relations:is_child_of {position: child_link.position}]->(p) WITH relations UNWIND relations AS relation @@ -210,7 +225,9 @@ def _create_node_fulltext_index(self, project_label: str): ) self.session.run(query) - language_codes = [lang.alpha2 for lang in list(iso639.languages) if lang.alpha2 != ""] + language_codes = [ + lang.alpha2 for lang in list(iso639.languages) if lang.alpha2 != "" + ] tags_prefixed_lc = ["n.tags_ids_" + lc for lc in language_codes] tags_prefixed_lc = ", ".join(tags_prefixed_lc) query = f"""CREATE FULLTEXT INDEX {project_label+'_SearchTagsIds'} IF NOT EXISTS @@ -225,9 +242,13 @@ def _create_node_indexes(self, project_label: str): self._create_node_id_index(project_label) self._create_node_fulltext_index(project_label) - self.parser_logger.info(f"Created indexes in {timeit.default_timer() - start_time} seconds") + self.parser_logger.info( + f"Created indexes in {timeit.default_timer() - start_time} seconds" + ) - def _write_to_database(self, taxonomy: Taxonomy, taxonomy_name: str, branch_name: str): + def _write_to_database( + self, taxonomy: Taxonomy, taxonomy_name: str, branch_name: str + ): project_label = get_project_name(taxonomy_name, branch_name) # First create nodes, then create node indexes to accelerate relationship creation, then create relationships self._create_other_nodes(taxonomy.other_nodes, project_label) diff --git a/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py b/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py index 9eb4fa78..84eeb11f 100644 --- a/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py +++ b/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py @@ -96,15 +96,7 @@ def _file_iter(self, filename: str, start: int = 0) -> Iterator[tuple[int, str]] for line_number, line in enumerate(file): if line_number < start: continue - # sanitizing - # remove any space characters at end of line - line = line.rstrip() - # replace commas between digits and that have no space around by a lower comma character - # and do the same for escaped comma (preceded by a \) - # (to distinguish them from commas acting as tags separators) - line = re.sub(r"(\d),(\d)", r"\1‚\2", line) - line = re.sub(r"\\,", "\\‚", line) - yield line_number, line + yield line_number, line.rstrip("\n") line_count += 1 yield line_count, "" # to end the last entry if not ended @@ -113,13 +105,30 @@ def _normalize_entry_id(self, raw_id: str) -> str: Get a normalized string but keeping the language code "lc:", used for id and parent tag """ + raw_id = raw_id.strip() lc, main_tag = raw_id.split(":", 1) normalized_main_tag = normalize_text(main_tag, lc, stopwords=self.stopwords) normalized_id = f"{lc}:{normalized_main_tag}" return normalized_id + def prepare_line(self, line: str) -> str: + """prepare line for parsing + + This is different from `normalize_text` which is to compute ids + """ + line = line.strip() + # sanitizing + # remove any space or commas characters at end of line + line = re.sub(r"[\s,]+$", "", line) + # replace commas between digits and that have no space around by a lower comma character + # and do the same for escaped comma (preceded by a \) + # (to distinguish them from commas acting as tags separators) + line = re.sub(r"(\d),(\d)", r"\1‚\2", line) + line = re.sub(r"\\,", 
"\\‚", line) + return line + def undo_normalize_text(self, text: str) -> str: - """Undo some normalizations made in `_file_iter`""" + """Undo some normalizations made in `prepare_line`""" # restore commas from lower comma characters text = re.sub(r"(\d)‚(\d)", r"\1,\2", text) text = re.sub(r"\\‚", "\\,", text) @@ -148,7 +157,7 @@ def _header_harvest(self, filename: str) -> tuple[list[str], int]: h = 0 header: list[str] = [] for _, line in self._file_iter(filename): - if not (line) or line[0] == "#": + if not (line.strip()) or line[0] == "#": header.append(line) else: break @@ -157,7 +166,7 @@ def _header_harvest(self, filename: str) -> tuple[list[str], int]: # we don't want to eat the comments of the next block # and it removes the last separating line for i in range(len(header)): - if header.pop(): + if header.pop().strip(): h -= 1 else: break @@ -169,7 +178,7 @@ def _entry_end(self, line: str, data: NodeData) -> bool: if data.id.startswith("stopwords") or data.id.startswith("synonyms"): # stopwords and synonyms are one-liners; if the id is set, it's the end return True - if not line and data.id: + if not line.strip() and data.id: # entries are separated by a blank line return True return False @@ -181,7 +190,7 @@ def _remove_separating_line(self, data: NodeData) -> NodeData: """ is_before = data.is_before # first, check if there is at least one preceding line - if data.preceding_lines and not data.preceding_lines[0]: + if data.preceding_lines and not data.preceding_lines[0].strip(): if data.id.startswith("synonyms"): # it's a synonyms block, # if the previous block is a stopwords block, @@ -211,7 +220,9 @@ def _get_node_data_with_comments_above_key( # Get comments just above the given line comments_above = [] current_line = line_number - 1 - while new_data.comments_stack and new_data.comments_stack[-1][0] == current_line: + while ( + new_data.comments_stack and new_data.comments_stack[-1][0] == current_line + ): comments_above.append(new_data.comments_stack.pop()[1]) current_line -= 1 if comments_above: @@ -246,12 +257,12 @@ def is_entry_synonyms_line(self, line): matching_prefix = self._language_code_prefix.match(line) if matching_prefix: # verify it's not a property, that is a name followed by a colon and a language - return not ( - self._language_code_prefix.match(line[matching_prefix.end():]) - ) + return not (self._language_code_prefix.match(line[matching_prefix.end() :])) return False - def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]: + def _harvest_entries( + self, filename: str, entries_start_line: int + ) -> Iterator[NodeData]: """Transform data from file to dictionary""" saved_nodes = [] index_stopwords = 0 @@ -263,12 +274,10 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N self.stopwords = {} # the first entry is after __header__ which was created before data = NodeData(is_before="__header__") - line_number = ( - entries_start_line # if the iterator is empty, line_number will not be unbound - ) - for line_number, line in self._file_iter(filename, entries_start_line): + line_number = entries_start_line # if the iterator is empty, line_number will not be unbound + for line_number, raw_line in self._file_iter(filename, entries_start_line): # yield data if block ended - if self._entry_end(line, data): + if self._entry_end(raw_line, data): data = self._remove_separating_line(data) if data.get_node_type() == NodeType.ENTRY: data = self._get_node_data_with_parent_and_end_comments(data) @@ -289,16 +298,16 @@ 
def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N data = NodeData(is_before=is_before) # harvest the line - if not (line) or line[0] == "#": + if not (raw_line.strip()) or raw_line[0] == "#": # comment or blank line if data.id: # we are within the node definition - data.comments_stack.append((line_number, line)) + data.comments_stack.append((line_number, raw_line)) else: # we are before the actual node - data.preceding_lines.append(line) + data.preceding_lines.append(raw_line) else: - line = line.rstrip(",") + line = self.prepare_line(raw_line) if not data.src_position: data.src_position = line_number + 1 if line.startswith("stopwords"): @@ -309,7 +318,9 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N # remove "stopwords:" part line = line[10:] try: - lc, tags, tags_ids = self._get_lc_value(line, remove_stopwords=False) + lc, tags, tags_ids = self._get_lc_value( + line, remove_stopwords=False + ) except ValueError: self.parser_logger.error( f"Missing language code at line {line_number + 1} ? '{self.parser_logger.ellipsis(line)}'" @@ -337,7 +348,9 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N data.tags["tags_ids_" + lc] = tags_ids elif line[0] == "<": # parent definition - data.parent_tags.append((self._normalize_entry_id(line[1:]), line_number + 1)) + data.parent_tags.append( + (self._normalize_entry_id(line[1:]), line_number + 1) + ) elif self.is_entry_synonyms_line(line): # synonyms definition if not data.id: @@ -353,7 +366,9 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N tagsids_list = [] for word in line.split(","): tags_list.append(self.undo_normalize_text(word.strip())) - word_normalized = normalize_text(word, lang, stopwords=self.stopwords) + word_normalized = normalize_text( + word, lang, stopwords=self.stopwords + ) if word_normalized not in tagsids_list: # in case 2 normalized synonyms are the same tagsids_list.append(word_normalized) @@ -376,14 +391,17 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N property_name = property_name.strip() lc = lc.strip().replace("-", "_") if not ( - correctly_written.match(property_name) and correctly_written.match(lc) + correctly_written.match(property_name) + and correctly_written.match(lc) ): self.parser_logger.error( f"Reading error at line {line_number + 1}, unexpected format: '{self.parser_logger.ellipsis(line)}'" ) if property_name: prop_key = "prop_" + property_name + "_" + lc - data.properties[prop_key] = self.undo_normalize_text(property_value.strip()) + data.properties[prop_key] = self.undo_normalize_text( + property_value.strip() + ) data = self._get_node_data_with_comments_above_key( data, line_number, prop_key ) @@ -409,7 +427,9 @@ def _normalise_and_validate_child_links( # we collect all the tags_ids in a certain language tags_ids = {} for node in entry_nodes: - node_tags_ids = {tag_id: node.id for tag_id in node.tags.get(f"tags_ids_{lc}", [])} + node_tags_ids = { + tag_id: node.id for tag_id in node.tags.get(f"tags_ids_{lc}", []) + } tags_ids.update(node_tags_ids) # we check if the parent_id exists in the tags_ids @@ -418,7 +438,9 @@ def _normalise_and_validate_child_links( if parent_id not in tags_ids: missing_child_links.append(child_link) else: - child_link["parent_id"] = tags_ids[parent_id] # normalise the parent_id + child_link["parent_id"] = tags_ids[ + parent_id + ] # normalise the parent_id normalised_child_links.append(child_link) return 
normalised_child_links, missing_child_links @@ -444,8 +466,10 @@ def _get_valid_child_links( ] # Normalise and validate the unnormalised links - normalised_child_links, missing_child_links = self._normalise_and_validate_child_links( - entry_nodes, child_links_to_normalise + normalised_child_links, missing_child_links = ( + self._normalise_and_validate_child_links( + entry_nodes, child_links_to_normalise + ) ) valid_child_links.extend(normalised_child_links) @@ -459,7 +483,9 @@ def _get_valid_child_links( return valid_child_links - def _remove_duplicate_child_links(self, child_links: list[ChildLink]) -> list[ChildLink]: + def _remove_duplicate_child_links( + self, child_links: list[ChildLink] + ) -> list[ChildLink]: """Remove duplicate child links (i.e child links with the same parent_id and id)""" unique_child_links = [] children_to_parents = collections.defaultdict(set) @@ -470,10 +496,13 @@ def _remove_duplicate_child_links(self, child_links: list[ChildLink]) -> list[Ch unique_child_links.append(child_link) return unique_child_links - def _merge_duplicate_entry_nodes(self, entry_nodes: list[NodeData]) -> list[NodeData]: + def _merge_duplicate_entry_nodes( + self, entry_nodes: list[NodeData] + ) -> list[NodeData]: """Merge entry nodes with the same id: - merge their tags (union) - - merge their properties (union, and in case of conflict, keep the last value)""" + - merge their properties (union, and in case of conflict, keep the last value) + """ unique_entry_nodes = [] ids_to_nodes = dict() for node in entry_nodes: @@ -539,7 +568,9 @@ def _create_taxonomy( entry_nodes: list[NodeData] = [] entry_nodes.extend(external_entry_nodes) other_nodes = [ - NodeData(id="__header__", preceding_lines=harvested_header_data, src_position=1) + NodeData( + id="__header__", preceding_lines=harvested_header_data, src_position=1 + ) ] previous_links: list[PreviousLink] = [] raw_child_links: list[ChildLink] = [] @@ -551,7 +582,9 @@ def _create_taxonomy( else: other_nodes.append(entry) if entry.is_before: - previous_links.append(PreviousLink(before_id=entry.is_before, id=entry.id)) + previous_links.append( + PreviousLink(before_id=entry.is_before, id=entry.id) + ) if entry.parent_tags: for position, (parent, line_position) in enumerate(entry.parent_tags): raw_child_links.append( @@ -584,7 +617,9 @@ def parse_file( start_time = timeit.default_timer() filename = normalize_filename(filename) taxonomy = self._create_taxonomy(filename, external_filenames) - self.parser_logger.info(f"Parsing done in {timeit.default_timer() - start_time} seconds.") + self.parser_logger.info( + f"Parsing done in {timeit.default_timer() - start_time} seconds." 
+ ) self.parser_logger.info( f"Found {len(taxonomy.entry_nodes) + len(taxonomy.other_nodes)} nodes" ) diff --git a/parser/openfoodfacts_taxonomy_parser/unparser.py b/parser/openfoodfacts_taxonomy_parser/unparser.py index 158c2578..4a4770aa 100644 --- a/parser/openfoodfacts_taxonomy_parser/unparser.py +++ b/parser/openfoodfacts_taxonomy_parser/unparser.py @@ -67,7 +67,11 @@ def list_property_and_lc(self, node): """return an ordered list of properties with their language code (lc)""" # there is no rule for the order of properties # properties will be arranged in alphabetical order - values = [property[5:] for property in node if property.startswith("prop_") and not property.endswith("_comments")] + values = [ + property[5:] + for property in node + if property.startswith("prop_") and not property.endswith("_comments") + ] # note: using the fact that we are sure to find language code after the first underscore return sorted(values, key=self.property_sort_key) @@ -83,7 +87,7 @@ def get_parents_lines(self, parents): parent = dict(parent) lc = parent["main_language"] parent_id = parent["tags_" + lc][0] - yield "<" + lc + ": " + parent_id + yield "< " + lc + ":" + parent_id def iter_lines(self, project_label): previous_block_id = "" @@ -91,9 +95,9 @@ def iter_lines(self, project_label): node = dict(node) has_content = node["id"] not in ["__header__", "__footer__"] # eventually add a blank line but in specific case - following_synonyms = node["id"].startswith("synonyms") and previous_block_id.startswith( + following_synonyms = node["id"].startswith( "synonyms" - ) + ) and previous_block_id.startswith("synonyms") following_stopwords = node["id"].startswith( "stopwords" ) and previous_block_id.startswith("stopwords") diff --git a/parser/openfoodfacts_taxonomy_parser/utils.py b/parser/openfoodfacts_taxonomy_parser/utils.py index 36225921..fa8a61d6 100644 --- a/parser/openfoodfacts_taxonomy_parser/utils.py +++ b/parser/openfoodfacts_taxonomy_parser/utils.py @@ -4,7 +4,12 @@ import unidecode -def normalize_text(line: str, lang: str = "default", char: str = "-", stopwords: dict[str, list[str]] | None = None) -> str: +def normalize_text( + line: str, + lang: str = "default", + char: str = "-", + stopwords: dict[str, list[str]] | None = None, +) -> str: """Normalize a string depending on the language code""" if stopwords is None: stopwords = {} @@ -43,7 +48,9 @@ def normalize_text(line: str, lang: str = "default", char: str = "-", stopwords: stopwords = stopwords[lang] line_surrounded_by_char = char + line + char for stopword in stopwords: - line_surrounded_by_char = line_surrounded_by_char.replace(char + stopword + char, char) + line_surrounded_by_char = line_surrounded_by_char.replace( + char + stopword + char, char + ) line = line_surrounded_by_char[1:-1] return line diff --git a/parser/tests/data/test.txt b/parser/tests/data/test.txt index 92f38f86..c501a4b5 100644 --- a/parser/tests/data/test.txt +++ b/parser/tests/data/test.txt @@ -6,33 +6,33 @@ synonyms:en: passion fruit, passionfruit synonyms:fr: fruit de la passion, maracuja, passion - Date: Tue, 23 Jul 2024 17:32:50 +0200 Subject: [PATCH 2/7] fix: fix properties orders in unparse --- .../openfoodfacts_taxonomy_parser/unparser.py | 2 +- parser/tests/unit/test_unparser_unit.py | 22 +++++++++++++------ 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/parser/openfoodfacts_taxonomy_parser/unparser.py b/parser/openfoodfacts_taxonomy_parser/unparser.py index 4a4770aa..994407e0 100644 --- a/parser/openfoodfacts_taxonomy_parser/unparser.py 
+++ b/parser/openfoodfacts_taxonomy_parser/unparser.py @@ -58,7 +58,7 @@ def get_tags_line(self, node, lc): @staticmethod def property_sort_key(property): - name, lang_code, *_ = property.split("_", 2) + name, lang_code = property.rsplit("_", 1) # give priority to xx and en language codes priority = {"en": 1, "xx": 0} return (name, priority.get(lang_code, 100), lang_code) diff --git a/parser/tests/unit/test_unparser_unit.py b/parser/tests/unit/test_unparser_unit.py index 9764dd0d..249abbdf 100644 --- a/parser/tests/unit/test_unparser_unit.py +++ b/parser/tests/unit/test_unparser_unit.py @@ -12,16 +12,18 @@ """ vegan:en: yes ciqual_food_name:fr: Lécithine de soja + ciqual_food_name:ar: الكيكلية البيضية ciqual_food_code:en: 42200 vegetarian:en: yes ciqual_food_name:en: Soy lecithin """, [ - "ciqual_food_code:en", - "ciqual_food_name:en", - "ciqual_food_name:fr", - "vegan:en", - "vegetarian:en", + "ciqual_food_code_en", + "ciqual_food_name_en", + "ciqual_food_name_ar", + "ciqual_food_name_fr", + "vegan_en", + "vegetarian_en", ], ), ], @@ -33,6 +35,12 @@ def test_list_property_and_lc(properties_txt, expected): ] name_lc_values = [prop.split(":", 2) for prop in properties] node = {f"prop_{name}_{lc}": value for name, lc, value in name_lc_values} - # node = {f"prop_{name}_{lc}": value for prop in properties for name, lc, value in prop.split(":", 2)} result = unparser.WriteTaxonomy(None).list_property_and_lc(node) - assert result, expected + assert result == expected + +def test_list_property_and_lc_with_comment(): + node = {"prop_test_ar": "a", "prop_test_ar_comments": "# a", "prop_test_en": "b", "prop_test_comments_en": "legit"} + result = unparser.WriteTaxonomy(None).list_property_and_lc(node) + assert result == ["test_en", "test_ar", "test_comments_en"] + + From b7d5a0f2e85b1faa2c0c563e2987e9a14c970061 Mon Sep 17 00:00:00 2001 From: Alex Garel Date: Tue, 23 Jul 2024 17:47:06 +0200 Subject: [PATCH 3/7] chore: flake8 --- parser/tests/unit/test_unparser_unit.py | 1 - 1 file changed, 1 deletion(-) diff --git a/parser/tests/unit/test_unparser_unit.py b/parser/tests/unit/test_unparser_unit.py index bffa685e..a904705b 100644 --- a/parser/tests/unit/test_unparser_unit.py +++ b/parser/tests/unit/test_unparser_unit.py @@ -1,5 +1,4 @@ import inspect -import pathlib import pytest From d8d62fe49e625b4b4dbe7407a46bcabbf9585007 Mon Sep 17 00:00:00 2001 From: Alex Garel Date: Tue, 23 Jul 2024 17:52:39 +0200 Subject: [PATCH 4/7] chore: reformat comment --- parser/openfoodfacts_taxonomy_parser/parser/parser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/parser/openfoodfacts_taxonomy_parser/parser/parser.py b/parser/openfoodfacts_taxonomy_parser/parser/parser.py index cdeacca0..633ff92a 100644 --- a/parser/openfoodfacts_taxonomy_parser/parser/parser.py +++ b/parser/openfoodfacts_taxonomy_parser/parser/parser.py @@ -230,7 +230,8 @@ def _create_node_indexes(self, project_label: str): def _write_to_database(self, taxonomy: Taxonomy, taxonomy_name: str, branch_name: str): project_label = get_project_name(taxonomy_name, branch_name) - # First create nodes, then create node indexes to accelerate relationship creation, + # First create nodes, + # then create node indexes to accelerate relationship creation, # then create relationships self._create_other_nodes(taxonomy.other_nodes, project_label) self._create_entry_nodes(taxonomy.entry_nodes, project_label) From f08082016403ab7710c2d63f953d730fdac56d9a Mon Sep 17 00:00:00 2001 From: Alex Garel Date: Tue, 23 Jul 2024 23:11:06 +0200 Subject: 
[PATCH 5/7] refactor: simplify comment handling in parser

---
 .../parser/taxonomy_parser.py | 97 +++++++------------
 1 file changed, 33 insertions(+), 64 deletions(-)

diff --git a/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py b/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
index 81b8e824..6e7ce166 100644
--- a/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
+++ b/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
@@ -1,5 +1,4 @@
 import collections
-import copy
 import logging
 import re
 import sys
@@ -31,11 +30,6 @@ class NodeData:
     properties: dict[str, str] = field(default_factory=dict)
     tags: dict[str, list[str]] = field(default_factory=dict)
     comments: dict[str, list[str]] = field(default_factory=dict)
-    # comments_stack is a list of tuples (line_number, comment),
-    # to keep track of comments just above the current line
-    # during parsing of an entry, to be able to add them
-    # to the right property or tag when possible
-    comments_stack: list[tuple[int, str]] = field(default_factory=list)
     is_external: bool = False  # True if the node comes from another taxonomy
     original_taxonomy: str | None = None  # the name of the taxonomy the node comes from
@@ -210,42 +204,13 @@ def _remove_separating_line(self, data: NodeData) -> NodeData:
             data.preceding_lines.pop(0)
         return data
 
-    def _get_node_data_with_comments_above_key(
-        self, data: NodeData, line_number: int, key: str
-    ) -> NodeData:
+    def _add_comments(self, data: NodeData, comments: list[str], key: str) -> None:
         """Returns the updated node data with comments above the given
         key stored in the {key}_comments property."""
-        new_data = copy.deepcopy(data)
-
-        # Get comments just above the given line
-        comments_above = []
-        current_line = line_number - 1
-        while new_data.comments_stack and new_data.comments_stack[-1][0] == current_line:
-            comments_above.append(new_data.comments_stack.pop()[1])
-            current_line -= 1
-        if comments_above:
-            new_data.comments[key + "_comments"] = comments_above[::-1]
-
-        return new_data
-
-    def _get_node_data_with_parent_and_end_comments(self, data: NodeData) -> NodeData:
-        """Returns the updated node data with parent and end comments"""
-        new_data = copy.deepcopy(data)
-
-        # Get parent comments (part of an entry block and just above/between the parents lines)
-        parent_comments = []
-        while new_data.preceding_lines and new_data.preceding_lines[-1] != "":
-            parent_comments.append(new_data.preceding_lines.pop())
-        if parent_comments:
-            new_data.comments["parent_comments"] = parent_comments[::-1]
-
-        # Get end comments (part of an entry block after the last tag/prop
-        # and before the separating blank line)
-        end_comments = [comment[1] for comment in new_data.comments_stack]
-        if end_comments:
-            new_data.comments["end_comments"] = end_comments
-
-        return new_data
+        if comments:
+            data.comments.setdefault(f"{key}_comments", []).extend(comments)
+            # reset the comments list
+            comments[:] = []
 
     _language_code_prefix = re.compile(
         r"[a-zA-Z][a-zA-Z][a-zA-Z]?([-_][a-zA-Z][a-zA-Z][a-zA-Z]?)?:"
     )
@@ -260,11 +225,30 @@ def is_entry_synonyms_line(self, line):
         )
         return False
 
+    def finalize_data(self, data, comments, saved_nodes):
+        data = self._remove_separating_line(data)
+        if data.get_node_type() == NodeType.ENTRY:
+            self._add_comments(data, comments, "end")
+        if data.id in saved_nodes:
+            # this duplicate node will be merged with the first one
+            data.is_before = None
+            msg = (
+                f"WARNING: Entry with same id {data.id} already exists, "
+                f"duplicate id in file at line
{data.src_position}. " + "The two nodes will be merged, keeping the last " + "values in case of conflicts." + ) + self.parser_logger.error(msg) + else: + saved_nodes.append(data.id) + return data + def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]: """Transform data from file to dictionary""" saved_nodes = [] index_stopwords = 0 index_synonyms = 0 + comments = [] # Check if it is correctly written correctly_written = re.compile(r"\w+\Z") @@ -278,23 +262,11 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N for line_number, raw_line in self._file_iter(filename, entries_start_line): # yield data if block ended if self._entry_end(raw_line, data): - data = self._remove_separating_line(data) - if data.get_node_type() == NodeType.ENTRY: - data = self._get_node_data_with_parent_and_end_comments(data) - if data.id in saved_nodes: - # this duplicate node will be merged with the first one - data.is_before = None - msg = ( - f"WARNING: Entry with same id {data.id} already exists, " - f"duplicate id in file at line {data.src_position}. " - "The two nodes will be merged, keeping the last " - "values in case of conflicts." - ) - self.parser_logger.error(msg) - else: - saved_nodes.append(data.id) - is_before = data.id - yield data # another function will use this dictionary to create a node + is_before = data.is_before + # another function will use data to create a node + yield self.finalize_data(data, comments, saved_nodes) + # if data was a duplicate (is_before is None) reuse same is_before + is_before = data.id if data.is_before else is_before data = NodeData(is_before=is_before) # harvest the line @@ -302,7 +274,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N # comment or blank line if data.id: # we are within the node definition - data.comments_stack.append((line_number, raw_line)) + comments.append(raw_line) else: # we are before the actual node data.preceding_lines.append(raw_line) @@ -349,6 +321,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N elif line[0] == "<": # parent definition data.parent_tags.append((self._normalize_entry_id(line[1:]), line_number + 1)) + self._add_comments(data, comments, "parent") elif self.is_entry_synonyms_line(line): # synonyms definition if not data.id: @@ -370,9 +343,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N tagsids_list.append(word_normalized) data.tags["tags_" + lang] = tags_list data.tags["tags_ids_" + lang] = tagsids_list - data = self._get_node_data_with_comments_above_key( - data, line_number, "tags_" + lang - ) + self._add_comments(data, comments, "tags_" + lang) else: # property definition property_name = None @@ -399,9 +370,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N data.properties[prop_key] = self.undo_normalize_text( property_value.strip() ) - data = self._get_node_data_with_comments_above_key( - data, line_number, prop_key - ) + self._add_comments(data, comments, prop_key) data.id = "__footer__" data.preceding_lines.pop(0) From cac7585ddd6db2f0e853633ad2edfe3742233c94 Mon Sep 17 00:00:00 2001 From: Alex Garel Date: Tue, 23 Jul 2024 23:44:39 +0200 Subject: [PATCH 6/7] fix: comment below parents in parser --- .../parser/taxonomy_parser.py | 6 +++++- parser/tests/integration/test_parser_integration.py | 13 +++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git 
a/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py b/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
index 6e7ce166..0cbad078 100644
--- a/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
+++ b/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
@@ -56,6 +56,10 @@ def get_node_type(self):
         else:
             return NodeType.ENTRY
 
+    @property
+    def is_started(self):
+        return self.id or self.parent_tags or self.tags or self.properties
+
 
 class PreviousLink(TypedDict):
     before_id: str
@@ -272,7 +276,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[N
             # harvest the line
             if not (raw_line.strip()) or raw_line[0] == "#":
                 # comment or blank line
-                if data.id:
+                if data.is_started:
                     # we are within the node definition
                     comments.append(raw_line)
                 else:
diff --git a/parser/tests/integration/test_parser_integration.py b/parser/tests/integration/test_parser_integration.py
index c75687e9..ffc73191 100644
--- a/parser/tests/integration/test_parser_integration.py
+++ b/parser/tests/integration/test_parser_integration.py
@@ -411,3 +411,16 @@ def test_properties_confused_lang(neo4j, tmp_path):
     node = result.value()[0]
     # "web:en" was not confused with a language prefix "web:"
     assert "prop_web_en" in node.keys()
+
+
+def test_comment_below_parent(neo4j, tmp_path):
+    """Test that if a comment is below a parent, it is not added to preceding_lines"""
+    with neo4j.session() as session:
+        test_parser = parser.Parser(session)
+        fpath = str(pathlib.Path(__file__).parent.parent / "data" / "test_comment_below_parent.txt")
+        test_parser(fpath, None, "branch", "test")
+        query = "MATCH (n:p_test_branch) WHERE n.id = 'en:cow-milk' RETURN n"
+        result = session.run(query)
+        node = result.value()[0]
+        assert node["preceding_lines"] == ["# a comment above the parent"]
+        assert node["tags_en_comments"] == ["# a comment below the parent"]

From c9e7e9bc020768bb298734804153a6f4cca6b4f0 Mon Sep 17 00:00:00 2001
From: Alex Garel
Date: Wed, 24 Jul 2024 10:10:27 +0200
Subject: [PATCH 7/7] test: add file for test

---
 parser/tests/data/test_comment_below_parent.txt | 9 +++++++++
 1 file changed, 9 insertions(+)
 create mode 100644 parser/tests/data/test_comment_below_parent.txt

diff --git a/parser/tests/data/test_comment_below_parent.txt b/parser/tests/data/test_comment_below_parent.txt
new file mode 100644
index 00000000..aa972dc4
--- /dev/null
+++ b/parser/tests/data/test_comment_below_parent.txt
@@ -0,0 +1,9 @@
+# test a bug found with ingr taxonomy
+
+en:milk
+
+# a comment above the parent
+<en:milk
+# a comment below the parent
+en: cow milk
+
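
For reference, the comma protection introduced in PATCH 1/7 can be exercised on its own. The sketch below is a minimal, standalone copy of the regexes used by prepare_line and undo_normalize_text; the sample taxonomy line is invented for illustration, and the real methods live on TaxonomyParser:

import re

def prepare_line(line: str) -> str:
    # same sanitizations as TaxonomyParser.prepare_line in PATCH 1/7
    line = line.strip()
    line = re.sub(r"[\s,]+$", "", line)  # drop trailing spaces and commas
    # protect decimal commas ("1,5") and escaped commas ("\,") with a
    # lower comma character (U+201A) so they are not read as tag separators
    line = re.sub(r"(\d),(\d)", r"\1‚\2", line)
    line = re.sub(r"\\,", "\\‚", line)
    return line

def undo_normalize_text(text: str) -> str:
    # inverse transformation, as in TaxonomyParser.undo_normalize_text
    text = re.sub(r"(\d)‚(\d)", r"\1,\2", text)
    text = re.sub(r"\\‚", "\\,", text)
    return text

line = prepare_line("fr: boisson 1,5 litres, jus\\, nectar,  ")
assert line == "fr: boisson 1‚5 litres, jus\\‚ nectar"
# splitting on "," now only splits on real tag separators,
# and undoing the protection restores the original commas
tags = [undo_normalize_text(tag.strip()) for tag in line[3:].split(",")]
assert tags == ["boisson 1,5 litres", "jus\\, nectar"]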