fix: less transformations on comment lines and others #523

Merged: 8 commits, Jul 24, 2024
2 changes: 1 addition & 1 deletion parser/openfoodfacts_taxonomy_parser/parser/parser.py
@@ -143,9 +143,9 @@ def _create_child_links(self, child_links: list[ChildLink], project_label: str):
             MATCH (p:{project_label}) USING INDEX p:{project_label}(id)
             WHERE p.id = child_link.parent_id
             MATCH (c:{project_label}) USING INDEX c:{project_label}(id)
-            WHERE c.id = child_link.id
             """
             + """
+            WHERE c.id = child_link.id
             CREATE (c)-[relations:is_child_of {position: child_link.position}]->(p)
             WITH relations
             UNWIND relations AS relation
154 changes: 67 additions & 87 deletions parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
@@ -1,5 +1,4 @@
 import collections
-import copy
 import logging
 import re
 import sys
@@ -31,11 +30,6 @@ class NodeData:
     properties: dict[str, str] = field(default_factory=dict)
     tags: dict[str, list[str]] = field(default_factory=dict)
     comments: dict[str, list[str]] = field(default_factory=dict)
-    # comments_stack is a list of tuples (line_number, comment),
-    # to keep track of comments just above the current line
-    # during parsing of an entry, to be able to add them
-    # to the right property or tag when possible
-    comments_stack: list[tuple[int, str]] = field(default_factory=list)
     is_external: bool = False  # True if the node comes from another taxonomy
     original_taxonomy: str | None = None  # the name of the taxonomy the node comes from
 
@@ -62,6 +56,10 @@ def get_node_type(self):
         else:
             return NodeType.ENTRY
 
+    @property
+    def is_started(self):
+        return self.id or self.parent_tags or self.tags or self.properties
+
 
 class PreviousLink(TypedDict):
     before_id: str
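Note (illustrative, not part of the diff): the new is_started property broadens the old `data.id` test, so a node counts as "started" as soon as it has an id, a parent, a tag, or a property. A minimal sketch, assuming the other NodeData fields default to empty:

    node = NodeData(is_before="en:previous")
    bool(node.is_started)  # False: nothing harvested yet, comments still go to preceding_lines
    node.parent_tags.append(("en:milk", 12))
    bool(node.is_started)  # True: comment lines from here on belong to this entry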
@@ -96,16 +94,7 @@ def _file_iter(self, filename: str, start: int = 0) -> Iterator[tuple[int, str]]
             for line_number, line in enumerate(file):
                 if line_number < start:
                     continue
-                # sanitizing
-                # remove any space characters at end of line
-                line = line.rstrip()
-                # replace commas between digits
-                # and that have no space around by a lower comma character
-                # and do the same for escaped comma (preceded by a \)
-                # (to distinguish them from commas acting as tags separators)
-                line = re.sub(r"(\d),(\d)", r"\1‚\2", line)
-                line = re.sub(r"\\,", "\\‚", line)
-                yield line_number, line
+                yield line_number, line.rstrip("\n")
                 line_count += 1
         yield line_count, ""  # to end the last entry if not ended

@@ -114,13 +103,30 @@ def _normalize_entry_id(self, raw_id: str) -> str:
         Get a normalized string but keeping the language code "lc:",
         used for id and parent tag
         """
+        raw_id = raw_id.strip()
         lc, main_tag = raw_id.split(":", 1)
         normalized_main_tag = normalize_text(main_tag, lc, stopwords=self.stopwords)
         normalized_id = f"{lc}:{normalized_main_tag}"
         return normalized_id
 
+    def prepare_line(self, line: str) -> str:
+        """prepare line for parsing
+
+        This is different from `normalize_text` which is to compute ids
+        """
+        line = line.strip()
+        # sanitizing
+        # remove any space or commas characters at end of line
+        line = re.sub(r"[\s,]+$", "", line)
+        # replace commas between digits and that have no space around by a lower comma character
+        # and do the same for escaped comma (preceded by a \)
+        # (to distinguish them from commas acting as tags separators)
+        line = re.sub(r"(\d),(\d)", r"\1‚\2", line)
+        line = re.sub(r"\\,", "\\‚", line)
+        return line
+
     def undo_normalize_text(self, text: str) -> str:
-        """Undo some normalizations made in `_file_iter`"""
+        """Undo some normalizations made in `prepare_line`"""
         # restore commas from lower comma characters
         text = re.sub(r"(\d)‚(\d)", r"\1,\2", text)
         text = re.sub(r"\\‚", "\\,", text)
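Illustrative sketch (not part of the diff): sanitizing now happens per tag/property line in prepare_line instead of for every line in _file_iter, so comment lines are kept verbatim. Assuming a TaxonomyParser instance named parser, the expected round trip is roughly:

    parser.prepare_line("en: jam 1,5% sugar, jam\, fruit,  ")
    # -> "en: jam 1‚5% sugar, jam\‚ fruit" (trailing spaces/commas dropped, protected commas lowered)
    parser.undo_normalize_text("jam 1‚5% sugar")
    # -> "jam 1,5% sugar" (real commas restored before storing values)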
@@ -149,7 +155,7 @@ def _header_harvest(self, filename: str) -> tuple[list[str], int]:
         h = 0
         header: list[str] = []
         for _, line in self._file_iter(filename):
-            if not (line) or line[0] == "#":
+            if not (line.strip()) or line[0] == "#":
                 header.append(line)
             else:
                 break
@@ -158,7 +164,7 @@ def _header_harvest(self, filename: str) -> tuple[list[str], int]:
         # we don't want to eat the comments of the next block
         # and it removes the last separating line
         for i in range(len(header)):
-            if header.pop():
+            if header.pop().strip():
                 h -= 1
             else:
                 break
@@ -170,7 +176,7 @@ def _entry_end(self, line: str, data: NodeData) -> bool:
         if data.id.startswith("stopwords") or data.id.startswith("synonyms"):
             # stopwords and synonyms are one-liners; if the id is set, it's the end
             return True
-        if not line and data.id:
+        if not line.strip() and data.id:
             # entries are separated by a blank line
             return True
         return False
@@ -182,7 +188,7 @@ def _remove_separating_line(self, data: NodeData) -> NodeData:
         """
         is_before = data.is_before
         # first, check if there is at least one preceding line
-        if data.preceding_lines and not data.preceding_lines[0]:
+        if data.preceding_lines and not data.preceding_lines[0].strip():
             if data.id.startswith("synonyms"):
                 # it's a synonyms block,
                 # if the previous block is a stopwords block,
@@ -202,42 +208,13 @@ def _remove_separating_line(self, data: NodeData) -> NodeData:
                     data.preceding_lines.pop(0)
         return data
 
-    def _get_node_data_with_comments_above_key(
-        self, data: NodeData, line_number: int, key: str
-    ) -> NodeData:
+    def _add_comments(self, data: NodeData, comments: list[str], key: str) -> NodeData:
         """Returns the updated node data with comments above the given
         key stored in the {key}_comments property."""
-        new_data = copy.deepcopy(data)
-
-        # Get comments just above the given line
-        comments_above = []
-        current_line = line_number - 1
-        while new_data.comments_stack and new_data.comments_stack[-1][0] == current_line:
-            comments_above.append(new_data.comments_stack.pop()[1])
-            current_line -= 1
-        if comments_above:
-            new_data.comments[key + "_comments"] = comments_above[::-1]
-
-        return new_data
-
-    def _get_node_data_with_parent_and_end_comments(self, data: NodeData) -> NodeData:
-        """Returns the updated node data with parent and end comments"""
-        new_data = copy.deepcopy(data)
-
-        # Get parent comments (part of an entry block and just above/between the parents lines)
-        parent_comments = []
-        while new_data.preceding_lines and new_data.preceding_lines[-1] != "":
-            parent_comments.append(new_data.preceding_lines.pop())
-        if parent_comments:
-            new_data.comments["parent_comments"] = parent_comments[::-1]
-
-        # Get end comments (part of an entry block after the last tag/prop
-        # and before the separating blank line)
-        end_comments = [comment[1] for comment in new_data.comments_stack]
-        if end_comments:
-            new_data.comments["end_comments"] = end_comments
-
-        return new_data
+        if comments:
+            data.comments.setdefault(f"{key}_comments", []).extend(comments)
+            # reset the comments list
+            comments[:] = []
 
     _language_code_prefix = re.compile(
         r"[a-zA-Z][a-zA-Z][a-zA-Z]?([-_][a-zA-Z][a-zA-Z][a-zA-Z]?)?:"
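Illustrative sketch (not part of the diff): _add_comments flushes the shared comments buffer into the node under a per-key "{key}_comments" slot and empties the buffer in place, so callers can keep reusing one list. Assuming a parser instance and empty NodeData defaults:

    comments = ["# a comment above the parent"]
    data = NodeData(is_before="")
    parser._add_comments(data, comments, "parent")
    data.comments  # {'parent_comments': ['# a comment above the parent']}
    comments       # [] (reset in place by comments[:] = [])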
@@ -247,17 +224,35 @@ def is_entry_synonyms_line(self, line):
         matching_prefix = self._language_code_prefix.match(line)
         if matching_prefix:
             # verify it's not a property, that is a name followed by a colon and a language
             # we need no-qa this because of black vs flake8 opinion
             return not (
                 self._language_code_prefix.match(line[matching_prefix.end() :])  # noqa: E203
             )
         return False
 
+    def finalize_data(self, data, comments, saved_nodes):
+        data = self._remove_separating_line(data)
+        if data.get_node_type() == NodeType.ENTRY:
+            self._add_comments(data, comments, "end")
+            if data.id in saved_nodes:
+                # this duplicate node will be merged with the first one
+                data.is_before = None
+                msg = (
+                    f"WARNING: Entry with same id {data.id} already exists, "
+                    f"duplicate id in file at line {data.src_position}. "
+                    "The two nodes will be merged, keeping the last "
+                    "values in case of conflicts."
+                )
+                self.parser_logger.error(msg)
+            else:
+                saved_nodes.append(data.id)
+        return data
+
     def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
         """Transform data from file to dictionary"""
         saved_nodes = []
         index_stopwords = 0
         index_synonyms = 0
+        comments = []
 
         # Check if it is correctly written
         correctly_written = re.compile(r"\w+\Z")
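For context (illustration only, not part of the diff): is_entry_synonyms_line, shown unchanged above, is how _harvest_entries below tells tag lines from property lines. A line is a tags line when it starts with a 2-3 letter language code whose remainder is not itself language-prefixed:

    parser.is_entry_synonyms_line("fr: fruit de la passion")      # True: tags for "fr"
    parser.is_entry_synonyms_line("web:en: https://example.org")  # False: a (hypothetical) property name followed by a language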
@@ -268,39 +263,27 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
         line_number = (
             entries_start_line  # if the iterator is empty, line_number will not be unbound
         )
-        for line_number, line in self._file_iter(filename, entries_start_line):
+        for line_number, raw_line in self._file_iter(filename, entries_start_line):
             # yield data if block ended
-            if self._entry_end(line, data):
-                data = self._remove_separating_line(data)
-                if data.get_node_type() == NodeType.ENTRY:
-                    data = self._get_node_data_with_parent_and_end_comments(data)
-                    if data.id in saved_nodes:
-                        # this duplicate node will be merged with the first one
-                        data.is_before = None
-                        msg = (
-                            f"WARNING: Entry with same id {data.id} already exists, "
-                            f"duplicate id in file at line {data.src_position}. "
-                            "The two nodes will be merged, keeping the last "
-                            "values in case of conflicts."
-                        )
-                        self.parser_logger.error(msg)
-                    else:
-                        saved_nodes.append(data.id)
-                is_before = data.id
-                yield data  # another function will use this dictionary to create a node
+            if self._entry_end(raw_line, data):
+                is_before = data.is_before
+                # another function will use data to create a node
+                yield self.finalize_data(data, comments, saved_nodes)
+                # if data was a duplicate (is_before is None) reuse same is_before
+                is_before = data.id if data.is_before else is_before
                 data = NodeData(is_before=is_before)
 
             # harvest the line
-            if not (line) or line[0] == "#":
+            if not (raw_line.strip()) or raw_line[0] == "#":
                 # comment or blank line
-                if data.id:
+                if data.is_started:
                     # we are within the node definition
-                    data.comments_stack.append((line_number, line))
+                    comments.append(raw_line)
                 else:
                     # we are before the actual node
-                    data.preceding_lines.append(line)
+                    data.preceding_lines.append(raw_line)
             else:
-                line = line.rstrip(",")
+                line = self.prepare_line(raw_line)
                 if not data.src_position:
                     data.src_position = line_number + 1
                 if line.startswith("stopwords"):
@@ -342,6 +325,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
                 elif line[0] == "<":
                     # parent definition
                     data.parent_tags.append((self._normalize_entry_id(line[1:]), line_number + 1))
+                    self._add_comments(data, comments, "parent")
                 elif self.is_entry_synonyms_line(line):
                     # synonyms definition
                     if not data.id:
@@ -363,9 +347,7 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
                         tagsids_list.append(word_normalized)
                     data.tags["tags_" + lang] = tags_list
                     data.tags["tags_ids_" + lang] = tagsids_list
-                    data = self._get_node_data_with_comments_above_key(
-                        data, line_number, "tags_" + lang
-                    )
+                    self._add_comments(data, comments, "tags_" + lang)
                 else:
                     # property definition
                     property_name = None
@@ -384,17 +366,15 @@ def _harvest_entries(self, filename: str, entries_start_line: int) -> Iterator[NodeData]:
                         correctly_written.match(property_name) and correctly_written.match(lc)
                     ):
                         self.parser_logger.error(
-                            f"Reading error at line {line_number + 1},"
+                            f"Reading error at line {line_number + 1}, "
                             f"unexpected format: '{self.parser_logger.ellipsis(line)}'"
                         )
                     if property_name:
                         prop_key = "prop_" + property_name + "_" + lc
                         data.properties[prop_key] = self.undo_normalize_text(
                             property_value.strip()
                         )
-                        data = self._get_node_data_with_comments_above_key(
-                            data, line_number, prop_key
-                        )
+                        self._add_comments(data, comments, prop_key)
 
         data.id = "__footer__"
         data.preceding_lines.pop(0)
4 changes: 2 additions & 2 deletions parser/openfoodfacts_taxonomy_parser/unparser.py
@@ -59,7 +59,7 @@ def get_tags_line(self, node, lc):

     @staticmethod
     def property_sort_key(property):
-        name, lang_code, *_ = property.split("_", 2)
+        name, lang_code = property.rsplit("_", 1)
         # give priority to xx and en language codes
         priority = {"en": 1, "xx": 0}
         return (name, priority.get(lang_code, 100), lang_code)
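Illustration (not part of the diff) of why the sort key now uses rsplit: property names can contain underscores, so splitting from the left mistakes part of the name for the language code:

    "carbon_footprint_fr_foodges_value_fr".split("_", 2)
    # ['carbon', 'footprint', 'fr_foodges_value_fr'] -> sorted on "footprint" as if it were a language
    "carbon_footprint_fr_foodges_value_fr".rsplit("_", 1)
    # ['carbon_footprint_fr_foodges_value', 'fr'] -> full name plus the real language code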
@@ -88,7 +88,7 @@ def get_parents_lines(self, parents):
             parent = dict(parent)
             lc = parent["main_language"]
             parent_id = parent["tags_" + lc][0]
-            yield "<" + lc + ": " + parent_id
+            yield "< " + lc + ":" + parent_id
 
     def iter_lines(self, project_label):
         previous_block_id = ""
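Illustration (not part of the diff): the unparser now writes parents as "< en:milk" instead of "<en: milk". Both spellings normalize to the same id, since _harvest_entries strips the leading "<" and the new raw_id.strip() in _normalize_entry_id drops the surrounding whitespace. Assuming a parser instance:

    parser._normalize_entry_id(" en:milk")  # from "< en:milk" -> "en:milk"
    parser._normalize_entry_id("en: milk")  # from "<en: milk" -> "en:milk"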
16 changes: 8 additions & 8 deletions parser/tests/data/test.txt
@@ -6,33 +6,33 @@ synonyms:en: passion fruit, passionfruit

 synonyms:fr: fruit de la passion, maracuja, passion
 
-<en: milk
+< en:milk
 en: yogurts, yoghurts
 fr: yaourts, yoghourts, yogourts
 
-<en: yogurts
+< en:yogurts
 en: banana yogurts
 fr: yaourts à la banane
 
-<en: yogurts
+< en:yogurts
 en: Passion fruit yogurts
 fr: yaourts au fruit de la passion
 
-<fr: yaourts fruit de la passion
+< fr:yaourts fruit de la passion
 fr: yaourts au fruit de la passion allégés
 
-# meat
+# meat
 
 en: meat
 carbon_footprint_fr_foodges_value:fr: 10
 vegan:en: no
 
-<en: meat
+< en:meat
 en: fake-meat
 vegan:en: yes
 
 en: fake-stuff
 
-<en: fake-stuff
-<en: fake-meat
+< en:fake-stuff
+< en:fake-meat
 en: fake-duck-meat
9 changes: 9 additions & 0 deletions parser/tests/data/test_comment_below_parent.txt
@@ -0,0 +1,9 @@
+# test a bug found with ingr taxonomy
+
+en:milk
+
+# a comment above the parent
+<en: milk
+# a comment below the parent
+en: Cow milk
+fr: lait de vache
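With the reworked comment handling, the expectation for this file (an illustrative reading of the new flow, not output copied from the test suite) is roughly:

    node = ...  # the parsed "en: Cow milk" entry
    node.preceding_lines  # ['# a comment above the parent'] (comes before any parent line, so the node is not yet "started")
    node.comments         # {'tags_en_comments': ['# a comment below the parent']}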