Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding more multiline string support #110

Merged
merged 3 commits into from
Feb 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions percy/parser/_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from typing import Optional

from percy.parser._types import ROOT_NODE_VALUE
from percy.parser.types import NodeValue
from percy.parser.types import MultilineVariant, NodeValue
from percy.types import SentinelType


Expand Down Expand Up @@ -35,7 +35,7 @@ def __init__(
comment: str = "",
children: Optional[list["Node"]] = None,
list_member_flag: bool = False,
multiline_flag: bool = False,
multiline_variant: MultilineVariant = MultilineVariant.NONE,
key_flag: bool = False,
):
"""
Expand All @@ -44,14 +44,14 @@ def __init__(
:param comment: Comment on the line this node was found on
:param children: List of children nodes, descendants of this node
:param list_member_flag: Indicates if this node is part of a list
:param multiline_flag: Indicates if the node represents a multiline value
:param multiline_variant: Indicates if the node represents a multiline value AND which syntax variant is used
:param key_flag: Indicates if the node represents a key that points to zero or more subsequent values
"""
self.value = value
self.comment = comment
self.children: list[Node] = children if children else []
self.list_member_flag = list_member_flag
self.multiline_flag = multiline_flag
self.multiline_variant = multiline_variant
self.key_flag = key_flag

def __eq__(self, other: object) -> bool:
Expand All @@ -66,7 +66,7 @@ def __eq__(self, other: object) -> bool:
self.value == other.value
and self.comment == other.comment
and self.list_member_flag == other.list_member_flag
and self.multiline_flag == other.multiline_flag
and self.multiline_variant == other.multiline_variant
# Save recursive (most expensive) check for last
and self.children == other.children
)
Expand All @@ -86,7 +86,7 @@ def __str__(self) -> str:
f" - Comment: {self.comment!r}\n"
f" - Child count: {len(self.children)}\n"
f" - List?: {self.list_member_flag}\n"
f" - Multiline?: {self.multiline_flag}\n"
f" - Multiline?: {self.multiline_variant}\n"
f" - Key?: {self.key_flag}\n"
)

Expand Down
22 changes: 21 additions & 1 deletion percy/parser/_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,21 @@
# Marker used to temporarily work around some Jinja-template parsing issues
PERCY_SUB_MARKER: Final[str] = "__PERCY_SUBSTITUTION_MARKER__"

# Canonical, human-friendly ordering of the top-level YAML keys, mirroring how we traditionally organize our recipe
# files. Each key maps to its sort position (lower values sort first); positions are derived from the order of the
# tuple below, so inserting a new key never requires renumbering. This should work on both old and new recipe formats.
TOP_LEVEL_KEY_SORT_ORDER: Final[dict[str, int]] = {
    key: position
    for position, key in enumerate(
        (
            "schema_version",
            "context",
            "package",
            "source",
            "build",
            "requirements",
            "outputs",
            "test",
            "about",
            "extra",
        )
    )
}

#### Private Classes (Not to be used external to the `parser` module) ####

# NOTE: The classes put in this file should be structures (NamedTuples) and very small support classes that don't make
Expand Down Expand Up @@ -54,5 +69,10 @@ class Regex:
JINJA_SET_LINE: Final[re.Pattern[str]] = re.compile(r"{%\s*set\s*" + _JINJA_VAR_FUNCTION_PATTERN + r"\s*=.*%}\s*\n")

SELECTOR: Final[re.Pattern[str]] = re.compile(r"\[.*\]")
MULTILINE: Final[re.Pattern[str]] = re.compile(r"^\s*.*:\s+\|(\s*|\s+#.*)")
# Detects the 6 common variants (3 |'s, 3 >'s). See this guide for more info:
# https://stackoverflow.com/questions/3790454/how-do-i-break-a-string-in-yaml-over-multiple-lines/21699210
MULTILINE: Final[re.Pattern[str]] = re.compile(r"^\s*.*:\s+(\||>)(\+|\-)?(\s*|\s+#.*)")
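# Capture groups of `MULTILINE` that identify the variant: group 1 is the style character (`|` or `>`) and group 2
# is the optional suffix (`+` or `-`), which is `None` when no suffix is present.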
MULTILINE_VARIANT_CAPTURE_GROUP_CHAR: Final[int] = 1
MULTILINE_VARIANT_CAPTURE_GROUP_SUFFIX: Final[int] = 2
DETECT_TRAILING_COMMENT: Final[re.Pattern[str]] = re.compile(r"(\s)+(#)")
30 changes: 25 additions & 5 deletions percy/parser/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from typing import cast

from percy.parser._types import PERCY_SUB_MARKER, ROOT_NODE_VALUE, Regex, StrStack, StrStackImmutable
from percy.parser.types import NodeValue
from percy.parser.types import TAB_AS_SPACES, MultilineVariant, NodeValue
from percy.types import H, SentinelType


Expand Down Expand Up @@ -88,12 +88,14 @@ def substitute_markers(s: str, subs: list[str]) -> str:
return s


def stringify_yaml(val: NodeValue | SentinelType, multiline_flag: bool = False) -> NodeValue:
def stringify_yaml(
val: NodeValue | SentinelType, multiline_variant: MultilineVariant = MultilineVariant.NONE
) -> NodeValue:
"""
Special function for handling edge cases when converting values back to YAML.
:param val: Value to check
:param multiline_flag: (Optional) If the value being processed is a multiline string, set this flag to True to
prevent unintended quote-escaping.
:param multiline_variant: (Optional) If the value being processed is a multiline string, indicate which YAML
descriptor is in use.
:returns: YAML version of a value, as a string.
"""
# Handled for type-completeness of `Node.value`. A `Node` with a sentinel as its value indicates a special Node
Expand All @@ -112,13 +114,31 @@ def stringify_yaml(val: NodeValue | SentinelType, multiline_flag: bool = False)
# quoting all YAML strings. Although not wrong, it does not follow our common practices. Quote escaping is not
# required for multiline strings. We do not escape quotes for Jinja value statements. We make an exception for
# strings containing the NEW recipe format syntax, ${{ }}, which is valid YAML.
if not multiline_flag and isinstance(val, str) and not Regex.JINJA_SUB.match(val):
if multiline_variant == MultilineVariant.NONE and isinstance(val, str) and not Regex.JINJA_SUB.match(val):
if "${{" not in val and ("'" in val or '"' in val):
# The PyYaml equivalent function injects newlines, hence why we abuse the JSON library to write our YAML
return json.dumps(val)
return val


def normalize_multiline_strings(val: NodeValue, variant: MultilineVariant) -> NodeValue:
    """
    Rebuilds a Node's multiline value into a single YAML block-scalar string so that PyYaml can decide how the
    whitespace and newlines should be interpreted for the given multiline variant.
    :param val: Value to normalize. For multiline nodes, this is treated as the list of lines in the string.
    :param variant: Multiline variant rules to follow
    :returns: For multiline values, the variant marker followed by every line on its own row, indented by one tab.
        This string is meant to be re-evaluated by PyYaml. Non-multiline values are returned untouched.
    """
    is_multiline = variant != MultilineVariant.NONE
    if not is_multiline:
        return val

    # The variant marker leads the string and each line of the value follows on a new, indented row. This lets PyYaml
    # interpret the whitespace rules for us. JINJA substitutions in multi-line strings do not break the PyYaml parser.
    line_prefix = f"\n{TAB_AS_SPACES}"
    body = line_prefix.join(cast(list[str], val))
    return f"{variant}{line_prefix}{body}"


def dedupe_and_preserve_order(l: list[H]) -> list[H]:
"""
Takes a list of strings
Expand Down
110 changes: 78 additions & 32 deletions percy/parser/recipe_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import difflib
import json
import re
import sys
from typing import Callable, Final, Optional, TypeGuard, cast, no_type_check

import yaml
Expand All @@ -32,9 +33,17 @@
traverse_all,
traverse_with_index,
)
from percy.parser._types import PERCY_SUB_MARKER, ROOT_NODE_VALUE, ForceIndentDumper, Regex, StrStack
from percy.parser._types import (
PERCY_SUB_MARKER,
ROOT_NODE_VALUE,
TOP_LEVEL_KEY_SORT_ORDER,
ForceIndentDumper,
Regex,
StrStack,
)
from percy.parser._utils import (
dedupe_and_preserve_order,
normalize_multiline_strings,
num_tab_spaces,
stack_path_to_str,
str_to_stack_path,
Expand All @@ -50,6 +59,7 @@
TAB_SPACE_COUNT,
MessageCategory,
MessageTable,
MultilineVariant,
)
from percy.types import PRIMITIVES_TUPLE, JsonPatchType, JsonType, Primitives, SentinelType

Expand Down Expand Up @@ -202,7 +212,9 @@ def _generate_subtree(value: JsonType) -> list[Node]:
return [
Node(
value=value.splitlines(),
multiline_flag=True,
# The conversion from JSON-to-YAML is lossy here. Default to the closest equivalent, which preserves
# newlines.
multiline_variant=MultilineVariant.PIPE,
)
]

Expand All @@ -220,10 +232,9 @@ def _render_jinja_vars(self, s: str) -> JsonType:
"""
Helper function that replaces Jinja substitutions with their actual set values.
:param s: String to be re-rendered
:returns: The original value, augmented with Jinja substitutions. If substitutions have taken place, the type is
re-evaluated.
:returns: The original value, augmented with Jinja substitutions. Types are re-rendered to account for multiline
strings that may have been "normalized" prior to this call.
"""
replacement = False
# Search the string, replacing all substitutions we can recognize
for match in cast(list[str], Regex.JINJA_SUB.findall(s)):
lower_case = False
Expand All @@ -242,10 +253,7 @@ def _render_jinja_vars(self, s: str) -> JsonType:
if lower_case:
value = value.lower()
s = s.replace(match, value)
replacement = True
if replacement:
return cast(JsonType, yaml.safe_load(s))
return s
return cast(JsonType, yaml.safe_load(s))

def _rebuild_selectors(self) -> None:
"""
Expand Down Expand Up @@ -319,10 +327,19 @@ def __init__(self, content: str):
new_node = RecipeParser._parse_line_node(clean_line)
# If the last node ended (pre-comments) with a multiline indicator (`|` or `>`, optionally followed by `+` or
# `-`), reset the value to be a list of the following, extra-indented strings
if Regex.MULTILINE.match(line):
multiline_re_match = Regex.MULTILINE.match(line)
if multiline_re_match:
# Calculate which multiline symbol is used. The first character must be matched, the second is optional.
variant_capture = cast(str, multiline_re_match.group(Regex.MULTILINE_VARIANT_CAPTURE_GROUP_CHAR))
variant_sign = cast(str | None, multiline_re_match.group(Regex.MULTILINE_VARIANT_CAPTURE_GROUP_SUFFIX))
if variant_sign is not None:
variant_capture += variant_sign
# Per YAML spec, multiline statements can't be commented. In other words, the `#` symbol is seen as a
# string character in multiline values.
multiline_node = Node([], multiline_flag=True)
multiline_node = Node(
[],
multiline_variant=MultilineVariant(variant_capture),
)
# Type narrow that we assigned `value` as a `list`
assert isinstance(multiline_node.value, list)
multiline = lines[line_idx]
Expand Down Expand Up @@ -362,6 +379,29 @@ def __init__(self, content: str):
# This table will have to be re-built or modified when the tree is modified with `patch()`.
self._rebuild_selectors()

def _sort_top_level_keys(self) -> None:
    """
    Re-orders the root's children into a "canonical" order (the human-centric order in which most recipes are
    currently written). This should work on both old and new recipe formats.

    TODO: Handle JINJA statements

    The modification flag is intentionally left alone even though the underlying tree changes. As far as YAML and
    our key-pathing structure is concerned, order does not matter.
    """

    def _sort_rank(node: Node) -> int:
        # For now, all comments float to the top of the file. Arguably this is better than having them "randomly
        # tag" to another top-level key.
        if node.is_comment():
            return -sys.maxsize
        # Recognized keys use their canonical position; unidentified keys sink to the bottom of the file.
        if isinstance(node.value, str):
            return TOP_LEVEL_KEY_SORT_ORDER.get(node.value, sys.maxsize)
        # Non-string values can never be a recognized key, so they also go to the bottom.
        return sys.maxsize

    self._root.children.sort(key=_sort_rank)

@staticmethod
def _str_tree_recurse(node: Node, depth: int, lines: list[str]) -> None:
"""
Expand Down Expand Up @@ -441,7 +481,7 @@ def _render_tree(node: Node, depth: int, lines: list[str], parent: Optional[Node
lines.append(f"{spaces}{node.value}: {node.comment}".rstrip())
lines.append(
f"{spaces}{TAB_AS_SPACES}- "
f"{stringify_yaml(node.children[0].value, True)} "
f"{stringify_yaml(node.children[0].value, multiline_variant=node.children[0].multiline_variant)} "
f"{node.children[0].comment}".rstrip()
)
return
Expand All @@ -460,10 +500,14 @@ def _render_tree(node: Node, depth: int, lines: list[str], parent: Optional[Node
# for other types.
#
# By the language spec, # symbols do not indicate comments on multiline strings.
if node.children[0].multiline_flag:
lines.append(f"{spaces}{node.value}: | {node.comment}".rstrip())
if node.children[0].multiline_variant != MultilineVariant.NONE:
multi_variant: Final[MultilineVariant] = node.children[0].multiline_variant
lines.append(f"{spaces}{node.value}: {multi_variant} {node.comment}".rstrip())
for val_line in cast(list[str], node.children[0].value):
lines.append(f"{spaces}{TAB_AS_SPACES}" f"{stringify_yaml(val_line, True)}".rstrip())
lines.append(
f"{spaces}{TAB_AS_SPACES}"
f"{stringify_yaml(val_line, multiline_variant=multi_variant)}".rstrip()
)
return
lines.append(
f"{spaces}{node.value}: "
Expand Down Expand Up @@ -546,10 +590,13 @@ def _render_object_tree(self, node: Node, replace_variables: bool, data: JsonTyp
if child.is_comment():
continue

# Handle multiline strings
value = child.value if not child.multiline_flag else "\n".join(child.value)
if replace_variables and isinstance(value, str):
value = self._render_jinja_vars(value)
# Handle multiline strings and variable replacement
value = normalize_multiline_strings(child.value, child.multiline_variant)
if isinstance(value, str):
if replace_variables:
value = self._render_jinja_vars(value)
elif child.multiline_variant != MultilineVariant.NONE:
value = cast(str, yaml.safe_load(value))

# Empty keys are interpreted to point to `None`
if child.is_empty_key():
Expand Down Expand Up @@ -641,15 +688,8 @@ def _patch_and_log(patch: JsonPatchType) -> None:
continue
_patch_and_log({"op": "add", "path": f"/context/{name}", "value": value})

# Hack: `add` has no concept of ordering and new fields are appended to the end. Logically, `context` should be
# at the top of the file, so we'll force it to the front of root's child list.
# TODO: make more robust and don't assume `context` will be at the end of the list
# TODO: manage some human-friendly ordering of all top-level sections
new_recipe._root.children.insert(0, new_recipe._root.children.pop(-1))

# Similarly, patch-in the new `schema_version` value to the top of the file
_patch_and_log({"op": "add", "path": "/schema_version", "value": CURRENT_RECIPE_SCHEMA_FORMAT})
new_recipe._root.children.insert(0, new_recipe._root.children.pop(-1))

# Swap all JINJA to use the new `${{ }}` format.
jinja_sub_locations: Final[list[str]] = new_recipe.search(Regex.JINJA_SUB)
Expand Down Expand Up @@ -732,11 +772,14 @@ def _patch_and_log(patch: JsonPatchType) -> None:
# TODO Complete: handle changes to the recipe structure and fields
# TODO Complete: move operations may result in empty fields we can eliminate. This may require changes
# to `contains_value()`
# TODO Complete: ensure some common "canonical" ordering to the top-level fields

# Hack: Wipe the existing table so the JINJA `set` statements don't render the final form
new_recipe._vars_tbl = {}

# Sort the top-level keys to a "canonical" ordering. This should make previous patch operations look more
# "sensible" to a human reader.
new_recipe._sort_top_level_keys()

return new_recipe.render(), msg_tbl

## YAML Access Functions ##
Expand Down Expand Up @@ -788,13 +831,16 @@ def get_value(self, path: str, default: JsonType | SentinelType = _sentinel, sub
# Handle unpacking of the last key-value set of nodes.
if node.is_single_key() and not node.is_root():
# As of writing, Jinja substitutions are not used
if node.children[0].multiline_flag:
# PyYaml will not preserve newlines passed into strings, so we can directly check for variable
# substitutions on a multiline string
multiline_str = "\n".join(cast(str, node.children[0].value))
if node.children[0].multiline_variant != MultilineVariant.NONE:
multiline_str = cast(
str,
normalize_multiline_strings(
cast(list[str], node.children[0].value), node.children[0].multiline_variant
),
)
if sub_vars:
return self._render_jinja_vars(multiline_str)
return multiline_str
return cast(JsonType, yaml.safe_load(multiline_str))
return_value = cast(Primitives, node.children[0].value)
# Leaf nodes can return their value directly
elif node.is_leaf():
Expand Down
17 changes: 17 additions & 0 deletions percy/parser/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,23 @@
}


class MultilineVariant(StrEnum):
    """
    Captures which "multiline" descriptor was used on a Node, if one was used at all.

    Each member's value is the exact indicator text as it appears in the YAML file, so a member can be constructed
    directly from the captured indicator string and rendered back out by converting it to a string.

    See this guide for details on the YAML spec:
    https://stackoverflow.com/questions/3790454/how-do-i-break-a-string-in-yaml-over-multiple-lines/21699210
    """

    # The Node is not a multiline string.
    NONE = ""
    # `|` is YAML's "literal" block style: newlines in the value are preserved.
    PIPE = "|"
    # Literal style with the "keep" chomping indicator: trailing newlines are kept.
    PIPE_PLUS = "|+"
    # Literal style with the "strip" chomping indicator: trailing newlines are removed.
    PIPE_MINUS = "|-"
    # `>` is YAML's "folded" block style: single newlines in the value are folded into spaces.
    # NOTE(review): the symbol is a greater-than sign and "CARROT" looks like a misspelling of "caret"; the name is
    # left as-is here because renaming it would change the public interface.
    CARROT = ">"
    # Folded style with the "keep" chomping indicator: trailing newlines are kept.
    CARROT_PLUS = ">+"
    # Folded style with the "strip" chomping indicator: trailing newlines are removed.
    CARROT_MINUS = ">-"


class MessageCategory(StrEnum):
"""
Categories to classify `RecipeParser` messages into.
Expand Down
Loading
Loading