Skip to content

Commit

Permalink
Improve regex documentation
Browse files Browse the repository at this point in the history
Co-authored-by: Matti Lamppu <[email protected]>
  • Loading branch information
ranta and matti-lamppu committed Jan 8, 2025
1 parent bb3dc4d commit 55163fe
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@ def htmlize(text: str | None) -> str:
if text.startswith("<"):
return text

text = re.sub(r"(.{1,})", r"<p>\1</p>", text)
# Convert linebreaks to paragraphs
pattern = r"(.{1,})" # Match one or more non-newline characters (i.e. each non-empty line)
text = re.sub(pattern, r"<p>\1</p>", text)
return text.replace("\n", "") # Remove linebreaks


Expand Down
28 changes: 24 additions & 4 deletions utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,14 +189,34 @@ def as_p_tags(texts: Iterable[str]) -> str:
def convert_html_to_text(html_text: str) -> str:
text = html2text(html_text, bodywidth=0)

# Link text and url are the same
# Link text and url are the same:
# Remove angle-brackets from links `<url>` -> `url`
# If there is a dot after the link, add a space between the link and the dot.
text = re.sub(r"<(https?://[^>]+)>(\.?)", r"\1 \2", text)

# Link text and url are different
# fmt: off
pattern = (
r"<" # begins with opening bracket
r"(?P<link>(https?://)?[^>]+)" # link, with optional protocol
r">" # followed by closing bracket
r"(?P<dot>\.?)" # with optional dot
)
# fmt: on
text = re.sub(pattern, r"\g<link> \g<dot>", text)

# Link text and url are different:
# Replace markdown-style links `[text](url)` with `text <url>`
text = re.sub(r"\[([^\]]+)\]\(((http|https)?://[^\)]+)\)", r"\1 <\2>", text)
# fmt: off
pattern = (
r"\[" # begins with "["
r"(?P<text>[^\]]+)" # any text that is not "]"
r"\]" # followed by "]"
r"\(" # followed by "("
r"(?P<link>(https?://)?[^\)]+)" # any link, with optional protocol
r"\)" # followed by ")"
)
# fmt: on

text = re.sub(pattern, r"\g<text> <\g<link>>", text)

# Remove the space before each newline, and the final newline, which is added by html2text
return text.replace(" \n", "\n").removesuffix("\n")

0 comments on commit 55163fe

Please sign in to comment.