Skip to content

Commit

Permalink
Take code out of the critical path in html parse
Browse files (browse the repository at this point in the history)
  • Loading branch information
mnot committed Dec 26, 2023
1 parent d30e3d0 commit 87fdaa5
Showing 1 changed file with 16 additions and 15 deletions.
31 changes: 16 additions & 15 deletions redbot/resource/link_parse.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -38,6 +38,15 @@ class HTMLLinkParser(HTMLParser):
"application/atom+xml",
]

link_types: Dict[str, Tuple[str, Optional[List[str]]]] = {
"link": ("href", ["stylesheet"]),
"a": ("href", None),
"img": ("src", None),
"script": ("src", None),
"frame": ("src", None),
"iframe": ("src", None),
}

def __init__(
self,
message: HttpMessageLinter,
Expand All @@ -47,14 +56,6 @@ def __init__(
self.message = message
self.link_procs = link_procs
self.err = err
self.link_types: Dict[str, Tuple[str, Optional[List[str]]]] = {
"link": ("href", ["stylesheet"]),
"a": ("href", None),
"img": ("src", None),
"script": ("src", None),
"frame": ("src", None),
"iframe": ("src", None),
}
self.errors = 0
self.last_err_pos: int = 0
self.ok = True
Expand Down Expand Up @@ -90,23 +91,23 @@ def feed(self, data: str) -> None:
self.ok = False

def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
attr_d = dict(attrs)
title = (attr_d.get("title", "") or "").strip()
if tag in self.link_types:
attr_d = dict(attrs)
url_attr, rels = self.link_types[tag]
if not rels or attr_d.get("rel", None) in rels:
target = attr_d.get(url_attr, "")
if target:
if "#" in target:
target = target[: target.index("#")]
title = (attr_d.get("title", "") or "").strip()
for proc in self.link_procs:
proc(self.message.base_uri, target, tag, title)
elif tag == "base":
self.message.base_uri = attr_d.get("href", self.message.base_uri) or ""
elif (
tag == "meta"
and (attr_d.get("http-equiv", "") or "").lower() == "content-type"
):
self.message.base_uri = dict(attrs).get("href", self.message.base_uri) or ""
elif tag == "meta":
attr_d = dict(attrs)
if (attr_d.get("http-equiv", "") or "").lower() != "content-type":
return
ct = attr_d.get("content", None)
if ct:
try:
Expand Down

0 comments on commit 87fdaa5

Please sign in to comment.