Skip to content

Commit

Permalink
[HACK] This is a hack that tries to replace the Vimeo iframe with a video tag.
Browse files Browse the repository at this point in the history
The Vimeo iframe embeds a JS player, but we don't need it: all we need
is to play the video, and an HTML5 video tag is enough for that.
  • Loading branch information
mgautierfr committed Feb 2, 2024
1 parent bd955c9 commit 3b69172
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 5 deletions.
7 changes: 4 additions & 3 deletions src/warc2zim/content_rewriting/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ def __init__(

mimetype = get_record_mime_type(record)

self.known_urls = known_urls
self.path = path
self.orig_url_str = get_record_url(record)
self.url_rewriter = ArticleUrlRewriter(self.orig_url_str, known_urls)
Expand Down Expand Up @@ -69,9 +70,9 @@ def rewrite_html(self, head_template: Template, css_insert: str | None):
orig_scheme=orig_url.scheme,
orig_host=orig_url.netloc,
)
return HtmlRewriter(self.url_rewriter, head_insert, css_insert).rewrite(
self.content
)
return HtmlRewriter(
self.known_urls, self.url_rewriter, head_insert, css_insert
).rewrite(self.content)

def rewrite_css(self):
return ("", CssRewriter(self.url_rewriter).rewrite(self.content))
Expand Down
47 changes: 45 additions & 2 deletions src/warc2zim/content_rewriting/html.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import io
import io, re
from collections import namedtuple
from html import escape
from html.parser import HTMLParser
Expand Down Expand Up @@ -56,11 +56,13 @@ def transform_attrs(
class HtmlRewriter(HTMLParser):
def __init__(
self,
known_urls: set[str],
url_rewriter: ArticleUrlRewriter,
pre_head_insert: str,
post_head_insert: str | None,
):
super().__init__()
self.known_urls = known_urls
self.url_rewriter = url_rewriter
self.css_rewriter = CssRewriter(url_rewriter)
self.title = None
Expand Down Expand Up @@ -96,6 +98,45 @@ def handle_starttag(self, tag: str, attrs: AttrsList, *, auto_close: bool = Fals
elif tag == "script":
self.rewrite_context = "script"

if tag == "iframe":
iframe_src = get_attr_value(attrs, "src")
if "player.vimeo.com" in iframe_src:
# HACK: replace the iframe with an HTML5 video element.
# We still have to find the URL of the video to play.
# The URL is hidden in the source of the player, so it is difficult to extract from there.
# However, we know that the video is stored under a URL "subdirectory" named with the player_id.
player_id = re.search("player.vimeo.com/video/(\d+)", iframe_src).group(
1
)
video_url = next(
url
for url in self.known_urls
if re.search(
f"vimeo-cdn.fuzzy.replayweb.page/.*?/{player_id}/", url
)
)

self.handle_starttag(
"video",
[
("width", get_attr_value(attrs, "width")),
("height", get_attr_value(attrs, "height")),
("controls", None),
],
)
self.handle_starttag(
"source",
[
("src", f"//{video_url}"),
("type", "video/mp4"),
],
)
# No end tag for source
self.handle_data("Your browser doesn't support the video tag")
self.handle_endtag("video")
self.rewrite_context = "skip_end"
return

self.send(f"<{tag}")
if attrs:
self.send(" ")
Expand All @@ -115,7 +156,9 @@ def handle_starttag(self, tag: str, attrs: AttrsList, *, auto_close: bool = Fals
self.send(self.pre_head_insert)

def handle_endtag(self, tag: str):
    """Emit the closing tag for ``tag`` and reset the rewrite context.

    The current ``rewrite_context`` is read and cleared in a single step.
    When it was ``"skip_end"`` (set by ``handle_starttag`` after it replaced
    a Vimeo ``<iframe>`` with a ``<video>`` element), the closing tag is
    swallowed: the replacement markup has already been closed, so emitting
    it would produce a stray end tag.

    Args:
        tag: Name of the element being closed.
    """
    # Capture the previous context *before* resetting it. Resetting first
    # (a separate ``self.rewrite_context = None`` ahead of this line) would
    # make the "skip_end" check below unreachable.
    rewrite_context, self.rewrite_context = self.rewrite_context, None
    if rewrite_context == "skip_end":
        return
    # Content that must be appended to <head> goes right before it closes.
    if tag == "head" and self.post_head_insert:
        self.send(self.post_head_insert)
    self.send(f"</{tag}>")
Expand Down

0 comments on commit 3b69172

Please sign in to comment.