Skip to content

Commit

Permalink
[youtube] fix: extract mix playlist ids from ytInitialData (#33)
Browse files Browse the repository at this point in the history
  • Loading branch information
insaneracist committed Oct 29, 2020
1 parent 4932ba4 commit 5b0a6a8
Showing 1 changed file with 26 additions and 9 deletions.
35 changes: 26 additions & 9 deletions youtube_dlc/extractor/youtube.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,15 @@ def _download_webpage_handle(self, *args, **kwargs):
return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
*args, **compat_kwargs(kwargs))

def _get_yt_initial_data(self, video_id, webpage):
    """Locate the ``ytInitialData`` assignment in *webpage* and parse it.

    Two assignment spellings are searched for:
    ``window["ytInitialData"] = ...;`` and ``var ytInitialData = ...;``.

    Returns None when the search yields nothing (or an empty match);
    otherwise returns whatever ``_parse_json`` produces for the captured
    text, called with ``fatal=False`` and *video_id* as the identifier.
    """
    patterns = (
        r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
        r'var\s+ytInitialData\s*=\s*(.*?)(?<=});',
    )
    raw_json = self._search_regex(
        patterns, webpage, 'ytInitialData', default=None)
    if not raw_json:
        return None
    # The captured text is passed through uppercase_escape before parsing.
    return self._parse_json(
        uppercase_escape(raw_json), video_id, fatal=False)

def _real_initialize(self):
if self._downloader is None:
return
Expand Down Expand Up @@ -1397,15 +1406,6 @@ def _get_ytplayer_config(self, video_id, webpage):
return self._parse_json(
uppercase_escape(config), video_id, fatal=False)

def _get_yt_initial_data(self, video_id, webpage):
    """Locate the ``ytInitialData`` assignment in *webpage* and parse it.

    Two assignment spellings are searched for:
    ``window["ytInitialData"] = ...;`` and ``var ytInitialData = ...;``.

    Returns None when the search yields nothing (or an empty match);
    otherwise returns whatever ``_parse_json`` produces for the captured
    text, called with ``fatal=False`` and *video_id* as the identifier.
    """
    patterns = (
        r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
        r'var\s+ytInitialData\s*=\s*(.*?)(?<=});',
    )
    raw_json = self._search_regex(
        patterns, webpage, 'ytInitialData', default=None)
    if not raw_json:
        return None
    # The captured text is passed through uppercase_escape before parsing.
    return self._parse_json(
        uppercase_escape(raw_json), video_id, fatal=False)

def _get_automatic_captions(self, video_id, webpage):
"""We need the webpage for getting the captions url, pass it as an
argument to speed up the process."""
Expand Down Expand Up @@ -2765,6 +2765,16 @@ def extract_videos_from_page(self, page):

return zip(ids_in_page, titles_in_page)

def _extract_mix_ids_from_yt_initial(self, yt_initial):
    """Collect the video ids listed in a mix playlist's ``ytInitialData``.

    Reads the list at
    ``contents.twoColumnWatchNextResults.playlist.playlist.contents`` and,
    for each entry, the value at ``playlistPanelVideoRenderer.videoId``.

    Returns a list of the ids that are text strings, in page order. The
    list is empty when the playlist contents are not a list.
    """
    playlist_contents = try_get(
        yt_initial,
        lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist']['contents'])
    if not isinstance(playlist_contents, list):
        return []

    # Decoded JSON strings are ``unicode`` on Python 2 and ``str`` on
    # Python 3; an exact ``type(...) is str`` check would reject every id
    # on Python 2, so accept both text types.
    text_types = (str, type(u''))

    ids = []
    for item in playlist_contents:
        video_id = try_get(
            item, lambda x: x['playlistPanelVideoRenderer']['videoId'])
        if isinstance(video_id, text_types):
            ids.append(video_id)
    return ids

def _extract_mix(self, playlist_id):
# The mixes are generated from a single video
# the id of the playlist is just 'RD' + video_id
Expand All @@ -2778,6 +2788,13 @@ def _extract_mix(self, playlist_id):
r'''(?xs)data-video-username=".*?".*?
href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
webpage))

# If the HTML markup yielded no ids, fall back to the embedded ytInitialData JSON
if (len(new_ids) == 0):
yt_initial = self._get_yt_initial_data(playlist_id, webpage)
if yt_initial:
new_ids = self._extract_mix_ids_from_yt_initial(yt_initial)

# Fetch new pages until all the videos are repeated; it seems that
# there are always 51 unique videos.
new_ids = [_id for _id in new_ids if _id not in ids]
Expand Down

0 comments on commit 5b0a6a8

Please sign in to comment.