fetch_videos.py
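"""Scrape the list of uploaded videos for a YouTube user or channel.

Parses the legacy /videos page markup with BeautifulSoup and follows the
"Load More" button's AJAX endpoint until every page has been fetched.
Results are cached to disk via the companion persists module.
"""
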
import json
import os
import time

import bs4
import requests

import persists as p

root_url = "https://www.youtube.com"

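# Interface assumed for the companion persists module, inferred from the
# calls below (the module itself is not shown in this file):
#   p.Author(author, channel)                 -> record with .author / .channel
#   p.Video(id, title, duration, views, age)  -> record for one video
#   p.cached_videos_for_author(author)        -> list of Videos, or None
#   p.store_videos_to_file(videos, author)    -> write the list to disk
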
def parse_video_div(div):
    """Extract one video's metadata from a "yt-lockup-video" div."""
    video_id = div.get("data-context-item-id", "")
    title = div.find("a", "yt-uix-tile-link").text
    first_li_tag = div.find("ul", "yt-lockup-meta-info").find("li")
    click_count = first_li_tag.text.replace("views", "").strip()
    time_string = first_li_tag.find_next_sibling("li").text.strip()
    # Alternative fields available in the same markup:
    # duration = div.find("span", "video-time").contents[0].text
    # views = int(div.find("ul", "yt-lockup-meta-info").contents[0].text.rstrip(" views").replace(",", ""))
    # img = div.find("img")
    # thumbnail = "http:" + img.get("src", "") if img else ""
    # return Video(video_id, title, duration, views, thumbnail)
    return p.Video(video_id, title, "", click_count, time_string)

def parse_videos_page(page):
    """Parse every video entry on one page of results."""
    video_divs = page.find_all("div", "yt-lockup-video")
    return [parse_video_div(div) for div in video_divs]

def find_load_more_url(page):
    """Return the "Load More" button's AJAX URL, or None on the last page."""
    for button in page.find_all("button"):
        url = button.get("data-uix-load-more-href")
        if url:
            return root_url + url
    return None

def download_page(url):
    """Fetch a URL, returning its body text or None on failure."""
    try:
        return requests.get(url).text
    except requests.RequestException as e:
        print("Error downloading url %s: %s" % (url, e))
        return None

def get_videos(username):
    """Return all videos for an Author, fetching and caching if needed."""
    cached_videos = p.cached_videos_for_author(username.author)
    if cached_videos is not None:
        print("Using cached videos")
        return cached_videos
    # Channel IDs and legacy usernames live under different URL prefixes.
    if username.channel is not None:
        page_url = "{0}/channel/{1}/videos".format(root_url, username.channel.strip())
    else:
        page_url = "{0}/user/{1}/videos".format(root_url, username.author)
    page = bs4.BeautifulSoup(download_page(page_url), "html.parser")
    videos = parse_videos_page(page)
    # Follow the "Load More" button until the last page; each AJAX response
    # is JSON carrying the next batch of markup under "content_html".
    page_url = find_load_more_url(page)
    while page_url:
        print("Fetching next page")
        page_response = download_page(page_url)
        if page_response is None:
            print("Download failed; sleeping before retrying")
            time.sleep(4)
            continue
        json_data = json.loads(page_response)
        page = bs4.BeautifulSoup(json_data.get("content_html", ""), "html.parser")
        videos.extend(parse_videos_page(page))
        page_url = find_load_more_url(
            bs4.BeautifulSoup(json_data.get("load_more_widget_html", ""), "html.parser"))
        time.sleep(1)  # be polite between page fetches
    p.store_videos_to_file(videos, username.author)
    return videos

def video_url(video_id):
    """Build a watch URL from a video id."""
    return root_url + "/watch?v=" + video_id

def video_dest(author, video):
    """Build a youtube-dl style output template for downloading one video.

    %(upload_date)s and %(ext)s are left for the downloader to fill in;
    slashes are stripped from the title so it stays a single path segment.
    """
    full_path = dir_path(author) + "/%(upload_date)s_" + video.title.replace("/", "") + ".%(ext)s"
    return full_path

def dir_path(author):
    """Return (creating it if needed) a per-author directory under the CWD."""
    root = os.getcwd()
    dir_path = root + "/" + author
    if not os.path.isdir(dir_path):
        os.mkdir(dir_path)
    return dir_path

if __name__ == "__main__":
    videos = get_videos(p.Author("FunForLouis", None))
    print(videos)
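# Usage: running this file directly fetches (or loads from cache) the video
# list for the FunForLouis channel and prints it. video_url() and video_dest()
# are presumably consumed by a separate download step (e.g. youtube-dl, which
# understands the %(upload_date)s and %(ext)s template fields); that step is
# not part of this file.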