-
Notifications
You must be signed in to change notification settings - Fork 1
/
_ghost_sbs_crawler.py
111 lines (93 loc) · 3.86 KB
/
_ghost_sbs_crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
import urllib.request
from urllib.parse import urlparse, parse_qs
import time
import glob
# Target page: the episode list that the crawler paginates through.
url = "https://programs.sbs.co.kr/radio/sghost/gorealrapod/56929"
# Single source of truth for where mp3 files live. The same absolute path is
# used for Chrome's download preference and for the scan of existing files,
# so duplicate detection does not depend on the current working directory.
download_dir = r"/Users/noopy/ghoststation_transcript/downloadedmp3"
# Chrome options: download silently into download_dir and run headless.
options = webdriver.ChromeOptions()
options.add_experimental_option(
    "prefs",
    {
        "download.default_directory": download_dir,
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": True,
    },
)
options.add_argument("headless")
# ``options=`` replaces the deprecated ``chrome_options=`` keyword.
driver = webdriver.Chrome(r"/Applications/chromedriver", options=options)
# Open the target page with the driver.
driver.get(url)
# Collect the mp3 files that already exist in the download directory; the
# crawl loop checks new file names against this list before downloading.
mp3_file_list = glob.glob(download_dir + "/*.mp3")
print(mp3_file_list)
# Walk the paginated episode list and save every mp3 that is not on disk yet.
# The browser is always shut down afterwards, even when a download fails.
page_count = 1  # number of the page currently displayed; the list starts at 1
try:
    while True:
        # Each download button on the current page sits in one such wrapper.
        divs = driver.find_elements_by_class_name("podcast_btn_w")
        for div in divs:
            radio_mp3_link = div.find_element_by_css_selector("a").get_attribute(
                "href"
            )
            if not radio_mp3_link:
                # Anchor without an href: nothing to download.
                continue
            # The real mp3 address travels percent-encoded in the ``fileUrl``
            # query parameter of the download link; the last path segment of
            # that address is the file name.
            file_urls = parse_qs(urlparse(radio_mp3_link).query).get("fileUrl")
            if not file_urls:
                print("no fileUrl in " + radio_mp3_link + ", skipped")
                continue
            file_name = urlparse(file_urls[0]).path.rsplit("/", 1)[-1]
            # Drop the "podcast-v" prefix so that names stay identical to the
            # ones used for files saved by earlier runs of this script.
            prefix = "podcast-v"
            if file_name.startswith(prefix):
                file_name = file_name[len(prefix):]
            if not file_name:
                print("no file name in " + radio_mp3_link + ", skipped")
                continue
            # If the mp3 file already exists in the folder, don't download it.
            if any(file_name in s for s in mp3_file_list):
                print(file_name + " already exists")
                continue
            # Read the complete response before opening the target file, so an
            # interrupted transfer never leaves a truncated mp3 behind that a
            # later run would mistake for a finished download.
            with urllib.request.urlopen(radio_mp3_link) as resp:
                mp3_bytes = resp.read()
            target_path = (
                "/Users/noopy/ghoststation_transcript/downloadedmp3/" + file_name
            )
            with open(target_path, "wb") as binfile:
                binfile.write(mp3_bytes)
            # Remember the new file so it is not fetched twice in one run.
            mp3_file_list.append(target_path)
            print(file_name + " is downloaded")
        # Paginate. A missing pagination element means there is nothing left
        # to visit, so it ends the crawl instead of raising.
        try:
            # Page links come in bundles of 10; after the last page of a
            # bundle, the "next" button has to be clicked to get the next one.
            if page_count % 10 == 0:
                driver.find_element_by_id(
                    "program-front-radio-pagination-next"
                ).click()
                time.sleep(3)
            # Click "2" after the first page, "3" after the second, ...
            page_count += 1
            driver.find_element_by_id(
                f"program-front-radio-pagination-page-{page_count}"
            ).click()
            # Wait for the page to load in order to prevent stale element errors.
            time.sleep(3)
        except NoSuchElementException:
            # Stop the loop when no more pages are available.
            break
finally:
    # Close the headless browser and end the chromedriver session.
    driver.quit()
"""
# tag structure
<a href="/radio/sghost/episodedownload?fileUrl=http%3A%2F%2Fpodcastdown.sbs.co.kr%2Fpowerfm%2F2018%2F12%2Fpodcast-v2000010307-20181228-549.mp3%3Fvod_id%3DV2000010307%26podcast_id%3DP0000000579" class="podcast_btn_download" title="๋ค์ด๋ก๋" download="549ํ ๋ด๋ฐฐ ํ์คํ๊ฒ ๋๋ ๋ฐฉ๋ฒ, 007์ ๋ํด์, ๋ฏธ๊ตญ๋ณด๋ค ๋ถํ์ด ๋ ๋๋น ">
<div class="podcast_btn_inner">
<span class="prog_icn icon_download"><i class="ir">๋ค์ด๋ก๋์์ด์ฝ</i></span>
<span class="btn_text">๋ค์ด๋ก๋</span>
</div>
</a>
"""
# Alternative download approaches that were tried and rejected:
# Downloading with urllib.request.urlretrieve -> too slow
# urllib.request.urlretrieve(radio_mp3_link)
# Downloading with a FancyURLopener and retrieve -> not working
# test=urllib.request.urlopen() did not work here; it was noted as being for Python 2.7
# test=urllib.request.FancyURLopener()
# test.retrieve(radio_mp3_link,file_name)