-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnew yorker audio scraper.py
115 lines (88 loc) · 4.5 KB
/
new yorker audio scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
# Scrapes audio from New Yorker magazine articles: walks the article links on
# the magazine index page, opens each article's embedded player, opens the
# player's audio URL, then moves on to the next magazine.

# Not used anywhere below: webdriver.Chrome() is called without an explicit
# driver location.  NOTE(review): presumably chromedriver is resolved some
# other way (e.g. via PATH) -- confirm, or wire this path in.
DRIVER_PATH = 'C:/Users/Sean/Downloads/selenium-scraper/chromedriver_win32/'

INDEX_URL = 'https://www.newyorker.com/magazine'
ARTICLE_LINK_XPATH = "//a[.//h4]"
PLAYER_IFRAME_XPATH = '//iframe[@title="Embedded Frame"]'
AUDIO_ELEMENT_XPATH = "//video"
# NOTE(review): these look like build-generated class names (the trailing
# space is part of the attribute value being matched) and will break whenever
# the site is rebuilt.
NEXT_MAGAZINE_XPATH = (
    '//button[@class="Button__button___2vDCa Button__tertiary___1LRXQ "]'
)
MAGAZINE_LIMIT = 20  # the loop ends once this many "next magazine" steps ran
PAUSE_SECONDS = 2    # fixed wait used after every navigation step


def _step_back(driver, pages):
    """Go back `pages` entries in the browser history, pausing before each."""
    for _ in range(pages):
        time.sleep(PAUSE_SECONDS)
        driver.back()


def _download_article_audio(driver, article_url):
    """Open one article, follow its embedded player and open the audio URL.

    Before returning, steps back through the browser history: one step when
    only the article page was opened, two once the player page was opened too.
    NOTE(review): the two-step case assumes that opening the audio URL starts
    a download instead of adding a history entry -- confirm.

    Args:
        driver: the Selenium WebDriver currently showing the index page.
        article_url: absolute URL of the article to open.

    Returns:
        True if an audio URL was found and opened, False otherwise.
    """
    driver.get(article_url)
    time.sleep(PAUSE_SECONDS)
    # Scroll down and zoom out before looking for the player.  Presumably this
    # brings a lazily loaded embed into view -- TODO confirm.
    driver.execute_script("window.scrollTo(0, 450)")
    time.sleep(PAUSE_SECONDS)
    driver.execute_script("document.body.style.zoom='.4'")
    time.sleep(PAUSE_SECONDS)

    try:
        player = driver.find_element(By.XPATH, PLAYER_IFRAME_XPATH)
    except NoSuchElementException:
        print("player link not found :( moving back to index")
        driver.back()
        return False
    print("player found!")
    player_link = player.get_attribute("src")
    print(player_link)
    if not player_link:
        # An iframe without a src cannot be navigated to.
        print("player link not found :( moving back to index")
        driver.back()
        return False

    print("navigating to player link")
    driver.get(player_link)
    time.sleep(PAUSE_SECONDS)

    try:
        # The first <button> on the player page is clicked before the audio
        # element is looked up.
        driver.find_element(By.TAG_NAME, 'button').click()
    except NoSuchElementException:
        print("button could not be found :( moving back to index")
        _step_back(driver, 2)
        return False
    print("button found and clicked")
    time.sleep(PAUSE_SECONDS)

    # NOTE(review): the previous version derived a candidate URL from WNYC
    # player links (the text after "#file="), but it tested for the
    # misspelling "wync", so that branch never ran, and its result was
    # overwritten by the <video> src below in any case.  The dead store was
    # removed.  TODO: WNYC poem audio may need the "#file=" value instead.
    try:
        audio_element = driver.find_element(By.XPATH, AUDIO_ELEMENT_XPATH)
    except NoSuchElementException:
        print("audio element could not be found :( moving back to index")
        _step_back(driver, 2)
        return False
    print("audio element found")
    audio_link = audio_element.get_attribute("src")
    print("audio link is ", audio_link)
    if not audio_link:
        # A <video> without a src attribute gives nothing to open.
        print("audio element could not be found :( moving back to index")
        _step_back(driver, 2)
        return False

    driver.get(audio_link)
    print("sound file downloaded! :) moving back to index")
    _step_back(driver, 2)
    return True


def main():
    """Walk the magazine index article by article, then magazine by magazine.

    Starts a Chrome session, processes every article link found on the index
    page, clicks the "next magazine" button when the links are exhausted, and
    stops after MAGAZINE_LIMIT such steps.  The browser is always closed on
    the way out, including when an unexpected error ends the run early.
    """
    driver = webdriver.Chrome()
    # driver.set_window_size(1080, 800)  # optional fixed window size
    try:
        driver.get(INDEX_URL)
        magazine = 0
        current_article = 0
        articles = driver.find_elements(By.XPATH, ARTICLE_LINK_XPATH)
        print(articles)
        print("Article Element List Made")
        article_count = len(articles)
        print("Article Element List Count is ", article_count)

        while magazine < MAGAZINE_LIMIT:
            if current_article < article_count:
                print("currently on magazine ", (magazine + 1))
                time.sleep(PAUSE_SECONDS)
                # Query again on every pass: element handles obtained before
                # navigating away are no longer usable.
                articles = driver.find_elements(By.XPATH, ARTICLE_LINK_XPATH)
                article_count = len(articles)
                print("Article Element List Count is ", article_count)
                if current_article >= article_count:
                    # The refreshed list is shorter than expected.  Skip
                    # indexing; the next pass takes the next-magazine branch.
                    print("could not acquire list of articles or maybe grab the right href link")
                    continue

                article_link = articles[current_article].get_attribute("href")
                print("navigating to article number ", (current_article + 1))
                print(article_link)
                if article_link:
                    downloaded = _download_article_audio(driver, article_link)
                else:
                    # Anchor without an href: nothing to open, skip it.
                    print("could not acquire list of articles or maybe grab the right href link")
                    downloaded = False

                current_article += 1
                if downloaded:
                    print("Current Article Count has changed to ", (current_article + 1))
            else:
                magazine += 1
                print("navigating to next magazine")
                try:
                    driver.find_element(By.XPATH, NEXT_MAGAZINE_XPATH).click()
                except NoSuchElementException:
                    # The article index is left as it is, so the following
                    # passes retry this branch until the limit is reached.
                    print("could not acquire list of articles or maybe grab the right href link")
                else:
                    current_article = 0

        print("exited the loop")
    finally:
        # NOTE(review): quit() closes the browser right away.  If the last
        # download can still be in flight here, wait for it first.
        driver.quit()


if __name__ == "__main__":
    main()