-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtranscript_download.py
28 lines (24 loc) · 1.02 KB
/
transcript_download.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
from bs4 import BeautifulSoup, NavigableString
import requests
# Download one transcript page and write the text of selected child elements
# of its "entry-content" <div> to minnesota.txt, one element per line.
url = "https://transcriptedpodcasts.com/2019/06/30/minnesota-mysteries-podcast-episode-1/"

# Without a timeout the request can block forever on an unresponsive server.
r = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of scraping an error page.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')  # convert request to soup

# These tags are where the text is stored; change the tag/class as
# appropriate for the document.
content = soup.find('div', {'class': 'entry-content'})
if content is None:
    # soup.find() returns None when nothing matches; report that directly
    # rather than failing later with an AttributeError on None.
    raise RuntimeError("no <div class='entry-content'> found at " + url)

# Drop the link targets from every anchor inside the content block.
for link in content.find_all('a'):
    del link['href']

# Only direct children whose position lies strictly between these two bounds
# are written; per the original author the others hold advertisements.
# Adjust or remove the bounds for other pages.
SKIP_THROUGH_INDEX = 16
STOP_AT_INDEX = 200

# Explicit UTF-8 so non-ASCII transcript characters never depend on the
# platform's default encoding. Plain "w" suffices: the file is never read.
with open("minnesota.txt", "w", encoding="utf-8") as out_file:
    for ind, child in enumerate(content):
        # Bare strings between tags are skipped; only tag elements are used.
        if isinstance(child, NavigableString):
            continue
        if not SKIP_THROUGH_INDEX < ind < STOP_AT_INDEX:
            continue
        # Keep what follows the first ": " (presumably a speaker label --
        # TODO confirm against the page); text without ": " is kept whole.
        spoken = child.text.split(": ", 1)[-1]
        # Console trace of each written entry, split once more on ": ".
        print(spoken.split(": ", 1))
        out_file.write(spoken.strip() + "\n")
# print(content.text)
# for data in soup.findAll('div', {'class':'entry-content'}):
# print(data)
# print("\n***********\n")