-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
38 lines (27 loc) · 1.09 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import random
import requests
from bs4 import BeautifulSoup
# Inline formatting tags that get_webpage unwraps: the tag markup is dropped
# while the content inside each tag is kept in place.
INVALID_TAGS = [
    'b',
    'i',
    'u',
    'a',
    'sup',
]
def get_webpage(url, timeout=10):
    """
    Download a webpage and reduce it to the plain text of its paragraphs.

    Only ``<p>`` elements are of interest (they hold the factoids). All markup
    nested inside a paragraph is removed so that just the text content remains.

    :param str url: The URL to fetch and parse.
    :param timeout: Seconds to wait for the server before giving up; passed
        straight through to ``requests.get``.
    :return: A list of strings, one per ``<p>`` element found in the page.
    :raises requests.RequestException: If the request fails, times out, or the
        server answers with an HTTP error status.
    """
    # Without a timeout requests will wait forever on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    # Do not harvest "facts" from a 4xx/5xx error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, features="html.parser")

    # Unwrap the known inline formatting tags, keeping their content in place.
    for tag in INVALID_TAGS:
        for match in soup.find_all(tag):
            match.unwrap()

    # get_text() flattens any nested tag that is still left (e.g. <span>, <br>).
    # Joining the raw children instead raises TypeError as soon as one of them
    # is a Tag rather than a string.
    return [paragraph.get_text() for paragraph in soup.find_all('p')]
def get_random_fact(url):
    """
    Get a random fact from the specified webpage.

    :param str url: The URL to fetch the facts from.
    :return str: A string containing a factoid from the specified webpage.
    :raises IndexError: If the page yields no facts to choose from.
    """
    facts = get_webpage(url)
    if not facts:
        raise IndexError("no facts found at {!r}".format(url))
    # random.randint includes its upper bound, so indexing with
    # randint(0, len(facts)) can run one past the end of the list;
    # random.choice picks uniformly from the valid indices only.
    return random.choice(facts)