# web_scraper_v1.py — 49 lines (38 loc), 1.65 KB
# (GitHub page chrome and gutter line numbers removed from the scraped copy.)
import re
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
# Fallback statistics page (kept for reference; not used by the scraper below).
alternate_url = 'https://www.diamondleague.com/lists-results/statistics/'

# Folder for the downloaded result PDFs. os.makedirs(..., exist_ok=True)
# creates it atomically, avoiding the check-then-create race of the
# os.path.exists() + os.mkdir() pattern.
folder_name = 'results_pdfs'
os.makedirs(folder_name, exist_ok=True)
# initialize things I'll need globally
def DL_2023_web_scraper():
    """Download every 2023 Diamond League results PDF into ``folder_name``.

    Scrapes the 2023 results index page, finds every anchor whose text
    matches "<event> 2023: Results (PDF)", and saves each linked PDF as
    "<event>_2023_results.pdf" inside the module-level ``folder_name``
    directory.

    Raises:
        requests.HTTPError: if the index page itself cannot be fetched.
    """
    url = 'https://www.diamondleague.com/lists-results/2023-results/'
    # timeout prevents the script from hanging forever on a stalled server.
    response = requests.get(url, timeout=30)
    # Fail loudly if the index page is an error response instead of
    # silently parsing a 404/500 HTML body.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    pattern = re.compile(r'(.+?) 2023: Results \(PDF\)')
    # Find all anchor elements whose text matches the pattern.
    # 'string=' is the current bs4 keyword; 'text=' is deprecated.
    section_titles = soup.find_all('a', string=pattern)
    for title_element in section_titles:
        # Extract the section (event) title from the anchor text.
        section_title = pattern.search(title_element.text).group(1).lstrip()
        print(section_title)
        # Resolve the (possibly relative) PDF link against the index URL.
        pdf_link = title_element['href']
        full_pdf_url = urljoin(url, pdf_link)
        # Download the PDF file.
        pdf_response = requests.get(full_pdf_url, timeout=30)
        if pdf_response.status_code == 200:
            # Replace characters that are illegal in filenames (e.g. '/', ':')
            # so the title can safely be used as part of the output path.
            safe_title = re.sub(r'[\\/:*?"<>|]', '_', section_title)
            pdf_filename = os.path.join(folder_name, f'{safe_title}_2023_results.pdf')
            with open(pdf_filename, 'wb') as pdf_file:
                pdf_file.write(pdf_response.content)
            print(f'PDF for "{section_title}" downloaded and saved to "{pdf_filename}"')
        else:
            print(f'Failed to download the PDF for "{section_title}".')
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    DL_2023_web_scraper()