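"""Scrape Cherwell District Council's 2024/25 council tax charges.

Collects the detail-page links from the council's directory page, then
visits each one and writes the per-band charges to cherwell_council_tax.csv.
"""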
import requests
from bs4 import BeautifulSoup
import csv
import time
from urllib.parse import urljoin
# Define the URL of the list page
list_page_url = 'https://www.cherwell.gov.uk/directory/149/council-tax-charges-202425'
# Send a GET request to fetch the list page (with a timeout so a stalled
# connection cannot hang the script indefinitely)
response = requests.get(list_page_url, timeout=30)
response.raise_for_status()  # Raise an error if the request failed
# Parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')
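# (html.parser ships with Python, so no extra parser dependency is needed;
# lxml is a faster drop-in alternative if it happens to be installed.)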
# Find the <ul> tag that contains all the detail-page links
ul = soup.find('ul', {'class': 'list list--rich list--group-col2'})
if ul is None:
    raise RuntimeError('Link list not found; the page layout may have changed')
# Extract names and URLs
records = []
for li in ul.find_all('li', {'class': 'list__item'}):
    a_tag = li.find('a', {'class': 'list__link'})
    if a_tag is None:
        continue  # Skip list items without a link
    name = a_tag.text.strip()
    href = a_tag['href']
    url = urljoin('https://www.cherwell.gov.uk', href)
    records.append({'name': name, 'url': url})
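# Each record is now a dict of the form (illustrative values only):
#   {'name': 'Adderbury', 'url': 'https://www.cherwell.gov.uk/directory-record/...'}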
# Define the list of field names (CSV column names)
fieldnames = ['name', 'Band A (6/9)', 'Band B (7/9)',
'Band C (8/9)', 'Band D (9/9)', 'Band E (11/9)', 'Band F (13/9)',
'Band G (15/9)', 'Band H (18/9)', 'Council']
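# The fractions in the column names are the statutory band multipliers:
# every band is charged a fixed number of ninths of the Band D (9/9)
# charge, from 6/9 for Band A up to 18/9 for Band H.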
# Open the CSV file for writing
with open('cherwell_council_tax.csv', 'w', newline='', encoding='utf-8') as csvfile:
    # extrasaction='ignore' drops any unexpected keys instead of raising
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, extrasaction='ignore')
    writer.writeheader()

    # Iterate over each record and visit its detail page
    for index, record in enumerate(records):
        detail_page_url = record['url']
        print(f"Processing {index + 1}/{len(records)}: {record['name']}")
        try:
            # Send a GET request to fetch the detail page
            response = requests.get(detail_page_url, timeout=30)
            response.raise_for_status()

            # Parse the HTML content of the detail page
            detail_soup = BeautifulSoup(response.content, 'html.parser')

            # Find the <dl> definition list holding the band charges
            dl = detail_soup.find('dl', {'class': 'list list--definition'})
            if not dl:
                print(f"Definition list not found, skipping {record['name']}")
                continue

            # Extract key-value pairs from the definition list
            dt_tags = dl.find_all('dt', {'class': 'list--definition__heading'})
            dd_tags = dl.find_all('dd', {'class': 'list--definition__content'})
            for dt, dd in zip(dt_tags, dd_tags):
                key = dt.text.strip()
                value = dd.text.strip()
                # Only keep the 'Band' fields
                if 'Band' in key:
                    record[key] = value

            # Add the 'Council' field
            record['Council'] = 1

            # Remove fields that are not CSV columns
            record.pop('url', None)
            record.pop('Location', None)
            record.pop('Information', None)

            # Write the record to the CSV file
            writer.writerow(record)
        except requests.exceptions.RequestException as e:
            print(f"Request failed for {record['name']}: {e}")

        # Wait 1 second between requests to be polite to the server
        time.sleep(1)
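# To run (a sketch, assuming the dependencies are installed, e.g. via
# `pip install requests beautifulsoup4`):
#   python cherwell_council_tax.py
# Progress is printed per area, and the results land in
# cherwell_council_tax.csv next to the script.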