Skip to content

Commit

Permalink
Re-implemented scraper using preceding-sibling
Browse files — browse the repository at this point in the history
  • Loading branch information
samJMA committed Nov 8, 2024
1 parent 9c476c8 commit 6ba3826
Showing 1 changed file with 13 additions and 36 deletions.
49 changes: 13 additions & 36 deletions ca_qc_sainte_anne_de_bellevue/people.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,51 +6,28 @@

class SainteAnneDeBellevuePersonScraper(CanadianScraper):
    """Scrape the council members listed on COUNCIL_PAGE."""

    def scrape(self):
        """Yield one ``Person`` per contact paragraph found on COUNCIL_PAGE.

        A contact paragraph is any ``<p>`` that has a child ``<a>`` whose
        ``href`` contains ``"@"``. The role is read from the nearest ``<h2>``
        that precedes the paragraph among its siblings.

        Yields:
            Person: with name, district, role, the page as source, and an
            email contact.

        Raises:
            AssertionError: if no contact paragraph is found on the page.
            IndexError: if a paragraph has no preceding ``<h2>`` text, or if
                the heading/paragraph text has fewer tokens than expected.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        councillors = page.xpath('//p[a[contains(@href, "@")]]')
        assert len(councillors), "No councillors found"

        for councillor in councillors:
            # [1] on a reverse axis selects the closest preceding <h2>.
            role = councillor.xpath("./preceding-sibling::h2[1]/text()")[0]

            if role == "Maire":
                district = "Sainte-Anne-de-Bellevue"
            else:
                # The district number is taken from the third
                # whitespace-separated token of the heading
                # (presumably "Conseiller district N" -- TODO confirm).
                district = "District " + role.split()[2]
                role = "Conseiller"

            # Whitespace-separated tokens of the paragraph text: the first two
            # are used as the name and the fourth as the email address.
            tokens = councillor.text_content().split()

            name = " ".join(tokens[:2])
            # NOTE(review): "Président" is stripped from the email token,
            # presumably because following text runs into the address without
            # a separating space -- confirm against the live page.
            email = tokens[3].replace("Président", "")

            p = Person(primary_org="legislature", name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_contact("email", email)

            yield p

0 comments on commit 6ba3826

Please sign in to comment.