From 6ba3826af1d9f2607db5400f85387c536168464b Mon Sep 17 00:00:00 2001 From: Samuel Pei Date: Fri, 8 Nov 2024 14:26:18 -0500 Subject: [PATCH] Re-implemented scraper using preceding-sibling --- ca_qc_sainte_anne_de_bellevue/people.py | 49 +++++++------------------ 1 file changed, 13 insertions(+), 36 deletions(-) diff --git a/ca_qc_sainte_anne_de_bellevue/people.py b/ca_qc_sainte_anne_de_bellevue/people.py index 5b6ba033..376a0baf 100644 --- a/ca_qc_sainte_anne_de_bellevue/people.py +++ b/ca_qc_sainte_anne_de_bellevue/people.py @@ -6,51 +6,28 @@ class SainteAnneDeBellevuePersonScraper(CanadianScraper): def scrape(self): - def decode_email(e): - de = "" - k = int(e[:2], 16) - - for i in range(2, len(e) - 1, 2): - de += chr(int(e[i : i + 2], 16) ^ k) - - return de - page = self.lxmlize(COUNCIL_PAGE) - councillors = page.xpath('//div[@class="col-md-12"]')[0] + councillors = page.xpath('//p[a[contains(@href, "@")]]') assert len(councillors), "No councillors found" - roles_and_districts = councillors.xpath(".//h2/text()") - roles = [] - districts = [] - names = [] - emails = [] + for councillor in councillors: + role = councillor.xpath("./preceding-sibling::h2[1]/text()")[0] - # Fill in roles and districts via h2 tags - for role in roles_and_districts: - role_and_district = role.split() - - roles.append(role_and_district[0]) - - if len(role_and_district) == 1: - districts.append("Sainte-Anne-de-Bellevue") + if role == "Maire": + district = "Sainte-Anne-de-Bellevue" else: - districts.append("District " + role_and_district[2]) + district = "District " + role.split()[2] + role = "Conseiller" - # Fill in contact info via p tags. - contact_info = councillors.xpath('.//p[a[contains(@href, "@")]]') - for contact in contact_info: - contact = contact.text_content().split() - name = " ".join(contact[:2]) - names.append(name) + councillor = councillor.text_content().split() - email = contact[3] + name = " ".join(councillor[:2]) + email = councillor[3] email = email.replace("Président", "") - emails.append(email) - assert len(roles) == len(districts) == len(names) == len(emails), "Lists are not of equal length" - for i in range(len(roles)): - p = Person(primary_org="legislature", name=names[i], district=districts[i], role=roles[i]) + p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) - p.add_contact("email", emails[i]) + p.add_contact("email", email) + yield p