From 763ea86c98e3dcc2f797011514d21d4c257ce2ab Mon Sep 17 00:00:00 2001 From: Simon Meers Date: Mon, 15 Jul 2024 14:56:34 +1000 Subject: [PATCH] update Saskatchewan --- ca_sk/people.py | 113 +++++++++++++++++++++++------------------------- 1 file changed, 54 insertions(+), 59 deletions(-) diff --git a/ca_sk/people.py b/ca_sk/people.py index 6f46fe52..b6a584b7 100644 --- a/ca_sk/people.py +++ b/ca_sk/people.py @@ -1,3 +1,5 @@ +import re + from utils import CanadianPerson as Person from utils import CanadianScraper @@ -8,71 +10,64 @@ class SaskatchewanPersonScraper(CanadianScraper): def scrape(self): page = self.lxmlize(COUNCIL_PAGE) - members = page.xpath('//table[@id="MLAs"]//tr')[1:] + members = page.xpath('//table[@id="mla-table"]//tr')[1:] assert len(members), "No members found" for member in members: - if "Vacant" not in member.xpath("./td")[0].text_content(): - name = member.xpath("./td")[0].text_content().split(". ", 1)[1] - district = member.xpath("./td")[2].text_content() - url = member.xpath("./td[1]/a/@href")[0] - page = self.lxmlize(url) - party = page.xpath('//span[@id="ContentContainer_MainContent_ContentBottom_Property4"]' "/span")[ - 0 - ].text - - p = Person(primary_org="legislature", name=name, district=district, role="MLA", party=party) - p.add_source(COUNCIL_PAGE) - p.add_source(url) - try: - p.image = page.xpath('//div[contains(@class, "mla-image-cell")]/img/@src')[0] - except IndexError: - pass + if "Vacant" in member.xpath("./td")[1].text_content(): + continue + name = member.xpath("./td")[0].text_content().split(". ", 1)[1].strip() + district = member.xpath("./td")[2].text_content().strip() + url = member.xpath("./td[1]/a/@href")[0] + page = self.lxmlize(url) + party = page.xpath('//div[contains(@class, "mla-header")]')[0].text.split(' - ')[1].strip() - contact = page.xpath('//div[@id="mla-contact"]/div[2]')[0] - website = contact.xpath("./div[3]/div[3]/div[2]/a") - if website: - p.add_link(website[0].text_content()) + p = Person(primary_org="legislature", name=name, district=district, role="MLA", party=party) + p.add_source(COUNCIL_PAGE) + p.add_source(url) + try: + p.image = page.xpath('//div[contains(@class, "mla-image-cell")]/img/@src')[0] + except IndexError: + pass - def handle_address(lines, address_type): - address_lines = [] - for line in lines: - if line.endswith(":"): # Room:, Phone:, Fax: - break - address_lines.append(line) - if address_lines: - p.add_contact( - "address", - " ".join(address_lines), - address_type, - ) + def handle_address(lines, address_type): + address_lines = [] + for line in lines: + if re.match(r'(Room|Phone|Fax)\:', line): + break + address_lines.append(line) + if address_lines: + p.add_contact( + "address", + " ".join(address_lines), + address_type, + ) - def handle_phone(lines, phone_type): - if "Phone:" in lines: - next_line = lines[lines.index("Phone:") + 1] - if next_line.endswith(":"): - return - number = None - if "/" in next_line: - for fragment in next_line.split("/"): - if fragment.strip().startswith("306-"): - number = fragment.strip() - break - else: - number = next_line - p.add_contact("voice", number, phone_type, area_code=306) + def handle_phone(lines, phone_type): + matches = re.findall(r'Phone\:\s*(306-[\d\-]+)', '\n'.join(lines)) + if len(matches) == 1: + p.add_contact("voice", matches[0], phone_type, area_code=306) - legislature_lines = contact.xpath('.//div[@class="col-md-4"][1]/div//text()') - assert legislature_lines[0] == "Legislative Building Address" - handle_address(legislature_lines[1:], "legislature") - handle_phone(legislature_lines[1:], "legislature") + for address in page.xpath('//div[@class="col-md-3"]'): + lines = address.xpath('./div//text()') + address_type = None + if lines[0] == "Legislative Building Address": + address_type = "legislature" + elif lines[0] == "Constituency Address": + address_type = "constituency" + else: + raise AssertionError(f"Unexpected address type: {lines[0]}") + handle_address(lines[1:], address_type) + handle_phone(lines[1:], address_type) - constituency_lines = contact.xpath('.//div[@class="col-md-4"][2]/div//text()') - assert constituency_lines[0] == "Constituency Address" - handle_address(constituency_lines[1:], "constituency") - handle_phone(constituency_lines[1:], "constituency") + email = self.get_email(page.xpath('//div[@id="content"]')[0], error=False) + if email: + p.add_contact("email", email) - email = self.get_email(contact, error=False) - if email: - p.add_contact("email", email) + websites = re.findall( + r'Website:\s*(http\S+)', + ' '.join(page.xpath('//div[@class="col-md-4"]/div//text()')) + ) + if len(websites) == 1: + p.add_link(websites[0]) - yield p + yield p