From 3f6de35e382b23480d4bfb02521221a66922355c Mon Sep 17 00:00:00 2001 From: Brighten Zhang Date: Wed, 30 Oct 2024 12:06:26 -0400 Subject: [PATCH 01/21] fix --- ca_on_markham/people.py | 83 +++++++++++++++++++++++++++-------------- 1 file changed, 55 insertions(+), 28 deletions(-) diff --git a/ca_on_markham/people.py b/ca_on_markham/people.py index 2b01dfd7..cc20b976 100644 --- a/ca_on_markham/people.py +++ b/ca_on_markham/people.py @@ -3,10 +3,8 @@ from utils import CanadianPerson as Person from utils import CanadianScraper -COUNCIL_PAGE = ( - "https://www.markham.ca/wps/portal/home/about/city-hall/regional-ward-councillors/02-regional-ward-councillors" -) -MAYOR_PAGE = "https://www.markham.ca/wps/portal/home/about/city-hall/mayor/00-mayors-office" +COUNCIL_PAGE = "https://www.markham.ca/about-city-markham/city-hall/regional-ward-councillors" +MAYOR_PAGE = "https://www.markham.ca/about-city-markham/city-hall/mayors-office" class MarkhamPersonScraper(CanadianScraper): @@ -17,25 +15,19 @@ def scrape(self): yield self.scrape_mayor(MAYOR_PAGE) - councillors = page.xpath('//div[@class="col-sm-3 col-xs-6"]') + regional_councillors = page.xpath('//div[@class="grid md:grid-cols-2 grid-cols-1 lg:grid-cols-4 gap-4 scrollablec"]')[0] + ward_councillors = page.xpath('//div[@class="grid md:grid-cols-2 grid-cols-1 lg:grid-cols-4 gap-4 scrollablec"]')[1] + councillors = [regional_councillors, ward_councillors] assert len(councillors), "No councillors found" - for councillor in councillors: - name, district = councillor.xpath(".//h4/text()")[0].split(", ") - if "Ward" in district: - district = district.replace("Councillor", "").strip() - role = "Councillor" - elif "Regional" in district: - role = "Regional Councillor" - district = f"Markham (seat {regional_councillor_seat_number})" - regional_councillor_seat_number += 1 - else: - role = district - district = "Markham" + for i, councillor in enumerate(regional_councillors): + name = councillor.xpath(".//h3/text()")[0].strip() + district = councillor.xpath(".//p/text()")[0].strip() + role = "Regional Councillor" + district = f"Markham (seat {regional_councillor_seat_number})" + regional_councillor_seat_number += 1 image = councillor.xpath(".//img/@src")[0] - url = "https://www.markham.ca/wps/portal/home/about" + re.search( - r"(?<=about).*(?='\))", councillor.xpath(".//a/@href")[0] - ).group(0) + url = councillor.xpath(".//a/@href")[0] address, phone, email, links = self.get_contact(url) @@ -52,14 +44,48 @@ def scrape(self): p.add_link(link) yield p + + for i, councillor in enumerate(ward_councillors): + name = councillor.xpath(".//h3/text()")[0].strip() + district = councillor.xpath(".//p/text()")[0].strip() + district = district.replace("Councillor", "").strip() + role = "Councillor" + image = councillor.xpath(".//img/@src")[0] + url = councillor.xpath(".//a/@href")[0] + + address, phone, email, links = self.get_contact(url) + + p = Person(primary_org="legislature", name=name, district=district, role=role) + p.add_source(COUNCIL_PAGE) + p.add_source(url) + + p.image = image + p.add_contact("address", address, "legislature") + p.add_contact("voice", phone, "legislature") + p.add_contact("email", email) + + for link in links: + p.add_link(link) + + yield p + def get_contact(self, url): page = self.lxmlize(url) - contact_node = page.xpath('//div[@class="vcard col-sm-6"]')[0] + contact_node = page.xpath('//div[@class="pd-x-16 pd-y-32 bg-white committee-right-info-section layout__region layout__region--second"]')[0] links = [] - address = contact_node.xpath(".//p/text()")[:2] + if contact_node.xpath('.//span[@class="address-line1"]/text()'): + address = (contact_node.xpath('.//span[@class="address-line1"]/text()')[0] + + " " + contact_node.xpath('.//span[@class="locality"]/text()')[0] + + " " + contact_node.xpath('.//span[@class="administrative-area"]/text()')[0] + + " " + contact_node.xpath('.//span[@class="postal-code"]/text()')[0] + + " " + contact_node.xpath('.//span[@class="country"]/text()')[0]) + else: + contact_node = page.xpath('//div[@class="formatted-text field-content field-content--label--body field-content--entity-type--block-content field-content--name--body"]')[0] + address = contact_node.xpath('.//p/text()')[0] + " " + contact_node.xpath('.//p/text()')[1] + links = get_links(contact_node) phone = self.get_phone(contact_node) email = self.get_email(contact_node) @@ -68,12 +94,13 @@ def get_contact(self, url): def scrape_mayor(self, url): page = self.lxmlize(url) - name = page.xpath('//img/@alt[contains(., "Mayor")]')[0].split(", ", 1)[1] - email = self.get_email(page) - phone = self.get_phone(page) - + name = page.xpath('.//div[@class="formatted-text field-content field-content--label--body field-content--entity-type--block-content field-content--name--body"]/h1/span/span/text()')[0] + contact_node = page.xpath('.//div[@class="dept-contact-info--block"]')[0] + email = self.get_email(contact_node) + phone = self.get_phone(contact_node) + p = Person(primary_org="legislature", name=name, district="Markham", role="Mayor") - p.image = page.xpath('//img[contains(./@alt, "Mayor")]/@src')[0] + p.image = page.xpath('.//div[@class="align-right media--image"]/div/img/@src')[0] p.add_contact("email", email) p.add_contact("voice", phone, "legislature") p.add_source(url) @@ -86,6 +113,6 @@ def get_links(elem): links = elem.xpath(".//a") for link in links: link = link.attrib["href"] - if "http://www.markham.ca" not in link and "mail" not in link: + if "http://www.markham.ca" not in link and "mail" not in link and "tel" not in link: links_r.append(link) return links_r From 2c417f86f74e25a12a3d225d780cbc4c744dbe58 Mon Sep 17 00:00:00 2001 From: Brighten Zhang Date: Wed, 30 Oct 2024 15:05:49 -0400 Subject: [PATCH 02/21] cleanup --- ca_on_markham/people.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/ca_on_markham/people.py b/ca_on_markham/people.py index cc20b976..5f8ae577 100644 --- a/ca_on_markham/people.py +++ b/ca_on_markham/people.py @@ -1,5 +1,3 @@ -import re - from utils import CanadianPerson as Person from utils import CanadianScraper @@ -19,7 +17,7 @@ def scrape(self): ward_councillors = page.xpath('//div[@class="grid md:grid-cols-2 grid-cols-1 lg:grid-cols-4 gap-4 scrollablec"]')[1] councillors = [regional_councillors, ward_councillors] assert len(councillors), "No councillors found" - for i, councillor in enumerate(regional_councillors): + for councillor in regional_councillors: name = councillor.xpath(".//h3/text()")[0].strip() district = councillor.xpath(".//p/text()")[0].strip() role = "Regional Councillor" @@ -44,8 +42,8 @@ def scrape(self): p.add_link(link) yield p - - for i, councillor in enumerate(ward_councillors): + + for councillor in ward_councillors: name = councillor.xpath(".//h3/text()")[0].strip() district = councillor.xpath(".//p/text()")[0].strip() district = district.replace("Councillor", "").strip() @@ -69,7 +67,7 @@ def scrape(self): p.add_link(link) yield p - + def get_contact(self, url): page = self.lxmlize(url) @@ -85,7 +83,7 @@ def get_contact(self, url): else: contact_node = page.xpath('//div[@class="formatted-text field-content field-content--label--body field-content--entity-type--block-content field-content--name--body"]')[0] address = contact_node.xpath('.//p/text()')[0] + " " + contact_node.xpath('.//p/text()')[1] - + links = get_links(contact_node) phone = self.get_phone(contact_node) email = self.get_email(contact_node) From 5fe2b2175cf5aa46b0f50d580d80c0ac69dc51f7 Mon Sep 17 00:00:00 2001 From: Brighten Zhang Date: Wed, 30 Oct 2024 15:07:10 -0400 Subject: [PATCH 03/21] cleanup --- ca_on_markham/people.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ca_on_markham/people.py b/ca_on_markham/people.py index 5f8ae577..2def610d 100644 --- a/ca_on_markham/people.py +++ b/ca_on_markham/people.py @@ -67,7 +67,7 @@ def scrape(self): p.add_link(link) yield p - + def get_contact(self, url): page = self.lxmlize(url) @@ -96,7 +96,7 @@ def scrape_mayor(self, url): contact_node = page.xpath('.//div[@class="dept-contact-info--block"]')[0] email = self.get_email(contact_node) phone = self.get_phone(contact_node) - + p = Person(primary_org="legislature", name=name, district="Markham", role="Mayor") p.image = page.xpath('.//div[@class="align-right media--image"]/div/img/@src')[0] p.add_contact("email", email) From 8ea25dda32894d71b35a2d97447dfae8f136bb04 Mon Sep 17 00:00:00 2001 From: Brighten Zhang Date: Wed, 30 Oct 2024 14:48:01 -0400 Subject: [PATCH 04/21] fix --- ca_on_thunder_bay/people.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ca_on_thunder_bay/people.py b/ca_on_thunder_bay/people.py index 0c09dcac..86d56487 100644 --- a/ca_on_thunder_bay/people.py +++ b/ca_on_thunder_bay/people.py @@ -45,4 +45,4 @@ def scrape(self): def lxmlize(self, url, encoding=None, *, user_agent=DEFAULT_USER_AGENT, cookies=None, xml=False): requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += ":HIGH:!DH:!aNULL" # site uses a weak DH key - return super().lxmlize(url, encoding, user_agent, cookies, xml) + return super().lxmlize(url, encoding, user_agent=user_agent, cookies=cookies, xml=xml) From dc1fe25d24217e34c99ace0cf3b458bb7eb84bdd Mon Sep 17 00:00:00 2001 From: Samuel Pei Date: Tue, 29 Oct 2024 10:07:58 -0400 Subject: [PATCH 05/21] Fixed index list out of range --- ca_qc_dollard_des_ormeaux/people.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ca_qc_dollard_des_ormeaux/people.py b/ca_qc_dollard_des_ormeaux/people.py index ef11a5f8..74045a82 100644 --- a/ca_qc_dollard_des_ormeaux/people.py +++ b/ca_qc_dollard_des_ormeaux/people.py @@ -29,8 +29,10 @@ def scrape(self): p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) - p.image = councillor.xpath(".//@data-src")[0] - + img_path = councillor.xpath(".//@data-src") + if img_path: + p.image = img_path[0] + print(p.image) p.add_contact("email", email) p.add_contact("voice", general_phone, "legislature") p.add_contact("fax", general_fax, "legislature") From 44c9d68c98bf3278a0183f85f1f8a276f4613420 Mon Sep 17 00:00:00 2001 From: Samuel Pei Date: Tue, 29 Oct 2024 16:39:08 -0400 Subject: [PATCH 06/21] Removed unnecessary print statement --- ca_qc_dollard_des_ormeaux/people.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ca_qc_dollard_des_ormeaux/people.py b/ca_qc_dollard_des_ormeaux/people.py index 74045a82..f9b2ccda 100644 --- a/ca_qc_dollard_des_ormeaux/people.py +++ b/ca_qc_dollard_des_ormeaux/people.py @@ -32,7 +32,6 @@ def scrape(self): img_path = councillor.xpath(".//@data-src") if img_path: p.image = img_path[0] - print(p.image) p.add_contact("email", email) p.add_contact("voice", general_phone, "legislature") p.add_contact("fax", general_fax, "legislature") From 36e54c922ee10cf915a8ffecdfa8e9cf034053ea Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Wed, 30 Oct 2024 17:37:49 -0400 Subject: [PATCH 07/21] chore: Use consistent variable name --- ca_qc_dollard_des_ormeaux/people.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ca_qc_dollard_des_ormeaux/people.py b/ca_qc_dollard_des_ormeaux/people.py index f9b2ccda..1753f0ff 100644 --- a/ca_qc_dollard_des_ormeaux/people.py +++ b/ca_qc_dollard_des_ormeaux/people.py @@ -29,9 +29,9 @@ def scrape(self): p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) - img_path = councillor.xpath(".//@data-src") - if img_path: - p.image = img_path[0] + image = councillor.xpath(".//@data-src") + if image: + p.image = image[0] p.add_contact("email", email) p.add_contact("voice", general_phone, "legislature") p.add_contact("fax", general_fax, "legislature") From ed22c5a7e01e5e60bf2ab5228f292bb0e7244019 Mon Sep 17 00:00:00 2001 From: Brighten Zhang Date: Thu, 31 Oct 2024 10:45:53 -0400 Subject: [PATCH 08/21] fix --- ca_on_markham/people.py | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/ca_on_markham/people.py b/ca_on_markham/people.py index 2def610d..7f4384bd 100644 --- a/ca_on_markham/people.py +++ b/ca_on_markham/people.py @@ -13,8 +13,12 @@ def scrape(self): yield self.scrape_mayor(MAYOR_PAGE) - regional_councillors = page.xpath('//div[@class="grid md:grid-cols-2 grid-cols-1 lg:grid-cols-4 gap-4 scrollablec"]')[0] - ward_councillors = page.xpath('//div[@class="grid md:grid-cols-2 grid-cols-1 lg:grid-cols-4 gap-4 scrollablec"]')[1] + regional_councillors = page.xpath( + '//div[@class="grid md:grid-cols-2 grid-cols-1 lg:grid-cols-4 gap-4 scrollablec"]' + )[0] + ward_councillors = page.xpath( + '//div[@class="grid md:grid-cols-2 grid-cols-1 lg:grid-cols-4 gap-4 scrollablec"]' + )[1] councillors = [regional_councillors, ward_councillors] assert len(councillors), "No councillors found" for councillor in regional_councillors: @@ -71,18 +75,28 @@ def scrape(self): def get_contact(self, url): page = self.lxmlize(url) - contact_node = page.xpath('//div[@class="pd-x-16 pd-y-32 bg-white committee-right-info-section layout__region layout__region--second"]')[0] + contact_node = page.xpath( + '//div[@class="pd-x-16 pd-y-32 bg-white committee-right-info-section layout__region layout__region--second"]' + )[0] links = [] if contact_node.xpath('.//span[@class="address-line1"]/text()'): - address = (contact_node.xpath('.//span[@class="address-line1"]/text()')[0] - + " " + contact_node.xpath('.//span[@class="locality"]/text()')[0] - + " " + contact_node.xpath('.//span[@class="administrative-area"]/text()')[0] - + " " + contact_node.xpath('.//span[@class="postal-code"]/text()')[0] - + " " + contact_node.xpath('.//span[@class="country"]/text()')[0]) + address = ( + contact_node.xpath('.//span[@class="address-line1"]/text()')[0] + + " " + + contact_node.xpath('.//span[@class="locality"]/text()')[0] + + " " + + contact_node.xpath('.//span[@class="administrative-area"]/text()')[0] + + " " + + contact_node.xpath('.//span[@class="postal-code"]/text()')[0] + + " " + + contact_node.xpath('.//span[@class="country"]/text()')[0] + ) else: - contact_node = page.xpath('//div[@class="formatted-text field-content field-content--label--body field-content--entity-type--block-content field-content--name--body"]')[0] - address = contact_node.xpath('.//p/text()')[0] + " " + contact_node.xpath('.//p/text()')[1] + contact_node = page.xpath( + '//div[@class="formatted-text field-content field-content--label--body field-content--entity-type--block-content field-content--name--body"]' + )[0] + address = contact_node.xpath(".//p/text()")[0] + " " + contact_node.xpath(".//p/text()")[1] links = get_links(contact_node) phone = self.get_phone(contact_node) @@ -92,7 +106,9 @@ def get_contact(self, url): def scrape_mayor(self, url): page = self.lxmlize(url) - name = page.xpath('.//div[@class="formatted-text field-content field-content--label--body field-content--entity-type--block-content field-content--name--body"]/h1/span/span/text()')[0] + name = page.xpath( + './/div[@class="formatted-text field-content field-content--label--body field-content--entity-type--block-content field-content--name--body"]/h1/span/span/text()' + )[0] contact_node = page.xpath('.//div[@class="dept-contact-info--block"]')[0] email = self.get_email(contact_node) phone = self.get_phone(contact_node) From 5a348b393a05fd057ef5ae0d7912ff58f7190494 Mon Sep 17 00:00:00 2001 From: Brighten Zhang Date: Wed, 30 Oct 2024 15:37:21 -0400 Subject: [PATCH 09/21] cleanup --- ca_bc_victoria/people.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ca_bc_victoria/people.py b/ca_bc_victoria/people.py index 9796b6ca..12a70505 100644 --- a/ca_bc_victoria/people.py +++ b/ca_bc_victoria/people.py @@ -36,7 +36,10 @@ def scrape(self): '//ul[@class="menu menu--level-0"]//a[contains(., "Mayor") and not(contains(., "Council"))]/@href' )[0] page = self.lxmlize(mayor_url) - role, name = page.xpath("//h1/span")[0].text_content().split(" ", 1) + role = "Mayor" + role, name = page.xpath( + '//ul[@class="menu menu--level-0"]//a[contains(., "Mayor") and not(contains(., "Council"))]/text()' + )[0].split(" ", 1) photo = councillor.xpath('//div[@class="field__item"]/img/@src')[0] email = self.get_email(page) phone = self.get_phone(page) From c5517886c31c8ebef30ff60dc5032f32e61ddddd Mon Sep 17 00:00:00 2001 From: Brighten Zhang Date: Thu, 31 Oct 2024 10:53:45 -0400 Subject: [PATCH 10/21] remove --- ca_bc_victoria/people.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ca_bc_victoria/people.py b/ca_bc_victoria/people.py index 12a70505..8e35d5d9 100644 --- a/ca_bc_victoria/people.py +++ b/ca_bc_victoria/people.py @@ -36,7 +36,6 @@ def scrape(self): '//ul[@class="menu menu--level-0"]//a[contains(., "Mayor") and not(contains(., "Council"))]/@href' )[0] page = self.lxmlize(mayor_url) - role = "Mayor" role, name = page.xpath( '//ul[@class="menu menu--level-0"]//a[contains(., "Mayor") and not(contains(., "Council"))]/text()' )[0].split(" ", 1) From aca7b8474f12897def384a2fe78d0415dff85918 Mon Sep 17 00:00:00 2001 From: Brighten Zhang Date: Tue, 29 Oct 2024 10:23:10 -0400 Subject: [PATCH 11/21] undo me --- ca_on_wilmot/people.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ca_on_wilmot/people.py b/ca_on_wilmot/people.py index d9676cfe..68497e2c 100644 --- a/ca_on_wilmot/people.py +++ b/ca_on_wilmot/people.py @@ -1,14 +1,16 @@ from utils import CanadianPerson as Person from utils import CanadianScraper -COUNCIL_PAGE = "https://www.wilmot.ca/Modules/contact/search.aspx?s=EFHOVXSi8AOIMKMStZMNvAeQuAleQuAl" +COUNCIL_PAGE = "https://www.wilmot.ca/en/township-office/council.aspx" class WilmotPersonScraper(CanadianScraper): def scrape(self): page = self.lxmlize(COUNCIL_PAGE) - councillors = page.xpath('//table[@class="contactList"]//tr') + # councillors = page.xpath('//div[@id="StandardOneColumnTK1_lm1723651463356"]') + councillors = page.xpath('//div[contains(@class, "icrtAccordion")]') + print("councillors", councillors) assert len(councillors), "No councillors found" for councillor in councillors: name, role_district = councillor.xpath(".//button/text()")[0].split(" - ", 1) From 911613282882209c4e6854151e9f6c1cd72d14a0 Mon Sep 17 00:00:00 2001 From: Brighten Zhang Date: Wed, 30 Oct 2024 11:58:26 -0400 Subject: [PATCH 12/21] fix --- ca_on_wilmot/people.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/ca_on_wilmot/people.py b/ca_on_wilmot/people.py index 68497e2c..2d5ea96d 100644 --- a/ca_on_wilmot/people.py +++ b/ca_on_wilmot/people.py @@ -8,22 +8,28 @@ class WilmotPersonScraper(CanadianScraper): def scrape(self): page = self.lxmlize(COUNCIL_PAGE) - # councillors = page.xpath('//div[@id="StandardOneColumnTK1_lm1723651463356"]') - councillors = page.xpath('//div[contains(@class, "icrtAccordion")]') - print("councillors", councillors) + councillors = page.xpath('.//table[@class="icrtAccordion"]//tr') + councillors = parse_counsillors(councillors) assert len(councillors), "No councillors found" for councillor in councillors: - name, role_district = councillor.xpath(".//button/text()")[0].split(" - ", 1) - if "Mayor" in role_district: - yield scrape_mayor(councillor, name) - continue - role, district = role_district.split(" - ") - + roleAndName, contactInfo = councillor + try: + role, name = roleAndName.text_content().strip().split("—\xa0") + except: + role, name = roleAndName.text_content().strip().split("— ") + + if "Councillor" in role: + district = role.split(" Councillor")[0] + role = "Councillor" + else: + district = "Wilmot" + + phone = self.get_phone(contactInfo) + email = self.get_email(contactInfo) p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) - - phone = self.get_phone(councillor).replace("/", "") p.add_contact("voice", phone, "legislature") + p.add_contact("email", email) yield p @@ -39,3 +45,6 @@ def scrape_mayor(div, name): p.add_contact("voice", other_phone, "office") return p + +def parse_counsillors(councillors): + return [councillors[i:i + 2] for i in range(0, len(councillors), 2)] \ No newline at end of file From b185fdc38c5648579e0b80ea9c1d542e5a4f9bbe Mon Sep 17 00:00:00 2001 From: Brighten Zhang Date: Wed, 30 Oct 2024 15:19:23 -0400 Subject: [PATCH 13/21] cleanup --- ca_on_wilmot/people.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ca_on_wilmot/people.py b/ca_on_wilmot/people.py index 2d5ea96d..c9c16a7f 100644 --- a/ca_on_wilmot/people.py +++ b/ca_on_wilmot/people.py @@ -12,11 +12,11 @@ def scrape(self): councillors = parse_counsillors(councillors) assert len(councillors), "No councillors found" for councillor in councillors: - roleAndName, contactInfo = councillor - try: - role, name = roleAndName.text_content().strip().split("—\xa0") - except: - role, name = roleAndName.text_content().strip().split("— ") + role_name, contact_info = councillor + if "—\xa0" in role_name.text_content().strip(): + role, name = role_name.text_content().strip().split("—\xa0") + else: + role, name = role_name.text_content().strip().split("— ") if "Councillor" in role: district = role.split(" Councillor")[0] @@ -24,8 +24,8 @@ def scrape(self): else: district = "Wilmot" - phone = self.get_phone(contactInfo) - email = self.get_email(contactInfo) + phone = self.get_phone(contact_info) + email = self.get_email(contact_info) p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) p.add_contact("voice", phone, "legislature") @@ -47,4 +47,4 @@ def scrape_mayor(div, name): return p def parse_counsillors(councillors): - return [councillors[i:i + 2] for i in range(0, len(councillors), 2)] \ No newline at end of file + return [councillors[i:i + 2] for i in range(0, len(councillors), 2)] From 5d93ed684174f106a5b109f2094f07a640bed818 Mon Sep 17 00:00:00 2001 From: bzhangjma Date: Thu, 31 Oct 2024 10:47:25 -0400 Subject: [PATCH 14/21] Update ca_on_wilmot/people.py Simplify Co-authored-by: James McKinney <26463+jpmckinney@users.noreply.github.com> --- ca_on_wilmot/people.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ca_on_wilmot/people.py b/ca_on_wilmot/people.py index c9c16a7f..1b7828ca 100644 --- a/ca_on_wilmot/people.py +++ b/ca_on_wilmot/people.py @@ -13,10 +13,11 @@ def scrape(self): assert len(councillors), "No councillors found" for councillor in councillors: role_name, contact_info = councillor - if "—\xa0" in role_name.text_content().strip(): - role, name = role_name.text_content().strip().split("—\xa0") + role_name = role_name.text_content().strip() + if "—\xa0" in role_name: + role, name = role_name.split("—\xa0") else: - role, name = role_name.text_content().strip().split("— ") + role, name = role_name.split("— ") if "Councillor" in role: district = role.split(" Councillor")[0] From b2b178cee1a5f97cfcbff8b925bb29126fd80554 Mon Sep 17 00:00:00 2001 From: Brighten Zhang Date: Thu, 31 Oct 2024 10:51:15 -0400 Subject: [PATCH 15/21] trigger pre commit ci --- ca_on_wilmot/people.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ca_on_wilmot/people.py b/ca_on_wilmot/people.py index 1b7828ca..7045af15 100644 --- a/ca_on_wilmot/people.py +++ b/ca_on_wilmot/people.py @@ -47,5 +47,6 @@ def scrape_mayor(div, name): return p + def parse_counsillors(councillors): - return [councillors[i:i + 2] for i in range(0, len(councillors), 2)] + return [councillors[i : i + 2] for i in range(0, len(councillors), 2)] From 4f95e76de491aea0b0670e3ef289d6e24ae25de6 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Fri, 1 Nov 2024 12:11:11 -0400 Subject: [PATCH 16/21] chore: Tidy Wilmot --- ca_on_wilmot/people.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/ca_on_wilmot/people.py b/ca_on_wilmot/people.py index 7045af15..29797efe 100644 --- a/ca_on_wilmot/people.py +++ b/ca_on_wilmot/people.py @@ -9,15 +9,10 @@ def scrape(self): page = self.lxmlize(COUNCIL_PAGE) councillors = page.xpath('.//table[@class="icrtAccordion"]//tr') - councillors = parse_counsillors(councillors) assert len(councillors), "No councillors found" - for councillor in councillors: - role_name, contact_info = councillor - role_name = role_name.text_content().strip() - if "—\xa0" in role_name: - role, name = role_name.split("—\xa0") - else: - role, name = role_name.split("— ") + for i in range(0, len(councillors), 2): + role_name, contact_info = councillors[i], councillors[i + 1] + role, name = role_name.text_content().strip().replace("\xa0", " ").split("— ") if "Councillor" in role: district = role.split(" Councillor")[0] @@ -46,7 +41,3 @@ def scrape_mayor(div, name): p.add_contact("voice", other_phone, "office") return p - - -def parse_counsillors(councillors): - return [councillors[i : i + 2] for i in range(0, len(councillors), 2)] From b6dc6a89c497406ed8de599b31c462a5462f1e5d Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Fri, 1 Nov 2024 12:11:35 -0400 Subject: [PATCH 17/21] build: Add .python-version --- .python-version | 1 + 1 file changed, 1 insertion(+) create mode 100644 .python-version diff --git a/.python-version b/.python-version new file mode 100644 index 00000000..bd28b9c5 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.9 From 361c1e3c118914d1f05f1cc835a0762c3d0cc9a6 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Fri, 1 Nov 2024 12:17:37 -0400 Subject: [PATCH 18/21] ca_on_wilmot: Remove unused scrapy_mayor function --- ca_on_wilmot/people.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/ca_on_wilmot/people.py b/ca_on_wilmot/people.py index 29797efe..d4ac6f1c 100644 --- a/ca_on_wilmot/people.py +++ b/ca_on_wilmot/people.py @@ -8,15 +8,17 @@ class WilmotPersonScraper(CanadianScraper): def scrape(self): page = self.lxmlize(COUNCIL_PAGE) - councillors = page.xpath('.//table[@class="icrtAccordion"]//tr') + councillors = page.xpath('//table[@class="icrtAccordion"]//tr') assert len(councillors), "No councillors found" for i in range(0, len(councillors), 2): role_name, contact_info = councillors[i], councillors[i + 1] role, name = role_name.text_content().strip().replace("\xa0", " ").split("— ") + # "Ward 1 Councillor" if "Councillor" in role: district = role.split(" Councillor")[0] role = "Councillor" + # "Mayor", "Executive Officer to the Mayor and Council" else: district = "Wilmot" @@ -27,17 +29,3 @@ def scrape(self): p.add_contact("voice", phone, "legislature") p.add_contact("email", email) yield p - - -def scrape_mayor(div, name): - p = Person(primary_org="legislature", name=name, district="Wilmot", role="Mayor") - p.add_source(COUNCIL_PAGE) - - address = div.xpath('.//div[@class="contactListAddress"]')[0].text_content() - phone = div.xpath('.//div[@class="contactListMainNumber"]/a/text()')[0] - other_phone = div.xpath('.//div[@class="contactListPhNumber"]/a/text()')[0] - p.add_contact("address", address, "legislature") - p.add_contact("voice", phone, "legislature") - p.add_contact("voice", other_phone, "office") - - return p From 4853a958c7ebc5dd87b602e290334e9763fd6604 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Fri, 1 Nov 2024 12:25:53 -0400 Subject: [PATCH 19/21] ca_on_markham: Reduce repetition --- ca_on_markham/people.py | 81 ++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 53 deletions(-) diff --git a/ca_on_markham/people.py b/ca_on_markham/people.py index 7f4384bd..8fe1e105 100644 --- a/ca_on_markham/people.py +++ b/ca_on_markham/people.py @@ -7,70 +7,45 @@ class MarkhamPersonScraper(CanadianScraper): def scrape(self): - regional_councillor_seat_number = 1 - - page = self.lxmlize(COUNCIL_PAGE) - yield self.scrape_mayor(MAYOR_PAGE) - regional_councillors = page.xpath( + groups = self.lxmlize(COUNCIL_PAGE).xpath( '//div[@class="grid md:grid-cols-2 grid-cols-1 lg:grid-cols-4 gap-4 scrollablec"]' - )[0] - ward_councillors = page.xpath( - '//div[@class="grid md:grid-cols-2 grid-cols-1 lg:grid-cols-4 gap-4 scrollablec"]' - )[1] - councillors = [regional_councillors, ward_councillors] - assert len(councillors), "No councillors found" - for councillor in regional_councillors: - name = councillor.xpath(".//h3/text()")[0].strip() - district = councillor.xpath(".//p/text()")[0].strip() - role = "Regional Councillor" - district = f"Markham (seat {regional_councillor_seat_number})" - regional_councillor_seat_number += 1 - - image = councillor.xpath(".//img/@src")[0] - url = councillor.xpath(".//a/@href")[0] + ) + assert len(groups) == 2, "No councillors found" - address, phone, email, links = self.get_contact(url) - - p = Person(primary_org="legislature", name=name, district=district, role=role) - p.add_source(COUNCIL_PAGE) - p.add_source(url) - - p.image = image - p.add_contact("address", address, "legislature") - p.add_contact("voice", phone, "legislature") - p.add_contact("email", email) - - for link in links: - p.add_link(link) - - yield p + regional_councillor_seat_number = 1 + for i, group in enumerate(groups): + for councillor in group: + name = councillor.xpath(".//h3/text()")[0].strip() + district = councillor.xpath(".//p/text()")[0].strip() - for councillor in ward_councillors: - name = councillor.xpath(".//h3/text()")[0].strip() - district = councillor.xpath(".//p/text()")[0].strip() - district = district.replace("Councillor", "").strip() - role = "Councillor" + if i == 0: + role = "Regional Councillor" + district = f"Markham (seat {regional_councillor_seat_number})" + regional_councillor_seat_number += 1 + else: + role = "Councillor" + district = district.replace("Councillor", "").strip() - image = councillor.xpath(".//img/@src")[0] - url = councillor.xpath(".//a/@href")[0] + image = councillor.xpath(".//img/@src")[0] + url = councillor.xpath(".//a/@href")[0] - address, phone, email, links = self.get_contact(url) + address, phone, email, links = self.get_contact(url) - p = Person(primary_org="legislature", name=name, district=district, role=role) - p.add_source(COUNCIL_PAGE) - p.add_source(url) + p = Person(primary_org="legislature", name=name, district=district, role=role) + p.add_source(COUNCIL_PAGE) + p.add_source(url) - p.image = image - p.add_contact("address", address, "legislature") - p.add_contact("voice", phone, "legislature") - p.add_contact("email", email) + p.image = image + p.add_contact("address", address, "legislature") + p.add_contact("voice", phone, "legislature") + p.add_contact("email", email) - for link in links: - p.add_link(link) + for link in links: + p.add_link(link) - yield p + yield p def get_contact(self, url): page = self.lxmlize(url) From 746369d037c5e4aef7ac07fb0a9eb5b53a714787 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Fri, 1 Nov 2024 12:37:18 -0400 Subject: [PATCH 20/21] ca_on_markham: Align with original code --- ca_on_markham/people.py | 81 +++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 39 deletions(-) diff --git a/ca_on_markham/people.py b/ca_on_markham/people.py index 8fe1e105..276f91f0 100644 --- a/ca_on_markham/people.py +++ b/ca_on_markham/people.py @@ -7,45 +7,50 @@ class MarkhamPersonScraper(CanadianScraper): def scrape(self): + regional_councillor_seat_number = 1 + + page = self.lxmlize(COUNCIL_PAGE) + yield self.scrape_mayor(MAYOR_PAGE) - groups = self.lxmlize(COUNCIL_PAGE).xpath( - '//div[@class="grid md:grid-cols-2 grid-cols-1 lg:grid-cols-4 gap-4 scrollablec"]' + councillors = page.xpath( + '//div[@class="grid md:grid-cols-2 grid-cols-1 lg:grid-cols-4 gap-4 scrollablec"]/div' ) - assert len(groups) == 2, "No councillors found" + assert len(councillors), "No councillors found" - regional_councillor_seat_number = 1 - for i, group in enumerate(groups): - for councillor in group: - name = councillor.xpath(".//h3/text()")[0].strip() - district = councillor.xpath(".//p/text()")[0].strip() + for councillor in councillors: + name = councillor.xpath(".//h3/text()")[0].strip() + district = councillor.xpath(".//p/text()")[0].strip() - if i == 0: - role = "Regional Councillor" - district = f"Markham (seat {regional_councillor_seat_number})" - regional_councillor_seat_number += 1 - else: - role = "Councillor" - district = district.replace("Councillor", "").strip() + if "Ward" in district: + district = district.replace("Councillor", "").strip() + role = "Councillor" + elif "Regional" in district: + role = "Regional Councillor" + district = f"Markham (seat {regional_councillor_seat_number})" + regional_councillor_seat_number += 1 + else: + role = district + district = "Markham" - image = councillor.xpath(".//img/@src")[0] - url = councillor.xpath(".//a/@href")[0] + image = councillor.xpath(".//img/@src")[0] + url = councillor.xpath(".//a/@href")[0] - address, phone, email, links = self.get_contact(url) + address, phone, email, links = self.get_contact(url) - p = Person(primary_org="legislature", name=name, district=district, role=role) - p.add_source(COUNCIL_PAGE) - p.add_source(url) + p = Person(primary_org="legislature", name=name, district=district, role=role) + p.add_source(COUNCIL_PAGE) + p.add_source(url) - p.image = image - p.add_contact("address", address, "legislature") - p.add_contact("voice", phone, "legislature") - p.add_contact("email", email) + p.image = image + p.add_contact("address", address, "legislature") + p.add_contact("voice", phone, "legislature") + p.add_contact("email", email) - for link in links: - p.add_link(link) + for link in links: + p.add_link(link) - yield p + yield p def get_contact(self, url): page = self.lxmlize(url) @@ -56,22 +61,20 @@ def get_contact(self, url): links = [] if contact_node.xpath('.//span[@class="address-line1"]/text()'): - address = ( - contact_node.xpath('.//span[@class="address-line1"]/text()')[0] - + " " - + contact_node.xpath('.//span[@class="locality"]/text()')[0] - + " " - + contact_node.xpath('.//span[@class="administrative-area"]/text()')[0] - + " " - + contact_node.xpath('.//span[@class="postal-code"]/text()')[0] - + " " - + contact_node.xpath('.//span[@class="country"]/text()')[0] + address = " ".join( + ( + contact_node.xpath('.//span[@class="address-line1"]/text()')[0], + contact_node.xpath('.//span[@class="locality"]/text()')[0], + contact_node.xpath('.//span[@class="administrative-area"]/text()')[0], + contact_node.xpath('.//span[@class="postal-code"]/text()')[0], + contact_node.xpath('.//span[@class="country"]/text()')[0], + ) ) else: contact_node = page.xpath( '//div[@class="formatted-text field-content field-content--label--body field-content--entity-type--block-content field-content--name--body"]' )[0] - address = contact_node.xpath(".//p/text()")[0] + " " + contact_node.xpath(".//p/text()")[1] + address = f'{contact_node.xpath(".//p/text()")[0]} {contact_node.xpath(".//p/text()")[1]}' links = get_links(contact_node) phone = self.get_phone(contact_node) From b088f711986f0b3131d8dba6bd7e1f33f6d54566 Mon Sep 17 00:00:00 2001 From: bzhangjma Date: Mon, 4 Nov 2024 14:33:15 -0500 Subject: [PATCH 21/21] Fix Guelph scraper, update csv file url (#444) * fix * simplify find image, fix name role * include ward in district * scrape correct csv file remove scraping webpage --- ca_on_guelph/people.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ca_on_guelph/people.py b/ca_on_guelph/people.py index ef9488cd..24eddf6e 100644 --- a/ca_on_guelph/people.py +++ b/ca_on_guelph/people.py @@ -2,6 +2,6 @@ class GuelphPersonScraper(CSVScraper): - # http://data.open.guelph.ca/dataset/city-of-guelph-contacts - csv_url = "http://data.open.guelph.ca/datafiles/guelph-mayor-and-councillors-contact-information-2018-2022.csv" + # https://explore.guelph.ca/documents/5ec8d85028c94e83be12a9f01d14eb7f/about + csv_url = "https://gismaps.guelph.ca/OpenData/guelph-city-council.csv" many_posts_per_area = True