From 17fd1735973de89068c86642e2441d220a495b8f Mon Sep 17 00:00:00 2001 From: Brighten Zhang Date: Thu, 31 Oct 2024 10:13:32 -0400 Subject: [PATCH 1/4] fix --- ca_on_guelph/people.py | 60 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 55 insertions(+), 5 deletions(-) diff --git a/ca_on_guelph/people.py b/ca_on_guelph/people.py index ef9488cd..3d7ac254 100644 --- a/ca_on_guelph/people.py +++ b/ca_on_guelph/people.py @@ -1,7 +1,57 @@ -from utils import CSVScraper +from utils import CanadianPerson as Person +from utils import CanadianScraper +COUNCIL_PAGE = "https://guelph.ca/city-hall/mayor-and-council/city-council/" +MAYOR_PAGE = "https://guelph.ca/city-hall/mayor-and-council/mayors-office/" -class GuelphPersonScraper(CSVScraper): - # http://data.open.guelph.ca/dataset/city-of-guelph-contacts - csv_url = "http://data.open.guelph.ca/datafiles/guelph-mayor-and-councillors-contact-information-2018-2022.csv" - many_posts_per_area = True + +class GuelphPersonScraper(CanadianScraper): + def scrape(self): + page = self.lxmlize(COUNCIL_PAGE) + + councillor_nodes = page.xpath('.//div[@class="thumbnail"]')[1:] + assert len(councillor_nodes), "No councillors found" + + for councillor_node in councillor_nodes: + ward_district = councillor_node.xpath(".//h2/text()")[0].split(" Councillors")[0] + district = ward_district.split(" ")[-1] + + councillors = councillor_node.xpath(".//div/div") + for councillor in councillors: + role_and_name = councillor.xpath(".//h3/text()") + if not role_and_name: + continue + + role_and_name = councillor.xpath(".//h3/text()")[0] + name, role = role_and_name.split(" ", 1) + contact_info = councillor.xpath(".//p/text()") + phone = contact_info[1].strip() + email = self.get_email(councillor) + if councillor.xpath(".//p/img/@src"): + image = councillor.xpath(".//p/img/@src")[0] + else: + image = councillor.xpath(".//div/img/@src")[0] + + p = Person(primary_org="legislature", name=name, district=district, role=role, image=image) + p.add_contact("email", email) + if phone: + p.add_contact("voice", phone, "legislature") + p.add_source(COUNCIL_PAGE) + + yield self.scrape_mayor(MAYOR_PAGE) + + def scrape_mayor(self, url): + page = self.lxmlize(url) + + mayor_node = page.xpath('.//div[@class="entry-content"]/p')[-1] + name = mayor_node.xpath(".//text()")[0].strip().split("Mayor ")[1] + phone = self.get_phone(mayor_node) + email = self.get_email(mayor_node) + image = mayor_node.xpath('//img[contains(@alt, "Mayor")]/@src')[0] + + p = Person(primary_org="legislature", name=name, district="Guelph", role="Mayor", image=image) + p.add_contact("voice", phone, "legislature") + p.add_contact("email", email) + p.add_source(MAYOR_PAGE) + + return p From 5d81fb1c72c907bb6974dac57bceec01cc94a505 Mon Sep 17 00:00:00 2001 From: Brighten Zhang Date: Thu, 31 Oct 2024 10:28:42 -0400 Subject: [PATCH 2/4] simplify find image, fix name role --- ca_on_guelph/people.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/ca_on_guelph/people.py b/ca_on_guelph/people.py index 3d7ac254..4dbd275b 100644 --- a/ca_on_guelph/people.py +++ b/ca_on_guelph/people.py @@ -23,14 +23,10 @@ def scrape(self): continue role_and_name = councillor.xpath(".//h3/text()")[0] - name, role = role_and_name.split(" ", 1) - contact_info = councillor.xpath(".//p/text()") - phone = contact_info[1].strip() + role, name = role_and_name.split(" ", 1) + phone = councillor.xpath(".//p/text()")[1].strip() email = self.get_email(councillor) - if councillor.xpath(".//p/img/@src"): - image = councillor.xpath(".//p/img/@src")[0] - else: - image = councillor.xpath(".//div/img/@src")[0] + image = councillor.xpath(".//img/@src")[0] p = Person(primary_org="legislature", name=name, district=district, role=role, image=image) p.add_contact("email", email) From b85cd512e4d18e416b04978fecff50ac492444d9 Mon Sep 17 00:00:00 2001 From: Brighten Zhang Date: Thu, 31 Oct 2024 10:37:29 -0400 Subject: [PATCH 3/4] include ward in district --- ca_on_guelph/people.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ca_on_guelph/people.py b/ca_on_guelph/people.py index 4dbd275b..a1debbbc 100644 --- a/ca_on_guelph/people.py +++ b/ca_on_guelph/people.py @@ -13,8 +13,7 @@ def scrape(self): assert len(councillor_nodes), "No councillors found" for councillor_node in councillor_nodes: - ward_district = councillor_node.xpath(".//h2/text()")[0].split(" Councillors")[0] - district = ward_district.split(" ")[-1] + district = councillor_node.xpath(".//h2/text()")[0].split("Councillors")[0].strip() councillors = councillor_node.xpath(".//div/div") for councillor in councillors: From 0b80560bd4ec7bd5852ff0305c570c09c71f626d Mon Sep 17 00:00:00 2001 From: Brighten Zhang Date: Mon, 4 Nov 2024 13:33:57 -0500 Subject: [PATCH 4/4] scrape correct csv file remove scraping webpage --- ca_on_guelph/people.py | 55 ++++-------------------------------------- 1 file changed, 5 insertions(+), 50 deletions(-) diff --git a/ca_on_guelph/people.py b/ca_on_guelph/people.py index a1debbbc..24eddf6e 100644 --- a/ca_on_guelph/people.py +++ b/ca_on_guelph/people.py @@ -1,52 +1,7 @@ -from utils import CanadianPerson as Person -from utils import CanadianScraper +from utils import CSVScraper -COUNCIL_PAGE = "https://guelph.ca/city-hall/mayor-and-council/city-council/" -MAYOR_PAGE = "https://guelph.ca/city-hall/mayor-and-council/mayors-office/" - -class GuelphPersonScraper(CanadianScraper): - def scrape(self): - page = self.lxmlize(COUNCIL_PAGE) - - councillor_nodes = page.xpath('.//div[@class="thumbnail"]')[1:] - assert len(councillor_nodes), "No councillors found" - - for councillor_node in councillor_nodes: - district = councillor_node.xpath(".//h2/text()")[0].split("Councillors")[0].strip() - - councillors = councillor_node.xpath(".//div/div") - for councillor in councillors: - role_and_name = councillor.xpath(".//h3/text()") - if not role_and_name: - continue - - role_and_name = councillor.xpath(".//h3/text()")[0] - role, name = role_and_name.split(" ", 1) - phone = councillor.xpath(".//p/text()")[1].strip() - email = self.get_email(councillor) - image = councillor.xpath(".//img/@src")[0] - - p = Person(primary_org="legislature", name=name, district=district, role=role, image=image) - p.add_contact("email", email) - if phone: - p.add_contact("voice", phone, "legislature") - p.add_source(COUNCIL_PAGE) - - yield self.scrape_mayor(MAYOR_PAGE) - - def scrape_mayor(self, url): - page = self.lxmlize(url) - - mayor_node = page.xpath('.//div[@class="entry-content"]/p')[-1] - name = mayor_node.xpath(".//text()")[0].strip().split("Mayor ")[1] - phone = self.get_phone(mayor_node) - email = self.get_email(mayor_node) - image = mayor_node.xpath('//img[contains(@alt, "Mayor")]/@src')[0] - - p = Person(primary_org="legislature", name=name, district="Guelph", role="Mayor", image=image) - p.add_contact("voice", phone, "legislature") - p.add_contact("email", email) - p.add_source(MAYOR_PAGE) - - return p +class GuelphPersonScraper(CSVScraper): + # https://explore.guelph.ca/documents/5ec8d85028c94e83be12a9f01d14eb7f/about + csv_url = "https://gismaps.guelph.ca/OpenData/guelph-city-council.csv" + many_posts_per_area = True