Skip to content

Commit

Permalink
update Saskatchewan
Browse files Browse the repository at this point in the history
  • Loading branch information
DrMeers committed Jul 15, 2024
1 parent 2b4a9f8 commit 763ea86
Showing 1 changed file with 54 additions and 59 deletions.
113 changes: 54 additions & 59 deletions ca_sk/people.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import re

from utils import CanadianPerson as Person
from utils import CanadianScraper

Expand All @@ -8,71 +10,64 @@ class SaskatchewanPersonScraper(CanadianScraper):
# Scrape the member table on COUNCIL_PAGE and yield one Person per row that is
# not marked "Vacant".
# NOTE(review): this listing is a rendered commit diff whose +/- markers and
# indentation were lost, so the removed and the added version of every changed
# run appear back to back. The "old"/"new" labels below are inferred from the
# duplicated statements and the addition/deletion counts in the page header;
# confirm against the real file before relying on them.
def scrape(self):
# lxmlize is defined outside this view; presumably it fetches and parses the page.
page = self.lxmlize(COUNCIL_PAGE)

# old: rows of the table with id "MLAs"
members = page.xpath('//table[@id="MLAs"]//tr')[1:]
# new: rows of the table with id "mla-table"
# The [1:] slice drops the first <tr> (presumably the header row - confirm).
members = page.xpath('//table[@id="mla-table"]//tr')[1:]
assert len(members), "No members found"
for member in members:
# old: vacancy is tested on the first cell and the row is processed under the `if`.
if "Vacant" not in member.xpath("./td")[0].text_content():
# Keeps the text after the first ". " (presumably dropping an honorific - confirm).
name = member.xpath("./td")[0].text_content().split(". ", 1)[1]
district = member.xpath("./td")[2].text_content()
url = member.xpath("./td[1]/a/@href")[0]
# Rebinds `page` to the member's own page; the listing rows are already in `members`.
page = self.lxmlize(url)
# old: party is the text of a span nested under a fixed element id.
party = page.xpath('//span[@id="ContentContainer_MainContent_ContentBottom_Property4"]' "/span")[
0
].text

p = Person(primary_org="legislature", name=name, district=district, role="MLA", party=party)
p.add_source(COUNCIL_PAGE)
p.add_source(url)
# The image is optional: a missing <img> raises IndexError, which is ignored.
try:
p.image = page.xpath('//div[contains(@class, "mla-image-cell")]/img/@src')[0]
except IndexError:
pass
# new: vacancy is tested on the second cell and vacant rows are skipped early.
if "Vacant" in member.xpath("./td")[1].text_content():
continue
# new: same extraction as the old side, with surrounding whitespace stripped.
name = member.xpath("./td")[0].text_content().split(". ", 1)[1].strip()
district = member.xpath("./td")[2].text_content().strip()
url = member.xpath("./td[1]/a/@href")[0]
# Rebinds `page` to the member's own page; the listing rows are already in `members`.
page = self.lxmlize(url)
# new: party is the part after " - " in the text of the first "mla-header" div.
party = page.xpath('//div[contains(@class, "mla-header")]')[0].text.split(' - ')[1].strip()

# old: contact container and optional website link read from fixed div positions.
contact = page.xpath('//div[@id="mla-contact"]/div[2]')[0]
website = contact.xpath("./div[3]/div[3]/div[2]/a")
if website:
p.add_link(website[0].text_content())
# new: Person construction, sources and optional image, as on the old side.
p = Person(primary_org="legislature", name=name, district=district, role="MLA", party=party)
p.add_source(COUNCIL_PAGE)
p.add_source(url)
# The image is optional: a missing <img> raises IndexError, which is ignored.
try:
p.image = page.xpath('//div[contains(@class, "mla-image-cell")]/img/@src')[0]
except IndexError:
pass

# old handle_address: collects lines until the first one ending with ":" and,
# if any were collected, adds them space-joined as an "address" contact on `p`.
def handle_address(lines, address_type):
address_lines = []
for line in lines:
if line.endswith(":"): # Room:, Phone:, Fax:
break
address_lines.append(line)
if address_lines:
p.add_contact(
"address",
" ".join(address_lines),
address_type,
)
# new handle_address: same as the old one, except that collection stops only at
# a line that starts with "Room:", "Phone:" or "Fax:" (re.match anchors at the start).
def handle_address(lines, address_type):
address_lines = []
for line in lines:
if re.match(r'(Room|Phone|Fax)\:', line):
break
address_lines.append(line)
if address_lines:
p.add_contact(
"address",
" ".join(address_lines),
address_type,
)

# old handle_phone: reads the line following a "Phone:" entry, gives up if that
# line is itself a label (ends with ":"), and for "/"-separated values picks the
# first fragment starting with "306-".
# NOTE(review): the nesting of the else branch and of the final add_contact call
# cannot be recovered from this flattened view.
def handle_phone(lines, phone_type):
if "Phone:" in lines:
next_line = lines[lines.index("Phone:") + 1]
if next_line.endswith(":"):
return
number = None
if "/" in next_line:
for fragment in next_line.split("/"):
if fragment.strip().startswith("306-"):
number = fragment.strip()
break
else:
number = next_line
p.add_contact("voice", number, phone_type, area_code=306)
# new handle_phone: searches the newline-joined lines for "Phone: 306-..." and
# adds a "voice" contact only when exactly one match is found.
def handle_phone(lines, phone_type):
matches = re.findall(r'Phone\:\s*(306-[\d\-]+)', '\n'.join(lines))
if len(matches) == 1:
p.add_contact("voice", matches[0], phone_type, area_code=306)

# old: legislature block is the first "col-md-4" div under `contact`; its first
# text node must be the expected heading, the rest goes to the two helpers.
legislature_lines = contact.xpath('.//div[@class="col-md-4"][1]/div//text()')
assert legislature_lines[0] == "Legislative Building Address"
handle_address(legislature_lines[1:], "legislature")
handle_phone(legislature_lines[1:], "legislature")
# new: every "col-md-3" div is an address block; its first text node selects the
# contact type, any other heading raises AssertionError, and the remaining text
# nodes go to the two helpers.
for address in page.xpath('//div[@class="col-md-3"]'):
lines = address.xpath('./div//text()')
address_type = None
if lines[0] == "Legislative Building Address":
address_type = "legislature"
elif lines[0] == "Constituency Address":
address_type = "constituency"
else:
raise AssertionError(f"Unexpected address type: {lines[0]}")
handle_address(lines[1:], address_type)
handle_phone(lines[1:], address_type)

# old: constituency block is the second "col-md-4" div, handled like the first.
constituency_lines = contact.xpath('.//div[@class="col-md-4"][2]/div//text()')
assert constituency_lines[0] == "Constituency Address"
handle_address(constituency_lines[1:], "constituency")
handle_phone(constituency_lines[1:], "constituency")
# new: email is looked up in the div with id "content".
# get_email is defined outside this view; presumably error=False makes a missing
# email return a falsy value instead of raising - confirm.
email = self.get_email(page.xpath('//div[@id="content"]')[0], error=False)
if email:
p.add_contact("email", email)

# old: email is looked up inside the `contact` container.
email = self.get_email(contact, error=False)
if email:
p.add_contact("email", email)
# new: website is taken from the space-joined text of the "col-md-4" divs and
# added as a link only when exactly one "Website: http..." match is found.
websites = re.findall(
r'Website:\s*(http\S+)',
' '.join(page.xpath('//div[@class="col-md-4"]/div//text()'))
)
if len(websites) == 1:
p.add_link(websites[0])

# old / new: the finished Person is yielded once per processed row; the line is
# shown twice, presumably because its indentation changed in the commit.
yield p
yield p

0 comments on commit 763ea86

Please sign in to comment.