Skip to content

Commit

Permalink
More events scraping (#252)
Browse files Browse the repository at this point in the history
* venture lab + engineering events

* rodin events

* formatting

* lint black

* headers

* ..

* ...

* ....

* small fixes

* flake

* college houses

* lint

* lint black

* .

* .

* .

* Lint

* Revert migration file

* Revert remaining files

* wharton events

* lint

* walrus

* oops

* Dev Container

* x86 arch

* image urls and emails and other fixes

* judtin minor fixes

* college house list of tuples

---------

Co-authored-by: Justin Zhang <[email protected]>
  • Loading branch information
ashleyzhang01 and judtinzhang authored Mar 17, 2024
1 parent 32a8a34 commit 0905840
Show file tree
Hide file tree
Showing 8 changed files with 500 additions and 5 deletions.
128 changes: 128 additions & 0 deletions backend/penndata/management/commands/get_college_house_events.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
import datetime

import requests
from bs4 import BeautifulSoup
from django.core.management.base import BaseCommand
from django.utils import timezone

from penndata.models import Event


# (calendar URL, Event.event_type constant) pairs: one entry per college
# house whose Drupal calendar page is scraped by this command.
EVENT_TYPE_MAP = [
    ("https://rodin.house.upenn.edu/calendar", Event.TYPE_RODIN_COLLEGE_HOUSE),
    ("https://harnwell.house.upenn.edu/calendar", Event.TYPE_HARNWELL_COLLEGE_HOUSE),
    ("https://harrison.house.upenn.edu/calendar", Event.TYPE_HARRISON_COLLEGE_HOUSE),
    ("https://gutmann.house.upenn.edu/calendar", Event.TYPE_GUTMANN_COLLEGE_HOUSE),
    ("https://radian.house.upenn.edu/calendar", Event.TYPE_RADIAN_COLLEGE_HOUSE),
    ("https://lauder.house.upenn.edu/calendar", Event.TYPE_LAUDER_COLLEGE_HOUSE),
    ("https://hill.house.upenn.edu/calendar", Event.TYPE_HILL_COLLEGE_HOUSE),
    ("https://kcech.house.upenn.edu/calendar", Event.TYPE_KCECH_COLLEGE_HOUSE),
    ("https://ware.house.upenn.edu/calendar", Event.TYPE_WARE_COLLEGE_HOUSE),
    ("https://fh.house.upenn.edu/calendar", Event.TYPE_FH_COLLEGE_HOUSE),
    ("https://riepe.house.upenn.edu/calendar", Event.TYPE_RIEPE_COLLEGE_HOUSE),
    ("https://dubois.house.upenn.edu/calendar", Event.TYPE_DUBOIS_COLLEGE_HOUSE),
    ("https://gregory.house.upenn.edu/calendar", Event.TYPE_GREGORY_COLLEGE_HOUSE),
    ("https://stouffer.house.upenn.edu/calendar", Event.TYPE_STOUFFER_COLLEGE_HOUSE),
]


class Command(BaseCommand):
    """Scrape upcoming events from each college house's Drupal calendar
    and upsert them into the Event table."""

    def handle(self, *args, **kwargs):
        for site, event_type in EVENT_TYPE_MAP:
            self.scrape_calendar_page(site, event_type)
        now = timezone.localtime()
        # Near the end of the month the default calendar page has few
        # remaining future events, so also scrape next month's page
        # (the Drupal calendar accepts a trailing /YYYY-MM path segment).
        if now.day > 25:
            upcoming = now + datetime.timedelta(days=30)
            for site, event_type in EVENT_TYPE_MAP:
                self.scrape_calendar_page(
                    f"{site}/{upcoming.year}-{upcoming.month:02d}", event_type
                )

        self.stdout.write("Uploaded College House Events!")

    def scrape_details(self, event_url):
        """Fetch one event's detail page and extract its fields.

        Returns a (location, start_time, end_time, description, image_url)
        tuple — any element may be None — or None if the page could not be
        fetched.
        """
        try:
            resp = requests.get(event_url)
        except requests.exceptions.ConnectionError as e:
            # requests' ConnectionError is NOT the builtin ConnectionError;
            # catching the builtin (as before) never caught request failures.
            print("Error:", e)
            return None
        soup = BeautifulSoup(resp.text, "html.parser")

        location_elem = soup.find("div", class_="field-name-field-public-display-location")
        location = location_elem.text.strip() if location_elem else None

        start_elem = soup.select_one(".date-display-start")
        start_time_str = start_elem.get("content") if start_elem else ""
        end_elem = soup.select_one(".date-display-end")
        end_time_str = end_elem.get("content") if end_elem else ""
        # The "content" attributes carry ISO-8601 timestamps with an offset.
        start_time = (
            datetime.datetime.strptime(start_time_str, "%Y-%m-%dT%H:%M:%S%z")
            if start_time_str
            else None
        )
        end_time = (
            datetime.datetime.strptime(end_time_str, "%Y-%m-%dT%H:%M:%S%z")
            if end_time_str
            else None
        )

        description_elem = soup.select_one(".field-name-body")
        description = description_elem.text.strip() if description_elem else None

        image_elem = soup.select_one(".field-name-field-image img")
        image_url = image_elem["src"] if image_elem else None

        return location, start_time, end_time, description, image_url

    def scrape_calendar_page(self, calendar_url, event_type):
        """Scrape a month-view calendar page and upsert each future event."""
        try:
            resp = requests.get(calendar_url)
        except requests.exceptions.ConnectionError as e:
            # See scrape_details: must catch requests' exception, not the builtin.
            print("Error:", e)
            return
        soup = BeautifulSoup(resp.text, "html.parser")

        event_cells = soup.find_all("td", class_="single-day future")

        # Contact email lives in a sidebar block as a mailto: link; guard
        # both lookups (previously a missing div raised AttributeError).
        email_div = soup.find("div", class_="views-field-field-office-email-contact")
        email_element = email_div.find("a") if email_div else None
        email = email_element["href"].split(":")[1] if email_element else None

        for cell in event_cells:
            if not (item := cell.find("div", class_="item")):
                continue
            if not (event_link := item.find("a", href=True)):
                continue
            name = event_link.text.strip()
            if not (url := event_link.get("href")):
                continue
            # Event links are site-relative; prepend the scheme + host of
            # the calendar page being scraped.
            index = calendar_url.find("/", calendar_url.find("://") + 3)
            base_url = calendar_url[:index]
            url = f"{base_url}{url}"

            details = self.scrape_details(url)
            if details is None:
                # Detail page could not be fetched; skip this event
                # (previously unpacking None raised TypeError).
                continue
            location, start_time, end_time, description, image_url = details
            print(url + " " + name)
            Event.objects.update_or_create(
                name=name,
                defaults={
                    "event_type": event_type,
                    "image_url": image_url,
                    "start": start_time,
                    "end": end_time,
                    "location": location,
                    "website": url,
                    "description": description,
                    "email": email,
                },
            )
            # Cells appear in chronological order; stop once events are more
            # than ~a month out.
            if start_time and start_time > timezone.localtime() + datetime.timedelta(days=30):
                break
64 changes: 64 additions & 0 deletions backend/penndata/management/commands/get_engineering_events.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import datetime
import html
import json

import requests
from django.core.management.base import BaseCommand

from penndata.models import Event


# SEAS events calendar list view; the page embeds event data as JSON-LD.
ENGINEERING_EVENTS_WEBSITE = "https://events.seas.upenn.edu/calendar/list/"


class Command(BaseCommand):
    """Scrape upcoming events from the Penn Engineering (SEAS) calendar.

    The list page embeds its event data in a
    <script type="application/ld+json"> block, so that JSON is extracted
    and parsed instead of scraping the rendered HTML.
    """

    def handle(self, *args, **kwargs):
        try:
            resp = requests.get(ENGINEERING_EVENTS_WEBSITE)
        except requests.exceptions.ConnectionError as e:
            # requests' ConnectionError is NOT the builtin ConnectionError;
            # catching the builtin (as before) never caught request failures.
            print("Error:", e)
            return None

        html_content = resp.text

        start_marker = '<script type="application/ld+json">'
        end_marker = "</script>"
        start_index = html_content.find(start_marker)
        if start_index == -1:
            # Page layout changed; bail out instead of json-decoding garbage
            # (previously find() returning -1 sliced from the wrong position).
            print("Error: JSON-LD block not found, update scraper.")
            return None
        end_index = html_content.find(end_marker, start_index)
        json_ld_content = html_content[start_index + len(start_marker) : end_index]

        events_data = json.loads(json_ld_content)

        for event in events_data:
            # Name is the upsert key; skip unnamed entries.
            if (event_name := html.unescape(event.get("name", ""))) == "":
                continue

            # Descriptions arrive as single-paragraph HTML; strip the tags.
            description = (
                html.unescape(event.get("description", "")).replace("<p>", "").replace("</p>\n", "")
            )
            url = event.get("url", None)

            # Skip events without a start date rather than crashing on
            # fromisoformat(None).
            if not (start_str := event.get("startDate")):
                continue
            start = datetime.datetime.fromisoformat(start_str)
            end = datetime.datetime.fromisoformat(event["endDate"]) if "endDate" in event else None

            # "location" may be absent or explicitly null in the JSON-LD;
            # `or {}` handles both (a null value defeated the .get default).
            location = (event.get("location") or {}).get("name")
            if (organizer := event.get("organizer")) and (email := organizer.get("email")):
                email = html.unescape(email)
            else:
                email = None

            Event.objects.update_or_create(
                name=event_name,
                defaults={
                    "event_type": Event.TYPE_PENN_ENGINEERING,
                    "image_url": None,
                    "start": start,
                    "end": end,
                    "location": location,
                    "website": url,
                    "description": description,
                    "email": email,
                },
            )

        self.stdout.write("Uploaded Engineering Events!")
6 changes: 3 additions & 3 deletions backend/penndata/management/commands/get_penn_today_events.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,14 +104,14 @@ def handle(self, *args, **kwargs):
Event.objects.update_or_create(
name=name,
defaults={
"event_type": "Penn Today",
"image_url": "",
"event_type": Event.TYPE_PENN_TODAY,
"image_url": None,
"start": timezone.make_aware(start_date),
"end": timezone.make_aware(end_date),
"location": location,
"website": event_url,
"description": description,
"email": "",
"email": None,
},
)

Expand Down
110 changes: 110 additions & 0 deletions backend/penndata/management/commands/get_venture_events.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import datetime
import html

import requests
from bs4 import BeautifulSoup
from django.core.management.base import BaseCommand
from django.utils import timezone

from penndata.models import Event


# Venture Lab events listing page.
VENTURE_EVENTS_WEBSITE = "https://venturelab.upenn.edu/venture-lab-events"
# Browser-like User-Agent — presumably the site rejects the default
# python-requests UA; TODO confirm.
HEADERS = {"User-Agent": "Mozilla/5.0 AppleWebKit/537.36 Chrome/91.0.4472.124 Safari/537.36"}


class Command(BaseCommand):
    """Scrape upcoming events from the Penn Venture Lab events page and
    upsert them into the Event table."""

    def handle(self, *args, **kwargs):
        now = timezone.localtime()
        current_month, current_year = now.month, now.year

        try:
            resp = requests.get(VENTURE_EVENTS_WEBSITE, headers=HEADERS)
        except requests.exceptions.ConnectionError as e:
            # requests' ConnectionError is NOT the builtin ConnectionError;
            # catching the builtin (as before) never caught request failures.
            print("Error:", e)
            return None

        soup = BeautifulSoup(resp.text, "html.parser")

        event_containers = soup.find_all("div", class_="PromoSearchResultEvent")
        last_start_datetime = None

        for event in event_containers:
            event_date_elem = event.find("div", class_="PromoSearchResultEvent-eventDate")
            event_start_datetime = None
            event_end_datetime = None
            # Some events don't have a start/end date time or a year.
            if event_date_elem:
                # Full form: "<Month> <D>, <YYYY> at <H:MMam> - <H:MMpm>"
                event_date_str = event_date_elem.text.strip()

                event_date_parts = event_date_str.split(" at ")
                time_range = event_date_parts[1].split(" - ")
                event_start_str = time_range[0].strip()
                event_end_str = time_range[1].strip()

                event_start_datetime = datetime.datetime.strptime(
                    f"{event_date_parts[0]} {event_start_str}", "%B %d, %Y %I:%M%p"
                )
                event_end_datetime = datetime.datetime.strptime(
                    f"{event_date_parts[0]} {event_end_str}", "%B %d, %Y %I:%M%p"
                )
                last_start_datetime = event_start_datetime
            else:  # if no year given
                event_month_elem = event.find("div", class_="PromoSearchResultEvent-month")
                event_day_elem = event.find("div", class_="PromoSearchResultEvent-day")

                if event_month_elem and event_day_elem:
                    event_month = datetime.datetime.strptime(
                        event_month_elem.text.strip(), "%B"
                    ).month
                    event_day = int(event_day_elem.text.strip())

                    if last_start_datetime:
                        # Events are listed future-to-past, so this event is
                        # no later than the previous one: a numerically
                        # larger month means the year before the previous
                        # event's year. (Was anchored to current_year, which
                        # is wrong when the previous event is in another year.)
                        if event_month > last_start_datetime.month:
                            start_year = last_start_datetime.year - 1
                        else:
                            start_year = last_start_datetime.year
                    else:  # no dated event seen yet
                        # A month numerically before the current one must be
                        # in the future, i.e. next year.
                        if current_month > event_month:
                            start_year = current_year + 1
                        else:
                            start_year = current_year

                    event_start_datetime = datetime.datetime(start_year, event_month, event_day)

            # Events are ordered from future to past, so break once we find a
            # past event. Guard against undated entries (previously
            # None < datetime raised TypeError).
            if event_start_datetime and event_start_datetime < now.replace(tzinfo=None):
                break

            title_elem = event.find("div", class_="PromoSearchResultEvent-title")
            if title_elem is None:
                # Name is the upsert key; skip unnamed entries (previously
                # an event with name=None was created).
                continue
            title = html.unescape(title_elem.text.strip())

            location_elem = event.find("div", class_="PromoSearchResultEvent-eventLocation")
            location = location_elem.text.strip() if location_elem else None

            description_elem = event.find("div", class_="PromoSearchResultEvent-description")
            description = (
                html.unescape(description_elem.text.strip()) if description_elem else None
            )

            # Guard the whole chain: the cta div or its anchor may be absent
            # (the original chained .find() calls and raised AttributeError
            # on a missing cta div).
            url = None
            if (cta := event.find("div", class_="PromoSearchResultEvent-cta")) and (
                link := cta.find("a", href=True)
            ):
                url = link["href"]

            Event.objects.update_or_create(
                name=title,
                defaults={
                    "event_type": Event.TYPE_VENTURE_LAB,
                    "image_url": None,
                    "start": (
                        timezone.make_aware(event_start_datetime) if event_start_datetime else None
                    ),
                    "end": timezone.make_aware(event_end_datetime) if event_end_datetime else None,
                    "location": location,
                    "website": url,
                    "description": description,
                    "email": "[email protected]",
                },
            )

        self.stdout.write("Uploaded Venture Lab Events!")
73 changes: 73 additions & 0 deletions backend/penndata/management/commands/get_wharton_events.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import datetime
import re

import pytz
import requests
from bs4 import BeautifulSoup
from django.core.management.base import BaseCommand

from penndata.models import Event


# Wharton events listing page (list view).
WHARTON_EVENTS_WEBSITE = "https://events.wharton.upenn.edu/events-hq/#list"


class Command(BaseCommand):
    """Scrape upcoming events from the Wharton events page and upsert them
    into the Event table."""

    def handle(self, *args, **kwargs):
        eastern = pytz.timezone("US/Eastern")

        try:
            resp = requests.get(WHARTON_EVENTS_WEBSITE)
        except requests.exceptions.ConnectionError as e:
            # requests' ConnectionError is NOT the builtin ConnectionError;
            # catching the builtin (as before) never caught request failures.
            print("Error:", e)
            return None
        soup = BeautifulSoup(resp.content, "html.parser")

        event_entries = soup.find_all(class_="post-entry")

        for entry in event_entries:
            # Entries without a linked title or an info line are not event
            # listings; guard the lookups (the original chained attribute
            # access and raised AttributeError on missing elements).
            title_elem = entry.find(class_="entry-title")
            if title_elem is None or title_elem.a is None:
                continue
            title = title_elem.text.strip()
            link = title_elem.a["href"]

            description_elem = entry.find("p")
            description = description_elem.text.strip() if description_elem else None

            info_elem = entry.find(class_="info")
            if info_elem is None or info_elem.span is None:
                continue
            info = info_elem.span.text.strip()

            # Event with start and end times on the same date.
            # NOTE(review): both branches below discard the matched date, so
            # the stored datetimes carry strptime's default 1900-01-01 date —
            # confirm whether the real date should be attached.
            match = re.match(r"(\w+\s+\d+) \| (\d{1,2}:\d{2} [AP]M) - (\d{1,2}:\d{2} [AP]M)", info)
            if match:
                _, start_time, end_time = match.groups()
                start_time_obj = datetime.datetime.strptime(start_time, "%I:%M %p")
                end_time_obj = datetime.datetime.strptime(end_time, "%I:%M %p")
            else:
                # Event spanning different dates; time parts are optional.
                match = re.match(
                    r"(\w+\s+\d+)(?: \| (\d{1,2}:\d{2} [AP]M))?"
                    r"(?: - (\w+\s+\d+ \| )?(\d{1,2}:\d{2} [AP]M))?",
                    info,
                )
                if match:
                    start_date, start_time, end_date, end_time = match.groups()
                    start_time_obj = (
                        datetime.datetime.strptime(start_time, "%I:%M %p") if start_time else None
                    )
                    end_time_obj = (
                        datetime.datetime.strptime(end_time, "%I:%M %p") if end_time else None
                    )
                else:
                    print("Error: Cannot find date, update scraper.")
                    return
            # The info line appears to use "•" separators; keep the last two
            # segments as the location — TODO confirm against live markup.
            location = ",".join(info.split("•")[-2:])
            Event.objects.update_or_create(
                name=title,
                defaults={
                    "event_type": Event.TYPE_WHARTON,
                    "image_url": None,
                    "start": eastern.localize(start_time_obj) if start_time_obj else None,
                    "end": eastern.localize(end_time_obj) if end_time_obj else None,
                    "location": location.strip(),
                    "website": link,
                    "description": description,
                    "email": None,
                },
            )

        self.stdout.write("Uploaded Wharton Events!")
Loading

0 comments on commit 0905840

Please sign in to comment.