diff --git a/backend/penndata/management/commands/get_college_house_events.py b/backend/penndata/management/commands/get_college_house_events.py
new file mode 100644
index 00000000..488bb8cf
--- /dev/null
+++ b/backend/penndata/management/commands/get_college_house_events.py
@@ -0,0 +1,139 @@
+import datetime
+
+import requests
+from bs4 import BeautifulSoup
+from django.core.management.base import BaseCommand
+from django.utils import timezone
+
+from penndata.models import Event
+
+
+EVENT_TYPE_MAP = [
+    ("https://rodin.house.upenn.edu/calendar", Event.TYPE_RODIN_COLLEGE_HOUSE),
+    ("https://harnwell.house.upenn.edu/calendar", Event.TYPE_HARNWELL_COLLEGE_HOUSE),
+    ("https://harrison.house.upenn.edu/calendar", Event.TYPE_HARRISON_COLLEGE_HOUSE),
+    ("https://gutmann.house.upenn.edu/calendar", Event.TYPE_GUTMANN_COLLEGE_HOUSE),
+    ("https://radian.house.upenn.edu/calendar", Event.TYPE_RADIAN_COLLEGE_HOUSE),
+    ("https://lauder.house.upenn.edu/calendar", Event.TYPE_LAUDER_COLLEGE_HOUSE),
+    ("https://hill.house.upenn.edu/calendar", Event.TYPE_HILL_COLLEGE_HOUSE),
+    ("https://kcech.house.upenn.edu/calendar", Event.TYPE_KCECH_COLLEGE_HOUSE),
+    ("https://ware.house.upenn.edu/calendar", Event.TYPE_WARE_COLLEGE_HOUSE),
+    ("https://fh.house.upenn.edu/calendar", Event.TYPE_FH_COLLEGE_HOUSE),
+    ("https://riepe.house.upenn.edu/calendar", Event.TYPE_RIEPE_COLLEGE_HOUSE),
+    ("https://dubois.house.upenn.edu/calendar", Event.TYPE_DUBOIS_COLLEGE_HOUSE),
+    ("https://gregory.house.upenn.edu/calendar", Event.TYPE_GREGORY_COLLEGE_HOUSE),
+    ("https://stouffer.house.upenn.edu/calendar", Event.TYPE_STOUFFER_COLLEGE_HOUSE),
+]
+
+
+class Command(BaseCommand):
+    """Scrape each College House calendar and upsert the events it lists."""
+
+    def handle(self, *args, **kwargs):
+        """Scrape the current month for every house and, late in the month, the next one."""
+        for site, event_type in EVENT_TYPE_MAP:
+            self.scrape_calendar_page(site, event_type)
+        now = timezone.localtime()
+        if now.day > 25:
+            # Step to the 1st of the following month. Adding a flat 30 days to "now"
+            # overshoots February when run on January 30 or 31.
+            first_of_next = (now.replace(day=1) + datetime.timedelta(days=32)).replace(day=1)
+            next_month, next_year = first_of_next.month, first_of_next.year
+            for site, event_type in EVENT_TYPE_MAP:
+                self.scrape_calendar_page(f"{site}/{next_year}-{next_month:02d}", event_type)
+
+        self.stdout.write("Uploaded College House Events!")
+
+    def scrape_details(self, event_url):
+        """
+        Fetch a single event page and extract its details.
+
+        Returns a (location, start_time, end_time, description, image_url) tuple in which
+        every field absent from the page is None, or None if the page cannot be fetched.
+        """
+        try:
+            resp = requests.get(event_url)
+        except requests.exceptions.RequestException as e:
+            # requests raises its own exception types, not the builtin ConnectionError
+            print("Error:", e)
+            return None
+        soup = BeautifulSoup(resp.text, "html.parser")
+
+        location_elem = soup.find("div", class_="field-name-field-public-display-location")
+        start_elem = soup.select_one(".date-display-start")
+        end_elem = soup.select_one(".date-display-end")
+        description_elem = soup.select_one(".field-name-body")
+        image_elem = soup.select_one(".field-name-field-image img")
+
+        location = location_elem.text.strip() if location_elem else None
+        start_time_str = start_elem.get("content") if start_elem else ""
+        end_time_str = end_elem.get("content") if end_elem else ""
+        start_time = (
+            datetime.datetime.strptime(start_time_str, "%Y-%m-%dT%H:%M:%S%z")
+            if start_time_str
+            else None
+        )
+        end_time = (
+            datetime.datetime.strptime(end_time_str, "%Y-%m-%dT%H:%M:%S%z")
+            if end_time_str
+            else None
+        )
+        description = description_elem.text.strip() if description_elem else None
+        image_url = image_elem["src"] if image_elem else None
+        return location, start_time, end_time, description, image_url
+
+    def scrape_calendar_page(self, calendar_url, event_type):
+        """
+        Scrape one calendar page and upsert its upcoming events as `event_type`.
+
+        Stops early once an event starts more than 30 days from now.
+        """
+        try:
+            resp = requests.get(calendar_url)
+        except requests.exceptions.RequestException as e:
+            print("Error:", e)
+            return
+        soup = BeautifulSoup(resp.text, "html.parser")
+
+        event_cells = soup.find_all("td", class_="single-day future")
+
+        # a page without the office contact block must not abort the whole scrape
+        email_container = soup.find("div", class_="views-field-field-office-email-contact")
+        email_element = email_container.find("a", href=True) if email_container else None
+        # keep what follows the scheme of a "mailto:..." link
+        email = (email_element["href"].partition(":")[2] or None) if email_element else None
+
+        # scheme and host of the calendar, used to absolutize the relative event links
+        index = calendar_url.find("/", calendar_url.find("://") + 3)
+        base_url = calendar_url[:index]
+
+        for cell in event_cells:
+            if not (item := cell.find("div", class_="item")):
+                continue
+            if not (event_link := item.find("a", href=True)):
+                continue
+            name = event_link.text.strip()
+            if not (url := event_link.get("href")):
+                continue
+            url = f"{base_url}{url}"
+
+            # skip the event instead of failing to unpack None when its page is unreachable
+            if (details := self.scrape_details(url)) is None:
+                continue
+            location, start_time, end_time, description, image_url = details
+            print(url + " " + name)
+            Event.objects.update_or_create(
+                name=name,
+                defaults={
+                    "event_type": event_type,
+                    "image_url": image_url,
+                    "start": start_time,
+                    "end": end_time,
+                    "location": location,
+                    "website": url,
+                    "description": description,
+                    "email": email,
+                },
+            )
+            if start_time and start_time > timezone.localtime() + datetime.timedelta(days=30):
+                break
diff --git a/backend/penndata/management/commands/get_engineering_events.py b/backend/penndata/management/commands/get_engineering_events.py
new file mode 100644
index 00000000..1a304c4c
--- /dev/null
+++ b/backend/penndata/management/commands/get_engineering_events.py
@@ -0,0 +1,77 @@
+import datetime
+import html
+import json
+
+import requests
+from django.core.management.base import BaseCommand
+
+from penndata.models import Event
+
+
+ENGINEERING_EVENTS_WEBSITE = "https://events.seas.upenn.edu/calendar/list/"
+
+
+class Command(BaseCommand):
+    """Import Penn Engineering events from the JSON-LD embedded in the SEAS calendar page."""
+
+    def handle(self, *args, **kwargs):
+        """Fetch the calendar page, parse its JSON-LD event list and upsert each event."""
+        try:
+            resp = requests.get(ENGINEERING_EVENTS_WEBSITE)
+        except requests.exceptions.RequestException as e:
+            # requests raises its own exception types, not the builtin ConnectionError
+            print("Error:", e)
+            return None
+
+        html_content = resp.text
+
+        start_marker = '<script type="application/ld+json">'
+        end_marker = "</script>"
+        start_index = html_content.find(start_marker)
+        end_index = html_content.find(end_marker, start_index)
+        if start_index == -1 or end_index == -1:
+            # without this guard the slice below would hand json.loads arbitrary page text
+            print("Error: Cannot find event data, update scraper.")
+            return None
+        json_ld_content = html_content[start_index + len(start_marker) : end_index]
+
+        events_data = json.loads(json_ld_content)
+
+        for event in events_data:
+            if (event_name := html.unescape(event.get("name", ""))) == "":
+                continue
+
+            description = (
+                html.unescape(event.get("description", "")).replace("<p>", "").replace("</p>\n", "")
+            )
+            url = event.get("url", None)
+
+            # the model allows a null start, so a missing startDate need not raise
+            start = (
+                datetime.datetime.fromisoformat(event["startDate"])
+                if "startDate" in event
+                else None
+            )
+            end = datetime.datetime.fromisoformat(event["endDate"]) if "endDate" in event else None
+
+            location = event.get("location", dict()).get("name")
+            if (organizer := event.get("organizer")) and (email := organizer.get("email")):
+                email = html.unescape(email)
+            else:
+                email = None
+
+            Event.objects.update_or_create(
+                name=event_name,
+                defaults={
+                    "event_type": Event.TYPE_PENN_ENGINEERING,
+                    "image_url": None,
+                    "start": start,
+                    "end": end,
+                    "location": location,
+                    "website": url,
+                    "description": description,
+                    "email": email,
+                },
+            )
+
+        self.stdout.write("Uploaded Engineering Events!")
diff --git a/backend/penndata/management/commands/get_penn_today_events.py b/backend/penndata/management/commands/get_penn_today_events.py
index b144b452..6916b164 100644
--- a/backend/penndata/management/commands/get_penn_today_events.py
+++ b/backend/penndata/management/commands/get_penn_today_events.py
@@ -104,14 +104,14 @@ def handle(self, *args, **kwargs):
             Event.objects.update_or_create(
                 name=name,
                 defaults={
-                    "event_type": "Penn Today",
-                    "image_url": "",
+                    "event_type": Event.TYPE_PENN_TODAY,
+                    "image_url": None,
                     "start": timezone.make_aware(start_date),
                     "end": timezone.make_aware(end_date),
                     "location": location,
                     "website": event_url,
                     "description": description,
-                    "email": "",
+                    "email": None,
                 },
             )
 
diff --git a/backend/penndata/management/commands/get_venture_events.py b/backend/penndata/management/commands/get_venture_events.py
new file mode 100644
index 00000000..97000199
--- /dev/null
+++ b/backend/penndata/management/commands/get_venture_events.py
@@ -0,0 +1,118 @@
+import datetime
+import html
+
+import requests
+from bs4 import BeautifulSoup
+from django.core.management.base import BaseCommand
+from django.utils import timezone
+
+from penndata.models import Event
+
+
+VENTURE_EVENTS_WEBSITE = "https://venturelab.upenn.edu/venture-lab-events"
+HEADERS = {"User-Agent": "Mozilla/5.0 AppleWebKit/537.36 Chrome/91.0.4472.124 Safari/537.36"}
+
+
+class Command(BaseCommand):
+    """Scrape upcoming Venture Lab events and upsert them."""
+
+    def handle(self, *args, **kwargs):
+        """Walk the listing (ordered future to past) until the first past event."""
+        now = timezone.localtime()
+        current_month, current_year = now.month, now.year
+
+        try:
+            resp = requests.get(VENTURE_EVENTS_WEBSITE, headers=HEADERS)
+        except requests.exceptions.RequestException as e:
+            # requests raises its own exception types, not the builtin ConnectionError
+            print("Error:", e)
+            return None
+
+        soup = BeautifulSoup(resp.text, "html.parser")
+
+        event_containers = soup.find_all("div", class_="PromoSearchResultEvent")
+        last_start_datetime = None
+
+        for event in event_containers:
+            event_date_elem = event.find("div", class_="PromoSearchResultEvent-eventDate")
+            event_start_datetime = None
+            event_end_datetime = None
+            # some events don't have a start/end date time or a year
+            if event_date_elem:
+                event_date_str = event_date_elem.text.strip()
+
+                event_date_parts = event_date_str.split(" at ")
+                event_start_str = event_date_parts[1].split(" - ")[0].strip()
+                event_end_str = event_date_parts[1].split(" - ")[1].strip()
+
+                event_start_datetime = datetime.datetime.strptime(
+                    f"{event_date_parts[0]} {event_start_str}", "%B %d, %Y %I:%M%p"
+                )
+                event_end_datetime = datetime.datetime.strptime(
+                    f"{event_date_parts[0]} {event_end_str}", "%B %d, %Y %I:%M%p"
+                )
+                last_start_datetime = event_start_datetime
+            else:  # if no year given
+                event_month_elem = event.find("div", class_="PromoSearchResultEvent-month")
+                event_day_elem = event.find("div", class_="PromoSearchResultEvent-day")
+
+                if event_month_elem and event_day_elem:
+                    event_month = datetime.datetime.strptime(
+                        event_month_elem.text.strip(), "%B"
+                    ).month
+                    event_day = int(event_day_elem.text.strip())
+
+                    if last_start_datetime:  # has to be before any previous events
+                        # anchor on the previous event's year, which may differ from the
+                        # current year when the listing reaches into next year
+                        if event_month > last_start_datetime.month:
+                            start_year = last_start_datetime.year - 1
+                        else:
+                            start_year = last_start_datetime.year
+                    else:  # if no date time yet
+                        # if in future and next year
+                        if current_month > event_month:
+                            start_year = current_year + 1
+                        else:
+                            start_year = current_year
+
+                    event_start_datetime = datetime.datetime(start_year, event_month, event_day)
+
+            # events are ordered from future to past, so break once we find a past event
+            # (an event with no usable date cannot be compared and is kept)
+            if event_start_datetime and event_start_datetime < now.replace(tzinfo=None):
+                break
+
+            if title := event.find("div", class_="PromoSearchResultEvent-title"):
+                title = html.unescape(title.text.strip())
+            if not title:
+                # Event.name is required, so an untitled entry cannot be stored
+                continue
+
+            if location := event.find("div", class_="PromoSearchResultEvent-eventLocation"):
+                location = location.text.strip()
+
+            if description := event.find("div", class_="PromoSearchResultEvent-description"):
+                description = html.unescape(description.text.strip())
+
+            cta = event.find("div", class_="PromoSearchResultEvent-cta")
+            link = cta.find("a", href=True) if cta else None
+            url = link["href"] if link else None
+
+            Event.objects.update_or_create(
+                name=title,
+                defaults={
+                    "event_type": Event.TYPE_VENTURE_LAB,
+                    "image_url": None,
+                    "start": (
+                        timezone.make_aware(event_start_datetime) if event_start_datetime else None
+                    ),
+                    "end": timezone.make_aware(event_end_datetime) if event_end_datetime else None,
+                    "location": location,
+                    "website": url,
+                    "description": description,
+                    "email": "venturelab@upenn.edu",
+                },
+            )
+
+        self.stdout.write("Uploaded Venture Lab Events!")
diff --git a/backend/penndata/management/commands/get_wharton_events.py b/backend/penndata/management/commands/get_wharton_events.py
new file mode 100644
index 00000000..c82f9041
--- /dev/null
+++ b/backend/penndata/management/commands/get_wharton_events.py
@@ -0,0 +1,95 @@
+import datetime
+import re
+
+import pytz
+import requests
+from bs4 import BeautifulSoup
+from django.core.management.base import BaseCommand
+
+from penndata.models import Event
+
+
+WHARTON_EVENTS_WEBSITE = "https://events.wharton.upenn.edu/events-hq/#list"
+
+
+def parse_event_datetime(date_str, time_str, year):
+    """
+    Build a naive datetime from listing text such as "March 20" and "5:00 PM".
+
+    The listing carries no year, so the caller supplies one. Returns None when there is
+    no time. If the date is in neither "%B %d" nor "%b %d" form, only the time is parsed
+    (strptime then fills in its default date).
+    """
+    if not time_str:
+        return None
+    if date_str:
+        for date_format in ("%B %d", "%b %d"):
+            try:
+                return datetime.datetime.strptime(
+                    f"{date_str} {year} {time_str}", f"{date_format} %Y %I:%M %p"
+                )
+            except ValueError:
+                continue
+    return datetime.datetime.strptime(time_str, "%I:%M %p")
+
+
+class Command(BaseCommand):
+    """Scrape the Wharton events listing and upsert each event."""
+
+    def handle(self, *args, **kwargs):
+        """Fetch the listing, parse every entry's date line and upsert the event."""
+        eastern = pytz.timezone("US/Eastern")
+        # NOTE(review): the listing shows no year; the current year is assumed - confirm
+        year = datetime.datetime.now(eastern).year
+
+        try:
+            resp = requests.get(WHARTON_EVENTS_WEBSITE)
+        except requests.exceptions.RequestException as e:
+            # requests raises its own exception types, not the builtin ConnectionError
+            print("Error:", e)
+            return None
+        soup = BeautifulSoup(resp.content, "html.parser")
+
+        event_entries = soup.find_all(class_="post-entry")
+
+        for entry in event_entries:
+            title = entry.find(class_="entry-title").text.strip()
+            description = entry.find("p").text.strip()
+            link = entry.find(class_="entry-title").a["href"]
+
+            info = entry.find(class_="info").span.text.strip()
+            # "<date>[ | <start time>][ - [<end date> | ]<end time>]"; an end time without
+            # its own date falls on the start date
+            match = re.match(
+                r"(\w+\s+\d+)(?: \| (\d{1,2}:\d{2} [AP]M))?"
+                r"(?: - (\w+\s+\d+ \| )?(\d{1,2}:\d{2} [AP]M))?",
+                info,
+            )
+            if not match:
+                # one unparseable entry should not abort the remaining ones
+                print("Error: Cannot find date, update scraper.")
+                continue
+            start_date, start_time, end_date, end_time = match.groups()
+            # the optional end-date group captures its trailing " | " separator
+            end_date = end_date.rstrip(" |") if end_date else start_date
+            # attach the captured dates: parsing the times alone would store every event on
+            # strptime's default date of 1900-01-01
+            start_time_obj = parse_event_datetime(start_date, start_time, year)
+            end_time_obj = parse_event_datetime(end_date, end_time, year)
+
+            location = ",".join(info.split("•")[-2:])
+            Event.objects.update_or_create(
+                name=title,
+                defaults={
+                    "event_type": Event.TYPE_WHARTON,
+                    "image_url": None,
+                    "start": eastern.localize(start_time_obj) if start_time_obj else None,
+                    "end": eastern.localize(end_time_obj) if end_time_obj else None,
+                    "location": location.strip(),
+                    "website": link,
+                    "description": description,
+                    "email": None,
+                },
+            )
+
+        self.stdout.write("Uploaded Wharton Events!")
diff --git a/backend/penndata/migrations/0011_alter_event_event_type_alter_event_start.py b/backend/penndata/migrations/0011_alter_event_event_type_alter_event_start.py
new file mode 100644
index 00000000..c8347891
--- /dev/null
+++ b/backend/penndata/migrations/0011_alter_event_event_type_alter_event_start.py
@@ -0,0 +1,46 @@
+# Generated by Django 5.0.2 on 2024-03-15 21:51
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("penndata", "0010_auto_20240228_0150"),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name="event",
+            name="event_type",
+            field=models.CharField(
+                blank=True,
+                choices=[
+                    ("PENN TODAY", "Penn Today"),
+                    ("VENTURE LAB", "Venture Lab"),
+                    ("PENN ENGINEERING", "Penn Engineering"),
+                    ("WHARTON", "Wharton"),
+                    ("RODIN COLLEGE HOUSE", "Rodin College House"),
+                    ("HARNWELL COLLEGE HOUSE", "Harnwell College House"),
+                    ("HARRISON COLLEGE HOUSE", "Harrison College House"),
+                    ("GUTMANN COLLEGE HOUSE", "Gutmann College House"),
+                    ("RADIAN COLLEGE HOUSE", "Radian College House"),
+                    ("LAUDER COLLEGE HOUSE", "Lauder College House"),
+                    ("HILL COLLEGE HOUSE", "Hill College House"),
+                    ("KCECH COLLEGE HOUSE", "Kcech College House"),
+                    ("WARE COLLEGE HOUSE", "Ware College House"),
+                    ("FISHER HASSENFELD COLLEGE HOUSE", "Fisher Hassenfeld College House"),
+                    ("RIEPE COLLEGE HOUSE", "Riepe College House"),
+                    ("DU BOIS COLLEGE HOUSE", "Du Bois College House"),
+                    ("GREGORY COLLEGE HOUSE", "Gregory College House"),
+                    ("STOUFFER COLLEGE HOUSE", "Stouffer College House"),
+                ],
+                default=None,
+                max_length=63,
+                null=True,
+            ),
+        ),
+        migrations.AlterField(
+            model_name="event", name="start", field=models.DateTimeField(blank=True, null=True),
+        ),
+    ]
diff --git a/backend/penndata/models.py b/backend/penndata/models.py
index 9279fb52..45485b24 100644
--- a/backend/penndata/models.py
+++ b/backend/penndata/models.py
@@ -9,11 +9,53 @@
 
 
 class Event(models.Model):
-    event_type = models.CharField(max_length=255, null=True, blank=True)
+    TYPE_PENN_TODAY = "PENN TODAY"
+    TYPE_VENTURE_LAB = "VENTURE LAB"
+    TYPE_PENN_ENGINEERING = "PENN ENGINEERING"
+    TYPE_WHARTON = "WHARTON"
+    TYPE_RODIN_COLLEGE_HOUSE = "RODIN COLLEGE HOUSE"
+    TYPE_HARNWELL_COLLEGE_HOUSE = "HARNWELL COLLEGE HOUSE"
+    TYPE_HARRISON_COLLEGE_HOUSE = "HARRISON COLLEGE HOUSE"
+    TYPE_GUTMANN_COLLEGE_HOUSE = "GUTMANN COLLEGE HOUSE"
+    TYPE_RADIAN_COLLEGE_HOUSE = "RADIAN COLLEGE HOUSE"
+    TYPE_LAUDER_COLLEGE_HOUSE = "LAUDER COLLEGE HOUSE"
+    TYPE_HILL_COLLEGE_HOUSE = "HILL COLLEGE HOUSE"
+    TYPE_KCECH_COLLEGE_HOUSE = "KCECH COLLEGE HOUSE"
+    TYPE_WARE_COLLEGE_HOUSE = "WARE COLLEGE HOUSE"
+    TYPE_FH_COLLEGE_HOUSE = "FISHER HASSENFELD COLLEGE HOUSE"
+    TYPE_RIEPE_COLLEGE_HOUSE = "RIEPE COLLEGE HOUSE"
+    TYPE_DUBOIS_COLLEGE_HOUSE = "DU BOIS COLLEGE HOUSE"
+    TYPE_GREGORY_COLLEGE_HOUSE = "GREGORY COLLEGE HOUSE"
+    TYPE_STOUFFER_COLLEGE_HOUSE = "STOUFFER COLLEGE HOUSE"
+
+    TYPE_CHOICES = (
+        (TYPE_PENN_TODAY, "Penn Today"),
+        (TYPE_VENTURE_LAB, "Venture Lab"),
+        (TYPE_PENN_ENGINEERING, "Penn Engineering"),
+        (TYPE_WHARTON, "Wharton"),
+        (TYPE_RODIN_COLLEGE_HOUSE, "Rodin College House"),
+        (TYPE_HARNWELL_COLLEGE_HOUSE, "Harnwell College House"),
+        (TYPE_HARRISON_COLLEGE_HOUSE, "Harrison College House"),
+        (TYPE_GUTMANN_COLLEGE_HOUSE, "Gutmann College House"),
+        (TYPE_RADIAN_COLLEGE_HOUSE, "Radian College House"),
+        (TYPE_LAUDER_COLLEGE_HOUSE, "Lauder College House"),
+        (TYPE_HILL_COLLEGE_HOUSE, "Hill College House"),
+        (TYPE_KCECH_COLLEGE_HOUSE, "Kcech College House"),
+        (TYPE_WARE_COLLEGE_HOUSE, "Ware College House"),
+        (TYPE_FH_COLLEGE_HOUSE, "Fisher Hassenfeld College House"),
+        (TYPE_RIEPE_COLLEGE_HOUSE, "Riepe College House"),
+        (TYPE_DUBOIS_COLLEGE_HOUSE, "Du Bois College House"),
+        (TYPE_GREGORY_COLLEGE_HOUSE, "Gregory College House"),
+        (TYPE_STOUFFER_COLLEGE_HOUSE, "Stouffer College House"),
+    )
+
+    event_type = models.CharField(
+        max_length=63, choices=TYPE_CHOICES, default=None, null=True, blank=True
+    )
     name = models.CharField(max_length=255)
     description = models.TextField(null=True, blank=True)
     image_url = models.URLField(null=True, blank=True)
-    start = models.DateTimeField()
+    start = models.DateTimeField(null=True, blank=True)
     end = models.DateTimeField(null=True, blank=True)
     location = models.CharField(max_length=255, null=True, blank=True)
     email = models.CharField(max_length=255, null=True, blank=True)
diff --git a/k8s/main.ts b/k8s/main.ts
index 9038cb43..ca5b50f3 100644
--- a/k8s/main.ts
+++ b/k8s/main.ts
@@ -129,6 +129,38 @@ export class MyChart extends PennLabsChart {
       cmd: ["python", "manage.py", "get_penn_today_events"],
       env: [{ name: "DJANGO_SETTINGS_MODULE", value: "pennmobile.settings.production" }]
     });
+
+    new CronJob(this, 'get-engineering-events', {
+      schedule:'0 16 * * *', // Every day at 4 PM
+      image: backendImage,
+      secret,
+      cmd: ["python", "manage.py", "get_engineering_events"],
+      env: [{ name: "DJANGO_SETTINGS_MODULE", value: "pennmobile.settings.production" }]
+    });
+
+    new CronJob(this, 'get-venture-events', {
+      schedule:'0 16 * * *', // Every day at 4 PM
+      image: backendImage,
+      secret,
+      cmd: ["python", "manage.py", "get_venture_events"],
+      env: [{ name: "DJANGO_SETTINGS_MODULE", value: "pennmobile.settings.production" }]
+    });
+
+    new CronJob(this, 'get-wharton-events', {
+      schedule:'0 16 * * *', // Every day at 4 PM
+      image: backendImage,
+      secret,
+      cmd: ["python", "manage.py", "get_wharton_events"],
+      env: [{ name: "DJANGO_SETTINGS_MODULE", value: "pennmobile.settings.production" }]
+    });
+
+    new CronJob(this, 'get-college-house-events', {
+      schedule:'0 17 * * *', // Every day at 5 PM
+      image: backendImage,
+      secret,
+      cmd: ["python", "manage.py", "get_college_house_events"],
+      env: [{ name: "DJANGO_SETTINGS_MODULE", value: "pennmobile.settings.production" }]
+    });
   }
 }
 