Skip to content

Commit

Permalink
More events scraping (#252)
Browse files Browse the repository at this point in the history
* venture lab + engineering events

* rodin events

* formatting

* lint black

* headers

* ..

* ...

* ....

* small fixes

* flake

* college houses

* lint

* lint black

* .

* .

* .

* Lint

* Revert migration file

* Revert remaining files

* wharton events

* lint

* walrus

* oops

* Dev Container

* x86 arch

* image urls and emails and other fixes

* judtin minor fixes

* college house list of tuples

---------

Co-authored-by: Justin Zhang <[email protected]>
  • Loading branch information
ashleyzhang01 and judtinzhang authored Mar 17, 2024
1 parent 32a8a34 commit 0905840
Show file tree
Hide file tree
Showing 8 changed files with 500 additions and 5 deletions.
128 changes: 128 additions & 0 deletions backend/penndata/management/commands/get_college_house_events.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
import datetime

import requests
from bs4 import BeautifulSoup
from django.core.management.base import BaseCommand
from django.utils import timezone

from penndata.models import Event


# (calendar URL, Event.event_type constant) pairs: one entry per college
# house whose Drupal calendar page is scraped by this command.
EVENT_TYPE_MAP = [
    ("https://rodin.house.upenn.edu/calendar", Event.TYPE_RODIN_COLLEGE_HOUSE),
    ("https://harnwell.house.upenn.edu/calendar", Event.TYPE_HARNWELL_COLLEGE_HOUSE),
    ("https://harrison.house.upenn.edu/calendar", Event.TYPE_HARRISON_COLLEGE_HOUSE),
    ("https://gutmann.house.upenn.edu/calendar", Event.TYPE_GUTMANN_COLLEGE_HOUSE),
    ("https://radian.house.upenn.edu/calendar", Event.TYPE_RADIAN_COLLEGE_HOUSE),
    ("https://lauder.house.upenn.edu/calendar", Event.TYPE_LAUDER_COLLEGE_HOUSE),
    ("https://hill.house.upenn.edu/calendar", Event.TYPE_HILL_COLLEGE_HOUSE),
    ("https://kcech.house.upenn.edu/calendar", Event.TYPE_KCECH_COLLEGE_HOUSE),
    ("https://ware.house.upenn.edu/calendar", Event.TYPE_WARE_COLLEGE_HOUSE),
    ("https://fh.house.upenn.edu/calendar", Event.TYPE_FH_COLLEGE_HOUSE),
    ("https://riepe.house.upenn.edu/calendar", Event.TYPE_RIEPE_COLLEGE_HOUSE),
    ("https://dubois.house.upenn.edu/calendar", Event.TYPE_DUBOIS_COLLEGE_HOUSE),
    ("https://gregory.house.upenn.edu/calendar", Event.TYPE_GREGORY_COLLEGE_HOUSE),
    ("https://stouffer.house.upenn.edu/calendar", Event.TYPE_STOUFFER_COLLEGE_HOUSE),
]


class Command(BaseCommand):
    """Scrape upcoming events from each college house's Drupal calendar
    and upsert them into the Event table."""

    def handle(self, *args, **kwargs):
        for site, event_type in EVENT_TYPE_MAP:
            self.scrape_calendar_page(site, event_type)
        now = timezone.localtime()
        # Near the end of the month the default calendar page has few
        # remaining future events, so also scrape next month's page
        # (the Drupal calendar accepts a trailing /YYYY-MM path segment).
        if now.day > 25:
            upcoming = now + datetime.timedelta(days=30)
            for site, event_type in EVENT_TYPE_MAP:
                self.scrape_calendar_page(
                    f"{site}/{upcoming.year}-{upcoming.month:02d}", event_type
                )

        self.stdout.write("Uploaded College House Events!")

    def scrape_details(self, event_url):
        """Fetch one event's detail page and extract its fields.

        Returns a (location, start_time, end_time, description, image_url)
        tuple — any element may be None — or None if the page could not be
        fetched.
        """
        try:
            resp = requests.get(event_url)
        except requests.exceptions.ConnectionError as e:
            # requests' ConnectionError is NOT the builtin ConnectionError;
            # catching the builtin (as before) never caught request failures.
            print("Error:", e)
            return None
        soup = BeautifulSoup(resp.text, "html.parser")

        location_elem = soup.find("div", class_="field-name-field-public-display-location")
        location = location_elem.text.strip() if location_elem else None

        start_elem = soup.select_one(".date-display-start")
        start_time_str = start_elem.get("content") if start_elem else ""
        end_elem = soup.select_one(".date-display-end")
        end_time_str = end_elem.get("content") if end_elem else ""
        # The "content" attributes carry ISO-8601 timestamps with an offset.
        start_time = (
            datetime.datetime.strptime(start_time_str, "%Y-%m-%dT%H:%M:%S%z")
            if start_time_str
            else None
        )
        end_time = (
            datetime.datetime.strptime(end_time_str, "%Y-%m-%dT%H:%M:%S%z")
            if end_time_str
            else None
        )

        description_elem = soup.select_one(".field-name-body")
        description = description_elem.text.strip() if description_elem else None

        image_elem = soup.select_one(".field-name-field-image img")
        image_url = image_elem["src"] if image_elem else None

        return location, start_time, end_time, description, image_url

    def scrape_calendar_page(self, calendar_url, event_type):
        """Scrape a month-view calendar page and upsert each future event."""
        try:
            resp = requests.get(calendar_url)
        except requests.exceptions.ConnectionError as e:
            # See scrape_details: must catch requests' exception, not the builtin.
            print("Error:", e)
            return
        soup = BeautifulSoup(resp.text, "html.parser")

        event_cells = soup.find_all("td", class_="single-day future")

        # Contact email lives in a sidebar block as a mailto: link; guard
        # both lookups (previously a missing div raised AttributeError).
        email_div = soup.find("div", class_="views-field-field-office-email-contact")
        email_element = email_div.find("a") if email_div else None
        email = email_element["href"].split(":")[1] if email_element else None

        for cell in event_cells:
            if not (item := cell.find("div", class_="item")):
                continue
            if not (event_link := item.find("a", href=True)):
                continue
            name = event_link.text.strip()
            if not (url := event_link.get("href")):
                continue
            # Event links are site-relative; prepend the scheme + host of
            # the calendar page being scraped.
            index = calendar_url.find("/", calendar_url.find("://") + 3)
            base_url = calendar_url[:index]
            url = f"{base_url}{url}"

            details = self.scrape_details(url)
            if details is None:
                # Detail page could not be fetched; skip this event
                # (previously unpacking None raised TypeError).
                continue
            location, start_time, end_time, description, image_url = details
            print(url + " " + name)
            Event.objects.update_or_create(
                name=name,
                defaults={
                    "event_type": event_type,
                    "image_url": image_url,
                    "start": start_time,
                    "end": end_time,
                    "location": location,
                    "website": url,
                    "description": description,
                    "email": email,
                },
            )
            # Cells appear in chronological order; stop once events are more
            # than ~a month out.
            if start_time and start_time > timezone.localtime() + datetime.timedelta(days=30):
                break
64 changes: 64 additions & 0 deletions backend/penndata/management/commands/get_engineering_events.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import datetime
import html
import json

import requests
from django.core.management.base import BaseCommand

from penndata.models import Event


# SEAS events calendar list view; the page embeds event data as JSON-LD.
ENGINEERING_EVENTS_WEBSITE = "https://events.seas.upenn.edu/calendar/list/"


class Command(BaseCommand):
    """Scrape upcoming events from the Penn Engineering (SEAS) calendar.

    The list page embeds its event data in a
    <script type="application/ld+json"> block, so that JSON is extracted
    and parsed instead of scraping the rendered HTML.
    """

    def handle(self, *args, **kwargs):
        try:
            resp = requests.get(ENGINEERING_EVENTS_WEBSITE)
        except requests.exceptions.ConnectionError as e:
            # requests' ConnectionError is NOT the builtin ConnectionError;
            # catching the builtin (as before) never caught request failures.
            print("Error:", e)
            return None

        html_content = resp.text

        start_marker = '<script type="application/ld+json">'
        end_marker = "</script>"
        start_index = html_content.find(start_marker)
        if start_index == -1:
            # Page layout changed; bail out instead of json-decoding garbage
            # (previously find() returning -1 sliced from the wrong position).
            print("Error: JSON-LD block not found, update scraper.")
            return None
        end_index = html_content.find(end_marker, start_index)
        json_ld_content = html_content[start_index + len(start_marker) : end_index]

        events_data = json.loads(json_ld_content)

        for event in events_data:
            # Name is the upsert key; skip unnamed entries.
            if (event_name := html.unescape(event.get("name", ""))) == "":
                continue

            # Descriptions arrive as single-paragraph HTML; strip the tags.
            description = (
                html.unescape(event.get("description", "")).replace("<p>", "").replace("</p>\n", "")
            )
            url = event.get("url", None)

            # Skip events without a start date rather than crashing on
            # fromisoformat(None).
            if not (start_str := event.get("startDate")):
                continue
            start = datetime.datetime.fromisoformat(start_str)
            end = datetime.datetime.fromisoformat(event["endDate"]) if "endDate" in event else None

            # "location" may be absent or explicitly null in the JSON-LD;
            # `or {}` handles both (a null value defeated the .get default).
            location = (event.get("location") or {}).get("name")
            if (organizer := event.get("organizer")) and (email := organizer.get("email")):
                email = html.unescape(email)
            else:
                email = None

            Event.objects.update_or_create(
                name=event_name,
                defaults={
                    "event_type": Event.TYPE_PENN_ENGINEERING,
                    "image_url": None,
                    "start": start,
                    "end": end,
                    "location": location,
                    "website": url,
                    "description": description,
                    "email": email,
                },
            )

        self.stdout.write("Uploaded Engineering Events!")
6 changes: 3 additions & 3 deletions backend/penndata/management/commands/get_penn_today_events.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,14 +104,14 @@ def handle(self, *args, **kwargs):
Event.objects.update_or_create(
name=name,
defaults={
"event_type": "Penn Today",
"image_url": "",
"event_type": Event.TYPE_PENN_TODAY,
"image_url": None,
"start": timezone.make_aware(start_date),
"end": timezone.make_aware(end_date),
"location": location,
"website": event_url,
"description": description,
"email": "",
"email": None,
},
)

Expand Down
110 changes: 110 additions & 0 deletions backend/penndata/management/commands/get_venture_events.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import datetime
import html

import requests
from bs4 import BeautifulSoup
from django.core.management.base import BaseCommand
from django.utils import timezone

from penndata.models import Event


# Venture Lab events listing page.
VENTURE_EVENTS_WEBSITE = "https://venturelab.upenn.edu/venture-lab-events"
# Browser-like User-Agent — presumably the site rejects the default
# python-requests UA; TODO confirm.
HEADERS = {"User-Agent": "Mozilla/5.0 AppleWebKit/537.36 Chrome/91.0.4472.124 Safari/537.36"}


class Command(BaseCommand):
    """Scrape upcoming events from the Penn Venture Lab events page and
    upsert them into the Event table."""

    def handle(self, *args, **kwargs):
        now = timezone.localtime()
        current_month, current_year = now.month, now.year

        try:
            resp = requests.get(VENTURE_EVENTS_WEBSITE, headers=HEADERS)
        except requests.exceptions.ConnectionError as e:
            # requests' ConnectionError is NOT the builtin ConnectionError;
            # catching the builtin (as before) never caught request failures.
            print("Error:", e)
            return None

        soup = BeautifulSoup(resp.text, "html.parser")

        event_containers = soup.find_all("div", class_="PromoSearchResultEvent")
        last_start_datetime = None

        for event in event_containers:
            event_date_elem = event.find("div", class_="PromoSearchResultEvent-eventDate")
            event_start_datetime = None
            event_end_datetime = None
            # Some events don't have a start/end date time or a year.
            if event_date_elem:
                # Full form: "<Month> <D>, <YYYY> at <H:MMam> - <H:MMpm>"
                event_date_str = event_date_elem.text.strip()

                event_date_parts = event_date_str.split(" at ")
                time_range = event_date_parts[1].split(" - ")
                event_start_str = time_range[0].strip()
                event_end_str = time_range[1].strip()

                event_start_datetime = datetime.datetime.strptime(
                    f"{event_date_parts[0]} {event_start_str}", "%B %d, %Y %I:%M%p"
                )
                event_end_datetime = datetime.datetime.strptime(
                    f"{event_date_parts[0]} {event_end_str}", "%B %d, %Y %I:%M%p"
                )
                last_start_datetime = event_start_datetime
            else:  # if no year given
                event_month_elem = event.find("div", class_="PromoSearchResultEvent-month")
                event_day_elem = event.find("div", class_="PromoSearchResultEvent-day")

                if event_month_elem and event_day_elem:
                    event_month = datetime.datetime.strptime(
                        event_month_elem.text.strip(), "%B"
                    ).month
                    event_day = int(event_day_elem.text.strip())

                    if last_start_datetime:
                        # Events are listed future-to-past, so this event is
                        # no later than the previous one: a numerically
                        # larger month means the year before the previous
                        # event's year. (Was anchored to current_year, which
                        # is wrong when the previous event is in another year.)
                        if event_month > last_start_datetime.month:
                            start_year = last_start_datetime.year - 1
                        else:
                            start_year = last_start_datetime.year
                    else:  # no dated event seen yet
                        # A month numerically before the current one must be
                        # in the future, i.e. next year.
                        if current_month > event_month:
                            start_year = current_year + 1
                        else:
                            start_year = current_year

                    event_start_datetime = datetime.datetime(start_year, event_month, event_day)

            # Events are ordered from future to past, so break once we find a
            # past event. Guard against undated entries (previously
            # None < datetime raised TypeError).
            if event_start_datetime and event_start_datetime < now.replace(tzinfo=None):
                break

            title_elem = event.find("div", class_="PromoSearchResultEvent-title")
            if title_elem is None:
                # Name is the upsert key; skip unnamed entries (previously
                # an event with name=None was created).
                continue
            title = html.unescape(title_elem.text.strip())

            location_elem = event.find("div", class_="PromoSearchResultEvent-eventLocation")
            location = location_elem.text.strip() if location_elem else None

            description_elem = event.find("div", class_="PromoSearchResultEvent-description")
            description = (
                html.unescape(description_elem.text.strip()) if description_elem else None
            )

            # Guard the whole chain: the cta div or its anchor may be absent
            # (the original chained .find() calls and raised AttributeError
            # on a missing cta div).
            url = None
            if (cta := event.find("div", class_="PromoSearchResultEvent-cta")) and (
                link := cta.find("a", href=True)
            ):
                url = link["href"]

            Event.objects.update_or_create(
                name=title,
                defaults={
                    "event_type": Event.TYPE_VENTURE_LAB,
                    "image_url": None,
                    "start": (
                        timezone.make_aware(event_start_datetime) if event_start_datetime else None
                    ),
                    "end": timezone.make_aware(event_end_datetime) if event_end_datetime else None,
                    "location": location,
                    "website": url,
                    "description": description,
                    "email": "[email protected]",
                },
            )

        self.stdout.write("Uploaded Venture Lab Events!")
73 changes: 73 additions & 0 deletions backend/penndata/management/commands/get_wharton_events.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import datetime
import re

import pytz
import requests
from bs4 import BeautifulSoup
from django.core.management.base import BaseCommand

from penndata.models import Event


# Wharton events listing page (list view).
WHARTON_EVENTS_WEBSITE = "https://events.wharton.upenn.edu/events-hq/#list"


class Command(BaseCommand):
    """Scrape upcoming events from the Wharton events page and upsert them
    into the Event table."""

    def handle(self, *args, **kwargs):
        eastern = pytz.timezone("US/Eastern")

        try:
            resp = requests.get(WHARTON_EVENTS_WEBSITE)
        except requests.exceptions.ConnectionError as e:
            # requests' ConnectionError is NOT the builtin ConnectionError;
            # catching the builtin (as before) never caught request failures.
            print("Error:", e)
            return None
        soup = BeautifulSoup(resp.content, "html.parser")

        event_entries = soup.find_all(class_="post-entry")

        for entry in event_entries:
            # Entries without a linked title or an info line are not event
            # listings; guard the lookups (the original chained attribute
            # access and raised AttributeError on missing elements).
            title_elem = entry.find(class_="entry-title")
            if title_elem is None or title_elem.a is None:
                continue
            title = title_elem.text.strip()
            link = title_elem.a["href"]

            description_elem = entry.find("p")
            description = description_elem.text.strip() if description_elem else None

            info_elem = entry.find(class_="info")
            if info_elem is None or info_elem.span is None:
                continue
            info = info_elem.span.text.strip()

            # Event with start and end times on the same date.
            # NOTE(review): both branches below discard the matched date, so
            # the stored datetimes carry strptime's default 1900-01-01 date —
            # confirm whether the real date should be attached.
            match = re.match(r"(\w+\s+\d+) \| (\d{1,2}:\d{2} [AP]M) - (\d{1,2}:\d{2} [AP]M)", info)
            if match:
                _, start_time, end_time = match.groups()
                start_time_obj = datetime.datetime.strptime(start_time, "%I:%M %p")
                end_time_obj = datetime.datetime.strptime(end_time, "%I:%M %p")
            else:
                # Event spanning different dates; time parts are optional.
                match = re.match(
                    r"(\w+\s+\d+)(?: \| (\d{1,2}:\d{2} [AP]M))?"
                    r"(?: - (\w+\s+\d+ \| )?(\d{1,2}:\d{2} [AP]M))?",
                    info,
                )
                if match:
                    start_date, start_time, end_date, end_time = match.groups()
                    start_time_obj = (
                        datetime.datetime.strptime(start_time, "%I:%M %p") if start_time else None
                    )
                    end_time_obj = (
                        datetime.datetime.strptime(end_time, "%I:%M %p") if end_time else None
                    )
                else:
                    print("Error: Cannot find date, update scraper.")
                    return
            # The info line appears to use "•" separators; keep the last two
            # segments as the location — TODO confirm against live markup.
            location = ",".join(info.split("•")[-2:])
            Event.objects.update_or_create(
                name=title,
                defaults={
                    "event_type": Event.TYPE_WHARTON,
                    "image_url": None,
                    "start": eastern.localize(start_time_obj) if start_time_obj else None,
                    "end": eastern.localize(end_time_obj) if end_time_obj else None,
                    "location": location.strip(),
                    "website": link,
                    "description": description,
                    "email": None,
                },
            )

        self.stdout.write("Uploaded Wharton Events!")
Loading

0 comments on commit 0905840

Please sign in to comment.