-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* venture lab + engineering events * rodin events * formatting * lint black * headers * .. * ... * .... * small fixes * flake * college houses * lint * lint black * . * . * . * Lint * Revert migration file * Revert remaining files * wharton events * lint * walrus * oops * Dev Container * x86 arch * image urls and emails and other fixes * judtin minor fixes * college house list of tuples --------- Co-authored-by: Justin Zhang <[email protected]>
- Loading branch information
1 parent
32a8a34
commit 0905840
Showing
8 changed files
with
500 additions
and
5 deletions.
There are no files selected for viewing
128 changes: 128 additions & 0 deletions
128
backend/penndata/management/commands/get_college_house_events.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
import datetime | ||
|
||
import requests | ||
from bs4 import BeautifulSoup | ||
from django.core.management.base import BaseCommand | ||
from django.utils import timezone | ||
|
||
from penndata.models import Event | ||
|
||
|
||
# Maps each college house's public calendar URL to the Event.event_type
# constant used to tag events scraped from that house's site. Calendar pages
# share the same Drupal markup, so one scraper handles every entry.
EVENT_TYPE_MAP = [
    ("https://rodin.house.upenn.edu/calendar", Event.TYPE_RODIN_COLLEGE_HOUSE),
    ("https://harnwell.house.upenn.edu/calendar", Event.TYPE_HARNWELL_COLLEGE_HOUSE),
    ("https://harrison.house.upenn.edu/calendar", Event.TYPE_HARRISON_COLLEGE_HOUSE),
    ("https://gutmann.house.upenn.edu/calendar", Event.TYPE_GUTMANN_COLLEGE_HOUSE),
    ("https://radian.house.upenn.edu/calendar", Event.TYPE_RADIAN_COLLEGE_HOUSE),
    ("https://lauder.house.upenn.edu/calendar", Event.TYPE_LAUDER_COLLEGE_HOUSE),
    ("https://hill.house.upenn.edu/calendar", Event.TYPE_HILL_COLLEGE_HOUSE),
    ("https://kcech.house.upenn.edu/calendar", Event.TYPE_KCECH_COLLEGE_HOUSE),
    ("https://ware.house.upenn.edu/calendar", Event.TYPE_WARE_COLLEGE_HOUSE),
    ("https://fh.house.upenn.edu/calendar", Event.TYPE_FH_COLLEGE_HOUSE),
    ("https://riepe.house.upenn.edu/calendar", Event.TYPE_RIEPE_COLLEGE_HOUSE),
    ("https://dubois.house.upenn.edu/calendar", Event.TYPE_DUBOIS_COLLEGE_HOUSE),
    ("https://gregory.house.upenn.edu/calendar", Event.TYPE_GREGORY_COLLEGE_HOUSE),
    ("https://stouffer.house.upenn.edu/calendar", Event.TYPE_STOUFFER_COLLEGE_HOUSE),
]
|
||
|
||
class Command(BaseCommand):
    """Scrape upcoming events from every college house calendar into Event rows."""

    def handle(self, *args, **kwargs):
        for site, event_type in EVENT_TYPE_MAP:
            self.scrape_calendar_page(site, event_type)
        now = timezone.localtime()
        if now.day > 25:
            # Near the end of the month, also scrape next month's page —
            # the sites paginate monthly via a "/YYYY-MM" URL suffix.
            # (Renamed from `next`, which shadowed the builtin.)
            upcoming = now + datetime.timedelta(days=30)
            for site, event_type in EVENT_TYPE_MAP:
                self.scrape_calendar_page(
                    f"{site}/{upcoming.year}-{upcoming.month:02d}", event_type
                )

        self.stdout.write("Uploaded College House Events!")

    def scrape_details(self, event_url):
        """Fetch one event detail page.

        Returns (location, start, end, description, image_url) with None for
        any field missing from the page, or None when the page itself could
        not be fetched.
        """
        try:
            resp = requests.get(event_url)
        except requests.exceptions.RequestException as e:
            # The builtin ConnectionError does not catch requests' network
            # failures (requests.exceptions.ConnectionError is not a subclass),
            # and printing the exception *class* was useless — print the
            # actual error instead.
            print(f"Error fetching {event_url}: {e}")
            return None
        soup = BeautifulSoup(resp.text, "html.parser")

        location_div = soup.find("div", class_="field-name-field-public-display-location")
        location = location_div.text.strip() if location_div else None

        # The date spans carry ISO-8601 timestamps (with UTC offset) in their
        # "content" attribute.
        start_span = soup.select_one(".date-display-start")
        end_span = soup.select_one(".date-display-end")
        start_time = (
            datetime.datetime.strptime(start_span.get("content"), "%Y-%m-%dT%H:%M:%S%z")
            if start_span and start_span.get("content")
            else None
        )
        end_time = (
            datetime.datetime.strptime(end_span.get("content"), "%Y-%m-%dT%H:%M:%S%z")
            if end_span and end_span.get("content")
            else None
        )

        body = soup.select_one(".field-name-body")
        description = body.text.strip() if body else None

        image = soup.select_one(".field-name-field-image img")
        image_url = image["src"] if image else None

        return location, start_time, end_time, description, image_url

    def scrape_calendar_page(self, calendar_url, event_type):
        """Scrape one monthly calendar page and upsert each future event by name."""
        try:
            resp = requests.get(calendar_url)
        except requests.exceptions.RequestException as e:
            print(f"Error fetching {calendar_url}: {e}")
            return
        soup = BeautifulSoup(resp.text, "html.parser")

        event_cells = soup.find_all("td", class_="single-day future")

        # Contact email appears once per page as a mailto: link. Guard the
        # container lookup — the original chained .find(...).find("a") raised
        # AttributeError whenever the div was absent.
        email_container = soup.find("div", class_="views-field-field-office-email-contact")
        email_element = email_container.find("a") if email_container else None
        email = (
            email_element["href"].split(":")[1]
            if email_element and email_element.get("href")
            else None
        )

        # Scheme + host prefix for resolving relative event links; this is
        # loop-invariant, so compute it once.
        index = calendar_url.find("/", calendar_url.find("://") + 3)
        base_url = calendar_url[:index]

        for cell in event_cells:
            if not (item := cell.find("div", class_="item")):
                continue
            if not (event_link := item.find("a", href=True)):
                continue
            name = event_link.text.strip()
            if not (url := event_link.get("href")):
                continue
            url = f"{base_url}{url}"

            # scrape_details returns None on fetch failure; the original
            # tuple-unpacked it directly and raised TypeError.
            details = self.scrape_details(url)
            if details is None:
                continue
            location, start_time, end_time, description, image_url = details
            print(url + " " + name)
            Event.objects.update_or_create(
                name=name,
                defaults={
                    "event_type": event_type,
                    "image_url": image_url,
                    "start": start_time,
                    "end": end_time,
                    "location": location,
                    "website": url,
                    "description": description,
                    "email": email,
                },
            )
            # Cells are chronological: stop once events are over 30 days out.
            if start_time and start_time > timezone.localtime() + datetime.timedelta(days=30):
                break
64 changes: 64 additions & 0 deletions
64
backend/penndata/management/commands/get_engineering_events.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
import datetime | ||
import html | ||
import json | ||
|
||
import requests | ||
from django.core.management.base import BaseCommand | ||
|
||
from penndata.models import Event | ||
|
||
|
||
# List view of the SEAS events calendar; the page embeds its event data in a
# JSON-LD <script> block that the command below extracts.
ENGINEERING_EVENTS_WEBSITE = "https://events.seas.upenn.edu/calendar/list/"
|
||
|
||
class Command(BaseCommand):
    """Scrape Penn Engineering events from the calendar page's JSON-LD block."""

    def handle(self, *args, **kwargs):
        try:
            resp = requests.get(ENGINEERING_EVENTS_WEBSITE)
        except requests.exceptions.RequestException as e:
            # The builtin ConnectionError does not catch requests' network
            # failures; also report the actual error, not the exception class.
            print(f"Error fetching {ENGINEERING_EVENTS_WEBSITE}: {e}")
            return None

        html_content = resp.text

        # The event list is embedded as schema.org JSON-LD inside a <script>
        # tag; extract the raw JSON between the markers.
        start_marker = '<script type="application/ld+json">'
        end_marker = "</script>"
        start_index = html_content.find(start_marker)
        if start_index == -1:
            # Without this guard, find() returning -1 silently sliced garbage
            # from the end of the document.
            print("Error: JSON-LD block not found, update scraper.")
            return None
        end_index = html_content.find(end_marker, start_index)
        json_ld_content = html_content[start_index + len(start_marker) : end_index]

        try:
            events_data = json.loads(json_ld_content)
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON-LD: {e}")
            return None

        for event in events_data:
            # Skip entries without a usable name — name is the upsert key.
            if (event_name := html.unescape(event.get("name", ""))) == "":
                continue

            description = (
                html.unescape(event.get("description", "")).replace("<p>", "").replace("</p>\n", "")
            )
            url = event.get("url", None)

            # Dates are ISO-8601 strings; startDate is required (skip if
            # absent — the original crashed on fromisoformat(None)), endDate
            # is optional.
            if not (start_str := event.get("startDate")):
                continue
            start = datetime.datetime.fromisoformat(start_str)
            end = datetime.datetime.fromisoformat(event["endDate"]) if "endDate" in event else None

            location = event.get("location", {}).get("name")
            if (organizer := event.get("organizer")) and (email := organizer.get("email")):
                email = html.unescape(email)
            else:
                email = None

            Event.objects.update_or_create(
                name=event_name,
                defaults={
                    "event_type": Event.TYPE_PENN_ENGINEERING,
                    "image_url": None,
                    "start": start,
                    "end": end,
                    "location": location,
                    "website": url,
                    "description": description,
                    "email": email,
                },
            )

        self.stdout.write("Uploaded Engineering Events!")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
110 changes: 110 additions & 0 deletions
110
backend/penndata/management/commands/get_venture_events.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
import datetime | ||
import html | ||
|
||
import requests | ||
from bs4 import BeautifulSoup | ||
from django.core.management.base import BaseCommand | ||
from django.utils import timezone | ||
|
||
from penndata.models import Event | ||
|
||
|
||
# Venture Lab events listing page.
VENTURE_EVENTS_WEBSITE = "https://venturelab.upenn.edu/venture-lab-events"
# Browser-like User-Agent — the site rejects the default requests UA.
HEADERS = {"User-Agent": "Mozilla/5.0 AppleWebKit/537.36 Chrome/91.0.4472.124 Safari/537.36"}
|
||
|
||
class Command(BaseCommand):
    """Scrape upcoming Venture Lab events into Event rows.

    The listing is ordered future -> past; scraping stops at the first event
    whose inferred start is in the past.
    """

    def handle(self, *args, **kwargs):
        now = timezone.localtime()
        current_month, current_year = now.month, now.year

        try:
            resp = requests.get(VENTURE_EVENTS_WEBSITE, headers=HEADERS)
        except requests.exceptions.RequestException as e:
            # The builtin ConnectionError does not catch requests' network
            # failures; report the actual error, not the exception class.
            print(f"Error fetching {VENTURE_EVENTS_WEBSITE}: {e}")
            return None

        soup = BeautifulSoup(resp.text, "html.parser")

        event_containers = soup.find_all("div", class_="PromoSearchResultEvent")
        last_start_datetime = None

        for event in event_containers:
            event_date_elem = event.find("div", class_="PromoSearchResultEvent-eventDate")
            event_start_datetime = None
            event_end_datetime = None
            # Some events don't have a start/end date time or a year.
            if event_date_elem:
                # Expected shape: "Month DD, YYYY at H:MMAM - H:MMPM".
                # Guard the split — the original indexed [1] unconditionally
                # and raised IndexError on any other shape.
                event_date_str = event_date_elem.text.strip()
                event_date_parts = event_date_str.split(" at ")
                if len(event_date_parts) == 2 and " - " in event_date_parts[1]:
                    start_str, _, end_str = event_date_parts[1].partition(" - ")
                    event_start_datetime = datetime.datetime.strptime(
                        f"{event_date_parts[0]} {start_str.strip()}", "%B %d, %Y %I:%M%p"
                    )
                    event_end_datetime = datetime.datetime.strptime(
                        f"{event_date_parts[0]} {end_str.strip()}", "%B %d, %Y %I:%M%p"
                    )
                    last_start_datetime = event_start_datetime
            else:  # if no year given, infer it from the month/day tiles
                event_month_elem = event.find("div", class_="PromoSearchResultEvent-month")
                event_day_elem = event.find("div", class_="PromoSearchResultEvent-day")

                if event_month_elem and event_day_elem:
                    event_month = datetime.datetime.strptime(
                        event_month_elem.text.strip(), "%B"
                    ).month
                    event_day = int(event_day_elem.text.strip())

                    if last_start_datetime:
                        # Listing runs future -> past, so this event cannot be
                        # after the previous one: a later month means last year.
                        start_year = (
                            current_year - 1
                            if event_month > last_start_datetime.month
                            else current_year
                        )
                    else:
                        # No reference event yet: a month earlier than the
                        # current one must belong to next year.
                        start_year = (
                            current_year + 1 if current_month > event_month else current_year
                        )

                    event_start_datetime = datetime.datetime(start_year, event_month, event_day)

            # Events are ordered future -> past, so break at the first past
            # event. Guard against None — when neither date form parsed, the
            # original comparison raised TypeError.
            if event_start_datetime and event_start_datetime < now.replace(tzinfo=None):
                break

            if title := event.find("div", class_="PromoSearchResultEvent-title"):
                title = html.unescape(title.text.strip())
            else:
                # name is the upsert key; an untitled event cannot be stored.
                continue

            if location := event.find("div", class_="PromoSearchResultEvent-eventLocation"):
                location = location.text.strip()

            if description := event.find("div", class_="PromoSearchResultEvent-description"):
                description = html.unescape(description.text.strip())

            # Guard the cta container — the original chained .find(...).find()
            # raised AttributeError whenever the div was absent.
            url = None
            if (cta := event.find("div", class_="PromoSearchResultEvent-cta")) and (
                link := cta.find("a", href=True)
            ):
                url = link["href"]

            Event.objects.update_or_create(
                name=title,
                defaults={
                    "event_type": Event.TYPE_VENTURE_LAB,
                    "image_url": None,
                    "start": (
                        timezone.make_aware(event_start_datetime) if event_start_datetime else None
                    ),
                    "end": timezone.make_aware(event_end_datetime) if event_end_datetime else None,
                    "location": location,
                    "website": url,
                    "description": description,
                    "email": "[email protected]",
                },
            )

        self.stdout.write("Uploaded Venture Lab Events!")
73 changes: 73 additions & 0 deletions
73
backend/penndata/management/commands/get_wharton_events.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
import datetime | ||
import re | ||
|
||
import pytz | ||
import requests | ||
from bs4 import BeautifulSoup | ||
from django.core.management.base import BaseCommand | ||
|
||
from penndata.models import Event | ||
|
||
|
||
# Wharton events hub, list view (the "#list" fragment selects the list tab).
WHARTON_EVENTS_WEBSITE = "https://events.wharton.upenn.edu/events-hq/#list"
|
||
|
||
class Command(BaseCommand):
    """Scrape events from the Wharton events page into Event rows."""

    @staticmethod
    def _parse_datetime(date_str, time_str, year):
        """Combine a "Month DD" date and "H:MM AM" time into a naive datetime.

        The original code parsed only the time, so every stored start/end
        landed on 1900-01-01. The listing omits the year, so the caller's
        current year is assumed — TODO confirm the site never lists events
        from an adjacent year. Tries full then abbreviated month names;
        returns None if neither format matches.
        """
        for fmt in ("%B %d %Y %I:%M %p", "%b %d %Y %I:%M %p"):
            try:
                return datetime.datetime.strptime(f"{date_str} {year} {time_str}", fmt)
            except ValueError:
                continue
        return None

    def handle(self, *args, **kwargs):
        eastern = pytz.timezone("US/Eastern")
        current_year = datetime.datetime.now(eastern).year

        try:
            resp = requests.get(WHARTON_EVENTS_WEBSITE)
        except requests.exceptions.RequestException as e:
            # The builtin ConnectionError does not catch requests' network
            # failures; report the actual error, not the exception class.
            print(f"Error fetching {WHARTON_EVENTS_WEBSITE}: {e}")
            return None
        soup = BeautifulSoup(resp.content, "html.parser")

        for entry in soup.find_all(class_="post-entry"):
            title_elem = entry.find(class_="entry-title")
            title = title_elem.text.strip()
            # Guard the description paragraph — entries without a <p> made
            # the original raise AttributeError.
            description_elem = entry.find("p")
            description = description_elem.text.strip() if description_elem else ""
            link = title_elem.a["href"]

            info = entry.find(class_="info").span.text.strip()
            # Same-day events: "Month DD | H:MM AM - H:MM PM"
            match = re.match(r"(\w+\s+\d+) \| (\d{1,2}:\d{2} [AP]M) - (\d{1,2}:\d{2} [AP]M)", info)
            if match:
                date_str, start_time, end_time = match.groups()
                start_time_obj = self._parse_datetime(date_str, start_time, current_year)
                end_time_obj = self._parse_datetime(date_str, end_time, current_year)
            else:
                # Multi-day events: "Month DD [| time][ - [Month DD | ]time]"
                match = re.match(
                    r"(\w+\s+\d+)(?: \| (\d{1,2}:\d{2} [AP]M))?"
                    r"(?: - (\w+\s+\d+ \| )?(\d{1,2}:\d{2} [AP]M))?",
                    info,
                )
                if not match:
                    print("Error: Cannot find date, update scraper.")
                    return
                start_date, start_time, end_date, end_time = match.groups()
                # The end-date group captures a trailing " | "; strip it. A
                # missing end date means the event ends on the start date.
                end_date = end_date.split(" |")[0].strip() if end_date else start_date
                start_time_obj = (
                    self._parse_datetime(start_date, start_time, current_year)
                    if start_time
                    else None
                )
                end_time_obj = (
                    self._parse_datetime(end_date, end_time, current_year) if end_time else None
                )
            # The info line ends with "• venue • city"; keep the last two.
            location = ",".join(info.split("•")[-2:])
            Event.objects.update_or_create(
                name=title,
                defaults={
                    "event_type": Event.TYPE_WHARTON,
                    "image_url": None,
                    "start": eastern.localize(start_time_obj) if start_time_obj else None,
                    "end": eastern.localize(end_time_obj) if end_time_obj else None,
                    "location": location.strip(),
                    "website": link,
                    "description": description,
                    "email": None,
                },
            )

        self.stdout.write("Uploaded Wharton Events!")
Oops, something went wrong.