Skip to content

Commit

Permalink
add validation to acm links
Browse files Browse the repository at this point in the history
  • Loading branch information
veetihytonen committed Dec 9, 2024
1 parent 71e6f57 commit 31c8fd6
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 3 deletions.
15 changes: 14 additions & 1 deletion src/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@

reference_dao = ReferenceDao(db)

# Accepted shape:
#   https://dl.acm.org/doi/[book/]<2 digits>.<4 digits>/<digits and dots>
# Written as a raw string so the regex escapes are not interpreted (and
# warned about) as string escapes; compiled once instead of on every call.
_ACM_LINK_PATTERN = re.compile(
    r"https://dl\.acm\.org/doi/(book/)?\d{2}\.\d{4}/[\d.]+"
)


def link_is_valid_form(link):
    """Return True if *link* has the form of an ACM Digital Library DOI URL.

    The whole string must match; extra leading or trailing characters
    (query strings, whitespace, ...) make the link invalid.

    Args:
        link: The candidate URL. Anything that is not a ``str`` (for example
            ``None`` when the form field is missing) is treated as invalid
            instead of raising ``TypeError``.

    Returns:
        bool: True when the link matches the expected form, otherwise False.
    """
    if not isinstance(link, str):
        return False

    return _ACM_LINK_PATTERN.fullmatch(link) is not None

@app.route("/")
def index():
    """Serve the root URL by rendering the ``index.html`` template."""
    page = render_template("index.html")
    return page
Expand Down Expand Up @@ -41,7 +47,14 @@ def create_new_reference():

if request.form.get("type") == "acm":
link = request.form.get("link")
html = requests.get(link).content
if not link_is_valid_form(link):
return render_template("/new_reference.html", errors=["Invalid ACM link"])

res = requests.get(link)
if res.status_code == 404:
return render_template("/new_reference.html", errors=["Invalid ACM link"])

html = res.content
data["link"] = link

if "book" in link:
Expand Down
19 changes: 17 additions & 2 deletions src/entities/acm_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,21 @@ def __parse_article_number(soup):

return int(element.get_text())

def __parse_journal(soup):
    """Return the text of the journal-name element, or None if it is absent.

    Args:
        soup: Parsed page tree; must provide ``select_one``.

    Returns:
        The text of the first element matching
        ``.core-enumeration span[property="name"]``, or ``None`` when no
        element matches.
    """
    # The leftover debug print of the matched element was removed so that
    # scraping no longer writes to stdout.
    element = soup.select_one('.core-enumeration span[property="name"]')
    if not element:
        return None

    return element.get_text()

def __parse_volume(soup):
    """Return the volume number as an int, or None if the page has none.

    Looks up the first ``span`` whose ``property`` attribute is
    ``volumeNumber`` and converts its text with ``int``.
    """
    volume_tag = soup.find('span', property='volumeNumber')
    return int(volume_tag.get_text()) if volume_tag else None

def scrape_article(html: str | bytes):
# parsed tree object of page html
soup = BeautifulSoup(html,'html.parser')
Expand All @@ -53,9 +68,9 @@ def scrape_article(html: str | bytes):
article_data = {
'author': __parse_article_authors(soup),
'title': soup.select_one('.core-container h1').get_text(),
'journal': soup.select_one('.core-enumeration span[property="name"]').get_text(),
'journal': __parse_journal(soup),
'year': date_data['year'],
'volume': int(soup.find('span', property='volumeNumber').get_text()),
'volume': __parse_volume(soup),
'number': __parse_article_number(soup),
'pages': __parse_article_pages(soup),
'month': date_data['month'],
Expand Down

0 comments on commit 31c8fd6

Please sign in to comment.