Skip to content

Commit

Permalink
Add BS extraction of LSV scores
Browse files Browse the repository at this point in the history
  • Loading branch information
rsullivan00 committed Sep 18, 2019
1 parent 4eaa1ab commit 17f1a7a
Show file tree
Hide file tree
Showing 5 changed files with 43 additions and 26 deletions.
15 changes: 1 addition & 14 deletions bin/download_cfb_articles
Original file line number Diff line number Diff line change
@@ -1,16 +1,3 @@
#!/usr/bin/env bash

ARTICLE_PREFIX=https://www.channelfireball.com/articles
CFB_DATA_LOCATION=data/cfb

if [ $# -eq 0 ]; then
echo "Usage: bin/download_cfb_articles <URL>"
exit 1
fi

mkdir -p $CFB_DATA_LOCATION

ARTICLE_NAME=${1#"$ARTICLE_PREFIX"}
ARTICLE_NAME=${ARTICLE_NAME%"/"}

curl -o $CFB_DATA_LOCATION/$ARTICLE_NAME.html $1
pipenv run python src/download_cfb.py
3 changes: 3 additions & 0 deletions bin/extract_cfb_ratings
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/usr/bin/env bash

# Run the CFB scraper script (src/scrape_cfb.py) using the Python interpreter
# provided by `pipenv run`.
# NOTE(review): the script path is relative, so this presumably has to be
# invoked from the repository root -- confirm.
pipenv run python src/scrape_cfb.py
8 changes: 7 additions & 1 deletion cfb_articles.csv
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,10 @@ war,https://www.channelfireball.com/articles/luis-scott-vargas/war-of-the-spark-
war,https://www.channelfireball.com/articles/war-of-the-spark-limited-set-review-black/
war,https://www.channelfireball.com/articles/war-of-the-spark-limited-set-review-red/
war,https://www.channelfireball.com/articles/war-of-the-spark-limited-set-review-green/
war,https://www.channelfireball.com/articles/war-of-the-spark-limited-set-review-gold-artifacts-and-lands/
war,https://www.channelfireball.com/articles/war-of-the-spark-limited-set-review-gold-artifacts-and-lands/
m20,https://www.channelfireball.com/articles/core-set-2020-limited-set-review-black/
m20,https://www.channelfireball.com/articles/core-set-2020-limited-set-review-white/
m20,https://www.channelfireball.com/articles/core-set-2020-limited-set-review-blue/
m20,https://www.channelfireball.com/articles/core-set-2020-limited-set-review-red/
m20,https://www.channelfireball.com/articles/core-set-2020-limited-set-review-green-and-gold/
m20,https://www.channelfireball.com/articles/core-set-2020-limited-set-review-gold-artifacts-and-lands/
2 changes: 1 addition & 1 deletion src/download_cfb.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@

print('Downloading {}'.format(url))
with open(html_name, 'w') as html_file:
html_file.write(requests.get(url).text)
html_file.write(requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).text)
41 changes: 31 additions & 10 deletions src/scrape_cfb.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,24 @@
from bs4 import BeautifulSoup
import glob
import re
import sys
import pandas as pd
import os


def clean_name(html_name):
    """Return ``html_name`` with apostrophe look-alikes normalized.

    Every backtick (`) and right single quotation mark (’) in the input is
    replaced with a plain ASCII apostrophe ('). All other characters are
    left untouched.

    Args:
        html_name: Text of a card name as it appears in the HTML.

    Returns:
        The same text with the two look-alike characters replaced by "'".
    """
    return re.sub(r'[`’]', "'", html_name)

"""
Reads all HTML files in `data/cfb/<set_name>/` and extracts card names and
their scores, tagged with the set name, to the CSV `data/lsv_scores.csv`.
"""
for filename in glob.glob('data/cfb/*.html'):
dfs = []
for filename in glob.glob('data/cfb/*/*.html'):
print('Processing {}'.format(filename))
with open(filename) as fp:
soup = BeautifulSoup(fp)
set_name = os.path.basename(os.path.dirname(filename))
soup = BeautifulSoup(fp, features='html.parser')
headings = soup.find_all(['h1', 'h2', 'h3', 'h4'])

cards = []
Expand All @@ -19,17 +28,29 @@
# Scores are prefixed by `Limited: `
if heading.get_text().startswith('Limited:'):
score = re.sub(r'Limited:\s*', '', heading.get_text())
if not card_name:
raise 'Found score without card name'
cards.push([card_name, score])
if not len(score):
continue
if card_name is None:
print('Warning: Found score without card name. File: {} Score: {}'.format(filename, score), file=sys.stderr)
continue
cards.append([card_name, score])
card_name = None

# Card Titles are in Headings and on image alt texts
card_img = heading.find_next_sibling().find('img')
if not card_img or not card_img['alt'] == heading.get_text():
next_sibling = heading.find_next_sibling()
if not next_sibling:
continue
card_img = next_sibling.find('img')
html_name = heading.get_text()
cleaned_name = clean_name(html_name)
if not card_img or not card_img['alt'] in (html_name, cleaned_name):
continue

card_name = heading.get_text()

print(cards)
card_name = cleaned_name

df = pd.DataFrame.from_records(cards, columns=['name', 'score'])
df['set_name'] = set_name
dfs.append(df)

df = pd.concat(dfs)
df.to_csv('data/lsv_scores.csv')

0 comments on commit 17f1a7a

Please sign in to comment.