Add BeautifulSoup (BS) extraction of LSV (Luis Scott-Vargas) scores

rsullivan00 · Sep 18, 2019 · 17f1a7a · 17f1a7a
1 parent 4eaa1ab
commit 17f1a7a
Show file tree

Hide file tree

Showing 5 changed files with 43 additions and 26 deletions.
diff --git a/bin/download_cfb_articles b/bin/download_cfb_articles
@@ -1,16 +1,3 @@
 #!/usr/bin/env bash
 
-ARTICLE_PREFIX=https://www.channelfireball.com/articles
-CFB_DATA_LOCATION=data/cfb
-
-if [ $# -eq 0 ]; then
-  echo "Usage: bin/download_cfb_articles <URL>"
-  exit 1
-fi
-
-mkdir -p $CFB_DATA_LOCATION
-
-ARTICLE_NAME=${1#"$ARTICLE_PREFIX"}
-ARTICLE_NAME=${ARTICLE_NAME%"/"}
-
-curl -o $CFB_DATA_LOCATION/$ARTICLE_NAME.html $1
+pipenv run python src/download_cfb.py
diff --git a/bin/extract_cfb_ratings b/bin/extract_cfb_ratings
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+
+pipenv run python src/scrape_cfb.py
diff --git a/cfb_articles.csv b/cfb_articles.csv
@@ -4,4 +4,10 @@ war,https://www.channelfireball.com/articles/luis-scott-vargas/war-of-the-spark-
 war,https://www.channelfireball.com/articles/war-of-the-spark-limited-set-review-black/
 war,https://www.channelfireball.com/articles/war-of-the-spark-limited-set-review-red/
 war,https://www.channelfireball.com/articles/war-of-the-spark-limited-set-review-green/
-war,https://www.channelfireball.com/articles/war-of-the-spark-limited-set-review-gold-artifacts-and-lands/
+war,https://www.channelfireball.com/articles/war-of-the-spark-limited-set-review-gold-artifacts-and-lands/
+m20,https://www.channelfireball.com/articles/core-set-2020-limited-set-review-black/
+m20,https://www.channelfireball.com/articles/core-set-2020-limited-set-review-white/
+m20,https://www.channelfireball.com/articles/core-set-2020-limited-set-review-blue/
+m20,https://www.channelfireball.com/articles/core-set-2020-limited-set-review-red/
+m20,https://www.channelfireball.com/articles/core-set-2020-limited-set-review-green-and-gold/
+m20,https://www.channelfireball.com/articles/core-set-2020-limited-set-review-gold-artifacts-and-lands/
diff --git a/src/download_cfb.py b/src/download_cfb.py
@@ -19,4 +19,4 @@
 
     print('Downloading {}'.format(url))
     with open(html_name, 'w') as html_file:
-        html_file.write(requests.get(url).text)
+        html_file.write(requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).text)
diff --git a/src/scrape_cfb.py b/src/scrape_cfb.py
@@ -1,15 +1,24 @@
 from bs4 import BeautifulSoup
 import glob
 import re
+import sys
+import pandas as pd
+import os
 
 
+def clean_name(html_name):
+    return re.sub(r'[`’]', "'", html_name)
+
 """
 Reads all HTML files in `data/cfb` and extracts card names and their scores to
 a CSV.
 """
-for filename in glob.glob('data/cfb/*.html'):
+dfs = []
+for filename in glob.glob('data/cfb/*/*.html'):
+    print('Processing {}'.format(filename))
     with open(filename) as fp:
-        soup = BeautifulSoup(fp)
+        set_name = os.path.basename(os.path.dirname(filename))
+        soup = BeautifulSoup(fp, features='html.parser')
         headings = soup.find_all(['h1', 'h2', 'h3', 'h4'])
 
         cards = []
@@ -19,17 +28,29 @@
             # Scores are prefixed by `Limited: `
             if heading.get_text().startswith('Limited:'):
                 score = re.sub(r'Limited:\s*', '', heading.get_text())
-                if not card_name:
-                    raise 'Found score without card name'
-                cards.push([card_name, score])
+                if not len(score):
+                    continue
+                if card_name is None:
+                    print('Warning: Found score without card name. File: {} Score: {}'.format(filename, score), file=sys.stderr)
+                    continue
+                cards.append([card_name, score])
+                card_name = None
 
             # Card Titles are in Headings and on image alt texts
-            card_img = heading.find_next_sibling().find('img')
-            if not card_img or not card_img['alt'] == heading.get_text():
+            next_sibling = heading.find_next_sibling()
+            if not next_sibling:
+                continue
+            card_img = next_sibling.find('img')
+            html_name = heading.get_text()
+            cleaned_name = clean_name(html_name)
+            if not card_img or not card_img['alt'] in (html_name, cleaned_name):
                 continue
 
-            card_name = heading.get_text()
-
-        print(cards)
+            card_name = cleaned_name
 
+        df = pd.DataFrame.from_records(cards, columns=['name', 'score'])
+        df['set_name'] = set_name
+        dfs.append(df)
 
+df = pd.concat(dfs)
+df.to_csv('data/lsv_scores.csv')
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		#!/usr/bin/env bash

		pipenv run python src/scrape_cfb.py