Skip to content

Commit

Permalink
BeautifulSoup Tutorial
Browse files Browse the repository at this point in the history
  • Loading branch information
CoreyMSchafer committed Nov 8, 2017
1 parent aff3362 commit fc5e8dd
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 0 deletions.
37 changes: 37 additions & 0 deletions BeautifulSoup/scrape.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from bs4 import BeautifulSoup
import requests
import csv

# Scrape every <article> on the coreyms.com front page and record its
# headline, first summary paragraph and (if present) YouTube link in
# cms_scrape.csv, echoing each value to stdout along the way.

# requests applies no timeout by default; set one so a stalled server
# cannot hang the script forever.
source = requests.get('http://coreyms.com', timeout=10).text

soup = BeautifulSoup(source, 'lxml')

# newline='' is required by the csv module (otherwise blank rows appear on
# Windows); the explicit encoding keeps non-ASCII headlines portable; the
# ``with`` block guarantees the file is closed even if parsing an article
# raises part-way through the loop.
with open('cms_scrape.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['headline', 'summary', 'video_link'])

    for article in soup.find_all('article'):
        headline = article.h2.a.text
        print(headline)

        summary = article.find('div', class_='entry-content').p.text
        print(summary)

        try:
            # find() returns None when the article has no embedded player,
            # so the subscript below raises TypeError in that case.
            vid_src = article.find('iframe', class_='youtube-player')['src']

            # The video id is the fifth '/'-separated piece of the embed
            # URL, with any '?query' suffix removed.
            vid_id = vid_src.split('/')[4]
            vid_id = vid_id.split('?')[0]

            yt_link = f'https://youtube.com/watch?v={vid_id}'
        except (TypeError, KeyError, IndexError):
            # No iframe, no src attribute, or an unexpected URL shape:
            # record the article without a video link.
            yt_link = None

        print(yt_link)

        print()

        csv_writer.writerow([headline, summary, yt_link])
31 changes: 31 additions & 0 deletions BeautifulSoup/simple.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
<!doctype html>
<!-- Sample page: a site title, two article blocks (each with a linked
     headline and a one-line summary) and a footer. -->
<html class="no-js" lang="">
<head>
    <title>Test - A Sample Website</title>
    <meta charset="utf-8">
    <link rel="stylesheet" href="css/normalize.css">
    <link rel="stylesheet" href="css/main.css">
</head>
<body>
    <h1 id="site_title">Test Website</h1>
    <!-- hr is a void element: it takes no closing tag. -->
    <hr>
    <div class="article">
        <h2><a href="article_1.html">Article 1 Headline</a></h2>
        <p>This is a summary of article 1</p>
    </div>
    <hr>
    <div class="article">
        <h2><a href="article_2.html">Article 2 Headline</a></h2>
        <p>This is a summary of article 2</p>
    </div>
    <hr>

    <div class="footer">
        <p>Footer Information</p>
    </div>

    <!-- Scripts are loaded at the end of the body, after the content. -->
    <script src="js/vendor/modernizr-3.5.0.min.js"></script>
    <script src="js/plugins.js"></script>
    <script src="js/main.js"></script>
</body>
</html>

0 comments on commit fc5e8dd

Please sign in to comment.