Skip to content

Commit

Permalink
[eeoc] Update for new site (#300)
Browse files Browse the repository at this point in the history
* [eeoc] Update for new site

* Add comment for IG web team about including reports from old site

* Add commented-out placeholder for timestamp map

* Removed unused import

* Discard old EEOC scraper

* [eeoc] Fill in published dates

* [eeoc] Use hardcoded dates instead of fiscal year

* [eeoc] Skip transmittal letters
  • Loading branch information
shanecav84 authored and divergentdave committed Jan 31, 2017
1 parent 4d3a6bb commit c70b725
Showing 1 changed file with 118 additions and 123 deletions.
241 changes: 118 additions & 123 deletions inspectors/eeoc.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,160 +2,155 @@

import datetime
import logging
import os
import re
from urllib.parse import urljoin

from utils import utils, inspector
from utils import utils, inspector, admin

# http://www.eeoc.gov/eeoc/oig/index.cfm
archive = 2003
# https://oig.eeoc.gov/
# NOTE(review): `archive` is assigned twice; the 2003 value above looks like a
# leftover from the old-site scraper and is immediately overwritten here.
archive = 1995

# options:
# standard since/year options for a year range to fetch from.
#
# Notes for IG's web team:
#

# NOTE(review): REPORTS_URL points at the old site; it appears to be used only
# by the old-site code paths below -- confirm before removing.
REPORTS_URL = "http://www.eeoc.gov/eeoc/oig/"
# - include all reports from old site (https://www.eeoc.gov/eeoc/oig/)
# - some old reports could be run through an OCR maybe
# - include full publication dates in HTML

# Index pages scraped for reports on the new site, tagged with the report type
# they yield; INSPECTOR_URL is the base used to resolve relative report paths.
AUDIT_REPORTS_URL = "https://oig.eeoc.gov/reports/audits"
CONGRESSIONAL_REPORTS_URL = "https://oig.eeoc.gov/reports/semi-annual"
REPORT_URLS = [
("audit", AUDIT_REPORTS_URL),
("congress", CONGRESSIONAL_REPORTS_URL),
]
INSPECTOR_URL = "https://oig.eeoc.gov/"

# Hardcoded publication dates, keyed by report id (the last path segment of a
# report's landing-page URL).  Used because the site apparently exposes only a
# fiscal year for many reports (see the web-team note above asking for full
# dates in HTML).
# NOTE(review): the first four single-quoted entries use a different id scheme
# and look like leftovers from the pre-rewrite scraper -- confirm whether they
# still match any current report ids.
REPORT_PUBLISHED_MAP = {
'2005-02-amr': datetime.datetime(2005, 1, 1),
'2005-03-prop': datetime.datetime(2005, 9, 1),
'2005-08-mgt': datetime.datetime(2005, 10, 27),
'nccrep': datetime.datetime(2006, 6, 29),
"1995-005-atr": datetime.datetime(1995, 7, 1),
"1997-005-ape": datetime.datetime(1998, 6, 1),
"1999-008-impr": datetime.datetime(1999, 10, 1),
"1999-019-impr": datetime.datetime(1999, 10, 1),
"2001-019-aic": datetime.datetime(2002, 9, 1),
"2002-013-fin": datetime.datetime(2003, 12, 22),
"2003-001-amr": datetime.datetime(2004, 9, 1),
"2003-004-amr": datetime.datetime(2004, 1, 1),
"2003-004-fin": datetime.datetime(2004, 11, 1),
"2003-005-mis": datetime.datetime(2005, 9, 30),
"2003-015-amr": datetime.datetime(2002, 10, 1),
"2005-002-amr": datetime.datetime(2006, 7, 1),
"2005-003-prop": datetime.datetime(2005, 5, 1),
"2005-008-mgt": datetime.datetime(2005, 10, 1),
"2006-002-fin": datetime.datetime(2006, 10, 1),
"2006-007-amr": datetime.datetime(2007, 2, 1),
"2007-003-amr": datetime.datetime(2007, 4, 1),
"2007-007-adv": datetime.datetime(2009, 4, 24),
"2007-011-rfpe": datetime.datetime(2008, 1, 1),
"2007-012-amr": datetime.datetime(2008, 3, 27),
"2008-003-amr": datetime.datetime(2008, 9, 26),
"2008-006-fin": datetime.datetime(2008, 10, 1),
"2008-012-aep": datetime.datetime(2008, 9, 30),
"2009-004-fin": datetime.datetime(2009, 11, 13),
"2010-009-aep": datetime.datetime(2011, 3, 10),
"2011-001-aep": datetime.datetime(2011, 7, 15),
"2011-002-aep": datetime.datetime(2011, 9, 30),
"2011-002-fin": datetime.datetime(2011, 11, 11),
"2011-005-fism": datetime.datetime(2011, 11, 21),
"2011-apr-sep": datetime.datetime(2011, 11, 1),
"2011-oct-mar": datetime.datetime(2011, 4, 30),
"2012-001-fin": datetime.datetime(2012, 11, 15),
"2012-003-fism": datetime.datetime(2012, 9, 1),
"2012-008-purc": datetime.datetime(2013, 3, 26),
"2012-009-rev": datetime.datetime(2013, 4, 9),
"2012-010-pmev": datetime.datetime(2013, 3, 19),
"2012-apr-sep": datetime.datetime(2012, 11, 1),
"2012-oct-mar": datetime.datetime(2012, 4, 30),
"2013-001-fin": datetime.datetime(2013, 12, 12),
"2013-003-caro": datetime.datetime(2014, 9, 23),
"2013-005-fism": datetime.datetime(2013, 12, 5),
"2013-008-psa": datetime.datetime(2014, 9, 15),
"2013-apr-sep": datetime.datetime(2013, 11, 6),
"2013-oct-mar": datetime.datetime(2013, 4, 30),
"2014-001-fin": datetime.datetime(2014, 11, 17),
"2014-003-oe": datetime.datetime(2015, 5, 1),
"2014-008-eoig": datetime.datetime(2014, 12, 16),
"2014-apr-sep": datetime.datetime(2014, 10, 31),
"2014-oct-mar-0": datetime.datetime(2014, 4, 30),
"2015-001-fin": datetime.datetime(2015, 12, 16),
"2015-001-iper": datetime.datetime(2015, 5, 13),
"2015-001-lit": datetime.datetime(2016, 6, 1),
"2015-003-eoig": datetime.datetime(2015, 12, 9),
"2015-apr-sep": datetime.datetime(2015, 11, 1),
"2015-oct-mar": datetime.datetime(2015, 4, 30),
"2016-0004-aoig": datetime.datetime(2016, 5, 11),
"2016-001-aoig": datetime.datetime(2016, 11, 15),
"2016-008-eoig": datetime.datetime(2017, 1, 4),
"2016-012-aep": datetime.datetime(2016, 9, 30),
"2016-apr-sep": datetime.datetime(2016, 11, 1),
"2016-oct-mar": datetime.datetime(2016, 5, 1),
}

def run(options):
  """Scrape the EEOC OIG index page and save every report in the year range."""
  year_range = inspector.year_range(options, archive)

  # The index page lays reports out in a two-column table: semiannual reports
  # on the left, everything else on the right.
  doc = utils.beautifulsoup_from_url(REPORTS_URL)
  semiannual_column, other_column = doc.select("table tr")[1].select("td")

  if not semiannual_column:
    raise inspector.NoReportsFoundError("EEOC (semiannual reports)")
  if not other_column:
    raise inspector.NoReportsFoundError("EEOC (other reports)")

  # Collapse neighboring list items that link to the same file.
  merge_items(semiannual_column)
  merge_items(other_column)

  for item in semiannual_column.select("li"):
    report = semiannual_report_from(item, year_range, title_prefix="Semiannual Report - ")
    if report:
      inspector.save_report(report)

  for item in other_column.select("li"):
    report = report_from(item, year_range)
    if report:
      inspector.save_report(report)

def semiannual_report_from(result, year_range, title_prefix=None):
  """Build a report dict from one semiannual-report <li>; None if out of range."""
  link = result.find("a")
  report_url = urljoin(REPORTS_URL, link.get('href'))
  filename = report_url.split("/")[-1]
  report_id = os.path.splitext(filename)[0]

  # The link text ends with the date after the last dash, e.g. "... - May 1, 2005".
  date_text = link.text.split("-")[-1].strip()
  published_on = datetime.datetime.strptime(date_text, '%B %d, %Y')

  title = "{}{}".format(title_prefix, link.text) if title_prefix else link.text

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return None

  return {
    'inspector': "eeoc",
    'inspector_url': "http://www.eeoc.gov/eeoc/oig/",
    'agency': "eeoc",
    'agency_name': "Equal Employment Opportunity Commission",
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }

# NOTE(review): this span is diff residue.  The first five lines are the header
# of the removed old-site `report_from(result, year_range)`, and the lines from
# `for report_type, report_url in REPORT_URLS:` onward are the body of the NEW
# `run` loop (they iterate REPORT_URLS and call the 3-argument report_from
# defined below) -- the diff rendering fused the two.  Do not treat this as one
# coherent function.
def report_from(result, year_range):
link = result.find("a")
report_url = urljoin(REPORTS_URL, link.get('href'))
report_filename = report_url.split("/")[-1]
report_id, _ = os.path.splitext(report_filename)
# New `run` body: fetch each index page, take the first-column links (or the
# "more" links as a fallback selector) and save each parsed report.
for report_type, report_url in REPORT_URLS:
doc = utils.beautifulsoup_from_url(report_url)
results = doc.select("tbody tr > td:nth-of-type(1) a") or doc.select(".views-more-link")
for result in results:
report = report_from(result, year_range, report_type)
if report:
inspector.save_report(report)

def report_from(result, year_range, report_type):
  """Fetch a report's landing page and build the report dict to save.

  result: an <a> tag from an index page, whose href is the landing-page path.
  Returns None when the report has no parseable date or its fiscal year falls
  outside year_range.

  Fix: the diff rendering had interleaved removed-side lines into this body
  (`title = link.text` with `link` undefined, a year check referencing the
  undefined `report_url`, and duplicate 'inspector_url'/'url' dict keys);
  those stale lines are dropped so only the new-site logic remains.
  """
  path = result.get("href")
  html_report_url = urljoin(INSPECTOR_URL, path)
  html_report = utils.beautifulsoup_from_url(html_report_url)
  report_id = path.split('/')[-1]
  title = html_report.find("span", {"property": "dc:title"})['content']
  fiscal_year = fiscal_year_parse(html_report)

  # Each landing page should expose exactly one PDF once transmittal letters
  # are filtered out; anything else means the page layout changed.
  links = html_report.select(".file a")
  hrefs = filter_links(links)
  if len(hrefs) > 1:
    raise Exception("Found multiple links on {}:\n{}".format(html_report_url,
                                                             hrefs))
  if len(hrefs) == 0:
    raise Exception("Found no links on {}".format(html_report_url))
  pdf_report_url = hrefs[0]

  if report_id in REPORT_PUBLISHED_MAP:
    published_on = REPORT_PUBLISHED_MAP[report_id]
  else:
    # Dates in the link text come in several formats; try each in turn:
    # "(Month DD, YYYY)", then "(Month YYYY)", then a bare "Month DD, YYYY".
    try:
      published_on_text = "-".join(re.search('\((\w+) (\d+),?\s(\d{4})\)', result.text).groups())
      published_on = datetime.datetime.strptime(published_on_text, '%B-%d-%Y')
    except AttributeError:
      try:
        published_on_text = "-".join(re.search('\((\w+)\s(\d{4})\)', result.text).groups())
        published_on = datetime.datetime.strptime(published_on_text, '%B-%Y')
      except AttributeError:
        try:
          published_on_text = "-".join(re.search('(\w+) (\d+),?\s(\d{4})', result.text).groups())
          published_on = datetime.datetime.strptime(published_on_text, '%B-%d-%Y')
        except AttributeError:
          # No date anywhere: record the gap for the admins and skip it.
          admin.log_no_date("eeoc", report_id, title, pdf_report_url)
          return None

  if fiscal_year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % pdf_report_url)
    return None

  report = {
    'inspector': "eeoc",
    'inspector_url': INSPECTOR_URL,
    'agency': "eeoc",
    'agency_name': "Equal Employment Opportunity Commission",
    'report_id': report_id,
    'url': pdf_report_url,
    'title': title,
    'type': report_type,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }

  return report

def merge_items(parent):
  """Merge neighboring <li> tags that each hold one link to the same href.

  When two adjacent list items each contain exactly one <a> pointing at the
  same href, the second item's link contents and remaining children are moved
  into the first item and the emptied tags are removed from the tree.
  """
  list_items = parent.find_all("li")
  # Walk pairs from the end so removals don't disturb unvisited indices.
  for index in range(len(list_items) - 2, -1, -1):
    current_item = list_items[index]
    next_item = list_items[index + 1]

    current_links = current_item.find_all("a")
    if len(current_links) != 1:
      continue
    next_links = next_item.find_all("a")
    if len(next_links) != 1:
      continue
    current_link = current_links[0]
    next_link = next_links[0]

    if current_link.get("href") != next_link.get("href"):
      continue

    # Move the second link's contents into the end of the first link.
    current_link.append(" ")
    while next_link.contents:
      current_link.append(next_link.contents[0].extract())

    # The second link is empty now; drop it from the tree.
    next_link.extract()

    # Move everything else from the second <li> into the first <li>.
    while next_item.contents:
      current_item.append(next_item.contents[0].extract())

    # Drop the emptied list item.
    next_item.extract()
def fiscal_year_parse(html_report):
  """Return the report's fiscal year (as an int) from its landing page.

  Reads the text of the "field-item" node inside the page's
  "field-name-field-fiscal-year" element.
  """
  year_field = html_report.find(class_="field-name-field-fiscal-year")
  year_item = year_field.find(class_="field-item")
  return int(year_item.get_text())

def filter_links(links):
  """Return the hrefs of the given <a> tags, skipping transmittal letters.

  A link is dropped when its text mentions "Trans_Rept" or "Transmittal".
  """
  kept = []
  for link in links:
    text = link.text
    if "Trans_Rept" in text or "Transmittal" in text:
      continue
    kept.append(link["href"])
  return kept

utils.run(run) if (__name__ == "__main__") else None

0 comments on commit c70b725

Please sign in to comment.