Skip to content

Commit

Permalink
Fixes issue sc3#443; add 'available_dates' and 'fetch_data_for_date' …
Browse files Browse the repository at this point in the history
…command for interfacing with raw inmate data
  • Loading branch information
bepetersn committed Jun 27, 2014
1 parent c4b3f43 commit e560759
Showing 1 changed file with 74 additions and 9 deletions.
83 changes: 74 additions & 9 deletions scraper/raw_inmate_data.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@

import os.path
import csv
from os import path
from pyquery import PyQuery
from collections import OrderedDict
import shutil
from itertools import chain
import csv, shutil, requests

# Base URL of the published raw inmate data; a year is appended to it to
# reach that year's directory listing.
RAW_INMATE_DATA_RELEASE_URL = 'http://cookcountyjail.recoveredfactory.net/raw_inmate_data/'
# First year probed by RawInmateData.available_dates(); kept as a string
# because it ends up concatenated onto the release URL.
RAW_INMATE_DATA_STARTING_YEAR = '2014'

# NOTE(review): presumably the feature-control keys that name the build and
# release directories -- their use is not visible in this excerpt; confirm.
RAW_INMATE_DATA_BUILD_DIR = 'CCJ_RAW_INMATE_DATA_BUILD_DIR'
RAW_INMATE_DATA_RELEASE_DIR = 'CCJ_RAW_INMATE_DATA_RELEASE_DIR'
Expand Down Expand Up @@ -30,10 +34,11 @@ class RawInmateData:
('Court_Location', 'court_house_location')
])


def __init__(self, snap_shot_date, feature_controls, monitor):
if feature_controls is None:
feature_controls = {}
self.__klass = type(self)
featu.path

This comment has been minimized.

Copy link
@bepetersn

bepetersn Jun 27, 2014

Author Owner

Wow, weird deletions.

ass = type(self)
self.__klass_name = self.__klass.__name__
self.__monitor = monitor
self.__snap_shot_date = snap_shot_date
Expand All @@ -45,6 +50,43 @@ def __init__(self, snap_shot_date, feature_controls, monitor):
self.__feature_activated = False
self.__configure_feature(feature_controls)


@staticmethod
def available_dates():
    """ List every date for which csv data has been published.

    Years are probed one at a time, beginning with
    RAW_INMATE_DATA_STARTING_YEAR, and probing stops at the first year
    that yields nothing. Each date is text of the form YYYY-MM-DD. """
    collected = []
    year = int(RAW_INMATE_DATA_STARTING_YEAR)
    while True:
        found = RawInmateData._available_dates_for_year(str(year))
        if not found:
            # None (request failed / bad status) or an empty list ends the scan
            break
        collected.extend(found)
        year += 1

    return collected

@staticmethod
def _available_dates_for_year(year):
    """ Given a year, query the raw inmate data API, and return
    a list of dates for which there is csv data available there.
    The dates are in text format, as follows: YYYY-MM-DD.

    year -- the year as a string; it is appended to
            RAW_INMATE_DATA_RELEASE_URL to form the listing URL.

    Returns None if the request raises or the response status is not
    OK, otherwise the (possibly empty) list of dates. """
    try:
        result = requests.get(RAW_INMATE_DATA_RELEASE_URL + year)
    except requests.RequestException:
        return None

    if result.status_code != requests.codes.ok:
        return None

    doc = PyQuery(result.content)
    # get a list of links from the directory page
    # ignore the first link, which points to the dir above
    links = doc('a:not(:first-child)')
    suffix = '.csv'
    dates = []
    for link in links:
        name = link.text_content()
        # Only csv files name a date; blindly slicing four characters off
        # any other link text would produce a bogus entry.
        if name.endswith(suffix):
            dates.append(name[:-len(suffix)])
    return dates

def add(self, inmate_details):
if not self.__feature_activated:
return
Expand All @@ -70,23 +112,42 @@ def __debug(self, msg, debug_level=None):
self.__monitor.debug('{0}: {1}'.format(self.__klass_name, msg), debug_level)

def __ensure_year_dir(self):
    """ Make sure the directory for the snapshot's year exists under the
    raw inmate directory, and return its path.

    Raises OSError if the directory cannot be created and does not
    already exist. """
    # Imported here so that `os.makedirs` resolves even when the module
    # only does `from os import path`, which does not bind the name `os`.
    import os

    year_dir = os.path.join(self.__raw_inmate_dir, self.__snap_shot_date.strftime('%Y'))
    try:
        os.makedirs(year_dir)
    except OSError:
        # An already-existing directory is fine; anything else is a real failure.
        if not os.path.isdir(year_dir):
            raise
    return year_dir

# Look up `feature_control` in the `feature_controls` mapping and report
# whether the value stored there names an existing directory.
# Returns the pair (okay, dir_name); it is (False, None) when the key is absent.
# NOTE(review): indentation was lost in this view, so whether the
# `if not okay` check sits inside the key lookup cannot be confirmed here.
def __feature_control(self, feature_controls, feature_control):
okay, dir_name = False, None
if feature_control in feature_controls:
dir_name = feature_controls[feature_control]
# NOTE(review): the two assignments below look like the removed and added
# sides of one diff line (`os.path` -> `path`); confirm only one survives.
okay = os.path.isdir(dir_name)
okay = path.isdir(dir_name)
if not okay:
# Report the unusable directory name through the monitor's debug channel.
self.__debug("'%s' does not exist or is not a directory" % dir_name)
return okay, dir_name

@staticmethod
def fetch_data_for_date(date):
    """ Download the raw inmate csv for one date.

    date -- the wanted date as text, formatted YYYY-MM-DD.

    Returns the body of the response, or None when the date is not among
    the available dates, the request raises, or the status is not OK. """
    if date not in RawInmateData.available_dates():
        return None

    # the csv files are grouped into one directory per year
    year = date[:4]
    csv_url = ''.join((RAW_INMATE_DATA_RELEASE_URL, year, '/', date, '.csv'))
    try:
        response = requests.get(csv_url)
    except requests.RequestException:
        return None

    if response.status_code == requests.codes.ok:
        return response.content
    return None

def __file_name(self):
    """ Name of the csv file for the snapshot date, formatted YYYY-MM-DD.csv. """
    snapshot_day = self.__snap_shot_date
    return snapshot_day.strftime('%Y-%m-%d.csv')

Expand All @@ -98,8 +159,12 @@ def finish(self):
shutil.move(self.__build_file_name, year_dir)

def __open_build_file(self):
    """ Create the csv build file for the snapshot date and write its
    header row.

    The file is left open; its name, its handle and a csv writer for it
    are stored on the instance. """
    self.__build_file_name = path.join(self.__build_dir, self.__file_name())
    self.__build_file = open(self.__build_file_name, "w")
    self.__build_file_writer = csv.writer(self.__build_file)
    # Iterating the mapping yields its keys, so this matches iterkeys()
    # while also working where dicts have no iterkeys() method.
    header_names = list(RawInmateData.HEADER_METHOD_NAMES)
    self.__build_file_writer.writerow(header_names)




0 comments on commit e560759

Please sign in to comment.