diff --git a/scraper/raw_inmate_data.py b/scraper/raw_inmate_data.py
index 44fe3a5..091f017 100644
--- a/scraper/raw_inmate_data.py
+++ b/scraper/raw_inmate_data.py
@@ -1,8 +1,14 @@
-import os.path
-import csv
+import os  # still required: __ensure_year_dir calls os.makedirs
+from os import path
+from pyquery import PyQuery
 from collections import OrderedDict
-import shutil
+from itertools import chain
+import csv, shutil, requests
+
+RAW_INMATE_DATA_RELEASE_URL = 'http://cookcountyjail.recoveredfactory.net/raw_inmate_data/'
+RAW_INMATE_DATA_STARTING_YEAR = '2014'
+RAW_INMATE_DATA_REQUEST_TIMEOUT = 30  # seconds; keeps a stalled server from hanging the scraper
 
 RAW_INMATE_DATA_BUILD_DIR = 'CCJ_RAW_INMATE_DATA_BUILD_DIR'
 RAW_INMATE_DATA_RELEASE_DIR = 'CCJ_RAW_INMATE_DATA_RELEASE_DIR'
 
@@ -30,10 +36,11 @@ class RawInmateData:
         ('Court_Location', 'court_house_location')
     ])
 
+
     def __init__(self, snap_shot_date, feature_controls, monitor):
         if feature_controls is None:
             feature_controls = {}
         self.__klass = type(self)
         self.__klass_name = self.__klass.__name__
         self.__monitor = monitor
         self.__snap_shot_date = snap_shot_date
@@ -45,6 +52,43 @@ def __init__(self, snap_shot_date, feature_controls, monitor):
         self.__feature_activated = False
         self.__configure_feature(feature_controls)
 
+
+    @staticmethod
+    def available_dates():
+        """ Return a list of dates for which there is csv data available.
+        The dates are in text format, as follows: YYYY-MM-DD. """
+        year_to_try = RAW_INMATE_DATA_STARTING_YEAR
+        result = True
+        dates = []
+        while result:
+            result = RawInmateData._available_dates_for_year(year_to_try)
+            year_to_try = str(int(year_to_try) + 1)
+            if result:
+                dates.extend(result)
+
+        return dates
+
+    @staticmethod
+    def _available_dates_for_year(year):
+        """ Given a year, query the raw inmate data API, and return
+        a list of dates for which there is csv data available there.
+        The dates are in text format, as follows: YYYY-MM-DD. Returns
+        None if the request fails, times out, or the year is not found. """
+        try:
+            result = requests.get(RAW_INMATE_DATA_RELEASE_URL + year, timeout=RAW_INMATE_DATA_REQUEST_TIMEOUT)
+        except requests.RequestException:
+            return None
+
+        if result.status_code != requests.codes.ok:
+            return None
+
+        doc = PyQuery(result.content)
+        # get a list of links from the directory page
+        # ignore the first link, which points to the dir above
+        dates = doc('a:not(:first-child)')
+        # drop the '.csv'
+        return [d.text_content()[:-4] for d in dates]
+
     def add(self, inmate_details):
         if not self.__feature_activated:
             return
@@ -70,11 +114,11 @@ def __debug(self, msg, debug_level=None):
         self.__monitor.debug('{0}: {1}'.format(self.__klass_name, msg), debug_level)
 
     def __ensure_year_dir(self):
-        year_dir = os.path.join(self.__raw_inmate_dir, self.__snap_shot_date.strftime('%Y'))
+        year_dir = path.join(self.__raw_inmate_dir, self.__snap_shot_date.strftime('%Y'))
         try:
             os.makedirs(year_dir)
         except OSError:
-            if not os.path.isdir(year_dir):
+            if not path.isdir(year_dir):
                 raise
         return year_dir
 
@@ -82,11 +126,30 @@ def __feature_control(self, feature_controls, feature_control):
         okay, dir_name = False, None
         if feature_control in feature_controls:
             dir_name = feature_controls[feature_control]
-            okay = os.path.isdir(dir_name)
+            okay = path.isdir(dir_name)
             if not okay:
                 self.__debug("'%s' does not exist or is not a directory" % dir_name)
         return okay, dir_name
 
+    @staticmethod
+    def fetch_data_for_date(date):
+        """ Return the raw inmate data for the supplied date, in YYYY-MM-DD format.
+        If the data can't be fetched, for whatever reason, returns None. """
+        if date not in RawInmateData.available_dates():
+            return None
+
+        chosen_year = date[:4]
+        query_url = RAW_INMATE_DATA_RELEASE_URL + chosen_year + '/' + date + '.csv'
+        try:
+            result = requests.get(query_url, timeout=RAW_INMATE_DATA_REQUEST_TIMEOUT)
+        except requests.RequestException:
+            return None
+
+        if result.status_code != requests.codes.ok:
+            return None
+
+        return result.content
+
     def __file_name(self):
         return self.__snap_shot_date.strftime('%Y-%m-%d.csv')
 
@@ -98,8 +161,12 @@ def finish(self):
         shutil.move(self.__build_file_name, year_dir)
 
     def __open_build_file(self):
-        self.__build_file_name = os.path.join(self.__build_dir, self.__file_name())
+        self.__build_file_name = path.join(self.__build_dir, self.__file_name())
         self.__build_file = open(self.__build_file_name, "w")
         self.__build_file_writer = csv.writer(self.__build_file)
         header_names = [header_name for header_name in RawInmateData.HEADER_METHOD_NAMES.iterkeys()]
         self.__build_file_writer.writerow(header_names)
+
+
+
+