diff --git a/README.md b/README.md index e6754e4..c51f555 100644 --- a/README.md +++ b/README.md @@ -96,6 +96,6 @@ Currently you can use: - [x] example code to loop and build file, may need a function first - [x] add WICHE dataset -- [] review and consider classes for each survey -- [] add altair for viz +- [x] review and consider classes for each survey (keep as functions for now) +- [x] add altair for viz - [] go back on older surveys farther than standard naming syntax (hd is older than 2002) diff --git a/pypeds/archive.py b/pypeds/archive.py index 73494f3..583dd58 100644 --- a/pypeds/archive.py +++ b/pypeds/archive.py @@ -49,213 +49,63 @@ def read_survey(path): return(survey_file) -###### utilities to build url data -# build a valid ipeds survey url - return a dict with a survey key and url for download -def get_hd(year): - # assert that year is a int and length 1 - assert isinstance(year, int), "year is not an integer" - assert year >= 2002 and year <= 2017, "year must be >=2002 and < 2017" - # build the SURVEY id - SURVEY = 'HD' + str(year) - # build the url - URL = "https://nces.ed.gov/ipeds/datacenter/data/{}.zip".format(SURVEY) - # return the bits as a dictionary for use later - return({'url': URL, 'survey': SURVEY}) -def get_ic(year): - # assert that year is a int and length 1 - assert isinstance(year, int), "year is not an integer" - assert year >= 2002 and year <= 2017, "year must be >=2002 and < 2017" - # build the SURVEY id - SURVEY = 'IC' + str(year) - # build the url - URL = "https://nces.ed.gov/ipeds/datacenter/data/{}.zip".format(SURVEY) - # return the bits as a dictionary for use later - return({'url': URL, 'survey': SURVEY}) +############ -def get_adm(year): - # assert that year is a int and length 1 - assert isinstance(year, int), "year is not an integer" - assert year >= 2014 and year <= 2017, "year must be >=2002 and < 2017" - # build the SURVEY id - SURVEY = 'ADM' + str(year) - # build the url - URL = "https://nces.ed.gov/ipeds/datacenter/data/{}.zip".format(SURVEY) - # return the bits as a dictionary for use later - return({'url': URL, 'survey': SURVEY}) - - -def get_sfa(year): - # assert that year is a int and length 1 - assert isinstance(year, int), "year is not an integer" - assert year >= 2002 and year <= 2017, "year must be >=2002 and < 2017" - # build the SURVEY id - sfa_year = str(year - 1)[2:] + str(year)[2:] - SURVEY = 'SFA' + str(sfa_year) - # build the url - URL = "https://nces.ed.gov/ipeds/datacenter/data/{}.zip".format(SURVEY) - # return the bits as a dictionary for use later - return({'url': URL, 'survey': SURVEY}) - -def get_efc(year): - # assert that year is a int and length 1 - assert isinstance(year, int), "year is not an integer" - assert year >= 2002 and year <= 2017, "year must be >=2002 and < 2017" - # build the SURVEY id - SURVEY = 'EF' + str(year) + "C" - # build the url - URL = "https://nces.ed.gov/ipeds/datacenter/data/{}.zip".format(SURVEY) - # return the bits as a dictionary for use later - return({'url': URL, 'survey': SURVEY}) - -def get_icay(year): - # assert that year is a int and length 1 - assert isinstance(year, int), "year is not an integer" - assert year >= 2002 and year <= 2017, "year must be >=2002 and < 2017" - # build the SURVEY id - SURVEY = 'IC' + str(year) + "_AY" - # build the url - URL = "https://nces.ed.gov/ipeds/datacenter/data/{}.zip".format(SURVEY) - # return the bits as a dictionary for use later - return({'url': URL, 'survey': SURVEY}) - - -###### utilities to crawl and return a big dataset for the survey - -def hd(years = None): - # returns a dataframe of 1 or more survey collections - # will always use the revised file _rv, if the file has it - assert isinstance(years, list), "year is not a list of integers" - # init a dataframe to append things to - hd_df = pd.DataFrame({'pypeds_init': [True]}) - for year in years: - year_info = get_hd(year) - year_fpath = zip_parser(url=year_info['url'], survey=year_info['survey']) - tmp_df = read_survey(year_fpath) - tmp_df.columns = tmp_df.columns.str.lower() - tmp_df['survey_year'] = int(year) - tmp_df['fall_year'] = int(year) - hd_df = hd_df.append(tmp_df, ignore_index=True, sort=False) - # print("finished hd for year {}".format(str(year))) - # finish up - # ignore pandas SettingWithCopyWarning, basically - pd.options.mode.chained_assignment = None - hd_df_final = hd_df.loc[hd_df.pypeds_init != True, ] - hd_df_final.drop(columns=['pypeds_init'], inplace=True) - return(hd_df_final) - -def ic(years = None): - # returns a dataframe of 1 or more survey collections - # will always use the revised file _rv, if the file has it - assert isinstance(years, list), "year is not a list of integers" - # init dataframes to append things to - ic_df = pd.DataFrame({'pypeds_init': [True]}) - adm_df = pd.DataFrame({'pypeds_init': [True]}) - # loop for ic and conditional check for adm - for year in years: - year_info = get_ic(year) - year_fpath = zip_parser(url=year_info['url'], survey=year_info['survey']) - tmp_df = read_survey(year_fpath) - tmp_df.columns = tmp_df.columns.str.lower() - tmp_df['survey_year'] = int(year) - tmp_df['fall_year'] = int(year) - ic_df = ic_df.append(tmp_df, ignore_index=True, sort=False) - # check the year to get the admission data for 2014 and later - if year >= 2014: - year_info = get_adm(year) - year_fpath = zip_parser(url=year_info['url'], survey=year_info['survey']) - tmp_df = read_survey(year_fpath) - tmp_df.columns = tmp_df.columns.str.lower() - tmp_df['survey_year'] = int(year) - tmp_df['fall_year'] = int(year) - adm_df = adm_df.append(tmp_df, ignore_index=True, sort=False) +class IC(object): + """docstring""" - # finish up - # ignore pandas SettingWithCopyWarning, - pd.options.mode.chained_assignment = None - ic_df_final = ic_df.loc[ic_df.pypeds_init != True, ] - ic_df_final.drop(columns=['pypeds_init'], inplace=True) - adm_df_final = adm_df.loc[adm_df.pypeds_init != True, ] - adm_df_final.drop(columns=['pypeds_init'], inplace=True) - df = pd.merge(ic_df_final, adm_df_final, how="left", on=['unitid', 'survey_year'], suffixes=('_ic', '_adm')) - return(df) + # init + def __init__(self, years=[2017]): + """Constructor""" + self.years = years -def sfa(years = None): - # returns a dataframe of 1 or more survey collections - # will always use the revised file _rv, if the file has it - assert isinstance(years, list), "year is not a list of integers" - # init a dataframe to append things to - sfa_df = pd.DataFrame({'pypeds_init': [True]}) - for year in years: - year_info = get_sfa(year) - year_fpath = zip_parser(url=year_info['url'], survey=year_info['survey']) - tmp_df = read_survey(year_fpath) - tmp_df.columns = tmp_df.columns.str.lower() - tmp_df['survey_year'] = int(year) - tmp_df['fall_year'] = int(year-1) - sfa_df = sfa_df.append(tmp_df, ignore_index=True, sort=False) - # print("finished hd for year {}".format(str(year))) - # finish up - # ignore pandas SettingWithCopyWarning, basically - pd.options.mode.chained_assignment = None - sfa_df_final = sfa_df.loc[sfa_df.pypeds_init != True, ] - sfa_df_final.drop(columns=['pypeds_init'], inplace=True) - return(sfa_df_final) + # testing + def get_test(self): + for year in self.years: + print(year) -def efc(years = None): - # returns a dataframe of 1 or more survey collections - # will always use the revised file _rv, if the file has it - assert isinstance(years, list), "year is not a list of integers" - # init a dataframe to append things to - efc_df = pd.DataFrame({'pypeds_init': [True]}) - for year in years: - year_info = get_efc(year) - year_fpath = zip_parser(url=year_info['url'], survey=year_info['survey']) - tmp_df = read_survey(year_fpath) - tmp_df.columns = tmp_df.columns.str.lower() - tmp_df['survey_year'] = int(year) - tmp_df['fall_year'] = int(year) - efc_df = efc_df.append(tmp_df, ignore_index=True, sort=False) - # print("finished hd for year {}".format(str(year))) - # finish up - # ignore pandas SettingWithCopyWarning, basically - pd.options.mode.chained_assignment = None - efc_df_final = efc_df.loc[efc_df.pypeds_init != True, ] - efc_df_final.drop(columns=['pypeds_init'], inplace=True) - return(efc_df_final) -def icay(years = None): - # returns a dataframe of 1 or more survey collections - # will always use the revised file _rv, if the file has it - assert isinstance(years, list), "year is not a list of integers" - # init a dataframe to append things to - icay_df = pd.DataFrame({'pypeds_init': [True]}) - for year in years: - year_info = get_icay(year) - year_fpath = zip_parser(url=year_info['url'], survey=year_info['survey']) - tmp_df = read_survey(year_fpath) - tmp_df.columns = tmp_df.columns.str.lower() - tmp_df['survey_year'] = int(year) - tmp_df['fall_year'] = int(year) - icay_df = icay_df.append(tmp_df, ignore_index=True, sort=False) - # print("finished hd for year {}".format(str(year))) - # finish up - # ignore pandas SettingWithCopyWarning, basically - pd.options.mode.chained_assignment = None - icay_df_final = icay_df.loc[icay_df.pypeds_init != True, ] - icay_df_final.drop(columns=['pypeds_init'], inplace=True) - return(icay_df_final) + # method to get the data and return a dataframe + def get(self): + # setup the df + init_df = pd.DataFrame({'pypeds_init': [True]}) + for year in self.years: + # assert that year is a int and length 1 + assert isinstance(year, int), "year is not an integer" + assert year >= 2002 and year <= 2017, "year must be >=2002 and < 2017" + # build the SURVEY id + SURVEY = 'IC' + str(year) + # build the url + URL = "https://nces.ed.gov/ipeds/datacenter/data/{}.zip".format(SURVEY) + # return the bits as a dictionary for use later + year_info = {'url': URL, 'survey': SURVEY} + #year_info = get_efc(year) + year_fpath = zip_parser(url=year_info['url'], survey=year_info['survey']) + tmp_df = read_survey(year_fpath) + tmp_df.columns = tmp_df.columns.str.lower() + tmp_df['survey_year'] = int(year) + tmp_df['fall_year'] = int(year) + init_df = init_df.append(tmp_df, ignore_index=True, sort=False) + # print("finished hd for year {}".format(str(year))) + # finish up + # ignore pandas SettingWithCopyWarning, basically + pd.options.mode.chained_assignment = None + init_df = init_df.loc[init_df.pypeds_init != True, ] + init_df.drop(columns=['pypeds_init'], inplace=True) + return(init_df) -class IC(object): +class HD(object): """docstring""" - + # init def __init__(self, years=[2017]): """Constructor""" + assert isinstance(years, list), "year is not a list of integers representing 4-digit year for survey" self.years = years - + # testing def get_test(self): for year in self.years: @@ -271,7 +121,7 @@ def get(self): assert isinstance(year, int), "year is not an integer" assert year >= 2002 and year <= 2017, "year must be >=2002 and < 2017" # build the SURVEY id - SURVEY = 'IC' + str(year) + SURVEY = 'HD' + str(year) # build the url URL = "https://nces.ed.gov/ipeds/datacenter/data/{}.zip".format(SURVEY) # return the bits as a dictionary for use later @@ -290,3 +140,5 @@ def get(self): init_df = init_df.loc[init_df.pypeds_init != True, ] init_df.drop(columns=['pypeds_init'], inplace=True) return(init_df) + + diff --git a/pypeds/ipeds.py b/pypeds/ipeds.py index cce6ffa..b072060 100644 --- a/pypeds/ipeds.py +++ b/pypeds/ipeds.py @@ -49,92 +49,200 @@ def read_survey(path): return(survey_file) -class IC(object): - """docstring""" - - # init - def __init__(self, years=[2017]): - """Constructor""" - assert isinstance(years, list), "year is not a list of integers representing 4-digit year for survey" - self.years = years - - # testing - def get_test(self): - for year in self.years: - print(year) - - - # method to get the data and return a dataframe - def get(self): - # setup the df - init_df = pd.DataFrame({'pypeds_init': [True]}) - for year in self.years: - # assert that year is a int and length 1 - assert isinstance(year, int), "year is not an integer" - assert year >= 2002 and year <= 2017, "year must be >=2002 and < 2017" - # build the SURVEY id - SURVEY = 'IC' + str(year) - # build the url - URL = "https://nces.ed.gov/ipeds/datacenter/data/{}.zip".format(SURVEY) - # return the bits as a dictionary for use later - year_info = {'url': URL, 'survey': SURVEY} - #year_info = get_efc(year) - year_fpath = zip_parser(url=year_info['url'], survey=year_info['survey']) - tmp_df = read_survey(year_fpath) - tmp_df.columns = tmp_df.columns.str.lower() - tmp_df['survey_year'] = int(year) - tmp_df['fall_year'] = int(year) - init_df = init_df.append(tmp_df, ignore_index=True, sort=False) - # print("finished hd for year {}".format(str(year))) - # finish up - # ignore pandas SettingWithCopyWarning, basically - pd.options.mode.chained_assignment = None - init_df = init_df.loc[init_df.pypeds_init != True, ] - init_df.drop(columns=['pypeds_init'], inplace=True) - return(init_df) - - -class HD(object): - """docstring""" - - # init - def __init__(self, years=[2017]): - """Constructor""" - assert isinstance(years, list), "year is not a list of integers representing 4-digit year for survey" - self.years = years - - # testing - def get_test(self): - for year in self.years: - print(year) - - - # method to get the data and return a dataframe - def get(self): - # setup the df - init_df = pd.DataFrame({'pypeds_init': [True]}) - for year in self.years: - # assert that year is a int and length 1 - assert isinstance(year, int), "year is not an integer" - assert year >= 2002 and year <= 2017, "year must be >=2002 and < 2017" - # build the SURVEY id - SURVEY = 'HD' + str(year) - # build the url - URL = "https://nces.ed.gov/ipeds/datacenter/data/{}.zip".format(SURVEY) - # return the bits as a dictionary for use later - year_info = {'url': URL, 'survey': SURVEY} - #year_info = get_efc(year) - year_fpath = zip_parser(url=year_info['url'], survey=year_info['survey']) - tmp_df = read_survey(year_fpath) - tmp_df.columns = tmp_df.columns.str.lower() - tmp_df['survey_year'] = int(year) - tmp_df['fall_year'] = int(year) - init_df = init_df.append(tmp_df, ignore_index=True, sort=False) - # print("finished hd for year {}".format(str(year))) - # finish up - # ignore pandas SettingWithCopyWarning, basically - pd.options.mode.chained_assignment = None - init_df = init_df.loc[init_df.pypeds_init != True, ] - init_df.drop(columns=['pypeds_init'], inplace=True) - return(init_df) +###### utilities to build url data + +# build a valid ipeds survey url - return a dict with a survey key and url for download +def get_hd(year): + # assert that year is a int and length 1 + assert isinstance(year, int), "year is not an integer" + assert year >= 2002 and year <= 2017, "year must be >=2002 and < 2017" + # build the SURVEY id + SURVEY = 'HD' + str(year) + # build the url + URL = "https://nces.ed.gov/ipeds/datacenter/data/{}.zip".format(SURVEY) + # return the bits as a dictionary for use later + return({'url': URL, 'survey': SURVEY}) + +def get_ic(year): + # assert that year is a int and length 1 + assert isinstance(year, int), "year is not an integer" + assert year >= 2002 and year <= 2017, "year must be >=2002 and < 2017" + # build the SURVEY id + SURVEY = 'IC' + str(year) + # build the url + URL = "https://nces.ed.gov/ipeds/datacenter/data/{}.zip".format(SURVEY) + # return the bits as a dictionary for use later + return({'url': URL, 'survey': SURVEY}) + +def get_adm(year): + # assert that year is a int and length 1 + assert isinstance(year, int), "year is not an integer" + assert year >= 2014 and year <= 2017, "year must be >=2002 and < 2017" + # build the SURVEY id + SURVEY = 'ADM' + str(year) + # build the url + URL = "https://nces.ed.gov/ipeds/datacenter/data/{}.zip".format(SURVEY) + # return the bits as a dictionary for use later + return({'url': URL, 'survey': SURVEY}) + + +def get_sfa(year): + # assert that year is a int and length 1 + assert isinstance(year, int), "year is not an integer" + assert year >= 2002 and year <= 2017, "year must be >=2002 and < 2017" + # build the SURVEY id + sfa_year = str(year - 1)[2:] + str(year)[2:] + SURVEY = 'SFA' + str(sfa_year) + # build the url + URL = "https://nces.ed.gov/ipeds/datacenter/data/{}.zip".format(SURVEY) + # return the bits as a dictionary for use later + return({'url': URL, 'survey': SURVEY}) + +def get_efc(year): + # assert that year is a int and length 1 + assert isinstance(year, int), "year is not an integer" + assert year >= 2002 and year <= 2017, "year must be >=2002 and < 2017" + # build the SURVEY id + SURVEY = 'EF' + str(year) + "C" + # build the url + URL = "https://nces.ed.gov/ipeds/datacenter/data/{}.zip".format(SURVEY) + # return the bits as a dictionary for use later + return({'url': URL, 'survey': SURVEY}) + +def get_icay(year): + # assert that year is a int and length 1 + assert isinstance(year, int), "year is not an integer" + assert year >= 2002 and year <= 2017, "year must be >=2002 and < 2017" + # build the SURVEY id + SURVEY = 'IC' + str(year) + "_AY" + # build the url + URL = "https://nces.ed.gov/ipeds/datacenter/data/{}.zip".format(SURVEY) + # return the bits as a dictionary for use later + return({'url': URL, 'survey': SURVEY}) + + +###### utilities to crawl and return a big dataset for the survey + +def hd(years = None): + # returns a dataframe of 1 or more survey collections + # will always use the revised file _rv, if the file has it + assert isinstance(years, list), "year is not a list of integers" + # init a dataframe to append things to + hd_df = pd.DataFrame({'pypeds_init': [True]}) + for year in years: + year_info = get_hd(year) + year_fpath = zip_parser(url=year_info['url'], survey=year_info['survey']) + tmp_df = read_survey(year_fpath) + tmp_df.columns = tmp_df.columns.str.lower() + tmp_df['survey_year'] = int(year) + tmp_df['fall_year'] = int(year) + hd_df = hd_df.append(tmp_df, ignore_index=True, sort=False) + # print("finished hd for year {}".format(str(year))) + # finish up + # ignore pandas SettingWithCopyWarning, basically + pd.options.mode.chained_assignment = None + hd_df_final = hd_df.loc[hd_df.pypeds_init != True, ] + hd_df_final.drop(columns=['pypeds_init'], inplace=True) + return(hd_df_final) + +def ic(years = None): + # returns a dataframe of 1 or more survey collections + # will always use the revised file _rv, if the file has it + assert isinstance(years, list), "year is not a list of integers" + # init dataframes to append things to + ic_df = pd.DataFrame({'pypeds_init': [True]}) + adm_df = pd.DataFrame({'pypeds_init': [True]}) + # loop for ic and conditional check for adm + for year in years: + year_info = get_ic(year) + year_fpath = zip_parser(url=year_info['url'], survey=year_info['survey']) + tmp_df = read_survey(year_fpath) + tmp_df.columns = tmp_df.columns.str.lower() + tmp_df['survey_year'] = int(year) + tmp_df['fall_year'] = int(year) + ic_df = ic_df.append(tmp_df, ignore_index=True, sort=False) + # check the year to get the admission data for 2014 and later + if year >= 2014: + year_info = get_adm(year) + year_fpath = zip_parser(url=year_info['url'], survey=year_info['survey']) + tmp_df = read_survey(year_fpath) + tmp_df.columns = tmp_df.columns.str.lower() + tmp_df['survey_year'] = int(year) + tmp_df['fall_year'] = int(year) + adm_df = adm_df.append(tmp_df, ignore_index=True, sort=False) + + # finish up + # ignore pandas SettingWithCopyWarning, + pd.options.mode.chained_assignment = None + ic_df_final = ic_df.loc[ic_df.pypeds_init != True, ] + ic_df_final.drop(columns=['pypeds_init'], inplace=True) + adm_df_final = adm_df.loc[adm_df.pypeds_init != True, ] + adm_df_final.drop(columns=['pypeds_init'], inplace=True) + df = pd.merge(ic_df_final, adm_df_final, how="left", on=['unitid', 'survey_year'], suffixes=('_ic', '_adm')) + return(df) + +def sfa(years = None): + # returns a dataframe of 1 or more survey collections + # will always use the revised file _rv, if the file has it + assert isinstance(years, list), "year is not a list of integers" + # init a dataframe to append things to + sfa_df = pd.DataFrame({'pypeds_init': [True]}) + for year in years: + year_info = get_sfa(year) + year_fpath = zip_parser(url=year_info['url'], survey=year_info['survey']) + tmp_df = read_survey(year_fpath) + tmp_df.columns = tmp_df.columns.str.lower() + tmp_df['survey_year'] = int(year) + tmp_df['fall_year'] = int(year-1) + sfa_df = sfa_df.append(tmp_df, ignore_index=True, sort=False) + # print("finished hd for year {}".format(str(year))) + # finish up + # ignore pandas SettingWithCopyWarning, basically + pd.options.mode.chained_assignment = None + sfa_df_final = sfa_df.loc[sfa_df.pypeds_init != True, ] + sfa_df_final.drop(columns=['pypeds_init'], inplace=True) + return(sfa_df_final) + +def efc(years = None): + # returns a dataframe of 1 or more survey collections + # will always use the revised file _rv, if the file has it + assert isinstance(years, list), "year is not a list of integers" + # init a dataframe to append things to + efc_df = pd.DataFrame({'pypeds_init': [True]}) + for year in years: + year_info = get_efc(year) + year_fpath = zip_parser(url=year_info['url'], survey=year_info['survey']) + tmp_df = read_survey(year_fpath) + tmp_df.columns = tmp_df.columns.str.lower() + tmp_df['survey_year'] = int(year) + tmp_df['fall_year'] = int(year) + efc_df = efc_df.append(tmp_df, ignore_index=True, sort=False) + # print("finished hd for year {}".format(str(year))) + # finish up + # ignore pandas SettingWithCopyWarning, basically + pd.options.mode.chained_assignment = None + efc_df_final = efc_df.loc[efc_df.pypeds_init != True, ] + efc_df_final.drop(columns=['pypeds_init'], inplace=True) + return(efc_df_final) +def icay(years = None): + # returns a dataframe of 1 or more survey collections + # will always use the revised file _rv, if the file has it + assert isinstance(years, list), "year is not a list of integers" + # init a dataframe to append things to + icay_df = pd.DataFrame({'pypeds_init': [True]}) + for year in years: + year_info = get_icay(year) + year_fpath = zip_parser(url=year_info['url'], survey=year_info['survey']) + tmp_df = read_survey(year_fpath) + tmp_df.columns = tmp_df.columns.str.lower() + tmp_df['survey_year'] = int(year) + tmp_df['fall_year'] = int(year) + icay_df = icay_df.append(tmp_df, ignore_index=True, sort=False) + # print("finished hd for year {}".format(str(year))) + # finish up + # ignore pandas SettingWithCopyWarning, basically + pd.options.mode.chained_assignment = None + icay_df_final = icay_df.loc[icay_df.pypeds_init != True, ] + icay_df_final.drop(columns=['pypeds_init'], inplace=True) + return(icay_df_final) diff --git a/test-notebooks/Test Classes.ipynb b/test-notebooks/Test Classes.ipynb index 072e69a..2a5cfb3 100644 --- a/test-notebooks/Test Classes.ipynb +++ b/test-notebooks/Test Classes.ipynb @@ -4,18 +4,271 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, + "outputs": [], + "source": [ + "from pypeds import ipeds" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "ic = ipeds.IC()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, "outputs": [ { - "ename": "IndentationError", - "evalue": "unindent does not match any outer indentation level (ipeds.py, line 153)", - "output_type": "error", - "traceback": [ - "\u001b[0;36m File \u001b[0;32m\"/Users/btibert/github/pypeds/pypeds/ipeds.py\"\u001b[0;36m, line \u001b[0;32m153\u001b[0m\n\u001b[0;31m ic_df = pd.DataFrame({'pypeds_init': [True]})\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mIndentationError\u001b[0m\u001b[0;31m:\u001b[0m unindent does not match any outer indentation level\n" + "name": "stdout", + "output_type": "stream", + "text": [ + "2017\n" ] } ], "source": [ - "%run ../pypeds/ipeds.py" + "ic.get_test()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "df = ic.get()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unitidpeo1istrpeo2istrpeo3istrpeo4istrpeo5istrpeo6istrcntlaffipubprimepubsecon...sport1confno1sport2confno2sport3confno3sport4confno4survey_yearfall_year
1100654.00.01.00.00.00.00.01.02.00.0...1.0133.01.0133.01.0133.01.0200.02017.02017.0
2100663.00.01.01.00.00.00.01.02.00.0...1.0111.01.0111.01.0111.01.0111.02017.02017.0
3100690.00.01.00.00.00.00.04.0-2.0-2.0...2.0-2.02.0-2.02.0-2.02.0-2.02017.02017.0
4100706.00.01.01.01.00.00.01.02.00.0...2.0-2.01.0146.01.0146.01.0146.02017.02017.0
5100724.01.01.00.00.00.00.01.02.00.0...1.0133.01.0133.01.0133.01.0133.02017.02017.0
\n", + "

5 rows × 123 columns

\n", + "
" + ], + "text/plain": [ + " unitid peo1istr peo2istr peo3istr peo4istr peo5istr peo6istr \\\n", + "1 100654.0 0.0 1.0 0.0 0.0 0.0 0.0 \n", + "2 100663.0 0.0 1.0 1.0 0.0 0.0 0.0 \n", + "3 100690.0 0.0 1.0 0.0 0.0 0.0 0.0 \n", + "4 100706.0 0.0 1.0 1.0 1.0 0.0 0.0 \n", + "5 100724.0 1.0 1.0 0.0 0.0 0.0 0.0 \n", + "\n", + " cntlaffi pubprime pubsecon ... sport1 confno1 sport2 confno2 \\\n", + "1 1.0 2.0 0.0 ... 1.0 133.0 1.0 133.0 \n", + "2 1.0 2.0 0.0 ... 1.0 111.0 1.0 111.0 \n", + "3 4.0 -2.0 -2.0 ... 2.0 -2.0 2.0 -2.0 \n", + "4 1.0 2.0 0.0 ... 2.0 -2.0 1.0 146.0 \n", + "5 1.0 2.0 0.0 ... 1.0 133.0 1.0 133.0 \n", + "\n", + " sport3 confno3 sport4 confno4 survey_year fall_year \n", + "1 1.0 133.0 1.0 200.0 2017.0 2017.0 \n", + "2 1.0 111.0 1.0 111.0 2017.0 2017.0 \n", + "3 2.0 -2.0 2.0 -2.0 2017.0 2017.0 \n", + "4 1.0 146.0 1.0 146.0 2017.0 2017.0 \n", + "5 1.0 133.0 1.0 133.0 2017.0 2017.0 \n", + "\n", + "[5 rows x 123 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "ic = ipeds.IC(years=[2016,2017])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "survey = ic.get()" ] }, { @@ -23,7 +276,9 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "survey.gr" + ] } ], "metadata": { diff --git a/test-notebooks/test-cases.ipynb b/test-notebooks/test-cases.ipynb index 3034949..a3d0909 100644 --- a/test-notebooks/test-cases.ipynb +++ b/test-notebooks/test-cases.ipynb @@ -1,5 +1,181 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "----" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def zip_parser(url=None, survey=None):\n", + " # setup the tmp path and file name\n", + " # thanks to https://stackoverflow.com/questions/55718917/download-zip-file-locally-to-tempfile-extract-files-to-tempfile-and-list-the-f/55719124#55719124\n", + " path = \"/tmp/\"\n", + " file = survey + \".zip\"\n", + " survey_lower = survey.lower()\n", + " # get the data\n", + " try:\n", + " results = requests.get(url)\n", + " except:\n", + " pass\n", + " with open(path + file, 'wb') as f:\n", + " f.write(results.content)\n", + " # extract the files to the path\n", + " file = zipfile.ZipFile(path + file)\n", + " file.extractall(path=path)\n", + " # list the csv files for the surveys, most likely get one , but may get to with _rv for revised\n", + " files = glob.glob(path + \"*\"+survey_lower+\"*\")\n", + " # isolate the file name\n", + " if len(files) > 1:\n", + " raw_file = [s for s in files if 'rv' in s]\n", + " raw_file = str(raw_file[0]) # just in case, take first\n", + " else:\n", + " raw_file = str(files[0])\n", + " # return a string\n", + " return(str(raw_file))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "def read_survey(path):\n", + " if isinstance(path, list):\n", + " path = path[0]\n", + " # assumes a path, presumably from zip_parser\n", + " try:\n", + " ## encoding option needed for h2017, at least, wasnt needed for IC2013\n", + " survey_file = pd.read_csv(path, encoding='ISO-8859-1')\n", + " except:\n", + " survey_file = pd.DataFrame({'path':path})\n", + " # column names to lower - helps later and assumes a survey varname is historically unique\n", + " survey_file.columns = survey_file.columns.str.lower()\n", + " # add the survey\n", + " return(survey_file)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "class IC(object):\n", + " \"\"\"docstring\"\"\"\n", + " \n", + " # init\n", + " def __init__(self, years=[2017]):\n", + " \"\"\"Constructor\"\"\"\n", + " assert isinstance(years, list), \"year is not a list of integers representing 4-digit year for survey\"\n", + " self.years = years\n", + " \n", + " # testing\n", + " def get_test(self):\n", + " for year in self.years:\n", + " print(year)\n", + "\n", + "\n", + " # method to get the data and return a dataframe\n", + " def get(self):\n", + " # setup the df\n", + " init_df = pd.DataFrame({'pypeds_init': [True]})\n", + " for year in self.years:\n", + " # assert that year is a int and length 1\n", + " assert isinstance(year, int), \"year is not an integer\"\n", + " assert year >= 2002 and year <= 2017, \"year must be >=2002 and < 2017\"\n", + " # build the SURVEY id\n", + " SURVEY = 'IC' + str(year)\n", + " # build the url\n", + " URL = \"https://nces.ed.gov/ipeds/datacenter/data/{}.zip\".format(SURVEY)\n", + " # return the bits as a dictionary for use later\n", + " year_info = {'url': URL, 'survey': SURVEY}\n", + " #year_info = get_efc(year)\n", + " year_fpath = zip_parser(url=year_info['url'], survey=year_info['survey'])\n", + " tmp_df = read_survey(year_fpath)\n", + " tmp_df.columns = tmp_df.columns.str.lower()\n", + " tmp_df['survey_year'] = int(year)\n", + " tmp_df['fall_year'] = int(year)\n", + " init_df = init_df.append(tmp_df, ignore_index=True, sort=False)\n", + " # print(\"finished hd for year {}\".format(str(year)))\n", + " # finish up\n", + " # ignore pandas SettingWithCopyWarning, basically\n", + " pd.options.mode.chained_assignment = None\n", + " init_df = init_df.loc[init_df.pypeds_init != True, ]\n", + " init_df.drop(columns=['pypeds_init'], inplace=True)\n", + " self.df = init_df\n", + " return self.df\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "ic = IC()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "ename": "UnboundLocalError", + "evalue": "local variable 'results' referenced before assignment", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mUnboundLocalError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mic\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m\u001b[0m in \u001b[0;36mget\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[0myear_info\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m'url'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mURL\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'survey'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mSURVEY\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0;31m#year_info = get_efc(year)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 31\u001b[0;31m \u001b[0myear_fpath\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mzip_parser\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0myear_info\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'url'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msurvey\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0myear_info\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'survey'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 32\u001b[0m \u001b[0mtmp_df\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mread_survey\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0myear_fpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[0mtmp_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtmp_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36mzip_parser\u001b[0;34m(url, survey)\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;32mpass\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mfile\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'wb'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 13\u001b[0;31m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresults\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcontent\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 14\u001b[0m \u001b[0;31m# extract the files to the path\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0mfile\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mzipfile\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mZipFile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mfile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mUnboundLocalError\u001b[0m: local variable 'results' referenced before assignment" + ] + } + ], + "source": [ + "ic.get()" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/test-notebooks/test-package.ipynb b/test-notebooks/test-package.ipynb new file mode 100644 index 0000000..29e5bab --- /dev/null +++ b/test-notebooks/test-package.ipynb @@ -0,0 +1,104 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from pypeds import ipeds\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "YEARS = list(range(2014, 2018, 1))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "hd_df = ipeds.hd(years=YEARS)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(30008, 79)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hd_df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2014.0 7687\n", + "2015.0 7647\n", + "2016.0 7521\n", + "2017.0 7153\n", + "Name: survey_year, dtype: int64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hd_df.survey_year.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}