Skip to content

Commit

Permalink
Better handling of temp dir; now need to reapply the merge of adm
Browse files Browse the repository at this point in the history
  • Loading branch information
Btibert3 committed May 14, 2019
1 parent c75054e commit 976a3ff
Show file tree
Hide file tree
Showing 4 changed files with 561 additions and 12 deletions.
34 changes: 22 additions & 12 deletions pypeds/ipeds.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,18 @@
import requests
import zipfile
import glob
import time


# zip file factory - returns a pandas dataframe
def zip_parser(url=None, survey=None):
# setup the tmp path and file name
# thanks to https://stackoverflow.com/questions/55718917/download-zip-file-locally-to-tempfile-extract-files-to-tempfile-and-list-the-f/55719124#55719124
path = "/tmp/"
path = "/tmp/" + str(int(time.time())) + "/" # hacky way to make a unique extraction path from the current time
file = survey + ".zip"
survey_lower = survey.lower()
# get the data
os.mkdir(path)
try:
results = requests.get(url)
except:
Expand All @@ -31,7 +33,7 @@ def zip_parser(url=None, survey=None):
raw_file = str(raw_file[0]) # just in case, take first
else:
raw_file = str(files[0])
# return a string
# return a string
return(str(raw_file))

def read_survey(path):
Expand All @@ -43,6 +45,8 @@ def read_survey(path):
survey_file = pd.read_csv(path, encoding='ISO-8859-1')
except:
survey_file = pd.DataFrame({'path':path})
# remove the file
os.remove(path)
# column names to lower - helps later and assumes a survey varname is historically unique
survey_file.columns = survey_file.columns.str.lower()
# add the survey
Expand All @@ -63,6 +67,7 @@ def get_hd(year):
# return the bits as a dictionary for use later
return({'url': URL, 'survey': SURVEY})


def get_ic(year):
# assert that year is an int and length 1
assert isinstance(year, int), "year is not an integer"
Expand All @@ -74,10 +79,11 @@ def get_ic(year):
# return the bits as a dictionary for use later
return({'url': URL, 'survey': SURVEY})


def get_adm(year):
# assert that year is an int and length 1
assert isinstance(year, int), "year is not an integer"
assert year >= 2014 and year <= 2017, "year must be >=2002 and < 2017"
assert year >= 2014 and year <= 2017, "year must be >=2014 and < 2017"
# build the SURVEY id
SURVEY = 'ADM' + str(year)
# build the url
Expand All @@ -98,6 +104,7 @@ def get_sfa(year):
# return the bits as a dictionary for use later
return({'url': URL, 'survey': SURVEY})


def get_efc(year):
# assert that year is an int and length 1
assert isinstance(year, int), "year is not an integer"
Expand All @@ -109,6 +116,7 @@ def get_efc(year):
# return the bits as a dictionary for use later
return({'url': URL, 'survey': SURVEY})


def get_icay(year):
# assert that year is an int and length 1
assert isinstance(year, int), "year is not an integer"
Expand Down Expand Up @@ -267,22 +275,24 @@ def extract(self):
#adm_df = pd.DataFrame({'pypeds_init': [True]})
# loop for ic and conditional check for adm
for year in self.years:
if year < 2014:
# process the old files
year_info = get_ic(year)
year_fpath = zip_parser(url=year_info['url'], survey=year_info['survey'])
tmp_df = read_survey(year_fpath)
tmp_df.columns = tmp_df.columns.str.lower()
tmp_df['survey_year'] = int(year)
tmp_df['fall_year'] = int(year)
ic_df = ic_df.append(tmp_df, ignore_index=True, sort=False)
# check the year to get the admission data for 2014 and later
if year >= 2014:
year_info = get_adm(year)
year_fpath = zip_parser(url=year_info['url'], survey=year_info['survey'])
tmp_df = read_survey(year_fpath)
tmp_df.columns = tmp_df.columns.str.lower()
tmp_df['survey_year'] = int(year)
tmp_df['fall_year'] = int(year)
ic_df = ic_df.append(tmp_df, ignore_index=True, sort=False)
# check the year to get the admission data for 2014 and later
if year >= 2014:
year_info = get_adm(year)
year_fpath = zip_parser(url=year_info['url'], survey=year_info['survey'])
tmp_df = read_survey(year_fpath)
tmp_df.columns = tmp_df.columns.str.lower()
tmp_df['survey_year'] = int(year)
tmp_df['fall_year'] = int(year)
ic_df = ic_df.append(tmp_df, ignore_index=True, sort=False)

# finish up
# ignore pandas SettingWithCopyWarning,
Expand Down
284 changes: 284 additions & 0 deletions test-notebooks/Untitled.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,284 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pypeds import ipeds"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"YEARS = [2017]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ic = ipeds.IC(years=YEARS)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ic.years"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ic.extract()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = ic.load()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.survey_year.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"hd = ipeds.HD(years=YEARS)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"hd.extract()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = hd.load()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.survey_year.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sfa = ipeds.SFA(years=YEARS)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sfa.extract()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = sfa.load()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.survey_year.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"efc = ipeds.EFC(years=YEARS)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"efc.extract()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = efc.load()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.survey_year.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"icay = ipeds.ICAY(years=YEARS)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"icay.extract()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = icay.load()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.survey_year.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit 976a3ff

Please sign in to comment.