Better handling of the temp dir; now need to reapply the merge of adm

Btibert3 · May 14, 2019 · 976a3ff · 976a3ff
1 parent c75054e
commit 976a3ff
Show file tree

Hide file tree

Showing 4 changed files with 561 additions and 12 deletions.
diff --git a/pypeds/ipeds.py b/pypeds/ipeds.py
@@ -4,16 +4,18 @@
 import requests
 import zipfile
 import glob
+import time
 
 
 # zip file factory - returns a pandas dataframe
 def zip_parser(url=None, survey=None):
     # setup the tmp path and file name
     # thanks to https://stackoverflow.com/questions/55718917/download-zip-file-locally-to-tempfile-extract-files-to-tempfile-and-list-the-f/55719124#55719124
-    path = "/tmp/"
+    path = "/tmp/" + str(int(time.time())) + "/"  # hacky way to make unique path to extract time
     file = survey + ".zip"
     survey_lower = survey.lower()
     # get the data
+    os.mkdir(path)
     try:
       results = requests.get(url)
     except:
@@ -31,7 +33,7 @@ def zip_parser(url=None, survey=None):
         raw_file = str(raw_file[0]) # just in case, take first
     else:
         raw_file = str(files[0])
-    # return a string
+    # return a string 
     return(str(raw_file))
 
 def read_survey(path):
@@ -43,6 +45,8 @@ def read_survey(path):
         survey_file = pd.read_csv(path, encoding='ISO-8859-1')
     except:
         survey_file = pd.DataFrame({'path':path})
+    # remove the file
+    os.remove(path)
     # column names to lower - helps later and assumes a survey varname is historically unique
     survey_file.columns = survey_file.columns.str.lower()
     # add the survey
@@ -63,6 +67,7 @@ def get_hd(year):
     # return the bits as a dictionary for use later
     return({'url': URL, 'survey': SURVEY})
 
+
 def get_ic(year):
     # assert that year is a int and length 1
     assert isinstance(year, int), "year is not an integer"
@@ -74,10 +79,11 @@ def get_ic(year):
     # return the bits as a dictionary for use later
     return({'url': URL, 'survey': SURVEY})
 
+
 def get_adm(year):
     # assert that year is a int and length 1
     assert isinstance(year, int), "year is not an integer"
-    assert year >= 2014 and year <= 2017, "year must be >=2002 and < 2017"
+    assert year >= 2014 and year <= 2017, "year must be >=2014 and < 2017"
     # build the SURVEY id
     SURVEY = 'ADM' + str(year)
     # build the url
@@ -98,6 +104,7 @@ def get_sfa(year):
     # return the bits as a dictionary for use later
     return({'url': URL, 'survey': SURVEY})
 
+
 def get_efc(year):
     # assert that year is a int and length 1
     assert isinstance(year, int), "year is not an integer"
@@ -109,6 +116,7 @@ def get_efc(year):
     # return the bits as a dictionary for use later
     return({'url': URL, 'survey': SURVEY})
 
+
 def get_icay(year):
     # assert that year is a int and length 1
     assert isinstance(year, int), "year is not an integer"
@@ -267,22 +275,24 @@ def extract(self):
         #adm_df = pd.DataFrame({'pypeds_init': [True]})
         # loop for ic and conditional check for adm
         for year in self.years:
+          if year < 2014:
+            # process the old files
             year_info = get_ic(year)
             year_fpath = zip_parser(url=year_info['url'], survey=year_info['survey'])
             tmp_df = read_survey(year_fpath)
             tmp_df.columns = tmp_df.columns.str.lower()
             tmp_df['survey_year'] = int(year)
             tmp_df['fall_year'] = int(year)
             ic_df = ic_df.append(tmp_df, ignore_index=True, sort=False)
-            # check the year to get the admission data for 2014 and later
-            if year >= 2014:
-              year_info = get_adm(year)
-              year_fpath = zip_parser(url=year_info['url'], survey=year_info['survey'])
-              tmp_df = read_survey(year_fpath)
-              tmp_df.columns = tmp_df.columns.str.lower()
-              tmp_df['survey_year'] = int(year)
-              tmp_df['fall_year'] = int(year)
-              ic_df = ic_df.append(tmp_df, ignore_index=True, sort=False)
+          # check the year to get the admission data for 2014 and later
+          if year >= 2014:
+            year_info = get_adm(year)
+            year_fpath = zip_parser(url=year_info['url'], survey=year_info['survey'])
+            tmp_df = read_survey(year_fpath)
+            tmp_df.columns = tmp_df.columns.str.lower()
+            tmp_df['survey_year'] = int(year)
+            tmp_df['fall_year'] = int(year)
+            ic_df = ic_df.append(tmp_df, ignore_index=True, sort=False)
 
         # finish up
         # ignore pandas SettingWithCopyWarning,

diff --git a/test-notebooks/Untitled.ipynb b/test-notebooks/Untitled.ipynb
@@ -0,0 +1,284 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pypeds import ipeds"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "YEARS = [2017]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ic = ipeds.IC(years=YEARS)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ic.years"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ic.extract()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = ic.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.survey_year.value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hd = ipeds.HD(years=YEARS)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hd.extract()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = hd.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.survey_year.value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sfa = ipeds.SFA(years=YEARS)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sfa.extract()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = sfa.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.survey_year.value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "efc = ipeds.EFC(years=YEARS)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "efc.extract()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = efc.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.survey_year.value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "icay = ipeds.ICAY(years=YEARS)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "icay.extract()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = icay.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.survey_year.value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}