Add term table, accommodate multiple terms, fix course.sis_id bug (#113,

#117) (#115) * Add term WIP * Modify Canvas course fetch func to use multiple terms * Improve formatting * Update configuration variable * Fix end_at parsing * Update canvas_zoom_meetings to handle multiple terms * Add fix for issue 117: migration changing data type, string type coercion * Fix ADD_COURSE_IDS functionality * Fix comment * Remove unneeded logging * Reformat params; change var name * Make term a dropped table * Remove migrate from comment * Make course.term_id NOT NULL * Tweak term func name * Reorder local imports * Fix mutable default parameter anti-pattern * Add log message to loop
tl-its-umich-edu · Apr 28, 2020 · a818e64 · a818e64
1 parent 6fe2b94
commit a818e64
Show file tree

Hide file tree

Showing 6 changed files with 163 additions and 49 deletions.
diff --git a/README.md b/README.md
@@ -72,8 +72,8 @@ This includes the creation of a configuration file called `env.json`. Complete t
     `LOG_LEVEL` | The minimum level for log messages that will appear in output. `INFO` or `DEBUG` is recommended for most use cases; see [Python's logging module](https://docs.python.org/3/library/logging.html).
     `JOB_NAMES` | The names of one or more jobs (not case sensitive) that have been implemented and defined in `run_jobs.py` (see the **Implementing a New Job** section below).
     `CANVAS_ACCOUNT_ID` | The Canvas instance root account ID number associated with the courses for which data will be collected.
-    `CANVAS_TERM_ID` | The Canvas instance term ID number that will be used to limit the query for Canvas courses. Set to 0 to use `ADD_COURSE_IDS`.
-    `ADD_COURSE_IDS` | Additional Canvas course IDs to retrieve. Duplicates found in `CANVAS_TERM_ID` (if defined) will be removed.
+    `CANVAS_TERM_IDS` | The Canvas instance term ID numbers that will be used to limit queries for Canvas courses. Set to `[]` (empty array) to only use `ADD_COURSE_IDS` (see below).
+    `ADD_COURSE_IDS` | Additional Canvas course IDs to retrieve when using `online_meetings/canvas_zoom_meetings.py`. Duplicates found in `CANVAS_TERM_IDS` (if defined) will be removed.
     `API_BASE_URL` | The base URL for making requests using the U-M API Directory; the default value should be correct.
     `API_SCOPE_PREFIX` | The scope prefix that will be added after the `API_BASE_URL`; this is usually an acronym for the university location and the API Directory subscription name in CamelCase, separated by `/`.
     `API_SUBSCRIPTION_NAME` | The name of the API Directory subscription all in lowercase.

diff --git a/config/env_blank.json b/config/env_blank.json
@@ -2,7 +2,7 @@
   "LOG_LEVEL": "DEBUG",
   "JOB_NAMES": ["COURSE_INVENTORY"],
   "CANVAS_ACCOUNT_ID": 1,
-  "CANVAS_TERM_ID": 164,
+  "CANVAS_TERM_IDS": [164],
   "ADD_COURSE_IDS": [],
   "API_BASE_URL": "https://apigw.it.umich.edu/um",
   "API_SCOPE_PREFIX": "",

diff --git a/course_inventory/inventory.py b/course_inventory/inventory.py
@@ -11,21 +11,21 @@
 from umich_api.api_utils import ApiUtil
 
 # local libraries
+from course_inventory.async_enroll_gatherer import AsyncEnrollGatherer
+from course_inventory.canvas_course_usage import CanvasCourseUsage
+from course_inventory.gql_queries import queries as QUERIES
+from course_inventory.published_date import FetchPublishedDate
 from db.db_creator import DBCreator
 from environ import ENV
 from vocab import ValidDataSourceName
-from .async_enroll_gatherer import AsyncEnrollGatherer
-from .canvas_course_usage import CanvasCourseUsage
-from .gql_queries import queries as QUERIES
-from .published_date import FetchPublishedDate
 
 
 # Initialize settings and globals
 
 logger = logging.getLogger(__name__)
 
 ACCOUNT_ID = ENV.get('CANVAS_ACCOUNT_ID', 1)
-TERM_ID = ENV['CANVAS_TERM_ID']
+TERM_IDS = ENV['CANVAS_TERM_IDS']
 
 API_UTIL = ApiUtil(ENV['API_BASE_URL'], ENV['API_CLIENT_ID'], ENV['API_CLIENT_SECRET'])
 SUBSCRIPTION_NAME = ENV['API_SUBSCRIPTION_NAME']
@@ -39,6 +39,8 @@
 INVENTORY_DB = ENV['INVENTORY_DB']
 APPEND_TABLE_NAMES = ENV.get('APPEND_TABLE_NAMES', ['job_run', 'data_source_status'])
 
+CANVAS_DATETIME_FORMAT = '%Y-%m-%dT%H:%M:%SZ'
+
 
 # Function(s) - Canvas
 
@@ -65,14 +67,49 @@ def make_request_using_api_utils(url: str, params: Dict[str, Any] = {}) -> Respo
     return response
 
 
+def gather_term_data_from_api(account_id: int, term_ids: Sequence[int]) -> pd.DataFrame:
+    """
+    Fetch one record per configured Canvas term and return them as a DataFrame.
+
+    :param account_id: Canvas account ID used to build the terms endpoint URL
+    :param term_ids: Canvas term IDs to request; one API call is made per ID
+    :return: DataFrame with the columns canvas_id, name, sis_id, start_at and
+        end_at, one row per term (no rows when term_ids is empty)
+    :raises TypeError, ValueError: from int() when a term's sis_term_id is
+        null or not numeric
+    """
+    logger.info('** gather_term_data_from_api')
+
+    # Fetch data for terms from config
+    logger.info(f'Canvas terms specified in config: {term_ids}')
+    url_ending_with_scope = f'{API_SCOPE_PREFIX}/accounts/{account_id}/terms/'
+
+    term_columns = ['canvas_id', 'name', 'sis_id', 'start_at', 'end_at']
+    term_dicts = []
+    for term_id in term_ids:
+        logger.info(f'Pulling data for term number {term_id}')
+        term_url_ending = url_ending_with_scope + str(term_id)
+        response = make_request_using_api_utils(term_url_ending)
+
+        term_data = json.loads(response.text)
+        slim_term_dict = {
+            'canvas_id': term_data['id'],
+            'name': term_data['name'],
+            # NOTE(review): int() fails on a null or non-numeric sis_term_id;
+            # confirm every configured term has a numeric SIS ID
+            'sis_id': int(term_data['sis_term_id']),
+            'start_at': pd.to_datetime(
+                term_data['start_at'],
+                format=CANVAS_DATETIME_FORMAT
+            ),
+            'end_at': pd.to_datetime(
+                term_data['end_at'],
+                format=CANVAS_DATETIME_FORMAT
+            )
+        }
+        term_dicts.append(slim_term_dict)
+
+    # Naming the columns keeps the frame's schema stable even when no terms
+    # were requested (an empty list would otherwise produce no columns)
+    term_df = pd.DataFrame(term_dicts, columns=term_columns)
+    logger.debug(term_df.head())
+    return term_df
+
+
 def slim_down_course_data(course_data: List[Dict]) -> List[Dict]:
     slim_course_dicts = []
     for course_dict in course_data:
         slim_course_dict = {
             'canvas_id': course_dict['id'],
-            'sis_id': course_dict['sis_course_id'],
+            'sis_id': str(course_dict['sis_course_id']),
             'name': course_dict['name'],
             'account_id': course_dict['account_id'],
+            'term_id': course_dict['enrollment_term_id'],
             'created_at': course_dict['created_at'],
             'workflow_state': course_dict['workflow_state']
         }
@@ -85,39 +122,44 @@ def slim_down_course_data(course_data: List[Dict]) -> List[Dict]:
     return slim_course_dicts
 
 
-def gather_course_data_from_api(account_id: int, term_id: int) -> pd.DataFrame:
+def gather_course_data_from_api(account_id: int, term_ids: Sequence[int]) -> pd.DataFrame:
     logger.info('** gather_course_data_from_api')
     url_ending_with_scope = f'{API_SCOPE_PREFIX}/accounts/{account_id}/courses'
-    params = {
-        'with_enrollments': True,
-        'enrollment_type': ['student', 'teacher'],
-        'enrollment_term_id': term_id,
-        'per_page': 100,
-        'include': ['total_students']
-    }
 
-    # Make first course request
-    page_num = 1
-    logger.info(f'Course Page Number: {page_num}')
-    response = make_request_using_api_utils(url_ending_with_scope, params)
-    all_course_data = json.loads(response.text)
-    course_dicts = slim_down_course_data(all_course_data)
-    more_pages = True
-
-    while more_pages:
-        next_params = API_UTIL.get_next_page(response)
-        if next_params:
-            page_num += 1
-            logger.info(f'Course Page Number: {page_num}')
-            response = make_request_using_api_utils(url_ending_with_scope, next_params)
-            all_course_data = json.loads(response.text)
-            course_dicts += slim_down_course_data(all_course_data)
-        else:
-            logger.info('No more pages!')
-            more_pages = False
+    course_dicts = []
+    for term_id in term_ids:
+        logger.info(f'Fetching course data for term {term_id}')
+
+        params = {
+            'with_enrollments': True,
+            'enrollment_type': ['student', 'teacher'],
+            'enrollment_term_id': term_id,
+            'per_page': 100,
+            'include': ['total_students']
+        }
+
+        # Make first course request
+        page_num = 1
+        logger.info(f'Course Page Number: {page_num}')
+        response = make_request_using_api_utils(url_ending_with_scope, params)
+        all_course_data = json.loads(response.text)
+        course_dicts += slim_down_course_data(all_course_data)
+        more_pages = True
+
+        while more_pages:
+            next_params = API_UTIL.get_next_page(response)
+            if next_params:
+                page_num += 1
+                logger.info(f'Course Page Number: {page_num}')
+                response = make_request_using_api_utils(url_ending_with_scope, next_params)
+                all_course_data = json.loads(response.text)
+                course_dicts += slim_down_course_data(all_course_data)
+            else:
+                logger.info('No more pages!')
+                more_pages = False
 
     num_course_dicts = len(course_dicts)
-    logger.info(f'Total course records: {num_course_dicts}')
+    logger.info(f'Total course records for all active terms: {num_course_dicts}')
     course_dicts_with_students = []
     for course_dict in course_dicts:
         if course_dict['total_students'] > 0:
@@ -183,10 +225,14 @@ def pull_sis_section_data_from_udw(section_ids: Sequence[int], conn: connection)
 
 def run_course_inventory() -> Sequence[Dict[str, Union[ValidDataSourceName, pd.Timestamp]]]:
     logger.info("* run_course_inventory")
+
     logger.info('Making requests against the Canvas API')
 
+    # Gather term data
+    term_df = gather_term_data_from_api(ACCOUNT_ID, TERM_IDS)
+
     # Gather course data
-    course_df = gather_course_data_from_api(ACCOUNT_ID, TERM_ID)
+    course_df = gather_course_data_from_api(ACCOUNT_ID, TERM_IDS)
 
     logger.info("*** Fetching the published date ***")
     course_available_df = course_df.loc[course_df.workflow_state == 'available'].copy()
@@ -198,11 +244,12 @@ def run_course_inventory() -> Sequence[Dict[str, Union[ValidDataSourceName, pd.T
 
     logger.info("*** Checking for courses available and no published date ***")
     logger.info(course_df[(course_df['workflow_state'] == 'available') & (course_df['published_at'].isnull())])
+
     course_df['created_at'] = pd.to_datetime(course_df['created_at'],
-                                             format="%Y-%m-%dT%H:%M:%SZ",
+                                             format=CANVAS_DATETIME_FORMAT,
                                              errors='coerce')
     course_df['published_at'] = pd.to_datetime(course_df['published_at'],
-                                               format="%Y-%m-%dT%H:%M:%SZ",
+                                               format=CANVAS_DATETIME_FORMAT,
                                                errors='coerce')
 
     logger.info("*** Fetching the canvas course usage data ***")
@@ -260,6 +307,7 @@ def run_course_inventory() -> Sequence[Dict[str, Union[ValidDataSourceName, pd.T
     }
 
     # Produce output
+    num_term_records = len(term_df)
     num_course_records = len(course_df)
     num_user_records = len(user_df)
     num_section_records = len(section_df)
@@ -268,6 +316,10 @@ def run_course_inventory() -> Sequence[Dict[str, Union[ValidDataSourceName, pd.T
 
     if CREATE_CSVS:
         # Generate CSV Output
+        logger.info(f'Writing {num_term_records} term records to CSV')
+        term_df.to_csv(os.path.join('data', 'term.csv'), index=False)
+        logger.info('Wrote data to data/term.csv')
+
         logger.info(f'Writing {num_course_records} course records to CSV')
         course_df.to_csv(os.path.join('data', 'course.csv'), index=False)
         logger.info('Wrote data to data/course.csv')
@@ -288,12 +340,18 @@ def run_course_inventory() -> Sequence[Dict[str, Union[ValidDataSourceName, pd.T
         canvas_course_usage_df.to_csv(os.path.join('data', 'canvas_course_usage.csv'), index=False)
         logger.info('Wrote data to data/canvas_course_usage.csv')
 
-    # Empty tables (if any) in database, then migrate
-    logger.info('Emptying tables in DB')
+    # Initialize DBCreator object
     db_creator_obj = DBCreator(INVENTORY_DB, APPEND_TABLE_NAMES)
+
+    # Empty tables (if any) in database
+    logger.info('Emptying tables in DB')
     db_creator_obj.drop_records()
 
     # Insert gathered data
+    logger.info(f'Inserting {num_term_records} term records to DB')
+    term_df.to_sql('term', db_creator_obj.engine, if_exists='append', index=False)
+    logger.info(f'Inserted data into term table in {db_creator_obj.db_name}')
+
     logger.info(f'Inserting {num_course_records} course records to DB')
     course_df.to_sql('course', db_creator_obj.engine, if_exists='append', index=False)
     logger.info(f'Inserted data into course table in {db_creator_obj.db_name}')

diff --git a/db/migrations/0013.add_term_table.py b/db/migrations/0013.add_term_table.py
@@ -0,0 +1,31 @@
+#
+# file: migrations/0013.add_term_table.py
+#
+from yoyo import step
+
+# Adds the term table and links each course row to a term.
+steps = [
+    # Create the term table, keyed by the Canvas term ID (canvas_id)
+    step('''
+        CREATE TABLE IF NOT EXISTS term
+        (
+            canvas_id INTEGER NOT NULL UNIQUE,
+            name VARCHAR(100) NOT NULL UNIQUE,
+            sis_id INTEGER NOT NULL,
+            start_at DATETIME NOT NULL,
+            end_at DATETIME NOT NULL,
+            PRIMARY KEY (canvas_id)
+        )
+        ENGINE=InnoDB
+        CHARACTER SET utf8mb4;
+    '''),
+    # Add the term reference column to course, placed after account_id
+    # NOTE(review): the column is NOT NULL with no default; presumably the
+    # course table is empty when this migration runs -- confirm
+    step('''
+        ALTER TABLE course
+        ADD COLUMN term_id INTEGER NOT NULL AFTER account_id;
+    '''),
+    # Enforce the course -> term relationship; updates and deletes on term
+    # cascade to the referencing course rows
+    step('''
+        ALTER TABLE course
+        ADD CONSTRAINT fk_term_id
+            FOREIGN KEY (term_id)
+            REFERENCES term(canvas_id)
+            ON UPDATE CASCADE ON DELETE CASCADE;
+    ''')
+]
diff --git a/db/migrations/0014.change_course_sis_id_data_type.py b/db/migrations/0014.change_course_sis_id_data_type.py
@@ -0,0 +1,10 @@
+#
+# file: migrations/0014.change_course_sis_id_data_type.py
+#
+from yoyo import step
+
+# Changes course.sis_id to a nullable VARCHAR(15).
+# NOTE(review): unlike 0013, this step is not collected in a `steps` list;
+# presumably it relies on yoyo registering module-level step() calls -- confirm
+step('''
+    ALTER TABLE course
+    MODIFY
+        sis_id VARCHAR(15) NULL;
+''')
diff --git a/online_meetings/canvas_zoom_meetings.py b/online_meetings/canvas_zoom_meetings.py
@@ -7,7 +7,7 @@
 import re
 import sys
 from datetime import datetime
-from typing import Dict, Optional, List
+from typing import Dict, List, Optional, Sequence, Union
 
 import canvasapi
 import pandas as pd
@@ -160,16 +160,31 @@ def get_zoom_course(self, course: canvasapi.course.Course) -> None:
                 self.get_zoom_details(posturl, formdata, course.id)
         return None
 
-    def zoom_course_report(self, canvas_account: int = 1, enrollment_term_id: int = 0,
-                           published: bool = True, add_course_ids: list = None) -> None:
+    def zoom_course_report(
+        self,
+        canvas_account: int = 1,
+        enrollment_term_ids: Union[Sequence[int], None] = None,
+        published: bool = True,
+        add_course_ids: list = None
+    ) -> None:
 
         account = CANVAS.get_account(canvas_account)
         # Canvas has a limit of 100 per page on this API
         per_page = 100
-        # Get all published courses from the defined enrollment term
+
+        # Get all published courses from the defined enrollment terms
         courses = []
-        if enrollment_term_id:
-            courses = account.get_courses(enrollment_term_id=enrollment_term_id, published=published, per_page=per_page)
+        if enrollment_term_ids is not None:
+            for enrollment_term_id in enrollment_term_ids:
+                logger.info(f'Fetching published course data for term {enrollment_term_id}')
+                courses_list = list(
+                    account.get_courses(
+                        enrollment_term_id=enrollment_term_id,
+                        published=published,
+                        per_page=per_page
+                    )
+                )
+                courses += courses_list
 
         course_count = 0
         for course in courses:
@@ -191,7 +206,7 @@ def zoom_course_report(self, canvas_account: int = 1, enrollment_term_id: int =
 start_time = datetime.now()
 logger.info(f"Script started at {start_time}")
 zoom_placements = ZoomPlacements()
-zoom_placements.zoom_course_report(ENV.get("CANVAS_ACCOUNT_ID", 1), ENV.get("CANVAS_TERM_ID", 0),
+zoom_placements.zoom_course_report(ENV.get("CANVAS_ACCOUNT_ID", 1), ENV.get("CANVAS_TERM_IDS", []),
                                    True, ENV.get("ADD_COURSE_IDS", []))
 
 zoom_courses_df = pd.DataFrame(zoom_placements.zoom_courses)