From 700b986abf7eacacd46d8421da860dbc4561386a Mon Sep 17 00:00:00 2001 From: Sam Sciolla <35741256+ssciolla@users.noreply.github.com> Date: Fri, 22 May 2020 09:52:04 -0400 Subject: [PATCH] Handle possibility of duplicate courses being inserted to `course` table (#174) (#175) * Drop duplicate courses; log number of dropped courses * Improve logging messages --- course_inventory/inventory.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/course_inventory/inventory.py b/course_inventory/inventory.py index 4e3d296..bd4fd12 100644 --- a/course_inventory/inventory.py +++ b/course_inventory/inventory.py @@ -173,10 +173,14 @@ def gather_course_data_from_api(account_id: int, term_ids: Sequence[int]) -> pd. num_course_dicts_with_students = len(course_dicts_with_students) logger.info(f'Course records with students: {num_course_dicts_with_students}') - logger.info(f'Dropped {num_course_dicts - num_course_dicts_with_students} records') + logger.info(f'Dropped {num_course_dicts - num_course_dicts_with_students} course record(s) with no students') course_df = pd.DataFrame(course_dicts_with_students) course_df = course_df.drop(['total_students'], axis='columns') + orig_course_count = len(course_df) + course_df = course_df.drop_duplicates(subset=['canvas_id'], keep='last') + logger.info(f'Dropped {orig_course_count - len(course_df)} duplicate course record(s)') + logger.debug(course_df.head()) return course_df