From 3978df4a9c1c9634b328f77c47646d519b71035f Mon Sep 17 00:00:00 2001
From: Alex Dusenbery
Date: Fri, 8 Mar 2024 16:08:16 -0500
Subject: [PATCH] fix: some email validation and encoding by latin-1

---
Notes:
    Review notes. Text between the "---" line and the diffstat is
    commentary and does not become part of the commit.

    - Line breaks and indentation of this patch were restored from the
      hunk line counts and the diffstat; no +/-/context token was changed.
    - is_valid_email() in both scripts only checks that parseaddr()
      returned a non-empty address. Presumably a bare token without "@"
      still passes this check -- TODO confirm against production input.
    - `import re` is added to local_assignment_multi.py, but no hunk in
      this patch uses it.

 scripts/assignment_validation.py  | 31 +++++++++++++++++++++++++++----
 scripts/local_assignment_multi.py | 18 ++++++++++++++++--
 2 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/scripts/assignment_validation.py b/scripts/assignment_validation.py
index a9c3abf5..4e1bdfe2 100644
--- a/scripts/assignment_validation.py
+++ b/scripts/assignment_validation.py
@@ -14,18 +14,18 @@
 """
 import csv
 from collections import defaultdict, Counter
+from email.utils import parseaddr
 
 import click
 
-INPUT_FIELDNAMES = ['email', 'university_name']
+INPUT_FIELDNAMES = ['university_name', 'email']
 
 
 def _iterate_csv(input_file):
-    with open(input_file, 'r') as f_in:
+    with open(input_file, 'r', encoding='latin-1') as f_in:
         reader = csv.DictReader(f_in, fieldnames=INPUT_FIELDNAMES, delimiter=',')
         # read and skip the header
         next(reader, None)
-        breakpoint()
         for row in reader:
             yield row
 
@@ -42,7 +42,7 @@ def print_duplicates(input_file):
 
     for email, uni_list in unis_by_email.items():
         if len(uni_list) > 1:
-            print(email, uni_list)
+            print(email or 'THE EMPTY STRING', 'is contained in', len(uni_list), 'different rows')
 
 
 @click.command()
@@ -59,6 +59,28 @@ def print_plan_counts(input_file):
         print(plan, count)
 
 
+def is_valid_email(email):
+    _, address = parseaddr(email)
+    if not address:
+        return False
+    return True
+
+
+@click.command()
+@click.option(
+    '--input-file',
+    help='Path of local file containing email addresses to assign.',
+)
+def validate_emails(input_file):
+    invalid_emails = Counter()
+    for row in _iterate_csv(input_file):
+        if not is_valid_email(row['email']):
+            invalid_emails[row['email']] += 1
+
+    print(f'There were {sum(invalid_emails.values())} invalid emails')
+    print(invalid_emails)
+
+
 @click.group()
 def run():
     pass
@@ -66,6 +88,7 @@ def run():
 
 run.add_command(print_duplicates)
 run.add_command(print_plan_counts)
+run.add_command(validate_emails)
 
 
 if __name__ == '__main__':
diff --git a/scripts/local_assignment_multi.py b/scripts/local_assignment_multi.py
index 94f668e4..ce1cb75d 100644
--- a/scripts/local_assignment_multi.py
+++ b/scripts/local_assignment_multi.py
@@ -40,7 +40,9 @@
 import csv
 import json
 import os
+import re
 import time
+from email.utils import parseaddr
 from pprint import pprint
 
 import click
@@ -146,6 +148,11 @@ def get_plan_uuids_by_name(plans_by_name_file):
     return plans_by_name
 
 
+def is_valid_email(email):
+    _, address = parseaddr(email)
+    return bool(address)
+
+
 def get_email_chunks(input_file_path, plans_by_name, chunk_size=DEFAULT_CHUNK_SIZE):
     """
     Yield chunks of (chunk_id, subscription_plan, email) from the given input file.
@@ -159,7 +166,10 @@ def get_email_chunks(input_file_path, plans_by_name, chunk_size=DEFAULT_CHUNK_SI
     current_chunk = []
     chunk_id = 0
     current_subscription_plan_uuid = None
-    with open(input_file_path, 'r') as f_in:
+    # CSVs can contain non-ascii characters, latin-1
+    # is the encoding that currently works with our production input.
+    # could eventually be parameterized as input to this command.
+    with open(input_file_path, 'r', encoding='latin-1') as f_in:
         reader = csv.DictReader(f_in, fieldnames=INPUT_FIELDNAMES, delimiter=',')
 
         # read and skip the header
@@ -167,6 +177,10 @@ def get_email_chunks(input_file_path, plans_by_name, chunk_size=DEFAULT_CHUNK_SI
 
         for row in reader:
             email = row['email']
+            if not is_valid_email(email):
+                print("Invalid email:", email)
+                continue
+
             university_name = row['university_name']
             subscription_plan_uuid = plans_by_name[university_name]
 
@@ -198,7 +212,7 @@ def get_email_chunks(input_file_path, plans_by_name, chunk_size=DEFAULT_CHUNK_SI
 
 def _post_assignments(subscription_plan_uuid, emails_for_chunk, environment='local', fetch_jwt=False):
     """
-    Maket the POST request to assign licenses.
+    Make the POST request to assign licenses.
     """
     url_pattern = ENVIRONMENTS[environment]
     url = url_pattern.format(subscription_plan_uuid=subscription_plan_uuid)