From 70eddecda0e72cac7e2920a526cc784ac2a648e8 Mon Sep 17 00:00:00 2001 From: Dan LaManna Date: Thu, 4 Jan 2024 12:59:08 -0500 Subject: [PATCH] Migrate to use isic-metadata 1.0.0 --- isic_cli/cli/metadata.py | 42 +++++++++++++++++++++++++------------- setup.py | 4 +--- tests/test_cli_metadata.py | 4 ++-- 3 files changed, 31 insertions(+), 19 deletions(-) diff --git a/isic_cli/cli/metadata.py b/isic_cli/cli/metadata.py index ae0788d..0614f66 100644 --- a/isic_cli/cli/metadata.py +++ b/isic_cli/cli/metadata.py @@ -5,10 +5,11 @@ from pathlib import Path import sys + import click from click.types import IntRange from humanize import intcomma -from isic_metadata.metadata import MetadataBatch, MetadataRow +from isic_metadata.metadata import MetadataBatch, MetadataRow, convert_errors from isic_metadata.utils import get_unstructured_columns from pydantic import ValidationError from rich.console import Console @@ -30,15 +31,22 @@ def metadata(obj): @metadata.command(name="validate") @click.argument( "csv_file", - type=click.File("rb"), + type=click.File("r"), ) def validate(csv_file: io.BufferedReader): """Validate metadata from a local csv.""" - # These imports are slow, inline them. - import pandas as pd - console = Console() - df = pd.read_csv(csv_file, header=0) + + # get number of rows in csv + num_rows = sum(1 for _ in csv_file) + csv_file.seek(0) + + reader = csv.DictReader(csv_file) + headers = reader.fieldnames + + if not headers: + click.secho("No rows found in csv!", fg="red") + sys.exit(1) # batch problems apply to the overall csv and can't be computed without looking at the # entire csv. @@ -47,20 +55,26 @@ def validate(csv_file: io.BufferedReader): # keyed by column, message column_problems: dict[tuple[str, str], list[int]] = defaultdict(list) - for i, (_, row) in track(enumerate(df.iterrows(), start=2), total=len(df)): + batch_items: list[MetadataRow] = [] + + # start enumerate at 2 to account for header row and 1-indexing + for i, row in track( + enumerate(reader, start=2), total=num_rows, description="Validating metadata" + ): + if row.get("patient_id") or row.get("lesion_id"): + batch_items.append( + MetadataRow(patient_id=row.get("patient_id"), lesion_id=row.get("lesion_id")) + ) try: - MetadataRow.model_validate(row.to_dict()) + MetadataRow.model_validate(row) except ValidationError as e: - for error in e.errors(): + for error in convert_errors(e): column = error["loc"][0] column_problems[(column, error["msg"])].append(i) try: MetadataBatch( - items=[ - MetadataRow(patient_id=row.get("patient_id"), lesion_id=row.get("lesion_id")) - for _, row in df.iterrows() - ] + items=batch_items, ) except ValidationError as e: for error in e.errors(): @@ -105,7 +119,7 @@ def validate(csv_file: io.BufferedReader): else: click.secho("No structural errors found!", fg="green") - unstructured_columns = get_unstructured_columns(df) + unstructured_columns = get_unstructured_columns(headers) if unstructured_columns: table = Table(title="Unrecognized Fields") table.add_column("Field", justify="left", style="cyan", no_wrap=True) diff --git a/setup.py b/setup.py index 16ce7e5..baba1ac 100644 --- a/setup.py +++ b/setup.py @@ -42,11 +42,9 @@ # We expect girder-cli-oauth-client to drop oob support in the future "girder-cli-oauth-client<1.0.0", "humanize", - "isic-metadata>=0.4.0", + "isic-metadata>=1.0.0", "more-itertools", - "numpy", "packaging", - "pandas", "requests", "retryable-requests", "rich", diff --git a/tests/test_cli_metadata.py b/tests/test_cli_metadata.py index a4e12d8..8806071 100644 --- a/tests/test_cli_metadata.py +++ b/tests/test_cli_metadata.py @@ -32,8 +32,8 @@ def test_metadata_validate(runner, cli_run): result = cli_run(["metadata", "validate", "foo.csv"]) assert result.exit_code == 1, result.exception - assert re.search(r"Invalid diagnosis.*foo", result.output), result.output - assert re.search(r"Invalid sex.*bar", result.output), result.output + assert re.search(r"Unsupported value for diagnosis: 'foo'.", result.output), result.output + assert re.search(r"sex.*Input should be 'male' or 'female'", result.output), result.output def test_metadata_validate_lesions_patients(runner, cli_run):