Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migrate to use isic-metadata 1.0.0 #70

Merged
merged 1 commit into from
Jan 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 27 additions & 14 deletions isic_cli/cli/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import click
from click.types import IntRange
from humanize import intcomma
from isic_metadata.metadata import MetadataBatch, MetadataRow
from isic_metadata.metadata import MetadataBatch, MetadataRow, convert_errors
from isic_metadata.utils import get_unstructured_columns
from pydantic import ValidationError
from rich.console import Console
Expand All @@ -30,15 +30,22 @@ def metadata(obj):
@metadata.command(name="validate")
@click.argument(
"csv_file",
type=click.File("rb"),
type=click.File("r"),
)
def validate(csv_file: io.BufferedReader):
"""Validate metadata from a local csv."""
# These imports are slow, inline them.
import pandas as pd

console = Console()
df = pd.read_csv(csv_file, header=0)

# get number of rows in csv
num_rows = sum(1 for _ in csv_file)
csv_file.seek(0)

reader = csv.DictReader(csv_file)
headers = reader.fieldnames

if not headers:
click.secho("No rows found in csv!", fg="red")
sys.exit(1)

# batch problems apply to the overall csv and can't be computed without looking at the
# entire csv.
Expand All @@ -47,20 +54,26 @@ def validate(csv_file: io.BufferedReader):
# keyed by column, message
column_problems: dict[tuple[str, str], list[int]] = defaultdict(list)

for i, (_, row) in track(enumerate(df.iterrows(), start=2), total=len(df)):
batch_items: list[MetadataRow] = []

# start enumerate at 2 to account for header row and 1-indexing
for i, row in track(
enumerate(reader, start=2), total=num_rows, description="Validating metadata"
):
if row.get("patient_id") or row.get("lesion_id"):
batch_items.append(
MetadataRow(patient_id=row.get("patient_id"), lesion_id=row.get("lesion_id"))
)
try:
MetadataRow.model_validate(row.to_dict())
MetadataRow.model_validate(row)
except ValidationError as e:
for error in e.errors():
for error in convert_errors(e):
column = error["loc"][0]
column_problems[(column, error["msg"])].append(i)

try:
MetadataBatch(
items=[
MetadataRow(patient_id=row.get("patient_id"), lesion_id=row.get("lesion_id"))
for _, row in df.iterrows()
]
items=batch_items,
)
except ValidationError as e:
for error in e.errors():
Expand Down Expand Up @@ -105,7 +118,7 @@ def validate(csv_file: io.BufferedReader):
else:
click.secho("No structural errors found!", fg="green")

unstructured_columns = get_unstructured_columns(df)
unstructured_columns = get_unstructured_columns(headers)
if unstructured_columns:
table = Table(title="Unrecognized Fields")
table.add_column("Field", justify="left", style="cyan", no_wrap=True)
Expand Down
4 changes: 1 addition & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,9 @@
# We expect girder-cli-oauth-client to drop oob support in the future
"girder-cli-oauth-client<1.0.0",
"humanize",
"isic-metadata>=0.4.0",
"isic-metadata>=1.0.0",
"more-itertools",
"numpy",
"packaging",
"pandas",
"requests",
"retryable-requests",
"rich",
Expand Down
4 changes: 2 additions & 2 deletions tests/test_cli_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ def test_metadata_validate(runner, cli_run):
result = cli_run(["metadata", "validate", "foo.csv"])

assert result.exit_code == 1, result.exception
assert re.search(r"Invalid diagnosis.*foo", result.output), result.output
assert re.search(r"Invalid sex.*bar", result.output), result.output
assert re.search(r"Unsupported value for diagnosis: 'foo'.", result.output), result.output
assert re.search(r"sex.*Input should be 'male' or 'female'", result.output), result.output


def test_metadata_validate_lesions_patients(runner, cli_run):
Expand Down