Skip to content

Commit

Permalink
ENH: support download by crdc_series_uuid
Browse files Browse the repository at this point in the history
Aims to resolve #117
  • Loading branch information
fedorov committed Oct 4, 2024
1 parent 9f7615b commit 510bbc8
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 16 deletions.
20 changes: 19 additions & 1 deletion idc_index/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,13 @@ def set_log_level(log_level):
default=None,
help="DICOM SeriesInstanceUID(s) to filter by.",
)
@click.option(
"--crdc-series-uuid",
type=str,
multiple=True,
default=None,
help="crdc_series_uuid(s) to filter by.",
)
@click.option(
"--quiet",
type=bool,
Expand Down Expand Up @@ -122,6 +129,7 @@ def download_from_selection(
patient_id,
study_instance_uid,
series_instance_uid,
crdc_series_uuid,
quiet,
show_progress_bar,
use_s5cmd_sync,
Expand Down Expand Up @@ -159,11 +167,17 @@ def download_from_selection(
if series_instance_uid
else None
)
crdc_series_uuid = (
[uid.strip() for uid in (",".join(crdc_series_uuid)).split(",")]
if crdc_series_uuid
else None
)
logger_cli.debug("Inputs received from cli download:")
logger_cli.debug(f"collection_id: {collection_id}")
logger_cli.debug(f"patient_id: {patient_id}")
logger_cli.debug(f"study_instance_uid: {study_instance_uid}")
logger_cli.debug(f"series_instance_uid: {series_instance_uid}")
logger_cli.debug(f"crdc_series_uuid: {crdc_series_uuid}")
logger_cli.debug(f"dry_run: {dry_run}")
logger_cli.debug(f"quiet: {quiet}")
logger_cli.debug(f"show_progress_bar: {show_progress_bar}")
Expand All @@ -177,6 +191,7 @@ def download_from_selection(
patientId=patient_id,
studyInstanceUID=study_instance_uid,
seriesInstanceUID=series_instance_uid,
crdc_series_uuid=crdc_series_uuid,
quiet=quiet,
show_progress_bar=show_progress_bar,
use_s5cmd_sync=use_s5cmd_sync,
Expand Down Expand Up @@ -346,9 +361,12 @@ def check_and_download(column_name, item_ids, download_dir, kwarg_name):
matches_found += check_and_download(
"SeriesInstanceUID", item_ids, download_dir, "seriesInstanceUID"
)
matches_found += check_and_download(
"crdc_series_uuid", item_ids, download_dir, "crdc_series_uuid"
)
if not matches_found:
logger_cli.error(
"None of the values passed matched any of the identifiers: collection_id, PatientID, StudyInstanceUID, SeriesInstanceUID."
"None of the values passed matched any of the identifiers: collection_id, PatientID, StudyInstanceUID, SeriesInstanceUID, crdc_series_uuid."
)


Expand Down
71 changes: 56 additions & 15 deletions idc_index/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,15 @@ def __init__(self):
logger.debug(f"Reading index file v{idc_index_data.__version__}")
self.index = pd.read_parquet(file_path)

# initialize crdc_series_uuid for the index
# TODO: in the future, after https://github.com/ImagingDataCommons/idc-index/pull/113
# is merged (to minimize disruption), it will make more sense to change
# idc-index-data to separate bucket from crdc_series_uuid, add support for GCP,
# and consequently simplify the code here
self.index["crdc_series_uuid"] = (
self.index["series_aws_url"].str.split("/").str[3]
)

self.previous_versions_index_path = (
idc_index_data.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH
)
Expand Down Expand Up @@ -136,7 +145,12 @@ def _filter_dataframe_by_id(key, dataframe, _id):

@staticmethod
def _safe_filter_by_selection(
df_index, collection_id, patientId, studyInstanceUID, seriesInstanceUID
df_index,
collection_id,
patientId,
studyInstanceUID,
seriesInstanceUID,
crdc_series_uuid,
):
if collection_id is not None:
if not isinstance(collection_id, str) and not isinstance(
Expand All @@ -156,26 +170,48 @@ def _safe_filter_by_selection(
seriesInstanceUID, list
):
raise TypeError("seriesInstanceUID must be a string or list of strings")

if collection_id is not None:
result_df = IDCClient._filter_by_collection_id(df_index, collection_id)
else:
result_df = df_index

if patientId is not None:
result_df = IDCClient._filter_by_patient_id(result_df, patientId)

if studyInstanceUID is not None:
result_df = IDCClient._filter_by_dicom_study_uid(
result_df, studyInstanceUID
if crdc_series_uuid is not None:
if not isinstance(crdc_series_uuid, str) and not isinstance(
crdc_series_uuid, list
):
raise TypeError("crdc_series_uuid must be a string or list of strings")

# Here we go down-up the hierarchy of filtering, taking into
# account the direction of one-to-many relationships
# one crdc_series_uuid can be associated with one and only one SeriesInstanceUID
# one SeriesInstanceUID can be associated with one and only one StudyInstanceUID
# one StudyInstanceUID can be associated with one and only one PatientID
# one PatientID can be associated with one and only one collection_id
# because of this we do not need to apply attributes above the given defined
# attribute in the hierarchy
# The earlier implemented behavior was a relic of the API from a different system
# that influenced the API of SlicerIDCIndex, and propagated into idc-index. Unfortunately.

if crdc_series_uuid is not None:
result_df = IDCClient._filter_dataframe_by_id(
"crdc_series_uuid", df_index, crdc_series_uuid
)
return result_df

if seriesInstanceUID is not None:
result_df = IDCClient._filter_by_dicom_series_uid(
result_df, seriesInstanceUID
df_index, seriesInstanceUID
)
return result_df

if studyInstanceUID is not None:
result_df = IDCClient._filter_by_dicom_study_uid(df_index, studyInstanceUID)
return result_df

if patientId is not None:
result_df = IDCClient._filter_by_patient_id(df_index, patientId)
return result_df

if collection_id is not None:
result_df = IDCClient._filter_by_collection_id(df_index, collection_id)
return result_df

return result_df
return None

@staticmethod
def _filter_by_collection_id(df_index, collection_id):
Expand Down Expand Up @@ -1399,6 +1435,7 @@ def citations_from_selection(
patientId=None,
studyInstanceUID=None,
seriesInstanceUID=None,
crdc_series_uuid=None,
citation_format=CITATION_FORMAT_APA,
):
"""Get the list of publications that should be cited/attributed for the specific collection, patient (case) ID, study or series UID.
Expand All @@ -1419,6 +1456,7 @@ def citations_from_selection(
patientId=patientId,
studyInstanceUID=studyInstanceUID,
seriesInstanceUID=seriesInstanceUID,
crdc_series_uuid=crdc_series_uuid,
)

citations = []
Expand Down Expand Up @@ -1469,6 +1507,7 @@ def download_from_selection(
patientId=None,
studyInstanceUID=None,
seriesInstanceUID=None,
crdc_series_uuid=None,
quiet=True,
show_progress_bar=True,
use_s5cmd_sync=False,
Expand All @@ -1484,6 +1523,7 @@ def download_from_selection(
patientId: string or list of strings containing the values of PatientID to filter by
studyInstanceUID: string or list of strings containing the values of DICOM StudyInstanceUID to filter by
seriesInstanceUID: string or list of strings containing the values of DICOM SeriesInstanceUID to filter by
crdc_series_uuid: string or list of strings containing the values of crdc_series_uuid to filter by
quiet (bool): If True, suppresses the output of the subprocess. Defaults to True
show_progress_bar (bool): If True, tracks the progress of download
use_s5cmd_sync (bool): If True, will use s5cmd sync operation instead of cp when downloadDirectory is not empty; this can significantly improve the download speed if the content is partially downloaded
Expand All @@ -1499,6 +1539,7 @@ def download_from_selection(
patientId=patientId,
studyInstanceUID=studyInstanceUID,
seriesInstanceUID=seriesInstanceUID,
crdc_series_uuid=crdc_series_uuid,
)

total_size = round(result_df["series_size_MB"].sum(), 2)
Expand Down
9 changes: 9 additions & 0 deletions tests/idcindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -469,10 +469,19 @@ def test_cli_download(self):
with runner.isolated_filesystem():
result = runner.invoke(
self.download,
# StudyInstanceUID:
["1.3.6.1.4.1.14519.5.2.1.7695.1700.114861588187429958687900856462"],
)
assert len(os.listdir(Path.cwd())) != 0

with runner.isolated_filesystem():
result = runner.invoke(
self.download,
# crdc_series_uuid:
["e5c5c71d-62c4-4c50-a8a9-b6799c7f8dea"],
)
assert len(os.listdir(Path.cwd())) != 0

def test_prior_version_manifest(self):
# Define the values for each optional parameter
quiet_values = [True, False]
Expand Down

0 comments on commit 510bbc8

Please sign in to comment.