ENH: support download by crdc_series_uuid

Aims to resolve #117
ImagingDataCommons · Oct 4, 2024 · 510bbc8 · 510bbc8
1 parent 9f7615b
commit 510bbc8
Show file tree

Hide file tree

Showing 3 changed files with 84 additions and 16 deletions.
diff --git a/idc_index/cli.py b/idc_index/cli.py
@@ -83,6 +83,13 @@ def set_log_level(log_level):
     default=None,
     help="DICOM SeriesInstanceUID(s) to filter by.",
 )
+@click.option(
+    "--crdc-series-uuid",
+    type=str,
+    multiple=True,
+    default=None,
+    help="crdc_series_uuid(s) to filter by.",
+)
 @click.option(
     "--quiet",
     type=bool,
@@ -122,6 +129,7 @@ def download_from_selection(
     patient_id,
     study_instance_uid,
     series_instance_uid,
+    crdc_series_uuid,
     quiet,
     show_progress_bar,
     use_s5cmd_sync,
@@ -159,11 +167,17 @@ def download_from_selection(
         if series_instance_uid
         else None
     )
+    crdc_series_uuid = (
+        [uid.strip() for uid in (",".join(crdc_series_uuid)).split(",")]
+        if crdc_series_uuid
+        else None
+    )
     logger_cli.debug("Inputs received from cli download:")
     logger_cli.debug(f"collection_id: {collection_id}")
     logger_cli.debug(f"patient_id: {patient_id}")
     logger_cli.debug(f"study_instance_uid: {study_instance_uid}")
     logger_cli.debug(f"series_instance_uid: {series_instance_uid}")
+    logger_cli.debug(f"crdc_series_uuid: {crdc_series_uuid}")
     logger_cli.debug(f"dry_run: {dry_run}")
     logger_cli.debug(f"quiet: {quiet}")
     logger_cli.debug(f"show_progress_bar: {show_progress_bar}")
@@ -177,6 +191,7 @@ def download_from_selection(
         patientId=patient_id,
         studyInstanceUID=study_instance_uid,
         seriesInstanceUID=series_instance_uid,
+        crdc_series_uuid=crdc_series_uuid,
         quiet=quiet,
         show_progress_bar=show_progress_bar,
         use_s5cmd_sync=use_s5cmd_sync,
@@ -346,9 +361,12 @@ def check_and_download(column_name, item_ids, download_dir, kwarg_name):
         matches_found += check_and_download(
             "SeriesInstanceUID", item_ids, download_dir, "seriesInstanceUID"
         )
+        matches_found += check_and_download(
+            "crdc_series_uuid", item_ids, download_dir, "crdc_series_uuid"
+        )
         if not matches_found:
             logger_cli.error(
-                "None of the values passed matched any of the identifiers: collection_id, PatientID, StudyInstanceUID, SeriesInstanceUID."
+                "None of the values passed matched any of the identifiers: collection_id, PatientID, StudyInstanceUID, SeriesInstanceUID, crdc_series_uuid."
             )
 
 

diff --git a/idc_index/index.py b/idc_index/index.py
@@ -64,6 +64,15 @@ def __init__(self):
         logger.debug(f"Reading index file v{idc_index_data.__version__}")
         self.index = pd.read_parquet(file_path)
 
+        # initialize crdc_series_uuid for the index (4th "/"-separated part of series_aws_url)
+        # TODO: once https://github.com/ImagingDataCommons/idc-index/pull/113 is merged
+        # (waiting for it minimizes disruption), it will make more sense to change
+        # idc-index-data to store the bucket separately from crdc_series_uuid, add
+        # support for GCP, and simplify the code here accordingly
+        self.index["crdc_series_uuid"] = (
+            self.index["series_aws_url"].str.split("/").str[3]
+        )
+
         self.previous_versions_index_path = (
             idc_index_data.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH
         )
@@ -136,7 +145,12 @@ def _filter_dataframe_by_id(key, dataframe, _id):
 
     @staticmethod
     def _safe_filter_by_selection(
-        df_index, collection_id, patientId, studyInstanceUID, seriesInstanceUID
+        df_index,
+        collection_id,
+        patientId,
+        studyInstanceUID,
+        seriesInstanceUID,
+        crdc_series_uuid,
     ):
         if collection_id is not None:
             if not isinstance(collection_id, str) and not isinstance(
@@ -156,26 +170,48 @@ def _safe_filter_by_selection(
                 seriesInstanceUID, list
             ):
                 raise TypeError("seriesInstanceUID must be a string or list of strings")
-
-        if collection_id is not None:
-            result_df = IDCClient._filter_by_collection_id(df_index, collection_id)
-        else:
-            result_df = df_index
-
-        if patientId is not None:
-            result_df = IDCClient._filter_by_patient_id(result_df, patientId)
-
-        if studyInstanceUID is not None:
-            result_df = IDCClient._filter_by_dicom_study_uid(
-                result_df, studyInstanceUID
+        if crdc_series_uuid is not None:
+            if not isinstance(crdc_series_uuid, str) and not isinstance(
+                crdc_series_uuid, list
+            ):
+                raise TypeError("crdc_series_uuid must be a string or list of strings")
+
+        # Filter bottom-up through the identifier hierarchy, relying on the
+        # direction of the one-to-many relationships:
+        #   one crdc_series_uuid is associated with exactly one SeriesInstanceUID
+        #   one SeriesInstanceUID is associated with exactly one StudyInstanceUID
+        #   one StudyInstanceUID is associated with exactly one PatientID
+        #   one PatientID is associated with exactly one collection_id
+        # Because of this, only the most specific identifier provided is applied;
+        # identifiers above it in the hierarchy are ignored.
+        # The earlier behavior (applying every filter in turn) was a relic of the API of
+        # a different system that influenced SlicerIDCIndex and propagated into idc-index.
+
+        if crdc_series_uuid is not None:
+            result_df = IDCClient._filter_dataframe_by_id(
+                "crdc_series_uuid", df_index, crdc_series_uuid
             )
+            return result_df
 
         if seriesInstanceUID is not None:
             result_df = IDCClient._filter_by_dicom_series_uid(
-                result_df, seriesInstanceUID
+                df_index, seriesInstanceUID
             )
+            return result_df
+
+        if studyInstanceUID is not None:
+            result_df = IDCClient._filter_by_dicom_study_uid(df_index, studyInstanceUID)
+            return result_df
+
+        if patientId is not None:
+            result_df = IDCClient._filter_by_patient_id(df_index, patientId)
+            return result_df
+
+        if collection_id is not None:
+            result_df = IDCClient._filter_by_collection_id(df_index, collection_id)
+            return result_df
 
-        return result_df
+        return None
 
     @staticmethod
     def _filter_by_collection_id(df_index, collection_id):
@@ -1399,6 +1435,7 @@ def citations_from_selection(
         patientId=None,
         studyInstanceUID=None,
         seriesInstanceUID=None,
+        crdc_series_uuid=None,
         citation_format=CITATION_FORMAT_APA,
     ):
         """Get the list of publications that should be cited/attributed for the specific collection, patient (case) ID, study or series UID.
@@ -1419,6 +1456,7 @@ def citations_from_selection(
             patientId=patientId,
             studyInstanceUID=studyInstanceUID,
             seriesInstanceUID=seriesInstanceUID,
+            crdc_series_uuid=crdc_series_uuid,
         )
 
         citations = []
@@ -1469,6 +1507,7 @@ def download_from_selection(
         patientId=None,
         studyInstanceUID=None,
         seriesInstanceUID=None,
+        crdc_series_uuid=None,
         quiet=True,
         show_progress_bar=True,
         use_s5cmd_sync=False,
@@ -1484,6 +1523,7 @@ def download_from_selection(
             patientId: string or list of strings containing the values of PatientID to filter by
             studyInstanceUID: string or list of strings containing the values of DICOM StudyInstanceUID to filter by
             seriesInstanceUID: string or list of strings containing the values of DICOM SeriesInstanceUID to filter by
+            crdc_series_uuid: string or list of strings containing the values of crdc_series_uuid to filter by
             quiet (bool): If True, suppresses the output of the subprocess. Defaults to True
             show_progress_bar (bool): If True, tracks the progress of download
             use_s5cmd_sync (bool): If True, will use s5cmd sync operation instead of cp when downloadDirectory is not empty; this can significantly improve the download speed if the content is partially downloaded
@@ -1499,6 +1539,7 @@ def download_from_selection(
             patientId=patientId,
             studyInstanceUID=studyInstanceUID,
             seriesInstanceUID=seriesInstanceUID,
+            crdc_series_uuid=crdc_series_uuid,
         )
 
         total_size = round(result_df["series_size_MB"].sum(), 2)

diff --git a/tests/idcindex.py b/tests/idcindex.py
@@ -469,10 +469,19 @@ def test_cli_download(self):
         with runner.isolated_filesystem():
             result = runner.invoke(
                 self.download,
+                # StudyInstanceUID:
                 ["1.3.6.1.4.1.14519.5.2.1.7695.1700.114861588187429958687900856462"],
             )
             assert len(os.listdir(Path.cwd())) != 0
 
+        with runner.isolated_filesystem():
+            result = runner.invoke(
+                self.download,
+                # crdc_series_uuid:
+                ["e5c5c71d-62c4-4c50-a8a9-b6799c7f8dea"],
+            )
+            assert len(os.listdir(Path.cwd())) != 0
+
     def test_prior_version_manifest(self):
         # Define the values for each optional parameter
         quiet_values = [True, False]