Skip to content

Commit

Permalink
ENH: download clinical data files if clinical_index is requested
Browse files Browse the repository at this point in the history
Also address #107 by installing both the indices and clinical_data into the user
data directory instead of the Python package location.
  • Loading branch information
fedorov committed Oct 9, 2024
1 parent a054695 commit 82f7faf
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 13 deletions.
75 changes: 62 additions & 13 deletions idc_index/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,13 @@
import subprocess
import tempfile
import time
from importlib.metadata import distribution
from importlib.metadata import distribution, version
from pathlib import Path

import duckdb
import idc_index_data
import pandas as pd
import platformdirs
import psutil
import requests
from packaging.version import Version
Expand Down Expand Up @@ -86,33 +87,48 @@ def __init__(self):
{"Modality": pd.Series.unique, "series_size_MB": "sum"}
)

idc_version = f"v{Version(idc_index_data.__version__).major}"
self.idc_version = f"v{Version(idc_index_data.__version__).major}"

# since indices can change between versions, we need to store them in a versioned directory
self.indices_data_dir = platformdirs.user_data_dir(
"idc_index_data", "IDC", version=version("idc-index-data")
)
# these are the items that are fetched from IDC release assets (e.g., clinical data files)
self.idc_data_dir = platformdirs.user_data_dir(
"IDC", "IDC", version=self.idc_version
)
self.clinical_data_dir = None

self.indices_overview = {
"index": {
"description": "Main index containing one row per DICOM series.",
"installed": True,
"url": None,
"file_path": idc_index_data.IDC_INDEX_PARQUET_FILEPATH,
},
"previous_versions_index": {
"description": "index containing one row per DICOM series from all previous IDC versions that are not in current version.",
"installed": True,
"url": None,
"file_path": idc_index_data.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH,
},
"sm_index": {
"description": "DICOM Slide Microscopy series-level index.",
"installed": False,
"url": f"{asset_endpoint_url}/sm_index.parquet",
"file_path": None,
},
"sm_instance_index": {
"description": "DICOM Slide Microscopy instance-level index.",
"installed": False,
"url": f"{asset_endpoint_url}/sm_instance_index.parquet",
"file_path": None,
},
"clinical_index": {
"description": "Index of clinical data accompanying the available images.",
"installed": False,
"url": f"{asset_endpoint_url}/clinical_index.parquet",
"file_path": None,
},
}

Expand Down Expand Up @@ -275,28 +291,32 @@ def _check_create_directory(download_dir):

return str(download_dir.resolve())

def fetch_index(self, index) -> None:
def fetch_index(self, index_name) -> None:
"""
Downloads requested index and adds this index joined with the main index as respective class attribute.
Args:
index_name (str): Name of the index to be downloaded.
"""

if index not in self.indices_overview:
logger.error(f"Index {index} is not available and can not be fetched.")
elif self.indices_overview[index]["installed"]:
if index_name not in self.indices_overview:
logger.error(f"Index {index_name} is not available and can not be fetched.")
elif self.indices_overview[index_name]["installed"]:
logger.warning(
f"Index {index} already installed and will not be fetched again."
f"Index {index_name} already installed and will not be fetched again."
)
else:
response = requests.get(self.indices_overview[index]["url"], timeout=30)
logger.info("Fetching index %s", index_name)
response = requests.get(
self.indices_overview[index_name]["url"], timeout=30
)
if response.status_code == 200:
filepath = os.path.join(
idc_index_data.IDC_INDEX_PARQUET_FILEPATH.parents[0],
f"{index}.parquet",
self.indices_data_dir,
f"{index_name}.parquet",
)

os.makedirs(os.path.dirname(filepath), exist_ok=True)
with open(filepath, mode="wb") as file:
file.write(response.content)

Expand All @@ -305,12 +325,41 @@ def fetch_index(self, index) -> None:
# self.index[["series_aws_url", "SeriesInstanceUID"]],
# on="SeriesInstanceUID", how="left"
# )
setattr(self.__class__, index, index_table)
self.indices_overview[index]["installed"] = True
setattr(self.__class__, index_name, index_table)
self.indices_overview[index_name]["installed"] = True
self.indices_overview[index_name]["file_path"] = filepath

else:
logger.error(
f"Failed to fetch index from URL {self.indices_overview[index]['url']}: {response.status_code}"
f"Failed to fetch index from URL {self.indices_overview[index_name]['url']}: {response.status_code}"
)
# if clinical_index is requested, the user will likely need the clinical data too,
# so download it here, given that its size is small (<2MB as of IDC v19)
if index_name == "clinical_index":
logger.info(
"Since clinical_index was fetched, also installing corresponding tables."
)
# create clinical_data folder under self.idc_data_dir, if it does not exist
self.clinical_data_dir = os.path.join(self.idc_data_dir, "clinical_data")
idc_clinical_data_release_url = f"s3://idc-open-metadata/bigquery_export/idc_{self.idc_version}_clinical/*"
result = subprocess.run(
[
self.s5cmdPath,
"--no-sign-request",
"cp",
idc_clinical_data_release_url,
self.clinical_data_dir,
],
capture_output=True,
text=True,
check=True,
)
if result.stderr and result.stdout.startswith("ERROR"):
logger.error("Failed to download IDC clinical data.")
else:
logger.info(
"IDC clinical data downloaded successfully to %s",
self.clinical_data_dir,
)

def get_collections(self):
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ dependencies = [
"idc-index-data==19.0.2",
"packaging",
"pandas<2.2",
"platformdirs",
"psutil",
"pyarrow",
"requests",
Expand Down
7 changes: 7 additions & 0 deletions tests/idcindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -556,6 +556,13 @@ def test_indices_urls(self):
if i.indices_overview[index]["url"] is not None:
assert remote_file_exists(i.indices_overview[index]["url"])

def test_clinical_index_install(self):
    """Check that fetching clinical_index installs it and downloads clinical data files."""
    client = IDCClient()

    # A freshly created client reports the clinical index as not installed.
    installed_before = client.indices_overview["clinical_index"]["installed"]
    assert installed_before is False

    client.fetch_index("clinical_index")

    # After the fetch the overview entry is flagged as installed ...
    installed_after = client.indices_overview["clinical_index"]["installed"]
    assert installed_after is True

    # ... and the clinical data directory contains at least one entry.
    downloaded_entries = os.listdir(client.clinical_data_dir)
    assert len(downloaded_entries) > 0


if __name__ == "__main__":
unittest.main()

0 comments on commit 82f7faf

Please sign in to comment.