From c8bdbb7cc185281a16db1e11e5eb818e3c6bf0ae Mon Sep 17 00:00:00 2001 From: vkt1414 Date: Thu, 30 Nov 2023 20:54:39 -0500 Subject: [PATCH 1/3] create github actions for managing prs and commits --- .github/__init__.py | 0 .github/get_latest_index.py | 95 ++++++++++++ .../workflows/commit-pr-release-manager.yml | 141 ++++++++++++++++++ .github/workflows/get-latest-index.yml | 83 +++++++++++ .github/workflows/python-package.yml | 39 ----- idc_index/index.py | 3 +- setup.py | 2 +- 7 files changed, 322 insertions(+), 41 deletions(-) create mode 100644 .github/__init__.py create mode 100644 .github/get_latest_index.py create mode 100644 .github/workflows/commit-pr-release-manager.yml create mode 100644 .github/workflows/get-latest-index.yml delete mode 100644 .github/workflows/python-package.yml diff --git a/.github/__init__.py b/.github/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/.github/get_latest_index.py b/.github/get_latest_index.py new file mode 100644 index 00000000..739eecd1 --- /dev/null +++ b/.github/get_latest_index.py @@ -0,0 +1,95 @@ +import os +import re +import requests +import sys +import uuid +from google.cloud import bigquery + +class IDCIndexManager: + def __init__(self, project_id): + print("Initializing IDCIndexManager...") + self.project_id = project_id + self.client = bigquery.Client(project=project_id) + + def get_latest_idc_release_version(self, view_id): + print("Getting latest IDC release version...") + view = self.client.get_table(view_id) + latest_idc_release_version=int(re.search(r"idc_v(\d+)", view.view_query).group(1)) + return latest_idc_release_version + + def extract_index_version(self, file_path): + print(f"Extracting index version from {file_path}...") + with open(file_path, "r") as file: + for line in file: + if "def get_idc_version(self):" in line: + return int(re.findall(r"v(\d+)", next(file))[0]) + + def update_index_version(self, file_path, latest_idc_release_version): + print(f"Updating index version in {file_path}...") + with open(file_path, "r") as file: + lines = file.readlines() + with open(file_path, "w") as file: + for i in range(len(lines)): + if "def get_idc_version(self):" in lines[i]: + lines[i + 1] = re.sub( + r"v(\d+)", f"v{latest_idc_release_version}", lines[i + 1] + ) + file.write(lines[i]) + + def update_sql_queries_folder( + self, dir, current_index_version, latest_idc_release_version + ): + print(f"Updating SQL queries from {dir}...") + for file_name in os.listdir(dir): + if file_name.endswith(".sql"): + file_path = os.path.join(dir, file_name) + with open(file_path, "r") as file: + sql_query = file.read() + modified_sql_query = sql_query.replace( + f"idc_v{current_index_version}", f"idc_v{latest_idc_release_version}" + ) + with open(file_path, "w") as file: + file.write(modified_sql_query) + return modified_sql_query + + def execute_sql_query(self, file_path): + print(f"Executing SQL query from {file_path}...") + with open(file_path, "r") as file: + sql_query = file.read() + df = self.client.query(sql_query).to_dataframe() + csv_file_name = f"{os.path.basename(file_path).split('.')[0]}.csv.zip" + return df, csv_file_name + + def create_csv_zip_from_df(self, df, csv_file_name): + print(f"Creating CSV zip file {csv_file_name}...") + df.to_csv(csv_file_name, compression={'method': 'zip'}, escapechar="\\") + + def run_queries_folder(self, dir): + print(f"Running queries from {dir}...") + for file_name in os.listdir(dir): + if file_name.endswith(".sql"): + file_path = os.path.join(dir, file_name) + df, csv_file_name = self.execute_sql_query(file_path) + self.create_csv_zip_from_df(df, csv_file_name) + + def set_multiline_output(self, name, value): + print(f"Setting multiline output {name}...") + with open(os.environ["GITHUB_OUTPUT"], "a") as fh: + delimiter = uuid.uuid1() + print(f"{name}<<{delimiter}", file=fh) + print(value, file=fh) + print(delimiter, file=fh) + + def run(self): + print("Running IDCIndexManager...") + latest_idc_release_version = self.get_latest_idc_release_version("bigquery-public-data.idc_current.dicom_all_view") + print(f"Latest IDC release version: {latest_idc_release_version}") + current_index_version = self.extract_index_version("idc_index/index.py") + print(f"Current index version: {current_index_version}") + self.set_multiline_output("current_index_version", int(current_index_version)) + self.set_multiline_output("latest_idc_release_version", int(latest_idc_release_version)) + + +if __name__ == "__main__": + manager = IDCIndexManager("gcp-project-id") + manager.run() diff --git a/.github/workflows/commit-pr-release-manager.yml b/.github/workflows/commit-pr-release-manager.yml new file mode 100644 index 00000000..5efbf672 --- /dev/null +++ b/.github/workflows/commit-pr-release-manager.yml @@ -0,0 +1,141 @@ +name: commit-pr-idc-index-release-manager +on: + workflow_dispatch: + push: + pull_request: + +jobs: + update_idc_index: + runs-on: ubuntu-latest + environment: + name: pypi + url: https://pypi.org/p/idc-index + permissions: + id-token: write + contents: write + pull-requests: read + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.11 + + - name: Install dependencies + run: | + pip install requests==2.31.0 pandas==2.1.1 google-cloud-bigquery==3.12.0 \ + pyarrow==13.0.0 db-dtypes==1.1.1 PyGithub==2.1.1 flake8==6.1.0 \ + duckdb==0.9.2 + shell: bash + + - name: Authorize Google Cloud + uses: google-github-actions/auth@v1 + with: + credentials_json: ${{ secrets.SERVICE_ACCOUNT_KEY }} + create_credentials_file: true + export_environment_variables: true + + - name: Check if queries folder changed in the latest commit or pull request + uses: dorny/paths-filter@v2 + id: pr_proposed_changes + with: + filters: | + queries: + - 'queries/**' + + - name: If queries are modified, run them with bigquery + id: initialize_idc_manager_class + if: steps.pr_proposed_changes.outputs.queries == 'true' + shell: python + run: | + import sys + import os + sys.path.append(".github") + + from get_latest_index import IDCIndexManager + + project_id = os.environ['GCP_PROJECT_ID'] + manager = IDCIndexManager(project_id) + + manager.run_queries_folder("queries/") + env: + GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }} + + - name: If queries are not modified download latest index from github + if: steps.pr_proposed_changes.outputs.queries == 'false' || startsWith(github.ref, 'refs/tags/v') + run: + wget -q https://github.com/ImagingDataCommons/idc-index/releases/download/latest/idc_index.csv.zip + shell: bash + + - name: Setup testing + run: | + wget -q "https://github.com/peak/s5cmd/releases/download/v2.2.2/s5cmd_2.2.2_Linux-64bit.tar.gz" + tar -xvzf "s5cmd_2.2.2_Linux-64bit.tar.gz" s5cmd + rm "s5cmd_2.2.2_Linux-64bit.tar.gz" + mv s5cmd /usr/local/bin/s5cmd + shell: bash + + - name: If queries are modified by a pull request, change latest_index_url to use locally available csv + if: steps.pr_proposed_changes.outputs.queries == 'true' && (github.event_name == 'pull_request' || github.ref != 'refs/heads/main') + run: | + import os + from pathlib import Path + home_dir = str(Path.home()) + with open('idc_index/index.py', 'r') as file: + filedata = file.read() + filedata = filedata.replace('https://github.com/ImagingDataCommons/idc-index/releases/download/latest/idc_index.csv.zip', os.path.join(home_dir, 'work/idc-index/idc-index/idc_index.csv.zip')) + with open('idc_index/index.py', 'w') as file: + file.write(filedata) + shell: python + + - name: Test package + run: | + python -m unittest -vv tests/idcindex.py + shell: bash + + - name: Create Tagged Release + id: create_tagged_release + if: (startsWith(github.ref, 'refs/tags/v') && github.event_name != 'pull_request') + uses: ncipollo/release-action@v1 + with: + prerelease: false + body: 'Versioned idc-index' + artifacts: "*.zip" + + - name: Create latest Release + if: (github.event_name != 'pull_request') && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/')) + uses: crowbarmaster/GH-Automatic-Releases@latest + with: + repo_token: "${{ secrets.GITHUB_TOKEN }}" + automatic_release_tag: "latest" + #generate_notes: false + body: "Latest idc-index" + prerelease: true + title: "Latest idc-index" + files: | + *.zip + + - name: Get version + uses: mtkennerly/dunamai-action@v1 + if: (startsWith(github.ref, 'refs/tags/v') && github.event_name != 'pull_request') + with: + env-var: set_pypi_idc_index_version + args: --style semver + + - name: Echo soon to be released pypi version + if: (startsWith(github.ref, 'refs/tags/v') && github.event_name != 'pull_request') + run: | + echo $set_pypi_idc_index_version + + - name: Build a source tarball + if: (startsWith(github.ref, 'refs/tags/v') && github.event_name != 'pull_request') + run: python setup.py sdist + + - name: Publish distribution to PyPI + if: (startsWith(github.ref, 'refs/tags/v') && github.event_name != 'pull_request' ) + uses: pypa/gh-action-pypi-publish@release/v1 \ No newline at end of file diff --git a/.github/workflows/get-latest-index.yml b/.github/workflows/get-latest-index.yml new file mode 100644 index 00000000..73445229 --- /dev/null +++ b/.github/workflows/get-latest-index.yml @@ -0,0 +1,83 @@ +name: idc-index release manager +on: + workflow_dispatch: + schedule: + - cron: 0 12 */1 * * + +jobs: + update_idc_index: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.11 + + - name: Install dependencies + run: pip install requests==2.31.0 pandas==2.1.1 google-cloud-bigquery==3.12.0 pyarrow==13.0.0 db-dtypes==1.1.1 PyGithub==2.1.1 + + - name: Authorize Google Cloud + uses: google-github-actions/auth@v1 + with: + credentials_json: ${{ secrets.SERVICE_ACCOUNT_KEY }} + create_credentials_file: true + export_environment_variables: true + + - name: Run script to get the latest idc index + id: initialize_idc_manager_class + shell: python + run: | + import sys + import os + sys.path.append(".github") + + from get_latest_index import IDCIndexManager + + project_id = os.environ['GCP_PROJECT_ID'] + manager = IDCIndexManager(project_id) + + current_index_version = manager.extract_index_version("idc_index/index.py") + latest_idc_release_version = manager.get_latest_idc_release_version("bigquery-public-data.idc_current.dicom_all_view") + + if current_index_version < latest_idc_release_version: + manager.update_index_version("idc_index/index.py", latest_idc_release_version) + manager.update_sql_queries_folder("queries/", current_index_version, latest_idc_release_version) + manager.run_queries_folder("queries/") + + manager.set_multiline_output("current_index_version", int(current_index_version)) + manager.set_multiline_output("latest_idc_release_version", int(latest_idc_release_version)) + env: + GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }} + + - name: Create latest Release + id: create_release + if: ${{ steps.initialize_idc_manager_class.outputs.current_index_version != steps.initialize_idc_manager_class.outputs.latest_idc_release_version }} + uses: ncipollo/release-action@v1 + with: + tag: latest + prerelease: true + makeLatest: true + allowUpdates: true + body: 'Latest idc-index' + artifacts: "*.zip" + + - name: Create Pull Request + uses: peter-evans/create-pull-request@v5 + with: + title: Update to v${{ steps.initialize_idc_manager_class.outputs.latest_idc_release_version }} + body: Update sql queries and/or index.py to v${{ steps.initialize_idc_manager_class.outputs.latest_idc_release_version }} + base: main + branch: update-sql-queries-and-or-index + add-paths: | + queries/*.sql + idc_index/index.py + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + + + \ No newline at end of file diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml deleted file mode 100644 index 06cc0c51..00000000 --- a/.github/workflows/python-package.yml +++ /dev/null @@ -1,39 +0,0 @@ -name: Publish IDC Index Python distribution to PyPI - -on: - workflow_dispatch: - -jobs: - python-build-n-publish: - name: Build and publish Python distribution - runs-on: ubuntu-latest - environment: - name: pypi - url: https://pypi.org/p/idc-index - permissions: - id-token: write - steps: - - uses: actions/checkout@main - - - name: Initialize Python 3.11 - uses: actions/setup-python@v4 - with: - python-version: 3.11 - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install flake8 - - - name: Lint with flake8 - run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - - - name: Build binary wheel and a source tarball - run: python setup.py sdist - - - name: Publish distribution to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/idc_index/index.py b/idc_index/index.py index bbf6e69a..22457747 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -8,13 +8,14 @@ import zipfile import duckdb +latest_index_url= 'https://github.com/vkt1414/idc-index/releases/download/latest/idc_index.csv.zip' class IDCClient: def __init__(self): current_dir = os.path.dirname(os.path.abspath(__file__)) file_path = os.path.join(current_dir, 'idc_index.csv.zip') if not os.path.exists(file_path): - self.index=pd.read_csv('https://github.com/ImagingDataCommons/idc-index/releases/download/latest/idc_index.csv.zip', dtype=str, encoding='utf-8') + self.index=pd.read_csv(latest_index_url, dtype=str, encoding='utf-8') else: self.index = pd.read_csv(file_path, dtype=str, encoding='utf-8') self.index = self.index.astype(str).replace('nan', '') diff --git a/setup.py b/setup.py index 49d13922..dacdcd29 100644 --- a/setup.py +++ b/setup.py @@ -71,7 +71,7 @@ def run(self): logging.error('Failed to download s5cmd:', e) setup( name='idc_index', - version=package_version, + version=os.environ['set_pypi_idc_index_version'], packages=find_packages(), include_package_data=True, install_requires=['pandas', 'requests', 'duckdb'], From 9370a49a8020e3d251ea4ffa9864943799991471 Mon Sep 17 00:00:00 2001 From: vkt1414 Date: Fri, 1 Dec 2023 11:38:13 -0500 Subject: [PATCH 2/3] use local csv whenever queries are modified --- .github/workflows/commit-pr-release-manager.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/commit-pr-release-manager.yml b/.github/workflows/commit-pr-release-manager.yml index 5efbf672..64a0e927 100644 --- a/.github/workflows/commit-pr-release-manager.yml +++ b/.github/workflows/commit-pr-release-manager.yml @@ -80,8 +80,8 @@ jobs: mv s5cmd /usr/local/bin/s5cmd shell: bash - - name: If queries are modified by a pull request, change latest_index_url to use locally available csv - if: steps.pr_proposed_changes.outputs.queries == 'true' && (github.event_name == 'pull_request' || github.ref != 'refs/heads/main') + - name: If queries are modified, change latest_index_url to use locally available csv + if: steps.pr_proposed_changes.outputs.queries == 'true' run: | import os from pathlib import Path From 2f43ff8ea0ff889f8650a1b9800cda6a05ceb686 Mon Sep 17 00:00:00 2001 From: vkt1414 Date: Fri, 1 Dec 2023 12:08:53 -0500 Subject: [PATCH 3/3] remove * in s5cmd urls --- idc_index/index.py | 19 ++++++++++--------- queries/idc_index.sql | 2 +- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/idc_index/index.py b/idc_index/index.py index 22457747..daed8113 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -191,7 +191,7 @@ def download_dicom_series(self, seriesInstanceUID, downloadDir, dry_run=False, q logging.debug('AWS Bucket Location: '+series_url) cmd = [self.s5cmdPath, '--no-sign-request', '--endpoint-url', 'https://s3.amazonaws.com', 'cp', '--show-progress', - series_url, downloadDir] + series_url+'*', downloadDir] if not dry_run: process = subprocess.run(cmd, capture_output=(not quiet), text=(not quiet)) @@ -253,13 +253,14 @@ def download_from_selection(self, downloadDir=None, dry_run=True, collection_id= logging.info("Dry run. Not downloading files. Rerun with dry_run=False to download the files.") return - # Download the files - # make temporary file to store the list of files to download - manifest_file = os.path.join(downloadDir, 'download_manifest.s5cmd') - for index, row in result_df.iterrows(): - with open(manifest_file, 'a') as f: - f.write("cp --show-progress "+row['series_aws_url'] + " "+downloadDir+"\n") - self.download_from_manifest(manifest_file, downloadDir) + else: + # Download the files + # make temporary file to store the list of files to download + manifest_file = os.path.join(downloadDir, 'download_manifest.s5cmd') + for index, row in result_df.iterrows(): + with open(manifest_file, 'a') as f: + f.write("cp --show-progress "+row['series_aws_url'] + " "+downloadDir+"\n") + self.download_from_manifest(manifest_file, downloadDir) """Download the files corresponding to the manifest file from IDC. The manifest file should be a text file with each line containing the s5cmd command to download the file. The URLs in the file must correspond to those in the AWS buckets! @@ -273,7 +274,7 @@ def download_from_selection(self, downloadDir=None, dry_run=True, collection_id= """ def download_from_manifest(self, manifest_file, downloadDir): cmd = [self.s5cmdPath, '--no-sign-request', '--endpoint-url', 'https://s3.amazonaws.com', 'run', - manifest_file, downloadDir] + manifest_file] process = subprocess.run(cmd, capture_output=True, text=True) logging.info(process.stderr) if process.returncode == 0: diff --git a/queries/idc_index.sql b/queries/idc_index.sql index 6718adbb..f6e17c75 100644 --- a/queries/idc_index.sql +++ b/queries/idc_index.sql @@ -22,7 +22,7 @@ SELECT COUNT(dicom_all.SOPInstanceUID) AS instanceCount, ANY_VALUE(license_short_name) as license_short_name, # download related attributes - ANY_VALUE(CONCAT("s3://", SPLIT(aws_url,"/")[SAFE_OFFSET(2)], "/", crdc_series_uuid, "/*")) AS series_aws_url, + ANY_VALUE(CONCAT("s3://", SPLIT(aws_url,"/")[SAFE_OFFSET(2)], "/", crdc_series_uuid, "/")) AS series_aws_url, ROUND(SUM(SAFE_CAST(instance_size AS float64))/1000000, 2) AS series_size_MB, FROM bigquery-public-data.idc_v16.dicom_all AS dicom_all