From b4c26f547ad1f78c601a68dd2a491bdcd3fa5a7e Mon Sep 17 00:00:00 2001 From: Raja Vyshnavi Sriramoju Date: Tue, 12 Nov 2024 13:45:09 -0800 Subject: [PATCH 01/23] WIP: Oct 3rd update for nersc recon with podman-hpc container --- orchestration/flows/bl832/nersc.py | 96 ++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 orchestration/flows/bl832/nersc.py diff --git a/orchestration/flows/bl832/nersc.py b/orchestration/flows/bl832/nersc.py new file mode 100644 index 0000000..6fdb6fb --- /dev/null +++ b/orchestration/flows/bl832/nersc.py @@ -0,0 +1,96 @@ +import datetime +import os +from pathlib import Path +import uuid + +from globus_sdk import TransferClient +from prefect import flow, task, get_run_logger +from prefect.blocks.system import JSON +from prefect.blocks.system import Secret + +from orchestration.flows.bl832.move import transfer_spot_to_data, transfer_data_to_nersc +from orchestration.nersc import NerscClient +import time + +@task(name="create_nersc_client") +def create_nersc_client(): + client_id_path = os.getenv("PATH_NERSC_CLIENT_ID") #.txt file + sfapi_key_path = os.getenv("PATH_NERSC_PRI_KEY") #.jwk file + + client = NerscClient(client_id_path, sfapi_key_path) + user = client.user() + + #error handling + + return client + +@task(name="submit_job_script") +def submit_job_script(client, user, logger): + home_path = f"/global/homes/{user.name[0]}/{user.name}" + scratch_path = f"/pscratch/sd/{user.name[0]}/{user.name}" + + client.perlmutter.run(f"mkdir -p {scratch_path}/prefect-recon-test") + job_script = f"""#!/bin/bash +#SBATCH -q debug +#SBATCH -A als +#SBATCH -C cpu +#SBATCH --job-name=tomorecon_nersc_mpi_hdf5_1-0 +#SBATCH --output={scratch_path}/nerscClient-test/%x_%j.out +#SBATCH --error={scratch_path}/nerscClient-test/%x_%j.err +#SBATCH -N 1 +#SBATCH --ntasks-per-node 1 +#SBATCH --cpus-per-task 1 +#SBATCH --time=00:15:00 +#SBATCH --exclusive + +date +srun podman-hpc run --volume {home_path}/tomo_recon_repo/microct/legacy/reconstruction.py:/alsuser/reconstruction.py --volume {home_path}/tomo_recon_repo/microct/legacy/input.txt:/alsuser/input.txt --volume {scratch_path}/microctdata:/alsdata localhost/tomorecon_nersc_mpi_hdf5:1.0 python reconstruction.py input.txt +date +""" + job = client.perlmutter.submit_job(job_script) + job.complete() #waits for job completion + logger.info("Job completed") + #logger.info(f"Job {job.id} completed") + + return + +@flow(name="launch-nersc-jobs-tomo_recon") +def launch_nersc_jobs_tomo_recon( +): + logger = get_run_logger() + + # # Data transfer step + # config = Config832() + # # test_scicat(config) + # logger.info(f"{str(uuid.uuid4())}{file_path}") + # # copy file to a uniquely-named file in the same folder + # file = Path(file_path) + # new_file = str(file.with_name(f"test_{str(uuid.uuid4())}.txt")) + # logger.info(new_file) + # success = start_transfer( + # config.tc, config.spot832, file_path, config.spot832, new_file, logger=logger + # ) + # logger.info(success) + # spot832_path = transfer_spot_to_data( + # new_file, config.tc, config.spot832, config.data832 + # ) + # logger.info(f"Transferred {spot832_path} to spot to data") + + # task = transfer_data_to_nersc(new_file, config.tc, config.data832, config.nersc832) + # logger.info( + # f"File successfully transferred from data832 to NERSC {spot832_path}. 
Task {task}" + # ) + + + # Creating a sfapi client object + client = create_nersc_client() + user = client.user() + + logger.info("Client created") + + #Job submission step + submit_job_script(client, user, logger) + + + + return From b128b2c417d903187f3978d217751e759fefa408 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Wed, 4 Dec 2024 10:11:46 -0800 Subject: [PATCH 02/23] able to get nersc reconstruction running in this configuration --- orchestration/flows/bl832/nersc.py | 101 ++++++++++++++++++++--------- 1 file changed, 71 insertions(+), 30 deletions(-) diff --git a/orchestration/flows/bl832/nersc.py b/orchestration/flows/bl832/nersc.py index 6fdb6fb..6cdd7ba 100644 --- a/orchestration/flows/bl832/nersc.py +++ b/orchestration/flows/bl832/nersc.py @@ -1,42 +1,75 @@ -import datetime +# import datetime +from dotenv import load_dotenv import os -from pathlib import Path -import uuid +# from pathlib import Path +# import uuid -from globus_sdk import TransferClient +# from globus_sdk import TransferClient from prefect import flow, task, get_run_logger -from prefect.blocks.system import JSON -from prefect.blocks.system import Secret +# from prefect.blocks.system import JSON +# from prefect.blocks.system import Secret -from orchestration.flows.bl832.move import transfer_spot_to_data, transfer_data_to_nersc +# from orchestration.flows.bl832.move import transfer_spot_to_data, transfer_data_to_nersc from orchestration.nersc import NerscClient -import time +# import time + @task(name="create_nersc_client") def create_nersc_client(): - client_id_path = os.getenv("PATH_NERSC_CLIENT_ID") #.txt file - sfapi_key_path = os.getenv("PATH_NERSC_PRI_KEY") #.jwk file + load_dotenv() + logger = get_run_logger() - client = NerscClient(client_id_path, sfapi_key_path) - user = client.user() + client_id_path = os.getenv("PATH_NERSC_CLIENT_ID") + sfapi_key_path = os.getenv("PATH_NERSC_PRI_KEY") + + # Log paths for debugging (ensure no sensitive info is printed) + logger.info(f"Client ID Path: {client_id_path}") + logger.info(f"SFAPI Key Path: {sfapi_key_path}") - #error handling + # Verify that the paths are not None + if not client_id_path or not sfapi_key_path: + logger.error("Environment variables for NERSC credentials are not set.") + raise ValueError("Missing NERSC credentials paths.") + # Check if files exist + if not os.path.isfile(client_id_path): + logger.error(f"Client ID file not found at {client_id_path}") + raise FileNotFoundError(f"Client ID file not found at {client_id_path}") + if not os.path.isfile(sfapi_key_path): + logger.error(f"SFAPI Key file not found at {sfapi_key_path}") + raise FileNotFoundError(f"SFAPI Key file not found at {sfapi_key_path}") + + client = NerscClient(client_id_path, sfapi_key_path) return client + @task(name="submit_job_script") def submit_job_script(client, user, logger): + logger = get_run_logger() + load_dotenv() + username = os.getenv("NERSC_USERNAME") + password = os.getenv("NERSC_PASSWORD") + home_path = f"/global/homes/{user.name[0]}/{user.name}" scratch_path = f"/pscratch/sd/{user.name[0]}/{user.name}" + logger.info(home_path) + logger.info(scratch_path) + + try: + logger.info(f"Creating directory: {scratch_path}/prefect-recon-test") + client.perlmutter.run(f"mkdir -p {scratch_path}/prefect-recon-test") + logger.info("Directory created successfully.") + except Exception as e: + logger.error(f"Failed to create directory: {e}") + raise e - client.perlmutter.run(f"mkdir -p {scratch_path}/prefect-recon-test") job_script = f"""#!/bin/bash #SBATCH -q debug #SBATCH 
-A als #SBATCH -C cpu -#SBATCH --job-name=tomorecon_nersc_mpi_hdf5_1-0 -#SBATCH --output={scratch_path}/nerscClient-test/%x_%j.out -#SBATCH --error={scratch_path}/nerscClient-test/%x_%j.err +#SBATCH --job-name=tomorecon_nersc_mpi_hdf5_1-0 +#SBATCH --output={scratch_path}/nerscClient-test/%x_%j.out +#SBATCH --error={scratch_path}/nerscClient-test/%x_%j.err #SBATCH -N 1 #SBATCH --ntasks-per-node 1 #SBATCH --cpus-per-task 1 @@ -44,19 +77,26 @@ def submit_job_script(client, user, logger): #SBATCH --exclusive date -srun podman-hpc run --volume {home_path}/tomo_recon_repo/microct/legacy/reconstruction.py:/alsuser/reconstruction.py --volume {home_path}/tomo_recon_repo/microct/legacy/input.txt:/alsuser/input.txt --volume {scratch_path}/microctdata:/alsdata localhost/tomorecon_nersc_mpi_hdf5:1.0 python reconstruction.py input.txt -date +srun podman-hpc login registry.nersc.gov --username {username} --password {password} +srun podman-hpc run --volume {home_path}/tomo_recon_repo/microct/legacy/reconstruction.py:/alsuser/reconstruction.py --volume {home_path}/tomo_recon_repo/microct/legacy/input.txt:/alsuser/input.txt --volume {scratch_path}/microctdata:/alsdata --volume {scratch_path}/microctdata:/alsuser/ registry.nersc.gov/als/tomorecon_nersc_mpi_hdf5@sha256:cc098a2cfb6b1632ea872a202c66cb7566908da066fd8f8c123b92fa95c2a43c python reconstruction.py input.txt +date """ - job = client.perlmutter.submit_job(job_script) - job.complete() #waits for job completion - logger.info("Job completed") - #logger.info(f"Job {job.id} completed") +# srun podman-hpc run --volume {home_path}/tomo_recon_repo/microct/legacy/reconstruction.py:/alsuser/reconstruction.py --volume {home_path}/tomo_recon_repo/microct/legacy/input.txt:/alsuser/input.txt --volume {scratch_path}/microctdata:/alsdata localhost/tomorecon_nersc_mpi_hdf5:1.0 python reconstruction.py input.txt + + try: + logger.info("Submitting job script to Perlmutter.") + job = client.perlmutter.submit_job(job_script) + job.complete() # waits for job completion + logger.info("Job completed successfully.") + except Exception as e: + logger.error(f"Failed to submit or complete job: {e}") + raise e return -@flow(name="launch-nersc-jobs-tomo_recon") -def launch_nersc_jobs_tomo_recon( -): + +@flow(name="nersc_recon_flow") +def nersc_recon_flow(): logger = get_run_logger() # # Data transfer step @@ -80,7 +120,6 @@ def launch_nersc_jobs_tomo_recon( # logger.info( # f"File successfully transferred from data832 to NERSC {spot832_path}. 
Task {task}" # ) - # Creating a sfapi client object client = create_nersc_client() @@ -88,9 +127,11 @@ def launch_nersc_jobs_tomo_recon( logger.info("Client created") - #Job submission step + # Job submission step submit_job_script(client, user, logger) - - return + + +if __name__ == "__main__": + nersc_recon_flow() From 04637390ff90e89b91869c12599cebbfb264fda1 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Wed, 4 Dec 2024 15:30:52 -0800 Subject: [PATCH 03/23] Added an sfapi slurm call to run tiff_to_zarr on nersc --- orchestration/flows/bl832/nersc.py | 251 +++++++++++++++++++++++------ 1 file changed, 205 insertions(+), 46 deletions(-) diff --git a/orchestration/flows/bl832/nersc.py b/orchestration/flows/bl832/nersc.py index 6cdd7ba..89fdd4a 100644 --- a/orchestration/flows/bl832/nersc.py +++ b/orchestration/flows/bl832/nersc.py @@ -1,17 +1,11 @@ -# import datetime from dotenv import load_dotenv import os -# from pathlib import Path -# import uuid -# from globus_sdk import TransferClient from prefect import flow, task, get_run_logger -# from prefect.blocks.system import JSON -# from prefect.blocks.system import Secret -# from orchestration.flows.bl832.move import transfer_spot_to_data, transfer_data_to_nersc +from orchestration.flows.bl832.alcf import transfer_data_to_data832 +from orchestration.flows.bl832.config import Config832 from orchestration.nersc import NerscClient -# import time @task(name="create_nersc_client") @@ -19,6 +13,10 @@ def create_nersc_client(): load_dotenv() logger = get_run_logger() + # Get paths to NERSC client ID and SFAPI key + # Note: These paths are set in the .env file + # We should consider moving these to the Prefect Secrets Manager + # We should also consider how to handle the short SFAPI key expiration time (2 days) client_id_path = os.getenv("PATH_NERSC_CLIENT_ID") sfapi_key_path = os.getenv("PATH_NERSC_PRI_KEY") @@ -39,17 +37,34 @@ def create_nersc_client(): logger.error(f"SFAPI Key file not found at {sfapi_key_path}") raise FileNotFoundError(f"SFAPI Key file not found at {sfapi_key_path}") - client = NerscClient(client_id_path, sfapi_key_path) + try: + client = NerscClient(client_id_path, sfapi_key_path) + except Exception as e: + logger.error(f"Failed to create NERSC client: {e}") + raise e + return client -@task(name="submit_job_script") -def submit_job_script(client, user, logger): +@task(name="submit_recon_job_script") +def submit_recon_job_script( + client: NerscClient, + # file_path: str = None, +) -> bool: logger = get_run_logger() + if client is None: + logger.error("NERSC client is required for job submission.") + raise ValueError("NERSC client is required for job submission.") + # if file_path is None: + # logger.error("File path is required for job submission.") + # raise ValueError("File path is required for job submission.") + load_dotenv() username = os.getenv("NERSC_USERNAME") password = os.getenv("NERSC_PASSWORD") + user = client.user() + home_path = f"/global/homes/{user.name[0]}/{user.name}" scratch_path = f"/pscratch/sd/{user.name[0]}/{user.name}" logger.info(home_path) @@ -63,6 +78,12 @@ def submit_job_script(client, user, logger): logger.error(f"Failed to create directory: {e}") raise e + # Need to update this script: + # take in the file path + # ignore input.txt + # run the reconstruction script with the file path + # run tiff to zarr after reconstruction + job_script = f"""#!/bin/bash #SBATCH -q debug #SBATCH -A als @@ -72,63 +93,201 @@ def submit_job_script(client, user, logger): #SBATCH 
--error={scratch_path}/nerscClient-test/%x_%j.err #SBATCH -N 1 #SBATCH --ntasks-per-node 1 -#SBATCH --cpus-per-task 1 +#SBATCH --cpus-per-task 64 #SBATCH --time=00:15:00 #SBATCH --exclusive date srun podman-hpc login registry.nersc.gov --username {username} --password {password} -srun podman-hpc run --volume {home_path}/tomo_recon_repo/microct/legacy/reconstruction.py:/alsuser/reconstruction.py --volume {home_path}/tomo_recon_repo/microct/legacy/input.txt:/alsuser/input.txt --volume {scratch_path}/microctdata:/alsdata --volume {scratch_path}/microctdata:/alsuser/ registry.nersc.gov/als/tomorecon_nersc_mpi_hdf5@sha256:cc098a2cfb6b1632ea872a202c66cb7566908da066fd8f8c123b92fa95c2a43c python reconstruction.py input.txt +srun podman-hpc run +--volume {home_path}/tomo_recon_repo/microct/legacy/reconstruction.py:/alsuser/reconstruction.py +--volume {home_path}/tomo_recon_repo/microct/legacy/input.txt:/alsuser/input.txt +--volume {scratch_path}/microctdata:/alsdata +--volume {scratch_path}/microctdata:/alsuser/ registry.nersc.gov/als/tomorecon_nersc_mpi_hdf5@sha256:cc098a2cfb6b1632ea872a202c66cb7566908da066fd8f8c123b92fa95c2a43c python reconstruction.py input.txt date """ # srun podman-hpc run --volume {home_path}/tomo_recon_repo/microct/legacy/reconstruction.py:/alsuser/reconstruction.py --volume {home_path}/tomo_recon_repo/microct/legacy/input.txt:/alsuser/input.txt --volume {scratch_path}/microctdata:/alsdata localhost/tomorecon_nersc_mpi_hdf5:1.0 python reconstruction.py input.txt try: - logger.info("Submitting job script to Perlmutter.") + logger.info("Submitting reconstruction job script to Perlmutter.") job = client.perlmutter.submit_job(job_script) job.complete() # waits for job completion - logger.info("Job completed successfully.") + logger.info("Reconstruction job completed successfully.") + return True except Exception as e: - logger.error(f"Failed to submit or complete job: {e}") + logger.error(f"Failed to submit or complete reconstruction job: {e}") + return False + + +@task(name="submit_tiff_to_zarr_job_script") +def submit_tiff_to_zarr_job_script( + client: NerscClient, + script_path: str = "tiff_to_zarr.py", + recon_path: str = None, + raw_path: str = None +) -> bool: + logger = get_run_logger() + if client is None: + logger.error("NERSC client is required for job submission.") + raise ValueError("NERSC client is required for job submission.") + + load_dotenv() + username = os.getenv("NERSC_USERNAME") + password = os.getenv("NERSC_PASSWORD") + + user = client.user() + + home_path = f"/global/homes/{user.name[0]}/{user.name}" + scratch_path = f"/pscratch/sd/{user.name[0]}/{user.name}" + logger.info(home_path) + logger.info(scratch_path) + + try: + logger.info(f"Creating directory: {scratch_path}/prefect-recon-test") + client.perlmutter.run(f"mkdir -p {scratch_path}/prefect-recon-test") + logger.info("Directory created successfully.") + except Exception as e: + logger.error(f"Failed to create directory: {e}") raise e - return + # Need to update this script: + # take in the file path + # ignore input.txt + # run the reconstruction script with the file path + # run tiff to zarr after reconstruction + + job_script = f"""#!/bin/bash +#SBATCH -q debug +#SBATCH -A als +#SBATCH -C cpu +#SBATCH --job-name=tomorecon_nersc_mpi_hdf5_1-0 +#SBATCH --output={scratch_path}/nerscClient-test/%x_%j.out +#SBATCH --error={scratch_path}/nerscClient-test/%x_%j.err +#SBATCH -N 1 +#SBATCH --ntasks-per-node 1 +#SBATCH --cpus-per-task 64 +#SBATCH --time=00:15:00 +#SBATCH --exclusive + +date +srun podman-hpc 
login registry.nersc.gov --username {username} --password {password} +srun podman-hpc run --volume {home_path}/tomo_recon_repo/microct/legacy/tiff_to_zarr.py:/alsuser/tiff_to_zarr.py \ +--volume {home_path}/tomo_recon_repo/microct/legacy/input.txt:/alsuser/input.txt \ +--volume {scratch_path}/microctdata:/alsdata \ +--volume {scratch_path}/microctdata:/alsuser/ \ +registry.nersc.gov/als/tomorecon_nersc_mpi_hdf5@sha256:cc098a2cfb6b1632ea872a202c66cb7566908da066fd8f8c123b92fa95c2a43c \ +bash -c "python -m pip show ngff_zarr || python -m pip install ngff_zarr && python -m pip show dask_image || python -m pip install dask_image && python {script_path} {recon_path} --raw_file {raw_path}" +date +""" + try: + logger.info("Submitting Tiff to Zarr job script to Perlmutter.") + job = client.perlmutter.submit_job(job_script) + logger.info(f"jobid={job.job_id}") + job.complete() # waits for job completion + logger.info("Tiff to Zarr job completed successfully.") + return True + except Exception as e: + logger.error(f"Failed to submit or complete Tiff to Zarr job: {e}") + return False @flow(name="nersc_recon_flow") -def nersc_recon_flow(): +def nersc_recon_flow( + file_path: str = "", + is_export_control: bool = False, + config=None, +): logger = get_run_logger() + logger.info("Starting NERSC flow for new file processing and transfer.") + if not config: + config = Config832() - # # Data transfer step - # config = Config832() - # # test_scicat(config) - # logger.info(f"{str(uuid.uuid4())}{file_path}") - # # copy file to a uniquely-named file in the same folder - # file = Path(file_path) - # new_file = str(file.with_name(f"test_{str(uuid.uuid4())}.txt")) - # logger.info(new_file) - # success = start_transfer( - # config.tc, config.spot832, file_path, config.spot832, new_file, logger=logger - # ) - # logger.info(success) - # spot832_path = transfer_spot_to_data( - # new_file, config.tc, config.spot832, config.data832 - # ) - # logger.info(f"Transferred {spot832_path} to spot to data") - - # task = transfer_data_to_nersc(new_file, config.tc, config.data832, config.nersc832) - # logger.info( - # f"File successfully transferred from data832 to NERSC {spot832_path}. 
Task {task}" - # ) - - # Creating a sfapi client object - client = create_nersc_client() - user = client.user() + if not is_export_control: + logger.info("File is not export controlled, will run reconstruction at NERSC.") + + # Step 1: Check if file exists at NERSC + # Data should already have been transferred to NERSC in new_832_file_flow (move.py) + # transfer_client = config.tc + + # nersc_raw_path = file_path.split("/global")[1] + # directory_path = os.path.dirname(os.path.join(config.nersc832.root_path, nersc_raw_path)) + # file_name = os.path.basename(nersc_raw_path) + + # # List the directory contents and check if the file exists + # file_exists = any( + # item["name"] == file_name and item["type"] == "file" + # for item in transfer_client.operation_ls( + # endpoint_id=config.nersc832.uuid, + # path=directory_path + # )["DATA"] + # ) + + file_exists = True + + # If file exists, run submit_job_script + if file_exists: + # logger.info(f"File {file_name} found at NERSC.") + # Creating a sfapi client object + client = create_nersc_client() + logger.info("NERSC SFAPI Client created") + + # Step 2A: If raw h5 file exists on NERSC, run submit_job_script + # Update submit_job_script to take in the file path + # Job submission step + nersc_reconstruction_success = submit_recon_job_script( + client=client + ) + # file_path=nersc_raw_path) + if not nersc_reconstruction_success: + logger.error("Reconstruction Failed.") + raise ValueError("Reconstruction at NERSC Failed") + + else: + logger.info("Reconstruction Successful.") + # Step 2B: Run tiff_to_zarr after reconstruction + # user = client.user() + nersc_tiff_scratch_path = "rec20230606_151124_jong-seto_fungal-mycelia_roll-AQ_fungi1_fast" + nersc_raw_path = "20230606_151124_jong-seto_fungal-mycelia_roll-AQ_fungi1_fast.h5" + nersc_tiff_to_zarr_success = submit_tiff_to_zarr_job_script( + client=client, + recon_path=nersc_tiff_scratch_path, + raw_path=nersc_raw_path) + if not nersc_tiff_to_zarr_success: + logger.error("Tiff to Zarr Failed.") + raise ValueError("Tiff to Zarr at ALCF Failed") + else: + logger.info("Tiff to Zarr Successful.") + + else: + logger.error(f"File {file_name} not found at NERSC.") + return + + # if nersc_reconstruction_success: + # # Step 3A: Send reconstructed data (tiff) to data832 + # # transfer reconstructions to data832 + # nersc_tiff_scratch_path = "" + # transfer_data_to_data832( + # file_path=nersc_tiff_scratch_path, + # transfer_client=config.tc, + # source_endpoint=config.nersc832, + # data832=config.data832 + # ) + + # if nersc_tiff_to_zarr_success: + # # Step 3B: Send zarr data to data832 + # nersc_zarr_scratch_path = "" + # transfer_data_to_data832( + # file_path=nersc_zarr_scratch_path, + # transfer_client=config.tc, + # source_endpoint=config.nersc832, + # data832=config.data832 + # ) - logger.info("Client created") + # Step 4 Schedule file deletion - # Job submission step - submit_job_script(client, user, logger) + else: + logger.info("File is export controlled, not running reconstruction at NERSC.") + return return From e1e7addbf12c7c21ba4df5cca56a06cd96fec270 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Fri, 6 Dec 2024 12:19:32 -0800 Subject: [PATCH 04/23] Migrating nersc tomography flow code to an abstract class structure (orchestration/flows/bl832/tomography_hpc.py). 
Linting orchestration/nersc.py --- orchestration/flows/bl832/nersc.py | 296 ----------------- orchestration/flows/bl832/tomography_hpc.py | 343 ++++++++++++++++++++ orchestration/nersc.py | 21 +- 3 files changed, 353 insertions(+), 307 deletions(-) delete mode 100644 orchestration/flows/bl832/nersc.py create mode 100644 orchestration/flows/bl832/tomography_hpc.py diff --git a/orchestration/flows/bl832/nersc.py b/orchestration/flows/bl832/nersc.py deleted file mode 100644 index 89fdd4a..0000000 --- a/orchestration/flows/bl832/nersc.py +++ /dev/null @@ -1,296 +0,0 @@ -from dotenv import load_dotenv -import os - -from prefect import flow, task, get_run_logger - -from orchestration.flows.bl832.alcf import transfer_data_to_data832 -from orchestration.flows.bl832.config import Config832 -from orchestration.nersc import NerscClient - - -@task(name="create_nersc_client") -def create_nersc_client(): - load_dotenv() - logger = get_run_logger() - - # Get paths to NERSC client ID and SFAPI key - # Note: These paths are set in the .env file - # We should consider moving these to the Prefect Secrets Manager - # We should also consider how to handle the short SFAPI key expiration time (2 days) - client_id_path = os.getenv("PATH_NERSC_CLIENT_ID") - sfapi_key_path = os.getenv("PATH_NERSC_PRI_KEY") - - # Log paths for debugging (ensure no sensitive info is printed) - logger.info(f"Client ID Path: {client_id_path}") - logger.info(f"SFAPI Key Path: {sfapi_key_path}") - - # Verify that the paths are not None - if not client_id_path or not sfapi_key_path: - logger.error("Environment variables for NERSC credentials are not set.") - raise ValueError("Missing NERSC credentials paths.") - - # Check if files exist - if not os.path.isfile(client_id_path): - logger.error(f"Client ID file not found at {client_id_path}") - raise FileNotFoundError(f"Client ID file not found at {client_id_path}") - if not os.path.isfile(sfapi_key_path): - logger.error(f"SFAPI Key file not found at {sfapi_key_path}") - raise FileNotFoundError(f"SFAPI Key file not found at {sfapi_key_path}") - - try: - client = NerscClient(client_id_path, sfapi_key_path) - except Exception as e: - logger.error(f"Failed to create NERSC client: {e}") - raise e - - return client - - -@task(name="submit_recon_job_script") -def submit_recon_job_script( - client: NerscClient, - # file_path: str = None, -) -> bool: - logger = get_run_logger() - if client is None: - logger.error("NERSC client is required for job submission.") - raise ValueError("NERSC client is required for job submission.") - # if file_path is None: - # logger.error("File path is required for job submission.") - # raise ValueError("File path is required for job submission.") - - load_dotenv() - username = os.getenv("NERSC_USERNAME") - password = os.getenv("NERSC_PASSWORD") - - user = client.user() - - home_path = f"/global/homes/{user.name[0]}/{user.name}" - scratch_path = f"/pscratch/sd/{user.name[0]}/{user.name}" - logger.info(home_path) - logger.info(scratch_path) - - try: - logger.info(f"Creating directory: {scratch_path}/prefect-recon-test") - client.perlmutter.run(f"mkdir -p {scratch_path}/prefect-recon-test") - logger.info("Directory created successfully.") - except Exception as e: - logger.error(f"Failed to create directory: {e}") - raise e - - # Need to update this script: - # take in the file path - # ignore input.txt - # run the reconstruction script with the file path - # run tiff to zarr after reconstruction - - job_script = f"""#!/bin/bash -#SBATCH -q debug -#SBATCH -A als -#SBATCH 
-C cpu -#SBATCH --job-name=tomorecon_nersc_mpi_hdf5_1-0 -#SBATCH --output={scratch_path}/nerscClient-test/%x_%j.out -#SBATCH --error={scratch_path}/nerscClient-test/%x_%j.err -#SBATCH -N 1 -#SBATCH --ntasks-per-node 1 -#SBATCH --cpus-per-task 64 -#SBATCH --time=00:15:00 -#SBATCH --exclusive - -date -srun podman-hpc login registry.nersc.gov --username {username} --password {password} -srun podman-hpc run ---volume {home_path}/tomo_recon_repo/microct/legacy/reconstruction.py:/alsuser/reconstruction.py ---volume {home_path}/tomo_recon_repo/microct/legacy/input.txt:/alsuser/input.txt ---volume {scratch_path}/microctdata:/alsdata ---volume {scratch_path}/microctdata:/alsuser/ registry.nersc.gov/als/tomorecon_nersc_mpi_hdf5@sha256:cc098a2cfb6b1632ea872a202c66cb7566908da066fd8f8c123b92fa95c2a43c python reconstruction.py input.txt -date -""" -# srun podman-hpc run --volume {home_path}/tomo_recon_repo/microct/legacy/reconstruction.py:/alsuser/reconstruction.py --volume {home_path}/tomo_recon_repo/microct/legacy/input.txt:/alsuser/input.txt --volume {scratch_path}/microctdata:/alsdata localhost/tomorecon_nersc_mpi_hdf5:1.0 python reconstruction.py input.txt - - try: - logger.info("Submitting reconstruction job script to Perlmutter.") - job = client.perlmutter.submit_job(job_script) - job.complete() # waits for job completion - logger.info("Reconstruction job completed successfully.") - return True - except Exception as e: - logger.error(f"Failed to submit or complete reconstruction job: {e}") - return False - - -@task(name="submit_tiff_to_zarr_job_script") -def submit_tiff_to_zarr_job_script( - client: NerscClient, - script_path: str = "tiff_to_zarr.py", - recon_path: str = None, - raw_path: str = None -) -> bool: - logger = get_run_logger() - if client is None: - logger.error("NERSC client is required for job submission.") - raise ValueError("NERSC client is required for job submission.") - - load_dotenv() - username = os.getenv("NERSC_USERNAME") - password = os.getenv("NERSC_PASSWORD") - - user = client.user() - - home_path = f"/global/homes/{user.name[0]}/{user.name}" - scratch_path = f"/pscratch/sd/{user.name[0]}/{user.name}" - logger.info(home_path) - logger.info(scratch_path) - - try: - logger.info(f"Creating directory: {scratch_path}/prefect-recon-test") - client.perlmutter.run(f"mkdir -p {scratch_path}/prefect-recon-test") - logger.info("Directory created successfully.") - except Exception as e: - logger.error(f"Failed to create directory: {e}") - raise e - - # Need to update this script: - # take in the file path - # ignore input.txt - # run the reconstruction script with the file path - # run tiff to zarr after reconstruction - - job_script = f"""#!/bin/bash -#SBATCH -q debug -#SBATCH -A als -#SBATCH -C cpu -#SBATCH --job-name=tomorecon_nersc_mpi_hdf5_1-0 -#SBATCH --output={scratch_path}/nerscClient-test/%x_%j.out -#SBATCH --error={scratch_path}/nerscClient-test/%x_%j.err -#SBATCH -N 1 -#SBATCH --ntasks-per-node 1 -#SBATCH --cpus-per-task 64 -#SBATCH --time=00:15:00 -#SBATCH --exclusive - -date -srun podman-hpc login registry.nersc.gov --username {username} --password {password} -srun podman-hpc run --volume {home_path}/tomo_recon_repo/microct/legacy/tiff_to_zarr.py:/alsuser/tiff_to_zarr.py \ ---volume {home_path}/tomo_recon_repo/microct/legacy/input.txt:/alsuser/input.txt \ ---volume {scratch_path}/microctdata:/alsdata \ ---volume {scratch_path}/microctdata:/alsuser/ \ -registry.nersc.gov/als/tomorecon_nersc_mpi_hdf5@sha256:cc098a2cfb6b1632ea872a202c66cb7566908da066fd8f8c123b92fa95c2a43c 
\ -bash -c "python -m pip show ngff_zarr || python -m pip install ngff_zarr && python -m pip show dask_image || python -m pip install dask_image && python {script_path} {recon_path} --raw_file {raw_path}" -date -""" - try: - logger.info("Submitting Tiff to Zarr job script to Perlmutter.") - job = client.perlmutter.submit_job(job_script) - logger.info(f"jobid={job.job_id}") - job.complete() # waits for job completion - logger.info("Tiff to Zarr job completed successfully.") - return True - except Exception as e: - logger.error(f"Failed to submit or complete Tiff to Zarr job: {e}") - return False - - -@flow(name="nersc_recon_flow") -def nersc_recon_flow( - file_path: str = "", - is_export_control: bool = False, - config=None, -): - logger = get_run_logger() - logger.info("Starting NERSC flow for new file processing and transfer.") - if not config: - config = Config832() - - if not is_export_control: - logger.info("File is not export controlled, will run reconstruction at NERSC.") - - # Step 1: Check if file exists at NERSC - # Data should already have been transferred to NERSC in new_832_file_flow (move.py) - # transfer_client = config.tc - - # nersc_raw_path = file_path.split("/global")[1] - # directory_path = os.path.dirname(os.path.join(config.nersc832.root_path, nersc_raw_path)) - # file_name = os.path.basename(nersc_raw_path) - - # # List the directory contents and check if the file exists - # file_exists = any( - # item["name"] == file_name and item["type"] == "file" - # for item in transfer_client.operation_ls( - # endpoint_id=config.nersc832.uuid, - # path=directory_path - # )["DATA"] - # ) - - file_exists = True - - # If file exists, run submit_job_script - if file_exists: - # logger.info(f"File {file_name} found at NERSC.") - # Creating a sfapi client object - client = create_nersc_client() - logger.info("NERSC SFAPI Client created") - - # Step 2A: If raw h5 file exists on NERSC, run submit_job_script - # Update submit_job_script to take in the file path - # Job submission step - nersc_reconstruction_success = submit_recon_job_script( - client=client - ) - # file_path=nersc_raw_path) - if not nersc_reconstruction_success: - logger.error("Reconstruction Failed.") - raise ValueError("Reconstruction at NERSC Failed") - - else: - logger.info("Reconstruction Successful.") - # Step 2B: Run tiff_to_zarr after reconstruction - # user = client.user() - nersc_tiff_scratch_path = "rec20230606_151124_jong-seto_fungal-mycelia_roll-AQ_fungi1_fast" - nersc_raw_path = "20230606_151124_jong-seto_fungal-mycelia_roll-AQ_fungi1_fast.h5" - nersc_tiff_to_zarr_success = submit_tiff_to_zarr_job_script( - client=client, - recon_path=nersc_tiff_scratch_path, - raw_path=nersc_raw_path) - if not nersc_tiff_to_zarr_success: - logger.error("Tiff to Zarr Failed.") - raise ValueError("Tiff to Zarr at ALCF Failed") - else: - logger.info("Tiff to Zarr Successful.") - - else: - logger.error(f"File {file_name} not found at NERSC.") - return - - # if nersc_reconstruction_success: - # # Step 3A: Send reconstructed data (tiff) to data832 - # # transfer reconstructions to data832 - # nersc_tiff_scratch_path = "" - # transfer_data_to_data832( - # file_path=nersc_tiff_scratch_path, - # transfer_client=config.tc, - # source_endpoint=config.nersc832, - # data832=config.data832 - # ) - - # if nersc_tiff_to_zarr_success: - # # Step 3B: Send zarr data to data832 - # nersc_zarr_scratch_path = "" - # transfer_data_to_data832( - # file_path=nersc_zarr_scratch_path, - # transfer_client=config.tc, - # 
source_endpoint=config.nersc832, - # data832=config.data832 - # ) - - # Step 4 Schedule file deletion - - else: - logger.info("File is export controlled, not running reconstruction at NERSC.") - return - - return - - -if __name__ == "__main__": - nersc_recon_flow() diff --git a/orchestration/flows/bl832/tomography_hpc.py b/orchestration/flows/bl832/tomography_hpc.py new file mode 100644 index 0000000..af7bf4e --- /dev/null +++ b/orchestration/flows/bl832/tomography_hpc.py @@ -0,0 +1,343 @@ +from abc import ABC, abstractmethod +from dotenv import load_dotenv +import logging +import os +from pathlib import Path +from typing import Optional, Dict, Any + +from orchestration.flows.bl832.config import Config832 +from orchestration.nersc import NerscClient + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +class TomographyHPCController(ABC): + """ + Abstract class for tomography HPC controllers. + Provides interface methods for reconstruction and building multi-resolution datasets. + + Args: + ABC: Abstract Base Class + """ + def __init__(self): + pass + + @abstractmethod + def reconstruct( + self, + file_path: str = "", + is_export_control: bool = False, + config: Optional[Dict[str, Any]] = None + ) -> bool: + """Perform tomography reconstruction + + :param file_path: Path to the file to reconstruct. + :param is_export_control: Flag indicating export control restrictions. + :param config: Optional configuration dictionary. + :return: True if successful, False otherwise. + """ + pass + + @abstractmethod + def build_multi_resolution( + self, + file_path: str = "", + is_export_control: bool = False, + config: Optional[Dict[str, Any]] = None + ) -> bool: + """Generate multi-resolution version of reconstructed tomography + + :param file_path: Path to the file for which to build multi-resolution data. + :param is_export_control: Flag indicating export control restrictions. + :param config: Optional configuration dictionary. + :return: True if successful, False otherwise. + """ + pass + + +class ALCFTomographyHPCController(TomographyHPCController): + """ + Implementation of TomographyHPCController for ALCF. + Methods here leverage Globus Compute for processing tasks. + + Args: + TomographyHPCController (_type_): _description_ + """ + + def __init__(self): + pass + + def reconstruct( + self, + file_path: str = "", + is_export_control: bool = False, + config: Optional[Dict[str, Any]] = None + ) -> bool: + + # uses Globus Compute to reconstruct the tomography + pass + + def build_multi_resolution( + self, + file_path: str = "", + is_export_control: bool = False, + config: Optional[Dict[str, Any]] = None + ) -> bool: + # uses Globus Compute to build multi-resolution tomography + pass + + +class NERSCTomographyHPCController(TomographyHPCController): + """ + Implementation for a NERSC-based tomography HPC controller. + + Submits reconstruction and multi-resolution jobs to NERSC via SFAPI. 
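+
+    A minimal usage sketch (hedged; the file path below is illustrative only,
+    not a real beamline path):
+
+        controller = NERSCTomographyHPCController()
+        controller.reconstruct(file_path="/path/to/scan.h5", is_export_control=False)
+        controller.build_multi_resolution(file_path="/path/to/scan.h5", is_export_control=False)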
+ """ + + def __init__(self): + self.client = self._create_nersc_client() + + def reconstruct( + self, + file_path: str = "", + is_export_control: bool = False, + config: Optional[Dict[str, Any]] = None + ) -> bool: + """ + Use NERSC for tomography reconstruction + """ + logger.info("Starting NERSC reconstruction process.") + + if is_export_control: + logger.warning("File is export controlled; skipping NERSC reconstruction.") + return False + else: + logger.info("File is not export controlled; proceeding with NERSC reconstruction.") + + if not config: + config = Config832() + + nersc_reconstruction_success = self._submit_nersc_reconstruction_job( + file_path=file_path + ) + + logger.info(f"Was NERSC reconstruction successful: {nersc_reconstruction_success}") + + return nersc_reconstruction_success + + def build_multi_resolution( + self, + file_path: str = "", + is_export_control: bool = False, + config: Optional[Dict[str, Any]] = None + ) -> bool: + """Use NERSC to make multiresolution version of tomography results.""" + + if is_export_control: + logger.warning("File is export controlled; skipping NERSC multi-resolution task.") + return False + else: + logger.info("File is not export controlled; proceeding with NERSC multi-resolution task.") + + if not config: + config = Config832() + + nersc_multi_resolution_success = self._submit_nersc_multi_resolution_job( + file_path=file_path + ) + + logger.info(f"Was NERSC multi-resolution conversion successful: {nersc_multi_resolution_success}") + + return nersc_multi_resolution_success + + def _create_nersc_client(self) -> NerscClient: + """Create and return an NERSC client instance""" + load_dotenv() + client_id_path = os.getenv("PATH_NERSC_CLIENT_ID") + sfapi_key_path = os.getenv("PATH_NERSC_PRI_KEY") + + if not client_id_path or not sfapi_key_path: + logger.error("NERSC credentials paths are missing.") + raise ValueError("Missing NERSC credentials paths.") + if not os.path.isfile(client_id_path) or not os.path.isfile(sfapi_key_path): + logger.error("NERSC credential files are missing.") + raise FileNotFoundError("NERSC credential files are missing.") + + try: + return NerscClient(client_id_path, sfapi_key_path) + except Exception as e: + logger.error(f"Failed to create NERSC client: {e}") + raise e + + def _submit_nersc_reconstruction_job( + self, + file_path: str = None + ) -> bool: + """Submit a tomography reconstruction job to NERSC""" + + if self.client is None: + logger.error("NERSC client is required for job submission.") + raise ValueError("NERSC client is required for job submission.") + + load_dotenv() + + # Can't use this long term in production. Need to find a better way to handle credentials. 
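+        # A hedged alternative (sketch only, not what this patch implements): keep the
+        # registry credentials in Prefect Secret blocks and load them at run time, e.g.
+        #     from prefect.blocks.system import Secret
+        #     username = Secret.load("nersc-username").get()  # block names are assumptions
+        #     password = Secret.load("nersc-password").get()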
+ # Want to run this as the alsdev user + username = os.getenv("NERSC_USERNAME") + password = os.getenv("NERSC_PASSWORD") + + user = self.client.user() + + home_path = f"/global/homes/{user.name[0]}/{user.name}" + scratch_path = f"/pscratch/sd/{user.name[0]}/{user.name}" + logger.info(home_path) + logger.info(scratch_path) + + image_name = "tomorecon_nersc_mpi_hdf5@sha256:cc098a2cfb6b1632ea872a202c66cb7566908da066fd8f8c123b92fa95c2a43c" + path = Path(file_path) + folder_name = path.parent.name + file_name = path.stem + + job_script = f"""#!/bin/bash + #SBATCH -q preempt + #SBATCH -A als + #SBATCH -C cpu + #SBATCH --job-name=tomorecon_nersc_mpi_hdf5_1-0 + #SBATCH --output={scratch_path}/nerscClient-test/%x_%j.out + #SBATCH --error={scratch_path}/nerscClient-test/%x_%j.err + #SBATCH -N 1 + #SBATCH --ntasks-per-node 1 + #SBATCH --cpus-per-task 64 + #SBATCH --time=00:15:00 + #SBATCH --exclusive + + date + + srun podman-hpc login registry.nersc.gov --username {username} --password {password} + srun podman-hpc run + --volume {home_path}/tomo_recon_repo/microct/legacy/sfapi_reconstruction.py:/alsuser/sfapi_reconstruction.py + --volume {home_path}/tomo_recon_repo/microct/legacy/input.txt:/alsuser/input.txt + --volume {scratch_path}/microctdata:/alsdata + --volume {scratch_path}/microctdata:/alsuser/ \ + registry.nersc.gov/als/{image_name} \ + python sfapi_reconstruction.py {file_name} {folder_name} + + date + """ + + try: + logger.info("Submitting reconstruction job script to Perlmutter.") + job = self.client.perlmutter.submit_job(job_script) + job.complete() # waits for job completion + logger.info("Reconstruction job completed successfully.") + return True + except Exception as e: + logger.error(f"Failed to submit or complete reconstruction job: {e}") + return False + + def _submit_nersc_multi_resolution_job( + self, + file_path: str = None, + ) -> bool: + """Submit a multi-resolution tomography job to NERSC""" + + if self.client is None: + logger.error("NERSC client is required for job submission.") + raise ValueError("NERSC client is required for job submission.") + + load_dotenv() + username = os.getenv("NERSC_USERNAME") + password = os.getenv("NERSC_PASSWORD") + + user = self.client.user() + + home_path = f"/global/homes/{user.name[0]}/{user.name}" + scratch_path = f"/pscratch/sd/{user.name[0]}/{user.name}" + logger.info(home_path) + logger.info(scratch_path) + + image_name = "tomorecon_nersc_mpi_hdf5@sha256:cc098a2cfb6b1632ea872a202c66cb7566908da066fd8f8c123b92fa95c2a43c" + recon_path = file_path + raw_path = file_path + + # Need to update this script: + # rebuild image with dependencies + + job_script = f"""#!/bin/bash + #SBATCH -q preempt + #SBATCH -A als + #SBATCH -C cpu + #SBATCH --job-name=tomorecon_nersc_mpi_hdf5_1-0 + #SBATCH --output={scratch_path}/nerscClient-test/%x_%j.out + #SBATCH --error={scratch_path}/nerscClient-test/%x_%j.err + #SBATCH -N 1 + #SBATCH --ntasks-per-node 1 + #SBATCH --cpus-per-task 64 + #SBATCH --time=00:15:00 + #SBATCH --exclusive + + date + + srun podman-hpc login registry.nersc.gov --username {username} --password {password} + srun podman-hpc run --volume {home_path}/tomo_recon_repo/microct/legacy/tiff_to_zarr.py:/alsuser/tiff_to_zarr.py \ + --volume {home_path}/tomo_recon_repo/microct/legacy/input.txt:/alsuser/input.txt \ + --volume {scratch_path}/microctdata:/alsdata \ + --volume {scratch_path}/microctdata:/alsuser/ \ + registry.nersc.gov/als/{image_name} \ + bash -c "python -m pip show ngff_zarr || python -m pip install ngff_zarr && \ + python -m pip show 
dask_image || python -m pip install dask_image && \ + python tiff_to_zarr.py {recon_path} --raw_file {raw_path}" + + date + """ + try: + logger.info("Submitting Tiff to Zarr job script to Perlmutter.") + job = self.client.perlmutter.submit_job(job_script) + logger.info(f"jobid={job.job_id}") + job.complete() # waits for job completion + logger.info("Tiff to Zarr job completed successfully.") + return True + except Exception as e: + logger.error(f"Failed to submit or complete Tiff to Zarr job: {e}") + return False + + +def get_controller( + hpc_type: str = None +) -> TomographyHPCController: + """ + Factory function to retrieve the appropriate HPC controller. + + :param hpc_type: The type of HPC environment, either 'ALCF' or 'NERSC'. + :return: An instance of TomographyHPCController. + :raises ValueError: If an invalid HPC type is provided. + """ + if hpc_type == "ALCF": + return ALCFTomographyHPCController() + elif hpc_type == "NERSC": + return NERSCTomographyHPCController() + else: + raise ValueError("Invalid HPC type") + + +def do_it_all(): + controller = get_controller("ALCF") + controller.reconstruct() + controller.build_multi_resolution() + + file_path = "" + controller = get_controller("NERSC") + controller.reconstruct( + file_path=file_path, + is_export_control=False, + ) + controller.build_multi_resolution( + file_path=file_path, + is_export_control=False, + ) + + +if __name__ == "__main__": + do_it_all() + logger.info("Done.") diff --git a/orchestration/nersc.py b/orchestration/nersc.py index 777b752..e432ec4 100644 --- a/orchestration/nersc.py +++ b/orchestration/nersc.py @@ -1,19 +1,19 @@ import json import logging -from pathlib import Path -import time +# from pathlib import Path +# import time -from authlib.integrations.requests_client import OAuth2Session -from authlib.oauth2.rfc7523 import PrivateKeyJWT +# from authlib.integrations.requests_client import OAuth2Session +# from authlib.oauth2.rfc7523 import PrivateKeyJWT from authlib.jose import JsonWebKey from sfapi_client import Client -from sfapi_client._sync.client import SFAPI_BASE_URL, SFAPI_TOKEN_URL +# from sfapi_client._sync.client import SFAPI_BASE_URL, SFAPI_TOKEN_URL from sfapi_client.compute import Machine # Temporary patch till the sfapi_client is updated from sfapi_client.jobs import JobSacct -from sfapi_client.compute import Compute +# from sfapi_client.compute import Compute JobSacct.model_rebuild() @@ -34,9 +34,8 @@ def __init__( # Reading the client_id and private key from the files self.client_id = None self.pri_key = None - #self.session = None + # self.session = None self.init_client_info() - super().__init__(self.client_id, self.pri_key) @@ -80,7 +79,7 @@ def request_job_status(self): def update_job_id(self): if self.job is None: - self.logger.info(f"No job found") + self.logger.info("No job found") else: self.jobid = self.job.jobid @@ -89,7 +88,7 @@ def update_job_state(self): self.job_state = self.job.state if self.job_state == "RUNNING": - self.has_ran = True + self.has_ran = True elif self.job_state == "COMPLETE": self.logger.info(f"Job {self.jobid} with COMPLETE status") @@ -104,6 +103,6 @@ def submit_job(self, job_script): self.logger.info(f"Submitting job with script: {job_script}") self.job = self.perlmutter.submit_job(job_script) self.update_job_id() - #self.update_job_state() + # self.update_job_state() self.logger.info(f"Submitted job id: {self.jobid}") From 8c5d20afce8f67712033b92598dbe0f10c7139e0 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Fri, 6 Dec 2024 16:16:03 -0800 Subject: 
[PATCH 05/23] Addressing Dylan's comments. Moved image names to config.yml and config.py, renamed tomography_hpc.py to job_controller.py. Added back nersc.py, which now just uses job_controller (need to add back in the other data transfer/pruning logic, etc). Inject NerscClient into the nersc controller init. Removed references to export controls. Fixed typing for Config832 references (and pass into the init). Turned create_nersc_client() into a static method in the class. Moved load_dotenv to the top of the module. Consolidated reconstruct() and build_multi_resolution() in the nersc implementation.
---
 config.yml                                   |   4 +
 orchestration/flows/bl832/config.py          |   1 +
 .../{tomography_hpc.py => job_controller.py} | 132 +++++-------------
 orchestration/flows/bl832/nersc.py           |  29 ++++
 4 files changed, 72 insertions(+), 94 deletions(-)
 rename orchestration/flows/bl832/{tomography_hpc.py => job_controller.py} (71%)
 create mode 100644 orchestration/flows/bl832/nersc.py

diff --git a/config.yml b/config.yml
index f06ab9f..c6cdbf2 100644
--- a/config.yml
+++ b/config.yml
@@ -95,6 +95,10 @@ globus:
   client_id: ${GLOBUS_CLIENT_ID}
   client_secret: ${GLOBUS_CLIENT_SECRET}
 
+harbor_images832:
+  recon_image: tomorecon_nersc_mpi_hdf5@sha256:cc098a2cfb6b1632ea872a202c66cb7566908da066fd8f8c123b92fa95c2a43c
+  multires_image: tomorecon_nersc_mpi_hdf5@sha256:cc098a2cfb6b1632ea872a202c66cb7566908da066fd8f8c123b92fa95c2a43c
+
 prefect:
   deployments:
     - type_spec: new_file_832
diff --git a/orchestration/flows/bl832/config.py b/orchestration/flows/bl832/config.py
index 57de5f4..148d58b 100644
--- a/orchestration/flows/bl832/config.py
+++ b/orchestration/flows/bl832/config.py
@@ -21,3 +21,4 @@ def __init__(self) -> None:
         self.alcf832_raw = self.endpoints["alcf832_raw"]
         self.alcf832_scratch = self.endpoints["alcf832_scratch"]
         self.scicat = config["scicat"]
+        self.harbor_images832 = config["harbor_images832"]
diff --git a/orchestration/flows/bl832/tomography_hpc.py b/orchestration/flows/bl832/job_controller.py
similarity index 71%
rename from orchestration/flows/bl832/tomography_hpc.py
rename to orchestration/flows/bl832/job_controller.py
index af7bf4e..0c74604 100644
--- a/orchestration/flows/bl832/tomography_hpc.py
+++ b/orchestration/flows/bl832/job_controller.py
@@ -3,13 +3,14 @@
 import logging
 import os
 from pathlib import Path
-from typing import Optional, Dict, Any
+from typing import Optional
 
 from orchestration.flows.bl832.config import Config832
 from orchestration.nersc import NerscClient
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
+load_dotenv()
 
 
 class TomographyHPCController(ABC):
@@ -20,21 +21,20 @@ class TomographyHPCController(ABC):
     Args:
         ABC: Abstract Base Class
     """
-    def __init__(self):
+    def __init__(
+        self,
+        Config832: Optional[Config832] = None
+    ) -> None:
         pass
 
     @abstractmethod
     def reconstruct(
         self,
         file_path: str = "",
-        is_export_control: bool = False,
-        config: Optional[Dict[str, Any]] = None
     ) -> bool:
         """Perform tomography reconstruction
 
         :param file_path: Path to the file to reconstruct.
-        :param is_export_control: Flag indicating export control restrictions.
-        :param config: Optional configuration dictionary.
         :return: True if successful, False otherwise.
         """
         pass
@@ -43,14 +43,10 @@ def reconstruct(
     def build_multi_resolution(
         self,
         file_path: str = "",
-        is_export_control: bool = False,
-        config: Optional[Dict[str, Any]] = None
     ) -> bool:
         """Generate multi-resolution version of reconstructed tomography
 
         :param file_path: Path to the file for which to build multi-resolution data.
- :param is_export_control: Flag indicating export control restrictions. - :param config: Optional configuration dictionary. :return: True if successful, False otherwise. """ pass @@ -62,17 +58,15 @@ class ALCFTomographyHPCController(TomographyHPCController): Methods here leverage Globus Compute for processing tasks. Args: - TomographyHPCController (_type_): _description_ + TomographyHPCController (ABC): Abstract class for tomography HPC controllers. """ - def __init__(self): + def __init__(self) -> None: pass def reconstruct( self, file_path: str = "", - is_export_control: bool = False, - config: Optional[Dict[str, Any]] = None ) -> bool: # uses Globus Compute to reconstruct the tomography @@ -81,10 +75,9 @@ def reconstruct( def build_multi_resolution( self, file_path: str = "", - is_export_control: bool = False, - config: Optional[Dict[str, Any]] = None ) -> bool: # uses Globus Compute to build multi-resolution tomography + pass @@ -95,65 +88,20 @@ class NERSCTomographyHPCController(TomographyHPCController): Submits reconstruction and multi-resolution jobs to NERSC via SFAPI. """ - def __init__(self): - self.client = self._create_nersc_client() - - def reconstruct( + def __init__( self, - file_path: str = "", - is_export_control: bool = False, - config: Optional[Dict[str, Any]] = None - ) -> bool: - """ - Use NERSC for tomography reconstruction - """ - logger.info("Starting NERSC reconstruction process.") - - if is_export_control: - logger.warning("File is export controlled; skipping NERSC reconstruction.") - return False + client: NerscClient = None, + Config832: Optional[Config832] = None + ) -> None: + self.client = client + if not Config832: + self.config = Config832() else: - logger.info("File is not export controlled; proceeding with NERSC reconstruction.") + self.config = Config832 - if not config: - config = Config832() - - nersc_reconstruction_success = self._submit_nersc_reconstruction_job( - file_path=file_path - ) - - logger.info(f"Was NERSC reconstruction successful: {nersc_reconstruction_success}") - - return nersc_reconstruction_success - - def build_multi_resolution( - self, - file_path: str = "", - is_export_control: bool = False, - config: Optional[Dict[str, Any]] = None - ) -> bool: - """Use NERSC to make multiresolution version of tomography results.""" - - if is_export_control: - logger.warning("File is export controlled; skipping NERSC multi-resolution task.") - return False - else: - logger.info("File is not export controlled; proceeding with NERSC multi-resolution task.") - - if not config: - config = Config832() - - nersc_multi_resolution_success = self._submit_nersc_multi_resolution_job( - file_path=file_path - ) - - logger.info(f"Was NERSC multi-resolution conversion successful: {nersc_multi_resolution_success}") - - return nersc_multi_resolution_success - - def _create_nersc_client(self) -> NerscClient: + def create_nersc_client() -> NerscClient: """Create and return an NERSC client instance""" - load_dotenv() + client_id_path = os.getenv("PATH_NERSC_CLIENT_ID") sfapi_key_path = os.getenv("PATH_NERSC_PRI_KEY") @@ -170,17 +118,14 @@ def _create_nersc_client(self) -> NerscClient: logger.error(f"Failed to create NERSC client: {e}") raise e - def _submit_nersc_reconstruction_job( + def reconstruct( self, - file_path: str = None + file_path: str = "", ) -> bool: - """Submit a tomography reconstruction job to NERSC""" - - if self.client is None: - logger.error("NERSC client is required for job submission.") - raise ValueError("NERSC client is required for job 
submission.") - - load_dotenv() + """ + Use NERSC for tomography reconstruction + """ + logger.info("Starting NERSC reconstruction process.") # Can't use this long term in production. Need to find a better way to handle credentials. # Want to run this as the alsdev user @@ -194,11 +139,16 @@ def _submit_nersc_reconstruction_job( logger.info(home_path) logger.info(scratch_path) - image_name = "tomorecon_nersc_mpi_hdf5@sha256:cc098a2cfb6b1632ea872a202c66cb7566908da066fd8f8c123b92fa95c2a43c" + image_name = self.config.harbor_images832["recon_image"] path = Path(file_path) folder_name = path.parent.name file_name = path.stem + # Can't use this long term in production. Need to find a better way to handle credentials. + # Want to run this as the alsdev user + username = os.getenv("NERSC_USERNAME") + password = os.getenv("NERSC_PASSWORD") + job_script = f"""#!/bin/bash #SBATCH -q preempt #SBATCH -A als @@ -236,17 +186,12 @@ def _submit_nersc_reconstruction_job( logger.error(f"Failed to submit or complete reconstruction job: {e}") return False - def _submit_nersc_multi_resolution_job( + def build_multi_resolution( self, - file_path: str = None, + file_path: str = "", ) -> bool: - """Submit a multi-resolution tomography job to NERSC""" - - if self.client is None: - logger.error("NERSC client is required for job submission.") - raise ValueError("NERSC client is required for job submission.") + """Use NERSC to make multiresolution version of tomography results.""" - load_dotenv() username = os.getenv("NERSC_USERNAME") password = os.getenv("NERSC_PASSWORD") @@ -257,7 +202,7 @@ def _submit_nersc_multi_resolution_job( logger.info(home_path) logger.info(scratch_path) - image_name = "tomorecon_nersc_mpi_hdf5@sha256:cc098a2cfb6b1632ea872a202c66cb7566908da066fd8f8c123b92fa95c2a43c" + image_name = self.config.harbor_images832["multires_image"] recon_path = file_path raw_path = file_path @@ -316,12 +261,13 @@ def get_controller( if hpc_type == "ALCF": return ALCFTomographyHPCController() elif hpc_type == "NERSC": - return NERSCTomographyHPCController() + client = NERSCTomographyHPCController.create_nersc_client() + return NERSCTomographyHPCController(client=client) else: raise ValueError("Invalid HPC type") -def do_it_all(): +def do_it_all() -> None: controller = get_controller("ALCF") controller.reconstruct() controller.build_multi_resolution() @@ -330,11 +276,9 @@ def do_it_all(): controller = get_controller("NERSC") controller.reconstruct( file_path=file_path, - is_export_control=False, ) controller.build_multi_resolution( file_path=file_path, - is_export_control=False, ) diff --git a/orchestration/flows/bl832/nersc.py b/orchestration/flows/bl832/nersc.py new file mode 100644 index 0000000..7161adc --- /dev/null +++ b/orchestration/flows/bl832/nersc.py @@ -0,0 +1,29 @@ +from prefect import flow + +from orchestration.flows.bl832.job_controller import get_controller + + +@flow(name="nersc_recon_flow") +def nersc_recon_flow( + file_path: str, +) -> bool: + """ + Perform tomography reconstruction on NERSC. + + :param file_path: Path to the file to reconstruct. 
+ """ + + # To do: Implement file transfers, pruning, and other necessary steps + + controller = get_controller("NERSC") + nersc_reconstruction_success = controller.reconstruct( + file_path=file_path, + ) + nersc_multi_res_success = controller.build_multi_resolution( + file_path=file_path, + ) + + if nersc_reconstruction_success and nersc_multi_res_success: + return True + else: + return False From 235acdae7e9137b5689c85a985d2fdfb19e36182 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Mon, 9 Dec 2024 11:30:26 -0800 Subject: [PATCH 06/23] Moved NERSC specific NERSCTomographyHPCController to flows/bl832/nersc.py. Updated get_controller() in job_controller.py to take in an HPC enum that stores the corresponding Controller implementation for that facility. --- orchestration/flows/bl832/job_controller.py | 197 +++----------------- orchestration/flows/bl832/nersc.py | 180 +++++++++++++++++- 2 files changed, 200 insertions(+), 177 deletions(-) diff --git a/orchestration/flows/bl832/job_controller.py b/orchestration/flows/bl832/job_controller.py index 0c74604..94f4fbd 100644 --- a/orchestration/flows/bl832/job_controller.py +++ b/orchestration/flows/bl832/job_controller.py @@ -1,12 +1,11 @@ from abc import ABC, abstractmethod from dotenv import load_dotenv +from enum import Enum import logging -import os -from pathlib import Path from typing import Optional from orchestration.flows.bl832.config import Config832 -from orchestration.nersc import NerscClient +from orchestration.flows.bl832.nersc import NERSCTomographyHPCController logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -81,190 +80,36 @@ def build_multi_resolution( pass -class NERSCTomographyHPCController(TomographyHPCController): +class HPC(Enum): """ - Implementation for a NERSC-based tomography HPC controller. - - Submits reconstruction and multi-resolution jobs to NERSC via SFAPI. + Each HPC enum member directly stores a callable that returns a TomographyHPCController. """ - - def __init__( - self, - client: NerscClient = None, - Config832: Optional[Config832] = None - ) -> None: - self.client = client - if not Config832: - self.config = Config832() - else: - self.config = Config832 - - def create_nersc_client() -> NerscClient: - """Create and return an NERSC client instance""" - - client_id_path = os.getenv("PATH_NERSC_CLIENT_ID") - sfapi_key_path = os.getenv("PATH_NERSC_PRI_KEY") - - if not client_id_path or not sfapi_key_path: - logger.error("NERSC credentials paths are missing.") - raise ValueError("Missing NERSC credentials paths.") - if not os.path.isfile(client_id_path) or not os.path.isfile(sfapi_key_path): - logger.error("NERSC credential files are missing.") - raise FileNotFoundError("NERSC credential files are missing.") - - try: - return NerscClient(client_id_path, sfapi_key_path) - except Exception as e: - logger.error(f"Failed to create NERSC client: {e}") - raise e - - def reconstruct( - self, - file_path: str = "", - ) -> bool: - """ - Use NERSC for tomography reconstruction - """ - logger.info("Starting NERSC reconstruction process.") - - # Can't use this long term in production. Need to find a better way to handle credentials. 
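+
+    A minimal usage sketch (hedged; client creation follows the static method
+    introduced in the previous patch, and the file path is illustrative only):
+
+        client = NERSCTomographyHPCController.create_nersc_client()
+        controller = NERSCTomographyHPCController(client=client)
+        controller.reconstruct(file_path="/path/to/scan.h5")
+        controller.build_multi_resolution(file_path="/path/to/scan.h5")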
- # Want to run this as the alsdev user - username = os.getenv("NERSC_USERNAME") - password = os.getenv("NERSC_PASSWORD") - - user = self.client.user() - - home_path = f"/global/homes/{user.name[0]}/{user.name}" - scratch_path = f"/pscratch/sd/{user.name[0]}/{user.name}" - logger.info(home_path) - logger.info(scratch_path) - - image_name = self.config.harbor_images832["recon_image"] - path = Path(file_path) - folder_name = path.parent.name - file_name = path.stem - - # Can't use this long term in production. Need to find a better way to handle credentials. - # Want to run this as the alsdev user - username = os.getenv("NERSC_USERNAME") - password = os.getenv("NERSC_PASSWORD") - - job_script = f"""#!/bin/bash - #SBATCH -q preempt - #SBATCH -A als - #SBATCH -C cpu - #SBATCH --job-name=tomorecon_nersc_mpi_hdf5_1-0 - #SBATCH --output={scratch_path}/nerscClient-test/%x_%j.out - #SBATCH --error={scratch_path}/nerscClient-test/%x_%j.err - #SBATCH -N 1 - #SBATCH --ntasks-per-node 1 - #SBATCH --cpus-per-task 64 - #SBATCH --time=00:15:00 - #SBATCH --exclusive - - date - - srun podman-hpc login registry.nersc.gov --username {username} --password {password} - srun podman-hpc run - --volume {home_path}/tomo_recon_repo/microct/legacy/sfapi_reconstruction.py:/alsuser/sfapi_reconstruction.py - --volume {home_path}/tomo_recon_repo/microct/legacy/input.txt:/alsuser/input.txt - --volume {scratch_path}/microctdata:/alsdata - --volume {scratch_path}/microctdata:/alsuser/ \ - registry.nersc.gov/als/{image_name} \ - python sfapi_reconstruction.py {file_name} {folder_name} - - date - """ - - try: - logger.info("Submitting reconstruction job script to Perlmutter.") - job = self.client.perlmutter.submit_job(job_script) - job.complete() # waits for job completion - logger.info("Reconstruction job completed successfully.") - return True - except Exception as e: - logger.error(f"Failed to submit or complete reconstruction job: {e}") - return False - - def build_multi_resolution( - self, - file_path: str = "", - ) -> bool: - """Use NERSC to make multiresolution version of tomography results.""" - - username = os.getenv("NERSC_USERNAME") - password = os.getenv("NERSC_PASSWORD") - - user = self.client.user() - - home_path = f"/global/homes/{user.name[0]}/{user.name}" - scratch_path = f"/pscratch/sd/{user.name[0]}/{user.name}" - logger.info(home_path) - logger.info(scratch_path) - - image_name = self.config.harbor_images832["multires_image"] - recon_path = file_path - raw_path = file_path - - # Need to update this script: - # rebuild image with dependencies - - job_script = f"""#!/bin/bash - #SBATCH -q preempt - #SBATCH -A als - #SBATCH -C cpu - #SBATCH --job-name=tomorecon_nersc_mpi_hdf5_1-0 - #SBATCH --output={scratch_path}/nerscClient-test/%x_%j.out - #SBATCH --error={scratch_path}/nerscClient-test/%x_%j.err - #SBATCH -N 1 - #SBATCH --ntasks-per-node 1 - #SBATCH --cpus-per-task 64 - #SBATCH --time=00:15:00 - #SBATCH --exclusive - - date - - srun podman-hpc login registry.nersc.gov --username {username} --password {password} - srun podman-hpc run --volume {home_path}/tomo_recon_repo/microct/legacy/tiff_to_zarr.py:/alsuser/tiff_to_zarr.py \ - --volume {home_path}/tomo_recon_repo/microct/legacy/input.txt:/alsuser/input.txt \ - --volume {scratch_path}/microctdata:/alsdata \ - --volume {scratch_path}/microctdata:/alsuser/ \ - registry.nersc.gov/als/{image_name} \ - bash -c "python -m pip show ngff_zarr || python -m pip install ngff_zarr && \ - python -m pip show dask_image || python -m pip install dask_image && \ - python 
tiff_to_zarr.py {recon_path} --raw_file {raw_path}" - - date - """ - try: - logger.info("Submitting Tiff to Zarr job script to Perlmutter.") - job = self.client.perlmutter.submit_job(job_script) - logger.info(f"jobid={job.job_id}") - job.complete() # waits for job completion - logger.info("Tiff to Zarr job completed successfully.") - return True - except Exception as e: - logger.error(f"Failed to submit or complete Tiff to Zarr job: {e}") - return False + ALCF = ("ALCF", lambda: ALCFTomographyHPCController()) + NERSC = ("NERSC", lambda: NERSCTomographyHPCController( + client=NERSCTomographyHPCController.create_nersc_client() + )) + # Ex: add more HPCs here + # OLCF = ("OLCF", lambda: OLCFTomographyHPCController()) def get_controller( hpc_type: str = None ) -> TomographyHPCController: """ - Factory function to retrieve the appropriate HPC controller. + Factory function to retrieve the appropriate HPC controller instance based on the given HPC type. - :param hpc_type: The type of HPC environment, either 'ALCF' or 'NERSC'. - :return: An instance of TomographyHPCController. + :param hpc_type: The type of HPC environment as a string, (e.g. 'ALCF' or 'NERSC'). + :return: An instance of TomographyHPCController for the given HPC environment. :raises ValueError: If an invalid HPC type is provided. """ - if hpc_type == "ALCF": - return ALCFTomographyHPCController() - elif hpc_type == "NERSC": - client = NERSCTomographyHPCController.create_nersc_client() - return NERSCTomographyHPCController(client=client) - else: - raise ValueError("Invalid HPC type") + if not hpc_type: + raise ValueError("No HPC type provided.") + + # Convert the string to uppercase and remove whitespace to avoid errors and validate hpc_type against HPC enum. + hpc_enum = HPC(hpc_type.strip().upper()) + + # Access hpc_enum.value directly. As defined, it should be directly callable. + return hpc_enum.value() # Call the stored class to get a new instance of the selected Controller. def do_it_all() -> None: diff --git a/orchestration/flows/bl832/nersc.py b/orchestration/flows/bl832/nersc.py index 7161adc..b9a93b3 100644 --- a/orchestration/flows/bl832/nersc.py +++ b/orchestration/flows/bl832/nersc.py @@ -1,6 +1,184 @@ +from dotenv import load_dotenv +import logging +import os +from pathlib import Path from prefect import flow +from typing import Optional -from orchestration.flows.bl832.job_controller import get_controller +from orchestration.flows.bl832.config import Config832 +from orchestration.flows.bl832.job_controller import get_controller, TomographyHPCController +from orchestration.nersc import NerscClient + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +load_dotenv() + + +class NERSCTomographyHPCController(TomographyHPCController): + """ + Implementation for a NERSC-based tomography HPC controller. + + Submits reconstruction and multi-resolution jobs to NERSC via SFAPI. 
+ """ + + def __init__( + self, + client: NerscClient = None, + Config832: Optional[Config832] = None + ) -> None: + self.client = client + if not Config832: + self.config = Config832() + else: + self.config = Config832 + + def create_nersc_client() -> NerscClient: + """Create and return an NERSC client instance""" + + client_id_path = os.getenv("PATH_NERSC_CLIENT_ID") + sfapi_key_path = os.getenv("PATH_NERSC_PRI_KEY") + + if not client_id_path or not sfapi_key_path: + logger.error("NERSC credentials paths are missing.") + raise ValueError("Missing NERSC credentials paths.") + if not os.path.isfile(client_id_path) or not os.path.isfile(sfapi_key_path): + logger.error("NERSC credential files are missing.") + raise FileNotFoundError("NERSC credential files are missing.") + + try: + return NerscClient(client_id_path, sfapi_key_path) + except Exception as e: + logger.error(f"Failed to create NERSC client: {e}") + raise e + + def reconstruct( + self, + file_path: str = "", + ) -> bool: + """ + Use NERSC for tomography reconstruction + """ + logger.info("Starting NERSC reconstruction process.") + + # Can't use this long term in production. Need to find a better way to handle credentials. + # Want to run this as the alsdev user + username = os.getenv("NERSC_USERNAME") + password = os.getenv("NERSC_PASSWORD") + + user = self.client.user() + + home_path = f"/global/homes/{user.name[0]}/{user.name}" + scratch_path = f"/pscratch/sd/{user.name[0]}/{user.name}" + logger.info(home_path) + logger.info(scratch_path) + + image_name = self.config.harbor_images832["recon_image"] + path = Path(file_path) + folder_name = path.parent.name + file_name = path.stem + + # Can't use this long term in production. Need to find a better way to handle credentials. + # Want to run this as the alsdev user + username = os.getenv("NERSC_USERNAME") + password = os.getenv("NERSC_PASSWORD") + + job_script = f"""#!/bin/bash + #SBATCH -q preempt + #SBATCH -A als + #SBATCH -C cpu + #SBATCH --job-name=tomorecon_nersc_mpi_hdf5_1-0 + #SBATCH --output={scratch_path}/nerscClient-test/%x_%j.out + #SBATCH --error={scratch_path}/nerscClient-test/%x_%j.err + #SBATCH -N 1 + #SBATCH --ntasks-per-node 1 + #SBATCH --cpus-per-task 64 + #SBATCH --time=00:15:00 + #SBATCH --exclusive + + date + + srun podman-hpc login registry.nersc.gov --username {username} --password {password} + srun podman-hpc run + --volume {home_path}/tomo_recon_repo/microct/legacy/sfapi_reconstruction.py:/alsuser/sfapi_reconstruction.py + --volume {home_path}/tomo_recon_repo/microct/legacy/input.txt:/alsuser/input.txt + --volume {scratch_path}/microctdata:/alsdata + --volume {scratch_path}/microctdata:/alsuser/ \ + registry.nersc.gov/als/{image_name} \ + python sfapi_reconstruction.py {file_name} {folder_name} + + date + """ + + try: + logger.info("Submitting reconstruction job script to Perlmutter.") + job = self.client.perlmutter.submit_job(job_script) + job.complete() # waits for job completion + logger.info("Reconstruction job completed successfully.") + return True + except Exception as e: + logger.error(f"Failed to submit or complete reconstruction job: {e}") + return False + + def build_multi_resolution( + self, + file_path: str = "", + ) -> bool: + """Use NERSC to make multiresolution version of tomography results.""" + + username = os.getenv("NERSC_USERNAME") + password = os.getenv("NERSC_PASSWORD") + + user = self.client.user() + + home_path = f"/global/homes/{user.name[0]}/{user.name}" + scratch_path = f"/pscratch/sd/{user.name[0]}/{user.name}" + 
logger.info(home_path) + logger.info(scratch_path) + + image_name = self.config.harbor_images832["multires_image"] + recon_path = file_path + raw_path = file_path + + # Need to update this script: + # rebuild image with dependencies + + job_script = f"""#!/bin/bash + #SBATCH -q preempt + #SBATCH -A als + #SBATCH -C cpu + #SBATCH --job-name=tomorecon_nersc_mpi_hdf5_1-0 + #SBATCH --output={scratch_path}/nerscClient-test/%x_%j.out + #SBATCH --error={scratch_path}/nerscClient-test/%x_%j.err + #SBATCH -N 1 + #SBATCH --ntasks-per-node 1 + #SBATCH --cpus-per-task 64 + #SBATCH --time=00:15:00 + #SBATCH --exclusive + + date + + srun podman-hpc login registry.nersc.gov --username {username} --password {password} + srun podman-hpc run --volume {home_path}/tomo_recon_repo/microct/legacy/tiff_to_zarr.py:/alsuser/tiff_to_zarr.py \ + --volume {home_path}/tomo_recon_repo/microct/legacy/input.txt:/alsuser/input.txt \ + --volume {scratch_path}/microctdata:/alsdata \ + --volume {scratch_path}/microctdata:/alsuser/ \ + registry.nersc.gov/als/{image_name} \ + bash -c "python -m pip show ngff_zarr || python -m pip install ngff_zarr && \ + python -m pip show dask_image || python -m pip install dask_image && \ + python tiff_to_zarr.py {recon_path} --raw_file {raw_path}" + + date + """ + try: + logger.info("Submitting Tiff to Zarr job script to Perlmutter.") + job = self.client.perlmutter.submit_job(job_script) + logger.info(f"jobid={job.job_id}") + job.complete() # waits for job completion + logger.info("Tiff to Zarr job completed successfully.") + return True + except Exception as e: + logger.error(f"Failed to submit or complete Tiff to Zarr job: {e}") + return False @flow(name="nersc_recon_flow") From 9afefa3ba1443406411410244fbc5e0e8497dd63 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Wed, 11 Dec 2024 15:33:50 -0800 Subject: [PATCH 07/23] Configured both reconstruction and multires steps to take in a file path and run respective tasks. Revert to debug queue (at least for now) since the queue time is orders of magnitude short than the preempt queue (want to look into the interactive queue). Moved NERSCTomographyHPCController back into job_controller.py due to circular imports. Fixed Enum implementation. --- orchestration/flows/bl832/job_controller.py | 240 ++++++++++++++++++-- orchestration/flows/bl832/nersc.py | 183 +-------------- requirements.txt | 2 +- 3 files changed, 231 insertions(+), 194 deletions(-) diff --git a/orchestration/flows/bl832/job_controller.py b/orchestration/flows/bl832/job_controller.py index 94f4fbd..04e95d1 100644 --- a/orchestration/flows/bl832/job_controller.py +++ b/orchestration/flows/bl832/job_controller.py @@ -2,10 +2,14 @@ from dotenv import load_dotenv from enum import Enum import logging -from typing import Optional +import os +from pathlib import Path +import time +from typing import Callable, Optional from orchestration.flows.bl832.config import Config832 -from orchestration.flows.bl832.nersc import NERSCTomographyHPCController +# from orchestration.flows.bl832.nersc import NERSCTomographyHPCController +from orchestration.nersc import NerscClient logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -80,36 +84,232 @@ def build_multi_resolution( pass +class NERSCTomographyHPCController(TomographyHPCController): + """ + Implementation for a NERSC-based tomography HPC controller. + + Submits reconstruction and multi-resolution jobs to NERSC via SFAPI. 
+ """ + + def __init__( + self, + client: NerscClient = None, + config: Optional[Config832] = None + ) -> None: + self.client = client + + if not config: + self.config = Config832() + else: + self.config = config + + def create_nersc_client() -> NerscClient: + """Create and return an NERSC client instance""" + + client_id_path = os.getenv("PATH_NERSC_CLIENT_ID") + sfapi_key_path = os.getenv("PATH_NERSC_PRI_KEY") + + if not client_id_path or not sfapi_key_path: + logger.error("NERSC credentials paths are missing.") + raise ValueError("Missing NERSC credentials paths.") + if not os.path.isfile(client_id_path) or not os.path.isfile(sfapi_key_path): + logger.error("NERSC credential files are missing.") + raise FileNotFoundError("NERSC credential files are missing.") + + try: + return NerscClient(client_id_path, sfapi_key_path) + except Exception as e: + logger.error(f"Failed to create NERSC client: {e}") + raise e + + def reconstruct( + self, + file_path: str = "", + ) -> bool: + """ + Use NERSC for tomography reconstruction + """ + logger.info("Starting NERSC reconstruction process.") + + # Can't use this long term in production. Need to find a better way to handle credentials. + # Want to run this as the alsdev user + # username = os.getenv("NERSC_USERNAME") + # password = os.getenv("NERSC_PASSWORD") + + user = self.client.user() + + home_path = f"/global/homes/{user.name[0]}/{user.name}" + scratch_path = f"/pscratch/sd/{user.name[0]}/{user.name}" + logger.info(home_path) + logger.info(scratch_path) + + image_name = self.config.harbor_images832["recon_image"] + logger.info(image_name) + path = Path(file_path) + folder_name = path.parent.name + if not folder_name: + folder_name = "" + + file_name = f"{path.stem}.h5" + + logger.info(f"File name: {file_name}") + logger.info(f"Folder name: {folder_name}") + + # IMPORTANT: job script must be deindented to the leftmost column or it will fail immediately + # Note: If q=debug, there is no minimum time limit + # However, if q=preempt, there is a minimum time limit of 2 hours. Otherwise the job won't run. 
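# A sketch, not part of this patch: the debug/preempt constraint described in
# the comments above could be enforced once in a helper instead of hand-editing
# each job script. build_sbatch_header and its minimums table are illustrative
# names, assuming only the queue behavior stated here (debug: no minimum,
# preempt: two-hour minimum walltime).
def build_sbatch_header(queue: str, walltime_minutes: int, account: str = "als") -> str:
    minimums = {"debug": 0, "preempt": 120}  # minutes
    if walltime_minutes < minimums.get(queue, 0):
        raise ValueError(f"{queue} requires at least {minimums[queue]} minutes")
    hours, minutes = divmod(walltime_minutes, 60)
    return (
        "#!/bin/bash\n"
        f"#SBATCH -q {queue}\n"
        f"#SBATCH -A {account}\n"
        f"#SBATCH --time={hours}:{minutes:02d}:00\n"
    )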
+ + # If the image has not been pulled before, then you must login to Harbor first (hopefully we can get a robot account) + # srun podman-hpc login registry.nersc.gov --username {username} --password {password} +# --volume {home_path}/tomo_recon_repo/microct/legacy/input.txt:/alsuser/input.txt \ + + job_script = f"""#!/bin/bash +#SBATCH -q debug +#SBATCH -A als +#SBATCH -C cpu +#SBATCH --job-name=tomo_recon_test-0 +#SBATCH --output={scratch_path}/nerscClient-test/%x_%j.out +#SBATCH --error={scratch_path}/nerscClient-test/%x_%j.err +#SBATCH -N 1 +#SBATCH --ntasks-per-node 1 +#SBATCH --cpus-per-task 64 +#SBATCH --time=0:15:00 +#SBATCH --exclusive + +date +srun podman-hpc run \ +--volume {home_path}/tomo_recon_repo/microct/legacy/sfapi_reconstruction.py:/alsuser/sfapi_reconstruction.py \ +--volume {scratch_path}/microctdata:/alsdata \ +--volume {scratch_path}/microctdata:/alsuser/ \ +registry.nersc.gov/als/{image_name} \ +bash -c "python -m pip install numpy==1.23.2 && \ +python sfapi_reconstruction.py {file_name} {folder_name}" +date +""" + + try: + logger.info("Submitting reconstruction job script to Perlmutter.") + job = self.client.perlmutter.submit_job(job_script) + logging.info(job.jobid) + job.update() + time.sleep(60) # Wait 60 seconds for job to register before checking status + logging.info(job.state) + job.complete() # waits for job completion + logger.info("Reconstruction job completed successfully.") + return True + except Exception as e: + logger.error(f"Failed to submit or complete reconstruction job: {e}") + return False + + def build_multi_resolution( + self, + file_path: str = "", + ) -> bool: + """Use NERSC to make multiresolution version of tomography results.""" + + # username = os.getenv("NERSC_USERNAME") + # password = os.getenv("NERSC_PASSWORD") + + user = self.client.user() + + home_path = f"/global/homes/{user.name[0]}/{user.name}" + scratch_path = f"/pscratch/sd/{user.name[0]}/{user.name}" + logger.info(home_path) + logger.info(scratch_path) + + image_name = self.config.harbor_images832["multires_image"] + + # TODO: fix these paths + + path = Path(file_path) + folder_name = path.parent.name + file_name = path.stem + + recon_path = f"scratch/{folder_name}/rec{file_name}/" + raw_path = f"{folder_name}/{file_name}.h5" + + # Need to update this script: + # rebuild image with dependencies + + # IMPORTANT: job script must be deindented to the leftmost column or it will fail immediately + job_script = f"""#!/bin/bash +#SBATCH -q debug +#SBATCH -A als +#SBATCH -C cpu +#SBATCH --job-name=tomo_multires_test-0 +#SBATCH --output={scratch_path}/nerscClient-test/%x_%j.out +#SBATCH --error={scratch_path}/nerscClient-test/%x_%j.err +#SBATCH -N 1 +#SBATCH --ntasks-per-node 1 +#SBATCH --cpus-per-task 64 +#SBATCH --time=0:15:00 +#SBATCH --exclusive + +date +srun podman-hpc run --volume {home_path}/tomo_recon_repo/microct/legacy/tiff_to_zarr.py:/alsuser/tiff_to_zarr.py \ +--volume {home_path}/tomo_recon_repo/microct/legacy/input.txt:/alsuser/input.txt \ +--volume {scratch_path}/microctdata:/alsdata \ +--volume {scratch_path}/microctdata:/alsuser/ \ +registry.nersc.gov/als/{image_name} \ +bash -c "python -m pip show ngff_zarr || python -m pip install ngff_zarr && \ +python -m pip show dask_image || python -m pip install dask_image && \ +python tiff_to_zarr.py {recon_path} --raw_file {raw_path}" + +date +""" + try: + logger.info("Submitting Tiff to Zarr job script to Perlmutter.") + job = self.client.perlmutter.submit_job(job_script) + time.sleep(30) # Wait 30 seconds before checking job 
completion + job.complete() # waits for job completion + logger.info("Tiff to Zarr job completed successfully.") + return True + except Exception as e: + logger.error(f"Failed to submit or complete Tiff to Zarr job: {e}") + return False + + class HPC(Enum): """ - Each HPC enum member directly stores a callable that returns a TomographyHPCController. + Enum representing different HPC environments. + Use enum names as strings to identify HPC sites, ensuring a standard set of values. + + Members: + ALCF: Argonne Leadership Computing Facility + NERSC: National Energy Research Scientific Computing Center """ - ALCF = ("ALCF", lambda: ALCFTomographyHPCController()) - NERSC = ("NERSC", lambda: NERSCTomographyHPCController( - client=NERSCTomographyHPCController.create_nersc_client() - )) - # Ex: add more HPCs here - # OLCF = ("OLCF", lambda: OLCFTomographyHPCController()) + ALCF = "ALCF" + NERSC = "NERSC" -def get_controller( - hpc_type: str = None -) -> TomographyHPCController: +def get_controller(hpc_type: str) -> TomographyHPCController: """ - Factory function to retrieve the appropriate HPC controller instance based on the given HPC type. + Factory function that returns an HPC controller instance for the given HPC environment. - :param hpc_type: The type of HPC environment as a string, (e.g. 'ALCF' or 'NERSC'). - :return: An instance of TomographyHPCController for the given HPC environment. - :raises ValueError: If an invalid HPC type is provided. + :param hpc_type: A string identifying the HPC environment (e.g., 'ALCF', 'NERSC'). + :return: An instance of a TomographyHPCController subclass corresponding to the given HPC environment. + :raises ValueError: If an invalid or unsupported HPC type is specified. """ if not hpc_type: raise ValueError("No HPC type provided.") - # Convert the string to uppercase and remove whitespace to avoid errors and validate hpc_type against HPC enum. - hpc_enum = HPC(hpc_type.strip().upper()) + # Normalize input + hpc_str = hpc_type.strip().upper() + + # Attempt to map the given string to the HPC enum + try: + hpc_enum = HPC(hpc_str) + except ValueError: + raise ValueError(f"'{hpc_type}' is not a valid HPC") from None + + # Map HPC enum members to corresponding controller constructors + controller_map: dict[HPC, Callable[[], TomographyHPCController]] = { + HPC.ALCF: lambda: ALCFTomographyHPCController(), + HPC.NERSC: lambda: NERSCTomographyHPCController(NERSCTomographyHPCController.create_nersc_client()), + } - # Access hpc_enum.value directly. As defined, it should be directly callable. - return hpc_enum.value() # Call the stored class to get a new instance of the selected Controller. 
+ # Return a new controller instance + return controller_map[hpc_enum]() def do_it_all() -> None: diff --git a/orchestration/flows/bl832/nersc.py b/orchestration/flows/bl832/nersc.py index b9a93b3..0f0f9a1 100644 --- a/orchestration/flows/bl832/nersc.py +++ b/orchestration/flows/bl832/nersc.py @@ -1,186 +1,19 @@ from dotenv import load_dotenv import logging -import os -from pathlib import Path +# import os +# from pathlib import Path from prefect import flow -from typing import Optional +# from typing import Optional -from orchestration.flows.bl832.config import Config832 -from orchestration.flows.bl832.job_controller import get_controller, TomographyHPCController -from orchestration.nersc import NerscClient +# from orchestration.flows.bl832.config import Config832 +from orchestration.flows.bl832.job_controller import get_controller +# from orchestration.nersc import NerscClient logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) load_dotenv() -class NERSCTomographyHPCController(TomographyHPCController): - """ - Implementation for a NERSC-based tomography HPC controller. - - Submits reconstruction and multi-resolution jobs to NERSC via SFAPI. - """ - - def __init__( - self, - client: NerscClient = None, - Config832: Optional[Config832] = None - ) -> None: - self.client = client - if not Config832: - self.config = Config832() - else: - self.config = Config832 - - def create_nersc_client() -> NerscClient: - """Create and return an NERSC client instance""" - - client_id_path = os.getenv("PATH_NERSC_CLIENT_ID") - sfapi_key_path = os.getenv("PATH_NERSC_PRI_KEY") - - if not client_id_path or not sfapi_key_path: - logger.error("NERSC credentials paths are missing.") - raise ValueError("Missing NERSC credentials paths.") - if not os.path.isfile(client_id_path) or not os.path.isfile(sfapi_key_path): - logger.error("NERSC credential files are missing.") - raise FileNotFoundError("NERSC credential files are missing.") - - try: - return NerscClient(client_id_path, sfapi_key_path) - except Exception as e: - logger.error(f"Failed to create NERSC client: {e}") - raise e - - def reconstruct( - self, - file_path: str = "", - ) -> bool: - """ - Use NERSC for tomography reconstruction - """ - logger.info("Starting NERSC reconstruction process.") - - # Can't use this long term in production. Need to find a better way to handle credentials. - # Want to run this as the alsdev user - username = os.getenv("NERSC_USERNAME") - password = os.getenv("NERSC_PASSWORD") - - user = self.client.user() - - home_path = f"/global/homes/{user.name[0]}/{user.name}" - scratch_path = f"/pscratch/sd/{user.name[0]}/{user.name}" - logger.info(home_path) - logger.info(scratch_path) - - image_name = self.config.harbor_images832["recon_image"] - path = Path(file_path) - folder_name = path.parent.name - file_name = path.stem - - # Can't use this long term in production. Need to find a better way to handle credentials. 
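# A sketch, not part of this patch: the "better way to handle credentials"
# flagged in the comment above could be Prefect Secret blocks, which this
# repo already uses for its Globus credentials. The block names below are
# hypothetical and would need to be created once with Secret(...).save(...).
from prefect.blocks.system import Secret

def load_nersc_credentials() -> tuple:
    # Secret.load pulls the stored value from the Prefect server at run time,
    # so nothing sensitive needs to live in a .env file on disk
    username = Secret.load("nersc-username").get()
    password = Secret.load("nersc-password").get()
    return username, password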
- # Want to run this as the alsdev user - username = os.getenv("NERSC_USERNAME") - password = os.getenv("NERSC_PASSWORD") - - job_script = f"""#!/bin/bash - #SBATCH -q preempt - #SBATCH -A als - #SBATCH -C cpu - #SBATCH --job-name=tomorecon_nersc_mpi_hdf5_1-0 - #SBATCH --output={scratch_path}/nerscClient-test/%x_%j.out - #SBATCH --error={scratch_path}/nerscClient-test/%x_%j.err - #SBATCH -N 1 - #SBATCH --ntasks-per-node 1 - #SBATCH --cpus-per-task 64 - #SBATCH --time=00:15:00 - #SBATCH --exclusive - - date - - srun podman-hpc login registry.nersc.gov --username {username} --password {password} - srun podman-hpc run - --volume {home_path}/tomo_recon_repo/microct/legacy/sfapi_reconstruction.py:/alsuser/sfapi_reconstruction.py - --volume {home_path}/tomo_recon_repo/microct/legacy/input.txt:/alsuser/input.txt - --volume {scratch_path}/microctdata:/alsdata - --volume {scratch_path}/microctdata:/alsuser/ \ - registry.nersc.gov/als/{image_name} \ - python sfapi_reconstruction.py {file_name} {folder_name} - - date - """ - - try: - logger.info("Submitting reconstruction job script to Perlmutter.") - job = self.client.perlmutter.submit_job(job_script) - job.complete() # waits for job completion - logger.info("Reconstruction job completed successfully.") - return True - except Exception as e: - logger.error(f"Failed to submit or complete reconstruction job: {e}") - return False - - def build_multi_resolution( - self, - file_path: str = "", - ) -> bool: - """Use NERSC to make multiresolution version of tomography results.""" - - username = os.getenv("NERSC_USERNAME") - password = os.getenv("NERSC_PASSWORD") - - user = self.client.user() - - home_path = f"/global/homes/{user.name[0]}/{user.name}" - scratch_path = f"/pscratch/sd/{user.name[0]}/{user.name}" - logger.info(home_path) - logger.info(scratch_path) - - image_name = self.config.harbor_images832["multires_image"] - recon_path = file_path - raw_path = file_path - - # Need to update this script: - # rebuild image with dependencies - - job_script = f"""#!/bin/bash - #SBATCH -q preempt - #SBATCH -A als - #SBATCH -C cpu - #SBATCH --job-name=tomorecon_nersc_mpi_hdf5_1-0 - #SBATCH --output={scratch_path}/nerscClient-test/%x_%j.out - #SBATCH --error={scratch_path}/nerscClient-test/%x_%j.err - #SBATCH -N 1 - #SBATCH --ntasks-per-node 1 - #SBATCH --cpus-per-task 64 - #SBATCH --time=00:15:00 - #SBATCH --exclusive - - date - - srun podman-hpc login registry.nersc.gov --username {username} --password {password} - srun podman-hpc run --volume {home_path}/tomo_recon_repo/microct/legacy/tiff_to_zarr.py:/alsuser/tiff_to_zarr.py \ - --volume {home_path}/tomo_recon_repo/microct/legacy/input.txt:/alsuser/input.txt \ - --volume {scratch_path}/microctdata:/alsdata \ - --volume {scratch_path}/microctdata:/alsuser/ \ - registry.nersc.gov/als/{image_name} \ - bash -c "python -m pip show ngff_zarr || python -m pip install ngff_zarr && \ - python -m pip show dask_image || python -m pip install dask_image && \ - python tiff_to_zarr.py {recon_path} --raw_file {raw_path}" - - date - """ - try: - logger.info("Submitting Tiff to Zarr job script to Perlmutter.") - job = self.client.perlmutter.submit_job(job_script) - logger.info(f"jobid={job.job_id}") - job.complete() # waits for job completion - logger.info("Tiff to Zarr job completed successfully.") - return True - except Exception as e: - logger.error(f"Failed to submit or complete Tiff to Zarr job: {e}") - return False - - @flow(name="nersc_recon_flow") def nersc_recon_flow( file_path: str, @@ -205,3 +38,7 @@ def 
nersc_recon_flow(
         return True
     else:
         return False
+
+
+if __name__ == "__main__":
+    nersc_recon_flow(file_path="dabramov/20230606_151124_jong-seto_fungal-mycelia_roll-AQ_fungi1_fast.h5")
diff --git a/requirements.txt b/requirements.txt
index 88c2dd6..7252202 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 globus-sdk>=3.0
 h5py
 httpx>=0.22.0
-numpy
+numpy==1.23.2
 pillow
 python-dotenv
 prefect==2.19.5

From 6304fe9b3d4f406144314828db72ca00b3d2cce8 Mon Sep 17 00:00:00 2001
From: David Abramov
Date: Thu, 12 Dec 2024 11:47:06 -0800
Subject: [PATCH 08/23] Resolved circular import issue; the NERSC controller is
 now defined in flows/bl832/nersc.py. Removed the ALCF controller from
 job_controller, in anticipation of that being defined in alcf.py when I
 refactor that code.

---
 orchestration/flows/bl832/job_controller.py | 256 ++------------------
 orchestration/flows/bl832/nersc.py          | 201 ++++++++++++++++++-
 2 files changed, 209 insertions(+), 248 deletions(-)

diff --git a/orchestration/flows/bl832/job_controller.py b/orchestration/flows/bl832/job_controller.py
index 04e95d1..ed16cfb 100644
--- a/orchestration/flows/bl832/job_controller.py
+++ b/orchestration/flows/bl832/job_controller.py
@@ -2,14 +2,9 @@ from abc import ABC, abstractmethod
 from dotenv import load_dotenv
 from enum import Enum
 import logging
-import os
-from pathlib import Path
-import time
-from typing import Callable, Optional
+from typing import Optional
 
 from orchestration.flows.bl832.config import Config832
-# from orchestration.flows.bl832.nersc import NERSCTomographyHPCController
-from orchestration.nersc import NerscClient
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
@@ -55,220 +50,6 @@ def build_multi_resolution(
         pass
 
 
-class ALCFTomographyHPCController(TomographyHPCController):
-    """
-    Implementation of TomographyHPCController for ALCF.
-    Methods here leverage Globus Compute for processing tasks.
-
-    Args:
-        TomographyHPCController (ABC): Abstract class for tomography HPC controllers.
-    """
-
-    def __init__(self) -> None:
-        pass
-
-    def reconstruct(
-        self,
-        file_path: str = "",
-    ) -> bool:
-
-        # uses Globus Compute to reconstruct the tomography
-        pass
-
-    def build_multi_resolution(
-        self,
-        file_path: str = "",
-    ) -> bool:
-        # uses Globus Compute to build multi-resolution tomography
-
-        pass
-
-
-class NERSCTomographyHPCController(TomographyHPCController):
-    """
-    Implementation for a NERSC-based tomography HPC controller.
-
-    Submits reconstruction and multi-resolution jobs to NERSC via SFAPI.
- """ - - def __init__( - self, - client: NerscClient = None, - config: Optional[Config832] = None - ) -> None: - self.client = client - - if not config: - self.config = Config832() - else: - self.config = config - - def create_nersc_client() -> NerscClient: - """Create and return an NERSC client instance""" - - client_id_path = os.getenv("PATH_NERSC_CLIENT_ID") - sfapi_key_path = os.getenv("PATH_NERSC_PRI_KEY") - - if not client_id_path or not sfapi_key_path: - logger.error("NERSC credentials paths are missing.") - raise ValueError("Missing NERSC credentials paths.") - if not os.path.isfile(client_id_path) or not os.path.isfile(sfapi_key_path): - logger.error("NERSC credential files are missing.") - raise FileNotFoundError("NERSC credential files are missing.") - - try: - return NerscClient(client_id_path, sfapi_key_path) - except Exception as e: - logger.error(f"Failed to create NERSC client: {e}") - raise e - - def reconstruct( - self, - file_path: str = "", - ) -> bool: - """ - Use NERSC for tomography reconstruction - """ - logger.info("Starting NERSC reconstruction process.") - - # Can't use this long term in production. Need to find a better way to handle credentials. - # Want to run this as the alsdev user - # username = os.getenv("NERSC_USERNAME") - # password = os.getenv("NERSC_PASSWORD") - - user = self.client.user() - - home_path = f"/global/homes/{user.name[0]}/{user.name}" - scratch_path = f"/pscratch/sd/{user.name[0]}/{user.name}" - logger.info(home_path) - logger.info(scratch_path) - - image_name = self.config.harbor_images832["recon_image"] - logger.info(image_name) - path = Path(file_path) - folder_name = path.parent.name - if not folder_name: - folder_name = "" - - file_name = f"{path.stem}.h5" - - logger.info(f"File name: {file_name}") - logger.info(f"Folder name: {folder_name}") - - # IMPORTANT: job script must be deindented to the leftmost column or it will fail immediately - # Note: If q=debug, there is no minimum time limit - # However, if q=preempt, there is a minimum time limit of 2 hours. Otherwise the job won't run. 
- - # If the image has not been pulled before, then you must login to Harbor first (hopefully we can get a robot account) - # srun podman-hpc login registry.nersc.gov --username {username} --password {password} -# --volume {home_path}/tomo_recon_repo/microct/legacy/input.txt:/alsuser/input.txt \ - - job_script = f"""#!/bin/bash -#SBATCH -q debug -#SBATCH -A als -#SBATCH -C cpu -#SBATCH --job-name=tomo_recon_test-0 -#SBATCH --output={scratch_path}/nerscClient-test/%x_%j.out -#SBATCH --error={scratch_path}/nerscClient-test/%x_%j.err -#SBATCH -N 1 -#SBATCH --ntasks-per-node 1 -#SBATCH --cpus-per-task 64 -#SBATCH --time=0:15:00 -#SBATCH --exclusive - -date -srun podman-hpc run \ ---volume {home_path}/tomo_recon_repo/microct/legacy/sfapi_reconstruction.py:/alsuser/sfapi_reconstruction.py \ ---volume {scratch_path}/microctdata:/alsdata \ ---volume {scratch_path}/microctdata:/alsuser/ \ -registry.nersc.gov/als/{image_name} \ -bash -c "python -m pip install numpy==1.23.2 && \ -python sfapi_reconstruction.py {file_name} {folder_name}" -date -""" - - try: - logger.info("Submitting reconstruction job script to Perlmutter.") - job = self.client.perlmutter.submit_job(job_script) - logging.info(job.jobid) - job.update() - time.sleep(60) # Wait 60 seconds for job to register before checking status - logging.info(job.state) - job.complete() # waits for job completion - logger.info("Reconstruction job completed successfully.") - return True - except Exception as e: - logger.error(f"Failed to submit or complete reconstruction job: {e}") - return False - - def build_multi_resolution( - self, - file_path: str = "", - ) -> bool: - """Use NERSC to make multiresolution version of tomography results.""" - - # username = os.getenv("NERSC_USERNAME") - # password = os.getenv("NERSC_PASSWORD") - - user = self.client.user() - - home_path = f"/global/homes/{user.name[0]}/{user.name}" - scratch_path = f"/pscratch/sd/{user.name[0]}/{user.name}" - logger.info(home_path) - logger.info(scratch_path) - - image_name = self.config.harbor_images832["multires_image"] - - # TODO: fix these paths - - path = Path(file_path) - folder_name = path.parent.name - file_name = path.stem - - recon_path = f"scratch/{folder_name}/rec{file_name}/" - raw_path = f"{folder_name}/{file_name}.h5" - - # Need to update this script: - # rebuild image with dependencies - - # IMPORTANT: job script must be deindented to the leftmost column or it will fail immediately - job_script = f"""#!/bin/bash -#SBATCH -q debug -#SBATCH -A als -#SBATCH -C cpu -#SBATCH --job-name=tomo_multires_test-0 -#SBATCH --output={scratch_path}/nerscClient-test/%x_%j.out -#SBATCH --error={scratch_path}/nerscClient-test/%x_%j.err -#SBATCH -N 1 -#SBATCH --ntasks-per-node 1 -#SBATCH --cpus-per-task 64 -#SBATCH --time=0:15:00 -#SBATCH --exclusive - -date -srun podman-hpc run --volume {home_path}/tomo_recon_repo/microct/legacy/tiff_to_zarr.py:/alsuser/tiff_to_zarr.py \ ---volume {home_path}/tomo_recon_repo/microct/legacy/input.txt:/alsuser/input.txt \ ---volume {scratch_path}/microctdata:/alsdata \ ---volume {scratch_path}/microctdata:/alsuser/ \ -registry.nersc.gov/als/{image_name} \ -bash -c "python -m pip show ngff_zarr || python -m pip install ngff_zarr && \ -python -m pip show dask_image || python -m pip install dask_image && \ -python tiff_to_zarr.py {recon_path} --raw_file {raw_path}" - -date -""" - try: - logger.info("Submitting Tiff to Zarr job script to Perlmutter.") - job = self.client.perlmutter.submit_job(job_script) - time.sleep(30) # Wait 30 seconds before checking job 
completion - job.complete() # waits for job completion - logger.info("Tiff to Zarr job completed successfully.") - return True - except Exception as e: - logger.error(f"Failed to submit or complete Tiff to Zarr job: {e}") - return False - - class HPC(Enum): """ Enum representing different HPC environments. @@ -282,7 +63,7 @@ class HPC(Enum): NERSC = "NERSC" -def get_controller(hpc_type: str) -> TomographyHPCController: +def get_controller(hpc_type: HPC) -> TomographyHPCController: """ Factory function that returns an HPC controller instance for the given HPC environment. @@ -290,26 +71,19 @@ def get_controller(hpc_type: str) -> TomographyHPCController: :return: An instance of a TomographyHPCController subclass corresponding to the given HPC environment. :raises ValueError: If an invalid or unsupported HPC type is specified. """ - if not hpc_type: - raise ValueError("No HPC type provided.") - - # Normalize input - hpc_str = hpc_type.strip().upper() - - # Attempt to map the given string to the HPC enum - try: - hpc_enum = HPC(hpc_str) - except ValueError: - raise ValueError(f"'{hpc_type}' is not a valid HPC") from None - - # Map HPC enum members to corresponding controller constructors - controller_map: dict[HPC, Callable[[], TomographyHPCController]] = { - HPC.ALCF: lambda: ALCFTomographyHPCController(), - HPC.NERSC: lambda: NERSCTomographyHPCController(NERSCTomographyHPCController.create_nersc_client()), - } - - # Return a new controller instance - return controller_map[hpc_enum]() + if not isinstance(hpc_type, HPC): + raise ValueError(f"Invalid HPC type provided: {hpc_type}") + + if hpc_type == HPC.ALCF: + from orchestration.flows.bl832.alcf import ALCFTomographyHPCController + return ALCFTomographyHPCController() + elif hpc_type == HPC.NERSC: + from orchestration.flows.bl832.nersc import NERSCTomographyHPCController + return NERSCTomographyHPCController( + NERSCTomographyHPCController.create_nersc_client() + ) + else: + raise ValueError(f"Unsupported HPC type: {hpc_type}") def do_it_all() -> None: diff --git a/orchestration/flows/bl832/nersc.py b/orchestration/flows/bl832/nersc.py index 0f0f9a1..3664867 100644 --- a/orchestration/flows/bl832/nersc.py +++ b/orchestration/flows/bl832/nersc.py @@ -1,19 +1,206 @@ from dotenv import load_dotenv import logging -# import os -# from pathlib import Path +import os +from pathlib import Path from prefect import flow -# from typing import Optional +import time +from typing import Optional -# from orchestration.flows.bl832.config import Config832 -from orchestration.flows.bl832.job_controller import get_controller -# from orchestration.nersc import NerscClient +from orchestration.flows.bl832.config import Config832 +from orchestration.flows.bl832.job_controller import get_controller, HPC, TomographyHPCController +from orchestration.nersc import NerscClient logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) load_dotenv() +class NERSCTomographyHPCController(TomographyHPCController): + """ + Implementation for a NERSC-based tomography HPC controller. + + Submits reconstruction and multi-resolution jobs to NERSC via SFAPI. 
+ """ + + def __init__( + self, + client: NerscClient = None, + config: Optional[Config832] = None + ) -> None: + self.client = client + + if not config: + self.config = Config832() + else: + self.config = config + + def create_nersc_client() -> NerscClient: + """Create and return an NERSC client instance""" + + client_id_path = os.getenv("PATH_NERSC_CLIENT_ID") + sfapi_key_path = os.getenv("PATH_NERSC_PRI_KEY") + + if not client_id_path or not sfapi_key_path: + logger.error("NERSC credentials paths are missing.") + raise ValueError("Missing NERSC credentials paths.") + if not os.path.isfile(client_id_path) or not os.path.isfile(sfapi_key_path): + logger.error("NERSC credential files are missing.") + raise FileNotFoundError("NERSC credential files are missing.") + + try: + return NerscClient(client_id_path, sfapi_key_path) + except Exception as e: + logger.error(f"Failed to create NERSC client: {e}") + raise e + + def reconstruct( + self, + file_path: str = "", + ) -> bool: + """ + Use NERSC for tomography reconstruction + """ + logger.info("Starting NERSC reconstruction process.") + + # Can't use this long term in production. Need to find a better way to handle credentials. + # Want to run this as the alsdev user + # username = os.getenv("NERSC_USERNAME") + # password = os.getenv("NERSC_PASSWORD") + + user = self.client.user() + + home_path = f"/global/homes/{user.name[0]}/{user.name}" + scratch_path = f"/pscratch/sd/{user.name[0]}/{user.name}" + logger.info(home_path) + logger.info(scratch_path) + + image_name = self.config.harbor_images832["recon_image"] + logger.info(image_name) + path = Path(file_path) + folder_name = path.parent.name + if not folder_name: + folder_name = "" + + file_name = f"{path.stem}.h5" + + logger.info(f"File name: {file_name}") + logger.info(f"Folder name: {folder_name}") + + # IMPORTANT: job script must be deindented to the leftmost column or it will fail immediately + # Note: If q=debug, there is no minimum time limit + # However, if q=preempt, there is a minimum time limit of 2 hours. Otherwise the job won't run. 
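# A sketch, not part of this patch, of the submit-then-poll pattern used in
# the try/except blocks below (it uses only the job.update()/job.complete()
# calls this series already relies on; wait_for_job is an illustrative name):
import time

def wait_for_job(job, registration_delay: int = 60) -> str:
    time.sleep(registration_delay)  # give Slurm time to register the job id
    try:
        job.update()  # refresh cached state once before blocking
    except Exception as err:
        # sfapi can transiently report "Job not found" right after submission
        print(f"initial update failed, continuing: {err}")
    job.complete()  # block until the job reaches a terminal state
    return job.state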
+ + # If the image has not been pulled before, + # then you must login to Harbor first (hopefully we can get a robot account) + # Looking into using github actions to build the image and host it on on github instead + # srun podman-hpc login registry.nersc.gov --username {username} --password {password} + + job_script = f"""#!/bin/bash +#SBATCH -q debug +#SBATCH -A als +#SBATCH -C cpu +#SBATCH --job-name=tomo_recon_test-0 +#SBATCH --output={scratch_path}/nerscClient-test/%x_%j.out +#SBATCH --error={scratch_path}/nerscClient-test/%x_%j.err +#SBATCH -N 1 +#SBATCH --ntasks-per-node 1 +#SBATCH --cpus-per-task 64 +#SBATCH --time=0:15:00 +#SBATCH --exclusive + +date +srun podman-hpc run \ +--volume {home_path}/tomo_recon_repo/microct/legacy/sfapi_reconstruction.py:/alsuser/sfapi_reconstruction.py \ +--volume {scratch_path}/microctdata:/alsdata \ +--volume {scratch_path}/microctdata:/alsuser/ \ +registry.nersc.gov/als/{image_name} \ +bash -c "python -m pip install numpy==1.23.2 && \ +python sfapi_reconstruction.py {file_name} {folder_name}" +date +""" + + try: + logger.info("Submitting reconstruction job script to Perlmutter.") + job = self.client.perlmutter.submit_job(job_script) + logging.info(job.jobid) + job.update() + time.sleep(60) # Wait 60 seconds for job to register before checking status + logging.info(job.state) + job.complete() # waits for job completion + logger.info("Reconstruction job completed successfully.") + return True + except Exception as e: + logger.error(f"Failed to submit or complete reconstruction job: {e}") + return False + + def build_multi_resolution( + self, + file_path: str = "", + ) -> bool: + """Use NERSC to make multiresolution version of tomography results.""" + + # username = os.getenv("NERSC_USERNAME") + # password = os.getenv("NERSC_PASSWORD") + + user = self.client.user() + + home_path = f"/global/homes/{user.name[0]}/{user.name}" + scratch_path = f"/pscratch/sd/{user.name[0]}/{user.name}" + logger.info(home_path) + logger.info(scratch_path) + + image_name = self.config.harbor_images832["multires_image"] + + # TODO: fix these paths + + path = Path(file_path) + folder_name = path.parent.name + file_name = path.stem + + recon_path = f"scratch/{folder_name}/rec{file_name}/" + raw_path = f"{folder_name}/{file_name}.h5" + + # Need to update this script: + # rebuild image with dependencies + + # IMPORTANT: job script must be deindented to the leftmost column or it will fail immediately + job_script = f"""#!/bin/bash +#SBATCH -q debug +#SBATCH -A als +#SBATCH -C cpu +#SBATCH --job-name=tomo_multires_test-0 +#SBATCH --output={scratch_path}/nerscClient-test/%x_%j.out +#SBATCH --error={scratch_path}/nerscClient-test/%x_%j.err +#SBATCH -N 1 +#SBATCH --ntasks-per-node 1 +#SBATCH --cpus-per-task 64 +#SBATCH --time=0:15:00 +#SBATCH --exclusive + +date +srun podman-hpc run --volume {home_path}/tomo_recon_repo/microct/legacy/tiff_to_zarr.py:/alsuser/tiff_to_zarr.py \ +--volume {home_path}/tomo_recon_repo/microct/legacy/input.txt:/alsuser/input.txt \ +--volume {scratch_path}/microctdata:/alsdata \ +--volume {scratch_path}/microctdata:/alsuser/ \ +registry.nersc.gov/als/{image_name} \ +bash -c "python -m pip show ngff_zarr || python -m pip install ngff_zarr && \ +python -m pip show dask_image || python -m pip install dask_image && \ +python tiff_to_zarr.py {recon_path} --raw_file {raw_path}" + +date +""" + try: + logger.info("Submitting Tiff to Zarr job script to Perlmutter.") + job = self.client.perlmutter.submit_job(job_script) + time.sleep(30) # Wait 30 seconds before 
checking job completion + job.complete() # waits for job completion + logger.info("Tiff to Zarr job completed successfully.") + return True + except Exception as e: + logger.error(f"Failed to submit or complete Tiff to Zarr job: {e}") + return False + + @flow(name="nersc_recon_flow") def nersc_recon_flow( file_path: str, @@ -26,7 +213,7 @@ def nersc_recon_flow( # To do: Implement file transfers, pruning, and other necessary steps - controller = get_controller("NERSC") + controller = get_controller(HPC.NERSC) nersc_reconstruction_success = controller.reconstruct( file_path=file_path, ) From 8e16fcc547d4cc32b23f92d487ad796cab2be3c5 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Fri, 13 Dec 2024 15:58:04 -0800 Subject: [PATCH 09/23] Catch sfapi errors after submitting jobs, such that we are able to wait for the job to complete before moving onto the next step. An OK workaround for this: https://github.com/NERSC/sfapi_client/issues/93 --- orchestration/flows/bl832/nersc.py | 81 ++++++++++++++++++++++++------ 1 file changed, 65 insertions(+), 16 deletions(-) diff --git a/orchestration/flows/bl832/nersc.py b/orchestration/flows/bl832/nersc.py index 3664867..a4933d8 100644 --- a/orchestration/flows/bl832/nersc.py +++ b/orchestration/flows/bl832/nersc.py @@ -3,6 +3,7 @@ import os from pathlib import Path from prefect import flow +import re import time from typing import Optional @@ -122,16 +123,39 @@ def reconstruct( try: logger.info("Submitting reconstruction job script to Perlmutter.") job = self.client.perlmutter.submit_job(job_script) - logging.info(job.jobid) - job.update() - time.sleep(60) # Wait 60 seconds for job to register before checking status - logging.info(job.state) - job.complete() # waits for job completion + logger.info(f"Submitted job ID: {job.jobid}") + + try: + job.update() + except Exception as update_err: + logger.warning(f"Initial job update failed, continuing: {update_err}") + + time.sleep(60) + logger.info(f"Job {job.jobid} current state: {job.state}") + + job.complete() # Wait until the job completes logger.info("Reconstruction job completed successfully.") return True + except Exception as e: - logger.error(f"Failed to submit or complete reconstruction job: {e}") - return False + logger.info(f"Error during job submission or completion: {e}") + match = re.search(r"Job not found:\s*(\d+)", str(e)) + + if match: + jobid = match.group(1) + logger.info(f"Attempting to recover job {jobid}.") + try: + job = self.client.perlmutter.job(jobid=jobid) + time.sleep(30) + job.complete() + logger.info("Reconstruction job completed successfully after recovery.") + return True + except Exception as recovery_err: + logger.error(f"Failed to recover job {jobid}: {recovery_err}") + return False + else: + # Unknown error: cannot recover + return False def build_multi_resolution( self, @@ -192,13 +216,38 @@ def build_multi_resolution( try: logger.info("Submitting Tiff to Zarr job script to Perlmutter.") job = self.client.perlmutter.submit_job(job_script) - time.sleep(30) # Wait 30 seconds before checking job completion - job.complete() # waits for job completion - logger.info("Tiff to Zarr job completed successfully.") + logger.info(f"Submitted job ID: {job.jobid}") + + try: + job.update() + except Exception as update_err: + logger.warning(f"Initial job update failed, continuing: {update_err}") + + time.sleep(60) + logger.info(f"Job {job.jobid} current state: {job.state}") + + job.complete() # Wait until the job completes + logger.info("Reconstruction job completed successfully.") return 
True + except Exception as e: - logger.error(f"Failed to submit or complete Tiff to Zarr job: {e}") - return False + logger.warning(f"Error during job submission or completion: {e}") + match = re.search(r"Job not found:\s*(\d+)", str(e)) + + if match: + jobid = match.group(1) + logger.info(f"Attempting to recover job {jobid}.") + try: + job = self.client.perlmutter.job(jobid=jobid) + time.sleep(30) + job.complete() + logger.info("Reconstruction job completed successfully after recovery.") + return True + except Exception as recovery_err: + logger.error(f"Failed to recover job {jobid}: {recovery_err}") + return False + else: + return False @flow(name="nersc_recon_flow") @@ -214,14 +263,14 @@ def nersc_recon_flow( # To do: Implement file transfers, pruning, and other necessary steps controller = get_controller(HPC.NERSC) - nersc_reconstruction_success = controller.reconstruct( - file_path=file_path, - ) + # nersc_reconstruction_success = controller.reconstruct( + # file_path=file_path, + # ) nersc_multi_res_success = controller.build_multi_resolution( file_path=file_path, ) - if nersc_reconstruction_success and nersc_multi_res_success: + if nersc_multi_res_success: # nersc_reconstruction_success and nersc_multi_res_success: return True else: return False From c6601be7cdb0d9bca2393bd322fec479baedfb74 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Fri, 13 Dec 2024 16:49:56 -0800 Subject: [PATCH 10/23] Adding test_sfapi_flow.py pytest script. --- orchestration/_tests/test_sfapi_flow.py | 107 ++++++++++++++++++++++++ orchestration/flows/bl832/nersc.py | 8 +- 2 files changed, 111 insertions(+), 4 deletions(-) create mode 100644 orchestration/_tests/test_sfapi_flow.py diff --git a/orchestration/_tests/test_sfapi_flow.py b/orchestration/_tests/test_sfapi_flow.py new file mode 100644 index 0000000..b2962c9 --- /dev/null +++ b/orchestration/_tests/test_sfapi_flow.py @@ -0,0 +1,107 @@ +import pytest +from unittest.mock import MagicMock, PropertyMock, patch +from orchestration.flows.bl832.nersc import nersc_recon_flow, NERSCTomographyHPCController +from orchestration.flows.bl832.config import Config832 +from orchestration.flows.bl832.job_controller import HPC +from orchestration.nersc import NerscClient + + +@pytest.fixture +def mock_nersc_client(): + """Fixture to mock the NerscClient class.""" + mock_client = MagicMock(spec=NerscClient) + mock_client.user.return_value.name = "testuser" + + # Mock perlmutter client with job-related methods + perlmutter_mock = MagicMock() + type(mock_client).perlmutter = PropertyMock(return_value=perlmutter_mock) + perlmutter_mock.submit_job.return_value.jobid = "12345" + perlmutter_mock.submit_job.return_value.state = "COMPLETED" + perlmutter_mock.job.return_value.state = "COMPLETED" + perlmutter_mock.job.return_value.complete = MagicMock() + perlmutter_mock.job.return_value.jobid = "12345" + + return mock_client + + +@pytest.fixture +def mock_config832(): + """Fixture to mock the Config832 class.""" + mock_config = MagicMock(spec=Config832) + mock_config.harbor_images832 = { + "recon_image": "mock_recon_image", + "multires_image": "mock_multires_image", + } + return mock_config + + +@pytest.fixture +def mock_controller(mock_nersc_client, mock_config832): + """Fixture to mock the NERSCTomographyHPCController class.""" + with patch("orchestration.flows.bl832.job_controller.get_controller") as mock_get_controller: + mock_get_controller.return_value = MagicMock( + spec=HPC.NERSC, + client=mock_nersc_client, + config=mock_config832 + ) + 
mock_get_controller.return_value.build_multi_resolution = MagicMock(return_value=True) + mock_get_controller.return_value.reconstruct = MagicMock(return_value=True) + yield mock_get_controller.return_value + + +def test_nersc_recon_flow_success(mock_controller): + """Test the nersc_recon_flow for a successful run.""" + file_path = "dabramov/20230606_151124_jong-seto_fungal-mycelia_roll-AQ_fungi1_fast.h5" + + with patch("orchestration.flows.bl832.nersc.get_controller", return_value=mock_controller): + result = nersc_recon_flow(file_path=file_path) + + assert result is True, "nersc_recon_flow should return True for a successful run." + + +def test_nersc_recon_flow_failure(mock_controller): + """Test the nersc_recon_flow for a failure scenario.""" + file_path = "dabramov/20230606_151124_jong-seto_fungal-mycelia_roll-AQ_fungi1_fast.h5" + mock_controller.build_multi_resolution.return_value = False + + with patch("orchestration.flows.bl832.nersc.get_controller", return_value=mock_controller): + result = nersc_recon_flow(file_path=file_path) + + assert result is False, "nersc_recon_flow should return False for a failure scenario." + + +def test_nersc_client_initialization_error(): + """Test error handling during NERSC client initialization.""" + with patch("orchestration.flows.bl832.nersc.NERSCTomographyHPCController.create_nersc_client", + side_effect=ValueError("Missing NERSC credentials paths.")): + with pytest.raises(ValueError, match="Missing NERSC credentials paths."): + NERSCTomographyHPCController.create_nersc_client() + + +def test_job_submission(mock_controller): + """Test job submission and status updates.""" + job_script = "mock_job_script" + mock_job = mock_controller.client.perlmutter.submit_job.return_value + mock_job.state = "COMPLETED" + + mock_controller.client.perlmutter.submit_job(job_script) + mock_controller.client.perlmutter.submit_job.assert_called_once_with(job_script) + assert mock_job.jobid == "12345", "Job ID should match the mock job ID." + + +def test_job_recovery(mock_controller): + """Test recovery of a failed or lost job.""" + mock_job = mock_controller.client.perlmutter.job.return_value + mock_job.complete = MagicMock() + + mock_controller.client.perlmutter.job.side_effect = [ + FileNotFoundError("Job not found: 12345"), + mock_job + ] + + with patch("time.sleep", return_value=None): + recon_result = mock_controller.reconstruct(file_path="mock_file_path") + multires_result = mock_controller.build_multi_resolution(file_path="mock_file_path") + + assert recon_result is True, "Job recovery should succeed." + assert multires_result is True, "Job recovery should succeed." 
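The recovery behavior these tests exercise hinges on parsing the job id out of
the "Job not found" error text. A standalone sketch of that step, under the
assumption (from PATCH 09) that the error message has the form
"Job not found: <id>"; recover_job_id is an illustrative name:

    import re
    from typing import Optional

    def recover_job_id(error: Exception) -> Optional[str]:
        # same regex the reconstruct/build_multi_resolution handlers use
        match = re.search(r"Job not found:\s*(\d+)", str(error))
        return match.group(1) if match else None

    assert recover_job_id(Exception("Job not found: 12345")) == "12345"
    assert recover_job_id(Exception("timeout")) is None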
diff --git a/orchestration/flows/bl832/nersc.py b/orchestration/flows/bl832/nersc.py index a4933d8..c56e02b 100644 --- a/orchestration/flows/bl832/nersc.py +++ b/orchestration/flows/bl832/nersc.py @@ -263,14 +263,14 @@ def nersc_recon_flow( # To do: Implement file transfers, pruning, and other necessary steps controller = get_controller(HPC.NERSC) - # nersc_reconstruction_success = controller.reconstruct( - # file_path=file_path, - # ) + nersc_reconstruction_success = controller.reconstruct( + file_path=file_path, + ) nersc_multi_res_success = controller.build_multi_resolution( file_path=file_path, ) - if nersc_multi_res_success: # nersc_reconstruction_success and nersc_multi_res_success: + if nersc_reconstruction_success and nersc_multi_res_success: return True else: return False From 6057aaed6b2163e5ffb69f9c7668480cf67ac20c Mon Sep 17 00:00:00 2001 From: David Abramov Date: Fri, 13 Dec 2024 17:04:36 -0800 Subject: [PATCH 11/23] Addressing prefect secret patch issue in pytest script --- orchestration/_tests/test_sfapi_flow.py | 37 +++++++++++++++++++++---- 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/orchestration/_tests/test_sfapi_flow.py b/orchestration/_tests/test_sfapi_flow.py index b2962c9..6d7c86d 100644 --- a/orchestration/_tests/test_sfapi_flow.py +++ b/orchestration/_tests/test_sfapi_flow.py @@ -1,9 +1,34 @@ import pytest from unittest.mock import MagicMock, PropertyMock, patch -from orchestration.flows.bl832.nersc import nersc_recon_flow, NERSCTomographyHPCController -from orchestration.flows.bl832.config import Config832 -from orchestration.flows.bl832.job_controller import HPC -from orchestration.nersc import NerscClient +from uuid import uuid4 + +from prefect.blocks.system import Secret +from prefect.testing.utilities import prefect_test_harness + +# Patch Secret.load globally before importing application code +with patch("prefect.blocks.system.Secret.load") as mock_secret_load: + mock_secret_load.return_value = Secret(value=str(uuid4())) + # Import application modules after patching Secret.load + from orchestration.flows.bl832.nersc import nersc_recon_flow, NERSCTomographyHPCController + from orchestration.flows.bl832.config import Config832 + from orchestration.flows.bl832.job_controller import HPC + from orchestration.nersc import NerscClient + + +@pytest.fixture(autouse=True, scope="session") +def prefect_test_fixture(): + """ + A pytest fixture that automatically sets up and tears down the Prefect test harness + for the entire test session. It creates and saves test secrets and configurations + required for Globus integration. + + Yields: + None + """ + with prefect_test_harness(): + globus_client_id = Secret(value=str(uuid4())) + globus_client_id.save(name="globus-client-id") + yield @pytest.fixture @@ -82,11 +107,11 @@ def test_job_submission(mock_controller): """Test job submission and status updates.""" job_script = "mock_job_script" mock_job = mock_controller.client.perlmutter.submit_job.return_value - mock_job.state = "COMPLETED" + job_id = mock_job.jobid mock_controller.client.perlmutter.submit_job(job_script) mock_controller.client.perlmutter.submit_job.assert_called_once_with(job_script) - assert mock_job.jobid == "12345", "Job ID should match the mock job ID." + assert job_id == "12345", "Job ID should match the mock job ID." 
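One detail of the patch above worth keeping in mind: Secret.load runs at import
time in the orchestration modules, so the patch has to be active while the
imports execute, not just inside each test. A minimal sketch of that pattern
(the patch target is the one used in the test file; the rest is illustrative):

    from unittest.mock import patch

    with patch("prefect.blocks.system.Secret.load") as mock_secret_load:
        mock_secret_load.return_value = object()  # stand-in for a Secret block
        # imports that trigger Secret.load at module scope belong here, e.g.
        # from orchestration.flows.bl832.nersc import nersc_recon_flow
        pass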
def test_job_recovery(mock_controller): From f9548a8e027f06b00ea5e45e7ec91e02230ae3b8 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Tue, 17 Dec 2024 13:13:56 -0800 Subject: [PATCH 12/23] Removing the NerscClient dependency, and using sfapi_client module directly in NERSCTomographyHPCController --- orchestration/_tests/test_sfapi_flow.py | 7 ++-- orchestration/flows/bl832/job_controller.py | 2 +- orchestration/flows/bl832/nersc.py | 37 +++++++++++++++------ 3 files changed, 33 insertions(+), 13 deletions(-) diff --git a/orchestration/_tests/test_sfapi_flow.py index 6d7c86d..e6ddd69 100644 --- a/orchestration/_tests/test_sfapi_flow.py +++ b/orchestration/_tests/test_sfapi_flow.py @@ -28,6 +28,9 @@ def prefect_test_fixture(): with prefect_test_harness(): globus_client_id = Secret(value=str(uuid4())) globus_client_id.save(name="globus-client-id") + globus_client_secret = Secret(value=str(uuid4())) + globus_client_secret.save(name="globus-client-secret") + yield @pytest.fixture @@ -97,10 +100,10 @@ def test_nersc_recon_flow_failure(mock_controller): def test_nersc_client_initialization_error(): """Test error handling during NERSC client initialization.""" - with patch("orchestration.flows.bl832.nersc.NERSCTomographyHPCController.create_nersc_client", + with patch("orchestration.flows.bl832.nersc.NERSCTomographyHPCController.create_sfapi_client", side_effect=ValueError("Missing NERSC credentials paths.")): with pytest.raises(ValueError, match="Missing NERSC credentials paths."): - NERSCTomographyHPCController.create_nersc_client() + NERSCTomographyHPCController.create_sfapi_client() def test_job_submission(mock_controller): diff --git a/orchestration/flows/bl832/job_controller.py index ed16cfb..f985f29 100644 --- a/orchestration/flows/bl832/job_controller.py +++ b/orchestration/flows/bl832/job_controller.py @@ -80,7 +80,7 @@ def get_controller(hpc_type: HPC) -> TomographyHPCController: elif hpc_type == HPC.NERSC: from orchestration.flows.bl832.nersc import NERSCTomographyHPCController return NERSCTomographyHPCController( - NERSCTomographyHPCController.create_nersc_client() + NERSCTomographyHPCController.create_sfapi_client() ) else: raise ValueError(f"Unsupported HPC type: {hpc_type}") diff --git a/orchestration/flows/bl832/nersc.py index c56e02b..24c7d90 100644 --- a/orchestration/flows/bl832/nersc.py +++ b/orchestration/flows/bl832/nersc.py @@ -1,4 +1,5 @@ from dotenv import load_dotenv +import json import logging import os from pathlib import Path @@ -7,9 +8,13 @@ import time from typing import Optional +from authlib.jose import JsonWebKey +from sfapi_client import Client +from sfapi_client.compute import Machine + from orchestration.flows.bl832.config import Config832 from orchestration.flows.bl832.job_controller import get_controller, HPC, TomographyHPCController -from orchestration.nersc import NerscClient + logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -25,7 +30,7 @@ class NERSCTomographyHPCController(TomographyHPCController): def __init__( self, - client: NerscClient = None, + client: Client = None, config: Optional[Config832] = None ) -> None: self.client = client @@ -35,21 +40,31 @@ def __init__( else: self.config = config - def create_nersc_client() -> NerscClient: + def create_sfapi_client() -> Client: """Create and return an NERSC client instance""" client_id_path = os.getenv("PATH_NERSC_CLIENT_ID") - sfapi_key_path =
os.getenv("PATH_NERSC_PRI_KEY") + client_secret_path = os.getenv("PATH_NERSC_PRI_KEY") - if not client_id_path or not sfapi_key_path: + if not client_id_path or not client_secret_path: logger.error("NERSC credentials paths are missing.") raise ValueError("Missing NERSC credentials paths.") - if not os.path.isfile(client_id_path) or not os.path.isfile(sfapi_key_path): + if not os.path.isfile(client_id_path) or not os.path.isfile(client_secret_path): logger.error("NERSC credential files are missing.") raise FileNotFoundError("NERSC credential files are missing.") + client_id = None + client_secret = None + with open(client_id_path, "r") as f: + client_id = f.read() + + with open(client_secret_path, "r") as f: + client_secret = JsonWebKey.import_key(json.loads(f.read())) + try: - return NerscClient(client_id_path, sfapi_key_path) + client = Client(client_id, client_secret) + logger.info("NERSC client created successfully.") + return client except Exception as e: logger.error(f"Failed to create NERSC client: {e}") raise e @@ -67,7 +82,6 @@ def reconstruct( # Want to run this as the alsdev user # username = os.getenv("NERSC_USERNAME") # password = os.getenv("NERSC_PASSWORD") - user = self.client.user() home_path = f"/global/homes/{user.name[0]}/{user.name}" @@ -95,6 +109,7 @@ def reconstruct( # then you must login to Harbor first (hopefully we can get a robot account) # Looking into using github actions to build the image and host it on on github instead # srun podman-hpc login registry.nersc.gov --username {username} --password {password} +# SBATCH -q debug job_script = f"""#!/bin/bash #SBATCH -q debug @@ -122,7 +137,8 @@ def reconstruct( try: logger.info("Submitting reconstruction job script to Perlmutter.") - job = self.client.perlmutter.submit_job(job_script) + perlmutter = self.client.compute(Machine.perlmutter) + job = perlmutter.submit_job(job_script) logger.info(f"Submitted job ID: {job.jobid}") try: @@ -215,7 +231,8 @@ def build_multi_resolution( """ try: logger.info("Submitting Tiff to Zarr job script to Perlmutter.") - job = self.client.perlmutter.submit_job(job_script) + perlmutter = self.client.compute(Machine.perlmutter) + job = perlmutter.submit_job(job_script) logger.info(f"Submitted job ID: {job.jobid}") try: From 3fe95b5c01d7847f911f5d1e829754ccafda3035 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Tue, 17 Dec 2024 13:36:19 -0800 Subject: [PATCH 13/23] Adding deployment configuration for the nersc_recon_flow --- create_deployments_832_nersc.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100755 create_deployments_832_nersc.sh diff --git a/create_deployments_832_nersc.sh b/create_deployments_832_nersc.sh new file mode 100755 index 0000000..4fd437f --- /dev/null +++ b/create_deployments_832_nersc.sh @@ -0,0 +1,10 @@ +export $(grep -v '^#' .env | xargs) + +# create 'nersc_flow_pool' +prefect work-pool create 'nersc_flow_pool' + +# nersc_flow_pool + # in docker-compose.yaml: + # command: prefect agent start --pool "nersc_flow_pool" +prefect deployment build ./orchestration/flows/bl832/nersc.py:nersc_recon_flow -n nersc_recon_flow -p nersc_flow_pool -q nersc_recon_flow_queue +prefect deployment apply nersc_recon_flow-deployment.yaml From 6a4d2bf7604c1e67fcff00dde476d36f25a642d5 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Thu, 19 Dec 2024 15:53:20 -0800 Subject: [PATCH 14/23] Added a comments in indicating that NerscClient is deprecated and will be removed. 
Removed init of Config832 and sfapi_client from NERSCTomographyHPCController, and instead inject them (no longer optional). Updated unit tests in --- orchestration/_tests/test_sfapi_flow.py | 316 +++++++++++++++----- orchestration/flows/bl832/job_controller.py | 14 +- orchestration/flows/bl832/nersc.py | 25 +- orchestration/nersc.py | 12 +- 4 files changed, 265 insertions(+), 102 deletions(-) diff --git a/orchestration/_tests/test_sfapi_flow.py index e6ddd69..eedf659 100644 --- a/orchestration/_tests/test_sfapi_flow.py +++ b/orchestration/_tests/test_sfapi_flow.py @@ -1,19 +1,12 @@ +# orchestration/_tests/test_sfapi_flow.py + import pytest -from unittest.mock import MagicMock, PropertyMock, patch +from unittest.mock import MagicMock, patch, mock_open +from pathlib import Path from uuid import uuid4 - from prefect.blocks.system import Secret from prefect.testing.utilities import prefect_test_harness -# Patch Secret.load globally before importing application code -with patch("prefect.blocks.system.Secret.load") as mock_secret_load: - mock_secret_load.return_value = Secret(value=str(uuid4())) - # Import application modules after patching Secret.load - from orchestration.flows.bl832.nersc import nersc_recon_flow, NERSCTomographyHPCController - from orchestration.flows.bl832.config import Config832 - from orchestration.flows.bl832.job_controller import HPC - from orchestration.nersc import NerscClient - @pytest.fixture(autouse=True, scope="session") def prefect_test_fixture(): @@ -34,102 +27,259 @@ def prefect_test_fixture(): yield +# ---------------------------- +# Tests for create_sfapi_client +# ---------------------------- + + +def test_create_sfapi_client_success(): + """ + Test successful creation of the SFAPI client. + """ + from orchestration.flows.bl832.nersc import NERSCTomographyHPCController + + # Mock data for client_id and client_secret files + mock_client_id = 'value' + mock_client_secret = '{"key": "value"}' + + # Create separate mock_open instances for each file + mock_open_client_id = mock_open(read_data=mock_client_id) + mock_open_client_secret = mock_open(read_data=mock_client_secret) + + with patch("orchestration.flows.bl832.nersc.os.getenv") as mock_getenv, \ + patch("orchestration.flows.bl832.nersc.os.path.isfile") as mock_isfile, \ + patch("builtins.open", side_effect=[ + mock_open_client_id.return_value, + mock_open_client_secret.return_value + ]), \ + patch("orchestration.flows.bl832.nersc.JsonWebKey.import_key") as mock_import_key, \ + patch("orchestration.flows.bl832.nersc.Client") as MockClient: + + # Mock environment variables + mock_getenv.side_effect = lambda x: { + "PATH_NERSC_CLIENT_ID": "/path/to/client_id", + "PATH_NERSC_PRI_KEY": "/path/to/client_secret" + }.get(x, None) + + # Mock file existence + mock_isfile.return_value = True + + # Mock JsonWebKey.import_key to return a mock secret + mock_import_key.return_value = "mock_secret" + + # Create the client + client = NERSCTomographyHPCController.create_sfapi_client() + + # Assert that Client was instantiated with 'value' and 'mock_secret' + MockClient.assert_called_once_with("value", "mock_secret") + + # Assert that the returned client is the mocked client + assert client == MockClient.return_value, "Client should be the mocked sfapi_client.Client instance" + + +def test_create_sfapi_client_missing_paths(): + """ + Test creation of the SFAPI client with missing credential paths.
+ """ + from orchestration.flows.bl832.nersc import NERSCTomographyHPCController + + with patch("orchestration.flows.bl832.nersc.os.getenv", return_value=None): + with pytest.raises(ValueError, match="Missing NERSC credentials paths."): + NERSCTomographyHPCController.create_sfapi_client() + + +def test_create_sfapi_client_missing_files(): + """ + Test creation of the SFAPI client with missing credential files. + """ + with ( + # Mock environment variables + patch( + "orchestration.flows.bl832.nersc.os.getenv", + side_effect=lambda x: { + "PATH_NERSC_CLIENT_ID": "/path/to/client_id", + "PATH_NERSC_PRI_KEY": "/path/to/client_secret" + }.get(x, None) + ), + + # Mock file existence to simulate missing files + patch("orchestration.flows.bl832.nersc.os.path.isfile", return_value=False) + ): + # Import the module after applying patches to ensure mocks are in place + from orchestration.flows.bl832.nersc import NERSCTomographyHPCController + + # Expect a FileNotFoundError due to missing credential files + with pytest.raises(FileNotFoundError, match="NERSC credential files are missing."): + NERSCTomographyHPCController.create_sfapi_client() + +# ---------------------------- +# Fixture for Mocking SFAPI Client +# ---------------------------- + + @pytest.fixture -def mock_nersc_client(): - """Fixture to mock the NerscClient class.""" - mock_client = MagicMock(spec=NerscClient) - mock_client.user.return_value.name = "testuser" +def mock_sfapi_client(): + """ + Mock the sfapi_client.Client class with necessary methods. + """ + with patch("orchestration.flows.bl832.nersc.Client") as MockClient: + mock_client_instance = MockClient.return_value + + # Mock the user method + mock_user = MagicMock() + mock_user.name = "testuser" + mock_client_instance.user.return_value = mock_user - # Mock perlmutter client with job-related methods - perlmutter_mock = MagicMock() - type(mock_client).perlmutter = PropertyMock(return_value=perlmutter_mock) - perlmutter_mock.submit_job.return_value.jobid = "12345" - perlmutter_mock.submit_job.return_value.state = "COMPLETED" - perlmutter_mock.job.return_value.state = "COMPLETED" - perlmutter_mock.job.return_value.complete = MagicMock() - perlmutter_mock.job.return_value.jobid = "12345" + # Mock the compute method to return a mocked compute object + mock_compute = MagicMock() + mock_job = MagicMock() + mock_job.jobid = "12345" + mock_job.state = "COMPLETED" + mock_compute.submit_job.return_value = mock_job + mock_client_instance.compute.return_value = mock_compute - return mock_client + yield mock_client_instance +# ---------------------------- +# Fixture for Mocking Config832 +# ---------------------------- + @pytest.fixture def mock_config832(): - """Fixture to mock the Config832 class.""" - mock_config = MagicMock(spec=Config832) - mock_config.harbor_images832 = { - "recon_image": "mock_recon_image", - "multires_image": "mock_multires_image", - } - return mock_config + """ + Mock the Config832 class to provide necessary configurations. 
+ """ + with patch("orchestration.flows.bl832.nersc.Config832") as MockConfig: + mock_config = MockConfig.return_value + mock_config.harbor_images832 = { + "recon_image": "mock_recon_image", + "multires_image": "mock_multires_image", + } + mock_config.apps = {"als_transfer": "some_config"} + yield mock_config -@pytest.fixture -def mock_controller(mock_nersc_client, mock_config832): - """Fixture to mock the NERSCTomographyHPCController class.""" - with patch("orchestration.flows.bl832.job_controller.get_controller") as mock_get_controller: - mock_get_controller.return_value = MagicMock( - spec=HPC.NERSC, - client=mock_nersc_client, - config=mock_config832 - ) - mock_get_controller.return_value.build_multi_resolution = MagicMock(return_value=True) - mock_get_controller.return_value.reconstruct = MagicMock(return_value=True) - yield mock_get_controller.return_value +# ---------------------------- +# Tests for NERSCTomographyHPCController +# ---------------------------- +def test_reconstruct_success(mock_sfapi_client, mock_config832): + """ + Test successful reconstruction job submission. + """ + from orchestration.flows.bl832.nersc import NERSCTomographyHPCController + from sfapi_client.compute import Machine -def test_nersc_recon_flow_success(mock_controller): - """Test the nersc_recon_flow for a successful run.""" - file_path = "dabramov/20230606_151124_jong-seto_fungal-mycelia_roll-AQ_fungi1_fast.h5" + controller = NERSCTomographyHPCController(client=mock_sfapi_client, config=mock_config832) + file_path = "path/to/file.h5" - with patch("orchestration.flows.bl832.nersc.get_controller", return_value=mock_controller): - result = nersc_recon_flow(file_path=file_path) + with patch("orchestration.flows.bl832.nersc.time.sleep", return_value=None): + result = controller.reconstruct(file_path=file_path) - assert result is True, "nersc_recon_flow should return True for a successful run." + # Verify that compute was called with Machine.perlmutter + mock_sfapi_client.compute.assert_called_once_with(Machine.perlmutter) + # Verify that submit_job was called once + mock_sfapi_client.compute.return_value.submit_job.assert_called_once() -def test_nersc_recon_flow_failure(mock_controller): - """Test the nersc_recon_flow for a failure scenario.""" - file_path = "dabramov/20230606_151124_jong-seto_fungal-mycelia_roll-AQ_fungi1_fast.h5" - mock_controller.build_multi_resolution.return_value = False + # Verify that complete was called on the job + mock_sfapi_client.compute.return_value.submit_job.return_value.complete.assert_called_once() - with patch("orchestration.flows.bl832.nersc.get_controller", return_value=mock_controller): - result = nersc_recon_flow(file_path=file_path) + # Assert that the method returns True + assert result is True, "reconstruct should return True on successful job completion." - assert result is False, "nersc_recon_flow should return False for a failure scenario." +def test_reconstruct_submission_failure(mock_sfapi_client, mock_config832): + """ + Test reconstruction job submission failure. 
+ """ + from orchestration.flows.bl832.nersc import NERSCTomographyHPCController -def test_nersc_client_initialization_error(): - """Test error handling during NERSC client initialization.""" - with patch("orchestration.flows.bl832.nersc.NERSCTomographyHPCController.create_sfapi_client", - side_effect=ValueError("Missing NERSC credentials paths.")): - with pytest.raises(ValueError, match="Missing NERSC credentials paths."): - NERSCTomographyHPCController.create_sfapi_client() + controller = NERSCTomographyHPCController(client=mock_sfapi_client, config=mock_config832) + file_path = "path/to/file.h5" + + # Simulate submission failure + mock_sfapi_client.compute.return_value.submit_job.side_effect = Exception("Submission failed") + + with patch("orchestration.flows.bl832.nersc.time.sleep", return_value=None): + result = controller.reconstruct(file_path=file_path) + + # Assert that the method returns False + assert result is False, "reconstruct should return False on submission failure." + + +def test_build_multi_resolution_success(mock_sfapi_client, mock_config832): + """ + Test successful multi-resolution job submission. + """ + from orchestration.flows.bl832.nersc import NERSCTomographyHPCController + from sfapi_client.compute import Machine + controller = NERSCTomographyHPCController(client=mock_sfapi_client, config=mock_config832) + file_path = "path/to/file.h5" -def test_job_submission(mock_controller): - """Test job submission and status updates.""" - job_script = "mock_job_script" - mock_job = mock_controller.client.perlmutter.submit_job.return_value - job_id = mock_job.jobid + with patch("orchestration.flows.bl832.nersc.time.sleep", return_value=None): + result = controller.build_multi_resolution(file_path=file_path) + + # Verify that compute was called with Machine.perlmutter + mock_sfapi_client.compute.assert_called_once_with(Machine.perlmutter) + + # Verify that submit_job was called once + mock_sfapi_client.compute.return_value.submit_job.assert_called_once() + + # Verify that complete was called on the job + mock_sfapi_client.compute.return_value.submit_job.return_value.complete.assert_called_once() + + # Assert that the method returns True + assert result is True, "build_multi_resolution should return True on successful job completion." + + +def test_build_multi_resolution_submission_failure(mock_sfapi_client, mock_config832): + """ + Test multi-resolution job submission failure. + """ + from orchestration.flows.bl832.nersc import NERSCTomographyHPCController + + controller = NERSCTomographyHPCController(client=mock_sfapi_client, config=mock_config832) + file_path = "path/to/file.h5" + + # Simulate submission failure + mock_sfapi_client.compute.return_value.submit_job.side_effect = Exception("Submission failed") + + with patch("orchestration.flows.bl832.nersc.time.sleep", return_value=None): + result = controller.build_multi_resolution(file_path=file_path) + + # Assert that the method returns False + assert result is False, "build_multi_resolution should return False on submission failure." + + +def test_job_submission(mock_sfapi_client): + """ + Test job submission and status updates. + """ + from orchestration.flows.bl832.nersc import NERSCTomographyHPCController + from sfapi_client.compute import Machine - mock_controller.client.perlmutter.submit_job(job_script) - mock_controller.client.perlmutter.submit_job.assert_called_once_with(job_script) - assert job_id == "12345", "Job ID should match the mock job ID." 
+ controller = NERSCTomographyHPCController(client=mock_sfapi_client, config=MagicMock()) + file_path = "path/to/file.h5" + # Mock Path to extract file and folder names + with patch.object(Path, 'parent', new_callable=MagicMock) as mock_parent, \ + patch.object(Path, 'stem', new_callable=MagicMock) as mock_stem: + mock_parent.name = "to" + mock_stem.return_value = "file" -def test_job_recovery(mock_controller): - """Test recovery of a failed or lost job.""" - mock_job = mock_controller.client.perlmutter.job.return_value - mock_job.complete = MagicMock() + with patch("orchestration.flows.bl832.nersc.time.sleep", return_value=None): + controller.reconstruct(file_path=file_path) - mock_controller.client.perlmutter.job.side_effect = [ - FileNotFoundError("Job not found: 12345"), - mock_job - ] + # Verify that compute was called with Machine.perlmutter + mock_sfapi_client.compute.assert_called_once_with(Machine.perlmutter) - with patch("time.sleep", return_value=None): - recon_result = mock_controller.reconstruct(file_path="mock_file_path") - multires_result = mock_controller.build_multi_resolution(file_path="mock_file_path") + # Verify that submit_job was called once + mock_sfapi_client.compute.return_value.submit_job.assert_called_once() - assert recon_result is True, "Job recovery should succeed." - assert multires_result is True, "Job recovery should succeed." + # Verify the returned job has the expected attributes + submitted_job = mock_sfapi_client.compute.return_value.submit_job.return_value + assert submitted_job.jobid == "12345", "Job ID should match the mock job ID." + assert submitted_job.state == "COMPLETED", "Job state should be COMPLETED." diff --git a/orchestration/flows/bl832/job_controller.py b/orchestration/flows/bl832/job_controller.py index f985f29..1adf1bd 100644 --- a/orchestration/flows/bl832/job_controller.py +++ b/orchestration/flows/bl832/job_controller.py @@ -2,7 +2,6 @@ from dotenv import load_dotenv from enum import Enum import logging -from typing import Optional from orchestration.flows.bl832.config import Config832 @@ -21,7 +20,7 @@ class TomographyHPCController(ABC): """ def __init__( self, - Config832: Optional[Config832] = None + config: Config832 ) -> None: pass @@ -63,7 +62,10 @@ class HPC(Enum): NERSC = "NERSC" -def get_controller(hpc_type: HPC) -> TomographyHPCController: +def get_controller( + hpc_type: HPC, + config: Config832 +) -> TomographyHPCController: """ Factory function that returns an HPC controller instance for the given HPC environment. 
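# Usage sketch for the two-argument factory introduced in this patch
# (assumes the import paths shown in this diff; the file path below is a
# hypothetical example, not a real dataset):
from orchestration.flows.bl832.config import Config832
from orchestration.flows.bl832.job_controller import HPC, get_controller

config = Config832()
controller = get_controller(hpc_type=HPC.NERSC, config=config)
# Both steps report success as a bool, matching nersc_recon_flow's checks.
ok = controller.reconstruct(file_path="example_folder/example_scan.h5")
ok = ok and controller.build_multi_resolution(file_path="example_folder/example_scan.h5")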
@@ -74,13 +76,17 @@ def get_controller(hpc_type: HPC) -> TomographyHPCController: if not isinstance(hpc_type, HPC): raise ValueError(f"Invalid HPC type provided: {hpc_type}") + if not config: + raise ValueError("Config object is required.") + if hpc_type == HPC.ALCF: from orchestration.flows.bl832.alcf import ALCFTomographyHPCController return ALCFTomographyHPCController() elif hpc_type == HPC.NERSC: from orchestration.flows.bl832.nersc import NERSCTomographyHPCController return NERSCTomographyHPCController( - NERSCTomographyHPCController.create_sfapi_client() + client=NERSCTomographyHPCController.create_sfapi_client(), + config=config ) else: raise ValueError(f"Unsupported HPC type: {hpc_type}") diff --git a/orchestration/flows/bl832/nersc.py b/orchestration/flows/bl832/nersc.py index 24c7d90..a5ba767 100644 --- a/orchestration/flows/bl832/nersc.py +++ b/orchestration/flows/bl832/nersc.py @@ -6,7 +6,6 @@ from prefect import flow import re import time -from typing import Optional from authlib.jose import JsonWebKey from sfapi_client import Client @@ -30,16 +29,13 @@ class NERSCTomographyHPCController(TomographyHPCController): def __init__( self, - client: Client = None, - config: Optional[Config832] = None + client: Client, + config: Config832 ) -> None: self.client = client + self.config = config - if not config: - self.config = Config832() - else: - self.config = config - + @staticmethod def create_sfapi_client() -> Client: """Create and return an NERSC client instance""" @@ -191,8 +187,6 @@ def build_multi_resolution( image_name = self.config.harbor_images832["multires_image"] - # TODO: fix these paths - path = Path(file_path) folder_name = path.parent.name file_name = path.stem @@ -270,6 +264,7 @@ def build_multi_resolution( @flow(name="nersc_recon_flow") def nersc_recon_flow( file_path: str, + config: Config832, ) -> bool: """ Perform tomography reconstruction on NERSC. 
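# Hypothetical skeleton (not part of this series) showing the contract the
# factory expects from any additional facility: take an injected Config832
# and implement the two abstract methods, returning True on success.
from orchestration.flows.bl832.config import Config832
from orchestration.flows.bl832.job_controller import TomographyHPCController

class ExampleHPCController(TomographyHPCController):
    def __init__(self, config: Config832) -> None:
        self.config = config

    def reconstruct(self, file_path: str) -> bool:
        # Submit a reconstruction job for file_path at the new facility.
        return True

    def build_multi_resolution(self, file_path: str) -> bool:
        # Build the multi-resolution (Zarr) copy of the results.
        return True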
@@ -279,7 +274,10 @@ def nersc_recon_flow( # To do: Implement file transfers, pruning, and other necessary steps - controller = get_controller(HPC.NERSC) + controller = get_controller( + hpc_type=HPC.NERSC, + config=config + ) nersc_reconstruction_success = controller.reconstruct( file_path=file_path, ) @@ -294,4 +292,7 @@ def nersc_recon_flow( if __name__ == "__main__": - nersc_recon_flow(file_path="dabramov/20230606_151124_jong-seto_fungal-mycelia_roll-AQ_fungi1_fast.h5") + nersc_recon_flow( + file_path="dabramov/20230606_151124_jong-seto_fungal-mycelia_roll-AQ_fungi1_fast.h5", + config=Config832() + ) diff --git a/orchestration/nersc.py b/orchestration/nersc.py index e432ec4..e751281 100644 --- a/orchestration/nersc.py +++ b/orchestration/nersc.py @@ -1,3 +1,7 @@ +''' +DEPRECATION WARNING: NerscClient is deprecated and will be removed when we refactor the ptychography code +''' + import json import logging # from pathlib import Path @@ -18,6 +22,9 @@ class NerscClient(Client): + ''' + DEPRECATION WARNING: NerscClient is deprecated and will be removed when we refactor the ptychography code + ''' def __init__( self, path_client_id, @@ -69,7 +76,7 @@ def init_client_info( ): self.get_client_id() self.get_private_key() - + def init_directory_paths(self): self.home_path = f"/global/homes/{self.user().name[0]}/{self.user().name}" self.scratch_path = f"/pscratch/sd/{self.user().name[0]}/{self.user().name}" @@ -86,7 +93,7 @@ def update_job_id(self): def update_job_state(self): self.request_job_status() self.job_state = self.job.state - + if self.job_state == "RUNNING": self.has_ran = True elif self.job_state == "COMPLETE": @@ -105,4 +112,3 @@ def submit_job(self, job_script): self.update_job_id() # self.update_job_state() self.logger.info(f"Submitted job id: {self.jobid}") - From 223c3b47e9ec0384a1ac93898b55d1eceec4df0d Mon Sep 17 00:00:00 2001 From: David Abramov Date: Fri, 20 Dec 2024 13:30:15 -0800 Subject: [PATCH 15/23] Updated NERSC job script to pull from ghcr.io image instead of Harbor with up to date python installations --- config.yml | 4 ++++ orchestration/flows/bl832/config.py | 2 +- orchestration/flows/bl832/nersc.py | 32 +++++++---------------------- 3 files changed, 12 insertions(+), 26 deletions(-) diff --git a/config.yml b/config.yml index c6cdbf2..933764e 100644 --- a/config.yml +++ b/config.yml @@ -99,6 +99,10 @@ harbor_images832: recon_image: tomorecon_nersc_mpi_hdf5@sha256:cc098a2cfb6b1632ea872a202c66cb7566908da066fd8f8c123b92fa95c2a43c multires_image: tomorecon_nersc_mpi_hdf5@sha256:cc098a2cfb6b1632ea872a202c66cb7566908da066fd8f8c123b92fa95c2a43c +ghcr_images832: + recon_image: ghcr.io/als-computing/microct:master + multires_image: ghcr.io/als-computing/microct:master + prefect: deployments: - type_spec: new_file_832 diff --git a/orchestration/flows/bl832/config.py b/orchestration/flows/bl832/config.py index 148d58b..586b1af 100644 --- a/orchestration/flows/bl832/config.py +++ b/orchestration/flows/bl832/config.py @@ -21,4 +21,4 @@ def __init__(self) -> None: self.alcf832_raw = self.endpoints["alcf832_raw"] self.alcf832_scratch = self.endpoints["alcf832_scratch"] self.scicat = config["scicat"] - self.harbor_images832 = config["harbor_images832"] + self.ghcr_images832 = config["ghcr_images832"] diff --git a/orchestration/flows/bl832/nersc.py b/orchestration/flows/bl832/nersc.py index a5ba767..3a9d559 100644 --- a/orchestration/flows/bl832/nersc.py +++ b/orchestration/flows/bl832/nersc.py @@ -74,10 +74,6 @@ def reconstruct( """ logger.info("Starting NERSC reconstruction 
process.") - # Can't use this long term in production. Need to find a better way to handle credentials. - # Want to run this as the alsdev user - # username = os.getenv("NERSC_USERNAME") - # password = os.getenv("NERSC_PASSWORD") user = self.client.user() home_path = f"/global/homes/{user.name[0]}/{user.name}" @@ -85,7 +81,8 @@ def reconstruct( logger.info(home_path) logger.info(scratch_path) - image_name = self.config.harbor_images832["recon_image"] + image_name = self.config.ghcr_images832["recon_image"] + logger.info(image_name) path = Path(file_path) folder_name = path.parent.name @@ -101,12 +98,6 @@ def reconstruct( # Note: If q=debug, there is no minimum time limit # However, if q=preempt, there is a minimum time limit of 2 hours. Otherwise the job won't run. - # If the image has not been pulled before, - # then you must login to Harbor first (hopefully we can get a robot account) - # Looking into using github actions to build the image and host it on on github instead - # srun podman-hpc login registry.nersc.gov --username {username} --password {password} -# SBATCH -q debug - job_script = f"""#!/bin/bash #SBATCH -q debug #SBATCH -A als @@ -125,9 +116,8 @@ def reconstruct( --volume {home_path}/tomo_recon_repo/microct/legacy/sfapi_reconstruction.py:/alsuser/sfapi_reconstruction.py \ --volume {scratch_path}/microctdata:/alsdata \ --volume {scratch_path}/microctdata:/alsuser/ \ -registry.nersc.gov/als/{image_name} \ -bash -c "python -m pip install numpy==1.23.2 && \ -python sfapi_reconstruction.py {file_name} {folder_name}" +{image_name} \ +bash -c "python sfapi_reconstruction.py {file_name} {folder_name}" date """ @@ -175,9 +165,6 @@ def build_multi_resolution( ) -> bool: """Use NERSC to make multiresolution version of tomography results.""" - # username = os.getenv("NERSC_USERNAME") - # password = os.getenv("NERSC_PASSWORD") - user = self.client.user() home_path = f"/global/homes/{user.name[0]}/{user.name}" @@ -185,7 +172,7 @@ def build_multi_resolution( logger.info(home_path) logger.info(scratch_path) - image_name = self.config.harbor_images832["multires_image"] + image_name = self.config.ghcr_images832["multires_image"] path = Path(file_path) folder_name = path.parent.name @@ -194,9 +181,6 @@ def build_multi_resolution( recon_path = f"scratch/{folder_name}/rec{file_name}/" raw_path = f"{folder_name}/{file_name}.h5" - # Need to update this script: - # rebuild image with dependencies - # IMPORTANT: job script must be deindented to the leftmost column or it will fail immediately job_script = f"""#!/bin/bash #SBATCH -q debug @@ -216,10 +200,8 @@ def build_multi_resolution( --volume {home_path}/tomo_recon_repo/microct/legacy/input.txt:/alsuser/input.txt \ --volume {scratch_path}/microctdata:/alsdata \ --volume {scratch_path}/microctdata:/alsuser/ \ -registry.nersc.gov/als/{image_name} \ -bash -c "python -m pip show ngff_zarr || python -m pip install ngff_zarr && \ -python -m pip show dask_image || python -m pip install dask_image && \ -python tiff_to_zarr.py {recon_path} --raw_file {raw_path}" +{image_name} \ +bash -c "python tiff_to_zarr.py {recon_path} --raw_file {raw_path}" date """ From 79f128c0abda2d9874f0187da5c3170120e3b1f5 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Tue, 14 Jan 2025 12:13:58 -0800 Subject: [PATCH 16/23] Updated config to include location of reconstruction scripts on NERSC. Updated nersc.py to use the realtime queue, which now works with minimal delay. 
Updated recon/multires jobs to use pscratch for temporary file storage/writing, after copying data from the source at /global/cfs/.../8.3.2/raw/. The next steps include copying the reconstructions to the appropriate /global/cfs/.../8.3.2/scratch/ directory, copying to data832, and scheduling pruning. --- config.yml | 12 +-- orchestration/flows/bl832/config.py | 2 +- orchestration/flows/bl832/nersc.py | 134 +++++++++++++++++++++------- scripts/cancel_sfapi_job.py | 38 ++++++++ 4 files changed, 147 insertions(+), 39 deletions(-) create mode 100644 scripts/cancel_sfapi_job.py diff --git a/config.yml b/config.yml index 933764e..1cc1781 100644 --- a/config.yml +++ b/config.yml @@ -48,12 +48,6 @@ globus: uuid: 9032dd3a-e841-4687-a163-2720da731b5b name: alcf_home832 - nersc_test: - root_path: /global/cfs/cdirs/als/data_mover/share/dabramov - uri: nersc.gov - uuid: d40248e6-d874-4f7b-badd-2c06c16f1a58 - name: nersc_test - nersc_alsdev: root_path: /global/homes/a/alsdev/test_directory/ uri: nersc.gov @@ -72,6 +66,12 @@ globus: uuid: d40248e6-d874-4f7b-badd-2c06c16f1a58 name: nersc832_alsdev_scratch + nersc832_alsdev_recon_scripts: + root_path: /global/cfs/cdirs/als/data_mover/8.3.2/tomography_reconstruction_scripts + uri: nersc.gov + uuid: d40248e6-d874-4f7b-badd-2c06c16f1a58 + name: nersc832_alsdev_recon_scripts + nersc832: root_path: /global/cfs/cdirs/als/data_mover/8.3.2 uri: nersc.gov diff --git a/orchestration/flows/bl832/config.py b/orchestration/flows/bl832/config.py index 586b1af..5a6e8ab 100644 --- a/orchestration/flows/bl832/config.py +++ b/orchestration/flows/bl832/config.py @@ -14,10 +14,10 @@ def __init__(self) -> None: self.data832_raw = self.endpoints["data832_raw"] self.data832_scratch = self.endpoints["data832_scratch"] self.nersc832 = self.endpoints["nersc832"] - self.nersc_test = self.endpoints["nersc_test"] self.nersc_alsdev = self.endpoints["nersc_alsdev"] self.nersc832_alsdev_raw = self.endpoints["nersc832_alsdev_raw"] self.nersc832_alsdev_scratch = self.endpoints["nersc832_alsdev_scratch"] + self.nersc832_alsdev_recon_scripts = self.endpoints["nersc832_alsdev_recon_scripts"] self.alcf832_raw = self.endpoints["alcf832_raw"] self.alcf832_scratch = self.endpoints["alcf832_scratch"] self.scicat = config["scicat"] diff --git a/orchestration/flows/bl832/nersc.py b/orchestration/flows/bl832/nersc.py index 3a9d559..b52dad3 100644 --- a/orchestration/flows/bl832/nersc.py +++ b/orchestration/flows/bl832/nersc.py @@ -13,7 +13,7 @@ from orchestration.flows.bl832.config import Config832 from orchestration.flows.bl832.job_controller import get_controller, HPC, TomographyHPCController - +# from orchestration.prefect import schedule_prefect_flow logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -76,14 +76,22 @@ def reconstruct( user = self.client.user() - home_path = f"/global/homes/{user.name[0]}/{user.name}" - scratch_path = f"/pscratch/sd/{user.name[0]}/{user.name}" - logger.info(home_path) - logger.info(scratch_path) + raw_path = self.config.nersc832_alsdev_raw.root_path + + recon_image = self.config.ghcr_images832["recon_image"] + logger.info(f"{recon_image=}") + + recon_scripts_dir = self.config.nersc832_alsdev_recon_scripts.root_path + logger.info(f"{recon_scripts_dir=}") - image_name = self.config.ghcr_images832["recon_image"] + scratch_path = self.config.nersc832_alsdev_scratch.root_path + logger.info(f"{scratch_path=}") + + # home_path = f"/global/homes/{user.name[0]}/{user.name}" + pscratch_path = f"/pscratch/sd/{user.name[0]}/{user.name}" + # 
logger.info(home_path) + # logger.info(scratch_path) - logger.info(image_name) path = Path(file_path) folder_name = path.parent.name if not folder_name: @@ -99,12 +107,12 @@ def reconstruct( # However, if q=preempt, there is a minimum time limit of 2 hours. Otherwise the job won't run. job_script = f"""#!/bin/bash -#SBATCH -q debug +#SBATCH -q realtime #SBATCH -A als #SBATCH -C cpu -#SBATCH --job-name=tomo_recon_test-0 -#SBATCH --output={scratch_path}/nerscClient-test/%x_%j.out -#SBATCH --error={scratch_path}/nerscClient-test/%x_%j.err +#SBATCH --job-name=tomo_recon_{folder_name}_{file_name} +#SBATCH --output={pscratch_path}/tomo_recon_logs/%x_%j.out +#SBATCH --error={pscratch_path}/tomo_recon_logs/%x_%j.err #SBATCH -N 1 #SBATCH --ntasks-per-node 1 #SBATCH --cpus-per-task 64 @@ -112,11 +120,28 @@ def reconstruct( #SBATCH --exclusive date +echo "Creating directory {pscratch_path}/bl832/raw/{folder_name}" +mkdir -p {pscratch_path}/bl832/raw/{folder_name} +mkdir -p {pscratch_path}/bl832/scratch/{folder_name} + +echo "Copying file {raw_path}/{folder_name}/{file_name} to {pscratch_path}/bl832/raw/{folder_name}/" +cp {raw_path}/{folder_name}/{file_name} {pscratch_path}/bl832/raw/{folder_name} +if [ $? -ne 0 ]; then + echo "Failed to copy data to pscratch." + exit 1 +fi + +chmod -R 2775 {pscratch_path}/bl832 + +echo "Verifying copied files..." +ls -l {pscratch_path}/bl832/raw/{folder_name}/ + +echo "Running reconstruction container..." srun podman-hpc run \ ---volume {home_path}/tomo_recon_repo/microct/legacy/sfapi_reconstruction.py:/alsuser/sfapi_reconstruction.py \ ---volume {scratch_path}/microctdata:/alsdata \ ---volume {scratch_path}/microctdata:/alsuser/ \ -{image_name} \ +--volume {recon_scripts_dir}/sfapi_reconstruction.py:/alsuser/sfapi_reconstruction.py \ +--volume {pscratch_path}/bl832:/alsdata \ +--volume {pscratch_path}/bl832:/alsuser/ \ +{recon_image} \ bash -c "python sfapi_reconstruction.py {file_name} {folder_name}" date """ @@ -165,30 +190,40 @@ def build_multi_resolution( ) -> bool: """Use NERSC to make multiresolution version of tomography results.""" + logger.info("Starting NERSC multiresolution process.") + user = self.client.user() - home_path = f"/global/homes/{user.name[0]}/{user.name}" - scratch_path = f"/pscratch/sd/{user.name[0]}/{user.name}" - logger.info(home_path) - logger.info(scratch_path) + multires_image = self.config.ghcr_images832["multires_image"] + logger.info(f"{multires_image=}") - image_name = self.config.ghcr_images832["multires_image"] + recon_scripts_dir = self.config.nersc832_alsdev_recon_scripts.root_path + logger.info(f"{recon_scripts_dir=}") + + scratch_path = self.config.nersc832_alsdev_scratch.root_path + logger.info(f"{scratch_path=}") + + pscratch_path = f"/pscratch/sd/{user.name[0]}/{user.name}" + logger.info(f"{pscratch_path=}") path = Path(file_path) folder_name = path.parent.name file_name = path.stem recon_path = f"scratch/{folder_name}/rec{file_name}/" - raw_path = f"{folder_name}/{file_name}.h5" + logger.info(f"{recon_path=}") + + raw_path = f"raw/{folder_name}/{file_name}.h5" + logger.info(f"{raw_path=}") # IMPORTANT: job script must be deindented to the leftmost column or it will fail immediately job_script = f"""#!/bin/bash -#SBATCH -q debug +#SBATCH -q realtime #SBATCH -A als #SBATCH -C cpu -#SBATCH --job-name=tomo_multires_test-0 -#SBATCH --output={scratch_path}/nerscClient-test/%x_%j.out -#SBATCH --error={scratch_path}/nerscClient-test/%x_%j.err +#SBATCH --job-name=tomo_multires_{folder_name}_{file_name} +#SBATCH 
--output={pscratch_path}/tomo_recon_logs/%x_%j.out +#SBATCH --error={pscratch_path}/tomo_recon_logs/%x_%j.err #SBATCH -N 1 #SBATCH --ntasks-per-node 1 #SBATCH --cpus-per-task 64 @@ -196,15 +231,18 @@ def build_multi_resolution( #SBATCH --exclusive date -srun podman-hpc run --volume {home_path}/tomo_recon_repo/microct/legacy/tiff_to_zarr.py:/alsuser/tiff_to_zarr.py \ ---volume {home_path}/tomo_recon_repo/microct/legacy/input.txt:/alsuser/input.txt \ ---volume {scratch_path}/microctdata:/alsdata \ ---volume {scratch_path}/microctdata:/alsuser/ \ -{image_name} \ + +echo "Running multires container..." +srun podman-hpc run \ +--volume {recon_scripts_dir}/tiff_to_zarr.py:/alsuser/tiff_to_zarr.py \ +--volume {pscratch_path}/bl832:/alsdata \ +--volume {pscratch_path}/bl832:/alsuser/ \ +{multires_image} \ bash -c "python tiff_to_zarr.py {recon_path} --raw_file {raw_path}" date """ + try: logger.info("Submitting Tiff to Zarr job script to Perlmutter.") perlmutter = self.client.compute(Machine.perlmutter) @@ -254,8 +292,6 @@ def nersc_recon_flow( :param file_path: Path to the file to reconstruct. """ - # To do: Implement file transfers, pruning, and other necessary steps - controller = get_controller( hpc_type=HPC.NERSC, config=config @@ -267,6 +303,40 @@ def nersc_recon_flow( file_path=file_path, ) + nersc_reconstruction_success = True + + # TODO: Transfer reconstructed files from pscratch to /global/cfs/...8.3.2/scratch/... + + # TODO: Transfer files to data832 + + # TODO: Schedule pruning + # data832/scratch : 14 days + # nersc/pscratch : 1 day + # nersc832/scratch : never? + + # source_endpoint = config.data832_scratch + # check_endpoint = config.nersc832_alsdev_scratch + # location = "data832_scratch" + # schedule_days = 35 + # try: + # flow_name = f"delete {location}: {Path(file_path).name}" + # schedule_prefect_flow( + # deployment_name=f"prune_{location}/prune_{location}", + # flow_run_name=flow_name, + # parameters={ + # "relative_path": file_path, + # "source_endpoint": source_endpoint, + # "check_endpoint": check_endpoint + # }, + # duration_from_now=schedule_days + # ) + # return True + # except Exception as e: + # logger.error(f"Failed to schedule prune task: {e}") + # return False + + # TODO: Ingest into SciCat + if nersc_reconstruction_success and nersc_multi_res_success: return True else: diff --git a/scripts/cancel_sfapi_job.py b/scripts/cancel_sfapi_job.py new file mode 100644 index 0000000..53dec05 --- /dev/null +++ b/scripts/cancel_sfapi_job.py @@ -0,0 +1,38 @@ +from dotenv import load_dotenv +import json +import logging +import os + +from authlib.jose import JsonWebKey +from sfapi_client import Client +from sfapi_client.compute import Machine + + +load_dotenv() +logger = logging.getLogger(__name__) + +client_id_path = os.getenv("PATH_NERSC_CLIENT_ID") +client_secret_path = os.getenv("PATH_NERSC_PRI_KEY") + +if not client_id_path or not client_secret_path: + logger.error("NERSC credentials paths are missing.") + raise ValueError("Missing NERSC credentials paths.") +if not os.path.isfile(client_id_path) or not os.path.isfile(client_secret_path): + logger.error("NERSC credential files are missing.") + raise FileNotFoundError("NERSC credential files are missing.") + +client_id = None +client_secret = None +with open(client_id_path, "r") as f: + client_id = f.read() + +with open(client_secret_path, "r") as f: + client_secret = JsonWebKey.import_key(json.loads(f.read())) + +with Client(client_id, client_secret) as client: + perlmutter = client.compute(Machine.perlmutter) + # job = 
perlmutter.submit_job(job_path) + jobs = perlmutter.jobs(user="dabramov") + for job in jobs: + logger.info(f"Cancelling job: {job.jobid}") + job.cancel() From fae190ff07c0a5be8bdd0a6c87767b9ddf5874d6 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Tue, 14 Jan 2025 16:12:28 -0800 Subject: [PATCH 17/23] Update pruning code in nersc.py, although it still needs testing. Added nersc832_alsdev_pscratch endpoint in the config, and added a nersc_prune_pool and prefect deployment. --- config.yml | 6 + create_deployments_832_nersc.sh | 7 ++ orchestration/flows/bl832/config.py | 1 + orchestration/flows/bl832/nersc.py | 167 ++++++++++++++++++++-------- orchestration/flows/bl832/prune.py | 14 +++ 5 files changed, 149 insertions(+), 46 deletions(-) diff --git a/config.yml b/config.yml index 1cc1781..36e5e56 100644 --- a/config.yml +++ b/config.yml @@ -66,6 +66,12 @@ globus: uuid: d40248e6-d874-4f7b-badd-2c06c16f1a58 name: nersc832_alsdev_scratch + nersc832_alsdev_pscratch: + root_path: /pscratch/sd/a/alsdev/8.3.2 + uri: nersc.gov + uuid: d40248e6-d874-4f7b-badd-2c06c16f1a58 + name: nersc832_alsdev_pscratch + nersc832_alsdev_recon_scripts: root_path: /global/cfs/cdirs/als/data_mover/8.3.2/tomography_reconstruction_scripts uri: nersc.gov diff --git a/create_deployments_832_nersc.sh b/create_deployments_832_nersc.sh index 4fd437f..822d12a 100755 --- a/create_deployments_832_nersc.sh +++ b/create_deployments_832_nersc.sh @@ -2,9 +2,16 @@ export $(grep -v '^#' .env | xargs) # create 'nersc_flow_pool' prefect work-pool create 'nersc_flow_pool' +prefect work-pool create 'nersc_prune_pool' # nersc_flow_pool # in docker-compose.yaml: # command: prefect agent start --pool "nersc_flow_pool" prefect deployment build ./orchestration/flows/bl832/nersc.py:nersc_recon_flow -n nersc_recon_flow -p nersc_flow_pool -q nersc_recon_flow_queue prefect deployment apply nersc_recon_flow-deployment.yaml + +# alcf_prune_pool + # in docker-compose.yaml: + # command: prefect agent start --pool "nersc_prune_pool" +prefect deployment build ./orchestration/flows/bl832/prune.py:prune_alcf832_raw -n prune_nersc832_pscratch -p nersc_prune_pool -q prune_nersc832_pscratch_queue +prefect deployment apply prune_nersc832_pscratch-deployment.yaml diff --git a/orchestration/flows/bl832/config.py b/orchestration/flows/bl832/config.py index 5a6e8ab..a427d80 100644 --- a/orchestration/flows/bl832/config.py +++ b/orchestration/flows/bl832/config.py @@ -17,6 +17,7 @@ def __init__(self) -> None: self.nersc_alsdev = self.endpoints["nersc_alsdev"] self.nersc832_alsdev_raw = self.endpoints["nersc832_alsdev_raw"] self.nersc832_alsdev_scratch = self.endpoints["nersc832_alsdev_scratch"] + self.nersc832_alsdev_pscratch = self.endpoints["nersc832_alsdev_pscratch"] self.nersc832_alsdev_recon_scripts = self.endpoints["nersc832_alsdev_recon_scripts"] self.alcf832_raw = self.endpoints["alcf832_raw"] self.alcf832_scratch = self.endpoints["alcf832_scratch"] diff --git a/orchestration/flows/bl832/nersc.py b/orchestration/flows/bl832/nersc.py index b52dad3..bbce721 100644 --- a/orchestration/flows/bl832/nersc.py +++ b/orchestration/flows/bl832/nersc.py @@ -1,25 +1,57 @@ +import datetime from dotenv import load_dotenv import json import logging import os from pathlib import Path -from prefect import flow import re import time from authlib.jose import JsonWebKey +from globus_sdk import TransferClient +from prefect import flow, task +from prefect.blocks.system import JSON from sfapi_client import Client from sfapi_client.compute import Machine from 
orchestration.flows.bl832.config import Config832 from orchestration.flows.bl832.job_controller import get_controller, HPC, TomographyHPCController -# from orchestration.prefect import schedule_prefect_flow +from orchestration.globus.transfer import GlobusEndpoint, start_transfer +from orchestration.prefect import schedule_prefect_flow logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) load_dotenv() +@task(name="transfer_data_at_nersc") +def transfer_data_at_nersc( + file_path: str, + transfer_client: TransferClient, + nersc_source: GlobusEndpoint, + nersc_destination: GlobusEndpoint, +): + # if source_file begins with "/", it will mess up os.path.join + if file_path[0] == "/": + file_path = file_path[1:] + source_path = os.path.join(nersc_source.root_path, file_path) + dest_path = os.path.join(nersc_destination.root_path, file_path) + + logger.info(f"Transferring {dest_path} data832 to nersc") + + success = start_transfer( + transfer_client, + nersc_source, + source_path, + nersc_destination, + dest_path, + max_wait_seconds=600, + logger=logger, + ) + + return success + + class NERSCTomographyHPCController(TomographyHPCController): """ Implementation for a NERSC-based tomography HPC controller. @@ -77,6 +109,7 @@ def reconstruct( user = self.client.user() raw_path = self.config.nersc832_alsdev_raw.root_path + logger.info(f"{raw_path=}") recon_image = self.config.ghcr_images832["recon_image"] logger.info(f"{recon_image=}") @@ -87,10 +120,8 @@ def reconstruct( scratch_path = self.config.nersc832_alsdev_scratch.root_path logger.info(f"{scratch_path=}") - # home_path = f"/global/homes/{user.name[0]}/{user.name}" pscratch_path = f"/pscratch/sd/{user.name[0]}/{user.name}" - # logger.info(home_path) - # logger.info(scratch_path) + logger.info(f"{pscratch_path=}") path = Path(file_path) folder_name = path.parent.name @@ -120,27 +151,27 @@ def reconstruct( #SBATCH --exclusive date -echo "Creating directory {pscratch_path}/bl832/raw/{folder_name}" -mkdir -p {pscratch_path}/bl832/raw/{folder_name} -mkdir -p {pscratch_path}/bl832/scratch/{folder_name} +echo "Creating directory {pscratch_path}/8.3.2/raw/{folder_name}" +mkdir -p {pscratch_path}/8.3.2/raw/{folder_name} +mkdir -p {pscratch_path}/8.3.2/scratch/{folder_name} -echo "Copying file {raw_path}/{folder_name}/{file_name} to {pscratch_path}/bl832/raw/{folder_name}/" -cp {raw_path}/{folder_name}/{file_name} {pscratch_path}/bl832/raw/{folder_name} +echo "Copying file {raw_path}/{folder_name}/{file_name} to {pscratch_path}/8.3.2/raw/{folder_name}/" +cp {raw_path}/{folder_name}/{file_name} {pscratch_path}/8.3.2/raw/{folder_name} if [ $? -ne 0 ]; then echo "Failed to copy data to pscratch." exit 1 fi -chmod -R 2775 {pscratch_path}/bl832 +chmod -R 2775 {pscratch_path}/8.3.2 echo "Verifying copied files..." -ls -l {pscratch_path}/bl832/raw/{folder_name}/ +ls -l {pscratch_path}/8.3.2/raw/{folder_name}/ echo "Running reconstruction container..." srun podman-hpc run \ --volume {recon_scripts_dir}/sfapi_reconstruction.py:/alsuser/sfapi_reconstruction.py \ ---volume {pscratch_path}/bl832:/alsdata \ ---volume {pscratch_path}/bl832:/alsuser/ \ +--volume {pscratch_path}/8.3.2:/alsdata \ +--volume {pscratch_path}/8.3.2:/alsuser/ \ {recon_image} \ bash -c "python sfapi_reconstruction.py {file_name} {folder_name}" date @@ -235,14 +266,13 @@ def build_multi_resolution( echo "Running multires container..." 
srun podman-hpc run \ --volume {recon_scripts_dir}/tiff_to_zarr.py:/alsuser/tiff_to_zarr.py \ ---volume {pscratch_path}/bl832:/alsdata \ ---volume {pscratch_path}/bl832:/alsuser/ \ +--volume {pscratch_path}/8.3.2:/alsdata \ +--volume {pscratch_path}/8.3.2:/alsuser/ \ {multires_image} \ bash -c "python tiff_to_zarr.py {recon_path} --raw_file {raw_path}" date """ - try: logger.info("Submitting Tiff to Zarr job script to Perlmutter.") perlmutter = self.client.compute(Machine.perlmutter) @@ -259,6 +289,29 @@ def build_multi_resolution( job.complete() # Wait until the job completes logger.info("Reconstruction job completed successfully.") + + # defining this GlobusEndpoint here rather than config.yaml since the root path depends on the SFAPI user name + # using the same uuid from another endpoint because it's the same alsdev collection + nersc832_pscratch_endpoint = GlobusEndpoint( + uuid=self.config.nersc832_alsdev_scratch.uuid, + uri=self.config.nersc832_alsdev_scratch.uri, + root_path=f"{pscratch_path}/scratch", + name="nersc832_pscratch" + ) + + # Working on a permission denied error when transferring + relative_recon_path = os.path.relpath(recon_path, "scratch") + transfer_data_at_nersc( + file_path=relative_recon_path, + transfer_client=self.config.tc, + nersc_source=nersc832_pscratch_endpoint, + nersc_destination=self.config.nersc832_alsdev_scratch) + transfer_data_at_nersc( + file_path=f"{relative_recon_path}.zarr", + transfer_client=self.config.tc, + nersc_source=nersc832_pscratch_endpoint, + nersc_destination=self.config.nersc832_alsdev_scratch + ) return True except Exception as e: @@ -281,6 +334,56 @@ def build_multi_resolution( return False +def schedule_pruning(config: Config832, file_path: str) -> bool: + # data832/scratch : 14 days + # nersc/pscratch : 1 day + # nersc832/scratch : never? + + pruning_config = JSON.load("pruning-config").value + data832_delay = datetime.timedelta(days=pruning_config["delete_data832_files_after_days"]) + nersc832_delay = datetime.timedelta(days=pruning_config["delete_nersc832_files_after_days"]) + + # Delete from data832_scratch + try: + source_endpoint = config.data832_scratch + check_endpoint = config.nersc832_alsdev_scratch + location = "data832_scratch" + + flow_name = f"delete {location}: {Path(file_path).name}" + schedule_prefect_flow( + deployment_name=f"prune_{location}/prune_{location}", + flow_run_name=flow_name, + parameters={ + "relative_path": file_path, + "source_endpoint": source_endpoint, + "check_endpoint": check_endpoint + }, + duration_from_now=data832_delay + ) + except Exception as e: + logger.error(f"Failed to schedule prune task: {e}") + + # Delete from nersc832_pscratch + try: + source_endpoint = config.nersc832_alsdev_pscratch + check_endpoint = None + location = "nersc832_alsdev_pscratch" + + flow_name = f"delete {location}: {Path(file_path).name}" + schedule_prefect_flow( + deployment_name=f"prune_{location}/prune_{location}", + flow_run_name=flow_name, + parameters={ + "relative_path": file_path, + "source_endpoint": source_endpoint, + "check_endpoint": check_endpoint + }, + duration_from_now=nersc832_delay + ) + except Exception as e: + logger.error(f"Failed to schedule prune task: {e}") + + @flow(name="nersc_recon_flow") def nersc_recon_flow( file_path: str, @@ -303,37 +406,9 @@ def nersc_recon_flow( file_path=file_path, ) - nersc_reconstruction_success = True - - # TODO: Transfer reconstructed files from pscratch to /global/cfs/...8.3.2/scratch/... 
- # TODO: Transfer files to data832 - # TODO: Schedule pruning - # data832/scratch : 14 days - # nersc/pscratch : 1 day - # nersc832/scratch : never? - - # source_endpoint = config.data832_scratch - # check_endpoint = config.nersc832_alsdev_scratch - # location = "data832_scratch" - # schedule_days = 35 - # try: - # flow_name = f"delete {location}: {Path(file_path).name}" - # schedule_prefect_flow( - # deployment_name=f"prune_{location}/prune_{location}", - # flow_run_name=flow_name, - # parameters={ - # "relative_path": file_path, - # "source_endpoint": source_endpoint, - # "check_endpoint": check_endpoint - # }, - # duration_from_now=schedule_days - # ) - # return True - # except Exception as e: - # logger.error(f"Failed to schedule prune task: {e}") - # return False + schedule_pruning(config=config, file_path=file_path) # TODO: Ingest into SciCat diff --git a/orchestration/flows/bl832/prune.py b/orchestration/flows/bl832/prune.py index 44b87be..8d87c0f 100644 --- a/orchestration/flows/bl832/prune.py +++ b/orchestration/flows/bl832/prune.py @@ -143,5 +143,19 @@ def prune_nersc832_alsdev_scratch( config=config) +@flow(name="prune_nersc832_alsdev_pscratch") +def prune_nersc832_alsdev_pscratch( + relative_path: str, + source_endpoint: GlobusEndpoint, + check_endpoint: Union[GlobusEndpoint, None] = None, + config=None, +): + prune_files( + relative_path=relative_path, + source_endpoint=source_endpoint, + check_endpoint=check_endpoint, + config=config) + + if __name__ == "__main__": prune_nersc832_alsdev_scratch("BLS-00564_dyparkinson/") From 1b2ae5cda76f3460d30ed86a6580ec01a45cd455 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Fri, 17 Jan 2025 13:43:09 -0800 Subject: [PATCH 18/23] Added transfer_controller.py, and use the GlobusTransferController to move data within the nersc reconstruction flow. 
--- orchestration/flows/bl832/nersc.py | 93 ++++++----- orchestration/transfer_controller.py | 223 +++++++++++++++++++++++++++ 2 files changed, 279 insertions(+), 37 deletions(-) create mode 100644 orchestration/transfer_controller.py diff --git a/orchestration/flows/bl832/nersc.py b/orchestration/flows/bl832/nersc.py index bbce721..df5fd53 100644 --- a/orchestration/flows/bl832/nersc.py +++ b/orchestration/flows/bl832/nersc.py @@ -8,15 +8,16 @@ import time from authlib.jose import JsonWebKey -from globus_sdk import TransferClient -from prefect import flow, task +# from globus_sdk import TransferClient +from prefect import flow # , task from prefect.blocks.system import JSON from sfapi_client import Client from sfapi_client.compute import Machine from orchestration.flows.bl832.config import Config832 from orchestration.flows.bl832.job_controller import get_controller, HPC, TomographyHPCController -from orchestration.globus.transfer import GlobusEndpoint, start_transfer +from orchestration.globus.transfer import GlobusEndpoint +from orchestration.transfer_controller import get_transfer_controller, CopyMethod from orchestration.prefect import schedule_prefect_flow logger = logging.getLogger(__name__) @@ -24,32 +25,32 @@ load_dotenv() -@task(name="transfer_data_at_nersc") -def transfer_data_at_nersc( - file_path: str, - transfer_client: TransferClient, - nersc_source: GlobusEndpoint, - nersc_destination: GlobusEndpoint, -): - # if source_file begins with "/", it will mess up os.path.join - if file_path[0] == "/": - file_path = file_path[1:] - source_path = os.path.join(nersc_source.root_path, file_path) - dest_path = os.path.join(nersc_destination.root_path, file_path) - - logger.info(f"Transferring {dest_path} data832 to nersc") - - success = start_transfer( - transfer_client, - nersc_source, - source_path, - nersc_destination, - dest_path, - max_wait_seconds=600, - logger=logger, - ) +# @task(name="transfer_data_at_nersc") +# def transfer_data_at_nersc( +# file_path: str, +# transfer_client: TransferClient, +# nersc_source: GlobusEndpoint, +# nersc_destination: GlobusEndpoint, +# ): +# # if source_file begins with "/", it will mess up os.path.join +# if file_path[0] == "/": +# file_path = file_path[1:] +# source_path = os.path.join(nersc_source.root_path, file_path) +# dest_path = os.path.join(nersc_destination.root_path, file_path) + +# logger.info(f"Transferring {dest_path} data832 to nersc") - return success +# success = start_transfer( +# transfer_client, +# nersc_source, +# source_path, +# nersc_destination, +# dest_path, +# max_wait_seconds=600, +# logger=logger, +# ) + +# return success class NERSCTomographyHPCController(TomographyHPCController): @@ -295,23 +296,41 @@ def build_multi_resolution( nersc832_pscratch_endpoint = GlobusEndpoint( uuid=self.config.nersc832_alsdev_scratch.uuid, uri=self.config.nersc832_alsdev_scratch.uri, - root_path=f"{pscratch_path}/scratch", + root_path=f"{pscratch_path}/8.3.2/scratch", name="nersc832_pscratch" ) # Working on a permission denied error when transferring relative_recon_path = os.path.relpath(recon_path, "scratch") - transfer_data_at_nersc( + + transfer_controller = get_transfer_controller( + transfer_type=CopyMethod.GLOBUS, + config=self.config + ) + + transfer_controller.copy( file_path=relative_recon_path, - transfer_client=self.config.tc, - nersc_source=nersc832_pscratch_endpoint, - nersc_destination=self.config.nersc832_alsdev_scratch) - transfer_data_at_nersc( + source=nersc832_pscratch_endpoint, + 
destination=self.config.nersc832_alsdev_scratch + ) + + transfer_controller.copy( file_path=f"{relative_recon_path}.zarr", - transfer_client=self.config.tc, - nersc_source=nersc832_pscratch_endpoint, - nersc_destination=self.config.nersc832_alsdev_scratch + source=nersc832_pscratch_endpoint, + destination=self.config.nersc832_alsdev_scratch ) + + # transfer_data_at_nersc( + # file_path=relative_recon_path, + # transfer_client=self.config.tc, + # nersc_source=nersc832_pscratch_endpoint, + # nersc_destination=self.config.nersc832_alsdev_scratch) + # transfer_data_at_nersc( + # file_path=f"{relative_recon_path}.zarr", + # transfer_client=self.config.tc, + # nersc_source=nersc832_pscratch_endpoint, + # nersc_destination=self.config.nersc832_alsdev_scratch + # ) return True except Exception as e: diff --git a/orchestration/transfer_controller.py b/orchestration/transfer_controller.py new file mode 100644 index 0000000..0339be6 --- /dev/null +++ b/orchestration/transfer_controller.py @@ -0,0 +1,223 @@ +from abc import ABC, abstractmethod +from dataclasses import dataclass +from dotenv import load_dotenv +from enum import Enum +import logging +import os +import time +from typing import Generic, Protocol, TypeVar + +import globus_sdk + +from orchestration.flows.bl832.config import Config832 +from orchestration.globus.transfer import GlobusEndpoint, start_transfer + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +load_dotenv() + + +Endpoint = TypeVar("Endpoint", bound=GlobusEndpoint) + + +class BaseEndpoint(Protocol): + """ + A protocol or abstract interface that all endpoints must implement or satisfy. + """ + + @property + @abstractmethod + def root_path(self) -> str: + """ + Root path or base directory for this endpoint. + """ + ... + + @property + @abstractmethod + def name(self) -> str: + """ + A human-readable or reference name for the endpoint. + """ + ... + + +@dataclass +class FileSystemEndpoint(BaseEndpoint): + root_path: str + name: str = "local" + + def full_path(self, path_suffix: str) -> str: + if path_suffix.startswith("/"): + path_suffix = path_suffix[1:] + return f"{self.root_path.rstrip('/')}/{path_suffix}" + + +class TransferController(Generic[Endpoint], ABC): + """ + Abstract class for transferring data. + + Args: + ABC: Abstract Base Class + """ + def __init__( + self, + config: Config832 + ) -> None: + self.config = config + + @abstractmethod + def copy( + self, + file_path: str = None, + source: Endpoint = None, + destination: Endpoint = None, + ) -> bool: + pass + + +class GlobusTransfer(TransferController[GlobusEndpoint]): + def __init__( + self, + config: Config832 + ) -> None: + self.config = config + """ + Use Globus Transfer to move data between endpoints. + + Args: + TransferController: Abstract class for transferring data. + """ + def copy( + self, + file_path: str = None, + source: GlobusEndpoint = None, + destination: GlobusEndpoint = None, + ) -> bool: + """ + Copy a file from a source endpoint to a destination endpoint. + + Args: + file_path (str): The path of the file to copy. + source (GlobusEndpoint): The source endpoint. + destination (GlobusEndpoint): The destination endpoint. + transfer_client (TransferClient): The Globus transfer client. 
+ """ + + logger.info(f"Transferring {file_path} from {source.name} to {destination.name}") + + if file_path[0] == "/": + file_path = file_path[1:] + + source_path = os.path.join(source.root_path, file_path) + dest_path = os.path.join(destination.root_path, file_path) + logger.info(f"Transferring {source_path} to {dest_path}") + # Start the timer + start_time = time.time() + + try: + success = start_transfer( + transfer_client=self.config.tc, + source_endpoint=source, + source_path=source_path, + dest_endpoint=destination, + dest_path=dest_path, + max_wait_seconds=600, + logger=logger, + ) + if success: + logger.info("Transfer completed successfully.") + else: + logger.error("Transfer failed.") + return success + except globus_sdk.services.transfer.errors.TransferAPIError as e: + logger.error(f"Failed to submit transfer: {e}") + return False + finally: + # Stop the timer and calculate the duration + elapsed_time = time.time() - start_time + logger.info(f"Transfer process took {elapsed_time:.2f} seconds.") + return success + + +class SimpleTransfer(TransferController[FileSystemEndpoint]): + def __init__( + self, + config: Config832 + ) -> None: + self.config = config + """ + Use a simple 'cp' command to move data within the same system. + + Args: + TransferController: Abstract class for transferring data. + """ + + def copy( + self, + file_path: str = "", + source: FileSystemEndpoint = "", + destination: FileSystemEndpoint = "", + ) -> bool: + + logger.info(f"Transferring {file_path} from {source} to {destination}") + + if file_path[0] == "/": + file_path = file_path[1:] + + source_path = os.path.join(source, file_path) + dest_path = os.path.join(destination, file_path) + logger.info(f"Transferring {source_path} to {dest_path}") + # Start the timer + start_time = time.time() + + try: + os.system(f"cp -r {source_path} {dest_path}") + logger.info("Transfer completed successfully.") + return True + except Exception as e: + logger.error(f"Transfer failed: {e}") + return False + finally: + # Stop the timer and calculate the duration + elapsed_time = time.time() - start_time + logger.info(f"Transfer process took {elapsed_time:.2f} seconds.") + return True + + +class CopyMethod(Enum): + """ + Enum representing different transfer methods. + Use enum names as strings to identify transfer methods, ensuring a standard set of values. + """ + GLOBUS = "globus" + SIMPLE = "simple" + + +def get_transfer_controller( + transfer_type: CopyMethod, + config: Config832 +) -> TransferController: + """ + Get the appropriate transfer controller based on the transfer type. + + Args: + transfer_type (str): The type of transfer to perform. + config (Config832): The configuration object. + + Returns: + TransferController: The transfer controller object. + """ + if transfer_type == CopyMethod.GLOBUS: + return GlobusTransfer(config) + elif transfer_type == CopyMethod.SIMPLE: + return SimpleTransfer(config) + else: + raise ValueError(f"Invalid transfer type: {transfer_type}") + + +def main(): + config = Config832() + transfer_type = CopyMethod.GLOBUS + controller = get_transfer_controller(transfer_type, config) + controller From e2963f063b467293da16d5d40db66c89f62fff31 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Tue, 21 Jan 2025 13:01:17 -0800 Subject: [PATCH 19/23] Included new controller class for data transfers and refactored the NERSC flow to use those. Updated the Prefect deployment script for the NERSC implementation. Updated endpoints in config.yml to include pscratch/scratch and pscratch/raw for alsdev. 
Verified pruning code is working as expected. --- config.yml | 12 +- create_deployments_832_nersc.sh | 9 +- orchestration/flows/bl832/config.py | 3 +- orchestration/flows/bl832/job_controller.py | 4 + orchestration/flows/bl832/nersc.py | 219 ++++++++++++-------- orchestration/flows/bl832/prune.py | 18 +- orchestration/transfer_controller.py | 4 +- 7 files changed, 174 insertions(+), 95 deletions(-) diff --git a/config.yml b/config.yml index 36e5e56..3624ef3 100644 --- a/config.yml +++ b/config.yml @@ -66,11 +66,17 @@ globus: uuid: d40248e6-d874-4f7b-badd-2c06c16f1a58 name: nersc832_alsdev_scratch - nersc832_alsdev_pscratch: - root_path: /pscratch/sd/a/alsdev/8.3.2 + nersc832_alsdev_pscratch_raw: + root_path: /pscratch/sd/a/alsdev/8.3.2/raw uri: nersc.gov uuid: d40248e6-d874-4f7b-badd-2c06c16f1a58 - name: nersc832_alsdev_pscratch + name: nersc832_alsdev_pscratch_raw + + nersc832_alsdev_pscratch_scratch: + root_path: /pscratch/sd/a/alsdev/8.3.2/scratch + uri: nersc.gov + uuid: d40248e6-d874-4f7b-badd-2c06c16f1a58 + name: nersc832_alsdev_pscratch_scratch nersc832_alsdev_recon_scripts: root_path: /global/cfs/cdirs/als/data_mover/8.3.2/tomography_reconstruction_scripts diff --git a/create_deployments_832_nersc.sh b/create_deployments_832_nersc.sh index 822d12a..29886aa 100755 --- a/create_deployments_832_nersc.sh +++ b/create_deployments_832_nersc.sh @@ -10,8 +10,11 @@ prefect work-pool create 'nersc_prune_pool' prefect deployment build ./orchestration/flows/bl832/nersc.py:nersc_recon_flow -n nersc_recon_flow -p nersc_flow_pool -q nersc_recon_flow_queue prefect deployment apply nersc_recon_flow-deployment.yaml -# alcf_prune_pool +# nersc_prune_pool # in docker-compose.yaml: # command: prefect agent start --pool "nersc_prune_pool" -prefect deployment build ./orchestration/flows/bl832/prune.py:prune_alcf832_raw -n prune_nersc832_pscratch -p nersc_prune_pool -q prune_nersc832_pscratch_queue -prefect deployment apply prune_nersc832_pscratch-deployment.yaml +prefect deployment build ./orchestration/flows/bl832/prune.py:prune_nersc832_alsdev_pscratch_raw -n prune_nersc832_alsdev_pscratch_raw -p nersc_prune_pool -q prune_nersc832_pscratch_queue +prefect deployment apply prune_nersc832_alsdev_pscratch_raw-deployment.yaml + +prefect deployment build ./orchestration/flows/bl832/prune.py:prune_nersc832_alsdev_pscratch_scratch -n prune_nersc832_alsdev_pscratch_scratch -p nersc_prune_pool -q prune_nersc832_pscratch_queue +prefect deployment apply prune_nersc832_alsdev_pscratch_scratch-deployment.yaml diff --git a/orchestration/flows/bl832/config.py b/orchestration/flows/bl832/config.py index a427d80..ff19a9c 100644 --- a/orchestration/flows/bl832/config.py +++ b/orchestration/flows/bl832/config.py @@ -17,7 +17,8 @@ def __init__(self) -> None: self.nersc_alsdev = self.endpoints["nersc_alsdev"] self.nersc832_alsdev_raw = self.endpoints["nersc832_alsdev_raw"] self.nersc832_alsdev_scratch = self.endpoints["nersc832_alsdev_scratch"] - self.nersc832_alsdev_pscratch = self.endpoints["nersc832_alsdev_pscratch"] + self.nersc832_alsdev_pscratch_raw = self.endpoints["nersc832_alsdev_pscratch_raw"] + self.nersc832_alsdev_pscratch_scratch = self.endpoints["nersc832_alsdev_pscratch_scratch"] self.nersc832_alsdev_recon_scripts = self.endpoints["nersc832_alsdev_recon_scripts"] self.alcf832_raw = self.endpoints["alcf832_raw"] self.alcf832_scratch = self.endpoints["alcf832_scratch"] diff --git a/orchestration/flows/bl832/job_controller.py b/orchestration/flows/bl832/job_controller.py index 1adf1bd..afa23fa 100644 --- 
a/orchestration/flows/bl832/job_controller.py
+++ b/orchestration/flows/bl832/job_controller.py
@@ -60,6 +60,7 @@ class HPC(Enum):
     """
     ALCF = "ALCF"
     NERSC = "NERSC"
+    OLCF = "OLCF"
 
 
 def get_controller(
@@ -88,6 +89,9 @@ def get_controller(
             client=NERSCTomographyHPCController.create_sfapi_client(),
             config=config
         )
+    elif hpc_type == HPC.OLCF:
+        # TODO: Implement OLCF controller
+        pass
     else:
         raise ValueError(f"Unsupported HPC type: {hpc_type}")
 
diff --git a/orchestration/flows/bl832/nersc.py b/orchestration/flows/bl832/nersc.py
index df5fd53..4a4a079 100644
--- a/orchestration/flows/bl832/nersc.py
+++ b/orchestration/flows/bl832/nersc.py
@@ -8,15 +8,13 @@
 import time
 
 from authlib.jose import JsonWebKey
-# from globus_sdk import TransferClient
-from prefect import flow  # , task
+from prefect import flow
 from prefect.blocks.system import JSON
 from sfapi_client import Client
 from sfapi_client.compute import Machine
 
 from orchestration.flows.bl832.config import Config832
 from orchestration.flows.bl832.job_controller import get_controller, HPC, TomographyHPCController
-from orchestration.globus.transfer import GlobusEndpoint
 from orchestration.transfer_controller import get_transfer_controller, CopyMethod
 from orchestration.prefect import schedule_prefect_flow
 
@@ -25,34 +23,6 @@
 load_dotenv()
 
 
-# @task(name="transfer_data_at_nersc")
-# def transfer_data_at_nersc(
-#     file_path: str,
-#     transfer_client: TransferClient,
-#     nersc_source: GlobusEndpoint,
-#     nersc_destination: GlobusEndpoint,
-# ):
-#     # if source_file begins with "/", it will mess up os.path.join
-#     if file_path[0] == "/":
-#         file_path = file_path[1:]
-#     source_path = os.path.join(nersc_source.root_path, file_path)
-#     dest_path = os.path.join(nersc_destination.root_path, file_path)
-
-#     logger.info(f"Transferring {dest_path} data832 to nersc")
-
-#     success = start_transfer(
-#         transfer_client,
-#         nersc_source,
-#         source_path,
-#         nersc_destination,
-#         dest_path,
-#         max_wait_seconds=600,
-#         logger=logger,
-#     )
-
-#     return success
-
-
 class NERSCTomographyHPCController(TomographyHPCController):
     """
     Implementation for a NERSC-based tomography HPC controller.
@@ -72,6 +42,8 @@ def __init__(
     def create_sfapi_client() -> Client:
         """Create and return an NERSC client instance"""
 
+        # When generating the SFAPI Key in Iris, make sure to select "alsdev" as the user!
+        # Otherwise, the key will not have the necessary permissions to access the data.
client_id_path = os.getenv("PATH_NERSC_CLIENT_ID") client_secret_path = os.getenv("PATH_NERSC_PRI_KEY") @@ -291,46 +263,6 @@ def build_multi_resolution( job.complete() # Wait until the job completes logger.info("Reconstruction job completed successfully.") - # defining this GlobusEndpoint here rather than config.yaml since the root path depends on the SFAPI user name - # using the same uuid from another endpoint because it's the same alsdev collection - nersc832_pscratch_endpoint = GlobusEndpoint( - uuid=self.config.nersc832_alsdev_scratch.uuid, - uri=self.config.nersc832_alsdev_scratch.uri, - root_path=f"{pscratch_path}/8.3.2/scratch", - name="nersc832_pscratch" - ) - - # Working on a permission denied error when transferring - relative_recon_path = os.path.relpath(recon_path, "scratch") - - transfer_controller = get_transfer_controller( - transfer_type=CopyMethod.GLOBUS, - config=self.config - ) - - transfer_controller.copy( - file_path=relative_recon_path, - source=nersc832_pscratch_endpoint, - destination=self.config.nersc832_alsdev_scratch - ) - - transfer_controller.copy( - file_path=f"{relative_recon_path}.zarr", - source=nersc832_pscratch_endpoint, - destination=self.config.nersc832_alsdev_scratch - ) - - # transfer_data_at_nersc( - # file_path=relative_recon_path, - # transfer_client=self.config.tc, - # nersc_source=nersc832_pscratch_endpoint, - # nersc_destination=self.config.nersc832_alsdev_scratch) - # transfer_data_at_nersc( - # file_path=f"{relative_recon_path}.zarr", - # transfer_client=self.config.tc, - # nersc_source=nersc832_pscratch_endpoint, - # nersc_destination=self.config.nersc832_alsdev_scratch - # ) return True except Exception as e: @@ -353,7 +285,12 @@ def build_multi_resolution( return False -def schedule_pruning(config: Config832, file_path: str) -> bool: +def schedule_pruning( + config: Config832, + raw_file_path: str, + tiff_file_path: str, + zarr_file_path: str +) -> bool: # data832/scratch : 14 days # nersc/pscratch : 1 day # nersc832/scratch : never? 
@@ -361,19 +298,43 @@
     pruning_config = JSON.load("pruning-config").value
     data832_delay = datetime.timedelta(days=pruning_config["delete_data832_files_after_days"])
     nersc832_delay = datetime.timedelta(days=pruning_config["delete_nersc832_files_after_days"])
-
-    # Delete from data832_scratch
+
+    # data832_delay, nersc832_delay = datetime.timedelta(minutes=1), datetime.timedelta(minutes=1)
+
+    # Delete tiffs from data832_scratch
+    logger.info(f"Deleting tiffs from data832_scratch: {tiff_file_path=}")
+    try:
+        source_endpoint = config.data832_scratch
+        check_endpoint = config.nersc832_alsdev_scratch
+        location = "data832_scratch"
+
+        flow_name = f"delete {location}: {Path(tiff_file_path).name}"
+        schedule_prefect_flow(
+            deployment_name=f"prune_{location}/prune_{location}",
+            flow_run_name=flow_name,
+            parameters={
+                "relative_path": tiff_file_path,
+                "source_endpoint": source_endpoint,
+                "check_endpoint": check_endpoint
+            },
+            duration_from_now=data832_delay
+        )
+    except Exception as e:
+        logger.error(f"Failed to schedule prune task: {e}")
+
+    # Delete zarr from data832_scratch
+    logger.info(f"Deleting zarr from data832_scratch: {zarr_file_path=}")
     try:
         source_endpoint = config.data832_scratch
         check_endpoint = config.nersc832_alsdev_scratch
         location = "data832_scratch"
 
-        flow_name = f"delete {location}: {Path(file_path).name}"
+        flow_name = f"delete {location}: {Path(zarr_file_path).name}"
         schedule_prefect_flow(
             deployment_name=f"prune_{location}/prune_{location}",
             flow_run_name=flow_name,
             parameters={
-                "relative_path": file_path,
+                "relative_path": zarr_file_path,
                 "source_endpoint": source_endpoint,
                 "check_endpoint": check_endpoint
             },
@@ -382,18 +343,61 @@
     except Exception as e:
         logger.error(f"Failed to schedule prune task: {e}")
 
-    # Delete from nersc832_pscratch
+    # Delete from nersc832_pscratch/raw
+    logger.info(f"Deleting raw from nersc832_alsdev_pscratch_raw: {raw_file_path=}")
     try:
-        source_endpoint = config.nersc832_alsdev_pscratch
+        source_endpoint = config.nersc832_alsdev_pscratch_raw
         check_endpoint = None
-        location = "nersc832_alsdev_pscratch"
+        location = "nersc832_alsdev_pscratch_raw"
 
-        flow_name = f"delete {location}: {Path(file_path).name}"
+        flow_name = f"delete {location}: {Path(raw_file_path).name}"
         schedule_prefect_flow(
             deployment_name=f"prune_{location}/prune_{location}",
             flow_run_name=flow_name,
             parameters={
-                "relative_path": file_path,
+                "relative_path": raw_file_path,
+                "source_endpoint": source_endpoint,
+                "check_endpoint": check_endpoint
+            },
+            duration_from_now=nersc832_delay
+        )
+    except Exception as e:
+        logger.error(f"Failed to schedule prune task: {e}")
+
+    # Delete tiffs from nersc832_pscratch/scratch
+    logger.info(f"Deleting tiffs from nersc832_alsdev_pscratch_scratch: {tiff_file_path=}")
+    try:
+        source_endpoint = config.nersc832_alsdev_pscratch_scratch
+        check_endpoint = None
+        location = "nersc832_alsdev_pscratch_scratch"
+
+        flow_name = f"delete {location}: {Path(tiff_file_path).name}"
+        schedule_prefect_flow(
+            deployment_name=f"prune_{location}/prune_{location}",
+            flow_run_name=flow_name,
+            parameters={
+                "relative_path": tiff_file_path,
+                "source_endpoint": source_endpoint,
+                "check_endpoint": check_endpoint
+            },
+            duration_from_now=nersc832_delay
+        )
+    except Exception as e:
+        logger.error(f"Failed to schedule prune task: {e}")
+
+    # Delete zarr from nersc832_pscratch/scratch
+    logger.info(f"Deleting zarr from nersc832_alsdev_pscratch_scratch: {zarr_file_path=}")
+    try:
+        source_endpoint = config.nersc832_alsdev_pscratch_scratch
+        check_endpoint = None
+        location = "nersc832_alsdev_pscratch_scratch"
+
+        flow_name = f"delete {location}: {Path(zarr_file_path).name}"
+        schedule_prefect_flow(
+            deployment_name=f"prune_{location}/prune_{location}",
+            flow_run_name=flow_name,
+            parameters={
+                "relative_path": zarr_file_path,
                 "source_endpoint": source_endpoint,
                 "check_endpoint": check_endpoint
             },
@@ -414,6 +418,7 @@ def nersc_recon_flow(
 
     :param file_path: Path to the file to reconstruct.
     """
+    logger.info(f"Starting NERSC reconstruction flow for {file_path=}")
     controller = get_controller(
         hpc_type=HPC.NERSC,
         config=config
@@ -425,12 +430,58 @@ def nersc_recon_flow(
         file_path=file_path,
     )
 
-    # TODO: Transfer files to data832
+    path = Path(file_path)
+    folder_name = path.parent.name
+    file_name = path.stem
 
-    schedule_pruning(config=config, file_path=file_path)
+    tiff_file_path = f"{folder_name}/rec{file_name}"
+    zarr_file_path = f"{folder_name}/rec{file_name}.zarr"
 
-    # TODO: Ingest into SciCat
+    logger.info(f"{tiff_file_path=}")
+    logger.info(f"{zarr_file_path=}")
 
+    # Transfer reconstructed data
+    logger.info("Preparing transfer.")
+    transfer_controller = get_transfer_controller(
+        transfer_type=CopyMethod.GLOBUS,
+        config=config
+    )
+
+    logger.info("Copy from /pscratch/sd/a/alsdev/8.3.2 to /global/cfs/cdirs/als/data_mover/8.3.2/scratch.")
+    transfer_controller.copy(
+        file_path=tiff_file_path,
+        source=config.nersc832_alsdev_pscratch_scratch,
+        destination=config.nersc832_alsdev_scratch
+    )
+
+    transfer_controller.copy(
+        file_path=zarr_file_path,
+        source=config.nersc832_alsdev_pscratch_scratch,
+        destination=config.nersc832_alsdev_scratch
+    )
+
+    logger.info("Copy from NERSC /global/cfs/cdirs/als/data_mover/8.3.2/scratch to data832")
+    transfer_controller.copy(
+        file_path=tiff_file_path,
+        source=config.nersc832_alsdev_pscratch_scratch,
+        destination=config.data832_scratch
+    )
+
+    transfer_controller.copy(
+        file_path=zarr_file_path,
+        source=config.nersc832_alsdev_pscratch_scratch,
+        destination=config.data832_scratch
+    )
+
+    logger.info("Scheduling pruning tasks.")
+    schedule_pruning(
+        config=config,
+        raw_file_path=file_path,
+        tiff_file_path=tiff_file_path,
+        zarr_file_path=zarr_file_path
+    )
+
+    # TODO: Ingest into SciCat
     if nersc_reconstruction_success and nersc_multi_res_success:
         return True
     else:
diff --git a/orchestration/flows/bl832/prune.py b/orchestration/flows/bl832/prune.py
index 8d87c0f..1de0508 100644
--- a/orchestration/flows/bl832/prune.py
+++ b/orchestration/flows/bl832/prune.py
@@ -143,8 +143,22 @@ def prune_nersc832_alsdev_scratch(
         config=config)
 
 
-@flow(name="prune_nersc832_alsdev_pscratch")
-def prune_nersc832_alsdev_pscratch(
+@flow(name="prune_nersc832_alsdev_pscratch_raw")
+def prune_nersc832_alsdev_pscratch_raw(
+    relative_path: str,
+    source_endpoint: GlobusEndpoint,
+    check_endpoint: Union[GlobusEndpoint, None] = None,
+    config=None,
+):
+    prune_files(
+        relative_path=relative_path,
+        source_endpoint=source_endpoint,
+        check_endpoint=check_endpoint,
+        config=config)
+
+
+@flow(name="prune_nersc832_alsdev_pscratch_scratch")
+def prune_nersc832_alsdev_pscratch_scratch(
     relative_path: str,
     source_endpoint: GlobusEndpoint,
     check_endpoint: Union[GlobusEndpoint, None] = None,
diff --git a/orchestration/transfer_controller.py b/orchestration/transfer_controller.py
index 0339be6..2e74fc8 100644
--- a/orchestration/transfer_controller.py
+++ b/orchestration/transfer_controller.py
@@ -114,7 +114,7 @@ def copy( logger.info(f"Transferring {source_path} to {dest_path}") # Start the timer start_time = time.time() - + success = False try: success = start_transfer( transfer_client=self.config.tc, @@ -132,7 +132,7 @@ def copy( return success except globus_sdk.services.transfer.errors.TransferAPIError as e: logger.error(f"Failed to submit transfer: {e}") - return False + return success finally: # Stop the timer and calculate the duration elapsed_time = time.time() - start_time From f30dd28fea0b96528c455329c7c1149d3e990a35 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Tue, 21 Jan 2025 13:02:10 -0800 Subject: [PATCH 20/23] Added placeholders for ALCFTomographyHPCController in alcf.py and OLCFTomographyHPCController in olcf.py, to be addressed in future git issues. --- orchestration/flows/bl832/alcf.py | 35 +++++++++++++++++++++++++++++++ orchestration/flows/bl832/olcf.py | 28 +++++++++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 orchestration/flows/bl832/olcf.py diff --git a/orchestration/flows/bl832/alcf.py b/orchestration/flows/bl832/alcf.py index 3836b8e..7349001 100644 --- a/orchestration/flows/bl832/alcf.py +++ b/orchestration/flows/bl832/alcf.py @@ -3,6 +3,7 @@ import os from pathlib import Path import time +# from typing import Optional from globus_compute_sdk import Client, Executor from globus_compute_sdk.serialize import CombinedCode @@ -12,10 +13,44 @@ from prefect.blocks.system import JSON, Secret from orchestration.flows.bl832.config import Config832 +from orchestration.flows.bl832.job_controller import TomographyHPCController from orchestration.globus.transfer import GlobusEndpoint, start_transfer from orchestration.prefect import schedule_prefect_flow +class ALCFTomographyHPCController(TomographyHPCController): + """ + Implementation of TomographyHPCController for ALCF. + Methods here leverage Globus Compute for processing tasks. + + TODO: Refactor ALCF reconstruction flow into this class. + + Args: + TomographyHPCController (ABC): Abstract class for tomography HPC controllers. + """ + + def __init__(self) -> None: + pass + + def reconstruct( + self, + file_path: str = "", + ) -> bool: + + # uses Globus Compute to reconstruct the tomography + # TODO: Refactor ALCF reconstruction code into this class. + + pass + + def build_multi_resolution( + self, + file_path: str = "", + ) -> bool: + # uses Globus Compute to build multi-resolution tomography + # TODO: Refactor ALCF multi-res zarr code into this class. + pass + + @task(name="transfer_data_to_alcf") def transfer_data_to_alcf( file_path: str, diff --git a/orchestration/flows/bl832/olcf.py b/orchestration/flows/bl832/olcf.py new file mode 100644 index 0000000..6fdc441 --- /dev/null +++ b/orchestration/flows/bl832/olcf.py @@ -0,0 +1,28 @@ +from orchestration.flows.bl832.job_controller import TomographyHPCController + + +class OLCFTomographyHPCController(TomographyHPCController): + """ + Implementation of TomographyHPCController for OLCF. + + Args: + TomographyHPCController (ABC): Abstract class for tomography HPC controllers. 
+    """
+
+    def __init__(self) -> None:
+        pass
+
+    def reconstruct(
+        self,
+        file_path: str = "",
+    ) -> bool:
+        # TODO: Implement tomography reconstruction at OLCF
+        # https://docs.olcf.ornl.gov/ace_testbed/defiant_quick_start_guide.html#running-jobs
+        pass
+
+    def build_multi_resolution(
+        self,
+        file_path: str = "",
+    ) -> bool:
+        # TODO: Implement building multi-resolution datasets at OLCF
+        pass

From fd62200627335242cdb679721351cae30a313939 Mon Sep 17 00:00:00 2001
From: David Abramov
Date: Tue, 28 Jan 2025 09:55:56 -0800
Subject: [PATCH 21/23] Added a pytest script for the new transfer_controller
 and globus/simple implementations. Replaced BaseEndpoint(Protocol) with a
 more generic TransferEndpoint(ABC). Updated FileSystemEndpoint to include the
 missing root_path. Fixed SimpleTransferController class and verified that it
 works. Updated the names of both GlobusTransferController and
 SimpleTransferController. Bound the generic Endpoint to TransferEndpoint.
 Updated inits in job_controllers and transfer_controllers to use the
 super().__init__(config) notation. Added the built-in warnings module for
 orchestration/nersc.py for deprecation messages. Added to_dict and from_dict
 methods to the GlobusEndpoint class to address potential
 serialization/deserialization issues (although I haven't encountered them yet
 with Prefect).
---
 .../_tests/test_transfer_controller.py        | 235 ++++++++++++++++++
 orchestration/flows/bl832/job_controller.py   |   2 +-
 orchestration/flows/bl832/nersc.py            |   4 +-
 orchestration/globus/transfer.py              |  28 +++
 orchestration/nersc.py                        |  34 ++-
 orchestration/transfer_controller.py          | 156 ++++++++----
 6 files changed, 412 insertions(+), 47 deletions(-)
 create mode 100644 orchestration/_tests/test_transfer_controller.py

diff --git a/orchestration/_tests/test_transfer_controller.py b/orchestration/_tests/test_transfer_controller.py
new file mode 100644
index 0000000..5994070
--- /dev/null
+++ b/orchestration/_tests/test_transfer_controller.py
@@ -0,0 +1,235 @@
+# test_transfer_controller.py
+
+import pytest
+import globus_sdk
+from unittest.mock import MagicMock, patch
+
+from orchestration.flows.bl832.config import Config832
+from orchestration.transfer_controller import (
+    FileSystemEndpoint,
+    GlobusTransferController,
+    SimpleTransferController,
+    get_transfer_controller,
+    CopyMethod,
+)
+from orchestration.globus.transfer import GlobusEndpoint
+
+
+@pytest.fixture
+def mock_config832():
+    """
+    A pytest fixture that provides a mocked Config832 object
+    with a mocked TransferClient (tc).
+    """
+    mock_config = MagicMock(spec=Config832)
+    # Mock the Globus transfer client
+    mock_config.tc = MagicMock(name="MockTransferClient")
+    return mock_config
+
+
+@pytest.fixture
+def mock_globus_endpoint():
+    """
+    A pytest fixture that returns a mocked GlobusEndpoint.
+    """
+    endpoint = GlobusEndpoint(
+        name="mock_globus_endpoint",
+        root_path="/mock_globus_root/",
+        uuid="mock_endpoint_id",
+        uri="mock_endpoint_uri"
+    )
+    return endpoint
+
+
+@pytest.fixture
+def mock_file_system_endpoint():
+    """
+    A pytest fixture that returns a FileSystemEndpoint instance.
+ """ + endpoint = FileSystemEndpoint( + name="mock_filesystem_endpoint", + root_path="/mock_fs_root" + ) + return endpoint + + +# -------------------------------------------------------------------------- +# Tests for get_transfer_controller +# -------------------------------------------------------------------------- +def test_get_transfer_controller_globus(mock_config832): + """ + Test that get_transfer_controller returns a GlobusTransferController when + the transfer type is CopyMethod.GLOBUS. + """ + controller = get_transfer_controller(CopyMethod.GLOBUS, mock_config832) + assert isinstance(controller, GlobusTransferController), ( + "get_transfer_controller should return a GlobusTransferController " + "instance for GLOBUS transfer." + ) + + +def test_get_transfer_controller_simple(mock_config832): + """ + Test that get_transfer_controller returns a SimpleTransferController when + the transfer type is CopyMethod.SIMPLE. + """ + controller = get_transfer_controller(CopyMethod.SIMPLE, mock_config832) + assert isinstance(controller, SimpleTransferController), ( + "get_transfer_controller should return a SimpleTransferController " + "instance for SIMPLE transfer." + ) + + +def test_get_transfer_controller_invalid(mock_config832): + """ + Test that get_transfer_controller raises ValueError for invalid transfer method. + """ + with pytest.raises(ValueError, match="Invalid transfer type"): + get_transfer_controller("invalid_type", mock_config832) + + +# -------------------------------------------------------------------------- +# Tests for GlobusTransferController +# -------------------------------------------------------------------------- +def test_globus_transfer_controller_copy_success(mock_config832, mock_globus_endpoint): + """ + Test a successful copy() operation using GlobusTransferController. + We mock start_transfer to return True. + """ + with patch("orchestration.transfer_controller.start_transfer", return_value=True) as mock_start_transfer: + controller = GlobusTransferController(mock_config832) + result = controller.copy( + file_path="some_dir/test_file.txt", + source=mock_globus_endpoint, + destination=mock_globus_endpoint, + ) + + assert result is True, "Expected True when transfer completes successfully." + mock_start_transfer.assert_called_once() + + # Verify arguments passed to start_transfer + _, called_kwargs = mock_start_transfer.call_args + assert called_kwargs["source_endpoint"] == mock_globus_endpoint + assert called_kwargs["dest_endpoint"] == mock_globus_endpoint + assert "max_wait_seconds" in called_kwargs, "max_wait_seconds should be passed to start_transfer." + + +def test_globus_transfer_controller_copy_failure(mock_config832, mock_globus_endpoint): + """ + Test a failing copy() operation using GlobusTransferController. + We mock start_transfer to return False, indicating a transfer failure. + """ + with patch("orchestration.transfer_controller.start_transfer", return_value=False) as mock_start_transfer: + controller = GlobusTransferController(mock_config832) + result = controller.copy( + file_path="some_dir/test_file.txt", + source=mock_globus_endpoint, + destination=mock_globus_endpoint, + ) + assert result is False, "Expected False when transfer fails." + mock_start_transfer.assert_called_once() + + +def test_globus_transfer_controller_copy_exception(mock_config832, mock_globus_endpoint): + """ + Test copy() operation that raises a TransferAPIError exception in GlobusTransferController. 
+ """ + mock_response = MagicMock() + mock_response.status_code = 400 + mock_response.reason = "Bad Request" + + with patch( + "orchestration.transfer_controller.start_transfer", + side_effect=globus_sdk.services.transfer.errors.TransferAPIError(mock_response, "Mocked Error") + ) as mock_start_transfer: + controller = GlobusTransferController(mock_config832) + result = controller.copy( + file_path="some_dir/test_file.txt", + source=mock_globus_endpoint, + destination=mock_globus_endpoint, + ) + assert result is False, "Expected False when TransferAPIError is raised." + mock_start_transfer.assert_called_once() + + +# -------------------------------------------------------------------------- +# Tests for SimpleTransferController +# -------------------------------------------------------------------------- +def test_simple_transfer_controller_no_file_path(mock_config832, mock_file_system_endpoint): + """ + Test that copy() returns False if no file_path is provided. + """ + controller = SimpleTransferController(mock_config832) + result = controller.copy( + file_path="", + source=mock_file_system_endpoint, + destination=mock_file_system_endpoint, + ) + assert result is False, "Expected False when no file_path is provided." + + +def test_simple_transfer_controller_no_source_or_destination(mock_config832): + """ + Test that copy() returns False if source or destination is None. + """ + controller = SimpleTransferController(mock_config832) + result = controller.copy( + file_path="test.txt", + source=None, + destination=None, + ) + assert result is False, "Expected False when either source or destination is None." + + +def test_simple_transfer_controller_copy_success(mock_config832, mock_file_system_endpoint): + """ + Test a successful copy() operation using SimpleTransferController by mocking os.system + to return 0 (indicating success). + """ + with patch("os.system", return_value=0) as mock_os_system: + controller = SimpleTransferController(mock_config832) + result = controller.copy( + file_path="some_dir/test_file.txt", + source=mock_file_system_endpoint, + destination=mock_file_system_endpoint, + ) + + assert result is True, "Expected True when os.system returns 0." + mock_os_system.assert_called_once() + command_called = mock_os_system.call_args[0][0] + assert "cp -r" in command_called, "Expected cp command in os.system call." + + +def test_simple_transfer_controller_copy_failure(mock_config832, mock_file_system_endpoint): + """ + Test a failing copy() operation using SimpleTransferController by mocking os.system + to return a non-zero code. + """ + with patch("os.system", return_value=1) as mock_os_system: + controller = SimpleTransferController(mock_config832) + result = controller.copy( + file_path="some_dir/test_file.txt", + source=mock_file_system_endpoint, + destination=mock_file_system_endpoint, + ) + + assert result is False, "Expected False when os.system returns a non-zero code." + mock_os_system.assert_called_once() + command_called = mock_os_system.call_args[0][0] + assert "cp -r" in command_called, "Expected cp command in os.system call." + + +def test_simple_transfer_controller_copy_exception(mock_config832, mock_file_system_endpoint): + """ + Test a copy() operation that raises an exception in SimpleTransferController. 
+ """ + with patch("os.system", side_effect=Exception("Mocked cp error")) as mock_os_system: + controller = SimpleTransferController(mock_config832) + result = controller.copy( + file_path="some_dir/test_file.txt", + source=mock_file_system_endpoint, + destination=mock_file_system_endpoint, + ) + + assert result is False, "Expected False when an exception is raised during copy." + mock_os_system.assert_called_once() diff --git a/orchestration/flows/bl832/job_controller.py b/orchestration/flows/bl832/job_controller.py index afa23fa..53af114 100644 --- a/orchestration/flows/bl832/job_controller.py +++ b/orchestration/flows/bl832/job_controller.py @@ -22,7 +22,7 @@ def __init__( self, config: Config832 ) -> None: - pass + self.config = config @abstractmethod def reconstruct( diff --git a/orchestration/flows/bl832/nersc.py b/orchestration/flows/bl832/nersc.py index 4a4a079..95e8aa3 100644 --- a/orchestration/flows/bl832/nersc.py +++ b/orchestration/flows/bl832/nersc.py @@ -35,8 +35,8 @@ def __init__( client: Client, config: Config832 ) -> None: + super().__init__(config) self.client = client - self.config = config @staticmethod def create_sfapi_client() -> Client: @@ -109,7 +109,7 @@ def reconstruct( # IMPORTANT: job script must be deindented to the leftmost column or it will fail immediately # Note: If q=debug, there is no minimum time limit # However, if q=preempt, there is a minimum time limit of 2 hours. Otherwise the job won't run. - + # The realtime queue can only be used for select accounts (e.g. ALS) job_script = f"""#!/bin/bash #SBATCH -q realtime #SBATCH -A als diff --git a/orchestration/globus/transfer.py b/orchestration/globus/transfer.py index 812d83d..f694706 100644 --- a/orchestration/globus/transfer.py +++ b/orchestration/globus/transfer.py @@ -1,6 +1,7 @@ from dataclasses import dataclass from datetime import datetime, timezone, timedelta from dateutil import parser +import json import logging import os from pathlib import Path @@ -43,6 +44,13 @@ def full_path(self, path_suffix: str): path = Path(self.root_path) / path_suffix return str(path) + def to_dict(self) -> dict: + return self.__dict__ + + @classmethod + def from_dict(cls, data: dict) -> 'GlobusEndpoint': + return cls(**data) + @dataclass class GlobusApp: @@ -301,3 +309,23 @@ def prune_one_safe( task_wait(tranfer_client, delete_id) logger.info(f"file deleted from: {source_endpoint.uri}") + + +if __name__ == "__main__": + from orchestration.flows.bl832.config import Config832 + + # test globus endpoint serialization/deserialization + config = Config832() + # Example serialization + source = config.alcf832_raw + logger.info(source) + + serialized = json.dumps(source.to_dict()) + logger.info(serialized) + + # Example deserialization + data = json.loads(serialized) + source_deserialized = GlobusEndpoint.from_dict(data) + logger.info(source_deserialized) + + assert source == source_deserialized diff --git a/orchestration/nersc.py b/orchestration/nersc.py index e751281..a77d33a 100644 --- a/orchestration/nersc.py +++ b/orchestration/nersc.py @@ -1,11 +1,12 @@ ''' DEPRECATION WARNING: NerscClient is deprecated and will be removed when we refactor the ptychography code ''' - +import functools import json import logging # from pathlib import Path # import time +import warnings # from authlib.integrations.requests_client import OAuth2Session # from authlib.oauth2.rfc7523 import PrivateKeyJWT @@ -21,6 +22,21 @@ JobSacct.model_rebuild() +def deprecated_method(message: str): + def decorator(func): + @functools.wraps(func) + def 
wrapper(*args, **kwargs): + warnings.warn( + message="NerscClient() is deprecated and will be removed in a future version. Use the official NERSC " + "sfapi_client module instead: https://nersc.github.io/sfapi_client/", + category=DeprecationWarning, + stacklevel=2 + ) + return func(*args, **kwargs) + return wrapper + return decorator + + class NerscClient(Client): ''' DEPRECATION WARNING: NerscClient is deprecated and will be removed when we refactor the ptychography code @@ -31,6 +47,13 @@ def __init__( path_priv_key, logger=None, ): + warnings.warn( + "NerscClient() is deprecated and will be removed in a future version. " + "Use the official NERSC sfapi_client module instead: https://nersc.github.io/sfapi_client/", + DeprecationWarning, + stacklevel=2, # Shows warning at caller level + ) + self.path_client_id = path_client_id self.path_private_key = path_priv_key @@ -60,36 +83,44 @@ def __init__( self.has_ran = False self.perlmutter = self.compute(Machine.perlmutter) + @deprecated_method() def get_client_id(self): with open(self.path_client_id, "r") as f: self.client_id = f.read() + @deprecated_method() def get_private_key(self): with open(self.path_private_key, "r") as f: self.pri_key = JsonWebKey.import_key(json.loads(f.read())) + @deprecated_method() def get_machine_status(self): return self.perlmutter.status + @deprecated_method() def init_client_info( self ): self.get_client_id() self.get_private_key() + @deprecated_method() def init_directory_paths(self): self.home_path = f"/global/homes/{self.user().name[0]}/{self.user().name}" self.scratch_path = f"/pscratch/sd/{self.user().name[0]}/{self.user().name}" + @deprecated_method() def request_job_status(self): self.job = self.perlmutter.job(jobid=self.jobid) + @deprecated_method() def update_job_id(self): if self.job is None: self.logger.info("No job found") else: self.jobid = self.job.jobid + @deprecated_method() def update_job_state(self): self.request_job_status() self.job_state = self.job.state @@ -99,6 +130,7 @@ def update_job_state(self): elif self.job_state == "COMPLETE": self.logger.info(f"Job {self.jobid} with COMPLETE status") + @deprecated_method() def submit_job(self, job_script): self.task = None self.job = None diff --git a/orchestration/transfer_controller.py b/orchestration/transfer_controller.py index 2e74fc8..31cc7db 100644 --- a/orchestration/transfer_controller.py +++ b/orchestration/transfer_controller.py @@ -5,7 +5,7 @@ import logging import os import time -from typing import Generic, Protocol, TypeVar +from typing import Generic, TypeVar import globus_sdk @@ -17,42 +17,67 @@ load_dotenv() -Endpoint = TypeVar("Endpoint", bound=GlobusEndpoint) - - -class BaseEndpoint(Protocol): +class TransferEndpoint(ABC): """ - A protocol or abstract interface that all endpoints must implement or satisfy. + Abstract base class for endpoints. """ + def __init__( + self, + name: str, + root_path: str + ) -> None: + self.name = name + self.root_path = root_path - @property - @abstractmethod - def root_path(self) -> str: + def name(self) -> str: """ - Root path or base directory for this endpoint. + A human-readable or reference name for the endpoint. """ - ... + return self.name - @property - @abstractmethod - def name(self) -> str: + def root_path(self) -> str: """ - A human-readable or reference name for the endpoint. + Root path or base directory for this endpoint. """ - ... 
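+        # NB: the attributes assigned in __init__ shadow these accessor
+        # methods, so normal attribute access returns the stored string
+        # directly; without @property these methods are effectively unused.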
+ return self.root_path @dataclass -class FileSystemEndpoint(BaseEndpoint): - root_path: str - name: str = "local" +class FileSystemEndpoint(TransferEndpoint): + """ + A file system endpoint. + + Args: + TransferEndpoint: Abstract class for endpoints. + """ + def __init__( + self, + name: str, + root_path: str + ) -> None: + super().__init__(name, root_path) - def full_path(self, path_suffix: str) -> str: + def full_path( + self, + path_suffix: str + ) -> str: + """ + Constructs the full path by appending the path_suffix to the root_path. + + Args: + path_suffix (str): The relative path to append. + + Returns: + str: The full absolute path. + """ if path_suffix.startswith("/"): path_suffix = path_suffix[1:] return f"{self.root_path.rstrip('/')}/{path_suffix}" +Endpoint = TypeVar("Endpoint", bound=TransferEndpoint) + + class TransferController(Generic[Endpoint], ABC): """ Abstract class for transferring data. @@ -73,15 +98,26 @@ def copy( source: Endpoint = None, destination: Endpoint = None, ) -> bool: + """ + Copy a file from a source endpoint to a destination endpoint. + + Args: + file_path (str): The path of the file to copy. + source (Endpoint): The source endpoint. + destination (Endpoint): The destination endpoint. + + Returns: + bool: True if the transfer was successful, False otherwise. + """ pass -class GlobusTransfer(TransferController[GlobusEndpoint]): +class GlobusTransferController(TransferController[GlobusEndpoint]): def __init__( self, config: Config832 ) -> None: - self.config = config + super().__init__(config) """ Use Globus Transfer to move data between endpoints. @@ -140,12 +176,9 @@ def copy( return success -class SimpleTransfer(TransferController[FileSystemEndpoint]): - def __init__( - self, - config: Config832 - ) -> None: - self.config = config +class SimpleTransferController(TransferController[FileSystemEndpoint]): + def __init__(self, config: Config832) -> None: + super().__init__(config) """ Use a simple 'cp' command to move data within the same system. @@ -156,25 +189,47 @@ def __init__( def copy( self, file_path: str = "", - source: FileSystemEndpoint = "", - destination: FileSystemEndpoint = "", + source: FileSystemEndpoint = None, + destination: FileSystemEndpoint = None, ) -> bool: + """ + Copy a file from a source endpoint to a destination endpoint using the 'cp' command. - logger.info(f"Transferring {file_path} from {source} to {destination}") + Args: + file_path (str): The path of the file to copy. + source (FileSystemEndpoint): The source endpoint. + destination (FileSystemEndpoint): The destination endpoint. - if file_path[0] == "/": + Returns: + bool: True if the transfer was successful, False otherwise. 
+ """ + if not file_path: + logger.error("No file_path provided.") + return False + if not source or not destination: + logger.error("Source or destination endpoint not provided.") + return False + + logger.info(f"Transferring {file_path} from {source.name} to {destination.name}") + + if file_path.startswith("/"): file_path = file_path[1:] - source_path = os.path.join(source, file_path) - dest_path = os.path.join(destination, file_path) + source_path = os.path.join(source.root_path, file_path) + dest_path = os.path.join(destination.root_path, file_path) logger.info(f"Transferring {source_path} to {dest_path}") + # Start the timer start_time = time.time() try: - os.system(f"cp -r {source_path} {dest_path}") - logger.info("Transfer completed successfully.") - return True + result = os.system(f"cp -r '{source_path}' '{dest_path}'") + if result == 0: + logger.info("Transfer completed successfully.") + return True + else: + logger.error(f"Transfer failed with exit code {result}.") + return False except Exception as e: logger.error(f"Transfer failed: {e}") return False @@ -182,7 +237,6 @@ def copy( # Stop the timer and calculate the duration elapsed_time = time.time() - start_time logger.info(f"Transfer process took {elapsed_time:.2f} seconds.") - return True class CopyMethod(Enum): @@ -209,15 +263,31 @@ def get_transfer_controller( TransferController: The transfer controller object. """ if transfer_type == CopyMethod.GLOBUS: - return GlobusTransfer(config) + return GlobusTransferController(config) elif transfer_type == CopyMethod.SIMPLE: - return SimpleTransfer(config) + return SimpleTransferController(config) else: raise ValueError(f"Invalid transfer type: {transfer_type}") -def main(): +if __name__ == "__main__": config = Config832() transfer_type = CopyMethod.GLOBUS - controller = get_transfer_controller(transfer_type, config) - controller + globus_transfer_controller = get_transfer_controller(transfer_type, config) + globus_transfer_controller.copy( + file_path="dabramov/test.txt", + source=config.alcf832_raw, + destination=config.alcf832_scratch + ) + + simple_transfer_controller = get_transfer_controller(CopyMethod.SIMPLE, config) + success = simple_transfer_controller.copy( + file_path="test.rtf", + source=FileSystemEndpoint("source", "/Users/david/Documents/copy_test/test_source/"), + destination=FileSystemEndpoint("destination", "/Users/david/Documents/copy_test/test_destination/") + ) + + if success: + logger.info("Simple transfer succeeded.") + else: + logger.error("Simple transfer failed.") From f74a7a30d7be6d20e3d7527db5d0fc348a1996af Mon Sep 17 00:00:00 2001 From: David Abramov Date: Tue, 28 Jan 2025 10:46:00 -0800 Subject: [PATCH 22/23] Updated unittests --- orchestration/_tests/test_sfapi_flow.py | 3 +- .../_tests/test_transfer_controller.py | 197 ++++++++++++------ 2 files changed, 136 insertions(+), 64 deletions(-) diff --git a/orchestration/_tests/test_sfapi_flow.py b/orchestration/_tests/test_sfapi_flow.py index eedf659..a180968 100644 --- a/orchestration/_tests/test_sfapi_flow.py +++ b/orchestration/_tests/test_sfapi_flow.py @@ -1,9 +1,10 @@ # orchestration/_tests/test_sfapi_flow.py +from pathlib import Path import pytest from unittest.mock import MagicMock, patch, mock_open -from pathlib import Path from uuid import uuid4 + from prefect.blocks.system import Secret from prefect.testing.utilities import prefect_test_harness diff --git a/orchestration/_tests/test_transfer_controller.py b/orchestration/_tests/test_transfer_controller.py index 5994070..15b07f5 100644 --- 
a/orchestration/_tests/test_transfer_controller.py +++ b/orchestration/_tests/test_transfer_controller.py @@ -1,37 +1,90 @@ # test_transfer_controller.py import pytest -import globus_sdk +from pytest_mock import MockFixture from unittest.mock import MagicMock, patch +from uuid import uuid4 + +import globus_sdk +from prefect.blocks.system import Secret +from prefect.testing.utilities import prefect_test_harness + +from .test_globus import MockTransferClient + + +@pytest.fixture(autouse=True, scope="session") +def prefect_test_fixture(): + """ + A pytest fixture that automatically sets up and tears down the Prefect test harness + for the entire test session. It creates and saves test secrets and configurations + required for Globus integration. + + Yields: + None + """ + with prefect_test_harness(): + # Create ephemeral secrets in the local Prefect test database + globus_client_id = Secret(value=str(uuid4())) + globus_client_id.save(name="globus-client-id") + + globus_client_secret = Secret(value=str(uuid4())) + globus_client_secret.save(name="globus-client-secret") + + yield -from orchestration.flows.bl832.config import Config832 -from orchestration.transfer_controller import ( - FileSystemEndpoint, - GlobusTransferController, - SimpleTransferController, - get_transfer_controller, - CopyMethod, -) -from orchestration.globus.transfer import GlobusEndpoint + +@pytest.fixture(scope="session") +def transfer_controller_module(): + """ + Defer importing orchestration.transfer_controller until after + the prefect_test_fixture is loaded. This prevents Prefect from + trying to load secrets at import time. + """ + from orchestration.transfer_controller import ( + FileSystemEndpoint, + GlobusTransferController, + SimpleTransferController, + get_transfer_controller, + CopyMethod, + ) + return { + "FileSystemEndpoint": FileSystemEndpoint, + "GlobusTransferController": GlobusTransferController, + "SimpleTransferController": SimpleTransferController, + "get_transfer_controller": get_transfer_controller, + "CopyMethod": CopyMethod, + } + + +class MockEndpoint: + def __init__(self, root_path, uuid_value=None): + self.root_path = root_path + self.uuid = uuid_value or str(uuid4()) + self.uri = f"mock_endpoint_uri_{self.uuid}" @pytest.fixture def mock_config832(): """ - A pytest fixture that provides a mocked Config832 object - with a mocked TransferClient (tc). + Mock the Config832 class to provide necessary configurations. """ - mock_config = MagicMock(spec=Config832) - # Mock the Globus transfer client - mock_config.tc = MagicMock(name="MockTransferClient") - return mock_config + with patch("orchestration.flows.bl832.nersc.Config832") as MockConfig: + mock_config = MockConfig.return_value + mock_config.endpoints = { + "alcf832_raw": MockEndpoint("/alcf832_raw"), + } + mock_config.tc = MockTransferClient() + yield mock_config @pytest.fixture def mock_globus_endpoint(): """ A pytest fixture that returns a mocked GlobusEndpoint. + If your orchestration.globus.transfer also loads secrets at import, + you may need to similarly defer that import behind another fixture. """ + from orchestration.globus.transfer import GlobusEndpoint endpoint = GlobusEndpoint( name="mock_globus_endpoint", root_path="/mock_globus_root/", @@ -42,10 +95,11 @@ def mock_globus_endpoint(): @pytest.fixture -def mock_file_system_endpoint(): +def mock_file_system_endpoint(transfer_controller_module): """ A pytest fixture that returns a FileSystemEndpoint instance. 
""" + FileSystemEndpoint = transfer_controller_module["FileSystemEndpoint"] endpoint = FileSystemEndpoint( name="mock_filesystem_endpoint", root_path="/mock_fs_root" @@ -53,37 +107,38 @@ def mock_file_system_endpoint(): return endpoint +class MockSecret: + value = str(uuid4()) + + # -------------------------------------------------------------------------- # Tests for get_transfer_controller # -------------------------------------------------------------------------- -def test_get_transfer_controller_globus(mock_config832): - """ - Test that get_transfer_controller returns a GlobusTransferController when - the transfer type is CopyMethod.GLOBUS. - """ + +def test_get_transfer_controller_globus(mock_config832, transfer_controller_module): + CopyMethod = transfer_controller_module["CopyMethod"] + get_transfer_controller = transfer_controller_module["get_transfer_controller"] + GlobusTransferController = transfer_controller_module["GlobusTransferController"] + controller = get_transfer_controller(CopyMethod.GLOBUS, mock_config832) assert isinstance(controller, GlobusTransferController), ( - "get_transfer_controller should return a GlobusTransferController " - "instance for GLOBUS transfer." + "Expected GlobusTransferController for GLOBUS transfer type." ) -def test_get_transfer_controller_simple(mock_config832): - """ - Test that get_transfer_controller returns a SimpleTransferController when - the transfer type is CopyMethod.SIMPLE. - """ +def test_get_transfer_controller_simple(mock_config832, transfer_controller_module): + CopyMethod = transfer_controller_module["CopyMethod"] + get_transfer_controller = transfer_controller_module["get_transfer_controller"] + SimpleTransferController = transfer_controller_module["SimpleTransferController"] + controller = get_transfer_controller(CopyMethod.SIMPLE, mock_config832) assert isinstance(controller, SimpleTransferController), ( - "get_transfer_controller should return a SimpleTransferController " - "instance for SIMPLE transfer." + "Expected SimpleTransferController for SIMPLE transfer type." ) -def test_get_transfer_controller_invalid(mock_config832): - """ - Test that get_transfer_controller raises ValueError for invalid transfer method. - """ +def test_get_transfer_controller_invalid(mock_config832, transfer_controller_module): + get_transfer_controller = transfer_controller_module["get_transfer_controller"] with pytest.raises(ValueError, match="Invalid transfer type"): get_transfer_controller("invalid_type", mock_config832) @@ -91,11 +146,20 @@ def test_get_transfer_controller_invalid(mock_config832): # -------------------------------------------------------------------------- # Tests for GlobusTransferController # -------------------------------------------------------------------------- -def test_globus_transfer_controller_copy_success(mock_config832, mock_globus_endpoint): + +def test_globus_transfer_controller_copy_success( + mock_config832, mock_globus_endpoint, mocker: MockFixture, transfer_controller_module +): """ Test a successful copy() operation using GlobusTransferController. We mock start_transfer to return True. 
""" + GlobusTransferController = transfer_controller_module["GlobusTransferController"] + MockSecretClass = MockSecret + + # Patch any Secret.load calls to avoid real Prefect Cloud calls + mocker.patch('prefect.blocks.system.Secret.load', return_value=MockSecretClass()) + with patch("orchestration.transfer_controller.start_transfer", return_value=True) as mock_start_transfer: controller = GlobusTransferController(mock_config832) result = controller.copy( @@ -111,14 +175,21 @@ def test_globus_transfer_controller_copy_success(mock_config832, mock_globus_end _, called_kwargs = mock_start_transfer.call_args assert called_kwargs["source_endpoint"] == mock_globus_endpoint assert called_kwargs["dest_endpoint"] == mock_globus_endpoint - assert "max_wait_seconds" in called_kwargs, "max_wait_seconds should be passed to start_transfer." + assert "max_wait_seconds" in called_kwargs -def test_globus_transfer_controller_copy_failure(mock_config832, mock_globus_endpoint): +def test_globus_transfer_controller_copy_failure( + mock_config832, mock_globus_endpoint, mocker: MockFixture, transfer_controller_module +): """ Test a failing copy() operation using GlobusTransferController. We mock start_transfer to return False, indicating a transfer failure. """ + GlobusTransferController = transfer_controller_module["GlobusTransferController"] + MockSecretClass = MockSecret + + mocker.patch('prefect.blocks.system.Secret.load', return_value=MockSecretClass()) + with patch("orchestration.transfer_controller.start_transfer", return_value=False) as mock_start_transfer: controller = GlobusTransferController(mock_config832) result = controller.copy( @@ -130,10 +201,13 @@ def test_globus_transfer_controller_copy_failure(mock_config832, mock_globus_end mock_start_transfer.assert_called_once() -def test_globus_transfer_controller_copy_exception(mock_config832, mock_globus_endpoint): +def test_globus_transfer_controller_copy_exception( + mock_config832, mock_globus_endpoint, transfer_controller_module +): """ - Test copy() operation that raises a TransferAPIError exception in GlobusTransferController. + Test copy() operation that raises a TransferAPIError exception. """ + GlobusTransferController = transfer_controller_module["GlobusTransferController"] mock_response = MagicMock() mock_response.status_code = 400 mock_response.reason = "Bad Request" @@ -155,10 +229,11 @@ def test_globus_transfer_controller_copy_exception(mock_config832, mock_globus_e # -------------------------------------------------------------------------- # Tests for SimpleTransferController # -------------------------------------------------------------------------- -def test_simple_transfer_controller_no_file_path(mock_config832, mock_file_system_endpoint): - """ - Test that copy() returns False if no file_path is provided. - """ + +def test_simple_transfer_controller_no_file_path( + mock_config832, mock_file_system_endpoint, transfer_controller_module +): + SimpleTransferController = transfer_controller_module["SimpleTransferController"] controller = SimpleTransferController(mock_config832) result = controller.copy( file_path="", @@ -168,10 +243,8 @@ def test_simple_transfer_controller_no_file_path(mock_config832, mock_file_syste assert result is False, "Expected False when no file_path is provided." -def test_simple_transfer_controller_no_source_or_destination(mock_config832): - """ - Test that copy() returns False if source or destination is None. 
-    """
+def test_simple_transfer_controller_no_source_or_destination(mock_config832, transfer_controller_module):
+    SimpleTransferController = transfer_controller_module["SimpleTransferController"]
     controller = SimpleTransferController(mock_config832)
     result = controller.copy(
         file_path="test.txt",
@@ -181,11 +254,10 @@ def test_simple_transfer_controller_no_source_or_destination(mock_config832):
     assert result is False, "Expected False when either source or destination is None."
 
 
-def test_simple_transfer_controller_copy_success(mock_config832, mock_file_system_endpoint):
-    """
-    Test a successful copy() operation using SimpleTransferController by mocking os.system
-    to return 0 (indicating success).
-    """
+def test_simple_transfer_controller_copy_success(
+    mock_config832, mock_file_system_endpoint, transfer_controller_module
+):
+    SimpleTransferController = transfer_controller_module["SimpleTransferController"]
     with patch("os.system", return_value=0) as mock_os_system:
         controller = SimpleTransferController(mock_config832)
         result = controller.copy(
@@ -200,11 +272,10 @@ def test_simple_transfer_controller_copy_success(mock_config832, mock_file_syste
         assert "cp -r" in command_called, "Expected cp command in os.system call."
 
 
-def test_simple_transfer_controller_copy_failure(mock_config832, mock_file_system_endpoint):
-    """
-    Test a failing copy() operation using SimpleTransferController by mocking os.system
-    to return a non-zero code.
-    """
+def test_simple_transfer_controller_copy_failure(
+    mock_config832, mock_file_system_endpoint, transfer_controller_module
+):
+    SimpleTransferController = transfer_controller_module["SimpleTransferController"]
     with patch("os.system", return_value=1) as mock_os_system:
         controller = SimpleTransferController(mock_config832)
         result = controller.copy(
@@ -213,16 +284,16 @@ def test_simple_transfer_controller_copy_failure(mock_config832, mock_file_syste
             destination=mock_file_system_endpoint,
         )
 
-        assert result is False, "Expected False when os.system returns a non-zero code."
+        assert result is False, "Expected False when os.system returns non-zero."
         mock_os_system.assert_called_once()
         command_called = mock_os_system.call_args[0][0]
        assert "cp -r" in command_called, "Expected cp command in os.system call."
 
 
-def test_simple_transfer_controller_copy_exception(mock_config832, mock_file_system_endpoint):
-    """
-    Test a copy() operation that raises an exception in SimpleTransferController.
- """ +def test_simple_transfer_controller_copy_exception( + mock_config832, mock_file_system_endpoint, transfer_controller_module +): + SimpleTransferController = transfer_controller_module["SimpleTransferController"] with patch("os.system", side_effect=Exception("Mocked cp error")) as mock_os_system: controller = SimpleTransferController(mock_config832) result = controller.copy( From 087db2db4a08078690753899995e204311795e0b Mon Sep 17 00:00:00 2001 From: David Abramov Date: Tue, 28 Jan 2025 16:15:27 -0800 Subject: [PATCH 23/23] removed @dataclass decorator because it is redundant with an __init__ block --- orchestration/transfer_controller.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/orchestration/transfer_controller.py b/orchestration/transfer_controller.py index 31cc7db..973b796 100644 --- a/orchestration/transfer_controller.py +++ b/orchestration/transfer_controller.py @@ -1,5 +1,4 @@ from abc import ABC, abstractmethod -from dataclasses import dataclass from dotenv import load_dotenv from enum import Enum import logging @@ -42,7 +41,6 @@ def root_path(self) -> str: return self.root_path -@dataclass class FileSystemEndpoint(TransferEndpoint): """ A file system endpoint.
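
The rationale in the final commit message follows from how Python's dataclasses behave: @dataclass generates an __init__ only when the class does not define one itself (a user-defined __init__ suppresses the generated one, per the dataclasses documentation), so decorating a class that already hand-writes its __init__ buys almost nothing. Below is a minimal sketch of the distinction; DecoratedEndpoint and PlainEndpoint are illustrative names, not classes from this repository.

    from dataclasses import dataclass


    @dataclass
    class DecoratedEndpoint:
        # The annotations tell @dataclass what to generate, but because a
        # hand-written __init__ is defined below, the decorator skips its
        # generated __init__ and only adds __repr__ and __eq__.
        name: str
        root_path: str

        def __init__(self, name: str, root_path: str):
            self.name = name
            self.root_path = root_path


    class PlainEndpoint:
        # The same class without the decorator, mirroring this patch:
        # the hand-written __init__ does all of the work.
        def __init__(self, name: str, root_path: str):
            self.name = name
            self.root_path = root_path


    if __name__ == "__main__":
        a = DecoratedEndpoint("mock_endpoint", "/mock_fs_root")
        b = PlainEndpoint("mock_endpoint", "/mock_fs_root")
        # Both initialize identically through the hand-written __init__.
        assert (a.name, a.root_path) == (b.name, b.root_path)

If the generated __repr__ and __eq__ were ever wanted, keeping the decorator and dropping the hand-written __init__ would be the cleaner direction; since this patch keeps the explicit __init__ block, removing the decorator is the consistent choice.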