From 3a0343e73e43d86e558c54ef69b01f577e2ea9c2 Mon Sep 17 00:00:00 2001
From: Douglas Jacobsen
Date: Tue, 21 Jan 2025 11:29:05 -0700
Subject: [PATCH 1/2] Update execution path for NVIDIA container workloads

This commit updates the execution path for NVIDIA container workloads to
ensure they execute binaries from `/workspace`.

Additionally, the internal_mpi_command variable is added to each of these
applications to make execution with the container's built-in MPI easier
(rather than strictly an external MPI).
---
 .../applications/nvidia-hpcg/application.py   |  9 ++++++++-
 .../nvidia-hpl-mxp/application.py             |  9 ++++++++-
 .../applications/nvidia-hpl/application.py    | 19 +++++++++----------
 3 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/var/ramble/repos/builtin/applications/nvidia-hpcg/application.py b/var/ramble/repos/builtin/applications/nvidia-hpcg/application.py
index 54bf403b4..2be4f66b1 100644
--- a/var/ramble/repos/builtin/applications/nvidia-hpcg/application.py
+++ b/var/ramble/repos/builtin/applications/nvidia-hpcg/application.py
@@ -23,7 +23,7 @@ class NvidiaHpcg(BaseHpcg):
 
     executable(
         "execute",
-        "./hpcg.sh --dat {experiment_run_dir}/hpcg.dat",
+        "{internal_mpi_command} /workspace/hpcg.sh --dat {experiment_run_dir}/hpcg.dat",
         use_mpi=True,
     )
 
@@ -31,6 +31,13 @@ class NvidiaHpcg(BaseHpcg):
     workload_group("all_workloads", workloads=["standard"], mode="append")
 
+    workload_variable(
+        "internal_mpi_command",
+        default="",
+        description="MPI Command for execution using container built-in MPI",
+        workload_group="all_workloads",
+    )
+
     workload_variable(
         "nvshmem_disable_cuda_vmm",
         default="1",
         description="",
diff --git a/var/ramble/repos/builtin/applications/nvidia-hpl-mxp/application.py b/var/ramble/repos/builtin/applications/nvidia-hpl-mxp/application.py
index 0c1e64fb6..1abd28e4e 100644
--- a/var/ramble/repos/builtin/applications/nvidia-hpl-mxp/application.py
+++ b/var/ramble/repos/builtin/applications/nvidia-hpl-mxp/application.py
@@ -36,7 +36,7 @@ class NvidiaHplMxp(HplBase):
 
     executable(
         "execute",
-        '/workspace/hpl-mxp.sh --gpu-affinity "{gpu_affinity}" --n {Ns} --nb {block_size} --nprow {Ps} --npcol {Qs} --nporder {nporder}',
+        '{internal_mpi_command} /workspace/hpl-mxp.sh --gpu-affinity "{gpu_affinity}" --n {Ns} --nb {block_size} --nprow {Ps} --npcol {Qs} --nporder {nporder}',
         use_mpi=True,
     )
 
@@ -50,6 +50,13 @@ class NvidiaHplMxp(HplBase):
         workloads=["standard", "calculator"],
     )
+    workload_variable(
+        "internal_mpi_command",
+        default="",
+        description="MPI Command for execution using container built-in MPI",
+        workload_group="all_workloads",
+    )
+
     workload_variable(
         "nvshmem_disable_cuda_vmm",
         default="1",
         description="",
diff --git a/var/ramble/repos/builtin/applications/nvidia-hpl/application.py b/var/ramble/repos/builtin/applications/nvidia-hpl/application.py
index 29a5b511d..126f5cb2c 100644
--- a/var/ramble/repos/builtin/applications/nvidia-hpl/application.py
+++ b/var/ramble/repos/builtin/applications/nvidia-hpl/application.py
@@ -35,21 +35,14 @@ class NvidiaHpl(HplBase):
     tags("benchmark-app", "benchmark", "linpack", "optimized", "nvidia")
 
     executable(
-        "execute", "./hpl.sh --dat {experiment_run_dir}/HPL.dat", use_mpi=True
-    )
-
-    executable(
-        "execute-mxp",
-        './hpl-mxp.sh --gpu-affinity "{gpu_affinity}" --n {Ns} --nb {block_size} --nprow {Ps} --npcol {Qs} --nporder {nporder}',
+        "execute",
+        "{internal_mpi_command} /workspace/hpl.sh --dat {experiment_run_dir}/HPL.dat",
         use_mpi=True,
     )
 
     workload("standard", executables=["execute"])
     workload("calculator", executables=["execute"])
 
-    workload("standard-mxp", executables=["execute-mxp"])
-    workload("calculator-mxp", executables=["execute-mxp"])
-
     workload_group(
         "standard", workloads=["standard", "standard-mxp"], mode="append"
     )
@@ -60,7 +53,13 @@ class NvidiaHpl(HplBase):
         "all_workloads",
         workloads=["standard", "standard-mxp", "calculator", "calculator-mxp"],
     )
-    workload_group("mxp", workloads=["standard-mxp", "calculator-mxp"])
+
+    workload_variable(
+        "internal_mpi_command",
+        default="",
+        description="MPI Command for execution using container built-in MPI",
+        workload_group="all_workloads",
+    )
 
     workload_variable(
         "nvshmem_disable_cuda_vmm",

From c504b4390230860d97c9d98fa7c1bd87371bef52 Mon Sep 17 00:00:00 2001
From: Douglas Jacobsen
Date: Tue, 21 Jan 2025 11:37:28 -0700
Subject: [PATCH 2/2] Add nvidia-hpc-benchmarks base application and apply it

This commit extracts common logic from the NVIDIA containerized benchmark
application definitions into a base application. This base application is
then used to apply the same logic to each of the NVIDIA HPC-Benchmarks
implementations.
---
 .../applications/nvidia-hpcg/application.py   | 129 +--------
 .../nvidia-hpl-mxp/application.py             | 129 +--------
 .../applications/nvidia-hpl/application.py    | 250 ++++++++++++------
 .../nvidia-hpc-benchmarks/base_application.py | 127 +++++++++
 4 files changed, 310 insertions(+), 325 deletions(-)
 create mode 100644 var/ramble/repos/builtin/base_applications/nvidia-hpc-benchmarks/base_application.py

diff --git a/var/ramble/repos/builtin/applications/nvidia-hpcg/application.py b/var/ramble/repos/builtin/applications/nvidia-hpcg/application.py
index 2be4f66b1..bea741146 100644
--- a/var/ramble/repos/builtin/applications/nvidia-hpcg/application.py
+++ b/var/ramble/repos/builtin/applications/nvidia-hpcg/application.py
@@ -8,9 +8,12 @@
 from ramble.appkit import *
 
 from ramble.base_app.builtin.hpcg import Hpcg as BaseHpcg
+from ramble.base_app.builtin.nvidia_hpc_benchmarks import (
+    NvidiaHpcBenchmarks as NvidiaHpcBase,
+)
 
 
-class NvidiaHpcg(BaseHpcg):
+class NvidiaHpcg(BaseHpcg, NvidiaHpcBase):
     """NVIDIA's HPCG benchmark accelerates the High Performance Conjugate
     Gradients (HPCG) Benchmark.
HPCG is a software package that performs a fixed number of multigrid preconditioned (using a symmetric Gauss-Seidel @@ -30,127 +33,3 @@ class NvidiaHpcg(BaseHpcg): workload("standard", executables=["execute"]) workload_group("all_workloads", workloads=["standard"], mode="append") - - workload_variable( - "internal_mpi_command", - default="", - description="MPI Command for execution using container built-in MPI", - workload_group="all_workloads", - ) - - workload_variable( - "nvshmem_disable_cuda_vmm", - default="1", - description="", - workload_group="all_workloads", - ) - environment_variable( - "NVSHMEM_DISABLE_CUDA_VMM", - "{nvshmem_disable_cuda_vmm}", - description="", - workload_group="all_workloads", - ) - - workload_variable( - "hpl_fct_comm_policy", - default="1", - description="", - workload_group="all_workloads", - ) - environment_variable( - "HPL_FCT_COMM_POLICY", - "{hpl_fct_comm_policy}", - description="", - workload_group="all_workloads", - ) - - workload_variable( - "hpl_use_nvshmem", - default="0", - description="Whether to use NVSHMEM or not", - workload_group="all_workloads", - ) - environment_variable( - "HPL_USE_NVSHMEM", - "{hpl_use_nvshmem}", - description="Whether or not to use NVSHMEM", - workload_group="all_workloads", - ) - - workload_variable( - "hpl_p2p_as_bcast", - default="0", - description="0 = ncclBcast, 1 = ncclSend/Recv", - workload_group="all_workloads", - ) - environment_variable( - "HPL_P2P_AS_BCAST", - "{hpl_p2p_as_bcast}", - description="Whether or not to use P2P for BCAST", - workload_group="all_workloads", - ) - - workload_variable( - "pmix_mca_gds", - default="^ds12", - description="", - workload_group="all_workloads", - ) - environment_variable( - "PMIX_MCA_gds", - "{pmix_mca_gds}", - description="PMIX MCA gds", - workload_group="all_workloads", - ) - - workload_variable( - "ompi_mca_btl", - default="^vader,tcp,openib,uct", - description="", - workload_group="all_workloads", - ) - environment_variable( - "OMPI_MCA_btl", - "{ompi_mca_btl}", - description="OpenMPI MCA btl", - workload_group="all_workloads", - ) - - workload_variable( - "ompi_mca_pml", - default="ucx", - description="", - workload_group="all_workloads", - ) - environment_variable( - "OMPI_MCA_pml", - "{ompi_mca_pml}", - description="OpenMPI MCA pml", - workload_group="all_workloads", - ) - - workload_variable( - "ucx_net_devices", - default="enp6s0,enp12s0,enp134s0,enp140s0", - description="", - workload_group="all_workloads", - ) - environment_variable( - "UCX_NET_DEVICES", - "{ucx_net_devices}", - description="UCX Net Devices", - workload_group="all_workloads", - ) - - workload_variable( - "ucx_max_rndv_rails", - default="4", - description="", - workload_group="all_workloads", - ) - environment_variable( - "UCX_MAX_RNDV_RAILS", - "{ucx_max_rndv_rails}", - description="UCX MAximum RNDV Rails", - workload_group="all_workloads", - ) diff --git a/var/ramble/repos/builtin/applications/nvidia-hpl-mxp/application.py b/var/ramble/repos/builtin/applications/nvidia-hpl-mxp/application.py index 1abd28e4e..fd277cb45 100644 --- a/var/ramble/repos/builtin/applications/nvidia-hpl-mxp/application.py +++ b/var/ramble/repos/builtin/applications/nvidia-hpl-mxp/application.py @@ -9,9 +9,12 @@ from ramble.appkit import * from ramble.base_app.builtin.hpl import Hpl as HplBase +from ramble.base_app.builtin.nvidia_hpc_benchmarks import ( + NvidiaHpcBenchmarks as NvidiaHpcBase, +) -class NvidiaHplMxp(HplBase): +class NvidiaHplMxp(HplBase, NvidiaHpcBase): """This application defines how to run NVIDIA's 
optimized version of HPL, which is contained in NVIDIA's HPC-Benchmarks collection. @@ -50,130 +53,6 @@ class NvidiaHplMxp(HplBase): workloads=["standard", "calculator"], ) - workload_variable( - "internal_mpi_command", - default="", - description="MPI Command for execution using container built-in MPI", - workload_group="all_workloads", - ) - - workload_variable( - "nvshmem_disable_cuda_vmm", - default="1", - description="", - workload_group="all_workloads", - ) - environment_variable( - "NVSHMEM_DISABLE_CUDA_VMM", - "{nvshmem_disable_cuda_vmm}", - description="", - workload_group="all_workloads", - ) - - workload_variable( - "hpl_fct_comm_policy", - default="1", - description="", - workload_group="all_workloads", - ) - environment_variable( - "HPL_FCT_COMM_POLICY", - "{hpl_fct_comm_policy}", - description="", - workload_group="all_workloads", - ) - - workload_variable( - "hpl_use_nvshmem", - default="0", - description="Whether to use NVSHMEM or not", - workload_group="all_workloads", - ) - environment_variable( - "HPL_USE_NVSHMEM", - "{hpl_use_nvshmem}", - description="Whether or not to use NVSHMEM", - workload_group="all_workloads", - ) - - workload_variable( - "hpl_p2p_as_bcast", - default="0", - description="0 = ncclBcast, 1 = ncclSend/Recv", - workload_group="all_workloads", - ) - environment_variable( - "HPL_P2P_AS_BCAST", - "{hpl_p2p_as_bcast}", - description="Whether or not to use P2P for BCAST", - workload_group="all_workloads", - ) - - workload_variable( - "pmix_mca_gds", - default="^ds12", - description="", - workload_group="all_workloads", - ) - environment_variable( - "PMIX_MCA_gds", - "{pmix_mca_gds}", - description="PMIX MCA gds", - workload_group="all_workloads", - ) - - workload_variable( - "ompi_mca_btl", - default="^vader,tcp,openib,uct", - description="", - workload_group="all_workloads", - ) - environment_variable( - "OMPI_MCA_btl", - "{ompi_mca_btl}", - description="OpenMPI MCA btl", - workload_group="all_workloads", - ) - - workload_variable( - "ompi_mca_pml", - default="ucx", - description="", - workload_group="all_workloads", - ) - environment_variable( - "OMPI_MCA_pml", - "{ompi_mca_pml}", - description="OpenMPI MCA pml", - workload_group="all_workloads", - ) - - workload_variable( - "ucx_net_devices", - default="enp6s0,enp12s0,enp134s0,enp140s0", - description="", - workload_group="all_workloads", - ) - environment_variable( - "UCX_NET_DEVICES", - "{ucx_net_devices}", - description="UCX Net Devices", - workload_group="all_workloads", - ) - - workload_variable( - "ucx_max_rndv_rails", - default="4", - description="", - workload_group="all_workloads", - ) - environment_variable( - "UCX_MAX_RNDV_RAILS", - "{ucx_max_rndv_rails}", - description="UCX MAximum RNDV Rails", - workload_group="all_workloads", - ) - workload_variable( "block_size", default="1024", diff --git a/var/ramble/repos/builtin/applications/nvidia-hpl/application.py b/var/ramble/repos/builtin/applications/nvidia-hpl/application.py index 126f5cb2c..6e78d815c 100644 --- a/var/ramble/repos/builtin/applications/nvidia-hpl/application.py +++ b/var/ramble/repos/builtin/applications/nvidia-hpl/application.py @@ -9,9 +9,12 @@ from ramble.appkit import * from ramble.base_app.builtin.hpl import Hpl as HplBase +from ramble.base_app.builtin.nvidia_hpc_benchmarks import ( + NvidiaHpcBenchmarks as NvidiaHPCBase, +) -class NvidiaHpl(HplBase): +class NvidiaHpl(HplBase, NvidiaHPCBase): """This application defines how to run NVIDIA's optimized version of HPL, which is contained in NVIDIA's HPC-Benchmarks collection. 
@@ -43,41 +46,18 @@ class NvidiaHpl(HplBase): workload("standard", executables=["execute"]) workload("calculator", executables=["execute"]) - workload_group( - "standard", workloads=["standard", "standard-mxp"], mode="append" - ) - workload_group( - "calculator", workloads=["calculator", "calculator-mxp"], mode="append" - ) + workload_group("standard", workloads=["standard"], mode="append") + workload_group("calculator", workloads=["calculator"], mode="append") workload_group( "all_workloads", - workloads=["standard", "standard-mxp", "calculator", "calculator-mxp"], - ) - - workload_variable( - "internal_mpi_command", - default="", - description="MPI Command for execution using container built-in MPI", - workload_group="all_workloads", - ) - - workload_variable( - "nvshmem_disable_cuda_vmm", - default="1", - description="", - workload_group="all_workloads", - ) - environment_variable( - "NVSHMEM_DISABLE_CUDA_VMM", - "{nvshmem_disable_cuda_vmm}", - description="", - workload_group="all_workloads", + workloads=["standard", "calculator"], ) workload_variable( "hpl_fct_comm_policy", default="1", - description="", + values=["0", "1"], + description="Which communication library to use in the panel factorization. 0 = NVSHMEM, 1 = Host MPI", workload_group="all_workloads", ) environment_variable( @@ -90,7 +70,8 @@ class NvidiaHpl(HplBase): workload_variable( "hpl_use_nvshmem", default="0", - description="Whether to use NVSHMEM or not", + values=["0", "1"], + description="Whether to use NVSHMEM or not. 0 = Disable, 1 = Enable.", workload_group="all_workloads", ) environment_variable( @@ -103,101 +84,220 @@ class NvidiaHpl(HplBase): workload_variable( "hpl_p2p_as_bcast", default="0", - description="0 = ncclBcast, 1 = ncclSend/Recv", + values=["0", "1", "2", "3", "4"], + description="0 = ncclBcast, 1 = ncclSend/Recv, 2 = CUDA-aware MPI, 3 = host MPI, 4 = NVSHMEM", workload_group="all_workloads", ) environment_variable( "HPL_P2P_AS_BCAST", "{hpl_p2p_as_bcast}", - description="Whether or not to use P2P for BCAST", + description="Which communication library to use in the final solve step.", workload_group="all_workloads", ) workload_variable( - "pmix_mca_gds", - default="^ds12", - description="", + "hpl_nvshmem_swap", + default="0", + values=["0", "1"], + description="Performs row swaps using NVSHMEM instead of NCCL. 0 = Disable, 1 = Enable.", workload_group="all_workloads", ) environment_variable( - "PMIX_MCA_gds", - "{pmix_mca_gds}", - description="PMIX MCA gds", + "HPL_NVSHMEM_SWAP", + "{hpl_nvshmem_swap}", + description="Performs row swaps using NVSHMEM instead of NCCL. 0 = Disable, 1 = Enable.", workload_group="all_workloads", ) workload_variable( - "ompi_mca_btl", - default="^vader,tcp,openib,uct", - description="", + "hpl_chunk_size_nbs", + default="16", + description="Number of matrix blocks to group for computations. Needs to be > 0", workload_group="all_workloads", ) environment_variable( - "OMPI_MCA_btl", - "{ompi_mca_btl}", - description="OpenMPI MCA btl", + "HPL_CHUNK_SIZE_NBS", + "{hpl_chunk_size_nbs}", + description="Number of matrix blocks to group for computations. 
Needs to be > 0", workload_group="all_workloads", ) workload_variable( - "ompi_mca_pml", - default="ucx", - description="", + "hpl_dist_trsm_flag", + default="1", + values=["0", "1"], + description="Perform the solve step (TRSM) in parallel, rather than on only the ranks that own part of the matrix.", workload_group="all_workloads", ) environment_variable( - "OMPI_MCA_pml", - "{ompi_mca_pml}", - description="OpenMPI MCA pml", + "HPL_DIST_TRSM_FLAG", + "{hpl_dist_trsm_flag}", + description="Perform the solve step (TRSM) in parallel, rather than on only the ranks that own part of the matrix.", workload_group="all_workloads", ) workload_variable( - "ucx_net_devices", - default="enp6s0,enp12s0,enp134s0,enp140s0", - description="", + "hpl_cta_per_fct", + default="16", + description="Sets the number of CTAs (thread blocks) for factorization. Needs to be > 0.", workload_group="all_workloads", ) environment_variable( - "UCX_NET_DEVICES", - "{ucx_net_devices}", - description="UCX Net Devices", + "HPL_CTA_PER_FCT", + "{hpl_cta_per_fct}", + description="Sets the number of CTAs (thread blocks) for factorization. Needs to be > 0.", workload_group="all_workloads", ) workload_variable( - "ucx_max_rndv_rails", - default="4", - description="", + "hpl_alloc_hugepages", + default="0", + values=["0", "1"], + description="Use 2MB hugepages for host-side allocations. Done through the madvise syscall.", workload_group="all_workloads", ) environment_variable( - "UCX_MAX_RNDV_RAILS", - "{ucx_max_rndv_rails}", - description="UCX MAximum RNDV Rails", + "HPL_ALLOC_HUGEPAGES", + "{hpl_alloc_hugepages}", + description="Use 2MB hugepages for host-side allocations. Done through the madvise syscall.", workload_group="all_workloads", ) workload_variable( - "block_size", - default="1024", - description="Size of each block", - workload_group="calculator", + "warmup_end_prog", + default="-1", + description="Runs the main loop once before the 'real' run. Stops the warmup at x%. Values can be 1 - 100.", + workload_group="all_workloads", + ) + environment_variable( + "WARMUP_END_PROG", + "{warmup_end_prog}", + description="Runs the main loop once before the 'real' run. Stops the warmup at x%. Values can be 1 - 100.", + workload_group="all_workloads", + ) + + workload_variable( + "test_loops", + default="1", + description="Runs the main loop X many times", + workload_group="all_workloads", + ) + environment_variable( + "TEST_LOOPS", + "{test_loops}", + description="Runs the main loop X many times", + workload_group="all_workloads", + ) + + workload_variable( + "hpl_cusolver_mp_tests", + default="1", + description="Runs several tests of individual components of HPL (GEMMS, comms, etc.)", + workload_group="all_workloads", + ) + environment_variable( + "HPL_CUSOLVER_MP_TESTS", + "{hpl_cusolver_mp_tests}", + description="Runs several tests of individual components of HPL (GEMMS, comms, etc.)", + workload_group="all_workloads", ) workload_variable( - "nporder", - default="row", - description="Major order to use for matrix", - values=["row", "column"], - workload_group="mxp", + "hpl_cusolver_mp_tests_gemm_iters", + default="128", + description="Number of repeat GEMM calls in tests. Needs to be > 0.", + workload_group="all_workloads", + ) + environment_variable( + "HPL_CUSOLVER_MP_TESTS_GEMM_ITERS", + "{hpl_cusolver_mp_tests_gemm_iters}", + description="Number of repeat GEMM calls in tests. 
Needs to be > 0.",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "hpl_ooc_mode",
+        default="1",
+        description="Enables / disables out-of-core mode",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "HPL_OOC_MODE",
+        "{hpl_ooc_mode}",
+        description="Enables / disables out-of-core mode",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "hpl_ooc_max_gpu_mem",
+        default="-1",
+        description="Limits the amount of GPU memory used for OOC. In GiB. Needs to be >= -1.",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "HPL_OOC_MAX_GPU_MEM",
+        "{hpl_ooc_max_gpu_mem}",
+        description="Limits the amount of GPU memory used for OOC. In GiB. Needs to be >= -1.",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "hpl_ooc_tile_m",
+        default="4096",
+        description="Row blocking factor. Needs to be > 0",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "HPL_OOC_TILE_M",
+        "{hpl_ooc_tile_m}",
+        description="Row blocking factor. Needs to be > 0",
+        workload_group="all_workloads",
     )
     workload_variable(
-        "gpu_affinity",
-        default="0:1:2:3:4:5:6:7",
-        description="Colon delimited list of GPU IDs",
-        workload_group="mxp",
+        "hpl_ooc_tile_n",
+        default="4096",
+        description="Column blocking factor. Needs to be > 0",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "HPL_OOC_TILE_N",
+        "{hpl_ooc_tile_n}",
+        description="Column blocking factor. Needs to be > 0",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "hpl_ooc_num_streams",
+        default="3",
+        description="Number of streams used for OOC operations",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "HPL_OOC_NUM_STREAMS",
+        "{hpl_ooc_num_streams}",
+        description="Number of streams used for OOC operations",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "hpl_ooc_safe_size",
+        default="2.0",
+        description="GPU memory (in GiB) needed for driver. This amount will not be used by HPL OOC",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "HPL_OOC_SAFE_SIZE",
+        "{hpl_ooc_safe_size}",
+        description="GPU memory (in GiB) needed for driver. This amount will not be used by HPL OOC",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "block_size",
+        default="1024",
+        description="Size of each block",
+        workload_group="calculator",
     )
 
     figure_of_merit(
diff --git a/var/ramble/repos/builtin/base_applications/nvidia-hpc-benchmarks/base_application.py b/var/ramble/repos/builtin/base_applications/nvidia-hpc-benchmarks/base_application.py
new file mode 100644
index 000000000..2097ecee8
--- /dev/null
+++ b/var/ramble/repos/builtin/base_applications/nvidia-hpc-benchmarks/base_application.py
@@ -0,0 +1,127 @@
+# Copyright 2022-2025 The Ramble Authors
+#
+# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+# https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+# <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+# option. This file may not be copied, modified, or distributed
+# except according to those terms.
+
+
+from ramble.appkit import *
+
+
+class NvidiaHpcBenchmarks(ExecutableApplication):
+    """The NVIDIA HPC-Benchmarks collection provides four benchmarks (HPL,
+    HPL-MxP, HPCG, and STREAM) widely used in the HPC community, optimized for
+    performance on NVIDIA accelerated HPC systems.
+
+    NVIDIA's HPL and HPL-MxP benchmarks provide software packages to solve a
+    (random) dense linear system in double precision (64-bit) arithmetic and in
+    mixed precision arithmetic using Tensor Cores, respectively, on
+    distributed-memory computers equipped with NVIDIA GPUs, based on the Netlib
+    HPL benchmark and HPL-MxP benchmark.
+
+    NVIDIA's HPCG benchmark accelerates the High Performance Conjugate Gradients
+    (HPCG) Benchmark. HPCG is a software package that performs a fixed number of
+    multigrid preconditioned (using a symmetric Gauss-Seidel smoother) conjugate
+    gradient (PCG) iterations using double precision (64-bit) floating point
+    values.
+
+    NVIDIA's STREAM benchmark is a simple synthetic benchmark program that
+    measures sustainable memory bandwidth. The NVIDIA HPC-Benchmarks container
+    includes STREAM benchmarks optimized for the NVIDIA Ampere GPU architecture
+    (sm80), NVIDIA Hopper GPU architecture (sm90), and NVIDIA Grace CPU.
+    """
+
+    name = "nvidia-hpc-benchmarks"
+
+    maintainers("douglasjacobsen")
+
+    tags("benchmark-app", "mini-app", "benchmark", "containerize")
+
+    workload_group("all_workloads")
+
+    workload_variable(
+        "internal_mpi_command",
+        default="",
+        description="MPI Command for execution using container built-in MPI",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "nvshmem_disable_cuda_vmm",
+        default="1",
+        description="Whether to disable use of CUDA VMM in NVSHMEM. 1 = Disabled.",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "NVSHMEM_DISABLE_CUDA_VMM",
+        "{nvshmem_disable_cuda_vmm}",
+        description="Whether to disable use of CUDA VMM in NVSHMEM. 1 = Disabled.",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "pmix_mca_gds",
+        default="^ds12",
+        description="PMIx MCA gds component selection",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "PMIX_MCA_gds",
+        "{pmix_mca_gds}",
+        description="PMIX MCA gds",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "ompi_mca_btl",
+        default="^vader,tcp,openib,uct",
+        description="OpenMPI MCA btl selection",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "OMPI_MCA_btl",
+        "{ompi_mca_btl}",
+        description="OpenMPI MCA btl",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "ompi_mca_pml",
+        default="ucx",
+        description="OpenMPI MCA pml selection",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "OMPI_MCA_pml",
+        "{ompi_mca_pml}",
+        description="OpenMPI MCA pml",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "ucx_net_devices",
+        default="enp6s0,enp12s0,enp134s0,enp140s0",
+        description="Comma-delimited list of UCX network devices to use",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "UCX_NET_DEVICES",
+        "{ucx_net_devices}",
+        description="UCX Net Devices",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "ucx_max_rndv_rails",
+        default="4",
+        description="Maximum number of UCX rendezvous protocol rails",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "UCX_MAX_RNDV_RAILS",
+        "{ucx_max_rndv_rails}",
+        description="UCX Maximum RNDV Rails",
+        workload_group="all_workloads",
+    )
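Note: a downstream application definition would reuse this base application the same way the three applications modified above do, by inheriting it alongside an existing benchmark base application. A minimal sketch of such a definition follows; the `NvidiaStream` class name and the `stream-test.sh` script path are illustrative assumptions only and are not part of this patch series.

    # Hypothetical sketch only -- not part of this patch series.
    from ramble.appkit import *

    from ramble.base_app.builtin.nvidia_hpc_benchmarks import (
        NvidiaHpcBenchmarks as NvidiaHpcBase,
    )


    class NvidiaStream(NvidiaHpcBase):
        """Illustrative application reusing the shared NVIDIA HPC-Benchmarks logic."""

        # Run the container's STREAM wrapper from /workspace, optionally through
        # the container's built-in MPI via {internal_mpi_command}.
        executable(
            "execute",
            "{internal_mpi_command} /workspace/stream-test.sh",  # script path is an assumption
            use_mpi=True,
        )

        workload("standard", executables=["execute"])

        # "all_workloads" already exists in the base application, so append to it.
        workload_group("all_workloads", workloads=["standard"], mode="append")

This mirrors the pattern applied above (e.g. `class NvidiaHpcg(BaseHpcg, NvidiaHpcBase)`), where the benchmark-specific figures of merit stay in the existing HPCG/HPL base applications while the container and MPI plumbing lives in `nvidia-hpc-benchmarks`.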