From 3a0343e73e43d86e558c54ef69b01f577e2ea9c2 Mon Sep 17 00:00:00 2001
From: Douglas Jacobsen
Date: Tue, 21 Jan 2025 11:29:05 -0700
Subject: [PATCH 1/2] Update execution path for NVIDIA container workloads

This commit updates the execution path for NVIDIA container workloads to
ensure they execute binaries from `/workspace`.

Additionally, the internal_mpi_command variable is added to each of these
applications to make execution with the container's built-in MPI easier
(rather than strictly an external MPI).
---
 .../applications/nvidia-hpcg/application.py   |  9 ++++++++-
 .../nvidia-hpl-mxp/application.py             |  9 ++++++++-
 .../applications/nvidia-hpl/application.py    | 19 +++++++++----------
 3 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/var/ramble/repos/builtin/applications/nvidia-hpcg/application.py b/var/ramble/repos/builtin/applications/nvidia-hpcg/application.py
index 54bf403b4..2be4f66b1 100644
--- a/var/ramble/repos/builtin/applications/nvidia-hpcg/application.py
+++ b/var/ramble/repos/builtin/applications/nvidia-hpcg/application.py
@@ -23,7 +23,7 @@ class NvidiaHpcg(BaseHpcg):
 
     executable(
         "execute",
-        "./hpcg.sh --dat {experiment_run_dir}/hpcg.dat",
+        "{internal_mpi_command} /workspace/hpcg.sh --dat {experiment_run_dir}/hpcg.dat",
         use_mpi=True,
     )
 
@@ -31,6 +31,13 @@ class NvidiaHpcg(BaseHpcg):
     workload_group("all_workloads", workloads=["standard"], mode="append")
 
+    workload_variable(
+        "internal_mpi_command",
+        default="",
+        description="MPI Command for execution using container built-in MPI",
+        workload_group="all_workloads",
+    )
+
     workload_variable(
         "nvshmem_disable_cuda_vmm",
         default="1",
         description="",
diff --git a/var/ramble/repos/builtin/applications/nvidia-hpl-mxp/application.py b/var/ramble/repos/builtin/applications/nvidia-hpl-mxp/application.py
index 0c1e64fb6..1abd28e4e 100644
--- a/var/ramble/repos/builtin/applications/nvidia-hpl-mxp/application.py
+++ b/var/ramble/repos/builtin/applications/nvidia-hpl-mxp/application.py
@@ -36,7 +36,7 @@ class NvidiaHplMxp(HplBase):
 
     executable(
         "execute",
-        '/workspace/hpl-mxp.sh --gpu-affinity "{gpu_affinity}" --n {Ns} --nb {block_size} --nprow {Ps} --npcol {Qs} --nporder {nporder}',
+        '{internal_mpi_command} /workspace/hpl-mxp.sh --gpu-affinity "{gpu_affinity}" --n {Ns} --nb {block_size} --nprow {Ps} --npcol {Qs} --nporder {nporder}',
         use_mpi=True,
     )
 
@@ -50,6 +50,13 @@ class NvidiaHplMxp(HplBase):
         workloads=["standard", "calculator"],
     )
+    workload_variable(
+        "internal_mpi_command",
+        default="",
+        description="MPI Command for execution using container built-in MPI",
+        workload_group="all_workloads",
+    )
+
     workload_variable(
         "nvshmem_disable_cuda_vmm",
         default="1",
         description="",
diff --git a/var/ramble/repos/builtin/applications/nvidia-hpl/application.py b/var/ramble/repos/builtin/applications/nvidia-hpl/application.py
index 29a5b511d..126f5cb2c 100644
--- a/var/ramble/repos/builtin/applications/nvidia-hpl/application.py
+++ b/var/ramble/repos/builtin/applications/nvidia-hpl/application.py
@@ -35,21 +35,14 @@ class NvidiaHpl(HplBase):
     tags("benchmark-app", "benchmark", "linpack", "optimized", "nvidia")
 
     executable(
-        "execute", "./hpl.sh --dat {experiment_run_dir}/HPL.dat", use_mpi=True
-    )
-
-    executable(
-        "execute-mxp",
-        './hpl-mxp.sh --gpu-affinity "{gpu_affinity}" --n {Ns} --nb {block_size} --nprow {Ps} --npcol {Qs} --nporder {nporder}',
+        "execute",
+        "{internal_mpi_command} /workspace/hpl.sh --dat {experiment_run_dir}/HPL.dat",
         use_mpi=True,
     )
 
     workload("standard", executables=["execute"])
     workload("calculator", executables=["execute"])
 
-    workload("standard-mxp", executables=["execute-mxp"])
-    workload("calculator-mxp", executables=["execute-mxp"])
-
     workload_group(
         "standard", workloads=["standard", "standard-mxp"], mode="append"
     )
@@ -60,7 +53,13 @@ class NvidiaHpl(HplBase):
         "all_workloads",
         workloads=["standard", "standard-mxp", "calculator", "calculator-mxp"],
     )
-    workload_group("mxp", workloads=["standard-mxp", "calculator-mxp"])
+
+    workload_variable(
+        "internal_mpi_command",
+        default="",
+        description="MPI Command for execution using container built-in MPI",
+        workload_group="all_workloads",
+    )
 
     workload_variable(
         "nvshmem_disable_cuda_vmm",

From c504b4390230860d97c9d98fa7c1bd87371bef52 Mon Sep 17 00:00:00 2001
From: Douglas Jacobsen
Date: Tue, 21 Jan 2025 11:37:28 -0700
Subject: [PATCH 2/2] Add nvidia-hpc-benchmarks base application and apply it

This commit extracts common logic from the NVIDIA containerized benchmark
application definitions into a base application. This base application is
then used to apply the same logic to each of the NVIDIA HPC-Benchmarks
implementations.
---
 .../applications/nvidia-hpcg/application.py   | 129 +--------
 .../nvidia-hpl-mxp/application.py             | 129 +--------
 .../applications/nvidia-hpl/application.py    | 250 ++++++++++++------
 .../nvidia-hpc-benchmarks/base_application.py | 127 +++++++++
 4 files changed, 310 insertions(+), 325 deletions(-)
 create mode 100644 var/ramble/repos/builtin/base_applications/nvidia-hpc-benchmarks/base_application.py

diff --git a/var/ramble/repos/builtin/applications/nvidia-hpcg/application.py b/var/ramble/repos/builtin/applications/nvidia-hpcg/application.py
index 2be4f66b1..bea741146 100644
--- a/var/ramble/repos/builtin/applications/nvidia-hpcg/application.py
+++ b/var/ramble/repos/builtin/applications/nvidia-hpcg/application.py
@@ -8,9 +8,12 @@
 from ramble.appkit import *
 
 from ramble.base_app.builtin.hpcg import Hpcg as BaseHpcg
+from ramble.base_app.builtin.nvidia_hpc_benchmarks import (
+    NvidiaHpcBenchmarks as NvidiaHpcBase,
+)
 
 
-class NvidiaHpcg(BaseHpcg):
+class NvidiaHpcg(BaseHpcg, NvidiaHpcBase):
     """NVIDIA's HPCG benchmark accelerates the High Performance Conjugate
     Gradients (HPCG) Benchmark.
HPCG is a software package that performs a fixed number of multigrid preconditioned (using a symmetric Gauss-Seidel @@ -30,127 +33,3 @@ class NvidiaHpcg(BaseHpcg): workload("standard", executables=["execute"]) workload_group("all_workloads", workloads=["standard"], mode="append") - - workload_variable( - "internal_mpi_command", - default="", - description="MPI Command for execution using container built-in MPI", - workload_group="all_workloads", - ) - - workload_variable( - "nvshmem_disable_cuda_vmm", - default="1", - description="", - workload_group="all_workloads", - ) - environment_variable( - "NVSHMEM_DISABLE_CUDA_VMM", - "{nvshmem_disable_cuda_vmm}", - description="", - workload_group="all_workloads", - ) - - workload_variable( - "hpl_fct_comm_policy", - default="1", - description="", - workload_group="all_workloads", - ) - environment_variable( - "HPL_FCT_COMM_POLICY", - "{hpl_fct_comm_policy}", - description="", - workload_group="all_workloads", - ) - - workload_variable( - "hpl_use_nvshmem", - default="0", - description="Whether to use NVSHMEM or not", - workload_group="all_workloads", - ) - environment_variable( - "HPL_USE_NVSHMEM", - "{hpl_use_nvshmem}", - description="Whether or not to use NVSHMEM", - workload_group="all_workloads", - ) - - workload_variable( - "hpl_p2p_as_bcast", - default="0", - description="0 = ncclBcast, 1 = ncclSend/Recv", - workload_group="all_workloads", - ) - environment_variable( - "HPL_P2P_AS_BCAST", - "{hpl_p2p_as_bcast}", - description="Whether or not to use P2P for BCAST", - workload_group="all_workloads", - ) - - workload_variable( - "pmix_mca_gds", - default="^ds12", - description="", - workload_group="all_workloads", - ) - environment_variable( - "PMIX_MCA_gds", - "{pmix_mca_gds}", - description="PMIX MCA gds", - workload_group="all_workloads", - ) - - workload_variable( - "ompi_mca_btl", - default="^vader,tcp,openib,uct", - description="", - workload_group="all_workloads", - ) - environment_variable( - "OMPI_MCA_btl", - "{ompi_mca_btl}", - description="OpenMPI MCA btl", - workload_group="all_workloads", - ) - - workload_variable( - "ompi_mca_pml", - default="ucx", - description="", - workload_group="all_workloads", - ) - environment_variable( - "OMPI_MCA_pml", - "{ompi_mca_pml}", - description="OpenMPI MCA pml", - workload_group="all_workloads", - ) - - workload_variable( - "ucx_net_devices", - default="enp6s0,enp12s0,enp134s0,enp140s0", - description="", - workload_group="all_workloads", - ) - environment_variable( - "UCX_NET_DEVICES", - "{ucx_net_devices}", - description="UCX Net Devices", - workload_group="all_workloads", - ) - - workload_variable( - "ucx_max_rndv_rails", - default="4", - description="", - workload_group="all_workloads", - ) - environment_variable( - "UCX_MAX_RNDV_RAILS", - "{ucx_max_rndv_rails}", - description="UCX MAximum RNDV Rails", - workload_group="all_workloads", - ) diff --git a/var/ramble/repos/builtin/applications/nvidia-hpl-mxp/application.py b/var/ramble/repos/builtin/applications/nvidia-hpl-mxp/application.py index 1abd28e4e..fd277cb45 100644 --- a/var/ramble/repos/builtin/applications/nvidia-hpl-mxp/application.py +++ b/var/ramble/repos/builtin/applications/nvidia-hpl-mxp/application.py @@ -9,9 +9,12 @@ from ramble.appkit import * from ramble.base_app.builtin.hpl import Hpl as HplBase +from ramble.base_app.builtin.nvidia_hpc_benchmarks import ( + NvidiaHpcBenchmarks as NvidiaHpcBase, +) -class NvidiaHplMxp(HplBase): +class NvidiaHplMxp(HplBase, NvidiaHpcBase): """This application defines how to run NVIDIA's 
optimized version of HPL, which is contained in NVIDIA's HPC-Benchmarks collection. @@ -50,130 +53,6 @@ class NvidiaHplMxp(HplBase): workloads=["standard", "calculator"], ) - workload_variable( - "internal_mpi_command", - default="", - description="MPI Command for execution using container built-in MPI", - workload_group="all_workloads", - ) - - workload_variable( - "nvshmem_disable_cuda_vmm", - default="1", - description="", - workload_group="all_workloads", - ) - environment_variable( - "NVSHMEM_DISABLE_CUDA_VMM", - "{nvshmem_disable_cuda_vmm}", - description="", - workload_group="all_workloads", - ) - - workload_variable( - "hpl_fct_comm_policy", - default="1", - description="", - workload_group="all_workloads", - ) - environment_variable( - "HPL_FCT_COMM_POLICY", - "{hpl_fct_comm_policy}", - description="", - workload_group="all_workloads", - ) - - workload_variable( - "hpl_use_nvshmem", - default="0", - description="Whether to use NVSHMEM or not", - workload_group="all_workloads", - ) - environment_variable( - "HPL_USE_NVSHMEM", - "{hpl_use_nvshmem}", - description="Whether or not to use NVSHMEM", - workload_group="all_workloads", - ) - - workload_variable( - "hpl_p2p_as_bcast", - default="0", - description="0 = ncclBcast, 1 = ncclSend/Recv", - workload_group="all_workloads", - ) - environment_variable( - "HPL_P2P_AS_BCAST", - "{hpl_p2p_as_bcast}", - description="Whether or not to use P2P for BCAST", - workload_group="all_workloads", - ) - - workload_variable( - "pmix_mca_gds", - default="^ds12", - description="", - workload_group="all_workloads", - ) - environment_variable( - "PMIX_MCA_gds", - "{pmix_mca_gds}", - description="PMIX MCA gds", - workload_group="all_workloads", - ) - - workload_variable( - "ompi_mca_btl", - default="^vader,tcp,openib,uct", - description="", - workload_group="all_workloads", - ) - environment_variable( - "OMPI_MCA_btl", - "{ompi_mca_btl}", - description="OpenMPI MCA btl", - workload_group="all_workloads", - ) - - workload_variable( - "ompi_mca_pml", - default="ucx", - description="", - workload_group="all_workloads", - ) - environment_variable( - "OMPI_MCA_pml", - "{ompi_mca_pml}", - description="OpenMPI MCA pml", - workload_group="all_workloads", - ) - - workload_variable( - "ucx_net_devices", - default="enp6s0,enp12s0,enp134s0,enp140s0", - description="", - workload_group="all_workloads", - ) - environment_variable( - "UCX_NET_DEVICES", - "{ucx_net_devices}", - description="UCX Net Devices", - workload_group="all_workloads", - ) - - workload_variable( - "ucx_max_rndv_rails", - default="4", - description="", - workload_group="all_workloads", - ) - environment_variable( - "UCX_MAX_RNDV_RAILS", - "{ucx_max_rndv_rails}", - description="UCX MAximum RNDV Rails", - workload_group="all_workloads", - ) - workload_variable( "block_size", default="1024", diff --git a/var/ramble/repos/builtin/applications/nvidia-hpl/application.py b/var/ramble/repos/builtin/applications/nvidia-hpl/application.py index 126f5cb2c..6e78d815c 100644 --- a/var/ramble/repos/builtin/applications/nvidia-hpl/application.py +++ b/var/ramble/repos/builtin/applications/nvidia-hpl/application.py @@ -9,9 +9,12 @@ from ramble.appkit import * from ramble.base_app.builtin.hpl import Hpl as HplBase +from ramble.base_app.builtin.nvidia_hpc_benchmarks import ( + NvidiaHpcBenchmarks as NvidiaHPCBase, +) -class NvidiaHpl(HplBase): +class NvidiaHpl(HplBase, NvidiaHPCBase): """This application defines how to run NVIDIA's optimized version of HPL, which is contained in NVIDIA's HPC-Benchmarks collection. 
@@ -43,41 +46,18 @@ class NvidiaHpl(HplBase): workload("standard", executables=["execute"]) workload("calculator", executables=["execute"]) - workload_group( - "standard", workloads=["standard", "standard-mxp"], mode="append" - ) - workload_group( - "calculator", workloads=["calculator", "calculator-mxp"], mode="append" - ) + workload_group("standard", workloads=["standard"], mode="append") + workload_group("calculator", workloads=["calculator"], mode="append") workload_group( "all_workloads", - workloads=["standard", "standard-mxp", "calculator", "calculator-mxp"], - ) - - workload_variable( - "internal_mpi_command", - default="", - description="MPI Command for execution using container built-in MPI", - workload_group="all_workloads", - ) - - workload_variable( - "nvshmem_disable_cuda_vmm", - default="1", - description="", - workload_group="all_workloads", - ) - environment_variable( - "NVSHMEM_DISABLE_CUDA_VMM", - "{nvshmem_disable_cuda_vmm}", - description="", - workload_group="all_workloads", + workloads=["standard", "calculator"], ) workload_variable( "hpl_fct_comm_policy", default="1", - description="", + values=["0", "1"], + description="Which communication library to use in the panel factorization. 0 = NVSHMEM, 1 = Host MPI", workload_group="all_workloads", ) environment_variable( @@ -90,7 +70,8 @@ class NvidiaHpl(HplBase): workload_variable( "hpl_use_nvshmem", default="0", - description="Whether to use NVSHMEM or not", + values=["0", "1"], + description="Whether to use NVSHMEM or not. 0 = Disable, 1 = Enable.", workload_group="all_workloads", ) environment_variable( @@ -103,101 +84,220 @@ class NvidiaHpl(HplBase): workload_variable( "hpl_p2p_as_bcast", default="0", - description="0 = ncclBcast, 1 = ncclSend/Recv", + values=["0", "1", "2", "3", "4"], + description="0 = ncclBcast, 1 = ncclSend/Recv, 2 = CUDA-aware MPI, 3 = host MPI, 4 = NVSHMEM", workload_group="all_workloads", ) environment_variable( "HPL_P2P_AS_BCAST", "{hpl_p2p_as_bcast}", - description="Whether or not to use P2P for BCAST", + description="Which communication library to use in the final solve step.", workload_group="all_workloads", ) workload_variable( - "pmix_mca_gds", - default="^ds12", - description="", + "hpl_nvshmem_swap", + default="0", + values=["0", "1"], + description="Performs row swaps using NVSHMEM instead of NCCL. 0 = Disable, 1 = Enable.", workload_group="all_workloads", ) environment_variable( - "PMIX_MCA_gds", - "{pmix_mca_gds}", - description="PMIX MCA gds", + "HPL_NVSHMEM_SWAP", + "{hpl_nvshmem_swap}", + description="Performs row swaps using NVSHMEM instead of NCCL. 0 = Disable, 1 = Enable.", workload_group="all_workloads", ) workload_variable( - "ompi_mca_btl", - default="^vader,tcp,openib,uct", - description="", + "hpl_chunk_size_nbs", + default="16", + description="Number of matrix blocks to group for computations. Needs to be > 0", workload_group="all_workloads", ) environment_variable( - "OMPI_MCA_btl", - "{ompi_mca_btl}", - description="OpenMPI MCA btl", + "HPL_CHUNK_SIZE_NBS", + "{hpl_chunk_size_nbs}", + description="Number of matrix blocks to group for computations. 
Needs to be > 0", workload_group="all_workloads", ) workload_variable( - "ompi_mca_pml", - default="ucx", - description="", + "hpl_dist_trsm_flag", + default="1", + values=["0", "1"], + description="Perform the solve step (TRSM) in parallel, rather than on only the ranks that own part of the matrix.", workload_group="all_workloads", ) environment_variable( - "OMPI_MCA_pml", - "{ompi_mca_pml}", - description="OpenMPI MCA pml", + "HPL_DIST_TRSM_FLAG", + "{hpl_dist_trsm_flag}", + description="Perform the solve step (TRSM) in parallel, rather than on only the ranks that own part of the matrix.", workload_group="all_workloads", ) workload_variable( - "ucx_net_devices", - default="enp6s0,enp12s0,enp134s0,enp140s0", - description="", + "hpl_cta_per_fct", + default="16", + description="Sets the number of CTAs (thread blocks) for factorization. Needs to be > 0.", workload_group="all_workloads", ) environment_variable( - "UCX_NET_DEVICES", - "{ucx_net_devices}", - description="UCX Net Devices", + "HPL_CTA_PER_FCT", + "{hpl_cta_per_fct}", + description="Sets the number of CTAs (thread blocks) for factorization. Needs to be > 0.", workload_group="all_workloads", ) workload_variable( - "ucx_max_rndv_rails", - default="4", - description="", + "hpl_alloc_hugepages", + default="0", + values=["0", "1"], + description="Use 2MB hugepages for host-side allocations. Done through the madvise syscall.", workload_group="all_workloads", ) environment_variable( - "UCX_MAX_RNDV_RAILS", - "{ucx_max_rndv_rails}", - description="UCX MAximum RNDV Rails", + "HPL_ALLOC_HUGEPAGES", + "{hpl_alloc_hugepages}", + description="Use 2MB hugepages for host-side allocations. Done through the madvise syscall.", workload_group="all_workloads", ) workload_variable( - "block_size", - default="1024", - description="Size of each block", - workload_group="calculator", + "warmup_end_prog", + default="-1", + description="Runs the main loop once before the 'real' run. Stops the warmup at x%. Values can be 1 - 100.", + workload_group="all_workloads", + ) + environment_variable( + "WARMUP_END_PROG", + "{warmup_end_prog}", + description="Runs the main loop once before the 'real' run. Stops the warmup at x%. Values can be 1 - 100.", + workload_group="all_workloads", + ) + + workload_variable( + "test_loops", + default="1", + description="Runs the main loop X many times", + workload_group="all_workloads", + ) + environment_variable( + "TEST_LOOPS", + "{test_loops}", + description="Runs the main loop X many times", + workload_group="all_workloads", + ) + + workload_variable( + "hpl_cusolver_mp_tests", + default="1", + description="Runs several tests of individual components of HPL (GEMMS, comms, etc.)", + workload_group="all_workloads", + ) + environment_variable( + "HPL_CUSOLVER_MP_TESTS", + "{hpl_cusolver_mp_tests}", + description="Runs several tests of individual components of HPL (GEMMS, comms, etc.)", + workload_group="all_workloads", ) workload_variable( - "nporder", - default="row", - description="Major order to use for matrix", - values=["row", "column"], - workload_group="mxp", + "hpl_cusolver_mp_tests_gemm_iters", + default="128", + description="Number of repeat GEMM calls in tests. Needs to be > 0.", + workload_group="all_workloads", + ) + environment_variable( + "HPL_CUSOLVER_MP_TESTS_GEMM_ITERS", + "{hpl_cusolver_mp_tests_gemm_iters}", + description="Number of repeat GEMM calls in tests. 
Needs to be > 0.",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "hpl_ooc_mode",
+        default="1",
+        description="Enables / disables out-of-core mode",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "HPL_OOC_MODE",
+        "{hpl_ooc_mode}",
+        description="Enables / disables out-of-core mode",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "hpl_ooc_max_gpu_mem",
+        default="-1",
+        description="Limits the amount of GPU memory used for OOC. In GiB. Needs to be >= -1.",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "HPL_OOC_MAX_GPU_MEM",
+        "{hpl_ooc_max_gpu_mem}",
+        description="Limits the amount of GPU memory used for OOC. In GiB. Needs to be >= -1.",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "hpl_ooc_tile_m",
+        default="4096",
+        description="Row blocking factor. Needs to be > 0",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "HPL_OOC_TILE_M",
+        "{hpl_ooc_tile_m}",
+        description="Row blocking factor. Needs to be > 0",
+        workload_group="all_workloads",
     )
     workload_variable(
-        "gpu_affinity",
-        default="0:1:2:3:4:5:6:7",
-        description="Colon delimited list of GPU IDs",
-        workload_group="mxp",
+        "hpl_ooc_tile_n",
+        default="4096",
+        description="Column blocking factor. Needs to be > 0",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "HPL_OOC_TILE_N",
+        "{hpl_ooc_tile_n}",
+        description="Column blocking factor. Needs to be > 0",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "hpl_ooc_num_streams",
+        default="3",
+        description="Number of streams used for OOC operations",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "HPL_OOC_NUM_STREAMS",
+        "{hpl_ooc_num_streams}",
+        description="Number of streams used for OOC operations",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "hpl_ooc_safe_size",
+        default="2.0",
+        description="GPU memory (in GiB) needed for driver. This amount will not be used by HPL OOC",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "HPL_OOC_SAFE_SIZE",
+        "{hpl_ooc_safe_size}",
+        description="GPU memory (in GiB) needed for driver. This amount will not be used by HPL OOC",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "block_size",
+        default="1024",
+        description="Size of each block",
+        workload_group="calculator",
     )
 
     figure_of_merit(
diff --git a/var/ramble/repos/builtin/base_applications/nvidia-hpc-benchmarks/base_application.py b/var/ramble/repos/builtin/base_applications/nvidia-hpc-benchmarks/base_application.py
new file mode 100644
index 000000000..2097ecee8
--- /dev/null
+++ b/var/ramble/repos/builtin/base_applications/nvidia-hpc-benchmarks/base_application.py
@@ -0,0 +1,127 @@
+# Copyright 2022-2025 The Ramble Authors
+#
+# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+# https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+# <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+# option. This file may not be copied, modified, or distributed
+# except according to those terms.
+
+
+from ramble.appkit import *
+
+
+class NvidiaHpcBenchmarks(ExecutableApplication):
+    """The NVIDIA HPC-Benchmarks collection provides four benchmarks (HPL,
+    HPL-MxP, HPCG, and STREAM) widely used in the HPC community, optimized for
+    performance on NVIDIA accelerated HPC systems.
+
+    NVIDIA's HPL and HPL-MxP benchmarks provide software packages to solve a
+    (random) dense linear system in double precision (64-bit) arithmetic and in
+    mixed precision arithmetic using Tensor Cores, respectively, on
+    distributed-memory computers equipped with NVIDIA GPUs, based on the Netlib
+    HPL benchmark and HPL-MxP benchmark.
+
+    NVIDIA's HPCG benchmark accelerates the High Performance Conjugate Gradients
+    (HPCG) Benchmark. HPCG is a software package that performs a fixed number of
+    multigrid preconditioned (using a symmetric Gauss-Seidel smoother) conjugate
+    gradient (PCG) iterations using double precision (64-bit) floating point
+    values.
+
+    NVIDIA's STREAM benchmark is a simple synthetic benchmark program that
+    measures sustainable memory bandwidth. The NVIDIA HPC-Benchmarks container
+    includes STREAM benchmarks optimized for the NVIDIA Ampere GPU architecture
+    (sm80), NVIDIA Hopper GPU architecture (sm90), and NVIDIA Grace CPU.
+    """
+
+    name = "nvidia-hpc-benchmarks"
+
+    maintainers("douglasjacobsen")
+
+    tags("benchmark-app", "mini-app", "benchmark", "containerize")
+
+    workload_group("all_workloads")
+
+    workload_variable(
+        "internal_mpi_command",
+        default="",
+        description="MPI Command for execution using container built-in MPI",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "nvshmem_disable_cuda_vmm",
+        default="1",
+        description="Whether to disable use of CUDA VMM in NVSHMEM. 1 = Disabled.",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "NVSHMEM_DISABLE_CUDA_VMM",
+        "{nvshmem_disable_cuda_vmm}",
+        description="Whether to disable use of CUDA VMM in NVSHMEM. 1 = Disabled.",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "pmix_mca_gds",
+        default="^ds12",
+        description="PMIx MCA gds component selection",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "PMIX_MCA_gds",
+        "{pmix_mca_gds}",
+        description="PMIX MCA gds",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "ompi_mca_btl",
+        default="^vader,tcp,openib,uct",
+        description="OpenMPI MCA btl selection",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "OMPI_MCA_btl",
+        "{ompi_mca_btl}",
+        description="OpenMPI MCA btl",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "ompi_mca_pml",
+        default="ucx",
+        description="OpenMPI MCA pml selection",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "OMPI_MCA_pml",
+        "{ompi_mca_pml}",
+        description="OpenMPI MCA pml",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "ucx_net_devices",
+        default="enp6s0,enp12s0,enp134s0,enp140s0",
+        description="Comma-delimited list of UCX network devices to use",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "UCX_NET_DEVICES",
+        "{ucx_net_devices}",
+        description="UCX Net Devices",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "ucx_max_rndv_rails",
+        default="4",
+        description="Maximum number of UCX rendezvous protocol rails",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "UCX_MAX_RNDV_RAILS",
+        "{ucx_max_rndv_rails}",
+        description="UCX Maximum RNDV Rails",
+        workload_group="all_workloads",
+    )
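Note: a downstream application definition would reuse this base application the same way the three applications modified above do, by inheriting it alongside an existing benchmark base application. A minimal sketch of such a definition follows; the `NvidiaStream` class name and the `stream-test.sh` script path are illustrative assumptions only and are not part of this patch series.

    # Hypothetical sketch only -- not part of this patch series.
    from ramble.appkit import *

    from ramble.base_app.builtin.nvidia_hpc_benchmarks import (
        NvidiaHpcBenchmarks as NvidiaHpcBase,
    )


    class NvidiaStream(NvidiaHpcBase):
        """Illustrative application reusing the shared NVIDIA HPC-Benchmarks logic."""

        # Run the container's STREAM wrapper from /workspace, optionally through
        # the container's built-in MPI via {internal_mpi_command}.
        executable(
            "execute",
            "{internal_mpi_command} /workspace/stream-test.sh",  # script path is an assumption
            use_mpi=True,
        )

        workload("standard", executables=["execute"])

        # "all_workloads" already exists in the base application, so append to it.
        workload_group("all_workloads", workloads=["standard"], mode="append")

This mirrors the pattern applied above (e.g. `class NvidiaHpcg(BaseHpcg, NvidiaHpcBase)`), where the benchmark-specific figures of merit stay in the existing HPCG/HPL base applications while the container and MPI plumbing lives in `nvidia-hpc-benchmarks`.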