diff --git a/lib/ramble/ramble/appkit.py b/lib/ramble/ramble/appkit.py
index 5d8ff425c..ca54dd060 100644
--- a/lib/ramble/ramble/appkit.py
+++ b/lib/ramble/ramble/appkit.py
@@ -28,6 +28,7 @@ from ramble.util.logger import logger as tty
 
 from ramble.util.file_util import get_file_path
+from ramble.util.foms import FomType
 from ramble.util.output_capture import OUTPUT_CAPTURE
diff --git a/var/ramble/repos/builtin/applications/nvidia-hpl-mxp/application.py b/var/ramble/repos/builtin/applications/nvidia-hpl-mxp/application.py
new file mode 100644
index 000000000..ce6bb869b
--- /dev/null
+++ b/var/ramble/repos/builtin/applications/nvidia-hpl-mxp/application.py
@@ -0,0 +1,227 @@
+# Copyright 2022-2025 The Ramble Authors
+#
+# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+# https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+# <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+# option. This file may not be copied, modified, or distributed
+# except according to those terms.
+
+from ramble.appkit import *
+
+from ramble.base_app.builtin.hpl import Hpl as HplBase
+
+
+class NvidiaHplMxp(HplBase):
+    """This application defines how to run NVIDIA's optimized version of HPL,
+    which is contained in NVIDIA's HPC-Benchmarks collection.
+
+    The NVIDIA HPC-Benchmarks collection provides four benchmarks (HPL,
+    HPL-MxP, HPCG, and STREAM) widely used in the HPC community optimized for
+    performance on NVIDIA accelerated HPC systems.
+
+    NVIDIA's HPL and HPL-MxP benchmarks provide software packages to solve a
+    (random) dense linear system in double precision (64-bit) arithmetic and
+    in mixed precision arithmetic using Tensor Cores, respectively, on
+    distributed-memory computers equipped with NVIDIA GPUs, based on the
+    Netlib HPL benchmark and HPL-MxP benchmark.
+
+    https://catalog.ngc.nvidia.com/orgs/nvidia/containers/hpc-benchmarks
+    """
+
+    name = "nvidia-hpl-mxp"
+
+    maintainers("douglasjacobsen")
+
+    tags("benchmark-app", "benchmark", "linpack", "optimized", "nvidia")
+
+    executable(
+        "execute",
+        './hpl-mxp.sh --gpu-affinity "{gpu_affinity}" --n {Ns} --nb {block_size} --nprow {Ps} --npcol {Qs} --nporder {nporder}',
+        use_mpi=True,
+    )
+
+    workload("standard", executables=["execute"])
+    workload("calculator", executables=["execute"])
+
+    workload_group("standard", workloads=["standard"], mode="append")
+    workload_group("calculator", workloads=["calculator"], mode="append")
+    workload_group(
+        "all_workloads",
+        workloads=["standard", "calculator"],
+    )
+
+    workload_variable(
+        "nvshmem_disable_cuda_vmm",
+        default="1",
+        description="",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "NVSHMEM_DISABLE_CUDA_VMM",
+        "{nvshmem_disable_cuda_vmm}",
+        description="",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "hpl_fct_comm_policy",
+        default="1",
+        description="",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "HPL_FCT_COMM_POLICY",
+        "{hpl_fct_comm_policy}",
+        description="",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "hpl_use_nvshmem",
+        default="0",
+        description="Whether to use NVSHMEM or not",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "HPL_USE_NVSHMEM",
+        "{hpl_use_nvshmem}",
+        description="Whether or not to use NVSHMEM",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "hpl_p2p_as_bcast",
+        default="0",
+        description="0 = ncclBcast, 1 = ncclSend/Recv",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "HPL_P2P_AS_BCAST",
+        "{hpl_p2p_as_bcast}",
+        description="Whether or not to use P2P for BCAST",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "pmix_mca_gds",
+        default="^ds12",
+        description="",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "PMIX_MCA_gds",
+        "{pmix_mca_gds}",
+        description="PMIX MCA gds",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "ompi_mca_btl",
+        default="^vader,tcp,openib,uct",
+        description="",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "OMPI_MCA_btl",
+        "{ompi_mca_btl}",
+        description="OpenMPI MCA btl",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "ompi_mca_pml",
+        default="ucx",
+        description="",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "OMPI_MCA_pml",
+        "{ompi_mca_pml}",
+        description="OpenMPI MCA pml",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "ucx_net_devices",
+        default="enp6s0,enp12s0,enp134s0,enp140s0",
+        description="",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "UCX_NET_DEVICES",
+        "{ucx_net_devices}",
+        description="UCX Net Devices",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "ucx_max_rndv_rails",
+        default="4",
+        description="",
+        workload_group="all_workloads",
+    )
+    environment_variable(
+        "UCX_MAX_RNDV_RAILS",
+        "{ucx_max_rndv_rails}",
+        description="UCX Maximum RNDV Rails",
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "block_size",
+        default="1024",
+        description="Size of each block",
+        workload_group="calculator",
+    )
+
+    workload_variable(
+        "nporder",
+        default="row",
+        description="Major order to use for matrix",
+        values=["row", "column"],
+        workload_group="all_workloads",
+    )
+
+    workload_variable(
+        "gpu_affinity",
+        default="0:1:2:3:4:5:6:7",
+        description="Colon delimited list of GPU IDs",
+        workload_group="all_workloads",
+    )
+
+    # MxP FOMs
+    # NOTE(review): the named groups below must match the group_name=
+    # arguments of the figure_of_merit calls that use these regexes.
+    gflops_regex = (
+        r"\s+GFLOPS = (?P<gflops>\S+), per GPU =\s+(?P<per_gflops>\S+)"
+    )
+    lu_gflops_regex = (
+        r"\s+LU GFLOPS = (?P<gflops>\S+), per GPU =\s+(?P<per_gflops>\S+)"
+    )
+    figure_of_merit(
+        "Total GFlops",
+        fom_regex=gflops_regex,
+        group_name="gflops",
+        units="GFLOP/s",
+        fom_type=FomType.THROUGHPUT,
+    )
+    figure_of_merit(
+        "Per GPU GFlops",
+        fom_regex=gflops_regex,
+        group_name="per_gflops",
+        units="GFLOP/s",
+        fom_type=FomType.THROUGHPUT,
+    )
+
+    figure_of_merit(
+        "Total LU GFlops",
+        fom_regex=lu_gflops_regex,
+        group_name="gflops",
+        units="GFLOP/s",
+        fom_type=FomType.THROUGHPUT,
+    )
+    figure_of_merit(
+        "Per GPU LU GFlops",
+        fom_regex=lu_gflops_regex,
+        group_name="per_gflops",
+        units="GFLOP/s",
+        fom_type=FomType.THROUGHPUT,
+    )
diff --git a/var/ramble/repos/builtin/applications/nvidia-hpl/application.py b/var/ramble/repos/builtin/applications/nvidia-hpl/application.py
index f75e71eeb..29a5b511d 100644
--- a/var/ramble/repos/builtin/applications/nvidia-hpl/application.py
+++ b/var/ramble/repos/builtin/applications/nvidia-hpl/application.py
@@ -200,3 +200,14 @@ class NvidiaHpl(HplBase):
         description="Colon delimited list of GPU IDs",
         workload_group="mxp",
     )
+
+    figure_of_merit(
+        "Per GPU GFlops",
+        fom_regex=r".*\s+(?P[0-9]+)\s+(?P[0-9]+)\s+(?P

[0-9]+)" + + r"\s+(?P[0-9]+)\s+(?P

[0-9]+)\s+(?P[0-9]+)\s+(?P

[0-9]+)\s+(?P[0-9]+)\s+(?P

[0-9]+)\s+(?P[0-9]+)\s+(?P

[0-9]+)\s+(?P[0-9]+)\s+(?P

[0-9]+)\s+(?P[0-9]+)\s+(?P

[0-9]+)\s+(?P[0-9]+)\s+(?P