-
Notifications
You must be signed in to change notification settings - Fork 28
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #818 from douglasjacobsen/fix-hpl-foms
Update FOM definitions and split out HPL-MxP
- Loading branch information
Showing
5 changed files
with
292 additions
and
90 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
227 changes: 227 additions & 0 deletions
227
var/ramble/repos/builtin/applications/nvidia-hpl-mxp/application.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,227 @@ | ||
# Copyright 2022-2025 The Ramble Authors | ||
# | ||
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | ||
# https://www.apache.org/licenses/LICENSE-2.0> or the MIT license | ||
# <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your | ||
# option. This file may not be copied, modified, or distributed | ||
# except according to those terms. | ||
|
||
from ramble.appkit import * | ||
|
||
from ramble.base_app.builtin.hpl import Hpl as HplBase | ||
|
||
|
||
class NvidiaHplMxp(HplBase): | ||
"""This application defines how to run NVIDIA's optimized version of HPL, | ||
which is contained in NVIDIA's HPC-Benchmarks collection. | ||
The NVIDIA HPC-Benchmarks collection provides four benchmarks (HPL, | ||
HPL-MxP, HPCG, and STREAM) widely used in the HPC community optimized for | ||
performance on NVIDIA accelerated HPC systems. | ||
NVIDIA's HPL and HPL-MxP benchmarks provide software packages to solve a | ||
(random) dense linear system in double precision (64-bit) arithmetic and in | ||
mixed precision arithmetic using Tensor Cores, respectively, on | ||
distributed-memory computers equipped with NVIDIA GPUs, based on the Netlib HPL | ||
benchmark and HPL-MxP benchmark. | ||
https://catalog.ngc.nvidia.com/orgs/nvidia/containers/hpc-benchmarks | ||
""" | ||
|
||
name = "nvidia-hpl-mxp" | ||
|
||
maintainers("douglasjacobsen") | ||
|
||
tags("benchmark-app", "benchmark", "linpack", "optimized", "nvidia") | ||
|
||
executable( | ||
"execute", | ||
'./hpl-mxp.sh --gpu-affinity "{gpu_affinity}" --n {Ns} --nb {block_size} --nprow {Ps} --npcol {Qs} --nporder {nporder}', | ||
use_mpi=True, | ||
) | ||
|
||
workload("standard", executables=["execute"]) | ||
workload("calculator", executables=["execute"]) | ||
|
||
workload_group("standard", workloads=["standard"], mode="append") | ||
workload_group("calculator", workloads=["calculator"], mode="append") | ||
workload_group( | ||
"all_workloads", | ||
workloads=["standard", "calculator"], | ||
) | ||
|
||
workload_variable( | ||
"nvshmem_disable_cuda_vmm", | ||
default="1", | ||
description="", | ||
workload_group="all_workloads", | ||
) | ||
environment_variable( | ||
"NVSHMEM_DISABLE_CUDA_VMM", | ||
"{nvshmem_disable_cuda_vmm}", | ||
description="", | ||
workload_group="all_workloads", | ||
) | ||
|
||
workload_variable( | ||
"hpl_fct_comm_policy", | ||
default="1", | ||
description="", | ||
workload_group="all_workloads", | ||
) | ||
environment_variable( | ||
"HPL_FCT_COMM_POLICY", | ||
"{hpl_fct_comm_policy}", | ||
description="", | ||
workload_group="all_workloads", | ||
) | ||
|
||
workload_variable( | ||
"hpl_use_nvshmem", | ||
default="0", | ||
description="Whether to use NVSHMEM or not", | ||
workload_group="all_workloads", | ||
) | ||
environment_variable( | ||
"HPL_USE_NVSHMEM", | ||
"{hpl_use_nvshmem}", | ||
description="Whether or not to use NVSHMEM", | ||
workload_group="all_workloads", | ||
) | ||
|
||
workload_variable( | ||
"hpl_p2p_as_bcast", | ||
default="0", | ||
description="0 = ncclBcast, 1 = ncclSend/Recv", | ||
workload_group="all_workloads", | ||
) | ||
environment_variable( | ||
"HPL_P2P_AS_BCAST", | ||
"{hpl_p2p_as_bcast}", | ||
description="Whether or not to use P2P for BCAST", | ||
workload_group="all_workloads", | ||
) | ||
|
||
workload_variable( | ||
"pmix_mca_gds", | ||
default="^ds12", | ||
description="", | ||
workload_group="all_workloads", | ||
) | ||
environment_variable( | ||
"PMIX_MCA_gds", | ||
"{pmix_mca_gds}", | ||
description="PMIX MCA gds", | ||
workload_group="all_workloads", | ||
) | ||
|
||
workload_variable( | ||
"ompi_mca_btl", | ||
default="^vader,tcp,openib,uct", | ||
description="", | ||
workload_group="all_workloads", | ||
) | ||
environment_variable( | ||
"OMPI_MCA_btl", | ||
"{ompi_mca_btl}", | ||
description="OpenMPI MCA btl", | ||
workload_group="all_workloads", | ||
) | ||
|
||
workload_variable( | ||
"ompi_mca_pml", | ||
default="ucx", | ||
description="", | ||
workload_group="all_workloads", | ||
) | ||
environment_variable( | ||
"OMPI_MCA_pml", | ||
"{ompi_mca_pml}", | ||
description="OpenMPI MCA pml", | ||
workload_group="all_workloads", | ||
) | ||
|
||
workload_variable( | ||
"ucx_net_devices", | ||
default="enp6s0,enp12s0,enp134s0,enp140s0", | ||
description="", | ||
workload_group="all_workloads", | ||
) | ||
environment_variable( | ||
"UCX_NET_DEVICES", | ||
"{ucx_net_devices}", | ||
description="UCX Net Devices", | ||
workload_group="all_workloads", | ||
) | ||
|
||
workload_variable( | ||
"ucx_max_rndv_rails", | ||
default="4", | ||
description="", | ||
workload_group="all_workloads", | ||
) | ||
environment_variable( | ||
"UCX_MAX_RNDV_RAILS", | ||
"{ucx_max_rndv_rails}", | ||
description="UCX MAximum RNDV Rails", | ||
workload_group="all_workloads", | ||
) | ||
|
||
workload_variable( | ||
"block_size", | ||
default="1024", | ||
description="Size of each block", | ||
workload_group="calculator", | ||
) | ||
|
||
workload_variable( | ||
"nporder", | ||
default="row", | ||
description="Major order to use for matrix", | ||
values=["row", "column"], | ||
workload_group="all_workloads", | ||
) | ||
|
||
workload_variable( | ||
"gpu_affinity", | ||
default="0:1:2:3:4:5:6:7", | ||
description="Colon delimited list of GPU IDs", | ||
workload_group="all_workloads", | ||
) | ||
|
||
# MxP FOMs | ||
gflops_regex = ( | ||
r"\s+GFLOPS = (?P<gflops>\S+), per GPU =\s+(?P<per_gflops>\S+)" | ||
) | ||
lu_gflops_regex = ( | ||
r"\s+LU GFLOPS = (?P<gflops>\S+), per GPU =\s+(?P<per_gflops>\S+)" | ||
) | ||
figure_of_merit( | ||
"Total GFlops", | ||
fom_regex=gflops_regex, | ||
group_name="gflops", | ||
units="GFLOP/s", | ||
fom_type=FomType.THROUGHPUT, | ||
) | ||
figure_of_merit( | ||
"Per GPU GFlops", | ||
fom_regex=gflops_regex, | ||
group_name="per_gflops", | ||
units="GFLOP/s", | ||
fom_type=FomType.THROUGHPUT, | ||
) | ||
|
||
figure_of_merit( | ||
"Total LU GFlops", | ||
fom_regex=lu_gflops_regex, | ||
group_name="gflops", | ||
units="GFLOP/s", | ||
fom_type=FomType.THROUGHPUT, | ||
) | ||
figure_of_merit( | ||
"Per GPU LU GFlops", | ||
fom_regex=lu_gflops_regex, | ||
group_name="per_gflops", | ||
units="GFLOP/s", | ||
fom_type=FomType.THROUGHPUT, | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.