Skip to content

Commit

Permalink
Merge pull request #818 from douglasjacobsen/fix-hpl-foms
Browse files Browse the repository at this point in the history
Update FOM definitions and split out HPL-MxP
  • Loading branch information
rfbgo authored Jan 10, 2025
2 parents cb82e32 + a599a93 commit 9be8e3f
Show file tree
Hide file tree
Showing 5 changed files with 292 additions and 90 deletions.
1 change: 1 addition & 0 deletions lib/ramble/ramble/appkit.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from ramble.util.logger import logger as tty

from ramble.util.file_util import get_file_path
from ramble.util.foms import FomType

from ramble.util.output_capture import OUTPUT_CAPTURE

Expand Down
227 changes: 227 additions & 0 deletions var/ramble/repos/builtin/applications/nvidia-hpl-mxp/application.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
# Copyright 2022-2025 The Ramble Authors
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

from ramble.appkit import *

from ramble.base_app.builtin.hpl import Hpl as HplBase


class NvidiaHplMxp(HplBase):
    """This application defines how to run NVIDIA's optimized version of HPL,
    which is contained in NVIDIA's HPC-Benchmarks collection.

    The NVIDIA HPC-Benchmarks collection provides four benchmarks (HPL,
    HPL-MxP, HPCG, and STREAM) widely used in the HPC community optimized for
    performance on NVIDIA accelerated HPC systems.

    NVIDIA's HPL and HPL-MxP benchmarks provide software packages to solve a
    (random) dense linear system in double precision (64-bit) arithmetic and in
    mixed precision arithmetic using Tensor Cores, respectively, on
    distributed-memory computers equipped with NVIDIA GPUs, based on the
    Netlib HPL benchmark and HPL-MxP benchmark.

    https://catalog.ngc.nvidia.com/orgs/nvidia/containers/hpc-benchmarks
    """

    name = "nvidia-hpl-mxp"

    maintainers("douglasjacobsen")

    tags("benchmark-app", "benchmark", "linpack", "optimized", "nvidia")

    executable(
        "execute",
        './hpl-mxp.sh --gpu-affinity "{gpu_affinity}" --n {Ns} --nb {block_size} --nprow {Ps} --npcol {Qs} --nporder {nporder}',
        use_mpi=True,
    )

    workload("standard", executables=["execute"])
    workload("calculator", executables=["execute"])

    workload_group("standard", workloads=["standard"], mode="append")
    workload_group("calculator", workloads=["calculator"], mode="append")
    workload_group(
        "all_workloads",
        workloads=["standard", "calculator"],
    )

    # Each workload_variable below provides the (user-overridable) value for
    # the environment_variable declared immediately after it; the two
    # descriptions are kept in sync.
    workload_variable(
        "nvshmem_disable_cuda_vmm",
        default="1",
        description="Whether to disable CUDA VMM in NVSHMEM",
        workload_group="all_workloads",
    )
    environment_variable(
        "NVSHMEM_DISABLE_CUDA_VMM",
        "{nvshmem_disable_cuda_vmm}",
        description="Whether to disable CUDA VMM in NVSHMEM",
        workload_group="all_workloads",
    )

    workload_variable(
        "hpl_fct_comm_policy",
        default="1",
        description="HPL FCT communication policy",
        workload_group="all_workloads",
    )
    environment_variable(
        "HPL_FCT_COMM_POLICY",
        "{hpl_fct_comm_policy}",
        description="HPL FCT communication policy",
        workload_group="all_workloads",
    )

    workload_variable(
        "hpl_use_nvshmem",
        default="0",
        description="Whether to use NVSHMEM or not",
        workload_group="all_workloads",
    )
    environment_variable(
        "HPL_USE_NVSHMEM",
        "{hpl_use_nvshmem}",
        description="Whether or not to use NVSHMEM",
        workload_group="all_workloads",
    )

    workload_variable(
        "hpl_p2p_as_bcast",
        default="0",
        description="0 = ncclBcast, 1 = ncclSend/Recv",
        workload_group="all_workloads",
    )
    environment_variable(
        "HPL_P2P_AS_BCAST",
        "{hpl_p2p_as_bcast}",
        description="Whether or not to use P2P for BCAST",
        workload_group="all_workloads",
    )

    workload_variable(
        "pmix_mca_gds",
        default="^ds12",
        description="PMIX MCA gds",
        workload_group="all_workloads",
    )
    environment_variable(
        "PMIX_MCA_gds",
        "{pmix_mca_gds}",
        description="PMIX MCA gds",
        workload_group="all_workloads",
    )

    workload_variable(
        "ompi_mca_btl",
        default="^vader,tcp,openib,uct",
        description="OpenMPI MCA btl",
        workload_group="all_workloads",
    )
    environment_variable(
        "OMPI_MCA_btl",
        "{ompi_mca_btl}",
        description="OpenMPI MCA btl",
        workload_group="all_workloads",
    )

    workload_variable(
        "ompi_mca_pml",
        default="ucx",
        description="OpenMPI MCA pml",
        workload_group="all_workloads",
    )
    environment_variable(
        "OMPI_MCA_pml",
        "{ompi_mca_pml}",
        description="OpenMPI MCA pml",
        workload_group="all_workloads",
    )

    workload_variable(
        "ucx_net_devices",
        default="enp6s0,enp12s0,enp134s0,enp140s0",
        description="UCX Net Devices",
        workload_group="all_workloads",
    )
    environment_variable(
        "UCX_NET_DEVICES",
        "{ucx_net_devices}",
        description="UCX Net Devices",
        workload_group="all_workloads",
    )

    workload_variable(
        "ucx_max_rndv_rails",
        default="4",
        description="UCX Maximum RNDV Rails",
        workload_group="all_workloads",
    )
    environment_variable(
        "UCX_MAX_RNDV_RAILS",
        "{ucx_max_rndv_rails}",
        description="UCX Maximum RNDV Rails",
        workload_group="all_workloads",
    )

    workload_variable(
        "block_size",
        default="1024",
        description="Size of each block",
        workload_group="calculator",
    )

    workload_variable(
        "nporder",
        default="row",
        description="Major order to use for matrix",
        values=["row", "column"],
        workload_group="all_workloads",
    )

    workload_variable(
        "gpu_affinity",
        default="0:1:2:3:4:5:6:7",
        description="Colon delimited list of GPU IDs",
        workload_group="all_workloads",
    )

    # MxP FOMs: total and per-GPU GFLOP/s for the whole solve, plus the
    # LU-factorization-only rates. Each regex defines its own "gflops" and
    # "per_gflops" groups, so the group names may be reused across FOMs.
    gflops_regex = (
        r"\s+GFLOPS = (?P<gflops>\S+), per GPU =\s+(?P<per_gflops>\S+)"
    )
    lu_gflops_regex = (
        r"\s+LU GFLOPS = (?P<gflops>\S+), per GPU =\s+(?P<per_gflops>\S+)"
    )
    figure_of_merit(
        "Total GFlops",
        fom_regex=gflops_regex,
        group_name="gflops",
        units="GFLOP/s",
        fom_type=FomType.THROUGHPUT,
    )
    figure_of_merit(
        "Per GPU GFlops",
        fom_regex=gflops_regex,
        group_name="per_gflops",
        units="GFLOP/s",
        fom_type=FomType.THROUGHPUT,
    )

    figure_of_merit(
        "Total LU GFlops",
        fom_regex=lu_gflops_regex,
        group_name="gflops",
        units="GFLOP/s",
        fom_type=FomType.THROUGHPUT,
    )
    figure_of_merit(
        "Per GPU LU GFlops",
        fom_regex=lu_gflops_regex,
        group_name="per_gflops",
        units="GFLOP/s",
        fom_type=FomType.THROUGHPUT,
    )
11 changes: 11 additions & 0 deletions var/ramble/repos/builtin/applications/nvidia-hpl/application.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,3 +200,14 @@ class NvidiaHpl(HplBase):
description="Colon delimited list of GPU IDs",
workload_group="mxp",
)

# Parses a row of the HPL results table (N, NB, P, Q, time, total GFLOPS,
# then the parenthesized per-GPU GFLOPS) and reports the per-GPU rate.
# NOTE(review): the regex assumes the per-GPU value appears in parentheses
# after the total GFLOPS column — confirm against the NVIDIA HPL output
# format for the container version in use.
figure_of_merit(
    "Per GPU GFlops",
    fom_regex=r".*\s+(?P<N>[0-9]+)\s+(?P<NB>[0-9]+)\s+(?P<P>[0-9]+)"
    + r"\s+(?P<Q>[0-9]+)\s+(?P<time>[0-9]+\.[0-9]+)\s+"
    + r"(?P<gflops>\S+)\s+\(\s+(?P<per_gpu_gflops>\S+)\)",
    group_name="per_gpu_gflops",
    units="GFLOP/s",
    contexts=["problem-name"],
    fom_type=FomType.THROUGHPUT,
)
Original file line number Diff line number Diff line change
Expand Up @@ -57,23 +57,25 @@ class Hpcg(ExecutableApplication):

figure_of_merit(
"Status",
fom_regex=r"Final Summary::HPCG result is (?P<status>[a-zA-Z]+) with a GFLOP/s rating of=(?P<gflops>[0-9]+\.[0-9]+)",
fom_regex=r"Final Summary::HPCG result is (?P<status>[a-zA-Z]+) with a GFLOP/s rating of=(?P<gflops>[0-9\.]+)",
group_name="status",
units="",
)

figure_of_merit(
"Gflops",
fom_regex=r"Final Summary::HPCG result is (?P<status>[a-zA-Z]+) with a GFLOP/s rating of=(?P<gflops>[0-9]+\.[0-9]+)",
"GFlops",
fom_regex=r"Final Summary::HPCG result is (?P<status>[a-zA-Z]+) with a GFLOP/s rating of=(?P<gflops>[0-9\.]+)",
group_name="gflops",
units="GFLOP/s",
fom_type=FomType.THROUGHPUT,
)

figure_of_merit(
"Time",
fom_regex=r"Final Summary::Results are.* execution time.*is=(?P<exec_time>[0-9]+\.[0-9]*)",
fom_regex=r"Final Summary::Results are.* execution time.*is=(?P<exec_time>[0-9\.]*)",
group_name="exec_time",
units="s",
fom_type=FomType.TIME,
)

figure_of_merit(
Expand Down Expand Up @@ -106,9 +108,10 @@ class Hpcg(ExecutableApplication):

figure_of_merit(
"HPCG 2.4 Rating",
fom_regex=r"Final Summary::HPCG 2\.4 rating.*=(?P<rating>[0-9]+\.*[0-9]*)",
fom_regex=r"Final Summary::HPCG 2\.4 rating.*=(?P<rating>[0-9\.]+)",
group_name="rating",
units="",
fom_type=FomType.THROUGHPUT,
)

register_template(
Expand Down
Loading

0 comments on commit 9be8e3f

Please sign in to comment.