From 709530222b64112f244eecafaa0d0d35fe0bc50d Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Mon, 27 Jan 2025 18:08:55 +0000 Subject: [PATCH] Add checks for comm size --- examples/7_MPI/CMakeLists.txt | 8 ++++---- examples/7_MPI/mpi_infer_fortran.f90 | 10 +++++++++- examples/7_MPI/mpi_infer_python.py | 6 ++++++ 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/examples/7_MPI/CMakeLists.txt b/examples/7_MPI/CMakeLists.txt index fbb3283a..f850dd6b 100644 --- a/examples/7_MPI/CMakeLists.txt +++ b/examples/7_MPI/CMakeLists.txt @@ -40,8 +40,8 @@ if(CMAKE_BUILD_TESTS) # the model WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) - # 3. Check the model can be loaded from file and run in Python and that its - # outputs meet expectations + # 3. Check the model can be loaded from file and run with MPI in Python and + # that its outputs meet expectations add_test( NAME mpi_infer_python COMMAND @@ -53,8 +53,8 @@ if(CMAKE_BUILD_TESTS) mpi_infer_python PROPERTIES PASS_REGULAR_EXPRESSION "MPI Python example ran successfully") - # 4. Check the model can be loaded from file and run in Fortran and that its - # outputs meet expectations + # 4. Check the model can be loaded from file and run with MPI in Fortran and + # that its outputs meet expectations add_test( NAME mpi_infer_fortran COMMAND diff --git a/examples/7_MPI/mpi_infer_fortran.f90 b/examples/7_MPI/mpi_infer_fortran.f90 index e217ec9c..bdaa91bd 100644 --- a/examples/7_MPI/mpi_infer_fortran.f90 +++ b/examples/7_MPI/mpi_infer_fortran.f90 @@ -47,6 +47,15 @@ program inference call mpi_init(ierr) call mpi_comm_rank(mpi_comm_world, rank, ierr) + call mpi_comm_size(mpi_comm_world, size, ierr) + + ! Check MPI was configured correctly + if (size == 1) then + write(*,*) "MPI communicator size is 1, indicating that it is not configured correctly" + write(*,*) "(assuming you specified more than one rank)" + call clean_up() + stop 999 + end if ! Get TorchScript model file as a command line argument num_args = command_argument_count() @@ -76,7 +85,6 @@ program inference write(unit=6, fmt=100) out_data(:) ! Gather the outputs onto rank 0 - call mpi_comm_size(mpi_comm_world, size, ierr) allocate(recvbuf(5,size)) call mpi_gather(out_data, 5, mpi_float, recvbuf, 5, mpi_float, 0, mpi_comm_world, ierr) diff --git a/examples/7_MPI/mpi_infer_python.py b/examples/7_MPI/mpi_infer_python.py index bd8963d8..8e85544b 100644 --- a/examples/7_MPI/mpi_infer_python.py +++ b/examples/7_MPI/mpi_infer_python.py @@ -59,6 +59,12 @@ def deploy(saved_model: str, device: str, batch_size: int = 1) -> torch.Tensor: comm = MPI.COMM_WORLD rank = comm.rank device_to_run = "cpu" + if comm.size == 1: + size_error = ( + "MPI communicator size is 1, indicating that it is not configured correctly" + " (assuming you specified more than one rank)" + ) + raise ValueError(size_error) batch_size_to_run = 1