From 709530222b64112f244eecafaa0d0d35fe0bc50d Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Mon, 27 Jan 2025 18:08:55 +0000
Subject: [PATCH] Add checks for comm size

---
 examples/7_MPI/CMakeLists.txt        |  8 ++++----
 examples/7_MPI/mpi_infer_fortran.f90 | 10 +++++++++-
 examples/7_MPI/mpi_infer_python.py   |  6 ++++++
 3 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/examples/7_MPI/CMakeLists.txt b/examples/7_MPI/CMakeLists.txt
index fbb3283a..f850dd6b 100644
--- a/examples/7_MPI/CMakeLists.txt
+++ b/examples/7_MPI/CMakeLists.txt
@@ -40,8 +40,8 @@ if(CMAKE_BUILD_TESTS)
                                   # the model
     WORKING_DIRECTORY ${PROJECT_BINARY_DIR})
 
-  # 3. Check the model can be loaded from file and run in Python and that its
-  #   outputs meet expectations
+  # 3. Check the model can be loaded from file and run with MPI in Python and
+  #   that its outputs meet expectations
   add_test(
     NAME mpi_infer_python
     COMMAND
@@ -53,8 +53,8 @@ if(CMAKE_BUILD_TESTS)
     mpi_infer_python PROPERTIES PASS_REGULAR_EXPRESSION
     "MPI Python example ran successfully")
 
-  # 4. Check the model can be loaded from file and run in Fortran and that its
-  #   outputs meet expectations
+  # 4. Check the model can be loaded from file and run with MPI in Fortran and
+  #   that its outputs meet expectations
   add_test(
     NAME mpi_infer_fortran
     COMMAND
diff --git a/examples/7_MPI/mpi_infer_fortran.f90 b/examples/7_MPI/mpi_infer_fortran.f90
index e217ec9c..bdaa91bd 100644
--- a/examples/7_MPI/mpi_infer_fortran.f90
+++ b/examples/7_MPI/mpi_infer_fortran.f90
@@ -47,6 +47,15 @@ program inference
 
    call mpi_init(ierr)
    call mpi_comm_rank(mpi_comm_world, rank, ierr)
+   call mpi_comm_size(mpi_comm_world, size, ierr)
+
+   ! Abort early if the communicator has a single rank, which suggests MPI is misconfigured
+   if (size == 1) then
+      write(*,*) "MPI communicator size is 1, indicating that it is not configured correctly"
+      write(*,*) "(assuming you specified more than one rank)"
+      call clean_up()
+      stop 999
+   end if
 
    ! Get TorchScript model file as a command line argument
    num_args = command_argument_count()
@@ -76,7 +85,6 @@ program inference
    write(unit=6, fmt=100) out_data(:)
 
    ! Gather the outputs onto rank 0
-   call mpi_comm_size(mpi_comm_world, size, ierr)
    allocate(recvbuf(5,size))
    call mpi_gather(out_data, 5, mpi_float, recvbuf, 5, mpi_float, 0, mpi_comm_world, ierr)
 
diff --git a/examples/7_MPI/mpi_infer_python.py b/examples/7_MPI/mpi_infer_python.py
index bd8963d8..8e85544b 100644
--- a/examples/7_MPI/mpi_infer_python.py
+++ b/examples/7_MPI/mpi_infer_python.py
@@ -59,6 +59,12 @@ def deploy(saved_model: str, device: str, batch_size: int = 1) -> torch.Tensor:
     comm = MPI.COMM_WORLD
     rank = comm.rank
     device_to_run = "cpu"
+    if comm.size == 1:
+        size_error = (
+            "MPI communicator size is 1, indicating that it is not configured correctly"
+            " (assuming you specified more than one rank)"
+        )
+        raise ValueError(size_error)
 
     batch_size_to_run = 1