
Commit 4504725

Merge remote-tracking branch 'origin/208_multi-gpu-build' into add-devices-iccs
jwallwork23 committed Jan 24, 2025
2 parents 765f79a + 3f3cd53 commit 4504725
Showing 12 changed files with 83 additions and 27 deletions.
examples/1_SimpleNet/CMakeLists.txt (6 changes: 3 additions & 3 deletions)
@@ -29,7 +29,7 @@ if(CMAKE_BUILD_TESTS)
add_test(NAME simplenet COMMAND ${Python_EXECUTABLE}
${PROJECT_SOURCE_DIR}/simplenet.py)

-# 1. Check the model is saved to file in the expected location with the
+# 2. Check the model is saved to file in the expected location with the
# pt2ts.py script
add_test(
NAME pt2ts
@@ -38,7 +38,7 @@ if(CMAKE_BUILD_TESTS)
# the model
WORKING_DIRECTORY ${PROJECT_BINARY_DIR})

-# 1. Check the model can be loaded from file and run in Python and that its
+# 3. Check the model can be loaded from file and run in Python and that its
# outputs meet expectations
add_test(
NAME simplenet_infer_python
@@ -47,7 +47,7 @@ if(CMAKE_BUILD_TESTS)
# model
WORKING_DIRECTORY ${PROJECT_BINARY_DIR})

-# 1. Check the model can be loaded from file and run in Fortran and that its
+# 4. Check the model can be loaded from file and run in Fortran and that its
# outputs meet expectations
add_test(
NAME simplenet_infer_fortran
examples/2_ResNet18/CMakeLists.txt (4 changes: 2 additions & 2 deletions)
@@ -31,7 +31,7 @@ if(CMAKE_BUILD_TESTS)
COMMAND ${Python_EXECUTABLE} ${PROJECT_SOURCE_DIR}/resnet18.py
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})

-# 1. Check the model is saved to file in the expected location with the
+# 2. Check the model is saved to file in the expected location with the
# pt2ts.py script
add_test(
NAME pt2ts
@@ -40,7 +40,7 @@ if(CMAKE_BUILD_TESTS)
# the model
WORKING_DIRECTORY ${PROJECT_BINARY_DIR})

-# 1. Check the model can be loaded from file and run in Fortran and that its
+# 3. Check the model can be loaded from file and run in Fortran and that its
# outputs meet expectations
add_test(
NAME resnet_infer_fortran
examples/3_MultiGPU/CMakeLists.txt (52 changes: 49 additions & 3 deletions)
@@ -18,7 +18,53 @@ find_package(FTorch)
find_package(MPI REQUIRED)
message(STATUS "Building with Fortran PyTorch coupling")

+check_language(CUDA)
+if(CMAKE_CUDA_COMPILER)
+enable_language(CUDA)
+else()
+message(ERROR "No CUDA support")
+endif()
+
# Fortran example
-add_executable(simplenet_infer_fortran_gpu simplenet_infer_fortran.f90)
-target_link_libraries(simplenet_infer_fortran_gpu PRIVATE FTorch::ftorch)
-target_link_libraries(simplenet_infer_fortran_gpu PRIVATE MPI::MPI_Fortran)
+add_executable(multigpu_infer_fortran multigpu_infer_fortran.f90)
+target_link_libraries(multigpu_infer_fortran PRIVATE FTorch::ftorch)
+target_link_libraries(multigpu_infer_fortran PRIVATE MPI::MPI_Fortran)
+
+# Integration testing
+if (CMAKE_BUILD_TESTS)
+include(CTest)
+
+# 1. Check the PyTorch model runs and its outputs meet expectations
+add_test(NAME multigpu COMMAND ${Python_EXECUTABLE}
+${PROJECT_SOURCE_DIR}/multigpu.py)
+
+# 2. Check the model is saved to file in the expected location with the
+# pt2ts.py script
+add_test(
+NAME pt2ts
+COMMAND ${Python_EXECUTABLE} ${PROJECT_SOURCE_DIR}/pt2ts.py
+${PROJECT_BINARY_DIR} # Command line argument: filepath for saving
+# the model
+WORKING_DIRECTORY ${PROJECT_BINARY_DIR})
+
+# 3. Check the model can be loaded from file and run in Python and that its
+# outputs meet expectations
+add_test(
+NAME multigpu_infer_python
+COMMAND ${Python_EXECUTABLE} ${PROJECT_SOURCE_DIR}/multigpu_infer_python.py
+${PROJECT_BINARY_DIR} # Command line argument: filepath to find the
+# model
+WORKING_DIRECTORY ${PROJECT_BINARY_DIR})
+
+# 4. Check the model can be loaded from file and run in Fortran and that its
+# outputs meet expectations
+add_test(
+NAME multigpu_infer_fortran
+COMMAND
+multigpu_infer_fortran ${PROJECT_BINARY_DIR}/saved_multigpu_model_cuda.pt
+# Command line argument: model file
+WORKING_DIRECTORY ${PROJECT_BINARY_DIR})
+set_tests_properties(
+multigpu_infer_fortran PROPERTIES PASS_REGULAR_EXPRESSION
+"MultiGPU example ran successfully")
+endif()
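The new integration tests above assume one MPI rank per CUDA device. As a point of reference, a minimal Python sketch of that rank-to-device mapping is shown below (assuming mpi4py and a CUDA-enabled PyTorch build; this snippet is illustrative and not part of the commit):

# Illustrative sketch only: map each MPI rank to a CUDA device, as the
# multigpu tests assume. Requires mpi4py and a CUDA-enabled PyTorch build.
import torch
from mpi4py import MPI

rank = MPI.COMM_WORLD.rank
if not torch.cuda.is_available():
    raise RuntimeError("No CUDA support")
# Wrap around in case there are more ranks than visible GPUs.
device = torch.device(f"cuda:{rank % torch.cuda.device_count()}")
print(f"rank {rank} -> {device}")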
examples/3_MultiGPU/multigpu.py
@@ -4,7 +4,7 @@
from torch import nn


-class SimpleNet(nn.Module):
+class MultiGPUNet(nn.Module):
"""PyTorch module multiplying an input vector by 2."""

def __init__(
@@ -42,12 +42,13 @@ def forward(self, batch: torch.Tensor) -> torch.Tensor:


if __name__ == "__main__":
-model = SimpleNet()
+model = MultiGPUNet().to(torch.device("cuda"))
model.eval()

input_tensor = torch.Tensor([0.0, 1.0, 2.0, 3.0, 4.0])
input_tensor_gpu = input_tensor.to(torch.device("cuda"))

print(f"SimpleNet forward pass on CUDA device {input_tensor_gpu.get_device()}")
with torch.no_grad():
-print(model(input_tensor_gpu))
+output = model(input_tensor_gpu)
+print(output)
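Only fragments of the renamed module appear in this hunk. Below is a self-contained sketch of a module matching the docstring (multiply the input vector by two); the exact layer layout is an assumption rather than something shown in the diff:

# Hedged sketch of a MultiGPUNet-style module; the real multigpu.py may differ.
import torch
from torch import nn


class MultiGPUNet(nn.Module):
    """PyTorch module multiplying an input vector by 2."""

    def forward(self, batch: torch.Tensor) -> torch.Tensor:
        # Scale every element of the input batch by a factor of two.
        return 2.0 * batch


if __name__ == "__main__":
    model = MultiGPUNet().to(torch.device("cuda"))
    model.eval()
    with torch.no_grad():
        print(model(torch.tensor([0.0, 1.0, 2.0, 3.0, 4.0], device="cuda")))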
examples/3_MultiGPU/multigpu_infer_python.py
@@ -1,4 +1,4 @@
"""Load saved SimpleNet to TorchScript and run inference example."""
"""Load saved MultiGPUNet to TorchScript and run inference example."""

import torch
from mpi4py import MPI
@@ -50,14 +50,13 @@ def deploy(saved_model: str, device: str, batch_size: int = 1) -> torch.Tensor:


if __name__ == "__main__":
saved_model_file = "saved_simplenet_model_cuda.pt"
saved_model_file = "saved_multigpu_model_cuda.pt"

-rank = MPI.COMM_WORLD.rank
-device_to_run = f"cuda:{rank}"
+device_to_run = f"cuda:{MPI.COMM_WORLD.rank}"

batch_size_to_run = 1

with torch.no_grad():
result = deploy(saved_model_file, device_to_run, batch_size_to_run)

print(f"{rank}: {result}")
print(f"Output on device {device_to_run}: {result}")
examples/3_MultiGPU/pt2ts.py (8 changes: 5 additions & 3 deletions)
@@ -1,11 +1,13 @@
"""Load a PyTorch model and convert it to TorchScript."""

import os
import sys
from typing import Optional

# FPTLIB-TODO
# Add a module import with your model here:
# This example assumes the model architecture is in an adjacent module `my_ml_model.py`
-import simplenet
+import multigpu
import torch


@@ -79,7 +81,7 @@ def load_torchscript(filename: Optional[str] = "saved_model.pt") -> torch.nn.Module:
# Insert code here to load your model as `trained_model`.
# This example assumes my_ml_model has a method `initialize` to load
# architecture, weights, and place in inference mode
-trained_model = simplenet.SimpleNet()
+trained_model = multigpu.MultiGPUNet()

# Switch off specific layers/parts of the model that behave
# differently during training and inference.
@@ -115,7 +117,7 @@ def load_torchscript(filename: Optional[str] = "saved_model.pt") -> torch.nn.Module:

# FPTLIB-TODO
# Set the name of the file you want to save the torchscript model to:
saved_ts_filename = "saved_simplenet_model_cuda.pt"
saved_ts_filename = "saved_multigpu_model_cuda.pt"
# A filepath may also be provided. To do this, pass the filepath as an argument to
# this script when it is run from the command line, i.e. `./pt2ts.py path/to/model`.

examples/4_MultiIO/CMakeLists.txt (6 changes: 3 additions & 3 deletions)
@@ -29,7 +29,7 @@ if(CMAKE_BUILD_TESTS)
add_test(NAME multiionet COMMAND ${Python_EXECUTABLE}
${PROJECT_SOURCE_DIR}/multiionet.py)

-# 1. Check the model is saved to file in the expected location with the
+# 2. Check the model is saved to file in the expected location with the
# pt2ts.py script
add_test(
NAME pt2ts
@@ -38,7 +38,7 @@ if(CMAKE_BUILD_TESTS)
# the model
WORKING_DIRECTORY ${PROJECT_BINARY_DIR})

-# 1. Check the model can be loaded from file and run in Python and that its
+# 3. Check the model can be loaded from file and run in Python and that its
# outputs meet expectations
add_test(
NAME multiionet_infer_python
@@ -47,7 +47,7 @@ if(CMAKE_BUILD_TESTS)
${PROJECT_BINARY_DIR} # Command line argument: filepath to find the model
WORKING_DIRECTORY ${PROJECT_BINARY_DIR})

-# 1. Check the model can be loaded from file and run in Fortran and that its
+# 4. Check the model can be loaded from file and run in Fortran and that its
# outputs meet expectations
add_test(
NAME multiionet_infer_fortran
examples/6_Autograd/CMakeLists.txt (2 changes: 1 addition & 1 deletion)
@@ -29,7 +29,7 @@ if(CMAKE_BUILD_TESTS)
add_test(NAME pyautograd COMMAND ${Python_EXECUTABLE}
${PROJECT_SOURCE_DIR}/autograd.py)

-# 1. Check the Fortran Autograd script runs successfully
+# 2. Check the Fortran Autograd script runs successfully
add_test(
NAME fautograd
COMMAND autograd
examples/CMakeLists.txt (4 changes: 3 additions & 1 deletion)
@@ -1,7 +1,9 @@
if(CMAKE_BUILD_TESTS)
add_subdirectory(1_SimpleNet)
add_subdirectory(2_ResNet18)
-# add_subdirectory(3_MultiGPU)
+if(ENABLE_CUDA AND ENABLE_MPI)
+add_subdirectory(3_MultiGPU)
+endif()
add_subdirectory(4_MultiIO)
# add_subdirectory(5_Looping)
add_subdirectory(6_Autograd)
run_test_suite.sh (6 changes: 5 additions & 1 deletion)
@@ -82,7 +82,11 @@ fi

# Run integration tests
if [ "${UNIT_ONLY}" = false ]; then
-EXAMPLES="1_SimpleNet 2_ResNet18 4_MultiIO 6_Autograd"
+if [ -e "${BUILD_DIR}/test/examples/3_MultiGPU" ]; then
+EXAMPLES="1_SimpleNet 2_ResNet18 3_MultiGPU 4_MultiIO 6_Autograd"
+else
+EXAMPLES="1_SimpleNet 2_ResNet18 4_MultiIO 6_Autograd"
+fi
for EXAMPLE in ${EXAMPLES}; do
pip -q install -r examples/"${EXAMPLE}"/requirements.txt
cd "${BUILD_DIR}"/test/examples/"${EXAMPLE}"
src/CMakeLists.txt (6 changes: 4 additions & 2 deletions)
@@ -127,8 +127,10 @@ if(CMAKE_BUILD_TESTS)
DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/test/examples)
file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/../examples/2_ResNet18
DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/test/examples)
-# file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/../examples/3_MultiGPU
-# DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/test/examples)
+if(ENABLE_CUDA AND ENABLE_MPI)
+file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/../examples/3_MultiGPU
+DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/test/examples)
+endif()
file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/../examples/4_MultiIO
DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/test/examples)
# file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/../examples/5_Looping
