diff --git a/.buildkite/distributed/pipeline.yml b/.buildkite/distributed/pipeline.yml
index fe38f7bea1..9f7869de63 100644
--- a/.buildkite/distributed/pipeline.yml
+++ b/.buildkite/distributed/pipeline.yml
@@ -1,7 +1,7 @@
 agents:
   queue: new-central
-  slurm_mem: 8G
-  modules: climacommon/2024_10_09
+  slurm_mem: 8G # Note that the tests run on shared nodes, so limiting the memory usage might help in avoiding long queues
+  modules: climacommon/2024_10_08

 env:
   JULIA_LOAD_PATH: "${JULIA_LOAD_PATH}:${BUILDKITE_BUILD_CHECKOUT_PATH}/.buildkite/distributed"
@@ -16,13 +16,14 @@ steps:
     key: "init_central"
     env:
       TEST_GROUP: "init"
+      GPU_TEST: "true"
     command:
-      - echo "--- Instantiate project"
-      - "julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
+      - echo "--- Initialize tests"
+      - "julia -O0 --project -e 'using Pkg; Pkg.test()'"
     agents:
-      slurm_mem: 120G
-      slurm_gpus: 1
-      slurm_cpus_per_task: 8
+      slurm_mem: 8G
+      slurm_ntasks: 1
+      slurm_gpus_per_task: 1

   - wait
@@ -30,20 +31,27 @@ steps:
     key: "distributed_cpu"
     env:
       TEST_GROUP: "distributed"
+      MPI_TEST: "true"
     commands:
       - "srun julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
-      slurm_mem: 120G
+      slurm_mem: 8G
       slurm_ntasks: 4
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1

   - label: "🐲 gpu distributed unit tests"
     key: "distributed_gpu"
     env:
       TEST_GROUP: "distributed"
+      GPU_TEST: "true"
+      MPI_TEST: "true"
     commands:
       - "srun julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
-      slurm_mem: 120G
+      slurm_mem: 8G
       slurm_ntasks: 4
       slurm_gpus_per_task: 1
     retry:
@@ -51,25 +59,31 @@ steps:
         - exit_status: 1
          limit: 1

-
   - label: "🦾 cpu distributed solvers tests"
     key: "distributed_solvers_cpu"
     env:
       TEST_GROUP: "distributed_solvers"
+      MPI_TEST: "true"
     commands:
       - "srun julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
-      slurm_mem: 120G
+      slurm_mem: 50G
       slurm_ntasks: 4
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1

   - label: "🛸 gpu distributed solvers tests"
     key: "distributed_solvers_gpu"
     env:
       TEST_GROUP: "distributed_solvers"
+      GPU_TEST: "true"
+      MPI_TEST: "true"
     commands:
       - "srun julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
-      slurm_mem: 120G
+      slurm_mem: 50G
       slurm_ntasks: 4
       slurm_gpus_per_task: 1
     retry:
@@ -81,20 +95,27 @@ steps:
     key: "distributed_hydrostatic_model_cpu"
     env:
       TEST_GROUP: "distributed_hydrostatic_model"
+      MPI_TEST: "true"
     commands:
       - "srun julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
-      slurm_mem: 120G
+      slurm_mem: 50G
       slurm_ntasks: 4
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1

   - label: "🦏 gpu distributed hydrostatic model tests"
     key: "distributed_hydrostatic_model_gpu"
     env:
       TEST_GROUP: "distributed_hydrostatic_model"
+      GPU_TEST: "true"
+      MPI_TEST: "true"
     commands:
       - "srun julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
-      slurm_mem: 120G
+      slurm_mem: 80G # Apparently the GPU tests require more memory
       slurm_ntasks: 4
       slurm_gpus_per_task: 1
     retry:
@@ -106,20 +127,27 @@ steps:
     key: "distributed_nonhydrostatic_regression_cpu"
     env:
       TEST_GROUP: "distributed_nonhydrostatic_regression"
+      MPI_TEST: "true"
     commands:
       - "srun julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
-      slurm_mem: 120G
+      slurm_mem: 50G
       slurm_ntasks: 4
+    retry:
+      automatic:
+        - exit_status: 1
+          limit: 1

   - label: "🕺 gpu distributed nonhydrostatic regression"
     key: "distributed_nonhydrostatic_regression_gpu"
     env:
       TEST_GROUP: "distributed_nonhydrostatic_regression"
+      GPU_TEST: "true"
+      MPI_TEST: "true"
     commands:
       - "srun julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
-      slurm_mem: 120G
+      slurm_mem: 50G
       slurm_ntasks: 4
       slurm_gpus_per_task: 1
     retry:
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 1d6d6e31ae..49226230da 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -55,6 +55,7 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$SVERDRUP_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "unit"
+      GPU_TEST: "true"
     commands:
       - "$SVERDRUP_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -70,7 +71,6 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$TARTARUS_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "unit"
-      CUDA_VISIBLE_DEVICES: "-1"
     commands:
       - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -90,6 +90,7 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$SVERDRUP_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "poisson_solvers_1"
+      GPU_TEST: "true"
     commands:
       - "$SVERDRUP_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -105,7 +106,6 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$TARTARUS_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "poisson_solvers_1"
-      CUDA_VISIBLE_DEVICES: "-1"
     commands:
       - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -121,6 +121,7 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$SVERDRUP_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "poisson_solvers_2"
+      GPU_TEST: "true"
     commands:
       - "$SVERDRUP_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -136,7 +137,6 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$TARTARUS_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "poisson_solvers_2"
-      CUDA_VISIBLE_DEVICES: "-1"
     commands:
       - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -152,6 +152,7 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$SVERDRUP_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "matrix_poisson_solvers"
+      GPU_TEST: "true"
     commands:
       - "$SVERDRUP_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -167,7 +168,6 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$TARTARUS_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "matrix_poisson_solvers"
-      CUDA_VISIBLE_DEVICES: "-1"
     commands:
       - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -183,6 +183,7 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$SVERDRUP_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "general_solvers"
+      GPU_TEST: "true"
     commands:
       - "$SVERDRUP_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -198,7 +199,6 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$TARTARUS_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "general_solvers"
-      CUDA_VISIBLE_DEVICES: "-1"
     commands:
       - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -218,6 +218,7 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$SVERDRUP_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "time_stepping_1"
+      GPU_TEST: "true"
     commands:
       - "$SVERDRUP_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -233,7 +234,6 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$TARTARUS_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "time_stepping_1"
-      CUDA_VISIBLE_DEVICES: "-1"
     commands:
       - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -253,6 +253,7 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$SVERDRUP_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "time_stepping_2"
+      GPU_TEST: "true"
     commands:
       - "$SVERDRUP_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -268,7 +269,6 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$TARTARUS_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "time_stepping_2"
-      CUDA_VISIBLE_DEVICES: "-1"
     commands:
       - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -288,6 +288,7 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$SVERDRUP_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "time_stepping_3"
+      GPU_TEST: "true"
     commands:
       - "$SVERDRUP_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -303,7 +304,6 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$TARTARUS_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "time_stepping_3"
-      CUDA_VISIBLE_DEVICES: "-1"
     commands:
       - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -323,6 +323,7 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$SVERDRUP_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "turbulence_closures"
+      GPU_TEST: "true"
     commands:
       - "$SVERDRUP_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -338,7 +339,6 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$TARTARUS_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "turbulence_closures"
-      CUDA_VISIBLE_DEVICES: "-1"
     commands:
       - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -358,6 +358,7 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$SVERDRUP_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "hydrostatic_free_surface"
+      GPU_TEST: "true"
     commands:
       - "$SVERDRUP_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -373,7 +374,6 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$TARTARUS_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "hydrostatic_free_surface"
-      CUDA_VISIBLE_DEVICES: "-1"
     commands:
       - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -393,6 +393,7 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$SVERDRUP_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "shallow_water"
+      GPU_TEST: "true"
     commands:
       - "$SVERDRUP_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -408,7 +409,6 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$TARTARUS_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "shallow_water"
-      CUDA_VISIBLE_DEVICES: "-1"
     commands:
       - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -428,6 +428,7 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$SVERDRUP_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "simulation"
+      GPU_TEST: "true"
     commands:
       - "$SVERDRUP_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -443,7 +444,6 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$TARTARUS_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
      TEST_GROUP: "simulation"
-      CUDA_VISIBLE_DEVICES: "-1"
     commands:
       - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -463,6 +463,7 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$SVERDRUP_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "lagrangian"
+      GPU_TEST: "true"
     commands:
       - "$SVERDRUP_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -478,7 +479,6 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$TARTARUS_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "lagrangian"
-      CUDA_VISIBLE_DEVICES: "-1"
     commands:
       - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -498,6 +498,7 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$SVERDRUP_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "abstract_operations"
+      GPU_TEST: "true"
     commands:
       - "$SVERDRUP_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -513,7 +514,6 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$TARTARUS_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "abstract_operations"
-      CUDA_VISIBLE_DEVICES: "-1"
     commands:
       - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -533,6 +533,7 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$SVERDRUP_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "multi_region"
+      GPU_TEST: "true"
     commands:
       - "$SVERDRUP_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -548,7 +549,6 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$TARTARUS_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "multi_region"
-      CUDA_VISIBLE_DEVICES: "-1"
     commands:
       - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -568,6 +568,7 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$SVERDRUP_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "nonhydrostatic_regression"
+      GPU_TEST: "true"
     commands:
       - "$SVERDRUP_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -583,7 +584,6 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$TARTARUS_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "nonhydrostatic_regression"
-      CUDA_VISIBLE_DEVICES: "-1"
     commands:
       - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -599,6 +599,7 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$SVERDRUP_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "hydrostatic_regression"
+      GPU_TEST: "true"
     commands:
       - "$SVERDRUP_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -614,7 +615,6 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$TARTARUS_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "hydrostatic_regression"
-      CUDA_VISIBLE_DEVICES: "-1"
     commands:
       - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -634,6 +634,7 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$SVERDRUP_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "scripts"
+      GPU_TEST: "true"
     commands:
       - "$SVERDRUP_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -649,7 +650,6 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$TARTARUS_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "scripts"
-      CUDA_VISIBLE_DEVICES: "-1"
     commands:
       - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -669,6 +669,7 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$SVERDRUP_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "enzyme"
+      GPU_TEST: "true"
     commands:
       - "$SVERDRUP_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
@@ -684,7 +685,6 @@ steps:
     env:
       JULIA_DEPOT_PATH: "$TARTARUS_HOME/.julia-$BUILDKITE_BUILD_NUMBER"
       TEST_GROUP: "enzyme"
-      CUDA_VISIBLE_DEVICES: "-1"
     commands:
       - "$TARTARUS_HOME/julia-$JULIA_VERSION/bin/julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
     agents:
diff --git a/Project.toml b/Project.toml
index 30ba117d7a..bd0f0ef2fb 100644
--- a/Project.toml
+++ b/Project.toml
@@ -50,7 +50,7 @@ CubedSphere = "0.2, 0.3"
 Dates = "1.9"
 Distances = "0.10"
 DocStringExtensions = "0.8, 0.9"
-Enzyme = "0.13.13"
+Enzyme = "0.13.14"
 FFTW = "1"
 Glob = "1.3"
 IncompleteLU = "0.2"
@@ -78,9 +78,11 @@ julia = "1.9"
 [extras]
 DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
 Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
+CUDA_Runtime_jll = "76a88914-d11a-5bdc-97e0-2f5a05c973a2"
+MPIPreferences = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267"
 SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 TimesDates = "bdfc003b-8df8-5c39-adcd-3a9087f5df4a"

 [targets]
-test = ["DataDeps", "Enzyme", "SafeTestsets", "Test", "TimesDates"]
+test = ["DataDeps", "SafeTestsets", "Test", "Enzyme", "CUDA_Runtime_jll", "MPIPreferences", "TimesDates"]
diff --git a/src/Fields/set!.jl b/src/Fields/set!.jl
index e4b8a29cb6..e311f659f1 100644
--- a/src/Fields/set!.jl
+++ b/src/Fields/set!.jl
@@ -3,7 +3,7 @@ using KernelAbstractions: @kernel, @index
 using Adapt: adapt_structure

 using Oceananigans.Grids: on_architecture, node_names
-using Oceananigans.Architectures: child_architecture, device, GPU, CPU
+using Oceananigans.Architectures: child_architecture, cpu_architecture, device, GPU, CPU
 using Oceananigans.Utils: work_layout

 #####
@@ -45,13 +45,16 @@ end
 function set_to_function!(u, f)
     # Supports serial and distributed
-    arch = child_architecture(u)
+    arch = architecture(u)
+    child_arch = child_architecture(u)

     # Determine cpu_grid and cpu_u
-    if arch isa GPU
-        cpu_grid = on_architecture(CPU(), u.grid)
-        cpu_u = Field(location(u), cpu_grid; indices = indices(u))
-    elseif arch isa CPU
+    if child_arch isa GPU
+        cpu_arch = cpu_architecture(arch)
+        cpu_grid = on_architecture(cpu_arch, u.grid)
+        cpu_u = Field(location(u), cpu_grid; indices = indices(u))
+
+    elseif child_arch isa CPU
         cpu_grid = u.grid
         cpu_u = u
     end
@@ -65,8 +68,8 @@ function set_to_function!(u, f)
     catch err
         u_loc = Tuple(L() for L in location(u))

-        arg_str = tuple_string(node_names(u.grid, u_loc...))
-        loc_str = tuple_string(location(u))
+        arg_str  = tuple_string(node_names(u.grid, u_loc...))
+        loc_str  = tuple_string(location(u))
         topo_str = tuple_string(topology(u.grid))
         msg = string("An error was encountered within set! while setting the field", '\n', '\n',
@@ -81,10 +84,8 @@ function set_to_function!(u, f)
     end

     # Transfer data to GPU if u is on the GPU
-    if child_architecture(u) isa GPU
-        set!(u, cpu_u)
-    end
-
+    child_arch isa GPU && set!(u, cpu_u)
+
     return u
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index 816d8d578a..219eb36452 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -172,18 +172,24 @@ CUDA.allowscalar() do

     if group == :distributed || group == :all
         MPI.Initialized() || MPI.Init()
+        # In case CUDA is not found, we reset CUDA and restart the julia session
+        reset_cuda_if_necessary()
         archs = test_architectures()
         include("test_distributed_models.jl")
     end

     if group == :distributed_solvers || group == :all
         MPI.Initialized() || MPI.Init()
+        # In case CUDA is not found, we reset CUDA and restart the julia session
+        reset_cuda_if_necessary()
         include("test_distributed_transpose.jl")
         include("test_distributed_poisson_solvers.jl")
     end

     if group == :distributed_hydrostatic_model || group == :all
         MPI.Initialized() || MPI.Init()
+        # In case CUDA is not found, we reset CUDA and restart the julia session
+        reset_cuda_if_necessary()
         archs = test_architectures()
         include("test_hydrostatic_regression.jl")
         include("test_distributed_hydrostatic_model.jl")
@@ -191,6 +197,8 @@ CUDA.allowscalar() do

     if group == :distributed_nonhydrostatic_regression || group == :all
         MPI.Initialized() || MPI.Init()
+        # In case CUDA is not found, we reset CUDA and restart the julia session
+        reset_cuda_if_necessary()
         archs = nonhydrostatic_regression_test_architectures()
         include("test_nonhydrostatic_regression.jl")
     end
diff --git a/test/test_distributed_poisson_solvers.jl b/test/test_distributed_poisson_solvers.jl
index dca406d8c3..5ab7c90a7e 100644
--- a/test/test_distributed_poisson_solvers.jl
+++ b/test/test_distributed_poisson_solvers.jl
@@ -121,9 +121,7 @@ function divergence_free_poisson_tridiagonal_solution(grid_points, ranks, stretc
     return Array(interior(∇²ϕ)) ≈ Array(R)
 end

-@testset "Distributed FFT-based Poisson solver" begin
-    child_arch = test_child_arch()
-
+@testset "Distributed FFT-based Poisson solver" begin
     for topology in ((Periodic, Periodic, Periodic),
                      (Periodic, Periodic, Bounded),
                      (Periodic, Bounded, Bounded),
diff --git a/test/test_distributed_transpose.jl b/test/test_distributed_transpose.jl
index 5bc0fe6292..2eff450b27 100644
--- a/test/test_distributed_transpose.jl
+++ b/test/test_distributed_transpose.jl
@@ -38,8 +38,6 @@ function test_transpose(grid_points, ranks, topo, child_arch)
 end

 @testset "Distributed Transpose" begin
-    child_arch = test_child_arch()
-
     for topology in ((Periodic, Periodic, Periodic),
                      (Periodic, Periodic, Bounded),
                      (Periodic, Bounded, Bounded),
diff --git a/test/utils_for_runtests.jl b/test/utils_for_runtests.jl
index 7bc06e528e..a027daf4de 100644
--- a/test/utils_for_runtests.jl
+++ b/test/utils_for_runtests.jl
@@ -3,21 +3,55 @@ using Oceananigans.DistributedComputations: Distributed, Partition, child_archit

 import Oceananigans.Fields: interior

-test_child_arch() = CUDA.has_cuda() ? GPU() : CPU()
+# Are the tests running on the GPUs?
+# Are the tests running in parallel?
+child_arch = get(ENV, "GPU_TEST", nothing) == "true" ? GPU() : CPU()
+mpi_test   = get(ENV, "MPI_TEST", nothing) == "true"
+
+# Sometimes when running tests in parallel, the CUDA.jl package is not loaded correctly.
+# This function is a failsafe to re-load CUDA.jl using the suggested cache compilation from
+# https://github.com/JuliaGPU/CUDA.jl/blob/a085bbb3d7856dfa929e6cdae04a146a259a2044/src/initialization.jl#L105
+# To make sure Julia restarts, an error is thrown.
+function reset_cuda_if_necessary()
+
+    # Do nothing if we are on the CPU
+    if child_arch isa CPU
+        return
+    end
+
+    try
+        c = CUDA.zeros(10) # This will fail if CUDA is not available
+    catch err

-function test_architectures()
-    child_arch = test_child_arch()
+        # Avoid race conditions and precompile on rank 0 only
+        if MPI.Comm_rank(MPI.COMM_WORLD) == 0
+            pkg = Base.PkgId(Base.UUID("76a88914-d11a-5bdc-97e0-2f5a05c973a2"), "CUDA_Runtime_jll")
+            Base.compilecache(pkg)
+            @info "CUDA.jl was not correctly loaded. Re-loading CUDA.jl and re-starting Julia."
+        end
+        MPI.Barrier(MPI.COMM_WORLD)
+
+        # re-start Julia and re-load CUDA.jl
+        throw(err)
+    end
+end
+
+function test_architectures()

     # If MPI is initialized with MPI.Comm_size > 0, we are running in parallel.
     # We test several different configurations: `Partition(x = 4)`, `Partition(y = 4)`,
     # `Partition(x = 2, y = 2)`, and different fractional subdivisions in x, y and xy
-    if MPI.Initialized() && MPI.Comm_size(MPI.COMM_WORLD) == 4
-        return (Distributed(child_arch; partition = Partition(4)),
-                Distributed(child_arch; partition = Partition(1, 4)),
-                Distributed(child_arch; partition = Partition(2, 2)),
-                Distributed(child_arch; partition = Partition(x = Fractional(1, 2, 3, 4))),
-                Distributed(child_arch; partition = Partition(y = Fractional(1, 2, 3, 4))),
-                Distributed(child_arch; partition = Partition(x = Fractional(1, 2), y = Equal())))
+    if mpi_test
+        if MPI.Initialized() && MPI.Comm_size(MPI.COMM_WORLD) == 4
+            return (Distributed(child_arch; partition = Partition(4)),
+                    Distributed(child_arch; partition = Partition(1, 4)),
+                    Distributed(child_arch; partition = Partition(2, 2)),
+                    Distributed(child_arch; partition = Partition(x = Fractional(1, 2, 3, 4))),
+                    Distributed(child_arch; partition = Partition(y = Fractional(1, 2, 3, 4))),
+                    Distributed(child_arch; partition = Partition(x = Fractional(1, 2), y = Equal())))
+        else
+            return throw("The MPI partitioning is not correctly configured.")
+        end
     else
         return tuple(child_arch)
     end
@@ -26,15 +60,17 @@ end

 # For nonhydrostatic simulations we cannot use `Fractional` at the moment (requirements
 # for the tranpose are more stringent than for hydrostatic simulations).
 function nonhydrostatic_regression_test_architectures()
-    child_arch = test_child_arch()
-
     # If MPI is initialized with MPI.Comm_size > 0, we are running in parallel.
     # We test 3 different configurations: `Partition(x = 4)`, `Partition(y = 4)`
     # and `Partition(x = 2, y = 2)`
-    if MPI.Initialized() && MPI.Comm_size(MPI.COMM_WORLD) == 4
-        return (Distributed(child_arch; partition = Partition(4)),
-                Distributed(child_arch; partition = Partition(1, 4)),
-                Distributed(child_arch; partition = Partition(2, 2)))
+    if mpi_test
+        if MPI.Initialized() && MPI.Comm_size(MPI.COMM_WORLD) == 4
+            return (Distributed(child_arch; partition = Partition(4)),
+                    Distributed(child_arch; partition = Partition(1, 4)),
+                    Distributed(child_arch; partition = Partition(2, 2)))
+        else
+            return throw("The MPI partitioning is not correctly configured.")
+        end
     else
         return tuple(child_arch)
     end
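
For reference, the architecture selection driven by the new GPU_TEST and MPI_TEST variables can be sketched in isolation roughly as follows (a minimal standalone sketch, not part of the patch; the two struct definitions are hypothetical stand-ins for the Oceananigans CPU/GPU architecture types):

    # Minimal sketch of the env-var-driven configuration used by the test suite above.
    # `SketchCPU`/`SketchGPU` stand in for Oceananigans.CPU/Oceananigans.GPU.
    struct SketchCPU end
    struct SketchGPU end

    child_arch() = get(ENV, "GPU_TEST", nothing) == "true" ? SketchGPU() : SketchCPU()
    mpi_test()   = get(ENV, "MPI_TEST",  nothing) == "true"

    # Example: a CPU-only MPI job exports MPI_TEST=true and leaves GPU_TEST unset,
    # so child_arch() returns SketchCPU() and mpi_test() returns true.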
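
The reset_cuda_if_necessary failsafe works together with the Buildkite retry blocks added above: it rebuilds a precompile cache and rethrows, so the step fails once and is retried in a fresh Julia session. The generic pattern, with hypothetical probe/rebuild_cache callbacks standing in for the CUDA- and MPI-specific calls, looks roughly like this:

    # Probe a resource once; on failure, rebuild some cache and rethrow so an outer
    # retry mechanism (here, Buildkite's `retry: automatic:`) restarts the session.
    function ensure_working_or_retry(probe, rebuild_cache)
        try
            probe()
        catch
            rebuild_cache()
            @info "Probe failed; cache rebuilt. Rethrowing so the job is retried in a fresh session."
            rethrow()
        end
        return nothing
    end

    # Usage sketch (the probe always succeeds here, so nothing is rethrown):
    ensure_working_or_retry(() -> 1 + 1, () -> nothing)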
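
As a usage note (not part of the patch), a CPU run of the "distributed" test group could presumably be reproduced locally along these lines, assuming MPI.jl's mpiexecjl launcher is installed and the MPI implementation propagates the environment to the ranks:

    # Hypothetical local reproduction of the CPU "distributed" group on 4 ranks.
    withenv("TEST_GROUP" => "distributed", "MPI_TEST" => "true") do
        run(`mpiexecjl -n 4 julia -O0 --project -e 'using Pkg; Pkg.test()'`)
    end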