some failsafe option
simone-silvestri committed Nov 7, 2024
1 parent 08949b3 commit 908b31a
Showing 3 changed files with 18 additions and 5 deletions.
src/Fields/set!.jl — 9 changes: 5 additions & 4 deletions

```diff
@@ -45,15 +45,16 @@ end
 
 function set_to_function!(u, f)
     # Supports serial and distributed
-    arch = child_architecture(u)
+    arch = architecture(u)
+    child_arch = child_architecture(u)
 
     # Determine cpu_grid and cpu_u
-    if arch isa GPU
+    if child_arch isa GPU
         cpu_arch = cpu_architecture(arch)
         cpu_grid = on_architecture(cpu_arch, u.grid)
         cpu_u = Field(location(u), cpu_grid; indices = indices(u))
 
-    elseif arch isa CPU
+    elseif child_arch isa CPU
         cpu_grid = u.grid
         cpu_u = u
     end
@@ -83,7 +84,7 @@ function set_to_function!(u, f)
     end
 
     # Transfer data to GPU if u is on the GPU
-    arch isa GPU && set!(u, cpu_u)
+    child_arch isa GPU && set!(u, cpu_u)
 
     return u
 end
```
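The point of the change: on a multi-GPU run the top-level architecture is a `Distributed` wrapper, so `arch isa GPU` is false even though each rank's data lives on a GPU. A minimal sketch of the distinction, using simplified stand-ins rather than Oceananigans' actual type definitions:

```julia
# Simplified stand-ins for Oceananigans' architecture types (illustrative only).
abstract type AbstractArchitecture end
struct CPU <: AbstractArchitecture end
struct GPU <: AbstractArchitecture end

# A distributed architecture wraps the per-rank "child" architecture.
struct Distributed{C} <: AbstractArchitecture
    child_architecture :: C
end

child_architecture(arch::Distributed) = arch.child_architecture
child_architecture(arch::AbstractArchitecture) = arch  # serial fallback

arch = Distributed(GPU())
arch isa GPU                      # false: the wrapper hides the device type
child_architecture(arch) isa GPU  # true: the check set_to_function! actually needs
```

Keeping both bindings lets `cpu_architecture(arch)` see the full distributed layout while the `isa` branches query the device each rank actually uses.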
test/runtests.jl — 4 changes: 4 additions & 0 deletions

```diff
@@ -172,20 +172,23 @@ CUDA.allowscalar() do
 
     if group == :distributed || group == :all
         MPI.Initialized() || MPI.Init()
+        # In case CUDA is not found, we reset CUDA and restart the julia session
        reset_cuda_if_necessary()
         archs = test_architectures()
         include("test_distributed_models.jl")
     end
 
     if group == :distributed_solvers || group == :all
         MPI.Initialized() || MPI.Init()
+        # In case CUDA is not found, we reset CUDA and restart the julia session
         reset_cuda_if_necessary()
         include("test_distributed_transpose.jl")
         include("test_distributed_poisson_solvers.jl")
     end
 
     if group == :distributed_hydrostatic_model || group == :all
         MPI.Initialized() || MPI.Init()
+        # In case CUDA is not found, we reset CUDA and restart the julia session
         reset_cuda_if_necessary()
         archs = test_architectures()
         include("test_hydrostatic_regression.jl")
@@ -194,6 +197,7 @@
 
     if group == :distributed_nonhydrostatic_regression || group == :all
         MPI.Initialized() || MPI.Init()
+        # In case CUDA is not found, we reset CUDA and restart the julia session
         reset_cuda_if_necessary()
         archs = nonhydrostatic_regression_test_architectures()
         include("test_nonhydrostatic_regression.jl")
```
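All four groups now share the same preamble: initialize MPI once, then run the CUDA failsafe before including any test file. A condensed sketch of the pattern; the `TEST_GROUP` selection line and the `include` of the utilities file are assumptions about surrounding code this diff does not show:

```julia
using MPI

# `reset_cuda_if_necessary` lives in test/utils_for_runtests.jl (see below);
# the include path is assumed here.
include("utils_for_runtests.jl")

group = Symbol(get(ENV, "TEST_GROUP", "all"))  # assumed: how `group` is selected

if group == :distributed || group == :all
    # Idempotent guard: safe even if an earlier group already initialized MPI.
    MPI.Initialized() || MPI.Init()

    # In case CUDA is not found, rebuild the cache and restart the Julia
    # session before any distributed test file is included.
    reset_cuda_if_necessary()

    include("test_distributed_models.jl")
end
```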
test/utils_for_runtests.jl — 10 changes: 9 additions & 1 deletion

```diff
@@ -8,6 +8,10 @@ import Oceananigans.Fields: interior
 child_arch = get(ENV, "GPU_TEST", nothing) == "true" ? GPU() : CPU()
 mpi_test = get(ENV, "MPI_TEST", nothing) == "true"
 
+# Sometimes when running tests in parallel, the CUDA.jl package is not loaded correctly.
+# This function is a failsafe to re-load CUDA.jl using the suggested cach compilation from
+# https://github.com/JuliaGPU/CUDA.jl/blob/a085bbb3d7856dfa929e6cdae04a146a259a2044/src/initialization.jl#L105
+# To make sure Julia restarts, an error is thrown.
 function reset_cuda_if_necessary()
 
     # Do nothing if we are on the CPU
@@ -18,12 +22,16 @@ function reset_cuda_if_necessary()
     try
         c = CUDA.zeros(10) # This will fail if CUDA is not available
     catch err
 
+        # Avoid race conditions and precompile on rank 0 only
         if MPI.Comm_rank(MPI.COMM_WORLD) == 0
             pkg = Base.PkgId(Base.UUID("76a88914-d11a-5bdc-97e0-2f5a05c973a2"), "CUDA_Runtime_jll")
             Base.compilecache(pkg)
-            @info "CUDA.jl was not loaded. Re-loading CUDA.jl and re-starting Julia."
+            @info "CUDA.jl was not correctly loaded. Re-loading CUDA.jl and re-starting Julia."
         end
 
+        MPI.Barrier(MPI.COMM_WORLD)
+
+        # re-start Julia and re-load CUDA.jl
         throw(err)
     end
```
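Pieced together from the two hunks, the failsafe reads roughly as follows. This is a sketch: the diff elides the CPU early return, so the `child_arch isa GPU || return nothing` line is an assumption inferred from the "Do nothing if we are on the CPU" comment and the file-level `child_arch` variable:

```julia
using MPI, CUDA
using Oceananigans  # exports CPU and GPU

child_arch = get(ENV, "GPU_TEST", nothing) == "true" ? GPU() : CPU()

function reset_cuda_if_necessary()
    # Do nothing if we are on the CPU (assumed form of the elided early return)
    child_arch isa GPU || return nothing

    try
        c = CUDA.zeros(10) # This will fail if CUDA is not available
    catch err
        # Avoid race conditions: only rank 0 rebuilds the compile cache.
        if MPI.Comm_rank(MPI.COMM_WORLD) == 0
            pkg = Base.PkgId(Base.UUID("76a88914-d11a-5bdc-97e0-2f5a05c973a2"), "CUDA_Runtime_jll")
            Base.compilecache(pkg)
            @info "CUDA.jl was not correctly loaded. Re-loading CUDA.jl and re-starting Julia."
        end

        # Every rank waits here, so none aborts before the cache is rebuilt.
        MPI.Barrier(MPI.COMM_WORLD)

        # Re-throw so the Julia session dies and is restarted, picking up the
        # freshly compiled CUDA_Runtime_jll cache on the way back up.
        throw(err)
    end
end
```

The barrier is the load-bearing piece: without it, a rank other than 0 could throw and tear the job down while rank 0 is still rebuilding the cache.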
