From ab9d100627ad3630234293fcd97787ba1789ddd7 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Thu, 9 Feb 2023 09:03:13 -0500
Subject: [PATCH 001/530] Update compat: AMGX 0.2 and CUDA 4

---
 Project.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Project.toml b/Project.toml
index eacf165da9..8acb2ccb74 100644
--- a/Project.toml
+++ b/Project.toml
@@ -39,10 +39,10 @@ StructArrays = "09ab397b-f2b6-538f-b94a-2f83cf4a842a"
 Tullio = "bc48ee85-29a4-5162-ae0b-a64e1601d4bc"
 
 [compat]
-AMGX = "0.1.3"
+AMGX = "0.2"
 Adapt = "3"
 AlgebraicMultigrid = "0.5"
-CUDA = "3.8, 3.9"
+CUDA = "3.8, 3.9, 4"
 CUDAKernels = "0.4.7"
 Crayons = "4"
 CubedSphere = "0.1, 0.2"

From bc21b18d4f21d23bfff4ed761b14ba893f4bca79 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Thu, 9 Feb 2023 09:04:24 -0500
Subject: [PATCH 002/530] Allow AMGX 0.1.3

---
 Manifest.toml | 58 ++++++++++++++++++++++++++++++++-------------------
 Project.toml  |  2 +-
 2 files changed, 37 insertions(+), 23 deletions(-)

diff --git a/Manifest.toml b/Manifest.toml
index 214c0faf27..393b3767f6 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -2,19 +2,21 @@
 
 julia_version = "1.8.5"
 manifest_format = "2.0"
-project_hash = "96eebad427fe7f986c00777f7738aa636a9e621d"
+project_hash = "0cdf622045fd02ec8eed7a1cf68cf7b3abda3372"
 
 [[deps.AMGX]]
 deps = ["AMGX_jll", "CEnum", "CUDA", "JSON", "Libdl", "SparseArrays"]
-git-tree-sha1 = "5f2e38391ea8788c69e3dc2aa297b4bef98c2ac8"
+git-tree-sha1 = "61d4adc6c35e4626048fe7c353ddd27843ee6f75"
+repo-rev = "vc/2.3"
+repo-url = "https://github.com/JuliaGPU/AMGX.jl.git"
 uuid = "c963dde9-0319-47f5-bf0c-b07d3c80ffa6"
-version = "0.1.4"
+version = "0.2.0"
 
 [[deps.AMGX_jll]]
-deps = ["Artifacts", "CUDA_jll", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "c846a105d1bfabc86f6302d747cc48acbc7bb489"
+deps = ["Artifacts", "CUDA_Runtime_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg", "TOML"]
+git-tree-sha1 = "9a9e64c4d2acee7b89286985eaa7489ac3e97328"
 uuid = "656d14af-56e4-5275-8e68-4e861d7b5043"
-version = "2.1.0+0"
+version = "2.3.0+1"
 
 [[deps.AbstractFFTs]]
 deps = ["ChainRulesCore", "LinearAlgebra"]
@@ -61,9 +63,9 @@ version = "0.1.0"
 
 [[deps.BFloat16s]]
 deps = ["LinearAlgebra", "Printf", "Random", "Test"]
-git-tree-sha1 = "a598ecb0d717092b5539dbbe890c98bac842b072"
+git-tree-sha1 = "dbf84058d0a8cbbadee18d25cf606934b22d7c66"
 uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
-version = "0.2.0"
+version = "0.4.2"
 
 [[deps.Base64]]
 uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
@@ -80,10 +82,10 @@ uuid = "179af706-886a-5703-950a-314cd64e0468"
 version = "0.1.2"
 
 [[deps.CUDA]]
-deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"]
-git-tree-sha1 = "6717cb9a3425ebb7b31ca4f832823615d175f64a"
+deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Preferences", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions"]
+git-tree-sha1 = "edff14c60784c8f7191a62a23b15a421185bc8a8"
 uuid = "052768ef-5323-5732-b1bb-66c8b64840ba"
-version = "3.13.1"
+version = "4.0.1"
 
 [[deps.CUDAKernels]]
 deps = ["Adapt", "CUDA", "KernelAbstractions", "StaticArrays", "UnsafeAtomicsLLVM"]
@@ -91,11 +93,23 @@ git-tree-sha1 = "1680366a69e9c95744ef23a239e6cfe61cf2e1ca"
 uuid = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57"
 version = "0.4.7"
 
-[[deps.CUDA_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "108e3ee33d8614b96c2ea43a621b0e8396d8a273"
-uuid = "e9e359dc-d701-5aa8-82ae-09bbf812ea83"
-version = "10.0.130+3"
+[[deps.CUDA_Driver_jll]]
+deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"]
+git-tree-sha1 = "75d7896d1ec079ef10d3aee8f3668c11354c03a1"
+uuid = "4ee394cb-3365-5eb0-8335-949819d2adfc"
+version = "0.2.0+0"
+
+[[deps.CUDA_Runtime_Discovery]]
+deps = ["Libdl"]
+git-tree-sha1 = "58dd8ec29f54f08c04b052d2c2fa6760b4f4b3a4"
+uuid = "1af6417a-86b4-443c-805f-a4643ffb695f"
+version = "0.1.1"
+
+[[deps.CUDA_Runtime_jll]]
+deps = ["Artifacts", "CUDA_Driver_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg", "TOML"]
+git-tree-sha1 = "d3e6ccd30f84936c1a3a53d622d85d7d3f9b9486"
+uuid = "76a88914-d11a-5bdc-97e0-2f5a05c973a2"
+version = "0.2.3+2"
 
 [[deps.ChainRulesCore]]
 deps = ["Compat", "LinearAlgebra", "SparseArrays"]
@@ -312,9 +326,9 @@ version = "0.8.6"
 
 [[deps.LLVM]]
 deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
-git-tree-sha1 = "b8ae281340f0d3e973aae7b96fb7502b0119b376"
+git-tree-sha1 = "df115c31f5c163697eede495918d8e85045c8f04"
 uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
-version = "4.15.0"
+version = "4.16.0"
 
 [[deps.LLVMExtra_jll]]
 deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg", "TOML"]
@@ -471,9 +485,9 @@ version = "4.1.3+3"
 
 [[deps.OpenSSL_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "f6e9dba33f9f2c44e08a020b0caf6903be540004"
+git-tree-sha1 = "9ff31d101d987eb9d66bd8b176ac7c277beccd09"
 uuid = "458c3c95-2e84-50aa-8efc-19380b2a3a95"
-version = "1.1.19+0"
+version = "1.1.20+0"
 
 [[deps.OpenSpecFun_jll]]
 deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"]
@@ -488,9 +502,9 @@ version = "1.4.1"
 
 [[deps.Parsers]]
 deps = ["Dates", "SnoopPrecompile"]
-git-tree-sha1 = "151d91d63d8d6c1a5789ecb7de51547e00480f1b"
+git-tree-sha1 = "18f84637e00b72ba6769034a4b50d79ee40c84a9"
 uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
-version = "2.5.4"
+version = "2.5.5"
 
 [[deps.PencilArrays]]
 deps = ["Adapt", "ArrayInterface", "JSON3", "LinearAlgebra", "MPI", "OffsetArrays", "Random", "Reexport", "Requires", "StaticArrays", "StaticPermutations", "Strided", "TimerOutputs", "VersionParsing"]
diff --git a/Project.toml b/Project.toml
index 8acb2ccb74..d84f68048d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -39,7 +39,7 @@ StructArrays = "09ab397b-f2b6-538f-b94a-2f83cf4a842a"
 Tullio = "bc48ee85-29a4-5162-ae0b-a64e1601d4bc"
 
 [compat]
-AMGX = "0.2"
+AMGX = "0.1.3, 0.2"
 Adapt = "3"
 AlgebraicMultigrid = "0.5"
 CUDA = "3.8, 3.9, 4"

From 4db4001273c9afad632b081c13139cb28e340cf7 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Thu, 9 Feb 2023 09:08:31 -0500
Subject: [PATCH 003/530] Update AMGX branch

---
 Manifest.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Manifest.toml b/Manifest.toml
index 393b3767f6..c97a48e06f 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -6,7 +6,7 @@ project_hash = "0cdf622045fd02ec8eed7a1cf68cf7b3abda3372"
 
 [[deps.AMGX]]
 deps = ["AMGX_jll", "CEnum", "CUDA", "JSON", "Libdl", "SparseArrays"]
-git-tree-sha1 = "61d4adc6c35e4626048fe7c353ddd27843ee6f75"
+git-tree-sha1 = "e837274ddd2c98d197a5079de76c52bd86c89b1b"
 repo-rev = "vc/2.3"
 repo-url = "https://github.com/JuliaGPU/AMGX.jl.git"
 uuid = "c963dde9-0319-47f5-bf0c-b07d3c80ffa6"

From 9fa054c4dc9f6aa5013736612e020b605b73531b Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Mon, 13 Feb 2023 12:01:01 -0500
Subject: [PATCH 004/530] Remove use of the KernelAbstractions event system

---
 Manifest.toml                                 | 84 ++++++++++---------
 Project.toml                                  |  4 +-
 src/AbstractOperations/computed_field.jl      |  3 +-
 src/Architectures.jl                          |  6 +-
 src/BoundaryConditions/BoundaryConditions.jl  |  2 +-
 src/BoundaryConditions/apply_flux_bcs.jl      | 24 +++---
 src/BoundaryConditions/fill_halo_regions.jl   | 10 +--
 .../fill_halo_regions_nothing.jl              |  8 +-
 .../fill_halo_regions_periodic.jl             | 18 ++--
 src/CubedSpheres/CubedSpheres.jl              |  9 +-
 .../cubed_sphere_kernel_launching.jl          | 11 +--
 src/Diagnostics/cfl.jl                        |  2 +-
 src/Distributed/halo_communication.jl         |  6 +-
 src/Fields/set!.jl                            |  1 -
 .../update_particle_properties.jl             | 15 +---
 .../HydrostaticFreeSurfaceModels.jl           |  2 +-
 ...ate_hydrostatic_free_surface_tendencies.jl | 61 +++++---------
 .../explicit_free_surface.jl                  |  8 +-
 .../implicit_free_surface.jl                  | 10 +--
 .../prescribed_hydrostatic_velocity_fields.jl |  4 +-
 .../split_explicit_free_surface_kernels.jl    |  4 +-
 ...ore_hydrostatic_free_surface_tendencies.jl | 22 ++---
 ...te_hydrostatic_free_surface_model_state.jl | 10 ++-
 .../NonhydrostaticModels.jl                   |  2 +-
 .../calculate_nonhydrostatic_tendencies.jl    | 50 +++++------
 .../pressure_correction.jl                    |  3 +-
 .../set_nonhydrostatic_model.jl               |  3 +-
 .../update_nonhydrostatic_model_state.jl      |  4 +-
 .../ShallowWaterModels/ShallowWaterModels.jl  |  2 +-
 .../calculate_shallow_water_tendencies.jl     | 33 ++------
 .../rk3_substep_shallow_water_model.jl        | 11 +--
 .../store_shallow_water_tendencies.jl         | 17 +---
 .../update_shallow_water_state.jl             | 11 +--
 src/TimeSteppers/quasi_adams_bashforth_2.jl   | 15 +---
 src/TimeSteppers/runge_kutta_3.jl             | 16 +---
 src/TimeSteppers/store_tendencies.jl          | 19 +----
 .../anisotropic_minimum_dissipation.jl        | 10 +--
 src/Utils/kernel_launching.jl                 | 12 +--
 38 files changed, 193 insertions(+), 339 deletions(-)

diff --git a/Manifest.toml b/Manifest.toml
index c97a48e06f..e02c5b922a 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -1,8 +1,8 @@
 # This file is machine-generated - editing it directly is not advised
 
-julia_version = "1.8.5"
+julia_version = "1.9.0-beta4"
 manifest_format = "2.0"
-project_hash = "0cdf622045fd02ec8eed7a1cf68cf7b3abda3372"
+project_hash = "e5c066cd371cc92d479d4d0c34bc89f3323ab6b3"
 
 [[deps.AMGX]]
 deps = ["AMGX_jll", "CEnum", "CUDA", "JSON", "Libdl", "SparseArrays"]
@@ -82,17 +82,13 @@ uuid = "179af706-886a-5703-950a-314cd64e0468"
 version = "0.1.2"
 
 [[deps.CUDA]]
-deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Preferences", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions"]
-git-tree-sha1 = "edff14c60784c8f7191a62a23b15a421185bc8a8"
+deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Preferences", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions"]
+git-tree-sha1 = "ece93002780a9ea0312afa7bd3b8fed37a899aee"
+repo-rev = "vc/ka_transition"
+repo-url = "https://github.com/JuliaGPU/CUDA.jl.git"
 uuid = "052768ef-5323-5732-b1bb-66c8b64840ba"
 version = "4.0.1"
 
-[[deps.CUDAKernels]]
-deps = ["Adapt", "CUDA", "KernelAbstractions", "StaticArrays", "UnsafeAtomicsLLVM"]
-git-tree-sha1 = "1680366a69e9c95744ef23a239e6cfe61cf2e1ca"
-uuid = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57"
-version = "0.4.7"
-
 [[deps.CUDA_Driver_jll]]
 deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"]
 git-tree-sha1 = "75d7896d1ec079ef10d3aee8f3668c11354c03a1"
@@ -117,12 +113,6 @@ git-tree-sha1 = "c6d890a52d2c4d55d326439580c3b8d0875a77d9"
 uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 version = "1.15.7"
 
-[[deps.ChangesOfVariables]]
-deps = ["ChainRulesCore", "LinearAlgebra", "Test"]
-git-tree-sha1 = "844b061c104c408b24537482469400af6075aae4"
-uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
-version = "0.1.5"
-
 [[deps.CommonSolve]]
 git-tree-sha1 = "9441451ee712d1aec22edad62db1a9af3dc8d852"
 uuid = "38540f10-b2f7-11e9-35d8-d573e4eb0ff2"
@@ -137,7 +127,7 @@ version = "4.6.0"
 [[deps.CompilerSupportLibraries_jll]]
 deps = ["Artifacts", "Libdl"]
 uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
-version = "1.0.1+0"
+version = "1.0.2+0"
 
 [[deps.Crayons]]
 git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15"
@@ -272,12 +262,6 @@ version = "2018.0.3+2"
 deps = ["Markdown"]
 uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 
-[[deps.InverseFunctions]]
-deps = ["Test"]
-git-tree-sha1 = "49510dfcb407e572524ba94aeae2fced1f3feb0f"
-uuid = "3587e190-3f89-42d0-90ee-14403ec27112"
-version = "0.1.8"
-
 [[deps.IrrationalConstants]]
 git-tree-sha1 = "7fd44fd4ff43fc60815f8e764c0f352b83c49151"
 uuid = "92d709cd-6900-40b7-9082-c6be49f344b6"
@@ -320,9 +304,11 @@ version = "1.12.0"
 
 [[deps.KernelAbstractions]]
 deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"]
-git-tree-sha1 = "cf9cae1c4c1ff83f6c02cfaf01698f05448e8325"
+git-tree-sha1 = "2f2f329569f0b627dbaf8e144af1fb36b660ad49"
+repo-rev = "vc/nix_dependencies"
+repo-url = "https://github.com/JuliaGPU/KernelAbstractions.jl.git"
 uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
-version = "0.8.6"
+version = "0.9.0"
 
 [[deps.LLVM]]
 deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
@@ -369,15 +355,25 @@ uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531"
 version = "1.16.1+2"
 
 [[deps.LinearAlgebra]]
-deps = ["Libdl", "libblastrampoline_jll"]
+deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"]
 uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 
 [[deps.LogExpFunctions]]
-deps = ["ChainRulesCore", "ChangesOfVariables", "DocStringExtensions", "InverseFunctions", "IrrationalConstants", "LinearAlgebra"]
+deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"]
 git-tree-sha1 = "680e733c3a0a9cea9e935c8c2184aea6a63fa0b5"
 uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
 version = "0.3.21"
 
+    [deps.LogExpFunctions.extensions]
+    ChainRulesCoreExt = "ChainRulesCore"
+    ChangesOfVariablesExt = "ChangesOfVariables"
+    InverseFunctionsExt = "InverseFunctions"
+
+    [deps.LogExpFunctions.weakdeps]
+    ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
+    ChangesOfVariables = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
+    InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112"
+
 [[deps.Logging]]
 uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
 
@@ -437,7 +433,7 @@ uuid = "a63ad114-7e13-5084-954f-fe012c677804"
 
 [[deps.MozillaCACerts_jll]]
 uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
-version = "2022.2.1"
+version = "2022.10.11"
 
 [[deps.NCDatasets]]
 deps = ["CFTime", "DataStructures", "Dates", "NetCDF_jll", "NetworkOptions", "Printf"]
@@ -470,7 +466,7 @@ version = "1.12.9"
 [[deps.OpenBLAS_jll]]
 deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"]
 uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
-version = "0.3.20+0"
+version = "0.3.21+0"
 
 [[deps.OpenLibm_jll]]
 deps = ["Artifacts", "Libdl"]
@@ -502,9 +498,9 @@ version = "1.4.1"
 
 [[deps.Parsers]]
 deps = ["Dates", "SnoopPrecompile"]
-git-tree-sha1 = "18f84637e00b72ba6769034a4b50d79ee40c84a9"
+git-tree-sha1 = "946b56b2135c6c10bbb93efad8a78b699b6383ab"
 uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
-version = "2.5.5"
+version = "2.5.6"
 
 [[deps.PencilArrays]]
 deps = ["Adapt", "ArrayInterface", "JSON3", "LinearAlgebra", "MPI", "OffsetArrays", "Random", "Reexport", "Requires", "StaticArrays", "StaticPermutations", "Strided", "TimerOutputs", "VersionParsing"]
@@ -519,9 +515,9 @@ uuid = "4a48f351-57a6-4416-9ec4-c37015456aae"
 version = "0.14.2"
 
 [[deps.Pkg]]
-deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
+deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
 uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
-version = "1.8.0"
+version = "1.9.0"
 
 [[deps.Preferences]]
 deps = ["TOML"]
@@ -616,7 +612,7 @@ version = "1.0.3"
 uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
 
 [[deps.SparseArrays]]
-deps = ["LinearAlgebra", "Random"]
+deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"]
 uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 
 [[deps.SpecialFunctions]]
@@ -633,9 +629,9 @@ version = "0.8.3"
 
 [[deps.StaticArrays]]
 deps = ["LinearAlgebra", "Random", "StaticArraysCore", "Statistics"]
-git-tree-sha1 = "6954a456979f23d05085727adb17c4551c19ecd1"
+git-tree-sha1 = "cee507162ecbb677450f20058ca83bd559b6b752"
 uuid = "90137ffa-7385-5640-81b9-e52037218182"
-version = "1.5.12"
+version = "1.5.14"
 
 [[deps.StaticArraysCore]]
 git-tree-sha1 = "6b7ba252635a5eff6a0b0664a41ee140a1c9e72a"
@@ -650,6 +646,7 @@ version = "0.3.0"
 [[deps.Statistics]]
 deps = ["LinearAlgebra", "SparseArrays"]
 uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+version = "1.9.0"
 
 [[deps.Strided]]
 deps = ["LinearAlgebra", "TupleTools"]
@@ -673,10 +670,15 @@ version = "1.10.0"
 deps = ["Libdl", "LinearAlgebra", "Serialization", "SparseArrays"]
 uuid = "4607b0f0-06f3-5cda-b6b1-a6196a1729e9"
 
+[[deps.SuiteSparse_jll]]
+deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"]
+uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c"
+version = "5.10.1+6"
+
 [[deps.TOML]]
 deps = ["Dates"]
 uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
-version = "1.0.0"
+version = "1.0.3"
 
 [[deps.TableTraits]]
 deps = ["IteratorInterfaceExtensions"]
@@ -693,7 +695,7 @@ version = "1.10.0"
 [[deps.Tar]]
 deps = ["ArgTools", "SHA"]
 uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
-version = "1.10.1"
+version = "1.10.0"
 
 [[deps.TaylorSeries]]
 deps = ["LinearAlgebra", "Markdown", "Requires", "SparseArrays"]
@@ -760,12 +762,12 @@ version = "2.10.3+0"
 [[deps.Zlib_jll]]
 deps = ["Libdl"]
 uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
-version = "1.2.12+3"
+version = "1.2.13+0"
 
 [[deps.libblastrampoline_jll]]
-deps = ["Artifacts", "Libdl", "OpenBLAS_jll"]
+deps = ["Artifacts", "Libdl"]
 uuid = "8e850b90-86db-534c-a0d3-1478176c7d93"
-version = "5.1.1+0"
+version = "5.4.0+0"
 
 [[deps.nghttp2_jll]]
 deps = ["Artifacts", "Libdl"]
diff --git a/Project.toml b/Project.toml
index d84f68048d..e2cf81bbc9 100644
--- a/Project.toml
+++ b/Project.toml
@@ -8,7 +8,6 @@ AMGX = "c963dde9-0319-47f5-bf0c-b07d3c80ffa6"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 AlgebraicMultigrid = "2169fc97-5a83-5252-b627-83903c6c433c"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
-CUDAKernels = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57"
 Crayons = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
 CubedSphere = "7445602f-e544-4518-8976-18f8e8ae6cdb"
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
@@ -43,7 +42,6 @@ AMGX = "0.1.3, 0.2"
 Adapt = "3"
 AlgebraicMultigrid = "0.5"
 CUDA = "3.8, 3.9, 4"
-CUDAKernels = "0.4.7"
 Crayons = "4"
 CubedSphere = "0.1, 0.2"
 DocStringExtensions = "0.8, 0.9"
@@ -52,7 +50,7 @@ Glob = "1.3"
 IncompleteLU = "0.2"
 IterativeSolvers = "0.9"
 JLD2 = "^0.4"
-KernelAbstractions = "0.8"
+KernelAbstractions = "0.9"
 MPI = "0.16, 0.17, 0.18, 0.19, 0.20"
 NCDatasets = "0.12.10"
 OffsetArrays = "1.4"
diff --git a/src/AbstractOperations/computed_field.jl b/src/AbstractOperations/computed_field.jl
index 474a20d081..da0dd89f04 100644
--- a/src/AbstractOperations/computed_field.jl
+++ b/src/AbstractOperations/computed_field.jl
@@ -73,8 +73,7 @@ end
 
 function compute_field!(comp)
     arch = architecture(comp)
-    event = launch!(arch, comp.grid, size(comp), _compute!, comp.data, comp.operand, comp.indices)
-    wait(device(arch), event)
+    launch!(arch, comp.grid, size(comp), _compute!, comp.data, comp.operand, comp.indices)
 end
 
 """Compute an `operand` and store in `data`."""
diff --git a/src/Architectures.jl b/src/Architectures.jl
index 1b64dea9f6..2f0d9678f3 100644
--- a/src/Architectures.jl
+++ b/src/Architectures.jl
@@ -2,11 +2,11 @@ module Architectures
 
 export AbstractArchitecture, AbstractMultiArchitecture
 export CPU, GPU, MultiGPU
-export device, device_event, architecture, array_type, arch_array, unified_array, device_copy_to!
+export device, architecture, array_type, arch_array, unified_array, device_copy_to!
 
 using CUDA
 using KernelAbstractions
-using CUDAKernels
+using CUDA.CUDAKernels
 using Adapt
 using OffsetArrays
 
@@ -107,8 +107,6 @@ end
  
 @inline device_copy_to!(dst::Array, src::Array; kw...) = Base.copyto!(dst, src)
 
-device_event(arch) = Event(device(arch))
-
 @inline unsafe_free!(a::CuArray) = CUDA.unsafe_free!(a)
 @inline unsafe_free!(a)          = nothing
 
diff --git a/src/BoundaryConditions/BoundaryConditions.jl b/src/BoundaryConditions/BoundaryConditions.jl
index a571818855..1394e3178d 100644
--- a/src/BoundaryConditions/BoundaryConditions.jl
+++ b/src/BoundaryConditions/BoundaryConditions.jl
@@ -11,7 +11,7 @@ export
     fill_halo_regions!
 
 using CUDA
-using KernelAbstractions: @index, @kernel, MultiEvent, NoneEvent
+using KernelAbstractions: @index, @kernel
 
 using Oceananigans.Architectures: CPU, GPU, device
 using Oceananigans.Utils: work_layout, launch!
diff --git a/src/BoundaryConditions/apply_flux_bcs.jl b/src/BoundaryConditions/apply_flux_bcs.jl
index d12e63857d..06e7054c55 100644
--- a/src/BoundaryConditions/apply_flux_bcs.jl
+++ b/src/BoundaryConditions/apply_flux_bcs.jl
@@ -16,38 +16,38 @@ apply_z_bcs!(Gc, c, args...) = apply_z_bcs!(Gc, Gc.grid, c, c.boundary_condition
 # Shortcuts for...
 #
 # Nothing tendencies.
-apply_x_bcs!(::Nothing, args...) = NoneEvent()
-apply_y_bcs!(::Nothing, args...) = NoneEvent()
-apply_z_bcs!(::Nothing, args...) = NoneEvent()
+apply_x_bcs!(::Nothing, args...) = nothing
+apply_y_bcs!(::Nothing, args...) = nothing
+apply_z_bcs!(::Nothing, args...) = nothing
 
 # Not-flux boundary conditions
 const NotFluxBC = Union{PBC, CBC, VBC, GBC, OBC, ZFBC, Nothing}
 
-apply_x_bcs!(Gc, ::AbstractGrid, c, ::NotFluxBC, ::NotFluxBC, ::AbstractArchitecture, args...) = NoneEvent()
-apply_y_bcs!(Gc, ::AbstractGrid, c, ::NotFluxBC, ::NotFluxBC, ::AbstractArchitecture, args...) = NoneEvent()
-apply_z_bcs!(Gc, ::AbstractGrid, c, ::NotFluxBC, ::NotFluxBC, ::AbstractArchitecture, args...) = NoneEvent()
+apply_x_bcs!(Gc, ::AbstractGrid, c, ::NotFluxBC, ::NotFluxBC, ::AbstractArchitecture, args...) = nothing
+apply_y_bcs!(Gc, ::AbstractGrid, c, ::NotFluxBC, ::NotFluxBC, ::AbstractArchitecture, args...) = nothing
+apply_z_bcs!(Gc, ::AbstractGrid, c, ::NotFluxBC, ::NotFluxBC, ::AbstractArchitecture, args...) = nothing
 
 # The real deal
 """
 Apply flux boundary conditions to a field `c` by adding the associated flux divergence to
 the source term `Gc` at the left and right.
 """
-apply_x_bcs!(Gc, grid::AbstractGrid, c, west_bc, east_bc, arch::AbstractArchitecture, dep, args...) =
-    launch!(arch, grid, :yz, _apply_x_bcs!, Gc, instantiated_location(Gc), grid, west_bc, east_bc, args..., dependencies=dep)
+apply_x_bcs!(Gc, grid::AbstractGrid, c, west_bc, east_bc, arch::AbstractArchitecture, args...) =
+    launch!(arch, grid, :yz, _apply_x_bcs!, Gc, instantiated_location(Gc), grid, west_bc, east_bc, args...)
 
 """
 Apply flux boundary conditions to a field `c` by adding the associated flux divergence to
 the source term `Gc` at the left and right.
 """
-apply_y_bcs!(Gc, grid::AbstractGrid, c, south_bc, north_bc, arch::AbstractArchitecture, dep, args...) =
-    launch!(arch, grid, :xz, _apply_y_bcs!, Gc, instantiated_location(Gc), grid, south_bc, north_bc, args..., dependencies=dep)
+apply_y_bcs!(Gc, grid::AbstractGrid, c, south_bc, north_bc, arch::AbstractArchitecture, args...) =
+    launch!(arch, grid, :xz, _apply_y_bcs!, Gc, instantiated_location(Gc), grid, south_bc, north_bc, args...)
 
 """
 Apply flux boundary conditions to a field `c` by adding the associated flux divergence to
 the source term `Gc` at the top and bottom.
 """
-apply_z_bcs!(Gc, grid::AbstractGrid, c, bottom_bc, top_bc, arch::AbstractArchitecture, dep, args...) =
-    launch!(arch, grid, :xy, _apply_z_bcs!, Gc, instantiated_location(Gc), grid, bottom_bc, top_bc, args..., dependencies=dep)
+apply_z_bcs!(Gc, grid::AbstractGrid, c, bottom_bc, top_bc, arch::AbstractArchitecture, args...) =
+    launch!(arch, grid, :xy, _apply_z_bcs!, Gc, instantiated_location(Gc), grid, bottom_bc, top_bc, args...)
 
 """
     _apply_x_bcs!(Gc, grid, west_bc, east_bc, args...)
diff --git a/src/BoundaryConditions/fill_halo_regions.jl b/src/BoundaryConditions/fill_halo_regions.jl
index 4e2d10d9b7..965b396058 100644
--- a/src/BoundaryConditions/fill_halo_regions.jl
+++ b/src/BoundaryConditions/fill_halo_regions.jl
@@ -1,6 +1,5 @@
 using OffsetArrays: OffsetArray
 using Oceananigans.Utils
-using Oceananigans.Architectures: device_event
 using Oceananigans.Grids: architecture
 using KernelAbstractions.Extras.LoopInfo: @unroll
 
@@ -39,14 +38,13 @@ function fill_halo_regions!(c::MaybeTupledData, boundary_conditions, indices, lo
     halo_tuple = permute_boundary_conditions(boundary_conditions)
    
     for task = 1:3
-        barrier = device_event(arch)
-        fill_halo_event!(task, halo_tuple, c, indices, loc, arch, barrier, grid, args...; kwargs...)
+        fill_halo_event!(task, halo_tuple, c, indices, loc, arch, grid, args...; kwargs...)
     end
 
     return nothing
 end
 
-function fill_halo_event!(task, halo_tuple, c, indices, loc, arch, barrier, grid, args...; kwargs...)
+function fill_halo_event!(task, halo_tuple, c, indices, loc, arch, grid, args...; kwargs...)
     fill_halo!  = halo_tuple[1][task]
     bc_left     = halo_tuple[2][task]
     bc_right    = halo_tuple[3][task]
@@ -55,8 +53,8 @@ function fill_halo_event!(task, halo_tuple, c, indices, loc, arch, barrier, grid
     size   = fill_halo_size(c, fill_halo!, indices, bc_left, loc, grid)
     offset = fill_halo_offset(size, fill_halo!, indices)
 
-    event  = fill_halo!(c, bc_left, bc_right, size, offset, loc, arch, barrier, grid, args...; kwargs...)
-    wait(device(arch), event)
+    fill_halo!(c, bc_left, bc_right, size, offset, loc, arch, grid, args...; kwargs...)
+    return
 end
 
 function permute_boundary_conditions(boundary_conditions)
diff --git a/src/BoundaryConditions/fill_halo_regions_nothing.jl b/src/BoundaryConditions/fill_halo_regions_nothing.jl
index 533e1c6e46..448ccb2b98 100644
--- a/src/BoundaryConditions/fill_halo_regions_nothing.jl
+++ b/src/BoundaryConditions/fill_halo_regions_nothing.jl
@@ -2,15 +2,15 @@
 ##### Nothing happens when your boundary condition is nothing
 #####
 
-fill_west_and_east_halo!(c,  ::Nothing, ::Nothing, args...; kwargs...) = NoneEvent()
-fill_south_and_north_halo!(c,::Nothing, ::Nothing, args...; kwargs...) = NoneEvent()
-fill_bottom_and_top_halo!(c, ::Nothing, ::Nothing, args...; kwargs...) = NoneEvent()
+fill_west_and_east_halo!(c,  ::Nothing, ::Nothing, args...; kwargs...) = nothing 
+fill_south_and_north_halo!(c,::Nothing, ::Nothing, args...; kwargs...) = nothing
+fill_bottom_and_top_halo!(c, ::Nothing, ::Nothing, args...; kwargs...) = nothing
 
 for dir in (:west, :east, :south, :north, :bottom, :top)
         fill_nothing! = Symbol( :fill_, dir, :_halo!)
     alt_fill_nothing! = Symbol(:_fill_, dir, :_halo!)
     @eval begin
-        @inline     $fill_nothing!(c, ::Nothing, args...;  kwargs...)         = NoneEvent()
+        @inline     $fill_nothing!(c, ::Nothing, args...;  kwargs...)         = nothing
         @inline $alt_fill_nothing!(i, j, grid, c, ::Nothing, args...)         = nothing
         @inline $alt_fill_nothing!(i, j, grid, ::Nothing, ::Nothing, args...) = nothing
         @inline $alt_fill_nothing!(i, j, grid, ::Nothing, args...)            = nothing
diff --git a/src/BoundaryConditions/fill_halo_regions_periodic.jl b/src/BoundaryConditions/fill_halo_regions_periodic.jl
index 9354569998..edfa95a408 100644
--- a/src/BoundaryConditions/fill_halo_regions_periodic.jl
+++ b/src/BoundaryConditions/fill_halo_regions_periodic.jl
@@ -15,22 +15,22 @@ end
 
 @inline fix_halo_offsets(o, co) = co > 0 ? o - co : o # Windowed fields have only positive offsets to correct
 
-function fill_west_and_east_halo!(c, ::PBCT, ::PBCT, size, offset, loc, arch, dep, grid, args...; kw...)
+function fill_west_and_east_halo!(c, ::PBCT, ::PBCT, size, offset, loc, arch, grid, args...; kw...)
     c_parent, yz_size, offset = parent_size_and_offset(c, 2, 3, size, offset)
-    event = launch!(arch, grid, yz_size, fill_periodic_west_and_east_halo!, c_parent, offset, grid.Hx, grid.Nx; dependencies=dep, kw...)
-    return event
+    launch!(arch, grid, yz_size, fill_periodic_west_and_east_halo!, c_parent, offset, grid.Hx, grid.Nx; kw...)
+    return
 end
 
-function fill_south_and_north_halo!(c, ::PBCT, ::PBCT, size, offset, loc, arch, dep, grid, args...; kw...)
+function fill_south_and_north_halo!(c, ::PBCT, ::PBCT, size, offset, loc, arch, grid, args...; kw...)
     c_parent, xz_size, offset = parent_size_and_offset(c, 1, 3, size, offset)
-    event = launch!(arch, grid, xz_size, fill_periodic_south_and_north_halo!, c_parent, offset, grid.Hy, grid.Ny; dependencies=dep, kw...)
-    return event
+    launch!(arch, grid, xz_size, fill_periodic_south_and_north_halo!, c_parent, offset, grid.Hy, grid.Ny;  kw...)
+    return
 end
 
-function fill_bottom_and_top_halo!(c, ::PBCT, ::PBCT, size, offset, loc, arch, dep, grid, args...; kw...)
+function fill_bottom_and_top_halo!(c, ::PBCT, ::PBCT, size, offset, loc, arch, grid, args...; kw...)
     c_parent, xy_size, offset = parent_size_and_offset(c, 1, 2, size, offset)
-    event = launch!(arch, grid, xy_size, fill_periodic_bottom_and_top_halo!, c_parent, offset, grid.Hz, grid.Nz; dependencies=dep, kw...)
-    return event
+    launch!(arch, grid, xy_size, fill_periodic_bottom_and_top_halo!, c_parent, offset, grid.Hz, grid.Nz; kw...)
+    return
 end
 
 #####
diff --git a/src/CubedSpheres/CubedSpheres.jl b/src/CubedSpheres/CubedSpheres.jl
index af8e35ab66..7067e344fc 100644
--- a/src/CubedSpheres/CubedSpheres.jl
+++ b/src/CubedSpheres/CubedSpheres.jl
@@ -188,11 +188,10 @@ function compute!(comp::CubedSphereComputedField, time=nothing)
     compute_at!(comp.operand, time)
 
     arch = architecture(comp)
-    events = Tuple(launch!(arch, c.grid, size(c), _compute!, c.data, c.operand, c.indices)
-                   for c in faces(comp))
-
-    wait(device(arch), MultiEvent(events))
-
+    foreach(faces(comp)) do c
+        launch!(arch, c.grid, size(c), _compute!, c.data, c.operand, c.indices)
+    end
+    
     fill_halo_regions!(comp)
 
     return comp
diff --git a/src/CubedSpheres/cubed_sphere_kernel_launching.jl b/src/CubedSpheres/cubed_sphere_kernel_launching.jl
index f52b4408da..5cd5103fbd 100644
--- a/src/CubedSpheres/cubed_sphere_kernel_launching.jl
+++ b/src/CubedSpheres/cubed_sphere_kernel_launching.jl
@@ -1,5 +1,3 @@
-using KernelAbstractions: Event, MultiEvent
-
 using Oceananigans.AbstractOperations: KernelFunctionOperation
 using Oceananigans.Architectures: device
 using Oceananigans.Models.HydrostaticFreeSurfaceModels: ExplicitFreeSurface, PrescribedVelocityFields
@@ -34,17 +32,10 @@ function get_face(op::KernelFunctionOperation, face_index)
 end
 
 function launch!(arch, grid::ConformalCubedSphereGrid, dims, kernel!, args...; kwargs...)
-    events = []
-
     for (face_index, face_grid) in enumerate(grid.faces)
         face_args = Tuple(get_face(arg, face_index) for arg in args)
-        event = launch!(arch, face_grid, dims, kernel!, face_args...; kwargs...)
-        push!(events, event)
+        launch!(arch, face_grid, dims, kernel!, face_args...; kwargs...)
     end
-
-    events = filter(e -> e isa Event, events)
-
-    return MultiEvent(Tuple(events))
 end
 
 @inline launch!(arch, grid::ConformalCubedSphereGrid, ::Val{dims}, args...; kwargs...) where dims =
diff --git a/src/Diagnostics/cfl.jl b/src/Diagnostics/cfl.jl
index 0cac003061..0218987045 100644
--- a/src/Diagnostics/cfl.jl
+++ b/src/Diagnostics/cfl.jl
@@ -83,7 +83,7 @@ DiffusiveCFL(Δt) = CFL(Δt, cell_diffusion_timescale)
 ##### Accurate CFL via reduction
 #####
 
-using CUDA, CUDAKernels, KernelAbstractions, Tullio
+using CUDA, KernelAbstractions, Tullio
 
 using Oceananigans.Models
 using Oceananigans.Grids: halo_size
diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index c829e941c9..3407d21a3d 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -1,4 +1,4 @@
-using KernelAbstractions: @kernel, @index, Event, MultiEvent
+using KernelAbstractions: @kernel, @index
 using OffsetArrays: OffsetArray
 
 import Oceananigans.Fields: tupled_fill_halo_regions!
@@ -83,8 +83,8 @@ function fill_halo_regions!(c::OffsetArray, bcs, indices, loc, grid::Distributed
     # Length check needed until this PR is merged: https://github.com/JuliaParallel/MPI.jl/pull/458
     length(mpi_requests) > 0 && MPI.Waitall!(mpi_requests)
 
-    events = filter(e -> e isa Event, events_and_requests)
-    wait(device(child_architecture(arch)), MultiEvent(Tuple(events)))
+    # events = filter(e -> e isa Event, events_and_requests)
+    # # wait(device(child_architecture(arch)), MultiEvent(Tuple(events)))
 
     return nothing
 end
diff --git a/src/Fields/set!.jl b/src/Fields/set!.jl
index c114077dc2..d77b50401c 100644
--- a/src/Fields/set!.jl
+++ b/src/Fields/set!.jl
@@ -1,5 +1,4 @@
 using CUDA
-using CUDAKernels
 using KernelAbstractions: @kernel, @index
 using Adapt: adapt_structure
 
diff --git a/src/LagrangianParticleTracking/update_particle_properties.jl b/src/LagrangianParticleTracking/update_particle_properties.jl
index 83f11e50d3..d37ea525aa 100644
--- a/src/LagrangianParticleTracking/update_particle_properties.jl
+++ b/src/LagrangianParticleTracking/update_particle_properties.jl
@@ -131,8 +131,6 @@ function update_particle_properties!(lagrangian_particles, model, Δt)
 
     arch = architecture(model)
 
-    events = []
-
     for (field_name, tracked_field) in pairs(lagrangian_particles.tracked_fields)
         compute!(tracked_field)
         particle_property = getproperty(lagrangian_particles.properties, field_name)
@@ -140,14 +138,10 @@ function update_particle_properties!(lagrangian_particles, model, Δt)
 
         update_field_property_kernel! = update_field_property!(device(arch), workgroup, worksize)
 
-        update_event = update_field_property_kernel!(particle_property, lagrangian_particles.properties, model.grid,
-                                                     datatuple(tracked_field), LX(), LY(), LZ(),
-                                                     dependencies=Event(device(arch)))
-        push!(events, update_event)
+        update_field_property_kernel!(particle_property, lagrangian_particles.properties, model.grid,
+                                                     datatuple(tracked_field), LX(), LY(), LZ())
     end
 
-    wait(device(arch), MultiEvent(Tuple(events)))
-
     # Compute dynamics
 
     lagrangian_particles.dynamics(lagrangian_particles, model, Δt)
@@ -157,10 +151,7 @@ function update_particle_properties!(lagrangian_particles, model, Δt)
     advect_particles_kernel! = _advect_particles!(device(arch), workgroup, worksize)
 
     advect_particles_event = advect_particles_kernel!(lagrangian_particles.properties, lagrangian_particles.restitution, model.grid, Δt,
-                                                      datatuple(model.velocities),
-                                                      dependencies=Event(device(arch)))
-
-    wait(device(arch), advect_particles_event)
+                                                      datatuple(model.velocities))
     return nothing
 end
 
diff --git a/src/Models/HydrostaticFreeSurfaceModels/HydrostaticFreeSurfaceModels.jl b/src/Models/HydrostaticFreeSurfaceModels/HydrostaticFreeSurfaceModels.jl
index 6298bfe48e..98af25de8e 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/HydrostaticFreeSurfaceModels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/HydrostaticFreeSurfaceModels.jl
@@ -5,7 +5,7 @@ export
     ExplicitFreeSurface, ImplicitFreeSurface, SplitExplicitFreeSurface, 
     PrescribedVelocityFields
 
-using KernelAbstractions: @index, @kernel, Event, MultiEvent, NoneEvent
+using KernelAbstractions: @index, @kernel
 using KernelAbstractions.Extras.LoopInfo: @unroll
 
 using Oceananigans.Utils
diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index 20732c5f91..7bc7ed9ec2 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -143,9 +143,7 @@ function calculate_hydrostatic_free_surface_interior_tendency_contributions!(mod
     arch = model.architecture
     grid = model.grid
 
-    barrier = device_event(model)
-
-    events = calculate_hydrostatic_momentum_tendencies!(model, model.velocities; dependencies = barrier)
+    calculate_hydrostatic_momentum_tendencies!(model, model.velocities)
 
     top_tracer_bcs = top_tracer_boundary_conditions(grid, model.tracers)
 
@@ -162,31 +160,27 @@ function calculate_hydrostatic_free_surface_interior_tendency_contributions!(mod
                                                                                          model.closure,
                                                                                          model.diffusivity_fields)
 
-        Gc_event = launch!(arch, grid, :xyz,
-                           calculate_hydrostatic_free_surface_Gc!,
-                           c_tendency,
-                           c_kernel_function,
-                           grid,
-                           Val(tracer_index),
-                           c_advection,
-                           closure,
-                           c_immersed_bc,
-                           model.buoyancy,
-                           model.velocities,
-                           model.free_surface,
-                           model.tracers,
-                           top_tracer_bcs,
-                           diffusivity_fields,
-                           model.auxiliary_fields,
-                           c_forcing,
-                           model.clock;
-                           dependencies = barrier, 
-                           only_active_cells)
-
-        push!(events, Gc_event)
+        launch!(arch, grid, :xyz,
+                calculate_hydrostatic_free_surface_Gc!,
+                c_tendency,
+                c_kernel_function,
+                grid,
+                Val(tracer_index),
+                c_advection,
+                closure,
+                c_immersed_bc,
+                model.buoyancy,
+                model.velocities,
+                model.free_surface,
+                model.tracers,
+                top_tracer_bcs,
+                diffusivity_fields,
+                model.auxiliary_fields,
+                c_forcing,
+                model.clock;
+                only_active_cells)
     end
 
-    wait(device(arch), MultiEvent(Tuple(events)))
 
     return nothing
 end
@@ -261,27 +255,18 @@ end
 
 """ Apply boundary conditions by adding flux divergences to the right-hand-side. """
 function calculate_hydrostatic_boundary_tendency_contributions!(Gⁿ, grid, arch, velocities, free_surface, tracers, args...)
-
-    barrier = device_event(arch)
-
-    events = []
-
     # Velocity fields
     for i in (:u, :v)
-        apply_flux_bcs!(Gⁿ[i], events, velocities[i], arch, barrier, args...)
+        apply_flux_bcs!(Gⁿ[i], velocities[i], arch, args...)
     end
 
     # Free surface
-    apply_flux_bcs!(Gⁿ.η, events, displacement(free_surface), arch, barrier, args...)
+    apply_flux_bcs!(Gⁿ.η, displacement(free_surface), arch,  args...)
 
     # Tracer fields
     for i in propertynames(tracers)
-        apply_flux_bcs!(Gⁿ[i], events, tracers[i], arch, barrier, args...)
+        apply_flux_bcs!(Gⁿ[i], tracers[i], arch, args...)
     end
 
-    events = filter(e -> typeof(e) <: Event, events)
-
-    wait(device(arch), MultiEvent(Tuple(events)))
-
     return nothing
 end
diff --git a/src/Models/HydrostaticFreeSurfaceModels/explicit_free_surface.jl b/src/Models/HydrostaticFreeSurfaceModels/explicit_free_surface.jl
index 9ceb45e24c..93690426d5 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/explicit_free_surface.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/explicit_free_surface.jl
@@ -62,13 +62,7 @@ function explicit_ab2_step_free_surface!(free_surface, model, Δt, χ, prognosti
     
     free_surface_event = launch!(model.architecture, model.grid, :xy,
                                 _explicit_ab2_step_free_surface!, free_surface.η, Δt, χ,
-                                model.timestepper.Gⁿ.η, model.timestepper.G⁻.η, size(model.grid, 3),
-                                dependencies = device_event(model.architecture))
-    
-    return MultiEvent(tuple(prognostic_field_events[1]..., prognostic_field_events[2]..., free_surface_event))
-    
-    # wait(device(model.architecture), free_surface_event)
-    # return nothing
+                                model.timestepper.Gⁿ.η, model.timestepper.G⁻.η, size(model.grid, 3))
 end
 
 #####
diff --git a/src/Models/HydrostaticFreeSurfaceModels/implicit_free_surface.jl b/src/Models/HydrostaticFreeSurfaceModels/implicit_free_surface.jl
index 1111b144d8..dc89c8e92c 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/implicit_free_surface.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/implicit_free_surface.jl
@@ -152,18 +152,12 @@ function implicit_free_surface_step!(free_surface::ImplicitFreeSurface, model, 
 end
 
 function wait_velocity_event(arch, prognostic_field_events)
-    velocity_events = prognostic_field_events[1]
-
-    # Wait for predictor velocity update step to complete.
-    wait(device(arch), MultiEvent(velocity_events))
-
-    return MultiEvent(prognostic_field_events[2])
+    return nothing
 end
 
 function local_compute_integrated_volume_flux!(∫ᶻQ, velocities, arch)
     
-    masking_events = Tuple(mask_immersed_field!(q) for q in velocities)
-    wait(device(arch), MultiEvent(masking_events))
+    foreach(mask_immersed_field!, velocities)
 
     # Compute barotropic volume flux. Blocking.
     compute_vertically_integrated_volume_flux!(∫ᶻQ, velocities)
diff --git a/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl b/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl
index 8a43af17cf..97493d9c49 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl
@@ -75,8 +75,8 @@ end
 
 @inline datatuple(obj::PrescribedVelocityFields) = (; u = datatuple(obj.u), v = datatuple(obj.v), w = datatuple(obj.w))
 
-ab2_step_velocities!(::PrescribedVelocityFields, args...) = [NoneEvent()]
-ab2_step_free_surface!(::Nothing, model, Δt, χ, prognostic_field_events) = MultiEvent(flatten_tuple(prognostic_field_events))
+ab2_step_velocities!(::PrescribedVelocityFields, args...) = nothing
+ab2_step_free_surface!(::Nothing, model, Δt, χ, prognostic_field_events) = nothing 
 compute_w_from_continuity!(::PrescribedVelocityFields, args...) = nothing
 
 validate_velocity_boundary_conditions(::PrescribedVelocityFields) = nothing
diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
index 805fed9191..86eef48c82 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
@@ -152,9 +152,7 @@ function split_explicit_free_surface_step!(free_surface::SplitExplicitFreeSurfac
     set_average_to_zero!(state)
 
     # Wait for predictor velocity update step to complete and mask it if immersed boundary.
-    wait(device(arch), velocities_update)
-    masking_events = Tuple(mask_immersed_field!(q) for q in model.velocities)
-    wait(device(arch), MultiEvent(masking_events))
+    foreach(mask_immersed_field!, model.velocities)
 
     # Compute barotropic mode of tendency fields
     barotropic_mode!(auxiliary.Gᵁ, auxiliary.Gⱽ, grid, Gu, Gv)
diff --git a/src/Models/HydrostaticFreeSurfaceModels/store_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/store_hydrostatic_free_surface_tendencies.jl
index b4c3a6882e..a32edc47da 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/store_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/store_hydrostatic_free_surface_tendencies.jl
@@ -32,30 +32,20 @@ end
 
 """ Store previous source terms before updating them. """
 function store_tendencies!(model::HydrostaticFreeSurfaceModel)
-
-    barrier = device_event(model.architecture)
-
     prognostic_field_names = keys(prognostic_fields(model))
     three_dimensional_prognostic_field_names = filter(name -> name != :η, prognostic_field_names)
 
-    events = []
-
     for field_name in three_dimensional_prognostic_field_names
 
-        field_event = launch!(model.architecture, model.grid, :xyz,
-                              store_field_tendencies!,
-                              model.timestepper.G⁻[field_name],
-                              model.grid,
-                              model.timestepper.Gⁿ[field_name],
-                              dependencies = barrier)
+        launch!(model.architecture, model.grid, :xyz,
+                store_field_tendencies!,
+                model.timestepper.G⁻[field_name],
+                model.grid,
+                model.timestepper.Gⁿ[field_name])
 
-        push!(events, field_event)
     end
 
-    η_event = store_free_surface_tendency!(model.free_surface, model, barrier)
-    push!(events, η_event)
-
-    wait(device(model.architecture), MultiEvent(Tuple(events)))
+    store_free_surface_tendency!(model.free_surface, model)
 
     return nothing
 end
diff --git a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
index ce61fb6430..6bfb47447e 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
@@ -44,10 +44,12 @@ function masking_actions!(model)
     η = displacement(model.free_surface)
     fields_to_mask = merge(model.auxiliary_fields, prognostic_fields(model))
 
-    masking_events = Any[mask_immersed_field!(field) for field in fields_to_mask if field !== η]
-    push!(masking_events, mask_immersed_reduced_field_xy!(η, k=size(model.grid, 3)))    
-
-    wait(device(model.architecture), MultiEvent(Tuple(masking_events)))
+    foreach(fields_to_mask) do field
+        if field !== η
+            mask_immersed_field!(field)
+        end
+    end
+    mask_immersed_reduced_field_xy!(η, k=size(model.grid, 3))
 end
 
 function compute_w_diffusivities_pressure!(model) 
diff --git a/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl b/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl
index 5866fe2a32..85f0106ae4 100644
--- a/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl
+++ b/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl
@@ -4,7 +4,7 @@ export NonhydrostaticModel
 
 using DocStringExtensions
 
-using KernelAbstractions: @index, @kernel, Event, MultiEvent
+using KernelAbstractions: @index, @kernel
 using KernelAbstractions.Extras.LoopInfo: @unroll
 
 using Oceananigans.Utils: launch!
diff --git a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
index d5a5591ead..27aafdfc14 100644
--- a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
+++ b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
@@ -40,7 +40,7 @@ function calculate_tendencies!(model::NonhydrostaticModel, callbacks)
 end
 
 """ Store previous value of the source term and calculate current source term. """
-function calculate_interior_tendency_contributions!(model; dependencies = device_event(model))
+function calculate_interior_tendency_contributions!(model)
 
     tendencies           = model.timestepper.Gⁿ
     arch                 = model.architecture
@@ -81,19 +81,17 @@ function calculate_interior_tendency_contributions!(model; dependencies = device
     
     only_active_cells = use_only_active_cells(grid)
 
-    Gu_event = launch!(arch, grid, :xyz, calculate_Gu!, 
-                       tendencies.u, u_kernel_args...;
-                       dependencies, only_active_cells)
+    launch!(arch, grid, :xyz, calculate_Gu!, 
+            tendencies.u, u_kernel_args...;
+            only_active_cells)
 
-    Gv_event = launch!(arch, grid, :xyz, calculate_Gv!, 
-                       tendencies.v, v_kernel_args...;
-                       dependencies, only_active_cells)
+    launch!(arch, grid, :xyz, calculate_Gv!, 
+            tendencies.v, v_kernel_args...;
+            only_active_cells)
 
-    Gw_event = launch!(arch, grid, :xyz, calculate_Gw!, 
-                       tendencies.w, w_kernel_args...;
-                       dependencies, only_active_cells)
-
-    events = [Gu_event, Gv_event, Gw_event]
+    launch!(arch, grid, :xyz, calculate_Gw!, 
+            tendencies.w, w_kernel_args...;
+            only_active_cells)
 
     start_tracer_kernel_args = (advection, closure)
     end_tracer_kernel_args   = (buoyancy, background_fields, velocities, tracers, auxiliary_fields, diffusivities)
@@ -103,19 +101,16 @@ function calculate_interior_tendency_contributions!(model; dependencies = device
         @inbounds forcing = forcings[tracer_index+3]
         @inbounds c_immersed_bc = tracers[tracer_index].boundary_conditions.immersed
 
-        Gc_event = launch!(arch, grid, :xyz, calculate_Gc!,
-                           c_tendency, grid, Val(tracer_index),
-                           start_tracer_kernel_args..., 
-                           c_immersed_bc,
-                           end_tracer_kernel_args...,
-                           forcing, clock;
-                           dependencies, only_active_cells)
+        launch!(arch, grid, :xyz, calculate_Gc!,
+                c_tendency, grid, Val(tracer_index),
+                start_tracer_kernel_args..., 
+                c_immersed_bc,
+                end_tracer_kernel_args...,
+                forcing, clock;
+                only_active_cells)
 
-        push!(events, Gc_event)
     end
 
-    wait(device(arch), MultiEvent(Tuple(events)))
-
     return nothing
 end
 
@@ -181,16 +176,11 @@ end
 
 """ Apply boundary conditions by adding flux divergences to the right-hand-side. """
 function calculate_boundary_tendency_contributions!(Gⁿ, arch, velocities, tracers, clock, model_fields)
-
-    barrier = device_event(arch)
-
     fields = merge(velocities, tracers)
 
-    x_events = Tuple(apply_x_bcs!(Gⁿ[i], fields[i], arch, barrier, clock, model_fields) for i in 1:length(fields))
-    y_events = Tuple(apply_y_bcs!(Gⁿ[i], fields[i], arch, barrier, clock, model_fields) for i in 1:length(fields))
-    z_events = Tuple(apply_z_bcs!(Gⁿ[i], fields[i], arch, barrier, clock, model_fields) for i in 1:length(fields))
+    foreach(i -> apply_x_bcs!(Gⁿ[i], fields[i], arch, clock, model_fields), 1:length(fields))
+    foreach(i -> apply_y_bcs!(Gⁿ[i], fields[i], arch, clock, model_fields), 1:length(fields))
+    foreach(i -> apply_z_bcs!(Gⁿ[i], fields[i], arch, clock, model_fields), 1:length(fields))
                          
-    wait(device(arch), MultiEvent(tuple(x_events..., y_events..., z_events...)))
-
     return nothing
 end
diff --git a/src/Models/NonhydrostaticModels/pressure_correction.jl b/src/Models/NonhydrostaticModels/pressure_correction.jl
index c320c103da..f3be399fa0 100644
--- a/src/Models/NonhydrostaticModels/pressure_correction.jl
+++ b/src/Models/NonhydrostaticModels/pressure_correction.jl
@@ -10,8 +10,7 @@ Calculate the (nonhydrostatic) pressure correction associated `tendencies`, `vel
 function calculate_pressure_correction!(model::NonhydrostaticModel, Δt)
 
     # Mask immersed velocities
-    velocity_masking_events = mask_immersed_velocities!(model.velocities, model.architecture, model.grid)
-    wait(device(model.architecture), MultiEvent(velocity_masking_events))
+    mask_immersed_velocities!(model.velocities, model.architecture, model.grid)
 
     fill_halo_regions!(model.velocities, model.clock, fields(model))
 
diff --git a/src/Models/NonhydrostaticModels/set_nonhydrostatic_model.jl b/src/Models/NonhydrostaticModels/set_nonhydrostatic_model.jl
index 900f702a23..fa7b574999 100644
--- a/src/Models/NonhydrostaticModels/set_nonhydrostatic_model.jl
+++ b/src/Models/NonhydrostaticModels/set_nonhydrostatic_model.jl
@@ -42,9 +42,8 @@ function set!(model::NonhydrostaticModel; enforce_incompressibility=true, kwargs
     end
 
     # Apply a mask
-    tracer_masking_events = Tuple(mask_immersed_field!(c) for c in model.tracers)
-    velocity_masking_events = mask_immersed_velocities!(model.velocities, model.architecture, model.grid)
-    wait(device(model.architecture), MultiEvent(tuple(velocity_masking_events..., tracer_masking_events...)))
+    foreach(mask_immersed_field!, model.tracers)
+    mask_immersed_velocities!(model.velocities, model.architecture, model.grid)
 
     update_state!(model)
 
diff --git a/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl b/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
index a5a507fdcc..8b0d1a457b 100644
--- a/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
+++ b/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
@@ -16,9 +16,7 @@ they are called in the end.
 function update_state!(model::NonhydrostaticModel, callbacks=[])
     
     # Mask immersed tracers
-    tracer_masking_events = Tuple(mask_immersed_field!(c) for c in model.tracers)
-
-    wait(device(model.architecture), MultiEvent(tracer_masking_events))
+    foreach(mask_immersed_field!, model.tracers)
 
     # Fill halos for velocities and tracers
     fill_halo_regions!(merge(model.velocities, model.tracers),  model.clock, fields(model))
diff --git a/src/Models/ShallowWaterModels/ShallowWaterModels.jl b/src/Models/ShallowWaterModels/ShallowWaterModels.jl
index 33dd370324..848d87c7ac 100644
--- a/src/Models/ShallowWaterModels/ShallowWaterModels.jl
+++ b/src/Models/ShallowWaterModels/ShallowWaterModels.jl
@@ -3,7 +3,7 @@ module ShallowWaterModels
 export ShallowWaterModel, ShallowWaterScalarDiffusivity,
        ConservativeFormulation, VectorInvariantFormulation
 
-using KernelAbstractions: @index, @kernel, Event, MultiEvent
+using KernelAbstractions: @index, @kernel
 using KernelAbstractions.Extras.LoopInfo: @unroll
 
 using Adapt
diff --git a/src/Models/ShallowWaterModels/calculate_shallow_water_tendencies.jl b/src/Models/ShallowWaterModels/calculate_shallow_water_tendencies.jl
index c3941bfee9..72087fff48 100644
--- a/src/Models/ShallowWaterModels/calculate_shallow_water_tendencies.jl
+++ b/src/Models/ShallowWaterModels/calculate_shallow_water_tendencies.jl
@@ -2,7 +2,7 @@ import Oceananigans.TimeSteppers: calculate_tendencies!
 
 using Oceananigans.Utils: work_layout
 using Oceananigans: fields, TimeStepCallsite, TendencyCallsite, UpdateStateCallsite
-using KernelAbstractions: @index, @kernel, Event, MultiEvent
+using KernelAbstractions: @index, @kernel
 
 using Oceananigans.Architectures: device
 
@@ -81,32 +81,25 @@ function calculate_interior_tendency_contributions!(tendencies,
     calculate_Gh_kernel!  =  calculate_Gh!(device(arch), workgroup, worksize)
     calculate_Gc_kernel!  =  calculate_Gc!(device(arch), workgroup, worksize)
 
-    barrier = Event(device(arch))
-
     args_vel = (grid, gravitational_acceleration, advection.momentum, velocities, coriolis, closure, 
                       bathymetry, solution, tracers, diffusivities, forcings, clock, formulation)
     args_h   = (grid, gravitational_acceleration, advection.mass, coriolis, closure, 
                       solution, tracers, diffusivities, forcings, clock, formulation)
 
-    Guh_event = calculate_Guh_kernel!(tendencies[1], args_vel...; dependencies = barrier)
-    Gvh_event = calculate_Gvh_kernel!(tendencies[2], args_vel...; dependencies = barrier)
-    Gh_event  =  calculate_Gh_kernel!(tendencies[3], args_h...;   dependencies = barrier)
-
-    events = [Guh_event, Gvh_event, Gh_event]
+    calculate_Guh_kernel!(tendencies[1], args_vel...)
+    calculate_Gvh_kernel!(tendencies[2], args_vel...)
+     calculate_Gh_kernel!(tendencies[3], args_h...)
 
     for (tracer_index, tracer_name) in enumerate(propertynames(tracers))
         @inbounds c_tendency = tendencies[tracer_index+3]
         @inbounds forcing = forcings[tracer_index+3]
         @inbounds c_advection = advection[tracer_name]
 
-        Gc_event = calculate_Gc_kernel!(c_tendency, grid, Val(tracer_index), c_advection, closure, solution,
-                                        tracers, diffusivities, forcing, clock, formulation, dependencies=barrier)
+        calculate_Gc_kernel!(c_tendency, grid, Val(tracer_index), c_advection, closure, solution,
+                             tracers, diffusivities, forcing, clock, formulation)
 
-        push!(events, Gc_event)
     end
 
-    wait(device(arch), MultiEvent(Tuple(events)))
-
     return nothing
 end
 
@@ -207,24 +200,14 @@ end
 
 """ Apply boundary conditions by adding flux divergences to the right-hand-side. """
 function calculate_boundary_tendency_contributions!(Gⁿ, arch, solution, tracers, clock, model_fields)
-
-    barrier = Event(device(arch))
-
     prognostic_fields = merge(solution, tracers)
 
-    events = []
-
     # Solution fields and tracer fields
     for i in 1:length(Gⁿ)
-        x_bcs_event = apply_x_bcs!(Gⁿ[i], prognostic_fields[i], arch, barrier, clock, model_fields)
-        y_bcs_event = apply_y_bcs!(Gⁿ[i], prognostic_fields[i], arch, barrier, clock, model_fields)
-        push!(events, x_bcs_event, y_bcs_event)
+        apply_x_bcs!(Gⁿ[i], prognostic_fields[i], arch, clock, model_fields)
+        apply_y_bcs!(Gⁿ[i], prognostic_fields[i], arch, clock, model_fields)
     end
 
-    events = filter(e -> typeof(e) <: Event, events)
-    
-    wait(device(arch), MultiEvent(Tuple(events)))
-
     return nothing
 end
 
diff --git a/src/Models/ShallowWaterModels/rk3_substep_shallow_water_model.jl b/src/Models/ShallowWaterModels/rk3_substep_shallow_water_model.jl
index 7352aed6ca..34fd01d97b 100644
--- a/src/Models/ShallowWaterModels/rk3_substep_shallow_water_model.jl
+++ b/src/Models/ShallowWaterModels/rk3_substep_shallow_water_model.jl
@@ -8,8 +8,6 @@ function rk3_substep!(model::ShallowWaterModel, Δt, γⁿ, ζⁿ)
 
     workgroup, worksize = work_layout(model.grid, :xyz)
 
-    barrier = Event(device(model.architecture))
-
     substep_solution_kernel! = rk3_substep_solution!(device(model.architecture), workgroup, worksize)
     substep_tracer_kernel! = rk3_substep_tracer!(device(model.architecture), workgroup, worksize)
 
@@ -17,22 +15,17 @@ function rk3_substep!(model::ShallowWaterModel, Δt, γⁿ, ζⁿ)
     solution_event = substep_solution_kernel!(model.solution,
                                               Δt, γⁿ, ζⁿ,
                                               model.timestepper.Gⁿ,
-                                              model.timestepper.G⁻;
-                                              dependencies=barrier)
+                                              model.timestepper.G⁻)
 
-    events = [solution_event]
 
     for i in 1:length(model.tracers)
         @inbounds c = model.tracers[i]
         @inbounds Gcⁿ = model.timestepper.Gⁿ[i+3]
         @inbounds Gc⁻ = model.timestepper.G⁻[i+3]
 
-        tracer_event = substep_tracer_kernel!(c, Δt, γⁿ, ζⁿ, Gcⁿ, Gc⁻, dependencies=barrier)
-
-        push!(events, tracer_event)
+        substep_tracer_kernel!(c, Δt, γⁿ, ζⁿ, Gcⁿ, Gc⁻)
     end
 
-    wait(device(model.architecture), MultiEvent(Tuple(events)))
 
     return nothing
 end
diff --git a/src/Models/ShallowWaterModels/store_shallow_water_tendencies.jl b/src/Models/ShallowWaterModels/store_shallow_water_tendencies.jl
index cd9dd427a0..5374dda2d6 100644
--- a/src/Models/ShallowWaterModels/store_shallow_water_tendencies.jl
+++ b/src/Models/ShallowWaterModels/store_shallow_water_tendencies.jl
@@ -14,31 +14,22 @@ end
 
 """ Store previous source terms before updating them. """
 function store_tendencies!(model::ShallowWaterModel)
-
-    barrier = Event(device(model.architecture))
-
     workgroup, worksize = work_layout(model.grid, :xyz)
 
     store_solution_tendencies_kernel! = store_solution_tendencies!(device(model.architecture), workgroup, worksize)
     store_tracer_tendency_kernel! = store_tracer_tendency!(device(model.architecture), workgroup, worksize)
 
-    solution_event = store_solution_tendencies_kernel!(model.timestepper.G⁻,
-                                                       model.grid,
-                                                       model.timestepper.Gⁿ,
-                                                       dependencies=barrier)
-
-    events = [solution_event]
+    store_solution_tendencies_kernel!(model.timestepper.G⁻,
+                                      model.grid,
+                                      model.timestepper.Gⁿ)
 
     # Tracer fields
     for i in 4:length(model.timestepper.G⁻)
         @inbounds Gc⁻ = model.timestepper.G⁻[i]
         @inbounds Gc⁰ = model.timestepper.Gⁿ[i]
-        tracer_event = store_tracer_tendency_kernel!(Gc⁻, model.grid, Gc⁰, dependencies=barrier)
-        push!(events, tracer_event)
+        store_tracer_tendency_kernel!(Gc⁻, model.grid, Gc⁰)
     end
 
-    wait(device(model.architecture), MultiEvent(Tuple(events)))
-
     return nothing
 end
 
diff --git a/src/Models/ShallowWaterModels/update_shallow_water_state.jl b/src/Models/ShallowWaterModels/update_shallow_water_state.jl
index ca63efee1d..a9bd5425ba 100644
--- a/src/Models/ShallowWaterModels/update_shallow_water_state.jl
+++ b/src/Models/ShallowWaterModels/update_shallow_water_state.jl
@@ -12,9 +12,7 @@ If `callbacks` are provided (in an array), they are called in the end.
 function update_state!(model::ShallowWaterModel, callbacks=[])
 
     # Mask immersed fields
-    masking_events = Tuple(mask_immersed_field!(field) for field in model.solution)
-
-    wait(device(model.architecture), MultiEvent(masking_events))
+    foreach(mask_immersed_field!, model.solution)
 
     calculate_diffusivities!(model.diffusivity_fields, model.closure, model)
 
@@ -27,8 +25,11 @@ function update_state!(model::ShallowWaterModel, callbacks=[])
 
     compute_velocities!(model.velocities, formulation(model))
 
-    [callback(model) for callback in callbacks if isa(callback.callsite, UpdateStateCallsite)]
-
+    foreach(callbacks) do callback
+        if isa(callback.callsite, UpdateStateCallsite)
+            callback(model)
+        end
+    end
     return nothing
 end
 
diff --git a/src/TimeSteppers/quasi_adams_bashforth_2.jl b/src/TimeSteppers/quasi_adams_bashforth_2.jl
index 0df16090c6..5a952f0535 100644
--- a/src/TimeSteppers/quasi_adams_bashforth_2.jl
+++ b/src/TimeSteppers/quasi_adams_bashforth_2.jl
@@ -120,16 +120,12 @@ function ab2_step!(model, Δt, χ)
     barrier = device_event(arch)
     step_field_kernel! = ab2_step_field!(device(arch), workgroup, worksize)
     model_fields = prognostic_fields(model)
-    events = []
 
     for (i, field) in enumerate(model_fields)
 
-        field_event = step_field_kernel!(field, Δt, χ,
-                                         model.timestepper.Gⁿ[i],
-                                         model.timestepper.G⁻[i],
-                                         dependencies = device_event(arch))
-
-        push!(events, field_event)
+        step_field_kernel!(field, Δt, χ,
+                           model.timestepper.Gⁿ[i],
+                           model.timestepper.G⁻[i])
 
         # TODO: function tracer_index(model, field_index) = field_index - 3, etc...
         tracer_index = Val(i - 3) # assumption
@@ -140,12 +136,9 @@ function ab2_step!(model, Δt, χ)
                        model.diffusivity_fields,
                        tracer_index,
                        model.clock,
-                       Δt,
-                       dependencies = field_event)
+                       Δt)
     end
 
-    wait(device(model.architecture), MultiEvent(Tuple(events)))
-
     return nothing
 end
 
diff --git a/src/TimeSteppers/runge_kutta_3.jl b/src/TimeSteppers/runge_kutta_3.jl
index 5931be3ab8..236cb7202c 100644
--- a/src/TimeSteppers/runge_kutta_3.jl
+++ b/src/TimeSteppers/runge_kutta_3.jl
@@ -161,16 +161,13 @@ stage_Δt(Δt, γⁿ, ::Nothing) = Δt * γⁿ
 function rk3_substep!(model, Δt, γⁿ, ζⁿ)
 
     workgroup, worksize = work_layout(model.grid, :xyz)
-    barrier = Event(device(architecture(model)))
     substep_field_kernel! = rk3_substep_field!(device(architecture(model)), workgroup, worksize)
     model_fields = prognostic_fields(model)
-    events = []
 
     for (i, field) in enumerate(model_fields)
-        field_event = substep_field_kernel!(field, Δt, γⁿ, ζⁿ,
-                                            model.timestepper.Gⁿ[i],
-                                            model.timestepper.G⁻[i],
-                                            dependencies=barrier)
+        substep_field_kernel!(field, Δt, γⁿ, ζⁿ,
+                              model.timestepper.Gⁿ[i],
+                              model.timestepper.G⁻[i])
 
         # TODO: function tracer_index(model, field_index) = field_index - 3, etc...
         tracer_index = Val(i - 3) # assumption
@@ -181,14 +178,9 @@ function rk3_substep!(model, Δt, γⁿ, ζⁿ)
                        model.diffusivity_fields,
                        tracer_index,
                        model.clock,
-                       stage_Δt(Δt, γⁿ, ζⁿ),
-                       dependencies = field_event)
-
-        push!(events, field_event)
+                       stage_Δt(Δt, γⁿ, ζⁿ))
     end
 
-    wait(device(architecture(model)), MultiEvent(Tuple(events)))
-
     return nothing
 end
 
diff --git a/src/TimeSteppers/store_tendencies.jl b/src/TimeSteppers/store_tendencies.jl
index 18cafa81e3..c6bae9c6af 100644
--- a/src/TimeSteppers/store_tendencies.jl
+++ b/src/TimeSteppers/store_tendencies.jl
@@ -12,25 +12,14 @@ end
 
 """ Store previous source terms before updating them. """
 function store_tendencies!(model)
-
-    barrier = device_event(model.architecture)
-
     model_fields = prognostic_fields(model)
 
-    events = []
-
     for field_name in keys(model_fields)
-
-        field_event = launch!(model.architecture, model.grid, :xyz, store_field_tendencies!,
-                              model.timestepper.G⁻[field_name],
-                              model.grid,
-                              model.timestepper.Gⁿ[field_name],
-                              dependencies = barrier)
-
-        push!(events, field_event)
+        launch!(model.architecture, model.grid, :xyz, store_field_tendencies!,
+                model.timestepper.G⁻[field_name],
+                model.grid,
+                model.timestepper.Gⁿ[field_name])
     end
 
-    wait(device(model.architecture), MultiEvent(Tuple(events)))
-
     return nothing
 end
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl b/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl
index 4997ba9427..70ae6d56fe 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl
@@ -189,19 +189,13 @@ function calculate_diffusivities!(diffusivity_fields, closure::AnisotropicMinimu
     viscosity_kernel! = calculate_nonlinear_viscosity!(device(arch), workgroup, worksize)
     diffusivity_kernel! = calculate_nonlinear_tracer_diffusivity!(device(arch), workgroup, worksize)
 
-    barrier = device_event(arch)
-    viscosity_event = viscosity_kernel!(diffusivity_fields.νₑ, grid, closure, buoyancy, velocities, tracers, dependencies=barrier)
-
-    events = [viscosity_event]
+    viscosity_kernel!(diffusivity_fields.νₑ, grid, closure, buoyancy, velocities, tracers)
 
     for (tracer_index, κₑ) in enumerate(diffusivity_fields.κₑ)
         @inbounds tracer = tracers[tracer_index]
-        event = diffusivity_kernel!(κₑ, grid, closure, tracer, Val(tracer_index), velocities, dependencies=barrier)
-        push!(events, event)
+        diffusivity_kernel!(κₑ, grid, closure, tracer, Val(tracer_index), velocities)
     end
 
-    wait(device(arch), MultiEvent(Tuple(events)))
-
     return nothing
 end
 
diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index ae62eaa85e..2c9c56b4a4 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -72,18 +72,12 @@ end
 active_cells_work_layout(size, grid) = heuristic_workgroup(size...), size
 
 """
-    launch!(arch, grid, layout, kernel!, args...; dependencies=nothing, kwargs...)
+    launch!(arch, grid, layout, kernel!, args...; kwargs...)
 
 Launches `kernel!`, with arguments `args` and keyword arguments `kwargs`,
 over the `dims` of `grid` on the architecture `arch`.
-
-Returns an `event` token associated with the `kernel!` launch.
-
-The keyword argument `dependencies` is an `Event` or `MultiEvent` specifying prior kernels
-that must complete before `kernel!` is launched.
 """
 function launch!(arch, grid, workspec, kernel!, kernel_args...;
-                 dependencies = nothing,
                  include_right_boundaries = false,
                  reduced_dimensions = (),
                  location = nothing,
@@ -100,9 +94,9 @@ function launch!(arch, grid, workspec, kernel!, kernel_args...;
 
     @debug "Launching kernel $kernel! with worksize $worksize"
 
-    event = loop!(kernel_args...; dependencies=dependencies)
+    loop!(kernel_args...)
 
-    return event
+    return nothing
 end
 
 # When dims::Val

From 17ce4bd5827e1d85b43e02991155e6a04df31b60 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Tue, 14 Feb 2023 13:46:25 -0500
Subject: [PATCH 005/530] remove all events

---
 src/BoundaryConditions/fill_halo_regions.jl   |  3 -
 src/CubedSpheres/cubed_sphere_exchange_bcs.jl |  6 +-
 src/Distributed/distributed_apply_flux_bcs.jl |  8 +--
 .../distributed_fft_based_poisson_solver.jl   |  7 +-
 src/Fields/broadcasting_abstract_fields.jl    |  7 +-
 src/Fields/field.jl                           |  1 -
 src/Fields/regridding_fields.jl               |  6 +-
 src/Grids/latitude_longitude_grid.jl          | 10 +--
 src/ImmersedBoundaries/mask_immersed_field.jl | 17 +++--
 .../update_particle_properties.jl             |  5 +-
 .../barotropic_pressure_correction.jl         | 25 ++++---
 ...ate_hydrostatic_free_surface_tendencies.jl | 56 +++++++---------
 .../compute_w_from_continuity.jl              | 18 +----
 .../explicit_free_surface.jl                  | 20 ++----
 .../fft_based_implicit_free_surface_solver.jl |  8 +--
 .../hydrostatic_free_surface_ab2_step.jl      | 48 ++++----------
 .../implicit_free_surface.jl                  | 12 +---
 .../matrix_implicit_free_surface_solver.jl    | 15 ++---
 .../mg_implicit_free_surface_solver.jl        | 15 ++---
 .../pcg_implicit_free_surface_solver.jl       | 35 ++++------
 .../prescribed_hydrostatic_velocity_fields.jl |  4 +-
 .../single_column_model_mode.jl               |  3 +-
 ...ore_hydrostatic_free_surface_tendencies.jl | 20 +++---
 ...te_hydrostatic_free_surface_model_state.jl |  1 -
 src/Models/Models.jl                          |  3 +-
 .../NonhydrostaticModels.jl                   |  1 -
 ...rect_nonhydrostatic_immersed_tendencies.jl | 66 -------------------
 .../pressure_correction.jl                    | 17 ++---
 .../set_nonhydrostatic_model.jl               |  2 +-
 .../solve_for_pressure.jl                     | 19 ++----
 .../update_hydrostatic_pressure.jl            | 14 +---
 .../rk3_substep_shallow_water_model.jl        |  8 +--
 .../shallow_water_diffusion_operators.jl      | 11 ++--
 src/MultiRegion/MultiRegion.jl                |  2 +-
 .../multi_region_boundary_conditions.jl       | 49 +++++---------
 .../unified_implicit_free_surface_solver.jl   | 21 ++----
 src/Solvers/batched_tridiagonal_solver.jl     | 13 ++--
 src/Solvers/fft_based_poisson_solver.jl       |  6 +-
 .../fourier_tridiagonal_poisson_solver.jl     | 14 ++--
 src/Solvers/heptadiagonal_iterative_solver.jl |  7 +-
 src/Solvers/index_permutations.jl             | 14 ++--
 src/Solvers/matrix_solver_utils.jl            |  7 +-
 src/Solvers/sparse_preconditioners.jl         | 10 ++-
 src/TimeSteppers/quasi_adams_bashforth_2.jl   |  2 -
 src/TimeSteppers/store_tendencies.jl          |  1 -
 .../implicit_explicit_time_discretization.jl  |  2 -
 .../CATKEVerticalDiffusivities.jl             |  9 +--
 ...vective_adjustment_vertical_diffusivity.jl | 13 ++--
 .../isopycnal_skew_symmetric_diffusivity.jl   |  7 +-
 .../leith_enstrophy_diffusivity.jl            |  9 +--
 .../mews_vertical_diffusivity.jl              | 23 +++----
 .../ri_based_vertical_diffusivity.jl          | 25 ++++---
 .../smagorinsky_lilly.jl                      |  9 +--
 .../vertically_implicit_diffusion_solver.jl   |  2 +-
 test/dependencies_for_poisson_solvers.jl      | 16 ++---
 test/dependencies_for_runtests.jl             |  2 +-
 test/test_distributed_poisson_solvers.jl      | 12 +---
 ...face_immersed_boundaries_implicit_solve.jl |  3 +-
 test/test_implicit_free_surface_solver.jl     | 13 ++--
 test/test_matrix_poisson_solver.jl            |  6 +-
 test/test_multi_region_poisson_solver.jl      |  6 +-
 test/test_time_stepping.jl                    |  5 +-
 test/utils_for_runtests.jl                    |  6 +-
 .../doubly_bounded_poisson.jl                 |  6 +-
 .../triply_bounded_poisson.jl                 |  5 +-
 validation/stencils/stencil_calculations.jl   |  5 +-
 66 files changed, 261 insertions(+), 560 deletions(-)
 delete mode 100644 src/Models/NonhydrostaticModels/correct_nonhydrostatic_immersed_tendencies.jl

diff --git a/src/BoundaryConditions/fill_halo_regions.jl b/src/BoundaryConditions/fill_halo_regions.jl
index 965b396058..3d9c2b3153 100644
--- a/src/BoundaryConditions/fill_halo_regions.jl
+++ b/src/BoundaryConditions/fill_halo_regions.jl
@@ -85,9 +85,6 @@ function permute_boundary_conditions(boundary_conditions)
     return (fill_halos!, boundary_conditions_array_left, boundary_conditions_array_right)
 end
 
-@inline validate_event(::Nothing) = NoneEvent()
-@inline validate_event(event)     = event
-
 #####
 ##### Halo filling order
 #####
diff --git a/src/CubedSpheres/cubed_sphere_exchange_bcs.jl b/src/CubedSpheres/cubed_sphere_exchange_bcs.jl
index e6e0b7547e..8d1ed442c4 100644
--- a/src/CubedSpheres/cubed_sphere_exchange_bcs.jl
+++ b/src/CubedSpheres/cubed_sphere_exchange_bcs.jl
@@ -5,8 +5,6 @@ using Oceananigans.Grids: AbstractGrid
 using Oceananigans.BoundaryConditions
 using Oceananigans.BoundaryConditions: AbstractBoundaryConditionClassification
 
-using KernelAbstractions: NoneEvent
-
 import Base: show
 
 import Oceananigans.BoundaryConditions: bc_str
@@ -96,5 +94,5 @@ Adapt.adapt_structure(to, ::CubedSphereExchangeBC) = nothing
 @inline apply_y_north_bc!( Gc, loc, ::CubedSphereExchangeBC, args...) = nothing
 @inline apply_y_south_bc!( Gc, loc, ::CubedSphereExchangeBC, args...) = nothing
 
-apply_x_bcs!(Gc, ::AbstractGrid, c, ::CubedSphereExchangeBC, ::CubedSphereExchangeBC, ::AbstractArchitecture, args...) = NoneEvent()
-apply_y_bcs!(Gc, ::AbstractGrid, c, ::CubedSphereExchangeBC, ::CubedSphereExchangeBC, ::AbstractArchitecture, args...) = NoneEvent()
+apply_x_bcs!(Gc, ::AbstractGrid, c, ::CubedSphereExchangeBC, ::CubedSphereExchangeBC, ::AbstractArchitecture, args...) = nothing
+apply_y_bcs!(Gc, ::AbstractGrid, c, ::CubedSphereExchangeBC, ::CubedSphereExchangeBC, ::AbstractArchitecture, args...) = nothing
diff --git a/src/Distributed/distributed_apply_flux_bcs.jl b/src/Distributed/distributed_apply_flux_bcs.jl
index 50d930088e..a04520e811 100644
--- a/src/Distributed/distributed_apply_flux_bcs.jl
+++ b/src/Distributed/distributed_apply_flux_bcs.jl
@@ -1,8 +1,6 @@
 using Oceananigans.Grids: AbstractGrid
 using Oceananigans.Architectures: AbstractArchitecture
 
-using KernelAbstractions: NoneEvent
-
 import Oceananigans.BoundaryConditions:
     apply_x_bcs!,
     apply_y_bcs!,
@@ -15,9 +13,9 @@ import Oceananigans.BoundaryConditions:
     apply_z_bottom_bc!
 
 # Bunch o' shortcuts for halo communication bcs
-apply_x_bcs!(Gc, ::AbstractGrid, c, ::HaloCommunicationBC, ::HaloCommunicationBC, ::AbstractArchitecture, args...) = NoneEvent()
-apply_y_bcs!(Gc, ::AbstractGrid, c, ::HaloCommunicationBC, ::HaloCommunicationBC, ::AbstractArchitecture, args...) = NoneEvent()
-apply_z_bcs!(Gc, ::AbstractGrid, c, ::HaloCommunicationBC, ::HaloCommunicationBC, ::AbstractArchitecture, args...) = NoneEvent()
+apply_x_bcs!(Gc, ::AbstractGrid, c, ::HaloCommunicationBC, ::HaloCommunicationBC, ::AbstractArchitecture, args...) = nothing
+apply_y_bcs!(Gc, ::AbstractGrid, c, ::HaloCommunicationBC, ::HaloCommunicationBC, ::AbstractArchitecture, args...) = nothing
+apply_z_bcs!(Gc, ::AbstractGrid, c, ::HaloCommunicationBC, ::HaloCommunicationBC, ::AbstractArchitecture, args...) = nothing
 
 @inline apply_x_east_bc!(  Gc, loc, ::HaloCommunicationBC, args...) = nothing
 @inline apply_x_west_bc!(  Gc, loc, ::HaloCommunicationBC, args...) = nothing
diff --git a/src/Distributed/distributed_fft_based_poisson_solver.jl b/src/Distributed/distributed_fft_based_poisson_solver.jl
index 6ae73f7d6e..486ecabc51 100644
--- a/src/Distributed/distributed_fft_based_poisson_solver.jl
+++ b/src/Distributed/distributed_fft_based_poisson_solver.jl
@@ -186,11 +186,8 @@ function solve!(x, solver::DistributedFFTBasedPoissonSolver)
     xc = first(solver.storage)
 	
     # Copy the real component of xc to x.
-    copy_event = launch!(arch, solver.local_grid, :xyz,
-                         copy_permuted_real_component!, x, parent(xc), solver.input_permutation,
-                         dependencies = device_event(arch))
-
-    wait(device(arch), copy_event)
+    launch!(arch, solver.local_grid, :xyz,
+            copy_permuted_real_component!, x, parent(xc), solver.input_permutation)
 
     return x
 end
diff --git a/src/Fields/broadcasting_abstract_fields.jl b/src/Fields/broadcasting_abstract_fields.jl
index 80a394c897..9c18c31a0b 100644
--- a/src/Fields/broadcasting_abstract_fields.jl
+++ b/src/Fields/broadcasting_abstract_fields.jl
@@ -6,8 +6,6 @@ using Base.Broadcast: DefaultArrayStyle
 using Base.Broadcast: Broadcasted
 using CUDA
 
-using Oceananigans.Architectures: device_event
-
 struct FieldBroadcastStyle <: Broadcast.AbstractArrayStyle{3} end
 
 Base.Broadcast.BroadcastStyle(::Type{<:AbstractField}) = FieldBroadcastStyle()
@@ -72,10 +70,7 @@ broadcasted_to_abstract_operation(loc, grid, a) = a
 
     bc′ = broadcasted_to_abstract_operation(location(dest), grid, bc)
 
-    event = launch!(arch, grid, size(dest), broadcast_kernel!, dest, bc′, dest.indices,
-                    dependencies = device_event(arch))
-
-    wait(device(arch), event)
+    launch!(arch, grid, size(dest), broadcast_kernel!, dest, bc′, dest.indices)
 
     return dest
 end
diff --git a/src/Fields/field.jl b/src/Fields/field.jl
index 3c0152444c..5a7a90f7de 100644
--- a/src/Fields/field.jl
+++ b/src/Fields/field.jl
@@ -1,4 +1,3 @@
-using Oceananigans.Architectures: device_event
 using Oceananigans.BoundaryConditions: OBC, CBC
 using Oceananigans.Grids: parent_index_range, index_range_offset, default_indices, all_indices, validate_indices
 
diff --git a/src/Fields/regridding_fields.jl b/src/Fields/regridding_fields.jl
index 01314873bc..9f3e7d69cf 100644
--- a/src/Fields/regridding_fields.jl
+++ b/src/Fields/regridding_fields.jl
@@ -73,13 +73,11 @@ function regrid!(a, target_grid, source_grid, b)
 
     if we_can_regrid_in_z(a, target_grid, source_grid, b)
         source_z_faces = znodes(Face, source_grid)
-        event = launch!(arch, target_grid, :xy, _regrid_in_z!, a, b, target_grid, source_grid, source_z_faces)
-        wait(device(arch), event)
+        launch!(arch, target_grid, :xy, _regrid_in_z!, a, b, target_grid, source_grid, source_z_faces)
         return a
     elseif we_can_regrid_in_y(a, target_grid, source_grid, b)
         source_y_faces = ynodes(Face, source_grid)
-        event = launch!(arch, target_grid, :xz, _regrid_in_y!, a, b, target_grid, source_grid, source_y_faces)
-        wait(device(arch), event)
+        launch!(arch, target_grid, :xz, _regrid_in_y!, a, b, target_grid, source_grid, source_y_faces)
         return a
     else
         msg = """Regridding
diff --git a/src/Grids/latitude_longitude_grid.jl b/src/Grids/latitude_longitude_grid.jl
index 0b191a6049..7d4a33873f 100644
--- a/src/Grids/latitude_longitude_grid.jl
+++ b/src/Grids/latitude_longitude_grid.jl
@@ -216,12 +216,9 @@ LatitudeLongitudeGrid(FT::DataType; kwargs...) = LatitudeLongitudeGrid(CPU(), FT
 
 """ Return a reproduction of `grid` with precomputed metric terms. """
 function with_precomputed_metrics(grid)
-    arch = architecture(grid)
     Δxᶠᶜ, Δxᶜᶠ, Δxᶠᶠ, Δxᶜᶜ, Δyᶠᶜ, Δyᶜᶠ, Azᶠᶜ, Azᶜᶠ, Azᶠᶠ, Azᶜᶜ = allocate_metrics(grid)
-    wait(device_event(arch))
 
     precompute_curvilinear_metrics!(grid, Δxᶠᶜ, Δxᶜᶠ, Δxᶠᶠ, Δxᶜᶜ, Azᶠᶜ, Azᶜᶠ, Azᶠᶠ, Azᶜᶜ )
-    wait(device_event(arch))
 
     Δyᶠᶜ, Δyᶜᶠ = precompute_Δy_metrics(grid, Δyᶠᶜ, Δyᶜᶠ)
 
@@ -464,8 +461,8 @@ function precompute_curvilinear_metrics!(grid, Δxᶠᶜ, Δxᶜᶠ, Δxᶠᶠ,
     workgroup, worksize  = metric_workgroup(grid), metric_worksize(grid)
     curvilinear_metrics! = precompute_metrics_kernel!(Architectures.device(arch), workgroup, worksize)
 
-    event = curvilinear_metrics!(grid, Δxᶠᶜ, Δxᶜᶠ, Δxᶠᶠ, Δxᶜᶜ, Azᶠᶜ, Azᶜᶠ, Azᶠᶠ, Azᶜᶜ; dependencies=device_event(arch))
-    wait(event)
+    curvilinear_metrics!(grid, Δxᶠᶜ, Δxᶜᶠ, Δxᶠᶠ, Δxᶜᶜ, Azᶠᶜ, Azᶜᶠ, Azᶠᶠ, Azᶜᶜ)
+
 
     return nothing
 end
@@ -514,9 +511,8 @@ end
 function precompute_Δy_metrics(grid::LatitudeLongitudeGrid, Δyᶠᶜ, Δyᶜᶠ)
     arch = grid.architecture
     precompute_Δy! = precompute_Δy_kernel!(Architectures.device(arch), 16, length(grid.Δφᵃᶜᵃ) - 1)
-    event = precompute_Δy!(grid, Δyᶠᶜ, Δyᶜᶠ; dependencies=device_event(arch))
+    precompute_Δy!(grid, Δyᶠᶜ, Δyᶜᶠ)
     
-    wait(event)
     return Δyᶠᶜ, Δyᶜᶠ
 end
 
diff --git a/src/ImmersedBoundaries/mask_immersed_field.jl b/src/ImmersedBoundaries/mask_immersed_field.jl
index 0ad154a6ab..3a4a0f1426 100644
--- a/src/ImmersedBoundaries/mask_immersed_field.jl
+++ b/src/ImmersedBoundaries/mask_immersed_field.jl
@@ -1,19 +1,19 @@
 using KernelAbstractions: @kernel, @index
-using KernelAbstractions: NoneEvent
 using Statistics
-using Oceananigans.Architectures: architecture, device_event
+using Oceananigans.Architectures: architecture
 using Oceananigans.Fields: location, ZReducedField, Field
 
 instantiate(X) = X()
 
-mask_immersed_field!(field, grid, loc, value) = NoneEvent()
+mask_immersed_field!(field, grid, loc, value) = nothing
 mask_immersed_field!(field::Field, value=zero(eltype(field.grid))) =
     mask_immersed_field!(field, field.grid, location(field), value)
 
 function mask_immersed_field!(field::Field, grid::ImmersedBoundaryGrid, loc, value)
     arch = architecture(field)
     loc = instantiate.(loc)
-    return launch!(arch, grid, :xyz, _mask_immersed_field!, field, loc, grid, value; dependencies = device_event(arch))
+    launch!(arch, grid, :xyz, _mask_immersed_field!, field, loc, grid, value)
+    return nothing
 end
 
 @kernel function _mask_immersed_field!(field, loc, grid, value)
@@ -21,16 +21,15 @@ end
     @inbounds field[i, j, k] = scalar_mask(i, j, k, grid, grid.immersed_boundary, loc..., value, field)
 end
 
-mask_immersed_reduced_field_xy!(field,     args...; kw...) = NoneEvent()
+mask_immersed_reduced_field_xy!(field,     args...; kw...) = nothing
 mask_immersed_reduced_field_xy!(field::ZReducedField, value=zero(eltype(field.grid)); k) =
     mask_immersed_reduced_field_xy!(field, field.grid, location(field), value; k)
 
 function mask_immersed_reduced_field_xy!(field::ZReducedField, grid::ImmersedBoundaryGrid, loc, value; k)
     arch = architecture(field)
     loc = instantiate.(loc)
-    return launch!(arch, grid, :xy,
-                   _mask_immersed_reduced_field_xy!, field, loc, grid, value, k;
-                   dependencies = device_event(arch))
+    launch!(arch, grid, :xy, _mask_immersed_reduced_field_xy!, field, loc, grid, value, k)
+    return nothing
 end
 
 @kernel function _mask_immersed_reduced_field_xy!(field, loc, grid, value, k)
@@ -42,7 +41,7 @@ end
 ##### mask_immersed_velocities for NonhydrostaticModel
 #####
 
-mask_immersed_velocities!(U, arch, grid) = tuple(NoneEvent())
+mask_immersed_velocities!(U, arch, grid) = nothing
 
 #####
 ##### Masking for GridFittedBoundary
diff --git a/src/LagrangianParticleTracking/update_particle_properties.jl b/src/LagrangianParticleTracking/update_particle_properties.jl
index d37ea525aa..94156d25ad 100644
--- a/src/LagrangianParticleTracking/update_particle_properties.jl
+++ b/src/LagrangianParticleTracking/update_particle_properties.jl
@@ -149,9 +149,8 @@ function update_particle_properties!(lagrangian_particles, model, Δt)
     # Advect particles
 
     advect_particles_kernel! = _advect_particles!(device(arch), workgroup, worksize)
-
-    advect_particles_event = advect_particles_kernel!(lagrangian_particles.properties, lagrangian_particles.restitution, model.grid, Δt,
-                                                      datatuple(model.velocities))
+    advect_particles_kernel!(lagrangian_particles.properties, lagrangian_particles.restitution, model.grid, Δt, datatuple(model.velocities))
+    
     return nothing
 end
 
diff --git a/src/Models/HydrostaticFreeSurfaceModels/barotropic_pressure_correction.jl b/src/Models/HydrostaticFreeSurfaceModels/barotropic_pressure_correction.jl
index 31ac11ce44..c396187a2c 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/barotropic_pressure_correction.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/barotropic_pressure_correction.jl
@@ -17,26 +17,23 @@ pressure_correct_velocities!(model::ExplicitFreeSurfaceHFSM, Δt; kwargs...) = n
 ##### Barotropic pressure correction for models with a free surface
 #####
 
-function pressure_correct_velocities!(model::ImplicitFreeSurfaceHFSM, Δt;
-                                      dependencies = device_event(model.architecture))
+function pressure_correct_velocities!(model::ImplicitFreeSurfaceHFSM, Δt)
 
-    event = launch!(model.architecture, model.grid, :xyz,
-                    _barotropic_pressure_correction,
-                    model.velocities,
-                    model.grid,
-                    Δt,
-                    model.free_surface.gravitational_acceleration,
-                    model.free_surface.η,
-                    dependencies = dependencies)
-
-    wait(device(model.architecture), event)
+    # Kernel launches no longer take `dependencies` or return an event, so there is nothing to pass or wait on.
+    launch!(model.architecture, model.grid, :xyz,
+            _barotropic_pressure_correction,
+            model.velocities,
+            model.grid,
+            Δt,
+            model.free_surface.gravitational_acceleration,
+            model.free_surface.η)
 
     return nothing
 end
 
-calculate_free_surface_tendency!(grid, model::ImplicitFreeSurfaceHFSM, dependencies) = NoneEvent()
+calculate_free_surface_tendency!(grid, model::ImplicitFreeSurfaceHFSM) = nothing
 
-function pressure_correct_velocities!(model::SplitExplicitFreeSurfaceHFSM, Δt; dependecies = nothing)
+function pressure_correct_velocities!(model::SplitExplicitFreeSurfaceHFSM, Δt)
     u, v, _ = model.velocities
     grid = model.grid 
     barotropic_split_explicit_corrector!(u, v, model.free_surface, grid)
diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index 7bc7ed9ec2..bd2882738b 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -1,7 +1,6 @@
 import Oceananigans.TimeSteppers: calculate_tendencies!
 import Oceananigans: tracer_tendency_kernel_function
 
-using Oceananigans.Architectures: device_event
 using Oceananigans: fields, prognostic_fields, TimeStepCallsite, TendencyCallsite, UpdateStateCallsite
 using Oceananigans.Utils: work_layout
 using Oceananigans.Fields: immersed_boundary_condition
@@ -38,27 +37,26 @@ function calculate_tendencies!(model::HydrostaticFreeSurfaceModel, callbacks)
     return nothing
 end
 
-function calculate_free_surface_tendency!(grid, model, dependencies)
+function calculate_free_surface_tendency!(grid, model)
 
     arch = architecture(grid)
 
-    Gη_event = launch!(arch, grid, :xy,
-                       calculate_hydrostatic_free_surface_Gη!, model.timestepper.Gⁿ.η,
-                       grid,
-                       model.velocities,
-                       model.free_surface,
-                       model.tracers,
-                       model.auxiliary_fields,
-                       model.forcing,
-                       model.clock;
-                       dependencies = dependencies)
-
-    return Gη_event
+    launch!(arch, grid, :xy,
+            calculate_hydrostatic_free_surface_Gη!, model.timestepper.Gⁿ.η,
+            grid,
+            model.velocities,
+            model.free_surface,
+            model.tracers,
+            model.auxiliary_fields,
+            model.forcing,
+            model.clock)
+
+    return nothing
 end
     
 
 """ Calculate momentum tendencies if momentum is not prescribed."""
-function calculate_hydrostatic_momentum_tendencies!(model, velocities; dependencies = device_event(model))
+function calculate_hydrostatic_momentum_tendencies!(model, velocities)
 
     grid = model.grid
     arch = architecture(grid)
@@ -86,19 +84,17 @@ function calculate_hydrostatic_momentum_tendencies!(model, velocities; dependenc
     
     only_active_cells = use_only_active_cells(grid)
 
-    Gu_event = launch!(arch, grid, :xyz,
-                       calculate_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, u_kernel_args...;
-                       dependencies = dependencies, only_active_cells)
-
-    Gv_event = launch!(arch, grid, :xyz,
-                       calculate_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, v_kernel_args...;
-                       dependencies = dependencies, only_active_cells)
+    launch!(arch, grid, :xyz,
+            calculate_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, u_kernel_args...;
+            only_active_cells)
 
-    Gη_event = calculate_free_surface_tendency!(grid, model, dependencies)
+    launch!(arch, grid, :xyz,
+            calculate_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, v_kernel_args...;
+            only_active_cells)
 
-    events = [Gu_event, Gv_event, Gη_event]
+    calculate_free_surface_tendency!(grid, model)
 
-    return events
+    return nothing
 end
 
 using Oceananigans.TurbulenceClosures.CATKEVerticalDiffusivities: FlavorOfCATKE
@@ -243,12 +239,10 @@ end
 ##### Boundary condributions to hydrostatic free surface model
 #####
 
-function apply_flux_bcs!(Gcⁿ, events, c, arch, barrier, args...)
-    x_bcs_event = apply_x_bcs!(Gcⁿ, c, arch, barrier, args...)
-    y_bcs_event = apply_y_bcs!(Gcⁿ, c, arch, barrier, args...)
-    z_bcs_event = apply_z_bcs!(Gcⁿ, c, arch, barrier, args...)
-
-    push!(events, x_bcs_event, y_bcs_event, z_bcs_event)
+function apply_flux_bcs!(Gcⁿ, c, arch, barrier, args...)
+    apply_x_bcs!(Gcⁿ, c, arch, barrier, args...)
+    apply_y_bcs!(Gcⁿ, c, arch, barrier, args...)
+    apply_z_bcs!(Gcⁿ, c, arch, barrier, args...)
 
     return nothing
 end
diff --git a/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl b/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
index cb61e462bb..504f873f9d 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
@@ -1,4 +1,4 @@
-using Oceananigans.Architectures: device, device_event
+using Oceananigans.Architectures: device
 using Oceananigans.Operators: div_xyᶜᶜᶜ, Δzᶜᶜᶜ
 
 """
@@ -12,20 +12,8 @@ w^{n+1} = -∫ [∂/∂x (u^{n+1}) + ∂/∂y (v^{n+1})] dz
 """
 compute_w_from_continuity!(model) = compute_w_from_continuity!(model.velocities, model.architecture, model.grid)
 
-function compute_w_from_continuity!(velocities, arch, grid)
-
-    event = launch!(arch,
-                    grid,
-                    :xy,
-                    _compute_w_from_continuity!,
-                    velocities,
-                    grid,
-                    dependencies = device_event(arch))
-
-    wait(device(arch), event)
-
-    return nothing
-end
+compute_w_from_continuity!(velocities, arch, grid) = 
+    launch!(arch, grid, :xy, _compute_w_from_continuity!, velocities, grid)
 
 @kernel function _compute_w_from_continuity!(U, grid)
     i, j = @index(Global, NTuple)
diff --git a/src/Models/HydrostaticFreeSurfaceModels/explicit_free_surface.jl b/src/Models/HydrostaticFreeSurfaceModels/explicit_free_surface.jl
index 93690426d5..672322068f 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/explicit_free_surface.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/explicit_free_surface.jl
@@ -1,4 +1,3 @@
-using Oceananigans.Architectures: device_event
 using Oceananigans.Grids: AbstractGrid
 using Oceananigans.Operators: ∂xᶠᶜᶜ, ∂yᶜᶠᶜ
 using Oceananigans.BoundaryConditions: regularize_field_boundary_conditions
@@ -50,20 +49,13 @@ end
 ##### Time stepping
 #####
 
-function ab2_step_free_surface!(free_surface::ExplicitFreeSurface, model, Δt, χ, prognostic_field_events) 
-    @apply_regionally prognostic_field_events = explicit_ab2_step_free_surface!(free_surface, model, Δt, χ, prognostic_field_events)
-    return prognostic_field_events
-end
-
-# ab2_step_free_surface!(free_surface::ExplicitFreeSurface, model, Δt, χ, prognostic_field_events) =
-#     @apply_regionally explicit_ab2_step_free_surface!(free_surface, model, Δt, χ, prognostic_field_events)
+ab2_step_free_surface!(free_surface::ExplicitFreeSurface, model, Δt, χ) = 
+    @apply_regionally explicit_ab2_step_free_surface!(free_surface, model, Δt, χ)
 
-function explicit_ab2_step_free_surface!(free_surface, model, Δt, χ, prognostic_field_events) 
-    
-    free_surface_event = launch!(model.architecture, model.grid, :xy,
-                                _explicit_ab2_step_free_surface!, free_surface.η, Δt, χ,
-                                model.timestepper.Gⁿ.η, model.timestepper.G⁻.η, size(model.grid, 3))
-end
+explicit_ab2_step_free_surface!(free_surface, model, Δt, χ) =
+    launch!(model.architecture, model.grid, :xy,
+            _explicit_ab2_step_free_surface!, free_surface.η, Δt, χ,
+            model.timestepper.Gⁿ.η, model.timestepper.G⁻.η, size(model.grid, 3))
 
 #####
 ##### Kernel
diff --git a/src/Models/HydrostaticFreeSurfaceModels/fft_based_implicit_free_surface_solver.jl b/src/Models/HydrostaticFreeSurfaceModels/fft_based_implicit_free_surface_solver.jl
index 592351ed23..4c27418110 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/fft_based_implicit_free_surface_solver.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/fft_based_implicit_free_surface_solver.jl
@@ -99,12 +99,10 @@ function compute_implicit_free_surface_right_hand_side!(rhs, implicit_solver::FF
     grid = implicit_solver.three_dimensional_grid
     Lz = grid.Lz
 
-    event = launch!(arch, grid, :xy,
-                    fft_implicit_free_surface_right_hand_side!,
-                    rhs, grid, g, Lz, Δt, ∫ᶻQ, η,
-                    dependencies = device_event(arch))
+    launch!(arch, grid, :xy,
+            fft_implicit_free_surface_right_hand_side!,
+            rhs, grid, g, Lz, Δt, ∫ᶻQ, η)
     
-    wait(device(arch), event)
     return nothing
 end
 
diff --git a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_ab2_step.jl b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_ab2_step.jl
index 9b7d40f739..51ff76c5e3 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_ab2_step.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_ab2_step.jl
@@ -1,4 +1,3 @@
-using Oceananigans.Architectures: device_event
 using Oceananigans.Fields: location
 using Oceananigans.TimeSteppers: ab2_step_field!
 using Oceananigans.TurbulenceClosures: implicit_step!
@@ -12,13 +11,10 @@ import Oceananigans.TimeSteppers: ab2_step!
 function ab2_step!(model::HydrostaticFreeSurfaceModel, Δt, χ)
 
     # Step locally velocity and tracers
-    @apply_regionally prognostic_field_events = local_ab2_step!(model, Δt, χ)
+    @apply_regionally local_ab2_step!(model, Δt, χ)
 
     # blocking step for implicit free surface, non blocking for explicit
-    prognostic_field_events = ab2_step_free_surface!(model.free_surface, model, Δt, χ, prognostic_field_events)
-
-    # waiting all the ab2 steps (velocities, free_surface and tracers to complete)
-    @apply_regionally wait(device(model.architecture), prognostic_field_events)
+    ab2_step_free_surface!(model.free_surface, model, Δt, χ)
 
     return nothing
 end
@@ -31,13 +27,8 @@ function local_ab2_step!(model, Δt, χ)
         barotropic_mode!(sefs.state.U, sefs.state.V, model.grid, u, v)
     end
 
-    explicit_velocity_step_events = ab2_step_velocities!(model.velocities, model, Δt, χ)
-    explicit_tracer_step_events   = ab2_step_tracers!(model.tracers, model, Δt, χ)
-    
-    prognostic_field_events = (tuple(explicit_velocity_step_events...),
-                               tuple(explicit_tracer_step_events...))
-
-    return prognostic_field_events    
+    ab2_step_velocities!(model.velocities, model, Δt, χ)
+    ab2_step_tracers!(model.tracers, model, Δt, χ)
 end
 
 #####
@@ -46,17 +37,13 @@ end
 
 function ab2_step_velocities!(velocities, model, Δt, χ)
 
-    # Launch velocity update kernels
-    explicit_velocity_step_events = []
-
     for (i, name) in enumerate((:u, :v))
         Gⁿ = model.timestepper.Gⁿ[name]
         G⁻ = model.timestepper.G⁻[name]
         velocity_field = model.velocities[name]
 
-        event = launch!(model.architecture, model.grid, :xyz,
-                        ab2_step_field!, velocity_field, Δt, χ, Gⁿ, G⁻,
-                        dependencies = device_event(model))
+        launch!(model.architecture, model.grid, :xyz,
+                ab2_step_field!, velocity_field, Δt, χ, Gⁿ, G⁻)
 
         # TODO: let next implicit solve depend on previous solve + explicit velocity step
         # Need to distinguish between solver events and tendency calculation events.
@@ -67,13 +54,10 @@ function ab2_step_velocities!(velocities, model, Δt, χ)
                        model.diffusivity_fields,
                        nothing,
                        model.clock, 
-                       Δt,
-                       dependencies = event)
-
-        push!(explicit_velocity_step_events, event)
+                       Δt)
     end
 
-    return explicit_velocity_step_events
+    return nothing
 end
 
 #####
@@ -82,22 +66,19 @@ end
 
 const EmptyNamedTuple = NamedTuple{(),Tuple{}}
 
-ab2_step_tracers!(::EmptyNamedTuple, model, Δt, χ) = [NoneEvent()]
+ab2_step_tracers!(::EmptyNamedTuple, model, Δt, χ) = nothing
 
 function ab2_step_tracers!(tracers, model, Δt, χ)
 
     # Tracer update kernels
-    explicit_tracer_step_events = []
-
     for (tracer_index, tracer_name) in enumerate(propertynames(tracers))
         Gⁿ = model.timestepper.Gⁿ[tracer_name]
         G⁻ = model.timestepper.G⁻[tracer_name]
         tracer_field = tracers[tracer_name]
         closure = model.closure
 
-        event = launch!(model.architecture, model.grid, :xyz,
-                        ab2_step_field!, tracer_field, Δt, χ, Gⁿ, G⁻,
-                        dependencies = device_event(model))
+        launch!(model.architecture, model.grid, :xyz,
+                ab2_step_field!, tracer_field, Δt, χ, Gⁿ, G⁻)
 
         implicit_step!(tracer_field,
                        model.timestepper.implicit_solver,
@@ -105,12 +86,9 @@ function ab2_step_tracers!(tracers, model, Δt, χ)
                        model.diffusivity_fields,
                        Val(tracer_index),
                        model.clock,
-                       Δt,
-                       dependencies = event)
-
-        push!(explicit_tracer_step_events, event)
+                       Δt)
     end
 
-    return explicit_tracer_step_events
+    return nothing
 end
 
diff --git a/src/Models/HydrostaticFreeSurfaceModels/implicit_free_surface.jl b/src/Models/HydrostaticFreeSurfaceModels/implicit_free_surface.jl
index dc89c8e92c..75ddc32b87 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/implicit_free_surface.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/implicit_free_surface.jl
@@ -8,7 +8,6 @@ using Oceananigans.Fields
 using Oceananigans.Utils: prettytime
 
 using Adapt
-using KernelAbstractions: NoneEvent
 
 struct ImplicitFreeSurface{E, G, B, I, M, S} <: AbstractFreeSurface{E, G}
     η :: E
@@ -119,10 +118,10 @@ end
 """
 Implicitly step forward η.
 """
-ab2_step_free_surface!(free_surface::ImplicitFreeSurface, model, Δt, χ, prognostic_field_events) =
-    implicit_free_surface_step!(free_surface::ImplicitFreeSurface, model, Δt, χ, prognostic_field_events)
+ab2_step_free_surface!(free_surface::ImplicitFreeSurface, model, Δt, χ) =
+    implicit_free_surface_step!(free_surface::ImplicitFreeSurface, model, Δt, χ)
 
-function implicit_free_surface_step!(free_surface::ImplicitFreeSurface, model, Δt, χ, prognostic_field_events)
+function implicit_free_surface_step!(free_surface::ImplicitFreeSurface, model, Δt, χ)
     η      = free_surface.η
     g      = free_surface.gravitational_acceleration
     rhs    = free_surface.implicit_step_solver.right_hand_side
@@ -130,7 +129,6 @@ function implicit_free_surface_step!(free_surface::ImplicitFreeSurface, model, 
     solver = free_surface.implicit_step_solver
     arch   = model.architecture
  
-    @apply_regionally prognostic_field_events = wait_velocity_event(arch,  prognostic_field_events)
     fill_halo_regions!(model.velocities)
 
     # Compute right hand side of implicit free surface equation
@@ -148,10 +146,6 @@ function implicit_free_surface_step!(free_surface::ImplicitFreeSurface, model, 
 
     fill_halo_regions!(η)
     
-    return prognostic_field_events
-end
-
-function wait_velocity_event(arch, prognostic_field_events)
     return nothing
 end
 
diff --git a/src/Models/HydrostaticFreeSurfaceModels/matrix_implicit_free_surface_solver.jl b/src/Models/HydrostaticFreeSurfaceModels/matrix_implicit_free_surface_solver.jl
index 4019a5acc4..cbd99cada8 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/matrix_implicit_free_surface_solver.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/matrix_implicit_free_surface_solver.jl
@@ -89,12 +89,10 @@ function compute_implicit_free_surface_right_hand_side!(rhs,
     grid = solver.grid
     arch = architecture(grid)
 
-    event = launch!(arch, grid, :xy,
-                    implicit_linearized_free_surface_right_hand_side!,
-                    rhs, grid, g, Δt, ∫ᶻQ, η,
-		            dependencies = device_event(arch))
+    launch!(arch, grid, :xy,
+            implicit_linearized_free_surface_right_hand_side!,
+            rhs, grid, g, Δt, ∫ᶻQ, η)
     
-    wait(device(arch), event)
     return nothing
 end
 
@@ -123,12 +121,9 @@ function compute_matrix_coefficients(vertically_integrated_areas, grid, gravitat
     ∫Ax = vertically_integrated_areas.xᶠᶜᶜ
     ∫Ay = vertically_integrated_areas.yᶜᶠᶜ
 
-    event_c = launch!(arch, grid, :xy, _compute_coefficients!,
-                      diag, Ax, Ay, ∫Ax, ∫Ay, grid, gravitational_acceleration,
-                      dependencies = device_event(arch))
+    launch!(arch, grid, :xy, _compute_coefficients!,
+            diag, Ax, Ay, ∫Ax, ∫Ay, grid, gravitational_acceleration)
   
-    wait(event_c)
-
     return (Ax, Ay, Az, C, diag)
 end
 
diff --git a/src/Models/HydrostaticFreeSurfaceModels/mg_implicit_free_surface_solver.jl b/src/Models/HydrostaticFreeSurfaceModels/mg_implicit_free_surface_solver.jl
index 4ee586cdef..e80442784d 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/mg_implicit_free_surface_solver.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/mg_implicit_free_surface_solver.jl
@@ -121,11 +121,8 @@ function Az_∇h²ᶜᶜᶜ_linear_operation!(L_ηⁿ⁺¹, ηⁿ⁺¹, ∫ᶻ_A
     arch = architecture(L_ηⁿ⁺¹)
     fill_halo_regions!(ηⁿ⁺¹)
 
-    event = launch!(arch, grid, :xy, _Az_∇h²ᶜᶜᶜ_linear_operation!,
-                    L_ηⁿ⁺¹, grid,  ηⁿ⁺¹, ∫ᶻ_Axᶠᶜᶜ, ∫ᶻ_Ayᶜᶠᶜ,
-                    dependencies = device_event(arch))
-
-    wait(device(arch), event)
+    launch!(arch, grid, :xy, _Az_∇h²ᶜᶜᶜ_linear_operation!,
+            L_ηⁿ⁺¹, grid,  ηⁿ⁺¹, ∫ᶻ_Axᶠᶜᶜ, ∫ᶻ_Ayᶜᶠᶜ)
 
     return nothing
 end
@@ -201,11 +198,9 @@ function compute_implicit_free_surface_right_hand_side!(rhs, implicit_solver::MG
     arch = architecture(solver)
     grid = solver.grid
 
-    event = launch!(arch, grid, :xy,
-                    implicit_free_surface_right_hand_side!,
-                    rhs, grid, g, Δt, ∫ᶻQ, η,
-                    dependencies = device_event(arch))
+    launch!(arch, grid, :xy,
+            implicit_free_surface_right_hand_side!,
+            rhs, grid, g, Δt, ∫ᶻQ, η)
     
-    wait(device(arch), event)
     return nothing
 end
diff --git a/src/Models/HydrostaticFreeSurfaceModels/pcg_implicit_free_surface_solver.jl b/src/Models/HydrostaticFreeSurfaceModels/pcg_implicit_free_surface_solver.jl
index fe6b883e0a..c1511b1495 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/pcg_implicit_free_surface_solver.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/pcg_implicit_free_surface_solver.jl
@@ -81,9 +81,8 @@ build_implicit_step_solver(::Val{:PreconditionedConjugateGradient}, grid, settin
 function solve!(η, implicit_free_surface_solver::PCGImplicitFreeSurfaceSolver, rhs, g, Δt)
     # Take explicit step first? We haven't found improvement from this yet, but perhaps it will
     # help eventually.
-    #event = explicit_ab2_step_free_surface!(free_surface, model, Δt, χ)
-    #wait(device(model.architecture), event)
-
+    #explicit_ab2_step_free_surface!(free_surface, model, Δt, χ)
+    
     ∫ᶻA = implicit_free_surface_solver.vertically_integrated_lateral_areas
     solver = implicit_free_surface_solver.preconditioned_conjugate_gradient_solver
     
@@ -100,12 +99,10 @@ function compute_implicit_free_surface_right_hand_side!(rhs, implicit_solver::PC
     arch = architecture(solver)
     grid = solver.grid
 
-    event = launch!(arch, grid, :xy,
-                    implicit_free_surface_right_hand_side!,
-                    rhs, grid, g, Δt, ∫ᶻQ, η,
-		            dependencies = device_event(arch))
+    launch!(arch, grid, :xy,
+            implicit_free_surface_right_hand_side!,
+            rhs, grid, g, Δt, ∫ᶻQ, η)
     
-    wait(device(arch), event)
     return nothing
 end
 
@@ -132,11 +129,8 @@ function implicit_free_surface_linear_operation!(L_ηⁿ⁺¹, ηⁿ⁺¹, ∫
     arch = architecture(L_ηⁿ⁺¹)
     fill_halo_regions!(ηⁿ⁺¹)
 
-    event = launch!(arch, grid, :xy, _implicit_free_surface_linear_operation!,
-                    L_ηⁿ⁺¹, grid,  ηⁿ⁺¹, ∫ᶻ_Axᶠᶜᶜ, ∫ᶻ_Ayᶜᶠᶜ, g, Δt,
-                    dependencies = device_event(arch))
-
-    wait(device(arch), event)
+    launch!(arch, grid, :xy, _implicit_free_surface_linear_operation!,
+            L_ηⁿ⁺¹, grid,  ηⁿ⁺¹, ∫ᶻ_Axᶠᶜᶜ, ∫ᶻ_Ayᶜᶠᶜ, g, Δt)
 
     return nothing
 end
@@ -187,12 +181,10 @@ add to the rhs - H⁻¹ ∇H ⋅ ∇ηⁿ to the rhs...
     Az = grid.Δxᶜᵃᵃ * grid.Δyᵃᶜᵃ # assume horizontal regularity
     Lz = grid.Lz 
 
-    event = launch!(arch, grid, :xy,
-                    fft_preconditioner_right_hand_side!,
-                    poisson_solver.storage, r, η, grid, Az, Lz,
-                    dependencies = device_event(arch))
+    launch!(arch, grid, :xy,
+            fft_preconditioner_right_hand_side!,
+            poisson_solver.storage, r, η, grid, Az, Lz)
 
-    wait(device(arch), event)
 
     return solve!(P_r, preconditioner, poisson_solver.storage, g, Δt)
 end
@@ -272,11 +264,8 @@ function diagonally_dominant_precondition!(P_r, r, ∫ᶻ_Axᶠᶜᶜ, ∫ᶻ_Ay
 
     fill_halo_regions!(r)
 
-    event = launch!(arch, grid, :xy, _diagonally_dominant_precondition!,
-                    P_r, grid, r, ∫ᶻ_Axᶠᶜᶜ, ∫ᶻ_Ayᶜᶠᶜ, g, Δt,
-                    dependencies = device_event(arch))
-
-    wait(device(arch), event)
+    launch!(arch, grid, :xy, _diagonally_dominant_precondition!,
+            P_r, grid, r, ∫ᶻ_Axᶠᶜᶜ, ∫ᶻ_Ayᶜᶠᶜ, g, Δt)
 
     return nothing
 end
diff --git a/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl b/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl
index 97493d9c49..3acc2c8c37 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl
@@ -76,7 +76,7 @@ end
 @inline datatuple(obj::PrescribedVelocityFields) = (; u = datatuple(obj.u), v = datatuple(obj.v), w = datatuple(obj.w))
 
 ab2_step_velocities!(::PrescribedVelocityFields, args...) = nothing
-ab2_step_free_surface!(::Nothing, model, Δt, χ, prognostic_field_events) = nothing 
+ab2_step_free_surface!(::Nothing, model, Δt, χ) = nothing 
 compute_w_from_continuity!(::PrescribedVelocityFields, args...) = nothing
 
 validate_velocity_boundary_conditions(::PrescribedVelocityFields) = nothing
@@ -90,7 +90,7 @@ FreeSurface(free_surface::ImplicitFreeSurface{Nothing}, ::PrescribedVelocityFiel
 hydrostatic_prognostic_fields(::PrescribedVelocityFields, ::Nothing, tracers) = tracers
 calculate_hydrostatic_momentum_tendencies!(model, ::PrescribedVelocityFields; kwargs...) = []
 
-apply_flux_bcs!(::Nothing, c, arch, events, barrier, clock, model_fields) = nothing
+apply_flux_bcs!(::Nothing, c, arch, clock, model_fields) = nothing
 
 Adapt.adapt_structure(to, velocities::PrescribedVelocityFields) =
     PrescribedVelocityFields(Adapt.adapt(to, velocities.u),
diff --git a/src/Models/HydrostaticFreeSurfaceModels/single_column_model_mode.jl b/src/Models/HydrostaticFreeSurfaceModels/single_column_model_mode.jl
index 6bdac8b8ba..86cf37de0a 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/single_column_model_mode.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/single_column_model_mode.jl
@@ -1,4 +1,3 @@
-using KernelAbstractions: NoneEvent
 using CUDA: @allowscalar
 
 using Oceananigans.Grids: Flat, Bounded
@@ -36,7 +35,7 @@ validate_tracer_advection(tracer_advection::Nothing, ::SingleColumnGrid) = nothi
 ##### Time-step optimizations
 #####
 
-calculate_free_surface_tendency!(::SingleColumnGrid, args...) = NoneEvent()
+calculate_free_surface_tendency!(::SingleColumnGrid, args...) = nothing
 
 # Fast state update and halo filling
 
diff --git a/src/Models/HydrostaticFreeSurfaceModels/store_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/store_hydrostatic_free_surface_tendencies.jl
index a32edc47da..410d96d9b0 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/store_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/store_hydrostatic_free_surface_tendencies.jl
@@ -1,10 +1,9 @@
-using KernelAbstractions: @index, @kernel, NoneEvent
+using KernelAbstractions: @index, @kernel
 
 using Oceananigans.TimeSteppers:  store_field_tendencies!
 
 using Oceananigans: prognostic_fields
 using Oceananigans.Grids: AbstractGrid
-using Oceananigans.Architectures: device_event
 
 using Oceananigans.Utils: launch!
 
@@ -16,18 +15,15 @@ import Oceananigans.TimeSteppers: store_tendencies!
     @inbounds Gη⁻[i, j, grid.Nz+1] = Gη⁰[i, j, grid.Nz+1]
 end
 
-store_free_surface_tendency!(free_surface, model, barrier) = NoneEvent()
+store_free_surface_tendency!(free_surface, model) = nothing
 
-function store_free_surface_tendency!(::ExplicitFreeSurface, model, barrier)
+function store_free_surface_tendency!(::ExplicitFreeSurface, model)
 
-    event = launch!(model.architecture, model.grid, :xy,
-                    _store_free_surface_tendency!,
-                    model.timestepper.G⁻.η,
-                    model.grid,
-                    model.timestepper.Gⁿ.η,
-                    dependencies = barrier)
-
-    return event
+    launch!(model.architecture, model.grid, :xy,
+            _store_free_surface_tendency!,
+            model.timestepper.G⁻.η,
+            model.grid,
+            model.timestepper.Gⁿ.η)
 end
 
 """ Store previous source terms before updating them. """
diff --git a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
index 6bfb47447e..dbe679cc9b 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
@@ -1,5 +1,4 @@
 using Oceananigans.Architectures
-using Oceananigans.Architectures: device_event
 using Oceananigans.BoundaryConditions
 using Oceananigans.TurbulenceClosures: calculate_diffusivities!
 using Oceananigans.ImmersedBoundaries: mask_immersed_field!, mask_immersed_reduced_field_xy!
diff --git a/src/Models/Models.jl b/src/Models/Models.jl
index 7d9e20dc79..f53bb68a52 100644
--- a/src/Models/Models.jl
+++ b/src/Models/Models.jl
@@ -9,9 +9,8 @@ export
 
 using Oceananigans: AbstractModel
 
-import Oceananigans.Architectures: device_event, architecture
+import Oceananigans.Architectures: architecture
 
-device_event(model::AbstractModel) = device_event(model.architecture)
 architecture(model::AbstractModel) = model.architecture
 
 abstract type AbstractNonhydrostaticModel{TS} <: AbstractModel{TS} end
diff --git a/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl b/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl
index 85f0106ae4..0062f10fdd 100644
--- a/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl
+++ b/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl
@@ -60,6 +60,5 @@ include("update_nonhydrostatic_model_state.jl")
 include("pressure_correction.jl")
 include("nonhydrostatic_tendency_kernel_functions.jl")
 include("calculate_nonhydrostatic_tendencies.jl")
-include("correct_nonhydrostatic_immersed_tendencies.jl")
 
 end # module
diff --git a/src/Models/NonhydrostaticModels/correct_nonhydrostatic_immersed_tendencies.jl b/src/Models/NonhydrostaticModels/correct_nonhydrostatic_immersed_tendencies.jl
deleted file mode 100644
index 0b2e19078e..0000000000
--- a/src/Models/NonhydrostaticModels/correct_nonhydrostatic_immersed_tendencies.jl
+++ /dev/null
@@ -1,66 +0,0 @@
-using Oceananigans.Grids: xnode, ynode, znode, Center, AbstractGrid
-
-import Oceananigans.TimeSteppers: correct_immersed_tendencies!
-
-"""
-    correct_immersed_tendencies!(model, Δt, γⁿ, ζⁿ)
-    
-Correct the tendency terms to implement no-slip boundary conditions on an immersed boundary
-without the contribution from the non-hydrostatic pressure. Makes velocity vanish within the
-immersed surface.
-"""
-
-correct_immersed_tendencies!(model::NonhydrostaticModel, Δt, γⁿ, ζⁿ) =
-    correct_immersed_tendencies!(model, model.immersed_boundary, Δt, γⁿ, ζⁿ)
-
-# if no immersed boundary, do nothing (no cost)
-correct_immersed_tendencies!(model, ::Nothing, Δt, γⁿ, ζⁿ) = nothing
-
-# otherwise, unpack the model
-function correct_immersed_tendencies!(model, immersed_boundary, Δt, γⁿ, ζⁿ)
-
-    workgroup, worksize = work_layout(model.grid, :xyz)
-
-    barrier = Event(device(model.architecture))
-
-    correct_immersed_tendencies_kernel! = _correct_immersed_tendencies!(device(model.architecture), workgroup, worksize)
-    
-    # event we want to occur, evaluate using kernel function
-    correct_tendencies_event =
-        correct_immersed_tendencies_kernel!(model.timestepper.Gⁿ,
-                                            model.grid,
-                                            immersed_boundary,
-                                            model.timestepper.G⁻,
-                                            model.velocities,
-                                            Δt, γⁿ, ζⁿ,
-                                            dependencies=barrier)
-
-    # wait for these things to happen before continuing in calculations
-    wait(device(model.architecture), correct_tendencies_event)
-
-    return nothing
-end
-
-@kernel function _correct_immersed_tendencies!(Gⁿ, grid::AbstractGrid{FT}, immersed, G⁻, velocities, Δt, γⁿ, ζⁿ) where FT
-    i, j, k = @index(Global, NTuple)
-    
-    # Evaluate x, y, z at cell centers to determine if node is immersed
-    x = xnode(Center(), i, grid)
-    y = ynode(Center(), j, grid)
-    z = znode(Center(), k, grid)
-
-    @inbounds begin
-        # correcting velocity tendency terms: if immersd boundary gives true then correct tendency, otherwise don't (it's a fluid node)
-        Gⁿ.u[i, j, k] = ifelse(immersed(x, y, z),
-                               - (velocities.u[i, j, k] + ζⁿ * Δt * G⁻.u[i, j, k]) / (γⁿ * Δt),
-                               Gⁿ.u[i, j, k])
-
-        Gⁿ.v[i, j, k] = ifelse(immersed(x, y, z),
-                               - (velocities.v[i, j, k] + ζⁿ * Δt * G⁻.v[i, j, k]) / (γⁿ * Δt),
-                               Gⁿ.v[i, j, k])
-
-        Gⁿ.w[i, j, k] = ifelse(immersed(x, y, z),
-                               - (velocities.w[i, j, k] + ζⁿ * Δt * G⁻.w[i, j, k]) / (γⁿ * Δt),
-                               Gⁿ.w[i, j, k])
-    end
-end
diff --git a/src/Models/NonhydrostaticModels/pressure_correction.jl b/src/Models/NonhydrostaticModels/pressure_correction.jl
index f3be399fa0..f2040ad724 100644
--- a/src/Models/NonhydrostaticModels/pressure_correction.jl
+++ b/src/Models/NonhydrostaticModels/pressure_correction.jl
@@ -41,15 +41,12 @@ end
 "Update the solution variables (velocities and tracers)."
 function pressure_correct_velocities!(model::NonhydrostaticModel, Δt)
 
-    event = launch!(model.architecture, model.grid, :xyz,
-                    _pressure_correct_velocities!,
-                    model.velocities,
-                    model.grid,
-                    Δt,
-                    model.pressures.pNHS,
-                    dependencies = device_event(model.architecture)) 
-
-    wait(device(model.architecture), event)
-
+    launch!(model.architecture, model.grid, :xyz,
+            _pressure_correct_velocities!,
+            model.velocities,
+            model.grid,
+            Δt,
+            model.pressures.pNHS)
+    
     return nothing
 end
diff --git a/src/Models/NonhydrostaticModels/set_nonhydrostatic_model.jl b/src/Models/NonhydrostaticModels/set_nonhydrostatic_model.jl
index fa7b574999..7594ba9cfa 100644
--- a/src/Models/NonhydrostaticModels/set_nonhydrostatic_model.jl
+++ b/src/Models/NonhydrostaticModels/set_nonhydrostatic_model.jl
@@ -43,7 +43,7 @@ function set!(model::NonhydrostaticModel; enforce_incompressibility=true, kwargs
 
     # Apply a mask
     foreach(mask_immersed_field!, model.tracers)
-    velocity_masking_events = mask_immersed_velocities!(model.velocities, model.architecture, model.grid)
+    mask_immersed_velocities!(model.velocities, model.architecture, model.grid)
 
     update_state!(model)
 
diff --git a/src/Models/NonhydrostaticModels/solve_for_pressure.jl b/src/Models/NonhydrostaticModels/solve_for_pressure.jl
index ec6e462068..5df2fd538e 100644
--- a/src/Models/NonhydrostaticModels/solve_for_pressure.jl
+++ b/src/Models/NonhydrostaticModels/solve_for_pressure.jl
@@ -1,6 +1,5 @@
 using Oceananigans.Operators
 using Oceananigans.Solvers: FFTBasedPoissonSolver, FourierTridiagonalPoissonSolver, solve!
-using Oceananigans.Architectures: device_event
 using Oceananigans.Distributed: DistributedFFTBasedPoissonSolver
 
 using PencilArrays: Permutation
@@ -41,10 +40,8 @@ function solve_for_pressure!(pressure, solver::DistributedFFTBasedPoissonSolver,
     arch = architecture(solver)
     grid = solver.local_grid
 
-    rhs_event = launch!(arch, grid, :xyz, calculate_permuted_pressure_source_term_fft_based_solver!,
-                        rhs, grid, Δt, U★, solver.input_permutation, dependencies = device_event(arch))
-
-    wait(device(arch), rhs_event)
+    launch!(arch, grid, :xyz, calculate_permuted_pressure_source_term_fft_based_solver!,
+            rhs, grid, Δt, U★, solver.input_permutation)
 
     # Solve pressure Poisson equation for pressure, given rhs
     solve!(pressure, solver)
@@ -59,10 +56,8 @@ function solve_for_pressure!(pressure, solver::FFTBasedPoissonSolver, Δt, U★)
     arch = architecture(solver)
     grid = solver.grid
 
-    rhs_event = launch!(arch, grid, :xyz, calculate_pressure_source_term_fft_based_solver!,
-                        rhs, grid, Δt, U★, dependencies = device_event(arch))
-
-    wait(device(arch), rhs_event)
+    launch!(arch, grid, :xyz, calculate_pressure_source_term_fft_based_solver!,
+            rhs, grid, Δt, U★)
 
     # Solve pressure Poisson given for pressure, given rhs
     solve!(pressure, solver, rhs)
@@ -77,10 +72,8 @@ function solve_for_pressure!(pressure, solver::FourierTridiagonalPoissonSolver,
     arch = architecture(solver)
     grid = solver.grid
 
-    rhs_event = launch!(arch, grid, :xyz, calculate_pressure_source_term_fourier_tridiagonal_solver!,
-                        rhs, grid, Δt, U★, dependencies = device_event(arch))
-
-    wait(device(arch), rhs_event)
+    launch!(arch, grid, :xyz, calculate_pressure_source_term_fourier_tridiagonal_solver!,
+            rhs, grid, Δt, U★)
 
     # Pressure Poisson rhs, scaled by Δzᶜᶜᶜ, is stored in solver.source_term:
     solve!(pressure, solver)
diff --git a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
index 5b05df8e2c..f01765a5b2 100644
--- a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
+++ b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
@@ -24,17 +24,9 @@ update_hydrostatic_pressure!(grid, model) = update_hydrostatic_pressure!(model.p
 # Partial cell "algorithm"
 const PCB = PartialCellBottom
 const PCBIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:PCB}
+
 update_hydrostatic_pressure!(pHY′, arch, ibg::PCBIBG, buoyancy, tracers) =
     update_hydrostatic_pressure!(pHY′, arch, ibg.underlying_grid, buoyancy, tracers)
 
-function update_hydrostatic_pressure!(pHY′, arch, grid, buoyancy, tracers)
-
-    pressure_calculation = launch!(arch, grid, :xy, _update_hydrostatic_pressure!,
-                                   pHY′, grid, buoyancy, tracers,
-                                   dependencies = Event(device(arch)))
-
-    # Fill halo regions for pressure
-    wait(device(arch), pressure_calculation)
-    
-    return nothing
-end
+update_hydrostatic_pressure!(pHY′, arch, grid, buoyancy, tracers) =  
+        launch!(arch, grid, :xy, _update_hydrostatic_pressure!, pHY′, grid, buoyancy, tracers)
diff --git a/src/Models/ShallowWaterModels/rk3_substep_shallow_water_model.jl b/src/Models/ShallowWaterModels/rk3_substep_shallow_water_model.jl
index 34fd01d97b..880ae39163 100644
--- a/src/Models/ShallowWaterModels/rk3_substep_shallow_water_model.jl
+++ b/src/Models/ShallowWaterModels/rk3_substep_shallow_water_model.jl
@@ -12,10 +12,10 @@ function rk3_substep!(model::ShallowWaterModel, Δt, γⁿ, ζⁿ)
     substep_tracer_kernel! = rk3_substep_tracer!(device(model.architecture), workgroup, worksize)
 
 
-    solution_event = substep_solution_kernel!(model.solution,
-                                              Δt, γⁿ, ζⁿ,
-                                              model.timestepper.Gⁿ,
-                                              model.timestepper.G⁻)
+    substep_solution_kernel!(model.solution,
+                             Δt, γⁿ, ζⁿ,
+                             model.timestepper.Gⁿ,
+                             model.timestepper.G⁻)
 
 
     for i in 1:length(model.tracers)
diff --git a/src/Models/ShallowWaterModels/shallow_water_diffusion_operators.jl b/src/Models/ShallowWaterModels/shallow_water_diffusion_operators.jl
index cecd688dc7..5dada95b0b 100644
--- a/src/Models/ShallowWaterModels/shallow_water_diffusion_operators.jl
+++ b/src/Models/ShallowWaterModels/shallow_water_diffusion_operators.jl
@@ -1,5 +1,5 @@
 using Oceananigans.Operators
-using Oceananigans.Architectures: device, device_event
+using Oceananigans.Architectures: device
 using Oceananigans.TurbulenceClosures: ExplicitTimeDiscretization, ThreeDimensionalFormulation
 
 using Oceananigans.TurbulenceClosures: 
@@ -52,12 +52,9 @@ function calculate_diffusivities!(diffusivity_fields, closure::ShallowWaterScala
 
     model_fields = shallow_water_fields(model.velocities, model.tracers, model.solution, formulation(model))
     
-    event = launch!(arch, grid, :xyz,
-                    calculate_nonlinear_viscosity!,
-                    diffusivity_fields.νₑ, grid, closure, clock, model_fields,
-                    dependencies = device_event(arch))
-
-    wait(device(arch), event)
+    launch!(arch, grid, :xyz,
+            calculate_nonlinear_viscosity!,
+            diffusivity_fields.νₑ, grid, closure, clock, model_fields)
 
     return nothing
 end
diff --git a/src/MultiRegion/MultiRegion.jl b/src/MultiRegion/MultiRegion.jl
index dbde53002e..86591dcb6c 100644
--- a/src/MultiRegion/MultiRegion.jl
+++ b/src/MultiRegion/MultiRegion.jl
@@ -17,7 +17,7 @@ using OffsetArrays
 using Oceananigans.ImmersedBoundaries: ImmersedBoundaryGrid
 using Oceananigans.Utils: Reference, Iterate
 
-using KernelAbstractions: Event, NoneEvent, @kernel, @index
+using KernelAbstractions: @kernel, @index
 
 import Base: show, length, size
 
diff --git a/src/MultiRegion/multi_region_boundary_conditions.jl b/src/MultiRegion/multi_region_boundary_conditions.jl
index b029653e67..08d70b16cb 100644
--- a/src/MultiRegion/multi_region_boundary_conditions.jl
+++ b/src/MultiRegion/multi_region_boundary_conditions.jl
@@ -1,5 +1,5 @@
 using Oceananigans: instantiated_location
-using Oceananigans.Architectures: arch_array, device_event, device_copy_to!
+using Oceananigans.Architectures: arch_array, device_copy_to!
 using Oceananigans.Operators: assumed_field_location
 using Oceananigans.Fields: reduced_dimensions
 
@@ -67,9 +67,8 @@ function fill_halo_regions!(c::MultiRegionObject, bcs, indices, loc, mrg::MultiR
     halo_tuple = construct_regionally(permute_boundary_conditions, bcs)
 
     for task = 1:3
-        barrier = device_event(arch)
         apply_regionally!(fill_halo_event!, task, halo_tuple, 
-                          c, indices, loc, arch, barrier, mrg, Reference(c.regional_objects), Reference(buffers.regional_objects), 
+                          c, indices, loc, arch, mrg, Reference(c.regional_objects), Reference(buffers.regional_objects), 
                           args...; kwargs...)
     end
 
@@ -87,20 +86,20 @@ for (lside, rside) in zip([:west, :south, :bottom], [:east, :north, :bottom])
     fill_right_halo! = Symbol(:fill_, rside, :_halo!)
 
     @eval begin
-        function $fill_both_halo!(c, left_bc::CBC, right_bc, kernel_size, offset, loc, arch, dep, grid, args...; kwargs...) 
-            event = $fill_right_halo!(c, right_bc, kernel_size, offset, loc, arch, dep, grid, args...; kwargs...)
-            $fill_left_halo!(c, left_bc, kernel_size, offset, loc, arch, event, grid, args...; kwargs...)
-            return NoneEvent()
+        function $fill_both_halo!(c, left_bc::CBC, right_bc, kernel_size, offset, loc, arch, grid, args...; kwargs...) 
+            $fill_right_halo!(c, right_bc, kernel_size, offset, loc, arch, grid, args...; kwargs...)
+            $fill_left_halo!(c, left_bc, kernel_size, offset, loc, arch, grid, args...; kwargs...)
+            return nothing
         end   
         function $fill_both_halo!(c, left_bc, right_bc::CBC, kernel_size, offset, loc, arch, dep, grid, args...; kwargs...) 
-            event = $fill_left_halo!(c, left_bc, kernel_size, offset, loc, arch, dep, grid, args...; kwargs...)
-            $fill_right_halo!(c, right_bc, kernel_size, offset, loc, arch, event, grid, args...; kwargs...)
-            return NoneEvent()
+            $fill_left_halo!(c, left_bc, kernel_size, offset, loc, arch, grid, args...; kwargs...)
+            $fill_right_halo!(c, right_bc, kernel_size, offset, loc, arch, grid, args...; kwargs...)
+            return nothing
         end   
     end
 end
 
-function fill_west_and_east_halo!(c, westbc::CBC, eastbc::CBC, kernel_size, offset, loc, arch, dep, grid, neighbors, buffers, args...; kwargs...)
+function fill_west_and_east_halo!(c, westbc::CBC, eastbc::CBC, kernel_size, offset, loc, arch, grid, neighbors, buffers, args...; kwargs...)
 
     H = halo_size(grid)[1]
     N = size(grid)[1]
@@ -110,8 +109,6 @@ function fill_west_and_east_halo!(c, westbc::CBC, eastbc::CBC, kernel_size, offs
     westdst = buffers[westbc.condition.rank].west.recv
     eastdst = buffers[eastbc.condition.rank].east.recv
 
-    wait(device(arch), dep)
-
     switch_device!(getdevice(w))
     westsrc = buffers[westbc.condition.from_rank].east.send
     westsrc .= view(parent(w), N+1:N+H, :, :)
@@ -127,10 +124,10 @@ function fill_west_and_east_halo!(c, westbc::CBC, eastbc::CBC, kernel_size, offs
     view(parent(c), 1:H, :, :)        .= westdst
     view(parent(c), N+H+1:N+2H, :, :) .= eastdst
 
-    return NoneEvent()
+    return nothing
 end
 
-function fill_south_and_north_halo!(c, southbc::CBC, northbc::CBC, kernel_size, offset, loc, arch, dep, grid, neighbors, buffers, args...; kwargs...)
+function fill_south_and_north_halo!(c, southbc::CBC, northbc::CBC, kernel_size, offset, loc, arch, grid, neighbors, buffers, args...; kwargs...)
 
     H = halo_size(grid)[2]
     N = size(grid)[2]
@@ -140,8 +137,6 @@ function fill_south_and_north_halo!(c, southbc::CBC, northbc::CBC, kernel_size,
     southdst = buffers[southbc.condition.rank].south.recv
     northdst = buffers[northbc.condition.rank].north.recv
 
-    wait(device(arch), dep)
-
     switch_device!(getdevice(s))
     southsrc = buffers[westbc.condition.from_rank].south.send
     southsrc .= view(parent(s), :, N+1:N+H, :)
@@ -157,22 +152,20 @@ function fill_south_and_north_halo!(c, southbc::CBC, northbc::CBC, kernel_size,
     view(parent(c), :, 1:H, :, :)        .= southdst
     view(parent(c), :, N+H+1:N+2H, :, :) .= northdst
 
-    return NoneEvent()
+    return nothing
 end
 
 #####
 ##### Single fill_halo! for Communicating boundary condition 
 #####
     
-function fill_west_halo!(c, bc::CBC, kernel_size, offset, loc, arch, dep, grid, neighbors, buffers, args...; kwargs...)
+function fill_west_halo!(c, bc::CBC, kernel_size, offset, loc, arch, grid, neighbors, buffers, args...; kwargs...)
     
     H = halo_size(grid)[1]
     N = size(grid)[1]
     w = neighbors[bc.condition.from_rank]
     dst = buffers[bc.condition.rank].west.recv
 
-    wait(device(arch), dep)
-
     switch_device!(getdevice(w))
     src = buffers[bc.condition.from_rank].east.send
     src .= view(parent(w), N+1:N+H, :, :)
@@ -187,15 +180,13 @@ function fill_west_halo!(c, bc::CBC, kernel_size, offset, loc, arch, dep, grid,
     return nothing
 end
 
-function fill_east_halo!(c, bc::CBC, kernel_size, offset, loc, arch, dep, grid, neighbors, buffers, args...; kwargs...)
+function fill_east_halo!(c, bc::CBC, kernel_size, offset, loc, arch, grid, neighbors, buffers, args...; kwargs...)
 
     H = halo_size(grid)[1]
     N = size(grid)[1]
     e = neighbors[bc.condition.from_rank]
     dst = buffers[bc.condition.rank].east.recv
 
-    wait(device(arch), dep)
-
     switch_device!(getdevice(e))
     src = buffers[bc.condition.from_rank].west.send
     src .= view(parent(e), H+1:2H, :, :)
@@ -210,15 +201,13 @@ function fill_east_halo!(c, bc::CBC, kernel_size, offset, loc, arch, dep, grid,
     return nothing
 end
 
-function fill_south_halo!(c, bc::CBC, kernel_size, offset, loc, arch, dep, grid, neighbors, buffers, args...; kwargs...)
+function fill_south_halo!(c, bc::CBC, kernel_size, offset, loc, arch, grid, neighbors, buffers, args...; kwargs...)
         
     H = halo_size(grid)[2]
     N = size(grid)[2]
     s = neighbors[bc.condition.from_rank]
     dst = buffers[bc.condition.rank].south.recv
 
-    wait(device(arch), dep)
-
     switch_device!(getdevice(s))
     src = buffers[bc.condition.from_rank].north.send
     src .= view(parent(s), :, N+1:N+H, :)
@@ -233,15 +222,13 @@ function fill_south_halo!(c, bc::CBC, kernel_size, offset, loc, arch, dep, grid,
     return nothing
 end
 
-function fill_north_halo!(c, bc::CBC, kernel_size, offset, loc, arch, dep, grid, neighbors, buffers, args...; kwargs...)
+function fill_north_halo!(c, bc::CBC, kernel_size, offset, loc, arch, grid, neighbors, buffers, args...; kwargs...)
     
     H = halo_size(grid)[2]
     N = size(grid)[2]
     n = neighbors[bc.condition.from_rank]
     dst = buffers[bc.condition.rank].north.recv
-
-    wait(device(arch), dep)
-
+    
     switch_device!(getdevice(n))
     src = buffers[bc.condition.from_rank].south.send
     src .= view(parent(n), :, H+1:2H, :)
diff --git a/src/MultiRegion/unified_implicit_free_surface_solver.jl b/src/MultiRegion/unified_implicit_free_surface_solver.jl
index 72aaa00a38..6c580b4196 100644
--- a/src/MultiRegion/unified_implicit_free_surface_solver.jl
+++ b/src/MultiRegion/unified_implicit_free_surface_solver.jl
@@ -75,16 +75,10 @@ function compute_implicit_free_surface_right_hand_side!(rhs, implicit_solver::Un
     return nothing
 end
 
-function compute_regional_rhs!(rhs, grid, g, Δt, ∫ᶻQ, η, region, partition)
-    arch = architecture(grid)
-    event = launch!(arch, grid, :xy,
+compute_regional_rhs!(rhs, grid, g, Δt, ∫ᶻQ, η, region, partition) = 
+    launch!(architecture(grid), grid, :xy,
                     implicit_linearized_unified_free_surface_right_hand_side!,
-                    rhs, grid, g, Δt, ∫ᶻQ, η, region, partition,
-		            dependencies = device_event(arch))
-
-    wait(device(arch), event)
-    return nothing
-end
+                    rhs, grid, g, Δt, ∫ᶻQ, η, region, partition)
 
 # linearized right hand side
 @kernel function implicit_linearized_unified_free_surface_right_hand_side!(rhs, grid, g, Δt, ∫ᶻQ, η, region, partition)
@@ -115,13 +109,8 @@ function solve!(η, implicit_free_surface_solver::UnifiedImplicitFreeSurfaceSolv
     return nothing
 end
 
-function redistribute_lhs!(η, sol, arch, grid, region, partition)
-
-    event = launch!(arch, grid, :xy, _redistribute_lhs!, η, sol, region, grid, partition,
-		            dependencies = device_event(arch))
-
-    wait(device(arch), event)
-end
+redistribute_lhs!(η, sol, arch, grid, region, partition) = 
+    launch!(arch, grid, :xy, _redistribute_lhs!, η, sol, region, grid, partition)
 
 # linearized right hand side
 @kernel function _redistribute_lhs!(η, sol, region, grid, partition)
diff --git a/src/Solvers/batched_tridiagonal_solver.jl b/src/Solvers/batched_tridiagonal_solver.jl
index 5dcf632a97..a65ed56ebc 100644
--- a/src/Solvers/batched_tridiagonal_solver.jl
+++ b/src/Solvers/batched_tridiagonal_solver.jl
@@ -1,4 +1,4 @@
-using Oceananigans.Architectures: device_event, arch_array
+using Oceananigans.Architectures: arch_array
 
 import Oceananigans.Architectures: architecture
 
@@ -59,7 +59,7 @@ end
 @inline get_coefficient(a::Base.Callable, i, j, k, grid, ::Nothing, args...) = a(i, j, k, grid, args...)
 
 """
-    solve!(ϕ, solver::BatchedTridiagonalSolver, rhs, args...; dependencies = device_event(solver.architecture))
+    solve!(ϕ, solver::BatchedTridiagonalSolver, rhs, args...)
                                       
 
 Solve the batched tridiagonal system of linear equations with right hand side
@@ -71,16 +71,13 @@ The result is stored in `ϕ` which must have size `(grid.Nx, grid.Ny, grid.Nz)`.
 
 Reference implementation per Numerical Recipes, Press et. al 1992 (§ 2.4).
 """
-function solve!(ϕ, solver::BatchedTridiagonalSolver, rhs, args...; dependencies = device_event(architecture(solver))) 
+function solve!(ϕ, solver::BatchedTridiagonalSolver, rhs, args... )
 
     a, b, c, t, parameters = solver.a, solver.b, solver.c, solver.t, solver.parameters
     grid = solver.grid
 
-    event = launch!(architecture(solver), grid, :xy,
-                    solve_batched_tridiagonal_system_kernel!, ϕ, a, b, c, rhs, t, grid, parameters, args...,
-                    dependencies = dependencies)
-
-    wait(device(architecture(solver)), event)
+    launch!(architecture(solver), grid, :xy,
+                    solve_batched_tridiagonal_system_kernel!, ϕ, a, b, c, rhs, t, grid, parameters, args...)
 
     return nothing
 end
diff --git a/src/Solvers/fft_based_poisson_solver.jl b/src/Solvers/fft_based_poisson_solver.jl
index 34f8e294e2..7c5e81cf9e 100644
--- a/src/Solvers/fft_based_poisson_solver.jl
+++ b/src/Solvers/fft_based_poisson_solver.jl
@@ -1,4 +1,3 @@
-using Oceananigans.Architectures: device_event
 using Oceananigans.Fields: indices, offset_compute_index
 
 import Oceananigans.Architectures: architecture
@@ -114,9 +113,8 @@ function solve!(ϕ, solver::FFTBasedPoissonSolver, b, m=0)
     # Apply backward transforms in order
     [transform!(ϕc, solver.buffer) for transform! in solver.transforms.backward]
 
-    copy_event = launch!(arch, solver.grid, :xyz, copy_real_component!, ϕ, ϕc, indices(ϕ), dependencies=device_event(arch))
-    wait(device(arch), copy_event)
-
+    launch!(arch, solver.grid, :xyz, copy_real_component!, ϕ, ϕc, indices(ϕ))
+    
     return ϕ
 end
 
diff --git a/src/Solvers/fourier_tridiagonal_poisson_solver.jl b/src/Solvers/fourier_tridiagonal_poisson_solver.jl
index e8102079ad..93e39517af 100644
--- a/src/Solvers/fourier_tridiagonal_poisson_solver.jl
+++ b/src/Solvers/fourier_tridiagonal_poisson_solver.jl
@@ -1,5 +1,4 @@
 using Oceananigans.Operators: Δzᵃᵃᶜ, Δzᵃᵃᶠ
-using Oceananigans.Architectures: device_event
 import Oceananigans.Architectures: architecture
 
 struct FourierTridiagonalPoissonSolver{G, B, R, S, β, T}
@@ -52,9 +51,8 @@ function FourierTridiagonalPoissonSolver(grid, planner_flag=FFTW.PATIENT)
 
     # Compute diagonal coefficients for each grid point
     diagonal = arch_array(arch, zeros(Nx, Ny, Nz))
-    event = launch!(arch, grid, :xy, compute_main_diagonals!, diagonal, grid, λx, λy, dependencies=device_event(arch))
-    wait(device(arch), event)
-
+    launch!(arch, grid, :xy, compute_main_diagonals!, diagonal, grid, λx, λy)
+    
     # Set up batched tridiagonal solver
     btsolver = BatchedTridiagonalSolver(grid;
                                         lower_diagonal = lower_diagonal,
@@ -94,9 +92,8 @@ function solve!(x, solver::FourierTridiagonalPoissonSolver, b=nothing)
     # so that the solution has zero-mean.
     ϕ .= ϕ .- mean(ϕ)
 
-    copy_event = launch!(arch, solver.grid, :xyz, copy_real_component!, x, ϕ, indices(x), dependencies=device_event(arch))
-    wait(device(arch), copy_event)
-
+    launch!(arch, solver.grid, :xyz, copy_real_component!, x, ϕ, indices(x))
+    
     return nothing
 end
 
@@ -111,8 +108,7 @@ function set_source_term!(solver::FourierTridiagonalPoissonSolver, source_term)
     arch = architecture(solver)
     solver.source_term .= source_term
 
-    event = launch!(arch, grid, :xyz, multiply_by_Δzᵃᵃᶜ!, solver.source_term, grid, dependencies=Event(device(arch)))
-    wait(device(arch), event)
+    launch!(arch, grid, :xyz, multiply_by_Δzᵃᵃᶜ!, solver.source_term, grid)
 
     return nothing
 end
diff --git a/src/Solvers/heptadiagonal_iterative_solver.jl b/src/Solvers/heptadiagonal_iterative_solver.jl
index 106969c02d..a76238c869 100644
--- a/src/Solvers/heptadiagonal_iterative_solver.jl
+++ b/src/Solvers/heptadiagonal_iterative_solver.jl
@@ -1,5 +1,5 @@
 using Oceananigans.Architectures
-using Oceananigans.Architectures: architecture, arch_array, unsafe_free!, device_event
+using Oceananigans.Architectures: architecture, arch_array, unsafe_free!
 using Oceananigans.Grids: interior_parent_indices, topology
 using Oceananigans.Utils: heuristic_workgroup
 using KernelAbstractions: @kernel, @index
@@ -175,9 +175,8 @@ function matrix_from_coefficients(arch, grid, coeffs, reduced_dim)
 
     # Initialize elements which vary during the simulation (as a function of Δt)
     loop! = _initialize_variable_diagonal!(Architectures.device(arch), heuristic_workgroup(N...), N)
-    event = loop!(diag, D, N; dependencies = device_event(arch))
-    wait(event)
-
+    loop!(diag, D, N)
+    
     # Fill matrix elements that stay constant in time
     fill_core_matrix!(coeff_d, coeff_x, coeff_y, coeff_z, Ax, Ay, Az, C, N, dims)
 
diff --git a/src/Solvers/index_permutations.jl b/src/Solvers/index_permutations.jl
index 18ef14d217..ce53a753af 100644
--- a/src/Solvers/index_permutations.jl
+++ b/src/Solvers/index_permutations.jl
@@ -83,12 +83,8 @@ unpermute_kernel! = Dict(
     3 => unpermute_z_indices!
 )
 
-function permute_indices!(dst, src, arch, grid, dim)
-    event = launch!(arch, grid, :xyz, permute_kernel![dim], dst, src, grid, dependencies=Event(device(arch)))
-    wait(device(arch), event)
-end
-
-function unpermute_indices!(dst, src, arch, grid, dim)
-    event = launch!(arch, grid, :xyz, unpermute_kernel![dim], dst, src, grid, dependencies=Event(device(arch)))
-    wait(device(arch), event)
-end
+permute_indices!(dst, src, arch, grid, dim) = 
+    launch!(arch, grid, :xyz, permute_kernel![dim], dst, src, grid)
+    
+unpermute_indices!(dst, src, arch, grid, dim) = 
+    launch!(arch, grid, :xyz, unpermute_kernel![dim], dst, src, grid)
diff --git a/src/Solvers/matrix_solver_utils.jl b/src/Solvers/matrix_solver_utils.jl
index aaaccda22b..b0865d69ae 100644
--- a/src/Solvers/matrix_solver_utils.jl
+++ b/src/Solvers/matrix_solver_utils.jl
@@ -1,5 +1,5 @@
 using Oceananigans.Architectures
-using Oceananigans.Architectures: device, device_event
+using Oceananigans.Architectures: device
 import Oceananigans.Architectures: architecture, unified_array
 using CUDA, CUDA.CUSPARSE
 using KernelAbstractions: @kernel, @index
@@ -33,9 +33,8 @@ using SparseArrays: fkeep!
 function update_diag!(constr, arch, M, N, diag, Δt, disp)   
     colptr, rowval, nzval = unpack_constructors(arch, constr)
     loop! = _update_diag!(device(arch), min(256, M), M)
-    event = loop!(nzval, colptr, rowval, diag, Δt, disp; dependencies=device_event(arch))
-    wait(device(arch), event)
-
+    loop!(nzval, colptr, rowval, diag, Δt, disp)
+    
     constr = constructors(arch, M, N, (colptr, rowval, nzval))
 end
 
diff --git a/src/Solvers/sparse_preconditioners.jl b/src/Solvers/sparse_preconditioners.jl
index ecc182b33b..679ec9a583 100644
--- a/src/Solvers/sparse_preconditioners.jl
+++ b/src/Solvers/sparse_preconditioners.jl
@@ -135,17 +135,15 @@ function asymptotic_diagonal_inverse_preconditioner(A::AbstractMatrix; asymptoti
     invdiag = arch_array(arch, zeros(eltype(nzval), M))
 
     loop! = _get_inv_diag!(dev, 256, M)
-    event = loop!(invdiag, colptr, rowval, nzval; dependencies=Event(dev))
-    wait(dev, event)
-
+    loop!(invdiag, colptr, rowval, nzval)
+    
     if asymptotic_order == 0
         Minv_cpu = spdiagm(0=>arch_array(CPU(), invdiag))
         Minv     = arch_sparse_matrix(arch, Minv_cpu)
     elseif asymptotic_order == 1
         loop! = _initialize_asymptotic_diagonal_inverse_preconditioner_first_order!(dev, 256, M)
-        event = loop!(nzval, colptr, rowval, invdiag; dependencies=Event(dev))
-        wait(dev, event)
-    
+        loop!(nzval, colptr, rowval, invdiag)
+        
         constr_new = (colptr, rowval, nzval)
         Minv = arch_sparse_matrix(arch, constructors(arch, M, M, constr_new))
     else
diff --git a/src/TimeSteppers/quasi_adams_bashforth_2.jl b/src/TimeSteppers/quasi_adams_bashforth_2.jl
index 5a952f0535..27b0e0066f 100644
--- a/src/TimeSteppers/quasi_adams_bashforth_2.jl
+++ b/src/TimeSteppers/quasi_adams_bashforth_2.jl
@@ -1,6 +1,5 @@
 using Oceananigans.Fields: FunctionField, location
 using Oceananigans.TurbulenceClosures: implicit_step!
-using Oceananigans.Architectures: device_event
 using Oceananigans.Utils: @apply_regionally, apply_regionally!
 
 mutable struct QuasiAdamsBashforth2TimeStepper{FT, GT, IT} <: AbstractTimeStepper
@@ -117,7 +116,6 @@ function ab2_step!(model, Δt, χ)
 
     workgroup, worksize = work_layout(model.grid, :xyz)
     arch = model.architecture
-    barrier = device_event(arch)
     step_field_kernel! = ab2_step_field!(device(arch), workgroup, worksize)
     model_fields = prognostic_fields(model)
 
diff --git a/src/TimeSteppers/store_tendencies.jl b/src/TimeSteppers/store_tendencies.jl
index c6bae9c6af..06d179bd3a 100644
--- a/src/TimeSteppers/store_tendencies.jl
+++ b/src/TimeSteppers/store_tendencies.jl
@@ -1,6 +1,5 @@
 using Oceananigans: prognostic_fields
 using Oceananigans.Grids: AbstractGrid
-using Oceananigans.Architectures: device_event
 
 using Oceananigans.Utils: launch!
 
diff --git a/src/TurbulenceClosures/implicit_explicit_time_discretization.jl b/src/TurbulenceClosures/implicit_explicit_time_discretization.jl
index 023898f877..46cd3c7ddb 100644
--- a/src/TurbulenceClosures/implicit_explicit_time_discretization.jl
+++ b/src/TurbulenceClosures/implicit_explicit_time_discretization.jl
@@ -1,5 +1,3 @@
-using KernelAbstractions: NoneEvent
-
 using Oceananigans.Utils: arch_array
 using Oceananigans.Grids: AbstractGrid
 
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
index 4c32544461..439a31d199 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
@@ -197,12 +197,9 @@ function calculate_diffusivities!(diffusivities, closure::FlavorOfCATKE, model)
     clock = model.clock
     top_tracer_bcs = NamedTuple(c => tracers[c].boundary_conditions.top for c in propertynames(tracers))
 
-    event = launch!(arch, grid, :xyz,
-                    calculate_CATKE_diffusivities!,
-                    diffusivities, grid, closure, velocities, tracers, buoyancy, clock, top_tracer_bcs,
-                    dependencies = device_event(arch))
-
-    wait(device(arch), event)
+    launch!(arch, grid, :xyz,
+            calculate_CATKE_diffusivities!,
+            diffusivities, grid, closure, velocities, tracers, buoyancy, clock, top_tracer_bcs)
 
     return nothing
 end
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/convective_adjustment_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/convective_adjustment_vertical_diffusivity.jl
index 7e94411466..dc7f8279a9 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/convective_adjustment_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/convective_adjustment_vertical_diffusivity.jl
@@ -1,4 +1,4 @@
-using Oceananigans.Architectures: architecture, device_event, arch_array
+using Oceananigans.Architectures: architecture, arch_array
 using Oceananigans.AbstractOperations: KernelFunctionOperation
 using Oceananigans.BuoyancyModels: ∂z_b
 using Oceananigans.Operators: ℑzᵃᵃᶜ
@@ -95,13 +95,10 @@ function calculate_diffusivities!(diffusivities, closure::FlavorOfCAVD, model)
     tracers = model.tracers
     buoyancy = model.buoyancy
 
-    event = launch!(arch, grid, :xyz,
-                    ## If we can figure out how to only precompute the "stability" of a cell:
-                    # compute_stability!, diffusivities, grid, closure, tracers, buoyancy,
-                    compute_convective_adjustment_diffusivities!, diffusivities, grid, closure, tracers, buoyancy,
-                    dependencies = device_event(arch))
-
-    wait(device(arch), event)
+    launch!(arch, grid, :xyz,
+            ## If we can figure out how to only precompute the "stability" of a cell:
+            # compute_stability!, diffusivities, grid, closure, tracers, buoyancy,
+            compute_convective_adjustment_diffusivities!, diffusivities, grid, closure, tracers, buoyancy)
 
     return nothing
 end
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/isopycnal_skew_symmetric_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/isopycnal_skew_symmetric_diffusivity.jl
index 47ade18e33..d5feb60e72 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/isopycnal_skew_symmetric_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/isopycnal_skew_symmetric_diffusivity.jl
@@ -87,11 +87,8 @@ function calculate_diffusivities!(diffusivities, closure::FlavorOfISSD, model)
     tracers = model.tracers
     buoyancy = model.buoyancy
 
-    event = launch!(arch, grid, :xyz,
-                    compute_tapered_R₃₃!, diffusivities.ϵ_R₃₃, grid, closure, tracers, buoyancy,
-                    dependencies = device_event(arch))
-
-    wait(device(arch), event)
+    launch!(arch, grid, :xyz,
+            compute_tapered_R₃₃!, diffusivities.ϵ_R₃₃, grid, closure, tracers, buoyancy)
 
     return nothing
 end
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl
index c4bd8347b3..a6f1994ec2 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl
@@ -97,12 +97,9 @@ function calculate_diffusivities!(diffusivity_fields, closure::TwoDimensionalLei
     tracers = model.tracers
     buoyancy = model.buoyancy
 
-    event = launch!(arch, grid, :xyz,
-                    calculate_nonlinear_viscosity!,
-                    diffusivity_fields.νₑ, grid, closure, buoyancy, velocities, tracers,
-                    dependencies = device_event(arch))
-
-    wait(device(arch), event)
+    launch!(arch, grid, :xyz,
+            calculate_nonlinear_viscosity!,
+            diffusivity_fields.νₑ, grid, closure, buoyancy, velocities, tracers)
 
     return nothing
 end
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/mews_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/mews_vertical_diffusivity.jl
index 5f05f59638..44ba6fb265 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/mews_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/mews_vertical_diffusivity.jl
@@ -7,7 +7,7 @@ using Oceananigans.Utils
 using Oceananigans.Fields
 using Oceananigans.Operators
 
-using Oceananigans.Architectures: device, device_event
+using Oceananigans.Architectures: device
 using Oceananigans.Fields: ZeroField
 using Oceananigans.BoundaryConditions: FluxBoundaryCondition, FieldBoundaryConditions
 using Oceananigans.BuoyancyModels: ∂x_b, ∂y_b, ∂z_b
@@ -125,18 +125,15 @@ function calculate_diffusivities!(diffusivities, closure::MEWS, model)
     buoyancy = model.buoyancy
     velocities = model.velocities
 
-    event = launch!(arch, grid, :xyz,
-                    compute_mews_diffusivities!,
-                    diffusivities,
-                    grid,
-                    closure,
-                    velocities,
-                    tracers,
-                    buoyancy,
-                    coriolis,
-                    dependencies = device_event(arch))
-
-    wait(device(arch), event)
+    launch!(arch, grid, :xyz,
+            compute_mews_diffusivities!,
+            diffusivities,
+            grid,
+            closure,
+            velocities,
+            tracers,
+            buoyancy,
+            coriolis)
 
     return nothing
 end
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index 9709c99726..217bb576e3 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -1,4 +1,4 @@
-using Oceananigans.Architectures: architecture, device_event, arch_array
+using Oceananigans.Architectures: architecture, arch_array
 using Oceananigans.BuoyancyModels: ∂z_b
 using Oceananigans.Operators
 using Oceananigans.Operators: ℑzᵃᵃᶜ
@@ -120,19 +120,16 @@ function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model)
     velocities = model.velocities
     top_tracer_bcs = NamedTuple(c => tracers[c].boundary_conditions.top for c in propertynames(tracers))
 
-    event = launch!(arch, grid, :xyz,
-                    compute_ri_based_diffusivities!,
-                    diffusivities,
-                    grid,
-                    closure,
-                    velocities,
-                    tracers,
-                    buoyancy,
-                    top_tracer_bcs,
-                    clock,
-                    dependencies = device_event(arch))
-
-    wait(device(arch), event)
+    launch!(arch, grid, :xyz,
+            compute_ri_based_diffusivities!,
+            diffusivities,
+            grid,
+            closure,
+            velocities,
+            tracers,
+            buoyancy,
+            top_tracer_bcs,
+            clock)
 
     return nothing
 end
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl b/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl
index af198811af..dd3416a64a 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl
@@ -114,12 +114,9 @@ function calculate_diffusivities!(diffusivity_fields, closure::SmagorinskyLilly,
     velocities = model.velocities
     tracers = model.tracers
 
-    event = launch!(arch, grid, :xyz,
-                    calculate_nonlinear_viscosity!,
-                    diffusivity_fields.νₑ, grid, closure, buoyancy, velocities, tracers,
-                    dependencies = device_event(arch))
-
-    wait(device(arch), event)
+    launch!(arch, grid, :xyz,
+            calculate_nonlinear_viscosity!,
+            diffusivity_fields.νₑ, grid, closure, buoyancy, velocities, tracers)
 
     return nothing
 end
diff --git a/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl b/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
index 423406fdd4..890e7f7a14 100644
--- a/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
+++ b/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
@@ -29,7 +29,7 @@ using Oceananigans.Solvers: BatchedTridiagonalSolver, solve!
 ##### Batched Tridiagonal solver for implicit diffusion
 #####
 
-implicit_step!(field, ::Nothing, args...; kwargs...) = NoneEvent()
+implicit_step!(field, ::Nothing, args...; kwargs...) = nothing
 implicit_diffusion_solver(::ExplicitTimeDiscretization, args...; kwargs...) = nothing
 
 #####
diff --git a/test/dependencies_for_poisson_solvers.jl b/test/dependencies_for_poisson_solvers.jl
index 4f648563d1..26f5c96267 100644
--- a/test/dependencies_for_poisson_solvers.jl
+++ b/test/dependencies_for_poisson_solvers.jl
@@ -34,10 +34,8 @@ function random_divergent_source_term(grid)
     # Compute the right hand side R = ∇⋅U
     ArrayType = array_type(arch)
     R = zeros(Nx, Ny, Nz) |> ArrayType
-    event = launch!(arch, grid, :xyz, divergence!, grid, U.u.data, U.v.data, U.w.data, R,
-                    dependencies=Event(device(arch)))
-    wait(device(arch), event)
-
+    launch!(arch, grid, :xyz, divergence!, grid, U.u.data, U.v.data, U.w.data, R)
+    
     return R, U
 end
 
@@ -63,19 +61,15 @@ function random_divergence_free_source_term(grid)
     fill_halo_regions!(Rv, nothing, nothing)
     fill_halo_regions!(Rw, nothing, nothing)
 
-    event = launch!(arch, grid, :xy, _compute_w_from_continuity!, U, grid,
-                    dependencies=Event(device(arch)))
-    wait(device(arch), event)
+    launch!(arch, grid, :xy, _compute_w_from_continuity!, U, grid)
 
     fill_halo_regions!(Rw, nothing, nothing)
 
     # Compute the right hand side R = ∇⋅U
     ArrayType = array_type(arch)
     R = zeros(Nx, Ny, Nz) |> ArrayType
-    event = launch!(arch, grid, :xyz, divergence!, grid, Ru.data, Rv.data, Rw.data, R,
-                    dependencies=Event(device(arch)))
-    wait(device(arch), event)
-
+    launch!(arch, grid, :xyz, divergence!, grid, Ru.data, Rv.data, Rw.data, R)
+    
     return R
 end
 
diff --git a/test/dependencies_for_runtests.jl b/test/dependencies_for_runtests.jl
index 6f7eb7dbeb..80a2398543 100644
--- a/test/dependencies_for_runtests.jl
+++ b/test/dependencies_for_runtests.jl
@@ -41,7 +41,7 @@ using Dates: DateTime, Nanosecond
 using Statistics: mean
 using LinearAlgebra: norm
 using NCDatasets: Dataset
-using KernelAbstractions: @kernel, @index, Event
+using KernelAbstractions: @kernel, @index
 
 import Oceananigans.Fields: interior
 import Oceananigans.Utils: launch!, datatuple
diff --git a/test/test_distributed_poisson_solvers.jl b/test/test_distributed_poisson_solvers.jl
index ccb8278627..4cfd82e464 100644
--- a/test/test_distributed_poisson_solvers.jl
+++ b/test/test_distributed_poisson_solvers.jl
@@ -58,10 +58,8 @@ function random_divergent_source_term(grid)
     # Compute the right hand side R = ∇⋅U
     ArrayType = array_type(arch)
     R = zeros(Nx, Ny, Nz) |> ArrayType
-    event = launch!(arch, grid, :xyz, divergence!, grid, U.u.data, U.v.data, U.w.data, R,
-                    dependencies=Event(device(arch)))
-    wait(device(arch), event)
-
+    launch!(arch, grid, :xyz, divergence!, grid, U.u.data, U.v.data, U.w.data, R)
+    
     return R
 end
 
@@ -84,11 +82,7 @@ function divergence_free_poisson_solution_triply_periodic(grid_points, ranks)
     # Solve it
     ϕc = first(solver.storage)
 
-    event = launch!(arch, local_grid, :xyz,
-                    set_distributed_solver_input!, ϕc, R, solver.input_permutation,
-                    dependencies = device_event(arch))
-
-    wait(device(arch), event)
+    launch!(arch, local_grid, :xyz, set_distributed_solver_input!, ϕc, R, solver.input_permutation)
 
     solve!(ϕ, solver)
 
diff --git a/test/test_hydrostatic_free_surface_immersed_boundaries_implicit_solve.jl b/test/test_hydrostatic_free_surface_immersed_boundaries_implicit_solve.jl
index 9c4d1ec2e2..96f14e7801 100644
--- a/test/test_hydrostatic_free_surface_immersed_boundaries_implicit_solve.jl
+++ b/test/test_hydrostatic_free_surface_immersed_boundaries_implicit_solve.jl
@@ -59,8 +59,7 @@ using Oceananigans.Solvers: initialize_AMGX, finalize_AMGX
             v[imm1, jmm1, 1:Nz] .=  1
             v[imm1, jmp1, 1:Nz] .= -1
             
-            events = ((device_event(arch), device_event(arch)), (device_event(arch), device_event(arch)))
-            implicit_free_surface_step!(model.free_surface, model, 1.0, 1.5, events)
+            implicit_free_surface_step!(model.free_surface, model, 1.0, 1.5)
 
             sol = (sol..., model.free_surface.η)
             f  = (f..., model.free_surface)
diff --git a/test/test_implicit_free_surface_solver.jl b/test/test_implicit_free_surface_solver.jl
index 300184e711..828f5e089a 100644
--- a/test/test_implicit_free_surface_solver.jl
+++ b/test/test_implicit_free_surface_solver.jl
@@ -2,7 +2,6 @@ include("dependencies_for_runtests.jl")
 
 using Statistics
 using Oceananigans.BuoyancyModels: g_Earth
-using Oceananigans.Architectures: device_event
 using Oceananigans.Operators
 using Oceananigans.Grids: inactive_cell
 using Oceananigans.Models.HydrostaticFreeSurfaceModels:
@@ -58,10 +57,8 @@ function run_implicit_free_surface_solver_tests(arch, grid, free_surface)
                                         momentum_advection = nothing,
                                         free_surface)
 
-    events = ((device_event(arch), device_event(arch)), (device_event(arch), device_event(arch)))
-
     set_simple_divergent_velocity!(model)
-    implicit_free_surface_step!(model.free_surface, model, Δt, 1.5, events)
+    implicit_free_surface_step!(model.free_surface, model, Δt, 1.5)
 
     acronym = free_surface.solver_method == :Multigrid ? "MG" :
               free_surface.solver_method == :HeptadiagonalIterativeSolver ? "Matrix" :
@@ -199,16 +196,14 @@ end
         @test mat_model.free_surface.implicit_step_solver isa MatrixImplicitFreeSurfaceSolver
         @test  mg_model.free_surface.implicit_step_solver isa MGImplicitFreeSurfaceSolver
 
-        events = ((device_event(arch), device_event(arch)), (device_event(arch), device_event(arch)))
-
         Δt₁ = 900
         Δt₂ = 920.0
         
         for m in (mat_model, pcg_model, fft_model, mg_model)
             set_simple_divergent_velocity!(m)
-            implicit_free_surface_step!(m.free_surface, m, Δt₁, 1.5, events)
-            implicit_free_surface_step!(m.free_surface, m, Δt₁, 1.5, events)
-            implicit_free_surface_step!(m.free_surface, m, Δt₂, 1.5, events)
+            implicit_free_surface_step!(m.free_surface, m, Δt₁, 1.5)
+            implicit_free_surface_step!(m.free_surface, m, Δt₁, 1.5)
+            implicit_free_surface_step!(m.free_surface, m, Δt₂, 1.5)
         end
 
         mat_η = mat_model.free_surface.η
diff --git a/test/test_matrix_poisson_solver.jl b/test/test_matrix_poisson_solver.jl
index db858592c7..5297eb75a0 100644
--- a/test/test_matrix_poisson_solver.jl
+++ b/test/test_matrix_poisson_solver.jl
@@ -51,11 +51,7 @@ function compute_poisson_weights(grid)
     return (Ax, Ay, Az, C, D)
 end
 
-function poisson_rhs!(r, grid)
-    event = launch!(architecture(grid), grid, :xyz, _multiply_by_volume!, r, grid)
-    wait(event)
-    return nothing
-end
+poisson_rhs!(r, grid) = launch!(architecture(grid), grid, :xyz, _multiply_by_volume!, r, grid)
 
 function run_poisson_equation_test(grid)
     arch = architecture(grid)
diff --git a/test/test_multi_region_poisson_solver.jl b/test/test_multi_region_poisson_solver.jl
index 31694d1d89..db9f592fb6 100644
--- a/test/test_multi_region_poisson_solver.jl
+++ b/test/test_multi_region_poisson_solver.jl
@@ -63,11 +63,7 @@ function compute_poisson_weights(grid)
     return (Ax, Ay, Az, C, D)
 end
 
-function poisson_rhs!(r, grid)
-    event = launch!(architecture(grid), grid, :xyz, _multiply_by_volume!, r, grid)
-    wait(event)
-    return nothing
-end
+poisson_rhs!(r, grid) = launch!(architecture(grid), grid, :xyz, _multiply_by_volume!, r, grid)
 
 using Oceananigans.MultiRegion
 using Oceananigans.MultiRegion: UnifiedDiagonalIterativeSolver
diff --git a/test/test_time_stepping.jl b/test/test_time_stepping.jl
index c2c65076f2..891eca8153 100644
--- a/test/test_time_stepping.jl
+++ b/test/test_time_stepping.jl
@@ -127,9 +127,8 @@ function incompressible_in_time(grid, Nt, timestepper)
     end
 
     arch = architecture(grid)
-    event = launch!(arch, grid, :xyz, divergence!, grid, u.data, v.data, w.data, div_U.data, dependencies=Event(device(arch)))
-    wait(device(arch), event)
-
+    launch!(arch, grid, :xyz, divergence!, grid, u.data, v.data, w.data, div_U.data)
+    
     min_div = CUDA.@allowscalar minimum(interior(div_U))
     max_div = CUDA.@allowscalar maximum(interior(div_U))
     max_abs_div = CUDA.@allowscalar maximum(abs, interior(div_U))
diff --git a/test/utils_for_runtests.jl b/test/utils_for_runtests.jl
index 3c0324f711..34f8c47a30 100644
--- a/test/utils_for_runtests.jl
+++ b/test/utils_for_runtests.jl
@@ -1,6 +1,6 @@
 using Oceananigans
 using Statistics
-using KernelAbstractions: @kernel, @index, Event
+using KernelAbstractions: @kernel, @index
 using CUDA
 using Test
 using Printf
@@ -69,9 +69,7 @@ end
 
 function compute_∇²!(∇²ϕ, ϕ, arch, grid)
     fill_halo_regions!(ϕ)
-    child_arch = child_architecture(arch)
-    event = launch!(child_arch, grid, :xyz, ∇²!, ∇²ϕ, grid, ϕ, dependencies=Event(device(child_arch)))
-    wait(device(child_arch), event)
+    launch!(child_arch, grid, :xyz, ∇²!, ∇²ϕ, grid, ϕ)
     fill_halo_regions!(∇²ϕ)
     return nothing
 end
diff --git a/validation/elliptic_solvers/doubly_bounded_poisson.jl b/validation/elliptic_solvers/doubly_bounded_poisson.jl
index 560a5c5675..8f99380e91 100644
--- a/validation/elliptic_solvers/doubly_bounded_poisson.jl
+++ b/validation/elliptic_solvers/doubly_bounded_poisson.jl
@@ -15,7 +15,7 @@ using BenchmarkTools,
       IterativeSolvers
 using Oceananigans.Solvers: initialize_AMGX, finalize_AMGX
 
-using KernelAbstractions: @kernel, @index, Event
+using KernelAbstractions: @kernel, @index
 using Statistics: mean
 
 import Oceananigans.Solvers: precondition!
@@ -54,10 +54,8 @@ end
 function compute_∇²!(∇²φ, φ, arch, grid)
     fill_halo_regions!(φ)
     child_arch = child_architecture(arch)
-    event = launch!(child_arch, grid, :xyz, ∇²!, ∇²φ, grid, φ, dependencies=Event(device(child_arch)))
-    wait(device(child_arch), event)
+    launch!(child_arch, grid, :xyz, ∇²!, ∇²φ, grid)
     fill_halo_regions!(∇²φ)
-
     return nothing
 end
 
diff --git a/validation/elliptic_solvers/triply_bounded_poisson.jl b/validation/elliptic_solvers/triply_bounded_poisson.jl
index b67bd69de5..bdfcb5d858 100644
--- a/validation/elliptic_solvers/triply_bounded_poisson.jl
+++ b/validation/elliptic_solvers/triply_bounded_poisson.jl
@@ -15,7 +15,7 @@ using BenchmarkTools,
       IterativeSolvers,
       GLMakie
 
-using KernelAbstractions: @kernel, @index, Event
+using KernelAbstractions: @kernel, @index
 using Statistics: mean
 
 import Oceananigans.Solvers: precondition!
@@ -71,8 +71,7 @@ end
 function compute_∇²!(∇²φ, φ, arch, grid)
     fill_halo_regions!(φ)
     child_arch = child_architecture(arch)
-    event = launch!(child_arch, grid, :xyz, ∇²!, ∇²φ, grid, φ, dependencies=Event(device(child_arch)))
-    wait(device(child_arch), event)
+    launch!(child_arch, grid, :xyz, ∇²!, ∇²φ, grid, φ)
     fill_halo_regions!(∇²φ)
 
     return nothing
diff --git a/validation/stencils/stencil_calculations.jl b/validation/stencils/stencil_calculations.jl
index 6f44d90c1b..b8eb1a62c0 100644
--- a/validation/stencils/stencil_calculations.jl
+++ b/validation/stencils/stencil_calculations.jl
@@ -38,9 +38,8 @@ function ∇²_KA!(∇²ϕ, ϕ)
 
     fill_halo_regions!(ϕ)
     loop! = _∇²_KA!(device(arch), workgroup, worksize)
-    event = loop!(∇²ϕ, grid, ϕ)
-    wait(device(arch), event)
-
+    loop!(∇²ϕ, grid, ϕ)
+    
     return nothing
 end
 

From ba70556321c4e1290e78a64c743b337bca31a5a4 Mon Sep 17 00:00:00 2001
From: simone-silvestri <silvestri.simone0@gmail.com>
Date: Tue, 14 Feb 2023 14:01:05 -0500
Subject: [PATCH 006/530] compiles...

---
 Manifest.toml                                 | 64 +++++++++----------
 src/Distributed/halo_communication.jl         |  4 --
 src/Distributed/multi_architectures.jl        |  3 +-
 .../split_explicit_free_surface_kernels.jl    | 26 +++-----
 .../update_shallow_water_state.jl             |  2 +-
 .../shallow_water_bickley_jet_regression.jl   |  5 ++
 6 files changed, 45 insertions(+), 59 deletions(-)

diff --git a/Manifest.toml b/Manifest.toml
index e02c5b922a..4a5de2570b 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -1,12 +1,12 @@
 # This file is machine-generated - editing it directly is not advised
 
-julia_version = "1.9.0-beta4"
+julia_version = "1.8.0"
 manifest_format = "2.0"
 project_hash = "e5c066cd371cc92d479d4d0c34bc89f3323ab6b3"
 
 [[deps.AMGX]]
 deps = ["AMGX_jll", "CEnum", "CUDA", "JSON", "Libdl", "SparseArrays"]
-git-tree-sha1 = "e837274ddd2c98d197a5079de76c52bd86c89b1b"
+git-tree-sha1 = "4ce114680290d2989870c99db3a1ba9dd301634f"
 repo-rev = "vc/2.3"
 repo-url = "https://github.com/JuliaGPU/AMGX.jl.git"
 uuid = "c963dde9-0319-47f5-bf0c-b07d3c80ffa6"
@@ -83,7 +83,7 @@ version = "0.1.2"
 
 [[deps.CUDA]]
 deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Preferences", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions"]
-git-tree-sha1 = "ece93002780a9ea0312afa7bd3b8fed37a899aee"
+git-tree-sha1 = "666924b0caa3c8fd067de83b4aefc4b51d0b568f"
 repo-rev = "vc/ka_transition"
 repo-url = "https://github.com/JuliaGPU/CUDA.jl.git"
 uuid = "052768ef-5323-5732-b1bb-66c8b64840ba"
@@ -113,6 +113,12 @@ git-tree-sha1 = "c6d890a52d2c4d55d326439580c3b8d0875a77d9"
 uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 version = "1.15.7"
 
+[[deps.ChangesOfVariables]]
+deps = ["ChainRulesCore", "LinearAlgebra", "Test"]
+git-tree-sha1 = "844b061c104c408b24537482469400af6075aae4"
+uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
+version = "0.1.5"
+
 [[deps.CommonSolve]]
 git-tree-sha1 = "9441451ee712d1aec22edad62db1a9af3dc8d852"
 uuid = "38540f10-b2f7-11e9-35d8-d573e4eb0ff2"
@@ -127,7 +133,7 @@ version = "4.6.0"
 [[deps.CompilerSupportLibraries_jll]]
 deps = ["Artifacts", "Libdl"]
 uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
-version = "1.0.2+0"
+version = "0.5.2+0"
 
 [[deps.Crayons]]
 git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15"
@@ -262,6 +268,12 @@ version = "2018.0.3+2"
 deps = ["Markdown"]
 uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 
+[[deps.InverseFunctions]]
+deps = ["Test"]
+git-tree-sha1 = "49510dfcb407e572524ba94aeae2fced1f3feb0f"
+uuid = "3587e190-3f89-42d0-90ee-14403ec27112"
+version = "0.1.8"
+
 [[deps.IrrationalConstants]]
 git-tree-sha1 = "7fd44fd4ff43fc60815f8e764c0f352b83c49151"
 uuid = "92d709cd-6900-40b7-9082-c6be49f344b6"
@@ -304,7 +316,7 @@ version = "1.12.0"
 
 [[deps.KernelAbstractions]]
 deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"]
-git-tree-sha1 = "2f2f329569f0b627dbaf8e144af1fb36b660ad49"
+git-tree-sha1 = "9687f6699e0a9883a0e5da86a2886eae77af2cd7"
 repo-rev = "vc/nix_dependencies"
 repo-url = "https://github.com/JuliaGPU/KernelAbstractions.jl.git"
 uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
@@ -355,25 +367,15 @@ uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531"
 version = "1.16.1+2"
 
 [[deps.LinearAlgebra]]
-deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"]
+deps = ["Libdl", "libblastrampoline_jll"]
 uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 
 [[deps.LogExpFunctions]]
-deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"]
+deps = ["ChainRulesCore", "ChangesOfVariables", "DocStringExtensions", "InverseFunctions", "IrrationalConstants", "LinearAlgebra"]
 git-tree-sha1 = "680e733c3a0a9cea9e935c8c2184aea6a63fa0b5"
 uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
 version = "0.3.21"
 
-    [deps.LogExpFunctions.extensions]
-    ChainRulesCoreExt = "ChainRulesCore"
-    ChangesOfVariablesExt = "ChangesOfVariables"
-    InverseFunctionsExt = "InverseFunctions"
-
-    [deps.LogExpFunctions.weakdeps]
-    ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
-    ChangesOfVariables = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
-    InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112"
-
 [[deps.Logging]]
 uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
 
@@ -433,7 +435,7 @@ uuid = "a63ad114-7e13-5084-954f-fe012c677804"
 
 [[deps.MozillaCACerts_jll]]
 uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
-version = "2022.10.11"
+version = "2022.2.1"
 
 [[deps.NCDatasets]]
 deps = ["CFTime", "DataStructures", "Dates", "NetCDF_jll", "NetworkOptions", "Printf"]
@@ -466,7 +468,7 @@ version = "1.12.9"
 [[deps.OpenBLAS_jll]]
 deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"]
 uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
-version = "0.3.21+0"
+version = "0.3.20+0"
 
 [[deps.OpenLibm_jll]]
 deps = ["Artifacts", "Libdl"]
@@ -515,9 +517,9 @@ uuid = "4a48f351-57a6-4416-9ec4-c37015456aae"
 version = "0.14.2"
 
 [[deps.Pkg]]
-deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
+deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
 uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
-version = "1.9.0"
+version = "1.8.0"
 
 [[deps.Preferences]]
 deps = ["TOML"]
@@ -612,7 +614,7 @@ version = "1.0.3"
 uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
 
 [[deps.SparseArrays]]
-deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"]
+deps = ["LinearAlgebra", "Random"]
 uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 
 [[deps.SpecialFunctions]]
@@ -629,9 +631,9 @@ version = "0.8.3"
 
 [[deps.StaticArrays]]
 deps = ["LinearAlgebra", "Random", "StaticArraysCore", "Statistics"]
-git-tree-sha1 = "cee507162ecbb677450f20058ca83bd559b6b752"
+git-tree-sha1 = "67d3e75e8af8089ea34ce96974d5468d4a008ca6"
 uuid = "90137ffa-7385-5640-81b9-e52037218182"
-version = "1.5.14"
+version = "1.5.15"
 
 [[deps.StaticArraysCore]]
 git-tree-sha1 = "6b7ba252635a5eff6a0b0664a41ee140a1c9e72a"
@@ -646,7 +648,6 @@ version = "0.3.0"
 [[deps.Statistics]]
 deps = ["LinearAlgebra", "SparseArrays"]
 uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
-version = "1.9.0"
 
 [[deps.Strided]]
 deps = ["LinearAlgebra", "TupleTools"]
@@ -670,15 +671,10 @@ version = "1.10.0"
 deps = ["Libdl", "LinearAlgebra", "Serialization", "SparseArrays"]
 uuid = "4607b0f0-06f3-5cda-b6b1-a6196a1729e9"
 
-[[deps.SuiteSparse_jll]]
-deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"]
-uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c"
-version = "5.10.1+6"
-
 [[deps.TOML]]
 deps = ["Dates"]
 uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
-version = "1.0.3"
+version = "1.0.0"
 
 [[deps.TableTraits]]
 deps = ["IteratorInterfaceExtensions"]
@@ -762,12 +758,12 @@ version = "2.10.3+0"
 [[deps.Zlib_jll]]
 deps = ["Libdl"]
 uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
-version = "1.2.13+0"
+version = "1.2.12+3"
 
 [[deps.libblastrampoline_jll]]
-deps = ["Artifacts", "Libdl"]
+deps = ["Artifacts", "Libdl", "OpenBLAS_jll"]
 uuid = "8e850b90-86db-534c-a0d3-1478176c7d93"
-version = "5.4.0+0"
+version = "5.1.1+0"
 
 [[deps.nghttp2_jll]]
 deps = ["Artifacts", "Libdl"]
diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 3407d21a3d..b60f71e075 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -70,7 +70,6 @@ end
 # TODO: combination of communicating and other boundary conditions in one direction are not implemented yet!
 function fill_halo_regions!(c::OffsetArray, bcs, indices, loc, grid::DistributedGrid, args...; kwargs...)
     arch    = architecture(grid)
-    barrier = Event(device(child_architecture(arch)))
 
     offset = (0, 0)
     x_events_requests = fill_west_and_east_halos!(c, bcs.west, bcs.east, :yz, offset, loc, arch, barrier, grid, args...; kwargs...)
@@ -83,9 +82,6 @@ function fill_halo_regions!(c::OffsetArray, bcs, indices, loc, grid::Distributed
     # Length check needed until this PR is merged: https://github.com/JuliaParallel/MPI.jl/pull/458
     length(mpi_requests) > 0 && MPI.Waitall!(mpi_requests)
 
-    # events = filter(e -> e isa Event, events_and_requests)
-    # # wait(device(child_architecture(arch)), MultiEvent(Tuple(events)))
-
     return nothing
 end
 
diff --git a/src/Distributed/multi_architectures.jl b/src/Distributed/multi_architectures.jl
index f591e32c68..43b6dbc24a 100644
--- a/src/Distributed/multi_architectures.jl
+++ b/src/Distributed/multi_architectures.jl
@@ -1,7 +1,7 @@
 using Oceananigans.Architectures
 using Oceananigans.Grids: topology, validate_tupled_argument
 
-import Oceananigans.Architectures: device, device_event, arch_array, array_type, child_architecture
+import Oceananigans.Architectures: device, arch_array, array_type, child_architecture
 import Oceananigans.Grids: zeros
 
 struct MultiArch{A, R, I, ρ, C, γ} <: AbstractMultiArchitecture
@@ -52,7 +52,6 @@ end
 
 child_architecture(arch::MultiArch)            = arch.child_architecture
 device(arch::AbstractMultiArchitecture)        = device(child_architecture(arch))
-device_event(arch::AbstractMultiArchitecture)  = device_event(child_architecture(arch))
 arch_array(arch::AbstractMultiArchitecture, A) = arch_array(child_architecture(arch), A)
 zeros(FT, arch::MultiArch, N...)               = zeros(FT, child_architecture(arch), N...) 
 array_type(arch::MultiArch)                    = array_type(child_architecture(arch))
diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
index 86eef48c82..394ae5ad39 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
@@ -1,4 +1,4 @@
-using KernelAbstractions: @index, @kernel, Event
+using KernelAbstractions: @index, @kernel
 using KernelAbstractions.Extras.LoopInfo: @unroll
 using Oceananigans.Utils
 using Oceananigans.AbstractOperations: Δz  
@@ -45,22 +45,15 @@ function split_explicit_free_surface_substep!(η, state, auxiliary, settings, ar
 
     fill_halo_regions!(η)
 
-    event = launch!(arch, grid, :xy, split_explicit_free_surface_substep_kernel_1!, 
-            grid, Δτ, η, U, V, Gᵁ, Gⱽ, g, Hᶠᶜ, Hᶜᶠ,
-            dependencies=Event(device(arch)))
-
-    wait(device(arch), event)
+    launch!(arch, grid, :xy, split_explicit_free_surface_substep_kernel_1!, 
+            grid, Δτ, η, U, V, Gᵁ, Gⱽ, g, Hᶠᶜ, Hᶜᶠ)
 
     # U, V has been updated thus need to refill halo
     fill_halo_regions!(U)
     fill_halo_regions!(V)
 
-    event = launch!(arch, grid, :xy, split_explicit_free_surface_substep_kernel_2!, 
-            grid, Δτ, η, U, V, η̅, U̅, V̅, vel_weight, η_weight,
-            dependencies=Event(device(arch)))
-
-    wait(device(arch), event)
-            
+    launch!(arch, grid, :xy, split_explicit_free_surface_substep_kernel_2!, 
+            grid, Δτ, η, U, V, η̅, U̅, V̅, vel_weight, η_weight)
 end
 
 # Barotropic Model Kernels
@@ -113,11 +106,8 @@ function barotropic_split_explicit_corrector!(u, v, free_surface, grid)
     barotropic_mode!(U, V, grid, u, v)
     # add in "good" barotropic mode
 
-    event = launch!(arch, grid, :xyz, barotropic_split_explicit_corrector_kernel!,
-        u, v, U̅, V̅, U, V, Hᶠᶜ, Hᶜᶠ,
-        dependencies = Event(device(arch)))
-
-    wait(device(arch), event)
+    launch!(arch, grid, :xyz, barotropic_split_explicit_corrector_kernel!,
+        u, v, U̅, V̅, U, V, Hᶠᶜ, Hᶜᶠ)
 end
 
 @inline calc_ab2_tendencies(Gⁿ, G⁻, χ) = (convert(eltype(Gⁿ), (1.5)) + χ) * Gⁿ - (convert(eltype(Gⁿ), (0.5)) + χ) * G⁻
@@ -173,5 +163,5 @@ function split_explicit_free_surface_step!(free_surface::SplitExplicitFreeSurfac
 
     fill_halo_regions!(η)
 
-    return NoneEvent()
+    return nothing
 end
\ No newline at end of file
diff --git a/src/Models/ShallowWaterModels/update_shallow_water_state.jl b/src/Models/ShallowWaterModels/update_shallow_water_state.jl
index a9bd5425ba..f9e12161a5 100644
--- a/src/Models/ShallowWaterModels/update_shallow_water_state.jl
+++ b/src/Models/ShallowWaterModels/update_shallow_water_state.jl
@@ -26,7 +26,7 @@ function update_state!(model::ShallowWaterModel, callbacks=[])
     compute_velocities!(model.velocities, formulation(model))
 
     foreach(callbacks) do callback
-        if isa(callback.callsite, UpdateStateCallsite
+        if isa(callback.callsite, UpdateStateCallsite)
             callback(model)
         end
     end
diff --git a/test/regression_tests/shallow_water_bickley_jet_regression.jl b/test/regression_tests/shallow_water_bickley_jet_regression.jl
index fa4c3d32ed..8b2296411b 100644
--- a/test/regression_tests/shallow_water_bickley_jet_regression.jl
+++ b/test/regression_tests/shallow_water_bickley_jet_regression.jl
@@ -89,6 +89,11 @@ function run_shallow_water_regression(arch, formulation; regenerate_data = false
 
         summarize_regression_test(test_fields, truth_fields)
 
+        diff = Bool.(1.0 .- (test_fields.v .≈ truth_fields.v))
+        jldsave("difference_$(formulation).jld2", test_v = test_fields.v, truth_v = truth_fields.v, diff = diff)
+
+        @show findall(diff)
+
         @test all(test_fields.u .≈ truth_fields.u)
         @test all(test_fields.v .≈ truth_fields.v)
         @test all(test_fields.h .≈ truth_fields.h)

From 159c82757da6f2bb829cb40feb81cd24eb3fa581 Mon Sep 17 00:00:00 2001
From: simone-silvestri <silvestri.simone0@gmail.com>
Date: Tue, 14 Feb 2023 14:20:33 -0500
Subject: [PATCH 007/530] fixed fill halo

---
 src/BoundaryConditions/fill_halo_regions.jl   | 12 +++++-----
 .../fill_halo_regions_flux.jl                 | 24 +++++++++----------
 .../fill_halo_regions_open.jl                 | 12 +++++-----
 .../barotropic_pressure_correction.jl         |  5 ++--
 .../vertically_implicit_diffusion_solver.jl   |  8 +++----
 5 files changed, 29 insertions(+), 32 deletions(-)

diff --git a/src/BoundaryConditions/fill_halo_regions.jl b/src/BoundaryConditions/fill_halo_regions.jl
index 3d9c2b3153..e21c5b4dcb 100644
--- a/src/BoundaryConditions/fill_halo_regions.jl
+++ b/src/BoundaryConditions/fill_halo_regions.jl
@@ -172,14 +172,14 @@ end
     end
 end
 
-fill_west_and_east_halo!(c, west_bc, east_bc, size, offset, loc, arch, dep, grid, args...; kwargs...) =
-    launch!(arch, grid, size, _fill_west_and_east_halo!, c, west_bc, east_bc, offset, loc, grid, args...; dependencies=dep, kwargs...)
+fill_west_and_east_halo!(c, west_bc, east_bc, size, offset, loc, arch, grid, args...; kwargs...) =
+    launch!(arch, grid, size, _fill_west_and_east_halo!, c, west_bc, east_bc, offset, loc, grid, args...; kwargs...)
 
-fill_south_and_north_halo!(c, south_bc, north_bc, size, offset, loc, arch, dep, grid, args...; kwargs...) =
-    launch!(arch, grid, size, _fill_south_and_north_halo!, c, south_bc, north_bc, offset, loc, grid, args...; dependencies=dep, kwargs...)
+fill_south_and_north_halo!(c, south_bc, north_bc, size, offset, loc, arch, grid, args...; kwargs...) =
+    launch!(arch, grid, size, _fill_south_and_north_halo!, c, south_bc, north_bc, offset, loc, grid, args...; kwargs...)
 
-fill_bottom_and_top_halo!(c, bottom_bc, top_bc, size, offset, loc, arch, dep, grid, args...; kwargs...) =
-    launch!(arch, grid, size, _fill_bottom_and_top_halo!, c, bottom_bc, top_bc, offset, loc, grid, args...; dependencies=dep, kwargs...)
+fill_bottom_and_top_halo!(c, bottom_bc, top_bc, size, offset, loc, arch, grid, args...; kwargs...) =
+    launch!(arch, grid, size, _fill_bottom_and_top_halo!, c, bottom_bc, top_bc, offset, loc, grid, args...; kwargs...)
 
 #####
 ##### Calculate kernel size and offset for Windowed and Sliced Fields
diff --git a/src/BoundaryConditions/fill_halo_regions_flux.jl b/src/BoundaryConditions/fill_halo_regions_flux.jl
index 2a16deb650..58d2fe9b01 100644
--- a/src/BoundaryConditions/fill_halo_regions_flux.jl
+++ b/src/BoundaryConditions/fill_halo_regions_flux.jl
@@ -84,16 +84,16 @@ end
 ##### Kernel launchers for flux boundary conditions
 #####
 
-fill_west_halo!(c, bc::FBC, kernel_size, offset, loc, arch, dep, grid, args...; kwargs...) = 
-            launch!(arch, grid, kernel_size, fill_flux_west_halo!, c, offset,grid; dependencies=dep, kwargs...)
-fill_east_halo!(c, bc::FBC, kernel_size, offset, loc, arch, dep, grid, args...; kwargs...) = 
-            launch!(arch, grid, kernel_size, fill_flux_east_halo!, c, offset,grid; dependencies=dep, kwargs...)
-fill_south_halo!(c, bc::FBC, kernel_size, offset, loc, arch, dep, grid, args...; kwargs...) = 
-            launch!(arch, grid, kernel_size, fill_flux_south_halo!, c, offset,grid; dependencies=dep, kwargs...)
-fill_north_halo!(c, bc::FBC, kernel_size, offset, loc, arch, dep, grid, args...; kwargs...) = 
-            launch!(arch, grid, kernel_size, fill_flux_north_halo!, c, offset,grid; dependencies=dep, kwargs...)
-fill_bottom_halo!(c, bc::FBC, kernel_size, offset, loc, arch, dep, grid, args...; kwargs...) = 
-            launch!(arch, grid, kernel_size, fill_flux_bottom_halo!, c, offset,grid; dependencies=dep, kwargs...)
-fill_top_halo!(c, bc::FBC, kernel_size, offset, loc, arch, dep, grid, args...; kwargs...) = 
-            launch!(arch, grid, kernel_size, fill_flux_top_halo!, c, offset, grid; dependencies=dep, kwargs...)
+fill_west_halo!(c, bc::FBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = 
+            launch!(arch, grid, kernel_size, fill_flux_west_halo!, c, offset,grid; kwargs...)
+fill_east_halo!(c, bc::FBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = 
+            launch!(arch, grid, kernel_size, fill_flux_east_halo!, c, offset,grid; kwargs...)
+fill_south_halo!(c, bc::FBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = 
+            launch!(arch, grid, kernel_size, fill_flux_south_halo!, c, offset,grid; kwargs...)
+fill_north_halo!(c, bc::FBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = 
+            launch!(arch, grid, kernel_size, fill_flux_north_halo!, c, offset,grid; kwargs...)
+fill_bottom_halo!(c, bc::FBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = 
+            launch!(arch, grid, kernel_size, fill_flux_bottom_halo!, c, offset,grid; kwargs...)
+fill_top_halo!(c, bc::FBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = 
+            launch!(arch, grid, kernel_size, fill_flux_top_halo!, c, offset, grid; kwargs...)
 
diff --git a/src/BoundaryConditions/fill_halo_regions_open.jl b/src/BoundaryConditions/fill_halo_regions_open.jl
index 2d941515a3..77a191f733 100644
--- a/src/BoundaryConditions/fill_halo_regions_open.jl
+++ b/src/BoundaryConditions/fill_halo_regions_open.jl
@@ -30,12 +30,12 @@ end
 @inbounds w[i′, j′, k_boundary] = getbc(bc, i′, j′, grid, args...)
 end
 
-@inline   fill_west_halo!(u, bc::OBC, kernel_size, offset, loc, arch, dep, grid, args...; kwargs...) = launch!(arch, grid, kernel_size, set_west_or_east_u!,   u, offset,           1, bc, grid, args...; dependencies=dep, kwargs...)
-@inline   fill_east_halo!(u, bc::OBC, kernel_size, offset, loc, arch, dep, grid, args...; kwargs...) = launch!(arch, grid, kernel_size, set_west_or_east_u!,   u, offset, grid.Nx + 1, bc, grid, args...; dependencies=dep, kwargs...)
-@inline  fill_south_halo!(v, bc::OBC, kernel_size, offset, loc, arch, dep, grid, args...; kwargs...) = launch!(arch, grid, kernel_size, set_south_or_north_v!, v, offset,           1, bc, grid, args...; dependencies=dep, kwargs...)
-@inline  fill_north_halo!(v, bc::OBC, kernel_size, offset, loc, arch, dep, grid, args...; kwargs...) = launch!(arch, grid, kernel_size, set_south_or_north_v!, v, offset, grid.Ny + 1, bc, grid, args...; dependencies=dep, kwargs...)
-@inline fill_bottom_halo!(w, bc::OBC, kernel_size, offset, loc, arch, dep, grid, args...; kwargs...) = launch!(arch, grid, kernel_size, set_bottom_or_top_w!,  w, offset,           1, bc, grid, args...; dependencies=dep, kwargs...)
-@inline    fill_top_halo!(w, bc::OBC, kernel_size, offset, loc, arch, dep, grid, args...; kwargs...) = launch!(arch, grid, kernel_size, set_bottom_or_top_w!,  w, offset, grid.Nz + 1, bc, grid, args...; dependencies=dep, kwargs...)
+@inline   fill_west_halo!(u, bc::OBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = launch!(arch, grid, kernel_size, set_west_or_east_u!,   u, offset,           1, bc, grid, args...; kwargs...)
+@inline   fill_east_halo!(u, bc::OBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = launch!(arch, grid, kernel_size, set_west_or_east_u!,   u, offset, grid.Nx + 1, bc, grid, args...; kwargs...)
+@inline  fill_south_halo!(v, bc::OBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = launch!(arch, grid, kernel_size, set_south_or_north_v!, v, offset,           1, bc, grid, args...; kwargs...)
+@inline  fill_north_halo!(v, bc::OBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = launch!(arch, grid, kernel_size, set_south_or_north_v!, v, offset, grid.Ny + 1, bc, grid, args...; kwargs...)
+@inline fill_bottom_halo!(w, bc::OBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = launch!(arch, grid, kernel_size, set_bottom_or_top_w!,  w, offset,           1, bc, grid, args...; kwargs...)
+@inline    fill_top_halo!(w, bc::OBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = launch!(arch, grid, kernel_size, set_bottom_or_top_w!,  w, offset, grid.Nz + 1, bc, grid, args...; kwargs...)
 
 @inline   _fill_west_halo!(j, k, grid, c, bc::OBC, loc, args...) = @inbounds c[1, j, k]           = getbc(bc, j, k, grid, args...)
 @inline   _fill_east_halo!(j, k, grid, c, bc::OBC, loc, args...) = @inbounds c[grid.Nx + 1, j, k] = getbc(bc, j, k, grid, args...)
diff --git a/src/Models/HydrostaticFreeSurfaceModels/barotropic_pressure_correction.jl b/src/Models/HydrostaticFreeSurfaceModels/barotropic_pressure_correction.jl
index c396187a2c..6f0e4ca9d7 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/barotropic_pressure_correction.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/barotropic_pressure_correction.jl
@@ -25,13 +25,12 @@ function pressure_correct_velocities!(model::ImplicitFreeSurfaceHFSM, Δt)
             model.grid,
             Δt,
             model.free_surface.gravitational_acceleration,
-            model.free_surface.η,
-            dependencies = dependencies)
+            model.free_surface.η)
 
     return nothing
 end
 
-calculate_free_surface_tendency!(grid, model::ImplicitFreeSurfaceHFSM, dependencies) = nothing
+calculate_free_surface_tendency!(grid, model::ImplicitFreeSurfaceHFSM) = nothing
 
 function pressure_correct_velocities!(model::SplitExplicitFreeSurfaceHFSM, Δt)
     u, v, _ = model.velocities
diff --git a/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl b/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
index 890e7f7a14..d33e658fc8 100644
--- a/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
+++ b/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
@@ -142,8 +142,7 @@ is_vertically_implicit(closure) = time_discretization(closure) isa VerticallyImp
 
 """
     implicit_step!(field, implicit_solver::BatchedTridiagonalSolver,
-                   closure, diffusivity_fields, tracer_index, clock, Δt;
-                   dependencies)
+                   closure, diffusivity_fields, tracer_index, clock, Δt)
 
 Initialize the right hand side array `solver.batched_tridiagonal_solver.f`, and then solve the
 tridiagonal system for vertically-implicit diffusion, passing the arguments
@@ -159,7 +158,7 @@ function implicit_step!(field::Field,
                         diffusivity_fields,
                         tracer_index,
                         clock,
-                        Δt; dependencies)
+                        Δt)
     
    loc = location(field)
 
@@ -189,7 +188,6 @@ function implicit_step!(field::Field,
 
     return solve!(field, implicit_solver, field,
                   # ivd_*_diagonal gets called with these args after (i, j, k, grid):
-                  vi_closure, vi_diffusivity_fields, tracer_index, instantiate.(loc)..., clock, Δt, κz;
-                  dependencies)
+                  vi_closure, vi_diffusivity_fields, tracer_index, instantiate.(loc)..., clock, Δt, κz)
 end
 

From 7a5e45647657b2e25c1b642d6dad3498ddd40023 Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Tue, 14 Feb 2023 18:56:00 -0500
Subject: [PATCH 008/530] Bump manifest

---
 Manifest.toml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/Manifest.toml b/Manifest.toml
index 4a5de2570b..bf5b0c25ae 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -1,6 +1,6 @@
 # This file is machine-generated - editing it directly is not advised
 
-julia_version = "1.8.0"
+julia_version = "1.8.5"
 manifest_format = "2.0"
 project_hash = "e5c066cd371cc92d479d4d0c34bc89f3323ab6b3"
 
@@ -133,7 +133,7 @@ version = "4.6.0"
 [[deps.CompilerSupportLibraries_jll]]
 deps = ["Artifacts", "Libdl"]
 uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
-version = "0.5.2+0"
+version = "1.0.1+0"
 
 [[deps.Crayons]]
 git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15"
@@ -316,7 +316,7 @@ version = "1.12.0"
 
 [[deps.KernelAbstractions]]
 deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"]
-git-tree-sha1 = "9687f6699e0a9883a0e5da86a2886eae77af2cd7"
+git-tree-sha1 = "9536f1c772a6649ae2024504086e3b932acdfab7"
 repo-rev = "vc/nix_dependencies"
 repo-url = "https://github.com/JuliaGPU/KernelAbstractions.jl.git"
 uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
@@ -500,9 +500,9 @@ version = "1.4.1"
 
 [[deps.Parsers]]
 deps = ["Dates", "SnoopPrecompile"]
-git-tree-sha1 = "946b56b2135c6c10bbb93efad8a78b699b6383ab"
+git-tree-sha1 = "6f4fbcd1ad45905a5dee3f4256fabb49aa2110c6"
 uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
-version = "2.5.6"
+version = "2.5.7"
 
 [[deps.PencilArrays]]
 deps = ["Adapt", "ArrayInterface", "JSON3", "LinearAlgebra", "MPI", "OffsetArrays", "Random", "Reexport", "Requires", "StaticArrays", "StaticPermutations", "Strided", "TimerOutputs", "VersionParsing"]
@@ -691,7 +691,7 @@ version = "1.10.0"
 [[deps.Tar]]
 deps = ["ArgTools", "SHA"]
 uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
-version = "1.10.0"
+version = "1.10.1"
 
 [[deps.TaylorSeries]]
 deps = ["LinearAlgebra", "Markdown", "Requires", "SparseArrays"]

From 2038c1091822c1715b9544f5f798c6a9bf5f6d80 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Tue, 14 Feb 2023 19:56:44 -0500
Subject: [PATCH 009/530] couple of bugfixes

---
 src/CubedSpheres/CubedSpheres.jl                     |  4 ++--
 src/Distributed/halo_communication.jl                | 12 ++++++------
 .../calculate_hydrostatic_free_surface_tendencies.jl |  8 ++++----
 .../store_hydrostatic_free_surface_tendencies.jl     |  2 +-
 .../calculate_nonhydrostatic_tendencies.jl           |  6 +++---
 test/utils_for_runtests.jl                           |  1 +
 6 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/src/CubedSpheres/CubedSpheres.jl b/src/CubedSpheres/CubedSpheres.jl
index 7067e344fc..ac78d819e4 100644
--- a/src/CubedSpheres/CubedSpheres.jl
+++ b/src/CubedSpheres/CubedSpheres.jl
@@ -93,10 +93,10 @@ end
 
 import Oceananigans.Models.HydrostaticFreeSurfaceModels: apply_flux_bcs!
 
-function apply_flux_bcs!(Gcⁿ::AbstractCubedSphereField, events, c::AbstractCubedSphereField, arch, barrier, args...)
+function apply_flux_bcs!(Gcⁿ::AbstractCubedSphereField, events, c::AbstractCubedSphereField, arch, args...)
 
     for (face_index, Gcⁿ_face) in enumerate(faces(Gcⁿ))
-        apply_flux_bcs!(Gcⁿ_face, events, get_face(c, face_index), arch, barrier,
+        apply_flux_bcs!(Gcⁿ_face, events, get_face(c, face_index), arch,
                         Tuple(get_face(a, face_index) for a in args)...)
     end
 
diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index b60f71e075..8002527df4 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -72,9 +72,9 @@ function fill_halo_regions!(c::OffsetArray, bcs, indices, loc, grid::Distributed
     arch    = architecture(grid)
 
     offset = (0, 0)
-    x_events_requests = fill_west_and_east_halos!(c, bcs.west, bcs.east, :yz, offset, loc, arch, barrier, grid, args...; kwargs...)
-    y_events_requests = fill_south_and_north_halos!(c, bcs.south, bcs.north, :xz, offset, loc, arch, barrier, grid, args...; kwargs...)
-    z_events_requests = fill_bottom_and_top_halos!(c, bcs.bottom, bcs.top, :xy, offset, loc, arch, barrier, grid, args...; kwargs...)
+    x_events_requests = fill_west_and_east_halos!(c, bcs.west, bcs.east, :yz, offset, loc, arch, grid, args...; kwargs...)
+    y_events_requests = fill_south_and_north_halos!(c, bcs.south, bcs.north, :xz, offset, loc, arch, grid, args...; kwargs...)
+    z_events_requests = fill_bottom_and_top_halos!(c, bcs.bottom, bcs.top, :xy, offset, loc, arch, grid, args...; kwargs...)
 
     events_and_requests = [x_events_requests..., y_events_requests..., z_events_requests...]
 
@@ -96,8 +96,8 @@ for (side, opposite_side) in zip([:west, :south, :bottom], [:east, :north, :top]
     fill_both_halo!  = Symbol("fill_$(side)_and_$(opposite_side)_halo!")
 
     @eval begin
-        function $fill_both_halos!(c, bc_side, bc_opposite_side, size, offset, loc, arch, barrier, grid, args...; kwargs...)
-                event = $fill_both_halo!(c, bc_side, bc_opposite_side, size, offset, loc, child_architecture(arch), barrier, grid, args...; kwargs...)
+        function $fill_both_halos!(c, bc_side, bc_opposite_side, size, offset, loc, arch, grid, args...; kwargs...)
+                event = $fill_both_halo!(c, bc_side, bc_opposite_side, size, offset, loc, child_architecture(arch), grid, args...; kwargs...)
             return [event]
         end
     end
@@ -120,7 +120,7 @@ for (side, opposite_side, dir) in zip([:west, :south, :bottom], [:east, :north,
 
     @eval begin
         function $fill_both_halos!(c, bc_side::CBCT, bc_opposite_side::CBCT, size, offset, loc, arch, 
-                                   barrier, grid, args...; kwargs...)
+                                   grid, args...; kwargs...)
 
             @assert bc_side.condition.from == bc_opposite_side.condition.from  # Extra protection in case of bugs
             local_rank = bc_side.condition.from
diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index bd2882738b..b973826a99 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -239,10 +239,10 @@ end
 ##### Boundary condributions to hydrostatic free surface model
 #####
 
-function apply_flux_bcs!(Gcⁿ, c, arch, barrier, args...)
-    apply_x_bcs!(Gcⁿ, c, arch, barrier, args...)
-    apply_y_bcs!(Gcⁿ, c, arch, barrier, args...)
-    apply_z_bcs!(Gcⁿ, c, arch, barrier, args...)
+function apply_flux_bcs!(Gcⁿ, c, arch, args...)
+    apply_x_bcs!(Gcⁿ, c, arch, args...)
+    apply_y_bcs!(Gcⁿ, c, arch, args...)
+    apply_z_bcs!(Gcⁿ, c, arch, args...)
 
     return nothing
 end
diff --git a/src/Models/HydrostaticFreeSurfaceModels/store_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/store_hydrostatic_free_surface_tendencies.jl
index 410d96d9b0..469fb62c33 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/store_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/store_hydrostatic_free_surface_tendencies.jl
@@ -15,7 +15,7 @@ import Oceananigans.TimeSteppers: store_tendencies!
     @inbounds Gη⁻[i, j, grid.Nz+1] = Gη⁰[i, j, grid.Nz+1]
 end
 
-store_free_surface_tendency!(free_surface, model, barrier) = nothing
+store_free_surface_tendency!(free_surface, model) = nothing
 
 function store_free_surface_tendency!(::ExplicitFreeSurface, model)
 
diff --git a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
index 27aafdfc14..c2d7bfab1e 100644
--- a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
+++ b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
@@ -178,9 +178,9 @@ end
 function calculate_boundary_tendency_contributions!(Gⁿ, arch, velocities, tracers, clock, model_fields)
     fields = merge(velocities, tracers)
 
-    foreach(i->apply_x_bcs!(Gⁿ[i], fields[i], arch, barrier, clock, model_fields), 1:length(fields))
-    foreach(i->apply_y_bcs!(Gⁿ[i], fields[i], arch, barrier, clock, model_fields), 1:length(fields))
-    foreach(i->apply_z_bcs!(Gⁿ[i], fields[i], arch, barrier, clock, model_fields), 1:length(fields))
+    foreach(i->apply_x_bcs!(Gⁿ[i], fields[i], arch, clock, model_fields), 1:length(fields))
+    foreach(i->apply_y_bcs!(Gⁿ[i], fields[i], arch, clock, model_fields), 1:length(fields))
+    foreach(i->apply_z_bcs!(Gⁿ[i], fields[i], arch, clock, model_fields), 1:length(fields))
                          
     return nothing
 end
diff --git a/test/utils_for_runtests.jl b/test/utils_for_runtests.jl
index 34f8c47a30..1c1c8d1c54 100644
--- a/test/utils_for_runtests.jl
+++ b/test/utils_for_runtests.jl
@@ -69,6 +69,7 @@ end
 
 function compute_∇²!(∇²ϕ, ϕ, arch, grid)
     fill_halo_regions!(ϕ)
+    child_arch = child_architecture(arch)
     launch!(child_arch, grid, :xyz, ∇²!, ∇²ϕ, grid, ϕ)
     fill_halo_regions!(∇²ϕ)
     return nothing

From e3c82aa64a8cb3ece825b5dace5095e66a610538 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Tue, 14 Feb 2023 21:40:56 -0500
Subject: [PATCH 010/530] tests should work (at least on CPU)

---
 src/BoundaryConditions/apply_flux_bcs.jl                    | 4 ++--
 src/CubedSpheres/CubedSpheres.jl                            | 4 ++--
 .../calculate_hydrostatic_free_surface_tendencies.jl        | 3 +--
 src/MultiRegion/multi_region_boundary_conditions.jl         | 2 +-
 test/test_immersed_advection.jl                             | 6 +++---
 5 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/src/BoundaryConditions/apply_flux_bcs.jl b/src/BoundaryConditions/apply_flux_bcs.jl
index 06e7054c55..2d22b8c7d7 100644
--- a/src/BoundaryConditions/apply_flux_bcs.jl
+++ b/src/BoundaryConditions/apply_flux_bcs.jl
@@ -9,8 +9,8 @@ using Oceananigans.Grids: AbstractGrid
 #####
 
 # Unpack
-apply_x_bcs!(Gc, c, args...) = apply_x_bcs!(Gc, Gc.grid, c, c.boundary_conditions.west, c.boundary_conditions.east, args...)
-apply_y_bcs!(Gc, c, args...) = apply_y_bcs!(Gc, Gc.grid, c, c.boundary_conditions.south, c.boundary_conditions.north, args...)
+apply_x_bcs!(Gc, c, args...) = apply_x_bcs!(Gc, Gc.grid, c, c.boundary_conditions.west,   c.boundary_conditions.east, args...)
+apply_y_bcs!(Gc, c, args...) = apply_y_bcs!(Gc, Gc.grid, c, c.boundary_conditions.south,  c.boundary_conditions.north, args...)
 apply_z_bcs!(Gc, c, args...) = apply_z_bcs!(Gc, Gc.grid, c, c.boundary_conditions.bottom, c.boundary_conditions.top, args...)
 
 # Shortcuts for...
diff --git a/src/CubedSpheres/CubedSpheres.jl b/src/CubedSpheres/CubedSpheres.jl
index ac78d819e4..c38351a949 100644
--- a/src/CubedSpheres/CubedSpheres.jl
+++ b/src/CubedSpheres/CubedSpheres.jl
@@ -93,10 +93,10 @@ end
 
 import Oceananigans.Models.HydrostaticFreeSurfaceModels: apply_flux_bcs!
 
-function apply_flux_bcs!(Gcⁿ::AbstractCubedSphereField, events, c::AbstractCubedSphereField, arch, args...)
+function apply_flux_bcs!(Gcⁿ::AbstractCubedSphereField, c::AbstractCubedSphereField, arch, args...)
 
     for (face_index, Gcⁿ_face) in enumerate(faces(Gcⁿ))
-        apply_flux_bcs!(Gcⁿ_face, events, get_face(c, face_index), arch,
+        apply_flux_bcs!(Gcⁿ_face, get_face(c, face_index), arch,
                         Tuple(get_face(a, face_index) for a in args)...)
     end
 
diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index b973826a99..2da342cd28 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -22,7 +22,6 @@ function calculate_tendencies!(model::HydrostaticFreeSurfaceModel, callbacks)
     # Calculate contributions to momentum and tracer tendencies from user-prescribed fluxes across the
     # boundaries of the domain
     calculate_hydrostatic_boundary_tendency_contributions!(model.timestepper.Gⁿ,
-                                                           model.grid,
                                                            model.architecture,
                                                            model.velocities,
                                                            model.free_surface,
@@ -248,7 +247,7 @@ function apply_flux_bcs!(Gcⁿ, c, arch, args...)
 end
 
 """ Apply boundary conditions by adding flux divergences to the right-hand-side. """
-function calculate_hydrostatic_boundary_tendency_contributions!(Gⁿ, grid, arch, velocities, free_surface, tracers, args...)
+function calculate_hydrostatic_boundary_tendency_contributions!(Gⁿ, arch, velocities, free_surface, tracers, args...)
     # Velocity fields
     for i in (:u, :v)
         apply_flux_bcs!(Gⁿ[i], velocities[i], arch, args...)
diff --git a/src/MultiRegion/multi_region_boundary_conditions.jl b/src/MultiRegion/multi_region_boundary_conditions.jl
index 08d70b16cb..33fe85e278 100644
--- a/src/MultiRegion/multi_region_boundary_conditions.jl
+++ b/src/MultiRegion/multi_region_boundary_conditions.jl
@@ -91,7 +91,7 @@ for (lside, rside) in zip([:west, :south, :bottom], [:east, :north, :bottom])
             $fill_left_halo!(c, left_bc, kernel_size, offset, loc, arch, grid, args...; kwargs...)
             return nothing
         end   
-        function $fill_both_halo!(c, left_bc, right_bc::CBC, kernel_size, offset, loc, arch, dep, grid, args...; kwargs...) 
+        function $fill_both_halo!(c, left_bc, right_bc::CBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) 
             $fill_left_halo!(c, left_bc, kernel_size, offset, loc, arch, grid, args...; kwargs...)
             $fill_right_halo!(c, right_bc, kernel_size, offset, loc, arch, grid, args...; kwargs...)
             return nothing
diff --git a/test/test_immersed_advection.jl b/test/test_immersed_advection.jl
index ea176060ab..fee0874af1 100644
--- a/test/test_immersed_advection.jl
+++ b/test/test_immersed_advection.jl
@@ -103,7 +103,7 @@ for arch in archs
     
         c = CenterField(ibg)
         set!(c, 1.0)
-        wait(mask_immersed_field!(c))
+        mask_immersed_field!(c)
         fill_halo_regions!(c)
     
         for adv in advection_schemes, buffer in [1, 2, 3, 4, 5]
@@ -141,8 +141,8 @@ for arch in archs
         set!(u, 1.0)
         set!(v, 1.0)
 
-        wait(mask_immersed_field!(u))
-        wait(mask_immersed_field!(v))
+        mask_immersed_field!(u)
+        mask_immersed_field!(v)
 
         fill_halo_regions!(u)
         fill_halo_regions!(v)

From 21f926b63e41063aedb2ee69599e2475a7217c4d Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Wed, 15 Feb 2023 19:11:40 -0500
Subject: [PATCH 011/530] bugfix

---
 src/Fields/field_boundary_buffers.jl                   |  2 +-
 .../split_explicit_free_surface_kernels.jl             | 10 ++--------
 src/MultiRegion/multi_region_boundary_conditions.jl    |  1 -
 3 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/src/Fields/field_boundary_buffers.jl b/src/Fields/field_boundary_buffers.jl
index bead4526df..7c39784677 100644
--- a/src/Fields/field_boundary_buffers.jl
+++ b/src/Fields/field_boundary_buffers.jl
@@ -2,7 +2,7 @@ using Oceananigans.BoundaryConditions: MCBC, DCBC
 using Oceananigans.Architectures: arch_array
 using Oceananigans.Grids: halo_size
 using Oceananigans.Utils: launch!
-using KernelAbstractions: MultiEvent, NoneEvent, @kernel, @index
+using KernelAbstractions: @kernel, @index
 using KernelAbstractions.Extras.LoopInfo: @unroll
 
 struct FieldBoundaryBuffers{W, E, S, N}
diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
index b197c50fae..3bdeff37e0 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
@@ -211,14 +211,8 @@ end
 end
 
 # may need to do Val(Nk) since it may not be known at compile
-function barotropic_mode!(U, V, grid, u, v)
-
-    arch  = architecture(grid)
-    event = launch!(arch, grid, :xy, barotropic_mode_kernel!, U, V, grid, u, v,
-                   dependencies=Event(device(arch)))
-
-    wait(device(arch), event)
-end
+barotropic_mode!(U, V, grid, u, v) = 
+    launch!(architecture(grid), grid, :xy, barotropic_mode_kernel!, U, V, grid, u, v)
 
 function initialize_free_surface_state!(free_surface_state, η)
     state = free_surface_state
diff --git a/src/MultiRegion/multi_region_boundary_conditions.jl b/src/MultiRegion/multi_region_boundary_conditions.jl
index e1c014de37..043a6e4491 100644
--- a/src/MultiRegion/multi_region_boundary_conditions.jl
+++ b/src/MultiRegion/multi_region_boundary_conditions.jl
@@ -2,7 +2,6 @@ using Oceananigans: instantiated_location
 using Oceananigans.Architectures: arch_array, device_copy_to!
 using Oceananigans.Operators: assumed_field_location
 using Oceananigans.Fields: reduced_dimensions
-using KernelAbstractions: MultiEvent
 
 using Oceananigans.BoundaryConditions: 
             ContinuousBoundaryFunction, 

From 9bf041135689f8bd2ac95bd908ae5e03de228a5f Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Sun, 19 Feb 2023 15:15:40 -0500
Subject: [PATCH 012/530] bugfix

---
 src/Distributed/halo_communication.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index b7271f50f0..deb42d9c40 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -144,7 +144,7 @@ function fill_halo_event!(task, halo_tuple, c, indices, loc, arch::DistributedAr
 
     events_and_requests = fill_halo!(c, bc_left, bc_right, size, offset, loc, arch, grid, buffers, args...; kwargs...)
     
-    if events_and_requests isa nothing
+    if isnothing(events_and_requests)
         return nothing
     end
     

From 0a443ff69bdcc0f1dddf3f510399574b73c4cac3 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Wed, 22 Feb 2023 17:45:45 -0500
Subject: [PATCH 013/530] temp

---
 src/Distributed/halo_communication.jl         | 11 ++--
 src/Distributed/interleave_comm_and_comp.jl   | 65 +++++++++++++++++++
 src/Distributed/multi_architectures.jl        |  8 ++-
 ...ate_hydrostatic_free_surface_tendencies.jl |  3 +
 ...te_hydrostatic_free_surface_model_state.jl |  3 +-
 5 files changed, 79 insertions(+), 11 deletions(-)
 create mode 100644 src/Distributed/interleave_comm_and_comp.jl

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index deb42d9c40..3a165e0102 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -142,16 +142,13 @@ function fill_halo_event!(task, halo_tuple, c, indices, loc, arch::DistributedAr
     size   = fill_halo_size(c, fill_halo!, indices, bc_left, loc, grid)
     offset = fill_halo_offset(size, fill_halo!, indices)
 
-    events_and_requests = fill_halo!(c, bc_left, bc_right, size, offset, loc, arch, grid, buffers, args...; kwargs...)
-    
-    if isnothing(events_and_requests)
+    requests = fill_halo!(c, bc_left, bc_right, size, offset, loc, arch, grid, buffers, args...; kwargs...)
+
+    if isnothing(requests)
         return nothing
     end
-    
-    MPI.Waitall(events_and_requests)
 
-    buffer_side = mpi_communication_side(Val(fill_halo!))
-    recv_from_buffers!(c, buffers, grid, Val(buffer_side))    
+    push!(arch.mpi_requests, requests...)
 
     return nothing
 end
diff --git a/src/Distributed/interleave_comm_and_comp.jl b/src/Distributed/interleave_comm_and_comp.jl
new file mode 100644
index 0000000000..e1d831bd75
--- /dev/null
+++ b/src/Distributed/interleave_comm_and_comp.jl
@@ -0,0 +1,65 @@
+function complete_communication_and_compute_boundary(model, grid::DistributedGrid)
+
+    arch = architecture(grid)
+
+    MPI.Waitall(arch.mpi_requests)
+    empty!(arch.mpi_requests)
+
+    for side in (:west_and_east, :south_and_north, :bottom_and_top)
+        for field in prognostic_fields(model)
+            recv_from_buffers!(field.data, field.boundary_buffers, grid, Val(side))    
+        end
+    end
+
+    # HERE we have to put fill_eventual_halo_corners
+    recompute_boundary_tendencies(model)
+
+    return nothing
+end
+
+function recompute_boundary_tendencies(model)
+
+    arch = model.architecture
+    grid = model.grid
+
+    recompute_calculate_hydrostatic_momentum_tendencies!(model, model.velocities)
+
+    top_tracer_bcs = top_tracer_boundary_conditions(grid, model.tracers)
+
+    only_active_cells = use_only_active_cells(grid)
+
+    for (tracer_index, tracer_name) in enumerate(propertynames(model.tracers))
+        @inbounds c_tendency = model.timestepper.Gⁿ[tracer_name]
+        @inbounds c_advection = model.advection[tracer_name]
+        @inbounds c_forcing = model.forcing[tracer_name]
+        @inbounds c_immersed_bc = immersed_boundary_condition(model.tracers[tracer_name])
+
+        c_kernel_function, closure, diffusivity_fields = tracer_tendency_kernel_function(model,
+                                                                                         Val(tracer_name),
+                                                                                         model.closure,
+                                                                                         model.diffusivity_fields)
+
+        args = (calculate_hydrostatic_free_surface_Gc!,
+                c_tendency,
+                c_kernel_function,
+                grid,
+                Val(tracer_index),
+                c_advection,
+                closure,
+                c_immersed_bc,
+                model.buoyancy,
+                model.velocities,
+                model.free_surface,
+                model.tracers,
+                top_tracer_bcs,
+                diffusivity_fields,
+                model.auxiliary_fields,
+                c_forcing,
+                model.clock)
+
+        launch!(arch, grid, :xyz, args...; only_active_cells)
+    end
+end
+
+function compute_full_w_and_pressures!(model)
+
diff --git a/src/Distributed/multi_architectures.jl b/src/Distributed/multi_architectures.jl
index df7f8038f5..ee92c23378 100644
--- a/src/Distributed/multi_architectures.jl
+++ b/src/Distributed/multi_architectures.jl
@@ -6,13 +6,14 @@ import Oceananigans.Architectures: device, arch_array, array_type, child_archite
 import Oceananigans.Grids: zeros
 import Oceananigans.Fields: using_buffered_communication
 
-struct DistributedArch{A, R, I, ρ, C, γ, B} <: AbstractArchitecture
+struct DistributedArch{A, R, I, ρ, C, γ, B, M} <: AbstractArchitecture
   child_architecture :: A
           local_rank :: R
          local_index :: I
                ranks :: ρ
         connectivity :: C
         communicator :: γ
+        mpi_requests :: M
 end
 
 #####
@@ -99,9 +100,12 @@ function DistributedArch(child_architecture = CPU();
         isnothing(devices) ? device!(node_rank % ndevices()) : device!(devices[node_rank+1]) 
     end
 
+    mpi_requests = MPI.Request[]
+
     B = use_buffers
+    M = typoef(mpi_requests)
 
-    return DistributedArch{A, R, I, ρ, C, γ, B}(child_architecture, local_rank, local_index, ranks, local_connectivity, communicator)
+    return DistributedArch{A, R, I, ρ, C, γ, B, M}(child_architecture, local_rank, local_index, ranks, local_connectivity, communicator, mpi_requests)
 end
 
 const ViewsDistributedArch = DistributedArch{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, false}
diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index 2da342cd28..138c1dc881 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -18,6 +18,7 @@ function calculate_tendencies!(model::HydrostaticFreeSurfaceModel, callbacks)
     # Calculate contributions to momentum and tracer tendencies from fluxes and volume terms in the
     # interior of the domain
     calculate_hydrostatic_free_surface_interior_tendency_contributions!(model)
+    complete_communication_and_compute_boundary(model, model.grid)
 
     # Calculate contributions to momentum and tracer tendencies from user-prescribed fluxes across the
     # boundaries of the domain
@@ -36,6 +37,8 @@ function calculate_tendencies!(model::HydrostaticFreeSurfaceModel, callbacks)
     return nothing
 end
 
+complete_communication_and_compute_boundary(model, grid) = nothing
+
 function calculate_free_surface_tendency!(grid, model)
 
     arch = architecture(grid)
diff --git a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
index 3aa0584c55..ea46b3e46c 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
@@ -25,7 +25,6 @@ function update_state!(model::HydrostaticFreeSurfaceModel, grid, callbacks)
     @apply_regionally masking_actions!(model, grid)
 
     fill_halo_regions!(prognostic_fields(model), model.clock, fields(model))
-    fill_horizontal_velocity_halos!(model.velocities.u, model.velocities.v, model.architecture)
 
     @apply_regionally compute_w_diffusivities_pressure!(model)
 
@@ -35,7 +34,7 @@ function update_state!(model::HydrostaticFreeSurfaceModel, grid, callbacks)
 
     [callback(model) for callback in callbacks if isa(callback.callsite, UpdateStateCallsite)]
     
-    return nothing
+    return [requestsv..., requestsw..., requestsd..., requestsp...]
 end
 
 # Mask immersed fields

From 99ad4ab4c0f742337068431bf69b62c8d028f893 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Wed, 22 Feb 2023 18:29:50 -0500
Subject: [PATCH 014/530] test it out on satori?

---
 src/Distributed/interleave_comm_and_comp.jl   |  47 +------
 ...ate_hydrostatic_free_surface_tendencies.jl | 120 ++++++++++++++++--
 2 files changed, 109 insertions(+), 58 deletions(-)

diff --git a/src/Distributed/interleave_comm_and_comp.jl b/src/Distributed/interleave_comm_and_comp.jl
index e1d831bd75..2cf0b6a629 100644
--- a/src/Distributed/interleave_comm_and_comp.jl
+++ b/src/Distributed/interleave_comm_and_comp.jl
@@ -17,49 +17,4 @@ function complete_communication_and_compute_boundary(model, grid::DistributedGri
     return nothing
 end
 
-function recompute_boundary_tendencies(model)
-
-    arch = model.architecture
-    grid = model.grid
-
-    recompute_calculate_hydrostatic_momentum_tendencies!(model, model.velocities)
-
-    top_tracer_bcs = top_tracer_boundary_conditions(grid, model.tracers)
-
-    only_active_cells = use_only_active_cells(grid)
-
-    for (tracer_index, tracer_name) in enumerate(propertynames(model.tracers))
-        @inbounds c_tendency = model.timestepper.Gⁿ[tracer_name]
-        @inbounds c_advection = model.advection[tracer_name]
-        @inbounds c_forcing = model.forcing[tracer_name]
-        @inbounds c_immersed_bc = immersed_boundary_condition(model.tracers[tracer_name])
-
-        c_kernel_function, closure, diffusivity_fields = tracer_tendency_kernel_function(model,
-                                                                                         Val(tracer_name),
-                                                                                         model.closure,
-                                                                                         model.diffusivity_fields)
-
-        args = (calculate_hydrostatic_free_surface_Gc!,
-                c_tendency,
-                c_kernel_function,
-                grid,
-                Val(tracer_index),
-                c_advection,
-                closure,
-                c_immersed_bc,
-                model.buoyancy,
-                model.velocities,
-                model.free_surface,
-                model.tracers,
-                top_tracer_bcs,
-                diffusivity_fields,
-                model.auxiliary_fields,
-                c_forcing,
-                model.clock)
-
-        launch!(arch, grid, :xyz, args...; only_active_cells)
-    end
-end
-
-function compute_full_w_and_pressures!(model)
-
+recompute_boundary_tendencies() = nothing
diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index 138c1dc881..d1228c664a 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -5,6 +5,9 @@ using Oceananigans: fields, prognostic_fields, TimeStepCallsite, TendencyCallsit
 using Oceananigans.Utils: work_layout
 using Oceananigans.Fields: immersed_boundary_condition
 
+import Oceananigans.Distributed: complete_communication_and_compute_boundary, recompute_boundary_tendencies
+using Oceananigans.Distributed: tendency_kernel_size, tendency_kernel_offsets
+
 using Oceananigans.ImmersedBoundaries: use_only_active_cells, ActiveCellsIBG, active_linear_index_to_ntuple
 
 """
@@ -37,7 +40,7 @@ function calculate_tendencies!(model::HydrostaticFreeSurfaceModel, callbacks)
     return nothing
 end
 
-complete_communication_and_compute_boundary(model, grid) = nothing
+@inline complete_communication_and_compute_boundary(model, grid) = nothing
 
 function calculate_free_surface_tendency!(grid, model)
 
@@ -87,11 +90,11 @@ function calculate_hydrostatic_momentum_tendencies!(model, velocities)
     only_active_cells = use_only_active_cells(grid)
 
     launch!(arch, grid, :xyz,
-            calculate_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, u_kernel_args...;
+            calculate_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, (0, 0, 0), u_kernel_args...;
             only_active_cells)
 
     launch!(arch, grid, :xyz,
-            calculate_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, v_kernel_args...;
+            calculate_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, (0, 0, 0), v_kernel_args...;
             only_active_cells)
 
     calculate_free_surface_tendency!(grid, model)
@@ -161,6 +164,7 @@ function calculate_hydrostatic_free_surface_interior_tendency_contributions!(mod
         launch!(arch, grid, :xyz,
                 calculate_hydrostatic_free_surface_Gc!,
                 c_tendency,
+                (0, 0, 0),
                 c_kernel_function,
                 grid,
                 Val(tracer_index),
@@ -188,24 +192,26 @@ end
 #####
 
 """ Calculate the right-hand-side of the u-velocity equation. """
-@kernel function calculate_hydrostatic_free_surface_Gu!(Gu, grid, args...)
+@kernel function calculate_hydrostatic_free_surface_Gu!(Gu, offs, grid, args...)
     i, j, k = @index(Global, NTuple)
-    @inbounds Gu[i, j, k] = hydrostatic_free_surface_u_velocity_tendency(i, j, k, grid, args...)
+    i′, j′, k′ = (i, j, k) .+ offs
+    @inbounds Gu[i′, j′, k′] = hydrostatic_free_surface_u_velocity_tendency(i′, j′, k′, grid, args...)
 end
 
-@kernel function calculate_hydrostatic_free_surface_Gu!(Gu, grid::ActiveCellsIBG, args...)
+@kernel function calculate_hydrostatic_free_surface_Gu!(Gu, offs, grid::ActiveCellsIBG, args...)
     idx = @index(Global, Linear)
     i, j, k = active_linear_index_to_ntuple(idx, grid)
     @inbounds Gu[i, j, k] = hydrostatic_free_surface_u_velocity_tendency(i, j, k, grid, args...)
 end
 
 """ Calculate the right-hand-side of the v-velocity equation. """
-@kernel function calculate_hydrostatic_free_surface_Gv!(Gv, grid, args...)
+@kernel function calculate_hydrostatic_free_surface_Gv!(Gv, offs, grid, args...)
     i, j, k = @index(Global, NTuple)
-    @inbounds Gv[i, j, k] = hydrostatic_free_surface_v_velocity_tendency(i, j, k, grid, args...)
+    i′, j′, k′ = (i, j, k) .+ offs
+    @inbounds Gv[i′, j′, k′] = hydrostatic_free_surface_v_velocity_tendency(i′, j′, k′, grid, args...)
 end
 
-@kernel function calculate_hydrostatic_free_surface_Gv!(Gv, grid::ActiveCellsIBG, args...)
+@kernel function calculate_hydrostatic_free_surface_Gv!(Gv, offs, grid::ActiveCellsIBG, args...)
     idx = @index(Global, Linear)
     i, j, k = active_linear_index_to_ntuple(idx, grid)
     @inbounds Gv[i, j, k] = hydrostatic_free_surface_v_velocity_tendency(i, j, k, grid, args...)
@@ -216,12 +222,13 @@ end
 #####
 
 """ Calculate the right-hand-side of the tracer advection-diffusion equation. """
-@kernel function calculate_hydrostatic_free_surface_Gc!(Gc, tendency_kernel_function, grid, args...)
+@kernel function calculate_hydrostatic_free_surface_Gc!(Gc, offs, tendency_kernel_function, grid, args...)
     i, j, k = @index(Global, NTuple)
-    @inbounds Gc[i, j, k] = tendency_kernel_function(i, j, k, grid, args...)
+    i′, j′, k′ = (i, j, k) .+ offs
+    @inbounds Gc[i′, j′, k′] = tendency_kernel_function(i′, j′, k′, grid, args...)
 end
 
-@kernel function calculate_hydrostatic_free_surface_Gc!(Gc, tendency_kernel_function, grid::ActiveCellsIBG, args...)
+@kernel function calculate_hydrostatic_free_surface_Gc!(Gc, offs, tendency_kernel_function, grid::ActiveCellsIBG, args...)
     idx = @index(Global, Linear)
     i, j, k = active_linear_index_to_ntuple(idx, grid)
     @inbounds Gc[i, j, k] = tendency_kernel_function(i, j, k, grid, args...)
@@ -266,3 +273,92 @@ function calculate_hydrostatic_boundary_tendency_contributions!(Gⁿ, arch, velo
 
     return nothing
 end
+
+function recompute_boundary_tendencies(model)
+    grid = model.grid
+    arch = architecture(grid)
+
+    Nx, Ny, Nz = size(grid)
+    size_x = (Hx, Ny, Nz)
+    size_y = (Nx, Hy, Nz-2Hz)
+    size_z = (Nx-2Hx, Ny-2Hy, Hz)
+
+    offsetᴸx = (0,  0,  0)
+    offsetᴸy = (0,  0,  Hz)
+    offsetᴸz = (Hx, Hy, 0)
+    offsetᴿx = (Nx-Hx, 0,      0)
+    offsetᴿy = (0,     Ny-Hy, Hz)
+    offsetᴿz = (Hx,    Hy,    Nz-Hz)
+
+    sizes   = (size_x, size_y, size_z, size_x, size_y, size_z)
+    offsets = (offsetᴸx, offsetᴸy, offsetᴸz, offsetᴿx, offsetᴿy, offsetᴿz)
+
+    u_immersed_bc = immersed_boundary_condition(model.velocities.u)
+    v_immersed_bc = immersed_boundary_condition(model.velocities.v)
+
+    start_momentum_kernel_args = (grid,
+                                  model.advection.momentum,
+                                  model.coriolis,
+                                  model.closure)
+
+    end_momentum_kernel_args = (model.velocities,
+                                model.free_surface,
+                                model.tracers,
+                                model.buoyancy,
+                                model.diffusivity_fields,
+                                model.pressure.pHY′,
+                                model.auxiliary_fields,
+                                model.forcing,
+                                model.clock)
+
+    u_kernel_args = tuple(start_momentum_kernel_args..., u_immersed_bc, end_momentum_kernel_args...)
+    v_kernel_args = tuple(start_momentum_kernel_args..., v_immersed_bc, end_momentum_kernel_args...)
+    
+    only_active_cells = use_only_active_cells(grid)
+
+    for (kernel_size, kernel_offsets) in zip(sizes, offsets)
+        launch!(arch, grid, kernel_size,
+                calculate_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, kernel_offsets, u_kernel_args...;
+                only_active_cells)
+    
+        launch!(arch, grid, kernel_size,
+                calculate_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, kernel_offsets, v_kernel_args...;
+                only_active_cells)
+    end
+
+    top_tracer_bcs = top_tracer_boundary_conditions(grid, model.tracers)
+
+    only_active_cells = use_only_active_cells(grid)
+
+    for (tracer_index, tracer_name) in enumerate(propertynames(model.tracers))
+        @inbounds c_tendency = model.timestepper.Gⁿ[tracer_name]
+        @inbounds c_advection = model.advection[tracer_name]
+        @inbounds c_forcing = model.forcing[tracer_name]
+        @inbounds c_immersed_bc = immersed_boundary_condition(model.tracers[tracer_name])
+
+        c_kernel_function, closure, diffusivity_fields = tracer_tendency_kernel_function(model,
+                                                                                         Val(tracer_name),
+                                                                                         model.closure,
+                                                                                         model.diffusivity_fields)
+
+        args = (c_kernel_function,
+                grid,
+                Val(tracer_index),
+                c_advection,
+                closure,
+                c_immersed_bc,
+                model.buoyancy,
+                model.velocities,
+                model.free_surface,
+                model.tracers,
+                top_tracer_bcs,
+                diffusivity_fields,
+                model.auxiliary_fields,
+                c_forcing,
+                model.clock)
+
+        for (kernel_size, kernel_offsets) in zip(sizes, offsets)
+            launch!(arch, grid, kernel_size, calculate_hydrostatic_free_surface_Gc!, c_tendency, kernel_offsets, args...)
+        end
+    end
+end

From e491389deb16e36f798aca401f3c50ab541c5387 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Wed, 22 Feb 2023 18:35:38 -0500
Subject: [PATCH 015/530] using a tag to tag the array to send

---
 src/Distributed/halo_communication.jl       | 30 ++++++++++-----------
 src/Distributed/interleave_comm_and_comp.jl |  1 +
 src/Distributed/multi_architectures.jl      |  5 ++--
 3 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 3a165e0102..abd4e192b1 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -45,29 +45,33 @@ opposite_side = Dict(
 
 # Define functions that return unique send and recv MPI tags for each side.
 # It's an integer where
-#   digit 1: the side
-#   digits 2-4: the "from" rank
-#   digits 5-7: the "to" rank
+#   digit 1-2: an identifier for the field that is reset each timestep
+#   digit 3: the side
+#   digits 4-6: the "from" rank
+#   digits 7-9: the "to" rank
 
 RANK_DIGITS = 3
+ID_DIGITS = 2
 
 for side in sides
     side_str = string(side)
     send_tag_fn_name = Symbol("$(side)_send_tag")
     recv_tag_fn_name = Symbol("$(side)_recv_tag")
     @eval begin
-        function $send_tag_fn_name(local_rank, rank_to_send_to)
+        function $send_tag_fn_name(arch, local_rank, rank_to_send_to)
+            field_id    = string(arch.mpi_tag, pad=ID_DIGITS)
             from_digits = string(local_rank, pad=RANK_DIGITS)
             to_digits   = string(rank_to_send_to, pad=RANK_DIGITS)
             side_digit  = string(side_id[Symbol($side_str)])
-            return parse(Int, side_digit * from_digits * to_digits)
+            return parse(Int, field_id * side_digit * from_digits * to_digits)
         end
 
-        function $recv_tag_fn_name(local_rank, rank_to_recv_from)
+        function $recv_tag_fn_name(arch, local_rank, rank_to_recv_from)
+            field_id    = string(arch.mpi_tag, pad=ID_DIGITS)
             from_digits = string(rank_to_recv_from, pad=RANK_DIGITS)
             to_digits   = string(local_rank, pad=RANK_DIGITS)
             side_digit  = string(side_id[opposite_side[Symbol($side_str)]])
-            return parse(Int, side_digit * from_digits * to_digits)
+            return parse(Int, field_id * side_digit * from_digits * to_digits)
         end
     end
 end
@@ -98,14 +102,14 @@ end
 
 function fill_halo_regions!(c::OffsetArray, bcs, indices, loc, grid::DistributedGrid, buffers, args...; kwargs...)
     arch       = architecture(grid)
-    child_arch = child_architecture(arch)
     halo_tuple = permute_boundary_conditions(bcs)
     
     for task = 1:3
         fill_halo_event!(task, halo_tuple, c, indices, loc, arch, grid, buffers, args...; kwargs...)
     end
 
-    fill_eventual_corners!(halo_tuple, c, indices, loc, arch, grid, buffers, args...; kwargs...)
+    # fill_eventual_corners!(halo_tuple, c, indices, loc, arch, grid, buffers, args...; kwargs...)
+    arch.mpi_tag += 1
 
     return nothing
 end
@@ -129,10 +133,6 @@ function fill_eventual_corners!(halo_tuple, c, indices, loc, arch, grid, buffers
     end
 end
 
-@inline mpi_communication_side(::Val{fill_west_and_east_halo!})   = :west_and_east
-@inline mpi_communication_side(::Val{fill_south_and_north_halo!}) = :south_and_north
-@inline mpi_communication_side(::Val{fill_bottom_and_top_halo!})  = :bottom_and_top
-
 function fill_halo_event!(task, halo_tuple, c, indices, loc, arch::DistributedArch, grid::DistributedGrid, buffers, args...; kwargs...)
     fill_halo!  = halo_tuple[1][task]
     bc_left     = halo_tuple[2][task]
@@ -242,7 +242,7 @@ for side in sides
     @eval begin
         function $send_side_halo(c, grid, arch, side_location, local_rank, rank_to_send_to, buffers)
             send_buffer = $get_side_send_buffer(c, grid, side_location, buffers, arch)
-            send_tag = $side_send_tag(local_rank, rank_to_send_to)
+            send_tag = $side_send_tag(arch, local_rank, rank_to_send_to)
 
             @debug "Sending " * $side_str * " halo: local_rank=$local_rank, rank_to_send_to=$rank_to_send_to, send_tag=$send_tag"
             send_req = MPI.Isend(send_buffer, rank_to_send_to, send_tag, arch.communicator)
@@ -269,7 +269,7 @@ for side in sides
     @eval begin
         function $recv_and_fill_side_halo!(c, grid, arch, side_location, local_rank, rank_to_recv_from, buffers)
             recv_buffer = $get_side_recv_buffer(c, grid, side_location, buffers, arch)
-            recv_tag = $side_recv_tag(local_rank, rank_to_recv_from)
+            recv_tag = $side_recv_tag(arch, local_rank, rank_to_recv_from)
 
             @debug "Receiving " * $side_str * " halo: local_rank=$local_rank, rank_to_recv_from=$rank_to_recv_from, recv_tag=$recv_tag"
             recv_req = MPI.Irecv!(recv_buffer, rank_to_recv_from, recv_tag, arch.communicator)
diff --git a/src/Distributed/interleave_comm_and_comp.jl b/src/Distributed/interleave_comm_and_comp.jl
index 2cf0b6a629..1ca486f028 100644
--- a/src/Distributed/interleave_comm_and_comp.jl
+++ b/src/Distributed/interleave_comm_and_comp.jl
@@ -4,6 +4,7 @@ function complete_communication_and_compute_boundary(model, grid::DistributedGri
 
     MPI.Waitall(arch.mpi_requests)
     empty!(arch.mpi_requests)
+    arch.mpi_tag = 0
 
     for side in (:west_and_east, :south_and_north, :bottom_and_top)
         for field in prognostic_fields(model)
diff --git a/src/Distributed/multi_architectures.jl b/src/Distributed/multi_architectures.jl
index ee92c23378..f0a5acac0d 100644
--- a/src/Distributed/multi_architectures.jl
+++ b/src/Distributed/multi_architectures.jl
@@ -6,7 +6,7 @@ import Oceananigans.Architectures: device, arch_array, array_type, child_archite
 import Oceananigans.Grids: zeros
 import Oceananigans.Fields: using_buffered_communication
 
-struct DistributedArch{A, R, I, ρ, C, γ, B, M} <: AbstractArchitecture
+struct DistributedArch{A, R, I, ρ, C, γ, B, M, T} <: AbstractArchitecture
   child_architecture :: A
           local_rank :: R
          local_index :: I
@@ -14,6 +14,7 @@ struct DistributedArch{A, R, I, ρ, C, γ, B, M} <: AbstractArchitecture
         connectivity :: C
         communicator :: γ
         mpi_requests :: M
+             mpi_tag :: T
 end
 
 #####
@@ -105,7 +106,7 @@ function DistributedArch(child_architecture = CPU();
     B = use_buffers
     M = typoef(mpi_requests)
 
-    return DistributedArch{A, R, I, ρ, C, γ, B, M}(child_architecture, local_rank, local_index, ranks, local_connectivity, communicator, mpi_requests)
+    return DistributedArch{A, R, I, ρ, C, γ, B, M, T}(child_architecture, local_rank, local_index, ranks, local_connectivity, communicator, mpi_requests, 0)
 end
 
 const ViewsDistributedArch = DistributedArch{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, false}

From d97d8e1f3e086e54b5ff7a657072ba4782591433 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Wed, 22 Feb 2023 18:36:02 -0500
Subject: [PATCH 016/530] we can send up to 100 fields before resetting

---
 src/Distributed/halo_communication.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index abd4e192b1..157eff3dc8 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -51,7 +51,7 @@ opposite_side = Dict(
 #   digits 7-9: the "to" rank
 
 RANK_DIGITS = 3
-ID_DIGITS = 2
+ID_DIGITS = 3
 
 for side in sides
     side_str = string(side)

From b198a086456d291cdf7bd9acb7bd4b835d6d0b41 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Wed, 22 Feb 2023 18:41:40 -0500
Subject: [PATCH 017/530] at least it compiles

---
 src/Distributed/Distributed.jl                                   | 1 +
 .../calculate_hydrostatic_free_surface_tendencies.jl             | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Distributed/Distributed.jl b/src/Distributed/Distributed.jl
index 34bc28d633..23f6f93285 100644
--- a/src/Distributed/Distributed.jl
+++ b/src/Distributed/Distributed.jl
@@ -19,5 +19,6 @@ include("halo_communication_bcs.jl")
 include("distributed_fields.jl")
 include("halo_communication.jl")
 include("distributed_fft_based_poisson_solver.jl")
+include("interleave_comm_and_comp.jl")
 
 end # module
diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index d1228c664a..a45bbc2895 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -6,7 +6,6 @@ using Oceananigans.Utils: work_layout
 using Oceananigans.Fields: immersed_boundary_condition
 
 import Oceananigans.Distributed: complete_communication_and_compute_boundary, recompute_boundary_tendencies
-using Oceananigans.Distributed: tendency_kernel_size, tendency_kernel_offsets
 
 using Oceananigans.ImmersedBoundaries: use_only_active_cells, ActiveCellsIBG, active_linear_index_to_ntuple
 

From fdaf7bc7f5eb965e7e3349afd9e317d3ad00557b Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Wed, 22 Feb 2023 18:51:53 -0500
Subject: [PATCH 018/530] async fill halo

---
 src/Distributed/halo_communication.jl         | 21 ++++++++++++++-----
 src/Distributed/interleave_comm_and_comp.jl   |  2 +-
 src/Distributed/multi_architectures.jl        |  5 +++--
 .../split_explicit_free_surface_kernels.jl    |  6 +++---
 ...te_hydrostatic_free_surface_model_state.jl |  2 +-
 5 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 157eff3dc8..e6391d72e6 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -59,7 +59,7 @@ for side in sides
     recv_tag_fn_name = Symbol("$(side)_recv_tag")
     @eval begin
         function $send_tag_fn_name(arch, local_rank, rank_to_send_to)
-            field_id    = string(arch.mpi_tag, pad=ID_DIGITS)
+            field_id    = string(arch.mpi_tag[1], pad=ID_DIGITS)
             from_digits = string(local_rank, pad=RANK_DIGITS)
             to_digits   = string(rank_to_send_to, pad=RANK_DIGITS)
             side_digit  = string(side_id[Symbol($side_str)])
@@ -67,7 +67,7 @@ for side in sides
         end
 
         function $recv_tag_fn_name(arch, local_rank, rank_to_recv_from)
-            field_id    = string(arch.mpi_tag, pad=ID_DIGITS)
+            field_id    = string(arch.mpi_tag[1], pad=ID_DIGITS)
             from_digits = string(rank_to_recv_from, pad=RANK_DIGITS)
             to_digits   = string(local_rank, pad=RANK_DIGITS)
             side_digit  = string(side_id[opposite_side[Symbol($side_str)]])
@@ -109,7 +109,7 @@ function fill_halo_regions!(c::OffsetArray, bcs, indices, loc, grid::Distributed
     end
 
     # fill_eventual_corners!(halo_tuple, c, indices, loc, arch, grid, buffers, args...; kwargs...)
-    arch.mpi_tag += 1
+    arch.mpi_tag[1] += 1
 
     return nothing
 end
@@ -133,7 +133,11 @@ function fill_eventual_corners!(halo_tuple, c, indices, loc, arch, grid, buffers
     end
 end
 
-function fill_halo_event!(task, halo_tuple, c, indices, loc, arch::DistributedArch, grid::DistributedGrid, buffers, args...; kwargs...)
+@inline mpi_communication_side(::Val{fill_west_and_east_halo!})   = :west_and_east
+@inline mpi_communication_side(::Val{fill_south_and_north_halo!}) = :south_and_north
+@inline mpi_communication_side(::Val{fill_bottom_and_top_halo!})  = :bottom_and_top
+
+function fill_halo_event!(task, halo_tuple, c, indices, loc, arch::DistributedArch, grid::DistributedGrid, buffers, args...; async = true, kwargs...)
     fill_halo!  = halo_tuple[1][task]
     bc_left     = halo_tuple[2][task]
     bc_right    = halo_tuple[3][task]
@@ -148,7 +152,14 @@ function fill_halo_event!(task, halo_tuple, c, indices, loc, arch::DistributedAr
         return nothing
     end
 
-    push!(arch.mpi_requests, requests...)
+    if async
+        push!(arch.mpi_requests, requests...)
+        return nothing
+    end
+
+    MPI.Waitall(requests)
+    buffer_side = mpi_communication_side(Val(fill_halo!))
+    recv_from_buffers!(c, buffers, grid, Val(buffer_side))    
 
     return nothing
 end
diff --git a/src/Distributed/interleave_comm_and_comp.jl b/src/Distributed/interleave_comm_and_comp.jl
index 1ca486f028..354cd491f3 100644
--- a/src/Distributed/interleave_comm_and_comp.jl
+++ b/src/Distributed/interleave_comm_and_comp.jl
@@ -4,7 +4,7 @@ function complete_communication_and_compute_boundary(model, grid::DistributedGri
 
     MPI.Waitall(arch.mpi_requests)
     empty!(arch.mpi_requests)
-    arch.mpi_tag = 0
+    arch.mpi_tag[1] = 0
 
     for side in (:west_and_east, :south_and_north, :bottom_and_top)
         for field in prognostic_fields(model)
diff --git a/src/Distributed/multi_architectures.jl b/src/Distributed/multi_architectures.jl
index f0a5acac0d..99eda9873b 100644
--- a/src/Distributed/multi_architectures.jl
+++ b/src/Distributed/multi_architectures.jl
@@ -104,9 +104,10 @@ function DistributedArch(child_architecture = CPU();
     mpi_requests = MPI.Request[]
 
     B = use_buffers
-    M = typoef(mpi_requests)
+    M = typeof(mpi_requests)
+    T = typeof([0])
 
-    return DistributedArch{A, R, I, ρ, C, γ, B, M, T}(child_architecture, local_rank, local_index, ranks, local_connectivity, communicator, mpi_requests, 0)
+    return DistributedArch{A, R, I, ρ, C, γ, B, M, T}(child_architecture, local_rank, local_index, ranks, local_connectivity, communicator, mpi_requests, [0])
 end
 
 const ViewsDistributedArch = DistributedArch{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, false}
diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
index 3bdeff37e0..e3397264f0 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
@@ -287,12 +287,12 @@ function split_explicit_free_surface_step!(free_surface::SplitExplicitFreeSurfac
 
     velocities = model.velocities
 
-    fill_halo_regions!((free_surface.state.U̅, free_surface.state.V̅))
+    fill_halo_regions!((free_surface.state.U̅, free_surface.state.V̅); async = true)
     
     @apply_regionally setup_split_explicit!(free_surface.auxiliary, free_surface.state, 
                                             free_surface.η, grid, Gu, Gv, Guⁿ, Gvⁿ, χ, velocities)
 
-    fill_halo_regions!((free_surface.auxiliary.Gᵁ, free_surface.auxiliary.Gⱽ))
+    fill_halo_regions!((free_surface.auxiliary.Gᵁ, free_surface.auxiliary.Gⱽ); async = true)
 
     # Solve for the free surface at tⁿ⁺¹
     @apply_regionally iterate_split_explicit!(free_surface, grid, Δt)
@@ -302,7 +302,7 @@ function split_explicit_free_surface_step!(free_surface::SplitExplicitFreeSurfac
     # substepped η field
     @apply_regionally set!(free_surface.η, free_surface.state.η̅)
 
-    fill_halo_regions!(free_surface.η)
+    fill_halo_regions!(free_surface.η; async = true)
 
     return nothing
 end
diff --git a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
index ea46b3e46c..b56d8643ca 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
@@ -34,7 +34,7 @@ function update_state!(model::HydrostaticFreeSurfaceModel, grid, callbacks)
 
     [callback(model) for callback in callbacks if isa(callback.callsite, UpdateStateCallsite)]
     
-    return [requestsv..., requestsw..., requestsd..., requestsp...]
+    return nothing
 end
 
 # Mask immersed fields

From 79838c3591e35ec4cd9fdeb164342a781779625f Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Wed, 22 Feb 2023 18:54:43 -0500
Subject: [PATCH 019/530] this works

---
 src/Distributed/interleave_comm_and_comp.jl                    | 2 ++
 .../calculate_hydrostatic_free_surface_tendencies.jl           | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/src/Distributed/interleave_comm_and_comp.jl b/src/Distributed/interleave_comm_and_comp.jl
index 354cd491f3..59db818141 100644
--- a/src/Distributed/interleave_comm_and_comp.jl
+++ b/src/Distributed/interleave_comm_and_comp.jl
@@ -1,3 +1,5 @@
+using Oceananigans: prognostic_fields
+
 function complete_communication_and_compute_boundary(model, grid::DistributedGrid)
 
     arch = architecture(grid)
diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index a45bbc2895..160ebf09b7 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -4,6 +4,7 @@ import Oceananigans: tracer_tendency_kernel_function
 using Oceananigans: fields, prognostic_fields, TimeStepCallsite, TendencyCallsite, UpdateStateCallsite
 using Oceananigans.Utils: work_layout
 using Oceananigans.Fields: immersed_boundary_condition
+using Oceananigans.Grids: halo_size
 
 import Oceananigans.Distributed: complete_communication_and_compute_boundary, recompute_boundary_tendencies
 
@@ -278,6 +279,8 @@ function recompute_boundary_tendencies(model)
     arch = architecture(grid)
 
     Nx, Ny, Nz = size(grid)
+    Hx, Hy, Hz = halo_size(grid)
+
     size_x = (Hx, Ny, Nz)
     size_y = (Nx, Hy, Nz-2Hz)
     size_z = (Nx-2Hx, Ny-2Hy, Hz)

From 2435cebf433427301900cfe5149b8d4c6cd0154f Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Wed, 22 Feb 2023 19:25:30 -0500
Subject: [PATCH 020/530] correct order

---
 src/Distributed/halo_communication.jl                     | 2 +-
 .../split_explicit_free_surface_kernels.jl                | 6 +++---
 .../update_hydrostatic_free_surface_model_state.jl        | 8 ++++----
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index e6391d72e6..07c89b46ca 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -137,7 +137,7 @@ end
 @inline mpi_communication_side(::Val{fill_south_and_north_halo!}) = :south_and_north
 @inline mpi_communication_side(::Val{fill_bottom_and_top_halo!})  = :bottom_and_top
 
-function fill_halo_event!(task, halo_tuple, c, indices, loc, arch::DistributedArch, grid::DistributedGrid, buffers, args...; async = true, kwargs...)
+function fill_halo_event!(task, halo_tuple, c, indices, loc, arch::DistributedArch, grid::DistributedGrid, buffers, args...; async = false, kwargs...)
     fill_halo!  = halo_tuple[1][task]
     bc_left     = halo_tuple[2][task]
     bc_right    = halo_tuple[3][task]
diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
index e3397264f0..3bdeff37e0 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
@@ -287,12 +287,12 @@ function split_explicit_free_surface_step!(free_surface::SplitExplicitFreeSurfac
 
     velocities = model.velocities
 
-    fill_halo_regions!((free_surface.state.U̅, free_surface.state.V̅); async = true)
+    fill_halo_regions!((free_surface.state.U̅, free_surface.state.V̅))
     
     @apply_regionally setup_split_explicit!(free_surface.auxiliary, free_surface.state, 
                                             free_surface.η, grid, Gu, Gv, Guⁿ, Gvⁿ, χ, velocities)
 
-    fill_halo_regions!((free_surface.auxiliary.Gᵁ, free_surface.auxiliary.Gⱽ); async = true)
+    fill_halo_regions!((free_surface.auxiliary.Gᵁ, free_surface.auxiliary.Gⱽ))
 
     # Solve for the free surface at tⁿ⁺¹
     @apply_regionally iterate_split_explicit!(free_surface, grid, Δt)
@@ -302,7 +302,7 @@ function split_explicit_free_surface_step!(free_surface::SplitExplicitFreeSurfac
     # substepped η field
     @apply_regionally set!(free_surface.η, free_surface.state.η̅)
 
-    fill_halo_regions!(free_surface.η; async = true)
+    fill_halo_regions!(free_surface.η)
 
     return nothing
 end
diff --git a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
index b56d8643ca..2826c9d9f2 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
@@ -24,13 +24,13 @@ function update_state!(model::HydrostaticFreeSurfaceModel, grid, callbacks)
 
     @apply_regionally masking_actions!(model, grid)
 
-    fill_halo_regions!(prognostic_fields(model), model.clock, fields(model))
+    fill_halo_regions!(prognostic_fields(model), model.clock, fields(model); async = true)
 
     @apply_regionally compute_w_diffusivities_pressure!(model)
 
-    fill_halo_regions!(model.velocities.w, model.clock, fields(model))
-    fill_halo_regions!(model.diffusivity_fields, model.clock, fields(model))
-    fill_halo_regions!(model.pressure.pHY′)
+    fill_halo_regions!(model.velocities.w, model.clock, fields(model); async = true)
+    fill_halo_regions!(model.diffusivity_fields, model.clock, fields(model); async = true)
+    fill_halo_regions!(model.pressure.pHY′; async = true)
 
     [callback(model) for callback in callbacks if isa(callback.callsite, UpdateStateCallsite)]
     

From ef6d03b0a73f45d605318a00301a04b87bf3f072 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Wed, 22 Feb 2023 19:33:27 -0500
Subject: [PATCH 021/530] in a new file

---
 .../HydrostaticFreeSurfaceModels.jl           |  1 +
 ...ate_hydrostatic_free_surface_tendencies.jl | 91 ------------------
 .../recompute_boundary_tendencies.jl          | 92 +++++++++++++++++++
 ...te_hydrostatic_free_surface_model_state.jl |  4 -
 4 files changed, 93 insertions(+), 95 deletions(-)
 create mode 100644 src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl

diff --git a/src/Models/HydrostaticFreeSurfaceModels/HydrostaticFreeSurfaceModels.jl b/src/Models/HydrostaticFreeSurfaceModels/HydrostaticFreeSurfaceModels.jl
index 1cfe8f869c..a11e111dc0 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/HydrostaticFreeSurfaceModels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/HydrostaticFreeSurfaceModels.jl
@@ -101,6 +101,7 @@ displacement(::Nothing) = nothing
 include("barotropic_pressure_correction.jl")
 include("hydrostatic_free_surface_tendency_kernel_functions.jl")
 include("calculate_hydrostatic_free_surface_tendencies.jl")
+include("recompute_boundary_tendencies.jl")
 include("update_hydrostatic_free_surface_model_state.jl")
 include("hydrostatic_free_surface_ab2_step.jl")
 include("store_hydrostatic_free_surface_tendencies.jl")
diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index 160ebf09b7..ab2fc7dd8a 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -273,94 +273,3 @@ function calculate_hydrostatic_boundary_tendency_contributions!(Gⁿ, arch, velo
 
     return nothing
 end
-
-function recompute_boundary_tendencies(model)
-    grid = model.grid
-    arch = architecture(grid)
-
-    Nx, Ny, Nz = size(grid)
-    Hx, Hy, Hz = halo_size(grid)
-
-    size_x = (Hx, Ny, Nz)
-    size_y = (Nx, Hy, Nz-2Hz)
-    size_z = (Nx-2Hx, Ny-2Hy, Hz)
-
-    offsetᴸx = (0,  0,  0)
-    offsetᴸy = (0,  0,  Hz)
-    offsetᴸz = (Hx, Hy, 0)
-    offsetᴿx = (Nx-Hx, 0,      0)
-    offsetᴿy = (0,     Ny-Hy, Hz)
-    offsetᴿz = (Hx,    Hy,    Nz-Hz)
-
-    sizes   = (size_x, size_y, size_z, size_x, size_y, size_z)
-    offsets = (offsetᴸx, offsetᴸy, offsetᴸz, offsetᴿx, offsetᴿy, offsetᴿz)
-
-    u_immersed_bc = immersed_boundary_condition(model.velocities.u)
-    v_immersed_bc = immersed_boundary_condition(model.velocities.v)
-
-    start_momentum_kernel_args = (grid,
-                                  model.advection.momentum,
-                                  model.coriolis,
-                                  model.closure)
-
-    end_momentum_kernel_args = (model.velocities,
-                                model.free_surface,
-                                model.tracers,
-                                model.buoyancy,
-                                model.diffusivity_fields,
-                                model.pressure.pHY′,
-                                model.auxiliary_fields,
-                                model.forcing,
-                                model.clock)
-
-    u_kernel_args = tuple(start_momentum_kernel_args..., u_immersed_bc, end_momentum_kernel_args...)
-    v_kernel_args = tuple(start_momentum_kernel_args..., v_immersed_bc, end_momentum_kernel_args...)
-    
-    only_active_cells = use_only_active_cells(grid)
-
-    for (kernel_size, kernel_offsets) in zip(sizes, offsets)
-        launch!(arch, grid, kernel_size,
-                calculate_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, kernel_offsets, u_kernel_args...;
-                only_active_cells)
-    
-        launch!(arch, grid, kernel_size,
-                calculate_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, kernel_offsets, v_kernel_args...;
-                only_active_cells)
-    end
-
-    top_tracer_bcs = top_tracer_boundary_conditions(grid, model.tracers)
-
-    only_active_cells = use_only_active_cells(grid)
-
-    for (tracer_index, tracer_name) in enumerate(propertynames(model.tracers))
-        @inbounds c_tendency = model.timestepper.Gⁿ[tracer_name]
-        @inbounds c_advection = model.advection[tracer_name]
-        @inbounds c_forcing = model.forcing[tracer_name]
-        @inbounds c_immersed_bc = immersed_boundary_condition(model.tracers[tracer_name])
-
-        c_kernel_function, closure, diffusivity_fields = tracer_tendency_kernel_function(model,
-                                                                                         Val(tracer_name),
-                                                                                         model.closure,
-                                                                                         model.diffusivity_fields)
-
-        args = (c_kernel_function,
-                grid,
-                Val(tracer_index),
-                c_advection,
-                closure,
-                c_immersed_bc,
-                model.buoyancy,
-                model.velocities,
-                model.free_surface,
-                model.tracers,
-                top_tracer_bcs,
-                diffusivity_fields,
-                model.auxiliary_fields,
-                c_forcing,
-                model.clock)
-
-        for (kernel_size, kernel_offsets) in zip(sizes, offsets)
-            launch!(arch, grid, kernel_size, calculate_hydrostatic_free_surface_Gc!, c_tendency, kernel_offsets, args...)
-        end
-    end
-end
diff --git a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
new file mode 100644
index 0000000000..41e74d8314
--- /dev/null
+++ b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
@@ -0,0 +1,92 @@
+function recompute_boundary_tendencies(model)
+    grid = model.grid
+    arch = architecture(grid)
+
+    # What shall we do with w, p and κ???
+
+    Nx, Ny, Nz = size(grid)
+    Hx, Hy, Hz = halo_size(grid)
+
+    size_x = (Hx, Ny, Nz)
+    size_y = (Nx, Hy, Nz-2Hz)
+    size_z = (Nx-2Hx, Ny-2Hy, Hz)
+
+    offsetᴸx = (0,  0,  0)
+    offsetᴸy = (0,  0,  Hz)
+    offsetᴸz = (Hx, Hy, 0)
+    offsetᴿx = (Nx-Hx, 0,      0)
+    offsetᴿy = (0,     Ny-Hy, Hz)
+    offsetᴿz = (Hx,    Hy,    Nz-Hz)
+
+    sizes   = (size_x, size_y, size_z, size_x, size_y, size_z)
+    offsets = (offsetᴸx, offsetᴸy, offsetᴸz, offsetᴿx, offsetᴿy, offsetᴿz)
+
+    u_immersed_bc = immersed_boundary_condition(model.velocities.u)
+    v_immersed_bc = immersed_boundary_condition(model.velocities.v)
+
+    start_momentum_kernel_args = (grid,
+                                  model.advection.momentum,
+                                  model.coriolis,
+                                  model.closure)
+
+    end_momentum_kernel_args = (model.velocities,
+                                model.free_surface,
+                                model.tracers,
+                                model.buoyancy,
+                                model.diffusivity_fields,
+                                model.pressure.pHY′,
+                                model.auxiliary_fields,
+                                model.forcing,
+                                model.clock)
+
+    u_kernel_args = tuple(start_momentum_kernel_args..., u_immersed_bc, end_momentum_kernel_args...)
+    v_kernel_args = tuple(start_momentum_kernel_args..., v_immersed_bc, end_momentum_kernel_args...)
+    
+    only_active_cells = use_only_active_cells(grid)
+
+    for (kernel_size, kernel_offsets) in zip(sizes, offsets)
+        launch!(arch, grid, kernel_size,
+                calculate_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, kernel_offsets, u_kernel_args...;
+                only_active_cells)
+    
+        launch!(arch, grid, kernel_size,
+                calculate_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, kernel_offsets, v_kernel_args...;
+                only_active_cells)
+    end
+
+    top_tracer_bcs = top_tracer_boundary_conditions(grid, model.tracers)
+
+    only_active_cells = use_only_active_cells(grid)
+
+    for (tracer_index, tracer_name) in enumerate(propertynames(model.tracers))
+        @inbounds c_tendency = model.timestepper.Gⁿ[tracer_name]
+        @inbounds c_advection = model.advection[tracer_name]
+        @inbounds c_forcing = model.forcing[tracer_name]
+        @inbounds c_immersed_bc = immersed_boundary_condition(model.tracers[tracer_name])
+
+        c_kernel_function, closure, diffusivity_fields = tracer_tendency_kernel_function(model,
+                                                                                         Val(tracer_name),
+                                                                                         model.closure,
+                                                                                         model.diffusivity_fields)
+
+        args = (c_kernel_function,
+                grid,
+                Val(tracer_index),
+                c_advection,
+                closure,
+                c_immersed_bc,
+                model.buoyancy,
+                model.velocities,
+                model.free_surface,
+                model.tracers,
+                top_tracer_bcs,
+                diffusivity_fields,
+                model.auxiliary_fields,
+                c_forcing,
+                model.clock)
+
+        for (kernel_size, kernel_offsets) in zip(sizes, offsets)
+            launch!(arch, grid, kernel_size, calculate_hydrostatic_free_surface_Gc!, c_tendency, kernel_offsets, args...)
+        end
+    end
+end
\ No newline at end of file
diff --git a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
index 2826c9d9f2..1b8e5395e2 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
@@ -28,10 +28,6 @@ function update_state!(model::HydrostaticFreeSurfaceModel, grid, callbacks)
 
     @apply_regionally compute_w_diffusivities_pressure!(model)
 
-    fill_halo_regions!(model.velocities.w, model.clock, fields(model); async = true)
-    fill_halo_regions!(model.diffusivity_fields, model.clock, fields(model); async = true)
-    fill_halo_regions!(model.pressure.pHY′; async = true)
-
     [callback(model) for callback in callbacks if isa(callback.callsite, UpdateStateCallsite)]
     
     return nothing

From 3bfd4337e4da74dc0a7fb76a35f9bbc8f39cf116 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Wed, 22 Feb 2023 19:45:41 -0500
Subject: [PATCH 022/530] fixed issue?

---
 src/Distributed/interleave_comm_and_comp.jl         |  8 +++-----
 .../compute_w_from_continuity.jl                    | 13 ++++++++++---
 .../update_hydrostatic_pressure.jl                  | 11 ++++++++---
 3 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/src/Distributed/interleave_comm_and_comp.jl b/src/Distributed/interleave_comm_and_comp.jl
index 59db818141..bb1f652293 100644
--- a/src/Distributed/interleave_comm_and_comp.jl
+++ b/src/Distributed/interleave_comm_and_comp.jl
@@ -8,12 +8,10 @@ function complete_communication_and_compute_boundary(model, grid::DistributedGri
     empty!(arch.mpi_requests)
     arch.mpi_tag[1] = 0
 
-    for side in (:west_and_east, :south_and_north, :bottom_and_top)
-        for field in prognostic_fields(model)
-            recv_from_buffers!(field.data, field.boundary_buffers, grid, Val(side))    
-        end
+    for field in merge(model.velocities, model.tracers)
+        recv_from_buffers!(field.data, field.boundary_buffers, grid)
     end
-
+    
     # HERE we have to put fill_eventual_halo_corners
     recompute_boundary_tendencies(model)
 
diff --git a/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl b/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
index 504f873f9d..2447d21a5d 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
@@ -13,12 +13,19 @@ w^{n+1} = -∫ [∂/∂x (u^{n+1}) + ∂/∂y (v^{n+1})] dz
 compute_w_from_continuity!(model) = compute_w_from_continuity!(model.velocities, model.architecture, model.grid)
 
 compute_w_from_continuity!(velocities, arch, grid) = 
-    launch!(arch, grid, :xy, _compute_w_from_continuity!, velocities, grid)
+    launch!(arch, grid, w_kernel_size(grid), _compute_w_from_continuity!, velocities, grid)
+
+# extend w kernel to compute also the boundaries
+@inline w_kernel_size(grid) = size(grid)[[1, 2]] .+ 2
 
 @kernel function _compute_w_from_continuity!(U, grid)
     i, j = @index(Global, NTuple)
-    U.w[i, j, 1] = 0
+
+    i′ = i - 1 
+    j′ = j - 1 
+
+    U.w[i′, j′, 1] = 0
     @unroll for k in 2:grid.Nz+1
-        @inbounds U.w[i, j, k] = U.w[i, j, k-1] - Δzᶜᶜᶜ(i, j, k-1, grid) * div_xyᶜᶜᶜ(i, j, k-1, grid, U.u, U.v)
+        @inbounds U.w[i′, j′, k] = U.w[i′, j′, k-1] - Δzᶜᶜᶜ(i′, j′, k-1, grid) * div_xyᶜᶜᶜ(i′, j′, k-1, grid, U.u, U.v)
     end
 end
diff --git a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
index f01765a5b2..94405cd7dd 100644
--- a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
+++ b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
@@ -9,11 +9,13 @@ the `buoyancy_perturbation` downwards:
 """
 @kernel function _update_hydrostatic_pressure!(pHY′, grid, buoyancy, C)
     i, j = @index(Global, NTuple)
+    i′ = i - 1 
+    j′ = j - 1 
 
-    @inbounds pHY′[i, j, grid.Nz] = - ℑzᵃᵃᶠ(i, j, grid.Nz+1, grid, z_dot_g_b, buoyancy, C) * Δzᶜᶜᶠ(i, j, grid.Nz+1, grid)
+    @inbounds pHY′[i′, j′, grid.Nz] = - ℑzᵃᵃᶠ(i′, j′, grid.Nz+1, grid, z_dot_g_b, buoyancy, C) * Δzᶜᶜᶠ(i′, j′, grid.Nz+1, grid)
 
     @unroll for k in grid.Nz-1 : -1 : 1
-        @inbounds pHY′[i, j, k] = pHY′[i, j, k+1] - ℑzᵃᵃᶠ(i, j, k+1, grid, z_dot_g_b, buoyancy, C) * Δzᶜᶜᶠ(i, j, k+1, grid)
+        @inbounds pHY′[i′, j′, k] = pHY′[i′, j′, k+1] - ℑzᵃᵃᶠ(i′, j′, k+1, grid, z_dot_g_b, buoyancy, C) * Δzᶜᶜᶠ(i′, j′, k+1, grid)
     end
 end
 
@@ -25,8 +27,11 @@ update_hydrostatic_pressure!(grid, model) = update_hydrostatic_pressure!(model.p
 const PCB = PartialCellBottom
 const PCBIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:PCB}
 
+# extend p kernel to compute also the boundaries
+@inline p_kernel_size(grid) = size(grid)[[1, 2]] .+ 2
+
 update_hydrostatic_pressure!(pHY′, arch, ibg::PCBIBG, buoyancy, tracers) =
     update_hydrostatic_pressure!(pHY′, arch, ibg.underlying_grid, buoyancy, tracers)
 
 update_hydrostatic_pressure!(pHY′, arch, grid, buoyancy, tracers) =  
-        launch!(arch, grid, :xy, _update_hydrostatic_pressure!, pHY′, grid, buoyancy, tracers)
+        launch!(arch, grid, p_kernel_size(grid), _update_hydrostatic_pressure!, pHY′, grid, buoyancy, tracers)

From 6a73d37bafc5f07a10cb5fe47b116ffad5684497 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Thu, 23 Feb 2023 05:41:23 -0500
Subject: [PATCH 023/530] try like this

---
 src/Distributed/halo_communication.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 07c89b46ca..b247665012 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -50,7 +50,7 @@ opposite_side = Dict(
 #   digits 4-6: the "from" rank
 #   digits 7-9: the "to" rank
 
-RANK_DIGITS = 3
+RANK_DIGITS = 2
 ID_DIGITS = 3
 
 for side in sides

From 295c6ed08b4638142b75c8851dcfd7dbb48ee15b Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Thu, 23 Feb 2023 05:47:02 -0500
Subject: [PATCH 024/530] also eta async

---
 src/Distributed/interleave_comm_and_comp.jl                 | 6 +++---
 .../split_explicit_free_surface_kernels.jl                  | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/Distributed/interleave_comm_and_comp.jl b/src/Distributed/interleave_comm_and_comp.jl
index bb1f652293..51b578785e 100644
--- a/src/Distributed/interleave_comm_and_comp.jl
+++ b/src/Distributed/interleave_comm_and_comp.jl
@@ -8,10 +8,10 @@ function complete_communication_and_compute_boundary(model, grid::DistributedGri
     empty!(arch.mpi_requests)
     arch.mpi_tag[1] = 0
 
-    for field in merge(model.velocities, model.tracers)
-        recv_from_buffers!(field.data, field.boundary_buffers, grid)
+    for field in prognostic_fields(model)
+        recv_from_buffers!(field.data, field.boundary_buffers, field.grid)
     end
-    
+
     # HERE we have to put fill_eventual_halo_corners
     recompute_boundary_tendencies(model)
 
diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
index 3bdeff37e0..94368b7af0 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
@@ -302,7 +302,7 @@ function split_explicit_free_surface_step!(free_surface::SplitExplicitFreeSurfac
     # substepped η field
     @apply_regionally set!(free_surface.η, free_surface.state.η̅)
 
-    fill_halo_regions!(free_surface.η)
+    fill_halo_regions!(free_surface.η; async = true)
 
     return nothing
 end

From a77860ba8b3efe2220b1b46e61387271d3526123 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Thu, 23 Feb 2023 06:03:39 -0500
Subject: [PATCH 025/530] toggle on and off

---
 src/Distributed/halo_communication.jl                |  2 +-
 src/Distributed/interleave_comm_and_comp.jl          |  3 ++-
 src/Distributed/multi_architectures.jl               |  6 ++++--
 .../calculate_hydrostatic_free_surface_tendencies.jl |  2 +-
 .../recompute_boundary_tendencies.jl                 | 12 +++---------
 5 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index b247665012..a23f82d294 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -152,7 +152,7 @@ function fill_halo_event!(task, halo_tuple, c, indices, loc, arch::DistributedAr
         return nothing
     end
 
-    if async
+    if async && !(arch isa SynchedDistributedArch)
         push!(arch.mpi_requests, requests...)
         return nothing
     end
diff --git a/src/Distributed/interleave_comm_and_comp.jl b/src/Distributed/interleave_comm_and_comp.jl
index 51b578785e..93d98bfb8e 100644
--- a/src/Distributed/interleave_comm_and_comp.jl
+++ b/src/Distributed/interleave_comm_and_comp.jl
@@ -1,6 +1,6 @@
 using Oceananigans: prognostic_fields
 
-function complete_communication_and_compute_boundary(model, grid::DistributedGrid)
+function complete_communication_and_compute_boundary(model, grid::DistributedGrid, arch)
 
     arch = architecture(grid)
 
@@ -18,4 +18,5 @@ function complete_communication_and_compute_boundary(model, grid::DistributedGri
     return nothing
 end
 
+complete_communication_and_compute_boundary(model, grid::DistributedGrid, arch::SynchedDistributedArch) = nothing
 recompute_boundary_tendencies() = nothing
diff --git a/src/Distributed/multi_architectures.jl b/src/Distributed/multi_architectures.jl
index 99eda9873b..e7990525d4 100644
--- a/src/Distributed/multi_architectures.jl
+++ b/src/Distributed/multi_architectures.jl
@@ -64,6 +64,7 @@ function DistributedArch(child_architecture = CPU();
                    ranks,
                    use_buffers = false,
                    devices = nothing, 
+                   interleave_communication = true,
                    communicator = MPI.COMM_WORLD)
 
     MPI.Initialized() || error("Must call MPI.Init() before constructing a MultiCPU.")
@@ -101,7 +102,7 @@ function DistributedArch(child_architecture = CPU();
         isnothing(devices) ? device!(node_rank % ndevices()) : device!(devices[node_rank+1]) 
     end
 
-    mpi_requests = MPI.Request[]
+    mpi_requests = interleave_communication ? MPI.Request[] : nothing
 
     B = use_buffers
     M = typeof(mpi_requests)
@@ -110,7 +111,8 @@ function DistributedArch(child_architecture = CPU();
     return DistributedArch{A, R, I, ρ, C, γ, B, M, T}(child_architecture, local_rank, local_index, ranks, local_connectivity, communicator, mpi_requests, [0])
 end
 
-const ViewsDistributedArch = DistributedArch{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, false}
+const ViewsDistributedArch   = DistributedArch{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, false}
+const SynchedDistributedArch = DistributedArch{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:Nothing}
 
 using_buffered_communication(::DistributedArch{A, R, I, ρ, C, γ, B}) where {A, R, I, ρ, C, γ, B} = B
 
diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index ab2fc7dd8a..fd0a005a1e 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -21,7 +21,7 @@ function calculate_tendencies!(model::HydrostaticFreeSurfaceModel, callbacks)
     # Calculate contributions to momentum and tracer tendencies from fluxes and volume terms in the
     # interior of the domain
     calculate_hydrostatic_free_surface_interior_tendency_contributions!(model)
-    complete_communication_and_compute_boundary(model, model.grid)
+    complete_communication_and_compute_boundary(model, model.grid, model.architecture)
 
     # Calculate contributions to momentum and tracer tendencies from user-prescribed fluxes across the
     # boundaries of the domain
diff --git a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
index 41e74d8314..f82906f1ca 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
@@ -18,7 +18,7 @@ function recompute_boundary_tendencies(model)
     offsetᴿy = (0,     Ny-Hy, Hz)
     offsetᴿz = (Hx,    Hy,    Nz-Hz)
 
-    sizes   = (size_x, size_y, size_z, size_x, size_y, size_z)
+    sizes   = (size_x,     size_y,   size_z,   size_x,   size_y,   size_z)
     offsets = (offsetᴸx, offsetᴸy, offsetᴸz, offsetᴿx, offsetᴿy, offsetᴿz)
 
     u_immersed_bc = immersed_boundary_condition(model.velocities.u)
@@ -42,22 +42,16 @@ function recompute_boundary_tendencies(model)
     u_kernel_args = tuple(start_momentum_kernel_args..., u_immersed_bc, end_momentum_kernel_args...)
     v_kernel_args = tuple(start_momentum_kernel_args..., v_immersed_bc, end_momentum_kernel_args...)
     
-    only_active_cells = use_only_active_cells(grid)
-
     for (kernel_size, kernel_offsets) in zip(sizes, offsets)
         launch!(arch, grid, kernel_size,
-                calculate_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, kernel_offsets, u_kernel_args...;
-                only_active_cells)
+                calculate_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, kernel_offsets, u_kernel_args...)
     
         launch!(arch, grid, kernel_size,
-                calculate_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, kernel_offsets, v_kernel_args...;
-                only_active_cells)
+                calculate_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, kernel_offsets, v_kernel_args...)
     end
 
     top_tracer_bcs = top_tracer_boundary_conditions(grid, model.tracers)
 
-    only_active_cells = use_only_active_cells(grid)
-
     for (tracer_index, tracer_name) in enumerate(propertynames(model.tracers))
         @inbounds c_tendency = model.timestepper.Gⁿ[tracer_name]
         @inbounds c_advection = model.advection[tracer_name]

From 247a2e0edec9bf058ad0cfb963652f395e7151b2 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Thu, 23 Feb 2023 06:07:17 -0500
Subject: [PATCH 026/530] bugfix

---
 src/MultiRegion/multi_region_boundary_conditions.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/MultiRegion/multi_region_boundary_conditions.jl b/src/MultiRegion/multi_region_boundary_conditions.jl
index 043a6e4491..64129cbc28 100644
--- a/src/MultiRegion/multi_region_boundary_conditions.jl
+++ b/src/MultiRegion/multi_region_boundary_conditions.jl
@@ -69,7 +69,7 @@ function fill_halo_regions!(c::MultiRegionObject, bcs, indices, loc, mrg::MultiR
 
     for task = 1:3
         apply_regionally!(fill_halo_event!, task, halo_tuple, 
-                          c, indices, loc, arch, barrier, mrg, neighbors, buff, 
+                          c, indices, loc, arch, mrg, neighbors, buff, 
                           args...; kwargs...)
     end
 

From 6b7a659bab07b3f4e3142d7c0d3e75eede59fc92 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Thu, 23 Feb 2023 06:33:59 -0500
Subject: [PATCH 027/530] smaller tag

---
 src/Distributed/halo_communication.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index a23f82d294..e11e70b90f 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -51,7 +51,7 @@ opposite_side = Dict(
 #   digits 7-9: the "to" rank
 
 RANK_DIGITS = 2
-ID_DIGITS = 3
+ID_DIGITS = 2
 
 for side in sides
     side_str = string(side)

From d140468114449907ebfbe5830ae6888fb9baefed Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Thu, 23 Feb 2023 06:41:35 -0500
Subject: [PATCH 028/530] removed the top/bottom kernel

---
 .../recompute_boundary_tendencies.jl                     | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
index f82906f1ca..3b57dd36ca 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
@@ -1,3 +1,5 @@
+# We assume here that top/bottom BC are always synched (no partitioning in z)
+
 function recompute_boundary_tendencies(model)
     grid = model.grid
     arch = architecture(grid)
@@ -9,17 +11,14 @@ function recompute_boundary_tendencies(model)
 
     size_x = (Hx, Ny, Nz)
     size_y = (Nx, Hy, Nz-2Hz)
-    size_z = (Nx-2Hx, Ny-2Hy, Hz)
 
     offsetᴸx = (0,  0,  0)
     offsetᴸy = (0,  0,  Hz)
-    offsetᴸz = (Hx, Hy, 0)
     offsetᴿx = (Nx-Hx, 0,      0)
     offsetᴿy = (0,     Ny-Hy, Hz)
-    offsetᴿz = (Hx,    Hy,    Nz-Hz)
 
-    sizes   = (size_x,     size_y,   size_z,   size_x,   size_y,   size_z)
-    offsets = (offsetᴸx, offsetᴸy, offsetᴸz, offsetᴿx, offsetᴿy, offsetᴿz)
+    sizes   = (size_x,     size_y,   size_x,   size_y)
+    offsets = (offsetᴸx, offsetᴸy, offsetᴿx, offsetᴿy)
 
     u_immersed_bc = immersed_boundary_condition(model.velocities.u)
     v_immersed_bc = immersed_boundary_condition(model.velocities.v)

From e52c7f922e221989ce33a2f67789219f9fdd43dc Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Thu, 23 Feb 2023 06:43:55 -0500
Subject: [PATCH 029/530] correct kernel size

---
 .../recompute_boundary_tendencies.jl                      | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
index 3b57dd36ca..fb93fd2508 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
@@ -10,12 +10,12 @@ function recompute_boundary_tendencies(model)
     Hx, Hy, Hz = halo_size(grid)
 
     size_x = (Hx, Ny, Nz)
-    size_y = (Nx, Hy, Nz-2Hz)
+    size_y = (Nx, Hy, Nz)
 
     offsetᴸx = (0,  0,  0)
-    offsetᴸy = (0,  0,  Hz)
-    offsetᴿx = (Nx-Hx, 0,      0)
-    offsetᴿy = (0,     Ny-Hy, Hz)
+    offsetᴸy = (0,  0,  0)
+    offsetᴿx = (Nx-Hx, 0,     0)
+    offsetᴿy = (0,     Ny-Hy, 0)
 
     sizes   = (size_x,     size_y,   size_x,   size_y)
     offsets = (offsetᴸx, offsetᴸy, offsetᴿx, offsetᴿy)

From 40b90489eedb1c9f92d17b2efbc45f0605edfca2 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Thu, 23 Feb 2023 08:34:14 -0500
Subject: [PATCH 030/530] fixed stuff?

---
 src/Distributed/interleave_comm_and_comp.jl   |  4 +-
 ...ate_hydrostatic_free_surface_tendencies.jl | 16 +++--
 .../compute_w_from_continuity.jl              | 10 ++--
 .../recompute_boundary_tendencies.jl          | 60 +++++++++++++++++--
 .../update_hydrostatic_pressure.jl            | 14 ++---
 .../ri_based_vertical_diffusivity.jl          | 15 +++--
 6 files changed, 91 insertions(+), 28 deletions(-)

diff --git a/src/Distributed/interleave_comm_and_comp.jl b/src/Distributed/interleave_comm_and_comp.jl
index 93d98bfb8e..a2c559c5b7 100644
--- a/src/Distributed/interleave_comm_and_comp.jl
+++ b/src/Distributed/interleave_comm_and_comp.jl
@@ -13,10 +13,10 @@ function complete_communication_and_compute_boundary(model, grid::DistributedGri
     end
 
     # HERE we have to put fill_eventual_halo_corners
-    recompute_boundary_tendencies(model)
+    recompute_boundary_tendencies!(model)
 
     return nothing
 end
 
 complete_communication_and_compute_boundary(model, grid::DistributedGrid, arch::SynchedDistributedArch) = nothing
-recompute_boundary_tendencies() = nothing
+recompute_boundary_tendencies!() = nothing
diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index fd0a005a1e..7fea9ee5b3 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -6,7 +6,7 @@ using Oceananigans.Utils: work_layout
 using Oceananigans.Fields: immersed_boundary_condition
 using Oceananigans.Grids: halo_size
 
-import Oceananigans.Distributed: complete_communication_and_compute_boundary, recompute_boundary_tendencies
+import Oceananigans.Distributed: complete_communication_and_compute_boundary
 
 using Oceananigans.ImmersedBoundaries: use_only_active_cells, ActiveCellsIBG, active_linear_index_to_ntuple
 
@@ -40,7 +40,7 @@ function calculate_tendencies!(model::HydrostaticFreeSurfaceModel, callbacks)
     return nothing
 end
 
-@inline complete_communication_and_compute_boundary(model, grid) = nothing
+complete_communication_and_compute_boundary(model, grid, arch) = nothing
 
 function calculate_free_surface_tendency!(grid, model)
 
@@ -194,7 +194,9 @@ end
 """ Calculate the right-hand-side of the u-velocity equation. """
 @kernel function calculate_hydrostatic_free_surface_Gu!(Gu, offs, grid, args...)
     i, j, k = @index(Global, NTuple)
-    i′, j′, k′ = (i, j, k) .+ offs
+    i′ = i + offs[1] 
+    j′ = j + offs[2] 
+    k′ = k + offs[3]
     @inbounds Gu[i′, j′, k′] = hydrostatic_free_surface_u_velocity_tendency(i′, j′, k′, grid, args...)
 end
 
@@ -207,7 +209,9 @@ end
 """ Calculate the right-hand-side of the v-velocity equation. """
 @kernel function calculate_hydrostatic_free_surface_Gv!(Gv, offs, grid, args...)
     i, j, k = @index(Global, NTuple)
-    i′, j′, k′ = (i, j, k) .+ offs
+    i′ = i + offs[1] 
+    j′ = j + offs[2] 
+    k′ = k + offs[3]
     @inbounds Gv[i′, j′, k′] = hydrostatic_free_surface_v_velocity_tendency(i′, j′, k′, grid, args...)
 end
 
@@ -224,7 +228,9 @@ end
 """ Calculate the right-hand-side of the tracer advection-diffusion equation. """
 @kernel function calculate_hydrostatic_free_surface_Gc!(Gc, offs, tendency_kernel_function, grid, args...)
     i, j, k = @index(Global, NTuple)
-    i′, j′, k′ = (i, j, k) .+ offs
+    i′ = i + offs[1] 
+    j′ = j + offs[2] 
+    k′ = k + offs[3]
     @inbounds Gc[i′, j′, k′] = tendency_kernel_function(i′, j′, k′, grid, args...)
 end
 
diff --git a/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl b/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
index 2447d21a5d..46605d6ea9 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
@@ -12,17 +12,17 @@ w^{n+1} = -∫ [∂/∂x (u^{n+1}) + ∂/∂y (v^{n+1})] dz
 """
 compute_w_from_continuity!(model) = compute_w_from_continuity!(model.velocities, model.architecture, model.grid)
 
-compute_w_from_continuity!(velocities, arch, grid) = 
-    launch!(arch, grid, w_kernel_size(grid), _compute_w_from_continuity!, velocities, grid)
+compute_w_from_continuity!(velocities, arch, grid; kernel_size = w_kernel_size(grid), kernel_offsets = (-0x1, -0x1)) = 
+    launch!(arch, grid, kernel_size, _compute_w_from_continuity!, velocities, kernel_offsets, grid)
 
 # extend w kernel to compute also the boundaries
 @inline w_kernel_size(grid) = size(grid)[[1, 2]] .+ 2
 
-@kernel function _compute_w_from_continuity!(U, grid)
+@kernel function _compute_w_from_continuity!(U, offs, grid)
     i, j = @index(Global, NTuple)
 
-    i′ = i - 1 
-    j′ = j - 1 
+    i′ = i + offs[1] 
+    j′ = j + offs[2] 
 
     U.w[i′, j′, 1] = 0
     @unroll for k in 2:grid.Nz+1
diff --git a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
index fb93fd2508..1575f185c7 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
@@ -1,13 +1,12 @@
-# We assume here that top/bottom BC are always synched (no partitioning in z)
+import Oceananigans.Distributed: recompute_boundary_tendencies!
 
-function recompute_boundary_tendencies(model)
+# We assume here that top/bottom BC are always synched (no partitioning in z)
+function recompute_boundary_tendencies!(model::HydrostaticFreeSurfaceModel)
     grid = model.grid
     arch = architecture(grid)
 
     # What shall we do with w, p and κ???
-
-    Nx, Ny, Nz = size(grid)
-    Hx, Hy, Hz = halo_size(grid)
+    recompute_auxiliaries!(model, grid, arch)
 
     size_x = (Hx, Ny, Nz)
     size_y = (Nx, Hy, Nz)
@@ -82,4 +81,55 @@ function recompute_boundary_tendencies(model)
             launch!(arch, grid, kernel_size, calculate_hydrostatic_free_surface_Gc!, c_tendency, kernel_offsets, args...)
         end
     end
+end
+
+function recompute_auxiliaries!(model, grid, arch)
+    Nx, Ny, _ = size(grid)
+    Hx, Hy, _ = halo_size(grid)
+
+    size_x = (Hx+1, Ny)
+    size_y = (Nx, Hy+1)
+
+    offsetᴸx = (-Hx,  0)
+    offsetᴸy = (0,  -Hy)
+    offsetᴿx = (Nx-1, 0)
+    offsetᴿy = (0, Ny-1)
+
+    sizes   = (size_x,     size_y,   size_x,   size_y)
+    offsets = (offsetᴸx, offsetᴸy, offsetᴿx, offsetᴿy)
+
+    for (kernel_size, kernel_offsets) in zip(sizes, offsets)
+        compute_w_from_continuity!(model.velocities, arch, grid; kernel_size, kernel_offsets)
+    end
+
+    size_x = (1, Ny)
+    size_y = (Nx, 1)
+
+    offsetᴸx = (-1,  0)
+    offsetᴸy = (0,  -1)
+    offsetᴿx = (Nx,  0)
+    offsetᴿy = (0,  Ny)
+
+    sizes   = (size_x,     size_y,   size_x,   size_y)
+    offsets = (offsetᴸx, offsetᴸy, offsetᴿx, offsetᴿy)
+
+
+    for (kernel_size, kernel_offsets) in zip(sizes, offsets)
+        update_hydrostatic_pressure!(model.pressures.pHY′, arch, grid, model.buoyancy, model.tracers; kernel_size, kernel_offsets)
+    end
+
+    size_x = (1, Ny, Nz)
+    size_y = (Nx, 1, Nz)
+
+    offsetᴸx = (-1,  0, 0)
+    offsetᴸy = (0,  -1, 0)
+    offsetᴿx = (Nx,  0, 0)
+    offsetᴿy = (0,  Ny, 0)
+
+    sizes   = (size_x,     size_y,   size_x,   size_y)
+    offsets = (offsetᴸx, offsetᴸy, offsetᴿx, offsetᴿy)
+
+    for (kernel_size, kernel_offsets) in zip(sizes, offsets)
+        calculate_diffusivities!(model; kernel_size, kernel_offsets)
+    end
 end
\ No newline at end of file
diff --git a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
index 94405cd7dd..df17440f8d 100644
--- a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
+++ b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
@@ -7,10 +7,10 @@ the `buoyancy_perturbation` downwards:
 
     `pHY′ = ∫ buoyancy_perturbation dz` from `z=0` down to `z=-Lz`
 """
-@kernel function _update_hydrostatic_pressure!(pHY′, grid, buoyancy, C)
+@kernel function _update_hydrostatic_pressure!(pHY′, offs, grid, buoyancy, C)
     i, j = @index(Global, NTuple)
-    i′ = i - 1 
-    j′ = j - 1 
+    i′ = i + offs[1] 
+    j′ = j + offs[2] 
 
     @inbounds pHY′[i′, j′, grid.Nz] = - ℑzᵃᵃᶠ(i′, j′, grid.Nz+1, grid, z_dot_g_b, buoyancy, C) * Δzᶜᶜᶠ(i′, j′, grid.Nz+1, grid)
 
@@ -30,8 +30,8 @@ const PCBIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:PCB}
 # extend p kernel to compute also the boundaries
 @inline p_kernel_size(grid) = size(grid)[[1, 2]] .+ 2
 
-update_hydrostatic_pressure!(pHY′, arch, ibg::PCBIBG, buoyancy, tracers) =
-    update_hydrostatic_pressure!(pHY′, arch, ibg.underlying_grid, buoyancy, tracers)
+update_hydrostatic_pressure!(pHY′, arch, ibg::PCBIBG, buoyancy, tracers; kernel_size = p_kernel_size(grid), kernel_offsets = (-0x1, -0x1)) =
+    update_hydrostatic_pressure!(pHY′, arch, ibg.underlying_grid, buoyancy, tracers; kernel_size, kernel_offsets)
 
-update_hydrostatic_pressure!(pHY′, arch, grid, buoyancy, tracers) =  
-        launch!(arch, grid, p_kernel_size(grid), _update_hydrostatic_pressure!, pHY′, grid, buoyancy, tracers)
+update_hydrostatic_pressure!(pHY′, arch, grid, buoyancy, tracers; kernel_size = p_kernel_size(grid), kernel_offsets = (-0x1, -0x1)) =  
+        launch!(arch, grid, kernel_size, _update_hydrostatic_pressure!, pHY′, kernel_offsets, grid, buoyancy, tracers)
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index 44ff21580b..64d679be81 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -111,7 +111,9 @@ function DiffusivityFields(grid, tracer_names, bcs, closure::FlavorOfRBVD)
     return (; κ, ν)
 end
 
-function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model)
+@inline kappa_kernel_size(grid) = size(grid) .+ 2
+
+function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; kernel_size = kappa_kernel_size(grid), kernel_offsets = (-0x1, -0x1, -0x1))
     arch = model.architecture
     grid = model.grid
     clock = model.clock
@@ -120,9 +122,10 @@ function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model)
     velocities = model.velocities
     top_tracer_bcs = NamedTuple(c => tracers[c].boundary_conditions.top for c in propertynames(tracers))
 
-    launch!(arch, grid, :xyz,
+    launch!(arch, grid, kernel_size,
             compute_ri_based_diffusivities!,
             diffusivities,
+            kernel_offsets,
             grid,
             closure,
             velocities,
@@ -162,10 +165,14 @@ end
 @inline Riᶜᶜᶜ(i, j, k, grid, velocities, tracers, buoyancy) =
     ℑzᵃᵃᶜ(i, j, k, grid, Riᶜᶜᶠ, velocities, tracers, buoyancy)
 
-@kernel function compute_ri_based_diffusivities!(diffusivities, grid, closure::FlavorOfRBVD,
+@kernel function compute_ri_based_diffusivities!(diffusivities, offs, grid, closure::FlavorOfRBVD,
                                                  velocities, tracers, buoyancy, tracer_bcs, clock)
 
-    i, j, k, = @index(Global, NTuple)
+    i′, j′, k′ = @index(Global, NTuple)
+
+    i = i′ + offs[1] 
+    j = j′ + offs[2] 
+    k = k′ + offs[3]
 
     # Ensure this works with "ensembles" of closures, in addition to ordinary single closures
     closure_ij = getclosure(i, j, closure)

From 5861f1e21361f5c7accf23ad59ac6e1301639ad2 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Thu, 23 Feb 2023 09:08:35 -0500
Subject: [PATCH 031/530] bugfix

---
 .../ri_based_vertical_diffusivity.jl                            | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index 64d679be81..5eb6adc602 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -113,7 +113,7 @@ end
 
 @inline kappa_kernel_size(grid) = size(grid) .+ 2
 
-function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; kernel_size = kappa_kernel_size(grid), kernel_offsets = (-0x1, -0x1, -0x1))
+function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; kernel_size = kappa_kernel_size(model.grid), kernel_offsets = (-0x1, -0x1, -0x1))
     arch = model.architecture
     grid = model.grid
     clock = model.clock

From 48d60cd12416921760166ec258461b193eef6d36 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Thu, 23 Feb 2023 09:38:08 -0500
Subject: [PATCH 032/530] bugfix

---
 .../HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl | 2 +-
 .../NonhydrostaticModels/update_hydrostatic_pressure.jl       | 4 ++--
 .../ri_based_vertical_diffusivity.jl                          | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl b/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
index 46605d6ea9..a87c677a50 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
@@ -12,7 +12,7 @@ w^{n+1} = -∫ [∂/∂x (u^{n+1}) + ∂/∂y (v^{n+1})] dz
 """
 compute_w_from_continuity!(model) = compute_w_from_continuity!(model.velocities, model.architecture, model.grid)
 
-compute_w_from_continuity!(velocities, arch, grid; kernel_size = w_kernel_size(grid), kernel_offsets = (-0x1, -0x1)) = 
+compute_w_from_continuity!(velocities, arch, grid; kernel_size = w_kernel_size(grid), kernel_offsets = (-1, -1)) = 
     launch!(arch, grid, kernel_size, _compute_w_from_continuity!, velocities, kernel_offsets, grid)
 
 # extend w kernel to compute also the boundaries
diff --git a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
index df17440f8d..c6fd45a481 100644
--- a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
+++ b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
@@ -30,8 +30,8 @@ const PCBIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:PCB}
 # extend p kernel to compute also the boundaries
 @inline p_kernel_size(grid) = size(grid)[[1, 2]] .+ 2
 
-update_hydrostatic_pressure!(pHY′, arch, ibg::PCBIBG, buoyancy, tracers; kernel_size = p_kernel_size(grid), kernel_offsets = (-0x1, -0x1)) =
+update_hydrostatic_pressure!(pHY′, arch, ibg::PCBIBG, buoyancy, tracers; kernel_size = p_kernel_size(ibg.underlying_grid), kernel_offsets = (-1, -1)) =
     update_hydrostatic_pressure!(pHY′, arch, ibg.underlying_grid, buoyancy, tracers; kernel_size, kernel_offsets)
 
-update_hydrostatic_pressure!(pHY′, arch, grid, buoyancy, tracers; kernel_size = p_kernel_size(grid), kernel_offsets = (-0x1, -0x1)) =  
+update_hydrostatic_pressure!(pHY′, arch, grid, buoyancy, tracers; kernel_size = p_kernel_size(grid), kernel_offsets = (-1, -1)) =  
         launch!(arch, grid, kernel_size, _update_hydrostatic_pressure!, pHY′, kernel_offsets, grid, buoyancy, tracers)
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index 5eb6adc602..6b9a93031e 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -113,7 +113,7 @@ end
 
 @inline kappa_kernel_size(grid) = size(grid) .+ 2
 
-function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; kernel_size = kappa_kernel_size(model.grid), kernel_offsets = (-0x1, -0x1, -0x1))
+function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; kernel_size = kappa_kernel_size(model.grid), kernel_offsets = (-1, -1, -1))
     arch = model.architecture
     grid = model.grid
     clock = model.clock

From dc454a3bfb701d1a3eee4e90bc57f4ab2616e2a2 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Thu, 23 Feb 2023 10:41:28 -0500
Subject: [PATCH 033/530] bugfixes

---
 .../recompute_boundary_tendencies.jl          | 19 +++++++++++--------
 src/TurbulenceClosures/TurbulenceClosures.jl  |  2 +-
 src/TurbulenceClosures/closure_tuples.jl      |  4 ++--
 .../nothing_closure.jl                        |  4 ++--
 .../scalar_biharmonic_diffusivity.jl          |  2 +-
 .../scalar_diffusivity.jl                     |  2 +-
 .../smagorinsky_lilly.jl                      |  2 +-
 7 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
index 1575f185c7..21fe0a2cd5 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
@@ -7,6 +7,9 @@ function recompute_boundary_tendencies!(model::HydrostaticFreeSurfaceModel)
 
     # What shall we do with w, p and κ???
     recompute_auxiliaries!(model, grid, arch)
+    
+    Nx, Ny, Nz = size(grid)
+    Hx, Hy, Hz = halo_size(grid)
 
     size_x = (Hx, Ny, Nz)
     size_y = (Nx, Hy, Nz)
@@ -84,14 +87,14 @@ function recompute_boundary_tendencies!(model::HydrostaticFreeSurfaceModel)
 end
 
 function recompute_auxiliaries!(model, grid, arch)
-    Nx, Ny, _ = size(grid)
-    Hx, Hy, _ = halo_size(grid)
+    Nx, Ny, Nz = size(grid)
+    Hx, Hy, Hz = halo_size(grid)
 
-    size_x = (Hx+1, Ny)
-    size_y = (Nx, Hy+1)
+    size_x = (Hx, Ny)
+    size_y = (Nx, Hy)
 
-    offsetᴸx = (-Hx,  0)
-    offsetᴸy = (0,  -Hy)
+    offsetᴸx = (-Hx+1,  0)
+    offsetᴸy = (0,  -Hy+1)
     offsetᴿx = (Nx-1, 0)
     offsetᴿy = (0, Ny-1)
 
@@ -115,7 +118,7 @@ function recompute_auxiliaries!(model, grid, arch)
 
 
     for (kernel_size, kernel_offsets) in zip(sizes, offsets)
-        update_hydrostatic_pressure!(model.pressures.pHY′, arch, grid, model.buoyancy, model.tracers; kernel_size, kernel_offsets)
+        update_hydrostatic_pressure!(model.pressure.pHY′, arch, grid, model.buoyancy, model.tracers; kernel_size, kernel_offsets)
     end
 
     size_x = (1, Ny, Nz)
@@ -130,6 +133,6 @@ function recompute_auxiliaries!(model, grid, arch)
     offsets = (offsetᴸx, offsetᴸy, offsetᴿx, offsetᴿy)
 
     for (kernel_size, kernel_offsets) in zip(sizes, offsets)
-        calculate_diffusivities!(model; kernel_size, kernel_offsets)
+        calculate_diffusivities!(model.diffusivity_fields, model.closure, model; kernel_size, kernel_offsets)
     end
 end
\ No newline at end of file
diff --git a/src/TurbulenceClosures/TurbulenceClosures.jl b/src/TurbulenceClosures/TurbulenceClosures.jl
index b82825f589..b938a7bed5 100644
--- a/src/TurbulenceClosures/TurbulenceClosures.jl
+++ b/src/TurbulenceClosures/TurbulenceClosures.jl
@@ -67,7 +67,7 @@ abstract type AbstractTurbulenceClosure{TimeDiscretization} end
 validate_closure(closure) = closure
 closure_summary(closure) = summary(closure)
 with_tracers(tracers, closure::AbstractTurbulenceClosure) = closure
-calculate_diffusivities!(K, closure::AbstractTurbulenceClosure, args...) = nothing
+calculate_diffusivities!(K, closure::AbstractTurbulenceClosure, args...; kwargs...) = nothing
 
 const ClosureKinda = Union{Nothing, AbstractTurbulenceClosure, AbstractArray{<:AbstractTurbulenceClosure}}
 add_closure_specific_boundary_conditions(closure::ClosureKinda, bcs, args...) = bcs
diff --git a/src/TurbulenceClosures/closure_tuples.jl b/src/TurbulenceClosures/closure_tuples.jl
index 6463d402ec..d75186dd69 100644
--- a/src/TurbulenceClosures/closure_tuples.jl
+++ b/src/TurbulenceClosures/closure_tuples.jl
@@ -60,10 +60,10 @@ end
 
 with_tracers(tracers, closure_tuple::Tuple) = Tuple(with_tracers(tracers, closure) for closure in closure_tuple)
 
-function calculate_diffusivities!(diffusivity_fields_tuple, closure_tuple::Tuple, args...)
+function calculate_diffusivities!(diffusivity_fields_tuple, closure_tuple::Tuple, args...; kwargs...)
     for (α, closure) in enumerate(closure_tuple)
         @inbounds diffusivity_fields = diffusivity_fields_tuple[α]
-        calculate_diffusivities!(diffusivity_fields, closure, args...)
+        calculate_diffusivities!(diffusivity_fields, closure, args...; kwargs...)
     end
     return nothing
 end
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/nothing_closure.jl b/src/TurbulenceClosures/turbulence_closure_implementations/nothing_closure.jl
index 63938afb42..4e6df0477b 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/nothing_closure.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/nothing_closure.jl
@@ -3,5 +3,5 @@
 @inline ∂ⱼ_τ₂ⱼ(i, j, k, grid::AbstractGrid{FT}, ::Nothing, args...) where FT = zero(FT)
 @inline ∂ⱼ_τ₃ⱼ(i, j, k, grid::AbstractGrid{FT}, ::Nothing, args...) where FT = zero(FT)
 
-calculate_diffusivities!(diffusivities, ::Nothing, args...) = nothing
-calculate_diffusivities!(::Nothing, ::Nothing, args...) = nothing
+calculate_diffusivities!(diffusivities, ::Nothing, args...; kwargs...) = nothing
+calculate_diffusivities!(::Nothing, ::Nothing, args...; kwargs...) = nothing
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/scalar_biharmonic_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/scalar_biharmonic_diffusivity.jl
index 0c606667ef..e21eddf670 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/scalar_biharmonic_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/scalar_biharmonic_diffusivity.jl
@@ -71,7 +71,7 @@ end
 @inline viscosity(closure::ScalarBiharmonicDiffusivity, K) = closure.ν
 @inline diffusivity(closure::ScalarBiharmonicDiffusivity, K, ::Val{id}) where id = closure.κ[id]
 
-calculate_diffusivities!(diffusivities, closure::ScalarBiharmonicDiffusivity, args...) = nothing
+calculate_diffusivities!(diffusivities, closure::ScalarBiharmonicDiffusivity, args...; kwargs...) = nothing
 
 function Base.summary(closure::ScalarBiharmonicDiffusivity)
     F = summary(formulation(closure))
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/scalar_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/scalar_diffusivity.jl
index 1f9ac99107..6995ff2a0f 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/scalar_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/scalar_diffusivity.jl
@@ -111,7 +111,7 @@ end
 @inline viscosity(closure::ScalarDiffusivity, K) = closure.ν
 @inline diffusivity(closure::ScalarDiffusivity, K, ::Val{id}) where id = closure.κ[id]
 
-calculate_diffusivities!(diffusivities, ::ScalarDiffusivity, args...) = nothing
+calculate_diffusivities!(diffusivities, ::ScalarDiffusivity, args...; kwargs...) = nothing
 
 # Note: we could compute ν and κ (if they are Field):
 # function calculate_diffusivities!(diffusivities, closure::ScalarDiffusivity, args...)
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl b/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl
index 5ad9e84e92..bb53356a2d 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl
@@ -106,7 +106,7 @@ filter width `Δᶠ`, and strain tensor dot product `Σ²`.
 end
 
 
-function calculate_diffusivities!(diffusivity_fields, closure::SmagorinskyLilly, model)
+function calculate_diffusivities!(diffusivity_fields, closure::SmagorinskyLilly, model; kwargs...)
 
     arch = model.architecture
     grid = model.grid

From 5ccbfbc1a355637aaf11298327e9867cb96d926d Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Thu, 23 Feb 2023 11:05:45 -0500
Subject: [PATCH 034/530] fixed all issues?

---
 src/Distributed/interleave_comm_and_comp.jl | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/Distributed/interleave_comm_and_comp.jl b/src/Distributed/interleave_comm_and_comp.jl
index a2c559c5b7..f20078a9dc 100644
--- a/src/Distributed/interleave_comm_and_comp.jl
+++ b/src/Distributed/interleave_comm_and_comp.jl
@@ -2,13 +2,17 @@ using Oceananigans: prognostic_fields
 
 function complete_communication_and_compute_boundary(model, grid::DistributedGrid, arch)
 
-    arch = architecture(grid)
+    for field in prognostic_fields(model)
+        arch = architecture(field.grid)
 
-    MPI.Waitall(arch.mpi_requests)
-    empty!(arch.mpi_requests)
-    arch.mpi_tag[1] = 0
+        # Wait for outstanding requests
+        !isempty(arch.mpi_requests) && MPI.Waitall(arch.mpi_requests)
 
-    for field in prognostic_fields(model)
+        # Reset MPI tag
+        arch.mpi_tag[1] -= arch.mpi_tag[1]
+        
+        # Reset MPI requests
+        empty!(arch.mpi_requests)
         recv_from_buffers!(field.data, field.boundary_buffers, field.grid)
     end
 

From 2ded64fa58cf9829d0ba118c31ab726e8abc9d34 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Thu, 23 Feb 2023 11:46:23 -0500
Subject: [PATCH 035/530] pad to larger ranks

---
 src/Distributed/halo_communication.jl       | 2 +-
 src/Distributed/interleave_comm_and_comp.jl | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index e11e70b90f..d29ea3c76d 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -50,7 +50,7 @@ opposite_side = Dict(
 #   digits 4-6: the "from" rank
 #   digits 7-9: the "to" rank
 
-RANK_DIGITS = 2
+RANK_DIGITS = 3
 ID_DIGITS = 2
 
 for side in sides
diff --git a/src/Distributed/interleave_comm_and_comp.jl b/src/Distributed/interleave_comm_and_comp.jl
index f20078a9dc..4f6444c10c 100644
--- a/src/Distributed/interleave_comm_and_comp.jl
+++ b/src/Distributed/interleave_comm_and_comp.jl
@@ -2,6 +2,8 @@ using Oceananigans: prognostic_fields
 
 function complete_communication_and_compute_boundary(model, grid::DistributedGrid, arch)
 
+    # We iterate over the fields because we have to clear _ALL_ architectures
+    # and split explicit variables live on a different grid
     for field in prognostic_fields(model)
         arch = architecture(field.grid)
 

From cd93807c94567ddee1fc976e0b83857c941885cf Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Thu, 23 Feb 2023 11:49:57 -0500
Subject: [PATCH 036/530] comments

---
 src/Distributed/halo_communication.jl | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index d29ea3c76d..17f4d454ef 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -148,15 +148,19 @@ function fill_halo_event!(task, halo_tuple, c, indices, loc, arch::DistributedAr
 
     requests = fill_halo!(c, bc_left, bc_right, size, offset, loc, arch, grid, buffers, args...; kwargs...)
 
+    # if `isnothing(requests)`, `fill_halo!` did not involve MPI 
     if isnothing(requests)
         return nothing
     end
 
+    # Overlapping communication and computation, store requests in a `MPI.Request`
+    # pool to be waited upon after tendency calculation
     if async && !(arch isa SynchedDistributedArch)
         push!(arch.mpi_requests, requests...)
         return nothing
     end
 
+    # Syncronous MPI fill_halo_event!
     MPI.Waitall(requests)
     buffer_side = mpi_communication_side(Val(fill_halo!))
     recv_from_buffers!(c, buffers, grid, Val(buffer_side))    

From 619f823827bbd96bdc3372c6b319c9e6ce430e4a Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Thu, 23 Feb 2023 13:16:41 -0500
Subject: [PATCH 037/530] enable overlapped computation

---
 src/Distributed/multi_architectures.jl                   | 4 ++--
 .../calculate_hydrostatic_free_surface_tendencies.jl     | 7 +++++--
 .../recompute_boundary_tendencies.jl                     | 9 +++++++--
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/Distributed/multi_architectures.jl b/src/Distributed/multi_architectures.jl
index e7990525d4..900d1eb85c 100644
--- a/src/Distributed/multi_architectures.jl
+++ b/src/Distributed/multi_architectures.jl
@@ -64,7 +64,7 @@ function DistributedArch(child_architecture = CPU();
                    ranks,
                    use_buffers = false,
                    devices = nothing, 
-                   interleave_communication = true,
+                   enable_overlapped_computation = true,
                    communicator = MPI.COMM_WORLD)
 
     MPI.Initialized() || error("Must call MPI.Init() before constructing a MultiCPU.")
@@ -102,7 +102,7 @@ function DistributedArch(child_architecture = CPU();
         isnothing(devices) ? device!(node_rank % ndevices()) : device!(devices[node_rank+1]) 
     end
 
-    mpi_requests = interleave_communication ? MPI.Request[] : nothing
+    mpi_requests = enable_overlapped_computation ? MPI.Request[] : nothing
 
     B = use_buffers
     M = typeof(mpi_requests)
diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index 7fea9ee5b3..dfe630a997 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -48,6 +48,7 @@ function calculate_free_surface_tendency!(grid, model)
 
     launch!(arch, grid, :xy,
             calculate_hydrostatic_free_surface_Gη!, model.timestepper.Gⁿ.η,
+            (0, 0),
             grid,
             model.velocities,
             model.free_surface,
@@ -245,9 +246,11 @@ end
 #####
 
 """ Calculate the right-hand-side of the free surface displacement (``η``) equation. """
-@kernel function calculate_hydrostatic_free_surface_Gη!(Gη, grid, args...)
+@kernel function calculate_hydrostatic_free_surface_Gη!(Gη, offs, grid, args...)
     i, j = @index(Global, NTuple)
-    @inbounds Gη[i, j, grid.Nz+1] = free_surface_tendency(i, j, grid, args...)
+    i′ = i + offs[1]
+    j′ = j + offs[2]
+    @inbounds Gη[i′, j′, grid.Nz+1] = free_surface_tendency(i′, j′, grid, args...)
 end
 
 #####
diff --git a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
index 21fe0a2cd5..f0b4d7f72a 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
@@ -5,9 +5,9 @@ function recompute_boundary_tendencies!(model::HydrostaticFreeSurfaceModel)
     grid = model.grid
     arch = architecture(grid)
 
-    # What shall we do with w, p and κ???
+    # We need new values for `w`, `p` and `κ`
     recompute_auxiliaries!(model, grid, arch)
-    
+
     Nx, Ny, Nz = size(grid)
     Hx, Hy, Hz = halo_size(grid)
 
@@ -49,6 +49,11 @@ function recompute_boundary_tendencies!(model::HydrostaticFreeSurfaceModel)
     
         launch!(arch, grid, kernel_size,
                 calculate_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, kernel_offsets, v_kernel_args...)
+        
+        launch!(arch, grid, kernel_size[1:2],
+                calculate_hydrostatic_free_surface_Gη!, model.timestepper.Gⁿ.η, kernel_offsets[1:2],
+                grid, model.velocities, model.free_surface, model.tracers, model.auxiliary_fields, model.forcing,
+                model.clock)
     end
 
     top_tracer_bcs = top_tracer_boundary_conditions(grid, model.tracers)

From 51146597b71d923c83f2a425947e17175f2f4a54 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Thu, 23 Feb 2023 14:16:12 -0500
Subject: [PATCH 038/530] change

---
 src/Distributed/interleave_comm_and_comp.jl   | 28 +++++----
 ...distributed_split_explicit_free_surface.jl | 21 ++++++-
 .../hydrostatic_free_surface_ab2_step.jl      |  5 +-
 .../split_explicit_free_surface_kernels.jl    | 60 ++++++++++++-------
 4 files changed, 80 insertions(+), 34 deletions(-)

diff --git a/src/Distributed/interleave_comm_and_comp.jl b/src/Distributed/interleave_comm_and_comp.jl
index 4f6444c10c..621b8e68df 100644
--- a/src/Distributed/interleave_comm_and_comp.jl
+++ b/src/Distributed/interleave_comm_and_comp.jl
@@ -5,17 +5,7 @@ function complete_communication_and_compute_boundary(model, grid::DistributedGri
     # We iterate over the fields because we have to clear _ALL_ architectures
     # and split explicit variables live on a different grid
     for field in prognostic_fields(model)
-        arch = architecture(field.grid)
-
-        # Wait for outstanding requests
-        !isempty(arch.mpi_requests) && MPI.Waitall(arch.mpi_requests)
-
-        # Reset MPI tag
-        arch.mpi_tag[1] -= arch.mpi_tag[1]
-        
-        # Reset MPI requests
-        empty!(arch.mpi_requests)
-        recv_from_buffers!(field.data, field.boundary_buffers, field.grid)
+        complete_halo_communication!(field)
     end
 
     # HERE we have to put fill_eventual_halo_corners
@@ -26,3 +16,19 @@ end
 
 complete_communication_and_compute_boundary(model, grid::DistributedGrid, arch::SynchedDistributedArch) = nothing
 recompute_boundary_tendencies!() = nothing
+
+function complete_halo_communication!(field)
+    arch = architecture(field.grid)
+
+    # Wait for outstanding requests
+    !isempty(arch.mpi_requests) && MPI.Waitall(arch.mpi_requests)
+
+    # Reset MPI tag
+    arch.mpi_tag[1] -= arch.mpi_tag[1]
+    
+    # Reset MPI requests
+    empty!(arch.mpi_requests)
+    recv_from_buffers!(field.data, field.boundary_buffers, field.grid)
+
+    return nothing
+end
\ No newline at end of file
diff --git a/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl b/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl
index 1a94431379..d85b5cedf6 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl
@@ -1,5 +1,5 @@
 using Oceananigans.AbstractOperations: GridMetricOperation, Δz
-using Oceananigans.Distributed: DistributedGrid
+using Oceananigans.Distributed: DistributedGrid, DistributedField, complete_halo_communication!
 using Oceananigans.Models.HydrostaticFreeSurfaceModels: SplitExplicitState, SplitExplicitFreeSurface
 
 import Oceananigans.Models.HydrostaticFreeSurfaceModels: FreeSurface, SplitExplicitAuxiliary
@@ -85,3 +85,22 @@ end
 
     return (Ax, Ay, old_halos[3])
 end
+
+const DistributedSplitExplicit = SplitExplicitFreeSurface{<:DistributedField}
+
+function wait_free_surface_communication!(free_surface::DistributedSplitExplicit)
+    
+    state = free_surface.state
+
+    for field in (state.U̅, state.V̅)
+        complete_halo_communication!(field)
+    end
+
+    auxiliary = free_surface.auxiliary
+
+    for field in (auxiliary.Gᵁ, auxiliary.Gⱽ)
+        complete_halo_communication!(field)
+    end
+
+    return nothing
+end
diff --git a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_ab2_step.jl b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_ab2_step.jl
index 477b4a6845..509b1fe0ea 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_ab2_step.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_ab2_step.jl
@@ -8,8 +8,12 @@ import Oceananigans.TimeSteppers: ab2_step!
 ##### Step everything
 #####
 
+setup_free_surface!(model, free_surface, χ) = nothing
+
 function ab2_step!(model::HydrostaticFreeSurfaceModel, Δt, χ)
 
+    setup_free_surface!(model, model.free_surface, χ)
+
     # Step locally velocity and tracers
     @apply_regionally local_ab2_step!(model, Δt, χ)
 
@@ -22,7 +26,6 @@ end
 function local_ab2_step!(model, Δt, χ)
     ab2_step_velocities!(model.velocities, model, Δt, χ)
     ab2_step_tracers!(model.tracers, model, Δt, χ)
-    
     return nothing    
 end
 
diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
index 94368b7af0..33b45401b7 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
@@ -193,6 +193,8 @@ function split_explicit_free_surface_substep!(η, state, auxiliary, settings, ar
 
     launch!(arch, grid, kernel_size, split_explicit_free_surface_evolution_kernel!, args...)
     launch!(arch, grid, kernel_size, split_explicit_barotropic_velocity_evolution_kernel!, args...)
+
+    return nothing
 end
 
 # Barotropic Model Kernels
@@ -257,6 +259,8 @@ function barotropic_split_explicit_corrector!(u, v, free_surface, grid)
 
     launch!(arch, grid, :xyz, barotropic_split_explicit_corrector_kernel!,
         u, v, U̅, V̅, U, V, Hᶠᶜ, Hᶜᶠ)
+
+    return nothing
 end
 
 @kernel function _calc_ab2_tendencies!(G⁻, Gⁿ, χ)
@@ -279,20 +283,11 @@ function split_explicit_free_surface_step!(free_surface::SplitExplicitFreeSurfac
 
     grid = free_surface.η.grid
 
-    # we start the time integration of η from the average ηⁿ     
-    Gu  = model.timestepper.G⁻.u
-    Gv  = model.timestepper.G⁻.v
-    Guⁿ = model.timestepper.Gⁿ.u
-    Gvⁿ = model.timestepper.Gⁿ.v
-
-    velocities = model.velocities
+    # Wait for previous set up
+    wait_free_surface_communication!(free_surface)
 
-    fill_halo_regions!((free_surface.state.U̅, free_surface.state.V̅))
-    
-    @apply_regionally setup_split_explicit!(free_surface.auxiliary, free_surface.state, 
-                                            free_surface.η, grid, Gu, Gv, Guⁿ, Gvⁿ, χ, velocities)
-
-    fill_halo_regions!((free_surface.auxiliary.Gᵁ, free_surface.auxiliary.Gⱽ))
+    # reset free surface averages
+    @apply_regionally initialize_free_surface_state!(free_surface.state, free_surface.η)
 
     # Solve for the free surface at tⁿ⁺¹
     @apply_regionally iterate_split_explicit!(free_surface, grid, Δt)
@@ -302,8 +297,6 @@ function split_explicit_free_surface_step!(free_surface::SplitExplicitFreeSurfac
     # substepped η field
     @apply_regionally set!(free_surface.η, free_surface.state.η̅)
 
-    fill_halo_regions!(free_surface.η; async = true)
-
     return nothing
 end
 
@@ -321,19 +314,44 @@ function iterate_split_explicit!(free_surface, grid, Δt)
     for substep in 1:settings.substeps
         split_explicit_free_surface_substep!(η, state, auxiliary, settings, arch, grid, g, Δτ, substep)
     end
+
+    return nothing
 end
 
-function setup_split_explicit!(auxiliary, state, η, grid, Gu, Gv, Guⁿ, Gvⁿ, χ, velocities)
+# Set up the tendencies and communicate the barotropic velocity components.
+# This function is called after `calculate_tendency` and before `ab2_step!`
+function setup_free_surface!(model, free_surface::SplitExplicitFreeSurface, χ)
+
+    grid = free_surface.η.grid
+    
+    # we start the time integration of η from the average ηⁿ     
+    Gu  = model.timestepper.G⁻.u
+    Gv  = model.timestepper.G⁻.v
+    Guⁿ = model.timestepper.Gⁿ.u
+    Gvⁿ = model.timestepper.Gⁿ.v
+    
+    auxiliary = free_surface.auxiliary
+
+    @apply_regionally setup_split_explicit_tendency!(auxiliary, grid, Gu, Gv, Guⁿ, Gvⁿ, χ)
+
+    fill_halo_regions!((free_surface.state.U̅, free_surface.state.V̅); async = true)
+    fill_halo_regions!((auxiliary.Gᵁ, auxiliary.Gⱽ); async = true)
+
+    return nothing
+end
+
+function setup_split_explicit_tendency!(auxiliary, grid, Gu, Gv, Guⁿ, Gvⁿ, χ)
     arch = architecture(grid)
 
     launch!(arch, grid, :xyz, _calc_ab2_tendencies!, Gu, Guⁿ, χ)
     launch!(arch, grid, :xyz, _calc_ab2_tendencies!, Gv, Gvⁿ, χ)
+    
+    mask_immersed_field!(Gu)
+    mask_immersed_field!(Gv)
 
-    # reset free surface averages
-    initialize_free_surface_state!(state, η)
-
-    # Compute barotropic mode of tendency fields
     barotropic_mode!(auxiliary.Gᵁ, auxiliary.Gⱽ, grid, Gu, Gv)
 
     return nothing
-end
\ No newline at end of file
+end
+
+wait_free_surface_communication!(free_surface) = nothing
\ No newline at end of file

From 460555c3ed23be963f2611c1ca37d9f8cd5a0074 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Fri, 24 Feb 2023 03:36:55 -0500
Subject: [PATCH 039/530] better change

---
 src/Distributed/interleave_comm_and_comp.jl | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/Distributed/interleave_comm_and_comp.jl b/src/Distributed/interleave_comm_and_comp.jl
index 621b8e68df..e31a3bbc50 100644
--- a/src/Distributed/interleave_comm_and_comp.jl
+++ b/src/Distributed/interleave_comm_and_comp.jl
@@ -21,14 +21,16 @@ function complete_halo_communication!(field)
     arch = architecture(field.grid)
 
     # Wait for outstanding requests
-    !isempty(arch.mpi_requests) && MPI.Waitall(arch.mpi_requests)
+    if !isempty(arch.mpi_requests) 
+        MPI.Waitall(arch.mpi_requests)
 
-    # Reset MPI tag
-    arch.mpi_tag[1] -= arch.mpi_tag[1]
+        # Reset MPI tag
+        arch.mpi_tag[1] -= arch.mpi_tag[1]
+    
+        # Reset MPI requests
+        empty!(arch.mpi_requests)
+        recv_from_buffers!(field.data, field.boundary_buffers, field.grid)
+    end
     
-    # Reset MPI requests
-    empty!(arch.mpi_requests)
-    recv_from_buffers!(field.data, field.boundary_buffers, field.grid)
-
     return nothing
 end
\ No newline at end of file

From bb89d536b86b77e1931a94e5079271a687364add Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Fri, 24 Feb 2023 16:43:12 +0100
Subject: [PATCH 040/530] check like this

---
 .../split_explicit_free_surface_kernels.jl                  | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
index 33b45401b7..dcac89e219 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
@@ -330,12 +330,14 @@ function setup_free_surface!(model, free_surface::SplitExplicitFreeSurface, χ)
     Guⁿ = model.timestepper.Gⁿ.u
     Gvⁿ = model.timestepper.Gⁿ.v
     
+    state     = free_surface.state
     auxiliary = free_surface.auxiliary
 
     @apply_regionally setup_split_explicit_tendency!(auxiliary, grid, Gu, Gv, Guⁿ, Gvⁿ, χ)
 
-    fill_halo_regions!((free_surface.state.U̅, free_surface.state.V̅); async = true)
-    fill_halo_regions!((auxiliary.Gᵁ, auxiliary.Gⱽ); async = true)
+    fields_to_fill = (state.U̅, state.V̅, auxiliary.Gᵁ, auxiliary.Gⱽ)
+
+    fill_halo_regions!(fields_to_fill; async = true)
 
     return nothing
 end

From a7ef3a0c9897c75581ba60d3809d8b1ba10598b1 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Mon, 27 Feb 2023 15:39:49 +0100
Subject: [PATCH 041/530] more optimize

---
 .../split_explicit_free_surface_kernels.jl                 | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
index dcac89e219..b92cd3b354 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
@@ -297,6 +297,9 @@ function split_explicit_free_surface_step!(free_surface::SplitExplicitFreeSurfac
     # substepped η field
     @apply_regionally set!(free_surface.η, free_surface.state.η̅)
 
+    fields_to_fill = (free_surface.state.U̅, free_surface.state.V̅)
+    fill_halo_regions!(fields_to_fill; async = true)
+
     return nothing
 end
 
@@ -330,13 +333,11 @@ function setup_free_surface!(model, free_surface::SplitExplicitFreeSurface, χ)
     Guⁿ = model.timestepper.Gⁿ.u
     Gvⁿ = model.timestepper.Gⁿ.v
     
-    state     = free_surface.state
     auxiliary = free_surface.auxiliary
 
     @apply_regionally setup_split_explicit_tendency!(auxiliary, grid, Gu, Gv, Guⁿ, Gvⁿ, χ)
 
-    fields_to_fill = (state.U̅, state.V̅, auxiliary.Gᵁ, auxiliary.Gⱽ)
-
+    fields_to_fill = (auxiliary.Gᵁ, auxiliary.Gⱽ)
     fill_halo_regions!(fields_to_fill; async = true)
 
     return nothing

From 995b64537d94af875c620735f8a1674ab703731d Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Mon, 27 Feb 2023 16:57:19 +0100
Subject: [PATCH 042/530] compute only what is needed

---
 src/Distributed/interleave_comm_and_comp.jl   |  29 ++++
 ...ate_hydrostatic_free_surface_tendencies.jl |  21 ++-
 .../recompute_boundary_tendencies.jl          | 144 ++++++++++++------
 3 files changed, 144 insertions(+), 50 deletions(-)

diff --git a/src/Distributed/interleave_comm_and_comp.jl b/src/Distributed/interleave_comm_and_comp.jl
index e31a3bbc50..51b12b19b7 100644
--- a/src/Distributed/interleave_comm_and_comp.jl
+++ b/src/Distributed/interleave_comm_and_comp.jl
@@ -1,4 +1,5 @@
 using Oceananigans: prognostic_fields
+using Oceananigans.Grids: halo_size
 
 function complete_communication_and_compute_boundary(model, grid::DistributedGrid, arch)
 
@@ -17,6 +18,34 @@ end
 complete_communication_and_compute_boundary(model, grid::DistributedGrid, arch::SynchedDistributedArch) = nothing
 recompute_boundary_tendencies!() = nothing
 
+interior_tendency_kernel_size(grid::DistributedGrid)    = interior_tendency_kernel_size(grid,    architecture(grid))
+interior_tendency_kernel_offsets(grid::DistributedGrid) = interior_tendency_kernel_offsets(grid, architecture(grid))
+
+interior_tendency_kernel_size(grid, ::SynchedDistributedArch) = :xyz
+interior_tendency_kernel_offsets(grid, ::SynchedDistributedArch) = (0, 0, 0)
+
+function interior_tendency_kernel_size(grid, arch)
+    Rx, Ry, _ = arch.ranks
+    Hx, Hy, _ = halo_size(grid)
+
+    Nx, Ny, Nz = size(grid)
+    
+    Ax = Rx == 1 ? 0 : Hx
+    Ay = Ry == 1 ? 0 : Hy
+
+    return (Nx-2Ax, Ny-2Ay, Nz)
+end
+
+function interior_tendency_kernel_offsets(grid, arch)
+    Rx, Ry, _ = arch.ranks
+    Hx, Hy, _ = halo_size(grid)
+    
+    Ax = Rx == 1 ? 0 : Hx
+    Ay = Ry == 1 ? 0 : Hy
+
+    return (Ax, Ay, 0)
+end
+
 function complete_halo_communication!(field)
     arch = architecture(field.grid)
 
diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index dfe630a997..b19fae5f6a 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -7,6 +7,7 @@ using Oceananigans.Fields: immersed_boundary_condition
 using Oceananigans.Grids: halo_size
 
 import Oceananigans.Distributed: complete_communication_and_compute_boundary
+import Oceananigans.Distributed: interior_tendency_kernel_size, interior_tendency_kernel_offsets
 
 using Oceananigans.ImmersedBoundaries: use_only_active_cells, ActiveCellsIBG, active_linear_index_to_ntuple
 
@@ -60,6 +61,8 @@ function calculate_free_surface_tendency!(grid, model)
     return nothing
 end
     
+interior_tendency_kernel_size(grid)    = :xyz
+interior_tendency_kernel_offsets(grid) = (0, 0, 0)
 
 """ Calculate momentum tendencies if momentum is not prescribed."""
 function calculate_hydrostatic_momentum_tendencies!(model, velocities)
@@ -90,12 +93,15 @@ function calculate_hydrostatic_momentum_tendencies!(model, velocities)
     
     only_active_cells = use_only_active_cells(grid)
 
-    launch!(arch, grid, :xyz,
-            calculate_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, (0, 0, 0), u_kernel_args...;
+    kernel_size    =   interior_tendency_kernel_size(grid)
+    kernel_offsets = interior_tendency_kernel_offsets(grid)
+    
+    launch!(arch, grid, kernel_size,
+            calculate_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, kernel_offsets, u_kernel_args...;
             only_active_cells)
 
-    launch!(arch, grid, :xyz,
-            calculate_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, (0, 0, 0), v_kernel_args...;
+    launch!(arch, grid, kernel_size,
+            calculate_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, kernel_offsets, v_kernel_args...;
             only_active_cells)
 
     calculate_free_surface_tendency!(grid, model)
@@ -151,6 +157,9 @@ function calculate_hydrostatic_free_surface_interior_tendency_contributions!(mod
 
     only_active_cells = use_only_active_cells(grid)
 
+    kernel_size    =   interior_tendency_kernel_size(grid)
+    kernel_offsets = interior_tendency_kernel_offsets(grid)
+    
     for (tracer_index, tracer_name) in enumerate(propertynames(model.tracers))
         @inbounds c_tendency = model.timestepper.Gⁿ[tracer_name]
         @inbounds c_advection = model.advection[tracer_name]
@@ -162,10 +171,10 @@ function calculate_hydrostatic_free_surface_interior_tendency_contributions!(mod
                                                                                          model.closure,
                                                                                          model.diffusivity_fields)
 
-        launch!(arch, grid, :xyz,
+        launch!(arch, grid, kernel_size,
                 calculate_hydrostatic_free_surface_Gc!,
                 c_tendency,
-                (0, 0, 0),
+                kernel_offsets,
                 c_kernel_function,
                 grid,
                 Val(tracer_index),
diff --git a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
index f0b4d7f72a..429fd7db16 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
@@ -8,19 +8,7 @@ function recompute_boundary_tendencies!(model::HydrostaticFreeSurfaceModel)
     # We need new values for `w`, `p` and `κ`
     recompute_auxiliaries!(model, grid, arch)
 
-    Nx, Ny, Nz = size(grid)
-    Hx, Hy, Hz = halo_size(grid)
-
-    size_x = (Hx, Ny, Nz)
-    size_y = (Nx, Hy, Nz)
-
-    offsetᴸx = (0,  0,  0)
-    offsetᴸy = (0,  0,  0)
-    offsetᴿx = (Nx-Hx, 0,     0)
-    offsetᴿy = (0,     Ny-Hy, 0)
-
-    sizes   = (size_x,     size_y,   size_x,   size_y)
-    offsets = (offsetᴸx, offsetᴸy, offsetᴿx, offsetᴿy)
+    sizes, offsets = compute_size_tendency_kernel(grid, arch)
 
     u_immersed_bc = immersed_boundary_condition(model.velocities.u)
     v_immersed_bc = immersed_boundary_condition(model.velocities.v)
@@ -92,52 +80,120 @@ function recompute_boundary_tendencies!(model::HydrostaticFreeSurfaceModel)
 end
 
 function recompute_auxiliaries!(model, grid, arch)
-    Nx, Ny, Nz = size(grid)
-    Hx, Hy, Hz = halo_size(grid)
+    
+    sizes, offs = compute_size_w_kernel(grid, arch)
 
-    size_x = (Hx, Ny)
-    size_y = (Nx, Hy)
+    for (kernel_size, kernel_offsets) in zip(sizes, offs)
+        compute_w_from_continuity!(model.velocities, arch, grid; kernel_size, kernel_offsets)
+    end
+
+    sizes, offs = compute_size_p_kernel(grid, arch)
 
-    offsetᴸx = (-Hx+1,  0)
-    offsetᴸy = (0,  -Hy+1)
-    offsetᴿx = (Nx-1, 0)
-    offsetᴿy = (0, Ny-1)
+    for (kernel_size, kernel_offsets) in zip(sizes, offs)
+        update_hydrostatic_pressure!(model.pressure.pHY′, arch, grid, model.buoyancy, model.tracers; kernel_size, kernel_offsets)
+    end
 
-    sizes   = (size_x,     size_y,   size_x,   size_y)
-    offsets = (offsetᴸx, offsetᴸy, offsetᴿx, offsetᴿy)
+    sizes, offs = compute_size_κ_kernel(grid, arch)
 
     for (kernel_size, kernel_offsets) in zip(sizes, offsets)
-        compute_w_from_continuity!(model.velocities, arch, grid; kernel_size, kernel_offsets)
+        calculate_diffusivities!(model.diffusivity_fields, model.closure, model; kernel_size, kernel_offsets)
     end
+end
 
-    size_x = (1, Ny)
-    size_y = (Nx, 1)
+function compute_size_w_kernel(grid, arch)
+    Nx, Ny, _ = size(grid)
+    Hx, Hy, _ = halo_size(grid)
+    Rx, Ry, _ = arch.ranks
 
-    offsetᴸx = (-1,  0)
-    offsetᴸy = (0,  -1)
-    offsetᴿx = (Nx,  0)
-    offsetᴿy = (0,  Ny)
+    size_x = (Hx, Ny)
+    size_y = (Nx, Hy)
 
-    sizes   = (size_x,     size_y,   size_x,   size_y)
-    offsets = (offsetᴸx, offsetᴸy, offsetᴿx, offsetᴿy)
+    offsᴸx = (-Hx+1, 0)
+    offsᴸy = (0, -Hy+1)
+    offsᴿx = (Nx-1, 0)
+    offsᴿy = (0, Ny-1)
+
+    if Rx != 1 && Ry != 1 
+        return ((size_x, size_y, size_x, size_y),
+                (offsᴸx, offsᴸy, offsᴿx, offsᴿy))
+    elseif Rx == 1
+        return ((size_y, size_y),
+                (offsᴸy, offsᴿy))
+    else
+        return ((size_x, size_x),
+                (offsᴸx, offsᴿx))
+    end
+end
 
+function compute_size_p_kernel(grid, arch)
+    Nx, Ny, _ = size(grid)
+    Rx, Ry, _ = arch.ranks
 
-    for (kernel_size, kernel_offsets) in zip(sizes, offsets)
-        update_hydrostatic_pressure!(model.pressure.pHY′, arch, grid, model.buoyancy, model.tracers; kernel_size, kernel_offsets)
+    size_x = (1, Ny)
+    size_y = (Nx, 1)
+
+    offsᴸx = (-1, 0)
+    offsᴸy = (0, -1)
+    offsᴿx = (Nx, 0)
+    offsᴿy = (0, Ny)
+
+    if Rx != 1 && Ry != 1 
+        return ((size_x, size_y, size_x, size_y),
+                (offsᴸx, offsᴸy, offsᴿx, offsᴿy))
+    elseif Rx == 1
+        return ((size_y, size_y),
+                (offsᴸy, offsᴿy))
+    else
+        return ((size_x, size_x),
+                (offsᴸx, offsᴿx))
     end
+end
+
+function compute_size_κ_kernel(grid, arch)
+    Nx, Ny, Nz = size(grid)
+    Rx, Ry, _  = arch.ranks
 
     size_x = (1, Ny, Nz)
     size_y = (Nx, 1, Nz)
 
-    offsetᴸx = (-1,  0, 0)
-    offsetᴸy = (0,  -1, 0)
-    offsetᴿx = (Nx,  0, 0)
-    offsetᴿy = (0,  Ny, 0)
-
-    sizes   = (size_x,     size_y,   size_x,   size_y)
-    offsets = (offsetᴸx, offsetᴸy, offsetᴿx, offsetᴿy)
+    offsᴸx = (-1,  0, 0)
+    offsᴸy = (0,  -1, 0)
+    offsᴿx = (Nx,  0, 0)
+    offsᴿy = (0,  Ny, 0)
+
+    if Rx != 1 && Ry != 1 
+        return ((size_x, size_y, size_x, size_y),
+                (offsᴸx, offsᴸy, offsᴿx, offsᴿy))
+    elseif Rx == 1
+        return ((size_y, size_y),
+                (offsᴸy, offsᴿy))
+    else
+        return ((size_x, size_x),
+                (offsᴸx, offsᴿx))
+    end
+end
 
-    for (kernel_size, kernel_offsets) in zip(sizes, offsets)
-        calculate_diffusivities!(model.diffusivity_fields, model.closure, model; kernel_size, kernel_offsets)
+function compute_size_tendency_kernel(grid, arch)
+    Nx, Ny, Nz = size(grid)
+    Hx, Hy, Hz = halo_size(grid)
+    Rx, Ry, _  = arch.ranks
+    
+    size_x = (Hx, Ny, Nz)
+    size_y = (Nx, Hy, Nz)
+    
+    offsᴸx = (0,  0,  0)
+    offsᴸy = (0,  0,  0)
+    offsᴿx = (Nx-Hx, 0,     0)
+    offsᴿy = (0,     Ny-Hy, 0)
+        
+    if Rx != 1 && Ry != 1 
+        return ((size_x, size_y, size_x, size_y),
+                (offsᴸx, offsᴸy, offsᴿx, offsᴿy))
+    elseif Rx == 1
+        return ((size_y, size_y),
+                (offsᴸy, offsᴿy))
+    else
+        return ((size_x, size_x),
+                (offsᴸx, offsᴿx))
     end
-end
\ No newline at end of file
+end

From 8abb3c63b13e8205816310dd3f7437f0457b4383 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Mon, 27 Feb 2023 17:01:58 +0100
Subject: [PATCH 043/530] compute only what required

---
 .../recompute_boundary_tendencies.jl          | 63 +++++++------------
 1 file changed, 23 insertions(+), 40 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
index 429fd7db16..3dd04eb8de 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
@@ -113,16 +113,10 @@ function compute_size_w_kernel(grid, arch)
     offsᴿx = (Nx-1, 0)
     offsᴿy = (0, Ny-1)
 
-    if Rx != 1 && Ry != 1 
-        return ((size_x, size_y, size_x, size_y),
-                (offsᴸx, offsᴸy, offsᴿx, offsᴿy))
-    elseif Rx == 1
-        return ((size_y, size_y),
-                (offsᴸy, offsᴿy))
-    else
-        return ((size_x, size_x),
-                (offsᴸx, offsᴿx))
-    end
+    sizes = (size_x, size_y, size_x, size_y)
+    offs  = (offsᴸx, offsᴸy, offsᴿx, offsᴿy)
+        
+    return return_correct_directions(Rx, Ry, sizes, offs)
 end
 
 function compute_size_p_kernel(grid, arch)
@@ -137,16 +131,10 @@ function compute_size_p_kernel(grid, arch)
     offsᴿx = (Nx, 0)
     offsᴿy = (0, Ny)
 
-    if Rx != 1 && Ry != 1 
-        return ((size_x, size_y, size_x, size_y),
-                (offsᴸx, offsᴸy, offsᴿx, offsᴿy))
-    elseif Rx == 1
-        return ((size_y, size_y),
-                (offsᴸy, offsᴿy))
-    else
-        return ((size_x, size_x),
-                (offsᴸx, offsᴿx))
-    end
+    sizes = (size_x, size_y, size_x, size_y)
+    offs  = (offsᴸx, offsᴸy, offsᴿx, offsᴿy)
+        
+    return return_correct_directions(Rx, Ry, sizes, offs)
 end
 
 function compute_size_κ_kernel(grid, arch)
@@ -161,16 +149,10 @@ function compute_size_κ_kernel(grid, arch)
     offsᴿx = (Nx,  0, 0)
     offsᴿy = (0,  Ny, 0)
 
-    if Rx != 1 && Ry != 1 
-        return ((size_x, size_y, size_x, size_y),
-                (offsᴸx, offsᴸy, offsᴿx, offsᴿy))
-    elseif Rx == 1
-        return ((size_y, size_y),
-                (offsᴸy, offsᴿy))
-    else
-        return ((size_x, size_x),
-                (offsᴸx, offsᴿx))
-    end
+    sizes = (size_x, size_y, size_x, size_y)
+    offs  = (offsᴸx, offsᴸy, offsᴿx, offsᴿy)
+        
+    return return_correct_directions(Rx, Ry, sizes, offs)
 end
 
 function compute_size_tendency_kernel(grid, arch)
@@ -185,15 +167,16 @@ function compute_size_tendency_kernel(grid, arch)
     offsᴸy = (0,  0,  0)
     offsᴿx = (Nx-Hx, 0,     0)
     offsᴿy = (0,     Ny-Hy, 0)
+
+    sizes = (size_x, size_y, size_x, size_y)
+    offs  = (offsᴸx, offsᴸy, offsᴿx, offsᴿy)
         
-    if Rx != 1 && Ry != 1 
-        return ((size_x, size_y, size_x, size_y),
-                (offsᴸx, offsᴸy, offsᴿx, offsᴿy))
-    elseif Rx == 1
-        return ((size_y, size_y),
-                (offsᴸy, offsᴿy))
-    else
-        return ((size_x, size_x),
-                (offsᴸx, offsᴿx))
-    end
+    return return_correct_directions(Rx, Ry, sizes, offs)
 end
+
+return_correct_directions(Rx, Ry, s, o) = Rx != 1 && Ry !=1 ? 
+                                          (s, o) :
+                                          Ry == 1 ?
+                                          ((s[1], s[3]), (o[1], o[3])) :
+                                          ((s[2], s[4]), (o[2], o[4])) 
+

From e8ad4f82d0503c53ca2bd8eb767c346d3a300feb Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Tue, 28 Feb 2023 10:39:23 +0100
Subject: [PATCH 044/530] bugfix

---
 .../recompute_boundary_tendencies.jl          | 22 ++++++++++++-------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
index 3dd04eb8de..23506e341d 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
@@ -81,15 +81,15 @@ end
 
 function recompute_auxiliaries!(model, grid, arch)
     
-    sizes, offs = compute_size_w_kernel(grid, arch)
+    sizes, offsets = compute_size_w_kernel(grid, arch)
 
-    for (kernel_size, kernel_offsets) in zip(sizes, offs)
+    for (kernel_size, kernel_offsets) in zip(sizes, offsets)
         compute_w_from_continuity!(model.velocities, arch, grid; kernel_size, kernel_offsets)
     end
 
     sizes, offs = compute_size_p_kernel(grid, arch)
 
-    for (kernel_size, kernel_offsets) in zip(sizes, offs)
+    for (kernel_size, kernel_offsets) in zip(sizes, offsets)
         update_hydrostatic_pressure!(model.pressure.pHY′, arch, grid, model.buoyancy, model.tracers; kernel_size, kernel_offsets)
     end
 
@@ -174,9 +174,15 @@ function compute_size_tendency_kernel(grid, arch)
     return return_correct_directions(Rx, Ry, sizes, offs)
 end
 
-return_correct_directions(Rx, Ry, s, o) = Rx != 1 && Ry !=1 ? 
-                                          (s, o) :
-                                          Ry == 1 ?
-                                          ((s[1], s[3]), (o[1], o[3])) :
-                                          ((s[2], s[4]), (o[2], o[4])) 
+function return_correct_directions(Rx, Ry, s, o) 
+    if Rx != 1 && Ry !=1 
+        return s, o
+    elseif Rx != 1 && Ry == 1 
+        return (s[1], s[3]), (o[1], o[3])
+    elseif Rx == 1 && Ry != 1 
+        return (s[2], s[4]), (o[2], o[4])
+    else
+        return (), ()
+    end
+end
 

From 85d6ee29c85e5fff95d09b99331f1fd1c9299863 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Thu, 2 Mar 2023 09:53:21 +0100
Subject: [PATCH 045/530] final change

---
 .../recompute_boundary_tendencies.jl          | 36 +++++++++----------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
index 23506e341d..5710f1f3d6 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
@@ -8,7 +8,7 @@ function recompute_boundary_tendencies!(model::HydrostaticFreeSurfaceModel)
     # We need new values for `w`, `p` and `κ`
     recompute_auxiliaries!(model, grid, arch)
 
-    sizes, offsets = compute_size_tendency_kernel(grid, arch)
+    sizes, offsets = size_tendency_kernel(grid, arch)
 
     u_immersed_bc = immersed_boundary_condition(model.velocities.u)
     v_immersed_bc = immersed_boundary_condition(model.velocities.v)
@@ -81,26 +81,26 @@ end
 
 function recompute_auxiliaries!(model, grid, arch)
     
-    sizes, offsets = compute_size_w_kernel(grid, arch)
+    sizes, offs = size_w_kernel(grid, arch)
 
-    for (kernel_size, kernel_offsets) in zip(sizes, offsets)
+    for (kernel_size, kernel_offsets) in zip(sizes, offs)
         compute_w_from_continuity!(model.velocities, arch, grid; kernel_size, kernel_offsets)
     end
 
-    sizes, offs = compute_size_p_kernel(grid, arch)
+    sizes, offs = size_p_kernel(grid, arch)
 
-    for (kernel_size, kernel_offsets) in zip(sizes, offsets)
+    for (kernel_size, kernel_offsets) in zip(sizes, offs)
         update_hydrostatic_pressure!(model.pressure.pHY′, arch, grid, model.buoyancy, model.tracers; kernel_size, kernel_offsets)
     end
 
-    sizes, offs = compute_size_κ_kernel(grid, arch)
+    sizes, offs = size_κ_kernel(grid, arch)
 
-    for (kernel_size, kernel_offsets) in zip(sizes, offsets)
+    for (kernel_size, kernel_offsets) in zip(sizes, offs)
         calculate_diffusivities!(model.diffusivity_fields, model.closure, model; kernel_size, kernel_offsets)
     end
 end
 
-function compute_size_w_kernel(grid, arch)
+function size_w_kernel(grid, arch)
     Nx, Ny, _ = size(grid)
     Hx, Hy, _ = halo_size(grid)
     Rx, Ry, _ = arch.ranks
@@ -119,7 +119,7 @@ function compute_size_w_kernel(grid, arch)
     return return_correct_directions(Rx, Ry, sizes, offs)
 end
 
-function compute_size_p_kernel(grid, arch)
+function size_p_kernel(grid, arch)
     Nx, Ny, _ = size(grid)
     Rx, Ry, _ = arch.ranks
 
@@ -137,17 +137,17 @@ function compute_size_p_kernel(grid, arch)
     return return_correct_directions(Rx, Ry, sizes, offs)
 end
 
-function compute_size_κ_kernel(grid, arch)
+function size_κ_kernel(grid, arch)
     Nx, Ny, Nz = size(grid)
     Rx, Ry, _  = arch.ranks
 
-    size_x = (1, Ny, Nz)
-    size_y = (Nx, 1, Nz)
+    size_x = (2, Ny, Nz)
+    size_y = (Nx, 2, Nz)
 
-    offsᴸx = (-1,  0, 0)
-    offsᴸy = (0,  -1, 0)
-    offsᴿx = (Nx,  0, 0)
-    offsᴿy = (0,  Ny, 0)
+    offsᴸx = (-2,    0, 0)
+    offsᴸy = (0,    -2, 0)
+    offsᴿx = (Nx-1,  0, 0)
+    offsᴿy = (0,  Ny-1, 0)
 
     sizes = (size_x, size_y, size_x, size_y)
     offs  = (offsᴸx, offsᴸy, offsᴿx, offsᴿy)
@@ -155,7 +155,7 @@ function compute_size_κ_kernel(grid, arch)
     return return_correct_directions(Rx, Ry, sizes, offs)
 end
 
-function compute_size_tendency_kernel(grid, arch)
+function size_tendency_kernel(grid, arch)
     Nx, Ny, Nz = size(grid)
     Hx, Hy, Hz = halo_size(grid)
     Rx, Ry, _  = arch.ranks
@@ -175,7 +175,7 @@ function compute_size_tendency_kernel(grid, arch)
 end
 
 function return_correct_directions(Rx, Ry, s, o) 
-    if Rx != 1 && Ry !=1 
+    if Rx != 1 && Ry != 1 
         return s, o
     elseif Rx != 1 && Ry == 1 
         return (s[1], s[3]), (o[1], o[3])

From 61c91a3c529f53d8669067123a533600e433e3ea Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Fri, 3 Mar 2023 14:03:08 +0100
Subject: [PATCH 046/530] fixed bugs

---
 src/Architectures.jl                          |  6 ++--
 src/Distributed/halo_communication.jl         |  4 +--
 src/Distributed/interleave_comm_and_comp.jl   | 14 +++++---
 src/Distributed/multi_architectures.jl        |  2 +-
 .../compute_w_from_continuity.jl              |  6 ++--
 .../recompute_boundary_tendencies.jl          | 35 +++++++++++--------
 .../split_explicit_free_surface_kernels.jl    |  4 +--
 ...te_hydrostatic_free_surface_model_state.jl |  2 +-
 8 files changed, 43 insertions(+), 30 deletions(-)

diff --git a/src/Architectures.jl b/src/Architectures.jl
index cb0a107b08..ef2a5a59e3 100644
--- a/src/Architectures.jl
+++ b/src/Architectures.jl
@@ -87,12 +87,12 @@ function unified_array(::GPU, arr::AbstractArray)
     return vec
 end
 
-## Only for contiguous data!! (i.e. only if the offset for pointer(dst::CuArrat, offset::Int) is 1)
-@inline function device_copy_to!(dst::CuArray, src::CuArray; async::Bool = false) 
+## Only for contiguous data!! (i.e. only if the offset for pointer(dst::CuArray, offset::Int) is 1)
+@inline function device_copy_to!(dst::CuArray, src::CuArray; blocking::Bool = false) 
     n = length(src)
     context!(context(src)) do
         GC.@preserve src dst begin
-            unsafe_copyto!(pointer(dst, 1), pointer(src, 1), n; async)
+            unsafe_copyto!(pointer(dst, 1), pointer(src, 1), n; async = blocking)
         end
     end
     return dst
diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 17f4d454ef..f3f4f927b3 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -137,7 +137,7 @@ end
 @inline mpi_communication_side(::Val{fill_south_and_north_halo!}) = :south_and_north
 @inline mpi_communication_side(::Val{fill_bottom_and_top_halo!})  = :bottom_and_top
 
-function fill_halo_event!(task, halo_tuple, c, indices, loc, arch::DistributedArch, grid::DistributedGrid, buffers, args...; async = false, kwargs...)
+function fill_halo_event!(task, halo_tuple, c, indices, loc, arch::DistributedArch, grid::DistributedGrid, buffers, args...; blocking = false, kwargs...)
     fill_halo!  = halo_tuple[1][task]
     bc_left     = halo_tuple[2][task]
     bc_right    = halo_tuple[3][task]
@@ -155,7 +155,7 @@ function fill_halo_event!(task, halo_tuple, c, indices, loc, arch::DistributedAr
 
     # Overlapping communication and computation, store requests in a `MPI.Request`
     # pool to be waited upon after tendency calculation
-    if async && !(arch isa SynchedDistributedArch)
+    if blocking && !(arch isa BlockingDistributedArch)
         push!(arch.mpi_requests, requests...)
         return nothing
     end
diff --git a/src/Distributed/interleave_comm_and_comp.jl b/src/Distributed/interleave_comm_and_comp.jl
index 51b12b19b7..ce4b938924 100644
--- a/src/Distributed/interleave_comm_and_comp.jl
+++ b/src/Distributed/interleave_comm_and_comp.jl
@@ -15,14 +15,14 @@ function complete_communication_and_compute_boundary(model, grid::DistributedGri
     return nothing
 end
 
-complete_communication_and_compute_boundary(model, grid::DistributedGrid, arch::SynchedDistributedArch) = nothing
+complete_communication_and_compute_boundary(model, grid::DistributedGrid, arch::BlockingDistributedArch) = nothing
 recompute_boundary_tendencies!() = nothing
 
 interior_tendency_kernel_size(grid::DistributedGrid)    = interior_tendency_kernel_size(grid,    architecture(grid))
 interior_tendency_kernel_offsets(grid::DistributedGrid) = interior_tendency_kernel_offsets(grid, architecture(grid))
 
-interior_tendency_kernel_size(grid, ::SynchedDistributedArch) = :xyz
-interior_tendency_kernel_offsets(grid, ::SynchedDistributedArch) = (0, 0, 0)
+interior_tendency_kernel_size(grid, ::BlockingDistributedArch) = :xyz
+interior_tendency_kernel_offsets(grid, ::BlockingDistributedArch) = (0, 0, 0)
 
 function interior_tendency_kernel_size(grid, arch)
     Rx, Ry, _ = arch.ranks
@@ -46,6 +46,11 @@ function interior_tendency_kernel_offsets(grid, arch)
     return (Ax, Ay, 0)
 end
 
+"""
+    complete_halo_communication!(field)
+
+Complete the halo passing of `field` among processors.
+"""
 function complete_halo_communication!(field)
     arch = architecture(field.grid)
 
@@ -58,8 +63,9 @@ function complete_halo_communication!(field)
     
         # Reset MPI requests
         empty!(arch.mpi_requests)
-        recv_from_buffers!(field.data, field.boundary_buffers, field.grid)
     end
     
+    recv_from_buffers!(field.data, field.boundary_buffers, field.grid)
+    
     return nothing
 end
\ No newline at end of file
diff --git a/src/Distributed/multi_architectures.jl b/src/Distributed/multi_architectures.jl
index 900d1eb85c..31b18d6499 100644
--- a/src/Distributed/multi_architectures.jl
+++ b/src/Distributed/multi_architectures.jl
@@ -112,7 +112,7 @@ function DistributedArch(child_architecture = CPU();
 end
 
 const ViewsDistributedArch   = DistributedArch{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, false}
-const SynchedDistributedArch = DistributedArch{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:Nothing}
+const BlockingDistributedArch = DistributedArch{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:Nothing}
 
 using_buffered_communication(::DistributedArch{A, R, I, ρ, C, γ, B}) where {A, R, I, ρ, C, γ, B} = B
 
diff --git a/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl b/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
index a87c677a50..10ccf5013d 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
@@ -1,5 +1,6 @@
 using Oceananigans.Architectures: device
 using Oceananigans.Operators: div_xyᶜᶜᶜ, Δzᶜᶜᶜ
+using Oceananigans.Grids: halo_size
 
 """
     compute_w_from_continuity!(model)
@@ -12,11 +13,12 @@ w^{n+1} = -∫ [∂/∂x (u^{n+1}) + ∂/∂y (v^{n+1})] dz
 """
 compute_w_from_continuity!(model) = compute_w_from_continuity!(model.velocities, model.architecture, model.grid)
 
-compute_w_from_continuity!(velocities, arch, grid; kernel_size = w_kernel_size(grid), kernel_offsets = (-1, -1)) = 
+compute_w_from_continuity!(velocities, arch, grid; kernel_size = w_kernel_size(grid), kernel_offsets = w_kernel_offsets(grid)) = 
     launch!(arch, grid, kernel_size, _compute_w_from_continuity!, velocities, kernel_offsets, grid)
 
 # extend w kernel to compute also the boundaries
-@inline w_kernel_size(grid) = size(grid)[[1, 2]] .+ 2
+@inline w_kernel_size(grid)    = size(grid)[[1, 2]] .+ halo_size(grid)[[1, 2]] .- 2
+@inline w_kernel_offsets(grid) = - halo_size(grid)[[1, 2]] .+ 1
 
 @kernel function _compute_w_from_continuity!(U, offs, grid)
     i, j = @index(Global, NTuple)
diff --git a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
index 5710f1f3d6..c69ab4e53d 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
@@ -116,7 +116,7 @@ function size_w_kernel(grid, arch)
     sizes = (size_x, size_y, size_x, size_y)
     offs  = (offsᴸx, offsᴸy, offsᴿx, offsᴿy)
         
-    return return_correct_directions(Rx, Ry, sizes, offs)
+    return return_correct_directions(Rx, Ry, sizes, offs, grid)
 end
 
 function size_p_kernel(grid, arch)
@@ -134,30 +134,30 @@ function size_p_kernel(grid, arch)
     sizes = (size_x, size_y, size_x, size_y)
     offs  = (offsᴸx, offsᴸy, offsᴿx, offsᴿy)
         
-    return return_correct_directions(Rx, Ry, sizes, offs)
+    return return_correct_directions(Rx, Ry, sizes, offs, grid)
 end
 
 function size_κ_kernel(grid, arch)
     Nx, Ny, Nz = size(grid)
     Rx, Ry, _  = arch.ranks
 
-    size_x = (2, Ny, Nz)
-    size_y = (Nx, 2, Nz)
+    size_x = (1, Ny, Nz)
+    size_y = (Nx, 1, Nz)
 
-    offsᴸx = (-2,    0, 0)
-    offsᴸy = (0,    -2, 0)
-    offsᴿx = (Nx-1,  0, 0)
-    offsᴿy = (0,  Ny-1, 0)
+    offsᴸx = (-1,  0, 0)
+    offsᴸy = (0,  -1, 0)
+    offsᴿx = (Nx,  0, 0)
+    offsᴿy = (0,  Ny, 0)
 
     sizes = (size_x, size_y, size_x, size_y)
     offs  = (offsᴸx, offsᴸy, offsᴿx, offsᴿy)
         
-    return return_correct_directions(Rx, Ry, sizes, offs)
+    return return_correct_directions(Rx, Ry, sizes, offs, grid)
 end
 
 function size_tendency_kernel(grid, arch)
     Nx, Ny, Nz = size(grid)
-    Hx, Hy, Hz = halo_size(grid)
+    Hx, Hy, _  = halo_size(grid)
     Rx, Ry, _  = arch.ranks
     
     size_x = (Hx, Ny, Nz)
@@ -171,15 +171,20 @@ function size_tendency_kernel(grid, arch)
     sizes = (size_x, size_y, size_x, size_y)
     offs  = (offsᴸx, offsᴸy, offsᴿx, offsᴿy)
         
-    return return_correct_directions(Rx, Ry, sizes, offs)
+    return return_correct_directions(Rx, Ry, sizes, offs, grid)
 end
 
-function return_correct_directions(Rx, Ry, s, o) 
-    if Rx != 1 && Ry != 1 
+using Oceananigans.Operators: XFlatGrid, YFlatGrid
+
+function return_correct_directions(Rx, Ry, s, o, grid) 
+    include_x = !isa(grid, XFlatGrid) && (Rx != 1)
+    include_y = !isa(grid, YFlatGrid) && (Ry != 1)
+
+    if include_x && include_y
         return s, o
-    elseif Rx != 1 && Ry == 1 
+    elseif include_x && !(include_y)
         return (s[1], s[3]), (o[1], o[3])
-    elseif Rx == 1 && Ry != 1 
+    elseif !(include_x) && include_y
         return (s[2], s[4]), (o[2], o[4])
     else
         return (), ()
diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
index b92cd3b354..a9b46cb83f 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
@@ -298,7 +298,7 @@ function split_explicit_free_surface_step!(free_surface::SplitExplicitFreeSurfac
     @apply_regionally set!(free_surface.η, free_surface.state.η̅)
 
     fields_to_fill = (free_surface.state.U̅, free_surface.state.V̅)
-    fill_halo_regions!(fields_to_fill; async = true)
+    fill_halo_regions!(fields_to_fill; blocking = true)
 
     return nothing
 end
@@ -338,7 +338,7 @@ function setup_free_surface!(model, free_surface::SplitExplicitFreeSurface, χ)
     @apply_regionally setup_split_explicit_tendency!(auxiliary, grid, Gu, Gv, Guⁿ, Gvⁿ, χ)
 
     fields_to_fill = (auxiliary.Gᵁ, auxiliary.Gⱽ)
-    fill_halo_regions!(fields_to_fill; async = true)
+    fill_halo_regions!(fields_to_fill; blocking = true)
 
     return nothing
 end
diff --git a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
index 1b8e5395e2..8db1098800 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
@@ -24,7 +24,7 @@ function update_state!(model::HydrostaticFreeSurfaceModel, grid, callbacks)
 
     @apply_regionally masking_actions!(model, grid)
 
-    fill_halo_regions!(prognostic_fields(model), model.clock, fields(model); async = true)
+    fill_halo_regions!(prognostic_fields(model), model.clock, fields(model); blocking = true)
 
     @apply_regionally compute_w_diffusivities_pressure!(model)
 

From b8d9bbc55252875763d33df56f3804b9474310d1 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Mon, 6 Mar 2023 08:47:30 -0500
Subject: [PATCH 047/530] support for flat grids

---
 .../compute_w_from_continuity.jl              | 38 +++++++++++++++++--
 .../update_hydrostatic_pressure.jl            | 28 ++++++++++++--
 .../ri_based_vertical_diffusivity.jl          |  4 +-
 .../turbulence_closure_utils.jl               | 23 +++++++++++
 4 files changed, 83 insertions(+), 10 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl b/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
index 10ccf5013d..09f7796861 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
@@ -16,10 +16,6 @@ compute_w_from_continuity!(model) = compute_w_from_continuity!(model.velocities,
 compute_w_from_continuity!(velocities, arch, grid; kernel_size = w_kernel_size(grid), kernel_offsets = w_kernel_offsets(grid)) = 
     launch!(arch, grid, kernel_size, _compute_w_from_continuity!, velocities, kernel_offsets, grid)
 
-# extend w kernel to compute also the boundaries
-@inline w_kernel_size(grid)    = size(grid)[[1, 2]] .+ halo_size(grid)[[1, 2]] .- 2
-@inline w_kernel_offsets(grid) = - halo_size(grid)[[1, 2]] .+ 1
-
 @kernel function _compute_w_from_continuity!(U, offs, grid)
     i, j = @index(Global, NTuple)
 
@@ -31,3 +27,37 @@ compute_w_from_continuity!(velocities, arch, grid; kernel_size = w_kernel_size(g
         @inbounds U.w[i′, j′, k] = U.w[i′, j′, k-1] - Δzᶜᶜᶜ(i′, j′, k-1, grid) * div_xyᶜᶜᶜ(i′, j′, k-1, grid, U.u, U.v)
     end
 end
+
+#####
+##### Size and offsets for the w kernel
+#####
+
+# Extend the w kernel so that it also computes w on the boundaries.
+# Along Flat directions, do not compute on the halos!
+
+using Oceananigans.Operators: XFlatGrid, YFlatGrid
+using Oceananigans.Grids: topology
+
+@inline function w_kernel_size(grid)
+    # Flat directions keep their interior size; the others are widened by
+    # 2H - 2 points so that the halo region is covered as well.
+    Nx, Ny = size(grid)
+    Hx, Hy = halo_size(grid)
+    topo_x, topo_y = topology(grid)
+
+    size_x = topo_x == Flat ? Nx : Nx + 2 * Hx - 2
+    size_y = topo_y == Flat ? Ny : Ny + 2 * Hy - 2
+    return size_x, size_y
+end
+
+@inline function w_kernel_offsets(grid)
+    # The offset is 1 - H along every non-Flat horizontal direction and 0
+    # along Flat ones.
+    Hx, Hy = halo_size(grid)
+    topo_x, topo_y = topology(grid)
+
+    offset_x = topo_x == Flat ? 0 : 1 - Hx
+    offset_y = topo_y == Flat ? 0 : 1 - Hy
+    return offset_x, offset_y
+end
+
diff --git a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
index c6fd45a481..df00ac3531 100644
--- a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
+++ b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
@@ -27,11 +27,33 @@ update_hydrostatic_pressure!(grid, model) = update_hydrostatic_pressure!(model.p
 const PCB = PartialCellBottom
 const PCBIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:PCB}
 
-# extend p kernel to compute also the boundaries
-@inline p_kernel_size(grid) = size(grid)[[1, 2]] .+ 2
-
 update_hydrostatic_pressure!(pHY′, arch, ibg::PCBIBG, buoyancy, tracers; kernel_size = p_kernel_size(grid), kernel_offsets = (-1, -1)) =
     update_hydrostatic_pressure!(pHY′, arch, ibg.underlying_grid, buoyancy, tracers; kernel_size, kernel_offsets)
 
 update_hydrostatic_pressure!(pHY′, arch, grid, buoyancy, tracers; kernel_size = p_kernel_size(grid), kernel_offsets = (-1, -1)) =  
         launch!(arch, grid, kernel_size, _update_hydrostatic_pressure!, pHY′, kernel_offsets, grid, buoyancy, tracers)
+
+using Oceananigans.Grids: topology
+
+# Extend the p kernel so that it also computes the pressure on the boundaries.
+@inline function p_kernel_size(grid)
+    # Non-Flat horizontal directions get two extra points; Flat ones keep their size.
+    Nx, Ny = size(grid)
+    topo_x, topo_y = topology(grid)
+
+    size_x = topo_x == Flat ? Nx : Nx + 2
+    size_y = topo_y == Flat ? Ny : Ny + 2
+
+    return size_x, size_y
+end
+
+@inline function p_kernel_offsets(grid)
+    # Non-Flat horizontal directions are shifted by -1; Flat ones are not shifted.
+    topo_x, topo_y = topology(grid)
+
+    offset_x = topo_x == Flat ? 0 : -1
+    offset_y = topo_y == Flat ? 0 : -1
+    return offset_x, offset_y
+end
+        
+        
\ No newline at end of file
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index 6b9a93031e..3782c62cb4 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -111,9 +111,7 @@ function DiffusivityFields(grid, tracer_names, bcs, closure::FlavorOfRBVD)
     return (; κ, ν)
 end
 
-@inline kappa_kernel_size(grid) = size(grid) .+ 2
-
-function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; kernel_size = kappa_kernel_size(model.grid), kernel_offsets = (-1, -1, -1))
+function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; kernel_size = κ_kernel_size(model.grid), kernel_offsets = κ_kernel_offsets(model.grid))
     arch = model.architecture
     grid = model.grid
     clock = model.clock
diff --git a/src/TurbulenceClosures/turbulence_closure_utils.jl b/src/TurbulenceClosures/turbulence_closure_utils.jl
index 488e79763e..c2f1e125ab 100644
--- a/src/TurbulenceClosures/turbulence_closure_utils.jl
+++ b/src/TurbulenceClosures/turbulence_closure_utils.jl
@@ -38,3 +38,26 @@ end
     i, j, k = @index(Global, NTuple)
     @inbounds κₑ[i, j, k] = calc_nonlinear_κᶜᶜᶜ(i, j, k, grid, closure, tracer, tracer_index, U)
 end
+
+# Extend the κ kernel so that it also computes the diffusivities on the boundaries.
+@inline function κ_kernel_size(grid)
+    # Every non-Flat direction gets two extra points; Flat ones keep their size.
+    Nx, Ny, Nz = size(grid)
+    topo_x, topo_y, topo_z = topology(grid)
+
+    size_x = topo_x == Flat ? Nx : Nx + 2
+    size_y = topo_y == Flat ? Ny : Ny + 2
+    size_z = topo_z == Flat ? Nz : Nz + 2
+
+    return size_x, size_y, size_z
+end
+
+@inline function κ_kernel_offsets(grid)
+    # Non-Flat directions are shifted by -1 to pair with the two extra points
+    # added by κ_kernel_size; Flat directions are not shifted.
+    Tx, Ty, Tz = topology(grid)
+    Ax = Tx == Flat ? 0 : -1
+    Ay = Ty == Flat ? 0 : -1
+    Az = Tz == Flat ? 0 : -1
+    return (Ax, Ay, Az)
+end
\ No newline at end of file

From 7b988238a1ba21717ae183afbba067c8d08ab952 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Mon, 6 Mar 2023 09:29:18 -0500
Subject: [PATCH 048/530] updating p offsets

---
 .../NonhydrostaticModels/update_hydrostatic_pressure.jl       | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
index df00ac3531..0a512f1926 100644
--- a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
+++ b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
@@ -27,10 +27,10 @@ update_hydrostatic_pressure!(grid, model) = update_hydrostatic_pressure!(model.p
 const PCB = PartialCellBottom
 const PCBIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:PCB}
 
-update_hydrostatic_pressure!(pHY′, arch, ibg::PCBIBG, buoyancy, tracers; kernel_size = p_kernel_size(grid), kernel_offsets = (-1, -1)) =
+update_hydrostatic_pressure!(pHY′, arch, ibg::PCBIBG, buoyancy, tracers; kernel_size = p_kernel_size(ibg.underlying_grid), kernel_offsets = p_kernel_offsets(ibg.underlying_grid)) =
     update_hydrostatic_pressure!(pHY′, arch, ibg.underlying_grid, buoyancy, tracers; kernel_size, kernel_offsets)
 
-update_hydrostatic_pressure!(pHY′, arch, grid, buoyancy, tracers; kernel_size = p_kernel_size(grid), kernel_offsets = (-1, -1)) =  
+update_hydrostatic_pressure!(pHY′, arch, grid, buoyancy, tracers; kernel_size = p_kernel_size(grid), kernel_offsets = p_kernel_offsets(grid)) =  
         launch!(arch, grid, kernel_size, _update_hydrostatic_pressure!, pHY′, kernel_offsets, grid, buoyancy, tracers)
 
 using Oceananigans.Grids: topology

From 97ae668ebd8488782f3c524b81feca2ac59af6dc Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Mon, 6 Mar 2023 09:29:33 -0500
Subject: [PATCH 049/530] fix indentation of launch! call

---
 src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
index 0a512f1926..170197d02f 100644
--- a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
+++ b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
@@ -31,7 +31,7 @@ update_hydrostatic_pressure!(pHY′, arch, ibg::PCBIBG, buoyancy, tracers; kerne
     update_hydrostatic_pressure!(pHY′, arch, ibg.underlying_grid, buoyancy, tracers; kernel_size, kernel_offsets)
 
 update_hydrostatic_pressure!(pHY′, arch, grid, buoyancy, tracers; kernel_size = p_kernel_size(grid), kernel_offsets = p_kernel_offsets(grid)) =  
-        launch!(arch, grid, kernel_size, _update_hydrostatic_pressure!, pHY′, kernel_offsets, grid, buoyancy, tracers)
+    launch!(arch, grid, kernel_size, _update_hydrostatic_pressure!, pHY′, kernel_offsets, grid, buoyancy, tracers)
 
 using Oceananigans.Grids: topology
 

From 76525fa6cb88b499d565bf4e3af1db09b7658a1e Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Mon, 6 Mar 2023 14:42:05 -0500
Subject: [PATCH 050/530] update dependencies

---
 Manifest.toml | 132 +++++++++++++++++++++++++-------------------------
 1 file changed, 66 insertions(+), 66 deletions(-)

diff --git a/Manifest.toml b/Manifest.toml
index bf5b0c25ae..8a280b21f2 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -1,6 +1,6 @@
 # This file is machine-generated - editing it directly is not advised
 
-julia_version = "1.8.5"
+julia_version = "1.8.0"
 manifest_format = "2.0"
 project_hash = "e5c066cd371cc92d479d4d0c34bc89f3323ab6b3"
 
@@ -25,10 +25,10 @@ uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
 version = "1.2.1"
 
 [[deps.Adapt]]
-deps = ["LinearAlgebra"]
-git-tree-sha1 = "0310e08cb19f5da31d08341c6120c047598f5b9c"
+deps = ["LinearAlgebra", "Requires"]
+git-tree-sha1 = "cc37d689f599e8df4f464b2fa3870ff7db7492ef"
 uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
-version = "3.5.0"
+version = "3.6.1"
 
 [[deps.AlgebraicMultigrid]]
 deps = ["CommonSolve", "LinearAlgebra", "Printf", "Reexport", "SparseArrays"]
@@ -41,16 +41,10 @@ uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
 version = "1.1.1"
 
 [[deps.ArrayInterface]]
-deps = ["ArrayInterfaceCore", "Compat", "IfElse", "LinearAlgebra", "SnoopPrecompile", "Static"]
-git-tree-sha1 = "dedc16cbdd1d32bead4617d27572f582216ccf23"
+deps = ["Adapt", "LinearAlgebra", "Requires", "SnoopPrecompile", "SparseArrays", "SuiteSparse"]
+git-tree-sha1 = "a89acc90c551067cd84119ff018619a1a76c6277"
 uuid = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
-version = "6.0.25"
-
-[[deps.ArrayInterfaceCore]]
-deps = ["LinearAlgebra", "SnoopPrecompile", "SparseArrays", "SuiteSparse"]
-git-tree-sha1 = "e5f08b5689b1aad068e01751889f2f615c7db36d"
-uuid = "30b0a656-2188-435a-8636-2ec0e6a096e2"
-version = "0.1.29"
+version = "7.2.1"
 
 [[deps.Artifacts]]
 uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
@@ -82,8 +76,8 @@ uuid = "179af706-886a-5703-950a-314cd64e0468"
 version = "0.1.2"
 
 [[deps.CUDA]]
-deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Preferences", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions"]
-git-tree-sha1 = "666924b0caa3c8fd067de83b4aefc4b51d0b568f"
+deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Preferences", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "UnsafeAtomicsLLVM"]
+git-tree-sha1 = "f659a5cac9fec5f47d4f62baa6f441e3d57b23c1"
 repo-rev = "vc/ka_transition"
 repo-url = "https://github.com/JuliaGPU/CUDA.jl.git"
 uuid = "052768ef-5323-5732-b1bb-66c8b64840ba"
@@ -115,9 +109,9 @@ version = "1.15.7"
 
 [[deps.ChangesOfVariables]]
 deps = ["ChainRulesCore", "LinearAlgebra", "Test"]
-git-tree-sha1 = "844b061c104c408b24537482469400af6075aae4"
+git-tree-sha1 = "485193efd2176b88e6622a39a246f8c5b600e74e"
 uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
-version = "0.1.5"
+version = "0.1.6"
 
 [[deps.CommonSolve]]
 git-tree-sha1 = "9441451ee712d1aec22edad62db1a9af3dc8d852"
@@ -126,14 +120,14 @@ version = "0.2.3"
 
 [[deps.Compat]]
 deps = ["Dates", "LinearAlgebra", "UUIDs"]
-git-tree-sha1 = "61fdd77467a5c3ad071ef8277ac6bd6af7dd4c04"
+git-tree-sha1 = "7a60c856b9fa189eb34f5f8a6f6b5529b7942957"
 uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
-version = "4.6.0"
+version = "4.6.1"
 
 [[deps.CompilerSupportLibraries_jll]]
 deps = ["Artifacts", "Libdl"]
 uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
-version = "1.0.1+0"
+version = "0.5.2+0"
 
 [[deps.Crayons]]
 git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15"
@@ -168,9 +162,9 @@ uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
 
 [[deps.DiffRules]]
 deps = ["IrrationalConstants", "LogExpFunctions", "NaNMath", "Random", "SpecialFunctions"]
-git-tree-sha1 = "c5b6685d53f933c11404a3ae9822afe30d522494"
+git-tree-sha1 = "a4ad7ef19d2cdc2eff57abbbe68032b1cd0bd8f8"
 uuid = "b552c78f-8df3-52c6-915a-8e097449b14b"
-version = "1.12.2"
+version = "1.13.0"
 
 [[deps.Distributed]]
 deps = ["Random", "Serialization", "Sockets"]
@@ -199,9 +193,9 @@ version = "0.1.8"
 
 [[deps.FFTW]]
 deps = ["AbstractFFTs", "FFTW_jll", "LinearAlgebra", "MKL_jll", "Preferences", "Reexport"]
-git-tree-sha1 = "90630efff0894f8142308e334473eba54c433549"
+git-tree-sha1 = "f9818144ce7c8c41edf5c4c179c684d92aa4d9fe"
 uuid = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
-version = "1.5.0"
+version = "1.6.0"
 
 [[deps.FFTW_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
@@ -220,15 +214,15 @@ uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"
 
 [[deps.GPUArrays]]
 deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"]
-git-tree-sha1 = "4dfaff044eb2ce11a897fecd85538310e60b91e6"
+git-tree-sha1 = "a28f752ffab0ccd6660fc7af5ad1c9ad176f45f7"
 uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
-version = "8.6.2"
+version = "8.6.3"
 
 [[deps.GPUArraysCore]]
 deps = ["Adapt"]
-git-tree-sha1 = "57f7cde02d7a53c9d1d28443b9f11ac5fbe7ebc9"
+git-tree-sha1 = "1cd7f0af1aa58abc02ea1d872953a97359cb87fa"
 uuid = "46192b85-c4d5-4398-a991-12ede77f4527"
-version = "0.1.3"
+version = "0.1.4"
 
 [[deps.GPUCompiler]]
 deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"]
@@ -275,9 +269,9 @@ uuid = "3587e190-3f89-42d0-90ee-14403ec27112"
 version = "0.1.8"
 
 [[deps.IrrationalConstants]]
-git-tree-sha1 = "7fd44fd4ff43fc60815f8e764c0f352b83c49151"
+git-tree-sha1 = "630b497eafcc20001bba38a4651b327dcfc491d2"
 uuid = "92d709cd-6900-40b7-9082-c6be49f344b6"
-version = "0.1.1"
+version = "0.2.2"
 
 [[deps.IterativeSolvers]]
 deps = ["LinearAlgebra", "Printf", "Random", "RecipesBase", "SparseArrays"]
@@ -291,10 +285,10 @@ uuid = "82899510-4779-5014-852e-03e436cf321d"
 version = "1.0.0"
 
 [[deps.JLD2]]
-deps = ["FileIO", "MacroTools", "Mmap", "OrderedCollections", "Pkg", "Printf", "Reexport", "TranscodingStreams", "UUIDs"]
-git-tree-sha1 = "c3244ef42b7d4508c638339df1bdbf4353e144db"
+deps = ["FileIO", "MacroTools", "Mmap", "OrderedCollections", "Pkg", "Printf", "Reexport", "Requires", "TranscodingStreams", "UUIDs"]
+git-tree-sha1 = "42c17b18ced77ff0be65957a591d34f4ed57c631"
 uuid = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
-version = "0.4.30"
+version = "0.4.31"
 
 [[deps.JLLWrappers]]
 deps = ["Preferences"]
@@ -315,9 +309,9 @@ uuid = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
 version = "1.12.0"
 
 [[deps.KernelAbstractions]]
-deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"]
-git-tree-sha1 = "9536f1c772a6649ae2024504086e3b932acdfab7"
-repo-rev = "vc/nix_dependencies"
+deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "SnoopPrecompile", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"]
+git-tree-sha1 = "a2fc41047b3dbeb9bbfc4c3a39ef1aaa83c35f1c"
+repo-rev = "vc/device_to_backend"
 repo-url = "https://github.com/JuliaGPU/KernelAbstractions.jl.git"
 uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 version = "0.9.0"
@@ -330,9 +324,9 @@ version = "4.16.0"
 
 [[deps.LLVMExtra_jll]]
 deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg", "TOML"]
-git-tree-sha1 = "771bfe376249626d3ca12bcd58ba243d3f961576"
+git-tree-sha1 = "7718cf44439c676bc0ec66a87099f41015a522d6"
 uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
-version = "0.0.16+0"
+version = "0.0.16+2"
 
 [[deps.LazyArtifacts]]
 deps = ["Artifacts", "Pkg"]
@@ -372,9 +366,9 @@ uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 
 [[deps.LogExpFunctions]]
 deps = ["ChainRulesCore", "ChangesOfVariables", "DocStringExtensions", "InverseFunctions", "IrrationalConstants", "LinearAlgebra"]
-git-tree-sha1 = "680e733c3a0a9cea9e935c8c2184aea6a63fa0b5"
+git-tree-sha1 = "0a1b7c2863e44523180fdb3146534e265a91870b"
 uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
-version = "0.3.21"
+version = "0.3.23"
 
 [[deps.Logging]]
 uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
@@ -392,10 +386,10 @@ uuid = "da04e1cc-30fd-572f-bb4f-1f8673147195"
 version = "0.20.8"
 
 [[deps.MPICH_jll]]
-deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "MPIPreferences", "Pkg", "TOML"]
-git-tree-sha1 = "6d4fa43afab4611d090b11617ecea1a144b21d35"
+deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "MPIPreferences", "TOML"]
+git-tree-sha1 = "7ec808cad4f3940316c015cb16608e4e632c2c89"
 uuid = "7cb0a576-ebde-5e09-9194-50597f1243b4"
-version = "4.0.2+5"
+version = "4.1.0+1"
 
 [[deps.MPIPreferences]]
 deps = ["Libdl", "Preferences"]
@@ -439,15 +433,15 @@ version = "2022.2.1"
 
 [[deps.NCDatasets]]
 deps = ["CFTime", "DataStructures", "Dates", "NetCDF_jll", "NetworkOptions", "Printf"]
-git-tree-sha1 = "d3e32d2b0530d929a047ceab8799ae4204de6c88"
+git-tree-sha1 = "fe130b7201b7fd908d950076dbfc0671270894c5"
 uuid = "85f8d34a-cbdd-5861-8df4-14fed0d494ab"
-version = "0.12.12"
+version = "0.12.13"
 
 [[deps.NaNMath]]
 deps = ["OpenLibm_jll"]
-git-tree-sha1 = "a7c3d1da1189a1c2fe843a3bfa04d18d20eb3211"
+git-tree-sha1 = "0877504529a3e5c3343c6f8b4c0381e57e4387e4"
 uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3"
-version = "1.0.1"
+version = "1.0.2"
 
 [[deps.NetCDF_jll]]
 deps = ["Artifacts", "HDF5_jll", "JLLWrappers", "LibCURL_jll", "Libdl", "Pkg", "XML2_jll", "Zlib_jll"]
@@ -476,10 +470,10 @@ uuid = "05823500-19ac-5b8b-9628-191a04bc5112"
 version = "0.8.1+0"
 
 [[deps.OpenMPI_jll]]
-deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "MPIPreferences", "Pkg", "TOML"]
-git-tree-sha1 = "346d6b357a480300ed7854dbc70e746ac52e10fd"
+deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "MPIPreferences", "TOML"]
+git-tree-sha1 = "f3080f4212a8ba2ceb10a34b938601b862094314"
 uuid = "fe0851c0-eecd-5654-98d4-656369965a5c"
-version = "4.1.3+3"
+version = "4.1.5+0"
 
 [[deps.OpenSSL_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
@@ -500,15 +494,15 @@ version = "1.4.1"
 
 [[deps.Parsers]]
 deps = ["Dates", "SnoopPrecompile"]
-git-tree-sha1 = "6f4fbcd1ad45905a5dee3f4256fabb49aa2110c6"
+git-tree-sha1 = "478ac6c952fddd4399e71d4779797c538d0ff2bf"
 uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
-version = "2.5.7"
+version = "2.5.8"
 
 [[deps.PencilArrays]]
-deps = ["Adapt", "ArrayInterface", "JSON3", "LinearAlgebra", "MPI", "OffsetArrays", "Random", "Reexport", "Requires", "StaticArrays", "StaticPermutations", "Strided", "TimerOutputs", "VersionParsing"]
-git-tree-sha1 = "47034fbd0b4aff6cd81e7c078c8ffa6b6cb5280c"
+deps = ["Adapt", "JSON3", "LinearAlgebra", "MPI", "OffsetArrays", "Random", "Reexport", "Requires", "StaticArrayInterface", "StaticArrays", "StaticPermutations", "Strided", "TimerOutputs", "VersionParsing"]
+git-tree-sha1 = "0c6ebb4777158b8662288fb4fca255e404adc94b"
 uuid = "0e08944d-e94e-41b1-9406-dcf66b6a9d2e"
-version = "0.17.9"
+version = "0.17.10"
 
 [[deps.PencilFFTs]]
 deps = ["AbstractFFTs", "FFTW", "LinearAlgebra", "MPI", "PencilArrays", "Reexport", "TimerOutputs"]
@@ -588,18 +582,18 @@ version = "1.3.0"
 
 [[deps.Rotations]]
 deps = ["LinearAlgebra", "Quaternions", "Random", "StaticArrays", "Statistics"]
-git-tree-sha1 = "9480500060044fd25a1c341da53f34df7443c2f2"
+git-tree-sha1 = "72a6abdcd088764878b473102df7c09bbc0548de"
 uuid = "6038ab10-8711-5258-84ad-4b1120ba62dc"
-version = "1.3.4"
+version = "1.4.0"
 
 [[deps.SHA]]
 uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
 version = "0.7.0"
 
 [[deps.SeawaterPolynomials]]
-git-tree-sha1 = "3e4e6c809e96ddcc0077bdb6944a1abb53fc382b"
+git-tree-sha1 = "20e6926c620cedee2b7551b61169dd118b4e34f2"
 uuid = "d496a93d-167e-4197-9f49-d3af4ff8fe40"
-version = "0.3.0"
+version = "0.3.1"
 
 [[deps.Serialization]]
 uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
@@ -619,21 +613,27 @@ uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 
 [[deps.SpecialFunctions]]
 deps = ["ChainRulesCore", "IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"]
-git-tree-sha1 = "d75bda01f8c31ebb72df80a46c88b25d1c79c56d"
+git-tree-sha1 = "ef28127915f4229c971eb43f3fc075dd3fe91880"
 uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
-version = "2.1.7"
+version = "2.2.0"
 
 [[deps.Static]]
 deps = ["IfElse"]
-git-tree-sha1 = "c35b107b61e7f34fa3f124026f2a9be97dea9e1c"
+git-tree-sha1 = "d0435ba43ab5ad1cbb5f0d286ca4ba67029ed3ee"
 uuid = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
-version = "0.8.3"
+version = "0.8.4"
+
+[[deps.StaticArrayInterface]]
+deps = ["ArrayInterface", "Compat", "IfElse", "LinearAlgebra", "Requires", "SnoopPrecompile", "SparseArrays", "Static", "SuiteSparse"]
+git-tree-sha1 = "fd5f417fd7e103c121b0a0b4a6902f03991111f4"
+uuid = "0d7ed370-da01-4f52-bd93-41d350b8b718"
+version = "1.3.0"
 
 [[deps.StaticArrays]]
 deps = ["LinearAlgebra", "Random", "StaticArraysCore", "Statistics"]
-git-tree-sha1 = "67d3e75e8af8089ea34ce96974d5468d4a008ca6"
+git-tree-sha1 = "2d7d9e1ddadc8407ffd460e24218e37ef52dd9a3"
 uuid = "90137ffa-7385-5640-81b9-e52037218182"
-version = "1.5.15"
+version = "1.5.16"
 
 [[deps.StaticArraysCore]]
 git-tree-sha1 = "6b7ba252635a5eff6a0b0664a41ee140a1c9e72a"
@@ -691,7 +691,7 @@ version = "1.10.0"
 [[deps.Tar]]
 deps = ["ArgTools", "SHA"]
 uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
-version = "1.10.1"
+version = "1.10.0"
 
 [[deps.TaylorSeries]]
 deps = ["LinearAlgebra", "Markdown", "Requires", "SparseArrays"]

From 4a176248b37925d6dd71e9dbb7ab863aa3b31cc1 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Mon, 6 Mar 2023 15:26:58 -0500
Subject: [PATCH 051/530] CUDADevice -> CUDABackend

---
 src/Architectures.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Architectures.jl b/src/Architectures.jl
index ef2a5a59e3..2b4d6f726f 100644
--- a/src/Architectures.jl
+++ b/src/Architectures.jl
@@ -37,7 +37,7 @@ struct GPU <: AbstractArchitecture end
 #####
 
 device(::CPU) = KernelAbstractions.CPU()
-device(::GPU) = CUDAKernels.CUDADevice(;always_inline=true)
+device(::GPU) = CUDAKernels.CUDABackend(;always_inline=true)
 
 architecture() = nothing
 architecture(::Number) = nothing

From fb7c656efe2f65d3efdc5efc5434aeb91d769e5a Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Tue, 7 Mar 2023 12:02:27 -0500
Subject: [PATCH 052/530] compute tendencies in update state

---
 .../calculate_hydrostatic_free_surface_tendencies.jl |  6 +++---
 .../update_hydrostatic_free_surface_model_state.jl   |  7 +++++--
 .../calculate_nonhydrostatic_tendencies.jl           |  6 +++---
 .../update_nonhydrostatic_model_state.jl             |  4 +++-
 .../calculate_shallow_water_tendencies.jl            |  6 +++---
 .../ShallowWaterModels/update_shallow_water_state.jl |  5 ++++-
 src/TimeSteppers/TimeSteppers.jl                     |  2 +-
 src/TimeSteppers/quasi_adams_bashforth_2.jl          |  6 +-----
 src/TimeSteppers/runge_kutta_3.jl                    | 12 +++---------
 9 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index b19fae5f6a..5b415b5253 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -1,4 +1,4 @@
-import Oceananigans.TimeSteppers: calculate_tendencies!
+import Oceananigans.TimeSteppers: compute_tendencies!
 import Oceananigans: tracer_tendency_kernel_function
 
 using Oceananigans: fields, prognostic_fields, TimeStepCallsite, TendencyCallsite, UpdateStateCallsite
@@ -12,12 +12,12 @@ import Oceananigans.Distributed: interior_tendency_kernel_size, interior_tendenc
 using Oceananigans.ImmersedBoundaries: use_only_active_cells, ActiveCellsIBG, active_linear_index_to_ntuple
 
 """
-    calculate_tendencies!(model::HydrostaticFreeSurfaceModel, callbacks)
+    compute_tendencies!(model::HydrostaticFreeSurfaceModel, callbacks)
 
 Calculate the interior and boundary contributions to tendency terms without the
 contribution from non-hydrostatic pressure.
 """
-function calculate_tendencies!(model::HydrostaticFreeSurfaceModel, callbacks)
+function compute_tendencies!(model::HydrostaticFreeSurfaceModel, callbacks)
 
     # Calculate contributions to momentum and tracer tendencies from fluxes and volume terms in the
     # interior of the domain
diff --git a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
index b2093ac7cb..f94c7e6baf 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
@@ -18,9 +18,10 @@ Update peripheral aspects of the model (auxiliary fields, halo regions, diffusiv
 hydrostatic pressure) to the current model state. If `callbacks` are provided (in an array),
 they are called in the end.
 """
-update_state!(model::HydrostaticFreeSurfaceModel, callbacks=[]) = update_state!(model, model.grid, callbacks)
+update_state!(model::HydrostaticFreeSurfaceModel, callbacks=[]; compute_tendencies = true) =
+         update_state!(model, model.grid, callbacks; compute_tendencies)
 
-function update_state!(model::HydrostaticFreeSurfaceModel, grid, callbacks)
+function update_state!(model::HydrostaticFreeSurfaceModel, grid, callbacks; compute_tendencies = true)
 
     @apply_regionally masking_immersed_model_fields!(model, grid)
 
@@ -30,6 +31,8 @@ function update_state!(model::HydrostaticFreeSurfaceModel, grid, callbacks)
 
     [callback(model) for callback in callbacks if isa(callback.callsite, UpdateStateCallsite)]
     
+    compute_tendencies && compute_tendencies!(model, callbacks)
+
     return nothing
 end
 
diff --git a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
index c2d7bfab1e..e2498534aa 100644
--- a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
+++ b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
@@ -1,4 +1,4 @@
-import Oceananigans.TimeSteppers: calculate_tendencies!
+import Oceananigans.TimeSteppers: compute_tendencies!
 
 using Oceananigans: fields, TimeStepCallsite, TendencyCallsite, UpdateStateCallsite
 using Oceananigans.Utils: work_layout
@@ -6,12 +6,12 @@ using Oceananigans.Utils: work_layout
 using Oceananigans.ImmersedBoundaries: use_only_active_cells, ActiveCellsIBG, active_linear_index_to_ntuple
 
 """
-    calculate_tendencies!(model::NonhydrostaticModel)
+    compute_tendencies!(model::NonhydrostaticModel)
 
 Calculate the interior and boundary contributions to tendency terms without the
 contribution from non-hydrostatic pressure.
 """
-function calculate_tendencies!(model::NonhydrostaticModel, callbacks)
+function compute_tendencies!(model::NonhydrostaticModel, callbacks)
 
     # Note:
     #
diff --git a/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl b/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
index 8b0d1a457b..946f3bb637 100644
--- a/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
+++ b/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
@@ -13,7 +13,7 @@ Update peripheral aspects of the model (halo regions, diffusivities, hydrostatic
 pressure) to the current model state. If `callbacks` are provided (in an array),
 they are called in the end.
 """
-function update_state!(model::NonhydrostaticModel, callbacks=[])
+function update_state!(model::NonhydrostaticModel, callbacks=[]; compute_tendencies = true)
     
     # Mask immersed tracers
     foreach(mask_immersed_field!, model.tracers)
@@ -35,5 +35,7 @@ function update_state!(model::NonhydrostaticModel, callbacks=[])
 
     [callback(model) for callback in callbacks if isa(callback.callsite, UpdateStateCallsite)]
 
+    compute_tendencies && compute_tendencies!(model, callbacks)
+
     return nothing
 end
diff --git a/src/Models/ShallowWaterModels/calculate_shallow_water_tendencies.jl b/src/Models/ShallowWaterModels/calculate_shallow_water_tendencies.jl
index 72087fff48..fdb02e33fe 100644
--- a/src/Models/ShallowWaterModels/calculate_shallow_water_tendencies.jl
+++ b/src/Models/ShallowWaterModels/calculate_shallow_water_tendencies.jl
@@ -1,4 +1,4 @@
-import Oceananigans.TimeSteppers: calculate_tendencies!
+import Oceananigans.TimeSteppers: compute_tendencies!
 
 using Oceananigans.Utils: work_layout
 using Oceananigans: fields, TimeStepCallsite, TendencyCallsite, UpdateStateCallsite
@@ -10,12 +10,12 @@ using Oceananigans.BoundaryConditions
 
 
 """
-    calculate_tendencies!(model::ShallowWaterModel)
+    compute_tendencies!(model::ShallowWaterModel)
 
 Calculate the interior and boundary contributions to tendency terms without the
 contribution from non-hydrostatic pressure.
 """
-function calculate_tendencies!(model::ShallowWaterModel, callbacks)
+function compute_tendencies!(model::ShallowWaterModel, callbacks)
 
     # Note:
     #
diff --git a/src/Models/ShallowWaterModels/update_shallow_water_state.jl b/src/Models/ShallowWaterModels/update_shallow_water_state.jl
index f9e12161a5..25028b2cd9 100644
--- a/src/Models/ShallowWaterModels/update_shallow_water_state.jl
+++ b/src/Models/ShallowWaterModels/update_shallow_water_state.jl
@@ -9,7 +9,7 @@ import Oceananigans.TimeSteppers: update_state!
 Fill halo regions for `model.solution` and `model.tracers`.
 If `callbacks` are provided (in an array), they are called in the end.
 """
-function update_state!(model::ShallowWaterModel, callbacks=[])
+function update_state!(model::ShallowWaterModel, callbacks=[]; compute_tendencies = true)
 
     # Mask immersed fields
     foreach(mask_immersed_field!, model.solution)
@@ -30,6 +30,9 @@ function update_state!(model::ShallowWaterModel, callbacks=[])
             callback(model)
         end
     end
+
+    compute_tendencies && compute_tendencies!(model, callbacks)
+
     return nothing
 end
 
diff --git a/src/TimeSteppers/TimeSteppers.jl b/src/TimeSteppers/TimeSteppers.jl
index 9d77581b13..9126d2dfc5 100644
--- a/src/TimeSteppers/TimeSteppers.jl
+++ b/src/TimeSteppers/TimeSteppers.jl
@@ -43,7 +43,7 @@ end
 TimeStepper(stepper::AbstractTimeStepper, args...; kwargs...) = stepper
 
 function update_state! end
-function calculate_tendencies! end
+function compute_tendencies! end
 
 calculate_pressure_correction!(model, Δt) = nothing
 pressure_correct_velocities!(model, Δt) = nothing
diff --git a/src/TimeSteppers/quasi_adams_bashforth_2.jl b/src/TimeSteppers/quasi_adams_bashforth_2.jl
index 27b0e0066f..9d6f5b8ab0 100644
--- a/src/TimeSteppers/quasi_adams_bashforth_2.jl
+++ b/src/TimeSteppers/quasi_adams_bashforth_2.jl
@@ -85,18 +85,14 @@ function time_step!(model::AbstractModel{<:QuasiAdamsBashforth2TimeStepper}, Δt
 
     model.timestepper.previous_Δt = Δt
 
-    # Be paranoid and update state at iteration 0
-    model.clock.iteration == 0 && update_state!(model, callbacks)
+    update_state!(model, callbacks)
 
-    @apply_regionally calculate_tendencies!(model, callbacks)
-    
     ab2_step!(model, Δt, χ) # full step for tracers, fractional step for velocities.
     calculate_pressure_correction!(model, Δt)
 
     @apply_regionally correct_velocities_and_store_tendecies!(model, Δt)
 
     tick!(model.clock, Δt)
-    update_state!(model, callbacks)
     update_particle_properties!(model, Δt)
 
     return nothing
diff --git a/src/TimeSteppers/runge_kutta_3.jl b/src/TimeSteppers/runge_kutta_3.jl
index 236cb7202c..aa9a863657 100644
--- a/src/TimeSteppers/runge_kutta_3.jl
+++ b/src/TimeSteppers/runge_kutta_3.jl
@@ -81,9 +81,6 @@ stage.
 function time_step!(model::AbstractModel{<:RungeKutta3TimeStepper}, Δt; callbacks=[])
     Δt == 0 && @warn "Δt == 0 may cause model blowup!"
 
-    # Be paranoid and update state at iteration 0, in case run! is not used:
-    model.clock.iteration == 0 && update_state!(model, callbacks)
-
     γ¹ = model.timestepper.γ¹
     γ² = model.timestepper.γ²
     γ³ = model.timestepper.γ³
@@ -99,7 +96,7 @@ function time_step!(model::AbstractModel{<:RungeKutta3TimeStepper}, Δt; callbac
     # First stage
     #
 
-    calculate_tendencies!(model, callbacks)
+    update_state!(model, callbacks)
 
     correct_immersed_tendencies!(model, Δt, γ¹, 0)
 
@@ -110,14 +107,13 @@ function time_step!(model::AbstractModel{<:RungeKutta3TimeStepper}, Δt; callbac
 
     tick!(model.clock, first_stage_Δt; stage=true)
     store_tendencies!(model)
-    update_state!(model, callbacks)
     update_particle_properties!(model, first_stage_Δt)
 
     #
     # Second stage
     #
 
-    calculate_tendencies!(model, callbacks)
+    update_state!(model, callbacks)
 
     correct_immersed_tendencies!(model, Δt, γ², ζ²)
 
@@ -128,14 +124,13 @@ function time_step!(model::AbstractModel{<:RungeKutta3TimeStepper}, Δt; callbac
 
     tick!(model.clock, second_stage_Δt; stage=true)
     store_tendencies!(model)
-    update_state!(model, callbacks)
     update_particle_properties!(model, second_stage_Δt)
 
     #
     # Third stage
     #
 
-    calculate_tendencies!(model, callbacks)
+    update_state!(model, callbacks)
     
     correct_immersed_tendencies!(model, Δt, γ³, ζ³)
 
@@ -145,7 +140,6 @@ function time_step!(model::AbstractModel{<:RungeKutta3TimeStepper}, Δt; callbac
     pressure_correct_velocities!(model, third_stage_Δt)
 
     tick!(model.clock, third_stage_Δt)
-    update_state!(model, callbacks)
     update_particle_properties!(model, third_stage_Δt)
 
     return nothing

From 28fbd9d8046369e4c8a05bb31624f756a6a42bd7 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Tue, 7 Mar 2023 12:10:48 -0500
Subject: [PATCH 053/530] apply regionally update state

---
 .../update_hydrostatic_free_surface_model_state.jl             | 3 ++-
 .../NonhydrostaticModels/update_nonhydrostatic_model_state.jl  | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
index f94c7e6baf..5a301b8e5a 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
@@ -31,7 +31,8 @@ function update_state!(model::HydrostaticFreeSurfaceModel, grid, callbacks; comp
 
     [callback(model) for callback in callbacks if isa(callback.callsite, UpdateStateCallsite)]
     
-    compute_tendencies && compute_tendencies!(model, callbacks)
+    compute_tendencies && 
+        @apply_regionally compute_tendencies!(model, callbacks)
 
     return nothing
 end
diff --git a/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl b/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
index 946f3bb637..21b38af7fe 100644
--- a/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
+++ b/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
@@ -35,7 +35,8 @@ function update_state!(model::NonhydrostaticModel, callbacks=[]; compute_tendenc
 
     [callback(model) for callback in callbacks if isa(callback.callsite, UpdateStateCallsite)]
 
-    compute_tendencies && compute_tendencies!(model, callbacks)
+    compute_tendencies && 
+        @apply_regionally compute_tendencies!(model, callbacks)
 
     return nothing
 end

From 41d3cffcfcc5b8cb5fb2252d93e5b9fd8664dda4 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Tue, 7 Mar 2023 12:12:40 -0500
Subject: [PATCH 054/530] update state in right place

---
 .../correct_immersed_tendencies.jl            |  7 -------
 src/TimeSteppers/quasi_adams_bashforth_2.jl   |  6 ++++--
 src/TimeSteppers/runge_kutta_3.jl             | 19 ++++++-------------
 3 files changed, 10 insertions(+), 22 deletions(-)
 delete mode 100644 src/TimeSteppers/correct_immersed_tendencies.jl

diff --git a/src/TimeSteppers/correct_immersed_tendencies.jl b/src/TimeSteppers/correct_immersed_tendencies.jl
deleted file mode 100644
index 8cc15f52f5..0000000000
--- a/src/TimeSteppers/correct_immersed_tendencies.jl
+++ /dev/null
@@ -1,7 +0,0 @@
-"""
-    correct_immersed_tendencies!(model, Δt, γⁿ, ζⁿ)
-
-Change the tendency fields to account for the presence of a boundary immersed
-within the `model` grid. Does nothing by default.
-"""
-correct_immersed_tendencies!(model, Δt, γⁿ, ζⁿ) = nothing # fallback function 
diff --git a/src/TimeSteppers/quasi_adams_bashforth_2.jl b/src/TimeSteppers/quasi_adams_bashforth_2.jl
index 9d6f5b8ab0..5f8db28f9e 100644
--- a/src/TimeSteppers/quasi_adams_bashforth_2.jl
+++ b/src/TimeSteppers/quasi_adams_bashforth_2.jl
@@ -85,14 +85,16 @@ function time_step!(model::AbstractModel{<:QuasiAdamsBashforth2TimeStepper}, Δt
 
     model.timestepper.previous_Δt = Δt
 
-    update_state!(model, callbacks)
-
+    # Be paranoid and update state at iteration 0
+    model.clock.iteration == 0 && update_state!(model, callbacks)
+    
     ab2_step!(model, Δt, χ) # full step for tracers, fractional step for velocities.
     calculate_pressure_correction!(model, Δt)
 
     @apply_regionally correct_velocities_and_store_tendecies!(model, Δt)
 
     tick!(model.clock, Δt)
+    update_state!(model, callbacks)
     update_particle_properties!(model, Δt)
 
     return nothing
diff --git a/src/TimeSteppers/runge_kutta_3.jl b/src/TimeSteppers/runge_kutta_3.jl
index aa9a863657..ff9947bbfd 100644
--- a/src/TimeSteppers/runge_kutta_3.jl
+++ b/src/TimeSteppers/runge_kutta_3.jl
@@ -81,6 +81,9 @@ stage.
 function time_step!(model::AbstractModel{<:RungeKutta3TimeStepper}, Δt; callbacks=[])
     Δt == 0 && @warn "Δt == 0 may cause model blowup!"
 
+    # Be paranoid and update state at iteration 0, in case run! is not used:
+    model.clock.iteration == 0 && update_state!(model, callbacks)
+
     γ¹ = model.timestepper.γ¹
     γ² = model.timestepper.γ²
     γ³ = model.timestepper.γ³
@@ -96,10 +99,6 @@ function time_step!(model::AbstractModel{<:RungeKutta3TimeStepper}, Δt; callbac
     # First stage
     #
 
-    update_state!(model, callbacks)
-
-    correct_immersed_tendencies!(model, Δt, γ¹, 0)
-
     rk3_substep!(model, Δt, γ¹, nothing)
 
     calculate_pressure_correction!(model, first_stage_Δt)
@@ -107,16 +106,13 @@ function time_step!(model::AbstractModel{<:RungeKutta3TimeStepper}, Δt; callbac
 
     tick!(model.clock, first_stage_Δt; stage=true)
     store_tendencies!(model)
+    update_state!(model, callbacks)
     update_particle_properties!(model, first_stage_Δt)
 
     #
     # Second stage
     #
 
-    update_state!(model, callbacks)
-
-    correct_immersed_tendencies!(model, Δt, γ², ζ²)
-
     rk3_substep!(model, Δt, γ², ζ²)
 
     calculate_pressure_correction!(model, second_stage_Δt)
@@ -124,22 +120,19 @@ function time_step!(model::AbstractModel{<:RungeKutta3TimeStepper}, Δt; callbac
 
     tick!(model.clock, second_stage_Δt; stage=true)
     store_tendencies!(model)
+    update_state!(model, callbacks)
     update_particle_properties!(model, second_stage_Δt)
 
     #
     # Third stage
     #
-
-    update_state!(model, callbacks)
-    
-    correct_immersed_tendencies!(model, Δt, γ³, ζ³)
-
     rk3_substep!(model, Δt, γ³, ζ³)
 
     calculate_pressure_correction!(model, third_stage_Δt)
     pressure_correct_velocities!(model, third_stage_Δt)
 
     tick!(model.clock, third_stage_Δt)
+    update_state!(model, callbacks)
     update_particle_properties!(model, third_stage_Δt)
 
     return nothing

From 7db99b315bd8c681bbb5684d9941f74df558bc75 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Tue, 7 Mar 2023 12:18:32 -0500
Subject: [PATCH 055/530] compute_tendencies in time step

---
 src/TimeSteppers/TimeSteppers.jl            | 1 -
 src/TimeSteppers/quasi_adams_bashforth_2.jl | 7 ++++---
 src/TimeSteppers/runge_kutta_3.jl           | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/TimeSteppers/TimeSteppers.jl b/src/TimeSteppers/TimeSteppers.jl
index 9126d2dfc5..baecad8f6f 100644
--- a/src/TimeSteppers/TimeSteppers.jl
+++ b/src/TimeSteppers/TimeSteppers.jl
@@ -54,6 +54,5 @@ include("clock.jl")
 include("store_tendencies.jl")
 include("quasi_adams_bashforth_2.jl")
 include("runge_kutta_3.jl")
-include("correct_immersed_tendencies.jl")
 
 end # module
diff --git a/src/TimeSteppers/quasi_adams_bashforth_2.jl b/src/TimeSteppers/quasi_adams_bashforth_2.jl
index 5f8db28f9e..248da07f64 100644
--- a/src/TimeSteppers/quasi_adams_bashforth_2.jl
+++ b/src/TimeSteppers/quasi_adams_bashforth_2.jl
@@ -61,12 +61,13 @@ end
 #####
 
 """
-    time_step!(model::AbstractModel{<:QuasiAdamsBashforth2TimeStepper}, Δt; euler=false)
+    time_step!(model::AbstractModel{<:QuasiAdamsBashforth2TimeStepper}, Δt; euler=false, compute_tendencies=true)
 
 Step forward `model` one time step `Δt` with a 2nd-order Adams-Bashforth method and
 pressure-correction substep. Setting `euler=true` will take a forward Euler time step.
+Setting `compute_tendencies=false` will not calculate new tendencies
 """
-function time_step!(model::AbstractModel{<:QuasiAdamsBashforth2TimeStepper}, Δt; callbacks = [], euler=false)
+function time_step!(model::AbstractModel{<:QuasiAdamsBashforth2TimeStepper}, Δt; callbacks = [], euler=false, compute_tendencies = true)
     Δt == 0 && @warn "Δt == 0 may cause model blowup!"
 
     # Shenanigans for properly starting the AB2 loop with an Euler step
@@ -94,7 +95,7 @@ function time_step!(model::AbstractModel{<:QuasiAdamsBashforth2TimeStepper}, Δt
     @apply_regionally correct_velocities_and_store_tendecies!(model, Δt)
 
     tick!(model.clock, Δt)
-    update_state!(model, callbacks)
+    update_state!(model, callbacks; compute_tendencies)
     update_particle_properties!(model, Δt)
 
     return nothing
diff --git a/src/TimeSteppers/runge_kutta_3.jl b/src/TimeSteppers/runge_kutta_3.jl
index ff9947bbfd..8b91d7d0b9 100644
--- a/src/TimeSteppers/runge_kutta_3.jl
+++ b/src/TimeSteppers/runge_kutta_3.jl
@@ -78,7 +78,7 @@ The 3rd-order Runge-Kutta method takes three intermediate substep stages to
 achieve a single timestep. A pressure correction step is applied at each intermediate
 stage.
 """
-function time_step!(model::AbstractModel{<:RungeKutta3TimeStepper}, Δt; callbacks=[])
+function time_step!(model::AbstractModel{<:RungeKutta3TimeStepper}, Δt; callbacks=[], compute_tendencies = true)
     Δt == 0 && @warn "Δt == 0 may cause model blowup!"
 
     # Be paranoid and update state at iteration 0, in case run! is not used:
@@ -132,7 +132,7 @@ function time_step!(model::AbstractModel{<:RungeKutta3TimeStepper}, Δt; callbac
     pressure_correct_velocities!(model, third_stage_Δt)
 
     tick!(model.clock, third_stage_Δt)
-    update_state!(model, callbacks)
+    update_state!(model, callbacks; compute_tendencies)
     update_particle_properties!(model, third_stage_Δt)
 
     return nothing

From 6009b2aafaa1517e8a08855fce5b7ea5a5033b5b Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Tue, 7 Mar 2023 13:33:58 -0500
Subject: [PATCH 056/530] bugfix

---
 src/Models/NonhydrostaticModels/NonhydrostaticModels.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl b/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl
index 42e37cd8ce..dcbcb7ea53 100644
--- a/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl
+++ b/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl
@@ -7,7 +7,7 @@ using DocStringExtensions
 using KernelAbstractions: @index, @kernel
 using KernelAbstractions.Extras.LoopInfo: @unroll
 
-using Oceananigans.Utils: launch!
+using Oceananigans.Utils
 using Oceananigans.Grids
 using Oceananigans.Solvers
 using Oceananigans.Distributed: DistributedArch, DistributedFFTBasedPoissonSolver, reconstruct_global_grid   

From 3a6e8a121d5319decc2f8f4a308434180b8140df Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Sun, 12 Mar 2023 14:33:43 -0400
Subject: [PATCH 057/530] increase tag

---
 src/Distributed/halo_communication.jl | 30 +++++++++++++++++----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index f3f4f927b3..37cc76315c 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -46,32 +46,40 @@ opposite_side = Dict(
 # Define functions that return unique send and recv MPI tags for each side.
 # It's an integer where
 #   digit 1-2: an identifier for the field that is reset each timestep
+#   digit 3-4: an identifier for the field's location 
 #   digit 3: the side
 #   digits 4-6: the "from" rank
 #   digits 7-9: the "to" rank
 
 RANK_DIGITS = 3
-ID_DIGITS = 2
+ID_DIGITS   = 2
+LOC_DIGITS  = 3
+
+location_id(::Type{Center})  = 1
+location_id(::Type{Face})    = 2
+location_id(::Type{Nothing}) = 3
 
 for side in sides
     side_str = string(side)
     send_tag_fn_name = Symbol("$(side)_send_tag")
     recv_tag_fn_name = Symbol("$(side)_recv_tag")
     @eval begin
-        function $send_tag_fn_name(arch, local_rank, rank_to_send_to)
+        function $send_tag_fn_name(arch, local_rank, location, rank_to_send_to)
             field_id    = string(arch.mpi_tag[1], pad=ID_DIGITS)
+            loc_id      = string(location_id.(location), pad=LOC_DIGITS)
             from_digits = string(local_rank, pad=RANK_DIGITS)
             to_digits   = string(rank_to_send_to, pad=RANK_DIGITS)
             side_digit  = string(side_id[Symbol($side_str)])
-            return parse(Int, field_id * side_digit * from_digits * to_digits)
+            return parse(Int, field_id * loc_id * side_digit * from_digits * to_digits)
         end
 
-        function $recv_tag_fn_name(arch, local_rank, rank_to_recv_from)
+        function $recv_tag_fn_name(arch, local_rank, location, rank_to_recv_from)
             field_id    = string(arch.mpi_tag[1], pad=ID_DIGITS)
+            loc_id      = string(location_id.(location), pad=LOC_DIGITS)
             from_digits = string(rank_to_recv_from, pad=RANK_DIGITS)
             to_digits   = string(local_rank, pad=RANK_DIGITS)
             side_digit  = string(side_id[opposite_side[Symbol($side_str)]])
-            return parse(Int, field_id * side_digit * from_digits * to_digits)
+            return parse(Int, field_id * loc_id * side_digit * from_digits * to_digits)
         end
     end
 end
@@ -235,8 +243,8 @@ for (side, opposite_side, dir) in zip([:west, :south, :bottom], [:east, :north,
 
             sync_device!(child_arch)
 
-            recv_req = $recv_and_fill_opposite_side_halo!(c, grid, arch, loc[$dir], local_rank, bc_opposite_side.condition.to, buffers)
-            send_req = $send_opposite_side_halo(c, grid, arch, loc[$dir], local_rank, bc_opposite_side.condition.to, buffers)
+            recv_req = $recv_and_fill_opposite_side_halo!(c, grid, arch, loc[$dir], loc, local_rank, bc_opposite_side.condition.to, buffers)
+            send_req = $send_opposite_side_halo(c, grid, arch, loc[$dir], loc, local_rank, bc_opposite_side.condition.to, buffers)
 
             return [send_req, recv_req]
         end
@@ -255,9 +263,9 @@ for side in sides
     get_side_send_buffer = Symbol("get_$(side)_send_buffer")
 
     @eval begin
-        function $send_side_halo(c, grid, arch, side_location, local_rank, rank_to_send_to, buffers)
+        function $send_side_halo(c, grid, arch, side_location, location, local_rank, rank_to_send_to, buffers)
             send_buffer = $get_side_send_buffer(c, grid, side_location, buffers, arch)
-            send_tag = $side_send_tag(arch, local_rank, rank_to_send_to)
+            send_tag = $side_send_tag(arch, location, local_rank, rank_to_send_to)
 
             @debug "Sending " * $side_str * " halo: local_rank=$local_rank, rank_to_send_to=$rank_to_send_to, send_tag=$send_tag"
             send_req = MPI.Isend(send_buffer, rank_to_send_to, send_tag, arch.communicator)
@@ -282,9 +290,9 @@ for side in sides
     get_side_recv_buffer = Symbol("get_$(side)_recv_buffer")
 
     @eval begin
-        function $recv_and_fill_side_halo!(c, grid, arch, side_location, local_rank, rank_to_recv_from, buffers)
+        function $recv_and_fill_side_halo!(c, grid, arch, side_location, location, local_rank, rank_to_recv_from, buffers)
             recv_buffer = $get_side_recv_buffer(c, grid, side_location, buffers, arch)
-            recv_tag = $side_recv_tag(arch, local_rank, rank_to_recv_from)
+            recv_tag = $side_recv_tag(arch, location, local_rank, rank_to_recv_from)
 
             @debug "Receiving " * $side_str * " halo: local_rank=$local_rank, rank_to_recv_from=$rank_to_recv_from, recv_tag=$recv_tag"
             recv_req = MPI.Irecv!(recv_buffer, rank_to_recv_from, recv_tag, arch.communicator)

From 256288c3eb23e266e9454b76860a463fbb61440c Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Sun, 12 Mar 2023 14:45:41 -0400
Subject: [PATCH 058/530] bugfix

---
 src/Distributed/halo_communication.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 37cc76315c..383ca29789 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -226,8 +226,8 @@ for (side, opposite_side, dir) in zip([:west, :south, :bottom], [:east, :north,
 
             sync_device!(child_arch)
 
-            recv_req = $recv_and_fill_side_halo!(c, grid, arch, loc[$dir], local_rank, bc_side.condition.to, buffers)
-            send_req = $send_side_halo(c, grid, arch, loc[$dir], local_rank, bc_side.condition.to, buffers)
+            recv_req = $recv_and_fill_side_halo!(c, grid, arch, loc[$dir], loc, local_rank, bc_side.condition.to, buffers)
+            send_req = $send_side_halo(c, grid, arch, loc[$dir], loc, local_rank, bc_side.condition.to, buffers)
             
             return [send_req, recv_req]
         end

From f0acd0d729c31b008043ba7c56f407de8ae3e5e3 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Sun, 12 Mar 2023 14:58:54 -0400
Subject: [PATCH 059/530] bugfix

---
 src/Distributed/halo_communication.jl | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 383ca29789..5c4d9ba49b 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -55,9 +55,9 @@ RANK_DIGITS = 3
 ID_DIGITS   = 2
 LOC_DIGITS  = 3
 
-location_id(::Type{Center})  = 1
-location_id(::Type{Face})    = 2
-location_id(::Type{Nothing}) = 3
+location_id(::Center)  = 1
+location_id(::Face)    = 2
+location_id(::Nothing) = 3
 
 for side in sides
     side_str = string(side)
@@ -206,8 +206,8 @@ for (side, opposite_side, dir) in zip([:west, :south, :bottom], [:east, :north,
 
             sync_device!(child_architecture(arch))
 
-            recv_req1 = $recv_and_fill_side_halo!(c, grid, arch, loc[$dir], local_rank, bc_side.condition.to, buffers)
-            recv_req2 = $recv_and_fill_opposite_side_halo!(c, grid, arch, loc[$dir], local_rank, bc_opposite_side.condition.to, buffers)
+            recv_req1 = $recv_and_fill_side_halo!(c, grid, arch, loc[$dir], loc, local_rank, bc_side.condition.to, buffers)
+            recv_req2 = $recv_and_fill_opposite_side_halo!(c, grid, arch, loc[$dir], loc, local_rank, bc_opposite_side.condition.to, buffers)
 
             send_req1 = $send_side_halo(c, grid, arch, loc[$dir], local_rank, bc_side.condition.to, buffers)
             send_req2 = $send_opposite_side_halo(c, grid, arch, loc[$dir], local_rank, bc_opposite_side.condition.to, buffers)

From c2e05585eb9864476c9aaf82ce2194ebd3d0c7d0 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Sun, 12 Mar 2023 15:00:21 -0400
Subject: [PATCH 060/530] bugfixxin

---
 src/Distributed/halo_communication.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 5c4d9ba49b..d19af88b84 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -209,8 +209,8 @@ for (side, opposite_side, dir) in zip([:west, :south, :bottom], [:east, :north,
             recv_req1 = $recv_and_fill_side_halo!(c, grid, arch, loc[$dir], loc, local_rank, bc_side.condition.to, buffers)
             recv_req2 = $recv_and_fill_opposite_side_halo!(c, grid, arch, loc[$dir], loc, local_rank, bc_opposite_side.condition.to, buffers)
 
-            send_req1 = $send_side_halo(c, grid, arch, loc[$dir], local_rank, bc_side.condition.to, buffers)
-            send_req2 = $send_opposite_side_halo(c, grid, arch, loc[$dir], local_rank, bc_opposite_side.condition.to, buffers)
+            send_req1 = $send_side_halo(c, grid, arch, loc[$dir], loc, local_rank, bc_side.condition.to, buffers)
+            send_req2 = $send_opposite_side_halo(c, grid, arch, loc[$dir], loc, local_rank, bc_opposite_side.condition.to, buffers)
 
             return [send_req1, send_req2, recv_req1, recv_req2]
         end

From 814eedc438e0431ae64d6668c9ce22cbc81fd249 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Sun, 12 Mar 2023 15:19:19 -0400
Subject: [PATCH 061/530] testing new tags

---
 src/Distributed/halo_communication.jl | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index d19af88b84..5bdbffeec7 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -51,31 +51,32 @@ opposite_side = Dict(
 #   digits 4-6: the "from" rank
 #   digits 7-9: the "to" rank
 
-RANK_DIGITS = 3
+RANK_DIGITS = 2
 ID_DIGITS   = 2
-LOC_DIGITS  = 3
+LOC_DIGITS  = 2
 
-location_id(::Center)  = 1
-location_id(::Face)    = 2
-location_id(::Nothing) = 3
+@inline loc_id(::Nothing) = 0
+@inline loc_id(::Face)    = 1
+@inline loc_id(::Center)  = 2
+@inline location_id(X, Y, Z) = loc_id(X) + 3*loc_id(Y) + 9*loc_id(Z)
 
 for side in sides
     side_str = string(side)
     send_tag_fn_name = Symbol("$(side)_send_tag")
     recv_tag_fn_name = Symbol("$(side)_recv_tag")
     @eval begin
-        function $send_tag_fn_name(arch, local_rank, location, rank_to_send_to)
+        function $send_tag_fn_name(arch, location, local_rank, rank_to_send_to)
             field_id    = string(arch.mpi_tag[1], pad=ID_DIGITS)
-            loc_id      = string(location_id.(location), pad=LOC_DIGITS)
+            loc_id      = string(location_id(location...), pad=LOC_DIGITS)
             from_digits = string(local_rank, pad=RANK_DIGITS)
             to_digits   = string(rank_to_send_to, pad=RANK_DIGITS)
             side_digit  = string(side_id[Symbol($side_str)])
             return parse(Int, field_id * loc_id * side_digit * from_digits * to_digits)
         end
 
-        function $recv_tag_fn_name(arch, local_rank, location, rank_to_recv_from)
+        function $recv_tag_fn_name(arch, location, local_rank, rank_to_recv_from)
             field_id    = string(arch.mpi_tag[1], pad=ID_DIGITS)
-            loc_id      = string(location_id.(location), pad=LOC_DIGITS)
+            loc_id      = string(location_id(location...), pad=LOC_DIGITS)
             from_digits = string(rank_to_recv_from, pad=RANK_DIGITS)
             to_digits   = string(local_rank, pad=RANK_DIGITS)
             side_digit  = string(side_id[opposite_side[Symbol($side_str)]])

From 7a8236db904fe2abc14bbbe4c5bddcedca33ec81 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Sun, 12 Mar 2023 19:50:48 -0400
Subject: [PATCH 062/530] reset MPI tag

---
 src/Distributed/halo_communication.jl | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 5bdbffeec7..39d6a2d390 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -171,6 +171,9 @@ function fill_halo_event!(task, halo_tuple, c, indices, loc, arch::DistributedAr
 
     # Syncronous MPI fill_halo_event!
     MPI.Waitall(requests)
+    # Reset MPI tag
+    arch.mpi_tag[1] -= arch.mpi_tag[1]
+
     buffer_side = mpi_communication_side(Val(fill_halo!))
     recv_from_buffers!(c, buffers, grid, Val(buffer_side))    
 

From ca5551b8e8bbd76e96f5c5a3d5d15e5a768b5baa Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Sun, 12 Mar 2023 21:16:40 -0400
Subject: [PATCH 063/530] change

---
 .../convective_adjustment_vertical_diffusivity.jl  | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/convective_adjustment_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/convective_adjustment_vertical_diffusivity.jl
index dc7f8279a9..91d9b66336 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/convective_adjustment_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/convective_adjustment_vertical_diffusivity.jl
@@ -88,25 +88,29 @@ DiffusivityFields(grid, tracer_names, bcs, closure::FlavorOfCAVD) = (; κ = ZFac
 @inline viscosity(::FlavorOfCAVD, diffusivities) = diffusivities.ν
 @inline diffusivity(::FlavorOfCAVD, diffusivities, id) = diffusivities.κ
 
-function calculate_diffusivities!(diffusivities, closure::FlavorOfCAVD, model)
+function calculate_diffusivities!(diffusivities, closure::FlavorOfCAVD, model; kernel_size = κ_kernel_size(model.grid), kernel_offsets = κ_kernel_offsets(model.grid))
 
     arch = model.architecture
     grid = model.grid
     tracers = model.tracers
     buoyancy = model.buoyancy
 
-    launch!(arch, grid, :xyz,
+    launch!(arch, grid, kernel_size,
             ## If we can figure out how to only precompute the "stability" of a cell:
             # compute_stability!, diffusivities, grid, closure, tracers, buoyancy,
-            compute_convective_adjustment_diffusivities!, diffusivities, grid, closure, tracers, buoyancy)
+            compute_convective_adjustment_diffusivities!, diffusivities, kernel_offsets, grid, closure, tracers, buoyancy)
 
     return nothing
 end
 
 @inline is_stableᶜᶜᶠ(i, j, k, grid, tracers, buoyancy) = ∂z_b(i, j, k, grid, buoyancy, tracers) >= 0
 
-@kernel function compute_convective_adjustment_diffusivities!(diffusivities, grid, closure, tracers, buoyancy)
-    i, j, k, = @index(Global, NTuple)
+@kernel function compute_convective_adjustment_diffusivities!(diffusivities, offs, grid, closure, tracers, buoyancy)
+    i′, j′, k′ = @index(Global, NTuple)
+
+    i = i′ + offs[1] 
+    j = j′ + offs[2] 
+    k = k′ + offs[3]
 
     # Ensure this works with "ensembles" of closures, in addition to ordinary single closures
     closure_ij = getclosure(i, j, closure)

From 12ebbe5ef3102fcc36afbd44ceb7910ae9d5b3ac Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Mon, 13 Mar 2023 08:20:51 -0400
Subject: [PATCH 064/530] blocking is opposite than async

---
 src/Architectures.jl                                          | 4 ++--
 src/Distributed/halo_communication.jl                         | 4 ++--
 .../split_explicit_free_surface_kernels.jl                    | 4 ++--
 .../update_hydrostatic_free_surface_model_state.jl            | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/Architectures.jl b/src/Architectures.jl
index 2b4d6f726f..3c1c364a52 100644
--- a/src/Architectures.jl
+++ b/src/Architectures.jl
@@ -88,11 +88,11 @@ function unified_array(::GPU, arr::AbstractArray)
 end
 
 ## Only for contiguous data!! (i.e. only if the offset for pointer(dst::CuArray, offset::Int) is 1)
-@inline function device_copy_to!(dst::CuArray, src::CuArray; blocking::Bool = false) 
+@inline function device_copy_to!(dst::CuArray, src::CuArray; blocking::Bool = true) 
     n = length(src)
     context!(context(src)) do
         GC.@preserve src dst begin
-            unsafe_copyto!(pointer(dst, 1), pointer(src, 1), n; async = blocking)
+            unsafe_copyto!(pointer(dst, 1), pointer(src, 1), n; async = !(blocking))
         end
     end
     return dst
diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 39d6a2d390..bc8455c202 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -146,7 +146,7 @@ end
 @inline mpi_communication_side(::Val{fill_south_and_north_halo!}) = :south_and_north
 @inline mpi_communication_side(::Val{fill_bottom_and_top_halo!})  = :bottom_and_top
 
-function fill_halo_event!(task, halo_tuple, c, indices, loc, arch::DistributedArch, grid::DistributedGrid, buffers, args...; blocking = false, kwargs...)
+function fill_halo_event!(task, halo_tuple, c, indices, loc, arch::DistributedArch, grid::DistributedGrid, buffers, args...; blocking = true, kwargs...)
     fill_halo!  = halo_tuple[1][task]
     bc_left     = halo_tuple[2][task]
     bc_right    = halo_tuple[3][task]
@@ -164,7 +164,7 @@ function fill_halo_event!(task, halo_tuple, c, indices, loc, arch::DistributedAr
 
     # Overlapping communication and computation, store requests in a `MPI.Request`
     # pool to be waited upon after tendency calculation
-    if blocking && !(arch isa BlockingDistributedArch)
+    if !blocking && !(arch isa BlockingDistributedArch)
         push!(arch.mpi_requests, requests...)
         return nothing
     end
diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
index b36df30314..110c93c749 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
@@ -301,7 +301,7 @@ function split_explicit_free_surface_step!(free_surface::SplitExplicitFreeSurfac
     @apply_regionally set!(free_surface.η, free_surface.state.η̅)
 
     fields_to_fill = (free_surface.state.U̅, free_surface.state.V̅)
-    fill_halo_regions!(fields_to_fill; blocking = true)
+    fill_halo_regions!(fields_to_fill; blocking = false)
 
     return nothing
 end
@@ -341,7 +341,7 @@ function setup_free_surface!(model, free_surface::SplitExplicitFreeSurface, χ)
     @apply_regionally setup_split_explicit_tendency!(auxiliary, grid, Gu, Gv, Guⁿ, Gvⁿ, χ)
 
     fields_to_fill = (auxiliary.Gᵁ, auxiliary.Gⱽ)
-    fill_halo_regions!(fields_to_fill; blocking = true)
+    fill_halo_regions!(fields_to_fill; blocking = false)
 
     return nothing
 end
diff --git a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
index 5a301b8e5a..4f09f5592a 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
@@ -25,7 +25,7 @@ function update_state!(model::HydrostaticFreeSurfaceModel, grid, callbacks; comp
 
     @apply_regionally masking_immersed_model_fields!(model, grid)
 
-    fill_halo_regions!(prognostic_fields(model), model.clock, fields(model); blocking = true)
+    fill_halo_regions!(prognostic_fields(model), model.clock, fields(model); blocking = false)
 
     @apply_regionally compute_w_diffusivities_pressure!(model)
 

From 831efe27cbfe19181903acc2c9b1bf08bbdb58be Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Mon, 13 Mar 2023 10:06:27 -0400
Subject: [PATCH 065/530] test hypothesis

---
 .../update_hydrostatic_free_surface_model_state.jl            | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
index 4f09f5592a..a1c9b80d79 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
@@ -29,6 +29,10 @@ function update_state!(model::HydrostaticFreeSurfaceModel, grid, callbacks; comp
 
     @apply_regionally compute_w_diffusivities_pressure!(model)
 
+    fill_halo_regions!(model.velocities.w)
+    fill_halo_regions!(model.pressure.pHY′)
+    fill_halo_regions!(model.diffusivity_fields)
+
     [callback(model) for callback in callbacks if isa(callback.callsite, UpdateStateCallsite)]
     
     compute_tendencies && 

From 6ddc70936bda12f69a19d17d7c0e29326f2f6cd8 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Mon, 13 Mar 2023 10:21:36 -0400
Subject: [PATCH 066/530] not the problem

---
 .../update_hydrostatic_free_surface_model_state.jl            | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
index a1c9b80d79..4f09f5592a 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
@@ -29,10 +29,6 @@ function update_state!(model::HydrostaticFreeSurfaceModel, grid, callbacks; comp
 
     @apply_regionally compute_w_diffusivities_pressure!(model)
 
-    fill_halo_regions!(model.velocities.w)
-    fill_halo_regions!(model.pressure.pHY′)
-    fill_halo_regions!(model.diffusivity_fields)
-
     [callback(model) for callback in callbacks if isa(callback.callsite, UpdateStateCallsite)]
     
     compute_tendencies && 

From dab0b36cc001feaedbd92292c95153db25e58dfd Mon Sep 17 00:00:00 2001
From: ssilvest <ssilvest@eofe7.mit.edu>
Date: Mon, 13 Mar 2023 11:24:16 -0400
Subject: [PATCH 067/530] fixed split explicit

---
 .../split_explicit_free_surface_kernels.jl    | 37 +++++++++----------
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
index 110c93c749..9cecb5f45d 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
@@ -266,11 +266,21 @@ function barotropic_split_explicit_corrector!(u, v, free_surface, grid)
     return nothing
 end
 
-@kernel function _calc_ab2_tendencies!(G⁻, Gⁿ, χ)
-    i, j, k = @index(Global, NTuple)
-    @inbounds G⁻[i, j, k] = (1.5 + χ) *  Gⁿ[i, j, k] - G⁻[i, j, k] * (0.5 + χ)
+@kernel function _compute_integrated_ab2_tendencies!(auxiliary, Gu⁻, Gv⁻, Guⁿ, Gvⁿ, χ)
+    i, j  = @index(Global, NTuple)	
+
+    # hand unroll first loop 	
+    @inbounds auxiliary.Gᵁ[i, j, 1] = Δzᶠᶜᶜ(i, j, 1, grid) * ab2_add(i, j, 1, Gu⁻, Guⁿ, χ)
+    @inbounds auxiliary.Gⱽ[i, j, 1] = Δzᶜᶠᶜ(i, j, 1, grid) * ab2_add(i, j, 1, Gv⁻, Gvⁿ, χ)
+
+    @unroll for k in 2:grid.Nz	
+        @inbounds auxiliary.Gᵁ[i, j, 1] += Δzᶠᶜᶜ(i, j, k, grid) * ab2_add(i, j, k, Gu⁻, Guⁿ, χ)
+        @inbounds auxiliary.Gⱽ[i, j, 1] += Δzᶜᶠᶜ(i, j, k, grid) * ab2_add(i, j, k, Gv⁻, Gvⁿ, χ)
+    end	
 end
 
+@inline ab2_add(i, j, k, G⁻, Gⁿ, χ) = (1.5 + χ) *  Gⁿ[i, j, k] - G⁻[i, j, k] * (0.5 + χ)
+
 """
 Explicitly step forward η in substeps.
 """
@@ -331,14 +341,14 @@ function setup_free_surface!(model, free_surface::SplitExplicitFreeSurface, χ)
     grid = free_surface.η.grid
     
     # we start the time integration of η from the average ηⁿ     
-    Gu  = model.timestepper.G⁻.u
-    Gv  = model.timestepper.G⁻.v
+    Gu⁻ = model.timestepper.G⁻.u
+    Gv⁻ = model.timestepper.G⁻.v
     Guⁿ = model.timestepper.Gⁿ.u
     Gvⁿ = model.timestepper.Gⁿ.v
     
     auxiliary = free_surface.auxiliary
 
-    @apply_regionally setup_split_explicit_tendency!(auxiliary, grid, Gu, Gv, Guⁿ, Gvⁿ, χ)
+    @apply_regionally setup_split_explicit_tendency!(auxiliary, grid, Gu⁻, Gv⁻, Guⁿ, Gvⁿ, χ)
 
     fields_to_fill = (auxiliary.Gᵁ, auxiliary.Gⱽ)
     fill_halo_regions!(fields_to_fill; blocking = false)
@@ -346,18 +356,7 @@ function setup_free_surface!(model, free_surface::SplitExplicitFreeSurface, χ)
     return nothing
 end
 
-function setup_split_explicit_tendency!(auxiliary, grid, Gu, Gv, Guⁿ, Gvⁿ, χ)
-    arch = architecture(grid)
-
-    launch!(arch, grid, :xyz, _calc_ab2_tendencies!, Gu, Guⁿ, χ)
-    launch!(arch, grid, :xyz, _calc_ab2_tendencies!, Gv, Gvⁿ, χ)
-    
-    mask_immersed_field!(Gu)
-    mask_immersed_field!(Gv)
-
-    barotropic_mode!(auxiliary.Gᵁ, auxiliary.Gⱽ, grid, Gu, Gv)
-
-    return nothing
-end
+setup_split_explicit_tendency!(auxiliary, grid, Gu⁻, Gv⁻, Guⁿ, Gvⁿ, χ) = 
+    launch(architecture(grid), grid, :xyz, _compute_integrated_ab2_tendencies!, auxiliary, Gu⁻, Gv⁻, Guⁿ, Gvⁿ, χ)
 
 wait_free_surface_communication!(free_surface) = nothing

From b441784cfa42a45aad6fc88bee0bf2b1cb6427e2 Mon Sep 17 00:00:00 2001
From: ssilvest <ssilvest@eofe7.mit.edu>
Date: Mon, 13 Mar 2023 11:34:50 -0400
Subject: [PATCH 068/530] bugfix

---
 .../split_explicit_free_surface_kernels.jl                      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
index 9cecb5f45d..7aa5ca8a57 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
@@ -357,6 +357,6 @@ function setup_free_surface!(model, free_surface::SplitExplicitFreeSurface, χ)
 end
 
 setup_split_explicit_tendency!(auxiliary, grid, Gu⁻, Gv⁻, Guⁿ, Gvⁿ, χ) = 
-    launch(architecture(grid), grid, :xyz, _compute_integrated_ab2_tendencies!, auxiliary, Gu⁻, Gv⁻, Guⁿ, Gvⁿ, χ)
+    launch!(architecture(grid), grid, :xyz, _compute_integrated_ab2_tendencies!, auxiliary, Gu⁻, Gv⁻, Guⁿ, Gvⁿ, χ)
 
 wait_free_surface_communication!(free_surface) = nothing

From face952a9fa113531a9671252bb6b9cd84d808fb Mon Sep 17 00:00:00 2001
From: ssilvest <ssilvest@eofe7.mit.edu>
Date: Mon, 13 Mar 2023 15:36:04 -0400
Subject: [PATCH 069/530] took a while

---
 .../split_explicit_free_surface_kernels.jl    | 48 +++++++++++--------
 1 file changed, 29 insertions(+), 19 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
index 7aa5ca8a57..c12c4e952e 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
@@ -115,6 +115,8 @@ end
 @inline function advance_previous_velocity!(i, j, k, ::AdamsBashforth3Scheme, U, Uᵐ⁻¹, Uᵐ⁻²)
     @inbounds Uᵐ⁻²[i, j, k] = Uᵐ⁻¹[i, j, k] 
     @inbounds Uᵐ⁻¹[i, j, k] =    U[i, j, k] 
+
+    return nothing
 end
 
 @inline advance_previous_free_surface!(i, j, k, ::ForwardBackwardScheme, η, ηᵐ, ηᵐ⁻¹, ηᵐ⁻²) = nothing
@@ -123,6 +125,8 @@ end
     @inbounds ηᵐ⁻²[i, j, k] = ηᵐ⁻¹[i, j, k]
     @inbounds ηᵐ⁻¹[i, j, k] =   ηᵐ[i, j, k]
     @inbounds   ηᵐ[i, j, k] =    η[i, j, k]
+
+    return nothing
 end
 
 @kernel function split_explicit_free_surface_evolution_kernel!(grid, Δτ, η, ηᵐ, ηᵐ⁻¹, ηᵐ⁻², U, V, Uᵐ⁻¹, Uᵐ⁻², Vᵐ⁻¹, Vᵐ⁻², 
@@ -238,6 +242,8 @@ function initialize_free_surface_state!(free_surface_state, η)
     fill!(state.η̅, 0.0)
     fill!(state.U̅, 0.0)
     fill!(state.V̅, 0.0)
+
+    return nothing
 end
 
 @kernel function barotropic_split_explicit_corrector_kernel!(u, v, U̅, V̅, U, V, Hᶠᶜ, Hᶜᶠ)
@@ -266,21 +272,6 @@ function barotropic_split_explicit_corrector!(u, v, free_surface, grid)
     return nothing
 end
 
-@kernel function _compute_integrated_ab2_tendencies!(auxiliary, Gu⁻, Gv⁻, Guⁿ, Gvⁿ, χ)
-    i, j  = @index(Global, NTuple)	
-
-    # hand unroll first loop 	
-    @inbounds auxiliary.Gᵁ[i, j, 1] = Δzᶠᶜᶜ(i, j, 1, grid) * ab2_add(i, j, 1, Gu⁻, Guⁿ, χ)
-    @inbounds auxiliary.Gⱽ[i, j, 1] = Δzᶜᶠᶜ(i, j, 1, grid) * ab2_add(i, j, 1, Gv⁻, Gvⁿ, χ)
-
-    @unroll for k in 2:grid.Nz	
-        @inbounds auxiliary.Gᵁ[i, j, 1] += Δzᶠᶜᶜ(i, j, k, grid) * ab2_add(i, j, k, Gu⁻, Guⁿ, χ)
-        @inbounds auxiliary.Gⱽ[i, j, 1] += Δzᶜᶠᶜ(i, j, k, grid) * ab2_add(i, j, k, Gv⁻, Gvⁿ, χ)
-    end	
-end
-
-@inline ab2_add(i, j, k, G⁻, Gⁿ, χ) = (1.5 + χ) *  Gⁿ[i, j, k] - G⁻[i, j, k] * (0.5 + χ)
-
 """
 Explicitly step forward η in substeps.
 """
@@ -290,6 +281,8 @@ ab2_step_free_surface!(free_surface::SplitExplicitFreeSurface, model, Δt, χ) =
 function initialize_free_surface!(sefs::SplitExplicitFreeSurface, grid, velocities)
     @apply_regionally barotropic_mode!(sefs.state.U̅, sefs.state.V̅, grid, velocities.u, velocities.v)
     fill_halo_regions!((sefs.state.U̅, sefs.state.V̅))
+
+    return nothing
 end
 
 function split_explicit_free_surface_step!(free_surface::SplitExplicitFreeSurface, model, Δt, χ)
@@ -334,8 +327,25 @@ function iterate_split_explicit!(free_surface, grid, Δt)
     return nothing
 end
 
-# Setting up the tendencies and the communicating the barotopic velocity components
-# This function is called after `calculate_tendency` and before `ab2_step!`
+# Calculate RHS for the barotopic time step. 
+@kernel function _compute_integrated_ab2_tendencies!(Gᵁ, Gⱽ, grid, Gu⁻, Gv⁻, Guⁿ, Gvⁿ, χ)
+    i, j  = @index(Global, NTuple)	
+
+    # hand unroll first loop 	
+    @inbounds Gᵁ[i, j, 1] = Δzᶠᶜᶜ(i, j, 1, grid) * ab2_step_Gu(i, j, 1, grid, Gu⁻, Guⁿ, χ)
+    @inbounds Gⱽ[i, j, 1] = Δzᶜᶠᶜ(i, j, 1, grid) * ab2_step_Gv(i, j, 1, grid, Gv⁻, Gvⁿ, χ)
+
+    @unroll for k in 2:grid.Nz	
+        @inbounds Gᵁ[i, j, 1] += Δzᶠᶜᶜ(i, j, k, grid) * ab2_step_Gu(i, j, k, grid, Gu⁻, Guⁿ, χ)
+        @inbounds Gⱽ[i, j, 1] += Δzᶜᶠᶜ(i, j, k, grid) * ab2_step_Gv(i, j, k, grid, Gv⁻, Gvⁿ, χ)
+    end	
+end
+
+@inline ab2_step_Gu(i, j, k, grid, G⁻, Gⁿ, χ) = ifelse(peripheral_node(i, j, k, grid, f, c, c), zero(grid), (1.5 + χ) *  Gⁿ[i, j, k] - G⁻[i, j, k] * (0.5 + χ))
+@inline ab2_step_Gv(i, j, k, grid, G⁻, Gⁿ, χ) = ifelse(peripheral_node(i, j, k, grid, c, f, c), zero(grid), (1.5 + χ) *  Gⁿ[i, j, k] - G⁻[i, j, k] * (0.5 + χ))
+
+# Setting up the RHS for the barotropic step (tendencies of the barotopic velocity components)
+# This function is called after `calculate_tendency` and before `ab2_step_velocities!`
 function setup_free_surface!(model, free_surface::SplitExplicitFreeSurface, χ)
 
     grid = free_surface.η.grid
@@ -356,7 +366,7 @@ function setup_free_surface!(model, free_surface::SplitExplicitFreeSurface, χ)
     return nothing
 end
 
-setup_split_explicit_tendency!(auxiliary, grid, Gu⁻, Gv⁻, Guⁿ, Gvⁿ, χ) = 
-    launch!(architecture(grid), grid, :xyz, _compute_integrated_ab2_tendencies!, auxiliary, Gu⁻, Gv⁻, Guⁿ, Gvⁿ, χ)
+setup_split_explicit_tendency!(auxiliary, grid, Gu⁻, Gv⁻, Guⁿ, Gvⁿ, χ) =
+    launch!(architecture(grid), grid, :xy, _compute_integrated_ab2_tendencies!, auxiliary.Gᵁ, auxiliary.Gⱽ, grid, Gu⁻, Gv⁻, Guⁿ, Gvⁿ, χ)
 
 wait_free_surface_communication!(free_surface) = nothing

From 0a36aebf5d9b2cda36d66b63b8a05096c24d001a Mon Sep 17 00:00:00 2001
From: Valentin Churavy <v.churavy@gmail.com>
Date: Mon, 13 Mar 2023 18:34:12 -0400
Subject: [PATCH 070/530] Setup recv before staging the buffers

---
 src/Distributed/halo_communication.jl | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index bc8455c202..4da3414ec4 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -205,14 +205,14 @@ for (side, opposite_side, dir) in zip([:west, :south, :bottom], [:east, :north,
             @assert bc_side.condition.from == bc_opposite_side.condition.from  # Extra protection in case of bugs
             local_rank = bc_side.condition.from
 
+            recv_req1 = $recv_and_fill_side_halo!(c, grid, arch, loc[$dir], loc, local_rank, bc_side.condition.to, buffers)
+            recv_req2 = $recv_and_fill_opposite_side_halo!(c, grid, arch, loc[$dir], loc, local_rank, bc_opposite_side.condition.to, buffers)
+
             # This has to be synchronized!!
             $fill_all_send_buffers!(c, buffers, grid)
 
             sync_device!(child_architecture(arch))
 
-            recv_req1 = $recv_and_fill_side_halo!(c, grid, arch, loc[$dir], loc, local_rank, bc_side.condition.to, buffers)
-            recv_req2 = $recv_and_fill_opposite_side_halo!(c, grid, arch, loc[$dir], loc, local_rank, bc_opposite_side.condition.to, buffers)
-
             send_req1 = $send_side_halo(c, grid, arch, loc[$dir], loc, local_rank, bc_side.condition.to, buffers)
             send_req2 = $send_opposite_side_halo(c, grid, arch, loc[$dir], loc, local_rank, bc_opposite_side.condition.to, buffers)
 
@@ -225,12 +225,13 @@ for (side, opposite_side, dir) in zip([:west, :south, :bottom], [:east, :north,
             child_arch = child_architecture(arch)
             local_rank = bc_side.condition.from
 
+            recv_req = $recv_and_fill_side_halo!(c, grid, arch, loc[$dir], loc, local_rank, bc_side.condition.to, buffers)
+
             $fill_opposite_side_halo!(c, bc_opposite_side, size, offset, loc, arch, grid, buffers, args...; kwargs...)
             $fill_side_send_buffers!(c, buffers, grid)
 
             sync_device!(child_arch)
 
-            recv_req = $recv_and_fill_side_halo!(c, grid, arch, loc[$dir], loc, local_rank, bc_side.condition.to, buffers)
             send_req = $send_side_halo(c, grid, arch, loc[$dir], loc, local_rank, bc_side.condition.to, buffers)
             
             return [send_req, recv_req]
@@ -242,12 +243,13 @@ for (side, opposite_side, dir) in zip([:west, :south, :bottom], [:east, :north,
             child_arch = child_architecture(arch)
             local_rank = bc_opposite_side.condition.from
 
+            recv_req = $recv_and_fill_opposite_side_halo!(c, grid, arch, loc[$dir], loc, local_rank, bc_opposite_side.condition.to, buffers)
+
             $fill_side_halo!(c, bc_side, size, offset, loc, arch, grid, buffers, args...; kwargs...)
             $fill_opposite_side_send_buffers!(c, buffers, grid)
 
             sync_device!(child_arch)
 
-            recv_req = $recv_and_fill_opposite_side_halo!(c, grid, arch, loc[$dir], loc, local_rank, bc_opposite_side.condition.to, buffers)
             send_req = $send_opposite_side_halo(c, grid, arch, loc[$dir], loc, local_rank, bc_opposite_side.condition.to, buffers)
 
             return [send_req, recv_req]

From fab1300b4c49b8f9e60819d457cc84946320572f Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Mon, 13 Mar 2023 23:57:30 -0400
Subject: [PATCH 071/530] test tasks

---
 src/Distributed/halo_communication.jl       | 41 +++++++++++++++++++--
 src/Distributed/interleave_comm_and_comp.jl |  2 +-
 2 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 4da3414ec4..46ec6939f7 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -146,6 +146,31 @@ end
 @inline mpi_communication_side(::Val{fill_south_and_north_halo!}) = :south_and_north
 @inline mpi_communication_side(::Val{fill_bottom_and_top_halo!})  = :bottom_and_top
 
+
+### JUST TO TEST, EVENTUALLY IMPORT FROM MPI OR KA
+function cooperative_test!(req)
+    done = false
+    while !done
+        done, _ = MPI.Test(req, MPI.Status)
+        yield()
+    end
+end
+
+### JUST TO TEST, EVENTUALLY IMPORT FROM MPI OR KA
+function cooperative_wait(task::Task)
+    while !Base.istaskdone(task)
+        MPI.Iprobe(MPI.MPI_ANY_SOURCE, MPI.MPI_ANY_TAG, MPI.COMM_WORLD)
+        yield()
+    end
+    wait(task)
+end
+
+function cooperative_waitall!(tasks::Array{Task})
+    for task in tasks
+        cooperative_wait(task)
+    end
+end
+
 function fill_halo_event!(task, halo_tuple, c, indices, loc, arch::DistributedArch, grid::DistributedGrid, buffers, args...; blocking = true, kwargs...)
     fill_halo!  = halo_tuple[1][task]
     bc_left     = halo_tuple[2][task]
@@ -170,7 +195,7 @@ function fill_halo_event!(task, halo_tuple, c, indices, loc, arch::DistributedAr
     end
 
     # Syncronous MPI fill_halo_event!
-    MPI.Waitall(requests)
+    cooperative_waitall!(requests)
     # Reset MPI tag
     arch.mpi_tag[1] -= arch.mpi_tag[1]
 
@@ -274,9 +299,13 @@ for side in sides
             send_tag = $side_send_tag(arch, location, local_rank, rank_to_send_to)
 
             @debug "Sending " * $side_str * " halo: local_rank=$local_rank, rank_to_send_to=$rank_to_send_to, send_tag=$send_tag"
-            send_req = MPI.Isend(send_buffer, rank_to_send_to, send_tag, arch.communicator)
+            
+            send = @async begin
+                send_req = MPI.Isend(send_buffer, rank_to_send_to, send_tag, arch.communicator)
+                cooperative_test!(send_req)
+            end
 
-            return send_req
+            return send
         end
 
         @inline $get_side_send_buffer(c, grid, side_location, buffers, ::ViewsDistributedArch) = $underlying_side_boundary(c, grid, side_location)
@@ -303,7 +332,11 @@ for side in sides
             @debug "Receiving " * $side_str * " halo: local_rank=$local_rank, rank_to_recv_from=$rank_to_recv_from, recv_tag=$recv_tag"
             recv_req = MPI.Irecv!(recv_buffer, rank_to_recv_from, recv_tag, arch.communicator)
 
-            return recv_req
+            recv = @async begin
+                cooperative_test!(recv_req)
+            end
+
+            return recv
         end
 
         @inline $get_side_recv_buffer(c, grid, side_location, buffers, ::ViewsDistributedArch) = $underlying_side_halo(c, grid, side_location)
diff --git a/src/Distributed/interleave_comm_and_comp.jl b/src/Distributed/interleave_comm_and_comp.jl
index ce4b938924..14c0d66af3 100644
--- a/src/Distributed/interleave_comm_and_comp.jl
+++ b/src/Distributed/interleave_comm_and_comp.jl
@@ -56,7 +56,7 @@ function complete_halo_communication!(field)
 
     # Wait for outstanding requests
     if !isempty(arch.mpi_requests) 
-        MPI.Waitall(arch.mpi_requests)
+        cooperative_waitall!(arch.mpi_requests)
 
         # Reset MPI tag
         arch.mpi_tag[1] -= arch.mpi_tag[1]

From ff6b4e9eed413a6b51f0aeb79d3d13fc806f0849 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Tue, 14 Mar 2023 07:54:26 -0400
Subject: [PATCH 072/530] bugfix

---
 src/Distributed/halo_communication.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 46ec6939f7..650b992b94 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -159,7 +159,7 @@ end
 ### JUST TO TEST, EVENTUALLY IMPORT FROM MPI OR KA
 function cooperative_wait(task::Task)
     while !Base.istaskdone(task)
-        MPI.Iprobe(MPI.MPI_ANY_SOURCE, MPI.MPI_ANY_TAG, MPI.COMM_WORLD)
+        MPI.Iprobe(MPI.ANY_SOURCE, MPI.ANY_TAG, MPI.COMM_WORLD)
         yield()
     end
     wait(task)

From 9fc429b8165d7df91893903b843117d4706a1a8c Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Tue, 14 Mar 2023 07:56:32 -0400
Subject: [PATCH 073/530] MPI.Request -> Task

---
 src/Distributed/halo_communication.jl  | 1 -
 src/Distributed/multi_architectures.jl | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 650b992b94..bbc6067423 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -146,7 +146,6 @@ end
 @inline mpi_communication_side(::Val{fill_south_and_north_halo!}) = :south_and_north
 @inline mpi_communication_side(::Val{fill_bottom_and_top_halo!})  = :bottom_and_top
 
-
 ### JUST TO TEST, EVENTUALLY IMPORT FROM MPI OR KA
 function cooperative_test!(req)
     done = false
diff --git a/src/Distributed/multi_architectures.jl b/src/Distributed/multi_architectures.jl
index 2d1f8e2332..bfa4ef9d36 100644
--- a/src/Distributed/multi_architectures.jl
+++ b/src/Distributed/multi_architectures.jl
@@ -105,7 +105,7 @@ function DistributedArch(child_architecture = CPU();
         isnothing(devices) ? device!(node_rank % ndevices()) : device!(devices[node_rank+1]) 
     end
 
-    mpi_requests = enable_overlapped_computation ? MPI.Request[] : nothing
+    mpi_requests = enable_overlapped_computation ? Task[] : nothing
 
     B = use_buffers
     M = typeof(mpi_requests)

From 52a218a69e98b151e861ca55a6cef24893d2d552 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Tue, 14 Mar 2023 11:39:40 -0400
Subject: [PATCH 074/530] back to Waitall!

---
 src/Distributed/halo_communication.jl       | 15 ++++-----------
 src/Distributed/interleave_comm_and_comp.jl |  2 +-
 src/Distributed/multi_architectures.jl      |  2 +-
 3 files changed, 6 insertions(+), 13 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index bbc6067423..d90794390c 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -194,7 +194,7 @@ function fill_halo_event!(task, halo_tuple, c, indices, loc, arch::DistributedAr
     end
 
     # Syncronous MPI fill_halo_event!
-    cooperative_waitall!(requests)
+    MPI.Waitall!(requests)
     # Reset MPI tag
     arch.mpi_tag[1] -= arch.mpi_tag[1]
 
@@ -299,12 +299,9 @@ for side in sides
 
             @debug "Sending " * $side_str * " halo: local_rank=$local_rank, rank_to_send_to=$rank_to_send_to, send_tag=$send_tag"
             
-            send = @async begin
-                send_req = MPI.Isend(send_buffer, rank_to_send_to, send_tag, arch.communicator)
-                cooperative_test!(send_req)
-            end
+            send_req = MPI.Isend(send_buffer, rank_to_send_to, send_tag, arch.communicator)
 
-            return send
+            return send_req
         end
 
         @inline $get_side_send_buffer(c, grid, side_location, buffers, ::ViewsDistributedArch) = $underlying_side_boundary(c, grid, side_location)
@@ -331,11 +328,7 @@ for side in sides
             @debug "Receiving " * $side_str * " halo: local_rank=$local_rank, rank_to_recv_from=$rank_to_recv_from, recv_tag=$recv_tag"
             recv_req = MPI.Irecv!(recv_buffer, rank_to_recv_from, recv_tag, arch.communicator)
 
-            recv = @async begin
-                cooperative_test!(recv_req)
-            end
-
-            return recv
+            return recv_req
         end
 
         @inline $get_side_recv_buffer(c, grid, side_location, buffers, ::ViewsDistributedArch) = $underlying_side_halo(c, grid, side_location)
diff --git a/src/Distributed/interleave_comm_and_comp.jl b/src/Distributed/interleave_comm_and_comp.jl
index 14c0d66af3..4bad558f6b 100644
--- a/src/Distributed/interleave_comm_and_comp.jl
+++ b/src/Distributed/interleave_comm_and_comp.jl
@@ -56,7 +56,7 @@ function complete_halo_communication!(field)
 
     # Wait for outstanding requests
     if !isempty(arch.mpi_requests) 
-        cooperative_waitall!(arch.mpi_requests)
+        MPI.Waitall!(arch.mpi_requests)
 
         # Reset MPI tag
         arch.mpi_tag[1] -= arch.mpi_tag[1]
diff --git a/src/Distributed/multi_architectures.jl b/src/Distributed/multi_architectures.jl
index bfa4ef9d36..2d1f8e2332 100644
--- a/src/Distributed/multi_architectures.jl
+++ b/src/Distributed/multi_architectures.jl
@@ -105,7 +105,7 @@ function DistributedArch(child_architecture = CPU();
         isnothing(devices) ? device!(node_rank % ndevices()) : device!(devices[node_rank+1]) 
     end
 
-    mpi_requests = enable_overlapped_computation ? Task[] : nothing
+    mpi_requests = enable_overlapped_computation ? MPI.Request[] : nothing
 
     B = use_buffers
     M = typeof(mpi_requests)

From 52ce3bad1a218ea673a4a9fcf1a861d3957572f9 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Thu, 16 Mar 2023 21:42:12 -0400
Subject: [PATCH 075/530] bugfix

---
 .../split_explicit_free_surface_kernels.jl                    | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
index c12c4e952e..73c6f48b77 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
@@ -306,6 +306,10 @@ function split_explicit_free_surface_step!(free_surface::SplitExplicitFreeSurfac
     fields_to_fill = (free_surface.state.U̅, free_surface.state.V̅)
     fill_halo_regions!(fields_to_fill; blocking = false)
 
+    # Preparing velocities for the barotropic correction
+    mask_immersed_field!(model.velocities.u)
+    mask_immersed_field!(model.velocities.v)
+
     return nothing
 end
 

From f20d38879ab83067cdee1a6da9dc47141084cf98 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Thu, 16 Mar 2023 21:42:54 -0400
Subject: [PATCH 076/530] import it

---
 .../split_explicit_free_surface_kernels.jl                   | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
index 73c6f48b77..a7c7641e24 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
@@ -5,8 +5,9 @@ using Oceananigans.Utils
 using Oceananigans.AbstractOperations: Δz  
 using Oceananigans.BoundaryConditions
 using Oceananigans.Operators
-using Oceananigans.ImmersedBoundaries: peripheral_node, immersed_inactive_node,
-                                       inactive_node, IBG, c, f
+using Oceananigans.ImmersedBoundaries: peripheral_node, immersed_inactive_node
+using Oceananigans.ImmersedBoundaries: inactive_node, IBG, c, f
+using Oceananigans.ImmersedBoundaries: mask_immersed_field!
 
 # constants for AB3 time stepping scheme (from https://doi.org/10.1016/j.ocemod.2004.08.002)
 const β = 0.281105

From 883befc6cfe9d22b4eb3793a2d2a0fc73c9a6ede Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Thu, 16 Mar 2023 21:59:45 -0400
Subject: [PATCH 077/530] more updates

---
 .../NonhydrostaticModels/update_nonhydrostatic_model_state.jl | 4 ++--
 src/Models/ShallowWaterModels/update_shallow_water_state.jl   | 4 +---
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl b/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
index 21b38af7fe..b140049368 100644
--- a/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
+++ b/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
@@ -19,7 +19,7 @@ function update_state!(model::NonhydrostaticModel, callbacks=[]; compute_tendenc
     foreach(mask_immersed_field!, model.tracers)
 
     # Fill halos for velocities and tracers
-    fill_halo_regions!(merge(model.velocities, model.tracers),  model.clock, fields(model))
+    fill_halo_regions!(merge(model.velocities, model.tracers))
 
     # Compute auxiliary fields
     for aux_field in model.auxiliary_fields
@@ -28,7 +28,7 @@ function update_state!(model::NonhydrostaticModel, callbacks=[]; compute_tendenc
 
     # Calculate diffusivities
     calculate_diffusivities!(model.diffusivity_fields, model.closure, model)
-    fill_halo_regions!(model.diffusivity_fields, model.clock, fields(model))
+    fill_halo_regions!(model.diffusivity_fields)
 
     update_hydrostatic_pressure!(model)
     fill_halo_regions!(model.pressures.pHY′)
diff --git a/src/Models/ShallowWaterModels/update_shallow_water_state.jl b/src/Models/ShallowWaterModels/update_shallow_water_state.jl
index 25028b2cd9..4f7043bdb7 100644
--- a/src/Models/ShallowWaterModels/update_shallow_water_state.jl
+++ b/src/Models/ShallowWaterModels/update_shallow_water_state.jl
@@ -17,9 +17,7 @@ function update_state!(model::ShallowWaterModel, callbacks=[]; compute_tendencie
     calculate_diffusivities!(model.diffusivity_fields, model.closure, model)
 
     # Fill halos for velocities and tracers
-    fill_halo_regions!(merge(model.solution, model.tracers),
-                       model.clock,
-                       fields(model))
+    fill_halo_regions!(merge(model.solution, model.tracers))
 
     # Compute the velocities
 

From b2a29cd9f170839e5ced9c56e9347117cbd76f1a Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Fri, 17 Mar 2023 00:57:52 -0400
Subject: [PATCH 078/530] revert

---
 .../update_nonhydrostatic_model_state.jl                    | 6 +++---
 src/Models/ShallowWaterModels/update_shallow_water_state.jl | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl b/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
index b140049368..35659df2e2 100644
--- a/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
+++ b/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
@@ -19,7 +19,7 @@ function update_state!(model::NonhydrostaticModel, callbacks=[]; compute_tendenc
     foreach(mask_immersed_field!, model.tracers)
 
     # Fill halos for velocities and tracers
-    fill_halo_regions!(merge(model.velocities, model.tracers))
+    fill_halo_regions!(merge(model.velocities, model.tracers), model.clock, fields(model))
 
     # Compute auxiliary fields
     for aux_field in model.auxiliary_fields
@@ -28,10 +28,10 @@ function update_state!(model::NonhydrostaticModel, callbacks=[]; compute_tendenc
 
     # Calculate diffusivities
     calculate_diffusivities!(model.diffusivity_fields, model.closure, model)
-    fill_halo_regions!(model.diffusivity_fields)
+    fill_halo_regions!(model.diffusivity_fields, model.clock, fields(model))
 
     update_hydrostatic_pressure!(model)
-    fill_halo_regions!(model.pressures.pHY′)
+    fill_halo_regions!(model.pressures.pHY′, model.clock, fields(model))
 
     [callback(model) for callback in callbacks if isa(callback.callsite, UpdateStateCallsite)]
 
diff --git a/src/Models/ShallowWaterModels/update_shallow_water_state.jl b/src/Models/ShallowWaterModels/update_shallow_water_state.jl
index 4f7043bdb7..e842c652d0 100644
--- a/src/Models/ShallowWaterModels/update_shallow_water_state.jl
+++ b/src/Models/ShallowWaterModels/update_shallow_water_state.jl
@@ -17,7 +17,7 @@ function update_state!(model::ShallowWaterModel, callbacks=[]; compute_tendencie
     calculate_diffusivities!(model.diffusivity_fields, model.closure, model)
 
     # Fill halos for velocities and tracers
-    fill_halo_regions!(merge(model.solution, model.tracers))
+    fill_halo_regions!(merge(model.solution, model.tracers)model.clock, fields(model))
 
     # Compute the velocities
 

From 084dc5301ac07b16bb82e5c210c2aff6da936bad Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Fri, 17 Mar 2023 01:35:05 -0400
Subject: [PATCH 079/530] bugfix

---
 src/Models/ShallowWaterModels/update_shallow_water_state.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Models/ShallowWaterModels/update_shallow_water_state.jl b/src/Models/ShallowWaterModels/update_shallow_water_state.jl
index e842c652d0..083e422096 100644
--- a/src/Models/ShallowWaterModels/update_shallow_water_state.jl
+++ b/src/Models/ShallowWaterModels/update_shallow_water_state.jl
@@ -17,7 +17,7 @@ function update_state!(model::ShallowWaterModel, callbacks=[]; compute_tendencie
     calculate_diffusivities!(model.diffusivity_fields, model.closure, model)
 
     # Fill halos for velocities and tracers
-    fill_halo_regions!(merge(model.solution, model.tracers)model.clock, fields(model))
+    fill_halo_regions!(merge(model.solution, model.tracers), model.clock, fields(model))
 
     # Compute the velocities
 

From 83280dc00ecd69f4c1e0ea8cf7710e8f6f4d3e9f Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Sun, 19 Mar 2023 14:12:56 -0400
Subject: [PATCH 080/530] all reduce wizard

---
 src/Distributed/distributed_utils.jl | 4 ++++
 src/Simulations/time_step_wizard.jl  | 6 ++++++
 2 files changed, 10 insertions(+)

diff --git a/src/Distributed/distributed_utils.jl b/src/Distributed/distributed_utils.jl
index 7aed8d07e9..dc3696ddb3 100644
--- a/src/Distributed/distributed_utils.jl
+++ b/src/Distributed/distributed_utils.jl
@@ -4,6 +4,10 @@ using Oceananigans.Grids:
     left_halo_indices, right_halo_indices,
     underlying_left_halo_indices, underlying_right_halo_indices
 
+
+all_reduce(val, grid; op = +) = 
+    MPI.Allreduce(val, op, grid.architecture.communicator)
+
 # TODO: Move to Grids/grid_utils.jl
 
 #####
diff --git a/src/Simulations/time_step_wizard.jl b/src/Simulations/time_step_wizard.jl
index bc2aae38e3..dc241aea15 100644
--- a/src/Simulations/time_step_wizard.jl
+++ b/src/Simulations/time_step_wizard.jl
@@ -79,6 +79,8 @@ function TimeStepWizard(FT=Float64;
 end
 
 using Oceananigans.Grids: topology
+using Oceananigans.Distributed
+using Oceananigans.Distributed: all_reduce
 
 """
      new_time_step(old_Δt, wizard, model)
@@ -98,6 +100,10 @@ function new_time_step(old_Δt, wizard, model)
     new_Δt = max(wizard.min_change * old_Δt, new_Δt)
     new_Δt = clamp(new_Δt, wizard.min_Δt, wizard.max_Δt)
 
+    if model.architecture isa DistributedArch
+        new_Δt = all_reduce(new_Δt, model.grid; op = min)
+    end
+
     return new_Δt
 end
 

From c58e7c71960817ed3cf26736176ac63be2ed0080 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Mon, 20 Mar 2023 09:01:01 -0400
Subject: [PATCH 081/530] horizontally average Ri number (gaussian)

---
 .../ri_based_vertical_diffusivity.jl                          | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index 3782c62cb4..a7a77d325c 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -197,8 +197,8 @@ end
     κᵉ = ifelse(Qᵇ > 0, Cᵉ * Qᵇ / N², zero(grid))
     κᵉ = ifelse(entraining, Cᵉ, zero(grid))
 
-    # Shear mixing diffusivity and viscosity
-    Ri = Riᶜᶜᶠ(i, j, k, grid, velocities, tracers, buoyancy)
+    # Shear mixing diffusivity and viscosity (diffused in the horizontal to add non-locality)
+    Ri = ℑxyᶜᶜᵃ(i, j, k, grid, ℑxyᶠᶠᵃ, Riᶜᶜᶠ, velocities, tracers, buoyancy)
 
     τ = taper(tapering, Ri, Ri₀, Riᵟ)
     κ★ = κ₀ * τ

From 6ce9eed06abd4938268eabdb7f1f4c0dc4ba0a3c Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Mon, 20 Mar 2023 17:34:58 -0400
Subject: [PATCH 082/530] changed Ri number

---
 .../ri_based_vertical_diffusivity.jl                          | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index a7a77d325c..dfd226c172 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -150,8 +150,8 @@ const Tanh   = HyperbolicTangentRiDependentTapering
 @inline ϕ²(i, j, k, grid, ϕ, args...) = ϕ(i, j, k, grid, args...)^2
 
 @inline function Riᶜᶜᶠ(i, j, k, grid, velocities, tracers, buoyancy)
-    ∂z_u² = ℑxᶜᵃᵃ(i, j, k, grid, ϕ², ∂zᶠᶜᶠ, velocities.u)
-    ∂z_v² = ℑyᵃᶜᵃ(i, j, k, grid, ϕ², ∂zᶜᶠᶠ, velocities.v)
+    ∂z_u² = ℑxᶜᵃᵃ(i, j, k, grid, ∂zᶠᶜᶠ, velocities.u)^2
+    ∂z_v² = ℑyᵃᶜᵃ(i, j, k, grid, ∂zᶜᶠᶠ, velocities.v)^2
     S² = ∂z_u² + ∂z_v²
     N² = ∂z_b(i, j, k, grid, buoyancy, tracers)
     Ri = N² / S²

From aabef82b9aa3687c39c785b5d60e3838922a004d Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 21 Mar 2023 19:28:40 -0400
Subject: [PATCH 083/530] correct initialization of eta

---
 .../split_explicit_free_surface_kernels.jl                     | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
index a7c7641e24..a29eddd93c 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
@@ -281,8 +281,7 @@ ab2_step_free_surface!(free_surface::SplitExplicitFreeSurface, model, Δt, χ) =
     
 function initialize_free_surface!(sefs::SplitExplicitFreeSurface, grid, velocities)
     @apply_regionally barotropic_mode!(sefs.state.U̅, sefs.state.V̅, grid, velocities.u, velocities.v)
-    fill_halo_regions!((sefs.state.U̅, sefs.state.V̅))
-
+    fill_halo_regions!((sefs.state.U̅, sefs.state.V̅, sefs.η))
     return nothing
 end
 

From 006c7176c07b30cee928102652a3606a2ce4d2a4 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 21 Mar 2023 20:13:12 -0400
Subject: [PATCH 084/530] try float32?

---
 src/Distributed/distributed_grids.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Distributed/distributed_grids.jl b/src/Distributed/distributed_grids.jl
index 3c72f99943..e3bd66d8e6 100644
--- a/src/Distributed/distributed_grids.jl
+++ b/src/Distributed/distributed_grids.jl
@@ -145,7 +145,7 @@ function LatitudeLongitudeGrid(arch::DistributedArch,
                                                          Δλᶠᵃᵃ, Δλᶜᵃᵃ, λᶠᵃᵃ, λᶜᵃᵃ,
                                                          Δφᵃᶠᵃ, Δφᵃᶜᵃ, φᵃᶠᵃ, φᵃᶜᵃ,
                                                          Δzᵃᵃᶠ, Δzᵃᵃᶜ, zᵃᵃᶠ, zᵃᵃᶜ,
-                                                         (nothing for i=1:10)..., radius)
+                                                         (nothing for i=1:10)..., FT(radius))
 
     return !precompute_metrics ? preliminary_grid : with_precomputed_metrics(preliminary_grid)
 end

From 00d984fb209b0b460bab1977d082caaaa5bcb0c7 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 22 Mar 2023 18:33:41 -0400
Subject: [PATCH 085/530] time average (waiting for the other PR to merge)

---
 .../ri_based_vertical_diffusivity.jl                        | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index dfd226c172..ae8f15a946 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -204,8 +204,10 @@ end
     κ★ = κ₀ * τ
     ν★ = ν₀ * τ
 
-    @inbounds diffusivities.κ[i, j, k] = κᶜ + κᵉ + κ★
-    @inbounds diffusivities.ν[i, j, k] = ν★
+    κⁿ = κᶜ + κᵉ + κ★
+    νⁿ = ν★
+    @inbounds diffusivities.κ[i, j, k] = 0.5 * (diffusivities.κ[i, j, k] + κⁿ)
+    @inbounds diffusivities.ν[i, j, k] = 0.5 * (diffusivities.ν[i, j, k] + νⁿ)
 end
 
 #####

From dbb995f7b56f6b2b4f160ede295e716b7fcfc317 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 24 Mar 2023 15:12:30 -0400
Subject: [PATCH 086/530] define active_cell

---
 src/ImmersedBoundaries/active_cells_map.jl | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/ImmersedBoundaries/active_cells_map.jl b/src/ImmersedBoundaries/active_cells_map.jl
index 2cf9f4d224..62bed12029 100644
--- a/src/ImmersedBoundaries/active_cells_map.jl
+++ b/src/ImmersedBoundaries/active_cells_map.jl
@@ -1,5 +1,5 @@
 using Oceananigans
-using Oceananigans.Grids: AbstractGrid, active_cell
+using Oceananigans.Grids: AbstractGrid
 
 using KernelAbstractions: @kernel, @index
 
@@ -26,8 +26,10 @@ function ImmersedBoundaryGrid{TX, TY, TZ}(grid, ib; active_cells_map = false) wh
     return ImmersedBoundaryGrid{TX, TY, TZ}(grid, ib, map)
 end
 
+@inline active_cell(i, j, k, grid, ib) = !immersed_cell(i, j, k, grid, ib)
+
 function compute_active_cells(grid, ib)
-    is_immersed_operation = KernelFunctionOperation{Center, Center, Center}(active_cell, grid)
+    is_immersed_operation = KernelFunctionOperation{Center, Center, Center}(active_cell, grid, ib)
     active_cells_field = Field{Center, Center, Center}(grid, Bool)
     set!(active_cells_field, is_immersed_operation)
     return active_cells_field

From 79ea0e5a88ec0226a348e08c9cf203cec501658b Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 24 Mar 2023 15:17:16 -0400
Subject: [PATCH 087/530] for the moment this

---
 src/ImmersedBoundaries/active_cells_map.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ImmersedBoundaries/active_cells_map.jl b/src/ImmersedBoundaries/active_cells_map.jl
index 62bed12029..9114bd5283 100644
--- a/src/ImmersedBoundaries/active_cells_map.jl
+++ b/src/ImmersedBoundaries/active_cells_map.jl
@@ -29,7 +29,7 @@ end
 @inline active_cell(i, j, k, grid, ib) = !immersed_cell(i, j, k, grid, ib)
 
 function compute_active_cells(grid, ib)
-    is_immersed_operation = KernelFunctionOperation{Center, Center, Center}(active_cell, grid, ib)
+    is_immersed_operation = KernelFunctionOperation{Center, Center, Center}(active_cell, grid; computed_dependencies = (ib, ))
     active_cells_field = Field{Center, Center, Center}(grid, Bool)
     set!(active_cells_field, is_immersed_operation)
     return active_cells_field

From 308949337499908ec15027e05569ad0c4dfdf3ff Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 27 Mar 2023 13:56:21 -0400
Subject: [PATCH 088/530] NVTX store tendencies

---
 src/TimeSteppers/store_tendencies.jl | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/TimeSteppers/store_tendencies.jl b/src/TimeSteppers/store_tendencies.jl
index 06d179bd3a..55eba418b3 100644
--- a/src/TimeSteppers/store_tendencies.jl
+++ b/src/TimeSteppers/store_tendencies.jl
@@ -1,6 +1,7 @@
 using Oceananigans: prognostic_fields
 using Oceananigans.Grids: AbstractGrid
 
+using NVTX
 using Oceananigans.Utils: launch!
 
 """ Store source terms for `u`, `v`, and `w`. """
@@ -14,10 +15,12 @@ function store_tendencies!(model)
     model_fields = prognostic_fields(model)
 
     for field_name in keys(model_fields)
-        launch!(model.architecture, model.grid, :xyz, store_field_tendencies!,
-                model.timestepper.G⁻[field_name],
-                model.grid,
-                model.timestepper.Gⁿ[field_name])
+        NVTX.@range "store tendencies for $(field_name)" begin
+            launch!(model.architecture, model.grid, :xyz, store_field_tendencies!,
+                    model.timestepper.G⁻[field_name],
+                    model.grid,
+                    model.timestepper.Gⁿ[field_name])
+        end
     end
 
     return nothing

From 076cea9e2721883442e5da8eb3d519c207cbf4fc Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 27 Mar 2023 16:07:36 -0400
Subject: [PATCH 089/530] adding NVTX to Project.toml

---
 Project.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Project.toml b/Project.toml
index b5c4025319..3852981a5f 100644
--- a/Project.toml
+++ b/Project.toml
@@ -23,6 +23,7 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
 MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
 NCDatasets = "85f8d34a-cbdd-5861-8df4-14fed0d494ab"
+NVTX = "5da4648a-3479-48b8-97b9-01cb529c0a1f"
 OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
 OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
 PencilArrays = "0e08944d-e94e-41b1-9406-dcf66b6a9d2e"

From eef4dbce2a96493b4259ee4f54a460ff2b7d91a2 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 31 Mar 2023 14:27:43 -0400
Subject: [PATCH 090/530] fix tag issues

---
 src/Distributed/halo_communication.jl            |  4 ++--
 .../ri_based_vertical_diffusivity.jl             | 16 ++++++++--------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index d90794390c..ecfec172da 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -53,12 +53,12 @@ opposite_side = Dict(
 
 RANK_DIGITS = 2
 ID_DIGITS   = 2
-LOC_DIGITS  = 2
+LOC_DIGITS  = 1
 
 @inline loc_id(::Nothing) = 0
 @inline loc_id(::Face)    = 1
 @inline loc_id(::Center)  = 2
-@inline location_id(X, Y, Z) = loc_id(X) + 3*loc_id(Y) + 9*loc_id(Z)
+@inline location_id(X, Y, Z) = loc_id(Z)
 
 for side in sides
     side_str = string(side)
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index ae8f15a946..901066fe11 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -63,12 +63,12 @@ Keyword Arguments
 function RiBasedVerticalDiffusivity(time_discretization = VerticallyImplicitTimeDiscretization(),
                                     FT = Float64;
                                     Ri_dependent_tapering = HyperbolicTangentRiDependentTapering(),
-                                    ν₀  = 0.30,
-                                    κ₀  = 0.42,
-                                    κᶜ  = 4.0,
-                                    Cᵉ  = 0.57,
-                                    Ri₀ = 0.27,
-                                    Riᵟ = 0.20,
+                                    ν₀  = 0.7,
+                                    κ₀  = 0.5,
+                                    κᶜ  = 1.7,
+                                    Cᵉ  = 0.1,
+                                    Ri₀ = 0.1,
+                                    Riᵟ = 0.40,
                                     warning = true)
     if warning
         @warn "RiBasedVerticalDiffusivity is an experimental turbulence closure that \n" *
@@ -206,8 +206,8 @@ end
 
     κⁿ = κᶜ + κᵉ + κ★
     νⁿ = ν★
-    @inbounds diffusivities.κ[i, j, k] = 0.5 * (diffusivities.κ[i, j, k] + κⁿ)
-    @inbounds diffusivities.ν[i, j, k] = 0.5 * (diffusivities.ν[i, j, k] + νⁿ)
+    @inbounds diffusivities.κ[i, j, k] = (0.6 * diffusivities.κ[i, j, k] + κⁿ) / 1.6
+    @inbounds diffusivities.ν[i, j, k] = (0.6 * diffusivities.ν[i, j, k] + νⁿ) / 1.6
 end
 
 #####

From 37ded32969d487e77e2a00e6f9890c839edf3228 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 31 Mar 2023 14:28:35 -0400
Subject: [PATCH 091/530] loc first (max 2)

---
 src/Distributed/halo_communication.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index ecfec172da..c4ef546306 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -71,7 +71,7 @@ for side in sides
             from_digits = string(local_rank, pad=RANK_DIGITS)
             to_digits   = string(rank_to_send_to, pad=RANK_DIGITS)
             side_digit  = string(side_id[Symbol($side_str)])
-            return parse(Int, field_id * loc_id * side_digit * from_digits * to_digits)
+            return parse(Int, loc_id * field_id * side_digit * from_digits * to_digits)
         end
 
         function $recv_tag_fn_name(arch, location, local_rank, rank_to_recv_from)
@@ -80,7 +80,7 @@ for side in sides
             from_digits = string(rank_to_recv_from, pad=RANK_DIGITS)
             to_digits   = string(local_rank, pad=RANK_DIGITS)
             side_digit  = string(side_id[opposite_side[Symbol($side_str)]])
-            return parse(Int, field_id * loc_id * side_digit * from_digits * to_digits)
+            return parse(Int, loc_id * field_id * side_digit * from_digits * to_digits)
         end
     end
 end

From 1a67ea91dd7c0be3a4568847d5e026561915ceb5 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sat, 1 Apr 2023 10:40:12 -0400
Subject: [PATCH 092/530] back to previous Ri and use threads

---
 src/Distributed/halo_communication.jl            | 16 ++++++++++++----
 src/Distributed/interleave_comm_and_comp.jl      |  2 +-
 .../ri_based_vertical_diffusivity.jl             |  2 +-
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index c4ef546306..c47fe01fda 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -1,4 +1,4 @@
-using KernelAbstractions: @kernel, @index
+using KernelAbstractions: @kernel, @index, priority
 using OffsetArrays: OffsetArray
 using CUDA: synchronize
 import Oceananigans.Utils: sync_device!
@@ -299,9 +299,12 @@ for side in sides
 
             @debug "Sending " * $side_str * " halo: local_rank=$local_rank, rank_to_send_to=$rank_to_send_to, send_tag=$send_tag"
             
-            send_req = MPI.Isend(send_buffer, rank_to_send_to, send_tag, arch.communicator)
+            send_event = Threads.@spawn begin
+                send_req = MPI.Isend(send_buffer, rank_to_send_to, send_tag, arch.communicator)
+                cooperative_test!(send_req)
+            end
 
-            return send_req
+            return send_event
         end
 
         @inline $get_side_send_buffer(c, grid, side_location, buffers, ::ViewsDistributedArch) = $underlying_side_boundary(c, grid, side_location)
@@ -328,7 +331,12 @@ for side in sides
             @debug "Receiving " * $side_str * " halo: local_rank=$local_rank, rank_to_recv_from=$rank_to_recv_from, recv_tag=$recv_tag"
             recv_req = MPI.Irecv!(recv_buffer, rank_to_recv_from, recv_tag, arch.communicator)
 
-            return recv_req
+            recv_event = Threads.@spawn begin
+                KernelAbstractions.priority!(device(arch), :high)
+                cooperative_test!(recv_req)
+            end
+
+            return recv_event
         end
 
         @inline $get_side_recv_buffer(c, grid, side_location, buffers, ::ViewsDistributedArch) = $underlying_side_halo(c, grid, side_location)
diff --git a/src/Distributed/interleave_comm_and_comp.jl b/src/Distributed/interleave_comm_and_comp.jl
index 4bad558f6b..14c0d66af3 100644
--- a/src/Distributed/interleave_comm_and_comp.jl
+++ b/src/Distributed/interleave_comm_and_comp.jl
@@ -56,7 +56,7 @@ function complete_halo_communication!(field)
 
     # Wait for outstanding requests
     if !isempty(arch.mpi_requests) 
-        MPI.Waitall!(arch.mpi_requests)
+        cooperative_waitall!(arch.mpi_requests)
 
         # Reset MPI tag
         arch.mpi_tag[1] -= arch.mpi_tag[1]
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index 901066fe11..ae8c98c321 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -198,7 +198,7 @@ end
     κᵉ = ifelse(entraining, Cᵉ, zero(grid))
 
     # Shear mixing diffusivity and viscosity (diffused in the horizontal to add non-locality)
-    Ri = ℑxyᶜᶜᵃ(i, j, k, grid, ℑxyᶠᶠᵃ, Riᶜᶜᶠ, velocities, tracers, buoyancy)
+    Ri = Riᶜᶜᶠ(i, j, k, grid, velocities, tracers, buoyancy)
 
     τ = taper(tapering, Ri, Ri₀, Riᵟ)
     κ★ = κ₀ * τ

From 52beba602f4672804667031da67958e59f910f4b Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sat, 1 Apr 2023 10:45:34 -0400
Subject: [PATCH 093/530] compute N2 only once

---
 .../ri_based_vertical_diffusivity.jl                       | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index ae8c98c321..308e73c1ea 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -149,11 +149,10 @@ const Tanh   = HyperbolicTangentRiDependentTapering
 
 @inline ϕ²(i, j, k, grid, ϕ, args...) = ϕ(i, j, k, grid, args...)^2
 
-@inline function Riᶜᶜᶠ(i, j, k, grid, velocities, tracers, buoyancy)
+@inline function Riᶜᶜᶠ(i, j, k, grid, velocities, N²)
     ∂z_u² = ℑxᶜᵃᵃ(i, j, k, grid, ∂zᶠᶜᶠ, velocities.u)^2
     ∂z_v² = ℑyᵃᶜᵃ(i, j, k, grid, ∂zᶜᶠᶠ, velocities.v)^2
     S² = ∂z_u² + ∂z_v²
-    N² = ∂z_b(i, j, k, grid, buoyancy, tracers)
     Ri = N² / S²
 
     # Clip N² and avoid NaN
@@ -197,8 +196,8 @@ end
     κᵉ = ifelse(Qᵇ > 0, Cᵉ * Qᵇ / N², zero(grid))
     κᵉ = ifelse(entraining, Cᵉ, zero(grid))
 
-    # Shear mixing diffusivity and viscosity (diffused in the horizontal to add non-locality)
-    Ri = Riᶜᶜᶠ(i, j, k, grid, velocities, tracers, buoyancy)
+    # Shear mixing diffusivity and viscosity
+    Ri = Riᶜᶜᶠ(i, j, k, grid, velocities, N²)
 
     τ = taper(tapering, Ri, Ri₀, Riᵟ)
     κ★ = κ₀ * τ

From a5d8b16b4078d3d5b672073e60ff4f69d173c54c Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sat, 1 Apr 2023 10:53:23 -0400
Subject: [PATCH 094/530] cleanup

---
 .../ri_based_vertical_diffusivity.jl                         | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index 308e73c1ea..5fd1062aa5 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -147,8 +147,6 @@ const Tanh   = HyperbolicTangentRiDependentTapering
 @inline taper(::Exp,    x::T, x₀, δ) where T = exp(- max(zero(T), (x - x₀) / δ))
 @inline taper(::Tanh,   x::T, x₀, δ) where T = (one(T) - tanh((x - x₀) / δ)) / 2
 
-@inline ϕ²(i, j, k, grid, ϕ, args...) = ϕ(i, j, k, grid, args...)^2
-
 @inline function Riᶜᶜᶠ(i, j, k, grid, velocities, N²)
     ∂z_u² = ℑxᶜᵃᵃ(i, j, k, grid, ∂zᶠᶜᶠ, velocities.u)^2
     ∂z_v² = ℑyᵃᶜᵃ(i, j, k, grid, ∂zᶜᶠᶠ, velocities.v)^2
@@ -159,9 +157,6 @@ const Tanh   = HyperbolicTangentRiDependentTapering
     return ifelse(N² <= 0, zero(grid), Ri)
 end
 
-@inline Riᶜᶜᶜ(i, j, k, grid, velocities, tracers, buoyancy) =
-    ℑzᵃᵃᶜ(i, j, k, grid, Riᶜᶜᶠ, velocities, tracers, buoyancy)
-
 @kernel function compute_ri_based_diffusivities!(diffusivities, offs, grid, closure::FlavorOfRBVD,
                                                  velocities, tracers, buoyancy, tracer_bcs, clock)
 

From c9272e81973b24413a3c7fb1e2a4ba8b3711ac45 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sat, 1 Apr 2023 11:01:18 -0400
Subject: [PATCH 095/530] bugfix

---
 src/Distributed/halo_communication.jl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index c47fe01fda..eb36708cf1 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -332,7 +332,6 @@ for side in sides
             recv_req = MPI.Irecv!(recv_buffer, rank_to_recv_from, recv_tag, arch.communicator)
 
             recv_event = Threads.@spawn begin
-                KernelAbstractions.priority!(device(arch), :high)
                 cooperative_test!(recv_req)
             end
 

From 50dc84f36df6e3aa07ab9cb1ce64c596d0aedb90 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sat, 1 Apr 2023 11:05:08 -0400
Subject: [PATCH 096/530] new priority

---
 src/Distributed/halo_communication.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index eb36708cf1..7775918185 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -1,4 +1,4 @@
-using KernelAbstractions: @kernel, @index, priority
+using KernelAbstractions: @kernel, @index, priority!
 using OffsetArrays: OffsetArray
 using CUDA: synchronize
 import Oceananigans.Utils: sync_device!
@@ -332,6 +332,7 @@ for side in sides
             recv_req = MPI.Irecv!(recv_buffer, rank_to_recv_from, recv_tag, arch.communicator)
 
             recv_event = Threads.@spawn begin
+                priority!(device(arch), :high)
                 cooperative_test!(recv_req)
             end
 

From 258ba0597f6b9b21d64f1b0990127ddedc627f86 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sat, 1 Apr 2023 11:24:44 -0400
Subject: [PATCH 097/530] bugfix

---
 src/Distributed/halo_communication.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 7775918185..23a3c7f10d 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -194,7 +194,7 @@ function fill_halo_event!(task, halo_tuple, c, indices, loc, arch::DistributedAr
     end
 
     # Syncronous MPI fill_halo_event!
-    MPI.Waitall!(requests)
+    cooperative_waitall!(requests)
     # Reset MPI tag
     arch.mpi_tag[1] -= arch.mpi_tag[1]
 

From 10e6915fffdc1bcfcedeaa79e740610d2d778cff Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sat, 1 Apr 2023 11:27:27 -0400
Subject: [PATCH 098/530] remove NVTX

---
 src/TimeSteppers/store_tendencies.jl | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/src/TimeSteppers/store_tendencies.jl b/src/TimeSteppers/store_tendencies.jl
index 55eba418b3..d3625a849e 100644
--- a/src/TimeSteppers/store_tendencies.jl
+++ b/src/TimeSteppers/store_tendencies.jl
@@ -1,7 +1,6 @@
 using Oceananigans: prognostic_fields
 using Oceananigans.Grids: AbstractGrid
 
-using NVTX
 using Oceananigans.Utils: launch!
 
 """ Store source terms for `u`, `v`, and `w`. """
@@ -15,12 +14,10 @@ function store_tendencies!(model)
     model_fields = prognostic_fields(model)
 
     for field_name in keys(model_fields)
-        NVTX.@range "store tendencies for $(field_name)" begin
-            launch!(model.architecture, model.grid, :xyz, store_field_tendencies!,
-                    model.timestepper.G⁻[field_name],
-                    model.grid,
-                    model.timestepper.Gⁿ[field_name])
-        end
+        launch!(model.architecture, model.grid, :xyz, store_field_tendencies!,
+            model.timestepper.G⁻[field_name],
+            model.grid,
+            model.timestepper.Gⁿ[field_name])
     end
 
     return nothing

From c87bd09d83a0fe94a944ad1b36a6eab81694fa54 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sat, 1 Apr 2023 11:29:44 -0400
Subject: [PATCH 099/530] remove NVTX

---
 Project.toml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 3852981a5f..b5c4025319 100644
--- a/Project.toml
+++ b/Project.toml
@@ -23,7 +23,6 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
 MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
 NCDatasets = "85f8d34a-cbdd-5861-8df4-14fed0d494ab"
-NVTX = "5da4648a-3479-48b8-97b9-01cb529c0a1f"
 OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
 OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
 PencilArrays = "0e08944d-e94e-41b1-9406-dcf66b6a9d2e"

From 1f9404a5e5bd79f998c2987086bebeb22b75be8c Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sat, 1 Apr 2023 12:53:39 -0400
Subject: [PATCH 100/530] bugfix

---
 src/Distributed/multi_architectures.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Distributed/multi_architectures.jl b/src/Distributed/multi_architectures.jl
index 2d1f8e2332..bfa4ef9d36 100644
--- a/src/Distributed/multi_architectures.jl
+++ b/src/Distributed/multi_architectures.jl
@@ -105,7 +105,7 @@ function DistributedArch(child_architecture = CPU();
         isnothing(devices) ? device!(node_rank % ndevices()) : device!(devices[node_rank+1]) 
     end
 
-    mpi_requests = enable_overlapped_computation ? MPI.Request[] : nothing
+    mpi_requests = enable_overlapped_computation ? Task[] : nothing
 
     B = use_buffers
     M = typeof(mpi_requests)

From ee2e3b10558c9b487e4f1a1d2fa28b11c7f44fac Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sat, 1 Apr 2023 15:41:36 -0400
Subject: [PATCH 101/530] remove priority

---
 src/Distributed/halo_communication.jl | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 23a3c7f10d..9f8dec42e2 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -1,4 +1,4 @@
-using KernelAbstractions: @kernel, @index, priority!
+using KernelAbstractions: @kernel, @index
 using OffsetArrays: OffsetArray
 using CUDA: synchronize
 import Oceananigans.Utils: sync_device!
@@ -332,7 +332,6 @@ for side in sides
             recv_req = MPI.Irecv!(recv_buffer, rank_to_recv_from, recv_tag, arch.communicator)
 
             recv_event = Threads.@spawn begin
-                priority!(device(arch), :high)
                 cooperative_test!(recv_req)
             end
 

From 92d60f8999684d7348ace7861e88b02f360dfd23 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 4 Apr 2023 00:01:01 -0400
Subject: [PATCH 102/530] test it out with synchronize

---
 src/Distributed/halo_communication.jl | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 9f8dec42e2..1725cd27df 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -28,7 +28,9 @@ import Oceananigans.BoundaryConditions:
     fill_south_and_north_halo!,
     fill_bottom_and_top_halo!
 
-@inline sync_device!(::GPU) = synchronize()
+@inline sync_device!(::CPU)                 = nothing
+@inline sync_device!(::GPU)                 = synchronize()
+@inline sync_device!(arch::DistributedArch) = sync_device!(arch.child_architecture)
 
 #####
 ##### MPI tags for halo communication BCs
@@ -333,6 +335,7 @@ for side in sides
 
             recv_event = Threads.@spawn begin
                 cooperative_test!(recv_req)
+                sync_device!(arch)
             end
 
             return recv_event

From f2814319efa568bf73954a8320c0371ca219e4d2 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 4 Apr 2023 00:17:41 -0400
Subject: [PATCH 103/530] high priority for recv

---
 src/Distributed/halo_communication.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 1725cd27df..0b1fe62746 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -1,4 +1,4 @@
-using KernelAbstractions: @kernel, @index
+using KernelAbstractions: @kernel, @index, priority!
 using OffsetArrays: OffsetArray
 using CUDA: synchronize
 import Oceananigans.Utils: sync_device!
@@ -334,6 +334,7 @@ for side in sides
             recv_req = MPI.Irecv!(recv_buffer, rank_to_recv_from, recv_tag, arch.communicator)
 
             recv_event = Threads.@spawn begin
+                priority!(device(arch), :high)
                 cooperative_test!(recv_req)
                 sync_device!(arch)
             end

From ac369901b0e3e10bc7ff2c42242f47a2c8c348b4 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 4 Apr 2023 03:29:07 -0400
Subject: [PATCH 104/530] test without events

---
 src/Distributed/halo_communication.jl | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 0b1fe62746..7c8df0e6b0 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -334,7 +334,15 @@ for side in sides
             recv_req = MPI.Irecv!(recv_buffer, rank_to_recv_from, recv_tag, arch.communicator)
 
             recv_event = Threads.@spawn begin
-                priority!(device(arch), :high)
+                range = CUDA.priority_range()
+                priority = last(range)
+            
+                old_stream = CUDA.stream()
+                r_flags = Ref{Cuint}()
+                CUDA.cuStreamGetFlags(old_stream, r_flags)
+                flags = CUDA.CUstream_flags_enum(r_flags[])
+                new_stream = CUDA.CuStream(; flags, priority)
+                CUDA.stream!(new_stream)
                 cooperative_test!(recv_req)
                 sync_device!(arch)
             end

From 73c2e3879b736017e7f423a0b42a6635f11c4039 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 4 Apr 2023 03:32:50 -0400
Subject: [PATCH 105/530] test an hypothesis

---
 src/Distributed/halo_communication.jl | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 7c8df0e6b0..301e994d91 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -1,6 +1,8 @@
 using KernelAbstractions: @kernel, @index, priority!
 using OffsetArrays: OffsetArray
 using CUDA: synchronize
+using CUDA: cuStreamGetFlags, stream, priority_range, CUstream_flags_enum, CuStream, stream!
+
 import Oceananigans.Utils: sync_device!
 using Oceananigans.Fields: fill_west_and_east_send_buffers!, 
                            fill_south_and_north_send_buffers!, 
@@ -334,15 +336,16 @@ for side in sides
             recv_req = MPI.Irecv!(recv_buffer, rank_to_recv_from, recv_tag, arch.communicator)
 
             recv_event = Threads.@spawn begin
-                range = CUDA.priority_range()
+                # Not pick up event for the moment
+                range = priority_range()
                 priority = last(range)
             
-                old_stream = CUDA.stream()
+                old_stream = stream()
                 r_flags = Ref{Cuint}()
-                CUDA.cuStreamGetFlags(old_stream, r_flags)
-                flags = CUDA.CUstream_flags_enum(r_flags[])
-                new_stream = CUDA.CuStream(; flags, priority)
-                CUDA.stream!(new_stream)
+                cuStreamGetFlags(old_stream, r_flags)
+                flags = CUstream_flags_enum(r_flags[])
+                new_stream = CuStream(; flags, priority)
+                stream!(new_stream)
                 cooperative_test!(recv_req)
                 sync_device!(arch)
             end

From 613c374d89a700aaffe18eb3cccdec1e656e8f7f Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 4 Apr 2023 03:44:40 -0400
Subject: [PATCH 106/530] remove more stuff

---
 src/Distributed/halo_communication.jl | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 301e994d91..d5359c8213 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -340,11 +340,7 @@ for side in sides
                 range = priority_range()
                 priority = last(range)
             
-                old_stream = stream()
-                r_flags = Ref{Cuint}()
-                cuStreamGetFlags(old_stream, r_flags)
-                flags = CUstream_flags_enum(r_flags[])
-                new_stream = CuStream(; flags, priority)
+                new_stream = CuStream(; priority)
                 stream!(new_stream)
                 cooperative_test!(recv_req)
                 sync_device!(arch)

From aa8033205b8dae0f4dbf17a5c7e3ca887235c310 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 4 Apr 2023 03:47:32 -0400
Subject: [PATCH 107/530] try a thing

---
 src/Distributed/halo_communication.jl | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index d5359c8213..c5728f5b2e 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -336,12 +336,17 @@ for side in sides
             recv_req = MPI.Irecv!(recv_buffer, rank_to_recv_from, recv_tag, arch.communicator)
 
             recv_event = Threads.@spawn begin
-                # Not pick up event for the moment
-                range = priority_range()
-                priority = last(range)
+
+                # range = priority_range()
+                # priority = last(range)
             
-                new_stream = CuStream(; priority)
-                stream!(new_stream)
+                # old_stream = stream()
+                # r_flags = Ref{Cuint}()
+                # cuStreamGetFlags(old_stream, r_flags)
+                # flags = CUstream_flags_enum(r_flags[])
+                # new_stream = CuStream(; flags, priority)
+                # stream!(new_stream)
+                priority!(device(arch), :high)
                 cooperative_test!(recv_req)
                 sync_device!(arch)
             end

From 284ad22a8a25a86ecdc0efe78abbfb44a23c975b Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 4 Apr 2023 04:06:01 -0400
Subject: [PATCH 108/530] update packages

---
 Manifest.toml        | 94 ++++++++++++++++++++++----------------------
 src/Architectures.jl |  3 +-
 2 files changed, 49 insertions(+), 48 deletions(-)

diff --git a/Manifest.toml b/Manifest.toml
index 8a280b21f2..c2ead44498 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -20,9 +20,9 @@ version = "2.3.0+1"
 
 [[deps.AbstractFFTs]]
 deps = ["ChainRulesCore", "LinearAlgebra"]
-git-tree-sha1 = "69f7020bd72f069c219b5e8c236c1fa90d2cb409"
+git-tree-sha1 = "16b6dbc4cf7caee4e1e75c49485ec67b667098a0"
 uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
-version = "1.2.1"
+version = "1.3.1"
 
 [[deps.Adapt]]
 deps = ["LinearAlgebra", "Requires"]
@@ -42,9 +42,9 @@ version = "1.1.1"
 
 [[deps.ArrayInterface]]
 deps = ["Adapt", "LinearAlgebra", "Requires", "SnoopPrecompile", "SparseArrays", "SuiteSparse"]
-git-tree-sha1 = "a89acc90c551067cd84119ff018619a1a76c6277"
+git-tree-sha1 = "38911c7737e123b28182d89027f4216cfc8a9da7"
 uuid = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
-version = "7.2.1"
+version = "7.4.3"
 
 [[deps.Artifacts]]
 uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
@@ -77,29 +77,27 @@ version = "0.1.2"
 
 [[deps.CUDA]]
 deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Preferences", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "UnsafeAtomicsLLVM"]
-git-tree-sha1 = "f659a5cac9fec5f47d4f62baa6f441e3d57b23c1"
-repo-rev = "vc/ka_transition"
-repo-url = "https://github.com/JuliaGPU/CUDA.jl.git"
+git-tree-sha1 = "6591ddc73adb429b9d97145c8197a0ac81664ab4"
 uuid = "052768ef-5323-5732-b1bb-66c8b64840ba"
-version = "4.0.1"
+version = "4.1.3"
 
 [[deps.CUDA_Driver_jll]]
 deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"]
-git-tree-sha1 = "75d7896d1ec079ef10d3aee8f3668c11354c03a1"
+git-tree-sha1 = "10ca2b63b496edc09258b3de5d1aa64094b18b1d"
 uuid = "4ee394cb-3365-5eb0-8335-949819d2adfc"
-version = "0.2.0+0"
+version = "0.5.0+0"
 
 [[deps.CUDA_Runtime_Discovery]]
 deps = ["Libdl"]
-git-tree-sha1 = "58dd8ec29f54f08c04b052d2c2fa6760b4f4b3a4"
+git-tree-sha1 = "6c8fceaaa6850dea627288ac3bb86fdcdf05e326"
 uuid = "1af6417a-86b4-443c-805f-a4643ffb695f"
-version = "0.1.1"
+version = "0.2.0"
 
 [[deps.CUDA_Runtime_jll]]
-deps = ["Artifacts", "CUDA_Driver_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg", "TOML"]
-git-tree-sha1 = "d3e6ccd30f84936c1a3a53d622d85d7d3f9b9486"
+deps = ["Artifacts", "CUDA_Driver_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
+git-tree-sha1 = "802b1f2220fd43251d343219adf478e6b7992bd4"
 uuid = "76a88914-d11a-5bdc-97e0-2f5a05c973a2"
-version = "0.2.3+2"
+version = "0.5.0+0"
 
 [[deps.ChainRulesCore]]
 deps = ["Compat", "LinearAlgebra", "SparseArrays"]
@@ -187,9 +185,9 @@ uuid = "b305315f-e792-5b7a-8f41-49f472929428"
 version = "1.0.1"
 
 [[deps.ExprTools]]
-git-tree-sha1 = "56559bbef6ca5ea0c0818fa5c90320398a6fbf8d"
+git-tree-sha1 = "c1d06d129da9f55715c6c212866f5b1bddc5fa00"
 uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
-version = "0.1.8"
+version = "0.1.9"
 
 [[deps.FFTW]]
 deps = ["AbstractFFTs", "FFTW_jll", "LinearAlgebra", "MKL_jll", "Preferences", "Reexport"]
@@ -214,9 +212,9 @@ uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"
 
 [[deps.GPUArrays]]
 deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"]
-git-tree-sha1 = "a28f752ffab0ccd6660fc7af5ad1c9ad176f45f7"
+git-tree-sha1 = "9ade6983c3dbbd492cf5729f865fe030d1541463"
 uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
-version = "8.6.3"
+version = "8.6.6"
 
 [[deps.GPUArraysCore]]
 deps = ["Adapt"]
@@ -225,10 +223,10 @@ uuid = "46192b85-c4d5-4398-a991-12ede77f4527"
 version = "0.1.4"
 
 [[deps.GPUCompiler]]
-deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"]
-git-tree-sha1 = "95185985a5d2388c6d0fedb06181ad4ddd40e0cb"
+deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"]
+git-tree-sha1 = "590d394bad1055b798b2f9b308327ba871b7badf"
 uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
-version = "0.17.2"
+version = "0.19.0"
 
 [[deps.Glob]]
 git-tree-sha1 = "4df9f7e06108728ebf00a0a11edee4b29a482bb2"
@@ -310,23 +308,21 @@ version = "1.12.0"
 
 [[deps.KernelAbstractions]]
 deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "SnoopPrecompile", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"]
-git-tree-sha1 = "a2fc41047b3dbeb9bbfc4c3a39ef1aaa83c35f1c"
-repo-rev = "vc/device_to_backend"
-repo-url = "https://github.com/JuliaGPU/KernelAbstractions.jl.git"
+git-tree-sha1 = "350a880e80004f4d5d82a17f737d8fcdc56c3462"
 uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
-version = "0.9.0"
+version = "0.9.1"
 
 [[deps.LLVM]]
 deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
-git-tree-sha1 = "df115c31f5c163697eede495918d8e85045c8f04"
+git-tree-sha1 = "a8960cae30b42b66dd41808beb76490519f6f9e2"
 uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
-version = "4.16.0"
+version = "5.0.0"
 
 [[deps.LLVMExtra_jll]]
-deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg", "TOML"]
-git-tree-sha1 = "7718cf44439c676bc0ec66a87099f41015a522d6"
+deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
+git-tree-sha1 = "09b7505cc0b1cee87e5d4a26eea61d2e1b0dcd35"
 uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
-version = "0.0.16+2"
+version = "0.0.21+0"
 
 [[deps.LazyArtifacts]]
 deps = ["Artifacts", "Pkg"]
@@ -387,9 +383,9 @@ version = "0.20.8"
 
 [[deps.MPICH_jll]]
 deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "MPIPreferences", "TOML"]
-git-tree-sha1 = "7ec808cad4f3940316c015cb16608e4e632c2c89"
+git-tree-sha1 = "d790fbd913f85e8865c55bf4725aff197c5155c8"
 uuid = "7cb0a576-ebde-5e09-9194-50597f1243b4"
-version = "4.1.0+1"
+version = "4.1.1+1"
 
 [[deps.MPIPreferences]]
 deps = ["Libdl", "Preferences"]
@@ -398,10 +394,10 @@ uuid = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267"
 version = "0.1.7"
 
 [[deps.MPItrampoline_jll]]
-deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "MPIPreferences", "Pkg", "TOML"]
-git-tree-sha1 = "b3f9e42685b4ad614eca0b44bd863cd41b1c86ea"
+deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "MPIPreferences", "TOML"]
+git-tree-sha1 = "ad88f863a5a16b3e26d14446afd3cd746266281b"
 uuid = "f1f71cc9-e9ae-5b93-9b94-4fe0e1ad3748"
-version = "5.0.2+1"
+version = "5.2.1+3"
 
 [[deps.MacroTools]]
 deps = ["Markdown", "Random"]
@@ -527,9 +523,9 @@ uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 
 [[deps.ProgressBars]]
 deps = ["Printf"]
-git-tree-sha1 = "806ebc92e1b4b4f72192369a28dfcaf688566b2b"
+git-tree-sha1 = "9d84c8646109eb8bc7a006d59b157c64d5155c81"
 uuid = "49802e3a-d2f1-5c88-81d8-b72133a6f568"
-version = "1.4.1"
+version = "1.5.0"
 
 [[deps.Quaternions]]
 deps = ["LinearAlgebra", "Random", "RealDot"]
@@ -590,6 +586,12 @@ version = "1.4.0"
 uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
 version = "0.7.0"
 
+[[deps.Scratch]]
+deps = ["Dates"]
+git-tree-sha1 = "30449ee12237627992a99d5e30ae63e4d78cd24a"
+uuid = "6c6a2e73-6563-6170-7368-637461726353"
+version = "1.2.0"
+
 [[deps.SeawaterPolynomials]]
 git-tree-sha1 = "20e6926c620cedee2b7551b61169dd118b4e34f2"
 uuid = "d496a93d-167e-4197-9f49-d3af4ff8fe40"
@@ -619,9 +621,9 @@ version = "2.2.0"
 
 [[deps.Static]]
 deps = ["IfElse"]
-git-tree-sha1 = "d0435ba43ab5ad1cbb5f0d286ca4ba67029ed3ee"
+git-tree-sha1 = "08be5ee09a7632c32695d954a602df96a877bf0d"
 uuid = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
-version = "0.8.4"
+version = "0.8.6"
 
 [[deps.StaticArrayInterface]]
 deps = ["ArrayInterface", "Compat", "IfElse", "LinearAlgebra", "Requires", "SnoopPrecompile", "SparseArrays", "Static", "SuiteSparse"]
@@ -631,9 +633,9 @@ version = "1.3.0"
 
 [[deps.StaticArrays]]
 deps = ["LinearAlgebra", "Random", "StaticArraysCore", "Statistics"]
-git-tree-sha1 = "2d7d9e1ddadc8407ffd460e24218e37ef52dd9a3"
+git-tree-sha1 = "b8d897fe7fa688e93aef573711cb207c08c9e11e"
 uuid = "90137ffa-7385-5640-81b9-e52037218182"
-version = "1.5.16"
+version = "1.5.19"
 
 [[deps.StaticArraysCore]]
 git-tree-sha1 = "6b7ba252635a5eff6a0b0664a41ee140a1c9e72a"
@@ -684,9 +686,9 @@ version = "1.0.1"
 
 [[deps.Tables]]
 deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "OrderedCollections", "TableTraits", "Test"]
-git-tree-sha1 = "c79322d36826aa2f4fd8ecfa96ddb47b174ac78d"
+git-tree-sha1 = "1544b926975372da01227b382066ab70e574a3ec"
 uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
-version = "1.10.0"
+version = "1.10.1"
 
 [[deps.Tar]]
 deps = ["ArgTools", "SHA"]
@@ -740,9 +742,9 @@ version = "0.2.1"
 
 [[deps.UnsafeAtomicsLLVM]]
 deps = ["LLVM", "UnsafeAtomics"]
-git-tree-sha1 = "33af9d2031d0dc09e2be9a0d4beefec4466def8e"
+git-tree-sha1 = "ea37e6066bf194ab78f4e747f5245261f17a7175"
 uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
-version = "0.1.0"
+version = "0.1.2"
 
 [[deps.VersionParsing]]
 git-tree-sha1 = "58d6e80b4ee071f5efd07fda82cb9fbe17200868"
diff --git a/src/Architectures.jl b/src/Architectures.jl
index 3c1c364a52..c81f286972 100644
--- a/src/Architectures.jl
+++ b/src/Architectures.jl
@@ -6,7 +6,6 @@ export device, architecture, array_type, arch_array, unified_array, device_copy_
 
 using CUDA
 using KernelAbstractions
-using CUDA.CUDAKernels
 using Adapt
 using OffsetArrays
 
@@ -37,7 +36,7 @@ struct GPU <: AbstractArchitecture end
 #####
 
 device(::CPU) = KernelAbstractions.CPU()
-device(::GPU) = CUDAKernels.CUDABackend(;always_inline=true)
+device(::GPU) = CUDA.CUDABackend(;always_inline=true)
 
 architecture() = nothing
 architecture(::Number) = nothing

From 5e2d5d5a917535adec10d0a5196e220e5fe64ba3 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 4 Apr 2023 11:16:19 -0400
Subject: [PATCH 109/530] first commit

---
 src/Distributed/halo_communication.jl         | 10 ---
 src/ImmersedBoundaries/ImmersedBoundaries.jl  | 16 ++--
 src/ImmersedBoundaries/active_cells_map.jl    | 76 ++++++++++++++-----
 .../grid_fitted_immersed_boundaries.jl        |  2 +-
 ...ate_hydrostatic_free_surface_tendencies.jl | 12 +--
 .../calculate_nonhydrostatic_tendencies.jl    | 12 +--
 src/Solvers/batched_tridiagonal_solver.jl     | 13 +++-
 src/Utils/kernel_launching.jl                 |  6 +-
 8 files changed, 91 insertions(+), 56 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index c5728f5b2e..1aacb1e516 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -336,16 +336,6 @@ for side in sides
             recv_req = MPI.Irecv!(recv_buffer, rank_to_recv_from, recv_tag, arch.communicator)
 
             recv_event = Threads.@spawn begin
-
-                # range = priority_range()
-                # priority = last(range)
-            
-                # old_stream = stream()
-                # r_flags = Ref{Cuint}()
-                # cuStreamGetFlags(old_stream, r_flags)
-                # flags = CUstream_flags_enum(r_flags[])
-                # new_stream = CuStream(; flags, priority)
-                # stream!(new_stream)
                 priority!(device(arch), :high)
                 cooperative_test!(recv_req)
                 sync_device!(arch)
diff --git a/src/ImmersedBoundaries/ImmersedBoundaries.jl b/src/ImmersedBoundaries/ImmersedBoundaries.jl
index 90f5559f54..e64b876a02 100644
--- a/src/ImmersedBoundaries/ImmersedBoundaries.jl
+++ b/src/ImmersedBoundaries/ImmersedBoundaries.jl
@@ -104,19 +104,20 @@ abstract type AbstractImmersedBoundary end
 ##### ImmersedBoundaryGrid
 #####
 
-struct ImmersedBoundaryGrid{FT, TX, TY, TZ, G, I, M, Arch} <: AbstractGrid{FT, TX, TY, TZ, Arch}
+struct ImmersedBoundaryGrid{FT, TX, TY, TZ, G, I, M, S, Arch} <: AbstractGrid{FT, TX, TY, TZ, Arch}
     architecture :: Arch
     underlying_grid :: G
     immersed_boundary :: I
-    active_cells_map :: M
+    active_cells_interior :: M
+    active_cells_surface :: S
     
     # Internal interface
-    function ImmersedBoundaryGrid{TX, TY, TZ}(grid::G, ib::I, wcm::M) where {TX, TY, TZ, G <: AbstractUnderlyingGrid, I, M}
+    function ImmersedBoundaryGrid{TX, TY, TZ}(grid::G, ib::I, mi::M, ms::S) where {TX, TY, TZ, G <: AbstractUnderlyingGrid, I, M, S}
         FT = eltype(grid)
         arch = architecture(grid)
         Arch = typeof(arch)
         
-        return new{FT, TX, TY, TZ, G, I, M, Arch}(arch, grid, ib, wcm)
+        return new{FT, TX, TY, TZ, G, I, M, S, Arch}(arch, grid, ib, mi, ms)
     end
 end
 
@@ -124,9 +125,10 @@ const IBG = ImmersedBoundaryGrid
 
 @inline Base.getproperty(ibg::IBG, property::Symbol) = get_ibg_property(ibg, Val(property))
 @inline get_ibg_property(ibg::IBG, ::Val{property}) where property = getfield(getfield(ibg, :underlying_grid), property)
-@inline get_ibg_property(ibg::IBG, ::Val{:immersed_boundary})  = getfield(ibg, :immersed_boundary)
-@inline get_ibg_property(ibg::IBG, ::Val{:underlying_grid})    = getfield(ibg, :underlying_grid)
-@inline get_ibg_property(ibg::IBG, ::Val{:active_cells_map})   = getfield(ibg, :active_cells_map)
+@inline get_ibg_property(ibg::IBG, ::Val{:immersed_boundary})      = getfield(ibg, :immersed_boundary)
+@inline get_ibg_property(ibg::IBG, ::Val{:underlying_grid})        = getfield(ibg, :underlying_grid)
+@inline get_ibg_property(ibg::IBG, ::Val{:active_cells_interior})  = getfield(ibg, :active_cells_interior)
+@inline get_ibg_property(ibg::IBG, ::Val{:active_cells_surface})   = getfield(ibg, :active_cells_surface)
 
 @inline architecture(ibg::IBG) = architecture(ibg.underlying_grid)
 
diff --git a/src/ImmersedBoundaries/active_cells_map.jl b/src/ImmersedBoundaries/active_cells_map.jl
index 9114bd5283..068d37248f 100644
--- a/src/ImmersedBoundaries/active_cells_map.jl
+++ b/src/ImmersedBoundaries/active_cells_map.jl
@@ -7,46 +7,84 @@ import Oceananigans.Utils: active_cells_work_layout
 
 const ActiveCellsIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:AbstractArray}
 
-@inline use_only_active_cells(grid::AbstractGrid)   = false
-@inline use_only_active_cells(grid::ActiveCellsIBG) = true
+@inline use_only_active_interior_cells(grid::AbstractGrid)   = nothing
+@inline use_only_active_interior_cells(grid::ActiveCellsIBG) = Val(:interior)
 
-@inline active_cells_work_layout(size, grid::ActiveCellsIBG) = min(length(grid.active_cells_map), 256), length(grid.active_cells_map)
-@inline active_linear_index_to_ntuple(idx, grid::ActiveCellsIBG) = Base.map(Int, grid.active_cells_map[idx])
+@inline use_only_active_surface_cells(grid::AbstractGrid)   = nothing
+@inline use_only_active_surface_cells(grid::ActiveCellsIBG) = Val(:surface)
 
-function ImmersedBoundaryGrid{TX, TY, TZ}(grid, ib; active_cells_map = false) where {TX, TY, TZ} 
+@inline active_cells_work_layout(size, ::Val{:surface},  grid::ActiveCellsIBG) = min(length(grid.active_cells_surface),  256), length(grid.active_cells_surface)
+@inline active_cells_work_layout(size, ::Val{:interior}, grid::ActiveCellsIBG) = min(length(grid.active_cells_interior), 256), length(grid.active_cells_interior)
 
+@inline active_linear_index_to_interior_tuple(idx, grid::ActiveCellsIBG) = Base.map(Int, grid.active_cells_interior[idx])
+@inline active_linear_index_to_surface_tuple(idx, grid::ActiveCellsIBG)  = Base.map(Int, grid.active_cells_surface[idx])
+
+function ImmersedBoundaryGrid(grid, ib, active_cells_map::Bool) 
+
+    ibg = ImmersedBoundaryGrid(grid, ib)
+    TX, TY, TZ = topology(ibg)
+    
     # Create the cells map on the CPU, then switch it to the GPU
     if active_cells_map 
-        map = active_cells_map(grid, ib)
-        map = arch_array(architecture(grid), map)
+        map_interior = active_cells_map_interior(ibg)
+        map_interior = arch_array(architecture(ibg), map_interior)
+
+        map_surface = active_cells_map_surface(ibg)
+        map_surface = arch_array(architecture(ibg), map_surface)
     else
-        map = nothing
+        map_interior = nothing
+        map_surface  = nothing
     end
 
-    return ImmersedBoundaryGrid{TX, TY, TZ}(grid, ib, map)
+    return ImmersedBoundaryGrid{TX, TY, TZ}(ibg.underlying_grid, 
+                                            ibg.immersed_boundary, 
+                                            map_interior, map_surface)
 end
 
-@inline active_cell(i, j, k, grid, ib) = !immersed_cell(i, j, k, grid, ib)
+@inline active_cell(i, j, k, ibg) = !immersed_cell(i, j, k, ibg)
+@inline active_column(i, j, k, grid, column) = column[i, j, k] != 0
 
-function compute_active_cells(grid, ib)
-    is_immersed_operation = KernelFunctionOperation{Center, Center, Center}(active_cell, grid; computed_dependencies = (ib, ))
-    active_cells_field = Field{Center, Center, Center}(grid, Bool)
+function compute_active_cells_interior(ibg)
+    is_immersed_operation = KernelFunctionOperation{Center, Center, Center}(active_cell, ibg)
+    active_cells_field = Field{Center, Center, Center}(ibg, Bool)
     set!(active_cells_field, is_immersed_operation)
     return active_cells_field
 end
 
+function compute_active_cells_surface(ibg)
+    one_field = ConditionalOperation{Center, Center, Center}(OneField(Int), identity, ibg, NotImmersed(truefunc), 0.0)
+    column    = sum(one_field, dims = 3)
+    is_immersed_column = KernelFunctionOperation{Center, Center, Nothing}(active_column, ibg, computed_dependencies = (column, ))
+    active_cells_field = Field{Center, Center, Nothing}(ibg, Bool)
+    set!(active_cells_field, is_immersed_column)
+    return active_cells_field
+end
+
 const MAXUInt8  = 2^8  - 1
 const MAXUInt16 = 2^16 - 1
 const MAXUInt32 = 2^32 - 1
 
-function active_cells_map(grid, ib)
-    active_cells_field = compute_active_cells(grid, ib)
-    full_indices       = arch_array(CPU(), findall(interior(active_cells_field)))
+function active_cells_map_interior(ibg)
+    active_cells_field = compute_active_cells_interior(ibg)
+    full_indices       = findall(arch_array(CPU(), interior(active_cells_field)))
+
+    # Reduce the size of the active_cells_map (originally a tuple of Int64)
+    N = maximum(size(ibg))
+    IntType = N > MAXUInt8 ? (N > MAXUInt16 ? (N > MAXUInt32 ? UInt64 : UInt32) : UInt16) : UInt8
+    smaller_indices = getproperty.(full_indices, Ref(:I)) .|> Tuple{IntType, IntType, IntType}
+    
+    return smaller_indices
+end
+
+function active_cells_map_surface(ibg)
+    active_cells_field = compute_active_cells_surface(ibg)
+    full_indices       = findall(arch_array(CPU(), interior(active_cells_field, :, :, 1)))
     
+    Nx, Ny, Nz = size(ibg)
     # Reduce the size of the active_cells_map (originally a tuple of Int64)
-    N = maximum(size(grid))
-    Type = N > MAXUInt8 ? (N > MAXUInt16 ? (N > MAXUInt32 ? UInt64 : UInt32) : UInt16) : UInt8
-    smaller_indices = getproperty.(full_indices, Ref(:I)) .|> Tuple{Type, Type, Type}
+    N = max(Nx, Ny)
+    IntType = N > MAXUInt8 ? (N > MAXUInt16 ? (N > MAXUInt32 ? UInt64 : UInt32) : UInt16) : UInt8
+    smaller_indices = getproperty.(full_indices, Ref(:I)) .|> Tuple{IntType, IntType}
     
     return smaller_indices
 end
diff --git a/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl b/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl
index b665f8e5f0..f1984fe9ba 100644
--- a/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl
+++ b/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl
@@ -82,7 +82,7 @@ end
 function ImmersedBoundaryGrid(grid, ib::AbstractGridFittedBottom{<:OffsetArray})
     TX, TY, TZ = topology(grid)
     validate_ib_size(grid, ib)
-    return ImmersedBoundaryGrid{TX, TY, TZ}(grid, ib)
+    return ImmersedBoundaryGrid{TX, TY, TZ}(grid, ib, nothing, nothing)
 end
 
 function validate_ib_size(grid, ib)
diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index 5b415b5253..c38f2ef5db 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -9,7 +9,7 @@ using Oceananigans.Grids: halo_size
 import Oceananigans.Distributed: complete_communication_and_compute_boundary
 import Oceananigans.Distributed: interior_tendency_kernel_size, interior_tendency_kernel_offsets
 
-using Oceananigans.ImmersedBoundaries: use_only_active_cells, ActiveCellsIBG, active_linear_index_to_ntuple
+using Oceananigans.ImmersedBoundaries: use_only_active_interior_cells, ActiveCellsIBG, active_linear_index_to_interior_tuple
 
 """
     compute_tendencies!(model::HydrostaticFreeSurfaceModel, callbacks)
@@ -91,7 +91,7 @@ function calculate_hydrostatic_momentum_tendencies!(model, velocities)
     u_kernel_args = tuple(start_momentum_kernel_args..., u_immersed_bc, end_momentum_kernel_args...)
     v_kernel_args = tuple(start_momentum_kernel_args..., v_immersed_bc, end_momentum_kernel_args...)
     
-    only_active_cells = use_only_active_cells(grid)
+    only_active_cells = use_only_active_interior_cells(grid)
 
     kernel_size    =   interior_tendency_kernel_size(grid)
     kernel_offsets = interior_tendency_kernel_offsets(grid)
@@ -155,7 +155,7 @@ function calculate_hydrostatic_free_surface_interior_tendency_contributions!(mod
 
     top_tracer_bcs = top_tracer_boundary_conditions(grid, model.tracers)
 
-    only_active_cells = use_only_active_cells(grid)
+    only_active_cells = use_only_active_interior_cells(grid)
 
     kernel_size    =   interior_tendency_kernel_size(grid)
     kernel_offsets = interior_tendency_kernel_offsets(grid)
@@ -212,7 +212,7 @@ end
 
 @kernel function calculate_hydrostatic_free_surface_Gu!(Gu, offs, grid::ActiveCellsIBG, args...)
     idx = @index(Global, Linear)
-    i, j, k = active_linear_index_to_ntuple(idx, grid)
+    i, j, k = active_linear_index_to_interior_tuple(idx, grid)
     @inbounds Gu[i, j, k] = hydrostatic_free_surface_u_velocity_tendency(i, j, k, grid, args...)
 end
 
@@ -227,7 +227,7 @@ end
 
 @kernel function calculate_hydrostatic_free_surface_Gv!(Gv, offs, grid::ActiveCellsIBG, args...)
     idx = @index(Global, Linear)
-    i, j, k = active_linear_index_to_ntuple(idx, grid)
+    i, j, k = active_linear_index_to_interior_tuple(idx, grid)
     @inbounds Gv[i, j, k] = hydrostatic_free_surface_v_velocity_tendency(i, j, k, grid, args...)
 end
 
@@ -246,7 +246,7 @@ end
 
 @kernel function calculate_hydrostatic_free_surface_Gc!(Gc, offs, tendency_kernel_function, grid::ActiveCellsIBG, args...)
     idx = @index(Global, Linear)
-    i, j, k = active_linear_index_to_ntuple(idx, grid)
+    i, j, k = active_linear_index_to_interior_tuple(idx, grid)
     @inbounds Gc[i, j, k] = tendency_kernel_function(i, j, k, grid, args...)
 end
 
diff --git a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
index e2498534aa..d9ec5f1bfd 100644
--- a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
+++ b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
@@ -3,7 +3,7 @@ import Oceananigans.TimeSteppers: compute_tendencies!
 using Oceananigans: fields, TimeStepCallsite, TendencyCallsite, UpdateStateCallsite
 using Oceananigans.Utils: work_layout
 
-using Oceananigans.ImmersedBoundaries: use_only_active_cells, ActiveCellsIBG, active_linear_index_to_ntuple
+using Oceananigans.ImmersedBoundaries: use_only_active_interior_cells, ActiveCellsIBG, active_linear_index_to_interior_tuple
 
 """
     compute_tendencies!(model::NonhydrostaticModel)
@@ -79,7 +79,7 @@ function calculate_interior_tendency_contributions!(model)
     v_kernel_args = tuple(start_momentum_kernel_args..., v_immersed_bc, end_momentum_kernel_args..., forcings, hydrostatic_pressure, clock)
     w_kernel_args = tuple(start_momentum_kernel_args..., w_immersed_bc, end_momentum_kernel_args..., forcings, clock)
     
-    only_active_cells = use_only_active_cells(grid)
+    only_active_cells = use_only_active_interior_cells(grid)
 
     launch!(arch, grid, :xyz, calculate_Gu!, 
             tendencies.u, u_kernel_args...;
@@ -126,7 +126,7 @@ end
 
 @kernel function calculate_Gu!(Gu, grid::ActiveCellsIBG, args...)
     idx = @index(Global, Linear)
-    i, j, k = active_linear_index_to_ntuple(idx, grid)
+    i, j, k = active_linear_index_to_interior_tuple(idx, grid)
     @inbounds Gu[i, j, k] = u_velocity_tendency(i, j, k, grid, args...)
 end
 
@@ -138,7 +138,7 @@ end
 
 @kernel function calculate_Gv!(Gv, grid::ActiveCellsIBG, args...)
     idx = @index(Global, Linear)
-    i, j, k = active_linear_index_to_ntuple(idx, grid)
+    i, j, k = active_linear_index_to_interior_tuple(idx, grid)
     @inbounds Gv[i, j, k] = v_velocity_tendency(i, j, k, grid, args...)
 end
 
@@ -150,7 +150,7 @@ end
 
 @kernel function calculate_Gw!(Gw, grid::ActiveCellsIBG, args...)
     idx = @index(Global, Linear)
-    i, j, k = active_linear_index_to_ntuple(idx, grid)
+    i, j, k = active_linear_index_to_interior_tuple(idx, grid)
     @inbounds Gw[i, j, k] = w_velocity_tendency(i, j, k, grid, args...)
 end
 
@@ -166,7 +166,7 @@ end
 
 @kernel function calculate_Gc!(Gc, grid::ActiveCellsIBG, args...)
     idx = @index(Global, Linear)
-    i, j, k = active_linear_index_to_ntuple(idx, grid)
+    i, j, k = active_linear_index_to_interior_tuple(idx, grid)
     @inbounds Gc[i, j, k] = tracer_tendency(i, j, k, grid, args...)
 end
 
diff --git a/src/Solvers/batched_tridiagonal_solver.jl b/src/Solvers/batched_tridiagonal_solver.jl
index a65ed56ebc..bc532f5699 100644
--- a/src/Solvers/batched_tridiagonal_solver.jl
+++ b/src/Solvers/batched_tridiagonal_solver.jl
@@ -86,10 +86,13 @@ end
 @inline float_eltype(ϕ::AbstractArray{<:Complex{T}}) where T <: AbstractFloat = T
 
 @kernel function solve_batched_tridiagonal_system_kernel!(ϕ, a, b, c, f, t, grid, p, args...)
-    Nx, Ny, Nz = grid.Nx, grid.Ny, grid.Nz
-
     i, j = @index(Global, NTuple)
+    _solve_batched_tridiagonal_system(i, j, ϕ, a, b, c, f, t, grid, p, args...)
+end
 
+@inline function _solve_batched_tridiagonal_system(i, j, ϕ, a, b, c, f, t, grid, p, args...)
+    
+    Nx, Ny, Nz = grid.Nx, grid.Ny, grid.Nz
     @inbounds begin
         β  = get_coefficient(b, i, j, 1, grid, p, args...)
         f₁ = get_coefficient(f, i, j, 1, grid, p, args...)
@@ -104,7 +107,7 @@ end
             β = bᵏ - aᵏ⁻¹ * t[i, j, k]
 
             fᵏ = get_coefficient(f, i, j, k, grid, p, args...)
-            
+
             # If the problem is not diagonally-dominant such that `β ≈ 0`,
             # the algorithm is unstable and we elide the forward pass update of ϕ.
             definitely_diagonally_dominant = abs(β) > 10 * eps(float_eltype(ϕ))
@@ -116,4 +119,6 @@ end
             ϕ[i, j, k] -= t[i, j, k+1] * ϕ[i, j, k+1]
         end
     end
-end
+
+    return nothing
+end
\ No newline at end of file
diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index 2c9c56b4a4..7ce356dd62 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -62,14 +62,14 @@ function work_layout(grid, workdims::Symbol; include_right_boundaries=false, loc
                workdims == :yz  ? (Ny′, Nz′) : throw(ArgumentError("Unsupported launch configuration: $workdims"))
 
 
-    if only_active_cells
-        workgroup, worksize = active_cells_work_layout(worksize, grid) 
+    if !isnothing(only_active_cells)
+        workgroup, worksize = active_cells_work_layout(worksize, only_active_cells, grid) 
     end
 
     return workgroup, worksize
 end
 
-active_cells_work_layout(size, grid) = heuristic_workgroup(size...), size
+active_cells_work_layout(size, only_active_cells, grid) = heuristic_workgroup(size...), size
 
 """
     launch!(arch, grid, layout, kernel!, args...; kwargs...)

From 62ae4e5eea22bbf335aaca116454a7e835fbf518 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 4 Apr 2023 12:43:24 -0400
Subject: [PATCH 110/530] should work like this?

---
 src/Distributed/distributed_grids.jl  |  84 ++++++++++--------
 src/Distributed/partition_assemble.jl | 117 ++++++++++++++++++--------
 2 files changed, 129 insertions(+), 72 deletions(-)

diff --git a/src/Distributed/distributed_grids.jl b/src/Distributed/distributed_grids.jl
index e3bd66d8e6..a1ab311534 100644
--- a/src/Distributed/distributed_grids.jl
+++ b/src/Distributed/distributed_grids.jl
@@ -34,7 +34,8 @@ function RectilinearGrid(arch::DistributedArch,
                          z = nothing,
                          halo = nothing,
                          extent = nothing,
-                         topology = (Periodic, Periodic, Bounded))
+                         topology = (Periodic, Periodic, Bounded),
+                         partitioned_size = nothing)
 
     TX, TY, TZ, size, halo, x, y, z =
         validate_rectilinear_grid_args(topology, size, halo, FT, extent, x, y, z)
@@ -49,18 +50,24 @@ function RectilinearGrid(arch::DistributedArch,
     TY = insert_connected_topology(TY, Ry, rj)
     TZ = insert_connected_topology(TZ, Rz, rk)
 
-    # Make sure we can put an integer number of grid points in each rank.
-    # Will generalize in the future.
-    @assert isinteger(Nx / Rx)
-    @assert isinteger(Ny / Ry)
-    @assert isinteger(Nz / Rz)
 
-    # Local sizes are denoted with lowercase `n`
-    nx, ny, nz = local_size = Nx÷Rx, Ny÷Ry, Nz÷Rz
+    if isnothing(partitioned_size)
+        @assert isinteger(Nx / Rx)
+        @assert isinteger(Ny / Ry)
+        @assert isinteger(Nz / Rz)
 
-    xl = partition(x, nx, Rx, ri)
-    yl = partition(y, ny, Ry, rj)
-    zl = partition(z, nz, Rz, rk)
+        # Equal split: local sizes are denoted with lowercase `n`
+        nx, ny, nz = Nx÷Rx, Ny÷Ry, Nz÷Rz
+        xl = partition(x, nx, Rx, ri)
+        yl = partition(y, ny, Ry, rj)
+        zl = partition(z, nz, Rz, rk)
+    else
+        # Uneven split: `partitioned_size[d][r]` is the local size of rank `r` along dimension `d`; z is not partitioned
+        nx, ny, nz = (partitioned_size[1][ri], partitioned_size[2][rj], Nz)
+        xl = partition(x, partitioned_size[1], Rx, ri)
+        yl = partition(y, partitioned_size[2], Ry, rj)
+        zl = z
+    end
 
     Lx, xᶠᵃᵃ, xᶜᵃᵃ, Δxᶠᵃᵃ, Δxᶜᵃᵃ = generate_coordinate(FT, topology[1], nx, Hx, xl, child_architecture(arch))
     Ly, yᵃᶠᵃ, yᵃᶜᵃ, Δyᵃᶠᵃ, Δyᵃᶜᵃ = generate_coordinate(FT, topology[2], ny, Hy, yl, child_architecture(arch))
@@ -95,7 +102,8 @@ function LatitudeLongitudeGrid(arch::DistributedArch,
                                z,           
                                topology = nothing,           
                                radius = R_Earth,
-                               halo = (1, 1, 1))
+                               halo = (1, 1, 1),
+                               partitioned_size = nothing)
 
     Nλ, Nφ, Nz, Hλ, Hφ, Hz, latitude, longitude, z, topology, precompute_metrics =
         validate_lat_lon_grid_args(FT, latitude, longitude, z, size, halo, topology, precompute_metrics)
@@ -107,23 +115,30 @@ function LatitudeLongitudeGrid(arch::DistributedArch,
     TY = insert_connected_topology(topology[2], Ry, rj)
     TZ = insert_connected_topology(topology[3], Rz, rk)
 
-    # Make sure we can put an integer number of grid points in each rank.
-    # Will generalize in the future.
-    @assert isinteger(Nλ / Rx)
-    @assert isinteger(Nφ / Ry)
-    @assert isinteger(Nz / Rz)
+    if isnothing(partitioned_size)
+        @assert isinteger(Nλ / Rx)
+        @assert isinteger(Nφ / Ry)
+        @assert isinteger(Nz / Rz)
 
-    nλ, nφ, nz = local_size = Nλ÷Rx, Nφ÷Ry, Nz÷Rz
+        nλ, nφ, nz = Nλ÷Rx, Nφ÷Ry, Nz÷Rz
+    
+        λl = partition(longitude, nλ, Rx, ri)
+        φl = partition(latitude,  nφ, Ry, rj)
+        zl = partition(z,         nz, Rz, rk)
+    else
+        nλ, nφ, nz = (partitioned_size[1][ri], partitioned_size[2][rj], Nz)
 
-    λl = partition(longitude, nλ, Rx, ri)
-    φl = partition(latitude,  nφ, Ry, rj)
-    zl = partition(z,         nz, Rz, rk)
+        λl = partition(longitude, partitioned_size[1], Rx, ri)
+        φl = partition(latitude,  partitioned_size[2], Ry, rj)
+        zl = z
+    end
 
     # Calculate all direction (which might be stretched)
     # A direction is regular if the domain passed is a Tuple{<:Real, <:Real}, 
     # it is stretched if being passed is a function or vector (as for the VerticallyStretchedRectilinearGrid)
     Lλ, λᶠᵃᵃ, λᶜᵃᵃ, Δλᶠᵃᵃ, Δλᶜᵃᵃ = generate_coordinate(FT, TX, nλ, Hλ, λl, arch.child_architecture)
     Lz, zᵃᵃᶠ, zᵃᵃᶜ, Δzᵃᵃᶠ, Δzᵃᵃᶜ = generate_coordinate(FT, TZ, nz, Hz, zl, arch.child_architecture)
+
     # The Latitudinal direction is _special_ :
     # Preconmpute metrics assumes that `length(φᵃᶠᵃ) = length(φᵃᶜᵃ) + 1`, which is always the case in a 
     # serial grid because `LatitudeLongitudeGrid` should be always `Bounded`, but it is not true for a
@@ -262,12 +277,12 @@ end
 # take precedence on `DistributedGrid` 
 function with_halo(new_halo, grid::DistributedRectilinearGrid) 
     new_grid = with_halo(new_halo, reconstruct_global_grid(grid))    
-    return scatter_local_grids(architecture(grid), new_grid)
+    return scatter_local_grids(architecture(grid), new_grid, size(grid))
 end
 
 function with_halo(new_halo, grid::DistributedLatitudeLongitudeGrid) 
     new_grid = with_halo(new_halo, reconstruct_global_grid(grid))    
-    return scatter_local_grids(architecture(grid), new_grid)
+    return scatter_local_grids(architecture(grid), new_grid, size(grid))
 end
 
 function with_halo(new_halo, grid::DistributedImmersedBoundaryGrid)
@@ -277,7 +292,7 @@ function with_halo(new_halo, grid::DistributedImmersedBoundaryGrid)
     new_underlying_grid   = with_halo(new_halo, underlying_grid)
     new_immersed_boundary = resize_immersed_boundary(immersed_boundary, new_underlying_grid)
     new_grid              = ImmersedBoundaryGrid(new_underlying_grid, new_immersed_boundary)
-    return scatter_local_grids(architecture(grid), new_grid)
+    return scatter_local_grids(architecture(grid), new_grid, size(grid))
 end
 
 """
@@ -342,28 +357,27 @@ function scatter_grid_properties(global_grid)
     z = cpu_face_constructor_z(global_grid)
 
     topo = topology(global_grid)
-    sz   = pop_flat_elements(size(global_grid), topo)
     halo = pop_flat_elements(halo_size(global_grid), topo)
 
-    return x, y, z, topo, sz, halo
+    return x, y, z, topo, halo
 end
 
-function scatter_local_grids(arch::DistributedArch, global_grid::RectilinearGrid)
-    x, y, z, topo, sz, halo = scatter_grid_properties(global_grid)
-    return RectilinearGrid(arch, eltype(global_grid); size=sz, x=x, y=y, z=z, halo=halo, topology=topo)
+function scatter_local_grids(arch::DistributedArch, global_grid::RectilinearGrid, local_size)
+    x, y, z, topo, halo = scatter_grid_properties(global_grid)
+    return RectilinearGrid(arch, eltype(global_grid); size=local_size, x=x, y=y, z=z, halo=halo, topology=topo)
 end
 
-function scatter_local_grids(arch::DistributedArch, global_grid::LatitudeLongitudeGrid)
-    x, y, z, topo, sz, halo = scatter_grid_properties(global_grid)
-    return LatitudeLongitudeGrid(arch, eltype(global_grid); size=sz, longitude=x, latitude=y, z=z, halo=halo, topology=topo)
+function scatter_local_grids(arch::DistributedArch, global_grid::LatitudeLongitudeGrid, local_size)
+    x, y, z, topo, halo = scatter_grid_properties(global_grid)
+    return LatitudeLongitudeGrid(arch, eltype(global_grid); size=local_size, longitude=x, latitude=y, z=z, halo=halo, topology=topo)
 end
 
-function scatter_local_grids(arch::DistributedArch, global_grid::ImmersedBoundaryGrid)
+function scatter_local_grids(arch::DistributedArch, global_grid::ImmersedBoundaryGrid, local_size)
     ib = global_grid.immersed_boundary
     ug = global_grid.underlying_grid
 
-    local_ug = scatter_local_grids(arch, ug)
-    local_ib = getnamewrapper(ib)(partition_global_array(arch, ib.bottom_height, size(global_grid)))
+    local_ug = scatter_local_grids(arch, ug, local_size)
+    local_ib = getnamewrapper(ib)(partition_global_array(arch, ib.bottom_height, size(global_grid), local_size))
     
     return ImmersedBoundaryGrid(local_ug, local_ib)
 end
diff --git a/src/Distributed/partition_assemble.jl b/src/Distributed/partition_assemble.jl
index a1846f6a9e..99864b609b 100644
--- a/src/Distributed/partition_assemble.jl
+++ b/src/Distributed/partition_assemble.jl
@@ -4,9 +4,11 @@ using Oceananigans.Architectures: arch_array
 # Used for grid constructors (cpu_face_constructor_x, cpu_face_constructor_y, cpu_face_constructor_z)
 # which means that we need to repeat the value at the right boundary
 
+partition(c::Colon, Nc, Nr, r) = Colon()
+partition(c::Tuple, Nc, Nr, r) = (c[1] + (r-1) * (c[2] - c[1]) / Nr,  c[1] + r * (c[2] - c[1]) / Nr)
+
+# Have to fix this! This won't work for face constructors
 partition(c::AbstractVector, Nc, Nr, r) = c[1 + (r-1) * Nc : 1 + Nc * r]
-partition(c::Colon,          Nc, Nr, r) = Colon()
-partition(c::Tuple,          Nc, Nr, r) = (c[1] + (r-1) * (c[2] - c[1]) / Nr,    c[1] + r * (c[2] - c[1]) / Nr)
 
 function partition(c::UnitRange, Nc, Nr, r)
     g = (first(c), last(r))
@@ -14,6 +16,22 @@ function partition(c::UnitRange, Nc, Nr, r)
     return UnitRange(ℓ[1], ℓ[2])
 end
 
+# Have to fix this! This won't work for face constructors
+partition(c::AbstractVector, Nc::AbstractVector, Nr, r) = c[1 + sum(Nc[1:r-1]) : 1 + sum(Nc[1:r])]
+
+function partition(c::Tuple, Nc::AbstractVector, Nr, r)
+    Nt = sum(Nc)
+    Δl = (c[2] - c[1]) / Nt      
+
+    l = Tuple{Float64, Float64}[(c[1], c[1] + Δl * Nc[1])]
+    for i in 2:length(Nc)
+        lp = l[i-1][2]
+        push!(l, (lp, lp + Δl * Nc[i]))
+    end
+
+    return l[r]
+end
+
 """
     assemble(c::AbstractVector, Nc, Nr, r, arch) 
 
@@ -23,10 +41,15 @@ and `arch`itecture. Since we use a global reduction, only ranks at positions
 1 in the other two directions `r1 == 1` and `r2 == 1` fill the 1D array.
 """
 function assemble(c_local::AbstractVector, Nc, Nr, r, r1, r2, comm) 
-    c_global = zeros(eltype(c_local), Nc*Nr+1)
+    # Gather the local size of every rank along this direction. Only ranks with `r1 == 1 && r2 == 1`
+    # contribute, otherwise the reduction over `comm` counts each size once per rank in the other directions.
+    Nl = zeros(Int, Nr)
+    (r1 == 1 && r2 == 1) && (Nl[r] = Nc)
+    MPI.Allreduce!(Nl, +, comm)
+    c_global = zeros(eltype(c_local), sum(Nl)+1)
 
     if r1 == 1 && r2 == 1
-        c_global[1 + (r-1) * Nc : Nc * r] .= c_local[1:end-1]
+        c_global[1 + sum(Nl[1:r-1]) : sum(Nl[1:r])] .= c_local[1:end-1]
         r == Nr && (c_global[end] = c_local[end])
     end
 
@@ -35,7 +58,20 @@ function assemble(c_local::AbstractVector, Nc, Nr, r, r1, r2, comm)
     return c_global
 end
 
-assemble(c::Tuple, Nc, Nr, r, r1, r2, comm) = (c[2] - r * (c[2] - c[1]), c[2] - (r - Nr) * (c[2] - c[1]))
+# Simple case: the first rank holds the left end of the interval and the last rank the right end.
+function assemble(c::Tuple, Nc, Nr, r, r1, r2, comm)
+    # Floating-point buffer: interval ends are in general not integers
+    c_global = zeros(Float64, 2)
+    # One contribution per end point, so that the sum-reduction returns the end points themselves.
+    # The two checks are independent because `r == 1 == Nr` when there is a single rank.
+    if r1 == 1 && r2 == 1
+        r == 1  && (c_global[1] = c[1])
+        r == Nr && (c_global[2] = c[2])
+    end
+
+    MPI.Allreduce!(c_global, +, comm)
+    return tuple(c_global...)
+end
 
 # TODO: partition_global_array and construct_global_array
 # do not currently work for 2D or 3D parallelizations
@@ -46,35 +82,39 @@ assemble(c::Tuple, Nc, Nr, r, r1, r2, comm) = (c[2] - r * (c[2] - c[1]), c[2] -
 Partition a global array (2D of size `(Nx, Ny)` or 3D of size `(Nx, Ny, Nz)`) in local arrays.
 Usefull for boundary arrays, forcings and initial conditions.
 """
-partition_global_array(arch, c_global::Function, N) = c_global 
+partition_global_array(arch, c_global::Function, Nl) = c_global 
 
-function partition_global_array(arch, c_global::AbstractArray, N) 
+# Here we just assume we cannot partition in z (we should remove support for that!!)
+function partition_global_array(arch, c_global::AbstractArray, nl) 
     c_global = arch_array(CPU(), c_global)
     Rx, Ry, Rz = R = arch.ranks
     ri, rj, rk = r = arch.local_index
 
     dims = length(size(c_global))
 
-    if dims == 2 
-        nx, ny = n = Int.(N[1:2] ./ R[1:2])
+    nx = zeros(Int, Rx) # local x-sizes of all ranks; one contribution per x-rank to avoid over-counting
+    (rj == 1 && rk == 1) && (nx[ri] = nl[1])
+    MPI.Allreduce!(nx, +, arch.communicator)
 
-        c_local = zeros(eltype(c_global), nx, ny)
+    ny = zeros(Int, Ry) # local y-sizes of all ranks; one contribution per y-rank to avoid over-counting
+    (ri == 1 && rk == 1) && (ny[rj] = nl[2])
+    MPI.Allreduce!(ny, +, arch.communicator)
 
-        c_local .= c_global[1 + (ri-1) * nx : nx * ri, 
-                            1 + (rj-1) * ny : ny * rj]
-    
-        return arch_array(child_architecture(arch), c_local)
-    else
-        nx, ny, nz = n = Int.(N ./ R)
+    nz = nl[3]
 
-        c_local = zeros(eltype(c_global), nx, ny, nz)
+    if dims == 2 
+        c_local = zeros(eltype(c_global), nx[ri], ny[rj])
 
-        c_local .= c_global[1 + (ri-1) * nx : nx * ri, 
-                            1 + (rj-1) * ny : ny * rj, 
-                            1 + (rk-1) * nz : nz * rk]
+        c_local .= c_global[1 + sum(nx[1:ri-1]) : sum(nx[1:ri]), 
+                            1 + sum(ny[1:rj-1]) : sum(ny[1:rj])]
+    else
+        c_local = zeros(eltype(c_global), nx[ri], ny[rj], nz)
 
-        return arch_array(child_architecture(arch), c_local)
+        c_local .= c_global[1 + sum(nx[1:ri-1]) : sum(nx[1:ri]), 
+                            1 + sum(ny[1:rj-1]) : sum(ny[1:rj]), 
+                            1:nz]
     end
+    return arch_array(child_architecture(arch), c_local)
 end
 
 """
@@ -86,37 +126,40 @@ Usefull for boundary arrays, forcings and initial conditions.
 construct_global_array(arch, c_local::Function, N) = c_local
 
 # TODO: This does not work for 2D parallelizations!!!
-function construct_global_array(arch, c_local::AbstractArray, n) 
+function construct_global_array(arch, c_local::AbstractArray, nl) 
     c_local = arch_array(CPU(), c_local)
     Rx, Ry, Rz = R = arch.ranks
     ri, rj, rk = r = arch.local_index
 
     dims = length(size(c_local))
 
-    if dims == 2 
-        nx, ny = n[1:2]
-        Nx, Ny = N = Int.(n[1:2] .* R[1:2])
+    # Gather the local sizes of all ranks; one contribution per x-rank (y-rank) avoids over-counting
+    nx = zeros(Int, Rx)
+    (rj == 1 && rk == 1) && (nx[ri] = nl[1])
+    MPI.Allreduce!(nx, +, arch.communicator)
+    ny = zeros(Int, Ry)
+    (ri == 1 && rk == 1) && (ny[rj] = nl[2])
+    MPI.Allreduce!(ny, +, arch.communicator)
     
+    Nx = sum(nx) # global sizes are the sums of the per-rank local sizes
+    Ny = sum(ny)
+    Nz = nl[3]   # z is not partitioned
+
+    if dims == 2 
         c_global = zeros(eltype(c_local), Nx, Ny)
     
-        c_global[1 + (ri-1) * nx : nx * ri, 
-                 1 + (rj-1) * ny : ny * rj] .= c_local[1:nx, 1:ny]
+        c_global[1 + sum(nx[1:ri-1]) : sum(nx[1:ri]), 
+                 1 + sum(ny[1:rj-1]) : sum(ny[1:rj])] .= c_local[1:nx[ri], 1:ny[rj]]
         
         MPI.Allreduce!(c_global, +, arch.communicator)
-        
-        return arch_array(child_architecture(arch), c_global)
     else
-        nx, ny, nz = n
-        Nx, Ny, Nz = N = Int.(n .* R)
-
         c_global = zeros(eltype(c_local), Nx, Ny, Nz)
 
-        c_global[1 + (ri-1) * nx : nx * ri, 
-                 1 + (rj-1) * ny : ny * rj, 
-                 1 + (rk-1) * nz : nz * rk] .= c_local[1:nx, 1:ny, 1:nz]
+        c_global[1 + sum(nx[1:ri-1]) : sum(nx[1:ri]), 
+                 1 + sum(ny[1:rj-1]) : sum(ny[1:rj]),
+                 1:Nz] .= c_local[1:nxl[ri], 1:nyl[rj], 1:Nz]
         
         MPI.Allreduce!(c_global, +, arch.communicator)
-        
-        return arch_array(child_architecture(arch), c_global)
     end
+    return arch_array(child_architecture(arch), c_global)
 end

From 96e957779979ff94b82e940bafe8831c8c8c4f0d Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 4 Apr 2023 12:44:44 -0400
Subject: [PATCH 111/530] bugfix

---
 src/Distributed/distributed_grids.jl  | 2 +-
 src/Distributed/partition_assemble.jl | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Distributed/distributed_grids.jl b/src/Distributed/distributed_grids.jl
index a1ab311534..dbd53f55ff 100644
--- a/src/Distributed/distributed_grids.jl
+++ b/src/Distributed/distributed_grids.jl
@@ -377,7 +377,7 @@ function scatter_local_grids(arch::DistributedArch, global_grid::ImmersedBoundar
     ug = global_grid.underlying_grid
 
     local_ug = scatter_local_grids(arch, ug, local_size)
-    local_ib = getnamewrapper(ib)(partition_global_array(arch, ib.bottom_height, size(global_grid), local_size))
+    local_ib = getnamewrapper(ib)(partition_global_array(arch, ib.bottom_height, local_size))
     
     return ImmersedBoundaryGrid(local_ug, local_ib)
 end
diff --git a/src/Distributed/partition_assemble.jl b/src/Distributed/partition_assemble.jl
index 99864b609b..3141d5ab05 100644
--- a/src/Distributed/partition_assemble.jl
+++ b/src/Distributed/partition_assemble.jl
@@ -77,9 +77,9 @@ end
 # do not currently work for 2D or 3D parallelizations
 # (They are not used anywhere in the code at the moment)
 """
-    partition_global_array(arch, c_global, (Nx, Ny, Nz))
+    partition_global_array(arch, c_global, (nx, ny, nz))
 
-Partition a global array (2D of size `(Nx, Ny)` or 3D of size `(Nx, Ny, Nz)`) in local arrays.
+Partition a global array in local arrays of size `(nx, ny)` if 2D or `(nx, ny, nz)` is 3D.
 Usefull for boundary arrays, forcings and initial conditions.
 """
 partition_global_array(arch, c_global::Function, Nl) = c_global 

From 0da34f82c832a90edb53135b129e0a79eec36e7f Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 4 Apr 2023 12:45:33 -0400
Subject: [PATCH 112/530] bugfix

---
 src/Distributed/partition_assemble.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Distributed/partition_assemble.jl b/src/Distributed/partition_assemble.jl
index 3141d5ab05..4bcdd74363 100644
--- a/src/Distributed/partition_assemble.jl
+++ b/src/Distributed/partition_assemble.jl
@@ -157,7 +157,7 @@ function construct_global_array(arch, c_local::AbstractArray, nl)
 
         c_global[1 + sum(nx[1:ri-1]) : sum(nx[1:ri]), 
                  1 + sum(ny[1:rj-1]) : sum(ny[1:rj]),
-                 1:Nz] .= c_local[1:nxl[ri], 1:nyl[rj], 1:Nz]
+                 1:Nz] .= c_local[1:nx[ri], 1:ny[rj], 1:Nz]
         
         MPI.Allreduce!(c_global, +, arch.communicator)
     end

From 69fccbe5f7e40f09c9bec18f95e2b5aeabb2ebe5 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 4 Apr 2023 13:02:22 -0400
Subject: [PATCH 113/530] another bugfix

---
 src/Distributed/partition_assemble.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Distributed/partition_assemble.jl b/src/Distributed/partition_assemble.jl
index 4bcdd74363..d379040dbf 100644
--- a/src/Distributed/partition_assemble.jl
+++ b/src/Distributed/partition_assemble.jl
@@ -82,7 +82,7 @@ end
 Partition a global array in local arrays of size `(nx, ny)` if 2D or `(nx, ny, nz)` is 3D.
 Usefull for boundary arrays, forcings and initial conditions.
 """
-partition_global_array(arch, c_global::Function, Nl) = c_global 
+partition_global_array(arch, c_global::Function, nl) = c_global 
 
 # Here we just assume we cannot partition in z (we should remove support for that!!)
 function partition_global_array(arch, c_global::AbstractArray, nl) 
@@ -143,7 +143,7 @@ function construct_global_array(arch, c_local::AbstractArray, nl)
     
     Nx = sum(nxl)
     Ny = sum(nyl)
-    Nz = n[3]
+    Nz = nl[3]
 
     if dims == 2 
         c_global = zeros(eltype(c_local), Nx, Ny)

From e5fd8bb182dbfdd3fbbc492edebee27aff3c9cc3 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 4 Apr 2023 17:29:27 -0400
Subject: [PATCH 114/530] load balanced

---
 src/Distributed/distributed_grids.jl          |  67 ++++------
 src/Distributed/partition_assemble.jl         | 116 ++++++++++--------
 src/ImmersedBoundaries/ImmersedBoundaries.jl  |   6 +-
 src/ImmersedBoundaries/active_cells_map.jl    |  20 +--
 src/Utils/kernel_launching.jl                 |  16 ++-
 .../mpi_geostrophic_adjustment.jl             |  14 ++-
 6 files changed, 120 insertions(+), 119 deletions(-)

diff --git a/src/Distributed/distributed_grids.jl b/src/Distributed/distributed_grids.jl
index dbd53f55ff..69b62280b6 100644
--- a/src/Distributed/distributed_grids.jl
+++ b/src/Distributed/distributed_grids.jl
@@ -19,7 +19,7 @@ const DistributedRectilinearGrid{FT, TX, TY, TZ, FX, FY, FZ, VX, VY, VZ} =
 const DistributedLatitudeLongitudeGrid{FT, TX, TY, TZ, M, MY, FX, FY, FZ, VX, VY, VZ} = 
     LatitudeLongitudeGrid{FT, TX, TY, TZ, M, MY, FX, FY, FZ, VX, VY, VZ, <:DistributedArch} where {FT, TX, TY, TZ, M, MY, FX, FY, FZ, VX, VY, VZ}
 
-const DistributedImmersedBoundaryGrid = ImmersedBoundaryGrid{FT, TX, TY, TZ, <:DistributedGrid, I, M, <:DistributedArch} where {FT, TX, TY, TZ, I, M}
+const DistributedImmersedBoundaryGrid = ImmersedBoundaryGrid{FT, TX, TY, TZ, <:DistributedGrid, I, M, S, <:DistributedArch} where {FT, TX, TY, TZ, I, S, M}
 
 """
     RectilinearGrid(arch::DistributedArch, FT=Float64; kw...)
@@ -34,13 +34,14 @@ function RectilinearGrid(arch::DistributedArch,
                          z = nothing,
                          halo = nothing,
                          extent = nothing,
-                         topology = (Periodic, Periodic, Bounded),
-                         partitioned_size = nothing)
+                         topology = (Periodic, Periodic, Bounded))
 
-    TX, TY, TZ, size, halo, x, y, z =
-        validate_rectilinear_grid_args(topology, size, halo, FT, extent, x, y, z)
+    global_sizes = concatenate_local_size(size, arch)
+    global_size  = sum.(global_sizes)
+
+    TX, TY, TZ, global_size, halo, x, y, z =
+        validate_rectilinear_grid_args(topology, global_size, halo, FT, extent, x, y, z)
 
-    Nx, Ny, Nz = size
     Hx, Hy, Hz = halo
 
     ri, rj, rk = arch.local_index
@@ -50,24 +51,11 @@ function RectilinearGrid(arch::DistributedArch,
     TY = insert_connected_topology(TY, Ry, rj)
     TZ = insert_connected_topology(TZ, Rz, rk)
 
-
-    if isnothing(partitioned_size)
-        @assert isinteger(Nx / Rx)
-        @assert isinteger(Ny / Ry)
-        @assert isinteger(Nz / Rz)
-
-        nx, nx, nz = Nx÷Rx, Ny÷Ry, Nz÷Rz
+    nx, ny, nz = size
     
-        xl = partition(x, nx, Rx, ri)
-        yl = partition(y, ny, Ry, rj)
-        zl = partition(z, nz, Rz, rk)
-    else
-        nx, ny, nz = (partitioned_size[1][ri], partitioned_size[2][rj], Nz)
-
-        xl = partition(longitude, partitioned_size[1], Rx, ri)
-        yl = partition(latitude,  partitioned_size[2], Ry, rj)
-        zl = z
-    end
+    xl = partition(x, nx, Rx, ri)
+    yl = partition(y, ny, Ry, rj)
+    zl = partition(z, nz, Rz, rk)
 
     Lx, xᶠᵃᵃ, xᶜᵃᵃ, Δxᶠᵃᵃ, Δxᶜᵃᵃ = generate_coordinate(FT, topology[1], nx, Hx, xl, child_architecture(arch))
     Ly, yᵃᶠᵃ, yᵃᶜᵃ, Δyᵃᶠᵃ, Δyᵃᶜᵃ = generate_coordinate(FT, topology[2], ny, Hy, yl, child_architecture(arch))
@@ -102,11 +90,14 @@ function LatitudeLongitudeGrid(arch::DistributedArch,
                                z,           
                                topology = nothing,           
                                radius = R_Earth,
-                               halo = (1, 1, 1),
-                               partitioned_size = nothing)
+                               halo = (1, 1, 1))
+
+
+    global_sizes = concatenate_local_size(size, arch)
+    global_size  = sum.(global_sizes)
 
     Nλ, Nφ, Nz, Hλ, Hφ, Hz, latitude, longitude, z, topology, precompute_metrics =
-        validate_lat_lon_grid_args(FT, latitude, longitude, z, size, halo, topology, precompute_metrics)
+        validate_lat_lon_grid_args(FT, latitude, longitude, z, global_size, halo, topology, precompute_metrics)
     
     ri, rj, rk = arch.local_index
     Rx, Ry, Rz = arch.ranks
@@ -115,23 +106,11 @@ function LatitudeLongitudeGrid(arch::DistributedArch,
     TY = insert_connected_topology(topology[2], Ry, rj)
     TZ = insert_connected_topology(topology[3], Rz, rk)
 
-    if isnothing(partitioned_size)
-        @assert isinteger(Nλ / Rx)
-        @assert isinteger(Nφ / Ry)
-        @assert isinteger(Nz / Rz)
-
-        nλ, nφ, nz = Nλ÷Rx, Nφ÷Ry, Nz÷Rz
+    nλ, nφ, nz = size
     
-        λl = partition(longitude, nλ, Rx, ri)
-        φl = partition(latitude,  nφ, Ry, rj)
-        zl = partition(z,         nz, Rz, rk)
-    else
-        nλ, nφ, nz = (partitioned_size[1][ri], partitioned_size[2][rj], Nz)
-
-        λl = partition(longitude, partitioned_size[1], Rx, ri)
-        φl = partition(latitude,  partitioned_size[2], Ry, rj)
-        zl = z
-    end
+    λl = partition(longitude, nλ, Rx, ri)
+    φl = partition(latitude,  nφ, Ry, rj)
+    zl = partition(z,         nz, Rz, rk)
 
     # Calculate all direction (which might be stretched)
     # A direction is regular if the domain passed is a Tuple{<:Real, <:Real}, 
@@ -179,7 +158,7 @@ function reconstruct_global_grid(grid::DistributedRectilinearGrid)
 
     nx, ny, nz = n = size(grid)
     Hx, Hy, Hz = H = halo_size(grid)
-    Nx, Ny, Nz = n .* R
+    Nx, Ny, Nz = sum.(concatenate_local_size(n, arch))
 
     TX, TY, TZ = topology(grid)
 
@@ -222,7 +201,7 @@ function reconstruct_global_grid(grid::DistributedLatitudeLongitudeGrid)
 
     nλ, nφ, nz = n = size(grid)
     Hλ, Hφ, Hz = H = halo_size(grid)
-    Nλ, Nφ, Nz = n .* R
+    Nλ, Nφ, Nz = sum.(concatenate_local_size(n, arch))
 
     TX, TY, TZ = topology(grid)
 
diff --git a/src/Distributed/partition_assemble.jl b/src/Distributed/partition_assemble.jl
index d379040dbf..9e011d012a 100644
--- a/src/Distributed/partition_assemble.jl
+++ b/src/Distributed/partition_assemble.jl
@@ -1,32 +1,62 @@
 using Oceananigans.Architectures: arch_array
 
+concatenate_local_size(n, arch::DistributedArch) = (concatenate_local_size(n, arch, 1),
+                                                    concatenate_local_size(n, arch, 2),
+                                                    concatenate_local_size(n, arch, 3))
+
+function concatenate_local_size(n, arch::DistributedArch, idx)
+    R = arch.ranks[idx]
+    r = arch.local_index[idx]
+    n = n[idx]
+    l = zeros(Int, R)
+
+    r1, r2 = arch.local_index[[1, 2, 3] .!= idx]
+    
+    if r1 == 1 && r2 == 1
+        l[r] = n
+    end
+
+    MPI.Allreduce!(l, +, arch.communicator)
+    
+    return l
+end
+
+function concatenate_local_size(n, R, r) 
+    l = zeros(Int, R)
+    l[r] = n
+    MPI.Allreduce!(l, +, MPI.COMM_WORLD)
+
+    return l
+end
+
 # Partitioning (localization of global objects) and assembly (global assembly of local objects)
 # Used for grid constructors (cpu_face_constructor_x, cpu_face_constructor_y, cpu_face_constructor_z)
 # which means that we need to repeat the value at the right boundary
 
-partition(c::Colon, Nc, Nr, r) = Colon()
-partition(c::Tuple, Nc, Nr, r) = (c[1] + (r-1) * (c[2] - c[1]) / Nr,  c[1] + r * (c[2] - c[1]) / Nr)
+partition(c::Colon, n, R, r) = Colon()
 
-# Have to fix this! This won't work for face constructors
-partition(c::AbstractVector, Nc, Nr, r) = c[1 + (r-1) * Nc : 1 + Nc * r]
-
-function partition(c::UnitRange, Nc, Nr, r)
-    g = (first(c), last(r))
-    ℓ = partition(g, Nc, Nr, r)
+function partition(c::UnitRange, n, R, r)
+    g = (first(c), last(c))
+    ℓ = partition(g, n, R, r)
     return UnitRange(ℓ[1], ℓ[2])
 end
 
 # Have to fix this! This won't work for face constructors
-partition(c::AbstractVector, Nc::AbstractVector, Nr, r) = c[1 + sum(Nc[1:r-1]) : 1 + sum(Nc[1:r])]
+function partition(c::AbstractVector, n, R, r)
+    nl = concatenate_local_size(n, R, r)
+    return c[1 + sum(nl[1:r-1]) : 1 + sum(nl[1:r])]
+end
 
-function partition(c::Tuple, Nc::AbstractVector, Nr, r)
-    Nt = sum(Nc)
-    Δl = (c[2] - c[1]) / Nt      
+function partition(c::Tuple, n, R, r)
+    nl = concatenate_local_size(n, R, r)
+    N  = sum(nl)
 
-    l = Tuple{Float64, Float64}[(c[1], c[1] + Δl * Nc[1])]
-    for i in 2:length(Nc)
+    Δl = (c[2] - c[1]) / N  
+
+    l = Tuple{Float64, Float64}[(c[1], c[1] + Δl * nl[1])]
+    for i in 2:R
         lp = l[i-1][2]
-        push!(l, (lp, lp + Δl * Nc[i]))
+        push!(l, (lp, lp + Δl * nl[i]))
     end
 
     return l[r]
@@ -40,16 +70,13 @@ a local number of elements `Nc`, number of ranks `Nr`, rank `r`,
 and `arch`itecture. Since we use a global reduction, only ranks at positions
 1 in the other two directions `r1 == 1` and `r2 == 1` fill the 1D array.
 """
-function assemble(c_local::AbstractVector, Nc, Nr, r, r1, r2, comm) 
-
-    Nl = zeros(Int, Nr)
-    Nl[r] = Nc
-    MPI.Allreduce!(Nl, +, comm)
+function assemble(c_local::AbstractVector, n, R, r, r1, r2, comm) 
+    nl = concatenate_local_size(n, R, r)
 
-    c_global = zeros(eltype(c_local), sum(Nl)+1)
+    c_global = zeros(eltype(c_local), sum(nl)+1)
 
     if r1 == 1 && r2 == 1
-        c_global[1 + sum(Nl[1:r-1]) : sum(Nl[1:r])] .= c_local[1:end-1]
+        c_global[1 + sum(nl[1:r-1]) : sum(nl[1:r])] .= c_local[1:end-1]
         r == Nr && (c_global[end] = c_local[end])
     end
 
@@ -59,12 +86,12 @@ function assemble(c_local::AbstractVector, Nc, Nr, r, r1, r2, comm)
 end
 
 # Simple case, just take the first and the last core
-function assemble(c::Tuple, Nc, Nr, r, r1, r2, comm) 
+function assemble(c::Tuple, n, R, r, r1, r2, comm) 
     c_global = zeros(Int, 2)
 
-    if r == 1
+    if r == 1 && r1 == 1 && r2 == 1
         c_global[1] = c[1]
-    elseif r == Nr
+    elseif r == R && r1 == 1 && r2 == 1
         c_global[2] = c[2]
     end
 
@@ -75,32 +102,25 @@ end
 
 # TODO: partition_global_array and construct_global_array
 # do not currently work for 2D or 3D parallelizations
-# (They are not used anywhere in the code at the moment)
+# (They are not used anywhere in the code at the moment exept for immersed boundaries)
 """
     partition_global_array(arch, c_global, (nx, ny, nz))
 
 Partition a global array in local arrays of size `(nx, ny)` if 2D or `(nx, ny, nz)` is 3D.
 Usefull for boundary arrays, forcings and initial conditions.
 """
-partition_global_array(arch, c_global::Function, nl) = c_global 
+partition_global_array(arch, c_global::Function, n) = c_global 
 
 # Here we just assume we cannot partition in z (we should remove support for that!!)
-function partition_global_array(arch, c_global::AbstractArray, nl) 
+function partition_global_array(arch, c_global::AbstractArray, n) 
     c_global = arch_array(CPU(), c_global)
-    Rx, Ry, Rz = R = arch.ranks
+
     ri, rj, rk = r = arch.local_index
 
     dims = length(size(c_global))
+    nx, ny, nz = concatenate_local_size(n, arch)
 
-    nx = zeros(Int, Rx)
-    nx[r] = nl[1]
-    MPI.Allreduce!(nx, +, comm)
-
-    ny = zeros(Int, Ry)
-    ny[r] = nl[2]
-    MPI.Allreduce!(ny, +, comm)
-
-    nz = nl[3]
+    nz = nz[1]
 
     if dims == 2 
         c_local = zeros(eltype(c_global), nx[ri], ny[rj])
@@ -126,24 +146,18 @@ Usefull for boundary arrays, forcings and initial conditions.
 construct_global_array(arch, c_local::Function, N) = c_local
 
 # TODO: This does not work for 2D parallelizations!!!
-function construct_global_array(arch, c_local::AbstractArray, nl) 
+function construct_global_array(arch, c_local::AbstractArray, n) 
     c_local = arch_array(CPU(), c_local)
-    Rx, Ry, Rz = R = arch.ranks
-    ri, rj, rk = r = arch.local_index
+
+    ri, rj, rk = arch.local_index
 
     dims = length(size(c_local))
 
-    nx = zeros(Int, Rx)
-    nx[r] = nl[1]
-    MPI.Allreduce!(nx, +, comm)
+    nx, ny, nz = concatenate_local_size(n, arch)
 
-    ny = zeros(Int, Ry)
-    ny[r] = nl[2]
-    MPI.Allreduce!(ny, +, comm)
-    
-    Nx = sum(nxl)
-    Ny = sum(nyl)
-    Nz = nl[3]
+    Nx = sum(nx)
+    Ny = sum(ny)
+    Nz = nz[1]
 
     if dims == 2 
         c_global = zeros(eltype(c_local), Nx, Ny)
diff --git a/src/ImmersedBoundaries/ImmersedBoundaries.jl b/src/ImmersedBoundaries/ImmersedBoundaries.jl
index e64b876a02..b6a8de3c6e 100644
--- a/src/ImmersedBoundaries/ImmersedBoundaries.jl
+++ b/src/ImmersedBoundaries/ImmersedBoundaries.jl
@@ -137,9 +137,11 @@ const IBG = ImmersedBoundaryGrid
 @inline z_domain(ibg::IBG) = z_domain(ibg.underlying_grid)
 
 Adapt.adapt_structure(to, ibg::IBG{FT, TX, TY, TZ}) where {FT, TX, TY, TZ} =
-    ImmersedBoundaryGrid{TX, TY, TZ}(adapt(to, ibg.underlying_grid), adapt(to, ibg.immersed_boundary), adapt(to, ibg.active_cells_map))
+    ImmersedBoundaryGrid{TX, TY, TZ}(adapt(to, ibg.underlying_grid), adapt(to, ibg.immersed_boundary), adapt(to, ibg.active_cells_interior), adapt(to, ibg.active_cells_surface))
 
-with_halo(halo, ibg::ImmersedBoundaryGrid) = ImmersedBoundaryGrid(with_halo(halo, ibg.underlying_grid), ibg.immersed_boundary)
+function with_halo(halo, ibg::ImmersedBoundaryGrid) 
+    return ImmersedBoundaryGrid(with_halo(halo, ibg.underlying_grid), ibg.immersed_boundary)
+end
 
 # ImmersedBoundaryGrids require an extra halo point to check the "inactivity" of a `Face` node at N + H 
 # (which requires checking `Center` nodes at N + H and N + H + 1)
diff --git a/src/ImmersedBoundaries/active_cells_map.jl b/src/ImmersedBoundaries/active_cells_map.jl
index 068d37248f..8adfb8ca53 100644
--- a/src/ImmersedBoundaries/active_cells_map.jl
+++ b/src/ImmersedBoundaries/active_cells_map.jl
@@ -7,17 +7,20 @@ import Oceananigans.Utils: active_cells_work_layout
 
 const ActiveCellsIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:AbstractArray}
 
+struct InteriorMap end
+struct SurfaceMap end
+
 @inline use_only_active_interior_cells(grid::AbstractGrid)   = nothing
-@inline use_only_active_interior_cells(grid::ActiveCellsIBG) = Val(:interior)
+@inline use_only_active_interior_cells(grid::ActiveCellsIBG) = InteriorMap()
 
 @inline use_only_active_surface_cells(grid::AbstractGrid)   = nothing
-@inline use_only_active_surface_cells(grid::ActiveCellsIBG) = Val(:surface)
+@inline use_only_active_surface_cells(grid::ActiveCellsIBG) = SurfaceMap()
 
-@inline active_cells_work_layout(size, ::Val{:surface},  grid::ActiveCellsIBG) = min(length(grid.active_cells_surface),  256), length(grid.active_cells_surface)
-@inline active_cells_work_layout(size, ::Val{:interior}, grid::ActiveCellsIBG) = min(length(grid.active_cells_interior), 256), length(grid.active_cells_interior)
+@inline active_cells_work_layout(size, ::InteriorMap, grid::ActiveCellsIBG) = min(length(grid.active_cells_interior), 256), length(grid.active_cells_interior)
+@inline active_cells_work_layout(size, ::SurfaceMap,  grid::ActiveCellsIBG) = min(length(grid.active_cells_surface),  256), length(grid.active_cells_surface)
 
 @inline active_linear_index_to_interior_tuple(idx, grid::ActiveCellsIBG) = Base.map(Int, grid.active_cells_interior[idx])
-@inline active_linear_index_to_surface_tuple(idx, grid::ActiveCellsIBG)  = Base.map(Int, grid.active_cells_surface[idx])
+@inline  active_linear_index_to_surface_tuple(idx, grid::ActiveCellsIBG) = Base.map(Int, grid.active_cells_surface[idx])
 
 function ImmersedBoundaryGrid(grid, ib, active_cells_map::Bool) 
 
@@ -29,11 +32,12 @@ function ImmersedBoundaryGrid(grid, ib, active_cells_map::Bool)
         map_interior = active_cells_map_interior(ibg)
         map_interior = arch_array(architecture(ibg), map_interior)
 
-        map_surface = active_cells_map_surface(ibg)
-        map_surface = arch_array(architecture(ibg), map_surface)
+        map_surface  = nothing
+        # map_surface = active_cells_map_surface(ibg)
+        # map_surface = arch_array(architecture(ibg), map_surface)
     else
-        map_interior = nothing
         map_surface  = nothing
+        map_interior = nothing
     end
 
     return ImmersedBoundaryGrid{TX, TY, TZ}(ibg.underlying_grid, 
diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index 7ce356dd62..c06942133a 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -48,7 +48,7 @@ to be specified.
 
 For more information, see: https://github.com/CliMA/Oceananigans.jl/pull/308
 """
-function work_layout(grid, workdims::Symbol; include_right_boundaries=false, location=nothing, reduced_dimensions=(), only_active_cells = false)
+function work_layout(grid, workdims::Symbol; include_right_boundaries=false, location=nothing, reduced_dimensions=())
 
     Nx′, Ny′, Nz′ = include_right_boundaries ? size(location, grid) : size(grid)
     Nx′, Ny′, Nz′ = flatten_reduced_dimensions((Nx′, Ny′, Nz′), reduced_dimensions)
@@ -61,11 +61,6 @@ function work_layout(grid, workdims::Symbol; include_right_boundaries=false, loc
                workdims == :xz  ? (Nx′, Nz′) :
                workdims == :yz  ? (Ny′, Nz′) : throw(ArgumentError("Unsupported launch configuration: $workdims"))
 
-
-    if !isnothing(only_active_cells)
-        workgroup, worksize = active_cells_work_layout(worksize, only_active_cells, grid) 
-    end
-
     return workgroup, worksize
 end
 
@@ -81,14 +76,17 @@ function launch!(arch, grid, workspec, kernel!, kernel_args...;
                  include_right_boundaries = false,
                  reduced_dimensions = (),
                  location = nothing,
-                 only_active_cells = false,
+                 only_active_cells = nothing,
                  kwargs...)
 
     workgroup, worksize = work_layout(grid, workspec;
                                       include_right_boundaries,
                                       reduced_dimensions,
-                                      location, 
-                                      only_active_cells)
+                                      location)
+    
+    if !isnothing(only_active_cells)
+        workgroup, worksize = active_cells_work_layout(worksize, only_active_cells, grid) 
+    end
 
     loop! = kernel!(Architectures.device(arch), workgroup, worksize)
 
diff --git a/validation/distributed_simulations/mpi_geostrophic_adjustment.jl b/validation/distributed_simulations/mpi_geostrophic_adjustment.jl
index cf38583ff5..08f66ecdb3 100644
--- a/validation/distributed_simulations/mpi_geostrophic_adjustment.jl
+++ b/validation/distributed_simulations/mpi_geostrophic_adjustment.jl
@@ -30,13 +30,18 @@ arch = DistributedArch(CPU(); topology = topo,
 Lh = 100kilometers
 Lz = 400meters
 
+Nx = [10, 13, 18, 39]
+
 grid = RectilinearGrid(arch,
-                       size = (80, 3, 1),
+                       size = (Nx[rank+1], 3, 1),
                        x = (0, Lh), y = (0, Lh), z = (-Lz, 0),
-                       topology = topo)
+                       topology = topo,
+                       )
+
+@show rank, grid
 
 bottom(x, y) = x > 80kilometers && x < 90kilometers ? 100.0 : -500meters
-grid = ImmersedBoundaryGrid(grid, GridFittedBottom(bottom))
+grid = ImmersedBoundaryGrid(grid, GridFittedBottom(bottom), true)
 
 coriolis = FPlane(f = 1e-4)
 
@@ -88,7 +93,6 @@ function progress_message(sim)
     sim.model.clock.time, maximum(abs, sim.model.velocities.u))
 end
 
-
 simulation.callbacks[:save_η]   = Callback(save_η, IterationInterval(1))
 simulation.callbacks[:save_v]   = Callback(save_v, IterationInterval(1))
 simulation.callbacks[:save_u]   = Callback(save_u, IterationInterval(1))
@@ -98,4 +102,4 @@ run!(simulation)
 
 jldsave("variables_rank$(rank).jld2", varr = varr, ηarr = ηarr, uarr = uarr)
 
-MPI.Finalize()
+MPI.Finalize()
\ No newline at end of file

From e21bf3c380f940fda0e9bda0a4a738b68c1b845b Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 4 Apr 2023 18:20:42 -0400
Subject: [PATCH 115/530] do not distribute z

---
 src/Distributed/distributed_grids.jl | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/Distributed/distributed_grids.jl b/src/Distributed/distributed_grids.jl
index 69b62280b6..1e5b012c23 100644
--- a/src/Distributed/distributed_grids.jl
+++ b/src/Distributed/distributed_grids.jl
@@ -36,9 +36,8 @@ function RectilinearGrid(arch::DistributedArch,
                          extent = nothing,
                          topology = (Periodic, Periodic, Bounded))
 
-    global_sizes = concatenate_local_size(size, arch)
-    global_size  = sum.(global_sizes)
-
+    global_size = sum.(concatenate_local_size(size, arch))
+    
     TX, TY, TZ, global_size, halo, x, y, z =
         validate_rectilinear_grid_args(topology, global_size, halo, FT, extent, x, y, z)
 
@@ -55,7 +54,7 @@ function RectilinearGrid(arch::DistributedArch,
     
     xl = partition(x, nx, Rx, ri)
     yl = partition(y, ny, Ry, rj)
-    zl = partition(z, nz, Rz, rk)
+    zl = z
 
     Lx, xᶠᵃᵃ, xᶜᵃᵃ, Δxᶠᵃᵃ, Δxᶜᵃᵃ = generate_coordinate(FT, topology[1], nx, Hx, xl, child_architecture(arch))
     Ly, yᵃᶠᵃ, yᵃᶜᵃ, Δyᵃᶠᵃ, Δyᵃᶜᵃ = generate_coordinate(FT, topology[2], ny, Hy, yl, child_architecture(arch))
@@ -110,7 +109,7 @@ function LatitudeLongitudeGrid(arch::DistributedArch,
     
     λl = partition(longitude, nλ, Rx, ri)
     φl = partition(latitude,  nφ, Ry, rj)
-    zl = partition(z,         nz, Rz, rk)
+    zl = z
 
     # Calculate all direction (which might be stretched)
     # A direction is regular if the domain passed is a Tuple{<:Real, <:Real}, 

From c61b35a88bb001871861d3c9ac702925d992df7d Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 4 Apr 2023 20:48:06 -0400
Subject: [PATCH 116/530] maybe like this I avoid OOM

---
 src/Distributed/partition_assemble.jl      | 24 ++++++++++------------
 src/ImmersedBoundaries/active_cells_map.jl | 22 ++++++++++++--------
 2 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/src/Distributed/partition_assemble.jl b/src/Distributed/partition_assemble.jl
index 9e011d012a..b1e257d970 100644
--- a/src/Distributed/partition_assemble.jl
+++ b/src/Distributed/partition_assemble.jl
@@ -1,5 +1,11 @@
 using Oceananigans.Architectures: arch_array
 
+"""
+    concatenate_local_size(n, arch::DistributedArch) 
+
+returns a 3-Tuple containing a vector of `size(grid, idx)` for each rank in 
+all 3 directions
+"""
 concatenate_local_size(n, arch::DistributedArch) = (concatenate_local_size(n, arch, 1),
                                                     concatenate_local_size(n, arch, 2),
                                                     concatenate_local_size(n, arch, 3))
@@ -33,15 +39,7 @@ end
 # Used for grid constructors (cpu_face_constructor_x, cpu_face_constructor_y, cpu_face_constructor_z)
 # which means that we need to repeat the value at the right boundary
 
-partition(c::Colon, n, R, r) = Colon()
-
-function partition(c::UnitRange, n, R, r)
-    g = (first(c), last(c))
-    ℓ = partition(g, n, R, r)
-    return UnitRange(ℓ[1], ℓ[2])
-end
-
-# Have to fix this! This won't work for face constructors
+# Have to fix this! This won't work for face constructors??
 function partition(c::AbstractVector, n, R, r)
     nl = concatenate_local_size(n, R, r)
     return c[1 + sum(nl[1:r-1]) : 1 + sum(nl[1:r])]
@@ -63,9 +61,9 @@ function partition(c::Tuple, n, R, r)
 end
 
 """
-    assemble(c::AbstractVector, Nc, Nr, r, arch) 
+    assemble(c::AbstractVector, n, R, r, r1, r2, comm) 
 
-Build a linear global coordinate vector given a local coordinate vector `c_local`
+Builds a linear global coordinate vector given a local coordinate vector `c_local`
 a local number of elements `Nc`, number of ranks `Nr`, rank `r`,
 and `arch`itecture. Since we use a global reduction, only ranks at positions
 1 in the other two directions `r1 == 1` and `r2 == 1` fill the 1D array.
@@ -101,7 +99,7 @@ function assemble(c::Tuple, n, R, r, r1, r2, comm)
 end 
 
 # TODO: partition_global_array and construct_global_array
-# do not currently work for 2D or 3D parallelizations
+# do not currently work for 3D parallelizations
 # (They are not used anywhere in the code at the moment exept for immersed boundaries)
 """
     partition_global_array(arch, c_global, (nx, ny, nz))
@@ -145,7 +143,7 @@ Usefull for boundary arrays, forcings and initial conditions.
 """
 construct_global_array(arch, c_local::Function, N) = c_local
 
-# TODO: This does not work for 2D parallelizations!!!
+# TODO: This does not work for 3D parallelizations!!!
 function construct_global_array(arch, c_local::AbstractArray, n) 
     c_local = arch_array(CPU(), c_local)
 
diff --git a/src/ImmersedBoundaries/active_cells_map.jl b/src/ImmersedBoundaries/active_cells_map.jl
index 8adfb8ca53..0d92270c98 100644
--- a/src/ImmersedBoundaries/active_cells_map.jl
+++ b/src/ImmersedBoundaries/active_cells_map.jl
@@ -70,20 +70,26 @@ const MAXUInt32 = 2^32 - 1
 
 function active_cells_map_interior(ibg)
     active_cells_field = compute_active_cells_interior(ibg)
-    full_indices       = findall(arch_array(CPU(), interior(active_cells_field)))
-
-    # Reduce the size of the active_cells_map (originally a tuple of Int64)
+    
     N = maximum(size(ibg))
     IntType = N > MAXUInt8 ? (N > MAXUInt16 ? (N > MAXUInt32 ? UInt64 : UInt32) : UInt16) : UInt8
-    smaller_indices = getproperty.(full_indices, Ref(:I)) .|> Tuple{IntType, IntType, IntType}
-    
-    return smaller_indices
+   
+    # Cannot findall on the entire field because we incur on OOM errors
+    active_indices = Tuple{IntType, IntType, IntType}[]
+    for k in 1:size(ibg, 3)
+        interior_cells = arch_array(CPU(), interior(active_cells_field, :, :, k))
+        push!(active_indices, getproperty.(findall(interior_cells), Ref(:I)) .|> Tuple{IntType, IntType, IntType})
+    end
+
+    return active_indices
 end
 
 function active_cells_map_surface(ibg)
     active_cells_field = compute_active_cells_surface(ibg)
-    full_indices       = findall(arch_array(CPU(), interior(active_cells_field, :, :, 1)))
-    
+    interior_cells     = arch_array(CPU(), interior(active_cells_field, :, :, 1))
+  
+    full_indices = findall(interior_cells)
+
     Nx, Ny, Nz = size(ibg)
     # Reduce the size of the active_cells_map (originally a tuple of Int64)
     N = max(Nx, Ny)

From 5e8d3dc923b2f3e3763c2efe854ea1f6a0a77f6d Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 4 Apr 2023 20:54:44 -0400
Subject: [PATCH 117/530] just test load balancing

---
 src/Distributed/halo_communication.jl  | 22 ++++++++++++----------
 src/Distributed/multi_architectures.jl |  2 +-
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 1aacb1e516..d50859b754 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -174,6 +174,8 @@ function cooperative_waitall!(tasks::Array{Task})
     end
 end
 
+cooperative_waitall!(req::Array{MPI.Request}) = MPI.Waitall(req)
+
 function fill_halo_event!(task, halo_tuple, c, indices, loc, arch::DistributedArch, grid::DistributedGrid, buffers, args...; blocking = true, kwargs...)
     fill_halo!  = halo_tuple[1][task]
     bc_left     = halo_tuple[2][task]
@@ -303,12 +305,12 @@ for side in sides
 
             @debug "Sending " * $side_str * " halo: local_rank=$local_rank, rank_to_send_to=$rank_to_send_to, send_tag=$send_tag"
             
-            send_event = Threads.@spawn begin
+            # send_event = Threads.@spawn begin
                 send_req = MPI.Isend(send_buffer, rank_to_send_to, send_tag, arch.communicator)
-                cooperative_test!(send_req)
-            end
+                # cooperative_test!(send_req)
+            # end
 
-            return send_event
+            return send_req
         end
 
         @inline $get_side_send_buffer(c, grid, side_location, buffers, ::ViewsDistributedArch) = $underlying_side_boundary(c, grid, side_location)
@@ -335,13 +337,13 @@ for side in sides
             @debug "Receiving " * $side_str * " halo: local_rank=$local_rank, rank_to_recv_from=$rank_to_recv_from, recv_tag=$recv_tag"
             recv_req = MPI.Irecv!(recv_buffer, rank_to_recv_from, recv_tag, arch.communicator)
 
-            recv_event = Threads.@spawn begin
-                priority!(device(arch), :high)
-                cooperative_test!(recv_req)
-                sync_device!(arch)
-            end
+            # recv_event = Threads.@spawn begin
+            #     priority!(device(arch), :high)
+            #     cooperative_test!(recv_req)
+            #     sync_device!(arch)
+            # end
 
-            return recv_event
+            return recv_req
         end
 
         @inline $get_side_recv_buffer(c, grid, side_location, buffers, ::ViewsDistributedArch) = $underlying_side_halo(c, grid, side_location)
diff --git a/src/Distributed/multi_architectures.jl b/src/Distributed/multi_architectures.jl
index bfa4ef9d36..2d1f8e2332 100644
--- a/src/Distributed/multi_architectures.jl
+++ b/src/Distributed/multi_architectures.jl
@@ -105,7 +105,7 @@ function DistributedArch(child_architecture = CPU();
         isnothing(devices) ? device!(node_rank % ndevices()) : device!(devices[node_rank+1]) 
     end
 
-    mpi_requests = enable_overlapped_computation ? Task[] : nothing
+    mpi_requests = enable_overlapped_computation ? MPI.Request[] : nothing
 
     B = use_buffers
     M = typeof(mpi_requests)

From 7eefbf121641128e5adf222d877d80b53c29fab6 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 4 Apr 2023 21:41:15 -0400
Subject: [PATCH 118/530] test hypothesis

---
 src/ImmersedBoundaries/active_cells_map.jl | 27 ++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/src/ImmersedBoundaries/active_cells_map.jl b/src/ImmersedBoundaries/active_cells_map.jl
index 0d92270c98..fd61974ebe 100644
--- a/src/ImmersedBoundaries/active_cells_map.jl
+++ b/src/ImmersedBoundaries/active_cells_map.jl
@@ -74,16 +74,35 @@ function active_cells_map_interior(ibg)
     N = maximum(size(ibg))
     IntType = N > MAXUInt8 ? (N > MAXUInt16 ? (N > MAXUInt32 ? UInt64 : UInt32) : UInt16) : UInt8
    
+    IndicesType = Tuple{IntType, IntType, IntType}
+
     # Cannot findall on the entire field because we incur on OOM errors
-    active_indices = Tuple{IntType, IntType, IntType}[]
+    active_indices = IndicesType[]
+    active_indices = findall_active_indices!(active_indices, active_cells_field, ibg, IndicesType)
+
+    return active_indices
+end
+
+function findall_active_indices!(active_indices, active_cells_field, ibg, IndicesType)
+    
     for k in 1:size(ibg, 3)
-        interior_cells = arch_array(CPU(), interior(active_cells_field, :, :, k))
-        push!(active_indices, getproperty.(findall(interior_cells), Ref(:I)) .|> Tuple{IntType, IntType, IntType})
+        interior_cells = findall(arch_array(CPU(), interior(active_cells_field, :, :, k:k)))
+        interior_cells = convert_interior_cells(interior_cells, k, IndicesType)
+        active_indices = vcat(active_indices, interior_cells)
+        GC.gc()
     end
 
     return active_indices
 end
 
+function convert_interior_cells(interior_cells, k, IndicesType)
+    interior_cells = getproperty.(interior_cells, :I) 
+    interior_cells = add_3rd_index.(interior_cells, k) |> Array{IndicesType}
+    return interior_cells
+end
+
+@inline add_3rd_index(t::Tuple, k) = (t[1], t[2], k) 
+
 function active_cells_map_surface(ibg)
     active_cells_field = compute_active_cells_surface(ibg)
     interior_cells     = arch_array(CPU(), interior(active_cells_field, :, :, 1))
@@ -97,4 +116,4 @@ function active_cells_map_surface(ibg)
     smaller_indices = getproperty.(full_indices, Ref(:I)) .|> Tuple{IntType, IntType}
     
     return smaller_indices
-end
+end
\ No newline at end of file

From ce71fe9c3650d810e892298911fb522cca3b27ab Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 4 Apr 2023 21:41:34 -0400
Subject: [PATCH 119/530] test hypothesis

---
 src/ImmersedBoundaries/active_cells_map.jl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/ImmersedBoundaries/active_cells_map.jl b/src/ImmersedBoundaries/active_cells_map.jl
index fd61974ebe..9d5e555670 100644
--- a/src/ImmersedBoundaries/active_cells_map.jl
+++ b/src/ImmersedBoundaries/active_cells_map.jl
@@ -89,6 +89,8 @@ function findall_active_indices!(active_indices, active_cells_field, ibg, Indice
         interior_cells = findall(arch_array(CPU(), interior(active_cells_field, :, :, k:k)))
         interior_cells = convert_interior_cells(interior_cells, k, IndicesType)
         active_indices = vcat(active_indices, interior_cells)
+
+        @show k
         GC.gc()
     end
 

From a9a740af47cda01dd79989862e52da63f75c2ff6 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 4 Apr 2023 22:00:50 -0400
Subject: [PATCH 120/530] send the correct tuple

---
 src/Distributed/partition_assemble.jl      |  2 +-
 src/ImmersedBoundaries/active_cells_map.jl | 16 +++++++---------
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/src/Distributed/partition_assemble.jl b/src/Distributed/partition_assemble.jl
index b1e257d970..f6fab53781 100644
--- a/src/Distributed/partition_assemble.jl
+++ b/src/Distributed/partition_assemble.jl
@@ -85,7 +85,7 @@ end
 
 # Simple case, just take the first and the last core
 function assemble(c::Tuple, n, R, r, r1, r2, comm) 
-    c_global = zeros(Int, 2)
+    c_global = zeros(Float64, 2)
 
     if r == 1 && r1 == 1 && r2 == 1
         c_global[1] = c[1]
diff --git a/src/ImmersedBoundaries/active_cells_map.jl b/src/ImmersedBoundaries/active_cells_map.jl
index 9d5e555670..26be7c9edd 100644
--- a/src/ImmersedBoundaries/active_cells_map.jl
+++ b/src/ImmersedBoundaries/active_cells_map.jl
@@ -86,21 +86,19 @@ end
 function findall_active_indices!(active_indices, active_cells_field, ibg, IndicesType)
     
     for k in 1:size(ibg, 3)
-        interior_cells = findall(arch_array(CPU(), interior(active_cells_field, :, :, k:k)))
-        interior_cells = convert_interior_cells(interior_cells, k, IndicesType)
-        active_indices = vcat(active_indices, interior_cells)
-
-        @show k
+        interior_indices = findall(arch_array(CPU(), interior(active_cells_field, :, :, k:k)))
+        interior_indices = convert_interior_indices(interior_indices, k, IndicesType)
+        active_indices = vcat(active_indices, interior_indices)
         GC.gc()
     end
 
     return active_indices
 end
 
-function convert_interior_cells(interior_cells, k, IndicesType)
-    interior_cells = getproperty.(interior_cells, :I) 
-    interior_cells = add_3rd_index.(interior_cells, k) |> Array{IndicesType}
-    return interior_cells
+function convert_interior_indices(interior_indices, k, IndicesType)
+    interior_indices =   getproperty.(interior_indices, :I) 
+    interior_indices = add_3rd_index.(interior_indices, k) |> Array{IndicesType}
+    return interior_indices
 end
 
 @inline add_3rd_index(t::Tuple, k) = (t[1], t[2], k) 

From d9b3d67f4e82236102d4591cbb0cccde03dfc4e4 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 5 Apr 2023 06:43:49 -0400
Subject: [PATCH 121/530] remove requirement on grids

---
 src/Distributed/distributed_grids.jl  |  8 ++++----
 src/Distributed/partition_assemble.jl | 27 ++++++++++++++-------------
 src/OutputWriters/checkpointer.jl     |  2 +-
 3 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/src/Distributed/distributed_grids.jl b/src/Distributed/distributed_grids.jl
index 1e5b012c23..98728e53db 100644
--- a/src/Distributed/distributed_grids.jl
+++ b/src/Distributed/distributed_grids.jl
@@ -36,7 +36,7 @@ function RectilinearGrid(arch::DistributedArch,
                          extent = nothing,
                          topology = (Periodic, Periodic, Bounded))
 
-    global_size = sum.(concatenate_local_size(size, arch))
+    global_size = sum.(concatenate_local_sizes(size, arch))
     
     TX, TY, TZ, global_size, halo, x, y, z =
         validate_rectilinear_grid_args(topology, global_size, halo, FT, extent, x, y, z)
@@ -92,7 +92,7 @@ function LatitudeLongitudeGrid(arch::DistributedArch,
                                halo = (1, 1, 1))
 
 
-    global_sizes = concatenate_local_size(size, arch)
+    global_sizes = concatenate_local_sizes(size, arch)
     global_size  = sum.(global_sizes)
 
     Nλ, Nφ, Nz, Hλ, Hφ, Hz, latitude, longitude, z, topology, precompute_metrics =
@@ -157,7 +157,7 @@ function reconstruct_global_grid(grid::DistributedRectilinearGrid)
 
     nx, ny, nz = n = size(grid)
     Hx, Hy, Hz = H = halo_size(grid)
-    Nx, Ny, Nz = sum.(concatenate_local_size(n, arch))
+    Nx, Ny, Nz = sum.(concatenate_local_sizes(n, arch))
 
     TX, TY, TZ = topology(grid)
 
@@ -200,7 +200,7 @@ function reconstruct_global_grid(grid::DistributedLatitudeLongitudeGrid)
 
     nλ, nφ, nz = n = size(grid)
     Hλ, Hφ, Hz = H = halo_size(grid)
-    Nλ, Nφ, Nz = sum.(concatenate_local_size(n, arch))
+    Nλ, Nφ, Nz = sum.(concatenate_local_sizes(n, arch))
 
     TX, TY, TZ = topology(grid)
 
diff --git a/src/Distributed/partition_assemble.jl b/src/Distributed/partition_assemble.jl
index f6fab53781..392762dd87 100644
--- a/src/Distributed/partition_assemble.jl
+++ b/src/Distributed/partition_assemble.jl
@@ -1,16 +1,16 @@
 using Oceananigans.Architectures: arch_array
 
 """
-    concatenate_local_size(n, arch::DistributedArch) 
+    concatenate_local_sizes(n, arch::DistributedArch) 
 
 returns a 3-Tuple containing a vector of `size(grid, idx)` for each rank in 
 all 3 directions
 """
-concatenate_local_size(n, arch::DistributedArch) = (concatenate_local_size(n, arch, 1),
-                                                    concatenate_local_size(n, arch, 2),
-                                                    concatenate_local_size(n, arch, 3))
+concatenate_local_sizes(n, arch::DistributedArch) = (concatenate_local_sizes(n, arch, 1),
+                                                    concatenate_local_sizes(n, arch, 2),
+                                                    concatenate_local_sizes(n, arch, 3))
 
-function concatenate_local_size(n, arch::DistributedArch, idx)
+function concatenate_local_sizes(n, arch::DistributedArch, idx)
     R = arch.ranks[idx]
     r = arch.local_index[idx]
     n = n[idx]
@@ -27,7 +27,7 @@ function concatenate_local_size(n, arch::DistributedArch, idx)
     return l
 end
 
-function concatenate_local_size(n, R, r) 
+function concatenate_local_sizes(n, R, r) 
     l = zeros(Int, R)
     l[r] = n
     MPI.Allreduce!(l, +, MPI.COMM_WORLD)
@@ -41,12 +41,12 @@ end
 
 # Have to fix this! This won't work for face constructors??
 function partition(c::AbstractVector, n, R, r)
-    nl = concatenate_local_size(n, R, r)
+    nl = concatenate_local_sizes(n, R, r)
     return c[1 + sum(nl[1:r-1]) : 1 + sum(nl[1:r])]
 end
 
 function partition(c::Tuple, n, R, r)
-    nl = concatenate_local_size(n, R, r)
+    nl = concatenate_local_sizes(n, R, r)
     N  = sum(nl)
 
     Δl = (c[2] - c[1]) / N  
@@ -69,7 +69,7 @@ and `arch`itecture. Since we use a global reduction, only ranks at positions
 1 in the other two directions `r1 == 1` and `r2 == 1` fill the 1D array.
 """
 function assemble(c_local::AbstractVector, n, R, r, r1, r2, comm) 
-    nl = concatenate_local_size(n, R, r)
+    nl = concatenate_local_sizes(n, R, r)
 
     c_global = zeros(eltype(c_local), sum(nl)+1)
 
@@ -109,14 +109,14 @@ Usefull for boundary arrays, forcings and initial conditions.
 """
 partition_global_array(arch, c_global::Function, n) = c_global 
 
-# Here we just assume we cannot partition in z (we should remove support for that!!)
+# Here we assume that we cannot partition in z (we should remove support for that)
 function partition_global_array(arch, c_global::AbstractArray, n) 
     c_global = arch_array(CPU(), c_global)
 
-    ri, rj, rk = r = arch.local_index
+    ri, rj, rk = arch.local_index
 
     dims = length(size(c_global))
-    nx, ny, nz = concatenate_local_size(n, arch)
+    nx, ny, nz = concatenate_local_sizes(n, arch)
 
     nz = nz[1]
 
@@ -151,7 +151,7 @@ function construct_global_array(arch, c_local::AbstractArray, n)
 
     dims = length(size(c_local))
 
-    nx, ny, nz = concatenate_local_size(n, arch)
+    nx, ny, nz = concatenate_local_sizes(n, arch)
 
     Nx = sum(nx)
     Ny = sum(ny)
@@ -173,5 +173,6 @@ function construct_global_array(arch, c_local::AbstractArray, n)
         
         MPI.Allreduce!(c_global, +, arch.communicator)
     end
+
     return arch_array(child_architecture(arch), c_global)
 end
diff --git a/src/OutputWriters/checkpointer.jl b/src/OutputWriters/checkpointer.jl
index fa948dc0b7..4b7b1b3854 100644
--- a/src/OutputWriters/checkpointer.jl
+++ b/src/OutputWriters/checkpointer.jl
@@ -204,7 +204,7 @@ function set!(model, filepath::AbstractString)
         checkpointed_grid = file["grid"]
 
         model.grid == checkpointed_grid ||
-             error("The grid associated with $filepath and model.grid are not the same!")
+             @warn "The grid associated with $filepath and model.grid are not the same!"
 
         model_fields = prognostic_fields(model)
 

From 46932ca8f143afb1e40422df85fdb3fddc9bfbf1 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 5 Apr 2023 06:58:55 -0400
Subject: [PATCH 122/530] at least fix one test

---
 test/dependencies_for_poisson_solvers.jl | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/test/dependencies_for_poisson_solvers.jl b/test/dependencies_for_poisson_solvers.jl
index 26f5c96267..d1d813f543 100644
--- a/test/dependencies_for_poisson_solvers.jl
+++ b/test/dependencies_for_poisson_solvers.jl
@@ -2,7 +2,7 @@ using CUDA
 using Oceananigans.Solvers: solve!, set_source_term!
 using Oceananigans.Solvers: poisson_eigenvalues
 using Oceananigans.Models.NonhydrostaticModels: solve_for_pressure!
-using Oceananigans.Models.HydrostaticFreeSurfaceModels: _compute_w_from_continuity!
+using Oceananigans.Models.HydrostaticFreeSurfaceModels: compute_w_from_continuity!
 using Oceananigans.BoundaryConditions: regularize_field_boundary_conditions
 
 function poisson_solver_instantiates(grid, planner_flag)
@@ -57,14 +57,10 @@ function random_divergence_free_source_term(grid)
     set!(Rw, zeros(Nx, Ny, Nz))
 
     arch = architecture(grid)
-    fill_halo_regions!(Ru, nothing, nothing)
-    fill_halo_regions!(Rv, nothing, nothing)
-    fill_halo_regions!(Rw, nothing, nothing)
-
-    launch!(arch, grid, :xy, _compute_w_from_continuity!, U, grid)
-
-    fill_halo_regions!(Rw, nothing, nothing)
+    fill_halo_regions!((Ru, Rv, Rw))
 
+    compute_w_from_continuity!(U, arch, grid)
+    
     # Compute the right hand side R = ∇⋅U
     ArrayType = array_type(arch)
     R = zeros(Nx, Ny, Nz) |> ArrayType

From 9fd7695dc2ca96927326719c433248a403262aa6 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 5 Apr 2023 07:16:15 -0400
Subject: [PATCH 123/530] fixed

---
 test/dependencies_for_poisson_solvers.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/dependencies_for_poisson_solvers.jl b/test/dependencies_for_poisson_solvers.jl
index d1d813f543..f09eb9d9a3 100644
--- a/test/dependencies_for_poisson_solvers.jl
+++ b/test/dependencies_for_poisson_solvers.jl
@@ -60,7 +60,8 @@ function random_divergence_free_source_term(grid)
     fill_halo_regions!((Ru, Rv, Rw))
 
     compute_w_from_continuity!(U, arch, grid)
-    
+    fill_halo_regions!(Rw)
+
     # Compute the right hand side R = ∇⋅U
     ArrayType = array_type(arch)
     R = zeros(Nx, Ny, Nz) |> ArrayType

From 05c57930d560fd84deced41075900512b59e25bd Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 5 Apr 2023 07:37:52 -0400
Subject: [PATCH 124/530] remove the surface map

---
 src/ImmersedBoundaries/ImmersedBoundaries.jl             | 9 ++++-----
 src/ImmersedBoundaries/active_cells_map.jl               | 2 +-
 .../grid_fitted_immersed_boundaries.jl                   | 6 +++---
 3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/src/ImmersedBoundaries/ImmersedBoundaries.jl b/src/ImmersedBoundaries/ImmersedBoundaries.jl
index b6a8de3c6e..a30f790f82 100644
--- a/src/ImmersedBoundaries/ImmersedBoundaries.jl
+++ b/src/ImmersedBoundaries/ImmersedBoundaries.jl
@@ -104,20 +104,19 @@ abstract type AbstractImmersedBoundary end
 ##### ImmersedBoundaryGrid
 #####
 
-struct ImmersedBoundaryGrid{FT, TX, TY, TZ, G, I, M, S, Arch} <: AbstractGrid{FT, TX, TY, TZ, Arch}
+struct ImmersedBoundaryGrid{FT, TX, TY, TZ, G, I, M, Arch} <: AbstractGrid{FT, TX, TY, TZ, Arch}
     architecture :: Arch
     underlying_grid :: G
     immersed_boundary :: I
     active_cells_interior :: M
-    active_cells_surface :: S
     
     # Internal interface
-    function ImmersedBoundaryGrid{TX, TY, TZ}(grid::G, ib::I, mi::M, ms::S) where {TX, TY, TZ, G <: AbstractUnderlyingGrid, I, M, S}
+    function ImmersedBoundaryGrid{TX, TY, TZ}(grid::G, ib::I, mi::M) where {TX, TY, TZ, G <: AbstractUnderlyingGrid, I, M}
         FT = eltype(grid)
         arch = architecture(grid)
         Arch = typeof(arch)
         
-        return new{FT, TX, TY, TZ, G, I, M, S, Arch}(arch, grid, ib, mi, ms)
+        return new{FT, TX, TY, TZ, G, I, M, S, Arch}(arch, grid, ib, mi)
     end
 end
 
@@ -137,7 +136,7 @@ const IBG = ImmersedBoundaryGrid
 @inline z_domain(ibg::IBG) = z_domain(ibg.underlying_grid)
 
 Adapt.adapt_structure(to, ibg::IBG{FT, TX, TY, TZ}) where {FT, TX, TY, TZ} =
-    ImmersedBoundaryGrid{TX, TY, TZ}(adapt(to, ibg.underlying_grid), adapt(to, ibg.immersed_boundary), adapt(to, ibg.active_cells_interior), adapt(to, ibg.active_cells_surface))
+    ImmersedBoundaryGrid{TX, TY, TZ}(adapt(to, ibg.underlying_grid), adapt(to, ibg.immersed_boundary), adapt(to, ibg.active_cells_interior))
 
 function with_halo(halo, ibg::ImmersedBoundaryGrid) 
     return ImmersedBoundaryGrid(with_halo(halo, ibg.underlying_grid), ibg.immersed_boundary)
diff --git a/src/ImmersedBoundaries/active_cells_map.jl b/src/ImmersedBoundaries/active_cells_map.jl
index 26be7c9edd..02239c60f9 100644
--- a/src/ImmersedBoundaries/active_cells_map.jl
+++ b/src/ImmersedBoundaries/active_cells_map.jl
@@ -42,7 +42,7 @@ function ImmersedBoundaryGrid(grid, ib, active_cells_map::Bool)
 
     return ImmersedBoundaryGrid{TX, TY, TZ}(ibg.underlying_grid, 
                                             ibg.immersed_boundary, 
-                                            map_interior, map_surface)
+                                            map_interior)
 end
 
 @inline active_cell(i, j, k, ibg) = !immersed_cell(i, j, k, ibg)
diff --git a/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl b/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl
index f1984fe9ba..36bb9db960 100644
--- a/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl
+++ b/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl
@@ -82,7 +82,7 @@ end
 function ImmersedBoundaryGrid(grid, ib::AbstractGridFittedBottom{<:OffsetArray})
     TX, TY, TZ = topology(grid)
     validate_ib_size(grid, ib)
-    return ImmersedBoundaryGrid{TX, TY, TZ}(grid, ib, nothing, nothing)
+    return ImmersedBoundaryGrid{TX, TY, TZ}(grid, ib, nothing)
 end
 
 function validate_ib_size(grid, ib)
@@ -177,9 +177,9 @@ function ImmersedBoundaryGrid(grid, ib::GridFittedBoundary; precompute_mask=true
     if precompute_mask
         mask_field = compute_mask(grid, ib)
         new_ib = GridFittedBoundary(mask_field)
-        return ImmersedBoundaryGrid{TX, TY, TZ}(grid, new_ib)
+        return ImmersedBoundaryGrid{TX, TY, TZ}(grid, new_ib, nothing)
     else
-        return ImmersedBoundaryGrid{TX, TY, TZ}(grid, ib)
+        return ImmersedBoundaryGrid{TX, TY, TZ}(grid, ib, nothing)
     end
 end
 

From 3b272a1c59fea4fc889206ea28071658835d0bfd Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 5 Apr 2023 07:42:56 -0400
Subject: [PATCH 125/530] bugfix

---
 src/Distributed/distributed_grids.jl         | 2 +-
 src/ImmersedBoundaries/ImmersedBoundaries.jl | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Distributed/distributed_grids.jl b/src/Distributed/distributed_grids.jl
index 98728e53db..f74d7bc0c1 100644
--- a/src/Distributed/distributed_grids.jl
+++ b/src/Distributed/distributed_grids.jl
@@ -19,7 +19,7 @@ const DistributedRectilinearGrid{FT, TX, TY, TZ, FX, FY, FZ, VX, VY, VZ} =
 const DistributedLatitudeLongitudeGrid{FT, TX, TY, TZ, M, MY, FX, FY, FZ, VX, VY, VZ} = 
     LatitudeLongitudeGrid{FT, TX, TY, TZ, M, MY, FX, FY, FZ, VX, VY, VZ, <:DistributedArch} where {FT, TX, TY, TZ, M, MY, FX, FY, FZ, VX, VY, VZ}
 
-const DistributedImmersedBoundaryGrid = ImmersedBoundaryGrid{FT, TX, TY, TZ, <:DistributedGrid, I, M, S, <:DistributedArch} where {FT, TX, TY, TZ, I, S, M}
+const DistributedImmersedBoundaryGrid = ImmersedBoundaryGrid{FT, TX, TY, TZ, <:DistributedGrid, I, M, <:DistributedArch} where {FT, TX, TY, TZ, I, M}
 
 """
     RectilinearGrid(arch::DistributedArch, FT=Float64; kw...)
diff --git a/src/ImmersedBoundaries/ImmersedBoundaries.jl b/src/ImmersedBoundaries/ImmersedBoundaries.jl
index a30f790f82..9f2ce94f16 100644
--- a/src/ImmersedBoundaries/ImmersedBoundaries.jl
+++ b/src/ImmersedBoundaries/ImmersedBoundaries.jl
@@ -116,7 +116,7 @@ struct ImmersedBoundaryGrid{FT, TX, TY, TZ, G, I, M, Arch} <: AbstractGrid{FT, T
         arch = architecture(grid)
         Arch = typeof(arch)
         
-        return new{FT, TX, TY, TZ, G, I, M, S, Arch}(arch, grid, ib, mi)
+        return new{FT, TX, TY, TZ, G, I, M, Arch}(arch, grid, ib, mi)
     end
 end
 

From 224f826de503ce43a11e764d473c70e3600faadf Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 5 Apr 2023 09:50:30 -0400
Subject: [PATCH 126/530] allow flat distributed grids

---
 src/Distributed/distributed_grids.jl  | 8 +++++---
 src/Distributed/partition_assemble.jl | 5 ++---
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/Distributed/distributed_grids.jl b/src/Distributed/distributed_grids.jl
index f74d7bc0c1..643b119af0 100644
--- a/src/Distributed/distributed_grids.jl
+++ b/src/Distributed/distributed_grids.jl
@@ -2,7 +2,7 @@ using MPI
 using OffsetArrays
 using Oceananigans.Utils: getnamewrapper
 using Oceananigans.Grids: topology, size, halo_size, architecture, pop_flat_elements
-using Oceananigans.Grids: validate_rectilinear_grid_args, validate_lat_lon_grid_args
+using Oceananigans.Grids: validate_rectilinear_grid_args, validate_lat_lon_grid_args, validate_size
 using Oceananigans.Grids: generate_coordinate, with_precomputed_metrics
 using Oceananigans.Grids: cpu_face_constructor_x, cpu_face_constructor_y, cpu_face_constructor_z
 using Oceananigans.Grids: R_Earth, metrics_precomputed
@@ -41,6 +41,8 @@ function RectilinearGrid(arch::DistributedArch,
     TX, TY, TZ, global_size, halo, x, y, z =
         validate_rectilinear_grid_args(topology, global_size, halo, FT, extent, x, y, z)
 
+    size = validate_size(TX, TY, TZ, size)
+
     Hx, Hy, Hz = halo
 
     ri, rj, rk = arch.local_index
@@ -49,8 +51,6 @@ function RectilinearGrid(arch::DistributedArch,
     TX = insert_connected_topology(TX, Rx, ri)
     TY = insert_connected_topology(TY, Ry, rj)
     TZ = insert_connected_topology(TZ, Rz, rk)
-
-    nx, ny, nz = size
     
     xl = partition(x, nx, Rx, ri)
     yl = partition(y, ny, Ry, rj)
@@ -98,6 +98,8 @@ function LatitudeLongitudeGrid(arch::DistributedArch,
     Nλ, Nφ, Nz, Hλ, Hφ, Hz, latitude, longitude, z, topology, precompute_metrics =
         validate_lat_lon_grid_args(FT, latitude, longitude, z, global_size, halo, topology, precompute_metrics)
     
+    size = validate_size(topology..., size)
+
     ri, rj, rk = arch.local_index
     Rx, Ry, Rz = arch.ranks
 
diff --git a/src/Distributed/partition_assemble.jl b/src/Distributed/partition_assemble.jl
index 392762dd87..3590f1bc10 100644
--- a/src/Distributed/partition_assemble.jl
+++ b/src/Distributed/partition_assemble.jl
@@ -6,9 +6,8 @@ using Oceananigans.Architectures: arch_array
 returns a 3-Tuple containing a vector of `size(grid, idx)` for each rank in 
 all 3 directions
 """
-concatenate_local_sizes(n, arch::DistributedArch) = (concatenate_local_sizes(n, arch, 1),
-                                                    concatenate_local_sizes(n, arch, 2),
-                                                    concatenate_local_sizes(n, arch, 3))
+concatenate_local_sizes(n, arch::DistributedArch) = 
+    Tuple(concatenate_local_sizes(n, arch, i) for i in 1:length(n))
 
 function concatenate_local_sizes(n, arch::DistributedArch, idx)
     R = arch.ranks[idx]

From 65007e39bef9536a674cb294d8920f173b00a350 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 5 Apr 2023 10:34:05 -0400
Subject: [PATCH 127/530] fixit

---
 src/Distributed/distributed_grids.jl | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/Distributed/distributed_grids.jl b/src/Distributed/distributed_grids.jl
index 643b119af0..1bd1046da4 100644
--- a/src/Distributed/distributed_grids.jl
+++ b/src/Distributed/distributed_grids.jl
@@ -41,8 +41,7 @@ function RectilinearGrid(arch::DistributedArch,
     TX, TY, TZ, global_size, halo, x, y, z =
         validate_rectilinear_grid_args(topology, global_size, halo, FT, extent, x, y, z)
 
-    size = validate_size(TX, TY, TZ, size)
-
+    nx, ny, nz = validate_size(TX, TY, TZ, size)
     Hx, Hy, Hz = halo
 
     ri, rj, rk = arch.local_index
@@ -98,8 +97,7 @@ function LatitudeLongitudeGrid(arch::DistributedArch,
     Nλ, Nφ, Nz, Hλ, Hφ, Hz, latitude, longitude, z, topology, precompute_metrics =
         validate_lat_lon_grid_args(FT, latitude, longitude, z, global_size, halo, topology, precompute_metrics)
     
-    size = validate_size(topology..., size)
-
+    nλ, nφ, nz = validate_size(topology..., size)
     ri, rj, rk = arch.local_index
     Rx, Ry, Rz = arch.ranks
 
@@ -107,8 +105,6 @@ function LatitudeLongitudeGrid(arch::DistributedArch,
     TY = insert_connected_topology(topology[2], Ry, rj)
     TZ = insert_connected_topology(topology[3], Rz, rk)
 
-    nλ, nφ, nz = size
-    
     λl = partition(longitude, nλ, Rx, ri)
     φl = partition(latitude,  nφ, Ry, rj)
     zl = z

From 9f5cb7a08aa7e99b74b44328ce69ce149c941251 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 5 Apr 2023 12:12:14 -0400
Subject: [PATCH 128/530] fix all CPU tests

---
 src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl b/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl
index 36bb9db960..b490f788d3 100644
--- a/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl
+++ b/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl
@@ -185,7 +185,7 @@ end
 
 function ImmersedBoundaryGrid(grid, ib::GridFittedBoundary{<:OffsetArray}; kw...)
     TX, TY, TZ = topology(grid)
-    return ImmersedBoundaryGrid{TX, TY, TZ}(grid, ib)
+    return ImmersedBoundaryGrid{TX, TY, TZ}(grid, ib, nothing)
 end
 
 on_architecture(arch, ib::GridFittedBoundary{<:AbstractArray}) = GridFittedBoundary(arch_array(arch, ib.mask))

From 53449710e71f21dd8b53689e4c888e89b81226b5 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 6 Apr 2023 15:56:01 -0400
Subject: [PATCH 129/530] back to the previous version

---
 .../ri_based_vertical_diffusivity.jl          | 21 ++++++++++---------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index 5fd1062aa5..5bdae1f9fe 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -63,12 +63,12 @@ Keyword Arguments
 function RiBasedVerticalDiffusivity(time_discretization = VerticallyImplicitTimeDiscretization(),
                                     FT = Float64;
                                     Ri_dependent_tapering = HyperbolicTangentRiDependentTapering(),
-                                    ν₀  = 0.7,
-                                    κ₀  = 0.5,
-                                    κᶜ  = 1.7,
-                                    Cᵉ  = 0.1,
-                                    Ri₀ = 0.1,
-                                    Riᵟ = 0.40,
+                                    ν₀  = 0.30,
+                                    κ₀  = 0.42,
+                                    κᶜ  = 4.0,
+                                    Cᵉ  = 0.57,
+                                    Ri₀ = 0.27,
+                                    Riᵟ = 0.20,
                                     warning = true)
     if warning
         @warn "RiBasedVerticalDiffusivity is an experimental turbulence closure that \n" *
@@ -147,9 +147,10 @@ const Tanh   = HyperbolicTangentRiDependentTapering
 @inline taper(::Exp,    x::T, x₀, δ) where T = exp(- max(zero(T), (x - x₀) / δ))
 @inline taper(::Tanh,   x::T, x₀, δ) where T = (one(T) - tanh((x - x₀) / δ)) / 2
 
-@inline function Riᶜᶜᶠ(i, j, k, grid, velocities, N²)
+@inline function Riᶜᶜᶠ(i, j, k, grid, velocities, bouyancy, tracers)
     ∂z_u² = ℑxᶜᵃᵃ(i, j, k, grid, ∂zᶠᶜᶠ, velocities.u)^2
     ∂z_v² = ℑyᵃᶜᵃ(i, j, k, grid, ∂zᶜᶠᶠ, velocities.v)^2
+    N² = ∂z_b(i, j, k, grid, buoyancy, tracers)
     S² = ∂z_u² + ∂z_v²
     Ri = N² / S²
 
@@ -192,7 +193,7 @@ end
     κᵉ = ifelse(entraining, Cᵉ, zero(grid))
 
     # Shear mixing diffusivity and viscosity
-    Ri = Riᶜᶜᶠ(i, j, k, grid, velocities, N²)
+    Ri = ℑxyᶜᶜᵃ(i, j, k, grid, ℑxyᶜᶜᵃ, Riᶜᶜᶠ, velocities, buoyancy, tracers)
 
     τ = taper(tapering, Ri, Ri₀, Riᵟ)
     κ★ = κ₀ * τ
@@ -200,8 +201,8 @@ end
 
     κⁿ = κᶜ + κᵉ + κ★
     νⁿ = ν★
-    @inbounds diffusivities.κ[i, j, k] = (0.6 * diffusivities.κ[i, j, k] + κⁿ) / 1.6
-    @inbounds diffusivities.ν[i, j, k] = (0.6 * diffusivities.ν[i, j, k] + νⁿ) / 1.6
+    @inbounds diffusivities.κ[i, j, k] = κⁿ
+    @inbounds diffusivities.ν[i, j, k] = νⁿ
 end
 
 #####

From 353c6bc6b5d1b0963b3572269553d471de799d62 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 6 Apr 2023 15:56:22 -0400
Subject: [PATCH 130/530] back to the previous version

---
 .../ri_based_vertical_diffusivity.jl                            | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index 5bdae1f9fe..cad507d568 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -193,7 +193,7 @@ end
     κᵉ = ifelse(entraining, Cᵉ, zero(grid))
 
     # Shear mixing diffusivity and viscosity
-    Ri = ℑxyᶜᶜᵃ(i, j, k, grid, ℑxyᶜᶜᵃ, Riᶜᶜᶠ, velocities, buoyancy, tracers)
+    Ri = ℑxyᶜᶜᵃ(i, j, k, grid, ℑxyᶠᶠᵃ, Riᶜᶜᶠ, velocities, buoyancy, tracers)
 
     τ = taper(tapering, Ri, Ri₀, Riᵟ)
     κ★ = κ₀ * τ

From c849b834d5daccb858e7c75ca37e7bb31e627dbe Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 6 Apr 2023 16:27:46 -0400
Subject: [PATCH 131/530] new formulation

---
 .../ri_based_vertical_diffusivity.jl          | 32 +++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index cad507d568..0942473d86 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -2,6 +2,7 @@ using Oceananigans.Architectures: architecture, arch_array
 using Oceananigans.BuoyancyModels: ∂z_b
 using Oceananigans.Operators
 using Oceananigans.Operators: ℑzᵃᵃᶜ
+using Oceananigans.ImmersedBoundaries: ActiveCellsIBG, use_only_active_interior_cells, active_linear_index_to_interior_tuple 
 
 struct RiBasedVerticalDiffusivity{TD, FT, R} <: AbstractScalarDiffusivity{TD, VerticalFormulation}
     ν₀  :: FT
@@ -120,6 +121,8 @@ function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; k
     velocities = model.velocities
     top_tracer_bcs = NamedTuple(c => tracers[c].boundary_conditions.top for c in propertynames(tracers))
 
+    only_active_cells = use_only_active_interior_cells(grid)
+
     launch!(arch, grid, kernel_size,
             compute_ri_based_diffusivities!,
             diffusivities,
@@ -130,7 +133,8 @@ function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; k
             tracers,
             buoyancy,
             top_tracer_bcs,
-            clock)
+            clock;
+            only_active_cells)
 
     return nothing
 end
@@ -158,8 +162,9 @@ const Tanh   = HyperbolicTangentRiDependentTapering
     return ifelse(N² <= 0, zero(grid), Ri)
 end
 
+
 @kernel function compute_ri_based_diffusivities!(diffusivities, offs, grid, closure::FlavorOfRBVD,
-                                                 velocities, tracers, buoyancy, tracer_bcs, clock)
+                                                velocities, tracers, buoyancy, tracer_bcs, clock)
 
     i′, j′, k′ = @index(Global, NTuple)
 
@@ -167,6 +172,27 @@ end
     j = j′ + offs[2] 
     k = k′ + offs[3]
 
+    _compute_ri_based_diffusivities!(i, j, k, diffusivities, grid, closure,
+                                     velocities, tracers, buoyancy, tracer_bcs, clock)
+end
+
+@kernel function compute_ri_based_diffusivities!(diffusivities, offs, grid::ActiveCellsIBG, closure::FlavorOfRBVD,
+                                                 velocities, tracers, buoyancy, tracer_bcs, clock)
+
+    idx = @index(Global, Linear)
+
+    i′, j′, k′ = active_linear_index_to_interior_tuple(idx, grid)
+    i = i′ + offs[1] 
+    j = j′ + offs[2] 
+    k = k′ + offs[3]
+
+    _compute_ri_based_diffusivities!(i, j, k, diffusivities, grid, closure,
+                                     velocities, tracers, buoyancy, tracer_bcs, clock)
+end
+
+@inline function _compute_ri_based_diffusivities!(i, j, k, diffusivities, grid, closure,
+                                                  velocities, tracers, buoyancy, tracer_bcs, clock)
+
     # Ensure this works with "ensembles" of closures, in addition to ordinary single closures
     closure_ij = getclosure(i, j, closure)
 
@@ -203,6 +229,8 @@ end
     νⁿ = ν★
     @inbounds diffusivities.κ[i, j, k] = κⁿ
     @inbounds diffusivities.ν[i, j, k] = νⁿ
+
+    return nothing
 end
 
 #####

From c13408d261df72b42d2be085b68b3ae3c41170ee Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 6 Apr 2023 16:34:39 -0400
Subject: [PATCH 132/530] old formulation

---
 src/ImmersedBoundaries/ImmersedBoundaries.jl      |  1 +
 src/ImmersedBoundaries/active_cells_map.jl        |  4 ++--
 .../additional_diffusivity_kernels.jl             | 15 +++++++++++++++
 .../ri_based_vertical_diffusivity.jl              | 15 ---------------
 src/Utils/kernel_launching.jl                     |  5 +++--
 5 files changed, 21 insertions(+), 19 deletions(-)
 create mode 100644 src/ImmersedBoundaries/additional_diffusivity_kernels.jl

diff --git a/src/ImmersedBoundaries/ImmersedBoundaries.jl b/src/ImmersedBoundaries/ImmersedBoundaries.jl
index 9f2ce94f16..aa761a1b35 100644
--- a/src/ImmersedBoundaries/ImmersedBoundaries.jl
+++ b/src/ImmersedBoundaries/ImmersedBoundaries.jl
@@ -291,5 +291,6 @@ include("immersed_boundary_condition.jl")
 include("conditional_derivatives.jl")
 include("mask_immersed_field.jl")
 include("immersed_reductions.jl")
+include("additional_diffusivity_kernels.jl")
 
 end # module
diff --git a/src/ImmersedBoundaries/active_cells_map.jl b/src/ImmersedBoundaries/active_cells_map.jl
index 02239c60f9..9dbef8d9d0 100644
--- a/src/ImmersedBoundaries/active_cells_map.jl
+++ b/src/ImmersedBoundaries/active_cells_map.jl
@@ -3,14 +3,14 @@ using Oceananigans.Grids: AbstractGrid
 
 using KernelAbstractions: @kernel, @index
 
-import Oceananigans.Utils: active_cells_work_layout
+import Oceananigans.Utils: active_cells_work_layout, 
+                           use_only_active_interior_cells
 
 const ActiveCellsIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:AbstractArray}
 
 struct InteriorMap end
 struct SurfaceMap end
 
-@inline use_only_active_interior_cells(grid::AbstractGrid)   = nothing
 @inline use_only_active_interior_cells(grid::ActiveCellsIBG) = InteriorMap()
 
 @inline use_only_active_surface_cells(grid::AbstractGrid)   = nothing
diff --git a/src/ImmersedBoundaries/additional_diffusivity_kernels.jl b/src/ImmersedBoundaries/additional_diffusivity_kernels.jl
new file mode 100644
index 0000000000..0bbcf69d59
--- /dev/null
+++ b/src/ImmersedBoundaries/additional_diffusivity_kernels.jl
@@ -0,0 +1,15 @@
+import Oceananigans.TurbulenceClosures
+
+@kernel function compute_ri_based_diffusivities!(diffusivities, offs, grid::ActiveCellsIBG, closure::FlavorOfRBVD,
+                                                 velocities, tracers, buoyancy, tracer_bcs, clock)
+
+    idx = @index(Global, Linear)
+
+    i′, j′, k′ = active_linear_index_to_interior_tuple(idx, grid)
+    i = i′ + offs[1] 
+    j = j′ + offs[2] 
+    k = k′ + offs[3]
+
+    _compute_ri_based_diffusivities!(i, j, k, diffusivities, grid, closure,
+                                     velocities, tracers, buoyancy, tracer_bcs, clock)
+end
\ No newline at end of file
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index 0942473d86..f4fb571ffc 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -162,7 +162,6 @@ const Tanh   = HyperbolicTangentRiDependentTapering
     return ifelse(N² <= 0, zero(grid), Ri)
 end
 
-
 @kernel function compute_ri_based_diffusivities!(diffusivities, offs, grid, closure::FlavorOfRBVD,
                                                 velocities, tracers, buoyancy, tracer_bcs, clock)
 
@@ -176,20 +175,6 @@ end
                                      velocities, tracers, buoyancy, tracer_bcs, clock)
 end
 
-@kernel function compute_ri_based_diffusivities!(diffusivities, offs, grid::ActiveCellsIBG, closure::FlavorOfRBVD,
-                                                 velocities, tracers, buoyancy, tracer_bcs, clock)
-
-    idx = @index(Global, Linear)
-
-    i′, j′, k′ = active_linear_index_to_interior_tuple(idx, grid)
-    i = i′ + offs[1] 
-    j = j′ + offs[2] 
-    k = k′ + offs[3]
-
-    _compute_ri_based_diffusivities!(i, j, k, diffusivities, grid, closure,
-                                     velocities, tracers, buoyancy, tracer_bcs, clock)
-end
-
 @inline function _compute_ri_based_diffusivities!(i, j, k, diffusivities, grid, closure,
                                                   velocities, tracers, buoyancy, tracer_bcs, clock)
 
diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index c06942133a..cb899fed98 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -64,8 +64,9 @@ function work_layout(grid, workdims::Symbol; include_right_boundaries=false, loc
     return workgroup, worksize
 end
 
-active_cells_work_layout(size, only_active_cells, grid) = heuristic_workgroup(size...), size
-
+@inline active_cells_work_layout(size, only_active_cells, grid) = heuristic_workgroup(size...), size
+@inline use_only_active_interior_cells(grid) = nothing
+\
 """
     launch!(arch, grid, layout, kernel!, args...; kwargs...)
 

From 178e408628ff8dddb59e79ab1bebb69cfd7bb4a6 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 6 Apr 2023 16:35:44 -0400
Subject: [PATCH 133/530] bugfix

---
 src/ImmersedBoundaries/additional_diffusivity_kernels.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/ImmersedBoundaries/additional_diffusivity_kernels.jl b/src/ImmersedBoundaries/additional_diffusivity_kernels.jl
index 0bbcf69d59..d45cc1c2fb 100644
--- a/src/ImmersedBoundaries/additional_diffusivity_kernels.jl
+++ b/src/ImmersedBoundaries/additional_diffusivity_kernels.jl
@@ -1,4 +1,5 @@
-import Oceananigans.TurbulenceClosures
+using Oceananigans.TurbulenceClosures: _compute_ri_based_diffusivities!
+import Oceananigans.TurbulenceClosures: compute_ri_based_diffusivities!
 
 @kernel function compute_ri_based_diffusivities!(diffusivities, offs, grid::ActiveCellsIBG, closure::FlavorOfRBVD,
                                                  velocities, tracers, buoyancy, tracer_bcs, clock)

From 5f96cfe67ee8af2948f1524d39aca9751dd0fae6 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 6 Apr 2023 16:38:24 -0400
Subject: [PATCH 134/530] bugfix

---
 .../ri_based_vertical_diffusivity.jl                            | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index f4fb571ffc..f6ab62b3d3 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -2,7 +2,7 @@ using Oceananigans.Architectures: architecture, arch_array
 using Oceananigans.BuoyancyModels: ∂z_b
 using Oceananigans.Operators
 using Oceananigans.Operators: ℑzᵃᵃᶜ
-using Oceananigans.ImmersedBoundaries: ActiveCellsIBG, use_only_active_interior_cells, active_linear_index_to_interior_tuple 
+using Oceananigans.Utils: use_only_active_interior_cells
 
 struct RiBasedVerticalDiffusivity{TD, FT, R} <: AbstractScalarDiffusivity{TD, VerticalFormulation}
     ν₀  :: FT

From ea8c9cbec00b846928e7b4ba8680d0019812ec3c Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 6 Apr 2023 16:42:14 -0400
Subject: [PATCH 135/530] bugfix

---
 src/ImmersedBoundaries/additional_diffusivity_kernels.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ImmersedBoundaries/additional_diffusivity_kernels.jl b/src/ImmersedBoundaries/additional_diffusivity_kernels.jl
index d45cc1c2fb..fa6f21adbc 100644
--- a/src/ImmersedBoundaries/additional_diffusivity_kernels.jl
+++ b/src/ImmersedBoundaries/additional_diffusivity_kernels.jl
@@ -1,4 +1,4 @@
-using Oceananigans.TurbulenceClosures: _compute_ri_based_diffusivities!
+using Oceananigans.TurbulenceClosures: _compute_ri_based_diffusivities!, FlavorOfRBVD
 import Oceananigans.TurbulenceClosures: compute_ri_based_diffusivities!
 
 @kernel function compute_ri_based_diffusivities!(diffusivities, offs, grid::ActiveCellsIBG, closure::FlavorOfRBVD,

From 3c523acc76b6e4cf0209c5a0a28626f88d590c55 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 6 Apr 2023 18:05:31 -0400
Subject: [PATCH 136/530] trick to make it faster (do not merge)

---
 .../additional_diffusivity_kernels.jl         | 18 ++++++++--
 .../ri_based_vertical_diffusivity.jl          | 34 ++++++++++++++++---
 2 files changed, 46 insertions(+), 6 deletions(-)

diff --git a/src/ImmersedBoundaries/additional_diffusivity_kernels.jl b/src/ImmersedBoundaries/additional_diffusivity_kernels.jl
index fa6f21adbc..f807403476 100644
--- a/src/ImmersedBoundaries/additional_diffusivity_kernels.jl
+++ b/src/ImmersedBoundaries/additional_diffusivity_kernels.jl
@@ -1,5 +1,19 @@
-using Oceananigans.TurbulenceClosures: _compute_ri_based_diffusivities!, FlavorOfRBVD
-import Oceananigans.TurbulenceClosures: compute_ri_based_diffusivities!
+using Oceananigans.TurbulenceClosures: _compute_ri_based_diffusivities!, Riᶜᶜᶠ, FlavorOfRBVD
+import Oceananigans.TurbulenceClosures: compute_ri_based_diffusivities!, compute_ri_number!
+
+
+@kernel function compute_ri_number!(diffusivities, offs, grid::ActiveCellsIBG, closure::FlavorOfRBVD,
+    velocities, tracers, buoyancy, tracer_bcs, clock)
+
+    idx = @index(Global, Linear)
+    
+    i′, j′, k′ = active_linear_index_to_interior_tuple(idx, grid)
+    i = i′ + offs[1] 
+    j = j′ + offs[2] 
+    k = k′ + offs[3]
+
+    @inbounds diffusivities.Ri[i, j, k] = Riᶜᶜᶠ(i, j, k, grid, velocities, bouyancy, tracers)
+end
 
 @kernel function compute_ri_based_diffusivities!(diffusivities, offs, grid::ActiveCellsIBG, closure::FlavorOfRBVD,
                                                  velocities, tracers, buoyancy, tracer_bcs, clock)
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index f6ab62b3d3..8d22543bf8 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -107,9 +107,10 @@ with_tracers(tracers, closure::FlavorOfRBVD) = closure
 
 # Note: computing diffusivities at cell centers for now.
 function DiffusivityFields(grid, tracer_names, bcs, closure::FlavorOfRBVD)
-    κ = Field((Center, Center, Face), grid)
-    ν = Field((Center, Center, Face), grid)
-    return (; κ, ν)
+    κ  = Field((Center, Center, Face), grid)
+    ν  = Field((Center, Center, Face), grid)
+    Ri = Field((Center, Center, Face), grid)
+    return (; κ, ν, Ri)
 end
 
 function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; kernel_size = κ_kernel_size(model.grid), kernel_offsets = κ_kernel_offsets(model.grid))
@@ -123,6 +124,19 @@ function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; k
 
     only_active_cells = use_only_active_interior_cells(grid)
 
+    launch!(arch, grid, kernel_size,
+            compute_ri_number!,
+            diffusivities,
+            kernel_offsets,
+            grid,
+            closure,
+            velocities,
+            tracers,
+            buoyancy,
+            top_tracer_bcs,
+            clock;
+            only_active_cells)
+
     launch!(arch, grid, kernel_size,
             compute_ri_based_diffusivities!,
             diffusivities,
@@ -162,6 +176,18 @@ const Tanh   = HyperbolicTangentRiDependentTapering
     return ifelse(N² <= 0, zero(grid), Ri)
 end
 
+@kernel function compute_ri_number!(diffusivities, offs, grid, closure::FlavorOfRBVD,
+    velocities, tracers, buoyancy, tracer_bcs, clock)
+
+    i′, j′, k′ = @index(Global, NTuple)
+
+    i = i′ + offs[1] 
+    j = j′ + offs[2] 
+    k = k′ + offs[3]
+
+    @inbounds diffusivities.Ri[i, j, k] = Riᶜᶜᶠ(i, j, k, grid, velocities, bouyancy, tracers)
+end
+
 @kernel function compute_ri_based_diffusivities!(diffusivities, offs, grid, closure::FlavorOfRBVD,
                                                 velocities, tracers, buoyancy, tracer_bcs, clock)
 
@@ -204,7 +230,7 @@ end
     κᵉ = ifelse(entraining, Cᵉ, zero(grid))
 
     # Shear mixing diffusivity and viscosity
-    Ri = ℑxyᶜᶜᵃ(i, j, k, grid, ℑxyᶠᶠᵃ, Riᶜᶜᶠ, velocities, buoyancy, tracers)
+    Ri = ℑxyᶜᶜᵃ(i, j, k, grid, ℑxyᶠᶠᵃ, diffusivities.Ri)
 
     τ = taper(tapering, Ri, Ri₀, Riᵟ)
     κ★ = κ₀ * τ

From d76752fb9e64ac7e3503b9dc95fc16f664ddb4de Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 6 Apr 2023 19:54:52 -0400
Subject: [PATCH 137/530] bugfix

---
 .../ri_based_vertical_diffusivity.jl                          | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index 8d22543bf8..f6315d9e85 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -177,7 +177,7 @@ const Tanh   = HyperbolicTangentRiDependentTapering
 end
 
 @kernel function compute_ri_number!(diffusivities, offs, grid, closure::FlavorOfRBVD,
-    velocities, tracers, buoyancy, tracer_bcs, clock)
+                                    velocities, tracers, buoyancy, tracer_bcs, clock)
 
     i′, j′, k′ = @index(Global, NTuple)
 
@@ -185,7 +185,7 @@ end
     j = j′ + offs[2] 
     k = k′ + offs[3]
 
-    @inbounds diffusivities.Ri[i, j, k] = Riᶜᶜᶠ(i, j, k, grid, velocities, bouyancy, tracers)
+    @inbounds diffusivities.Ri[i, j, k] = Riᶜᶜᶠ(i, j, k, grid, velocities, buoyancy, tracers)
 end
 
 @kernel function compute_ri_based_diffusivities!(diffusivities, offs, grid, closure::FlavorOfRBVD,

From 126c19ffc69072e35524f26744c5fec305181e06 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 6 Apr 2023 20:54:15 -0400
Subject: [PATCH 138/530] bugfix

---
 .../ri_based_vertical_diffusivity.jl                            | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index f6315d9e85..0d85d1522c 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -165,7 +165,7 @@ const Tanh   = HyperbolicTangentRiDependentTapering
 @inline taper(::Exp,    x::T, x₀, δ) where T = exp(- max(zero(T), (x - x₀) / δ))
 @inline taper(::Tanh,   x::T, x₀, δ) where T = (one(T) - tanh((x - x₀) / δ)) / 2
 
-@inline function Riᶜᶜᶠ(i, j, k, grid, velocities, bouyancy, tracers)
+@inline function Riᶜᶜᶠ(i, j, k, grid, velocities, buoyancy, tracers)
     ∂z_u² = ℑxᶜᵃᵃ(i, j, k, grid, ∂zᶠᶜᶠ, velocities.u)^2
     ∂z_v² = ℑyᵃᶜᵃ(i, j, k, grid, ∂zᶜᶠᶠ, velocities.v)^2
     N² = ∂z_b(i, j, k, grid, buoyancy, tracers)

From 5a4c0713fb41df6375777d0e03b89b1df492bece Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 6 Apr 2023 20:54:47 -0400
Subject: [PATCH 139/530] bugfix

---
 src/ImmersedBoundaries/additional_diffusivity_kernels.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ImmersedBoundaries/additional_diffusivity_kernels.jl b/src/ImmersedBoundaries/additional_diffusivity_kernels.jl
index f807403476..32f7ee6c25 100644
--- a/src/ImmersedBoundaries/additional_diffusivity_kernels.jl
+++ b/src/ImmersedBoundaries/additional_diffusivity_kernels.jl
@@ -12,7 +12,7 @@ import Oceananigans.TurbulenceClosures: compute_ri_based_diffusivities!, compute
     j = j′ + offs[2] 
     k = k′ + offs[3]
 
-    @inbounds diffusivities.Ri[i, j, k] = Riᶜᶜᶠ(i, j, k, grid, velocities, bouyancy, tracers)
+    @inbounds diffusivities.Ri[i, j, k] = Riᶜᶜᶠ(i, j, k, grid, velocities, buoyancy, tracers)
 end
 
 @kernel function compute_ri_based_diffusivities!(diffusivities, offs, grid::ActiveCellsIBG, closure::FlavorOfRBVD,

From c6c8db6622bdc7e70921011b182ad35123d9cb94 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 6 Apr 2023 23:33:48 -0400
Subject: [PATCH 140/530] small bugfix

---
 src/Advection/weno_reconstruction.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Advection/weno_reconstruction.jl b/src/Advection/weno_reconstruction.jl
index 41d81a0de9..08bc05c2af 100644
--- a/src/Advection/weno_reconstruction.jl
+++ b/src/Advection/weno_reconstruction.jl
@@ -114,7 +114,7 @@ function WENO(FT::DataType=Float64;
 
     if order < 3
         # WENO(order = 1) is equivalent to UpwindBiased(order = 1)
-        return UpwindBiased(order = 1)
+        return UpwindBiased(FT; order = 1)
     else
         N  = Int((order + 1) ÷ 2)
 

From 5dfd9711d406a443da30fc0e8efd7bddb1f45a6e Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 7 Apr 2023 00:21:49 -0400
Subject: [PATCH 141/530] fix float32

---
 src/Advection/centered_reconstruction.jl      |  4 ++--
 src/Advection/reconstruction_coefficients.jl  | 18 +++++++++---------
 src/Advection/upwind_biased_reconstruction.jl |  4 ++--
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/Advection/centered_reconstruction.jl b/src/Advection/centered_reconstruction.jl
index 5c74d5b909..9ea5147323 100644
--- a/src/Advection/centered_reconstruction.jl
+++ b/src/Advection/centered_reconstruction.jl
@@ -100,8 +100,8 @@ for (dir, ξ, val) in zip((:xᶠᵃᵃ, :yᵃᶠᵃ, :zᵃᵃᶠ), (:x, :y, :z),
 
     for buffer in [1, 2, 3, 4, 5, 6]
         @eval begin
-            @inline $stencil(i, j, k, grid, scheme::Centered{$buffer}, ψ, idx, loc, args...)           = @inbounds sum($(reconstruction_stencil(buffer, :symm, ξ, false)) .* retrieve_coeff(scheme, Val($val), idx, loc))
-            @inline $stencil(i, j, k, grid, scheme::Centered{$buffer}, ψ::Function, idx, loc, args...) = @inbounds sum($(reconstruction_stencil(buffer, :symm, ξ,  true)) .* retrieve_coeff(scheme, Val($val), idx, loc))
+            @inline $stencil(i, j, k, grid, scheme::Centered{$buffer, FT}, ψ, idx, loc, args...)           where FT = @inbounds sum($(reconstruction_stencil(buffer, :symm, ξ, false)) .* retrieve_coeff(scheme, Val($val), idx, loc))
+            @inline $stencil(i, j, k, grid, scheme::Centered{$buffer, FT}, ψ::Function, idx, loc, args...) where FT = @inbounds sum($(reconstruction_stencil(buffer, :symm, ξ,  true)) .* retrieve_coeff(scheme, Val($val), idx, loc))
         end
     end
 end
diff --git a/src/Advection/reconstruction_coefficients.jl b/src/Advection/reconstruction_coefficients.jl
index 1c6c58cd03..6c06cddae0 100644
--- a/src/Advection/reconstruction_coefficients.jl
+++ b/src/Advection/reconstruction_coefficients.jl
@@ -123,13 +123,13 @@ Examples
 julia> using Oceananigans.Advection: calc_reconstruction_stencil
 
 julia> calc_reconstruction_stencil(1, :right, :x)
-:(+(coeff1_right[1] * ψ[i + 0, j, k]))
+:(+(FT(coeff1_right[1]) * ψ[i + 0, j, k]))
 
 julia> calc_reconstruction_stencil(1, :left, :x)
-:(+(coeff1_left[1] * ψ[i + -1, j, k]))
+:(+(FT(coeff1_left[1]) * ψ[i + -1, j, k]))
 
 julia> calc_reconstruction_stencil(1, :symm, :x)
-:(coeff2_symm[2] * ψ[i + -1, j, k] + coeff2_symm[1] * ψ[i + 0, j, k])
+:(FT(coeff2_symm[2]) * ψ[i + -1, j, k] + coeff2_symm[1] * ψ[i + 0, j, k])
 
 julia> calc_reconstruction_stencil(2, :symm, :x)
 :(coeff4_symm[4] * ψ[i + -2, j, k] + coeff4_symm[3] * ψ[i + -1, j, k] + coeff4_symm[2] * ψ[i + 0, j, k] + coeff4_symm[1] * ψ[i + 1, j, k])
@@ -154,16 +154,16 @@ julia> calc_reconstruction_stencil(3, :left, :x)
         c = n - buffer - 1
         if func
             stencil_full[idx] = dir == :x ? 
-                                :($coeff[$(order - idx + 1)] * ψ(i + $c, j, k, grid, args...)) :
+                                :(FT($coeff[$(order - idx + 1)]) * ψ(i + $c, j, k, grid, args...)) :
                                 dir == :y ?
-                                :($coeff[$(order - idx + 1)] * ψ(i, j + $c, k, grid, args...)) :
-                                :($coeff[$(order - idx + 1)] * ψ(i, j, k + $c, grid, args...))
+                                :(FT($coeff[$(order - idx + 1)]) * ψ(i, j + $c, k, grid, args...)) :
+                                :(FT($coeff[$(order - idx + 1)]) * ψ(i, j, k + $c, grid, args...))
         else
             stencil_full[idx] =  dir == :x ? 
-                                :($coeff[$(order - idx + 1)] * ψ[i + $c, j, k]) :
+                                :(FT($coeff[$(order - idx + 1)]) * ψ[i + $c, j, k]) :
                                 dir == :y ?
-                                :($coeff[$(order - idx + 1)] * ψ[i, j + $c, k]) :
-                                :($coeff[$(order - idx + 1)] * ψ[i, j, k + $c])
+                                :(FT($coeff[$(order - idx + 1)]) * ψ[i, j + $c, k]) :
+                                :(FT($coeff[$(order - idx + 1)]) * ψ[i, j, k + $c])
         end
     end
     return Expr(:call, :+, stencil_full...)
diff --git a/src/Advection/upwind_biased_reconstruction.jl b/src/Advection/upwind_biased_reconstruction.jl
index 9c19d91024..843c3670fd 100644
--- a/src/Advection/upwind_biased_reconstruction.jl
+++ b/src/Advection/upwind_biased_reconstruction.jl
@@ -124,8 +124,8 @@ for (sd, side) in enumerate((:left, :right)), (dir, ξ, val) in zip((:xᶠᵃᵃ
 
     for buffer in [1, 2, 3, 4, 5, 6]
         @eval begin
-            @inline $stencil(i, j, k, grid, scheme::UpwindBiased{$buffer}, ψ, idx, loc, args...)           = @inbounds sum($(reconstruction_stencil(buffer, side, ξ, false)) .* retrieve_coeff(scheme, Val($sd), Val($val), idx, loc))
-            @inline $stencil(i, j, k, grid, scheme::UpwindBiased{$buffer}, ψ::Function, idx, loc, args...) = @inbounds sum($(reconstruction_stencil(buffer, side, ξ,  true)) .* retrieve_coeff(scheme, Val($sd), Val($val), idx, loc))
+            @inline $stencil(i, j, k, grid, scheme::UpwindBiased{$buffer, FT}, ψ, idx, loc, args...)           where FT = @inbounds sum($(reconstruction_stencil(buffer, side, ξ, false)) .* retrieve_coeff(scheme, Val($sd), Val($val), idx, loc))
+            @inline $stencil(i, j, k, grid, scheme::UpwindBiased{$buffer, FT}, ψ::Function, idx, loc, args...) where FT = @inbounds sum($(reconstruction_stencil(buffer, side, ξ,  true)) .* retrieve_coeff(scheme, Val($sd), Val($val), idx, loc))
         end
     end
 end

From c3406528c70a7d2bf268ea8fa93933322c6de3ba Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 7 Apr 2023 00:58:32 -0400
Subject: [PATCH 142/530] more fixing

---
 src/Advection/reconstruction_coefficients.jl | 6 +++---
 src/TimeSteppers/quasi_adams_bashforth_2.jl  | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/Advection/reconstruction_coefficients.jl b/src/Advection/reconstruction_coefficients.jl
index 6c06cddae0..e385954d1a 100644
--- a/src/Advection/reconstruction_coefficients.jl
+++ b/src/Advection/reconstruction_coefficients.jl
@@ -129,13 +129,13 @@ julia> calc_reconstruction_stencil(1, :left, :x)
 :(+(FT(coeff1_left[1]) * ψ[i + -1, j, k]))
 
 julia> calc_reconstruction_stencil(1, :symm, :x)
-:(FT(coeff2_symm[2]) * ψ[i + -1, j, k] + coeff2_symm[1] * ψ[i + 0, j, k])
+:(FT(coeff2_symm[2]) * ψ[i + -1, j, k] + FT(coeff2_symm[1]) * ψ[i + 0, j, k])
 
 julia> calc_reconstruction_stencil(2, :symm, :x)
-:(coeff4_symm[4] * ψ[i + -2, j, k] + coeff4_symm[3] * ψ[i + -1, j, k] + coeff4_symm[2] * ψ[i + 0, j, k] + coeff4_symm[1] * ψ[i + 1, j, k])
+:(FT(coeff4_symm[4]) * ψ[i + -2, j, k] + FT(coeff4_symm[3]) * ψ[i + -1, j, k] + FT(coeff4_symm[2]) * ψ[i + 0, j, k] + FT(coeff4_symm[1]) * ψ[i + 1, j, k])
 
 julia> calc_reconstruction_stencil(3, :left, :x)
-:(coeff5_left[5] * ψ[i + -3, j, k] + coeff5_left[4] * ψ[i + -2, j, k] + coeff5_left[3] * ψ[i + -1, j, k] + coeff5_left[2] * ψ[i + 0, j, k] + coeff5_left[1] * ψ[i + 1, j, k])
+:(FT(coeff5_left[5]) * ψ[i + -3, j, k] + FT(coeff5_left[4]) * ψ[i + -2, j, k] + FT(coeff5_left[3]) * ψ[i + -1, j, k] + FT(coeff5_left[2]) * ψ[i + 0, j, k] + FT(coeff5_left[1]) * ψ[i + 1, j, k])
 ```
 """
 @inline function calc_reconstruction_stencil(buffer, shift, dir, func::Bool = false)
diff --git a/src/TimeSteppers/quasi_adams_bashforth_2.jl b/src/TimeSteppers/quasi_adams_bashforth_2.jl
index 248da07f64..1880f91a10 100644
--- a/src/TimeSteppers/quasi_adams_bashforth_2.jl
+++ b/src/TimeSteppers/quasi_adams_bashforth_2.jl
@@ -47,6 +47,7 @@ function QuasiAdamsBashforth2TimeStepper(grid, tracers,
 
     FT = eltype(grid)
     GT = typeof(Gⁿ)
+    χ  = FT(χ)
 
     return QuasiAdamsBashforth2TimeStepper{FT, GT, IT}(χ, Inf, Gⁿ, G⁻, implicit_solver)
 end

From aec8245d9d21d8fba11f91b879cddb151f750522 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 7 Apr 2023 08:59:34 -0400
Subject: [PATCH 143/530] bugfix

---
 src/ImmersedBoundaries/ImmersedBoundaries.jl  |  1 -
 .../additional_diffusivity_kernels.jl         | 30 -------------------
 src/TimeSteppers/quasi_adams_bashforth_2.jl   |  2 +-
 .../ri_based_vertical_diffusivity.jl          |  8 ++---
 4 files changed, 3 insertions(+), 38 deletions(-)
 delete mode 100644 src/ImmersedBoundaries/additional_diffusivity_kernels.jl

diff --git a/src/ImmersedBoundaries/ImmersedBoundaries.jl b/src/ImmersedBoundaries/ImmersedBoundaries.jl
index aa761a1b35..9f2ce94f16 100644
--- a/src/ImmersedBoundaries/ImmersedBoundaries.jl
+++ b/src/ImmersedBoundaries/ImmersedBoundaries.jl
@@ -291,6 +291,5 @@ include("immersed_boundary_condition.jl")
 include("conditional_derivatives.jl")
 include("mask_immersed_field.jl")
 include("immersed_reductions.jl")
-include("additional_diffusivity_kernels.jl")
 
 end # module
diff --git a/src/ImmersedBoundaries/additional_diffusivity_kernels.jl b/src/ImmersedBoundaries/additional_diffusivity_kernels.jl
deleted file mode 100644
index 32f7ee6c25..0000000000
--- a/src/ImmersedBoundaries/additional_diffusivity_kernels.jl
+++ /dev/null
@@ -1,30 +0,0 @@
-using Oceananigans.TurbulenceClosures: _compute_ri_based_diffusivities!, Riᶜᶜᶠ, FlavorOfRBVD
-import Oceananigans.TurbulenceClosures: compute_ri_based_diffusivities!, compute_ri_number!
-
-
-@kernel function compute_ri_number!(diffusivities, offs, grid::ActiveCellsIBG, closure::FlavorOfRBVD,
-    velocities, tracers, buoyancy, tracer_bcs, clock)
-
-    idx = @index(Global, Linear)
-    
-    i′, j′, k′ = active_linear_index_to_interior_tuple(idx, grid)
-    i = i′ + offs[1] 
-    j = j′ + offs[2] 
-    k = k′ + offs[3]
-
-    @inbounds diffusivities.Ri[i, j, k] = Riᶜᶜᶠ(i, j, k, grid, velocities, buoyancy, tracers)
-end
-
-@kernel function compute_ri_based_diffusivities!(diffusivities, offs, grid::ActiveCellsIBG, closure::FlavorOfRBVD,
-                                                 velocities, tracers, buoyancy, tracer_bcs, clock)
-
-    idx = @index(Global, Linear)
-
-    i′, j′, k′ = active_linear_index_to_interior_tuple(idx, grid)
-    i = i′ + offs[1] 
-    j = j′ + offs[2] 
-    k = k′ + offs[3]
-
-    _compute_ri_based_diffusivities!(i, j, k, diffusivities, grid, closure,
-                                     velocities, tracers, buoyancy, tracer_bcs, clock)
-end
\ No newline at end of file
diff --git a/src/TimeSteppers/quasi_adams_bashforth_2.jl b/src/TimeSteppers/quasi_adams_bashforth_2.jl
index 1880f91a10..e32113d121 100644
--- a/src/TimeSteppers/quasi_adams_bashforth_2.jl
+++ b/src/TimeSteppers/quasi_adams_bashforth_2.jl
@@ -151,7 +151,7 @@ Time step velocity fields via the 2nd-order quasi Adams-Bashforth method
 
     T = eltype(u)
     one_point_five = convert(T, 1.5)
-    oh_point_five = convert(T, 0.5)
+    oh_point_five  = convert(T, 0.5)
 
     @inbounds u[i, j, k] += Δt * ((one_point_five + χ) * Gⁿ[i, j, k] - (oh_point_five + χ) * G⁻[i, j, k])
 end
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index 0d85d1522c..03372c39f5 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -122,8 +122,6 @@ function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; k
     velocities = model.velocities
     top_tracer_bcs = NamedTuple(c => tracers[c].boundary_conditions.top for c in propertynames(tracers))
 
-    only_active_cells = use_only_active_interior_cells(grid)
-
     launch!(arch, grid, kernel_size,
             compute_ri_number!,
             diffusivities,
@@ -134,8 +132,7 @@ function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; k
             tracers,
             buoyancy,
             top_tracer_bcs,
-            clock;
-            only_active_cells)
+            clock)
 
     launch!(arch, grid, kernel_size,
             compute_ri_based_diffusivities!,
@@ -147,8 +144,7 @@ function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; k
             tracers,
             buoyancy,
             top_tracer_bcs,
-            clock;
-            only_active_cells)
+            clock)
 
     return nothing
 end

From 8bfa22d0bc24b0a9f2d56076b837056c9c8c1b38 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 7 Apr 2023 09:37:09 -0400
Subject: [PATCH 144/530] why is this not working?

---
 src/TimeSteppers/quasi_adams_bashforth_2.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/TimeSteppers/quasi_adams_bashforth_2.jl b/src/TimeSteppers/quasi_adams_bashforth_2.jl
index e32113d121..a82958a24d 100644
--- a/src/TimeSteppers/quasi_adams_bashforth_2.jl
+++ b/src/TimeSteppers/quasi_adams_bashforth_2.jl
@@ -149,9 +149,9 @@ Time step velocity fields via the 2nd-order quasi Adams-Bashforth method
 @kernel function ab2_step_field!(u, Δt, χ, Gⁿ, G⁻)
     i, j, k = @index(Global, NTuple)
 
-    T = eltype(u)
-    one_point_five = convert(T, 1.5)
-    oh_point_five  = convert(T, 0.5)
+    FT = eltype(χ)
+    one_point_five = FT(1.5)
+    oh_point_five  = FT(0.5)
 
     @inbounds u[i, j, k] += Δt * ((one_point_five + χ) * Gⁿ[i, j, k] - (oh_point_five + χ) * G⁻[i, j, k])
 end

From 79064a786d60959577f6288331e8e574aaf69d21 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 7 Apr 2023 09:44:25 -0400
Subject: [PATCH 145/530] bugfix

---
 src/Simulations/run.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Simulations/run.jl b/src/Simulations/run.jl
index 7b5e427f9c..eb48bf2fc5 100644
--- a/src/Simulations/run.jl
+++ b/src/Simulations/run.jl
@@ -41,7 +41,7 @@ end
 Return a time step 'aligned' with `sim.stop_time`, output writer schedules, 
 and callback schedules. Alignment with `sim.stop_time` takes precedence.
 """
-function aligned_time_step(sim::Simulation, Δt)
+function aligned_time_step(sim::Simulation, Δt::FT)
     clock = sim.model.clock
 
     aligned_Δt = Δt
@@ -55,7 +55,7 @@ function aligned_time_step(sim::Simulation, Δt)
     # Temporary fix for https://github.com/CliMA/Oceananigans.jl/issues/1280
     aligned_Δt = aligned_Δt <= 0 ? Δt : aligned_Δt
 
-    return aligned_Δt
+    return FT(aligned_Δt)
 end
 
 """

From 3a56aa148560f7f991fb204f9e374c537665ea2d Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 7 Apr 2023 09:47:02 -0400
Subject: [PATCH 146/530] bugfix

---
 src/Simulations/run.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/Simulations/run.jl b/src/Simulations/run.jl
index eb48bf2fc5..a0ea502da0 100644
--- a/src/Simulations/run.jl
+++ b/src/Simulations/run.jl
@@ -41,9 +41,10 @@ end
 Return a time step 'aligned' with `sim.stop_time`, output writer schedules, 
 and callback schedules. Alignment with `sim.stop_time` takes precedence.
 """
-function aligned_time_step(sim::Simulation, Δt::FT)
+function aligned_time_step(sim::Simulation, Δt)
     clock = sim.model.clock
 
+    FT = eltype(Δt)
     aligned_Δt = Δt
 
     # Align time step with output writing and callback execution

From 8642f75bf4d02cef4d4713b826e2864aa1cd7488 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sat, 8 Apr 2023 18:08:57 -0400
Subject: [PATCH 147/530] final configuration

---
 src/ImmersedBoundaries/active_cells_map.jl    | 23 ++++++++++++++++++-
 .../ri_based_vertical_diffusivity.jl          |  8 +++++--
 src/Utils/kernel_launching.jl                 |  2 +-
 3 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/src/ImmersedBoundaries/active_cells_map.jl b/src/ImmersedBoundaries/active_cells_map.jl
index 9dbef8d9d0..d0a2035632 100644
--- a/src/ImmersedBoundaries/active_cells_map.jl
+++ b/src/ImmersedBoundaries/active_cells_map.jl
@@ -116,4 +116,25 @@ function active_cells_map_surface(ibg)
     smaller_indices = getproperty.(full_indices, Ref(:I)) .|> Tuple{IntType, IntType}
     
     return smaller_indices
-end
\ No newline at end of file
+end
+
+using Ocenanaigans.TurbulenceClosures: Riᶜᶜᶠ, _compute_ri_based_diffusivities!
+import Oceananigans.TurbulenceClosures: compute_ri_number!, compute_ri_based_diffusivities!
+
+@kernel function compute_ri_number!(diffusivities, offs, grid::ActiveCellsIBG, closure::FlavorOfRBVD,
+    velocities, tracers, buoyancy, tracer_bcs, clock)
+    idx = @index(Global, Linear)
+    i, j, k = active_linear_index_to_interior_tuple(idx, grid)
+
+    @inbounds diffusivities.Ri[i, j, k] = Riᶜᶜᶠ(i, j, k, grid, velocities, buoyancy, tracers)
+end
+
+@kernel function compute_ri_based_diffusivities!(diffusivities, offs, grid::ActiveCellsIBG, closure::FlavorOfRBVD,
+                velocities, tracers, buoyancy, tracer_bcs, clock)
+
+    idx = @index(Global, Linear)
+    i, j, k = active_linear_index_to_interior_tuple(idx, grid)
+            
+    _compute_ri_based_diffusivities!(i, j, k, diffusivities, grid, closure,
+     velocities, tracers, buoyancy, tracer_bcs, clock)
+end
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index 03372c39f5..0d85d1522c 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -122,6 +122,8 @@ function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; k
     velocities = model.velocities
     top_tracer_bcs = NamedTuple(c => tracers[c].boundary_conditions.top for c in propertynames(tracers))
 
+    only_active_cells = use_only_active_interior_cells(grid)
+
     launch!(arch, grid, kernel_size,
             compute_ri_number!,
             diffusivities,
@@ -132,7 +134,8 @@ function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; k
             tracers,
             buoyancy,
             top_tracer_bcs,
-            clock)
+            clock;
+            only_active_cells)
 
     launch!(arch, grid, kernel_size,
             compute_ri_based_diffusivities!,
@@ -144,7 +147,8 @@ function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; k
             tracers,
             buoyancy,
             top_tracer_bcs,
-            clock)
+            clock;
+            only_active_cells)
 
     return nothing
 end
diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index cb899fed98..b40b1328ec 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -66,7 +66,7 @@ end
 
 @inline active_cells_work_layout(size, only_active_cells, grid) = heuristic_workgroup(size...), size
 @inline use_only_active_interior_cells(grid) = nothing
-\
+
 """
     launch!(arch, grid, layout, kernel!, args...; kwargs...)
 

From 398eea496b1753ea9cb0ef0fee7871a87480c485 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 9 Apr 2023 14:32:20 -0400
Subject: [PATCH 148/530] bugfix

---
 src/ImmersedBoundaries/active_cells_map.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ImmersedBoundaries/active_cells_map.jl b/src/ImmersedBoundaries/active_cells_map.jl
index d0a2035632..00abf0430f 100644
--- a/src/ImmersedBoundaries/active_cells_map.jl
+++ b/src/ImmersedBoundaries/active_cells_map.jl
@@ -118,7 +118,7 @@ function active_cells_map_surface(ibg)
     return smaller_indices
 end
 
-using Ocenanaigans.TurbulenceClosures: Riᶜᶜᶠ, _compute_ri_based_diffusivities!
+using Oceananigans.TurbulenceClosures: Riᶜᶜᶠ, _compute_ri_based_diffusivities!
 import Oceananigans.TurbulenceClosures: compute_ri_number!, compute_ri_based_diffusivities!
 
 @kernel function compute_ri_number!(diffusivities, offs, grid::ActiveCellsIBG, closure::FlavorOfRBVD,

From 9303de5eb9fbaf9466bce55bdfe16c1de2720d05 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 9 Apr 2023 14:35:49 -0400
Subject: [PATCH 149/530] bugfix

---
 src/ImmersedBoundaries/active_cells_map.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ImmersedBoundaries/active_cells_map.jl b/src/ImmersedBoundaries/active_cells_map.jl
index 00abf0430f..00496ab8d5 100644
--- a/src/ImmersedBoundaries/active_cells_map.jl
+++ b/src/ImmersedBoundaries/active_cells_map.jl
@@ -118,7 +118,7 @@ function active_cells_map_surface(ibg)
     return smaller_indices
 end
 
-using Oceananigans.TurbulenceClosures: Riᶜᶜᶠ, _compute_ri_based_diffusivities!
+using Oceananigans.TurbulenceClosures: Riᶜᶜᶠ, _compute_ri_based_diffusivities!, FlavorOfRBVD
 import Oceananigans.TurbulenceClosures: compute_ri_number!, compute_ri_based_diffusivities!
 
 @kernel function compute_ri_number!(diffusivities, offs, grid::ActiveCellsIBG, closure::FlavorOfRBVD,

From 7ba909a02afc6fd1e7ffe74c59ee4f252ce6debf Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 9 Apr 2023 20:31:36 -0400
Subject: [PATCH 150/530] fill the corners!!

---
 src/Distributed/halo_communication.jl         | 110 ++++++++---
 src/Distributed/multi_architectures.jl        |  41 ++--
 src/Fields/field_boundary_buffers.jl          | 183 ++++++++++++++----
 ...distributed_split_explicit_free_surface.jl |   6 +-
 4 files changed, 254 insertions(+), 86 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index d50859b754..10666b72be 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -10,6 +10,10 @@ using Oceananigans.Fields: fill_west_and_east_send_buffers!,
                            fill_east_send_buffers!,
                            fill_south_send_buffers!,
                            fill_north_send_buffers!,
+                           fill_southwest_send_buffers!,
+                           fill_southeast_send_buffers!,
+                           fill_northwest_send_buffers!,
+                           fill_northeast_send_buffers!,
                            recv_from_buffers!, 
                            reduced_dimensions, 
                            instantiated_location
@@ -38,22 +42,26 @@ import Oceananigans.BoundaryConditions:
 ##### MPI tags for halo communication BCs
 #####
 
-sides  = (:west, :east, :south, :north, :top, :bottom)
-side_id = Dict(side => n for (n, side) in enumerate(sides))
+sides  = (:west, :east, :south, :north, :top, :bottom, :southwest, :southeast, :northwest, :northeast)
+side_id = Dict(side => n-1 for (n, side) in enumerate(sides))
 
 opposite_side = Dict(
     :west => :east, :east => :west,
     :south => :north, :north => :south,
-    :bottom => :top, :top => :bottom
+    :bottom => :top, :top => :bottom,
+    :southwest => :northeast, 
+    :southeast => :northwest, 
+    :northwest => :southeast, 
+    :northeast => :southwest, 
 )
 
 # Define functions that return unique send and recv MPI tags for each side.
 # It's an integer where
 #   digit 1-2: an identifier for the field that is reset each timestep
-#   digit 3-4: an identifier for the field's location 
-#   digit 3: the side
-#   digits 4-6: the "from" rank
-#   digits 7-9: the "to" rank
+#   digit 3: an identifier for the field's location 
+#   digit 4: the side
+#   digits 5-6: the "from" rank
+#   digits 7-8: the "to" rank
 
 RANK_DIGITS = 2
 ID_DIGITS   = 2
@@ -120,32 +128,78 @@ function fill_halo_regions!(c::OffsetArray, bcs, indices, loc, grid::Distributed
     for task = 1:3
         fill_halo_event!(task, halo_tuple, c, indices, loc, arch, grid, buffers, args...; kwargs...)
     end
-
-    # fill_eventual_corners!(halo_tuple, c, indices, loc, arch, grid, buffers, args...; kwargs...)
+    
+    fill_eventual_corners!(arch.connectivity, c, indices, loc, arch, grid, buffers, args...; kwargs...)
     arch.mpi_tag[1] += 1
 
     return nothing
 end
 
+for (side, dir) in zip([:southwest, :southeast, :northwest, :northeast], [1, 2, 3, 3])
+    fill_corner_halo! = Symbol("fill_$(side)_halo!")
+    send_side_halo  = Symbol("send_$(side)_halo")
+    recv_and_fill_side_halo! = Symbol("recv_and_fill_$(side)_halo!")
+    fill_side_send_buffers! = Symbol("fill_$(side)_send_buffers!")    
+
+    @eval begin
+        $fill_corner_halo!(::Nothing, args...; kwargs...) = nothing
+
+        function $fill_corner_halo!(corner, c, indices, loc, arch, grid, buffers, args...; kwargs...) 
+            child_arch = child_architecture(arch)
+            local_rank = arch.local_rank
+
+            recv_req = $recv_and_fill_side_halo!(c, grid, arch, loc[$dir], loc, local_rank, corner, buffers)
+            $fill_side_send_buffers!(c, buffers, grid)
+            sync_device!(child_arch)
+
+            send_req = $send_side_halo(c, grid, arch, loc[$dir], loc, local_rank, corner, buffers)
+            
+            return [send_req, recv_req]
+        end
+    end
+end
+
 # If more than one direction is communicating we need to repeat one fill halo to fill the freaking corners!
-function fill_eventual_corners!(halo_tuple, c, indices, loc, arch, grid, buffers, args...; kwargs...)
-    hbc_left  = filter(bc -> bc isa DCBC, halo_tuple[2])
-    hbc_right = filter(bc -> bc isa DCBC, halo_tuple[3])
-
-    # 2D/3D Parallelization when `length(hbc_left) > 1 || length(hbc_right) > 1`
-    if length(hbc_left) > 1 
-        idx = findfirst(bc -> bc isa DCBC, halo_tuple[2])
-        fill_halo_event!(idx, halo_tuple, c, indices, loc, arch, grid, buffers, args...; kwargs...)
+function fill_eventual_corners!(connectivity, c, indices, loc, arch, grid, buffers, args...; blocking = true, kwargs...)
+    
+    requests = []
+
+    reqsw = fill_southwest_halo!(connectivity.southwest, c, indices, loc, arch, grid, buffers, args...; kwargs...)
+    reqse = fill_southeast_halo!(connectivity.southeast, c, indices, loc, arch, grid, buffers, args...; kwargs...)
+    reqnw = fill_northwest_halo!(connectivity.northwest, c, indices, loc, arch, grid, buffers, args...; kwargs...)
+    reqne = fill_northeast_halo!(connectivity.northeast, c, indices, loc, arch, grid, buffers, args...; kwargs...)
+
+    !isnothing(reqsw) && push!(requests, reqsw...)
+    !isnothing(reqse) && push!(requests, reqse...)
+    !isnothing(reqnw) && push!(requests, reqnw...)
+    !isnothing(reqne) && push!(requests, reqne...)
+
+    if isempty(requests)
         return nothing
     end
 
-    if length(hbc_right) > 1 
-        idx = findfirst(bc -> bc isa DCBC, halo_tuple[3])
-        fill_halo_event!(idx, halo_tuple, c, indices, loc, arch, grid, buffers, args...; kwargs...)
+    if !blocking && !(arch isa BlockingDistributedArch)
+        push!(arch.mpi_requests, requests...)
         return nothing
     end
+
+    requests = requests |> Array{MPI.Request}
+
+    # Syncronous MPI fill_halo_event!
+    cooperative_waitall!(requests)
+
+    # Reset MPI tag
+    arch.mpi_tag[1] -= arch.mpi_tag[1]
+    recv_from_buffers!(c, buffers, grid, Val(:corners))    
+
+    return nothing
 end
 
+@inline mpi_communication_side(::Val{fill_southwest_halo!}) = :southwest
+@inline mpi_communication_side(::Val{fill_southeast_halo!}) = :southeast
+@inline mpi_communication_side(::Val{fill_northwest_halo!}) = :northwest
+@inline mpi_communication_side(::Val{fill_northeast_halo!}) = :northeast
+
 @inline mpi_communication_side(::Val{fill_west_and_east_halo!})   = :west_and_east
 @inline mpi_communication_side(::Val{fill_south_and_north_halo!}) = :south_and_north
 @inline mpi_communication_side(::Val{fill_bottom_and_top_halo!})  = :bottom_and_top
@@ -174,6 +228,7 @@ function cooperative_waitall!(tasks::Array{Task})
     end
 end
 
+cooperative_wait(req::MPI.Request) = MPI.Waitall(req)
 cooperative_waitall!(req::Array{MPI.Request}) = MPI.Waitall(req)
 
 function fill_halo_event!(task, halo_tuple, c, indices, loc, arch::DistributedArch, grid::DistributedGrid, buffers, args...; blocking = true, kwargs...)
@@ -305,16 +360,13 @@ for side in sides
 
             @debug "Sending " * $side_str * " halo: local_rank=$local_rank, rank_to_send_to=$rank_to_send_to, send_tag=$send_tag"
             
-            # send_event = Threads.@spawn begin
-                send_req = MPI.Isend(send_buffer, rank_to_send_to, send_tag, arch.communicator)
-                # cooperative_test!(send_req)
-            # end
+            send_req = MPI.Isend(send_buffer, rank_to_send_to, send_tag, arch.communicator)
 
             return send_req
         end
 
         @inline $get_side_send_buffer(c, grid, side_location, buffers, ::ViewsDistributedArch) = $underlying_side_boundary(c, grid, side_location)
-        @inline $get_side_send_buffer(c, grid, side_location, buffers, arch)             = buffers.$side.send     
+        @inline $get_side_send_buffer(c, grid, side_location, buffers, arch)                   = buffers.$side.send     
     end
 end
 
@@ -337,16 +389,10 @@ for side in sides
             @debug "Receiving " * $side_str * " halo: local_rank=$local_rank, rank_to_recv_from=$rank_to_recv_from, recv_tag=$recv_tag"
             recv_req = MPI.Irecv!(recv_buffer, rank_to_recv_from, recv_tag, arch.communicator)
 
-            # recv_event = Threads.@spawn begin
-            #     priority!(device(arch), :high)
-            #     cooperative_test!(recv_req)
-            #     sync_device!(arch)
-            # end
-
             return recv_req
         end
 
         @inline $get_side_recv_buffer(c, grid, side_location, buffers, ::ViewsDistributedArch) = $underlying_side_halo(c, grid, side_location)
-        @inline $get_side_recv_buffer(c, grid, side_location, buffers, arch)             = buffers.$side.recv
+        @inline $get_side_recv_buffer(c, grid, side_location, buffers, arch)                   = buffers.$side.recv
     end
 end
\ No newline at end of file
diff --git a/src/Distributed/multi_architectures.jl b/src/Distributed/multi_architectures.jl
index 2d1f8e2332..006707f119 100644
--- a/src/Distributed/multi_architectures.jl
+++ b/src/Distributed/multi_architectures.jl
@@ -147,17 +147,21 @@ end
 ##### Rank connectivity graph
 #####
 
-struct RankConnectivity{E, W, N, S, T, B}
-      east :: E
-      west :: W
-     north :: N
-     south :: S
-       top :: T
-    bottom :: B
+struct RankConnectivity{E, W, N, S, T, B, SW, SE, NW, NE}
+         east :: E
+         west :: W
+        north :: N
+        south :: S
+          top :: T
+       bottom :: B
+    southwest :: SW
+    southeast :: SE
+    northwest :: NW
+    northeast :: NE
 end
 
-RankConnectivity(; east, west, north, south, top, bottom) =
-    RankConnectivity(east, west, north, south, top, bottom)
+RankConnectivity(; east, west, north, south, top, bottom, southwest, southeast, northwest, northeast) =
+    RankConnectivity(east, west, north, south, top, bottom, southwest, southeast, northwest, northeast)
 
 # The "Periodic" topologies are `Periodic`, `FullyConnected` and `RightConnected`
 # The "Bounded" topologies are `Bounded` and `LeftConnected`
@@ -207,8 +211,18 @@ function RankConnectivity(model_index, ranks, topology)
     r_top   = isnothing(k_top)   ? nothing : index2rank(i, j, k_top, Rx, Ry, Rz)
     r_bot   = isnothing(k_bot)   ? nothing : index2rank(i, j, k_bot, Rx, Ry, Rz)
 
+    r_northeast = isnothing(i_east) || isnothing(j_north) ? nothing : index2rank(i_east, j_north, k, Rx, Ry, Rz)
+
+    r_northwest = isnothing(i_west) || isnothing(j_north) ? nothing : index2rank(i_west, j_north, k, Rx, Ry, Rz)
+    r_southeast = isnothing(i_east) || isnothing(j_south) ? nothing : index2rank(i_east, j_south, k, Rx, Ry, Rz)
+    r_southwest = isnothing(i_west) || isnothing(j_south) ? nothing : index2rank(i_west, j_south, k, Rx, Ry, Rz)
+
     return RankConnectivity(east=r_east, west=r_west, north=r_north,
-                            south=r_south, top=r_top, bottom=r_bot)
+                            south=r_south, top=r_top, bottom=r_bot,
+                            southwest=r_southwest,
+                            southeast=r_southeast,
+                            northwest=r_northwest,
+                            northeast=r_northeast)
 end
 
 #####
@@ -225,5 +239,10 @@ function Base.show(io::IO, arch::DistributedArch)
               isnothing(c.north) ? "" : " north=$(c.north)",
               isnothing(c.south) ? "" : " south=$(c.south)",
               isnothing(c.top) ? "" : " top=$(c.top)",
-              isnothing(c.bottom) ? "" : " bottom=$(c.bottom)")
+              isnothing(c.bottom) ? "" : " bottom=$(c.bottom)",
+              isnothing(c.southwest) ? "" : " southwest=$(c.southwest)",
+              isnothing(c.southeast) ? "" : " southeast=$(c.southeast)",
+              isnothing(c.northwest) ? "" : " northwest=$(c.northwest)",
+              isnothing(c.northeast) ? "" : " northeast=$(c.northeast)")
 end
+              
diff --git a/src/Fields/field_boundary_buffers.jl b/src/Fields/field_boundary_buffers.jl
index 4d6f53b9ab..8cf8e45682 100644
--- a/src/Fields/field_boundary_buffers.jl
+++ b/src/Fields/field_boundary_buffers.jl
@@ -1,18 +1,22 @@
 using Oceananigans.BoundaryConditions: MCBC, DCBC
 using Oceananigans.Architectures: arch_array
-using Oceananigans.Grids: halo_size
+using Oceananigans.Grids: halo_size, size
 using Oceananigans.Utils: launch!
 using KernelAbstractions: @kernel, @index
 using KernelAbstractions.Extras.LoopInfo: @unroll
 
-struct FieldBoundaryBuffers{W, E, S, N}
+struct FieldBoundaryBuffers{W, E, S, N, SW, SE, NW, NE}
     west :: W
     east :: E
    south :: S
    north :: N
+   southwest :: SW
+   southeast :: SE
+   northwest :: NW
+   northeast :: NE
 end
 
-FieldBoundaryBuffers() = FieldBoundaryBuffers(nothing, nothing, nothing, nothing)
+FieldBoundaryBuffers() = FieldBoundaryBuffers(nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing)
 FieldBoundaryBuffers(grid, data, ::Missing) = nothing
 FieldBoundaryBuffers(grid, data, ::Nothing) = nothing
 
@@ -20,42 +24,70 @@ function FieldBoundaryBuffers(grid, data, boundary_conditions)
 
     Hx, Hy, Hz = halo_size(grid)
 
-    west  = create_buffer_x(architecture(grid), data, Hx, boundary_conditions.west)
-    east  = create_buffer_x(architecture(grid), data, Hx, boundary_conditions.east)
-    south = create_buffer_y(architecture(grid), data, Hy, boundary_conditions.south)
-    north = create_buffer_y(architecture(grid), data, Hy, boundary_conditions.north)
+    arch = architecture(grid)
+
+    west  = create_buffer_x(architecture(grid), grid, data, Hx, boundary_conditions.west)
+    east  = create_buffer_x(architecture(grid), grid, data, Hx, boundary_conditions.east)
+    south = create_buffer_y(architecture(grid), grid, data, Hy, boundary_conditions.south)
+    north = create_buffer_y(architecture(grid), grid, data, Hy, boundary_conditions.north)
+
+    if hasproperty(arch, :connectivity)
+        sw = create_buffer_corner(arch, grid, data, Hx, Hy, arch.connectivity.southwest)
+        se = create_buffer_corner(arch, grid, data, Hx, Hy, arch.connectivity.southeast)
+        nw = create_buffer_corner(arch, grid, data, Hx, Hy, arch.connectivity.northwest)
+        ne = create_buffer_corner(arch, grid, data, Hx, Hy, arch.connectivity.northeast)
+    else
+        sw = nothing
+        se = nothing
+        nw = nothing
+        ne = nothing
+    end
 
-    return FieldBoundaryBuffers(west, east, south, north)
+    return FieldBoundaryBuffers(west, east, south, north, sw, se, nw, ne)
 end
 
-create_buffer_x(arch, data, H, bc) = nothing
-create_buffer_y(arch, data, H, bc) = nothing
+create_buffer_x(arch, grid, data, H, bc) = nothing
+create_buffer_y(arch, grid, data, H, bc) = nothing
+
+create_buffer_corner(arch, grid, data, Hx, Hy, ::Nothing) = nothing
+
+function create_buffer_corner(arch, grid, data, Hx, Hy, side)
+    if !using_buffered_communication(arch)
+        return nothing
+    end
+    return (send = arch_array(arch, zeros(eltype(data), Hx, Hy, size(parent(data), 3))), 
+            recv = arch_array(arch, zeros(eltype(data), Hx, Hy, size(parent(data), 3))))    
+end
 
 using_buffered_communication(arch) = true
 
 const PassingBC = Union{MCBC, DCBC}
 
-function create_buffer_x(arch, data, H, ::PassingBC) 
+function create_buffer_x(arch, grid, data, H, ::PassingBC) 
     if !using_buffered_communication(arch)
         return nothing
     end
-    return (send = arch_array(arch, zeros(eltype(data), H, size(parent(data), 2), size(parent(data), 3))), 
-            recv = arch_array(arch, zeros(eltype(data), H, size(parent(data), 2), size(parent(data), 3))))    
+    return (send = arch_array(arch, zeros(eltype(data), H, size(grid, 2), size(parent(data), 3))), 
+            recv = arch_array(arch, zeros(eltype(data), H, size(grid, 2), size(parent(data), 3))))    
 end
 
-function create_buffer_y(arch, data, H, ::PassingBC)
+function create_buffer_y(arch, grid, data, H, ::PassingBC)
     if !using_buffered_communication(arch)
         return nothing
     end
-    return (send = arch_array(arch, zeros(eltype(data), size(parent(data), 1), H, size(parent(data), 3))), 
-            recv = arch_array(arch, zeros(eltype(data), size(parent(data), 1), H, size(parent(data), 3))))
+    return (send = arch_array(arch, zeros(eltype(data), size(grid, 1), H, size(parent(data), 3))), 
+            recv = arch_array(arch, zeros(eltype(data), size(grid, 1), H, size(parent(data), 3))))
 end
 
 Adapt.adapt_structure(to, buff::FieldBoundaryBuffers) =
     FieldBoundaryBuffers(Adapt.adapt(to, buff.west), 
                          Adapt.adapt(to, buff.east),    
                          Adapt.adapt(to, buff.north), 
-                         Adapt.adapt(to, buff.south))
+                         Adapt.adapt(to, buff.south), 
+                         Adapt.adapt(to, buff.southwest), 
+                         Adapt.adapt(to, buff.southeast), 
+                         Adapt.adapt(to, buff.northwest), 
+                         Adapt.adapt(to, buff.northeast))
 
 """
     fill_send_buffers(c, buffers, arch)
@@ -67,29 +99,41 @@ function fill_west_and_east_send_buffers!(c::OffsetArray, buffers::FieldBoundary
     Hx, Hy, _ = halo_size(grid)
     Nx, Ny, _ = size(grid)
 
-    _fill_west_send_buffer!(parent(c), buffers.west, Hx, Nx)
-    _fill_east_send_buffer!(parent(c), buffers.east, Hx, Nx)
+    _fill_west_send_buffer!(parent(c), buffers.west, Hx, Hy, Nx, Ny)
+    _fill_east_send_buffer!(parent(c), buffers.east, Hx, Hy, Nx, Ny)
 end
 
 function fill_south_and_north_send_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid)
     Hx, Hy, _ = halo_size(grid)
     Nx, Ny, _ = size(grid)
 
-    _fill_south_send_buffer!(parent(c), buffers.south, Hy, Ny)
-    _fill_north_send_buffer!(parent(c), buffers.north, Hy, Ny)
+    _fill_south_send_buffer!(parent(c), buffers.south, Hx, Hy, Nx, Ny)
+    _fill_north_send_buffer!(parent(c), buffers.north, Hx, Hy, Nx, Ny)
 end
 
 fill_west_send_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid) = 
-    _fill_west_send_buffer!(parent(c), buffers.west, halo_size(grid)[1], size(grid)[1])
+    _fill_west_send_buffer!(parent(c), buffers.west, halo_size(grid)[1], halo_size(grid)[2], size(grid, 1), size(grid, 2))
 
 fill_east_send_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid) = 
-    _fill_east_send_buffer!(parent(c), buffers.east, halo_size(grid)[1], size(grid)[1])
+    _fill_east_send_buffer!(parent(c), buffers.east, halo_size(grid)[1], halo_size(grid)[2], size(grid, 1), size(grid, 2))
 
 fill_south_send_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid) = 
-    _fill_south_send_buffer!(parent(c), buffers.south, halo_size(grid)[2], size(grid)[2])
+    _fill_south_send_buffer!(parent(c), buffers.south, halo_size(grid)[1], halo_size(grid)[2], size(grid, 1), size(grid, 2))
 
 fill_north_send_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid) = 
-    _fill_north_send_buffer!(parent(c), buffers.north, halo_size(grid)[2], size(grid)[2])
+    _fill_north_send_buffer!(parent(c), buffers.north,halo_size(grid)[1], halo_size(grid)[2], size(grid, 1), size(grid, 2))
+
+fill_southwest_send_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid) = 
+    _fill_southwest_send_buffer!(parent(c), buffers.southwest, halo_size(grid)[1], halo_size(grid)[2], size(grid, 1), size(grid, 2))
+
+fill_southeast_send_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid) = 
+    _fill_southeast_send_buffer!(parent(c), buffers.southeast, halo_size(grid)[1], halo_size(grid)[2], size(grid, 1), size(grid, 2))
+
+fill_northwest_send_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid) = 
+    _fill_northwest_send_buffer!(parent(c), buffers.northwest, halo_size(grid)[1], halo_size(grid)[2], size(grid, 1), size(grid, 2))
+
+fill_northeast_send_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid) = 
+    _fill_northeast_send_buffer!(parent(c), buffers.northeast, halo_size(grid)[1], halo_size(grid)[2], size(grid, 1), size(grid, 2))
 
 """
     recv_from_buffers(c, buffers, arch)
@@ -101,46 +145,103 @@ function recv_from_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid)
     Hx, Hy, _ = halo_size(grid)
     Nx, Ny, _ = size(grid)
 
-     _recv_from_west_buffer!(parent(c), buffers.west,  Hx, Nx)
-     _recv_from_east_buffer!(parent(c), buffers.east,  Hx, Nx)
-    _recv_from_south_buffer!(parent(c), buffers.south, Hy, Ny)
-    _recv_from_north_buffer!(parent(c), buffers.north, Hy, Ny)
+     _recv_from_west_buffer!(parent(c), buffers.west,  Hx, Hy, Nx, Ny)
+     _recv_from_east_buffer!(parent(c), buffers.east,  Hx, Hy, Nx, Ny)
+    _recv_from_south_buffer!(parent(c), buffers.south, Hx, Hy, Nx, Ny)
+    _recv_from_north_buffer!(parent(c), buffers.north, Hx, Hy, Nx, Ny)
+   
+   _recv_from_southwest_buffer!(parent(c), buffers.southwest, Hx, Hy, Nx, Ny)
+   _recv_from_southeast_buffer!(parent(c), buffers.southeast, Hx, Hy, Nx, Ny)
+   _recv_from_northwest_buffer!(parent(c), buffers.northwest, Hx, Hy, Nx, Ny)
+   _recv_from_northeast_buffer!(parent(c), buffers.northeast, Hx, Hy, Nx, Ny)
+
+   return nothing
+end
+
+function recv_from_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid, ::Val{:corners})
+    Hx, Hy, _ = halo_size(grid)
+    Nx, Ny, _ = size(grid)
+
+   _recv_from_southwest_buffer!(parent(c), buffers.southwest, Hx, Hy, Nx, Ny)
+   _recv_from_southeast_buffer!(parent(c), buffers.southeast, Hx, Hy, Nx, Ny)
+   _recv_from_northwest_buffer!(parent(c), buffers.northwest, Hx, Hy, Nx, Ny)
+   _recv_from_northeast_buffer!(parent(c), buffers.northeast, Hx, Hy, Nx, Ny)
+
+   return nothing
 end
 
 function recv_from_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid, ::Val{:west_and_east})
     Hx, Hy, _ = halo_size(grid)
     Nx, Ny, _ = size(grid)
 
-    _recv_from_west_buffer!(parent(c), buffers.west, Hx, Nx)
-    _recv_from_east_buffer!(parent(c), buffers.east, Hx, Nx)
+    _recv_from_west_buffer!(parent(c), buffers.west, Hx, Hy, Nx, Ny)
+    _recv_from_east_buffer!(parent(c), buffers.east, Hx, Hy, Nx, Ny)
+
+    return nothing
 end
 
 function recv_from_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid, ::Val{:south_and_north})
     Hx, Hy, _ = halo_size(grid)
     Nx, Ny, _ = size(grid)
 
-   _recv_from_south_buffer!(parent(c), buffers.south, Hy, Ny)
-   _recv_from_north_buffer!(parent(c), buffers.north, Hy, Ny)
+   _recv_from_south_buffer!(parent(c), buffers.south, Hx, Hy, Nx, Ny)
+   _recv_from_north_buffer!(parent(c), buffers.north, Hx, Hy, Nx, Ny)
+
+   return nothing
 end
 
 recv_from_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid, ::Val{:bottom_and_top}) = nothing
 
+# Single-corner receives. Like every other `recv_from_buffers!` method, these hand
+# `parent(c)` to the kernel: the `_recv_from_*_buffer!` ranges (e.g. `1:Hx`) are
+# written for the parent array, not for the offset-indexed `c`.
+recv_from_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid, ::Val{:southwest}) =
+        _recv_from_southwest_buffer!(parent(c), buffers.southwest, halo_size(grid)[1], halo_size(grid)[2], size(grid, 1), size(grid, 2))
+recv_from_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid, ::Val{:southeast}) =
+        _recv_from_southeast_buffer!(parent(c), buffers.southeast, halo_size(grid)[1], halo_size(grid)[2], size(grid, 1), size(grid, 2))
+recv_from_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid, ::Val{:northwest}) =
+        _recv_from_northwest_buffer!(parent(c), buffers.northwest, halo_size(grid)[1], halo_size(grid)[2], size(grid, 1), size(grid, 2))
+recv_from_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid, ::Val{:northeast}) =
+        _recv_from_northeast_buffer!(parent(c), buffers.northeast, halo_size(grid)[1], halo_size(grid)[2], size(grid, 1), size(grid, 2))
+
  _fill_west_send_buffer!(c, ::Nothing, args...) = nothing
  _fill_east_send_buffer!(c, ::Nothing, args...) = nothing
 _fill_north_send_buffer!(c, ::Nothing, args...) = nothing
 _fill_south_send_buffer!(c, ::Nothing, args...) = nothing
 
+_fill_southwest_send_buffer!(c, ::Nothing, args...) = nothing
+_fill_southeast_send_buffer!(c, ::Nothing, args...) = nothing
+_fill_northwest_send_buffer!(c, ::Nothing, args...) = nothing
+_fill_northeast_send_buffer!(c, ::Nothing, args...) = nothing
+
  _recv_from_west_buffer!(c, ::Nothing, args...) = nothing
  _recv_from_east_buffer!(c, ::Nothing, args...) = nothing
 _recv_from_north_buffer!(c, ::Nothing, args...) = nothing
 _recv_from_south_buffer!(c, ::Nothing, args...) = nothing
 
- _fill_west_send_buffer!(c, buff, H, N) = buff.send .= view(c, 1+H:2H,  :, :)
- _fill_east_send_buffer!(c, buff, H, N) = buff.send .= view(c, 1+N:N+H, :, :)
-_fill_south_send_buffer!(c, buff, H, N) = buff.send .= view(c, :, 1+H:2H,  :)
-_fill_north_send_buffer!(c, buff, H, N) = buff.send .= view(c, :, 1+N:N+H, :)
+_recv_from_southwest_buffer!(c, ::Nothing, args...) = nothing
+_recv_from_southeast_buffer!(c, ::Nothing, args...) = nothing
+_recv_from_northwest_buffer!(c, ::Nothing, args...) = nothing
+_recv_from_northeast_buffer!(c, ::Nothing, args...) = nothing
+
+ _fill_west_send_buffer!(c, buff, Hx, Hy, Nx, Ny) = buff.send .= view(c, 1+Hx:2Hx,   1+Hy:Ny+Hy, :)
+ _fill_east_send_buffer!(c, buff, Hx, Hy, Nx, Ny) = buff.send .= view(c, 1+Nx:Nx+Hx, 1+Hy:Ny+Hy, :)
+_fill_south_send_buffer!(c, buff, Hx, Hy, Nx, Ny) = buff.send .= view(c, 1+Hx:Nx+Hx, 1+Hy:2Hy,  :)
+_fill_north_send_buffer!(c, buff, Hx, Hy, Nx, Ny) = buff.send .= view(c, 1+Hx:Nx+Hx, 1+Ny:Ny+Hy, :)
+
+ _recv_from_west_buffer!(c, buff, Hx, Hy, Nx, Ny) = view(c, 1:Hx,           1+Hy:Ny+Hy,     :) .= buff.recv
+ _recv_from_east_buffer!(c, buff, Hx, Hy, Nx, Ny) = view(c, 1+Nx+Hx:Nx+2Hx, 1+Hy:Ny+Hy,     :) .= buff.recv
+_recv_from_south_buffer!(c, buff, Hx, Hy, Nx, Ny) = view(c, 1+Hx:Nx+Hx,     1:Hy,           :) .= buff.recv
+_recv_from_north_buffer!(c, buff, Hx, Hy, Nx, Ny) = view(c, 1+Hx:Nx+Hx,     1+Ny+Hy:Ny+2Hy, :) .= buff.recv
+
+_fill_southwest_send_buffer!(c, buff, Hx, Hy, Nx, Ny) = buff.send .= view(c, 1+Hx:2Hx,   1+Hy:2Hy,   :)
+_fill_southeast_send_buffer!(c, buff, Hx, Hy, Nx, Ny) = buff.send .= view(c, 1+Nx:Nx+Hx, 1+Hy:2Hy,   :)
+_fill_northwest_send_buffer!(c, buff, Hx, Hy, Nx, Ny) = buff.send .= view(c, 1+Hx:2Hx,   1+Ny:Ny+Hy, :)
+_fill_northeast_send_buffer!(c, buff, Hx, Hy, Nx, Ny) = buff.send .= view(c, 1+Nx:Nx+Hx, 1+Ny:Ny+Hy, :)
+
+_recv_from_southwest_buffer!(c, buff, Hx, Hy, Nx, Ny) = view(c, 1:Hx,           1:Hy,           :) .= buff.recv
+_recv_from_southeast_buffer!(c, buff, Hx, Hy, Nx, Ny) = view(c, 1+Nx+Hx:Nx+2Hx, 1:Hy,           :) .= buff.recv
+_recv_from_northwest_buffer!(c, buff, Hx, Hy, Nx, Ny) = view(c, 1:Hx,           1+Ny+Hy:Ny+2Hy, :) .= buff.recv
+_recv_from_northeast_buffer!(c, buff, Hx, Hy, Nx, Ny) = view(c, 1+Nx+Hx:Nx+2Hx, 1+Ny+Hy:Ny+2Hy, :) .= buff.recv
+
 
- _recv_from_west_buffer!(c, buff, H, N) = view(c, 1:H,        :, :) .= buff.recv
- _recv_from_east_buffer!(c, buff, H, N) = view(c, 1+N+H:N+2H, :, :) .= buff.recv
-_recv_from_south_buffer!(c, buff, H, N) = view(c, :, 1:H,        :) .= buff.recv
-_recv_from_north_buffer!(c, buff, H, N) = view(c, :, 1+N+H:N+2H, :) .= buff.recv
diff --git a/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl b/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl
index 7f795b86d5..123b2c0c23 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl
@@ -23,6 +23,8 @@ function SplitExplicitAuxiliaryFields(grid::DistributedGrid)
     # In a non-parallel grid we calculate only the interior
     kernel_size    = augmented_kernel_size(grid)
     kernel_offsets = augmented_kernel_offsets(grid)
+
+    @show kernel_size, kernel_offsets
     
     return SplitExplicitAuxiliaryFields(Gᵁ, Gⱽ, Hᶠᶜ, Hᶜᶠ, Hᶜᶜ, kernel_size, kernel_offsets)
 end
@@ -42,7 +44,7 @@ end
     Rx, Ry, _ = architecture(grid).ranks
 
     Ax = Rx == 1 ? Nx : (Tx == RightConnected || Tx == LeftConnected ? Nx + Hx - 1 : Nx + 2Hx - 2)
-    Ay = Ry == 1 ? Ny : (Ty == RightConnected || Ty == LeftConnected ? Ny + Hy - 1 : Nx + 2Hy - 2)
+    Ay = Ry == 1 ? Ny : (Ty == RightConnected || Ty == LeftConnected ? Ny + Hy - 1 : Ny + 2Hy - 2)
 
     return (Ax, Ay)
 end
@@ -67,7 +69,7 @@ function FreeSurface(free_surface::SplitExplicitFreeSurface, velocities, grid::D
 
         new_halos = split_explicit_halos(old_halos, settings.substeps+1, grid)         
         new_grid  = with_halo(new_halos, grid)
-    
+
         η = ZFaceField(new_grid, indices = (:, :, size(new_grid, 3)+1))
 
         return SplitExplicitFreeSurface(η,

From 004c21be3ad024821eece3bf5194dd59d881f620 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 10 Apr 2023 18:04:05 -0400
Subject: [PATCH 151/530] remove active Ri

---
 src/ImmersedBoundaries/active_cells_map.jl | 30 +++++++++++-----------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/ImmersedBoundaries/active_cells_map.jl b/src/ImmersedBoundaries/active_cells_map.jl
index 00496ab8d5..1686e304da 100644
--- a/src/ImmersedBoundaries/active_cells_map.jl
+++ b/src/ImmersedBoundaries/active_cells_map.jl
@@ -118,23 +118,23 @@ function active_cells_map_surface(ibg)
     return smaller_indices
 end
 
-using Oceananigans.TurbulenceClosures: Riᶜᶜᶠ, _compute_ri_based_diffusivities!, FlavorOfRBVD
-import Oceananigans.TurbulenceClosures: compute_ri_number!, compute_ri_based_diffusivities!
+# using Oceananigans.TurbulenceClosures: Riᶜᶜᶠ, _compute_ri_based_diffusivities!, FlavorOfRBVD
+# import Oceananigans.TurbulenceClosures: compute_ri_number!, compute_ri_based_diffusivities!
 
-@kernel function compute_ri_number!(diffusivities, offs, grid::ActiveCellsIBG, closure::FlavorOfRBVD,
-    velocities, tracers, buoyancy, tracer_bcs, clock)
-    idx = @index(Global, Linear)
-    i, j, k = active_linear_index_to_interior_tuple(idx, grid)
+# @kernel function compute_ri_number!(diffusivities, offs, grid::ActiveCellsIBG, closure::FlavorOfRBVD,
+#     velocities, tracers, buoyancy, tracer_bcs, clock)
+#     idx = @index(Global, Linear)
+#     i, j, k = active_linear_index_to_interior_tuple(idx, grid)
 
-    @inbounds diffusivities.Ri[i, j, k] = Riᶜᶜᶠ(i, j, k, grid, velocities, buoyancy, tracers)
-end
+#     @inbounds diffusivities.Ri[i, j, k] = Riᶜᶜᶠ(i, j, k, grid, velocities, buoyancy, tracers)
+# end
 
-@kernel function compute_ri_based_diffusivities!(diffusivities, offs, grid::ActiveCellsIBG, closure::FlavorOfRBVD,
-                velocities, tracers, buoyancy, tracer_bcs, clock)
+# @kernel function compute_ri_based_diffusivities!(diffusivities, offs, grid::ActiveCellsIBG, closure::FlavorOfRBVD,
+#                 velocities, tracers, buoyancy, tracer_bcs, clock)
 
-    idx = @index(Global, Linear)
-    i, j, k = active_linear_index_to_interior_tuple(idx, grid)
+#     idx = @index(Global, Linear)
+#     i, j, k = active_linear_index_to_interior_tuple(idx, grid)
             
-    _compute_ri_based_diffusivities!(i, j, k, diffusivities, grid, closure,
-     velocities, tracers, buoyancy, tracer_bcs, clock)
-end
+#     _compute_ri_based_diffusivities!(i, j, k, diffusivities, grid, closure,
+#      velocities, tracers, buoyancy, tracer_bcs, clock)
+# end

From b0aa904831c84e036ae830749c48471e881f3c12 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 10 Apr 2023 19:13:52 -0400
Subject: [PATCH 152/530] bugfix

---
 .../ri_based_vertical_diffusivity.jl                      | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index 0d85d1522c..03372c39f5 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -122,8 +122,6 @@ function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; k
     velocities = model.velocities
     top_tracer_bcs = NamedTuple(c => tracers[c].boundary_conditions.top for c in propertynames(tracers))
 
-    only_active_cells = use_only_active_interior_cells(grid)
-
     launch!(arch, grid, kernel_size,
             compute_ri_number!,
             diffusivities,
@@ -134,8 +132,7 @@ function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; k
             tracers,
             buoyancy,
             top_tracer_bcs,
-            clock;
-            only_active_cells)
+            clock)
 
     launch!(arch, grid, kernel_size,
             compute_ri_based_diffusivities!,
@@ -147,8 +144,7 @@ function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; k
             tracers,
             buoyancy,
             top_tracer_bcs,
-            clock;
-            only_active_cells)
+            clock)
 
     return nothing
 end

From 07647bd3c8a95e36470d6a6b541fb7e5797316a5 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 11 Apr 2023 23:10:27 -0400
Subject: [PATCH 153/530] weights were not correct

---
 .../split_explicit_free_surface.jl                     | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl
index eb8511bf75..76dd8b832a 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl
@@ -67,7 +67,7 @@ end
 function SplitExplicitFreeSurface(grid; gravitational_acceleration = g_Earth,
                                         settings = SplitExplicitSettings(eltype(grid); substeps = 200))
 
-η = ZFaceField(grid, indices = (:, :, size(grid, 3)+1))
+    η = ZFaceField(grid, indices = (:, :, size(grid, 3)+1))
 
     return SplitExplicitFreeSurface(η,
                                     SplitExplicitState(grid),
@@ -264,9 +264,9 @@ function SplitExplicitSettings(FT::DataType=Float64;
     mass_flux_weights ./= sum(mass_flux_weights)
 
     return SplitExplicitSettings(substeps,
-                                 averaging_weights,
-                                 mass_flux_weights, 
-                                 Δτ, 
+                                 FT.(averaging_weights),
+                                 FT.(mass_flux_weights), 
+                                 FT(Δτ), 
                                  timestepper)
 end
 
@@ -289,7 +289,7 @@ Base.show(io::IO, sefs::SplitExplicitFreeSurface) = print(io, "$(summary(sefs))\
 function reset!(sefs::SplitExplicitFreeSurface)
     for name in propertynames(sefs.state)
         var = getproperty(sefs.state, name)
-        fill!(var, 0.0)
+        fill!(var, 0)
     end
     fill!(sefs.auxiliary.Gᵁ, 0)
     fill!(sefs.auxiliary.Gⱽ, 0)

From 4fb5cc095970b336d3e3aaca773acd6ddfeb0667 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Tue, 11 Apr 2023 23:13:33 -0400
Subject: [PATCH 154/530] Reorganize immersed boundary implementation + Field
 for bottom_height

---
 src/ImmersedBoundaries/ImmersedBoundaries.jl  |   9 +-
 .../abstract_grid_fitted_boundary.jl          |  54 +++++
 src/ImmersedBoundaries/grid_fitted_bottom.jl  | 112 ++++++++++
 .../grid_fitted_boundary.jl                   |  39 ++++
 .../grid_fitted_immersed_boundaries.jl        | 211 ------------------
 src/ImmersedBoundaries/partial_cell_bottom.jl | 145 ++++++++++++
 .../partial_cell_immersed_boundaries.jl       | 101 ---------
 7 files changed, 356 insertions(+), 315 deletions(-)
 create mode 100644 src/ImmersedBoundaries/abstract_grid_fitted_boundary.jl
 create mode 100644 src/ImmersedBoundaries/grid_fitted_bottom.jl
 create mode 100644 src/ImmersedBoundaries/grid_fitted_boundary.jl
 delete mode 100644 src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl
 create mode 100644 src/ImmersedBoundaries/partial_cell_bottom.jl
 delete mode 100644 src/ImmersedBoundaries/partial_cell_immersed_boundaries.jl

diff --git a/src/ImmersedBoundaries/ImmersedBoundaries.jl b/src/ImmersedBoundaries/ImmersedBoundaries.jl
index 0ac0fadf1b..15d2d8881d 100644
--- a/src/ImmersedBoundaries/ImmersedBoundaries.jl
+++ b/src/ImmersedBoundaries/ImmersedBoundaries.jl
@@ -11,7 +11,7 @@ using Oceananigans.Utils
 using Oceananigans.Architectures
 
 using Oceananigans.TurbulenceClosures: AbstractTurbulenceClosure, time_discretization
-using Oceananigans.Grids: size_summary, inactive_node, peripheral_node
+using Oceananigans.Grids: size_summary, inactive_node, peripheral_node, AbstractGrid
 
 using Oceananigans.TurbulenceClosures:
     viscous_flux_ux,
@@ -291,8 +291,10 @@ end
 
 include("active_cells_map.jl")
 include("immersed_grid_metrics.jl")
-include("grid_fitted_immersed_boundaries.jl")
-include("partial_cell_immersed_boundaries.jl")
+include("abstract_grid_fitted_boundary.jl")
+include("grid_fitted_boundary.jl")
+include("grid_fitted_bottom.jl")
+include("partial_cell_bottom.jl")
 include("conditional_fluxes.jl")
 include("immersed_boundary_condition.jl")
 include("conditional_derivatives.jl")
@@ -300,3 +302,4 @@ include("mask_immersed_field.jl")
 include("immersed_reductions.jl")
 
 end # module
+
diff --git a/src/ImmersedBoundaries/abstract_grid_fitted_boundary.jl b/src/ImmersedBoundaries/abstract_grid_fitted_boundary.jl
new file mode 100644
index 0000000000..e3fe4e7bb6
--- /dev/null
+++ b/src/ImmersedBoundaries/abstract_grid_fitted_boundary.jl
@@ -0,0 +1,54 @@
+abstract type AbstractGridFittedBoundary <: AbstractImmersedBoundary end
+
+const GFIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:AbstractGridFittedBoundary}
+
+#####
+##### Implicit vertical diffusion
+#####
+##### For a center solver we have to check the interface "solidity" at faces k+1 in both the
+##### Upper diagonal and the Lower diagonal 
+##### (because of tridiagonal convention where lower_diagonal on row k is found at k-1)
+##### Same goes for the face solver, where we check at centers k in both Upper and lower diagonal
+#####
+
+@inline immersed_ivd_peripheral_node(i, j, k, ibg, LX, LY, ::Center) = immersed_peripheral_node(i, j, k+1, ibg, LX, LY, Face())
+@inline immersed_ivd_peripheral_node(i, j, k, ibg, LX, LY, ::Face)   = immersed_peripheral_node(i, j, k,   ibg, LX, LY, Center())
+
+# Extend the upper and lower diagonal functions of the batched tridiagonal solver
+
+for location in (:upper_, :lower_)
+    immersed_func = Symbol(:immersed_ivd_, location, :diagonal)
+    ordinary_func = Symbol(:ivd_ ,         location, :diagonal)
+    @eval begin
+        # Disambiguation
+        @inline $ordinary_func(i, j, k, ibg::GFIBG, closure, K, id, ℓx, ℓy, ℓz::Face, clock, Δt, κz) =
+                $immersed_func(i, j, k, ibg::GFIBG, closure, K, id, ℓx, ℓy, ℓz, clock, Δt, κz)
+
+        @inline $ordinary_func(i, j, k, ibg::GFIBG, closure, K, id, ℓx, ℓy, ℓz::Center, clock, Δt, κz) =
+                $immersed_func(i, j, k, ibg::GFIBG, closure, K, id, ℓx, ℓy, ℓz, clock, Δt, κz)
+
+        @inline function $immersed_func(i, j, k, ibg::GFIBG, closure, K, id, ℓx, ℓy, ℓz, clock, Δt, κz)
+            return ifelse(immersed_ivd_peripheral_node(i, j, k, ibg, ℓx, ℓy, ℓz),
+                          zero(eltype(ibg.underlying_grid)),
+                          $ordinary_func(i, j, k, ibg.underlying_grid, closure, K, id, ℓx, ℓy, ℓz, clock, Δt, κz))
+        end
+    end
+end
+
+# Support for Flat grids
+# Note that instances of AbstractGridFittedBoundary should define _immersed_cell
+# rather than immersed_cell.
+const AGFB = AbstractGridFittedBoundary
+
+immersed_cell(i, j, k, grid, ib) = _immersed_cell(i, j, k, grid, ib)
+
+@eval begin
+    @inline immersed_cell(i, j, k, grid::AbstractGrid{<:Any, Flat, <:Any, <:Any}, ib::AGFB) = _immersed_cell(1, j, k, grid, ib)
+    @inline immersed_cell(i, j, k, grid::AbstractGrid{<:Any, <:Any, Flat, <:Any}, ib::AGFB) = _immersed_cell(i, 1, k, grid, ib)
+    @inline immersed_cell(i, j, k, grid::AbstractGrid{<:Any, <:Any, <:Any, Flat}, ib::AGFB) = _immersed_cell(i, j, 1, grid, ib)
+    @inline immersed_cell(i, j, k, grid::AbstractGrid{<:Any, Flat, Flat, <:Any},  ib::AGFB) = _immersed_cell(1, 1, k, grid, ib)
+    @inline immersed_cell(i, j, k, grid::AbstractGrid{<:Any, Flat, <:Any, Flat},  ib::AGFB) = _immersed_cell(1, j, 1, grid, ib)
+    @inline immersed_cell(i, j, k, grid::AbstractGrid{<:Any, <:Any, Flat, Flat},  ib::AGFB) = _immersed_cell(i, 1, 1, grid, ib)
+    @inline immersed_cell(i, j, k, grid::AbstractGrid{<:Any, Flat, Flat, Flat},   ib::AGFB) = _immersed_cell(1, 1, 1, grid, ib)
+end
+
diff --git a/src/ImmersedBoundaries/grid_fitted_bottom.jl b/src/ImmersedBoundaries/grid_fitted_bottom.jl
new file mode 100644
index 0000000000..089be473d8
--- /dev/null
+++ b/src/ImmersedBoundaries/grid_fitted_bottom.jl
@@ -0,0 +1,112 @@
+using Adapt
+using CUDA: CuArray
+using OffsetArrays: OffsetArray
+using Oceananigans.Utils: getnamewrapper
+using Oceananigans.Grids: total_size
+using Oceananigans.Fields: fill_halo_regions!
+using Oceananigans.Architectures: arch_array
+using Oceananigans.BoundaryConditions: FBC
+using Printf
+
+import Oceananigans.TurbulenceClosures: ivd_upper_diagonal,
+                                        ivd_lower_diagonal,
+                                        z_bottom
+
+import Oceananigans.TurbulenceClosures: immersed_∂ⱼ_τ₁ⱼ,
+                                        immersed_∂ⱼ_τ₂ⱼ,
+                                        immersed_∂ⱼ_τ₃ⱼ,
+                                        immersed_∇_dot_qᶜ
+
+#####
+##### GridFittedBottom (2.5D immersed boundary with modified bottom height)
+#####
+
+abstract type AbstractGridFittedBottom{H} <: AbstractGridFittedBoundary end
+
+# To enable comparison with PartialCellBottom in the limiting case that
+# fractional cell height is 1.0.
+struct CenterImmersedCondition end
+struct InterfaceImmersedCondition end
+
+Base.summary(::CenterImmersedCondition) = "CenterImmersedCondition"
+Base.summary(::InterfaceImmersedCondition) = "InterfaceImmersedCondition"
+
+"""
+    GridFittedBottom(bottom_height, [immersed_condition=CenterImmersedCondition()])
+
+Return an immersed boundary with an irregular bottom fit to the underlying grid.
+"""
+struct GridFittedBottom{H, I} <: AbstractGridFittedBottom{H}
+    bottom_height :: H
+    immersed_condition :: I
+end
+
+const GFBIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:GridFittedBottom}
+
+GridFittedBottom(bottom_height) = GridFittedBottom(bottom_height, CenterImmersedCondition())
+
+function Base.summary(ib::GridFittedBottom)
+    hmax = maximum(ib.bottom_height)
+    hmin = minimum(ib.bottom_height)
+    hmean = mean(ib.bottom_height)
+
+    summary1 = "GridFittedBottom("
+
+    summary2 = string("mean(z)=", prettysummary(hmean),
+                      ", min(z)=", prettysummary(hmin),
+                      ", max(z)=", prettysummary(hmax))
+
+    summary3 = ")"
+
+    return summary1 * summary2 * summary3
+end
+
+Base.summary(ib::GridFittedBottom{<:Function}) = @sprintf("GridFittedBottom(%s)", ib.bottom_height)
+
+function Base.show(io::IO, ib::GridFittedBottom)
+    print(io, summary(ib), '\n')
+    print(io, "├── bottom_height: ", prettysummary(ib.bottom_height), '\n')
+    print(io, "└── immersed_condition: ", summary(ib.immersed_condition))
+end
+
+"""
+    ImmersedBoundaryGrid(grid, ib::GridFittedBottom)
+
+Return a grid with `GridFittedBottom` immersed boundary.
+
+Sets `ib.bottom_height` on a `Field{Center, Center, Nothing}` over `grid` and fills its halo regions.
+"""
+function ImmersedBoundaryGrid(grid, ib::GridFittedBottom)
+    bottom_field = Field{Center, Center, Nothing}(grid)
+    set!(bottom_field, ib.bottom_height)
+    fill_halo_regions!(bottom_field)
+    new_ib = GridFittedBottom(bottom_field, ib.immersed_condition)
+    TX, TY, TZ = topology(grid)
+    return ImmersedBoundaryGrid{TX, TY, TZ}(grid, new_ib)
+end
+
+@inline function _immersed_cell(i, j, k, underlying_grid, ib::GridFittedBottom{<:Any, <:InterfaceImmersedCondition})
+    z = znode(i, j, k+1, underlying_grid, c, c, f)
+    h = @inbounds ib.bottom_height[i, j]
+    return z <= h
+end
+
+@inline function _immersed_cell(i, j, k, underlying_grid, ib::GridFittedBottom{<:Any, <:CenterImmersedCondition})
+    z = znode(i, j, k, underlying_grid, c, c, c)
+    h = @inbounds ib.bottom_height[i, j]
+    return z <= h
+end
+
+@inline z_bottom(i, j, ibg::GFBIBG) = @inbounds ibg.immersed_boundary.bottom_height[i, j]
+on_architecture(arch, ib::GridFittedBottom) = GridFittedBottom(ib.bottom_height, ib.immersed_condition)
+
+function on_architecture(arch, ib::GridFittedBottom{<:Field})
+    architecture(ib.bottom_height) == arch && return ib
+    arch_grid = on_architecture(arch, ib.bottom_height.grid)
+    new_bottom_height = Field{Center, Center, Nothing}(arch_grid)
+    copyto!(parent(new_bottom_height), parent(ib.bottom_height))
+    return GridFittedBottom(new_bottom_height, ib.immersed_condition)
+end
+
+Adapt.adapt_structure(to, ib::GridFittedBottom) = GridFittedBottom(adapt(to, ib.bottom_height), ib.immersed_condition)
+
diff --git a/src/ImmersedBoundaries/grid_fitted_boundary.jl b/src/ImmersedBoundaries/grid_fitted_boundary.jl
new file mode 100644
index 0000000000..c66c8fcafc
--- /dev/null
+++ b/src/ImmersedBoundaries/grid_fitted_boundary.jl
@@ -0,0 +1,39 @@
+using OffsetArrays
+
+struct GridFittedBoundary{M} <: AbstractGridFittedBoundary
+    mask :: M
+end
+
+@inline _immersed_cell(i, j, k, underlying_grid, ib::GridFittedBoundary{<:AbstractArray}) = @inbounds ib.mask[i, j, k]
+
+@inline function _immersed_cell(i, j, k, underlying_grid, ib::GridFittedBoundary)
+    x, y, z = node(i, j, k, underlying_grid, c, c, c)
+    return ib.mask(x, y, z)
+end
+
+function compute_mask(grid, ib)
+    mask_field = Field{Center, Center, Center}(grid, Bool)
+    set!(mask_field, ib.mask)
+    fill_halo_regions!(mask_field)
+    return mask_field
+end
+
+function ImmersedBoundaryGrid(grid, ib::GridFittedBoundary; precompute_mask=true)
+    TX, TY, TZ = topology(grid)
+
+    # TODO: validate ib
+
+    if precompute_mask
+        mask_field = compute_mask(grid, ib)
+        new_ib = GridFittedBoundary(mask_field)
+        return ImmersedBoundaryGrid{TX, TY, TZ}(grid, new_ib)
+    else
+        return ImmersedBoundaryGrid{TX, TY, TZ}(grid, ib)
+    end
+end
+
+on_architecture(arch, ib::GridFittedBoundary{<:Field}) = GridFittedBoundary(compute_mask(on_architecture(arch, ib.mask.grid), ib))
+on_architecture(arch, ib::GridFittedBoundary) = ib # need a workaround...
+
+Adapt.adapt_structure(to, ib::GridFittedBoundary) = GridFittedBoundary(adapt(to, ib.mask)) # restricted to GridFittedBoundary: it is the only boundary type here with a `mask` field
+
diff --git a/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl b/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl
deleted file mode 100644
index 46c0418b14..0000000000
--- a/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl
+++ /dev/null
@@ -1,211 +0,0 @@
-using Adapt
-using CUDA: CuArray
-using OffsetArrays: OffsetArray
-using Oceananigans.Utils: getnamewrapper
-using Oceananigans.Grids: total_size
-using Oceananigans.Fields: fill_halo_regions!
-using Oceananigans.Architectures: arch_array
-using Oceananigans.BoundaryConditions: FBC
-using Printf
-
-import Oceananigans.TurbulenceClosures: ivd_upper_diagonal,
-                                        ivd_lower_diagonal,
-                                        z_bottom
-
-import Oceananigans.TurbulenceClosures: immersed_∂ⱼ_τ₁ⱼ,
-                                        immersed_∂ⱼ_τ₂ⱼ,
-                                        immersed_∂ⱼ_τ₃ⱼ,
-                                        immersed_∇_dot_qᶜ
-
-#####
-##### Some conveniences for grid fitted boundaries
-#####
-
-abstract type AbstractGridFittedBoundary <: AbstractImmersedBoundary end
-
-const GFIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:AbstractGridFittedBoundary}
-
-#####
-##### ImmersedBoundaryGrids require one additional halo to check `inactive_node` for
-##### Faces on the first halo
-#####
-
-#####
-##### GridFittedBottom (2.5D immersed boundary with modified bottom height)
-#####
-
-abstract type AbstractGridFittedBottom{H} <: AbstractGridFittedBoundary end
-
-struct CenterImmersedCondition end
-struct InterfaceImmersedCondition end
-
-"""
-    GridFittedBottom(bottom_height, [immersed_condition=CenterImmersedCondition()])
-
-Return an immersed boundary with an irregular bottom fit to the underlying grid.
-"""
-struct GridFittedBottom{H, I} <: AbstractGridFittedBottom{H}
-    bottom_height :: H
-    immersed_condition :: I
-end
-
-GridFittedBottom(bottom_height) = GridFittedBottom(bottom_height, CenterImmersedCondition())
-
-function Base.summary(ib::GridFittedBottom)
-    hmax = maximum(parent(ib.bottom_height))
-    hmin = minimum(parent(ib.bottom_height))
-    return @sprintf("GridFittedBottom(min(h)=%.2e, max(h)=%.2e)", hmin, hmax)
-end
-
-Base.summary(ib::GridFittedBottom{<:Function}) = @sprintf("GridFittedBottom(%s)", ib.bottom_height)
-
-Base.show(io::IO, ib::GridFittedBottom) = print(io, summary(ib))
-
-"""
-    ImmersedBoundaryGrid(grid, ib::GridFittedBottom)
-
-Return a grid with `GridFittedBottom` immersed boundary.
-
-Computes ib.bottom_height and wraps in an array.
-"""
-function ImmersedBoundaryGrid(grid, ib::AbstractGridFittedBottom)
-    bottom_field = Field((Center, Center, Nothing), grid)
-    set!(bottom_field, ib.bottom_height)
-    fill_halo_regions!(bottom_field)
-    offset_bottom_array = dropdims(bottom_field.data, dims=3)
-
-    # TODO: maybe clean this up
-    new_ib = getnamewrapper(ib)(offset_bottom_array)
-
-    return ImmersedBoundaryGrid(grid, new_ib)
-end
-
-function ImmersedBoundaryGrid(grid, ib::AbstractGridFittedBottom{<:OffsetArray})
-    TX, TY, TZ = topology(grid)
-    validate_ib_size(grid, ib)
-    return ImmersedBoundaryGrid{TX, TY, TZ}(grid, ib)
-end
-
-function validate_ib_size(grid, ib)
-    bottom_height_size = total_size(grid, (Center, Center, Nothing))[1:2]
-
-    size(ib.bottom_height) != bottom_height_size &&
-        throw(ArgumentError("The dimensions of the immersed boundary $(size(ib.bottom_height)) do not match the grid size $(bottom_height_size)"))
-
-    return nothing
-end
-
-@inline function _immersed_cell(i, j, k, underlying_grid, ib::GridFittedBottom{<:Any, <:InterfaceImmersedCondition})
-    z = znode(i, j, k+1, underlying_grid, c, c, f)
-    h = @inbounds ib.bottom_height[i, j]
-    return z <= h
-end
-
-@inline function _immersed_cell(i, j, k, underlying_grid, ib::GridFittedBottom{<:Any, <:CenterImmersedCondition})
-    z = znode(i, j, k, underlying_grid, c, c, c)
-    h = @inbounds ib.bottom_height[i, j]
-    return z <= h
-end
-
-@inline z_bottom(i, j, ibg::GFIBG) = @inbounds ibg.immersed_boundary.bottom_height[i, j]
-
-on_architecture(arch, ib::GridFittedBottom) = GridFittedBottom(arch_array(arch, ib.bottom_height))
-Adapt.adapt_structure(to, ib::GridFittedBottom) = GridFittedBottom(adapt(to, ib.bottom_height))     
-
-#####
-##### Implicit vertical diffusion
-#####
-
-#####
-##### For a center solver we have to check the interface "solidity" at faces k+1 in both the
-##### Upper diagonal and the Lower diagonal 
-##### (because of tridiagonal convention where lower_diagonal on row k is found at k-1)
-##### Same goes for the face solver, where we check at centers k in both Upper and lower diagonal
-#####
-
-@inline immersed_ivd_peripheral_node(i, j, k, ibg, LX, LY, ::Center) = immersed_peripheral_node(i, j, k+1, ibg, LX, LY, Face())
-@inline immersed_ivd_peripheral_node(i, j, k, ibg, LX, LY, ::Face)   = immersed_peripheral_node(i, j, k,   ibg, LX, LY, Center())
-
-# Extend the upper and lower diagonal functions of the batched tridiagonal solver
-
-for location in (:upper_, :lower_)
-    immersed_func = Symbol(:immersed_ivd_, location, :diagonal)
-    ordinary_func = Symbol(:ivd_ ,         location, :diagonal)
-    @eval begin
-        # Disambiguation
-        @inline $ordinary_func(i, j, k, ibg::GFIBG, closure, K, id, ℓx, ℓy, ℓz::Face, clock, Δt, κz) =
-                $immersed_func(i, j, k, ibg::GFIBG, closure, K, id, ℓx, ℓy, ℓz, clock, Δt, κz)
-
-        @inline $ordinary_func(i, j, k, ibg::GFIBG, closure, K, id, ℓx, ℓy, ℓz::Center, clock, Δt, κz) =
-                $immersed_func(i, j, k, ibg::GFIBG, closure, K, id, ℓx, ℓy, ℓz, clock, Δt, κz)
-
-        @inline function $immersed_func(i, j, k, ibg::GFIBG, closure, K, id, ℓx, ℓy, ℓz, clock, Δt, κz)
-            return ifelse(immersed_ivd_peripheral_node(i, j, k, ibg, ℓx, ℓy, ℓz),
-                          zero(eltype(ibg.underlying_grid)),
-                          $ordinary_func(i, j, k, ibg.underlying_grid, closure, K, id, ℓx, ℓy, ℓz, clock, Δt, κz))
-        end
-    end
-end
-
-#####
-##### GridFittedBoundary (experimental 3D immersed boundary)
-#####
-
-struct GridFittedBoundary{M} <: AbstractGridFittedBoundary
-    mask :: M
-end
-
-@inline _immersed_cell(i, j, k, underlying_grid, ib::GridFittedBoundary{<:AbstractArray}) = @inbounds ib.mask[i, j, k]
-
-@inline function _immersed_cell(i, j, k, underlying_grid, ib::GridFittedBoundary)
-    x, y, z = node(i, j, k, underlying_grid, c, c, c)
-    return ib.mask(x, y, z)
-end
-
-function compute_mask(grid, ib)
-    mask_field = Field{Center, Center, Center}(grid, Bool)
-    set!(mask_field, ib.mask)
-    fill_halo_regions!(mask_field)
-    return mask_field
-end
-
-function ImmersedBoundaryGrid(grid, ib::GridFittedBoundary; precompute_mask=true)
-    TX, TY, TZ = topology(grid)
-
-    if precompute_mask
-        mask_field = compute_mask(grid, ib)
-        new_ib = GridFittedBoundary(mask_field)
-        return ImmersedBoundaryGrid{TX, TY, TZ}(grid, new_ib)
-    else
-        return ImmersedBoundaryGrid{TX, TY, TZ}(grid, ib)
-    end
-end
-
-function ImmersedBoundaryGrid(grid, ib::GridFittedBoundary{<:OffsetArray}; kw...)
-    TX, TY, TZ = topology(grid)
-    return ImmersedBoundaryGrid{TX, TY, TZ}(grid, ib)
-end
-
-on_architecture(arch, ib::GridFittedBoundary{<:AbstractArray}) = GridFittedBoundary(arch_array(arch, ib.mask))
-on_architecture(arch, ib::GridFittedBoundary{<:Field}) = GridFittedBoundary(compute_mask(on_architecture(arch, ib.mask.grid), ib))
-on_architecture(arch, ib::GridFittedBoundary) = ib # need a workaround...
-
-Adapt.adapt_structure(to, ib::AbstractGridFittedBoundary) = GridFittedBoundary(adapt(to, ib.mask))
-
-# fallback
-immersed_cell(i, j, k, grid, ib) = _immersed_cell(i, j, k, grid, ib)
-
-# support for Flat grids
-using Oceananigans.Grids: AbstractGrid
-for IB in [:GridFittedBottom, :GridFittedBoundary]
-    @eval begin
-        @inline immersed_cell(i, j, k, grid::AbstractGrid{<:Any, Flat, <:Any, <:Any}, ib::$IB) = _immersed_cell(1, j, k, grid, ib)
-        @inline immersed_cell(i, j, k, grid::AbstractGrid{<:Any, <:Any, Flat, <:Any}, ib::$IB) = _immersed_cell(i, 1, k, grid, ib)
-        @inline immersed_cell(i, j, k, grid::AbstractGrid{<:Any, <:Any, <:Any, Flat}, ib::$IB) = _immersed_cell(i, j, 1, grid, ib)
-        @inline immersed_cell(i, j, k, grid::AbstractGrid{<:Any, Flat, Flat, <:Any},  ib::$IB) = _immersed_cell(1, 1, k, grid, ib)
-        @inline immersed_cell(i, j, k, grid::AbstractGrid{<:Any, Flat, <:Any, Flat},  ib::$IB) = _immersed_cell(1, j, 1, grid, ib)
-        @inline immersed_cell(i, j, k, grid::AbstractGrid{<:Any, <:Any, Flat, Flat},  ib::$IB) = _immersed_cell(i, 1, 1, grid, ib)
-        @inline immersed_cell(i, j, k, grid::AbstractGrid{<:Any, Flat, Flat, Flat},   ib::$IB) = _immersed_cell(1, 1, 1, grid, ib)
-    end
-end
-
diff --git a/src/ImmersedBoundaries/partial_cell_bottom.jl b/src/ImmersedBoundaries/partial_cell_bottom.jl
new file mode 100644
index 0000000000..e30defb198
--- /dev/null
+++ b/src/ImmersedBoundaries/partial_cell_bottom.jl
@@ -0,0 +1,145 @@
+using Oceananigans.Utils: prettysummary
+using Oceananigans.Fields: fill_halo_regions!
+using Oceananigans.Architectures: arch_array
+using Printf
+
+#####
+##### PartialCellBottom
+#####
+
+struct PartialCellBottom{H, E} <: AbstractGridFittedBottom{H}
+    bottom_height :: H
+    minimum_fractional_cell_height :: E
+end
+
+const PCBIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:PartialCellBottom}
+
+function Base.summary(ib::PartialCellBottom)
+    hmax = maximum(ib.bottom_height)
+    hmin = minimum(ib.bottom_height)
+    hmean = mean(ib.bottom_height)
+
+    summary1 = "PartialCellBottom("
+
+    summary2 = string("mean(z)=", prettysummary(hmean),
+                      ", min(z)=", prettysummary(hmin),
+                      ", max(z)=", prettysummary(hmax),
+                      ", ϵ=", prettysummary(ib.minimum_fractional_cell_height))
+
+    summary3 = ")"
+
+    return summary1 * summary2 * summary3
+end
+
+Base.summary(ib::PartialCellBottom{<:Function}) = @sprintf("PartialCellBottom(%s, ϵ=%s)",
+                                                           prettysummary(ib.bottom_height, false),
+                                                           prettysummary(ib.minimum_fractional_cell_height)) # prettysummary output is already text, so format it with %s
+
+function Base.show(io::IO, ib::PartialCellBottom)
+    print(io, summary(ib), '\n')
+    print(io, "├── bottom_height: ", prettysummary(ib.bottom_height), '\n')
+    print(io, "└── minimum_fractional_cell_height: ", prettysummary(ib.minimum_fractional_cell_height))
+end
+
+"""
+    PartialCellBottom(bottom_height; minimum_fractional_cell_height=0.2)
+
+Return `PartialCellBottom` representing an immersed boundary with "partial"
+bottom cells. That is, the height of the bottommost cell in each column is reduced
+to fit the provided `bottom_height`, which may be a `Field`, `Array`, or function
+of `(x, y)`.
+
+The height of partial bottom cells is at least
+
+```
+minimum_fractional_cell_height * Δz,
+```
+
+where `Δz` is the original height of the bottom cell in the underlying grid.
+"""
+function PartialCellBottom(bottom_height; minimum_fractional_cell_height=0.2)
+    return PartialCellBottom(bottom_height, minimum_fractional_cell_height)
+end
+
+function ImmersedBoundaryGrid(grid, ib::PartialCellBottom)
+    bottom_field = Field{Center, Center, Nothing}(grid)
+    set!(bottom_field, ib.bottom_height)
+    fill_halo_regions!(bottom_field)
+    new_ib = PartialCellBottom(bottom_field, ib.minimum_fractional_cell_height)
+    TX, TY, TZ = topology(grid)
+    return ImmersedBoundaryGrid{TX, TY, TZ}(grid, new_ib)
+end
+
+function on_architecture(arch, ib::PartialCellBottom{<:Field})
+    architecture(ib.bottom_height) == arch && return ib
+    arch_grid = on_architecture(arch, ib.bottom_height.grid)
+    new_bottom_height = Field{Center, Center, Nothing}(arch_grid)
+    copyto!(parent(new_bottom_height), parent(ib.bottom_height))
+    return PartialCellBottom(new_bottom_height, ib.minimum_fractional_cell_height)
+end
+
+Adapt.adapt_structure(to, ib::PartialCellBottom) = PartialCellBottom(adapt(to, ib.bottom_height),
+                                                                     ib.minimum_fractional_cell_height)     
+
+"""
+
+        --x--
+          ∘   k+1
+    k+1 --x--    ↑      <- node z
+          ∘   k  | Δz
+      k --x--    ↓
+      
+Criterion implemented below is h >= z (no ϵ Δz offset is applied; NOTE(review): confirm intent)
+
+"""
+@inline function _immersed_cell(i, j, k, underlying_grid, ib::PartialCellBottom)
+    # Face node above current cell
+    z = znode(i, j, k+1, underlying_grid, c, c, f)
+    h = @inbounds ib.bottom_height[i, j]
+    return z <= h
+end
+
+@inline bottom_cell(i, j, k, ibg::PCBIBG) = !immersed_cell(i, j, k,   ibg.underlying_grid, ibg.immersed_boundary) &
+                                            immersed_cell(i, j, k-1, ibg.underlying_grid, ibg.immersed_boundary)
+
+@inline function Δzᶜᶜᶜ(i, j, k, ibg::PCBIBG)
+    underlying_grid = ibg.underlying_grid
+    ib = ibg.immersed_boundary
+    # Height of the face above cell k on (c, c, f); only the z-coordinate is needed
+    z = znode(i, j, k+1, underlying_grid, c, c, f)
+
+    # Get bottom height and fractional Δz parameter
+    h = @inbounds ib.bottom_height[i, j]
+    ϵ = ib.minimum_fractional_cell_height
+
+    # Are we in a bottom cell?
+    at_the_bottom = bottom_cell(i, j, k, ibg)
+    # A bottom cell spans from h up to z, but is never thinner than ϵ * full_Δz
+    full_Δz = Δzᶜᶜᶜ(i, j, k, underlying_grid)
+    partial_Δz = max(ϵ * full_Δz, z - h)
+
+    return ifelse(at_the_bottom, partial_Δz, full_Δz)
+end
+
+@inline function Δzᶜᶜᶠ(i, j, k, ibg::PCBIBG)
+    just_above_bottom = bottom_cell(i, j, k-1, ibg)
+    zc = znode(i, j, k, ibg.underlying_grid, c, c, c)
+    zf = znode(i, j, k, ibg.underlying_grid, c, c, f)
+
+    full_Δz = Δzᶜᶜᶠ(i, j, k, ibg.underlying_grid)
+    partial_Δz = zc - zf + Δzᶜᶜᶜ(i, j, k-1, ibg) / 2
+
+    Δz = ifelse(just_above_bottom, partial_Δz, full_Δz)
+
+    return Δz
+end
+
+@inline Δzᶠᶜᶜ(i, j, k, ibg::PCBIBG) = min(Δzᶜᶜᶜ(i-1, j, k, ibg), Δzᶜᶜᶜ(i, j, k, ibg))
+@inline Δzᶜᶠᶜ(i, j, k, ibg::PCBIBG) = min(Δzᶜᶜᶜ(i, j-1, k, ibg), Δzᶜᶜᶜ(i, j, k, ibg))
+@inline Δzᶠᶠᶜ(i, j, k, ibg::PCBIBG) = min(Δzᶠᶜᶜ(i, j-1, k, ibg), Δzᶠᶜᶜ(i, j, k, ibg))
+      
+@inline Δzᶠᶜᶠ(i, j, k, ibg::PCBIBG) = min(Δzᶜᶜᶠ(i-1, j, k, ibg), Δzᶜᶜᶠ(i, j, k, ibg))
+@inline Δzᶜᶠᶠ(i, j, k, ibg::PCBIBG) = min(Δzᶜᶜᶠ(i, j-1, k, ibg), Δzᶜᶜᶠ(i, j, k, ibg))      
+@inline Δzᶠᶠᶠ(i, j, k, ibg::PCBIBG) = min(Δzᶠᶜᶠ(i, j-1, k, ibg), Δzᶠᶜᶠ(i, j, k, ibg))
+
+@inline z_bottom(i, j, ibg::PCBIBG) = @inbounds ibg.immersed_boundary.bottom_height[i, j]
diff --git a/src/ImmersedBoundaries/partial_cell_immersed_boundaries.jl b/src/ImmersedBoundaries/partial_cell_immersed_boundaries.jl
deleted file mode 100644
index 7d2f021ef2..0000000000
--- a/src/ImmersedBoundaries/partial_cell_immersed_boundaries.jl
+++ /dev/null
@@ -1,101 +0,0 @@
-using Oceananigans.Fields: fill_halo_regions!
-using Oceananigans.Architectures: arch_array
-using Printf
-
-#####
-##### PartialCellBottom
-#####
-
-struct PartialCellBottom{H, E} <: AbstractGridFittedBottom{H}
-    bottom_height :: H
-    minimum_fractional_Δz :: E
-end
-
-function Base.summary(ib::PartialCellBottom)
-    hmax = maximum(ib.bottom_height)
-    hmin = minimum(ib.bottom_height)
-    return @sprintf("PartialCellBottom(min(h)=%.2e, max(h)=%.2e, ϵ=%.1f)",
-                    hmin, hmax, ib.minimum_fractional_Δz)
-end
-
-Base.summary(ib::PartialCellBottom{<:Function}) = @sprintf("GridFittedBottom(%s, ϵ=%.1f)", ib.bottom_height, ib.minimum_fractional_Δz)
-
-
-# TODO: nicer show method?
-Base.show(io::IO, ib::PartialCellBottom) = print(io, summary(ib))
-
-"""
-    PartialCellBottom(bottom, minimum_height)
-
-Return an immersed boundary...
-"""
-PartialCellBottom(bottom_height; minimum_fractional_Δz=0.1) =
-    PartialCellBottom(bottom_height, minimum_fractional_Δz)
-
-"""
-
-        --x--
-          ∘   k+1
-    k+1 --x--    ↑      <- node z
-          ∘   k  | Δz
-      k --x--    ↓
-      
-Criterion is h >= z - ϵ Δz
-
-"""
-@inline function _immersed_cell(i, j, k, underlying_grid, ib::PartialCellBottom)
-    # Face node above current cell
-    z = znode(i, j, k+1, underlying_grid, c, c, f)
-    h = @inbounds ib.bottom_height[i, j]
-    return z <= h
-end
-
-const PCIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:PartialCellBottom}
-
-on_architecture(arch, ib::PartialCellBottom) = PartialCellBottom(arch_array(arch, ib.bottom_height), ib.minimum_fractional_Δz)
-Adapt.adapt_structure(to, ib::PartialCellBottom) = PartialCellBottom(adapt(to, ib.bottom_height), ib.minimum_fractional_Δz)     
-
-bottom_cell(i, j, k, ibg::PCIBG) = !immersed_cell(i, j, k,   ibg.underlying_grid, ibg.immersed_boundary) &
-                                    immersed_cell(i, j, k-1, ibg.underlying_grid, ibg.immersed_boundary)
-
-@inline function Δzᶜᶜᶜ(i, j, k, ibg::PCIBG)
-    underlying_grid = ibg.underlying_grid
-    ib = ibg.immersed_boundary
-    # Get node at face above and defining nodes on c,c,f
-    x, y, z = node(i, j, k+1, underlying_grid, c, c, f)
-
-    # Get bottom height and fractional Δz parameter
-    h = @inbounds ib.bottom_height[i, j]
-    ϵ = ibg.immersed_boundary.minimum_fractional_Δz
-
-    # Are we in a bottom cell?
-    at_the_bottom = bottom_cell(i, j, k, ibg)
-
-    full_Δz = Δzᶜᶜᶜ(i, j, k, ibg.underlying_grid)
-    partial_Δz = max(ϵ * full_Δz, z - h)
-
-    return ifelse(at_the_bottom, partial_Δz, full_Δz)
-end
-
-@inline function Δzᶜᶜᶠ(i, j, k, ibg::PCIBG)
-    just_above_bottom = bottom_cell(i, j, k-1, ibg)
-    zc = znode(i, j, k, ibg.underlying_grid, c, c, c)
-    zf = znode(i, j, k, ibg.underlying_grid, c, c, f)
-
-    full_Δz = Δzᶜᶜᶠ(i, j, k, ibg.underlying_grid)
-    partial_Δz = zc - zf + Δzᶜᶜᶜ(i, j, k-1, ibg) / 2
-
-    Δz = ifelse(just_above_bottom, partial_Δz, full_Δz)
-
-    return Δz
-end
-
-@inline Δzᶠᶜᶜ(i, j, k, ibg::PCIBG) = min(Δzᶜᶜᶜ(i-1, j, k, ibg), Δzᶜᶜᶜ(i, j, k, ibg))
-@inline Δzᶜᶠᶜ(i, j, k, ibg::PCIBG) = min(Δzᶜᶜᶜ(i, j-1, k, ibg), Δzᶜᶜᶜ(i, j, k, ibg))
-@inline Δzᶠᶠᶜ(i, j, k, ibg::PCIBG) = min(Δzᶠᶜᶜ(i, j-1, k, ibg), Δzᶠᶜᶜ(i, j, k, ibg))
-      
-@inline Δzᶠᶜᶠ(i, j, k, ibg::PCIBG) = min(Δzᶜᶜᶠ(i-1, j, k, ibg), Δzᶜᶜᶠ(i, j, k, ibg))
-@inline Δzᶜᶠᶠ(i, j, k, ibg::PCIBG) = min(Δzᶜᶜᶠ(i, j-1, k, ibg), Δzᶜᶜᶠ(i, j, k, ibg))      
-@inline Δzᶠᶠᶠ(i, j, k, ibg::PCIBG) = min(Δzᶠᶜᶠ(i, j-1, k, ibg), Δzᶠᶜᶠ(i, j, k, ibg))
-
-@inline z_bottom(i, j, ibg::PCIBG) = @inbounds ibg.immersed_boundary.bottom_height[i, j]

From 221a92f63c5c0ffd7aa653ba521d4a12fcf63268 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Tue, 11 Apr 2023 23:13:40 -0400
Subject: [PATCH 155/530] Better comment in set!

---
 src/Fields/set!.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Fields/set!.jl b/src/Fields/set!.jl
index 3dea8111fd..9ad53a1202 100644
--- a/src/Fields/set!.jl
+++ b/src/Fields/set!.jl
@@ -43,7 +43,7 @@ end
 
 function set!(u::Field, v::Field)
     # Note: we only copy interior points.
-    # To copy halos use `parent(u) .= parent(v)`.
+    # To copy halos use `copyto!(parent(u), parent(v))`.
     
     if architecture(u) === architecture(v)
         interior(u) .= interior(v)

From cd756c115985d0cd58e808a87e12bd4617c66539 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 11 Apr 2023 23:37:15 -0400
Subject: [PATCH 156/530] FT added to split explicit

---
 .../split_explicit_free_surface.jl                            | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl
index 76dd8b832a..bef6fdd9bc 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl
@@ -50,9 +50,9 @@ Keyword Arguments
                                  (i.e., ∑(aₘ m /M) = 1).
 - `timestepper`: Time stepping scheme used, either `ForwardBackwardScheme()` or `AdamsBashforth3Scheme()`.
 """
-SplitExplicitFreeSurface(; gravitational_acceleration = g_Earth, kwargs...) =
+SplitExplicitFreeSurface(FT::DataType=Float64; gravitational_acceleration = g_Earth, kwargs...) =
     SplitExplicitFreeSurface(nothing, nothing, nothing,
-                             gravitational_acceleration, SplitExplicitSettings(; kwargs...))
+                             gravitational_acceleration, SplitExplicitSettings(FT; kwargs...))
 
 # The new constructor is defined later on after the state, settings, auxiliary have been defined
 function FreeSurface(free_surface::SplitExplicitFreeSurface, velocities, grid)

From e83909846a45f3cb7c0f670bd7319db1220f4260 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 12 Apr 2023 01:19:34 -0400
Subject: [PATCH 157/530] see if it works like this

---
 .../split_explicit_free_surface_kernels.jl                    | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
index a29eddd93c..a62a0cc222 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
@@ -301,7 +301,7 @@ function split_explicit_free_surface_step!(free_surface::SplitExplicitFreeSurfac
     # Reset eta for the next timestep
     # this is the only way in which η̅ is used: as a smoother for the 
     # substepped η field
-    @apply_regionally set!(free_surface.η, free_surface.state.η̅)
+    @apply_regionally set_η!(free_surface.η, free_surface.state.η̅)
 
     fields_to_fill = (free_surface.state.U̅, free_surface.state.V̅)
     fill_halo_regions!(fields_to_fill; blocking = false)
@@ -313,6 +313,8 @@ function split_explicit_free_surface_step!(free_surface::SplitExplicitFreeSurfac
     return nothing
 end
 
+@inline set_η!(η, η̅) = parent(η) .= parent(η̅)
+
 function iterate_split_explicit!(free_surface, grid, Δt)
     arch = architecture(grid)
 

From 0a2686d81a64407b0b26f397e29d311d4fa7e951 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Wed, 12 Apr 2023 02:59:18 -0400
Subject: [PATCH 158/530] Update manifest

---
 Manifest.toml | 136 ++++++++++++++++++++++++++------------------------
 1 file changed, 71 insertions(+), 65 deletions(-)

diff --git a/Manifest.toml b/Manifest.toml
index a00adccbd5..e0ac67c193 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -1,6 +1,6 @@
 # This file is machine-generated - editing it directly is not advised
 
-julia_version = "1.8.5"
+julia_version = "1.8.2"
 manifest_format = "2.0"
 project_hash = "31dd08f0370dcfe9232b9df8c2131e894bcff3ca"
 
@@ -11,16 +11,16 @@ uuid = "c963dde9-0319-47f5-bf0c-b07d3c80ffa6"
 version = "0.1.4"
 
 [[deps.AMGX_jll]]
-deps = ["Artifacts", "CUDA_jll", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "c846a105d1bfabc86f6302d747cc48acbc7bb489"
+deps = ["Artifacts", "CUDA_Runtime_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg", "TOML"]
+git-tree-sha1 = "9a9e64c4d2acee7b89286985eaa7489ac3e97328"
 uuid = "656d14af-56e4-5275-8e68-4e861d7b5043"
-version = "2.1.0+0"
+version = "2.3.0+1"
 
 [[deps.AbstractFFTs]]
 deps = ["ChainRulesCore", "LinearAlgebra"]
-git-tree-sha1 = "69f7020bd72f069c219b5e8c236c1fa90d2cb409"
+git-tree-sha1 = "16b6dbc4cf7caee4e1e75c49485ec67b667098a0"
 uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
-version = "1.2.1"
+version = "1.3.1"
 
 [[deps.Adapt]]
 deps = ["LinearAlgebra"]
@@ -91,11 +91,17 @@ git-tree-sha1 = "1680366a69e9c95744ef23a239e6cfe61cf2e1ca"
 uuid = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57"
 version = "0.4.7"
 
-[[deps.CUDA_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "108e3ee33d8614b96c2ea43a621b0e8396d8a273"
-uuid = "e9e359dc-d701-5aa8-82ae-09bbf812ea83"
-version = "10.0.130+3"
+[[deps.CUDA_Driver_jll]]
+deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"]
+git-tree-sha1 = "498f45593f6ddc0adff64a9310bb6710e851781b"
+uuid = "4ee394cb-3365-5eb0-8335-949819d2adfc"
+version = "0.5.0+1"
+
+[[deps.CUDA_Runtime_jll]]
+deps = ["Artifacts", "CUDA_Driver_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
+git-tree-sha1 = "9ac3ffda60eeae5291be20f35ca264eb8e95bbc6"
+uuid = "76a88914-d11a-5bdc-97e0-2f5a05c973a2"
+version = "0.5.0+1"
 
 [[deps.ChainRulesCore]]
 deps = ["Compat", "LinearAlgebra", "SparseArrays"]
@@ -105,9 +111,9 @@ version = "1.15.7"
 
 [[deps.ChangesOfVariables]]
 deps = ["ChainRulesCore", "LinearAlgebra", "Test"]
-git-tree-sha1 = "844b061c104c408b24537482469400af6075aae4"
+git-tree-sha1 = "485193efd2176b88e6622a39a246f8c5b600e74e"
 uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
-version = "0.1.5"
+version = "0.1.6"
 
 [[deps.CommonSolve]]
 git-tree-sha1 = "9441451ee712d1aec22edad62db1a9af3dc8d852"
@@ -116,14 +122,14 @@ version = "0.2.3"
 
 [[deps.Compat]]
 deps = ["Dates", "LinearAlgebra", "UUIDs"]
-git-tree-sha1 = "61fdd77467a5c3ad071ef8277ac6bd6af7dd4c04"
+git-tree-sha1 = "7a60c856b9fa189eb34f5f8a6f6b5529b7942957"
 uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
-version = "4.6.0"
+version = "4.6.1"
 
 [[deps.CompilerSupportLibraries_jll]]
 deps = ["Artifacts", "Libdl"]
 uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
-version = "1.0.1+0"
+version = "0.5.2+0"
 
 [[deps.Crayons]]
 git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15"
@@ -177,9 +183,9 @@ uuid = "b305315f-e792-5b7a-8f41-49f472929428"
 version = "1.0.1"
 
 [[deps.ExprTools]]
-git-tree-sha1 = "56559bbef6ca5ea0c0818fa5c90320398a6fbf8d"
+git-tree-sha1 = "c1d06d129da9f55715c6c212866f5b1bddc5fa00"
 uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
-version = "0.1.8"
+version = "0.1.9"
 
 [[deps.FFTW]]
 deps = ["AbstractFFTs", "FFTW_jll", "LinearAlgebra", "MKL_jll", "Preferences", "Reexport"]
@@ -204,21 +210,21 @@ uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"
 
 [[deps.GPUArrays]]
 deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"]
-git-tree-sha1 = "4dfaff044eb2ce11a897fecd85538310e60b91e6"
+git-tree-sha1 = "9ade6983c3dbbd492cf5729f865fe030d1541463"
 uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
-version = "8.6.2"
+version = "8.6.6"
 
 [[deps.GPUArraysCore]]
 deps = ["Adapt"]
-git-tree-sha1 = "57f7cde02d7a53c9d1d28443b9f11ac5fbe7ebc9"
+git-tree-sha1 = "1cd7f0af1aa58abc02ea1d872953a97359cb87fa"
 uuid = "46192b85-c4d5-4398-a991-12ede77f4527"
-version = "0.1.3"
+version = "0.1.4"
 
 [[deps.GPUCompiler]]
 deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"]
-git-tree-sha1 = "95185985a5d2388c6d0fedb06181ad4ddd40e0cb"
+git-tree-sha1 = "19d693666a304e8c371798f4900f7435558c7cde"
 uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
-version = "0.17.2"
+version = "0.17.3"
 
 [[deps.Glob]]
 git-tree-sha1 = "4df9f7e06108728ebf00a0a11edee4b29a482bb2"
@@ -259,9 +265,9 @@ uuid = "3587e190-3f89-42d0-90ee-14403ec27112"
 version = "0.1.8"
 
 [[deps.IrrationalConstants]]
-git-tree-sha1 = "7fd44fd4ff43fc60815f8e764c0f352b83c49151"
+git-tree-sha1 = "630b497eafcc20001bba38a4651b327dcfc491d2"
 uuid = "92d709cd-6900-40b7-9082-c6be49f344b6"
-version = "0.1.1"
+version = "0.2.2"
 
 [[deps.IterativeSolvers]]
 deps = ["LinearAlgebra", "Printf", "Random", "RecipesBase", "SparseArrays"]
@@ -275,10 +281,10 @@ uuid = "82899510-4779-5014-852e-03e436cf321d"
 version = "1.0.0"
 
 [[deps.JLD2]]
-deps = ["FileIO", "MacroTools", "Mmap", "OrderedCollections", "Pkg", "Printf", "Reexport", "TranscodingStreams", "UUIDs"]
-git-tree-sha1 = "c3244ef42b7d4508c638339df1bdbf4353e144db"
+deps = ["FileIO", "MacroTools", "Mmap", "OrderedCollections", "Pkg", "Printf", "Reexport", "Requires", "TranscodingStreams", "UUIDs"]
+git-tree-sha1 = "42c17b18ced77ff0be65957a591d34f4ed57c631"
 uuid = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
-version = "0.4.30"
+version = "0.4.31"
 
 [[deps.JLLWrappers]]
 deps = ["Preferences"]
@@ -306,15 +312,15 @@ version = "0.8.6"
 
 [[deps.LLVM]]
 deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
-git-tree-sha1 = "b8ae281340f0d3e973aae7b96fb7502b0119b376"
+git-tree-sha1 = "f044a2796a9e18e0531b9b3072b0019a61f264bc"
 uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
-version = "4.15.0"
+version = "4.17.1"
 
 [[deps.LLVMExtra_jll]]
-deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg", "TOML"]
-git-tree-sha1 = "7718cf44439c676bc0ec66a87099f41015a522d6"
+deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
+git-tree-sha1 = "070e4b5b65827f82c16ae0916376cb47377aa1b5"
 uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
-version = "0.0.16+2"
+version = "0.0.18+0"
 
 [[deps.LazyArtifacts]]
 deps = ["Artifacts", "Pkg"]
@@ -354,9 +360,9 @@ uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 
 [[deps.LogExpFunctions]]
 deps = ["ChainRulesCore", "ChangesOfVariables", "DocStringExtensions", "InverseFunctions", "IrrationalConstants", "LinearAlgebra"]
-git-tree-sha1 = "680e733c3a0a9cea9e935c8c2184aea6a63fa0b5"
+git-tree-sha1 = "0a1b7c2863e44523180fdb3146534e265a91870b"
 uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
-version = "0.3.21"
+version = "0.3.23"
 
 [[deps.Logging]]
 uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
@@ -374,10 +380,10 @@ uuid = "da04e1cc-30fd-572f-bb4f-1f8673147195"
 version = "0.20.8"
 
 [[deps.MPICH_jll]]
-deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "MPIPreferences", "Pkg", "TOML"]
-git-tree-sha1 = "6d4fa43afab4611d090b11617ecea1a144b21d35"
+deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "MPIPreferences", "TOML"]
+git-tree-sha1 = "d790fbd913f85e8865c55bf4725aff197c5155c8"
 uuid = "7cb0a576-ebde-5e09-9194-50597f1243b4"
-version = "4.0.2+5"
+version = "4.1.1+1"
 
 [[deps.MPIPreferences]]
 deps = ["Libdl", "Preferences"]
@@ -386,10 +392,10 @@ uuid = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267"
 version = "0.1.7"
 
 [[deps.MPItrampoline_jll]]
-deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "MPIPreferences", "Pkg", "TOML"]
-git-tree-sha1 = "b3f9e42685b4ad614eca0b44bd863cd41b1c86ea"
+deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "MPIPreferences", "TOML"]
+git-tree-sha1 = "ad88f863a5a16b3e26d14446afd3cd746266281b"
 uuid = "f1f71cc9-e9ae-5b93-9b94-4fe0e1ad3748"
-version = "5.0.2+1"
+version = "5.2.1+3"
 
 [[deps.MacroTools]]
 deps = ["Markdown", "Random"]
@@ -408,9 +414,9 @@ version = "2.28.0+0"
 
 [[deps.MicrosoftMPI_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "a16aa086d335ed7e0170c5265247db29172af2f9"
+git-tree-sha1 = "a8027af3d1743b3bfae34e54872359fdebb31422"
 uuid = "9237b28f-5490-5468-be7b-bb81f5f5e6cf"
-version = "10.1.3+2"
+version = "10.1.3+4"
 
 [[deps.Mmap]]
 uuid = "a63ad114-7e13-5084-954f-fe012c677804"
@@ -452,16 +458,16 @@ uuid = "05823500-19ac-5b8b-9628-191a04bc5112"
 version = "0.8.1+0"
 
 [[deps.OpenMPI_jll]]
-deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "MPIPreferences", "Pkg", "TOML"]
-git-tree-sha1 = "346d6b357a480300ed7854dbc70e746ac52e10fd"
+deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "MPIPreferences", "TOML"]
+git-tree-sha1 = "f3080f4212a8ba2ceb10a34b938601b862094314"
 uuid = "fe0851c0-eecd-5654-98d4-656369965a5c"
-version = "4.1.3+3"
+version = "4.1.5+0"
 
 [[deps.OpenSSL_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "f6e9dba33f9f2c44e08a020b0caf6903be540004"
+git-tree-sha1 = "9ff31d101d987eb9d66bd8b176ac7c277beccd09"
 uuid = "458c3c95-2e84-50aa-8efc-19380b2a3a95"
-version = "1.1.19+0"
+version = "1.1.20+0"
 
 [[deps.OpenSpecFun_jll]]
 deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"]
@@ -476,9 +482,9 @@ version = "1.4.1"
 
 [[deps.Parsers]]
 deps = ["Dates", "SnoopPrecompile"]
-git-tree-sha1 = "151d91d63d8d6c1a5789ecb7de51547e00480f1b"
+git-tree-sha1 = "478ac6c952fddd4399e71d4779797c538d0ff2bf"
 uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
-version = "2.5.4"
+version = "2.5.8"
 
 [[deps.PencilArrays]]
 deps = ["Adapt", "ArrayInterface", "JSON3", "LinearAlgebra", "MPI", "OffsetArrays", "Random", "Reexport", "Requires", "StaticArrays", "StaticPermutations", "Strided", "TimerOutputs", "VersionParsing"]
@@ -509,9 +515,9 @@ uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 
 [[deps.ProgressBars]]
 deps = ["Printf"]
-git-tree-sha1 = "806ebc92e1b4b4f72192369a28dfcaf688566b2b"
+git-tree-sha1 = "9d84c8646109eb8bc7a006d59b157c64d5155c81"
 uuid = "49802e3a-d2f1-5c88-81d8-b72133a6f568"
-version = "1.4.1"
+version = "1.5.0"
 
 [[deps.Quaternions]]
 deps = ["LinearAlgebra", "Random", "RealDot"]
@@ -595,21 +601,21 @@ uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 
 [[deps.SpecialFunctions]]
 deps = ["ChainRulesCore", "IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"]
-git-tree-sha1 = "d75bda01f8c31ebb72df80a46c88b25d1c79c56d"
+git-tree-sha1 = "ef28127915f4229c971eb43f3fc075dd3fe91880"
 uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
-version = "2.1.7"
+version = "2.2.0"
 
 [[deps.Static]]
 deps = ["IfElse"]
-git-tree-sha1 = "c35b107b61e7f34fa3f124026f2a9be97dea9e1c"
+git-tree-sha1 = "08be5ee09a7632c32695d954a602df96a877bf0d"
 uuid = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
-version = "0.8.3"
+version = "0.8.6"
 
 [[deps.StaticArrays]]
 deps = ["LinearAlgebra", "Random", "StaticArraysCore", "Statistics"]
-git-tree-sha1 = "6954a456979f23d05085727adb17c4551c19ecd1"
+git-tree-sha1 = "b8d897fe7fa688e93aef573711cb207c08c9e11e"
 uuid = "90137ffa-7385-5640-81b9-e52037218182"
-version = "1.5.12"
+version = "1.5.19"
 
 [[deps.StaticArraysCore]]
 git-tree-sha1 = "6b7ba252635a5eff6a0b0664a41ee140a1c9e72a"
@@ -660,9 +666,9 @@ version = "1.0.1"
 
 [[deps.Tables]]
 deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "OrderedCollections", "TableTraits", "Test"]
-git-tree-sha1 = "c79322d36826aa2f4fd8ecfa96ddb47b174ac78d"
+git-tree-sha1 = "1544b926975372da01227b382066ab70e574a3ec"
 uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
-version = "1.10.0"
+version = "1.10.1"
 
 [[deps.Tar]]
 deps = ["ArgTools", "SHA"]
@@ -687,9 +693,9 @@ version = "0.5.22"
 
 [[deps.TranscodingStreams]]
 deps = ["Random", "Test"]
-git-tree-sha1 = "94f38103c984f89cf77c402f2a68dbd870f8165f"
+git-tree-sha1 = "0b829474fed270a4b0ab07117dce9b9a2fa7581a"
 uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
-version = "0.9.11"
+version = "0.9.12"
 
 [[deps.TupleTools]]
 git-tree-sha1 = "3c712976c47707ff893cf6ba4354aa14db1d8938"
@@ -710,9 +716,9 @@ version = "0.2.1"
 
 [[deps.UnsafeAtomicsLLVM]]
 deps = ["LLVM", "UnsafeAtomics"]
-git-tree-sha1 = "33af9d2031d0dc09e2be9a0d4beefec4466def8e"
+git-tree-sha1 = "ead6292c02aab389cb29fe64cc9375765ab1e219"
 uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
-version = "0.1.0"
+version = "0.1.1"
 
 [[deps.VersionParsing]]
 git-tree-sha1 = "58d6e80b4ee071f5efd07fda82cb9fbe17200868"

From aa1524a5538c3eba0b3240f2bd3cc9b57e718ed8 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Tue, 11 Apr 2023 23:29:12 -0800
Subject: [PATCH 159/530] Move around imports

---
 src/ImmersedBoundaries/abstract_grid_fitted_boundary.jl | 3 +++
 src/ImmersedBoundaries/grid_fitted_bottom.jl            | 9 +--------
 src/ImmersedBoundaries/immersed_boundary_condition.jl   | 5 +++++
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/ImmersedBoundaries/abstract_grid_fitted_boundary.jl b/src/ImmersedBoundaries/abstract_grid_fitted_boundary.jl
index e3fe4e7bb6..5cfc40dcc3 100644
--- a/src/ImmersedBoundaries/abstract_grid_fitted_boundary.jl
+++ b/src/ImmersedBoundaries/abstract_grid_fitted_boundary.jl
@@ -1,5 +1,8 @@
 abstract type AbstractGridFittedBoundary <: AbstractImmersedBoundary end
 
+import Oceananigans.TurbulenceClosures: ivd_upper_diagonal,
+                                        ivd_lower_diagonal
+
 const GFIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:AbstractGridFittedBoundary}
 
 #####
diff --git a/src/ImmersedBoundaries/grid_fitted_bottom.jl b/src/ImmersedBoundaries/grid_fitted_bottom.jl
index 089be473d8..47ba897b6e 100644
--- a/src/ImmersedBoundaries/grid_fitted_bottom.jl
+++ b/src/ImmersedBoundaries/grid_fitted_bottom.jl
@@ -8,14 +8,7 @@ using Oceananigans.Architectures: arch_array
 using Oceananigans.BoundaryConditions: FBC
 using Printf
 
-import Oceananigans.TurbulenceClosures: ivd_upper_diagonal,
-                                        ivd_lower_diagonal,
-                                        z_bottom
-
-import Oceananigans.TurbulenceClosures: immersed_∂ⱼ_τ₁ⱼ,
-                                        immersed_∂ⱼ_τ₂ⱼ,
-                                        immersed_∂ⱼ_τ₃ⱼ,
-                                        immersed_∇_dot_qᶜ
+import Oceananigans.TurbulenceClosures: z_bottom
 
 #####
 ##### GridFittedBottom (2.5D immersed boundary with modified bottom height)
diff --git a/src/ImmersedBoundaries/immersed_boundary_condition.jl b/src/ImmersedBoundaries/immersed_boundary_condition.jl
index 4c2902f350..a50394cf2f 100644
--- a/src/ImmersedBoundaries/immersed_boundary_condition.jl
+++ b/src/ImmersedBoundaries/immersed_boundary_condition.jl
@@ -6,6 +6,11 @@ using Oceananigans.Operators: index_left, index_right, Δx, Δy, Δz, div
 
 import Oceananigans.BoundaryConditions: regularize_immersed_boundary_condition, bc_str
 
+import Oceananigans.TurbulenceClosures: immersed_∂ⱼ_τ₁ⱼ,
+                                        immersed_∂ⱼ_τ₂ⱼ,
+                                        immersed_∂ⱼ_τ₃ⱼ,
+                                        immersed_∇_dot_qᶜ
+
 struct ImmersedBoundaryCondition{W, E, S, N, B, T}
     west :: W                  
     east :: E

From 2cc1954efccd45e7938e0052e11483b829095b6f Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 12 Apr 2023 05:06:08 -0400
Subject: [PATCH 160/530] reduce tag

---
 src/Distributed/halo_communication.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index d50859b754..a7cdfc52f9 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -56,7 +56,7 @@ opposite_side = Dict(
 #   digits 7-9: the "to" rank
 
 RANK_DIGITS = 2
-ID_DIGITS   = 2
+ID_DIGITS   = 1
 LOC_DIGITS  = 1
 
 @inline loc_id(::Nothing) = 0

From 9515e1812786168b69e18ec505033ae01abc6b6f Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 12 Apr 2023 05:44:11 -0400
Subject: [PATCH 161/530] this should work

---
 src/Distributed/halo_communication.jl | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index a7cdfc52f9..546eb122de 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -57,34 +57,32 @@ opposite_side = Dict(
 
 RANK_DIGITS = 2
 ID_DIGITS   = 1
-LOC_DIGITS  = 1
 
-@inline loc_id(::Nothing) = 0
-@inline loc_id(::Face)    = 1
-@inline loc_id(::Center)  = 2
-@inline location_id(X, Y, Z) = loc_id(Z)
+@inline loc_id(::Nothing, tag) = tag + 5
+@inline loc_id(::Face,    tag) = tag
+@inline loc_id(::Center,  tag) = tag
+@inline location_id(X, Y, Z) = loc_id(Z, tag)
 
 for side in sides
     side_str = string(side)
     send_tag_fn_name = Symbol("$(side)_send_tag")
     recv_tag_fn_name = Symbol("$(side)_recv_tag")
     @eval begin
+        # REMEMBER, we need to reset the tag not more than once every four passes!!
         function $send_tag_fn_name(arch, location, local_rank, rank_to_send_to)
-            field_id    = string(arch.mpi_tag[1], pad=ID_DIGITS)
-            loc_id      = string(location_id(location...), pad=LOC_DIGITS)
+            field_id    = string(location_id(location..., arch.mpi_tag[1]), pad=ID_DIGITS)
             from_digits = string(local_rank, pad=RANK_DIGITS)
             to_digits   = string(rank_to_send_to, pad=RANK_DIGITS)
             side_digit  = string(side_id[Symbol($side_str)])
-            return parse(Int, loc_id * field_id * side_digit * from_digits * to_digits)
+            return parse(Int, field_id * side_digit * from_digits * to_digits)
         end
 
         function $recv_tag_fn_name(arch, location, local_rank, rank_to_recv_from)
-            field_id    = string(arch.mpi_tag[1], pad=ID_DIGITS)
-            loc_id      = string(location_id(location...), pad=LOC_DIGITS)
+            field_id    = string(location_id(location..., arch.mpi_tag[1]), pad=ID_DIGITS)
             from_digits = string(rank_to_recv_from, pad=RANK_DIGITS)
             to_digits   = string(local_rank, pad=RANK_DIGITS)
             side_digit  = string(side_id[opposite_side[Symbol($side_str)]])
-            return parse(Int, loc_id * field_id * side_digit * from_digits * to_digits)
+            return parse(Int, field_id * side_digit * from_digits * to_digits)
         end
     end
 end

From 381f124d06833dadef71eb3196096b9e5ce292c8 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 12 Apr 2023 05:44:29 -0400
Subject: [PATCH 162/530] shoot no errors pls

---
 test_partitioning.jl | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 test_partitioning.jl

diff --git a/test_partitioning.jl b/test_partitioning.jl
new file mode 100644
index 0000000000..a9b11f0295
--- /dev/null
+++ b/test_partitioning.jl
@@ -0,0 +1,44 @@
+using Oceananigans
+using Oceananigans.Distributed
+using Oceananigans.Distributed: partition_global_array
+using Oceananigans.Grids: architecture
+using Oceananigans.Units
+using MPI
+
+MPI.Init()
+
+comm   = MPI.COMM_WORLD
+rank   = MPI.Comm_rank(comm)
+Nranks = MPI.Comm_size(comm)
+
+topo = (Bounded, Periodic, Bounded)
+arch = DistributedArch(CPU(); topology = topo, 
+                 ranks=(Nranks, 1, 1),
+                 use_buffers = true)
+
+Lh = 100kilometers
+Lz = 400meters
+
+Nx = [10, 13, 18, 39]
+
+grid = RectilinearGrid(arch,
+                       size = (Nx[rank+1], 2, 1),
+                       x = (0, Lh), y = (0, Lh), z = (-Lz, 0),
+                       topology = topo,
+                       )
+
+
+array_full = zeros(prod(Nx), 2)
+for element in 1:prod(Nx)
+    array_full[element, :] .= element
+end
+
+arr = partition_global_array(architecture(grid), array_full, size(grid))
+
+@info "on rank $rank" size(grid) arr
+for r in 0:Nranks-1
+    if r == rank
+        @show rank arr
+    end
+    MPI.Barrier(MPI.COMM_WORLD)
+end
\ No newline at end of file

From 45195e8e1c9acf351b3ae05ced64e40d23949dc5 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 12 Apr 2023 05:46:11 -0400
Subject: [PATCH 163/530] error error

---
 src/Distributed/halo_communication.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 546eb122de..a5a0a0b5ba 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -61,7 +61,7 @@ ID_DIGITS   = 1
 @inline loc_id(::Nothing, tag) = tag + 5
 @inline loc_id(::Face,    tag) = tag
 @inline loc_id(::Center,  tag) = tag
-@inline location_id(X, Y, Z) = loc_id(Z, tag)
+@inline location_id(X, Y, Z, tag) = loc_id(Z, tag)
 
 for side in sides
     side_str = string(side)

From d7aedc2e8cec88bfbb1d955699991fd6e34eb27d Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 12 Apr 2023 05:52:39 -0400
Subject: [PATCH 164/530] more compressed tag

---
 src/Distributed/halo_communication.jl | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index a5a0a0b5ba..3fd4e69bbf 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -39,7 +39,7 @@ import Oceananigans.BoundaryConditions:
 #####
 
 sides  = (:west, :east, :south, :north, :top, :bottom)
-side_id = Dict(side => n for (n, side) in enumerate(sides))
+side_id = Dict(side => n-1 for (n, side) in enumerate(sides))
 
 opposite_side = Dict(
     :west => :east, :east => :west,
@@ -70,19 +70,19 @@ for side in sides
     @eval begin
         # REMEMBER, we need to reset the tag not more than once every four passes!!
         function $send_tag_fn_name(arch, location, local_rank, rank_to_send_to)
-            field_id    = string(location_id(location..., arch.mpi_tag[1]), pad=ID_DIGITS)
+            side_digit  = side_id[Symbol($side_str)]
+            field_id    = string(location_id(location..., arch.mpi_tag[1]) + side_digit, pad=ID_DIGITS)
             from_digits = string(local_rank, pad=RANK_DIGITS)
             to_digits   = string(rank_to_send_to, pad=RANK_DIGITS)
-            side_digit  = string(side_id[Symbol($side_str)])
-            return parse(Int, field_id * side_digit * from_digits * to_digits)
+            return parse(Int, field_id * from_digits * to_digits)
         end
 
         function $recv_tag_fn_name(arch, location, local_rank, rank_to_recv_from)
+            side_digit  = [opposite_side[Symbol($side_str)]]
             field_id    = string(location_id(location..., arch.mpi_tag[1]), pad=ID_DIGITS)
             from_digits = string(rank_to_recv_from, pad=RANK_DIGITS)
             to_digits   = string(local_rank, pad=RANK_DIGITS)
-            side_digit  = string(side_id[opposite_side[Symbol($side_str)]])
-            return parse(Int, field_id * side_digit * from_digits * to_digits)
+            return parse(Int, field_id * from_digits * to_digits)
         end
     end
 end

From 767c4e73d55af46111366d57fb04dcc18245fdaa Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 12 Apr 2023 06:02:01 -0400
Subject: [PATCH 165/530] another bugfix

---
 src/Distributed/halo_communication.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 3fd4e69bbf..59abe90401 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -58,6 +58,7 @@ opposite_side = Dict(
 RANK_DIGITS = 2
 ID_DIGITS   = 1
 
+# REMEMBER!!! This won't work for tracers!!! (It assumes you are passing maximum 4 at a time)
 @inline loc_id(::Nothing, tag) = tag + 5
 @inline loc_id(::Face,    tag) = tag
 @inline loc_id(::Center,  tag) = tag
@@ -79,7 +80,7 @@ for side in sides
 
         function $recv_tag_fn_name(arch, location, local_rank, rank_to_recv_from)
             side_digit  = [opposite_side[Symbol($side_str)]]
-            field_id    = string(location_id(location..., arch.mpi_tag[1]), pad=ID_DIGITS)
+            field_id    = string(location_id(location..., arch.mpi_tag[1]) + side_digit, pad=ID_DIGITS)
             from_digits = string(rank_to_recv_from, pad=RANK_DIGITS)
             to_digits   = string(local_rank, pad=RANK_DIGITS)
             return parse(Int, field_id * from_digits * to_digits)

From 95e46d88f91ed949cb562ca5f2716e1f6d0f1bb9 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 12 Apr 2023 06:13:43 -0400
Subject: [PATCH 166/530] it's getting late...

---
 src/Distributed/halo_communication.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 59abe90401..6cc3e62771 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -79,7 +79,7 @@ for side in sides
         end
 
         function $recv_tag_fn_name(arch, location, local_rank, rank_to_recv_from)
-            side_digit  = [opposite_side[Symbol($side_str)]]
+            side_digit  = side_id[opposite_side[Symbol($side_str)]]
             field_id    = string(location_id(location..., arch.mpi_tag[1]) + side_digit, pad=ID_DIGITS)
             from_digits = string(rank_to_recv_from, pad=RANK_DIGITS)
             to_digits   = string(local_rank, pad=RANK_DIGITS)

From 5f0e917bbbd84a34a286554c664330011c2bc404 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 12 Apr 2023 06:51:19 -0400
Subject: [PATCH 167/530] fix it for large MPI ranks

---
 src/Distributed/halo_communication.jl | 28 +++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 10666b72be..4557b7d741 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -64,39 +64,39 @@ opposite_side = Dict(
 #   digits 7-8: the "to" rank
 
 RANK_DIGITS = 2
-ID_DIGITS   = 2
-LOC_DIGITS  = 1
+ID_DIGITS   = 1
 
-@inline loc_id(::Nothing) = 0
-@inline loc_id(::Face)    = 1
-@inline loc_id(::Center)  = 2
-@inline location_id(X, Y, Z) = loc_id(Z)
+# REMEMBER!!! This won't work for tracers!!! (It assumes you are passing maximum 4 at a time)
+@inline loc_id(::Nothing, tag) = tag + 5
+@inline loc_id(::Face,    tag) = tag
+@inline loc_id(::Center,  tag) = tag
+@inline location_id(X, Y, Z, tag) = loc_id(Z, tag)
 
 for side in sides
     side_str = string(side)
     send_tag_fn_name = Symbol("$(side)_send_tag")
     recv_tag_fn_name = Symbol("$(side)_recv_tag")
     @eval begin
+        # REMEMBER, we need to reset the tag not more than once every four passes!!
         function $send_tag_fn_name(arch, location, local_rank, rank_to_send_to)
-            field_id    = string(arch.mpi_tag[1], pad=ID_DIGITS)
-            loc_id      = string(location_id(location...), pad=LOC_DIGITS)
+            side_digit  = side_id[Symbol($side_str)]
+            field_id    = string(location_id(location..., arch.mpi_tag[1]) + side_digit, pad=ID_DIGITS)
             from_digits = string(local_rank, pad=RANK_DIGITS)
             to_digits   = string(rank_to_send_to, pad=RANK_DIGITS)
-            side_digit  = string(side_id[Symbol($side_str)])
-            return parse(Int, loc_id * field_id * side_digit * from_digits * to_digits)
+            return parse(Int, field_id * from_digits * to_digits)
         end
 
         function $recv_tag_fn_name(arch, location, local_rank, rank_to_recv_from)
-            field_id    = string(arch.mpi_tag[1], pad=ID_DIGITS)
-            loc_id      = string(location_id(location...), pad=LOC_DIGITS)
+            side_digit  = side_id[opposite_side[Symbol($side_str)]]
+            field_id    = string(location_id(location..., arch.mpi_tag[1]) + side_digit, pad=ID_DIGITS)
             from_digits = string(rank_to_recv_from, pad=RANK_DIGITS)
             to_digits   = string(local_rank, pad=RANK_DIGITS)
-            side_digit  = string(side_id[opposite_side[Symbol($side_str)]])
-            return parse(Int, loc_id * field_id * side_digit * from_digits * to_digits)
+            return parse(Int, field_id * from_digits * to_digits)
         end
     end
 end
 
+
 #####
 ##### Filling halos for halo communication boundary conditions
 #####

From 7542389d17653fac7c3d9141abd29cabc779899a Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Wed, 12 Apr 2023 07:34:09 -0800
Subject: [PATCH 168/530] Update windy convection validation

---
 .../heterogeneous_windy_convection.jl         | 29 ++++++++++++++-----
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/validation/vertical_mixing_closures/heterogeneous_windy_convection.jl b/validation/vertical_mixing_closures/heterogeneous_windy_convection.jl
index 654356cf6e..9e0eb6aa98 100644
--- a/validation/vertical_mixing_closures/heterogeneous_windy_convection.jl
+++ b/validation/vertical_mixing_closures/heterogeneous_windy_convection.jl
@@ -3,25 +3,39 @@ using GLMakie
 using Oceananigans
 using Oceananigans.Units
 using Oceananigans.TurbulenceClosures: CATKEVerticalDiffusivity
-using Oceananigans.TurbulenceClosures: ExplicitTimeDiscretization
+using Oceananigans.ImmersedBoundaries: GridFittedBottom, PartialCellBottom
 
 Nx = 1
 Ny = 64
-Nz = 32
 
 const Lx = 100kilometers
 const Ly = Lx
-const Lz = 256
+const Lz = 1000
+
+# Stretched vertical grid
+γ = 1.02
+Δz₀ = 8
+h₀ = 128
+z = [-Δz₀ * k for k = 0:ceil(h₀ / Δz₀)]
+while z[end] > -Lz
+    push!(z, z[end] - (z[end-1] - z[end])^γ)
+end
+z = reverse(z)
+Nz = length(z) - 1
 
 grid = RectilinearGrid(size = (Nx, Ny, Nz),
+                       halo = (4, 4, 4),
                        x = (0, Lx),
-                       y = (0, Ly),
-                       z = (-Lz, 0),
+                       y = (-Ly/2, Ly/2),
+                       z = z,
                        topology=(Periodic, Bounded, Bounded))
 
+z_bottom(x, y) = - Lz * (1 - (2y / Ly)^2)
+grid = ImmersedBoundaryGrid(grid, GridFittedBottom(z_bottom))
+
 @show grid
 @inline Qᵇ(x, y, t) = 2e-8 #* sin(2π * y / Ly)
-@inline Qᵘ(x, y, t) = -1e-4 * sin(π * y / Ly)
+@inline Qᵘ(x, y, t) = -1e-4 * cos(π * y / Ly)
 
 b_top_bc = FluxBoundaryCondition(Qᵇ)
 u_top_bc = FluxBoundaryCondition(Qᵘ)
@@ -29,7 +43,6 @@ u_top_bc = FluxBoundaryCondition(Qᵘ)
 b_bcs = FieldBoundaryConditions(top=b_top_bc)
 u_bcs = FieldBoundaryConditions(top=u_top_bc)
 
-etd = ExplicitTimeDiscretization()
 closure = CATKEVerticalDiffusivity()
 
 model = HydrostaticFreeSurfaceModel(; grid, closure,
@@ -55,7 +68,7 @@ simulation.output_writers[:fields] = JLD2OutputWriter(model, merge(model.velocit
 function progress(sim)
     u, v, w = sim.model.velocities
     e = sim.model.tracers.e
-    κᶜ = sim.model.diffusivity_fields.Kᶜ
+    κᶜ = sim.model.diffusivity_fields.κᶜ
 
     msg = @sprintf("Iter: %d, t: %s, max|u|: (%6.2e, %6.2e, %6.2e) m s⁻¹", 
                    iteration(sim), prettytime(sim),

From 2e895fb1f614229211f041658527f2d17b82afd2 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Wed, 12 Apr 2023 14:49:36 -0800
Subject: [PATCH 169/530] More sophisticated windy convection

---
 .../heterogeneous_windy_convection.jl         | 174 ++++++++++++------
 1 file changed, 118 insertions(+), 56 deletions(-)

diff --git a/validation/vertical_mixing_closures/heterogeneous_windy_convection.jl b/validation/vertical_mixing_closures/heterogeneous_windy_convection.jl
index 9e0eb6aa98..3448f08345 100644
--- a/validation/vertical_mixing_closures/heterogeneous_windy_convection.jl
+++ b/validation/vertical_mixing_closures/heterogeneous_windy_convection.jl
@@ -1,19 +1,21 @@
 using Printf
+using Statistics
 using GLMakie
+
 using Oceananigans
 using Oceananigans.Units
 using Oceananigans.TurbulenceClosures: CATKEVerticalDiffusivity
 using Oceananigans.ImmersedBoundaries: GridFittedBottom, PartialCellBottom
 
 Nx = 1
-Ny = 64
+Ny = 100
 
-const Lx = 100kilometers
+const Lx = 1000kilometers
 const Ly = Lx
 const Lz = 1000
 
 # Stretched vertical grid
-γ = 1.02
+γ = 1.01
 Δz₀ = 8
 h₀ = 128
 z = [-Δz₀ * k for k = 0:ceil(h₀ / Δz₀)]
@@ -31,11 +33,11 @@ grid = RectilinearGrid(size = (Nx, Ny, Nz),
                        topology=(Periodic, Bounded, Bounded))
 
 z_bottom(x, y) = - Lz * (1 - (2y / Ly)^2)
-grid = ImmersedBoundaryGrid(grid, GridFittedBottom(z_bottom))
+grid = ImmersedBoundaryGrid(grid, PartialCellBottom(z_bottom, minimum_fractional_cell_height=0.1))
 
 @show grid
-@inline Qᵇ(x, y, t) = 2e-8 #* sin(2π * y / Ly)
-@inline Qᵘ(x, y, t) = -1e-4 * cos(π * y / Ly)
+@inline Qᵇ(x, y, t) = 1e-7
+@inline Qᵘ(x, y, t) = -1e-3 * cos(π * y / Ly)
 
 b_top_bc = FluxBoundaryCondition(Qᵇ)
 u_top_bc = FluxBoundaryCondition(Qᵘ)
@@ -43,32 +45,49 @@ u_top_bc = FluxBoundaryCondition(Qᵘ)
 b_bcs = FieldBoundaryConditions(top=b_top_bc)
 u_bcs = FieldBoundaryConditions(top=u_top_bc)
 
-closure = CATKEVerticalDiffusivity()
+vertical_mixing = CATKEVerticalDiffusivity()
+#vertical_mixing = RiBasedVerticalDiffusivity()
+
+Δy = Ly / Ny
+ν₄ = Δy^4 / 70minutes
+hyperviscosity = HorizontalScalarBiharmonicDiffusivity(ν=ν₄)
+
+#closure = vertical_mixing
+closure = (vertical_mixing, hyperviscosity)
+
+filename = "heterogeneous_cooling_with_hyperviscosity.jld2"
 
 model = HydrostaticFreeSurfaceModel(; grid, closure,
                                     momentum_advection = WENO(),
                                     tracer_advection = WENO(),
+                                    coriolis = FPlane(f=1e-4),
                                     tracers = (:b, :e),
                                     boundary_conditions = (; b=b_bcs, u=u_bcs),
                                     buoyancy = BuoyancyTracer())
 
 N² = 1e-5
-h = Lz / 3
 bᵢ(x, y, z) = N² * z
 set!(model, b=bᵢ, e=1e-6)
 
-simulation = Simulation(model, Δt=10minute, stop_iteration=400)
+simulation = Simulation(model, Δt=5minute, stop_time=2days)
+
+κᶜ = if model.closure isa Tuple
+    model.diffusivity_fields[1].κᶜ
+else
+    model.diffusivity_fields.κᶜ
+end
+
+outputs = (; model.velocities..., model.tracers..., κᶜ=κᶜ)
 
-filename = "heterogeneous_cooling.jld2"
-simulation.output_writers[:fields] = JLD2OutputWriter(model, merge(model.velocities, model.tracers);
+simulation.output_writers[:fields] = JLD2OutputWriter(model, outputs;
                                                       filename,
-                                                      schedule = IterationInterval(1),
+                                                      schedule = TimeInterval(1hour),
                                                       overwrite_existing = true)
 
 function progress(sim)
     u, v, w = sim.model.velocities
     e = sim.model.tracers.e
-    κᶜ = sim.model.diffusivity_fields.κᶜ
+
 
     msg = @sprintf("Iter: %d, t: %s, max|u|: (%6.2e, %6.2e, %6.2e) m s⁻¹", 
                    iteration(sim), prettytime(sim),
@@ -91,77 +110,120 @@ e_ts = FieldTimeSeries(filename, "e")
 u_ts = FieldTimeSeries(filename, "u")
 v_ts = FieldTimeSeries(filename, "v")
 w_ts = FieldTimeSeries(filename, "w")
+#κ_ts = FieldTimeSeries(filename, "κᶜ")
 Nt = length(b_ts.times)
 
-fig = Figure(resolution=(1600, 1200))
+for ψ in (b_ts, e_ts, u_ts, v_ts, w_ts)
+    ψp = parent(ψ)
+    ψp[ψp .== 0] .= NaN
+end
+
+fig = Figure(resolution=(1600, 800))
 
-ax_bxy = Axis(fig[1, 1])
-ax_uxy = Axis(fig[1, 2])
-ax_exy = Axis(fig[1, 3])
-ax_vyz = Axis(fig[2, 1])
-ax_wyz = Axis(fig[2, 2])
-ax_eyz = Axis(fig[2, 3])
+ax_uyz = Axis(fig[1, 1], title="u(y, z) - <u(y, z)>")
+ax_vyz = Axis(fig[1, 2], title="v(y, z)")
+ax_wyz = Axis(fig[1, 3], title="w(y, z)")
+ax_eyz = Axis(fig[1, 4], title="e(y, z)")
+#ax_κyz = Axis(fig[1, 4], title="κ(y, z)")
 
-ax_ey = Axis(fig[3, 1:2])
-ax_ez = Axis(fig[3, 3])
+ax_bz = Axis(fig[2, 1], title="b(z)", xlabel="y")
+ax_uz = Axis(fig[2, 2], title="u(z)", ylabel="z")
+ax_vz = Axis(fig[2, 3], title="v(z)", ylabel="z")
+ax_ez = Axis(fig[2, 4], title="e(z)", ylabel="z")
+#ax_κz = Axis(fig[2, 4], title="κ(z)", ylabel="z")
 
-slider = Slider(fig[4, :], range=1:Nt, startvalue=1)
+slider = Slider(fig[3, :], range=1:Nt, startvalue=1)
 n = slider.value
 
-b_xy = @lift interior(b_ts[$n], :, :, Nz)
-b_xz = @lift interior(b_ts[$n], :, 1, :)
-b_yz = @lift interior(b_ts[$n], 1, :, :)
+title = @lift string("Two-dimensional channel at t = ", prettytime(b_ts.times[$n]))
+Label(fig[0, :], title, fontsize=24)
 
-e_xy = @lift interior(e_ts[$n], :, :, Nz)
-e_xz = @lift interior(e_ts[$n], :, 1, :)
+b_yz = @lift interior(b_ts[$n], 1, :, :)
 e_yz = @lift interior(e_ts[$n], 1, :, :)
 
-e_y1 = @lift interior(e_ts[$n], 1, :, 32)
-e_y2 = @lift interior(e_ts[$n], 1, :, 30)
-e_y3 = @lift interior(e_ts[$n], 1, :, 28)
+u_yz = @lift begin
+    u = interior(u_ts[$n], 1, :, :)
+    u .- mean(filter(!isnan, u))
+end
+
+v_yz = @lift interior(v_ts[$n], 1, :, :)
+w_yz = @lift interior(w_ts[$n], 1, :, :)
+w_yz = @lift interior(w_ts[$n], 1, :, :)
+#κ_yz = @lift interior(κ_ts[$n], 1, :, :)
+
+Nx, Ny, Nz = size(grid)
+
+b_z1 = @lift interior(b_ts[$n], 1, 16, :)
+b_z2 = @lift interior(b_ts[$n], 1, 32, :)
+b_z3 = @lift interior(b_ts[$n], 1, 8, :)
+
 e_z1 = @lift interior(e_ts[$n], 1, 16, :)
 e_z2 = @lift interior(e_ts[$n], 1, 32, :)
 e_z3 = @lift interior(e_ts[$n], 1, 8, :)
 
-u_xy = @lift interior(u_ts[$n], :, :, Nz)
-u_xz = @lift interior(u_ts[$n], :, 1, :)
-u_yz = @lift interior(u_ts[$n], 1, :, :)
+# κ_z1 = @lift interior(κ_ts[$n], 1, 16, :)
+# κ_z2 = @lift interior(κ_ts[$n], 1, 32, :)
+# κ_z3 = @lift interior(κ_ts[$n], 1, 8, :)
 
-v_xy = @lift interior(v_ts[$n], :, :, Nz)
-v_xz = @lift interior(v_ts[$n], :, 1, :)
-v_yz = @lift interior(v_ts[$n], 1, :, :)
+u_z1 = @lift interior(u_ts[$n], 1, 16, :)
+u_z2 = @lift interior(u_ts[$n], 1, 32, :)
+u_z3 = @lift interior(u_ts[$n], 1, 8, :)
 
-w_xy = @lift interior(w_ts[$n], :, :, Nz)
-w_xz = @lift interior(w_ts[$n], :, 1, :)
-w_yz = @lift interior(w_ts[$n], 1, :, :)
+v_z1 = @lift interior(v_ts[$n], 1, 16, :)
+v_z2 = @lift interior(v_ts[$n], 1, 32, :)
+v_z3 = @lift interior(v_ts[$n], 1, 8, :)
 
 x, y, z = nodes(b_ts)
+#xκ, yκ, zκ = nodes(κ_ts)
+
+elim = 6e-4
+ulim = 0.2
+vlim = 2e-2
+wlim = 2e-4
+κlim = 1e1
+
+heatmap!(ax_eyz, y, z, e_yz, colormap=:solar, colorrange=(0, elim), nan_color=:gray)
+contour!(ax_eyz, y, z, b_yz, levels=15, color=:black)
 
-heatmap!(ax_bxy, x, y, b_xy)
-heatmap!(ax_uxy, x, y, u_xy)
-heatmap!(ax_exy, x, y, e_xy)
+#heatmap!(ax_κyz, y, zκ κ_yz, colormap=:thermal, colorrange=(0, κlim), nan_color=:gray)
+#contour!(ax_κyz, y, z, b_yz, levels=15, color=:black)
 
-heatmap!(ax_eyz, y, z, e_yz)
-#contour!(ax_eyz, y, z, b_yz, levels=15, linecolor=:black)
+heatmap!(ax_uyz, y, z, u_yz, colormap=:balance, colorrange=(-ulim, ulim), nan_color=:gray)
+contour!(ax_uyz, y, z, b_yz, levels=15, color=:black)
 
-heatmap!(ax_vyz, y, z, v_yz)
-#contour!(ax_uyz, y, z, b_yz, levels=15, linecolor=:black)
+heatmap!(ax_vyz, y, z, v_yz, colormap=:balance, colorrange=(-vlim, vlim), nan_color=:gray)
+contour!(ax_vyz, y, z, b_yz, levels=15, color=:black)
 
-heatmap!(ax_wyz, y, z, w_yz)
-#contour!(ax_vyz, y, z, b_yz, levels=15, linecolor=:black)
+heatmap!(ax_wyz, y, z, w_yz, colormap=:balance, colorrange=(-wlim, wlim), nan_color=:gray)
+contour!(ax_wyz, y, z, b_yz, levels=15, color=:black)
 
-heatmap!(ax_vyz, y, z, v_yz)
-#contour!(ax_vyz, y, z, b_yz, levels=15, linecolor=:black)
+lines!(ax_bz, b_z1, z)
+lines!(ax_bz, b_z2, z)
+lines!(ax_bz, b_z3, z)
 
-lines!(ax_ey, y, e_y1)
-lines!(ax_ey, y, e_y2)
-lines!(ax_ey, y, e_y3)
 lines!(ax_ez, e_z1, z)
 lines!(ax_ez, e_z2, z)
 lines!(ax_ez, e_z3, z)
 
-ylims!(ax_ey, -1e-5, 4e-4)
-xlims!(ax_ez, -1e-5, 4e-4)
+lines!(ax_uz, u_z1, z)
+lines!(ax_uz, u_z2, z)
+lines!(ax_uz, u_z3, z)
+
+lines!(ax_vz, v_z1, z)
+lines!(ax_vz, v_z2, z)
+lines!(ax_vz, v_z3, z)
+
+xlims!(ax_ez, -elim/10, 2elim)
+xlims!(ax_uz, -2ulim, 2ulim)
+xlims!(ax_vz, -2vlim, 2vlim)
+ylims!(ax_bz, -1020, 20)
+ylims!(ax_uz, -1020, 20)
+ylims!(ax_vz, -1020, 20)
+ylims!(ax_ez, -1020, 20)
 
 display(fig)
 
+record(fig, filename[1:end-5] * ".mp4", 1:Nt, framerate=24) do nn
+    @info "Plotting frame $nn of $Nt..."
+    n[] = nn
+end

From 5c699a38a41f1910c3b70cb3baa161451790599e Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Wed, 12 Apr 2023 14:49:45 -0800
Subject: [PATCH 170/530] Fix CATKE problems with tuples

---
 src/TurbulenceClosures/closure_tuples.jl      | 66 ++++++++++---------
 .../vertically_implicit_diffusion_solver.jl   | 22 +++----
 2 files changed, 47 insertions(+), 41 deletions(-)

diff --git a/src/TurbulenceClosures/closure_tuples.jl b/src/TurbulenceClosures/closure_tuples.jl
index e6423af216..c8cecb68d7 100644
--- a/src/TurbulenceClosures/closure_tuples.jl
+++ b/src/TurbulenceClosures/closure_tuples.jl
@@ -18,38 +18,44 @@ end
 ##### Kernel functions
 #####
 
-funcs     = [:∂ⱼ_τ₁ⱼ, :∂ⱼ_τ₂ⱼ, :∂ⱼ_τ₃ⱼ, :∇_dot_qᶜ, :maybe_tupled_ivd_upper_diagonal, :maybe_tupled_ivd_lower_diagonal, :maybe_tupled_implicit_linear_term]
-alt_funcs = [:∂ⱼ_τ₁ⱼ, :∂ⱼ_τ₂ⱼ, :∂ⱼ_τ₃ⱼ, :∇_dot_qᶜ, :ivd_upper_diagonal, :ivd_lower_diagonal, :implicit_linear_term]
+outer_tendency_functions = [:∂ⱼ_τ₁ⱼ, :∂ⱼ_τ₂ⱼ, :∂ⱼ_τ₃ⱼ, :∇_dot_qᶜ]
+inner_tendency_functions = [:∂ⱼ_τ₁ⱼ, :∂ⱼ_τ₂ⱼ, :∂ⱼ_τ₃ⱼ, :∇_dot_qᶜ]
 
-for (f, alt_f) in zip(funcs, alt_funcs)
+outer_ivd_functions = [:_ivd_upper_diagonal, :_ivd_lower_diagonal, :_implicit_linear_coefficient]
+inner_ivd_functions = [:ivd_upper_diagonal,  :ivd_lower_diagonal,   :implicit_linear_coefficient]
+
+outer_funcs = vcat(outer_tendency_functions, outer_ivd_functions)
+inner_funcs = vcat(inner_tendency_functions, inner_ivd_functions)
+
+for (outer_f, inner_f) in zip(outer_funcs, inner_funcs)
     @eval begin
-        @inline $f(i, j, k, grid, closures::Tuple{<:Any}, Ks, args...) =
-                    $alt_f(i, j, k, grid, closures[1], Ks[1], args...)
-
-        @inline $f(i, j, k, grid, closures::Tuple{<:Any, <:Any}, Ks, args...) = (
-                    $alt_f(i, j, k, grid, closures[1], Ks[1], args...)
-                  + $alt_f(i, j, k, grid, closures[2], Ks[2], args...))
-
-        @inline $f(i, j, k, grid, closures::Tuple{<:Any, <:Any, <:Any}, Ks, args...) = (
-                    $alt_f(i, j, k, grid, closures[1], Ks[1], args...)
-                  + $alt_f(i, j, k, grid, closures[2], Ks[2], args...) 
-                  + $alt_f(i, j, k, grid, closures[3], Ks[3], args...))
-
-        @inline $f(i, j, k, grid, closures::Tuple{<:Any, <:Any, <:Any, <:Any}, Ks, args...) = (
-                    $alt_f(i, j, k, grid, closures[1], Ks[1], args...)
-                  + $alt_f(i, j, k, grid, closures[2], Ks[2], args...) 
-                  + $alt_f(i, j, k, grid, closures[3], Ks[3], args...) 
-                  + $alt_f(i, j, k, grid, closures[4], Ks[4], args...))
-
-        @inline $f(i, j, k, grid, closures::Tuple{<:Any, <:Any, <:Any, <:Any, <:Any}, Ks, args...) = (
-                    $alt_f(i, j, k, grid, closures[1], Ks[1], args...)
-                  + $alt_f(i, j, k, grid, closures[2], Ks[2], args...) 
-                  + $alt_f(i, j, k, grid, closures[3], Ks[3], args...) 
-                  + $alt_f(i, j, k, grid, closures[4], Ks[4], args...)
-                  + $alt_f(i, j, k, grid, closures[5], Ks[5], args...))
-
-        @inline $f(i, j, k, grid, closures::Tuple, Ks, args...) = (
-                    $alt_f(i, j, k, grid, closures[1], Ks[1], args...)
+        @inline $outer_f(i, j, k, grid, closures::Tuple{<:Any}, Ks, args...) =
+                    $inner_f(i, j, k, grid, closures[1], Ks[1], args...)
+
+        @inline $outer_f(i, j, k, grid, closures::Tuple{<:Any, <:Any}, Ks, args...) = (
+                    $inner_f(i, j, k, grid, closures[1], Ks[1], args...)
+                  + $inner_f(i, j, k, grid, closures[2], Ks[2], args...))
+
+        @inline $outer_f(i, j, k, grid, closures::Tuple{<:Any, <:Any, <:Any}, Ks, args...) = (
+                    $inner_f(i, j, k, grid, closures[1], Ks[1], args...)
+                  + $inner_f(i, j, k, grid, closures[2], Ks[2], args...) 
+                  + $inner_f(i, j, k, grid, closures[3], Ks[3], args...))
+
+        @inline $outer_f(i, j, k, grid, closures::Tuple{<:Any, <:Any, <:Any, <:Any}, Ks, args...) = (
+                    $inner_f(i, j, k, grid, closures[1], Ks[1], args...)
+                  + $inner_f(i, j, k, grid, closures[2], Ks[2], args...) 
+                  + $inner_f(i, j, k, grid, closures[3], Ks[3], args...) 
+                  + $inner_f(i, j, k, grid, closures[4], Ks[4], args...))
+
+        @inline $outer_f(i, j, k, grid, closures::Tuple{<:Any, <:Any, <:Any, <:Any, <:Any}, Ks, args...) = (
+                    $inner_f(i, j, k, grid, closures[1], Ks[1], args...)
+                  + $inner_f(i, j, k, grid, closures[2], Ks[2], args...) 
+                  + $inner_f(i, j, k, grid, closures[3], Ks[3], args...) 
+                  + $inner_f(i, j, k, grid, closures[4], Ks[4], args...)
+                  + $inner_f(i, j, k, grid, closures[5], Ks[5], args...))
+
+        @inline $outer_f(i, j, k, grid, closures::Tuple, Ks, args...) = (
+                    $inner_f(i, j, k, grid, closures[1], Ks[1], args...)
                   + $f(i, j, k, grid, closures[2:end], Ks[2:end], args...))
     end
 end
diff --git a/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl b/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
index 209e4403ad..d4f7b53c55 100644
--- a/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
+++ b/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
@@ -88,13 +88,13 @@ end
 
 @inline ivd_diagonal(i, j, k, grid, closure, K, id, LX, LY, LZ, clock, Δt, κz) =
     one(eltype(grid)) -
-        Δt * maybe_tupled_implicit_linear_coefficient(i, j, k,   grid, closure, K, id, LX, LY, LZ, clock, Δt, κz) -
-                      maybe_tupled_ivd_upper_diagonal(i, j, k,   grid, closure, K, id, LX, LY, LZ, clock, Δt, κz) -
-                      maybe_tupled_ivd_lower_diagonal(i, j, k-1, grid, closure, K, id, LX, LY, LZ, clock, Δt, κz)
+        Δt * _implicit_linear_coefficient(i, j, k,   grid, closure, K, id, LX, LY, LZ, clock, Δt, κz) -
+                      _ivd_upper_diagonal(i, j, k,   grid, closure, K, id, LX, LY, LZ, clock, Δt, κz) -
+                      _ivd_lower_diagonal(i, j, k-1, grid, closure, K, id, LX, LY, LZ, clock, Δt, κz)
 
-@inline maybe_tupled_implicit_linear_coefficient(args...) = implicit_linear_coefficient(args...)
-@inline maybe_tupled_ivd_upper_diagonal(args...) = ivd_upper_diagonal(args...)
-@inline maybe_tupled_ivd_lower_diagonal(args...) = ivd_lower_diagonal(args...)
+@inline _implicit_linear_coefficient(args...) = implicit_linear_coefficient(args...)
+@inline _ivd_upper_diagonal(args...) = ivd_upper_diagonal(args...)
+@inline _ivd_lower_diagonal(args...) = ivd_lower_diagonal(args...)
 
 #####
 ##### Solver constructor
@@ -125,18 +125,18 @@ function implicit_diffusion_solver(::VerticallyImplicitTimeDiscretization, grid)
                                  "grids that are Bounded in the z-direction.")
 
     z_solver = BatchedTridiagonalSolver(grid;
-                                        lower_diagonal = Val(:maybe_tupled_ivd_lower_diagonal),
+                                        lower_diagonal = Val(:_ivd_lower_diagonal),
                                         diagonal       = Val(:ivd_diagonal),
-                                        upper_diagonal = Val(:maybe_tupled_ivd_upper_diagonal))
+                                        upper_diagonal = Val(:_ivd_upper_diagonal))
 
     return z_solver
 end
 
 # Extend the `get_coefficient` function to retrieve the correct `ivd_diagonal`, `ivd_lower_diagonal` and `ivd_upper_diagonal` functions
 # REMEMBER: `get_coefficient(f::Function, args...)` leads to massive performance decrease on the CPU (https://github.com/CliMA/Oceananigans.jl/issues/2996) 
-@inline get_coefficient(::Val{:maybe_tupled_ivd_lower_diagonal}, i, j, k, grid, p, args...) = maybe_tupled_ivd_lower_diagonal(i, j, k, grid, args...)
-@inline get_coefficient(::Val{:maybe_tupled_ivd_upper_diagonal}, i, j, k, grid, p, args...) = maybe_tupled_ivd_upper_diagonal(i, j, k, grid, args...)
-@inline get_coefficient(::Val{:ivd_diagonal}, i, j, k, grid, p, args...) = ivd_diagonal(i, j, k, grid, args...)
+@inline get_coefficient(::Val{:_ivd_lower_diagonal}, i, j, k, grid, p, args...) = _ivd_lower_diagonal(i, j, k, grid, args...)
+@inline get_coefficient(::Val{:_ivd_upper_diagonal}, i, j, k, grid, p, args...) = _ivd_upper_diagonal(i, j, k, grid, args...)
+@inline get_coefficient(::Val{:ivd_diagonal},        i, j, k, grid, p, args...) = ivd_diagonal(i, j, k, grid, args...)
 
 #####
 ##### Implicit step functions

From 98c7c58dde827f1287ce7b7c099cf95b318d6879 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 13 Apr 2023 14:03:27 -0400
Subject: [PATCH 171/530] function immersed boundary

---
 .../grid_fitted_immersed_boundaries.jl        | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl b/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl
index b490f788d3..bf4277bf8a 100644
--- a/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl
+++ b/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl
@@ -79,6 +79,8 @@ function ImmersedBoundaryGrid(grid, ib::AbstractGridFittedBottom)
     return ImmersedBoundaryGrid(grid, new_ib)
 end
 
+ImmersedBoundaryGrid(grid, ib::AbstractGridFittedBottom{<:Function}) = ImmersedBoundaryGrid(grid, ib)
+
 function ImmersedBoundaryGrid(grid, ib::AbstractGridFittedBottom{<:OffsetArray})
     TX, TY, TZ = topology(grid)
     validate_ib_size(grid, ib)
@@ -97,6 +99,22 @@ function validate_ib_size(grid, ib)
         throw(ArgumentError("The dimensions of the immersed boundary $(size(ib.bottom_height)) do not match the grid size $(bottom_height_size)"))
 end
 
+@inline function _immersed_cell(i, j, k, underlying_grid, ib::GridFittedBottom{<:Function, <:InterfaceImmersedCondition})
+    x = xnode(c, c, f, i, j, k+1, underlying_grid)
+    y = ynode(c, c, f, i, j, k+1, underlying_grid)
+    z = znode(c, c, f, i, j, k+1, underlying_grid)
+    h = @inbounds ib.bottom_height(x, y)
+    return z <= h
+end
+
+@inline function _immersed_cell(i, j, k, underlying_grid, ib::GridFittedBottom{<:Function, <:CenterImmersedCondition})
+    x = xnode(c, c, c, i, j, k, underlying_grid)
+    y = ynode(c, c, c, i, j, k, underlying_grid)
+    z = znode(c, c, c, i, j, k, underlying_grid)
+    h = @inbounds ib.bottom_height(x, y)
+    return z <= h
+end
+
 @inline function _immersed_cell(i, j, k, underlying_grid, ib::GridFittedBottom{<:Any, <:InterfaceImmersedCondition})
     z = znode(c, c, f, i, j, k+1, underlying_grid)
     h = @inbounds ib.bottom_height[i, j]
@@ -110,6 +128,11 @@ end
 end
 
 @inline bottom(i, j, k, ibg::GFIBG) = @inbounds ibg.immersed_boundary.bottom_height[i, j]
+# Bottom height of column (i, j) when `bottom_height` is a function of the horizontal node coordinates (x, y).
+@inline function bottom(i, j, k, ibg::ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:GridFittedBottom{<:Function}})
+    x, y = xnode(c, c, c, i, j, k, ibg.underlying_grid), ynode(c, c, c, i, j, k, ibg.underlying_grid)
+    return ibg.immersed_boundary.bottom_height(x, y) # evaluate at coordinates (as `_immersed_cell` does), not at grid indices
+end
 
 on_architecture(arch, ib::GridFittedBottom) = GridFittedBottom(arch_array(arch, ib.bottom_height))
 Adapt.adapt_structure(to, ib::GridFittedBottom) = GridFittedBottom(adapt(to, ib.bottom_height))     

From 540fb72f0f5bd54289251e0e123abf51c4d8de97 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 13 Apr 2023 14:26:29 -0400
Subject: [PATCH 172/530] bugfix

---
 src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl b/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl
index bf4277bf8a..33b4ac8b4b 100644
--- a/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl
+++ b/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl
@@ -79,7 +79,7 @@ function ImmersedBoundaryGrid(grid, ib::AbstractGridFittedBottom)
     return ImmersedBoundaryGrid(grid, new_ib)
 end
 
-ImmersedBoundaryGrid(grid, ib::AbstractGridFittedBottom{<:Function}) = ImmersedBoundaryGrid(grid, ib)
+ImmersedBoundaryGrid(grid, ib::AbstractGridFittedBottom{<:Function}) = ImmersedBoundaryGrid{TX, TY, TZ}(grid, ib, nothing)
 
 function ImmersedBoundaryGrid(grid, ib::AbstractGridFittedBottom{<:OffsetArray})
     TX, TY, TZ = topology(grid)

From 5e63c015319f12ed1bef13c6579543e1a62acba7 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 13 Apr 2023 14:35:05 -0400
Subject: [PATCH 173/530] bugfix

---
 src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl b/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl
index 33b4ac8b4b..818cebc0d6 100644
--- a/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl
+++ b/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl
@@ -79,7 +79,7 @@ function ImmersedBoundaryGrid(grid, ib::AbstractGridFittedBottom)
     return ImmersedBoundaryGrid(grid, new_ib)
 end
 
-ImmersedBoundaryGrid(grid, ib::AbstractGridFittedBottom{<:Function}) = ImmersedBoundaryGrid{TX, TY, TZ}(grid, ib, nothing)
+ImmersedBoundaryGrid(grid, ib::AbstractGridFittedBottom{<:Function}) = ImmersedBoundaryGrid{topology(grid)...}(grid, ib, nothing)
 
 function ImmersedBoundaryGrid(grid, ib::AbstractGridFittedBottom{<:OffsetArray})
     TX, TY, TZ = topology(grid)

From f8d8a927c85945773abd4892696a649801548918 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Thu, 13 Apr 2023 18:25:13 -0800
Subject: [PATCH 174/530] Better abstraction than truefunc and cosmetic change
 to conditional operation

---
 .../conditional_operations.jl                 | 40 +++++++++----------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/src/AbstractOperations/conditional_operations.jl b/src/AbstractOperations/conditional_operations.jl
index de17808f92..7934a0f54c 100644
--- a/src/AbstractOperations/conditional_operations.jl
+++ b/src/AbstractOperations/conditional_operations.jl
@@ -90,8 +90,20 @@ function ConditionalOperation(c::ConditionalOperation;
     return ConditionalOperation{LX, LY, LZ}(c.operand, func, c.grid, condition, mask)
 end
 
+struct TrueCondition end
+
+@inline function Base.getindex(c::ConditionalOperation, i, j, k)
+    return ifelse(evaluate_condition(c.condition, i, j, k, c.grid, c),
+                  c.func(getindex(c.operand, i, j, k)),
+                  c.mask)
+end
+
+@inline evaluate_condition(condition, i, j, k, grid, args...)                = condition(i, j, k, grid, args...)
+@inline evaluate_condition(::TrueCondition, i, j, k, grid, args...)          = true
+@inline evaluate_condition(condition::AbstractArray, i, j, k, grid, args...) = @inbounds condition[i, j, k]
+
 @inline condition_operand(func::Function, op::AbstractField, condition, mask) = ConditionalOperation(op; func, condition, mask)
-@inline condition_operand(func::Function, op::AbstractField, ::Nothing, mask) = ConditionalOperation(op; func, condition = truefunc, mask)
+@inline condition_operand(func::Function, op::AbstractField, ::Nothing, mask) = ConditionalOperation(op; func, condition=TrueCondition(), mask)
 
 @inline function condition_operand(func::Function, operand::AbstractField, condition::AbstractArray, mask)
     condition = arch_array(architecture(operand.grid), condition)
@@ -101,7 +113,13 @@ end
 @inline condition_operand(func::typeof(identity), c::ConditionalOperation, ::Nothing, mask) = ConditionalOperation(c; mask)
 @inline condition_operand(func::Function,         c::ConditionalOperation, ::Nothing, mask) = ConditionalOperation(c; func, mask)
 
-@inline truefunc(args...) = true
+@inline materialize_condition!(c::ConditionalOperation) = set!(c.operand, c)
+
+function materialize_condition(c::ConditionalOperation)
+    f = similar(c.operand)
+    set!(f, c)
+    return f
+end
 
 @inline condition_onefield(c::ConditionalOperation{LX, LY, LZ}, mask) where {LX, LY, LZ} =
                               ConditionalOperation{LX, LY, LZ}(OneField(Int), identity, c.grid, c.condition, mask)
@@ -116,27 +134,9 @@ Adapt.adapt_structure(to, c::ConditionalOperation{LX, LY, LZ}) where {LX, LY, LZ
                                      adapt(to, c.condition),
                                      adapt(to, c.mask))
 
-@inline function Base.getindex(c::ConditionalOperation, i, j, k)
-    return ifelse(get_condition(c.condition, i, j, k, c.grid, c),
-                  c.func(getindex(c.operand, i, j, k)),
-                  c.mask)
-end
-
-@inline concretize_condition!(c::ConditionalOperation) = set!(c.operand, c)
-
-function concretize_condition(c::ConditionalOperation)
-    f = similar(c.operand)
-    set!(f, c)
-    return f
-end
-
-@inline get_condition(condition, i, j, k, grid, args...)                = condition(i, j, k, grid, args...)
-@inline get_condition(condition::AbstractArray, i, j, k, grid, args...) = @inbounds condition[i, j, k]
-
 Base.summary(c::ConditionalOperation) = string("ConditionalOperation of ", summary(c.operand), " with condition ", summary(c.condition))
 
 compute_at!(c::ConditionalOperation, time) = compute_at!(c.operand, time)
-
 indices(c::ConditionalOperation) = indices(c.operand)
 
 Base.show(io::IO, operation::ConditionalOperation) =

From 61e17891153478643a98f1d73113b59d75e60f23 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Thu, 13 Apr 2023 18:25:34 -0800
Subject: [PATCH 175/530] Cosmetic change to field

---
 src/Fields/field.jl | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/Fields/field.jl b/src/Fields/field.jl
index db2f00f4a3..802242b8dc 100644
--- a/src/Fields/field.jl
+++ b/src/Fields/field.jl
@@ -546,12 +546,12 @@ const MinimumReduction = typeof(Base.minimum!)
 const AllReduction     = typeof(Base.all!)
 const AnyReduction     = typeof(Base.any!)
 
-check_version_larger_than_7() = VERSION.minor > 7
+isversion8⁺() = VERSION.minor > 7
 
-initialize_reduced_field!(::SumReduction,  f, r::ReducedField, c) = check_version_larger_than_7() ? Base.initarray!(interior(r), f, Base.add_sum, true, interior(c))  : Base.initarray!(interior(r), Base.add_sum, true, interior(c))
-initialize_reduced_field!(::ProdReduction, f, r::ReducedField, c) = check_version_larger_than_7() ? Base.initarray!(interior(r), f, Base.mul_prod, true, interior(c)) : Base.initarray!(interior(r), Base.mul_prod, true, interior(c))
-initialize_reduced_field!(::AllReduction,  f, r::ReducedField, c) = check_version_larger_than_7() ? Base.initarray!(interior(r), f, &, true, interior(c))             : Base.initarray!(interior(r), &, true, interior(c))
-initialize_reduced_field!(::AnyReduction,  f, r::ReducedField, c) = check_version_larger_than_7() ? Base.initarray!(interior(r), f, |, true, interior(c))             : Base.initarray!(interior(r), |, true, interior(c))
+initialize_reduced_field!(::SumReduction,  f, r::ReducedField, c) = isversion8⁺() ? Base.initarray!(interior(r), f, Base.add_sum, true, interior(c))  : Base.initarray!(interior(r), Base.add_sum, true, interior(c))
+initialize_reduced_field!(::ProdReduction, f, r::ReducedField, c) = isversion8⁺() ? Base.initarray!(interior(r), f, Base.mul_prod, true, interior(c)) : Base.initarray!(interior(r), Base.mul_prod, true, interior(c))
+initialize_reduced_field!(::AllReduction,  f, r::ReducedField, c) = isversion8⁺() ? Base.initarray!(interior(r), f, &, true, interior(c))             : Base.initarray!(interior(r), &, true, interior(c))
+initialize_reduced_field!(::AnyReduction,  f, r::ReducedField, c) = isversion8⁺() ? Base.initarray!(interior(r), f, |, true, interior(c))             : Base.initarray!(interior(r), |, true, interior(c))
 
 initialize_reduced_field!(::MaximumReduction, f, r::ReducedField, c) = Base.mapfirst!(f, interior(r), interior(c))
 initialize_reduced_field!(::MinimumReduction, f, r::ReducedField, c) = Base.mapfirst!(f, interior(r), interior(c))

From f2668f2c9af00b1770256cf1b4c555b1d18def6c Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Thu, 13 Apr 2023 18:26:32 -0800
Subject: [PATCH 176/530] Minor change to conditional fluxes

---
 src/ImmersedBoundaries/conditional_fluxes.jl | 57 ++++++++++----------
 1 file changed, 30 insertions(+), 27 deletions(-)

diff --git a/src/ImmersedBoundaries/conditional_fluxes.jl b/src/ImmersedBoundaries/conditional_fluxes.jl
index f256e5cb68..a535712c3d 100644
--- a/src/ImmersedBoundaries/conditional_fluxes.jl
+++ b/src/ImmersedBoundaries/conditional_fluxes.jl
@@ -10,13 +10,15 @@ const ATD = AbstractTimeDiscretization
 
 Return either
 
-    i) The boundary flux `qᴮ` if the node condition `nc` is true (default: `nc = peripheral_node`), or
+    i) The boundary flux `qᴮ` if the node condition `nc` is true (default: `nc = immersed_peripheral_node`), or
     ii) The interior flux `qᴵ` otherwise.
 
 This can be used either to condition intrinsic flux functions, or immersed boundary flux functions.
 """
-@inline conditional_flux(i, j, k, ibg, ℓx, ℓy, ℓz, qᴮ, qᴵ) =
-    ifelse(immersed_peripheral_node(i, j, k, ibg, ℓx, ℓy, ℓz), qᴮ, qᴵ)
+@inline function conditional_flux(i, j, k, ibg, ℓx, ℓy, ℓz, q_boundary, q_interior)
+    on_immersed_periphery = immersed_peripheral_node(i, j, k, ibg, ℓx, ℓy, ℓz)
+    return ifelse(on_immersed_periphery, q_boundary, q_interior)
+end
 
 # Conveniences
 @inline conditional_flux_ccc(i, j, k, ibg::IBG, qᴮ, qᴵ) = conditional_flux(i, j, k, ibg, c, c, c, qᴮ, qᴵ)
@@ -33,24 +35,24 @@ This can be used either to condition intrinsic flux functions, or immersed bound
 #####
 
 # ccc, ffc, fcf
-@inline _viscous_flux_ux(i, j, k, ibg::GFIBG, args...) = conditional_flux_ccc(i, j, k, ibg, zero(eltype(ibg)), viscous_flux_ux(i, j, k, ibg, args...))
-@inline _viscous_flux_uy(i, j, k, ibg::GFIBG, args...) = conditional_flux_ffc(i, j, k, ibg, zero(eltype(ibg)), viscous_flux_uy(i, j, k, ibg, args...))
-@inline _viscous_flux_uz(i, j, k, ibg::GFIBG, args...) = conditional_flux_fcf(i, j, k, ibg, zero(eltype(ibg)), viscous_flux_uz(i, j, k, ibg, args...))
+@inline _viscous_flux_ux(i, j, k, ibg::IBG, args...) = conditional_flux_ccc(i, j, k, ibg, zero(ibg), viscous_flux_ux(i, j, k, ibg, args...))
+@inline _viscous_flux_uy(i, j, k, ibg::IBG, args...) = conditional_flux_ffc(i, j, k, ibg, zero(ibg), viscous_flux_uy(i, j, k, ibg, args...))
+@inline _viscous_flux_uz(i, j, k, ibg::IBG, args...) = conditional_flux_fcf(i, j, k, ibg, zero(ibg), viscous_flux_uz(i, j, k, ibg, args...))
  
  # ffc, ccc, cff
-@inline _viscous_flux_vx(i, j, k, ibg::GFIBG, args...) = conditional_flux_ffc(i, j, k, ibg, zero(eltype(ibg)), viscous_flux_vx(i, j, k, ibg, args...))
-@inline _viscous_flux_vy(i, j, k, ibg::GFIBG, args...) = conditional_flux_ccc(i, j, k, ibg, zero(eltype(ibg)), viscous_flux_vy(i, j, k, ibg, args...))
-@inline _viscous_flux_vz(i, j, k, ibg::GFIBG, args...) = conditional_flux_cff(i, j, k, ibg, zero(eltype(ibg)), viscous_flux_vz(i, j, k, ibg, args...))
+@inline _viscous_flux_vx(i, j, k, ibg::IBG, args...) = conditional_flux_ffc(i, j, k, ibg, zero(ibg), viscous_flux_vx(i, j, k, ibg, args...))
+@inline _viscous_flux_vy(i, j, k, ibg::IBG, args...) = conditional_flux_ccc(i, j, k, ibg, zero(ibg), viscous_flux_vy(i, j, k, ibg, args...))
+@inline _viscous_flux_vz(i, j, k, ibg::IBG, args...) = conditional_flux_cff(i, j, k, ibg, zero(ibg), viscous_flux_vz(i, j, k, ibg, args...))
 
  # fcf, cff, ccc
-@inline _viscous_flux_wx(i, j, k, ibg::GFIBG, args...) = conditional_flux_fcf(i, j, k, ibg, zero(eltype(ibg)), viscous_flux_wx(i, j, k, ibg, args...))
-@inline _viscous_flux_wy(i, j, k, ibg::GFIBG, args...) = conditional_flux_cff(i, j, k, ibg, zero(eltype(ibg)), viscous_flux_wy(i, j, k, ibg, args...))
-@inline _viscous_flux_wz(i, j, k, ibg::GFIBG, args...) = conditional_flux_ccc(i, j, k, ibg, zero(eltype(ibg)), viscous_flux_wz(i, j, k, ibg, args...))
+@inline _viscous_flux_wx(i, j, k, ibg::IBG, args...) = conditional_flux_fcf(i, j, k, ibg, zero(ibg), viscous_flux_wx(i, j, k, ibg, args...))
+@inline _viscous_flux_wy(i, j, k, ibg::IBG, args...) = conditional_flux_cff(i, j, k, ibg, zero(ibg), viscous_flux_wy(i, j, k, ibg, args...))
+@inline _viscous_flux_wz(i, j, k, ibg::IBG, args...) = conditional_flux_ccc(i, j, k, ibg, zero(ibg), viscous_flux_wz(i, j, k, ibg, args...))
 
 # fcc, cfc, ccf
-@inline _diffusive_flux_x(i, j, k, ibg::GFIBG, args...) = conditional_flux_fcc(i, j, k, ibg, zero(eltype(ibg)), diffusive_flux_x(i, j, k, ibg, args...))
-@inline _diffusive_flux_y(i, j, k, ibg::GFIBG, args...) = conditional_flux_cfc(i, j, k, ibg, zero(eltype(ibg)), diffusive_flux_y(i, j, k, ibg, args...))
-@inline _diffusive_flux_z(i, j, k, ibg::GFIBG, args...) = conditional_flux_ccf(i, j, k, ibg, zero(eltype(ibg)), diffusive_flux_z(i, j, k, ibg, args...))
+@inline _diffusive_flux_x(i, j, k, ibg::IBG, args...) = conditional_flux_fcc(i, j, k, ibg, zero(ibg), diffusive_flux_x(i, j, k, ibg, args...))
+@inline _diffusive_flux_y(i, j, k, ibg::IBG, args...) = conditional_flux_cfc(i, j, k, ibg, zero(ibg), diffusive_flux_y(i, j, k, ibg, args...))
+@inline _diffusive_flux_z(i, j, k, ibg::IBG, args...) = conditional_flux_ccf(i, j, k, ibg, zero(ibg), diffusive_flux_z(i, j, k, ibg, args...))
 
 #####
 ##### Advective fluxes
@@ -58,25 +60,25 @@ This can be used either to condition intrinsic flux functions, or immersed bound
 
 # dx(uu), dy(vu), dz(wu)
 # ccc,    ffc,    fcf
-@inline _advective_momentum_flux_Uu(i, j, k, ibg::GFIBG, args...) = conditional_flux_ccc(i, j, k, ibg, zero(eltype(ibg)), advective_momentum_flux_Uu(i, j, k, ibg, args...))
-@inline _advective_momentum_flux_Vu(i, j, k, ibg::GFIBG, args...) = conditional_flux_ffc(i, j, k, ibg, zero(eltype(ibg)), advective_momentum_flux_Vu(i, j, k, ibg, args...))
-@inline _advective_momentum_flux_Wu(i, j, k, ibg::GFIBG, args...) = conditional_flux_fcf(i, j, k, ibg, zero(eltype(ibg)), advective_momentum_flux_Wu(i, j, k, ibg, args...))
+@inline _advective_momentum_flux_Uu(i, j, k, ibg::IBG, args...) = conditional_flux_ccc(i, j, k, ibg, zero(ibg), advective_momentum_flux_Uu(i, j, k, ibg, args...))
+@inline _advective_momentum_flux_Vu(i, j, k, ibg::IBG, args...) = conditional_flux_ffc(i, j, k, ibg, zero(ibg), advective_momentum_flux_Vu(i, j, k, ibg, args...))
+@inline _advective_momentum_flux_Wu(i, j, k, ibg::IBG, args...) = conditional_flux_fcf(i, j, k, ibg, zero(ibg), advective_momentum_flux_Wu(i, j, k, ibg, args...))
 
 # dx(uv), dy(vv), dz(wv)
 # ffc,    ccc,    cff
-@inline _advective_momentum_flux_Uv(i, j, k, ibg::GFIBG, args...) = conditional_flux_ffc(i, j, k, ibg, zero(eltype(ibg)), advective_momentum_flux_Uv(i, j, k, ibg, args...))
-@inline _advective_momentum_flux_Vv(i, j, k, ibg::GFIBG, args...) = conditional_flux_ccc(i, j, k, ibg, zero(eltype(ibg)), advective_momentum_flux_Vv(i, j, k, ibg, args...))
-@inline _advective_momentum_flux_Wv(i, j, k, ibg::GFIBG, args...) = conditional_flux_cff(i, j, k, ibg, zero(eltype(ibg)), advective_momentum_flux_Wv(i, j, k, ibg, args...))
+@inline _advective_momentum_flux_Uv(i, j, k, ibg::IBG, args...) = conditional_flux_ffc(i, j, k, ibg, zero(ibg), advective_momentum_flux_Uv(i, j, k, ibg, args...))
+@inline _advective_momentum_flux_Vv(i, j, k, ibg::IBG, args...) = conditional_flux_ccc(i, j, k, ibg, zero(ibg), advective_momentum_flux_Vv(i, j, k, ibg, args...))
+@inline _advective_momentum_flux_Wv(i, j, k, ibg::IBG, args...) = conditional_flux_cff(i, j, k, ibg, zero(ibg), advective_momentum_flux_Wv(i, j, k, ibg, args...))
 
 # dx(uw), dy(vw), dz(ww)
 # fcf,    cff,    ccc
-@inline _advective_momentum_flux_Uw(i, j, k, ibg::GFIBG, args...) = conditional_flux_fcf(i, j, k, ibg, zero(eltype(ibg)), advective_momentum_flux_Uw(i, j, k, ibg, args...))
-@inline _advective_momentum_flux_Vw(i, j, k, ibg::GFIBG, args...) = conditional_flux_cff(i, j, k, ibg, zero(eltype(ibg)), advective_momentum_flux_Vw(i, j, k, ibg, args...))
-@inline _advective_momentum_flux_Ww(i, j, k, ibg::GFIBG, args...) = conditional_flux_ccc(i, j, k, ibg, zero(eltype(ibg)), advective_momentum_flux_Ww(i, j, k, ibg, args...))
+@inline _advective_momentum_flux_Uw(i, j, k, ibg::IBG, args...) = conditional_flux_fcf(i, j, k, ibg, zero(ibg), advective_momentum_flux_Uw(i, j, k, ibg, args...))
+@inline _advective_momentum_flux_Vw(i, j, k, ibg::IBG, args...) = conditional_flux_cff(i, j, k, ibg, zero(ibg), advective_momentum_flux_Vw(i, j, k, ibg, args...))
+@inline _advective_momentum_flux_Ww(i, j, k, ibg::IBG, args...) = conditional_flux_ccc(i, j, k, ibg, zero(ibg), advective_momentum_flux_Ww(i, j, k, ibg, args...))
 
-@inline _advective_tracer_flux_x(i, j, k, ibg::GFIBG, args...) = conditional_flux_fcc(i, j, k, ibg, zero(eltype(ibg)), advective_tracer_flux_x(i, j, k, ibg, args...))
-@inline _advective_tracer_flux_y(i, j, k, ibg::GFIBG, args...) = conditional_flux_cfc(i, j, k, ibg, zero(eltype(ibg)), advective_tracer_flux_y(i, j, k, ibg, args...))
-@inline _advective_tracer_flux_z(i, j, k, ibg::GFIBG, args...) = conditional_flux_ccf(i, j, k, ibg, zero(eltype(ibg)), advective_tracer_flux_z(i, j, k, ibg, args...))
+@inline _advective_tracer_flux_x(i, j, k, ibg::IBG, args...) = conditional_flux_fcc(i, j, k, ibg, zero(ibg), advective_tracer_flux_x(i, j, k, ibg, args...))
+@inline _advective_tracer_flux_y(i, j, k, ibg::IBG, args...) = conditional_flux_cfc(i, j, k, ibg, zero(ibg), advective_tracer_flux_y(i, j, k, ibg, args...))
+@inline _advective_tracer_flux_z(i, j, k, ibg::IBG, args...) = conditional_flux_ccf(i, j, k, ibg, zero(ibg), advective_tracer_flux_z(i, j, k, ibg, args...))
 
 #####
 ##### "Boundary-aware" reconstruct
@@ -236,3 +238,4 @@ for bias in (:left_biased, :right_biased)
         end
     end
 end
+

From 9eba7ac5fdebafba92900baf198fe6f17204bf91 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Thu, 13 Apr 2023 18:26:54 -0800
Subject: [PATCH 177/530] Fix immersed reductions so they don't evaluate
 inactive nodes

---
 src/ImmersedBoundaries/immersed_reductions.jl | 33 ++++++++++---------
 1 file changed, 18 insertions(+), 15 deletions(-)

diff --git a/src/ImmersedBoundaries/immersed_reductions.jl b/src/ImmersedBoundaries/immersed_reductions.jl
index 2ff322c873..fb988db2be 100644
--- a/src/ImmersedBoundaries/immersed_reductions.jl
+++ b/src/ImmersedBoundaries/immersed_reductions.jl
@@ -1,6 +1,6 @@
 using Oceananigans.Fields: AbstractField, offset_compute_index, indices
 
-import Oceananigans.AbstractOperations: ConditionalOperation, get_condition, truefunc
+import Oceananigans.AbstractOperations: ConditionalOperation, evaluate_condition
 import Oceananigans.Fields: condition_operand, conditional_length
 
 #####
@@ -8,6 +8,8 @@ import Oceananigans.Fields: condition_operand, conditional_length
 ##### which includes both external nodes and nodes on the immersed interface.
 #####
 
+@inline truefunc(args...) = true
+
 struct NotImmersed{F} <: Function
     func :: F
 end
@@ -22,9 +24,10 @@ const IF = AbstractField{<:Any, <:Any, <:Any, <:ImmersedBoundaryGrid}
 @inline conditional_length(c::IF)       = conditional_length(condition_operand(identity, c, nothing, 0))
 @inline conditional_length(c::IF, dims) = conditional_length(condition_operand(identity, c, nothing, 0), dims)
 
-@inline function get_condition(condition::NotImmersed, i, j, k, ibg, co::ConditionalOperation, args...)
-    LX, LY, LZ = location(co)
-    return get_condition(condition.func, i, j, k, ibg, args...) & !(immersed_peripheral_node(i, j, k, ibg, LX(), LY(), LZ()))
+@inline function evaluate_condition(condition::NotImmersed, i, j, k, ibg, co::ConditionalOperation, args...)
+    ℓx, ℓy, ℓz = map(instantiate, location(co))
+    immersed = immersed_peripheral_node(i, j, k, ibg, ℓx, ℓy, ℓz) | inactive_node(i, j, k, ibg, ℓx, ℓy, ℓz)
+    return !immersed & evaluate_condition(condition.func, i, j, k, ibg, args...)
 end 
 
 #####
@@ -57,20 +60,20 @@ const IRF = Union{XIRF, YIRF, ZIRF, YZIRF, XZIRF, XYIRF, XYZIRF}
 @inline condition_operand(func::typeof(identity), op::IRF, ::Nothing, mask) = ConditionalOperation(op; func, condition=NotImmersedColumn(immersed_column(op), truefunc), mask)
 
 @inline function immersed_column(field::IRF)
-    reduced_dims  = reduced_dimensions(field)
-    full_location = fill_location.(location(field)) 
-    one_field    = ConditionalOperation{full_location...}(OneField(Int), identity, field.grid, NotImmersed(truefunc), 0.0)
-
-    return sum(one_field, dims = reduced_dims)
+    grid         = field.grid
+    reduced_dims = reduced_dimensions(field)
+    LX, LY, LZ   = map(center_to_nothing, location(field))
+    one_field    = ConditionalOperation{LX, LY, LZ}(OneField(Int), identity, grid, NotImmersed(truefunc), zero(grid))
+    return sum(one_field, dims=reduced_dims)
 end
 
-@inline fill_location(::Type{Face})    = Face
-@inline fill_location(::Type{Center})  = Center
-@inline fill_location(::Type{Nothing}) = Center
+@inline center_to_nothing(::Type{Face})    = Face
+@inline center_to_nothing(::Type{Center})  = Center
+@inline center_to_nothing(::Type{Nothing}) = Center
 
-@inline function get_condition(condition::NotImmersedColumn, i, j, k, ibg, co::ConditionalOperation, args...)
+@inline function evaluate_condition(condition::NotImmersedColumn, i, j, k, ibg, co::ConditionalOperation, args...)
     LX, LY, LZ = location(co)
-    return get_condition(condition.func, i, j, k, ibg, args...) & !(is_immersed_column(i, j, k, condition.immersed_column))
+    return evaluate_condition(condition.func, i, j, k, ibg, args...) & !(is_immersed_column(i, j, k, condition.immersed_column))
 end 
 
-is_immersed_column(i, j, k, column) = column[i, j, k] == 0
+@inline is_immersed_column(i, j, k, column) = @inbounds column[i, j, k] == 0

From 5d8cdf41116689dbf1f6c4caa981f57f5f9011f6 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Thu, 13 Apr 2023 18:27:05 -0800
Subject: [PATCH 178/530] Formatting

---
 src/ImmersedBoundaries/ImmersedBoundaries.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ImmersedBoundaries/ImmersedBoundaries.jl b/src/ImmersedBoundaries/ImmersedBoundaries.jl
index 15d2d8881d..81673894f4 100644
--- a/src/ImmersedBoundaries/ImmersedBoundaries.jl
+++ b/src/ImmersedBoundaries/ImmersedBoundaries.jl
@@ -153,8 +153,8 @@ end
 
 function show(io::IO, ibg::ImmersedBoundaryGrid)
     print(io, summary(ibg), ":", "\n",
-              "├── immersed_boundary: ", summary(ibg.immersed_boundary), "\n",
-              "├── underlying_grid: ", summary(ibg.underlying_grid), "\n")
+             "├── immersed_boundary: ", summary(ibg.immersed_boundary), "\n",
+             "├── underlying_grid: ", summary(ibg.underlying_grid), "\n")
 
     return show(io, ibg.underlying_grid, false)
 end

From 6f850dcfe56c8bcb2b99aa33a57f8c269717a04b Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Thu, 13 Apr 2023 18:27:33 -0800
Subject: [PATCH 179/530] Extend grid fitted functions to all IBG

---
 .../abstract_grid_fitted_boundary.jl          | 24 +++++++++----------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/src/ImmersedBoundaries/abstract_grid_fitted_boundary.jl b/src/ImmersedBoundaries/abstract_grid_fitted_boundary.jl
index 5cfc40dcc3..3a788263f2 100644
--- a/src/ImmersedBoundaries/abstract_grid_fitted_boundary.jl
+++ b/src/ImmersedBoundaries/abstract_grid_fitted_boundary.jl
@@ -1,7 +1,6 @@
 abstract type AbstractGridFittedBoundary <: AbstractImmersedBoundary end
 
-import Oceananigans.TurbulenceClosures: ivd_upper_diagonal,
-                                        ivd_lower_diagonal
+import Oceananigans.TurbulenceClosures: ivd_upper_diagonal, ivd_lower_diagonal
 
 const GFIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:AbstractGridFittedBoundary}
 
@@ -20,21 +19,20 @@ const GFIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:Abstract
 # Extend the upper and lower diagonal functions of the batched tridiagonal solver
 
 for location in (:upper_, :lower_)
-    immersed_func = Symbol(:immersed_ivd_, location, :diagonal)
     ordinary_func = Symbol(:ivd_ ,         location, :diagonal)
+    immersed_func = Symbol(:immersed_ivd_, location, :diagonal)
     @eval begin
         # Disambiguation
-        @inline $ordinary_func(i, j, k, ibg::GFIBG, closure, K, id, ℓx, ℓy, ℓz::Face, clock, Δt, κz) =
-                $immersed_func(i, j, k, ibg::GFIBG, closure, K, id, ℓx, ℓy, ℓz, clock, Δt, κz)
+        @inline $ordinary_func(i, j, k, ibg::IBG, closure, K, id, ℓx, ℓy, ℓz::Face, clock, Δt, κz) =
+                $immersed_func(i, j, k, ibg::IBG, closure, K, id, ℓx, ℓy, ℓz, clock, Δt, κz)
 
-        @inline $ordinary_func(i, j, k, ibg::GFIBG, closure, K, id, ℓx, ℓy, ℓz::Center, clock, Δt, κz) =
-                $immersed_func(i, j, k, ibg::GFIBG, closure, K, id, ℓx, ℓy, ℓz, clock, Δt, κz)
+        @inline $ordinary_func(i, j, k, ibg::IBG, closure, K, id, ℓx, ℓy, ℓz::Center, clock, Δt, κz) =
+                $immersed_func(i, j, k, ibg::IBG, closure, K, id, ℓx, ℓy, ℓz, clock, Δt, κz)
 
-        @inline function $immersed_func(i, j, k, ibg::GFIBG, closure, K, id, ℓx, ℓy, ℓz, clock, Δt, κz)
-            return ifelse(immersed_ivd_peripheral_node(i, j, k, ibg, ℓx, ℓy, ℓz),
-                          zero(eltype(ibg.underlying_grid)),
-                          $ordinary_func(i, j, k, ibg.underlying_grid, closure, K, id, ℓx, ℓy, ℓz, clock, Δt, κz))
-        end
+        @inline $immersed_func(i, j, k, ibg::IBG, closure, K, id, ℓx, ℓy, ℓz, clock, Δt, κz) =
+            ifelse(immersed_ivd_peripheral_node(i, j, k, ibg, ℓx, ℓy, ℓz),
+                   zero(ibg),
+                   $ordinary_func(i, j, k, ibg.underlying_grid, closure, K, id, ℓx, ℓy, ℓz, clock, Δt, κz))
     end
 end
 
@@ -43,7 +41,7 @@ end
 # rather than immersed_cell.
 const AGFB = AbstractGridFittedBoundary
 
-immersed_cell(i, j, k, grid, ib) = _immersed_cell(i, j, k, grid, ib)
+@inline immersed_cell(i, j, k, grid, ib) = _immersed_cell(i, j, k, grid, ib)
 
 @eval begin
     @inline immersed_cell(i, j, k, grid::AbstractGrid{<:Any, Flat, <:Any, <:Any}, ib::AGFB) = _immersed_cell(1, j, k, grid, ib)

From d04e5ad8d1456be5fa14891c338f29ce96632abb Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Thu, 13 Apr 2023 18:27:47 -0800
Subject: [PATCH 180/530] Cosmetics

---
 src/ImmersedBoundaries/partial_cell_bottom.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ImmersedBoundaries/partial_cell_bottom.jl b/src/ImmersedBoundaries/partial_cell_bottom.jl
index e30defb198..832012a82b 100644
--- a/src/ImmersedBoundaries/partial_cell_bottom.jl
+++ b/src/ImmersedBoundaries/partial_cell_bottom.jl
@@ -100,7 +100,7 @@ Criterion is h >= z - ϵ Δz
 end
 
 @inline bottom_cell(i, j, k, ibg::PCBIBG) = !immersed_cell(i, j, k,   ibg.underlying_grid, ibg.immersed_boundary) &
-                                            immersed_cell(i, j, k-1, ibg.underlying_grid, ibg.immersed_boundary)
+                                             immersed_cell(i, j, k-1, ibg.underlying_grid, ibg.immersed_boundary)
 
 @inline function Δzᶜᶜᶜ(i, j, k, ibg::PCBIBG)
     underlying_grid = ibg.underlying_grid

From a0b3d897c8d52f6be578b24428a4c1cc1c98ab67 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Thu, 13 Apr 2023 18:28:07 -0800
Subject: [PATCH 181/530] Improve height above bottom and depth

---
 src/TurbulenceClosures/TurbulenceClosures.jl | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/TurbulenceClosures/TurbulenceClosures.jl b/src/TurbulenceClosures/TurbulenceClosures.jl
index 3ade49b182..d3b03a0eab 100644
--- a/src/TurbulenceClosures/TurbulenceClosures.jl
+++ b/src/TurbulenceClosures/TurbulenceClosures.jl
@@ -92,23 +92,25 @@ function hydrostatic_turbulent_kinetic_energy_tendency end
 const c = Center()
 const f = Face()
 
-@inline z_top(i, j, grid)          = znode(i, j, grid.Nz+1, grid, c, c, f)
-@inline z_bottom(i, j,  grid)      = znode(i, j, 1,         grid, c, c, f)
+@inline z_top(i, j, grid)    = znode(i, j, grid.Nz+1, grid, c, c, f)
+@inline z_bottom(i, j, grid) = znode(i, j, 1,         grid, c, c, f)
 
 @inline depthᶜᶜᶠ(i, j, k, grid)    = clip(z_top(i, j, grid) - znode(i, j, k, grid, c, c, f))
 @inline depthᶜᶜᶜ(i, j, k, grid)    = clip(z_top(i, j, grid) - znode(i, j, k, grid, c, c, c))
 @inline total_depthᶜᶜᵃ(i, j, grid) = clip(z_top(i, j, grid) - z_bottom(i, j, grid))
 
 @inline function height_above_bottomᶜᶜᶠ(i, j, k, grid)
-    Δz = Δzᶜᶜᶠ(i, j, k, grid)
     h = znode(i, j, k, grid, c, c, f) - z_bottom(i, j, grid)
+
+    # Limit by thickness of cell below
+    Δz = Δzᶜᶜᶜ(i, j, k-1, grid)
     return max(Δz, h)
 end
 
 @inline function height_above_bottomᶜᶜᶜ(i, j, k, grid)
     Δz = Δzᶜᶜᶜ(i, j, k, grid)
     h = znode(i, j, k, grid, c, c, c) - z_bottom(i, j, grid)
-    return max(Δz, h)
+    return max(Δz/2, h)
 end
 
 @inline wall_vertical_distanceᶜᶜᶠ(i, j, k, grid) = min(depthᶜᶜᶠ(i, j, k, grid), height_above_bottomᶜᶜᶠ(i, j, k, grid))

From 6bf66137cf01f63b6eeb855e4bf459ece9b70782 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Thu, 13 Apr 2023 18:28:45 -0800
Subject: [PATCH 182/530] CATKE improvements for bottom boundary layers

---
 .../CATKEVerticalDiffusivities.jl             |  71 ++++++------
 .../mixing_length.jl                          | 106 ++++++++++--------
 .../turbulent_kinetic_energy_equation.jl      |  20 ++--
 3 files changed, 105 insertions(+), 92 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
index 7f9b88f916..77d3983b09 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
@@ -11,6 +11,7 @@ using Oceananigans.Fields
 using Oceananigans.Operators
 
 using Oceananigans.Utils: prettysummary
+using Oceananigans.Grids: peripheral_node, inactive_node, inactive_cell
 using Oceananigans.Fields: ZeroField
 using Oceananigans.BoundaryConditions: default_prognostic_bc, DefaultBoundaryCondition
 using Oceananigans.BoundaryConditions: BoundaryCondition, FieldBoundaryConditions
@@ -107,34 +108,10 @@ const FlavorOfCATKE{TD} = Union{CATKEVD{TD}, CATKEVDArray{TD}} where TD
 include("mixing_length.jl")
 include("turbulent_kinetic_energy_equation.jl")
 
-# Optimal parameters for "favorite CATKE" from Wagner et al. 2023 (in prep)
-optimal_turbulent_kinetic_energy_equation(FT) = TurbulentKineticEnergyEquation(
-    C⁻D  = FT(4.4),
-    C⁺D  = FT(3.3),
-    CᶜD  = FT(0.23),
-    CᵉD  = FT(0.0),
-    Cᵂu★ = FT(1.8),
-    CᵂwΔ = FT(12.0))
-
-optimal_mixing_length(FT) = MixingLength(
-    Cᵇ   = FT(0.37), 
-    Cᶜc  = FT(4.8),
-    Cᶜe  = FT(1.1),
-    Cᵉc  = FT(0.049),
-    Cᵉe  = FT(0.0),
-    Cˢᶜ  = FT(0.29),
-    C⁻u  = FT(0.36),
-    C⁺u  = FT(0.24),
-    C⁻c  = FT(0.41),
-    C⁺c  = FT(0.12),
-    C⁻e  = FT(6.7),
-    C⁺e  = FT(5.4),
-    CRiʷ = FT(0.011),
-    CRiᶜ = FT(0.76))
-
-function CATKEVerticalDiffusivity(time_discretization::TD = VerticallyImplicitTimeDiscretization(), FT=Float64;
-                                  mixing_length = optimal_mixing_length(FT),
-                                  turbulent_kinetic_energy_equation = optimal_turbulent_kinetic_energy_equation(FT),
+function CATKEVerticalDiffusivity(time_discretization::TD = VerticallyImplicitTimeDiscretization(),
+                                  FT = Float64;
+                                  mixing_length = MixingLength(),
+                                  turbulent_kinetic_energy_equation = TurbulentKineticEnergyEquation(),
                                   maximum_diffusivity = Inf,
                                   minimum_turbulent_kinetic_energy = 0,
                                   minimum_convective_buoyancy_flux = 1e-11,
@@ -252,6 +229,9 @@ function calculate_diffusivities!(diffusivities, closure::FlavorOfCATKE, model)
     return nothing
 end
 
+const c = Center()
+const f = Face()
+
 @kernel function calculate_CATKE_diffusivities!(diffusivities, grid, closure::FlavorOfCATKE, velocities, tracers, buoyancy, clock, top_tracer_bcs)
     i, j, k, = @index(Global, NTuple)
 
@@ -261,9 +241,19 @@ end
     max_K = closure_ij.maximum_diffusivity
 
     @inbounds begin
-        diffusivities.κᵘ[i, j, k] = min(max_K, κuᶜᶜᶠ(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, clock, top_tracer_bcs))
-        diffusivities.κᶜ[i, j, k] = min(max_K, κcᶜᶜᶠ(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, clock, top_tracer_bcs))
-        diffusivities.κᵉ[i, j, k] = min(max_K, κeᶜᶜᶠ(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, clock, top_tracer_bcs))
+        κᵘ★ = κuᶜᶜᶠ(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, clock, top_tracer_bcs)
+        κᶜ★ = κcᶜᶜᶠ(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, clock, top_tracer_bcs)
+        κᵉ★ = κeᶜᶜᶠ(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, clock, top_tracer_bcs)
+
+        on_periphery = peripheral_node(i, j, k, grid, c, c, f)
+        within_inactive = inactive_node(i, j, k, grid, c, c, f)
+        κᵘ★ = ifelse(on_periphery, zero(grid), ifelse(within_inactive, NaN, κᵘ★))
+        κᶜ★ = ifelse(on_periphery, zero(grid), ifelse(within_inactive, NaN, κᶜ★))
+        κᵉ★ = ifelse(on_periphery, zero(grid), ifelse(within_inactive, NaN, κᵉ★))
+
+        diffusivities.κᵘ[i, j, k] = min(max_K, κᵘ★) 
+        diffusivities.κᶜ[i, j, k] = min(max_K, κᶜ★)
+        diffusivities.κᵉ[i, j, k] = min(max_K, κᵉ★)
 
         # "Patankar trick" for buoyancy production (cf Patankar 1980 or Burchard et al. 2003)
         # If buoyancy flux is a _sink_ of TKE, we treat it implicitly.
@@ -273,8 +263,17 @@ end
         # See `buoyancy_flux`
         dissipative_buoyancy_flux = sign(wb) * sign(eⁱʲᵏ) < 0
         wb_e = ifelse(dissipative_buoyancy_flux, wb / eⁱʲᵏ, zero(grid))
+
+        # Implicit TKE flux at solid bottoms (extra damping for TKE near boundaries)
+        on_bottom = !inactive_cell(i, j, k, grid) & inactive_cell(i, j, k-1, grid)
+        Δz = Δzᶜᶜᶜ(i, j, k, grid)
+        Cᵂϵ = closure_ij.turbulent_kinetic_energy_equation.Cᵂϵ
+        Q_e = - Cᵂϵ * turbulent_velocityᶜᶜᶜ(i, j, k, grid, closure_ij, tracers.e) / Δz * on_bottom
+
+        # Implicit TKE dissipation
+        ϵ_e = implicit_dissipation_coefficient(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, clock, top_tracer_bcs)
         
-        diffusivities.Lᵉ[i, j, k] = - wb_e + implicit_dissipation_coefficient(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, clock, top_tracer_bcs)
+        diffusivities.Lᵉ[i, j, k] = - wb_e + ϵ_e + Q_e
     end
 end
 
@@ -283,7 +282,7 @@ end
     return @inbounds L[i, j, k]
 end
 
-@inline function turbulent_velocity(i, j, k, grid, closure, e)
+@inline function turbulent_velocityᶜᶜᶜ(i, j, k, grid, closure, e)
     eᵢ = @inbounds e[i, j, k]
     eᵐⁱⁿ = closure.minimum_turbulent_kinetic_energy
     return sqrt(max(eᵐⁱⁿ, eᵢ))
@@ -291,19 +290,19 @@ end
 @inline is_stableᶜᶜᶠ(i, j, k, grid, tracers, buoyancy) = ∂z_b(i, j, k, grid, buoyancy, tracers) >= 0
 
 @inline function κuᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, clock, top_tracer_bcs)
-    u★ = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocity, closure, tracers.e)
+    u★ = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocityᶜᶜᶜ, closure, tracers.e)
     ℓu = momentum_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, clock, top_tracer_bcs)
     return ℓu * u★
 end
 
 @inline function κcᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, clock, top_tracer_bcs)
-    u★ = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocity, closure, tracers.e)
+    u★ = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocityᶜᶜᶜ, closure, tracers.e)
     ℓc = tracer_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, clock, top_tracer_bcs)
     return ℓc * u★
 end
 
 @inline function κeᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, clock, top_tracer_bcs)
-    u★ = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocity, closure, tracers.e)
+    u★ = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocityᶜᶜᶜ, closure, tracers.e)
     ℓe = TKE_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, clock, top_tracer_bcs)
     return ℓe * u★
 end
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
index fc02c100ea..18609d9fb0 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
@@ -1,4 +1,11 @@
-using ..TurbulenceClosures: wall_vertical_distanceᶜᶜᶠ, wall_vertical_distanceᶜᶜᶜ, total_depthᶜᶜᵃ
+using ..TurbulenceClosures:
+    wall_vertical_distanceᶜᶜᶠ,
+    wall_vertical_distanceᶜᶜᶜ,
+    depthᶜᶜᶠ,
+    height_above_bottomᶜᶜᶠ,
+    depthᶜᶜᶜ,
+    height_above_bottomᶜᶜᶜ,
+    total_depthᶜᶜᵃ
 
 """
     struct MixingLength{FT}
@@ -6,20 +13,21 @@ using ..TurbulenceClosures: wall_vertical_distanceᶜᶜᶠ, wall_vertical_dista
 Contains mixing length parameters for CATKE vertical diffusivity.
 """
 Base.@kwdef struct MixingLength{FT}
-    Cᵇ   :: FT = Inf
-    Cᶜc  :: FT = 0.0
-    Cᶜe  :: FT = 0.0
-    Cᵉc  :: FT = 0.0
+    Cᴺ   :: FT = 0.37
+    Cᵇ   :: FT = 0.01
+    Cᶜc  :: FT = 4.8
+    Cᶜe  :: FT = 1.1
+    Cᵉc  :: FT = 0.049
     Cᵉe  :: FT = 0.0
-    Cˢᶜ  :: FT = 0.0
-    C⁻u  :: FT = 1.0
-    C⁺u  :: FT = 1.0
-    C⁻c  :: FT = 1.0
-    C⁺c  :: FT = 1.0
-    C⁻e  :: FT = 1.0
-    C⁺e  :: FT = 1.0
-    CRiʷ :: FT = 1.0
-    CRiᶜ :: FT = 0.0
+    Cˢᶜ  :: FT = 0.29
+    C⁻u  :: FT = 0.36
+    C⁺u  :: FT = 0.24
+    C⁻c  :: FT = 0.41
+    C⁺c  :: FT = 0.12
+    C⁻e  :: FT = 6.7
+    C⁺e  :: FT = 5.4
+    CRiʷ :: FT = 0.011
+    CRiᶜ :: FT = 0.76
 end
 
 #####
@@ -46,8 +54,9 @@ end
 @inline function buoyancy_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, e, tracers, buoyancy)
     FT = eltype(grid)
     N² = ∂z_b(i, j, k, grid, buoyancy, tracers)
+    #N² = ℑxyᶜᶜᵃ(i, j, k, grid, ℑxyᶠᶠᵃ, ∂z_b, buoyancy, tracers)
     N²⁺ = clip(N²)
-    w★ = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocity, closure, e)
+    w★ = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocityᶜᶜᶜ, closure, e)
     return ifelse(N²⁺ == 0, FT(Inf), w★ / sqrt(N²⁺))
 end
 
@@ -55,28 +64,42 @@ end
     FT = eltype(grid)
     N² = ℑzᵃᵃᶜ(i, j, k, grid, ∂z_b, buoyancy, tracers)
     N²⁺ = clip(N²)
-    w★ = turbulent_velocity(i, j, k, grid, closure, e)
+    w★ = turbulent_velocityᶜᶜᶜ(i, j, k, grid, closure, e)
     return ifelse(N²⁺ == 0, FT(Inf), w★ / sqrt(N²⁺))
 end
 
-@inline function stable_length_scaleᶜᶜᶠ(i, j, k, grid, closure, Cᵇ::Number, e, velocities, tracers, buoyancy)
-    ℓᵇ = Cᵇ * buoyancy_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, e, tracers, buoyancy)
-    d = wall_vertical_distanceᶜᶜᶠ(i, j, k, grid)
-    ℓᵇ = ifelse(isnan(ℓᵇ), d, ℓᵇ)
-    ℓ = min(d, ℓᵇ)
+@inline function stable_length_scaleᶜᶜᶠ(i, j, k, grid, closure, e, velocities, tracers, buoyancy)
+    Cᴺ = closure.mixing_length.Cᴺ
+    ℓᴺ = Cᴺ * buoyancy_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, e, tracers, buoyancy)
+
+    Cᵇ = closure.mixing_length.Cᵇ
+    d_up   = depthᶜᶜᶠ(i, j, k, grid)
+    d_down = Cᵇ * height_above_bottomᶜᶜᶠ(i, j, k, grid)
+    d = min(d_up, d_down)
+
+    ℓ = min(d, ℓᴺ)
+    ℓ = ifelse(isnan(ℓ), d, ℓ)
+
     return ℓ
 end
 
-@inline function stable_length_scaleᶜᶜᶜ(i, j, k, grid, closure, Cᵇ::Number, e, velocities, tracers, buoyancy)
-    ℓᵇ = Cᵇ * buoyancy_mixing_lengthᶜᶜᶜ(i, j, k, grid, closure, e, tracers, buoyancy)
-    d = wall_vertical_distanceᶜᶜᶜ(i, j, k, grid)
-    ℓᵇ = ifelse(isnan(ℓᵇ), d, ℓᵇ)
-    ℓ = min(d, ℓᵇ)
+@inline function stable_length_scaleᶜᶜᶜ(i, j, k, grid, closure, e, velocities, tracers, buoyancy)
+    Cᴺ = closure.mixing_length.Cᴺ
+    ℓᴺ = Cᴺ * buoyancy_mixing_lengthᶜᶜᶜ(i, j, k, grid, closure, e, tracers, buoyancy)
+
+    Cᵇ = closure.mixing_length.Cᵇ
+    d_up = depthᶜᶜᶜ(i, j, k, grid)
+    d_down = Cᵇ * height_above_bottomᶜᶜᶜ(i, j, k, grid)
+    d = min(d_up, d_down)
+
+    ℓ = min(d, ℓᴺ)
+    ℓ = ifelse(isnan(ℓ), d, ℓ)
+
     return ℓ
 end
 
-@inline three_halves_tke(i, j, k, grid, closure, e) = turbulent_velocity(i, j, k, grid, closure, e)^3
-@inline squared_tke(i, j, k, grid, closure, e) = turbulent_velocity(i, j, k, grid, closure, e)^2
+@inline three_halves_tke(i, j, k, grid, closure, e) = turbulent_velocityᶜᶜᶜ(i, j, k, grid, closure, e)^3
+@inline squared_tke(i, j, k, grid, closure, e) = turbulent_velocityᶜᶜᶜ(i, j, k, grid, closure, e)^2
 
 @inline function convective_length_scaleᶜᶜᶠ(i, j, k, grid, closure, Cᶜ::Number, Cᵉ::Number, Cˢᶜ::Number,
                                             velocities, tracers, buoyancy, clock, tracer_bcs)
@@ -85,15 +108,13 @@ end
 
     Qᵇᵋ      = closure.minimum_convective_buoyancy_flux
     Qᵇ       = top_buoyancy_flux(i, j, grid, buoyancy, tracer_bcs, clock, merge(velocities, tracers))
-    w★       = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocity, closure, tracers.e)
+    w★       = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocityᶜᶜᶜ, closure, tracers.e)
     w★²      = ℑzᵃᵃᶠ(i, j, k, grid, squared_tke, closure, tracers.e)
-    w★³      = turbulent_velocity(i, j, grid.Nz, grid, closure, tracers.e)^3
+    w★³      = ℑzᵃᵃᶠ(i, j, k, grid, three_halves_tke, closure, tracers.e)
     S²       = shearᶜᶜᶠ(i, j, k, grid, u, v)
     N²       = ∂z_b(i, j, k, grid, buoyancy, tracers)
     N²_above = ∂z_b(i, j, k+1, grid, buoyancy, tracers)
 
-    #w★³ = ℑzᵃᵃᶠ(i, j, k, grid, three_halves_tke, closure, tracers.e)
-
     # "Convective length"
     # ℓᶜ ∼ boundary layer depth according to Deardorff scaling
     ℓᶜ = Cᶜ * w★³ / (Qᵇ + Qᵇᵋ)
@@ -130,15 +151,13 @@ end
 
     Qᵇᵋ      = closure.minimum_convective_buoyancy_flux
     Qᵇ       = top_buoyancy_flux(i, j, grid, buoyancy, tracer_bcs, clock, merge(velocities, tracers))
-    w★       = turbulent_velocity(i, j, k, grid, closure, tracers.e)
-    w★²      = turbulent_velocity(i, j, k, grid, closure, tracers.e)^2
-    w★³      = turbulent_velocity(i, j, grid.Nz, grid, closure, tracers.e)^3
+    w★       = turbulent_velocityᶜᶜᶜ(i, j, k, grid, closure, tracers.e)
+    w★²      = turbulent_velocityᶜᶜᶜ(i, j, k, grid, closure, tracers.e)^2
+    w★³      = turbulent_velocityᶜᶜᶜ(i, j, grid.Nz, grid, closure, tracers.e)^3
     S²       = shearᶜᶜᶜ(i, j, k, grid, u, v)
     N²       = ℑzᵃᵃᶜ(i, j, k, grid, ∂z_b, buoyancy, tracers)
     N²_above = ℑzᵃᵃᶜ(i, j, k+1, grid, ∂z_b, buoyancy, tracers)
 
-    #w★³ = ℑzᵃᵃᶠ(i, j, k, grid, three_halves_tke, closure, tracers.e)
-
     # "Convective length"
     # ℓᶜ ∼ boundary layer depth according to Deardorff scaling
     ℓᶜ = Cᶜ * w★³ / (Qᵇ + Qᵇᵋ)
@@ -184,8 +203,7 @@ end
     C⁺ = closure.mixing_length.C⁺u
     σ = stability_functionᶜᶜᶠ(i, j, k, grid, closure, C⁻, C⁺, velocities, tracers, buoyancy)
 
-    Cᵇ = closure.mixing_length.Cᵇ
-    ℓ★ = σ * stable_length_scaleᶜᶜᶠ(i, j, k, grid, closure, Cᵇ, tracers.e, velocities, tracers, buoyancy)
+    ℓ★ = σ * stable_length_scaleᶜᶜᶠ(i, j, k, grid, closure, tracers.e, velocities, tracers, buoyancy)
 
     ℓ★ = ifelse(isnan(ℓ★), zero(grid), ℓ★)
 
@@ -202,9 +220,7 @@ end
     C⁻ = closure.mixing_length.C⁻c
     C⁺ = closure.mixing_length.C⁺c
     σ = stability_functionᶜᶜᶠ(i, j, k, grid, closure, C⁻, C⁺, velocities, tracers, buoyancy)
-
-    Cᵇ = closure.mixing_length.Cᵇ
-    ℓ★ = σ * stable_length_scaleᶜᶜᶠ(i, j, k, grid, closure, Cᵇ, tracers.e, velocities, tracers, buoyancy)
+    ℓ★ = σ * stable_length_scaleᶜᶜᶠ(i, j, k, grid, closure, tracers.e, velocities, tracers, buoyancy)
 
     ℓʰ = ifelse(isnan(ℓʰ), zero(grid), ℓʰ)
     ℓ★ = ifelse(isnan(ℓ★), zero(grid), ℓ★)
@@ -222,9 +238,7 @@ end
     C⁻ = closure.mixing_length.C⁻e
     C⁺ = closure.mixing_length.C⁺e
     σ = stability_functionᶜᶜᶠ(i, j, k, grid, closure, C⁻, C⁺, velocities, tracers, buoyancy)
-
-    Cᵇ = closure.mixing_length.Cᵇ
-    ℓ★ = σ * stable_length_scaleᶜᶜᶠ(i, j, k, grid, closure, Cᵇ, tracers.e, velocities, tracers, buoyancy)
+    ℓ★ = σ * stable_length_scaleᶜᶜᶠ(i, j, k, grid, closure, tracers.e, velocities, tracers, buoyancy)
 
     ℓʰ = ifelse(isnan(ℓʰ), zero(grid), ℓʰ)
     ℓ★ = ifelse(isnan(ℓ★), zero(grid), ℓ★)
@@ -237,7 +251,7 @@ Base.summary(::MixingLength) = "CATKEVerticalDiffusivities.MixingLength"
 
 Base.show(io::IO, ml::MixingLength) =
     print(io, "CATKEVerticalDiffusivities.MixingLength parameters:", '\n',
-              "    Cᵇ   = $(ml.Cᵇ)",   '\n',
+              "    Cᴺ   = $(ml.Cᴺ)",   '\n',
               "    Cᶜc  = $(ml.Cᶜc)",  '\n',
               "    Cᶜe  = $(ml.Cᶜe)",  '\n',
               "    Cᵉc  = $(ml.Cᵉc)",  '\n',
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
index 79bbd8e216..f997e4c817 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
@@ -5,12 +5,13 @@ Parameters for the evolution of oceanic turbulent kinetic energy at the O(1 m) s
 isotropic turbulence and diapycnal mixing.
 """
 Base.@kwdef struct TurbulentKineticEnergyEquation{FT}
-    C⁻D   :: FT = 1.0
-    C⁺D   :: FT = 1.0
-    CᶜD   :: FT = 0.0
+    C⁻D   :: FT = 4.4
+    C⁺D   :: FT = 3.3
+    CᶜD   :: FT = 0.23
     CᵉD   :: FT = 0.0
-    Cᵂu★  :: FT = 1.0
-    CᵂwΔ  :: FT = 1.0
+    Cᵂu★  :: FT = 1.8
+    CᵂwΔ  :: FT = 12.0
+    Cᵂϵ   :: FT = 20.0
 end
 
 #####
@@ -76,9 +77,8 @@ end
     Ri = Riᶜᶜᶜ(i, j, k, grid, velocities, tracers, buoyancy)
     σ = scale(Ri, C⁻D, C⁺D, Riᶜ, Riʷ)
 
-    Cᵇ = closure.mixing_length.Cᵇ
-    #ℓ★ = σ * stable_length_scaleᶜᶜᶜ(i, j, k, grid, closure, Cᵇ, tracers.e, velocities, tracers, buoyancy)
-    ℓ★ = σ * ℑzᵃᵃᶜ(i, j, k, grid, stable_length_scaleᶜᶜᶠ, closure, Cᵇ, tracers.e, velocities, tracers, buoyancy)
+    #ℓ★ = σ * stable_length_scaleᶜᶜᶜ(i, j, k, grid, closure, tracers.e, velocities, tracers, buoyancy)
+    ℓ★ = σ * ℑzᵃᵃᶜ(i, j, k, grid, stable_length_scaleᶜᶜᶠ, closure, tracers.e, velocities, tracers, buoyancy)
 
     ℓʰ = ifelse(isnan(ℓʰ), zero(grid), ℓʰ)
     ℓ★ = ifelse(isnan(ℓ★), zero(grid), ℓ★)
@@ -194,7 +194,7 @@ end
                                tke::TurbulentKineticEnergyEquation, closure::CATKEVD,
                                buoyancy, top_tracer_bcs, top_velocity_bcs)
 
-    wΔ³ = top_convective_turbulent_velocity³(i, j, grid, clock, fields, buoyancy, top_tracer_bcs)
+    wΔ³ = top_convective_turbulent_velocity_cubed(i, j, grid, clock, fields, buoyancy, top_tracer_bcs)
     u★ = friction_velocity(i, j, grid, clock, fields, top_velocity_bcs)
 
     Cᵂu★ = tke.Cᵂu★
@@ -212,7 +212,7 @@ end
 end
 
 """ Computes the convective velocity w★. """
-@inline function top_convective_turbulent_velocity³(i, j, grid, clock, fields, buoyancy, tracer_bcs)
+@inline function top_convective_turbulent_velocity_cubed(i, j, grid, clock, fields, buoyancy, tracer_bcs)
     Qᵇ = top_buoyancy_flux(i, j, grid, buoyancy, tracer_bcs, clock, fields)
     Δz = Δzᶜᶜᶜ(i, j, grid.Nz, grid)
     return clip(Qᵇ) * Δz   

From eb4af279904b853b629d5563a06d392f545e3e1d Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Thu, 13 Apr 2023 18:28:59 -0800
Subject: [PATCH 183/530] Improve ri-based for immersed boundaries

---
 .../ri_based_vertical_diffusivity.jl          | 30 +++++++++++++++++--
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index d15b59e795..083ec50445 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -1,6 +1,7 @@
 using Oceananigans.Architectures: architecture, device_event, arch_array
 using Oceananigans.BuoyancyModels: ∂z_b
 using Oceananigans.Operators
+using Oceananigans.Grids: inactive_node
 using Oceananigans.Operators: ℑzᵃᵃᶜ
 
 struct RiBasedVerticalDiffusivity{TD, FT, R} <: AbstractScalarDiffusivity{TD, VerticalFormulation}
@@ -160,7 +161,13 @@ const Tanh   = HyperbolicTangentRiDependentTapering
 
 @inline ϕ²(i, j, k, grid, ϕ, args...) = ϕ(i, j, k, grid, args...)^2
 
-@inline function Riᶜᶜᶠ(i, j, k, grid, velocities, tracers, buoyancy)
+@inline function shear_squaredᶜᶜᶠ(i, j, k, grid, velocities)
+    ∂z_u² = ℑxᶜᵃᵃ(i, j, k, grid, ϕ², ∂zᶠᶜᶠ, velocities.u)
+    ∂z_v² = ℑyᵃᶜᵃ(i, j, k, grid, ϕ², ∂zᶜᶠᶠ, velocities.v)
+    return ∂z_u² + ∂z_v²
+end
+
+@inline function Riᶜᶜᶠ(i, j, k, grid, velocities, buoyancy, tracers)
     ∂z_u² = ℑxᶜᵃᵃ(i, j, k, grid, ϕ², ∂zᶠᶜᶠ, velocities.u)
     ∂z_v² = ℑyᵃᶜᵃ(i, j, k, grid, ϕ², ∂zᶜᶠᶠ, velocities.v)
     S² = ∂z_u² + ∂z_v²
@@ -171,6 +178,9 @@ const Tanh   = HyperbolicTangentRiDependentTapering
     return ifelse(N² <= 0, zero(grid), Ri)
 end
 
+const c = Center()
+const f = Face()
+
 @kernel function compute_ri_based_diffusivities!(diffusivities, grid, closure::FlavorOfRBVD,
                                                  velocities, tracers, buoyancy, tracer_bcs, clock)
 
@@ -203,11 +213,19 @@ end
     κᵉⁿ = ifelse(entraining, Cᵉⁿ, zero(grid))
 
     # Shear mixing diffusivity and viscosity
-    Ri = Riᶜᶜᶠ(i, j, k, grid, velocities, tracers, buoyancy)
+    Ri = Riᶜᶜᶠ(i, j, k, grid, velocities, buoyancy, tracers)
+    τ = taper(tapering, Ri, Ri₀, Riᵟ)
+    κᵘ★ = ν₀ * τ
 
+    # Tracer diffusivity: average more than necessary to eliminate noise
+    #∂z_u² = ℑyᵃᶜᵃ(i, j, k, grid, ℑxyᶜᶠᵃ, ϕ², ∂zᶠᶜᶠ, velocities.u)
+    #∂z_v² = ℑxᶜᵃᵃ(i, j, k, grid, ℑxyᶜᶠᵃ, ϕ², ∂zᶜᶠᶠ, velocities.v)
+    #S² = ∂z_u² + ∂z_v²
+    #N² = ∂z_b(i, j, k, grid, buoyancy, tracers)
+    #Ri = ifelse(N² <= 0, zero(grid), N² / S²)
+    Ri = ℑxyᶜᶜᵃ(i, j, k, grid, ℑxyᶠᶠᵃ, Riᶜᶜᶠ, velocities, buoyancy, tracers)
     τ = taper(tapering, Ri, Ri₀, Riᵟ)
     κᶜ★ = κ₀ * τ
-    κᵘ★ = ν₀ * τ
 
     # Previous diffusivities
     κᶜ = diffusivities.κᶜ
@@ -217,6 +235,12 @@ end
     κᶜ⁺ = κᶜᵃ + κᵉⁿ + κᶜ★
     κᵘ⁺ = κᵘ★
 
+    # Set to zero on periphery and NaN within inactive region
+    on_periphery = peripheral_node(i, j, k, grid, c, c, f)
+    within_inactive = inactive_node(i, j, k, grid, c, c, f)
+    κᶜ⁺ = ifelse(on_periphery, zero(grid), ifelse(within_inactive, NaN, κᶜ⁺))
+    κᵘ⁺ = ifelse(on_periphery, zero(grid), ifelse(within_inactive, NaN, κᵘ⁺))
+
     # Update by averaging in time
     @inbounds κᶜ[i, j, k] = (Cᵃᵛ * κᶜ[i, j, k] + κᶜ⁺) / (1 + Cᵃᵛ)
     @inbounds κᵘ[i, j, k] = (Cᵃᵛ * κᵘ[i, j, k] + κᵘ⁺) / (1 + Cᵃᵛ)

From eeacb00129786fed0487238488d00a269fc0d643 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Thu, 13 Apr 2023 18:29:20 -0800
Subject: [PATCH 184/530] Cosmetics for vertically implicit diffusion

---
 .../vertically_implicit_diffusion_solver.jl   | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl b/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
index d4f7b53c55..ac0fe537d9 100644
--- a/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
+++ b/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
@@ -19,7 +19,7 @@ import Oceananigans.Solvers: get_coefficient
 
 # Fallbacks: extend these function for `closure` to support.
 # TODO: docstring
-@inline implicit_linear_coefficient(i, j, k, grid, closure, diffusivity_fields, tracer_index, LX, LY, LZ, clock, Δt, κz) =
+@inline implicit_linear_coefficient(i, j, k, grid, closure, diffusivity_fields, tracer_index, ℓx, ℓy, ℓz, clock, Δt, κz) =
     zero(grid)
 
 @inline νzᶠᶜᶠ(i, j, k, grid, closure, diffusivity_fields, clock, args...) = zero(grid) # u
@@ -43,7 +43,7 @@ implicit_diffusion_solver(::ExplicitTimeDiscretization, args...; kwargs...) = no
 
 # Tracers and horizontal velocities at cell centers in z
 
-@inline function ivd_upper_diagonal(i, j, k, grid, closure, K, id, LX, LY, ::Center, clock, Δt, κz)
+@inline function ivd_upper_diagonal(i, j, k, grid, closure, K, id, ℓx, ℓy, ::Center, clock, Δt, κz)
     closure_ij = getclosure(i, j, closure)  
     κᵏ⁺¹ = κz(i, j, k+1, grid, closure_ij, K, id, clock)
 
@@ -52,7 +52,7 @@ implicit_diffusion_solver(::ExplicitTimeDiscretization, args...; kwargs...) = no
                   - Δt * κ_Δz²(i, j, k, k+1, grid, κᵏ⁺¹))
 end
 
-@inline function ivd_lower_diagonal(i, j, k, grid, closure, K, id, LX, LY, ::Center, clock, Δt, κz)
+@inline function ivd_lower_diagonal(i, j, k, grid, closure, K, id, ℓx, ℓy, ::Center, clock, Δt, κz)
     k′ = k + 1 # Shift to adjust for Tridiagonal indexing convenction
     closure_ij = getclosure(i, j, closure)  
     κᵏ = κz(i, j, k′, grid, closure_ij, K, id, clock)
@@ -66,7 +66,7 @@ end
 #
 # Note: these coefficients are specific to vertically-bounded grids (and so is
 # the BatchedTridiagonalSolver).
-@inline function ivd_upper_diagonal(i, j, k, grid, closure, K, id, LX, LY, ::Face, clock, Δt, νzᶜᶜᶜ) 
+@inline function ivd_upper_diagonal(i, j, k, grid, closure, K, id, ℓx, ℓy, ::Face, clock, Δt, νzᶜᶜᶜ) 
     closure_ij = getclosure(i, j, closure)  
     νᵏ = νzᶜᶜᶜ(i, j, k, grid, closure_ij, K, clock)
 
@@ -75,7 +75,7 @@ end
                   - Δt * κ_Δz²(i, j, k, k, grid, νᵏ))
 end
 
-@inline function ivd_lower_diagonal(i, j, k, grid, closure, K, id, LX, LY, ::Face, clock, Δt, νzᶜᶜᶜ)
+@inline function ivd_lower_diagonal(i, j, k, grid, closure, K, id, ℓx, ℓy, ::Face, clock, Δt, νzᶜᶜᶜ)
     k′ = k + 1 # Shift to adjust for Tridiagonal indexing convenction
     closure_ij = getclosure(i, j, closure)  
     νᵏ⁻¹ = νzᶜᶜᶜ(i, j, k′-1, grid, closure_ij, K, clock)
@@ -86,11 +86,11 @@ end
 
 ### Diagonal terms
 
-@inline ivd_diagonal(i, j, k, grid, closure, K, id, LX, LY, LZ, clock, Δt, κz) =
-    one(eltype(grid)) -
-        Δt * _implicit_linear_coefficient(i, j, k,   grid, closure, K, id, LX, LY, LZ, clock, Δt, κz) -
-                      _ivd_upper_diagonal(i, j, k,   grid, closure, K, id, LX, LY, LZ, clock, Δt, κz) -
-                      _ivd_lower_diagonal(i, j, k-1, grid, closure, K, id, LX, LY, LZ, clock, Δt, κz)
+@inline ivd_diagonal(i, j, k, grid, closure, K, id, ℓx, ℓy, ℓz, clock, Δt, κz) =
+    one(grid) -
+        Δt * _implicit_linear_coefficient(i, j, k,   grid, closure, K, id, ℓx, ℓy, ℓz, clock, Δt, κz) -
+                      _ivd_upper_diagonal(i, j, k,   grid, closure, K, id, ℓx, ℓy, ℓz, clock, Δt, κz) -
+                      _ivd_lower_diagonal(i, j, k-1, grid, closure, K, id, ℓx, ℓy, ℓz, clock, Δt, κz)
 
 @inline _implicit_linear_coefficient(args...) = implicit_linear_coefficient(args...)
 @inline _ivd_upper_diagonal(args...) = ivd_upper_diagonal(args...)

From 4854b7ed30980eff8a8e947f1acad7f81e089413 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Thu, 13 Apr 2023 22:16:17 -0800
Subject: [PATCH 185/530] Fix show

---
 .../CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl     | 1 +
 .../CATKEVerticalDiffusivities/mixing_length.jl                  | 1 +
 2 files changed, 2 insertions(+)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
index 77d3983b09..0556447f06 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
@@ -328,6 +328,7 @@ function Base.show(io::IO, closure::FlavorOfCATKE)
               "├── negative_turbulent_kinetic_energy_damping_time_scale: ", prettysummary(closure.negative_turbulent_kinetic_energy_damping_time_scale), '\n',
               "├── minimum_convective_buoyancy_flux: ", prettysummary(closure.minimum_convective_buoyancy_flux), '\n',
               "├── mixing_length: ", prettysummary(closure.mixing_length), '\n',
+              "│   ├── Cᴺ:   ", prettysummary(closure.mixing_length.Cᴺ), '\n',
               "│   ├── Cᵇ:   ", prettysummary(closure.mixing_length.Cᵇ), '\n',
               "│   ├── Cᶜc:  ", prettysummary(closure.mixing_length.Cᶜc), '\n',
               "│   ├── Cᶜe:  ", prettysummary(closure.mixing_length.Cᶜe), '\n',
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
index 18609d9fb0..33742611b8 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
@@ -252,6 +252,7 @@ Base.summary(::MixingLength) = "CATKEVerticalDiffusivities.MixingLength"
 Base.show(io::IO, ml::MixingLength) =
     print(io, "CATKEVerticalDiffusivities.MixingLength parameters:", '\n',
               "    Cᴺ   = $(ml.Cᴺ)",   '\n',
+              "    Cᵇ   = $(ml.Cᵇ)",   '\n',
               "    Cᶜc  = $(ml.Cᶜc)",  '\n',
               "    Cᶜe  = $(ml.Cᶜe)",  '\n',
               "    Cᵉc  = $(ml.Cᵉc)",  '\n',

From 47d089186721267f9c3cd4681eb968427b020843 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Thu, 13 Apr 2023 22:16:24 -0800
Subject: [PATCH 186/530] Update to heterogeneous windy convection

---
 .../heterogeneous_windy_convection.jl         | 81 ++++++++++++-------
 1 file changed, 52 insertions(+), 29 deletions(-)

diff --git a/validation/vertical_mixing_closures/heterogeneous_windy_convection.jl b/validation/vertical_mixing_closures/heterogeneous_windy_convection.jl
index 3448f08345..140f9a26e1 100644
--- a/validation/vertical_mixing_closures/heterogeneous_windy_convection.jl
+++ b/validation/vertical_mixing_closures/heterogeneous_windy_convection.jl
@@ -7,8 +7,10 @@ using Oceananigans.Units
 using Oceananigans.TurbulenceClosures: CATKEVerticalDiffusivity
 using Oceananigans.ImmersedBoundaries: GridFittedBottom, PartialCellBottom
 
+import Oceananigans.TurbulenceClosures.CATKEVerticalDiffusivities
+
 Nx = 1
-Ny = 100
+Ny = 200
 
 const Lx = 1000kilometers
 const Ly = Lx
@@ -45,17 +47,20 @@ u_top_bc = FluxBoundaryCondition(Qᵘ)
 b_bcs = FieldBoundaryConditions(top=b_top_bc)
 u_bcs = FieldBoundaryConditions(top=u_top_bc)
 
-vertical_mixing = CATKEVerticalDiffusivity()
+vertical_mixing = CATKEVerticalDiffusivity(; minimum_turbulent_kinetic_energy=1e-9)
 #vertical_mixing = RiBasedVerticalDiffusivity()
 
+@show vertical_mixing
+
 Δy = Ly / Ny
-ν₄ = Δy^4 / 70minutes
+ν₄ = Δy^4 / 1hours
 hyperviscosity = HorizontalScalarBiharmonicDiffusivity(ν=ν₄)
 
 #closure = vertical_mixing
 closure = (vertical_mixing, hyperviscosity)
 
 filename = "heterogeneous_cooling_with_hyperviscosity.jld2"
+#filename = "heterogeneous_cooling.jld2"
 
 model = HydrostaticFreeSurfaceModel(; grid, closure,
                                     momentum_advection = WENO(),
@@ -65,11 +70,11 @@ model = HydrostaticFreeSurfaceModel(; grid, closure,
                                     boundary_conditions = (; b=b_bcs, u=u_bcs),
                                     buoyancy = BuoyancyTracer())
 
-N² = 1e-5
-bᵢ(x, y, z) = N² * z
-set!(model, b=bᵢ, e=1e-6)
+N²ᵢ = 1e-5
+bᵢ(x, y, z) = N²ᵢ * z
+set!(model, b=bᵢ, e=1e-9)
 
-simulation = Simulation(model, Δt=5minute, stop_time=2days)
+simulation = Simulation(model, Δt=5minute, stop_time=10days)
 
 κᶜ = if model.closure isa Tuple
     model.diffusivity_fields[1].κᶜ
@@ -77,7 +82,9 @@ else
     model.diffusivity_fields.κᶜ
 end
 
-outputs = (; model.velocities..., model.tracers..., κᶜ=κᶜ)
+b = model.tracers.b
+N² = ∂z(b)
+outputs = (; model.velocities..., model.tracers..., κᶜ=κᶜ, N²=N²)
 
 simulation.output_writers[:fields] = JLD2OutputWriter(model, outputs;
                                                       filename,
@@ -93,8 +100,8 @@ function progress(sim)
                    iteration(sim), prettytime(sim),
                    maximum(abs, u), maximum(abs, v), maximum(abs, w))
 
-    msg *= @sprintf(", extrema(e): (%6.2e, %6.2e) m² s⁻²", minimum(e), maximum(e))
-    msg *= @sprintf(", extrema(κᶜ): (%6.2e, %6.2e) m² s⁻²", minimum(κᶜ), maximum(κᶜ))
+    msg *= @sprintf(", max(e): %6.2e m² s⁻²", maximum(e))
+    msg *= @sprintf(", max(κᶜ): %6.2e m² s⁻¹", maximum(κᶜ))
 
     @info msg
     
@@ -110,10 +117,11 @@ e_ts = FieldTimeSeries(filename, "e")
 u_ts = FieldTimeSeries(filename, "u")
 v_ts = FieldTimeSeries(filename, "v")
 w_ts = FieldTimeSeries(filename, "w")
-#κ_ts = FieldTimeSeries(filename, "κᶜ")
+κ_ts = FieldTimeSeries(filename, "κᶜ")
+N_ts = FieldTimeSeries(filename, "N²")
 Nt = length(b_ts.times)
 
-for ψ in (b_ts, e_ts, u_ts, v_ts, w_ts)
+for ψ in (b_ts, e_ts, u_ts, v_ts, w_ts, κ_ts) #, N_ts)
     ψp = parent(ψ)
     ψp[ψp .== 0] .= NaN
 end
@@ -121,16 +129,17 @@ end
 fig = Figure(resolution=(1600, 800))
 
 ax_uyz = Axis(fig[1, 1], title="u(y, z) - <u(y, z)>")
-ax_vyz = Axis(fig[1, 2], title="v(y, z)")
-ax_wyz = Axis(fig[1, 3], title="w(y, z)")
+#ax_vyz = Axis(fig[1, 2], title="v(y, z)")
+ax_wyz = Axis(fig[1, 2], title="w(y, z)")
+ax_Nyz = Axis(fig[1, 3], title="N²(y, z)")
 ax_eyz = Axis(fig[1, 4], title="e(y, z)")
-#ax_κyz = Axis(fig[1, 4], title="κ(y, z)")
+ax_κyz = Axis(fig[1, 5], title="κ(y, z)")
 
-ax_bz = Axis(fig[2, 1], title="b(z)", xlabel="y")
+ax_bz = Axis(fig[2, 1], title="b(z)", xlabel="z")
 ax_uz = Axis(fig[2, 2], title="u(z)", ylabel="z")
 ax_vz = Axis(fig[2, 3], title="v(z)", ylabel="z")
 ax_ez = Axis(fig[2, 4], title="e(z)", ylabel="z")
-#ax_κz = Axis(fig[2, 4], title="κ(z)", ylabel="z")
+ax_κz = Axis(fig[2, 5], title="κ(z)", ylabel="z")
 
 slider = Slider(fig[3, :], range=1:Nt, startvalue=1)
 n = slider.value
@@ -149,7 +158,8 @@ end
 v_yz = @lift interior(v_ts[$n], 1, :, :)
 w_yz = @lift interior(w_ts[$n], 1, :, :)
 w_yz = @lift interior(w_ts[$n], 1, :, :)
-#κ_yz = @lift interior(κ_ts[$n], 1, :, :)
+N_yz = @lift interior(N_ts[$n], 1, :, :)
+κ_yz = @lift interior(κ_ts[$n], 1, :, :)
 
 Nx, Ny, Nz = size(grid)
 
@@ -161,9 +171,9 @@ e_z1 = @lift interior(e_ts[$n], 1, 16, :)
 e_z2 = @lift interior(e_ts[$n], 1, 32, :)
 e_z3 = @lift interior(e_ts[$n], 1, 8, :)
 
-# κ_z1 = @lift interior(κ_ts[$n], 1, 16, :)
-# κ_z2 = @lift interior(κ_ts[$n], 1, 32, :)
-# κ_z3 = @lift interior(κ_ts[$n], 1, 8, :)
+κ_z1 = @lift interior(κ_ts[$n], 1, 16, :)
+κ_z2 = @lift interior(κ_ts[$n], 1, 32, :)
+κ_z3 = @lift interior(κ_ts[$n], 1, 8, :)
 
 u_z1 = @lift interior(u_ts[$n], 1, 16, :)
 u_z2 = @lift interior(u_ts[$n], 1, 32, :)
@@ -174,29 +184,32 @@ v_z2 = @lift interior(v_ts[$n], 1, 32, :)
 v_z3 = @lift interior(v_ts[$n], 1, 8, :)
 
 x, y, z = nodes(b_ts)
-#xκ, yκ, zκ = nodes(κ_ts)
+xκ, yκ, zκ = nodes(κ_ts)
 
-elim = 6e-4
+elim = 1e-4
 ulim = 0.2
 vlim = 2e-2
-wlim = 2e-4
-κlim = 1e1
+wlim = 1e-5
+κlim = 1e-3 # 1e1
 
 heatmap!(ax_eyz, y, z, e_yz, colormap=:solar, colorrange=(0, elim), nan_color=:gray)
 contour!(ax_eyz, y, z, b_yz, levels=15, color=:black)
 
-#heatmap!(ax_κyz, y, zκ κ_yz, colormap=:thermal, colorrange=(0, κlim), nan_color=:gray)
-#contour!(ax_κyz, y, z, b_yz, levels=15, color=:black)
+heatmap!(ax_κyz, y, zκ, κ_yz, colormap=:thermal, colorrange=(0, κlim), nan_color=:gray)
+contour!(ax_κyz, y, z, b_yz, levels=15, color=:black)
 
 heatmap!(ax_uyz, y, z, u_yz, colormap=:balance, colorrange=(-ulim, ulim), nan_color=:gray)
 contour!(ax_uyz, y, z, b_yz, levels=15, color=:black)
 
-heatmap!(ax_vyz, y, z, v_yz, colormap=:balance, colorrange=(-vlim, vlim), nan_color=:gray)
-contour!(ax_vyz, y, z, b_yz, levels=15, color=:black)
+# heatmap!(ax_vyz, y, z, v_yz, colormap=:balance, colorrange=(-vlim, vlim), nan_color=:gray)
+# contour!(ax_vyz, y, z, b_yz, levels=15, color=:black)
 
 heatmap!(ax_wyz, y, z, w_yz, colormap=:balance, colorrange=(-wlim, wlim), nan_color=:gray)
 contour!(ax_wyz, y, z, b_yz, levels=15, color=:black)
 
+heatmap!(ax_Nyz, y, z, N_yz, colormap=:thermal, colorrange=(1e-6, 1e-5), nan_color=:gray)
+contour!(ax_Nyz, y, z, b_yz, levels=15, color=:black)
+
 lines!(ax_bz, b_z1, z)
 lines!(ax_bz, b_z2, z)
 lines!(ax_bz, b_z3, z)
@@ -205,6 +218,10 @@ lines!(ax_ez, e_z1, z)
 lines!(ax_ez, e_z2, z)
 lines!(ax_ez, e_z3, z)
 
+lines!(ax_κz, κ_z1, zκ)
+lines!(ax_κz, κ_z2, zκ)
+lines!(ax_κz, κ_z3, zκ)
+
 lines!(ax_uz, u_z1, z)
 lines!(ax_uz, u_z2, z)
 lines!(ax_uz, u_z3, z)
@@ -216,14 +233,20 @@ lines!(ax_vz, v_z3, z)
 xlims!(ax_ez, -elim/10, 2elim)
 xlims!(ax_uz, -2ulim, 2ulim)
 xlims!(ax_vz, -2vlim, 2vlim)
+xlims!(ax_κz, -κlim/10, 2κlim)
+
 ylims!(ax_bz, -1020, 20)
 ylims!(ax_uz, -1020, 20)
 ylims!(ax_vz, -1020, 20)
 ylims!(ax_ez, -1020, 20)
+ylims!(ax_κz, -1020, 20)
 
 display(fig)
 
+#=
 record(fig, filename[1:end-5] * ".mp4", 1:Nt, framerate=24) do nn
     @info "Plotting frame $nn of $Nt..."
     n[] = nn
 end
+=#
+

From 6437385eb71ffc5a28c02a1f7a55d88d2875d523 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 14 Apr 2023 08:15:09 -0400
Subject: [PATCH 187/530] Fix with_halo for distributed immersed boundary grids

---
 src/Distributed/distributed_grids.jl          |  8 +++----
 .../grid_fitted_immersed_boundaries.jl        | 23 -------------------
 2 files changed, 3 insertions(+), 28 deletions(-)

diff --git a/src/Distributed/distributed_grids.jl b/src/Distributed/distributed_grids.jl
index 1bd1046da4..cd34b96252 100644
--- a/src/Distributed/distributed_grids.jl
+++ b/src/Distributed/distributed_grids.jl
@@ -262,13 +262,11 @@ function with_halo(new_halo, grid::DistributedLatitudeLongitudeGrid)
 end
 
 function with_halo(new_halo, grid::DistributedImmersedBoundaryGrid)
-    global_immmersed_grid = reconstruct_global_grid(grid)
-    immersed_boundary     = global_immmersed_grid.immersed_boundary
-    underlying_grid       = global_immmersed_grid.underlying_grid
+    immersed_boundary     = grid.immersed_boundary
+    underlying_grid       = grid.underlying_grid
     new_underlying_grid   = with_halo(new_halo, underlying_grid)
     new_immersed_boundary = resize_immersed_boundary(immersed_boundary, new_underlying_grid)
-    new_grid              = ImmersedBoundaryGrid(new_underlying_grid, new_immersed_boundary)
-    return scatter_local_grids(architecture(grid), new_grid, size(grid))
+    return ImmersedBoundaryGrid(new_underlying_grid, new_immersed_boundary)
 end
 
 """
diff --git a/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl b/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl
index 818cebc0d6..b490f788d3 100644
--- a/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl
+++ b/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl
@@ -79,8 +79,6 @@ function ImmersedBoundaryGrid(grid, ib::AbstractGridFittedBottom)
     return ImmersedBoundaryGrid(grid, new_ib)
 end
 
-ImmersedBoundaryGrid(grid, ib::AbstractGridFittedBottom{<:Function}) = ImmersedBoundaryGrid{topology(grid)...}(grid, ib, nothing)
-
 function ImmersedBoundaryGrid(grid, ib::AbstractGridFittedBottom{<:OffsetArray})
     TX, TY, TZ = topology(grid)
     validate_ib_size(grid, ib)
@@ -99,22 +97,6 @@ function validate_ib_size(grid, ib)
         throw(ArgumentError("The dimensions of the immersed boundary $(size(ib.bottom_height)) do not match the grid size $(bottom_height_size)"))
 end
 
-@inline function _immersed_cell(i, j, k, underlying_grid, ib::GridFittedBottom{<:Function, <:InterfaceImmersedCondition})
-    x = xnode(c, c, f, i, j, k+1, underlying_grid)
-    y = ynode(c, c, f, i, j, k+1, underlying_grid)
-    z = znode(c, c, f, i, j, k+1, underlying_grid)
-    h = @inbounds ib.bottom_height(x, y)
-    return z <= h
-end
-
-@inline function _immersed_cell(i, j, k, underlying_grid, ib::GridFittedBottom{<:Function, <:CenterImmersedCondition})
-    x = xnode(c, c, c, i, j, k, underlying_grid)
-    y = ynode(c, c, c, i, j, k, underlying_grid)
-    z = znode(c, c, c, i, j, k, underlying_grid)
-    h = @inbounds ib.bottom_height(x, y)
-    return z <= h
-end
-
 @inline function _immersed_cell(i, j, k, underlying_grid, ib::GridFittedBottom{<:Any, <:InterfaceImmersedCondition})
     z = znode(c, c, f, i, j, k+1, underlying_grid)
     h = @inbounds ib.bottom_height[i, j]
@@ -128,11 +110,6 @@ end
 end
 
 @inline bottom(i, j, k, ibg::GFIBG) = @inbounds ibg.immersed_boundary.bottom_height[i, j]
-@inline function bottom(i, j, k, ibg::GridFittedBottom{<:Function}) 
-    x = xnode(c, c, c, i, j, k, ibg.underlying_grid)
-    y = ynode(c, c, c, i, j, k, ibg.underlying_grid)
-    return ibg.immersed_boundary.bottom_height(i, j)
-end
 
 on_architecture(arch, ib::GridFittedBottom) = GridFittedBottom(arch_array(arch, ib.bottom_height))
 Adapt.adapt_structure(to, ib::GridFittedBottom) = GridFittedBottom(adapt(to, ib.bottom_height))     

From dd1412bf4a6bb35cb17d6dc7884cff1d4e7615fe Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 14 Apr 2023 09:17:37 -0400
Subject: [PATCH 188/530] ensure no 0-size kernels

---
 src/Utils/kernel_launching.jl | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index b40b1328ec..6e83b3d47a 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -85,10 +85,16 @@ function launch!(arch, grid, workspec, kernel!, kernel_args...;
                                       reduced_dimensions,
                                       location)
     
+
+
     if !isnothing(only_active_cells)
         workgroup, worksize = active_cells_work_layout(worksize, only_active_cells, grid) 
     end
 
+    if worksize == 0
+        return nothing
+    end
+    
     loop! = kernel!(Architectures.device(arch), workgroup, worksize)
 
     @debug "Launching kernel $kernel! with worksize $worksize"

From 26697223cfbe3fdd9e7b987ec84eca53b8a467bd Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Mon, 17 Apr 2023 16:00:20 -0800
Subject: [PATCH 189/530] Maybe fix immersed boundaries with Field
 bottom-height

---
 src/ImmersedBoundaries/grid_fitted_bottom.jl  | 6 +++---
 src/ImmersedBoundaries/partial_cell_bottom.jl | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/ImmersedBoundaries/grid_fitted_bottom.jl b/src/ImmersedBoundaries/grid_fitted_bottom.jl
index 47ba897b6e..2b7fef2bde 100644
--- a/src/ImmersedBoundaries/grid_fitted_bottom.jl
+++ b/src/ImmersedBoundaries/grid_fitted_bottom.jl
@@ -80,17 +80,17 @@ end
 
 @inline function _immersed_cell(i, j, k, underlying_grid, ib::GridFittedBottom{<:Any, <:InterfaceImmersedCondition})
     z = znode(i, j, k+1, underlying_grid, c, c, f)
-    h = @inbounds ib.bottom_height[i, j]
+    h = @inbounds ib.bottom_height[i, j, 1]
     return z <= h
 end
 
 @inline function _immersed_cell(i, j, k, underlying_grid, ib::GridFittedBottom{<:Any, <:CenterImmersedCondition})
     z = znode(i, j, k, underlying_grid, c, c, c)
-    h = @inbounds ib.bottom_height[i, j]
+    h = @inbounds ib.bottom_height[i, j, 1]
     return z <= h
 end
 
-@inline z_bottom(i, j, ibg::GFBIBG) = @inbounds ibg.immersed_boundary.bottom_height[i, j]
+@inline z_bottom(i, j, ibg::GFBIBG) = @inbounds ibg.immersed_boundary.bottom_height[i, j, 1]
 on_architecture(arch, ib::GridFittedBottom) = GridFittedBottom(ib.bottom_height, ib.immersed_condition)
 
 function on_architecture(arch, ib::GridFittedBottom{<:Field})
diff --git a/src/ImmersedBoundaries/partial_cell_bottom.jl b/src/ImmersedBoundaries/partial_cell_bottom.jl
index 832012a82b..b84454cf36 100644
--- a/src/ImmersedBoundaries/partial_cell_bottom.jl
+++ b/src/ImmersedBoundaries/partial_cell_bottom.jl
@@ -95,7 +95,7 @@ Criterion is h >= z - ϵ Δz
 @inline function _immersed_cell(i, j, k, underlying_grid, ib::PartialCellBottom)
     # Face node above current cell
     z = znode(i, j, k+1, underlying_grid, c, c, f)
-    h = @inbounds ib.bottom_height[i, j]
+    h = @inbounds ib.bottom_height[i, j, 1]
     return z <= h
 end
 
@@ -109,7 +109,7 @@ end
     x, y, z = node(i, j, k+1, underlying_grid, c, c, f)
 
     # Get bottom height and fractional Δz parameter
-    h = @inbounds ib.bottom_height[i, j]
+    h = @inbounds ib.bottom_height[i, j, 1]
     ϵ = ibg.immersed_boundary.minimum_fractional_cell_height
 
     # Are we in a bottom cell?
@@ -142,4 +142,4 @@ end
 @inline Δzᶜᶠᶠ(i, j, k, ibg::PCBIBG) = min(Δzᶜᶜᶠ(i, j-1, k, ibg), Δzᶜᶜᶠ(i, j, k, ibg))      
 @inline Δzᶠᶠᶠ(i, j, k, ibg::PCBIBG) = min(Δzᶠᶜᶠ(i, j-1, k, ibg), Δzᶠᶜᶠ(i, j, k, ibg))
 
-@inline z_bottom(i, j, ibg::PCBIBG) = @inbounds ibg.immersed_boundary.bottom_height[i, j]
+@inline z_bottom(i, j, ibg::PCBIBG) = @inbounds ibg.immersed_boundary.bottom_height[i, j, 1]

From b5b42d37dce3dda24f438f2f28254c7e805403c7 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Mon, 17 Apr 2023 16:27:57 -0800
Subject: [PATCH 190/530] Simplify via adapt

---
 src/ImmersedBoundaries/grid_fitted_bottom.jl  | 3 ++-
 src/ImmersedBoundaries/partial_cell_bottom.jl | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/ImmersedBoundaries/grid_fitted_bottom.jl b/src/ImmersedBoundaries/grid_fitted_bottom.jl
index 2b7fef2bde..114a05bec1 100644
--- a/src/ImmersedBoundaries/grid_fitted_bottom.jl
+++ b/src/ImmersedBoundaries/grid_fitted_bottom.jl
@@ -101,5 +101,6 @@ function on_architecture(arch, ib::GridFittedBottom{<:Field})
     return GridFittedBottom(new_bottom_height, ib.immersed_condition)
 end
 
-Adapt.adapt_structure(to, ib::GridFittedBottom) = GridFittedBottom(adapt(to, ib.bottom_height), ib.immersed_condition)
+Adapt.adapt_structure(to, ib::GridFittedBottom) = GridFittedBottom(adapt(to, ib.bottom_height.data),
+                                                                   ib.immersed_condition)
 
diff --git a/src/ImmersedBoundaries/partial_cell_bottom.jl b/src/ImmersedBoundaries/partial_cell_bottom.jl
index b84454cf36..985a4cc342 100644
--- a/src/ImmersedBoundaries/partial_cell_bottom.jl
+++ b/src/ImmersedBoundaries/partial_cell_bottom.jl
@@ -78,7 +78,7 @@ function on_architecture(arch, ib::PartialCellBottom{<:Field})
     return PartialCellBottom(new_bottom_height, ib.minimum_fractional_cell_height)
 end
 
-Adapt.adapt_structure(to, ib::PartialCellBottom) = PartialCellBottom(adapt(to, ib.bottom_height),
+Adapt.adapt_structure(to, ib::PartialCellBottom) = PartialCellBottom(adapt(to, ib.bottom_height.data),
                                                                      ib.minimum_fractional_cell_height)     
 
 """

From a531eb0d1750a092df06a33022120418f1654b5e Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Tue, 18 Apr 2023 04:32:59 -0800
Subject: [PATCH 191/530] Simpler reductions for bottom_height

---
 src/ImmersedBoundaries/grid_fitted_bottom.jl  | 6 +++---
 src/ImmersedBoundaries/partial_cell_bottom.jl | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/ImmersedBoundaries/grid_fitted_bottom.jl b/src/ImmersedBoundaries/grid_fitted_bottom.jl
index 114a05bec1..cf57db762b 100644
--- a/src/ImmersedBoundaries/grid_fitted_bottom.jl
+++ b/src/ImmersedBoundaries/grid_fitted_bottom.jl
@@ -39,9 +39,9 @@ const GFBIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:GridFit
 GridFittedBottom(bottom_height) = GridFittedBottom(bottom_height, CenterImmersedCondition())
 
 function Base.summary(ib::GridFittedBottom)
-    hmax = maximum(ib.bottom_height)
-    hmin = minimum(ib.bottom_height)
-    hmean = mean(ib.bottom_height)
+    hmax = maximum(interior(ib.bottom_height))
+    hmin = minimum(interior(ib.bottom_height))
+    hmean = mean(interior(ib.bottom_height))
 
     summary1 = "GridFittedBottom("
 
diff --git a/src/ImmersedBoundaries/partial_cell_bottom.jl b/src/ImmersedBoundaries/partial_cell_bottom.jl
index 985a4cc342..ba1e2173ae 100644
--- a/src/ImmersedBoundaries/partial_cell_bottom.jl
+++ b/src/ImmersedBoundaries/partial_cell_bottom.jl
@@ -15,9 +15,9 @@ end
 const PCBIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:PartialCellBottom}
 
 function Base.summary(ib::PartialCellBottom)
-    hmax = maximum(ib.bottom_height)
-    hmin = minimum(ib.bottom_height)
-    hmean = mean(ib.bottom_height)
+    hmax = maximum(interior(ib.bottom_height))
+    hmin = minimum(interior(ib.bottom_height))
+    hmean = mean(interior(ib.bottom_height))
 
     summary1 = "PartialCellBottom("
 

From 9edc32b5e9b085ee847404950506e09e7c4f0b0b Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Tue, 18 Apr 2023 05:34:34 -0800
Subject: [PATCH 192/530] Interior to parent

---
 src/ImmersedBoundaries/grid_fitted_bottom.jl  | 6 +++---
 src/ImmersedBoundaries/partial_cell_bottom.jl | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/ImmersedBoundaries/grid_fitted_bottom.jl b/src/ImmersedBoundaries/grid_fitted_bottom.jl
index cf57db762b..0c476e45e9 100644
--- a/src/ImmersedBoundaries/grid_fitted_bottom.jl
+++ b/src/ImmersedBoundaries/grid_fitted_bottom.jl
@@ -39,9 +39,9 @@ const GFBIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:GridFit
 GridFittedBottom(bottom_height) = GridFittedBottom(bottom_height, CenterImmersedCondition())
 
 function Base.summary(ib::GridFittedBottom)
-    hmax = maximum(interior(ib.bottom_height))
-    hmin = minimum(interior(ib.bottom_height))
-    hmean = mean(interior(ib.bottom_height))
+    hmax = maximum(parent(ib.bottom_height))
+    hmin = minimum(parent(ib.bottom_height))
+    hmean = mean(parent(ib.bottom_height))
 
     summary1 = "GridFittedBottom("
 
diff --git a/src/ImmersedBoundaries/partial_cell_bottom.jl b/src/ImmersedBoundaries/partial_cell_bottom.jl
index ba1e2173ae..608c413679 100644
--- a/src/ImmersedBoundaries/partial_cell_bottom.jl
+++ b/src/ImmersedBoundaries/partial_cell_bottom.jl
@@ -15,9 +15,9 @@ end
 const PCBIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:PartialCellBottom}
 
 function Base.summary(ib::PartialCellBottom)
-    hmax = maximum(interior(ib.bottom_height))
-    hmin = minimum(interior(ib.bottom_height))
-    hmean = mean(interior(ib.bottom_height))
+    hmax = maximum(parent(ib.bottom_height))
+    hmin = minimum(parent(ib.bottom_height))
+    hmean = mean(parent(ib.bottom_height))
 
     summary1 = "PartialCellBottom("
 

From 61a06210307d1542d56bb2591c0d0c289546cf8c Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 18 Apr 2023 21:33:55 -0400
Subject: [PATCH 193/530] Clean up MPI halo communication; add distributed turbulence validation

---
 src/Distributed/halo_communication.jl         | 42 ++-------
 src/Distributed/interleave_comm_and_comp.jl   |  4 +-
 src/Fields/field_boundary_buffers.jl          |  2 +-
 .../recompute_boundary_tendencies.jl          |  4 +-
 src/Utils/kernel_launching.jl                 |  2 -
 .../mpi_hydrostatic_turbulence.jl             | 85 +++++++++++++++++++
 6 files changed, 98 insertions(+), 41 deletions(-)
 create mode 100644 validation/distributed_simulations/mpi_hydrostatic_turbulence.jl

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index bb3ddc412e..4f28f438aa 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -42,13 +42,14 @@ import Oceananigans.BoundaryConditions:
 ##### MPI tags for halo communication BCs
 #####
 
-sides  = (:west, :east, :south, :north, :southwest, :southeast, :northwest, :northeast, :top, :bottom)
+sides  = (:west, :east, :south, :north, :southwest, :southeast, :northwest, :northeast)
 side_id = Dict(side => n-1 for (n, side) in enumerate(sides))
 
 opposite_side = Dict(
-    :west => :east, :east => :west,
-    :south => :north, :north => :south,
-    :bottom => :top, :top => :bottom,
+    :west => :east, 
+    :east => :west,
+    :south => :north,
+    :north => :south,
     :southwest => :northeast, 
     :southeast => :northwest, 
     :northwest => :southeast, 
@@ -96,7 +97,6 @@ for side in sides
     end
 end
 
-
 #####
 ##### Filling halos for halo communication boundary conditions
 #####
@@ -129,7 +129,7 @@ function fill_halo_regions!(c::OffsetArray, bcs, indices, loc, grid::Distributed
         fill_halo_event!(task, halo_tuple, c, indices, loc, arch, grid, buffers, args...; kwargs...)
     end
     
-    fill_eventual_corners!(arch.connectivity, c, indices, loc, arch, grid, buffers, args...; kwargs...)
+    fill_corners!(arch.connectivity, c, indices, loc, arch, grid, buffers, args...; kwargs...)
     arch.mpi_tag[1] += 1
 
     return nothing
@@ -160,9 +160,9 @@ for (side, dir) in zip([:southwest, :southeast, :northwest, :northeast], [1, 2,
 end
 
 # If more than one direction is communicating we need to repeat one fill halo to fill the freaking corners!
-function fill_eventual_corners!(connectivity, c, indices, loc, arch, grid, buffers, args...; blocking = true, kwargs...)
+function fill_corners!(connectivity, c, indices, loc, arch, grid, buffers, args...; blocking = true, kwargs...)
     
-    requests = []
+    requests = MPI.Request[]
 
     reqsw = fill_southwest_halo!(connectivity.southwest, c, indices, loc, arch, grid, buffers, args...; kwargs...)
     reqse = fill_southeast_halo!(connectivity.southeast, c, indices, loc, arch, grid, buffers, args...; kwargs...)
@@ -183,8 +183,6 @@ function fill_eventual_corners!(connectivity, c, indices, loc, arch, grid, buffe
         return nothing
     end
 
-    requests = requests |> Array{MPI.Request}
-
     # Syncronous MPI fill_halo_event!
     cooperative_waitall!(requests)
 
@@ -204,30 +202,6 @@ end
 @inline mpi_communication_side(::Val{fill_south_and_north_halo!}) = :south_and_north
 @inline mpi_communication_side(::Val{fill_bottom_and_top_halo!})  = :bottom_and_top
 
-### JUST TO TEST, EVENTUALLY IMPORT FROM MPI OR KA
-function cooperative_test!(req)
-    done = false
-    while !done
-        done, _ = MPI.Test(req, MPI.Status)
-        yield()
-    end
-end
-
-### JUST TO TEST, EVENTUALLY IMPORT FROM MPI OR KA
-function cooperative_wait(task::Task)
-    while !Base.istaskdone(task)
-        MPI.Iprobe(MPI.ANY_SOURCE, MPI.ANY_TAG, MPI.COMM_WORLD)
-        yield()
-    end
-    wait(task)
-end
-
-function cooperative_waitall!(tasks::Array{Task})
-    for task in tasks
-        cooperative_wait(task)
-    end
-end
-
 cooperative_wait(req::MPI.Request) = MPI.Waitall(req)
 cooperative_waitall!(req::Array{MPI.Request}) = MPI.Waitall(req)
 
diff --git a/src/Distributed/interleave_comm_and_comp.jl b/src/Distributed/interleave_comm_and_comp.jl
index 14c0d66af3..4804f371e1 100644
--- a/src/Distributed/interleave_comm_and_comp.jl
+++ b/src/Distributed/interleave_comm_and_comp.jl
@@ -10,13 +10,13 @@ function complete_communication_and_compute_boundary(model, grid::DistributedGri
     end
 
     # HERE we have to put fill_eventual_halo_corners
-    recompute_boundary_tendencies!(model)
+    compute_boundary_tendencies!(model)
 
     return nothing
 end
 
 complete_communication_and_compute_boundary(model, grid::DistributedGrid, arch::BlockingDistributedArch) = nothing
-recompute_boundary_tendencies!() = nothing
+compute_boundary_tendencies!(model) = nothing
 
 interior_tendency_kernel_size(grid::DistributedGrid)    = interior_tendency_kernel_size(grid,    architecture(grid))
 interior_tendency_kernel_offsets(grid::DistributedGrid) = interior_tendency_kernel_offsets(grid, architecture(grid))
diff --git a/src/Fields/field_boundary_buffers.jl b/src/Fields/field_boundary_buffers.jl
index 8cf8e45682..474a7d3e53 100644
--- a/src/Fields/field_boundary_buffers.jl
+++ b/src/Fields/field_boundary_buffers.jl
@@ -121,7 +121,7 @@ fill_south_send_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid) =
     _fill_south_send_buffer!(parent(c), buffers.south, halo_size(grid)[1], halo_size(grid)[2], size(grid, 1), size(grid, 2))
 
 fill_north_send_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid) = 
-    _fill_north_send_buffer!(parent(c), buffers.north,halo_size(grid)[1], halo_size(grid)[2], size(grid, 1), size(grid, 2))
+    _fill_north_send_buffer!(parent(c), buffers.north, halo_size(grid)[1], halo_size(grid)[2], size(grid, 1), size(grid, 2))
 
 fill_southwest_send_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid) = 
     _fill_southwest_send_buffer!(parent(c), buffers.southwest, halo_size(grid)[1], halo_size(grid)[2], size(grid, 1), size(grid, 2))
diff --git a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
index c69ab4e53d..53f27be518 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
@@ -1,7 +1,7 @@
-import Oceananigans.Distributed: recompute_boundary_tendencies!
+import Oceananigans.Distributed: compute_boundary_tendencies!
 
 # We assume here that top/bottom BC are always synched (no partitioning in z)
-function recompute_boundary_tendencies!(model::HydrostaticFreeSurfaceModel)
+function compute_boundary_tendencies!(model::HydrostaticFreeSurfaceModel)
     grid = model.grid
     arch = architecture(grid)
 
diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index 6e83b3d47a..89ed5f25f8 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -84,8 +84,6 @@ function launch!(arch, grid, workspec, kernel!, kernel_args...;
                                       include_right_boundaries,
                                       reduced_dimensions,
                                       location)
-    
-
 
     if !isnothing(only_active_cells)
         workgroup, worksize = active_cells_work_layout(worksize, only_active_cells, grid) 
diff --git a/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl b/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl
new file mode 100644
index 0000000000..d7c8ea568b
--- /dev/null
+++ b/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl
@@ -0,0 +1,85 @@
+using Oceananigans
+using MPI
+
+MPI.Initialized() || MPI.Init()
+
+     comm = MPI.COMM_WORLD
+mpi_ranks = MPI.Comm_size(comm)
+
+@assert mpi_ranks == 4
+
+using Statistics
+using Oceananigans
+using Oceananigans.Distributed
+
+ranks = (2, 2, 1)
+topo  = (Periodic, Periodic, Bounded)
+arch  = DistributedArch(CPU(), ranks=ranks, topology=topo, use_buffers=true)
+
+grid  = RectilinearGrid(arch, topology=topo, size=(28, 28, 1), extent=(4π, 4π, 0.5), halo=(3, 3, 3))
+
+local_rank = MPI.Comm_rank(MPI.COMM_WORLD)
+
+free_surface = SplitExplicitFreeSurface(; substeps = 30)
+
+model = HydrostaticFreeSurfaceModel(; grid, free_surface,
+                     momentum_advection = WENO(),
+                     tracer_advection = WENO(),
+                     buoyancy = nothing,
+                     coriolis = FPlane(f = 1),
+                     tracers = :c)
+
+using Random
+Random.seed!(1234 * (local_rank +1))
+
+set!(model, u = (x, y, z) -> rand(), v = (x, y, z) -> rand())
+
+mask(x, y, z) = x > π && x < 2π && y > π && y < 2π ? 1.0 : 0.0
+if local_rank == 0
+    set!(model.tracers.c, mask)
+end
+
+u, v, _ = model.velocities
+outputs = merge(model.velocities, model.tracers)
+
+progress(sim) = @info "Iteration: $(sim.model.clock.iteration), time: $(sim.model.clock.time)"
+simulation = Simulation(model, Δt=0.001, stop_time=100.0)
+
+simulation.callbacks[:progress] = Callback(progress, IterationInterval(100))
+
+filepath = "mpi_hydrostatic_turbulence_rank$(local_rank)"
+simulation.output_writers[:fields] =
+    JLD2OutputWriter(model, outputs, filename=filepath, schedule=TimeInterval(0.1),
+                     overwrite_existing=true)
+
+MPI.Barrier(MPI.COMM_WORLD)
+
+run!(simulation)
+
+if local_rank == 0 # only the root rank stitches the four per-rank output files into one figure
+    using Printf
+    using NCDatasets
+    using GLMakie
+
+    iter = Observable(1) # index of the saved frame currently displayed
+    z1 = FieldTimeSeries("mpi_hydrostatic_turbulence_rank0.jld2", "u")
+    z2 = FieldTimeSeries("mpi_hydrostatic_turbulence_rank1.jld2", "u")
+    z3 = FieldTimeSeries("mpi_hydrostatic_turbulence_rank2.jld2", "u")
+    z4 = FieldTimeSeries("mpi_hydrostatic_turbulence_rank3.jld2", "u")
+
+    # `lift` is called as a function rather than via `@lift`: macros inside this top-level `if`
+    # are expanded before `using GLMakie` above has executed, so `@lift` would not be defined yet.
+    ζ1 = lift(i -> interior(z1[i], 1:28, 1:28, 1), iter)
+    ζ2 = lift(i -> interior(z2[i], 1:28, 1:28, 1), iter)
+    ζ3 = lift(i -> interior(z3[i], 1:28, 1:28, 1), iter)
+    ζ4 = lift(i -> interior(z4[i], 1:28, 1:28, 1), iter)
+
+    x1, y1 = z1.grid.xᶠᵃᵃ[1:28], z1.grid.yᵃᶜᵃ[1:28]
+    x2, y2 = z4.grid.xᶠᵃᵃ[1:28], z4.grid.yᵃᶜᵃ[1:28]
+    fig = Figure()
+    ax = Axis(fig[1, 1])
+    heatmap!(ax, x1, y1, ζ1, colorrange = (-1.0, 1.0))
+    heatmap!(ax, x1, y2, ζ2, colorrange = (-1.0, 1.0))
+    heatmap!(ax, x2, y1, ζ3, colorrange = (-1.0, 1.0))
+    heatmap!(ax, x2, y2, ζ4, colorrange = (-1.0, 1.0))
+end
\ No newline at end of file

From 76baae87518f1926522a5847c858b7f97867cbaa Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 19 Apr 2023 07:55:59 -0400
Subject: [PATCH 194/530] beautify

---
 .../split_explicit_free_surface_kernels.jl    | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
index a62a0cc222..4501767669 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
@@ -293,22 +293,22 @@ function split_explicit_free_surface_step!(free_surface::SplitExplicitFreeSurfac
     wait_free_surface_communication!(free_surface)
 
     # reset free surface averages
-    @apply_regionally initialize_free_surface_state!(free_surface.state, free_surface.η)
-
-    # Solve for the free surface at tⁿ⁺¹
-    @apply_regionally iterate_split_explicit!(free_surface, grid, Δt)
+    @apply_regionally begin 
+        initialize_free_surface_state!(free_surface.state, free_surface.η)
+        # Solve for the free surface at tⁿ⁺¹
+        iterate_split_explicit!(free_surface, grid, Δt)
+        # Reset eta for the next timestep
+        set_η!(free_surface.η, free_surface.state.η̅)
+    end
     
-    # Reset eta for the next timestep
-    # this is the only way in which η̅ is used: as a smoother for the 
-    # substepped η field
-    @apply_regionally set_η!(free_surface.η, free_surface.state.η̅)
-
     fields_to_fill = (free_surface.state.U̅, free_surface.state.V̅)
     fill_halo_regions!(fields_to_fill; blocking = false)
 
     # Preparing velocities for the barotropic correction
-    mask_immersed_field!(model.velocities.u)
-    mask_immersed_field!(model.velocities.v)
+    @apply_regionally begin 
+        mask_immersed_field!(model.velocities.u)
+        mask_immersed_field!(model.velocities.v)
+    end
 
     return nothing
 end

From b36e7223b1ef5f12817229fb55b25445ba13dacf Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 19 Apr 2023 07:56:32 -0400
Subject: [PATCH 195/530] remove vestigial

---
 .../split_explicit_free_surface_kernels.jl                  | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
index 4501767669..84ade2d784 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
@@ -298,9 +298,9 @@ function split_explicit_free_surface_step!(free_surface::SplitExplicitFreeSurfac
         # Solve for the free surface at tⁿ⁺¹
         iterate_split_explicit!(free_surface, grid, Δt)
         # Reset eta for the next timestep
-        set_η!(free_surface.η, free_surface.state.η̅)
+        set!(free_surface.η, free_surface.state.η̅)
     end
-    
+
     fields_to_fill = (free_surface.state.U̅, free_surface.state.V̅)
     fill_halo_regions!(fields_to_fill; blocking = false)
 
@@ -313,8 +313,6 @@ function split_explicit_free_surface_step!(free_surface::SplitExplicitFreeSurfac
     return nothing
 end
 
-@inline set_η!(η, η̅) = parent(η) .= parent(η̅)
-
 function iterate_split_explicit!(free_surface, grid, Δt)
     arch = architecture(grid)
 

From 794410f9570746db9d98a89c96e100a623966a5f Mon Sep 17 00:00:00 2001
From: "Gregory L. Wagner" <wagner.greg@gmail.com>
Date: Wed, 19 Apr 2023 15:02:21 -0800
Subject: [PATCH 196/530] Update src/Fields/field.jl

---
 src/Fields/field.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/Fields/field.jl b/src/Fields/field.jl
index 638d77e7cc..29bd3ebae4 100644
--- a/src/Fields/field.jl
+++ b/src/Fields/field.jl
@@ -547,10 +547,10 @@ const AnyReduction     = typeof(Base.any!)
 
 isversion8⁺() = VERSION.minor > 7
 
-initialize_reduced_field!(::SumReduction,  f, r::ReducedField, c) = isversion8⁺() ? Base.initarray!(interior(r), f, Base.add_sum, true, interior(c))  : Base.initarray!(interior(r), Base.add_sum, true, interior(c))
-initialize_reduced_field!(::ProdReduction, f, r::ReducedField, c) = isversion8⁺() ? Base.initarray!(interior(r), f, Base.mul_prod, true, interior(c)) : Base.initarray!(interior(r), Base.mul_prod, true, interior(c))
-initialize_reduced_field!(::AllReduction,  f, r::ReducedField, c) = isversion8⁺() ? Base.initarray!(interior(r), f, &, true, interior(c))             : Base.initarray!(interior(r), &, true, interior(c))
-initialize_reduced_field!(::AnyReduction,  f, r::ReducedField, c) = isversion8⁺() ? Base.initarray!(interior(r), f, |, true, interior(c))             : Base.initarray!(interior(r), |, true, interior(c))
+initialize_reduced_field!(::SumReduction,  f, r::ReducedField, c) = isjuliaversion1p8⁺() ? Base.initarray!(interior(r), f, Base.add_sum, true, interior(c))  : Base.initarray!(interior(r), Base.add_sum, true, interior(c))
+initialize_reduced_field!(::ProdReduction, f, r::ReducedField, c) = isjuliaversion1p8⁺() ? Base.initarray!(interior(r), f, Base.mul_prod, true, interior(c)) : Base.initarray!(interior(r), Base.mul_prod, true, interior(c))
+initialize_reduced_field!(::AllReduction,  f, r::ReducedField, c) = isjuliaversion1p8⁺() ? Base.initarray!(interior(r), f, &, true, interior(c))             : Base.initarray!(interior(r), &, true, interior(c))
+initialize_reduced_field!(::AnyReduction,  f, r::ReducedField, c) = isjuliaversion1p8⁺() ? Base.initarray!(interior(r), f, |, true, interior(c))             : Base.initarray!(interior(r), |, true, interior(c))
 
 initialize_reduced_field!(::MaximumReduction, f, r::ReducedField, c) = Base.mapfirst!(f, interior(r), interior(c))
 initialize_reduced_field!(::MinimumReduction, f, r::ReducedField, c) = Base.mapfirst!(f, interior(r), interior(c))

From d0e03f77350d9b4a6a86485db3cf7795f670c803 Mon Sep 17 00:00:00 2001
From: "Gregory L. Wagner" <wagner.greg@gmail.com>
Date: Wed, 19 Apr 2023 15:02:26 -0800
Subject: [PATCH 197/530] Update src/Fields/field.jl

Co-authored-by: Navid C. Constantinou <navidcy@users.noreply.github.com>
---
 src/Fields/field.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Fields/field.jl b/src/Fields/field.jl
index 29bd3ebae4..9de66583da 100644
--- a/src/Fields/field.jl
+++ b/src/Fields/field.jl
@@ -545,7 +545,7 @@ const MinimumReduction = typeof(Base.minimum!)
 const AllReduction     = typeof(Base.all!)
 const AnyReduction     = typeof(Base.any!)
 
-isversion8⁺() = VERSION.minor > 7
+isjuliaversion1p8⁺() = VERSION.minor > 7
 
 initialize_reduced_field!(::SumReduction,  f, r::ReducedField, c) = isjuliaversion1p8⁺() ? Base.initarray!(interior(r), f, Base.add_sum, true, interior(c))  : Base.initarray!(interior(r), Base.add_sum, true, interior(c))
 initialize_reduced_field!(::ProdReduction, f, r::ReducedField, c) = isjuliaversion1p8⁺() ? Base.initarray!(interior(r), f, Base.mul_prod, true, interior(c)) : Base.initarray!(interior(r), Base.mul_prod, true, interior(c))

From 361e09f086dc702ca22578b32fa89c63bcf6ced3 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 20 Apr 2023 12:31:43 -0400
Subject: [PATCH 198/530] Update CATKE

---
 .../CATKEVerticalDiffusivities.jl             | 199 ++++++++++++------
 .../mixing_length.jl                          | 189 +++++++++++------
 .../turbulent_kinetic_energy_equation.jl      |  66 +++---
 3 files changed, 307 insertions(+), 147 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
index 439a31d199..39076d3628 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
@@ -6,9 +6,11 @@ using KernelAbstractions: @kernel, @index
 using Oceananigans.Architectures
 using Oceananigans.Grids
 using Oceananigans.Utils
+using Oceananigans.Units
 using Oceananigans.Fields
 using Oceananigans.Operators
 
+using Oceananigans.Utils: prettysummary
 using Oceananigans.Fields: ZeroField
 using Oceananigans.BoundaryConditions: default_prognostic_bc, DefaultBoundaryCondition
 using Oceananigans.BoundaryConditions: BoundaryCondition, FieldBoundaryConditions
@@ -39,28 +41,62 @@ import Oceananigans.TurbulenceClosures:
     diffusive_flux_y,
     diffusive_flux_z
 
-
-struct CATKEVerticalDiffusivity{TD, CL, TKE} <: AbstractScalarDiffusivity{TD, VerticalFormulation}
+struct CATKEVerticalDiffusivity{TD, CL, FT, TKE} <: AbstractScalarDiffusivity{TD, VerticalFormulation}
     mixing_length :: CL
     turbulent_kinetic_energy_equation :: TKE
+    maximum_diffusivity :: FT
+    minimum_turbulent_kinetic_energy :: FT
+    minimum_convective_buoyancy_flux :: FT
+    negative_turbulent_kinetic_energy_damping_time_scale :: FT
 end
 
-function CATKEVerticalDiffusivity{TD}(mixing_length :: CL,
-                                      turbulent_kinetic_energy_equation :: TKE) where {TD, CL, TKE}
-
-    return CATKEVerticalDiffusivity{TD, CL, TKE}(mixing_length,
-                                                 turbulent_kinetic_energy_equation)
+function CATKEVerticalDiffusivity{TD}(mixing_length::CL,
+                                      turbulent_kinetic_energy_equation::TKE,
+                                      maximum_diffusivity::FT,
+                                      minimum_turbulent_kinetic_energy::FT,
+                                      minimum_convective_buoyancy_flux::FT,
+                                      negative_turbulent_kinetic_energy_damping_time_scale::FT) where {TD, CL, TKE, FT}
+
+    return CATKEVerticalDiffusivity{TD, CL, FT, TKE}(mixing_length,
+                                                     turbulent_kinetic_energy_equation,
+                                                     maximum_diffusivity,
+                                                     minimum_turbulent_kinetic_energy,
+                                                     minimum_convective_buoyancy_flux,
+                                                     negative_turbulent_kinetic_energy_damping_time_scale)
 end
 
 """
     CATKEVerticalDiffusivity(time_discretization = VerticallyImplicitTimeDiscretization(), FT=Float64;
                              mixing_length = MixingLength{FT}(),
                              turbulent_kinetic_energy_equation = TurbulentKineticEnergyEquation{FT}(),
-                             warning = true)
+                             maximum_diffusivity = Inf,
+                             minimum_turbulent_kinetic_energy = zero(FT),
+                             negative_turbulent_kinetic_energy_damping_time_scale = 1minute)
 
-Returns the `CATKEVerticalDiffusivity` turbulence closure for vertical mixing by
+Return the `CATKEVerticalDiffusivity` turbulence closure for vertical mixing by
 small-scale ocean turbulence based on the prognostic evolution of subgrid
 Turbulent Kinetic Energy (TKE).
+
+Keyword arguments
+=================
+  - `maximum_diffusivity`: Maximum value for tracer, momentum, and TKE diffusivities.
+                           Used to clip the diffusivity when/if CATKE predicts
+                           diffusivities that are too large.
+                           Default: `Inf`.
+
+  - `minimum_turbulent_kinetic_energy`: Minimum value for the turbulent kinetic energy.
+                                        Can be used to model the presence of "background" TKE
+                                        levels due to, for example, mixing by breaking internal waves.
+                                        Default: 0.
+
+  - `negative_turbulent_kinetic_energy_damping_time_scale`: Damping time-scale for spurious negative values of TKE,
+                                                            typically generated by oscillatory errors associated
+                                                            with TKE advection.
+                                                            Default: 1 minute.
+
+Note that for numerical stability, it is recommended to either have a relatively short
+`negative_turbulent_kinetic_energy_damping_time_scale` or a reasonable
+`minimum_turbulent_kinetic_energy`, or both.
 """
 CATKEVerticalDiffusivity(FT::DataType; kw...) = CATKEVerticalDiffusivity(VerticallyImplicitTimeDiscretization(), FT; kw...)
 
@@ -71,35 +107,38 @@ const FlavorOfCATKE{TD} = Union{CATKEVD{TD}, CATKEVDArray{TD}} where TD
 include("mixing_length.jl")
 include("turbulent_kinetic_energy_equation.jl")
 
-# "Favorite" parameters from Wagner et al. 2023 (in prep)
-favorite_turbulent_kinetic_energy_equation(FT) = TurbulentKineticEnergyEquation(
-    C⁻D  = FT(1.2),
-    C⁺D  = FT(8.0),
-    CᶜD  = FT(1.0),
+# Optimal parameters for "favorite CATKE" from Wagner et al. 2023 (in prep)
+optimal_turbulent_kinetic_energy_equation(FT) = TurbulentKineticEnergyEquation(
+    C⁻D  = FT(4.4),
+    C⁺D  = FT(3.3),
+    CᶜD  = FT(0.23),
     CᵉD  = FT(0.0),
-    Cᵂu★ = FT(1.5),
-    CᵂwΔ = FT(3.3))
-
-favorite_mixing_length(FT) = MixingLength(
-    Cᵇ   = FT(0.6), 
-    Cˢ   = FT(Inf),
-    Cᶜc  = FT(1.4),
-    Cᶜe  = FT(9.1),
-    Cᵉc  = FT(0.34),
+    Cᵂu★ = FT(1.8),
+    CᵂwΔ = FT(12.0))
+
+optimal_mixing_length(FT) = MixingLength(
+    Cᵇ   = FT(0.37), 
+    Cᶜc  = FT(4.8),
+    Cᶜe  = FT(1.1),
+    Cᵉc  = FT(0.049),
     Cᵉe  = FT(0.0),
-    Cˢᶜ  = FT(0.18),
-    C⁻u  = FT(0.49),
-    C⁺u  = FT(0.17),
-    C⁻c  = FT(0.54),
-    C⁺c  = FT(0.10),
-    C⁻e  = FT(7.5),
-    C⁺e  = FT(1.3),
-    CRiʷ = FT(0.42),
-    CRiᶜ = FT(0.49))
+    Cˢᶜ  = FT(0.29),
+    C⁻u  = FT(0.36),
+    C⁺u  = FT(0.24),
+    C⁻c  = FT(0.41),
+    C⁺c  = FT(0.12),
+    C⁻e  = FT(6.7),
+    C⁺e  = FT(5.4),
+    CRiʷ = FT(0.011),
+    CRiᶜ = FT(0.76))
 
 function CATKEVerticalDiffusivity(time_discretization::TD = VerticallyImplicitTimeDiscretization(), FT=Float64;
-                                  mixing_length = favorite_mixing_length(FT),
-                                  turbulent_kinetic_energy_equation = favorite_turbulent_kinetic_energy_equation(FT),
+                                  mixing_length = optimal_mixing_length(FT),
+                                  turbulent_kinetic_energy_equation = optimal_turbulent_kinetic_energy_equation(FT),
+                                  maximum_diffusivity = Inf,
+                                  minimum_turbulent_kinetic_energy = 0,
+                                  minimum_convective_buoyancy_flux = 1e-11,
+                                  negative_turbulent_kinetic_energy_damping_time_scale = 1minute,
                                   warning = true) where TD
 
     if warning
@@ -113,7 +152,13 @@ function CATKEVerticalDiffusivity(time_discretization::TD = VerticallyImplicitTi
     mixing_length = convert_eltype(FT, mixing_length)
     turbulent_kinetic_energy_equation = convert_eltype(FT, turbulent_kinetic_energy_equation)
 
-    return CATKEVerticalDiffusivity{TD}(mixing_length, turbulent_kinetic_energy_equation)
+    return CATKEVerticalDiffusivity{TD}(mixing_length,
+                                        turbulent_kinetic_energy_equation,
+                                        FT(maximum_diffusivity),
+                                        FT(minimum_turbulent_kinetic_energy),
+                                        FT(minimum_convective_buoyancy_flux),
+                                        FT(negative_turbulent_kinetic_energy_damping_time_scale))
+                                  
 end
 
 function with_tracers(tracer_names, closure::FlavorOfCATKE)
@@ -164,22 +209,22 @@ end
 
 function DiffusivityFields(grid, tracer_names, bcs, closure::FlavorOfCATKE)
 
-    default_diffusivity_bcs = (Kᵘ = FieldBoundaryConditions(grid, (Center, Center, Face)),
-                               Kᶜ = FieldBoundaryConditions(grid, (Center, Center, Face)),
-                               Kᵉ = FieldBoundaryConditions(grid, (Center, Center, Face)))
+    default_diffusivity_bcs = (κᵘ = FieldBoundaryConditions(grid, (Center, Center, Face)),
+                               κᶜ = FieldBoundaryConditions(grid, (Center, Center, Face)),
+                               κᵉ = FieldBoundaryConditions(grid, (Center, Center, Face)))
 
     bcs = merge(default_diffusivity_bcs, bcs)
 
-    Kᵘ = CenterField(grid, boundary_conditions=bcs.Kᵘ)
-    Kᶜ = CenterField(grid, boundary_conditions=bcs.Kᶜ)
-    Kᵉ = CenterField(grid, boundary_conditions=bcs.Kᵉ)
+    κᵘ = ZFaceField(grid, boundary_conditions=bcs.κᵘ)
+    κᶜ = ZFaceField(grid, boundary_conditions=bcs.κᶜ)
+    κᵉ = ZFaceField(grid, boundary_conditions=bcs.κᵉ)
     Lᵉ = CenterField(grid) #, boundary_conditions=nothing)
 
     # Secret tuple for getting tracer diffusivities with tuple[tracer_index]
-    _tupled_tracer_diffusivities         = NamedTuple(name => name === :e ? Kᵉ : Kᶜ          for name in tracer_names)
+    _tupled_tracer_diffusivities         = NamedTuple(name => name === :e ? κᵉ : κᶜ          for name in tracer_names)
     _tupled_implicit_linear_coefficients = NamedTuple(name => name === :e ? Lᵉ : ZeroField() for name in tracer_names)
 
-    return (; Kᵘ, Kᶜ, Kᵉ, Lᵉ, _tupled_tracer_diffusivities, _tupled_implicit_linear_coefficients)
+    return (; κᵘ, κᶜ, κᵉ, Lᵉ, _tupled_tracer_diffusivities, _tupled_implicit_linear_coefficients)
 end        
 
 @inline viscosity_location(::FlavorOfCATKE) = (Center(), Center(), Face())
@@ -204,24 +249,29 @@ function calculate_diffusivities!(diffusivities, closure::FlavorOfCATKE, model)
     return nothing
 end
 
-@kernel function calculate_CATKE_diffusivities!(diffusivities, grid, closure::FlavorOfCATKE, velocities, tracers, buoyancy, args...)
+@kernel function calculate_CATKE_diffusivities!(diffusivities, grid, closure::FlavorOfCATKE, velocities, tracers, buoyancy, clock, top_tracer_bcs)
     i, j, k, = @index(Global, NTuple)
 
     # Ensure this works with "ensembles" of closures, in addition to ordinary single closures
     closure_ij = getclosure(i, j, closure)
 
+    max_K = closure_ij.maximum_diffusivity
+
     @inbounds begin
-        diffusivities.Kᵘ[i, j, k] = Kuᶜᶜᶠ(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, args...)
-        diffusivities.Kᶜ[i, j, k] = Kcᶜᶜᶠ(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, args...)
-        diffusivities.Kᵉ[i, j, k] = Keᶜᶜᶠ(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, args...)
+        diffusivities.κᵘ[i, j, k] = min(max_K, κuᶜᶜᶠ(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, clock, top_tracer_bcs))
+        diffusivities.κᶜ[i, j, k] = min(max_K, κcᶜᶜᶠ(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, clock, top_tracer_bcs))
+        diffusivities.κᵉ[i, j, k] = min(max_K, κeᶜᶜᶠ(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, clock, top_tracer_bcs))
 
         # "Patankar trick" for buoyancy production (cf Patankar 1980 or Burchard et al. 2003)
         # If buoyancy flux is a _sink_ of TKE, we treat it implicitly.
-        Qᵇ = buoyancy_flux(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, diffusivities)
+        wb = buoyancy_flux(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, diffusivities)
         eⁱʲᵏ = @inbounds tracers.e[i, j, k]
-        Qᵇ_e = ifelse(sign(Qᵇ) * sign(eⁱʲᵏ) < 0, Qᵇ / eⁱʲᵏ, zero(grid))
+
+        # See `buoyancy_flux`
+        dissipative_buoyancy_flux = sign(wb) * sign(eⁱʲᵏ) < 0
+        wb_e = ifelse(dissipative_buoyancy_flux, wb / eⁱʲᵏ, zero(grid))
         
-        diffusivities.Lᵉ[i, j, k] = - Qᵇ_e + implicit_dissipation_coefficient(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, args...)
+        diffusivities.Lᵉ[i, j, k] = - wb_e + implicit_dissipation_coefficient(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, clock, top_tracer_bcs)
     end
 end
 
@@ -230,28 +280,32 @@ end
     return @inbounds L[i, j, k]
 end
 
-@inline turbulent_velocity(i, j, k, grid, e) = @inbounds sqrt(clip(e[i, j, k]))
+@inline function turbulent_velocity(i, j, k, grid, closure, e)
+    eᵢ = @inbounds e[i, j, k]
+    eᵐⁱⁿ = closure.minimum_turbulent_kinetic_energy
+    return sqrt(max(eᵐⁱⁿ, eᵢ))
+end
 @inline is_stableᶜᶜᶠ(i, j, k, grid, tracers, buoyancy) = ∂z_b(i, j, k, grid, buoyancy, tracers) >= 0
 
-@inline function Kuᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, clock, top_tracer_bcs)
-    u★ = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocity, tracers.e)
+@inline function κuᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, clock, top_tracer_bcs)
+    u★ = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocity, closure, tracers.e)
     ℓu = momentum_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, clock, top_tracer_bcs)
     return ℓu * u★
 end
 
-@inline function Kcᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, clock, top_tracer_bcs)
-    u★ = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocity, tracers.e)
+@inline function κcᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, clock, top_tracer_bcs)
+    u★ = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocity, closure, tracers.e)
     ℓc = tracer_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, clock, top_tracer_bcs)
     return ℓc * u★
 end
 
-@inline function Keᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, clock, top_tracer_bcs)
-    u★ = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocity, tracers.e)
+@inline function κeᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, clock, top_tracer_bcs)
+    u★ = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocity, closure, tracers.e)
     ℓe = TKE_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, clock, top_tracer_bcs)
     return ℓe * u★
 end
 
-@inline viscosity(::FlavorOfCATKE, diffusivities) = diffusivities.Kᵘ
+@inline viscosity(::FlavorOfCATKE, diffusivities) = diffusivities.κᵘ
 @inline diffusivity(::FlavorOfCATKE, diffusivities, ::Val{id}) where id = diffusivities._tupled_tracer_diffusivities[id]
     
 #####
@@ -264,11 +318,34 @@ function Base.summary(closure::CATKEVD)
 end
 
 function Base.show(io::IO, closure::FlavorOfCATKE)
+    # └
     print(io, summary(closure))
     print(io, '\n')
-    show(io, closure.mixing_length)
-    print(io, '\n')
-    show(io, closure.turbulent_kinetic_energy_equation)
+    print(io, "├── maximum_diffusivity: ", prettysummary(closure.maximum_diffusivity), '\n',
+              "├── minimum_turbulent_kinetic_energy: ", prettysummary(closure.minimum_turbulent_kinetic_energy), '\n',
+              "├── negative_turbulent_kinetic_energy_damping_time_scale: ", prettysummary(closure.negative_turbulent_kinetic_energy_damping_time_scale), '\n',
+              "├── minimum_convective_buoyancy_flux: ", prettysummary(closure.minimum_convective_buoyancy_flux), '\n',
+              "├── mixing_length: ", prettysummary(closure.mixing_length), '\n',
+              "│   ├── Cᵇ:   ", prettysummary(closure.mixing_length.Cᵇ), '\n',
+              "│   ├── Cᶜc:  ", prettysummary(closure.mixing_length.Cᶜc), '\n',
+              "│   ├── Cᶜe:  ", prettysummary(closure.mixing_length.Cᶜe), '\n',
+              "│   ├── Cᵉc:  ", prettysummary(closure.mixing_length.Cᵉc), '\n',
+              "│   ├── Cᵉe:  ", prettysummary(closure.mixing_length.Cᵉe), '\n',
+              "│   ├── C⁻u:  ", prettysummary(closure.mixing_length.C⁻u), '\n',
+              "│   ├── C⁻c:  ", prettysummary(closure.mixing_length.C⁻c), '\n',
+              "│   ├── C⁻e:  ", prettysummary(closure.mixing_length.C⁻e), '\n',
+              "│   ├── C⁺u:  ", prettysummary(closure.mixing_length.C⁺u), '\n',
+              "│   ├── C⁺c:  ", prettysummary(closure.mixing_length.C⁺c), '\n',
+              "│   ├── C⁺e:  ", prettysummary(closure.mixing_length.C⁺e), '\n',
+              "│   ├── CRiʷ: ", prettysummary(closure.mixing_length.CRiʷ), '\n',
+              "│   └── CRiᶜ: ", prettysummary(closure.mixing_length.CRiᶜ), '\n',
+              "└── turbulent_kinetic_energy_equation: ", prettysummary(closure.turbulent_kinetic_energy_equation), '\n',
+              "    ├── C⁻D:  ", prettysummary(closure.turbulent_kinetic_energy_equation.C⁻D),  '\n',
+              "    ├── C⁺D:  ", prettysummary(closure.turbulent_kinetic_energy_equation.C⁺D),  '\n',
+              "    ├── CᶜD:  ", prettysummary(closure.turbulent_kinetic_energy_equation.CᶜD),  '\n',
+              "    ├── CᵉD:  ", prettysummary(closure.turbulent_kinetic_energy_equation.CᵉD),  '\n',
+              "    ├── Cᵂu★: ", prettysummary(closure.turbulent_kinetic_energy_equation.Cᵂu★), '\n',
+              "    └── CᵂwΔ: ", prettysummary(closure.turbulent_kinetic_energy_equation.CᵂwΔ))
 end
 
 end # module
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
index 2306806ce9..fc02c100ea 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
@@ -1,4 +1,4 @@
-using ..TurbulenceClosures: wall_vertical_distanceᶜᶜᶠ
+using ..TurbulenceClosures: wall_vertical_distanceᶜᶜᶠ, wall_vertical_distanceᶜᶜᶜ, total_depthᶜᶜᵃ
 
 """
     struct MixingLength{FT}
@@ -7,7 +7,6 @@ Contains mixing length parameters for CATKE vertical diffusivity.
 """
 Base.@kwdef struct MixingLength{FT}
     Cᵇ   :: FT = Inf
-    Cˢ   :: FT = Inf
     Cᶜc  :: FT = 0.0
     Cᶜe  :: FT = 0.0
     Cᵉc  :: FT = 0.0
@@ -30,69 +29,138 @@ end
 @inline ϕ⁺(i, j, k, grid, ψ) = @inbounds clip(ψ[i, j, k])
 @inline ϕ²(i, j, k, grid, ϕ, args...) = ϕ(i, j, k, grid, args...)^2
 
-@inline function shearᶜᶜᶠ(i, j, k, grid, velocities)
-    ∂z_u² = ℑxᶜᵃᵃ(i, j, k, grid, ϕ², ∂zᶠᶜᶠ, velocities.u)
-    ∂z_v² = ℑyᵃᶜᵃ(i, j, k, grid, ϕ², ∂zᶜᶠᶠ, velocities.v)
+@inline function shearᶜᶜᶠ(i, j, k, grid, u, v)
+    ∂z_u² = ℑxᶜᵃᵃ(i, j, k, grid, ϕ², ∂zᶠᶜᶠ, u)
+    ∂z_v² = ℑyᵃᶜᵃ(i, j, k, grid, ϕ², ∂zᶜᶠᶠ, v)
     S² = ∂z_u² + ∂z_v²
     return S²
 end
 
-@inline function buoyancy_mixing_lengthᶜᶜᶠ(i, j, k, grid, e, tracers, buoyancy)
+@inline function shearᶜᶜᶜ(i, j, k, grid, u, v)
+    ∂z_u² = ℑxzᶜᵃᶜ(i, j, k, grid, ϕ², ∂zᶠᶜᶠ, u)
+    ∂z_v² = ℑyzᵃᶜᶜ(i, j, k, grid, ϕ², ∂zᶜᶠᶠ, v)
+    S² = ∂z_u² + ∂z_v²
+    return S²
+end
+
+@inline function buoyancy_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, e, tracers, buoyancy)
     FT = eltype(grid)
     N² = ∂z_b(i, j, k, grid, buoyancy, tracers)
     N²⁺ = clip(N²)
-    w★ = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocity, e)
+    w★ = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocity, closure, e)
     return ifelse(N²⁺ == 0, FT(Inf), w★ / sqrt(N²⁺))
 end
 
-@inline function shear_mixing_lengthᶜᶜᶠ(i, j, k, grid, e, velocities)
+@inline function buoyancy_mixing_lengthᶜᶜᶜ(i, j, k, grid, closure, e, tracers, buoyancy)
     FT = eltype(grid)
-    S² = shearᶜᶜᶠ(i, j, k, grid, velocities)
-    w★ = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocity, e)
-    return ifelse(S² == 0, FT(Inf), w★ / sqrt(S²))
+    N² = ℑzᵃᵃᶜ(i, j, k, grid, ∂z_b, buoyancy, tracers)
+    N²⁺ = clip(N²)
+    w★ = turbulent_velocity(i, j, k, grid, closure, e)
+    return ifelse(N²⁺ == 0, FT(Inf), w★ / sqrt(N²⁺))
 end
 
-@inline function stable_mixing_lengthᶜᶜᶠ(i, j, k, grid, Cᵇ::Number, Cˢ::Number, e, velocities, tracers, buoyancy)
-    ℓᵇ = Cᵇ * buoyancy_mixing_lengthᶜᶜᶠ(i, j, k, grid, e, tracers, buoyancy)
+@inline function stable_length_scaleᶜᶜᶠ(i, j, k, grid, closure, Cᵇ::Number, e, velocities, tracers, buoyancy)
+    ℓᵇ = Cᵇ * buoyancy_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, e, tracers, buoyancy)
     d = wall_vertical_distanceᶜᶜᶠ(i, j, k, grid)
     ℓᵇ = ifelse(isnan(ℓᵇ), d, ℓᵇ)
     ℓ = min(d, ℓᵇ)
     return ℓ
 end
 
-@inline three_halves(i, j, k, grid, e) = @inbounds sqrt(clip(e[i, j, k])^3)
+@inline function stable_length_scaleᶜᶜᶜ(i, j, k, grid, closure, Cᵇ::Number, e, velocities, tracers, buoyancy)
+    ℓᵇ = Cᵇ * buoyancy_mixing_lengthᶜᶜᶜ(i, j, k, grid, closure, e, tracers, buoyancy)
+    d = wall_vertical_distanceᶜᶜᶜ(i, j, k, grid)
+    ℓᵇ = ifelse(isnan(ℓᵇ), d, ℓᵇ)
+    ℓ = min(d, ℓᵇ)
+    return ℓ
+end
+
+@inline three_halves_tke(i, j, k, grid, closure, e) = turbulent_velocity(i, j, k, grid, closure, e)^3
+@inline squared_tke(i, j, k, grid, closure, e) = turbulent_velocity(i, j, k, grid, closure, e)^2
 
-@inline function convective_mixing_lengthᶜᶜᶠ(i, j, k, grid, Cᶜ::Number, Cᵉ::Number, Cˢᶜ::Number,
-                                             velocities, tracers, buoyancy, clock, tracer_bcs)
+@inline function convective_length_scaleᶜᶜᶠ(i, j, k, grid, closure, Cᶜ::Number, Cᵉ::Number, Cˢᶜ::Number,
+                                            velocities, tracers, buoyancy, clock, tracer_bcs)
 
-    Qᵇ  = top_buoyancy_flux(i, j, grid, buoyancy, tracer_bcs, clock, merge(velocities, tracers))
-    N²  = ∂z_b(i, j, k, grid, buoyancy, tracers)
-    w★  = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocity, tracers.e)
-    w★³ = ℑzᵃᵃᶠ(i, j, k, grid, three_halves, tracers.e)
-    w★² = ℑzᵃᵃᶠ(i, j, k, grid, ϕ⁺, tracers.e)
+    u, v, w = velocities
+
+    Qᵇᵋ      = closure.minimum_convective_buoyancy_flux
+    Qᵇ       = top_buoyancy_flux(i, j, grid, buoyancy, tracer_bcs, clock, merge(velocities, tracers))
+    w★       = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocity, closure, tracers.e)
+    w★²      = ℑzᵃᵃᶠ(i, j, k, grid, squared_tke, closure, tracers.e)
+    w★³      = turbulent_velocity(i, j, grid.Nz, grid, closure, tracers.e)^3
+    S²       = shearᶜᶜᶠ(i, j, k, grid, u, v)
+    N²       = ∂z_b(i, j, k, grid, buoyancy, tracers)
+    N²_above = ∂z_b(i, j, k+1, grid, buoyancy, tracers)
+
+    #w★³ = ℑzᵃᵃᶠ(i, j, k, grid, three_halves_tke, closure, tracers.e)
 
     # "Convective length"
     # ℓᶜ ∼ boundary layer depth according to Deardorff scaling
-    ℓᶜ = Cᶜ * w★³ / Qᵇ
+    ℓᶜ = Cᶜ * w★³ / (Qᵇ + Qᵇᵋ)
+    ℓᶜ = ifelse(isnan(ℓᶜ), zero(grid), ℓᶜ)
+
+    # Figure out which mixing length applies
+    convecting = (Qᵇ > Qᵇᵋ) & (N² < 0)
+
+    # Model for shear-convection interaction
+    Sc = sqrt(S²) * w★² / (Qᵇ + Qᵇᵋ) # Sc = "Sheared convection number"
+    ϵᶜˢ = 1 - Cˢᶜ * Sc               # ϵ = Sheared convection factor
+    
+    # Reduce convective and entraining mixing lengths by sheared convection factor
+    # and ensure non-negativity
+    ℓᶜ = clip(ϵᶜˢ * ℓᶜ)
 
     # "Entrainment length"
     # Ensures that w′b′ ~ Qᵇ at entrainment depth
-    ℓᵉ = Cᵉ * Qᵇ / (w★ * N²)
+    ℓᵉ = Cᵉ * Qᵇ / (w★ * N² + Qᵇᵋ)
+    ℓᵉ = clip(ϵᶜˢ * ℓᵉ)
+    
+    entraining = (Qᵇ > Qᵇᵋ) & (N² > 0) & (N²_above < 0)
+
+    ℓ = ifelse(convecting, ℓᶜ,
+        ifelse(entraining, ℓᵉ, zero(grid)))
+
+    return ifelse(isnan(ℓ), zero(grid), ℓ)
+end
+
+@inline function convective_length_scaleᶜᶜᶜ(i, j, k, grid, closure, Cᶜ::Number, Cᵉ::Number, Cˢᶜ::Number,
+                                            velocities, tracers, buoyancy, clock, tracer_bcs)
+
+    u, v, w = velocities
+
+    Qᵇᵋ      = closure.minimum_convective_buoyancy_flux
+    Qᵇ       = top_buoyancy_flux(i, j, grid, buoyancy, tracer_bcs, clock, merge(velocities, tracers))
+    w★       = turbulent_velocity(i, j, k, grid, closure, tracers.e)
+    w★²      = turbulent_velocity(i, j, k, grid, closure, tracers.e)^2
+    w★³      = turbulent_velocity(i, j, grid.Nz, grid, closure, tracers.e)^3
+    S²       = shearᶜᶜᶜ(i, j, k, grid, u, v)
+    N²       = ℑzᵃᵃᶜ(i, j, k, grid, ∂z_b, buoyancy, tracers)
+    N²_above = ℑzᵃᵃᶜ(i, j, k+1, grid, ∂z_b, buoyancy, tracers)
+
+    #w★³ = ℑzᵃᵃᶠ(i, j, k, grid, three_halves_tke, closure, tracers.e)
+
+    # "Convective length"
+    # ℓᶜ ∼ boundary layer depth according to Deardorff scaling
+    ℓᶜ = Cᶜ * w★³ / (Qᵇ + Qᵇᵋ)
+    ℓᶜ = ifelse(isnan(ℓᶜ), zero(grid), ℓᶜ)
 
     # Figure out which mixing length applies
-    N²_above = ∂z_b(i, j, k+1, grid, buoyancy, tracers) # buoyancy frequency
-    convecting = (Qᵇ > 0) & (N² < 0)
-    entraining = (Qᵇ > 0) & (N² > 0) & (N²_above < 0)
+    convecting = (Qᵇ > Qᵇᵋ) & (N² < 0)
 
     # Model for shear-convection interaction
-    S² = shearᶜᶜᶠ(i, j, k, grid, velocities)
-    Sc = sqrt(S²) * w★² / Qᵇ # Cs = "Sheared convection number"
-    ϵᶜˢ = 1 - Cˢᶜ * Sc       # ϵ = Sheared convection factor
+    Sc = sqrt(S²) * w★² / (Qᵇ + Qᵇᵋ) # Sc = "Sheared convection number"
+    ϵᶜˢ = 1 - Cˢᶜ * Sc               # ϵ = Sheared convection factor
     
     # Reduce convective and entraining mixing lengths by sheared convection factor
     # end ensure non-negativity
     ℓᶜ = clip(ϵᶜˢ * ℓᶜ)
+
+    # "Entrainment length"
+    # Ensures that w′b′ ~ Qᵇ at entrainment depth
+    ℓᵉ = Cᵉ * Qᵇ / (w★ * N² + Qᵇᵋ)
     ℓᵉ = clip(ϵᶜˢ * ℓᵉ)
+    
+    entraining = (Qᵇ > Qᵇᵋ) & (N² > 0) & (N²_above < 0)
 
     ℓ = ifelse(convecting, ℓᶜ,
         ifelse(entraining, ℓᵉ, zero(grid)))
@@ -102,9 +170,9 @@ end
 
 """Piecewise linear function between 0 (when x < c) and 1 (when x - c > w)."""
 @inline step(x, c, w) = max(zero(x), min(one(x), (x - c) / w))
-@inline scale(Ri, σ⁻, σ⁺, c, w)    = σ⁻ + (σ⁺ - σ⁻) * step(Ri, c, w)
+@inline scale(Ri, σ⁻, σ⁺, c, w) = σ⁻ + (σ⁺ - σ⁻) * step(Ri, c, w)
 
-@inline function stable_mixing_scaleᶜᶜᶠ(i, j, k, grid, C⁻, C⁺, closure, velocities, tracers, buoyancy)
+@inline function stability_functionᶜᶜᶠ(i, j, k, grid, closure, C⁻, C⁺, velocities, tracers, buoyancy)
     Ri = Riᶜᶜᶠ(i, j, k, grid, velocities, tracers, buoyancy)
     CRiᶜ = closure.mixing_length.CRiᶜ
     CRiʷ = closure.mixing_length.CRiʷ
@@ -114,71 +182,72 @@ end
 @inline function momentum_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, clock, tracer_bcs)
     C⁻ = closure.mixing_length.C⁻u
     C⁺ = closure.mixing_length.C⁺u
-    σ = stable_mixing_scaleᶜᶜᶠ(i, j, k, grid, C⁻, C⁺, closure, velocities, tracers, buoyancy)
+    σ = stability_functionᶜᶜᶠ(i, j, k, grid, closure, C⁻, C⁺, velocities, tracers, buoyancy)
 
     Cᵇ = closure.mixing_length.Cᵇ
-    Cˢ = closure.mixing_length.Cˢ
-    ℓ★ = σ * stable_mixing_lengthᶜᶜᶠ(i, j, k, grid, Cᵇ, Cˢ, tracers.e, velocities, tracers, buoyancy)
+    ℓ★ = σ * stable_length_scaleᶜᶜᶠ(i, j, k, grid, closure, Cᵇ, tracers.e, velocities, tracers, buoyancy)
 
     ℓ★ = ifelse(isnan(ℓ★), zero(grid), ℓ★)
 
-    return min(grid.Lz, ℓ★)
+    H = total_depthᶜᶜᵃ(i, j, grid)
+    return min(H, ℓ★)
 end
 
 @inline function tracer_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, clock, tracer_bcs)
     Cᶜ  = closure.mixing_length.Cᶜc
     Cᵉ  = closure.mixing_length.Cᵉc
     Cˢᶜ = closure.mixing_length.Cˢᶜ
-    ℓʰ = convective_mixing_lengthᶜᶜᶠ(i, j, k, grid, Cᶜ, Cᵉ, Cˢᶜ, velocities, tracers, buoyancy, clock, tracer_bcs)
+    ℓʰ = convective_length_scaleᶜᶜᶠ(i, j, k, grid, closure, Cᶜ, Cᵉ, Cˢᶜ, velocities, tracers, buoyancy, clock, tracer_bcs)
 
     C⁻ = closure.mixing_length.C⁻c
     C⁺ = closure.mixing_length.C⁺c
-    σ = stable_mixing_scaleᶜᶜᶠ(i, j, k, grid, C⁻, C⁺, closure, velocities, tracers, buoyancy)
+    σ = stability_functionᶜᶜᶠ(i, j, k, grid, closure, C⁻, C⁺, velocities, tracers, buoyancy)
 
     Cᵇ = closure.mixing_length.Cᵇ
-    Cˢ = closure.mixing_length.Cˢ
-    ℓ★ = σ * stable_mixing_lengthᶜᶜᶠ(i, j, k, grid, Cᵇ, Cˢ, tracers.e, velocities, tracers, buoyancy)
+    ℓ★ = σ * stable_length_scaleᶜᶜᶠ(i, j, k, grid, closure, Cᵇ, tracers.e, velocities, tracers, buoyancy)
 
     ℓʰ = ifelse(isnan(ℓʰ), zero(grid), ℓʰ)
     ℓ★ = ifelse(isnan(ℓ★), zero(grid), ℓ★)
 
-    return min(grid.Lz, ℓ★ + ℓʰ)
+    H = total_depthᶜᶜᵃ(i, j, grid)
+    return min(H, ℓ★ + ℓʰ)
 end
 
 @inline function TKE_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, clock, tracer_bcs)
     Cᶜ  = closure.mixing_length.Cᶜe
     Cᵉ  = closure.mixing_length.Cᵉe
     Cˢᶜ = closure.mixing_length.Cˢᶜ
-    ℓʰ = convective_mixing_lengthᶜᶜᶠ(i, j, k, grid, Cᶜ, Cᵉ, Cˢᶜ, velocities, tracers, buoyancy, clock, tracer_bcs)
+    ℓʰ = convective_length_scaleᶜᶜᶠ(i, j, k, grid, closure, Cᶜ, Cᵉ, Cˢᶜ, velocities, tracers, buoyancy, clock, tracer_bcs)
 
     C⁻ = closure.mixing_length.C⁻e
     C⁺ = closure.mixing_length.C⁺e
-    σ = stable_mixing_scaleᶜᶜᶠ(i, j, k, grid, C⁻, C⁺, closure, velocities, tracers, buoyancy)
+    σ = stability_functionᶜᶜᶠ(i, j, k, grid, closure, C⁻, C⁺, velocities, tracers, buoyancy)
 
     Cᵇ = closure.mixing_length.Cᵇ
-    Cˢ = closure.mixing_length.Cˢ
-    ℓ★ = σ * stable_mixing_lengthᶜᶜᶠ(i, j, k, grid, Cᵇ, Cˢ, tracers.e, velocities, tracers, buoyancy)
+    ℓ★ = σ * stable_length_scaleᶜᶜᶠ(i, j, k, grid, closure, Cᵇ, tracers.e, velocities, tracers, buoyancy)
 
     ℓʰ = ifelse(isnan(ℓʰ), zero(grid), ℓʰ)
     ℓ★ = ifelse(isnan(ℓ★), zero(grid), ℓ★)
 
-    return min(grid.Lz, ℓ★ + ℓʰ)
+    H = total_depthᶜᶜᵃ(i, j, grid)
+    return min(H, ℓ★ + ℓʰ)
 end
 
-Base.show(io::IO, ML::MixingLength) =
+Base.summary(::MixingLength) = "CATKEVerticalDiffusivities.MixingLength"
+
+Base.show(io::IO, ml::MixingLength) =
     print(io, "CATKEVerticalDiffusivities.MixingLength parameters:", '\n',
-              "    Cᵇ   = $(ML.Cᵇ)",   '\n',
-              "    Cˢ   = $(ML.Cˢ)",   '\n',
-              "    Cᶜc  = $(ML.Cᶜc)",  '\n',
-              "    Cᶜe  = $(ML.Cᶜe)",  '\n',
-              "    Cᵉc  = $(ML.Cᵉc)",  '\n',
-              "    Cᵉe  = $(ML.Cᵉe)",  '\n',
-              "    C⁻u  = $(ML.C⁻u)", '\n',
-              "    C⁻c  = $(ML.C⁻c)", '\n',
-              "    C⁻e  = $(ML.C⁻e)", '\n',
-              "    C⁺u  = $(ML.C⁺u)", '\n',
-              "    C⁺c  = $(ML.C⁺c)", '\n',
-              "    C⁺e  = $(ML.C⁺e)", '\n',
-              "    CRiʷ = $(ML.CRiʷ)", '\n',
-              "    CRiᶜ = $(ML.CRiᶜ)")
+              "    Cᵇ   = $(ml.Cᵇ)",   '\n',
+              "    Cᶜc  = $(ml.Cᶜc)",  '\n',
+              "    Cᶜe  = $(ml.Cᶜe)",  '\n',
+              "    Cᵉc  = $(ml.Cᵉc)",  '\n',
+              "    Cᵉe  = $(ml.Cᵉe)",  '\n',
+              "    C⁻u  = $(ml.C⁻u)", '\n',
+              "    C⁻c  = $(ml.C⁻c)", '\n',
+              "    C⁻e  = $(ml.C⁻e)", '\n',
+              "    C⁺u  = $(ml.C⁺u)", '\n',
+              "    C⁺c  = $(ml.C⁺c)", '\n',
+              "    C⁺e  = $(ml.C⁺e)", '\n',
+              "    CRiʷ = $(ml.CRiʷ)", '\n',
+              "    CRiᶜ = $(ml.CRiᶜ)")
 
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
index af8613702c..79bbd8e216 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
@@ -23,14 +23,16 @@ end
 @inline ν_∂z_v²(i, j, k, grid, ν, v) = ℑyᵃᶠᵃ(i, j, k, grid, ν) * ∂zᶜᶠᶠ(i, j, k, grid, v)^2
 
 @inline function shear_production(i, j, k, grid, closure::FlavorOfCATKE, velocities, diffusivities)
-    Kᵘ = diffusivities.Kᵘ
+    κᵘ = diffusivities.κᵘ
     u = velocities.u
     v = velocities.v
-    return ℑxzᶜᵃᶜ(i, j, k, grid, ν_∂z_u², Kᵘ, u) + ℑyzᵃᶜᶜ(i, j, k, grid, ν_∂z_v², Kᵘ, v)
+
+    # Separate reconstruction of the u- and v- contributions is essential for numerical stability
+    return ℑxzᶜᵃᶜ(i, j, k, grid, ν_∂z_u², κᵘ, u) + ℑyzᵃᶜᶜ(i, j, k, grid, ν_∂z_v², κᵘ, v)
 end
 
 @inline function buoyancy_fluxᶜᶜᶠ(i, j, k, grid, tracers, buoyancy, diffusivities)
-    κᶻ = @inbounds diffusivities.Kᶜ[i, j, k]
+    κᶻ = @inbounds diffusivities.κᶜ[i, j, k]
     N² = ∂z_b(i, j, k, grid, buoyancy, tracers)
     return - κᶻ * N²
 end
@@ -44,14 +46,17 @@ const VITD = VerticallyImplicitTimeDiscretization
     wb = ℑzᵃᵃᶜ(i, j, k, grid, buoyancy_fluxᶜᶜᶠ, tracers, buoyancy, diffusivities)
     eⁱʲᵏ = @inbounds tracers.e[i, j, k]
 
+    dissipative_buoyancy_flux = sign(wb) * sign(eⁱʲᵏ) < 0
+
     # "Patankar trick" for buoyancy production (cf Patankar 1980 or Burchard et al. 2003)
-    # If buoyancy flux is a _sink_ of TKE, we treat it implicitly.
-    return ifelse(sign(wb) * sign(eⁱʲᵏ) < 0, zero(grid), wb)
+    # If buoyancy flux is a _sink_ of TKE, we treat it implicitly, and return zero here for
+    # the explicit buoyancy flux.
+    return ifelse(dissipative_buoyancy_flux, zero(grid), wb)
 end
 
 @inline dissipation(i, j, k, grid, closure::FlavorOfCATKE{<:VITD}, args...) = zero(grid)
 
-@inline function implicit_dissipation_coefficient(i, j, k, grid, closure::FlavorOfCATKE{<:VITD},
+@inline function implicit_dissipation_coefficient(i, j, k, grid, closure::FlavorOfCATKE,
                                                   velocities, tracers, buoyancy, clock, tracer_bcs)
     e = tracers.e
     FT = eltype(grid)
@@ -60,7 +65,8 @@ end
     Cᶜ = closure.turbulent_kinetic_energy_equation.CᶜD
     Cᵉ = closure.turbulent_kinetic_energy_equation.CᵉD
     Cˢᶜ = closure.mixing_length.Cˢᶜ
-    ℓʰ = ℑzᵃᵃᶜ(i, j, k, grid, convective_mixing_lengthᶜᶜᶠ, Cᶜ, Cᵉ, Cˢᶜ, velocities, tracers, buoyancy, clock, tracer_bcs)
+    ℓʰ = ℑzᵃᵃᶜ(i, j, k, grid, convective_length_scaleᶜᶜᶠ, closure, Cᶜ, Cᵉ, Cˢᶜ, velocities, tracers, buoyancy, clock, tracer_bcs)
+    #ℓʰ = convective_length_scaleᶜᶜᶜ(i, j, k, grid, closure, Cᶜ, Cᵉ, Cˢᶜ, velocities, tracers, buoyancy, clock, tracer_bcs)
 
     # "Stable" dissipation length
     C⁻D = closure.turbulent_kinetic_energy_equation.C⁻D
@@ -71,19 +77,18 @@ end
     σ = scale(Ri, C⁻D, C⁺D, Riᶜ, Riʷ)
 
     Cᵇ = closure.mixing_length.Cᵇ
-    Cˢ = closure.mixing_length.Cˢ
-    ℓ★ = σ * ℑzᵃᵃᶜ(i, j, k, grid, stable_mixing_lengthᶜᶜᶠ, Cᵇ, Cˢ, tracers.e, velocities, tracers, buoyancy)
+    #ℓ★ = σ * stable_length_scaleᶜᶜᶜ(i, j, k, grid, closure, Cᵇ, tracers.e, velocities, tracers, buoyancy)
+    ℓ★ = σ * ℑzᵃᵃᶜ(i, j, k, grid, stable_length_scaleᶜᶜᶠ, closure, Cᵇ, tracers.e, velocities, tracers, buoyancy)
 
     ℓʰ = ifelse(isnan(ℓʰ), zero(grid), ℓʰ)
     ℓ★ = ifelse(isnan(ℓ★), zero(grid), ℓ★)
 
     # Dissipation length
-    ℓᴰ = min(grid.Lz, ℓ★ + ℓʰ)
+    H = total_depthᶜᶜᵃ(i, j, grid)
+    ℓᴰ = min(H, ℓ★ + ℓʰ)
 
     eᵢ = @inbounds e[i, j, k]
     
-    #ℓᴰ = ifelse(eᵢ < 0, Δzᶜᶜᶜ(i, j, k, grid) / 10, ℓᴰ)
-
     # Note:
     #   Because   ∂t e + ⋯ = ⋯ + L e = ⋯ - ϵ,
     #
@@ -92,12 +97,17 @@ end
     #
     #   and thus    L = - Cᴰ √e / ℓ .
 
-    return - sqrt(abs(eᵢ)) / ℓᴰ
+    τ = closure.negative_turbulent_kinetic_energy_damping_time_scale
+
+    return ifelse(eᵢ < 0, -1/τ, -sqrt(abs(eᵢ)) / ℓᴰ)
 end
 
 # Fallbacks for explicit time discretization
-@inline dissipation(i, j, k, grid, closure::FlavorOfCATKE, velocities, tracers, args...) =
-    @inbounds - tracers.e[i, j, k] * implicit_dissipation_coefficient(i, j, k, grid, closure::FlavorOfCATKE, velocities, tracers, args...)
+@inline function dissipation(i, j, k, grid, closure::FlavorOfCATKE, velocities, tracers, args...)
+    eᵢ = @inbounds tracers.e[i, j, k]
+    L = implicit_dissipation_coefficient(i, j, k, grid, closure, velocities, tracers, args...)
+    return - L * eᵢ
+end
 
 @inline implicit_dissipation_coefficient(i, j, k, grid, closure::FlavorOfCATKE, args...) = zero(grid)
 
@@ -108,6 +118,7 @@ end
 # TODO: include shear production and buoyancy flux from AbstractScalarDiffusivity
 
 @inline shear_production(i, j, k, grid, closure, velocities, diffusivities) = zero(grid)
+
 @inline shear_production(i, j, k, grid, closures::Tuple{<:Any}, velocities, diffusivities) =
     shear_production(i, j, k, grid, closures[1], velocities, diffusivities[1])
 
@@ -121,6 +132,7 @@ end
     shear_production(i, j, k, grid, closures[3], velocities, diffusivities[3])
 
 @inline buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities) = zero(grid)
+
 @inline buoyancy_flux(i, j, k, grid, closures::Tuple{<:Any}, velocities, tracers, buoyancy, diffusivities) =
     buoyancy_flux(i, j, k, grid, closures[1], velocities, diffusivities[1])
 
@@ -137,6 +149,7 @@ end
 @inline dissipation(i, j, k, grid, closure, args...) = zero(grid)
 
 @inline dissipation(i, j, k, grid, closures::Tuple{<:Any}, args...) = dissipation(i, j, k, grid, closures[1], args...)
+
 @inline dissipation(i, j, k, grid, closures::Tuple{<:Any, <:Any}, args...) = 
     dissipation(i, j, k, grid, closures[1], args...) +
     dissipation(i, j, k, grid, closures[2], args...)
@@ -152,20 +165,19 @@ end
 
 """ Compute the flux of TKE through the surface / top boundary. """
 @inline function top_tke_flux(i, j, grid, clock, fields, parameters, closure::FlavorOfCATKE, buoyancy)
+    closure = getclosure(i, j, closure)
+
     top_tracer_bcs = parameters.top_tracer_boundary_conditions
     top_velocity_bcs = parameters.top_velocity_boundary_conditions
-    closure = getclosure(i, j, closure)
     tke_parameters = closure.turbulent_kinetic_energy_equation
 
-    return _top_tke_flux(i, j, grid, tke_parameters, closure,
-                         buoyancy, fields, top_tracer_bcs, top_velocity_bcs, clock)
+    return _top_tke_flux(i, j, grid, clock, fields, tke_parameters, closure,
+                         buoyancy, top_tracer_bcs, top_velocity_bcs)
 end
 
 """ Compute the flux of TKE through the surface / top boundary. """
 @inline top_tke_flux(i, j, grid, clock, fields, parameters, closure, buoyancy) = zero(grid)
 
-@inline top_tke_flux(i, j, grid, clock, fields, parameters, closure::Tuple{}, buoyancy) = zero(grid)
-
 @inline top_tke_flux(i, j, grid, clock, fields, parameters, closure_tuple::Tuple{<:Any}, buoyancy) =
     top_tke_flux(i, j, grid, clock, fields, parameters, closure_tuple[1], buoyancy)
 
@@ -178,8 +190,9 @@ end
     top_tke_flux(i, j, grid, clock, fields, parameters, closure_tuple[2], buoyancy) + 
     top_tke_flux(i, j, grid, clock, fields, parameters, closure_tuple[3], buoyancy)
 
-@inline function _top_tke_flux(i, j, grid, tke::TurbulentKineticEnergyEquation, closure::CATKEVD,
-                               buoyancy, fields, top_tracer_bcs, top_velocity_bcs, clock)
+@inline function _top_tke_flux(i, j, grid, clock, fields,
+                               tke::TurbulentKineticEnergyEquation, closure::CATKEVD,
+                               buoyancy, top_tracer_bcs, top_velocity_bcs)
 
     wΔ³ = top_convective_turbulent_velocity³(i, j, grid, clock, fields, buoyancy, top_tracer_bcs)
     u★ = friction_velocity(i, j, grid, clock, fields, top_velocity_bcs)
@@ -218,11 +231,11 @@ using Oceananigans.BoundaryConditions: Flux
 const TKEBoundaryFunction = DiscreteBoundaryFunction{<:TKETopBoundaryConditionParameters}
 const TKEBoundaryCondition = BoundaryCondition{<:Flux, <:TKEBoundaryFunction}
 
-@inline getbc(bc::TKEBoundaryCondition, i::Integer, j::Integer, grid::AbstractGrid, clock, model_fields, closure, buoyancy) =
-    bc.condition.func(i, j, grid, clock, model_fields, bc.condition.parameters, closure, buoyancy)
+@inline getbc(bc::TKEBoundaryCondition, i::Integer, j::Integer, grid::AbstractGrid, clock, fields, clo, buoyancy) =
+    bc.condition.func(i, j, grid, clock, fields, bc.condition.parameters, clo, buoyancy)
 
-@inline getbc(bc::TKEBoundaryCondition, i::Integer, j::Integer, k::Integer, grid::AbstractGrid, clock, model_fields, closure, buoyancy) =
-    bc.condition.func(i, j, k, grid, clock, model_fields, bc.condition.parameters, closure, buoyancy)
+@inline getbc(bc::TKEBoundaryCondition, i::Integer, j::Integer, k::Integer, grid::AbstractGrid, clock, fields, clo, buoyancy) =
+    bc.condition.func(i, j, k, grid, clock, fields, bc.condition.parameters, clo, buoyancy)
 
 #####
 ##### Utilities for model constructors
@@ -279,6 +292,7 @@ function add_closure_specific_boundary_conditions(closure::FlavorOfCATKE,
     return new_boundary_conditions
 end
 
+Base.summary(::TurbulentKineticEnergyEquation) = "CATKEVerticalDiffusivities.TurbulentKineticEnergyEquation"
 Base.show(io::IO, tke::TurbulentKineticEnergyEquation) =
     print(io, "CATKEVerticalDiffusivities.TurbulentKineticEnergyEquation parameters: \n" *
               "    C⁻D  = $(tke.C⁻D),  \n" *

From fd141898b3d80108f223cccaf2b61e5e2f9e23ba Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 20 Apr 2023 12:42:27 -0400
Subject: [PATCH 199/530] adapt CATKE to distributed

---
 src/TurbulenceClosures/TurbulenceClosures.jl  | 49 ++++++++++++++++---
 ...t_scalar_biharmonic_diffusivity_closure.jl |  4 +-
 .../abstract_scalar_diffusivity_closure.jl    | 37 +++-----------
 .../closure_kernel_operators.jl               |  2 +-
 src/TurbulenceClosures/closure_tuples.jl      | 20 ++++++--
 .../leith_enstrophy_diffusivity.jl            |  4 +-
 .../mews_vertical_diffusivity.jl              |  1 -
 .../nothing_closure.jl                        |  7 ++-
 .../scalar_biharmonic_diffusivity.jl          |  2 +-
 .../scalar_diffusivity.jl                     |  2 +-
 .../smagorinsky_lilly.jl                      |  2 +-
 .../vertically_implicit_diffusion_solver.jl   | 18 +++++--
 12 files changed, 92 insertions(+), 56 deletions(-)

diff --git a/src/TurbulenceClosures/TurbulenceClosures.jl b/src/TurbulenceClosures/TurbulenceClosures.jl
index b938a7bed5..77e31887bc 100644
--- a/src/TurbulenceClosures/TurbulenceClosures.jl
+++ b/src/TurbulenceClosures/TurbulenceClosures.jl
@@ -67,7 +67,7 @@ abstract type AbstractTurbulenceClosure{TimeDiscretization} end
 validate_closure(closure) = closure
 closure_summary(closure) = summary(closure)
 with_tracers(tracers, closure::AbstractTurbulenceClosure) = closure
-calculate_diffusivities!(K, closure::AbstractTurbulenceClosure, args...; kwargs...) = nothing
+calculate_diffusivities!(K, closure::AbstractTurbulenceClosure, args...) = nothing
 
 const ClosureKinda = Union{Nothing, AbstractTurbulenceClosure, AbstractArray{<:AbstractTurbulenceClosure}}
 add_closure_specific_boundary_conditions(closure::ClosureKinda, bcs, args...) = bcs
@@ -78,6 +78,23 @@ function buoyancy_flux end
 function dissipation end
 function hydrostatic_turbulent_kinetic_energy_tendency end
 
+#####
+##### Fallback: flux = 0
+#####
+
+for dir in (:x, :y, :z)
+    diffusive_flux = Symbol(:diffusive_flux_, dir)
+    viscous_flux_u = Symbol(:viscous_flux_u, dir)
+    viscous_flux_v = Symbol(:viscous_flux_v, dir)
+    viscous_flux_w = Symbol(:viscous_flux_w, dir)
+    @eval begin
+        @inline $diffusive_flux(i, j, k, grid, clo::AbstractTurbulenceClosure, args...) = zero(grid)
+        @inline $viscous_flux_u(i, j, k, grid, clo::AbstractTurbulenceClosure, args...) = zero(grid)
+        @inline $viscous_flux_v(i, j, k, grid, clo::AbstractTurbulenceClosure, args...) = zero(grid)
+        @inline $viscous_flux_w(i, j, k, grid, clo::AbstractTurbulenceClosure, args...) = zero(grid)
+    end
+end
+
 #####
 ##### The magic
 #####
@@ -87,12 +104,32 @@ function hydrostatic_turbulent_kinetic_energy_tendency end
 @inline getclosure(i, j, closure::AbstractVector{<:AbstractTurbulenceClosure}) = @inbounds closure[i]
 @inline getclosure(i, j, closure::AbstractTurbulenceClosure) = closure
 
-@inline surface(i, j, k, grid)                = znode(Center(), Center(), Face(), i, j, grid.Nz+1, grid)
-@inline bottom(i, j, k, grid)                 = znode(Center(), Center(), Face(), i, j, 1, grid)
-@inline depthᶜᶜᶠ(i, j, k, grid)               = surface(i, j, k, grid) - znode(Center(), Center(), Face(), i, j, k, grid)
-@inline height_above_bottomᶜᶜᶠ(i, j, k, grid) = znode(Center(), Center(), Face(), i, j, k, grid) - bottom(i, j, k, grid)
+@inline clip(x) = max(zero(x), x)
+
+const c = Center()
+const f = Face()
+
+@inline z_top(i, j, grid)          = znode(i, j, grid.Nz+1, grid, c, c, f)
+@inline z_bottom(i, j,  grid)      = znode(i, j, 1,         grid, c, c, f)
+
+@inline depthᶜᶜᶠ(i, j, k, grid)    = clip(z_top(i, j, grid) - znode(i, j, k, grid, c, c, f))
+@inline depthᶜᶜᶜ(i, j, k, grid)    = clip(z_top(i, j, grid) - znode(i, j, k, grid, c, c, c))
+@inline total_depthᶜᶜᵃ(i, j, grid) = clip(z_top(i, j, grid) - z_bottom(i, j, grid))
+
+@inline function height_above_bottomᶜᶜᶠ(i, j, k, grid)
+    Δz = Δzᶜᶜᶠ(i, j, k, grid)
+    h = znode(i, j, k, grid, c, c, f) - z_bottom(i, j, grid)
+    return max(Δz, h)
+end
+
+@inline function height_above_bottomᶜᶜᶜ(i, j, k, grid)
+    Δz = Δzᶜᶜᶜ(i, j, k, grid)
+    h = znode(i, j, k, grid, c, c, c) - z_bottom(i, j, grid)
+    return max(Δz, h)
+end
+
 @inline wall_vertical_distanceᶜᶜᶠ(i, j, k, grid) = min(depthᶜᶜᶠ(i, j, k, grid), height_above_bottomᶜᶜᶠ(i, j, k, grid))
-@inline opposite_wall_vertical_distanceᶜᶜᶠ(i, j, k, grid) = max(depthᶜᶜᶠ(i, j, k, grid), height_above_bottomᶜᶜᶠ(i, j, k, grid))
+@inline wall_vertical_distanceᶜᶜᶜ(i, j, k, grid) = min(depthᶜᶜᶜ(i, j, k, grid), height_above_bottomᶜᶜᶜ(i, j, k, grid))
 
 include("discrete_diffusion_function.jl")
 include("implicit_explicit_time_discretization.jl")
diff --git a/src/TurbulenceClosures/abstract_scalar_biharmonic_diffusivity_closure.jl b/src/TurbulenceClosures/abstract_scalar_biharmonic_diffusivity_closure.jl
index 5f68396ee3..09b7cc7d45 100644
--- a/src/TurbulenceClosures/abstract_scalar_biharmonic_diffusivity_closure.jl
+++ b/src/TurbulenceClosures/abstract_scalar_biharmonic_diffusivity_closure.jl
@@ -1,9 +1,9 @@
 using Oceananigans.Grids: peripheral_node
 
 """
-    abstract type AbstractScalarDiffusivity <: AbstractTurbulenceClosure end
+    abstract type AbstractScalarBiharmonicDiffusivity <: AbstractTurbulenceClosure end
 
-Abstract type for closures with *isotropic* diffusivities.
+Abstract type for closures with scalar biharmonic diffusivities.
 """
 abstract type AbstractScalarBiharmonicDiffusivity{F} <: AbstractTurbulenceClosure{ExplicitTimeDiscretization} end
 
diff --git a/src/TurbulenceClosures/abstract_scalar_diffusivity_closure.jl b/src/TurbulenceClosures/abstract_scalar_diffusivity_closure.jl
index aa9d25fb2f..3f6ad1a504 100644
--- a/src/TurbulenceClosures/abstract_scalar_diffusivity_closure.jl
+++ b/src/TurbulenceClosures/abstract_scalar_diffusivity_closure.jl
@@ -3,7 +3,7 @@ using Oceananigans.Operators: ℑxyᶠᶠᵃ, ℑxzᶠᵃᶠ, ℑyzᵃᶠᶠ
 """
     abstract type AbstractScalarDiffusivity <: AbstractTurbulenceClosure end
 
-Abstract type for closures with *isotropic* diffusivities.
+Abstract type for closures with scalar diffusivities.
 """
 abstract type AbstractScalarDiffusivity{TD, F} <: AbstractTurbulenceClosure{TD} end
 
@@ -147,27 +147,6 @@ const C = Center
 @inline h_diffusivity(i, j, k, grid, ::F, ::C, ::F, closure::ASD, K, ::Nothing, args...) = νhᶠᶜᶠ(i, j, k, grid, closure, K, args...)
 @inline h_diffusivity(i, j, k, grid, ::C, ::F, ::F, closure::ASD, K, ::Nothing, args...) = νhᶜᶠᶠ(i, j, k, grid, closure, K, args...)
 
-#####
-##### Stress divergences
-#####
-
-#####
-##### Fallback: flux = 0
-#####
-
-for dir in (:x, :y, :z)
-    diffusive_flux = Symbol(:diffusive_flux_, dir)
-    viscous_flux_u = Symbol(:viscous_flux_u, dir)
-    viscous_flux_v = Symbol(:viscous_flux_v, dir)
-    viscous_flux_w = Symbol(:viscous_flux_w, dir)
-    @eval begin
-        @inline $diffusive_flux(i, j, k, grid, args...) = zero(grid)
-        @inline $viscous_flux_u(i, j, k, grid, args...) = zero(grid)
-        @inline $viscous_flux_v(i, j, k, grid, args...) = zero(grid)
-        @inline $viscous_flux_w(i, j, k, grid, args...) = zero(grid)
-    end
-end
-
 
 # Horizontal viscous fluxes for isotropic diffusivities
 @inline ν_σᶜᶜᶜ(i, j, k, grid, closure, K, clock, fields, σᶜᶜᶜ, args...) = νᶜᶜᶜ(i, j, k, grid, closure, K, clock, fields) * σᶜᶜᶜ(i, j, k, grid, args...)
@@ -325,14 +304,14 @@ const Lᶜᶜᶠ = Tuple{Center, Center, Face}
 const c = Center()
 const f = Face()
 
-@inline νᶜᶜᶜ(i, j, k, grid, loc, ν::F, clock, args...) where F<:Function = ν(node(c, c, c, i, j, k, grid)..., clock.time)
-@inline νᶠᶜᶠ(i, j, k, grid, loc, ν::F, clock, args...) where F<:Function = ν(node(f, c, f, i, j, k, grid)..., clock.time)
-@inline νᶜᶠᶠ(i, j, k, grid, loc, ν::F, clock, args...) where F<:Function = ν(node(c, f, f, i, j, k, grid)..., clock.time)
-@inline νᶠᶠᶜ(i, j, k, grid, loc, ν::F, clock, args...) where F<:Function = ν(node(f, f, c, i, j, k, grid)..., clock.time)
+@inline νᶜᶜᶜ(i, j, k, grid, loc, ν::F, clock, args...) where F<:Function = ν(node(i, j, k, grid, c, c, c)..., clock.time)
+@inline νᶠᶜᶠ(i, j, k, grid, loc, ν::F, clock, args...) where F<:Function = ν(node(i, j, k, grid, f, c, f)..., clock.time)
+@inline νᶜᶠᶠ(i, j, k, grid, loc, ν::F, clock, args...) where F<:Function = ν(node(i, j, k, grid, c, f, f)..., clock.time)
+@inline νᶠᶠᶜ(i, j, k, grid, loc, ν::F, clock, args...) where F<:Function = ν(node(i, j, k, grid, f, f, c)..., clock.time)
 
-@inline κᶠᶜᶜ(i, j, k, grid, loc, κ::F, clock, args...) where F<:Function = κ(node(f, c, c, i, j, k, grid)..., clock.time)
-@inline κᶜᶠᶜ(i, j, k, grid, loc, κ::F, clock, args...) where F<:Function = κ(node(c, f, c, i, j, k, grid)..., clock.time)
-@inline κᶜᶜᶠ(i, j, k, grid, loc, κ::F, clock, args...) where F<:Function = κ(node(c, c, f, i, j, k, grid)..., clock.time)
+@inline κᶠᶜᶜ(i, j, k, grid, loc, κ::F, clock, args...) where F<:Function = κ(node(i, j, k, grid, f, c, c)..., clock.time)
+@inline κᶜᶠᶜ(i, j, k, grid, loc, κ::F, clock, args...) where F<:Function = κ(node(i, j, k, grid, c, f, c)..., clock.time)
+@inline κᶜᶜᶠ(i, j, k, grid, loc, κ::F, clock, args...) where F<:Function = κ(node(i, j, k, grid, c, c, f)..., clock.time)
 
 # "DiscreteDiffusionFunction"
 @inline νᶜᶜᶜ(i, j, k, grid, loc, ν::DiscreteDiffusionFunction, clock, fields) = getdiffusivity(ν, i, j, k, grid, (c, c, c), clock, fields)
diff --git a/src/TurbulenceClosures/closure_kernel_operators.jl b/src/TurbulenceClosures/closure_kernel_operators.jl
index e325eec7ab..cb6afb6707 100644
--- a/src/TurbulenceClosures/closure_kernel_operators.jl
+++ b/src/TurbulenceClosures/closure_kernel_operators.jl
@@ -15,7 +15,7 @@ using Oceananigans.Operators: Δy_qᶠᶜᶜ, Δx_qᶜᶠᶜ, Δx_qᶠᶜᶜ
 @inline _diffusive_flux_y(args...) = diffusive_flux_y(args...)
 @inline _diffusive_flux_z(args...) = diffusive_flux_z(args...)
 
-#####                                                            
+#####
 ##### Viscous flux divergences
 #####
 
diff --git a/src/TurbulenceClosures/closure_tuples.jl b/src/TurbulenceClosures/closure_tuples.jl
index d75186dd69..791e13fb65 100644
--- a/src/TurbulenceClosures/closure_tuples.jl
+++ b/src/TurbulenceClosures/closure_tuples.jl
@@ -18,8 +18,17 @@ end
 ##### Kernel functions
 #####
 
-funcs     = [:∂ⱼ_τ₁ⱼ, :∂ⱼ_τ₂ⱼ, :∂ⱼ_τ₃ⱼ, :∇_dot_qᶜ, :maybe_tupled_ivd_upper_diagonal, :maybe_tupled_ivd_lower_diagonal, :maybe_tupled_implicit_linear_term]
-alt_funcs = [:∂ⱼ_τ₁ⱼ, :∂ⱼ_τ₂ⱼ, :∂ⱼ_τ₃ⱼ, :∇_dot_qᶜ, :ivd_upper_diagonal, :ivd_lower_diagonal, :implicit_linear_term]
+diffusive_fluxes = (:diffusive_flux_x, :diffusive_flux_y, :diffusive_flux_z)
+
+viscous_fluxes   = (:viscous_flux_ux, :viscous_flux_uy, :viscous_flux_uz,
+                    :viscous_flux_vx, :viscous_flux_vy, :viscous_flux_vz,
+                    :viscous_flux_wx, :viscous_flux_wy, :viscous_flux_wz)
+
+divergences     = [:∂ⱼ_τ₁ⱼ, :∂ⱼ_τ₂ⱼ, :∂ⱼ_τ₃ⱼ, :∇_dot_qᶜ, :maybe_tupled_ivd_upper_diagonal, :maybe_tupled_ivd_lower_diagonal, :maybe_tupled_implicit_linear_coefficient]
+alt_divergences = [:∂ⱼ_τ₁ⱼ, :∂ⱼ_τ₂ⱼ, :∂ⱼ_τ₃ⱼ, :∇_dot_qᶜ, :ivd_upper_diagonal,              :ivd_lower_diagonal,              :implicit_linear_coefficient]
+
+funcs     = [divergences...,     diffusive_fluxes..., viscous_fluxes...]
+alt_funcs = [alt_divergences..., diffusive_fluxes..., viscous_fluxes...]
 
 for (f, alt_f) in zip(funcs, alt_funcs)
     @eval begin
@@ -54,16 +63,17 @@ for (f, alt_f) in zip(funcs, alt_funcs)
     end
 end
 
+
 #####
 ##### Utilities
 #####
 
 with_tracers(tracers, closure_tuple::Tuple) = Tuple(with_tracers(tracers, closure) for closure in closure_tuple)
 
-function calculate_diffusivities!(diffusivity_fields_tuple, closure_tuple::Tuple, args...; kwargs...)
+function calculate_diffusivities!(diffusivity_fields_tuple, closure_tuple::Tuple, args...)
     for (α, closure) in enumerate(closure_tuple)
-        @inbounds diffusivity_fields = diffusivity_fields_tuple[α]
-        calculate_diffusivities!(diffusivity_fields, closure, args...; kwargs...)
+        diffusivity_fields = diffusivity_fields_tuple[α]
+        calculate_diffusivities!(diffusivity_fields, closure, args...)
     end
     return nothing
 end
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl
index a6f1994ec2..4717d07f27 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl
@@ -104,8 +104,8 @@ function calculate_diffusivities!(diffusivity_fields, closure::TwoDimensionalLei
     return nothing
 end
 
-"Return the filter width for a Leith Diffusivity on a regular rectilinear grid."
-@inline Δᶠ(i, j, k, grid::RectilinearGrid, ::TwoDimensionalLeith) = sqrt(Δxᶜᶜᶜ(i, j, k, grid) * Δyᶜᶜᶜ(i, j, k, grid)) 
+"Return the filter width for a Leith Diffusivity on a general grid."
+@inline Δᶠ(i, j, k, grid, ::TwoDimensionalLeith) = sqrt(Δxᶜᶜᶜ(i, j, k, grid) * Δyᶜᶜᶜ(i, j, k, grid)) 
 
 function DiffusivityFields(grid, tracer_names, bcs, ::TwoDimensionalLeith)
     default_eddy_viscosity_bcs = (; νₑ = FieldBoundaryConditions(grid, (Center, Center, Center)))
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/mews_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/mews_vertical_diffusivity.jl
index 44ba6fb265..ba6c72b656 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/mews_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/mews_vertical_diffusivity.jl
@@ -19,7 +19,6 @@ using Oceananigans.Coriolis: fᶠᶠᵃ
 
 using Oceananigans.TurbulenceClosures:
     wall_vertical_distanceᶜᶜᶠ,
-    opposite_wall_vertical_distanceᶜᶜᶠ,
     getclosure,
     AbstractScalarDiffusivity,
     VerticallyImplicitTimeDiscretization,
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/nothing_closure.jl b/src/TurbulenceClosures/turbulence_closure_implementations/nothing_closure.jl
index 4e6df0477b..0cda1ee731 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/nothing_closure.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/nothing_closure.jl
@@ -3,5 +3,8 @@
 @inline ∂ⱼ_τ₂ⱼ(i, j, k, grid::AbstractGrid{FT}, ::Nothing, args...) where FT = zero(FT)
 @inline ∂ⱼ_τ₃ⱼ(i, j, k, grid::AbstractGrid{FT}, ::Nothing, args...) where FT = zero(FT)
 
-calculate_diffusivities!(diffusivities, ::Nothing, args...; kwargs...) = nothing
-calculate_diffusivities!(::Nothing, ::Nothing, args...; kwargs...) = nothing
+calculate_diffusivities!(diffusivities, ::Nothing, args...) = nothing
+calculate_diffusivities!(::Nothing, ::Nothing, args...) = nothing
+
+@inline viscosity(::Nothing, ::Nothing) = 0
+@inline diffusivity(::Nothing, ::Nothing, ::Val{id}) where id = 0
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/scalar_biharmonic_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/scalar_biharmonic_diffusivity.jl
index e21eddf670..0c606667ef 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/scalar_biharmonic_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/scalar_biharmonic_diffusivity.jl
@@ -71,7 +71,7 @@ end
 @inline viscosity(closure::ScalarBiharmonicDiffusivity, K) = closure.ν
 @inline diffusivity(closure::ScalarBiharmonicDiffusivity, K, ::Val{id}) where id = closure.κ[id]
 
-calculate_diffusivities!(diffusivities, closure::ScalarBiharmonicDiffusivity, args...; kwargs...) = nothing
+calculate_diffusivities!(diffusivities, closure::ScalarBiharmonicDiffusivity, args...) = nothing
 
 function Base.summary(closure::ScalarBiharmonicDiffusivity)
     F = summary(formulation(closure))
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/scalar_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/scalar_diffusivity.jl
index 6995ff2a0f..1f9ac99107 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/scalar_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/scalar_diffusivity.jl
@@ -111,7 +111,7 @@ end
 @inline viscosity(closure::ScalarDiffusivity, K) = closure.ν
 @inline diffusivity(closure::ScalarDiffusivity, K, ::Val{id}) where id = closure.κ[id]
 
-calculate_diffusivities!(diffusivities, ::ScalarDiffusivity, args...; kwargs...) = nothing
+calculate_diffusivities!(diffusivities, ::ScalarDiffusivity, args...) = nothing
 
 # Note: we could compute ν and κ (if they are Field):
 # function calculate_diffusivities!(diffusivities, closure::ScalarDiffusivity, args...)
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl b/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl
index bb53356a2d..5ad9e84e92 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl
@@ -106,7 +106,7 @@ filter width `Δᶠ`, and strain tensor dot product `Σ²`.
 end
 
 
-function calculate_diffusivities!(diffusivity_fields, closure::SmagorinskyLilly, model; kwargs...)
+function calculate_diffusivities!(diffusivity_fields, closure::SmagorinskyLilly, model)
 
     arch = model.architecture
     grid = model.grid
diff --git a/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl b/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
index d33e658fc8..c7234ed400 100644
--- a/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
+++ b/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
@@ -2,6 +2,8 @@ using Oceananigans.Operators: Δzᵃᵃᶜ, Δzᵃᵃᶠ
 using Oceananigans.AbstractOperations: flip
 using Oceananigans.Solvers: BatchedTridiagonalSolver, solve!
 
+import Oceananigans.Solvers: get_coefficient
+
 #####
 ##### implicit_step! interface
 #####
@@ -123,13 +125,19 @@ function implicit_diffusion_solver(::VerticallyImplicitTimeDiscretization, grid)
                                  "grids that are Bounded in the z-direction.")
 
     z_solver = BatchedTridiagonalSolver(grid;
-                                        lower_diagonal = maybe_tupled_ivd_lower_diagonal,
-                                        diagonal = ivd_diagonal,
-                                        upper_diagonal = maybe_tupled_ivd_upper_diagonal)
+                                        lower_diagonal = Val(:maybe_tupled_ivd_lower_diagonal),
+                                        diagonal       = Val(:ivd_diagonal),
+                                        upper_diagonal = Val(:maybe_tupled_ivd_upper_diagonal))
 
     return z_solver
 end
 
+# Extend the `get_coefficient` function to retrieve the correct `ivd_diagonal`, `ivd_lower_diagonal` and `ivd_upper_diagonal` functions
+# REMEMBER: `get_coefficient(f::Function, args...)` leads to massive performance decrease on the CPU (https://github.com/CliMA/Oceananigans.jl/issues/2996) 
+@inline get_coefficient(::Val{:maybe_tupled_ivd_lower_diagonal}, i, j, k, grid, p, args...) = maybe_tupled_ivd_lower_diagonal(i, j, k, grid, args...)
+@inline get_coefficient(::Val{:maybe_tupled_ivd_upper_diagonal}, i, j, k, grid, p, args...) = maybe_tupled_ivd_upper_diagonal(i, j, k, grid, args...)
+@inline get_coefficient(::Val{:ivd_diagonal}, i, j, k, grid, p, args...) = ivd_diagonal(i, j, k, grid, args...)
+
 #####
 ##### Implicit step functions
 #####
@@ -179,7 +187,7 @@ function implicit_step!(field::Field,
     if closure isa Tuple
         closure_tuple = closure
         N = length(closure_tuple)
-        vi_closure = Tuple(closure[n] for n = 1:N if is_vertically_implicit(closure[n]))
+        vi_closure            = Tuple(closure[n]            for n = 1:N if is_vertically_implicit(closure[n]))
         vi_diffusivity_fields = Tuple(diffusivity_fields[n] for n = 1:N if is_vertically_implicit(closure[n]))
     else
         vi_closure = closure
@@ -188,6 +196,6 @@ function implicit_step!(field::Field,
 
     return solve!(field, implicit_solver, field,
                   # ivd_*_diagonal gets called with these args after (i, j, k, grid):
-                  vi_closure, vi_diffusivity_fields, tracer_index, instantiate.(loc)..., clock, Δt, κz)
+                  vi_closure, vi_diffusivity_fields, tracer_index, map(ℓ -> ℓ(), loc)..., clock, Δt, κz)
 end
 

From d70873028d5dd40486c36822fd959a14155a6881 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 20 Apr 2023 12:42:41 -0400
Subject: [PATCH 200/530] adapt CATKE to distributed

---
 .../CATKEVerticalDiffusivities.jl                 | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
index 39076d3628..f2f2b880c4 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
@@ -232,7 +232,7 @@ end
 
 @inline clip(x) = max(zero(x), x)
 
-function calculate_diffusivities!(diffusivities, closure::FlavorOfCATKE, model)
+function calculate_diffusivities!(diffusivities, closure::FlavorOfCATKE, model; kernel_size = κ_kernel_size(model.grid), kernel_offsets = κ_kernel_offsets(model.grid))
 
     arch = model.architecture
     grid = model.grid
@@ -242,15 +242,20 @@ function calculate_diffusivities!(diffusivities, closure::FlavorOfCATKE, model)
     clock = model.clock
     top_tracer_bcs = NamedTuple(c => tracers[c].boundary_conditions.top for c in propertynames(tracers))
 
-    launch!(arch, grid, :xyz,
+    launch!(arch, grid, kernel_size,
             calculate_CATKE_diffusivities!,
-            diffusivities, grid, closure, velocities, tracers, buoyancy, clock, top_tracer_bcs)
+            diffusivities, kernel_offsets, grid, closure, velocities, tracers, buoyancy, clock, top_tracer_bcs)
 
     return nothing
 end
 
-@kernel function calculate_CATKE_diffusivities!(diffusivities, grid, closure::FlavorOfCATKE, velocities, tracers, buoyancy, clock, top_tracer_bcs)
-    i, j, k, = @index(Global, NTuple)
+@kernel function calculate_CATKE_diffusivities!(diffusivities, offs, grid, closure::FlavorOfCATKE, velocities, tracers, buoyancy, clock, top_tracer_bcs)
+
+    i′, j′, k′ = @index(Global, NTuple)
+
+    i = i′ + offs[1] 
+    j = j′ + offs[2] 
+    k = k′ + offs[3]
 
     # Ensure this works with "ensembles" of closures, in addition to ordinary single closures
     closure_ij = getclosure(i, j, closure)

From 5da2ddf43b62f60786217839a25f01682a92948e Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 20 Apr 2023 13:49:29 -0400
Subject: [PATCH 201/530] bugfix

---
 .../CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl  | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
index f2f2b880c4..7b5a9d9106 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
@@ -22,7 +22,9 @@ using Oceananigans.TurbulenceClosures:
     time_discretization,
     AbstractScalarDiffusivity,
     VerticallyImplicitTimeDiscretization,
-    VerticalFormulation
+    VerticalFormulation,
+    κ_kernel_size,
+    κ_kernel_offsets
 
 import Oceananigans.BoundaryConditions: getbc
 import Oceananigans.Utils: with_tracers

From 7154eb00ad5fd04a35521755df8c814e567ae009 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 20 Apr 2023 15:14:38 -0400
Subject: [PATCH 202/530] small bugfix

---
 src/TurbulenceClosures/TurbulenceClosures.jl | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/TurbulenceClosures/TurbulenceClosures.jl b/src/TurbulenceClosures/TurbulenceClosures.jl
index 77e31887bc..7ea8a30814 100644
--- a/src/TurbulenceClosures/TurbulenceClosures.jl
+++ b/src/TurbulenceClosures/TurbulenceClosures.jl
@@ -109,22 +109,22 @@ end
 const c = Center()
 const f = Face()
 
-@inline z_top(i, j, grid)          = znode(i, j, grid.Nz+1, grid, c, c, f)
-@inline z_bottom(i, j,  grid)      = znode(i, j, 1,         grid, c, c, f)
+@inline z_top(i, j, grid)          = znode(c, c, f, i, j, grid.Nz+1, grid)
+@inline z_bottom(i, j,  grid)      = znode(c, c, f, i, j, 1,         grid)
 
-@inline depthᶜᶜᶠ(i, j, k, grid)    = clip(z_top(i, j, grid) - znode(i, j, k, grid, c, c, f))
-@inline depthᶜᶜᶜ(i, j, k, grid)    = clip(z_top(i, j, grid) - znode(i, j, k, grid, c, c, c))
+@inline depthᶜᶜᶠ(i, j, k, grid)    = clip(z_top(i, j, grid) - znode(c, c, f, i, j, k, grid))
+@inline depthᶜᶜᶜ(i, j, k, grid)    = clip(z_top(i, j, grid) - znode(c, c, c, i, j, k, grid))
 @inline total_depthᶜᶜᵃ(i, j, grid) = clip(z_top(i, j, grid) - z_bottom(i, j, grid))
 
 @inline function height_above_bottomᶜᶜᶠ(i, j, k, grid)
     Δz = Δzᶜᶜᶠ(i, j, k, grid)
-    h = znode(i, j, k, grid, c, c, f) - z_bottom(i, j, grid)
+    h = znode(c, c, f, i, j, k, grid) - z_bottom(i, j, grid)
     return max(Δz, h)
 end
 
 @inline function height_above_bottomᶜᶜᶜ(i, j, k, grid)
     Δz = Δzᶜᶜᶜ(i, j, k, grid)
-    h = znode(i, j, k, grid, c, c, c) - z_bottom(i, j, grid)
+    h = znode(c, c, c, i, j, k, grid) - z_bottom(i, j, grid)
     return max(Δz, h)
 end
 

From 016f4fa34853d4c4d9a8b19345e325bd4b8cb4ff Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 20 Apr 2023 18:25:36 -0400
Subject: [PATCH 203/530] bugfix

---
 .../update_hydrostatic_pressure.jl                   | 12 ++++++------
 src/TurbulenceClosures/TurbulenceClosures.jl         |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
index 170197d02f..5f315e3dc5 100644
--- a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
+++ b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
@@ -39,19 +39,19 @@ using Oceananigans.Grids: topology
 @inline function p_kernel_size(grid) 
     Nx, Ny, _ = size(grid)
 
-    Tx, Ty, _ = topology(grid)
+    TX, TY, _ = topology(grid)
 
-    Ax = Tx == Flat ? Nx : Nx + 2 
-    Ay = Ty == Flat ? Ny : Ny + 2 
+    Ax = TX == Flat ? Nx : Nx + 2 
+    Ay = TY == Flat ? Ny : Ny + 2 
 
     return (Ax, Ay)
 end
 
 @inline function p_kernel_offsets(grid)
-    Tx, Ty, _ = topology(grid)
+    TX, TY, _ = topology(grid)
 
-    Ax = Tx == Flat ? 0 : - 1 
-    Ay = Ty == Flat ? 0 : - 1 
+    Ax = TX == Flat ? 0 : - 1 
+    Ay = TY == Flat ? 0 : - 1 
 
     return (Ax, Ay)
 end
diff --git a/src/TurbulenceClosures/TurbulenceClosures.jl b/src/TurbulenceClosures/TurbulenceClosures.jl
index 7ea8a30814..034b2f356c 100644
--- a/src/TurbulenceClosures/TurbulenceClosures.jl
+++ b/src/TurbulenceClosures/TurbulenceClosures.jl
@@ -67,7 +67,7 @@ abstract type AbstractTurbulenceClosure{TimeDiscretization} end
 validate_closure(closure) = closure
 closure_summary(closure) = summary(closure)
 with_tracers(tracers, closure::AbstractTurbulenceClosure) = closure
-calculate_diffusivities!(K, closure::AbstractTurbulenceClosure, args...) = nothing
+calculate_diffusivities!(K, closure::AbstractTurbulenceClosure, args...; kwargs...) = nothing
 
 const ClosureKinda = Union{Nothing, AbstractTurbulenceClosure, AbstractArray{<:AbstractTurbulenceClosure}}
 add_closure_specific_boundary_conditions(closure::ClosureKinda, bcs, args...) = bcs

From b93d363d13d858def4e6027191d677cc79c91158 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 20 Apr 2023 21:24:29 -0400
Subject: [PATCH 204/530] corrected bug

---
 src/TurbulenceClosures/turbulence_closure_utils.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_utils.jl b/src/TurbulenceClosures/turbulence_closure_utils.jl
index c2f1e125ab..a2f83b6d3c 100644
--- a/src/TurbulenceClosures/turbulence_closure_utils.jl
+++ b/src/TurbulenceClosures/turbulence_closure_utils.jl
@@ -59,5 +59,5 @@ end
     Ay = Ty == Flat ? 0 : - 1 
     Az = Ty == Flat ? 0 : - 1 
 
-    return (Ax, Ay, 0)
+    return (Ax, Ay, Az)
 end
\ No newline at end of file

From 116a798f13a9cbe8ddea7471be02af567e6854c1 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 20 Apr 2023 22:20:48 -0400
Subject: [PATCH 205/530] bugfix

---
 src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl | 2 +-
 src/TurbulenceClosures/turbulence_closure_utils.jl        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl b/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl
index b490f788d3..d230e5e9cb 100644
--- a/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl
+++ b/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl
@@ -9,7 +9,7 @@ using Printf
 
 import Oceananigans.TurbulenceClosures: ivd_upper_diagonal,
                                         ivd_lower_diagonal,
-                                        bottom
+                                        z_bottom
 
 import Oceananigans.TurbulenceClosures: immersed_∂ⱼ_τ₁ⱼ,
                                         immersed_∂ⱼ_τ₂ⱼ,
diff --git a/src/TurbulenceClosures/turbulence_closure_utils.jl b/src/TurbulenceClosures/turbulence_closure_utils.jl
index a2f83b6d3c..495c33ef77 100644
--- a/src/TurbulenceClosures/turbulence_closure_utils.jl
+++ b/src/TurbulenceClosures/turbulence_closure_utils.jl
@@ -57,7 +57,7 @@ end
 
     Ax = Tx == Flat ? 0 : - 1 
     Ay = Ty == Flat ? 0 : - 1 
-    Az = Ty == Flat ? 0 : - 1 
+    Az = Tz == Flat ? 0 : - 1 
 
     return (Ax, Ay, Az)
 end
\ No newline at end of file

From a820f92ada0f0efd286f30150d483f81a0a02517 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 20 Apr 2023 22:56:29 -0400
Subject: [PATCH 206/530] test hypothesis

---
 src/TurbulenceClosures/turbulence_closure_utils.jl | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_utils.jl b/src/TurbulenceClosures/turbulence_closure_utils.jl
index 495c33ef77..59d8f707cd 100644
--- a/src/TurbulenceClosures/turbulence_closure_utils.jl
+++ b/src/TurbulenceClosures/turbulence_closure_utils.jl
@@ -47,9 +47,8 @@ end
 
     Ax = Tx == Flat ? Nx : Nx + 2 
     Ay = Ty == Flat ? Ny : Ny + 2 
-    Az = Tz == Flat ? Nz : Nz + 2 
 
-    return (Ax, Ay, Az)
+    return (Ax, Ay, Nz)
 end
 
 @inline function κ_kernel_offsets(grid)
@@ -57,7 +56,6 @@ end
 
     Ax = Tx == Flat ? 0 : - 1 
     Ay = Ty == Flat ? 0 : - 1 
-    Az = Tz == Flat ? 0 : - 1 
 
-    return (Ax, Ay, Az)
+    return (Ax, Ay, 0)
 end
\ No newline at end of file

From 8c3e0225f56c22bdaf3a1360e258b68495866113 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Thu, 20 Apr 2023 22:00:06 -0800
Subject: [PATCH 207/530] Use max in mixing length

---
 .../CATKEVerticalDiffusivities/mixing_length.jl          | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
index 33742611b8..961aa442a1 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
@@ -14,7 +14,7 @@ Contains mixing length parameters for CATKE vertical diffusivity.
 """
 Base.@kwdef struct MixingLength{FT}
     Cᴺ   :: FT = 0.37
-    Cᵇ   :: FT = 0.01
+    Cᵇ   :: FT = 1.0
     Cᶜc  :: FT = 4.8
     Cᶜe  :: FT = 1.1
     Cᵉc  :: FT = 0.049
@@ -204,10 +204,10 @@ end
     σ = stability_functionᶜᶜᶠ(i, j, k, grid, closure, C⁻, C⁺, velocities, tracers, buoyancy)
 
     ℓ★ = σ * stable_length_scaleᶜᶜᶠ(i, j, k, grid, closure, tracers.e, velocities, tracers, buoyancy)
-
     ℓ★ = ifelse(isnan(ℓ★), zero(grid), ℓ★)
 
     H = total_depthᶜᶜᵃ(i, j, grid)
+
     return min(H, ℓ★)
 end
 
@@ -226,7 +226,8 @@ end
     ℓ★ = ifelse(isnan(ℓ★), zero(grid), ℓ★)
 
     H = total_depthᶜᶜᵃ(i, j, grid)
-    return min(H, ℓ★ + ℓʰ)
+
+    return min(H, max(ℓ★, ℓʰ))
 end
 
 @inline function TKE_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, clock, tracer_bcs)
@@ -244,7 +245,7 @@ end
     ℓ★ = ifelse(isnan(ℓ★), zero(grid), ℓ★)
 
     H = total_depthᶜᶜᵃ(i, j, grid)
-    return min(H, ℓ★ + ℓʰ)
+    return min(H, max(ℓ★, ℓʰ))
 end
 
 Base.summary(::MixingLength) = "CATKEVerticalDiffusivities.MixingLength"

From d81db20502f3700a17c23ac74cf2890176b63568 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 21 Apr 2023 10:35:13 -0400
Subject: [PATCH 208/530] maybe it was the bottom?

---
 src/ImmersedBoundaries/ImmersedBoundaries.jl               | 4 ++--
 src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl  | 2 +-
 src/ImmersedBoundaries/partial_cell_immersed_boundaries.jl | 2 +-
 src/TurbulenceClosures/turbulence_closure_utils.jl         | 1 -
 4 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/ImmersedBoundaries/ImmersedBoundaries.jl b/src/ImmersedBoundaries/ImmersedBoundaries.jl
index 9f2ce94f16..e3b5242ba2 100644
--- a/src/ImmersedBoundaries/ImmersedBoundaries.jl
+++ b/src/ImmersedBoundaries/ImmersedBoundaries.jl
@@ -91,7 +91,7 @@ import Oceananigans.TurbulenceClosures:
     νᶠᶠᶜ,
     νᶜᶠᶠ,
     νᶠᶜᶠ,
-    bottom
+    z_bottom
 
 """
     abstract type AbstractImmersedBoundary
@@ -147,7 +147,7 @@ end
 inflate_halo_size_one_dimension(req_H, old_H, _, ::IBG)            = max(req_H + 1, old_H)
 inflate_halo_size_one_dimension(req_H, old_H, ::Type{Flat}, ::IBG) = 0
 
-@inline bottom(i, j, k, ibg::IBG) = error("The function `bottom` has not been defined for $(summary(ibg))!")
+@inline z_bottom(i, j, k, ibg::IBG) = error("The function `z_bottom` has not been defined for $(summary(ibg))!")
 
 function Base.summary(grid::ImmersedBoundaryGrid)
     FT = eltype(grid)
diff --git a/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl b/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl
index d230e5e9cb..77ab493f2d 100644
--- a/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl
+++ b/src/ImmersedBoundaries/grid_fitted_immersed_boundaries.jl
@@ -109,7 +109,7 @@ end
     return z <= h
 end
 
-@inline bottom(i, j, k, ibg::GFIBG) = @inbounds ibg.immersed_boundary.bottom_height[i, j]
+@inline z_bottom(i, j, k, ibg::GFIBG) = @inbounds ibg.immersed_boundary.bottom_height[i, j]
 
 on_architecture(arch, ib::GridFittedBottom) = GridFittedBottom(arch_array(arch, ib.bottom_height))
 Adapt.adapt_structure(to, ib::GridFittedBottom) = GridFittedBottom(adapt(to, ib.bottom_height))     
diff --git a/src/ImmersedBoundaries/partial_cell_immersed_boundaries.jl b/src/ImmersedBoundaries/partial_cell_immersed_boundaries.jl
index 0ad4c9a442..05aba20cff 100644
--- a/src/ImmersedBoundaries/partial_cell_immersed_boundaries.jl
+++ b/src/ImmersedBoundaries/partial_cell_immersed_boundaries.jl
@@ -98,4 +98,4 @@ end
 @inline Δzᶜᶠᶠ(i, j, k, ibg::PCIBG) = min(Δzᶜᶜᶠ(i, j-1, k, ibg), Δzᶜᶜᶠ(i, j, k, ibg))      
 @inline Δzᶠᶠᶠ(i, j, k, ibg::PCIBG) = min(Δzᶠᶜᶠ(i, j-1, k, ibg), Δzᶠᶜᶠ(i, j, k, ibg))
 
-@inline bottom(i, j, k, ibg::PCIBG) = @inbounds ibg.immersed_boundary.bottom_height[i, j]
+@inline z_bottom(i, j, k, ibg::PCIBG) = @inbounds ibg.immersed_boundary.bottom_height[i, j]
diff --git a/src/TurbulenceClosures/turbulence_closure_utils.jl b/src/TurbulenceClosures/turbulence_closure_utils.jl
index 59d8f707cd..3ff4f4473a 100644
--- a/src/TurbulenceClosures/turbulence_closure_utils.jl
+++ b/src/TurbulenceClosures/turbulence_closure_utils.jl
@@ -42,7 +42,6 @@ end
 # extend κ kernel to compute also the boundaries
 @inline function κ_kernel_size(grid) 
     Nx, Ny, Nz = size(grid)
-
     Tx, Ty, Tz = topology(grid)
 
     Ax = Tx == Flat ? Nx : Nx + 2 

From 82be5bd7c2d9c8a2e0fdfe3a21fb944895cc4b6c Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 21 Apr 2023 11:20:59 -0400
Subject: [PATCH 209/530] test hypothesis

---
 .../turbulence_closure_utils.jl               | 31 +++++++++++--------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_utils.jl b/src/TurbulenceClosures/turbulence_closure_utils.jl
index 3ff4f4473a..ce8cd7d97e 100644
--- a/src/TurbulenceClosures/turbulence_closure_utils.jl
+++ b/src/TurbulenceClosures/turbulence_closure_utils.jl
@@ -40,21 +40,26 @@ end
 end
 
 # extend κ kernel to compute also the boundaries
-@inline function κ_kernel_size(grid) 
-    Nx, Ny, Nz = size(grid)
-    Tx, Ty, Tz = topology(grid)
+# @inline function κ_kernel_size(grid) 
+#     Nx, Ny, Nz = size(grid)
+#     Tx, Ty, Tz = topology(grid)
 
-    Ax = Tx == Flat ? Nx : Nx + 2 
-    Ay = Ty == Flat ? Ny : Ny + 2 
+#     Ax = Tx == Flat ? Nx : Nx + 2 
+#     Ay = Ty == Flat ? Ny : Ny + 2 
+#     Az = Tz == Flat ? Nz : Nz + 2 
 
-    return (Ax, Ay, Nz)
-end
+#     return (Ax, Ay, Az)
+# end
+
+# @inline function κ_kernel_offsets(grid)
+#     Tx, Ty, Tz = topology(grid)
 
-@inline function κ_kernel_offsets(grid)
-    Tx, Ty, Tz = topology(grid)
+#     Ax = Tx == Flat ? 0 : - 1 
+#     Ay = Ty == Flat ? 0 : - 1 
+#     Az = Tz == Flat ? 0 : - 1 
 
-    Ax = Tx == Flat ? 0 : - 1 
-    Ay = Ty == Flat ? 0 : - 1 
+#     return (Ax, Ay, Az)
+# end
 
-    return (Ax, Ay, 0)
-end
\ No newline at end of file
+@inline κ_kernel_size(grid) = :xyz
+@inline κ_kernel_offsets(grid) = (0, 0, 0)
\ No newline at end of file

From fea5fc0de0df4ce857934643e2cc74e6e23a4a1f Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 21 Apr 2023 11:59:31 -0400
Subject: [PATCH 210/530] test hypothesis

---
 .../recompute_boundary_tendencies.jl                      | 8 ++++----
 src/TurbulenceClosures/turbulence_closure_utils.jl        | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
index 53f27be518..0f22554a39 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
@@ -93,11 +93,11 @@ function recompute_auxiliaries!(model, grid, arch)
         update_hydrostatic_pressure!(model.pressure.pHY′, arch, grid, model.buoyancy, model.tracers; kernel_size, kernel_offsets)
     end
 
-    sizes, offs = size_κ_kernel(grid, arch)
+    # sizes, offs = size_κ_kernel(grid, arch)
 
-    for (kernel_size, kernel_offsets) in zip(sizes, offs)
-        calculate_diffusivities!(model.diffusivity_fields, model.closure, model; kernel_size, kernel_offsets)
-    end
+    # for (kernel_size, kernel_offsets) in zip(sizes, offs)
+    #     calculate_diffusivities!(model.diffusivity_fields, model.closure, model; kernel_size, kernel_offsets)
+    # end
 end
 
 function size_w_kernel(grid, arch)
diff --git a/src/TurbulenceClosures/turbulence_closure_utils.jl b/src/TurbulenceClosures/turbulence_closure_utils.jl
index ce8cd7d97e..42b5098057 100644
--- a/src/TurbulenceClosures/turbulence_closure_utils.jl
+++ b/src/TurbulenceClosures/turbulence_closure_utils.jl
@@ -39,7 +39,7 @@ end
     @inbounds κₑ[i, j, k] = calc_nonlinear_κᶜᶜᶜ(i, j, k, grid, closure, tracer, tracer_index, U)
 end
 
-# extend κ kernel to compute also the boundaries
+# # extend κ kernel to compute also the boundaries
 # @inline function κ_kernel_size(grid) 
 #     Nx, Ny, Nz = size(grid)
 #     Tx, Ty, Tz = topology(grid)

From bc0194f1185de41f89e05b7c39ddb51476795b58 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 21 Apr 2023 12:49:26 -0400
Subject: [PATCH 211/530] back to what it was

---
 .../recompute_boundary_tendencies.jl          |  8 ++---
 .../turbulence_closure_utils.jl               | 36 +++++++++----------
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
index 0f22554a39..53f27be518 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
@@ -93,11 +93,11 @@ function recompute_auxiliaries!(model, grid, arch)
         update_hydrostatic_pressure!(model.pressure.pHY′, arch, grid, model.buoyancy, model.tracers; kernel_size, kernel_offsets)
     end
 
-    # sizes, offs = size_κ_kernel(grid, arch)
+    sizes, offs = size_κ_kernel(grid, arch)
 
-    # for (kernel_size, kernel_offsets) in zip(sizes, offs)
-    #     calculate_diffusivities!(model.diffusivity_fields, model.closure, model; kernel_size, kernel_offsets)
-    # end
+    for (kernel_size, kernel_offsets) in zip(sizes, offs)
+        calculate_diffusivities!(model.diffusivity_fields, model.closure, model; kernel_size, kernel_offsets)
+    end
 end
 
 function size_w_kernel(grid, arch)
diff --git a/src/TurbulenceClosures/turbulence_closure_utils.jl b/src/TurbulenceClosures/turbulence_closure_utils.jl
index 42b5098057..42bd74ede2 100644
--- a/src/TurbulenceClosures/turbulence_closure_utils.jl
+++ b/src/TurbulenceClosures/turbulence_closure_utils.jl
@@ -39,27 +39,27 @@ end
     @inbounds κₑ[i, j, k] = calc_nonlinear_κᶜᶜᶜ(i, j, k, grid, closure, tracer, tracer_index, U)
 end
 
-# # extend κ kernel to compute also the boundaries
-# @inline function κ_kernel_size(grid) 
-#     Nx, Ny, Nz = size(grid)
-#     Tx, Ty, Tz = topology(grid)
+# extend κ kernel to compute also the boundaries
+@inline function κ_kernel_size(grid) 
+    Nx, Ny, Nz = size(grid)
+    Tx, Ty, Tz = topology(grid)
 
-#     Ax = Tx == Flat ? Nx : Nx + 2 
-#     Ay = Ty == Flat ? Ny : Ny + 2 
-#     Az = Tz == Flat ? Nz : Nz + 2 
+    Ax = Tx == Flat ? Nx : Nx + 2 
+    Ay = Ty == Flat ? Ny : Ny + 2 
+    Az = Tz == Flat ? Nz : Nz + 2 
 
-#     return (Ax, Ay, Az)
-# end
+    return (Ax, Ay, Az)
+end
 
-# @inline function κ_kernel_offsets(grid)
-#     Tx, Ty, Tz = topology(grid)
+@inline function κ_kernel_offsets(grid)
+    Tx, Ty, Tz = topology(grid)
 
-#     Ax = Tx == Flat ? 0 : - 1 
-#     Ay = Ty == Flat ? 0 : - 1 
-#     Az = Tz == Flat ? 0 : - 1 
+    Ax = Tx == Flat ? 0 : - 1
+    Ay = Ty == Flat ? 0 : - 1 
+    Az = Tz == Flat ? 0 : - 1 
 
-#     return (Ax, Ay, Az)
-# end
+    return (Ax, Ay, Az)
+end
 
-@inline κ_kernel_size(grid) = :xyz
-@inline κ_kernel_offsets(grid) = (0, 0, 0)
\ No newline at end of file
+# @inline κ_kernel_size(grid)    = :xyz
+# @inline κ_kernel_offsets(grid) = (0, 0, 0)
\ No newline at end of file

From 67e0ba4c46bacc489abc65eb0b714ca271ed6d29 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 21 Apr 2023 15:40:53 -0400
Subject: [PATCH 212/530] done

---
 .../recompute_boundary_tendencies.jl          |  8 +++---
 .../CATKEVerticalDiffusivities.jl             | 25 ++++++++++++++++++-
 .../turbulence_closure_utils.jl               |  3 ---
 3 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
index 53f27be518..68f49113f7 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
@@ -141,11 +141,11 @@ function size_κ_kernel(grid, arch)
     Nx, Ny, Nz = size(grid)
     Rx, Ry, _  = arch.ranks
 
-    size_x = (1, Ny, Nz)
-    size_y = (Nx, 1, Nz)
+    size_x = (2, Ny, Nz)
+    size_y = (Nx, 2, Nz)
 
-    offsᴸx = (-1,  0, 0)
-    offsᴸy = (0,  -1, 0)
+    offsᴸx = (-2,  0, 0)
+    offsᴸy = (0,  -2, 0)
     offsᴿx = (Nx,  0, 0)
     offsᴿy = (0,  Ny, 0)
 
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
index 7b5a9d9106..db08d53c75 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
@@ -234,7 +234,7 @@ end
 
 @inline clip(x) = max(zero(x), x)
 
-function calculate_diffusivities!(diffusivities, closure::FlavorOfCATKE, model; kernel_size = κ_kernel_size(model.grid), kernel_offsets = κ_kernel_offsets(model.grid))
+function calculate_diffusivities!(diffusivities, closure::FlavorOfCATKE, model; kernel_size = κ_CATKE_kernel_size(model.grid), kernel_offsets = κ_CATKE_kernel_offsets(model.grid))
 
     arch = model.architecture
     grid = model.grid
@@ -251,6 +251,29 @@ function calculate_diffusivities!(diffusivities, closure::FlavorOfCATKE, model;
     return nothing
 end
 
+# extend κ kernel to compute also the boundaries
+@inline function κ_CATKE_kernel_size(grid) 
+    Nx, Ny, Nz = size(grid)
+    Tx, Ty, Tz = topology(grid)
+
+    Ax = Tx == Flat ? Nx : Nx + 4 
+    Ay = Ty == Flat ? Ny : Ny + 4 
+    Az = Tz == Flat ? Nz : Nz + 2
+
+    return (Ax, Ay, Az)
+end
+
+@inline function κ_CATKE_kernel_offsets(grid)
+    Tx, Ty, Tz = topology(grid)
+
+    Ax = Tx == Flat ? 0 : - 2
+    Ay = Ty == Flat ? 0 : - 2 
+    Az = Tz == Flat ? 0 : - 1 
+
+    return (Ax, Ay, Az)
+end
+
+
 @kernel function calculate_CATKE_diffusivities!(diffusivities, offs, grid, closure::FlavorOfCATKE, velocities, tracers, buoyancy, clock, top_tracer_bcs)
 
     i′, j′, k′ = @index(Global, NTuple)
diff --git a/src/TurbulenceClosures/turbulence_closure_utils.jl b/src/TurbulenceClosures/turbulence_closure_utils.jl
index 42bd74ede2..45b8daa58b 100644
--- a/src/TurbulenceClosures/turbulence_closure_utils.jl
+++ b/src/TurbulenceClosures/turbulence_closure_utils.jl
@@ -60,6 +60,3 @@ end
 
     return (Ax, Ay, Az)
 end
-
-# @inline κ_kernel_size(grid)    = :xyz
-# @inline κ_kernel_offsets(grid) = (0, 0, 0)
\ No newline at end of file

From da0534aabf6d20fa64a9c37d8c0d2b0e716643f9 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 24 Apr 2023 09:34:30 -0400
Subject: [PATCH 213/530] add catke vars

---
 .../recompute_boundary_tendencies.jl                 | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
index 68f49113f7..52dbbf7488 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
@@ -141,13 +141,13 @@ function size_κ_kernel(grid, arch)
     Nx, Ny, Nz = size(grid)
     Rx, Ry, _  = arch.ranks
 
-    size_x = (2, Ny, Nz)
-    size_y = (Nx, 2, Nz)
+    size_x = (Hx, Ny, Nz)
+    size_y = (Nx, Hy, Nz)
 
-    offsᴸx = (-2,  0, 0)
-    offsᴸy = (0,  -2, 0)
-    offsᴿx = (Nx,  0, 0)
-    offsᴿy = (0,  Ny, 0)
+    offsᴸx = (-Hx+2, 0, 0)
+    offsᴸy = (0, -Hy+2, 0)
+    offsᴿx = (Nx-2,  0, 0)
+    offsᴿy = (0,  Ny-2, 0)
 
     sizes = (size_x, size_y, size_x, size_y)
     offs  = (offsᴸx, offsᴸy, offsᴿx, offsᴿy)

From 6903248d82c19f1a5b3e5a27095b914e82d70ad4 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 24 Apr 2023 10:41:50 -0400
Subject: [PATCH 214/530] bugfix

---
 .../recompute_boundary_tendencies.jl                             | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
index 52dbbf7488..b0a19e5cdc 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
@@ -139,6 +139,7 @@ end
 
 function size_κ_kernel(grid, arch)
     Nx, Ny, Nz = size(grid)
+    Hx, Hy, _ = halo_size(grid)
     Rx, Ry, _  = arch.ranks
 
     size_x = (Hx, Ny, Nz)

From 2a117d603eaf5a8afbdf510e3b097048d76693a0 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Mon, 24 Apr 2023 14:49:08 -0800
Subject: [PATCH 215/530] Add output for ZeroField

---
 src/OutputWriters/fetch_output.jl | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/OutputWriters/fetch_output.jl b/src/OutputWriters/fetch_output.jl
index fc9efe140d..8c8e975a75 100644
--- a/src/OutputWriters/fetch_output.jl
+++ b/src/OutputWriters/fetch_output.jl
@@ -1,6 +1,6 @@
 using CUDA
 
-using Oceananigans.Fields: AbstractField, compute_at!
+using Oceananigans.Fields: AbstractField, compute_at!, ZeroField
 using Oceananigans.LagrangianParticleTracking: LagrangianParticles
 
 # Needed to support `fetch_output` with `model::Nothing`.
@@ -41,3 +41,6 @@ function fetch_and_convert_output(output, model, writer)
     fetched = fetch_output(output, model)
     return convert_output(fetched, writer)
 end
+
+fetch_and_convert_output(output::ZeroField, model, writer) = zero(eltype(output))
+

From 21687e91a2e01bf656d3110454f7d3d90743d242 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Mon, 24 Apr 2023 14:49:45 -0800
Subject: [PATCH 216/530] Better JLD2 output writer

---
 src/OutputWriters/jld2_output_writer.jl | 26 ++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/src/OutputWriters/jld2_output_writer.jl b/src/OutputWriters/jld2_output_writer.jl
index 137c9a816b..78164f8bfb 100644
--- a/src/OutputWriters/jld2_output_writer.jl
+++ b/src/OutputWriters/jld2_output_writer.jl
@@ -192,22 +192,34 @@ function initialize_jld2_file!(filepath, init, jld2_kw, including, outputs, mode
     try
         jldopen(filepath, "a+"; jld2_kw...) do file
             init(file, model)
+        end
+    catch err
+        @warn """Failed to execute user `init` for $filepath because $(typeof(err)): $(sprint(showerror, err))"""
+    end
+
+    try 
+        jldopen(filepath, "a+"; jld2_kw...) do file
             saveproperties!(file, model, including)
 
             # Serialize properties in `including`.
             for property in including
                 serializeproperty!(file, "serialized/$property", getproperty(model, property))
             end
+        end
+    catch err
+        @warn """Failed to save and serialize $including in $filepath because $(typeof(err)): $(sprint(showerror, err))"""
+    end
 
-            # Serialize the location and boundary conditions of each output.
-            for (i, (field_name, field)) in enumerate(pairs(outputs))
-                file["timeseries/$field_name/serialized/location"] = location(field)
-                file["timeseries/$field_name/serialized/indices"] = indices(field)
-                serializeproperty!(file, "timeseries/$field_name/serialized/boundary_conditions", boundary_conditions(field))
+    # Serialize the location and boundary conditions of each output.
+    for (name, field) in pairs(outputs)
+        try
+            jldopen(filepath, "a+"; jld2_kw...) do file
+                file["timeseries/$name/serialized/location"] = location(field)
+                file["timeseries/$name/serialized/indices"] = indices(field)
+                serializeproperty!(file, "timeseries/$name/serialized/boundary_conditions", boundary_conditions(field))
             end
+        catch
         end
-    catch err
-        @warn """Initialization of $filepath failed because $(typeof(err)): $(sprint(showerror, err))"""
     end
 
     return nothing

From 5ad700e77f1973720bbd22d595424f029fdc5145 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Mon, 24 Apr 2023 15:08:22 -0800
Subject: [PATCH 217/530] Add backward compatibility for FieldTimeSeries

---
 src/OutputReaders/field_time_series.jl | 51 ++++++++++++++++++++++++--
 1 file changed, 48 insertions(+), 3 deletions(-)

diff --git a/src/OutputReaders/field_time_series.jl b/src/OutputReaders/field_time_series.jl
index 602668aef0..72f14bee3d 100644
--- a/src/OutputReaders/field_time_series.jl
+++ b/src/OutputReaders/field_time_series.jl
@@ -118,14 +118,59 @@ function FieldTimeSeries(path, name, backend;
     end
 
     isnothing(grid) && (grid = file["serialized/grid"])
-    close(file)
 
     # Default to CPU if neither architecture nor grid is specified
     architecture = isnothing(architecture) ?
         (isnothing(grid) ? CPU() : Architectures.architecture(grid)) : architecture
 
-    # This should be removed in a month or two (4/5/2022).
-    grid = on_architecture(architecture, grid)
+    # This should be removed eventually... (4/5/2022)
+    grid = try
+        on_architecture(architecture, grid)
+    catch err # Likely, the grid was saved with CuArrays or generated with a different Julia version.
+        if grid isa RectilinearGrid # we can try...
+            @info "Initial attempt to transfer grid to $architecture failed."
+            @info "Attempting to reconstruct RectilinearGrid on $architecture manually..."
+
+            Nx = file["grid/Nx"]
+            Ny = file["grid/Ny"]
+            Nz = file["grid/Nz"]
+            Hx = file["grid/Hx"]
+            Hy = file["grid/Hy"]
+            Hz = file["grid/Hz"]
+            xᶠᵃᵃ = file["grid/xᶠᵃᵃ"]
+            yᵃᶠᵃ = file["grid/yᵃᶠᵃ"]
+            zᵃᵃᶠ = file["grid/zᵃᵃᶠ"]
+            x = file["grid/Δxᶠᵃᵃ"] isa Number ? (xᶠᵃᵃ[1], xᶠᵃᵃ[Nx+1]) : xᶠᵃᵃ
+            y = file["grid/Δyᵃᶠᵃ"] isa Number ? (yᵃᶠᵃ[1], yᵃᶠᵃ[Ny+1]) : yᵃᶠᵃ
+            z = file["grid/Δzᵃᵃᶠ"] isa Number ? (zᵃᵃᶠ[1], zᵃᵃᶠ[Nz+1]) : zᵃᵃᶠ
+            topo = topology(grid)
+
+            N = (Nx, Ny, Nz)
+
+            # Reduce for Flat dimensions
+            domain = Dict()
+            for (i, ξ) in enumerate((x, y, z))
+                if topo[i] !== Flat
+                    if !(ξ isa Tuple)
+                        chopped_ξ = ξ[1:N[i]+1]
+                    else
+                        chopped_ξ = ξ
+                    end
+                    sξ = (:x, :y, :z)[i]
+                    domain[sξ] = chopped_ξ
+                end
+            end
+
+            size = Tuple(N[i] for i=1:3 if topo[i] !== Flat)
+            halo = Tuple((Hx, Hy, Hz)[i] for i=1:3 if topo[i] !== Flat)
+
+            RectilinearGrid(architecture; size, halo, topology=topo, domain...)
+        else
+            throw(err)
+        end
+    end
+
+    close(file)
 
     LX, LY, LZ = Location
     loc = map(instantiate, Location)

From d9caceef0492558e8db7a7921de8868bbdeb4726 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Wed, 26 Apr 2023 10:26:17 -0800
Subject: [PATCH 218/530] Fix bug in entrainment diffusivity for ri-based

---
 .../ri_based_vertical_diffusivity.jl                     | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index 80de3917f9..5153c5c2fd 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -199,15 +199,16 @@ const f = Face()
     # Convection and entrainment
     N² = ∂z_b(i, j, k, grid, buoyancy, tracers)
     N²_above = ∂z_b(i, j, k+1, grid, buoyancy, tracers)
-    convecting = N² < 0
-    entraining = (!convecting) & (N²_above < 0)
+
+    # Conditions
+    convecting = N² < 0 # applies regardless of Qᵇ
+    entraining = (N²_above < 0) & (!convecting) & (Qᵇ > 0)
 
     # Convective adjustment diffusivity
     κᶜᵃ = ifelse(convecting, κᶜᵃ, zero(grid))
 
     # Entrainment diffusivity
-    κᵉⁿ = ifelse(Qᵇ > 0, Cᵉⁿ * Qᵇ / N², zero(grid))
-    κᵉⁿ = ifelse(entraining, Cᵉⁿ, zero(grid))
+    κᵉⁿ = ifelse(entraining, Cᵉⁿ * Qᵇ / N², zero(grid))
 
     # Shear mixing diffusivity and viscosity
     Ri = Riᶜᶜᶠ(i, j, k, grid, velocities, buoyancy, tracers)

From ecfed1f5986660e5406526095b47217aabe0c500 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 28 Apr 2023 13:32:53 -0400
Subject: [PATCH 219/530] bottom drag for e

---
 .../CATKEVerticalDiffusivities.jl                   | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
index db08d53c75..2708c10be5 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
@@ -120,7 +120,7 @@ optimal_turbulent_kinetic_energy_equation(FT) = TurbulentKineticEnergyEquation(
 
 optimal_mixing_length(FT) = MixingLength(
     Cᵇ   = FT(0.37), 
-    Cᶜc  = FT(4.8),
+    Cᶜc  = FT(1.0),
     Cᶜe  = FT(1.1),
     Cᵉc  = FT(0.049),
     Cᵉe  = FT(0.0),
@@ -301,7 +301,16 @@ end
         dissipative_buoyancy_flux = sign(wb) * sign(eⁱʲᵏ) < 0
         wb_e = ifelse(dissipative_buoyancy_flux, wb / eⁱʲᵏ, zero(grid))
         
-        diffusivities.Lᵉ[i, j, k] = - wb_e + implicit_dissipation_coefficient(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, clock, top_tracer_bcs)
+        on_bottom = !inactive_cell(i, j, k, grid) & inactive_cell(i, j, k-1, grid)
+        # on_side = near_horizontal_boundary(i, j, k, grid)
+        Δz = Δzᶜᶜᶜ(i, j, k, grid)
+
+        Q_e = - 10.0 * turbulent_velocityᶜᶜᶜ(i, j, k, grid, closure_ij, tracers.e) / Δz * on_bottom
+
+        # Implicit TKE dissipation
+        ϵ_e = implicit_dissipation_coefficient(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, clock, top_tracer_bcs)
+
+        diffusivities.Lᵉ[i, j, k] = - wb_e + ϵ_e + Q_e
     end
 end
 

From c7cd4e470514e37d3a83722d8e3780be8eb2216e Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 28 Apr 2023 13:41:59 -0400
Subject: [PATCH 220/530] limit TKE

---
 .../turbulent_kinetic_energy_equation.jl                   | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
index 79bbd8e216..be5ec3ece0 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
@@ -74,7 +74,7 @@ end
     Riᶜ = closure.mixing_length.CRiᶜ
     Riʷ = closure.mixing_length.CRiʷ
     Ri = Riᶜᶜᶜ(i, j, k, grid, velocities, tracers, buoyancy)
-    σ = scale(Ri, C⁻D, C⁺D, Riᶜ, Riʷ)
+    σ  = scale(Ri, C⁻D, C⁺D, Riᶜ, Riʷ) 
 
     Cᵇ = closure.mixing_length.Cᵇ
     #ℓ★ = σ * stable_length_scaleᶜᶜᶜ(i, j, k, grid, closure, Cᵇ, tracers.e, velocities, tracers, buoyancy)
@@ -98,8 +98,11 @@ end
     #   and thus    L = - Cᴰ √e / ℓ .
 
     τ = closure.negative_turbulent_kinetic_energy_damping_time_scale
+    e_max = 10.0
 
-    return ifelse(eᵢ < 0, -1/τ, -sqrt(abs(eᵢ)) / ℓᴰ)
+    e_limiter = max(one(grid), eᵢ / e_max)
+
+    return ifelse(eᵢ < 0, -1/τ, - sqrt(abs(eᵢ)) / ℓᴰ * e_limiter)
 end
 
 # Fallbacks for explicit time discretization

From 0d57e3713a37658300c3ec5a4361366a38d9269f Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 28 Apr 2023 13:42:53 -0400
Subject: [PATCH 221/530] Use turbulent_velocity in the bottom drag term Q_e

---
 .../CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
index 2708c10be5..37c45e23de 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
@@ -305,7 +305,7 @@ end
         # on_side = near_horizontal_boundary(i, j, k, grid)
         Δz = Δzᶜᶜᶜ(i, j, k, grid)
 
-        Q_e = - 10.0 * turbulent_velocityᶜᶜᶜ(i, j, k, grid, closure_ij, tracers.e) / Δz * on_bottom
+        Q_e = - 10.0 * turbulent_velocity(i, j, k, grid, closure_ij, tracers.e) / Δz * on_bottom
 
         # Implicit TKE dissipation
         ϵ_e = implicit_dissipation_coefficient(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, clock, top_tracer_bcs)

From 430536024316d1456a2e24e4de723bd30d77ad44 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 28 Apr 2023 14:28:11 -0400
Subject: [PATCH 222/530] Import inactive_cell from Oceananigans.Grids in CATKE

---
 .../CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl     | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
index 37c45e23de..1bf2c05127 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
@@ -16,6 +16,7 @@ using Oceananigans.BoundaryConditions: default_prognostic_bc, DefaultBoundaryCon
 using Oceananigans.BoundaryConditions: BoundaryCondition, FieldBoundaryConditions
 using Oceananigans.BoundaryConditions: DiscreteBoundaryFunction, FluxBoundaryCondition
 using Oceananigans.BuoyancyModels: ∂z_b, top_buoyancy_flux
+using Oceananigans.Grids: inactive_cell
 
 using Oceananigans.TurbulenceClosures:
     getclosure,

From db7607b6bb323b3d6269fb2ebd565ab9a3e94939 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Fri, 28 Apr 2023 16:21:32 -0800
Subject: [PATCH 223/530] Cell centered dissipation length

---
 .../CATKEVerticalDiffusivities/mixing_length.jl        | 10 +++++-----
 .../turbulent_kinetic_energy_equation.jl               |  8 ++++----
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
index 961aa442a1..9f1f9aa962 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
@@ -51,7 +51,7 @@ end
     return S²
 end
 
-@inline function buoyancy_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, e, tracers, buoyancy)
+@inline function stratification_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, e, tracers, buoyancy)
     FT = eltype(grid)
     N² = ∂z_b(i, j, k, grid, buoyancy, tracers)
     #N² = ℑxyᶜᶜᵃ(i, j, k, grid, ℑxyᶠᶠᵃ, ∂z_b, buoyancy, tracers)
@@ -60,7 +60,7 @@ end
     return ifelse(N²⁺ == 0, FT(Inf), w★ / sqrt(N²⁺))
 end
 
-@inline function buoyancy_mixing_lengthᶜᶜᶜ(i, j, k, grid, closure, e, tracers, buoyancy)
+@inline function stratification_mixing_lengthᶜᶜᶜ(i, j, k, grid, closure, e, tracers, buoyancy)
     FT = eltype(grid)
     N² = ℑzᵃᵃᶜ(i, j, k, grid, ∂z_b, buoyancy, tracers)
     N²⁺ = clip(N²)
@@ -70,7 +70,7 @@ end
 
 @inline function stable_length_scaleᶜᶜᶠ(i, j, k, grid, closure, e, velocities, tracers, buoyancy)
     Cᴺ = closure.mixing_length.Cᴺ
-    ℓᴺ = Cᴺ * buoyancy_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, e, tracers, buoyancy)
+    ℓᴺ = Cᴺ * stratification_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, e, tracers, buoyancy)
 
     Cᵇ = closure.mixing_length.Cᵇ
     d_up   = depthᶜᶜᶠ(i, j, k, grid)
@@ -85,7 +85,7 @@ end
 
 @inline function stable_length_scaleᶜᶜᶜ(i, j, k, grid, closure, e, velocities, tracers, buoyancy)
     Cᴺ = closure.mixing_length.Cᴺ
-    ℓᴺ = Cᴺ * buoyancy_mixing_lengthᶜᶜᶜ(i, j, k, grid, closure, e, tracers, buoyancy)
+    ℓᴺ = Cᴺ * stratification_mixing_lengthᶜᶜᶜ(i, j, k, grid, closure, e, tracers, buoyancy)
 
     Cᵇ = closure.mixing_length.Cᵇ
     d_up = depthᶜᶜᶜ(i, j, k, grid)
@@ -153,7 +153,7 @@ end
     Qᵇ       = top_buoyancy_flux(i, j, grid, buoyancy, tracer_bcs, clock, merge(velocities, tracers))
     w★       = turbulent_velocityᶜᶜᶜ(i, j, k, grid, closure, tracers.e)
     w★²      = turbulent_velocityᶜᶜᶜ(i, j, k, grid, closure, tracers.e)^2
-    w★³      = turbulent_velocityᶜᶜᶜ(i, j, grid.Nz, grid, closure, tracers.e)^3
+    w★³      = turbulent_velocityᶜᶜᶜ(i, j, k, grid, closure, tracers.e)^3
     S²       = shearᶜᶜᶜ(i, j, k, grid, u, v)
     N²       = ℑzᵃᵃᶜ(i, j, k, grid, ∂z_b, buoyancy, tracers)
     N²_above = ℑzᵃᵃᶜ(i, j, k+1, grid, ∂z_b, buoyancy, tracers)
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
index f997e4c817..5e29781f96 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
@@ -66,8 +66,8 @@ end
     Cᶜ = closure.turbulent_kinetic_energy_equation.CᶜD
     Cᵉ = closure.turbulent_kinetic_energy_equation.CᵉD
     Cˢᶜ = closure.mixing_length.Cˢᶜ
-    ℓʰ = ℑzᵃᵃᶜ(i, j, k, grid, convective_length_scaleᶜᶜᶠ, closure, Cᶜ, Cᵉ, Cˢᶜ, velocities, tracers, buoyancy, clock, tracer_bcs)
-    #ℓʰ = convective_length_scaleᶜᶜᶜ(i, j, k, grid, closure, Cᶜ, Cᵉ, Cˢᶜ, velocities, tracers, buoyancy, clock, tracer_bcs)
+    #ℓʰ = ℑzᵃᵃᶜ(i, j, k, grid, convective_length_scaleᶜᶜᶠ, closure, Cᶜ, Cᵉ, Cˢᶜ, velocities, tracers, buoyancy, clock, tracer_bcs)
+    ℓʰ = convective_length_scaleᶜᶜᶜ(i, j, k, grid, closure, Cᶜ, Cᵉ, Cˢᶜ, velocities, tracers, buoyancy, clock, tracer_bcs)
 
     # "Stable" dissipation length
     C⁻D = closure.turbulent_kinetic_energy_equation.C⁻D
@@ -77,8 +77,8 @@ end
     Ri = Riᶜᶜᶜ(i, j, k, grid, velocities, tracers, buoyancy)
     σ = scale(Ri, C⁻D, C⁺D, Riᶜ, Riʷ)
 
-    #ℓ★ = σ * stable_length_scaleᶜᶜᶜ(i, j, k, grid, closure, tracers.e, velocities, tracers, buoyancy)
-    ℓ★ = σ * ℑzᵃᵃᶜ(i, j, k, grid, stable_length_scaleᶜᶜᶠ, closure, tracers.e, velocities, tracers, buoyancy)
+    ℓ★ = σ * stable_length_scaleᶜᶜᶜ(i, j, k, grid, closure, tracers.e, velocities, tracers, buoyancy)
+    #ℓ★ = σ * ℑzᵃᵃᶜ(i, j, k, grid, stable_length_scaleᶜᶜᶠ, closure, tracers.e, velocities, tracers, buoyancy)
 
     ℓʰ = ifelse(isnan(ℓʰ), zero(grid), ℓʰ)
     ℓ★ = ifelse(isnan(ℓ★), zero(grid), ℓ★)

From 2c62342fb85731b710e9d22d1c3cf13d57d3a1db Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Fri, 28 Apr 2023 20:25:56 -0800
Subject: [PATCH 224/530] Bug in vertically-implicit diffusion?

---
 src/Solvers/batched_tridiagonal_solver.jl     | 42 ++++++++-----------
 .../vertically_implicit_diffusion_solver.jl   | 25 ++++++-----
 2 files changed, 32 insertions(+), 35 deletions(-)

diff --git a/src/Solvers/batched_tridiagonal_solver.jl b/src/Solvers/batched_tridiagonal_solver.jl
index 4fe2ea21b5..d3a24255a8 100644
--- a/src/Solvers/batched_tridiagonal_solver.jl
+++ b/src/Solvers/batched_tridiagonal_solver.jl
@@ -8,12 +8,12 @@ import Oceananigans.Architectures: architecture
 A batched solver for large numbers of triadiagonal systems.
 """
 struct BatchedTridiagonalSolver{A, B, C, T, G, P}
-               a :: A
-               b :: B
-               c :: C
-               t :: T
-            grid :: G
-      parameters :: P
+    a :: A
+    b :: B
+    c :: C
+    t :: T
+    grid :: G
+    parameters :: P
 end
 
 architecture(solver::BatchedTridiagonalSolver) = architecture(solver.grid)
@@ -37,26 +37,20 @@ where `a` is the `lower_diagonal`, `b` is the `diagonal`, and `c` is the `upper_
 
 2. A 3D array means that `aⁱʲᵏ = a[i, j, k]`.
 
-3. Otherwise, `a` is assumed to be callable:
-    * If `isnothing(parameters)` then `aⁱʲᵏ = a(i, j, k, grid, args...)`.
-    * If `!isnothing(parameters)` then `aⁱʲᵏ = a(i, j, k, grid, parameters, args...)`.
-    where `args...` are `Varargs` passed to `solve_batched_tridiagonal_system!(ϕ, solver, args...)`.
+Other coefficient types can be used by extending `get_coefficient`.
 """
 function BatchedTridiagonalSolver(grid;
                                   lower_diagonal,
                                   diagonal,
                                   upper_diagonal,
-                                  scratch = arch_array(architecture(grid), zeros(eltype(grid), grid.Nx, grid.Ny, grid.Nz)),
+                                  scratch = arch_array(architecture(grid), zeros(eltype(grid), size(grid)...)),
                                   parameters = nothing)
 
-    return BatchedTridiagonalSolver(lower_diagonal, diagonal, upper_diagonal,
-                                    scratch, grid, parameters)
+    return BatchedTridiagonalSolver(lower_diagonal, diagonal, upper_diagonal, scratch, grid, parameters)
 end
 
-@inline get_coefficient(a::AbstractArray{T, 1}, i, j, k, grid, p, args...) where {T} = @inbounds a[k]
-@inline get_coefficient(a::AbstractArray{T, 3}, i, j, k, grid, p, args...) where {T} = @inbounds a[i, j, k]
-@inline get_coefficient(a::Base.Callable, i, j, k, grid, p, args...)         = a(i, j, k, grid, p, args...)
-@inline get_coefficient(a::Base.Callable, i, j, k, grid, ::Nothing, args...) = a(i, j, k, grid, args...)
+@inline get_coefficient(i, j, k, grid, a::AbstractArray{T, 1}, p, args...) where {T} = @inbounds a[k]
+@inline get_coefficient(i, j, k, grid, a::AbstractArray{T, 3}, p, args...) where {T} = @inbounds a[i, j, k]
 
 """
     solve!(ϕ, solver::BatchedTridiagonalSolver, rhs, args...)
@@ -86,24 +80,24 @@ end
 @inline float_eltype(ϕ::AbstractArray{<:Complex{T}}) where T <: AbstractFloat = T
 
 @kernel function solve_batched_tridiagonal_system_kernel!(ϕ, a, b, c, f, t, grid, p, args)
-    Nx, Ny, Nz = grid.Nx, grid.Ny, grid.Nz
+    Nx, Ny, Nz = size(grid)
 
     i, j = @index(Global, NTuple)
 
     @inbounds begin
-        β  = get_coefficient(b, i, j, 1, grid, p, args...)
-        f₁ = get_coefficient(f, i, j, 1, grid, p, args...)
+        β  = get_coefficient(i, j, 1, grid, b, p, args...)
+        f₁ = get_coefficient(i, j, 1, grid, f, p, args...)
         ϕ[i, j, 1] = f₁ / β
 
         @unroll for k = 2:Nz
-            cᵏ⁻¹ = get_coefficient(c, i, j, k-1, grid, p, args...)
-            bᵏ   = get_coefficient(b, i, j, k,   grid, p, args...)
-            aᵏ⁻¹ = get_coefficient(a, i, j, k-1, grid, p, args...)
+            cᵏ⁻¹ = get_coefficient(i, j, k-1, grid, c, p, args...)
+            bᵏ   = get_coefficient(i, j, k,   grid, b, p, args...)
+            aᵏ⁻¹ = get_coefficient(i, j, k-1, grid, a, p, args...)
 
             t[i, j, k] = cᵏ⁻¹ / β
             β = bᵏ - aᵏ⁻¹ * t[i, j, k]
 
-            fᵏ = get_coefficient(f, i, j, k, grid, p, args...)
+            fᵏ = get_coefficient(i, j, k, grid, f, p, args...)
             
             # If the problem is not diagonally-dominant such that `β ≈ 0`,
             # the algorithm is unstable and we elide the forward pass update of ϕ.
diff --git a/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl b/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
index 0d216a4337..e38814e89d 100644
--- a/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
+++ b/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
@@ -55,11 +55,11 @@ end
 @inline function ivd_lower_diagonal(i, j, k, grid, closure, K, id, ℓx, ℓy, ::Center, clock, Δt, κz)
     k′ = k + 1 # Shift to adjust for Tridiagonal indexing convenction
     closure_ij = getclosure(i, j, closure)  
-    κᵏ = κz(i, j, k′, grid, closure_ij, K, id, clock)
+    κᵏ⁺¹ = κz(i, j, k′+1, grid, closure_ij, K, id, clock)
 
     return ifelse(k < 1,
                   zero(grid),
-                  - Δt * κ_Δz²(i, j, k′, k′, grid, κᵏ))
+                  - Δt * κ_Δz²(i, j, k′, k′, grid, κᵏ⁺¹))
 end
 
 # Vertical velocity kernel functions (at cell interfaces in z)
@@ -88,7 +88,7 @@ end
 
 @inline ivd_diagonal(i, j, k, grid, closure, K, id, ℓx, ℓy, ℓz, clock, Δt, κz) =
     one(grid) -
-        Δt * _implicit_linear_coefficient(i, j, k,   grid, closure, K, id, ℓx, ℓy, ℓz, clock, Δt, κz) -
+        Δt * _implicit_linear_coefficient(i, j, k+0, grid, closure, K, id, ℓx, ℓy, ℓz, clock, Δt, κz) -
                       _ivd_upper_diagonal(i, j, k,   grid, closure, K, id, ℓx, ℓy, ℓz, clock, Δt, κz) -
                       _ivd_lower_diagonal(i, j, k-1, grid, closure, K, id, ℓx, ℓy, ℓz, clock, Δt, κz)
 
@@ -100,6 +100,10 @@ end
 ##### Solver constructor
 #####
 
+struct VerticallyImplicitLowerDiagonal end
+struct VerticallyImplicitDiagonal end
+struct VerticallyImplicitUpperDiagonal end
+
 """
     implicit_diffusion_solver(::VerticallyImplicitTimeDiscretization, grid)
 
@@ -125,18 +129,17 @@ function implicit_diffusion_solver(::VerticallyImplicitTimeDiscretization, grid)
                                  "grids that are Bounded in the z-direction.")
 
     z_solver = BatchedTridiagonalSolver(grid;
-                                        lower_diagonal = Val(:_ivd_lower_diagonal),
-                                        diagonal       = Val(:ivd_diagonal),
-                                        upper_diagonal = Val(:_ivd_upper_diagonal))
+                                        lower_diagonal = VerticallyImplicitLowerDiagonal(),
+                                        diagonal       = VerticallyImplicitDiagonal(),
+                                        upper_diagonal = VerticallyImplicitUpperDiagonal())
 
     return z_solver
 end
 
-# Extend the `get_coefficient` function to retrieve the correct `ivd_diagonal`, `ivd_lower_diagonal` and `ivd_upper_diagonal` functions
-# REMEMBER: `get_coefficient(f::Function, args...)` leads to massive performance decrease on the CPU (https://github.com/CliMA/Oceananigans.jl/issues/2996) 
-@inline get_coefficient(::Val{:_ivd_lower_diagonal}, i, j, k, grid, p, args...) = _ivd_lower_diagonal(i, j, k, grid, args...)
-@inline get_coefficient(::Val{:_ivd_upper_diagonal}, i, j, k, grid, p, args...) = _ivd_upper_diagonal(i, j, k, grid, args...)
-@inline get_coefficient(::Val{:ivd_diagonal},        i, j, k, grid, p, args...) = ivd_diagonal(i, j, k, grid, args...)
+# Extend `get_coefficient` to retrieve `ivd_diagonal`, `_ivd_lower_diagonal` and `_ivd_upper_diagonal`.
+@inline get_coefficient(i, j, k, grid, ::VerticallyImplicitLowerDiagonal, p, args...) = _ivd_lower_diagonal(i, j, k, grid, args...)
+@inline get_coefficient(i, j, k, grid, ::VerticallyImplicitUpperDiagonal, p, args...) = _ivd_upper_diagonal(i, j, k, grid, args...)
+@inline get_coefficient(i, j, k, grid, ::VerticallyImplicitDiagonal,      p, args...) = ivd_diagonal(i, j, k, grid, args...)
 
 #####
 ##### Implicit step functions

From 73192bb541b5702b27a7d83361a239d1897d3183 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Fri, 28 Apr 2023 20:27:09 -0800
Subject: [PATCH 225/530] Cosmetic improvements and dissipation sign change

---
 .../CATKEVerticalDiffusivities.jl                      |  5 ++---
 .../CATKEVerticalDiffusivities/mixing_length.jl        | 10 ++++------
 .../turbulent_kinetic_energy_equation.jl               |  7 +------
 3 files changed, 7 insertions(+), 15 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
index 3976b53aab..0fa958db8c 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
@@ -230,7 +230,7 @@ const c = Center()
 const f = Face()
 
 @kernel function calculate_CATKE_diffusivities!(diffusivities, grid, closure::FlavorOfCATKE, velocities, tracers, buoyancy, clock, top_tracer_bcs)
-    i, j, k, = @index(Global, NTuple)
+    i, j, k = @index(Global, NTuple)
 
     # Ensure this works with "ensembles" of closures, in addition to ordinary single closures
     closure_ij = getclosure(i, j, closure)
@@ -254,7 +254,7 @@ const f = Face()
 
         # "Patankar trick" for buoyancy production (cf Patankar 1980 or Burchard et al. 2003)
         # If buoyancy flux is a _sink_ of TKE, we treat it implicitly.
-        wb = buoyancy_flux(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, diffusivities)
+        wb = ℑzᵃᵃᶜ(i, j, k, grid, buoyancy_fluxᶜᶜᶠ, tracers, buoyancy, diffusivities)
         eⁱʲᵏ = @inbounds tracers.e[i, j, k]
 
         # See `buoyancy_flux`
@@ -284,7 +284,6 @@ end
     eᵐⁱⁿ = closure.minimum_turbulent_kinetic_energy
     return sqrt(max(eᵐⁱⁿ, eᵢ))
 end
-@inline is_stableᶜᶜᶠ(i, j, k, grid, tracers, buoyancy) = ∂z_b(i, j, k, grid, buoyancy, tracers) >= 0
 
 @inline function κuᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, clock, top_tracer_bcs)
     u★ = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocityᶜᶜᶜ, closure, tracers.e)
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
index 9f1f9aa962..c3f8515c1c 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
@@ -34,7 +34,6 @@ end
 ##### Mixing length
 #####
 
-@inline ϕ⁺(i, j, k, grid, ψ) = @inbounds clip(ψ[i, j, k])
 @inline ϕ²(i, j, k, grid, ϕ, args...) = ϕ(i, j, k, grid, args...)^2
 
 @inline function shearᶜᶜᶠ(i, j, k, grid, u, v)
@@ -54,7 +53,6 @@ end
 @inline function stratification_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, e, tracers, buoyancy)
     FT = eltype(grid)
     N² = ∂z_b(i, j, k, grid, buoyancy, tracers)
-    #N² = ℑxyᶜᶜᵃ(i, j, k, grid, ℑxyᶠᶠᵃ, ∂z_b, buoyancy, tracers)
     N²⁺ = clip(N²)
     w★ = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocityᶜᶜᶜ, closure, e)
     return ifelse(N²⁺ == 0, FT(Inf), w★ / sqrt(N²⁺))
@@ -98,8 +96,8 @@ end
     return ℓ
 end
 
-@inline three_halves_tke(i, j, k, grid, closure, e) = turbulent_velocityᶜᶜᶜ(i, j, k, grid, closure, e)^3
-@inline squared_tke(i, j, k, grid, closure, e) = turbulent_velocityᶜᶜᶜ(i, j, k, grid, closure, e)^2
+@inline three_halves_tkeᶜᶜᶜ(i, j, k, grid, closure, e) = turbulent_velocityᶜᶜᶜ(i, j, k, grid, closure, e)^3
+@inline squared_tkeᶜᶜᶜ(i, j, k, grid, closure, e) = turbulent_velocityᶜᶜᶜ(i, j, k, grid, closure, e)^2
 
 @inline function convective_length_scaleᶜᶜᶠ(i, j, k, grid, closure, Cᶜ::Number, Cᵉ::Number, Cˢᶜ::Number,
                                             velocities, tracers, buoyancy, clock, tracer_bcs)
@@ -109,8 +107,8 @@ end
     Qᵇᵋ      = closure.minimum_convective_buoyancy_flux
     Qᵇ       = top_buoyancy_flux(i, j, grid, buoyancy, tracer_bcs, clock, merge(velocities, tracers))
     w★       = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocityᶜᶜᶜ, closure, tracers.e)
-    w★²      = ℑzᵃᵃᶠ(i, j, k, grid, squared_tke, closure, tracers.e)
-    w★³      = ℑzᵃᵃᶠ(i, j, k, grid, three_halves_tke, closure, tracers.e)
+    w★²      = ℑzᵃᵃᶠ(i, j, k, grid, squared_tkeᶜᶜᶜ, closure, tracers.e)
+    w★³      = ℑzᵃᵃᶠ(i, j, k, grid, three_halves_tkeᶜᶜᶜ, closure, tracers.e)
     S²       = shearᶜᶜᶠ(i, j, k, grid, u, v)
     N²       = ∂z_b(i, j, k, grid, buoyancy, tracers)
     N²_above = ∂z_b(i, j, k+1, grid, buoyancy, tracers)
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
index 5e29781f96..a17cae0642 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
@@ -18,8 +18,6 @@ end
 ##### Terms in the turbulent kinetic energy equation, all at cell centers
 #####
 
-@inline ϕ²(i, j, k, grid, ϕ) = ϕ(i, j, k, grid)^2
-
 @inline ν_∂z_u²(i, j, k, grid, ν, u) = ℑxᶠᵃᵃ(i, j, k, grid, ν) * ∂zᶠᶜᶠ(i, j, k, grid, u)^2
 @inline ν_∂z_v²(i, j, k, grid, ν, v) = ℑyᵃᶠᵃ(i, j, k, grid, ν) * ∂zᶜᶠᶠ(i, j, k, grid, v)^2
 
@@ -66,7 +64,6 @@ end
     Cᶜ = closure.turbulent_kinetic_energy_equation.CᶜD
     Cᵉ = closure.turbulent_kinetic_energy_equation.CᵉD
     Cˢᶜ = closure.mixing_length.Cˢᶜ
-    #ℓʰ = ℑzᵃᵃᶜ(i, j, k, grid, convective_length_scaleᶜᶜᶠ, closure, Cᶜ, Cᵉ, Cˢᶜ, velocities, tracers, buoyancy, clock, tracer_bcs)
     ℓʰ = convective_length_scaleᶜᶜᶜ(i, j, k, grid, closure, Cᶜ, Cᵉ, Cˢᶜ, velocities, tracers, buoyancy, clock, tracer_bcs)
 
     # "Stable" dissipation length
@@ -76,9 +73,7 @@ end
     Riʷ = closure.mixing_length.CRiʷ
     Ri = Riᶜᶜᶜ(i, j, k, grid, velocities, tracers, buoyancy)
     σ = scale(Ri, C⁻D, C⁺D, Riᶜ, Riʷ)
-
     ℓ★ = σ * stable_length_scaleᶜᶜᶜ(i, j, k, grid, closure, tracers.e, velocities, tracers, buoyancy)
-    #ℓ★ = σ * ℑzᵃᵃᶜ(i, j, k, grid, stable_length_scaleᶜᶜᶠ, closure, tracers.e, velocities, tracers, buoyancy)
 
     ℓʰ = ifelse(isnan(ℓʰ), zero(grid), ℓʰ)
     ℓ★ = ifelse(isnan(ℓ★), zero(grid), ℓ★)
@@ -106,7 +101,7 @@ end
 @inline function dissipation(i, j, k, grid, closure::FlavorOfCATKE, velocities, tracers, args...)
     eᵢ = @inbounds tracers.e[i, j, k]
     L = implicit_dissipation_coefficient(i, j, k, grid, closure, velocities, tracers, args...)
-    return - L * eᵢ
+    return L * eᵢ
 end
 
 @inline implicit_dissipation_coefficient(i, j, k, grid, closure::FlavorOfCATKE, args...) = zero(grid)

From 8b41f3ac698d360d3d59144a48b6e2aee36a0fe3 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Fri, 28 Apr 2023 23:24:29 -0800
Subject: [PATCH 226/530] Fix vertically implicit solver for general grids plus
 better notation

---
 .../vertically_implicit_diffusion_solver.jl   | 93 ++++++++++---------
 1 file changed, 51 insertions(+), 42 deletions(-)

diff --git a/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl b/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
index e38814e89d..692d8661ac 100644
--- a/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
+++ b/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
@@ -1,4 +1,4 @@
-using Oceananigans.Operators: Δzᵃᵃᶜ, Δzᵃᵃᶠ
+using Oceananigans.Operators: Δzᵃᵃᶜ, Δzᵃᵃᶠ, Δz
 using Oceananigans.AbstractOperations: flip
 using Oceananigans.Solvers: BatchedTridiagonalSolver, solve!
 
@@ -39,58 +39,67 @@ implicit_diffusion_solver(::ExplicitTimeDiscretization, args...; kwargs...) = no
 ##### Note: "ivd" stands for implicit vertical diffusion.
 #####
 
-@inline κ_Δz²(i, j, kᶜ, kᶠ, grid, κ) = κ / Δzᵃᵃᶜ(i, j, kᶜ, grid) / Δzᵃᵃᶠ(i, j, kᶠ, grid)
-
 # Tracers and horizontal velocities at cell centers in z
 
-@inline function ivd_upper_diagonal(i, j, k, grid, closure, K, id, ℓx, ℓy, ::Center, clock, Δt, κz)
-    closure_ij = getclosure(i, j, closure)  
-    κᵏ⁺¹ = κz(i, j, k+1, grid, closure_ij, K, id, clock)
+const c = Center()
+const f = Face()
 
-    return ifelse(k > grid.Nz-1,
-                  zero(grid),
-                  - Δt * κ_Δz²(i, j, k, k+1, grid, κᵏ⁺¹))
+@inline function ivd_upper_diagonal(i, j, k, grid, closure, K, id, ℓx, ℓy, ::Center, clock, Δt, κz)
+    closure_ij = getclosure(i, j, closure)
+    κᵏ⁺¹   = κz(i, j, k+1, grid, closure_ij, K, id, clock)
+    Δzᶜₖ   = Δz(i, j, k,   grid, ℓx, ℓy, c)
+    Δzᶠₖ₊₁ = Δz(i, j, k+1, grid, ℓx, ℓy, f)
+    du     = - Δt * κᵏ⁺¹ / (Δzᶜₖ * Δzᶠₖ₊₁)
+
+    # This conditional ensures the diagonal is correct
+    return ifelse(k > grid.Nz-1, zero(grid), du)
 end
 
-@inline function ivd_lower_diagonal(i, j, k, grid, closure, K, id, ℓx, ℓy, ::Center, clock, Δt, κz)
-    k′ = k + 1 # Shift to adjust for Tridiagonal indexing convenction
+@inline function ivd_lower_diagonal(i, j, k′, grid, closure, K, id, ℓx, ℓy, ::Center, clock, Δt, κz)
+    k = k′ + 2 # Shift index to match LinearAlgebra.Tridiagonal indexing convenction
     closure_ij = getclosure(i, j, closure)  
-    κᵏ⁺¹ = κz(i, j, k′+1, grid, closure_ij, K, id, clock)
-
-    return ifelse(k < 1,
-                  zero(grid),
-                  - Δt * κ_Δz²(i, j, k′, k′, grid, κᵏ⁺¹))
+    κᵏ   = κz(i, j, k, grid, closure_ij, K, id, clock)
+    Δzᶜₖ = Δz(i, j, k, grid, ℓx, ℓy, c)
+    Δzᶠₖ = Δz(i, j, k, grid, ℓx, ℓy, f)
+    dl   = - Δt * κᵏ / (Δzᶜₖ * Δzᶠₖ)
+
+    # This conditional ensures the diagonal is correct: the lower diagonal does not
+    # exist for k′ = 0. (Note we use LinearAlgebra.Tridiagonal indexing convention,
+    # so that lower_diagonal should be defined for k′ = 1 ⋯ N-1).
+    return ifelse(k′ < 1, zero(grid), dl)
 end
 
-# Vertical velocity kernel functions (at cell interfaces in z)
-#
-# Note: these coefficients are specific to vertically-bounded grids (and so is
-# the BatchedTridiagonalSolver).
+#####
+##### Vertical velocity kernel functions (at cell interfaces in z)
+#####
+##### Note: these coefficients are specific to vertically-bounded grids (and so is
+##### the BatchedTridiagonalSolver).
+
 @inline function ivd_upper_diagonal(i, j, k, grid, closure, K, id, ℓx, ℓy, ::Face, clock, Δt, νzᶜᶜᶜ) 
     closure_ij = getclosure(i, j, closure)  
     νᵏ = νzᶜᶜᶜ(i, j, k, grid, closure_ij, K, clock)
-
-    return ifelse(k < 1, # should this be k < 2? #should this be grid.Nz - 1?
-                  zero(grid),
-                  - Δt * κ_Δz²(i, j, k, k, grid, νᵏ))
+    Δzᶜₖ = Δz(i, j, k, grid, ℓx, ℓy, c)
+    Δzᶠₖ = Δz(i, j, k, grid, ℓx, ℓy, f)
+    du   = - Δt * νᵏ / (Δzᶜₖ * Δzᶠₖ)
+    return ifelse(k < 1, zero(grid), du)
 end
 
 @inline function ivd_lower_diagonal(i, j, k, grid, closure, K, id, ℓx, ℓy, ::Face, clock, Δt, νzᶜᶜᶜ)
-    k′ = k + 1 # Shift to adjust for Tridiagonal indexing convenction
+    k′ = k + 2 # Shift to adjust for Tridiagonal indexing convenction
     closure_ij = getclosure(i, j, closure)  
-    νᵏ⁻¹ = νzᶜᶜᶜ(i, j, k′-1, grid, closure_ij, K, clock)
-    return ifelse(k < 1,
-                  zero(grid),
-                  - Δt * κ_Δz²(i, j, k′, k′-1, grid, νᵏ⁻¹))
+    νᵏ⁻¹   = νzᶜᶜᶜ(i, j, k′-1, grid, closure_ij, K, clock)
+    Δzᶜₖ   = Δz(i, j, k′,   grid, ℓx, ℓy, c)
+    Δzᶠₖ₋₁ = Δz(i, j, k′-1, grid, ℓx, ℓy, f)
+    dl     = Δt * νᵏ⁻¹ / (Δzᶜₖ * Δzᶠₖ₋₁)
+    return ifelse(k < 1, zero(grid), dl)
 end
 
 ### Diagonal terms
 
 @inline ivd_diagonal(i, j, k, grid, closure, K, id, ℓx, ℓy, ℓz, clock, Δt, κz) =
-    one(grid) -
-        Δt * _implicit_linear_coefficient(i, j, k+0, grid, closure, K, id, ℓx, ℓy, ℓz, clock, Δt, κz) -
-                      _ivd_upper_diagonal(i, j, k,   grid, closure, K, id, ℓx, ℓy, ℓz, clock, Δt, κz) -
-                      _ivd_lower_diagonal(i, j, k-1, grid, closure, K, id, ℓx, ℓy, ℓz, clock, Δt, κz)
+    one(grid) - Δt * _implicit_linear_coefficient(i, j, k,   grid, closure, K, id, ℓx, ℓy, ℓz, clock, Δt, κz) -
+                              _ivd_upper_diagonal(i, j, k,   grid, closure, K, id, ℓx, ℓy, ℓz, clock, Δt, κz) -
+                              _ivd_lower_diagonal(i, j, k-1, grid, closure, K, id, ℓx, ℓy, ℓz, clock, Δt, κz)
 
 @inline _implicit_linear_coefficient(args...) = implicit_linear_coefficient(args...)
 @inline _ivd_upper_diagonal(args...) = ivd_upper_diagonal(args...)
@@ -100,9 +109,9 @@ end
 ##### Solver constructor
 #####
 
-struct VerticallyImplicitLowerDiagonal end
-struct VerticallyImplicitDiagonal end
-struct VerticallyImplicitUpperDiagonal end
+struct VerticallyImplicitDiffusionLowerDiagonal end
+struct VerticallyImplicitDiffusionDiagonal end
+struct VerticallyImplicitDiffusionUpperDiagonal end
 
 """
     implicit_diffusion_solver(::VerticallyImplicitTimeDiscretization, grid)
@@ -129,17 +138,17 @@ function implicit_diffusion_solver(::VerticallyImplicitTimeDiscretization, grid)
                                  "grids that are Bounded in the z-direction.")
 
     z_solver = BatchedTridiagonalSolver(grid;
-                                        lower_diagonal = VerticallyImplicitLowerDiagonal(),
-                                        diagonal       = VerticallyImplicitDiagonal(),
-                                        upper_diagonal = VerticallyImplicitUpperDiagonal())
+                                        lower_diagonal = VerticallyImplicitDiffusionLowerDiagonal(),
+                                        diagonal       = VerticallyImplicitDiffusionDiagonal(),
+                                        upper_diagonal = VerticallyImplicitDiffusionUpperDiagonal())
 
     return z_solver
 end
 
 # Extend `get_coefficient` to retrieve `ivd_diagonal`, `_ivd_lower_diagonal` and `_ivd_upper_diagonal`.
-@inline get_coefficient(i, j, k, grid, ::VerticallyImplicitLowerDiagonal, p, args...) = _ivd_lower_diagonal(i, j, k, grid, args...)
-@inline get_coefficient(i, j, k, grid, ::VerticallyImplicitUpperDiagonal, p, args...) = _ivd_upper_diagonal(i, j, k, grid, args...)
-@inline get_coefficient(i, j, k, grid, ::VerticallyImplicitDiagonal,      p, args...) = ivd_diagonal(i, j, k, grid, args...)
+@inline get_coefficient(i, j, k, grid, ::VerticallyImplicitDiffusionLowerDiagonal, p, args...) = _ivd_lower_diagonal(i, j, k, grid, args...)
+@inline get_coefficient(i, j, k, grid, ::VerticallyImplicitDiffusionUpperDiagonal, p, args...) = _ivd_upper_diagonal(i, j, k, grid, args...)
+@inline get_coefficient(i, j, k, grid, ::VerticallyImplicitDiffusionDiagonal,      p, args...) = ivd_diagonal(i, j, k, grid, args...)
 
 #####
 ##### Implicit step functions

From 331911023bd05eced7d3711fcc0103d38f9aef0a Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Fri, 28 Apr 2023 23:24:49 -0800
Subject: [PATCH 227/530] Cosmetics

---
 src/ImmersedBoundaries/abstract_grid_fitted_boundary.jl | 4 ++--
 src/Solvers/batched_tridiagonal_solver.jl               | 5 ++---
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/ImmersedBoundaries/abstract_grid_fitted_boundary.jl b/src/ImmersedBoundaries/abstract_grid_fitted_boundary.jl
index 3a788263f2..219d41e2cb 100644
--- a/src/ImmersedBoundaries/abstract_grid_fitted_boundary.jl
+++ b/src/ImmersedBoundaries/abstract_grid_fitted_boundary.jl
@@ -13,8 +13,8 @@ const GFIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:Abstract
 ##### Same goes for the face solver, where we check at centers k in both Upper and lower diagonal
 #####
 
-@inline immersed_ivd_peripheral_node(i, j, k, ibg, LX, LY, ::Center) = immersed_peripheral_node(i, j, k+1, ibg, LX, LY, Face())
-@inline immersed_ivd_peripheral_node(i, j, k, ibg, LX, LY, ::Face)   = immersed_peripheral_node(i, j, k,   ibg, LX, LY, Center())
+@inline immersed_ivd_peripheral_node(i, j, k, ibg, ℓx, ℓy, ::Center) = immersed_peripheral_node(i, j, k+1, ibg, ℓx, ℓy, Face())
+@inline immersed_ivd_peripheral_node(i, j, k, ibg, ℓx, ℓy, ::Face)   = immersed_peripheral_node(i, j, k,   ibg, ℓx, ℓy, Center())
 
 # Extend the upper and lower diagonal functions of the batched tridiagonal solver
 
diff --git a/src/Solvers/batched_tridiagonal_solver.jl b/src/Solvers/batched_tridiagonal_solver.jl
index d3a24255a8..98280bfbb1 100644
--- a/src/Solvers/batched_tridiagonal_solver.jl
+++ b/src/Solvers/batched_tridiagonal_solver.jl
@@ -24,9 +24,9 @@ architecture(solver::BatchedTridiagonalSolver) = architecture(solver.grid)
 
 Construct a solver for batched tridiagonal systems on `grid` of the form
 
-                           bⁱʲ¹ ϕⁱʲ¹ + cⁱʲ¹ ϕⁱʲ²   = fⁱʲ¹,  k = 1
+                           bⁱʲ¹ ϕⁱʲ¹ + cⁱʲ¹ ϕⁱʲ²   = fⁱʲ¹,
            aⁱʲᵏ⁻¹ ϕⁱʲᵏ⁻¹ + bⁱʲᵏ ϕⁱʲᵏ + cⁱʲᵏ ϕⁱʲᵏ⁺¹ = fⁱʲᵏ,  k = 2, ..., N-1
-           aⁱʲᴺ⁻¹ ϕⁱʲᴺ⁻¹ + bⁱʲᴺ ϕⁱʲᴺ               = fⁱʲᴺ,  k = N
+           aⁱʲᴺ⁻¹ ϕⁱʲᴺ⁻¹ + bⁱʲᴺ ϕⁱʲᴺ               = fⁱʲᴺ,
 
 where `a` is the `lower_diagonal`, `b` is the `diagonal`, and `c` is the `upper_diagonal`.
 `ϕ` is the solution and `f` is the right hand side source term passed to `solve!(ϕ, tridiagonal_solver, f)`
@@ -54,7 +54,6 @@ end
 
 """
     solve!(ϕ, solver::BatchedTridiagonalSolver, rhs, args...)
-                                      
 
 Solve the batched tridiagonal system of linear equations with right hand side
 `rhs` and lower diagonal, diagonal, and upper diagonal coefficients described by the

From 80482a331291da1c3e2037fc3afdc7201f443002 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Sat, 29 Apr 2023 08:38:51 -0800
Subject: [PATCH 228/530] Hmm

---
 .../vertically_implicit_diffusion_solver.jl                   | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl b/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
index 692d8661ac..fe00df83bb 100644
--- a/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
+++ b/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
@@ -56,9 +56,9 @@ const f = Face()
 end
 
 @inline function ivd_lower_diagonal(i, j, k′, grid, closure, K, id, ℓx, ℓy, ::Center, clock, Δt, κz)
-    k = k′ + 2 # Shift index to match LinearAlgebra.Tridiagonal indexing convenction
+    k = k′ + 1 # Shift index to match LinearAlgebra.Tridiagonal indexing convenction
     closure_ij = getclosure(i, j, closure)  
-    κᵏ   = κz(i, j, k, grid, closure_ij, K, id, clock)
+    κᵏ   = κz(i, j, k+1, grid, closure_ij, K, id, clock)
     Δzᶜₖ = Δz(i, j, k, grid, ℓx, ℓy, c)
     Δzᶠₖ = Δz(i, j, k, grid, ℓx, ℓy, f)
     dl   = - Δt * κᵏ / (Δzᶜₖ * Δzᶠₖ)

From 55ce41f48cd9fdc6b3f6c589322060908e535436 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Sat, 29 Apr 2023 08:51:28 -0800
Subject: [PATCH 229/530] Add missing import for CATKE

---
 .../CATKEVerticalDiffusivities.jl                    | 12 +++++++-----
 .../vertically_implicit_diffusion_solver.jl          |  2 +-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
index 0fa958db8c..73ce6d08eb 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
@@ -38,6 +38,8 @@ import Oceananigans.TurbulenceClosures:
     implicit_linear_coefficient,
     viscosity,
     diffusivity,
+    viscosity_location,
+    diffusivity_location,
     diffusive_flux_x,
     diffusive_flux_y,
     diffusive_flux_z
@@ -204,8 +206,11 @@ function DiffusivityFields(grid, tracer_names, bcs, closure::FlavorOfCATKE)
     return (; κᵘ, κᶜ, κᵉ, Lᵉ, _tupled_tracer_diffusivities, _tupled_implicit_linear_coefficients)
 end        
 
-@inline viscosity_location(::FlavorOfCATKE) = (Center(), Center(), Face())
-@inline diffusivity_location(::FlavorOfCATKE) = (Center(), Center(), Face())
+const c = Center()
+const f = Face()
+
+@inline viscosity_location(::FlavorOfCATKE) = (c, c, f)
+@inline diffusivity_location(::FlavorOfCATKE) = (c, c, f)
 
 @inline clip(x) = max(zero(x), x)
 
@@ -226,9 +231,6 @@ function calculate_diffusivities!(diffusivities, closure::FlavorOfCATKE, model)
     return nothing
 end
 
-const c = Center()
-const f = Face()
-
 @kernel function calculate_CATKE_diffusivities!(diffusivities, grid, closure::FlavorOfCATKE, velocities, tracers, buoyancy, clock, top_tracer_bcs)
     i, j, k = @index(Global, NTuple)
 
diff --git a/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl b/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
index fe00df83bb..e00c1f2344 100644
--- a/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
+++ b/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
@@ -58,7 +58,7 @@ end
 @inline function ivd_lower_diagonal(i, j, k′, grid, closure, K, id, ℓx, ℓy, ::Center, clock, Δt, κz)
     k = k′ + 1 # Shift index to match LinearAlgebra.Tridiagonal indexing convenction
     closure_ij = getclosure(i, j, closure)  
-    κᵏ   = κz(i, j, k+1, grid, closure_ij, K, id, clock)
+    κᵏ   = κz(i, j, k, grid, closure_ij, K, id, clock)
     Δzᶜₖ = Δz(i, j, k, grid, ℓx, ℓy, c)
     Δzᶠₖ = Δz(i, j, k, grid, ℓx, ℓy, f)
     dl   = - Δt * κᵏ / (Δzᶜₖ * Δzᶠₖ)

From d81e0275cc48912dc72caa3a7ff46d17ecc44b32 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Sat, 29 Apr 2023 13:31:17 -0800
Subject: [PATCH 230/530] Add tests

---
 test/dependencies_for_runtests.jl |  3 ++-
 test/test_turbulence_closures.jl  | 42 ++++++++++++++++++++-----------
 2 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/test/dependencies_for_runtests.jl b/test/dependencies_for_runtests.jl
index 80a2398543..7a8d7ae933 100644
--- a/test/dependencies_for_runtests.jl
+++ b/test/dependencies_for_runtests.jl
@@ -60,7 +60,8 @@ closures = (
     :TwoDimensionalLeith,
     :SmagorinskyLilly,
     :AnisotropicMinimumDissipation,
-    :ConvectiveAdjustmentVerticalDiffusivity
+    :ConvectiveAdjustmentVerticalDiffusivity,
+    :RiBasedVerticalDiffusivity,
 )
 
 #####
diff --git a/test/test_turbulence_closures.jl b/test/test_turbulence_closures.jl
index eaa765fbd5..fbc6e11c57 100644
--- a/test/test_turbulence_closures.jl
+++ b/test/test_turbulence_closures.jl
@@ -1,8 +1,13 @@
 include("dependencies_for_runtests.jl")
 
-using Oceananigans.TurbulenceClosures.CATKEVerticalDiffusivities: CATKEVerticalDiffusivity
+using Oceananigans.TurbulenceClosures: CATKEVerticalDiffusivity
+
+using Oceananigans.TurbulenceClosures: viscosity_location, diffusivity_location
+
 using Oceananigans.TurbulenceClosures: diffusive_flux_x, diffusive_flux_y, diffusive_flux_z,
-                                       viscous_flux_ux, viscous_flux_uy, viscous_flux_uz
+                                       viscous_flux_ux, viscous_flux_uy, viscous_flux_uz,
+                                       viscous_flux_vx, viscous_flux_vy, viscous_flux_vz,
+                                       viscous_flux_wx, viscous_flux_wy, viscous_flux_wz
 
 for closure in closures
     @eval begin
@@ -10,26 +15,21 @@ for closure in closures
     end
 end
 
-function constant_isotropic_diffusivity_basic(T=Float64; ν=T(0.3), κ=T(0.7))
-    closure = ScalarDiffusivity(T; κ=(T=κ, S=κ), ν=ν)
-    return closure.ν == ν && closure.κ.T == κ
-end
-
 function tracer_specific_horizontal_diffusivity(T=Float64; νh=T(0.3), κh=T(0.7))
     closure = HorizontalScalarDiffusivity(κ=(T=κh, S=κh), ν=νh)
     return closure.ν == νh && closure.κ.T == κh && closure.κ.T == κh
 end
 
 function run_constant_isotropic_diffusivity_fluxdiv_tests(FT=Float64; ν=FT(0.3), κ=FT(0.7))
-          arch = CPU()
-       closure = ScalarDiffusivity(FT, κ=(T=κ, S=κ), ν=ν)
-          grid = RectilinearGrid(FT, size=(3, 1, 4), extent=(3, 1, 4))
+    arch       = CPU()
+    closure    = ScalarDiffusivity(FT, κ=(T=κ, S=κ), ν=ν)
+    grid       = RectilinearGrid(FT, size=(3, 1, 4), extent=(3, 1, 4))
     velocities = VelocityFields(grid)
-       tracers = TracerFields((:T, :S), grid)
-         clock = Clock(time=0.0)
+    tracers    = TracerFields((:T, :S), grid)
+    clock      = Clock(time=0.0)
 
     u, v, w = velocities
-       T, S = tracers
+    T, S = tracers
 
     for k in 1:4
         interior(u)[:, 1, k] .= [0, -1/2, 0]
@@ -118,6 +118,7 @@ function time_step_with_variable_anisotropic_diffusivity(arch)
         model = NonhydrostaticModel(grid=RectilinearGrid(arch, size=(1, 1, 1), extent=(1, 2, 3)), closure=clo)
         time_step!(model, 1, euler=true)
     end
+
     return true
 end
 
@@ -212,12 +213,25 @@ end
             @test ν_dx_u[1, 1, 1] == 0.0
             @test κ_dx_c[1, 1, 1] == 0.0
         end
+
+        c = Center()
+        f = Face()
+        ri_based = RiBasedVerticalDiffusivity()
+        @test viscosity_location(catke) == (c, c, f)
+        @test diffusivity_location(catke) == (c, c, f)
+
+        catke = CATKEVerticalDiffusivity()
+        @test viscosity_location(catke) == (c, c, f)
+        @test diffusivity_location(catke) == (c, c, f)
     end
 
     @testset "ScalarDiffusivity" begin
         @info "  Testing ScalarDiffusivity..."
         for T in float_types
-            @test constant_isotropic_diffusivity_basic(T)
+            ν, κ = 0.3, 0.7
+            closure = ScalarDiffusivity(T; κ=(T=κ, S=κ), ν=ν)
+            @test closure.ν == T(ν)
+            @test closure.κ.T == T(κ)
             run_constant_isotropic_diffusivity_fluxdiv_tests(T)
         end
     end

From 74dd3d12ceaaa9be78e25e726afb6ff7a6df9e14 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Sat, 29 Apr 2023 13:31:46 -0800
Subject: [PATCH 231/530] More readable

---
 .../abstract_scalar_diffusivity_closure.jl                  | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/TurbulenceClosures/abstract_scalar_diffusivity_closure.jl b/src/TurbulenceClosures/abstract_scalar_diffusivity_closure.jl
index 3f6ad1a504..dc32e9b3af 100644
--- a/src/TurbulenceClosures/abstract_scalar_diffusivity_closure.jl
+++ b/src/TurbulenceClosures/abstract_scalar_diffusivity_closure.jl
@@ -55,9 +55,11 @@ Returns the scalar diffusivity associated with `closure` and `tracer_index`.
 """
 function diffusivity end 
 
+const c = Center()
+
 # Fallback locations
-@inline viscosity_location(::AbstractScalarDiffusivity) = (Center(), Center(), Center())
-@inline diffusivity_location(::AbstractScalarDiffusivity) = (Center(), Center(), Center())
+@inline viscosity_location(::AbstractScalarDiffusivity) = (c, c, c)
+@inline diffusivity_location(::AbstractScalarDiffusivity) = (c, c, c)
 
 # For tuples (note that kernel functions are "untupled", so these are for the user API)
 viscosity(closure::Tuple, K) = Tuple(viscosity(closure[n], K[n]) for n = 1:length(closure))

From 9dcf36adf10b71c6b8573613daaf752913c34ddf Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Sat, 29 Apr 2023 13:33:44 -0800
Subject: [PATCH 232/530] Help readability in ri-based diffusivity

---
 .../ri_based_vertical_diffusivity.jl                        | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index 5153c5c2fd..87ed0bb5e4 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -105,9 +105,11 @@ RiBasedVerticalDiffusivity(FT::DataType; kw...) =
 const RBVD = RiBasedVerticalDiffusivity
 const RBVDArray = AbstractArray{<:RBVD}
 const FlavorOfRBVD = Union{RBVD, RBVDArray}
+const c = Center()
+const f = Face()
 
-@inline viscosity_location(::FlavorOfRBVD)   = (Center(), Center(), Face())
-@inline diffusivity_location(::FlavorOfRBVD) = (Center(), Center(), Face())
+@inline viscosity_location(::FlavorOfRBVD)   = (c, c, f)
+@inline diffusivity_location(::FlavorOfRBVD) = (c, c, f)
 
 @inline viscosity(::FlavorOfRBVD, diffusivities) = diffusivities.κᵘ
 @inline diffusivity(::FlavorOfRBVD, diffusivities, id) = diffusivities.κᶜ

From 051e7ede42678a7fe0194956269c0ac2267058aa Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Sat, 29 Apr 2023 13:36:05 -0800
Subject: [PATCH 233/530] Dont test RiBased generically

---
 test/dependencies_for_runtests.jl | 1 -
 test/test_turbulence_closures.jl  | 6 +++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/test/dependencies_for_runtests.jl b/test/dependencies_for_runtests.jl
index 7a8d7ae933..d51dac156a 100644
--- a/test/dependencies_for_runtests.jl
+++ b/test/dependencies_for_runtests.jl
@@ -61,7 +61,6 @@ closures = (
     :SmagorinskyLilly,
     :AnisotropicMinimumDissipation,
     :ConvectiveAdjustmentVerticalDiffusivity,
-    :RiBasedVerticalDiffusivity,
 )
 
 #####
diff --git a/test/test_turbulence_closures.jl b/test/test_turbulence_closures.jl
index fbc6e11c57..b6cc0b4049 100644
--- a/test/test_turbulence_closures.jl
+++ b/test/test_turbulence_closures.jl
@@ -1,6 +1,6 @@
 include("dependencies_for_runtests.jl")
 
-using Oceananigans.TurbulenceClosures: CATKEVerticalDiffusivity
+using Oceananigans.TurbulenceClosures: CATKEVerticalDiffusivity, RiBasedVerticalDiffusivity
 
 using Oceananigans.TurbulenceClosures: viscosity_location, diffusivity_location
 
@@ -217,8 +217,8 @@ end
         c = Center()
         f = Face()
         ri_based = RiBasedVerticalDiffusivity()
-        @test viscosity_location(catke) == (c, c, f)
-        @test diffusivity_location(catke) == (c, c, f)
+        @test viscosity_location(ri_based) == (c, c, f)
+        @test diffusivity_location(ri_based) == (c, c, f)
 
         catke = CATKEVerticalDiffusivity()
         @test viscosity_location(catke) == (c, c, f)

From c3aab99218c251798fdd8b79104b52afc4e6df77 Mon Sep 17 00:00:00 2001
From: "Navid C. Constantinou" <navidcy@users.noreply.github.com>
Date: Sun, 30 Apr 2023 09:41:54 +1000
Subject: [PATCH 234/530] add note about convention in BatchedTridiagonalSolver
 doc

---
 src/Solvers/batched_tridiagonal_solver.jl | 34 ++++++++++++++++++-----
 1 file changed, 27 insertions(+), 7 deletions(-)

diff --git a/src/Solvers/batched_tridiagonal_solver.jl b/src/Solvers/batched_tridiagonal_solver.jl
index 98280bfbb1..5c1e77a420 100644
--- a/src/Solvers/batched_tridiagonal_solver.jl
+++ b/src/Solvers/batched_tridiagonal_solver.jl
@@ -20,22 +20,42 @@ architecture(solver::BatchedTridiagonalSolver) = architecture(solver.grid)
 
 
 """
-    BatchedTridiagonalSolver(grid; lower_diagonal, diagonal, upper_diagonal, parameters=nothing)
+    BatchedTridiagonalSolver(grid;
+                             lower_diagonal,
+                             diagonal,
+                             upper_diagonal,
+                             scratch = arch_array(architecture(grid), zeros(eltype(grid), size(grid)...)),
+                             parameters = nothing)
 
 Construct a solver for batched tridiagonal systems on `grid` of the form
 
-                           bⁱʲ¹ ϕⁱʲ¹ + cⁱʲ¹ ϕⁱʲ²   = fⁱʲ¹,
-           aⁱʲᵏ⁻¹ ϕⁱʲᵏ⁻¹ + bⁱʲᵏ ϕⁱʲᵏ + cⁱʲᵏ ϕⁱʲᵏ⁺¹ = fⁱʲᵏ,  k = 2, ..., N-1
-           aⁱʲᴺ⁻¹ ϕⁱʲᴺ⁻¹ + bⁱʲᴺ ϕⁱʲᴺ               = fⁱʲᴺ,
+```
+                    bⁱʲ¹ ϕⁱʲ¹ + cⁱʲ¹ ϕⁱʲ²   = fⁱʲ¹,
+    aⁱʲᵏ⁻¹ ϕⁱʲᵏ⁻¹ + bⁱʲᵏ ϕⁱʲᵏ + cⁱʲᵏ ϕⁱʲᵏ⁺¹ = fⁱʲᵏ,  k = 2, ..., N-1
+    aⁱʲᴺ⁻¹ ϕⁱʲᴺ⁻¹ + bⁱʲᴺ ϕⁱʲᴺ               = fⁱʲᴺ,
+```
+or in matrix form
+```
+    ⎡ bⁱʲ¹   cⁱʲ¹     0       ⋯         0   ⎤ ⎡ ϕⁱʲ¹ ⎤   ⎡ fⁱʲ¹ ⎤
+    ⎢ aⁱʲ¹   bⁱʲ²   cⁱʲ²      0    ⋯    ⋮   ⎥ ⎢ ϕⁱʲ² ⎥   ⎢ fⁱʲ² ⎥
+    ⎢  0      ⋱      ⋱       ⋱              ⎥ ⎢   .  ⎥   ⎢   .  ⎥
+    ⎢  ⋮                                0   ⎥ ⎢ ϕⁱʲᵏ ⎥   ⎢ fⁱʲᵏ ⎥
+    ⎢  ⋮           aⁱʲᴺ⁻²   bⁱʲᴺ⁻¹   cⁱʲᴺ⁻¹ ⎥ ⎢      ⎥   ⎢   .  ⎥
+    ⎣  0      ⋯      0      aⁱʲᴺ⁻¹    bⁱʲᴺ  ⎦ ⎣ ϕⁱʲᴺ ⎦   ⎣ fⁱʲᴺ ⎦
+```
 
 where `a` is the `lower_diagonal`, `b` is the `diagonal`, and `c` is the `upper_diagonal`.
-`ϕ` is the solution and `f` is the right hand side source term passed to `solve!(ϕ, tridiagonal_solver, f)`
+
+Note the convention used here for indexing the upper and lower diagonals; this can be different from 
+other implementations where, e.g., `aⁱʲ²` may appear for `k = 2` instead of `aⁱʲ¹` as above.
+
+`ϕ` is the solution and `f` is the right hand side source term passed to `solve!(ϕ, tridiagonal_solver, f)`.
 
 `a`, `b`, `c`, and `f` can be specified in three ways:
 
-1. A 1D array means that `aⁱʲᵏ = a[k]`.
+1. A 1D array means, e.g., that `aⁱʲᵏ = a[k]`.
 
-2. A 3D array means that `aⁱʲᵏ = a[i, j, k]`.
+2. A 3D array means, e.g., that `aⁱʲᵏ = a[i, j, k]`.
 
 Other coefficient types can be used by extending `get_coefficient`.
 """

From 97030cef95b8e74f4ae0c69f3b3d30fc72c26f07 Mon Sep 17 00:00:00 2001
From: "Navid C. Constantinou" <navidcy@users.noreply.github.com>
Date: Sun, 30 Apr 2023 09:45:11 +1000
Subject: [PATCH 235/530] clarifications

---
 src/Solvers/batched_tridiagonal_solver.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Solvers/batched_tridiagonal_solver.jl b/src/Solvers/batched_tridiagonal_solver.jl
index 5c1e77a420..13898faf8f 100644
--- a/src/Solvers/batched_tridiagonal_solver.jl
+++ b/src/Solvers/batched_tridiagonal_solver.jl
@@ -47,7 +47,7 @@ or in matrix form
 where `a` is the `lower_diagonal`, `b` is the `diagonal`, and `c` is the `upper_diagonal`.
 
 Note the convention used here for indexing the upper and lower diagonals; this can be different from 
-other implementations where, e.g., `aⁱʲ²` may appear for `k = 2` instead of `aⁱʲ¹` as above.
+other implementations where, e.g., `aⁱʲ²` may appear at the second row, instead of `aⁱʲ¹` as above.
 
 `ϕ` is the solution and `f` is the right hand side source term passed to `solve!(ϕ, tridiagonal_solver, f)`.
 

From 18e07b9c851770234d5d7e8630bd4ddcde3819d4 Mon Sep 17 00:00:00 2001
From: "Navid C. Constantinou" <navidcy@users.noreply.github.com>
Date: Sun, 30 Apr 2023 09:59:46 +1000
Subject: [PATCH 236/530] add remark about different notation in Press

---
 src/Solvers/batched_tridiagonal_solver.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/Solvers/batched_tridiagonal_solver.jl b/src/Solvers/batched_tridiagonal_solver.jl
index 13898faf8f..1ab7b4d05f 100644
--- a/src/Solvers/batched_tridiagonal_solver.jl
+++ b/src/Solvers/batched_tridiagonal_solver.jl
@@ -82,7 +82,8 @@ TriDiagonal Matrix Algorithm (TDMA).
 
 The result is stored in `ϕ` which must have size `(grid.Nx, grid.Ny, grid.Nz)`.
 
-Reference implementation per Numerical Recipes, Press et. al 1992 (§ 2.4).
+Reference implementation per Numerical Recipes, Press et al. 1992 (§ 2.4). Note that
+we use slightly different notation from Press et al.; see [`BatchedTridiagonalSolver`](@ref).
 """
 function solve!(ϕ, solver::BatchedTridiagonalSolver, rhs, args... )
 

From 09c65eb1558fb36a0b20f5d6dfca32776a03a381 Mon Sep 17 00:00:00 2001
From: "Navid C. Constantinou" <navidcy@users.noreply.github.com>
Date: Sun, 30 Apr 2023 10:00:44 +1000
Subject: [PATCH 237/530] clarifications

---
 src/Solvers/batched_tridiagonal_solver.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/Solvers/batched_tridiagonal_solver.jl b/src/Solvers/batched_tridiagonal_solver.jl
index 1ab7b4d05f..f86aabf50e 100644
--- a/src/Solvers/batched_tridiagonal_solver.jl
+++ b/src/Solvers/batched_tridiagonal_solver.jl
@@ -83,7 +83,8 @@ TriDiagonal Matrix Algorithm (TDMA).
 The result is stored in `ϕ` which must have size `(grid.Nx, grid.Ny, grid.Nz)`.
 
 Reference implementation per Numerical Recipes, Press et al. 1992 (§ 2.4). Note that
-we use slightly different notation from Press et al.; see [`BatchedTridiagonalSolver`](@ref).
+a slightly different notation from Press et al. is used for indexing the off-diagonal
+elements; see [`BatchedTridiagonalSolver`](@ref).
 """
 function solve!(ϕ, solver::BatchedTridiagonalSolver, rhs, args... )
 

From 75eb47a2ea47753ce7f54cdedd859486a9be78ad Mon Sep 17 00:00:00 2001
From: "Navid C. Constantinou" <navidcy@users.noreply.github.com>
Date: Sun, 30 Apr 2023 10:04:39 +1000
Subject: [PATCH 238/530] minor beautification tweaks

---
 src/Solvers/batched_tridiagonal_solver.jl | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/Solvers/batched_tridiagonal_solver.jl b/src/Solvers/batched_tridiagonal_solver.jl
index f86aabf50e..18ebe23ab3 100644
--- a/src/Solvers/batched_tridiagonal_solver.jl
+++ b/src/Solvers/batched_tridiagonal_solver.jl
@@ -3,7 +3,7 @@ using Oceananigans.Architectures: arch_array
 import Oceananigans.Architectures: architecture
 
 """
-    BatchedTridiagonalSolver
+    struct BatchedTridiagonalSolver{A, B, C, T, G, P}
 
 A batched solver for large numbers of triadiagonal systems.
 """
@@ -86,7 +86,7 @@ Reference implementation per Numerical Recipes, Press et al. 1992 (§ 2.4). Note
 a slightly different notation from Press et al. is used for indexing the off-diagonal
 elements; see [`BatchedTridiagonalSolver`](@ref).
 """
-function solve!(ϕ, solver::BatchedTridiagonalSolver, rhs, args... )
+function solve!(ϕ, solver::BatchedTridiagonalSolver, rhs, args...)
 
     a, b, c, t, parameters = solver.a, solver.b, solver.c, solver.t, solver.parameters
     grid = solver.grid
@@ -101,7 +101,7 @@ end
 @inline float_eltype(ϕ::AbstractArray{<:Complex{T}}) where T <: AbstractFloat = T
 
 @kernel function solve_batched_tridiagonal_system_kernel!(ϕ, a, b, c, f, t, grid, p, args)
-    Nx, Ny, Nz = size(grid)
+    _, _, Nz = size(grid)
 
     i, j = @index(Global, NTuple)
 
@@ -121,7 +121,7 @@ end
             fᵏ = get_coefficient(i, j, k, grid, f, p, args...)
             
             # If the problem is not diagonally-dominant such that `β ≈ 0`,
-            # the algorithm is unstable and we elide the forward pass update of ϕ.
+            # the algorithm is unstable and we elide the forward pass update of `ϕ`.
             definitely_diagonally_dominant = abs(β) > 10 * eps(float_eltype(ϕ))
             !definitely_diagonally_dominant && break
             ϕ[i, j, k] = (fᵏ - aᵏ⁻¹ * ϕ[i, j, k-1]) / β
@@ -132,4 +132,3 @@ end
         end
     end
 end
-

From 7568f0bc97f4619ff4ca9a4bbd5d33c2fe28a436 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Sun, 30 Apr 2023 07:18:29 -0800
Subject: [PATCH 239/530] Add back Manifest from main

---
 Manifest.toml | 728 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 728 insertions(+)
 create mode 100644 Manifest.toml

diff --git a/Manifest.toml b/Manifest.toml
new file mode 100644
index 0000000000..bb35a50bd5
--- /dev/null
+++ b/Manifest.toml
@@ -0,0 +1,728 @@
+# This file is machine-generated - editing it directly is not advised
+
+julia_version = "1.8.5"
+manifest_format = "2.0"
+
+[[deps.AbstractFFTs]]
+deps = ["ChainRulesCore", "LinearAlgebra"]
+git-tree-sha1 = "16b6dbc4cf7caee4e1e75c49485ec67b667098a0"
+uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
+version = "1.3.1"
+
+[[deps.Adapt]]
+deps = ["LinearAlgebra", "Requires"]
+git-tree-sha1 = "cc37d689f599e8df4f464b2fa3870ff7db7492ef"
+uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
+version = "3.6.1"
+
+[[deps.ArgTools]]
+uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
+version = "1.1.1"
+
+[[deps.ArrayInterface]]
+deps = ["Adapt", "LinearAlgebra", "Requires", "SnoopPrecompile", "SparseArrays", "SuiteSparse"]
+git-tree-sha1 = "38911c7737e123b28182d89027f4216cfc8a9da7"
+uuid = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
+version = "7.4.3"
+
+[[deps.Artifacts]]
+uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
+
+[[deps.Atomix]]
+deps = ["UnsafeAtomics"]
+git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be"
+uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
+version = "0.1.0"
+
+[[deps.BFloat16s]]
+deps = ["LinearAlgebra", "Printf", "Random", "Test"]
+git-tree-sha1 = "dbf84058d0a8cbbadee18d25cf606934b22d7c66"
+uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
+version = "0.4.2"
+
+[[deps.Base64]]
+uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
+
+[[deps.CEnum]]
+git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90"
+uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
+version = "0.4.2"
+
+[[deps.CFTime]]
+deps = ["Dates", "Printf"]
+git-tree-sha1 = "ed2e76c1c3c43fd9d0cb9248674620b29d71f2d1"
+uuid = "179af706-886a-5703-950a-314cd64e0468"
+version = "0.1.2"
+
+[[deps.CUDA]]
+deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Preferences", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "UnsafeAtomicsLLVM"]
+git-tree-sha1 = "e4fe4652b2fab0e766053ffb4ccf7a39ecb36254"
+uuid = "052768ef-5323-5732-b1bb-66c8b64840ba"
+version = "4.1.2"
+
+[[deps.CUDA_Driver_jll]]
+deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"]
+git-tree-sha1 = "10ca2b63b496edc09258b3de5d1aa64094b18b1d"
+uuid = "4ee394cb-3365-5eb0-8335-949819d2adfc"
+version = "0.5.0+0"
+
+[[deps.CUDA_Runtime_Discovery]]
+deps = ["Libdl"]
+git-tree-sha1 = "6c8fceaaa6850dea627288ac3bb86fdcdf05e326"
+uuid = "1af6417a-86b4-443c-805f-a4643ffb695f"
+version = "0.2.0"
+
+[[deps.CUDA_Runtime_jll]]
+deps = ["Artifacts", "CUDA_Driver_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
+git-tree-sha1 = "802b1f2220fd43251d343219adf478e6b7992bd4"
+uuid = "76a88914-d11a-5bdc-97e0-2f5a05c973a2"
+version = "0.5.0+0"
+
+[[deps.ChainRulesCore]]
+deps = ["Compat", "LinearAlgebra", "SparseArrays"]
+git-tree-sha1 = "c6d890a52d2c4d55d326439580c3b8d0875a77d9"
+uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
+version = "1.15.7"
+
+[[deps.ChangesOfVariables]]
+deps = ["ChainRulesCore", "LinearAlgebra", "Test"]
+git-tree-sha1 = "485193efd2176b88e6622a39a246f8c5b600e74e"
+uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
+version = "0.1.6"
+
+[[deps.CommonDataModel]]
+deps = ["CFTime", "DataStructures", "Dates", "Preferences", "Printf"]
+git-tree-sha1 = "246cf98b1422f984dd3abc11834c64e83d7bf832"
+uuid = "1fbeeb36-5f17-413c-809b-666fb144f157"
+version = "0.2.1"
+
+[[deps.Compat]]
+deps = ["Dates", "LinearAlgebra", "UUIDs"]
+git-tree-sha1 = "7a60c856b9fa189eb34f5f8a6f6b5529b7942957"
+uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
+version = "4.6.1"
+
+[[deps.CompilerSupportLibraries_jll]]
+deps = ["Artifacts", "Libdl"]
+uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
+version = "1.0.1+0"
+
+[[deps.Crayons]]
+git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15"
+uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
+version = "4.1.1"
+
+[[deps.CubedSphere]]
+deps = ["Elliptic", "FFTW", "Printf", "ProgressBars", "SpecialFunctions", "TaylorSeries", "Test"]
+git-tree-sha1 = "db9c12cb765cc048e158987388287c52baddf57d"
+uuid = "7445602f-e544-4518-8976-18f8e8ae6cdb"
+version = "0.2.2"
+
+[[deps.DataAPI]]
+git-tree-sha1 = "e8119c1a33d267e16108be441a287a6981ba1630"
+uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
+version = "1.14.0"
+
+[[deps.DataStructures]]
+deps = ["Compat", "InteractiveUtils", "OrderedCollections"]
+git-tree-sha1 = "d1fff3a548102f48987a52a2e0d114fa97d730f0"
+uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
+version = "0.18.13"
+
+[[deps.DataValueInterfaces]]
+git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6"
+uuid = "e2d170a0-9d28-54be-80f0-106bbe20a464"
+version = "1.0.0"
+
+[[deps.Dates]]
+deps = ["Printf"]
+uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
+
+[[deps.Distributed]]
+deps = ["Random", "Serialization", "Sockets"]
+uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
+
+[[deps.DocStringExtensions]]
+deps = ["LibGit2"]
+git-tree-sha1 = "2fb1e02f2b635d0845df5d7c167fec4dd739b00d"
+uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
+version = "0.9.3"
+
+[[deps.Downloads]]
+deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"]
+uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
+version = "1.6.0"
+
+[[deps.Elliptic]]
+git-tree-sha1 = "71c79e77221ab3a29918aaf6db4f217b89138608"
+uuid = "b305315f-e792-5b7a-8f41-49f472929428"
+version = "1.0.1"
+
+[[deps.ExprTools]]
+git-tree-sha1 = "c1d06d129da9f55715c6c212866f5b1bddc5fa00"
+uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
+version = "0.1.9"
+
+[[deps.FFTW]]
+deps = ["AbstractFFTs", "FFTW_jll", "LinearAlgebra", "MKL_jll", "Preferences", "Reexport"]
+git-tree-sha1 = "f9818144ce7c8c41edf5c4c179c684d92aa4d9fe"
+uuid = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
+version = "1.6.0"
+
+[[deps.FFTW_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
+git-tree-sha1 = "c6033cc3892d0ef5bb9cd29b7f2f0331ea5184ea"
+uuid = "f5851436-0d7a-5f13-b9de-f02708fd171a"
+version = "3.3.10+0"
+
+[[deps.FileIO]]
+deps = ["Pkg", "Requires", "UUIDs"]
+git-tree-sha1 = "7be5f99f7d15578798f338f5433b6c432ea8037b"
+uuid = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
+version = "1.16.0"
+
+[[deps.FileWatching]]
+uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"
+
+[[deps.GPUArrays]]
+deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"]
+git-tree-sha1 = "9ade6983c3dbbd492cf5729f865fe030d1541463"
+uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
+version = "8.6.6"
+
+[[deps.GPUArraysCore]]
+deps = ["Adapt"]
+git-tree-sha1 = "1cd7f0af1aa58abc02ea1d872953a97359cb87fa"
+uuid = "46192b85-c4d5-4398-a991-12ede77f4527"
+version = "0.1.4"
+
+[[deps.GPUCompiler]]
+deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"]
+git-tree-sha1 = "fd6431121f31fed05a5386ac88b9bb3f97fdfa69"
+uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
+version = "0.18.0"
+
+[[deps.Glob]]
+git-tree-sha1 = "97285bbd5230dd766e9ef6749b80fc617126d496"
+uuid = "c27321d9-0574-5035-807b-f59d2c89b15c"
+version = "1.3.1"
+
+[[deps.HDF5_jll]]
+deps = ["Artifacts", "JLLWrappers", "LibCURL_jll", "Libdl", "OpenSSL_jll", "Pkg", "Zlib_jll"]
+git-tree-sha1 = "4cc2bb72df6ff40b055295fdef6d92955f9dede8"
+uuid = "0234f1f7-429e-5d53-9886-15a909be8d59"
+version = "1.12.2+2"
+
+[[deps.IfElse]]
+git-tree-sha1 = "debdd00ffef04665ccbb3e150747a77560e8fad1"
+uuid = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173"
+version = "0.1.1"
+
+[[deps.IncompleteLU]]
+deps = ["LinearAlgebra", "SparseArrays"]
+git-tree-sha1 = "6c676e79f98abb6d33fa28122cad099f1e464afe"
+uuid = "40713840-3770-5561-ab4c-a76e7d0d7895"
+version = "0.2.1"
+
+[[deps.IntelOpenMP_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
+git-tree-sha1 = "d979e54b71da82f3a65b62553da4fc3d18c9004c"
+uuid = "1d5cc7b8-4909-519e-a0f8-d0f5ad9712d0"
+version = "2018.0.3+2"
+
+[[deps.InteractiveUtils]]
+deps = ["Markdown"]
+uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
+
+[[deps.InverseFunctions]]
+deps = ["Test"]
+git-tree-sha1 = "49510dfcb407e572524ba94aeae2fced1f3feb0f"
+uuid = "3587e190-3f89-42d0-90ee-14403ec27112"
+version = "0.1.8"
+
+[[deps.IrrationalConstants]]
+git-tree-sha1 = "630b497eafcc20001bba38a4651b327dcfc491d2"
+uuid = "92d709cd-6900-40b7-9082-c6be49f344b6"
+version = "0.2.2"
+
+[[deps.IterativeSolvers]]
+deps = ["LinearAlgebra", "Printf", "Random", "RecipesBase", "SparseArrays"]
+git-tree-sha1 = "1169632f425f79429f245113b775a0e3d121457c"
+uuid = "42fd0dbc-a981-5370-80f2-aaf504508153"
+version = "0.9.2"
+
+[[deps.IteratorInterfaceExtensions]]
+git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856"
+uuid = "82899510-4779-5014-852e-03e436cf321d"
+version = "1.0.0"
+
+[[deps.JLD2]]
+deps = ["FileIO", "MacroTools", "Mmap", "OrderedCollections", "Pkg", "Printf", "Reexport", "Requires", "TranscodingStreams", "UUIDs"]
+git-tree-sha1 = "42c17b18ced77ff0be65957a591d34f4ed57c631"
+uuid = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
+version = "0.4.31"
+
+[[deps.JLLWrappers]]
+deps = ["Preferences"]
+git-tree-sha1 = "abc9885a7ca2052a736a600f7fa66209f96506e1"
+uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
+version = "1.4.1"
+
+[[deps.JSON3]]
+deps = ["Dates", "Mmap", "Parsers", "SnoopPrecompile", "StructTypes", "UUIDs"]
+git-tree-sha1 = "84b10656a41ef564c39d2d477d7236966d2b5683"
+uuid = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
+version = "1.12.0"
+
+[[deps.KernelAbstractions]]
+deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "SnoopPrecompile", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"]
+git-tree-sha1 = "350a880e80004f4d5d82a17f737d8fcdc56c3462"
+uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
+version = "0.9.1"
+
+[[deps.LLVM]]
+deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
+git-tree-sha1 = "f044a2796a9e18e0531b9b3072b0019a61f264bc"
+uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
+version = "4.17.1"
+
+[[deps.LLVMExtra_jll]]
+deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
+git-tree-sha1 = "070e4b5b65827f82c16ae0916376cb47377aa1b5"
+uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
+version = "0.0.18+0"
+
+[[deps.LazyArtifacts]]
+deps = ["Artifacts", "Pkg"]
+uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
+
+[[deps.LibCURL]]
+deps = ["LibCURL_jll", "MozillaCACerts_jll"]
+uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
+version = "0.6.3"
+
+[[deps.LibCURL_jll]]
+deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
+uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
+version = "7.84.0+0"
+
+[[deps.LibGit2]]
+deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
+uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
+
+[[deps.LibSSH2_jll]]
+deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
+uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
+version = "1.10.2+0"
+
+[[deps.Libdl]]
+uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
+
+[[deps.Libiconv_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
+git-tree-sha1 = "c7cb1f5d892775ba13767a87c7ada0b980ea0a71"
+uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531"
+version = "1.16.1+2"
+
+[[deps.LinearAlgebra]]
+deps = ["Libdl", "libblastrampoline_jll"]
+uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+
+[[deps.LogExpFunctions]]
+deps = ["ChainRulesCore", "ChangesOfVariables", "DocStringExtensions", "InverseFunctions", "IrrationalConstants", "LinearAlgebra"]
+git-tree-sha1 = "0a1b7c2863e44523180fdb3146534e265a91870b"
+uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
+version = "0.3.23"
+
+[[deps.Logging]]
+uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
+
+[[deps.MKL_jll]]
+deps = ["Artifacts", "IntelOpenMP_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"]
+git-tree-sha1 = "2ce8695e1e699b68702c03402672a69f54b8aca9"
+uuid = "856f044c-d86e-5d09-b602-aeab76dc8ba7"
+version = "2022.2.0+0"
+
+[[deps.MPI]]
+deps = ["Distributed", "DocStringExtensions", "Libdl", "MPICH_jll", "MPIPreferences", "MPItrampoline_jll", "MicrosoftMPI_jll", "OpenMPI_jll", "Requires", "Serialization", "Sockets"]
+git-tree-sha1 = "6d72bafd3960f9c119ceb8f034fef28346490fe5"
+uuid = "da04e1cc-30fd-572f-bb4f-1f8673147195"
+version = "0.20.8"
+
+[[deps.MPICH_jll]]
+deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "MPIPreferences", "TOML"]
+git-tree-sha1 = "d790fbd913f85e8865c55bf4725aff197c5155c8"
+uuid = "7cb0a576-ebde-5e09-9194-50597f1243b4"
+version = "4.1.1+1"
+
+[[deps.MPIPreferences]]
+deps = ["Libdl", "Preferences"]
+git-tree-sha1 = "71f937129731a29eabe6969db2c90368a4408933"
+uuid = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267"
+version = "0.1.7"
+
+[[deps.MPItrampoline_jll]]
+deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "MPIPreferences", "TOML"]
+git-tree-sha1 = "ad88f863a5a16b3e26d14446afd3cd746266281b"
+uuid = "f1f71cc9-e9ae-5b93-9b94-4fe0e1ad3748"
+version = "5.2.1+3"
+
+[[deps.MacroTools]]
+deps = ["Markdown", "Random"]
+git-tree-sha1 = "42324d08725e200c23d4dfb549e0d5d89dede2d2"
+uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
+version = "0.5.10"
+
+[[deps.Markdown]]
+deps = ["Base64"]
+uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
+
+[[deps.MbedTLS_jll]]
+deps = ["Artifacts", "Libdl"]
+uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
+version = "2.28.0+0"
+
+[[deps.MicrosoftMPI_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
+git-tree-sha1 = "a8027af3d1743b3bfae34e54872359fdebb31422"
+uuid = "9237b28f-5490-5468-be7b-bb81f5f5e6cf"
+version = "10.1.3+4"
+
+[[deps.Mmap]]
+uuid = "a63ad114-7e13-5084-954f-fe012c677804"
+
+[[deps.MozillaCACerts_jll]]
+uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
+version = "2022.2.1"
+
+[[deps.NCDatasets]]
+deps = ["CFTime", "CommonDataModel", "DataStructures", "Dates", "NetCDF_jll", "NetworkOptions", "Printf"]
+git-tree-sha1 = "afd015e81e60cfbdba04ef59bcdc80e18bd613cd"
+uuid = "85f8d34a-cbdd-5861-8df4-14fed0d494ab"
+version = "0.12.14"
+
+[[deps.NetCDF_jll]]
+deps = ["Artifacts", "HDF5_jll", "JLLWrappers", "LibCURL_jll", "Libdl", "Pkg", "XML2_jll", "Zlib_jll"]
+git-tree-sha1 = "072f8371f74c3b9e1b26679de7fbf059d45ea221"
+uuid = "7243133f-43d8-5620-bbf4-c2c921802cf3"
+version = "400.902.5+1"
+
+[[deps.NetworkOptions]]
+uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
+version = "1.2.0"
+
+[[deps.OffsetArrays]]
+deps = ["Adapt"]
+git-tree-sha1 = "82d7c9e310fe55aa54996e6f7f94674e2a38fcb4"
+uuid = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
+version = "1.12.9"
+
+[[deps.OpenBLAS_jll]]
+deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"]
+uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
+version = "0.3.20+0"
+
+[[deps.OpenLibm_jll]]
+deps = ["Artifacts", "Libdl"]
+uuid = "05823500-19ac-5b8b-9628-191a04bc5112"
+version = "0.8.1+0"
+
+[[deps.OpenMPI_jll]]
+deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "MPIPreferences", "TOML"]
+git-tree-sha1 = "f3080f4212a8ba2ceb10a34b938601b862094314"
+uuid = "fe0851c0-eecd-5654-98d4-656369965a5c"
+version = "4.1.5+0"
+
+[[deps.OpenSSL_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
+git-tree-sha1 = "9ff31d101d987eb9d66bd8b176ac7c277beccd09"
+uuid = "458c3c95-2e84-50aa-8efc-19380b2a3a95"
+version = "1.1.20+0"
+
+[[deps.OpenSpecFun_jll]]
+deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"]
+git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1"
+uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
+version = "0.5.5+0"
+
+[[deps.OrderedCollections]]
+git-tree-sha1 = "d321bf2de576bf25ec4d3e4360faca399afca282"
+uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
+version = "1.6.0"
+
+[[deps.Parsers]]
+deps = ["Dates", "SnoopPrecompile"]
+git-tree-sha1 = "478ac6c952fddd4399e71d4779797c538d0ff2bf"
+uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
+version = "2.5.8"
+
+[[deps.PencilArrays]]
+deps = ["Adapt", "JSON3", "LinearAlgebra", "MPI", "OffsetArrays", "Random", "Reexport", "Requires", "StaticArrayInterface", "StaticArrays", "StaticPermutations", "Strided", "TimerOutputs", "VersionParsing"]
+git-tree-sha1 = "0c6ebb4777158b8662288fb4fca255e404adc94b"
+uuid = "0e08944d-e94e-41b1-9406-dcf66b6a9d2e"
+version = "0.17.10"
+
+[[deps.PencilFFTs]]
+deps = ["AbstractFFTs", "FFTW", "LinearAlgebra", "MPI", "PencilArrays", "Reexport", "TimerOutputs"]
+git-tree-sha1 = "602dc6232e4c2747035dd39a0e6569fccb9e9337"
+uuid = "4a48f351-57a6-4416-9ec4-c37015456aae"
+version = "0.14.3"
+
+[[deps.Pkg]]
+deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
+uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
+version = "1.8.0"
+
+[[deps.Preferences]]
+deps = ["TOML"]
+git-tree-sha1 = "47e5f437cc0e7ef2ce8406ce1e7e24d44915f88d"
+uuid = "21216c6a-2e73-6563-6e65-726566657250"
+version = "1.3.0"
+
+[[deps.Printf]]
+deps = ["Unicode"]
+uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
+
+[[deps.ProgressBars]]
+deps = ["Printf"]
+git-tree-sha1 = "9d84c8646109eb8bc7a006d59b157c64d5155c81"
+uuid = "49802e3a-d2f1-5c88-81d8-b72133a6f568"
+version = "1.5.0"
+
+[[deps.Quaternions]]
+deps = ["LinearAlgebra", "Random", "RealDot"]
+git-tree-sha1 = "da095158bdc8eaccb7890f9884048555ab771019"
+uuid = "94ee1d12-ae83-5a48-8b1c-48b8ff168ae0"
+version = "0.7.4"
+
+[[deps.REPL]]
+deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
+uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
+
+[[deps.Random]]
+deps = ["SHA", "Serialization"]
+uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+
+[[deps.Random123]]
+deps = ["Random", "RandomNumbers"]
+git-tree-sha1 = "7a1a306b72cfa60634f03a911405f4e64d1b718b"
+uuid = "74087812-796a-5b5d-8853-05524746bad3"
+version = "1.6.0"
+
+[[deps.RandomNumbers]]
+deps = ["Random", "Requires"]
+git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111"
+uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143"
+version = "1.5.3"
+
+[[deps.RealDot]]
+deps = ["LinearAlgebra"]
+git-tree-sha1 = "9f0a1b71baaf7650f4fa8a1d168c7fb6ee41f0c9"
+uuid = "c1ae055f-0cd5-4b69-90a6-9a35b1a98df9"
+version = "0.1.0"
+
+[[deps.RecipesBase]]
+deps = ["SnoopPrecompile"]
+git-tree-sha1 = "261dddd3b862bd2c940cf6ca4d1c8fe593e457c8"
+uuid = "3cdcf5f2-1ef4-517c-9805-6587b60abb01"
+version = "1.3.3"
+
+[[deps.Reexport]]
+git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b"
+uuid = "189a3867-3050-52da-a836-e630ba90ab69"
+version = "1.2.2"
+
+[[deps.Requires]]
+deps = ["UUIDs"]
+git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7"
+uuid = "ae029012-a4dd-5104-9daa-d747884805df"
+version = "1.3.0"
+
+[[deps.Rotations]]
+deps = ["LinearAlgebra", "Quaternions", "Random", "StaticArrays", "Statistics"]
+git-tree-sha1 = "72a6abdcd088764878b473102df7c09bbc0548de"
+uuid = "6038ab10-8711-5258-84ad-4b1120ba62dc"
+version = "1.4.0"
+
+[[deps.SHA]]
+uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
+version = "0.7.0"
+
+[[deps.SeawaterPolynomials]]
+git-tree-sha1 = "958ba75b90c7c8a117d041d33184134201cf8c0f"
+uuid = "d496a93d-167e-4197-9f49-d3af4ff8fe40"
+version = "0.3.2"
+
+[[deps.Serialization]]
+uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
+
+[[deps.SnoopPrecompile]]
+deps = ["Preferences"]
+git-tree-sha1 = "e760a70afdcd461cf01a575947738d359234665c"
+uuid = "66db9d55-30c0-4569-8b51-7e840670fc0c"
+version = "1.0.3"
+
+[[deps.Sockets]]
+uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
+
+[[deps.SparseArrays]]
+deps = ["LinearAlgebra", "Random"]
+uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
+
+[[deps.SpecialFunctions]]
+deps = ["ChainRulesCore", "IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"]
+git-tree-sha1 = "ef28127915f4229c971eb43f3fc075dd3fe91880"
+uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
+version = "2.2.0"
+
+[[deps.Static]]
+deps = ["IfElse"]
+git-tree-sha1 = "08be5ee09a7632c32695d954a602df96a877bf0d"
+uuid = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
+version = "0.8.6"
+
+[[deps.StaticArrayInterface]]
+deps = ["ArrayInterface", "Compat", "IfElse", "LinearAlgebra", "Requires", "SnoopPrecompile", "SparseArrays", "Static", "SuiteSparse"]
+git-tree-sha1 = "fd5f417fd7e103c121b0a0b4a6902f03991111f4"
+uuid = "0d7ed370-da01-4f52-bd93-41d350b8b718"
+version = "1.3.0"
+
+[[deps.StaticArrays]]
+deps = ["LinearAlgebra", "Random", "StaticArraysCore", "Statistics"]
+git-tree-sha1 = "b8d897fe7fa688e93aef573711cb207c08c9e11e"
+uuid = "90137ffa-7385-5640-81b9-e52037218182"
+version = "1.5.19"
+
+[[deps.StaticArraysCore]]
+git-tree-sha1 = "6b7ba252635a5eff6a0b0664a41ee140a1c9e72a"
+uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
+version = "1.4.0"
+
+[[deps.StaticPermutations]]
+git-tree-sha1 = "193c3daa18ff3e55c1dae66acb6a762c4a3bdb0b"
+uuid = "15972242-4b8f-49a0-b8a1-9ac0e7a1a45d"
+version = "0.3.0"
+
+[[deps.Statistics]]
+deps = ["LinearAlgebra", "SparseArrays"]
+uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+
+[[deps.Strided]]
+deps = ["LinearAlgebra", "TupleTools"]
+git-tree-sha1 = "a7a664c91104329c88222aa20264e1a05b6ad138"
+uuid = "5e0ebb24-38b0-5f93-81fe-25c709ecae67"
+version = "1.2.3"
+
+[[deps.StructArrays]]
+deps = ["Adapt", "DataAPI", "GPUArraysCore", "StaticArraysCore", "Tables"]
+git-tree-sha1 = "521a0e828e98bb69042fec1809c1b5a680eb7389"
+uuid = "09ab397b-f2b6-538f-b94a-2f83cf4a842a"
+version = "0.6.15"
+
+[[deps.StructTypes]]
+deps = ["Dates", "UUIDs"]
+git-tree-sha1 = "ca4bccb03acf9faaf4137a9abc1881ed1841aa70"
+uuid = "856f2bd8-1eba-4b0a-8007-ebc267875bd4"
+version = "1.10.0"
+
+[[deps.SuiteSparse]]
+deps = ["Libdl", "LinearAlgebra", "Serialization", "SparseArrays"]
+uuid = "4607b0f0-06f3-5cda-b6b1-a6196a1729e9"
+
+[[deps.TOML]]
+deps = ["Dates"]
+uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
+version = "1.0.0"
+
+[[deps.TableTraits]]
+deps = ["IteratorInterfaceExtensions"]
+git-tree-sha1 = "c06b2f539df1c6efa794486abfb6ed2022561a39"
+uuid = "3783bdb8-4a98-5b6b-af9a-565f29a5fe9c"
+version = "1.0.1"
+
+[[deps.Tables]]
+deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "OrderedCollections", "TableTraits", "Test"]
+git-tree-sha1 = "1544b926975372da01227b382066ab70e574a3ec"
+uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
+version = "1.10.1"
+
+[[deps.Tar]]
+deps = ["ArgTools", "SHA"]
+uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
+version = "1.10.1"
+
+[[deps.TaylorSeries]]
+deps = ["LinearAlgebra", "Markdown", "Requires", "SparseArrays"]
+git-tree-sha1 = "87baeec9ad6273ed8040a93fbbbaa039fa955f1f"
+uuid = "6aa5eb33-94cf-58f4-a9d0-e4b2c4fc25ea"
+version = "0.12.2"
+
+[[deps.Test]]
+deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
+uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
+[[deps.TimerOutputs]]
+deps = ["ExprTools", "Printf"]
+git-tree-sha1 = "f2fd3f288dfc6f507b0c3a2eb3bac009251e548b"
+uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
+version = "0.5.22"
+
+[[deps.TranscodingStreams]]
+deps = ["Random", "Test"]
+git-tree-sha1 = "0b829474fed270a4b0ab07117dce9b9a2fa7581a"
+uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
+version = "0.9.12"
+
+[[deps.TupleTools]]
+git-tree-sha1 = "3c712976c47707ff893cf6ba4354aa14db1d8938"
+uuid = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6"
+version = "1.3.0"
+
+[[deps.UUIDs]]
+deps = ["Random", "SHA"]
+uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
+
+[[deps.Unicode]]
+uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
+
+[[deps.UnsafeAtomics]]
+git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278"
+uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f"
+version = "0.2.1"
+
+[[deps.UnsafeAtomicsLLVM]]
+deps = ["LLVM", "UnsafeAtomics"]
+git-tree-sha1 = "ead6292c02aab389cb29fe64cc9375765ab1e219"
+uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
+version = "0.1.1"
+
+[[deps.VersionParsing]]
+git-tree-sha1 = "58d6e80b4ee071f5efd07fda82cb9fbe17200868"
+uuid = "81def892-9a0e-5fdd-b105-ffc91e053289"
+version = "1.3.0"
+
+[[deps.XML2_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Pkg", "Zlib_jll"]
+git-tree-sha1 = "93c41695bc1c08c46c5899f4fe06d6ead504bb73"
+uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a"
+version = "2.10.3+0"
+
+[[deps.Zlib_jll]]
+deps = ["Libdl"]
+uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
+version = "1.2.12+3"
+
+[[deps.libblastrampoline_jll]]
+deps = ["Artifacts", "Libdl", "OpenBLAS_jll"]
+uuid = "8e850b90-86db-534c-a0d3-1478176c7d93"
+version = "5.1.1+0"
+
+[[deps.nghttp2_jll]]
+deps = ["Artifacts", "Libdl"]
+uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
+version = "1.48.0+0"
+
+[[deps.p7zip_jll]]
+deps = ["Artifacts", "Libdl"]
+uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
+version = "17.4.0+0"

From 30065f2a93601431d2764014b3ee17b6b43e1cb0 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Sun, 30 Apr 2023 07:33:53 -0800
Subject: [PATCH 240/530] Delete function tridiagonal solver tests

---
 test/test_batched_tridiagonal_solver.jl | 72 +------------------------
 1 file changed, 1 insertion(+), 71 deletions(-)

diff --git a/test/test_batched_tridiagonal_solver.jl b/test/test_batched_tridiagonal_solver.jl
index f78b4a9dad..d8ad744ca5 100644
--- a/test/test_batched_tridiagonal_solver.jl
+++ b/test/test_batched_tridiagonal_solver.jl
@@ -32,39 +32,6 @@ function can_solve_single_tridiagonal_system(arch, N)
     return Array(ϕ[:]) ≈ ϕ_correct
 end
 
-function can_solve_single_tridiagonal_system_with_functions(arch, N)
-    ArrayType = array_type(arch)
-
-    grid = RectilinearGrid(arch, size=(1, 1, N), extent=(1, 1, 1))
-
-    a = rand(N-1)
-    c = rand(N-1)
-
-    @inline b(i, j, k, grid) = 3 .+ cos(2π * grid.zᵃᵃᶜ[k])  # +3 to ensure diagonal dominance.
-    @inline f(i, j, k, grid) = sin(2π * grid.zᵃᵃᶜ[k])
-
-    bₐ = [b(1, 1, k, grid) for k in 1:N]
-    fₐ = [f(1, 1, k, grid) for k in 1:N]
-
-    # Solve the system with backslash on the CPU to avoid scalar operations on the GPU.
-    M = Tridiagonal(a, bₐ, c)
-    ϕ_correct = M \ fₐ
-
-    # Convert to CuArray if needed.
-    a, c = ArrayType.((a, c))
-
-    ϕ = reshape(zeros(N), (1, 1, N)) |> ArrayType
-
-    btsolver = BatchedTridiagonalSolver(grid;
-                                        lower_diagonal = a,
-                                        diagonal = b,
-                                        upper_diagonal = c)
-
-    solve!(ϕ, btsolver, f)
-
-    return Array(ϕ[:]) ≈ ϕ_correct
-end
-
 function can_solve_batched_tridiagonal_system_with_3D_RHS(arch, Nx, Ny, Nz)
     ArrayType = array_type(arch)
 
@@ -97,42 +64,6 @@ function can_solve_batched_tridiagonal_system_with_3D_RHS(arch, Nx, Ny, Nz)
     return Array(ϕ) ≈ ϕ_correct
 end
 
-function can_solve_batched_tridiagonal_system_with_3D_functions(arch, Nx, Ny, Nz)
-    ArrayType = array_type(arch)
-
-    grid = RectilinearGrid(arch, size=(Nx, Ny, Nz), extent=(1, 1, 1))
-
-    a = rand(Nz-1)
-    c = rand(Nz-1)
-
-    @inline b(i, j, k, grid) = 3 + grid.xᶜᵃᵃ[i] * grid.yᵃᶜᵃ[j] * cos(2π * grid.zᵃᵃᶜ[k])
-    @inline f(i, j, k, grid) = (grid.xᶜᵃᵃ[i] + grid.yᵃᶜᵃ[j]) * sin(2π * grid.zᵃᵃᶜ[k])
-
-    ϕ_correct = zeros(Nx, Ny, Nz)
-
-    # Solve the system with backslash on the CPU to avoid scalar operations on the GPU.
-    for i = 1:Nx, j = 1:Ny
-        bₐ = [b(i, j, k, grid) for k in 1:Nz]
-        M = Tridiagonal(a, bₐ, c)
-
-        fₐ = [f(i, j, k, grid) for k in 1:Nz]
-        ϕ_correct[i, j, :] .= M \ fₐ
-    end
-
-    # Convert to CuArray if needed.
-    a, c = ArrayType.([a, c])
-
-    btsolver = BatchedTridiagonalSolver(grid;
-                                        lower_diagonal = a,
-                                        diagonal = b,
-                                        upper_diagonal = c)
-
-    ϕ = zeros(Nx, Ny, Nz) |> ArrayType
-    solve!(ϕ, btsolver, f)
-
-    return Array(ϕ) ≈ ϕ_correct
-end
-
 @testset "Batched tridiagonal solvers" begin
     @info "Testing BatchedTridiagonalSolver..."
 
@@ -140,13 +71,12 @@ end
         @testset "Batched tridiagonal solver [$arch]" begin
             for Nz in [8, 11, 18]
                 @test can_solve_single_tridiagonal_system(arch, Nz)
-                @test can_solve_single_tridiagonal_system_with_functions(arch, Nz)
             end
 
             for Nx in [3, 8], Ny in [5, 16], Nz in [8, 11]
                 @test can_solve_batched_tridiagonal_system_with_3D_RHS(arch, Nx, Ny, Nz)
-                @test can_solve_batched_tridiagonal_system_with_3D_functions(arch, Nx, Ny, Nz)
             end
         end
     end
 end
+

From 6788fb83a7200e00fc3b58b4d70332dac0f6bc9a Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Mon, 1 May 2023 12:27:00 -0800
Subject: [PATCH 241/530] New numerics, new defaults

---
 .../CATKEVerticalDiffusivities.jl             |  4 +--
 .../mixing_length.jl                          | 28 +++++++++----------
 .../turbulent_kinetic_energy_equation.jl      | 12 ++++----
 3 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
index 73ce6d08eb..769d90daaa 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
@@ -115,8 +115,8 @@ function CATKEVerticalDiffusivity(time_discretization::TD = VerticallyImplicitTi
                                   mixing_length = MixingLength(),
                                   turbulent_kinetic_energy_equation = TurbulentKineticEnergyEquation(),
                                   maximum_diffusivity = Inf,
-                                  minimum_turbulent_kinetic_energy = 0,
-                                  minimum_convective_buoyancy_flux = 1e-11,
+                                  minimum_turbulent_kinetic_energy = 1e-6,
+                                  minimum_convective_buoyancy_flux = 1e-8,
                                   negative_turbulent_kinetic_energy_damping_time_scale = 1minute,
                                   warning = true) where TD
 
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
index c3f8515c1c..9a810419d1 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
@@ -13,21 +13,21 @@ using ..TurbulenceClosures:
 Contains mixing length parameters for CATKE vertical diffusivity.
 """
 Base.@kwdef struct MixingLength{FT}
-    Cᴺ   :: FT = 0.37
-    Cᵇ   :: FT = 1.0
-    Cᶜc  :: FT = 4.8
-    Cᶜe  :: FT = 1.1
-    Cᵉc  :: FT = 0.049
+    Cᴺ   :: FT = 0.41
+    Cᵇ   :: FT = 0.1
+    Cᶜc  :: FT = 1.5
+    Cᶜe  :: FT = 1.2
+    Cᵉc  :: FT = 0.085
     Cᵉe  :: FT = 0.0
-    Cˢᶜ  :: FT = 0.29
-    C⁻u  :: FT = 0.36
-    C⁺u  :: FT = 0.24
-    C⁻c  :: FT = 0.41
-    C⁺c  :: FT = 0.12
-    C⁻e  :: FT = 6.7
-    C⁺e  :: FT = 5.4
-    CRiʷ :: FT = 0.011
-    CRiᶜ :: FT = 0.76
+    Cˢᶜ  :: FT = 0.14
+    C⁻u  :: FT = 0.46
+    C⁺u  :: FT = 0.21
+    C⁻c  :: FT = 0.49
+    C⁺c  :: FT = 0.11
+    C⁻e  :: FT = 4.5
+    C⁺e  :: FT = 1.4
+    CRiʷ :: FT = 0.45
+    CRiᶜ :: FT = 0.47
 end
 
 #####
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
index a17cae0642..63d5a6a3db 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
@@ -5,13 +5,13 @@ Parameters for the evolution of oceanic turbulent kinetic energy at the O(1 m) s
 isotropic turbulence and diapycnal mixing.
 """
 Base.@kwdef struct TurbulentKineticEnergyEquation{FT}
-    C⁻D   :: FT = 4.4
-    C⁺D   :: FT = 3.3
-    CᶜD   :: FT = 0.23
+    C⁻D   :: FT = 2.3
+    C⁺D   :: FT = 6.7
+    CᶜD   :: FT = 0.88
     CᵉD   :: FT = 0.0
-    Cᵂu★  :: FT = 1.8
-    CᵂwΔ  :: FT = 12.0
-    Cᵂϵ   :: FT = 20.0
+    Cᵂu★  :: FT = 1.1
+    CᵂwΔ  :: FT = 4.0
+    Cᵂϵ   :: FT = 1.0
 end
 
 #####

From 01579c4e02f86ae221526b818939f5d17704384e Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Mon, 1 May 2023 16:35:25 -0800
Subject: [PATCH 242/530] Implement time-smoothing of the buoyancy flux

---
 .../CATKEVerticalDiffusivities.jl             | 58 ++++++++++++++-----
 .../mixing_length.jl                          | 18 +++---
 .../turbulent_kinetic_energy_equation.jl      | 17 ++++--
 3 files changed, 64 insertions(+), 29 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
index 769d90daaa..d0fbd317b2 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
@@ -197,13 +197,15 @@ function DiffusivityFields(grid, tracer_names, bcs, closure::FlavorOfCATKE)
     κᵘ = ZFaceField(grid, boundary_conditions=bcs.κᵘ)
     κᶜ = ZFaceField(grid, boundary_conditions=bcs.κᶜ)
     κᵉ = ZFaceField(grid, boundary_conditions=bcs.κᵉ)
-    Lᵉ = CenterField(grid) #, boundary_conditions=nothing)
+    Lᵉ = CenterField(grid)
+    Qᵇ = Field{Center, Center, Nothing}(grid)
+    previous_compute_time = Ref(zero(grid))
 
     # Secret tuple for getting tracer diffusivities with tuple[tracer_index]
     _tupled_tracer_diffusivities         = NamedTuple(name => name === :e ? κᵉ : κᶜ          for name in tracer_names)
     _tupled_implicit_linear_coefficients = NamedTuple(name => name === :e ? Lᵉ : ZeroField() for name in tracer_names)
 
-    return (; κᵘ, κᶜ, κᵉ, Lᵉ, _tupled_tracer_diffusivities, _tupled_implicit_linear_coefficients)
+    return (; κᵘ, κᶜ, κᵉ, Lᵉ, Qᵇ, previous_compute_time, _tupled_tracer_diffusivities, _tupled_implicit_linear_coefficients)
 end        
 
 const c = Center()
@@ -223,26 +225,52 @@ function calculate_diffusivities!(diffusivities, closure::FlavorOfCATKE, model)
     buoyancy = model.buoyancy
     clock = model.clock
     top_tracer_bcs = NamedTuple(c => tracers[c].boundary_conditions.top for c in propertynames(tracers))
+    Δt = model.clock.time - diffusivities.previous_compute_time[]
+    diffusivities.previous_compute_time[] = model.clock.time
+
+    launch!(arch, grid, :xy,
+            compute_average_surface_buoyancy_flux!,
+            diffusivities.Qᵇ, grid, closure, velocities, tracers, buoyancy, top_tracer_bcs, clock, Δt)
 
     launch!(arch, grid, :xyz,
-            calculate_CATKE_diffusivities!,
-            diffusivities, grid, closure, velocities, tracers, buoyancy, clock, top_tracer_bcs)
+            compute_CATKE_diffusivities!,
+            diffusivities, grid, closure, velocities, tracers, buoyancy)
 
     return nothing
 end
 
-@kernel function calculate_CATKE_diffusivities!(diffusivities, grid, closure::FlavorOfCATKE, velocities, tracers, buoyancy, clock, top_tracer_bcs)
+@kernel function compute_average_surface_buoyancy_flux!(Qᵇ, grid, closure, velocities, tracers, buoyancy, top_tracer_bcs, clock, Δt)
+    i, j = @index(Global, NTuple)
+
+    closure = getclosure(i, j, closure)
+
+    Qᵇ★ = top_buoyancy_flux(i, j, grid, buoyancy, top_tracer_bcs, clock, merge(velocities, tracers))
+
+    k = grid.Nz
+    ℓᴰ = dissipation_length_scaleᶜᶜᶜ(i, j, k, grid, closure, velocities, tracers, buoyancy, Qᵇ)
+
+    Qᵇᵋ = closure.minimum_convective_buoyancy_flux
+    Qᵇᵢⱼ = @inbounds Qᵇ[i, j, 1]
+    Qᵇ⁺ = max(Qᵇᵋ, Qᵇᵢⱼ, Qᵇ★) # selects fastest (dominant) time-scale
+    t★ = (ℓᴰ^2 / Qᵇ⁺)^(1/3)
+    ϵ = Δt / t★
+
+    @inbounds Qᵇ[i, j, 1] = (Qᵇᵢⱼ + ϵ * Qᵇ★) / (1 + ϵ)
+end
+
+@kernel function compute_CATKE_diffusivities!(diffusivities, grid, closure::FlavorOfCATKE, velocities, tracers, buoyancy)
     i, j, k = @index(Global, NTuple)
 
     # Ensure this works with "ensembles" of closures, in addition to ordinary single closures
     closure_ij = getclosure(i, j, closure)
 
     max_K = closure_ij.maximum_diffusivity
+    Qᵇ = diffusivities.Qᵇ
 
     @inbounds begin
-        κᵘ★ = κuᶜᶜᶠ(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, clock, top_tracer_bcs)
-        κᶜ★ = κcᶜᶜᶠ(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, clock, top_tracer_bcs)
-        κᵉ★ = κeᶜᶜᶠ(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, clock, top_tracer_bcs)
+        κᵘ★ = κuᶜᶜᶠ(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, Qᵇ)
+        κᶜ★ = κcᶜᶜᶠ(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, Qᵇ)
+        κᵉ★ = κeᶜᶜᶠ(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, Qᵇ)
 
         on_periphery = peripheral_node(i, j, k, grid, c, c, f)
         within_inactive = inactive_node(i, j, k, grid, c, c, f)
@@ -270,7 +298,7 @@ end
         Q_e = - Cᵂϵ * turbulent_velocityᶜᶜᶜ(i, j, k, grid, closure_ij, tracers.e) / Δz * on_bottom
 
         # Implicit TKE dissipation
-        ϵ_e = implicit_dissipation_coefficient(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, clock, top_tracer_bcs)
+        ϵ_e = implicit_dissipation_coefficient(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, Qᵇ)
         
         diffusivities.Lᵉ[i, j, k] = - wb_e + ϵ_e + Q_e
     end
@@ -287,21 +315,21 @@ end
     return sqrt(max(eᵐⁱⁿ, eᵢ))
 end
 
-@inline function κuᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, clock, top_tracer_bcs)
+@inline function κuᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, surface_buoyancy_flux)
     u★ = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocityᶜᶜᶜ, closure, tracers.e)
-    ℓu = momentum_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, clock, top_tracer_bcs)
+    ℓu = momentum_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, surface_buoyancy_flux)
     return ℓu * u★
 end
 
-@inline function κcᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, clock, top_tracer_bcs)
+@inline function κcᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, surface_buoyancy_flux)
     u★ = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocityᶜᶜᶜ, closure, tracers.e)
-    ℓc = tracer_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, clock, top_tracer_bcs)
+    ℓc = tracer_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, surface_buoyancy_flux)
     return ℓc * u★
 end
 
-@inline function κeᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, clock, top_tracer_bcs)
+@inline function κeᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, surface_buoyancy_flux)
     u★ = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocityᶜᶜᶜ, closure, tracers.e)
-    ℓe = TKE_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, clock, top_tracer_bcs)
+    ℓe = TKE_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, surface_buoyancy_flux)
     return ℓe * u★
 end
 
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
index 9a810419d1..7c6c2cac4b 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
@@ -100,12 +100,12 @@ end
 @inline squared_tkeᶜᶜᶜ(i, j, k, grid, closure, e) = turbulent_velocityᶜᶜᶜ(i, j, k, grid, closure, e)^2
 
 @inline function convective_length_scaleᶜᶜᶠ(i, j, k, grid, closure, Cᶜ::Number, Cᵉ::Number, Cˢᶜ::Number,
-                                            velocities, tracers, buoyancy, clock, tracer_bcs)
+                                            velocities, tracers, buoyancy, surface_buoyancy_flux)
 
     u, v, w = velocities
 
     Qᵇᵋ      = closure.minimum_convective_buoyancy_flux
-    Qᵇ       = top_buoyancy_flux(i, j, grid, buoyancy, tracer_bcs, clock, merge(velocities, tracers))
+    Qᵇ       = @inbounds surface_buoyancy_flux[i, j, 1]
     w★       = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocityᶜᶜᶜ, closure, tracers.e)
     w★²      = ℑzᵃᵃᶠ(i, j, k, grid, squared_tkeᶜᶜᶜ, closure, tracers.e)
     w★³      = ℑzᵃᵃᶠ(i, j, k, grid, three_halves_tkeᶜᶜᶜ, closure, tracers.e)
@@ -143,12 +143,12 @@ end
 end
 
 @inline function convective_length_scaleᶜᶜᶜ(i, j, k, grid, closure, Cᶜ::Number, Cᵉ::Number, Cˢᶜ::Number,
-                                            velocities, tracers, buoyancy, clock, tracer_bcs)
+                                            velocities, tracers, buoyancy, surface_buoyancy_flux)
 
     u, v, w = velocities
 
     Qᵇᵋ      = closure.minimum_convective_buoyancy_flux
-    Qᵇ       = top_buoyancy_flux(i, j, grid, buoyancy, tracer_bcs, clock, merge(velocities, tracers))
+    Qᵇ       = @inbounds surface_buoyancy_flux[i, j, 1]
     w★       = turbulent_velocityᶜᶜᶜ(i, j, k, grid, closure, tracers.e)
     w★²      = turbulent_velocityᶜᶜᶜ(i, j, k, grid, closure, tracers.e)^2
     w★³      = turbulent_velocityᶜᶜᶜ(i, j, k, grid, closure, tracers.e)^3
@@ -196,7 +196,7 @@ end
     return scale(Ri, C⁻, C⁺, CRiᶜ, CRiʷ)
 end
 
-@inline function momentum_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, clock, tracer_bcs)
+@inline function momentum_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, surface_buoyancy_flux)
     C⁻ = closure.mixing_length.C⁻u
     C⁺ = closure.mixing_length.C⁺u
     σ = stability_functionᶜᶜᶠ(i, j, k, grid, closure, C⁻, C⁺, velocities, tracers, buoyancy)
@@ -209,11 +209,11 @@ end
     return min(H, ℓ★)
 end
 
-@inline function tracer_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, clock, tracer_bcs)
+@inline function tracer_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, surface_buoyancy_flux)
     Cᶜ  = closure.mixing_length.Cᶜc
     Cᵉ  = closure.mixing_length.Cᵉc
     Cˢᶜ = closure.mixing_length.Cˢᶜ
-    ℓʰ = convective_length_scaleᶜᶜᶠ(i, j, k, grid, closure, Cᶜ, Cᵉ, Cˢᶜ, velocities, tracers, buoyancy, clock, tracer_bcs)
+    ℓʰ = convective_length_scaleᶜᶜᶠ(i, j, k, grid, closure, Cᶜ, Cᵉ, Cˢᶜ, velocities, tracers, buoyancy, surface_buoyancy_flux)
 
     C⁻ = closure.mixing_length.C⁻c
     C⁺ = closure.mixing_length.C⁺c
@@ -228,11 +228,11 @@ end
     return min(H, max(ℓ★, ℓʰ))
 end
 
-@inline function TKE_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, clock, tracer_bcs)
+@inline function TKE_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, surface_buoyancy_flux)
     Cᶜ  = closure.mixing_length.Cᶜe
     Cᵉ  = closure.mixing_length.Cᵉe
     Cˢᶜ = closure.mixing_length.Cˢᶜ
-    ℓʰ = convective_length_scaleᶜᶜᶠ(i, j, k, grid, closure, Cᶜ, Cᵉ, Cˢᶜ, velocities, tracers, buoyancy, clock, tracer_bcs)
+    ℓʰ = convective_length_scaleᶜᶜᶠ(i, j, k, grid, closure, Cᶜ, Cᵉ, Cˢᶜ, velocities, tracers, buoyancy, surface_buoyancy_flux)
 
     C⁻ = closure.mixing_length.C⁻e
     C⁺ = closure.mixing_length.C⁺e
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
index 63d5a6a3db..8af42c9342 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
@@ -55,16 +55,14 @@ end
 
 @inline dissipation(i, j, k, grid, closure::FlavorOfCATKE{<:VITD}, args...) = zero(grid)
 
-@inline function implicit_dissipation_coefficient(i, j, k, grid, closure::FlavorOfCATKE,
-                                                  velocities, tracers, buoyancy, clock, tracer_bcs)
-    e = tracers.e
-    FT = eltype(grid)
+@inline function dissipation_length_scaleᶜᶜᶜ(i, j, k, grid, closure::FlavorOfCATKE,
+                                             velocities, tracers, buoyancy, surface_buoyancy_flux)
 
     # Convective dissipation length
     Cᶜ = closure.turbulent_kinetic_energy_equation.CᶜD
     Cᵉ = closure.turbulent_kinetic_energy_equation.CᵉD
     Cˢᶜ = closure.mixing_length.Cˢᶜ
-    ℓʰ = convective_length_scaleᶜᶜᶜ(i, j, k, grid, closure, Cᶜ, Cᵉ, Cˢᶜ, velocities, tracers, buoyancy, clock, tracer_bcs)
+    ℓʰ = convective_length_scaleᶜᶜᶜ(i, j, k, grid, closure, Cᶜ, Cᵉ, Cˢᶜ, velocities, tracers, buoyancy, surface_buoyancy_flux)
 
     # "Stable" dissipation length
     C⁻D = closure.turbulent_kinetic_energy_equation.C⁻D
@@ -82,6 +80,15 @@ end
     H = total_depthᶜᶜᵃ(i, j, grid)
     ℓᴰ = min(H, ℓ★ + ℓʰ)
 
+    return ℓᴰ
+end
+
+@inline function implicit_dissipation_coefficient(i, j, k, grid, closure::FlavorOfCATKE,
+                                                  velocities, tracers, buoyancy, surface_buoyancy_flux)
+
+    ℓᴰ = dissipation_length_scaleᶜᶜᶜ(i, j, k, grid, closure, velocities, tracers, buoyancy, surface_buoyancy_flux)
+    e = tracers.e
+    FT = eltype(grid)
     eᵢ = @inbounds e[i, j, k]
     
     # Note:

From e4595e56c4bf2c1e49084a7edc81c4a7cc6f3434 Mon Sep 17 00:00:00 2001
From: "Navid C. Constantinou" <navidcy@users.noreply.github.com>
Date: Tue, 2 May 2023 17:20:27 +1000
Subject: [PATCH 243/530] Update GridFittedBottom summary in doctest

---
 docs/src/model_setup/boundary_conditions.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/src/model_setup/boundary_conditions.md b/docs/src/model_setup/boundary_conditions.md
index 0d0b6d5d33..dbb2bf609f 100644
--- a/docs/src/model_setup/boundary_conditions.md
+++ b/docs/src/model_setup/boundary_conditions.md
@@ -442,7 +442,7 @@ hill (generic function with 1 method)
 
 julia> grid = ImmersedBoundaryGrid(underlying_grid, GridFittedBottom(hill))
 32×32×16 ImmersedBoundaryGrid{Float64, Periodic, Periodic, Bounded} on CPU with 3×3×3 halo:
-├── immersed_boundary: GridFittedBottom(min(h)=1.00e-01, max(h)=1.98e-01)
+├── immersed_boundary: GridFittedBottom(mean(z)=0.106195, min(z)=0.1, max(z)=0.198258)
 ├── underlying_grid: 32×32×16 RectilinearGrid{Float64, Periodic, Periodic, Bounded} on CPU with 3×3×3 halo
 ├── Periodic x ∈ [-3.0, 3.0) regularly spaced with Δx=0.1875
 ├── Periodic y ∈ [-3.0, 3.0) regularly spaced with Δy=0.1875

From 1bdb0bbda8c95dc9674544b202b9135c6dfebf2a Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Tue, 2 May 2023 07:45:29 -0600
Subject: [PATCH 244/530] Fix sign error in explicit dissipation

---
 .../turbulent_kinetic_energy_equation.jl                        | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
index 8af42c9342..a95001f75b 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
@@ -108,7 +108,7 @@ end
 @inline function dissipation(i, j, k, grid, closure::FlavorOfCATKE, velocities, tracers, args...)
     eᵢ = @inbounds tracers.e[i, j, k]
     L = implicit_dissipation_coefficient(i, j, k, grid, closure, velocities, tracers, args...)
-    return L * eᵢ
+    return - L * eᵢ
 end
 
 @inline implicit_dissipation_coefficient(i, j, k, grid, closure::FlavorOfCATKE, args...) = zero(grid)

From 022516777f622cf5fae2de04624ebb877b130732 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Tue, 2 May 2023 07:49:56 -0600
Subject: [PATCH 245/530] Fix dissipation call signature and better name for
 dissipation rate

---
 ...ic_free_surface_tendency_kernel_functions.jl |  2 +-
 .../CATKEVerticalDiffusivities.jl               |  4 ++--
 .../turbulent_kinetic_energy_equation.jl        | 17 +++++++++--------
 3 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_tendency_kernel_functions.jl b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_tendency_kernel_functions.jl
index dac730f343..b94a912edf 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_tendency_kernel_functions.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_tendency_kernel_functions.jl
@@ -178,6 +178,6 @@ end
              - immersed_∇_dot_qᶜ(i, j, k, grid, e, e_immersed_bc, closure, diffusivities, val_tracer_index, clock, model_fields)
              + shear_production(i, j, k, grid, closure, velocities, diffusivities)
              + buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities)
-             - dissipation(i, j, k, grid, closure, velocities, tracers, buoyancy, clock, top_tracer_bcs)
+             - dissipation(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities)
              + forcing(i, j, k, grid, clock, model_fields))
 end
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
index d0fbd317b2..61af051670 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
@@ -298,9 +298,9 @@ end
         Q_e = - Cᵂϵ * turbulent_velocityᶜᶜᶜ(i, j, k, grid, closure_ij, tracers.e) / Δz * on_bottom
 
         # Implicit TKE dissipation
-        ϵ_e = implicit_dissipation_coefficient(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, Qᵇ)
+        ω_e = dissipation_rate(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, diffusivities)
         
-        diffusivities.Lᵉ[i, j, k] = - wb_e + ϵ_e + Q_e
+        diffusivities.Lᵉ[i, j, k] = - wb_e + ω_e + Q_e
     end
 end
 
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
index a95001f75b..8cabff84fc 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
@@ -56,13 +56,14 @@ end
 @inline dissipation(i, j, k, grid, closure::FlavorOfCATKE{<:VITD}, args...) = zero(grid)
 
 @inline function dissipation_length_scaleᶜᶜᶜ(i, j, k, grid, closure::FlavorOfCATKE,
-                                             velocities, tracers, buoyancy, surface_buoyancy_flux)
+                                             velocities, tracers, buoyancy, diffusivities)
 
     # Convective dissipation length
     Cᶜ = closure.turbulent_kinetic_energy_equation.CᶜD
     Cᵉ = closure.turbulent_kinetic_energy_equation.CᵉD
     Cˢᶜ = closure.mixing_length.Cˢᶜ
-    ℓʰ = convective_length_scaleᶜᶜᶜ(i, j, k, grid, closure, Cᶜ, Cᵉ, Cˢᶜ, velocities, tracers, buoyancy, surface_buoyancy_flux)
+    Qᵇ = diffusivities.Qᵇ
+    ℓʰ = convective_length_scaleᶜᶜᶜ(i, j, k, grid, closure, Cᶜ, Cᵉ, Cˢᶜ, velocities, tracers, buoyancy, Qᵇ)
 
     # "Stable" dissipation length
     C⁻D = closure.turbulent_kinetic_energy_equation.C⁻D
@@ -83,10 +84,10 @@ end
     return ℓᴰ
 end
 
-@inline function implicit_dissipation_coefficient(i, j, k, grid, closure::FlavorOfCATKE,
-                                                  velocities, tracers, buoyancy, surface_buoyancy_flux)
+@inline function dissipation_rate(i, j, k, grid, closure::FlavorOfCATKE,
+                                  velocities, tracers, buoyancy, diffusivities)
 
-    ℓᴰ = dissipation_length_scaleᶜᶜᶜ(i, j, k, grid, closure, velocities, tracers, buoyancy, surface_buoyancy_flux)
+    ℓᴰ = dissipation_length_scaleᶜᶜᶜ(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities)
     e = tracers.e
     FT = eltype(grid)
     eᵢ = @inbounds e[i, j, k]
@@ -107,11 +108,11 @@ end
 # Fallbacks for explicit time discretization
 @inline function dissipation(i, j, k, grid, closure::FlavorOfCATKE, velocities, tracers, args...)
     eᵢ = @inbounds tracers.e[i, j, k]
-    L = implicit_dissipation_coefficient(i, j, k, grid, closure, velocities, tracers, args...)
-    return - L * eᵢ
+    ω = dissipation_rate(i, j, k, grid, closure, velocities, tracers, args...)
+    return - ω * eᵢ
 end
 
-@inline implicit_dissipation_coefficient(i, j, k, grid, closure::FlavorOfCATKE, args...) = zero(grid)
+@inline dissipation_rate(i, j, k, grid, closure::FlavorOfCATKE, args...) = zero(grid)
 
 #####
 ##### For closure tuples...

From 46f4e15216f2ad7be852f83b67fa6759323b457d Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Wed, 3 May 2023 18:04:49 -0400
Subject: [PATCH 246/530] Fix dissipation length scale buoyancy flux argument

---
 .../turbulent_kinetic_energy_equation.jl                  | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
index 8cabff84fc..cb6a0f125d 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
@@ -55,14 +55,14 @@ end
 
 @inline dissipation(i, j, k, grid, closure::FlavorOfCATKE{<:VITD}, args...) = zero(grid)
 
-@inline function dissipation_length_scaleᶜᶜᶜ(i, j, k, grid, closure::FlavorOfCATKE,
-                                             velocities, tracers, buoyancy, diffusivities)
+@inline function dissipation_length_scaleᶜᶜᶜ(i, j, k, grid, closure::FlavorOfCATKE, velocities, tracers,
+                                             buoyancy, surface_buoyancy_flux)
 
     # Convective dissipation length
     Cᶜ = closure.turbulent_kinetic_energy_equation.CᶜD
     Cᵉ = closure.turbulent_kinetic_energy_equation.CᵉD
     Cˢᶜ = closure.mixing_length.Cˢᶜ
-    Qᵇ = diffusivities.Qᵇ
+    Qᵇ = surface_buoyancy_flux
     ℓʰ = convective_length_scaleᶜᶜᶜ(i, j, k, grid, closure, Cᶜ, Cᵉ, Cˢᶜ, velocities, tracers, buoyancy, Qᵇ)
 
     # "Stable" dissipation length
@@ -87,7 +87,7 @@ end
 @inline function dissipation_rate(i, j, k, grid, closure::FlavorOfCATKE,
                                   velocities, tracers, buoyancy, diffusivities)
 
-    ℓᴰ = dissipation_length_scaleᶜᶜᶜ(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities)
+    ℓᴰ = dissipation_length_scaleᶜᶜᶜ(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities.Qᵇ)
     e = tracers.e
     FT = eltype(grid)
     eᵢ = @inbounds e[i, j, k]

From 4b1c69bd25c1d912dc29d91c51655ae3914e5cd3 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Thu, 4 May 2023 22:40:00 -0400
Subject: [PATCH 247/530] Refactor CATKE parameters

---
 ..._free_surface_tendency_kernel_functions.jl |   2 +-
 .../CATKEVerticalDiffusivities.jl             |  75 +++++---
 .../mixing_length.jl                          | 165 +++++++++++-------
 .../turbulent_kinetic_energy_equation.jl      |  86 +++++----
 .../column_windy_convection.jl                |   7 +-
 5 files changed, 206 insertions(+), 129 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_tendency_kernel_functions.jl b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_tendency_kernel_functions.jl
index b94a912edf..8473306733 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_tendency_kernel_functions.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_tendency_kernel_functions.jl
@@ -176,7 +176,7 @@ end
     return ( - div_Uc(i, j, k, grid, advection, velocities, e)
              - ∇_dot_qᶜ(i, j, k, grid, closure, diffusivities, val_tracer_index, e, clock, model_fields, buoyancy)
              - immersed_∇_dot_qᶜ(i, j, k, grid, e, e_immersed_bc, closure, diffusivities, val_tracer_index, clock, model_fields)
-             + shear_production(i, j, k, grid, closure, velocities, diffusivities)
+             + shear_production(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities)
              + buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities)
              - dissipation(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities)
              + forcing(i, j, k, grid, clock, model_fields))
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
index 61af051670..1ab2dc2027 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
@@ -263,8 +263,6 @@ end
 
     # Ensure this works with "ensembles" of closures, in addition to ordinary single closures
     closure_ij = getclosure(i, j, closure)
-
-    max_K = closure_ij.maximum_diffusivity
     Qᵇ = diffusivities.Qᵇ
 
     @inbounds begin
@@ -278,13 +276,13 @@ end
         κᶜ★ = ifelse(on_periphery, zero(grid), ifelse(within_inactive, NaN, κᶜ★))
         κᵉ★ = ifelse(on_periphery, zero(grid), ifelse(within_inactive, NaN, κᵉ★))
 
-        diffusivities.κᵘ[i, j, k] = min(max_K, κᵘ★) 
-        diffusivities.κᶜ[i, j, k] = min(max_K, κᶜ★)
-        diffusivities.κᵉ[i, j, k] = min(max_K, κᵉ★)
+        diffusivities.κᵘ[i, j, k] = κᵘ★
+        diffusivities.κᶜ[i, j, k] = κᶜ★
+        diffusivities.κᵉ[i, j, k] = κᵉ★
 
         # "Patankar trick" for buoyancy production (cf Patankar 1980 or Burchard et al. 2003)
         # If buoyancy flux is a _sink_ of TKE, we treat it implicitly.
-        wb = ℑzᵃᵃᶜ(i, j, k, grid, buoyancy_fluxᶜᶜᶠ, tracers, buoyancy, diffusivities)
+        wb = _buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities)
         eⁱʲᵏ = @inbounds tracers.e[i, j, k]
 
         # See `buoyancy_flux`
@@ -300,7 +298,7 @@ end
         # Implicit TKE dissipation
         ω_e = dissipation_rate(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, diffusivities)
         
-        diffusivities.Lᵉ[i, j, k] = - wb_e + ω_e + Q_e
+        diffusivities.Lᵉ[i, j, k] = - wb_e - ω_e + Q_e
     end
 end
 
@@ -316,21 +314,43 @@ end
 end
 
 @inline function κuᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, surface_buoyancy_flux)
-    u★ = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocityᶜᶜᶜ, closure, tracers.e)
-    ℓu = momentum_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, surface_buoyancy_flux)
-    return ℓu * u★
+    w★ = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocityᶜᶜᶜ, closure, tracers.e)
+    ℓᵘ = momentum_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, surface_buoyancy_flux)
+    κᵘ = ℓᵘ * w★
+    κ_max = closure.maximum_diffusivity
+    return min(κᵘ, κ_max)
+end
+
+@inline function κuᶜᶜᶜ(i, j, k, grid, closure, velocities, tracers, buoyancy, surface_buoyancy_flux)
+    w★ = turbulent_velocityᶜᶜᶜ(i, j, k, grid, closure, tracers.e)
+    ℓᵘ = momentum_mixing_lengthᶜᶜᶜ(i, j, k, grid, closure, velocities, tracers, buoyancy, surface_buoyancy_flux)
+    κᵘ = ℓᵘ * w★
+    κ_max = closure.maximum_diffusivity
+    return min(κᵘ, κ_max)
 end
 
 @inline function κcᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, surface_buoyancy_flux)
-    u★ = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocityᶜᶜᶜ, closure, tracers.e)
-    ℓc = tracer_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, surface_buoyancy_flux)
-    return ℓc * u★
+    w★ = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocityᶜᶜᶜ, closure, tracers.e)
+    ℓᶜ = tracer_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, surface_buoyancy_flux)
+    κᶜ = ℓᶜ * w★
+    κ_max = closure.maximum_diffusivity
+    return min(κᶜ, κ_max)
+end
+
+@inline function κcᶜᶜᶜ(i, j, k, grid, closure, velocities, tracers, buoyancy, surface_buoyancy_flux)
+    w★ = turbulent_velocityᶜᶜᶜ(i, j, k, grid, closure, tracers.e)
+    ℓᶜ = tracer_mixing_lengthᶜᶜᶜ(i, j, k, grid, closure, velocities, tracers, buoyancy, surface_buoyancy_flux)
+    κᶜ = ℓᶜ * w★
+    κ_max = closure.maximum_diffusivity
+    return min(κᶜ, κ_max)
 end
 
 @inline function κeᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, surface_buoyancy_flux)
-    u★ = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocityᶜᶜᶜ, closure, tracers.e)
-    ℓe = TKE_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, surface_buoyancy_flux)
-    return ℓe * u★
+    w★ = ℑzᵃᵃᶠ(i, j, k, grid, turbulent_velocityᶜᶜᶜ, closure, tracers.e)
+    ℓᵉ = TKE_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, surface_buoyancy_flux)
+    κᵉ = ℓᵉ * w★
+    κ_max = closure.maximum_diffusivity
+    return min(κᵉ, κ_max)
 end
 
 @inline viscosity(::FlavorOfCATKE, diffusivities) = diffusivities.κᵘ
@@ -346,7 +366,6 @@ function Base.summary(closure::CATKEVD)
 end
 
 function Base.show(io::IO, closure::FlavorOfCATKE)
-    # └
     print(io, summary(closure))
     print(io, '\n')
     print(io, "├── maximum_diffusivity: ", prettysummary(closure.maximum_diffusivity), '\n',
@@ -354,23 +373,23 @@ function Base.show(io::IO, closure::FlavorOfCATKE)
               "├── negative_turbulent_kinetic_energy_damping_time_scale: ", prettysummary(closure.negative_turbulent_kinetic_energy_damping_time_scale), '\n',
               "├── minimum_convective_buoyancy_flux: ", prettysummary(closure.minimum_convective_buoyancy_flux), '\n',
               "├── mixing_length: ", prettysummary(closure.mixing_length), '\n',
-              "│   ├── Cᴺ:   ", prettysummary(closure.mixing_length.Cᴺ), '\n',
+              "│   ├── Cˢ:   ", prettysummary(closure.mixing_length.Cˢ), '\n',
               "│   ├── Cᵇ:   ", prettysummary(closure.mixing_length.Cᵇ), '\n',
               "│   ├── Cᶜc:  ", prettysummary(closure.mixing_length.Cᶜc), '\n',
               "│   ├── Cᶜe:  ", prettysummary(closure.mixing_length.Cᶜe), '\n',
               "│   ├── Cᵉc:  ", prettysummary(closure.mixing_length.Cᵉc), '\n',
               "│   ├── Cᵉe:  ", prettysummary(closure.mixing_length.Cᵉe), '\n',
-              "│   ├── C⁻u:  ", prettysummary(closure.mixing_length.C⁻u), '\n',
-              "│   ├── C⁻c:  ", prettysummary(closure.mixing_length.C⁻c), '\n',
-              "│   ├── C⁻e:  ", prettysummary(closure.mixing_length.C⁻e), '\n',
-              "│   ├── C⁺u:  ", prettysummary(closure.mixing_length.C⁺u), '\n',
-              "│   ├── C⁺c:  ", prettysummary(closure.mixing_length.C⁺c), '\n',
-              "│   ├── C⁺e:  ", prettysummary(closure.mixing_length.C⁺e), '\n',
-              "│   ├── CRiʷ: ", prettysummary(closure.mixing_length.CRiʷ), '\n',
-              "│   └── CRiᶜ: ", prettysummary(closure.mixing_length.CRiᶜ), '\n',
+              "│   ├── Cˡᵒu: ", prettysummary(closure.mixing_length.Cˡᵒu), '\n',
+              "│   ├── Cˡᵒc: ", prettysummary(closure.mixing_length.Cˡᵒc), '\n',
+              "│   ├── Cˡᵒe: ", prettysummary(closure.mixing_length.Cˡᵒe), '\n',
+              "│   ├── Cʰⁱu: ", prettysummary(closure.mixing_length.Cʰⁱu), '\n',
+              "│   ├── Cʰⁱc: ", prettysummary(closure.mixing_length.Cʰⁱc), '\n',
+              "│   ├── Cʰⁱe: ", prettysummary(closure.mixing_length.Cʰⁱe), '\n',
+              "│   ├── CRiᵟ: ", prettysummary(closure.mixing_length.CRiᵟ), '\n',
+              "│   └── CRi⁰: ", prettysummary(closure.mixing_length.CRi⁰), '\n',
               "└── turbulent_kinetic_energy_equation: ", prettysummary(closure.turbulent_kinetic_energy_equation), '\n',
-              "    ├── C⁻D:  ", prettysummary(closure.turbulent_kinetic_energy_equation.C⁻D),  '\n',
-              "    ├── C⁺D:  ", prettysummary(closure.turbulent_kinetic_energy_equation.C⁺D),  '\n',
+              "    ├── CˡᵒD: ", prettysummary(closure.turbulent_kinetic_energy_equation.CˡᵒD),  '\n',
+              "    ├── CʰⁱD: ", prettysummary(closure.turbulent_kinetic_energy_equation.CʰⁱD),  '\n',
               "    ├── CᶜD:  ", prettysummary(closure.turbulent_kinetic_energy_equation.CᶜD),  '\n',
               "    ├── CᵉD:  ", prettysummary(closure.turbulent_kinetic_energy_equation.CᵉD),  '\n',
               "    ├── Cᵂu★: ", prettysummary(closure.turbulent_kinetic_energy_equation.Cᵂu★), '\n',
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
index 7c6c2cac4b..a1110e5839 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
@@ -13,21 +13,21 @@ using ..TurbulenceClosures:
 Contains mixing length parameters for CATKE vertical diffusivity.
 """
 Base.@kwdef struct MixingLength{FT}
-    Cᴺ   :: FT = 0.41
+    Cˢ   :: FT = 0.5
     Cᵇ   :: FT = 0.1
-    Cᶜc  :: FT = 1.5
-    Cᶜe  :: FT = 1.2
-    Cᵉc  :: FT = 0.085
+    Cᶜc  :: FT = 1.2
+    Cᶜe  :: FT = 1.7
+    Cᵉc  :: FT = 0.1
     Cᵉe  :: FT = 0.0
-    Cˢᶜ  :: FT = 0.14
-    C⁻u  :: FT = 0.46
-    C⁺u  :: FT = 0.21
-    C⁻c  :: FT = 0.49
-    C⁺c  :: FT = 0.11
-    C⁻e  :: FT = 4.5
-    C⁺e  :: FT = 1.4
-    CRiʷ :: FT = 0.45
-    CRiᶜ :: FT = 0.47
+    Cˢᵖ  :: FT = 1.8
+    Cˡᵒu :: FT = 0.9
+    Cʰⁱu :: FT = 0.3
+    Cˡᵒc :: FT = 0.8
+    Cʰⁱc :: FT = 0.2
+    Cˡᵒe :: FT = 1.9
+    Cʰⁱe :: FT = 2.6
+    CRiᵟ :: FT = 0.3
+    CRi⁰ :: FT = 0.1
 end
 
 #####
@@ -67,11 +67,11 @@ end
 end
 
 @inline function stable_length_scaleᶜᶜᶠ(i, j, k, grid, closure, e, velocities, tracers, buoyancy)
-    Cᴺ = closure.mixing_length.Cᴺ
-    ℓᴺ = Cᴺ * stratification_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, e, tracers, buoyancy)
+    ℓᴺ = stratification_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, e, tracers, buoyancy)
 
+    Cˢ = closure.mixing_length.Cˢ
     Cᵇ = closure.mixing_length.Cᵇ
-    d_up   = depthᶜᶜᶠ(i, j, k, grid)
+    d_up   = Cˢ * depthᶜᶜᶠ(i, j, k, grid)
     d_down = Cᵇ * height_above_bottomᶜᶜᶠ(i, j, k, grid)
     d = min(d_up, d_down)
 
@@ -82,11 +82,11 @@ end
 end
 
 @inline function stable_length_scaleᶜᶜᶜ(i, j, k, grid, closure, e, velocities, tracers, buoyancy)
-    Cᴺ = closure.mixing_length.Cᴺ
-    ℓᴺ = Cᴺ * stratification_mixing_lengthᶜᶜᶜ(i, j, k, grid, closure, e, tracers, buoyancy)
+    ℓᴺ = stratification_mixing_lengthᶜᶜᶜ(i, j, k, grid, closure, e, tracers, buoyancy)
 
+    Cˢ = closure.mixing_length.Cˢ
     Cᵇ = closure.mixing_length.Cᵇ
-    d_up = depthᶜᶜᶜ(i, j, k, grid)
+    d_up   = Cˢ * depthᶜᶜᶜ(i, j, k, grid)
     d_down = Cᵇ * height_above_bottomᶜᶜᶜ(i, j, k, grid)
     d = min(d_up, d_down)
 
@@ -99,7 +99,7 @@ end
 @inline three_halves_tkeᶜᶜᶜ(i, j, k, grid, closure, e) = turbulent_velocityᶜᶜᶜ(i, j, k, grid, closure, e)^3
 @inline squared_tkeᶜᶜᶜ(i, j, k, grid, closure, e) = turbulent_velocityᶜᶜᶜ(i, j, k, grid, closure, e)^2
 
-@inline function convective_length_scaleᶜᶜᶠ(i, j, k, grid, closure, Cᶜ::Number, Cᵉ::Number, Cˢᶜ::Number,
+@inline function convective_length_scaleᶜᶜᶠ(i, j, k, grid, closure, Cᶜ::Number, Cᵉ::Number, Cˢᵖ::Number,
                                             velocities, tracers, buoyancy, surface_buoyancy_flux)
 
     u, v, w = velocities
@@ -122,17 +122,17 @@ end
     convecting = (Qᵇ > Qᵇᵋ) & (N² < 0)
 
     # Model for shear-convection interaction
-    Sc = sqrt(S²) * w★² / (Qᵇ + Qᵇᵋ) # Sc = "Sheared convection number"
-    ϵᶜˢ = 1 - Cˢᶜ * Sc               # ϵ = Sheared convection factor
+    Sp = sqrt(S²) * w★² / (Qᵇ + Qᵇᵋ) # Sp = "Sheared convection number"
+    ϵˢᵖ = 1 - Cˢᵖ * Sp               # ϵ = Sheared convection factor
     
     # Reduce convective and entraining mixing lengths by sheared convection factor
     # end ensure non-negativity
-    ℓᶜ = clip(ϵᶜˢ * ℓᶜ)
+    ℓᶜ = clip(ϵˢᵖ * ℓᶜ)
 
     # "Entrainment length"
     # Ensures that w′b′ ~ Qᵇ at entrainment depth
     ℓᵉ = Cᵉ * Qᵇ / (w★ * N² + Qᵇᵋ)
-    ℓᵉ = clip(ϵᶜˢ * ℓᵉ)
+    ℓᵉ = clip(ϵˢᵖ * ℓᵉ)
     
     entraining = (Qᵇ > Qᵇᵋ) & (N² > 0) & (N²_above < 0)
 
@@ -142,7 +142,7 @@ end
     return ifelse(isnan(ℓ), zero(grid), ℓ)
 end
 
-@inline function convective_length_scaleᶜᶜᶜ(i, j, k, grid, closure, Cᶜ::Number, Cᵉ::Number, Cˢᶜ::Number,
+@inline function convective_length_scaleᶜᶜᶜ(i, j, k, grid, closure, Cᶜ::Number, Cᵉ::Number, Cˢᵖ::Number,
                                             velocities, tracers, buoyancy, surface_buoyancy_flux)
 
     u, v, w = velocities
@@ -165,17 +165,17 @@ end
     convecting = (Qᵇ > Qᵇᵋ) & (N² < 0)
 
     # Model for shear-convection interaction
-    Sc = sqrt(S²) * w★² / (Qᵇ + Qᵇᵋ) # Sc = "Sheared convection number"
-    ϵᶜˢ = 1 - Cˢᶜ * Sc               # ϵ = Sheared convection factor
+    Sp = sqrt(S²) * w★² / (Qᵇ + Qᵇᵋ) # Sp = "Sheared convection number"
+    ϵˢᵖ = 1 - Cˢᵖ * Sp               # ϵ = Sheared convection factor
     
     # Reduce convective and entraining mixing lengths by sheared convection factor
     # end ensure non-negativity
-    ℓᶜ = clip(ϵᶜˢ * ℓᶜ)
+    ℓᶜ = clip(ϵˢᵖ * ℓᶜ)
 
     # "Entrainment length"
     # Ensures that w′b′ ~ Qᵇ at entrainment depth
     ℓᵉ = Cᵉ * Qᵇ / (w★ * N² + Qᵇᵋ)
-    ℓᵉ = clip(ϵᶜˢ * ℓᵉ)
+    ℓᵉ = clip(ϵˢᵖ * ℓᵉ)
     
     entraining = (Qᵇ > Qᵇᵋ) & (N² > 0) & (N²_above < 0)
 
@@ -189,17 +189,24 @@ end
 @inline step(x, c, w) = max(zero(x), min(one(x), (x - c) / w))
 @inline scale(Ri, σ⁻, σ⁺, c, w) = σ⁻ + (σ⁺ - σ⁻) * step(Ri, c, w)
 
-@inline function stability_functionᶜᶜᶠ(i, j, k, grid, closure, C⁻, C⁺, velocities, tracers, buoyancy)
+@inline function stability_functionᶜᶜᶠ(i, j, k, grid, closure, Cˡᵒ, Cʰⁱ, velocities, tracers, buoyancy)
     Ri = Riᶜᶜᶠ(i, j, k, grid, velocities, tracers, buoyancy)
-    CRiᶜ = closure.mixing_length.CRiᶜ
-    CRiʷ = closure.mixing_length.CRiʷ
-    return scale(Ri, C⁻, C⁺, CRiᶜ, CRiʷ)
+    CRi⁰ = closure.mixing_length.CRi⁰
+    CRiᵟ = closure.mixing_length.CRiᵟ
+    return scale(Ri, Cˡᵒ, Cʰⁱ, CRi⁰, CRiᵟ)
+end
+
+@inline function stability_functionᶜᶜᶜ(i, j, k, grid, closure, Cˡᵒ, Cʰⁱ, velocities, tracers, buoyancy)
+    Ri = Riᶜᶜᶜ(i, j, k, grid, velocities, tracers, buoyancy)
+    CRi⁰ = closure.mixing_length.CRi⁰
+    CRiᵟ = closure.mixing_length.CRiᵟ
+    return scale(Ri, Cˡᵒ, Cʰⁱ, CRi⁰, CRiᵟ)
 end
 
 @inline function momentum_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, surface_buoyancy_flux)
-    C⁻ = closure.mixing_length.C⁻u
-    C⁺ = closure.mixing_length.C⁺u
-    σ = stability_functionᶜᶜᶠ(i, j, k, grid, closure, C⁻, C⁺, velocities, tracers, buoyancy)
+    Cˡᵒ = closure.mixing_length.Cˡᵒu
+    Cʰⁱ = closure.mixing_length.Cʰⁱu
+    σ = stability_functionᶜᶜᶠ(i, j, k, grid, closure, Cˡᵒ, Cʰⁱ, velocities, tracers, buoyancy)
 
     ℓ★ = σ * stable_length_scaleᶜᶜᶠ(i, j, k, grid, closure, tracers.e, velocities, tracers, buoyancy)
     ℓ★ = ifelse(isnan(ℓ★), zero(grid), ℓ★)
@@ -209,59 +216,93 @@ end
     return min(H, ℓ★)
 end
 
+@inline function momentum_mixing_lengthᶜᶜᶜ(i, j, k, grid, closure, velocities, tracers, buoyancy, surface_buoyancy_flux)
+    Cˡᵒ = closure.mixing_length.Cˡᵒu
+    Cʰⁱ = closure.mixing_length.Cʰⁱu
+    σ = stability_functionᶜᶜᶜ(i, j, k, grid, closure, Cˡᵒ, Cʰⁱ, velocities, tracers, buoyancy)
+
+    ℓ★ = σ * stable_length_scaleᶜᶜᶜ(i, j, k, grid, closure, tracers.e, velocities, tracers, buoyancy)
+    ℓ★ = ifelse(isnan(ℓ★), zero(grid), ℓ★)
+
+    H = total_depthᶜᶜᵃ(i, j, grid)
+
+    return min(H, ℓ★)
+end
+
 @inline function tracer_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, surface_buoyancy_flux)
     Cᶜ  = closure.mixing_length.Cᶜc
     Cᵉ  = closure.mixing_length.Cᵉc
-    Cˢᶜ = closure.mixing_length.Cˢᶜ
-    ℓʰ = convective_length_scaleᶜᶜᶠ(i, j, k, grid, closure, Cᶜ, Cᵉ, Cˢᶜ, velocities, tracers, buoyancy, surface_buoyancy_flux)
+    Cˢᵖ = closure.mixing_length.Cˢᵖ
+    ℓʰ = convective_length_scaleᶜᶜᶠ(i, j, k, grid, closure, Cᶜ, Cᵉ, Cˢᵖ, velocities, tracers, buoyancy, surface_buoyancy_flux)
 
-    C⁻ = closure.mixing_length.C⁻c
-    C⁺ = closure.mixing_length.C⁺c
-    σ = stability_functionᶜᶜᶠ(i, j, k, grid, closure, C⁻, C⁺, velocities, tracers, buoyancy)
+    Cˡᵒ = closure.mixing_length.Cˡᵒc
+    Cʰⁱ = closure.mixing_length.Cʰⁱc
+    σ = stability_functionᶜᶜᶠ(i, j, k, grid, closure, Cˡᵒ, Cʰⁱ, velocities, tracers, buoyancy)
     ℓ★ = σ * stable_length_scaleᶜᶜᶠ(i, j, k, grid, closure, tracers.e, velocities, tracers, buoyancy)
 
     ℓʰ = ifelse(isnan(ℓʰ), zero(grid), ℓʰ)
     ℓ★ = ifelse(isnan(ℓ★), zero(grid), ℓ★)
+    ℓᶜ = max(ℓ★, ℓʰ)
+
+    H = total_depthᶜᶜᵃ(i, j, grid)
+    return min(H, ℓᶜ)
+end
+
+@inline function tracer_mixing_lengthᶜᶜᶜ(i, j, k, grid, closure, velocities, tracers, buoyancy, surface_buoyancy_flux)
+    Cᶜ  = closure.mixing_length.Cᶜc
+    Cᵉ  = closure.mixing_length.Cᵉc
+    Cˢᵖ = closure.mixing_length.Cˢᵖ
+    ℓʰ = convective_length_scaleᶜᶜᶜ(i, j, k, grid, closure, Cᶜ, Cᵉ, Cˢᵖ, velocities, tracers, buoyancy, surface_buoyancy_flux)
+
+    Cˡᵒ = closure.mixing_length.Cˡᵒc
+    Cʰⁱ = closure.mixing_length.Cʰⁱc
+    σ = stability_functionᶜᶜᶜ(i, j, k, grid, closure, Cˡᵒ, Cʰⁱ, velocities, tracers, buoyancy)
+    ℓ★ = σ * stable_length_scaleᶜᶜᶜ(i, j, k, grid, closure, tracers.e, velocities, tracers, buoyancy)
+
+    ℓʰ = ifelse(isnan(ℓʰ), zero(grid), ℓʰ)
+    ℓ★ = ifelse(isnan(ℓ★), zero(grid), ℓ★)
+    ℓᶜ = max(ℓ★, ℓʰ)
 
     H = total_depthᶜᶜᵃ(i, j, grid)
 
-    return min(H, max(ℓ★, ℓʰ))
+    return min(H, ℓᶜ)
 end
 
 @inline function TKE_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, velocities, tracers, buoyancy, surface_buoyancy_flux)
     Cᶜ  = closure.mixing_length.Cᶜe
     Cᵉ  = closure.mixing_length.Cᵉe
-    Cˢᶜ = closure.mixing_length.Cˢᶜ
-    ℓʰ = convective_length_scaleᶜᶜᶠ(i, j, k, grid, closure, Cᶜ, Cᵉ, Cˢᶜ, velocities, tracers, buoyancy, surface_buoyancy_flux)
+    Cˢᵖ = closure.mixing_length.Cˢᵖ
+    ℓʰ  = convective_length_scaleᶜᶜᶠ(i, j, k, grid, closure, Cᶜ, Cᵉ, Cˢᵖ, velocities, tracers, buoyancy, surface_buoyancy_flux)
 
-    C⁻ = closure.mixing_length.C⁻e
-    C⁺ = closure.mixing_length.C⁺e
-    σ = stability_functionᶜᶜᶠ(i, j, k, grid, closure, C⁻, C⁺, velocities, tracers, buoyancy)
+    Cˡᵒ = closure.mixing_length.Cˡᵒe
+    Cʰⁱ = closure.mixing_length.Cʰⁱe
+    σ = stability_functionᶜᶜᶠ(i, j, k, grid, closure, Cˡᵒ, Cʰⁱ, velocities, tracers, buoyancy)
     ℓ★ = σ * stable_length_scaleᶜᶜᶠ(i, j, k, grid, closure, tracers.e, velocities, tracers, buoyancy)
 
     ℓʰ = ifelse(isnan(ℓʰ), zero(grid), ℓʰ)
     ℓ★ = ifelse(isnan(ℓ★), zero(grid), ℓ★)
+    ℓᵉ = max(ℓ★, ℓʰ)
 
     H = total_depthᶜᶜᵃ(i, j, grid)
-    return min(H, max(ℓ★, ℓʰ))
+    return min(H, ℓᵉ)
 end
 
 Base.summary(::MixingLength) = "CATKEVerticalDiffusivities.MixingLength"
 
 Base.show(io::IO, ml::MixingLength) =
     print(io, "CATKEVerticalDiffusivities.MixingLength parameters:", '\n',
-              "    Cᴺ   = $(ml.Cᴺ)",   '\n',
-              "    Cᵇ   = $(ml.Cᵇ)",   '\n',
-              "    Cᶜc  = $(ml.Cᶜc)",  '\n',
-              "    Cᶜe  = $(ml.Cᶜe)",  '\n',
-              "    Cᵉc  = $(ml.Cᵉc)",  '\n',
-              "    Cᵉe  = $(ml.Cᵉe)",  '\n',
-              "    C⁻u  = $(ml.C⁻u)", '\n',
-              "    C⁻c  = $(ml.C⁻c)", '\n',
-              "    C⁻e  = $(ml.C⁻e)", '\n',
-              "    C⁺u  = $(ml.C⁺u)", '\n',
-              "    C⁺c  = $(ml.C⁺c)", '\n',
-              "    C⁺e  = $(ml.C⁺e)", '\n',
-              "    CRiʷ = $(ml.CRiʷ)", '\n',
-              "    CRiᶜ = $(ml.CRiᶜ)")
+              "    Cˢ:   $(ml.Cˢ)",   '\n',
+              "    Cᵇ:   $(ml.Cᵇ)",   '\n',
+              "    Cᶜc:  $(ml.Cᶜc)",  '\n',
+              "    Cᶜe:  $(ml.Cᶜe)",  '\n',
+              "    Cᵉc:  $(ml.Cᵉc)",  '\n',
+              "    Cᵉe:  $(ml.Cᵉe)",  '\n',
+              "    Cˡᵒu: $(ml.Cˡᵒu)", '\n',
+              "    Cˡᵒc: $(ml.Cˡᵒc)", '\n',
+              "    Cˡᵒe: $(ml.Cˡᵒe)", '\n',
+              "    Cʰⁱu: $(ml.Cʰⁱu)", '\n',
+              "    Cʰⁱc: $(ml.Cʰⁱc)", '\n',
+              "    Cʰⁱe: $(ml.Cʰⁱe)", '\n',
+              "    CRiᵟ: $(ml.CRiᵟ)", '\n',
+              "    CRi⁰: $(ml.CRi⁰)")
 
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
index cb6a0f125d..5b994411be 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
@@ -5,12 +5,12 @@ Parameters for the evolution of oceanic turbulent kinetic energy at the O(1 m) s
 isotropic turbulence and diapycnal mixing.
 """
 Base.@kwdef struct TurbulentKineticEnergyEquation{FT}
-    C⁻D   :: FT = 2.3
-    C⁺D   :: FT = 6.7
-    CᶜD   :: FT = 0.88
+    CˡᵒD   :: FT = 0.16
+    CʰⁱD   :: FT = 1.1
+    CᶜD   :: FT = 0.87
     CᵉD   :: FT = 0.0
-    Cᵂu★  :: FT = 1.1
-    CᵂwΔ  :: FT = 4.0
+    Cᵂu★  :: FT = 0.26
+    CᵂwΔ  :: FT = 4.1
     Cᵂϵ   :: FT = 1.0
 end
 
@@ -18,31 +18,49 @@ end
 ##### Terms in the turbulent kinetic energy equation, all at cell centers
 #####
 
-@inline ν_∂z_u²(i, j, k, grid, ν, u) = ℑxᶠᵃᵃ(i, j, k, grid, ν) * ∂zᶠᶜᶠ(i, j, k, grid, u)^2
-@inline ν_∂z_v²(i, j, k, grid, ν, v) = ℑyᵃᶠᵃ(i, j, k, grid, ν) * ∂zᶜᶠᶠ(i, j, k, grid, v)^2
+@inline ν_∂z_u²(i, j, k, grid, u, ν, args...) = ℑxᶠᵃᵃ(i, j, k, grid, ν, args...) * ∂zᶠᶜᶠ(i, j, k, grid, u)^2
+@inline ν_∂z_v²(i, j, k, grid, u, ν, args...) = ℑyᵃᶠᵃ(i, j, k, grid, ν, args...) * ∂zᶜᶠᶠ(i, j, k, grid, v)^2
 
-@inline function shear_production(i, j, k, grid, closure::FlavorOfCATKE, velocities, diffusivities)
-    κᵘ = diffusivities.κᵘ
+@inline function shear_production(i, j, k, grid, closure::FlavorOfCATKE, velocities, tracers, buoyancy, diffusivities)
+    closure = getclosure(i, j, closure)
     u = velocities.u
     v = velocities.v
 
-    # Separate reconstruction of the u- and v- contributions is essential for numerical stability
-    return ℑxzᶜᵃᶜ(i, j, k, grid, ν_∂z_u², κᵘ, u) + ℑyzᵃᶜᶜ(i, j, k, grid, ν_∂z_v², κᵘ, v)
+    #=
+    # Separate reconstruction of the u- and v- contributions is essential for numerical stability?
+    return ℑxzᶜᵃᶜ(i, j, k, grid, ν_∂z_u², u, κuᶜᶜᶜ, closure, velocities, tracers, buoyancy, Qᵇ) +
+           ℑyzᵃᶜᶜ(i, j, k, grid, ν_∂z_v², v, κuᶜᶜᶜ, closure, velocities, tracers, buoyancy, Qᵇ)
+    =#
+
+    κᵘ = κuᶜᶜᶜ(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities.Qᵇ)
+    S² = shearᶜᶜᶜ(i, j, k, grid, u, v)
+
+    return κᵘ * S²
 end
 
+#=
 @inline function buoyancy_fluxᶜᶜᶠ(i, j, k, grid, tracers, buoyancy, diffusivities)
     κᶻ = @inbounds diffusivities.κᶜ[i, j, k]
     N² = ∂z_b(i, j, k, grid, buoyancy, tracers)
     return - κᶻ * N²
 end
+=#
+
+@inline function _buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities)
+    closure = getclosure(i, j, closure)
+    κᶜ = κcᶜᶜᶜ(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities.Qᵇ)
+    N² = ℑzᵃᵃᶜ(i, j, k, grid, ∂z_b, buoyancy, tracers)
+    wb = - κᶜ * N²
+    return wb
+end
 
 @inline buoyancy_flux(i, j, k, grid, closure::FlavorOfCATKE, velocities, tracers, buoyancy, diffusivities) =
-    ℑzᵃᵃᶜ(i, j, k, grid, buoyancy_fluxᶜᶜᶠ, tracers, buoyancy, diffusivities)
+    _buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities)
 
 const VITD = VerticallyImplicitTimeDiscretization
 
 @inline function buoyancy_flux(i, j, k, grid, closure::FlavorOfCATKE{<:VITD}, velocities, tracers, buoyancy, diffusivities)
-    wb = ℑzᵃᵃᶜ(i, j, k, grid, buoyancy_fluxᶜᶜᶠ, tracers, buoyancy, diffusivities)
+    wb = _buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities)
     eⁱʲᵏ = @inbounds tracers.e[i, j, k]
 
     dissipative_buoyancy_flux = sign(wb) * sign(eⁱʲᵏ) < 0
@@ -61,27 +79,25 @@ end
     # Convective dissipation length
     Cᶜ = closure.turbulent_kinetic_energy_equation.CᶜD
     Cᵉ = closure.turbulent_kinetic_energy_equation.CᵉD
-    Cˢᶜ = closure.mixing_length.Cˢᶜ
+    Cˢᵖ = closure.mixing_length.Cˢᵖ
     Qᵇ = surface_buoyancy_flux
-    ℓʰ = convective_length_scaleᶜᶜᶜ(i, j, k, grid, closure, Cᶜ, Cᵉ, Cˢᶜ, velocities, tracers, buoyancy, Qᵇ)
+    ℓʰ = convective_length_scaleᶜᶜᶜ(i, j, k, grid, closure, Cᶜ, Cᵉ, Cˢᵖ, velocities, tracers, buoyancy, Qᵇ)
 
     # "Stable" dissipation length
-    C⁻D = closure.turbulent_kinetic_energy_equation.C⁻D
-    C⁺D = closure.turbulent_kinetic_energy_equation.C⁺D
-    Riᶜ = closure.mixing_length.CRiᶜ
-    Riʷ = closure.mixing_length.CRiʷ
-    Ri = Riᶜᶜᶜ(i, j, k, grid, velocities, tracers, buoyancy)
-    σ = scale(Ri, C⁻D, C⁺D, Riᶜ, Riʷ)
-    ℓ★ = σ * stable_length_scaleᶜᶜᶜ(i, j, k, grid, closure, tracers.e, velocities, tracers, buoyancy)
+    Cˡᵒ = closure.turbulent_kinetic_energy_equation.CˡᵒD
+    Cʰⁱ = closure.turbulent_kinetic_energy_equation.CʰⁱD
+    σᴰ = stability_functionᶜᶜᶜ(i, j, k, grid, closure, Cˡᵒ, Cʰⁱ, velocities, tracers, buoyancy)
+    ℓ★ = stable_length_scaleᶜᶜᶜ(i, j, k, grid, closure, tracers.e, velocities, tracers, buoyancy)
+    ℓ★ = ℓ★ / σᴰ
 
+    # Dissipation length
     ℓʰ = ifelse(isnan(ℓʰ), zero(grid), ℓʰ)
     ℓ★ = ifelse(isnan(ℓ★), zero(grid), ℓ★)
+    ℓᴰ = max(ℓ★, ℓʰ)
 
-    # Dissipation length
     H = total_depthᶜᶜᵃ(i, j, grid)
-    ℓᴰ = min(H, ℓ★ + ℓʰ)
 
-    return ℓᴰ
+    return min(H, ℓᴰ)
 end
 
 @inline function dissipation_rate(i, j, k, grid, closure::FlavorOfCATKE,
@@ -100,9 +116,10 @@ end
     #
     #   and thus    L = - Cᴰ √e / ℓ .
 
-    τ = closure.negative_turbulent_kinetic_energy_damping_time_scale
+    ω_numerical = 1 / closure.negative_turbulent_kinetic_energy_damping_time_scale
+    ω_physical = sqrt(abs(eᵢ)) / ℓᴰ
 
-    return ifelse(eᵢ < 0, -1/τ, -sqrt(abs(eᵢ)) / ℓᴰ)
+    return ifelse(eᵢ < 0, ω_numerical, ω_physical)
 end
 
 # Fallbacks for explicit time discretization
@@ -112,8 +129,6 @@ end
     return - ω * eᵢ
 end
 
-@inline dissipation_rate(i, j, k, grid, closure::FlavorOfCATKE, args...) = zero(grid)
-
 #####
 ##### For closure tuples...
 #####
@@ -298,9 +313,10 @@ end
 Base.summary(::TurbulentKineticEnergyEquation) = "CATKEVerticalDiffusivities.TurbulentKineticEnergyEquation"
 Base.show(io::IO, tke::TurbulentKineticEnergyEquation) =
     print(io, "CATKEVerticalDiffusivities.TurbulentKineticEnergyEquation parameters: \n" *
-              "    C⁻D  = $(tke.C⁻D),  \n" *
-              "    C⁺D  = $(tke.C⁺D),  \n" *
-              "    CᶜD  = $(tke.CᶜD),  \n" *
-              "    CᵉD  = $(tke.CᵉD),  \n" *
-              "    Cᵂu★ = $(tke.Cᵂu★), \n" *
-              "    CᵂwΔ = $(tke.CᵂwΔ)")
+              "    CˡᵒD: $(tke.CˡᵒD),  \n" *
+              "    CʰⁱD: $(tke.CʰⁱD),  \n" *
+              "    CᶜD:  $(tke.CᶜD),  \n" *
+              "    CᵉD:  $(tke.CᵉD),  \n" *
+              "    Cᵂu★: $(tke.Cᵂu★), \n" *
+              "    CᵂwΔ: $(tke.CᵂwΔ)")
+
diff --git a/validation/vertical_mixing_closures/column_windy_convection.jl b/validation/vertical_mixing_closures/column_windy_convection.jl
index 93098892aa..adc118c98c 100644
--- a/validation/vertical_mixing_closures/column_windy_convection.jl
+++ b/validation/vertical_mixing_closures/column_windy_convection.jl
@@ -27,7 +27,7 @@ u_bcs = FieldBoundaryConditions(top = FluxBoundaryCondition(Qᵘ))
 
 closures_to_run = [
                    CATKEVerticalDiffusivity(),
-                   RiBasedVerticalDiffusivity(),
+                   #RiBasedVerticalDiffusivity(),
                    #convective_adjustment,
                    ]   
 
@@ -41,7 +41,7 @@ for closure in closures_to_run
     bᵢ(x, y, z) = N² * z
     set!(model, b=bᵢ, e=1e-6)
 
-    simulation = Simulation(model, Δt=10minute, stop_time=48hours)
+    simulation = Simulation(model, Δt=10minutes, stop_iteration=1000)
 
     closurename = string(nameof(typeof(closure)))
 
@@ -52,7 +52,8 @@ for closure in closures_to_run
 
     simulation.output_writers[:fields] =
         JLD2OutputWriter(model, outputs,
-                         schedule = TimeInterval(10minutes),
+                         #schedule = TimeInterval(10minutes),
+                         schedule = IterationInterval(100),
                          filename = "windy_convection_" * closurename,
                          overwrite_existing = true)
 

From fb33f13d3b3f237fbf85548c27d511cbb73d23fd Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Thu, 4 May 2023 23:36:10 -0400
Subject: [PATCH 248/530] Fix GPU errors

---
 .../CATKEVerticalDiffusivities/mixing_length.jl             | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
index a1110e5839..c7f99cad0c 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
@@ -102,7 +102,8 @@ end
 @inline function convective_length_scaleᶜᶜᶠ(i, j, k, grid, closure, Cᶜ::Number, Cᵉ::Number, Cˢᵖ::Number,
                                             velocities, tracers, buoyancy, surface_buoyancy_flux)
 
-    u, v, w = velocities
+    u = velocities.u
+    v = velocities.v
 
     Qᵇᵋ      = closure.minimum_convective_buoyancy_flux
     Qᵇ       = @inbounds surface_buoyancy_flux[i, j, 1]
@@ -145,7 +146,8 @@ end
 @inline function convective_length_scaleᶜᶜᶜ(i, j, k, grid, closure, Cᶜ::Number, Cᵉ::Number, Cˢᵖ::Number,
                                             velocities, tracers, buoyancy, surface_buoyancy_flux)
 
-    u, v, w = velocities
+    u = velocities.u
+    v = velocities.v
 
     Qᵇᵋ      = closure.minimum_convective_buoyancy_flux
     Qᵇ       = @inbounds surface_buoyancy_flux[i, j, 1]

From 518960453d13d0a6b8ed50cbea76f8fe67a654fc Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Sun, 7 May 2023 07:04:45 -0400
Subject: [PATCH 249/530] Updates

---
 .../mixing_length.jl                          |  2 +-
 .../turbulent_kinetic_energy_equation.jl      | 28 ++++++++++---------
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
index c7f99cad0c..d6592aa9a2 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
@@ -189,7 +189,7 @@ end
 
 """Piecewise linear function between 0 (when x < c) and 1 (when x - c > w)."""
 @inline step(x, c, w) = max(zero(x), min(one(x), (x - c) / w))
-@inline scale(Ri, σ⁻, σ⁺, c, w) = σ⁻ + (σ⁺ - σ⁻) * step(Ri, c, w)
+@inline scale(Ri, σ⁻, σ⁺ , c, w) = σ⁻ + (σ⁺ - σ⁻) * step(Ri, c, w)
 
 @inline function stability_functionᶜᶜᶠ(i, j, k, grid, closure, Cˡᵒ, Cʰⁱ, velocities, tracers, buoyancy)
     Ri = Riᶜᶜᶠ(i, j, k, grid, velocities, tracers, buoyancy)
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
index 5b994411be..dd2d37700f 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
@@ -5,8 +5,8 @@ Parameters for the evolution of oceanic turbulent kinetic energy at the O(1 m) s
 isotropic turbulence and diapycnal mixing.
 """
 Base.@kwdef struct TurbulentKineticEnergyEquation{FT}
-    CˡᵒD   :: FT = 0.16
-    CʰⁱD   :: FT = 1.1
+    CˡᵒD  :: FT = 0.16
+    CʰⁱD  :: FT = 1.1
     CᶜD   :: FT = 0.87
     CᵉD   :: FT = 0.0
     Cᵂu★  :: FT = 0.26
@@ -18,40 +18,42 @@ end
 ##### Terms in the turbulent kinetic energy equation, all at cell centers
 #####
 
-@inline ν_∂z_u²(i, j, k, grid, u, ν, args...) = ℑxᶠᵃᵃ(i, j, k, grid, ν, args...) * ∂zᶠᶜᶠ(i, j, k, grid, u)^2
-@inline ν_∂z_v²(i, j, k, grid, u, ν, args...) = ℑyᵃᶠᵃ(i, j, k, grid, ν, args...) * ∂zᶜᶠᶠ(i, j, k, grid, v)^2
+@inline ν_∂z_u²(i, j, k, grid, u, ν) = ℑxᶠᵃᵃ(i, j, k, grid, ν) * ∂zᶠᶜᶠ(i, j, k, grid, u)^2
+@inline ν_∂z_v²(i, j, k, grid, v, ν) = ℑyᵃᶠᵃ(i, j, k, grid, ν) * ∂zᶜᶠᶠ(i, j, k, grid, v)^2
 
 @inline function shear_production(i, j, k, grid, closure::FlavorOfCATKE, velocities, tracers, buoyancy, diffusivities)
-    closure = getclosure(i, j, closure)
     u = velocities.u
     v = velocities.v
 
     #=
     # Separate reconstruction of the u- and v- contributions is essential for numerical stability?
-    return ℑxzᶜᵃᶜ(i, j, k, grid, ν_∂z_u², u, κuᶜᶜᶜ, closure, velocities, tracers, buoyancy, Qᵇ) +
-           ℑyzᵃᶜᶜ(i, j, k, grid, ν_∂z_v², v, κuᶜᶜᶜ, closure, velocities, tracers, buoyancy, Qᵇ)
+    κᵘ = diffusivities.κᵘ
+    return ℑxzᶜᵃᶜ(i, j, k, grid, ν_∂z_u², κᵘ, u) +
+           ℑyzᵃᶜᶜ(i, j, k, grid, ν_∂z_v², κᵘ, v)
     =#
 
+    closure = getclosure(i, j, closure)
     κᵘ = κuᶜᶜᶜ(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities.Qᵇ)
     S² = shearᶜᶜᶜ(i, j, k, grid, u, v)
-
     return κᵘ * S²
 end
 
 #=
 @inline function buoyancy_fluxᶜᶜᶠ(i, j, k, grid, tracers, buoyancy, diffusivities)
-    κᶻ = @inbounds diffusivities.κᶜ[i, j, k]
+    κᶜ = @inbounds diffusivities.κᶜ[i, j, k]
     N² = ∂z_b(i, j, k, grid, buoyancy, tracers)
-    return - κᶻ * N²
+    return - κᶜ * N²
 end
+
+@inline _buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities) =
+    ℑzᵃᵃᶜ(i, j, k, grid, buoyancy_fluxᶜᶜᶠ, tracers, buoyancy, diffusivities)
 =#
 
 @inline function _buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities)
     closure = getclosure(i, j, closure)
     κᶜ = κcᶜᶜᶜ(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities.Qᵇ)
     N² = ℑzᵃᵃᶜ(i, j, k, grid, ∂z_b, buoyancy, tracers)
-    wb = - κᶜ * N²
-    return wb
+    return - κᶜ * N²
 end
 
 @inline buoyancy_flux(i, j, k, grid, closure::FlavorOfCATKE, velocities, tracers, buoyancy, diffusivities) =
@@ -88,7 +90,7 @@ end
     Cʰⁱ = closure.turbulent_kinetic_energy_equation.CʰⁱD
     σᴰ = stability_functionᶜᶜᶜ(i, j, k, grid, closure, Cˡᵒ, Cʰⁱ, velocities, tracers, buoyancy)
     ℓ★ = stable_length_scaleᶜᶜᶜ(i, j, k, grid, closure, tracers.e, velocities, tracers, buoyancy)
-    ℓ★ = ℓ★ / σᴰ
+    ℓ★ = ℓ★ / σᴰ 
 
     # Dissipation length
     ℓʰ = ifelse(isnan(ℓʰ), zero(grid), ℓʰ)

From 598e856ce890af30f4856daa482f1f80454cf1a0 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Mon, 8 May 2023 15:43:40 -0400
Subject: [PATCH 250/530] Bugfix

---
 .../turbulent_kinetic_energy_equation.jl                   | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
index dd2d37700f..bf9e40b248 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
@@ -18,8 +18,8 @@ end
 ##### Terms in the turbulent kinetic energy equation, all at cell centers
 #####
 
-@inline ν_∂z_u²(i, j, k, grid, u, ν) = ℑxᶠᵃᵃ(i, j, k, grid, ν) * ∂zᶠᶜᶠ(i, j, k, grid, u)^2
-@inline ν_∂z_v²(i, j, k, grid, v, ν) = ℑyᵃᶠᵃ(i, j, k, grid, ν) * ∂zᶜᶠᶠ(i, j, k, grid, v)^2
+@inline ν_∂z_u²(i, j, k, grid, ν, u) = ℑxᶠᵃᵃ(i, j, k, grid, ν) * ∂zᶠᶜᶠ(i, j, k, grid, u)^2
+@inline ν_∂z_v²(i, j, k, grid, ν, v) = ℑyᵃᶠᵃ(i, j, k, grid, ν) * ∂zᶜᶠᶠ(i, j, k, grid, v)^2
 
 @inline function shear_production(i, j, k, grid, closure::FlavorOfCATKE, velocities, tracers, buoyancy, diffusivities)
     u = velocities.u
@@ -90,7 +90,7 @@ end
     Cʰⁱ = closure.turbulent_kinetic_energy_equation.CʰⁱD
     σᴰ = stability_functionᶜᶜᶜ(i, j, k, grid, closure, Cˡᵒ, Cʰⁱ, velocities, tracers, buoyancy)
     ℓ★ = stable_length_scaleᶜᶜᶜ(i, j, k, grid, closure, tracers.e, velocities, tracers, buoyancy)
-    ℓ★ = ℓ★ / σᴰ 
+    ℓ★ = ℓ★ / σᴰ
 
     # Dissipation length
     ℓʰ = ifelse(isnan(ℓʰ), zero(grid), ℓʰ)
@@ -98,7 +98,6 @@ end
     ℓᴰ = max(ℓ★, ℓʰ)
 
     H = total_depthᶜᶜᵃ(i, j, grid)
-
     return min(H, ℓᴰ)
 end
 

From 68cbd779e3ae646f831ac0fbc13245bdbe9ecebd Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Tue, 9 May 2023 16:19:02 -0400
Subject: [PATCH 251/530] Use precise TKE terms

---
 .../CATKEVerticalDiffusivities.jl             |  2 +-
 .../turbulent_kinetic_energy_equation.jl      | 24 +++++++++----------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
index 1ab2dc2027..738b14de14 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
@@ -282,7 +282,7 @@ end
 
         # "Patankar trick" for buoyancy production (cf Patankar 1980 or Burchard et al. 2003)
         # If buoyancy flux is a _sink_ of TKE, we treat it implicitly.
-        wb = _buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities)
+        wb = explicit_buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities)
         eⁱʲᵏ = @inbounds tracers.e[i, j, k]
 
         # See `buoyancy_flux`
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
index bf9e40b248..7fe4d56d37 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
@@ -18,51 +18,51 @@ end
 ##### Terms in the turbulent kinetic energy equation, all at cell centers
 #####
 
-@inline ν_∂z_u²(i, j, k, grid, ν, u) = ℑxᶠᵃᵃ(i, j, k, grid, ν) * ∂zᶠᶜᶠ(i, j, k, grid, u)^2
-@inline ν_∂z_v²(i, j, k, grid, ν, v) = ℑyᵃᶠᵃ(i, j, k, grid, ν) * ∂zᶜᶠᶠ(i, j, k, grid, v)^2
+@inline ν_∂z_u²ᶠᶜᶠ(i, j, k, grid, ν, u) = ℑxᶠᵃᵃ(i, j, k, grid, ν) * ∂zᶠᶜᶠ(i, j, k, grid, u)^2
+@inline ν_∂z_v²ᶜᶠᶠ(i, j, k, grid, ν, v) = ℑyᵃᶠᵃ(i, j, k, grid, ν) * ∂zᶜᶠᶠ(i, j, k, grid, v)^2
 
 @inline function shear_production(i, j, k, grid, closure::FlavorOfCATKE, velocities, tracers, buoyancy, diffusivities)
     u = velocities.u
     v = velocities.v
 
-    #=
     # Separate reconstruction of the u- and v- contributions is essential for numerical stability?
     κᵘ = diffusivities.κᵘ
-    return ℑxzᶜᵃᶜ(i, j, k, grid, ν_∂z_u², κᵘ, u) +
-           ℑyzᵃᶜᶜ(i, j, k, grid, ν_∂z_v², κᵘ, v)
-    =#
+    return ℑxzᶜᵃᶜ(i, j, k, grid, ν_∂z_u²ᶠᶜᶠ, κᵘ, u) +
+           ℑyzᵃᶜᶜ(i, j, k, grid, ν_∂z_v²ᶜᶠᶠ, κᵘ, v)
 
+    #=
     closure = getclosure(i, j, closure)
     κᵘ = κuᶜᶜᶜ(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities.Qᵇ)
     S² = shearᶜᶜᶜ(i, j, k, grid, u, v)
     return κᵘ * S²
+    =#
 end
 
-#=
 @inline function buoyancy_fluxᶜᶜᶠ(i, j, k, grid, tracers, buoyancy, diffusivities)
     κᶜ = @inbounds diffusivities.κᶜ[i, j, k]
     N² = ∂z_b(i, j, k, grid, buoyancy, tracers)
     return - κᶜ * N²
 end
 
-@inline _buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities) =
+@inline explicit_buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities) =
     ℑzᵃᵃᶜ(i, j, k, grid, buoyancy_fluxᶜᶜᶠ, tracers, buoyancy, diffusivities)
-=#
 
-@inline function _buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities)
+#=
+@inline function explicit_buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities)
     closure = getclosure(i, j, closure)
     κᶜ = κcᶜᶜᶜ(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities.Qᵇ)
     N² = ℑzᵃᵃᶜ(i, j, k, grid, ∂z_b, buoyancy, tracers)
     return - κᶜ * N²
 end
+=#
 
 @inline buoyancy_flux(i, j, k, grid, closure::FlavorOfCATKE, velocities, tracers, buoyancy, diffusivities) =
-    _buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities)
+    explicit_buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities)
 
 const VITD = VerticallyImplicitTimeDiscretization
 
 @inline function buoyancy_flux(i, j, k, grid, closure::FlavorOfCATKE{<:VITD}, velocities, tracers, buoyancy, diffusivities)
-    wb = _buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities)
+    wb = explicit_buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities)
     eⁱʲᵏ = @inbounds tracers.e[i, j, k]
 
     dissipative_buoyancy_flux = sign(wb) * sign(eⁱʲᵏ) < 0

From c2a27e1b3debb89e7df20830a1785e83c88df65b Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 16 May 2023 10:02:18 +0200
Subject: [PATCH 252/530] changes

---
 src/Distributed/multi_architectures.jl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/Distributed/multi_architectures.jl b/src/Distributed/multi_architectures.jl
index 006707f119..256638efa2 100644
--- a/src/Distributed/multi_architectures.jl
+++ b/src/Distributed/multi_architectures.jl
@@ -212,7 +212,6 @@ function RankConnectivity(model_index, ranks, topology)
     r_bot   = isnothing(k_bot)   ? nothing : index2rank(i, j, k_bot, Rx, Ry, Rz)
 
     r_northeast = isnothing(i_east) || isnothing(j_north) ? nothing : index2rank(i_east, j_north, k, Rx, Ry, Rz)
-
     r_northwest = isnothing(i_west) || isnothing(j_north) ? nothing : index2rank(i_west, j_north, k, Rx, Ry, Rz)
     r_southeast = isnothing(i_east) || isnothing(j_south) ? nothing : index2rank(i_east, j_south, k, Rx, Ry, Rz)
     r_southwest = isnothing(i_west) || isnothing(j_south) ? nothing : index2rank(i_west, j_south, k, Rx, Ry, Rz)

From 18d46cee3350cc7f1c503d7785f6bcbe767bcff5 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 16 May 2023 10:06:27 +0200
Subject: [PATCH 253/530] changes

---
 src/Distributed/halo_communication.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 4f28f438aa..510fc7d210 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -159,7 +159,7 @@ for (side, dir) in zip([:southwest, :southeast, :northwest, :northeast], [1, 2,
     end
 end
 
-# If more than one direction is communicating we need to repeat one fill halo to fill the freaking corners!
+# If more than one direction is communicating we need to add a corner passing routine!
 function fill_corners!(connectivity, c, indices, loc, arch, grid, buffers, args...; blocking = true, kwargs...)
     
     requests = MPI.Request[]

From f1cd18443e923beab709199d1fef67cb799c25dd Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Fri, 26 May 2023 13:31:31 -0400
Subject: [PATCH 254/530] Commit to nonconservative TKE equation but add
 comments

---
 .../turbulent_kinetic_energy_equation.jl      | 35 ++++++++++---------
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
index 7fe4d56d37..b69fe0d582 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
@@ -25,36 +25,37 @@ end
     u = velocities.u
     v = velocities.v
 
-    # Separate reconstruction of the u- and v- contributions is essential for numerical stability?
-    κᵘ = diffusivities.κᵘ
-    return ℑxzᶜᵃᶜ(i, j, k, grid, ν_∂z_u²ᶠᶜᶠ, κᵘ, u) +
-           ℑyzᵃᶜᶜ(i, j, k, grid, ν_∂z_v²ᶜᶠᶠ, κᵘ, v)
+    # To reconstruct the shear production term "conservatively" (ie approximately corresponding
+    # to dissipatation of mean kinetic energy):
+    # κᵘ = diffusivities.κᵘ
+    # return ℑxzᶜᵃᶜ(i, j, k, grid, ν_∂z_u²ᶠᶜᶠ, κᵘ, u) +
+    #        ℑyzᵃᶜᶜ(i, j, k, grid, ν_∂z_v²ᶜᶠᶠ, κᵘ, v)
 
-    #=
+    # Non-conservative reconstruction of shear production:
     closure = getclosure(i, j, closure)
     κᵘ = κuᶜᶜᶜ(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities.Qᵇ)
     S² = shearᶜᶜᶜ(i, j, k, grid, u, v)
     return κᵘ * S²
-    =#
 end
 
-@inline function buoyancy_fluxᶜᶜᶠ(i, j, k, grid, tracers, buoyancy, diffusivities)
-    κᶜ = @inbounds diffusivities.κᶜ[i, j, k]
-    N² = ∂z_b(i, j, k, grid, buoyancy, tracers)
-    return - κᶜ * N²
-end
-
-@inline explicit_buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities) =
-    ℑzᵃᵃᶜ(i, j, k, grid, buoyancy_fluxᶜᶜᶠ, tracers, buoyancy, diffusivities)
-
-#=
+# To reconstruct buoyancy flux "conservatively" (ie approximately correpsonding to production/destruction
+# of mean potential energy):
+# @inline function buoyancy_fluxᶜᶜᶠ(i, j, k, grid, tracers, buoyancy, diffusivities)
+#     κᶜ = @inbounds diffusivities.κᶜ[i, j, k]
+#     N² = ∂z_b(i, j, k, grid, buoyancy, tracers)
+#     return - κᶜ * N²
+# end
+# 
+# @inline explicit_buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities) =
+#     ℑzᵃᵃᶜ(i, j, k, grid, buoyancy_fluxᶜᶜᶠ, tracers, buoyancy, diffusivities)
+
+# Non-conservative reconstruction of buoyancy flux:
 @inline function explicit_buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities)
     closure = getclosure(i, j, closure)
     κᶜ = κcᶜᶜᶜ(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities.Qᵇ)
     N² = ℑzᵃᵃᶜ(i, j, k, grid, ∂z_b, buoyancy, tracers)
     return - κᶜ * N²
 end
-=#
 
 @inline buoyancy_flux(i, j, k, grid, closure::FlavorOfCATKE, velocities, tracers, buoyancy, diffusivities) =
     explicit_buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities)

From 935788824e1d934b080324da61f9f0f3effce37f Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Fri, 26 May 2023 13:13:23 -0600
Subject: [PATCH 255/530] Update free parameters and add comments

---
 .../mixing_length.jl                          | 30 +++++++++----------
 .../turbulent_kinetic_energy_equation.jl      | 14 ++++-----
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
index d6592aa9a2..94f1bfb9fa 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
@@ -13,21 +13,21 @@ using ..TurbulenceClosures:
 Contains mixing length parameters for CATKE vertical diffusivity.
 """
 Base.@kwdef struct MixingLength{FT}
-    Cˢ   :: FT = 0.5
-    Cᵇ   :: FT = 0.1
-    Cᶜc  :: FT = 1.2
-    Cᶜe  :: FT = 1.7
-    Cᵉc  :: FT = 0.1
-    Cᵉe  :: FT = 0.0
-    Cˢᵖ  :: FT = 1.8
-    Cˡᵒu :: FT = 0.9
-    Cʰⁱu :: FT = 0.3
-    Cˡᵒc :: FT = 0.8
-    Cʰⁱc :: FT = 0.2
-    Cˡᵒe :: FT = 1.9
-    Cʰⁱe :: FT = 2.6
-    CRiᵟ :: FT = 0.3
-    CRi⁰ :: FT = 0.1
+    Cˢ   :: FT = 0.36   # Surface distance coefficient for shear length scale
+    Cᵇ   :: FT = 0.1    # Bottom distance coefficient for shear length scale
+    Cᶜc  :: FT = 0.78   # Convective mixing length coefficient for tracers
+    Cᶜe  :: FT = 0.087  # Convective mixing length coefficient for TKE
+    Cᵉc  :: FT = 0.25   # Convective penetration mixing length coefficient for tracers
+    Cᵉe  :: FT = 0.0    # Convective penetration mixing length coefficient for TKE
+    Cˢᵖ  :: FT = 0.072  # Sheared convective plume coefficient
+    Cˡᵒu :: FT = 1.1    # Shear mixing length coefficient for momentum at low Ri
+    Cʰⁱu :: FT = 0.28   # Shear mixing length coefficient for momentum at high Ri
+    Cˡᵒc :: FT = 1.9    # Shear mixing length coefficient for tracers at low Ri
+    Cʰⁱc :: FT = 0.22   # Shear mixing length coefficient for tracers at high Ri
+    Cˡᵒe :: FT = 0.71   # Shear mixing length coefficient for TKE at low Ri
+    Cʰⁱe :: FT = 3.5    # Shear mixing length coefficient for TKE at high Ri
+    CRiᵟ :: FT = 0.14   # Stability function width 
+    CRi⁰ :: FT = 0.13   # Stability function lower Ri
 end
 
 #####
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
index b69fe0d582..43d5a4dd11 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
@@ -5,13 +5,13 @@ Parameters for the evolution of oceanic turbulent kinetic energy at the O(1 m) s
 isotropic turbulence and diapycnal mixing.
 """
 Base.@kwdef struct TurbulentKineticEnergyEquation{FT}
-    CˡᵒD  :: FT = 0.16
-    CʰⁱD  :: FT = 1.1
-    CᶜD   :: FT = 0.87
-    CᵉD   :: FT = 0.0
-    Cᵂu★  :: FT = 0.26
-    CᵂwΔ  :: FT = 4.1
-    Cᵂϵ   :: FT = 1.0
+    CˡᵒD  :: FT = 0.080 # Dissipation length scale shear coefficient for low Ri
+    CʰⁱD  :: FT = 1.3   # Dissipation length scale shear coefficient for high Ri
+    CᶜD   :: FT = 1.6   # Dissipation length scale convecting layer coefficient
+    CᵉD   :: FT = 0.0   # Dissipation length scale penetration layer coefficient
+    Cᵂu★  :: FT = 4.0   # Surface shear-driven TKE flux coefficient
+    CᵂwΔ  :: FT = 0.91  # Surface convective TKE flux coefficient
+    Cᵂϵ   :: FT = 1.0   # Dissipative near-bottom TKE flux coefficient
 end
 
 #####

From 1834d92dd5018489fba8bdb41b15662e7280af93 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Sat, 27 May 2023 16:22:05 -0400
Subject: [PATCH 256/530] Go back to conservative formulation

---
 .../turbulent_kinetic_energy_equation.jl      | 42 +++++++++----------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
index b69fe0d582..23331059fa 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
@@ -27,36 +27,36 @@ end
 
     # To reconstruct the shear production term "conservatively" (ie approximately corresponding
     # to dissipatation of mean kinetic energy):
-    # κᵘ = diffusivities.κᵘ
-    # return ℑxzᶜᵃᶜ(i, j, k, grid, ν_∂z_u²ᶠᶜᶠ, κᵘ, u) +
-    #        ℑyzᵃᶜᶜ(i, j, k, grid, ν_∂z_v²ᶜᶠᶠ, κᵘ, v)
+    κᵘ = diffusivities.κᵘ
+    return ℑxzᶜᵃᶜ(i, j, k, grid, ν_∂z_u²ᶠᶜᶠ, κᵘ, u) +
+           ℑyzᵃᶜᶜ(i, j, k, grid, ν_∂z_v²ᶜᶠᶠ, κᵘ, v)
 
     # Non-conservative reconstruction of shear production:
-    closure = getclosure(i, j, closure)
-    κᵘ = κuᶜᶜᶜ(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities.Qᵇ)
-    S² = shearᶜᶜᶜ(i, j, k, grid, u, v)
-    return κᵘ * S²
+    # closure = getclosure(i, j, closure)
+    # κᵘ = κuᶜᶜᶜ(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities.Qᵇ)
+    # S² = shearᶜᶜᶜ(i, j, k, grid, u, v)
+    # return κᵘ * S²
 end
 
 # To reconstruct buoyancy flux "conservatively" (ie approximately correpsonding to production/destruction
 # of mean potential energy):
-# @inline function buoyancy_fluxᶜᶜᶠ(i, j, k, grid, tracers, buoyancy, diffusivities)
-#     κᶜ = @inbounds diffusivities.κᶜ[i, j, k]
-#     N² = ∂z_b(i, j, k, grid, buoyancy, tracers)
-#     return - κᶜ * N²
-# end
-# 
-# @inline explicit_buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities) =
-#     ℑzᵃᵃᶜ(i, j, k, grid, buoyancy_fluxᶜᶜᶠ, tracers, buoyancy, diffusivities)
-
-# Non-conservative reconstruction of buoyancy flux:
-@inline function explicit_buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities)
-    closure = getclosure(i, j, closure)
-    κᶜ = κcᶜᶜᶜ(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities.Qᵇ)
-    N² = ℑzᵃᵃᶜ(i, j, k, grid, ∂z_b, buoyancy, tracers)
+@inline function buoyancy_fluxᶜᶜᶠ(i, j, k, grid, tracers, buoyancy, diffusivities)
+    κᶜ = @inbounds diffusivities.κᶜ[i, j, k]
+    N² = ∂z_b(i, j, k, grid, buoyancy, tracers)
     return - κᶜ * N²
 end
 
+@inline explicit_buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities) =
+    ℑzᵃᵃᶜ(i, j, k, grid, buoyancy_fluxᶜᶜᶠ, tracers, buoyancy, diffusivities)
+
+# Non-conservative reconstruction of buoyancy flux:
+# @inline function explicit_buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities)
+#     closure = getclosure(i, j, closure)
+#     κᶜ = κcᶜᶜᶜ(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities.Qᵇ)
+#     N² = ℑzᵃᵃᶜ(i, j, k, grid, ∂z_b, buoyancy, tracers)
+#     return - κᶜ * N²
+# end
+
 @inline buoyancy_flux(i, j, k, grid, closure::FlavorOfCATKE, velocities, tracers, buoyancy, diffusivities) =
     explicit_buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities)
 

From b7658c82c99e686d72a8bc7a71eef976259b2a80 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Tue, 30 May 2023 12:06:35 -0600
Subject: [PATCH 257/530] New defaults based on defaults in #3090

---
 .../CATKEVerticalDiffusivities.jl             |  7 +--
 .../mixing_length.jl                          | 37 ++++++-------
 .../turbulent_kinetic_energy_equation.jl      | 52 +++++++++----------
 3 files changed, 49 insertions(+), 47 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
index 738b14de14..064b08282a 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
@@ -272,9 +272,10 @@ end
 
         on_periphery = peripheral_node(i, j, k, grid, c, c, f)
         within_inactive = inactive_node(i, j, k, grid, c, c, f)
-        κᵘ★ = ifelse(on_periphery, zero(grid), ifelse(within_inactive, NaN, κᵘ★))
-        κᶜ★ = ifelse(on_periphery, zero(grid), ifelse(within_inactive, NaN, κᶜ★))
-        κᵉ★ = ifelse(on_periphery, zero(grid), ifelse(within_inactive, NaN, κᵉ★))
+        nan = convert(eltype(grid), NaN)
+        κᵘ★ = ifelse(on_periphery, zero(grid), ifelse(within_inactive, nan, κᵘ★))
+        κᶜ★ = ifelse(on_periphery, zero(grid), ifelse(within_inactive, nan, κᶜ★))
+        κᵉ★ = ifelse(on_periphery, zero(grid), ifelse(within_inactive, nan, κᵉ★))
 
         diffusivities.κᵘ[i, j, k] = κᵘ★
         diffusivities.κᶜ[i, j, k] = κᶜ★
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
index 94f1bfb9fa..521640ad62 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
@@ -13,21 +13,21 @@ using ..TurbulenceClosures:
 Contains mixing length parameters for CATKE vertical diffusivity.
 """
 Base.@kwdef struct MixingLength{FT}
-    Cˢ   :: FT = 0.36   # Surface distance coefficient for shear length scale
-    Cᵇ   :: FT = 0.1    # Bottom distance coefficient for shear length scale
-    Cᶜc  :: FT = 0.78   # Convective mixing length coefficient for tracers
-    Cᶜe  :: FT = 0.087  # Convective mixing length coefficient for TKE
-    Cᵉc  :: FT = 0.25   # Convective penetration mixing length coefficient for tracers
+    Cˢ   :: FT = 2.4    # Surface distance coefficient for shear length scale
+    Cᵇ   :: FT = Inf    # Bottom distance coefficient for shear length scale
+    Cᶜc  :: FT = 1.5    # Convective mixing length coefficient for tracers
+    Cᶜe  :: FT = 1.2    # Convective mixing length coefficient for TKE
+    Cᵉc  :: FT = 0.085  # Convective penetration mixing length coefficient for tracers
     Cᵉe  :: FT = 0.0    # Convective penetration mixing length coefficient for TKE
-    Cˢᵖ  :: FT = 0.072  # Sheared convective plume coefficient
-    Cˡᵒu :: FT = 1.1    # Shear mixing length coefficient for momentum at low Ri
-    Cʰⁱu :: FT = 0.28   # Shear mixing length coefficient for momentum at high Ri
-    Cˡᵒc :: FT = 1.9    # Shear mixing length coefficient for tracers at low Ri
-    Cʰⁱc :: FT = 0.22   # Shear mixing length coefficient for tracers at high Ri
-    Cˡᵒe :: FT = 0.71   # Shear mixing length coefficient for TKE at low Ri
-    Cʰⁱe :: FT = 3.5    # Shear mixing length coefficient for TKE at high Ri
-    CRiᵟ :: FT = 0.14   # Stability function width 
-    CRi⁰ :: FT = 0.13   # Stability function lower Ri
+    Cˢᵖ  :: FT = 0.14   # Sheared convective plume coefficient
+    Cˡᵒu :: FT = 0.19   # Shear mixing length coefficient for momentum at low Ri
+    Cʰⁱu :: FT = 0.086  # Shear mixing length coefficient for momentum at high Ri
+    Cˡᵒc :: FT = 0.2    # Shear mixing length coefficient for tracers at low Ri
+    Cʰⁱc :: FT = 0.045  # Shear mixing length coefficient for tracers at high Ri
+    Cˡᵒe :: FT = 1.9    # Shear mixing length coefficient for TKE at low Ri
+    Cʰⁱe :: FT = 0.57   # Shear mixing length coefficient for TKE at high Ri
+    CRiᵟ :: FT = 0.45   # Stability function width 
+    CRi⁰ :: FT = 0.47   # Stability function lower Ri
 end
 
 #####
@@ -67,14 +67,15 @@ end
 end
 
 @inline function stable_length_scaleᶜᶜᶠ(i, j, k, grid, closure, e, velocities, tracers, buoyancy)
-    ℓᴺ = stratification_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, e, tracers, buoyancy)
-
     Cˢ = closure.mixing_length.Cˢ
     Cᵇ = closure.mixing_length.Cᵇ
+
     d_up   = Cˢ * depthᶜᶜᶠ(i, j, k, grid)
     d_down = Cᵇ * height_above_bottomᶜᶜᶠ(i, j, k, grid)
     d = min(d_up, d_down)
 
+    ℓᴺ = stratification_mixing_lengthᶜᶜᶠ(i, j, k, grid, closure, e, tracers, buoyancy)
+
     ℓ = min(d, ℓᴺ)
     ℓ = ifelse(isnan(ℓ), d, ℓ)
 
@@ -82,14 +83,14 @@ end
 end
 
 @inline function stable_length_scaleᶜᶜᶜ(i, j, k, grid, closure, e, velocities, tracers, buoyancy)
-    ℓᴺ = stratification_mixing_lengthᶜᶜᶜ(i, j, k, grid, closure, e, tracers, buoyancy)
-
     Cˢ = closure.mixing_length.Cˢ
     Cᵇ = closure.mixing_length.Cᵇ
     d_up   = Cˢ * depthᶜᶜᶜ(i, j, k, grid)
     d_down = Cᵇ * height_above_bottomᶜᶜᶜ(i, j, k, grid)
     d = min(d_up, d_down)
 
+    ℓᴺ = stratification_mixing_lengthᶜᶜᶜ(i, j, k, grid, closure, e, tracers, buoyancy)
+
     ℓ = min(d, ℓᴺ)
     ℓ = ifelse(isnan(ℓ), d, ℓ)
 
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
index cab7cc955b..e17ef1508e 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
@@ -5,12 +5,12 @@ Parameters for the evolution of oceanic turbulent kinetic energy at the O(1 m) s
 isotropic turbulence and diapycnal mixing.
 """
 Base.@kwdef struct TurbulentKineticEnergyEquation{FT}
-    CˡᵒD  :: FT = 0.080 # Dissipation length scale shear coefficient for low Ri
-    CʰⁱD  :: FT = 1.3   # Dissipation length scale shear coefficient for high Ri
-    CᶜD   :: FT = 1.6   # Dissipation length scale convecting layer coefficient
+    CˡᵒD  :: FT = 1.1   # Dissipation length scale shear coefficient for low Ri
+    CʰⁱD  :: FT = 0.37  # Dissipation length scale shear coefficient for high Ri
+    CᶜD   :: FT = 0.88  # Dissipation length scale convecting layer coefficient
     CᵉD   :: FT = 0.0   # Dissipation length scale penetration layer coefficient
-    Cᵂu★  :: FT = 4.0   # Surface shear-driven TKE flux coefficient
-    CᵂwΔ  :: FT = 0.91  # Surface convective TKE flux coefficient
+    Cᵂu★  :: FT = 1.1   # Surface shear-driven TKE flux coefficient
+    CᵂwΔ  :: FT = 4.0   # Surface convective TKE flux coefficient
     Cᵂϵ   :: FT = 1.0   # Dissipative near-bottom TKE flux coefficient
 end
 
@@ -27,35 +27,35 @@ end
 
     # To reconstruct the shear production term "conservatively" (ie approximately corresponding
     # to dissipatation of mean kinetic energy):
-    κᵘ = diffusivities.κᵘ
-    return ℑxzᶜᵃᶜ(i, j, k, grid, ν_∂z_u²ᶠᶜᶠ, κᵘ, u) +
-           ℑyzᵃᶜᶜ(i, j, k, grid, ν_∂z_v²ᶜᶠᶠ, κᵘ, v)
+    # κᵘ = diffusivities.κᵘ
+    # return ℑxzᶜᵃᶜ(i, j, k, grid, ν_∂z_u²ᶠᶜᶠ, κᵘ, u) +
+    #        ℑyzᵃᶜᶜ(i, j, k, grid, ν_∂z_v²ᶜᶠᶠ, κᵘ, v)
 
     # Non-conservative reconstruction of shear production:
-    # closure = getclosure(i, j, closure)
-    # κᵘ = κuᶜᶜᶜ(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities.Qᵇ)
-    # S² = shearᶜᶜᶜ(i, j, k, grid, u, v)
-    # return κᵘ * S²
+    closure = getclosure(i, j, closure)
+    κᵘ = κuᶜᶜᶜ(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities.Qᵇ)
+    S² = shearᶜᶜᶜ(i, j, k, grid, u, v)
+    return κᵘ * S²
 end
 
 # To reconstruct buoyancy flux "conservatively" (ie approximately correpsonding to production/destruction
 # of mean potential energy):
-@inline function buoyancy_fluxᶜᶜᶠ(i, j, k, grid, tracers, buoyancy, diffusivities)
-    κᶜ = @inbounds diffusivities.κᶜ[i, j, k]
-    N² = ∂z_b(i, j, k, grid, buoyancy, tracers)
-    return - κᶜ * N²
-end
-
-@inline explicit_buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities) =
-    ℑzᵃᵃᶜ(i, j, k, grid, buoyancy_fluxᶜᶜᶠ, tracers, buoyancy, diffusivities)
-
-# Non-conservative reconstruction of buoyancy flux:
-# @inline function explicit_buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities)
-#     closure = getclosure(i, j, closure)
-#     κᶜ = κcᶜᶜᶜ(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities.Qᵇ)
-#     N² = ℑzᵃᵃᶜ(i, j, k, grid, ∂z_b, buoyancy, tracers)
+# @inline function buoyancy_fluxᶜᶜᶠ(i, j, k, grid, tracers, buoyancy, diffusivities)
+#     κᶜ = @inbounds diffusivities.κᶜ[i, j, k]
+#     N² = ∂z_b(i, j, k, grid, buoyancy, tracers)
 #     return - κᶜ * N²
 # end
+# 
+# @inline explicit_buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities) =
+#     ℑzᵃᵃᶜ(i, j, k, grid, buoyancy_fluxᶜᶜᶠ, tracers, buoyancy, diffusivities)
+
+# Non-conservative reconstruction of buoyancy flux:
+@inline function explicit_buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities)
+    closure = getclosure(i, j, closure)
+    κᶜ = κcᶜᶜᶜ(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities.Qᵇ)
+    N² = ℑzᵃᵃᶜ(i, j, k, grid, ∂z_b, buoyancy, tracers)
+    return - κᶜ * N²
+end
 
 @inline buoyancy_flux(i, j, k, grid, closure::FlavorOfCATKE, velocities, tracers, buoyancy, diffusivities) =
     explicit_buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities)

From f61999b96fde455a53eb288fe034f9fed58db9aa Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Tue, 30 May 2023 17:24:23 -0600
Subject: [PATCH 258/530] Fix shear production signature

---
 .../turbulent_kinetic_energy_equation.jl      | 23 ++++++++++---------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
index e17ef1508e..33215c98e5 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
@@ -35,6 +35,7 @@ end
     closure = getclosure(i, j, closure)
     κᵘ = κuᶜᶜᶜ(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities.Qᵇ)
     S² = shearᶜᶜᶜ(i, j, k, grid, u, v)
+
     return κᵘ * S²
 end
 
@@ -128,7 +129,7 @@ end
 @inline function dissipation(i, j, k, grid, closure::FlavorOfCATKE, velocities, tracers, args...)
     eᵢ = @inbounds tracers.e[i, j, k]
     ω = dissipation_rate(i, j, k, grid, closure, velocities, tracers, args...)
-    return - ω * eᵢ
+    return ω * eᵢ
 end
 
 #####
@@ -137,19 +138,19 @@ end
 
 # TODO: include shear production and buoyancy flux from AbstractScalarDiffusivity
 
-@inline shear_production(i, j, k, grid, closure, velocities, diffusivities) = zero(grid)
+@inline shear_production(i, j, k, grid, closure, U, C, B, K) = zero(grid)
 
-@inline shear_production(i, j, k, grid, closures::Tuple{<:Any}, velocities, diffusivities) =
-    shear_production(i, j, k, grid, closures[1], velocities, diffusivities[1])
+@inline shear_production(i, j, k, grid, closures::Tuple{<:Any}, U, C, B, K) =
+    shear_production(i, j, k, grid, closures[1], U, C, B, K[1])
 
-@inline shear_production(i, j, k, grid, closures::Tuple{<:Any, <:Any}, velocities, diffusivities) =
-    shear_production(i, j, k, grid, closures[1], velocities, diffusivities[1]) +
-    shear_production(i, j, k, grid, closures[2], velocities, diffusivities[2])
+@inline shear_production(i, j, k, grid, closures::Tuple{<:Any, <:Any}, U, C, B, K) =
+    shear_production(i, j, k, grid, closures[1], U, C, B, K[1]) +
+    shear_production(i, j, k, grid, closures[2], U, C, B, K[2])
 
-@inline shear_production(i, j, k, grid, closures::Tuple{<:Any, <:Any, <:Any}, velocities, diffusivities) =
-    shear_production(i, j, k, grid, closures[1], velocities, diffusivities[1]) +
-    shear_production(i, j, k, grid, closures[2], velocities, diffusivities[2]) +
-    shear_production(i, j, k, grid, closures[3], velocities, diffusivities[3])
+@inline shear_production(i, j, k, grid, closures::Tuple{<:Any, <:Any, <:Any}, U, C, B, K) = 
+    shear_production(i, j, k, grid, closures[1], U, C, B, K[1]) +
+    shear_production(i, j, k, grid, closures[2], U, C, B, K[2]) +
+    shear_production(i, j, k, grid, closures[3], U, C, B, K[3])
 
 @inline buoyancy_flux(i, j, k, grid, closure, velocities, tracers, buoyancy, diffusivities) = zero(grid)
 

From e9c68c3a234322a8cb59d787247b005c131c5260 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Tue, 30 May 2023 17:24:30 -0600
Subject: [PATCH 259/530] Bump convective entrainment parameter

---
 .../CATKEVerticalDiffusivities/mixing_length.jl                 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
index 521640ad62..445d9dd355 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/mixing_length.jl
@@ -17,7 +17,7 @@ Base.@kwdef struct MixingLength{FT}
     Cᵇ   :: FT = Inf    # Bottom distance coefficient for shear length scale
     Cᶜc  :: FT = 1.5    # Convective mixing length coefficient for tracers
     Cᶜe  :: FT = 1.2    # Convective mixing length coefficient for TKE
-    Cᵉc  :: FT = 0.085  # Convective penetration mixing length coefficient for tracers
+    Cᵉc  :: FT = 0.2    # Convective penetration mixing length coefficient for tracers
     Cᵉe  :: FT = 0.0    # Convective penetration mixing length coefficient for TKE
     Cˢᵖ  :: FT = 0.14   # Sheared convective plume coefficient
     Cˡᵒu :: FT = 0.19   # Shear mixing length coefficient for momentum at low Ri

From ac43759e45e9782289ca244b9a63a1ab3f2a6caf Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 1 Jun 2023 10:41:02 -0400
Subject: [PATCH 260/530] Manifest and Project

---
 Manifest.toml | 123 --------------------------------------------------
 Project.toml  |   7 ---
 2 files changed, 130 deletions(-)

diff --git a/Manifest.toml b/Manifest.toml
index 1556c4d038..952c7f92ae 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -2,25 +2,7 @@
 
 julia_version = "1.8.0"
 manifest_format = "2.0"
-<<<<<<< HEAD
-project_hash = "e5c066cd371cc92d479d4d0c34bc89f3323ab6b3"
-
-[[deps.AMGX]]
-deps = ["AMGX_jll", "CEnum", "CUDA", "JSON", "Libdl", "SparseArrays"]
-git-tree-sha1 = "4ce114680290d2989870c99db3a1ba9dd301634f"
-repo-rev = "vc/2.3"
-repo-url = "https://github.com/JuliaGPU/AMGX.jl.git"
-uuid = "c963dde9-0319-47f5-bf0c-b07d3c80ffa6"
-version = "0.2.0"
-
-[[deps.AMGX_jll]]
-deps = ["Artifacts", "CUDA_Runtime_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg", "TOML"]
-git-tree-sha1 = "9a9e64c4d2acee7b89286985eaa7489ac3e97328"
-uuid = "656d14af-56e4-5275-8e68-4e861d7b5043"
-version = "2.3.0+1"
-=======
 project_hash = "efa1b3c0f878c0fe77caba96f053e1145d126cd0"
->>>>>>> origin/main
 
 [[deps.AbstractFFTs]]
 deps = ["ChainRulesCore", "LinearAlgebra"]
@@ -33,15 +15,6 @@ deps = ["LinearAlgebra", "Requires"]
 git-tree-sha1 = "cc37d689f599e8df4f464b2fa3870ff7db7492ef"
 uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 version = "3.6.1"
-<<<<<<< HEAD
-
-[[deps.AlgebraicMultigrid]]
-deps = ["CommonSolve", "LinearAlgebra", "Printf", "Reexport", "SparseArrays"]
-git-tree-sha1 = "796eedcb42226861a51d92d28ee82d4985ee860b"
-uuid = "2169fc97-5a83-5252-b627-83903c6c433c"
-version = "0.5.1"
-=======
->>>>>>> origin/main
 
 [[deps.ArgTools]]
 uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
@@ -84,29 +57,6 @@ version = "0.1.2"
 
 [[deps.CUDA]]
 deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Preferences", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "UnsafeAtomicsLLVM"]
-<<<<<<< HEAD
-git-tree-sha1 = "6591ddc73adb429b9d97145c8197a0ac81664ab4"
-uuid = "052768ef-5323-5732-b1bb-66c8b64840ba"
-version = "4.1.3"
-
-[[deps.CUDA_Driver_jll]]
-deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"]
-git-tree-sha1 = "10ca2b63b496edc09258b3de5d1aa64094b18b1d"
-uuid = "4ee394cb-3365-5eb0-8335-949819d2adfc"
-version = "0.5.0+0"
-
-[[deps.CUDA_Runtime_Discovery]]
-deps = ["Libdl"]
-git-tree-sha1 = "6c8fceaaa6850dea627288ac3bb86fdcdf05e326"
-uuid = "1af6417a-86b4-443c-805f-a4643ffb695f"
-version = "0.2.0"
-
-[[deps.CUDA_Runtime_jll]]
-deps = ["Artifacts", "CUDA_Driver_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
-git-tree-sha1 = "802b1f2220fd43251d343219adf478e6b7992bd4"
-uuid = "76a88914-d11a-5bdc-97e0-2f5a05c973a2"
-version = "0.5.0+0"
-=======
 git-tree-sha1 = "280893f920654ebfaaaa1999fbd975689051f890"
 uuid = "052768ef-5323-5732-b1bb-66c8b64840ba"
 version = "4.2.0"
@@ -128,7 +78,6 @@ deps = ["Artifacts", "CUDA_Driver_jll", "JLLWrappers", "LazyArtifacts", "Libdl",
 git-tree-sha1 = "5248d9c45712e51e27ba9b30eebec65658c6ce29"
 uuid = "76a88914-d11a-5bdc-97e0-2f5a05c973a2"
 version = "0.6.0+0"
->>>>>>> origin/main
 
 [[deps.ChainRulesCore]]
 deps = ["Compat", "LinearAlgebra", "SparseArrays"]
@@ -137,17 +86,10 @@ uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 version = "1.15.7"
 
 [[deps.ChangesOfVariables]]
-<<<<<<< HEAD
-deps = ["ChainRulesCore", "LinearAlgebra", "Test"]
-git-tree-sha1 = "485193efd2176b88e6622a39a246f8c5b600e74e"
-uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
-version = "0.1.6"
-=======
 deps = ["LinearAlgebra", "Test"]
 git-tree-sha1 = "f84967c4497e0e1955f9a582c232b02847c5f589"
 uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
 version = "0.1.7"
->>>>>>> origin/main
 
 [[deps.CommonDataModel]]
 deps = ["CFTime", "DataStructures", "Dates", "Preferences", "Printf"]
@@ -197,15 +139,6 @@ version = "1.0.0"
 deps = ["Printf"]
 uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
 
-<<<<<<< HEAD
-[[deps.DiffRules]]
-deps = ["IrrationalConstants", "LogExpFunctions", "NaNMath", "Random", "SpecialFunctions"]
-git-tree-sha1 = "a4ad7ef19d2cdc2eff57abbbe68032b1cd0bd8f8"
-uuid = "b552c78f-8df3-52c6-915a-8e097449b14b"
-version = "1.13.0"
-
-=======
->>>>>>> origin/main
 [[deps.Distributed]]
 deps = ["Random", "Serialization", "Sockets"]
 uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
@@ -266,15 +199,9 @@ version = "0.1.4"
 
 [[deps.GPUCompiler]]
 deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"]
-<<<<<<< HEAD
-git-tree-sha1 = "590d394bad1055b798b2f9b308327ba871b7badf"
-uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
-version = "0.19.0"
-=======
 git-tree-sha1 = "e9a9173cd77e16509cdf9c1663fda19b22a518b7"
 uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
 version = "0.19.3"
->>>>>>> origin/main
 
 [[deps.Glob]]
 git-tree-sha1 = "97285bbd5230dd766e9ef6749b80fc617126d496"
@@ -349,17 +276,10 @@ uuid = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
 version = "1.12.0"
 
 [[deps.KernelAbstractions]]
-<<<<<<< HEAD
-deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "SnoopPrecompile", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"]
-git-tree-sha1 = "350a880e80004f4d5d82a17f737d8fcdc56c3462"
-uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
-version = "0.9.1"
-=======
 deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"]
 git-tree-sha1 = "47be64f040a7ece575c2b5f53ca6da7b548d69f4"
 uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 version = "0.9.4"
->>>>>>> origin/main
 
 [[deps.LLVM]]
 deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
@@ -444,15 +364,9 @@ version = "0.1.7"
 
 [[deps.MPItrampoline_jll]]
 deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "MPIPreferences", "TOML"]
-<<<<<<< HEAD
-git-tree-sha1 = "ad88f863a5a16b3e26d14446afd3cd746266281b"
-uuid = "f1f71cc9-e9ae-5b93-9b94-4fe0e1ad3748"
-version = "5.2.1+3"
-=======
 git-tree-sha1 = "b3dcf8e1c610a10458df3c62038c8cc3a4d6291d"
 uuid = "f1f71cc9-e9ae-5b93-9b94-4fe0e1ad3748"
 version = "5.3.0+0"
->>>>>>> origin/main
 
 [[deps.MacroTools]]
 deps = ["Markdown", "Random"]
@@ -483,23 +397,10 @@ uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
 version = "2022.2.1"
 
 [[deps.NCDatasets]]
-<<<<<<< HEAD
-deps = ["CFTime", "DataStructures", "Dates", "NetCDF_jll", "NetworkOptions", "Printf"]
-git-tree-sha1 = "fe130b7201b7fd908d950076dbfc0671270894c5"
-uuid = "85f8d34a-cbdd-5861-8df4-14fed0d494ab"
-version = "0.12.13"
-
-[[deps.NaNMath]]
-deps = ["OpenLibm_jll"]
-git-tree-sha1 = "0877504529a3e5c3343c6f8b4c0381e57e4387e4"
-uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3"
-version = "1.0.2"
-=======
 deps = ["CFTime", "CommonDataModel", "DataStructures", "Dates", "NetCDF_jll", "NetworkOptions", "Printf"]
 git-tree-sha1 = "afd015e81e60cfbdba04ef59bcdc80e18bd613cd"
 uuid = "85f8d34a-cbdd-5861-8df4-14fed0d494ab"
 version = "0.12.14"
->>>>>>> origin/main
 
 [[deps.NetCDF_jll]]
 deps = ["Artifacts", "HDF5_jll", "JLLWrappers", "LibCURL_jll", "Libdl", "Pkg", "XML2_jll", "Zlib_jll"]
@@ -558,15 +459,9 @@ version = "2.5.8"
 
 [[deps.PencilArrays]]
 deps = ["Adapt", "JSON3", "LinearAlgebra", "MPI", "OffsetArrays", "Random", "Reexport", "Requires", "StaticArrayInterface", "StaticArrays", "StaticPermutations", "Strided", "TimerOutputs", "VersionParsing"]
-<<<<<<< HEAD
-git-tree-sha1 = "0c6ebb4777158b8662288fb4fca255e404adc94b"
-uuid = "0e08944d-e94e-41b1-9406-dcf66b6a9d2e"
-version = "0.17.10"
-=======
 git-tree-sha1 = "9d017f3a875a9f22b0649c7d72671b1b25441179"
 uuid = "0e08944d-e94e-41b1-9406-dcf66b6a9d2e"
 version = "0.17.11"
->>>>>>> origin/main
 
 [[deps.PencilFFTs]]
 deps = ["AbstractFFTs", "FFTW", "LinearAlgebra", "MPI", "PencilArrays", "Reexport", "TimerOutputs"]
@@ -667,15 +562,9 @@ uuid = "6c6a2e73-6563-6170-7368-637461726353"
 version = "1.2.0"
 
 [[deps.SeawaterPolynomials]]
-<<<<<<< HEAD
-git-tree-sha1 = "20e6926c620cedee2b7551b61169dd118b4e34f2"
-uuid = "d496a93d-167e-4197-9f49-d3af4ff8fe40"
-version = "0.3.1"
-=======
 git-tree-sha1 = "958ba75b90c7c8a117d041d33184134201cf8c0f"
 uuid = "d496a93d-167e-4197-9f49-d3af4ff8fe40"
 version = "0.3.2"
->>>>>>> origin/main
 
 [[deps.Serialization]]
 uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
@@ -707,17 +596,6 @@ version = "0.8.6"
 
 [[deps.StaticArrayInterface]]
 deps = ["ArrayInterface", "Compat", "IfElse", "LinearAlgebra", "Requires", "SnoopPrecompile", "SparseArrays", "Static", "SuiteSparse"]
-<<<<<<< HEAD
-git-tree-sha1 = "fd5f417fd7e103c121b0a0b4a6902f03991111f4"
-uuid = "0d7ed370-da01-4f52-bd93-41d350b8b718"
-version = "1.3.0"
-
-[[deps.StaticArrays]]
-deps = ["LinearAlgebra", "Random", "StaticArraysCore", "Statistics"]
-git-tree-sha1 = "b8d897fe7fa688e93aef573711cb207c08c9e11e"
-uuid = "90137ffa-7385-5640-81b9-e52037218182"
-version = "1.5.19"
-=======
 git-tree-sha1 = "33040351d2403b84afce74dae2e22d3f5b18edcb"
 uuid = "0d7ed370-da01-4f52-bd93-41d350b8b718"
 version = "1.4.0"
@@ -727,7 +605,6 @@ deps = ["LinearAlgebra", "Random", "StaticArraysCore", "Statistics"]
 git-tree-sha1 = "c262c8e978048c2b095be1672c9bee55b4619521"
 uuid = "90137ffa-7385-5640-81b9-e52037218182"
 version = "1.5.24"
->>>>>>> origin/main
 
 [[deps.StaticArraysCore]]
 git-tree-sha1 = "6b7ba252635a5eff6a0b0664a41ee140a1c9e72a"
diff --git a/Project.toml b/Project.toml
index 5aa10f22f5..ab7cd9c53d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -35,15 +35,8 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StructArrays = "09ab397b-f2b6-538f-b94a-2f83cf4a842a"
 
 [compat]
-<<<<<<< HEAD
-AMGX = "0.1.3, 0.2"
-Adapt = "3"
-AlgebraicMultigrid = "0.5"
-CUDA = "3.8, 3.9, 4"
-=======
 Adapt = "3"
 CUDA = "4"
->>>>>>> origin/main
 Crayons = "4"
 CubedSphere = "0.1, 0.2"
 DocStringExtensions = "0.8, 0.9"

From 616883dc6d66c4bb43a7b90ace143003fdf73427 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 1 Jun 2023 10:45:16 -0400
Subject: [PATCH 261/530] bugfixes

---
 src/AbstractOperations/computed_field.jl      |  3 --
 ...ate_hydrostatic_free_surface_tendencies.jl | 32 -------------------
 .../CATKEVerticalDiffusivities.jl             | 28 ----------------
 .../turbulent_kinetic_energy_equation.jl      |  8 -----
 test/dependencies_for_poisson_solvers.jl      |  6 ----
 test/test_implicit_free_surface_solver.jl     |  4 ---
 6 files changed, 81 deletions(-)

diff --git a/src/AbstractOperations/computed_field.jl b/src/AbstractOperations/computed_field.jl
index 8f7da8f07c..f90cb11862 100644
--- a/src/AbstractOperations/computed_field.jl
+++ b/src/AbstractOperations/computed_field.jl
@@ -76,10 +76,7 @@ end
 function compute_computed_field!(comp)
     arch = architecture(comp)
     launch!(arch, comp.grid, size(comp), _compute!, comp.data, comp.operand, comp.indices)
-<<<<<<< HEAD
-=======
     return comp
->>>>>>> origin/main
 end
 
 """Compute an `operand` and store in `data`."""
diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index 3f9ff2c81c..899df8daaa 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -311,35 +311,3 @@ end
     @inbounds Gη[i′, j′, grid.Nz+1] = free_surface_tendency(i′, j′, grid, args...)
 end
 
-<<<<<<< HEAD
-#####
-##### Boundary condributions to hydrostatic free surface model
-#####
-
-function apply_flux_bcs!(Gcⁿ, c, arch, args...)
-    apply_x_bcs!(Gcⁿ, c, arch, args...)
-    apply_y_bcs!(Gcⁿ, c, arch, args...)
-    apply_z_bcs!(Gcⁿ, c, arch, args...)
-
-    return nothing
-end
-
-""" Apply boundary conditions by adding flux divergences to the right-hand-side. """
-function calculate_hydrostatic_boundary_tendency_contributions!(Gⁿ, arch, velocities, free_surface, tracers, args...)
-    # Velocity fields
-    for i in (:u, :v)
-        apply_flux_bcs!(Gⁿ[i], velocities[i], arch, args...)
-    end
-
-    # Free surface
-    apply_flux_bcs!(Gⁿ.η, displacement(free_surface), arch,  args...)
-
-    # Tracer fields
-    for i in propertynames(tracers)
-        apply_flux_bcs!(Gⁿ[i], tracers[i], arch, args...)
-    end
-
-    return nothing
-end
-=======
->>>>>>> origin/main
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
index 1cbbb810d6..0e3aabfda1 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
@@ -121,11 +121,7 @@ optimal_turbulent_kinetic_energy_equation(FT) = TurbulentKineticEnergyEquation(
 
 optimal_mixing_length(FT) = MixingLength(
     Cᵇ   = FT(0.37), 
-<<<<<<< HEAD
-    Cᶜc  = FT(1.0),
-=======
     Cᶜc  = FT(4.8),
->>>>>>> origin/main
     Cᶜe  = FT(1.1),
     Cᵉc  = FT(0.049),
     Cᵉe  = FT(0.0),
@@ -249,20 +245,13 @@ function calculate_diffusivities!(diffusivities, closure::FlavorOfCATKE, model;
     clock = model.clock
     top_tracer_bcs = NamedTuple(c => tracers[c].boundary_conditions.top for c in propertynames(tracers))
 
-<<<<<<< HEAD
     launch!(arch, grid, kernel_size,
             calculate_CATKE_diffusivities!,
             diffusivities, kernel_offsets, grid, closure, velocities, tracers, buoyancy, clock, top_tracer_bcs)
-=======
-    launch!(arch, grid, :xyz,
-            calculate_CATKE_diffusivities!,
-            diffusivities, grid, closure, velocities, tracers, buoyancy, clock, top_tracer_bcs)
->>>>>>> origin/main
 
     return nothing
 end
 
-<<<<<<< HEAD
 # extend κ kernel to compute also the boundaries
 @inline function κ_CATKE_kernel_size(grid) 
     Nx, Ny, Nz = size(grid)
@@ -293,10 +282,6 @@ end
     i = i′ + offs[1] 
     j = j′ + offs[2] 
     k = k′ + offs[3]
-=======
-@kernel function calculate_CATKE_diffusivities!(diffusivities, grid, closure::FlavorOfCATKE, velocities, tracers, buoyancy, clock, top_tracer_bcs)
-    i, j, k, = @index(Global, NTuple)
->>>>>>> origin/main
 
     # Ensure this works with "ensembles" of closures, in addition to ordinary single closures
     closure_ij = getclosure(i, j, closure)
@@ -317,20 +302,7 @@ end
         dissipative_buoyancy_flux = sign(wb) * sign(eⁱʲᵏ) < 0
         wb_e = ifelse(dissipative_buoyancy_flux, wb / eⁱʲᵏ, zero(grid))
         
-<<<<<<< HEAD
-        on_bottom = !inactive_cell(i, j, k, grid) & inactive_cell(i, j, k-1, grid)
-        # on_side = near_horizontal_boundary(i, j, k, grid)
-        Δz = Δzᶜᶜᶜ(i, j, k, grid)
-
-        Q_e = - 10.0 * turbulent_velocity(i, j, k, grid, closure_ij, tracers.e) / Δz * on_bottom
-
-        # Implicit TKE dissipation
-        ϵ_e = implicit_dissipation_coefficient(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, clock, top_tracer_bcs)
-
-        diffusivities.Lᵉ[i, j, k] = - wb_e + ϵ_e + Q_e
-=======
         diffusivities.Lᵉ[i, j, k] = - wb_e + implicit_dissipation_coefficient(i, j, k, grid, closure_ij, velocities, tracers, buoyancy, clock, top_tracer_bcs)
->>>>>>> origin/main
     end
 end
 
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
index fcd03dc518..6768d51477 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/turbulent_kinetic_energy_equation.jl
@@ -98,16 +98,8 @@ end
     #   and thus    L = - Cᴰ √e / ℓ .
 
     τ = closure.negative_turbulent_kinetic_energy_damping_time_scale
-<<<<<<< HEAD
-    e_max = 10.0
-
-    e_limiter = max(one(grid), eᵢ / e_max)
-
-    return ifelse(eᵢ < 0, -1/τ, - sqrt(abs(eᵢ)) / ℓᴰ * e_limiter)
-=======
 
     return ifelse(eᵢ < 0, -1/τ, -sqrt(abs(eᵢ)) / ℓᴰ)
->>>>>>> origin/main
 end
 
 # Fallbacks for explicit time discretization
diff --git a/test/dependencies_for_poisson_solvers.jl b/test/dependencies_for_poisson_solvers.jl
index 78cf0376df..af4f800687 100644
--- a/test/dependencies_for_poisson_solvers.jl
+++ b/test/dependencies_for_poisson_solvers.jl
@@ -59,14 +59,8 @@ function random_divergence_free_source_term(grid)
     arch = architecture(grid)
     fill_halo_regions!((Ru, Rv, Rw))
 
-<<<<<<< HEAD
     compute_w_from_continuity!(U, arch, grid)
     fill_halo_regions!(Rw)
-=======
-    launch!(arch, grid, :xy, _compute_w_from_continuity!, U, grid)
-
-    fill_halo_regions!(Rw, nothing, nothing)
->>>>>>> origin/main
 
     # Compute the right hand side R = ∇⋅U
     ArrayType = array_type(arch)
diff --git a/test/test_implicit_free_surface_solver.jl b/test/test_implicit_free_surface_solver.jl
index 685d80f7ff..e5f32ff3af 100644
--- a/test/test_implicit_free_surface_solver.jl
+++ b/test/test_implicit_free_surface_solver.jl
@@ -161,10 +161,6 @@ end
         @test fft_model.free_surface.implicit_step_solver isa FFTImplicitFreeSurfaceSolver
         @test pcg_model.free_surface.implicit_step_solver isa PCGImplicitFreeSurfaceSolver
         @test mat_model.free_surface.implicit_step_solver isa MatrixImplicitFreeSurfaceSolver
-<<<<<<< HEAD
-        @test  mg_model.free_surface.implicit_step_solver isa MGImplicitFreeSurfaceSolver
-=======
->>>>>>> origin/main
 
         Δt₁ = 900
         Δt₂ = 920.0

From a50a5e6294061fdec51bd7650c414a39a2397962 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 1 Jun 2023 10:45:59 -0400
Subject: [PATCH 262/530] bugfix

---
 src/Solvers/batched_tridiagonal_solver.jl | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/Solvers/batched_tridiagonal_solver.jl b/src/Solvers/batched_tridiagonal_solver.jl
index 1f5c9e8260..19f228db5f 100644
--- a/src/Solvers/batched_tridiagonal_solver.jl
+++ b/src/Solvers/batched_tridiagonal_solver.jl
@@ -200,6 +200,4 @@ end
             ϕ[i, j, k] -= t[i, j, k+1] * ϕ[i, j, k+1]
         end
     end
-
-    return nothing
 end
\ No newline at end of file

From f2b38c287c2c418972c39a76d84727c686e73b11 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 1 Jun 2023 10:47:25 -0400
Subject: [PATCH 263/530] another bugfix

---
 .../calculate_hydrostatic_free_surface_tendencies.jl      | 2 +-
 .../calculate_nonhydrostatic_tendencies.jl                | 8 +++++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index 899df8daaa..e258f022ca 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -7,7 +7,7 @@ using Oceananigans.Grids: halo_size
 using Oceananigans: fields, prognostic_fields, TendencyCallsite, UpdateStateCallsite
 using Oceananigans.Biogeochemistry: update_tendencies!
 
-import Oceananigans.TimeSteppers: calculate_tendencies!
+import Oceananigans.TimeSteppers: compute_tendencies!
 import Oceananigans: tracer_tendency_kernel_function
 
 import Oceananigans.Distributed: complete_communication_and_compute_boundary
diff --git a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
index 5223405d79..abb3d00d7b 100644
--- a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
+++ b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
@@ -4,7 +4,7 @@ using Oceananigans.Utils: work_layout
 
 using Oceananigans.ImmersedBoundaries: use_only_active_interior_cells, ActiveCellsIBG, active_linear_index_to_interior_tuple
 
-import Oceananigans.TimeSteppers: calculate_tendencies!
+import Oceananigans.TimeSteppers: compute_tendencies!
 
 """
     compute_tendencies!(model::NonhydrostaticModel)
@@ -186,9 +186,15 @@ end
 function calculate_boundary_tendency_contributions!(Gⁿ, arch, velocities, tracers, clock, model_fields)
     fields = merge(velocities, tracers)
 
+<<<<<<< HEAD
+    foreach(i->apply_x_bcs!(Gⁿ[i], fields[i], arch, clock, model_fields), 1:length(fields))
+    foreach(i->apply_y_bcs!(Gⁿ[i], fields[i], arch, clock, model_fields), 1:length(fields))
+    foreach(i->apply_z_bcs!(Gⁿ[i], fields[i], arch, clock, model_fields), 1:length(fields))
+=======
     foreach(i -> apply_x_bcs!(Gⁿ[i], fields[i], arch, clock, model_fields), 1:length(fields))
     foreach(i -> apply_y_bcs!(Gⁿ[i], fields[i], arch, clock, model_fields), 1:length(fields))
     foreach(i -> apply_z_bcs!(Gⁿ[i], fields[i], arch, clock, model_fields), 1:length(fields))
+>>>>>>> origin/main
                          
     return nothing
 end

From 1aae11ac018af4b327e643ff1c35bebfd0f6f916 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 1 Jun 2023 10:49:01 -0400
Subject: [PATCH 264/530] more bugfixes

---
 .../calculate_nonhydrostatic_tendencies.jl                  | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
index abb3d00d7b..7c38ac5097 100644
--- a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
+++ b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
@@ -186,15 +186,9 @@ end
 function calculate_boundary_tendency_contributions!(Gⁿ, arch, velocities, tracers, clock, model_fields)
     fields = merge(velocities, tracers)
 
-<<<<<<< HEAD
-    foreach(i->apply_x_bcs!(Gⁿ[i], fields[i], arch, clock, model_fields), 1:length(fields))
-    foreach(i->apply_y_bcs!(Gⁿ[i], fields[i], arch, clock, model_fields), 1:length(fields))
-    foreach(i->apply_z_bcs!(Gⁿ[i], fields[i], arch, clock, model_fields), 1:length(fields))
-=======
     foreach(i -> apply_x_bcs!(Gⁿ[i], fields[i], arch, clock, model_fields), 1:length(fields))
     foreach(i -> apply_y_bcs!(Gⁿ[i], fields[i], arch, clock, model_fields), 1:length(fields))
     foreach(i -> apply_z_bcs!(Gⁿ[i], fields[i], arch, clock, model_fields), 1:length(fields))
->>>>>>> origin/main
                          
     return nothing
 end

From 27ed966f80c993e24c84414ff7e41c56fc6e48bd Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 1 Jun 2023 11:20:35 -0400
Subject: [PATCH 265/530] bugfix

---
 .../recompute_boundary_tendencies.jl          | 44 +++++++++----------
 1 file changed, 21 insertions(+), 23 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
index b0a19e5cdc..51a43d4390 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
@@ -33,10 +33,10 @@ function compute_boundary_tendencies!(model::HydrostaticFreeSurfaceModel)
     
     for (kernel_size, kernel_offsets) in zip(sizes, offsets)
         launch!(arch, grid, kernel_size,
-                calculate_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, kernel_offsets, u_kernel_args...)
+                calculate_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, kernel_offsets, u_kernel_args)
     
         launch!(arch, grid, kernel_size,
-                calculate_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, kernel_offsets, v_kernel_args...)
+                calculate_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, kernel_offsets, v_kernel_args)
         
         launch!(arch, grid, kernel_size[1:2],
                 calculate_hydrostatic_free_surface_Gη!, model.timestepper.Gⁿ.η, kernel_offsets[1:2],
@@ -52,29 +52,27 @@ function compute_boundary_tendencies!(model::HydrostaticFreeSurfaceModel)
         @inbounds c_forcing = model.forcing[tracer_name]
         @inbounds c_immersed_bc = immersed_boundary_condition(model.tracers[tracer_name])
 
-        c_kernel_function, closure, diffusivity_fields = tracer_tendency_kernel_function(model,
-                                                                                         Val(tracer_name),
-                                                                                         model.closure,
-                                                                                         model.diffusivity_fields)
-
-        args = (c_kernel_function,
-                grid,
-                Val(tracer_index),
-                c_advection,
-                closure,
-                c_immersed_bc,
-                model.buoyancy,
-                model.velocities,
-                model.free_surface,
-                model.tracers,
-                top_tracer_bcs,
-                diffusivity_fields,
-                model.auxiliary_fields,
-                c_forcing,
-                model.clock)
+        tendency_kernel!, closure, diffusivity = tracer_tendency_kernel_function(model, Val(tracer_name), model.closure, model.diffusivity_fields)
+
+        args = tuple(Val(tracer_index),
+                     Val(tracer_name),
+                     c_advection,
+                     closure,
+                     c_immersed_bc,
+                     model.buoyancy,
+                     model.biogeochemistry,
+                     model.velocities,
+                     model.free_surface,
+                     model.tracers,
+                     top_tracer_bcs,
+                     diffusivity,
+                     model.auxiliary_fields,
+                     c_forcing,
+                     model.clock)
 
         for (kernel_size, kernel_offsets) in zip(sizes, offsets)
-            launch!(arch, grid, kernel_size, calculate_hydrostatic_free_surface_Gc!, c_tendency, kernel_offsets, args...)
+            launch!(arch, grid, kernel_size,
+                    tendency_kernel!, c_tendency, kernel_offsets, grid, args)
         end
     end
 end

From e8d1611793bc9fcfecf26a9fa6471bcf8d1357a3 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 1 Jun 2023 12:01:42 -0400
Subject: [PATCH 266/530] first bugfix

---
 .../NonhydrostaticModels/update_hydrostatic_pressure.jl     | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
index 22085f3308..3e2c87879e 100644
--- a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
+++ b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
@@ -8,9 +8,9 @@ the `buoyancy_perturbationᶜᶜᶜ` downwards:
     `pHY′ = ∫ buoyancy_perturbationᶜᶜᶜ dz` from `z=0` down to `z=-Lz`
 """
 @kernel function _update_hydrostatic_pressure!(pHY′, offs, grid, buoyancy, C)
-    i, j = @index(Global, NTuple)
-    i′ = i + offs[1] 
-    j′ = j + offs[2] 
+    i′, j′ = @index(Global, NTuple)
+    i = i′ + offs[1] 
+    j = j′ + offs[2] 
 
     @inbounds pHY′[i, j, grid.Nz] = - z_dot_g_bᶜᶜᶠ(i, j, grid.Nz+1, grid, buoyancy, C) * Δzᶜᶜᶠ(i, j, grid.Nz+1, grid)
 

From 0c8406ea92fb91783decc8f97f66e2c043f80a31 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 1 Jun 2023 12:28:13 -0400
Subject: [PATCH 267/530] second bugfix

---
 .../calculate_hydrostatic_free_surface_tendencies.jl          | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index e258f022ca..1f0f4696da 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -200,11 +200,11 @@ function calculate_hydrostatic_momentum_tendencies!(model, velocities)
     kernel_offsets = interior_tendency_kernel_offsets(grid)
     
     launch!(arch, grid, kernel_size,
-            calculate_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, kernel_offsets, u_kernel_args;
+            calculate_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, kernel_offsets, grid, u_kernel_args;
             only_active_cells)
 
     launch!(arch, grid, kernel_size,
-            calculate_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, kernel_offsets, v_kernel_args;
+            calculate_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, kernel_offsets, grid, v_kernel_args;
             only_active_cells)
 
     calculate_free_surface_tendency!(grid, model)

From d219b8367eddb9adffc7753b4b977ed5ad9da613 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 1 Jun 2023 12:48:12 -0400
Subject: [PATCH 268/530] third bugfix

---
 .../calculate_hydrostatic_free_surface_tendencies.jl            | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index 1f0f4696da..ccc3fb3483 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -94,7 +94,7 @@ function calculate_hydrostatic_free_surface_interior_tendency_contributions!(mod
     calculate_hydrostatic_momentum_tendencies!(model, model.velocities)
 
     top_tracer_bcs = top_tracer_boundary_conditions(grid, model.tracers)
-    only_active_cells = use_only_active_cells(grid)
+    only_active_cells = use_only_active_interior_cells(grid)
 
     kernel_size    =   interior_tendency_kernel_size(grid)
     kernel_offsets = interior_tendency_kernel_offsets(grid)

From 80c7e5a35592aa688c78b1f21dfcb189f4713085 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 1 Jun 2023 13:21:42 -0400
Subject: [PATCH 269/530] fourth bugfix

---
 .../HydrostaticFreeSurfaceModels/single_column_model_mode.jl    | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/single_column_model_mode.jl b/src/Models/HydrostaticFreeSurfaceModels/single_column_model_mode.jl
index de2bbccedb..f2f9ef4286 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/single_column_model_mode.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/single_column_model_mode.jl
@@ -42,6 +42,8 @@ validate_momentum_advection(momentum_advection, ::SingleColumnGrid) = nothing
 validate_tracer_advection(tracer_advection::AbstractAdvectionScheme, ::SingleColumnGrid) = nothing, NamedTuple()
 validate_tracer_advection(tracer_advection::Nothing, ::SingleColumnGrid) = nothing, NamedTuple()
 
+compute_w_from_continuity!(velocities, arch, ::SingleColumnGrid; kwargs...) = nothing
+
 #####
 ##### Time-step optimizations
 #####

From 0e5fcea12345511682ddeb90b47b7de37572dcfe Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 1 Jun 2023 13:48:42 -0400
Subject: [PATCH 270/530] bugfix five and six (and cleaning up halos)

---
 src/Distributed/distributed_fields.jl                         | 2 +-
 src/Distributed/halo_communication.jl                         | 4 +---
 src/Distributed/multi_architectures.jl                        | 2 +-
 .../HydrostaticFreeSurfaceModels/single_column_model_mode.jl  | 1 +
 4 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/Distributed/distributed_fields.jl b/src/Distributed/distributed_fields.jl
index 1b365ac95b..301337d28a 100644
--- a/src/Distributed/distributed_fields.jl
+++ b/src/Distributed/distributed_fields.jl
@@ -7,9 +7,9 @@ using Oceananigans.Fields: validate_field_data, validate_boundary_conditions, va
 
 function Field((LX, LY, LZ)::Tuple, grid::DistributedGrid, data, old_bcs, indices::Tuple, op, status)
     arch = architecture(grid)
+    indices = validate_indices(indices, (LX, LY, LZ), grid)
     validate_field_data((LX, LY, LZ), data, grid, indices)
     validate_boundary_conditions((LX, LY, LZ), grid, old_bcs)
-    indices = validate_indices(indices, (LX, LY, LZ), grid)
     new_bcs = inject_halo_communication_boundary_conditions(old_bcs, arch.local_rank, arch.connectivity)
     buffers = FieldBoundaryBuffers(grid, data, new_bcs)
 
diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 95098ab9a0..5d2b887ff5 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -200,7 +200,6 @@ end
 
 @inline mpi_communication_side(::Val{fill_west_and_east_halo!})   = :west_and_east
 @inline mpi_communication_side(::Val{fill_south_and_north_halo!}) = :south_and_north
-@inline mpi_communication_side(::Val{fill_bottom_and_top_halo!})  = :bottom_and_top
 
 cooperative_wait(req::MPI.Request)            = MPI.Waitall(req)
 cooperative_waitall!(req::Array{MPI.Request}) = MPI.Waitall(req)
@@ -242,10 +241,9 @@ end
 #####
 ##### fill_west_and_east_halo!   }
 ##### fill_south_and_north_halo! } for when both halos are communicative (Single communicating halos are to be implemented)
-##### fill_bottom_and_top_halo!  }
 #####
 
-for (side, opposite_side, dir) in zip([:west, :south, :bottom], [:east, :north, :top], [1, 2, 3])
+for (side, opposite_side, dir) in zip([:west, :south], [:east, :north], [1, 2])
     fill_both_halo! = Symbol("fill_$(side)_and_$(opposite_side)_halo!")
     fill_side_halo! = Symbol("fill_$(side)_halo!")
     send_side_halo  = Symbol("send_$(side)_halo")
diff --git a/src/Distributed/multi_architectures.jl b/src/Distributed/multi_architectures.jl
index 256638efa2..9762611dd2 100644
--- a/src/Distributed/multi_architectures.jl
+++ b/src/Distributed/multi_architectures.jl
@@ -62,7 +62,7 @@ Keyword arguments
 function DistributedArch(child_architecture = CPU(); 
                    topology = (Periodic, Periodic, Periodic), 
                    ranks,
-                   use_buffers = false,
+                   use_buffers = true,
                    devices = nothing, 
                    enable_overlapped_computation = true,
                    communicator = MPI.COMM_WORLD)
diff --git a/src/Models/HydrostaticFreeSurfaceModels/single_column_model_mode.jl b/src/Models/HydrostaticFreeSurfaceModels/single_column_model_mode.jl
index f2f9ef4286..1e2fcd9127 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/single_column_model_mode.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/single_column_model_mode.jl
@@ -43,6 +43,7 @@ validate_tracer_advection(tracer_advection::AbstractAdvectionScheme, ::SingleCol
 validate_tracer_advection(tracer_advection::Nothing, ::SingleColumnGrid) = nothing, NamedTuple()
 
 compute_w_from_continuity!(velocities, arch, ::SingleColumnGrid; kwargs...) = nothing
+compute_w_from_continuity!(::PrescribedVelocityFields, arch, ::SingleColumnGrid; kwargs...) = nothing
 
 #####
 ##### Time-step optimizations

From d2dcbcb7f90cc91769bc4cc54a5cf117738e4bbc Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 1 Jun 2023 13:50:11 -0400
Subject: [PATCH 271/530] seventh bugfix

---
 .../turbulence_closure_implementations/nothing_closure.jl     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/nothing_closure.jl b/src/TurbulenceClosures/turbulence_closure_implementations/nothing_closure.jl
index 0cda1ee731..ea98c72f45 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/nothing_closure.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/nothing_closure.jl
@@ -3,8 +3,8 @@
 @inline ∂ⱼ_τ₂ⱼ(i, j, k, grid::AbstractGrid{FT}, ::Nothing, args...) where FT = zero(FT)
 @inline ∂ⱼ_τ₃ⱼ(i, j, k, grid::AbstractGrid{FT}, ::Nothing, args...) where FT = zero(FT)
 
-calculate_diffusivities!(diffusivities, ::Nothing, args...) = nothing
-calculate_diffusivities!(::Nothing, ::Nothing, args...) = nothing
+calculate_diffusivities!(diffusivities, ::Nothing, args...; kwargs...) = nothing
+calculate_diffusivities!(::Nothing, ::Nothing, args...; kwargs...) = nothing
 
 @inline viscosity(::Nothing, ::Nothing) = 0
 @inline diffusivity(::Nothing, ::Nothing, ::Val{id}) where id = 0

From a2f6de3844ed6c6344fcdaeba965303ae29262af Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 1 Jun 2023 13:53:01 -0400
Subject: [PATCH 272/530] eighth bugfix

---
 .../recompute_boundary_tendencies.jl                       | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
index 51a43d4390..7284d55577 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
@@ -13,8 +13,7 @@ function compute_boundary_tendencies!(model::HydrostaticFreeSurfaceModel)
     u_immersed_bc = immersed_boundary_condition(model.velocities.u)
     v_immersed_bc = immersed_boundary_condition(model.velocities.v)
 
-    start_momentum_kernel_args = (grid,
-                                  model.advection.momentum,
+    start_momentum_kernel_args = (model.advection.momentum,
                                   model.coriolis,
                                   model.closure)
 
@@ -33,10 +32,10 @@ function compute_boundary_tendencies!(model::HydrostaticFreeSurfaceModel)
     
     for (kernel_size, kernel_offsets) in zip(sizes, offsets)
         launch!(arch, grid, kernel_size,
-                calculate_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, kernel_offsets, u_kernel_args)
+                calculate_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, kernel_offsets, grid, u_kernel_args)
     
         launch!(arch, grid, kernel_size,
-                calculate_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, kernel_offsets, v_kernel_args)
+                calculate_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, kernel_offsets, grid, v_kernel_args)
         
         launch!(arch, grid, kernel_size[1:2],
                 calculate_hydrostatic_free_surface_Gη!, model.timestepper.Gⁿ.η, kernel_offsets[1:2],

From 21ede7f5d531a5c41b1ba7b07b3384fa50828178 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 1 Jun 2023 14:09:27 -0400
Subject: [PATCH 273/530] eighth bugfix

---
 .../barotropic_pressure_correction.jl                    | 4 ++--
 .../calculate_hydrostatic_free_surface_tendencies.jl     | 9 ++++-----
 .../recompute_boundary_tendencies.jl                     | 7 ++-----
 .../mpi_hydrostatic_turbulence.jl                        | 1 -
 4 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/barotropic_pressure_correction.jl b/src/Models/HydrostaticFreeSurfaceModels/barotropic_pressure_correction.jl
index 82eb903c59..c0f0711f13 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/barotropic_pressure_correction.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/barotropic_pressure_correction.jl
@@ -30,8 +30,8 @@ function pressure_correct_velocities!(model::ImplicitFreeSurfaceHFSM, Δt)
     return nothing
 end
 
-calculate_free_surface_tendency!(grid, model::ImplicitFreeSurfaceHFSM     ) = nothing
-calculate_free_surface_tendency!(grid, model::SplitExplicitFreeSurfaceHFSM) = nothing
+calculate_free_surface_tendency!(grid, model::ImplicitFreeSurfaceHFSM     , args...) = nothing
+calculate_free_surface_tendency!(grid, model::SplitExplicitFreeSurfaceHFSM, args...) = nothing
 
 function pressure_correct_velocities!(model::SplitExplicitFreeSurfaceHFSM, Δt)
     u, v, _ = model.velocities
diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index ccc3fb3483..ba2ccfaaee 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -147,7 +147,7 @@ function apply_flux_bcs!(Gcⁿ, c, arch, args...)
     return nothing
 end
 
-function calculate_free_surface_tendency!(grid, model)
+function calculate_free_surface_tendency!(grid, model, kernel_size, kernel_offsets)
 
     arch = architecture(grid)
 
@@ -158,8 +158,8 @@ function calculate_free_surface_tendency!(grid, model)
                  model.forcing,
                  model.clock)
 
-    launch!(arch, grid, :xy,
-            calculate_hydrostatic_free_surface_Gη!, model.timestepper.Gⁿ.η, (0, 0), 
+    launch!(arch, grid, kernel_size,
+            calculate_hydrostatic_free_surface_Gη!, model.timestepper.Gⁿ.η, kernel_offsets, 
             grid, args)
 
     return nothing
@@ -207,7 +207,7 @@ function calculate_hydrostatic_momentum_tendencies!(model, velocities)
             calculate_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, kernel_offsets, grid, v_kernel_args;
             only_active_cells)
 
-    calculate_free_surface_tendency!(grid, model)
+    calculate_free_surface_tendency!(grid, model, :xy, (0, 0))
 
     return nothing
 end
@@ -310,4 +310,3 @@ end
     j′ = j + offs[2]
     @inbounds Gη[i′, j′, grid.Nz+1] = free_surface_tendency(i′, j′, grid, args...)
 end
-
diff --git a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
index 7284d55577..f6cad5ca07 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
@@ -29,7 +29,7 @@ function compute_boundary_tendencies!(model::HydrostaticFreeSurfaceModel)
 
     u_kernel_args = tuple(start_momentum_kernel_args..., u_immersed_bc, end_momentum_kernel_args...)
     v_kernel_args = tuple(start_momentum_kernel_args..., v_immersed_bc, end_momentum_kernel_args...)
-    
+
     for (kernel_size, kernel_offsets) in zip(sizes, offsets)
         launch!(arch, grid, kernel_size,
                 calculate_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, kernel_offsets, grid, u_kernel_args)
@@ -37,10 +37,7 @@ function compute_boundary_tendencies!(model::HydrostaticFreeSurfaceModel)
         launch!(arch, grid, kernel_size,
                 calculate_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, kernel_offsets, grid, v_kernel_args)
         
-        launch!(arch, grid, kernel_size[1:2],
-                calculate_hydrostatic_free_surface_Gη!, model.timestepper.Gⁿ.η, kernel_offsets[1:2],
-                grid, model.velocities, model.free_surface, model.tracers, model.auxiliary_fields, model.forcing,
-                model.clock)
+        calculate_free_surface_tendency!(grid, model, kernel_size[1:2], kernel_offsets[1:2])
     end
 
     top_tracer_bcs = top_tracer_boundary_conditions(grid, model.tracers)
diff --git a/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl b/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl
index d7c8ea568b..ae59448e8b 100644
--- a/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl
+++ b/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl
@@ -58,7 +58,6 @@ run!(simulation)
 
 if rank == 0
     using Printf
-    using NCDatasets
     using GLMakie
 
     iter = Observable(1)

From 5fe4dd216bb6c919bcdf85ab51e2d73e9296b0db Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Mon, 5 Jun 2023 16:26:51 -0600
Subject: [PATCH 274/530] Change heterogeneous windy convection to an upwelling
 example

---
 .../heterogeneous_windy_convection.jl         | 147 +++++++++---------
 1 file changed, 76 insertions(+), 71 deletions(-)

diff --git a/validation/vertical_mixing_closures/heterogeneous_windy_convection.jl b/validation/vertical_mixing_closures/heterogeneous_windy_convection.jl
index 140f9a26e1..3dd09c935b 100644
--- a/validation/vertical_mixing_closures/heterogeneous_windy_convection.jl
+++ b/validation/vertical_mixing_closures/heterogeneous_windy_convection.jl
@@ -9,16 +9,16 @@ using Oceananigans.ImmersedBoundaries: GridFittedBottom, PartialCellBottom
 
 import Oceananigans.TurbulenceClosures.CATKEVerticalDiffusivities
 
-Nx = 1
-Ny = 200
+Nx = 50
+Ny = 1
 
-const Lx = 1000kilometers
+const Lx = 500kilometers
 const Ly = Lx
 const Lz = 1000
 
 # Stretched vertical grid
 γ = 1.01
-Δz₀ = 8
+Δz₀ = 16
 h₀ = 128
 z = [-Δz₀ * k for k = 0:ceil(h₀ / Δz₀)]
 while z[end] > -Lz
@@ -29,52 +29,52 @@ Nz = length(z) - 1
 
 grid = RectilinearGrid(size = (Nx, Ny, Nz),
                        halo = (4, 4, 4),
-                       x = (0, Lx),
-                       y = (-Ly/2, Ly/2),
+                       x = (-Lx, 0),
+                       y = (0, Ly),
                        z = z,
-                       topology=(Periodic, Bounded, Bounded))
+                       topology=(Bounded, Periodic, Bounded))
 
-z_bottom(x, y) = - Lz * (1 - (2y / Ly)^2)
-grid = ImmersedBoundaryGrid(grid, PartialCellBottom(z_bottom, minimum_fractional_cell_height=0.1))
+z_bottom(x, y) = - 2 * abs(x) * Lz / Lx
+#grid = ImmersedBoundaryGrid(grid, PartialCellBottom(z_bottom, minimum_fractional_cell_height=0.2))
+grid = ImmersedBoundaryGrid(grid, GridFittedBottom(z_bottom))
 
 @show grid
-@inline Qᵇ(x, y, t) = 1e-7
-@inline Qᵘ(x, y, t) = -1e-3 * cos(π * y / Ly)
+@inline Qᵇ(x, y, t) = 0.0 #1e-7
+@inline Qᵘ(x, y, t) = 0.0
+@inline Qᵛ(x, y, t, p) = + 1e-4 * exp(-x^2 / (2 * p.δx^2))
 
 b_top_bc = FluxBoundaryCondition(Qᵇ)
 u_top_bc = FluxBoundaryCondition(Qᵘ)
+v_top_bc = FluxBoundaryCondition(Qᵛ, parameters=(; δx=200kilometers))
 
 b_bcs = FieldBoundaryConditions(top=b_top_bc)
 u_bcs = FieldBoundaryConditions(top=u_top_bc)
+v_bcs = FieldBoundaryConditions(top=v_top_bc)
 
-vertical_mixing = CATKEVerticalDiffusivity(; minimum_turbulent_kinetic_energy=1e-9)
+vertical_mixing = CATKEVerticalDiffusivity(; minimum_turbulent_kinetic_energy=1e-6)
 #vertical_mixing = RiBasedVerticalDiffusivity()
+#
+horizontal_viscosity = HorizontalScalarDiffusivity(ν=1e4)
 
 @show vertical_mixing
+#closure = (vertical_mixing, horizontal_viscosity)
+closure = vertical_mixing
 
-Δy = Ly / Ny
-ν₄ = Δy^4 / 1hours
-hyperviscosity = HorizontalScalarBiharmonicDiffusivity(ν=ν₄)
-
-#closure = vertical_mixing
-closure = (vertical_mixing, hyperviscosity)
-
-filename = "heterogeneous_cooling_with_hyperviscosity.jld2"
-#filename = "heterogeneous_cooling.jld2"
+filename = "heterogeneous_cooling.jld2"
 
 model = HydrostaticFreeSurfaceModel(; grid, closure,
                                     momentum_advection = WENO(),
                                     tracer_advection = WENO(),
-                                    coriolis = FPlane(f=1e-4),
+                                    coriolis = FPlane(latitude=+33),
                                     tracers = (:b, :e),
-                                    boundary_conditions = (; b=b_bcs, u=u_bcs),
+                                    boundary_conditions = (; b=b_bcs, u=u_bcs, v=v_bcs),
                                     buoyancy = BuoyancyTracer())
 
 N²ᵢ = 1e-5
 bᵢ(x, y, z) = N²ᵢ * z
-set!(model, b=bᵢ, e=1e-9)
+set!(model, b=bᵢ, e=1e-6)
 
-simulation = Simulation(model, Δt=5minute, stop_time=10days)
+simulation = Simulation(model, Δt=10minute, stop_iteration=1000)
 
 κᶜ = if model.closure isa Tuple
     model.diffusivity_fields[1].κᶜ
@@ -88,7 +88,8 @@ outputs = (; model.velocities..., model.tracers..., κᶜ=κᶜ, N²=N²)
 
 simulation.output_writers[:fields] = JLD2OutputWriter(model, outputs;
                                                       filename,
-                                                      schedule = TimeInterval(1hour),
+                                                      #schedule = TimeInterval(1hour),
+                                                      schedule = IterationInterval(10),
                                                       overwrite_existing = true)
 
 function progress(sim)
@@ -128,12 +129,11 @@ end
 
 fig = Figure(resolution=(1600, 800))
 
-ax_uyz = Axis(fig[1, 1], title="u(y, z) - <u(y, z)>")
-#ax_vyz = Axis(fig[1, 2], title="v(y, z)")
-ax_wyz = Axis(fig[1, 2], title="w(y, z)")
-ax_Nyz = Axis(fig[1, 3], title="N²(y, z)")
-ax_eyz = Axis(fig[1, 4], title="e(y, z)")
-ax_κyz = Axis(fig[1, 5], title="κ(y, z)")
+ax_vxz = Axis(fig[1, 1], title="v(x, z) - <v(x, z)>")
+ax_wxz = Axis(fig[1, 2], title="w(x, z)")
+ax_Nxz = Axis(fig[1, 3], title="N²(x, z)")
+ax_exz = Axis(fig[1, 4], title="e(x, z)")
+ax_κxz = Axis(fig[1, 5], title="κ(x, z)")
 
 ax_bz = Axis(fig[2, 1], title="b(z)", xlabel="z")
 ax_uz = Axis(fig[2, 2], title="u(z)", ylabel="z")
@@ -147,68 +147,73 @@ n = slider.value
 title = @lift string("Two-dimensional channel at t = ", prettytime(b_ts.times[$n]))
 Label(fig[0, :], title, fontsize=24)
 
-b_yz = @lift interior(b_ts[$n], 1, :, :)
-e_yz = @lift interior(e_ts[$n], 1, :, :)
+b_xz = @lift interior(b_ts[$n], :, 1, :)
+e_xz = @lift interior(e_ts[$n], :, 1, :)
 
-u_yz = @lift begin
-    u = interior(u_ts[$n], 1, :, :)
+u_xz = @lift begin
+    u = interior(u_ts[$n], :, 1, :)
     u .- mean(filter(!isnan, u))
 end
 
-v_yz = @lift interior(v_ts[$n], 1, :, :)
-w_yz = @lift interior(w_ts[$n], 1, :, :)
-w_yz = @lift interior(w_ts[$n], 1, :, :)
-N_yz = @lift interior(N_ts[$n], 1, :, :)
-κ_yz = @lift interior(κ_ts[$n], 1, :, :)
+v_xz = @lift begin
+    v = interior(v_ts[$n], :, 1, :)
+    v .- mean(filter(!isnan, v))
+end
 
-Nx, Ny, Nz = size(grid)
+v_xz = @lift interior(v_ts[$n], :, 1, :)
+w_xz = @lift interior(w_ts[$n], :, 1, :)
+w_xz = @lift interior(w_ts[$n], :, 1, :)
+N_xz = @lift interior(N_ts[$n], :, 1, :)
+κ_xz = @lift interior(κ_ts[$n], :, 1, :)
 
-b_z1 = @lift interior(b_ts[$n], 1, 16, :)
-b_z2 = @lift interior(b_ts[$n], 1, 32, :)
-b_z3 = @lift interior(b_ts[$n], 1, 8, :)
+Nx, Ny, Nz = size(grid)
 
-e_z1 = @lift interior(e_ts[$n], 1, 16, :)
-e_z2 = @lift interior(e_ts[$n], 1, 32, :)
-e_z3 = @lift interior(e_ts[$n], 1, 8, :)
+b_z1 = @lift interior(b_ts[$n], 16, 1, :)
+b_z2 = @lift interior(b_ts[$n], 32, 1, :)
+b_z3 = @lift interior(b_ts[$n], 8,  1, :)
 
-κ_z1 = @lift interior(κ_ts[$n], 1, 16, :)
-κ_z2 = @lift interior(κ_ts[$n], 1, 32, :)
-κ_z3 = @lift interior(κ_ts[$n], 1, 8, :)
+e_z1 = @lift interior(e_ts[$n], 16, 1, :)
+e_z2 = @lift interior(e_ts[$n], 32, 1, :)
+e_z3 = @lift interior(e_ts[$n], 8,  1, :)
 
-u_z1 = @lift interior(u_ts[$n], 1, 16, :)
-u_z2 = @lift interior(u_ts[$n], 1, 32, :)
-u_z3 = @lift interior(u_ts[$n], 1, 8, :)
+κ_z1 = @lift interior(κ_ts[$n], 16, 1, :)
+κ_z2 = @lift interior(κ_ts[$n], 32, 1, :)
+κ_z3 = @lift interior(κ_ts[$n], 8,  1, :)
 
-v_z1 = @lift interior(v_ts[$n], 1, 16, :)
-v_z2 = @lift interior(v_ts[$n], 1, 32, :)
-v_z3 = @lift interior(v_ts[$n], 1, 8, :)
+u_z1 = @lift interior(u_ts[$n], 16, 1, :)
+u_z2 = @lift interior(u_ts[$n], 32, 1, :)
+u_z3 = @lift interior(u_ts[$n], 8,  1, :)
+                                       
+v_z1 = @lift interior(v_ts[$n], 16, 1, :)
+v_z2 = @lift interior(v_ts[$n], 32, 1, :)
+v_z3 = @lift interior(v_ts[$n], 8,  1, :)
 
 x, y, z = nodes(b_ts)
 xκ, yκ, zκ = nodes(κ_ts)
 
 elim = 1e-4
 ulim = 0.2
-vlim = 2e-2
-wlim = 1e-5
+vlim = 1e-4
+wlim = 1e-7
 κlim = 1e-3 # 1e1
 
-heatmap!(ax_eyz, y, z, e_yz, colormap=:solar, colorrange=(0, elim), nan_color=:gray)
-contour!(ax_eyz, y, z, b_yz, levels=15, color=:black)
+heatmap!(ax_exz, x, z, e_xz, colormap=:solar, colorrange=(0, elim), nan_color=:gray)
+contour!(ax_exz, x, z, b_xz, levels=15, color=:black)
 
-heatmap!(ax_κyz, y, zκ, κ_yz, colormap=:thermal, colorrange=(0, κlim), nan_color=:gray)
-contour!(ax_κyz, y, z, b_yz, levels=15, color=:black)
+heatmap!(ax_κxz, x, zκ, κ_xz, colormap=:thermal, colorrange=(0, κlim), nan_color=:gray)
+contour!(ax_κxz, x, z, b_xz, levels=15, color=:black)
 
-heatmap!(ax_uyz, y, z, u_yz, colormap=:balance, colorrange=(-ulim, ulim), nan_color=:gray)
-contour!(ax_uyz, y, z, b_yz, levels=15, color=:black)
+# heatmap!(ax_uxz, x, z, u_xz, colormap=:balance, colorrange=(-ulim, ulim), nan_color=:gray)
+# contour!(ax_uxz, x, z, b_xz, levels=15, color=:black)
 
-# heatmap!(ax_vyz, y, z, v_yz, colormap=:balance, colorrange=(-vlim, vlim), nan_color=:gray)
-# contour!(ax_vyz, y, z, b_yz, levels=15, color=:black)
+heatmap!(ax_vxz, x, z, v_xz, colormap=:balance, colorrange=(-vlim, vlim), nan_color=:gray)
+contour!(ax_vxz, x, z, b_xz, levels=15, color=:black)
 
-heatmap!(ax_wyz, y, z, w_yz, colormap=:balance, colorrange=(-wlim, wlim), nan_color=:gray)
-contour!(ax_wyz, y, z, b_yz, levels=15, color=:black)
+heatmap!(ax_wxz, x, z, w_xz, colormap=:balance, colorrange=(-wlim, wlim), nan_color=:gray)
+contour!(ax_wxz, x, z, b_xz, levels=15, color=:black)
 
-heatmap!(ax_Nyz, y, z, N_yz, colormap=:thermal, colorrange=(1e-6, 1e-5), nan_color=:gray)
-contour!(ax_Nyz, y, z, b_yz, levels=15, color=:black)
+heatmap!(ax_Nxz, x, z, N_xz, colormap=:thermal, colorrange=(1e-6, 2e-5), nan_color=:gray)
+contour!(ax_Nxz, x, z, b_xz, levels=15, color=:black)
 
 lines!(ax_bz, b_z1, z)
 lines!(ax_bz, b_z2, z)

From e28cc04cd5b68082b93296d022b8d6d3560e023b Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Mon, 5 Jun 2023 20:42:38 -0600
Subject: [PATCH 275/530] Overhaul baroclinic adjustment example

---
 examples/baroclinic_adjustment.jl | 245 +++++++++++++++++++-----------
 1 file changed, 156 insertions(+), 89 deletions(-)

diff --git a/examples/baroclinic_adjustment.jl b/examples/baroclinic_adjustment.jl
index a0c6c4309e..34428440db 100644
--- a/examples/baroclinic_adjustment.jl
+++ b/examples/baroclinic_adjustment.jl
@@ -23,11 +23,7 @@ Lx = 1000kilometers # east-west extent [m]
 Ly = 1000kilometers # north-south extent [m]
 Lz = 1kilometers    # depth [m]
 
-Nx = 64
-Ny = 64
-Nz = 40
-
-grid = RectilinearGrid(size = (Nx, Ny, Nz),
+grid = RectilinearGrid(size = (48, 48, 8),
                        x = (0, Lx),
                        y = (-Ly/2, Ly/2),
                        z = (-Lz, 0),
@@ -45,8 +41,10 @@ model = HydrostaticFreeSurfaceModel(; grid,
                                     momentum_advection = WENO(),
                                     tracer_advection = WENO())
 
-# We want to initialize our model with a baroclinically unstable front plus some small-amplitude
-# noise.
+# We start our simulation from rest with a baroclinically unstable buoyancy distribution.
+# We use `ramp(y, Δy)`, defined below, to specify a front with width `Δy`
+# and horizontal buoyancy gradient `M²`. We impose the front on top of a
+# vertical buoyancy gradient `N²` and a bit of noise.
 
 """
     ramp(y, Δy)
@@ -61,17 +59,13 @@ For example:
 ```
 """
 ramp(y, Δy) = min(max(0, y/Δy + 1/2), 1)
-nothing #hide
-
-# We then use `ramp(y, Δy)` to construct an initial buoyancy configuration of a baroclinically
-# unstable front. The front has a buoyancy jump `Δb` over a latitudinal width `Δy`.
 
-N² = 4e-6 # [s⁻²] buoyancy frequency / stratification
-M² = 8e-8 # [s⁻²] horizontal buoyancy gradient
+N² = 1e-5 # [s⁻²] buoyancy frequency / stratification
+M² = 1e-7 # [s⁻²] horizontal buoyancy gradient
 
-Δy = 50kilometers # width of the region of the front
-Δb = Δy * M²      # buoyancy jump associated with the front
-ϵb = 1e-2 * Δb    # noise amplitude
+Δy = 100kilometers # width of the region of the front
+Δb = Δy * M²       # buoyancy jump associated with the front
+ϵb = 1e-2 * Δb     # noise amplitude
 
 bᵢ(x, y, z) = N² * z + Δb * ramp(y, Δy) + ϵb * randn()
 
@@ -81,7 +75,8 @@ set!(model, b=bᵢ)
 
 using CairoMakie
 
-x, y, z = 1e-3 .* nodes(grid, (Center(), Center(), Center())) # convert m -> km
+## Build coordinates with units of kilometers
+x, y, z = 1e-3 .* nodes(grid, (Center(), Center(), Center()))
 
 b = model.tracers.b
 
@@ -97,37 +92,33 @@ Colorbar(fig[1, 2], hm, label = "[m s⁻²]")
 current_figure() # hide
 fig
 
-# Now let's built a `Simulation`.
-
-Δt₀ = 5minutes
-stop_time = 40days
+# ## Simulation
+# 
+# Now let's build a `Simulation`.
 
-simulation = Simulation(model, Δt=Δt₀, stop_time=stop_time)
+simulation = Simulation(model, Δt=20minutes, stop_time=20days)
 
 # We add a `TimeStepWizard` callback to adapt the simulation's time-step,
 
 wizard = TimeStepWizard(cfl=0.2, max_change=1.1, max_Δt=20minutes)
-
 simulation.callbacks[:wizard] = Callback(wizard, IterationInterval(20))
 
 # Also, we add a callback to print a message about how the simulation is going,
 
 using Printf
 
-wall_clock = [time_ns()]
+wall_clock = Ref(time_ns())
 
 function print_progress(sim)
+    u, v, w = model.velocities
+    progress = 100 * (time(sim) / sim.stop_time)
+    elapsed = (time_ns() - wall_clock[]) / 1e9
+
     @printf("[%05.2f%%] i: %d, t: %s, wall time: %s, max(u): (%6.3e, %6.3e, %6.3e) m/s, next Δt: %s\n",
-            100 * (sim.model.clock.time / sim.stop_time),
-            sim.model.clock.iteration,
-            prettytime(sim.model.clock.time),
-            prettytime(1e-9 * (time_ns() - wall_clock[1])),
-            maximum(abs, sim.model.velocities.u),
-            maximum(abs, sim.model.velocities.v),
-            maximum(abs, sim.model.velocities.w),
-            prettytime(sim.Δt))
-
-    wall_clock[1] = time_ns()
+            progress, iteration(sim), prettytime(sim), prettytime(elapsed),
+            maximum(abs, u), maximum(abs, v), maximum(abs, w), prettytime(sim.Δt))
+
+    wall_clock[] = time_ns()
     
     return nothing
 end
@@ -135,20 +126,20 @@ end
 simulation.callbacks[:print_progress] = Callback(print_progress, IterationInterval(100))
 
 # ## Diagnostics/Output
-
-# Add some diagnostics. Here, we save the buoyancy, ``b``, at the edges of our domain as well as
+#
+# Here, we save the buoyancy, ``b``, at the edges of our domain as well as
 # the zonal (``x``) average of buoyancy.
 
 u, v, w = model.velocities
-
-B = Field(Average(b, dims=1))
+ζ = ∂x(v) - ∂y(u)
+B = Average(b, dims=1)
+U = Average(u, dims=1)
+V = Average(v, dims=1)
 
 filename = "baroclinic_adjustment"
 save_fields_interval = 0.5day
 
-slicers = (west = (1, :, :),
-           east = (grid.Nx, :, :),
-           south = (:, 1, :),
+slicers = (east = (grid.Nx, :, :),
            north = (:, grid.Ny, :),
            bottom = (:, :, 1),
            top = (:, :, grid.Nz))
@@ -156,19 +147,19 @@ slicers = (west = (1, :, :),
 for side in keys(slicers)
     indices = slicers[side]
 
-    simulation.output_writers[side] = JLD2OutputWriter(model, (; b);
+    simulation.output_writers[side] = JLD2OutputWriter(model, (; b, ζ);
                                                        filename = filename * "_$(side)_slice",
                                                        schedule = TimeInterval(save_fields_interval),
                                                        overwrite_existing = true,
                                                        indices)
 end
 
-simulation.output_writers[:zonal] = JLD2OutputWriter(model, (b=B,);
+simulation.output_writers[:zonal] = JLD2OutputWriter(model, (; b=B, u=U, v=V);
                                                      filename = filename * "_zonal_average",
                                                      schedule = TimeInterval(save_fields_interval),
                                                      overwrite_existing = true)
 
-# Now let's run!
+# Now we're ready to _run_.
 
 @info "Running the simulation..."
 
@@ -177,14 +168,18 @@ run!(simulation)
 @info "Simulation completed in " * prettytime(simulation.run_wall_time)
 
 # ## Visualization
-
-# Now we are ready to visualize our resutls! We use `CairoMakie` in this example.
-# On a system with OpenGL `using GLMakie` is more convenient as figures will be
-# displayed on the screen.
+#
+# All that's left is to make a pretty movie.
+# Actually, we make two visualizations here. First, we illustrate how to make a
+# 3D visualization with `Makie`'s `Axis3` and `Makie.surface`. Then we make a movie in 2D.
+# We use `CairoMakie` in this example, but note that `using GLMakie` is more
+# convenient on a system with OpenGL, as figures will be displayed on the screen.
 
 using CairoMakie
 
-# We load the saved buoyancy output on the top, bottom, and east surface as `FieldTimeSeries`es.
+# ### Three-dimensional visualization
+#
+# We load the saved buoyancy output on the top, bottom, north, and east surface as `FieldTimeSeries`es.
 
 filename = "baroclinic_adjustment"
 
@@ -197,19 +192,19 @@ b_timeserieses = (east   = FieldTimeSeries(slice_filenames.east, "b"),
                   bottom = FieldTimeSeries(slice_filenames.bottom, "b"),
                   top    = FieldTimeSeries(slice_filenames.top, "b"))
 
-avg_b_timeseries = FieldTimeSeries(filename * "_zonal_average.jld2", "b")
-
-times = avg_b_timeseries.times
+B_timeseries = FieldTimeSeries(filename * "_zonal_average.jld2", "b")
 
-nothing #hide
+times = B_timeseries.times
+grid = B_timeseries.grid
 
-# We build the coordinates. We rescale horizontal coordinates so that they correspond to kilometers.
+# We build the coordinates. We rescale horizontal coordinates to kilometers.
 
-x, y, z = nodes(b_timeserieses.east)
+xb, yb, zb = nodes(b_timeserieses.east)
 
-x = x .* 1e-3 # convert m -> km
-y = y .* 1e-3 # convert m -> km
+xb = xb ./ 1e3 # convert m -> km
+yb = yb ./ 1e3 # convert m -> km
 
+Nx, Ny, Nz = size(grid)
 x_xz = repeat(x, 1, Nz)
 y_xz_north = y[end] * ones(Nx, Nz)
 z_xz = repeat(reshape(z, 1, Nz), Nx, 1)
@@ -222,72 +217,144 @@ x_xy = x
 y_xy = y
 z_xy_top = z[end] * ones(grid.Nx, grid.Ny)
 z_xy_bottom = z[1] * ones(grid.Nx, grid.Ny)
-nothing #hide
+nothing # hide
 
 # Then we create a 3D axis. We use `zonal_slice_displacement` to control where the plot of the instantaneous
 # zonal average flow is located.
 
-fig = Figure(resolution = (900, 520))
+fig = Figure(resolution = (1600, 800))
 
 zonal_slice_displacement = 1.2
 
-ax = Axis3(fig[2, 1], aspect=(1, 1, 1/5),
-           xlabel="x (km)", ylabel="y (km)", zlabel="z (m)",
+ax = Axis3(fig[2, 1],
+           aspect=(1, 1, 1/5),
+           xlabel = "x (km)",
+           ylabel = "y (km)",
+           zlabel = "z (m)",
+           xlabeloffset = 100,
+           ylabeloffset = 100,
+           zlabeloffset = 100, 
            limits = ((x[1], zonal_slice_displacement * x[end]), (y[1], y[end]), (z[1], z[end])),
-           elevation = 0.45, azimuth = 6.8,
-           xspinesvisible = false, zgridvisible=false,
-           protrusions=40,
-           perspectiveness=0.7)
+           elevation = 0.45,
+           azimuth = 6.8,
+           xspinesvisible = false,
+           zgridvisible = false,
+           protrusions = 40,
+           perspectiveness = 0.7)
 
-nothing #hide
+# We use data from the final savepoint for the 3D plot.
+# Note that this plot can easily be animated by using Makie's `Observable`.
+# To dive into `Observable`s, check out
+# [Makie.jl's Documentation](https://makie.juliaplots.org/stable/documentation/nodes/index.html).
 
-# We use Makie's `Observable` to animate the data. To dive into how `Observable`s work we
-# refer to [Makie.jl's Documentation](https://makie.juliaplots.org/stable/documentation/nodes/index.html).
-
-n = Observable(1)
+n = length(times)
 
 # Now let's make a 3D plot of the buoyancy and in front of it we'll use the zonally-averaged output
 # to plot the instantaneous zonal-average of the buoyancy.
 
-b_slices = (east   = @lift(interior(b_timeserieses.east[$n], 1, :, :)),
-            north  = @lift(interior(b_timeserieses.north[$n], :, 1, :)),
-            bottom = @lift(interior(b_timeserieses.bottom[$n], :, :, 1)),
-            top    = @lift(interior(b_timeserieses.top[$n], :, :, 1)))
-
-avg_b = @lift interior(avg_b_timeseries[$n], 1, :, :)
+b_slices = (east   = interior(b_timeserieses.east[n], 1, :, :),
+            north  = interior(b_timeserieses.north[n], :, 1, :),
+            bottom = interior(b_timeserieses.bottom[n], :, :, 1),
+            top    = interior(b_timeserieses.top[n], :, :, 1))
 
-clims = @lift 1.1 .* extrema(b_timeserieses.top[$n][:])
+## Zonally-averaged buoyancy
+B = interior(B_timeseries[n], 1, :, :)
 
-kwargs = (colorrange = clims, colormap = :deep)
+clims = 1.1 .* extrema(b_timeserieses.top[n][:])
 
+kwargs = (colorrange=clims, colormap=:deep)
 surface!(ax, x_yz_east, y_yz, z_yz;    color = b_slices.east, kwargs...)
 surface!(ax, x_xz, y_xz_north, z_xz;   color = b_slices.north, kwargs...)
 surface!(ax, x_xy, y_xy, z_xy_bottom ; color = b_slices.bottom, kwargs...)
 surface!(ax, x_xy, y_xy, z_xy_top;     color = b_slices.top, kwargs...)
 
-sf = surface!(ax, zonal_slice_displacement .* x_yz_east, y_yz, z_yz; color = avg_b, kwargs...)
+sf = surface!(ax, zonal_slice_displacement .* x_yz_east, y_yz, z_yz; color = B, kwargs...)
 
-contour!(ax, y, z, avg_b; transformation = (:yz, zonal_slice_displacement * x[end]),
+contour!(ax, y, z, B; transformation = (:yz, zonal_slice_displacement * x[end]),
          levels = 15, linewidth = 2, color = :black)
 
-Colorbar(fig[2, 2], sf, label = "m s⁻²", height = 200, tellheight=false)
-
-title = @lift "Buoyancy at t = " * string(round(times[$n] / day, digits=1)) * " days"
+Colorbar(fig[2, 2], sf, label = "m s⁻²", height = Relative(0.4), tellheight=false)
 
+title = "Buoyancy at t = " * string(round(times[n] / day, digits=1)) * " days"
 fig[1, 1:2] = Label(fig, title; fontsize = 24, tellwidth = false, padding = (0, 0, -120, 0))
 
-current_figure() # hide
-fig
+rowgap!(fig.layout, 1, Relative(-0.2))
+colgap!(fig.layout, 1, Relative(-0.1))
+
+save("baroclinic_adjustment_3d.png", fig)
+nothing # hide
+
+# ![](baroclinic_adjustment_3d.png)
 
-# Finally, we add a figure title with the time of the snapshot and then record a movie.
+# ### Two-dimensional movie
+#
+# We make a 2D movie that shows buoyancy ``b`` and vertical vorticity ``ζ`` at the surface,
+# as well as the zonally-averaged zonal and meridional velocities ``U`` and ``V`` in the
+# ``(y, z)`` plane. First we load the `FieldTimeSeries` and extract the additional coordinates
+# we'll need for plotting.
+
+ζ_timeseries = FieldTimeSeries(slice_filenames.top, "ζ")
+U_timeseries = FieldTimeSeries(filename * "_zonal_average.jld2", "u")
+B_timeseries = FieldTimeSeries(filename * "_zonal_average.jld2", "b")
+V_timeseries = FieldTimeSeries(filename * "_zonal_average.jld2", "v")
+
+xζ, yζ, zζ = nodes(ζ_timeseries)
+yv = ynodes(V_timeseries)
+
+xζ = xζ ./ 1e3 # convert m -> km
+yζ = yζ ./ 1e3 # convert m -> km
+yv = yv ./ 1e3 # convert m -> km
+
+# Next, we set up a plot with 4 panels. The top panels are large and square, while
+# the bottom panels get a reduced aspect ratio through `rowsize!`.
+
+set_theme!(Theme(fontsize=24))
+
+fig = Figure(resolution=(1800, 1000))
+
+axb = Axis(fig[1, 2], xlabel="x (km)", ylabel="y (km)", aspect=1)
+axζ = Axis(fig[1, 3], xlabel="x (km)", ylabel="y (km)", aspect=1, yaxisposition=:right)
+
+axu = Axis(fig[2, 2], xlabel="y (km)", ylabel="z (m)")
+axv = Axis(fig[2, 3], xlabel="y (km)", ylabel="z (m)", yaxisposition=:right)
+
+rowsize!(fig.layout, 2, Relative(0.3))
+
+# To prepare a plot for animation, we index the timeseries with an `Observable`,
+
+n = Observable(1)
+
+b_top = @lift interior(b_timeserieses.top[$n], :, :, 1)
+ζ_top = @lift interior(ζ_timeseries[$n], :, :, 1)
+U = @lift interior(U_timeseries[$n], 1, :, :)
+V = @lift interior(V_timeseries[$n], 1, :, :)
+B = @lift interior(B_timeseries[$n], 1, :, :)
+
+# and then build our plot:
+
+hm = heatmap!(axb, xb, yb, b_top, colorrange=(0, Δb), colormap=:thermal)
+Colorbar(fig[1, 1], hm, flipaxis=false, label="Surface b(x, y) (m s⁻²)")
+
+hm = heatmap!(axζ, xζ, yζ, ζ_top, colorrange=(-5e-5, 5e-5), colormap=:balance)
+Colorbar(fig[1, 4], hm, label="Surface ζ(x, y) (s⁻¹)")
+
+hm = heatmap!(axu, yb, zb, U; colorrange=(-5e-1, 5e-1), colormap=:balance)
+Colorbar(fig[2, 1], hm, flipaxis=false, label="Zonally-averaged U(y, z) (m s⁻¹)")
+contour!(axu, yb, zb, B; levels=15, color=:black)
+
+hm = heatmap!(axv, yv, zb, V; colorrange=(-1e-1, 1e-1), colormap=:balance)
+Colorbar(fig[2, 4], hm, label="Zonally-averaged V(y, z) (m s⁻¹)")
+contour!(axv, yb, zb, B; levels=15, color=:black)
+nothing # hide
+
+# Finally, we're ready to record the movie.
 
 frames = 1:length(times)
 
 record(fig, filename * ".mp4", frames, framerate=8) do i
-    msg = string("Plotting frame ", i, " of ", frames[end])
-    print(msg * " \r")
     n[] = i
 end
-nothing #hide
+nothing # hide
 
 # ![](baroclinic_adjustment.mp4)
+

From 9a10e9456f7635028bff420a99230adeab8a0d9c Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Mon, 5 Jun 2023 20:44:15 -0600
Subject: [PATCH 276/530] Remove messages during `record`

---
 examples/convecting_plankton.jl              | 2 --
 examples/internal_wave.jl                    | 2 --
 examples/kelvin_helmholtz_instability.jl     | 4 ++--
 examples/langmuir_turbulence.jl              | 2 --
 examples/ocean_wind_mixing_and_convection.jl | 2 --
 examples/one_dimensional_diffusion.jl        | 2 --
 examples/tilted_bottom_boundary_layer.jl     | 2 --
 examples/two_dimensional_turbulence.jl       | 2 --
 8 files changed, 2 insertions(+), 16 deletions(-)

diff --git a/examples/convecting_plankton.jl b/examples/convecting_plankton.jl
index 0c1f547596..efc52fc075 100644
--- a/examples/convecting_plankton.jl
+++ b/examples/convecting_plankton.jl
@@ -274,8 +274,6 @@ frames = 1:length(times)
 @info "Making an animation of convecting plankton..."
 
 record(fig, "convecting_plankton.mp4", frames, framerate=8) do i
-    msg = string("Plotting frame ", i, " of ", frames[end])
-    print(msg * " \r")
     n[] = i
 end
 nothing #hide
diff --git a/examples/internal_wave.jl b/examples/internal_wave.jl
index 6fe40398e6..30240a4a25 100644
--- a/examples/internal_wave.jl
+++ b/examples/internal_wave.jl
@@ -176,8 +176,6 @@ frames = 1:length(w_timeseries.times)
 @info "Animating a propagating internal wave..."
 
 record(fig, "internal_wave.mp4", frames, framerate=8) do i
-    msg = string("Plotting frame ", i, " of ", frames[end])
-    print(msg * " \r")
     n[] = i
 end
 nothing #hide
diff --git a/examples/kelvin_helmholtz_instability.jl b/examples/kelvin_helmholtz_instability.jl
index 1063894da3..80ac7c9ef8 100644
--- a/examples/kelvin_helmholtz_instability.jl
+++ b/examples/kelvin_helmholtz_instability.jl
@@ -518,8 +518,8 @@ scatter!(ax_KE, KE_point;
 axislegend(ax_KE; position = :rb)
 
 record(fig, "kelvin_helmholtz_instability_total.mp4", frames, framerate=8) do i
-       @info "Plotting frame $i of $(frames[end])..."
-       n[] = i
+    @info "Plotting frame $i of $(frames[end])..."
+    n[] = i
 end
 nothing #hide
 
diff --git a/examples/langmuir_turbulence.jl b/examples/langmuir_turbulence.jl
index 1afb25ceac..e34fe32430 100644
--- a/examples/langmuir_turbulence.jl
+++ b/examples/langmuir_turbulence.jl
@@ -362,8 +362,6 @@ fig
 frames = 1:length(times)
 
 record(fig, "langmuir_turbulence.mp4", frames, framerate=8) do i
-    msg = string("Plotting frame ", i, " of ", frames[end])
-    print(msg * " \r")
     n[] = i
 end
 nothing #hide
diff --git a/examples/ocean_wind_mixing_and_convection.jl b/examples/ocean_wind_mixing_and_convection.jl
index 687bd449be..1968e8e837 100644
--- a/examples/ocean_wind_mixing_and_convection.jl
+++ b/examples/ocean_wind_mixing_and_convection.jl
@@ -301,8 +301,6 @@ frames = intro:length(times)
 @info "Making a motion picture of ocean wind mixing and convection..."
 
 record(fig, filename * ".mp4", frames, framerate=8) do i
-    msg = string("Plotting frame ", i, " of ", frames[end])
-    print(msg * " \r")
     n[] = i
 end
 nothing #hide
diff --git a/examples/one_dimensional_diffusion.jl b/examples/one_dimensional_diffusion.jl
index 9d649682b2..082759971d 100644
--- a/examples/one_dimensional_diffusion.jl
+++ b/examples/one_dimensional_diffusion.jl
@@ -152,8 +152,6 @@ frames = 1:length(times)
 @info "Making an animation..."
 
 record(fig, "one_dimensional_diffusion.mp4", frames, framerate=24) do i
-    msg = string("Plotting frame ", i, " of ", frames[end])
-    print(msg * " \r")
     n[] = i
 end
 nothing #hide
diff --git a/examples/tilted_bottom_boundary_layer.jl b/examples/tilted_bottom_boundary_layer.jl
index d51993b963..a72815f101 100644
--- a/examples/tilted_bottom_boundary_layer.jl
+++ b/examples/tilted_bottom_boundary_layer.jl
@@ -240,8 +240,6 @@ fig
 frames = 1:length(times)
 
 record(fig, "tilted_bottom_boundary_layer.mp4", frames, framerate=12) do i
-    msg = string("Plotting frame ", i, " of ", frames[end])
-    if i%5 == 0 print(msg * " \r") end
     n[] = i
 end
 nothing #hide
diff --git a/examples/two_dimensional_turbulence.jl b/examples/two_dimensional_turbulence.jl
index 4f1a0b5471..a991052911 100644
--- a/examples/two_dimensional_turbulence.jl
+++ b/examples/two_dimensional_turbulence.jl
@@ -157,8 +157,6 @@ frames = 1:length(times)
 @info "Making a neat animation of vorticity and speed..."
 
 record(fig, filename * ".mp4", frames, framerate=24) do i
-    msg = string("Plotting frame ", i, " of ", frames[end])
-    print(msg * " \r")
     n[] = i
 end
 nothing #hide

From 4d8f58f93a24b68b8b93d0039ce812edb177cd9c Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Mon, 5 Jun 2023 20:45:32 -0600
Subject: [PATCH 277/530] Upwelling validation tests

---
 .../heterogeneous_windy_convection.jl         | 147 +++++++++---------
 1 file changed, 71 insertions(+), 76 deletions(-)

diff --git a/validation/vertical_mixing_closures/heterogeneous_windy_convection.jl b/validation/vertical_mixing_closures/heterogeneous_windy_convection.jl
index 3dd09c935b..140f9a26e1 100644
--- a/validation/vertical_mixing_closures/heterogeneous_windy_convection.jl
+++ b/validation/vertical_mixing_closures/heterogeneous_windy_convection.jl
@@ -9,16 +9,16 @@ using Oceananigans.ImmersedBoundaries: GridFittedBottom, PartialCellBottom
 
 import Oceananigans.TurbulenceClosures.CATKEVerticalDiffusivities
 
-Nx = 50
-Ny = 1
+Nx = 1
+Ny = 200
 
-const Lx = 500kilometers
+const Lx = 1000kilometers
 const Ly = Lx
 const Lz = 1000
 
 # Stretched vertical grid
 γ = 1.01
-Δz₀ = 16
+Δz₀ = 8
 h₀ = 128
 z = [-Δz₀ * k for k = 0:ceil(h₀ / Δz₀)]
 while z[end] > -Lz
@@ -29,52 +29,52 @@ Nz = length(z) - 1
 
 grid = RectilinearGrid(size = (Nx, Ny, Nz),
                        halo = (4, 4, 4),
-                       x = (-Lx, 0),
-                       y = (0, Ly),
+                       x = (0, Lx),
+                       y = (-Ly/2, Ly/2),
                        z = z,
-                       topology=(Bounded, Periodic, Bounded))
+                       topology=(Periodic, Bounded, Bounded))
 
-z_bottom(x, y) = - 2 * abs(x) * Lz / Lx
-#grid = ImmersedBoundaryGrid(grid, PartialCellBottom(z_bottom, minimum_fractional_cell_height=0.2))
-grid = ImmersedBoundaryGrid(grid, GridFittedBottom(z_bottom))
+z_bottom(x, y) = - Lz * (1 - (2y / Ly)^2)
+grid = ImmersedBoundaryGrid(grid, PartialCellBottom(z_bottom, minimum_fractional_cell_height=0.1))
 
 @show grid
-@inline Qᵇ(x, y, t) = 0.0 #1e-7
-@inline Qᵘ(x, y, t) = 0.0
-@inline Qᵛ(x, y, t, p) = + 1e-4 * exp(-x^2 / (2 * p.δx^2))
+@inline Qᵇ(x, y, t) = 1e-7
+@inline Qᵘ(x, y, t) = -1e-3 * cos(π * y / Ly)
 
 b_top_bc = FluxBoundaryCondition(Qᵇ)
 u_top_bc = FluxBoundaryCondition(Qᵘ)
-v_top_bc = FluxBoundaryCondition(Qᵛ, parameters=(; δx=200kilometers))
 
 b_bcs = FieldBoundaryConditions(top=b_top_bc)
 u_bcs = FieldBoundaryConditions(top=u_top_bc)
-v_bcs = FieldBoundaryConditions(top=v_top_bc)
 
-vertical_mixing = CATKEVerticalDiffusivity(; minimum_turbulent_kinetic_energy=1e-6)
+vertical_mixing = CATKEVerticalDiffusivity(; minimum_turbulent_kinetic_energy=1e-9)
 #vertical_mixing = RiBasedVerticalDiffusivity()
-#
-horizontal_viscosity = HorizontalScalarDiffusivity(ν=1e4)
 
 @show vertical_mixing
-#closure = (vertical_mixing, horizontal_viscosity)
-closure = vertical_mixing
 
-filename = "heterogeneous_cooling.jld2"
+Δy = Ly / Ny
+ν₄ = Δy^4 / 1hours
+hyperviscosity = HorizontalScalarBiharmonicDiffusivity(ν=ν₄)
+
+#closure = vertical_mixing
+closure = (vertical_mixing, hyperviscosity)
+
+filename = "heterogeneous_cooling_with_hyperviscosity.jld2"
+#filename = "heterogeneous_cooling.jld2"
 
 model = HydrostaticFreeSurfaceModel(; grid, closure,
                                     momentum_advection = WENO(),
                                     tracer_advection = WENO(),
-                                    coriolis = FPlane(latitude=+33),
+                                    coriolis = FPlane(f=1e-4),
                                     tracers = (:b, :e),
-                                    boundary_conditions = (; b=b_bcs, u=u_bcs, v=v_bcs),
+                                    boundary_conditions = (; b=b_bcs, u=u_bcs),
                                     buoyancy = BuoyancyTracer())
 
 N²ᵢ = 1e-5
 bᵢ(x, y, z) = N²ᵢ * z
-set!(model, b=bᵢ, e=1e-6)
+set!(model, b=bᵢ, e=1e-9)
 
-simulation = Simulation(model, Δt=10minute, stop_iteration=1000)
+simulation = Simulation(model, Δt=5minute, stop_time=10days)
 
 κᶜ = if model.closure isa Tuple
     model.diffusivity_fields[1].κᶜ
@@ -88,8 +88,7 @@ outputs = (; model.velocities..., model.tracers..., κᶜ=κᶜ, N²=N²)
 
 simulation.output_writers[:fields] = JLD2OutputWriter(model, outputs;
                                                       filename,
-                                                      #schedule = TimeInterval(1hour),
-                                                      schedule = IterationInterval(10),
+                                                      schedule = TimeInterval(1hour),
                                                       overwrite_existing = true)
 
 function progress(sim)
@@ -129,11 +128,12 @@ end
 
 fig = Figure(resolution=(1600, 800))
 
-ax_vxz = Axis(fig[1, 1], title="v(x, z) - <v(x, z)>")
-ax_wxz = Axis(fig[1, 2], title="w(x, z)")
-ax_Nxz = Axis(fig[1, 3], title="N²(x, z)")
-ax_exz = Axis(fig[1, 4], title="e(x, z)")
-ax_κxz = Axis(fig[1, 5], title="κ(x, z)")
+ax_uyz = Axis(fig[1, 1], title="u(y, z) - <u(y, z)>")
+#ax_vyz = Axis(fig[1, 2], title="v(y, z)")
+ax_wyz = Axis(fig[1, 2], title="w(y, z)")
+ax_Nyz = Axis(fig[1, 3], title="N²(y, z)")
+ax_eyz = Axis(fig[1, 4], title="e(y, z)")
+ax_κyz = Axis(fig[1, 5], title="κ(y, z)")
 
 ax_bz = Axis(fig[2, 1], title="b(z)", xlabel="z")
 ax_uz = Axis(fig[2, 2], title="u(z)", ylabel="z")
@@ -147,73 +147,68 @@ n = slider.value
 title = @lift string("Two-dimensional channel at t = ", prettytime(b_ts.times[$n]))
 Label(fig[0, :], title, fontsize=24)
 
-b_xz = @lift interior(b_ts[$n], :, 1, :)
-e_xz = @lift interior(e_ts[$n], :, 1, :)
+b_yz = @lift interior(b_ts[$n], 1, :, :)
+e_yz = @lift interior(e_ts[$n], 1, :, :)
 
-u_xz = @lift begin
-    u = interior(u_ts[$n], :, 1, :)
+u_yz = @lift begin
+    u = interior(u_ts[$n], 1, :, :)
     u .- mean(filter(!isnan, u))
 end
 
-v_xz = @lift begin
-    v = interior(v_ts[$n], :, 1, :)
-    v .- mean(filter(!isnan, v))
-end
-
-v_xz = @lift interior(v_ts[$n], :, 1, :)
-w_xz = @lift interior(w_ts[$n], :, 1, :)
-w_xz = @lift interior(w_ts[$n], :, 1, :)
-N_xz = @lift interior(N_ts[$n], :, 1, :)
-κ_xz = @lift interior(κ_ts[$n], :, 1, :)
+v_yz = @lift interior(v_ts[$n], 1, :, :)
+w_yz = @lift interior(w_ts[$n], 1, :, :)
+w_yz = @lift interior(w_ts[$n], 1, :, :)
+N_yz = @lift interior(N_ts[$n], 1, :, :)
+κ_yz = @lift interior(κ_ts[$n], 1, :, :)
 
 Nx, Ny, Nz = size(grid)
 
-b_z1 = @lift interior(b_ts[$n], 16, 1, :)
-b_z2 = @lift interior(b_ts[$n], 32, 1, :)
-b_z3 = @lift interior(b_ts[$n], 8,  1, :)
+b_z1 = @lift interior(b_ts[$n], 1, 16, :)
+b_z2 = @lift interior(b_ts[$n], 1, 32, :)
+b_z3 = @lift interior(b_ts[$n], 1, 8, :)
+
+e_z1 = @lift interior(e_ts[$n], 1, 16, :)
+e_z2 = @lift interior(e_ts[$n], 1, 32, :)
+e_z3 = @lift interior(e_ts[$n], 1, 8, :)
 
-e_z1 = @lift interior(e_ts[$n], 16, 1, :)
-e_z2 = @lift interior(e_ts[$n], 32, 1, :)
-e_z3 = @lift interior(e_ts[$n], 8,  1, :)
+κ_z1 = @lift interior(κ_ts[$n], 1, 16, :)
+κ_z2 = @lift interior(κ_ts[$n], 1, 32, :)
+κ_z3 = @lift interior(κ_ts[$n], 1, 8, :)
 
-κ_z1 = @lift interior(κ_ts[$n], 16, 1, :)
-κ_z2 = @lift interior(κ_ts[$n], 32, 1, :)
-κ_z3 = @lift interior(κ_ts[$n], 8,  1, :)
+u_z1 = @lift interior(u_ts[$n], 1, 16, :)
+u_z2 = @lift interior(u_ts[$n], 1, 32, :)
+u_z3 = @lift interior(u_ts[$n], 1, 8, :)
 
-u_z1 = @lift interior(u_ts[$n], 16, 1, :)
-u_z2 = @lift interior(u_ts[$n], 32, 1, :)
-u_z3 = @lift interior(u_ts[$n], 8,  1, :)
-                                       
-v_z1 = @lift interior(v_ts[$n], 16, 1, :)
-v_z2 = @lift interior(v_ts[$n], 32, 1, :)
-v_z3 = @lift interior(v_ts[$n], 8,  1, :)
+v_z1 = @lift interior(v_ts[$n], 1, 16, :)
+v_z2 = @lift interior(v_ts[$n], 1, 32, :)
+v_z3 = @lift interior(v_ts[$n], 1, 8, :)
 
 x, y, z = nodes(b_ts)
 xκ, yκ, zκ = nodes(κ_ts)
 
 elim = 1e-4
 ulim = 0.2
-vlim = 1e-4
-wlim = 1e-7
+vlim = 2e-2
+wlim = 1e-5
 κlim = 1e-3 # 1e1
 
-heatmap!(ax_exz, x, z, e_xz, colormap=:solar, colorrange=(0, elim), nan_color=:gray)
-contour!(ax_exz, x, z, b_xz, levels=15, color=:black)
+heatmap!(ax_eyz, y, z, e_yz, colormap=:solar, colorrange=(0, elim), nan_color=:gray)
+contour!(ax_eyz, y, z, b_yz, levels=15, color=:black)
 
-heatmap!(ax_κxz, x, zκ, κ_xz, colormap=:thermal, colorrange=(0, κlim), nan_color=:gray)
-contour!(ax_κxz, x, z, b_xz, levels=15, color=:black)
+heatmap!(ax_κyz, y, zκ, κ_yz, colormap=:thermal, colorrange=(0, κlim), nan_color=:gray)
+contour!(ax_κyz, y, z, b_yz, levels=15, color=:black)
 
-# heatmap!(ax_uxz, x, z, u_xz, colormap=:balance, colorrange=(-ulim, ulim), nan_color=:gray)
-# contour!(ax_uxz, x, z, b_xz, levels=15, color=:black)
+heatmap!(ax_uyz, y, z, u_yz, colormap=:balance, colorrange=(-ulim, ulim), nan_color=:gray)
+contour!(ax_uyz, y, z, b_yz, levels=15, color=:black)
 
-heatmap!(ax_vxz, x, z, v_xz, colormap=:balance, colorrange=(-vlim, vlim), nan_color=:gray)
-contour!(ax_vxz, x, z, b_xz, levels=15, color=:black)
+# heatmap!(ax_vyz, y, z, v_yz, colormap=:balance, colorrange=(-vlim, vlim), nan_color=:gray)
+# contour!(ax_vyz, y, z, b_yz, levels=15, color=:black)
 
-heatmap!(ax_wxz, x, z, w_xz, colormap=:balance, colorrange=(-wlim, wlim), nan_color=:gray)
-contour!(ax_wxz, x, z, b_xz, levels=15, color=:black)
+heatmap!(ax_wyz, y, z, w_yz, colormap=:balance, colorrange=(-wlim, wlim), nan_color=:gray)
+contour!(ax_wyz, y, z, b_yz, levels=15, color=:black)
 
-heatmap!(ax_Nxz, x, z, N_xz, colormap=:thermal, colorrange=(1e-6, 2e-5), nan_color=:gray)
-contour!(ax_Nxz, x, z, b_xz, levels=15, color=:black)
+heatmap!(ax_Nyz, y, z, N_yz, colormap=:thermal, colorrange=(1e-6, 1e-5), nan_color=:gray)
+contour!(ax_Nyz, y, z, b_yz, levels=15, color=:black)
 
 lines!(ax_bz, b_z1, z)
 lines!(ax_bz, b_z2, z)

From caaebe9af887af6f9164f5d5c4cf02510c154d0f Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 11 Jun 2023 13:25:24 -0400
Subject: [PATCH 278/530] boundary buffer

---
 src/Distributed/interleave_comm_and_comp.jl      |  4 ++--
 ...culate_hydrostatic_free_surface_tendencies.jl |  6 +++---
 src/TurbulenceClosures/TurbulenceClosures.jl     |  6 +++++-
 ...ract_scalar_biharmonic_diffusivity_closure.jl |  2 +-
 .../abstract_scalar_diffusivity_closure.jl       |  2 +-
 src/TurbulenceClosures/closure_tuples.jl         |  4 ++--
 .../anisotropic_minimum_dissipation.jl           |  2 +-
 ...convective_adjustment_vertical_diffusivity.jl |  2 +-
 .../isopycnal_skew_symmetric_diffusivity.jl      |  2 +-
 .../leith_enstrophy_diffusivity.jl               |  2 +-
 .../ri_based_vertical_diffusivity.jl             |  9 +++++----
 .../scalar_biharmonic_diffusivity.jl             | 13 +++++++------
 .../scalar_diffusivity.jl                        | 13 +++++++------
 .../smagorinsky_lilly.jl                         |  2 +-
 .../turbulence_closure_utils.jl                  | 16 ++++++++--------
 15 files changed, 46 insertions(+), 39 deletions(-)

diff --git a/src/Distributed/interleave_comm_and_comp.jl b/src/Distributed/interleave_comm_and_comp.jl
index 4804f371e1..a5b8c3093f 100644
--- a/src/Distributed/interleave_comm_and_comp.jl
+++ b/src/Distributed/interleave_comm_and_comp.jl
@@ -1,7 +1,7 @@
 using Oceananigans: prognostic_fields
 using Oceananigans.Grids: halo_size
 
-function complete_communication_and_compute_boundary(model, grid::DistributedGrid, arch)
+function complete_communication_and_compute_boundary!(model, ::DistributedGrid, arch)
 
     # We iterate over the fields because we have to clear _ALL_ architectures
     # and split explicit variables live on a different grid
@@ -15,7 +15,7 @@ function complete_communication_and_compute_boundary(model, grid::DistributedGri
     return nothing
 end
 
-complete_communication_and_compute_boundary(model, grid::DistributedGrid, arch::BlockingDistributedArch) = nothing
+complete_communication_and_compute_boundary!(model, ::DistributedGrid, ::BlockingDistributedArch) = nothing
 compute_boundary_tendencies!(model) = nothing
 
 interior_tendency_kernel_size(grid::DistributedGrid)    = interior_tendency_kernel_size(grid,    architecture(grid))
diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index ba2ccfaaee..4e11720f34 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -10,7 +10,7 @@ using Oceananigans.Biogeochemistry: update_tendencies!
 import Oceananigans.TimeSteppers: compute_tendencies!
 import Oceananigans: tracer_tendency_kernel_function
 
-import Oceananigans.Distributed: complete_communication_and_compute_boundary
+import Oceananigans.Distributed: complete_communication_and_compute_boundary!
 import Oceananigans.Distributed: interior_tendency_kernel_size, interior_tendency_kernel_offsets
 
 using Oceananigans.ImmersedBoundaries: use_only_active_interior_cells, ActiveCellsIBG, active_linear_index_to_interior_tuple
@@ -26,7 +26,7 @@ function compute_tendencies!(model::HydrostaticFreeSurfaceModel, callbacks)
     # Calculate contributions to momentum and tracer tendencies from fluxes and volume terms in the
     # interior of the domain
     calculate_hydrostatic_free_surface_interior_tendency_contributions!(model)
-    complete_communication_and_compute_boundary(model, model.grid, model.architecture)
+    complete_communication_and_compute_boundary!(model, model.grid, model.architecture)
 
     # Calculate contributions to momentum and tracer tendencies from user-prescribed fluxes across the
     # boundaries of the domain
@@ -49,7 +49,7 @@ function compute_tendencies!(model::HydrostaticFreeSurfaceModel, callbacks)
     return nothing
 end
 
-complete_communication_and_compute_boundary(model, grid, arch) = nothing
+complete_communication_and_compute_boundary!(model, grid, arch) = nothing
 
 using Oceananigans.TurbulenceClosures.CATKEVerticalDiffusivities: FlavorOfCATKE
 using Oceananigans.TurbulenceClosures.MEWSVerticalDiffusivities: MEWS
diff --git a/src/TurbulenceClosures/TurbulenceClosures.jl b/src/TurbulenceClosures/TurbulenceClosures.jl
index 9d3a274bda..42a4557760 100644
--- a/src/TurbulenceClosures/TurbulenceClosures.jl
+++ b/src/TurbulenceClosures/TurbulenceClosures.jl
@@ -49,6 +49,7 @@ using Oceananigans.BuoyancyModels
 using Oceananigans.Utils
 
 using Oceananigans.Architectures: AbstractArchitecture, device
+import Oceananigans.Advection: boundary_buffer, required_halo_size
 
 const VerticallyBoundedGrid{FT} = AbstractGrid{FT, <:Any, <:Any, <:Bounded}
 
@@ -61,7 +62,7 @@ const VerticallyBoundedGrid{FT} = AbstractGrid{FT, <:Any, <:Any, <:Bounded}
 
 Abstract supertype for turbulence closures.
 """
-abstract type AbstractTurbulenceClosure{TimeDiscretization} end
+abstract type AbstractTurbulenceClosure{TimeDiscretization, BoundaryBuffer} end
 
 # Fallbacks
 validate_closure(closure) = closure
@@ -69,6 +70,9 @@ closure_summary(closure) = summary(closure)
 with_tracers(tracers, closure::AbstractTurbulenceClosure) = closure
 calculate_diffusivities!(K, closure::AbstractTurbulenceClosure, args...; kwargs...) = nothing
 
+@inline boundary_buffer(::AbstractTurbulenceClosure{TD, B}) where {TD, B} = B    # B is the closure's `BoundaryBuffer` type parameter
+@inline required_halo_size(::AbstractTurbulenceClosure{TD, B}) where {TD, B} = B # both parameters must be bound by `where`
+
 const ClosureKinda = Union{Nothing, AbstractTurbulenceClosure, AbstractArray{<:AbstractTurbulenceClosure}}
 add_closure_specific_boundary_conditions(closure::ClosureKinda, bcs, args...) = bcs
 
diff --git a/src/TurbulenceClosures/abstract_scalar_biharmonic_diffusivity_closure.jl b/src/TurbulenceClosures/abstract_scalar_biharmonic_diffusivity_closure.jl
index 09b7cc7d45..357970226a 100644
--- a/src/TurbulenceClosures/abstract_scalar_biharmonic_diffusivity_closure.jl
+++ b/src/TurbulenceClosures/abstract_scalar_biharmonic_diffusivity_closure.jl
@@ -5,7 +5,7 @@ using Oceananigans.Grids: peripheral_node
 
 Abstract type for closures with scalar biharmonic diffusivities.
 """
-abstract type AbstractScalarBiharmonicDiffusivity{F} <: AbstractTurbulenceClosure{ExplicitTimeDiscretization} end
+abstract type AbstractScalarBiharmonicDiffusivity{F, N} <: AbstractTurbulenceClosure{ExplicitTimeDiscretization, N} end
 
 @inline formulation(::AbstractScalarBiharmonicDiffusivity{F}) where {F} = F()
 
diff --git a/src/TurbulenceClosures/abstract_scalar_diffusivity_closure.jl b/src/TurbulenceClosures/abstract_scalar_diffusivity_closure.jl
index 3f6ad1a504..569b272f69 100644
--- a/src/TurbulenceClosures/abstract_scalar_diffusivity_closure.jl
+++ b/src/TurbulenceClosures/abstract_scalar_diffusivity_closure.jl
@@ -5,7 +5,7 @@ using Oceananigans.Operators: ℑxyᶠᶠᵃ, ℑxzᶠᵃᶠ, ℑyzᵃᶠᶠ
 
 Abstract type for closures with scalar diffusivities.
 """
-abstract type AbstractScalarDiffusivity{TD, F} <: AbstractTurbulenceClosure{TD} end
+abstract type AbstractScalarDiffusivity{TD, F, N} <: AbstractTurbulenceClosure{TD, N} end
 
 #####
 ##### Formulations
diff --git a/src/TurbulenceClosures/closure_tuples.jl b/src/TurbulenceClosures/closure_tuples.jl
index 791e13fb65..fb4b64e604 100644
--- a/src/TurbulenceClosures/closure_tuples.jl
+++ b/src/TurbulenceClosures/closure_tuples.jl
@@ -70,10 +70,10 @@ end
 
 with_tracers(tracers, closure_tuple::Tuple) = Tuple(with_tracers(tracers, closure) for closure in closure_tuple)
 
-function calculate_diffusivities!(diffusivity_fields_tuple, closure_tuple::Tuple, args...)
+function calculate_diffusivities!(diffusivity_fields_tuple, closure_tuple::Tuple, args...; kwargs...)
     for (α, closure) in enumerate(closure_tuple)
         diffusivity_fields = diffusivity_fields_tuple[α]
-        calculate_diffusivities!(diffusivity_fields, closure, args...)
+        calculate_diffusivities!(diffusivity_fields, closure, args...; kwargs...)
     end
     return nothing
 end
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl b/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl
index 23dd40fd04..58d8a614ca 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl
@@ -7,7 +7,7 @@ Parameters for the "anisotropic minimum dissipation" turbulence closure for larg
 proposed originally by [Rozema15](@cite) and [Abkar16](@cite), then modified by [Verstappen18](@cite),
 and finally described and validated for by [Vreugdenhil18](@cite).
 """
-struct AnisotropicMinimumDissipation{TD, PK, PN, PB} <: AbstractScalarDiffusivity{TD, ThreeDimensionalFormulation}
+struct AnisotropicMinimumDissipation{TD, PK, PN, PB} <: AbstractScalarDiffusivity{TD, ThreeDimensionalFormulation, 2}
     Cν :: PN
     Cκ :: PK
     Cb :: PB
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/convective_adjustment_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/convective_adjustment_vertical_diffusivity.jl
index 31f262aa3c..7c1e3bd14d 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/convective_adjustment_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/convective_adjustment_vertical_diffusivity.jl
@@ -3,7 +3,7 @@ using Oceananigans.AbstractOperations: KernelFunctionOperation
 using Oceananigans.BuoyancyModels: ∂z_b
 using Oceananigans.Operators: ℑzᵃᵃᶜ
 
-struct ConvectiveAdjustmentVerticalDiffusivity{TD, CK, CN, BK, BN} <: AbstractScalarDiffusivity{TD, VerticalFormulation}
+struct ConvectiveAdjustmentVerticalDiffusivity{TD, CK, CN, BK, BN} <: AbstractScalarDiffusivity{TD, VerticalFormulation, 1}
     convective_κz :: CK
     convective_νz :: CN
     background_κz :: BK
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/isopycnal_skew_symmetric_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/isopycnal_skew_symmetric_diffusivity.jl
index 3bb65fb6a8..1ba3350f5b 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/isopycnal_skew_symmetric_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/isopycnal_skew_symmetric_diffusivity.jl
@@ -1,4 +1,4 @@
-struct IsopycnalSkewSymmetricDiffusivity{TD, K, S, M, L} <: AbstractTurbulenceClosure{TD}
+struct IsopycnalSkewSymmetricDiffusivity{TD, K, S, M, L} <: AbstractTurbulenceClosure{TD, 1}
                     κ_skew :: K
                κ_symmetric :: S
           isopycnal_tensor :: M
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl
index 4717d07f27..f3ef8e3665 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl
@@ -4,7 +4,7 @@ using Oceananigans.Fields: AbstractField
 ##### The turbulence closure proposed by Leith
 #####
 
-struct TwoDimensionalLeith{FT, CR, GM, M} <: AbstractScalarDiffusivity{ExplicitTimeDiscretization, ThreeDimensionalFormulation}
+struct TwoDimensionalLeith{FT, CR, GM, M} <: AbstractScalarDiffusivity{ExplicitTimeDiscretization, ThreeDimensionalFormulation, 3}
                   C :: FT
              C_Redi :: CR
                C_GM :: GM
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index f87cdb3210..0190cf437d 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -4,7 +4,7 @@ using Oceananigans.Operators
 using Oceananigans.Operators: ℑzᵃᵃᶜ
 using Oceananigans.Utils: use_only_active_interior_cells
 
-struct RiBasedVerticalDiffusivity{TD, FT, R} <: AbstractScalarDiffusivity{TD, VerticalFormulation}
+struct RiBasedVerticalDiffusivity{TD, FT, R} <: AbstractScalarDiffusivity{TD, VerticalFormulation, 1}
     ν₀  :: FT
     κ₀  :: FT
     κᶜᵃ :: FT
@@ -139,7 +139,8 @@ function DiffusivityFields(grid, tracer_names, bcs, closure::FlavorOfRBVD)
     return (; κ, ν, Ri)
 end
 
-function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; kernel_size = κ_kernel_size(model.grid), kernel_offsets = κ_kernel_offsets(model.grid))
+function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; kernel_size = κ_kernel_size(model.grid, closure), 
+                                                                               kernel_offsets = κ_kernel_offsets(model.grid, closure))
     arch = model.architecture
     grid = model.grid
     clock = model.clock
@@ -256,8 +257,8 @@ end
     Ri = ℑxyᶜᶜᵃ(i, j, k, grid, ℑxyᶠᶠᵃ, diffusivities.Ri)
 
     τ = taper(tapering, Ri, Ri₀, Riᵟ)
-    κᶜ★ = κ₀ * τ
-    κᵘ★ = ν₀ * τ
+    κ★ = κ₀ * τ # κ₀ scaled by the taper τ; used in κⁿ below
+    ν★ = ν₀ * τ # ν₀ scaled by the taper τ; read by `νⁿ = ν★` below
 
     κⁿ = κᶜ + κᵉ + κ★
     νⁿ = ν★
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/scalar_biharmonic_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/scalar_biharmonic_diffusivity.jl
index 524bdd4614..5c4bf11ffe 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/scalar_biharmonic_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/scalar_biharmonic_diffusivity.jl
@@ -6,12 +6,12 @@ using Oceananigans.Utils: prettysummary
 
 Holds viscosity and diffusivities for models with prescribed isotropic diffusivities.
 """
-struct ScalarBiharmonicDiffusivity{F, N, K} <: AbstractScalarBiharmonicDiffusivity{F}
-    ν :: N
+struct ScalarBiharmonicDiffusivity{F, V, K, N} <: AbstractScalarBiharmonicDiffusivity{F, N}
+    ν :: V
     κ :: K
 
-    function ScalarBiharmonicDiffusivity{F}(ν::N, κ::K) where {F, N, K}
-        return new{F, N, K}(ν, κ)
+    function ScalarBiharmonicDiffusivity{F, N}(ν::V, κ::K) where {F, V, K, N}
+        return new{F, V, K, N}(ν, κ)
     end
 end
 
@@ -66,11 +66,12 @@ function ScalarBiharmonicDiffusivity(formulation=ThreeDimensionalFormulation(),
                                      ν=0, κ=0,
                                      discrete_form = false,
                                      loc = (nothing, nothing, nothing),
-                                     parameters = nothing)
+                                     parameters = nothing,
+                                     boundary_buffer = 1)
 
     ν = convert_diffusivity(FT, ν; discrete_form, loc, parameters)
     κ = convert_diffusivity(FT, κ; discrete_form, loc, parameters)
-    return ScalarBiharmonicDiffusivity{typeof(formulation)}(ν, κ)
+    return ScalarBiharmonicDiffusivity{typeof(formulation), boundary_buffer}(ν, κ)
 end
 
 function with_tracers(tracers, closure::ScalarBiharmonicDiffusivity{F}) where {F}
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/scalar_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/scalar_diffusivity.jl
index 5351bb1b08..c3e4d23d41 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/scalar_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/scalar_diffusivity.jl
@@ -1,12 +1,12 @@
 import Oceananigans.Grids: required_halo_size
 using Oceananigans.Utils: prettysummary
 
-struct ScalarDiffusivity{TD, F, N, K} <: AbstractScalarDiffusivity{TD, F}
-    ν :: N
+struct ScalarDiffusivity{TD, F, V, K, N} <: AbstractScalarDiffusivity{TD, F, N}
+    ν :: V
     κ :: K
 
-    function ScalarDiffusivity{TD, F}(ν::N, κ::K) where {TD, F, N, K}
-        return new{TD, F, N, K}(ν, κ)
+    function ScalarDiffusivity{TD, F, N}(ν::V, κ::K) where {TD, F, V, K, N}
+        return new{TD, F, V, K, N}(ν, κ)
     end
 end
 
@@ -91,7 +91,8 @@ function ScalarDiffusivity(time_discretization=ExplicitTimeDiscretization(),
                            ν=0, κ=0,
                            discrete_form = false,
                            loc = (nothing, nothing, nothing),
-                           parameters = nothing)
+                           parameters = nothing,
+                           boundary_buffer = 1)
 
     if formulation == HorizontalFormulation() && time_discretization == VerticallyImplicitTimeDiscretization()
         throw(ArgumentError("VerticallyImplicitTimeDiscretization is only supported for `VerticalFormulation` or `ThreeDimensionalFormulation`"))
@@ -100,7 +101,7 @@ function ScalarDiffusivity(time_discretization=ExplicitTimeDiscretization(),
     κ = convert_diffusivity(FT, κ; discrete_form, loc, parameters)
     ν = convert_diffusivity(FT, ν; discrete_form, loc, parameters)
 
-    return ScalarDiffusivity{typeof(time_discretization), typeof(formulation)}(ν, κ)
+    return ScalarDiffusivity{typeof(time_discretization), typeof(formulation), boundary_buffer}(ν, κ)
 end
 
 # Explicit default
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl b/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl
index 2c7cf9501c..b68b612230 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl
@@ -3,7 +3,7 @@
 ##### We also call this 'Constant Smagorinsky'.
 #####
 
-struct SmagorinskyLilly{TD, FT, P} <: AbstractScalarDiffusivity{TD, ThreeDimensionalFormulation}
+struct SmagorinskyLilly{TD, FT, P} <: AbstractScalarDiffusivity{TD, ThreeDimensionalFormulation, 2}
      C :: FT
     Cb :: FT
     Pr :: P
diff --git a/src/TurbulenceClosures/turbulence_closure_utils.jl b/src/TurbulenceClosures/turbulence_closure_utils.jl
index 1590b7f6f2..68019853b0 100644
--- a/src/TurbulenceClosures/turbulence_closure_utils.jl
+++ b/src/TurbulenceClosures/turbulence_closure_utils.jl
@@ -37,23 +37,23 @@ end
 end
 
 # extend κ kernel to compute also the boundaries
-@inline function κ_kernel_size(grid) 
+@inline function κ_kernel_size(grid, ::AbstractTurbulenceClosure{TD, B}) where{TD, B}
     Nx, Ny, Nz = size(grid)
     Tx, Ty, Tz = topology(grid)
 
-    Ax = Tx == Flat ? Nx : Nx + 2 
-    Ay = Ty == Flat ? Ny : Ny + 2 
-    Az = Tz == Flat ? Nz : Nz + 2 
+    Ax = Tx == Flat ? Nx : Nx + 2B 
+    Ay = Ty == Flat ? Ny : Ny + 2B 
+    Az = Tz == Flat ? Nz : Nz + 2B 
 
     return (Ax, Ay, Az)
 end
 
-@inline function κ_kernel_offsets(grid)
+@inline function κ_kernel_offsets(grid, ::AbstractTurbulenceClosure{TD, B}) where{TD, B}
     Tx, Ty, Tz = topology(grid)
 
-    Ax = Tx == Flat ? 0 : - 1
-    Ay = Ty == Flat ? 0 : - 1 
-    Az = Tz == Flat ? 0 : - 1 
+    Ax = Tx == Flat ? 0 : - B
+    Ay = Ty == Flat ? 0 : - B 
+    Az = Tz == Flat ? 0 : - B 
 
     return (Ax, Ay, Az)
 end

From ea7d550802f80a0a54f05437d411196f106007c4 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 11 Jun 2023 13:29:54 -0400
Subject: [PATCH 279/530] bump

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 15898a3e54..796296825f 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "Oceananigans"
 uuid = "9e8cae18-63c1-5223-a75c-80ca9d6e9a09"
 authors = ["Climate Modeling Alliance and contributors"]
-version = "0.82.0"
+version = "0.83.0"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"

From 22910ecb96231cd01a934ccfffee21b2656d986a Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 11 Jun 2023 14:26:24 -0400
Subject: [PATCH 280/530] introduce KernelParameters

---
 src/TurbulenceClosures/TurbulenceClosures.jl |  4 ++--
 src/TurbulenceClosures/closure_tuples.jl     | 18 ++++++++++++++++++
 src/Utils/kernel_launching.jl                | 10 ++++++++++
 3 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/src/TurbulenceClosures/TurbulenceClosures.jl b/src/TurbulenceClosures/TurbulenceClosures.jl
index 42a4557760..18c870956b 100644
--- a/src/TurbulenceClosures/TurbulenceClosures.jl
+++ b/src/TurbulenceClosures/TurbulenceClosures.jl
@@ -70,8 +70,8 @@ closure_summary(closure) = summary(closure)
 with_tracers(tracers, closure::AbstractTurbulenceClosure) = closure
 calculate_diffusivities!(K, closure::AbstractTurbulenceClosure, args...; kwargs...) = nothing
 
-@inline boundary_buffer(::AbstractTurbulenceClosure{TD, B}) where B = B
-@inline required_halo_size(::AbstractTurbulenceClosure{TD, B}) where B = B
+@inline boundary_buffer(::AbstractTurbulenceClosure{TD, B}) where {TD, B} = B
+@inline required_halo_size(::AbstractTurbulenceClosure{TD, B}) where {TD, B} = B
 
 const ClosureKinda = Union{Nothing, AbstractTurbulenceClosure, AbstractArray{<:AbstractTurbulenceClosure}}
 add_closure_specific_boundary_conditions(closure::ClosureKinda, bcs, args...) = bcs
diff --git a/src/TurbulenceClosures/closure_tuples.jl b/src/TurbulenceClosures/closure_tuples.jl
index fb4b64e604..914b64d216 100644
--- a/src/TurbulenceClosures/closure_tuples.jl
+++ b/src/TurbulenceClosures/closure_tuples.jl
@@ -86,6 +86,24 @@ function add_closure_specific_boundary_conditions(closure_tuple::Tuple, bcs, arg
     return bcs
 end
 
+@inline function κ_kernel_size(grid, closure_tuple::Tuple)
+    kernel_size = (0, 0, 0)
+    for closure in closure_tuple
+        kernel_size = max.(kernel_size, κ_kernel_size(grid, closure))
+    end
+
+    return kernel_size
+end
+
+@inline function κ_kernel_offsets(grid, closure_tuple::Tuple) # widest (most negative) offsets over all closures in the tuple
+    kernel_offsets = (0, 0, 0)
+    for closure in closure_tuple
+        kernel_offsets = min.(kernel_offsets, κ_kernel_offsets(grid, closure)) # per-closure offsets are 0 or -B, so `min` keeps the largest buffer
+    end
+
+    return kernel_offsets
+end
+
 #####
 ##### Compiler-inferrable time_discretization for tuples
 #####
diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index 93254cd5f9..41628bac50 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -106,3 +106,13 @@ end
 @inline launch!(arch, grid, ::Val{workspec}, args...; kwargs...) where workspec =
     launch!(arch, grid, workspec, args...; kwargs...)
 
+# extend w kernel to compute also the boundaries
+# If Flat, do not calculate on halos!
+
+using Oceananigans.Operators: XFlatGrid, YFlatGrid
+using Oceananigans.Grids: topology
+
+struct KernelParameters{S, O} end
+
+KernelParameters(grid::AbstractGrid)          = KernelParameters{kernel_size(grid),  kernel_offsets(grid)}()
+KernelParameters(size::Tuple, offsets::Tuple) = KernelParameters{size, offsets}()

From a9d1b0f4eb0ed095799c5b5a37e2a4e3f642ea36 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 11 Jun 2023 17:14:23 -0400
Subject: [PATCH 281/530] adjust CATKE

---
 .../CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
index 0e3aabfda1..e92828c28d 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
@@ -44,7 +44,7 @@ import Oceananigans.TurbulenceClosures:
     diffusive_flux_y,
     diffusive_flux_z
 
-struct CATKEVerticalDiffusivity{TD, CL, FT, TKE} <: AbstractScalarDiffusivity{TD, VerticalFormulation}
+struct CATKEVerticalDiffusivity{TD, CL, FT, TKE} <: AbstractScalarDiffusivity{TD, VerticalFormulation, 2}
     mixing_length :: CL
     turbulent_kinetic_energy_equation :: TKE
     maximum_diffusivity :: FT

From 24858eaeaf8757d5caaa310cd784e4f3cf7d664d Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 11 Jun 2023 17:14:41 -0400
Subject: [PATCH 282/530] adjusting mews

---
 .../mews_vertical_diffusivity.jl                                | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/mews_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/mews_vertical_diffusivity.jl
index ba6c72b656..9f7fa234da 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/mews_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/mews_vertical_diffusivity.jl
@@ -40,7 +40,7 @@ import Oceananigans.TurbulenceClosures:
     diffusive_flux_x,
     diffusive_flux_y
 
-struct MEWSVerticalDiffusivity{TD, FT} <: AbstractScalarDiffusivity{TD, VerticalFormulation}
+struct MEWSVerticalDiffusivity{TD, FT} <: AbstractScalarDiffusivity{TD, VerticalFormulation, 1}
     Cʰ  :: FT
     Cᴷʰ :: FT
     Cᴷᶻ :: FT

From 89ca4eb95b5f25f6405b4e5c8c63019342fedf1c Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 11 Jun 2023 17:34:35 -0400
Subject: [PATCH 283/530] new kernel abstraction

---
 Manifest.toml                                 |  6 +-
 Project.toml                                  |  1 -
 ...ate_hydrostatic_free_surface_tendencies.jl | 71 ++++++++-----------
 .../compute_w_from_continuity.jl              | 13 ++--
 .../recompute_boundary_tendencies.jl          | 16 +++--
 .../update_hydrostatic_pressure.jl            | 12 ++--
 .../shallow_water_diffusion_operators.jl      |  9 +--
 src/TurbulenceClosures/closure_tuples.jl      |  2 +
 src/Utils/Utils.jl                            |  2 +-
 src/Utils/kernel_launching.jl                 | 31 ++++----
 10 files changed, 78 insertions(+), 85 deletions(-)

diff --git a/Manifest.toml b/Manifest.toml
index 952c7f92ae..750c7ba4fa 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -2,7 +2,7 @@
 
 julia_version = "1.8.0"
 manifest_format = "2.0"
-project_hash = "efa1b3c0f878c0fe77caba96f053e1145d126cd0"
+project_hash = "b61348c5ba4009d3da1a3d8c47bdeb84513faa2c"
 
 [[deps.AbstractFFTs]]
 deps = ["ChainRulesCore", "LinearAlgebra"]
@@ -277,7 +277,9 @@ version = "1.12.0"
 
 [[deps.KernelAbstractions]]
 deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"]
-git-tree-sha1 = "47be64f040a7ece575c2b5f53ca6da7b548d69f4"
+git-tree-sha1 = "bbb7ac4a3194c0d1561b9dea2a20e8f1ab68f709"
+repo-rev = "main"
+repo-url = "https://github.com/simone-silvestri/KernelAbstractions.jl"
 uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 version = "0.9.4"
 
diff --git a/Project.toml b/Project.toml
index 796296825f..330b1448a0 100644
--- a/Project.toml
+++ b/Project.toml
@@ -45,7 +45,6 @@ Glob = "1.3"
 IncompleteLU = "0.2"
 IterativeSolvers = "0.9"
 JLD2 = "^0.4"
-KernelAbstractions = "0.9"
 MPI = "0.16, 0.17, 0.18, 0.19, 0.20"
 NCDatasets = "0.12.10"
 OffsetArrays = "1.4"
diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index 4e11720f34..b89293609b 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -1,7 +1,7 @@
 import Oceananigans.TimeSteppers: compute_tendencies!
 import Oceananigans: tracer_tendency_kernel_function
 
-using Oceananigans.Utils: work_layout
+using Oceananigans.Utils: work_layout, KernelParameters
 using Oceananigans.Fields: immersed_boundary_condition
 using Oceananigans.Grids: halo_size
 using Oceananigans: fields, prognostic_fields, TendencyCallsite, UpdateStateCallsite
@@ -96,9 +96,11 @@ function calculate_hydrostatic_free_surface_interior_tendency_contributions!(mod
     top_tracer_bcs = top_tracer_boundary_conditions(grid, model.tracers)
     only_active_cells = use_only_active_interior_cells(grid)
 
-    kernel_size    =   interior_tendency_kernel_size(grid)
+    kernel_size = interior_tendency_kernel_size(grid)
     kernel_offsets = interior_tendency_kernel_offsets(grid)
 
+    kernel_parameters = KernelParameters(kernel_size, kernel_offsets)
+
     for (tracer_index, tracer_name) in enumerate(propertynames(model.tracers))
         c_tendency    = model.timestepper.Gⁿ[tracer_name]
         c_advection   = model.advection[tracer_name]
@@ -123,10 +125,9 @@ function calculate_hydrostatic_free_surface_interior_tendency_contributions!(mod
                      c_forcing,
                      model.clock)
 
-        launch!(arch, grid, kernel_size,
+        launch!(arch, grid, kernel_parameters,
                 tendency_kernel!,
                 c_tendency,
-                kernel_offsets,
                 grid,
                 args;
                 only_active_cells)
@@ -147,7 +148,7 @@ function apply_flux_bcs!(Gcⁿ, c, arch, args...)
     return nothing
 end
 
-function calculate_free_surface_tendency!(grid, model, kernel_size, kernel_offsets)
+function calculate_free_surface_tendency!(grid, model, kernel_parameters)
 
     arch = architecture(grid)
 
@@ -158,8 +159,8 @@ function calculate_free_surface_tendency!(grid, model, kernel_size, kernel_offse
                  model.forcing,
                  model.clock)
 
-    launch!(arch, grid, kernel_size,
-            calculate_hydrostatic_free_surface_Gη!, model.timestepper.Gⁿ.η, kernel_offsets, 
+    launch!(arch, grid, kernel_parameters,
+            calculate_hydrostatic_free_surface_Gη!, model.timestepper.Gⁿ.η, 
             grid, args)
 
     return nothing
@@ -196,18 +197,20 @@ function calculate_hydrostatic_momentum_tendencies!(model, velocities)
     
     only_active_cells = use_only_active_interior_cells(grid)
 
-    kernel_size    =   interior_tendency_kernel_size(grid)
+    kernel_size = interior_tendency_kernel_size(grid)
     kernel_offsets = interior_tendency_kernel_offsets(grid)
     
-    launch!(arch, grid, kernel_size,
-            calculate_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, kernel_offsets, grid, u_kernel_args;
+    kernel_parameters = KernelParameters(kernel_size, kernel_offsets)
+
+    launch!(arch, grid, kernel_parameters,
+            calculate_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, grid, u_kernel_args;
             only_active_cells)
 
-    launch!(arch, grid, kernel_size,
-            calculate_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, kernel_offsets, grid, v_kernel_args;
+    launch!(arch, grid, kernel_parameters,
+            calculate_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, grid, v_kernel_args;
             only_active_cells)
 
-    calculate_free_surface_tendency!(grid, model, :xy, (0, 0))
+    calculate_free_surface_tendency!(grid, model, KernelParameters(:xy, (0, 0)))
 
     return nothing
 end
@@ -236,30 +239,24 @@ end
 #####
 
 """ Calculate the right-hand-side of the u-velocity equation. """
-@kernel function calculate_hydrostatic_free_surface_Gu!(Gu, offs, grid, args)
+@kernel function calculate_hydrostatic_free_surface_Gu!(Gu, grid, args)
     i, j, k = @index(Global, NTuple)
-    i′ = i + offs[1] 
-    j′ = j + offs[2] 
-    k′ = k + offs[3]
-    @inbounds Gu[i′, j′, k′] = hydrostatic_free_surface_u_velocity_tendency(i′, j′, k′, grid, args...)
+    @inbounds Gu[i, j, k] = hydrostatic_free_surface_u_velocity_tendency(i, j, k, grid, args...)
 end
 
-@kernel function calculate_hydrostatic_free_surface_Gu!(Gu, offs, grid::ActiveCellsIBG, args)
+@kernel function calculate_hydrostatic_free_surface_Gu!(Gu, grid::ActiveCellsIBG, args)
     idx = @index(Global, Linear)
     i, j, k = active_linear_index_to_interior_tuple(idx, grid)
     @inbounds Gu[i, j, k] = hydrostatic_free_surface_u_velocity_tendency(i, j, k, grid, args...)
 end
 
 """ Calculate the right-hand-side of the v-velocity equation. """
-@kernel function calculate_hydrostatic_free_surface_Gv!(Gv, offs, grid, args)
+@kernel function calculate_hydrostatic_free_surface_Gv!(Gv, grid, args)
     i, j, k = @index(Global, NTuple)
-    i′ = i + offs[1] 
-    j′ = j + offs[2] 
-    k′ = k + offs[3]
-    @inbounds Gv[i′, j′, k′] = hydrostatic_free_surface_v_velocity_tendency(i′, j′, k′, grid, args...)
+    @inbounds Gv[i, j, k] = hydrostatic_free_surface_v_velocity_tendency(i, j, k, grid, args...)
 end
 
-@kernel function calculate_hydrostatic_free_surface_Gv!(Gv, offs, grid::ActiveCellsIBG, args)
+@kernel function calculate_hydrostatic_free_surface_Gv!(Gv, grid::ActiveCellsIBG, args)
     idx = @index(Global, Linear)
     i, j, k = active_linear_index_to_interior_tuple(idx, grid)
     @inbounds Gv[i, j, k] = hydrostatic_free_surface_v_velocity_tendency(i, j, k, grid, args...)
@@ -270,30 +267,24 @@ end
 #####
 
 """ Calculate the right-hand-side of the tracer advection-diffusion equation. """
-@kernel function calculate_hydrostatic_free_surface_Gc!(Gc, offs, grid, args)
+@kernel function calculate_hydrostatic_free_surface_Gc!(Gc, grid, args)
     i, j, k = @index(Global, NTuple)
-    i′ = i + offs[1] 
-    j′ = j + offs[2] 
-    k′ = k + offs[3]
-    @inbounds Gc[i, j, k] =  hydrostatic_free_surface_tracer_tendency(i′, j′, k′, grid, args...)
+    @inbounds Gc[i, j, k] =  hydrostatic_free_surface_tracer_tendency(i, j, k, grid, args...)
 end
 
-@kernel function calculate_hydrostatic_free_surface_Gc!(Gc, offs, grid::ActiveCellsIBG, args)
+@kernel function calculate_hydrostatic_free_surface_Gc!(Gc, grid::ActiveCellsIBG, args)
     idx = @index(Global, Linear)
     i, j, k = active_linear_index_to_ntuple(idx, grid)
     @inbounds Gc[i, j, k] =  hydrostatic_free_surface_tracer_tendency(i, j, k, grid, args...)
 end
 
 """ Calculate the right-hand-side of the subgrid scale energy equation. """
-@kernel function calculate_hydrostatic_free_surface_Ge!(Ge, offs, grid, args)
+@kernel function calculate_hydrostatic_free_surface_Ge!(Ge, grid, args)
     i, j, k = @index(Global, NTuple)
-    i′ = i + offs[1] 
-    j′ = j + offs[2] 
-    k′ = k + offs[3]
-    @inbounds Ge[i′, j′, k′] =  hydrostatic_turbulent_kinetic_energy_tendency(i′, j′, k′, grid, args...)
+    @inbounds Ge[i, j, k] =  hydrostatic_turbulent_kinetic_energy_tendency(i, j, k, grid, args...)
 end
 
-@kernel function calculate_hydrostatic_free_surface_Ge!(Ge, offs, grid::ActiveCellsIBG, args)
+@kernel function calculate_hydrostatic_free_surface_Ge!(Ge, grid::ActiveCellsIBG, args)
     idx = @index(Global, Linear)
     i, j, k = active_linear_index_to_ntuple(idx, grid)
     @inbounds Ge[i, j, k] =  hydrostatic_turbulent_kinetic_energy_tendency(i, j, k, grid, args...)
@@ -304,9 +295,7 @@ end
 #####
 
 """ Calculate the right-hand-side of the free surface displacement (``η``) equation. """
-@kernel function calculate_hydrostatic_free_surface_Gη!(Gη, offs, grid, args)
+@kernel function calculate_hydrostatic_free_surface_Gη!(Gη, grid, args)
     i, j = @index(Global, NTuple)
-    i′ = i + offs[1]
-    j′ = j + offs[2]
-    @inbounds Gη[i′, j′, grid.Nz+1] = free_surface_tendency(i′, j′, grid, args...)
+    @inbounds Gη[i, j, grid.Nz+1] = free_surface_tendency(i, j, grid, args...)
 end
diff --git a/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl b/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
index 09f7796861..22ad1c6c93 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
@@ -13,18 +13,15 @@ w^{n+1} = -∫ [∂/∂x (u^{n+1}) + ∂/∂y (v^{n+1})] dz
 """
 compute_w_from_continuity!(model) = compute_w_from_continuity!(model.velocities, model.architecture, model.grid)
 
-compute_w_from_continuity!(velocities, arch, grid; kernel_size = w_kernel_size(grid), kernel_offsets = w_kernel_offsets(grid)) = 
-    launch!(arch, grid, kernel_size, _compute_w_from_continuity!, velocities, kernel_offsets, grid)
+compute_w_from_continuity!(velocities, arch, grid; parameters = KernelParameters(w_kernel_size(grid), w_kernel_offsets(grid))) = 
+    launch!(arch, grid, parameters, _compute_w_from_continuity!, velocities, grid)
 
-@kernel function _compute_w_from_continuity!(U, offs, grid)
+@kernel function _compute_w_from_continuity!(U, grid)
     i, j = @index(Global, NTuple)
 
-    i′ = i + offs[1] 
-    j′ = j + offs[2] 
-
-    U.w[i′, j′, 1] = 0
+    U.w[i, j, 1] = 0
     @unroll for k in 2:grid.Nz+1
-        @inbounds U.w[i′, j′, k] = U.w[i′, j′, k-1] - Δzᶜᶜᶜ(i′, j′, k-1, grid) * div_xyᶜᶜᶜ(i′, j′, k-1, grid, U.u, U.v)
+        @inbounds U.w[i, j, k] = U.w[i, j, k-1] - Δzᶜᶜᶜ(i, j, k-1, grid) * div_xyᶜᶜᶜ(i, j, k-1, grid, U.u, U.v)
     end
 end
 
diff --git a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
index f6cad5ca07..538b08979c 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
@@ -31,13 +31,13 @@ function compute_boundary_tendencies!(model::HydrostaticFreeSurfaceModel)
     v_kernel_args = tuple(start_momentum_kernel_args..., v_immersed_bc, end_momentum_kernel_args...)
 
     for (kernel_size, kernel_offsets) in zip(sizes, offsets)
-        launch!(arch, grid, kernel_size,
+        launch!(arch, grid, KernelParameters(kernel_size, kernel_offsets),
                 calculate_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, kernel_offsets, grid, u_kernel_args)
     
-        launch!(arch, grid, kernel_size,
+        launch!(arch, grid, KernelParameters(kernel_size, kernel_offsets),
                 calculate_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, kernel_offsets, grid, v_kernel_args)
         
-        calculate_free_surface_tendency!(grid, model, kernel_size[1:2], kernel_offsets[1:2])
+        calculate_free_surface_tendency!(grid, model, KernelParameters(kernel_size[1:2], kernel_offsets[1:2])) # η is two-dimensional: keep only the x, y entries
     end
 
     top_tracer_bcs = top_tracer_boundary_conditions(grid, model.tracers)
@@ -67,7 +67,7 @@ function compute_boundary_tendencies!(model::HydrostaticFreeSurfaceModel)
                      model.clock)
 
         for (kernel_size, kernel_offsets) in zip(sizes, offsets)
-            launch!(arch, grid, kernel_size,
+            launch!(arch, grid, KernelParameters(kernel_size, kernel_offsets),
                     tendency_kernel!, c_tendency, kernel_offsets, grid, args)
         end
     end
@@ -78,19 +78,21 @@ function recompute_auxiliaries!(model, grid, arch)
     sizes, offs = size_w_kernel(grid, arch)
 
     for (kernel_size, kernel_offsets) in zip(sizes, offs)
-        compute_w_from_continuity!(model.velocities, arch, grid; kernel_size, kernel_offsets)
+        compute_w_from_continuity!(model.velocities, arch, grid; parameters = KernelParameters(kernel_size, kernel_offsets))
     end
 
     sizes, offs = size_p_kernel(grid, arch)
 
     for (kernel_size, kernel_offsets) in zip(sizes, offs)
-        update_hydrostatic_pressure!(model.pressure.pHY′, arch, grid, model.buoyancy, model.tracers; kernel_size, kernel_offsets)
+        update_hydrostatic_pressure!(model.pressure.pHY′, arch, grid, model.buoyancy, model.tracers; 
+                                     parameters = KernelParameters(kernel_size, kernel_offsets))
     end
 
     sizes, offs = size_κ_kernel(grid, arch)
 
     for (kernel_size, kernel_offsets) in zip(sizes, offs)
-        calculate_diffusivities!(model.diffusivity_fields, model.closure, model; kernel_size, kernel_offsets)
+        calculate_diffusivities!(model.diffusivity_fields, model.closure, model;
+                                 parameters = KernelParameters(kernel_size, kernel_offsets))
     end
 end
 
diff --git a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
index 3e2c87879e..627050366c 100644
--- a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
+++ b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
@@ -8,9 +8,7 @@ the `buoyancy_perturbationᶜᶜᶜ` downwards:
     `pHY′ = ∫ buoyancy_perturbationᶜᶜᶜ dz` from `z=0` down to `z=-Lz`
 """
 @kernel function _update_hydrostatic_pressure!(pHY′, offs, grid, buoyancy, C)
-    i′, j′ = @index(Global, NTuple)
-    i = i′ + offs[1] 
-    j = j′ + offs[2] 
+    i, j = @index(Global, NTuple)
 
     @inbounds pHY′[i, j, grid.Nz] = - z_dot_g_bᶜᶜᶠ(i, j, grid.Nz+1, grid, buoyancy, C) * Δzᶜᶜᶠ(i, j, grid.Nz+1, grid)
 
@@ -27,11 +25,11 @@ update_hydrostatic_pressure!(grid, model) = update_hydrostatic_pressure!(model.p
 const PCB = PartialCellBottom
 const PCBIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:PCB}
 
-update_hydrostatic_pressure!(pHY′, arch, ibg::PCBIBG, buoyancy, tracers; kernel_size = p_kernel_size(grid), kernel_offsets = p_kernel_offsets(grid)) =
-    update_hydrostatic_pressure!(pHY′, arch, ibg.underlying_grid, buoyancy, tracers; kernel_size, kernel_offsets)
+update_hydrostatic_pressure!(pHY′, arch, ibg::PCBIBG, buoyancy, tracers; parameters = KernelParameters(p_kernel_size(ibg.underlying_grid), p_kernel_offsets(ibg.underlying_grid))) =
+    update_hydrostatic_pressure!(pHY′, arch, ibg.underlying_grid, buoyancy, tracers; parameters) # partial-cell grids forward to the underlying grid
 
-update_hydrostatic_pressure!(pHY′, arch, grid, buoyancy, tracers; kernel_size = p_kernel_size(grid), kernel_offsets = p_kernel_offsets(grid)) =  
-    launch!(arch, grid, kernel_size, _update_hydrostatic_pressure!, pHY′, kernel_offsets, grid, buoyancy, tracers)
+update_hydrostatic_pressure!(pHY′, arch, grid, buoyancy, tracers; parameters = KernelParameters(p_kernel_size(grid), p_kernel_offsets(grid))) =  
+    launch!(arch, grid, parameters, _update_hydrostatic_pressure!, pHY′, grid, buoyancy, tracers)
 
 using Oceananigans.Grids: topology
 
diff --git a/src/Models/ShallowWaterModels/shallow_water_diffusion_operators.jl b/src/Models/ShallowWaterModels/shallow_water_diffusion_operators.jl
index ed2558a18e..885f96deb6 100644
--- a/src/Models/ShallowWaterModels/shallow_water_diffusion_operators.jl
+++ b/src/Models/ShallowWaterModels/shallow_water_diffusion_operators.jl
@@ -20,9 +20,10 @@ import Oceananigans.TurbulenceClosures:
                         calc_nonlinear_νᶜᶜᶜ,
                         νᶜᶜᶜ
 
-struct ShallowWaterScalarDiffusivity{N, X} <: AbstractScalarDiffusivity{ExplicitTimeDiscretization, ThreeDimensionalFormulation}
-    ν :: N
+struct ShallowWaterScalarDiffusivity{V, X, N} <: AbstractScalarDiffusivity{ExplicitTimeDiscretization, ThreeDimensionalFormulation, N}
+    ν :: V
     ξ :: X
+    ShallowWaterScalarDiffusivity{N}(ν::V, ξ::X) where {N, V, X} = new{V, X, N}(ν, ξ) # order must match the struct's {V, X, N}
 end
 
 """
@@ -39,10 +40,10 @@ With the `VectorInvariantFormulation()` (that evolves ``u`` and ``v``) we comput
 ``h^{-1} 𝛁(ν h 𝛁 t)``, while with the `ConservativeFormulation()` (that evolves
 ``u h`` and ``v h``) we compute ``𝛁 (ν h 𝛁 t)``.
 """
-function ShallowWaterScalarDiffusivity(FT::DataType=Float64; ν=0, ξ=0, discrete_form=false)
+function ShallowWaterScalarDiffusivity(FT::DataType=Float64; ν=0, ξ=0, discrete_form=false, boundary_buffer = 1)
     ν = convert_diffusivity(FT, ν; discrete_form)
     ξ = convert_diffusivity(FT, ξ; discrete_form)
-    return ShallowWaterScalarDiffusivity(ν, ξ)
+    return ShallowWaterScalarDiffusivity{boundary_buffer}(ν, ξ)
 end
 
 # We have no tracers in the shallow water diffusivity
diff --git a/src/TurbulenceClosures/closure_tuples.jl b/src/TurbulenceClosures/closure_tuples.jl
index 914b64d216..40a4a91e96 100644
--- a/src/TurbulenceClosures/closure_tuples.jl
+++ b/src/TurbulenceClosures/closure_tuples.jl
@@ -86,6 +86,8 @@ function add_closure_specific_boundary_conditions(closure_tuple::Tuple, bcs, arg
     return bcs
 end
 
+KernelParameters(grid::AbstractGrid, closure) = KernelParameters(κ_kernel_size(grid, closure), κ_kernel_offsets(grid, closure))
+
 @inline function κ_kernel_size(grid, closure_tuple::Tuple)
     kernel_size = (0, 0, 0)
     for closure in closure_tuple
diff --git a/src/Utils/Utils.jl b/src/Utils/Utils.jl
index 2ce6dce916..eb97ac76d1 100644
--- a/src/Utils/Utils.jl
+++ b/src/Utils/Utils.jl
@@ -1,6 +1,6 @@
 module Utils
 
-export launch_config, work_layout, launch!
+export launch_config, work_layout, launch!, KernelParameters
 export prettytime, pretty_filesize
 export tupleit, parenttuple, datatuple, datatuples
 export validate_intervals, time_to_run
diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index 41628bac50..57a28adf92 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -4,6 +4,17 @@
 
 using Oceananigans.Architectures
 using Oceananigans.Grids
+using Oceananigans.Grids: AbstractGrid
+
+struct KernelParameters{S, O} end
+
+KernelParameters(size, offsets) = KernelParameters{size, offsets}()
+
+worksize(::KernelParameters{S}) where S = S
+offsets(::KernelParameters{S, O}) where {S, O} = O
+
+offsets(workspec)  = nothing
+worksize(workspec) = workspec
 
 flatten_reduced_dimensions(worksize, dims) = Tuple(i ∈ dims ? 1 : worksize[i] for i = 1:3)
 
@@ -80,22 +91,25 @@ function launch!(arch, grid, workspec, kernel!, kernel_args...;
                  only_active_cells = nothing,
                  kwargs...)
 
-    workgroup, worksize = work_layout(grid, workspec;
+    workgroup, worksize = work_layout(grid, worksize(workspec);
                                       include_right_boundaries,
                                       reduced_dimensions,
                                       location)
 
+    offset = offsets(workspec)
+
     if !isnothing(only_active_cells)
         workgroup, worksize = active_cells_work_layout(worksize, only_active_cells, grid) 
+        offset = nothing
     end
 
     if worksize == 0
         return nothing
     end
     
-    loop! = kernel!(Architectures.device(arch), workgroup, worksize)
+    loop! = kernel!(Architectures.device(arch), workgroup, worksize, offset)
 
-    @debug "Launching kernel $kernel! with worksize $worksize"
+    @debug "Launching kernel $kernel! with worksize $worksize and offsets $offset"
 
     loop!(kernel_args...)
 
@@ -105,14 +119,3 @@ end
 # When dims::Val
 @inline launch!(arch, grid, ::Val{workspec}, args...; kwargs...) where workspec =
     launch!(arch, grid, workspec, args...; kwargs...)
-
-# extend w kernel to compute also the boundaries
-# If Flat, do not calculate on halos!
-
-using Oceananigans.Operators: XFlatGrid, YFlatGrid
-using Oceananigans.Grids: topology
-
-struct KernelParameters{S, O} end
-
-KernelParameters(grid::AbstractGrid)          = KernelParameters{kernel_size(grid),  kernel_offsets(grid)}()
-KernelParameters(size::Tuple, offsets::Tuple) = KernelParameters{size, offsets}()

From 9bda1e0de4ce69dc6087e54871f7c913e2bc61d4 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 11 Jun 2023 17:41:47 -0400
Subject: [PATCH 284/530] offsets in BC

---
 src/BoundaryConditions/BoundaryConditions.jl  |  2 +-
 src/BoundaryConditions/fill_halo_regions.jl   | 30 +++++-------
 .../fill_halo_regions_flux.jl                 | 48 +++++++------------
 .../fill_halo_regions_open.jl                 | 30 +++++-------
 .../fill_halo_regions_periodic.jl             | 36 ++++++--------
 5 files changed, 58 insertions(+), 88 deletions(-)

diff --git a/src/BoundaryConditions/BoundaryConditions.jl b/src/BoundaryConditions/BoundaryConditions.jl
index 13dd9a6e19..fabedd0b2c 100644
--- a/src/BoundaryConditions/BoundaryConditions.jl
+++ b/src/BoundaryConditions/BoundaryConditions.jl
@@ -14,7 +14,7 @@ using CUDA
 using KernelAbstractions: @index, @kernel
 
 using Oceananigans.Architectures: CPU, GPU, device
-using Oceananigans.Utils: work_layout, launch!
+using Oceananigans.Utils: work_layout, launch!, KernelParameters
 using Oceananigans.Operators: Ax, Ay, Az, volume
 using Oceananigans.Grids
 
diff --git a/src/BoundaryConditions/fill_halo_regions.jl b/src/BoundaryConditions/fill_halo_regions.jl
index 754393dbe3..7ef16a9327 100644
--- a/src/BoundaryConditions/fill_halo_regions.jl
+++ b/src/BoundaryConditions/fill_halo_regions.jl
@@ -163,28 +163,22 @@ fill_first(bc1, bc2)               = true
 ##### General fill_halo! kernels
 #####
 
-@kernel function _fill_west_and_east_halo!(c, west_bc, east_bc, offset, loc, grid, args) 
+@kernel function _fill_west_and_east_halo!(c, west_bc, east_bc, loc, grid, args) 
     j, k = @index(Global, NTuple)
-    j′ = j + offset[1]
-    k′ = k + offset[2]
-    _fill_west_halo!(j′, k′, grid, c, west_bc, loc, args...)
-    _fill_east_halo!(j′, k′, grid, c, east_bc, loc, args...)
+    _fill_west_halo!(j, k, grid, c, west_bc, loc, args...)
+    _fill_east_halo!(j, k, grid, c, east_bc, loc, args...)
 end
 
-@kernel function _fill_south_and_north_halo!(c, south_bc, north_bc, offset, loc, grid, args)
+@kernel function _fill_south_and_north_halo!(c, south_bc, north_bc, loc, grid, args)
     i, k = @index(Global, NTuple)
-    i′ = i + offset[1]
-    k′ = k + offset[2]
-    _fill_south_halo!(i′, k′, grid, c, south_bc, loc, args...)
-    _fill_north_halo!(i′, k′, grid, c, north_bc, loc, args...)
+    _fill_south_halo!(i, k, grid, c, south_bc, loc, args...)
+    _fill_north_halo!(i, k, grid, c, north_bc, loc, args...)
 end
 
-@kernel function _fill_bottom_and_top_halo!(c, bottom_bc, top_bc, offset, loc, grid, args)
+@kernel function _fill_bottom_and_top_halo!(c, bottom_bc, top_bc, loc, grid, args)
     i, j = @index(Global, NTuple)
-    i′ = i + offset[1]
-    j′ = j + offset[2]
-    _fill_bottom_halo!(i′, j′, grid, c, bottom_bc, loc, args...)
-       _fill_top_halo!(i′, j′, grid, c, top_bc,    loc, args...)
+    _fill_bottom_halo!(i, j, grid, c, bottom_bc, loc, args...)
+       _fill_top_halo!(i, j, grid, c, top_bc,    loc, args...)
 end
 
 #####
@@ -230,13 +224,13 @@ end
 end
 
 fill_west_and_east_halo!(c, west_bc, east_bc, size, offset, loc, arch, grid, args...; kwargs...) =
-    launch!(arch, grid, size, _fill_west_and_east_halo!, c, west_bc, east_bc, offset, loc, grid, Tuple(args); kwargs...)
+    launch!(arch, grid, KernelParameters(size, offset), _fill_west_and_east_halo!, c, west_bc, east_bc, loc, grid, Tuple(args); kwargs...)
 
 fill_south_and_north_halo!(c, south_bc, north_bc, size, offset, loc, arch, grid, args...; kwargs...) =
-    launch!(arch, grid, size, _fill_south_and_north_halo!, c, south_bc, north_bc, offset, loc, grid, Tuple(args); kwargs...)
+    launch!(arch, grid, KernelParameters(size, offset), _fill_south_and_north_halo!, c, south_bc, north_bc, loc, grid, Tuple(args); kwargs...)
 
 fill_bottom_and_top_halo!(c, bottom_bc, top_bc, size, offset, loc, arch, grid, args...; kwargs...) =
-    launch!(arch, grid, size, _fill_bottom_and_top_halo!, c, bottom_bc, top_bc, offset, loc, grid, Tuple(args); kwargs...)
+    launch!(arch, grid, KernelParameters(size, offset), _fill_bottom_and_top_halo!, c, bottom_bc, top_bc, loc, grid, Tuple(args); kwargs...)
 
 #####
 ##### Calculate kernel size and offset for Windowed and Sliced Fields
diff --git a/src/BoundaryConditions/fill_halo_regions_flux.jl b/src/BoundaryConditions/fill_halo_regions_flux.jl
index 58d2fe9b01..d77c7c7ba2 100644
--- a/src/BoundaryConditions/fill_halo_regions_flux.jl
+++ b/src/BoundaryConditions/fill_halo_regions_flux.jl
@@ -38,46 +38,34 @@ using KernelAbstractions.Extras.LoopInfo: @unroll
 ##### Single halo filling kernels
 #####
 
-@kernel function fill_flux_west_halo!(c, offset, grid)
+@kernel function fill_flux_west_halo!(c, grid)
     j, k = @index(Global, NTuple)
-    j′ = j + offset[1]
-    k′ = k + offset[2]
-    _fill_flux_west_halo!(1, j′, k′, grid, c)
+    _fill_flux_west_halo!(1, j, k, grid, c)
 end
 
-@kernel function fill_flux_south_halo!(c, offset, grid)
+@kernel function fill_flux_south_halo!(c, grid)
     i, k = @index(Global, NTuple)
-    i′ = i + offset[1]
-    k′ = k + offset[2]
-    _fill_flux_south_halo!(i′, 1, k′, grid, c)
+    _fill_flux_south_halo!(i, 1, k, grid, c)
 end
 
-@kernel function fill_flux_bottom_halo!(c, offset, grid)
+@kernel function fill_flux_bottom_halo!(c, grid)
     i, j = @index(Global, NTuple)
-    i′ = i + offset[1]
-    j′ = j + offset[2]
-    _fill_flux_bottom_halo!(i′, j′, 1, grid, c)
+    _fill_flux_bottom_halo!(i, j, 1, grid, c)
 end
 
-@kernel function fill_flux_east_halo!(c, offset, grid)
+@kernel function fill_flux_east_halo!(c, grid)
     j, k = @index(Global, NTuple)
-    j′ = j + offset[1]
-    k′ = k + offset[2]
-    _fill_flux_east_halo!(1, j′, k′, grid, c)
+    _fill_flux_east_halo!(1, j, k, grid, c)
 end
 
-@kernel function fill_flux_north_halo!(c, offset, grid)
+@kernel function fill_flux_north_halo!(c, grid)
     i, k = @index(Global, NTuple)
-    i′ = i + offset[1]
-    k′ = k + offset[2]
-    _fill_flux_north_halo!(i′, 1, k′, grid, c)
+    _fill_flux_north_halo!(i, 1, k, grid, c)
 end
 
-@kernel function fill_flux_top_halo!(c, offset, grid)
+@kernel function fill_flux_top_halo!(c, grid)
     i, j = @index(Global, NTuple)
-    j′ = j + offset[1]
-    k′ = k + offset[2]
-    _fill_flux_top_halo!(i′, j′, 1, grid, c)
+    _fill_flux_top_halo!(i, j, 1, grid, c)
 end
 
 #####
@@ -85,15 +73,15 @@ end
 #####
 
 fill_west_halo!(c, bc::FBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = 
-            launch!(arch, grid, kernel_size, fill_flux_west_halo!, c, offset,grid; kwargs...)
+            launch!(arch, grid, KernelParameters(kernel_size, offset), fill_flux_west_halo!, c,grid; kwargs...)
 fill_east_halo!(c, bc::FBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = 
-            launch!(arch, grid, kernel_size, fill_flux_east_halo!, c, offset,grid; kwargs...)
+            launch!(arch, grid, KernelParameters(kernel_size, offset), fill_flux_east_halo!, c, grid; kwargs...)
 fill_south_halo!(c, bc::FBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = 
-            launch!(arch, grid, kernel_size, fill_flux_south_halo!, c, offset,grid; kwargs...)
+            launch!(arch, grid, KernelParameters(kernel_size, offset), fill_flux_south_halo!, c, grid; kwargs...)
 fill_north_halo!(c, bc::FBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = 
-            launch!(arch, grid, kernel_size, fill_flux_north_halo!, c, offset,grid; kwargs...)
+            launch!(arch, grid, KernelParameters(kernel_size, offset), fill_flux_north_halo!, c, grid; kwargs...)
 fill_bottom_halo!(c, bc::FBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = 
-            launch!(arch, grid, kernel_size, fill_flux_bottom_halo!, c, offset,grid; kwargs...)
+            launch!(arch, grid, KernelParameters(kernel_size, offset), fill_flux_bottom_halo!, c, grid; kwargs...)
 fill_top_halo!(c, bc::FBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = 
-            launch!(arch, grid, kernel_size, fill_flux_top_halo!, c, offset, grid; kwargs...)
+            launch!(arch, grid, KernelParameters(kernel_size, offset), fill_flux_top_halo!, c, grid; kwargs...)
 
diff --git a/src/BoundaryConditions/fill_halo_regions_open.jl b/src/BoundaryConditions/fill_halo_regions_open.jl
index 44f6bace21..3c944a3a78 100644
--- a/src/BoundaryConditions/fill_halo_regions_open.jl
+++ b/src/BoundaryConditions/fill_halo_regions_open.jl
@@ -9,33 +9,27 @@
 # because the boundary-normal index can vary (and array boundary conditions need to be
 # 3D in general).
 
-@kernel function set_west_or_east_u!(u, offset, i_boundary, bc, grid, args) 
+@kernel function set_west_or_east_u!(u, i_boundary, bc, grid, args) 
     j, k = @index(Global, NTuple)
-    j′ = j + offset[1]
-    k′ = k + offset[2]
-@inbounds u[i_boundary, j′, k′] = getbc(bc, j′, k′, grid, args...)
+@inbounds u[i_boundary, j, k] = getbc(bc, j, k, grid, args...)
 end
 
-@kernel function set_south_or_north_v!(v, offset, j_boundary, bc, grid, args)
+@kernel function set_south_or_north_v!(v, j_boundary, bc, grid, args)
     i, k = @index(Global, NTuple)
-    i′ = i + offset[1]
-    k′ = k + offset[2]
-@inbounds v[i′, j_boundary, k′] = getbc(bc, i′, k′, grid, args...)
+@inbounds v[i, j_boundary, k] = getbc(bc, i, k, grid, args...) # i, k come straight from @index
 end
 
-@kernel function set_bottom_or_top_w!(w, offset, k_boundary, bc, grid, args) 
+@kernel function set_bottom_or_top_w!(w, k_boundary, bc, grid, args) 
     i, j = @index(Global, NTuple)
-    i′ = i + offset[1]
-    j′ = j + offset[2]
-@inbounds w[i′, j′, k_boundary] = getbc(bc, i′, j′, grid, args...)
+@inbounds w[i, j, k_boundary] = getbc(bc, i, j, grid, args...)
 end
 
-@inline   fill_west_halo!(u, bc::OBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = launch!(arch, grid, kernel_size, set_west_or_east_u!,   u, offset,           1, bc, grid, Tuple(args); kwargs...)
-@inline   fill_east_halo!(u, bc::OBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = launch!(arch, grid, kernel_size, set_west_or_east_u!,   u, offset, grid.Nx + 1, bc, grid, Tuple(args); kwargs...)
-@inline  fill_south_halo!(v, bc::OBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = launch!(arch, grid, kernel_size, set_south_or_north_v!, v, offset,           1, bc, grid, Tuple(args); kwargs...)
-@inline  fill_north_halo!(v, bc::OBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = launch!(arch, grid, kernel_size, set_south_or_north_v!, v, offset, grid.Ny + 1, bc, grid, Tuple(args); kwargs...)
-@inline fill_bottom_halo!(w, bc::OBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = launch!(arch, grid, kernel_size, set_bottom_or_top_w!,  w, offset,           1, bc, grid, Tuple(args); kwargs...)
-@inline    fill_top_halo!(w, bc::OBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = launch!(arch, grid, kernel_size, set_bottom_or_top_w!,  w, offset, grid.Nz + 1, bc, grid, Tuple(args); kwargs...)
+@inline   fill_west_halo!(u, bc::OBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = launch!(arch, grid, KernelParameters(kernel_size, offset), set_west_or_east_u!,   u,           1, bc, grid, Tuple(args); kwargs...)
+@inline   fill_east_halo!(u, bc::OBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = launch!(arch, grid, KernelParameters(kernel_size, offset), set_west_or_east_u!,   u, grid.Nx + 1, bc, grid, Tuple(args); kwargs...)
+@inline  fill_south_halo!(v, bc::OBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = launch!(arch, grid, KernelParameters(kernel_size, offset), set_south_or_north_v!, v,           1, bc, grid, Tuple(args); kwargs...)
+@inline  fill_north_halo!(v, bc::OBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = launch!(arch, grid, KernelParameters(kernel_size, offset), set_south_or_north_v!, v, grid.Ny + 1, bc, grid, Tuple(args); kwargs...)
+@inline fill_bottom_halo!(w, bc::OBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = launch!(arch, grid, KernelParameters(kernel_size, offset), set_bottom_or_top_w!,  w,           1, bc, grid, Tuple(args); kwargs...)
+@inline    fill_top_halo!(w, bc::OBC, kernel_size, offset, loc, arch, grid, args...; kwargs...) = launch!(arch, grid, KernelParameters(kernel_size, offset), set_bottom_or_top_w!,  w, grid.Nz + 1, bc, grid, Tuple(args); kwargs...)
 
 @inline   _fill_west_halo!(j, k, grid, c, bc::OBC, loc, args...) = @inbounds c[1, j, k]           = getbc(bc, j, k, grid, args...)
 @inline   _fill_east_halo!(j, k, grid, c, bc::OBC, loc, args...) = @inbounds c[grid.Nx + 1, j, k] = getbc(bc, j, k, grid, args...)
diff --git a/src/BoundaryConditions/fill_halo_regions_periodic.jl b/src/BoundaryConditions/fill_halo_regions_periodic.jl
index de98c0a47c..0ca2654295 100644
--- a/src/BoundaryConditions/fill_halo_regions_periodic.jl
+++ b/src/BoundaryConditions/fill_halo_regions_periodic.jl
@@ -17,19 +17,19 @@ end
 
 function fill_west_and_east_halo!(c, ::PBCT, ::PBCT, size, offset, loc, arch, grid, args...; kw...)
     c_parent, yz_size, offset = parent_size_and_offset(c, 2, 3, size, offset)
-    launch!(arch, grid, yz_size, fill_periodic_west_and_east_halo!, c_parent, offset, grid.Hx, grid.Nx; kw...)
+    launch!(arch, grid, KernelParameters(yz_size, offset), fill_periodic_west_and_east_halo!, c_parent, grid.Hx, grid.Nx; kw...)
     return nothing
 end
 
 function fill_south_and_north_halo!(c, ::PBCT, ::PBCT, size, offset, loc, arch, grid, args...; kw...)
     c_parent, xz_size, offset = parent_size_and_offset(c, 1, 3, size, offset)
-    launch!(arch, grid, xz_size, fill_periodic_south_and_north_halo!, c_parent, offset, grid.Hy, grid.Ny;  kw...)
+    launch!(arch, grid, KernelParameters(xz_size, offset), fill_periodic_south_and_north_halo!, c_parent, grid.Hy, grid.Ny;  kw...)
     return nothing
 end
 
 function fill_bottom_and_top_halo!(c, ::PBCT, ::PBCT, size, offset, loc, arch, grid, args...; kw...)
     c_parent, xy_size, offset = parent_size_and_offset(c, 1, 2, size, offset)
-    launch!(arch, grid, xy_size, fill_periodic_bottom_and_top_halo!, c_parent, offset, grid.Hz, grid.Nz; kw...)
+    launch!(arch, grid, KernelParameters(xy_size, offset), fill_periodic_bottom_and_top_halo!, c_parent, grid.Hz, grid.Nz; kw...)
     return nothing
 end
 
@@ -37,38 +37,32 @@ end
 ##### Periodic boundary condition kernels
 #####
 
-@kernel function fill_periodic_west_and_east_halo!(c, offset, H::Int, N)
+@kernel function fill_periodic_west_and_east_halo!(c, H::Int, N)
     j, k = @index(Global, NTuple)
-    j′ = j + offset[1]
-    k′ = k + offset[2]
     @unroll for i = 1:H
         @inbounds begin
-            c[i, j′, k′]     = c[N+i, j′, k′] # west
-            c[N+H+i, j′, k′] = c[H+i, j′, k′] # east
+            c[i, j, k]     = c[N+i, j, k] # west
+            c[N+H+i, j, k] = c[H+i, j, k] # east
         end
     end
 end
 
-@kernel function fill_periodic_south_and_north_halo!(c, offset, H::Int, N)
+@kernel function fill_periodic_south_and_north_halo!(c, H::Int, N)
     i, k = @index(Global, NTuple)
-    i′ = i + offset[1]
-    k′ = k + offset[2]
     @unroll for j = 1:H
         @inbounds begin
-            c[i′, j, k′]     = c[i′, N+j, k′] # south
-            c[i′, N+H+j, k′] = c[i′, H+j, k′] # north
+            c[i, j, k]     = c[i, N+j, k] # south
+            c[i, N+H+j, k] = c[i, H+j, k] # north
         end
     end
 end
 
-@kernel function fill_periodic_bottom_and_top_halo!(c, offset, H::Int, N)
+@kernel function fill_periodic_bottom_and_top_halo!(c, H::Int, N)
     i, j = @index(Global, NTuple)
-    i′ = i + offset[1]
-    j′ = j + offset[2]
     @unroll for k = 1:H
         @inbounds begin
-            c[i′, j′, k]     = c[i′, j′, N+k] # top
-            c[i′, j′, N+H+k] = c[i′, j′, H+k] # bottom
+            c[i, j, k]     = c[i, j, N+k] # top
+            c[i, j, N+H+k] = c[i, j, H+k] # bottom
         end
     end
 end
@@ -77,7 +71,7 @@ end
 #### Tupled periodic boundary condition 
 ####
 
-@kernel function fill_periodic_west_and_east_halo!(c::NTuple{M}, offset, H::Int, N) where M
+@kernel function fill_periodic_west_and_east_halo!(c::NTuple{M}, H::Int, N) where M
     j, k = @index(Global, NTuple)
     @unroll for n = 1:M
         @unroll for i = 1:H
@@ -89,7 +83,7 @@ end
     end
 end
 
-@kernel function fill_periodic_south_and_north_halo!(c::NTuple{M}, offset, H::Int, N) where M
+@kernel function fill_periodic_south_and_north_halo!(c::NTuple{M}, H::Int, N) where M
     i, k = @index(Global, NTuple)
     @unroll for n = 1:M
         @unroll for j = 1:H
@@ -101,7 +95,7 @@ end
     end
 end
 
-@kernel function fill_periodic_bottom_and_top_halo!(c::NTuple{M}, offset, H::Int, N) where M
+@kernel function fill_periodic_bottom_and_top_halo!(c::NTuple{M}, H::Int, N) where M
     i, j = @index(Global, NTuple)
     @unroll for n = 1:M
         @unroll for k = 1:H

From 6e687890ac45cb084fa309c1ed0a90e83ebf769d Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 11 Jun 2023 17:45:53 -0400
Subject: [PATCH 285/530] works

---
 src/Utils/kernel_launching.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index 57a28adf92..9f8c0f51cc 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -10,11 +10,11 @@ struct KernelParameters{S, O} end
 
 KernelParameters(size, offsets) = KernelParameters{size, offsets}()
 
-worksize(::KernelParameters{S}) where S = S
+worktuple(::KernelParameters{S}) where S = S
 offsets(::KernelParameters{S, O}) where {S, O} = O
 
 offsets(workspec)  = nothing
-worksize(workspec) = workspec
+worktuple(workspec) = workspec
 
 flatten_reduced_dimensions(worksize, dims) = Tuple(i ∈ dims ? 1 : worksize[i] for i = 1:3)
 
@@ -91,7 +91,7 @@ function launch!(arch, grid, workspec, kernel!, kernel_args...;
                  only_active_cells = nothing,
                  kwargs...)
 
-    workgroup, worksize = work_layout(grid, worksize(workspec);
+    workgroup, worksize = work_layout(grid, worktuple(workspec);
                                       include_right_boundaries,
                                       reduced_dimensions,
                                       location)

From 3d96f4b8f22987931b12e6386688470b7b166190 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 11 Jun 2023 17:49:26 -0400
Subject: [PATCH 286/530] away from turbulence closures

---
 .../CATKEVerticalDiffusivities.jl             | 35 +++----------------
 ...vective_adjustment_vertical_diffusivity.jl | 14 +++-----
 .../ri_based_vertical_diffusivity.jl          | 27 ++++----------
 3 files changed, 16 insertions(+), 60 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
index e92828c28d..a7503f3e35 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
@@ -235,7 +235,7 @@ end
 
 @inline clip(x) = max(zero(x), x)
 
-function calculate_diffusivities!(diffusivities, closure::FlavorOfCATKE, model; kernel_size = κ_CATKE_kernel_size(model.grid), kernel_offsets = κ_CATKE_kernel_offsets(model.grid))
+function calculate_diffusivities!(diffusivities, closure::FlavorOfCATKE, model; parameters = KernelParameters(model.grid, closure)) # `grid` is only bound inside the body
 
     arch = model.architecture
     grid = model.grid
@@ -245,43 +245,16 @@ function calculate_diffusivities!(diffusivities, closure::FlavorOfCATKE, model;
     clock = model.clock
     top_tracer_bcs = NamedTuple(c => tracers[c].boundary_conditions.top for c in propertynames(tracers))
 
-    launch!(arch, grid, kernel_size,
+    launch!(arch, grid, parameters,
             calculate_CATKE_diffusivities!,
-            diffusivities, kernel_offsets, grid, closure, velocities, tracers, buoyancy, clock, top_tracer_bcs)
+            diffusivities, grid, closure, velocities, tracers, buoyancy, clock, top_tracer_bcs)
 
     return nothing
 end
 
-# extend κ kernel to compute also the boundaries
-@inline function κ_CATKE_kernel_size(grid) 
-    Nx, Ny, Nz = size(grid)
-    Tx, Ty, Tz = topology(grid)
-
-    Ax = Tx == Flat ? Nx : Nx + 4 
-    Ay = Ty == Flat ? Ny : Ny + 4 
-    Az = Tz == Flat ? Nz : Nz + 2
-
-    return (Ax, Ay, Az)
-end
-
-@inline function κ_CATKE_kernel_offsets(grid)
-    Tx, Ty, Tz = topology(grid)
-
-    Ax = Tx == Flat ? 0 : - 2
-    Ay = Ty == Flat ? 0 : - 2 
-    Az = Tz == Flat ? 0 : - 1 
-
-    return (Ax, Ay, Az)
-end
-
-
 @kernel function calculate_CATKE_diffusivities!(diffusivities, offs, grid, closure::FlavorOfCATKE, velocities, tracers, buoyancy, clock, top_tracer_bcs)
 
-    i′, j′, k′ = @index(Global, NTuple)
-
-    i = i′ + offs[1] 
-    j = j′ + offs[2] 
-    k = k′ + offs[3]
+    i, j, k = @index(Global, NTuple)
 
     # Ensure this works with "ensembles" of closures, in addition to ordinary single closures
     closure_ij = getclosure(i, j, closure)
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/convective_adjustment_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/convective_adjustment_vertical_diffusivity.jl
index 7c1e3bd14d..1ba58efaa6 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/convective_adjustment_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/convective_adjustment_vertical_diffusivity.jl
@@ -88,29 +88,25 @@ DiffusivityFields(grid, tracer_names, bcs, closure::FlavorOfCAVD) = (; κᶜ = Z
 @inline viscosity(::FlavorOfCAVD, diffusivities) = diffusivities.κᵘ
 @inline diffusivity(::FlavorOfCAVD, diffusivities, id) = diffusivities.κᶜ
 
-function calculate_diffusivities!(diffusivities, closure::FlavorOfCAVD, model; kernel_size = κ_kernel_size(model.grid), kernel_offsets = κ_kernel_offsets(model.grid))
+function calculate_diffusivities!(diffusivities, closure::FlavorOfCAVD, model; parameters = KernelParameters(model.grid, closure)) # `grid` is only bound inside the body
 
     arch = model.architecture
     grid = model.grid
     tracers = model.tracers
     buoyancy = model.buoyancy
 
-    launch!(arch, grid, kernel_size,
+    launch!(arch, grid, parameters,
             ## If we can figure out how to only precompute the "stability" of a cell:
             # compute_stability!, diffusivities, grid, closure, tracers, buoyancy,
-            compute_convective_adjustment_diffusivities!, diffusivities, kernel_offsets, grid, closure, tracers, buoyancy)
+            compute_convective_adjustment_diffusivities!, diffusivities, grid, closure, tracers, buoyancy)
 
     return nothing
 end
 
 @inline is_stableᶜᶜᶠ(i, j, k, grid, tracers, buoyancy) = ∂z_b(i, j, k, grid, buoyancy, tracers) >= 0
 
-@kernel function compute_convective_adjustment_diffusivities!(diffusivities, offs, grid, closure, tracers, buoyancy)
-    i′, j′, k′ = @index(Global, NTuple)
-
-    i = i′ + offs[1] 
-    j = j′ + offs[2] 
-    k = k′ + offs[3]
+@kernel function compute_convective_adjustment_diffusivities!(diffusivities, grid, closure, tracers, buoyancy)
+    i, j, k = @index(Global, NTuple)
 
     # Ensure this works with "ensembles" of closures, in addition to ordinary single closures
     closure_ij = getclosure(i, j, closure)
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index 0190cf437d..e597399a41 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -139,8 +139,7 @@ function DiffusivityFields(grid, tracer_names, bcs, closure::FlavorOfRBVD)
     return (; κ, ν, Ri)
 end
 
-function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; kernel_size = κ_kernel_size(model.grid, closure), 
-                                                                               kernel_offsets = κ_kernel_offsets(model.grid, closure))
+function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; parameters = KernelParameters(model.grid, closure)) # `grid` is only bound inside the body
     arch = model.architecture
     grid = model.grid
     clock = model.clock
@@ -149,10 +148,9 @@ function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; k
     velocities = model.velocities
     top_tracer_bcs = NamedTuple(c => tracers[c].boundary_conditions.top for c in propertynames(tracers))
 
-    launch!(arch, grid, kernel_size,
+    launch!(arch, grid, parameters,
             compute_ri_number!,
             diffusivities,
-            kernel_offsets,
             grid,
             closure,
             velocities,
@@ -161,10 +159,9 @@ function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; k
             top_tracer_bcs,
             clock)
 
-    launch!(arch, grid, kernel_size,
+    launch!(arch, grid, parameters,
             compute_ri_based_diffusivities!,
             diffusivities,
-            kernel_offsets,
             grid,
             closure,
             velocities,
@@ -199,27 +196,17 @@ const Tanh   = HyperbolicTangentRiDependentTapering
     return ifelse(N² <= 0, zero(grid), Ri)
 end
 
-@kernel function compute_ri_number!(diffusivities, offs, grid, closure::FlavorOfRBVD,
+@kernel function compute_ri_number!(diffusivities, grid, closure::FlavorOfRBVD,
                                     velocities, tracers, buoyancy, tracer_bcs, clock)
 
-    i′, j′, k′ = @index(Global, NTuple)
-
-    i = i′ + offs[1] 
-    j = j′ + offs[2] 
-    k = k′ + offs[3]
-
+    i, j, k = @index(Global, NTuple)
     @inbounds diffusivities.Ri[i, j, k] = Riᶜᶜᶠ(i, j, k, grid, velocities, buoyancy, tracers)
 end
 
-@kernel function compute_ri_based_diffusivities!(diffusivities, offs, grid, closure::FlavorOfRBVD,
+@kernel function compute_ri_based_diffusivities!(diffusivities, grid, closure::FlavorOfRBVD,
                                                 velocities, tracers, buoyancy, tracer_bcs, clock)
 
-    i′, j′, k′ = @index(Global, NTuple)
-
-    i = i′ + offs[1] 
-    j = j′ + offs[2] 
-    k = k′ + offs[3]
-
+    i, j, k = @index(Global, NTuple)
     _compute_ri_based_diffusivities!(i, j, k, diffusivities, grid, closure,
                                      velocities, tracers, buoyancy, tracer_bcs, clock)
 end

From c8c75185b5c07bf9990c2a62f3ba91b5b35fd122 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 11 Jun 2023 17:55:39 -0400
Subject: [PATCH 287/530] remove prime from split-explicit

---
 ...distributed_split_explicit_free_surface.jl |  8 ++--
 .../split_explicit_free_surface.jl            |  6 +--
 .../split_explicit_free_surface_kernels.jl    | 39 ++++++++-----------
 ...ulti_region_split_explicit_free_surface.jl |  9 +++--
 4 files changed, 28 insertions(+), 34 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl b/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl
index 123b2c0c23..1f579eb5ae 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl
@@ -24,9 +24,9 @@ function SplitExplicitAuxiliaryFields(grid::DistributedGrid)
     kernel_size    = augmented_kernel_size(grid)
     kernel_offsets = augmented_kernel_offsets(grid)
 
-    @show kernel_size, kernel_offsets
+    kernel_parameters = KernelParameters(kernel_size, kernel_offsets)
     
-    return SplitExplicitAuxiliaryFields(Gᵁ, Gⱽ, Hᶠᶜ, Hᶜᶠ, Hᶜᶜ, kernel_size, kernel_offsets)
+    return SplitExplicitAuxiliaryFields(Gᵁ, Gⱽ, Hᶠᶜ, Hᶜᶠ, Hᶜᶜ, kernel_parameters)
 end
 
 """Integrate z at locations `location` and set! `height`` with the result"""
@@ -55,8 +55,8 @@ end
 
     Rx, Ry, _ = architecture(grid).ranks
 
-    Ax = Rx == 1 || Tx == RightConnected ? 0 : Hx - 1
-    Ay = Ry == 1 || Ty == RightConnected ? 0 : Hy - 1
+    Ax = Rx == 1 || Tx == RightConnected ? 0 : - Hx + 1
+    Ay = Ry == 1 || Ty == RightConnected ? 0 : - Hy + 1
 
     return (Ax, Ay)
 end
diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl
index f90a314784..e7c043e7df 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl
@@ -166,7 +166,7 @@ large (or `:xy` in case of a serial computation), and start computing from
 
 $(FIELDS)
 """
-Base.@kwdef struct SplitExplicitAuxiliaryFields{𝒞ℱ, ℱ𝒞, 𝒞𝒞, 𝒦, 𝒪}
+Base.@kwdef struct SplitExplicitAuxiliaryFields{𝒞ℱ, ℱ𝒞, 𝒞𝒞, 𝒦}
     "Vertically-integrated slow barotropic forcing function for `U` (`ReducedField` over ``z``)"
     Gᵁ :: ℱ𝒞
     "Vertically-integrated slow barotropic forcing function for `V` (`ReducedField` over ``z``)"
@@ -178,9 +178,7 @@ Base.@kwdef struct SplitExplicitAuxiliaryFields{𝒞ℱ, ℱ𝒞, 𝒞𝒞, 𝒦
     "Depth at `(Center, Center)` (`ReducedField` over ``z``)"
     Hᶜᶜ :: 𝒞𝒞
     "kernel size for barotropic time stepping"
-    kernel_size :: 𝒦
-    "index offsets for halo calculations"
-    kernel_offsets :: 𝒪
+    kernel_parameters :: 𝒦
 end
 
 """
diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
index e8a680d476..e3af89f55c 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
@@ -129,50 +129,44 @@ end
 @kernel function split_explicit_free_surface_evolution_kernel!(grid, Δτ, η, ηᵐ, ηᵐ⁻¹, ηᵐ⁻², U, V, Uᵐ⁻¹, Uᵐ⁻², Vᵐ⁻¹, Vᵐ⁻², 
                                                                η̅, U̅, V̅, averaging_weight, 
                                                                Gᵁ, Gⱽ, g, Hᶠᶜ, Hᶜᶠ,
-                                                               timestepper, offsets)
+                                                               timestepper)
     i, j = @index(Global, NTuple)
     k_top = grid.Nz+1
 
-    i′ = i - offsets[1]
-    j′ = j - offsets[2]
-
     TX, TY, _ = topology(grid)
 
     @inbounds begin        
-        advance_previous_free_surface!(i′, j′, k_top, timestepper, η, ηᵐ, ηᵐ⁻¹, ηᵐ⁻²)
+        advance_previous_free_surface!(i, j, k_top, timestepper, η, ηᵐ, ηᵐ⁻¹, ηᵐ⁻²)
 
         # ∂τ(η) = - ∇ ⋅ U. 
         # `k_top - 1` is used here to allow `immersed_peripheral_node` to be true 
         # NOTE: `immersed_peripheral_node` is _always_ false on `Nz+1` `Face`s because `peripheral_node` is always true
-        η[i′, j′, k_top] -= Δτ * (div_xᶜᶜᶠ_U(i′, j′, k_top-1, grid, TX, U★, timestepper, U, Uᵐ⁻¹, Uᵐ⁻²) +
-                                  div_yᶜᶜᶠ_V(i′, j′, k_top-1, grid, TY, U★, timestepper, V, Vᵐ⁻¹, Vᵐ⁻²))
+        η[i, j, k_top] -= Δτ * (div_xᶜᶜᶠ_U(i, j, k_top-1, grid, TX, U★, timestepper, U, Uᵐ⁻¹, Uᵐ⁻²) +
+                                div_yᶜᶜᶠ_V(i, j, k_top-1, grid, TY, U★, timestepper, V, Vᵐ⁻¹, Vᵐ⁻²))
     end
 end
 
 @kernel function split_explicit_barotropic_velocity_evolution_kernel!(grid, Δτ, η, ηᵐ, ηᵐ⁻¹, ηᵐ⁻², U, V, Uᵐ⁻¹, Uᵐ⁻², Vᵐ⁻¹, Vᵐ⁻², 
                                                                       η̅, U̅, V̅, averaging_weight, 
                                                                       Gᵁ, Gⱽ, g, Hᶠᶜ, Hᶜᶠ,
-                                                                      timestepper, offsets)
+                                                                      timestepper)
     i, j  = @index(Global, NTuple)
     k_top = grid.Nz+1
     
-    i′ = i - offsets[1]
-    j′ = j - offsets[2]
-
     TX, TY, _ = topology(grid)
 
     @inbounds begin 
-        advance_previous_velocity!(i′, j′, 1, timestepper, U, Uᵐ⁻¹, Uᵐ⁻²)
-        advance_previous_velocity!(i′, j′, 1, timestepper, V, Vᵐ⁻¹, Vᵐ⁻²)
+        advance_previous_velocity!(i, j, 1, timestepper, U, Uᵐ⁻¹, Uᵐ⁻²)
+        advance_previous_velocity!(i, j, 1, timestepper, V, Vᵐ⁻¹, Vᵐ⁻²)
 
         # ∂τ(U) = - ∇η + G
-        U[i′, j′, 1] +=  Δτ * (- g * Hᶠᶜ[i′, j′] * ∂xᶠᶜᶠ_η(i′, j′, k_top, grid, TX, η★, timestepper, η, ηᵐ, ηᵐ⁻¹, ηᵐ⁻²) + Gᵁ[i′, j′, 1])
-        V[i′, j′, 1] +=  Δτ * (- g * Hᶜᶠ[i′, j′] * ∂yᶜᶠᶠ_η(i′, j′, k_top, grid, TY, η★, timestepper, η, ηᵐ, ηᵐ⁻¹, ηᵐ⁻²) + Gⱽ[i′, j′, 1])
+        U[i, j, 1] +=  Δτ * (- g * Hᶠᶜ[i, j] * ∂xᶠᶜᶠ_η(i, j, k_top, grid, TX, η★, timestepper, η, ηᵐ, ηᵐ⁻¹, ηᵐ⁻²) + Gᵁ[i, j, 1])
+        V[i, j, 1] +=  Δτ * (- g * Hᶜᶠ[i, j] * ∂yᶜᶠᶠ_η(i, j, k_top, grid, TY, η★, timestepper, η, ηᵐ, ηᵐ⁻¹, ηᵐ⁻²) + Gⱽ[i, j, 1])
                           
         # time-averaging
-        η̅[i′, j′, k_top] +=  averaging_weight * η[i′, j′, k_top]
-        U̅[i′, j′, 1]     +=  averaging_weight * U[i′, j′, 1]
-        V̅[i′, j′, 1]     +=  averaging_weight * V[i′, j′, 1]
+        η̅[i, j, k_top] +=  averaging_weight * η[i, j, k_top]
+        U̅[i, j, 1]     +=  averaging_weight * U[i, j, 1]
+        V̅[i, j, 1]     +=  averaging_weight * V[i, j, 1]
     end
 end
 
@@ -188,15 +182,14 @@ function split_explicit_free_surface_substep!(η, state, auxiliary, settings, ar
     timestepper      = settings.timestepper
     averaging_weight = settings.averaging_weights[substep_index]
     
-    offsets     = auxiliary.kernel_offsets
-    kernel_size = auxiliary.kernel_size
+    parameters = auxiliary.kernel_parameters
 
     args = (grid, Δτ, η, ηᵐ, ηᵐ⁻¹, ηᵐ⁻², U, V, Uᵐ⁻¹, Uᵐ⁻², Vᵐ⁻¹, Vᵐ⁻², 
             η̅, U̅, V̅, averaging_weight, 
-            Gᵁ, Gⱽ, g, Hᶠᶜ, Hᶜᶠ, timestepper, offsets)
+            Gᵁ, Gⱽ, g, Hᶠᶜ, Hᶜᶠ, timestepper)
 
-    launch!(arch, grid, kernel_size, split_explicit_free_surface_evolution_kernel!,        args...)
-    launch!(arch, grid, kernel_size, split_explicit_barotropic_velocity_evolution_kernel!, args...)
+    launch!(arch, grid, parameters, split_explicit_free_surface_evolution_kernel!,        args...)
+    launch!(arch, grid, parameters, split_explicit_barotropic_velocity_evolution_kernel!, args...)
 
     return nothing
 end
diff --git a/src/MultiRegion/multi_region_split_explicit_free_surface.jl b/src/MultiRegion/multi_region_split_explicit_free_surface.jl
index 2c9afac569..4d7926deb6 100644
--- a/src/MultiRegion/multi_region_split_explicit_free_surface.jl
+++ b/src/MultiRegion/multi_region_split_explicit_free_surface.jl
@@ -1,3 +1,4 @@
+using Oceananigans.Utils
 using Oceananigans.AbstractOperations: GridMetricOperation, Δz
 using Oceananigans.Models.HydrostaticFreeSurfaceModels: SplitExplicitState, SplitExplicitFreeSurface
 
@@ -23,7 +24,9 @@ function SplitExplicitAuxiliaryFields(grid::MultiRegionGrid)
     @apply_regionally kernel_size    = augmented_kernel_size(grid, grid.partition)
     @apply_regionally kernel_offsets = augmented_kernel_offsets(grid, grid.partition)
     
-    return SplitExplicitAuxiliaryFields(Gᵁ, Gⱽ, Hᶠᶜ, Hᶜᶠ, Hᶜᶜ, kernel_size, kernel_offsets)
+    @apply_regionally kernel_parameters = KernelParameters(kernel_size, kernel_offsets)
+
+    return SplitExplicitAuxiliaryFields(Gᵁ, Gⱽ, Hᶠᶜ, Hᶜᶠ, Hᶜᶜ, kernel_parameters)
 end
 
 @inline function calculate_column_height!(height, location)
@@ -34,8 +37,8 @@ end
 @inline augmented_kernel_size(grid, ::XPartition) = (size(grid, 1) + 2halo_size(grid)[1]-2, size(grid, 2))
 @inline augmented_kernel_size(grid, ::YPartition) = (size(grid, 1), size(grid, 2) + 2halo_size(grid)[2]-2)
 
-@inline augmented_kernel_offsets(grid, ::XPartition) = (halo_size(grid)[1]-1, 0)
-@inline augmented_kernel_offsets(grid, ::YPartition) = (0, halo_size(grid)[2]-1)
+@inline augmented_kernel_offsets(grid, ::XPartition) = (- halo_size(grid)[1] + 1, 0)
+@inline augmented_kernel_offsets(grid, ::YPartition) = (0, - halo_size(grid)[2] + 1)
 
 function FreeSurface(free_surface::SplitExplicitFreeSurface, velocities, grid::MultiRegionGrid)
 

From e7ed2319aef30f831f590199b78e4f670a4cfe2d Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 11 Jun 2023 18:16:10 -0400
Subject: [PATCH 288/530] first fix

---
 src/Utils/kernel_launching.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index 9f8c0f51cc..3f978bd1d6 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -106,8 +106,8 @@ function launch!(arch, grid, workspec, kernel!, kernel_args...;
     if worksize == 0
         return nothing
     end
-    
-    loop! = kernel!(Architectures.device(arch), workgroup, worksize, offset)
+    loop! = isnothing(offset) ? kernel!(Architectures.device(arch), workgroup, worksize) : 
+                                kernel!(Architectures.device(arch), workgroup, worksize, offset) 
 
     @debug "Launching kernel $kernel! with worksize $worksize and offsets $offset"
 

From 4e3bbdcff14c41f1b2f06a6b9fcb5e009b8d3a93 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 11 Jun 2023 19:02:41 -0400
Subject: [PATCH 289/530] second bugfix + blocking -> async

---
 src/Architectures.jl                                      | 4 ++--
 src/BoundaryConditions/fill_halo_regions.jl               | 6 +++---
 src/Distributed/halo_communication.jl                     | 8 ++++----
 .../split_explicit_free_surface_kernels.jl                | 4 ++--
 .../update_hydrostatic_free_surface_model_state.jl        | 2 +-
 5 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/Architectures.jl b/src/Architectures.jl
index 296c550f11..9cd3a7487d 100644
--- a/src/Architectures.jl
+++ b/src/Architectures.jl
@@ -87,11 +87,11 @@ function unified_array(::GPU, arr::AbstractArray)
 end
 
 ## Only for contiguous data!! (i.e. only if the offset for pointer(dst::CuArray, offset::Int) is 1)
-@inline function device_copy_to!(dst::CuArray, src::CuArray; blocking::Bool = true) 
+@inline function device_copy_to!(dst::CuArray, src::CuArray; async::Bool = false) 
     n = length(src)
     context!(context(src)) do
         GC.@preserve src dst begin
-            unsafe_copyto!(pointer(dst, 1), pointer(src, 1), n; async = !(blocking))
+            unsafe_copyto!(pointer(dst, 1), pointer(src, 1), n; async)
         end
     end
     return dst
diff --git a/src/BoundaryConditions/fill_halo_regions.jl b/src/BoundaryConditions/fill_halo_regions.jl
index 7ef16a9327..e708b00ec0 100644
--- a/src/BoundaryConditions/fill_halo_regions.jl
+++ b/src/BoundaryConditions/fill_halo_regions.jl
@@ -187,7 +187,7 @@ end
 
 import Oceananigans.Utils: @constprop
 
-@kernel function _fill_west_and_east_halo!(c::NTuple, west_bc, east_bc, offset, loc, grid, args)
+@kernel function _fill_west_and_east_halo!(c::NTuple, west_bc, east_bc, loc, grid, args)
     j, k = @index(Global, NTuple)
     ntuple(Val(length(west_bc))) do n
         Base.@_inline_meta
@@ -199,7 +199,7 @@ import Oceananigans.Utils: @constprop
     end
 end
 
-@kernel function _fill_south_and_north_halo!(c::NTuple, south_bc, north_bc, offset, loc, grid, args) 
+@kernel function _fill_south_and_north_halo!(c::NTuple, south_bc, north_bc, loc, grid, args) 
     i, k = @index(Global, NTuple)
     ntuple(Val(length(south_bc))) do n
         Base.@_inline_meta
@@ -211,7 +211,7 @@ end
     end
 end
 
-@kernel function _fill_bottom_and_top_halo!(c::NTuple, bottom_bc, top_bc, offset, loc, grid, args) 
+@kernel function _fill_bottom_and_top_halo!(c::NTuple, bottom_bc, top_bc, loc, grid, args) 
     i, j = @index(Global, NTuple)
     ntuple(Val(length(bottom_bc))) do n
         Base.@_inline_meta
diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 5d2b887ff5..0f55997c12 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -160,7 +160,7 @@ for (side, dir) in zip([:southwest, :southeast, :northwest, :northeast], [1, 2,
 end
 
 # If more than one direction is communicating we need to add a corner passing routine!
-function fill_corners!(connectivity, c, indices, loc, arch, grid, buffers, args...; blocking = true, kwargs...)
+function fill_corners!(connectivity, c, indices, loc, arch, grid, buffers, args...; async = false, kwargs...)
     
     requests = MPI.Request[]
 
@@ -178,7 +178,7 @@ function fill_corners!(connectivity, c, indices, loc, arch, grid, buffers, args.
         return nothing
     end
 
-    if !blocking && !(arch isa BlockingDistributedArch)
+    if async && !(arch isa BlockingDistributedArch)
         push!(arch.mpi_requests, requests...)
         return nothing
     end
@@ -204,7 +204,7 @@ end
 cooperative_wait(req::MPI.Request)            = MPI.Waitall(req)
 cooperative_waitall!(req::Array{MPI.Request}) = MPI.Waitall(req)
 
-function fill_halo_event!(task, halo_tuple, c, indices, loc, arch::DistributedArch, grid::DistributedGrid, buffers, args...; blocking = true, kwargs...)
+function fill_halo_event!(task, halo_tuple, c, indices, loc, arch::DistributedArch, grid::DistributedGrid, buffers, args...; async = false, kwargs...)
     fill_halo!  = halo_tuple[1][task]
     bc_left     = halo_tuple[2][task]
     bc_right    = halo_tuple[3][task]
@@ -222,7 +222,7 @@ function fill_halo_event!(task, halo_tuple, c, indices, loc, arch::DistributedAr
 
     # Overlapping communication and computation, store requests in a `MPI.Request`
     # pool to be waited upon after tendency calculation
-    if !blocking && !(arch isa BlockingDistributedArch)
+    if async && !(arch isa BlockingDistributedArch)
         push!(arch.mpi_requests, requests...)
         return nothing
     end
diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
index e3af89f55c..339ce5f78d 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
@@ -291,7 +291,7 @@ function split_explicit_free_surface_step!(free_surface::SplitExplicitFreeSurfac
     end
 
     fields_to_fill = (free_surface.state.U̅, free_surface.state.V̅)
-    fill_halo_regions!(fields_to_fill; blocking = false)
+    fill_halo_regions!(fields_to_fill; async = true)
 
     # Preparing velocities for the barotropic correction
     @apply_regionally begin 
@@ -354,7 +354,7 @@ function setup_free_surface!(model, free_surface::SplitExplicitFreeSurface, χ)
     @apply_regionally setup_split_explicit_tendency!(auxiliary, grid, Gu⁻, Gv⁻, Guⁿ, Gvⁿ, χ)
 
     fields_to_fill = (auxiliary.Gᵁ, auxiliary.Gⱽ)
-    fill_halo_regions!(fields_to_fill; blocking = false)
+    fill_halo_regions!(fields_to_fill; async = true)
 
     return nothing
 end
diff --git a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
index 6691ef7900..6e595655e3 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
@@ -28,7 +28,7 @@ function update_state!(model::HydrostaticFreeSurfaceModel, grid, callbacks; comp
 
     @apply_regionally mask_immersed_model_fields!(model, grid)
 
-    fill_halo_regions!(prognostic_fields(model), model.clock, fields(model); blocking = false)
+    fill_halo_regions!(prognostic_fields(model), model.clock, fields(model); async = true)
 
     @apply_regionally compute_w_diffusivities_pressure!(model)
 

From a313a91f2f2fadc24c83ab03255b8388a27d9f33 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 11 Jun 2023 19:03:57 -0400
Subject: [PATCH 290/530] comment

---
 src/Utils/kernel_launching.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index 3f978bd1d6..fcf9de814a 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -6,6 +6,7 @@ using Oceananigans.Architectures
 using Oceananigans.Grids
 using Oceananigans.Grids: AbstractGrid
 
+"""Parameters for kernel launch, containing kernel size (`S`) and kernel offsets (`O`)"""
 struct KernelParameters{S, O} end
 
 KernelParameters(size, offsets) = KernelParameters{size, offsets}()

From da0259c6c17b8c9adc2ba9e4b45b6cdcb4045dc2 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 11 Jun 2023 19:10:16 -0400
Subject: [PATCH 291/530] remove comments

---
 src/ImmersedBoundaries/active_cells_map.jl    | 23 +------------------
 .../ri_based_vertical_diffusivity.jl          | 23 +++++++++++--------
 2 files changed, 15 insertions(+), 31 deletions(-)

diff --git a/src/ImmersedBoundaries/active_cells_map.jl b/src/ImmersedBoundaries/active_cells_map.jl
index 1686e304da..9dbef8d9d0 100644
--- a/src/ImmersedBoundaries/active_cells_map.jl
+++ b/src/ImmersedBoundaries/active_cells_map.jl
@@ -116,25 +116,4 @@ function active_cells_map_surface(ibg)
     smaller_indices = getproperty.(full_indices, Ref(:I)) .|> Tuple{IntType, IntType}
     
     return smaller_indices
-end
-
-# using Oceananigans.TurbulenceClosures: Riᶜᶜᶠ, _compute_ri_based_diffusivities!, FlavorOfRBVD
-# import Oceananigans.TurbulenceClosures: compute_ri_number!, compute_ri_based_diffusivities!
-
-# @kernel function compute_ri_number!(diffusivities, offs, grid::ActiveCellsIBG, closure::FlavorOfRBVD,
-#     velocities, tracers, buoyancy, tracer_bcs, clock)
-#     idx = @index(Global, Linear)
-#     i, j, k = active_linear_index_to_interior_tuple(idx, grid)
-
-#     @inbounds diffusivities.Ri[i, j, k] = Riᶜᶜᶠ(i, j, k, grid, velocities, buoyancy, tracers)
-# end
-
-# @kernel function compute_ri_based_diffusivities!(diffusivities, offs, grid::ActiveCellsIBG, closure::FlavorOfRBVD,
-#                 velocities, tracers, buoyancy, tracer_bcs, clock)
-
-#     idx = @index(Global, Linear)
-#     i, j, k = active_linear_index_to_interior_tuple(idx, grid)
-            
-#     _compute_ri_based_diffusivities!(i, j, k, diffusivities, grid, closure,
-#      velocities, tracers, buoyancy, tracer_bcs, clock)
-# end
+end
\ No newline at end of file
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index e597399a41..38bdc5b51f 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -133,10 +133,10 @@ with_tracers(tracers, closure::FlavorOfRBVD) = closure
 
 # Note: computing diffusivities at cell centers for now.
 function DiffusivityFields(grid, tracer_names, bcs, closure::FlavorOfRBVD)
-    κ  = Field((Center, Center, Face), grid)
-    ν  = Field((Center, Center, Face), grid)
+    κᶜ = Field((Center, Center, Face), grid)
+    κᵘ = Field((Center, Center, Face), grid)
     Ri = Field((Center, Center, Face), grid)
-    return (; κ, ν, Ri)
+    return (; κᶜ, κᵘ, Ri)
 end
 
 function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; parameters = KernelParameters(grid, closure))
@@ -244,14 +244,19 @@ end
     Ri = ℑxyᶜᶜᵃ(i, j, k, grid, ℑxyᶠᶠᵃ, diffusivities.Ri)
 
     τ = taper(tapering, Ri, Ri₀, Riᵟ)
-    κ★ = κ₀ * τ
-    κ★ = ν₀ * τ
+    κᶜ★ = κ₀ * τ
+    κᵘ★ = ν₀ * τ
 
-    κⁿ = κᶜ + κᵉ + κ★
-    νⁿ = ν★
-    @inbounds diffusivities.κ[i, j, k] = κⁿ
-    @inbounds diffusivities.ν[i, j, k] = νⁿ
+    # Previous diffusivities
+    κᶜ = diffusivities.κᶜ
+    κᵘ = diffusivities.κᵘ
 
+    # New diffusivities
+    κᶜ⁺ = κᶜᵃ + κᵉⁿ + κᶜ★
+    κᵘ⁺ = κᵘ★
+
+    @inbounds κᶜ[i, j, k] = (Cᵃᵛ * κᶜ[i, j, k] + κᶜ⁺) / (1 + Cᵃᵛ)
+    @inbounds κᵘ[i, j, k] = (Cᵃᵛ * κᵘ[i, j, k] + κᵘ⁺) / (1 + Cᵃᵛ)
     return nothing
 end
 

From bcd7c19b890a9ed312a7aff8a4362c0d7cd1186b Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 11 Jun 2023 19:24:23 -0400
Subject: [PATCH 292/530] third bugfix

---
 .../turbulence_closure_implementations/scalar_diffusivity.jl  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/scalar_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/scalar_diffusivity.jl
index c3e4d23d41..2d2962944d 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/scalar_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/scalar_diffusivity.jl
@@ -150,9 +150,9 @@ HorizontalDivergenceScalarDiffusivity(FT::DataType; kwargs...) = ScalarDiffusivi
 
 required_halo_size(closure::ScalarDiffusivity) = 1 
  
-function with_tracers(tracers, closure::ScalarDiffusivity{TD, F}) where {TD, F}
+function with_tracers(tracers, closure::ScalarDiffusivity{TD, F, N}) where {TD, F, N}
     κ = tracer_diffusivities(tracers, closure.κ)
-    return ScalarDiffusivity{TD, F}(closure.ν, κ)
+    return ScalarDiffusivity{TD, F, N}(closure.ν, κ)
 end
 
 @inline viscosity(closure::ScalarDiffusivity, K) = closure.ν

From 436eb38708142e4abf87e9f730cfa409ab5b5183 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 11 Jun 2023 19:58:22 -0400
Subject: [PATCH 293/530] fourth bugfix

---
 src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
index 627050366c..23aaec8970 100644
--- a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
+++ b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
@@ -7,7 +7,7 @@ the `buoyancy_perturbationᶜᶜᶜ` downwards:
 
     `pHY′ = ∫ buoyancy_perturbationᶜᶜᶜ dz` from `z=0` down to `z=-Lz`
 """
-@kernel function _update_hydrostatic_pressure!(pHY′, offs, grid, buoyancy, C)
+@kernel function _update_hydrostatic_pressure!(pHY′, grid, buoyancy, C)
     i, j = @index(Global, NTuple)
 
     @inbounds pHY′[i, j, grid.Nz] = - z_dot_g_bᶜᶜᶠ(i, j, grid.Nz+1, grid, buoyancy, C) * Δzᶜᶜᶠ(i, j, grid.Nz+1, grid)

From ae509f7f2c97cccd8549e4b7aec790a65cdf2585 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 11 Jun 2023 20:21:01 -0400
Subject: [PATCH 294/530] bugfix 6 + 7 and improve validation

---
 src/BoundaryConditions/fill_halo_regions.jl                   | 4 ++--
 .../hydrostatic_free_surface_model.jl                         | 2 +-
 .../scalar_biharmonic_diffusivity.jl                          | 4 ++--
 .../distributed_simulations/mpi_hydrostatic_turbulence.jl     | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/BoundaryConditions/fill_halo_regions.jl b/src/BoundaryConditions/fill_halo_regions.jl
index e708b00ec0..c4874ab04c 100644
--- a/src/BoundaryConditions/fill_halo_regions.jl
+++ b/src/BoundaryConditions/fill_halo_regions.jl
@@ -9,8 +9,8 @@ import Base
 ##### General halo filling functions
 #####
 
-fill_halo_regions!(::Nothing, args...) = nothing
-fill_halo_regions!(::NamedTuple{(), Tuple{}}, args...) = nothing
+fill_halo_regions!(::Nothing, args...; kwargs...) = nothing
+fill_halo_regions!(::NamedTuple{(), Tuple{}}, args...; kwargs...) = nothing
 
 """
     fill_halo_regions!(fields::Union{Tuple, NamedTuple}, arch, args...)
diff --git a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl
index 7f41a1ba6c..474d5a0abc 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl
@@ -199,7 +199,7 @@ function HydrostaticFreeSurfaceModel(; grid,
                                         free_surface, forcing, closure, particles, biogeochemistry, velocities, tracers,
                                         pressure, diffusivity_fields, timestepper, auxiliary_fields)
 
-    update_state!(model)
+    # update_state!(model)
 
     return model
 end
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/scalar_biharmonic_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/scalar_biharmonic_diffusivity.jl
index 5c4bf11ffe..c88f1059ce 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/scalar_biharmonic_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/scalar_biharmonic_diffusivity.jl
@@ -74,9 +74,9 @@ function ScalarBiharmonicDiffusivity(formulation=ThreeDimensionalFormulation(),
     return ScalarBiharmonicDiffusivity{typeof(formulation), boundary_buffer}(ν, κ)
 end
 
-function with_tracers(tracers, closure::ScalarBiharmonicDiffusivity{F}) where {F}
+function with_tracers(tracers, closure::ScalarBiharmonicDiffusivity{F, N}) where {F, N}
     κ = tracer_diffusivities(tracers, closure.κ)
-    return ScalarBiharmonicDiffusivity{F}(closure.ν, κ)
+    return ScalarBiharmonicDiffusivity{F, N}(closure.ν, κ)
 end
 
 @inline viscosity(closure::ScalarBiharmonicDiffusivity, K) = closure.ν
diff --git a/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl b/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl
index ae59448e8b..808d1dba1d 100644
--- a/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl
+++ b/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl
@@ -16,7 +16,7 @@ ranks = (2, 2, 1)
 topo  = (Periodic, Periodic, Bounded)
 arch  = DistributedArch(CPU(), ranks=ranks, topology=topo, use_buffers=true)
 
-grid  = RectilinearGrid(arch, topology=topo, size=(28, 28, 1), extent=(4π, 4π, 0.5), halo=(3, 3, 3))
+grid  = RectilinearGrid(arch, topology=topo, size=(28 ÷ 4, 28, 1), extent=(4π, 4π, 0.5), halo=(3, 3, 3))
 
 local_rank = MPI.Comm_rank(MPI.COMM_WORLD)
 

From cf5fa8493c7178530c93bae431691e949bcfc751 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 11 Jun 2023 20:21:35 -0400
Subject: [PATCH 295/530] improve validation

---
 .../mpi_nonhydrostatic_two_dimensional_turbulence.jl |  2 +-
 .../distributed_simulations/mpi_output_writing.jl    |  2 +-
 validation/distributed_simulations/mpi_set.jl        |  2 +-
 .../mpi_shallow_water_turbulence.jl                  | 12 ++++++------
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/validation/distributed_simulations/mpi_nonhydrostatic_two_dimensional_turbulence.jl b/validation/distributed_simulations/mpi_nonhydrostatic_two_dimensional_turbulence.jl
index ca13a77051..3b0a93ef92 100644
--- a/validation/distributed_simulations/mpi_nonhydrostatic_two_dimensional_turbulence.jl
+++ b/validation/distributed_simulations/mpi_nonhydrostatic_two_dimensional_turbulence.jl
@@ -29,7 +29,7 @@ Nx = Ny = 256
 Lx = Ly = 2π
 topology = (Periodic, Periodic, Flat)
 arch = DistributedArch(CPU(); topology, ranks=(1, Nranks, 1))
-grid = RectilinearGrid(arch; topology, size=(Nx, Ny), halo=(3, 3), x=(0, 2π), y=(0, 2π))
+grid = RectilinearGrid(arch; topology, size=(Nx ÷ Nranks, Ny), halo=(3, 3), x=(0, 2π), y=(0, 2π))
 
 @info "Built $Nranks grids:"
 @show grid
diff --git a/validation/distributed_simulations/mpi_output_writing.jl b/validation/distributed_simulations/mpi_output_writing.jl
index 4bb0a8e4ff..8926ac9b90 100644
--- a/validation/distributed_simulations/mpi_output_writing.jl
+++ b/validation/distributed_simulations/mpi_output_writing.jl
@@ -10,7 +10,7 @@ Nranks = MPI.Comm_size(comm)
 
 topology = (Periodic, Periodic, Flat)
 arch = DistributedArch(CPU(); topology, ranks=(1, Nranks, 1))
-grid = RectilinearGrid(arch; topology, size=(16, 16), halo=(3, 3), extent=(2π, 2π))
+grid = RectilinearGrid(arch; topology, size=(16 ÷ Nranks, 16), halo=(3, 3), extent=(2π, 2π))
 
 model = NonhydrostaticModel(; grid)
 
diff --git a/validation/distributed_simulations/mpi_set.jl b/validation/distributed_simulations/mpi_set.jl
index 26d00de4f1..2c4a374c0b 100644
--- a/validation/distributed_simulations/mpi_set.jl
+++ b/validation/distributed_simulations/mpi_set.jl
@@ -11,7 +11,7 @@ Nranks = MPI.Comm_size(MPI.COMM_WORLD)
 # Setup model
 topology = (Periodic, Periodic, Flat)
 arch = DistributedArch(CPU(); topology, ranks=(1, Nranks, 1))
-grid = RectilinearGrid(arch; topology, size=(16, 16), extent=(2π, 2π))
+grid = RectilinearGrid(arch; topology, size=(16 ÷ Nranks, 16), extent=(2π, 2π))
 c = CenterField(grid)
 
 f(x, y, z) = rand()
diff --git a/validation/distributed_simulations/mpi_shallow_water_turbulence.jl b/validation/distributed_simulations/mpi_shallow_water_turbulence.jl
index 00a0c449ec..7ada7cd29c 100644
--- a/validation/distributed_simulations/mpi_shallow_water_turbulence.jl
+++ b/validation/distributed_simulations/mpi_shallow_water_turbulence.jl
@@ -11,16 +11,16 @@ using Statistics
 using Oceananigans
 using Oceananigans.Distributed
 
-     ranks = (2, 2, 1)
-      topo = (Periodic, Periodic, Flat)
-      arch = MultiCPU(CPU(), ranks=ranks)
-      grid = RectilinearGrid(arch, topology=topo, size=(128, 128), extent=(4π, 4π), halo=(3, 3))
+ranks = (2, 2, 1)
+topo = (Periodic, Periodic, Flat)
+arch = DistributedArch(CPU(), ranks=ranks, topology=topo)
+grid = RectilinearGrid(arch, topology=topo, size=(128 ÷ ranks[1], 128 ÷ ranks[2]), extent=(4π, 4π), halo=(3, 3))
 local_rank = MPI.Comm_rank(MPI.COMM_WORLD)
 
 model = ShallowWaterModel(
                           grid = grid,
                    timestepper = :RungeKutta3,
-                     advection = UpwindBiasedFifthOrder(),
+            momentum_advection = UpwindBiasedFifthOrder(),
     gravitational_acceleration = 1.0
 )
 
@@ -31,7 +31,7 @@ uh₀ .-= mean(uh₀);
 set!(model, uh=uh₀, vh=uh₀)
 
 progress(sim) = @info "Iteration: $(sim.model.clock.iteration), time: $(sim.model.clock.time)"
-simulation = Simulation(model, Δt=0.001, stop_time=100.0, iteration_interval=1, progress=progress)
+simulation = Simulation(model, Δt=0.001, stop_time=100.0)
 
 uh, vh, h = model.solution
 outputs = (ζ=Field(∂x(vh/h) - ∂y(uh/h)),)

From 926103cbd0add332d3c64489a94c59726f94cd01 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 11 Jun 2023 20:22:37 -0400
Subject: [PATCH 296/530] eighth bugfix

---
 .../CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
index a7503f3e35..d5be1c273a 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
@@ -235,7 +235,7 @@ end
 
 @inline clip(x) = max(zero(x), x)
 
-function calculate_diffusivities!(diffusivities, closure::FlavorOfCATKE, model; parameters = KernelParameters(grid, closure))
+function calculate_diffusivities!(diffusivities, closure::FlavorOfCATKE, model; parameters = KernelParameters(model.grid, closure))
 
     arch = model.architecture
     grid = model.grid

From 1316dcae9a94b1f5b1d28610132b2f063349bfe8 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 11 Jun 2023 20:52:26 -0400
Subject: [PATCH 297/530] theth/eleventh/twelth (probably last) bugfix

---
 src/Fields/field_boundary_buffers.jl          | 14 +++++++---
 .../shallow_water_diffusion_operators.jl      |  2 +-
 src/TurbulenceClosures/TurbulenceClosures.jl  |  5 ++++
 src/TurbulenceClosures/closure_tuples.jl      |  2 --
 ...vective_adjustment_vertical_diffusivity.jl |  2 +-
 .../ri_based_vertical_diffusivity.jl          |  2 +-
 test/test_distributed_models.jl               | 28 +++++++++----------
 7 files changed, 32 insertions(+), 23 deletions(-)

diff --git a/src/Fields/field_boundary_buffers.jl b/src/Fields/field_boundary_buffers.jl
index 474a7d3e53..7310e76336 100644
--- a/src/Fields/field_boundary_buffers.jl
+++ b/src/Fields/field_boundary_buffers.jl
@@ -61,9 +61,7 @@ end
 
 using_buffered_communication(arch) = true
 
-const PassingBC = Union{MCBC, DCBC}
-
-function create_buffer_x(arch, grid, data, H, ::PassingBC) 
+function create_buffer_x(arch, grid, data, H, ::DCBC) 
     if !using_buffered_communication(arch)
         return nothing
     end
@@ -71,7 +69,7 @@ function create_buffer_x(arch, grid, data, H, ::PassingBC)
             recv = arch_array(arch, zeros(eltype(data), H, size(grid, 2), size(parent(data), 3))))    
 end
 
-function create_buffer_y(arch, grid, data, H, ::PassingBC)
+function create_buffer_y(arch, grid, data, H, ::DCBC)
     if !using_buffered_communication(arch)
         return nothing
     end
@@ -79,6 +77,14 @@ function create_buffer_y(arch, grid, data, H, ::PassingBC)
             recv = arch_array(arch, zeros(eltype(data), size(grid, 1), H, size(parent(data), 3))))
 end
 
+create_buffer_x(arch, grid, data, H, ::MCBC) = 
+           (send = arch_array(arch, zeros(eltype(data), H, size(parent(data), 2), size(parent(data), 3))), 
+            recv = arch_array(arch, zeros(eltype(data), H, size(parent(data), 2), size(parent(data), 3))))    
+
+create_buffer_y(arch, grid, data, H, ::MCBC) = 
+           (send = arch_array(arch, zeros(eltype(data), size(parent(data), 1), H, size(parent(data), 3))), 
+            recv = arch_array(arch, zeros(eltype(data), size(parent(data), 1), H, size(parent(data), 3))))
+
 Adapt.adapt_structure(to, buff::FieldBoundaryBuffers) =
     FieldBoundaryBuffers(Adapt.adapt(to, buff.west), 
                          Adapt.adapt(to, buff.east),    
diff --git a/src/Models/ShallowWaterModels/shallow_water_diffusion_operators.jl b/src/Models/ShallowWaterModels/shallow_water_diffusion_operators.jl
index 885f96deb6..908773bb8b 100644
--- a/src/Models/ShallowWaterModels/shallow_water_diffusion_operators.jl
+++ b/src/Models/ShallowWaterModels/shallow_water_diffusion_operators.jl
@@ -23,7 +23,7 @@ import Oceananigans.TurbulenceClosures:
 struct ShallowWaterScalarDiffusivity{V, X, N} <: AbstractScalarDiffusivity{ExplicitTimeDiscretization, ThreeDimensionalFormulation, N}
     ν :: V
     ξ :: X
-    ShallowWaterScalarDiffusivity{N}(ν::V, ξ::X) where {N, V, X} = new{N, V, X}(ν, ξ)
+    ShallowWaterScalarDiffusivity{N}(ν::V, ξ::X) where {N, V, X} = new{V, X, N}(ν, ξ)
 end
 
 """
diff --git a/src/TurbulenceClosures/TurbulenceClosures.jl b/src/TurbulenceClosures/TurbulenceClosures.jl
index 18c870956b..5b60ec7bcf 100644
--- a/src/TurbulenceClosures/TurbulenceClosures.jl
+++ b/src/TurbulenceClosures/TurbulenceClosures.jl
@@ -76,6 +76,11 @@ calculate_diffusivities!(K, closure::AbstractTurbulenceClosure, args...; kwargs.
 const ClosureKinda = Union{Nothing, AbstractTurbulenceClosure, AbstractArray{<:AbstractTurbulenceClosure}}
 add_closure_specific_boundary_conditions(closure::ClosureKinda, bcs, args...) = bcs
 
+import Oceananigans.Utils: KernelParameters
+
+KernelParameters(grid::AbstractGrid, closure) =
+        KernelParameters(κ_kernel_size(grid, closure), κ_kernel_offsets(grid, closure))
+
 # Interface for KE-based closures
 function shear_production end
 function buoyancy_flux end
diff --git a/src/TurbulenceClosures/closure_tuples.jl b/src/TurbulenceClosures/closure_tuples.jl
index 40a4a91e96..914b64d216 100644
--- a/src/TurbulenceClosures/closure_tuples.jl
+++ b/src/TurbulenceClosures/closure_tuples.jl
@@ -86,8 +86,6 @@ function add_closure_specific_boundary_conditions(closure_tuple::Tuple, bcs, arg
     return bcs
 end
 
-KernelParameters(grid::AbstractGrid, closure) = KernelParameters(κ_kernel_size(grid, closure), κ_kernel_offsets(grid, closure))
-
 @inline function κ_kernel_size(grid, closure_tuple::Tuple)
     kernel_size = (0, 0, 0)
     for closure in closure_tuple
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/convective_adjustment_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/convective_adjustment_vertical_diffusivity.jl
index 1ba58efaa6..ae0e2156ea 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/convective_adjustment_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/convective_adjustment_vertical_diffusivity.jl
@@ -88,7 +88,7 @@ DiffusivityFields(grid, tracer_names, bcs, closure::FlavorOfCAVD) = (; κᶜ = Z
 @inline viscosity(::FlavorOfCAVD, diffusivities) = diffusivities.κᵘ
 @inline diffusivity(::FlavorOfCAVD, diffusivities, id) = diffusivities.κᶜ
 
-function calculate_diffusivities!(diffusivities, closure::FlavorOfCAVD, model; parameters = KernelParameters(grid, closure))
+function calculate_diffusivities!(diffusivities, closure::FlavorOfCAVD, model; parameters = KernelParameters(model.grid, closure))
 
     arch = model.architecture
     grid = model.grid
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index 38bdc5b51f..181601bb26 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -139,7 +139,7 @@ function DiffusivityFields(grid, tracer_names, bcs, closure::FlavorOfRBVD)
     return (; κᶜ, κᵘ, Ri)
 end
 
-function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; parameters = KernelParameters(grid, closure))
+function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; parameters = KernelParameters(model.grid, closure))
     arch = model.architecture
     grid = model.grid
     clock = model.clock
diff --git a/test/test_distributed_models.jl b/test/test_distributed_models.jl
index 3d6cfc2e0b..221e66e047 100644
--- a/test/test_distributed_models.jl
+++ b/test/test_distributed_models.jl
@@ -211,7 +211,7 @@ end
 function test_triply_periodic_local_grid_with_411_ranks()
     topo = (Periodic, Periodic, Periodic)
     arch = DistributedArch(CPU(), ranks=(4, 1, 1), topology = topo)
-    local_grid = RectilinearGrid(arch, topology=topo, size=(8, 8, 8), extent=(1, 2, 3))
+    local_grid = RectilinearGrid(arch, topology=topo, size=(2, 8, 8), extent=(1, 2, 3))
 
     local_rank = MPI.Comm_rank(MPI.COMM_WORLD)
     nx, ny, nz = size(local_grid)
@@ -229,7 +229,7 @@ end
 function test_triply_periodic_local_grid_with_141_ranks()
     topo = (Periodic, Periodic, Periodic)
     arch = DistributedArch(CPU(), ranks=(1, 4, 1), topology = topo)
-    local_grid = RectilinearGrid(arch, topology=topo, size=(8, 8, 8), extent=(1, 2, 3))
+    local_grid = RectilinearGrid(arch, topology=topo, size=(8, 2, 8), extent=(1, 2, 3))
 
     local_rank = MPI.Comm_rank(MPI.COMM_WORLD)
     nx, ny, nz = size(local_grid)
@@ -247,7 +247,7 @@ end
 function test_triply_periodic_local_grid_with_114_ranks()
     topo = (Periodic, Periodic, Periodic)
     arch = DistributedArch(CPU(), ranks=(1, 1, 4), topology = topo)
-    local_grid = RectilinearGrid(arch, topology=topo, size=(8, 8, 8), extent=(1, 2, 3))
+    local_grid = RectilinearGrid(arch, topology=topo, size=(8, 8, 2), extent=(1, 2, 3))
     
     local_rank = MPI.Comm_rank(MPI.COMM_WORLD)
     nx, ny, nz = size(local_grid)
@@ -265,7 +265,7 @@ end
 function test_triply_periodic_local_grid_with_221_ranks()
     topo = (Periodic, Periodic, Periodic)
     arch = DistributedArch(CPU(), ranks=(2, 2, 1), topology = topo)
-    local_grid = RectilinearGrid(arch, topology=topo, size=(8, 8, 8), extent=(1, 2, 3))
+    local_grid = RectilinearGrid(arch, topology=topo, size=(4, 4, 8), extent=(1, 2, 3))
     
     i, j, k = arch.local_index
     nx, ny, nz = size(local_grid)
@@ -289,7 +289,7 @@ end
 function test_triply_periodic_bc_injection_with_411_ranks()
     topo = (Periodic, Periodic, Periodic)
     arch = DistributedArch(ranks=(4, 1, 1), topology=topo)
-    grid = RectilinearGrid(arch, topology=topo, size=(8, 8, 8), extent=(1, 2, 3))
+    grid = RectilinearGrid(arch, topology=topo, size=(2, 8, 8), extent=(1, 2, 3))
     model = NonhydrostaticModel(grid=grid)
 
     for field in merge(fields(model))
@@ -306,7 +306,7 @@ end
 function test_triply_periodic_bc_injection_with_141_ranks()
     topo = (Periodic, Periodic, Periodic)
     arch = DistributedArch(ranks=(1, 4, 1))
-    grid = RectilinearGrid(arch, topology=topo, size=(8, 8, 8), extent=(1, 2, 3))
+    grid = RectilinearGrid(arch, topology=topo, size=(8, 2, 8), extent=(1, 2, 3))
     model = NonhydrostaticModel(grid=grid)
 
     for field in merge(fields(model))
@@ -323,7 +323,7 @@ end
 function test_triply_periodic_bc_injection_with_114_ranks()
     topo = (Periodic, Periodic, Periodic)
     arch = DistributedArch(ranks=(1, 1, 4))
-    grid = RectilinearGrid(arch, topology=topo, size=(8, 8, 8), extent=(1, 2, 3))
+    grid = RectilinearGrid(arch, topology=topo, size=(8, 8, 2), extent=(1, 2, 3))
     model = NonhydrostaticModel(grid=grid)
 
     for field in merge(fields(model))
@@ -340,7 +340,7 @@ end
 function test_triply_periodic_bc_injection_with_221_ranks()
     topo = (Periodic, Periodic, Periodic)
     arch = DistributedArch(ranks=(2, 2, 1))
-    grid = RectilinearGrid(arch, topology=topo, size=(8, 8, 8), extent=(1, 2, 3))
+    grid = RectilinearGrid(arch, topology=topo, size=(4, 4, 8), extent=(1, 2, 3))
     model = NonhydrostaticModel(grid=grid)
 
     for field in merge(fields(model))
@@ -362,7 +362,7 @@ function test_triply_periodic_halo_communication_with_411_ranks(halo, child_arch
     topo = (Periodic, Periodic, Periodic)
     for use_buffers in (true , false)
         arch = DistributedArch(child_arch; ranks=(4, 1, 1), use_buffers, devices = (0, 0, 0, 0))
-        grid = RectilinearGrid(arch, topology=topo, size=(16, 6, 4), extent=(1, 2, 3), halo=halo)
+        grid = RectilinearGrid(arch, topology=topo, size=(4, 4, 4), extent=(1, 2, 3), halo=halo)
         model = NonhydrostaticModel(grid=grid)
 
         for field in merge(fields(model))
@@ -388,7 +388,7 @@ function test_triply_periodic_halo_communication_with_141_ranks(halo, child_arch
     topo  = (Periodic, Periodic, Periodic)
     for use_buffers in (true , false)
         arch = DistributedArch(child_arch; ranks=(1, 4, 1), use_buffers, devices = (0, 0, 0, 0))
-        grid  = RectilinearGrid(arch, topology=topo, size=(4, 16, 4), extent=(1, 2, 3), halo=halo)
+        grid  = RectilinearGrid(arch, topology=topo, size=(4, 4, 4), extent=(1, 2, 3), halo=halo)
         model = NonhydrostaticModel(grid=grid)
 
         for field in merge(fields(model), model.pressures)
@@ -411,8 +411,8 @@ end
 function test_triply_periodic_halo_communication_with_114_ranks(halo, child_arch)
     topo = (Periodic, Periodic, Periodic)
     for use_buffers in (true , false)
-        arch = DistributedArch(child_arch; ranks=(1, 4, 1), use_buffers, devices = (0, 0, 0, 0))
-        grid = RectilinearGrid(arch, topology=topo, size=(4, 4, 16), extent=(1, 2, 3), halo=halo)
+        arch = DistributedArch(child_arch; ranks=(1, 1, 4), use_buffers, devices = (0, 0, 0, 0))
+        grid = RectilinearGrid(arch, topology=topo, size=(4, 4, 4), extent=(1, 2, 3), halo=halo)
         model = NonhydrostaticModel(grid=grid)
 
         for field in merge(fields(model))
@@ -513,7 +513,7 @@ end
                 @info "Time-stepping a distributed NonhydrostaticModel with ranks $ranks..."
                 topo = (Periodic, Periodic, Periodic)
                 arch = DistributedArch(; ranks)
-                grid = RectilinearGrid(arch, topology=topo, size=(8, 8, 8), extent=(1, 2, 3))
+                grid = RectilinearGrid(arch, topology=topo, size=(8, 2, 8), extent=(1, 2, 3))
                 model = NonhydrostaticModel(; grid)
 
                 time_step!(model, 1)
@@ -533,7 +533,7 @@ end
             topo = (Periodic, Periodic, Flat)
             use_buffers = child_arch isa GPU ? true : false
             arch = DistributedArch(child_arch; ranks=(1, 4, 1), topology = topo, use_buffers, devices = (0, 0, 0, 0))
-            grid = RectilinearGrid(arch, topology=topo, size=(8, 8), extent=(1, 2), halo=(3, 3))
+            grid = RectilinearGrid(arch, topology=topo, size=(8, 2), extent=(1, 2), halo=(3, 3))
             model = ShallowWaterModel(; momentum_advection=nothing, mass_advection=nothing, tracer_advection=nothing, grid, gravitational_acceleration=1)
 
             set!(model, h=1)

From fc3b95ed27a8339792ade1f0f22ea1d368d7680e Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 11 Jun 2023 21:18:51 -0400
Subject: [PATCH 298/530] 13th bugfix + refactor recompute tendencies

---
 .../recompute_boundary_tendencies.jl          | 149 +++++++++---------
 .../CATKEVerticalDiffusivities.jl             |   2 +-
 2 files changed, 78 insertions(+), 73 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
index 538b08979c..c5f63d6a09 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
@@ -1,4 +1,6 @@
 import Oceananigans.Distributed: compute_boundary_tendencies!
+using Oceananigans.Utils: worktuple, offsets
+using Oceananigans.TurbulenceClosures: required_halo_size
 
 # We assume here that top/bottom BC are always synched (no partitioning in z)
 function compute_boundary_tendencies!(model::HydrostaticFreeSurfaceModel)
@@ -8,7 +10,7 @@ function compute_boundary_tendencies!(model::HydrostaticFreeSurfaceModel)
     # We need new values for `w`, `p` and `κ`
     recompute_auxiliaries!(model, grid, arch)
 
-    sizes, offsets = size_tendency_kernel(grid, arch)
+    kernel_parameters = boundary_tendency_kernel_parameters(grid, arch)
 
     u_immersed_bc = immersed_boundary_condition(model.velocities.u)
     v_immersed_bc = immersed_boundary_condition(model.velocities.v)
@@ -30,14 +32,16 @@ function compute_boundary_tendencies!(model::HydrostaticFreeSurfaceModel)
     u_kernel_args = tuple(start_momentum_kernel_args..., u_immersed_bc, end_momentum_kernel_args...)
     v_kernel_args = tuple(start_momentum_kernel_args..., v_immersed_bc, end_momentum_kernel_args...)
 
-    for (kernel_size, kernel_offsets) in zip(sizes, offsets)
-        launch!(arch, grid, KernelParameters(kernel_size, kernel_offsets),
+    for parameters in kernel_parameters
+        launch!(arch, grid, parameters,
                 calculate_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, kernel_offsets, grid, u_kernel_args)
     
-        launch!(arch, grid, KernelParameters(kernel_size, kernel_offsets),
+        launch!(arch, grid, parameters,
                 calculate_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, kernel_offsets, grid, v_kernel_args)
         
-        calculate_free_surface_tendency!(grid, model, KernelParameteres(kernel_size[1:2], kernel_offsets[1:2]))
+        η_parameters = KernelParameters(worktuple(parameters)[1:2], offsets(parameters)[1:2])
+
+        calculate_free_surface_tendency!(grid, model, η_parameters)
     end
 
     top_tracer_bcs = top_tracer_boundary_conditions(grid, model.tracers)
@@ -66,8 +70,8 @@ function compute_boundary_tendencies!(model::HydrostaticFreeSurfaceModel)
                      c_forcing,
                      model.clock)
 
-        for (kernel_size, kernel_offsets) in zip(sizes, offsets)
-            launch!(arch, grid, KernelParameters(kernel_size, kernel_offsets),
+        for parameters in kernel_parameters
+            launch!(arch, grid, parameters,
                     tendency_kernel!, c_tendency, kernel_offsets, grid, args)
         end
     end
@@ -75,116 +79,117 @@ end
 
 function recompute_auxiliaries!(model, grid, arch)
     
-    sizes, offs = size_w_kernel(grid, arch)
+    kernel_parameters = boundary_w_kernel_parameters(grid, arch)
 
-    for (kernel_size, kernel_offsets) in zip(sizes, offs)
-        compute_w_from_continuity!(model.velocities, arch, grid; parameters = KernelParameters(kernel_size, kernel_offsets))
+    for parameters in kernel_parameters
+        compute_w_from_continuity!(model.velocities, arch, grid; parameters)
     end
 
-    sizes, offs = size_p_kernel(grid, arch)
+    kernel_parameters = boundary_p_kernel_parameters(grid, arch)
 
-    for (kernel_size, kernel_offsets) in zip(sizes, offs)
-        update_hydrostatic_pressure!(model.pressure.pHY′, arch, grid, model.buoyancy, model.tracers; 
-                                     parameters = KernelParameters(kernel_size, kernel_offsets))
+    for parameters in kernel_parameters
+        update_hydrostatic_pressure!(model.pressure.pHY′, arch, grid, model.buoyancy, model.tracers; parameters)
     end
 
-    sizes, offs = size_κ_kernel(grid, arch)
+    kernel_parameters = boundary_κ_kernel_parameters(grid, model.closure, arch)
 
-    for (kernel_size, kernel_offsets) in zip(sizes, offs)
-        calculate_diffusivities!(model.diffusivity_fields, model.closure, model;
-                                 parameters = KernelParameters(kernel_size, kernel_offsets))
+    for parameters in kernel_parameters
+        calculate_diffusivities!(model.diffusivity_fields, model.closure, model; parameters)
     end
 end
 
-function size_w_kernel(grid, arch)
+# w needs computing in the range - H + 1 : 0 and N - 1 : N + H - 1
+function boundary_w_kernel_parameters(grid, arch)
     Nx, Ny, _ = size(grid)
     Hx, Hy, _ = halo_size(grid)
-    Rx, Ry, _ = arch.ranks
-
-    size_x = (Hx, Ny)
-    size_y = (Nx, Hy)
 
-    offsᴸx = (-Hx+1, 0)
-    offsᴸy = (0, -Hy+1)
-    offsᴿx = (Nx-1, 0)
-    offsᴿy = (0, Ny-1)
+    Sx  = (Hx, Ny)
+    Sy  = (Nx, Hy)
+             
+    Oᴸx = (-Hx+1, 0)
+    Oᴸy = (0, -Hy+1)
+    Oᴿx = (Nx-1,  0)
+    Oᴿy = (0,  Ny-1)
 
-    sizes = (size_x, size_y, size_x, size_y)
-    offs  = (offsᴸx, offsᴸy, offsᴿx, offsᴿy)
+    sizes = ( Sx,  Sy,  Sx,  Sy)
+    offs  = (Oᴸx, Oᴸy, Oᴿx, Oᴿy)
         
-    return return_correct_directions(Rx, Ry, sizes, offs, grid)
+    return communicating_boundaries(arch, sizes, offs, grid)
 end
 
-function size_p_kernel(grid, arch)
+# p needs computing in the range  0 : 0 and N + 1 : N + 1
+function boundary_p_kernel_parameters(grid, arch)
     Nx, Ny, _ = size(grid)
-    Rx, Ry, _ = arch.ranks
-
-    size_x = (1, Ny)
-    size_y = (Nx, 1)
 
-    offsᴸx = (-1, 0)
-    offsᴸy = (0, -1)
-    offsᴿx = (Nx, 0)
-    offsᴿy = (0, Ny)
+    Sx  = (1, Ny)
+    Sy  = (Nx, 1)
+             
+    Oᴸx = (-1, 0)
+    Oᴸy = (0, -1)
+    Oᴿx = (Nx, 0)
+    Oᴿy = (0, Ny)
 
-    sizes = (size_x, size_y, size_x, size_y)
-    offs  = (offsᴸx, offsᴸy, offsᴿx, offsᴿy)
+    sizes = ( Sx,  Sy,  Sx,  Sy)
+    offs  = (Oᴸx, Oᴸy, Oᴿx, Oᴿy)
         
-    return return_correct_directions(Rx, Ry, sizes, offs, grid)
+    return communicating_boundaries(arch, sizes, offs, grid)
 end
 
-function size_κ_kernel(grid, arch)
+# diffusivities need computing in the range 0 : B and N - B : N + 1
+function boundary_κ_kernel_parameters(grid, closure, arch)
     Nx, Ny, Nz = size(grid)
-    Hx, Hy, _ = halo_size(grid)
-    Rx, Ry, _  = arch.ranks
 
-    size_x = (Hx, Ny, Nz)
-    size_y = (Nx, Hy, Nz)
+    B = required_halo_size(closure)
 
-    offsᴸx = (-Hx+2, 0, 0)
-    offsᴸy = (0, -Hy+2, 0)
-    offsᴿx = (Nx-2,  0, 0)
-    offsᴿy = (0,  Ny-2, 0)
+    Sx  = (B+1, Ny, Nz)
+    Sy  = (Nx, B+1, Nz)
+        
+    Oᴸx = (-1, 0, 0)
+    Oᴸy = (0, -1, 0)
+    Oᴿx = (Nx-B,  0, 0)
+    Oᴿy = (0,  Ny-B, 0)
 
-    sizes = (size_x, size_y, size_x, size_y)
-    offs  = (offsᴸx, offsᴸy, offsᴿx, offsᴿy)
+    sizes = ( Sx,  Sy,  Sx,  Sy)
+    offs  = (Oᴸx, Oᴸy, Oᴿx, Oᴿy)
         
-    return return_correct_directions(Rx, Ry, sizes, offs, grid)
+    return communicating_boundaries(arch, sizes, offs, grid)
 end
 
-function size_tendency_kernel(grid, arch)
+# tendencies need computing in the range 1 : H and N - H + 1 : N 
+function boundary_tendency_kernel_parameters(grid, arch)
     Nx, Ny, Nz = size(grid)
     Hx, Hy, _  = halo_size(grid)
-    Rx, Ry, _  = arch.ranks
-    
-    size_x = (Hx, Ny, Nz)
-    size_y = (Nx, Hy, Nz)
     
-    offsᴸx = (0,  0,  0)
-    offsᴸy = (0,  0,  0)
-    offsᴿx = (Nx-Hx, 0,     0)
-    offsᴿy = (0,     Ny-Hy, 0)
-
-    sizes = (size_x, size_y, size_x, size_y)
-    offs  = (offsᴸx, offsᴸy, offsᴿx, offsᴿy)
+    Sx  = (Hx, Ny, Nz)
+    Sy  = (Nx, Hy, Nz)
+         
+    Oᴸx = (0,  0,  0)
+    Oᴸy = (0,  0,  0)
+    Oᴿx = (Nx-Hx, 0,     0)
+    Oᴿy = (0,     Ny-Hy, 0)
+
+    sizes = ( Sx,  Sy,  Sx,  Sy)
+    offs  = (Oᴸx, Oᴸy, Oᴿx, Oᴿy)
         
-    return return_correct_directions(Rx, Ry, sizes, offs, grid)
+    return communicating_boundaries(arch, sizes, offs, grid)
 end
 
 using Oceananigans.Operators: XFlatGrid, YFlatGrid
 
-function return_correct_directions(Rx, Ry, s, o, grid) 
+function communicating_boundaries(arch, S, O, grid) 
+    Rx, Ry, _ = arch.ranks
+
     include_x = !isa(grid, XFlatGrid) && (Rx != 1)
     include_y = !isa(grid, YFlatGrid) && (Ry != 1)
 
     if include_x && include_y
-        return s, o
+        return tuple(KernelParameters(S[i], O[i]) for i in 1:4)
     elseif include_x && !(include_y)
-        return (s[1], s[3]), (o[1], o[3])
+        return tuple(KernelParameters(S[i], O[i]) for i in 1:2:3)
     elseif !(include_x) && include_y
-        return (s[2], s[4]), (o[2], o[4])
+        return tuple(KernelParameters(S[i], O[i]) for i in 2:2:4)
     else
-        return (), ()
+        return ()
     end
 end
 
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
index d5be1c273a..6116a5c3f1 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
@@ -252,7 +252,7 @@ function calculate_diffusivities!(diffusivities, closure::FlavorOfCATKE, model;
     return nothing
 end
 
-@kernel function calculate_CATKE_diffusivities!(diffusivities, offs, grid, closure::FlavorOfCATKE, velocities, tracers, buoyancy, clock, top_tracer_bcs)
+@kernel function calculate_CATKE_diffusivities!(diffusivities, grid, closure::FlavorOfCATKE, velocities, tracers, buoyancy, clock, top_tracer_bcs)
 
     i, j, k = @index(Global, NTuple)
 

From a2118c82cfdd7cc324e44a22915161dcefbe7646 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 11 Jun 2023 21:21:42 -0400
Subject: [PATCH 299/530] add comment

---
 src/ImmersedBoundaries/active_cells_map.jl                     | 3 +++
 .../calculate_nonhydrostatic_tendencies.jl                     | 1 -
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/ImmersedBoundaries/active_cells_map.jl b/src/ImmersedBoundaries/active_cells_map.jl
index 9dbef8d9d0..347f0ac3ee 100644
--- a/src/ImmersedBoundaries/active_cells_map.jl
+++ b/src/ImmersedBoundaries/active_cells_map.jl
@@ -83,6 +83,9 @@ function active_cells_map_interior(ibg)
     return active_indices
 end
 
+# Cannot `findall` on very large grids, so we split the computation in levels.
+# This makes the computation a little heavier but avoids OOM errors (this computation
+# is performed only once on setup)
 function findall_active_indices!(active_indices, active_cells_field, ibg, IndicesType)
     
     for k in 1:size(ibg, 3)
diff --git a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
index 7c38ac5097..fb14329931 100644
--- a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
+++ b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
@@ -113,7 +113,6 @@ function calculate_interior_tendency_contributions!(model)
                      end_tracer_kernel_args...,
                      forcing, clock)
 
-
         launch!(arch, grid, :xyz, calculate_Gc!, 
                 c_tendency, grid, args;
                 only_active_cells)

From 7ec6916624f9b9913c983b36e12d12ccabe9f2fa Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 11 Jun 2023 21:25:04 -0400
Subject: [PATCH 300/530] comments

---
 src/ImmersedBoundaries/active_cells_map.jl | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/ImmersedBoundaries/active_cells_map.jl b/src/ImmersedBoundaries/active_cells_map.jl
index 347f0ac3ee..7eefc0221a 100644
--- a/src/ImmersedBoundaries/active_cells_map.jl
+++ b/src/ImmersedBoundaries/active_cells_map.jl
@@ -58,7 +58,7 @@ end
 function compute_active_cells_surface(ibg)
     one_field = ConditionalOperation{Center, Center, Center}(OneField(Int), identity, ibg, NotImmersed(truefunc), 0.0)
     column    = sum(one_field, dims = 3)
-    is_immersed_column = KernelFunctionOperation{Center, Center, Nothing}(active_column, ibg, computed_dependencies = (column, ))
+    is_immersed_column = KernelFunctionOperation{Center, Center, Nothing}(active_column, ibg, column)
     active_cells_field = Field{Center, Center, Nothing}(ibg, Bool)
     set!(active_cells_field, is_immersed_column)
     return active_cells_field
@@ -106,6 +106,8 @@ end
 
 @inline add_3rd_index(t::Tuple, k) = (t[1], t[2], k) 
 
+# If we eventually want to perform also barotropic step, `w` computation and `p` 
+# computation only on active `columns`
 function active_cells_map_surface(ibg)
     active_cells_field = compute_active_cells_surface(ibg)
     interior_cells     = arch_array(CPU(), interior(active_cells_field, :, :, 1))

From 9de5af41d585fa5f4a1edee37377b164d26b1585 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 11 Jun 2023 21:56:15 -0400
Subject: [PATCH 301/530] another batch of bugfixes

---
 src/Distributed/halo_communication.jl         |  5 ----
 src/Distributed/interleave_comm_and_comp.jl   | 28 +++++++------------
 ...ate_hydrostatic_free_surface_tendencies.jl | 20 ++++---------
 .../compute_w_from_continuity.jl              | 23 +++++----------
 .../update_hydrostatic_pressure.jl            | 20 ++++---------
 .../shallow_water_diffusion_operators.jl      |  4 +--
 .../turbulence_closure_utils.jl               |  4 +--
 7 files changed, 32 insertions(+), 72 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 0f55997c12..318e58a777 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -193,11 +193,6 @@ function fill_corners!(connectivity, c, indices, loc, arch, grid, buffers, args.
     return nothing
 end
 
-@inline mpi_communication_side(::Val{fill_southwest_halo!}) = :southwest
-@inline mpi_communication_side(::Val{fill_southeast_halo!}) = :southeast
-@inline mpi_communication_side(::Val{fill_northwest_halo!}) = :northwest
-@inline mpi_communication_side(::Val{fill_northeast_halo!}) = :northeast
-
 @inline mpi_communication_side(::Val{fill_west_and_east_halo!})   = :west_and_east
 @inline mpi_communication_side(::Val{fill_south_and_north_halo!}) = :south_and_north
 
diff --git a/src/Distributed/interleave_comm_and_comp.jl b/src/Distributed/interleave_comm_and_comp.jl
index a5b8c3093f..eacb0606b3 100644
--- a/src/Distributed/interleave_comm_and_comp.jl
+++ b/src/Distributed/interleave_comm_and_comp.jl
@@ -18,32 +18,24 @@ end
 complete_communication_and_compute_boundary!(model, ::DistributedGrid, ::BlockingDistributedArch) = nothing
 compute_boundary_tendencies!(model) = nothing
 
-interior_tendency_kernel_size(grid::DistributedGrid)    = interior_tendency_kernel_size(grid,    architecture(grid))
-interior_tendency_kernel_offsets(grid::DistributedGrid) = interior_tendency_kernel_offsets(grid, architecture(grid))
+interior_tendency_kernel_parameters(grid::DistributedGrid) = 
+            interior_tendency_kernel_parameters(grid, architecture(grid))
 
-interior_tendency_kernel_size(grid, ::BlockingDistributedArch) = :xyz
-interior_tendency_kernel_offsets(grid, ::BlockingDistributedArch) = (0, 0, 0)
+interior_tendency_kernel_parameters(grid, ::BlockingDistributedArch) = :xyz
 
-function interior_tendency_kernel_size(grid, arch)
+function interior_tendency_kernel_parameters(grid, arch)
     Rx, Ry, _ = arch.ranks
     Hx, Hy, _ = halo_size(grid)
 
     Nx, Ny, Nz = size(grid)
     
-    Ax = Rx == 1 ? 0 : Hx
-    Ay = Ry == 1 ? 0 : Hy
+    Sx = Rx == 1 ? 0 : Hx
+    Sy = Ry == 1 ? 0 : Hy
 
-    return (Nx-2Ax, Ny-2Ay, Nz)
-end
-
-function interior_tendency_kernel_offsets(grid, arch)
-    Rx, Ry, _ = arch.ranks
-    Hx, Hy, _ = halo_size(grid)
-    
-    Ax = Rx == 1 ? 0 : Hx
-    Ay = Ry == 1 ? 0 : Hy
-
-    return (Ax, Ay, 0)
+    Ox = Rx == 1 ? 0 : Hx
+    Oy = Ry == 1 ? 0 : Hy
+     
+    return KernelParameters((Nx-2Ax, Ny-2Ay, Nz), (Ax, Ay, 0))
 end
 
 """
diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index b89293609b..42a59eb711 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -11,7 +11,7 @@ import Oceananigans.TimeSteppers: compute_tendencies!
 import Oceananigans: tracer_tendency_kernel_function
 
 import Oceananigans.Distributed: complete_communication_and_compute_boundary!
-import Oceananigans.Distributed: interior_tendency_kernel_size, interior_tendency_kernel_offsets
+import Oceananigans.Distributed: interior_tendency_kernel_parameters
 
 using Oceananigans.ImmersedBoundaries: use_only_active_interior_cells, ActiveCellsIBG, active_linear_index_to_interior_tuple
 
@@ -50,6 +50,7 @@ function compute_tendencies!(model::HydrostaticFreeSurfaceModel, callbacks)
 end
 
 complete_communication_and_compute_boundary!(model, grid, arch) = nothing
+interior_tendency_kernel_parameters(grid) = :xyz
 
 using Oceananigans.TurbulenceClosures.CATKEVerticalDiffusivities: FlavorOfCATKE
 using Oceananigans.TurbulenceClosures.MEWSVerticalDiffusivities: MEWS
@@ -95,11 +96,7 @@ function calculate_hydrostatic_free_surface_interior_tendency_contributions!(mod
 
     top_tracer_bcs = top_tracer_boundary_conditions(grid, model.tracers)
     only_active_cells = use_only_active_interior_cells(grid)
-
-    kernel_size = interior_tendency_kernel_size(grid)
-    kernel_offsets = interior_tendency_kernel_offsets(grid)
-
-    kernel_parameters = KernelParameters(kernel_size, kernel_offsets)
+    kernel_parameters = interior_tendency_kernel_parameters(grid)
 
     for (tracer_index, tracer_name) in enumerate(propertynames(model.tracers))
         c_tendency    = model.timestepper.Gⁿ[tracer_name]
@@ -165,9 +162,6 @@ function calculate_free_surface_tendency!(grid, model, kernel_parameters)
 
     return nothing
 end
-    
-interior_tendency_kernel_size(grid)    = :xyz
-interior_tendency_kernel_offsets(grid) = (0, 0, 0)
 
 """ Calculate momentum tendencies if momentum is not prescribed."""
 function calculate_hydrostatic_momentum_tendencies!(model, velocities)
@@ -196,12 +190,8 @@ function calculate_hydrostatic_momentum_tendencies!(model, velocities)
     v_kernel_args = tuple(start_momentum_kernel_args..., v_immersed_bc, end_momentum_kernel_args...)
     
     only_active_cells = use_only_active_interior_cells(grid)
-
-    kernel_size = interior_tendency_kernel_size(grid)
-    kernel_offsets = interior_tendency_kernel_offsets(grid)
+    kernel_parameters = interior_tendency_kernel_parameters(grid)
     
-    kernel_parameters = KernelParameters(kernel_size, kernel_offsets)
-
     launch!(arch, grid, kernel_parameters,
             calculate_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, grid, u_kernel_args;
             only_active_cells)
@@ -210,7 +200,7 @@ function calculate_hydrostatic_momentum_tendencies!(model, velocities)
             calculate_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, grid, v_kernel_args;
             only_active_cells)
 
-    calculate_free_surface_tendency!(grid, model, KernelParameters(:xy, (0, 0)))
+    calculate_free_surface_tendency!(grid, model, :xy)
 
     return nothing
 end
diff --git a/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl b/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
index 22ad1c6c93..10aa393688 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
@@ -13,7 +13,7 @@ w^{n+1} = -∫ [∂/∂x (u^{n+1}) + ∂/∂y (v^{n+1})] dz
 """
 compute_w_from_continuity!(model) = compute_w_from_continuity!(model.velocities, model.architecture, model.grid)
 
-compute_w_from_continuity!(velocities, arch, grid; parameters = KernelParameters(w_kernel_size(grid), w_kernel_offsets(grid))) = 
+compute_w_from_continuity!(velocities, arch, grid; parameters = KernelParameters(w_kernel_parameters(grid))) = 
     launch!(arch, grid, parameters, _compute_w_from_continuity!, velocities, grid)
 
 @kernel function _compute_w_from_continuity!(U, grid)
@@ -35,26 +35,17 @@ end
 using Oceananigans.Operators: XFlatGrid, YFlatGrid
 using Oceananigans.Grids: topology
 
-@inline function w_kernel_size(grid) 
+@inline function w_kernel_parameters(grid) 
     Nx, Ny, _ = size(grid)
     Hx, Hy, _ = halo_size(grid)
 
     Tx, Ty, _ = topology(grid)
 
-    Ax = Tx == Flat ? Nx : Nx + 2Hx - 2 
-    Ay = Ty == Flat ? Ny : Ny + 2Hy - 2 
+    Sx = Tx == Flat ? Nx : Nx + 2Hx - 2 
+    Sy = Ty == Flat ? Ny : Ny + 2Hy - 2 
 
-    return (Ax, Ay)
-end
-
-@inline function w_kernel_offsets(grid)
-    Hx, Hy, _ = halo_size(grid)
+    Ox = Tx == Flat ? 0 : - Hx + 1 
+    Oy = Ty == Flat ? 0 : - Hy + 1 
 
-    Tx, Ty, _ = topology(grid)
-
-    Ax = Tx == Flat ? 0 : - Hx + 1 
-    Ay = Ty == Flat ? 0 : - Hy + 1 
-
-    return (Ax, Ay)
+    return KernelParameters((Ax, Ay), (Ox, Oy))
 end
-
diff --git a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
index 23aaec8970..68f36fea8b 100644
--- a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
+++ b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
@@ -34,24 +34,16 @@ update_hydrostatic_pressure!(pHY′, arch, grid, buoyancy, tracers; parameters =
 using Oceananigans.Grids: topology
 
 # extend p kernel to compute also the boundaries
-@inline function p_kernel_size(grid) 
+@inline function p_kernel_parameters(grid) 
     Nx, Ny, _ = size(grid)
 
     TX, TY, _ = topology(grid)
 
-    Ax = TX == Flat ? Nx : Nx + 2 
-    Ay = TY == Flat ? Ny : Ny + 2 
+    Sx = TX == Flat ? Nx : Nx + 2 
+    Sy = TY == Flat ? Ny : Ny + 2 
 
-    return (Ax, Ay)
-end
-
-@inline function p_kernel_offsets(grid)
-    TX, TY, _ = topology(grid)
-
-    Ax = TX == Flat ? 0 : - 1 
-    Ay = TY == Flat ? 0 : - 1 
+    Ox = TX == Flat ? 0 : - 1 
+    Oy = TY == Flat ? 0 : - 1 
 
-    return (Ax, Ay)
+    return KernelParameters((Sx, Sy), (Ox, Oy))
 end
-        
-        
diff --git a/src/Models/ShallowWaterModels/shallow_water_diffusion_operators.jl b/src/Models/ShallowWaterModels/shallow_water_diffusion_operators.jl
index 908773bb8b..4f17ed01a9 100644
--- a/src/Models/ShallowWaterModels/shallow_water_diffusion_operators.jl
+++ b/src/Models/ShallowWaterModels/shallow_water_diffusion_operators.jl
@@ -50,8 +50,8 @@ end
 with_tracers(tracers, closure::ShallowWaterScalarDiffusivity) = closure
 viscosity(closure::ShallowWaterScalarDiffusivity, K) = closure.ν
 
-Adapt.adapt_structure(to, closure::ShallowWaterScalarDiffusivity) = 
-    ShallowWaterScalarDiffusivity(Adapt.adapt(to, closure.ν), Adapt.adapt(to, closure.ξ))
+Adapt.adapt_structure(to, closure::ShallowWaterScalarDiffusivity{B}) where B = 
+    ShallowWaterScalarDiffusivity{B}(Adapt.adapt(to, closure.ν), Adapt.adapt(to, closure.ξ))
 
 # The diffusivity for the shallow water model is calculated as h*ν in order to have a viscous term in the form
 # h⁻¹ ∇ ⋅ (hν t) where t is the 2D stress tensor plus a trace => t = ∇u + (∇u)ᵀ - ξI⋅(∇⋅u)
diff --git a/src/TurbulenceClosures/turbulence_closure_utils.jl b/src/TurbulenceClosures/turbulence_closure_utils.jl
index 68019853b0..4d00d5c89a 100644
--- a/src/TurbulenceClosures/turbulence_closure_utils.jl
+++ b/src/TurbulenceClosures/turbulence_closure_utils.jl
@@ -43,7 +43,7 @@ end
 
     Ax = Tx == Flat ? Nx : Nx + 2B 
     Ay = Ty == Flat ? Ny : Ny + 2B 
-    Az = Tz == Flat ? Nz : Nz + 2B 
+    Az = Tz == Flat ? Nz : Nz + 2
 
     return (Ax, Ay, Az)
 end
@@ -53,7 +53,7 @@ end
 
     Ax = Tx == Flat ? 0 : - B
     Ay = Ty == Flat ? 0 : - B 
-    Az = Tz == Flat ? 0 : - B 
+    Az = Tz == Flat ? 0 : - 1
 
     return (Ax, Ay, Az)
 end

From 984d0a14359fdb7de1ee1e2e38acb0809c98b15c Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 11 Jun 2023 22:15:55 -0400
Subject: [PATCH 302/530] other (maybe 20th) bugfix

---
 src/Distributed/distributed_utils.jl                 |  5 +++--
 src/Distributed/interleave_comm_and_comp.jl          |  6 +++---
 .../compute_w_from_continuity.jl                     | 12 ++++--------
 .../update_hydrostatic_pressure.jl                   |  9 ++++-----
 src/Simulations/time_step_wizard.jl                  |  6 +-----
 5 files changed, 15 insertions(+), 23 deletions(-)

diff --git a/src/Distributed/distributed_utils.jl b/src/Distributed/distributed_utils.jl
index edce46915a..bb997cbac9 100644
--- a/src/Distributed/distributed_utils.jl
+++ b/src/Distributed/distributed_utils.jl
@@ -4,10 +4,11 @@ using Oceananigans.Grids:
     left_halo_indices, right_halo_indices,
     underlying_left_halo_indices, underlying_right_halo_indices
 
-
-all_reduce(val, grid; op = +) = 
+all_reduce(val, grid::DistributedGrid; op = +) = 
     MPI.Allreduce(val, op, grid.architecture.communicator)
 
+all_reduce(val, grid; kwargs...) = val
+
 # TODO: Move to Grids/grid_utils.jl
 
 #####
diff --git a/src/Distributed/interleave_comm_and_comp.jl b/src/Distributed/interleave_comm_and_comp.jl
index eacb0606b3..16c929213f 100644
--- a/src/Distributed/interleave_comm_and_comp.jl
+++ b/src/Distributed/interleave_comm_and_comp.jl
@@ -29,13 +29,13 @@ function interior_tendency_kernel_parameters(grid, arch)
 
     Nx, Ny, Nz = size(grid)
     
-    Sx = Rx == 1 ? 0 : Hx
-    Sy = Ry == 1 ? 0 : Hy
+    Sx = Rx == 1 ? Nx : Nx - 2Hx
+    Sy = Ry == 1 ? Ny : Nx - 2Hy
 
     Ox = Rx == 1 ? 0 : Hx
     Oy = Ry == 1 ? 0 : Hy
      
-    return KernelParameters((Nx-2Ax, Ny-2Ay, Nz), (Ax, Ay, 0))
+    return KernelParameters((Sx, Sy, Nz), (Ox, Oy, 0))
 end
 
 """
diff --git a/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl b/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
index 10aa393688..dd1e3142a4 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
@@ -1,6 +1,7 @@
 using Oceananigans.Architectures: device
+using Oceananigans.Grids: halo_size, topology
 using Oceananigans.Operators: div_xyᶜᶜᶜ, Δzᶜᶜᶜ
-using Oceananigans.Grids: halo_size
+using Oceananigans.Operators: XFlatGrid, YFlatGrid
 
 """
     compute_w_from_continuity!(model)
@@ -13,7 +14,7 @@ w^{n+1} = -∫ [∂/∂x (u^{n+1}) + ∂/∂y (v^{n+1})] dz
 """
 compute_w_from_continuity!(model) = compute_w_from_continuity!(model.velocities, model.architecture, model.grid)
 
-compute_w_from_continuity!(velocities, arch, grid; parameters = KernelParameters(w_kernel_parameters(grid))) = 
+compute_w_from_continuity!(velocities, arch, grid; parameters = w_kernel_parameters(grid)) = 
     launch!(arch, grid, parameters, _compute_w_from_continuity!, velocities, grid)
 
 @kernel function _compute_w_from_continuity!(U, grid)
@@ -31,14 +32,9 @@ end
 
 # extend w kernel to compute also the boundaries
 # If Flat, do not calculate on halos!
-
-using Oceananigans.Operators: XFlatGrid, YFlatGrid
-using Oceananigans.Grids: topology
-
 @inline function w_kernel_parameters(grid) 
     Nx, Ny, _ = size(grid)
     Hx, Hy, _ = halo_size(grid)
-
     Tx, Ty, _ = topology(grid)
 
     Sx = Tx == Flat ? Nx : Nx + 2Hx - 2 
@@ -47,5 +43,5 @@ using Oceananigans.Grids: topology
     Ox = Tx == Flat ? 0 : - Hx + 1 
     Oy = Ty == Flat ? 0 : - Hy + 1 
 
-    return KernelParameters((Ax, Ay), (Ox, Oy))
+    return KernelParameters((Sx, Sy), (Ox, Oy))
 end
diff --git a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
index 68f36fea8b..c6f33cf2a6 100644
--- a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
+++ b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
@@ -1,5 +1,7 @@
 using Oceananigans.Operators: Δzᶜᶜᶜ, Δzᶜᶜᶠ
 using Oceananigans.ImmersedBoundaries: PartialCellBottom, ImmersedBoundaryGrid
+using Oceananigans.Grids: topology
+using Oceananigans.Operators: XFlatGrid, YFlatGrid
 
 """
 Update the hydrostatic pressure perturbation pHY′. This is done by integrating
@@ -25,18 +27,15 @@ update_hydrostatic_pressure!(grid, model) = update_hydrostatic_pressure!(model.p
 const PCB = PartialCellBottom
 const PCBIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:PCB}
 
-update_hydrostatic_pressure!(pHY′, arch, ibg::PCBIBG, buoyancy, tracers; parameters = KernelParameters(p_kernel_size(grid), p_kernel_offsets(grid))) =
+update_hydrostatic_pressure!(pHY′, arch, ibg::PCBIBG, buoyancy, tracers; parameters = p_kernel_parameters(grid)) =
     update_hydrostatic_pressure!(pHY′, arch, ibg.underlying_grid, buoyancy, tracers; parameters)
 
-update_hydrostatic_pressure!(pHY′, arch, grid, buoyancy, tracers; parameters = KernelParameters(p_kernel_size(grid), p_kernel_offsets(grid))) =  
+update_hydrostatic_pressure!(pHY′, arch, grid, buoyancy, tracers; parameters = p_kernel_parameters(grid)) =  
     launch!(arch, grid, parameters, _update_hydrostatic_pressure!, pHY′, grid, buoyancy, tracers)
 
-using Oceananigans.Grids: topology
-
 # extend p kernel to compute also the boundaries
 @inline function p_kernel_parameters(grid) 
     Nx, Ny, _ = size(grid)
-
     TX, TY, _ = topology(grid)
 
     Sx = TX == Flat ? Nx : Nx + 2 
diff --git a/src/Simulations/time_step_wizard.jl b/src/Simulations/time_step_wizard.jl
index dc241aea15..bd50eb1270 100644
--- a/src/Simulations/time_step_wizard.jl
+++ b/src/Simulations/time_step_wizard.jl
@@ -79,7 +79,6 @@ function TimeStepWizard(FT=Float64;
 end
 
 using Oceananigans.Grids: topology
-using Oceananigans.Distributed
 using Oceananigans.Distributed: all_reduce
 
 """
@@ -99,10 +98,7 @@ function new_time_step(old_Δt, wizard, model)
     new_Δt = min(wizard.max_change * old_Δt, new_Δt)
     new_Δt = max(wizard.min_change * old_Δt, new_Δt)
     new_Δt = clamp(new_Δt, wizard.min_Δt, wizard.max_Δt)
-
-    if model.architecture isa DistributedArch
-        new_Δt = all_reduce(new_Δt, model.grid; op = min)
-    end
+    new_Δt = all_reduce(new_Δt, model.grid; op = min)
 
     return new_Δt
 end

From b362263f51fa87013c8ba0ffa70d0e1b6d8c590f Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 11 Jun 2023 22:36:05 -0400
Subject: [PATCH 303/530] more simplification

---
 src/Distributed/distributed_utils.jl          |   5 -
 src/Distributed/partition_assemble.jl         |   5 +
 ...ate_hydrostatic_free_surface_tendencies.jl |  47 ++++---
 .../recompute_boundary_tendencies.jl          | 132 +++++-------------
 4 files changed, 63 insertions(+), 126 deletions(-)

diff --git a/src/Distributed/distributed_utils.jl b/src/Distributed/distributed_utils.jl
index bb997cbac9..b91a951522 100644
--- a/src/Distributed/distributed_utils.jl
+++ b/src/Distributed/distributed_utils.jl
@@ -4,11 +4,6 @@ using Oceananigans.Grids:
     left_halo_indices, right_halo_indices,
     underlying_left_halo_indices, underlying_right_halo_indices
 
-all_reduce(val, grid::DistributedGrid; op = +) = 
-    MPI.Allreduce(val, op, grid.architecture.communicator)
-
-all_reduce(val, grid; kwargs...) = val
-
 # TODO: Move to Grids/grid_utils.jl
 
 #####
diff --git a/src/Distributed/partition_assemble.jl b/src/Distributed/partition_assemble.jl
index 3590f1bc10..0ced672f97 100644
--- a/src/Distributed/partition_assemble.jl
+++ b/src/Distributed/partition_assemble.jl
@@ -1,5 +1,10 @@
 using Oceananigans.Architectures: arch_array
 
+all_reduce(val, grid::DistributedGrid; op = +) = 
+    MPI.Allreduce(val, op, grid.architecture.communicator)
+
+all_reduce(val, grid; kwargs...) = val
+
 """
     concatenate_local_sizes(n, arch::DistributedArch) 
 
diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index 42a59eb711..554f049c5e 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -23,9 +23,11 @@ contribution from non-hydrostatic pressure.
 """
 function compute_tendencies!(model::HydrostaticFreeSurfaceModel, callbacks)
 
+    kernel_parameters = tuple(interior_tendency_kernel_parameters(grid))
+
     # Calculate contributions to momentum and tracer tendencies from fluxes and volume terms in the
     # interior of the domain
-    calculate_hydrostatic_free_surface_interior_tendency_contributions!(model)
+    calculate_hydrostatic_free_surface_tendency_contributions!(model, kernel_parameters)
     complete_communication_and_compute_boundary!(model, model.grid, model.architecture)
 
     # Calculate contributions to momentum and tracer tendencies from user-prescribed fluxes across the
@@ -87,22 +89,21 @@ top_tracer_boundary_conditions(grid, tracers) =
     NamedTuple(c => tracers[c].boundary_conditions.top for c in propertynames(tracers))
 
 """ Store previous value of the source term and calculate current source term. """
-function calculate_hydrostatic_free_surface_interior_tendency_contributions!(model)
+function calculate_hydrostatic_free_surface_tendency_contributions!(model, kernel_parameters)
 
     arch = model.architecture
     grid = model.grid
 
-    calculate_hydrostatic_momentum_tendencies!(model, model.velocities)
+    calculate_hydrostatic_momentum_tendencies!(model, model.velocities, kernel_parameters)
 
     top_tracer_bcs = top_tracer_boundary_conditions(grid, model.tracers)
     only_active_cells = use_only_active_interior_cells(grid)
-    kernel_parameters = interior_tendency_kernel_parameters(grid)
 
     for (tracer_index, tracer_name) in enumerate(propertynames(model.tracers))
-        c_tendency    = model.timestepper.Gⁿ[tracer_name]
-        c_advection   = model.advection[tracer_name]
-        c_forcing     = model.forcing[tracer_name]
-        c_immersed_bc = immersed_boundary_condition(model.tracers[tracer_name])
+        @inbounds c_tendency    = model.timestepper.Gⁿ[tracer_name]
+        @inbounds c_advection   = model.advection[tracer_name]
+        @inbounds c_forcing     = model.forcing[tracer_name]
+        @inbounds c_immersed_bc = immersed_boundary_condition(model.tracers[tracer_name])
 
         tendency_kernel!, closure, diffusivity = tracer_tendency_kernel_function(model, Val(tracer_name), model.closure, model.diffusivity_fields)
 
@@ -122,12 +123,14 @@ function calculate_hydrostatic_free_surface_interior_tendency_contributions!(mod
                      c_forcing,
                      model.clock)
 
-        launch!(arch, grid, kernel_parameters,
-                tendency_kernel!,
-                c_tendency,
-                grid,
-                args;
-                only_active_cells)
+        for parameters in kernel_parameters
+            launch!(arch, grid, parameters,
+                    tendency_kernel!,
+                    c_tendency,
+                    grid,
+                    args;
+                    only_active_cells)
+        end
     end
 
     return nothing
@@ -164,7 +167,7 @@ function calculate_free_surface_tendency!(grid, model, kernel_parameters)
 end
 
 """ Calculate momentum tendencies if momentum is not prescribed."""
-function calculate_hydrostatic_momentum_tendencies!(model, velocities)
+function calculate_hydrostatic_momentum_tendencies!(model, velocities, kernel_parameters)
 
     grid = model.grid
     arch = architecture(grid)
@@ -192,13 +195,15 @@ function calculate_hydrostatic_momentum_tendencies!(model, velocities)
     only_active_cells = use_only_active_interior_cells(grid)
     kernel_parameters = interior_tendency_kernel_parameters(grid)
     
-    launch!(arch, grid, kernel_parameters,
-            calculate_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, grid, u_kernel_args;
-            only_active_cells)
+    for parameters in kernel_parameters
+        launch!(arch, grid, parameters,
+                calculate_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, grid, u_kernel_args;
+                only_active_cells)
 
-    launch!(arch, grid, kernel_parameters,
-            calculate_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, grid, v_kernel_args;
-            only_active_cells)
+        launch!(arch, grid, parameters,
+                calculate_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, grid, v_kernel_args;
+                only_active_cells)
+    end
 
     calculate_free_surface_tendency!(grid, model, :xy)
 
diff --git a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
index c5f63d6a09..1222f29869 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
@@ -10,91 +10,41 @@ function compute_boundary_tendencies!(model::HydrostaticFreeSurfaceModel)
     # We need new values for `w`, `p` and `κ`
     recompute_auxiliaries!(model, grid, arch)
 
+    # parameters for communicating North / South / East / West side
     kernel_parameters = boundary_tendency_kernel_parameters(grid, arch)
+    calculate_hydrostatic_free_surface_tendency_contributions!(model, kernel_parameters)
 
-    u_immersed_bc = immersed_boundary_condition(model.velocities.u)
-    v_immersed_bc = immersed_boundary_condition(model.velocities.v)
-
-    start_momentum_kernel_args = (model.advection.momentum,
-                                  model.coriolis,
-                                  model.closure)
-
-    end_momentum_kernel_args = (model.velocities,
-                                model.free_surface,
-                                model.tracers,
-                                model.buoyancy,
-                                model.diffusivity_fields,
-                                model.pressure.pHY′,
-                                model.auxiliary_fields,
-                                model.forcing,
-                                model.clock)
-
-    u_kernel_args = tuple(start_momentum_kernel_args..., u_immersed_bc, end_momentum_kernel_args...)
-    v_kernel_args = tuple(start_momentum_kernel_args..., v_immersed_bc, end_momentum_kernel_args...)
-
-    for parameters in kernel_parameters
-        launch!(arch, grid, parameters,
-                calculate_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, kernel_offsets, grid, u_kernel_args)
-    
-        launch!(arch, grid, parameters,
-                calculate_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, kernel_offsets, grid, v_kernel_args)
-        
-        η_parameters = KernelParameters(worktuple(parameters)[1:2], offsets(parameters)[1:2])
+    return nothing
+end
 
-        calculate_free_surface_tendency!(grid, model, η_parameters)
-    end
+# tendencies need computing in the range 1 : H and N - H + 1 : N 
+function boundary_tendency_kernel_parameters(grid, arch)
+    Nx, Ny, Nz = size(grid)
+    Hx, Hy, _  = halo_size(grid)
+    
+    Sx  = (Hx, Ny, Nz)
+    Sy  = (Nx, Hy, Nz)
+         
+    Oᴸ  = (0,  0,  0)
+    Oᴿx = (Nx-Hx, 0,     0)
+    Oᴿy = (0,     Ny-Hy, 0)
 
-    top_tracer_bcs = top_tracer_boundary_conditions(grid, model.tracers)
-
-    for (tracer_index, tracer_name) in enumerate(propertynames(model.tracers))
-        @inbounds c_tendency = model.timestepper.Gⁿ[tracer_name]
-        @inbounds c_advection = model.advection[tracer_name]
-        @inbounds c_forcing = model.forcing[tracer_name]
-        @inbounds c_immersed_bc = immersed_boundary_condition(model.tracers[tracer_name])
-
-        tendency_kernel!, closure, diffusivity = tracer_tendency_kernel_function(model, Val(tracer_name), model.closure, model.diffusivity_fields)
-
-        args = tuple(Val(tracer_index),
-                     Val(tracer_name),
-                     c_advection,
-                     closure,
-                     c_immersed_bc,
-                     model.buoyancy,
-                     model.biogeochemistry,
-                     model.velocities,
-                     model.free_surface,
-                     model.tracers,
-                     top_tracer_bcs,
-                     diffusivity,
-                     model.auxiliary_fields,
-                     c_forcing,
-                     model.clock)
-
-        for parameters in kernel_parameters
-            launch!(arch, grid, parameters,
-                    tendency_kernel!, c_tendency, kernel_offsets, grid, args)
-        end
-    end
+    sizes = (Sx, Sy,  Sx,  Sy)
+    offs  = (Oᴸ, Oᴸ, Oᴿx, Oᴿy)
+        
+    return boundary_parameters(sizes, offs, grid, arch)
 end
 
 function recompute_auxiliaries!(model, grid, arch)
     
-    kernel_parameters = boundary_w_kernel_parameters(grid, arch)
-
-    for parameters in kernel_parameters
-        compute_w_from_continuity!(model.velocities, arch, grid; parameters)
-    end
-
-    kernel_parameters = boundary_p_kernel_parameters(grid, arch)
-
-    for parameters in kernel_parameters
-        update_hydrostatic_pressure!(model.pressure.pHY′, arch, grid, model.buoyancy, model.tracers; parameters)
-    end
-
-    kernel_parameters = boundary_κ_kernel_parameters(grid, model.closure, arch)
-
-    for parameters in kernel_parameters
-        calculate_diffusivities!(model.diffusivity_fields, model.closure, model; parameters)
+    w_kernel_parameters = boundary_w_kernel_parameters(grid, arch)
+    p_kernel_parameters = boundary_p_kernel_parameters(grid, arch)
+    κ_kernel_parameters = boundary_κ_kernel_parameters(grid, model.closure, arch)
+
+    for (wpar, ppar, κpar) in zip(w_kernel_parameters, p_kernel_parameters, κ_kernel_parameters)
+        compute_w_from_continuity!(model.velocities, arch, grid; parameters = wpar)
+        update_hydrostatic_pressure!(model.pressure.pHY′, arch, grid, model.buoyancy, model.tracers; parameters = ppar)
+        calculate_diffusivities!(model.diffusivity_fields, model.closure, model; parameters = κpar)
     end
 end
 
@@ -114,7 +64,7 @@ function boundary_w_kernel_parameters(grid, arch)
     sizes = ( Sx,  Sy,  Sx,  Sy)
     offs  = (Oᴸx, Oᴸy, Oᴿx, Oᴿy)
         
-    return communicating_boundaries(arch, sizes, offs, grid)
+    return boundary_parameters(sizes, offs, grid, arch)
 end
 
 # p needs computing in the range  0 : 0 and N + 1 : N + 1
@@ -132,7 +82,7 @@ function boundary_p_kernel_parameters(grid, arch)
     sizes = ( Sx,  Sy,  Sx,  Sy)
     offs  = (Oᴸx, Oᴸy, Oᴿx, Oᴿy)
         
-    return communicating_boundaries(arch, sizes, offs, grid)
+    return boundary_parameters(sizes, offs, grid, arch)
 end
 
 # diffusivities need computing in the range 0 : B and N - B : N + 1
@@ -152,31 +102,13 @@ function boundary_κ_kernel_parameters(grid, closure, arch)
     sizes = ( Sx,  Sy,  Sx,  Sy)
     offs  = (Oᴸx, Oᴸy, Oᴿx, Oᴿy)
         
-    return communicating_boundaries(arch, sizes, offs, grid)
-end
-
-# tendencies need computing in the range 1 : H and N - H + 1 : N 
-function boundary_tendency_kernel_parameters(grid, arch)
-    Nx, Ny, Nz = size(grid)
-    Hx, Hy, _  = halo_size(grid)
-    
-    Sx  = (Hx, Ny, Nz)
-    Sy  = (Nx, Hy, Nz)
-         
-    Oᴸx = (0,  0,  0)
-    Oᴸy = (0,  0,  0)
-    Oᴿx = (Nx-Hx, 0,     0)
-    Oᴿy = (0,     Ny-Hy, 0)
-
-    sizes = ( Sx,  Sy,  Sx,  Sy)
-    offs  = (Oᴸx, Oᴸy, Oᴿx, Oᴿy)
-        
-    return communicating_boundaries(arch, sizes, offs, grid)
+    return boundary_parameters(sizes, offs, grid, arch)
 end
 
 using Oceananigans.Operators: XFlatGrid, YFlatGrid
 
-function communicating_boundaries(arch, S, O, grid) 
+# Recompute only on communicating sides 
+function boundary_parameters(S, O, grid, arch) 
     Rx, Ry, _ = arch.ranks
 
     include_x = !isa(grid, XFlatGrid) && (Rx != 1)

From 3d9700f203a3207d087afd67a097c85f824c03ad Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 11 Jun 2023 22:48:45 -0400
Subject: [PATCH 304/530] bugfix all_reduce

---
 src/Distributed/partition_assemble.jl | 6 +++---
 src/Simulations/time_step_wizard.jl   | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/Distributed/partition_assemble.jl b/src/Distributed/partition_assemble.jl
index 0ced672f97..6d8f94706b 100644
--- a/src/Distributed/partition_assemble.jl
+++ b/src/Distributed/partition_assemble.jl
@@ -1,9 +1,9 @@
 using Oceananigans.Architectures: arch_array
 
-all_reduce(val, grid::DistributedGrid; op = +) = 
-    MPI.Allreduce(val, op, grid.architecture.communicator)
+all_reduce(val, arch::DistributedArch; op = +) = 
+    MPI.Allreduce(val, op, arch.communicator)
 
-all_reduce(val, grid; kwargs...) = val
+all_reduce(val, arch; kwargs...) = val
 
 """
     concatenate_local_sizes(n, arch::DistributedArch) 
diff --git a/src/Simulations/time_step_wizard.jl b/src/Simulations/time_step_wizard.jl
index bd50eb1270..e8b3b32db6 100644
--- a/src/Simulations/time_step_wizard.jl
+++ b/src/Simulations/time_step_wizard.jl
@@ -1,5 +1,5 @@
 using Oceananigans: TurbulenceClosures
-using Oceananigans.Grids: prettysummary
+using Oceananigans.Grids: prettysummary, architecture
 
 mutable struct TimeStepWizard{FT, C, D}
                          cfl :: FT
@@ -98,7 +98,7 @@ function new_time_step(old_Δt, wizard, model)
     new_Δt = min(wizard.max_change * old_Δt, new_Δt)
     new_Δt = max(wizard.min_change * old_Δt, new_Δt)
     new_Δt = clamp(new_Δt, wizard.min_Δt, wizard.max_Δt)
-    new_Δt = all_reduce(new_Δt, model.grid; op = min)
+    new_Δt = all_reduce(new_Δt, architecture(model.grid); op = min)
 
     return new_Δt
 end

From ae9aa23f912637bfac0d12405b51f37b40d3ab2a Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 11 Jun 2023 23:06:16 -0400
Subject: [PATCH 305/530] simplified a lot BC passing

---
 src/Distributed/halo_communication.jl         | 29 ++++---------------
 src/Fields/field_boundary_buffers.jl          | 21 ++++++++++++++
 ...ate_hydrostatic_free_surface_tendencies.jl |  2 +-
 3 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 318e58a777..def23e7fd2 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -4,16 +4,7 @@ using CUDA: synchronize
 using CUDA: cuStreamGetFlags, stream, priority_range, CUstream_flags_enum, CuStream, stream!
 
 import Oceananigans.Utils: sync_device!
-using Oceananigans.Fields: fill_west_and_east_send_buffers!,
-                           fill_south_and_north_send_buffers!,
-                           fill_west_send_buffers!,
-                           fill_east_send_buffers!,
-                           fill_south_send_buffers!,
-                           fill_north_send_buffers!,
-                           fill_southwest_send_buffers!,
-                           fill_southeast_send_buffers!,
-                           fill_northwest_send_buffers!,
-                           fill_northeast_send_buffers!,
+using Oceananigans.Fields: fill_send_buffers!,
                            recv_from_buffers!, 
                            reduced_dimensions, 
                            instantiated_location
@@ -125,6 +116,10 @@ function fill_halo_regions!(c::OffsetArray, bcs, indices, loc, grid::Distributed
     arch       = architecture(grid)
     halo_tuple = permute_boundary_conditions(bcs)
     
+    # This has to be synchronized!!
+    fill_send_buffers!(c, buffers, grid)
+    sync_device!(child_architecture(arch))
+
     for task = 1:3
         fill_halo_event!(task, halo_tuple, c, indices, loc, arch, grid, buffers, args...; kwargs...)
     end
@@ -149,9 +144,6 @@ for (side, dir) in zip([:southwest, :southeast, :northwest, :northeast], [1, 2,
             local_rank = arch.local_rank
 
             recv_req = $recv_and_fill_side_halo!(c, grid, arch, loc[$dir], loc, local_rank, corner, buffers)
-            $fill_side_send_buffers!(c, buffers, grid)
-            sync_device!(child_arch)
-
             send_req = $send_side_halo(c, grid, arch, loc[$dir], loc, local_rank, corner, buffers)
             
             return [send_req, recv_req]
@@ -260,11 +252,6 @@ for (side, opposite_side, dir) in zip([:west, :south], [:east, :north], [1, 2])
             recv_req1 = $recv_and_fill_side_halo!(c, grid, arch, loc[$dir], loc, local_rank, bc_side.condition.to, buffers)
             recv_req2 = $recv_and_fill_opposite_side_halo!(c, grid, arch, loc[$dir], loc, local_rank, bc_opposite_side.condition.to, buffers)
 
-            # This has to be synchronized!!
-            $fill_all_send_buffers!(c, buffers, grid)
-
-            sync_device!(child_architecture(arch))
-
             send_req1 = $send_side_halo(c, grid, arch, loc[$dir], loc, local_rank, bc_side.condition.to, buffers)
             send_req2 = $send_opposite_side_halo(c, grid, arch, loc[$dir], loc, local_rank, bc_opposite_side.condition.to, buffers)
 
@@ -280,9 +267,6 @@ for (side, opposite_side, dir) in zip([:west, :south], [:east, :north], [1, 2])
             recv_req = $recv_and_fill_side_halo!(c, grid, arch, loc[$dir], loc, local_rank, bc_side.condition.to, buffers)
 
             $fill_opposite_side_halo!(c, bc_opposite_side, size, offset, loc, arch, grid, buffers, args...; kwargs...)
-            $fill_side_send_buffers!(c, buffers, grid)
-
-            sync_device!(child_arch)
 
             send_req = $send_side_halo(c, grid, arch, loc[$dir], loc, local_rank, bc_side.condition.to, buffers)
             
@@ -298,9 +282,6 @@ for (side, opposite_side, dir) in zip([:west, :south], [:east, :north], [1, 2])
             recv_req = $recv_and_fill_opposite_side_halo!(c, grid, arch, loc[$dir], loc, local_rank, bc_opposite_side.condition.to, buffers)
 
             $fill_side_halo!(c, bc_side, size, offset, loc, arch, grid, buffers, args...; kwargs...)
-            $fill_opposite_side_send_buffers!(c, buffers, grid)
-
-            sync_device!(child_arch)
 
             send_req = $send_opposite_side_halo(c, grid, arch, loc[$dir], loc, local_rank, bc_opposite_side.condition.to, buffers)
 
diff --git a/src/Fields/field_boundary_buffers.jl b/src/Fields/field_boundary_buffers.jl
index 7310e76336..8cce7869a1 100644
--- a/src/Fields/field_boundary_buffers.jl
+++ b/src/Fields/field_boundary_buffers.jl
@@ -101,12 +101,31 @@ Adapt.adapt_structure(to, buff::FieldBoundaryBuffers) =
 fills `buffers.send` from OffsetArray `c` preparing for message passing. If we are on CPU
 we do not need to fill the buffers as the transfer can happen through views
 """
+function fill_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid)
+    Hx, Hy, _ = halo_size(grid)
+    Nx, Ny, _ = size(grid)
+
+     _fill_west_send_buffer!(parent(c), buffers.west, Hx, Hy, Nx, Ny)
+     _fill_east_send_buffer!(parent(c), buffers.east, Hx, Hy, Nx, Ny)
+    _fill_south_send_buffer!(parent(c), buffers.south, Hx, Hy, Nx, Ny)
+    _fill_north_send_buffer!(parent(c), buffers.north, Hx, Hy, Nx, Ny)
+
+    _fill_southwest_send_buffer!(parent(c), buffers.southwest, Hx, Hy, Nx, Ny)
+    _fill_southeast_send_buffer!(parent(c), buffers.southwest, Hx, Hy, Nx, Ny)
+    _fill_northwest_send_buffer!(parent(c), buffers.southwest, Hx, Hy, Nx, Ny)
+    _fill_northeast_send_buffer!(parent(c), buffers.southwest, Hx, Hy, Nx, Ny)
+
+    return nothing
+end
+
 function fill_west_and_east_send_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid)
     Hx, Hy, _ = halo_size(grid)
     Nx, Ny, _ = size(grid)
 
     _fill_west_send_buffer!(parent(c), buffers.west, Hx, Hy, Nx, Ny)
     _fill_east_send_buffer!(parent(c), buffers.east, Hx, Hy, Nx, Ny)
+
+    return nothing
 end
 
 function fill_south_and_north_send_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid)
@@ -115,6 +134,8 @@ function fill_south_and_north_send_buffers!(c::OffsetArray, buffers::FieldBounda
 
     _fill_south_send_buffer!(parent(c), buffers.south, Hx, Hy, Nx, Ny)
     _fill_north_send_buffer!(parent(c), buffers.north, Hx, Hy, Nx, Ny)
+
+    return nothing
 end
 
 fill_west_send_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid) = 
diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index 554f049c5e..9e4c6a7c97 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -23,7 +23,7 @@ contribution from non-hydrostatic pressure.
 """
 function compute_tendencies!(model::HydrostaticFreeSurfaceModel, callbacks)
 
-    kernel_parameters = tuple(interior_tendency_kernel_parameters(grid))
+    kernel_parameters = tuple(interior_tendency_kernel_parameters(model.grid))
 
     # Calculate contributions to momentum and tracer tendencies from fluxes and volume terms in the
     # interior of the domain

From 925a50b9439097ccc43fe4f715b39152d1b4bc6b Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 11 Jun 2023 23:13:00 -0400
Subject: [PATCH 306/530] bugfix

---
 src/Fields/field_boundary_buffers.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Fields/field_boundary_buffers.jl b/src/Fields/field_boundary_buffers.jl
index 8cce7869a1..e43b83f72a 100644
--- a/src/Fields/field_boundary_buffers.jl
+++ b/src/Fields/field_boundary_buffers.jl
@@ -101,7 +101,7 @@ Adapt.adapt_structure(to, buff::FieldBoundaryBuffers) =
 fills `buffers.send` from OffsetArray `c` preparing for message passing. If we are on CPU
 we do not need to fill the buffers as the transfer can happen through views
 """
-function fill_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid)
+function fill_send_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid)
     Hx, Hy, _ = halo_size(grid)
     Nx, Ny, _ = size(grid)
 

From 67a83e67be49a4b49a1d41b174b5a1c77c2d9b7d Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 11 Jun 2023 23:28:42 -0400
Subject: [PATCH 307/530] bugfix

---
 .../calculate_hydrostatic_free_surface_tendencies.jl             | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index 9e4c6a7c97..30775c7aea 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -193,7 +193,6 @@ function calculate_hydrostatic_momentum_tendencies!(model, velocities, kernel_pa
     v_kernel_args = tuple(start_momentum_kernel_args..., v_immersed_bc, end_momentum_kernel_args...)
     
     only_active_cells = use_only_active_interior_cells(grid)
-    kernel_parameters = interior_tendency_kernel_parameters(grid)
     
     for parameters in kernel_parameters
         launch!(arch, grid, parameters,

From 794fafc2d925ad6c6bfe9f9a2b904aa1ff374c50 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 11 Jun 2023 23:44:54 -0400
Subject: [PATCH 308/530] bugfix

---
 .../recompute_boundary_tendencies.jl                      | 8 ++++----
 .../split_explicit_free_surface_kernels.jl                | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
index 1222f29869..2c9a4030b4 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
@@ -85,7 +85,7 @@ function boundary_p_kernel_parameters(grid, arch)
     return boundary_parameters(sizes, offs, grid, arch)
 end
 
-# diffusivities need computing in the range 0 : B and N - B : N + 1
+# diffusivities need computing in the range 0 : B and N - B + 1 : N + 1
 function boundary_κ_kernel_parameters(grid, closure, arch)
     Nx, Ny, Nz = size(grid)
 
@@ -115,11 +115,11 @@ function boundary_parameters(S, O, grid, arch)
     include_y = !isa(grid, YFlatGrid) && (Ry != 1)
 
     if include_x && include_y
-        return tuple(KernelParameters(S[i], O[i]) for i in 1:4)
+        return Tuple(KernelParameters(S[i], O[i]) for i in 1:4)
     elseif include_x && !(include_y)
-        return tuple(KernelParameters(S[i], O[i]) for i in 1:2:3)
+        return Tuple(KernelParameters(S[i], O[i]) for i in 1:2:3)
     elseif !(include_x) && include_y
-        return tuple(KernelParameters(S[i], O[i]) for i in 2:2:4)
+        return Tuple(KernelParameters(S[i], O[i]) for i in 2:2:4)
     else
         return ()
     end
diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
index 339ce5f78d..f38b50be90 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
@@ -160,8 +160,8 @@ end
         advance_previous_velocity!(i, j, 1, timestepper, V, Vᵐ⁻¹, Vᵐ⁻²)
 
         # ∂τ(U) = - ∇η + G
-        U[i, j, 1] +=  Δτ * (- g * Hᶠᶜ[i′, j′] * ∂xᶠᶜᶠ_η(i, j, k_top, grid, TX, η★, timestepper, η, ηᵐ, ηᵐ⁻¹, ηᵐ⁻²) + Gᵁ[i, j, 1])
-        V[i, j, 1] +=  Δτ * (- g * Hᶜᶠ[i′, j′] * ∂yᶜᶠᶠ_η(i, j, k_top, grid, TY, η★, timestepper, η, ηᵐ, ηᵐ⁻¹, ηᵐ⁻²) + Gⱽ[i, j, 1])
+        U[i, j, 1] +=  Δτ * (- g * Hᶠᶜ[i, j] * ∂xᶠᶜᶠ_η(i, j, k_top, grid, TX, η★, timestepper, η, ηᵐ, ηᵐ⁻¹, ηᵐ⁻²) + Gᵁ[i, j, 1])
+        V[i, j, 1] +=  Δτ * (- g * Hᶜᶠ[i, j] * ∂yᶜᶠᶠ_η(i, j, k_top, grid, TY, η★, timestepper, η, ηᵐ, ηᵐ⁻¹, ηᵐ⁻²) + Gⱽ[i, j, 1])
                           
         # time-averaging
         η̅[i, j, k_top] +=  averaging_weight * η[i, j, k_top]

From 7f057c4cdf8a48ff8cbea4079ed9e75c9f4d3ba3 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 12 Jun 2023 00:10:06 -0400
Subject: [PATCH 309/530] avoid computing prescribed fields

---
 .../prescribed_hydrostatic_velocity_fields.jl             | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl b/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl
index b3ae3cc938..01137bceb9 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl
@@ -76,18 +76,18 @@ end
 
 ab2_step_velocities!(::PrescribedVelocityFields, args...) = nothing
 ab2_step_free_surface!(::Nothing, model, Δt, χ) = nothing 
-compute_w_from_continuity!(::PrescribedVelocityFields, args...) = nothing
+compute_w_from_continuity!(::PrescribedVelocityFields, args...; kwargs...) = nothing
 
 validate_velocity_boundary_conditions(grid, ::PrescribedVelocityFields) = nothing
 extract_boundary_conditions(::PrescribedVelocityFields) = NamedTuple()
 
 FreeSurfaceDisplacementField(::PrescribedVelocityFields, ::Nothing, grid) = nothing
 HorizontalVelocityFields(::PrescribedVelocityFields, grid) = nothing, nothing
-FreeSurface(free_surface::ExplicitFreeSurface{Nothing}, ::PrescribedVelocityFields, grid) = nothing
-FreeSurface(free_surface::ImplicitFreeSurface{Nothing}, ::PrescribedVelocityFields, grid) = nothing
+FreeSurface(::ExplicitFreeSurface{Nothing}, ::PrescribedVelocityFields, grid) = nothing
+FreeSurface(::ImplicitFreeSurface{Nothing}, ::PrescribedVelocityFields, grid) = nothing
 
 hydrostatic_prognostic_fields(::PrescribedVelocityFields, ::Nothing, tracers) = tracers
-calculate_hydrostatic_momentum_tendencies!(model, ::PrescribedVelocityFields; kwargs...) = []
+calculate_hydrostatic_momentum_tendencies!(model, ::PrescribedVelocityFields, kernel_parameters) = nothing
 
 apply_flux_bcs!(::Nothing, c, arch, clock, model_fields) = nothing
 

From 737bda4c4b0bcfa793a775add86aaa0c0fab21dd Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 12 Jun 2023 00:14:35 -0400
Subject: [PATCH 310/530] finished?

---
 .../hydrostatic_free_surface_model.jl                           | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl
index 474d5a0abc..7f41a1ba6c 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl
@@ -199,7 +199,7 @@ function HydrostaticFreeSurfaceModel(; grid,
                                         free_surface, forcing, closure, particles, biogeochemistry, velocities, tracers,
                                         pressure, diffusivity_fields, timestepper, auxiliary_fields)
 
-    # update_state!(model)
+    update_state!(model)
 
     return model
 end

From 92e8d0cfd5f616780e29db6adde141311b192f8e Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 12 Jun 2023 00:59:51 -0400
Subject: [PATCH 311/530] only underlying_halo remaining

---
 .../recompute_boundary_tendencies.jl          | 44 +++++++++----------
 src/TurbulenceClosures/closure_tuples.jl      | 12 +++++
 2 files changed, 34 insertions(+), 22 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
index 2c9a4030b4..cf3a5960f9 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
@@ -26,11 +26,11 @@ function boundary_tendency_kernel_parameters(grid, arch)
     Sy  = (Nx, Hy, Nz)
          
     Oᴸ  = (0,  0,  0)
-    Oᴿx = (Nx-Hx, 0,     0)
-    Oᴿy = (0,     Ny-Hy, 0)
+    Oxᴿ = (Nx-Hx, 0,     0)
+    Oyᴿ = (0,     Ny-Hy, 0)
 
-    sizes = (Sx, Sy,  Sx,  Sy)
-    offs  = (Oᴸ, Oᴸ, Oᴿx, Oᴿy)
+    sizes = (Sx, Sy, Sx,  Sy)
+    offs  = (Oᴸ, Oᴸ, Oxᴿ, Oyᴿ)
         
     return boundary_parameters(sizes, offs, grid, arch)
 end
@@ -56,13 +56,13 @@ function boundary_w_kernel_parameters(grid, arch)
     Sx  = (Hx, Ny)
     Sy  = (Nx, Hy)
              
-    Oᴸx = (-Hx+1, 0)
-    Oᴸy = (0, -Hy+1)
-    Oᴿx = (Nx-1,  0)
-    Oᴿy = (0,  Ny-1)
+    Oxᴸ = (-Hx+1, 0)
+    Oyᴸ = (0, -Hy+1)
+    Oxᴿ = (Nx-1,  0)
+    Oyᴿ = (0,  Ny-1)
 
-    sizes = ( Sx,  Sy,  Sx,  Sy)
-    offs  = (Oᴸx, Oᴸy, Oᴿx, Oᴿy)
+    sizes = (Sx,  Sy,  Sx,  Sy)
+    offs  = (Oxᴸ, Oyᴸ, Oxᴿ, Oyᴿ)
         
     return boundary_parameters(sizes, offs, grid, arch)
 end
@@ -74,13 +74,13 @@ function boundary_p_kernel_parameters(grid, arch)
     Sx  = (1, Ny)
     Sy  = (Nx, 1)
              
-    Oᴸx = (-1, 0)
-    Oᴸy = (0, -1)
-    Oᴿx = (Nx, 0)
-    Oᴿy = (0, Ny)
+    Oxᴸ = (-1, 0)
+    Oyᴸ = (0, -1)
+    Oxᴿ = (Nx, 0)
+    Oyᴿ = (0, Ny)
 
-    sizes = ( Sx,  Sy,  Sx,  Sy)
-    offs  = (Oᴸx, Oᴸy, Oᴿx, Oᴿy)
+    sizes = (Sx,  Sy,  Sx,  Sy)
+    offs  = (Oxᴸ, Oyᴸ, Oxᴿ, Oyᴿ)
         
     return boundary_parameters(sizes, offs, grid, arch)
 end
@@ -94,13 +94,13 @@ function boundary_κ_kernel_parameters(grid, closure, arch)
     Sx  = (B+1, Ny, Nz)
     Sy  = (Nx, B+1, Nz)
         
-    Oᴸx = (-1, 0, 0)
-    Oᴸy = (0, -1, 0)
-    Oᴿx = (Nx-B,  0, 0)
-    Oᴿy = (0,  Ny-B, 0)
+    Oxᴸ = (-1, 0, 0)
+    Oyᴸ = (0, -1, 0)
+    Oxᴿ = (Nx-B,  0, 0)
+    Oyᴿ = (0,  Ny-B, 0)
 
-    sizes = ( Sx,  Sy,  Sx,  Sy)
-    offs  = (Oᴸx, Oᴸy, Oᴿx, Oᴿy)
+    sizes = (Sx,  Sy,  Sx,  Sy)
+    offs  = (Oxᴸ, Oyᴸ, Oxᴿ, Oyᴿ)
         
     return boundary_parameters(sizes, offs, grid, arch)
 end
diff --git a/src/TurbulenceClosures/closure_tuples.jl b/src/TurbulenceClosures/closure_tuples.jl
index 914b64d216..8c04f980d7 100644
--- a/src/TurbulenceClosures/closure_tuples.jl
+++ b/src/TurbulenceClosures/closure_tuples.jl
@@ -86,6 +86,18 @@ function add_closure_specific_boundary_conditions(closure_tuple::Tuple, bcs, arg
     return bcs
 end
 
+@inline κ_kernel_size(grid, closure::AbstractArray) = κ_kernel_size(grid, closure[1])
+@inline κ_kernel_offsets(grid, closure::AbstractArray) = κ_kernel_offsets(grid, closure[1])
+
+@inline function κ_kernel_offsets(grid, closure_tuple::Tuple)
+    kernel_offsets = (0, 0, 0)
+    for closure in closure_tuple
+        kernel_offsets = max.(kernel_offsets, κ_kernel_offsets(grid, closure))
+    end
+
+    return kernel_offsets
+end
+
 @inline function κ_kernel_size(grid, closure_tuple::Tuple)
     kernel_size = (0, 0, 0)
     for closure in closure_tuple

From d03050be8736a89cf09fdc0d21a1030a55eeeccb Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 12 Jun 2023 02:43:08 -0400
Subject: [PATCH 312/530] fixed

---
 .../split_explicit_free_surface.jl                         | 7 +++----
 test/test_distributed_models.jl                            | 2 +-
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl
index e7c043e7df..68068d0f2b 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl
@@ -206,10 +206,9 @@ function SplitExplicitAuxiliaryFields(grid::AbstractGrid)
 
     fill_halo_regions!((Hᶠᶜ, Hᶜᶠ, Hᶜᶜ))
 
-    kernel_size    = :xy
-    kernel_offsets = (0, 0)
-
-    return SplitExplicitAuxiliaryFields(Gᵁ, Gⱽ, Hᶠᶜ, Hᶜᶠ, Hᶜᶜ, kernel_size, kernel_offsets)
+    kernel_parameters = :xy
+    
+    return SplitExplicitAuxiliaryFields(Gᵁ, Gⱽ, Hᶠᶜ, Hᶜᶠ, Hᶜᶜ, kernel_parameters)
 end
 
 """
diff --git a/test/test_distributed_models.jl b/test/test_distributed_models.jl
index 221e66e047..0f98892f67 100644
--- a/test/test_distributed_models.jl
+++ b/test/test_distributed_models.jl
@@ -531,7 +531,7 @@ end
     @testset "Time stepping ShallowWaterModel" begin
         for child_arch in archs
             topo = (Periodic, Periodic, Flat)
-            use_buffers = child_arch isa GPU ? true : false
+            use_buffers = true
             arch = DistributedArch(child_arch; ranks=(1, 4, 1), topology = topo, use_buffers, devices = (0, 0, 0, 0))
             grid = RectilinearGrid(arch, topology=topo, size=(8, 2), extent=(1, 2), halo=(3, 3))
             model = ShallowWaterModel(; momentum_advection=nothing, mass_advection=nothing, tracer_advection=nothing, grid, gravitational_acceleration=1)

From ff12b026569d97eef2175f3f4aff088680dc3e97 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 12 Jun 2023 09:10:49 -0400
Subject: [PATCH 313/530] fixed MPI tag

---
 src/Distributed/distributed_grids.jl        | 16 ++--------------
 src/Distributed/halo_communication.jl       | 13 ++-----------
 src/Distributed/interleave_comm_and_comp.jl |  2 +-
 test/test_distributed_models.jl             |  8 ++++----
 4 files changed, 9 insertions(+), 30 deletions(-)

diff --git a/src/Distributed/distributed_grids.jl b/src/Distributed/distributed_grids.jl
index df10a9a42b..2dab01305f 100644
--- a/src/Distributed/distributed_grids.jl
+++ b/src/Distributed/distributed_grids.jl
@@ -59,13 +59,7 @@ function RectilinearGrid(arch::DistributedArch,
     Ly, yᵃᶠᵃ, yᵃᶜᵃ, Δyᵃᶠᵃ, Δyᵃᶜᵃ = generate_coordinate(FT, topology[2](), ny, Hy, yl, child_architecture(arch))
     Lz, zᵃᵃᶠ, zᵃᵃᶜ, Δzᵃᵃᶠ, Δzᵃᵃᶜ = generate_coordinate(FT, topology[3](), nz, Hz, zl, child_architecture(arch))
 
-    architecture = DistributedArch(child_architecture(arch), 
-                                   topology = topology, 
-                                   ranks = arch.ranks, 
-                                   communicator = arch.communicator,
-                                   use_buffers = using_buffered_communication(arch))
-
-    return RectilinearGrid{TX, TY, TZ}(architecture,
+    return RectilinearGrid{TX, TY, TZ}(arch,
                                        nx, ny, nz,
                                        Hx, Hy, Hz,
                                        Lx, Ly, Lz,
@@ -122,13 +116,7 @@ function LatitudeLongitudeGrid(arch::DistributedArch,
     # when constructing the metrics!
     Lφ, φᵃᶠᵃ, φᵃᶜᵃ, Δφᵃᶠᵃ, Δφᵃᶜᵃ = generate_coordinate(FT, Bounded(), nφ, Hφ, φl, arch.child_architecture)
 
-    architecture = DistributedArch(child_architecture(arch); 
-                                   topology = topology,
-                                   ranks = arch.ranks, 
-                                   communicator = arch.communicator,
-                                   use_buffers = using_buffered_communication(arch))
-
-    preliminary_grid = LatitudeLongitudeGrid{TX, TY, TZ}(architecture,
+    preliminary_grid = LatitudeLongitudeGrid{TX, TY, TZ}(arch,
                                                          nλ, nφ, nz,
                                                          Hλ, Hφ, Hz,
                                                          Lλ, Lφ, Lz,
diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index def23e7fd2..d7f5441634 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -58,29 +58,20 @@ opposite_side = Dict(
 RANK_DIGITS = 2
 ID_DIGITS   = 1
 
-# REMEMBER!!! This won't work for tracers!!! (It assumes you are passing maximum 4 at a time)
-@inline loc_id(::Nothing, tag) = tag + 5
-@inline loc_id(::Face,    tag) = tag
-@inline loc_id(::Center,  tag) = tag
-@inline location_id(X, Y, Z, tag) = loc_id(Z, tag)
-
 for side in sides
     side_str = string(side)
     send_tag_fn_name = Symbol("$(side)_send_tag")
     recv_tag_fn_name = Symbol("$(side)_recv_tag")
     @eval begin
-        # REMEMBER, we need to reset the tag not more than once every four passes!!
         function $send_tag_fn_name(arch, location, local_rank, rank_to_send_to)
-            side_digit  = side_id[Symbol($side_str)]
-            field_id    = string(location_id(location..., arch.mpi_tag[1]) + side_digit, pad=ID_DIGITS)
+            field_id    = string(arch.mpi_tag[1], pad=ID_DIGITS)
             from_digits = string(local_rank, pad=RANK_DIGITS)
             to_digits   = string(rank_to_send_to, pad=RANK_DIGITS)
             return parse(Int, field_id * from_digits * to_digits)
         end
 
         function $recv_tag_fn_name(arch, location, local_rank, rank_to_recv_from)
-            side_digit  = side_id[opposite_side[Symbol($side_str)]]
-            field_id    = string(location_id(location..., arch.mpi_tag[1]) + side_digit, pad=ID_DIGITS)
+            field_id    = string(arch.mpi_tag[1], pad=ID_DIGITS)
             from_digits = string(rank_to_recv_from, pad=RANK_DIGITS)
             to_digits   = string(local_rank, pad=RANK_DIGITS)
             return parse(Int, field_id * from_digits * to_digits)
diff --git a/src/Distributed/interleave_comm_and_comp.jl b/src/Distributed/interleave_comm_and_comp.jl
index 16c929213f..6d3c6f1efc 100644
--- a/src/Distributed/interleave_comm_and_comp.jl
+++ b/src/Distributed/interleave_comm_and_comp.jl
@@ -9,7 +9,7 @@ function complete_communication_and_compute_boundary!(model, ::DistributedGrid,
         complete_halo_communication!(field)
     end
 
-    # HERE we have to put fill_eventual_halo_corners
+    # Recompute tendencies near the boundary halos
     compute_boundary_tendencies!(model)
 
     return nothing
diff --git a/test/test_distributed_models.jl b/test/test_distributed_models.jl
index 0f98892f67..619aba612a 100644
--- a/test/test_distributed_models.jl
+++ b/test/test_distributed_models.jl
@@ -360,7 +360,7 @@ end
 
 function test_triply_periodic_halo_communication_with_411_ranks(halo, child_arch)
     topo = (Periodic, Periodic, Periodic)
-    for use_buffers in (true , false)
+    for use_buffers in (true, )
         arch = DistributedArch(child_arch; ranks=(4, 1, 1), use_buffers, devices = (0, 0, 0, 0))
         grid = RectilinearGrid(arch, topology=topo, size=(4, 4, 4), extent=(1, 2, 3), halo=halo)
         model = NonhydrostaticModel(grid=grid)
@@ -386,7 +386,7 @@ end
 
 function test_triply_periodic_halo_communication_with_141_ranks(halo, child_arch)
     topo  = (Periodic, Periodic, Periodic)
-    for use_buffers in (true , false)
+    for use_buffers in (true, )
         arch = DistributedArch(child_arch; ranks=(1, 4, 1), use_buffers, devices = (0, 0, 0, 0))
         grid  = RectilinearGrid(arch, topology=topo, size=(4, 4, 4), extent=(1, 2, 3), halo=halo)
         model = NonhydrostaticModel(grid=grid)
@@ -410,7 +410,7 @@ end
 
 function test_triply_periodic_halo_communication_with_114_ranks(halo, child_arch)
     topo = (Periodic, Periodic, Periodic)
-    for use_buffers in (true , false)
+    for use_buffers in (true, )
         arch = DistributedArch(child_arch; ranks=(1, 1, 4), use_buffers, devices = (0, 0, 0, 0))
         grid = RectilinearGrid(arch, topology=topo, size=(4, 4, 4), extent=(1, 2, 3), halo=halo)
         model = NonhydrostaticModel(grid=grid)
@@ -435,7 +435,7 @@ end
 
 function test_triply_periodic_halo_communication_with_221_ranks(halo, child_arch)
     topo = (Periodic, Periodic, Periodic)
-    for use_buffers in (true , false)
+    for use_buffers in (true, )
         arch = DistributedArch(child_arch; ranks=(2, 2, 1), use_buffers, devices = (0, 0, 0, 0))
         grid = RectilinearGrid(arch, topology=topo, size=(8, 8, 3), extent=(1, 2, 3), halo=halo)
         model = NonhydrostaticModel(grid=grid)

From eb6f5e60ceb1dba7a6040c22ad0c98b1d2eb03e4 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 12 Jun 2023 09:12:02 -0400
Subject: [PATCH 314/530] comment

---
 src/Distributed/halo_communication.jl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index d7f5441634..6d23fdf63c 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -116,6 +116,8 @@ function fill_halo_regions!(c::OffsetArray, bcs, indices, loc, grid::Distributed
     end
     
     fill_corners!(arch.connectivity, c, indices, loc, arch, grid, buffers, args...; kwargs...)
+
+    # Switch to the next field to send
     arch.mpi_tag[1] += 1
 
     return nothing

From 80fdc83d7ad5d0c71090ec31cdfba81bba2f8651 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 12 Jun 2023 09:26:20 -0400
Subject: [PATCH 315/530] fixed tag problems

---
 src/Distributed/halo_communication.jl | 34 +++++++++++++++------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 6d23fdf63c..6b5faaa431 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -55,26 +55,30 @@ opposite_side = Dict(
 #   digits 5-6: the "from" rank
 #   digits 7-8: the "to" rank
 
-RANK_DIGITS = 2
-ID_DIGITS   = 1
+ID_DIGITS   = 2
+
+@inline loc_id(::Face)    = 0
+@inline loc_id(::Center)  = 1
+@inline loc_id(::Nothing) = 2
+@inline loc_id(LX, LY, LZ) = loc_id(LZ)
 
 for side in sides
     side_str = string(side)
     send_tag_fn_name = Symbol("$(side)_send_tag")
     recv_tag_fn_name = Symbol("$(side)_recv_tag")
     @eval begin
-        function $send_tag_fn_name(arch, location, local_rank, rank_to_send_to)
-            field_id    = string(arch.mpi_tag[1], pad=ID_DIGITS)
-            from_digits = string(local_rank, pad=RANK_DIGITS)
-            to_digits   = string(rank_to_send_to, pad=RANK_DIGITS)
-            return parse(Int, field_id * from_digits * to_digits)
+        function $send_tag_fn_name(arch, location)
+            field_id   = string(arch.mpi_tag[1], pad=ID_DIGITS)
+            loc_digit  = string(loc_id(location...)) 
+            side_digit = string(side_id[Symbol($side_str)])
+            return parse(Int, field_id * loc_digit * side_digit)
         end
 
-        function $recv_tag_fn_name(arch, location, local_rank, rank_to_recv_from)
-            field_id    = string(arch.mpi_tag[1], pad=ID_DIGITS)
-            from_digits = string(rank_to_recv_from, pad=RANK_DIGITS)
-            to_digits   = string(local_rank, pad=RANK_DIGITS)
-            return parse(Int, field_id * from_digits * to_digits)
+        function $recv_tag_fn_name(arch, location)
+            field_id   = string(arch.mpi_tag[1], pad=ID_DIGITS)
+            loc_digit  = string(loc_id(location...)) 
+            side_digit = string(side_id[opposite_side[Symbol($side_str)]])
+            return parse(Int, field_id * loc_digit * side)
         end
     end
 end
@@ -116,7 +120,7 @@ function fill_halo_regions!(c::OffsetArray, bcs, indices, loc, grid::Distributed
     end
     
     fill_corners!(arch.connectivity, c, indices, loc, arch, grid, buffers, args...; kwargs...)
-
+    
     # Switch to the next field to send
     arch.mpi_tag[1] += 1
 
@@ -297,7 +301,7 @@ for side in sides
     @eval begin
         function $send_side_halo(c, grid, arch, side_location, location, local_rank, rank_to_send_to, buffers)
             send_buffer = $get_side_send_buffer(c, grid, side_location, buffers, arch)
-            send_tag = $side_send_tag(arch, location, local_rank, rank_to_send_to)
+            send_tag = $side_send_tag(arch, location)
 
             @debug "Sending " * $side_str * " halo: local_rank=$local_rank, rank_to_send_to=$rank_to_send_to, send_tag=$send_tag"
             
@@ -325,7 +329,7 @@ for side in sides
     @eval begin
         function $recv_and_fill_side_halo!(c, grid, arch, side_location, location, local_rank, rank_to_recv_from, buffers)
             recv_buffer = $get_side_recv_buffer(c, grid, side_location, buffers, arch)
-            recv_tag = $side_recv_tag(arch, location, local_rank, rank_to_recv_from)
+            recv_tag = $side_recv_tag(arch, location)
 
             @debug "Receiving " * $side_str * " halo: local_rank=$local_rank, rank_to_recv_from=$rank_to_recv_from, recv_tag=$recv_tag"
             recv_req = MPI.Irecv!(recv_buffer, rank_to_recv_from, recv_tag, arch.communicator)

From 7cd4b44514413fa55816f331095d094965376e07 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 12 Jun 2023 09:26:36 -0400
Subject: [PATCH 316/530] bugfix

---
 src/Distributed/halo_communication.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 6b5faaa431..78bbe719b7 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -78,7 +78,7 @@ for side in sides
             field_id   = string(arch.mpi_tag[1], pad=ID_DIGITS)
             loc_digit  = string(loc_id(location...)) 
             side_digit = string(side_id[opposite_side[Symbol($side_str)]])
-            return parse(Int, field_id * loc_digit * side)
+            return parse(Int, field_id * loc_digit * side_digit)
         end
     end
 end

From d2ad49c2249211bea49f628030a4e12ba9ce31b2 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Tue, 13 Jun 2023 09:31:42 -0400
Subject: [PATCH 317/530] Update scalar_biharmonic_diffusivity.jl

---
 .../scalar_biharmonic_diffusivity.jl                  | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/scalar_biharmonic_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/scalar_biharmonic_diffusivity.jl
index 118de01369..bbb5fae63e 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/scalar_biharmonic_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/scalar_biharmonic_diffusivity.jl
@@ -9,15 +9,8 @@ Holds viscosity and diffusivities for models with prescribed isotropic diffusivi
 struct ScalarBiharmonicDiffusivity{F, V, K, N} <: AbstractScalarBiharmonicDiffusivity{F, N}
     ν :: V
     κ :: K
-
-<<<<<<< HEAD
-    function ScalarBiharmonicDiffusivity{F, N}(ν::V, κ::K) where {F, V, K, N}
-        return new{F, V, K, N}(ν, κ)
-    end
-=======
-    ScalarBiharmonicDiffusivity{F}(ν::N, κ::K) where {F, N, K} =
-        new{F, N, K}(ν, κ)
->>>>>>> main
+    ScalarBiharmonicDiffusivity{F, N}(ν::V, κ::K) where {F, V, K, N} =
+        new{F, V, K, N}(ν, κ)
 end
 
 # Aliases that allow specify the floating type, assuming that the discretization is Explicit in time

From 165e15ee7dc3fb011222de40f445250b42a6d1b1 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Wed, 14 Jun 2023 08:08:45 -0400
Subject: [PATCH 318/530] Update src/Distributed/multi_architectures.jl

Co-authored-by: Navid C. Constantinou <navidcy@users.noreply.github.com>
---
 src/Distributed/multi_architectures.jl | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/Distributed/multi_architectures.jl b/src/Distributed/multi_architectures.jl
index 9762611dd2..8b18fcf2ca 100644
--- a/src/Distributed/multi_architectures.jl
+++ b/src/Distributed/multi_architectures.jl
@@ -60,12 +60,12 @@ Keyword arguments
                   if not for testing or developing. Change at your own risk!
 """
 function DistributedArch(child_architecture = CPU(); 
-                   topology = (Periodic, Periodic, Periodic), 
-                   ranks,
-                   use_buffers = true,
-                   devices = nothing, 
-                   enable_overlapped_computation = true,
-                   communicator = MPI.COMM_WORLD)
+                         topology = (Periodic, Periodic, Periodic), 
+                         ranks,
+                         use_buffers = true,
+                         devices = nothing, 
+                         enable_overlapped_computation = true,
+                         communicator = MPI.COMM_WORLD)
 
     MPI.Initialized() || error("Must call MPI.Init() before constructing a MultiCPU.")
 

From 88569a11eca197a8bdc35ece1cb6b1bc789594d6 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Wed, 14 Jun 2023 08:10:23 -0400
Subject: [PATCH 319/530] Update src/Distributed/partition_assemble.jl

Co-authored-by: Navid C. Constantinou <navidcy@users.noreply.github.com>
---
 src/Distributed/partition_assemble.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Distributed/partition_assemble.jl b/src/Distributed/partition_assemble.jl
index 6d8f94706b..21d36d0e56 100644
--- a/src/Distributed/partition_assemble.jl
+++ b/src/Distributed/partition_assemble.jl
@@ -8,8 +8,8 @@ all_reduce(val, arch; kwargs...) = val
 """
     concatenate_local_sizes(n, arch::DistributedArch) 
 
-returns a 3-Tuple containing a vector of `size(grid, idx)` for each rank in 
-all 3 directions
+Return a 3-Tuple containing a vector of `size(grid, idx)` for each rank in 
+all 3 directions.
 """
 concatenate_local_sizes(n, arch::DistributedArch) = 
     Tuple(concatenate_local_sizes(n, arch, i) for i in 1:length(n))

From 89603f2a38e036e6b0f39fc133c8f3b83c289b51 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Wed, 14 Jun 2023 08:10:36 -0400
Subject: [PATCH 320/530] Update src/ImmersedBoundaries/ImmersedBoundaries.jl

Co-authored-by: Navid C. Constantinou <navidcy@users.noreply.github.com>
---
 src/ImmersedBoundaries/ImmersedBoundaries.jl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/ImmersedBoundaries/ImmersedBoundaries.jl b/src/ImmersedBoundaries/ImmersedBoundaries.jl
index a845137820..f886db3b70 100644
--- a/src/ImmersedBoundaries/ImmersedBoundaries.jl
+++ b/src/ImmersedBoundaries/ImmersedBoundaries.jl
@@ -112,7 +112,6 @@ struct ImmersedBoundaryGrid{FT, TX, TY, TZ, G, I, M, Arch} <: AbstractGrid{FT, T
         FT = eltype(grid)
         arch = architecture(grid)
         Arch = typeof(arch)
-        
         return new{FT, TX, TY, TZ, G, I, M, Arch}(arch, grid, ib, mi)
     end
 end

From 10b37da1cb1714d4078b1cb4ed5a841c199c0761 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Wed, 14 Jun 2023 08:10:45 -0400
Subject: [PATCH 321/530] Update src/ImmersedBoundaries/active_cells_map.jl

Co-authored-by: Navid C. Constantinou <navidcy@users.noreply.github.com>
---
 src/ImmersedBoundaries/active_cells_map.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ImmersedBoundaries/active_cells_map.jl b/src/ImmersedBoundaries/active_cells_map.jl
index 7eefc0221a..77ce308117 100644
--- a/src/ImmersedBoundaries/active_cells_map.jl
+++ b/src/ImmersedBoundaries/active_cells_map.jl
@@ -121,4 +121,4 @@ function active_cells_map_surface(ibg)
     smaller_indices = getproperty.(full_indices, Ref(:I)) .|> Tuple{IntType, IntType}
     
     return smaller_indices
-end
\ No newline at end of file
+end

From 3c68709075dbb90f7a4a2f1d88d16b8416a68839 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Wed, 14 Jun 2023 09:40:19 -0400
Subject: [PATCH 322/530] Update src/Distributed/interleave_comm_and_comp.jl

Co-authored-by: Navid C. Constantinou <navidcy@users.noreply.github.com>
---
 src/Distributed/interleave_comm_and_comp.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Distributed/interleave_comm_and_comp.jl b/src/Distributed/interleave_comm_and_comp.jl
index 6d3c6f1efc..07426b79ed 100644
--- a/src/Distributed/interleave_comm_and_comp.jl
+++ b/src/Distributed/interleave_comm_and_comp.jl
@@ -60,4 +60,4 @@ function complete_halo_communication!(field)
     recv_from_buffers!(field.data, field.boundary_buffers, field.grid)
     
     return nothing
-end
\ No newline at end of file
+end

From 99d7f9890b44e5f32c3d080dd634702de3cb795a Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Wed, 14 Jun 2023 08:57:38 -0600
Subject: [PATCH 323/530] Clean up batched tridiagonal solver and vertically
 implicit solver

---
 src/Solvers/batched_tridiagonal_solver.jl     | 40 +++++++++----------
 .../vertically_implicit_diffusion_solver.jl   |  9 +++--
 2 files changed, 25 insertions(+), 24 deletions(-)

diff --git a/src/Solvers/batched_tridiagonal_solver.jl b/src/Solvers/batched_tridiagonal_solver.jl
index 646f642aef..f4cbaebe4a 100644
--- a/src/Solvers/batched_tridiagonal_solver.jl
+++ b/src/Solvers/batched_tridiagonal_solver.jl
@@ -60,7 +60,7 @@ other implementations where, e.g., `aⁱʲ²` may appear at the second row, inst
 
 2. A 3D array means, e.g., that `aⁱʲᵏ = a[i, j, k]`.
 
-Other coefficient types can be used by extending `get_coefficient`.
+Other coefficient types can be implemented by extending `get_coefficient`.
 """
 function BatchedTridiagonalSolver(grid;
                                   lower_diagonal,
@@ -113,10 +113,10 @@ function solve!(ϕ, solver::BatchedTridiagonalSolver, rhs, args...)
     return nothing
 end
 
-@inline get_coefficient(a::AbstractArray{T, 1}, i, j, k, grid, p, ::XDirection,   args...) where {T} = @inbounds a[i]
-@inline get_coefficient(a::AbstractArray{T, 1}, i, j, k, grid, p, ::YDirection,   args...) where {T} = @inbounds a[j]
-@inline get_coefficient(a::AbstractArray{T, 1}, i, j, k, grid, p, ::ZDirection,   args...) where {T} = @inbounds a[k]
-@inline get_coefficient(a::AbstractArray{T, 3}, i, j, k, grid, p, tridiag_dir, args...) where {T} = @inbounds a[i, j, k]
+@inline get_coefficient(i, j, k, grid, a::AbstractArray{<:Any, 1}, p, ::XDirection,          args...) = @inbounds a[i]
+@inline get_coefficient(i, j, k, grid, a::AbstractArray{<:Any, 1}, p, ::YDirection,          args...) = @inbounds a[j]
+@inline get_coefficient(i, j, k, grid, a::AbstractArray{<:Any, 1}, p, ::ZDirection,          args...) = @inbounds a[k]
+@inline get_coefficient(i, j, k, grid, a::AbstractArray{<:Any, 3}, p, tridiagonal_direction, args...) = @inbounds a[i, j, k]
 
 @inline float_eltype(ϕ::AbstractArray{T}) where T <: AbstractFloat = T
 @inline float_eltype(ϕ::AbstractArray{<:Complex{T}}) where T <: AbstractFloat = T
@@ -126,8 +126,8 @@ end
     j, k = @index(Global, NTuple)
 
     @inbounds begin
-        β  = get_coefficient(b, 1, j, k, grid, p, tridiagonal_direction, args...)
-        f₁ = get_coefficient(f, 1, j, k, grid, p, tridiagonal_direction, args...)
+        β  = get_coefficient(1, j, k, grid, b, p, tridiagonal_direction, args...)
+        f₁ = get_coefficient(1, j, k, grid, f, p, tridiagonal_direction, args...)
         ϕ[1, j, k] = f₁ / β
 
         @unroll for i = 2:Nx
@@ -138,7 +138,7 @@ end
             t[i, j, k] = cᵏ⁻¹ / β
             β = bᵏ - aᵏ⁻¹ * t[i, j, k]
 
-            fᵏ = get_coefficient(f, i, j, k, grid, p, tridiagonal_direction, args...)
+            fᵏ = get_coefficient(i, j, k, grid, f, p, tridiagonal_direction, args...)
 
             # If the problem is not diagonally-dominant such that `β ≈ 0`,
             # the algorithm is unstable and we elide the forward pass update of ϕ.
@@ -158,19 +158,19 @@ end
     i, k = @index(Global, NTuple)
 
     @inbounds begin
-        β  = get_coefficient(b, i, 1, k, grid, p, tridiagonal_direction, args...)
-        f₁ = get_coefficient(f, i, 1, k, grid, p, tridiagonal_direction, args...)
+        β  = get_coefficient(i, 1, k, grid, b, p, tridiagonal_direction, args...)
+        f₁ = get_coefficient(i, 1, k, grid, f, p, tridiagonal_direction, args...)
         ϕ[i, 1, k] = f₁ / β
 
         @unroll for j = 2:Ny
-            cᵏ⁻¹ = get_coefficient(c, i, j-1, k, grid, p, tridiagonal_direction, args...)
-            bᵏ   = get_coefficient(b, i, j,   k, grid, p, tridiagonal_direction, args...)
-            aᵏ⁻¹ = get_coefficient(a, i, j-1, k, grid, p, tridiagonal_direction, args...)
+            cᵏ⁻¹ = get_coefficient(i, j-1, k, grid, c, p, tridiagonal_direction, args...)
+            bᵏ   = get_coefficient(i, j,   k, grid, b, p, tridiagonal_direction, args...)
+            aᵏ⁻¹ = get_coefficient(i, j-1, k, grid, a, p, tridiagonal_direction, args...)
 
             t[i, j, k] = cᵏ⁻¹ / β
             β = bᵏ - aᵏ⁻¹ * t[i, j, k]
 
-            fᵏ = get_coefficient(f, i, j, k, grid, p, tridiagonal_direction, args...)
+            fᵏ = get_coefficient(i, j, k, grid, f, p, tridiagonal_direction, args...)
 
             # If the problem is not diagonally-dominant such that `β ≈ 0`,
             # the algorithm is unstable and we elide the forward pass update of ϕ.
@@ -190,18 +190,18 @@ end
     i, j = @index(Global, NTuple)
 
     @inbounds begin
-        β  = get_coefficient(b, i, j, 1, grid, p, tridiagonal_direction, args...)
-        f₁ = get_coefficient(f, i, j, 1, grid, p, tridiagonal_direction, args...)
+        β  = get_coefficient(i, j, 1, grid, b, p, tridiagonal_direction, args...)
+        f₁ = get_coefficient(i, j, 1, f, grid, p, tridiagonal_direction, args...)
         ϕ[i, j, 1] = f₁ / β
 
         @unroll for k = 2:Nz
-            cᵏ⁻¹ = get_coefficient(c, i, j, k-1, grid, p, tridiagonal_direction, args...)
-            bᵏ   = get_coefficient(b, i, j, k,   grid, p, tridiagonal_direction, args...)
-            aᵏ⁻¹ = get_coefficient(a, i, j, k-1, grid, p, tridiagonal_direction, args...)
+            cᵏ⁻¹ = get_coefficient(i, j, k-1, grid, c, p, tridiagonal_direction, args...)
+            bᵏ   = get_coefficient(i, j, k,   grid, b, p, tridiagonal_direction, args...)
+            aᵏ⁻¹ = get_coefficient(i, j, k-1, grid, a, p, tridiagonal_direction, args...)
 
             t[i, j, k] = cᵏ⁻¹ / β
             β = bᵏ - aᵏ⁻¹ * t[i, j, k]
-            fᵏ = get_coefficient(f, i, j, k, grid, p, tridiagonal_direction, args...)
+            fᵏ = get_coefficient(i, j, k, grid, f, p, tridiagonal_direction, args...)
 
             # If the problem is not diagonally-dominant such that `β ≈ 0`,
             # the algorithm is unstable and we elide the forward pass update of `ϕ`.
diff --git a/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl b/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
index d1ba2602cc..6d939cdb91 100644
--- a/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
+++ b/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
@@ -1,6 +1,7 @@
-using Oceananigans.Operators: Δzᵃᵃᶜ, Δzᵃᵃᶠ, Δz
+using Oceananigans.Operators: Δz
 using Oceananigans.AbstractOperations: flip
 using Oceananigans.Solvers: BatchedTridiagonalSolver, solve!
+using Oceananigans.Grids: ZDirection
 
 import Oceananigans.Solvers: get_coefficient
 
@@ -146,9 +147,9 @@ function implicit_diffusion_solver(::VerticallyImplicitTimeDiscretization, grid)
 end
 
 # Extend `get_coefficient` to retrieve `ivd_diagonal`, `_ivd_lower_diagonal` and `_ivd_upper_diagonal`.
-@inline get_coefficient(::VerticallyImplicitDiffusionLowerDiagonal, i, j, k, grid, p, tridiag_dir, args...) = maybe_tupled_ivd_lower_diagonal(i, j, k, grid, args...)
-@inline get_coefficient(::VerticallyImplicitDiffusionUpperDiagonal, i, j, k, grid, p, tridiag_dir, args...) = maybe_tupled_ivd_upper_diagonal(i, j, k, grid, args...)
-@inline get_coefficient(::VerticallyImplicitDiffusionDiagonal,      i, j, k, grid, p, tridiag_dir, args...) = ivd_diagonal(i, j, k, grid, args...)
+@inline get_coefficient(i, j, k, grid, ::VerticallyImplicitDiffusionLowerDiagonal, p, ::ZDirection, args...) = _ivd_lower_diagonal(i, j, k, grid, args...)
+@inline get_coefficient(i, j, k, grid, ::VerticallyImplicitDiffusionUpperDiagonal, p, ::ZDirection, args...) = _ivd_upper_diagonal(i, j, k, grid, args...)
+@inline get_coefficient(i, j, k, grid, ::VerticallyImplicitDiffusionDiagonal,      p, ::ZDirection, args...) = ivd_diagonal(i, j, k, grid, args...)
 
 #####
 ##### Implicit step functions

From 287ac42591c51eb3d97ca5aadabcccbc3bf32163 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Wed, 14 Jun 2023 09:19:02 -0600
Subject: [PATCH 324/530] Fix get_coefficient argument order in tridiagonal solver

---
 src/Solvers/batched_tridiagonal_solver.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/Solvers/batched_tridiagonal_solver.jl b/src/Solvers/batched_tridiagonal_solver.jl
index f4cbaebe4a..a188d0fc89 100644
--- a/src/Solvers/batched_tridiagonal_solver.jl
+++ b/src/Solvers/batched_tridiagonal_solver.jl
@@ -131,9 +131,9 @@ end
         ϕ[1, j, k] = f₁ / β
 
         @unroll for i = 2:Nx
-            cᵏ⁻¹ = get_coefficient(c, i-1, j, k, grid, p, tridiagonal_direction, args...)
-            bᵏ   = get_coefficient(b, i,   j, k, grid, p, tridiagonal_direction, args...)
-            aᵏ⁻¹ = get_coefficient(a, i-1, j, k, grid, p, tridiagonal_direction, args...)
+            cᵏ⁻¹ = get_coefficient(i-1, j, k, grid, c, p, tridiagonal_direction, args...)
+            bᵏ   = get_coefficient(i,   j, k, grid, b, p, tridiagonal_direction, args...)
+            aᵏ⁻¹ = get_coefficient(i-1, j, k, grid, a, p, tridiagonal_direction, args...)
 
             t[i, j, k] = cᵏ⁻¹ / β
             β = bᵏ - aᵏ⁻¹ * t[i, j, k]
@@ -191,7 +191,7 @@ end
 
     @inbounds begin
         β  = get_coefficient(i, j, 1, grid, b, p, tridiagonal_direction, args...)
-        f₁ = get_coefficient(i, j, 1, f, grid, p, tridiagonal_direction, args...)
+        f₁ = get_coefficient(i, j, 1, grid, f, p, tridiagonal_direction, args...)
         ϕ[i, j, 1] = f₁ / β
 
         @unroll for k = 2:Nz

From 814cd43a8b25e4b16554920ef00273cd874c12fe Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 20 Jun 2023 19:47:36 -0400
Subject: [PATCH 325/530] Fix interior Sy computed from Nx instead of Ny

---
 src/Distributed/interleave_comm_and_comp.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Distributed/interleave_comm_and_comp.jl b/src/Distributed/interleave_comm_and_comp.jl
index 6d3c6f1efc..46eb877696 100644
--- a/src/Distributed/interleave_comm_and_comp.jl
+++ b/src/Distributed/interleave_comm_and_comp.jl
@@ -30,7 +30,7 @@ function interior_tendency_kernel_parameters(grid, arch)
     Nx, Ny, Nz = size(grid)
     
     Sx = Rx == 1 ? Nx : Nx - 2Hx
-    Sy = Ry == 1 ? Ny : Nx - 2Hy
+    Sy = Ry == 1 ? Ny : Ny - 2Hy
 
     Ox = Rx == 1 ? 0 : Hx
     Oy = Ry == 1 ? 0 : Hy

From df01667e91e584a464cb03e16f8427d1fc052742 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Thu, 22 Jun 2023 12:53:27 -0600
Subject: [PATCH 326/530] Try to fix multi region immersed boundary issue

---
 src/ImmersedBoundaries/grid_fitted_bottom.jl | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/ImmersedBoundaries/grid_fitted_bottom.jl b/src/ImmersedBoundaries/grid_fitted_bottom.jl
index 0c476e45e9..9e5f701629 100644
--- a/src/ImmersedBoundaries/grid_fitted_bottom.jl
+++ b/src/ImmersedBoundaries/grid_fitted_bottom.jl
@@ -74,8 +74,13 @@ function ImmersedBoundaryGrid(grid, ib::GridFittedBottom)
     set!(bottom_field, ib.bottom_height)
     fill_halo_regions!(bottom_field)
     new_ib = GridFittedBottom(bottom_field, ib.immersed_condition)
+    return ImmersedBoundaryGrid(grid, new_ib)
+end
+
+function ImmersedBoundaryGrid(grid, ib::AbstractGridFittedBottom{<:OffsetArray})
     TX, TY, TZ = topology(grid)
-    return ImmersedBoundaryGrid{TX, TY, TZ}(grid, new_ib)
+    validate_ib_size(grid, ib)
+    return ImmersedBoundaryGrid{TX, TY, TZ}(grid, ib)
 end
 
 @inline function _immersed_cell(i, j, k, underlying_grid, ib::GridFittedBottom{<:Any, <:InterfaceImmersedCondition})

From 3113880b01262ae87bb6af87e98c4f72297a877b Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Fri, 23 Jun 2023 11:50:40 -0600
Subject: [PATCH 327/530] Hopefully fix immersed boundary grid constructor

---
 src/ImmersedBoundaries/grid_fitted_bottom.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/ImmersedBoundaries/grid_fitted_bottom.jl b/src/ImmersedBoundaries/grid_fitted_bottom.jl
index 9e5f701629..7640e0979f 100644
--- a/src/ImmersedBoundaries/grid_fitted_bottom.jl
+++ b/src/ImmersedBoundaries/grid_fitted_bottom.jl
@@ -74,7 +74,8 @@ function ImmersedBoundaryGrid(grid, ib::GridFittedBottom)
     set!(bottom_field, ib.bottom_height)
     fill_halo_regions!(bottom_field)
     new_ib = GridFittedBottom(bottom_field, ib.immersed_condition)
-    return ImmersedBoundaryGrid(grid, new_ib)
+    TX, TY, TZ = topology(grid)
+    return ImmersedBoundaryGrid{TX, TY, TZ}(grid, ib)
 end
 
 function ImmersedBoundaryGrid(grid, ib::AbstractGridFittedBottom{<:OffsetArray})

From 389e2431138b13eb6d75d709369c02c59160ed75 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Fri, 23 Jun 2023 13:33:27 -0600
Subject: [PATCH 328/530] Pass new_ib to ImmersedBoundaryGrid constructor

---
 src/ImmersedBoundaries/grid_fitted_bottom.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ImmersedBoundaries/grid_fitted_bottom.jl b/src/ImmersedBoundaries/grid_fitted_bottom.jl
index 7640e0979f..56fa63e2f9 100644
--- a/src/ImmersedBoundaries/grid_fitted_bottom.jl
+++ b/src/ImmersedBoundaries/grid_fitted_bottom.jl
@@ -75,7 +75,7 @@ function ImmersedBoundaryGrid(grid, ib::GridFittedBottom)
     fill_halo_regions!(bottom_field)
     new_ib = GridFittedBottom(bottom_field, ib.immersed_condition)
     TX, TY, TZ = topology(grid)
-    return ImmersedBoundaryGrid{TX, TY, TZ}(grid, ib)
+    return ImmersedBoundaryGrid{TX, TY, TZ}(grid, new_ib)
 end
 
 function ImmersedBoundaryGrid(grid, ib::AbstractGridFittedBottom{<:OffsetArray})

From 6492041cf4126766ccabd0f632c9af90e5baeed0 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sat, 24 Jun 2023 12:37:23 +0100
Subject: [PATCH 329/530] fixed project and manifest

---
 Manifest.toml | 2 +-
 Project.toml  | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/Manifest.toml b/Manifest.toml
index 750c7ba4fa..753a36ee12 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -1,6 +1,6 @@
 # This file is machine-generated - editing it directly is not advised
 
-julia_version = "1.8.0"
+julia_version = "1.8.5"
 manifest_format = "2.0"
 project_hash = "b61348c5ba4009d3da1a3d8c47bdeb84513faa2c"
 
diff --git a/Project.toml b/Project.toml
index 330b1448a0..164e308ba0 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "Oceananigans"
 uuid = "9e8cae18-63c1-5223-a75c-80ca9d6e9a09"
 authors = ["Climate Modeling Alliance and contributors"]
-version = "0.83.0"
+version = "0.85.0"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
@@ -37,6 +37,7 @@ StructArrays = "09ab397b-f2b6-538f-b94a-2f83cf4a842a"
 [compat]
 Adapt = "3"
 CUDA = "4"
+KernelAbstractions = "^0.9"
 Crayons = "4"
 CubedSphere = "0.1, 0.2"
 DocStringExtensions = "0.8, 0.9"

From 5633d144650e83371222f212a3ca3cdfeb01bb63 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 09:23:54 +0200
Subject: [PATCH 330/530] Use convert(FT, x) instead of FT(x) for coefficients

---
 src/Advection/reconstruction_coefficients.jl | 22 +++----
 src/Advection/weno_interpolants.jl           | 68 ++++++++++----------
 2 files changed, 45 insertions(+), 45 deletions(-)

diff --git a/src/Advection/reconstruction_coefficients.jl b/src/Advection/reconstruction_coefficients.jl
index e385954d1a..c4ca94a25a 100644
--- a/src/Advection/reconstruction_coefficients.jl
+++ b/src/Advection/reconstruction_coefficients.jl
@@ -123,19 +123,19 @@ Examples
 julia> using Oceananigans.Advection: calc_reconstruction_stencil
 
 julia> calc_reconstruction_stencil(1, :right, :x)
-:(+(FT(coeff1_right[1]) * ψ[i + 0, j, k]))
+:(+(convert(FT, coeff1_right[1]) * ψ[i + 0, j, k]))
 
 julia> calc_reconstruction_stencil(1, :left, :x)
-:(+(FT(coeff1_left[1]) * ψ[i + -1, j, k]))
+:(+(convert(FT, coeff1_left[1]) * ψ[i + -1, j, k]))
 
 julia> calc_reconstruction_stencil(1, :symm, :x)
-:(FT(coeff2_symm[2]) * ψ[i + -1, j, k] + FT(coeff2_symm[1]) * ψ[i + 0, j, k])
+:(convert(FT, coeff2_symm[2]) * ψ[i + -1, j, k] + convert(FT, coeff2_symm[1]) * ψ[i + 0, j, k])
 
 julia> calc_reconstruction_stencil(2, :symm, :x)
-:(FT(coeff4_symm[4]) * ψ[i + -2, j, k] + FT(coeff4_symm[3]) * ψ[i + -1, j, k] + FT(coeff4_symm[2]) * ψ[i + 0, j, k] + FT(coeff4_symm[1]) * ψ[i + 1, j, k])
+:(convert(FT, coeff4_symm[4]) * ψ[i + -2, j, k] + convert(FT, coeff4_symm[3]) * ψ[i + -1, j, k] + convert(FT, coeff4_symm[2]) * ψ[i + 0, j, k] + convert(FT, coeff4_symm[1]) * ψ[i + 1, j, k])
 
 julia> calc_reconstruction_stencil(3, :left, :x)
-:(FT(coeff5_left[5]) * ψ[i + -3, j, k] + FT(coeff5_left[4]) * ψ[i + -2, j, k] + FT(coeff5_left[3]) * ψ[i + -1, j, k] + FT(coeff5_left[2]) * ψ[i + 0, j, k] + FT(coeff5_left[1]) * ψ[i + 1, j, k])
+:(convert(FT, coeff5_left[5]) * ψ[i + -3, j, k] + convert(FT, coeff5_left[4]) * ψ[i + -2, j, k] + convert(FT, coeff5_left[3]) * ψ[i + -1, j, k] + convert(FT, coeff5_left[2]) * ψ[i + 0, j, k] + convert(FT, coeff5_left[1]) * ψ[i + 1, j, k])
 ```
 """
 @inline function calc_reconstruction_stencil(buffer, shift, dir, func::Bool = false)
@@ -154,16 +154,16 @@ julia> calc_reconstruction_stencil(3, :left, :x)
         c = n - buffer - 1
         if func
             stencil_full[idx] = dir == :x ? 
-                                :(FT($coeff[$(order - idx + 1)]) * ψ(i + $c, j, k, grid, args...)) :
+                                :(convert(FT, $coeff[$(order - idx + 1)]) * ψ(i + $c, j, k, grid, args...)) :
                                 dir == :y ?
-                                :(FT($coeff[$(order - idx + 1)]) * ψ(i, j + $c, k, grid, args...)) :
-                                :(FT($coeff[$(order - idx + 1)]) * ψ(i, j, k + $c, grid, args...))
+                                :(convert(FT, $coeff[$(order - idx + 1)]) * ψ(i, j + $c, k, grid, args...)) :
+                                :(convert(FT, $coeff[$(order - idx + 1)]) * ψ(i, j, k + $c, grid, args...))
         else
             stencil_full[idx] =  dir == :x ? 
-                                :(FT($coeff[$(order - idx + 1)]) * ψ[i + $c, j, k]) :
+                                :(convert(FT, $coeff[$(order - idx + 1)]) * ψ[i + $c, j, k]) :
                                 dir == :y ?
-                                :(FT($coeff[$(order - idx + 1)]) * ψ[i, j + $c, k]) :
-                                :(FT($coeff[$(order - idx + 1)]) * ψ[i, j, k + $c])
+                                :(convert(FT, $coeff[$(order - idx + 1)]) * ψ[i, j + $c, k]) :
+                                :(convert(FT, $coeff[$(order - idx + 1)]) * ψ[i, j, k + $c])
         end
     end
     return Expr(:call, :+, stencil_full...)
diff --git a/src/Advection/weno_interpolants.jl b/src/Advection/weno_interpolants.jl
index 2e382753f8..c07c492b9c 100644
--- a/src/Advection/weno_interpolants.jl
+++ b/src/Advection/weno_interpolants.jl
@@ -60,8 +60,8 @@ for buffer in [2, 3, 4, 5, 6]
             @inline Cr(scheme::WENO{$buffer}, ::Val{$stencil}) = @inbounds Cl(scheme, Val($(buffer-stencil-1)))
 
             # uniform coefficients are independent on direction and location
-            @inline  coeff_left_p(scheme::WENO{$buffer, FT}, ::Val{$stencil}, ::Type{Nothing}, args...) where FT = @inbounds FT.($(stencil_coefficients(50, stencil  , collect(1:100), collect(1:100); order = buffer)))
-            @inline coeff_right_p(scheme::WENO{$buffer, FT}, ::Val{$stencil}, ::Type{Nothing}, args...) where FT = @inbounds FT.($(stencil_coefficients(50, stencil-1, collect(1:100), collect(1:100); order = buffer)))
+            @inline  coeff_left_p(scheme::WENO{$buffer, FT}, ::Val{$stencil}, ::Type{Nothing}, args...) where FT = @inbounds convert.(Ref(FT), $(stencil_coefficients(50, stencil  , collect(1:100), collect(1:100); order = buffer)))
+            @inline coeff_right_p(scheme::WENO{$buffer, FT}, ::Val{$stencil}, ::Type{Nothing}, args...) where FT = @inbounds convert.(Ref(FT), $(stencil_coefficients(50, stencil-1, collect(1:100), collect(1:100); order = buffer)))
 
             # stretched coefficients are retrieved from precalculated coefficients
             @inline  coeff_left_p(scheme::WENO{$buffer}, ::Val{$stencil}, T, dir, i, loc) = @inbounds retrieve_coeff(scheme, $stencil,     dir, i, loc)
@@ -77,30 +77,30 @@ for buffer in [2, 3, 4, 5, 6]
 end
 
 # _UNIFORM_ smoothness coefficients (stretched smoothness coefficients are to be fixed!)
-@inline coeff_β(scheme::WENO{2, FT}, ::Val{0}) where FT = @inbounds FT.((1, -2, 1))
-@inline coeff_β(scheme::WENO{2, FT}, ::Val{1}) where FT = @inbounds FT.((1, -2, 1))
-
-@inline coeff_β(scheme::WENO{3, FT}, ::Val{0}) where FT = @inbounds FT.((10, -31, 11, 25, -19,  4))
-@inline coeff_β(scheme::WENO{3, FT}, ::Val{1}) where FT = @inbounds FT.((4,  -13, 5,  13, -13,  4))
-@inline coeff_β(scheme::WENO{3, FT}, ::Val{2}) where FT = @inbounds FT.((4,  -19, 11, 25, -31, 10))
-
-@inline coeff_β(scheme::WENO{4, FT}, ::Val{0}) where FT = @inbounds FT.((2.107,  -9.402, 7.042, -1.854, 11.003,  -17.246,  4.642,  7.043,  -3.882, 0.547))
-@inline coeff_β(scheme::WENO{4, FT}, ::Val{1}) where FT = @inbounds FT.((0.547,  -2.522, 1.922, -0.494,  3.443,  - 5.966,  1.602,  2.843,  -1.642, 0.267))
-@inline coeff_β(scheme::WENO{4, FT}, ::Val{2}) where FT = @inbounds FT.((0.267,  -1.642, 1.602, -0.494,  2.843,  - 5.966,  1.922,  3.443,  -2.522, 0.547))
-@inline coeff_β(scheme::WENO{4, FT}, ::Val{3}) where FT = @inbounds FT.((0.547,  -3.882, 4.642, -1.854,  7.043,  -17.246,  7.042, 11.003,  -9.402, 2.107))
-
-@inline coeff_β(scheme::WENO{5, FT}, ::Val{0}) where FT = @inbounds FT.((1.07918,  -6.49501, 7.58823, -4.11487,  0.86329,  10.20563, -24.62076, 13.58458, -2.88007, 15.21393, -17.04396, 3.64863,  4.82963, -2.08501, 0.22658)) 
-@inline coeff_β(scheme::WENO{5, FT}, ::Val{1}) where FT = @inbounds FT.((0.22658,  -1.40251, 1.65153, -0.88297,  0.18079,   2.42723,  -6.11976,  3.37018, -0.70237,  4.06293,  -4.64976, 0.99213,  1.38563, -0.60871, 0.06908)) 
-@inline coeff_β(scheme::WENO{5, FT}, ::Val{2}) where FT = @inbounds FT.((0.06908,  -0.51001, 0.67923, -0.38947,  0.08209,   1.04963,  -2.99076,  1.79098, -0.38947,  2.31153,  -2.99076, 0.67923,  1.04963, -0.51001, 0.06908)) 
-@inline coeff_β(scheme::WENO{5, FT}, ::Val{3}) where FT = @inbounds FT.((0.06908,  -0.60871, 0.99213, -0.70237,  0.18079,   1.38563,  -4.64976,  3.37018, -0.88297,  4.06293,  -6.11976, 1.65153,  2.42723, -1.40251, 0.22658)) 
-@inline coeff_β(scheme::WENO{5, FT}, ::Val{4}) where FT = @inbounds FT.((0.22658,  -2.08501, 3.64863, -2.88007,  0.86329,   4.82963, -17.04396, 13.58458, -4.11487, 15.21393, -24.62076, 7.58823, 10.20563, -6.49501, 1.07918)) 
-
-@inline coeff_β(scheme::WENO{6, FT}, ::Val{0}) where FT = @inbounds FT.((0.6150211, -4.7460464, 7.6206736, -6.3394124, 2.7060170, -0.4712740,  9.4851237, -31.1771244, 26.2901672, -11.3206788,  1.9834350, 26.0445372, -44.4003904, 19.2596472, -3.3918804, 19.0757572, -16.6461044, 2.9442256, 3.6480687, -1.2950184, 0.1152561)) 
-@inline coeff_β(scheme::WENO{6, FT}, ::Val{1}) where FT = @inbounds FT.((0.1152561, -0.9117992, 1.4742480, -1.2183636, 0.5134574, -0.0880548,  1.9365967,  -6.5224244,  5.5053752,  -2.3510468,  0.4067018,  5.6662212,  -9.7838784,  4.2405032, -0.7408908,  4.3093692,  -3.7913324, 0.6694608, 0.8449957, -0.3015728, 0.0271779)) 
-@inline coeff_β(scheme::WENO{6, FT}, ::Val{2}) where FT = @inbounds FT.((0.0271779, -0.2380800, 0.4086352, -0.3462252, 0.1458762, -0.0245620,  0.5653317,  -2.0427884,  1.7905032,  -0.7727988,  0.1325006,  1.9510972,  -3.5817664,  1.5929912, -0.2792660,  1.7195652,  -1.5880404, 0.2863984, 0.3824847, -0.1429976, 0.0139633)) 
-@inline coeff_β(scheme::WENO{6, FT}, ::Val{3}) where FT = @inbounds FT.((0.0139633, -0.1429976, 0.2863984, -0.2792660, 0.1325006, -0.0245620,  0.3824847,  -1.5880404,  1.5929912,  -0.7727988,  0.1458762,  1.7195652,  -3.5817664,  1.7905032, -0.3462252,  1.9510972,  -2.0427884, 0.4086352, 0.5653317, -0.2380800, 0.0271779)) 
-@inline coeff_β(scheme::WENO{6, FT}, ::Val{4}) where FT = @inbounds FT.((0.0271779, -0.3015728, 0.6694608, -0.7408908, 0.4067018, -0.0880548,  0.8449957,  -3.7913324,  4.2405032,  -2.3510468,  0.5134574,  4.3093692,  -9.7838784,  5.5053752, -1.2183636,  5.6662212,  -6.5224244, 1.4742480, 1.9365967, -0.9117992, 0.1152561)) 
-@inline coeff_β(scheme::WENO{6, FT}, ::Val{5}) where FT = @inbounds FT.((0.1152561, -1.2950184, 2.9442256, -3.3918804, 1.9834350, -0.4712740,  3.6480687, -16.6461044, 19.2596472, -11.3206788,  2.7060170, 19.0757572, -44.4003904, 26.2901672, -6.3394124, 26.0445372, -31.1771244, 7.6206736, 9.4851237, -4.7460464, 0.6150211)) 
+@inline coeff_β(scheme::WENO{2, FT}, ::Val{0}) where FT = @inbounds convert.(Ref(FT), (1, -2, 1))
+@inline coeff_β(scheme::WENO{2, FT}, ::Val{1}) where FT = @inbounds convert.(Ref(FT), (1, -2, 1))
+
+@inline coeff_β(scheme::WENO{3, FT}, ::Val{0}) where FT = @inbounds convert.(Ref(FT), (10, -31, 11, 25, -19,  4))
+@inline coeff_β(scheme::WENO{3, FT}, ::Val{1}) where FT = @inbounds convert.(Ref(FT), (4,  -13, 5,  13, -13,  4))
+@inline coeff_β(scheme::WENO{3, FT}, ::Val{2}) where FT = @inbounds convert.(Ref(FT), (4,  -19, 11, 25, -31, 10))
+
+@inline coeff_β(scheme::WENO{4, FT}, ::Val{0}) where FT = @inbounds convert.(Ref(FT), (2.107,  -9.402, 7.042, -1.854, 11.003,  -17.246,  4.642,  7.043,  -3.882, 0.547))
+@inline coeff_β(scheme::WENO{4, FT}, ::Val{1}) where FT = @inbounds convert.(Ref(FT), (0.547,  -2.522, 1.922, -0.494,  3.443,  - 5.966,  1.602,  2.843,  -1.642, 0.267))
+@inline coeff_β(scheme::WENO{4, FT}, ::Val{2}) where FT = @inbounds convert.(Ref(FT), (0.267,  -1.642, 1.602, -0.494,  2.843,  - 5.966,  1.922,  3.443,  -2.522, 0.547))
+@inline coeff_β(scheme::WENO{4, FT}, ::Val{3}) where FT = @inbounds convert.(Ref(FT), (0.547,  -3.882, 4.642, -1.854,  7.043,  -17.246,  7.042, 11.003,  -9.402, 2.107))
+
+@inline coeff_β(scheme::WENO{5, FT}, ::Val{0}) where FT = @inbounds convert.(Ref(FT), (1.07918,  -6.49501, 7.58823, -4.11487,  0.86329,  10.20563, -24.62076, 13.58458, -2.88007, 15.21393, -17.04396, 3.64863,  4.82963, -2.08501, 0.22658)) 
+@inline coeff_β(scheme::WENO{5, FT}, ::Val{1}) where FT = @inbounds convert.(Ref(FT), (0.22658,  -1.40251, 1.65153, -0.88297,  0.18079,   2.42723,  -6.11976,  3.37018, -0.70237,  4.06293,  -4.64976, 0.99213,  1.38563, -0.60871, 0.06908)) 
+@inline coeff_β(scheme::WENO{5, FT}, ::Val{2}) where FT = @inbounds convert.(Ref(FT), (0.06908,  -0.51001, 0.67923, -0.38947,  0.08209,   1.04963,  -2.99076,  1.79098, -0.38947,  2.31153,  -2.99076, 0.67923,  1.04963, -0.51001, 0.06908)) 
+@inline coeff_β(scheme::WENO{5, FT}, ::Val{3}) where FT = @inbounds convert.(Ref(FT), (0.06908,  -0.60871, 0.99213, -0.70237,  0.18079,   1.38563,  -4.64976,  3.37018, -0.88297,  4.06293,  -6.11976, 1.65153,  2.42723, -1.40251, 0.22658)) 
+@inline coeff_β(scheme::WENO{5, FT}, ::Val{4}) where FT = @inbounds convert.(Ref(FT), (0.22658,  -2.08501, 3.64863, -2.88007,  0.86329,   4.82963, -17.04396, 13.58458, -4.11487, 15.21393, -24.62076, 7.58823, 10.20563, -6.49501, 1.07918)) 
+
+@inline coeff_β(scheme::WENO{6, FT}, ::Val{0}) where FT = @inbounds convert.(Ref(FT), (0.6150211, -4.7460464, 7.6206736, -6.3394124, 2.7060170, -0.4712740,  9.4851237, -31.1771244, 26.2901672, -11.3206788,  1.9834350, 26.0445372, -44.4003904, 19.2596472, -3.3918804, 19.0757572, -16.6461044, 2.9442256, 3.6480687, -1.2950184, 0.1152561)) 
+@inline coeff_β(scheme::WENO{6, FT}, ::Val{1}) where FT = @inbounds convert.(Ref(FT), (0.1152561, -0.9117992, 1.4742480, -1.2183636, 0.5134574, -0.0880548,  1.9365967,  -6.5224244,  5.5053752,  -2.3510468,  0.4067018,  5.6662212,  -9.7838784,  4.2405032, -0.7408908,  4.3093692,  -3.7913324, 0.6694608, 0.8449957, -0.3015728, 0.0271779)) 
+@inline coeff_β(scheme::WENO{6, FT}, ::Val{2}) where FT = @inbounds convert.(Ref(FT), (0.0271779, -0.2380800, 0.4086352, -0.3462252, 0.1458762, -0.0245620,  0.5653317,  -2.0427884,  1.7905032,  -0.7727988,  0.1325006,  1.9510972,  -3.5817664,  1.5929912, -0.2792660,  1.7195652,  -1.5880404, 0.2863984, 0.3824847, -0.1429976, 0.0139633)) 
+@inline coeff_β(scheme::WENO{6, FT}, ::Val{3}) where FT = @inbounds convert.(Ref(FT), (0.0139633, -0.1429976, 0.2863984, -0.2792660, 0.1325006, -0.0245620,  0.3824847,  -1.5880404,  1.5929912,  -0.7727988,  0.1458762,  1.7195652,  -3.5817664,  1.7905032, -0.3462252,  1.9510972,  -2.0427884, 0.4086352, 0.5653317, -0.2380800, 0.0271779)) 
+@inline coeff_β(scheme::WENO{6, FT}, ::Val{4}) where FT = @inbounds convert.(Ref(FT), (0.0271779, -0.3015728, 0.6694608, -0.7408908, 0.4067018, -0.0880548,  0.8449957,  -3.7913324,  4.2405032,  -2.3510468,  0.5134574,  4.3093692,  -9.7838784,  5.5053752, -1.2183636,  5.6662212,  -6.5224244, 1.4742480, 1.9365967, -0.9117992, 0.1152561)) 
+@inline coeff_β(scheme::WENO{6, FT}, ::Val{5}) where FT = @inbounds convert.(Ref(FT), (0.1152561, -1.2950184, 2.9442256, -3.3918804, 1.9834350, -0.4712740,  3.6480687, -16.6461044, 19.2596472, -11.3206788,  2.7060170, 19.0757572, -44.4003904, 26.2901672, -6.3394124, 26.0445372, -31.1771244, 7.6206736, 9.4851237, -4.7460464, 0.6150211)) 
 
 # The rule for calculating smoothness indicators is the following (example WENO{4} which is seventh order) 
 # ψ[1] (C[1]  * ψ[1] + C[2] * ψ[2] + C[3] * ψ[3] + C[4] * ψ[4]) + 
@@ -281,16 +281,16 @@ julia> calc_weno_stencil(2, :right, :x)
             c = n - buffer - 1
             if func 
                 stencil_point[idx] =  dir == :x ? 
-                                    :(@inbounds ψ(i + $c, j, k, args...)) :
-                                    dir == :y ?
-                                    :(@inbounds ψ(i, j + $c, k, args...)) :
-                                    :(@inbounds ψ(i, j, k + $c, args...))
+                                      :(@inbounds ψ(i + $c, j, k, args...)) :
+                                      dir == :y ?
+                                      :(@inbounds ψ(i, j + $c, k, args...)) :
+                                      :(@inbounds ψ(i, j, k + $c, args...))
             else    
                 stencil_point[idx] =  dir == :x ? 
-                                    :(@inbounds ψ[i + $c, j, k]) :
-                                    dir == :y ?
-                                    :(@inbounds ψ[i, j + $c, k]) :
-                                    :(@inbounds ψ[i, j, k + $c])
+                                      :(@inbounds ψ[i + $c, j, k]) :
+                                      dir == :y ?
+                                      :(@inbounds ψ[i, j + $c, k]) :
+                                      :(@inbounds ψ[i, j, k + $c])
             end                
         end
         stencil_full[buffer - stencil + 1] = :($(stencil_point...), )

From 9f895bb9bfd104ac0c1f9053f2a46c41d7410777 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 09:25:11 +0200
Subject: [PATCH 331/530] export KernelParameters

---
 src/BoundaryConditions/BoundaryConditions.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/BoundaryConditions/BoundaryConditions.jl b/src/BoundaryConditions/BoundaryConditions.jl
index fabedd0b2c..13dd9a6e19 100644
--- a/src/BoundaryConditions/BoundaryConditions.jl
+++ b/src/BoundaryConditions/BoundaryConditions.jl
@@ -14,7 +14,7 @@ using CUDA
 using KernelAbstractions: @index, @kernel
 
 using Oceananigans.Architectures: CPU, GPU, device
-using Oceananigans.Utils: work_layout, launch!, KernelParameters
+using Oceananigans.Utils: work_layout, launch!
 using Oceananigans.Operators: Ax, Ay, Az, volume
 using Oceananigans.Grids
 

From 9842f6e73f91949b3457faa939ba209685e1bd67 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 09:26:56 +0200
Subject: [PATCH 332/530] remove FT

---
 src/Advection/upwind_biased_reconstruction.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Advection/upwind_biased_reconstruction.jl b/src/Advection/upwind_biased_reconstruction.jl
index a2a8b36c49..57932738ac 100644
--- a/src/Advection/upwind_biased_reconstruction.jl
+++ b/src/Advection/upwind_biased_reconstruction.jl
@@ -125,8 +125,8 @@ for (sd, side) in enumerate((:left, :right)), (dir, ξ, val) in zip((:xᶠᵃᵃ
 
     for buffer in [1, 2, 3, 4, 5, 6]
         @eval begin
-            @inline $stencil(i, j, k, grid, scheme::UpwindBiased{$buffer, FT}, ψ, idx, loc, args...)           where FT = @inbounds sum($(reconstruction_stencil(buffer, side, ξ, false)) .* retrieve_coeff(scheme, Val($sd), Val($val), idx, loc))
-            @inline $stencil(i, j, k, grid, scheme::UpwindBiased{$buffer, FT}, ψ::Function, idx, loc, args...) where FT = @inbounds sum($(reconstruction_stencil(buffer, side, ξ,  true)) .* retrieve_coeff(scheme, Val($sd), Val($val), idx, loc))
+            @inline $stencil(i, j, k, grid, scheme::UpwindBiased{$buffer}, ψ, idx, loc, args...)           = @inbounds sum($(reconstruction_stencil(buffer, side, ξ, false)) .* retrieve_coeff(scheme, Val($sd), Val($val), idx, loc))
+            @inline $stencil(i, j, k, grid, scheme::UpwindBiased{$buffer}, ψ::Function, idx, loc, args...) = @inbounds sum($(reconstruction_stencil(buffer, side, ξ,  true)) .* retrieve_coeff(scheme, Val($sd), Val($val), idx, loc))
         end
     end
 end

From 126829c59daa36cb3d64216d4e4bcb1352ee2359 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 09:29:27 +0200
Subject: [PATCH 333/530] removed useless where FT

---
 src/Advection/centered_reconstruction.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Advection/centered_reconstruction.jl b/src/Advection/centered_reconstruction.jl
index 9ea5147323..5c74d5b909 100644
--- a/src/Advection/centered_reconstruction.jl
+++ b/src/Advection/centered_reconstruction.jl
@@ -100,8 +100,8 @@ for (dir, ξ, val) in zip((:xᶠᵃᵃ, :yᵃᶠᵃ, :zᵃᵃᶠ), (:x, :y, :z),
 
     for buffer in [1, 2, 3, 4, 5, 6]
         @eval begin
-            @inline $stencil(i, j, k, grid, scheme::Centered{$buffer, FT}, ψ, idx, loc, args...)           where FT = @inbounds sum($(reconstruction_stencil(buffer, :symm, ξ, false)) .* retrieve_coeff(scheme, Val($val), idx, loc))
-            @inline $stencil(i, j, k, grid, scheme::Centered{$buffer, FT}, ψ::Function, idx, loc, args...) where FT = @inbounds sum($(reconstruction_stencil(buffer, :symm, ξ,  true)) .* retrieve_coeff(scheme, Val($val), idx, loc))
+            @inline $stencil(i, j, k, grid, scheme::Centered{$buffer}, ψ, idx, loc, args...)           = @inbounds sum($(reconstruction_stencil(buffer, :symm, ξ, false)) .* retrieve_coeff(scheme, Val($val), idx, loc))
+            @inline $stencil(i, j, k, grid, scheme::Centered{$buffer}, ψ::Function, idx, loc, args...) = @inbounds sum($(reconstruction_stencil(buffer, :symm, ξ,  true)) .* retrieve_coeff(scheme, Val($val), idx, loc))
         end
     end
 end

From 9569ac9f82fa3cebe04bd5eebf41b8ffaa71c3a5 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 09:55:56 +0200
Subject: [PATCH 334/530] small bugfix

---
 src/TimeSteppers/runge_kutta_3.jl | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/TimeSteppers/runge_kutta_3.jl b/src/TimeSteppers/runge_kutta_3.jl
index 0848a0a283..d8c93d3544 100644
--- a/src/TimeSteppers/runge_kutta_3.jl
+++ b/src/TimeSteppers/runge_kutta_3.jl
@@ -99,8 +99,6 @@ function time_step!(model::AbstractModel{<:RungeKutta3TimeStepper}, Δt; callbac
     # First stage
     #
 
-    calculate_tendencies!(model, callbacks)
-
     rk3_substep!(model, Δt, γ¹, nothing)
 
     calculate_pressure_correction!(model, first_stage_Δt)
@@ -115,8 +113,6 @@ function time_step!(model::AbstractModel{<:RungeKutta3TimeStepper}, Δt; callbac
     # Second stage
     #
 
-    calculate_tendencies!(model, callbacks)
-
     rk3_substep!(model, Δt, γ², ζ²)
 
     calculate_pressure_correction!(model, second_stage_Δt)
@@ -130,8 +126,6 @@ function time_step!(model::AbstractModel{<:RungeKutta3TimeStepper}, Δt; callbac
     #
     # Third stage
     #
-
-    calculate_tendencies!(model, callbacks)
     
     rk3_substep!(model, Δt, γ³, ζ³)
 

From 2917114aef7aca12379633d9e66f8e05f7d0a60d Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 09:59:17 +0200
Subject: [PATCH 335/530] update manifest

---
 Manifest.toml | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/Manifest.toml b/Manifest.toml
index 30e3fb1a45..c7e3ad6d5f 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -2,13 +2,13 @@
 
 julia_version = "1.8.5"
 manifest_format = "2.0"
-project_hash = "b61348c5ba4009d3da1a3d8c47bdeb84513faa2c"
+project_hash = "c19e7e0747cdfab35ec6d8d8bd4d66290cc1b731"
 
 [[deps.AbstractFFTs]]
 deps = ["ChainRulesCore", "LinearAlgebra"]
-git-tree-sha1 = "16b6dbc4cf7caee4e1e75c49485ec67b667098a0"
+git-tree-sha1 = "8bc0aaec0ca548eb6cf5f0d7d16351650c1ee956"
 uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
-version = "1.3.1"
+version = "1.3.2"
 
 [[deps.Adapt]]
 deps = ["LinearAlgebra", "Requires"]
@@ -62,10 +62,10 @@ uuid = "179af706-886a-5703-950a-314cd64e0468"
 version = "0.1.2"
 
 [[deps.CUDA]]
-deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Preferences", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "UnsafeAtomicsLLVM"]
-git-tree-sha1 = "442d989978ed3ff4e174c928ee879dc09d1ef693"
+deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Preferences", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "UnsafeAtomicsLLVM"]
+git-tree-sha1 = "35160ef0f03b14768abfd68b830f8e3940e8e0dc"
 uuid = "052768ef-5323-5732-b1bb-66c8b64840ba"
-version = "4.3.2"
+version = "4.4.0"
 
 [[deps.CUDA_Driver_jll]]
 deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"]
@@ -112,7 +112,7 @@ version = "4.7.0"
 [[deps.CompilerSupportLibraries_jll]]
 deps = ["Artifacts", "Libdl"]
 uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
-version = "0.5.2+0"
+version = "1.0.1+0"
 
 [[deps.Crayons]]
 git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15"
@@ -205,9 +205,9 @@ version = "0.1.5"
 
 [[deps.GPUCompiler]]
 deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"]
-git-tree-sha1 = "cb090aea21c6ca78d59672a7e7d13bd56d09de64"
+git-tree-sha1 = "69a9aa4346bca723e46769ff6b6277e597c969b1"
 uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
-version = "0.20.3"
+version = "0.21.2"
 
 [[deps.Glob]]
 git-tree-sha1 = "97285bbd5230dd766e9ef6749b80fc617126d496"
@@ -283,17 +283,17 @@ version = "1.13.1"
 
 [[deps.KernelAbstractions]]
 deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"]
-git-tree-sha1 = "bbb7ac4a3194c0d1561b9dea2a20e8f1ab68f709"
+git-tree-sha1 = "ada2d5824ce593ff117e714d29f8e890419e8b78"
 repo-rev = "main"
 repo-url = "https://github.com/simone-silvestri/KernelAbstractions.jl"
 uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
-version = "0.9.6"
+version = "0.9.4"
 
 [[deps.LLVM]]
 deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
-git-tree-sha1 = "5007c1421563108110bbd57f63d8ad4565808818"
+git-tree-sha1 = "7d5788011dd273788146d40eb5b1fbdc199d0296"
 uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
-version = "5.2.0"
+version = "6.0.1"
 
 [[deps.LLVMExtra_jll]]
 deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
@@ -688,7 +688,7 @@ version = "1.10.1"
 [[deps.Tar]]
 deps = ["ArgTools", "SHA"]
 uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
-version = "1.10.0"
+version = "1.10.1"
 
 [[deps.TaylorSeries]]
 deps = ["LinearAlgebra", "Markdown", "Requires", "SparseArrays"]
@@ -731,9 +731,9 @@ version = "0.2.1"
 
 [[deps.UnsafeAtomicsLLVM]]
 deps = ["LLVM", "UnsafeAtomics"]
-git-tree-sha1 = "ea37e6066bf194ab78f4e747f5245261f17a7175"
+git-tree-sha1 = "323e3d0acf5e78a56dfae7bd8928c989b4f3083e"
 uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
-version = "0.1.2"
+version = "0.1.3"
 
 [[deps.VersionParsing]]
 git-tree-sha1 = "58d6e80b4ee071f5efd07fda82cb9fbe17200868"

From fa38abc5c97f6e18a2b53d4a999f06217927c7fd Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 10:20:45 +0200
Subject: [PATCH 336/530] remove unbuffered communication

---
 src/Distributed/Distributed.jl           |  1 -
 src/Distributed/distributed_utils.jl     | 96 ------------------------
 src/Distributed/halo_communication.jl    | 12 +--
 src/Distributed/multi_architectures.jl   | 18 ++---
 src/Fields/field_boundary_buffers.jl     | 11 ---
 src/Utils/multi_region_transformation.jl |  3 +-
 6 files changed, 9 insertions(+), 132 deletions(-)
 delete mode 100644 src/Distributed/distributed_utils.jl

diff --git a/src/Distributed/Distributed.jl b/src/Distributed/Distributed.jl
index 23f6f93285..9caf1037e3 100644
--- a/src/Distributed/Distributed.jl
+++ b/src/Distributed/Distributed.jl
@@ -10,7 +10,6 @@ using MPI
 using Oceananigans.Utils
 using Oceananigans.Grids
 
-include("distributed_utils.jl")
 include("multi_architectures.jl")
 include("partition_assemble.jl")
 include("distributed_grids.jl")
diff --git a/src/Distributed/distributed_utils.jl b/src/Distributed/distributed_utils.jl
deleted file mode 100644
index b91a951522..0000000000
--- a/src/Distributed/distributed_utils.jl
+++ /dev/null
@@ -1,96 +0,0 @@
-using Oceananigans.Fields: AbstractField
-using Oceananigans.Grids:
-    interior_indices,
-    left_halo_indices, right_halo_indices,
-    underlying_left_halo_indices, underlying_right_halo_indices
-
-# TODO: Move to Grids/grid_utils.jl
-
-#####
-##### Viewing halos
-#####
-
-west_halo(f::AbstractField{LX, LY, LZ}; include_corners=true) where {LX, LY, LZ} =
-    include_corners ? view(f.data, left_halo_indices(instantiate(LX), instantiate(topology(f, 1)), f.grid.Nx, f.grid.Hx), :, :) :
-                      view(f.data, left_halo_indices(instantiate(LX), instantiate(topology(f, 1)), f.grid.Nx, f.grid.Hx),
-                                   interior_indices(instantiate(LY), instantiate(topology(f, 2)), f.grid.Ny),
-                                   interior_indices(instantiate(LZ), instantiate(topology(f, 3)), f.grid.Nz))
-
-east_halo(f::AbstractField{LX, LY, LZ}; include_corners=true) where {LX, LY, LZ} =
-    include_corners ? view(f.data, right_halo_indices(instantiate(LX), instantiate(topology(f, 1)), f.grid.Nx, f.grid.Hx), :, :) :
-                      view(f.data, right_halo_indices(instantiate(LX), instantiate(topology(f, 1)), f.grid.Nx, f.grid.Hx),
-                                   interior_indices(instantiate(LY), instantiate(topology(f, 2)), f.grid.Ny),
-                                   interior_indices(instantiate(LZ), instantiate(topology(f, 3)), f.grid.Nz))
-
-south_halo(f::AbstractField{LX, LY, LZ}; include_corners=true) where {LX, LY, LZ} =
-    include_corners ? view(f.data, :, left_halo_indices(instantiate(LY), instantiate(topology(f, 2)), f.grid.Ny, f.grid.Hy), :) :
-                      view(f.data, interior_indices(instantiate(LX), instantiate(topology(f, 1)), f.grid.Nx),
-                                   left_halo_indices(instantiate(LY), instantiate(topology(f, 2)), f.grid.Ny, f.grid.Hy),
-                                   interior_indices(instantiate(LZ), instantiate(topology(f, 3)), f.grid.Nz))
-
-north_halo(f::AbstractField{LX, LY, LZ}; include_corners=true) where {LX, LY, LZ} =
-    include_corners ? view(f.data, :, right_halo_indices(instantiate(LY), instantiate(topology(f, 2)), f.grid.Ny, f.grid.Hy), :) :
-                      view(f.data, interior_indices(instantiate(LX), instantiate(topology(f, 1)), f.grid.Nx),
-                                   right_halo_indices(instantiate(LY), instantiate(topology(f, 2)), f.grid.Ny, f.grid.Hy),
-                                   interior_indices(instantiate(LZ), instantiate(topology(f, 3)), f.grid.Nz))
-
-bottom_halo(f::AbstractField{LX, LY, LZ}; include_corners=true) where {LX, LY, LZ} =
-    include_corners ? view(f.data, :, :, left_halo_indices(instantiate(LZ), instantiate(topology(f, 3)), f.grid.Nz, f.grid.Hz)) :
-                      view(f.data, interior_indices(instantiate(LX), instantiate(topology(f, 1)), f.grid.Nx),
-                                   interior_indices(instantiate(LY), instantiate(topology(f, 2)), f.grid.Ny),
-                                   left_halo_indices(instantiate(LZ), instantiate(topology(f, 3)), f.grid.Nz, f.grid.Hz))
-
-top_halo(f::AbstractField{LX, LY, LZ}; include_corners=true) where {LX, LY, LZ} =
-    include_corners ? view(f.data, :, :, right_halo_indices(instantiate(LZ), instantiate(topology(f, 3)), f.grid.Nz, f.grid.Hz)) :
-                      view(f.data, interior_indices(instantiate(LX), instantiate(topology(f, 1)), f.grid.Nx),
-                                   interior_indices(instantiate(LY), instantiate(topology(f, 2)), f.grid.Ny),
-                                   right_halo_indices(instantiate(LZ), instantiate(topology(f, 3)), f.grid.Nz, f.grid.Hz))
-
-instantiate(T::Type) = T()
-instantiate(t) = t
-
-underlying_west_halo(f, grid, location) =
-    view(f.parent, underlying_left_halo_indices(instantiate(location), instantiate(topology(grid, 1)), grid.Nx, grid.Hx), :, :)
-
-underlying_east_halo(f, grid, location) =
-    view(f.parent, underlying_right_halo_indices(instantiate(location), instantiate(topology(grid, 1)), grid.Nx, grid.Hx), :, :)
-
-underlying_south_halo(f, grid, location) =
-    view(f.parent, :, underlying_left_halo_indices(instantiate(location), instantiate(topology(grid, 2)), grid.Ny, grid.Hy), :)
-
-underlying_north_halo(f, grid, location) =
-    view(f.parent, :, underlying_right_halo_indices(instantiate(location), instantiate(topology(grid, 2)), grid.Ny, grid.Hy), :)
-
-underlying_bottom_halo(f, grid, location) =
-    view(f.parent, :, :, underlying_left_halo_indices(instantiate(location), instantiate(topology(grid, 3)), grid.Nz, grid.Hz))
-
-underlying_top_halo(f, grid, location) =
-    view(f.parent, :, :, underlying_right_halo_indices(instantiate(location), instantiate(topology(grid, 3)), grid.Nz, grid.Hz))
-
-#####
-##### Viewing boundary grid points (used to fill other halos)
-#####
-
-underlying_left_boundary_indices(loc, topo, N, H) = 1+H:2H
-underlying_left_boundary_indices(::Nothing, topo, N, H) = 1:0 # empty
-
-underlying_right_boundary_indices(loc, topo, N, H) = N+1:N+H
-underlying_right_boundary_indices(::Nothing, topo, N, H) = 1:0 # empty
-
-underlying_west_boundary(f, grid, location) =
-    view(f.parent, underlying_left_boundary_indices(instantiate(location), instantiate(topology(grid, 1)), grid.Nx, grid.Hx), :, :)
-
-underlying_east_boundary(f, grid, location) =
-    view(f.parent, underlying_right_boundary_indices(instantiate(location), instantiate(topology(grid, 1)), grid.Nx, grid.Hx), :, :)
-
-underlying_south_boundary(f, grid, location) =
-    view(f.parent, :, underlying_left_boundary_indices(instantiate(location), instantiate(topology(grid, 2)), grid.Ny, grid.Hy), :)
-
-underlying_north_boundary(f, grid, location) =
-    view(f.parent, :, underlying_right_boundary_indices(instantiate(location), instantiate(topology(grid, 2)), grid.Ny, grid.Hy), :)
-
-underlying_bottom_boundary(f, grid, location) =
-    view(f.parent, :, :, underlying_left_boundary_indices(instantiate(location), instantiate(topology(grid, 3)), grid.Nz, grid.Hz))
-
-underlying_top_boundary(f, grid, location) =
-    view(f.parent, :, :, underlying_right_boundary_indices(instantiate(location), instantiate(topology(grid, 3)), grid.Nz, grid.Hz))
diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 78bbe719b7..1c0d8ef6f8 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -1,9 +1,7 @@
 using KernelAbstractions: @kernel, @index, priority!
 using OffsetArrays: OffsetArray
-using CUDA: synchronize
 using CUDA: cuStreamGetFlags, stream, priority_range, CUstream_flags_enum, CuStream, stream!
 
-import Oceananigans.Utils: sync_device!
 using Oceananigans.Fields: fill_send_buffers!,
                            recv_from_buffers!, 
                            reduced_dimensions, 
@@ -25,10 +23,6 @@ import Oceananigans.BoundaryConditions:
     fill_south_and_north_halo!,
     fill_bottom_and_top_halo!
 
-@inline sync_device!(::CPU)                 = nothing
-@inline sync_device!(::GPU)                 = synchronize()
-@inline sync_device!(arch::DistributedArch) = sync_device!(arch.child_architecture)
-
 #####
 ##### MPI tags for halo communication BCs
 #####
@@ -310,8 +304,7 @@ for side in sides
             return send_req
         end
 
-        @inline $get_side_send_buffer(c, grid, side_location, buffers, ::ViewsDistributedArch) = $underlying_side_boundary(c, grid, side_location)
-        @inline $get_side_send_buffer(c, grid, side_location, buffers, arch)                   = buffers.$side.send     
+        @inline $get_side_send_buffer(c, grid, side_location, buffers, arch) = buffers.$side.send     
     end
 end
 
@@ -337,7 +330,6 @@ for side in sides
             return recv_req
         end
 
-        @inline $get_side_recv_buffer(c, grid, side_location, buffers, ::ViewsDistributedArch) = $underlying_side_halo(c, grid, side_location)
-        @inline $get_side_recv_buffer(c, grid, side_location, buffers, arch)                   = buffers.$side.recv
+        @inline $get_side_recv_buffer(c, grid, side_location, buffers, arch) = buffers.$side.recv
     end
 end
diff --git a/src/Distributed/multi_architectures.jl b/src/Distributed/multi_architectures.jl
index 8b18fcf2ca..b1b6d8d4b1 100644
--- a/src/Distributed/multi_architectures.jl
+++ b/src/Distributed/multi_architectures.jl
@@ -5,8 +5,9 @@ using CUDA: ndevices, device!
 import Oceananigans.Architectures: device, arch_array, array_type, child_architecture
 import Oceananigans.Grids: zeros
 import Oceananigans.Fields: using_buffered_communication
+import Oceananigans.Utils: sync_device!
 
-struct DistributedArch{A, R, I, ρ, C, γ, B, M, T} <: AbstractArchitecture
+struct DistributedArch{A, R, I, ρ, C, γ, M, T} <: AbstractArchitecture
   child_architecture :: A
           local_rank :: R
          local_index :: I
@@ -25,7 +26,6 @@ end
     DistributedArch(child_architecture = CPU(); 
                     topology = (Periodic, Periodic, Periodic), 
                     ranks, 
-                    use_buffers = false,
                     devices = nothing, 
                     communicator = MPI.COMM_WORLD)
 
@@ -48,10 +48,6 @@ Keyword arguments
                       `y` and `z` direction. NOTE: support for distributed z direction is 
                       limited, so `Rz = 1` is strongly suggested.
 
-- `use_buffers`: if `true`, buffered halo communication is implemented. If `false`, halos will be 
-                 exchanged through views. Buffered communication is not necessary in case of `CPU`
-                 execution, but it is necessary for `GPU` execution without CUDA-aware MPI
-
 - `devices`: `GPU` device linked to local rank. The GPU will be assigned based on the 
              local node rank as such `devices[node_rank]`. Make sure to run `--ntasks-per-node` <= `--gres=gpu`.
              If `nothing`, the devices will be assigned automatically based on the available resources
@@ -62,7 +58,6 @@ Keyword arguments
 function DistributedArch(child_architecture = CPU(); 
                          topology = (Periodic, Periodic, Periodic), 
                          ranks,
-                         use_buffers = true,
                          devices = nothing, 
                          enable_overlapped_computation = true,
                          communicator = MPI.COMM_WORLD)
@@ -107,17 +102,13 @@ function DistributedArch(child_architecture = CPU();
 
     mpi_requests = enable_overlapped_computation ? MPI.Request[] : nothing
 
-    B = use_buffers
     M = typeof(mpi_requests)
     T = typeof([0])
 
-    return DistributedArch{A, R, I, ρ, C, γ, B, M, T}(child_architecture, local_rank, local_index, ranks, local_connectivity, communicator, mpi_requests, [0])
+    return DistributedArch{A, R, I, ρ, C, γ, M, T}(child_architecture, local_rank, local_index, ranks, local_connectivity, communicator, mpi_requests, [0])
 end
 
-const ViewsDistributedArch   = DistributedArch{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, false}
-const BlockingDistributedArch = DistributedArch{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:Nothing}
-
-using_buffered_communication(::DistributedArch{A, R, I, ρ, C, γ, B}) where {A, R, I, ρ, C, γ, B} = B
+const BlockingDistributedArch = DistributedArch{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:Nothing}
 
 #####
 ##### All the architectures
@@ -128,6 +119,7 @@ device(arch::DistributedArch)             = device(child_architecture(arch))
 arch_array(arch::DistributedArch, A)      = arch_array(child_architecture(arch), A)
 zeros(FT, arch::DistributedArch, N...)    = zeros(FT, child_architecture(arch), N...)
 array_type(arch::DistributedArch)         = array_type(child_architecture(arch))
+sync_device!(arch::DistributedArch)       = sync_device!(arch.child_architecture)
 
 #####
 ##### Converting between index and MPI rank taking k as the fast index
diff --git a/src/Fields/field_boundary_buffers.jl b/src/Fields/field_boundary_buffers.jl
index e43b83f72a..b61f5f4254 100644
--- a/src/Fields/field_boundary_buffers.jl
+++ b/src/Fields/field_boundary_buffers.jl
@@ -52,27 +52,16 @@ create_buffer_y(arch, grid, data, H, bc) = nothing
 create_buffer_corner(arch, grid, data, Hx, Hy, ::Nothing) = nothing
 
 function create_buffer_corner(arch, grid, data, Hx, Hy, side)
-    if !using_buffered_communication(arch)
-        return nothing
-    end
     return (send = arch_array(arch, zeros(eltype(data), Hx, Hy, size(parent(data), 3))), 
             recv = arch_array(arch, zeros(eltype(data), Hx, Hy, size(parent(data), 3))))    
 end
 
-using_buffered_communication(arch) = true
-
 function create_buffer_x(arch, grid, data, H, ::DCBC) 
-    if !using_buffered_communication(arch)
-        return nothing
-    end
     return (send = arch_array(arch, zeros(eltype(data), H, size(grid, 2), size(parent(data), 3))), 
             recv = arch_array(arch, zeros(eltype(data), H, size(grid, 2), size(parent(data), 3))))    
 end
 
 function create_buffer_y(arch, grid, data, H, ::DCBC)
-    if !using_buffered_communication(arch)
-        return nothing
-    end
     return (send = arch_array(arch, zeros(eltype(data), size(grid, 1), H, size(parent(data), 3))), 
             recv = arch_array(arch, zeros(eltype(data), size(grid, 1), H, size(parent(data), 3))))
 end
diff --git a/src/Utils/multi_region_transformation.jl b/src/Utils/multi_region_transformation.jl
index de525cecf4..fcfab540c4 100644
--- a/src/Utils/multi_region_transformation.jl
+++ b/src/Utils/multi_region_transformation.jl
@@ -175,8 +175,9 @@ end
     end 
 end
 
+@inline sync_device!(::CPU)      = nothing
+@inline sync_device!(::GPU)      = CUDA.synchronize()
 @inline sync_device!(::CuDevice) = CUDA.synchronize()
-@inline sync_device!(dev)        = nothing
 
 
 # TODO: The macro errors when there is a return and the function has (args...) in the 

From 2cac349ea141ecc350579c20f7e4e0108f104ca6 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 10:38:02 +0200
Subject: [PATCH 337/530] little bit of a cleanup

---
 src/Fields/field_boundary_buffers.jl | 58 ++--------------------------
 1 file changed, 3 insertions(+), 55 deletions(-)

diff --git a/src/Fields/field_boundary_buffers.jl b/src/Fields/field_boundary_buffers.jl
index b61f5f4254..24b7fd9dfd 100644
--- a/src/Fields/field_boundary_buffers.jl
+++ b/src/Fields/field_boundary_buffers.jl
@@ -107,50 +107,6 @@ function fill_send_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid)
     return nothing
 end
 
-function fill_west_and_east_send_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid)
-    Hx, Hy, _ = halo_size(grid)
-    Nx, Ny, _ = size(grid)
-
-    _fill_west_send_buffer!(parent(c), buffers.west, Hx, Hy, Nx, Ny)
-    _fill_east_send_buffer!(parent(c), buffers.east, Hx, Hy, Nx, Ny)
-
-    return nothing
-end
-
-function fill_south_and_north_send_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid)
-    Hx, Hy, _ = halo_size(grid)
-    Nx, Ny, _ = size(grid)
-
-    _fill_south_send_buffer!(parent(c), buffers.south, Hx, Hy, Nx, Ny)
-    _fill_north_send_buffer!(parent(c), buffers.north, Hx, Hy, Nx, Ny)
-
-    return nothing
-end
-
-fill_west_send_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid) = 
-    _fill_west_send_buffer!(parent(c), buffers.west, halo_size(grid)[1], halo_size(grid)[2], size(grid, 1), size(grid, 2))
-
-fill_east_send_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid) = 
-    _fill_east_send_buffer!(parent(c), buffers.east, halo_size(grid)[1], halo_size(grid)[2], size(grid, 1), size(grid, 2))
-
-fill_south_send_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid) = 
-    _fill_south_send_buffer!(parent(c), buffers.south, halo_size(grid)[1], halo_size(grid)[2], size(grid, 1), size(grid, 2))
-
-fill_north_send_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid) = 
-    _fill_north_send_buffer!(parent(c), buffers.north, halo_size(grid)[1], halo_size(grid)[2], size(grid, 1), size(grid, 2))
-
-fill_southwest_send_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid) = 
-    _fill_southwest_send_buffer!(parent(c), buffers.southwest, halo_size(grid)[1], halo_size(grid)[2], size(grid, 1), size(grid, 2))
-
-fill_southeast_send_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid) = 
-    _fill_southeast_send_buffer!(parent(c), buffers.southeast, halo_size(grid)[1], halo_size(grid)[2], size(grid, 1), size(grid, 2))
-
-fill_northwest_send_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid) = 
-    _fill_northwest_send_buffer!(parent(c), buffers.northwest, halo_size(grid)[1], halo_size(grid)[2], size(grid, 1), size(grid, 2))
-
-fill_northeast_send_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid) = 
-    _fill_northeast_send_buffer!(parent(c), buffers.northeast, halo_size(grid)[1], halo_size(grid)[2], size(grid, 1), size(grid, 2))
-
 """
     recv_from_buffers(c, buffers, arch)
 
@@ -208,17 +164,9 @@ end
 
 recv_from_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid, ::Val{:bottom_and_top}) = nothing
 
-recv_from_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid, ::Val{:southwest}) = 
-        _recv_from_southwest_buffer!(c, buffers.southwest, halo_size(grid)[1], halo_size(grid)[2], size(grid, 1), size(grid, 2))
-
-recv_from_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid, ::Val{:southeast}) = 
-        _recv_from_southeast_buffer!(c, buffers.southeast, halo_size(grid)[1], halo_size(grid)[2], size(grid, 1), size(grid, 2))
-
-recv_from_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid, ::Val{:northwest}) = 
-        _recv_from_northwest_buffer!(c, buffers.northwest, halo_size(grid)[1], halo_size(grid)[2], size(grid, 1), size(grid, 2))
-
-recv_from_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid, ::Val{:northeast}) = 
-        _recv_from_northeast_buffer!(c, buffers.northeast, halo_size(grid)[1], halo_size(grid)[2], size(grid, 1), size(grid, 2))
+#####
+##### Individual _fill_send_buffers and _recv_from_buffer kernels
+#####
 
  _fill_west_send_buffer!(c, ::Nothing, args...) = nothing
  _fill_east_send_buffer!(c, ::Nothing, args...) = nothing

From 8564df20650a59bc9e2789190ebc46f9260bc5c5 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 10:38:30 +0200
Subject: [PATCH 338/530] removed `views` comment

---
 src/Fields/field_boundary_buffers.jl | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/Fields/field_boundary_buffers.jl b/src/Fields/field_boundary_buffers.jl
index 24b7fd9dfd..ce6d2e5324 100644
--- a/src/Fields/field_boundary_buffers.jl
+++ b/src/Fields/field_boundary_buffers.jl
@@ -87,8 +87,7 @@ Adapt.adapt_structure(to, buff::FieldBoundaryBuffers) =
 """
     fill_send_buffers(c, buffers, arch)
 
-fills `buffers.send` from OffsetArray `c` preparing for message passing. If we are on CPU
-we do not need to fill the buffers as the transfer can happen through views
+fills `buffers.send` from OffsetArray `c` preparing for message passing. 
 """
 function fill_send_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid)
     Hx, Hy, _ = halo_size(grid)
@@ -110,8 +109,7 @@ end
 """
     recv_from_buffers(c, buffers, arch)
 
-fills OffsetArray `c` from `buffers.recv` after message passing occurred. If we are on CPU
-we do not need to fill the buffers as the transfer can happen through views
+Fill the OffsetArray `c` from `buffers.recv` after message passing has occurred.
 """
 function recv_from_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid)
     Hx, Hy, _ = halo_size(grid)

From a8c29e14ac5209049432073de8126a156d2674c5 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 10:52:41 +0200
Subject: [PATCH 339/530] couple of bugfixes

---
 src/Distributed/multi_architectures.jl   |  7 -------
 src/TurbulenceClosures/closure_tuples.jl | 13 ++-----------
 2 files changed, 2 insertions(+), 18 deletions(-)

diff --git a/src/Distributed/multi_architectures.jl b/src/Distributed/multi_architectures.jl
index b1b6d8d4b1..a4fc2bb293 100644
--- a/src/Distributed/multi_architectures.jl
+++ b/src/Distributed/multi_architectures.jl
@@ -4,7 +4,6 @@ using CUDA: ndevices, device!
 
 import Oceananigans.Architectures: device, arch_array, array_type, child_architecture
 import Oceananigans.Grids: zeros
-import Oceananigans.Fields: using_buffered_communication
 import Oceananigans.Utils: sync_device!
 
 struct DistributedArch{A, R, I, ρ, C, γ, M, T} <: AbstractArchitecture
@@ -64,12 +63,6 @@ function DistributedArch(child_architecture = CPU();
 
     MPI.Initialized() || error("Must call MPI.Init() before constructing a MultiCPU.")
 
-    (use_buffers && child_architecture isa CPU) && 
-            @warn "Using buffers on CPU architectures is not required (but useful for testing)"
-
-    (!use_buffers && child_architecture isa GPU) && 
-            @warn "On GPU architectures not using buffers will lead to a substantial slowdown https://www.open-mpi.org/faq/?category=runcuda#mpi-cuda-support"
-
     validate_tupled_argument(ranks, Int, "ranks")
 
     Rx, Ry, Rz = ranks
diff --git a/src/TurbulenceClosures/closure_tuples.jl b/src/TurbulenceClosures/closure_tuples.jl
index 8c04f980d7..917a03fc3a 100644
--- a/src/TurbulenceClosures/closure_tuples.jl
+++ b/src/TurbulenceClosures/closure_tuples.jl
@@ -89,19 +89,10 @@ end
 @inline κ_kernel_size(grid, closure::AbstractArray) = κ_kernel_size(grid, closure[1])
 @inline κ_kernel_offsets(grid, closure::AbstractArray) = κ_kernel_offsets(grid, closure[1])
 
-@inline function κ_kernel_offsets(grid, closure_tuple::Tuple)
-    kernel_offsets = (0, 0, 0)
-    for closure in closure_tuple
-        kernel_offsets = max.(kernel_offsets, κ_kernel_offsets(grid, closure))
-    end
-
-    return kernel_offsets
-end
-
 @inline function κ_kernel_size(grid, closure_tuple::Tuple)
     kernel_size = (0, 0, 0)
     for closure in closure_tuple
-        kernel_size = max.(kernel_size, κ_kernel_size(grid, closure))
+        kernel_size = map(max, kernel_size, κ_kernel_size(grid, closure))
     end
 
     return kernel_size
@@ -110,7 +101,7 @@ end
 @inline function κ_kernel_offsets(grid, closure_tuple::Tuple)
     kernel_offsets = (0, 0, 0)
     for closure in closure_tuple
-        kernel_offsets = max.(kernel_offsets, κ_kernel_offsets(grid, closure))
+        kernel_offsets = map(min, kernel_offsets, κ_kernel_offsets(grid, closure))
     end
 
     return kernel_offsets

From db8d996402838f599d73deec00496f64a04b3283 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 11:02:22 +0200
Subject: [PATCH 340/530] fixed tests

---
 src/Distributed/multi_architectures.jl |   9 +-
 test/test_distributed_models.jl        | 194 +++++++++++++++----------
 2 files changed, 119 insertions(+), 84 deletions(-)

diff --git a/src/Distributed/multi_architectures.jl b/src/Distributed/multi_architectures.jl
index a4fc2bb293..7f14cd7a47 100644
--- a/src/Distributed/multi_architectures.jl
+++ b/src/Distributed/multi_architectures.jl
@@ -23,7 +23,7 @@ end
 
 """
     DistributedArch(child_architecture = CPU(); 
-                    topology = (Periodic, Periodic, Periodic), 
+                    topology, 
                     ranks, 
                     devices = nothing, 
                     communicator = MPI.COMM_WORLD)
@@ -40,9 +40,8 @@ Positional arguments
 Keyword arguments
 =================
 
-- `topology`: the topology we want the grid to have. It is used to establish connectivity.
-              Default: `topology = (Periodic, Periodic, Periodic)`.
-
+- `topology` (required): the topology we want the grid to have. It is used to establish connectivity.
+                        
 - `ranks` (required): A 3-tuple `(Rx, Ry, Rz)` specifying the total processors in the `x`, 
                       `y` and `z` direction. NOTE: support for distributed z direction is 
                       limited, so `Rz = 1` is strongly suggested.
@@ -55,7 +54,7 @@ Keyword arguments
                   if not for testing or developing. Change at your own risk!
 """
 function DistributedArch(child_architecture = CPU(); 
-                         topology = (Periodic, Periodic, Periodic), 
+                         topology, 
                          ranks,
                          devices = nothing, 
                          enable_overlapped_computation = true,
diff --git a/test/test_distributed_models.jl b/test/test_distributed_models.jl
index 619aba612a..8b88a9940a 100644
--- a/test/test_distributed_models.jl
+++ b/test/test_distributed_models.jl
@@ -26,7 +26,52 @@ MPI.Init()
 # to initialize MPI.
 
 using Oceananigans.BoundaryConditions: fill_halo_regions!, DCBC
-using Oceananigans.Distributed: DistributedArch, index2rank, east_halo, west_halo, north_halo, south_halo, top_halo, bottom_halo
+using Oceananigans.Distributed: DistributedArch, index2rank
+using Oceananigans.Fields: AbstractField
+using Oceananigans.Grids:
+    interior_indices,
+    left_halo_indices, right_halo_indices,
+    underlying_left_halo_indices, underlying_right_halo_indices
+
+#####
+##### Viewing halos
+#####
+
+west_halo(f::AbstractField{LX, LY, LZ}; include_corners=true) where {LX, LY, LZ} =
+    include_corners ? view(f.data, left_halo_indices(instantiate(LX), instantiate(topology(f, 1)), f.grid.Nx, f.grid.Hx), :, :) :
+                      view(f.data, left_halo_indices(instantiate(LX), instantiate(topology(f, 1)), f.grid.Nx, f.grid.Hx),
+                                   interior_indices(instantiate(LY), instantiate(topology(f, 2)), f.grid.Ny),
+                                   interior_indices(instantiate(LZ), instantiate(topology(f, 3)), f.grid.Nz))
+
+east_halo(f::AbstractField{LX, LY, LZ}; include_corners=true) where {LX, LY, LZ} =
+    include_corners ? view(f.data, right_halo_indices(instantiate(LX), instantiate(topology(f, 1)), f.grid.Nx, f.grid.Hx), :, :) :
+                      view(f.data, right_halo_indices(instantiate(LX), instantiate(topology(f, 1)), f.grid.Nx, f.grid.Hx),
+                                   interior_indices(instantiate(LY), instantiate(topology(f, 2)), f.grid.Ny),
+                                   interior_indices(instantiate(LZ), instantiate(topology(f, 3)), f.grid.Nz))
+
+south_halo(f::AbstractField{LX, LY, LZ}; include_corners=true) where {LX, LY, LZ} =
+    include_corners ? view(f.data, :, left_halo_indices(instantiate(LY), instantiate(topology(f, 2)), f.grid.Ny, f.grid.Hy), :) :
+                      view(f.data, interior_indices(instantiate(LX), instantiate(topology(f, 1)), f.grid.Nx),
+                                   left_halo_indices(instantiate(LY), instantiate(topology(f, 2)), f.grid.Ny, f.grid.Hy),
+                                   interior_indices(instantiate(LZ), instantiate(topology(f, 3)), f.grid.Nz))
+
+north_halo(f::AbstractField{LX, LY, LZ}; include_corners=true) where {LX, LY, LZ} =
+    include_corners ? view(f.data, :, right_halo_indices(instantiate(LY), instantiate(topology(f, 2)), f.grid.Ny, f.grid.Hy), :) :
+                      view(f.data, interior_indices(instantiate(LX), instantiate(topology(f, 1)), f.grid.Nx),
+                                   right_halo_indices(instantiate(LY), instantiate(topology(f, 2)), f.grid.Ny, f.grid.Hy),
+                                   interior_indices(instantiate(LZ), instantiate(topology(f, 3)), f.grid.Nz))
+
+bottom_halo(f::AbstractField{LX, LY, LZ}; include_corners=true) where {LX, LY, LZ} =
+    include_corners ? view(f.data, :, :, left_halo_indices(instantiate(LZ), instantiate(topology(f, 3)), f.grid.Nz, f.grid.Hz)) :
+                      view(f.data, interior_indices(instantiate(LX), instantiate(topology(f, 1)), f.grid.Nx),
+                                   interior_indices(instantiate(LY), instantiate(topology(f, 2)), f.grid.Ny),
+                                   left_halo_indices(instantiate(LZ), instantiate(topology(f, 3)), f.grid.Nz, f.grid.Hz))
+
+top_halo(f::AbstractField{LX, LY, LZ}; include_corners=true) where {LX, LY, LZ} =
+    include_corners ? view(f.data, :, :, right_halo_indices(instantiate(LZ), instantiate(topology(f, 3)), f.grid.Nz, f.grid.Hz)) :
+                      view(f.data, interior_indices(instantiate(LX), instantiate(topology(f, 1)), f.grid.Nx),
+                                   interior_indices(instantiate(LY), instantiate(topology(f, 2)), f.grid.Ny),
+                                   right_halo_indices(instantiate(LZ), instantiate(topology(f, 3)), f.grid.Nz, f.grid.Hz))
 
 # Right now just testing with 4 ranks!
 comm = MPI.COMM_WORLD
@@ -305,7 +350,7 @@ end
 
 function test_triply_periodic_bc_injection_with_141_ranks()
     topo = (Periodic, Periodic, Periodic)
-    arch = DistributedArch(ranks=(1, 4, 1))
+    arch = DistributedArch(ranks=(1, 4, 1), topology=topo)
     grid = RectilinearGrid(arch, topology=topo, size=(8, 2, 8), extent=(1, 2, 3))
     model = NonhydrostaticModel(grid=grid)
 
@@ -322,7 +367,7 @@ end
 
 function test_triply_periodic_bc_injection_with_114_ranks()
     topo = (Periodic, Periodic, Periodic)
-    arch = DistributedArch(ranks=(1, 1, 4))
+    arch = DistributedArch(ranks=(1, 1, 4), topology=topo)
     grid = RectilinearGrid(arch, topology=topo, size=(8, 8, 2), extent=(1, 2, 3))
     model = NonhydrostaticModel(grid=grid)
 
@@ -339,7 +384,7 @@ end
 
 function test_triply_periodic_bc_injection_with_221_ranks()
     topo = (Periodic, Periodic, Periodic)
-    arch = DistributedArch(ranks=(2, 2, 1))
+    arch = DistributedArch(ranks=(2, 2, 1), topology=topo)
     grid = RectilinearGrid(arch, topology=topo, size=(4, 4, 8), extent=(1, 2, 3))
     model = NonhydrostaticModel(grid=grid)
 
@@ -360,24 +405,22 @@ end
 
 function test_triply_periodic_halo_communication_with_411_ranks(halo, child_arch)
     topo = (Periodic, Periodic, Periodic)
-    for use_buffers in (true, )
-        arch = DistributedArch(child_arch; ranks=(4, 1, 1), use_buffers, devices = (0, 0, 0, 0))
-        grid = RectilinearGrid(arch, topology=topo, size=(4, 4, 4), extent=(1, 2, 3), halo=halo)
-        model = NonhydrostaticModel(grid=grid)
-
-        for field in merge(fields(model))
-            interior(field) .= arch.local_rank
-            fill_halo_regions!(field)
-
-            @test all(east_halo(field, include_corners=false) .== arch.connectivity.east)
-            @test all(west_halo(field, include_corners=false) .== arch.connectivity.west)
-
-            @test all(interior(field) .== arch.local_rank)
-            @test all(north_halo(field, include_corners=false) .== arch.local_rank)
-            @test all(south_halo(field, include_corners=false) .== arch.local_rank)
-            @test all(top_halo(field, include_corners=false) .== arch.local_rank)
-            @test all(bottom_halo(field, include_corners=false) .== arch.local_rank)
-        end
+    arch = DistributedArch(child_arch; ranks=(4, 1, 1), topology=topo, devices = (0, 0, 0, 0))
+    grid = RectilinearGrid(arch, topology=topo, size=(4, 4, 4), extent=(1, 2, 3), halo=halo)
+    model = NonhydrostaticModel(grid=grid)
+
+    for field in merge(fields(model))
+        interior(field) .= arch.local_rank
+        fill_halo_regions!(field)
+
+        @test all(east_halo(field, include_corners=false) .== arch.connectivity.east)
+        @test all(west_halo(field, include_corners=false) .== arch.connectivity.west)
+
+        @test all(interior(field) .== arch.local_rank)
+        @test all(north_halo(field, include_corners=false) .== arch.local_rank)
+        @test all(south_halo(field, include_corners=false) .== arch.local_rank)
+        @test all(top_halo(field, include_corners=false) .== arch.local_rank)
+        @test all(bottom_halo(field, include_corners=false) .== arch.local_rank)
     end
 
 
@@ -386,48 +429,44 @@ end
 
 function test_triply_periodic_halo_communication_with_141_ranks(halo, child_arch)
     topo  = (Periodic, Periodic, Periodic)
-    for use_buffers in (true, )
-        arch = DistributedArch(child_arch; ranks=(1, 4, 1), use_buffers, devices = (0, 0, 0, 0))
-        grid  = RectilinearGrid(arch, topology=topo, size=(4, 4, 4), extent=(1, 2, 3), halo=halo)
-        model = NonhydrostaticModel(grid=grid)
-
-        for field in merge(fields(model), model.pressures)
-            interior(field) .= arch.local_rank
-            fill_halo_regions!(field)
-
-            @test all(north_halo(field, include_corners=false) .== arch.connectivity.north)
-            @test all(south_halo(field, include_corners=false) .== arch.connectivity.south)
-
-            @test all(interior(field) .== arch.local_rank)
-            @test all(east_halo(field, include_corners=false) .== arch.local_rank)
-            @test all(west_halo(field, include_corners=false) .== arch.local_rank)
-            @test all(top_halo(field, include_corners=false) .== arch.local_rank)
-            @test all(bottom_halo(field, include_corners=false) .== arch.local_rank)
-        end
+    arch = DistributedArch(child_arch; ranks=(1, 4, 1), topology=topo, devices = (0, 0, 0, 0))
+    grid  = RectilinearGrid(arch, topology=topo, size=(4, 4, 4), extent=(1, 2, 3), halo=halo)
+    model = NonhydrostaticModel(grid=grid)
+
+    for field in merge(fields(model), model.pressures)
+        interior(field) .= arch.local_rank
+        fill_halo_regions!(field)
+
+        @test all(north_halo(field, include_corners=false) .== arch.connectivity.north)
+        @test all(south_halo(field, include_corners=false) .== arch.connectivity.south)
+
+        @test all(interior(field) .== arch.local_rank)
+        @test all(east_halo(field, include_corners=false) .== arch.local_rank)
+        @test all(west_halo(field, include_corners=false) .== arch.local_rank)
+        @test all(top_halo(field, include_corners=false) .== arch.local_rank)
+        @test all(bottom_halo(field, include_corners=false) .== arch.local_rank)
     end
     return nothing
 end
 
 function test_triply_periodic_halo_communication_with_114_ranks(halo, child_arch)
     topo = (Periodic, Periodic, Periodic)
-    for use_buffers in (true, )
-        arch = DistributedArch(child_arch; ranks=(1, 1, 4), use_buffers, devices = (0, 0, 0, 0))
-        grid = RectilinearGrid(arch, topology=topo, size=(4, 4, 4), extent=(1, 2, 3), halo=halo)
-        model = NonhydrostaticModel(grid=grid)
-
-        for field in merge(fields(model))
-            interior(field) .= arch.local_rank
-            fill_halo_regions!(field)
-
-            @test all(top_halo(field, include_corners=false) .== arch.connectivity.top)
-            @test all(bottom_halo(field, include_corners=false) .== arch.connectivity.bottom)
-
-            @test all(interior(field) .== arch.local_rank)
-            @test all(east_halo(field, include_corners=false) .== arch.local_rank)
-            @test all(west_halo(field, include_corners=false) .== arch.local_rank)
-            @test all(north_halo(field, include_corners=false) .== arch.local_rank)
-            @test all(south_halo(field, include_corners=false) .== arch.local_rank)
-        end
+    arch = DistributedArch(child_arch; ranks=(1, 1, 4), topology=topo, devices = (0, 0, 0, 0))
+    grid = RectilinearGrid(arch, topology=topo, size=(4, 4, 4), extent=(1, 2, 3), halo=halo)
+    model = NonhydrostaticModel(grid=grid)
+
+    for field in merge(fields(model))
+        interior(field) .= arch.local_rank
+        fill_halo_regions!(field)
+
+        @test all(top_halo(field, include_corners=false) .== arch.connectivity.top)
+        @test all(bottom_halo(field, include_corners=false) .== arch.connectivity.bottom)
+
+        @test all(interior(field) .== arch.local_rank)
+        @test all(east_halo(field, include_corners=false) .== arch.local_rank)
+        @test all(west_halo(field, include_corners=false) .== arch.local_rank)
+        @test all(north_halo(field, include_corners=false) .== arch.local_rank)
+        @test all(south_halo(field, include_corners=false) .== arch.local_rank)
     end
 
     return nothing
@@ -435,24 +474,22 @@ end
 
 function test_triply_periodic_halo_communication_with_221_ranks(halo, child_arch)
     topo = (Periodic, Periodic, Periodic)
-    for use_buffers in (true, )
-        arch = DistributedArch(child_arch; ranks=(2, 2, 1), use_buffers, devices = (0, 0, 0, 0))
-        grid = RectilinearGrid(arch, topology=topo, size=(8, 8, 3), extent=(1, 2, 3), halo=halo)
-        model = NonhydrostaticModel(grid=grid)
-
-        for field in merge(fields(model))
-            interior(field) .= arch.local_rank
-            fill_halo_regions!(field)
-
-            @test all(east_halo(field, include_corners=false) .== arch.connectivity.east)
-            @test all(west_halo(field, include_corners=false) .== arch.connectivity.west)
-            @test all(north_halo(field, include_corners=false) .== arch.connectivity.north)
-            @test all(south_halo(field, include_corners=false) .== arch.connectivity.south)
-
-            @test all(interior(field) .== arch.local_rank)
-            @test all(top_halo(field, include_corners=false) .== arch.local_rank)
-            @test all(bottom_halo(field, include_corners=false) .== arch.local_rank)
-        end
+    arch = DistributedArch(child_arch; ranks=(2, 2, 1), topology=topo, devices = (0, 0, 0, 0))
+    grid = RectilinearGrid(arch, topology=topo, size=(8, 8, 3), extent=(1, 2, 3), halo=halo)
+    model = NonhydrostaticModel(grid=grid)
+
+    for field in merge(fields(model))
+        interior(field) .= arch.local_rank
+        fill_halo_regions!(field)
+
+        @test all(east_halo(field, include_corners=false) .== arch.connectivity.east)
+        @test all(west_halo(field, include_corners=false) .== arch.connectivity.west)
+        @test all(north_halo(field, include_corners=false) .== arch.connectivity.north)
+        @test all(south_halo(field, include_corners=false) .== arch.connectivity.south)
+
+        @test all(interior(field) .== arch.local_rank)
+        @test all(top_halo(field, include_corners=false) .== arch.local_rank)
+        @test all(bottom_halo(field, include_corners=false) .== arch.local_rank)
     end
 
     return nothing
@@ -512,7 +549,7 @@ end
             for ranks in [(1, 4, 1), (2, 2, 1), (4, 1, 1)]
                 @info "Time-stepping a distributed NonhydrostaticModel with ranks $ranks..."
                 topo = (Periodic, Periodic, Periodic)
-                arch = DistributedArch(; ranks)
+                arch = DistributedArch(; ranks, topology=topo)
                 grid = RectilinearGrid(arch, topology=topo, size=(8, 2, 8), extent=(1, 2, 3))
                 model = NonhydrostaticModel(; grid)
 
@@ -531,8 +568,7 @@ end
     @testset "Time stepping ShallowWaterModel" begin
         for child_arch in archs
             topo = (Periodic, Periodic, Flat)
-            use_buffers = true
-            arch = DistributedArch(child_arch; ranks=(1, 4, 1), topology = topo, use_buffers, devices = (0, 0, 0, 0))
+            arch = DistributedArch(child_arch; ranks=(1, 4, 1), topology = topo, devices = (0, 0, 0, 0))
             grid = RectilinearGrid(arch, topology=topo, size=(8, 2), extent=(1, 2), halo=(3, 3))
             model = ShallowWaterModel(; momentum_advection=nothing, mass_advection=nothing, tracer_advection=nothing, grid, gravitational_acceleration=1)
 

From 6681636b6cca23717eb026a7d3dc7a5265d3c6cc Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 11:05:36 +0200
Subject: [PATCH 341/530] probably done

---
 .../mpi_hydrostatic_turbulence.jl                | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl b/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl
index 808d1dba1d..b9c3492f3c 100644
--- a/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl
+++ b/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl
@@ -14,9 +14,9 @@ using Oceananigans.Distributed
 
 ranks = (2, 2, 1)
 topo  = (Periodic, Periodic, Bounded)
-arch  = DistributedArch(CPU(), ranks=ranks, topology=topo, use_buffers=true)
+arch  = DistributedArch(CPU(), ranks=ranks, topology=topo)
 
-grid  = RectilinearGrid(arch, topology=topo, size=(28 ÷ 4, 28, 1), extent=(4π, 4π, 0.5), halo=(3, 3, 3))
+grid  = RectilinearGrid(arch, topology=topo, size=(28 ÷ 2, 28 ÷ 2, 1), extent=(4π, 4π, 0.5), halo=(3, 3, 3))
 
 local_rank = MPI.Comm_rank(MPI.COMM_WORLD)
 
@@ -67,13 +67,13 @@ if rank == 0
     z3 = FieldTimeSeries("mpi_hydrostatic_turbulence_rank2.jld2", "u")
     z4 = FieldTimeSeries("mpi_hydrostatic_turbulence_rank3.jld2", "u")
 
-    ζ1 = @lift(interior(z1[$iter], 1:28, 1:28, 1))
-    ζ2 = @lift(interior(z2[$iter], 1:28, 1:28, 1))
-    ζ3 = @lift(interior(z3[$iter], 1:28, 1:28, 1))
-    ζ4 = @lift(interior(z4[$iter], 1:28, 1:28, 1))
+    ζ1 = @lift(interior(z1[$iter], 1:14, 1:14, 1))
+    ζ2 = @lift(interior(z2[$iter], 1:14, 1:14, 1))
+    ζ3 = @lift(interior(z3[$iter], 1:14, 1:14, 1))
+    ζ4 = @lift(interior(z4[$iter], 1:14, 1:14, 1))
 
-    x1, y1 = z1.grid.xᶠᵃᵃ[1:28], z1.grid.yᵃᶜᵃ[1:28]
-    x2, y2 = z4.grid.xᶠᵃᵃ[1:28], z4.grid.yᵃᶜᵃ[1:28]
+    x1, y1 = z1.grid.xᶠᵃᵃ[1:14], z1.grid.yᵃᶜᵃ[1:14]
+    x2, y2 = z4.grid.xᶠᵃᵃ[1:14], z4.grid.yᵃᶜᵃ[1:14]
 
     fig = Figure()
     ax = Axis(fig[1, 1])

From d1eb3ba12641178e41dda5d299f0216e6248d405 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 11:36:04 +0200
Subject: [PATCH 342/530] same thing for nonhydrostatic model

---
 src/Distributed/interleave_comm_and_comp.jl   |  5 ++
 .../HydrostaticFreeSurfaceModels.jl           |  2 +-
 ...static_free_surface_boundary_tendencies.jl | 58 +++++++++++++++++++
 ...ate_hydrostatic_free_surface_tendencies.jl |  3 -
 ...ate_nonhydrostatic_boundary_tendencies.jl} | 31 ++--------
 .../calculate_nonhydrostatic_tendencies.jl    | 40 ++++++++-----
 .../update_nonhydrostatic_model_state.jl      |  5 +-
 7 files changed, 94 insertions(+), 50 deletions(-)
 create mode 100644 src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_boundary_tendencies.jl
 rename src/Models/{HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl => NonhydrostaticModels/calculate_nonhydrostatic_boundary_tendencies.jl} (74%)

diff --git a/src/Distributed/interleave_comm_and_comp.jl b/src/Distributed/interleave_comm_and_comp.jl
index 1e800ad0b6..18c6eeaab6 100644
--- a/src/Distributed/interleave_comm_and_comp.jl
+++ b/src/Distributed/interleave_comm_and_comp.jl
@@ -15,9 +15,14 @@ function complete_communication_and_compute_boundary!(model, ::DistributedGrid,
     return nothing
 end
 
+# Fallback
 complete_communication_and_compute_boundary!(model, ::DistributedGrid, ::BlockingDistributedArch) = nothing
+complete_communication_and_compute_boundary!(model, grid, arch) = nothing
+
 compute_boundary_tendencies!(model) = nothing
 
+interior_tendency_kernel_parameters(grid) = :xyz
+
 interior_tendency_kernel_parameters(grid::DistributedGrid) = 
             interior_tendency_kernel_parameters(grid, architecture(grid))
 
diff --git a/src/Models/HydrostaticFreeSurfaceModels/HydrostaticFreeSurfaceModels.jl b/src/Models/HydrostaticFreeSurfaceModels/HydrostaticFreeSurfaceModels.jl
index 2959870b29..ce804cc291 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/HydrostaticFreeSurfaceModels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/HydrostaticFreeSurfaceModels.jl
@@ -106,7 +106,7 @@ step_lagrangian_particles!(model::HydrostaticFreeSurfaceModel, Δt) = step_lagra
 include("barotropic_pressure_correction.jl")
 include("hydrostatic_free_surface_tendency_kernel_functions.jl")
 include("calculate_hydrostatic_free_surface_tendencies.jl")
-include("recompute_boundary_tendencies.jl")
+include("calculate_hydrostatic_free_surface_boundary_tendencies.jl")
 include("update_hydrostatic_free_surface_model_state.jl")
 include("hydrostatic_free_surface_ab2_step.jl")
 include("store_hydrostatic_free_surface_tendencies.jl")
diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_boundary_tendencies.jl
new file mode 100644
index 0000000000..e16b8d3a5b
--- /dev/null
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_boundary_tendencies.jl
@@ -0,0 +1,58 @@
+import Oceananigans.Distributed: compute_boundary_tendencies!
+using Oceananigans.Utils: worktuple, offsets
+using Oceananigans.TurbulenceClosures: required_halo_size
+using Oceananigans.Models.NonhydrostaticModels: boundary_tendency_kernel_parameters,
+                                                boundary_p_kernel_parameters,
+                                                boundary_κ_kernel_parameters,
+                                                boundary_parameters
+# Imported so the method for `HydrostaticFreeSurfaceModel` below extends the Distributed fallback
+import Oceananigans.Distributed: compute_boundary_tendencies!
+
+                                
+# We assume here that top/bottom BC are always synched (no partitioning in z)
+function compute_boundary_tendencies!(model::HydrostaticFreeSurfaceModel)
+    grid = model.grid
+    arch = architecture(grid)
+
+    # We need new values for `w`, `p` and `κ`
+    recompute_auxiliaries!(model, grid, arch)
+
+    # parameters for communicating North / South / East / West side
+    kernel_parameters = boundary_tendency_kernel_parameters(grid, arch)
+    calculate_hydrostatic_free_surface_tendency_contributions!(model, kernel_parameters)
+
+    return nothing
+end
+
+function recompute_auxiliaries!(model::HydrostaticFreeSurfaceModel, grid, arch)
+    
+    w_kernel_parameters = boundary_w_kernel_parameters(grid, arch)
+    p_kernel_parameters = boundary_p_kernel_parameters(grid, arch)
+    κ_kernel_parameters = boundary_κ_kernel_parameters(grid, model.closure, arch)
+
+    for (wpar, ppar, κpar) in zip(w_kernel_parameters, p_kernel_parameters, κ_kernel_parameters)
+        compute_w_from_continuity!(model.velocities, arch, grid; parameters = wpar)
+        update_hydrostatic_pressure!(model.pressure.pHY′, arch, grid, model.buoyancy, model.tracers; parameters = ppar)
+        calculate_diffusivities!(model.diffusivity_fields, model.closure, model; parameters = κpar)
+    end
+end
+
+# w needs computing in the range - H + 1 : 0 and N - 1 : N + H - 1
+function boundary_w_kernel_parameters(grid, arch)
+    Nx, Ny, _ = size(grid)
+    Hx, Hy, _ = halo_size(grid)
+
+    Sx  = (Hx, Ny)
+    Sy  = (Nx, Hy)
+             
+    Oxᴸ = (-Hx+1, 0)
+    Oyᴸ = (0, -Hy+1)
+    Oxᴿ = (Nx-1,  0)
+    Oyᴿ = (0,  Ny-1)
+
+    sizes = (Sx,  Sy,  Sx,  Sy)
+    offs  = (Oxᴸ, Oyᴸ, Oxᴿ, Oyᴿ)
+        
+    return boundary_parameters(sizes, offs, grid, arch)
+end
+
diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index fb13740986..45df32d3a3 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -51,9 +51,6 @@ function compute_tendencies!(model::HydrostaticFreeSurfaceModel, callbacks)
     return nothing
 end
 
-complete_communication_and_compute_boundary!(model, grid, arch) = nothing
-interior_tendency_kernel_parameters(grid) = :xyz
-
 using Oceananigans.TurbulenceClosures.CATKEVerticalDiffusivities: FlavorOfCATKE
 using Oceananigans.TurbulenceClosures.MEWSVerticalDiffusivities: MEWS
 
diff --git a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_boundary_tendencies.jl
similarity index 74%
rename from src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
rename to src/Models/NonhydrostaticModels/calculate_nonhydrostatic_boundary_tendencies.jl
index cf3a5960f9..191ab7f2b8 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/recompute_boundary_tendencies.jl
+++ b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_boundary_tendencies.jl
@@ -3,16 +3,16 @@ using Oceananigans.Utils: worktuple, offsets
 using Oceananigans.TurbulenceClosures: required_halo_size
 
 # We assume here that top/bottom BC are always synched (no partitioning in z)
-function compute_boundary_tendencies!(model::HydrostaticFreeSurfaceModel)
+function compute_boundary_tendencies!(model::NonhydrostaticModel)
     grid = model.grid
     arch = architecture(grid)
 
-    # We need new values for `w`, `p` and `κ`
+    # We need new values for `p` and `κ`
     recompute_auxiliaries!(model, grid, arch)
 
     # parameters for communicating North / South / East / West side
     kernel_parameters = boundary_tendency_kernel_parameters(grid, arch)
-    calculate_hydrostatic_free_surface_tendency_contributions!(model, kernel_parameters)
+    calculate_interior_tendency_contributions!(model, kernel_parameters)
 
     return nothing
 end
@@ -35,38 +35,17 @@ function boundary_tendency_kernel_parameters(grid, arch)
     return boundary_parameters(sizes, offs, grid, arch)
 end
 
-function recompute_auxiliaries!(model, grid, arch)
+function recompute_auxiliaries!(model::NonhydrostaticModel, grid, arch)
     
-    w_kernel_parameters = boundary_w_kernel_parameters(grid, arch)
     p_kernel_parameters = boundary_p_kernel_parameters(grid, arch)
     κ_kernel_parameters = boundary_κ_kernel_parameters(grid, model.closure, arch)
 
-    for (wpar, ppar, κpar) in zip(w_kernel_parameters, p_kernel_parameters, κ_kernel_parameters)
-        compute_w_from_continuity!(model.velocities, arch, grid; parameters = wpar)
+    for (ppar, κpar) in zip(p_kernel_parameters, κ_kernel_parameters)
         update_hydrostatic_pressure!(model.pressure.pHY′, arch, grid, model.buoyancy, model.tracers; parameters = ppar)
         calculate_diffusivities!(model.diffusivity_fields, model.closure, model; parameters = κpar)
     end
 end
 
-# w needs computing in the range - H + 1 : 0 and N - 1 : N + H - 1
-function boundary_w_kernel_parameters(grid, arch)
-    Nx, Ny, _ = size(grid)
-    Hx, Hy, _ = halo_size(grid)
-
-    Sx  = (Hx, Ny)
-    Sy  = (Nx, Hy)
-             
-    Oxᴸ = (-Hx+1, 0)
-    Oyᴸ = (0, -Hy+1)
-    Oxᴿ = (Nx-1,  0)
-    Oyᴿ = (0,  Ny-1)
-
-    sizes = (Sx,  Sy,  Sx,  Sy)
-    offs  = (Oxᴸ, Oyᴸ, Oxᴿ, Oyᴿ)
-        
-    return boundary_parameters(sizes, offs, grid, arch)
-end
-
 # p needs computing in the range  0 : 0 and N + 1 : N + 1
 function boundary_p_kernel_parameters(grid, arch)
     Nx, Ny, _ = size(grid)
diff --git a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
index fb14329931..1fc43f7a91 100644
--- a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
+++ b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
@@ -1,6 +1,7 @@
 using Oceananigans.Biogeochemistry: update_tendencies!
 using Oceananigans: fields, TendencyCallsite
 using Oceananigans.Utils: work_layout
+using Oceananigans.Distributed: complete_communication_and_compute_boundary!, interior_tendency_kernel_parameters
 
 using Oceananigans.ImmersedBoundaries: use_only_active_interior_cells, ActiveCellsIBG, active_linear_index_to_interior_tuple
 
@@ -24,8 +25,11 @@ function compute_tendencies!(model::NonhydrostaticModel, callbacks)
 
     # Calculate contributions to momentum and tracer tendencies from fluxes and volume terms in the
     # interior of the domain
-    calculate_interior_tendency_contributions!(model)
-                                               
+    kernel_parameters = tuple(interior_tendency_kernel_parameters(model.grid))
+
+    calculate_interior_tendency_contributions!(model, kernel_parameters)
+    complete_communication_and_compute_boundary!(model, model.grid, model.architecture)
+                      
     # Calculate contributions to momentum and tracer tendencies from user-prescribed fluxes across the
     # boundaries of the domain
     calculate_boundary_tendency_contributions!(model.timestepper.Gⁿ,
@@ -45,7 +49,7 @@ function compute_tendencies!(model::NonhydrostaticModel, callbacks)
 end
 
 """ Store previous value of the source term and calculate current source term. """
-function calculate_interior_tendency_contributions!(model)
+function calculate_interior_tendency_contributions!(model, kernel_parameters)
 
     tendencies           = model.timestepper.Gⁿ
     arch                 = model.architecture
@@ -86,17 +90,19 @@ function calculate_interior_tendency_contributions!(model)
     
     only_active_cells = use_only_active_interior_cells(grid)
 
-    launch!(arch, grid, :xyz, calculate_Gu!, 
-            tendencies.u, grid, u_kernel_args;
-            only_active_cells)
-            
-    launch!(arch, grid, :xyz, calculate_Gv!, 
-            tendencies.v, grid, v_kernel_args;
-            only_active_cells)
+    for parameters in kernel_parameters
+        launch!(arch, grid, parameters, calculate_Gu!, 
+                tendencies.u, grid, u_kernel_args;
+                only_active_cells)
 
-    launch!(arch, grid, :xyz, calculate_Gw!, 
-            tendencies.w, grid, w_kernel_args;
-            only_active_cells)
+        launch!(arch, grid, parameters, calculate_Gv!, 
+                tendencies.v, grid, v_kernel_args;
+                only_active_cells)
+
+        launch!(arch, grid, parameters, calculate_Gw!, 
+                tendencies.w, grid, w_kernel_args;
+                only_active_cells)
+    end
 
     start_tracer_kernel_args = (advection, closure)
     end_tracer_kernel_args   = (buoyancy, biogeochemistry, background_fields, velocities, tracers, auxiliary_fields, diffusivities)
@@ -113,9 +119,11 @@ function calculate_interior_tendency_contributions!(model)
                      end_tracer_kernel_args...,
                      forcing, clock)
 
-        launch!(arch, grid, :xyz, calculate_Gc!, 
-                c_tendency, grid, args;
-                only_active_cells)
+        for parameters in kernel_parameters
+            launch!(arch, grid, parameters, calculate_Gc!, 
+                    c_tendency, grid, args;
+                    only_active_cells)
+        end
     end
 
     return nothing
diff --git a/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl b/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
index e8e7ea844c..b7844c5172 100644
--- a/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
+++ b/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
@@ -21,7 +21,7 @@ function update_state!(model::NonhydrostaticModel, callbacks=[]; compute_tendenc
     foreach(mask_immersed_field!, model.tracers)
 
     # Fill halos for velocities and tracers
-    fill_halo_regions!(merge(model.velocities, model.tracers), model.clock, fields(model))
+    fill_halo_regions!(merge(model.velocities, model.tracers), model.clock, fields(model); async = true)
 
     # Compute auxiliary fields
     for aux_field in model.auxiliary_fields
@@ -30,10 +30,7 @@ function update_state!(model::NonhydrostaticModel, callbacks=[]; compute_tendenc
 
     # Calculate diffusivities
     calculate_diffusivities!(model.diffusivity_fields, model.closure, model)
-    fill_halo_regions!(model.diffusivity_fields, model.clock, fields(model))
-
     update_hydrostatic_pressure!(model)
-    fill_halo_regions!(model.pressures.pHY′, model.clock, fields(model))
 
     for callback in callbacks
         callback.callsite isa UpdateStateCallsite && callback(model)

From f2406fb2262cf291a0f61e4ab2a7c7af11fbdc86 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 11:37:08 +0200
Subject: [PATCH 343/530] include nonhydrostatic boundary tendencies file

---
 src/Models/NonhydrostaticModels/NonhydrostaticModels.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl b/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl
index f8a0a48d7c..4488b2c0f0 100644
--- a/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl
+++ b/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl
@@ -75,5 +75,6 @@ include("update_nonhydrostatic_model_state.jl")
 include("pressure_correction.jl")
 include("nonhydrostatic_tendency_kernel_functions.jl")
 include("calculate_nonhydrostatic_tendencies.jl")
+include("calculate_nonhydrostatic_boundary_tendencies.jl")
 
 end # module

From 03ff7da24bc1c4127df585a974c256209bbe00a0 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 11:44:34 +0200
Subject: [PATCH 344/530] bugfix: import from NonhydrostaticModels module

---
 ...ate_hydrostatic_free_surface_boundary_tendencies.jl | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_boundary_tendencies.jl
index e16b8d3a5b..efb7937223 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_boundary_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_boundary_tendencies.jl
@@ -1,12 +1,12 @@
 import Oceananigans.Distributed: compute_boundary_tendencies!
 using Oceananigans.Utils: worktuple, offsets
 using Oceananigans.TurbulenceClosures: required_halo_size
-using Oceananigans.Models.NonhydrostaticModel: boundary_tendency_kernel_parameters,
-                                               boundary_p_kernel_parameters, 
-                                               boundary_κ_kernel_parameters,
-                                               boundary_parameters
+using Oceananigans.Models.NonhydrostaticModels: boundary_tendency_kernel_parameters,
+                                                boundary_p_kernel_parameters, 
+                                                boundary_κ_kernel_parameters,
+                                                boundary_parameters
 
-import Oceananigans.Models.NonhydrostaticModel: compute_boundary_tendencies!
+import Oceananigans.Models.NonhydrostaticModels: compute_boundary_tendencies!
 
                                 
 # We assume here that top/bottom BC are always synched (no partitioning in z)

From 23a0040c1654c09b6283be17234895932dfaa0c5 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 11:45:51 +0200
Subject: [PATCH 345/530] prepare for nonhydrostatic multiregion

---
 .../update_nonhydrostatic_model_state.jl                  | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl b/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
index b7844c5172..1ee1e518d2 100644
--- a/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
+++ b/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
@@ -28,9 +28,11 @@ function update_state!(model::NonhydrostaticModel, callbacks=[]; compute_tendenc
         compute!(aux_field)
     end
 
-    # Calculate diffusivities
-    calculate_diffusivities!(model.diffusivity_fields, model.closure, model)
-    update_hydrostatic_pressure!(model)
+    # Calculate diffusivities and hydrostatic pressure
+    @apply_regionally begin
+        calculate_diffusivities!(model.diffusivity_fields, model.closure, model)
+        update_hydrostatic_pressure!(model)
+    end
 
     for callback in callbacks
         callback.callsite isa UpdateStateCallsite && callback(model)

From f2f5de3995ae64ab2b2e6b0a652d13c6ef9db775 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 11:46:11 +0200
Subject: [PATCH 346/530] also mask immersed tracers with @apply_regionally

---
 .../NonhydrostaticModels/update_nonhydrostatic_model_state.jl   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl b/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
index 1ee1e518d2..1108b3a914 100644
--- a/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
+++ b/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
@@ -18,7 +18,7 @@ they are called in the end.
 function update_state!(model::NonhydrostaticModel, callbacks=[]; compute_tendencies = true)
     
     # Mask immersed tracers
-    foreach(mask_immersed_field!, model.tracers)
+    @apply_regionally foreach(mask_immersed_field!, model.tracers)
 
     # Fill halos for velocities and tracers
     fill_halo_regions!(merge(model.velocities, model.tracers), model.clock, fields(model); async = true)

From 4e2b04b6e0a3fdf1dd8661fe2e01c76cfbd5ab97 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 12:01:31 +0200
Subject: [PATCH 347/530] bugfix: @apply_regionally per tracer when masking

---
 .../NonhydrostaticModels/update_nonhydrostatic_model_state.jl | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl b/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
index 1108b3a914..21f1e50e9c 100644
--- a/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
+++ b/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
@@ -18,7 +18,9 @@ they are called in the end.
 function update_state!(model::NonhydrostaticModel, callbacks=[]; compute_tendencies = true)
     
     # Mask immersed tracers
-    @apply_regionally foreach(mask_immersed_field!, model.tracers)
+    foreach(model.tracers) do tracer
+        @apply_regionally mask_immersed_field!(tracer)
+    end
 
     # Fill halos for velocities and tracers
     fill_halo_regions!(merge(model.velocities, model.tracers), model.clock, fields(model); async = true)

From b29a79883f1d6d4e1f565065140da8509d3c4b0a Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 12:03:06 +0200
Subject: [PATCH 348/530] other bugfix: use B, not 1, for z kernel size/offset

---
 src/TurbulenceClosures/turbulence_closure_utils.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_utils.jl b/src/TurbulenceClosures/turbulence_closure_utils.jl
index 4d00d5c89a..844da639ee 100644
--- a/src/TurbulenceClosures/turbulence_closure_utils.jl
+++ b/src/TurbulenceClosures/turbulence_closure_utils.jl
@@ -43,7 +43,7 @@ end
 
     Ax = Tx == Flat ? Nx : Nx + 2B 
     Ay = Ty == Flat ? Ny : Ny + 2B 
-    Az = Tz == Flat ? Nz : Nz + 2
+    Az = Tz == Flat ? Nz : Nz + 2B
 
     return (Ax, Ay, Az)
 end
@@ -53,7 +53,7 @@ end
 
     Ax = Tx == Flat ? 0 : - B
     Ay = Ty == Flat ? 0 : - B 
-    Az = Tz == Flat ? 0 : - 1
+    Az = Tz == Flat ? 0 : - B
 
     return (Ax, Ay, Az)
 end

From f0b93e515d6c0dca1351f9591f3d6fe8518cf591 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 12:07:52 +0200
Subject: [PATCH 349/530] fix closures

---
 .../anisotropic_minimum_dissipation.jl               | 12 +++++-------
 .../isopycnal_skew_symmetric_diffusivity.jl          |  4 ++--
 .../leith_enstrophy_diffusivity.jl                   |  4 ++--
 .../smagorinsky_lilly.jl                             |  4 ++--
 src/TurbulenceClosures/turbulence_closure_utils.jl   |  4 ++--
 5 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl b/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl
index 25311c2622..79283d90a3 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl
@@ -185,22 +185,20 @@ end
     return max(zero(FT), κˢᵍˢ)
 end
 
-function calculate_diffusivities!(diffusivity_fields, closure::AnisotropicMinimumDissipation, model)
+function calculate_diffusivities!(diffusivity_fields, closure::AnisotropicMinimumDissipation, model; parameters = KernelParameters(model.grid, closure))
     grid = model.grid
     arch = model.architecture
     velocities = model.velocities
     tracers = model.tracers
     buoyancy = model.buoyancy
 
-    workgroup, worksize = work_layout(grid, :xyz)
-    viscosity_kernel!   = calculate_nonlinear_viscosity!(device(arch), workgroup, worksize)
-    diffusivity_kernel! = calculate_nonlinear_tracer_diffusivity!(device(arch), workgroup, worksize)
-
-    viscosity_kernel!(diffusivity_fields.νₑ, grid, closure, buoyancy, velocities, tracers)
+    launch!(arch, grid, calculate_nonlinear_viscosity!, parameters, 
+            diffusivity_fields.νₑ, grid, closure, buoyancy, velocities, tracers)
 
     for (tracer_index, κₑ) in enumerate(diffusivity_fields.κₑ)
         @inbounds tracer = tracers[tracer_index]
-        diffusivity_kernel!(κₑ, grid, closure, tracer, Val(tracer_index), velocities)
+        launch!(arch, grid, calculate_nonlinear_tracer_diffusivity!, parameters, 
+                κₑ, grid, closure, tracer, Val(tracer_index), velocities)
     end
 
     return nothing
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/isopycnal_skew_symmetric_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/isopycnal_skew_symmetric_diffusivity.jl
index 1ba3350f5b..188bec6c7f 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/isopycnal_skew_symmetric_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/isopycnal_skew_symmetric_diffusivity.jl
@@ -80,14 +80,14 @@ function DiffusivityFields(grid, tracer_names, bcs, closure::FlavorOfISSD{TD}) w
     end
 end
 
-function calculate_diffusivities!(diffusivities, closure::FlavorOfISSD, model)
+function calculate_diffusivities!(diffusivities, closure::FlavorOfISSD, model; parameters = KernelParameters(model.grid, closure)))
 
     arch = model.architecture
     grid = model.grid
     tracers = model.tracers
     buoyancy = model.buoyancy
 
-    launch!(arch, grid, :xyz,
+    launch!(arch, grid, parameters,
             compute_tapered_R₃₃!, diffusivities.ϵ_R₃₃, grid, closure, tracers, buoyancy)
 
     return nothing
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl
index f3ef8e3665..dc1996ad5a 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl
@@ -90,14 +90,14 @@ end
     (closure.C * Δᶠ(i, j, k, grid, closure))^3 * sqrt(   abs²_∇h_ζ(i, j, k, grid, velocities)
                                                       + abs²_∇h_wz(i, j, k, grid, velocities.w))
 
-function calculate_diffusivities!(diffusivity_fields, closure::TwoDimensionalLeith, model)
+function calculate_diffusivities!(diffusivity_fields, closure::TwoDimensionalLeith, model; parameters = KernelParameters(model.grid, closure))
     arch = model.architecture
     grid = model.grid
     velocities = model.velocities
     tracers = model.tracers
     buoyancy = model.buoyancy
 
-    launch!(arch, grid, :xyz,
+    launch!(arch, grid, parameters,
             calculate_nonlinear_viscosity!,
             diffusivity_fields.νₑ, grid, closure, buoyancy, velocities, tracers)
 
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl b/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl
index b68b612230..b8210ed331 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl
@@ -117,14 +117,14 @@ end
     return ς * (C * Δᶠ)^2 * sqrt(2Σ²)
 end
 
-function calculate_diffusivities!(diffusivity_fields, closure::SmagorinskyLilly, model)
+function calculate_diffusivities!(diffusivity_fields, closure::SmagorinskyLilly, model; parameters = KernelParameters(model.grid, closure))
     arch = model.architecture
     grid = model.grid
     buoyancy = model.buoyancy
     velocities = model.velocities
     tracers = model.tracers
 
-    launch!(arch, grid, :xyz,
+    launch!(arch, grid, parameters,
             calculate_nonlinear_viscosity!,
             diffusivity_fields.νₑ, grid, closure, buoyancy, velocities, tracers)
 
diff --git a/src/TurbulenceClosures/turbulence_closure_utils.jl b/src/TurbulenceClosures/turbulence_closure_utils.jl
index 844da639ee..4d00d5c89a 100644
--- a/src/TurbulenceClosures/turbulence_closure_utils.jl
+++ b/src/TurbulenceClosures/turbulence_closure_utils.jl
@@ -43,7 +43,7 @@ end
 
     Ax = Tx == Flat ? Nx : Nx + 2B 
     Ay = Ty == Flat ? Ny : Ny + 2B 
-    Az = Tz == Flat ? Nz : Nz + 2B
+    Az = Tz == Flat ? Nz : Nz + 2
 
     return (Ax, Ay, Az)
 end
@@ -53,7 +53,7 @@ end
 
     Ax = Tx == Flat ? 0 : - B
     Ay = Ty == Flat ? 0 : - B 
-    Az = Tz == Flat ? 0 : - B
+    Az = Tz == Flat ? 0 : - 1
 
     return (Ax, Ay, Az)
 end

From 80f07c7b11fbfe0c04a74164b3688da9ffe205b3 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 12:22:43 +0200
Subject: [PATCH 350/530] bugfix: stray parenthesis, z kernel size uses B

---
 .../isopycnal_skew_symmetric_diffusivity.jl                   | 2 +-
 src/TurbulenceClosures/turbulence_closure_utils.jl            | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/isopycnal_skew_symmetric_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/isopycnal_skew_symmetric_diffusivity.jl
index 188bec6c7f..96bd1cf8cf 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/isopycnal_skew_symmetric_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/isopycnal_skew_symmetric_diffusivity.jl
@@ -80,7 +80,7 @@ function DiffusivityFields(grid, tracer_names, bcs, closure::FlavorOfISSD{TD}) w
     end
 end
 
-function calculate_diffusivities!(diffusivities, closure::FlavorOfISSD, model; parameters = KernelParameters(model.grid, closure)))
+function calculate_diffusivities!(diffusivities, closure::FlavorOfISSD, model; parameters = KernelParameters(model.grid, closure))
 
     arch = model.architecture
     grid = model.grid
diff --git a/src/TurbulenceClosures/turbulence_closure_utils.jl b/src/TurbulenceClosures/turbulence_closure_utils.jl
index 4d00d5c89a..844da639ee 100644
--- a/src/TurbulenceClosures/turbulence_closure_utils.jl
+++ b/src/TurbulenceClosures/turbulence_closure_utils.jl
@@ -43,7 +43,7 @@ end
 
     Ax = Tx == Flat ? Nx : Nx + 2B 
     Ay = Ty == Flat ? Ny : Ny + 2B 
-    Az = Tz == Flat ? Nz : Nz + 2
+    Az = Tz == Flat ? Nz : Nz + 2B
 
     return (Ax, Ay, Az)
 end
@@ -53,7 +53,7 @@ end
 
     Ax = Tx == Flat ? 0 : - B
     Ay = Ty == Flat ? 0 : - B 
-    Az = Tz == Flat ? 0 : - 1
+    Az = Tz == Flat ? 0 : - B
 
     return (Ax, Ay, Az)
 end

From 2f28cb089a06f3f78c11ba783fa36c92f8681eb4 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 12:31:37 +0200
Subject: [PATCH 351/530] simplify

---
 .../shallow_water_diffusion_operators.jl          |  2 --
 .../anisotropic_minimum_dissipation.jl            | 15 +++++++++------
 .../leith_enstrophy_diffusivity.jl                | 14 +++++++++-----
 .../mews_vertical_diffusivity.jl                  |  4 ++--
 .../smagorinsky_lilly.jl                          |  8 +++++---
 .../turbulence_closure_utils.jl                   | 10 ----------
 6 files changed, 25 insertions(+), 28 deletions(-)

diff --git a/src/Models/ShallowWaterModels/shallow_water_diffusion_operators.jl b/src/Models/ShallowWaterModels/shallow_water_diffusion_operators.jl
index 4f17ed01a9..6952a111a8 100644
--- a/src/Models/ShallowWaterModels/shallow_water_diffusion_operators.jl
+++ b/src/Models/ShallowWaterModels/shallow_water_diffusion_operators.jl
@@ -14,10 +14,8 @@ using Oceananigans.TurbulenceClosures:
 import Oceananigans.TurbulenceClosures:
                         DiffusivityFields,
                         calculate_diffusivities!,
-                        calculate_nonlinear_viscosity!,
                         viscosity,
                         with_tracers,
-                        calc_nonlinear_νᶜᶜᶜ,
                         νᶜᶜᶜ
 
 struct ShallowWaterScalarDiffusivity{V, X, N} <: AbstractScalarDiffusivity{ExplicitTimeDiscretization, ThreeDimensionalFormulation, N}
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl b/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl
index 79283d90a3..295337f25d 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl
@@ -143,7 +143,9 @@ end
 @inline Cᴾᵒⁱⁿ(i, j, k, grid, C::AbstractArray) = @inbounds C[i, j, k]
 @inline Cᴾᵒⁱⁿ(i, j, k, grid, C::Function) = C(xnode(i, grid, Center()), ynode(j, grid, Center()), znode(k, grid, Center()))
 
-@inline function calc_nonlinear_νᶜᶜᶜ(i, j, k, grid, closure::AMD, buoyancy, velocities, tracers)
+@kernel function _compute_AMD_viscosity!(νₑ, grid, closure::AMD, buoyancy, velocities, tracers)
+    i, j, k = @index(Global, NTuple)
+    
     FT = eltype(grid)
     ijk = (i, j, k, grid)
     q = norm_tr_∇uᶜᶜᶜ(ijk..., velocities.u, velocities.v, velocities.w)
@@ -162,10 +164,11 @@ end
         νˢᵍˢ = - Cᴾᵒⁱⁿ(i, j, k, grid, closure.Cν) * δ² * (r - Cb_ζ) / q
     end
 
-    return max(zero(FT), νˢᵍˢ)
+    @inbounds νₑ[i, j, k] = max(zero(FT), νˢᵍˢ)
 end
 
-@inline function calc_nonlinear_κᶜᶜᶜ(i, j, k, grid, closure::AMD, tracer, ::Val{tracer_index}, velocities) where {tracer_index}
+@kernel function _compute_AMD_diffusivity!(κₑ, grid, closure::AMD, tracer, ::Val{tracer_index}, velocities) where {tracer_index}
+    i, j, k = @index(Global, NTuple)
 
     FT = eltype(grid)
     ijk = (i, j, k, grid)
@@ -182,7 +185,7 @@ end
         κˢᵍˢ = - Cᴾᵒⁱⁿ(i, j, k, grid, Cκ) * δ² * ϑ / σ
     end
 
-    return max(zero(FT), κˢᵍˢ)
+    @inbounds κₑ[i, j, k] = max(zero(FT), κˢᵍˢ)
 end
 
 function calculate_diffusivities!(diffusivity_fields, closure::AnisotropicMinimumDissipation, model; parameters = KernelParameters(model.grid, closure))
@@ -192,12 +195,12 @@ function calculate_diffusivities!(diffusivity_fields, closure::AnisotropicMinimu
     tracers = model.tracers
     buoyancy = model.buoyancy
 
-    launch!(arch, grid, calculate_nonlinear_viscosity!, parameters, 
+    launch!(arch, grid, parameters, _compute_AMD_viscosity!,
             diffusivity_fields.νₑ, grid, closure, buoyancy, velocities, tracers)
 
     for (tracer_index, κₑ) in enumerate(diffusivity_fields.κₑ)
         @inbounds tracer = tracers[tracer_index]
-        launch!(arch, grid, calculate_nonlinear_tracer_diffusivity!, parameters, 
+        launch!(arch, grid, _compute_AMD_diffusivity!, parameters, 
                 κₑ, grid, closure, tracer, Val(tracer_index), velocities)
     end
 
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl
index dc1996ad5a..d901e6a14a 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl
@@ -86,9 +86,14 @@ const ArrayOrField = Union{AbstractArray, AbstractField}
     return wxz² + wyz²
 end
 
-@inline calc_nonlinear_νᶜᶜᶜ(i, j, k, grid, closure::TwoDimensionalLeith{FT}, buoyancy, velocities, tracers) where FT =
-    (closure.C * Δᶠ(i, j, k, grid, closure))^3 * sqrt(   abs²_∇h_ζ(i, j, k, grid, velocities)
-                                                      + abs²_∇h_wz(i, j, k, grid, velocities.w))
+@kernel function _compute_leith_viscosity!(νₑ, grid, closure::TwoDimensionalLeith{FT}, buoyancy, velocities, tracers) where FT 
+    i, j, k = @index(Global, NTuple)
+
+    prefactor = (closure.C * Δᶠ(i, j, k, grid, closure))^3 
+    dynamic_ν = sqrt(abs²_∇h_ζ(i, j, k, grid, velocities) + abs²_∇h_wz(i, j, k, grid, velocities.w))
+    
+    @inbounds νₑ[i, j, k] = prefactor * dynamic_ν
+end
 
 function calculate_diffusivities!(diffusivity_fields, closure::TwoDimensionalLeith, model; parameters = KernelParameters(model.grid, closure))
     arch = model.architecture
@@ -97,8 +102,7 @@ function calculate_diffusivities!(diffusivity_fields, closure::TwoDimensionalLei
     tracers = model.tracers
     buoyancy = model.buoyancy
 
-    launch!(arch, grid, parameters,
-            calculate_nonlinear_viscosity!,
+    launch!(arch, grid, parameters, _compute_leith_viscosity!,
             diffusivity_fields.νₑ, grid, closure, buoyancy, velocities, tracers)
 
     return nothing
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/mews_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/mews_vertical_diffusivity.jl
index 9f7fa234da..8096cb6f1b 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/mews_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/mews_vertical_diffusivity.jl
@@ -125,7 +125,7 @@ function calculate_diffusivities!(diffusivities, closure::MEWS, model)
     velocities = model.velocities
 
     launch!(arch, grid, :xyz,
-            compute_mews_diffusivities!,
+            _compute_mews_diffusivities!,
             diffusivities,
             grid,
             closure,
@@ -156,7 +156,7 @@ end
     return h
 end
 
-@kernel function compute_mews_diffusivities!(diffusivities, grid, maybe_closure_ensemble,
+@kernel function _compute_mews_diffusivities!(diffusivities, grid, maybe_closure_ensemble,
                                              velocities, tracers, buoyancy, coriolis)
 
     i, j, k, = @index(Global, NTuple)
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl b/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl
index b8210ed331..df0398c29c 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl
@@ -101,7 +101,9 @@ when ``N^2 > 0``, and 1 otherwise.
     return ifelse(Σ²==0, zero(FT), sqrt(ς²))
 end
 
-@inline function calc_nonlinear_νᶜᶜᶜ(i, j, k, grid::AbstractGrid, closure::SmagorinskyLilly, buoyancy, velocities, tracers)
+@kernel function _compute_smagorinsky_viscosity!(νₑ, grid, closure, buoyancy, velocities, tracers)
+    i, j, k = @index(Global, NTuple)
+
     # Strain tensor dot product
     Σ² = ΣᵢⱼΣᵢⱼᶜᶜᶜ(i, j, k, grid, velocities.u, velocities.v, velocities.w)
 
@@ -114,7 +116,7 @@ end
     Δᶠ = cbrt(Δ³)
     C = closure.C # free parameter
 
-    return ς * (C * Δᶠ)^2 * sqrt(2Σ²)
+    @inbounds νₑ[i, j, k] = ς * (C * Δᶠ)^2 * sqrt(2Σ²)
 end
 
 function calculate_diffusivities!(diffusivity_fields, closure::SmagorinskyLilly, model; parameters = KernelParameters(model.grid, closure))
@@ -125,7 +127,7 @@ function calculate_diffusivities!(diffusivity_fields, closure::SmagorinskyLilly,
     tracers = model.tracers
 
     launch!(arch, grid, parameters,
-            calculate_nonlinear_viscosity!,
+            _compute_smagorinsky_viscosity!,
             diffusivity_fields.νₑ, grid, closure, buoyancy, velocities, tracers)
 
     return nothing
diff --git a/src/TurbulenceClosures/turbulence_closure_utils.jl b/src/TurbulenceClosures/turbulence_closure_utils.jl
index 844da639ee..5709b8e123 100644
--- a/src/TurbulenceClosures/turbulence_closure_utils.jl
+++ b/src/TurbulenceClosures/turbulence_closure_utils.jl
@@ -26,16 +26,6 @@ function convert_diffusivity(FT, κ::NamedTuple; discrete_form=false, loc=(nothi
     return NamedTuple{κ_names}(Tuple(convert_diffusivity(FT, κi; discrete_form, loc, parameters) for κi in κ))
 end
 
-@kernel function calculate_nonlinear_viscosity!(νₑ, grid, closure, buoyancy, velocities, tracers) 
-    i, j, k = @index(Global, NTuple)
-    @inbounds νₑ[i, j, k] = calc_nonlinear_νᶜᶜᶜ(i, j, k, grid, closure, buoyancy, velocities, tracers)
-end
-
-@kernel function calculate_nonlinear_tracer_diffusivity!(κₑ, grid, closure, tracer, tracer_index, U)
-    i, j, k = @index(Global, NTuple)
-    @inbounds κₑ[i, j, k] = calc_nonlinear_κᶜᶜᶜ(i, j, k, grid, closure, tracer, tracer_index, U)
-end
-
 # extend κ kernel to compute also the boundaries
 @inline function κ_kernel_size(grid, ::AbstractTurbulenceClosure{TD, B}) where{TD, B}
     Nx, Ny, Nz = size(grid)

From 4c8136bc5155a3b95962b2ec2545bdeda887e231 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 12:34:21 +0200
Subject: [PATCH 352/530] 2D leith requires 2 halos!

---
 .../leith_enstrophy_diffusivity.jl                              | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl
index d901e6a14a..ac9354cabe 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl
@@ -4,7 +4,7 @@ using Oceananigans.Fields: AbstractField
 ##### The turbulence closure proposed by Leith
 #####
 
-struct TwoDimensionalLeith{FT, CR, GM, M} <: AbstractScalarDiffusivity{ExplicitTimeDiscretization, ThreeDimensionalFormulation, 3}
+struct TwoDimensionalLeith{FT, CR, GM, M} <: AbstractScalarDiffusivity{ExplicitTimeDiscretization, ThreeDimensionalFormulation, 2}
                   C :: FT
              C_Redi :: CR
                C_GM :: GM

From b222f57c42a6ae098dd0054f57bfed0f214c7417 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 12:49:34 +0200
Subject: [PATCH 353/530] AMD and Smag require 1 halo!

---
 .../anisotropic_minimum_dissipation.jl                          | 2 +-
 .../turbulence_closure_implementations/smagorinsky_lilly.jl     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl b/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl
index 295337f25d..928d987842 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl
@@ -7,7 +7,7 @@ Parameters for the "anisotropic minimum dissipation" turbulence closure for larg
 proposed originally by [Rozema15](@cite) and [Abkar16](@cite), then modified by [Verstappen18](@cite),
 and finally described and validated for by [Vreugdenhil18](@cite).
 """
-struct AnisotropicMinimumDissipation{TD, PK, PN, PB} <: AbstractScalarDiffusivity{TD, ThreeDimensionalFormulation, 2}
+struct AnisotropicMinimumDissipation{TD, PK, PN, PB} <: AbstractScalarDiffusivity{TD, ThreeDimensionalFormulation, 1}
     Cν :: PN
     Cκ :: PK
     Cb :: PB
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl b/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl
index df0398c29c..0ad1554b7c 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl
@@ -3,7 +3,7 @@
 ##### We also call this 'Constant Smagorinsky'.
 #####
 
-struct SmagorinskyLilly{TD, FT, P} <: AbstractScalarDiffusivity{TD, ThreeDimensionalFormulation, 2}
+struct SmagorinskyLilly{TD, FT, P} <: AbstractScalarDiffusivity{TD, ThreeDimensionalFormulation, 1}
      C :: FT
     Cb :: FT
     Pr :: P

From 752e6f0a8528ae665bcf5ad60b6645f30d40a150 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 12:50:25 +0200
Subject: [PATCH 354/530] wrong order

---
 .../anisotropic_minimum_dissipation.jl                         | 2 +-
 .../turbulence_closure_implementations/smagorinsky_lilly.jl    | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl b/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl
index 928d987842..183b54b0c9 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl
@@ -200,7 +200,7 @@ function calculate_diffusivities!(diffusivity_fields, closure::AnisotropicMinimu
 
     for (tracer_index, κₑ) in enumerate(diffusivity_fields.κₑ)
         @inbounds tracer = tracers[tracer_index]
-        launch!(arch, grid, _compute_AMD_diffusivity!, parameters, 
+        launch!(arch, grid, parameters, _compute_AMD_diffusivity!, 
                 κₑ, grid, closure, tracer, Val(tracer_index), velocities)
     end
 
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl b/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl
index 0ad1554b7c..d42fd071f4 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl
@@ -126,8 +126,7 @@ function calculate_diffusivities!(diffusivity_fields, closure::SmagorinskyLilly,
     velocities = model.velocities
     tracers = model.tracers
 
-    launch!(arch, grid, parameters,
-            _compute_smagorinsky_viscosity!,
+    launch!(arch, grid, parameters, _compute_smagorinsky_viscosity!,
             diffusivity_fields.νₑ, grid, closure, buoyancy, velocities, tracers)
 
     return nothing

From e36931a095f8ea139cf79503d9a16d12174ff10a Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 13:57:02 +0200
Subject: [PATCH 355/530] correct halo handling for diffusivities

---
 ...late_nonhydrostatic_boundary_tendencies.jl |  4 ++--
 src/TurbulenceClosures/TurbulenceClosures.jl  |  9 +++++++--
 .../anisotropic_minimum_dissipation.jl        |  2 +-
 .../leith_enstrophy_diffusivity.jl            |  2 +-
 .../smagorinsky_lilly.jl                      |  2 +-
 .../turbulence_closure_utils.jl               | 20 ++++++++++---------
 src/Utils/multi_region_transformation.jl      |  1 +
 ...n_large_eddy_simulation_regression_test.jl |  4 ++--
 8 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_boundary_tendencies.jl b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_boundary_tendencies.jl
index 191ab7f2b8..29b78e0186 100644
--- a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_boundary_tendencies.jl
+++ b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_boundary_tendencies.jl
@@ -41,7 +41,7 @@ function recompute_auxiliaries!(model::NonhydrostaticModel, grid, arch)
     κ_kernel_parameters = boundary_κ_kernel_parameters(grid, model.closure, arch)
 
     for (ppar, κpar) in zip(p_kernel_parameters, κ_kernel_parameters)
-        update_hydrostatic_pressure!(model.pressure.pHY′, arch, grid, model.buoyancy, model.tracers; parameters = ppar)
+        update_hydrostatic_pressure!(model.pressures.pHY′, arch, grid, model.buoyancy, model.tracers; parameters = ppar)
         calculate_diffusivities!(model.diffusivity_fields, model.closure, model; parameters = κpar)
     end
 end
@@ -64,7 +64,7 @@ function boundary_p_kernel_parameters(grid, arch)
     return boundary_parameters(sizes, offs, grid, arch)
 end
 
-# diffusivities need computing in the range 0 : B and N - B + 1 : N + 1
+# diffusivities need recomputing in the range 0 : B and N - B + 1 : N + 1
 function boundary_κ_kernel_parameters(grid, closure, arch)
     Nx, Ny, Nz = size(grid)
 
diff --git a/src/TurbulenceClosures/TurbulenceClosures.jl b/src/TurbulenceClosures/TurbulenceClosures.jl
index 5b60ec7bcf..2d029d95d8 100644
--- a/src/TurbulenceClosures/TurbulenceClosures.jl
+++ b/src/TurbulenceClosures/TurbulenceClosures.jl
@@ -69,9 +69,14 @@ validate_closure(closure) = closure
 closure_summary(closure) = summary(closure)
 with_tracers(tracers, closure::AbstractTurbulenceClosure) = closure
 calculate_diffusivities!(K, closure::AbstractTurbulenceClosure, args...; kwargs...) = nothing
-
+ 
+# The required halo size to calculate diffusivities. Take care that if the diffusivity can
+# be calculated from local information, still `B = 1`, because we need at least one additional
+# point at each side to calculate viscous fluxes at the edge of the domain. 
+# If diffusivity itself requires one halo to be computed (e.g. κ = ℑxᶠᵃᵃ(i, j, k, grid, ℑxᶜᵃᵃ, T),
+# or `AnisotropicMinimumDissipation` and `SmagorinskyLilly`) then B = 2
 @inline boundary_buffer(::AbstractTurbulenceClosure{TD, B}) where {TD, B} = B
-@inline required_halo_size(::AbstractTurbulenceClosure{TD, B}) where {TD, B} = B
+@inline required_halo_size(::AbstractTurbulenceClosure{TD, B}) where {TD, B} = B 
 
 const ClosureKinda = Union{Nothing, AbstractTurbulenceClosure, AbstractArray{<:AbstractTurbulenceClosure}}
 add_closure_specific_boundary_conditions(closure::ClosureKinda, bcs, args...) = bcs
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl b/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl
index 183b54b0c9..0f4ddaeb32 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl
@@ -7,7 +7,7 @@ Parameters for the "anisotropic minimum dissipation" turbulence closure for larg
 proposed originally by [Rozema15](@cite) and [Abkar16](@cite), then modified by [Verstappen18](@cite),
 and finally described and validated for by [Vreugdenhil18](@cite).
 """
-struct AnisotropicMinimumDissipation{TD, PK, PN, PB} <: AbstractScalarDiffusivity{TD, ThreeDimensionalFormulation, 1}
+struct AnisotropicMinimumDissipation{TD, PK, PN, PB} <: AbstractScalarDiffusivity{TD, ThreeDimensionalFormulation, 2}
     Cν :: PN
     Cκ :: PK
     Cb :: PB
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl
index ac9354cabe..d901e6a14a 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl
@@ -4,7 +4,7 @@ using Oceananigans.Fields: AbstractField
 ##### The turbulence closure proposed by Leith
 #####
 
-struct TwoDimensionalLeith{FT, CR, GM, M} <: AbstractScalarDiffusivity{ExplicitTimeDiscretization, ThreeDimensionalFormulation, 2}
+struct TwoDimensionalLeith{FT, CR, GM, M} <: AbstractScalarDiffusivity{ExplicitTimeDiscretization, ThreeDimensionalFormulation, 3}
                   C :: FT
              C_Redi :: CR
                C_GM :: GM
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl b/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl
index d42fd071f4..94ed406e56 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl
@@ -3,7 +3,7 @@
 ##### We also call this 'Constant Smagorinsky'.
 #####
 
-struct SmagorinskyLilly{TD, FT, P} <: AbstractScalarDiffusivity{TD, ThreeDimensionalFormulation, 1}
+struct SmagorinskyLilly{TD, FT, P} <: AbstractScalarDiffusivity{TD, ThreeDimensionalFormulation, 2}
      C :: FT
     Cb :: FT
     Pr :: P
diff --git a/src/TurbulenceClosures/turbulence_closure_utils.jl b/src/TurbulenceClosures/turbulence_closure_utils.jl
index 5709b8e123..2eb414ccc2 100644
--- a/src/TurbulenceClosures/turbulence_closure_utils.jl
+++ b/src/TurbulenceClosures/turbulence_closure_utils.jl
@@ -26,24 +26,26 @@ function convert_diffusivity(FT, κ::NamedTuple; discrete_form=false, loc=(nothi
     return NamedTuple{κ_names}(Tuple(convert_diffusivity(FT, κi; discrete_form, loc, parameters) for κi in κ))
 end
 
-# extend κ kernel to compute also the boundaries
-@inline function κ_kernel_size(grid, ::AbstractTurbulenceClosure{TD, B}) where{TD, B}
+# extend κ kernel to compute also the boundaries 
+# Since the viscous calculation is _always_ second order 
+# we need just +1 in each direction
+@inline function κ_kernel_size(grid, ::AbstractTurbulenceClosure)
     Nx, Ny, Nz = size(grid)
     Tx, Ty, Tz = topology(grid)
 
-    Ax = Tx == Flat ? Nx : Nx + 2B 
-    Ay = Ty == Flat ? Ny : Ny + 2B 
-    Az = Tz == Flat ? Nz : Nz + 2B
+    Ax = Tx == Flat ? Nx : Nx + 2 
+    Ay = Ty == Flat ? Ny : Ny + 2 
+    Az = Tz == Flat ? Nz : Nz + 2
 
     return (Ax, Ay, Az)
 end
 
-@inline function κ_kernel_offsets(grid, ::AbstractTurbulenceClosure{TD, B}) where{TD, B}
+@inline function κ_kernel_offsets(grid, ::AbstractTurbulenceClosure)  
     Tx, Ty, Tz = topology(grid)
 
-    Ax = Tx == Flat ? 0 : - B
-    Ay = Ty == Flat ? 0 : - B 
-    Az = Tz == Flat ? 0 : - B
+    Ax = Tx == Flat ? 0 : - 1 
+    Ay = Ty == Flat ? 0 : - 1  
+    Az = Tz == Flat ? 0 : - 1 
 
     return (Ax, Ay, Az)
 end
diff --git a/src/Utils/multi_region_transformation.jl b/src/Utils/multi_region_transformation.jl
index fcfab540c4..ec5393cc9b 100644
--- a/src/Utils/multi_region_transformation.jl
+++ b/src/Utils/multi_region_transformation.jl
@@ -175,6 +175,7 @@ end
     end 
 end
 
+@inline sync_device!(::Nothing)  = nothing
 @inline sync_device!(::CPU)      = nothing
 @inline sync_device!(::GPU)      = CUDA.synchronize()
 @inline sync_device!(::CuDevice) = CUDA.synchronize()
diff --git a/test/regression_tests/ocean_large_eddy_simulation_regression_test.jl b/test/regression_tests/ocean_large_eddy_simulation_regression_test.jl
index f562263bc2..6f7efae4e1 100644
--- a/test/regression_tests/ocean_large_eddy_simulation_regression_test.jl
+++ b/test/regression_tests/ocean_large_eddy_simulation_regression_test.jl
@@ -16,10 +16,10 @@ function run_ocean_large_eddy_simulation_regression_test(arch, grid_type, closur
     # Grid
     N = L = 16
     if grid_type == :regular
-        grid = RectilinearGrid(arch, size=(N, N, N), extent=(L, L, L), halo=(1, 1, 1))
+        grid = RectilinearGrid(arch, size=(N, N, N), extent=(L, L, L), halo=(2, 2, 2))
     elseif grid_type == :vertically_unstretched
         zF = range(-L, 0, length=N+1)
-        grid = RectilinearGrid(arch, size=(N, N, N), x=(0, L), y=(0, L), z=zF, halo=(1, 1, 1))
+        grid = RectilinearGrid(arch, size=(N, N, N), x=(0, L), y=(0, L), z=zF, halo=(2, 2, 2))
     end
 
     # Boundary conditions

From 527b240ef02e6ce3d8661ce402cd3d3bbe4d5ea4 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 14:15:10 +0200
Subject: [PATCH 356/530] correct Leith formulation + fixes

---
 .../leith_enstrophy_diffusivity.jl            | 17 ++++------
 ...n_large_eddy_simulation_regression_test.jl | 34 +++++++++----------
 test/test_distributed_models.jl               |  3 ++
 3 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl
index d901e6a14a..3bbbba7aee 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl
@@ -4,7 +4,7 @@ using Oceananigans.Fields: AbstractField
 ##### The turbulence closure proposed by Leith
 #####
 
-struct TwoDimensionalLeith{FT, CR, GM, M} <: AbstractScalarDiffusivity{ExplicitTimeDiscretization, ThreeDimensionalFormulation, 3}
+struct TwoDimensionalLeith{FT, CR, GM, M} <: AbstractScalarDiffusivity{ExplicitTimeDiscretization, ThreeDimensionalFormulation, 2}
                   C :: FT
              C_Redi :: CR
                C_GM :: GM
@@ -67,12 +67,9 @@ function with_tracers(tracers, closure::TwoDimensionalLeith{FT}) where FT
 end
 
 @inline function abs²_∇h_ζ(i, j, k, grid, velocities)
-    vxx = ℑyᵃᶜᵃ(i, j, k, grid, ∂²xᶜᶠᶜ, velocities.v)
-    uyy = ℑxᶜᵃᵃ(i, j, k, grid, ∂²yᶠᶜᶜ, velocities.u)
-    uxy = ℑyᵃᶜᵃ(i, j, k, grid, ∂xᶜᶠᶜ, ∂yᶠᶠᶜ, velocities.u)
-    vxy = ℑxᶜᵃᵃ(i, j, k, grid, ∂xᶠᶜᶜ, ∂yᶜᶜᶜ, velocities.v)
-
-    return (vxx - uxy)^2 + (vxy - uyy)^2
+    ζx = ℑyᵃᶜᵃ(i, j, k, grid, ∂xᶜᶠᶜ, ζ₃ᶠᶠᶜ, u, v)
+    ζy = ℑxᶜᵃᵃ(i, j, k, grid, ∂yᶠᶜᶜ, ζ₃ᶠᶠᶜ, u, v)
+    return ζx^2 + ζy^2
 end
 
 const ArrayOrField = Union{AbstractArray, AbstractField}
@@ -81,9 +78,9 @@ const ArrayOrField = Union{AbstractArray, AbstractField}
 @inline ψ²(i, j, k, grid, ψ::ArrayOrField, args...) = @inbounds ψ[i, j, k]^2
 
 @inline function abs²_∇h_wz(i, j, k, grid, w)
-    wxz² = ℑxᶜᵃᵃ(i, j, k, grid, ψ², ∂xᶠᶜᶜ, ∂zᶜᶜᶜ, w)
-    wyz² = ℑyᵃᶜᵃ(i, j, k, grid, ψ², ∂yᶜᶠᶜ, ∂zᶜᶜᶜ, w)
-    return wxz² + wyz²
+    wxz = ℑxᶜᵃᵃ(i, j, k, grid, ∂xᶠᶜᶜ, ∂zᶜᶜᶜ, w)
+    wyz = ℑyᵃᶜᵃ(i, j, k, grid, ∂yᶜᶠᶜ, ∂zᶜᶜᶜ, w)
+    return wxz^2 + wyz^2
 end
 
 @kernel function _compute_leith_viscosity!(νₑ, grid, closure::TwoDimensionalLeith{FT}, buoyancy, velocities, tracers) where FT 
diff --git a/test/regression_tests/ocean_large_eddy_simulation_regression_test.jl b/test/regression_tests/ocean_large_eddy_simulation_regression_test.jl
index 6f7efae4e1..dd667a9987 100644
--- a/test/regression_tests/ocean_large_eddy_simulation_regression_test.jl
+++ b/test/regression_tests/ocean_large_eddy_simulation_regression_test.jl
@@ -80,23 +80,23 @@ function run_ocean_large_eddy_simulation_regression_test(arch, grid_type, closur
 
     Nz = grid.Nz
 
-    model.velocities.u.data.parent .= ArrayType(solution₀.u)
-    model.velocities.v.data.parent .= ArrayType(solution₀.v)
-    model.velocities.w.data.parent .= ArrayType(solution₀.w)
-    model.tracers.T.data.parent    .= ArrayType(solution₀.T)
-    model.tracers.S.data.parent    .= ArrayType(solution₀.S)
-
-    model.timestepper.Gⁿ.u.data.parent .= ArrayType(Gⁿ₀.u)
-    model.timestepper.Gⁿ.v.data.parent .= ArrayType(Gⁿ₀.v)
-    model.timestepper.Gⁿ.w.data.parent .= ArrayType(Gⁿ₀.w)
-    model.timestepper.Gⁿ.T.data.parent .= ArrayType(Gⁿ₀.T)
-    model.timestepper.Gⁿ.S.data.parent .= ArrayType(Gⁿ₀.S)
-
-    model.timestepper.G⁻.u.data.parent .= ArrayType(G⁻₀.u)
-    model.timestepper.G⁻.v.data.parent .= ArrayType(G⁻₀.v)
-    model.timestepper.G⁻.w.data.parent .= ArrayType(G⁻₀.w)
-    model.timestepper.G⁻.T.data.parent .= ArrayType(G⁻₀.T)
-    model.timestepper.G⁻.S.data.parent .= ArrayType(G⁻₀.S)
+    interior(model.velocities.u) .= ArrayType(solution₀.u[1:Nx, 1:Ny])
+    interior(model.velocities.v) .= ArrayType(solution₀.v[1:Nx, 1:Ny])
+    interior(model.velocities.w) .= ArrayType(solution₀.w[1:Nx, 1:Ny])
+    interior(model.tracers.T)    .= ArrayType(solution₀.T[1:Nx, 1:Ny])
+    interior(model.tracers.S)    .= ArrayType(solution₀.S[1:Nx, 1:Ny])
+
+    interior(model.timestepper.Gⁿ.u) .= ArrayType(Gⁿ₀.u[1:Nx, 1:Ny])
+    interior(model.timestepper.Gⁿ.v) .= ArrayType(Gⁿ₀.v[1:Nx, 1:Ny])
+    interior(model.timestepper.Gⁿ.w) .= ArrayType(Gⁿ₀.w[1:Nx, 1:Ny])
+    interior(model.timestepper.Gⁿ.T) .= ArrayType(Gⁿ₀.T[1:Nx, 1:Ny])
+    interior(model.timestepper.Gⁿ.S) .= ArrayType(Gⁿ₀.S[1:Nx, 1:Ny])
+
+    interior(model.timestepper.G⁻.u) .= ArrayType(G⁻₀.u[1:Nx, 1:Ny])
+    interior(model.timestepper.G⁻.v) .= ArrayType(G⁻₀.v[1:Nx, 1:Ny])
+    interior(model.timestepper.G⁻.w) .= ArrayType(G⁻₀.w[1:Nx, 1:Ny])
+    interior(model.timestepper.G⁻.T) .= ArrayType(G⁻₀.T[1:Nx, 1:Ny])
+    interior(model.timestepper.G⁻.S) .= ArrayType(G⁻₀.S[1:Nx, 1:Ny])
 
     model.clock.time = spinup_steps * Δt
     model.clock.iteration = spinup_steps
diff --git a/test/test_distributed_models.jl b/test/test_distributed_models.jl
index 8b88a9940a..a039afefcb 100644
--- a/test/test_distributed_models.jl
+++ b/test/test_distributed_models.jl
@@ -37,6 +37,9 @@ using Oceananigans.Grids:
 ##### Viewing halos
 #####
 
+instantiate(T::Type) = T()
+instantiate(t) = t
+
 west_halo(f::AbstractField{LX, LY, LZ}; include_corners=true) where {LX, LY, LZ} =
     include_corners ? view(f.data, left_halo_indices(instantiate(LX), instantiate(topology(f, 1)), f.grid.Nx, f.grid.Hx), :, :) :
                       view(f.data, left_halo_indices(instantiate(LX), instantiate(topology(f, 1)), f.grid.Nx, f.grid.Hx),

From 0f3a06a742d9b87e4bd3a94fad3f2fb9c62dea12 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 14:59:07 +0200
Subject: [PATCH 357/530] `only_local_halos` kwarg in `fill_halo_regions!`

---
 src/Distributed/halo_communication.jl         | 21 +++++++++------
 ...te_hydrostatic_free_surface_model_state.jl |  1 +
 .../update_nonhydrostatic_model_state.jl      |  2 ++
 src/TurbulenceClosures/TurbulenceClosures.jl  |  3 +--
 src/TurbulenceClosures/closure_tuples.jl      | 21 ---------------
 .../CATKEVerticalDiffusivities.jl             |  6 ++---
 .../anisotropic_minimum_dissipation.jl        |  2 +-
 ...vective_adjustment_vertical_diffusivity.jl |  2 +-
 .../isopycnal_skew_symmetric_diffusivity.jl   |  2 +-
 .../leith_enstrophy_diffusivity.jl            |  8 +++---
 .../ri_based_vertical_diffusivity.jl          |  2 +-
 .../smagorinsky_lilly.jl                      |  2 +-
 .../turbulence_closure_utils.jl               | 26 +------------------
 13 files changed, 29 insertions(+), 69 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 1c0d8ef6f8..a9484366c9 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -235,8 +235,10 @@ for (side, opposite_side, dir) in zip([:west, :south], [:east, :north], [1, 2])
 
     @eval begin
         function $fill_both_halo!(c, bc_side::DCBCT, bc_opposite_side::DCBCT, size, offset, loc, arch::DistributedArch, 
-                                  grid::DistributedGrid, buffers, args...; kwargs...)
+                                  grid::DistributedGrid, buffers, args...; only_local_halos = false, kwargs...)
 
+            only_local_halos && return nothing
+                        
             @assert bc_side.condition.from == bc_opposite_side.condition.from  # Extra protection in case of bugs
             local_rank = bc_side.condition.from
 
@@ -250,30 +252,33 @@ for (side, opposite_side, dir) in zip([:west, :south], [:east, :north], [1, 2])
         end
 
         function $fill_both_halo!(c, bc_side::DCBCT, bc_opposite_side, size, offset, loc, arch::DistributedArch, 
-                                  grid::DistributedGrid, buffers, args...; kwargs...)
+                                  grid::DistributedGrid, buffers, args...; only_local_halos = false, kwargs...)
 
+            $fill_opposite_side_halo!(c, bc_opposite_side, size, offset, loc, arch, grid, buffers, args...; kwargs...)
+
+            only_local_halos && return nothing
+            
             child_arch = child_architecture(arch)
             local_rank = bc_side.condition.from
 
             recv_req = $recv_and_fill_side_halo!(c, grid, arch, loc[$dir], loc, local_rank, bc_side.condition.to, buffers)
-
-            $fill_opposite_side_halo!(c, bc_opposite_side, size, offset, loc, arch, grid, buffers, args...; kwargs...)
-
             send_req = $send_side_halo(c, grid, arch, loc[$dir], loc, local_rank, bc_side.condition.to, buffers)
             
             return [send_req, recv_req]
         end
 
         function $fill_both_halo!(c, bc_side, bc_opposite_side::DCBCT, size, offset, loc, arch::DistributedArch, 
-                                  grid::DistributedGrid, buffers, args...; kwargs...)
+                                  grid::DistributedGrid, buffers, args...; only_local_halos = false, kwargs...)
+
+            $fill_side_halo!(c, bc_side, size, offset, loc, arch, grid, buffers, args...; kwargs...)
+
+            only_local_halos && return nothing
 
             child_arch = child_architecture(arch)
             local_rank = bc_opposite_side.condition.from
 
             recv_req = $recv_and_fill_opposite_side_halo!(c, grid, arch, loc[$dir], loc, local_rank, bc_opposite_side.condition.to, buffers)
 
-            $fill_side_halo!(c, bc_side, size, offset, loc, arch, grid, buffers, args...; kwargs...)
-
             send_req = $send_opposite_side_halo(c, grid, arch, loc[$dir], loc, local_rank, bc_opposite_side.condition.to, buffers)
 
             return [send_req, recv_req]
diff --git a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
index 6e595655e3..1b8297e89d 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
@@ -31,6 +31,7 @@ function update_state!(model::HydrostaticFreeSurfaceModel, grid, callbacks; comp
     fill_halo_regions!(prognostic_fields(model), model.clock, fields(model); async = true)
 
     @apply_regionally compute_w_diffusivities_pressure!(model)
+    fill_halo_regions!(model.diffusivity_fields; only_local_halos = true)
 
     [callback(model) for callback in callbacks if isa(callback.callsite, UpdateStateCallsite)]
     
diff --git a/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl b/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
index 21f1e50e9c..186c54f552 100644
--- a/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
+++ b/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
@@ -36,6 +36,8 @@ function update_state!(model::NonhydrostaticModel, callbacks=[]; compute_tendenc
         update_hydrostatic_pressure!(model)
     end
 
+    fill_halo_regions!(model.diffusivity_fields; only_local_halos = true)
+    
     for callback in callbacks
         callback.callsite isa UpdateStateCallsite && callback(model)
     end
diff --git a/src/TurbulenceClosures/TurbulenceClosures.jl b/src/TurbulenceClosures/TurbulenceClosures.jl
index 2d029d95d8..4663499090 100644
--- a/src/TurbulenceClosures/TurbulenceClosures.jl
+++ b/src/TurbulenceClosures/TurbulenceClosures.jl
@@ -83,8 +83,7 @@ add_closure_specific_boundary_conditions(closure::ClosureKinda, bcs, args...) =
 
 import Oceananigans.Utils: KernelParameters
 
-KernelParameters(grid::AbstractGrid, closure) =
-        KernelParameters(κ_kernel_size(grid, closure), κ_kernel_offsets(grid, closure))
+KernelParameters(grid::AbstractGrid, closure) = KernelParameters(size(grid), (0, 0, 0))
 
 # Interface for KE-based closures
 function shear_production end
diff --git a/src/TurbulenceClosures/closure_tuples.jl b/src/TurbulenceClosures/closure_tuples.jl
index 917a03fc3a..fb4b64e604 100644
--- a/src/TurbulenceClosures/closure_tuples.jl
+++ b/src/TurbulenceClosures/closure_tuples.jl
@@ -86,27 +86,6 @@ function add_closure_specific_boundary_conditions(closure_tuple::Tuple, bcs, arg
     return bcs
 end
 
-@inline κ_kernel_size(grid, closure::AbstractArray) = κ_kernel_size(grid, closure[1])
-@inline κ_kernel_offsets(grid, closure::AbstractArray) = κ_kernel_offsets(grid, closure[1])
-
-@inline function κ_kernel_size(grid, closure_tuple::Tuple)
-    kernel_size = (0, 0, 0)
-    for closure in closure_tuple
-        kernel_size = map(max, kernel_size, κ_kernel_size(grid, closure))
-    end
-
-    return kernel_size
-end
-
-@inline function κ_kernel_offsets(grid, closure_tuple::Tuple)
-    kernel_offsets = (0, 0, 0)
-    for closure in closure_tuple
-        kernel_offsets = map(min, kernel_offsets, κ_kernel_offsets(grid, closure))
-    end
-
-    return kernel_offsets
-end
-
 #####
 ##### Compiler-inferrable time_discretization for tuples
 #####
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
index 6116a5c3f1..f92a65fe69 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
@@ -24,9 +24,7 @@ using Oceananigans.TurbulenceClosures:
     AbstractScalarDiffusivity,
     VerticallyImplicitTimeDiscretization,
     VerticalFormulation,
-    κ_kernel_size,
-    κ_kernel_offsets
-
+    
 import Oceananigans.BoundaryConditions: getbc
 import Oceananigans.Utils: with_tracers
 import Oceananigans.TurbulenceClosures:
@@ -235,7 +233,7 @@ end
 
 @inline clip(x) = max(zero(x), x)
 
-function calculate_diffusivities!(diffusivities, closure::FlavorOfCATKE, model; parameters = KernelParameters(model.grid, closure))
+function calculate_diffusivities!(diffusivities, closure::FlavorOfCATKE, model; parameters = :xyz)
 
     arch = model.architecture
     grid = model.grid
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl b/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl
index 0f4ddaeb32..6f757993cf 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl
@@ -188,7 +188,7 @@ end
     @inbounds κₑ[i, j, k] = max(zero(FT), κˢᵍˢ)
 end
 
-function calculate_diffusivities!(diffusivity_fields, closure::AnisotropicMinimumDissipation, model; parameters = KernelParameters(model.grid, closure))
+function calculate_diffusivities!(diffusivity_fields, closure::AnisotropicMinimumDissipation, model; parameters = :xyz)
     grid = model.grid
     arch = model.architecture
     velocities = model.velocities
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/convective_adjustment_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/convective_adjustment_vertical_diffusivity.jl
index ae0e2156ea..a7334841ef 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/convective_adjustment_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/convective_adjustment_vertical_diffusivity.jl
@@ -88,7 +88,7 @@ DiffusivityFields(grid, tracer_names, bcs, closure::FlavorOfCAVD) = (; κᶜ = Z
 @inline viscosity(::FlavorOfCAVD, diffusivities) = diffusivities.κᵘ
 @inline diffusivity(::FlavorOfCAVD, diffusivities, id) = diffusivities.κᶜ
 
-function calculate_diffusivities!(diffusivities, closure::FlavorOfCAVD, model; parameters = KernelParameters(model.grid, closure))
+function calculate_diffusivities!(diffusivities, closure::FlavorOfCAVD, model; parameters = :xyz)
 
     arch = model.architecture
     grid = model.grid
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/isopycnal_skew_symmetric_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/isopycnal_skew_symmetric_diffusivity.jl
index 96bd1cf8cf..6a1c4b0f70 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/isopycnal_skew_symmetric_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/isopycnal_skew_symmetric_diffusivity.jl
@@ -80,7 +80,7 @@ function DiffusivityFields(grid, tracer_names, bcs, closure::FlavorOfISSD{TD}) w
     end
 end
 
-function calculate_diffusivities!(diffusivities, closure::FlavorOfISSD, model; parameters = KernelParameters(model.grid, closure))
+function calculate_diffusivities!(diffusivities, closure::FlavorOfISSD, model; parameters = :xyz)
 
     arch = model.architecture
     grid = model.grid
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl
index 3bbbba7aee..0cff3fbc0b 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl
@@ -66,7 +66,7 @@ function with_tracers(tracers, closure::TwoDimensionalLeith{FT}) where FT
     return TwoDimensionalLeith{FT}(closure.C, C_Redi, C_GM, closure.isopycnal_model)
 end
 
-@inline function abs²_∇h_ζ(i, j, k, grid, velocities)
+@inline function abs²_∇h_ζ(i, j, k, grid, u, v)
     ζx = ℑyᵃᶜᵃ(i, j, k, grid, ∂xᶜᶠᶜ, ζ₃ᶠᶠᶜ, u, v)
     ζy = ℑxᶜᵃᵃ(i, j, k, grid, ∂yᶠᶜᶜ, ζ₃ᶠᶠᶜ, u, v)
     return ζx^2 + ζy^2
@@ -85,14 +85,14 @@ end
 
 @kernel function _compute_leith_viscosity!(νₑ, grid, closure::TwoDimensionalLeith{FT}, buoyancy, velocities, tracers) where FT 
     i, j, k = @index(Global, NTuple)
-
+    u, v, w = velocities
     prefactor = (closure.C * Δᶠ(i, j, k, grid, closure))^3 
-    dynamic_ν = sqrt(abs²_∇h_ζ(i, j, k, grid, velocities) + abs²_∇h_wz(i, j, k, grid, velocities.w))
+    dynamic_ν = sqrt(abs²_∇h_ζ(i, j, k, grid, u, v) + abs²_∇h_wz(i, j, k, grid, w))
     
     @inbounds νₑ[i, j, k] = prefactor * dynamic_ν
 end
 
-function calculate_diffusivities!(diffusivity_fields, closure::TwoDimensionalLeith, model; parameters = KernelParameters(model.grid, closure))
+function calculate_diffusivities!(diffusivity_fields, closure::TwoDimensionalLeith, model; parameters = :xyz)
     arch = model.architecture
     grid = model.grid
     velocities = model.velocities
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index 181601bb26..da2bee3aa1 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -139,7 +139,7 @@ function DiffusivityFields(grid, tracer_names, bcs, closure::FlavorOfRBVD)
     return (; κᶜ, κᵘ, Ri)
 end
 
-function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; parameters = KernelParameters(model.grid, closure))
+function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; parameters = :xyz)
     arch = model.architecture
     grid = model.grid
     clock = model.clock
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl b/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl
index 94ed406e56..c613ccb13c 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl
@@ -119,7 +119,7 @@ end
     @inbounds νₑ[i, j, k] = ς * (C * Δᶠ)^2 * sqrt(2Σ²)
 end
 
-function calculate_diffusivities!(diffusivity_fields, closure::SmagorinskyLilly, model; parameters = KernelParameters(model.grid, closure))
+function calculate_diffusivities!(diffusivity_fields, closure::SmagorinskyLilly, model; parameters = :xyz)
     arch = model.architecture
     grid = model.grid
     buoyancy = model.buoyancy
diff --git a/src/TurbulenceClosures/turbulence_closure_utils.jl b/src/TurbulenceClosures/turbulence_closure_utils.jl
index 2eb414ccc2..0238a995d8 100644
--- a/src/TurbulenceClosures/turbulence_closure_utils.jl
+++ b/src/TurbulenceClosures/turbulence_closure_utils.jl
@@ -24,28 +24,4 @@ end
 function convert_diffusivity(FT, κ::NamedTuple; discrete_form=false, loc=(nothing, nothing, nothing), parameters=nothing)
     κ_names = propertynames(κ)
     return NamedTuple{κ_names}(Tuple(convert_diffusivity(FT, κi; discrete_form, loc, parameters) for κi in κ))
-end
-
-# extend κ kernel to compute also the boundaries 
-# Since the viscous calculation is _always_ second order 
-# we need just +1 in each direction
-@inline function κ_kernel_size(grid, ::AbstractTurbulenceClosure)
-    Nx, Ny, Nz = size(grid)
-    Tx, Ty, Tz = topology(grid)
-
-    Ax = Tx == Flat ? Nx : Nx + 2 
-    Ay = Ty == Flat ? Ny : Ny + 2 
-    Az = Tz == Flat ? Nz : Nz + 2
-
-    return (Ax, Ay, Az)
-end
-
-@inline function κ_kernel_offsets(grid, ::AbstractTurbulenceClosure)  
-    Tx, Ty, Tz = topology(grid)
-
-    Ax = Tx == Flat ? 0 : - 1 
-    Ay = Ty == Flat ? 0 : - 1  
-    Az = Tz == Flat ? 0 : - 1 
-
-    return (Ax, Ay, Az)
-end
+end
\ No newline at end of file

From 718e0f858fe8fb759f54cab837a6aaa8fd16247d Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 14:59:15 +0200
Subject: [PATCH 358/530] bugfix

---
 ...n_large_eddy_simulation_regression_test.jl | 34 +++++++++----------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/test/regression_tests/ocean_large_eddy_simulation_regression_test.jl b/test/regression_tests/ocean_large_eddy_simulation_regression_test.jl
index dd667a9987..de2a486938 100644
--- a/test/regression_tests/ocean_large_eddy_simulation_regression_test.jl
+++ b/test/regression_tests/ocean_large_eddy_simulation_regression_test.jl
@@ -80,23 +80,23 @@ function run_ocean_large_eddy_simulation_regression_test(arch, grid_type, closur
 
     Nz = grid.Nz
 
-    interior(model.velocities.u) .= ArrayType(solution₀.u[1:Nx, 1:Ny])
-    interior(model.velocities.v) .= ArrayType(solution₀.v[1:Nx, 1:Ny])
-    interior(model.velocities.w) .= ArrayType(solution₀.w[1:Nx, 1:Ny])
-    interior(model.tracers.T)    .= ArrayType(solution₀.T[1:Nx, 1:Ny])
-    interior(model.tracers.S)    .= ArrayType(solution₀.S[1:Nx, 1:Ny])
-
-    interior(model.timestepper.Gⁿ.u) .= ArrayType(Gⁿ₀.u[1:Nx, 1:Ny])
-    interior(model.timestepper.Gⁿ.v) .= ArrayType(Gⁿ₀.v[1:Nx, 1:Ny])
-    interior(model.timestepper.Gⁿ.w) .= ArrayType(Gⁿ₀.w[1:Nx, 1:Ny])
-    interior(model.timestepper.Gⁿ.T) .= ArrayType(Gⁿ₀.T[1:Nx, 1:Ny])
-    interior(model.timestepper.Gⁿ.S) .= ArrayType(Gⁿ₀.S[1:Nx, 1:Ny])
-
-    interior(model.timestepper.G⁻.u) .= ArrayType(G⁻₀.u[1:Nx, 1:Ny])
-    interior(model.timestepper.G⁻.v) .= ArrayType(G⁻₀.v[1:Nx, 1:Ny])
-    interior(model.timestepper.G⁻.w) .= ArrayType(G⁻₀.w[1:Nx, 1:Ny])
-    interior(model.timestepper.G⁻.T) .= ArrayType(G⁻₀.T[1:Nx, 1:Ny])
-    interior(model.timestepper.G⁻.S) .= ArrayType(G⁻₀.S[1:Nx, 1:Ny])
+    interior(model.velocities.u) .= ArrayType(solution₀.u[1:N, 1:N, 1:N])
+    interior(model.velocities.v) .= ArrayType(solution₀.v[1:N, 1:N, 1:N])
+    interior(model.velocities.w) .= ArrayType(solution₀.w[1:N, 1:N, 1:N])
+    interior(model.tracers.T)    .= ArrayType(solution₀.T[1:N, 1:N, 1:N])
+    interior(model.tracers.S)    .= ArrayType(solution₀.S[1:N, 1:N, 1:N])
+
+    interior(model.timestepper.Gⁿ.u) .= ArrayType(Gⁿ₀.u[1:N, 1:N, 1:N])
+    interior(model.timestepper.Gⁿ.v) .= ArrayType(Gⁿ₀.v[1:N, 1:N, 1:N])
+    interior(model.timestepper.Gⁿ.w) .= ArrayType(Gⁿ₀.w[1:N, 1:N, 1:N])
+    interior(model.timestepper.Gⁿ.T) .= ArrayType(Gⁿ₀.T[1:N, 1:N, 1:N])
+    interior(model.timestepper.Gⁿ.S) .= ArrayType(Gⁿ₀.S[1:N, 1:N, 1:N])
+
+    interior(model.timestepper.G⁻.u) .= ArrayType(G⁻₀.u[1:N, 1:N, 1:N])
+    interior(model.timestepper.G⁻.v) .= ArrayType(G⁻₀.v[1:N, 1:N, 1:N])
+    interior(model.timestepper.G⁻.w) .= ArrayType(G⁻₀.w[1:N, 1:N, 1:N])
+    interior(model.timestepper.G⁻.T) .= ArrayType(G⁻₀.T[1:N, 1:N, 1:N])
+    interior(model.timestepper.G⁻.S) .= ArrayType(G⁻₀.S[1:N, 1:N, 1:N])
 
     model.clock.time = spinup_steps * Δt
     model.clock.iteration = spinup_steps

From 2e3306965f4d8667a6ed47dfc5c350cee00256ff Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 15:00:53 +0200
Subject: [PATCH 359/530] FT on GPU

---
 src/Advection/weno_interpolants.jl | 48 +++++++++++++++---------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/src/Advection/weno_interpolants.jl b/src/Advection/weno_interpolants.jl
index d781d3104a..f00d6618f8 100644
--- a/src/Advection/weno_interpolants.jl
+++ b/src/Advection/weno_interpolants.jl
@@ -121,30 +121,30 @@ for buffer in [2, 3, 4, 5, 6]
 end
 
 # _UNIFORM_ smoothness coefficients (stretched smoothness coefficients are to be fixed!)
-@inline coeff_β(scheme::WENO{2, FT}, ::Val{0}) where FT = @inbounds convert.(Ref(FT), (1, -2, 1))
-@inline coeff_β(scheme::WENO{2, FT}, ::Val{1}) where FT = @inbounds convert.(Ref(FT), (1, -2, 1))
-
-@inline coeff_β(scheme::WENO{3, FT}, ::Val{0}) where FT = @inbounds convert.(Ref(FT), (10, -31, 11, 25, -19,  4))
-@inline coeff_β(scheme::WENO{3, FT}, ::Val{1}) where FT = @inbounds convert.(Ref(FT), (4,  -13, 5,  13, -13,  4))
-@inline coeff_β(scheme::WENO{3, FT}, ::Val{2}) where FT = @inbounds convert.(Ref(FT), (4,  -19, 11, 25, -31, 10))
-
-@inline coeff_β(scheme::WENO{4, FT}, ::Val{0}) where FT = @inbounds convert.(Ref(FT), (2.107,  -9.402, 7.042, -1.854, 11.003,  -17.246,  4.642,  7.043,  -3.882, 0.547))
-@inline coeff_β(scheme::WENO{4, FT}, ::Val{1}) where FT = @inbounds convert.(Ref(FT), (0.547,  -2.522, 1.922, -0.494,  3.443,  - 5.966,  1.602,  2.843,  -1.642, 0.267))
-@inline coeff_β(scheme::WENO{4, FT}, ::Val{2}) where FT = @inbounds convert.(Ref(FT), (0.267,  -1.642, 1.602, -0.494,  2.843,  - 5.966,  1.922,  3.443,  -2.522, 0.547))
-@inline coeff_β(scheme::WENO{4, FT}, ::Val{3}) where FT = @inbounds convert.(Ref(FT), (0.547,  -3.882, 4.642, -1.854,  7.043,  -17.246,  7.042, 11.003,  -9.402, 2.107))
-
-@inline coeff_β(scheme::WENO{5, FT}, ::Val{0}) where FT = @inbounds convert.(Ref(FT), (1.07918,  -6.49501, 7.58823, -4.11487,  0.86329,  10.20563, -24.62076, 13.58458, -2.88007, 15.21393, -17.04396, 3.64863,  4.82963, -2.08501, 0.22658)) 
-@inline coeff_β(scheme::WENO{5, FT}, ::Val{1}) where FT = @inbounds convert.(Ref(FT), (0.22658,  -1.40251, 1.65153, -0.88297,  0.18079,   2.42723,  -6.11976,  3.37018, -0.70237,  4.06293,  -4.64976, 0.99213,  1.38563, -0.60871, 0.06908)) 
-@inline coeff_β(scheme::WENO{5, FT}, ::Val{2}) where FT = @inbounds convert.(Ref(FT), (0.06908,  -0.51001, 0.67923, -0.38947,  0.08209,   1.04963,  -2.99076,  1.79098, -0.38947,  2.31153,  -2.99076, 0.67923,  1.04963, -0.51001, 0.06908)) 
-@inline coeff_β(scheme::WENO{5, FT}, ::Val{3}) where FT = @inbounds convert.(Ref(FT), (0.06908,  -0.60871, 0.99213, -0.70237,  0.18079,   1.38563,  -4.64976,  3.37018, -0.88297,  4.06293,  -6.11976, 1.65153,  2.42723, -1.40251, 0.22658)) 
-@inline coeff_β(scheme::WENO{5, FT}, ::Val{4}) where FT = @inbounds convert.(Ref(FT), (0.22658,  -2.08501, 3.64863, -2.88007,  0.86329,   4.82963, -17.04396, 13.58458, -4.11487, 15.21393, -24.62076, 7.58823, 10.20563, -6.49501, 1.07918)) 
-
-@inline coeff_β(scheme::WENO{6, FT}, ::Val{0}) where FT = @inbounds convert.(Ref(FT), (0.6150211, -4.7460464, 7.6206736, -6.3394124, 2.7060170, -0.4712740,  9.4851237, -31.1771244, 26.2901672, -11.3206788,  1.9834350, 26.0445372, -44.4003904, 19.2596472, -3.3918804, 19.0757572, -16.6461044, 2.9442256, 3.6480687, -1.2950184, 0.1152561)) 
-@inline coeff_β(scheme::WENO{6, FT}, ::Val{1}) where FT = @inbounds convert.(Ref(FT), (0.1152561, -0.9117992, 1.4742480, -1.2183636, 0.5134574, -0.0880548,  1.9365967,  -6.5224244,  5.5053752,  -2.3510468,  0.4067018,  5.6662212,  -9.7838784,  4.2405032, -0.7408908,  4.3093692,  -3.7913324, 0.6694608, 0.8449957, -0.3015728, 0.0271779)) 
-@inline coeff_β(scheme::WENO{6, FT}, ::Val{2}) where FT = @inbounds convert.(Ref(FT), (0.0271779, -0.2380800, 0.4086352, -0.3462252, 0.1458762, -0.0245620,  0.5653317,  -2.0427884,  1.7905032,  -0.7727988,  0.1325006,  1.9510972,  -3.5817664,  1.5929912, -0.2792660,  1.7195652,  -1.5880404, 0.2863984, 0.3824847, -0.1429976, 0.0139633)) 
-@inline coeff_β(scheme::WENO{6, FT}, ::Val{3}) where FT = @inbounds convert.(Ref(FT), (0.0139633, -0.1429976, 0.2863984, -0.2792660, 0.1325006, -0.0245620,  0.3824847,  -1.5880404,  1.5929912,  -0.7727988,  0.1458762,  1.7195652,  -3.5817664,  1.7905032, -0.3462252,  1.9510972,  -2.0427884, 0.4086352, 0.5653317, -0.2380800, 0.0271779)) 
-@inline coeff_β(scheme::WENO{6, FT}, ::Val{4}) where FT = @inbounds convert.(Ref(FT), (0.0271779, -0.3015728, 0.6694608, -0.7408908, 0.4067018, -0.0880548,  0.8449957,  -3.7913324,  4.2405032,  -2.3510468,  0.5134574,  4.3093692,  -9.7838784,  5.5053752, -1.2183636,  5.6662212,  -6.5224244, 1.4742480, 1.9365967, -0.9117992, 0.1152561)) 
-@inline coeff_β(scheme::WENO{6, FT}, ::Val{5}) where FT = @inbounds convert.(Ref(FT), (0.1152561, -1.2950184, 2.9442256, -3.3918804, 1.9834350, -0.4712740,  3.6480687, -16.6461044, 19.2596472, -11.3206788,  2.7060170, 19.0757572, -44.4003904, 26.2901672, -6.3394124, 26.0445372, -31.1771244, 7.6206736, 9.4851237, -4.7460464, 0.6150211)) 
+@inline coeff_β(scheme::WENO{2, FT}, ::Val{0}) where FT = @inbounds FT.((1, -2, 1))
+@inline coeff_β(scheme::WENO{2, FT}, ::Val{1}) where FT = @inbounds FT.((1, -2, 1))
+
+@inline coeff_β(scheme::WENO{3, FT}, ::Val{0}) where FT = @inbounds FT.((10, -31, 11, 25, -19,  4))
+@inline coeff_β(scheme::WENO{3, FT}, ::Val{1}) where FT = @inbounds FT.((4,  -13, 5,  13, -13,  4))
+@inline coeff_β(scheme::WENO{3, FT}, ::Val{2}) where FT = @inbounds FT.((4,  -19, 11, 25, -31, 10))
+
+@inline coeff_β(scheme::WENO{4, FT}, ::Val{0}) where FT = @inbounds FT.((2.107,  -9.402, 7.042, -1.854, 11.003,  -17.246,  4.642,  7.043,  -3.882, 0.547))
+@inline coeff_β(scheme::WENO{4, FT}, ::Val{1}) where FT = @inbounds FT.((0.547,  -2.522, 1.922, -0.494,  3.443,  - 5.966,  1.602,  2.843,  -1.642, 0.267))
+@inline coeff_β(scheme::WENO{4, FT}, ::Val{2}) where FT = @inbounds FT.((0.267,  -1.642, 1.602, -0.494,  2.843,  - 5.966,  1.922,  3.443,  -2.522, 0.547))
+@inline coeff_β(scheme::WENO{4, FT}, ::Val{3}) where FT = @inbounds FT.((0.547,  -3.882, 4.642, -1.854,  7.043,  -17.246,  7.042, 11.003,  -9.402, 2.107))
+
+@inline coeff_β(scheme::WENO{5, FT}, ::Val{0}) where FT = @inbounds FT.((1.07918,  -6.49501, 7.58823, -4.11487,  0.86329,  10.20563, -24.62076, 13.58458, -2.88007, 15.21393, -17.04396, 3.64863,  4.82963, -2.08501, 0.22658)) 
+@inline coeff_β(scheme::WENO{5, FT}, ::Val{1}) where FT = @inbounds FT.((0.22658,  -1.40251, 1.65153, -0.88297,  0.18079,   2.42723,  -6.11976,  3.37018, -0.70237,  4.06293,  -4.64976, 0.99213,  1.38563, -0.60871, 0.06908)) 
+@inline coeff_β(scheme::WENO{5, FT}, ::Val{2}) where FT = @inbounds FT.((0.06908,  -0.51001, 0.67923, -0.38947,  0.08209,   1.04963,  -2.99076,  1.79098, -0.38947,  2.31153,  -2.99076, 0.67923,  1.04963, -0.51001, 0.06908)) 
+@inline coeff_β(scheme::WENO{5, FT}, ::Val{3}) where FT = @inbounds FT.((0.06908,  -0.60871, 0.99213, -0.70237,  0.18079,   1.38563,  -4.64976,  3.37018, -0.88297,  4.06293,  -6.11976, 1.65153,  2.42723, -1.40251, 0.22658)) 
+@inline coeff_β(scheme::WENO{5, FT}, ::Val{4}) where FT = @inbounds FT.((0.22658,  -2.08501, 3.64863, -2.88007,  0.86329,   4.82963, -17.04396, 13.58458, -4.11487, 15.21393, -24.62076, 7.58823, 10.20563, -6.49501, 1.07918)) 
+
+@inline coeff_β(scheme::WENO{6, FT}, ::Val{0}) where FT = @inbounds FT.((0.6150211, -4.7460464, 7.6206736, -6.3394124, 2.7060170, -0.4712740,  9.4851237, -31.1771244, 26.2901672, -11.3206788,  1.9834350, 26.0445372, -44.4003904, 19.2596472, -3.3918804, 19.0757572, -16.6461044, 2.9442256, 3.6480687, -1.2950184, 0.1152561)) 
+@inline coeff_β(scheme::WENO{6, FT}, ::Val{1}) where FT = @inbounds FT.((0.1152561, -0.9117992, 1.4742480, -1.2183636, 0.5134574, -0.0880548,  1.9365967,  -6.5224244,  5.5053752,  -2.3510468,  0.4067018,  5.6662212,  -9.7838784,  4.2405032, -0.7408908,  4.3093692,  -3.7913324, 0.6694608, 0.8449957, -0.3015728, 0.0271779)) 
+@inline coeff_β(scheme::WENO{6, FT}, ::Val{2}) where FT = @inbounds FT.((0.0271779, -0.2380800, 0.4086352, -0.3462252, 0.1458762, -0.0245620,  0.5653317,  -2.0427884,  1.7905032,  -0.7727988,  0.1325006,  1.9510972,  -3.5817664,  1.5929912, -0.2792660,  1.7195652,  -1.5880404, 0.2863984, 0.3824847, -0.1429976, 0.0139633)) 
+@inline coeff_β(scheme::WENO{6, FT}, ::Val{3}) where FT = @inbounds FT.((0.0139633, -0.1429976, 0.2863984, -0.2792660, 0.1325006, -0.0245620,  0.3824847,  -1.5880404,  1.5929912,  -0.7727988,  0.1458762,  1.7195652,  -3.5817664,  1.7905032, -0.3462252,  1.9510972,  -2.0427884, 0.4086352, 0.5653317, -0.2380800, 0.0271779)) 
+@inline coeff_β(scheme::WENO{6, FT}, ::Val{4}) where FT = @inbounds FT.((0.0271779, -0.3015728, 0.6694608, -0.7408908, 0.4067018, -0.0880548,  0.8449957,  -3.7913324,  4.2405032,  -2.3510468,  0.5134574,  4.3093692,  -9.7838784,  5.5053752, -1.2183636,  5.6662212,  -6.5224244, 1.4742480, 1.9365967, -0.9117992, 0.1152561)) 
+@inline coeff_β(scheme::WENO{6, FT}, ::Val{5}) where FT = @inbounds FT.((0.1152561, -1.2950184, 2.9442256, -3.3918804, 1.9834350, -0.4712740,  3.6480687, -16.6461044, 19.2596472, -11.3206788,  2.7060170, 19.0757572, -44.4003904, 26.2901672, -6.3394124, 26.0445372, -31.1771244, 7.6206736, 9.4851237, -4.7460464, 0.6150211)) 
 
 # The rule for calculating smoothness indicators is the following (example WENO{4} which is seventh order) 
 # ψ[1] (C[1]  * ψ[1] + C[2] * ψ[2] + C[3] * ψ[3] + C[4] * ψ[4]) + 

From 4be413e313d87986d3f99cfe1862c5282768ac89 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 15:08:57 +0200
Subject: [PATCH 360/530] bugfix

---
 .../CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
index f92a65fe69..8a59d77e53 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
@@ -23,7 +23,7 @@ using Oceananigans.TurbulenceClosures:
     time_discretization,
     AbstractScalarDiffusivity,
     VerticallyImplicitTimeDiscretization,
-    VerticalFormulation,
+    VerticalFormulation
     
 import Oceananigans.BoundaryConditions: getbc
 import Oceananigans.Utils: with_tracers

From ce0628adfb66caa36c6d387d0aa08d7569891b8c Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 15:23:41 +0200
Subject: [PATCH 361/530] bugfix

---
 .../ocean_large_eddy_simulation_regression_test.jl     | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/test/regression_tests/ocean_large_eddy_simulation_regression_test.jl b/test/regression_tests/ocean_large_eddy_simulation_regression_test.jl
index de2a486938..283a2af895 100644
--- a/test/regression_tests/ocean_large_eddy_simulation_regression_test.jl
+++ b/test/regression_tests/ocean_large_eddy_simulation_regression_test.jl
@@ -78,23 +78,21 @@ function run_ocean_large_eddy_simulation_regression_test(arch, grid_type, closur
 
     solution₀, Gⁿ₀, G⁻₀ = get_fields_from_checkpoint(initial_filename)
 
-    Nz = grid.Nz
-
     interior(model.velocities.u) .= ArrayType(solution₀.u[1:N, 1:N, 1:N])
     interior(model.velocities.v) .= ArrayType(solution₀.v[1:N, 1:N, 1:N])
-    interior(model.velocities.w) .= ArrayType(solution₀.w[1:N, 1:N, 1:N])
+    interior(model.velocities.w) .= ArrayType(solution₀.w[1:N, 1:N, 1:N+1])
     interior(model.tracers.T)    .= ArrayType(solution₀.T[1:N, 1:N, 1:N])
     interior(model.tracers.S)    .= ArrayType(solution₀.S[1:N, 1:N, 1:N])
 
     interior(model.timestepper.Gⁿ.u) .= ArrayType(Gⁿ₀.u[1:N, 1:N, 1:N])
     interior(model.timestepper.Gⁿ.v) .= ArrayType(Gⁿ₀.v[1:N, 1:N, 1:N])
-    interior(model.timestepper.Gⁿ.w) .= ArrayType(Gⁿ₀.w[1:N, 1:N, 1:N])
+    interior(model.timestepper.Gⁿ.w) .= ArrayType(Gⁿ₀.w[1:N, 1:N, 1:N+1])
     interior(model.timestepper.Gⁿ.T) .= ArrayType(Gⁿ₀.T[1:N, 1:N, 1:N])
     interior(model.timestepper.Gⁿ.S) .= ArrayType(Gⁿ₀.S[1:N, 1:N, 1:N])
 
     interior(model.timestepper.G⁻.u) .= ArrayType(G⁻₀.u[1:N, 1:N, 1:N])
     interior(model.timestepper.G⁻.v) .= ArrayType(G⁻₀.v[1:N, 1:N, 1:N])
-    interior(model.timestepper.G⁻.w) .= ArrayType(G⁻₀.w[1:N, 1:N, 1:N])
+    interior(model.timestepper.G⁻.w) .= ArrayType(G⁻₀.w[1:N, 1:N, 1:N+1])
     interior(model.timestepper.G⁻.T) .= ArrayType(G⁻₀.T[1:N, 1:N, 1:N])
     interior(model.timestepper.G⁻.S) .= ArrayType(G⁻₀.S[1:N, 1:N, 1:N])
 
@@ -115,7 +113,7 @@ function run_ocean_large_eddy_simulation_regression_test(arch, grid_type, closur
 
     test_fields = CUDA.@allowscalar (u = Array(interior(model.velocities.u)),
                                      v = Array(interior(model.velocities.v)),
-                                     w = Array(interior(model.velocities.w)[:, :, 1:Nz]),
+                                     w = Array(interior(model.velocities.w)[:, :, 1:N+1]),
                                      T = Array(interior(model.tracers.T)),
                                      S = Array(interior(model.tracers.S)))
 

From a8285afc5014907ee6bf117bc4fbd9421d66f0fe Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 15:57:28 +0200
Subject: [PATCH 362/530] last bugfix?

---
 .../ocean_large_eddy_simulation_regression_test.jl              | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/regression_tests/ocean_large_eddy_simulation_regression_test.jl b/test/regression_tests/ocean_large_eddy_simulation_regression_test.jl
index 283a2af895..8e81311cd4 100644
--- a/test/regression_tests/ocean_large_eddy_simulation_regression_test.jl
+++ b/test/regression_tests/ocean_large_eddy_simulation_regression_test.jl
@@ -113,7 +113,7 @@ function run_ocean_large_eddy_simulation_regression_test(arch, grid_type, closur
 
     test_fields = CUDA.@allowscalar (u = Array(interior(model.velocities.u)),
                                      v = Array(interior(model.velocities.v)),
-                                     w = Array(interior(model.velocities.w)[:, :, 1:N+1]),
+                                     w = Array(interior(model.velocities.w)[:, :, 1:N]),
                                      T = Array(interior(model.tracers.T)),
                                      S = Array(interior(model.tracers.S)))
 

From 07f8d1d89594b7b9f724461d4543ab3df48a784f Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 17:12:42 +0200
Subject: [PATCH 363/530] removed all offsets from kernels + fixed all tests

---
 src/AbstractOperations/computed_field.jl      | 14 ++----
 src/Advection/weno_interpolants.jl            |  4 +-
 .../fill_halo_regions_open.jl                 |  2 +-
 src/Distributed/distributed_grids.jl          | 11 ++--
 src/Fields/Fields.jl                          |  1 +
 src/Fields/broadcasting_abstract_fields.jl    | 15 +++---
 src/ImmersedBoundaries/immersed_reductions.jl |  6 +--
 src/Solvers/fft_based_poisson_solver.jl       |  2 +
 ...n_large_eddy_simulation_regression_test.jl | 50 ++++++++++---------
 9 files changed, 52 insertions(+), 53 deletions(-)

diff --git a/src/AbstractOperations/computed_field.jl b/src/AbstractOperations/computed_field.jl
index f90cb11862..3e5df945e0 100644
--- a/src/AbstractOperations/computed_field.jl
+++ b/src/AbstractOperations/computed_field.jl
@@ -4,7 +4,7 @@
 
 using KernelAbstractions: @kernel, @index
 using Oceananigans.Grids: default_indices
-using Oceananigans.Fields: FieldStatus, reduced_dimensions, validate_indices, offset_compute_index
+using Oceananigans.Fields: FieldStatus, reduced_dimensions, validate_indices, offset_index
 using Oceananigans.Utils: launch!
 
 import Oceananigans.Fields: Field, compute!
@@ -75,17 +75,13 @@ end
 
 function compute_computed_field!(comp)
     arch = architecture(comp)
-    launch!(arch, comp.grid, size(comp), _compute!, comp.data, comp.operand, comp.indices)
+    parameters = KernelParameters(size(comp), map(offset_index, comp.indices))
+    launch!(arch, comp.grid, parameters, _compute!, comp.data, comp.operand)
     return comp
 end
 
 """Compute an `operand` and store in `data`."""
-@kernel function _compute!(data, operand, index_ranges)
+@kernel function _compute!(data, operand)
     i, j, k = @index(Global, NTuple)
-
-    i′ = offset_compute_index(index_ranges[1], i)
-    j′ = offset_compute_index(index_ranges[2], j)
-    k′ = offset_compute_index(index_ranges[3], k)
-
-    @inbounds data[i′, j′, k′] = operand[i′, j′, k′]
+    @inbounds data[i, j, k] = operand[i, j, k]
 end
diff --git a/src/Advection/weno_interpolants.jl b/src/Advection/weno_interpolants.jl
index f00d6618f8..e292b1cde5 100644
--- a/src/Advection/weno_interpolants.jl
+++ b/src/Advection/weno_interpolants.jl
@@ -104,8 +104,8 @@ for buffer in [2, 3, 4, 5, 6]
             @inline Cr(scheme::WENO{$buffer}, ::Val{$stencil}) = @inbounds Cl(scheme, Val($(buffer-stencil-1)))
 
             # uniform coefficients are independent on direction and location
-            @inline  coeff_left_p(scheme::WENO{$buffer, FT}, ::Val{$stencil}, ::Type{Nothing}, args...) where FT = @inbounds convert.(Ref(FT), $(stencil_coefficients(50, stencil  , collect(1:100), collect(1:100); order = buffer)))
-            @inline coeff_right_p(scheme::WENO{$buffer, FT}, ::Val{$stencil}, ::Type{Nothing}, args...) where FT = @inbounds convert.(Ref(FT), $(stencil_coefficients(50, stencil-1, collect(1:100), collect(1:100); order = buffer)))
+            @inline  coeff_left_p(scheme::WENO{$buffer, FT}, ::Val{$stencil}, ::Type{Nothing}, args...) where FT = @inbounds FT.($(stencil_coefficients(50, stencil  , collect(1:100), collect(1:100); order = buffer)))
+            @inline coeff_right_p(scheme::WENO{$buffer, FT}, ::Val{$stencil}, ::Type{Nothing}, args...) where FT = @inbounds FT.($(stencil_coefficients(50, stencil-1, collect(1:100), collect(1:100); order = buffer)))
 
             # stretched coefficients are retrieved from precalculated coefficients
             @inline  coeff_left_p(scheme::WENO{$buffer}, ::Val{$stencil}, T, dir, i, loc) = @inbounds retrieve_coeff(scheme, $stencil,     dir, i, loc)
diff --git a/src/BoundaryConditions/fill_halo_regions_open.jl b/src/BoundaryConditions/fill_halo_regions_open.jl
index 3c944a3a78..20b1f97641 100644
--- a/src/BoundaryConditions/fill_halo_regions_open.jl
+++ b/src/BoundaryConditions/fill_halo_regions_open.jl
@@ -16,7 +16,7 @@ end
 
 @kernel function set_south_or_north_v!(v, j_boundary, bc, grid, args)
     i, k = @index(Global, NTuple)
-@inbounds v[i′, j_boundary, k] = getbc(bc, i, k, grid, args...)
+@inbounds v[i, j_boundary, k] = getbc(bc, i, k, grid, args...)
 end
 
 @kernel function set_bottom_or_top_w!(w, k_boundary, bc, grid, args) 
diff --git a/src/Distributed/distributed_grids.jl b/src/Distributed/distributed_grids.jl
index 2dab01305f..2c481979d6 100644
--- a/src/Distributed/distributed_grids.jl
+++ b/src/Distributed/distributed_grids.jl
@@ -36,7 +36,7 @@ function RectilinearGrid(arch::DistributedArch,
                          extent = nothing,
                          topology = (Periodic, Periodic, Bounded))
 
-    global_size = sum.(concatenate_local_sizes(size, arch))
+    global_size = map(sum, concatenate_local_sizes(size, arch))
     
     TX, TY, TZ, global_size, halo, x, y, z =
         validate_rectilinear_grid_args(topology, global_size, halo, FT, extent, x, y, z)
@@ -85,8 +85,7 @@ function LatitudeLongitudeGrid(arch::DistributedArch,
                                halo = (1, 1, 1))
 
 
-    global_sizes = concatenate_local_sizes(size, arch)
-    global_size  = sum.(global_sizes)
+    global_size = map(sum, concatenate_local_sizes(size, arch))
 
     Nλ, Nφ, Nz, Hλ, Hφ, Hz, latitude, longitude, z, topology, precompute_metrics =
         validate_lat_lon_grid_args(FT, latitude, longitude, z, global_size, halo, topology, precompute_metrics)
@@ -123,7 +122,7 @@ function LatitudeLongitudeGrid(arch::DistributedArch,
                                                          Δλᶠᵃᵃ, Δλᶜᵃᵃ, λᶠᵃᵃ, λᶜᵃᵃ,
                                                          Δφᵃᶠᵃ, Δφᵃᶜᵃ, φᵃᶠᵃ, φᵃᶜᵃ,
                                                          Δzᵃᵃᶠ, Δzᵃᵃᶜ, zᵃᵃᶠ, zᵃᵃᶜ,
-                                                         (nothing for i=1:10)..., FT(radius))
+                                                         (nothing for i=1:10)..., convert(FT, radius))
 
     return !precompute_metrics ? preliminary_grid : with_precomputed_metrics(preliminary_grid)
 end
@@ -142,7 +141,7 @@ function reconstruct_global_grid(grid::DistributedRectilinearGrid)
 
     nx, ny, nz = n = size(grid)
     Hx, Hy, Hz = H = halo_size(grid)
-    Nx, Ny, Nz = sum.(concatenate_local_sizes(n, arch))
+    Nx, Ny, Nz = map(sum, concatenate_local_sizes(n, arch))
 
     TX, TY, TZ = topology(grid)
 
@@ -185,7 +184,7 @@ function reconstruct_global_grid(grid::DistributedLatitudeLongitudeGrid)
 
     nλ, nφ, nz = n = size(grid)
     Hλ, Hφ, Hz = H = halo_size(grid)
-    Nλ, Nφ, Nz = sum.(concatenate_local_sizes(n, arch))
+    Nλ, Nφ, Nz = map(sum, concatenate_local_sizes(n, arch))
 
     TX, TY, TZ = topology(grid)
 
diff --git a/src/Fields/Fields.jl b/src/Fields/Fields.jl
index fd95e425d5..afb4b8e6e0 100644
--- a/src/Fields/Fields.jl
+++ b/src/Fields/Fields.jl
@@ -12,6 +12,7 @@ export interpolate
 using Oceananigans.Architectures
 using Oceananigans.Grids
 using Oceananigans.BoundaryConditions
+using Oceananigans.Utils
 
 include("abstract_field.jl")
 include("constant_field.jl")
diff --git a/src/Fields/broadcasting_abstract_fields.jl b/src/Fields/broadcasting_abstract_fields.jl
index 9c18c31a0b..04812b0a41 100644
--- a/src/Fields/broadcasting_abstract_fields.jl
+++ b/src/Fields/broadcasting_abstract_fields.jl
@@ -43,14 +43,12 @@ end
 @inline offset_compute_index(::Colon, i) = i
 @inline offset_compute_index(range::UnitRange, i) = range[1] + i - 1
 
-@kernel function broadcast_kernel!(dest, bc, index_ranges)
-    i, j, k = @index(Global, NTuple)
-
-    i′ = offset_compute_index(index_ranges[1], i)
-    j′ = offset_compute_index(index_ranges[2], j)
-    k′ = offset_compute_index(index_ranges[3], k)
+@inline offset_index(::Colon) = 0
+@inline offset_index(range::UnitRange) = range[1] - 1
 
-    @inbounds dest[i′, j′, k′] = bc[i′, j′, k′]
+@kernel function _broadcast_kernel!(dest, bc)
+    i, j, k = @index(Global, NTuple)
+    @inbounds dest[i, j, k] = bc[i, j, k]
 end
 
 # Interface for getting AbstractOperation right
@@ -70,7 +68,8 @@ broadcasted_to_abstract_operation(loc, grid, a) = a
 
     bc′ = broadcasted_to_abstract_operation(location(dest), grid, bc)
 
-    launch!(arch, grid, size(dest), broadcast_kernel!, dest, bc′, dest.indices)
+    param = KernelParameters(size(dest), map(offset_index, dest.indices))
+    launch!(arch, grid, param, _broadcast_kernel!, dest, bc′)
 
     return dest
 end
diff --git a/src/ImmersedBoundaries/immersed_reductions.jl b/src/ImmersedBoundaries/immersed_reductions.jl
index 2ff322c873..636aa4d740 100644
--- a/src/ImmersedBoundaries/immersed_reductions.jl
+++ b/src/ImmersedBoundaries/immersed_reductions.jl
@@ -1,4 +1,4 @@
-using Oceananigans.Fields: AbstractField, offset_compute_index, indices
+using Oceananigans.Fields: AbstractField, indices
 
 import Oceananigans.AbstractOperations: ConditionalOperation, get_condition, truefunc
 import Oceananigans.Fields: condition_operand, conditional_length
@@ -58,8 +58,8 @@ const IRF = Union{XIRF, YIRF, ZIRF, YZIRF, XZIRF, XYIRF, XYZIRF}
 
 @inline function immersed_column(field::IRF)
     reduced_dims  = reduced_dimensions(field)
-    full_location = fill_location.(location(field)) 
-    one_field    = ConditionalOperation{full_location...}(OneField(Int), identity, field.grid, NotImmersed(truefunc), 0.0)
+    full_location = map(fill_location, location(field)) 
+    one_field     = ConditionalOperation{full_location...}(OneField(Int), identity, field.grid, NotImmersed(truefunc), 0.0)
 
     return sum(one_field, dims = reduced_dims)
 end
diff --git a/src/Solvers/fft_based_poisson_solver.jl b/src/Solvers/fft_based_poisson_solver.jl
index 7c5e81cf9e..15f54704f5 100644
--- a/src/Solvers/fft_based_poisson_solver.jl
+++ b/src/Solvers/fft_based_poisson_solver.jl
@@ -118,6 +118,8 @@ function solve!(ϕ, solver::FFTBasedPoissonSolver, b, m=0)
     return ϕ
 end
 
+# We have to pass the offset explicitly to this kernel (we cannot use KA implicit
+# index offsetting) since ϕc and ϕ are indexed with different indices
 @kernel function copy_real_component!(ϕ, ϕc, index_ranges)
     i, j, k = @index(Global, NTuple)
 
diff --git a/test/regression_tests/ocean_large_eddy_simulation_regression_test.jl b/test/regression_tests/ocean_large_eddy_simulation_regression_test.jl
index 8e81311cd4..7eeed6fa85 100644
--- a/test/regression_tests/ocean_large_eddy_simulation_regression_test.jl
+++ b/test/regression_tests/ocean_large_eddy_simulation_regression_test.jl
@@ -37,9 +37,6 @@ function run_ocean_large_eddy_simulation_regression_test(arch, grid_type, closur
                                 closure = closure,
                                 boundary_conditions = (u=u_bcs, T=T_bcs, S=S_bcs))
 
-    # We will manually change the stop_iteration as needed.
-    simulation = Simulation(model, Δt=Δt, stop_iteration=0)
-
     # The type of the underlying data, not the offset array.
     ArrayType = typeof(model.velocities.u.data.parent)
 
@@ -78,28 +75,33 @@ function run_ocean_large_eddy_simulation_regression_test(arch, grid_type, closur
 
     solution₀, Gⁿ₀, G⁻₀ = get_fields_from_checkpoint(initial_filename)
 
-    interior(model.velocities.u) .= ArrayType(solution₀.u[1:N, 1:N, 1:N])
-    interior(model.velocities.v) .= ArrayType(solution₀.v[1:N, 1:N, 1:N])
-    interior(model.velocities.w) .= ArrayType(solution₀.w[1:N, 1:N, 1:N+1])
-    interior(model.tracers.T)    .= ArrayType(solution₀.T[1:N, 1:N, 1:N])
-    interior(model.tracers.S)    .= ArrayType(solution₀.S[1:N, 1:N, 1:N])
+    Nz = grid.Nz
+
+    solution_indices   = [2:N+3, 2:N+3, 2:N+3]
+    w_solution_indices = [2:N+3, 2:N+3, 2:N+4]
+
+    parent(model.velocities.u)[solution_indices...]   .= ArrayType(solution₀.u)
+    parent(model.velocities.v)[solution_indices...]   .= ArrayType(solution₀.v)
+    parent(model.velocities.w)[w_solution_indices...] .= ArrayType(solution₀.w)
+    parent(model.tracers.T)[solution_indices...]      .= ArrayType(solution₀.T)
+    parent(model.tracers.S)[solution_indices...]      .= ArrayType(solution₀.S)
 
-    interior(model.timestepper.Gⁿ.u) .= ArrayType(Gⁿ₀.u[1:N, 1:N, 1:N])
-    interior(model.timestepper.Gⁿ.v) .= ArrayType(Gⁿ₀.v[1:N, 1:N, 1:N])
-    interior(model.timestepper.Gⁿ.w) .= ArrayType(Gⁿ₀.w[1:N, 1:N, 1:N+1])
-    interior(model.timestepper.Gⁿ.T) .= ArrayType(Gⁿ₀.T[1:N, 1:N, 1:N])
-    interior(model.timestepper.Gⁿ.S) .= ArrayType(Gⁿ₀.S[1:N, 1:N, 1:N])
+    parent(model.timestepper.Gⁿ.u)[solution_indices...]   .= ArrayType(Gⁿ₀.u)
+    parent(model.timestepper.Gⁿ.v)[solution_indices...]   .= ArrayType(Gⁿ₀.v)
+    parent(model.timestepper.Gⁿ.w)[w_solution_indices...] .= ArrayType(Gⁿ₀.w)
+    parent(model.timestepper.Gⁿ.T)[solution_indices...]   .= ArrayType(Gⁿ₀.T)
+    parent(model.timestepper.Gⁿ.S)[solution_indices...]   .= ArrayType(Gⁿ₀.S)
 
-    interior(model.timestepper.G⁻.u) .= ArrayType(G⁻₀.u[1:N, 1:N, 1:N])
-    interior(model.timestepper.G⁻.v) .= ArrayType(G⁻₀.v[1:N, 1:N, 1:N])
-    interior(model.timestepper.G⁻.w) .= ArrayType(G⁻₀.w[1:N, 1:N, 1:N+1])
-    interior(model.timestepper.G⁻.T) .= ArrayType(G⁻₀.T[1:N, 1:N, 1:N])
-    interior(model.timestepper.G⁻.S) .= ArrayType(G⁻₀.S[1:N, 1:N, 1:N])
+    parent(model.timestepper.G⁻.u)[solution_indices...]   .= ArrayType(G⁻₀.u)
+    parent(model.timestepper.G⁻.v)[solution_indices...]   .= ArrayType(G⁻₀.v)
+    parent(model.timestepper.G⁻.w)[w_solution_indices...] .= ArrayType(G⁻₀.w)
+    parent(model.timestepper.G⁻.T)[solution_indices...]   .= ArrayType(G⁻₀.T)
+    parent(model.timestepper.G⁻.S)[solution_indices...]   .= ArrayType(G⁻₀.S)
 
     model.clock.time = spinup_steps * Δt
     model.clock.iteration = spinup_steps
 
-    update_state!(model)
+    update_state!(model; compute_tendencies = true)
     model.timestepper.previous_Δt = Δt
 
     for n in 1:test_steps
@@ -117,11 +119,11 @@ function run_ocean_large_eddy_simulation_regression_test(arch, grid_type, closur
                                      T = Array(interior(model.tracers.T)),
                                      S = Array(interior(model.tracers.S)))
 
-    correct_fields = (u = Array(interior(solution₁.u, model.grid)),
-                      v = Array(interior(solution₁.v, model.grid)),
-                      w = Array(interior(solution₁.w, model.grid)),
-                      T = Array(interior(solution₁.T, model.grid)),
-                      S = Array(interior(solution₁.S, model.grid)))
+    correct_fields = (u = Array(solution₁.u)[2:N+1, 2:N+1, 2:N+1],
+                      v = Array(solution₁.v)[2:N+1, 2:N+1, 2:N+1],
+                      w = Array(solution₁.w)[2:N+1, 2:N+1, 2:N+1],
+                      T = Array(solution₁.T)[2:N+1, 2:N+1, 2:N+1],
+                      S = Array(solution₁.S)[2:N+1, 2:N+1, 2:N+1])
 
     summarize_regression_test(test_fields, correct_fields)
 

From e5975dbc53d52a7f1de999b9488cd7b84f21bd31 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 17:23:46 +0200
Subject: [PATCH 364/530] fix `_compute!`

---
 src/AbstractOperations/computed_field.jl | 2 +-
 src/CubedSpheres/CubedSpheres.jl         | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/AbstractOperations/computed_field.jl b/src/AbstractOperations/computed_field.jl
index 3e5df945e0..fbbd30efe9 100644
--- a/src/AbstractOperations/computed_field.jl
+++ b/src/AbstractOperations/computed_field.jl
@@ -76,7 +76,7 @@ end
 function compute_computed_field!(comp)
     arch = architecture(comp)
     parameters = KernelParameters(size(comp), map(offset_index, comp.indices))
-    launch!(arch, comp.grid, parameters, _compute!, comp.data, comp.operand, comp.indices)
+    launch!(arch, comp.grid, parameters, _compute!, comp.data, comp.operand)
     return comp
 end
 
diff --git a/src/CubedSpheres/CubedSpheres.jl b/src/CubedSpheres/CubedSpheres.jl
index e273a84cdb..dfd41da6b9 100644
--- a/src/CubedSpheres/CubedSpheres.jl
+++ b/src/CubedSpheres/CubedSpheres.jl
@@ -14,6 +14,7 @@ include("immersed_conformal_cubed_sphere_grid.jl")
 ##### Validating cubed sphere stuff
 #####
 
+using Oceananigans.Utils
 import Oceananigans.Grids: validate_index
 import Oceananigans.Fields: validate_field_data, validate_boundary_conditions
 import Oceananigans.Models.HydrostaticFreeSurfaceModels: validate_vertical_velocity_boundary_conditions
@@ -190,7 +191,8 @@ function compute!(comp::CubedSphereComputedField, time=nothing)
 
     arch = architecture(comp)
     foreach(faces(comp)) do c
-        launch!(arch, c.grid, size(c), _compute!, c.data, c.operand, c.indices)
+        parameters = KernelParameters(size(c), map(offset_index, c.indices))
+        launch!(arch, c.grid, parameters, _compute!, c.data, c.operand)
     end
     
     fill_halo_regions!(comp)

From d82d908785fd84477ae7a5a8213e01bbc91f9391 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 17:52:49 +0200
Subject: [PATCH 365/530] finished

---
 src/CubedSpheres/CubedSpheres.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/CubedSpheres/CubedSpheres.jl b/src/CubedSpheres/CubedSpheres.jl
index dfd41da6b9..38d6b080c7 100644
--- a/src/CubedSpheres/CubedSpheres.jl
+++ b/src/CubedSpheres/CubedSpheres.jl
@@ -177,7 +177,7 @@ end
 #####
 
 using Oceananigans.AbstractOperations: _compute!
-using Oceananigans.Fields: compute_at!
+using Oceananigans.Fields: compute_at!, offset_index
 
 import Oceananigans.Fields: compute!
 

From bd26e8cbf64af242fcdc4c0ae408506f9aeb6afe Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 28 Jun 2023 19:51:20 +0200
Subject: [PATCH 366/530] fixed broken tests

---
 ...distributed_split_explicit_free_surface.jl |  2 +-
 test/test_computed_field.jl                   | 32 +++-------
 .../mpi_hydrostatic_turbulence.jl             | 64 +++++++++++--------
 .../mpi_output_writing.jl                     |  2 +-
 4 files changed, 49 insertions(+), 51 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl b/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl
index 7707ba78d8..910b9a3602 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl
@@ -66,7 +66,7 @@ function FreeSurface(free_surface::SplitExplicitFreeSurface, velocities, grid::D
         settings  = free_surface.settings 
 
         old_halos  = halo_size(grid)
-        Nsubsteps  = length(settings.substeps.averaging_weights)
+        Nsubsteps  = length(settings.substepping.averaging_weights)
 
         new_halos = distributed_split_explicit_halos(old_halos, Nsubsteps+1, grid)         
         new_grid  = with_halo(new_halos, grid)
diff --git a/test/test_computed_field.jl b/test/test_computed_field.jl
index d3d2c37e39..6e3f6c40ef 100644
--- a/test/test_computed_field.jl
+++ b/test/test_computed_field.jl
@@ -534,13 +534,8 @@ for arch in archs
                 @test all(interior(computed_tke, 2:3, 2:3, 2:3) .== 9/2)
 
                 tke_window = Field(tke_ccc, indices=(2:3, 2:3, 2:3))
-                if (grid isa ImmersedBoundaryGrid) & (arch==GPU())
-                    @test_broken try compute!(tke_window); true; catch; false; end
-                    @test_broken all(interior(tke_window) .== 9/2)
-                else
-                    @test try compute!(tke_window); true; catch; false; end
-                    @test all(interior(tke_window) .== 9/2)
-                end
+                @test try compute!(tke_window); true; catch; false; end
+                @test all(interior(tke_window) .== 9/2)
 
                 # Computations along slices
                 tke_xy = Field(tke_ccc, indices=(:, :, 2))
@@ -551,25 +546,14 @@ for arch in archs
                 tke_yz = Field(tke_ccc, indices=(2, 2:3, 2:3))
                 tke_x = Field(tke_ccc, indices=(2:3, 2, 2))
 
-                if (grid isa ImmersedBoundaryGrid) & (arch==GPU())
-                    @test_broken try compute!(tke_xz); true; catch; false; end
-                    @test_broken all(interior(tke_xz) .== 9/2)
-
-                    @test_broken try compute!(tke_yz); true; catch; false; end
-                    @test_broken all(interior(tke_yz) .== 9/2)
+                @test try compute!(tke_xz); true; catch; false; end
+                @test all(interior(tke_xz) .== 9/2)
 
-                    @test_broken try compute!(tke_x); true; catch; false; end
-                    @test_broken all(interior(tke_x) .== 9/2)
-                else
-                    @test try compute!(tke_xz); true; catch; false; end
-                    @test all(interior(tke_xz) .== 9/2)
+                @test try compute!(tke_yz); true; catch; false; end
+                @test all(interior(tke_yz) .== 9/2)
 
-                    @test try compute!(tke_yz); true; catch; false; end
-                    @test all(interior(tke_yz) .== 9/2)
-
-                    @test try compute!(tke_x); true; catch; false; end
-                    @test all(interior(tke_x) .== 9/2)
-                end
+                @test try compute!(tke_x); true; catch; false; end
+                @test all(interior(tke_x) .== 9/2)
             end
 
             @testset "Computations with Fields [$A, $G]" begin
diff --git a/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl b/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl
index b9c3492f3c..7c86bf84a6 100644
--- a/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl
+++ b/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl
@@ -1,22 +1,28 @@
 using Oceananigans
 using MPI
+using Oceananigans.Models.HydrostaticFreeSurfaceModels: VerticalVorticityField
+using Printf
+using CairoMakie
 
 MPI.Initialized() || MPI.Init()
 
      comm = MPI.COMM_WORLD
 mpi_ranks = MPI.Comm_size(comm)
 
-@assert mpi_ranks == 4
+@assert mpi_ranks == 16
 
 using Statistics
 using Oceananigans
 using Oceananigans.Distributed
 
-ranks = (2, 2, 1)
+ranks = (4, 4, 1)
 topo  = (Periodic, Periodic, Bounded)
 arch  = DistributedArch(CPU(), ranks=ranks, topology=topo)
 
-grid  = RectilinearGrid(arch, topology=topo, size=(28 ÷ 2, 28 ÷ 2, 1), extent=(4π, 4π, 0.5), halo=(3, 3, 3))
+N = 28
+nx, ny = N ÷ ranks[1], N ÷ ranks[2] 
+
+grid  = RectilinearGrid(arch, topology=topo, size=(nx, ny, 1), extent=(4π, 4π, 0.5), halo=(3, 3, 3))
 
 local_rank = MPI.Comm_rank(MPI.COMM_WORLD)
 
@@ -40,12 +46,16 @@ if local_rank == 0
 end
 
 u, v, _ = model.velocities
-outputs = merge(model.velocities, model.tracers)
+ζ = VerticalVorticityField(model)
+outputs = merge(model.velocities, model.tracers, (; ζ))
+
+progress(sim) = @info "Iteration: $(sim.model.clock.iteration), time: $(sim.model.clock.time), Δt: $(sim.Δt)"
+simulation = Simulation(model, Δt=0.01, stop_time=100.0)
 
-progress(sim) = @info "Iteration: $(sim.model.clock.iteration), time: $(sim.model.clock.time)"
-simulation = Simulation(model, Δt=0.001, stop_time=100.0)
+wizard = TimeStepWizard(cfl = 0.7, max_change = 1.2)
 
 simulation.callbacks[:progress] = Callback(progress, IterationInterval(100))
+simulation.callbacks[:wizard]   = Callback(wizard,   IterationInterval(10))
 
 filepath = "mpi_hydrostatic_turbulence_rank$(local_rank)"
 simulation.output_writers[:fields] =
@@ -55,30 +65,34 @@ simulation.output_writers[:fields] =
 MPI.Barrier(MPI.COMM_WORLD)
 
 run!(simulation)
+MPI.Barrier(MPI.COMM_WORLD)
 
 if rank == 0
-    using Printf
-    using GLMakie
-
     iter = Observable(1)
 
-    z1 = FieldTimeSeries("mpi_hydrostatic_turbulence_rank0.jld2", "u")
-    z2 = FieldTimeSeries("mpi_hydrostatic_turbulence_rank1.jld2", "u")
-    z3 = FieldTimeSeries("mpi_hydrostatic_turbulence_rank2.jld2", "u")
-    z4 = FieldTimeSeries("mpi_hydrostatic_turbulence_rank3.jld2", "u")
+    vort = []
+    ζ = []
+    x = []
+    y = []
+    for i in 0:15
+        push!(vort, FieldTimeSeries("mpi_hydrostatic_turbulence_rank$i.jld2", "u"))
+        z1 = @lift(interior(vort[i][$iter], 1:nx, 1:ny, 1))
+        push!(ζ, z1)
 
-    ζ1 = @lift(interior(z1[$iter], 1:14, 1:14, 1))
-    ζ2 = @lift(interior(z2[$iter], 1:14, 1:14, 1))
-    ζ3 = @lift(interior(z3[$iter], 1:14, 1:14, 1))
-    ζ4 = @lift(interior(z4[$iter], 1:14, 1:14, 1))
-
-    x1, y1 = z1.grid.xᶠᵃᵃ[1:14], z1.grid.yᵃᶜᵃ[1:14]
-    x2, y2 = z4.grid.xᶠᵃᵃ[1:14], z4.grid.yᵃᶜᵃ[1:14]
+        push!(x, vort[i].grid.xᶠᵃᵃ[1:nx])
+        push!(y, vort[i].grid.yᵃᶠᵃ[1:ny])
+    end
 
     fig = Figure()
     ax = Axis(fig[1, 1])
-    heatmap!(ax, x1, y1, ζ1, colorrange = (-1.0, 1.0))
-    heatmap!(ax, x1, y2, ζ2, colorrange = (-1.0, 1.0))
-    heatmap!(ax, x2, y1, ζ3, colorrange = (-1.0, 1.0))
-    heatmap!(ax, x2, y2, ζ4, colorrange = (-1.0, 1.0))
-end
\ No newline at end of file
+    for i in 0:15
+        heatmap!(ax, x[i], y[i], ζ[i], colorrange = (-1.0, 1.0))
+    end
+
+    CairoMakie.record(fig, "hydrostatic_test.mp4", iterations, framerate = 11) do i
+        @info "step $i"; 
+        iter[] = i; 
+    end
+end
+
+MPI.Barrier(MPI.COMM_WORLD)
\ No newline at end of file
diff --git a/validation/distributed_simulations/mpi_output_writing.jl b/validation/distributed_simulations/mpi_output_writing.jl
index 8926ac9b90..a5092a4c0f 100644
--- a/validation/distributed_simulations/mpi_output_writing.jl
+++ b/validation/distributed_simulations/mpi_output_writing.jl
@@ -9,7 +9,7 @@ rank = MPI.Comm_rank(comm)
 Nranks = MPI.Comm_size(comm)
 
 topology = (Periodic, Periodic, Flat)
-arch = DistributedArch(CPU(); topology, ranks=(1, Nranks, 1))
+arch = DistributedArch(CPU(); topology, ranks=(Nranks, 1, 1))
 grid = RectilinearGrid(arch; topology, size=(16 ÷ Nranks, 16), halo=(3, 3), extent=(2π, 2π))
 
 model = NonhydrostaticModel(; grid)

From 04bd76a9d8f6f64f2435a846404d2d5247d5009e Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 29 Jun 2023 00:09:24 +0200
Subject: [PATCH 367/530] fixed docs

---
 src/Advection/reconstruction_coefficients.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/Advection/reconstruction_coefficients.jl b/src/Advection/reconstruction_coefficients.jl
index 64bee55c80..c4eb6931ba 100644
--- a/src/Advection/reconstruction_coefficients.jl
+++ b/src/Advection/reconstruction_coefficients.jl
@@ -128,11 +128,11 @@ julia> calc_reconstruction_stencil(1, :right, :x)
 julia> calc_reconstruction_stencil(1, :left, :x)
 :(+(convert(FT, coeff1_left[1]) * ψ[i + -1, j, k]))
 
-julia> calc_reconstruction_stencil(1, :symm, :x)
-:(convert(FT, coeff2_symm[2]) * ψ[i + -1, j, k] + convert(FT, coeff2_symm[1]) * ψ[i + 0, j, k])
+julia> calc_reconstruction_stencil(1, :symmetric, :x)
+:(convert(FT, coeff2_symmetric[2]) * ψ[i + -1, j, k] + convert(FT, coeff2_symmetric[1]) * ψ[i + 0, j, k])
 
-julia> calc_reconstruction_stencil(2, :symm, :x)
-:(convert(FT, coeff4_symm[4]) * ψ[i + -2, j, k] + convert(FT, coeff4_symm[3]) * ψ[i + -1, j, k] + convert(FT, coeff4_symm[2]) * ψ[i + 0, j, k] + convert(FT, coeff4_symm[1]) * ψ[i + 1, j, k])
+julia> calc_reconstruction_stencil(2, :symmetric, :x)
+:(convert(FT, coeff4_symmetric[4]) * ψ[i + -2, j, k] + convert(FT, coeff4_symmetric[3]) * ψ[i + -1, j, k] + convert(FT, coeff4_symmetric[2]) * ψ[i + 0, j, k] + convert(FT, coeff4_symmetric[1]) * ψ[i + 1, j, k])
 
 julia> calc_reconstruction_stencil(3, :left, :x)
 :(convert(FT, coeff5_left[5]) * ψ[i + -3, j, k] + convert(FT, coeff5_left[4]) * ψ[i + -2, j, k] + convert(FT, coeff5_left[3]) * ψ[i + -1, j, k] + convert(FT, coeff5_left[2]) * ψ[i + 0, j, k] + convert(FT, coeff5_left[1]) * ψ[i + 1, j, k])

From e640e2aac261389f7af0f25ca464bc876119ea04 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 29 Jun 2023 11:36:03 +0200
Subject: [PATCH 368/530] miscellaneous changes

---
 src/Distributed/halo_communication.jl         | 87 ++++++++++---------
 src/Distributed/halo_communication_bcs.jl     | 12 +--
 src/Distributed/multi_architectures.jl        | 30 +++----
 src/Distributed/partition_assemble.jl         |  1 -
 src/Fields/field_boundary_buffers.jl          |  2 +-
 src/ImmersedBoundaries/active_cells_map.jl    |  3 -
 src/ImmersedBoundaries/mask_immersed_field.jl |  6 --
 .../split_explicit_free_surface.jl            | 10 +--
 .../split_explicit_free_surface_kernels.jl    |  4 +-
 src/Simulations/run.jl                        |  2 +-
 src/TimeSteppers/quasi_adams_bashforth_2.jl   |  6 +-
 src/TurbulenceClosures/TurbulenceClosures.jl  |  4 -
 src/TurbulenceClosures/closure_tuples.jl      |  3 +
 .../mews_vertical_diffusivity.jl              |  4 +-
 src/Utils/kernel_launching.jl                 |  2 +-
 15 files changed, 79 insertions(+), 97 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index a9484366c9..1f990a9c9a 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -1,4 +1,4 @@
-using KernelAbstractions: @kernel, @index, priority!
+using KernelAbstractions: @kernel, @index
 using OffsetArrays: OffsetArray
 using CUDA: cuStreamGetFlags, stream, priority_range, CUstream_flags_enum, CuStream, stream!
 
@@ -107,12 +107,12 @@ function fill_halo_regions!(c::OffsetArray, bcs, indices, loc, grid::Distributed
     
     # This has to be synchronized!!
     fill_send_buffers!(c, buffers, grid)
-    sync_device!(child_architecture(arch))
+    sync_device!(arch)
 
     for task = 1:3
         fill_halo_event!(task, halo_tuple, c, indices, loc, arch, grid, buffers, args...; kwargs...)
     end
-    
+
     fill_corners!(arch.connectivity, c, indices, loc, arch, grid, buffers, args...; kwargs...)
     
     # Switch to the next field to send
@@ -121,30 +121,13 @@ function fill_halo_regions!(c::OffsetArray, bcs, indices, loc, grid::Distributed
     return nothing
 end
 
-for (side, dir) in zip([:southwest, :southeast, :northwest, :northeast], [1, 2, 3, 3])
-    fill_corner_halo! = Symbol("fill_$(side)_halo!")
-    send_side_halo  = Symbol("send_$(side)_halo")
-    recv_and_fill_side_halo! = Symbol("recv_and_fill_$(side)_halo!")
-    fill_side_send_buffers! = Symbol("fill_$(side)_send_buffers!")    
-
-    @eval begin
-        $fill_corner_halo!(::Nothing, args...; kwargs...) = nothing
-
-        function $fill_corner_halo!(corner, c, indices, loc, arch, grid, buffers, args...; kwargs...) 
-            child_arch = child_architecture(arch)
-            local_rank = arch.local_rank
-
-            recv_req = $recv_and_fill_side_halo!(c, grid, arch, loc[$dir], loc, local_rank, corner, buffers)
-            send_req = $send_side_halo(c, grid, arch, loc[$dir], loc, local_rank, corner, buffers)
-            
-            return [send_req, recv_req]
-        end
+# corner passing routine
+function fill_corners!(connectivity, c, indices, loc, arch, grid, buffers, args...; async = false, only_local_halos = false, kwargs...)
+    
+    if only_local_halos 
+        return nothing
     end
-end
 
-# If more than one direction is communicating we need to add a corner passing routine!
-function fill_corners!(connectivity, c, indices, loc, arch, grid, buffers, args...; async = false, kwargs...)
-    
     requests = MPI.Request[]
 
     reqsw = fill_southwest_halo!(connectivity.southwest, c, indices, loc, arch, grid, buffers, args...; kwargs...)
@@ -216,12 +199,36 @@ function fill_halo_event!(task, halo_tuple, c, indices, loc, arch::DistributedAr
     return nothing
 end
 
+#####
+##### fill_$corner_halo! where corner = [:southwest, :southeast, :northwest, :northeast]
+##### 
+
+for side in [:southwest, :southeast, :northwest, :northeast]
+    fill_corner_halo! = Symbol("fill_$(side)_halo!")
+    send_side_halo  = Symbol("send_$(side)_halo")
+    recv_and_fill_side_halo! = Symbol("recv_and_fill_$(side)_halo!")
+
+    @eval begin
+        $fill_corner_halo!(::Nothing, args...; kwargs...) = nothing
+
+        function $fill_corner_halo!(corner, c, indices, loc, arch, grid, buffers, args...; kwargs...) 
+            child_arch = child_architecture(arch)
+            local_rank = arch.local_rank
+
+            recv_req = $recv_and_fill_side_halo!(c, grid, arch, loc, local_rank, corner, buffers)
+            send_req = $send_side_halo(c, grid, arch, loc, local_rank, corner, buffers)
+            
+            return [send_req, recv_req]
+        end
+    end
+end
+
 #####
 ##### fill_west_and_east_halo!   }
 ##### fill_south_and_north_halo! } for when both halos are communicative (Single communicating halos are to be implemented)
 #####
 
-for (side, opposite_side, dir) in zip([:west, :south], [:east, :north], [1, 2])
+for (side, opposite_side) in zip([:west, :south], [:east, :north])
     fill_both_halo! = Symbol("fill_$(side)_and_$(opposite_side)_halo!")
     fill_side_halo! = Symbol("fill_$(side)_halo!")
     send_side_halo  = Symbol("send_$(side)_halo")
@@ -242,11 +249,11 @@ for (side, opposite_side, dir) in zip([:west, :south], [:east, :north], [1, 2])
             @assert bc_side.condition.from == bc_opposite_side.condition.from  # Extra protection in case of bugs
             local_rank = bc_side.condition.from
 
-            recv_req1 = $recv_and_fill_side_halo!(c, grid, arch, loc[$dir], loc, local_rank, bc_side.condition.to, buffers)
-            recv_req2 = $recv_and_fill_opposite_side_halo!(c, grid, arch, loc[$dir], loc, local_rank, bc_opposite_side.condition.to, buffers)
+            recv_req1 = $recv_and_fill_side_halo!(c, grid, arch, loc, local_rank, bc_side.condition.to, buffers)
+            recv_req2 = $recv_and_fill_opposite_side_halo!(c, grid, arch, loc, local_rank, bc_opposite_side.condition.to, buffers)
 
-            send_req1 = $send_side_halo(c, grid, arch, loc[$dir], loc, local_rank, bc_side.condition.to, buffers)
-            send_req2 = $send_opposite_side_halo(c, grid, arch, loc[$dir], loc, local_rank, bc_opposite_side.condition.to, buffers)
+            send_req1 = $send_side_halo(c, grid, arch, loc, local_rank, bc_side.condition.to, buffers)
+            send_req2 = $send_opposite_side_halo(c, grid, arch, loc, local_rank, bc_opposite_side.condition.to, buffers)
 
             return [send_req1, send_req2, recv_req1, recv_req2]
         end
@@ -261,8 +268,8 @@ for (side, opposite_side, dir) in zip([:west, :south], [:east, :north], [1, 2])
             child_arch = child_architecture(arch)
             local_rank = bc_side.condition.from
 
-            recv_req = $recv_and_fill_side_halo!(c, grid, arch, loc[$dir], loc, local_rank, bc_side.condition.to, buffers)
-            send_req = $send_side_halo(c, grid, arch, loc[$dir], loc, local_rank, bc_side.condition.to, buffers)
+            recv_req = $recv_and_fill_side_halo!(c, grid, arch, loc, local_rank, bc_side.condition.to, buffers)
+            send_req = $send_side_halo(c, grid, arch, loc, local_rank, bc_side.condition.to, buffers)
             
             return [send_req, recv_req]
         end
@@ -277,9 +284,9 @@ for (side, opposite_side, dir) in zip([:west, :south], [:east, :north], [1, 2])
             child_arch = child_architecture(arch)
             local_rank = bc_opposite_side.condition.from
 
-            recv_req = $recv_and_fill_opposite_side_halo!(c, grid, arch, loc[$dir], loc, local_rank, bc_opposite_side.condition.to, buffers)
+            recv_req = $recv_and_fill_opposite_side_halo!(c, grid, arch, loc, local_rank, bc_opposite_side.condition.to, buffers)
 
-            send_req = $send_opposite_side_halo(c, grid, arch, loc[$dir], loc, local_rank, bc_opposite_side.condition.to, buffers)
+            send_req = $send_opposite_side_halo(c, grid, arch, loc, local_rank, bc_opposite_side.condition.to, buffers)
 
             return [send_req, recv_req]
         end
@@ -298,8 +305,8 @@ for side in sides
     get_side_send_buffer = Symbol("get_$(side)_send_buffer")
 
     @eval begin
-        function $send_side_halo(c, grid, arch, side_location, location, local_rank, rank_to_send_to, buffers)
-            send_buffer = $get_side_send_buffer(c, grid, side_location, buffers, arch)
+        function $send_side_halo(c, grid, arch, location, local_rank, rank_to_send_to, buffers)
+            send_buffer = $get_side_send_buffer(c, grid, buffers, arch)
             send_tag = $side_send_tag(arch, location)
 
             @debug "Sending " * $side_str * " halo: local_rank=$local_rank, rank_to_send_to=$rank_to_send_to, send_tag=$send_tag"
@@ -309,7 +316,7 @@ for side in sides
             return send_req
         end
 
-        @inline $get_side_send_buffer(c, grid, side_location, buffers, arch) = buffers.$side.send     
+        @inline $get_side_send_buffer(c, grid, buffers, arch) = buffers.$side.send     
     end
 end
 
@@ -325,8 +332,8 @@ for side in sides
     get_side_recv_buffer = Symbol("get_$(side)_recv_buffer")
 
     @eval begin
-        function $recv_and_fill_side_halo!(c, grid, arch, side_location, location, local_rank, rank_to_recv_from, buffers)
-            recv_buffer = $get_side_recv_buffer(c, grid, side_location, buffers, arch)
+        function $recv_and_fill_side_halo!(c, grid, arch, location, local_rank, rank_to_recv_from, buffers)
+            recv_buffer = $get_side_recv_buffer(c, grid, buffers, arch)
             recv_tag = $side_recv_tag(arch, location)
 
             @debug "Receiving " * $side_str * " halo: local_rank=$local_rank, rank_to_recv_from=$rank_to_recv_from, recv_tag=$recv_tag"
@@ -335,6 +342,6 @@ for side in sides
             return recv_req
         end
 
-        @inline $get_side_recv_buffer(c, grid, side_location, buffers, arch) = buffers.$side.recv
+        @inline $get_side_recv_buffer(c, grid, buffers, arch) = buffers.$side.recv
     end
 end
diff --git a/src/Distributed/halo_communication_bcs.jl b/src/Distributed/halo_communication_bcs.jl
index 7988ad4bf6..b7bfb3c482 100644
--- a/src/Distributed/halo_communication_bcs.jl
+++ b/src/Distributed/halo_communication_bcs.jl
@@ -16,30 +16,24 @@ function inject_halo_communication_boundary_conditions(field_bcs, local_rank, co
     rank_west   = connectivity.west
     rank_north  = connectivity.north
     rank_south  = connectivity.south
-    rank_top    = connectivity.top
-    rank_bottom = connectivity.bottom
 
     east_comm_ranks   = HaloCommunicationRanks(from=local_rank, to=rank_east)
     west_comm_ranks   = HaloCommunicationRanks(from=local_rank, to=rank_west)
     north_comm_ranks  = HaloCommunicationRanks(from=local_rank, to=rank_north)
     south_comm_ranks  = HaloCommunicationRanks(from=local_rank, to=rank_south)
-    top_comm_ranks    = HaloCommunicationRanks(from=local_rank, to=rank_top)
-    bottom_comm_ranks = HaloCommunicationRanks(from=local_rank, to=rank_bottom)
 
     east_comm_bc   = DistributedCommunicationBoundaryCondition(east_comm_ranks)
     west_comm_bc   = DistributedCommunicationBoundaryCondition(west_comm_ranks)
     north_comm_bc  = DistributedCommunicationBoundaryCondition(north_comm_ranks)
     south_comm_bc  = DistributedCommunicationBoundaryCondition(south_comm_ranks)
-    top_comm_bc    = DistributedCommunicationBoundaryCondition(top_comm_ranks)
-    bottom_comm_bc = DistributedCommunicationBoundaryCondition(bottom_comm_ranks)
 
     west     = isnothing(rank_west)   ? field_bcs.west   : west_comm_bc
     east     = isnothing(rank_east)   ? field_bcs.east   : east_comm_bc
     south    = isnothing(rank_south)  ? field_bcs.south  : south_comm_bc
     north    = isnothing(rank_north)  ? field_bcs.north  : north_comm_bc
-    bottom   = isnothing(rank_bottom) ? field_bcs.bottom : bottom_comm_bc
-    top      = isnothing(rank_top)    ? field_bcs.top    : top_comm_bc
-
+    
+    bottom   = field_bcs.bottom 
+    top      = field_bcs.top    
     immersed = field_bcs.immersed
 
     return FieldBoundaryConditions(west, east, south, north, bottom, top, immersed)
diff --git a/src/Distributed/multi_architectures.jl b/src/Distributed/multi_architectures.jl
index 7f14cd7a47..e70cc5b455 100644
--- a/src/Distributed/multi_architectures.jl
+++ b/src/Distributed/multi_architectures.jl
@@ -131,21 +131,19 @@ end
 ##### Rank connectivity graph
 #####
 
-struct RankConnectivity{E, W, N, S, T, B, SW, SE, NW, NE}
+struct RankConnectivity{E, W, N, S, SW, SE, NW, NE}
          east :: E
          west :: W
         north :: N
         south :: S
-          top :: T
-       bottom :: B
     southwest :: SW
     southeast :: SE
     northwest :: NW
     northeast :: NE
 end
 
-RankConnectivity(; east, west, north, south, top, bottom, southwest, southeast, northwest, northeast) =
-    RankConnectivity(east, west, north, south, top, bottom, southwest, southeast, northwest, northeast)
+RankConnectivity(; east, west, north, south, southwest, southeast, northwest, northeast) =
+    RankConnectivity(east, west, north, south, southwest, southeast, northwest, northeast)
 
 # The "Periodic" topologies are `Periodic`, `FullyConnected` and `RightConnected`
 # The "Bounded" topologies are `Bounded` and `LeftConnected`
@@ -185,23 +183,19 @@ function RankConnectivity(model_index, ranks, topology)
     i_west  = decrement_index(i, Rx, TX)
     j_north = increment_index(j, Ry, TY)
     j_south = decrement_index(j, Ry, TY)
-    k_top   = increment_index(k, Rz, TZ)
-    k_bot   = decrement_index(k, Rz, TZ)
 
-    r_east  = isnothing(i_east)  ? nothing : index2rank(i_east, j, k, Rx, Ry, Rz)
-    r_west  = isnothing(i_west)  ? nothing : index2rank(i_west, j, k, Rx, Ry, Rz)
+    r_east  = isnothing(i_east)  ? nothing : index2rank(i_east,  j, k, Rx, Ry, Rz)
+    r_west  = isnothing(i_west)  ? nothing : index2rank(i_west,  j, k, Rx, Ry, Rz)
     r_north = isnothing(j_north) ? nothing : index2rank(i, j_north, k, Rx, Ry, Rz)
     r_south = isnothing(j_south) ? nothing : index2rank(i, j_south, k, Rx, Ry, Rz)
-    r_top   = isnothing(k_top)   ? nothing : index2rank(i, j, k_top, Rx, Ry, Rz)
-    r_bot   = isnothing(k_bot)   ? nothing : index2rank(i, j, k_bot, Rx, Ry, Rz)
 
-    r_northeast = isnothing(i_east) || isnothing(j_north) ? nothing : index2rank(i_east, j_north, k, Rx, Ry, Rz)
-    r_northwest = isnothing(i_west) || isnothing(j_north) ? nothing : index2rank(i_west, j_north, k, Rx, Ry, Rz)
-    r_southeast = isnothing(i_east) || isnothing(j_south) ? nothing : index2rank(i_east, j_south, k, Rx, Ry, Rz)
-    r_southwest = isnothing(i_west) || isnothing(j_south) ? nothing : index2rank(i_west, j_south, k, Rx, Ry, Rz)
+    r_northeast = isnothing(i_east) && isnothing(j_north) ? nothing : index2rank(i_east, j_north, k, Rx, Ry, Rz)
+    r_northwest = isnothing(i_west) && isnothing(j_north) ? nothing : index2rank(i_west, j_north, k, Rx, Ry, Rz)
+    r_southeast = isnothing(i_east) && isnothing(j_south) ? nothing : index2rank(i_east, j_south, k, Rx, Ry, Rz)
+    r_southwest = isnothing(i_west) && isnothing(j_south) ? nothing : index2rank(i_west, j_south, k, Rx, Ry, Rz)
 
-    return RankConnectivity(east=r_east, west=r_west, north=r_north,
-                            south=r_south, top=r_top, bottom=r_bot,
+    return RankConnectivity(west=r_west, east=r_east, 
+                            south=r_south, north=r_north,
                             southwest=r_southwest,
                             southeast=r_southeast,
                             northwest=r_northwest,
@@ -221,8 +215,6 @@ function Base.show(io::IO, arch::DistributedArch)
               isnothing(c.west) ? "" : " west=$(c.west)",
               isnothing(c.north) ? "" : " north=$(c.north)",
               isnothing(c.south) ? "" : " south=$(c.south)",
-              isnothing(c.top) ? "" : " top=$(c.top)",
-              isnothing(c.bottom) ? "" : " bottom=$(c.bottom)",
               isnothing(c.southwest) ? "" : " southwest=$(c.southwest)",
               isnothing(c.southeast) ? "" : " southeast=$(c.southeast)",
               isnothing(c.northwest) ? "" : " northwest=$(c.northwest)",
diff --git a/src/Distributed/partition_assemble.jl b/src/Distributed/partition_assemble.jl
index 21d36d0e56..92e59655d4 100644
--- a/src/Distributed/partition_assemble.jl
+++ b/src/Distributed/partition_assemble.jl
@@ -43,7 +43,6 @@ end
 # Used for grid constructors (cpu_face_constructor_x, cpu_face_constructor_y, cpu_face_constructor_z)
 # which means that we need to repeat the value at the right boundary
 
-# Have to fix this! This won't work for face constructors??
 function partition(c::AbstractVector, n, R, r)
     nl = concatenate_local_sizes(n, R, r)
     return c[1 + sum(nl[1:r-1]) : 1 + sum(nl[1:r])]
diff --git a/src/Fields/field_boundary_buffers.jl b/src/Fields/field_boundary_buffers.jl
index ce6d2e5324..1ff39be223 100644
--- a/src/Fields/field_boundary_buffers.jl
+++ b/src/Fields/field_boundary_buffers.jl
@@ -16,7 +16,7 @@ struct FieldBoundaryBuffers{W, E, S, N, SW, SE, NW, NE}
    northeast :: NE
 end
 
-FieldBoundaryBuffers() = FieldBoundaryBuffers(nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing)
+FieldBoundaryBuffers() = nothing
 FieldBoundaryBuffers(grid, data, ::Missing) = nothing
 FieldBoundaryBuffers(grid, data, ::Nothing) = nothing
 
diff --git a/src/ImmersedBoundaries/active_cells_map.jl b/src/ImmersedBoundaries/active_cells_map.jl
index 77ce308117..f352ffbf76 100644
--- a/src/ImmersedBoundaries/active_cells_map.jl
+++ b/src/ImmersedBoundaries/active_cells_map.jl
@@ -31,12 +31,9 @@ function ImmersedBoundaryGrid(grid, ib, active_cells_map::Bool)
     if active_cells_map 
         map_interior = active_cells_map_interior(ibg)
         map_interior = arch_array(architecture(ibg), map_interior)
-
-        map_surface  = nothing
         # map_surface = active_cells_map_surface(ibg)
         # map_surface = arch_array(architecture(ibg), map_surface)
     else
-        map_surface  = nothing
         map_interior = nothing
     end
 
diff --git a/src/ImmersedBoundaries/mask_immersed_field.jl b/src/ImmersedBoundaries/mask_immersed_field.jl
index 1422a7297a..4bd2878eec 100644
--- a/src/ImmersedBoundaries/mask_immersed_field.jl
+++ b/src/ImmersedBoundaries/mask_immersed_field.jl
@@ -50,12 +50,6 @@ end
     @inbounds field[i, j, k] = scalar_mask(i, j, k, grid, grid.immersed_boundary, loc..., value, field, mask)
 end
 
-#####
-##### mask_immersed_velocities for NonhydrostaticModel
-#####
-
-mask_immersed_velocities!(U, arch, grid) = nothing
-
 #####
 ##### Masking for GridFittedBoundary
 #####
diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl
index 6e5985996d..c9b748eb0b 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl
@@ -71,10 +71,10 @@ Keyword Arguments
   - `ForwardBackwardScheme()` (default): `η = f(U)`   then `U = f(η)`,
   - `AdamsBashforth3Scheme()`: `η = f(U, Uᵐ⁻¹, Uᵐ⁻²)` then `U = f(η, ηᵐ, ηᵐ⁻¹, ηᵐ⁻²)`.
 """
-SplitExplicitFreeSurface(FT::DataType=Float64; gravitational_acceleration = g_Earth, kwargs...) =
-    SplitExplicitFreeSurface(nothing, nothing, nothing,
-                             FT(gravitational_acceleration), SplitExplicitSettings(FT; kwargs...))
-
+SplitExplicitFreeSurface(FT::DataType = Float64; gravitational_acceleration = g_Earth, kwargs...) = 
+    SplitExplicitFreeSurface(nothing, nothing, nothing, convert(FT, gravitational_acceleration),
+                             SplitExplicitSettings(; gravitational_acceleration, kwargs...))
+                             
 # The new constructor is defined later on after the state, settings, auxiliary have been defined
 function FreeSurface(free_surface::SplitExplicitFreeSurface, velocities, grid)
     η =  FreeSurfaceDisplacementField(velocities, free_surface, grid)
@@ -270,7 +270,7 @@ function FixedTimeStepSize(FT::DataType = Float64;
 
     wave_speed = sqrt(gravitational_acceleration * grid.Lz)
     
-    Δt_barotopic = FT(cfl * Δs / wave_speed)
+    Δt_barotopic = convert(FT, cfl * Δs / wave_speed)
 
     return FixedTimeStepSize(Δt_barotopic, averaging_kernel)
 end
diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
index ae8b4b0c63..9f23e1cd9c 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
@@ -349,8 +349,8 @@ end
     end	
 end
 
-@inline ab2_step_Gu(i, j, k, grid, G⁻, Gⁿ, χ) = ifelse(peripheral_node(i, j, k, grid, f, c, c), zero(grid), (1.5 + χ) *  Gⁿ[i, j, k] - G⁻[i, j, k] * (0.5 + χ))
-@inline ab2_step_Gv(i, j, k, grid, G⁻, Gⁿ, χ) = ifelse(peripheral_node(i, j, k, grid, c, f, c), zero(grid), (1.5 + χ) *  Gⁿ[i, j, k] - G⁻[i, j, k] * (0.5 + χ))
+@inline ab2_step_Gu(i, j, k, grid, G⁻, Gⁿ, χ::FT) where FT = ifelse(peripheral_node(i, j, k, grid, f, c, c), zero(grid), (convert(FT, 1.5) + χ) *  Gⁿ[i, j, k] - G⁻[i, j, k] * (convert(FT, 0.5) + χ))
+@inline ab2_step_Gv(i, j, k, grid, G⁻, Gⁿ, χ::FT) where FT = ifelse(peripheral_node(i, j, k, grid, c, f, c), zero(grid), (convert(FT, 1.5) + χ) *  Gⁿ[i, j, k] - G⁻[i, j, k] * (convert(FT, 0.5) + χ))
 
 # Setting up the RHS for the barotropic step (tendencies of the barotopic velocity components)
 # This function is called after `calculate_tendency` and before `ab2_step_velocities!`
diff --git a/src/Simulations/run.jl b/src/Simulations/run.jl
index e063a48032..f8513fda27 100644
--- a/src/Simulations/run.jl
+++ b/src/Simulations/run.jl
@@ -55,7 +55,7 @@ function aligned_time_step(sim::Simulation, Δt)
     # Temporary fix for https://github.com/CliMA/Oceananigans.jl/issues/1280
     aligned_Δt = aligned_Δt <= 0 ? Δt : aligned_Δt
 
-    return FT(aligned_Δt)
+    return convert(FT, aligned_Δt)
 end
 
 """
diff --git a/src/TimeSteppers/quasi_adams_bashforth_2.jl b/src/TimeSteppers/quasi_adams_bashforth_2.jl
index 80d9807900..1ac9ede8f7 100644
--- a/src/TimeSteppers/quasi_adams_bashforth_2.jl
+++ b/src/TimeSteppers/quasi_adams_bashforth_2.jl
@@ -47,7 +47,7 @@ function QuasiAdamsBashforth2TimeStepper(grid, tracers,
 
     FT = eltype(grid)
     GT = typeof(Gⁿ)
-    χ  = FT(χ)
+    χ  = convert(FT, χ)
 
     return QuasiAdamsBashforth2TimeStepper{FT, GT, IT}(χ, Inf, Gⁿ, G⁻, implicit_solver)
 end
@@ -151,8 +151,8 @@ Time step velocity fields via the 2nd-order quasi Adams-Bashforth method
     i, j, k = @index(Global, NTuple)
 
     FT = eltype(χ)
-    one_point_five = FT(1.5)
-    oh_point_five  = FT(0.5)
+    one_point_five = convert(FT, 1.5)
+    oh_point_five  = convert(FT, 0.5)
 
     @inbounds u[i, j, k] += Δt * ((one_point_five + χ) * Gⁿ[i, j, k] - (oh_point_five + χ) * G⁻[i, j, k])
 end
diff --git a/src/TurbulenceClosures/TurbulenceClosures.jl b/src/TurbulenceClosures/TurbulenceClosures.jl
index 4663499090..e4b2922064 100644
--- a/src/TurbulenceClosures/TurbulenceClosures.jl
+++ b/src/TurbulenceClosures/TurbulenceClosures.jl
@@ -81,10 +81,6 @@ calculate_diffusivities!(K, closure::AbstractTurbulenceClosure, args...; kwargs.
 const ClosureKinda = Union{Nothing, AbstractTurbulenceClosure, AbstractArray{<:AbstractTurbulenceClosure}}
 add_closure_specific_boundary_conditions(closure::ClosureKinda, bcs, args...) = bcs
 
-import Oceananigans.Utils: KernelParameters
-
-KernelParameters(grid::AbstractGrid, closure) = KernelParameters(size(grid), (0, 0, 0))
-
 # Interface for KE-based closures
 function shear_production end
 function buoyancy_flux end
diff --git a/src/TurbulenceClosures/closure_tuples.jl b/src/TurbulenceClosures/closure_tuples.jl
index fb4b64e604..3ed7ce6f1f 100644
--- a/src/TurbulenceClosures/closure_tuples.jl
+++ b/src/TurbulenceClosures/closure_tuples.jl
@@ -86,6 +86,9 @@ function add_closure_specific_boundary_conditions(closure_tuple::Tuple, bcs, arg
     return bcs
 end
 
+boundary_buffer(closure_tuple::Tuple)    = max(map(boundary_buffer, closure_tuple))
+required_halo_size(closure_tuple::Tuple) = max(map(required_halo_size, closure_tuple))
+
 #####
 ##### Compiler-inferrable time_discretization for tuples
 #####
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/mews_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/mews_vertical_diffusivity.jl
index 8096cb6f1b..27806fc60c 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/mews_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/mews_vertical_diffusivity.jl
@@ -115,7 +115,7 @@ function Base.show(io::IO, closure::MEWS)
                  "    Cᴰ  : ", closure.Cᴰ))
 end
 
-function calculate_diffusivities!(diffusivities, closure::MEWS, model)
+function calculate_diffusivities!(diffusivities, closure::MEWS, model; parameters = :xyz)
     arch = model.architecture
     grid = model.grid
     clock = model.clock
@@ -124,7 +124,7 @@ function calculate_diffusivities!(diffusivities, closure::MEWS, model)
     buoyancy = model.buoyancy
     velocities = model.velocities
 
-    launch!(arch, grid, :xyz,
+    launch!(arch, grid, parameters,
             _compute_mews_diffusivities!,
             diffusivities,
             grid,
diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index fcf9de814a..ba4ae890bb 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -14,8 +14,8 @@ KernelParameters(size, offsets) = KernelParameters{size, offsets}()
 worktuple(::KernelParameters{S}) where S = S
 offsets(::KernelParameters{S, O}) where {S, O} = O
 
-offsets(workspec)  = nothing
 worktuple(workspec) = workspec
+offsets(workspec)  = nothing
 
 flatten_reduced_dimensions(worksize, dims) = Tuple(i ∈ dims ? 1 : worksize[i] for i = 1:3)
 

From 53336099e3c7fb066ae2c6e9f4255a8b0d90bee7 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 29 Jun 2023 11:56:03 +0200
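Subject: [PATCH 369/530] bugfix

Use `||` instead of `&&` when computing the corner ranks in
`RankConnectivity`, so that a corner rank is `nothing` as soon as either
of the two neighboring indices is `nothing`. Also replace
`max(map(...))` with `maximum(map(...))` in `boundary_buffer` and
`required_halo_size` for closure tuples.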

---
 src/Distributed/multi_architectures.jl   | 8 ++++----
 src/TurbulenceClosures/closure_tuples.jl | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/Distributed/multi_architectures.jl b/src/Distributed/multi_architectures.jl
index e70cc5b455..03f9cf45f2 100644
--- a/src/Distributed/multi_architectures.jl
+++ b/src/Distributed/multi_architectures.jl
@@ -189,10 +189,10 @@ function RankConnectivity(model_index, ranks, topology)
     r_north = isnothing(j_north) ? nothing : index2rank(i, j_north, k, Rx, Ry, Rz)
     r_south = isnothing(j_south) ? nothing : index2rank(i, j_south, k, Rx, Ry, Rz)
 
-    r_northeast = isnothing(i_east) && isnothing(j_north) ? nothing : index2rank(i_east, j_north, k, Rx, Ry, Rz)
-    r_northwest = isnothing(i_west) && isnothing(j_north) ? nothing : index2rank(i_west, j_north, k, Rx, Ry, Rz)
-    r_southeast = isnothing(i_east) && isnothing(j_south) ? nothing : index2rank(i_east, j_south, k, Rx, Ry, Rz)
-    r_southwest = isnothing(i_west) && isnothing(j_south) ? nothing : index2rank(i_west, j_south, k, Rx, Ry, Rz)
+    r_northeast = isnothing(i_east) || isnothing(j_north) ? nothing : index2rank(i_east, j_north, k, Rx, Ry, Rz)
+    r_northwest = isnothing(i_west) || isnothing(j_north) ? nothing : index2rank(i_west, j_north, k, Rx, Ry, Rz)
+    r_southeast = isnothing(i_east) || isnothing(j_south) ? nothing : index2rank(i_east, j_south, k, Rx, Ry, Rz)
+    r_southwest = isnothing(i_west) || isnothing(j_south) ? nothing : index2rank(i_west, j_south, k, Rx, Ry, Rz)
 
     return RankConnectivity(west=r_west, east=r_east, 
                             south=r_south, north=r_north,
diff --git a/src/TurbulenceClosures/closure_tuples.jl b/src/TurbulenceClosures/closure_tuples.jl
index 3ed7ce6f1f..45c1850b83 100644
--- a/src/TurbulenceClosures/closure_tuples.jl
+++ b/src/TurbulenceClosures/closure_tuples.jl
@@ -86,8 +86,8 @@ function add_closure_specific_boundary_conditions(closure_tuple::Tuple, bcs, arg
     return bcs
 end
 
-boundary_buffer(closure_tuple::Tuple)    = max(map(boundary_buffer, closure_tuple))
-required_halo_size(closure_tuple::Tuple) = max(map(required_halo_size, closure_tuple))
+boundary_buffer(closure_tuple::Tuple)    = maximum(map(boundary_buffer, closure_tuple))
+required_halo_size(closure_tuple::Tuple) = maximum(map(required_halo_size, closure_tuple))
 
 #####
 ##### Compiler-inferrable time_discretization for tuples

From aaf6f259285d0d62365397d9ae01e08f754b3e05 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 29 Jun 2023 13:40:59 +0200
Subject: [PATCH 370/530] removed tests for vertical subdivision

---
 test/test_distributed_models.jl | 139 +++-----------------------------
 1 file changed, 12 insertions(+), 127 deletions(-)

diff --git a/test/test_distributed_models.jl b/test/test_distributed_models.jl
index a039afefcb..2e18766537 100644
--- a/test/test_distributed_models.jl
+++ b/test/test_distributed_models.jl
@@ -63,18 +63,18 @@ north_halo(f::AbstractField{LX, LY, LZ}; include_corners=true) where {LX, LY, LZ
                       view(f.data, interior_indices(instantiate(LX), instantiate(topology(f, 1)), f.grid.Nx),
                                    right_halo_indices(instantiate(LY), instantiate(topology(f, 2)), f.grid.Ny, f.grid.Hy),
                                    interior_indices(instantiate(LZ), instantiate(topology(f, 3)), f.grid.Nz))
-
+                        
 bottom_halo(f::AbstractField{LX, LY, LZ}; include_corners=true) where {LX, LY, LZ} =
-    include_corners ? view(f.data, :, :, left_halo_indices(instantiate(LZ), instantiate(topology(f, 3)), f.grid.Nz, f.grid.Hz)) :
-                      view(f.data, interior_indices(instantiate(LX), instantiate(topology(f, 1)), f.grid.Nx),
-                                   interior_indices(instantiate(LY), instantiate(topology(f, 2)), f.grid.Ny),
-                                   left_halo_indices(instantiate(LZ), instantiate(topology(f, 3)), f.grid.Nz, f.grid.Hz))
+include_corners ? view(f.data, :, :, left_halo_indices(instantiate(LZ), instantiate(topology(f, 3)), f.grid.Nz, f.grid.Hz)) :
+                  view(f.data, interior_indices(instantiate(LX), instantiate(topology(f, 1)), f.grid.Nx),
+                               interior_indices(instantiate(LY), instantiate(topology(f, 2)), f.grid.Ny),
+                               left_halo_indices(instantiate(LZ), instantiate(topology(f, 3)), f.grid.Nz, f.grid.Hz))
 
 top_halo(f::AbstractField{LX, LY, LZ}; include_corners=true) where {LX, LY, LZ} =
-    include_corners ? view(f.data, :, :, right_halo_indices(instantiate(LZ), instantiate(topology(f, 3)), f.grid.Nz, f.grid.Hz)) :
-                      view(f.data, interior_indices(instantiate(LX), instantiate(topology(f, 1)), f.grid.Nx),
-                                   interior_indices(instantiate(LY), instantiate(topology(f, 2)), f.grid.Ny),
-                                   right_halo_indices(instantiate(LZ), instantiate(topology(f, 3)), f.grid.Nz, f.grid.Hz))
+include_corners ? view(f.data, :, :, right_halo_indices(instantiate(LZ), instantiate(topology(f, 3)), f.grid.Nz, f.grid.Hz)) :
+                  view(f.data, interior_indices(instantiate(LX), instantiate(topology(f, 1)), f.grid.Nx),
+                               interior_indices(instantiate(LY), instantiate(topology(f, 2)), f.grid.Ny),
+                               right_halo_indices(instantiate(LZ), instantiate(topology(f, 3)), f.grid.Nz, f.grid.Hz))
 
 # Right now just testing with 4 ranks!
 comm = MPI.COMM_WORLD
@@ -94,11 +94,9 @@ function test_triply_periodic_rank_connectivity_with_411_ranks()
 
     connectivity = arch.connectivity
 
-    # No communication in y and z.
+    # No communication in y.
     @test isnothing(connectivity.south)
     @test isnothing(connectivity.north)
-    @test isnothing(connectivity.top)
-    @test isnothing(connectivity.bottom)
 
     # +---+---+---+---+
     # | 0 | 1 | 2 | 3 |
@@ -130,11 +128,9 @@ function test_triply_periodic_rank_connectivity_with_141_ranks()
 
     connectivity = arch.connectivity
 
-    # No communication in x and z.
+    # No communication in x.
     @test isnothing(connectivity.east)
     @test isnothing(connectivity.west)
-    @test isnothing(connectivity.top)
-    @test isnothing(connectivity.bottom)
 
     # +---+
     # | 3 |
@@ -163,51 +159,6 @@ function test_triply_periodic_rank_connectivity_with_141_ranks()
     return nothing
 end
 
-function test_triply_periodic_rank_connectivity_with_114_ranks()
-    topo = (Periodic, Periodic, Periodic)
-    arch = DistributedArch(CPU(), ranks=(1, 1, 4), topology = topo)
-
-    local_rank = MPI.Comm_rank(MPI.COMM_WORLD)
-    @test local_rank == index2rank(arch.local_index..., arch.ranks...)
-
-    connectivity = arch.connectivity
-
-    # No communication in x and y.
-    @test isnothing(connectivity.east)
-    @test isnothing(connectivity.west)
-    @test isnothing(connectivity.north)
-    @test isnothing(connectivity.south)
-
-    #   /---/
-    #  / 3 /
-    # /---/
-    #   /---/
-    #  / 2 /
-    # /---/
-    #   /---/
-    #  / 1 /
-    # /---/
-    #   /---/
-    #  / 0 /
-    # /---/
-
-    if local_rank == 0
-        @test connectivity.top == 1
-        @test connectivity.bottom == 3
-    elseif local_rank == 1
-        @test connectivity.top == 2
-        @test connectivity.bottom == 0
-    elseif local_rank == 2
-        @test connectivity.top == 3
-        @test connectivity.bottom == 1
-    elseif local_rank == 3
-        @test connectivity.top == 0
-        @test connectivity.bottom == 2
-    end
-
-    return nothing
-end
-
 function test_triply_periodic_rank_connectivity_with_221_ranks()
     topo = (Periodic, Periodic, Periodic)
     arch = DistributedArch(CPU(), ranks=(2, 2, 1), topology = topo)
@@ -216,11 +167,7 @@ function test_triply_periodic_rank_connectivity_with_221_ranks()
     @test local_rank == index2rank(arch.local_index..., arch.ranks...)
 
     connectivity = arch.connectivity
-
-    # No communication in z.
-    @test isnothing(connectivity.top)
-    @test isnothing(connectivity.bottom)
-
+    
     # +---+---+
     # | 0 | 2 |
     # +---+---+
@@ -292,24 +239,6 @@ function test_triply_periodic_local_grid_with_141_ranks()
     return nothing
 end
 
-function test_triply_periodic_local_grid_with_114_ranks()
-    topo = (Periodic, Periodic, Periodic)
-    arch = DistributedArch(CPU(), ranks=(1, 1, 4), topology = topo)
-    local_grid = RectilinearGrid(arch, topology=topo, size=(8, 8, 2), extent=(1, 2, 3))
-    
-    local_rank = MPI.Comm_rank(MPI.COMM_WORLD)
-    nx, ny, nz = size(local_grid)
-
-    @test local_grid.xᶠᵃᵃ[1] == 0
-    @test local_grid.xᶠᵃᵃ[nx+1] == 1
-    @test local_grid.yᵃᶠᵃ[1] == 0
-    @test local_grid.yᵃᶠᵃ[ny+1] == 2
-    @test local_grid.zᵃᵃᶠ[1] == -3 + 0.75*local_rank
-    @test local_grid.zᵃᵃᶠ[nz+1] == -3 + 0.75*(local_rank+1)
-
-    return nothing
-end
-
 function test_triply_periodic_local_grid_with_221_ranks()
     topo = (Periodic, Periodic, Periodic)
     arch = DistributedArch(CPU(), ranks=(2, 2, 1), topology = topo)
@@ -368,23 +297,6 @@ function test_triply_periodic_bc_injection_with_141_ranks()
     end
 end
 
-function test_triply_periodic_bc_injection_with_114_ranks()
-    topo = (Periodic, Periodic, Periodic)
-    arch = DistributedArch(ranks=(1, 1, 4), topology=topo)
-    grid = RectilinearGrid(arch, topology=topo, size=(8, 8, 2), extent=(1, 2, 3))
-    model = NonhydrostaticModel(grid=grid)
-
-    for field in merge(fields(model))
-        fbcs = field.boundary_conditions
-        @test !isa(fbcs.east, DCBC)
-        @test !isa(fbcs.west, DCBC)
-        @test !isa(fbcs.north, DCBC)
-        @test !isa(fbcs.south, DCBC)
-        @test fbcs.top isa DCBC
-        @test fbcs.bottom isa DCBC
-    end
-end
-
 function test_triply_periodic_bc_injection_with_221_ranks()
     topo = (Periodic, Periodic, Periodic)
     arch = DistributedArch(ranks=(2, 2, 1), topology=topo)
@@ -452,29 +364,6 @@ function test_triply_periodic_halo_communication_with_141_ranks(halo, child_arch
     return nothing
 end
 
-function test_triply_periodic_halo_communication_with_114_ranks(halo, child_arch)
-    topo = (Periodic, Periodic, Periodic)
-    arch = DistributedArch(child_arch; ranks=(1, 1, 4), topology=topo, devices = (0, 0, 0, 0))
-    grid = RectilinearGrid(arch, topology=topo, size=(4, 4, 4), extent=(1, 2, 3), halo=halo)
-    model = NonhydrostaticModel(grid=grid)
-
-    for field in merge(fields(model))
-        interior(field) .= arch.local_rank
-        fill_halo_regions!(field)
-
-        @test all(top_halo(field, include_corners=false) .== arch.connectivity.top)
-        @test all(bottom_halo(field, include_corners=false) .== arch.connectivity.bottom)
-
-        @test all(interior(field) .== arch.local_rank)
-        @test all(east_halo(field, include_corners=false) .== arch.local_rank)
-        @test all(west_halo(field, include_corners=false) .== arch.local_rank)
-        @test all(north_halo(field, include_corners=false) .== arch.local_rank)
-        @test all(south_halo(field, include_corners=false) .== arch.local_rank)
-    end
-
-    return nothing
-end
-
 function test_triply_periodic_halo_communication_with_221_ranks(halo, child_arch)
     topo = (Periodic, Periodic, Periodic)
     arch = DistributedArch(child_arch; ranks=(2, 2, 1), topology=topo, devices = (0, 0, 0, 0))
@@ -514,7 +403,6 @@ end
         @info "  Testing multi architecture rank connectivity..."
         test_triply_periodic_rank_connectivity_with_411_ranks()
         test_triply_periodic_rank_connectivity_with_141_ranks()
-        # test_triply_periodic_rank_connectivity_with_114_ranks()
         test_triply_periodic_rank_connectivity_with_221_ranks()
     end
 
@@ -522,7 +410,6 @@ end
         @info "  Testing local grids for distributed models..."
         test_triply_periodic_local_grid_with_411_ranks()
         test_triply_periodic_local_grid_with_141_ranks()
-        # test_triply_periodic_local_grid_with_114_ranks()
         test_triply_periodic_local_grid_with_221_ranks()
     end
 
@@ -530,7 +417,6 @@ end
         @info "  Testing injection of halo communication BCs..."
         test_triply_periodic_bc_injection_with_411_ranks()
         test_triply_periodic_bc_injection_with_141_ranks()
-        # test_triply_periodic_bc_injection_with_114_ranks()
         test_triply_periodic_bc_injection_with_221_ranks()
     end
 
@@ -540,7 +426,6 @@ end
             for H in 1:3
                 test_triply_periodic_halo_communication_with_411_ranks((H, H, H), child_arch)
                 test_triply_periodic_halo_communication_with_141_ranks((H, H, H), child_arch)
-                # test_triply_periodic_halo_communication_with_114_ranks((H, H, H), child_arch)
                 test_triply_periodic_halo_communication_with_221_ranks((H, H, H), child_arch)
             end
         end

From c6fcc903009dd9883e3f02376b7e55317a7c02fc Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 29 Jun 2023 14:37:57 +0200
Subject: [PATCH 371/530] test corner passing

---
 test/test_distributed_models.jl | 11 +++++++++
 test_partitioning.jl            | 44 ---------------------------------
 2 files changed, 11 insertions(+), 44 deletions(-)
 delete mode 100644 test_partitioning.jl

diff --git a/test/test_distributed_models.jl b/test/test_distributed_models.jl
index 2e18766537..36603b3ee8 100644
--- a/test/test_distributed_models.jl
+++ b/test/test_distributed_models.jl
@@ -76,6 +76,12 @@ include_corners ? view(f.data, :, :, right_halo_indices(instantiate(LZ), instant
                                interior_indices(instantiate(LY), instantiate(topology(f, 2)), f.grid.Ny),
                                right_halo_indices(instantiate(LZ), instantiate(topology(f, 3)), f.grid.Nz, f.grid.Hz))
 
+
+southwest_halo(f::AbstractField) = view(f.data, -halo_size(grid)[1]:0, -halo_size(grid)[2]:0, :)
+southeast_halo(f::AbstractField) = view(f.data, size(grid, 1)+1:size(grid, 1)+halo_size(grid)[1], -halo_size(grid)[2]:0, :)
+northeast_halo(f::AbstractField) = view(f.data, size(grid, 1)+1:size(grid, 1)+halo_size(grid)[1], size(grid, 2)+1:size(grid, 2)+halo_size(grid)[2], :)
+northwest_halo(f::AbstractField) = view(f.data, -halo_size(grid)[1]:0, size(grid, 2)+1:size(grid, 2)+halo_size(grid)[2], :)
+
 # Right now just testing with 4 ranks!
 comm = MPI.COMM_WORLD
 mpi_ranks = MPI.Comm_size(comm)
@@ -382,6 +388,11 @@ function test_triply_periodic_halo_communication_with_221_ranks(halo, child_arch
         @test all(interior(field) .== arch.local_rank)
         @test all(top_halo(field, include_corners=false) .== arch.local_rank)
         @test all(bottom_halo(field, include_corners=false) .== arch.local_rank)
+
+        @test all(southwest_halo(field) .== arch.connectivity.southwest) 
+        @test all(southeast_halo(field) .== arch.connectivity.southeast) 
+        @test all(northwest_halo(field) .== arch.connectivity.northwest) 
+        @test all(northeast_halo(field) .== arch.connectivity.northeast) 
     end
 
     return nothing
diff --git a/test_partitioning.jl b/test_partitioning.jl
deleted file mode 100644
index a9b11f0295..0000000000
--- a/test_partitioning.jl
+++ /dev/null
@@ -1,44 +0,0 @@
-using Oceananigans
-using Oceananigans.Distributed
-using Oceananigans.Distributed: partition_global_array
-using Oceananigans.Grids: architecture
-using Oceananigans.Units
-using MPI
-
-MPI.Init()
-
-comm   = MPI.COMM_WORLD
-rank   = MPI.Comm_rank(comm)
-Nranks = MPI.Comm_size(comm)
-
-topo = (Bounded, Periodic, Bounded)
-arch = DistributedArch(CPU(); topology = topo, 
-                 ranks=(Nranks, 1, 1),
-                 use_buffers = true)
-
-Lh = 100kilometers
-Lz = 400meters
-
-Nx = [10, 13, 18, 39]
-
-grid = RectilinearGrid(arch,
-                       size = (Nx[rank+1], 2, 1),
-                       x = (0, Lh), y = (0, Lh), z = (-Lz, 0),
-                       topology = topo,
-                       )
-
-
-array_full = zeros(prod(Nx), 2)
-for element in 1:prod(Nx)
-    array_full[element, :] .= element
-end
-
-arr = partition_global_array(architecture(grid), array_full, size(grid))
-
-@info "on rank $rank" size(grid) arr
-for r in 0:Nranks-1
-    if r == rank
-        @show rank arr
-    end
-    MPI.Barrier(MPI.COMM_WORLD)
-end
\ No newline at end of file

From 66e7ef3119cd7c77aabd44d7fc9be0cb7f2f7e6c Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 29 Jun 2023 14:53:59 +0200
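Subject: [PATCH 372/530] correction

Pass the grid architecture to `wait_free_surface_communication!` and add
a method that does nothing for `BlockingDistributedArch`. In the
distributed tests, make the corner halo helpers read the grid from the
field (`f.grid`) rather than from a `grid` name that is not an argument
of the helper.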

---
 .../distributed_split_explicit_free_surface.jl            | 7 +++++--
 .../split_explicit_free_surface_kernels.jl                | 4 ++--
 test/test_distributed_models.jl                           | 8 ++++----
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl b/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl
index 910b9a3602..3d0ea81814 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl
@@ -1,5 +1,6 @@
 using Oceananigans.AbstractOperations: GridMetricOperation, Δz
-using Oceananigans.Distributed: DistributedGrid, DistributedField, complete_halo_communication!
+using Oceananigans.Distributed: DistributedGrid, DistributedField
+using Oceananigans.Distributed: BlockingDistributedArch, complete_halo_communication!
 using Oceananigans.Models.HydrostaticFreeSurfaceModels: SplitExplicitState, SplitExplicitFreeSurface
 
 import Oceananigans.Models.HydrostaticFreeSurfaceModels: FreeSurface, SplitExplicitAuxiliaryFields
@@ -92,7 +93,9 @@ end
 
 const DistributedSplitExplicit = SplitExplicitFreeSurface{<:DistributedField}
 
-function wait_free_surface_communication!(free_surface::DistributedSplitExplicit)
+wait_free_surface_communication!(::DistributedSplitExplicit, ::BlockingDistributedArch) = nothing
+    
+function wait_free_surface_communication!(free_surface::DistributedSplitExplicit, arch)
     
     state = free_surface.state
 
diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
index 9f23e1cd9c..a8ebf59f38 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
@@ -278,7 +278,7 @@ function split_explicit_free_surface_step!(free_surface::SplitExplicitFreeSurfac
     grid = free_surface.η.grid
 
     # Wait for previous set up
-    wait_free_surface_communication!(free_surface)
+    wait_free_surface_communication!(free_surface, architecture(grid))
 
     # reset free surface averages
     @apply_regionally begin 
@@ -377,4 +377,4 @@ end
 setup_split_explicit_tendency!(auxiliary, grid, Gu⁻, Gv⁻, Guⁿ, Gvⁿ, χ) =
     launch!(architecture(grid), grid, :xy, _compute_integrated_ab2_tendencies!, auxiliary.Gᵁ, auxiliary.Gⱽ, grid, Gu⁻, Gv⁻, Guⁿ, Gvⁿ, χ)
 
-wait_free_surface_communication!(free_surface) = nothing
+wait_free_surface_communication!(free_surface, arch) = nothing
diff --git a/test/test_distributed_models.jl b/test/test_distributed_models.jl
index 36603b3ee8..e9805fbdfa 100644
--- a/test/test_distributed_models.jl
+++ b/test/test_distributed_models.jl
@@ -77,10 +77,10 @@ include_corners ? view(f.data, :, :, right_halo_indices(instantiate(LZ), instant
                                right_halo_indices(instantiate(LZ), instantiate(topology(f, 3)), f.grid.Nz, f.grid.Hz))
 
 
-southwest_halo(f::AbstractField) = view(f.data, -halo_size(grid)[1]:0, -halo_size(grid)[2]:0, :)
-southeast_halo(f::AbstractField) = view(f.data, size(grid, 1)+1:size(grid, 1)+halo_size(grid)[1], -halo_size(grid)[2]:0, :)
-northeast_halo(f::AbstractField) = view(f.data, size(grid, 1)+1:size(grid, 1)+halo_size(grid)[1], size(grid, 2)+1:size(grid, 2)+halo_size(grid)[2], :)
-northwest_halo(f::AbstractField) = view(f.data, -halo_size(grid)[1]:0, size(grid, 2)+1:size(grid, 2)+halo_size(grid)[2], :)
+southwest_halo(f::AbstractField) = view(f.data, -halo_size(f.grid)[1]:0, -halo_size(f.grid)[2]:0, :)
+southeast_halo(f::AbstractField) = view(f.data, size(f.grid, 1)+1:size(f.grid, 1)+halo_size(f.grid)[1], -halo_size(f.grid)[2]:0, :)
+northeast_halo(f::AbstractField) = view(f.data, size(f.grid, 1)+1:size(f.grid, 1)+halo_size(f.grid)[1], size(f.grid, 2)+1:size(grid, 2)+halo_size(f.grid)[2], :)
+northwest_halo(f::AbstractField) = view(f.data, -halo_size(f.grid)[1]:0, size(f.grid, 2)+1:size(f.grid, 2)+halo_size(f.grid)[2], :)
 
 # Right now just testing with 4 ranks!
 comm = MPI.COMM_WORLD

From d53cba6981b2ce0cfcd01cb0d03acbd348e77305 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 29 Jun 2023 15:09:24 +0200
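Subject: [PATCH 373/530] retry

Import `halo_size` in the distributed tests and rewrite the four corner
halo helpers as functions that slice `parent(f)`, using `Nx, Ny` from
`size(f.grid)` and `Hx, Hy` from `halo_size(f.grid)`.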

---
 test/test_distributed_models.jl | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/test/test_distributed_models.jl b/test/test_distributed_models.jl
index e9805fbdfa..90f8824191 100644
--- a/test/test_distributed_models.jl
+++ b/test/test_distributed_models.jl
@@ -29,6 +29,7 @@ using Oceananigans.BoundaryConditions: fill_halo_regions!, DCBC
 using Oceananigans.Distributed: DistributedArch, index2rank
 using Oceananigans.Fields: AbstractField
 using Oceananigans.Grids:
+    halo_size,
     interior_indices,
     left_halo_indices, right_halo_indices,
     underlying_left_halo_indices, underlying_right_halo_indices
@@ -77,10 +78,29 @@ include_corners ? view(f.data, :, :, right_halo_indices(instantiate(LZ), instant
                                right_halo_indices(instantiate(LZ), instantiate(topology(f, 3)), f.grid.Nz, f.grid.Hz))
 
 
-southwest_halo(f::AbstractField) = view(f.data, -halo_size(f.grid)[1]:0, -halo_size(f.grid)[2]:0, :)
-southeast_halo(f::AbstractField) = view(f.data, size(f.grid, 1)+1:size(f.grid, 1)+halo_size(f.grid)[1], -halo_size(f.grid)[2]:0, :)
-northeast_halo(f::AbstractField) = view(f.data, size(f.grid, 1)+1:size(f.grid, 1)+halo_size(f.grid)[1], size(f.grid, 2)+1:size(grid, 2)+halo_size(f.grid)[2], :)
-northwest_halo(f::AbstractField) = view(f.data, -halo_size(f.grid)[1]:0, size(f.grid, 2)+1:size(f.grid, 2)+halo_size(f.grid)[2], :)
+function southwest_halo(f::AbstractField) 
+    Nx, Ny, _ = size(f.grid)
+    Hx, Hy, _ = halo_size(f.grid)
+    return view(parent(f), 1:Hx, 1:Hy, :)
+end
+
+function southeast_halo(f::AbstractField) 
+    Nx, Ny, _ = size(f.grid)
+    Hx, Hy, _ = halo_size(f.grid)
+    return view(parent(f), Nx+Hx+1:Nx+2Hx, 1:Hy, :)
+end
+
+function northeast_halo(f::AbstractField) 
+    Nx, Ny, _ = size(f.grid)
+    Hx, Hy, _ = halo_size(f.grid)
+    return view(parent(f), Nx+Hx+1:Nx+2Hx, Ny+Hy+1:Ny+2Hy, :)
+end
+
+function northwest_halo(f::AbstractField) 
+    Nx, Ny, _ = size(f.grid)
+    Hx, Hy, _ = halo_size(f.grid)
+    return view(parent(f), 1:Hx, Ny+Hy+1:Ny+2Hy, :)
+end
 
 # Right now just testing with 4 ranks!
 comm = MPI.COMM_WORLD

From 59c7cd50906d34796a34bd392b19c5f77b256d5f Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 29 Jun 2023 16:36:18 +0200
Subject: [PATCH 374/530] fixed all problems

---
 src/Distributed/multi_architectures.jl |   4 +-
 src/Fields/field_boundary_buffers.jl   | 162 ++++++++++++++-----------
 2 files changed, 91 insertions(+), 75 deletions(-)

diff --git a/src/Distributed/multi_architectures.jl b/src/Distributed/multi_architectures.jl
index 03f9cf45f2..d388aa471a 100644
--- a/src/Distributed/multi_architectures.jl
+++ b/src/Distributed/multi_architectures.jl
@@ -174,8 +174,8 @@ function decrement_index(i, R, topo)
     end
 end
 
-function RankConnectivity(model_index, ranks, topology)
-    i, j, k = model_index
+function RankConnectivity(local_index, ranks, topology)
+    i, j, k = local_index
     Rx, Ry, Rz = ranks
     TX, TY, TZ = topology
 
diff --git a/src/Fields/field_boundary_buffers.jl b/src/Fields/field_boundary_buffers.jl
index 1ff39be223..7033b71f90 100644
--- a/src/Fields/field_boundary_buffers.jl
+++ b/src/Fields/field_boundary_buffers.jl
@@ -20,6 +20,10 @@ FieldBoundaryBuffers() = nothing
 FieldBoundaryBuffers(grid, data, ::Missing) = nothing
 FieldBoundaryBuffers(grid, data, ::Nothing) = nothing
 
+# OneDBuffers are associated with partitioning without corner passing,
+# therefore the "corner zones" are communicated within the one-dimensional pass.
+const OneDBuffers = FieldBoundaryBuffers{<:Any, <:Any, <:Any, <:Any, <:Nothing, <:Nothing, <:Nothing, <:Nothing}
+
 function FieldBoundaryBuffers(grid, data, boundary_conditions)
 
     Hx, Hy, Hz = halo_size(grid)
@@ -57,13 +61,17 @@ function create_buffer_corner(arch, grid, data, Hx, Hy, side)
 end
 
 function create_buffer_x(arch, grid, data, H, ::DCBC) 
-    return (send = arch_array(arch, zeros(eltype(data), H, size(grid, 2), size(parent(data), 3))), 
-            recv = arch_array(arch, zeros(eltype(data), H, size(grid, 2), size(parent(data), 3))))    
+    # Either we pass corners or it is a 1D parallelization in x
+    size_y = arch.ranks[2] == 1 ? size(parent(data), 2) : size(grid, 2)
+    return (send = arch_array(arch, zeros(eltype(data), H, size_y, size(parent(data), 3))), 
+            recv = arch_array(arch, zeros(eltype(data), H, size_y, size(parent(data), 3))))    
 end
 
 function create_buffer_y(arch, grid, data, H, ::DCBC)
-    return (send = arch_array(arch, zeros(eltype(data), size(grid, 1), H, size(parent(data), 3))), 
-            recv = arch_array(arch, zeros(eltype(data), size(grid, 1), H, size(parent(data), 3))))
+    # Either we pass corners or it is a 1D parallelization in y
+    size_x = arch.ranks[1] == 1 ? size(parent(data), 1) : size(grid, 1)
+    return (send = arch_array(arch, zeros(eltype(data), size_x, H, size(parent(data), 3))), 
+            recv = arch_array(arch, zeros(eltype(data), size_x, H, size(parent(data), 3))))
 end
 
 create_buffer_x(arch, grid, data, H, ::MCBC) = 
@@ -89,19 +97,19 @@ Adapt.adapt_structure(to, buff::FieldBoundaryBuffers) =
 
 fills `buffers.send` from OffsetArray `c` preparing for message passing. 
 """
-function fill_send_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid)
+function fill_send_buffers!(c::OffsetArray, buff::FieldBoundaryBuffers, grid)
     Hx, Hy, _ = halo_size(grid)
     Nx, Ny, _ = size(grid)
 
-     _fill_west_send_buffer!(parent(c), buffers.west, Hx, Hy, Nx, Ny)
-     _fill_east_send_buffer!(parent(c), buffers.east, Hx, Hy, Nx, Ny)
-    _fill_south_send_buffer!(parent(c), buffers.south, Hx, Hy, Nx, Ny)
-    _fill_north_send_buffer!(parent(c), buffers.north, Hx, Hy, Nx, Ny)
+     _fill_west_send_buffer!(parent(c), buff, buff.west,  Hx, Hy, Nx, Ny)
+     _fill_east_send_buffer!(parent(c), buff, buff.east,  Hx, Hy, Nx, Ny)
+    _fill_south_send_buffer!(parent(c), buff, buff.south, Hx, Hy, Nx, Ny)
+    _fill_north_send_buffer!(parent(c), buff, buff.north, Hx, Hy, Nx, Ny)
 
-    _fill_southwest_send_buffer!(parent(c), buffers.southwest, Hx, Hy, Nx, Ny)
-    _fill_southeast_send_buffer!(parent(c), buffers.southwest, Hx, Hy, Nx, Ny)
-    _fill_northwest_send_buffer!(parent(c), buffers.southwest, Hx, Hy, Nx, Ny)
-    _fill_northeast_send_buffer!(parent(c), buffers.southwest, Hx, Hy, Nx, Ny)
+    _fill_southwest_send_buffer!(parent(c), buff, buff.southwest, Hx, Hy, Nx, Ny)
+    _fill_southeast_send_buffer!(parent(c), buff, buff.southeast, Hx, Hy, Nx, Ny)
+    _fill_northwest_send_buffer!(parent(c), buff, buff.northwest, Hx, Hy, Nx, Ny)
+    _fill_northeast_send_buffer!(parent(c), buff, buff.northeast, Hx, Hy, Nx, Ny)
 
     return nothing
 end
@@ -111,99 +119,107 @@ end
 
 fills OffsetArray `c` from `buffers.recv` after message passing occurred. 
 """
-function recv_from_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid)
+function recv_from_buffers!(c::OffsetArray, buff::FieldBoundaryBuffers, grid)
     Hx, Hy, _ = halo_size(grid)
     Nx, Ny, _ = size(grid)
 
-     _recv_from_west_buffer!(parent(c), buffers.west,  Hx, Hy, Nx, Ny)
-     _recv_from_east_buffer!(parent(c), buffers.east,  Hx, Hy, Nx, Ny)
-    _recv_from_south_buffer!(parent(c), buffers.south, Hx, Hy, Nx, Ny)
-    _recv_from_north_buffer!(parent(c), buffers.north, Hx, Hy, Nx, Ny)
+     _recv_from_west_buffer!(parent(c), buff, buff.west,  Hx, Hy, Nx, Ny)
+     _recv_from_east_buffer!(parent(c), buff, buff.east,  Hx, Hy, Nx, Ny)
+    _recv_from_south_buffer!(parent(c), buff, buff.south, Hx, Hy, Nx, Ny)
+    _recv_from_north_buffer!(parent(c), buff, buff.north, Hx, Hy, Nx, Ny)
    
-   _recv_from_southwest_buffer!(parent(c), buffers.southwest, Hx, Hy, Nx, Ny)
-   _recv_from_southeast_buffer!(parent(c), buffers.southeast, Hx, Hy, Nx, Ny)
-   _recv_from_northwest_buffer!(parent(c), buffers.northwest, Hx, Hy, Nx, Ny)
-   _recv_from_northeast_buffer!(parent(c), buffers.northeast, Hx, Hy, Nx, Ny)
+   _recv_from_southwest_buffer!(parent(c), buff, buff.southwest, Hx, Hy, Nx, Ny)
+   _recv_from_southeast_buffer!(parent(c), buff, buff.southeast, Hx, Hy, Nx, Ny)
+   _recv_from_northwest_buffer!(parent(c), buff, buff.northwest, Hx, Hy, Nx, Ny)
+   _recv_from_northeast_buffer!(parent(c), buff, buff.northeast, Hx, Hy, Nx, Ny)
 
    return nothing
 end
 
-function recv_from_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid, ::Val{:corners})
+function recv_from_buffers!(c::OffsetArray, buff::FieldBoundaryBuffers, grid, ::Val{:corners})
     Hx, Hy, _ = halo_size(grid)
     Nx, Ny, _ = size(grid)
 
-   _recv_from_southwest_buffer!(parent(c), buffers.southwest, Hx, Hy, Nx, Ny)
-   _recv_from_southeast_buffer!(parent(c), buffers.southeast, Hx, Hy, Nx, Ny)
-   _recv_from_northwest_buffer!(parent(c), buffers.northwest, Hx, Hy, Nx, Ny)
-   _recv_from_northeast_buffer!(parent(c), buffers.northeast, Hx, Hy, Nx, Ny)
+   _recv_from_southwest_buffer!(parent(c), buff, buff.southwest, Hx, Hy, Nx, Ny)
+   _recv_from_southeast_buffer!(parent(c), buff, buff.southeast, Hx, Hy, Nx, Ny)
+   _recv_from_northwest_buffer!(parent(c), buff, buff.northwest, Hx, Hy, Nx, Ny)
+   _recv_from_northeast_buffer!(parent(c), buff, buff.northeast, Hx, Hy, Nx, Ny)
 
    return nothing
 end
 
-function recv_from_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid, ::Val{:west_and_east})
+function recv_from_buffers!(c::OffsetArray, buff::FieldBoundaryBuffers, grid, ::Val{:west_and_east})
     Hx, Hy, _ = halo_size(grid)
     Nx, Ny, _ = size(grid)
 
-    _recv_from_west_buffer!(parent(c), buffers.west, Hx, Hy, Nx, Ny)
-    _recv_from_east_buffer!(parent(c), buffers.east, Hx, Hy, Nx, Ny)
+    _recv_from_west_buffer!(parent(c), buff, buff.west, Hx, Hy, Nx, Ny)
+    _recv_from_east_buffer!(parent(c), buff, buff.east, Hx, Hy, Nx, Ny)
 
     return nothing
 end
 
-function recv_from_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid, ::Val{:south_and_north})
+function recv_from_buffers!(c::OffsetArray, buff::FieldBoundaryBuffers, grid, ::Val{:south_and_north})
     Hx, Hy, _ = halo_size(grid)
     Nx, Ny, _ = size(grid)
 
-   _recv_from_south_buffer!(parent(c), buffers.south, Hx, Hy, Nx, Ny)
-   _recv_from_north_buffer!(parent(c), buffers.north, Hx, Hy, Nx, Ny)
+   _recv_from_south_buffer!(parent(c), buff, buff.south, Hx, Hy, Nx, Ny)
+   _recv_from_north_buffer!(parent(c), buff, buff.north, Hx, Hy, Nx, Ny)
 
    return nothing
 end
 
-recv_from_buffers!(c::OffsetArray, buffers::FieldBoundaryBuffers, grid, ::Val{:bottom_and_top}) = nothing
+recv_from_buffers!(c::OffsetArray, buff::FieldBoundaryBuffers, grid, ::Val{:bottom_and_top}) = nothing
 
 #####
 ##### Individual _fill_send_buffers and _recv_from_buffer kernels
 #####
 
- _fill_west_send_buffer!(c, ::Nothing, args...) = nothing
- _fill_east_send_buffer!(c, ::Nothing, args...) = nothing
-_fill_north_send_buffer!(c, ::Nothing, args...) = nothing
-_fill_south_send_buffer!(c, ::Nothing, args...) = nothing
-
-_fill_southwest_send_buffer!(c, ::Nothing, args...) = nothing
-_fill_southeast_send_buffer!(c, ::Nothing, args...) = nothing
-_fill_northwest_send_buffer!(c, ::Nothing, args...) = nothing
-_fill_northeast_send_buffer!(c, ::Nothing, args...) = nothing
-
- _recv_from_west_buffer!(c, ::Nothing, args...) = nothing
- _recv_from_east_buffer!(c, ::Nothing, args...) = nothing
-_recv_from_north_buffer!(c, ::Nothing, args...) = nothing
-_recv_from_south_buffer!(c, ::Nothing, args...) = nothing
-
-_recv_from_southwest_buffer!(c, ::Nothing, args...) = nothing
-_recv_from_southeast_buffer!(c, ::Nothing, args...) = nothing
-_recv_from_northwest_buffer!(c, ::Nothing, args...) = nothing
-_recv_from_northeast_buffer!(c, ::Nothing, args...) = nothing
-
- _fill_west_send_buffer!(c, buff, Hx, Hy, Nx, Ny) = buff.send .= view(c, 1+Hx:2Hx,   1+Hy:Ny+Hy, :)
- _fill_east_send_buffer!(c, buff, Hx, Hy, Nx, Ny) = buff.send .= view(c, 1+Nx:Nx+Hx, 1+Hy:Ny+Hy, :)
-_fill_south_send_buffer!(c, buff, Hx, Hy, Nx, Ny) = buff.send .= view(c, 1+Hx:Nx+Hx, 1+Hy:2Hy,  :)
-_fill_north_send_buffer!(c, buff, Hx, Hy, Nx, Ny) = buff.send .= view(c, 1+Hx:Nx+Hx, 1+Ny:Ny+Hy, :)
-
- _recv_from_west_buffer!(c, buff, Hx, Hy, Nx, Ny) = view(c, 1:Hx,           1+Hy:Ny+Hy,     :) .= buff.recv
- _recv_from_east_buffer!(c, buff, Hx, Hy, Nx, Ny) = view(c, 1+Nx+Hx:Nx+2Hx, 1+Hy:Ny+Hy,     :) .= buff.recv
-_recv_from_south_buffer!(c, buff, Hx, Hy, Nx, Ny) = view(c, 1+Hx:Nx+Hx,     1:Hy,           :) .= buff.recv
-_recv_from_north_buffer!(c, buff, Hx, Hy, Nx, Ny) = view(c, 1+Hx:Nx+Hx,     1+Ny+Hy:Ny+2Hy, :) .= buff.recv
-
-_fill_southwest_send_buffer!(c, buff, Hx, Hy, Nx, Ny) = buff.send .= view(c, 1+Hx:2Hx,   1+Hy:2Hy,   :)
-_fill_southeast_send_buffer!(c, buff, Hx, Hy, Nx, Ny) = buff.send .= view(c, 1+Nx:Nx+Hx, 1+Hy:2Hy,   :)
-_fill_northwest_send_buffer!(c, buff, Hx, Hy, Nx, Ny) = buff.send .= view(c, 1+Hx:2Hx,   1+Ny:Ny+Hy, :)
-_fill_northeast_send_buffer!(c, buff, Hx, Hy, Nx, Ny) = buff.send .= view(c, 1+Nx:Nx+Hx, 1+Ny:Ny+Hy, :)
-
-_recv_from_southwest_buffer!(c, buff, Hx, Hy, Nx, Ny) = view(c, 1:Hx,           1:Hy,           :) .= buff.recv
-_recv_from_southeast_buffer!(c, buff, Hx, Hy, Nx, Ny) = view(c, 1+Nx+Hx:Nx+2Hx, 1:Hy,           :) .= buff.recv
-_recv_from_northwest_buffer!(c, buff, Hx, Hy, Nx, Ny) = view(c, 1:Hx,           1+Ny+Hy:Ny+2Hy, :) .= buff.recv
-_recv_from_northeast_buffer!(c, buff, Hx, Hy, Nx, Ny) = view(c, 1+Nx+Hx:Nx+2Hx, 1+Ny+Hy:Ny+2Hy, :) .= buff.recv
+for dir in (:west, :east, :south, :north, :southwest, :southeast, :northwest, :northeast)
+    _fill_send_buffer! = Symbol(:_fill_, dir, :_send_buffer!)
+    _recv_from_buffer! = Symbol(:_recv_, dir, :_from_buffer!)
+
+    @eval $_fill_send_buffer!(c, buff, ::Nothing, args...) = nothing
+    @eval $_recv_from_buffer!(c, buff, ::Nothing, args...) = nothing
+    @eval $_fill_send_buffer!(c, ::OneDBuffers, ::Nothing, args...) = nothing
+    @eval $_recv_from_buffer!(c, ::OneDBuffers, ::Nothing, args...) = nothing
+end
+
+#####
+##### 1D Parallelizations (cover corners with 1 MPI pass)
+#####
+
+ _fill_west_send_buffer!(c, ::OneDBuffers, buff, Hx, Hy, Nx, Ny) = buff.send .= view(c, 1+Hx:2Hx,   :, :)
+ _fill_east_send_buffer!(c, ::OneDBuffers, buff, Hx, Hy, Nx, Ny) = buff.send .= view(c, 1+Nx:Nx+Hx, :, :)
+_fill_south_send_buffer!(c, ::OneDBuffers, buff, Hx, Hy, Nx, Ny) = buff.send .= view(c, :, 1+Hy:2Hy,  :)
+_fill_north_send_buffer!(c, ::OneDBuffers, buff, Hx, Hy, Nx, Ny) = buff.send .= view(c, :, 1+Ny:Ny+Hy, :)
+
+ _recv_from_west_buffer!(c, ::OneDBuffers, buff, Hx, Hy, Nx, Ny) = view(c, 1:Hx,           :,     :) .= buff.recv
+ _recv_from_east_buffer!(c, ::OneDBuffers, buff, Hx, Hy, Nx, Ny) = view(c, 1+Nx+Hx:Nx+2Hx, :,     :) .= buff.recv
+_recv_from_south_buffer!(c, ::OneDBuffers, buff, Hx, Hy, Nx, Ny) = view(c, :,     1:Hy,           :) .= buff.recv
+_recv_from_north_buffer!(c, ::OneDBuffers, buff, Hx, Hy, Nx, Ny) = view(c, :,     1+Ny+Hy:Ny+2Hy, :) .= buff.recv
+
+#####
+##### 2D Parallelizations (explicitly send corners)
+#####
+
+ _fill_west_send_buffer!(c, b, buff, Hx, Hy, Nx, Ny) = buff.send .= view(c, 1+Hx:2Hx,   1+Hy:Ny+Hy, :)
+ _fill_east_send_buffer!(c, b, buff, Hx, Hy, Nx, Ny) = buff.send .= view(c, 1+Nx:Nx+Hx, 1+Hy:Ny+Hy, :)
+_fill_south_send_buffer!(c, b, buff, Hx, Hy, Nx, Ny) = buff.send .= view(c, 1+Hx:Nx+Hx, 1+Hy:2Hy,  :)
+_fill_north_send_buffer!(c, b, buff, Hx, Hy, Nx, Ny) = buff.send .= view(c, 1+Hx:Nx+Hx, 1+Ny:Ny+Hy, :)
+
+ _recv_from_west_buffer!(c, b, buff, Hx, Hy, Nx, Ny) = view(c, 1:Hx,           1+Hy:Ny+Hy,     :) .= buff.recv
+ _recv_from_east_buffer!(c, b, buff, Hx, Hy, Nx, Ny) = view(c, 1+Nx+Hx:Nx+2Hx, 1+Hy:Ny+Hy,     :) .= buff.recv
+_recv_from_south_buffer!(c, b, buff, Hx, Hy, Nx, Ny) = view(c, 1+Hx:Nx+Hx,     1:Hy,           :) .= buff.recv
+_recv_from_north_buffer!(c, b, buff, Hx, Hy, Nx, Ny) = view(c, 1+Hx:Nx+Hx,     1+Ny+Hy:Ny+2Hy, :) .= buff.recv
+
+_fill_southwest_send_buffer!(c, b, buff, Hx, Hy, Nx, Ny) = buff.send .= view(c, 1+Hx:2Hx,   1+Hy:2Hy,   :)
+_fill_southeast_send_buffer!(c, b, buff, Hx, Hy, Nx, Ny) = buff.send .= view(c, 1+Nx:Nx+Hx, 1+Hy:2Hy,   :)
+_fill_northwest_send_buffer!(c, b, buff, Hx, Hy, Nx, Ny) = buff.send .= view(c, 1+Hx:2Hx,   1+Ny:Ny+Hy, :)
+_fill_northeast_send_buffer!(c, b, buff, Hx, Hy, Nx, Ny) = buff.send .= view(c, 1+Nx:Nx+Hx, 1+Ny:Ny+Hy, :)
+
+_recv_from_southwest_buffer!(c, b, buff, Hx, Hy, Nx, Ny) = view(c, 1:Hx,           1:Hy,           :) .= buff.recv
+_recv_from_southeast_buffer!(c, b, buff, Hx, Hy, Nx, Ny) = view(c, 1+Nx+Hx:Nx+2Hx, 1:Hy,           :) .= buff.recv
+_recv_from_northwest_buffer!(c, b, buff, Hx, Hy, Nx, Ny) = view(c, 1:Hx,           1+Ny+Hy:Ny+2Hy, :) .= buff.recv
+_recv_from_northeast_buffer!(c, b, buff, Hx, Hy, Nx, Ny) = view(c, 1+Nx+Hx:Nx+2Hx, 1+Ny+Hy:Ny+2Hy, :) .= buff.recv
 
 

From 9b1412d5c458cbdf65e948a059c16a5c8a2109cd Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 29 Jun 2023 17:30:17 +0200
Subject: [PATCH 375/530] Added a validation example

---
 src/Fields/field_boundary_buffers.jl          |   6 +-
 .../mpi_hydrostatic_turbulence.jl             | 150 ++++++++++--------
 2 files changed, 89 insertions(+), 67 deletions(-)

diff --git a/src/Fields/field_boundary_buffers.jl b/src/Fields/field_boundary_buffers.jl
index 7033b71f90..3bd68963da 100644
--- a/src/Fields/field_boundary_buffers.jl
+++ b/src/Fields/field_boundary_buffers.jl
@@ -176,10 +176,10 @@ recv_from_buffers!(c::OffsetArray, buff::FieldBoundaryBuffers, grid, ::Val{:bott
 
 for dir in (:west, :east, :south, :north, :southwest, :southeast, :northwest, :northeast)
     _fill_send_buffer! = Symbol(:_fill_, dir, :_send_buffer!)
-    _recv_from_buffer! = Symbol(:_recv_, dir, :_from_buffer!)
+    _recv_from_buffer! = Symbol(:_recv_from_, dir, :_buffer!)
 
-    @eval $_fill_send_buffer!(c, buff, ::Nothing, args...) = nothing
-    @eval $_recv_from_buffer!(c, buff, ::Nothing, args...) = nothing
+    @eval $_fill_send_buffer!(c, b, ::Nothing, args...) = nothing
+    @eval $_recv_from_buffer!(c, b, ::Nothing, args...) = nothing
     @eval $_fill_send_buffer!(c, ::OneDBuffers, ::Nothing, args...) = nothing
     @eval $_recv_from_buffer!(c, ::OneDBuffers, ::Nothing, args...) = nothing
 end
diff --git a/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl b/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl
index 7c86bf84a6..cd189afa14 100644
--- a/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl
+++ b/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl
@@ -2,97 +2,119 @@ using Oceananigans
 using MPI
 using Oceananigans.Models.HydrostaticFreeSurfaceModels: VerticalVorticityField
 using Printf
-using CairoMakie
-
-MPI.Initialized() || MPI.Init()
-
-     comm = MPI.COMM_WORLD
-mpi_ranks = MPI.Comm_size(comm)
-
-@assert mpi_ranks == 16
-
 using Statistics
-using Oceananigans
-using Oceananigans.Distributed
+using Oceananigans.BoundaryConditions
+using Oceananigans.Distributed    
+using Random
+using GLMakie
 
-ranks = (4, 4, 1)
-topo  = (Periodic, Periodic, Bounded)
-arch  = DistributedArch(CPU(), ranks=ranks, topology=topo)
+# Run with 
+#
+# ```julia 
+#   mpiexec -n 4 julia --project mpi_hydrostatic_turbulence.jl
+# ```
 
-N = 28
-nx, ny = N ÷ ranks[1], N ÷ ranks[2] 
+function run_simulation(nx, ny, arch, topo)
 
-grid  = RectilinearGrid(arch, topology=topo, size=(nx, ny, 1), extent=(4π, 4π, 0.5), halo=(3, 3, 3))
+    grid  = RectilinearGrid(arch; topology=topo, size=(nx, ny, 1), extent=(4π, 4π, 0.5), halo=(7, 7, 7))
 
-local_rank = MPI.Comm_rank(MPI.COMM_WORLD)
+    local_rank = MPI.Comm_rank(MPI.COMM_WORLD)
 
-free_surface = SplitExplicitFreeSurface(; substeps = 30)
+    free_surface = SplitExplicitFreeSurface(; substeps = 10)
 
-model = HydrostaticFreeSurfaceModel(; grid, free_surface,
-                     momentum_advection = WENO(),
-                     tracer_advection = WENO(),
-                     buoyancy = nothing,
-                     coriolis = FPlane(f = 1),
-                     tracers = :c)
+    model = HydrostaticFreeSurfaceModel(; grid, free_surface,
+                         momentum_advection = VectorInvariant(vorticity_scheme = WENO(order = 9)),
+                         tracer_advection = WENO(),
+                         buoyancy = nothing,
+                         coriolis = FPlane(f = 1),
+                         tracers = :c)
 
-using Random
-Random.seed!(1234 * (local_rank +1))
+    # Scale seed with rank to avoid symmetry
+    Random.seed!(1234 * (local_rank + 1))
 
-set!(model, u = (x, y, z) -> rand(), v = (x, y, z) -> rand())
+    set!(model, u = (x, y, z) -> 1-2rand(), v = (x, y, z) -> 1-2rand())
+    
+    mask(x, y, z) = x > 3π/2 && x < 5π/2 && y > 3π/2 && y < 5π/2
+    c = model.tracers.c
 
-mask(x, y, z) = x > π && x < 2π && y > π && y < 2π ? 1.0 : 0.0
-if local_rank == 0
-    set!(model.tracers.c, mask)
-end
+    set!(c, mask)
 
-u, v, _ = model.velocities
-ζ = VerticalVorticityField(model)
-outputs = merge(model.velocities, model.tracers, (; ζ))
+    u, v, _ = model.velocities
+    ζ = VerticalVorticityField(model)
+    η = model.free_surface.η
+    outputs = merge(model.velocities, model.tracers, (; ζ, η))
 
-progress(sim) = @info "Iteration: $(sim.model.clock.iteration), time: $(sim.model.clock.time), Δt: $(sim.Δt)"
-simulation = Simulation(model, Δt=0.01, stop_time=100.0)
+    progress(sim) = @info "Iteration: $(sim.model.clock.iteration), time: $(sim.model.clock.time), Δt: $(sim.Δt)"
+    simulation = Simulation(model, Δt=0.02, stop_time=100.0)
 
-wizard = TimeStepWizard(cfl = 0.7, max_change = 1.2)
+    wizard = TimeStepWizard(cfl = 0.2, max_change = 1.1)
 
-simulation.callbacks[:progress] = Callback(progress, IterationInterval(100))
-simulation.callbacks[:wizard]   = Callback(wizard,   IterationInterval(10))
+    simulation.callbacks[:progress] = Callback(progress, IterationInterval(100))
+    simulation.callbacks[:wizard]   = Callback(wizard,   IterationInterval(10))
 
-filepath = "mpi_hydrostatic_turbulence_rank$(local_rank)"
-simulation.output_writers[:fields] =
-    JLD2OutputWriter(model, outputs, filename=filepath, schedule=TimeInterval(0.1),
-                     overwrite_existing=true)
+    filepath = "mpi_hydrostatic_turbulence_rank$(local_rank)"
+    simulation.output_writers[:fields] =
+        JLD2OutputWriter(model, outputs, filename=filepath, schedule=TimeInterval(0.1),
+                         overwrite_existing=true)
 
-MPI.Barrier(MPI.COMM_WORLD)
-
-run!(simulation)
-MPI.Barrier(MPI.COMM_WORLD)
+    run!(simulation)
+    MPI.Barrier(MPI.COMM_WORLD)
+end
 
-if rank == 0
+# Produce a video for variable `var`
+function visualize_simulation(var)
     iter = Observable(1)
 
-    vort = []
-    ζ = []
-    x = []
-    y = []
-    for i in 0:15
-        push!(vort, FieldTimeSeries("mpi_hydrostatic_turbulence_rank$i.jld2", "u"))
-        z1 = @lift(interior(vort[i][$iter], 1:nx, 1:ny, 1))
-        push!(ζ, z1)
-
-        push!(x, vort[i].grid.xᶠᵃᵃ[1:nx])
-        push!(y, vort[i].grid.yᵃᶠᵃ[1:ny])
+    v = Vector(undef, 4)
+    V = Vector(undef, 4)
+    x = Vector(undef, 4)
+    y = Vector(undef, 4)
+
+    for r in 1:4
+        v[r] = FieldTimeSeries("mpi_hydrostatic_turbulence_rank$(i-1).jld2", var))
+        nx, ny, _ = size(v[r])
+        V[r] = @lift(interior(v[r][$iter], 1:nx, 1:ny, 1))
+
+        x[r] = xnodes(v[i])
+        y[r] = ynodes(v[i])
     end
 
     fig = Figure()
     ax = Axis(fig[1, 1])
-    for i in 0:15
-        heatmap!(ax, x[i], y[i], ζ[i], colorrange = (-1.0, 1.0))
+    for r in 1:4
+        heatmap!(ax, x[r], y[r], V[r], colorrange = (-1.0, 1.0))
     end
 
-    CairoMakie.record(fig, "hydrostatic_test.mp4", iterations, framerate = 11) do i
+    GLMakie.record(fig, "hydrostatic_test_" * var * ".mp4", 1:length(v[1].times), framerate = 11) do i
         @info "step $i"; 
         iter[] = i; 
     end
 end
 
-MPI.Barrier(MPI.COMM_WORLD)
\ No newline at end of file
+MPI.Init()
+
+topo = (Periodic, Periodic, Bounded)
+
+Nranks = MPI.Comm_size(MPI.COMM_WORLD)
+Rx = 2
+Ry = 2
+
+@assert Nranks == 4
+
+# Enable overlapped communication!
+arch  = DistributedArch(CPU(), ranks = (Rx, Ry, 1), 
+                        topology=topo, 
+                        enable_overlapped_computation = true)
+
+# Example of non-uniform partitioning
+nx = [90, 128-90][arch.local_index[1]]
+ny = [56, 128-56][arch.local_index[2]]
+
+# Run the simulation
+run_simulation(nx, ny, arch, topo)
+
+# Visualize the plane
+visualize_simulation("u")
+visualize_simulation("v")
+visualize_simulation("ζ")
+visualize_simulation("c")

From 28f052eb0571d3a716239a6a49f79173b21fe810 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 29 Jun 2023 19:33:43 +0200
Subject: [PATCH 376/530] fixed tests

---
 test/test_distributed_models.jl | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/test/test_distributed_models.jl b/test/test_distributed_models.jl
index 90f8824191..68e9ddfcf4 100644
--- a/test/test_distributed_models.jl
+++ b/test/test_distributed_models.jl
@@ -351,7 +351,7 @@ function test_triply_periodic_halo_communication_with_411_ranks(halo, child_arch
     model = NonhydrostaticModel(grid=grid)
 
     for field in merge(fields(model))
-        interior(field) .= arch.local_rank
+        fill!(field, arch.local_rank)
         fill_halo_regions!(field)
 
         @test all(east_halo(field, include_corners=false) .== arch.connectivity.east)
@@ -375,7 +375,7 @@ function test_triply_periodic_halo_communication_with_141_ranks(halo, child_arch
     model = NonhydrostaticModel(grid=grid)
 
     for field in merge(fields(model), model.pressures)
-        interior(field) .= arch.local_rank
+        fill!(field, arch.local_rank)
         fill_halo_regions!(field)
 
         @test all(north_halo(field, include_corners=false) .== arch.connectivity.north)
@@ -393,22 +393,22 @@ end
 function test_triply_periodic_halo_communication_with_221_ranks(halo, child_arch)
     topo = (Periodic, Periodic, Periodic)
     arch = DistributedArch(child_arch; ranks=(2, 2, 1), topology=topo, devices = (0, 0, 0, 0))
-    grid = RectilinearGrid(arch, topology=topo, size=(8, 8, 3), extent=(1, 2, 3), halo=halo)
+    grid = RectilinearGrid(arch, topology=topo, size=(4, 4, 3), extent=(1, 2, 3), halo=halo)
     model = NonhydrostaticModel(grid=grid)
 
     for field in merge(fields(model))
-        interior(field) .= arch.local_rank
+        fill!(field, arch.local_rank)
         fill_halo_regions!(field)
 
-        @test all(east_halo(field, include_corners=false) .== arch.connectivity.east)
-        @test all(west_halo(field, include_corners=false) .== arch.connectivity.west)
+        @test all(interior(field) .== arch.local_rank)
+
+        @test all(east_halo(field, include_corners=false)  .== arch.connectivity.east)
+        @test all(west_halo(field, include_corners=false)  .== arch.connectivity.west)
         @test all(north_halo(field, include_corners=false) .== arch.connectivity.north)
         @test all(south_halo(field, include_corners=false) .== arch.connectivity.south)
 
-        @test all(interior(field) .== arch.local_rank)
-        @test all(top_halo(field, include_corners=false) .== arch.local_rank)
+        @test all(top_halo(field, include_corners=false)    .== arch.local_rank)
         @test all(bottom_halo(field, include_corners=false) .== arch.local_rank)
-
         @test all(southwest_halo(field) .== arch.connectivity.southwest) 
         @test all(southeast_halo(field) .== arch.connectivity.southeast) 
         @test all(northwest_halo(field) .== arch.connectivity.northwest) 
@@ -426,10 +426,6 @@ end
 
     @info "Testing distributed MPI Oceananigans..."
 
-    # We don't support distributing _anything_ in the vertical,
-    # so these tests are commented out below (and maybe should be removed
-    # in the future). 
-
     @testset "Multi architectures rank connectivity" begin
         @info "  Testing multi architecture rank connectivity..."
         test_triply_periodic_rank_connectivity_with_411_ranks()

From 4b6743a55c64d308b515d1ebb62ecd6f4310710e Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 29 Jun 2023 23:58:55 +0200
Subject: [PATCH 377/530] try new test

---
 src/Distributed/halo_communication.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 1f990a9c9a..f2b17a8f86 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -224,7 +224,7 @@ for side in [:southwest, :southeast, :northwest, :northeast]
 end
 
 #####
-##### fill_west_and_east_halo!   }
+##### fill_west_and_east_halo!  }
 ##### fill_south_and_north_halo! } for when both halos are communicative (Single communicating halos are to be implemented)
 #####
 

From b167ad976fcef54c962ae7b83d76f38fac3187f9 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 30 Jun 2023 09:31:01 +0200
Subject: [PATCH 378/530] fill send buffers in the correct place

---
 src/Distributed/halo_communication.jl | 82 +++++++++++++--------------
 src/Fields/field_boundary_buffers.jl  | 30 ++++++++++
 2 files changed, 71 insertions(+), 41 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index f2b17a8f86..1f16934335 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -104,10 +104,6 @@ end
 function fill_halo_regions!(c::OffsetArray, bcs, indices, loc, grid::DistributedGrid, buffers, args...; kwargs...)
     arch       = architecture(grid)
     halo_tuple = permute_boundary_conditions(bcs)
-    
-    # This has to be synchronized!!
-    fill_send_buffers!(c, buffers, grid)
-    sync_device!(arch)
 
     for task = 1:3
         fill_halo_event!(task, halo_tuple, c, indices, loc, arch, grid, buffers, args...; kwargs...)
@@ -121,13 +117,41 @@ function fill_halo_regions!(c::OffsetArray, bcs, indices, loc, grid::Distributed
     return nothing
 end
 
+function pool_requests_or_complete_comm!(c, buffers, requests, arch, side)
+   
+    # if `isnothing(requests)`, `fill_halo!` did not involve MPI 
+    if isnothing(requests)
+        return nothing
+    end
+
+    # Overlapping communication and computation, store requests in a `MPI.Request`
+    # pool to be waited upon after tendency calculation
+    if async && !(arch isa BlockingDistributedArch)
+        push!(arch.mpi_requests, requests...)
+        return nothing
+    end
+
+    # Syncronous MPI fill_halo_event!
+    cooperative_waitall!(requests)
+    # Reset MPI tag
+    arch.mpi_tag[1] -= arch.mpi_tag[1]
+
+    recv_from_buffers!(c, buffers, grid, Val(side))    
+
+    return nothing
+end
+
 # corner passing routine
 function fill_corners!(connectivity, c, indices, loc, arch, grid, buffers, args...; async = false, only_local_halos = false, kwargs...)
     
-    if only_local_halos 
+    if only_local_halos # No corner filling needed!
         return nothing
     end
 
+    # This has to be synchronized!!
+    fill_send_buffers!(c, buffers, grid, Val(:corners))
+    sync_device!(arch)
+
     requests = MPI.Request[]
 
     reqsw = fill_southwest_halo!(connectivity.southwest, c, indices, loc, arch, grid, buffers, args...; kwargs...)
@@ -140,61 +164,37 @@ function fill_corners!(connectivity, c, indices, loc, arch, grid, buffers, args.
     !isnothing(reqnw) && push!(requests, reqnw...)
     !isnothing(reqne) && push!(requests, reqne...)
 
-    if isempty(requests)
-        return nothing
-    end
-
-    if async && !(arch isa BlockingDistributedArch)
-        push!(arch.mpi_requests, requests...)
-        return nothing
-    end
-
-    # Syncronous MPI fill_halo_event!
-    cooperative_waitall!(requests)
-
-    # Reset MPI tag
-    arch.mpi_tag[1] -= arch.mpi_tag[1]
-    recv_from_buffers!(c, buffers, grid, Val(:corners))    
+    pool_requests_or_complete_comm!(c, buffers, requests, arch, :corners)
 
     return nothing
 end
 
 @inline mpi_communication_side(::Val{fill_west_and_east_halo!})   = :west_and_east
 @inline mpi_communication_side(::Val{fill_south_and_north_halo!}) = :south_and_north
+@inline mpi_communication_side(::Val{fill_bottom_and_top_halo!})  = :bottom_and_top
 
 cooperative_wait(req::MPI.Request)            = MPI.Waitall(req)
 cooperative_waitall!(req::Array{MPI.Request}) = MPI.Waitall(req)
 
-function fill_halo_event!(task, halo_tuple, c, indices, loc, arch::DistributedArch, grid::DistributedGrid, buffers, args...; async = false, kwargs...)
+function fill_halo_event!(task, halo_tuple, c, indices, loc, arch, grid::DistributedGrid, buffers, args...; async = false, only_local_halos = false, kwargs...)
     fill_halo!  = halo_tuple[1][task]
     bc_left     = halo_tuple[2][task]
     bc_right    = halo_tuple[3][task]
 
-    # Calculate size and offset of the fill_halo kernel
-    size   = fill_halo_size(c, fill_halo!, indices, bc_left, loc, grid)
-    offset = fill_halo_offset(size, fill_halo!, indices)
-
-    requests = fill_halo!(c, bc_left, bc_right, size, offset, loc, arch, grid, buffers, args...; kwargs...)
+    buffer_side = mpi_communication_side(Val(fill_halo!))
 
-    # if `isnothing(requests)`, `fill_halo!` did not involve MPI 
-    if isnothing(requests)
-        return nothing
+    if !only_local_halos # Then we need to fill the `send` buffers
+        fill_send_buffers!(c, buffers, grid, Val(buffer_side))
+        sync_device!(arch)
     end
 
-    # Overlapping communication and computation, store requests in a `MPI.Request`
-    # pool to be waited upon after tendency calculation
-    if async && !(arch isa BlockingDistributedArch)
-        push!(arch.mpi_requests, requests...)
-        return nothing
-    end
+    # Calculate size and offset of the fill_halo kernel
+    size   = fill_halo_size(c, fill_halo!, indices, bc_left, loc, grid)
+    offset = fill_halo_offset(size, fill_halo!, indices)
 
-    # Syncronous MPI fill_halo_event!
-    cooperative_waitall!(requests)
-    # Reset MPI tag
-    arch.mpi_tag[1] -= arch.mpi_tag[1]
+    requests = fill_halo!(c, bc_left, bc_right, size, offset, loc, arch, grid, buffers, args...; only_local_halos, kwargs...)
 
-    buffer_side = mpi_communication_side(Val(fill_halo!))
-    recv_from_buffers!(c, buffers, grid, Val(buffer_side))    
+    pool_requests_or_complete_comm!(c, buffers, requests, arch, buffer_side)
 
     return nothing
 end
diff --git a/src/Fields/field_boundary_buffers.jl b/src/Fields/field_boundary_buffers.jl
index 3bd68963da..85d5312975 100644
--- a/src/Fields/field_boundary_buffers.jl
+++ b/src/Fields/field_boundary_buffers.jl
@@ -114,6 +114,36 @@ function fill_send_buffers!(c::OffsetArray, buff::FieldBoundaryBuffers, grid)
     return nothing
 end
 
+function fill_send_buffers!(c::OffsetArray, buff::FieldBoundaryBuffers, grid, ::Val{:corners})
+    Hx, Hy, _ = halo_size(grid)
+    Nx, Ny, _ = size(grid)
+
+    _fill_southwest_send_buffer!(parent(c), buff, buff.southwest, Hx, Hy, Nx, Ny)
+    _fill_southeast_send_buffer!(parent(c), buff, buff.southeast, Hx, Hy, Nx, Ny)
+    _fill_northwest_send_buffer!(parent(c), buff, buff.northwest, Hx, Hy, Nx, Ny)
+    _fill_northeast_send_buffer!(parent(c), buff, buff.northeast, Hx, Hy, Nx, Ny)
+
+    return nothing
+end
+
+function fill_send_buffers!(c::OffsetArray, buff::FieldBoundaryBuffers, grid, ::Val{:west_and_east})
+    Hx, Hy, _ = halo_size(grid)
+    Nx, Ny, _ = size(grid)
+
+     _fill_west_send_buffer!(parent(c), buff, buff.west,  Hx, Hy, Nx, Ny)
+     _fill_east_send_buffer!(parent(c), buff, buff.east,  Hx, Hy, Nx, Ny)
+end
+
+function fill_send_buffers!(c::OffsetArray, buff::FieldBoundaryBuffers, grid, ::Val{:south_and_north})
+    Hx, Hy, _ = halo_size(grid)
+    Nx, Ny, _ = size(grid)
+
+    _fill_south_send_buffer!(parent(c), buff, buff.south, Hx, Hy, Nx, Ny)
+    _fill_north_send_buffer!(parent(c), buff, buff.north, Hx, Hy, Nx, Ny)
+end
+
+fill_send_buffers!(c::OffsetArray, buff::FieldBoundaryBuffers, grid, ::Val{:bottom_and_top}) = nothing
+
 """
     recv_from_buffers(c, buffers, arch)
 

From a85999db863c26c395b215bf2e93574feb68da8f Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 30 Jun 2023 09:38:16 +0200
Subject: [PATCH 379/530] fixed comments

---
 src/Distributed/halo_communication.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 1f16934335..39796dd3c4 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -224,8 +224,8 @@ for side in [:southwest, :southeast, :northwest, :northeast]
 end
 
 #####
-##### fill_west_and_east_halo!  }
-##### fill_south_and_north_halo! } for when both halos are communicative (Single communicating halos are to be implemented)
+##### fill_west_and_east_halo!  
+##### fill_south_and_north_halo! 
 #####
 
 for (side, opposite_side) in zip([:west, :south], [:east, :north])
@@ -321,7 +321,7 @@ for side in sides
 end
 
 #####
-##### Receiving and filling halos (buffer is a view so it gets filled upon receive)
+##### Receiving and filling halos 
 #####
 
 for side in sides

From 2e3fb940d2c521e65b07db6e366e984d75f9727e Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 30 Jun 2023 09:51:35 +0200
Subject: [PATCH 380/530] define async

---
 src/Distributed/halo_communication.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 39796dd3c4..733aa14019 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -117,7 +117,7 @@ function fill_halo_regions!(c::OffsetArray, bcs, indices, loc, grid::Distributed
     return nothing
 end
 
-function pool_requests_or_complete_comm!(c, buffers, requests, arch, side)
+@inline function pool_requests_or_complete_comm!(c, buffers, requests, arch, async, side)
    
     # if `isnothing(requests)`, `fill_halo!` did not involve MPI 
     if isnothing(requests)
@@ -164,7 +164,7 @@ function fill_corners!(connectivity, c, indices, loc, arch, grid, buffers, args.
     !isnothing(reqnw) && push!(requests, reqnw...)
     !isnothing(reqne) && push!(requests, reqne...)
 
-    pool_requests_or_complete_comm!(c, buffers, requests, arch, :corners)
+    pool_requests_or_complete_comm!(c, buffers, requests, arch, async, :corners)
 
     return nothing
 end
@@ -194,7 +194,7 @@ function fill_halo_event!(task, halo_tuple, c, indices, loc, arch, grid::Distrib
 
     requests = fill_halo!(c, bc_left, bc_right, size, offset, loc, arch, grid, buffers, args...; only_local_halos, kwargs...)
 
-    pool_requests_or_complete_comm!(c, buffers, requests, arch, buffer_side)
+    pool_requests_or_complete_comm!(c, buffers, requests, arch, async, buffer_side)
 
     return nothing
 end

From 1b0f2a89a05cf76f9a129f59853e0e4c470b282e Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 30 Jun 2023 10:06:08 +0200
Subject: [PATCH 381/530] pass the grid

---
 src/Distributed/halo_communication.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 733aa14019..5f2f2dceab 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -117,7 +117,7 @@ function fill_halo_regions!(c::OffsetArray, bcs, indices, loc, grid::Distributed
     return nothing
 end
 
-@inline function pool_requests_or_complete_comm!(c, buffers, requests, arch, async, side)
+@inline function pool_requests_or_complete_comm!(c, arch, grid, buffers, requests, async, side)
    
     # if `isnothing(requests)`, `fill_halo!` did not involve MPI 
     if isnothing(requests)
@@ -164,7 +164,7 @@ function fill_corners!(connectivity, c, indices, loc, arch, grid, buffers, args.
     !isnothing(reqnw) && push!(requests, reqnw...)
     !isnothing(reqne) && push!(requests, reqne...)
 
-    pool_requests_or_complete_comm!(c, buffers, requests, arch, async, :corners)
+    pool_requests_or_complete_comm!(c, arch, grid, buffers, requests, async, :corners)
 
     return nothing
 end
@@ -194,7 +194,7 @@ function fill_halo_event!(task, halo_tuple, c, indices, loc, arch, grid::Distrib
 
     requests = fill_halo!(c, bc_left, bc_right, size, offset, loc, arch, grid, buffers, args...; only_local_halos, kwargs...)
 
-    pool_requests_or_complete_comm!(c, buffers, requests, arch, async, buffer_side)
+    pool_requests_or_complete_comm!(c, arch, grid, buffers, requests, async, buffer_side)
 
     return nothing
 end

From 306655aeb093a1f00372f618073a6f1e7e1c4212 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 30 Jun 2023 13:29:40 +0200
Subject: [PATCH 382/530] bugfix

---
 src/Distributed/distributed_grids.jl | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/Distributed/distributed_grids.jl b/src/Distributed/distributed_grids.jl
index 2c481979d6..21b4787676 100644
--- a/src/Distributed/distributed_grids.jl
+++ b/src/Distributed/distributed_grids.jl
@@ -84,8 +84,7 @@ function LatitudeLongitudeGrid(arch::DistributedArch,
                                radius = R_Earth,
                                halo = (1, 1, 1))
 
-
-    global_sizes = map(sum, concatenate_local_sizes(size, arch))
+    global_size = map(sum, concatenate_local_sizes(size, arch))
 
     Nλ, Nφ, Nz, Hλ, Hφ, Hz, latitude, longitude, z, topology, precompute_metrics =
         validate_lat_lon_grid_args(FT, latitude, longitude, z, global_size, halo, topology, precompute_metrics)

From 4c737f3e5281de11cc86d42249c42fa061d18ca8 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 30 Jun 2023 14:00:04 +0200
Subject: [PATCH 383/530] fix show method

---
 .../show_hydrostatic_free_surface_model.jl                      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/show_hydrostatic_free_surface_model.jl b/src/Models/HydrostaticFreeSurfaceModels/show_hydrostatic_free_surface_model.jl
index 3c7bf23f31..d46b919028 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/show_hydrostatic_free_surface_model.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/show_hydrostatic_free_surface_model.jl
@@ -27,7 +27,7 @@ function Base.show(io::IO, model::HydrostaticFreeSurfaceModel)
         end
 
         if typeof(model.free_surface).name.wrapper == SplitExplicitFreeSurface
-            print(io, "│   └── number of substeps: $(model.free_surface.settings.substeps)", "\n")
+            print(io, "│   └── substepping: $(model.free_surface.settings.substepping)", "\n")
         end
     end
 

From fb0505d66334d63059649735fb1dd3ee9004b7c5 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 30 Jun 2023 17:15:57 +0200
Subject: [PATCH 384/530] RefValue for mpi_tag

---
 src/Distributed/halo_communication.jl       | 8 ++++----
 src/Distributed/interleave_comm_and_comp.jl | 2 +-
 src/Distributed/multi_architectures.jl      | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 5f2f2dceab..decce72e03 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -62,14 +62,14 @@ for side in sides
     recv_tag_fn_name = Symbol("$(side)_recv_tag")
     @eval begin
         function $send_tag_fn_name(arch, location)
-            field_id   = string(arch.mpi_tag[1], pad=ID_DIGITS)
+            field_id   = string(arch.mpi_tag[], pad=ID_DIGITS)
             loc_digit  = string(loc_id(location...)) 
             side_digit = string(side_id[Symbol($side_str)])
             return parse(Int, field_id * loc_digit * side_digit)
         end
 
         function $recv_tag_fn_name(arch, location)
-            field_id   = string(arch.mpi_tag[1], pad=ID_DIGITS)
+            field_id   = string(arch.mpi_tag[], pad=ID_DIGITS)
             loc_digit  = string(loc_id(location...)) 
             side_digit = string(side_id[opposite_side[Symbol($side_str)]])
             return parse(Int, field_id * loc_digit * side_digit)
@@ -112,7 +112,7 @@ function fill_halo_regions!(c::OffsetArray, bcs, indices, loc, grid::Distributed
     fill_corners!(arch.connectivity, c, indices, loc, arch, grid, buffers, args...; kwargs...)
     
     # Switch to the next field to send
-    arch.mpi_tag[1] += 1
+    arch.mpi_tag[] += 1
 
     return nothing
 end
@@ -134,7 +134,7 @@ end
     # Syncronous MPI fill_halo_event!
     cooperative_waitall!(requests)
     # Reset MPI tag
-    arch.mpi_tag[1] -= arch.mpi_tag[1]
+    arch.mpi_tag[] -= arch.mpi_tag[]
 
     recv_from_buffers!(c, buffers, grid, Val(side))    
 
diff --git a/src/Distributed/interleave_comm_and_comp.jl b/src/Distributed/interleave_comm_and_comp.jl
index 18c6eeaab6..5b97cef979 100644
--- a/src/Distributed/interleave_comm_and_comp.jl
+++ b/src/Distributed/interleave_comm_and_comp.jl
@@ -56,7 +56,7 @@ function complete_halo_communication!(field)
         cooperative_waitall!(arch.mpi_requests)
 
         # Reset MPI tag
-        arch.mpi_tag[1] -= arch.mpi_tag[1]
+        arch.mpi_tag[] -= arch.mpi_tag[]
     
         # Reset MPI requests
         empty!(arch.mpi_requests)
diff --git a/src/Distributed/multi_architectures.jl b/src/Distributed/multi_architectures.jl
index d388aa471a..4d847550b9 100644
--- a/src/Distributed/multi_architectures.jl
+++ b/src/Distributed/multi_architectures.jl
@@ -95,9 +95,9 @@ function DistributedArch(child_architecture = CPU();
     mpi_requests = enable_overlapped_computation ? MPI.Request[] : nothing
 
     M = typeof(mpi_requests)
-    T = typeof([0])
+    T = typeof(Ref(0))
 
-    return DistributedArch{A, R, I, ρ, C, γ, M, T}(child_architecture, local_rank, local_index, ranks, local_connectivity, communicator, mpi_requests, [0])
+    return DistributedArch{A, R, I, ρ, C, γ, M, T}(child_architecture, local_rank, local_index, ranks, local_connectivity, communicator, mpi_requests, Ref(0))
 end
 
 const BlockingDistributedArch = DistributedArch{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:Nothing}

From d37a7813f4cb6bc3fe447053bb08cfc57c75af30 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 30 Jun 2023 17:17:35 +0200
Subject: [PATCH 385/530] comment

---
 src/Distributed/multi_architectures.jl | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/Distributed/multi_architectures.jl b/src/Distributed/multi_architectures.jl
index 4d847550b9..07927decd1 100644
--- a/src/Distributed/multi_architectures.jl
+++ b/src/Distributed/multi_architectures.jl
@@ -46,6 +46,10 @@ Keyword arguments
                       `y` and `z` direction. NOTE: support for distributed z direction is 
                       limited, so `Rz = 1` is strongly suggested.
 
+- `enable_overlapped_computation`: if `true` the prognostic halo communication will be overlapped
+                                   with tendency calculations, and the barotropic halo communication
+                                   with the implicit vertical solver (defaults to `true`)
+
 - `devices`: `GPU` device linked to local rank. The GPU will be assigned based on the 
              local node rank as such `devices[node_rank]`. Make sure to run `--ntasks-per-node` <= `--gres=gpu`.
              If `nothing`, the devices will be assigned automatically based on the available resources

From bcd4d02dc8f6442e2b2f65cbb135a60fd5ffc1e5 Mon Sep 17 00:00:00 2001
From: "Navid C. Constantinou" <navidcy@users.noreply.github.com>
Date: Sun, 2 Jul 2023 15:06:05 +1000
Subject: [PATCH 386/530] add catke preprint

---
 docs/oceananigans.bib | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/docs/oceananigans.bib b/docs/oceananigans.bib
index 35f7ff6802..10cc857967 100644
--- a/docs/oceananigans.bib
+++ b/docs/oceananigans.bib
@@ -882,3 +882,11 @@ @article{Roquet15TEOS
 	year = {2015},
 	pages = {29--43}
 }
+
+@article{Wagner23catke,
+  title={CATKE: a turbulent-kinetic-energy-based parameterization for ocean microturbulence with dynamic convective adjustment},
+  author={Wagner, Gregory LeClaire and Hillier, Adeline and Constantinou, Navid C and Silvestri, Simone and Souza, Andre and Burns, Keaton and Ramadhan, Ali and Hill, Chris and Campin, Jean-Michel and Marshall, John and others},
+  journal={arXiv preprint arXiv:2306.13204},
+  year={2023},
+  doi={10.48550/arXiv.2306.13204}
+}

From 80d46dee1e4be993587cdf04acbd9ee1c20858f0 Mon Sep 17 00:00:00 2001
From: "Navid C. Constantinou" <navidcy@users.noreply.github.com>
Date: Sun, 2 Jul 2023 15:06:44 +1000
Subject: [PATCH 387/530] remove warning; add ref to catke preprint

---
 .../CATKEVerticalDiffusivities.jl             | 81 ++++++++++---------
 1 file changed, 45 insertions(+), 36 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
index 064b08282a..738ad75b15 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
@@ -68,48 +68,67 @@ function CATKEVerticalDiffusivity{TD}(mixing_length::CL,
                                                      negative_turbulent_kinetic_energy_damping_time_scale)
 end
 
+CATKEVerticalDiffusivity(FT::DataType; kw...) = CATKEVerticalDiffusivity(VerticallyImplicitTimeDiscretization(), FT; kw...)
+
+const CATKEVD{TD} = CATKEVerticalDiffusivity{TD} where TD
+const CATKEVDArray{TD} = AbstractArray{<:CATKEVD{TD}} where TD
+const FlavorOfCATKE{TD} = Union{CATKEVD{TD}, CATKEVDArray{TD}} where TD
+
+include("mixing_length.jl")
+include("turbulent_kinetic_energy_equation.jl")
+
 """
-    CATKEVerticalDiffusivity(time_discretization = VerticallyImplicitTimeDiscretization(), FT=Float64;
-                             mixing_length = MixingLength{FT}(),
-                             turbulent_kinetic_energy_equation = TurbulentKineticEnergyEquation{FT}(),
+    CATKEVerticalDiffusivity([time_discretization = VerticallyImplicitTimeDiscretization(),
+                             FT = Float64;]
+                             mixing_length = MixingLength(),
+                             turbulent_kinetic_energy_equation = TurbulentKineticEnergyEquation(),
                              maximum_diffusivity = Inf,
-                             minimum_turbulent_kinetic_energy = zero(FT),
+                             minimum_turbulent_kinetic_energy = 1e-6,
+                             minimum_convective_buoyancy_flux = 1e-8,
                              negative_turbulent_kinetic_energy_damping_time_scale = 1minute)
 
 Return the `CATKEVerticalDiffusivity` turbulence closure for vertical mixing by
 small-scale ocean turbulence based on the prognostic evolution of subgrid
 Turbulent Kinetic Energy (TKE).
 
+!!! note "CATKE vertical diffusivity"
+    `CATKEVerticalDiffusivity` is a new turbulence closure diffusivity. The default
+    values for its free parameters are obtained from calibration against large eddy
+    simulations. For more details please refer to [Wagner23catke](@cite).
+
+    Use with caution and report any issues with the physics at https://github.com/CliMA/Oceananigans.jl/issues.
+
+Arguments
+=========
+
+- `time_discretization`: Either `ExplicitTimeDiscretization()` or `VerticallyImplicitTimeDiscretization()`;
+                         default `VerticallyImplicitTimeDiscretization()`.
+
+- `FT`: Float type; default `Float64`.
+
+
 Keyword arguments
 =================
-  - `maximum_diffusivity`: Maximum value for tracer, momentum, and TKE diffusivities.
-                           Used to clip the diffusivity when/if CATKE predicts
-                           diffusivities that are too large.
-                           Default: `Inf`.
 
-  - `minimum_turbulent_kinetic_energy`: Minimum value for the turbulent kinetic energy.
-                                        Can be used to model the presence "background" TKE
-                                        levels due to, for example, mixing by breaking internal waves.
-                                        Default: 0.
+- `maximum_diffusivity`: Maximum value for tracer, momentum, and TKE diffusivities.
+                        Used to clip the diffusivity when/if CATKE predicts
+                        diffusivities that are too large.
+                        Default: `Inf`.
 
-  - `negative_turbulent_kinetic_energy_damping_time_scale`: Damping time-scale for spurious negative values of TKE,
-                                                            typically generated by oscillatory errors associated
-                                                            with TKE advection.
-                                                            Default: 1 minute.
+- `minimum_turbulent_kinetic_energy`: Minimum value for the turbulent kinetic energy.
+                                    Can be used to model the presence of "background" TKE
+                                    levels due to, for example, mixing by breaking internal waves.
+                                    Default: `1e-6`.
+
+- `negative_turbulent_kinetic_energy_damping_time_scale`: Damping time-scale for spurious negative values of TKE,
+                                                        typically generated by oscillatory errors associated
+                                                        with TKE advection.
+                                                        Default: 1 minute.
 
 Note that for numerical stability, it is recommended to either have a relative short
 `negative_turbulent_kinetic_energy_damping_time_scale` or a reasonable
 `minimum_turbulent_kinetic_energy`, or both.
 """
-CATKEVerticalDiffusivity(FT::DataType; kw...) = CATKEVerticalDiffusivity(VerticallyImplicitTimeDiscretization(), FT; kw...)
-
-const CATKEVD{TD} = CATKEVerticalDiffusivity{TD} where TD
-const CATKEVDArray{TD} = AbstractArray{<:CATKEVD{TD}} where TD
-const FlavorOfCATKE{TD} = Union{CATKEVD{TD}, CATKEVDArray{TD}} where TD
-
-include("mixing_length.jl")
-include("turbulent_kinetic_energy_equation.jl")
-
 function CATKEVerticalDiffusivity(time_discretization::TD = VerticallyImplicitTimeDiscretization(),
                                   FT = Float64;
                                   mixing_length = MixingLength(),
@@ -117,16 +136,7 @@ function CATKEVerticalDiffusivity(time_discretization::TD = VerticallyImplicitTi
                                   maximum_diffusivity = Inf,
                                   minimum_turbulent_kinetic_energy = 1e-6,
                                   minimum_convective_buoyancy_flux = 1e-8,
-                                  negative_turbulent_kinetic_energy_damping_time_scale = 1minute,
-                                  warning = true) where TD
-
-    if warning
-        @warn "CATKEVerticalDiffusivity is an experimental turbulence closure that \n" *
-              "is unvalidated and whose default parameters are not calibrated for \n" * 
-              "realistic ocean conditions or for use in a three-dimensional \n" *
-              "simulation. Use with caution and report bugs and problems with physics \n" *
-              "to https://github.com/CliMA/Oceananigans.jl/issues."
-    end
+                                  negative_turbulent_kinetic_energy_damping_time_scale = 1minute) where TD
 
     mixing_length = convert_eltype(FT, mixing_length)
     turbulent_kinetic_energy_equation = convert_eltype(FT, turbulent_kinetic_energy_equation)
@@ -137,7 +147,6 @@ function CATKEVerticalDiffusivity(time_discretization::TD = VerticallyImplicitTi
                                         FT(minimum_turbulent_kinetic_energy),
                                         FT(minimum_convective_buoyancy_flux),
                                         FT(negative_turbulent_kinetic_energy_damping_time_scale))
-                                  
 end
 
 function with_tracers(tracer_names, closure::FlavorOfCATKE)

From 00a5eba50022b2d3e9cafdfbda47f0f662f8bae9 Mon Sep 17 00:00:00 2001
From: "Navid C. Constantinou" <navidcy@users.noreply.github.com>
Date: Sun, 2 Jul 2023 15:14:49 +1000
Subject: [PATCH 388/530] some code cleanup

---
 .../CATKEVerticalDiffusivities.jl             | 33 +++++++++----------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
index 738ad75b15..39a7853688 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
@@ -53,22 +53,21 @@ struct CATKEVerticalDiffusivity{TD, CL, FT, TKE} <: AbstractScalarDiffusivity{TD
     negative_turbulent_kinetic_energy_damping_time_scale :: FT
 end
 
-function CATKEVerticalDiffusivity{TD}(mixing_length::CL,
-                                      turbulent_kinetic_energy_equation::TKE,
-                                      maximum_diffusivity::FT,
-                                      minimum_turbulent_kinetic_energy::FT,
-                                      minimum_convective_buoyancy_flux::FT,
-                                      negative_turbulent_kinetic_energy_damping_time_scale::FT) where {TD, CL, TKE, FT}
-
-    return CATKEVerticalDiffusivity{TD, CL, FT, TKE}(mixing_length,
-                                                     turbulent_kinetic_energy_equation,
-                                                     maximum_diffusivity,
-                                                     minimum_turbulent_kinetic_energy,
-                                                     minimum_convective_buoyancy_flux,
-                                                     negative_turbulent_kinetic_energy_damping_time_scale)
-end
-
-CATKEVerticalDiffusivity(FT::DataType; kw...) = CATKEVerticalDiffusivity(VerticallyImplicitTimeDiscretization(), FT; kw...)
+CATKEVerticalDiffusivity{TD}(mixing_length::CL,
+                             turbulent_kinetic_energy_equation::TKE,
+                             maximum_diffusivity::FT,
+                             minimum_turbulent_kinetic_energy::FT,
+                             minimum_convective_buoyancy_flux::FT,
+                             negative_turbulent_kinetic_energy_damping_time_scale::FT) where {TD, CL, TKE, FT} =
+    CATKEVerticalDiffusivity{TD, CL, FT, TKE}(mixing_length,
+                                              turbulent_kinetic_energy_equation,
+                                              maximum_diffusivity,
+                                              minimum_turbulent_kinetic_energy,
+                                              minimum_convective_buoyancy_flux,
+                                              negative_turbulent_kinetic_energy_damping_time_scale)
+
+CATKEVerticalDiffusivity(FT::DataType; kw...) =
+    CATKEVerticalDiffusivity(VerticallyImplicitTimeDiscretization(), FT; kw...)
 
 const CATKEVD{TD} = CATKEVerticalDiffusivity{TD} where TD
 const CATKEVDArray{TD} = AbstractArray{<:CATKEVD{TD}} where TD
@@ -183,7 +182,7 @@ catke_first(catke1::FlavorOfCATKE, catke2::FlavorOfCATKE) = error("Can't have tw
     N² = ∂z_b(i, j, k, grid, buoyancy, tracers)
     S² = ∂z_u² + ∂z_v²
     Ri = N² / S²
-    return ifelse(N² <= 0, zero(grid), Ri)
+    return ifelse(N² ≤ 0, zero(grid), Ri)
 end
 
 for S in (:MixingLength, :TurbulentKineticEnergyEquation)

From 04e603cef0e62eed6bf8f73d9290ce9e2428652c Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 3 Jul 2023 12:29:21 +0200
Subject: [PATCH 389/530] correct the example

---
 .../mpi_hydrostatic_turbulence.jl             | 23 +++++++++++++------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl b/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl
index cd189afa14..1210b9a315 100644
--- a/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl
+++ b/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl
@@ -18,6 +18,10 @@ function run_simulation(nx, ny, arch, topo)
 
     grid  = RectilinearGrid(arch; topology=topo, size=(nx, ny, 1), extent=(4π, 4π, 0.5), halo=(7, 7, 7))
 
+    bottom(x, y) = (x > π && x < 3π/2 && y > π/2 && y < 3π/2) ? 1.0 : - grid.Lz - 1.0
+
+    grid = ImmersedBoundaryGrid(grid, GridFittedBottom(bottom))
+
     local_rank = MPI.Comm_rank(MPI.COMM_WORLD)
 
     free_surface = SplitExplicitFreeSurface(; substeps = 10)
@@ -71,12 +75,12 @@ function visualize_simulation(var)
     y = Vector(undef, 4)
 
     for r in 1:4
-        v[r] = FieldTimeSeries("mpi_hydrostatic_turbulence_rank$(i-1).jld2", var))
+        v[r] = FieldTimeSeries("mpi_hydrostatic_turbulence_rank$(r-1).jld2", var)
         nx, ny, _ = size(v[r])
         V[r] = @lift(interior(v[r][$iter], 1:nx, 1:ny, 1))
 
-        x[r] = xnodes(v[i])
-        y[r] = ynodes(v[i])
+        x[r] = xnodes(v[r])
+        y[r] = ynodes(v[r])
     end
 
     fig = Figure()
@@ -114,7 +118,12 @@ ny = [56, 128-56][arch.local_index[2]]
 run_simulation(nx, ny, arch, topo)
 
 # Visualize the plane
-visualize_simulation("u")
-visualize_simulation("v")
-visualize_simulation("ζ")
-visualize_simulation("c")
+if MPI.Comm_rank(MPI.COMM_WORLD) == 0
+    visualize_simulation("u")
+    visualize_simulation("v")
+    visualize_simulation("ζ")
+    visualize_simulation("c")
+end
+
+MPI.Barrier(MPI.COMM_WORLD)
+MPI.Finalize()

From c8944b7350b5f66af1c4af9d21d3a27a254e0f3f Mon Sep 17 00:00:00 2001
From: "Gregory L. Wagner" <wagner.greg@gmail.com>
Date: Wed, 5 Jul 2023 09:49:15 -0600
Subject: [PATCH 390/530] Update
 src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl

Co-authored-by: Navid C. Constantinou <navidcy@users.noreply.github.com>
---
 src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl b/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
index 6d939cdb91..4a622c688f 100644
--- a/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
+++ b/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
@@ -86,7 +86,7 @@ end
 end
 
 @inline function ivd_lower_diagonal(i, j, k, grid, closure, K, id, ℓx, ℓy, ::Face, clock, Δt, νzᶜᶜᶜ)
-    k′ = k + 2 # Shift to adjust for Tridiagonal indexing convenction
+    k′ = k + 2 # Shift to adjust for Tridiagonal indexing convention
     closure_ij = getclosure(i, j, closure)  
     νᵏ⁻¹   = νzᶜᶜᶜ(i, j, k′-1, grid, closure_ij, K, clock)
     Δzᶜₖ   = Δz(i, j, k′,   grid, ℓx, ℓy, c)

From 603f50e62c21623675b81fc2eaf511ec7043e6c8 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 6 Jul 2023 09:41:29 +0200
Subject: [PATCH 391/530] bugfix

---
 src/Fields/interpolate.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/Fields/interpolate.jl b/src/Fields/interpolate.jl
index d5d723db2a..b3e52a7d56 100644
--- a/src/Fields/interpolate.jl
+++ b/src/Fields/interpolate.jl
@@ -1,6 +1,7 @@
 using Oceananigans.Grids: isxregular, isyregular, iszregular, 
                           xnodes, ynodes, znodes, 
                           λnodes, φnodes,
+                          λspacings, φspacings, zspacings,
                           topology, 
                           node,
                           isxflat, isyflat, iszflat,

From 2e06209ad7f44dfbe42a90080bbad279a2167a9a Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Thu, 6 Jul 2023 12:59:11 -0600
Subject: [PATCH 392/530] Refactor unit tests

---
 test/test_multi_region_unit.jl | 54 +++++++++++++++++++++-------------
 1 file changed, 34 insertions(+), 20 deletions(-)

diff --git a/test/test_multi_region_unit.jl b/test/test_multi_region_unit.jl
index f4884b0ec3..6ff4cce90c 100644
--- a/test/test_multi_region_unit.jl
+++ b/test/test_multi_region_unit.jl
@@ -1,3 +1,5 @@
+include("dependencies_for_runtests.jl")
+
 using Oceananigans.MultiRegion
 using Oceananigans.MultiRegion: reconstruct_global_grid, reconstruct_global_field, getnamewrapper
 using Oceananigans.ImmersedBoundaries: ImmersedBoundaryGrid, GridFittedBottom, GridFittedBoundary
@@ -8,47 +10,59 @@ devices(::GPU, num) = Tuple(0 for i in 1:num)
 @testset "Testing multi region grids" begin
     for arch in archs
 
-        region_num   = [2, 4, 5]
-        partitioning = [XPartition]
+        regions = [2, 4, 5]
+        partition_types = [XPartition]
+
+        lat_lon_grid = LatitudeLongitudeGrid(arch,
+                                             size = (20, 20, 1),
+                                             latitude = (-80, 80),
+                                             longitude = collect(range(-180, 180, length=21)),
+                                             z = (0, 1))
 
-        grids = [LatitudeLongitudeGrid(arch, size=(20, 20, 1), latitude=(-80, 80), longitude=collect(range(-180, 180, length=21)), z=(0, 1)),
-                RectilinearGrid(arch, size=(20, 20, 1), x=(0, 1), y=collect(range(0, 1, length=21)), z=(0, 1))]
+        rectilinear_grid = RectilinearGrid(arch,
+                                           size = (20, 20, 1),
+                                           x = (0, 1),
+                                           y = collect(range(0, 1, length=21)),
+                                           z = (0, 1))
+                 
+        grids = [lat_lon_grid, rectilinear_grid]
 
-        immersed_boundaries = [GridFittedBottom((x, y)->0.5),
+        immersed_boundaries = [GridFittedBottom((x, y) -> 0.5),
                                GridFittedBottom(arch_array(arch, [0.5 for i in 1:20, j in 1:20])),
-                               GridFittedBoundary((x, y, z)->z>0.5),
+                               GridFittedBoundary((x, y, z) -> z>0.5),
                                GridFittedBoundary(arch_array(arch, [false for i in 1:20, j in 1:20, k in 1:1]))]
         
-        for grid in grids, P in partitioning, regions in region_num
-            @info "Testing multi region $(getnamewrapper(grid)) on $regions $(P)s"
-            mrg = MultiRegionGrid(grid, partition = P(regions), devices = devices(arch, regions))
+        for grid in grids, Partition in partition_types, region in regions
+            @info "Testing multi region $(getnamewrapper(grid)) on $region $(Partition)s"
+            mrg = MultiRegionGrid(grid, partition = Partition(region), devices = devices(arch, region))
 
             @test reconstruct_global_grid(mrg) == grid
 
             for FieldType in [CenterField, XFaceField, YFaceField]
-                @info "Testing multi region $(FieldType) on $(getnamewrapper(grid)) on $regions $(P)s"
+                @info "Testing multi region $(FieldType) on $(getnamewrapper(grid)) on $region $(Partition)s"
 
-                par_field = FieldType(mrg)
-                ser_field = FieldType(grid)
+                multi_region_field = FieldType(mrg)
+                single_region_field = FieldType(grid)
 
-                set!(ser_field, (x, y, z) -> x)
-                @apply_regionally set!(par_field, (x, y, z) -> x)
+                set!(single_region_field, (x, y, z) -> x)
+                @apply_regionally set!(multi_region_field, (x, y, z) -> x)
 
-                fill_halo_regions!(ser_field)
-                fill_halo_regions!(par_field)
+                fill_halo_regions!(single_region_field)
+                fill_halo_regions!(multi_region_field)
 
-                rec_field = reconstruct_global_field(par_field)
+                reconstructed_field = reconstruct_global_field(multi_region_field)
 
-                @test all(Array(rec_field.data.parent) .≈ Array(ser_field.data.parent))
+                @test parent(reconstructed_field) ≈ parent(single_region_field)
             end
 
             for immersed_boundary in immersed_boundaries
-                @info "Testing multi region immersed boundaries on $(getnamewrapper(grid)) on $regions $(P)s"
+                @info "Testing multi region immersed boundaries on $(getnamewrapper(grid)) on $region $(Partition)s"
                 ibg = ImmersedBoundaryGrid(grid, immersed_boundary)
-                mrg = MultiRegionGrid(ibg, partition = P(regions), devices = devices(arch, regions))
+                mrg = MultiRegionGrid(ibg, partition = Partition(region), devices = devices(arch, region))
 
                 @test on_architecture(arch, reconstruct_global_grid(mrg)) == ibg
             end
         end
     end
 end
+

From c7245374a9ca45f8d08f74e2f496a6675d52bc13 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Fri, 7 Jul 2023 15:44:26 -0400
Subject: [PATCH 393/530] Generalize regridding for lat-lon

---
 src/Fields/regridding_fields.jl | 81 +++++++++++++++++----------------
 1 file changed, 42 insertions(+), 39 deletions(-)

diff --git a/src/Fields/regridding_fields.jl b/src/Fields/regridding_fields.jl
index 110e3289e5..580ee9c3cf 100644
--- a/src/Fields/regridding_fields.jl
+++ b/src/Fields/regridding_fields.jl
@@ -78,6 +78,7 @@ function we_can_regrid_in_x(a, target_grid, source_grid, b)
 end
 
 function regrid_in_z!(a, target_grid, source_grid, b)
+    location(a, 3) == Center || throw(ArgumentError("Can only regrid fields in z with Center z-locations."))
     arch = architecture(a)
     source_z_faces = znodes(source_grid, f)
     launch!(arch, target_grid, :xy, _regrid_in_z!, a, b, target_grid, source_grid, source_z_faces)
@@ -86,16 +87,18 @@ function regrid_in_z!(a, target_grid, source_grid, b)
 end
 
 function regrid_in_y!(a, target_grid, source_grid, b)
+    location(a, 2) == Center || throw(ArgumentError("Can only regrid fields in y with Center y-locations."))
     arch = architecture(a)
-    source_y_faces = ynodes(source_grid, f)
+    source_y_faces = nodes(source_grid, c, f, c)[2]
     Nx_source_faces = size(source_grid, (Face, Center, Center), 1)
     launch!(arch, target_grid, :xz, _regrid_in_y!, a, b, target_grid, source_grid, source_y_faces, Nx_source_faces)
     return a
 end
 
 function regrid_in_x!(a, target_grid, source_grid, b)
+    location(a, 1) == Center || throw(ArgumentError("Can only regrid fields in x with Center x-locations."))
     arch = architecture(a)
-    source_x_faces = xnodes(source_grid, f)
+    source_x_faces = nodes(source_grid, f, c, c)[1]
     Ny_source_faces = size(source_grid, (Center, Face, Center), 2)
     launch!(arch, target_grid, :yz, _regrid_in_x!, a, b, target_grid, source_grid, source_x_faces, Ny_source_faces)
     return a
@@ -138,8 +141,8 @@ end
 
     fo = ForwardOrdering()
 
-    @unroll for k = 1:target_grid.Nz
-        @inbounds target_field[i, j, k] = 0
+    @inbounds @unroll for k = 1:target_grid.Nz
+        target_field[i, j, k] = 0
 
         z₋ = znode(i, j, k,   target_grid, c, c, f)
         z₊ = znode(i, j, k+1, target_grid, c, c, f)
@@ -154,11 +157,11 @@ end
             # lies entirely within the source cell j₊_src (ie, we are _refining_
             # rather than coarse graining). In this case our job is easy:
             # the target cell concentration is equal to the source concentration.
-            @inbounds target_field[i, j, k] = source_field[i_src, j_src, k₊_src]
+            target_field[i, j, k] = source_field[i_src, j_src, k₊_src]
         else
             # Add contribution from all full cells in the integration range
             @unroll for k_src = k₋_src:k₊_src-1
-                @inbounds target_field[i, j, k] += source_field[i_src, j_src, k_src] * Δzᶜᶜᶜ(i_src, j_src, k_src, source_grid)
+                target_field[i, j, k] += source_field[i_src, j_src, k_src] * Δzᶜᶜᶜ(i_src, j_src, k_src, source_grid)
             end
 
             zk₋_src = znode(i_src, j_src, k₋_src, source_grid, c, c, f)
@@ -167,16 +170,16 @@ end
             # Add contribution to integral from fractional left part of the source field,
             # if that region is a part of the grid.
             if k₋_src > 1
-                @inbounds target_field[i, j, k] += source_field[i_src, j_src, k₋_src - 1] * (zk₋_src - z₋)
+                target_field[i, j, k] += source_field[i_src, j_src, k₋_src - 1] * (zk₋_src - z₋)
             end
 
             # Add contribution to integral from fractional right part of the source field, if that
             # region is part of the grid.
             if k₊_src < source_grid.Nz+1
-                @inbounds target_field[i, j, k] += source_field[i_src, j_src, k₊_src] * (z₊ - zk₊_src)
+                target_field[i, j, k] += source_field[i_src, j_src, k₊_src] * (z₊ - zk₊_src)
             end
 
-            @inbounds target_field[i, j, k] /= Δzᶜᶜᶜ(i, j, k, target_grid)
+            target_field[i, j, k] /= Δzᶜᶜᶜ(i, j, k, target_grid)
         end
     end
 end
@@ -193,11 +196,11 @@ end
 
     fo = ForwardOrdering()
 
-    @unroll for j = 1:target_grid.Ny
-        @inbounds target_field[i, j, k] = 0
+    @inbounds @unroll for j = 1:target_grid.Ny
+        target_field[i, j, k] = 0
 
-        y₋ = ynode(i, j,   k, target_grid, c, f, c)
-        y₊ = ynode(i, j+1, k, target_grid, c, f, c)
+        y₋ = node(i, j,   k, target_grid, c, f, c)[2]
+        y₊ = node(i, j+1, k, target_grid, c, f, c)[2]
 
         # Integrate source field from y₋ to y₊
         j₋_src = searchsortedfirst(source_y_faces, y₋, 1, Ny_source+1, fo)
@@ -209,15 +212,15 @@ end
             # lies entirely within the source cell j₊_src (ie, we are _refining_
             # rather than coarse graining). In this case our job is easy:
             # the target cell concentration is equal to the source concentration.
-            @inbounds target_field[i, j, k] = source_field[i_src, j₊_src, k_src]
+            target_field[i, j, k] = source_field[i_src, j₊_src, k_src]
         else
             # Add contribution from all full cells in the integration range
             @unroll for j_src = j₋_src:j₊_src-1
-                @inbounds target_field[i, j, k] += source_field[i_src, j_src, k_src] * Azᶜᶜᶜ(i_src, j_src, k_src, source_grid)
+                target_field[i, j, k] += source_field[i_src, j_src, k_src] * Azᶜᶜᶜ(i_src, j_src, k_src, source_grid)
             end
 
-            yj₋_src = ynode(i_src, j₋_src, k_src, source_grid, c, f, c)
-            yj₊_src = ynode(i_src, j₊_src, k_src, source_grid, c, f, c)
+            yj₋_src = node(i_src, j₋_src, k_src, source_grid, c, f, c)[2]
+            yj₊_src = node(i_src, j₊_src, k_src, source_grid, c, f, c)[2]
 
             # Add contribution to integral from fractional left part,
             # if that region is a part of the grid.
@@ -225,25 +228,25 @@ end
             if j₋_src > 1
                 j_left = j₋_src - 1
 
-                x₁ = xnode(i_src,  source_grid, f)
-                x₂ = xnode(i⁺_src, source_grid, f)
+                x₁ = node(i_src,  j_left, k_src, source_grid, f, c, c)[1]
+                x₂ = node(i⁺_src, j_left, k_src, source_grid, f, c, c)[1]
                 Az_left = fractional_horizontal_area(source_grid, x₁, x₂, y₋, yj₋_src)
 
-                @inbounds target_field[i, j, k] += source_field[i_src, j_left, k_src] * Az_left
+                target_field[i, j, k] += source_field[i_src, j_left, k_src] * Az_left
             end
 
             # Similar to above, add contribution to integral from fractional right part.
             if j₊_src < source_grid.Ny+1
                 j_right = j₊_src
 
-                x₁ = xnode(i_src,  source_grid, f)
-                x₂ = xnode(i⁺_src, source_grid, f)
+                x₁ = node(i_src,  j_right, k_src, source_grid, f, c, c)[1]
+                x₂ = node(i⁺_src, j_right, k_src, source_grid, f, c, c)[1]
                 Az_right = fractional_horizontal_area(source_grid, x₁, x₂, yj₊_src, y₊)
 
-                @inbounds target_field[i, j, k] += source_field[i_src, j_right, k_src] * Az_right
+                target_field[i, j, k] += source_field[i_src, j_right, k_src] * Az_right
             end
 
-            @inbounds target_field[i, j, k] /= Azᶜᶜᶜ(i, j, k, target_grid)
+            target_field[i, j, k] /= Azᶜᶜᶜ(i, j, k, target_grid)
         end
     end
 end
@@ -260,12 +263,12 @@ end
 
     fo = ForwardOrdering()
 
-    @unroll for i = 1:target_grid.Nx
-        @inbounds target_field[i, j, k] = 0
+    @inbounds @unroll for i = 1:target_grid.Nx
+        target_field[i, j, k] = 0
 
         # Integrate source field from x₋ to x₊
-        x₋ = xnode(i,   j, k, target_grid, f, c, c)
-        x₊ = xnode(i+1, j, k, target_grid, f, c, c)
+        x₋ = node(i,   j, k, target_grid, f, c, c)[1]
+        x₊ = node(i+1, j, k, target_grid, f, c, c)[1]
 
         # The first face on the source grid that appears inside the target cell
         i₋_src = searchsortedfirst(source_x_faces, x₋, 1, Nx_source+1, fo)
@@ -279,20 +282,20 @@ end
             # lies entirely within the source cell i₊_src (ie, we are _refining_
             # rather than coarse graining). In this case our job is easy:
             # the target cell concentration is equal to the source concentration.
-            @inbounds target_field[i, j, k] = source_field[i₊_src, j_src, k_src]
+            target_field[i, j, k] = source_field[i₊_src, j_src, k_src]
         else
             # Otherwise, our job is a little bit harder and we have to carefully, conservatively
             # sum up all the contributions from the source field to the target cell.
             
             # First we add up all the contributions from all source cells that lie entirely within the target cell.
             @unroll for i_src = i₋_src:i₊_src-1
-                @inbounds target_field[i, j, k] += source_field[i_src, j_src, k_src] * Azᶜᶜᶜ(i_src, j_src, k_src, source_grid)
+                target_field[i, j, k] += source_field[i_src, j_src, k_src] * Azᶜᶜᶜ(i_src, j_src, k_src, source_grid)
             end
     
             # Next, we add contributions from the "fractional" source cells on the right
             # and left of the target cell.
-            xi₋_src = xnode(i₋_src, j_src, k_src, source_grid, f, c, c)
-            xi₊_src = xnode(i₊_src, j_src, k_src, source_grid, f, c, c)
+            xi₋_src = node(i₋_src, j_src, k_src, source_grid, f, c, c)[1]
+            xi₊_src = node(i₊_src, j_src, k_src, source_grid, f, c, c)[1]
     
             # Add contribution to integral from fractional left part,
             # if that region is a part of the grid.
@@ -300,25 +303,25 @@ end
             if i₋_src > 1
                 i_left = i₋_src - 1
                 
-                y₁ = ynode(j_src,  source_grid, f) 
-                y₂ = ynode(j⁺_src, source_grid, f) 
+                y₁ = node(i_left, j_src,  k_src, source_grid, c, f, c)[2]
+                y₂ = node(i_left, j⁺_src, k_src, source_grid, c, f, c)[2] 
                 Az_left = fractional_horizontal_area(source_grid, x₋, xi₋_src, y₁, y₂)
 
-                @inbounds target_field[i, j, k] += source_field[i_left, j_src, k_src] * Az_left
+                target_field[i, j, k] += source_field[i_left, j_src, k_src] * Az_left
             end
     
             # Similar to above, add contribution to integral from fractional right part.
             if i₊_src < source_grid.Nx+1
                 i_right = i₊_src
 
-                y₁ = ynode(j_src,  source_grid, f)
-                y₂ = ynode(j⁺_src, source_grid, f)
+                y₁ = node(i_right, j_src,  k_src, source_grid, c, f, c)[2]
+                y₂ = node(i_right, j⁺_src, k_src, source_grid, c, f, c)[2]
                 Az_right = fractional_horizontal_area(source_grid, xi₊_src, x₊, y₁, y₂)
 
-                @inbounds target_field[i, j, k] += source_field[i_right, j_src, k_src] * Az_right
+                target_field[i, j, k] += source_field[i_right, j_src, k_src] * Az_right
             end
     
-            @inbounds target_field[i, j, k] /= Azᶜᶜᶜ(i, j, k, target_grid)
+            target_field[i, j, k] /= Azᶜᶜᶜ(i, j, k, target_grid)
         end
     end
 end

From 9069bf4704492fdc0bff00725c6f5965beed6bcb Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 10 Jul 2023 09:37:55 +0200
Subject: [PATCH 394/530] bugfix

---
 .../calculate_hydrostatic_free_surface_tendencies.jl          | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index 45df32d3a3..0b7029ded9 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -265,7 +265,7 @@ end
 
 @kernel function calculate_hydrostatic_free_surface_Gc!(Gc, grid::ActiveCellsIBG, args)
     idx = @index(Global, Linear)
-    i, j, k = active_linear_index_to_ntuple(idx, grid)
+    i, j, k = active_linear_index_to_interior_tuple(idx, grid)
     @inbounds Gc[i, j, k] = hydrostatic_free_surface_tracer_tendency(i, j, k, grid, args...)
 end
 
@@ -277,7 +277,7 @@ end
 
 @kernel function calculate_hydrostatic_free_surface_Ge!(Ge, grid::ActiveCellsIBG, args)
     idx = @index(Global, Linear)
-    i, j, k = active_linear_index_to_ntuple(idx, grid)
+    i, j, k = active_linear_index_to_interior_tuple(idx, grid)
     @inbounds Ge[i, j, k] = hydrostatic_turbulent_kinetic_energy_tendency(i, j, k, grid, args...)
 end
 

From 40e87b51d48043027ffce4c3dad0a57fcfaeb269 Mon Sep 17 00:00:00 2001
From: Gregory Wagner <wagner.greg@gmail.com>
Date: Mon, 10 Jul 2023 14:44:29 -0400
Subject: [PATCH 395/530] Add newline

---
 src/Grids/latitude_longitude_grid.jl | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/Grids/latitude_longitude_grid.jl b/src/Grids/latitude_longitude_grid.jl
index b598cb5b6f..3b9689a728 100644
--- a/src/Grids/latitude_longitude_grid.jl
+++ b/src/Grids/latitude_longitude_grid.jl
@@ -573,7 +573,6 @@ end
 
 return_metrics(::LatitudeLongitudeGrid) = (:λᶠᵃᵃ, :λᶜᵃᵃ, :φᵃᶠᵃ, :φᵃᶜᵃ, :zᵃᵃᶠ, :zᵃᵃᶜ)
 
-
 #####
 ##### Grid nodes
 #####
@@ -620,7 +619,6 @@ end
 @inline xnodes(grid::LatLonGrid, ℓx, ℓy, ℓz; with_halos=false) = xnodes(grid, ℓx, ℓy; with_halos)
 @inline ynodes(grid::LatLonGrid, ℓx, ℓy, ℓz; with_halos=false) = ynodes(grid, ℓy; with_halos)
 
-
 @inline node(i, j, k, grid::LatLonGrid, ℓx, ℓy, ℓz) = (λnode(i, j, k, grid, ℓx, ℓy, ℓz),
                                                        φnode(i, j, k, grid, ℓx, ℓy, ℓz),
                                                        znode(i, j, k, grid, ℓx, ℓy, ℓz))
@@ -692,7 +690,6 @@ end
 @inline yspacings(grid::LatLonGrid, ℓx, ℓy, ℓz; kwargs...) = yspacings(grid, ℓx, ℓy; kwargs...)
 @inline zspacings(grid::LatLonGrid, ℓx, ℓy, ℓz; kwargs...) = zspacings(grid, ℓz; kwargs...)
 
-
 #####
 ##### Grid spacings in λ, φ (in degrees)
 #####
@@ -729,4 +726,4 @@ end
 
 @inline isxregular(::LatitudeLongitudeGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:Number}) = true
 @inline isyregular(::LatitudeLongitudeGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:Number}) = true
-@inline iszregular(::LatitudeLongitudeGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:Number}) = true
\ No newline at end of file
+@inline iszregular(::LatitudeLongitudeGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:Number}) = true

From 19bc3dd46f361aeae9b802ba6d49d17122156842 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 12 Jul 2023 12:51:33 +0000
Subject: [PATCH 396/530] small correction

---
 ...static_free_surface_boundary_tendencies.jl |  3 +-
 .../hydrostatic_free_surface_model.jl         | 28 ++++++++-----------
 2 files changed, 12 insertions(+), 19 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_boundary_tendencies.jl
index efb7937223..0940831582 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_boundary_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_boundary_tendencies.jl
@@ -6,9 +6,8 @@ using Oceananigans.Models.NonhydrostaticModels: boundary_tendency_kernel_paramet
                                                 boundary_κ_kernel_parameters,
                                                 boundary_parameters
 
-import Oceananigans.Models.NonhydrostaticModels: compute_boundary_tendencies!
+import Oceananigans.Distributed: compute_boundary_tendencies!
 
-                                
 # We assume here that top/bottom BC are always synched (no partitioning in z)
 function compute_boundary_tendencies!(model::HydrostaticFreeSurfaceModel)
     grid = model.grid
diff --git a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl
index 7b7fc30b5b..864b61fc23 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl
@@ -1,7 +1,7 @@
 using CUDA: has_cuda
 using OrderedCollections: OrderedDict
 
-
+using Oceananigans.Distributed
 using Oceananigans.Architectures: AbstractArchitecture, GPU
 using Oceananigans.Advection: AbstractAdvectionScheme, CenteredSecondOrder, VectorInvariant
 using Oceananigans.BuoyancyModels: validate_buoyancy, regularize_buoyancy, SeawaterBuoyancy, g_Earth
@@ -22,12 +22,6 @@ using Oceananigans.Utils: tupleit
 import Oceananigans: initialize!
 import Oceananigans.Models: total_velocities
 
-""" Returns a default_tracer_advection, tracer_advection `tuple`. """
-validate_tracer_advection(invalid_tracer_advection, grid) = error("$invalid_tracer_advection is invalid tracer_advection!")
-validate_tracer_advection(tracer_advection_tuple::NamedTuple, grid) = CenteredSecondOrder(), tracer_advection_tuple
-validate_tracer_advection(tracer_advection::AbstractAdvectionScheme, grid) = tracer_advection, NamedTuple()
-validate_tracer_advection(tracer_advection::Nothing, grid) = nothing, NamedTuple()
-
 PressureField(grid) = (; pHY′ = CenterField(grid))
 
 const ParticlesOrNothing = Union{Nothing, AbstractLagrangianParticles}
@@ -172,6 +166,7 @@ function HydrostaticFreeSurfaceModel(; grid,
 
     @apply_regionally validate_velocity_boundary_conditions(grid, velocities)
 
+    free_surface = validate_free_surface(arch, free_surface)
     free_surface = FreeSurface(free_surface, velocities, grid)
 
     # Instantiate timestepper if not already instantiated
@@ -212,23 +207,22 @@ function validate_vertical_velocity_boundary_conditions(w)
     return nothing
 end
 
-momentum_advection_squawk(momentum_advection, grid) = error("$(typeof(momentum_advection)) is not supported with $(typeof(grid))")
-
-function momentum_advection_squawk(momentum_advection, ::AbstractHorizontallyCurvilinearGrid) 
-    @warn "The $(summary(momentum_advection)) momentum advection scheme is not allowed on curvilinear grids. " * 
-          "The momentum advection scheme has been set to VectorInvariant()"
-    return VectorInvariant()
-end
+validate_free_surface(::DistributedArch, free_surface::SplitExplicitFreeSurface) = free_surface
+validate_free_surface(arch::DistributedArch, free_surface) = error("$(typeof(free_surface)) is not supported with $(typeof(arch))")
+validate_free_surface(arch, free_surface) = free_surface
 
 validate_momentum_advection(momentum_advection, ibg::ImmersedBoundaryGrid) = validate_momentum_advection(momentum_advection, ibg.underlying_grid)
-
 validate_momentum_advection(momentum_advection, grid::RectilinearGrid)                     = momentum_advection
 validate_momentum_advection(momentum_advection, grid::AbstractHorizontallyCurvilinearGrid) = momentum_advection
-
 validate_momentum_advection(momentum_advection::Nothing,         grid::OrthogonalSphericalShellGrid) = momentum_advection
 validate_momentum_advection(momentum_advection::VectorInvariant, grid::OrthogonalSphericalShellGrid) = momentum_advection
+validate_momentum_advection(momentum_advection, grid::OrthogonalSphericalShellGrid) = error("$(typeof(momentum_advection)) is not supported with $(typeof(grid))")
 
-validate_momentum_advection(momentum_advection, grid::OrthogonalSphericalShellGrid) = momentum_advection_squawk(momentum_advection, grid)
+""" Returns a default_tracer_advection, tracer_advection `tuple`. """
+validate_tracer_advection(invalid_tracer_advection, grid) = error("$invalid_tracer_advection is invalid tracer_advection!")
+validate_tracer_advection(tracer_advection_tuple::NamedTuple, grid) = CenteredSecondOrder(), tracer_advection_tuple
+validate_tracer_advection(tracer_advection::AbstractAdvectionScheme, grid) = tracer_advection, NamedTuple()
+validate_tracer_advection(tracer_advection::Nothing, grid) = nothing, NamedTuple()
 
 initialize!(model::HydrostaticFreeSurfaceModel) = initialize_free_surface!(model.free_surface, model.grid, model.velocities)
 initialize_free_surface!(free_surface, grid, velocities) = nothing

From 54f273c76e8bfd8c6869dc3e5a869557f6398828 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 12 Jul 2023 13:03:49 +0000
Subject: [PATCH 397/530] new tests

---
 .../upwind_biased_advective_fluxes.jl         | 101 ++++++++++--------
 src/Advection/vector_invariant_advection.jl   |  12 +--
 .../vector_invariant_cross_upwinding.jl       |  12 +--
 .../vector_invariant_self_upwinding.jl        |  42 +++-----
 .../vector_invariant_velocity_upwinding.jl    |  34 ++----
 5 files changed, 90 insertions(+), 111 deletions(-)

diff --git a/src/Advection/upwind_biased_advective_fluxes.jl b/src/Advection/upwind_biased_advective_fluxes.jl
index 701c266519..003631be07 100644
--- a/src/Advection/upwind_biased_advective_fluxes.jl
+++ b/src/Advection/upwind_biased_advective_fluxes.jl
@@ -9,6 +9,27 @@ const UpwindScheme = AbstractUpwindBiasedAdvectionScheme
 
 @inline upwind_biased_product(ũ, ψᴸ, ψᴿ) = ((ũ + abs(ũ)) * ψᴸ + (ũ - abs(ũ)) * ψᴿ) / 2
 
+@inline sign_val(u) = Val(u < 0 ? -1 : 1) # u == 0 (zero flux) takes the left-biased branch; Val{0} has no method
+
+# Upwind interpolate -> choose _left_biased if u > 0 and _right_biased if u < 0
+for (d, ξ) in enumerate((:x, :y, :z))
+    code = [:ᵃ, :ᵃ, :ᵃ]
+
+    for loc in (:ᶜ, :ᶠ)
+        code[d] = loc
+        second_order_interp = Symbol(:ℑ, ξ, code...)
+        alt_interp       = Symbol(:_upwind_interpolate_, ξ, code...)
+        alt_left_interp  = Symbol(:_left_biased_interpolate_, ξ, code...)
+        alt_right_interp = Symbol(:_right_biased_interpolate_, ξ, code...)
+
+        @eval begin
+            @inline $alt_interp(i, j, k, grid, u, args...) = $alt_interp(i, j, k, grid, sign_val(u), args...)
+            @inline $alt_interp(i, j, k, grid, ::Val{1},  args...) =  $alt_left_interp(i, j, k, grid, args...)
+            @inline $alt_interp(i, j, k, grid, ::Val{-1}, args...) = $alt_right_interp(i, j, k, grid, args...)
+        end
+    end
+end
+
 #####
 ##### Momentum advection operators
 #####
@@ -17,83 +38,74 @@ const UpwindScheme = AbstractUpwindBiasedAdvectionScheme
 
 @inline function advective_momentum_flux_Uu(i, j, k, grid, scheme::UpwindScheme, U, u)
 
-    ũ  =    _symmetric_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, Ax_qᶠᶜᶜ, U)
-    uᴸ =  _left_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, u)
-    uᴿ = _right_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, u)
+    ũ  = _symmetric_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, Ax_qᶠᶜᶜ, U)
+    uᴿ =    _upwind_interpolate_xᶜᵃᵃ(i, j, k, grid, ũ, scheme, u)
 
-    return upwind_biased_product(ũ, uᴸ, uᴿ)
+    return ũ * uᴿ
 end
 
 @inline function advective_momentum_flux_Vu(i, j, k, grid, scheme::UpwindScheme, V, u)
 
-    ṽ  =    _symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, Ay_qᶜᶠᶜ, V)
-    uᴸ =  _left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, u)
-    uᴿ = _right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, u)
+    ṽ  = _symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, Ay_qᶜᶠᶜ, V)
+    uᴿ =    _upwind_interpolate_yᵃᶠᵃ(i, j, k, grid, ṽ, scheme, u)
 
-    return upwind_biased_product(ṽ, uᴸ, uᴿ)
+    return ṽ * uᴿ
 end
 
 @inline function advective_momentum_flux_Wu(i, j, k, grid, scheme::UpwindScheme, W, u)
 
-    w̃  =    _symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, Az_qᶜᶜᶠ, W)
-    uᴸ =  _left_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, u)
-    uᴿ = _right_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, u)
+    w̃  = _symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, Az_qᶜᶜᶠ, W)
+    uᴿ =    _upwind_interpolate_zᵃᵃᶠ(i, j, k, grid, w̃, scheme, u)
 
-    return upwind_biased_product(w̃, uᴸ, uᴿ)
+    return w̃ * uᴿ
 end
 
 @inline function advective_momentum_flux_Uv(i, j, k, grid, scheme::UpwindScheme, U, v)
 
-    ũ  =    _symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, Ax_qᶠᶜᶜ, U)
-    vᴸ =  _left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, v)
-    vᴿ = _right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, v)
+    ũ  = _symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, Ax_qᶠᶜᶜ, U)
+    vᴿ =    _upwind_interpolate_xᶠᵃᵃ(i, j, k, grid, ũ, scheme, v)
  
-    return upwind_biased_product(ũ, vᴸ, vᴿ)
+    return ũ * vᴿ
 end
 
 @inline function advective_momentum_flux_Vv(i, j, k, grid, scheme::UpwindScheme, V, v)
 
-    ṽ  =    _symmetric_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, Ay_qᶜᶠᶜ, V)
-    vᴸ =  _left_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, v)
-    vᴿ = _right_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, v)
+    ṽ  = _symmetric_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, Ay_qᶜᶠᶜ, V)
+    vᴿ =    _upwind_interpolate_yᵃᶜᵃ(i, j, k, grid, ṽ, scheme, v)
 
-    return upwind_biased_product(ṽ, vᴸ, vᴿ)
+    return ṽ * vᴿ
 end
 
 @inline function advective_momentum_flux_Wv(i, j, k, grid, scheme::UpwindScheme, W, v)
 
-    w̃  =    _symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, Az_qᶜᶜᶠ, W)
-    vᴸ =  _left_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, v)
-    vᴿ = _right_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, v)
+    w̃  = _symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, Az_qᶜᶜᶠ, W)
+    vᴿ =    _upwind_interpolate_zᵃᵃᶠ(i, j, k, grid, w̃, scheme, v)
 
-    return upwind_biased_product(w̃, vᴸ, vᴿ)
+    return w̃ * vᴿ
 end
 
 @inline function advective_momentum_flux_Uw(i, j, k, grid, scheme::UpwindScheme, U, w)
 
-    ũ  =    _symmetric_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, Ax_qᶠᶜᶜ, U)
-    wᴸ =  _left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, w)
-    wᴿ = _right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, w)
+    ũ  = _symmetric_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, Ax_qᶠᶜᶜ, U)
+    wᴿ =    _upwind_interpolate_xᶠᵃᵃ(i, j, k, grid, ũ, scheme, w)
 
-    return upwind_biased_product(ũ, wᴸ, wᴿ)
+    return ũ * wᴿ
 end
 
 @inline function advective_momentum_flux_Vw(i, j, k, grid, scheme::UpwindScheme, V, w)
 
-    ṽ  =    _symmetric_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, Ay_qᶜᶠᶜ, V)
-    wᴸ =  _left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, w)
-    wᴿ = _right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, w)
+    ṽ  = _symmetric_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, Ay_qᶜᶠᶜ, V)
+    wᴿ =    _upwind_interpolate_yᵃᶠᵃ(i, j, k, grid, ṽ, scheme, w)
 
-    return upwind_biased_product(ṽ, wᴸ, wᴿ)
+    return ṽ * wᴿ
 end
 
 @inline function advective_momentum_flux_Ww(i, j, k, grid, scheme::UpwindScheme, W, w)
 
-    w̃  =    _symmetric_interpolate_zᵃᵃᶜ(i, j, k, grid, scheme, Az_qᶜᶜᶠ, W)
-    wᴸ =  _left_biased_interpolate_zᵃᵃᶜ(i, j, k, grid, scheme, w)
-    wᴿ = _right_biased_interpolate_zᵃᵃᶜ(i, j, k, grid, scheme, w)
+    w̃  = _symmetric_interpolate_zᵃᵃᶜ(i, j, k, grid, scheme, Az_qᶜᶜᶠ, W)
+    wᴿ =    _upwind_interpolate_zᵃᵃᶜ(i, j, k, grid, w̃, scheme, w)
 
-    return upwind_biased_product(w̃, wᴸ, wᴿ)
+    return w̃ * wᴿ
 end
 
 #####
@@ -103,26 +115,23 @@ end
 @inline function advective_tracer_flux_x(i, j, k, grid, scheme::UpwindScheme, U, c) 
 
     @inbounds ũ = U[i, j, k]
-    cᴸ =  _left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, c)
-    cᴿ = _right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, c)
+    cᴿ =_upwind_interpolate_xᶠᵃᵃ(i, j, k, grid, ũ, scheme, c)
 
-    return Axᶠᶜᶜ(i, j, k, grid) * upwind_biased_product(ũ, cᴸ, cᴿ)
+    return Axᶠᶜᶜ(i, j, k, grid) * ũ * cᴿ
 end
 
 @inline function advective_tracer_flux_y(i, j, k, grid, scheme::UpwindScheme, V, c)
 
     @inbounds ṽ = V[i, j, k]
-    cᴸ =  _left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, c)
-    cᴿ = _right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, c)
+    cᴿ =_upwind_interpolate_yᵃᶠᵃ(i, j, k, grid, ṽ, scheme, c)
 
-    return Ayᶜᶠᶜ(i, j, k, grid) * upwind_biased_product(ṽ, cᴸ, cᴿ)
+    return Ayᶜᶠᶜ(i, j, k, grid) * ṽ * cᴿ
 end
 
 @inline function advective_tracer_flux_z(i, j, k, grid, scheme::UpwindScheme, W, c)
 
     @inbounds w̃ = W[i, j, k]
-    cᴸ =  _left_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, c)
-    cᴿ = _right_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, c)
+    cᴿ =_upwind_interpolate_zᵃᵃᶠ(i, j, k, grid, w̃, scheme, c)
 
-    return Azᶜᶜᶠ(i, j, k, grid) * upwind_biased_product(w̃, cᴸ, cᴿ) 
-end
+    return Azᶜᶜᶠ(i, j, k, grid) * w̃ * cᴿ
+end
\ No newline at end of file
diff --git a/src/Advection/vector_invariant_advection.jl b/src/Advection/vector_invariant_advection.jl
index 731cba588a..04667674d9 100644
--- a/src/Advection/vector_invariant_advection.jl
+++ b/src/Advection/vector_invariant_advection.jl
@@ -217,10 +217,9 @@ end
     Sζ = scheme.vorticity_stencil
 
     @inbounds v̂ = ℑxᶠᵃᵃ(i, j, k, grid, ℑyᵃᶜᵃ, Δx_qᶜᶠᶜ, v) / Δxᶠᶜᶜ(i, j, k, grid) 
-    ζᴸ =  _left_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, scheme.vorticity_scheme, ζ₃ᶠᶠᶜ, Sζ, u, v)
-    ζᴿ = _right_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, scheme.vorticity_scheme, ζ₃ᶠᶠᶜ, Sζ, u, v)
-
-    return - upwind_biased_product(v̂, ζᴸ, ζᴿ)
+    ζᴿ =  _upwind_interpolate_yᵃᶜᵃ(i, j, k, grid, v̂, scheme, scheme.vorticity_scheme, ζ₃ᶠᶠᶜ, Sζ, u, v)
+    
+    return - v̂ * ζᴿ
 end
 
 @inline function horizontal_advection_V(i, j, k, grid, scheme::VectorInvariantUpwindVorticity, u, v) 
@@ -228,10 +227,9 @@ end
     Sζ = scheme.vorticity_stencil
 
     @inbounds û  =  ℑyᵃᶠᵃ(i, j, k, grid, ℑxᶜᵃᵃ, Δy_qᶠᶜᶜ, u) / Δyᶜᶠᶜ(i, j, k, grid)
-    ζᴸ =  _left_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, scheme.vorticity_scheme, ζ₃ᶠᶠᶜ, Sζ, u, v)
-    ζᴿ = _right_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, scheme.vorticity_scheme, ζ₃ᶠᶠᶜ, Sζ, u, v)
+    ζᴿ = _upwind_interpolate_xᶜᵃᵃ(i, j, k, grid, û, scheme, scheme.vorticity_scheme, ζ₃ᶠᶠᶜ, Sζ, u, v)
 
-    return + upwind_biased_product(û, ζᴸ, ζᴿ)
+    return + û * ζᴿ
 end
 
 #####
diff --git a/src/Advection/vector_invariant_cross_upwinding.jl b/src/Advection/vector_invariant_cross_upwinding.jl
index 9a71d0070a..08300f418e 100644
--- a/src/Advection/vector_invariant_cross_upwinding.jl
+++ b/src/Advection/vector_invariant_cross_upwinding.jl
@@ -22,19 +22,15 @@ const VectorInvariantCrossVerticalUpwinding = VectorInvariant{<:Any, <:Any, <:An
 @inline function upwinded_divergence_flux_Uᶠᶜᶜ(i, j, k, grid, scheme::VectorInvariantCrossVerticalUpwinding, u, v)
     @inbounds û = u[i, j, k]
     δ_stencil = scheme.upwinding.divergence_stencil
+    δᴿ = _upwind_interpolate_xᶠᵃᵃ(i, j, k, grid, û, scheme, scheme.vertical_scheme, flux_div_xyᶜᶜᶜ, δ_stencil, u, v) 
 
-    δᴸ =  _left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, flux_div_xyᶜᶜᶜ, δ_stencil, u, v) 
-    δᴿ = _right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, flux_div_xyᶜᶜᶜ, δ_stencil, u, v) 
-
-    return upwind_biased_product(û, δᴸ, δᴿ)
+    return û * δᴿ
 end
 
 @inline function upwinded_divergence_flux_Vᶜᶠᶜ(i, j, k, grid, scheme::VectorInvariantCrossVerticalUpwinding, u, v)
     @inbounds v̂ = v[i, j, k]
     δ_stencil = scheme.upwinding.divergence_stencil
+    δᴿ = _upwind_interpolate_yᵃᶠᵃ(i, j, k, grid, v̂, scheme, scheme.vertical_scheme, flux_div_xyᶜᶜᶜ, δ_stencil, u, v) 
 
-    δᴸ =  _left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, flux_div_xyᶜᶜᶜ, δ_stencil, u, v) 
-    δᴿ = _right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, flux_div_xyᶜᶜᶜ, δ_stencil, u, v) 
-
-    return upwind_biased_product(v̂, δᴸ, δᴿ) 
+    return v̂ * δᴿ
 end
diff --git a/src/Advection/vector_invariant_self_upwinding.jl b/src/Advection/vector_invariant_self_upwinding.jl
index 2dea587dfd..129ca94df6 100644
--- a/src/Advection/vector_invariant_self_upwinding.jl
+++ b/src/Advection/vector_invariant_self_upwinding.jl
@@ -20,11 +20,10 @@ const VectorInvariantSelfVerticalUpwinding = VectorInvariant{<:Any, <:Any, <:Any
     cross_scheme = scheme.upwinding.cross_scheme
 
     @inbounds û = u[i, j, k]
-    δvˢ =    _symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, cross_scheme, δy_V, u, v) 
-    δuᴸ =  _left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, δx_U, δU_stencil, u, v) 
-    δuᴿ = _right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, δx_U, δU_stencil, u, v) 
+    δvˢ = _symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid,    scheme, cross_scheme, δy_V, u, v) 
+    δuᴿ =    _upwind_interpolate_xᶠᵃᵃ(i, j, k, grid, û, scheme, scheme.vertical_scheme, δx_U, δU_stencil, u, v) 
 
-    return upwind_biased_product(û, δuᴸ, δuᴿ) + û * δvˢ
+    return û * (δuᴿ + δvˢ)
 end
 
 @inline function upwinded_divergence_flux_Vᶜᶠᶜ(i, j, k, grid, scheme::VectorInvariantSelfVerticalUpwinding, u, v)
@@ -33,11 +32,10 @@ end
     cross_scheme = scheme.upwinding.cross_scheme
 
     @inbounds v̂ = v[i, j, k]
-    δuˢ =    _symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, cross_scheme, δx_U, u, v)
-    δvᴸ =  _left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, δy_V, δV_stencil, u, v) 
-    δvᴿ = _right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, δy_V, δV_stencil, u, v) 
+    δuˢ = _symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid,    scheme, cross_scheme, δx_U, u, v)
+    δvᴿ =    _upwind_interpolate_yᵃᶠᵃ(i, j, k, grid, v̂, scheme, scheme.vertical_scheme, δy_V, δV_stencil, u, v) 
 
-    return upwind_biased_product(v̂, δvᴸ, δvᴿ) + v̂ * δuˢ
+    return v̂ * (δuˢ + δvᴿ)
 end
 
 #####
@@ -59,34 +57,24 @@ const VectorInvariantVerticalUpwinding = VectorInvariant{<:Any, <:Any, <:Any, <:
 
 @inline function bernoulli_head_U(i, j, k, grid, scheme::VectorInvariantVerticalUpwinding, u, v)
 
-    @inbounds û = u[i, j, k]
-
     δu²_stencil  = scheme.upwinding.δu²_stencil    
     cross_scheme = scheme.upwinding.cross_scheme
 
-    δKvˢ =    _symmetric_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, cross_scheme, δx_v², u, v)
-    δKuᴸ =  _left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, δx_u², δu²_stencil, u, v)
-    δKuᴿ = _right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, δx_u², δu²_stencil, u, v)
-    
-    ∂Kᴸ = (δKuᴸ + δKvˢ) / Δxᶠᶜᶜ(i, j, k, grid)
-    ∂Kᴿ = (δKuᴿ + δKvˢ) / Δxᶠᶜᶜ(i, j, k, grid)
+    @inbounds û = u[i, j, k]
+    δKvˢ = _symmetric_interpolate_yᵃᶜᵃ(i, j, k, grid,    scheme, cross_scheme, δx_v², u, v)
+    δKuᴿ =    _upwind_interpolate_xᶠᵃᵃ(i, j, k, grid, û, scheme, scheme.vertical_scheme, δx_u², δu²_stencil, u, v)
 
-    return ifelse(û > 0, ∂Kᴸ, ∂Kᴿ)
+    return (δKuᴿ + δKvˢ) / Δxᶠᶜᶜ(i, j, k, grid)
 end
 
 @inline function bernoulli_head_V(i, j, k, grid, scheme::VectorInvariantVerticalUpwinding, u, v)
 
-    @inbounds v̂ = v[i, j, k]
-
-    δv²_stencil   = scheme.upwinding.δv²_stencil    
+    δv²_stencil  = scheme.upwinding.δv²_stencil    
     cross_scheme = scheme.upwinding.cross_scheme
 
-    δKuˢ =    _symmetric_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, cross_scheme, δy_u², u, v)
-    δKvᴸ =  _left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, δy_v², δv²_stencil, u, v) 
-    δKvᴿ = _right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, δy_v², δv²_stencil, u, v) 
+    @inbounds v̂ = v[i, j, k]
+    δKuˢ = _symmetric_interpolate_xᶜᵃᵃ(i, j, k, grid,    scheme, cross_scheme, δy_u², u, v)
+    δKvᴿ =    _upwind_interpolate_yᵃᶠᵃ(i, j, k, grid, v̂, scheme, scheme.vertical_scheme, δy_v², δv²_stencil, u, v) 
     
-    ∂Kᴸ = (δKvᴸ + δKuˢ) / Δyᶜᶠᶜ(i, j, k, grid) 
-    ∂Kᴿ = (δKvᴿ + δKuˢ) / Δyᶜᶠᶜ(i, j, k, grid)
-
-    return ifelse(v̂ > 0, ∂Kᴸ, ∂Kᴿ)
+    return (δKvᴿ + δKuˢ) / Δyᶜᶠᶜ(i, j, k, grid)
 end
diff --git a/src/Advection/vector_invariant_velocity_upwinding.jl b/src/Advection/vector_invariant_velocity_upwinding.jl
index 65b11920ec..2b99c6c693 100644
--- a/src/Advection/vector_invariant_velocity_upwinding.jl
+++ b/src/Advection/vector_invariant_velocity_upwinding.jl
@@ -11,21 +11,15 @@ const VectorInvariantVelocityVerticalUpwinding  = VectorInvariant{<:Any, <:Any,
 #####
 
 @inline function upwinded_Ax_uᶜᶜᶜ(i, j, k, grid, scheme, u) 
-    û = ℑxᶜᵃᵃ(i, j, k, grid, u)
-
-    Uᴸ =  _left_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, Ax_qᶠᶜᶜ, u)
-    Uᴿ = _right_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, Ax_qᶠᶜᶜ, u)
-
-    return ifelse(û > 0, Uᴸ, Uᴿ)
+    û  = ℑxᶜᵃᵃ(i, j, k, grid, u)
+    Uᴿ = _upwind_interpolate_xᶜᵃᵃ(i, j, k, grid, û, scheme, scheme.vertical_scheme, Ax_qᶠᶜᶜ, u)
+    return Uᴿ
 end
 
 @inline function upwinded_Ay_vᶜᶜᶜ(i, j, k, grid, scheme, v) 
-    v̂ = ℑyᵃᶜᵃ(i, j, k, grid, v)
-
-    Vᴸ =  _left_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, Ay_qᶜᶠᶜ, v)
-    Vᴿ = _right_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, Ay_qᶜᶠᶜ, v)
-
-    return ifelse(v̂ > 0, Vᴸ, Vᴿ)
+    v̂  = ℑyᵃᶜᵃ(i, j, k, grid, v)
+    Vᴿ = _upwind_interpolate_yᵃᶜᵃ(i, j, k, grid, v̂, scheme, scheme.vertical_scheme, Ay_qᶜᶠᶜ, v)
+    return Vᴿ
 end
 
 @inline reconstructed_Ax_uᶠᶠᶜ(i, j, k, grid, scheme, u) = 
@@ -57,20 +51,14 @@ end
 #####
 
 @inline function upwinded_u²ᶜᶜᶜ(i, j, k, grid, scheme, u) 
-    û = ℑxᶜᵃᵃ(i, j, k, grid, u)
-
-    Uᴸ =  _left_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, half_ϕ², u)
-    Uᴿ = _right_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, half_ϕ², u)
-
-    return ifelse(û > 0, Uᴸ, Uᴿ)
+    û  = ℑxᶜᵃᵃ(i, j, k, grid, u)
+    Uᴿ = _upwind_interpolate_xᶜᵃᵃ(i, j, k, grid, û, scheme, scheme.vertical_scheme, half_ϕ², u)
+    return Uᴿ
 end
 
 @inline function upwinded_v²ᶜᶜᶜ(i, j, k, grid, scheme, v) 
-    v̂ = ℑyᵃᶜᵃ(i, j, k, grid, v)
-
-    Vᴸ =  _left_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, half_ϕ², v)
-    Vᴿ = _right_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, half_ϕ², v)
-
+    v̂  = ℑyᵃᶜᵃ(i, j, k, grid, v)
+    Vᴿ = _upwind_interpolate_yᵃᶜᵃ(i, j, k, grid, v̂, scheme, scheme.vertical_scheme, half_ϕ², v)
     return ifelse(v̂ > 0, Vᴸ, Vᴿ)
 end
 

From 9e520beaf13b6c12fde1c78a876ee421d2b0702e Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 12 Jul 2023 13:12:58 +0000
Subject: [PATCH 398/530] bugfix

---
 src/Advection/upwind_biased_advective_fluxes.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/Advection/upwind_biased_advective_fluxes.jl b/src/Advection/upwind_biased_advective_fluxes.jl
index 003631be07..37f9842ea8 100644
--- a/src/Advection/upwind_biased_advective_fluxes.jl
+++ b/src/Advection/upwind_biased_advective_fluxes.jl
@@ -24,6 +24,7 @@ for (d, ξ) in enumerate((:x, :y, :z))
 
         @eval begin
             @inline $alt_interp(i, j, k, grid, u, args...) = $alt_interp(i, j, k, grid, sign_val(u), args...)
+            @inline $alt_interp(i, j, k, grid, ::Val{0},  args...) =  $alt_left_interp(i, j, k, grid, args...)
             @inline $alt_interp(i, j, k, grid, ::Val{1},  args...) =  $alt_left_interp(i, j, k, grid, args...)
             @inline $alt_interp(i, j, k, grid, ::Val{-1}, args...) = $alt_right_interp(i, j, k, grid, args...)
         end

From 85d44f768b140a57669e047febbed415d4275005 Mon Sep 17 00:00:00 2001
From: simone-silvestri <silvestri.simone0@gmail.com>
Date: Thu, 13 Jul 2023 10:21:18 -0400
Subject: [PATCH 399/530] bugfix

---
 src/Advection/upwind_biased_advective_fluxes.jl | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/Advection/upwind_biased_advective_fluxes.jl b/src/Advection/upwind_biased_advective_fluxes.jl
index 37f9842ea8..72a9c9fa87 100644
--- a/src/Advection/upwind_biased_advective_fluxes.jl
+++ b/src/Advection/upwind_biased_advective_fluxes.jl
@@ -9,7 +9,10 @@ const UpwindScheme = AbstractUpwindBiasedAdvectionScheme
 
 @inline upwind_biased_product(ũ, ψᴸ, ψᴿ) = ((ũ + abs(ũ)) * ψᴸ + (ũ - abs(ũ)) * ψᴿ) / 2
 
-@inline sign_val(u) = Val(Int(sign(u)))
+struct LeftUpwind end
+struct RightUpwind end
+
+@inline sign_val(u) = ifelse(u > 0, LeftUpwind(), RightUpwind())
 
 # Upwind interpolate -> choose _left_biased if u > 0 and _right_biased if u < 0
 for (d, ξ) in enumerate((:x, :y, :z))
@@ -24,9 +27,8 @@ for (d, ξ) in enumerate((:x, :y, :z))
 
         @eval begin
             @inline $alt_interp(i, j, k, grid, u, args...) = $alt_interp(i, j, k, grid, sign_val(u), args...)
-            @inline $alt_interp(i, j, k, grid, ::Val{0},  args...) =  $alt_left_interp(i, j, k, grid, args...)
-            @inline $alt_interp(i, j, k, grid, ::Val{1},  args...) =  $alt_left_interp(i, j, k, grid, args...)
-            @inline $alt_interp(i, j, k, grid, ::Val{-1}, args...) = $alt_right_interp(i, j, k, grid, args...)
+            @inline $alt_interp(i, j, k, grid, ::LeftUpwind,  args...) =  $alt_left_interp(i, j, k, grid, args...)
+            @inline $alt_interp(i, j, k, grid, ::RightUpwind, args...) = $alt_right_interp(i, j, k, grid, args...)
         end
     end
 end

From fdc0aea0c97368d6ff8f9d1d8b321576579f5c0e Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 13 Jul 2023 12:24:08 -0400
Subject: [PATCH 400/530] back for testing

---
 .../upwind_biased_advective_fluxes.jl         | 102 ++++++++----------
 src/Advection/vector_invariant_advection.jl   |  12 ++-
 .../vector_invariant_cross_upwinding.jl       |  12 ++-
 .../vector_invariant_self_upwinding.jl        |  42 +++++---
 .../vector_invariant_velocity_upwinding.jl    |  34 ++++--
 5 files changed, 111 insertions(+), 91 deletions(-)

diff --git a/src/Advection/upwind_biased_advective_fluxes.jl b/src/Advection/upwind_biased_advective_fluxes.jl
index 37f9842ea8..701c266519 100644
--- a/src/Advection/upwind_biased_advective_fluxes.jl
+++ b/src/Advection/upwind_biased_advective_fluxes.jl
@@ -9,28 +9,6 @@ const UpwindScheme = AbstractUpwindBiasedAdvectionScheme
 
 @inline upwind_biased_product(ũ, ψᴸ, ψᴿ) = ((ũ + abs(ũ)) * ψᴸ + (ũ - abs(ũ)) * ψᴿ) / 2
 
-@inline sign_val(u) = Val(Int(sign(u)))
-
-# Upwind interpolate -> choose _left_biased if u > 0 and _right_biased if u < 0
-for (d, ξ) in enumerate((:x, :y, :z))
-    code = [:ᵃ, :ᵃ, :ᵃ]
-
-    for loc in (:ᶜ, :ᶠ)
-        code[d] = loc
-        second_order_interp = Symbol(:ℑ, ξ, code...)
-        alt_interp       = Symbol(:_upwind_interpolate_, ξ, code...)
-        alt_left_interp  = Symbol(:_left_biased_interpolate_, ξ, code...)
-        alt_right_interp = Symbol(:_right_biased_interpolate_, ξ, code...)
-
-        @eval begin
-            @inline $alt_interp(i, j, k, grid, u, args...) = $alt_interp(i, j, k, grid, sign_val(u), args...)
-            @inline $alt_interp(i, j, k, grid, ::Val{0},  args...) =  $alt_left_interp(i, j, k, grid, args...)
-            @inline $alt_interp(i, j, k, grid, ::Val{1},  args...) =  $alt_left_interp(i, j, k, grid, args...)
-            @inline $alt_interp(i, j, k, grid, ::Val{-1}, args...) = $alt_right_interp(i, j, k, grid, args...)
-        end
-    end
-end
-
 #####
 ##### Momentum advection operators
 #####
@@ -39,74 +17,83 @@ end
 
 @inline function advective_momentum_flux_Uu(i, j, k, grid, scheme::UpwindScheme, U, u)
 
-    ũ  = _symmetric_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, Ax_qᶠᶜᶜ, U)
-    uᴿ =    _upwind_interpolate_xᶜᵃᵃ(i, j, k, grid, ũ, scheme, u)
+    ũ  =    _symmetric_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, Ax_qᶠᶜᶜ, U)
+    uᴸ =  _left_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, u)
+    uᴿ = _right_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, u)
 
-    return ũ * uᴿ
+    return upwind_biased_product(ũ, uᴸ, uᴿ)
 end
 
 @inline function advective_momentum_flux_Vu(i, j, k, grid, scheme::UpwindScheme, V, u)
 
-    ṽ  = _symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, Ay_qᶜᶠᶜ, V)
-    uᴿ =    _upwind_interpolate_yᵃᶠᵃ(i, j, k, grid, ṽ, scheme, u)
+    ṽ  =    _symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, Ay_qᶜᶠᶜ, V)
+    uᴸ =  _left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, u)
+    uᴿ = _right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, u)
 
-    return ṽ * uᴿ
+    return upwind_biased_product(ṽ, uᴸ, uᴿ)
 end
 
 @inline function advective_momentum_flux_Wu(i, j, k, grid, scheme::UpwindScheme, W, u)
 
-    w̃  = _symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, Az_qᶜᶜᶠ, W)
-    uᴿ =    _upwind_interpolate_zᵃᵃᶠ(i, j, k, grid, w̃, scheme, u)
+    w̃  =    _symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, Az_qᶜᶜᶠ, W)
+    uᴸ =  _left_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, u)
+    uᴿ = _right_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, u)
 
-    return w̃ * uᴿ
+    return upwind_biased_product(w̃, uᴸ, uᴿ)
 end
 
 @inline function advective_momentum_flux_Uv(i, j, k, grid, scheme::UpwindScheme, U, v)
 
-    ũ  = _symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, Ax_qᶠᶜᶜ, U)
-    vᴿ =    _upwind_interpolate_xᶠᵃᵃ(i, j, k, grid, ũ, scheme, v)
+    ũ  =    _symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, Ax_qᶠᶜᶜ, U)
+    vᴸ =  _left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, v)
+    vᴿ = _right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, v)
  
-    return ũ * vᴿ
+    return upwind_biased_product(ũ, vᴸ, vᴿ)
 end
 
 @inline function advective_momentum_flux_Vv(i, j, k, grid, scheme::UpwindScheme, V, v)
 
-    ṽ  = _symmetric_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, Ay_qᶜᶠᶜ, V)
-    vᴿ =    _upwind_interpolate_yᵃᶜᵃ(i, j, k, grid, ṽ, scheme, v)
+    ṽ  =    _symmetric_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, Ay_qᶜᶠᶜ, V)
+    vᴸ =  _left_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, v)
+    vᴿ = _right_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, v)
 
-    return ṽ * vᴿ
+    return upwind_biased_product(ṽ, vᴸ, vᴿ)
 end
 
 @inline function advective_momentum_flux_Wv(i, j, k, grid, scheme::UpwindScheme, W, v)
 
-    w̃  = _symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, Az_qᶜᶜᶠ, W)
-    vᴿ =    _upwind_interpolate_zᵃᵃᶠ(i, j, k, grid, w̃, scheme, v)
+    w̃  =    _symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, Az_qᶜᶜᶠ, W)
+    vᴸ =  _left_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, v)
+    vᴿ = _right_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, v)
 
-    return w̃ * vᴿ
+    return upwind_biased_product(w̃, vᴸ, vᴿ)
 end
 
 @inline function advective_momentum_flux_Uw(i, j, k, grid, scheme::UpwindScheme, U, w)
 
-    ũ  = _symmetric_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, Ax_qᶠᶜᶜ, U)
-    wᴿ =    _upwind_interpolate_xᶠᵃᵃ(i, j, k, grid, ũ, scheme, w)
+    ũ  =    _symmetric_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, Ax_qᶠᶜᶜ, U)
+    wᴸ =  _left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, w)
+    wᴿ = _right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, w)
 
-    return ũ * wᴿ
+    return upwind_biased_product(ũ, wᴸ, wᴿ)
 end
 
 @inline function advective_momentum_flux_Vw(i, j, k, grid, scheme::UpwindScheme, V, w)
 
-    ṽ  = _symmetric_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, Ay_qᶜᶠᶜ, V)
-    wᴿ =    _upwind_interpolate_yᵃᶠᵃ(i, j, k, grid, ṽ, scheme, w)
+    ṽ  =    _symmetric_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, Ay_qᶜᶠᶜ, V)
+    wᴸ =  _left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, w)
+    wᴿ = _right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, w)
 
-    return ṽ * wᴿ
+    return upwind_biased_product(ṽ, wᴸ, wᴿ)
 end
 
 @inline function advective_momentum_flux_Ww(i, j, k, grid, scheme::UpwindScheme, W, w)
 
-    w̃  = _symmetric_interpolate_zᵃᵃᶜ(i, j, k, grid, scheme, Az_qᶜᶜᶠ, W)
-    wᴿ =    _upwind_interpolate_zᵃᵃᶜ(i, j, k, grid, w̃, scheme, w)
+    w̃  =    _symmetric_interpolate_zᵃᵃᶜ(i, j, k, grid, scheme, Az_qᶜᶜᶠ, W)
+    wᴸ =  _left_biased_interpolate_zᵃᵃᶜ(i, j, k, grid, scheme, w)
+    wᴿ = _right_biased_interpolate_zᵃᵃᶜ(i, j, k, grid, scheme, w)
 
-    return w̃ * wᴿ
+    return upwind_biased_product(w̃, wᴸ, wᴿ)
 end
 
 #####
@@ -116,23 +103,26 @@ end
 @inline function advective_tracer_flux_x(i, j, k, grid, scheme::UpwindScheme, U, c) 
 
     @inbounds ũ = U[i, j, k]
-    cᴿ =_upwind_interpolate_xᶠᵃᵃ(i, j, k, grid, ũ, scheme, c)
+    cᴸ =  _left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, c)
+    cᴿ = _right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, c)
 
-    return Axᶠᶜᶜ(i, j, k, grid) * ũ * cᴿ
+    return Axᶠᶜᶜ(i, j, k, grid) * upwind_biased_product(ũ, cᴸ, cᴿ)
 end
 
 @inline function advective_tracer_flux_y(i, j, k, grid, scheme::UpwindScheme, V, c)
 
     @inbounds ṽ = V[i, j, k]
-    cᴿ =_upwind_interpolate_yᵃᶠᵃ(i, j, k, grid, ṽ, scheme, c)
+    cᴸ =  _left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, c)
+    cᴿ = _right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, c)
 
-    return Ayᶜᶠᶜ(i, j, k, grid) * ṽ * cᴿ
+    return Ayᶜᶠᶜ(i, j, k, grid) * upwind_biased_product(ṽ, cᴸ, cᴿ)
 end
 
 @inline function advective_tracer_flux_z(i, j, k, grid, scheme::UpwindScheme, W, c)
 
     @inbounds w̃ = W[i, j, k]
-    cᴿ =_upwind_interpolate_zᵃᵃᶠ(i, j, k, grid, w̃, scheme, c)
+    cᴸ =  _left_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, c)
+    cᴿ = _right_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, c)
 
-    return Azᶜᶜᶠ(i, j, k, grid) * w̃ * cᴿ
-end
\ No newline at end of file
+    return Azᶜᶜᶠ(i, j, k, grid) * upwind_biased_product(w̃, cᴸ, cᴿ) 
+end
diff --git a/src/Advection/vector_invariant_advection.jl b/src/Advection/vector_invariant_advection.jl
index 04667674d9..731cba588a 100644
--- a/src/Advection/vector_invariant_advection.jl
+++ b/src/Advection/vector_invariant_advection.jl
@@ -217,9 +217,10 @@ end
     Sζ = scheme.vorticity_stencil
 
     @inbounds v̂ = ℑxᶠᵃᵃ(i, j, k, grid, ℑyᵃᶜᵃ, Δx_qᶜᶠᶜ, v) / Δxᶠᶜᶜ(i, j, k, grid) 
-    ζᴿ =  _upwind_interpolate_yᵃᶜᵃ(i, j, k, grid, v̂, scheme, scheme.vorticity_scheme, ζ₃ᶠᶠᶜ, Sζ, u, v)
-    
-    return - v̂ * ζᴿ
+    ζᴸ =  _left_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, scheme.vorticity_scheme, ζ₃ᶠᶠᶜ, Sζ, u, v)
+    ζᴿ = _right_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, scheme.vorticity_scheme, ζ₃ᶠᶠᶜ, Sζ, u, v)
+
+    return - upwind_biased_product(v̂, ζᴸ, ζᴿ)
 end
 
 @inline function horizontal_advection_V(i, j, k, grid, scheme::VectorInvariantUpwindVorticity, u, v) 
@@ -227,9 +228,10 @@ end
     Sζ = scheme.vorticity_stencil
 
     @inbounds û  =  ℑyᵃᶠᵃ(i, j, k, grid, ℑxᶜᵃᵃ, Δy_qᶠᶜᶜ, u) / Δyᶜᶠᶜ(i, j, k, grid)
-    ζᴿ = _upwind_interpolate_xᶜᵃᵃ(i, j, k, grid, û, scheme, scheme.vorticity_scheme, ζ₃ᶠᶠᶜ, Sζ, u, v)
+    ζᴸ =  _left_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, scheme.vorticity_scheme, ζ₃ᶠᶠᶜ, Sζ, u, v)
+    ζᴿ = _right_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, scheme.vorticity_scheme, ζ₃ᶠᶠᶜ, Sζ, u, v)
 
-    return + û * ζᴿ
+    return + upwind_biased_product(û, ζᴸ, ζᴿ)
 end
 
 #####
diff --git a/src/Advection/vector_invariant_cross_upwinding.jl b/src/Advection/vector_invariant_cross_upwinding.jl
index 08300f418e..9a71d0070a 100644
--- a/src/Advection/vector_invariant_cross_upwinding.jl
+++ b/src/Advection/vector_invariant_cross_upwinding.jl
@@ -22,15 +22,19 @@ const VectorInvariantCrossVerticalUpwinding = VectorInvariant{<:Any, <:Any, <:An
 @inline function upwinded_divergence_flux_Uᶠᶜᶜ(i, j, k, grid, scheme::VectorInvariantCrossVerticalUpwinding, u, v)
     @inbounds û = u[i, j, k]
     δ_stencil = scheme.upwinding.divergence_stencil
-    δᴿ = _upwind_interpolate_xᶠᵃᵃ(i, j, k, grid, û, scheme, scheme.vertical_scheme, flux_div_xyᶜᶜᶜ, δ_stencil, u, v) 
 
-    return û * δᴿ
+    δᴸ =  _left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, flux_div_xyᶜᶜᶜ, δ_stencil, u, v) 
+    δᴿ = _right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, flux_div_xyᶜᶜᶜ, δ_stencil, u, v) 
+
+    return upwind_biased_product(û, δᴸ, δᴿ)
 end
 
 @inline function upwinded_divergence_flux_Vᶜᶠᶜ(i, j, k, grid, scheme::VectorInvariantCrossVerticalUpwinding, u, v)
     @inbounds v̂ = v[i, j, k]
     δ_stencil = scheme.upwinding.divergence_stencil
-    δᴿ = _upwind_interpolate_yᵃᶠᵃ(i, j, k, grid, v̂, scheme, scheme.vertical_scheme, flux_div_xyᶜᶜᶜ, δ_stencil, u, v) 
 
-    return v̂ * δᴿ
+    δᴸ =  _left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, flux_div_xyᶜᶜᶜ, δ_stencil, u, v) 
+    δᴿ = _right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, flux_div_xyᶜᶜᶜ, δ_stencil, u, v) 
+
+    return upwind_biased_product(v̂, δᴸ, δᴿ) 
 end
diff --git a/src/Advection/vector_invariant_self_upwinding.jl b/src/Advection/vector_invariant_self_upwinding.jl
index 129ca94df6..2dea587dfd 100644
--- a/src/Advection/vector_invariant_self_upwinding.jl
+++ b/src/Advection/vector_invariant_self_upwinding.jl
@@ -20,10 +20,11 @@ const VectorInvariantSelfVerticalUpwinding = VectorInvariant{<:Any, <:Any, <:Any
     cross_scheme = scheme.upwinding.cross_scheme
 
     @inbounds û = u[i, j, k]
-    δvˢ = _symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid,    scheme, cross_scheme, δy_V, u, v) 
-    δuᴿ =    _upwind_interpolate_xᶠᵃᵃ(i, j, k, grid, û, scheme, scheme.vertical_scheme, δx_U, δU_stencil, u, v) 
+    δvˢ =    _symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, cross_scheme, δy_V, u, v) 
+    δuᴸ =  _left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, δx_U, δU_stencil, u, v) 
+    δuᴿ = _right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, δx_U, δU_stencil, u, v) 
 
-    return û * (δuᴿ + δvˢ)
+    return upwind_biased_product(û, δuᴸ, δuᴿ) + û * δvˢ
 end
 
 @inline function upwinded_divergence_flux_Vᶜᶠᶜ(i, j, k, grid, scheme::VectorInvariantSelfVerticalUpwinding, u, v)
@@ -32,10 +33,11 @@ end
     cross_scheme = scheme.upwinding.cross_scheme
 
     @inbounds v̂ = v[i, j, k]
-    δuˢ = _symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid,    scheme, cross_scheme, δx_U, u, v)
-    δvᴿ =    _upwind_interpolate_yᵃᶠᵃ(i, j, k, grid, v̂, scheme, scheme.vertical_scheme, δy_V, δV_stencil, u, v) 
+    δuˢ =    _symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, cross_scheme, δx_U, u, v)
+    δvᴸ =  _left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, δy_V, δV_stencil, u, v) 
+    δvᴿ = _right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, δy_V, δV_stencil, u, v) 
 
-    return v̂ * (δuˢ + δvᴿ)
+    return upwind_biased_product(v̂, δvᴸ, δvᴿ) + v̂ * δuˢ
 end
 
 #####
@@ -57,24 +59,34 @@ const VectorInvariantVerticalUpwinding = VectorInvariant{<:Any, <:Any, <:Any, <:
 
 @inline function bernoulli_head_U(i, j, k, grid, scheme::VectorInvariantVerticalUpwinding, u, v)
 
+    @inbounds û = u[i, j, k]
+
     δu²_stencil  = scheme.upwinding.δu²_stencil    
     cross_scheme = scheme.upwinding.cross_scheme
 
-    @inbounds û = u[i, j, k]
-    δKvˢ = _symmetric_interpolate_yᵃᶜᵃ(i, j, k, grid,    scheme, cross_scheme, δx_v², u, v)
-    δKuᴿ =    _upwind_interpolate_xᶠᵃᵃ(i, j, k, grid, û, scheme, scheme.vertical_scheme, δx_u², δu²_stencil, u, v)
+    δKvˢ =    _symmetric_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, cross_scheme, δx_v², u, v)
+    δKuᴸ =  _left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, δx_u², δu²_stencil, u, v)
+    δKuᴿ = _right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, δx_u², δu²_stencil, u, v)
+    
+    ∂Kᴸ = (δKuᴸ + δKvˢ) / Δxᶠᶜᶜ(i, j, k, grid)
+    ∂Kᴿ = (δKuᴿ + δKvˢ) / Δxᶠᶜᶜ(i, j, k, grid)
 
-    return (δKuᴿ + δKvˢ) / Δxᶠᶜᶜ(i, j, k, grid)
+    return ifelse(û > 0, ∂Kᴸ, ∂Kᴿ)
 end
 
 @inline function bernoulli_head_V(i, j, k, grid, scheme::VectorInvariantVerticalUpwinding, u, v)
 
-    δv²_stencil  = scheme.upwinding.δv²_stencil    
+    @inbounds v̂ = v[i, j, k]
+
+    δv²_stencil   = scheme.upwinding.δv²_stencil    
     cross_scheme = scheme.upwinding.cross_scheme
 
-    @inbounds v̂ = v[i, j, k]
-    δKuˢ = _symmetric_interpolate_xᶜᵃᵃ(i, j, k, grid,    scheme, cross_scheme, δy_u², u, v)
-    δKvᴿ =    _upwind_interpolate_yᵃᶠᵃ(i, j, k, grid, v̂, scheme, scheme.vertical_scheme, δy_v², δv²_stencil, u, v) 
+    δKuˢ =    _symmetric_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, cross_scheme, δy_u², u, v)
+    δKvᴸ =  _left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, δy_v², δv²_stencil, u, v) 
+    δKvᴿ = _right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, δy_v², δv²_stencil, u, v) 
     
-    return (δKvᴿ + δKuˢ) / Δyᶜᶠᶜ(i, j, k, grid)
+    ∂Kᴸ = (δKvᴸ + δKuˢ) / Δyᶜᶠᶜ(i, j, k, grid) 
+    ∂Kᴿ = (δKvᴿ + δKuˢ) / Δyᶜᶠᶜ(i, j, k, grid)
+
+    return ifelse(v̂ > 0, ∂Kᴸ, ∂Kᴿ)
 end
diff --git a/src/Advection/vector_invariant_velocity_upwinding.jl b/src/Advection/vector_invariant_velocity_upwinding.jl
index 2b99c6c693..65b11920ec 100644
--- a/src/Advection/vector_invariant_velocity_upwinding.jl
+++ b/src/Advection/vector_invariant_velocity_upwinding.jl
@@ -11,15 +11,21 @@ const VectorInvariantVelocityVerticalUpwinding  = VectorInvariant{<:Any, <:Any,
 #####
 
 @inline function upwinded_Ax_uᶜᶜᶜ(i, j, k, grid, scheme, u) 
-    û  = ℑxᶜᵃᵃ(i, j, k, grid, u)
-    Uᴿ = _upwind_interpolate_xᶜᵃᵃ(i, j, k, grid, û, scheme, scheme.vertical_scheme, Ax_qᶠᶜᶜ, u)
-    return Uᴿ
+    û = ℑxᶜᵃᵃ(i, j, k, grid, u)
+
+    Uᴸ =  _left_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, Ax_qᶠᶜᶜ, u)
+    Uᴿ = _right_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, Ax_qᶠᶜᶜ, u)
+
+    return ifelse(û > 0, Uᴸ, Uᴿ)
 end
 
 @inline function upwinded_Ay_vᶜᶜᶜ(i, j, k, grid, scheme, v) 
-    v̂  = ℑyᵃᶜᵃ(i, j, k, grid, v)
-    Vᴿ = _upwind_interpolate_yᵃᶜᵃ(i, j, k, grid, v̂, scheme, scheme.vertical_scheme, Ay_qᶜᶠᶜ, v)
-    return Vᴿ
+    v̂ = ℑyᵃᶜᵃ(i, j, k, grid, v)
+
+    Vᴸ =  _left_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, Ay_qᶜᶠᶜ, v)
+    Vᴿ = _right_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, Ay_qᶜᶠᶜ, v)
+
+    return ifelse(v̂ > 0, Vᴸ, Vᴿ)
 end
 
 @inline reconstructed_Ax_uᶠᶠᶜ(i, j, k, grid, scheme, u) = 
@@ -51,14 +57,20 @@ end
 #####
 
 @inline function upwinded_u²ᶜᶜᶜ(i, j, k, grid, scheme, u) 
-    û  = ℑxᶜᵃᵃ(i, j, k, grid, u)
-    Uᴿ = _upwind_interpolate_xᶜᵃᵃ(i, j, k, grid, û, scheme, scheme.vertical_scheme, half_ϕ², u)
-    return Uᴿ
+    û = ℑxᶜᵃᵃ(i, j, k, grid, u)
+
+    Uᴸ =  _left_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, half_ϕ², u)
+    Uᴿ = _right_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, half_ϕ², u)
+
+    return ifelse(û > 0, Uᴸ, Uᴿ)
 end
 
 @inline function upwinded_v²ᶜᶜᶜ(i, j, k, grid, scheme, v) 
-    v̂  = ℑyᵃᶜᵃ(i, j, k, grid, v)
-    Vᴿ = _upwind_interpolate_yᵃᶜᵃ(i, j, k, grid, v̂, scheme, scheme.vertical_scheme, half_ϕ², v)
+    v̂ = ℑyᵃᶜᵃ(i, j, k, grid, v)
+
+    Vᴸ =  _left_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, half_ϕ², v)
+    Vᴿ = _right_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, half_ϕ², v)
+
     return ifelse(v̂ > 0, Vᴸ, Vᴿ)
 end
 

From 6885c8814ac1c9130e601501d4e8bf34ce3dc084 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 13 Jul 2023 18:35:45 -0400
Subject: [PATCH 401/530] update manifest

---
 Manifest.toml | 80 +++++++++++++++++++++++++++++++++------------------
 1 file changed, 52 insertions(+), 28 deletions(-)

diff --git a/Manifest.toml b/Manifest.toml
index bf7e637d5a..aafa852c6f 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -6,9 +6,9 @@ project_hash = "c19e7e0747cdfab35ec6d8d8bd4d66290cc1b731"
 
 [[deps.AbstractFFTs]]
 deps = ["ChainRulesCore", "LinearAlgebra"]
-git-tree-sha1 = "8bc0aaec0ca548eb6cf5f0d7d16351650c1ee956"
+git-tree-sha1 = "cad4c758c0038eea30394b1b671526921ca85b21"
 uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
-version = "1.3.2"
+version = "1.4.0"
 
 [[deps.Adapt]]
 deps = ["LinearAlgebra", "Requires"]
@@ -44,6 +44,12 @@ version = "0.4.2"
 [[deps.Base64]]
 uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
 
+[[deps.Bzip2_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
+git-tree-sha1 = "19a35467a82e236ff51bc17a3a44b69ef35185a2"
+uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0"
+version = "1.0.8+0"
+
 [[deps.CEnum]]
 git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90"
 uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
@@ -86,10 +92,10 @@ uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 version = "1.16.0"
 
 [[deps.ChangesOfVariables]]
-deps = ["LinearAlgebra", "Test"]
-git-tree-sha1 = "f84967c4497e0e1955f9a582c232b02847c5f589"
+deps = ["InverseFunctions", "LinearAlgebra", "Test"]
+git-tree-sha1 = "2fba81a302a7be671aefe194f0525ef231104e7f"
 uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
-version = "0.1.7"
+version = "0.1.8"
 
 [[deps.CommonDataModel]]
 deps = ["CFTime", "DataStructures", "Dates", "Preferences", "Printf"]
@@ -199,9 +205,9 @@ version = "0.1.5"
 
 [[deps.GPUCompiler]]
 deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"]
-git-tree-sha1 = "69a9aa4346bca723e46769ff6b6277e597c969b1"
+git-tree-sha1 = "72b2e3c2ba583d1a7aa35129e56cf92e07c083e3"
 uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
-version = "0.21.2"
+version = "0.21.4"
 
 [[deps.Glob]]
 git-tree-sha1 = "97285bbd5230dd766e9ef6749b80fc617126d496"
@@ -209,10 +215,10 @@ uuid = "c27321d9-0574-5035-807b-f59d2c89b15c"
 version = "1.3.1"
 
 [[deps.HDF5_jll]]
-deps = ["Artifacts", "JLLWrappers", "LibCURL_jll", "Libdl", "OpenSSL_jll", "Pkg", "Zlib_jll"]
-git-tree-sha1 = "4cc2bb72df6ff40b055295fdef6d92955f9dede8"
+deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "LLVMOpenMP_jll", "LazyArtifacts", "LibCURL_jll", "Libdl", "MPICH_jll", "MPIPreferences", "MPItrampoline_jll", "MicrosoftMPI_jll", "OpenMPI_jll", "OpenSSL_jll", "TOML", "Zlib_jll", "libaec_jll"]
+git-tree-sha1 = "3b20c3ce9c14aedd0adca2bc8c882927844bd53d"
 uuid = "0234f1f7-429e-5d53-9886-15a909be8d59"
-version = "1.12.2+2"
+version = "1.14.0+0"
 
 [[deps.IfElse]]
 git-tree-sha1 = "debdd00ffef04665ccbb3e150747a77560e8fad1"
@@ -237,9 +243,9 @@ uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 
 [[deps.InverseFunctions]]
 deps = ["Test"]
-git-tree-sha1 = "6667aadd1cdee2c6cd068128b3d226ebc4fb0c67"
+git-tree-sha1 = "eabe3125edba5c9c10b60a160b1779a000dc8b29"
 uuid = "3587e190-3f89-42d0-90ee-14403ec27112"
-version = "0.1.9"
+version = "0.1.11"
 
 [[deps.IrrationalConstants]]
 git-tree-sha1 = "630b497eafcc20001bba38a4651b327dcfc491d2"
@@ -276,24 +282,30 @@ uuid = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
 version = "1.13.1"
 
 [[deps.KernelAbstractions]]
-deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"]
-git-tree-sha1 = "ada2d5824ce593ff117e714d29f8e890419e8b78"
+deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "Requires", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"]
+git-tree-sha1 = "a8ed43278a827de44cef19b3f295d9db9c278f4d"
 repo-rev = "main"
 repo-url = "https://github.com/simone-silvestri/KernelAbstractions.jl"
 uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
-version = "0.9.4"
+version = "0.9.7"
 
 [[deps.LLVM]]
 deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
-git-tree-sha1 = "7d5788011dd273788146d40eb5b1fbdc199d0296"
+git-tree-sha1 = "8695a49bfe05a2dc0feeefd06b4ca6361a018729"
 uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
-version = "6.0.1"
+version = "6.1.0"
 
 [[deps.LLVMExtra_jll]]
 deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
-git-tree-sha1 = "1222116d7313cdefecf3d45a2bc1a89c4e7c9217"
+git-tree-sha1 = "c35203c1e1002747da220ffc3c0762ce7754b08c"
 uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
-version = "0.0.22+0"
+version = "0.0.23+0"
+
+[[deps.LLVMOpenMP_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
+git-tree-sha1 = "f689897ccbe049adb19a065c495e75f372ecd42b"
+uuid = "1d63c593-3942-5779-bab2-d838dc0a180e"
+version = "15.0.4+0"
 
 [[deps.LazyArtifacts]]
 deps = ["Artifacts", "Pkg"]
@@ -405,10 +417,10 @@ uuid = "85f8d34a-cbdd-5861-8df4-14fed0d494ab"
 version = "0.12.17"
 
 [[deps.NetCDF_jll]]
-deps = ["Artifacts", "HDF5_jll", "JLLWrappers", "LibCURL_jll", "Libdl", "Pkg", "XML2_jll", "Zlib_jll"]
-git-tree-sha1 = "072f8371f74c3b9e1b26679de7fbf059d45ea221"
+deps = ["Artifacts", "Bzip2_jll", "HDF5_jll", "JLLWrappers", "LibCURL_jll", "Libdl", "XML2_jll", "Zlib_jll", "Zstd_jll"]
+git-tree-sha1 = "10c612c81eaffdd6b7c28a45a554cdd9d2f40ff1"
 uuid = "7243133f-43d8-5620-bbf4-c2c921802cf3"
-version = "400.902.5+1"
+version = "400.902.208+0"
 
 [[deps.NetworkOptions]]
 uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
@@ -438,9 +450,9 @@ version = "4.1.5+0"
 
 [[deps.OpenSSL_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl"]
-git-tree-sha1 = "1aa4b74f80b01c6bc2b89992b861b5f210e665b5"
+git-tree-sha1 = "cae3153c7f6cf3f069a853883fd1919a6e5bab5b"
 uuid = "458c3c95-2e84-50aa-8efc-19380b2a3a95"
-version = "1.1.21+0"
+version = "3.0.9+0"
 
 [[deps.OpenSpecFun_jll]]
 deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"]
@@ -610,14 +622,14 @@ version = "1.4.0"
 
 [[deps.StaticArrays]]
 deps = ["LinearAlgebra", "Random", "StaticArraysCore", "Statistics"]
-git-tree-sha1 = "832afbae2a45b4ae7e831f86965469a24d1d8a83"
+git-tree-sha1 = "fffc14c695c17bfdbfa92a2a01836cdc542a1e46"
 uuid = "90137ffa-7385-5640-81b9-e52037218182"
-version = "1.5.26"
+version = "1.6.1"
 
 [[deps.StaticArraysCore]]
-git-tree-sha1 = "6b7ba252635a5eff6a0b0664a41ee140a1c9e72a"
+git-tree-sha1 = "1d5708d926c76a505052d0d24a846d5da08bc3a4"
 uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
-version = "1.4.0"
+version = "1.4.1"
 
 [[deps.StaticPermutations]]
 git-tree-sha1 = "193c3daa18ff3e55c1dae66acb6a762c4a3bdb0b"
@@ -739,6 +751,18 @@ deps = ["Libdl"]
 uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
 version = "1.2.12+3"
 
+[[deps.Zstd_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl"]
+git-tree-sha1 = "49ce682769cd5de6c72dcf1b94ed7790cd08974c"
+uuid = "3161d3a3-bdf6-5164-811a-617609db77b4"
+version = "1.5.5+0"
+
+[[deps.libaec_jll]]
+deps = ["Artifacts", "JLLWrappers", "Libdl"]
+git-tree-sha1 = "eddd19a8dea6b139ea97bdc8a0e2667d4b661720"
+uuid = "477f73a3-ac25-53e9-8cc3-50b2fa2566f0"
+version = "1.0.6+1"
+
 [[deps.libblastrampoline_jll]]
 deps = ["Artifacts", "Libdl", "OpenBLAS_jll"]
 uuid = "8e850b90-86db-534c-a0d3-1478176c7d93"

From 6955b9264cc9c4593feaa00e97911083e0e37fd9 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 14 Jul 2023 09:29:04 -0400
Subject: [PATCH 402/530] more options

---
 src/Advection/vector_invariant_advection.jl   | 38 +++++++++++--------
 .../vector_invariant_cross_upwinding.jl       |  2 +-
 .../vector_invariant_self_upwinding.jl        | 14 +++----
 .../vector_invariant_velocity_upwinding.jl    | 16 ++++----
 4 files changed, 39 insertions(+), 31 deletions(-)

diff --git a/src/Advection/vector_invariant_advection.jl b/src/Advection/vector_invariant_advection.jl
index 731cba588a..6fea894e83 100644
--- a/src/Advection/vector_invariant_advection.jl
+++ b/src/Advection/vector_invariant_advection.jl
@@ -7,21 +7,23 @@ struct EnstrophyConservingScheme{FT} <: AbstractAdvectionScheme{1, FT} end
 EnergyConservingScheme(FT::DataType = Float64)    = EnergyConservingScheme{FT}()
 EnstrophyConservingScheme(FT::DataType = Float64) = EnstrophyConservingScheme{FT}()
 
-struct VectorInvariant{N, FT, Z, ZS, V, D, M} <: AbstractAdvectionScheme{N, FT}
-    vorticity_scheme  :: Z  # reconstruction scheme for vorticity flux
-    vorticity_stencil :: ZS # stencil used for assessing vorticity smoothness
-    vertical_scheme   :: V  # stencil used for assessing divergence smoothness
-    upwinding         :: D  # treatment of upwinding for divergence flux and kinetic energy gradient
+struct VectorInvariant{N, FT, Z, ZS, V, K, D, M} <: AbstractAdvectionScheme{N, FT}
+    vorticity_scheme   :: Z  # reconstruction scheme for vorticity flux
+    vorticity_stencil  :: ZS # stencil used for assessing vorticity smoothness
+    vertical_scheme    :: V  # stencil used for assessing divergence smoothness
+    ke_gradient_scheme :: K  # reconstruction scheme for the kinetic energy gradient
+    upwinding          :: D  # treatment of upwinding for divergence flux and kinetic energy gradient
 
     VectorInvariant{N, FT, M}(vorticity_scheme::Z, vorticity_stencil::ZS, vertical_scheme::V, 
-                              upwinding::D) where {N, FT, Z, ZS, V, D, M} =
-        new{N, FT, Z, ZS, V, D, M}(vorticity_scheme, vorticity_stencil, vertical_scheme, upwinding)
+                              ke_gradient_scheme::K, upwinding::D) where {N, FT, Z, ZS, V, K, D, M} =
+        new{N, FT, Z, ZS, V, D, M}(vorticity_scheme, vorticity_stencil, vertical_scheme, ke_gradient_scheme, upwinding)
 end
 
 """
     VectorInvariant(; vorticity_scheme::AbstractAdvectionScheme{N, FT} = EnstrophyConservingScheme(), 
                       vorticity_stencil  = VelocityStencil(),
-                      vertical_scheme    = EnergyConservingScheme()) where {N, FT}
+                      vertical_scheme    = EnergyConservingScheme(),
+                      ke_gradient_scheme = vertical_scheme) where {N, FT}
                
 Construct a vector invariant momentum advection scheme of order `N * 2 - 1`.
 
@@ -34,7 +36,8 @@ Keyword arguments
 - `vorticity_stencil`: Stencil used for smoothness indicators in case of a `WENO` upwind reconstruction. Choices are between `VelocityStencil`
                        which uses the horizontal velocity field to diagnose smoothness and `DefaultStencil` which uses the variable
                        being transported (defaults to `VelocityStencil()`)
-- `vertical_scheme`: Scheme used for vertical advection of horizontal momentum and upwinding of divergence and kinetic energy gradient. Defaults to `EnergyConservingScheme()`.)
+- `vertical_scheme`: Scheme used for vertical advection of horizontal momentum and upwinding of divergence. Defaults to `EnergyConservingScheme()`.
+- `ke_gradient_scheme`: Scheme used for kinetic energy gradient. Defaults to `vertical_scheme`.
 - `upwinding`: Treatment of upwinding in case of Upwinding reconstruction of divergence and kinetic energy gradient. Choices are between
                          `CrossAndSelfUpwinding()`, `OnlySelfUpwinding()`, and `VelocityUpwinding()` (defaults to `OnlySelfUpwinding()`).
 - `multi_dimensional_stencil` : if `true`, use a horizontal two dimensional stencil for the reconstruction of vorticity, divergence and kinetic energy gradient.
@@ -77,18 +80,20 @@ Vector Invariant, Dimension-by-dimension reconstruction
 function VectorInvariant(; vorticity_scheme::AbstractAdvectionScheme{N, FT} = EnstrophyConservingScheme(), 
                            vorticity_stencil    = VelocityStencil(),
                            vertical_scheme      = EnergyConservingScheme(),
+                           ke_gradient_scheme   = vertical_scheme,
                            upwinding  = OnlySelfUpwinding(; cross_scheme = vertical_scheme),
                            multi_dimensional_stencil = false) where {N, FT}
         
-    return VectorInvariant{N, FT, multi_dimensional_stencil}(vorticity_scheme, vorticity_stencil, vertical_scheme, upwinding)
+    return VectorInvariant{N, FT, multi_dimensional_stencil}(vorticity_scheme, vorticity_stencil, vertical_scheme, ke_gradient_scheme, upwinding)
 end
 
-const VectorInvariantEnergyConserving         = VectorInvariant{<:Any, <:Any, <:EnergyConservingScheme}
-const VectorInvariantEnstrophyConserving      = VectorInvariant{<:Any, <:Any, <:EnstrophyConservingScheme}
-const VectorInvariantVerticalEnergyConserving = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:EnergyConservingScheme}
+const VectorInvariantEnergyConserving           = VectorInvariant{<:Any, <:Any, <:EnergyConservingScheme}
+const VectorInvariantEnstrophyConserving        = VectorInvariant{<:Any, <:Any, <:EnstrophyConservingScheme}
+const VectorInvariantVerticalEnergyConserving   = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:EnergyConservingScheme}
+const VectorInvariantKEGradientEnergyConserving = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:Any, <:EnergyConservingScheme}
 
 const VectorInvariantUpwindVorticity  = VectorInvariant{<:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme}
-const MultiDimensionalVectorInvariant = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, true}
+const MultiDimensionalVectorInvariant = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:Any, true}
 
 Base.summary(a::VectorInvariant)                 = string("Vector Invariant, Dimension-by-dimension reconstruction")
 Base.summary(a::MultiDimensionalVectorInvariant) = string("Vector Invariant, Multidimensional reconstruction")
@@ -111,6 +116,7 @@ Adapt.adapt_structure(to, scheme::VectorInvariant{N, FT, Z, ZS, V, D, M}) where
         VectorInvariant{N, FT, M}(Adapt.adapt(to, scheme.vorticity_scheme), 
                                   Adapt.adapt(to, scheme.vorticity_stencil), 
                                   Adapt.adapt(to, scheme.vertical_scheme),
+                                  Adapt.adapt(to, scheme.ke_gradient_scheme),
                                   Adapt.adapt(to, scheme.upwinding))
 
 @inline U_dot_∇u(i, j, k, grid, scheme::VectorInvariant, U) = (
@@ -152,8 +158,8 @@ end
 @inline ϕ²(i, j, k, grid, ϕ)       = @inbounds ϕ[i, j, k]^2
 @inline Khᶜᶜᶜ(i, j, k, grid, u, v) = (ℑxᶜᵃᵃ(i, j, k, grid, ϕ², u) + ℑyᵃᶜᵃ(i, j, k, grid, ϕ², v)) / 2
 
-@inline bernoulli_head_U(i, j, k, grid, ::VectorInvariantVerticalEnergyConserving, u, v) = ∂xᶠᶜᶜ(i, j, k, grid, Khᶜᶜᶜ, u, v)
-@inline bernoulli_head_V(i, j, k, grid, ::VectorInvariantVerticalEnergyConserving, u, v) = ∂yᶜᶠᶜ(i, j, k, grid, Khᶜᶜᶜ, u, v)
+@inline bernoulli_head_U(i, j, k, grid, ::VectorInvariantKEGradientEnergyConserving, u, v) = ∂xᶠᶜᶜ(i, j, k, grid, Khᶜᶜᶜ, u, v)
+@inline bernoulli_head_V(i, j, k, grid, ::VectorInvariantKEGradientEnergyConserving, u, v) = ∂yᶜᶠᶜ(i, j, k, grid, Khᶜᶜᶜ, u, v)
 
 #####
 ##### Conservative vertical advection 
diff --git a/src/Advection/vector_invariant_cross_upwinding.jl b/src/Advection/vector_invariant_cross_upwinding.jl
index 9a71d0070a..df9c3a5531 100644
--- a/src/Advection/vector_invariant_cross_upwinding.jl
+++ b/src/Advection/vector_invariant_cross_upwinding.jl
@@ -1,4 +1,4 @@
-const VectorInvariantCrossVerticalUpwinding = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme, <:CrossAndSelfUpwinding}
+const VectorInvariantCrossVerticalUpwinding = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme, <:Any, <:CrossAndSelfUpwinding}
 
 #####
 ##### Cross upwinding results in the largest kinetic energy content, 
diff --git a/src/Advection/vector_invariant_self_upwinding.jl b/src/Advection/vector_invariant_self_upwinding.jl
index 2dea587dfd..bac09d54f6 100644
--- a/src/Advection/vector_invariant_self_upwinding.jl
+++ b/src/Advection/vector_invariant_self_upwinding.jl
@@ -1,4 +1,4 @@
-const VectorInvariantSelfVerticalUpwinding = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme, <:OnlySelfUpwinding}
+const VectorInvariantSelfVerticalUpwinding = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme, <:Any, <:OnlySelfUpwinding}
 
 ##### 
 ##### Self Upwinding of Divergence Flux, the best option!
@@ -44,7 +44,7 @@ end
 ##### Self Upwinding of Kinetic Energy Gradient 
 #####
 
-const VectorInvariantVerticalUpwinding = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme}
+const VectorInvariantSelfKineticEnergyUpwinding = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme}
 
 @inline half_ϕ²(i, j, k, grid, ϕ) = @inbounds ϕ[i, j, k]^2 / 2
 
@@ -65,8 +65,8 @@ const VectorInvariantVerticalUpwinding = VectorInvariant{<:Any, <:Any, <:Any, <:
     cross_scheme = scheme.upwinding.cross_scheme
 
     δKvˢ =    _symmetric_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, cross_scheme, δx_v², u, v)
-    δKuᴸ =  _left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, δx_u², δu²_stencil, u, v)
-    δKuᴿ = _right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, δx_u², δu²_stencil, u, v)
+    δKuᴸ =  _left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.kinetic_energy_scheme, δx_u², δu²_stencil, u, v)
+    δKuᴿ = _right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.kinetic_energy_scheme, δx_u², δu²_stencil, u, v)
     
     ∂Kᴸ = (δKuᴸ + δKvˢ) / Δxᶠᶜᶜ(i, j, k, grid)
     ∂Kᴿ = (δKuᴿ + δKvˢ) / Δxᶠᶜᶜ(i, j, k, grid)
@@ -78,12 +78,12 @@ end
 
     @inbounds v̂ = v[i, j, k]
 
-    δv²_stencil   = scheme.upwinding.δv²_stencil    
+    δv²_stencil  = scheme.upwinding.δv²_stencil    
     cross_scheme = scheme.upwinding.cross_scheme
 
     δKuˢ =    _symmetric_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, cross_scheme, δy_u², u, v)
-    δKvᴸ =  _left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, δy_v², δv²_stencil, u, v) 
-    δKvᴿ = _right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, δy_v², δv²_stencil, u, v) 
+    δKvᴸ =  _left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.kinetic_energy_scheme, δy_v², δv²_stencil, u, v) 
+    δKvᴿ = _right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.kinetic_energy_scheme, δy_v², δv²_stencil, u, v) 
     
     ∂Kᴸ = (δKvᴸ + δKuˢ) / Δyᶜᶠᶜ(i, j, k, grid) 
     ∂Kᴿ = (δKvᴿ + δKuˢ) / Δyᶜᶠᶜ(i, j, k, grid)
diff --git a/src/Advection/vector_invariant_velocity_upwinding.jl b/src/Advection/vector_invariant_velocity_upwinding.jl
index 65b11920ec..2cb235ddaf 100644
--- a/src/Advection/vector_invariant_velocity_upwinding.jl
+++ b/src/Advection/vector_invariant_velocity_upwinding.jl
@@ -1,4 +1,4 @@
-const VectorInvariantVelocityVerticalUpwinding  = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme, <:VelocityUpwinding}
+const VectorInvariantVelocityVerticalUpwinding  = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme, <:Any, <:VelocityUpwinding}
 
 #####
 ##### Velocity upwinding is a Partial Upwinding where the upwind choice occurrs _inside_
@@ -56,11 +56,13 @@ end
 ##### Velocity Upwinding of Kinetic Energy gradient
 #####
 
+const VectorInvariantVelocityKEGradientUpwinding  = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme, <:VelocityUpwinding}
+
 @inline function upwinded_u²ᶜᶜᶜ(i, j, k, grid, scheme, u) 
     û = ℑxᶜᵃᵃ(i, j, k, grid, u)
 
-    Uᴸ =  _left_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, half_ϕ², u)
-    Uᴿ = _right_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, half_ϕ², u)
+    Uᴸ =  _left_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, scheme.ke_gradient_scheme, half_ϕ², u)
+    Uᴿ = _right_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, scheme.ke_gradient_scheme, half_ϕ², u)
 
     return ifelse(û > 0, Uᴸ, Uᴿ)
 end
@@ -68,8 +70,8 @@ end
 @inline function upwinded_v²ᶜᶜᶜ(i, j, k, grid, scheme, v) 
     v̂ = ℑyᵃᶜᵃ(i, j, k, grid, v)
 
-    Vᴸ =  _left_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, half_ϕ², v)
-    Vᴿ = _right_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, half_ϕ², v)
+    Vᴸ =  _left_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, scheme.ke_gradient_scheme, half_ϕ², v)
+    Vᴿ = _right_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, scheme.ke_gradient_scheme, half_ϕ², v)
 
     return ifelse(v̂ > 0, Vᴸ, Vᴿ)
 end
@@ -80,7 +82,7 @@ end
 @inline reconstructed_v²ᶜᶜᶜ(i, j, k, grid, scheme, v) = 
      _symmetric_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, scheme.upwinding.cross_scheme, half_ϕ², v)
 
-@inline function bernoulli_head_U(i, j, k, grid, scheme::VectorInvariantVelocityVerticalUpwinding, u, v)
+@inline function bernoulli_head_U(i, j, k, grid, scheme::VectorInvariantVelocityKEGradientUpwinding, u, v)
 
     δKu = δxᶠᵃᵃ(i, j, k, grid,      upwinded_u²ᶜᶜᶜ, scheme, u)
     δKv = δxᶠᵃᵃ(i, j, k, grid, reconstructed_v²ᶜᶜᶜ, scheme, v)
@@ -88,7 +90,7 @@ end
     return (δKu + δKv) / Δxᶠᶜᶜ(i, j, k, grid)
 end
 
-@inline function bernoulli_head_V(i, j, k, grid, scheme::VectorInvariantVelocityVerticalUpwinding, u, v)
+@inline function bernoulli_head_V(i, j, k, grid, scheme::VectorInvariantVelocityKEGradientUpwinding, u, v)
 
     δKu = δyᵃᶠᵃ(i, j, k, grid,      upwinded_u²ᶜᶜᶜ, scheme, u)
     δKv = δyᵃᶠᵃ(i, j, k, grid, reconstructed_v²ᶜᶜᶜ, scheme, v)

From 876e4e3e37873bd25bc2fc90ec9e6feb9d45fba0 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 14 Jul 2023 09:34:09 -0400
Subject: [PATCH 403/530] more

---
 src/Advection/vector_invariant_advection.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Advection/vector_invariant_advection.jl b/src/Advection/vector_invariant_advection.jl
index 6fea894e83..fc8b3d8499 100644
--- a/src/Advection/vector_invariant_advection.jl
+++ b/src/Advection/vector_invariant_advection.jl
@@ -16,7 +16,7 @@ struct VectorInvariant{N, FT, Z, ZS, V, K, D, M} <: AbstractAdvectionScheme{N, F
 
     VectorInvariant{N, FT, M}(vorticity_scheme::Z, vorticity_stencil::ZS, vertical_scheme::V, 
                               ke_gradient_scheme::K, upwinding::D) where {N, FT, Z, ZS, V, K, D, M} =
-        new{N, FT, Z, ZS, V, D, M}(vorticity_scheme, vorticity_stencil, vertical_scheme, ke_gradient_scheme, upwinding)
+        new{N, FT, Z, ZS, V, K, D, M}(vorticity_scheme, vorticity_stencil, vertical_scheme, ke_gradient_scheme, upwinding)
 end
 
 """

From 010517917c756b6a127cc877115778b8bf7f9ba1 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 14 Jul 2023 09:40:45 -0400
Subject: [PATCH 404/530] finished

---
 src/Advection/vector_invariant_self_upwinding.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Advection/vector_invariant_self_upwinding.jl b/src/Advection/vector_invariant_self_upwinding.jl
index bac09d54f6..f3499365c3 100644
--- a/src/Advection/vector_invariant_self_upwinding.jl
+++ b/src/Advection/vector_invariant_self_upwinding.jl
@@ -44,7 +44,7 @@ end
 ##### Self Upwinding of Kinetic Energy Gradient 
 #####
 
-const VectorInvariantSelfKineticEnergyUpwinding = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme}
+const VectorInvariantKineticEnergyUpwinding = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme}
 
 @inline half_ϕ²(i, j, k, grid, ϕ) = @inbounds ϕ[i, j, k]^2 / 2
 
@@ -57,7 +57,7 @@ const VectorInvariantSelfKineticEnergyUpwinding = VectorInvariant{<:Any, <:Any,
 @inline u_smoothness(i, j, k, grid, u, v) = ℑxᶜᵃᵃ(i, j, k, grid, u)
 @inline v_smoothness(i, j, k, grid, u, v) = ℑyᵃᶜᵃ(i, j, k, grid, v)
 
-@inline function bernoulli_head_U(i, j, k, grid, scheme::VectorInvariantVerticalUpwinding, u, v)
+@inline function bernoulli_head_U(i, j, k, grid, scheme::VectorInvariantKineticEnergyUpwinding, u, v)
 
     @inbounds û = u[i, j, k]
 
@@ -74,7 +74,7 @@ const VectorInvariantSelfKineticEnergyUpwinding = VectorInvariant{<:Any, <:Any,
     return ifelse(û > 0, ∂Kᴸ, ∂Kᴿ)
 end
 
-@inline function bernoulli_head_V(i, j, k, grid, scheme::VectorInvariantVerticalUpwinding, u, v)
+@inline function bernoulli_head_V(i, j, k, grid, scheme::VectorInvariantKineticEnergyUpwinding, u, v)
 
     @inbounds v̂ = v[i, j, k]
 

From 3b79b9a3b902e966069e0d868c539fd2a85258af Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 14 Jul 2023 14:49:51 -0400
Subject: [PATCH 405/530] test hypothesis

---
 .../calculate_hydrostatic_free_surface_tendencies.jl            | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index 0b7029ded9..58bc6a5bc2 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -91,6 +91,8 @@ function calculate_hydrostatic_free_surface_tendency_contributions!(model, kerne
     arch = model.architecture
     grid = model.grid
 
+    @info kernel_parameters
+
     calculate_hydrostatic_momentum_tendencies!(model, model.velocities, kernel_parameters)
 
     top_tracer_bcs = top_tracer_boundary_conditions(grid, model.tracers)

From d6520aa65171953fa9bf4ee30ecda6119cd6d10e Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 14 Jul 2023 15:20:21 -0400
Subject: [PATCH 406/530] fixed bug - correct speed now

---
 .../calculate_hydrostatic_free_surface_tendencies.jl | 12 +++++-------
 .../calculate_nonhydrostatic_tendencies.jl           |  6 ++----
 src/Utils/kernel_launching.jl                        |  4 ++--
 3 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index 58bc6a5bc2..6168537e86 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -27,7 +27,7 @@ function compute_tendencies!(model::HydrostaticFreeSurfaceModel, callbacks)
 
     # Calculate contributions to momentum and tracer tendencies from fluxes and volume terms in the
     # interior of the domain
-    calculate_hydrostatic_free_surface_tendency_contributions!(model, kernel_parameters)
+    calculate_hydrostatic_free_surface_tendency_contributions!(model, kernel_parameters; only_active_cells = use_only_active_interior_cells(model.grid))
     complete_communication_and_compute_boundary!(model, model.grid, model.architecture)
 
     # Calculate contributions to momentum and tracer tendencies from user-prescribed fluxes across the
@@ -86,14 +86,14 @@ top_tracer_boundary_conditions(grid, tracers) =
     NamedTuple(c => tracers[c].boundary_conditions.top for c in propertynames(tracers))
 
 """ Store previous value of the source term and calculate current source term. """
-function calculate_hydrostatic_free_surface_tendency_contributions!(model, kernel_parameters)
+function calculate_hydrostatic_free_surface_tendency_contributions!(model, kernel_parameters; only_active_cells = false)
 
     arch = model.architecture
     grid = model.grid
 
     @info kernel_parameters
 
-    calculate_hydrostatic_momentum_tendencies!(model, model.velocities, kernel_parameters)
+    calculate_hydrostatic_momentum_tendencies!(model, model.velocities, kernel_parameters; only_active_cells)
 
     top_tracer_bcs = top_tracer_boundary_conditions(grid, model.tracers)
     only_active_cells = use_only_active_interior_cells(grid)
@@ -166,7 +166,7 @@ function calculate_free_surface_tendency!(grid, model, kernel_parameters)
 end
 
 """ Calculate momentum tendencies if momentum is not prescribed."""
-function calculate_hydrostatic_momentum_tendencies!(model, velocities, kernel_parameters)
+function calculate_hydrostatic_momentum_tendencies!(model, velocities, kernel_parameters; only_active_cells = false)
 
     grid = model.grid
     arch = architecture(grid)
@@ -190,9 +190,7 @@ function calculate_hydrostatic_momentum_tendencies!(model, velocities, kernel_pa
 
     u_kernel_args = tuple(start_momentum_kernel_args..., u_immersed_bc, end_momentum_kernel_args...)
     v_kernel_args = tuple(start_momentum_kernel_args..., v_immersed_bc, end_momentum_kernel_args...)
-    
-    only_active_cells = use_only_active_interior_cells(grid)
-    
+        
     for parameters in kernel_parameters
         launch!(arch, grid, parameters,
                 calculate_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, grid, u_kernel_args;
diff --git a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
index 1fc43f7a91..7403428fd0 100644
--- a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
+++ b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
@@ -27,7 +27,7 @@ function compute_tendencies!(model::NonhydrostaticModel, callbacks)
     # interior of the domain
     kernel_parameters = tuple(interior_tendency_kernel_parameters(model.grid))
 
-    calculate_interior_tendency_contributions!(model, kernel_parameters)
+    calculate_interior_tendency_contributions!(model, kernel_parameters; only_active_cells = use_only_active_interior_cells(model.grid))
     complete_communication_and_compute_boundary!(model, model.grid, model.architecture)
                       
     # Calculate contributions to momentum and tracer tendencies from user-prescribed fluxes across the
@@ -49,7 +49,7 @@ function compute_tendencies!(model::NonhydrostaticModel, callbacks)
 end
 
 """ Store previous value of the source term and calculate current source term. """
-function calculate_interior_tendency_contributions!(model, kernel_parameters)
+function calculate_interior_tendency_contributions!(model, kernel_parameters; only_active_cells = false)
 
     tendencies           = model.timestepper.Gⁿ
     arch                 = model.architecture
@@ -87,8 +87,6 @@ function calculate_interior_tendency_contributions!(model, kernel_parameters)
     u_kernel_args = tuple(start_momentum_kernel_args..., u_immersed_bc, end_momentum_kernel_args..., forcings, hydrostatic_pressure, clock)
     v_kernel_args = tuple(start_momentum_kernel_args..., v_immersed_bc, end_momentum_kernel_args..., forcings, hydrostatic_pressure, clock)
     w_kernel_args = tuple(start_momentum_kernel_args..., w_immersed_bc, end_momentum_kernel_args..., forcings, clock)
-    
-    only_active_cells = use_only_active_interior_cells(grid)
 
     for parameters in kernel_parameters
         launch!(arch, grid, parameters, calculate_Gu!, 
diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index ba4ae890bb..46138d0cca 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -76,7 +76,7 @@ function work_layout(grid, workdims::Symbol; include_right_boundaries=false, loc
     return workgroup, worksize
 end
 
-@inline active_cells_work_layout(size, only_active_cells, grid) = heuristic_workgroup(size...), size
+@inline active_cells_work_layout(workgroup, worksize, only_active_cells, grid) = workgroup, worksize
 @inline use_only_active_interior_cells(grid) = nothing
 
 """
@@ -100,7 +100,7 @@ function launch!(arch, grid, workspec, kernel!, kernel_args...;
     offset = offsets(workspec)
 
     if !isnothing(only_active_cells)
-        workgroup, worksize = active_cells_work_layout(worksize, only_active_cells, grid) 
+        workgroup, worksize = active_cells_work_layout(workgroup, worksize, only_active_cells, grid) 
         offset = nothing
     end
 

From 5dbf9aac6955f4fff1fa767dd4e7fd5228bfa7a0 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 14 Jul 2023 15:23:48 -0400
Subject: [PATCH 407/530] add space

---
 src/Utils/kernel_launching.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index 46138d0cca..44144d09cb 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -107,6 +107,7 @@ function launch!(arch, grid, workspec, kernel!, kernel_args...;
     if worksize == 0
         return nothing
     end
+    
     loop! = isnothing(offset) ? kernel!(Architectures.device(arch), workgroup, worksize) : 
                                 kernel!(Architectures.device(arch), workgroup, worksize, offset) 
 

From 70ac393a31074e54f9bec8c6379348c844412d71 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sat, 15 Jul 2023 13:31:13 -0400
Subject: [PATCH 408/530] bugfix

---
 src/ImmersedBoundaries/active_cells_map.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ImmersedBoundaries/active_cells_map.jl b/src/ImmersedBoundaries/active_cells_map.jl
index f352ffbf76..ea7c9fa06d 100644
--- a/src/ImmersedBoundaries/active_cells_map.jl
+++ b/src/ImmersedBoundaries/active_cells_map.jl
@@ -16,8 +16,8 @@ struct SurfaceMap end
 @inline use_only_active_surface_cells(grid::AbstractGrid)   = nothing
 @inline use_only_active_surface_cells(grid::ActiveCellsIBG) = SurfaceMap()
 
-@inline active_cells_work_layout(size, ::InteriorMap, grid::ActiveCellsIBG) = min(length(grid.active_cells_interior), 256), length(grid.active_cells_interior)
-@inline active_cells_work_layout(size, ::SurfaceMap,  grid::ActiveCellsIBG) = min(length(grid.active_cells_surface),  256), length(grid.active_cells_surface)
+@inline active_cells_work_layout(group, size, ::InteriorMap, grid::ActiveCellsIBG) = min(length(grid.active_cells_interior), 256), length(grid.active_cells_interior)
+@inline active_cells_work_layout(group, size, ::SurfaceMap,  grid::ActiveCellsIBG) = min(length(grid.active_cells_surface),  256), length(grid.active_cells_surface)
 
 @inline active_linear_index_to_interior_tuple(idx, grid::ActiveCellsIBG) = Base.map(Int, grid.active_cells_interior[idx])
 @inline  active_linear_index_to_surface_tuple(idx, grid::ActiveCellsIBG) = Base.map(Int, grid.active_cells_surface[idx])

From 7d03b63fc93f4c9d1771bd3b143fa318e7d4bf53 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sat, 15 Jul 2023 15:41:22 -0400
Subject: [PATCH 409/530] test

---
 ...ate_hydrostatic_free_surface_tendencies.jl |  2 --
 .../hydrostatic_free_surface_model.jl         | 33 +++++++++----------
 src/Utils/kernel_launching.jl                 |  2 +-
 3 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index 6168537e86..ffd44c5ba7 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -91,8 +91,6 @@ function calculate_hydrostatic_free_surface_tendency_contributions!(model, kerne
     arch = model.architecture
     grid = model.grid
 
-    @info kernel_parameters
-
     calculate_hydrostatic_momentum_tendencies!(model, model.velocities, kernel_parameters; only_active_cells)
 
     top_tracer_bcs = top_tracer_boundary_conditions(grid, model.tracers)
diff --git a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl
index 864b61fc23..5eb4cc605a 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl
@@ -67,7 +67,6 @@ end
                                           pressure = nothing,
                                 diffusivity_fields = nothing,
                                   auxiliary_fields = NamedTuple(),
-            calculate_only_active_cells_tendencies = false
     )
 
 Construct a hydrostatic model with a free surface on `grid`.
@@ -96,22 +95,22 @@ Keyword arguments
 
 """
 function HydrostaticFreeSurfaceModel(; grid,
-                                             clock = Clock{eltype(grid)}(0, 0, 1),
-                                momentum_advection = CenteredSecondOrder(),
-                                  tracer_advection = CenteredSecondOrder(),
-                                          buoyancy = SeawaterBuoyancy(eltype(grid)),
-                                          coriolis = nothing,
-                                      free_surface = ImplicitFreeSurface(gravitational_acceleration=g_Earth),
-                               forcing::NamedTuple = NamedTuple(),
-                                           closure = nothing,
-                   boundary_conditions::NamedTuple = NamedTuple(),
-                                           tracers = (:T, :S),
-                     particles::ParticlesOrNothing = nothing,
-             biogeochemistry::AbstractBGCOrNothing = nothing,
-                                        velocities = nothing,
-                                          pressure = nothing,
-                                diffusivity_fields = nothing,
-                                  auxiliary_fields = NamedTuple()
+                                      clock = Clock{eltype(grid)}(0, 0, 1),
+                         momentum_advection = CenteredSecondOrder(),
+                           tracer_advection = CenteredSecondOrder(),
+                                   buoyancy = SeawaterBuoyancy(eltype(grid)),
+                                   coriolis = nothing,
+                               free_surface = ImplicitFreeSurface(gravitational_acceleration=g_Earth),
+                        forcing::NamedTuple = NamedTuple(),
+                                    closure = nothing,
+            boundary_conditions::NamedTuple = NamedTuple(),
+                                    tracers = (:T, :S),
+              particles::ParticlesOrNothing = nothing,
+      biogeochemistry::AbstractBGCOrNothing = nothing,
+                                 velocities = nothing,
+                                   pressure = nothing,
+                         diffusivity_fields = nothing,
+                           auxiliary_fields = NamedTuple()
     )
 
     # Check halos and throw an error if the grid's halo is too small
diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index 44144d09cb..850888acfc 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -111,7 +111,7 @@ function launch!(arch, grid, workspec, kernel!, kernel_args...;
     loop! = isnothing(offset) ? kernel!(Architectures.device(arch), workgroup, worksize) : 
                                 kernel!(Architectures.device(arch), workgroup, worksize, offset) 
 
-    @debug "Launching kernel $kernel! with worksize $worksize and offsets $offset"
+    @info "Launching kernel $kernel! with worksize $worksize and offsets $offset"
 
     loop!(kernel_args...)
 

From 056ff34fda2fdc96b9da0e0178b4fd95b32e0fb8 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sat, 15 Jul 2023 15:42:33 -0400
Subject: [PATCH 410/530] more info

---
 src/Utils/kernel_launching.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index 850888acfc..09cc6ff19a 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -111,7 +111,7 @@ function launch!(arch, grid, workspec, kernel!, kernel_args...;
     loop! = isnothing(offset) ? kernel!(Architectures.device(arch), workgroup, worksize) : 
                                 kernel!(Architectures.device(arch), workgroup, worksize, offset) 
 
-    @info "Launching kernel $kernel! with worksize $worksize and offsets $offset"
+    @info "Launching kernel $kernel! with worksize $worksize and offsets $offset from $workspec"
 
     loop!(kernel_args...)
 

From 25141304b7414c87464549249482ae047239a3f8 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sat, 15 Jul 2023 16:16:49 -0400
Subject: [PATCH 411/530] removed left-right connected computation

---
 src/Distributed/interleave_comm_and_comp.jl   | 10 +++++-----
 ...late_nonhydrostatic_boundary_tendencies.jl | 20 ++++++++-----------
 2 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/src/Distributed/interleave_comm_and_comp.jl b/src/Distributed/interleave_comm_and_comp.jl
index 5b97cef979..540cca2eef 100644
--- a/src/Distributed/interleave_comm_and_comp.jl
+++ b/src/Distributed/interleave_comm_and_comp.jl
@@ -31,14 +31,14 @@ interior_tendency_kernel_parameters(grid, ::BlockingDistributedArch) = :xyz
 function interior_tendency_kernel_parameters(grid, arch)
     Rx, Ry, _ = arch.ranks
     Hx, Hy, _ = halo_size(grid)
-
+    Tx, Ty, _ = topology(grid)
     Nx, Ny, Nz = size(grid)
     
-    Sx = Rx == 1 ? Nx : Nx - 2Hx
-    Sy = Ry == 1 ? Ny : Ny - 2Hy
+    Sx = Rx == 1 ? Nx : (Tx == RightConnected || Tx == LeftConnected ? Nx - Hx : Nx - 2Hx)
+    Sy = Ry == 1 ? Ny : (Ty == RightConnected || Ty == LeftConnected ? Ny - Hy : Ny - 2Hy)
 
-    Ox = Rx == 1 ? 0 : Hx
-    Oy = Ry == 1 ? 0 : Hy
+    Ox = Rx == 1 || Tx == RightConnected ? 0 : Hx
+    Oy = Ry == 1 || Ty == RightConnected ? 0 : Hy
      
     return KernelParameters((Sx, Sy, Nz), (Ox, Oy, 0))
 end
diff --git a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_boundary_tendencies.jl b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_boundary_tendencies.jl
index 29b78e0186..fa031cb43a 100644
--- a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_boundary_tendencies.jl
+++ b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_boundary_tendencies.jl
@@ -89,18 +89,14 @@ using Oceananigans.Operators: XFlatGrid, YFlatGrid
 # Recompute only on communicating sides 
 function boundary_parameters(S, O, grid, arch) 
     Rx, Ry, _ = arch.ranks
+    Tx, Ty, _ = topology(grid)
 
-    include_x = !isa(grid, XFlatGrid) && (Rx != 1)
-    include_y = !isa(grid, YFlatGrid) && (Ry != 1)
-
-    if include_x && include_y
-        return Tuple(KernelParameters(S[i], O[i]) for i in 1:4)
-    elseif include_x && !(include_y)
-        return Tuple(KernelParameters(S[i], O[i]) for i in 1:2:3)
-    elseif !(include_x) && include_y
-        return Tuple(KernelParameters(S[i], O[i]) for i in 2:2:4)
-    else
-        return ()
-    end
+    include_xᴸ = !isa(grid, XFlatGrid) && (Rx != 1) && !(Tx == RightConnected)
+    include_yᴸ = !isa(grid, YFlatGrid) && (Ry != 1) && !(Ty == RightConnected)
+    include_xᴿ = !isa(grid, XFlatGrid) && (Rx != 1) && !(Tx == LeftConnected)
+    include_yᴿ = !isa(grid, YFlatGrid) && (Ry != 1) && !(Ty == LeftConnected)
+
+    include_side = (include_xᴸ, include_yᴸ, include_xᴿ, include_yᴿ)
+    return Tuple(KernelParameters(S[i], O[i]) for i in findall(include_side))
 end
 

From cea3240fddb49106b79c376b89dc95d1122f0150 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sat, 15 Jul 2023 16:45:10 -0400
Subject: [PATCH 412/530] bugfix

---
 .../calculate_hydrostatic_free_surface_tendencies.jl        | 4 ++--
 .../calculate_nonhydrostatic_tendencies.jl                  | 2 +-
 src/Utils/kernel_launching.jl                               | 6 +++++-
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index ffd44c5ba7..4e6a8a4c84 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -86,7 +86,7 @@ top_tracer_boundary_conditions(grid, tracers) =
     NamedTuple(c => tracers[c].boundary_conditions.top for c in propertynames(tracers))
 
 """ Store previous value of the source term and calculate current source term. """
-function calculate_hydrostatic_free_surface_tendency_contributions!(model, kernel_parameters; only_active_cells = false)
+function calculate_hydrostatic_free_surface_tendency_contributions!(model, kernel_parameters; only_active_cells = nothing)
 
     arch = model.architecture
     grid = model.grid
@@ -164,7 +164,7 @@ function calculate_free_surface_tendency!(grid, model, kernel_parameters)
 end
 
 """ Calculate momentum tendencies if momentum is not prescribed."""
-function calculate_hydrostatic_momentum_tendencies!(model, velocities, kernel_parameters; only_active_cells = false)
+function calculate_hydrostatic_momentum_tendencies!(model, velocities, kernel_parameters; only_active_cells = nothing)
 
     grid = model.grid
     arch = architecture(grid)
diff --git a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
index 7403428fd0..f038d07eb0 100644
--- a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
+++ b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
@@ -49,7 +49,7 @@ function compute_tendencies!(model::NonhydrostaticModel, callbacks)
 end
 
 """ Store previous value of the source term and calculate current source term. """
-function calculate_interior_tendency_contributions!(model, kernel_parameters; only_active_cells = false)
+function calculate_interior_tendency_contributions!(model, kernel_parameters; only_active_cells = nothing)
 
     tendencies           = model.timestepper.Gⁿ
     arch                 = model.architecture
diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index 09cc6ff19a..8868d8269d 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -97,9 +97,13 @@ function launch!(arch, grid, workspec, kernel!, kernel_args...;
                                       reduced_dimensions,
                                       location)
 
+    if !only_active_cells
+        only_active_cells = nothing
+    end
+    
     offset = offsets(workspec)
 
-    if !isnothing(only_active_cells)
+    if !isnothing(only_active_cells) 
         workgroup, worksize = active_cells_work_layout(workgroup, worksize, only_active_cells, grid) 
         offset = nothing
     end

From c1b2049280e6d2c52a2a216a94bcd0ecad319061 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sat, 15 Jul 2023 16:46:49 -0400
Subject: [PATCH 413/530] remove info

---
 src/Utils/kernel_launching.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index 8868d8269d..c1d9430fc7 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -100,7 +100,7 @@ function launch!(arch, grid, workspec, kernel!, kernel_args...;
     if !only_active_cells
         only_active_cells = nothing
     end
-    
+
     offset = offsets(workspec)
 
     if !isnothing(only_active_cells) 
@@ -115,7 +115,7 @@ function launch!(arch, grid, workspec, kernel!, kernel_args...;
     loop! = isnothing(offset) ? kernel!(Architectures.device(arch), workgroup, worksize) : 
                                 kernel!(Architectures.device(arch), workgroup, worksize, offset) 
 
-    @info "Launching kernel $kernel! with worksize $worksize and offsets $offset from $workspec"
+    @debug "Launching kernel $kernel! with worksize $worksize and offsets $offset from $workspec"
 
     loop!(kernel_args...)
 

From abea7ef9113b927b1a73734adb7af2fcca38df78 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sat, 15 Jul 2023 16:54:50 -0400
Subject: [PATCH 414/530] improve

---
 ...ate_hydrostatic_free_surface_tendencies.jl | 26 ++++++++++--------
 .../calculate_nonhydrostatic_tendencies.jl    | 27 ++++++++++---------
 src/Utils/kernel_launching.jl                 |  4 ---
 3 files changed, 29 insertions(+), 28 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index 4e6a8a4c84..9f565e1e98 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -13,7 +13,8 @@ import Oceananigans: tracer_tendency_kernel_function
 import Oceananigans.Distributed: complete_communication_and_compute_boundary!
 import Oceananigans.Distributed: interior_tendency_kernel_parameters
 
-using Oceananigans.ImmersedBoundaries: use_only_active_interior_cells, ActiveCellsIBG, active_linear_index_to_interior_tuple
+using Oceananigans.ImmersedBoundaries: use_only_active_interior_cells, ActiveCellsIBG, 
+                                       InteriorMap, active_linear_index_to_interior_tuple
 
 """
     compute_tendencies!(model::HydrostaticFreeSurfaceModel, callbacks)
@@ -125,6 +126,7 @@ function calculate_hydrostatic_free_surface_tendency_contributions!(model, kerne
                     tendency_kernel!,
                     c_tendency,
                     grid,
+                    only_active_cells,
                     args;
                     only_active_cells)
         end
@@ -191,11 +193,13 @@ function calculate_hydrostatic_momentum_tendencies!(model, velocities, kernel_pa
         
     for parameters in kernel_parameters
         launch!(arch, grid, parameters,
-                calculate_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, grid, u_kernel_args;
+                calculate_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, grid, 
+                only_active_cells, u_kernel_args;
                 only_active_cells)
 
         launch!(arch, grid, parameters,
-                calculate_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, grid, v_kernel_args;
+                calculate_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, grid, 
+                only_active_cells, v_kernel_args;
                 only_active_cells)
     end
 
@@ -228,24 +232,24 @@ end
 #####
 
 """ Calculate the right-hand-side of the u-velocity equation. """
-@kernel function calculate_hydrostatic_free_surface_Gu!(Gu, grid, args)
+@kernel function calculate_hydrostatic_free_surface_Gu!(Gu, grid, interior_map, args)
     i, j, k = @index(Global, NTuple)
     @inbounds Gu[i, j, k] = hydrostatic_free_surface_u_velocity_tendency(i, j, k, grid, args...)
 end
 
-@kernel function calculate_hydrostatic_free_surface_Gu!(Gu, grid::ActiveCellsIBG, args)
+@kernel function calculate_hydrostatic_free_surface_Gu!(Gu, grid::ActiveCellsIBG, ::InteriorMap, args)
     idx = @index(Global, Linear)
     i, j, k = active_linear_index_to_interior_tuple(idx, grid)
     @inbounds Gu[i, j, k] = hydrostatic_free_surface_u_velocity_tendency(i, j, k, grid, args...)
 end
 
 """ Calculate the right-hand-side of the v-velocity equation. """
-@kernel function calculate_hydrostatic_free_surface_Gv!(Gv, grid, args)
+@kernel function calculate_hydrostatic_free_surface_Gv!(Gv, grid, interior_map, args)
     i, j, k = @index(Global, NTuple)
     @inbounds Gv[i, j, k] = hydrostatic_free_surface_v_velocity_tendency(i, j, k, grid, args...)
 end
 
-@kernel function calculate_hydrostatic_free_surface_Gv!(Gv, grid::ActiveCellsIBG, args)
+@kernel function calculate_hydrostatic_free_surface_Gv!(Gv, grid::ActiveCellsIBG, ::InteriorMap, args)
     idx = @index(Global, Linear)
     i, j, k = active_linear_index_to_interior_tuple(idx, grid)
     @inbounds Gv[i, j, k] = hydrostatic_free_surface_v_velocity_tendency(i, j, k, grid, args...)
@@ -256,24 +260,24 @@ end
 #####
 
 """ Calculate the right-hand-side of the tracer advection-diffusion equation. """
-@kernel function calculate_hydrostatic_free_surface_Gc!(Gc, grid, args)
+@kernel function calculate_hydrostatic_free_surface_Gc!(Gc, grid, interior_map, args)
     i, j, k = @index(Global, NTuple)
     @inbounds Gc[i, j, k] = hydrostatic_free_surface_tracer_tendency(i, j, k, grid, args...)
 end
 
-@kernel function calculate_hydrostatic_free_surface_Gc!(Gc, grid::ActiveCellsIBG, args)
+@kernel function calculate_hydrostatic_free_surface_Gc!(Gc, grid::ActiveCellsIBG, ::InteriorMap, args)
     idx = @index(Global, Linear)
     i, j, k = active_linear_index_to_interior_tuple(idx, grid)
     @inbounds Gc[i, j, k] = hydrostatic_free_surface_tracer_tendency(i, j, k, grid, args...)
 end
 
 """ Calculate the right-hand-side of the subgrid scale energy equation. """
-@kernel function calculate_hydrostatic_free_surface_Ge!(Ge, grid, args)
+@kernel function calculate_hydrostatic_free_surface_Ge!(Ge, grid, interior_map, args)
     i, j, k = @index(Global, NTuple)
     @inbounds Ge[i, j, k] = hydrostatic_turbulent_kinetic_energy_tendency(i, j, k, grid, args...)
 end
 
-@kernel function calculate_hydrostatic_free_surface_Ge!(Ge, grid::ActiveCellsIBG, args)
+@kernel function calculate_hydrostatic_free_surface_Ge!(Ge, grid::ActiveCellsIBG, ::InteriorMap, args)
     idx = @index(Global, Linear)
     i, j, k = active_linear_index_to_interior_tuple(idx, grid)
     @inbounds Ge[i, j, k] = hydrostatic_turbulent_kinetic_energy_tendency(i, j, k, grid, args...)
diff --git a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
index f038d07eb0..6f94527e28 100644
--- a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
+++ b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
@@ -3,7 +3,8 @@ using Oceananigans: fields, TendencyCallsite
 using Oceananigans.Utils: work_layout
 using Oceananigans.Distributed: complete_communication_and_compute_boundary!, interior_tendency_kernel_parameters
 
-using Oceananigans.ImmersedBoundaries: use_only_active_interior_cells, ActiveCellsIBG, active_linear_index_to_interior_tuple
+using Oceananigans.ImmersedBoundaries: use_only_active_interior_cells, ActiveCellsIBG, 
+                                       InteriorMap, active_linear_index_to_interior_tuple
 
 import Oceananigans.TimeSteppers: compute_tendencies!
 
@@ -90,15 +91,15 @@ function calculate_interior_tendency_contributions!(model, kernel_parameters; on
 
     for parameters in kernel_parameters
         launch!(arch, grid, parameters, calculate_Gu!, 
-                tendencies.u, grid, u_kernel_args;
+                tendencies.u, grid, only_active_cells, u_kernel_args;
                 only_active_cells)
 
         launch!(arch, grid, parameters, calculate_Gv!, 
-                tendencies.v, grid, v_kernel_args;
+                tendencies.v, grid, only_active_cells, v_kernel_args;
                 only_active_cells)
 
         launch!(arch, grid, parameters, calculate_Gw!, 
-                tendencies.w, grid, w_kernel_args;
+                tendencies.w, grid, only_active_cells, w_kernel_args;
                 only_active_cells)
     end
 
@@ -119,7 +120,7 @@ function calculate_interior_tendency_contributions!(model, kernel_parameters; on
 
         for parameters in kernel_parameters
             launch!(arch, grid, parameters, calculate_Gc!, 
-                    c_tendency, grid, args;
+                    c_tendency, grid, only_active_cells, args;
                     only_active_cells)
         end
     end
@@ -132,36 +133,36 @@ end
 #####
 
 """ Calculate the right-hand-side of the u-velocity equation. """
-@kernel function calculate_Gu!(Gu, grid, args) 
+@kernel function calculate_Gu!(Gu, grid, interior_map, args) 
     i, j, k = @index(Global, NTuple)
     @inbounds Gu[i, j, k] = u_velocity_tendency(i, j, k, grid, args...)
 end
 
-@kernel function calculate_Gu!(Gu, grid::ActiveCellsIBG, args) 
+@kernel function calculate_Gu!(Gu, grid::ActiveCellsIBG, ::InteriorMap, args) 
     idx = @index(Global, Linear)
     i, j, k = active_linear_index_to_interior_tuple(idx, grid)
     @inbounds Gu[i, j, k] = u_velocity_tendency(i, j, k, grid, args...)
 end
 
 """ Calculate the right-hand-side of the v-velocity equation. """
-@kernel function calculate_Gv!(Gv, grid, args) 
+@kernel function calculate_Gv!(Gv, grid, interior_map, args) 
     i, j, k = @index(Global, NTuple)
     @inbounds Gv[i, j, k] = v_velocity_tendency(i, j, k, grid, args...)
 end
 
-@kernel function calculate_Gv!(Gv, grid::ActiveCellsIBG, args) 
+@kernel function calculate_Gv!(Gv, grid::ActiveCellsIBG, ::InteriorMap, args) 
     idx = @index(Global, Linear)
     i, j, k = active_linear_index_to_interior_tuple(idx, grid)
     @inbounds Gv[i, j, k] = v_velocity_tendency(i, j, k, grid, args...)
 end
 
 """ Calculate the right-hand-side of the w-velocity equation. """
-@kernel function calculate_Gw!(Gw, grid, args) 
+@kernel function calculate_Gw!(Gw, grid, interior_map, args) 
     i, j, k = @index(Global, NTuple)
     @inbounds Gw[i, j, k] = w_velocity_tendency(i, j, k, grid, args...)
 end
 
-@kernel function calculate_Gw!(Gw, grid::ActiveCellsIBG, args)
+@kernel function calculate_Gw!(Gw, grid::ActiveCellsIBG, ::InteriorMap, args)
     idx = @index(Global, Linear)
     i, j, k = active_linear_index_to_interior_tuple(idx, grid)
     @inbounds Gw[i, j, k] = w_velocity_tendency(i, j, k, grid, args...)
@@ -172,12 +173,12 @@ end
 #####
 
 """ Calculate the right-hand-side of the tracer advection-diffusion equation. """
-@kernel function calculate_Gc!(Gc, grid, args)
+@kernel function calculate_Gc!(Gc, grid, interior_map, args)
     i, j, k = @index(Global, NTuple)
     @inbounds Gc[i, j, k] = tracer_tendency(i, j, k, grid, args...)
 end
 
-@kernel function calculate_Gc!(Gc, grid::ActiveCellsIBG, args) 
+@kernel function calculate_Gc!(Gc, grid::ActiveCellsIBG, ::InteriorMap, args) 
     idx = @index(Global, Linear)
     i, j, k = active_linear_index_to_interior_tuple(idx, grid)
     @inbounds Gc[i, j, k] = tracer_tendency(i, j, k, grid, args...)
diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index c1d9430fc7..5f0710dfaa 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -97,10 +97,6 @@ function launch!(arch, grid, workspec, kernel!, kernel_args...;
                                       reduced_dimensions,
                                       location)
 
-    if !only_active_cells
-        only_active_cells = nothing
-    end
-
     offset = offsets(workspec)
 
     if !isnothing(only_active_cells) 

From c6deb5e0eb73fff1ee4049d9e4d1eeff79b99fa1 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sat, 15 Jul 2023 16:56:30 -0400
Subject: [PATCH 415/530] typo

---
 src/Utils/kernel_launching.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index 5f0710dfaa..b161a1ce11 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -83,7 +83,7 @@ end
     launch!(arch, grid, layout, kernel!, args...; kwargs...)
 
 Launches `kernel!`, with arguments `args` and keyword arguments `kwargs`,
-over the `dims` of `grid` on the architecture `arch`. kernels run on the defaul stream
+over the `dims` of `grid` on the architecture `arch`. kernels run on the default stream
 """
 function launch!(arch, grid, workspec, kernel!, kernel_args...;
                  include_right_boundaries = false,

From 66965ff32c1569ca05d8cd6a1d560adf542fbbea Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sat, 15 Jul 2023 18:03:04 -0400
Subject: [PATCH 416/530] bugfix

---
 .../calculate_hydrostatic_free_surface_tendencies.jl             | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
index 9f565e1e98..0b734c011d 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
@@ -95,7 +95,6 @@ function calculate_hydrostatic_free_surface_tendency_contributions!(model, kerne
     calculate_hydrostatic_momentum_tendencies!(model, model.velocities, kernel_parameters; only_active_cells)
 
     top_tracer_bcs = top_tracer_boundary_conditions(grid, model.tracers)
-    only_active_cells = use_only_active_interior_cells(grid)
 
     for (tracer_index, tracer_name) in enumerate(propertynames(model.tracers))
         @inbounds c_tendency    = model.timestepper.Gⁿ[tracer_name]

From 2e7354e1a62a4fb7d477bb784805a8a50766d1a4 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sat, 15 Jul 2023 20:09:54 -0400
Subject: [PATCH 417/530] bugfix

---
 src/Advection/vector_invariant_self_upwinding.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/Advection/vector_invariant_self_upwinding.jl b/src/Advection/vector_invariant_self_upwinding.jl
index f3499365c3..e3f094c8d9 100644
--- a/src/Advection/vector_invariant_self_upwinding.jl
+++ b/src/Advection/vector_invariant_self_upwinding.jl
@@ -65,8 +65,8 @@ const VectorInvariantKineticEnergyUpwinding = VectorInvariant{<:Any, <:Any, <:An
     cross_scheme = scheme.upwinding.cross_scheme
 
     δKvˢ =    _symmetric_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, cross_scheme, δx_v², u, v)
-    δKuᴸ =  _left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.kinetic_energy_scheme, δx_u², δu²_stencil, u, v)
-    δKuᴿ = _right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.kinetic_energy_scheme, δx_u², δu²_stencil, u, v)
+    δKuᴸ =  _left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.ke_gradient_scheme, δx_u², δu²_stencil, u, v)
+    δKuᴿ = _right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.ke_gradient_scheme, δx_u², δu²_stencil, u, v)
     
     ∂Kᴸ = (δKuᴸ + δKvˢ) / Δxᶠᶜᶜ(i, j, k, grid)
     ∂Kᴿ = (δKuᴿ + δKvˢ) / Δxᶠᶜᶜ(i, j, k, grid)
@@ -82,8 +82,8 @@ end
     cross_scheme = scheme.upwinding.cross_scheme
 
     δKuˢ =    _symmetric_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, cross_scheme, δy_u², u, v)
-    δKvᴸ =  _left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.kinetic_energy_scheme, δy_v², δv²_stencil, u, v) 
-    δKvᴿ = _right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.kinetic_energy_scheme, δy_v², δv²_stencil, u, v) 
+    δKvᴸ =  _left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.ke_gradient_scheme, δy_v², δv²_stencil, u, v) 
+    δKvᴿ = _right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.ke_gradient_scheme, δy_v², δv²_stencil, u, v) 
     
     ∂Kᴸ = (δKvᴸ + δKuˢ) / Δyᶜᶠᶜ(i, j, k, grid) 
     ∂Kᴿ = (δKvᴿ + δKuˢ) / Δyᶜᶠᶜ(i, j, k, grid)

From 403e74f2cb110fee310d82b4b8f91afc2668330b Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sat, 15 Jul 2023 20:14:03 -0400
Subject: [PATCH 418/530] correct comments

---
 src/Advection/vector_invariant_advection.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Advection/vector_invariant_advection.jl b/src/Advection/vector_invariant_advection.jl
index fc8b3d8499..cc7df2c3c0 100644
--- a/src/Advection/vector_invariant_advection.jl
+++ b/src/Advection/vector_invariant_advection.jl
@@ -10,8 +10,8 @@ EnstrophyConservingScheme(FT::DataType = Float64) = EnstrophyConservingScheme{FT
 struct VectorInvariant{N, FT, Z, ZS, V, K, D, M} <: AbstractAdvectionScheme{N, FT}
     vorticity_scheme   :: Z  # reconstruction scheme for vorticity flux
     vorticity_stencil  :: ZS # stencil used for assessing vorticity smoothness
-    vertical_scheme    :: V  # stencil used for assessing divergence smoothness
-    ke_gradient_scheme :: K  # stencil used for assessing divergence smoothness
+    vertical_scheme    :: V  # recontruction scheme for vertical advection
+    ke_gradient_scheme :: K  # reconstruction scheme for kinetic energy gradient
     upwinding          :: D  # treatment of upwinding for divergence flux and kinetic energy gradient
 
     VectorInvariant{N, FT, M}(vorticity_scheme::Z, vorticity_stencil::ZS, vertical_scheme::V, 

From 6580a1211d4db8b456af0a308797ba13543b7efa Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 16 Jul 2023 09:06:03 -0400
Subject: [PATCH 419/530] bugfix

---
 src/Advection/vector_invariant_advection.jl      | 2 +-
 src/Advection/vector_invariant_self_upwinding.jl | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Advection/vector_invariant_advection.jl b/src/Advection/vector_invariant_advection.jl
index cc7df2c3c0..e45c96fe16 100644
--- a/src/Advection/vector_invariant_advection.jl
+++ b/src/Advection/vector_invariant_advection.jl
@@ -112,7 +112,7 @@ Base.show(io::IO, a::VectorInvariant{N, FT}) where {N, FT} =
 # halo for vector invariant advection
 required_halo_size(scheme::VectorInvariant{N}) where N = N == 1 ? N : N + 1
 
-Adapt.adapt_structure(to, scheme::VectorInvariant{N, FT, Z, ZS, V, D, M}) where {N, FT, Z, ZS, V, D, M} =
+Adapt.adapt_structure(to, scheme::VectorInvariant{N, FT, Z, ZS, V, K, D, M}) where {N, FT, Z, ZS, V, K, D, M} =
         VectorInvariant{N, FT, M}(Adapt.adapt(to, scheme.vorticity_scheme), 
                                   Adapt.adapt(to, scheme.vorticity_stencil), 
                                   Adapt.adapt(to, scheme.vertical_scheme),
diff --git a/src/Advection/vector_invariant_self_upwinding.jl b/src/Advection/vector_invariant_self_upwinding.jl
index e3f094c8d9..527640752a 100644
--- a/src/Advection/vector_invariant_self_upwinding.jl
+++ b/src/Advection/vector_invariant_self_upwinding.jl
@@ -44,7 +44,7 @@ end
 ##### Self Upwinding of Kinetic Energy Gradient 
 #####
 
-const VectorInvariantKineticEnergyUpwinding = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme}
+const VectorInvariantKineticEnergyUpwinding = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme}
 
 @inline half_ϕ²(i, j, k, grid, ϕ) = @inbounds ϕ[i, j, k]^2 / 2
 

From 923d1b230bb16cd371658559e4fec968d412a89d Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 17 Jul 2023 09:14:57 -0400
Subject: [PATCH 420/530] bugfix prescribed velocities

---
 .../prescribed_hydrostatic_velocity_fields.jl                   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl b/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl
index 01137bceb9..345ef74b93 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl
@@ -87,7 +87,7 @@ FreeSurface(::ExplicitFreeSurface{Nothing}, ::PrescribedVelocityFields, grid) =
 FreeSurface(::ImplicitFreeSurface{Nothing}, ::PrescribedVelocityFields, grid) = nothing
 
 hydrostatic_prognostic_fields(::PrescribedVelocityFields, ::Nothing, tracers) = tracers
-calculate_hydrostatic_momentum_tendencies!(model, ::PrescribedVelocityFields, kernel_parameters) = nothing
+calculate_hydrostatic_momentum_tendencies!(model, ::PrescribedVelocityFields, kernel_parameters; kwargs...) = nothing
 
 apply_flux_bcs!(::Nothing, c, arch, clock, model_fields) = nothing
 

From 511352d69b3590d5ccbefbfb6af0166830963aa1 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 17 Jul 2023 11:35:47 -0400
Subject: [PATCH 421/530] fixes

---
 src/MultiRegion/multi_region_models.jl | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/MultiRegion/multi_region_models.jl b/src/MultiRegion/multi_region_models.jl
index e3081929d1..1100a72c19 100644
--- a/src/MultiRegion/multi_region_models.jl
+++ b/src/MultiRegion/multi_region_models.jl
@@ -59,16 +59,18 @@ implicit_diffusion_solver(time_discretization::VerticallyImplicitTimeDiscretizat
 
 WENO(mrg::MultiRegionGrid, args...; kwargs...) = construct_regionally(WENO, mrg, args...; kwargs...)
 
-@inline  getregion(t::VectorInvariant{N, FT, Z, ZS, V, D, M}, r) where {N, FT, Z, ZS, V, D, M} = 
+@inline  getregion(t::VectorInvariant{N, FT, Z, ZS, V, K, D, M}, r) where {N, FT, Z, ZS, V, K, D, M} = 
                 VectorInvariant{N, FT, M}(_getregion(t.vorticity_scheme, r), 
                                           _getregion(t.vorticity_stencil, r), 
                                           _getregion(t.vertical_scheme, r),
+                                          _getregion(t.ke_gradient_scheme, r),
                                           _getregion(t.upwinding, r))
 
-@inline _getregion(t::VectorInvariant{N, FT, Z, ZS, V, D, M}, r) where {N, FT, Z, ZS, V, D, M} = 
+@inline _getregion(t::VectorInvariant{N, FT, Z, ZS, V, K, D, M}, r) where {N, FT, Z, ZS, V, K, D, M} = 
                 VectorInvariant{N, FT, M}(getregion(t.vorticity_scheme, r), 
                                           getregion(t.vorticity_stencil, r), 
                                           getregion(t.vertical_scheme, r),
+                                          getregion(t.ke_gradient_scheme, r),
                                           getregion(t.upwinding, r))
 
 function cell_advection_timescale(grid::MultiRegionGrid, velocities)

From 30acce8afa54da42bb54e9c3ce543006d97e8f16 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 17 Jul 2023 11:41:07 -0400
Subject: [PATCH 422/530] ok on mac

---
 .../mpi_hydrostatic_turbulence.jl             | 82 ++++++++++---------
 1 file changed, 44 insertions(+), 38 deletions(-)

diff --git a/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl b/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl
index 1210b9a315..fcd9e128da 100644
--- a/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl
+++ b/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl
@@ -6,7 +6,6 @@ using Statistics
 using Oceananigans.BoundaryConditions
 using Oceananigans.Distributed    
 using Random
-using GLMakie
 
 # Run with 
 #
@@ -65,43 +64,13 @@ function run_simulation(nx, ny, arch, topo)
     MPI.Barrier(MPI.COMM_WORLD)
 end
 
-# Produce a video for variable `var`
-function visualize_simulation(var)
-    iter = Observable(1)
-
-    v = Vector(undef, 4)
-    V = Vector(undef, 4)
-    x = Vector(undef, 4)
-    y = Vector(undef, 4)
-
-    for r in 1:4
-        v[r] = FieldTimeSeries("mpi_hydrostatic_turbulence_rank$(r-1).jld2", var)
-        nx, ny, _ = size(v[r])
-        V[r] = @lift(interior(v[r][$iter], 1:nx, 1:ny, 1))
-
-        x[r] = xnodes(v[r])
-        y[r] = ynodes(v[r])
-    end
-
-    fig = Figure()
-    ax = Axis(fig[1, 1])
-    for r in 1:4
-        heatmap!(ax, x[r], y[r], V[r], colorrange = (-1.0, 1.0))
-    end
-
-    GLMakie.record(fig, "hydrostatic_test_" * var * ".mp4", 1:length(v[1].times), framerate = 11) do i
-        @info "step $i"; 
-        iter[] = i; 
-    end
-end
-
 MPI.Init()
 
 topo = (Periodic, Periodic, Bounded)
 
 Nranks = MPI.Comm_size(MPI.COMM_WORLD)
-Rx = 2
-Ry = 2
+Rx = 4
+Ry = 1
 
 @assert Nranks == 4
 
@@ -118,12 +87,49 @@ ny = [56, 128-56][arch.local_index[2]]
 run_simulation(nx, ny, arch, topo)
 
 # Visualize the plane
-if MPI.Comm_rank(MPI.COMM_WORLD) == 0
-    visualize_simulation("u")
-    visualize_simulation("v")
-    visualize_simulation("ζ")
-    visualize_simulation("c")
+# Produce a video for variable `var`
+try 
+    using GLMakie
+
+    function visualize_simulation(var)
+        iter = Observable(1)
+
+        v = Vector(undef, 4)
+        V = Vector(undef, 4)
+        x = Vector(undef, 4)
+        y = Vector(undef, 4)
+
+        for r in 1:4
+            v[r] = FieldTimeSeries("mpi_hydrostatic_turbulence_rank$(r-1).jld2", var)
+            nx, ny, _ = size(v[r])
+            V[r] = @lift(interior(v[r][$iter], 1:nx, 1:ny, 1))
+
+            x[r] = xnodes(v[r])
+            y[r] = ynodes(v[r])
+        end
+
+        fig = Figure()
+        ax = Axis(fig[1, 1])
+        for r in 1:4
+            heatmap!(ax, x[r], y[r], V[r], colorrange = (-1.0, 1.0))
+        end
+
+        GLMakie.record(fig, "hydrostatic_test_" * var * ".mp4", 1:length(v[1].times), framerate = 11) do i
+            @info "step $i"; 
+            iter[] = i; 
+        end
+    end
+
+    if MPI.Comm_rank(MPI.COMM_WORLD) == 0
+        visualize_simulation("u")
+        visualize_simulation("v")
+        visualize_simulation("ζ")
+        visualize_simulation("c")
+    end
+catch err
+    @info err
 end
 
+
 MPI.Barrier(MPI.COMM_WORLD)
 MPI.Finalize()

From 0e211d7c7b06ac517fdd38b40e73f1d7e76456a6 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 17 Jul 2023 12:06:15 -0400
Subject: [PATCH 423/530] bugfix

---
 .../split_explicit_free_surface_kernels.jl                    | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
index 5cae4d8912..074bbf62f9 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
@@ -314,11 +314,11 @@ const FNS = FixedSubstepNumber
 const FTS = FixedTimeStepSize
 
 @inline calculate_substeps(substepping::FNS, Δt) = length(substepping.averaging_weights)
-@inline calculate_substeps(substepping::FTS, Δt) = ceil(Int, 2 * Δt / settings.Δtᴮ)
+@inline calculate_substeps(substepping::FTS, Δt) = ceil(Int, 2 * Δt / substepping.Δtᴮ)
 
 @inline calculate_adaptive_settings(substepping::FNS, substeps) = substepping.fractional_step_size, substepping.averaging_weights
 @inline calculate_adaptive_settings(substepping::FTS, substeps) = weights_from_substeps(eltype(substepping.Δt_barotopic), 
-                                                                                     substeps, substepping.averaging_kernel)
+                                                                                        substeps, substepping.averaging_kernel)
 
 function iterate_split_explicit!(free_surface, grid, Δt)
     arch = architecture(grid)

From 242d590f31a59bcb24b0ba0420fb690da0d0b459 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 17 Jul 2023 12:08:45 -0400
Subject: [PATCH 424/530] bug fixed

---
 .../split_explicit_free_surface_kernels.jl                      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
index 074bbf62f9..485813a99e 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
@@ -314,7 +314,7 @@ const FNS = FixedSubstepNumber
 const FTS = FixedTimeStepSize
 
 @inline calculate_substeps(substepping::FNS, Δt) = length(substepping.averaging_weights)
-@inline calculate_substeps(substepping::FTS, Δt) = ceil(Int, 2 * Δt / substepping.Δtᴮ)
+@inline calculate_substeps(substepping::FTS, Δt) = ceil(Int, 2 * Δt / substepping.Δt_barotropic)
 
 @inline calculate_adaptive_settings(substepping::FNS, substeps) = substepping.fractional_step_size, substepping.averaging_weights
 @inline calculate_adaptive_settings(substepping::FTS, substeps) = weights_from_substeps(eltype(substepping.Δt_barotopic), 

From 6ea2af35f427d996aa743d6a223d5f6652be1e18 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 17 Jul 2023 12:10:34 -0400
Subject: [PATCH 425/530] bugfixxed

---
 .../split_explicit_free_surface.jl                          | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl
index d586cce460..1532577323 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl
@@ -251,7 +251,7 @@ end
 """ An internal type for the `SplitExplicitFreeSurface` that allows substepping with
 a fixed `Δt_barotopic` based on a CFL condition """
 struct FixedTimeStepSize{B, F}
-    Δt_barotopic     :: B
+    Δt_barotropic    :: B
     averaging_kernel :: F
 end
 
@@ -273,9 +273,9 @@ function FixedTimeStepSize(FT::DataType = Float64;
 
     wave_speed = sqrt(gravitational_acceleration * grid.Lz)
     
-    Δt_barotopic = FT(cfl * Δs / wave_speed)
+    Δt_barotropic = FT(cfl * Δs / wave_speed)
 
-    return FixedTimeStepSize(Δt_barotopic, averaging_kernel)
+    return FixedTimeStepSize(Δt_barotropic, averaging_kernel)
 end
 
 @inline function weights_from_substeps(FT, substeps, averaging_kernel)

From 67d27ca647aa71c94824df01021416b179818f46 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 17 Jul 2023 12:24:35 -0400
Subject: [PATCH 426/530] new default

---
 .../hydrostatic_free_surface_model.jl                         | 4 ++--
 .../split_explicit_free_surface.jl                            | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl
index 7b7fc30b5b..32d32f6836 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl
@@ -62,7 +62,7 @@ end
                                   tracer_advection = CenteredSecondOrder(),
                                           buoyancy = SeawaterBuoyancy(eltype(grid)),
                                           coriolis = nothing,
-                                      free_surface = ImplicitFreeSurface(gravitational_acceleration=g_Earth),
+                                      free_surface = SplitExplicitFreeSurface(gravitational_acceleration=g_Earth),
                                forcing::NamedTuple = NamedTuple(),
                                            closure = nothing,
                    boundary_conditions::NamedTuple = NamedTuple(),
@@ -107,7 +107,7 @@ function HydrostaticFreeSurfaceModel(; grid,
                                   tracer_advection = CenteredSecondOrder(),
                                           buoyancy = SeawaterBuoyancy(eltype(grid)),
                                           coriolis = nothing,
-                                      free_surface = ImplicitFreeSurface(gravitational_acceleration=g_Earth),
+                                      free_surface = SplitExplicitFreeSurface(; grid, cfl = 0.7, gravitational_acceleration=g_Earth),
                                forcing::NamedTuple = NamedTuple(),
                                            closure = nothing,
                    boundary_conditions::NamedTuple = NamedTuple(),
diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl
index 1532577323..636b09ab5d 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl
@@ -294,7 +294,7 @@ end
 end
 
 function SplitExplicitSettings(FT::DataType=Float64;
-                               substeps = 200, 
+                               substeps = nothing, 
                                cfl      = nothing,
                                grid     = nothing,
                                fixed_Δt = nothing,

From 19618b1394554201fea4a25c604ad8e572979d40 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 17 Jul 2023 12:26:43 -0400
Subject: [PATCH 427/530] bugfix

---
 .../split_explicit_free_surface_kernels.jl                      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
index 485813a99e..88df7286e5 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
@@ -317,7 +317,7 @@ const FTS = FixedTimeStepSize
 @inline calculate_substeps(substepping::FTS, Δt) = ceil(Int, 2 * Δt / substepping.Δt_barotropic)
 
 @inline calculate_adaptive_settings(substepping::FNS, substeps) = substepping.fractional_step_size, substepping.averaging_weights
-@inline calculate_adaptive_settings(substepping::FTS, substeps) = weights_from_substeps(eltype(substepping.Δt_barotopic), 
+@inline calculate_adaptive_settings(substepping::FTS, substeps) = weights_from_substeps(eltype(substepping.Δt_barotropic), 
                                                                                         substeps, substepping.averaging_kernel)
 
 function iterate_split_explicit!(free_surface, grid, Δt)

From 3bb58440cafe20f1061b29d4fd6d7a5741d3163f Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 17 Jul 2023 12:51:02 -0400
Subject: [PATCH 428/530] remove <<<<HEAD

---
 Manifest.toml | 23 -----------------------
 1 file changed, 23 deletions(-)

diff --git a/Manifest.toml b/Manifest.toml
index e0ca5fe448..0964732789 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -5,13 +5,8 @@ manifest_format = "2.0"
 project_hash = "9044a5920a57a78e707ac7490810d9c46b9bc546"
 
 [[deps.AbstractFFTs]]
-<<<<<<< HEAD
 deps = ["ChainRulesCore", "LinearAlgebra"]
 git-tree-sha1 = "cad4c758c0038eea30394b1b671526921ca85b21"
-=======
-deps = ["LinearAlgebra"]
-git-tree-sha1 = "8bc0aaec0ca548eb6cf5f0d7d16351650c1ee956"
->>>>>>> origin/ss/fix_split_explicit
 uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
 version = "1.4.0"
 
@@ -116,7 +111,6 @@ git-tree-sha1 = "5248d9c45712e51e27ba9b30eebec65658c6ce29"
 uuid = "76a88914-d11a-5bdc-97e0-2f5a05c973a2"
 version = "0.6.0+0"
 
-<<<<<<< HEAD
 [[deps.ChainRulesCore]]
 deps = ["Compat", "LinearAlgebra", "SparseArrays"]
 git-tree-sha1 = "e30f2f4e20f7f186dc36529910beaedc60cfa644"
@@ -129,8 +123,6 @@ git-tree-sha1 = "2fba81a302a7be671aefe194f0525ef231104e7f"
 uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
 version = "0.1.8"
 
-=======
->>>>>>> origin/ss/fix_split_explicit
 [[deps.CommonDataModel]]
 deps = ["CFTime", "DataStructures", "Dates", "Preferences", "Printf"]
 git-tree-sha1 = "2678b3fc170d582655a14d22867b031b6e43c2d4"
@@ -243,15 +235,9 @@ version = "0.1.5"
 
 [[deps.GPUCompiler]]
 deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"]
-<<<<<<< HEAD
 git-tree-sha1 = "72b2e3c2ba583d1a7aa35129e56cf92e07c083e3"
 uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
 version = "0.21.4"
-=======
-git-tree-sha1 = "d60b5fe7333b5fa41a0378ead6614f1ab51cf6d0"
-uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
-version = "0.21.3"
->>>>>>> origin/ss/fix_split_explicit
 
 [[deps.Glob]]
 git-tree-sha1 = "97285bbd5230dd766e9ef6749b80fc617126d496"
@@ -285,15 +271,12 @@ version = "2023.1.0+0"
 deps = ["Markdown"]
 uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 
-<<<<<<< HEAD
 [[deps.InverseFunctions]]
 deps = ["Test"]
 git-tree-sha1 = "eabe3125edba5c9c10b60a160b1779a000dc8b29"
 uuid = "3587e190-3f89-42d0-90ee-14403ec27112"
 version = "0.1.11"
 
-=======
->>>>>>> origin/ss/fix_split_explicit
 [[deps.IrrationalConstants]]
 git-tree-sha1 = "630b497eafcc20001bba38a4651b327dcfc491d2"
 uuid = "92d709cd-6900-40b7-9082-c6be49f344b6"
@@ -338,15 +321,9 @@ version = "0.9.7"
 
 [[deps.LLVM]]
 deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
-<<<<<<< HEAD
 git-tree-sha1 = "8695a49bfe05a2dc0feeefd06b4ca6361a018729"
 uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
 version = "6.1.0"
-=======
-git-tree-sha1 = "7d5788011dd273788146d40eb5b1fbdc199d0296"
-uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
-version = "6.0.1"
->>>>>>> origin/ss/fix_split_explicit
 
 [[deps.LLVMExtra_jll]]
 deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]

From 972730a1f166763cdbae1cecd69af37a0efed2ae Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 17 Jul 2023 13:06:56 -0400
Subject: [PATCH 429/530] bugfix PrescribedVelocityFields

---
 .../prescribed_hydrostatic_velocity_fields.jl                    | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl b/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl
index 345ef74b93..f75a590d77 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl
@@ -85,6 +85,7 @@ FreeSurfaceDisplacementField(::PrescribedVelocityFields, ::Nothing, grid) = noth
 HorizontalVelocityFields(::PrescribedVelocityFields, grid) = nothing, nothing
 FreeSurface(::ExplicitFreeSurface{Nothing}, ::PrescribedVelocityFields, grid) = nothing
 FreeSurface(::ImplicitFreeSurface{Nothing}, ::PrescribedVelocityFields, grid) = nothing
+FreeSurface(::SplitExplicitFreeSurface,     ::PrescribedVelocityFields, grid) = nothing
 
 hydrostatic_prognostic_fields(::PrescribedVelocityFields, ::Nothing, tracers) = tracers
 calculate_hydrostatic_momentum_tendencies!(model, ::PrescribedVelocityFields, kernel_parameters; kwargs...) = nothing

From cc5af476bcbd75accc809cc16ac60b392b9ae01a Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 17 Jul 2023 14:08:17 -0400
Subject: [PATCH 430/530] default in another PR

---
 .../barotropic_pressure_correction.jl                         | 4 ++--
 .../hydrostatic_free_surface_model.jl                         | 2 +-
 .../prescribed_hydrostatic_velocity_fields.jl                 | 4 ++++
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/barotropic_pressure_correction.jl b/src/Models/HydrostaticFreeSurfaceModels/barotropic_pressure_correction.jl
index c0f0711f13..382236feec 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/barotropic_pressure_correction.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/barotropic_pressure_correction.jl
@@ -30,8 +30,8 @@ function pressure_correct_velocities!(model::ImplicitFreeSurfaceHFSM, Δt)
     return nothing
 end
 
-calculate_free_surface_tendency!(grid, model::ImplicitFreeSurfaceHFSM     , args...) = nothing
-calculate_free_surface_tendency!(grid, model::SplitExplicitFreeSurfaceHFSM, args...) = nothing
+calculate_free_surface_tendency!(grid, ::ImplicitFreeSurfaceHFSM     , args...) = nothing
+calculate_free_surface_tendency!(grid, ::SplitExplicitFreeSurfaceHFSM, args...) = nothing
 
 function pressure_correct_velocities!(model::SplitExplicitFreeSurfaceHFSM, Δt)
     u, v, _ = model.velocities
diff --git a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl
index fcb7cb4841..7d5bfb2da7 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl
@@ -100,7 +100,7 @@ function HydrostaticFreeSurfaceModel(; grid,
                                   tracer_advection = CenteredSecondOrder(),
                                           buoyancy = SeawaterBuoyancy(eltype(grid)),
                                           coriolis = nothing,
-                                      free_surface = SplitExplicitFreeSurface(; grid, cfl = 0.7, gravitational_acceleration=g_Earth),
+                                      free_surface = ImplicitFreeSurface(gravitational_acceleration=g_Earth),
                                forcing::NamedTuple = NamedTuple(),
                                            closure = nothing,
                    boundary_conditions::NamedTuple = NamedTuple(),
diff --git a/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl b/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl
index f75a590d77..d8bb3649ca 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl
@@ -78,11 +78,15 @@ ab2_step_velocities!(::PrescribedVelocityFields, args...) = nothing
 ab2_step_free_surface!(::Nothing, model, Δt, χ) = nothing 
 compute_w_from_continuity!(::PrescribedVelocityFields, args...; kwargs...) = nothing
 
+calculate_free_surface_tendency!(::SingleColumnGrid, ::ImplicitFreeSurfaceHFSM     , args...) = nothing
+calculate_free_surface_tendency!(::SingleColumnGrid, ::SplitExplicitFreeSurfaceHFSM, args...) = nothing
+
 validate_velocity_boundary_conditions(grid, ::PrescribedVelocityFields) = nothing
 extract_boundary_conditions(::PrescribedVelocityFields) = NamedTuple()
 
 FreeSurfaceDisplacementField(::PrescribedVelocityFields, ::Nothing, grid) = nothing
 HorizontalVelocityFields(::PrescribedVelocityFields, grid) = nothing, nothing
+
 FreeSurface(::ExplicitFreeSurface{Nothing}, ::PrescribedVelocityFields, grid) = nothing
 FreeSurface(::ImplicitFreeSurface{Nothing}, ::PrescribedVelocityFields, grid) = nothing
 FreeSurface(::SplitExplicitFreeSurface,     ::PrescribedVelocityFields, grid) = nothing

From 3644e308045c3c6681a06e944dec029d942dbbaf Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 17 Jul 2023 14:13:04 -0400
Subject: [PATCH 431/530] bugfix

---
 .../prescribed_hydrostatic_velocity_fields.jl                | 3 ---
 .../HydrostaticFreeSurfaceModels/single_column_model_mode.jl | 5 +++++
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl b/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl
index d8bb3649ca..009866740c 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl
@@ -78,9 +78,6 @@ ab2_step_velocities!(::PrescribedVelocityFields, args...) = nothing
 ab2_step_free_surface!(::Nothing, model, Δt, χ) = nothing 
 compute_w_from_continuity!(::PrescribedVelocityFields, args...; kwargs...) = nothing
 
-calculate_free_surface_tendency!(::SingleColumnGrid, ::ImplicitFreeSurfaceHFSM     , args...) = nothing
-calculate_free_surface_tendency!(::SingleColumnGrid, ::SplitExplicitFreeSurfaceHFSM, args...) = nothing
-
 validate_velocity_boundary_conditions(grid, ::PrescribedVelocityFields) = nothing
 extract_boundary_conditions(::PrescribedVelocityFields) = NamedTuple()
 
diff --git a/src/Models/HydrostaticFreeSurfaceModels/single_column_model_mode.jl b/src/Models/HydrostaticFreeSurfaceModels/single_column_model_mode.jl
index bdfb051b02..eb9d597e11 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/single_column_model_mode.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/single_column_model_mode.jl
@@ -51,6 +51,11 @@ compute_w_from_continuity!(::PrescribedVelocityFields, arch, ::SingleColumnGrid;
 
 calculate_free_surface_tendency!(::SingleColumnGrid, args...) = nothing
 
+# Disambiguation
+calculate_free_surface_tendency!(::SingleColumnGrid, ::ImplicitFreeSurfaceHFSM     , args...) = nothing
+calculate_free_surface_tendency!(::SingleColumnGrid, ::SplitExplicitFreeSurfaceHFSM, args...) = nothing
+
+
 # Fast state update and halo filling
 
 function update_state!(model::HydrostaticFreeSurfaceModel, grid::SingleColumnGrid, callbacks)

From 2f60434e3e0f89f3f4bc3a43dd5b72178f382699 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 17 Jul 2023 14:17:29 -0400
Subject: [PATCH 432/530] flat grids only in Grids

---
 src/Advection/flat_advective_fluxes.jl                        | 2 +-
 .../HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl | 2 +-
 .../calculate_nonhydrostatic_boundary_tendencies.jl           | 3 +--
 .../NonhydrostaticModels/update_hydrostatic_pressure.jl       | 2 +-
 src/Operators/interpolation_operators.jl                      | 4 +---
 5 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/src/Advection/flat_advective_fluxes.jl b/src/Advection/flat_advective_fluxes.jl
index b910e610b3..b826265204 100644
--- a/src/Advection/flat_advective_fluxes.jl
+++ b/src/Advection/flat_advective_fluxes.jl
@@ -3,7 +3,7 @@
 ##### Flat Topologies
 #####
 
-using Oceananigans.Operators: XFlatGrid, YFlatGrid, ZFlatGrid
+using Oceananigans.Grids: XFlatGrid, YFlatGrid, ZFlatGrid
 
 for SchemeType in [:CenteredScheme, :UpwindScheme]
     @eval begin
diff --git a/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl b/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
index dd1e3142a4..a2527bea43 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
@@ -1,7 +1,7 @@
 using Oceananigans.Architectures: device
 using Oceananigans.Grids: halo_size, topology
+using Oceananigans.Grids: XFlatGrid, YFlatGrid
 using Oceananigans.Operators: div_xyᶜᶜᶜ, Δzᶜᶜᶜ
-using Oceananigans.Operators: XFlatGrid, YFlatGrid
 
 """
     compute_w_from_continuity!(model)
diff --git a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_boundary_tendencies.jl b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_boundary_tendencies.jl
index fa031cb43a..b1375cf3aa 100644
--- a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_boundary_tendencies.jl
+++ b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_boundary_tendencies.jl
@@ -1,6 +1,7 @@
 import Oceananigans.Distributed: compute_boundary_tendencies!
 using Oceananigans.Utils: worktuple, offsets
 using Oceananigans.TurbulenceClosures: required_halo_size
+using Oceananigans.Grids: XFlatGrid, YFlatGrid
 
 # We assume here that top/bottom BC are always synched (no partitioning in z)
 function compute_boundary_tendencies!(model::NonhydrostaticModel)
@@ -84,8 +85,6 @@ function boundary_κ_kernel_parameters(grid, closure, arch)
     return boundary_parameters(sizes, offs, grid, arch)
 end
 
-using Oceananigans.Operators: XFlatGrid, YFlatGrid
-
 # Recompute only on communicating sides 
 function boundary_parameters(S, O, grid, arch) 
     Rx, Ry, _ = arch.ranks
diff --git a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
index c6f33cf2a6..f7568a6c02 100644
--- a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
+++ b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
@@ -1,7 +1,7 @@
 using Oceananigans.Operators: Δzᶜᶜᶜ, Δzᶜᶜᶠ
 using Oceananigans.ImmersedBoundaries: PartialCellBottom, ImmersedBoundaryGrid
 using Oceananigans.Grids: topology
-using Oceananigans.Operators: XFlatGrid, YFlatGrid
+using Oceananigans.Grids: XFlatGrid, YFlatGrid
 
 """
 Update the hydrostatic pressure perturbation pHY′. This is done by integrating
diff --git a/src/Operators/interpolation_operators.jl b/src/Operators/interpolation_operators.jl
index 4ac7534b9e..14817d62ed 100644
--- a/src/Operators/interpolation_operators.jl
+++ b/src/Operators/interpolation_operators.jl
@@ -94,9 +94,7 @@ using Oceananigans.Grids: Flat
 ##### Support for Flat Earths
 #####
 
-const XFlatGrid = AG{<:Any, Flat}
-const YFlatGrid = AG{<:Any, <:Any, Flat}
-const ZFlatGrid = AG{<:Any, <:Any, <:Any, Flat}
+using Oceananigans.Grids: XFlatGrid, YFlatGrid, ZFlatGrid
 
 @inline ℑxᶜᵃᵃ(i, j, k, grid::XFlatGrid, u) = @inbounds u[i, j, k]
 @inline ℑxᶠᵃᵃ(i, j, k, grid::XFlatGrid, c) = @inbounds c[i, j, k]

From a50ebb82f9b28ab130dc0b072fc7028bb3c3d2e7 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 17 Jul 2023 14:53:52 -0400
Subject: [PATCH 433/530] last bugfix

---
 test/test_biogeochemistry.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_biogeochemistry.jl b/test/test_biogeochemistry.jl
index fb625e049f..c01eddd07b 100644
--- a/test/test_biogeochemistry.jl
+++ b/test/test_biogeochemistry.jl
@@ -106,7 +106,7 @@ function test_biogeochemistry(grid, MinimalBiogeochemistryType, ModelType)
                                                  Iᴾᴬᴿ, 
                                                  drift_velocities)
 
-    model = ModelType(; grid, biogeochemistry)
+    model = ModelType(; grid, biogeochemistry, momentum_advection = VectorInvariant())
     set!(model, P = 1)
 
     @test :P in keys(model.tracers)

From ebdbc22d56376f3ba8753223c0a45c3d93bbb5e0 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 17 Jul 2023 15:51:34 -0400
Subject: [PATCH 434/530] bugfix

---
 test/test_biogeochemistry.jl | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/test/test_biogeochemistry.jl b/test/test_biogeochemistry.jl
index c01eddd07b..25e1def1a5 100644
--- a/test/test_biogeochemistry.jl
+++ b/test/test_biogeochemistry.jl
@@ -106,7 +106,11 @@ function test_biogeochemistry(grid, MinimalBiogeochemistryType, ModelType)
                                                  Iᴾᴬᴿ, 
                                                  drift_velocities)
 
-    model = ModelType(; grid, biogeochemistry, momentum_advection = VectorInvariant())
+    if ModelType == HydrostaticFreeSurfaceModel && grid isa OrthogonalSphericalShellGrid
+        model = ModelType(; grid, biogeochemistry, momentum_advection = VectorInvariant())
+    else
+        model = ModelType(; grid, biogeochemistry)
+    end
     set!(model, P = 1)
 
     @test :P in keys(model.tracers)

From 18eae2d1d51802d5ac30c7a4165f6785e1a0be1a Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 20 Jul 2023 14:17:40 -0400
Subject: [PATCH 435/530] try partial cells

---
 .../partial_cell_immersed_boundaries.jl       |  4 +-
 .../vertically_implicit_diffusion_solver.jl   | 61 +++++++++++--------
 2 files changed, 36 insertions(+), 29 deletions(-)

diff --git a/src/ImmersedBoundaries/partial_cell_immersed_boundaries.jl b/src/ImmersedBoundaries/partial_cell_immersed_boundaries.jl
index 7d2f021ef2..d0d6c77dd7 100644
--- a/src/ImmersedBoundaries/partial_cell_immersed_boundaries.jl
+++ b/src/ImmersedBoundaries/partial_cell_immersed_boundaries.jl
@@ -55,8 +55,8 @@ const PCIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:PartialC
 on_architecture(arch, ib::PartialCellBottom) = PartialCellBottom(arch_array(arch, ib.bottom_height), ib.minimum_fractional_Δz)
 Adapt.adapt_structure(to, ib::PartialCellBottom) = PartialCellBottom(adapt(to, ib.bottom_height), ib.minimum_fractional_Δz)     
 
-bottom_cell(i, j, k, ibg::PCIBG) = !immersed_cell(i, j, k,   ibg.underlying_grid, ibg.immersed_boundary) &
-                                    immersed_cell(i, j, k-1, ibg.underlying_grid, ibg.immersed_boundary)
+@inline bottom_cell(i, j, k, ibg::PCIBG) = !immersed_cell(i, j, k,   ibg.underlying_grid, ibg.immersed_boundary) &
+                                            immersed_cell(i, j, k-1, ibg.underlying_grid, ibg.immersed_boundary)
 
 @inline function Δzᶜᶜᶜ(i, j, k, ibg::PCIBG)
     underlying_grid = ibg.underlying_grid
diff --git a/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl b/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
index 5a8bd51062..82ae542142 100644
--- a/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
+++ b/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
@@ -39,49 +39,56 @@ implicit_diffusion_solver(::ExplicitTimeDiscretization, args...; kwargs...) = no
 ##### Note: "ivd" stands for implicit vertical diffusion.
 #####
 
-@inline κ_Δz²(i, j, kᶜ, kᶠ, grid, κ) = κ / Δzᵃᵃᶜ(i, j, kᶜ, grid) / Δzᵃᵃᶠ(i, j, kᶠ, grid)
+const c = Center()
+const f = Face()
 
 # Tracers and horizontal velocities at cell centers in z
-
-@inline function ivd_upper_diagonal(i, j, k, grid, closure, K, id, LX, LY, ::Center, clock, Δt, κz)
-    closure_ij = getclosure(i, j, closure)  
-    κᵏ⁺¹ = κz(i, j, k+1, grid, closure_ij, K, id, clock)
-
-    return ifelse(k > grid.Nz-1,
-                  zero(grid),
-                  - Δt * κ_Δz²(i, j, k, k+1, grid, κᵏ⁺¹))
+@inline function ivd_upper_diagonal(i, j, k, grid, closure, K, id, ℓx, ℓy, ::Center, clock, Δt, κz)
+    closure_ij = getclosure(i, j, closure)
+    κᵏ⁺¹   = κz(i, j, k+1, grid, closure_ij, K, id, clock)
+    Δzᶜₖ   = Δz(i, j, k,   grid, ℓx, ℓy, c)
+    Δzᶠₖ₊₁ = Δz(i, j, k+1, grid, ℓx, ℓy, f)
+    du     = - Δt * κᵏ⁺¹ / (Δzᶜₖ * Δzᶠₖ₊₁)
+
+    # This conditional ensures the diagonal is correct
+    return ifelse(k > grid.Nz-1, zero(grid), du)
 end
 
-@inline function ivd_lower_diagonal(i, j, k, grid, closure, K, id, LX, LY, ::Center, clock, Δt, κz)
-    k′ = k + 1 # Shift to adjust for Tridiagonal indexing convenction
+@inline function ivd_lower_diagonal(i, j, k′, grid, closure, K, id, ℓx, ℓy, ::Center, clock, Δt, κz)
+    k = k′ + 1 # Shift index to match LinearAlgebra.Tridiagonal indexing convention
     closure_ij = getclosure(i, j, closure)  
-    κᵏ = κz(i, j, k′, grid, closure_ij, K, id, clock)
-
-    return ifelse(k < 1,
-                  zero(grid),
-                  - Δt * κ_Δz²(i, j, k′, k′, grid, κᵏ))
+    κᵏ   = κz(i, j, k, grid, closure_ij, K, id, clock)
+    Δzᶜₖ = Δz(i, j, k, grid, ℓx, ℓy, c)
+    Δzᶠₖ = Δz(i, j, k, grid, ℓx, ℓy, f)
+    dl   = - Δt * κᵏ / (Δzᶜₖ * Δzᶠₖ)
+
+    # This conditional ensures the diagonal is correct: the lower diagonal does not
+    # exist for k′ = 0. (Note we use LinearAlgebra.Tridiagonal indexing convention,
+    # so that lower_diagonal should be defined for k′ = 1 ⋯ N-1).
+    return ifelse(k′ < 1, zero(grid), dl)
 end
 
 # Vertical velocity kernel functions (at cell interfaces in z)
 #
 # Note: these coefficients are specific to vertically-bounded grids (and so is
 # the BatchedTridiagonalSolver).
-@inline function ivd_upper_diagonal(i, j, k, grid, closure, K, id, LX, LY, ::Face, clock, Δt, νzᶜᶜᶜ) 
+@inline function ivd_upper_diagonal(i, j, k, grid, closure, K, id, ℓx, ℓy, ::Face, clock, Δt, νzᶜᶜᶜ) 
     closure_ij = getclosure(i, j, closure)  
     νᵏ = νzᶜᶜᶜ(i, j, k, grid, closure_ij, K, clock)
-
-    return ifelse(k < 1, # should this be k < 2? #should this be grid.Nz - 1?
-                  zero(grid),
-                  - Δt * κ_Δz²(i, j, k, k, grid, νᵏ))
+    Δzᶜₖ = Δz(i, j, k, grid, ℓx, ℓy, c)
+    Δzᶠₖ = Δz(i, j, k, grid, ℓx, ℓy, f)
+    du   = - Δt * νᵏ / (Δzᶜₖ * Δzᶠₖ)
+    return ifelse(k < 1, zero(grid), du)
 end
 
-@inline function ivd_lower_diagonal(i, j, k, grid, closure, K, id, LX, LY, ::Face, clock, Δt, νzᶜᶜᶜ)
-    k′ = k + 1 # Shift to adjust for Tridiagonal indexing convenction
+@inline function ivd_lower_diagonal(i, j, k, grid, closure, K, id, ℓx, ℓy, ::Face, clock, Δt, νzᶜᶜᶜ)
+    k′ = k + 2 # Shift to adjust for Tridiagonal indexing convention
     closure_ij = getclosure(i, j, closure)  
-    νᵏ⁻¹ = νzᶜᶜᶜ(i, j, k′-1, grid, closure_ij, K, clock)
-    return ifelse(k < 1,
-                  zero(grid),
-                  - Δt * κ_Δz²(i, j, k′, k′-1, grid, νᵏ⁻¹))
+    νᵏ⁻¹   = νzᶜᶜᶜ(i, j, k′-1, grid, closure_ij, K, clock)
+    Δzᶜₖ   = Δz(i, j, k′,   grid, ℓx, ℓy, c)
+    Δzᶠₖ₋₁ = Δz(i, j, k′-1, grid, ℓx, ℓy, f)
+    dl     = Δt * νᵏ⁻¹ / (Δzᶜₖ * Δzᶠₖ₋₁)
+    return ifelse(k < 1, zero(grid), dl)
 end
 
 ### Diagonal terms

From 3b8f2d74446bcf210e8d65423d23e601d7e5d657 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 20 Jul 2023 17:45:34 -0400
Subject: [PATCH 436/530] bugfix

---
 src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl b/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
index 82ae542142..530c5f967e 100644
--- a/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
+++ b/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
@@ -1,4 +1,4 @@
-using Oceananigans.Operators: Δzᵃᵃᶜ, Δzᵃᵃᶠ
+using Oceananigans.Operators: Δzᵃᵃᶜ, Δzᵃᵃᶠ, Δz
 using Oceananigans.AbstractOperations: flip
 using Oceananigans.Solvers: BatchedTridiagonalSolver, solve!
 

From 7d97dec65e889a84a44c66ce9efcdf6201a1778f Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Thu, 20 Jul 2023 20:55:37 -0400
Subject: [PATCH 437/530] bugfix

---
 src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
index f7568a6c02..4f1ec1f602 100644
--- a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
+++ b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
@@ -27,7 +27,7 @@ update_hydrostatic_pressure!(grid, model) = update_hydrostatic_pressure!(model.p
 const PCB = PartialCellBottom
 const PCBIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:PCB}
 
-update_hydrostatic_pressure!(pHY′, arch, ibg::PCBIBG, buoyancy, tracers; parameters = p_kernel_parameters(grid)) =
+update_hydrostatic_pressure!(pHY′, arch, ibg::PCBIBG, buoyancy, tracers; parameters = p_kernel_parameters(ibg.underlying_grid)) =
     update_hydrostatic_pressure!(pHY′, arch, ibg.underlying_grid, buoyancy, tracers; parameters)
 
 update_hydrostatic_pressure!(pHY′, arch, grid, buoyancy, tracers; parameters = p_kernel_parameters(grid)) =  

From dad1301787312c6bfcbed72711f2a07338c2c123 Mon Sep 17 00:00:00 2001
From: "Gregory L. Wagner" <wagner.greg@gmail.com>
Date: Sat, 22 Jul 2023 11:22:08 -0600
Subject: [PATCH 438/530] Update test_turbulence_closures.jl

---
 test/test_turbulence_closures.jl | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/test/test_turbulence_closures.jl b/test/test_turbulence_closures.jl
index b6cc0b4049..387cc944ea 100644
--- a/test/test_turbulence_closures.jl
+++ b/test/test_turbulence_closures.jl
@@ -257,26 +257,24 @@ end
     @testset "Time-stepping with CATKE closure" begin
         @info "  Testing time-stepping with CATKE closure and closure tuples with CATKE..."
         for arch in archs
-            warning = false
-
             @info "    Testing time-stepping CATKE by itself..."
-            closure = CATKEVerticalDiffusivity(; warning)
+            closure = CATKEVerticalDiffusivity()
             run_time_step_with_catke_tests(arch, closure)
 
             @info "    Testing time-stepping CATKE in a 2-tuple with HorizontalScalarDiffusivity..."
-            closure = (CATKEVerticalDiffusivity(; warning), HorizontalScalarDiffusivity())
+            closure = (CATKEVerticalDiffusivity(), HorizontalScalarDiffusivity())
             model = run_time_step_with_catke_tests(arch, closure)
             @test first(model.closure) === closure[1]
 
             # Test that closure tuples with CATKE are correctly reordered
             @info "    Testing time-stepping CATKE in a 2-tuple with HorizontalScalarDiffusivity..."
-            closure = (HorizontalScalarDiffusivity(), CATKEVerticalDiffusivity(; warning))
+            closure = (HorizontalScalarDiffusivity(), CATKEVerticalDiffusivity())
             model = run_time_step_with_catke_tests(arch, closure)
             @test first(model.closure) === closure[2]
 
             # These are slow to compile...
             @info "    Testing time-stepping CATKE in a 3-tuple..."
-            closure = (HorizontalScalarDiffusivity(), CATKEVerticalDiffusivity(; warning), VerticalScalarDiffusivity())
+            closure = (HorizontalScalarDiffusivity(), CATKEVerticalDiffusivity(), VerticalScalarDiffusivity())
             model = run_time_step_with_catke_tests(arch, closure)
             @test first(model.closure) === closure[2]
         end

From c57d2c7f380033d3ae5a891affe3d1e2e390486b Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 25 Jul 2023 14:43:48 -0400
Subject: [PATCH 439/530] small fixes

---
 src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl b/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
index 530c5f967e..0d6d7c23db 100644
--- a/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
+++ b/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
@@ -87,7 +87,7 @@ end
     νᵏ⁻¹   = νzᶜᶜᶜ(i, j, k′-1, grid, closure_ij, K, clock)
     Δzᶜₖ   = Δz(i, j, k′,   grid, ℓx, ℓy, c)
     Δzᶠₖ₋₁ = Δz(i, j, k′-1, grid, ℓx, ℓy, f)
-    dl     = Δt * νᵏ⁻¹ / (Δzᶜₖ * Δzᶠₖ₋₁)
+    dl     = - Δt * νᵏ⁻¹ / (Δzᶜₖ * Δzᶠₖ₋₁)
     return ifelse(k < 1, zero(grid), dl)
 end
 

From 14a32a1a20f6a8955a538d78d733183bddda9c95 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 25 Jul 2023 19:17:30 -0400
Subject: [PATCH 440/530] rework IBG and MRG

---
 src/ImmersedBoundaries/grid_fitted_bottom.jl  | 17 ++--
 src/MultiRegion/MultiRegion.jl                |  3 +-
 .../multi_region_abstract_operations.jl       | 18 ++--
 .../multi_region_boundary_conditions.jl       |  4 +-
 src/MultiRegion/multi_region_field.jl         | 16 ++--
 src/MultiRegion/multi_region_grid.jl          | 88 ++++++++++---------
 src/MultiRegion/multi_region_models.jl        |  8 +-
 .../multi_region_output_writers.jl            |  2 +-
 ...ulti_region_split_explicit_free_surface.jl |  4 +-
 .../unified_implicit_free_surface_solver.jl   |  8 +-
 test/test_multi_region_unit.jl                |  7 +-
 11 files changed, 89 insertions(+), 86 deletions(-)

diff --git a/src/ImmersedBoundaries/grid_fitted_bottom.jl b/src/ImmersedBoundaries/grid_fitted_bottom.jl
index 56fa63e2f9..8bd1d7de48 100644
--- a/src/ImmersedBoundaries/grid_fitted_bottom.jl
+++ b/src/ImmersedBoundaries/grid_fitted_bottom.jl
@@ -39,9 +39,9 @@ const GFBIBG = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:Any, <:GridFit
 GridFittedBottom(bottom_height) = GridFittedBottom(bottom_height, CenterImmersedCondition())
 
 function Base.summary(ib::GridFittedBottom)
-    hmax = maximum(parent(ib.bottom_height))
-    hmin = minimum(parent(ib.bottom_height))
-    hmean = mean(parent(ib.bottom_height))
+    hmax  = maximum(ib.bottom_height)
+    hmin  = minimum(ib.bottom_height)
+    hmean = mean(ib.bottom_height)
 
     summary1 = "GridFittedBottom("
 
@@ -78,12 +78,6 @@ function ImmersedBoundaryGrid(grid, ib::GridFittedBottom)
     return ImmersedBoundaryGrid{TX, TY, TZ}(grid, new_ib)
 end
 
-function ImmersedBoundaryGrid(grid, ib::AbstractGridFittedBottom{<:OffsetArray})
-    TX, TY, TZ = topology(grid)
-    validate_ib_size(grid, ib)
-    return ImmersedBoundaryGrid{TX, TY, TZ}(grid, ib)
-end
-
 @inline function _immersed_cell(i, j, k, underlying_grid, ib::GridFittedBottom{<:Any, <:InterfaceImmersedCondition})
     z = znode(i, j, k+1, underlying_grid, c, c, f)
     h = @inbounds ib.bottom_height[i, j, 1]
@@ -103,10 +97,11 @@ function on_architecture(arch, ib::GridFittedBottom{<:Field})
     architecture(ib.bottom_height) == arch && return ib
     arch_grid = on_architecture(arch, ib.bottom_height.grid)
     new_bottom_height = Field{Center, Center, Nothing}(arch_grid)
-    copyto!(parent(new_bottom_height), parent(ib.bottom_height))
+    set!(new_bottom_height, ib.bottom_height)
+    fill_halo_regions!(new_bottom_height)
     return GridFittedBottom(new_bottom_height, ib.immersed_condition)
 end
 
 Adapt.adapt_structure(to, ib::GridFittedBottom) = GridFittedBottom(adapt(to, ib.bottom_height.data),
-                                                                   ib.immersed_condition)
+                                                                             ib.immersed_condition)
 
diff --git a/src/MultiRegion/MultiRegion.jl b/src/MultiRegion/MultiRegion.jl
index 5ad5685f36..57309aae35 100644
--- a/src/MultiRegion/MultiRegion.jl
+++ b/src/MultiRegion/MultiRegion.jl
@@ -16,6 +16,7 @@ using OffsetArrays
 
 using Oceananigans.ImmersedBoundaries: ImmersedBoundaryGrid
 using Oceananigans.Utils: Reference, Iterate, getnamewrapper
+using Oceananigans.Grids: AbstractUnderlyingGrid
 
 using KernelAbstractions: @kernel, @index
 
@@ -30,7 +31,7 @@ import Oceananigans.Utils:
                 _getregion,
                 sync_all_devices!
 
-abstract type AbstractMultiRegionGrid{FT, TX, TY, TZ, Arch} <: AbstractGrid{FT, TX, TY, TZ, Arch} end
+abstract type AbstractMultiRegionGrid{FT, TX, TY, TZ, Arch} <: AbstractUnderlyingGrid{FT, TX, TY, TZ, Arch} end
 
 abstract type AbstractPartition end
 
diff --git a/src/MultiRegion/multi_region_abstract_operations.jl b/src/MultiRegion/multi_region_abstract_operations.jl
index 92894251ea..4c09090c80 100644
--- a/src/MultiRegion/multi_region_abstract_operations.jl
+++ b/src/MultiRegion/multi_region_abstract_operations.jl
@@ -1,17 +1,19 @@
-using Oceananigans.AbstractOperations: UnaryOperation, BinaryOperation, MultiaryOperation, Derivative, KernelFunctionOperation
+using Oceananigans.AbstractOperations: UnaryOperation, BinaryOperation, MultiaryOperation, Derivative, KernelFunctionOperation, ConditionalOperation
 
 # Field and FunctionField (both fields with "grids attached")
-const MultiRegionUnaryOperation{LX, LY, LZ, O, A, I, G, T} = UnaryOperation{LX, LY, LZ, O, A, I, <:MultiRegionGrid, T} where {LX, LY, LZ, O, A, I, T}
-const MultiRegionBinaryOperation{LX, LY, LZ, O, A, B, IA, IB, G, T} = BinaryOperation{LX, LY, LZ, O, A, B, IA, IB, <:MultiRegionGrid, T} where {LX, LY, LZ, O, A, B, IA, IB, T}
-const MultiRegionMultiaryOperation{LX, LY, LZ, N, O, A, I, G, T} = MultiaryOperation{LX, LY, LZ, N, O, A, I, <:MultiRegionGrid, T} where {LX, LY, LZ, N, O, A, I, T}
-const MultiRegionDerivative{LX, LY, LZ, D, A, IN, AD, G, T} = Derivative{LX, LY, LZ, D, A, IN, AD, <:MultiRegionGrid, T} where {LX, LY, LZ, D, A, IN, AD, T}
-const MultiRegionKernelFunctionOperation{LX, LY, LZ, G, T, K, D} = KernelFunctionOperation{LX, LY, LZ, <:MultiRegionGrid, T, K, D} where {LX, LY, LZ, P, T, K, D}
+const MultiRegionUnaryOperation{LX, LY, LZ, O, A, I}          = UnaryOperation{LX, LY, LZ, O, A, I, <:MultiRegionGrids} where {LX, LY, LZ, O, A, I}
+const MultiRegionBinaryOperation{LX, LY, LZ, O, A, B, IA, IB} = BinaryOperation{LX, LY, LZ, O, A, B, IA, IB, <:MultiRegionGrids} where {LX, LY, LZ, O, A, B, IA, IB}
+const MultiRegionMultiaryOperation{LX, LY, LZ, N, O, A, I}    = MultiaryOperation{LX, LY, LZ, N, O, A, I, <:MultiRegionGrids} where {LX, LY, LZ, N, O, A, I}
+const MultiRegionDerivative{LX, LY, LZ, D, A, IN, AD}         = Derivative{LX, LY, LZ, D, A, IN, AD, <:MultiRegionGrids} where {LX, LY, LZ, D, A, IN, AD}
+const MultiRegionKernelFunctionOperation{LX, LY, LZ}          = KernelFunctionOperation{LX, LY, LZ, <:MultiRegionGrids} where {LX, LY, LZ, P}
+const MultiRegionConditionalOperation{LX, LY, LZ, O, F}       = ConditionalOperation{LX, LY, LZ, O, F, <:MultiRegionGrids} where {LX, LY, LZ, O, F}
 
 const MultiRegionAbstractOperation = Union{MultiRegionBinaryOperation, 
                                            MultiRegionUnaryOperation,
                                            MultiRegionMultiaryOperation,
                                            MultiRegionDerivative,
-                                           MultiRegionKernelFunctionOperation}
+                                           MultiRegionKernelFunctionOperation,
+                                           MultiRegionConditionalOperation}
 # Utils
 Base.size(f::MultiRegionAbstractOperation) = size(getregion(f.grid, 1))
 
@@ -26,7 +28,7 @@ sync_all_devices!(f::MultiRegionAbstractOperation)  = sync_all_devices!(devices(
 compute_at!(f::MultiRegionAbstractOperation, time) = apply_regionally!(compute_at!, f, time)
 compute!(f::MultiRegionAbstractOperation)          = apply_regionally!(compute!, f, time)
 
-for T in [:BinaryOperation, :UnaryOperation, :MultiaryOperation, :Derivative, :KernelFunctionOperation]
+for T in [:BinaryOperation, :UnaryOperation, :MultiaryOperation, :Derivative, :KernelFunctionOperation, :ConditionalOperation]
     @eval begin
         @inline getregion(f::$T{LX, LY, LZ}, r) where {LX, LY, LZ} =
                           $T{LX, LY, LZ}(Tuple(_getregion(getproperty(f, n), r) for n in fieldnames($T))...)
diff --git a/src/MultiRegion/multi_region_boundary_conditions.jl b/src/MultiRegion/multi_region_boundary_conditions.jl
index 777ec0268c..55f9f60ff4 100644
--- a/src/MultiRegion/multi_region_boundary_conditions.jl
+++ b/src/MultiRegion/multi_region_boundary_conditions.jl
@@ -29,7 +29,7 @@ import Oceananigans.BoundaryConditions:
 @inline boundary_conditions(field::MultiRegionField) = field.boundary_conditions
 
 # This can be implemented once we have a buffer for field_tuples
-@inline function tupled_fill_halo_regions!(full_fields, grid::MultiRegionGrid, args...; kwargs...) 
+@inline function tupled_fill_halo_regions!(full_fields, grid::MultiRegionGrids, args...; kwargs...) 
     for field in full_fields
         fill_halo_regions!(field, args...; kwargs...)
     end
@@ -59,7 +59,7 @@ fill_halo_regions!(c::MultiRegionObject, ::Nothing, args...; kwargs...) = nothin
 # fill_halo_regions!(c::MultiRegionObject, bcs, loc, mrg::MultiRegionGrid, buffers, args...; kwargs...) = 
 #     apply_regionally!(fill_halo_regions!, c, bcs, loc, mrg, Reference(c.regional_objects), Reference(buffers.regional_objects), args...; kwargs...)
 
-function fill_halo_regions!(c::MultiRegionObject, bcs, indices, loc, mrg::MultiRegionGrid, buffers, args...; kwargs...) 
+function fill_halo_regions!(c::MultiRegionObject, bcs, indices, loc, mrg::MultiRegionGrids, buffers, args...; kwargs...) 
 
     arch  = architecture(mrg)
     halo_tuple  = construct_regionally(permute_boundary_conditions, bcs)
diff --git a/src/MultiRegion/multi_region_field.jl b/src/MultiRegion/multi_region_field.jl
index 49f9625463..11e660bd61 100644
--- a/src/MultiRegion/multi_region_field.jl
+++ b/src/MultiRegion/multi_region_field.jl
@@ -11,9 +11,9 @@ import Base: fill!, axes
 import Oceananigans.Simulations: hasnan
 
 # Field and FunctionField (both fields with "grids attached")
-const MultiRegionField{LX, LY, LZ, O} = Field{LX, LY, LZ, O, <:MultiRegionGrid} where {LX, LY, LZ, O}
-const MultiRegionComputedField{LX, LY, LZ, O} = Field{LX, LY, LZ, <:AbstractOperation, <:MultiRegionGrid} where {LX, LY, LZ}
-const MultiRegionFunctionField{LX, LY, LZ, C, P, F} = FunctionField{LX, LY, LZ, C, P, F, <:MultiRegionGrid} where {LX, LY, LZ, C, P, F}
+const MultiRegionField{LX, LY, LZ, O} = Field{LX, LY, LZ, O, <:MultiRegionGrids} where {LX, LY, LZ, O}
+const MultiRegionComputedField{LX, LY, LZ, O} = Field{LX, LY, LZ, <:AbstractOperation, <:MultiRegionGrids} where {LX, LY, LZ}
+const MultiRegionFunctionField{LX, LY, LZ, C, P, F} = FunctionField{LX, LY, LZ, C, P, F, <:MultiRegionGrids} where {LX, LY, LZ, C, P, F}
 
 const GriddedMultiRegionField = Union{MultiRegionField, MultiRegionFunctionField}
 const GriddedMultiRegionFieldTuple{N, T} = NTuple{N, T} where {N, T<:GriddedMultiRegionField}
@@ -123,17 +123,17 @@ compute_at!(mrf::MultiRegionComputedField, time) = apply_regionally!(compute_at!
 
 @inline hasnan(field::MultiRegionField) = (&)(construct_regionally(hasnan, field).regional_objects...)
 
-validate_indices(indices, loc, mrg::MultiRegionGrid) = 
-    construct_regionally(validate_indices, indices, loc, mrg.region_grids)
+validate_indices(indices, loc, mrg::MultiRegionGrids) = 
+    construct_regionally(validate_indices, indices, loc, mrg)
 
-FieldBoundaryBuffers(grid::MultiRegionGrid, args...; kwargs...) = 
+FieldBoundaryBuffers(grid::MultiRegionGrids, args...; kwargs...) = 
     construct_regionally(FieldBoundaryBuffers, grid, args...; kwargs...)
 
-FieldBoundaryConditions(mrg::MultiRegionGrid, loc, indices; kwargs...) =
+FieldBoundaryConditions(mrg::MultiRegionGrids, loc, indices; kwargs...) =
   construct_regionally(inject_regional_bcs, mrg, Iterate(1:length(mrg)), Reference(mrg.partition), Reference(loc), indices; kwargs...)
 
 function regularize_field_boundary_conditions(bcs::FieldBoundaryConditions,
-                                              mrg::MultiRegionGrid,
+                                              mrg::MultiRegionGrids,
                                               field_name::Symbol,
                                               prognostic_field_name=nothing)
 
diff --git a/src/MultiRegion/multi_region_grid.jl b/src/MultiRegion/multi_region_grid.jl
index 014f06e93a..c7e51e98ea 100644
--- a/src/MultiRegion/multi_region_grid.jl
+++ b/src/MultiRegion/multi_region_grid.jl
@@ -15,18 +15,26 @@ struct MultiRegionGrid{FT, TX, TY, TZ, P, G, D, Arch} <: AbstractMultiRegionGrid
         new{FT, TX, TY, TZ, P, G, D, A}(arch, partition, region_grids, devices)
 end
 
-@inline isregional(mrg::MultiRegionGrid)        = true
-@inline getdevice(mrg::MultiRegionGrid, i)      = getdevice(mrg.region_grids, i)
-@inline switch_device!(mrg::MultiRegionGrid, i) = switch_device!(getdevice(mrg, i))
-@inline devices(mrg::MultiRegionGrid)           = devices(mrg.region_grids)
-@inline sync_all_devices!(mrg::MultiRegionGrid) = sync_all_devices!(devices(mrg))
+const ImmersedMultiRegionGrid = ImmersedBoundaryGrid{<:Any, <:Any, <:Any, <:Any, <:MultiRegionGrid} 
+
+const MultiRegionGrids = Union{MultiRegionGrid, ImmersedMultiRegionGrid}
+
+@inline isregional(mrg::MultiRegionGrids)        = true
+@inline getdevice(mrg::MultiRegionGrid, i)       = getdevice(mrg.region_grids, i)
+@inline switch_device!(mrg::MultiRegionGrid, i)  = switch_device!(getdevice(mrg, i))
+@inline devices(mrg::MultiRegionGrid)            = devices(mrg.region_grids)
+@inline sync_all_devices!(mrg::MultiRegionGrid)  = sync_all_devices!(devices(mrg))
 
 @inline  getregion(mrg::MultiRegionGrid, r) = _getregion(mrg.region_grids, r)
 @inline _getregion(mrg::MultiRegionGrid, r) =  getregion(mrg.region_grids, r)
 
-@inline Base.length(mrg::MultiRegionGrid)   = Base.length(mrg.region_grids)
+@inline getdevice(mrg::ImmersedMultiRegionGrid, i)       = getdevice(mrg.underlying_grid.region_grids, i)
+@inline switch_device!(mrg::ImmersedMultiRegionGrid, i)  = switch_device!(getdevice(mrg.underlying_grid, i))
+@inline devices(mrg::ImmersedMultiRegionGrid)            = devices(mrg.underlying_grid.region_grids)
+@inline sync_all_devices!(mrg::ImmersedMultiRegionGrid)  = sync_all_devices!(devices(mrg.underlying_grid))
 
-const ImmersedMultiRegionGrid = MultiRegionGrid{FT, TX, TY, TZ, P, <:MultiRegionObject{<:Tuple{Vararg{IBG}}}} where {FT, TX, TY, TZ, P, IBG<:ImmersedBoundaryGrid}
+@inline Base.length(mrg::MultiRegionGrid)         = Base.length(mrg.region_grids)
+@inline Base.length(mrg::ImmersedMultiRegionGrid) = Base.length(mrg.underlying_grid.region_grids)
 
 """
     MultiRegionGrid(global_grid; partition = XPartition(2), devices = nothing)
@@ -107,14 +115,6 @@ function construct_grid(grid::LatitudeLongitudeGrid, child_arch, topo, size, ext
                                  precompute_metrics = metrics_precomputed(grid))
 end
 
-function construct_grid(ibg::ImmersedBoundaryGrid, child_arch, topo, local_size, extent, partition, region)
-    boundary = partition_immersed_boundary(ibg.immersed_boundary, partition, local_size, region, child_arch)
-    return ImmersedBoundaryGrid(construct_grid(ibg.underlying_grid, child_arch, topo, local_size, extent), boundary)
-end
-
-partition_immersed_boundary(b, args...) = 
-    getnamewrapper(b)(partition_global_array(getproperty(b, propertynames(b)[1]), args...))
-
 function reconstruct_global_grid(mrg)
     size    = reconstruct_size(mrg, mrg.partition)
     extent  = reconstruct_extent(mrg, mrg.partition)
@@ -128,21 +128,29 @@ end
 
 Reconstruct the `mrg` global grid associated with the `MultiRegionGrid` on `architecture(mrg)`.
 """
-function reconstruct_global_grid(mrg::ImmersedMultiRegionGrid{FT, TX, TY, TZ}) where {FT, TX, TY, TZ}
-    underlying_mrg = MultiRegionGrid{FT, TX, TY, TZ}(architecture(mrg), 
-                                                     mrg.partition, 
-                                                     construct_regionally(getproperty, mrg, :underlying_grid), 
-                                                     mrg.devices)
-                                                     
-    global_grid     = on_architecture(CPU(), reconstruct_global_grid(underlying_mrg))
-    cpu_mrg         = on_architecture(CPU(), mrg)
-    local_boundary  = construct_regionally(getproperty, cpu_mrg, :immersed_boundary)
-    local_array     = construct_regionally(getproperty, local_boundary, propertynames(local_boundary[1])[1])
-    local_array     = construct_regionally(getinterior, local_array, mrg)
-    global_boundary = getnamewrapper(local_boundary[1])(reconstruct_global_array(local_array, mrg.partition, architecture(mrg)))
-    return on_architecture(architecture(mrg), ImmersedBoundaryGrid(global_grid, global_boundary))
+function reconstruct_global_grid(mrg::ImmersedMultiRegionGrid) 
+    global_grid     = reconstruct_global_grid(mrg.underlying_grid)
+    global_boundary = reconstruct_global_boundary(mrg.immersed_boundary)
+
+    return ImmersedBoundaryGrid(global_grid, global_boundary)
 end
 
+using Oceananigans.ImmersedBoundaries: GridFittedBottom, PartialCellBottom, GridFittedBoundary
+
+reconstruct_global_boundary(g::GridFittedBottom{<:Field})   = GridFittedBottom(reconstruct_global_field(g.bottom_height), g.immersed_condition)
+reconstruct_global_boundary(g::PartialCellBottom{<:Field})  =  PartialCellBottom(reconstruct_global_field(g.bottom_height), g.minimum_fractional_cell_height)
+reconstruct_global_boundary(g::GridFittedBoundary{<:Field}) = GridFittedBoundary(reconstruct_global_field(g.bottom_height))
+
+@inline  getregion(mrg::ImmersedBoundaryGrid{FT, TX, TY, TZ}, r) where {FT, TX, TY, TZ} = ImmersedBoundaryGrid{TX, TY, TZ}(_getregion(mrg.underlying_grid, r), _getregion(mrg.immersed_boundary, r))
+@inline _getregion(mrg::ImmersedBoundaryGrid{FT, TX, TY, TZ}, r) where {FT, TX, TY, TZ} = ImmersedBoundaryGrid{TX, TY, TZ}( getregion(mrg.underlying_grid, r),  getregion(mrg.immersed_boundary, r))
+
+@inline  getregion(g::GridFittedBoundary{<:Field}, r) = GridFittedBoundary(_getregion(g.bottom_height, r))
+@inline _getregion(g::GridFittedBoundary{<:Field}, r) = GridFittedBoundary( getregion(g.bottom_height, r))
+@inline  getregion(g::GridFittedBottom{<:Field}, r)   = GridFittedBottom(_getregion(g.bottom_height, r), g.immersed_condition)
+@inline _getregion(g::GridFittedBottom{<:Field}, r)   = GridFittedBottom( getregion(g.bottom_height, r), g.immersed_condition)
+@inline  getregion(g::PartialCellBottom{<:Field}, r)  = PartialCellBottom(_getregion(g.bottom_height, r))
+@inline _getregion(g::PartialCellBottom{<:Field}, r)  = PartialCellBottom( getregion(g.bottom_height, r))
+
 getinterior(array::AbstractArray{T, 2}, grid) where T = array[1:grid.Nx, 1:grid.Ny]
 getinterior(array::AbstractArray{T, 3}, grid) where T = array[1:grid.Nx, 1:grid.Ny, 1:grid.Nz]
 getinterior(func::Function, grid) = func
@@ -167,7 +175,7 @@ multi_region_object_from_array(a::AbstractArray, grid) = arch_array(architecture
 #### Utilitites for MultiRegionGrid
 ####
 
-new_data(FT::DataType, mrg::MultiRegionGrid, args...) = construct_regionally(new_data, FT, mrg, args...)
+new_data(FT::DataType, mrg::MultiRegionGrids, args...) = construct_regionally(new_data, FT, mrg, args...)
 
 # This is kind of annoying but it is necessary to have compatible MultiRegion and Distributed
 function with_halo(new_halo, mrg::MultiRegionGrid) 
@@ -208,9 +216,16 @@ function Base.:(==)(mrg1::MultiRegionGrid, mrg2::MultiRegionGrid)
     return all(vals.regional_objects)
 end
    
+####
+#### This works only for homogenous partitioning
+####
+
+size(mrg::MultiRegionGrids) = size(getregion(mrg, 1)) 
+halo_size(mrg::MultiRegionGrids) = halo_size(getregion(mrg, 1)) 
+
 ####
 #### Get property for `MultiRegionGrid` (gets the properties of region 1)
-#### In general getpropert should never be used as a MultiRegionGrid
+#### In general getproperty should never be used as a MultiRegionGrid
 #### Should be used only in combination with an @apply_regionally
 ####
 
@@ -225,15 +240,4 @@ const MRG = MultiRegionGrid
 @inline get_multi_property(mrg::MRG, ::Val{:architecture})           = getfield(mrg, :architecture)
 @inline get_multi_property(mrg::MRG, ::Val{:partition})              = getfield(mrg, :partition)
 @inline get_multi_property(mrg::MRG, ::Val{:region_grids})           = getfield(mrg, :region_grids)
-@inline get_multi_property(mrg::MRG, ::Val{:devices})                = getfield(mrg, :devices)
-
-@inline function get_multi_property(mrg::ImmersedMultiRegionGrid, ::Val{:underlying_grid})
-    global_grid = reconstruct_global_grid(mrg)
-    grid        = global_grid.underlying_grid
-    return MultiRegionGrid(grid, partition = mrg.partition, devices = mrg.devices, validate = false)
-end
-
-function maybe_add_active_cells_map(mrg::MRG{FT, TX, TY, TZ}) where {FT, TX, TY, TZ}
-    new_grids = construct_regionally(maybe_add_active_cells_map, mrg)
-    return MultiRegionGrid{FT, TX, TY, TZ}(mrg.architecture, mrg.partition, new_grids, mrg.devices)
-end
+@inline get_multi_property(mrg::MRG, ::Val{:devices})                = getfield(mrg, :devices)
\ No newline at end of file
diff --git a/src/MultiRegion/multi_region_models.jl b/src/MultiRegion/multi_region_models.jl
index e3081929d1..959e7d4d7a 100644
--- a/src/MultiRegion/multi_region_models.jl
+++ b/src/MultiRegion/multi_region_models.jl
@@ -11,7 +11,7 @@ import Oceananigans.Advection: WENO, cell_advection_timescale
 import Oceananigans.Models.HydrostaticFreeSurfaceModels: build_implicit_step_solver, validate_tracer_advection
 import Oceananigans.TurbulenceClosures: implicit_diffusion_solver
 
-const MultiRegionModel = HydrostaticFreeSurfaceModel{<:Any, <:Any, <:AbstractArchitecture, <:Any, <:MultiRegionGrid}
+const MultiRegionModel = HydrostaticFreeSurfaceModel{<:Any, <:Any, <:AbstractArchitecture, <:Any, <:MultiRegionGrids}
 
 # Utility to generate the inputs to complex `getregion`s
 function getregionalproperties(T, inner=true) 
@@ -48,13 +48,13 @@ end
 @inline isregional(pv::PrescribedVelocityFields) = isregional(pv.u) | isregional(pv.v) | isregional(pv.w)
 @inline devices(pv::PrescribedVelocityFields)    = devices(pv[findfirst(isregional, (pv.u, pv.v, pv.w))])
 
-validate_tracer_advection(tracer_advection::MultiRegionObject, grid::MultiRegionGrid) = tracer_advection, NamedTuple()
+validate_tracer_advection(tracer_advection::MultiRegionObject, grid::MultiRegionGrids) = tracer_advection, NamedTuple()
 
 @inline isregional(mrm::MultiRegionModel)   = true
 @inline devices(mrm::MultiRegionModel)      = devices(mrm.grid)
 @inline getdevice(mrm::MultiRegionModel, d) = getdevice(mrm.grid, d)
 
-implicit_diffusion_solver(time_discretization::VerticallyImplicitTimeDiscretization, mrg::MultiRegionGrid) =
+implicit_diffusion_solver(time_discretization::VerticallyImplicitTimeDiscretization, mrg::MultiRegionGrids) =
       construct_regionally(implicit_diffusion_solver, time_discretization, mrg)
 
 WENO(mrg::MultiRegionGrid, args...; kwargs...) = construct_regionally(WENO, mrg, args...; kwargs...)
@@ -71,7 +71,7 @@ WENO(mrg::MultiRegionGrid, args...; kwargs...) = construct_regionally(WENO, mrg,
                                           getregion(t.vertical_scheme, r),
                                           getregion(t.upwinding, r))
 
-function cell_advection_timescale(grid::MultiRegionGrid, velocities)
+function cell_advection_timescale(grid::MultiRegionGrids, velocities)
     Δt = construct_regionally(cell_advection_timescale, grid, velocities)
     return minimum(Δt.regional_objects)
 end
diff --git a/src/MultiRegion/multi_region_output_writers.jl b/src/MultiRegion/multi_region_output_writers.jl
index 21f106b683..e3027fad4c 100644
--- a/src/MultiRegion/multi_region_output_writers.jl
+++ b/src/MultiRegion/multi_region_output_writers.jl
@@ -33,6 +33,6 @@ function serializeproperty!(file, location, mrf::MultiRegionField{LX, LY, LZ}) w
     return nothing
 end
 
-function serializeproperty!(file, location, mrg::MultiRegionGrid) 
+function serializeproperty!(file, location, mrg::MultiRegionGrids) 
     file[location] = on_architecture(CPU(), reconstruct_global_grid(mrg))
 end
diff --git a/src/MultiRegion/multi_region_split_explicit_free_surface.jl b/src/MultiRegion/multi_region_split_explicit_free_surface.jl
index dffe0d4a03..7addf5e25d 100644
--- a/src/MultiRegion/multi_region_split_explicit_free_surface.jl
+++ b/src/MultiRegion/multi_region_split_explicit_free_surface.jl
@@ -3,7 +3,7 @@ using Oceananigans.Models.HydrostaticFreeSurfaceModels: SplitExplicitState, Spli
 
 import Oceananigans.Models.HydrostaticFreeSurfaceModels: FreeSurface, SplitExplicitAuxiliaryFields
 
-function SplitExplicitAuxiliaryFields(grid::MultiRegionGrid)
+function SplitExplicitAuxiliaryFields(grid::MultiRegionGrids)
     
     Gᵁ = Field((Face,   Center, Nothing), grid)
     Gⱽ = Field((Center, Face,   Nothing), grid)
@@ -37,7 +37,7 @@ end
 @inline augmented_kernel_offsets(grid, ::XPartition) = (halo_size(grid)[1]-1, 0)
 @inline augmented_kernel_offsets(grid, ::YPartition) = (0, halo_size(grid)[2]-1)
 
-function FreeSurface(free_surface::SplitExplicitFreeSurface, velocities, grid::MultiRegionGrid)
+function FreeSurface(free_surface::SplitExplicitFreeSurface, velocities, grid::MultiRegionGrids)
 
         settings  = free_surface.settings 
 
diff --git a/src/MultiRegion/unified_implicit_free_surface_solver.jl b/src/MultiRegion/unified_implicit_free_surface_solver.jl
index 47be2ff8c4..9e209cfb09 100644
--- a/src/MultiRegion/unified_implicit_free_surface_solver.jl
+++ b/src/MultiRegion/unified_implicit_free_surface_solver.jl
@@ -26,7 +26,7 @@ end
 architecture(solver::UnifiedImplicitFreeSurfaceSolver) =
     architecture(solver.preconditioned_conjugate_gradient_solver)
 
-function UnifiedImplicitFreeSurfaceSolver(mrg::MultiRegionGrid, settings, gravitational_acceleration::Number; multiple_devices = false)
+function UnifiedImplicitFreeSurfaceSolver(mrg::MultiRegionGrids, settings, gravitational_acceleration::Number; multiple_devices = false)
     
     # Initialize vertically integrated lateral face areas
     grid = reconstruct_global_grid(mrg)
@@ -60,11 +60,11 @@ function UnifiedImplicitFreeSurfaceSolver(mrg::MultiRegionGrid, settings, gravit
     return UnifiedImplicitFreeSurfaceSolver(solver, right_hand_side, storage)
 end
 
-build_implicit_step_solver(::Val{:HeptadiagonalIterativeSolver}, grid::MultiRegionGrid, settings, gravitational_acceleration) =
+build_implicit_step_solver(::Val{:HeptadiagonalIterativeSolver}, grid::MultiRegionGrids, settings, gravitational_acceleration) =
     UnifiedImplicitFreeSurfaceSolver(grid, settings, gravitational_acceleration)
-build_implicit_step_solver(::Val{:Default}, grid::MultiRegionGrid, settings, gravitational_acceleration) =
+build_implicit_step_solver(::Val{:Default}, grid::MultiRegionGrids, settings, gravitational_acceleration) =
     UnifiedImplicitFreeSurfaceSolver(grid, settings, gravitational_acceleration)   
-build_implicit_step_solver(::Val{:PreconditionedConjugateGradient}, grid::MultiRegionGrid, settings, gravitational_acceleration) =
+build_implicit_step_solver(::Val{:PreconditionedConjugateGradient}, grid::MultiRegionGrids, settings, gravitational_acceleration) =
     throw(ArgumentError("Cannot use PCG solver with Multi-region grids!! Select :Default or :HeptadiagonalIterativeSolver as solver_method"))
 
 function compute_implicit_free_surface_right_hand_side!(rhs, implicit_solver::UnifiedImplicitFreeSurfaceSolver, g, Δt, ∫ᶻQ, η)
diff --git a/test/test_multi_region_unit.jl b/test/test_multi_region_unit.jl
index 6ff4cce90c..0c394d4ccc 100644
--- a/test/test_multi_region_unit.jl
+++ b/test/test_multi_region_unit.jl
@@ -41,7 +41,7 @@ devices(::GPU, num) = Tuple(0 for i in 1:num)
             for FieldType in [CenterField, XFaceField, YFaceField]
                 @info "Testing multi region $(FieldType) on $(getnamewrapper(grid)) on $regions $(Partition)s"
 
-                multi_region_field = FieldType(mrg)
+                multi_region_field  = FieldType(mrg)
                 single_region_field = FieldType(grid)
 
                 set!(single_region_field, (x, y, z) -> x)
@@ -58,9 +58,10 @@ devices(::GPU, num) = Tuple(0 for i in 1:num)
             for immersed_boundary in immersed_boundaries
                 @info "Testing multi region immersed boundaries on $(getnamewrapper(grid)) on $regions $(Partition)s"
                 ibg = ImmersedBoundaryGrid(grid, immersed_boundary)
-                mrg = MultiRegionGrid(ibg, partition = Partition(region), devices = devices(arch, region))
+                mrg = MultiRegionGrid(grid, partition = Partition(region), devices = devices(arch, region))
+                mribg = ImmersedBoundaryGrid(mrg, immersed_boundary)
 
-                @test on_architecture(arch, reconstruct_global_grid(mrg)) == ibg
+                @test reconstruct_global_grid(mribg) == ibg
             end
         end
     end

From 43c83eaf62b429c1746d264eb2c1df15e471f36e Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Tue, 25 Jul 2023 19:18:21 -0400
Subject: [PATCH 441/530] Update
 src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl

---
 src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl b/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
index 4a622c688f..db4c7d0683 100644
--- a/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
+++ b/src/TurbulenceClosures/vertically_implicit_diffusion_solver.jl
@@ -91,7 +91,7 @@ end
     νᵏ⁻¹   = νzᶜᶜᶜ(i, j, k′-1, grid, closure_ij, K, clock)
     Δzᶜₖ   = Δz(i, j, k′,   grid, ℓx, ℓy, c)
     Δzᶠₖ₋₁ = Δz(i, j, k′-1, grid, ℓx, ℓy, f)
-    dl     = Δt * νᵏ⁻¹ / (Δzᶜₖ * Δzᶠₖ₋₁)
+    dl     = - Δt * νᵏ⁻¹ / (Δzᶜₖ * Δzᶠₖ₋₁)
     return ifelse(k < 1, zero(grid), dl)
 end
 

From 45bdebca3955b91a5035d13859a1bc994e19c36d Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 25 Jul 2023 19:40:22 -0400
Subject: [PATCH 442/530] Add set!(::MultiRegionField, ::MultiRegionField) method

---
 src/MultiRegion/multi_region_field.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/MultiRegion/multi_region_field.jl b/src/MultiRegion/multi_region_field.jl
index 11e660bd61..9ee4b7f7d6 100644
--- a/src/MultiRegion/multi_region_field.jl
+++ b/src/MultiRegion/multi_region_field.jl
@@ -117,6 +117,7 @@ set!(mrf::MultiRegionField, v)  = apply_regionally!(set!,  mrf, v)
 fill!(mrf::MultiRegionField, v) = apply_regionally!(fill!, mrf, v)
 
 set!(mrf::MultiRegionField, f::Function)  = apply_regionally!(set!, mrf, f)
+set!(u::MultiRegionField, v::MultiRegionField)  = apply_regionally!(set!, u, v)
 
 compute_at!(mrf::GriddedMultiRegionField, time)  = apply_regionally!(compute_at!, mrf, time)
 compute_at!(mrf::MultiRegionComputedField, time) = apply_regionally!(compute_at!, mrf, time)

From 7ff28daff3eaadb697556fbe3aa117a197d9e790 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 25 Jul 2023 20:52:00 -0400
Subject: [PATCH 443/530] remove multiregion ibg with arrays for the moment

---
 test/test_multi_region_unit.jl | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/test/test_multi_region_unit.jl b/test/test_multi_region_unit.jl
index 0c394d4ccc..449de252a7 100644
--- a/test/test_multi_region_unit.jl
+++ b/test/test_multi_region_unit.jl
@@ -28,9 +28,7 @@ devices(::GPU, num) = Tuple(0 for i in 1:num)
         grids = [lat_lon_grid, rectilinear_grid]
 
         immersed_boundaries = [GridFittedBottom((x, y) -> 0.5),
-                               GridFittedBottom(arch_array(arch, [0.5 for i in 1:20, j in 1:20])),
-                               GridFittedBoundary((x, y, z) -> z>0.5),
-                               GridFittedBoundary(arch_array(arch, [false for i in 1:20, j in 1:20, k in 1:1]))]
+                               GridFittedBoundary((x, y, z) -> z>0.5)]
         
         for grid in grids, Partition in partition_types, region in regions
             @info "Testing multi region $(getnamewrapper(grid)) on $regions $(Partition)s"

From 95824659ef596d4e799f3877f705eefadda8ed2c Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 25 Jul 2023 21:25:27 -0400
Subject: [PATCH 444/530] Use GridFittedBoundary's mask field instead of bottom_height

---
 src/MultiRegion/multi_region_grid.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/MultiRegion/multi_region_grid.jl b/src/MultiRegion/multi_region_grid.jl
index c7e51e98ea..725e96d543 100644
--- a/src/MultiRegion/multi_region_grid.jl
+++ b/src/MultiRegion/multi_region_grid.jl
@@ -139,13 +139,13 @@ using Oceananigans.ImmersedBoundaries: GridFittedBottom, PartialCellBottom, Grid
 
 reconstruct_global_boundary(g::GridFittedBottom{<:Field})   = GridFittedBottom(reconstruct_global_field(g.bottom_height), g.immersed_condition)
 reconstruct_global_boundary(g::PartialCellBottom{<:Field})  =  PartialCellBottom(reconstruct_global_field(g.bottom_height), g.minimum_fractional_cell_height)
-reconstruct_global_boundary(g::GridFittedBoundary{<:Field}) = GridFittedBoundary(reconstruct_global_field(g.bottom_height))
+reconstruct_global_boundary(g::GridFittedBoundary{<:Field}) = GridFittedBoundary(reconstruct_global_field(g.mask))
 
 @inline  getregion(mrg::ImmersedBoundaryGrid{FT, TX, TY, TZ}, r) where {FT, TX, TY, TZ} = ImmersedBoundaryGrid{TX, TY, TZ}(_getregion(mrg.underlying_grid, r), _getregion(mrg.immersed_boundary, r))
 @inline _getregion(mrg::ImmersedBoundaryGrid{FT, TX, TY, TZ}, r) where {FT, TX, TY, TZ} = ImmersedBoundaryGrid{TX, TY, TZ}( getregion(mrg.underlying_grid, r),  getregion(mrg.immersed_boundary, r))
 
-@inline  getregion(g::GridFittedBoundary{<:Field}, r) = GridFittedBoundary(_getregion(g.bottom_height, r))
-@inline _getregion(g::GridFittedBoundary{<:Field}, r) = GridFittedBoundary( getregion(g.bottom_height, r))
+@inline  getregion(g::GridFittedBoundary{<:Field}, r) = GridFittedBoundary(_getregion(g.mask, r))
+@inline _getregion(g::GridFittedBoundary{<:Field}, r) = GridFittedBoundary( getregion(g.mask, r))
 @inline  getregion(g::GridFittedBottom{<:Field}, r)   = GridFittedBottom(_getregion(g.bottom_height, r), g.immersed_condition)
 @inline _getregion(g::GridFittedBottom{<:Field}, r)   = GridFittedBottom( getregion(g.bottom_height, r), g.immersed_condition)
 @inline  getregion(g::PartialCellBottom{<:Field}, r)  = PartialCellBottom(_getregion(g.bottom_height, r))

From 040c1bdb5b7549b6d194342ade5d724d198e56ff Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 26 Jul 2023 18:06:55 -0400
Subject: [PATCH 445/530] Tidy multi_region_grid.jl; add immersed boundary types to Types list

---
 src/MultiRegion/multi_region_grid.jl   | 38 +++++++++-----------------
 src/MultiRegion/multi_region_models.jl |  6 +++-
 2 files changed, 18 insertions(+), 26 deletions(-)

diff --git a/src/MultiRegion/multi_region_grid.jl b/src/MultiRegion/multi_region_grid.jl
index 725e96d543..3f9196f4bd 100644
--- a/src/MultiRegion/multi_region_grid.jl
+++ b/src/MultiRegion/multi_region_grid.jl
@@ -1,4 +1,6 @@
 using Oceananigans.Grids: metrics_precomputed, on_architecture, pop_flat_elements
+using Oceananigans.ImmersedBoundaries: GridFittedBottom, PartialCellBottom, GridFittedBoundary
+
 import Oceananigans.Grids: architecture, size, new_data, halo_size
 import Oceananigans.Grids: with_halo, on_architecture
 import Oceananigans.Distributed: reconstruct_global_grid
@@ -115,6 +117,11 @@ function construct_grid(grid::LatitudeLongitudeGrid, child_arch, topo, size, ext
                                  precompute_metrics = metrics_precomputed(grid))
 end
 
+"""
+    reconstruct_global_grid(mrg::MultiRegionGrid)
+
+Reconstruct the `mrg` global grid associated with the `MultiRegionGrid` on `architecture(mrg)`.
+"""
 function reconstruct_global_grid(mrg)
     size    = reconstruct_size(mrg, mrg.partition)
     extent  = reconstruct_extent(mrg, mrg.partition)
@@ -123,11 +130,10 @@ function reconstruct_global_grid(mrg)
     return construct_grid(mrg.region_grids[1], architecture(mrg), topo, size, extent)
 end
 
-"""
-    reconstruct_global_grid(mrg::MultiRegionGrid)
+#####
+##### `ImmersedMultiRegionGrid` functionalities
+#####
 
-Reconstruct the `mrg` global grid associated with the `MultiRegionGrid` on `architecture(mrg)`.
-"""
 function reconstruct_global_grid(mrg::ImmersedMultiRegionGrid) 
     global_grid     = reconstruct_global_grid(mrg.underlying_grid)
     global_boundary = reconstruct_global_boundary(mrg.immersed_boundary)
@@ -135,25 +141,12 @@ function reconstruct_global_grid(mrg::ImmersedMultiRegionGrid)
     return ImmersedBoundaryGrid(global_grid, global_boundary)
 end
 
-using Oceananigans.ImmersedBoundaries: GridFittedBottom, PartialCellBottom, GridFittedBoundary
-
-reconstruct_global_boundary(g::GridFittedBottom{<:Field})   = GridFittedBottom(reconstruct_global_field(g.bottom_height), g.immersed_condition)
+reconstruct_global_boundary(g::GridFittedBottom{<:Field})   =   GridFittedBottom(reconstruct_global_field(g.bottom_height), g.immersed_condition)
 reconstruct_global_boundary(g::PartialCellBottom{<:Field})  =  PartialCellBottom(reconstruct_global_field(g.bottom_height), g.minimum_fractional_cell_height)
 reconstruct_global_boundary(g::GridFittedBoundary{<:Field}) = GridFittedBoundary(reconstruct_global_field(g.mask))
 
-@inline  getregion(mrg::ImmersedBoundaryGrid{FT, TX, TY, TZ}, r) where {FT, TX, TY, TZ} = ImmersedBoundaryGrid{TX, TY, TZ}(_getregion(mrg.underlying_grid, r), _getregion(mrg.immersed_boundary, r))
-@inline _getregion(mrg::ImmersedBoundaryGrid{FT, TX, TY, TZ}, r) where {FT, TX, TY, TZ} = ImmersedBoundaryGrid{TX, TY, TZ}( getregion(mrg.underlying_grid, r),  getregion(mrg.immersed_boundary, r))
-
-@inline  getregion(g::GridFittedBoundary{<:Field}, r) = GridFittedBoundary(_getregion(g.mask, r))
-@inline _getregion(g::GridFittedBoundary{<:Field}, r) = GridFittedBoundary( getregion(g.mask, r))
-@inline  getregion(g::GridFittedBottom{<:Field}, r)   = GridFittedBottom(_getregion(g.bottom_height, r), g.immersed_condition)
-@inline _getregion(g::GridFittedBottom{<:Field}, r)   = GridFittedBottom( getregion(g.bottom_height, r), g.immersed_condition)
-@inline  getregion(g::PartialCellBottom{<:Field}, r)  = PartialCellBottom(_getregion(g.bottom_height, r))
-@inline _getregion(g::PartialCellBottom{<:Field}, r)  = PartialCellBottom( getregion(g.bottom_height, r))
-
-getinterior(array::AbstractArray{T, 2}, grid) where T = array[1:grid.Nx, 1:grid.Ny]
-getinterior(array::AbstractArray{T, 3}, grid) where T = array[1:grid.Nx, 1:grid.Ny, 1:grid.Nz]
-getinterior(func::Function, grid) = func
+@inline  getregion(mrg::ImmersedMultiRegionGrid{FT, TX, TY, TZ}, r) where {FT, TX, TY, TZ} = ImmersedBoundaryGrid{TX, TY, TZ}(_getregion(mrg.underlying_grid, r), _getregion(mrg.immersed_boundary, r))
+@inline _getregion(mrg::ImmersedMultiRegionGrid{FT, TX, TY, TZ}, r) where {FT, TX, TY, TZ} = ImmersedBoundaryGrid{TX, TY, TZ}( getregion(mrg.underlying_grid, r),  getregion(mrg.immersed_boundary, r))
 
 """
     multi_region_object_from_array(a::AbstractArray, grid)
@@ -196,11 +189,6 @@ function on_architecture(::CPU, mrg::MultiRegionGrid{FT, TX, TY, TZ}) where {FT,
     return MultiRegionGrid{FT, TX, TY, TZ}(CPU(), mrg.partition, new_grids, devices)
 end
 
-function on_specific_architecture(arch, dev, grid)
-    switch_device!(dev)
-    return on_architecture(arch, grid)
-end
-
 Base.summary(mrg::MultiRegionGrid{FT, TX, TY, TZ}) where {FT, TX, TY, TZ} =  
     "MultiRegionGrid{$FT, $TX, $TY, $TZ} with $(summary(mrg.partition)) on $(string(typeof(mrg.region_grids[1]).name.wrapper))"
 
diff --git a/src/MultiRegion/multi_region_models.jl b/src/MultiRegion/multi_region_models.jl
index 959e7d4d7a..40cb638e05 100644
--- a/src/MultiRegion/multi_region_models.jl
+++ b/src/MultiRegion/multi_region_models.jl
@@ -6,6 +6,7 @@ using Oceananigans.Models: PrescribedVelocityFields
 using Oceananigans.TurbulenceClosures: VerticallyImplicitTimeDiscretization
 using Oceananigans.Advection: AbstractAdvectionScheme
 using Oceananigans.Advection: VelocityUpwinding, OnlySelfUpwinding, CrossAndSelfUpwinding
+using Oceananigans.ImmersedBoundaries: GridFittedBottom, PartialCellBottom, GridFittedBoundary
 
 import Oceananigans.Advection: WENO, cell_advection_timescale
 import Oceananigans.Models.HydrostaticFreeSurfaceModels: build_implicit_step_solver, validate_tracer_advection
@@ -34,7 +35,10 @@ Types = (:HydrostaticFreeSurfaceModel,
          :PrescribedVelocityFields,
          :CrossAndSelfUpwinding,
          :OnlySelfUpwinding,
-         :VelocityUpwinding)
+         :VelocityUpwinding,
+         :GridFittedBoundary,
+         :GridFittedBottom,
+         :PartialCellBottom)
 
 for T in Types
     @eval begin

From fe5e413d45e1d3e65dd1ebf5fa74d74e7fb21e52 Mon Sep 17 00:00:00 2001
From: simone-silvestri <silvestri.simone0@gmail.com>
Date: Wed, 26 Jul 2023 18:30:55 -0400
Subject: [PATCH 446/530] Fix multi-region unit tests and boundary-conditions doctest output

---
 docs/src/model_setup/boundary_conditions.md | 2 +-
 test/test_multi_region_unit.jl              | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/docs/src/model_setup/boundary_conditions.md b/docs/src/model_setup/boundary_conditions.md
index 50fa545cd9..e78af509e9 100644
--- a/docs/src/model_setup/boundary_conditions.md
+++ b/docs/src/model_setup/boundary_conditions.md
@@ -442,7 +442,7 @@ hill (generic function with 1 method)
 
 julia> grid = ImmersedBoundaryGrid(underlying_grid, GridFittedBottom(hill))
 32×32×16 ImmersedBoundaryGrid{Float64, Periodic, Periodic, Bounded} on CPU with 3×3×3 halo:
-├── immersed_boundary: GridFittedBottom(mean(z)=0.106195, min(z)=0.1, max(z)=0.198258)
+├── immersed_boundary: GridFittedBottom(mean(z)=0.108726, min(z)=0.1, max(z)=0.198258)
 ├── underlying_grid: 32×32×16 RectilinearGrid{Float64, Periodic, Periodic, Bounded} on CPU with 3×3×3 halo
 ├── Periodic x ∈ [-3.0, 3.0) regularly spaced with Δx=0.1875
 ├── Periodic y ∈ [-3.0, 3.0) regularly spaced with Δy=0.1875
diff --git a/test/test_multi_region_unit.jl b/test/test_multi_region_unit.jl
index 449de252a7..13f0926e83 100644
--- a/test/test_multi_region_unit.jl
+++ b/test/test_multi_region_unit.jl
@@ -43,14 +43,15 @@ devices(::GPU, num) = Tuple(0 for i in 1:num)
                 single_region_field = FieldType(grid)
 
                 set!(single_region_field, (x, y, z) -> x)
-                @apply_regionally set!(multi_region_field, (x, y, z) -> x)
+                set!(multi_region_field,  (x, y, z) -> x)
 
                 fill_halo_regions!(single_region_field)
                 fill_halo_regions!(multi_region_field)
 
+                # Remember that fields are reconstructed on the CPU!!
                 reconstructed_field = reconstruct_global_field(multi_region_field)
 
-                @test parent(reconstructed_field) ≈ parent(single_region_field)
+                @test parent(reconstructed_field) == Array(parent(single_region_field))
             end
 
             for immersed_boundary in immersed_boundaries

From cd66ed31bf9a8a26b83d68f3f19297b735a03d55 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 28 Jul 2023 12:44:44 -0400
Subject: [PATCH 447/530] Call compute! directly in computed-field tests to surface the error

---
 test/test_computed_field.jl | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/test/test_computed_field.jl b/test/test_computed_field.jl
index d3d2c37e39..9e67d6151b 100644
--- a/test/test_computed_field.jl
+++ b/test/test_computed_field.jl
@@ -518,19 +518,19 @@ for arch in archs
                 tke                  = ((u - U)^2  + (v - V)^2 + w^2) / 2
                 tke_ccc              = @at (Center, Center, Center) ((u - U)^2  + (v - V)^2 + w^2) / 2
 
-                @test try compute!(Field(u_prime             )); true; catch; false; end
-                @test try compute!(Field(u_prime_ccc         )); true; catch; false; end
-                @test try compute!(Field(u_prime_squared     )); true; catch; false; end
-                @test try compute!(Field(u_prime_squared_ccc )); true; catch; false; end
-                @test try compute!(Field(horizontal_twice_tke)); true; catch; false; end
-                @test try compute!(Field(horizontal_tke      )); true; catch; false; end
-                @test try compute!(Field(twice_tke           )); true; catch; false; end
+                compute!(Field(u_prime             ))
+                compute!(Field(u_prime_ccc         ))
+                compute!(Field(u_prime_squared     ))
+                compute!(Field(u_prime_squared_ccc ))
+                compute!(Field(horizontal_twice_tke))
+                compute!(Field(horizontal_tke      ))
+                compute!(Field(twice_tke           ))
 
-                @test try compute!(Field(horizontal_tke_ccc  )); true; catch; false; end
-                @test try compute!(Field(tke                 )); true; catch; false; end
+                compute!(Field(horizontal_tke_ccc  ))
+                compute!(Field(tke                 ))
 
                 computed_tke = Field(tke_ccc)
-                @test try compute!(computed_tke); true; catch; false; end
+                compute!(computed_tke)
                 @test all(interior(computed_tke, 2:3, 2:3, 2:3) .== 9/2)
 
                 tke_window = Field(tke_ccc, indices=(2:3, 2:3, 2:3))

From cd935633a4c7cc96fa2e41533376517bba73c1b7 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 28 Jul 2023 13:21:41 -0400
Subject: [PATCH 448/530] allow changing halos from checkpointer

---
 src/OutputWriters/checkpointer.jl | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/OutputWriters/checkpointer.jl b/src/OutputWriters/checkpointer.jl
index 518f2d9183..ff0a52d40c 100644
--- a/src/OutputWriters/checkpointer.jl
+++ b/src/OutputWriters/checkpointer.jl
@@ -1,6 +1,7 @@
 using Glob
 
 using Oceananigans
+using Oceananigans.BoundaryConditions
 using Oceananigans: fields, prognostic_fields
 using Oceananigans.Fields: offset_data
 using Oceananigans.TimeSteppers: RungeKutta3TimeStepper, QuasiAdamsBashforth2TimeStepper
@@ -204,6 +205,7 @@ function set!(model, filepath::AbstractString)
 
         # Validate the grid
         checkpointed_grid = file["grid"]
+        Hx, Hy, Hz = halo_size(checkpointed_grid)
 
         model.grid == checkpointed_grid ||
              @warn "The grid associated with $filepath and model.grid are not the same!"
@@ -212,9 +214,10 @@ function set!(model, filepath::AbstractString)
 
         for name in propertynames(model_fields)
             if string(name) ∈ keys(file) # Test if variable exist in checkpoint
-                parent_data = file["$name/data"]
+                parent_data = file["$name/data"][Hx+1:end-Hx, Hy+1:end-Hy, Hz+1:end-Hz]
                 model_field = model_fields[name]
-                copyto!(model_field.data.parent, parent_data)
+                set!(model_field, parent_data)
+                fill_halo_regions!(model_field)
             else
                 @warn "Field $name does not exist in checkpoint and could not be restored."
             end

From 2c7a63301a5dce4e7138c3c06f56018281200463 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 28 Jul 2023 15:57:23 -0400
Subject: [PATCH 449/530] Debug: copy checkpoint data into parent(field) and log each field

---
 src/OutputWriters/checkpointer.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/OutputWriters/checkpointer.jl b/src/OutputWriters/checkpointer.jl
index ff0a52d40c..5a7e8d1576 100644
--- a/src/OutputWriters/checkpointer.jl
+++ b/src/OutputWriters/checkpointer.jl
@@ -213,11 +213,11 @@ function set!(model, filepath::AbstractString)
         model_fields = prognostic_fields(model)
 
         for name in propertynames(model_fields)
+            @info "loading field $name"
             if string(name) ∈ keys(file) # Test if variable exist in checkpoint
-                parent_data = file["$name/data"][Hx+1:end-Hx, Hy+1:end-Hy, Hz+1:end-Hz]
+                parent_data = file["$name/data"]
                 model_field = model_fields[name]
-                set!(model_field, parent_data)
-                fill_halo_regions!(model_field)
+                copyto!(parent(model_field), parent_data)
             else
                 @warn "Field $name does not exist in checkpoint and could not be restored."
             end

From 5310c55bd3f9781892b6f0e6d74d253d02fcdb94 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 28 Jul 2023 16:13:52 -0400
Subject: [PATCH 450/530] Load only interior data from checkpoint via interior_parent_indices

---
 src/OutputWriters/checkpointer.jl | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/OutputWriters/checkpointer.jl b/src/OutputWriters/checkpointer.jl
index 5a7e8d1576..6b9a698329 100644
--- a/src/OutputWriters/checkpointer.jl
+++ b/src/OutputWriters/checkpointer.jl
@@ -1,7 +1,6 @@
 using Glob
 
 using Oceananigans
-using Oceananigans.BoundaryConditions
 using Oceananigans: fields, prognostic_fields
 using Oceananigans.Fields: offset_data
 using Oceananigans.TimeSteppers: RungeKutta3TimeStepper, QuasiAdamsBashforth2TimeStepper
@@ -205,19 +204,22 @@ function set!(model, filepath::AbstractString)
 
         # Validate the grid
         checkpointed_grid = file["grid"]
-        Hx, Hy, Hz = halo_size(checkpointed_grid)
-
+        gridsize = size(checkpointed_grid)
+        topo     = map(instantiate, topology(checkpointed_grid))
         model.grid == checkpointed_grid ||
              @warn "The grid associated with $filepath and model.grid are not the same!"
 
         model_fields = prognostic_fields(model)
 
         for name in propertynames(model_fields)
-            @info "loading field $name"
             if string(name) ∈ keys(file) # Test if variable exist in checkpoint
-                parent_data = file["$name/data"]
                 model_field = model_fields[name]
-                copyto!(parent(model_field), parent_data)
+                halo = halo_size(model_field.grid)
+                loc  = location(model_field)
+                indices = map(interior_parent_indices, loc, topo, gridsize, halo)
+                parent_data = file["$name/data"][indices...]
+                set!(model_field, parent_data)
+                fill_halo_regions!(model_field)
             else
                 @warn "Field $name does not exist in checkpoint and could not be restored."
             end

From ac408b5ab0ddc716b11829e0a355a7a79c6ab891 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 28 Jul 2023 16:15:46 -0400
Subject: [PATCH 451/530] Rename parent_data to interior_data in checkpointer set!

---
 src/OutputWriters/checkpointer.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/OutputWriters/checkpointer.jl b/src/OutputWriters/checkpointer.jl
index 6b9a698329..f7a8651788 100644
--- a/src/OutputWriters/checkpointer.jl
+++ b/src/OutputWriters/checkpointer.jl
@@ -212,13 +212,13 @@ function set!(model, filepath::AbstractString)
         model_fields = prognostic_fields(model)
 
         for name in propertynames(model_fields)
-            if string(name) ∈ keys(file) # Test if variable exist in checkpoint
+            if string(name) ∈ keys(file) # Test if variable exist in checkpoint.
                 model_field = model_fields[name]
                 halo = halo_size(model_field.grid)
                 loc  = location(model_field)
                 indices = map(interior_parent_indices, loc, topo, gridsize, halo)
-                parent_data = file["$name/data"][indices...]
-                set!(model_field, parent_data)
+                interior_data = file["$name/data"][indices...] #  Allow different halo size by loading only the interior
+                set!(model_field, interior_data)
                 fill_halo_regions!(model_field)
             else
                 @warn "Field $name does not exist in checkpoint and could not be restored."

From 746a0145e79c4b61d939560a8cf36038d5c4167b Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Sun, 30 Jul 2023 14:21:21 -0400
Subject: [PATCH 452/530] bugfix

---
 src/OutputWriters/checkpointer.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/OutputWriters/checkpointer.jl b/src/OutputWriters/checkpointer.jl
index f7a8651788..0ecac5a70f 100644
--- a/src/OutputWriters/checkpointer.jl
+++ b/src/OutputWriters/checkpointer.jl
@@ -2,7 +2,8 @@ using Glob
 
 using Oceananigans
 using Oceananigans: fields, prognostic_fields
-using Oceananigans.Fields: offset_data
+using Oceananigans.Grids: interior_parent_indices
+using Oceananigans.Fields: offset_data, intantiate
 using Oceananigans.TimeSteppers: RungeKutta3TimeStepper, QuasiAdamsBashforth2TimeStepper
 
 import Oceananigans.Fields: set!

From a4aa6968b7ca10f4e813da42dfae3da95c856700 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Sun, 30 Jul 2023 14:28:37 -0400
Subject: [PATCH 453/530] bugfix

---
 src/OutputWriters/checkpointer.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/OutputWriters/checkpointer.jl b/src/OutputWriters/checkpointer.jl
index 0ecac5a70f..4668dda6b0 100644
--- a/src/OutputWriters/checkpointer.jl
+++ b/src/OutputWriters/checkpointer.jl
@@ -3,7 +3,7 @@ using Glob
 using Oceananigans
 using Oceananigans: fields, prognostic_fields
 using Oceananigans.Grids: interior_parent_indices
-using Oceananigans.Fields: offset_data, intantiate
+using Oceananigans.Fields: offset_data, instantiate
 using Oceananigans.TimeSteppers: RungeKutta3TimeStepper, QuasiAdamsBashforth2TimeStepper
 
 import Oceananigans.Fields: set!

From 9f8c1bb40b9bb01dfd13159584bbd6a302d2c222 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 31 Jul 2023 00:16:19 -0400
Subject: [PATCH 454/530] bugfix

---
 src/OutputWriters/checkpointer.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/OutputWriters/checkpointer.jl b/src/OutputWriters/checkpointer.jl
index 4668dda6b0..0dd3154391 100644
--- a/src/OutputWriters/checkpointer.jl
+++ b/src/OutputWriters/checkpointer.jl
@@ -4,6 +4,7 @@ using Oceananigans
 using Oceananigans: fields, prognostic_fields
 using Oceananigans.Grids: interior_parent_indices
 using Oceananigans.Fields: offset_data, instantiate
+using Oceananigans.BoundaryConditions
 using Oceananigans.TimeSteppers: RungeKutta3TimeStepper, QuasiAdamsBashforth2TimeStepper
 
 import Oceananigans.Fields: set!

From 11f01d839f3649d2fc01ef825297112c4dd9d71c Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 31 Jul 2023 12:36:56 -0400
Subject: [PATCH 455/530] bugfix

---
 src/OutputWriters/checkpointer.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/OutputWriters/checkpointer.jl b/src/OutputWriters/checkpointer.jl
index 0dd3154391..bce0814e1e 100644
--- a/src/OutputWriters/checkpointer.jl
+++ b/src/OutputWriters/checkpointer.jl
@@ -217,7 +217,7 @@ function set!(model, filepath::AbstractString)
             if string(name) ∈ keys(file) # Test if variable exist in checkpoint.
                 model_field = model_fields[name]
                 halo = halo_size(model_field.grid)
-                loc  = location(model_field)
+                loc  = map(instantiate, location(model_field))
                 indices = map(interior_parent_indices, loc, topo, gridsize, halo)
                 interior_data = file["$name/data"][indices...] #  Allow different halo size by loading only the interior
                 set!(model_field, interior_data)

From 2c0a170fb24b2472d4e9c67c35ad800e01e222ea Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 31 Jul 2023 13:07:26 -0400
Subject: [PATCH 456/530] removed useless tendency

---
 .../hydrostatic_free_surface_field_tuples.jl           |  9 ++++++++-
 src/OutputWriters/checkpointer.jl                      | 10 ++--------
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_field_tuples.jl b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_field_tuples.jl
index 5601f85192..983f227288 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_field_tuples.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_field_tuples.jl
@@ -8,10 +8,17 @@ function HydrostaticFreeSurfaceVelocityFields(::Nothing, grid, clock, bcs=NamedT
     return (u=u, v=v, w=w)
 end
 
-function HydrostaticFreeSurfaceTendencyFields(velocities, free_surface, grid, tracer_names)
+function HydrostaticFreeSurfaceTendencyFields(velocities, ::ExplicitFreeSurface, grid, tracer_names)
     u = XFaceField(grid)
     v = YFaceField(grid)
     η = FreeSurfaceDisplacementField(velocities, free_surface, grid)
     tracers = TracerFields(tracer_names, grid)
     return merge((u=u, v=v, η=η), tracers)
 end
+
+function HydrostaticFreeSurfaceTendencyFields(velocities, free_surface, grid, tracer_names)
+    u = XFaceField(grid)
+    v = YFaceField(grid)
+    tracers = TracerFields(tracer_names, grid)
+    return merge((u=u, v=v), tracers)
+end
\ No newline at end of file
diff --git a/src/OutputWriters/checkpointer.jl b/src/OutputWriters/checkpointer.jl
index bce0814e1e..77de350f7f 100644
--- a/src/OutputWriters/checkpointer.jl
+++ b/src/OutputWriters/checkpointer.jl
@@ -206,8 +206,6 @@ function set!(model, filepath::AbstractString)
 
         # Validate the grid
         checkpointed_grid = file["grid"]
-        gridsize = size(checkpointed_grid)
-        topo     = map(instantiate, topology(checkpointed_grid))
         model.grid == checkpointed_grid ||
              @warn "The grid associated with $filepath and model.grid are not the same!"
 
@@ -216,12 +214,8 @@ function set!(model, filepath::AbstractString)
         for name in propertynames(model_fields)
             if string(name) ∈ keys(file) # Test if variable exist in checkpoint.
                 model_field = model_fields[name]
-                halo = halo_size(model_field.grid)
-                loc  = map(instantiate, location(model_field))
-                indices = map(interior_parent_indices, loc, topo, gridsize, halo)
-                interior_data = file["$name/data"][indices...] #  Allow different halo size by loading only the interior
-                set!(model_field, interior_data)
-                fill_halo_regions!(model_field)
+                parent_data = file["$name/data"] #  Allow different halo size by loading only the interior
+                copyto!(model_field.data.parent, parent_data)
             else
                 @warn "Field $name does not exist in checkpoint and could not be restored."
             end

From 24c681594b7e90f1f1561e6f374eed76464c45b4 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 31 Jul 2023 13:31:39 -0400
Subject: [PATCH 457/530] small fix

---
 .../hydrostatic_free_surface_field_tuples.jl          | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_field_tuples.jl b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_field_tuples.jl
index 983f227288..384c37fc7d 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_field_tuples.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_field_tuples.jl
@@ -8,17 +8,10 @@ function HydrostaticFreeSurfaceVelocityFields(::Nothing, grid, clock, bcs=NamedT
     return (u=u, v=v, w=w)
 end
 
-function HydrostaticFreeSurfaceTendencyFields(velocities, ::ExplicitFreeSurface, grid, tracer_names)
-    u = XFaceField(grid)
-    v = YFaceField(grid)
-    η = FreeSurfaceDisplacementField(velocities, free_surface, grid)
-    tracers = TracerFields(tracer_names, grid)
-    return merge((u=u, v=v, η=η), tracers)
-end
-
 function HydrostaticFreeSurfaceTendencyFields(velocities, free_surface, grid, tracer_names)
     u = XFaceField(grid)
     v = YFaceField(grid)
+    η = free_surface isa ExplicitFreeSurface ? FreeSurfaceDisplacementField(velocities, free_surface, grid) : nothing
     tracers = TracerFields(tracer_names, grid)
-    return merge((u=u, v=v), tracers)
+    return merge((u=u, v=v, η=η), tracers)
 end
\ No newline at end of file

From d19ab3c8f371ccb54e5439c84d4bcb35dc47a6b8 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 31 Jul 2023 15:09:27 -0400
Subject: [PATCH 458/530] dummy commit

---
 src/OutputWriters/checkpointer.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/OutputWriters/checkpointer.jl b/src/OutputWriters/checkpointer.jl
index 90513dbc1b..a97617e57e 100644
--- a/src/OutputWriters/checkpointer.jl
+++ b/src/OutputWriters/checkpointer.jl
@@ -5,7 +5,7 @@ using Oceananigans: fields, prognostic_fields
 using Oceananigans.Fields: offset_data
 using Oceananigans.TimeSteppers: RungeKutta3TimeStepper, QuasiAdamsBashforth2TimeStepper
 
-import Oceananigans.Fields: set!
+import Oceananigans.Fields: set! 
 
 mutable struct Checkpointer{T, P} <: AbstractOutputWriter
     schedule :: T

From 7688452820098da129f330b7897c90f31f44308d Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 4 Aug 2023 09:42:07 -0400
Subject: [PATCH 459/530] fix active cell map

---
 src/ImmersedBoundaries/ImmersedBoundaries.jl | 12 +++++++++---
 src/ImmersedBoundaries/active_cells_map.jl   |  2 +-
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/ImmersedBoundaries/ImmersedBoundaries.jl b/src/ImmersedBoundaries/ImmersedBoundaries.jl
index 8bc32a3c5c..4b4455a6f8 100644
--- a/src/ImmersedBoundaries/ImmersedBoundaries.jl
+++ b/src/ImmersedBoundaries/ImmersedBoundaries.jl
@@ -114,6 +114,13 @@ struct ImmersedBoundaryGrid{FT, TX, TY, TZ, G, I, M, Arch} <: AbstractGrid{FT, T
         Arch = typeof(arch)
         return new{FT, TX, TY, TZ, G, I, M, Arch}(arch, grid, ib, mi)
     end
+
+    function ImmersedBoundaryGrid{TX, TY, TZ}(grid::G, ib::I) where {TX, TY, TZ, G <: AbstractUnderlyingGrid, I}
+        FT = eltype(grid)
+        arch = architecture(grid)
+        Arch = typeof(arch)
+        return new{FT, TX, TY, TZ, G, I, Nothing, Arch}(arch, grid, ib, nothing)
+    end
 end
 
 const IBG = ImmersedBoundaryGrid
@@ -134,9 +141,8 @@ const IBG = ImmersedBoundaryGrid
 Adapt.adapt_structure(to, ibg::IBG{FT, TX, TY, TZ}) where {FT, TX, TY, TZ} =
     ImmersedBoundaryGrid{TX, TY, TZ}(adapt(to, ibg.underlying_grid), adapt(to, ibg.immersed_boundary), adapt(to, ibg.active_cells_interior))
 
-function with_halo(halo, ibg::ImmersedBoundaryGrid) 
-    return ImmersedBoundaryGrid(with_halo(halo, ibg.underlying_grid), ibg.immersed_boundary)
-end
+with_halo(halo, ibg::ImmersedBoundaryGrid) =
+    ImmersedBoundaryGrid(with_halo(halo, ibg.underlying_grid), ibg.immersed_boundary)
 
 # ImmersedBoundaryGrids require an extra halo point to check the "inactivity" of a `Face` node at N + H 
 # (which requires checking `Center` nodes at N + H and N + H + 1)
diff --git a/src/ImmersedBoundaries/active_cells_map.jl b/src/ImmersedBoundaries/active_cells_map.jl
index ea7c9fa06d..cd55827785 100644
--- a/src/ImmersedBoundaries/active_cells_map.jl
+++ b/src/ImmersedBoundaries/active_cells_map.jl
@@ -22,7 +22,7 @@ struct SurfaceMap end
 @inline active_linear_index_to_interior_tuple(idx, grid::ActiveCellsIBG) = Base.map(Int, grid.active_cells_interior[idx])
 @inline  active_linear_index_to_surface_tuple(idx, grid::ActiveCellsIBG) = Base.map(Int, grid.active_cells_surface[idx])
 
-function ImmersedBoundaryGrid(grid, ib, active_cells_map::Bool) 
+function ImmersedBoundaryGrid(grid, ib; active_cells_map::Bool = true) 
 
     ibg = ImmersedBoundaryGrid(grid, ib)
     TX, TY, TZ = topology(ibg)

From 0e81f1235c5029a9b42abeb2a3296c8d656876b7 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 4 Aug 2023 09:42:40 -0400
Subject: [PATCH 460/530] comment

---
 src/ImmersedBoundaries/ImmersedBoundaries.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/ImmersedBoundaries/ImmersedBoundaries.jl b/src/ImmersedBoundaries/ImmersedBoundaries.jl
index 4b4455a6f8..2cecb31b51 100644
--- a/src/ImmersedBoundaries/ImmersedBoundaries.jl
+++ b/src/ImmersedBoundaries/ImmersedBoundaries.jl
@@ -115,6 +115,7 @@ struct ImmersedBoundaryGrid{FT, TX, TY, TZ, G, I, M, Arch} <: AbstractGrid{FT, T
         return new{FT, TX, TY, TZ, G, I, M, Arch}(arch, grid, ib, mi)
     end
 
+    # Constructor with no active map
     function ImmersedBoundaryGrid{TX, TY, TZ}(grid::G, ib::I) where {TX, TY, TZ, G <: AbstractUnderlyingGrid, I}
         FT = eltype(grid)
         arch = architecture(grid)

From 7e4bf9a39b30f567370542133e5d09fe6a55d0fb Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 6 Aug 2023 13:23:58 -0400
Subject: [PATCH 461/530] bugfix

---
 .../HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl
index d01b1d3ecc..8b78623673 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl
@@ -73,7 +73,7 @@ Keyword Arguments
 """
 SplitExplicitFreeSurface(FT::DataType = Float64; gravitational_acceleration = g_Earth, kwargs...) = 
     SplitExplicitFreeSurface(nothing, nothing, nothing, convert(FT, gravitational_acceleration),
-                             SplitExplicitSettings(; gravitational_acceleration, kwargs...))
+                             SplitExplicitSettings(FT; gravitational_acceleration, kwargs...))
                              
 # The new constructor is defined later on after the state, settings, auxiliary have been defined
 function FreeSurface(free_surface::SplitExplicitFreeSurface, velocities, grid)

From 347367e6f43ab0e472d61ea819f257dde03ddd22 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 6 Aug 2023 13:24:15 -0400
Subject: [PATCH 462/530] bugfix

---
 .../HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl
index d01b1d3ecc..8b78623673 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface.jl
@@ -73,7 +73,7 @@ Keyword Arguments
 """
 SplitExplicitFreeSurface(FT::DataType = Float64; gravitational_acceleration = g_Earth, kwargs...) = 
     SplitExplicitFreeSurface(nothing, nothing, nothing, convert(FT, gravitational_acceleration),
-                             SplitExplicitSettings(; gravitational_acceleration, kwargs...))
+                             SplitExplicitSettings(FT; gravitational_acceleration, kwargs...))
                              
 # The new constructor is defined later on after the state, settings, auxiliary have been defined
 function FreeSurface(free_surface::SplitExplicitFreeSurface, velocities, grid)

From 2068462baafde0062202a8281abca8725ee3ae3a Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 6 Aug 2023 14:51:43 -0400
Subject: [PATCH 463/530] removed useless tendency

---
 .../hydrostatic_free_surface_field_tuples.jl             | 9 ++++++++-
 .../prescribed_hydrostatic_velocity_fields.jl            | 5 +++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_field_tuples.jl b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_field_tuples.jl
index 384c37fc7d..1a6a75871f 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_field_tuples.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_field_tuples.jl
@@ -8,10 +8,17 @@ function HydrostaticFreeSurfaceVelocityFields(::Nothing, grid, clock, bcs=NamedT
     return (u=u, v=v, w=w)
 end
 
-function HydrostaticFreeSurfaceTendencyFields(velocities, free_surface, grid, tracer_names)
+function HydrostaticFreeSurfaceTendencyFields(velocities, free_surface::ExplicitFreeSurface, grid, tracer_names)
     u = XFaceField(grid)
     v = YFaceField(grid)
     η = free_surface isa ExplicitFreeSurface ? FreeSurfaceDisplacementField(velocities, free_surface, grid) : nothing
     tracers = TracerFields(tracer_names, grid)
     return merge((u=u, v=v, η=η), tracers)
+end
+
+function HydrostaticFreeSurfaceTendencyFields(velocities, free_surface, grid, tracer_names)
+    u = XFaceField(grid)
+    v = YFaceField(grid)
+    tracers = TracerFields(tracer_names, grid)
+    return merge((u=u, v=v), tracers)
 end
\ No newline at end of file
diff --git a/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl b/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl
index 009866740c..85b26baf12 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl
@@ -69,6 +69,11 @@ function HydrostaticFreeSurfaceTendencyFields(::PrescribedVelocityFields, free_s
     return merge((u = nothing, v = nothing, η = nothing), tracers)
 end
 
+function HydrostaticFreeSurfaceTendencyFields(::PrescribedVelocityFields, ::ExplicitFreeSurface, grid, tracer_names)
+    tracers = TracerFields(tracer_names, grid)
+    return merge((u = nothing, v = nothing, η = nothing), tracers)
+end
+
 @inline fill_halo_regions!(::PrescribedVelocityFields, args...) = nothing
 @inline fill_halo_regions!(::FunctionField, args...) = nothing
 

From c972d0769e2ad2b6b868b5601f36cbbbb294e39f Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 6 Aug 2023 15:05:12 -0400
Subject: [PATCH 464/530] maybe just keep it does not harm too much

---
 .../hydrostatic_free_surface_field_tuples.jl          | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_field_tuples.jl b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_field_tuples.jl
index 1a6a75871f..5601f85192 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_field_tuples.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_field_tuples.jl
@@ -8,17 +8,10 @@ function HydrostaticFreeSurfaceVelocityFields(::Nothing, grid, clock, bcs=NamedT
     return (u=u, v=v, w=w)
 end
 
-function HydrostaticFreeSurfaceTendencyFields(velocities, free_surface::ExplicitFreeSurface, grid, tracer_names)
+function HydrostaticFreeSurfaceTendencyFields(velocities, free_surface, grid, tracer_names)
     u = XFaceField(grid)
     v = YFaceField(grid)
-    η = free_surface isa ExplicitFreeSurface ? FreeSurfaceDisplacementField(velocities, free_surface, grid) : nothing
+    η = FreeSurfaceDisplacementField(velocities, free_surface, grid)
     tracers = TracerFields(tracer_names, grid)
     return merge((u=u, v=v, η=η), tracers)
 end
-
-function HydrostaticFreeSurfaceTendencyFields(velocities, free_surface, grid, tracer_names)
-    u = XFaceField(grid)
-    v = YFaceField(grid)
-    tracers = TracerFields(tracer_names, grid)
-    return merge((u=u, v=v), tracers)
-end
\ No newline at end of file

From e01c38c0d6d83d7074f869049d964d96aa307148 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 6 Aug 2023 17:08:44 -0400
Subject: [PATCH 465/530] should have fixed it?

---
 src/OutputWriters/output_writer_utils.jl | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/OutputWriters/output_writer_utils.jl b/src/OutputWriters/output_writer_utils.jl
index 6f66226957..53a9f42356 100644
--- a/src/OutputWriters/output_writer_utils.jl
+++ b/src/OutputWriters/output_writer_utils.jl
@@ -1,5 +1,6 @@
 using StructArrays: StructArray, replace_storage
 using Oceananigans.Grids: on_architecture
+using Oceananigans.Distributed
 using Oceananigans.Fields: AbstractField, indices, boundary_conditions, instantiated_location
 using Oceananigans.BoundaryConditions: bc_str, FieldBoundaryConditions, ContinuousBoundaryFunction, DiscreteBoundaryFunction
 using Oceananigans.TimeSteppers: QuasiAdamsBashforth2TimeStepper, RungeKutta3TimeStepper
@@ -40,6 +41,13 @@ saveproperty!(file, address, p::Function)             = nothing
 saveproperty!(file, address, p::Tuple)                = [saveproperty!(file, address * "/$i", p[i]) for i in 1:length(p)]
 saveproperty!(file, address, grid::AbstractGrid)      = _saveproperty!(file, address, on_architecture(CPU(), grid))
 
+function saveproperty!(file, address, grid::DistributedGrid) 
+    arch = architecture(grid)
+    cpu_arch = DistributedArch(CPU(); topology = topology(grid),
+                                      ranks = arch.ranks)
+    _saveproperty!(file, address, on_architecture(cpu_arch, grid))
+end
+
 # Special saveproperty! so boundary conditions are easily readable outside julia.
 function saveproperty!(file, address, bcs::FieldBoundaryConditions)
     for boundary in propertynames(bcs)
@@ -75,6 +83,13 @@ serializeproperty!(file, address, p::CantSerializeThis) = nothing
 # TODO: use on_architecture for more stuff?
 serializeproperty!(file, address, grid::AbstractGrid) = file[address] = on_architecture(CPU(), grid)
 
+function serializeproperty!(file, address, grid::DistributedGrid) 
+    arch = architecture(grid)
+    cpu_arch = DistributedArch(CPU(); topology = topology(grid),
+                                      ranks = arch.ranks)
+    file[address] = on_architecture(cpu_arch, grid)
+end
+
 function serializeproperty!(file, address, p::FieldBoundaryConditions)
     # TODO: it'd be better to "filter" `FieldBoundaryCondition` and then serialize
     # rather than punting with `missing` instead.

From 45fb9d57c8e7b7999e70fd907c2fc01d9bd884ae Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 6 Aug 2023 17:14:25 -0400
Subject: [PATCH 466/530] let's go now

---
 src/OutputWriters/output_writer_utils.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/OutputWriters/output_writer_utils.jl b/src/OutputWriters/output_writer_utils.jl
index 53a9f42356..074066a3b1 100644
--- a/src/OutputWriters/output_writer_utils.jl
+++ b/src/OutputWriters/output_writer_utils.jl
@@ -1,6 +1,7 @@
 using StructArrays: StructArray, replace_storage
-using Oceananigans.Grids: on_architecture
+using Oceananigans.Grids: on_architecture, architecture
 using Oceananigans.Distributed
+using Oceananigans.Distributed: DistributedGrid
 using Oceananigans.Fields: AbstractField, indices, boundary_conditions, instantiated_location
 using Oceananigans.BoundaryConditions: bc_str, FieldBoundaryConditions, ContinuousBoundaryFunction, DiscreteBoundaryFunction
 using Oceananigans.TimeSteppers: QuasiAdamsBashforth2TimeStepper, RungeKutta3TimeStepper

From 170dc908da395d9306053d0161fb91e62d6d9b24 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 6 Aug 2023 18:16:31 -0400
Subject: [PATCH 467/530] done

---
 src/Architectures.jl | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/Architectures.jl b/src/Architectures.jl
index 9cd3a7487d..29fcb984a8 100644
--- a/src/Architectures.jl
+++ b/src/Architectures.jl
@@ -67,11 +67,17 @@ arch_array(::CPU, a::SubArray{<:Any, <:Any, <:CuArray}) = Array(a)
 arch_array(::GPU, a::SubArray{<:Any, <:Any, <:Array}) = CuArray(a)
 arch_array(::CPU, a::SubArray{<:Any, <:Any, <:Array}) = a
 
-arch_array(arch, a::AbstractRange) = a
-arch_array(arch, a::OffsetArray) = OffsetArray(arch_array(arch, a.parent), a.offsets...)
-arch_array(arch, ::Nothing)   = nothing
-arch_array(arch, a::Number)   = a
-arch_array(arch, a::Function) = a
+arch_array(::CPU, a::AbstractRange) = a
+arch_array(::CPU, a::OffsetArray) = OffsetArray(arch_array(arch, a.parent), a.offsets...)
+arch_array(::CPU, ::Nothing)   = nothing
+arch_array(::CPU, a::Number)   = a
+arch_array(::CPU, a::Function) = a
+
+arch_array(::GPU, a::AbstractRange) = a
+arch_array(::GPU, a::OffsetArray) = OffsetArray(arch_array(arch, a.parent), a.offsets...)
+arch_array(::GPU, ::Nothing)   = nothing
+arch_array(::GPU, a::Number)   = a
+arch_array(::GPU, a::Function) = a
 
 unified_array(::CPU, a) = a
 unified_array(::GPU, a) = a

From f0ac1daf1aeb95a518febe788dfb5a90267e1840 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 6 Aug 2023 18:55:45 -0400
Subject: [PATCH 468/530] bugfix

---
 src/Architectures.jl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/Architectures.jl b/src/Architectures.jl
index 29fcb984a8..581ea7943c 100644
--- a/src/Architectures.jl
+++ b/src/Architectures.jl
@@ -68,17 +68,18 @@ arch_array(::GPU, a::SubArray{<:Any, <:Any, <:Array}) = CuArray(a)
 arch_array(::CPU, a::SubArray{<:Any, <:Any, <:Array}) = a
 
 arch_array(::CPU, a::AbstractRange) = a
-arch_array(::CPU, a::OffsetArray) = OffsetArray(arch_array(arch, a.parent), a.offsets...)
 arch_array(::CPU, ::Nothing)   = nothing
 arch_array(::CPU, a::Number)   = a
 arch_array(::CPU, a::Function) = a
 
 arch_array(::GPU, a::AbstractRange) = a
-arch_array(::GPU, a::OffsetArray) = OffsetArray(arch_array(arch, a.parent), a.offsets...)
 arch_array(::GPU, ::Nothing)   = nothing
 arch_array(::GPU, a::Number)   = a
 arch_array(::GPU, a::Function) = a
 
+arch_array(arch::CPU, a::OffsetArray) = OffsetArray(arch_array(arch, a.parent), a.offsets...)
+arch_array(arch::GPU, a::OffsetArray) = OffsetArray(arch_array(arch, a.parent), a.offsets...)
+
 unified_array(::CPU, a) = a
 unified_array(::GPU, a) = a
 

From c9a4ae6703d41f35e3e4d0466d4a807c2ce3dcd2 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 6 Aug 2023 19:29:04 -0400
Subject: [PATCH 469/530] no need for this

---
 src/ImmersedBoundaries/conditional_fluxes.jl | 20 +-------------------
 1 file changed, 1 insertion(+), 19 deletions(-)

diff --git a/src/ImmersedBoundaries/conditional_fluxes.jl b/src/ImmersedBoundaries/conditional_fluxes.jl
index e48962132d..6afac0e2ac 100644
--- a/src/ImmersedBoundaries/conditional_fluxes.jl
+++ b/src/ImmersedBoundaries/conditional_fluxes.jl
@@ -214,22 +214,4 @@ for bias in (:symmetric, :left_biased, :right_biased)
             end
         end
     end
-end
-
-for bias in (:left_biased, :right_biased)
-    for (d, dir) in zip((:x, :y), (:xᶜᵃᵃ, :yᵃᶜᵃ))
-        interp     = Symbol(bias, :_interpolate_, dir)
-        alt_interp = Symbol(:_, interp)
-
-        near_horizontal_boundary = Symbol(:near_, d, :_horizontal_boundary_, bias)
-
-        @eval begin
-            # Conditional Interpolation for VelocityStencil WENO vector invariant scheme
-            @inline $alt_interp(i, j, k, ibg::ImmersedBoundaryGrid, scheme::WENO, ζ, ::VelocityStencil, args...) =
-                ifelse($near_horizontal_boundary(i, j, k, ibg, scheme),
-                       $alt_interp(i, j, k, ibg, scheme, ζ, DefaultStencil(), args...),
-                       $interp(i, j, k, ibg, scheme, ζ, VelocityStencil(), args...))
-        end
-    end
-end
-
+end
\ No newline at end of file

From 6fc688e840c13f60d617a72b58ae9b4d715f5247 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Tue, 15 Aug 2023 16:11:43 -0400
Subject: [PATCH 470/530] =?UTF-8?q?convert=20=CE=94t=20in=20time=20steppin?=
 =?UTF-8?q?g?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/OutputWriters/checkpointer.jl           | 4 +---
 src/Simulations/run.jl                      | 3 +--
 src/TimeSteppers/quasi_adams_bashforth_2.jl | 2 +-
 src/TimeSteppers/runge_kutta_3.jl           | 8 ++++----
 4 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/src/OutputWriters/checkpointer.jl b/src/OutputWriters/checkpointer.jl
index 77de350f7f..f23f6306d7 100644
--- a/src/OutputWriters/checkpointer.jl
+++ b/src/OutputWriters/checkpointer.jl
@@ -2,9 +2,7 @@ using Glob
 
 using Oceananigans
 using Oceananigans: fields, prognostic_fields
-using Oceananigans.Grids: interior_parent_indices
-using Oceananigans.Fields: offset_data, instantiate
-using Oceananigans.BoundaryConditions
+using Oceananigans.Fields: offset_data
 using Oceananigans.TimeSteppers: RungeKutta3TimeStepper, QuasiAdamsBashforth2TimeStepper
 
 import Oceananigans.Fields: set!
diff --git a/src/Simulations/run.jl b/src/Simulations/run.jl
index bed4bf6c53..c9acc13d89 100644
--- a/src/Simulations/run.jl
+++ b/src/Simulations/run.jl
@@ -41,7 +41,6 @@ and callback schedules. Alignment with `sim.stop_time` takes precedence.
 function aligned_time_step(sim::Simulation, Δt)
     clock = sim.model.clock
 
-    FT = eltype(Δt)
     aligned_Δt = Δt
 
     # Align time step with output writing and callback execution
@@ -53,7 +52,7 @@ function aligned_time_step(sim::Simulation, Δt)
     # Temporary fix for https://github.com/CliMA/Oceananigans.jl/issues/1280
     aligned_Δt = aligned_Δt <= 0 ? Δt : aligned_Δt
 
-    return convert(FT, aligned_Δt)
+    return aligned_Δt
 end
 
 """
diff --git a/src/TimeSteppers/quasi_adams_bashforth_2.jl b/src/TimeSteppers/quasi_adams_bashforth_2.jl
index 1ac9ede8f7..6c3854e843 100644
--- a/src/TimeSteppers/quasi_adams_bashforth_2.jl
+++ b/src/TimeSteppers/quasi_adams_bashforth_2.jl
@@ -154,7 +154,7 @@ Time step velocity fields via the 2nd-order quasi Adams-Bashforth method
     one_point_five = convert(FT, 1.5)
     oh_point_five  = convert(FT, 0.5)
 
-    @inbounds u[i, j, k] += Δt * ((one_point_five + χ) * Gⁿ[i, j, k] - (oh_point_five + χ) * G⁻[i, j, k])
+    @inbounds u[i, j, k] += convert(FT, Δt) * ((one_point_five + χ) * Gⁿ[i, j, k] - (oh_point_five + χ) * G⁻[i, j, k])
 end
 
 @kernel ab2_step_field!(::FunctionField, Δt, χ, Gⁿ, G⁻) = nothing
diff --git a/src/TimeSteppers/runge_kutta_3.jl b/src/TimeSteppers/runge_kutta_3.jl
index 02b1008bf6..ac28b59c21 100644
--- a/src/TimeSteppers/runge_kutta_3.jl
+++ b/src/TimeSteppers/runge_kutta_3.jl
@@ -181,18 +181,18 @@ Uᵐ⁺¹ = Uᵐ + Δt * (γᵐ * Gᵐ + ζᵐ * Gᵐ⁻¹)
 
 where `m` denotes the substage.
 """
-@kernel function rk3_substep_field!(U, Δt, γⁿ, ζⁿ, Gⁿ, G⁻)
+@kernel function rk3_substep_field!(U, Δt, γⁿ::FT, ζⁿ, Gⁿ, G⁻) where FT
     i, j, k = @index(Global, NTuple)
 
     @inbounds begin
-        U[i, j, k] += Δt * (γⁿ * Gⁿ[i, j, k] + ζⁿ * G⁻[i, j, k])
+        U[i, j, k] += convert(FT, Δt) * (γⁿ * Gⁿ[i, j, k] + ζⁿ * G⁻[i, j, k])
     end
 end
 
-@kernel function rk3_substep_field!(U, Δt, γ¹, ::Nothing, G¹, G⁰)
+@kernel function rk3_substep_field!(U, Δt, γ¹::FT, ::Nothing, G¹, G⁰) where FT
     i, j, k = @index(Global, NTuple)
 
     @inbounds begin
-        U[i, j, k] += Δt * γ¹ * G¹[i, j, k]
+        U[i, j, k] += convert(FT, Δt) * γ¹ * G¹[i, j, k]
     end
 end

From 234bd8e32d32f558a46cdbfd9736d5908348b06d Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 15 Aug 2023 16:39:03 -0400
Subject: [PATCH 471/530] maximum

---
 .../split_explicit_free_surface_kernels.jl                      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
index 696e2ae36f..2d16d20f06 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
@@ -306,7 +306,7 @@ const FNS = FixedSubstepNumber
 const FTS = FixedTimeStepSize
 
 @inline calculate_substeps(substepping::FNS, Δt) = length(substepping.averaging_weights)
-@inline calculate_substeps(substepping::FTS, Δt) = ceil(Int, 2 * Δt / substepping.Δt_barotropic)
+@inline calculate_substeps(substepping::FTS, Δt) = max(5, ceil(Int, 2 * Δt / substepping.Δt_barotropic))
 
 @inline calculate_adaptive_settings(substepping::FNS, substeps) = substepping.fractional_step_size, substepping.averaging_weights
 @inline calculate_adaptive_settings(substepping::FTS, substeps) = weights_from_substeps(eltype(substepping.Δt_barotropic), 

From d6e338d8de4a559d972a37c065f6dd7015c48bfe Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Tue, 15 Aug 2023 19:48:19 -0400
Subject: [PATCH 472/530] minimum substeps

---
 .../split_explicit_free_surface_kernels.jl                  | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
index 2d16d20f06..77d3cf27b0 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/split_explicit_free_surface_kernels.jl
@@ -305,8 +305,12 @@ end
 const FNS = FixedSubstepNumber
 const FTS = FixedTimeStepSize
 
+# since weights can be negative in the first few substeps (as in the default averaging kernel), 
+# we set a minimum number of substeps to execute to avoid numerical issues
+const MINIMUM_SUBSTEPS = 5
+
 @inline calculate_substeps(substepping::FNS, Δt) = length(substepping.averaging_weights)
-@inline calculate_substeps(substepping::FTS, Δt) = max(5, ceil(Int, 2 * Δt / substepping.Δt_barotropic))
+@inline calculate_substeps(substepping::FTS, Δt) = max(MINIMUM_SUBSTEPS, ceil(Int, 2 * Δt / substepping.Δt_barotropic))
 
 @inline calculate_adaptive_settings(substepping::FNS, substeps) = substepping.fractional_step_size, substepping.averaging_weights
 @inline calculate_adaptive_settings(substepping::FTS, substeps) = weights_from_substeps(eltype(substepping.Δt_barotropic), 

From 2eae7743e87d025d0bbce0b45e04240a8e9e1458 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 16 Aug 2023 12:14:16 -0400
Subject: [PATCH 473/530] more flexibility

---
 src/Advection/vector_invariant_advection.jl    | 18 +++++++++++++-----
 .../vector_invariant_cross_upwinding.jl        | 10 +++++-----
 .../vector_invariant_self_upwinding.jl         | 10 +++++-----
 .../vector_invariant_velocity_upwinding.jl     | 10 +++++-----
 4 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/src/Advection/vector_invariant_advection.jl b/src/Advection/vector_invariant_advection.jl
index e45c96fe16..bef8d7856e 100644
--- a/src/Advection/vector_invariant_advection.jl
+++ b/src/Advection/vector_invariant_advection.jl
@@ -7,16 +7,17 @@ struct EnstrophyConservingScheme{FT} <: AbstractAdvectionScheme{1, FT} end
 EnergyConservingScheme(FT::DataType = Float64)    = EnergyConservingScheme{FT}()
 EnstrophyConservingScheme(FT::DataType = Float64) = EnstrophyConservingScheme{FT}()
 
-struct VectorInvariant{N, FT, Z, ZS, V, K, D, M} <: AbstractAdvectionScheme{N, FT}
+struct VectorInvariant{N, FT, Z, ZS, V, K, U, M} <: AbstractAdvectionScheme{N, FT}
     vorticity_scheme   :: Z  # reconstruction scheme for vorticity flux
     vorticity_stencil  :: ZS # stencil used for assessing vorticity smoothness
     vertical_scheme    :: V  # recontruction scheme for vertical advection
     ke_gradient_scheme :: K  # reconstruction scheme for kinetic energy gradient
-    upwinding          :: D  # treatment of upwinding for divergence flux and kinetic energy gradient
+    divergence_scheme  :: D  # reconstruction scheme for divergence flux
+    upwinding          :: U  # treatment of upwinding for divergence flux and kinetic energy gradient
 
     VectorInvariant{N, FT, M}(vorticity_scheme::Z, vorticity_stencil::ZS, vertical_scheme::V, 
-                              ke_gradient_scheme::K, upwinding::D) where {N, FT, Z, ZS, V, K, D, M} =
-        new{N, FT, Z, ZS, V, K, D, M}(vorticity_scheme, vorticity_stencil, vertical_scheme, ke_gradient_scheme, upwinding)
+                              ke_gradient_scheme::K, upwinding::U) where {N, FT, Z, ZS, V, K, U, M} =
+        new{N, FT, Z, ZS, V, K, U, M}(vorticity_scheme, vorticity_stencil, vertical_scheme, ke_gradient_scheme, upwinding)
 end
 
 """
@@ -81,10 +82,16 @@ function VectorInvariant(; vorticity_scheme::AbstractAdvectionScheme{N, FT} = En
                            vorticity_stencil    = VelocityStencil(),
                            vertical_scheme      = EnergyConservingScheme(),
                            ke_gradient_scheme   = vertical_scheme,
+                           divergence_scheme    = vorticity_scheme,
                            upwinding  = OnlySelfUpwinding(; cross_scheme = vertical_scheme),
                            multi_dimensional_stencil = false) where {N, FT}
         
-    return VectorInvariant{N, FT, multi_dimensional_stencil}(vorticity_scheme, vorticity_stencil, vertical_scheme, ke_gradient_scheme, upwinding)
+    return VectorInvariant{N, FT, multi_dimensional_stencil}(vorticity_scheme,
+                                                             vorticity_stencil, 
+                                                             vertical_scheme, 
+                                                             ke_gradient_scheme, 
+                                                             divergence_scheme, 
+                                                             upwinding)
 end
 
 const VectorInvariantEnergyConserving           = VectorInvariant{<:Any, <:Any, <:EnergyConservingScheme}
@@ -117,6 +124,7 @@ Adapt.adapt_structure(to, scheme::VectorInvariant{N, FT, Z, ZS, V, K, D, M}) whe
                                   Adapt.adapt(to, scheme.vorticity_stencil), 
                                   Adapt.adapt(to, scheme.vertical_scheme),
                                   Adapt.adapt(to, scheme.ke_gradient_scheme),
+                                  Adapt.adapt(to, scheme.divergence_scheme),
                                   Adapt.adapt(to, scheme.upwinding))
 
 @inline U_dot_∇u(i, j, k, grid, scheme::VectorInvariant, U) = (
diff --git a/src/Advection/vector_invariant_cross_upwinding.jl b/src/Advection/vector_invariant_cross_upwinding.jl
index df9c3a5531..23ccd1c684 100644
--- a/src/Advection/vector_invariant_cross_upwinding.jl
+++ b/src/Advection/vector_invariant_cross_upwinding.jl
@@ -1,4 +1,4 @@
-const VectorInvariantCrossVerticalUpwinding = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme, <:Any, <:CrossAndSelfUpwinding}
+const VectorInvariantCrossVerticalUpwinding = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme, <:Any, <:Any, <:CrossAndSelfUpwinding}
 
 #####
 ##### Cross upwinding results in the largest kinetic energy content, 
@@ -23,8 +23,8 @@ const VectorInvariantCrossVerticalUpwinding = VectorInvariant{<:Any, <:Any, <:An
     @inbounds û = u[i, j, k]
     δ_stencil = scheme.upwinding.divergence_stencil
 
-    δᴸ =  _left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, flux_div_xyᶜᶜᶜ, δ_stencil, u, v) 
-    δᴿ = _right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, flux_div_xyᶜᶜᶜ, δ_stencil, u, v) 
+    δᴸ =  _left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.divergence_scheme, flux_div_xyᶜᶜᶜ, δ_stencil, u, v) 
+    δᴿ = _right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.divergence_scheme, flux_div_xyᶜᶜᶜ, δ_stencil, u, v) 
 
     return upwind_biased_product(û, δᴸ, δᴿ)
 end
@@ -33,8 +33,8 @@ end
     @inbounds v̂ = v[i, j, k]
     δ_stencil = scheme.upwinding.divergence_stencil
 
-    δᴸ =  _left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, flux_div_xyᶜᶜᶜ, δ_stencil, u, v) 
-    δᴿ = _right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, flux_div_xyᶜᶜᶜ, δ_stencil, u, v) 
+    δᴸ =  _left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.divergence_scheme, flux_div_xyᶜᶜᶜ, δ_stencil, u, v) 
+    δᴿ = _right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.divergence_scheme, flux_div_xyᶜᶜᶜ, δ_stencil, u, v) 
 
     return upwind_biased_product(v̂, δᴸ, δᴿ) 
 end
diff --git a/src/Advection/vector_invariant_self_upwinding.jl b/src/Advection/vector_invariant_self_upwinding.jl
index 527640752a..1c34388e60 100644
--- a/src/Advection/vector_invariant_self_upwinding.jl
+++ b/src/Advection/vector_invariant_self_upwinding.jl
@@ -1,4 +1,4 @@
-const VectorInvariantSelfVerticalUpwinding = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme, <:Any, <:OnlySelfUpwinding}
+const VectorInvariantSelfVerticalUpwinding = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme, <:Any, <:Any, <:OnlySelfUpwinding}
 
 ##### 
 ##### Self Upwinding of Divergence Flux, the best option!
@@ -21,8 +21,8 @@ const VectorInvariantSelfVerticalUpwinding = VectorInvariant{<:Any, <:Any, <:Any
 
     @inbounds û = u[i, j, k]
     δvˢ =    _symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, cross_scheme, δy_V, u, v) 
-    δuᴸ =  _left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, δx_U, δU_stencil, u, v) 
-    δuᴿ = _right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, δx_U, δU_stencil, u, v) 
+    δuᴸ =  _left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.divergence_scheme, δx_U, δU_stencil, u, v) 
+    δuᴿ = _right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, scheme.divergence_scheme, δx_U, δU_stencil, u, v) 
 
     return upwind_biased_product(û, δuᴸ, δuᴿ) + û * δvˢ
 end
@@ -34,8 +34,8 @@ end
 
     @inbounds v̂ = v[i, j, k]
     δuˢ =    _symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, cross_scheme, δx_U, u, v)
-    δvᴸ =  _left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, δy_V, δV_stencil, u, v) 
-    δvᴿ = _right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, δy_V, δV_stencil, u, v) 
+    δvᴸ =  _left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.divergence_scheme, δy_V, δV_stencil, u, v) 
+    δvᴿ = _right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, scheme.divergence_scheme, δy_V, δV_stencil, u, v) 
 
     return upwind_biased_product(v̂, δvᴸ, δvᴿ) + v̂ * δuˢ
 end
diff --git a/src/Advection/vector_invariant_velocity_upwinding.jl b/src/Advection/vector_invariant_velocity_upwinding.jl
index 2cb235ddaf..90a5cd56cc 100644
--- a/src/Advection/vector_invariant_velocity_upwinding.jl
+++ b/src/Advection/vector_invariant_velocity_upwinding.jl
@@ -1,4 +1,4 @@
-const VectorInvariantVelocityVerticalUpwinding  = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme, <:Any, <:VelocityUpwinding}
+const VectorInvariantVelocityVerticalUpwinding  = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme, <:Any, <:Any, <:VelocityUpwinding}
 
 #####
 ##### Velocity upwinding is a Partial Upwinding where the upwind choice occurrs _inside_
@@ -13,8 +13,8 @@ const VectorInvariantVelocityVerticalUpwinding  = VectorInvariant{<:Any, <:Any,
 @inline function upwinded_Ax_uᶜᶜᶜ(i, j, k, grid, scheme, u) 
     û = ℑxᶜᵃᵃ(i, j, k, grid, u)
 
-    Uᴸ =  _left_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, Ax_qᶠᶜᶜ, u)
-    Uᴿ = _right_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, Ax_qᶠᶜᶜ, u)
+    Uᴸ =  _left_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, scheme.divergence_scheme, Ax_qᶠᶜᶜ, u)
+    Uᴿ = _right_biased_interpolate_xᶜᵃᵃ(i, j, k, grid, scheme, scheme.divergence_scheme, Ax_qᶠᶜᶜ, u)
 
     return ifelse(û > 0, Uᴸ, Uᴿ)
 end
@@ -22,8 +22,8 @@ end
 @inline function upwinded_Ay_vᶜᶜᶜ(i, j, k, grid, scheme, v) 
     v̂ = ℑyᵃᶜᵃ(i, j, k, grid, v)
 
-    Vᴸ =  _left_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, Ay_qᶜᶠᶜ, v)
-    Vᴿ = _right_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, scheme.vertical_scheme, Ay_qᶜᶠᶜ, v)
+    Vᴸ =  _left_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, scheme.divergence_scheme, Ay_qᶜᶠᶜ, v)
+    Vᴿ = _right_biased_interpolate_yᵃᶜᵃ(i, j, k, grid, scheme, scheme.divergence_scheme, Ay_qᶜᶠᶜ, v)
 
     return ifelse(v̂ > 0, Vᴸ, Vᴿ)
 end

From 04242b43d74915521bdb6ed983a189908df15442 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 16 Aug 2023 12:34:05 -0400
Subject: [PATCH 474/530] bugfix

---
 src/Advection/vector_invariant_advection.jl | 13 +++++++------
 src/MultiRegion/multi_region_models.jl      |  6 ++++--
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/src/Advection/vector_invariant_advection.jl b/src/Advection/vector_invariant_advection.jl
index bef8d7856e..ca62aca924 100644
--- a/src/Advection/vector_invariant_advection.jl
+++ b/src/Advection/vector_invariant_advection.jl
@@ -7,7 +7,7 @@ struct EnstrophyConservingScheme{FT} <: AbstractAdvectionScheme{1, FT} end
 EnergyConservingScheme(FT::DataType = Float64)    = EnergyConservingScheme{FT}()
 EnstrophyConservingScheme(FT::DataType = Float64) = EnstrophyConservingScheme{FT}()
 
-struct VectorInvariant{N, FT, Z, ZS, V, K, U, M} <: AbstractAdvectionScheme{N, FT}
+struct VectorInvariant{N, FT, Z, ZS, V, K, D, U, M} <: AbstractAdvectionScheme{N, FT}
     vorticity_scheme   :: Z  # reconstruction scheme for vorticity flux
     vorticity_stencil  :: ZS # stencil used for assessing vorticity smoothness
     vertical_scheme    :: V  # recontruction scheme for vertical advection
@@ -16,8 +16,8 @@ struct VectorInvariant{N, FT, Z, ZS, V, K, U, M} <: AbstractAdvectionScheme{N, F
     upwinding          :: U  # treatment of upwinding for divergence flux and kinetic energy gradient
 
     VectorInvariant{N, FT, M}(vorticity_scheme::Z, vorticity_stencil::ZS, vertical_scheme::V, 
-                              ke_gradient_scheme::K, upwinding::U) where {N, FT, Z, ZS, V, K, U, M} =
-        new{N, FT, Z, ZS, V, K, U, M}(vorticity_scheme, vorticity_stencil, vertical_scheme, ke_gradient_scheme, upwinding)
+                              ke_gradient_scheme::K, divergence_scheme::D, upwinding::U) where {N, FT, Z, ZS, V, K, D, U, M} =
+        new{N, FT, Z, ZS, V, K, D, U, M}(vorticity_scheme, vorticity_stencil, vertical_scheme, ke_gradient_scheme, divergence_scheme, upwinding)
 end
 
 """
@@ -37,8 +37,9 @@ Keyword arguments
 - `vorticity_stencil`: Stencil used for smoothness indicators in case of a `WENO` upwind reconstruction. Choices are between `VelocityStencil`
                        which uses the horizontal velocity field to diagnose smoothness and `DefaultStencil` which uses the variable
                        being transported (defaults to `VelocityStencil()`)
-- `vertical_scheme`: Scheme used for vertical advection of horizontal momentum and upwinding of divergence. Defaults to `EnergyConservingScheme()`.
-- `ke_gradient_scheme`: Scheme used for kinetic energy gradient. Defaults to `vertical_advection`.
+- `vertical_scheme`: Scheme used for vertical advection of horizontal momentum. Defaults to `EnergyConservingScheme()`.
+- `ke_gradient_scheme`: Scheme used for kinetic energy gradient. Defaults to `vertical_scheme`.
+- `divergence_scheme`: Scheme used for divergence flux (only upwinding options are allowed). Defaults to `vorticity_scheme`.
 - `upwinding`: Treatment of upwinding in case of Upwinding reconstruction of divergence and kinetic energy gradient. Choices are between
                          `CrossAndSelfUpwinding()`, `OnlySelfUpwinding()`, and `VelocityUpwinding()` (defaults to `OnlySelfUpwinding()`).
 - `multi_dimensional_stencil` : if `true`, use a horizontal two dimensional stencil for the reconstruction of vorticity, divergence and kinetic energy gradient.
@@ -119,7 +120,7 @@ Base.show(io::IO, a::VectorInvariant{N, FT}) where {N, FT} =
 # halo for vector invariant advection
 required_halo_size(scheme::VectorInvariant{N}) where N = N == 1 ? N : N + 1
 
-Adapt.adapt_structure(to, scheme::VectorInvariant{N, FT, Z, ZS, V, K, D, M}) where {N, FT, Z, ZS, V, K, D, M} =
+Adapt.adapt_structure(to, scheme::VectorInvariant{N, FT, Z, ZS, V, K, D, U, M}) where {N, FT, Z, ZS, V, K, D, U, M} =
         VectorInvariant{N, FT, M}(Adapt.adapt(to, scheme.vorticity_scheme), 
                                   Adapt.adapt(to, scheme.vorticity_stencil), 
                                   Adapt.adapt(to, scheme.vertical_scheme),
diff --git a/src/MultiRegion/multi_region_models.jl b/src/MultiRegion/multi_region_models.jl
index edff3c2ec0..d7a4bc9982 100644
--- a/src/MultiRegion/multi_region_models.jl
+++ b/src/MultiRegion/multi_region_models.jl
@@ -64,18 +64,20 @@ implicit_diffusion_solver(time_discretization::VerticallyImplicitTimeDiscretizat
 
 WENO(mrg::MultiRegionGrid, args...; kwargs...) = construct_regionally(WENO, mrg, args...; kwargs...)
 
-@inline  getregion(t::VectorInvariant{N, FT, Z, ZS, V, K, D, M}, r) where {N, FT, Z, ZS, V, K, D, M} = 
+@inline  getregion(t::VectorInvariant{N, FT, Z, ZS, V, K, D, U, M}, r) where {N, FT, Z, ZS, V, K, D, U, M} = 
                 VectorInvariant{N, FT, M}(_getregion(t.vorticity_scheme, r), 
                                           _getregion(t.vorticity_stencil, r), 
                                           _getregion(t.vertical_scheme, r),
                                           _getregion(t.ke_gradient_scheme, r),
+                                          _getregion(t.divergence_scheme, r),
                                           _getregion(t.upwinding, r))
 
-@inline _getregion(t::VectorInvariant{N, FT, Z, ZS, V, K, D, M}, r) where {N, FT, Z, ZS, V, K, D, M} = 
+@inline _getregion(t::VectorInvariant{N, FT, Z, ZS, V, K, D, U, M}, r) where {N, FT, Z, ZS, V, K, D, U, M} = 
                 VectorInvariant{N, FT, M}(getregion(t.vorticity_scheme, r), 
                                           getregion(t.vorticity_stencil, r), 
                                           getregion(t.vertical_scheme, r),
                                           getregion(t.ke_gradient_scheme, r),
+                                          getregion(t.divergence_scheme, r),
                                           getregion(t.upwinding, r))
 
 function cell_advection_timescale(grid::MultiRegionGrid, velocities)

From 2b009584ceeda5469f02bcc94c954b36c43a794a Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 18 Aug 2023 10:16:10 -0400
Subject: [PATCH 475/530] multidimensional

---
 src/Advection/vector_invariant_advection.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Advection/vector_invariant_advection.jl b/src/Advection/vector_invariant_advection.jl
index ca62aca924..2c374e382c 100644
--- a/src/Advection/vector_invariant_advection.jl
+++ b/src/Advection/vector_invariant_advection.jl
@@ -101,7 +101,7 @@ const VectorInvariantVerticalEnergyConserving   = VectorInvariant{<:Any, <:Any,
 const VectorInvariantKEGradientEnergyConserving = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:Any, <:EnergyConservingScheme}
 
 const VectorInvariantUpwindVorticity  = VectorInvariant{<:Any, <:Any, <:AbstractUpwindBiasedAdvectionScheme}
-const MultiDimensionalVectorInvariant = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:Any, true}
+const MultiDimensionalVectorInvariant = VectorInvariant{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:Any, true}
 
 Base.summary(a::VectorInvariant)                 = string("Vector Invariant, Dimension-by-dimension reconstruction")
 Base.summary(a::MultiDimensionalVectorInvariant) = string("Vector Invariant, Multidimensional reconstruction")

From 3202adb7842fc8c6d30056449c932b15b46ac110 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 21 Aug 2023 17:52:37 -0400
Subject: [PATCH 476/530] fallback methods

---
 src/Distributed/partition_assemble.jl | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/Distributed/partition_assemble.jl b/src/Distributed/partition_assemble.jl
index 92e59655d4..70e724b2c7 100644
--- a/src/Distributed/partition_assemble.jl
+++ b/src/Distributed/partition_assemble.jl
@@ -110,10 +110,11 @@ end
 Partition a global array in local arrays of size `(nx, ny)` if 2D or `(nx, ny, nz)` is 3D.
 Usefull for boundary arrays, forcings and initial conditions.
 """
-partition_global_array(arch, c_global::Function, n) = c_global 
+partition_global_array(arch, c_global::AbstractArray, n) = c_global
+partition_global_array(arch, c_global::Function, n)      = c_global 
 
 # Here we assume that we cannot partition in z (we should remove support for that)
-function partition_global_array(arch, c_global::AbstractArray, n) 
+function partition_global_array(arch::DistributedArch, c_global::AbstractArray, n) 
     c_global = arch_array(CPU(), c_global)
 
     ri, rj, rk = arch.local_index
@@ -144,10 +145,11 @@ end
 Construct global array from local arrays (2D of size `(nx, ny)` or 3D of size (`nx, ny, nz`)).
 Usefull for boundary arrays, forcings and initial conditions.
 """
-construct_global_array(arch, c_local::Function, N) = c_local
+construct_global_array(arch, c_local::AbstractArray, n) = c_local
+construct_global_array(arch, c_local::Function, N)      = c_local
 
 # TODO: This does not work for 3D parallelizations!!!
-function construct_global_array(arch, c_local::AbstractArray, n) 
+function construct_global_array(arch::DistributedArch, c_local::AbstractArray, n) 
     c_local = arch_array(CPU(), c_local)
 
     ri, rj, rk = arch.local_index

From 086b21ebf5ea73b9c332377f6b9a8e0b5dba4a01 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 22 Aug 2023 08:57:46 -0400
Subject: [PATCH 477/530] test a thing

---
 src/Advection/vector_invariant_advection.jl | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/Advection/vector_invariant_advection.jl b/src/Advection/vector_invariant_advection.jl
index 2c374e382c..9e8838862b 100644
--- a/src/Advection/vector_invariant_advection.jl
+++ b/src/Advection/vector_invariant_advection.jl
@@ -299,6 +299,13 @@ const UZ{N} = UpwindBiased{N, <:Any, <:Any, <:Any, <:Nothing}
 # To adapt passing smoothness stencils to upwind biased schemes (not weno) 
 for buffer in 1:6
     @eval begin
+        @inline inner_symmetric_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::Centered{$buffer}, f::Function, idx, loc, VI::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
+        @inline inner_symmetric_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::Centered{$buffer}, f::Function, idx, loc, VI::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
+        @inline inner_symmetric_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme::Centered{$buffer}, f::Function, idx, loc, VI::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
+        @inline inner_symmetric_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme::Centered{$buffer}, f::Function, idx, loc, VI::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
+        @inline inner_symmetric_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme::Centered{$buffer}, f::Function, idx, loc, VI::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, f, idx, loc, args...)
+        @inline inner_symmetric_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme::Centered{$buffer}, f::Function, idx, loc, VI::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, f, idx, loc, args...)
+
         @inline inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::U{$buffer},  f::Function, idx, loc, VI::AbstractSmoothnessStencil, args...) = inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
         @inline inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::UX{$buffer}, f::Function, idx, loc, VI::AbstractSmoothnessStencil, args...) = inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
         @inline inner_left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme::U{$buffer},  f::Function, idx, loc, VI::AbstractSmoothnessStencil, args...) = inner_left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, f, idx, loc, args...)

From 9e1728fce3a3096f5abcea0dbdb882a393bc1cc9 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 22 Aug 2023 09:08:20 -0400
Subject: [PATCH 478/530] change

---
 src/Advection/vector_invariant_advection.jl | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/Advection/vector_invariant_advection.jl b/src/Advection/vector_invariant_advection.jl
index 9e8838862b..abc3f37ca1 100644
--- a/src/Advection/vector_invariant_advection.jl
+++ b/src/Advection/vector_invariant_advection.jl
@@ -300,11 +300,8 @@ const UZ{N} = UpwindBiased{N, <:Any, <:Any, <:Any, <:Nothing}
 for buffer in 1:6
     @eval begin
         @inline inner_symmetric_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::Centered{$buffer}, f::Function, idx, loc, VI::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_symmetric_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::Centered{$buffer}, f::Function, idx, loc, VI::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_symmetric_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme::Centered{$buffer}, f::Function, idx, loc, VI::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
         @inline inner_symmetric_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme::Centered{$buffer}, f::Function, idx, loc, VI::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
         @inline inner_symmetric_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme::Centered{$buffer}, f::Function, idx, loc, VI::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_symmetric_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme::Centered{$buffer}, f::Function, idx, loc, VI::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, f, idx, loc, args...)
 
         @inline inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::U{$buffer},  f::Function, idx, loc, VI::AbstractSmoothnessStencil, args...) = inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
         @inline inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::UX{$buffer}, f::Function, idx, loc, VI::AbstractSmoothnessStencil, args...) = inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)

From f6f0f3e85cd3f5153cf0b9ff4102b4c56f963e8f Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 22 Aug 2023 09:08:37 -0400
Subject: [PATCH 479/530] change

---
 src/Advection/vector_invariant_advection.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Advection/vector_invariant_advection.jl b/src/Advection/vector_invariant_advection.jl
index abc3f37ca1..8c3c0fd470 100644
--- a/src/Advection/vector_invariant_advection.jl
+++ b/src/Advection/vector_invariant_advection.jl
@@ -296,7 +296,7 @@ const UX{N} = UpwindBiased{N, <:Any, <:Nothing}
 const UY{N} = UpwindBiased{N, <:Any, <:Any, <:Nothing}
 const UZ{N} = UpwindBiased{N, <:Any, <:Any, <:Any, <:Nothing}
 
-# To adapt passing smoothness stencils to upwind biased schemes (not weno) 
+# To adapt passing smoothness stencils to upwind biased schemes and centered schemes (not weno) 
 for buffer in 1:6
     @eval begin
         @inline inner_symmetric_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::Centered{$buffer}, f::Function, idx, loc, VI::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)

From 7e61e0bc10f1c2fce733e2291b847d1b4b368b83 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 22 Aug 2023 09:10:31 -0400
Subject: [PATCH 480/530] change

---
 src/Advection/vector_invariant_advection.jl | 34 ++++++++++-----------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/src/Advection/vector_invariant_advection.jl b/src/Advection/vector_invariant_advection.jl
index 8c3c0fd470..57a90c1389 100644
--- a/src/Advection/vector_invariant_advection.jl
+++ b/src/Advection/vector_invariant_advection.jl
@@ -299,22 +299,22 @@ const UZ{N} = UpwindBiased{N, <:Any, <:Any, <:Any, <:Nothing}
 # To adapt passing smoothness stencils to upwind biased schemes and centered schemes (not weno) 
 for buffer in 1:6
     @eval begin
-        @inline inner_symmetric_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::Centered{$buffer}, f::Function, idx, loc, VI::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_symmetric_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme::Centered{$buffer}, f::Function, idx, loc, VI::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_symmetric_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme::Centered{$buffer}, f::Function, idx, loc, VI::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, f, idx, loc, args...)
-
-        @inline inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::U{$buffer},  f::Function, idx, loc, VI::AbstractSmoothnessStencil, args...) = inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::UX{$buffer}, f::Function, idx, loc, VI::AbstractSmoothnessStencil, args...) = inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme::U{$buffer},  f::Function, idx, loc, VI::AbstractSmoothnessStencil, args...) = inner_left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme::UY{$buffer}, f::Function, idx, loc, VI::AbstractSmoothnessStencil, args...) = inner_left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_left_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme::U{$buffer},  f::Function, idx, loc, VI::AbstractSmoothnessStencil, args...) = inner_left_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_left_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme::UZ{$buffer}, f::Function, idx, loc, VI::AbstractSmoothnessStencil, args...) = inner_left_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, f, idx, loc, args...)
-
-        @inline inner_right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::U{$buffer},  f::Function, idx, loc, VI::AbstractSmoothnessStencil, args...) = inner_right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::UX{$buffer}, f::Function, idx, loc, VI::AbstractSmoothnessStencil, args...) = inner_right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme::U{$buffer},  f::Function, idx, loc, VI::AbstractSmoothnessStencil, args...) = inner_right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme::UY{$buffer}, f::Function, idx, loc, VI::AbstractSmoothnessStencil, args...) = inner_right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_right_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme::U{$buffer},  f::Function, idx, loc, VI::AbstractSmoothnessStencil, args...) = inner_right_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_right_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme::UZ{$buffer}, f::Function, idx, loc, VI::AbstractSmoothnessStencil, args...) = inner_right_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, f, idx, loc, args...)
+        @inline inner_symmetric_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::Centered{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
+        @inline inner_symmetric_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme::Centered{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
+        @inline inner_symmetric_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme::Centered{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, f, idx, loc, args...)
+
+        @inline inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::U{$buffer},  f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
+        @inline inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::UX{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
+        @inline inner_left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme::U{$buffer},  f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
+        @inline inner_left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme::UY{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
+        @inline inner_left_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme::U{$buffer},  f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_left_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, f, idx, loc, args...)
+        @inline inner_left_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme::UZ{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_left_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, f, idx, loc, args...)
+
+        @inline inner_right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::U{$buffer},  f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
+        @inline inner_right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::UX{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
+        @inline inner_right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme::U{$buffer},  f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
+        @inline inner_right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme::UY{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
+        @inline inner_right_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme::U{$buffer},  f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_right_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, f, idx, loc, args...)
+        @inline inner_right_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme::UZ{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_right_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, f, idx, loc, args...)
     end
 end

From ecb5664cc8e97ae69f936875e7b32e270b80ddc9 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 22 Aug 2023 09:38:45 -0400
Subject: [PATCH 481/530] change

---
 src/Advection/vector_invariant_advection.jl | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/Advection/vector_invariant_advection.jl b/src/Advection/vector_invariant_advection.jl
index 57a90c1389..f2e7bd5e1f 100644
--- a/src/Advection/vector_invariant_advection.jl
+++ b/src/Advection/vector_invariant_advection.jl
@@ -296,12 +296,21 @@ const UX{N} = UpwindBiased{N, <:Any, <:Nothing}
 const UY{N} = UpwindBiased{N, <:Any, <:Any, <:Nothing}
 const UZ{N} = UpwindBiased{N, <:Any, <:Any, <:Any, <:Nothing}
 
+const C{N}  = Centered{N}
+const CX{N} = Centered{N, <:Any, <:Nothing}
+const CY{N} = Centered{N, <:Any, <:Any, <:Nothing}
+const CZ{N} = Centered{N, <:Any, <:Any, <:Any, <:Nothing}
+
 # To adapt passing smoothness stencils to upwind biased schemes and centered schemes (not weno) 
 for buffer in 1:6
     @eval begin
-        @inline inner_symmetric_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::Centered{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_symmetric_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme::Centered{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_symmetric_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme::Centered{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, f, idx, loc, args...)
+        @inline inner_symmetric_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::C{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
+        @inline inner_symmetric_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme::C{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
+        @inline inner_symmetric_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme::C{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, f, idx, loc, args...)
+
+        @inline inner_symmetric_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::CX{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
+        @inline inner_symmetric_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme::CY{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
+        @inline inner_symmetric_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme::CZ{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, f, idx, loc, args...)
 
         @inline inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::U{$buffer},  f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
         @inline inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::UX{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)

From 38bc808178c11b6912ad468a1ef2b907b94b963e Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 22 Aug 2023 09:53:51 -0400
Subject: [PATCH 482/530] update

---
 src/Advection/vector_invariant_advection.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Advection/vector_invariant_advection.jl b/src/Advection/vector_invariant_advection.jl
index f2e7bd5e1f..e82435f088 100644
--- a/src/Advection/vector_invariant_advection.jl
+++ b/src/Advection/vector_invariant_advection.jl
@@ -296,7 +296,7 @@ const UX{N} = UpwindBiased{N, <:Any, <:Nothing}
 const UY{N} = UpwindBiased{N, <:Any, <:Any, <:Nothing}
 const UZ{N} = UpwindBiased{N, <:Any, <:Any, <:Any, <:Nothing}
 
-const C{N}  = Centered{N}
+const C{N}  = Centered{N, <:Any}
 const CX{N} = Centered{N, <:Any, <:Nothing}
 const CY{N} = Centered{N, <:Any, <:Any, <:Nothing}
 const CZ{N} = Centered{N, <:Any, <:Any, <:Any, <:Nothing}

From 836d62990d82fb2456e6016f1561e17566ad505b Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 22 Aug 2023 09:54:22 -0400
Subject: [PATCH 483/530] update

---
 src/Advection/vector_invariant_advection.jl | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/Advection/vector_invariant_advection.jl b/src/Advection/vector_invariant_advection.jl
index e82435f088..72bf327b08 100644
--- a/src/Advection/vector_invariant_advection.jl
+++ b/src/Advection/vector_invariant_advection.jl
@@ -304,13 +304,13 @@ const CZ{N} = Centered{N, <:Any, <:Any, <:Any, <:Nothing}
 # To adapt passing smoothness stencils to upwind biased schemes and centered schemes (not weno) 
 for buffer in 1:6
     @eval begin
-        @inline inner_symmetric_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::C{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_symmetric_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme::C{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_symmetric_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme::C{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, f, idx, loc, args...)
+        @inline inner_symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::C{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
+        @inline inner_symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme::C{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
+        @inline inner_symmetric_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme::C{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_symmetric_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, f, idx, loc, args...)
 
-        @inline inner_symmetric_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::CX{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_symmetric_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme::CY{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_symmetric_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme::CZ{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_symmetric_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, f, idx, loc, args...)
+        @inline inner_symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::CX{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
+        @inline inner_symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme::CY{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
+        @inline inner_symmetric_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme::CZ{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_symmetric_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, f, idx, loc, args...)
 
         @inline inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::U{$buffer},  f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
         @inline inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::UX{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)

From 636abdb050ca063c07f33d90cd5059bb698b2d7a Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 28 Aug 2023 13:58:44 -0400
Subject: [PATCH 484/530] new offsets + return to previous KA

---
 Manifest.toml                 |  44 ++++++---------
 src/Utils/kernel_launching.jl | 101 +++++++++++++++++++++++++++++++++-
 2 files changed, 118 insertions(+), 27 deletions(-)

diff --git a/Manifest.toml b/Manifest.toml
index 4debd48758..1443d2a0c6 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -1,21 +1,19 @@
 # This file is machine-generated - editing it directly is not advised
 
-julia_version = "1.9.2"
+julia_version = "1.9.3"
 manifest_format = "2.0"
-project_hash = "ea620612cb5f84093b962d4345aa7d1b1271739c"
+project_hash = "aa82d3116bea9a2fdd8d20fa51673c89bc397f8b"
 
 [[deps.AbstractFFTs]]
-deps = ["ChainRulesCore", "LinearAlgebra"]
+deps = ["LinearAlgebra"]
 git-tree-sha1 = "cad4c758c0038eea30394b1b671526921ca85b21"
 uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c"
 version = "1.4.0"
+weakdeps = ["ChainRulesCore"]
 
     [deps.AbstractFFTs.extensions]
     AbstractFFTsChainRulesCoreExt = "ChainRulesCore"
 
-    [deps.AbstractFFTs.weakdeps]
-    ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
-
 [[deps.Adapt]]
 deps = ["LinearAlgebra", "Requires"]
 git-tree-sha1 = "76289dc51920fdc6e0013c872ba9551d54961c24"
@@ -117,12 +115,6 @@ git-tree-sha1 = "e30f2f4e20f7f186dc36529910beaedc60cfa644"
 uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 version = "1.16.0"
 
-[[deps.ChangesOfVariables]]
-deps = ["InverseFunctions", "LinearAlgebra", "Test"]
-git-tree-sha1 = "2fba81a302a7be671aefe194f0525ef231104e7f"
-uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
-version = "0.1.8"
-
 [[deps.CommonDataModel]]
 deps = ["CFTime", "DataStructures", "Dates", "Preferences", "Printf"]
 git-tree-sha1 = "2678b3fc170d582655a14d22867b031b6e43c2d4"
@@ -277,12 +269,6 @@ version = "2023.1.0+0"
 deps = ["Markdown"]
 uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 
-[[deps.InverseFunctions]]
-deps = ["Test"]
-git-tree-sha1 = "eabe3125edba5c9c10b60a160b1779a000dc8b29"
-uuid = "3587e190-3f89-42d0-90ee-14403ec27112"
-version = "0.1.11"
-
 [[deps.IrrationalConstants]]
 git-tree-sha1 = "630b497eafcc20001bba38a4651b327dcfc491d2"
 uuid = "92d709cd-6900-40b7-9082-c6be49f344b6"
@@ -319,11 +305,15 @@ version = "1.13.1"
 
 [[deps.KernelAbstractions]]
 deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "Requires", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"]
-git-tree-sha1 = "a8ed43278a827de44cef19b3f295d9db9c278f4d"
-repo-rev = "main"
-repo-url = "https://github.com/simone-silvestri/KernelAbstractions.jl"
+git-tree-sha1 = "4c5875e4c228247e1c2b087669846941fb6e0118"
 uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
-version = "0.9.7"
+version = "0.9.8"
+
+    [deps.KernelAbstractions.extensions]
+    EnzymeExt = "EnzymeCore"
+
+    [deps.KernelAbstractions.weakdeps]
+    EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
 
 [[deps.LLVM]]
 deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
@@ -669,13 +659,11 @@ deps = ["IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_j
 git-tree-sha1 = "7beb031cf8145577fbccacd94b8a8f4ce78428d3"
 uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
 version = "2.3.0"
+weakdeps = ["ChainRulesCore"]
 
     [deps.SpecialFunctions.extensions]
     SpecialFunctionsChainRulesCoreExt = "ChainRulesCore"
 
-    [deps.SpecialFunctions.weakdeps]
-    ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
-
 [[deps.Static]]
 deps = ["IfElse"]
 git-tree-sha1 = "dbde6766fc677423598138a5951269432b0fcc90"
@@ -694,10 +682,14 @@ weakdeps = ["OffsetArrays", "StaticArrays"]
     StaticArrayInterfaceStaticArraysExt = "StaticArrays"
 
 [[deps.StaticArrays]]
-deps = ["LinearAlgebra", "Random", "StaticArraysCore", "Statistics"]
+deps = ["LinearAlgebra", "Random", "StaticArraysCore"]
 git-tree-sha1 = "fffc14c695c17bfdbfa92a2a01836cdc542a1e46"
 uuid = "90137ffa-7385-5640-81b9-e52037218182"
 version = "1.6.1"
+weakdeps = ["Statistics"]
+
+    [deps.StaticArrays.extensions]
+    StaticArraysStatisticsExt = "Statistics"
 
 [[deps.StaticArraysCore]]
 git-tree-sha1 = "1d5708d926c76a505052d0d24a846d5da08bc3a4"
diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index b161a1ce11..d55b658a0d 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -17,6 +17,8 @@ offsets(::KernelParameters{S, O}) where {S, O} = O
 worktuple(workspec) = workspec
 offsets(workspec)  = nothing
 
+contiguousrange(range::NTuple{N, Int}, offset::NTuple{N, Int}) where N = Tuple(1+o:r+o for (r, o) in zip(range, offset))
+
 flatten_reduced_dimensions(worksize, dims) = Tuple(i ∈ dims ? 1 : worksize[i] for i = 1:3)
 
 function heuristic_workgroup(Wx, Wy, Wz=nothing)
@@ -108,8 +110,9 @@ function launch!(arch, grid, workspec, kernel!, kernel_args...;
         return nothing
     end
     
+    # We can only launch offset kernels with Static sizes!!!!
     loop! = isnothing(offset) ? kernel!(Architectures.device(arch), workgroup, worksize) : 
-                                kernel!(Architectures.device(arch), workgroup, worksize, offset) 
+                                kernel!(Architectures.device(arch), StaticSize(workgroup), OffsetStaticSize(contiguousrange(worksize, offset))) 
 
     @debug "Launching kernel $kernel! with worksize $worksize and offsets $offset from $workspec"
 
@@ -121,3 +124,99 @@ end
 # When dims::Val
 @inline launch!(arch, grid, ::Val{workspec}, args...; kwargs...) where workspec =
     launch!(arch, grid, workspec, args...; kwargs...)
+
+#####
+##### Extension to KA for offset indices: to remove when implemented in KA
+##### Allows to call a kernel with kernel(dev, workgroup, worksize, offsets) 
+##### where offsets is a tuple containing the offset to pass to @index
+#####
+
+# TODO: when offsets are implemented in KA so that we can call `kernel(dev, group, size, offsets)`, remove all of this
+
+using KernelAbstractions: Kernel
+using KernelAbstractions.NDIteration: _Size, StaticSize
+using KernelAbstractions.NDIteration: NDRange
+
+struct OffsetStaticSize{S} <: _Size
+    function OffsetStaticSize{S}() where S
+        new{S::Tuple{Vararg}}()
+    end
+end
+
+import Base
+import Base: @pure
+import KernelAbstractions: get, expand
+
+@pure OffsetStaticSize(s::Tuple{Vararg{Int}}) = OffsetStaticSize{s}() 
+@pure OffsetStaticSize(s::Int...) = OffsetStaticSize{s}() 
+@pure OffsetStaticSize(s::Type{<:Tuple}) = OffsetStaticSize{tuple(s.parameters...)}()
+@pure OffsetStaticSize(s::Tuple{Vararg{UnitRange{Int}}}) = OffsetStaticSize{s}()
+
+# Some @pure convenience functions for `OffsetStaticSize` (following `StaticSize` in KA)
+@pure get(::Type{OffsetStaticSize{S}}) where {S} = S
+@pure get(::OffsetStaticSize{S}) where {S} = S
+@pure Base.getindex(::OffsetStaticSize{S}, i::Int) where {S} = i <= length(S) ? S[i] : 1
+@pure Base.ndims(::OffsetStaticSize{S}) where {S}  = length(S)
+@pure Base.length(::OffsetStaticSize{S}) where {S} = prod(worksize.(S))
+
+@inline getrange(::OffsetStaticSize{S}) where {S} = worksize(S), offsets(S)
+@inline getrange(::Type{OffsetStaticSize{S}}) where {S} = worksize(S), offsets(S)
+@inline offsets(::OffsetStaticSize{S}) where {S} = Tuple(s.start - 1 for s in S)
+
+@inline worksize(i::Tuple) = worksize.(i)
+@inline worksize(i::Int) = i
+@inline worksize(i::UnitRange) = length(i)
+
+# NDRange has been modified to have offsets in place of workitems: Remember, dynamic offset kernels are not possible with this extension!!
+@inline function expand(ndrange::NDRange{N, StaticSize, StaticSize, O}, groupidx::CartesianIndex{N}, idx::CartesianIndex{N}) where {N, O<:Tuple}
+    nI = ntuple(Val(N)) do I
+        Base.@_inline_meta
+        stride = size(workitems(ndrange), I)
+        gidx = groupidx.I[I]
+        (gidx-1)*stride + idx.I[I] + ndrange.workitems[I]
+    end
+    CartesianIndex(nI)
+end
+
+using KernelAbstractions.NDIteration
+using KernelAbstractions: ndrange, workgroupsize
+import KernelAbstractions: partition
+
+using KernelAbstractions: CompilerMetadata
+import KernelAbstractions: __ndrange
+
+@inline __ndrange(::CompilerMetadata{NDRange}) where {NDRange<:OffsetStaticSize}  = CartesianIndices(get(NDRange))
+
+
+# Kernel{<:Any, <:StaticSize, <:StaticSize} and Kernel{<:Any, <:StaticSize, <:OffsetStaticSize} are the only kernels used by Oceananigans
+const OffsetKernel = Kernel{<:Any, <:StaticSize, <:OffsetStaticSize}
+
+# Extending the partition function to include offsets in NDRange: note that in this case the 
+# offsets take the place of the DynamicWorkitems which we assume is not needed in static kernels
+function partition(kernel::OffsetKernel, inrange, ingroupsize)
+    static_ndrange = ndrange(kernel)
+    static_workgroupsize = workgroupsize(kernel)
+
+    if inrange !== nothing && inrange != get(static_ndrange)
+        error("Static NDRange ($static_ndrange) and launch NDRange ($inrange) differ")
+    end
+    range, offsets = getrange(static_ndrange)
+
+    if static_workgroupsize <: StaticSize
+        if ingroupsize !== nothing && ingroupsize != get(static_workgroupsize)
+            error("Static WorkgroupSize ($static_workgroupsize) and launch WorkgroupSize $(ingroupsize) differ")
+        end
+        groupsize = get(static_workgroupsize)
+    end
+
+    @assert groupsize !== nothing
+    @assert range !== nothing
+    blocks, groupsize, dynamic = NDIteration.partition(range, groupsize)
+
+    static_blocks = StaticSize{blocks}
+    static_workgroupsize = StaticSize{groupsize} # we might have padded workgroupsize
+    
+    iterspace = NDRange{length(range), static_blocks, static_workgroupsize}(blocks, offsets)
+    return iterspace, dynamic
+end
+

From dad5ad9596f71b992a9fc57e3666e623ec5770fb Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 28 Aug 2023 15:05:40 -0400
Subject: [PATCH 485/530] bugfix

---
 src/Utils/kernel_launching.jl | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index d55b658a0d..c3c4b861e2 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -167,8 +167,10 @@ import KernelAbstractions: get, expand
 @inline worksize(i::Int) = i
 @inline worksize(i::UnitRange) = length(i)
 
+const OffsetNDRange{N} = NDRange{N, <:StaticSize, <:StaticSize, <:Any, <:Tuple} where N
+
 # NDRange has been modified to have offsets in place of workitems: Remember, dynamic offset kernels are not possible with this extension!!
-@inline function expand(ndrange::NDRange{N, StaticSize, StaticSize, O}, groupidx::CartesianIndex{N}, idx::CartesianIndex{N}) where {N, O<:Tuple}
+@inline function expand(ndrange::OffsetNDRange{N}, groupidx::CartesianIndex{N}, idx::CartesianIndex{N}) where {N}
     nI = ntuple(Val(N)) do I
         Base.@_inline_meta
         stride = size(workitems(ndrange), I)
@@ -176,6 +178,7 @@ import KernelAbstractions: get, expand
         (gidx-1)*stride + idx.I[I] + ndrange.workitems[I]
     end
     CartesianIndex(nI)
+    @show nI
 end
 
 using KernelAbstractions.NDIteration
@@ -187,7 +190,6 @@ import KernelAbstractions: __ndrange
 
 @inline __ndrange(::CompilerMetadata{NDRange}) where {NDRange<:OffsetStaticSize}  = CartesianIndices(get(NDRange))
 
-
 # Kernel{<:Any, <:StaticSize, <:StaticSize} and Kernel{<:Any, <:StaticSize, <:OffsetStaticSize} are the only kernels used by Oceananigans
 const OffsetKernel = Kernel{<:Any, <:StaticSize, <:OffsetStaticSize}
 

From 10b2e9775d844c6943cb172d936dd6837982df9e Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 28 Aug 2023 15:22:52 -0400
Subject: [PATCH 486/530] bug fixed

---
 src/Utils/kernel_launching.jl | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index c3c4b861e2..7719505a3f 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -114,6 +114,7 @@ function launch!(arch, grid, workspec, kernel!, kernel_args...;
     loop! = isnothing(offset) ? kernel!(Architectures.device(arch), workgroup, worksize) : 
                                 kernel!(Architectures.device(arch), StaticSize(workgroup), OffsetStaticSize(contiguousrange(worksize, offset))) 
 
+    @show loop! typeof(loop!)
     @debug "Launching kernel $kernel! with worksize $worksize and offsets $offset from $workspec"
 
     loop!(kernel_args...)
@@ -161,7 +162,8 @@ import KernelAbstractions: get, expand
 
 @inline getrange(::OffsetStaticSize{S}) where {S} = worksize(S), offsets(S)
 @inline getrange(::Type{OffsetStaticSize{S}}) where {S} = worksize(S), offsets(S)
-@inline offsets(::OffsetStaticSize{S}) where {S} = Tuple(s.start - 1 for s in S)
+
+@inline offsets(ranges::Tuple{Vararg{UnitRange}}) = Tuple(r.start - 1 for r in ranges)
 
 @inline worksize(i::Tuple) = worksize.(i)
 @inline worksize(i::Int) = i
@@ -178,7 +180,6 @@ const OffsetNDRange{N} = NDRange{N, <:StaticSize, <:StaticSize, <:Any, <:Tuple}
         (gidx-1)*stride + idx.I[I] + ndrange.workitems[I]
     end
     CartesianIndex(nI)
-    @show nI
 end
 
 using KernelAbstractions.NDIteration
@@ -219,6 +220,7 @@ function partition(kernel::OffsetKernel, inrange, ingroupsize)
     static_workgroupsize = StaticSize{groupsize} # we might have padded workgroupsize
     
     iterspace = NDRange{length(range), static_blocks, static_workgroupsize}(blocks, offsets)
+
     return iterspace, dynamic
 end
 

From 25316a674c20ad4054be028c7f6a2f6ebbc83f00 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 28 Aug 2023 15:24:52 -0400
Subject: [PATCH 487/530] remove debugging

---
 src/Utils/kernel_launching.jl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/Utils/kernel_launching.jl b/src/Utils/kernel_launching.jl
index 7719505a3f..aaedcb9917 100644
--- a/src/Utils/kernel_launching.jl
+++ b/src/Utils/kernel_launching.jl
@@ -114,7 +114,6 @@ function launch!(arch, grid, workspec, kernel!, kernel_args...;
     loop! = isnothing(offset) ? kernel!(Architectures.device(arch), workgroup, worksize) : 
                                 kernel!(Architectures.device(arch), StaticSize(workgroup), OffsetStaticSize(contiguousrange(worksize, offset))) 
 
-    @show loop! typeof(loop!)
     @debug "Launching kernel $kernel! with worksize $worksize and offsets $offset from $workspec"
 
     loop!(kernel_args...)

From 6d21230b82242384ba0a4b1bc9e73fbb0fc79a40 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 6 Sep 2023 10:55:10 -0400
Subject: [PATCH 488/530] going back

---
 src/Advection/vector_invariant_advection.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Advection/vector_invariant_advection.jl b/src/Advection/vector_invariant_advection.jl
index 72bf327b08..ffc0d45454 100644
--- a/src/Advection/vector_invariant_advection.jl
+++ b/src/Advection/vector_invariant_advection.jl
@@ -83,7 +83,7 @@ function VectorInvariant(; vorticity_scheme::AbstractAdvectionScheme{N, FT} = En
                            vorticity_stencil    = VelocityStencil(),
                            vertical_scheme      = EnergyConservingScheme(),
                            ke_gradient_scheme   = vertical_scheme,
-                           divergence_scheme    = vorticity_scheme,
+                           divergence_scheme    = vertical_scheme,
                            upwinding  = OnlySelfUpwinding(; cross_scheme = vertical_scheme),
                            multi_dimensional_stencil = false) where {N, FT}
         

From f8de976c288283beb0a3cfd1ae049969c780a897 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 6 Sep 2023 16:08:26 -0400
Subject: [PATCH 489/530] more robust partitioning

---
 src/Distributed/distributed_grids.jl  | 12 ++++++------
 src/Distributed/partition_assemble.jl | 26 +++++++++-----------------
 2 files changed, 15 insertions(+), 23 deletions(-)

diff --git a/src/Distributed/distributed_grids.jl b/src/Distributed/distributed_grids.jl
index 21b4787676..ffa007dd34 100644
--- a/src/Distributed/distributed_grids.jl
+++ b/src/Distributed/distributed_grids.jl
@@ -51,9 +51,9 @@ function RectilinearGrid(arch::DistributedArch,
     TY = insert_connected_topology(TY, Ry, rj)
     TZ = insert_connected_topology(TZ, Rz, rk)
     
-    xl = partition(x, nx, Rx, ri)
-    yl = partition(y, ny, Ry, rj)
-    zl = z
+    xl = partition(x, nx, arch, 1)
+    yl = partition(y, ny, arch, 2)
+    zl = partition(z, nz, arch, 3)
 
     Lx, xᶠᵃᵃ, xᶜᵃᵃ, Δxᶠᵃᵃ, Δxᶜᵃᵃ = generate_coordinate(FT, topology[1](), nx, Hx, xl, child_architecture(arch))
     Ly, yᵃᶠᵃ, yᵃᶜᵃ, Δyᵃᶠᵃ, Δyᵃᶜᵃ = generate_coordinate(FT, topology[2](), ny, Hy, yl, child_architecture(arch))
@@ -97,9 +97,9 @@ function LatitudeLongitudeGrid(arch::DistributedArch,
     TY = insert_connected_topology(topology[2], Ry, rj)
     TZ = insert_connected_topology(topology[3], Rz, rk)
 
-    λl = partition(longitude, nλ, Rx, ri)
-    φl = partition(latitude,  nφ, Ry, rj)
-    zl = z
+    λl = partition(longitude, nλ, arch, 1)
+    φl = partition(latitude,  nφ, arch, 2)
+    zl = partition(z,         nz, arch, 3)
 
     # Calculate all direction (which might be stretched)
     # A direction is regular if the domain passed is a Tuple{<:Real, <:Real}, 
diff --git a/src/Distributed/partition_assemble.jl b/src/Distributed/partition_assemble.jl
index 70e724b2c7..f560ec7cd8 100644
--- a/src/Distributed/partition_assemble.jl
+++ b/src/Distributed/partition_assemble.jl
@@ -17,7 +17,7 @@ concatenate_local_sizes(n, arch::DistributedArch) =
 function concatenate_local_sizes(n, arch::DistributedArch, idx)
     R = arch.ranks[idx]
     r = arch.local_index[idx]
-    n = n[idx]
+    n = n isa Number ? n : n[idx]
     l = zeros(Int, R)
 
     r1, r2 = arch.local_index[[1, 2, 3] .!= idx]
@@ -31,27 +31,19 @@ function concatenate_local_sizes(n, arch::DistributedArch, idx)
     return l
 end
 
-function concatenate_local_sizes(n, R, r) 
-    l = zeros(Int, R)
-    l[r] = n
-    MPI.Allreduce!(l, +, MPI.COMM_WORLD)
-
-    return l
-end
-
 # Partitioning (localization of global objects) and assembly (global assembly of local objects)
 # Used for grid constructors (cpu_face_constructor_x, cpu_face_constructor_y, cpu_face_constructor_z)
 # which means that we need to repeat the value at the right boundary
-
-function partition(c::AbstractVector, n, R, r)
-    nl = concatenate_local_sizes(n, R, r)
-    return c[1 + sum(nl[1:r-1]) : 1 + sum(nl[1:r])]
+function partition(c::AbstractVector, n, arch, idx)
+    nl = concatenate_local_sizes(n, arch, idx)
+    r  = arch.local_index[idx]
+    return c[1 + sum(nl[1:r-1]) : sum(nl[1:r])]
 end
 
-function partition(c::Tuple, n, R, r)
-    nl = concatenate_local_sizes(n, R, r)
+function partition(c::Tuple, n, arch, idx)
+    nl = concatenate_local_sizes(n, arch, idx)
     N  = sum(nl)
-
+    R  = arch.ranks[idx]
     Δl = (c[2] - c[1]) / N  
 
     l = Tuple{Float64, Float64}[(c[1], c[1] + Δl * nl[1])]
@@ -60,7 +52,7 @@ function partition(c::Tuple, n, R, r)
         push!(l, (lp, lp + Δl * nl[i]))
     end
 
-    return l[r]
+    return l[arch.local_index[idx]]
 end
 
 """

From 4824add7277717ba55c36e8f232aaf194c2aa34d Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 6 Sep 2023 16:09:06 -0400
Subject: [PATCH 490/530] quite new

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index bdc0864717..1718a029a6 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "Oceananigans"
 uuid = "9e8cae18-63c1-5223-a75c-80ca9d6e9a09"
 authors = ["Climate Modeling Alliance and contributors"]
-version = "0.87.2"
+version = "0.88.0"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"

From 4416f37c4ee29732376de6e14a4b777c89f23f7a Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 6 Sep 2023 16:57:15 -0400
Subject: [PATCH 491/530] bugfix

---
 .../CATKEVerticalDiffusivities.jl                | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
index 90c06aa171..ce2f443c04 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
@@ -58,25 +58,17 @@ end
 
 CATKEVerticalDiffusivity{TD}(mixing_length::CL,
                              turbulent_kinetic_energy_equation::TKE,
-<<<<<<< HEAD
-                             maximum_diffusivity::FT,
-=======
                              maximum_tracer_diffusivity::FT,
                              maximum_tke_diffusivity::FT,
                              maximum_viscosity::FT,
->>>>>>> origin/main
                              minimum_turbulent_kinetic_energy::FT,
                              minimum_convective_buoyancy_flux::FT,
                              negative_turbulent_kinetic_energy_damping_time_scale::FT) where {TD, CL, TKE, FT} =
     CATKEVerticalDiffusivity{TD, CL, FT, TKE}(mixing_length,
                                               turbulent_kinetic_energy_equation,
-<<<<<<< HEAD
-                                              maximum_diffusivity,
-=======
                                               maximum_tracer_diffusivity,
                                               maximum_tke_diffusivity,
                                               maximum_viscosity,
->>>>>>> origin/main
                                               minimum_turbulent_kinetic_energy,
                                               minimum_convective_buoyancy_flux,
                                               negative_turbulent_kinetic_energy_damping_time_scale)
@@ -96,13 +88,9 @@ include("turbulent_kinetic_energy_equation.jl")
                              FT = Float64;]
                              mixing_length = MixingLength(),
                              turbulent_kinetic_energy_equation = TurbulentKineticEnergyEquation(),
-<<<<<<< HEAD
-                             maximum_diffusivity = Inf,
-=======
                              maximum_tracer_diffusivity = Inf,
                              maximum_tke_diffusivity = Inf,
                              maximum_viscosity = Inf,
->>>>>>> origin/main
                              minimum_turbulent_kinetic_energy = 1e-6,
                              minimum_convective_buoyancy_flux = 1e-8,
                              negative_turbulent_kinetic_energy_damping_time_scale = 1minute)
@@ -153,13 +141,9 @@ function CATKEVerticalDiffusivity(time_discretization::TD = VerticallyImplicitTi
                                   FT = Float64;
                                   mixing_length = MixingLength(),
                                   turbulent_kinetic_energy_equation = TurbulentKineticEnergyEquation(),
-<<<<<<< HEAD
-                                  maximum_diffusivity = Inf,
-=======
                                   maximum_tracer_diffusivity = Inf,
                                   maximum_tke_diffusivity = Inf,
                                   maximum_viscosity = Inf,
->>>>>>> origin/main
                                   minimum_turbulent_kinetic_energy = 1e-6,
                                   minimum_convective_buoyancy_flux = 1e-8,
                                   negative_turbulent_kinetic_energy_damping_time_scale = 1minute) where TD

From 74ef9eb142270448c2ad357565b7a6591e77214b Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 6 Sep 2023 17:11:34 -0400
Subject: [PATCH 492/530] updated Manifest

---
 Manifest.toml | 59 ++++++++++++++++++++++-----------------------------
 1 file changed, 25 insertions(+), 34 deletions(-)

diff --git a/Manifest.toml b/Manifest.toml
index 450d63bdc4..363cd69179 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -2,7 +2,7 @@
 
 julia_version = "1.9.3"
 manifest_format = "2.0"
-project_hash = "aa82d3116bea9a2fdd8d20fa51673c89bc397f8b"
+project_hash = "ea620612cb5f84093b962d4345aa7d1b1271739c"
 
 [[deps.AbstractFFTs]]
 deps = ["LinearAlgebra"]
@@ -91,9 +91,9 @@ version = "0.1.2"
 
 [[deps.CUDA]]
 deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Preferences", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "UnsafeAtomicsLLVM"]
-git-tree-sha1 = "35160ef0f03b14768abfd68b830f8e3940e8e0dc"
+git-tree-sha1 = "968c1365e2992824c3e7a794e30907483f8469a9"
 uuid = "052768ef-5323-5732-b1bb-66c8b64840ba"
-version = "4.4.0"
+version = "4.4.1"
 
 [[deps.CUDA_Driver_jll]]
 deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"]
@@ -113,12 +113,6 @@ git-tree-sha1 = "5248d9c45712e51e27ba9b30eebec65658c6ce29"
 uuid = "76a88914-d11a-5bdc-97e0-2f5a05c973a2"
 version = "0.6.0+0"
 
-[[deps.ChainRulesCore]]
-deps = ["Compat", "LinearAlgebra", "SparseArrays"]
-git-tree-sha1 = "e30f2f4e20f7f186dc36529910beaedc60cfa644"
-uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
-version = "1.16.0"
-
 [[deps.CommonDataModel]]
 deps = ["CFTime", "DataStructures", "Dates", "Preferences", "Printf"]
 git-tree-sha1 = "2678b3fc170d582655a14d22867b031b6e43c2d4"
@@ -252,9 +246,9 @@ version = "1.3.1"
 
 [[deps.HDF5_jll]]
 deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "LLVMOpenMP_jll", "LazyArtifacts", "LibCURL_jll", "Libdl", "MPICH_jll", "MPIPreferences", "MPItrampoline_jll", "MicrosoftMPI_jll", "OpenMPI_jll", "OpenSSL_jll", "TOML", "Zlib_jll", "libaec_jll"]
-git-tree-sha1 = "592e1c427983a465831fc73c5ae0ca5d0ac13a9e"
+git-tree-sha1 = "10c72358aaaa5cd6bc7cc39b95e6eadf92f5a336"
 uuid = "0234f1f7-429e-5d53-9886-15a909be8d59"
-version = "1.14.1+0"
+version = "1.14.2+0"
 
 [[deps.IfElse]]
 git-tree-sha1 = "debdd00ffef04665ccbb3e150747a77560e8fad1"
@@ -325,15 +319,15 @@ version = "0.9.8"
 
 [[deps.LLVM]]
 deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
-git-tree-sha1 = "8695a49bfe05a2dc0feeefd06b4ca6361a018729"
+git-tree-sha1 = "a9d2ce1d5007b1e8f6c5b89c5a31ff8bd146db5c"
 uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
-version = "6.1.0"
+version = "6.2.1"
 
 [[deps.LLVMExtra_jll]]
 deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
-git-tree-sha1 = "c35203c1e1002747da220ffc3c0762ce7754b08c"
+git-tree-sha1 = "7ca6850ae880cc99b59b88517545f91a52020afa"
 uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
-version = "0.0.23+0"
+version = "0.0.25+0"
 
 [[deps.LLVMOpenMP_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
@@ -368,10 +362,10 @@ version = "1.10.2+0"
 uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
 
 [[deps.Libiconv_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "c7cb1f5d892775ba13767a87c7ada0b980ea0a71"
+deps = ["Artifacts", "JLLWrappers", "Libdl"]
+git-tree-sha1 = "f9557a255370125b405568f9767d6d195822a175"
 uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531"
-version = "1.16.1+2"
+version = "1.17.0+0"
 
 [[deps.LinearAlgebra]]
 deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"]
@@ -404,9 +398,9 @@ version = "2023.2.0+0"
 
 [[deps.MPI]]
 deps = ["Distributed", "DocStringExtensions", "Libdl", "MPICH_jll", "MPIPreferences", "MPItrampoline_jll", "MicrosoftMPI_jll", "OpenMPI_jll", "PkgVersion", "PrecompileTools", "Requires", "Serialization", "Sockets"]
-git-tree-sha1 = "32cafbe56c7f0b7160a1a6c492773af66c0b722f"
+git-tree-sha1 = "df53d0e1e0dbebf2315f4cd35e13e52ad43416c2"
 uuid = "da04e1cc-30fd-572f-bb4f-1f8673147195"
-version = "0.20.14"
+version = "0.20.15"
 
     [deps.MPI.extensions]
     AMDGPUExt = "AMDGPU"
@@ -550,9 +544,9 @@ version = "1.9.2"
 
 [[deps.PkgVersion]]
 deps = ["Pkg"]
-git-tree-sha1 = "f6cf8e7944e50901594838951729a1861e668cb8"
+git-tree-sha1 = "f9501cc0430a26bc3d156ae1b5b0c1b47af4d6da"
 uuid = "eebad327-c553-4316-9ea0-9fa01ccd7688"
-version = "0.3.2"
+version = "0.3.3"
 
 [[deps.PrecompileTools]]
 deps = ["Preferences"]
@@ -649,12 +643,6 @@ version = "0.3.2"
 [[deps.Serialization]]
 uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
 
-[[deps.SnoopPrecompile]]
-deps = ["Preferences"]
-git-tree-sha1 = "e760a70afdcd461cf01a575947738d359234665c"
-uuid = "66db9d55-30c0-4569-8b51-7e840670fc0c"
-version = "1.0.3"
-
 [[deps.Sockets]]
 uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
 
@@ -671,6 +659,9 @@ version = "2.3.1"
     [deps.SpecialFunctions.extensions]
     SpecialFunctionsChainRulesCoreExt = "ChainRulesCore"
 
+    [deps.SpecialFunctions.weakdeps]
+    ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
+
 [[deps.Static]]
 deps = ["IfElse"]
 git-tree-sha1 = "f295e0a1da4ca425659c57441bcb59abb035a4bc"
@@ -678,10 +669,10 @@ uuid = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
 version = "0.8.8"
 
 [[deps.StaticArrayInterface]]
-deps = ["ArrayInterface", "Compat", "IfElse", "LinearAlgebra", "Requires", "SnoopPrecompile", "SparseArrays", "Static", "SuiteSparse"]
-git-tree-sha1 = "33040351d2403b84afce74dae2e22d3f5b18edcb"
+deps = ["ArrayInterface", "Compat", "IfElse", "LinearAlgebra", "PrecompileTools", "Requires", "SparseArrays", "Static", "SuiteSparse"]
+git-tree-sha1 = "03fec6800a986d191f64f5c0996b59ed526eda25"
 uuid = "0d7ed370-da01-4f52-bd93-41d350b8b718"
-version = "1.4.0"
+version = "1.4.1"
 weakdeps = ["OffsetArrays", "StaticArrays"]
 
     [deps.StaticArrayInterface.extensions]
@@ -831,10 +822,10 @@ uuid = "81def892-9a0e-5fdd-b105-ffc91e053289"
 version = "1.3.0"
 
 [[deps.XML2_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Pkg", "Zlib_jll"]
-git-tree-sha1 = "93c41695bc1c08c46c5899f4fe06d6ead504bb73"
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Zlib_jll"]
+git-tree-sha1 = "04a51d15436a572301b5abbb9d099713327e9fc4"
 uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a"
-version = "2.10.3+0"
+version = "2.10.4+0"
 
 [[deps.Zlib_jll]]
 deps = ["Libdl"]

From 20b470c00771077d2b68821c6078b90a1832a5a0 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Wed, 6 Sep 2023 17:12:07 -0400
Subject: [PATCH 493/530] build with 1.9.3

---
 .buildkite/pipeline.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index e4d2842862..5ff76b2118 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -1,5 +1,5 @@
 env:
-  JULIA_VERSION: "1.9.2"
+  JULIA_VERSION: "1.9.3"
   JULIA_MINOR_VERSION: "1.9"
   SVERDRUP_HOME: "/data5/glwagner"
   TARTARUS_HOME: "/storage5/buildkite-agent"

From 943458a92285867c80cd6a5bac90ae9f8dfdaf51 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 8 Sep 2023 16:32:53 -0400
Subject: [PATCH 494/530] switch boundary_buffer to required_halo_size

---
 src/Advection/Advection.jl                           |  3 +--
 .../topologically_conditional_interpolation.jl       | 12 ++++++------
 .../shallow_water_diffusion_operators.jl             |  4 ++--
 src/TurbulenceClosures/TurbulenceClosures.jl         |  5 ++---
 src/TurbulenceClosures/closure_tuples.jl             |  1 -
 .../scalar_biharmonic_diffusivity.jl                 |  6 ++----
 .../scalar_diffusivity.jl                            |  4 ++--
 .../src/TwoDimensionalBurgersAdvection.jl            |  2 +-
 .../src/TwoDimensionalGaussianAdvectionDiffusion.jl  |  8 ++++----
 .../src/TwoDimensionalVortexAdvection.jl             |  2 +-
 10 files changed, 21 insertions(+), 26 deletions(-)

diff --git a/src/Advection/Advection.jl b/src/Advection/Advection.jl
index 5d851d5ca0..fb2f5401dd 100644
--- a/src/Advection/Advection.jl
+++ b/src/Advection/Advection.jl
@@ -54,8 +54,7 @@ abstract type AbstractUpwindBiasedAdvectionScheme{B, FT} <: AbstractAdvectionSch
 # Note that it is not possible to compile schemes for `advection_buffer = 41` or higher.
 const advection_buffers = [1, 2, 3, 4, 5, 6]
 
-@inline boundary_buffer(::AbstractAdvectionScheme{B}) where B = B
-@inline required_halo_size(scheme::AbstractAdvectionScheme{B}) where B = B
+@inline required_halo_size(::AbstractAdvectionScheme{B}) where B = B
 
 include("centered_advective_fluxes.jl")
 include("upwind_biased_advective_fluxes.jl")
diff --git a/src/Advection/topologically_conditional_interpolation.jl b/src/Advection/topologically_conditional_interpolation.jl
index e1a226af0d..0f0fadcf95 100644
--- a/src/Advection/topologically_conditional_interpolation.jl
+++ b/src/Advection/topologically_conditional_interpolation.jl
@@ -26,12 +26,12 @@ const AUGXYZ = AUG{<:Any, <:Bounded, <:Bounded, <:Bounded}
 # Left-biased buffers are smaller by one grid point on the right side; vice versa for right-biased buffers
 # Center interpolation stencil look at i + 1 (i.e., require one less point on the left)
 
-@inline    outside_symmetric_bufferᶠ(i, N, adv) = (i >= boundary_buffer(adv) + 1) & (i <= N + 1 - boundary_buffer(adv))
-@inline    outside_symmetric_bufferᶜ(i, N, adv) = (i >= boundary_buffer(adv))     & (i <= N + 1 - boundary_buffer(adv))
-@inline  outside_left_biased_bufferᶠ(i, N, adv) = (i >= boundary_buffer(adv) + 1) & (i <= N + 1 - (boundary_buffer(adv) - 1))
-@inline  outside_left_biased_bufferᶜ(i, N, adv) = (i >= boundary_buffer(adv))     & (i <= N + 1 - (boundary_buffer(adv) - 1))
-@inline outside_right_biased_bufferᶠ(i, N, adv) = (i >= boundary_buffer(adv))     & (i <= N + 1 - boundary_buffer(adv))
-@inline outside_right_biased_bufferᶜ(i, N, adv) = (i >= boundary_buffer(adv) - 1) & (i <= N + 1 - boundary_buffer(adv))
+@inline    outside_symmetric_haloᶠ(i, N, adv) = (i >= required_halo_size(adv) + 1) & (i <= N + 1 - required_halo_size(adv))
+@inline    outside_symmetric_haloᶜ(i, N, adv) = (i >= required_halo_size(adv))     & (i <= N + 1 - required_halo_size(adv))
+@inline  outside_left_biased_haloᶠ(i, N, adv) = (i >= required_halo_size(adv) + 1) & (i <= N + 1 - (required_halo_size(adv) - 1))
+@inline  outside_left_biased_haloᶜ(i, N, adv) = (i >= required_halo_size(adv))     & (i <= N + 1 - (required_halo_size(adv) - 1))
+@inline outside_right_biased_haloᶠ(i, N, adv) = (i >= required_halo_size(adv))     & (i <= N + 1 - required_halo_size(adv))
+@inline outside_right_biased_haloᶜ(i, N, adv) = (i >= required_halo_size(adv) - 1) & (i <= N + 1 - required_halo_size(adv))
 
 # Separate High order advection from low order advection
 const HOADV = Union{WENO, 
diff --git a/src/Models/ShallowWaterModels/shallow_water_diffusion_operators.jl b/src/Models/ShallowWaterModels/shallow_water_diffusion_operators.jl
index 6952a111a8..c8dd1d19f0 100644
--- a/src/Models/ShallowWaterModels/shallow_water_diffusion_operators.jl
+++ b/src/Models/ShallowWaterModels/shallow_water_diffusion_operators.jl
@@ -38,10 +38,10 @@ With the `VectorInvariantFormulation()` (that evolves ``u`` and ``v``) we comput
 ``h^{-1} 𝛁(ν h 𝛁 t)``, while with the `ConservativeFormulation()` (that evolves
 ``u h`` and ``v h``) we compute ``𝛁 (ν h 𝛁 t)``.
 """
-function ShallowWaterScalarDiffusivity(FT::DataType=Float64; ν=0, ξ=0, discrete_form=false, boundary_buffer = 1)
+function ShallowWaterScalarDiffusivity(FT::DataType=Float64; ν=0, ξ=0, discrete_form=false, required_halo_size = 1)
     ν = convert_diffusivity(FT, ν; discrete_form)
     ξ = convert_diffusivity(FT, ξ; discrete_form)
-    return ShallowWaterScalarDiffusivity{boundary_buffer}(ν, ξ)
+    return ShallowWaterScalarDiffusivity{required_halo_size}(ν, ξ)
 end
 
 # We have no tracers in the shallow water diffusivity
diff --git a/src/TurbulenceClosures/TurbulenceClosures.jl b/src/TurbulenceClosures/TurbulenceClosures.jl
index 70bb762ff0..10fee681fd 100644
--- a/src/TurbulenceClosures/TurbulenceClosures.jl
+++ b/src/TurbulenceClosures/TurbulenceClosures.jl
@@ -49,7 +49,7 @@ using Oceananigans.BuoyancyModels
 using Oceananigans.Utils
 
 using Oceananigans.Architectures: AbstractArchitecture, device
-import Oceananigans.Advection: boundary_buffer, required_halo_size
+import Oceananigans.Advection: required_halo_size
 
 const VerticallyBoundedGrid{FT} = AbstractGrid{FT, <:Any, <:Any, <:Bounded}
 
@@ -62,7 +62,7 @@ const VerticallyBoundedGrid{FT} = AbstractGrid{FT, <:Any, <:Any, <:Bounded}
 
 Abstract supertype for turbulence closures.
 """
-abstract type AbstractTurbulenceClosure{TimeDiscretization, BoundaryBuffer} end
+abstract type AbstractTurbulenceClosure{TimeDiscretization, RequiredHalo} end
 
 # Fallbacks
 validate_closure(closure) = closure
@@ -75,7 +75,6 @@ calculate_diffusivities!(K, closure::AbstractTurbulenceClosure, args...; kwargs.
 # point at each side to calculate viscous fluxes at the edge of the domain. 
 # If diffusivity itself requires one halo to be computed (e.g. κ = ℑxᶠᵃᵃ(i, j, k, grid, ℑxᶜᵃᵃ, T),
 # or `AnisotropicMinimumDissipation` and `SmagorinskyLilly`) then B = 2
-@inline boundary_buffer(::AbstractTurbulenceClosure{TD, B}) where {TD, B} = B
 @inline required_halo_size(::AbstractTurbulenceClosure{TD, B}) where {TD, B} = B 
 
 const ClosureKinda = Union{Nothing, AbstractTurbulenceClosure, AbstractArray{<:AbstractTurbulenceClosure}}
diff --git a/src/TurbulenceClosures/closure_tuples.jl b/src/TurbulenceClosures/closure_tuples.jl
index f48d77ac75..edf37db41c 100644
--- a/src/TurbulenceClosures/closure_tuples.jl
+++ b/src/TurbulenceClosures/closure_tuples.jl
@@ -89,7 +89,6 @@ function add_closure_specific_boundary_conditions(closure_tuple::Tuple, bcs, arg
     return bcs
 end
 
-boundary_buffer(closure_tuple::Tuple)    = maximum(map(boundary_buffer, closure_tuple))
 required_halo_size(closure_tuple::Tuple) = maximum(map(required_halo_size, closure_tuple))
 
 #####
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/scalar_biharmonic_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/scalar_biharmonic_diffusivity.jl
index bbb5fae63e..a6dfa008c0 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/scalar_biharmonic_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/scalar_biharmonic_diffusivity.jl
@@ -19,8 +19,6 @@ end
           HorizontalScalarBiharmonicDiffusivity(FT::DataType=Float64; kwargs...) = ScalarBiharmonicDiffusivity(HorizontalFormulation(), FT; kwargs...)
 HorizontalDivergenceScalarBiharmonicDiffusivity(FT::DataType=Float64; kwargs...) = ScalarBiharmonicDiffusivity(HorizontalDivergenceFormulation(), FT; kwargs...)
 
-required_halo_size(::ScalarBiharmonicDiffusivity) = 2
-
 """
     ScalarBiharmonicDiffusivity(formulation = ThreeDimensionalFormulation(), FT = Float64;
                                 ν = 0,
@@ -74,11 +72,11 @@ function ScalarBiharmonicDiffusivity(formulation = ThreeDimensionalFormulation()
                                      discrete_form = false,
                                      loc = (nothing, nothing, nothing),
                                      parameters = nothing,
-                                     boundary_buffer = 1)
+                                     required_halo_size = 1)
 
     ν = convert_diffusivity(FT, ν; discrete_form, loc, parameters)
     κ = convert_diffusivity(FT, κ; discrete_form, loc, parameters)
-    return ScalarBiharmonicDiffusivity{typeof(formulation), boundary_buffer}(ν, κ)
+    return ScalarBiharmonicDiffusivity{typeof(formulation), required_halo_size}(ν, κ)
 end
 
 function with_tracers(tracers, closure::ScalarBiharmonicDiffusivity{F, N}) where {F, N}
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/scalar_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/scalar_diffusivity.jl
index 489040fdd6..883d449ef2 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/scalar_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/scalar_diffusivity.jl
@@ -106,7 +106,7 @@ function ScalarDiffusivity(time_discretization=ExplicitTimeDiscretization(),
                            discrete_form = false,
                            loc = (nothing, nothing, nothing),
                            parameters = nothing,
-                           boundary_buffer = 1)
+                           required_halo_size = 1)
 
     if formulation == HorizontalFormulation() && time_discretization == VerticallyImplicitTimeDiscretization()
         throw(ArgumentError("VerticallyImplicitTimeDiscretization is only supported for `VerticalFormulation` or `ThreeDimensionalFormulation`"))
@@ -115,7 +115,7 @@ function ScalarDiffusivity(time_discretization=ExplicitTimeDiscretization(),
     κ = convert_diffusivity(FT, κ; discrete_form, loc, parameters)
     ν = convert_diffusivity(FT, ν; discrete_form, loc, parameters)
 
-    return ScalarDiffusivity{typeof(time_discretization), typeof(formulation), boundary_buffer}(ν, κ)
+    return ScalarDiffusivity{typeof(time_discretization), typeof(formulation), required_halo_points}(ν, κ)
 end
 
 # Explicit default
diff --git a/validation/convergence_tests/src/TwoDimensionalBurgersAdvection.jl b/validation/convergence_tests/src/TwoDimensionalBurgersAdvection.jl
index e57532d8fe..340ce7c440 100644
--- a/validation/convergence_tests/src/TwoDimensionalBurgersAdvection.jl
+++ b/validation/convergence_tests/src/TwoDimensionalBurgersAdvection.jl
@@ -6,7 +6,7 @@ using Statistics
 using Oceananigans
 using Oceananigans.Grids
 using Oceananigans.Advection
-using Oceananigans.Advection: boundary_buffer
+using Oceananigans.Advection: required_halo_size
 using Oceananigans.Models.ShallowWaterModels: VectorInvariantFormulation, ConservativeFormulation, shallow_water_velocities
 using Oceananigans.Fields: interior
 
diff --git a/validation/convergence_tests/src/TwoDimensionalGaussianAdvectionDiffusion.jl b/validation/convergence_tests/src/TwoDimensionalGaussianAdvectionDiffusion.jl
index 9fc45827e4..d48829c097 100644
--- a/validation/convergence_tests/src/TwoDimensionalGaussianAdvectionDiffusion.jl
+++ b/validation/convergence_tests/src/TwoDimensionalGaussianAdvectionDiffusion.jl
@@ -6,7 +6,7 @@ using Statistics
 using Oceananigans
 using Oceananigans.Grids
 using Oceananigans.Advection
-using Oceananigans.Advection: boundary_buffer
+using Oceananigans.Advection: required_halo_size
 using Oceananigans.Fields: interior
 
 using ConvergenceTests: compute_error
@@ -42,7 +42,7 @@ function run_test(; Nx, Δt, stop_iteration, U = 1, κ = 1e-4, width = 0.05,
 
     simulation = Simulation(model, Δt=Δt, stop_iteration=stop_iteration)
 
-    @info "Running Gaussian advection diffusion test for wxy, and cxy with Nx = $Nx and Δt = $Δt ($(typeof(advection).name.wrapper) buffer $(boundary_buffer(advection)))..."
+    @info "Running Gaussian advection diffusion test for wxy, and cxy with Nx = $Nx and Δt = $Δt ($(typeof(advection).name.wrapper) buffer $(required_halo_size(advection)))..."
     run!(simulation)
 
     x = xnodes(model.tracers.c)
@@ -81,7 +81,7 @@ function run_test(; Nx, Δt, stop_iteration, U = 1, κ = 1e-4, width = 0.05,
 
     simulation = Simulation(model, Δt=Δt, stop_iteration=stop_iteration)
 
-    @info "Running Gaussian advection diffusion test for uyz and cyz with Ny = $Nx and Δt = $Δt ($(typeof(advection).name.wrapper) buffer $(boundary_buffer(advection)))..."
+    @info "Running Gaussian advection diffusion test for uyz and cyz with Ny = $Nx and Δt = $Δt ($(typeof(advection).name.wrapper) buffer $(required_halo_size(advection)))..."
     run!(simulation)
 
     # Calculate errors
@@ -113,7 +113,7 @@ function run_test(; Nx, Δt, stop_iteration, U = 1, κ = 1e-4, width = 0.05,
 
     simulation = Simulation(model, Δt=Δt, stop_iteration=stop_iteration)
 
-    @info "Running Gaussian advection diffusion test for vxz and cxz with Nz = $Nx and Δt = $Δt ($(typeof(advection).name.wrapper) buffer $(boundary_buffer(advection)))..."
+    @info "Running Gaussian advection diffusion test for vxz and cxz with Nz = $Nx and Δt = $Δt ($(typeof(advection).name.wrapper) buffer $(required_halo_size(advection)))..."
     run!(simulation)
 
     # Calculate errors
diff --git a/validation/convergence_tests/src/TwoDimensionalVortexAdvection.jl b/validation/convergence_tests/src/TwoDimensionalVortexAdvection.jl
index d57cb58f1c..fe605fdf48 100644
--- a/validation/convergence_tests/src/TwoDimensionalVortexAdvection.jl
+++ b/validation/convergence_tests/src/TwoDimensionalVortexAdvection.jl
@@ -6,7 +6,7 @@ using Statistics
 using Oceananigans
 using Oceananigans.Grids
 using Oceananigans.Advection
-using Oceananigans.Advection: boundary_buffer, VelocityStencil
+using Oceananigans.Advection: required_halo_size, VelocityStencil
 using Oceananigans.Models.ShallowWaterModels: VectorInvariantFormulation, ConservativeFormulation, shallow_water_velocities
 using Oceananigans.Fields: interior
 

From 13982e3876397a471582f71ee1b0008a00089f78 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 8 Sep 2023 17:01:00 -0400
Subject: [PATCH 495/530] bugfix

---
 .../hydrostatic_free_surface_tendency_kernel_functions.jl       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_tendency_kernel_functions.jl b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_tendency_kernel_functions.jl
index 283b8228f8..01ceaa21ec 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_tendency_kernel_functions.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_tendency_kernel_functions.jl
@@ -6,7 +6,7 @@ using Oceananigans.StokesDrift
 using Oceananigans.TurbulenceClosures: ∂ⱼ_τ₁ⱼ, ∂ⱼ_τ₂ⱼ, ∇_dot_qᶜ
 using Oceananigans.Biogeochemistry: biogeochemical_transition, biogeochemical_drift_velocity
 using Oceananigans.TurbulenceClosures: immersed_∂ⱼ_τ₁ⱼ, immersed_∂ⱼ_τ₂ⱼ, immersed_∂ⱼ_τ₃ⱼ, immersed_∇_dot_qᶜ
-using Oceananigans.Advection: div_Uc, U_dot_∇u, U_dot_∇v, boundary_buffer
+using Oceananigans.Advection: div_Uc, U_dot_∇u, U_dot_∇v
 using Oceananigans.Forcings: with_advective_forcing
 using Oceananigans.TurbulenceClosures: shear_production, buoyancy_flux, dissipation
 using Oceananigans.Utils: SumOfArrays

From a5ff1bccd4d1e521d7e5f7a944e14fc0d6e58cc0 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Fri, 8 Sep 2023 17:08:01 -0400
Subject: [PATCH 496/530] Update
 src/Models/HydrostaticFreeSurfaceModels/single_column_model_mode.jl

Co-authored-by: Gregory L. Wagner <wagner.greg@gmail.com>
---
 .../HydrostaticFreeSurfaceModels/single_column_model_mode.jl     | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/single_column_model_mode.jl b/src/Models/HydrostaticFreeSurfaceModels/single_column_model_mode.jl
index eb9d597e11..c233f00eb1 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/single_column_model_mode.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/single_column_model_mode.jl
@@ -55,7 +55,6 @@ calculate_free_surface_tendency!(::SingleColumnGrid, args...) = nothing
 calculate_free_surface_tendency!(::SingleColumnGrid, ::ImplicitFreeSurfaceHFSM     , args...) = nothing
 calculate_free_surface_tendency!(::SingleColumnGrid, ::SplitExplicitFreeSurfaceHFSM, args...) = nothing
 
-
 # Fast state update and halo filling
 
 function update_state!(model::HydrostaticFreeSurfaceModel, grid::SingleColumnGrid, callbacks)

From 44cff40dd99c5de5a4df5287ccb7c2cb660e5ae8 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Fri, 8 Sep 2023 17:08:29 -0400
Subject: [PATCH 497/530] Update
 src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl

Co-authored-by: Gregory L. Wagner <wagner.greg@gmail.com>
---
 .../update_hydrostatic_free_surface_model_state.jl              | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
index 6e97404928..beeb1909ec 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
@@ -35,7 +35,7 @@ function update_state!(model::HydrostaticFreeSurfaceModel, grid, callbacks; comp
     @apply_regionally compute_w_diffusivities_pressure!(model)
     fill_halo_regions!(model.diffusivity_fields; only_local_halos = true)
 
-    [callback(model) for callback in callbacks if isa(callback.callsite, UpdateStateCallsite)]
+    [callback(model) for callback in callbacks if callback.callsite isa UpdateStateCallsite]
     
     update_biogeochemical_state!(model.biogeochemistry, model)
 

From df6967b29021d66956aa1d323ed780e58c6569a9 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 8 Sep 2023 17:20:30 -0400
Subject: [PATCH 498/530] bugfix

---
 .../turbulence_closure_implementations/scalar_diffusivity.jl    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/scalar_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/scalar_diffusivity.jl
index 883d449ef2..88c3200f5e 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/scalar_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/scalar_diffusivity.jl
@@ -115,7 +115,7 @@ function ScalarDiffusivity(time_discretization=ExplicitTimeDiscretization(),
     κ = convert_diffusivity(FT, κ; discrete_form, loc, parameters)
     ν = convert_diffusivity(FT, ν; discrete_form, loc, parameters)
 
-    return ScalarDiffusivity{typeof(time_discretization), typeof(formulation), required_halo_points}(ν, κ)
+    return ScalarDiffusivity{typeof(time_discretization), typeof(formulation), required_halo_size}(ν, κ)
 end
 
 # Explicit default

From 229e4aaab78df2c84698a3ef621b5dceb52caa66 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Fri, 8 Sep 2023 17:30:51 -0400
Subject: [PATCH 499/530] biharmonic requires 2 halos

---
 .../scalar_biharmonic_diffusivity.jl                            | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/scalar_biharmonic_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/scalar_biharmonic_diffusivity.jl
index a6dfa008c0..e30ed05308 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/scalar_biharmonic_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/scalar_biharmonic_diffusivity.jl
@@ -72,7 +72,7 @@ function ScalarBiharmonicDiffusivity(formulation = ThreeDimensionalFormulation()
                                      discrete_form = false,
                                      loc = (nothing, nothing, nothing),
                                      parameters = nothing,
-                                     required_halo_size = 1)
+                                     required_halo_size = 2)
 
     ν = convert_diffusivity(FT, ν; discrete_form, loc, parameters)
     κ = convert_diffusivity(FT, κ; discrete_form, loc, parameters)

From d354418e9898e7588ff3d265366c402bb3706d5f Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sat, 9 Sep 2023 18:12:11 -0400
Subject: [PATCH 500/530] bugfix

---
 src/Advection/topologically_conditional_interpolation.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Advection/topologically_conditional_interpolation.jl b/src/Advection/topologically_conditional_interpolation.jl
index 0f0fadcf95..3c55d9b73c 100644
--- a/src/Advection/topologically_conditional_interpolation.jl
+++ b/src/Advection/topologically_conditional_interpolation.jl
@@ -60,7 +60,7 @@ for bias in (:symmetric, :left_biased, :right_biased)
                 @eval @inline $alt_interp(i, j, k, grid::$GridType, scheme::LOADV, args...) = $interp(i, j, k, grid, scheme, args...)
             end
 
-            outside_buffer = Symbol(:outside_, bias, :_buffer, loc)
+            outside_buffer = Symbol(:outside_, bias, :_halo, loc)
 
             # Conditional high-order interpolation in Bounded directions
             if ξ == :x

From 30aefe5ca26a02c470285eb9a19694072a2a6f42 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 10 Sep 2023 14:16:44 -0400
Subject: [PATCH 501/530] compute_auxiliaries!

---
 ...static_free_surface_boundary_tendencies.jl | 19 ++++-----------
 ...te_hydrostatic_free_surface_model_state.jl | 24 ++++++++++++++-----
 ...late_nonhydrostatic_boundary_tendencies.jl | 16 ++++---------
 .../update_nonhydrostatic_model_state.jl      | 21 ++++++++++++----
 4 files changed, 43 insertions(+), 37 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_boundary_tendencies.jl
index 0940831582..39f04067aa 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_boundary_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_boundary_tendencies.jl
@@ -13,8 +13,12 @@ function compute_boundary_tendencies!(model::HydrostaticFreeSurfaceModel)
     grid = model.grid
     arch = architecture(grid)
 
+    w_parameters = boundary_w_kernel_parameters(grid, arch)
+    p_parameters = boundary_p_kernel_parameters(grid, arch)
+    κ_parameters = boundary_κ_kernel_parameters(grid, model.closure, arch)
+
     # We need new values for `w`, `p` and `κ`
-    recompute_auxiliaries!(model, grid, arch)
+    compute_auxiliaries!(model; w_parameters, p_parameters, κ_parameters)
 
     # parameters for communicating North / South / East / West side
     kernel_parameters = boundary_tendency_kernel_parameters(grid, arch)
@@ -23,19 +27,6 @@ function compute_boundary_tendencies!(model::HydrostaticFreeSurfaceModel)
     return nothing
 end
 
-function recompute_auxiliaries!(model::HydrostaticFreeSurfaceModel, grid, arch)
-    
-    w_kernel_parameters = boundary_w_kernel_parameters(grid, arch)
-    p_kernel_parameters = boundary_p_kernel_parameters(grid, arch)
-    κ_kernel_parameters = boundary_κ_kernel_parameters(grid, model.closure, arch)
-
-    for (wpar, ppar, κpar) in zip(w_kernel_parameters, p_kernel_parameters, κ_kernel_parameters)
-        compute_w_from_continuity!(model.velocities, arch, grid; parameters = wpar)
-        update_hydrostatic_pressure!(model.pressure.pHY′, arch, grid, model.buoyancy, model.tracers; parameters = ppar)
-        calculate_diffusivities!(model.diffusivity_fields, model.closure, model; parameters = κpar)
-    end
-end
-
 # w needs computing in the range - H + 1 : 0 and N - 1 : N + H - 1
 function boundary_w_kernel_parameters(grid, arch)
     Nx, Ny, _ = size(grid)
diff --git a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
index beeb1909ec..87f853f3ff 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
@@ -5,10 +5,11 @@ using Oceananigans: UpdateStateCallsite
 using Oceananigans.Biogeochemistry: update_biogeochemical_state!
 using Oceananigans.TurbulenceClosures: calculate_diffusivities!
 using Oceananigans.ImmersedBoundaries: mask_immersed_field!, mask_immersed_field_xy!, inactive_node
-using Oceananigans.Models.NonhydrostaticModels: update_hydrostatic_pressure!
+using Oceananigans.Models.NonhydrostaticModels: update_hydrostatic_pressure!, p_kernel_parameters
 using Oceananigans.Fields: replace_horizontal_velocity_halos!
 
 import Oceananigans.TimeSteppers: update_state!
+import Oceananigans.Models.NonhydrostaticModels: compute_auxiliaries!
 
 compute_auxiliary_fields!(auxiliary_fields) = Tuple(compute!(a) for a in auxiliary_fields)
 
@@ -32,7 +33,7 @@ function update_state!(model::HydrostaticFreeSurfaceModel, grid, callbacks; comp
     fill_halo_regions!(prognostic_fields(model), model.clock, fields(model); async = true)
 
     @apply_regionally replace_horizontal_velocity_halos!(model.velocities, model.grid)
-    @apply_regionally compute_w_diffusivities_pressure!(model)
+    @apply_regionally compute_auxiliaries!(model)
     fill_halo_regions!(model.diffusivity_fields; only_local_halos = true)
 
     [callback(model) for callback in callbacks if callback.callsite isa UpdateStateCallsite]
@@ -60,9 +61,20 @@ function mask_immersed_model_fields!(model, grid)
     return nothing
 end
 
-function compute_w_diffusivities_pressure!(model) 
-    compute_w_from_continuity!(model)
-    calculate_diffusivities!(model.diffusivity_fields, model.closure, model)
-    update_hydrostatic_pressure!(model.pressure.pHY′, model.architecture, model.grid, model.buoyancy, model.tracers)
+function compute_auxiliaries!(model::HydrostaticFreeSurfaceModel; w_parameters = tuple(w_kernel_parameters(model.grid)),
+                                                                  p_parameters = tuple(p_kernel_parameters(model.grid)),
+                                                                  κ_parameters = tuple(:xyz)) 
+    
+    grid = model.grid
+    closure = model.closure
+    diffusivity = model.diffusivity_fields
+
+    for (wpar, ppar, κpar) in zip(w_parameters, p_parameters, κ_parameters)
+        compute_w_from_continuity!(model; w_kernel_parameters = wpar)
+        calculate_diffusivities!(diffusivity, closure, model; κ_kernel_parameters = κpar)
+        update_hydrostatic_pressure!(model.pressure.pHY′, architecture(grid), 
+                                    grid, model.buoyancy, model.tracers; 
+                                    p_kernel_parameters = ppar)
+    end
     return nothing
 end
diff --git a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_boundary_tendencies.jl b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_boundary_tendencies.jl
index b1375cf3aa..023f91028b 100644
--- a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_boundary_tendencies.jl
+++ b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_boundary_tendencies.jl
@@ -8,8 +8,11 @@ function compute_boundary_tendencies!(model::NonhydrostaticModel)
     grid = model.grid
     arch = architecture(grid)
 
+    pparameters = boundary_p_kernel_parameters(grid, arch)
+    κparameters = boundary_κ_kernel_parameters(grid, model.closure, arch)
+
     # We need new values for `p` and `κ`
-    recompute_auxiliaries!(model, grid, arch)
+    compute_auxiliaries!(model; pparameters, κparameters)
 
     # parameters for communicating North / South / East / West side
     kernel_parameters = boundary_tendency_kernel_parameters(grid, arch)
@@ -36,17 +39,6 @@ function boundary_tendency_kernel_parameters(grid, arch)
     return boundary_parameters(sizes, offs, grid, arch)
 end
 
-function recompute_auxiliaries!(model::NonhydrostaticModel, grid, arch)
-    
-    p_kernel_parameters = boundary_p_kernel_parameters(grid, arch)
-    κ_kernel_parameters = boundary_κ_kernel_parameters(grid, model.closure, arch)
-
-    for (ppar, κpar) in zip(p_kernel_parameters, κ_kernel_parameters)
-        update_hydrostatic_pressure!(model.pressures.pHY′, arch, grid, model.buoyancy, model.tracers; parameters = ppar)
-        calculate_diffusivities!(model.diffusivity_fields, model.closure, model; parameters = κpar)
-    end
-end
-
 # p needs computing in the range  0 : 0 and N + 1 : N + 1
 function boundary_p_kernel_parameters(grid, arch)
     Nx, Ny, _ = size(grid)
diff --git a/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl b/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
index 186c54f552..f5d6d9254a 100644
--- a/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
+++ b/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
@@ -31,11 +31,7 @@ function update_state!(model::NonhydrostaticModel, callbacks=[]; compute_tendenc
     end
 
     # Calculate diffusivities and hydrostatic pressure
-    @apply_regionally begin
-        calculate_diffusivities!(model.diffusivity_fields, model.closure, model)
-        update_hydrostatic_pressure!(model)
-    end
-
+    @apply_regionally compute_auxiliaries!(model)
     fill_halo_regions!(model.diffusivity_fields; only_local_halos = true)
     
     for callback in callbacks
@@ -50,3 +46,18 @@ function update_state!(model::NonhydrostaticModel, callbacks=[]; compute_tendenc
     return nothing
 end
 
+function compute_auxiliaries!(model::NonhydrostaticModel; p_parameters = tuple(p_kernel_parameters(model.grid)),
+                                                          κ_parameters = tuple(:xyz)) 
+
+    grid = model.grid
+    closure = model.closure
+    diffusivity = model.diffusivity_fields
+
+    for (ppar, κpar) in zip(p_parameters, κ_parameters)
+        calculate_diffusivities!(diffusivity, closure, model; κ_kernel_parameters = κpar)
+        update_hydrostatic_pressure!(model.pressures.pHY′, architecture(grid), 
+                                     grid, model.buoyancy, model.tracers; 
+                                     p_kernel_parameters = ppar)
+    end
+    return nothing
+end

From b8e913f458c72a7436d075ae841c799ca3a150ef Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 10 Sep 2023 14:39:19 -0400
Subject: [PATCH 502/530] bugfix

---
 .../update_hydrostatic_free_surface_model_state.jl          | 6 +++---
 .../update_nonhydrostatic_model_state.jl                    | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
index 87f853f3ff..7b17878ded 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
@@ -70,11 +70,11 @@ function compute_auxiliaries!(model::HydrostaticFreeSurfaceModel; w_parameters =
     diffusivity = model.diffusivity_fields
 
     for (wpar, ppar, κpar) in zip(w_parameters, p_parameters, κ_parameters)
-        compute_w_from_continuity!(model; w_kernel_parameters = wpar)
-        calculate_diffusivities!(diffusivity, closure, model; κ_kernel_parameters = κpar)
+        compute_w_from_continuity!(model; parameters = wpar)
+        calculate_diffusivities!(diffusivity, closure, model; parameters = κpar)
         update_hydrostatic_pressure!(model.pressure.pHY′, architecture(grid), 
                                     grid, model.buoyancy, model.tracers; 
-                                    p_kernel_parameters = ppar)
+                                    parameters = ppar)
     end
     return nothing
 end
diff --git a/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl b/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
index f5d6d9254a..ec42d6d1c2 100644
--- a/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
+++ b/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
@@ -54,10 +54,10 @@ function compute_auxiliaries!(model::NonhydrostaticModel; p_parameters = tuple(p
     diffusivity = model.diffusivity_fields
 
     for (ppar, κpar) in zip(p_parameters, κ_parameters)
-        calculate_diffusivities!(diffusivity, closure, model; κ_kernel_parameters = κpar)
+        calculate_diffusivities!(diffusivity, closure, model; parameters = κpar)
         update_hydrostatic_pressure!(model.pressures.pHY′, architecture(grid), 
                                      grid, model.buoyancy, model.tracers; 
-                                     p_kernel_parameters = ppar)
+                                     parameters = ppar)
     end
     return nothing
 end

From 8c4ed662b6f976bb3b6e604ad89e59cf94c44686 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Sun, 10 Sep 2023 14:51:59 -0400
Subject: [PATCH 503/530] Forward kwargs in compute_w_from_continuity!(model)

---
 .../HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl b/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
index a2527bea43..8e8f2cc43a 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/compute_w_from_continuity.jl
@@ -12,7 +12,7 @@ Compute the vertical velocity ``w`` by integrating the continuity equation from
 w^{n+1} = -∫ [∂/∂x (u^{n+1}) + ∂/∂y (v^{n+1})] dz
 ```
 """
-compute_w_from_continuity!(model) = compute_w_from_continuity!(model.velocities, model.architecture, model.grid)
+compute_w_from_continuity!(model; kwargs...) = compute_w_from_continuity!(model.velocities, model.architecture, model.grid; kwargs...)
 
 compute_w_from_continuity!(velocities, arch, grid; parameters = w_kernel_parameters(grid)) = 
     launch!(arch, grid, parameters, _compute_w_from_continuity!, velocities, grid)

From 271aa867d9c4361af232229be694c264628a5f31 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 11 Sep 2023 21:49:58 -0400
Subject: [PATCH 504/530] Add set!(::DistributedField, ::Function)

---
 src/Distributed/distributed_fields.jl         | 22 ++++++++++++++-----
 ...compute_vertically_integrated_variables.jl |  1 -
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/src/Distributed/distributed_fields.jl b/src/Distributed/distributed_fields.jl
index 301337d28a..75da77c691 100644
--- a/src/Distributed/distributed_fields.jl
+++ b/src/Distributed/distributed_fields.jl
@@ -1,8 +1,6 @@
-import Oceananigans.Fields: Field, FieldBoundaryBuffers, location
+import Oceananigans.Fields: Field, FieldBoundaryBuffers, location, set!
 import Oceananigans.BoundaryConditions: fill_halo_regions!
 
-import Oceananigans.Grids: architecture
-
 using Oceananigans.Fields: validate_field_data, validate_boundary_conditions, validate_indices
 
 function Field((LX, LY, LZ)::Tuple, grid::DistributedGrid, data, old_bcs, indices::Tuple, op, status)
@@ -16,8 +14,22 @@ function Field((LX, LY, LZ)::Tuple, grid::DistributedGrid, data, old_bcs, indice
     return Field{LX, LY, LZ}(grid, data, new_bcs, indices, op, status, buffers)
 end
 
+child_architecture(f::DistributedField) = child_architecture(architecture(f.grid))
+
 const DistributedField      = Field{<:Any, <:Any, <:Any, <:Any, <:DistributedGrid}
 const DistributedFieldTuple = NamedTuple{S, <:NTuple{N, DistributedField}} where {S, N}
 
-# TODO: make sure the definition of architecture is consistent
-architecture(f::DistributedField) = child_architecture(architecture(f.grid))
+function set!(u::DistributedField, f::Function)
+    if child_architecture(u) isa GPU
+        cpu_grid = on_architecture(CPU(), u.grid)
+        u_cpu = Field(location(u), cpu_grid; indices = indices(u))
+        f_field = field(location(u), f, cpu_grid)
+        set!(u_cpu, f_field)
+        set!(u, u_cpu)
+    elseif child_architecture(u) isa CPU
+        f_field = field(location(u), f, u.grid)
+        set!(u, f_field)
+    end
+
+    return u
+end
\ No newline at end of file
diff --git a/src/Models/HydrostaticFreeSurfaceModels/compute_vertically_integrated_variables.jl b/src/Models/HydrostaticFreeSurfaceModels/compute_vertically_integrated_variables.jl
index f1a6f918ec..f3bd5b2b27 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/compute_vertically_integrated_variables.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/compute_vertically_integrated_variables.jl
@@ -9,7 +9,6 @@ function compute_vertically_integrated_lateral_areas!(∫ᶻ_A)
     # 2 halos (instead of only 1) are necessary to accomodate the preconditioner
 
     field_grid = ∫ᶻ_A.xᶠᶜᶜ.grid
-    arch = architecture(field_grid)
 
     Axᶠᶜᶜ = GridMetricOperation((Face, Center, Center), Ax, field_grid)
     Ayᶜᶠᶜ = GridMetricOperation((Center, Face, Center), Ay, field_grid)

From 1db41bbe46c9ab7f9c550d50cbf4cea1fc6d2ff3 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 11 Sep 2023 21:54:55 -0400
Subject: [PATCH 505/530] Add cpu_architecture for DistributedArch and use it in set!

---
 src/Distributed/distributed_fields.jl  | 9 ++++-----
 src/Distributed/multi_architectures.jl | 5 +++++
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/Distributed/distributed_fields.jl b/src/Distributed/distributed_fields.jl
index 75da77c691..dc3ad6498f 100644
--- a/src/Distributed/distributed_fields.jl
+++ b/src/Distributed/distributed_fields.jl
@@ -14,19 +14,18 @@ function Field((LX, LY, LZ)::Tuple, grid::DistributedGrid, data, old_bcs, indice
     return Field{LX, LY, LZ}(grid, data, new_bcs, indices, op, status, buffers)
 end
 
-child_architecture(f::DistributedField) = child_architecture(architecture(f.grid))
-
 const DistributedField      = Field{<:Any, <:Any, <:Any, <:Any, <:DistributedGrid}
 const DistributedFieldTuple = NamedTuple{S, <:NTuple{N, DistributedField}} where {S, N}
 
 function set!(u::DistributedField, f::Function)
-    if child_architecture(u) isa GPU
-        cpu_grid = on_architecture(CPU(), u.grid)
+    arch = architecture(u)
+    if child_architecture(arch) isa GPU
+        cpu_grid = on_architecture(cpu_architecture(arch), u.grid)
         u_cpu = Field(location(u), cpu_grid; indices = indices(u))
         f_field = field(location(u), f, cpu_grid)
         set!(u_cpu, f_field)
         set!(u, u_cpu)
-    elseif child_architecture(u) isa CPU
+    elseif child_architecture(arch) isa CPU
         f_field = field(location(u), f, u.grid)
         set!(u, f_field)
     end
diff --git a/src/Distributed/multi_architectures.jl b/src/Distributed/multi_architectures.jl
index 07927decd1..56f07f4853 100644
--- a/src/Distributed/multi_architectures.jl
+++ b/src/Distributed/multi_architectures.jl
@@ -117,6 +117,11 @@ zeros(FT, arch::DistributedArch, N...)    = zeros(FT, child_architecture(arch),
 array_type(arch::DistributedArch)         = array_type(child_architecture(arch))
 sync_device!(arch::DistributedArch)       = sync_device!(arch.child_architecture)
 
+cpu_architecture(arch::DistributedArch{<:CPU}) = arch
+cpu_architecture(arch::DistributedArch{<:GPU}) = 
+    DistributedArch(CPU(), arch.local_rank, arch.local_index, arch.ranks, 
+                           arch.connectivity, arch.communicator, arch.mpi_requests, arch.tag)
+
 #####
 ##### Converting between index and MPI rank taking k as the fast index
 #####

From f2bc0086f022cb8ba547b094f231a33a3d877c54 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 11 Sep 2023 22:40:20 -0400
Subject: [PATCH 506/530] bugfix

---
 src/Distributed/multi_architectures.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Distributed/multi_architectures.jl b/src/Distributed/multi_architectures.jl
index 56f07f4853..63c1df9188 100644
--- a/src/Distributed/multi_architectures.jl
+++ b/src/Distributed/multi_architectures.jl
@@ -120,7 +120,7 @@ sync_device!(arch::DistributedArch)       = sync_device!(arch.child_architecture
 cpu_architecture(arch::DistributedArch{<:CPU}) = arch
 cpu_architecture(arch::DistributedArch{<:GPU}) = 
     DistributedArch(CPU(), arch.local_rank, arch.local_index, arch.ranks, 
-                           arch.connectivity, arch.communicator, arch.mpi_requests, arch.tag)
+                           arch.connectivity, arch.communicator, arch.mpi_requests, arch.mpi_tag)
 
 #####
 ##### Converting between index and MPI rank taking k as the fast index

From 0911063f6a5360367e572470f5c68d9dce60af1a Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 11 Sep 2023 23:27:49 -0400
Subject: [PATCH 507/530] bugfix

---
 src/Distributed/distributed_fields.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Distributed/distributed_fields.jl b/src/Distributed/distributed_fields.jl
index dc3ad6498f..cea4aecf2f 100644
--- a/src/Distributed/distributed_fields.jl
+++ b/src/Distributed/distributed_fields.jl
@@ -1,7 +1,7 @@
 import Oceananigans.Fields: Field, FieldBoundaryBuffers, location, set!
 import Oceananigans.BoundaryConditions: fill_halo_regions!
 
-using Oceananigans.Fields: validate_field_data, validate_boundary_conditions, validate_indices
+using Oceananigans.Fields: validate_field_data, indices, validate_boundary_conditions, validate_indices
 
 function Field((LX, LY, LZ)::Tuple, grid::DistributedGrid, data, old_bcs, indices::Tuple, op, status)
     arch = architecture(grid)

From e6608a60d73271ebff9928e25577650986ea49c8 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 18 Sep 2023 14:24:33 +0200
Subject: [PATCH 508/530] bugfix

---
 .../NonhydrostaticModels/update_hydrostatic_pressure.jl     | 6 +++---
 .../update_nonhydrostatic_model_state.jl                    | 4 +---
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
index 4f1ec1f602..b589596374 100644
--- a/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
+++ b/src/Models/NonhydrostaticModels/update_hydrostatic_pressure.jl
@@ -19,9 +19,9 @@ the `buoyancy_perturbationᶜᶜᶜ` downwards:
     end
 end
 
-update_hydrostatic_pressure!(model) = update_hydrostatic_pressure!(model.grid, model)
-update_hydrostatic_pressure!(::AbstractGrid{<:Any, <:Any, <:Any, <:Flat}, model) = nothing
-update_hydrostatic_pressure!(grid, model) = update_hydrostatic_pressure!(model.pressures.pHY′, model.architecture, model.grid, model.buoyancy, model.tracers)
+update_hydrostatic_pressure!(model; kwargs...) = update_hydrostatic_pressure!(model.grid, model; kwargs...)
+update_hydrostatic_pressure!(::AbstractGrid{<:Any, <:Any, <:Any, <:Flat}, model; kwargs...) = nothing
+update_hydrostatic_pressure!(grid, model; kwargs...) = update_hydrostatic_pressure!(model.pressures.pHY′, model.architecture, model.grid, model.buoyancy, model.tracers; kwargs...)
 
 # Partial cell "algorithm"
 const PCB = PartialCellBottom
diff --git a/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl b/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
index ec42d6d1c2..72963802a7 100644
--- a/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
+++ b/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
@@ -55,9 +55,7 @@ function compute_auxiliaries!(model::NonhydrostaticModel; p_parameters = tuple(p
 
     for (ppar, κpar) in zip(p_parameters, κ_parameters)
         calculate_diffusivities!(diffusivity, closure, model; parameters = κpar)
-        update_hydrostatic_pressure!(model.pressures.pHY′, architecture(grid), 
-                                     grid, model.buoyancy, model.tracers; 
-                                     parameters = ppar)
+        update_hydrostatic_pressure!(model; parameters = ppar)
     end
     return nothing
 end

From 38f2b877dc2319dc1303077ba99910b720637a05 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 18 Sep 2023 15:12:15 +0200
Subject: [PATCH 509/530] another bugfix

---
 .../calculate_nonhydrostatic_boundary_tendencies.jl         | 6 +++---
 .../update_nonhydrostatic_model_state.jl                    | 1 -
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_boundary_tendencies.jl b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_boundary_tendencies.jl
index 023f91028b..01671beb3c 100644
--- a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_boundary_tendencies.jl
+++ b/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_boundary_tendencies.jl
@@ -8,11 +8,11 @@ function compute_boundary_tendencies!(model::NonhydrostaticModel)
     grid = model.grid
     arch = architecture(grid)
 
-    pparameters = boundary_p_kernel_parameters(grid, arch)
-    κparameters = boundary_κ_kernel_parameters(grid, model.closure, arch)
+    p_parameters = boundary_p_kernel_parameters(grid, arch)
+    κ_parameters = boundary_κ_kernel_parameters(grid, model.closure, arch)
 
     # We need new values for `p` and `κ`
-    compute_auxiliaries!(model; pparameters, κparameters)
+    compute_auxiliaries!(model; p_parameters, κ_parameters)
 
     # parameters for communicating North / South / East / West side
     kernel_parameters = boundary_tendency_kernel_parameters(grid, arch)
diff --git a/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl b/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
index 72963802a7..27f1b43544 100644
--- a/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
+++ b/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
@@ -49,7 +49,6 @@ end
 function compute_auxiliaries!(model::NonhydrostaticModel; p_parameters = tuple(p_kernel_parameters(model.grid)),
                                                           κ_parameters = tuple(:xyz)) 
 
-    grid = model.grid
     closure = model.closure
     diffusivity = model.diffusivity_fields
 

From 7da9b59ac2ffe78f77be1a191ae426eb361ba8ec Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 18 Sep 2023 15:29:15 +0200
Subject: [PATCH 510/530] compute_diffusivities!

---
 .../HydrostaticFreeSurfaceModels/single_column_model_mode.jl  | 4 ++--
 .../HydrostaticFreeSurfaceModels/slice_ensemble_model_mode.jl | 2 +-
 .../update_hydrostatic_free_surface_model_state.jl            | 4 ++--
 .../NonhydrostaticModels/update_nonhydrostatic_model_state.jl | 4 ++--
 .../ShallowWaterModels/shallow_water_diffusion_operators.jl   | 4 ++--
 src/Models/ShallowWaterModels/update_shallow_water_state.jl   | 2 +-
 src/TurbulenceClosures/TurbulenceClosures.jl                  | 4 ++--
 src/TurbulenceClosures/closure_tuples.jl                      | 4 ++--
 .../CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl  | 4 ++--
 .../anisotropic_minimum_dissipation.jl                        | 2 +-
 .../convective_adjustment_vertical_diffusivity.jl             | 2 +-
 .../isopycnal_skew_symmetric_diffusivity.jl                   | 2 +-
 .../leith_enstrophy_diffusivity.jl                            | 2 +-
 .../mews_vertical_diffusivity.jl                              | 4 ++--
 .../turbulence_closure_implementations/nothing_closure.jl     | 4 ++--
 .../ri_based_vertical_diffusivity.jl                          | 2 +-
 .../scalar_biharmonic_diffusivity.jl                          | 2 +-
 .../turbulence_closure_implementations/scalar_diffusivity.jl  | 4 ++--
 .../turbulence_closure_implementations/smagorinsky_lilly.jl   | 2 +-
 19 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/single_column_model_mode.jl b/src/Models/HydrostaticFreeSurfaceModels/single_column_model_mode.jl
index c233f00eb1..bf0472e082 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/single_column_model_mode.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/single_column_model_mode.jl
@@ -9,7 +9,7 @@ using Oceananigans.TurbulenceClosures.CATKEVerticalDiffusivities: CATKEVDArray
 
 import Oceananigans.Grids: validate_size, validate_halo
 import Oceananigans.BoundaryConditions: fill_halo_regions!
-import Oceananigans.TurbulenceClosures: time_discretization, calculate_diffusivities!
+import Oceananigans.TurbulenceClosures: time_discretization, compute_diffusivities!
 import Oceananigans.TurbulenceClosures: ∂ⱼ_τ₁ⱼ, ∂ⱼ_τ₂ⱼ, ∇_dot_qᶜ
 import Oceananigans.Coriolis: x_f_cross_U, y_f_cross_U, z_f_cross_U
 
@@ -65,7 +65,7 @@ function update_state!(model::HydrostaticFreeSurfaceModel, grid::SingleColumnGri
     compute_auxiliary_fields!(model.auxiliary_fields)
 
     # Calculate diffusivities
-    calculate_diffusivities!(model.diffusivity_fields, model.closure, model)
+    compute_diffusivities!(model.diffusivity_fields, model.closure, model)
 
     fill_halo_regions!(model.diffusivity_fields, model.clock, fields(model))
 
diff --git a/src/Models/HydrostaticFreeSurfaceModels/slice_ensemble_model_mode.jl b/src/Models/HydrostaticFreeSurfaceModels/slice_ensemble_model_mode.jl
index 4c997f8ec9..2da45da325 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/slice_ensemble_model_mode.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/slice_ensemble_model_mode.jl
@@ -3,7 +3,7 @@ using Oceananigans.TurbulenceClosures: AbstractTurbulenceClosure
 using Oceananigans.TurbulenceClosures.CATKEVerticalDiffusivities: _top_tke_flux, CATKEVDArray
 
 import Oceananigans.Grids: validate_size, validate_halo, HRegRectilinearGrid
-import Oceananigans.TurbulenceClosures: time_discretization, calculate_diffusivities!, with_tracers
+import Oceananigans.TurbulenceClosures: time_discretization, compute_diffusivities!, with_tracers
 import Oceananigans.TurbulenceClosures: ∂ⱼ_τ₁ⱼ, ∂ⱼ_τ₂ⱼ, ∂ⱼ_τ₃ⱼ, ∇_dot_qᶜ
 import Oceananigans.TurbulenceClosures.CATKEVerticalDiffusivities: top_tke_flux
 import Oceananigans.Coriolis: x_f_cross_U, y_f_cross_U, z_f_cross_U
diff --git a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
index 7b17878ded..388aed6505 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/update_hydrostatic_free_surface_model_state.jl
@@ -3,7 +3,7 @@ using Oceananigans.BoundaryConditions
 
 using Oceananigans: UpdateStateCallsite
 using Oceananigans.Biogeochemistry: update_biogeochemical_state!
-using Oceananigans.TurbulenceClosures: calculate_diffusivities!
+using Oceananigans.TurbulenceClosures: compute_diffusivities!
 using Oceananigans.ImmersedBoundaries: mask_immersed_field!, mask_immersed_field_xy!, inactive_node
 using Oceananigans.Models.NonhydrostaticModels: update_hydrostatic_pressure!, p_kernel_parameters
 using Oceananigans.Fields: replace_horizontal_velocity_halos!
@@ -71,7 +71,7 @@ function compute_auxiliaries!(model::HydrostaticFreeSurfaceModel; w_parameters =
 
     for (wpar, ppar, κpar) in zip(w_parameters, p_parameters, κ_parameters)
         compute_w_from_continuity!(model; parameters = wpar)
-        calculate_diffusivities!(diffusivity, closure, model; parameters = κpar)
+        compute_diffusivities!(diffusivity, closure, model; parameters = κpar)
         update_hydrostatic_pressure!(model.pressure.pHY′, architecture(grid), 
                                     grid, model.buoyancy, model.tracers; 
                                     parameters = ppar)
diff --git a/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl b/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
index 27f1b43544..cdf4a850ba 100644
--- a/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
+++ b/src/Models/NonhydrostaticModels/update_nonhydrostatic_model_state.jl
@@ -2,7 +2,7 @@ using Oceananigans: UpdateStateCallsite
 using Oceananigans.Architectures
 using Oceananigans.BoundaryConditions
 using Oceananigans.Biogeochemistry: update_biogeochemical_state!
-using Oceananigans.TurbulenceClosures: calculate_diffusivities!
+using Oceananigans.TurbulenceClosures: compute_diffusivities!
 using Oceananigans.Fields: compute!
 using Oceananigans.ImmersedBoundaries: mask_immersed_field!
 
@@ -53,7 +53,7 @@ function compute_auxiliaries!(model::NonhydrostaticModel; p_parameters = tuple(p
     diffusivity = model.diffusivity_fields
 
     for (ppar, κpar) in zip(p_parameters, κ_parameters)
-        calculate_diffusivities!(diffusivity, closure, model; parameters = κpar)
+        compute_diffusivities!(diffusivity, closure, model; parameters = κpar)
         update_hydrostatic_pressure!(model; parameters = ppar)
     end
     return nothing
diff --git a/src/Models/ShallowWaterModels/shallow_water_diffusion_operators.jl b/src/Models/ShallowWaterModels/shallow_water_diffusion_operators.jl
index c8dd1d19f0..c3f2f82729 100644
--- a/src/Models/ShallowWaterModels/shallow_water_diffusion_operators.jl
+++ b/src/Models/ShallowWaterModels/shallow_water_diffusion_operators.jl
@@ -13,7 +13,7 @@ using Oceananigans.TurbulenceClosures:
 
 import Oceananigans.TurbulenceClosures:
                         DiffusivityFields,
-                        calculate_diffusivities!,
+                        compute_diffusivities!,
                         viscosity,
                         with_tracers,
                         νᶜᶜᶜ
@@ -59,7 +59,7 @@ Adapt.adapt_structure(to, closure::ShallowWaterScalarDiffusivity{B}) where B =
     νₑ[i, j, k] = fields.h[i, j, k] * νᶜᶜᶜ(i, j, k, grid, viscosity_location(closure), closure.ν, clock, fields)
 end
 
-function calculate_diffusivities!(diffusivity_fields, closure::ShallowWaterScalarDiffusivity, model)
+function compute_diffusivities!(diffusivity_fields, closure::ShallowWaterScalarDiffusivity, model)
 
     arch  = model.architecture
     grid  = model.grid
diff --git a/src/Models/ShallowWaterModels/update_shallow_water_state.jl b/src/Models/ShallowWaterModels/update_shallow_water_state.jl
index 6d1d0849dc..52759f020b 100644
--- a/src/Models/ShallowWaterModels/update_shallow_water_state.jl
+++ b/src/Models/ShallowWaterModels/update_shallow_water_state.jl
@@ -14,7 +14,7 @@ function update_state!(model::ShallowWaterModel, callbacks=[]; compute_tendencie
     # Mask immersed fields
     foreach(mask_immersed_field!, model.solution)
 
-    calculate_diffusivities!(model.diffusivity_fields, model.closure, model)
+    compute_diffusivities!(model.diffusivity_fields, model.closure, model)
 
     # Fill halos for velocities and tracers
     fill_halo_regions!(merge(model.solution, model.tracers), model.clock, fields(model))
diff --git a/src/TurbulenceClosures/TurbulenceClosures.jl b/src/TurbulenceClosures/TurbulenceClosures.jl
index 10fee681fd..aa9b253201 100644
--- a/src/TurbulenceClosures/TurbulenceClosures.jl
+++ b/src/TurbulenceClosures/TurbulenceClosures.jl
@@ -22,7 +22,7 @@ export
     VerticallyImplicitTimeDiscretization,
 
     DiffusivityFields,
-    calculate_diffusivities!,
+    compute_diffusivities!,
 
     viscosity, diffusivity,
 
@@ -68,7 +68,7 @@ abstract type AbstractTurbulenceClosure{TimeDiscretization, RequiredHalo} end
 validate_closure(closure) = closure
 closure_summary(closure) = summary(closure)
 with_tracers(tracers, closure::AbstractTurbulenceClosure) = closure
-calculate_diffusivities!(K, closure::AbstractTurbulenceClosure, args...; kwargs...) = nothing
+compute_diffusivities!(K, closure::AbstractTurbulenceClosure, args...; kwargs...) = nothing
  
 # The required halo size to calculate diffusivities. Take care that if the diffusivity can
 # be calculated from local information, still `B = 1`, because we need at least one additional
diff --git a/src/TurbulenceClosures/closure_tuples.jl b/src/TurbulenceClosures/closure_tuples.jl
index edf37db41c..6321800b14 100644
--- a/src/TurbulenceClosures/closure_tuples.jl
+++ b/src/TurbulenceClosures/closure_tuples.jl
@@ -73,10 +73,10 @@ end
 
 with_tracers(tracers, closure_tuple::Tuple) = Tuple(with_tracers(tracers, closure) for closure in closure_tuple)
 
-function calculate_diffusivities!(diffusivity_fields_tuple, closure_tuple::Tuple, args...; kwargs...)
+function compute_diffusivities!(diffusivity_fields_tuple, closure_tuple::Tuple, args...; kwargs...)
     for (α, closure) in enumerate(closure_tuple)
         diffusivity_fields = diffusivity_fields_tuple[α]
-        calculate_diffusivities!(diffusivity_fields, closure, args...; kwargs...)
+        compute_diffusivities!(diffusivity_fields, closure, args...; kwargs...)
     end
     return nothing
 end
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
index ce2f443c04..71d27e6719 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/CATKEVerticalDiffusivities/CATKEVerticalDiffusivities.jl
@@ -34,7 +34,7 @@ import Oceananigans.TurbulenceClosures:
     buoyancy_flux,
     dissipation,
     add_closure_specific_boundary_conditions,
-    calculate_diffusivities!,
+    compute_diffusivities!,
     DiffusivityFields,
     implicit_linear_coefficient,
     viscosity,
@@ -237,7 +237,7 @@ const f = Face()
 
 @inline clip(x) = max(zero(x), x)
 
-function calculate_diffusivities!(diffusivities, closure::FlavorOfCATKE, model; parameters = :xyz)
+function compute_diffusivities!(diffusivities, closure::FlavorOfCATKE, model; parameters = :xyz)
 
     arch = model.architecture
     grid = model.grid
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl b/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl
index 6e07c57d71..390c96555b 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/anisotropic_minimum_dissipation.jl
@@ -188,7 +188,7 @@ end
     @inbounds κₑ[i, j, k] = max(zero(FT), κˢᵍˢ)
 end
 
-function calculate_diffusivities!(diffusivity_fields, closure::AnisotropicMinimumDissipation, model; parameters = :xyz)
+function compute_diffusivities!(diffusivity_fields, closure::AnisotropicMinimumDissipation, model; parameters = :xyz)
     grid = model.grid
     arch = model.architecture
     velocities = model.velocities
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/convective_adjustment_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/convective_adjustment_vertical_diffusivity.jl
index a7334841ef..f8da912926 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/convective_adjustment_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/convective_adjustment_vertical_diffusivity.jl
@@ -88,7 +88,7 @@ DiffusivityFields(grid, tracer_names, bcs, closure::FlavorOfCAVD) = (; κᶜ = Z
 @inline viscosity(::FlavorOfCAVD, diffusivities) = diffusivities.κᵘ
 @inline diffusivity(::FlavorOfCAVD, diffusivities, id) = diffusivities.κᶜ
 
-function calculate_diffusivities!(diffusivities, closure::FlavorOfCAVD, model; parameters = :xyz)
+function compute_diffusivities!(diffusivities, closure::FlavorOfCAVD, model; parameters = :xyz)
 
     arch = model.architecture
     grid = model.grid
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/isopycnal_skew_symmetric_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/isopycnal_skew_symmetric_diffusivity.jl
index 6a1c4b0f70..8f10676580 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/isopycnal_skew_symmetric_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/isopycnal_skew_symmetric_diffusivity.jl
@@ -80,7 +80,7 @@ function DiffusivityFields(grid, tracer_names, bcs, closure::FlavorOfISSD{TD}) w
     end
 end
 
-function calculate_diffusivities!(diffusivities, closure::FlavorOfISSD, model; parameters = :xyz)
+function compute_diffusivities!(diffusivities, closure::FlavorOfISSD, model; parameters = :xyz)
 
     arch = model.architecture
     grid = model.grid
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl
index 0cff3fbc0b..c38e5707dc 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/leith_enstrophy_diffusivity.jl
@@ -92,7 +92,7 @@ end
     @inbounds νₑ[i, j, k] = prefactor * dynamic_ν
 end
 
-function calculate_diffusivities!(diffusivity_fields, closure::TwoDimensionalLeith, model; parameters = :xyz)
+function compute_diffusivities!(diffusivity_fields, closure::TwoDimensionalLeith, model; parameters = :xyz)
     arch = model.architecture
     grid = model.grid
     velocities = model.velocities
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/mews_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/mews_vertical_diffusivity.jl
index 27806fc60c..529f3bc4ef 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/mews_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/mews_vertical_diffusivity.jl
@@ -32,7 +32,7 @@ import Oceananigans.TurbulenceClosures:
     dissipation,
     validate_closure,
     add_closure_specific_boundary_conditions,
-    calculate_diffusivities!,
+    compute_diffusivities!,
     DiffusivityFields,
     implicit_linear_coefficient,
     viscosity,
@@ -115,7 +115,7 @@ function Base.show(io::IO, closure::MEWS)
                  "    Cᴰ  : ", closure.Cᴰ))
 end
 
-function calculate_diffusivities!(diffusivities, closure::MEWS, model; parameters = :xyz)
+function compute_diffusivities!(diffusivities, closure::MEWS, model; parameters = :xyz)
     arch = model.architecture
     grid = model.grid
     clock = model.clock
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/nothing_closure.jl b/src/TurbulenceClosures/turbulence_closure_implementations/nothing_closure.jl
index ea98c72f45..1f3f63a8a1 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/nothing_closure.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/nothing_closure.jl
@@ -3,8 +3,8 @@
 @inline ∂ⱼ_τ₂ⱼ(i, j, k, grid::AbstractGrid{FT}, ::Nothing, args...) where FT = zero(FT)
 @inline ∂ⱼ_τ₃ⱼ(i, j, k, grid::AbstractGrid{FT}, ::Nothing, args...) where FT = zero(FT)
 
-calculate_diffusivities!(diffusivities, ::Nothing, args...; kwargs...) = nothing
-calculate_diffusivities!(::Nothing, ::Nothing, args...; kwargs...) = nothing
+compute_diffusivities!(diffusivities, ::Nothing, args...; kwargs...) = nothing
+compute_diffusivities!(::Nothing, ::Nothing, args...; kwargs...) = nothing
 
 @inline viscosity(::Nothing, ::Nothing) = 0
 @inline diffusivity(::Nothing, ::Nothing, ::Val{id}) where id = 0
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
index f73b3e6f34..16dc985252 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/ri_based_vertical_diffusivity.jl
@@ -142,7 +142,7 @@ function DiffusivityFields(grid, tracer_names, bcs, closure::FlavorOfRBVD)
     return (; κᶜ, κᵘ, Ri)
 end
 
-function calculate_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; parameters = :xyz)
+function compute_diffusivities!(diffusivities, closure::FlavorOfRBVD, model; parameters = :xyz)
     arch = model.architecture
     grid = model.grid
     clock = model.clock
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/scalar_biharmonic_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/scalar_biharmonic_diffusivity.jl
index e30ed05308..0f46880b83 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/scalar_biharmonic_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/scalar_biharmonic_diffusivity.jl
@@ -87,7 +87,7 @@ end
 @inline viscosity(closure::ScalarBiharmonicDiffusivity, K) = closure.ν
 @inline diffusivity(closure::ScalarBiharmonicDiffusivity, K, ::Val{id}) where id = closure.κ[id]
 
-calculate_diffusivities!(diffusivities, closure::ScalarBiharmonicDiffusivity, args...) = nothing
+compute_diffusivities!(diffusivities, closure::ScalarBiharmonicDiffusivity, args...) = nothing
 
 function Base.summary(closure::ScalarBiharmonicDiffusivity)
     F = summary(formulation(closure))
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/scalar_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/scalar_diffusivity.jl
index 88c3200f5e..e9bc88c360 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/scalar_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/scalar_diffusivity.jl
@@ -172,10 +172,10 @@ end
 @inline viscosity(closure::ScalarDiffusivity, K) = closure.ν
 @inline diffusivity(closure::ScalarDiffusivity, K, ::Val{id}) where id = closure.κ[id]
 
-calculate_diffusivities!(diffusivities, ::ScalarDiffusivity, args...) = nothing
+compute_diffusivities!(diffusivities, ::ScalarDiffusivity, args...) = nothing
 
 # Note: we could compute ν and κ (if they are Field):
-# function calculate_diffusivities!(diffusivities, closure::ScalarDiffusivity, args...)
+# function compute_diffusivities!(diffusivities, closure::ScalarDiffusivity, args...)
 #     compute!(viscosity(closure, diffusivities))
 #     !isnothing(closure.κ) && Tuple(compute!(diffusivity(closure, Val(c), diffusivities) for c=1:length(closure.κ)))
 #     return nothing
diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl b/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl
index c613ccb13c..83d67e381b 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/smagorinsky_lilly.jl
@@ -119,7 +119,7 @@ end
     @inbounds νₑ[i, j, k] = ς * (C * Δᶠ)^2 * sqrt(2Σ²)
 end
 
-function calculate_diffusivities!(diffusivity_fields, closure::SmagorinskyLilly, model; parameters = :xyz)
+function compute_diffusivities!(diffusivity_fields, closure::SmagorinskyLilly, model; parameters = :xyz)
     arch = model.architecture
     grid = model.grid
     buoyancy = model.buoyancy

From dafa13c607277db5e307717b0dd7f799d97258ae Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 18 Sep 2023 15:31:25 +0200
Subject: [PATCH 511/530] required halo size

---
 .../isopycnal_skew_symmetric_diffusivity.jl     | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/isopycnal_skew_symmetric_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/isopycnal_skew_symmetric_diffusivity.jl
index 8f10676580..02430370f4 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/isopycnal_skew_symmetric_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/isopycnal_skew_symmetric_diffusivity.jl
@@ -1,15 +1,15 @@
-struct IsopycnalSkewSymmetricDiffusivity{TD, K, S, M, L} <: AbstractTurbulenceClosure{TD, 1}
+struct IsopycnalSkewSymmetricDiffusivity{TD, K, S, M, L, N} <: AbstractTurbulenceClosure{TD, N}
                     κ_skew :: K
                κ_symmetric :: S
           isopycnal_tensor :: M
              slope_limiter :: L
     
-    function IsopycnalSkewSymmetricDiffusivity{TD}(κ_skew :: K,
-                                                   κ_symmetric :: S,
-                                                   isopycnal_tensor :: I,
-                                                   slope_limiter :: L) where {TD, K, S, I, L}
+    function IsopycnalSkewSymmetricDiffusivity{TD, N}(κ_skew :: K,
+                                                      κ_symmetric :: S,
+                                                      isopycnal_tensor :: I,
+                                                      slope_limiter :: L) where {TD, K, S, I, L, N}
 
-        return new{TD, K, S, I, L}(κ_skew, κ_symmetric, isopycnal_tensor, slope_limiter)
+        return new{TD, K, S, I, L, N}(κ_skew, κ_symmetric, isopycnal_tensor, slope_limiter)
     end
 end
 
@@ -36,12 +36,13 @@ function IsopycnalSkewSymmetricDiffusivity(time_disc::TD = VerticallyImplicitTim
                                            κ_skew = 0,
                                            κ_symmetric = 0,
                                            isopycnal_tensor = SmallSlopeIsopycnalTensor(),
-                                           slope_limiter = FluxTapering(1e-2)) where TD
+                                           slope_limiter = FluxTapering(1e-2),
+                                           required_halo_size = 1) where TD
 
     isopycnal_tensor isa SmallSlopeIsopycnalTensor ||
         error("Only isopycnal_tensor=SmallSlopeIsopycnalTensor() is currently supported.")
 
-    return IsopycnalSkewSymmetricDiffusivity{TD}(convert_diffusivity(FT, κ_skew),
+    return IsopycnalSkewSymmetricDiffusivity{TD, required_halo_size}(convert_diffusivity(FT, κ_skew),
                                                  convert_diffusivity(FT, κ_symmetric),
                                                  isopycnal_tensor,
                                                  slope_limiter)

From 56892eb5ccbeb1dbde57c33a2e62299c3021d780 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Mon, 18 Sep 2023 16:09:20 +0200
Subject: [PATCH 512/530] all fixed

---
 .../isopycnal_skew_symmetric_diffusivity.jl                   | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/TurbulenceClosures/turbulence_closure_implementations/isopycnal_skew_symmetric_diffusivity.jl b/src/TurbulenceClosures/turbulence_closure_implementations/isopycnal_skew_symmetric_diffusivity.jl
index 02430370f4..ed63c418c3 100644
--- a/src/TurbulenceClosures/turbulence_closure_implementations/isopycnal_skew_symmetric_diffusivity.jl
+++ b/src/TurbulenceClosures/turbulence_closure_implementations/isopycnal_skew_symmetric_diffusivity.jl
@@ -51,10 +51,10 @@ end
 IsopycnalSkewSymmetricDiffusivity(FT::DataType; kw...) = 
     IsopycnalSkewSymmetricDiffusivity(VerticallyImplicitTimeDiscretization(), FT; kw...)
 
-function with_tracers(tracers, closure::ISSD{TD}) where TD
+function with_tracers(tracers, closure::ISSD{TD, N}) where {TD, N}
     κ_skew = !isa(closure.κ_skew, NamedTuple) ? closure.κ_skew : tracer_diffusivities(tracers, closure.κ_skew)
     κ_symmetric = !isa(closure.κ_symmetric, NamedTuple) ? closure.κ_symmetric : tracer_diffusivities(tracers, closure.κ_symmetric)
-    return IsopycnalSkewSymmetricDiffusivity{TD}(κ_skew, κ_symmetric, closure.isopycnal_tensor, closure.slope_limiter)
+    return IsopycnalSkewSymmetricDiffusivity{TD, N}(κ_skew, κ_symmetric, closure.isopycnal_tensor, closure.slope_limiter)
 end
 
 # For ensembles of closures

From bf927aef93bea95c4fe0660454c69a20a5bd87d1 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 19 Sep 2023 12:23:42 +0200
Subject: [PATCH 513/530] shorten line

---
 src/Advection/vector_invariant_advection.jl | 45 +++++++++++----------
 1 file changed, 23 insertions(+), 22 deletions(-)

diff --git a/src/Advection/vector_invariant_advection.jl b/src/Advection/vector_invariant_advection.jl
index ffc0d45454..48359babee 100644
--- a/src/Advection/vector_invariant_advection.jl
+++ b/src/Advection/vector_invariant_advection.jl
@@ -301,29 +301,30 @@ const CX{N} = Centered{N, <:Any, <:Nothing}
 const CY{N} = Centered{N, <:Any, <:Any, <:Nothing}
 const CZ{N} = Centered{N, <:Any, <:Any, <:Any, <:Nothing}
 
+const AS = AbstractSmoothnessStencil
+
 # To adapt passing smoothness stencils to upwind biased schemes and centered schemes (not weno) 
-for buffer in 1:6
+for b in 1:6
     @eval begin
-        @inline inner_symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::C{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme::C{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_symmetric_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme::C{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_symmetric_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, f, idx, loc, args...)
-
-        @inline inner_symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::CX{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme::CY{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_symmetric_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme::CZ{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_symmetric_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, f, idx, loc, args...)
-
-        @inline inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::U{$buffer},  f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::UX{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme::U{$buffer},  f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme::UY{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_left_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme::U{$buffer},  f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_left_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_left_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme::UZ{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_left_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, f, idx, loc, args...)
-
-        @inline inner_right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::U{$buffer},  f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme::UX{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme::U{$buffer},  f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme::UY{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_right_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme::U{$buffer},  f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_right_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, f, idx, loc, args...)
-        @inline inner_right_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme::UZ{$buffer}, f::Function, idx, loc, ::AbstractSmoothnessStencil, args...) = inner_right_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, scheme, f, idx, loc, args...)
+        @inline inner_symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, s::C{$b},  f::Function, idx, loc, ::AS, args...) = inner_symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, s, f, idx, loc, args...)
+        @inline inner_symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, s::C{$b},  f::Function, idx, loc, ::AS, args...) = inner_symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, s, f, idx, loc, args...)
+        @inline inner_symmetric_interpolate_zᵃᵃᶠ(i, j, k, grid, s::C{$b},  f::Function, idx, loc, ::AS, args...) = inner_symmetric_interpolate_zᵃᵃᶠ(i, j, k, grid, s, f, idx, loc, args...)
+        @inline inner_symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, s::CX{$b}, f::Function, idx, loc, ::AS, args...) = inner_symmetric_interpolate_xᶠᵃᵃ(i, j, k, grid, s, f, idx, loc, args...)
+        @inline inner_symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, s::CY{$b}, f::Function, idx, loc, ::AS, args...) = inner_symmetric_interpolate_yᵃᶠᵃ(i, j, k, grid, s, f, idx, loc, args...)
+        @inline inner_symmetric_interpolate_zᵃᵃᶠ(i, j, k, grid, s::CZ{$b}, f::Function, idx, loc, ::AS, args...) = inner_symmetric_interpolate_zᵃᵃᶠ(i, j, k, grid, s, f, idx, loc, args...)
+
+        @inline inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, s::U{$b},  f::Function, idx, loc, ::AS, args...) = inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, s, f, idx, loc, args...)
+        @inline inner_left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, s::U{$b},  f::Function, idx, loc, ::AS, args...) = inner_left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, s, f, idx, loc, args...)
+        @inline inner_left_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, s::U{$b},  f::Function, idx, loc, ::AS, args...) = inner_left_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, s, f, idx, loc, args...)
+        @inline inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, s::UX{$b}, f::Function, idx, loc, ::AS, args...) = inner_left_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, s, f, idx, loc, args...)
+        @inline inner_left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, s::UY{$b}, f::Function, idx, loc, ::AS, args...) = inner_left_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, s, f, idx, loc, args...)
+        @inline inner_left_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, s::UZ{$b}, f::Function, idx, loc, ::AS, args...) = inner_left_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, s, f, idx, loc, args...)
+
+        @inline inner_right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, s::U{$b},  f::Function, idx, loc, ::AS, args...) = inner_right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, s, f, idx, loc, args...)
+        @inline inner_right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, s::U{$b},  f::Function, idx, loc, ::AS, args...) = inner_right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, s, f, idx, loc, args...)
+        @inline inner_right_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, s::U{$b},  f::Function, idx, loc, ::AS, args...) = inner_right_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, s, f, idx, loc, args...)
+        @inline inner_right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, s::UX{$b}, f::Function, idx, loc, ::AS, args...) = inner_right_biased_interpolate_xᶠᵃᵃ(i, j, k, grid, s, f, idx, loc, args...)
+        @inline inner_right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, s::UY{$b}, f::Function, idx, loc, ::AS, args...) = inner_right_biased_interpolate_yᵃᶠᵃ(i, j, k, grid, s, f, idx, loc, args...)
+        @inline inner_right_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, s::UZ{$b}, f::Function, idx, loc, ::AS, args...) = inner_right_biased_interpolate_zᵃᵃᶠ(i, j, k, grid, s, f, idx, loc, args...)
     end
 end

From bac7f4e03843adb04766191772962928e56b68c8 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 19 Sep 2023 12:26:49 +0200
Subject: [PATCH 514/530] fix comment

---
 src/Architectures.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Architectures.jl b/src/Architectures.jl
index 581ea7943c..b555c18736 100644
--- a/src/Architectures.jl
+++ b/src/Architectures.jl
@@ -93,7 +93,7 @@ function unified_array(::GPU, arr::AbstractArray)
     return vec
 end
 
-## Only for contiguous data!! (i.e. only if the offset for pointer(dst::CuArray, offset::Int) is 1)
+## GPU to GPU copy of contiguous data
 @inline function device_copy_to!(dst::CuArray, src::CuArray; async::Bool = false) 
     n = length(src)
     context!(context(src)) do

From d48d1c93a3014f507ce42fdd8acf4ae464fb9ed8 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 19 Sep 2023 12:27:53 +0200
Subject: [PATCH 515/530] remove abbreviation

---
 src/Distributed/Distributed.jl                                  | 2 +-
 ..._and_comp.jl => interleave_communication_and_computation.jl} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename src/Distributed/{interleave_comm_and_comp.jl => interleave_communication_and_computation.jl} (100%)

diff --git a/src/Distributed/Distributed.jl b/src/Distributed/Distributed.jl
index 9caf1037e3..da5a780a60 100644
--- a/src/Distributed/Distributed.jl
+++ b/src/Distributed/Distributed.jl
@@ -18,6 +18,6 @@ include("halo_communication_bcs.jl")
 include("distributed_fields.jl")
 include("halo_communication.jl")
 include("distributed_fft_based_poisson_solver.jl")
-include("interleave_comm_and_comp.jl")
+include("interleave_communication_and_computation.jl")
 
 end # module
diff --git a/src/Distributed/interleave_comm_and_comp.jl b/src/Distributed/interleave_communication_and_computation.jl
similarity index 100%
rename from src/Distributed/interleave_comm_and_comp.jl
rename to src/Distributed/interleave_communication_and_computation.jl

From 36794217c9383c63fc36154b28bb00c7e2f6263f Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 19 Sep 2023 12:34:25 +0200
Subject: [PATCH 516/530] remove unused functions

---
 src/Distributed/halo_communication.jl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index decce72e03..7dd284e612 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -1,6 +1,5 @@
 using KernelAbstractions: @kernel, @index
 using OffsetArrays: OffsetArray
-using CUDA: cuStreamGetFlags, stream, priority_range, CUstream_flags_enum, CuStream, stream!
 
 using Oceananigans.Fields: fill_send_buffers!,
                            recv_from_buffers!, 

From 92739b02ad38e9cfb3ddbb54923cea22a996dbe6 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 19 Sep 2023 12:36:04 +0200
Subject: [PATCH 517/530] better explanation of the MPI tag

---
 src/Distributed/halo_communication.jl | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index 7dd284e612..eef623e716 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -43,10 +43,8 @@ opposite_side = Dict(
 # Define functions that return unique send and recv MPI tags for each side.
 # It's an integer where
 #   digit 1-2: an identifier for the field that is reset each timestep
-#   digit 3: an identifier for the field's location 
-#   digit 4: the side
-#   digits 5-6: the "from" rank
-#   digits 7-8: the "to" rank
+#   digit 3: an identifier for the field's Z-location
+#   digit 4: the side we send to/receive from
 
 ID_DIGITS   = 2
 

From eab6dde6071f3f2fe8dca2c37e885031d163632e Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Tue, 19 Sep 2023 12:42:31 +0200
Subject: [PATCH 518/530] Update src/ImmersedBoundaries/active_cells_map.jl

Co-authored-by: Navid C. Constantinou <navidcy@users.noreply.github.com>
---
 src/ImmersedBoundaries/active_cells_map.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ImmersedBoundaries/active_cells_map.jl b/src/ImmersedBoundaries/active_cells_map.jl
index cd55827785..5322ee6c00 100644
--- a/src/ImmersedBoundaries/active_cells_map.jl
+++ b/src/ImmersedBoundaries/active_cells_map.jl
@@ -53,7 +53,7 @@ function compute_active_cells_interior(ibg)
 end
 
 function compute_active_cells_surface(ibg)
-    one_field = ConditionalOperation{Center, Center, Center}(OneField(Int), identity, ibg, NotImmersed(truefunc), 0.0)
+    one_field = ConditionalOperation{Center, Center, Center}(OneField(Int), identity, ibg, NotImmersed(truefunc), 0)
     column    = sum(one_field, dims = 3)
     is_immersed_column = KernelFunctionOperation{Center, Center, Nothing}(active_column, ibg, column)
     active_cells_field = Field{Center, Center, Nothing}(ibg, Bool)

From 3bbcdcd9918186499e188aa545adf1f3e7925f13 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <silvestri.simone0@gmail.com>
Date: Tue, 19 Sep 2023 12:43:13 +0200
Subject: [PATCH 519/530] Update src/Solvers/batched_tridiagonal_solver.jl

Co-authored-by: Navid C. Constantinou <navidcy@users.noreply.github.com>
---
 src/Solvers/batched_tridiagonal_solver.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Solvers/batched_tridiagonal_solver.jl b/src/Solvers/batched_tridiagonal_solver.jl
index 2afa8e1adf..a188d0fc89 100644
--- a/src/Solvers/batched_tridiagonal_solver.jl
+++ b/src/Solvers/batched_tridiagonal_solver.jl
@@ -214,4 +214,4 @@ end
             ϕ[i, j, k] -= t[i, j, k+1] * ϕ[i, j, k+1]
         end
     end
-end
\ No newline at end of file
+end

From 4259130c01147f8df307206de5d56efc41bbd620 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 19 Sep 2023 12:45:54 +0200
Subject: [PATCH 520/530] change name

---
 .../distributed_nonhydrostatic_model_mpi.jl   |  2 +-
 .../distributed_shallow_water_model_mpi.jl    |  2 +-
 src/Distributed/Distributed.jl                |  2 +-
 .../distributed_fft_based_poisson_solver.jl   |  4 +--
 src/Distributed/distributed_grids.jl          | 22 ++++++-------
 .../distributed_kernel_launching.jl           |  2 +-
 src/Distributed/halo_communication.jl         |  8 ++---
 ...nterleave_communication_and_computation.jl |  8 ++---
 src/Distributed/multi_architectures.jl        | 33 ++++++++++---------
 src/Distributed/partition_assemble.jl         | 12 +++----
 ...distributed_split_explicit_free_surface.jl |  4 +--
 .../hydrostatic_free_surface_model.jl         |  4 +--
 .../NonhydrostaticModels.jl                   |  4 +--
 .../nonhydrostatic_model.jl                   |  2 +-
 src/OutputWriters/output_writer_utils.jl      |  4 +--
 test/test_distributed_models.jl               | 30 ++++++++---------
 test/test_distributed_poisson_solvers.jl      |  2 +-
 .../mpi_geostrophic_adjustment.jl             |  2 +-
 .../mpi_hydrostatic_turbulence.jl             |  2 +-
 ...nhydrostatic_two_dimensional_turbulence.jl |  2 +-
 .../mpi_output_writing.jl                     |  2 +-
 validation/distributed_simulations/mpi_set.jl |  2 +-
 .../mpi_shallow_water_turbulence.jl           |  2 +-
 23 files changed, 80 insertions(+), 77 deletions(-)

diff --git a/benchmark/distributed_nonhydrostatic_model_mpi.jl b/benchmark/distributed_nonhydrostatic_model_mpi.jl
index 1e5b120d23..37411164e2 100644
--- a/benchmark/distributed_nonhydrostatic_model_mpi.jl
+++ b/benchmark/distributed_nonhydrostatic_model_mpi.jl
@@ -28,7 +28,7 @@ local_rank = MPI.Comm_rank(comm)
 @info "Setting up distributed nonhydrostatic model with N=($Nx, $Ny, $Nz) grid points and ranks=($Rx, $Ry, $Rz) on rank $local_rank..."
 
 topo = (Periodic, Periodic, Periodic)
-arch = DistributedArch(CPU(), topology=topo, ranks=(Rx, Ry, Rz), communicator=MPI.COMM_WORLD)
+arch = MultiProcess(CPU(), topology=topo, ranks=(Rx, Ry, Rz), communicator=MPI.COMM_WORLD)
 distributed_grid = RectilinearGrid(arch, topology=topo, size=(Nx, Ny, Nz), extent=(1, 1, 1))
 model = NonhydrostaticModel(grid=distributed_grid)
 
diff --git a/benchmark/distributed_shallow_water_model_mpi.jl b/benchmark/distributed_shallow_water_model_mpi.jl
index 1b639cbc0b..725abe45d9 100644
--- a/benchmark/distributed_shallow_water_model_mpi.jl
+++ b/benchmark/distributed_shallow_water_model_mpi.jl
@@ -30,7 +30,7 @@ Ry = parse(Int, ARGS[4])
 @info "Setting up distributed shallow water model with N=($Nx, $Ny) grid points and ranks=($Rx, $Ry) on rank $local_rank..."
 
 topo = (Periodic, Periodic, Flat)
-arch = DistributedArch(CPU(), topology=topo, ranks=(Rx, Ry, 1), communicator=MPI.COMM_WORLD)
+arch = MultiProcess(CPU(), topology=topo, ranks=(Rx, Ry, 1), communicator=MPI.COMM_WORLD)
 distributed_grid = RectilinearGrid(arch, topology=topo, size=(Nx, Ny), extent=(1, 1))
 model = ShallowWaterModel(grid=distributed_grid, gravitational_acceleration=1.0)
 set!(model, h=1)
diff --git a/src/Distributed/Distributed.jl b/src/Distributed/Distributed.jl
index da5a780a60..af2103ae18 100644
--- a/src/Distributed/Distributed.jl
+++ b/src/Distributed/Distributed.jl
@@ -1,7 +1,7 @@
 module Distributed
 
 export
-    DistributedArch, child_architecture, reconstruct_global_grid, 
+    MultiProcess, child_architecture, reconstruct_global_grid, 
     inject_halo_communication_boundary_conditions,
     DistributedFFTBasedPoissonSolver
 
diff --git a/src/Distributed/distributed_fft_based_poisson_solver.jl b/src/Distributed/distributed_fft_based_poisson_solver.jl
index 51f9b8ad22..962d01c831 100644
--- a/src/Distributed/distributed_fft_based_poisson_solver.jl
+++ b/src/Distributed/distributed_fft_based_poisson_solver.jl
@@ -33,7 +33,7 @@ Return a FFT-based solver for the Poisson equation,
 ∇²φ = b
 ```
 
-for `DistributedArch`itectures.
+for `MultiProcess` architectures.
 
 Supported configurations
 ========================
@@ -80,7 +80,7 @@ Restrictions
 ============
 
 The algorithm for two-dimensional decompositions requires that `Nz = size(global_grid, 3)` is larger
-than either `Rx = ranks[1]` or `Ry = ranks[2]`, where `ranks` are configured when building `DistributedArch`.
+than either `Rx = ranks[1]` or `Ry = ranks[2]`, where `ranks` are configured when building `MultiProcess`.
 If `Nz` does not satisfy this condition, we can only support a one-dimensional decomposition.
 
 Algorithm for one-dimensional decompositions
diff --git a/src/Distributed/distributed_grids.jl b/src/Distributed/distributed_grids.jl
index ffa007dd34..e31c7f4831 100644
--- a/src/Distributed/distributed_grids.jl
+++ b/src/Distributed/distributed_grids.jl
@@ -13,20 +13,20 @@ using Oceananigans.ImmersedBoundaries
 
 import Oceananigans.Grids: RectilinearGrid, LatitudeLongitudeGrid, with_halo
 
-const DistributedGrid{FT, TX, TY, TZ} = AbstractGrid{FT, TX, TY, TZ, <:DistributedArch}
+const DistributedGrid{FT, TX, TY, TZ} = AbstractGrid{FT, TX, TY, TZ, <:MultiProcess}
 const DistributedRectilinearGrid{FT, TX, TY, TZ, FX, FY, FZ, VX, VY, VZ} =
-    RectilinearGrid{FT, TX, TY, TZ, FX, FY, FZ, VX, VY, VZ, <:DistributedArch} where {FT, TX, TY, TZ, FX, FY, FZ, VX, VY, VZ}
+    RectilinearGrid{FT, TX, TY, TZ, FX, FY, FZ, VX, VY, VZ, <:MultiProcess} where {FT, TX, TY, TZ, FX, FY, FZ, VX, VY, VZ}
 const DistributedLatitudeLongitudeGrid{FT, TX, TY, TZ, M, MY, FX, FY, FZ, VX, VY, VZ} = 
-    LatitudeLongitudeGrid{FT, TX, TY, TZ, M, MY, FX, FY, FZ, VX, VY, VZ, <:DistributedArch} where {FT, TX, TY, TZ, M, MY, FX, FY, FZ, VX, VY, VZ}
+    LatitudeLongitudeGrid{FT, TX, TY, TZ, M, MY, FX, FY, FZ, VX, VY, VZ, <:MultiProcess} where {FT, TX, TY, TZ, M, MY, FX, FY, FZ, VX, VY, VZ}
 
-const DistributedImmersedBoundaryGrid = ImmersedBoundaryGrid{FT, TX, TY, TZ, <:DistributedGrid, I, M, <:DistributedArch} where {FT, TX, TY, TZ, I, M}
+const DistributedImmersedBoundaryGrid = ImmersedBoundaryGrid{FT, TX, TY, TZ, <:DistributedGrid, I, M, <:MultiProcess} where {FT, TX, TY, TZ, I, M}
 
 """
-    RectilinearGrid(arch::DistributedArch, FT=Float64; kw...)
+    RectilinearGrid(arch::MultiProcess, FT=Float64; kw...)
 
 Return the rank-local portion of `RectilinearGrid` on `arch`itecture.
 """
-function RectilinearGrid(arch::DistributedArch, 
+function RectilinearGrid(arch::MultiProcess, 
                          FT::DataType = Float64;
                          size,
                          x = nothing,
@@ -69,11 +69,11 @@ function RectilinearGrid(arch::DistributedArch,
 end
 
 """
-    LatitudeLongitudeGrid(arch::DistributedArch, FT=Float64; kw...)
+    LatitudeLongitudeGrid(arch::MultiProcess, FT=Float64; kw...)
 
 Return the rank-local portion of `LatitudeLongitudeGrid` on `arch`itecture.
 """
-function LatitudeLongitudeGrid(arch::DistributedArch,
+function LatitudeLongitudeGrid(arch::MultiProcess,
                                FT::DataType = Float64; 
                                precompute_metrics = true,
                                size,
@@ -321,17 +321,17 @@ function scatter_grid_properties(global_grid)
     return x, y, z, topo, halo
 end
 
-function scatter_local_grids(arch::DistributedArch, global_grid::RectilinearGrid, local_size)
+function scatter_local_grids(arch::MultiProcess, global_grid::RectilinearGrid, local_size)
     x, y, z, topo, halo = scatter_grid_properties(global_grid)
     return RectilinearGrid(arch, eltype(global_grid); size=local_size, x=x, y=y, z=z, halo=halo, topology=topo)
 end
 
-function scatter_local_grids(arch::DistributedArch, global_grid::LatitudeLongitudeGrid, local_size)
+function scatter_local_grids(arch::MultiProcess, global_grid::LatitudeLongitudeGrid, local_size)
     x, y, z, topo, halo = scatter_grid_properties(global_grid)
     return LatitudeLongitudeGrid(arch, eltype(global_grid); size=local_size, longitude=x, latitude=y, z=z, halo=halo, topology=topo)
 end
 
-function scatter_local_grids(arch::DistributedArch, global_grid::ImmersedBoundaryGrid, local_size)
+function scatter_local_grids(arch::MultiProcess, global_grid::ImmersedBoundaryGrid, local_size)
     ib = global_grid.immersed_boundary
     ug = global_grid.underlying_grid
 
diff --git a/src/Distributed/distributed_kernel_launching.jl b/src/Distributed/distributed_kernel_launching.jl
index d21478b36d..27b8107de2 100644
--- a/src/Distributed/distributed_kernel_launching.jl
+++ b/src/Distributed/distributed_kernel_launching.jl
@@ -1,6 +1,6 @@
 import Oceananigans.Utils: launch!
 
-function launch!(arch::DistributedArch, args...; kwargs...)
+function launch!(arch::MultiProcess, args...; kwargs...)
     child_arch = child_architecture(arch)
     return launch!(child_arch, args...; kwargs...)
 end
diff --git a/src/Distributed/halo_communication.jl b/src/Distributed/halo_communication.jl
index eef623e716..6315c40a06 100644
--- a/src/Distributed/halo_communication.jl
+++ b/src/Distributed/halo_communication.jl
@@ -123,7 +123,7 @@ end
 
     # Overlapping communication and computation, store requests in a `MPI.Request`
     # pool to be waited upon after tendency calculation
-    if async && !(arch isa BlockingDistributedArch)
+    if async && !(arch isa BlockingMultiProcess)
         push!(arch.mpi_requests, requests...)
         return nothing
     end
@@ -238,7 +238,7 @@ for (side, opposite_side) in zip([:west, :south], [:east, :north])
     fill_opposite_side_send_buffers! = Symbol("fill_$(opposite_side)_send_buffers!")
 
     @eval begin
-        function $fill_both_halo!(c, bc_side::DCBCT, bc_opposite_side::DCBCT, size, offset, loc, arch::DistributedArch, 
+        function $fill_both_halo!(c, bc_side::DCBCT, bc_opposite_side::DCBCT, size, offset, loc, arch::MultiProcess, 
                                   grid::DistributedGrid, buffers, args...; only_local_halos = false, kwargs...)
 
             only_local_halos && return nothing
@@ -255,7 +255,7 @@ for (side, opposite_side) in zip([:west, :south], [:east, :north])
             return [send_req1, send_req2, recv_req1, recv_req2]
         end
 
-        function $fill_both_halo!(c, bc_side::DCBCT, bc_opposite_side, size, offset, loc, arch::DistributedArch, 
+        function $fill_both_halo!(c, bc_side::DCBCT, bc_opposite_side, size, offset, loc, arch::MultiProcess, 
                                   grid::DistributedGrid, buffers, args...; only_local_halos = false, kwargs...)
 
             $fill_opposite_side_halo!(c, bc_opposite_side, size, offset, loc, arch, grid, buffers, args...; kwargs...)
@@ -271,7 +271,7 @@ for (side, opposite_side) in zip([:west, :south], [:east, :north])
             return [send_req, recv_req]
         end
 
-        function $fill_both_halo!(c, bc_side, bc_opposite_side::DCBCT, size, offset, loc, arch::DistributedArch, 
+        function $fill_both_halo!(c, bc_side, bc_opposite_side::DCBCT, size, offset, loc, arch::MultiProcess, 
                                   grid::DistributedGrid, buffers, args...; only_local_halos = false, kwargs...)
 
             $fill_side_halo!(c, bc_side, size, offset, loc, arch, grid, buffers, args...; kwargs...)
diff --git a/src/Distributed/interleave_communication_and_computation.jl b/src/Distributed/interleave_communication_and_computation.jl
index 540cca2eef..8194386469 100644
--- a/src/Distributed/interleave_communication_and_computation.jl
+++ b/src/Distributed/interleave_communication_and_computation.jl
@@ -3,8 +3,8 @@ using Oceananigans.Grids: halo_size
 
 function complete_communication_and_compute_boundary!(model, ::DistributedGrid, arch)
 
-    # We iterate over the fields because we have to clear _ALL_ architectures
-    # and split explicit variables live on a different grid
+    # Iterate over the fields to clear _ALL_ architectures
+    # (split-explicit variables live on a different grid)
     for field in prognostic_fields(model)
         complete_halo_communication!(field)
     end
@@ -16,7 +16,7 @@ function complete_communication_and_compute_boundary!(model, ::DistributedGrid,
 end
 
 # Fallback
-complete_communication_and_compute_boundary!(model, ::DistributedGrid, ::BlockingDistributedArch) = nothing
+complete_communication_and_compute_boundary!(model, ::DistributedGrid, ::BlockingMultiProcess) = nothing
 complete_communication_and_compute_boundary!(model, grid, arch) = nothing
 
 compute_boundary_tendencies!(model) = nothing
@@ -26,7 +26,7 @@ interior_tendency_kernel_parameters(grid) = :xyz
 interior_tendency_kernel_parameters(grid::DistributedGrid) = 
             interior_tendency_kernel_parameters(grid, architecture(grid))
 
-interior_tendency_kernel_parameters(grid, ::BlockingDistributedArch) = :xyz
+interior_tendency_kernel_parameters(grid, ::BlockingMultiProcess) = :xyz
 
 function interior_tendency_kernel_parameters(grid, arch)
     Rx, Ry, _ = arch.ranks
diff --git a/src/Distributed/multi_architectures.jl b/src/Distributed/multi_architectures.jl
index 63c1df9188..093c6bb5c5 100644
--- a/src/Distributed/multi_architectures.jl
+++ b/src/Distributed/multi_architectures.jl
@@ -6,7 +6,7 @@ import Oceananigans.Architectures: device, arch_array, array_type, child_archite
 import Oceananigans.Grids: zeros
 import Oceananigans.Utils: sync_device!
 
-struct DistributedArch{A, R, I, ρ, C, γ, M, T} <: AbstractArchitecture
+struct MultiProcess{A, M, R, I, ρ, C, γ, T} <: AbstractArchitecture
   child_architecture :: A
           local_rank :: R
          local_index :: I
@@ -22,7 +22,7 @@ end
 #####
 
 """
-    DistributedArch(child_architecture = CPU(); 
+    MultiProcess(child_architecture = CPU(); 
                     topology, 
                     ranks, 
                     devices = nothing, 
@@ -57,7 +57,7 @@ Keyword arguments
 - `communicator`: the MPI communicator, `MPI.COMM_WORLD`. This keyword argument should not be tampered with 
                   if not for testing or developing. Change at your own risk!
 """
-function DistributedArch(child_architecture = CPU(); 
+function MultiProcess(child_architecture = CPU(); 
                          topology, 
                          ranks,
                          devices = nothing, 
@@ -101,25 +101,28 @@ function DistributedArch(child_architecture = CPU();
     M = typeof(mpi_requests)
     T = typeof(Ref(0))
 
-    return DistributedArch{A, R, I, ρ, C, γ, M, T}(child_architecture, local_rank, local_index, ranks, local_connectivity, communicator, mpi_requests, Ref(0))
+    return MultiProcess{A, M, R, I, ρ, C, γ, T}(child_architecture, local_rank, local_index, ranks, local_connectivity, communicator, mpi_requests, Ref(0))
 end
 
-const BlockingDistributedArch = DistributedArch{<:Any, <:Any, <:Any, <:Any, <:Any, <:Any, <:Nothing}
+const MultiCPUProcess = MultiProcess{CPU} # child architecture (first type parameter) is CPU
+const MultiGPUProcess = MultiProcess{GPU} # child architecture (first type parameter) is GPU
+
+const BlockingMultiProcess = MultiProcess{<:Any, <:Nothing}
 
 #####
 ##### All the architectures
 #####
 
-child_architecture(arch::DistributedArch) = arch.child_architecture
-device(arch::DistributedArch)             = device(child_architecture(arch))
-arch_array(arch::DistributedArch, A)      = arch_array(child_architecture(arch), A)
-zeros(FT, arch::DistributedArch, N...)    = zeros(FT, child_architecture(arch), N...)
-array_type(arch::DistributedArch)         = array_type(child_architecture(arch))
-sync_device!(arch::DistributedArch)       = sync_device!(arch.child_architecture)
+child_architecture(arch::MultiProcess) = arch.child_architecture
+device(arch::MultiProcess)             = device(child_architecture(arch))
+arch_array(arch::MultiProcess, A)      = arch_array(child_architecture(arch), A)
+zeros(FT, arch::MultiProcess, N...)    = zeros(FT, child_architecture(arch), N...)
+array_type(arch::MultiProcess)         = array_type(child_architecture(arch))
+sync_device!(arch::MultiProcess)       = sync_device!(arch.child_architecture)
 
-cpu_architecture(arch::DistributedArch{<:CPU}) = arch
-cpu_architecture(arch::DistributedArch{<:GPU}) = 
-    DistributedArch(CPU(), arch.local_rank, arch.local_index, arch.ranks, 
+cpu_architecture(arch::MultiCPUProcess) = arch
+cpu_architecture(arch::MultiGPUProcess) = 
+    MultiProcess(CPU(), arch.local_rank, arch.local_index, arch.ranks, 
                            arch.connectivity, arch.communicator, arch.mpi_requests, arch.mpi_tag)
 
 #####
@@ -215,7 +218,7 @@ end
 ##### Pretty printing
 #####
 
-function Base.show(io::IO, arch::DistributedArch)
+function Base.show(io::IO, arch::MultiProcess)
     c = arch.connectivity
     print(io, "Distributed architecture (rank $(arch.local_rank)/$(prod(arch.ranks)-1)) [index $(arch.local_index) / $(arch.ranks)]\n",
               "└── child architecture: $(typeof(child_architecture(arch))) \n",
diff --git a/src/Distributed/partition_assemble.jl b/src/Distributed/partition_assemble.jl
index f560ec7cd8..d0ac6913b0 100644
--- a/src/Distributed/partition_assemble.jl
+++ b/src/Distributed/partition_assemble.jl
@@ -1,20 +1,20 @@
 using Oceananigans.Architectures: arch_array
 
-all_reduce(val, arch::DistributedArch; op = +) = 
+all_reduce(val, arch::MultiProcess; op = +) = 
     MPI.Allreduce(val, op, arch.communicator)
 
 all_reduce(val, arch; kwargs...) = val
 
 """
-    concatenate_local_sizes(n, arch::DistributedArch) 
+    concatenate_local_sizes(n, arch::MultiProcess) 
 
 Return a 3-Tuple containing a vector of `size(grid, idx)` for each rank in 
 all 3 directions.
 """
-concatenate_local_sizes(n, arch::DistributedArch) = 
+concatenate_local_sizes(n, arch::MultiProcess) = 
     Tuple(concatenate_local_sizes(n, arch, i) for i in 1:length(n))
 
-function concatenate_local_sizes(n, arch::DistributedArch, idx)
+function concatenate_local_sizes(n, arch::MultiProcess, idx)
     R = arch.ranks[idx]
     r = arch.local_index[idx]
     n = n isa Number ? n : n[idx]
@@ -106,7 +106,7 @@ partition_global_array(arch, c_global::AbstractArray, n) = c_global
 partition_global_array(arch, c_global::Function, n)      = c_global 
 
 # Here we assume that we cannot partition in z (we should remove support for that)
-function partition_global_array(arch::DistributedArch, c_global::AbstractArray, n) 
+function partition_global_array(arch::MultiProcess, c_global::AbstractArray, n) 
     c_global = arch_array(CPU(), c_global)
 
     ri, rj, rk = arch.local_index
@@ -141,7 +141,7 @@ construct_global_array(arch, c_local::AbstractArray, n) = c_local
 construct_global_array(arch, c_local::Function, N)      = c_local
 
 # TODO: This does not work for 3D parallelizations!!!
-function construct_global_array(arch::DistributedArch, c_local::AbstractArray, n) 
+function construct_global_array(arch::MultiProcess, c_local::AbstractArray, n) 
     c_local = arch_array(CPU(), c_local)
 
     ri, rj, rk = arch.local_index
diff --git a/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl b/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl
index 3d0ea81814..97ce1cd04c 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl
@@ -1,6 +1,6 @@
 using Oceananigans.AbstractOperations: GridMetricOperation, Δz
 using Oceananigans.Distributed: DistributedGrid, DistributedField
-using Oceananigans.Distributed: BlockingDistributedArch, complete_halo_communication!
+using Oceananigans.Distributed: BlockingMultiProcess, complete_halo_communication!
 using Oceananigans.Models.HydrostaticFreeSurfaceModels: SplitExplicitState, SplitExplicitFreeSurface
 
 import Oceananigans.Models.HydrostaticFreeSurfaceModels: FreeSurface, SplitExplicitAuxiliaryFields
@@ -93,7 +93,7 @@ end
 
 const DistributedSplitExplicit = SplitExplicitFreeSurface{<:DistributedField}
 
-wait_free_surface_communication!(::DistributedSplitExplicit, ::BlockingDistributedArch) = nothing
+wait_free_surface_communication!(::DistributedSplitExplicit, ::BlockingMultiProcess) = nothing
     
 function wait_free_surface_communication!(free_surface::DistributedSplitExplicit, arch)
     
diff --git a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl
index 7d5bfb2da7..6503b9e64d 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl
@@ -206,8 +206,8 @@ function validate_vertical_velocity_boundary_conditions(w)
     return nothing
 end
 
-validate_free_surface(::DistributedArch, free_surface::SplitExplicitFreeSurface) = free_surface
-validate_free_surface(arch::DistributedArch, free_surface) = error("$(typeof(free_surface)) is not supported with $(typeof(arch))")
+validate_free_surface(::MultiProcess, free_surface::SplitExplicitFreeSurface) = free_surface
+validate_free_surface(arch::MultiProcess, free_surface) = error("$(typeof(free_surface)) is not supported with $(typeof(arch))")
 validate_free_surface(arch, free_surface) = free_surface
 
 validate_momentum_advection(momentum_advection, ibg::ImmersedBoundaryGrid) = validate_momentum_advection(momentum_advection, ibg.underlying_grid)
diff --git a/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl b/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl
index 4488b2c0f0..de60a9260d 100644
--- a/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl
+++ b/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl
@@ -11,7 +11,7 @@ using Oceananigans.Utils
 using Oceananigans.Grids
 using Oceananigans.Grids: XYRegRectilinearGrid, XZRegRectilinearGrid, YZRegRectilinearGrid
 using Oceananigans.Solvers
-using Oceananigans.Distributed: DistributedArch, DistributedFFTBasedPoissonSolver, reconstruct_global_grid   
+using Oceananigans.Distributed: MultiProcess, DistributedFFTBasedPoissonSolver, reconstruct_global_grid   
 using Oceananigans.ImmersedBoundaries: ImmersedBoundaryGrid
 using Oceananigans.Utils: SumOfArrays
 
@@ -19,7 +19,7 @@ import Oceananigans: fields, prognostic_fields
 import Oceananigans.Advection: cell_advection_timescale
 import Oceananigans.TimeSteppers: step_lagrangian_particles!
 
-function PressureSolver(arch::DistributedArch, local_grid::RegRectilinearGrid)
+function PressureSolver(arch::MultiProcess, local_grid::RegRectilinearGrid)
     global_grid = reconstruct_global_grid(local_grid)
     return DistributedFFTBasedPoissonSolver(global_grid, local_grid)
 end
diff --git a/src/Models/NonhydrostaticModels/nonhydrostatic_model.jl b/src/Models/NonhydrostaticModels/nonhydrostatic_model.jl
index 5caea08df2..ec9d8345ab 100644
--- a/src/Models/NonhydrostaticModels/nonhydrostatic_model.jl
+++ b/src/Models/NonhydrostaticModels/nonhydrostatic_model.jl
@@ -2,7 +2,7 @@ using CUDA: has_cuda
 using OrderedCollections: OrderedDict
 
 using Oceananigans.Architectures: AbstractArchitecture
-using Oceananigans.Distributed: DistributedArch
+using Oceananigans.Distributed: MultiProcess
 using Oceananigans.Advection: CenteredSecondOrder
 using Oceananigans.BuoyancyModels: validate_buoyancy, regularize_buoyancy, SeawaterBuoyancy
 using Oceananigans.Biogeochemistry: validate_biogeochemistry, AbstractBiogeochemistry, biogeochemical_auxiliary_fields
diff --git a/src/OutputWriters/output_writer_utils.jl b/src/OutputWriters/output_writer_utils.jl
index 074066a3b1..1473ff7c6b 100644
--- a/src/OutputWriters/output_writer_utils.jl
+++ b/src/OutputWriters/output_writer_utils.jl
@@ -44,7 +44,7 @@ saveproperty!(file, address, grid::AbstractGrid)      = _saveproperty!(file, add
 
 function saveproperty!(file, address, grid::DistributedGrid) 
     arch = architecture(grid)
-    cpu_arch = DistributedArch(CPU(); topology = topology(grid),
+    cpu_arch = MultiProcess(CPU(); topology = topology(grid),
                                       ranks = arch.ranks)
     _saveproperty!(file, address, on_architecture(cpu_arch, grid))
 end
@@ -86,7 +86,7 @@ serializeproperty!(file, address, grid::AbstractGrid) = file[address] = on_archi
 
 function serializeproperty!(file, address, grid::DistributedGrid) 
     arch = architecture(grid)
-    cpu_arch = DistributedArch(CPU(); topology = topology(grid),
+    cpu_arch = MultiProcess(CPU(); topology = topology(grid),
                                       ranks = arch.ranks)
     file[address] = on_architecture(cpu_arch, grid)
 end
diff --git a/test/test_distributed_models.jl b/test/test_distributed_models.jl
index 68e9ddfcf4..8c3e073c78 100644
--- a/test/test_distributed_models.jl
+++ b/test/test_distributed_models.jl
@@ -26,7 +26,7 @@ MPI.Init()
 # to initialize MPI.
 
 using Oceananigans.BoundaryConditions: fill_halo_regions!, DCBC
-using Oceananigans.Distributed: DistributedArch, index2rank
+using Oceananigans.Distributed: MultiProcess, index2rank
 using Oceananigans.Fields: AbstractField
 using Oceananigans.Grids:
     halo_size,
@@ -113,7 +113,7 @@ mpi_ranks = MPI.Comm_size(comm)
 
 function test_triply_periodic_rank_connectivity_with_411_ranks()
     topo = (Periodic, Periodic, Periodic)
-    arch = DistributedArch(CPU(), ranks=(4, 1, 1), topology = topo)
+    arch = MultiProcess(CPU(), ranks=(4, 1, 1), topology = topo)
 
     local_rank = MPI.Comm_rank(MPI.COMM_WORLD)
     @test local_rank == index2rank(arch.local_index..., arch.ranks...)
@@ -147,7 +147,7 @@ end
 
 function test_triply_periodic_rank_connectivity_with_141_ranks()
     topo = (Periodic, Periodic, Periodic)
-    arch = DistributedArch(CPU(), ranks=(1, 4, 1), topology = topo)
+    arch = MultiProcess(CPU(), ranks=(1, 4, 1), topology = topo)
 
     local_rank = MPI.Comm_rank(MPI.COMM_WORLD)
     @test local_rank == index2rank(arch.local_index..., arch.ranks...)
@@ -187,7 +187,7 @@ end
 
 function test_triply_periodic_rank_connectivity_with_221_ranks()
     topo = (Periodic, Periodic, Periodic)
-    arch = DistributedArch(CPU(), ranks=(2, 2, 1), topology = topo)
+    arch = MultiProcess(CPU(), ranks=(2, 2, 1), topology = topo)
 
     local_rank = MPI.Comm_rank(MPI.COMM_WORLD)
     @test local_rank == index2rank(arch.local_index..., arch.ranks...)
@@ -231,7 +231,7 @@ end
 
 function test_triply_periodic_local_grid_with_411_ranks()
     topo = (Periodic, Periodic, Periodic)
-    arch = DistributedArch(CPU(), ranks=(4, 1, 1), topology = topo)
+    arch = MultiProcess(CPU(), ranks=(4, 1, 1), topology = topo)
     local_grid = RectilinearGrid(arch, topology=topo, size=(2, 8, 8), extent=(1, 2, 3))
 
     local_rank = MPI.Comm_rank(MPI.COMM_WORLD)
@@ -249,7 +249,7 @@ end
 
 function test_triply_periodic_local_grid_with_141_ranks()
     topo = (Periodic, Periodic, Periodic)
-    arch = DistributedArch(CPU(), ranks=(1, 4, 1), topology = topo)
+    arch = MultiProcess(CPU(), ranks=(1, 4, 1), topology = topo)
     local_grid = RectilinearGrid(arch, topology=topo, size=(8, 2, 8), extent=(1, 2, 3))
 
     local_rank = MPI.Comm_rank(MPI.COMM_WORLD)
@@ -267,7 +267,7 @@ end
 
 function test_triply_periodic_local_grid_with_221_ranks()
     topo = (Periodic, Periodic, Periodic)
-    arch = DistributedArch(CPU(), ranks=(2, 2, 1), topology = topo)
+    arch = MultiProcess(CPU(), ranks=(2, 2, 1), topology = topo)
     local_grid = RectilinearGrid(arch, topology=topo, size=(4, 4, 8), extent=(1, 2, 3))
     
     i, j, k = arch.local_index
@@ -291,7 +291,7 @@ end
 
 function test_triply_periodic_bc_injection_with_411_ranks()
     topo = (Periodic, Periodic, Periodic)
-    arch = DistributedArch(ranks=(4, 1, 1), topology=topo)
+    arch = MultiProcess(ranks=(4, 1, 1), topology=topo)
     grid = RectilinearGrid(arch, topology=topo, size=(2, 8, 8), extent=(1, 2, 3))
     model = NonhydrostaticModel(grid=grid)
 
@@ -308,7 +308,7 @@ end
 
 function test_triply_periodic_bc_injection_with_141_ranks()
     topo = (Periodic, Periodic, Periodic)
-    arch = DistributedArch(ranks=(1, 4, 1), topology=topo)
+    arch = MultiProcess(ranks=(1, 4, 1), topology=topo)
     grid = RectilinearGrid(arch, topology=topo, size=(8, 2, 8), extent=(1, 2, 3))
     model = NonhydrostaticModel(grid=grid)
 
@@ -325,7 +325,7 @@ end
 
 function test_triply_periodic_bc_injection_with_221_ranks()
     topo = (Periodic, Periodic, Periodic)
-    arch = DistributedArch(ranks=(2, 2, 1), topology=topo)
+    arch = MultiProcess(ranks=(2, 2, 1), topology=topo)
     grid = RectilinearGrid(arch, topology=topo, size=(4, 4, 8), extent=(1, 2, 3))
     model = NonhydrostaticModel(grid=grid)
 
@@ -346,7 +346,7 @@ end
 
 function test_triply_periodic_halo_communication_with_411_ranks(halo, child_arch)
     topo = (Periodic, Periodic, Periodic)
-    arch = DistributedArch(child_arch; ranks=(4, 1, 1), topology=topo, devices = (0, 0, 0, 0))
+    arch = MultiProcess(child_arch; ranks=(4, 1, 1), topology=topo, devices = (0, 0, 0, 0))
     grid = RectilinearGrid(arch, topology=topo, size=(4, 4, 4), extent=(1, 2, 3), halo=halo)
     model = NonhydrostaticModel(grid=grid)
 
@@ -370,7 +370,7 @@ end
 
 function test_triply_periodic_halo_communication_with_141_ranks(halo, child_arch)
     topo  = (Periodic, Periodic, Periodic)
-    arch = DistributedArch(child_arch; ranks=(1, 4, 1), topology=topo, devices = (0, 0, 0, 0))
+    arch = MultiProcess(child_arch; ranks=(1, 4, 1), topology=topo, devices = (0, 0, 0, 0))
     grid  = RectilinearGrid(arch, topology=topo, size=(4, 4, 4), extent=(1, 2, 3), halo=halo)
     model = NonhydrostaticModel(grid=grid)
 
@@ -392,7 +392,7 @@ end
 
 function test_triply_periodic_halo_communication_with_221_ranks(halo, child_arch)
     topo = (Periodic, Periodic, Periodic)
-    arch = DistributedArch(child_arch; ranks=(2, 2, 1), topology=topo, devices = (0, 0, 0, 0))
+    arch = MultiProcess(child_arch; ranks=(2, 2, 1), topology=topo, devices = (0, 0, 0, 0))
     grid = RectilinearGrid(arch, topology=topo, size=(4, 4, 3), extent=(1, 2, 3), halo=halo)
     model = NonhydrostaticModel(grid=grid)
 
@@ -464,7 +464,7 @@ end
             for ranks in [(1, 4, 1), (2, 2, 1), (4, 1, 1)]
                 @info "Time-stepping a distributed NonhydrostaticModel with ranks $ranks..."
                 topo = (Periodic, Periodic, Periodic)
-                arch = DistributedArch(; ranks, topology=topo)
+                arch = MultiProcess(; ranks, topology=topo)
                 grid = RectilinearGrid(arch, topology=topo, size=(8, 2, 8), extent=(1, 2, 3))
                 model = NonhydrostaticModel(; grid)
 
@@ -483,7 +483,7 @@ end
     @testset "Time stepping ShallowWaterModel" begin
         for child_arch in archs
             topo = (Periodic, Periodic, Flat)
-            arch = DistributedArch(child_arch; ranks=(1, 4, 1), topology = topo, devices = (0, 0, 0, 0))
+            arch = MultiProcess(child_arch; ranks=(1, 4, 1), topology = topo, devices = (0, 0, 0, 0))
             grid = RectilinearGrid(arch, topology=topo, size=(8, 2), extent=(1, 2), halo=(3, 3))
             model = ShallowWaterModel(; momentum_advection=nothing, mass_advection=nothing, tracer_advection=nothing, grid, gravitational_acceleration=1)
 
diff --git a/test/test_distributed_poisson_solvers.jl b/test/test_distributed_poisson_solvers.jl
index 24c6ccdc1e..01ff15192b 100644
--- a/test/test_distributed_poisson_solvers.jl
+++ b/test/test_distributed_poisson_solvers.jl
@@ -65,7 +65,7 @@ end
 
 function divergence_free_poisson_solution_triply_periodic(grid_points, ranks)
     topo = (Periodic, Periodic, Periodic)
-    arch = DistributedArch(CPU(), ranks=ranks, topology=topo)
+    arch = MultiProcess(CPU(), ranks=ranks, topology=topo)
     local_grid = RectilinearGrid(arch, topology=topo, size=grid_points, extent=(1, 2, 3))
 
     bcs = FieldBoundaryConditions(local_grid, (Center, Center, Center))
diff --git a/validation/distributed_simulations/mpi_geostrophic_adjustment.jl b/validation/distributed_simulations/mpi_geostrophic_adjustment.jl
index 08f66ecdb3..c7bd31e0e8 100644
--- a/validation/distributed_simulations/mpi_geostrophic_adjustment.jl
+++ b/validation/distributed_simulations/mpi_geostrophic_adjustment.jl
@@ -23,7 +23,7 @@ rank   = MPI.Comm_rank(comm)
 Nranks = MPI.Comm_size(comm)
 
 topo = (Bounded, Periodic, Bounded)
-arch = DistributedArch(CPU(); topology = topo, 
+arch = MultiProcess(CPU(); topology = topo, 
                  ranks=(Nranks, 1, 1),
                  use_buffers = true)
 
diff --git a/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl b/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl
index fcd9e128da..1194b3e1a7 100644
--- a/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl
+++ b/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl
@@ -75,7 +75,7 @@ Ry = 1
 @assert Nranks == 4
 
 # Enable overlapped communication!
-arch  = DistributedArch(CPU(), ranks = (Rx, Ry, 1), 
+arch  = MultiProcess(CPU(), ranks = (Rx, Ry, 1), 
                         topology=topo, 
                         enable_overlapped_computation = true)
 
diff --git a/validation/distributed_simulations/mpi_nonhydrostatic_two_dimensional_turbulence.jl b/validation/distributed_simulations/mpi_nonhydrostatic_two_dimensional_turbulence.jl
index 3b0a93ef92..53fc3d33c4 100644
--- a/validation/distributed_simulations/mpi_nonhydrostatic_two_dimensional_turbulence.jl
+++ b/validation/distributed_simulations/mpi_nonhydrostatic_two_dimensional_turbulence.jl
@@ -28,7 +28,7 @@ Nranks = MPI.Comm_size(comm)
 Nx = Ny = 256
 Lx = Ly = 2π
 topology = (Periodic, Periodic, Flat)
-arch = DistributedArch(CPU(); topology, ranks=(1, Nranks, 1))
+arch = MultiProcess(CPU(); topology, ranks=(1, Nranks, 1))
 grid = RectilinearGrid(arch; topology, size=(Nx ÷ Nranks, Ny), halo=(3, 3), x=(0, 2π), y=(0, 2π))
 
 @info "Built $Nranks grids:"
diff --git a/validation/distributed_simulations/mpi_output_writing.jl b/validation/distributed_simulations/mpi_output_writing.jl
index a5092a4c0f..ff990a7c5b 100644
--- a/validation/distributed_simulations/mpi_output_writing.jl
+++ b/validation/distributed_simulations/mpi_output_writing.jl
@@ -9,7 +9,7 @@ rank = MPI.Comm_rank(comm)
 Nranks = MPI.Comm_size(comm)
 
 topology = (Periodic, Periodic, Flat)
-arch = DistributedArch(CPU(); topology, ranks=(Nranks, 1, 1))
+arch = MultiProcess(CPU(); topology, ranks=(Nranks, 1, 1))
 grid = RectilinearGrid(arch; topology, size=(16 ÷ Nranks, 16), halo=(3, 3), extent=(2π, 2π))
 
 model = NonhydrostaticModel(; grid)
diff --git a/validation/distributed_simulations/mpi_set.jl b/validation/distributed_simulations/mpi_set.jl
index 2c4a374c0b..2c34e219c5 100644
--- a/validation/distributed_simulations/mpi_set.jl
+++ b/validation/distributed_simulations/mpi_set.jl
@@ -10,7 +10,7 @@ Nranks = MPI.Comm_size(MPI.COMM_WORLD)
 
 # Setup model
 topology = (Periodic, Periodic, Flat)
-arch = DistributedArch(CPU(); topology, ranks=(1, Nranks, 1))
+arch = MultiProcess(CPU(); topology, ranks=(1, Nranks, 1))
 grid = RectilinearGrid(arch; topology, size=(16 ÷ Nranks, 16), extent=(2π, 2π))
 c = CenterField(grid)
 
diff --git a/validation/distributed_simulations/mpi_shallow_water_turbulence.jl b/validation/distributed_simulations/mpi_shallow_water_turbulence.jl
index 7ada7cd29c..43ba48364e 100644
--- a/validation/distributed_simulations/mpi_shallow_water_turbulence.jl
+++ b/validation/distributed_simulations/mpi_shallow_water_turbulence.jl
@@ -13,7 +13,7 @@ using Oceananigans.Distributed
 
 ranks = (2, 2, 1)
 topo = (Periodic, Periodic, Flat)
-arch = DistributedArch(CPU(), ranks=ranks, topology=topo)
+arch = MultiProcess(CPU(), ranks=ranks, topology=topo)
 grid = RectilinearGrid(arch, topology=topo, size=(128 ÷ ranks[1], 128 ÷ ranks[2]), extent=(4π, 4π), halo=(3, 3))
 local_rank = MPI.Comm_rank(MPI.COMM_WORLD)
 

From d5e75a38414f71c96a057dd8953a488825b8f7be Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 19 Sep 2023 12:47:40 +0200
Subject: [PATCH 521/530] docstring

---
 src/Distributed/multi_architectures.jl | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/Distributed/multi_architectures.jl b/src/Distributed/multi_architectures.jl
index 093c6bb5c5..c92259237d 100644
--- a/src/Distributed/multi_architectures.jl
+++ b/src/Distributed/multi_architectures.jl
@@ -154,6 +154,11 @@ struct RankConnectivity{E, W, N, S, SW, SE, NW, NE}
     northeast :: NE
 end
 
+"""
+    RankConnectivity(; east, west, north, south, southwest, southeast, northwest, northeast)
+
+Generate a `RankConnectivity` object that holds the MPI ranks of the neighboring processors.
+"""
 RankConnectivity(; east, west, north, south, southwest, southeast, northwest, northeast) =
     RankConnectivity(east, west, north, south, southwest, southeast, northwest, northeast)
 

From 256de76e2d56ac8c8daaa3a8c609354c26b472e8 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 19 Sep 2023 12:49:04 +0200
Subject: [PATCH 522/530] name change on rank

---
 src/Distributed/multi_architectures.jl | 32 +++++++++++++-------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/Distributed/multi_architectures.jl b/src/Distributed/multi_architectures.jl
index c92259237d..f68432ffb9 100644
--- a/src/Distributed/multi_architectures.jl
+++ b/src/Distributed/multi_architectures.jl
@@ -201,22 +201,22 @@ function RankConnectivity(local_index, ranks, topology)
     j_north = increment_index(j, Ry, TY)
     j_south = decrement_index(j, Ry, TY)
 
-    r_east  = isnothing(i_east)  ? nothing : index2rank(i_east,  j, k, Rx, Ry, Rz)
-    r_west  = isnothing(i_west)  ? nothing : index2rank(i_west,  j, k, Rx, Ry, Rz)
-    r_north = isnothing(j_north) ? nothing : index2rank(i, j_north, k, Rx, Ry, Rz)
-    r_south = isnothing(j_south) ? nothing : index2rank(i, j_south, k, Rx, Ry, Rz)
-
-    r_northeast = isnothing(i_east) || isnothing(j_north) ? nothing : index2rank(i_east, j_north, k, Rx, Ry, Rz)
-    r_northwest = isnothing(i_west) || isnothing(j_north) ? nothing : index2rank(i_west, j_north, k, Rx, Ry, Rz)
-    r_southeast = isnothing(i_east) || isnothing(j_south) ? nothing : index2rank(i_east, j_south, k, Rx, Ry, Rz)
-    r_southwest = isnothing(i_west) || isnothing(j_south) ? nothing : index2rank(i_west, j_south, k, Rx, Ry, Rz)
-
-    return RankConnectivity(west=r_west, east=r_east, 
-                            south=r_south, north=r_north,
-                            southwest=r_southwest,
-                            southeast=r_southeast,
-                            northwest=r_northwest,
-                            northeast=r_northeast)
+     east_rank = isnothing(i_east)  ? nothing : index2rank(i_east,  j, k, Rx, Ry, Rz)
+     west_rank = isnothing(i_west)  ? nothing : index2rank(i_west,  j, k, Rx, Ry, Rz)
+    north_rank = isnothing(j_north) ? nothing : index2rank(i, j_north, k, Rx, Ry, Rz)
+    south_rank = isnothing(j_south) ? nothing : index2rank(i, j_south, k, Rx, Ry, Rz)
+
+    northeast_rank = isnothing(i_east) || isnothing(j_north) ? nothing : index2rank(i_east, j_north, k, Rx, Ry, Rz)
+    northwest_rank = isnothing(i_west) || isnothing(j_north) ? nothing : index2rank(i_west, j_north, k, Rx, Ry, Rz)
+    southeast_rank = isnothing(i_east) || isnothing(j_south) ? nothing : index2rank(i_east, j_south, k, Rx, Ry, Rz)
+    southwest_rank = isnothing(i_west) || isnothing(j_south) ? nothing : index2rank(i_west, j_south, k, Rx, Ry, Rz)
+
+    return RankConnectivity(west=west_rank, east=east_rank, 
+                            south=south_rank, north=north_rank,
+                            southwest=southwest_rank,
+                            southeast=southeast_rank,
+                            northwest=northwest_rank,
+                            northeast=northeast_rank)
 end
 
 #####

From 0bfeb97409e9a2e8d7c56d5fe6c53255919d068f Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 19 Sep 2023 12:52:12 +0200
Subject: [PATCH 523/530] interior active cells

---
 src/ImmersedBoundaries/ImmersedBoundaries.jl |  8 ++++----
 src/ImmersedBoundaries/active_cells_map.jl   | 16 ++++++++--------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/ImmersedBoundaries/ImmersedBoundaries.jl b/src/ImmersedBoundaries/ImmersedBoundaries.jl
index 2cecb31b51..3d6a502088 100644
--- a/src/ImmersedBoundaries/ImmersedBoundaries.jl
+++ b/src/ImmersedBoundaries/ImmersedBoundaries.jl
@@ -105,7 +105,7 @@ struct ImmersedBoundaryGrid{FT, TX, TY, TZ, G, I, M, Arch} <: AbstractGrid{FT, T
     architecture :: Arch
     underlying_grid :: G
     immersed_boundary :: I
-    active_cells_interior :: M
+    interior_active_cells :: M
     
     # Internal interface
     function ImmersedBoundaryGrid{TX, TY, TZ}(grid::G, ib::I, mi::M) where {TX, TY, TZ, G <: AbstractUnderlyingGrid, I, M}
@@ -130,8 +130,8 @@ const IBG = ImmersedBoundaryGrid
 @inline get_ibg_property(ibg::IBG, ::Val{property}) where property = getfield(getfield(ibg, :underlying_grid), property)
 @inline get_ibg_property(ibg::IBG, ::Val{:immersed_boundary})      = getfield(ibg, :immersed_boundary)
 @inline get_ibg_property(ibg::IBG, ::Val{:underlying_grid})        = getfield(ibg, :underlying_grid)
-@inline get_ibg_property(ibg::IBG, ::Val{:active_cells_interior})  = getfield(ibg, :active_cells_interior)
-@inline get_ibg_property(ibg::IBG, ::Val{:active_cells_surface})   = getfield(ibg, :active_cells_surface)
+@inline get_ibg_property(ibg::IBG, ::Val{:interior_active_cells})  = getfield(ibg, :interior_active_cells)
+@inline get_ibg_property(ibg::IBG, ::Val{:surface_active_cells})   = getfield(ibg, :surface_active_cells)
 
 @inline architecture(ibg::IBG) = architecture(ibg.underlying_grid)
 
@@ -140,7 +140,7 @@ const IBG = ImmersedBoundaryGrid
 @inline z_domain(ibg::IBG) = z_domain(ibg.underlying_grid)
 
 Adapt.adapt_structure(to, ibg::IBG{FT, TX, TY, TZ}) where {FT, TX, TY, TZ} =
-    ImmersedBoundaryGrid{TX, TY, TZ}(adapt(to, ibg.underlying_grid), adapt(to, ibg.immersed_boundary), adapt(to, ibg.active_cells_interior))
+    ImmersedBoundaryGrid{TX, TY, TZ}(adapt(to, ibg.underlying_grid), adapt(to, ibg.immersed_boundary), adapt(to, ibg.interior_active_cells))
 
 with_halo(halo, ibg::ImmersedBoundaryGrid) =
     ImmersedBoundaryGrid(with_halo(halo, ibg.underlying_grid), ibg.immersed_boundary)
diff --git a/src/ImmersedBoundaries/active_cells_map.jl b/src/ImmersedBoundaries/active_cells_map.jl
index 5322ee6c00..f3ce915df7 100644
--- a/src/ImmersedBoundaries/active_cells_map.jl
+++ b/src/ImmersedBoundaries/active_cells_map.jl
@@ -16,11 +16,11 @@ struct SurfaceMap end
 @inline use_only_active_surface_cells(grid::AbstractGrid)   = nothing
 @inline use_only_active_surface_cells(grid::ActiveCellsIBG) = SurfaceMap()
 
-@inline active_cells_work_layout(group, size, ::InteriorMap, grid::ActiveCellsIBG) = min(length(grid.active_cells_interior), 256), length(grid.active_cells_interior)
-@inline active_cells_work_layout(group, size, ::SurfaceMap,  grid::ActiveCellsIBG) = min(length(grid.active_cells_surface),  256), length(grid.active_cells_surface)
+@inline active_cells_work_layout(group, size, ::InteriorMap, grid::ActiveCellsIBG) = min(length(grid.interior_active_cells), 256), length(grid.interior_active_cells)
+@inline active_cells_work_layout(group, size, ::SurfaceMap,  grid::ActiveCellsIBG) = min(length(grid.surface_active_cells),  256), length(grid.surface_active_cells)
 
-@inline active_linear_index_to_interior_tuple(idx, grid::ActiveCellsIBG) = Base.map(Int, grid.active_cells_interior[idx])
-@inline  active_linear_index_to_surface_tuple(idx, grid::ActiveCellsIBG) = Base.map(Int, grid.active_cells_surface[idx])
+@inline active_linear_index_to_interior_tuple(idx, grid::ActiveCellsIBG) = Base.map(Int, grid.interior_active_cells[idx])
+@inline  active_linear_index_to_surface_tuple(idx, grid::ActiveCellsIBG) = Base.map(Int, grid.surface_active_cells[idx])
 
 function ImmersedBoundaryGrid(grid, ib; active_cells_map::Bool = true) 
 
@@ -45,14 +45,14 @@ end
 @inline active_cell(i, j, k, ibg) = !immersed_cell(i, j, k, ibg)
 @inline active_column(i, j, k, grid, column) = column[i, j, k] != 0
 
-function compute_active_cells_interior(ibg)
+function compute_interior_active_cells(ibg)
     is_immersed_operation = KernelFunctionOperation{Center, Center, Center}(active_cell, ibg)
     active_cells_field = Field{Center, Center, Center}(ibg, Bool)
     set!(active_cells_field, is_immersed_operation)
     return active_cells_field
 end
 
-function compute_active_cells_surface(ibg)
+function compute_surface_active_cells(ibg)
     one_field = ConditionalOperation{Center, Center, Center}(OneField(Int), identity, ibg, NotImmersed(truefunc), 0)
     column    = sum(one_field, dims = 3)
     is_immersed_column = KernelFunctionOperation{Center, Center, Nothing}(active_column, ibg, column)
@@ -66,7 +66,7 @@ const MAXUInt16 = 2^16 - 1
 const MAXUInt32 = 2^32 - 1
 
 function active_cells_map_interior(ibg)
-    active_cells_field = compute_active_cells_interior(ibg)
+    active_cells_field = compute_interior_active_cells(ibg)
     
     N = maximum(size(ibg))
     IntType = N > MAXUInt8 ? (N > MAXUInt16 ? (N > MAXUInt32 ? UInt64 : UInt32) : UInt16) : UInt8
@@ -106,7 +106,7 @@ end
 # If we eventually want to perform also barotropic step, `w` computation and `p` 
 # computation only on active `columns`
 function active_cells_map_surface(ibg)
-    active_cells_field = compute_active_cells_surface(ibg)
+    active_cells_field = compute_surface_active_cells(ibg)
     interior_cells     = arch_array(CPU(), interior(active_cells_field, :, :, 1))
   
     full_indices = findall(interior_cells)

From 1b9680438cf038584c550120c5a936228c08f2ae Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 19 Sep 2023 12:58:01 +0200
Subject: [PATCH 524/530] calculate -> compute

---
 .../HydrostaticFreeSurfaceModels.jl           |   4 +-
 ...tatic_free_surface_boundary_tendencies.jl} |   2 +-
 ...te_hydrostatic_free_surface_tendencies.jl} |  56 ++---
 ...ute_nonhydrostatic_boundary_tendencies.jl} |   2 +-
 ...l => compute_nonhydrostatic_tendencies.jl} |  34 +--
 .../ShallowWaterModels/ShallowWaterModels.jl  |   2 +-
 .../calculate_shallow_water_tendencies.jl     | 213 ------------------
 .../compute_shallow_water_tendencies.jl       | 213 ++++++++++++++++++
 8 files changed, 263 insertions(+), 263 deletions(-)
 rename src/Models/HydrostaticFreeSurfaceModels/{calculate_hydrostatic_free_surface_boundary_tendencies.jl => compute_hydrostatic_free_surface_boundary_tendencies.jl} (95%)
 rename src/Models/HydrostaticFreeSurfaceModels/{calculate_hydrostatic_free_surface_tendencies.jl => compute_hydrostatic_free_surface_tendencies.jl} (78%)
 rename src/Models/NonhydrostaticModels/{calculate_nonhydrostatic_boundary_tendencies.jl => compute_nonhydrostatic_boundary_tendencies.jl} (97%)
 rename src/Models/NonhydrostaticModels/{calculate_nonhydrostatic_tendencies.jl => compute_nonhydrostatic_tendencies.jl} (84%)
 delete mode 100644 src/Models/ShallowWaterModels/calculate_shallow_water_tendencies.jl
 create mode 100644 src/Models/ShallowWaterModels/compute_shallow_water_tendencies.jl

diff --git a/src/Models/HydrostaticFreeSurfaceModels/HydrostaticFreeSurfaceModels.jl b/src/Models/HydrostaticFreeSurfaceModels/HydrostaticFreeSurfaceModels.jl
index ce804cc291..c922afa29f 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/HydrostaticFreeSurfaceModels.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/HydrostaticFreeSurfaceModels.jl
@@ -105,8 +105,8 @@ step_lagrangian_particles!(model::HydrostaticFreeSurfaceModel, Δt) = step_lagra
 
 include("barotropic_pressure_correction.jl")
 include("hydrostatic_free_surface_tendency_kernel_functions.jl")
-include("calculate_hydrostatic_free_surface_tendencies.jl")
-include("calculate_hydrostatic_free_surface_boundary_tendencies.jl")
+include("compute_hydrostatic_free_surface_tendencies.jl")
+include("compute_hydrostatic_free_surface_boundary_tendencies.jl")
 include("update_hydrostatic_free_surface_model_state.jl")
 include("hydrostatic_free_surface_ab2_step.jl")
 include("store_hydrostatic_free_surface_tendencies.jl")
diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_boundary_tendencies.jl
similarity index 95%
rename from src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_boundary_tendencies.jl
rename to src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_boundary_tendencies.jl
index 39f04067aa..ce019b6701 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_boundary_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_boundary_tendencies.jl
@@ -22,7 +22,7 @@ function compute_boundary_tendencies!(model::HydrostaticFreeSurfaceModel)
 
     # parameters for communicating North / South / East / West side
     kernel_parameters = boundary_tendency_kernel_parameters(grid, arch)
-    calculate_hydrostatic_free_surface_tendency_contributions!(model, kernel_parameters)
+    compute_hydrostatic_free_surface_tendency_contributions!(model, kernel_parameters)
 
     return nothing
 end
diff --git a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_tendencies.jl
similarity index 78%
rename from src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
rename to src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_tendencies.jl
index 0b734c011d..7691a586c2 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/calculate_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_tendencies.jl
@@ -28,12 +28,12 @@ function compute_tendencies!(model::HydrostaticFreeSurfaceModel, callbacks)
 
     # Calculate contributions to momentum and tracer tendencies from fluxes and volume terms in the
     # interior of the domain
-    calculate_hydrostatic_free_surface_tendency_contributions!(model, kernel_parameters; only_active_cells = use_only_active_interior_cells(model.grid))
+    compute_hydrostatic_free_surface_tendency_contributions!(model, kernel_parameters; only_active_cells = use_only_active_interior_cells(model.grid))
     complete_communication_and_compute_boundary!(model, model.grid, model.architecture)
 
     # Calculate contributions to momentum and tracer tendencies from user-prescribed fluxes across the
     # boundaries of the domain
-    calculate_hydrostatic_boundary_tendency_contributions!(model.timestepper.Gⁿ,
+    compute_hydrostatic_boundary_tendency_contributions!(model.timestepper.Gⁿ,
                                                            model.architecture,
                                                            model.velocities,
                                                            model.free_surface,
@@ -55,19 +55,19 @@ end
 using Oceananigans.TurbulenceClosures.CATKEVerticalDiffusivities: FlavorOfCATKE
 using Oceananigans.TurbulenceClosures.MEWSVerticalDiffusivities: MEWS
 
-@inline tracer_tendency_kernel_function(model::HFSM, name, c, K)                     = calculate_hydrostatic_free_surface_Gc!, c, K
-@inline tracer_tendency_kernel_function(model::HFSM, ::Val{:K}, c::MEWS,          K) = calculate_hydrostatic_free_surface_Ge!, c, K
-@inline tracer_tendency_kernel_function(model::HFSM, ::Val{:e}, c::FlavorOfCATKE, K) = calculate_hydrostatic_free_surface_Ge!, c, K
+@inline tracer_tendency_kernel_function(model::HFSM, name, c, K)                     = compute_hydrostatic_free_surface_Gc!, c, K
+@inline tracer_tendency_kernel_function(model::HFSM, ::Val{:K}, c::MEWS,          K) = compute_hydrostatic_free_surface_Ge!, c, K
+@inline tracer_tendency_kernel_function(model::HFSM, ::Val{:e}, c::FlavorOfCATKE, K) = compute_hydrostatic_free_surface_Ge!, c, K
 
 function tracer_tendency_kernel_function(model::HFSM, ::Val{:e}, closures::Tuple, diffusivity_fields::Tuple)
     catke_index = findfirst(c -> c isa FlavorOfCATKE, closures)
 
     if isnothing(catke_index)
-        return calculate_hydrostatic_free_surface_Gc!, closures, diffusivity_fields
+        return compute_hydrostatic_free_surface_Gc!, closures, diffusivity_fields
     else
         catke_closure = closures[catke_index]
         catke_diffusivity_fields = diffusivity_fields[catke_index]
-        return calculate_hydrostatic_free_surface_Ge!, catke_closure, catke_diffusivity_fields 
+        return compute_hydrostatic_free_surface_Ge!, catke_closure, catke_diffusivity_fields 
     end
 end
 
@@ -75,24 +75,24 @@ function tracer_tendency_kernel_function(model::HFSM, ::Val{:K}, closures::Tuple
     mews_index = findfirst(c -> c isa MEWS, closures)
 
     if isnothing(mews_index)
-        return calculate_hydrostatic_free_surface_Gc!, closures, diffusivity_fields
+        return compute_hydrostatic_free_surface_Gc!, closures, diffusivity_fields
     else
         mews_closure = closures[mews_index]
         mews_diffusivity_fields = diffusivity_fields[mews_index]
-        return  calculate_hydrostatic_free_surface_Ge!, mews_closure, mews_diffusivity_fields 
+        return compute_hydrostatic_free_surface_Ge!, mews_closure, mews_diffusivity_fields 
     end
 end
 
 top_tracer_boundary_conditions(grid, tracers) =
     NamedTuple(c => tracers[c].boundary_conditions.top for c in propertynames(tracers))
 
-""" Store previous value of the source term and calculate current source term. """
-function calculate_hydrostatic_free_surface_tendency_contributions!(model, kernel_parameters; only_active_cells = nothing)
+""" Store previous value of the source term and compute current source term. """
+function compute_hydrostatic_free_surface_tendency_contributions!(model, kernel_parameters; only_active_cells = nothing)
 
     arch = model.architecture
     grid = model.grid
 
-    calculate_hydrostatic_momentum_tendencies!(model, model.velocities, kernel_parameters; only_active_cells)
+    compute_hydrostatic_momentum_tendencies!(model, model.velocities, kernel_parameters; only_active_cells)
 
     top_tracer_bcs = top_tracer_boundary_conditions(grid, model.tracers)
 
@@ -146,7 +146,7 @@ function apply_flux_bcs!(Gcⁿ, c, arch, args...)
     return nothing
 end
 
-function calculate_free_surface_tendency!(grid, model, kernel_parameters)
+function compute_free_surface_tendency!(grid, model, kernel_parameters)
 
     arch = architecture(grid)
 
@@ -158,14 +158,14 @@ function calculate_free_surface_tendency!(grid, model, kernel_parameters)
                  model.clock)
 
     launch!(arch, grid, kernel_parameters,
-            calculate_hydrostatic_free_surface_Gη!, model.timestepper.Gⁿ.η, 
+            compute_hydrostatic_free_surface_Gη!, model.timestepper.Gⁿ.η, 
             grid, args)
 
     return nothing
 end
 
 """ Calculate momentum tendencies if momentum is not prescribed."""
-function calculate_hydrostatic_momentum_tendencies!(model, velocities, kernel_parameters; only_active_cells = nothing)
+function compute_hydrostatic_momentum_tendencies!(model, velocities, kernel_parameters; only_active_cells = nothing)
 
     grid = model.grid
     arch = architecture(grid)
@@ -192,23 +192,23 @@ function calculate_hydrostatic_momentum_tendencies!(model, velocities, kernel_pa
         
     for parameters in kernel_parameters
         launch!(arch, grid, parameters,
-                calculate_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, grid, 
+                compute_hydrostatic_free_surface_Gu!, model.timestepper.Gⁿ.u, grid, 
                 only_active_cells, u_kernel_args;
                 only_active_cells)
 
         launch!(arch, grid, parameters,
-                calculate_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, grid, 
+                compute_hydrostatic_free_surface_Gv!, model.timestepper.Gⁿ.v, grid, 
                 only_active_cells, v_kernel_args;
                 only_active_cells)
     end
 
-    calculate_free_surface_tendency!(grid, model, :xy)
+    compute_free_surface_tendency!(grid, model, :xy)
 
     return nothing
 end
 
 """ Apply boundary conditions by adding flux divergences to the right-hand-side. """
-function calculate_hydrostatic_boundary_tendency_contributions!(Gⁿ, arch, velocities, free_surface, tracers, args...)
+function compute_hydrostatic_boundary_tendency_contributions!(Gⁿ, arch, velocities, free_surface, tracers, args...)
 
     # Velocity fields
     for i in (:u, :v)
@@ -231,24 +231,24 @@ end
 #####
 
 """ Calculate the right-hand-side of the u-velocity equation. """
-@kernel function calculate_hydrostatic_free_surface_Gu!(Gu, grid, interior_map, args)
+@kernel function compute_hydrostatic_free_surface_Gu!(Gu, grid, interior_map, args)
     i, j, k = @index(Global, NTuple)
     @inbounds Gu[i, j, k] = hydrostatic_free_surface_u_velocity_tendency(i, j, k, grid, args...)
 end
 
-@kernel function calculate_hydrostatic_free_surface_Gu!(Gu, grid::ActiveCellsIBG, ::InteriorMap, args)
+@kernel function compute_hydrostatic_free_surface_Gu!(Gu, grid::ActiveCellsIBG, ::InteriorMap, args)
     idx = @index(Global, Linear)
     i, j, k = active_linear_index_to_interior_tuple(idx, grid)
     @inbounds Gu[i, j, k] = hydrostatic_free_surface_u_velocity_tendency(i, j, k, grid, args...)
 end
 
 """ Calculate the right-hand-side of the v-velocity equation. """
-@kernel function calculate_hydrostatic_free_surface_Gv!(Gv, grid, interior_map, args)
+@kernel function compute_hydrostatic_free_surface_Gv!(Gv, grid, interior_map, args)
     i, j, k = @index(Global, NTuple)
     @inbounds Gv[i, j, k] = hydrostatic_free_surface_v_velocity_tendency(i, j, k, grid, args...)
 end
 
-@kernel function calculate_hydrostatic_free_surface_Gv!(Gv, grid::ActiveCellsIBG, ::InteriorMap, args)
+@kernel function compute_hydrostatic_free_surface_Gv!(Gv, grid::ActiveCellsIBG, ::InteriorMap, args)
     idx = @index(Global, Linear)
     i, j, k = active_linear_index_to_interior_tuple(idx, grid)
     @inbounds Gv[i, j, k] = hydrostatic_free_surface_v_velocity_tendency(i, j, k, grid, args...)
@@ -259,24 +259,24 @@ end
 #####
 
 """ Calculate the right-hand-side of the tracer advection-diffusion equation. """
-@kernel function calculate_hydrostatic_free_surface_Gc!(Gc, grid, interior_map, args)
+@kernel function compute_hydrostatic_free_surface_Gc!(Gc, grid, interior_map, args)
     i, j, k = @index(Global, NTuple)
     @inbounds Gc[i, j, k] = hydrostatic_free_surface_tracer_tendency(i, j, k, grid, args...)
 end
 
-@kernel function calculate_hydrostatic_free_surface_Gc!(Gc, grid::ActiveCellsIBG, ::InteriorMap, args)
+@kernel function compute_hydrostatic_free_surface_Gc!(Gc, grid::ActiveCellsIBG, ::InteriorMap, args)
     idx = @index(Global, Linear)
     i, j, k = active_linear_index_to_interior_tuple(idx, grid)
     @inbounds Gc[i, j, k] = hydrostatic_free_surface_tracer_tendency(i, j, k, grid, args...)
 end
 
 """ Calculate the right-hand-side of the subgrid scale energy equation. """
-@kernel function calculate_hydrostatic_free_surface_Ge!(Ge, grid, interior_map, args)
+@kernel function compute_hydrostatic_free_surface_Ge!(Ge, grid, interior_map, args)
     i, j, k = @index(Global, NTuple)
     @inbounds Ge[i, j, k] = hydrostatic_turbulent_kinetic_energy_tendency(i, j, k, grid, args...)
 end
 
-@kernel function calculate_hydrostatic_free_surface_Ge!(Ge, grid::ActiveCellsIBG, ::InteriorMap, args)
+@kernel function compute_hydrostatic_free_surface_Ge!(Ge, grid::ActiveCellsIBG, ::InteriorMap, args)
     idx = @index(Global, Linear)
     i, j, k = active_linear_index_to_interior_tuple(idx, grid)
     @inbounds Ge[i, j, k] = hydrostatic_turbulent_kinetic_energy_tendency(i, j, k, grid, args...)
@@ -287,7 +287,7 @@ end
 #####
 
 """ Calculate the right-hand-side of the free surface displacement (``η``) equation. """
-@kernel function calculate_hydrostatic_free_surface_Gη!(Gη, grid, args)
+@kernel function compute_hydrostatic_free_surface_Gη!(Gη, grid, args)
     i, j = @index(Global, NTuple)
     @inbounds Gη[i, j, grid.Nz+1] = free_surface_tendency(i, j, grid, args...)
 end
diff --git a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_boundary_tendencies.jl b/src/Models/NonhydrostaticModels/compute_nonhydrostatic_boundary_tendencies.jl
similarity index 97%
rename from src/Models/NonhydrostaticModels/calculate_nonhydrostatic_boundary_tendencies.jl
rename to src/Models/NonhydrostaticModels/compute_nonhydrostatic_boundary_tendencies.jl
index 01671beb3c..3e2c0c7303 100644
--- a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_boundary_tendencies.jl
+++ b/src/Models/NonhydrostaticModels/compute_nonhydrostatic_boundary_tendencies.jl
@@ -16,7 +16,7 @@ function compute_boundary_tendencies!(model::NonhydrostaticModel)
 
     # parameters for communicating North / South / East / West side
     kernel_parameters = boundary_tendency_kernel_parameters(grid, arch)
-    calculate_interior_tendency_contributions!(model, kernel_parameters)
+    compute_interior_tendency_contributions!(model, kernel_parameters)
 
     return nothing
 end
diff --git a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl b/src/Models/NonhydrostaticModels/compute_nonhydrostatic_tendencies.jl
similarity index 84%
rename from src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
rename to src/Models/NonhydrostaticModels/compute_nonhydrostatic_tendencies.jl
index 6f94527e28..ea495a76a3 100644
--- a/src/Models/NonhydrostaticModels/calculate_nonhydrostatic_tendencies.jl
+++ b/src/Models/NonhydrostaticModels/compute_nonhydrostatic_tendencies.jl
@@ -28,12 +28,12 @@ function compute_tendencies!(model::NonhydrostaticModel, callbacks)
     # interior of the domain
     kernel_parameters = tuple(interior_tendency_kernel_parameters(model.grid))
 
-    calculate_interior_tendency_contributions!(model, kernel_parameters; only_active_cells = use_only_active_interior_cells(model.grid))
+    compute_interior_tendency_contributions!(model, kernel_parameters; only_active_cells = use_only_active_interior_cells(model.grid))
     complete_communication_and_compute_boundary!(model, model.grid, model.architecture)
                       
     # Calculate contributions to momentum and tracer tendencies from user-prescribed fluxes across the
     # boundaries of the domain
-    calculate_boundary_tendency_contributions!(model.timestepper.Gⁿ,
+    compute_boundary_tendency_contributions!(model.timestepper.Gⁿ,
                                                model.architecture,
                                                model.velocities,
                                                model.tracers,
@@ -49,8 +49,8 @@ function compute_tendencies!(model::NonhydrostaticModel, callbacks)
     return nothing
 end
 
-""" Store previous value of the source term and calculate current source term. """
-function calculate_interior_tendency_contributions!(model, kernel_parameters; only_active_cells = nothing)
+""" Store previous value of the source term and compute current source term. """
+function compute_interior_tendency_contributions!(model, kernel_parameters; only_active_cells = nothing)
 
     tendencies           = model.timestepper.Gⁿ
     arch                 = model.architecture
@@ -90,15 +90,15 @@ function calculate_interior_tendency_contributions!(model, kernel_parameters; on
     w_kernel_args = tuple(start_momentum_kernel_args..., w_immersed_bc, end_momentum_kernel_args..., forcings, clock)
 
     for parameters in kernel_parameters
-        launch!(arch, grid, parameters, calculate_Gu!, 
+        launch!(arch, grid, parameters, compute_Gu!, 
                 tendencies.u, grid, only_active_cells, u_kernel_args;
                 only_active_cells)
 
-        launch!(arch, grid, parameters, calculate_Gv!, 
+        launch!(arch, grid, parameters, compute_Gv!, 
                 tendencies.v, grid, only_active_cells, v_kernel_args;
                 only_active_cells)
 
-        launch!(arch, grid, parameters, calculate_Gw!, 
+        launch!(arch, grid, parameters, compute_Gw!, 
                 tendencies.w, grid, only_active_cells, w_kernel_args;
                 only_active_cells)
     end
@@ -119,7 +119,7 @@ function calculate_interior_tendency_contributions!(model, kernel_parameters; on
                      forcing, clock)
 
         for parameters in kernel_parameters
-            launch!(arch, grid, parameters, calculate_Gc!, 
+            launch!(arch, grid, parameters, compute_Gc!, 
                     c_tendency, grid, only_active_cells, args;
                     only_active_cells)
         end
@@ -133,36 +133,36 @@ end
 #####
 
 """ Calculate the right-hand-side of the u-velocity equation. """
-@kernel function calculate_Gu!(Gu, grid, interior_map, args) 
+@kernel function compute_Gu!(Gu, grid, interior_map, args) 
     i, j, k = @index(Global, NTuple)
     @inbounds Gu[i, j, k] = u_velocity_tendency(i, j, k, grid, args...)
 end
 
-@kernel function calculate_Gu!(Gu, grid::ActiveCellsIBG, ::InteriorMap, args) 
+@kernel function compute_Gu!(Gu, grid::ActiveCellsIBG, ::InteriorMap, args) 
     idx = @index(Global, Linear)
     i, j, k = active_linear_index_to_interior_tuple(idx, grid)
     @inbounds Gu[i, j, k] = u_velocity_tendency(i, j, k, grid, args...)
 end
 
 """ Calculate the right-hand-side of the v-velocity equation. """
-@kernel function calculate_Gv!(Gv, grid, interior_map, args) 
+@kernel function compute_Gv!(Gv, grid, interior_map, args) 
     i, j, k = @index(Global, NTuple)
     @inbounds Gv[i, j, k] = v_velocity_tendency(i, j, k, grid, args...)
 end
 
-@kernel function calculate_Gv!(Gv, grid::ActiveCellsIBG, ::InteriorMap, args) 
+@kernel function compute_Gv!(Gv, grid::ActiveCellsIBG, ::InteriorMap, args) 
     idx = @index(Global, Linear)
     i, j, k = active_linear_index_to_interior_tuple(idx, grid)
     @inbounds Gv[i, j, k] = v_velocity_tendency(i, j, k, grid, args...)
 end
 
 """ Calculate the right-hand-side of the w-velocity equation. """
-@kernel function calculate_Gw!(Gw, grid, interior_map, args) 
+@kernel function compute_Gw!(Gw, grid, interior_map, args) 
     i, j, k = @index(Global, NTuple)
     @inbounds Gw[i, j, k] = w_velocity_tendency(i, j, k, grid, args...)
 end
 
-@kernel function calculate_Gw!(Gw, grid::ActiveCellsIBG, ::InteriorMap, args)
+@kernel function compute_Gw!(Gw, grid::ActiveCellsIBG, ::InteriorMap, args)
     idx = @index(Global, Linear)
     i, j, k = active_linear_index_to_interior_tuple(idx, grid)
     @inbounds Gw[i, j, k] = w_velocity_tendency(i, j, k, grid, args...)
@@ -173,12 +173,12 @@ end
 #####
 
 """ Calculate the right-hand-side of the tracer advection-diffusion equation. """
-@kernel function calculate_Gc!(Gc, grid, interior_map, args)
+@kernel function compute_Gc!(Gc, grid, interior_map, args)
     i, j, k = @index(Global, NTuple)
     @inbounds Gc[i, j, k] = tracer_tendency(i, j, k, grid, args...)
 end
 
-@kernel function calculate_Gc!(Gc, grid::ActiveCellsIBG, ::InteriorMap, args) 
+@kernel function compute_Gc!(Gc, grid::ActiveCellsIBG, ::InteriorMap, args) 
     idx = @index(Global, Linear)
     i, j, k = active_linear_index_to_interior_tuple(idx, grid)
     @inbounds Gc[i, j, k] = tracer_tendency(i, j, k, grid, args...)
@@ -189,7 +189,7 @@ end
 #####
 
 """ Apply boundary conditions by adding flux divergences to the right-hand-side. """
-function calculate_boundary_tendency_contributions!(Gⁿ, arch, velocities, tracers, clock, model_fields)
+function compute_boundary_tendency_contributions!(Gⁿ, arch, velocities, tracers, clock, model_fields)
     fields = merge(velocities, tracers)
 
     foreach(i -> apply_x_bcs!(Gⁿ[i], fields[i], arch, clock, model_fields), 1:length(fields))
diff --git a/src/Models/ShallowWaterModels/ShallowWaterModels.jl b/src/Models/ShallowWaterModels/ShallowWaterModels.jl
index edc2667f12..b257ea3765 100644
--- a/src/Models/ShallowWaterModels/ShallowWaterModels.jl
+++ b/src/Models/ShallowWaterModels/ShallowWaterModels.jl
@@ -39,7 +39,7 @@ Return a flattened `NamedTuple` of the prognostic fields associated with `Shallo
 prognostic_fields(model::ShallowWaterModel) = fields(model)
 
 include("solution_and_tracer_tendencies.jl")
-include("calculate_shallow_water_tendencies.jl")
+include("compute_shallow_water_tendencies.jl")
 include("update_shallow_water_state.jl")
 include("shallow_water_advection_operators.jl")
 include("shallow_water_diffusion_operators.jl")
diff --git a/src/Models/ShallowWaterModels/calculate_shallow_water_tendencies.jl b/src/Models/ShallowWaterModels/calculate_shallow_water_tendencies.jl
deleted file mode 100644
index fdb02e33fe..0000000000
--- a/src/Models/ShallowWaterModels/calculate_shallow_water_tendencies.jl
+++ /dev/null
@@ -1,213 +0,0 @@
-import Oceananigans.TimeSteppers: compute_tendencies!
-
-using Oceananigans.Utils: work_layout
-using Oceananigans: fields, TimeStepCallsite, TendencyCallsite, UpdateStateCallsite
-using KernelAbstractions: @index, @kernel
-
-using Oceananigans.Architectures: device
-
-using Oceananigans.BoundaryConditions 
-
-
-"""
-    compute_tendencies!(model::ShallowWaterModel)
-
-Calculate the interior and boundary contributions to tendency terms without the
-contribution from non-hydrostatic pressure.
-"""
-function compute_tendencies!(model::ShallowWaterModel, callbacks)
-
-    # Note:
-    #
-    # "tendencies" is a NamedTuple of OffsetArrays corresponding to the tendency data for use
-    # in GPU computations.
-    #
-    # "model.timestepper.Gⁿ" is a NamedTuple of Fields, whose data also corresponds to
-    # tendency data.
-
-    # Calculate contributions to momentum and tracer tendencies from fluxes and volume terms in the
-    # interior of the domain
-    calculate_interior_tendency_contributions!(model.timestepper.Gⁿ,
-                                               model.architecture,
-                                               model.grid,
-                                               model.gravitational_acceleration,
-                                               model.advection,
-                                               model.velocities,
-                                               model.coriolis,
-                                               model.closure,
-                                               model.bathymetry,
-                                               model.solution,
-                                               model.tracers,
-                                               model.diffusivity_fields,
-                                               model.forcing,
-                                               model.clock,
-                                               model.formulation)
-
-    # Calculate contributions to momentum and tracer tendencies from user-prescribed fluxes across the
-    # boundaries of the domain
-    calculate_boundary_tendency_contributions!(model.timestepper.Gⁿ,
-                                               model.architecture,
-                                               model.solution,
-                                               model.tracers,
-                                               model.clock,
-                                               fields(model))
-
-    [callback(model) for callback in callbacks if isa(callback.callsite, TendencyCallsite)]
-
-    return nothing
-end
-
-""" Store previous value of the source term and calculate current source term. """
-function calculate_interior_tendency_contributions!(tendencies,
-                                                    arch,
-                                                    grid,
-                                                    gravitational_acceleration,
-                                                    advection,
-                                                    velocities,
-                                                    coriolis,
-                                                    closure, 
-                                                    bathymetry,
-                                                    solution,
-                                                    tracers,
-                                                    diffusivities,
-                                                    forcings,
-                                                    clock,
-                                                    formulation)
-
-    workgroup, worksize = work_layout(grid, :xyz)
-
-    calculate_Guh_kernel! = calculate_Guh!(device(arch), workgroup, worksize)
-    calculate_Gvh_kernel! = calculate_Gvh!(device(arch), workgroup, worksize)
-    calculate_Gh_kernel!  =  calculate_Gh!(device(arch), workgroup, worksize)
-    calculate_Gc_kernel!  =  calculate_Gc!(device(arch), workgroup, worksize)
-
-    args_vel = (grid, gravitational_acceleration, advection.momentum, velocities, coriolis, closure, 
-                      bathymetry, solution, tracers, diffusivities, forcings, clock, formulation)
-    args_h   = (grid, gravitational_acceleration, advection.mass, coriolis, closure, 
-                      solution, tracers, diffusivities, forcings, clock, formulation)
-
-    calculate_Guh_kernel!(tendencies[1], args_vel...)
-    calculate_Gvh_kernel!(tendencies[2], args_vel...)
-     calculate_Gh_kernel!(tendencies[3], args_h...)
-
-    for (tracer_index, tracer_name) in enumerate(propertynames(tracers))
-        @inbounds c_tendency = tendencies[tracer_index+3]
-        @inbounds forcing = forcings[tracer_index+3]
-        @inbounds c_advection = advection[tracer_name]
-
-        calculate_Gc_kernel!(c_tendency, grid, Val(tracer_index), c_advection, closure, solution,
-                             tracers, diffusivities, forcing, clock, formulation)
-
-    end
-
-    return nothing
-end
-
-#####
-##### Tendency calculators for the transports and height: uh, vh, h
-#####
-
-""" Calculate the right-hand-side of the uh-transport equation. """
-@kernel function calculate_Guh!(Guh,
-                                grid,
-                                gravitational_acceleration,
-                                advection,
-                                velocities,
-                                coriolis,
-                                closure, 
-                                bathymetry,
-                                solution,
-                                tracers,
-                                diffusivities,
-                                forcings,
-                                clock, 
-                                formulation)
-
-    i, j, k = @index(Global, NTuple)
-
-    @inbounds Guh[i, j, k] = uh_solution_tendency(i, j, k, grid, gravitational_acceleration, advection, velocities, coriolis, closure, 
-                                                    bathymetry, solution, tracers, diffusivities, forcings, clock, formulation)
-end
-
-""" Calculate the right-hand-side of the vh-transport equation. """
-@kernel function calculate_Gvh!(Gvh,
-                                grid,
-                                gravitational_acceleration,
-                                advection,
-                                velocities,
-                                coriolis,
-                                closure,
-                                bathymetry,
-                                solution,
-                                tracers,
-                                diffusivities,
-                                forcings,
-                                clock, 
-                                formulation)
-
-    i, j, k = @index(Global, NTuple)
-
-    @inbounds Gvh[i, j, k] = vh_solution_tendency(i, j, k, grid, gravitational_acceleration, advection, velocities, coriolis, closure, 
-                                                    bathymetry, solution, tracers, diffusivities, forcings, clock, formulation)
-end
-
-""" Calculate the right-hand-side of the height equation. """
-@kernel function calculate_Gh!(Gh,
-                               grid,
-                               gravitational_acceleration,
-                               advection,
-                               coriolis,
-                               closure,
-                               solution,
-                               tracers,
-                               diffusivities,
-                               forcings,
-                               clock, 
-                               formulation)
-
-    i, j, k = @index(Global, NTuple)
-
-    @inbounds Gh[i, j, k] = h_solution_tendency(i, j, k, grid, gravitational_acceleration, advection, coriolis, closure,
-                                                solution, tracers, diffusivities, forcings, clock, formulation)
-end
-
-#####
-##### Tracer(s)
-#####
-
-""" Calculate the right-hand-side of the tracer advection-diffusion equation. """
-@kernel function calculate_Gc!(Gc,
-                               grid,
-                               tracer_index,
-                               advection,
-                               closure,
-                               solution,
-                               tracers,
-                               diffusivities,
-                               forcing,
-                               clock,
-                               formulation)
-
-    i, j, k = @index(Global, NTuple)
-
-    @inbounds Gc[i, j, k] = tracer_tendency(i, j, k, grid, tracer_index, advection, closure, solution, tracers,
-                                            diffusivities, forcing, clock, formulation)
-end
-
-#####
-##### Boundary contributions to tendencies due to user-prescribed fluxes
-#####
-
-""" Apply boundary conditions by adding flux divergences to the right-hand-side. """
-function calculate_boundary_tendency_contributions!(Gⁿ, arch, solution, tracers, clock, model_fields)
-    prognostic_fields = merge(solution, tracers)
-
-    # Solution fields and tracer fields
-    for i in 1:length(Gⁿ)
-        apply_x_bcs!(Gⁿ[i], prognostic_fields[i], arch, clock, model_fields)
-        apply_y_bcs!(Gⁿ[i], prognostic_fields[i], arch, clock, model_fields)
-    end
-
-    return nothing
-end
-
diff --git a/src/Models/ShallowWaterModels/compute_shallow_water_tendencies.jl b/src/Models/ShallowWaterModels/compute_shallow_water_tendencies.jl
new file mode 100644
index 0000000000..f75f87db69
--- /dev/null
+++ b/src/Models/ShallowWaterModels/compute_shallow_water_tendencies.jl
@@ -0,0 +1,213 @@
+import Oceananigans.TimeSteppers: compute_tendencies!
+
+using Oceananigans.Utils: work_layout
+using Oceananigans: fields, TimeStepCallsite, TendencyCallsite, UpdateStateCallsite
+using KernelAbstractions: @index, @kernel
+
+using Oceananigans.Architectures: device
+
+using Oceananigans.BoundaryConditions 
+
+
+"""
+    compute_tendencies!(model::ShallowWaterModel, callbacks)
+
+Calculate the interior and boundary contributions to tendency terms without the
+contribution from non-hydrostatic pressure.
+"""
+function compute_tendencies!(model::ShallowWaterModel, callbacks)
+
+    # Note:
+    #
+    # "tendencies" is a NamedTuple of OffsetArrays corresponding to the tendency data for use
+    # in GPU computations.
+    #
+    # "model.timestepper.Gⁿ" is a NamedTuple of Fields, whose data also corresponds to
+    # tendency data.
+
+    # Calculate contributions to momentum and tracer tendencies from fluxes and volume terms in the
+    # interior of the domain
+    compute_interior_tendency_contributions!(model.timestepper.Gⁿ,
+                                             model.architecture,
+                                             model.grid,
+                                             model.gravitational_acceleration,
+                                             model.advection,
+                                             model.velocities,
+                                             model.coriolis,
+                                             model.closure,
+                                             model.bathymetry,
+                                             model.solution,
+                                             model.tracers,
+                                             model.diffusivity_fields,
+                                             model.forcing,
+                                             model.clock,
+                                             model.formulation)
+
+    # Calculate contributions to momentum and tracer tendencies from user-prescribed fluxes across the
+    # boundaries of the domain
+    compute_boundary_tendency_contributions!(model.timestepper.Gⁿ,
+                                             model.architecture,
+                                             model.solution,
+                                             model.tracers,
+                                             model.clock,
+                                             fields(model))
+
+    [callback(model) for callback in callbacks if isa(callback.callsite, TendencyCallsite)]
+
+    return nothing
+end
+
+""" Compute the interior tendency contributions by running the `Guh`, `Gvh`, `Gh`, and per-tracer `Gc` kernels. """
+function compute_interior_tendency_contributions!(tendencies,
+                                                  arch,
+                                                  grid,
+                                                  gravitational_acceleration,
+                                                  advection,
+                                                  velocities,
+                                                  coriolis,
+                                                  closure, 
+                                                  bathymetry,
+                                                  solution,
+                                                  tracers,
+                                                  diffusivities,
+                                                  forcings,
+                                                  clock,
+                                                  formulation)
+
+    workgroup, worksize = work_layout(grid, :xyz)
+
+    compute_Guh_kernel! = compute_Guh!(device(arch), workgroup, worksize)
+    compute_Gvh_kernel! = compute_Gvh!(device(arch), workgroup, worksize)
+    compute_Gh_kernel!  =  compute_Gh!(device(arch), workgroup, worksize)
+    compute_Gc_kernel!  =  compute_Gc!(device(arch), workgroup, worksize)
+
+    args_vel = (grid, gravitational_acceleration, advection.momentum, velocities, coriolis, closure, 
+                      bathymetry, solution, tracers, diffusivities, forcings, clock, formulation)
+    args_h   = (grid, gravitational_acceleration, advection.mass, coriolis, closure, 
+                      solution, tracers, diffusivities, forcings, clock, formulation)
+
+    compute_Guh_kernel!(tendencies[1], args_vel...)
+    compute_Gvh_kernel!(tendencies[2], args_vel...)
+     compute_Gh_kernel!(tendencies[3], args_h...)
+
+    for (tracer_index, tracer_name) in enumerate(propertynames(tracers))
+        @inbounds c_tendency = tendencies[tracer_index+3]
+        @inbounds forcing = forcings[tracer_index+3]
+        @inbounds c_advection = advection[tracer_name]
+
+        compute_Gc_kernel!(c_tendency, grid, Val(tracer_index), c_advection, closure, solution,
+                           tracers, diffusivities, forcing, clock, formulation)
+
+    end
+
+    return nothing
+end
+
+#####
+##### Tendency calculators for the transports and height: uh, vh, h
+#####
+
+""" Calculate the right-hand-side of the uh-transport equation. """
+@kernel function compute_Guh!(Guh,
+                              grid,
+                              gravitational_acceleration,
+                              advection,
+                              velocities,
+                              coriolis,
+                              closure, 
+                              bathymetry,
+                              solution,
+                              tracers,
+                              diffusivities,
+                              forcings,
+                              clock, 
+                              formulation)
+
+    i, j, k = @index(Global, NTuple)
+
+    @inbounds Guh[i, j, k] = uh_solution_tendency(i, j, k, grid, gravitational_acceleration, advection, velocities, coriolis, closure, 
+                                                    bathymetry, solution, tracers, diffusivities, forcings, clock, formulation)
+end
+
+""" Calculate the right-hand-side of the vh-transport equation. """
+@kernel function compute_Gvh!(Gvh,
+                              grid,
+                              gravitational_acceleration,
+                              advection,
+                              velocities,
+                              coriolis,
+                              closure,
+                              bathymetry,
+                              solution,
+                              tracers,
+                              diffusivities,
+                              forcings,
+                              clock, 
+                              formulation)
+
+    i, j, k = @index(Global, NTuple)
+
+    @inbounds Gvh[i, j, k] = vh_solution_tendency(i, j, k, grid, gravitational_acceleration, advection, velocities, coriolis, closure, 
+                                                    bathymetry, solution, tracers, diffusivities, forcings, clock, formulation)
+end
+
+""" Calculate the right-hand-side of the height equation. """
+@kernel function compute_Gh!(Gh,
+                             grid,
+                             gravitational_acceleration,
+                             advection,
+                             coriolis,
+                             closure,
+                             solution,
+                             tracers,
+                             diffusivities,
+                             forcings,
+                             clock, 
+                             formulation)
+
+    i, j, k = @index(Global, NTuple)
+
+    @inbounds Gh[i, j, k] = h_solution_tendency(i, j, k, grid, gravitational_acceleration, advection, coriolis, closure,
+                                                solution, tracers, diffusivities, forcings, clock, formulation)
+end
+
+#####
+##### Tracer(s)
+#####
+
+""" Calculate the right-hand-side of the tracer advection-diffusion equation. """
+@kernel function compute_Gc!(Gc,
+                             grid,
+                             tracer_index,
+                             advection,
+                             closure,
+                             solution,
+                             tracers,
+                             diffusivities,
+                             forcing,
+                             clock,
+                             formulation)
+
+    i, j, k = @index(Global, NTuple)
+
+    @inbounds Gc[i, j, k] = tracer_tendency(i, j, k, grid, tracer_index, advection, closure, solution, tracers,
+                                            diffusivities, forcing, clock, formulation)
+end
+
+#####
+##### Boundary contributions to tendencies due to user-prescribed fluxes
+#####
+
+""" Apply boundary conditions by adding flux divergences to the right-hand-side. """
+function compute_boundary_tendency_contributions!(Gⁿ, arch, solution, tracers, clock, model_fields)
+    prognostic_fields = merge(solution, tracers)
+
+    # Solution fields and tracer fields
+    for i in 1:length(Gⁿ)
+        apply_x_bcs!(Gⁿ[i], prognostic_fields[i], arch, clock, model_fields)
+        apply_y_bcs!(Gⁿ[i], prognostic_fields[i], arch, clock, model_fields)
+    end
+
+    return nothing
+end
+

From 8f6fc681a71d916696b94dda1adca79977a05601 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 19 Sep 2023 13:51:02 +0200
Subject: [PATCH 525/530] fixed tests

---
 src/Distributed/multi_architectures.jl                  | 2 +-
 src/Models/NonhydrostaticModels/NonhydrostaticModels.jl | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Distributed/multi_architectures.jl b/src/Distributed/multi_architectures.jl
index f68432ffb9..73ee309e4f 100644
--- a/src/Distributed/multi_architectures.jl
+++ b/src/Distributed/multi_architectures.jl
@@ -105,7 +105,7 @@ function MultiProcess(child_architecture = CPU();
 end
 
 const MultiCPUProcess = MultiProcess{CPU}
-const MultiGPUProcess = MultiProcess{CPU}
+const MultiGPUProcess = MultiProcess{GPU}
 
 const BlockingMultiProcess = MultiProcess{<:Any, <:Nothing}
 
diff --git a/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl b/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl
index de60a9260d..bcdcf12fe3 100644
--- a/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl
+++ b/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl
@@ -74,7 +74,7 @@ include("update_hydrostatic_pressure.jl")
 include("update_nonhydrostatic_model_state.jl")
 include("pressure_correction.jl")
 include("nonhydrostatic_tendency_kernel_functions.jl")
-include("calculate_nonhydrostatic_tendencies.jl")
-include("calculate_nonhydrostatic_boundary_tendencies.jl")
+include("compute_nonhydrostatic_tendencies.jl")
+include("compute_nonhydrostatic_boundary_tendencies.jl")
 
 end # module

From de64e92fdb8dffb2c1450516eba8d39f338ae2f9 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 19 Sep 2023 14:37:16 +0200
Subject: [PATCH 526/530] do not compute momentum in prescribed velocities

---
 .../prescribed_hydrostatic_velocity_fields.jl                   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl b/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl
index 85b26baf12..74e60c88ed 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/prescribed_hydrostatic_velocity_fields.jl
@@ -94,7 +94,7 @@ FreeSurface(::ImplicitFreeSurface{Nothing}, ::PrescribedVelocityFields, grid) =
 FreeSurface(::SplitExplicitFreeSurface,     ::PrescribedVelocityFields, grid) = nothing
 
 hydrostatic_prognostic_fields(::PrescribedVelocityFields, ::Nothing, tracers) = tracers
-calculate_hydrostatic_momentum_tendencies!(model, ::PrescribedVelocityFields, kernel_parameters; kwargs...) = nothing
+compute_hydrostatic_momentum_tendencies!(model, ::PrescribedVelocityFields, kernel_parameters; kwargs...) = nothing
 
 apply_flux_bcs!(::Nothing, c, arch, clock, model_fields) = nothing
 

From 58d92ec911aea9acc434ba7361d289cee59daf79 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 19 Sep 2023 15:01:36 +0200
Subject: [PATCH 527/530] DistributedComputations

---
 benchmark/distributed_nonhydrostatic_model_mpi.jl             | 2 +-
 benchmark/distributed_shallow_water_model_mpi.jl              | 2 +-
 .../DistributedComputations.jl}                               | 4 ++--
 .../distributed_architectures.jl}                             | 0
 .../distributed_fft_based_poisson_solver.jl                   | 0
 .../distributed_fields.jl                                     | 0
 .../distributed_grids.jl                                      | 0
 .../distributed_kernel_launching.jl                           | 0
 .../halo_communication.jl                                     | 0
 .../halo_communication_bcs.jl                                 | 0
 .../interleave_communication_and_computation.jl               | 0
 .../partition_assemble.jl                                     | 0
 .../compute_hydrostatic_free_surface_boundary_tendencies.jl   | 4 ++--
 .../compute_hydrostatic_free_surface_tendencies.jl            | 4 ++--
 .../distributed_split_explicit_free_surface.jl                | 4 ++--
 .../hydrostatic_free_surface_model.jl                         | 2 +-
 src/Models/NonhydrostaticModels/NonhydrostaticModels.jl       | 2 +-
 .../compute_nonhydrostatic_boundary_tendencies.jl             | 2 +-
 .../NonhydrostaticModels/compute_nonhydrostatic_tendencies.jl | 2 +-
 src/Models/NonhydrostaticModels/nonhydrostatic_model.jl       | 2 +-
 src/Models/NonhydrostaticModels/solve_for_pressure.jl         | 2 +-
 src/Models/ShallowWaterModels/shallow_water_model.jl          | 2 +-
 src/MultiRegion/multi_region_grid.jl                          | 2 +-
 src/Oceananigans.jl                                           | 4 ++--
 src/OutputWriters/output_writer_utils.jl                      | 4 ++--
 src/Simulations/time_step_wizard.jl                           | 2 +-
 test/dependencies_for_runtests.jl                             | 2 +-
 test/test_distributed_models.jl                               | 2 +-
 test/test_distributed_poisson_solvers.jl                      | 4 ++--
 .../distributed_simulations/mpi_geostrophic_adjustment.jl     | 2 +-
 .../distributed_simulations/mpi_hydrostatic_turbulence.jl     | 2 +-
 .../mpi_nonhydrostatic_two_dimensional_turbulence.jl          | 2 +-
 validation/distributed_simulations/mpi_output_writing.jl      | 2 +-
 validation/distributed_simulations/mpi_set.jl                 | 2 +-
 .../distributed_simulations/mpi_shallow_water_turbulence.jl   | 2 +-
 35 files changed, 33 insertions(+), 33 deletions(-)
 rename src/{Distributed/Distributed.jl => DistributedComputations/DistributedComputations.jl} (88%)
 rename src/{Distributed/multi_architectures.jl => DistributedComputations/distributed_architectures.jl} (100%)
 rename src/{Distributed => DistributedComputations}/distributed_fft_based_poisson_solver.jl (100%)
 rename src/{Distributed => DistributedComputations}/distributed_fields.jl (100%)
 rename src/{Distributed => DistributedComputations}/distributed_grids.jl (100%)
 rename src/{Distributed => DistributedComputations}/distributed_kernel_launching.jl (100%)
 rename src/{Distributed => DistributedComputations}/halo_communication.jl (100%)
 rename src/{Distributed => DistributedComputations}/halo_communication_bcs.jl (100%)
 rename src/{Distributed => DistributedComputations}/interleave_communication_and_computation.jl (100%)
 rename src/{Distributed => DistributedComputations}/partition_assemble.jl (100%)

diff --git a/benchmark/distributed_nonhydrostatic_model_mpi.jl b/benchmark/distributed_nonhydrostatic_model_mpi.jl
index 37411164e2..bfe90f8b7d 100644
--- a/benchmark/distributed_nonhydrostatic_model_mpi.jl
+++ b/benchmark/distributed_nonhydrostatic_model_mpi.jl
@@ -6,7 +6,7 @@ using JLD2
 using BenchmarkTools
 
 using Oceananigans
-using Oceananigans.Distributed
+using Oceananigans.DistributedComputations
 
 Logging.global_logger(OceananigansLogger())
 
diff --git a/benchmark/distributed_shallow_water_model_mpi.jl b/benchmark/distributed_shallow_water_model_mpi.jl
index 725abe45d9..63e13fb81a 100644
--- a/benchmark/distributed_shallow_water_model_mpi.jl
+++ b/benchmark/distributed_shallow_water_model_mpi.jl
@@ -6,7 +6,7 @@ using JLD2
 using BenchmarkTools
 
 using Oceananigans
-using Oceananigans.Distributed
+using Oceananigans.DistributedComputations
 using Benchmarks
 
 Logging.global_logger(OceananigansLogger())
diff --git a/src/Distributed/Distributed.jl b/src/DistributedComputations/DistributedComputations.jl
similarity index 88%
rename from src/Distributed/Distributed.jl
rename to src/DistributedComputations/DistributedComputations.jl
index af2103ae18..d4ba9432ac 100644
--- a/src/Distributed/Distributed.jl
+++ b/src/DistributedComputations/DistributedComputations.jl
@@ -1,4 +1,4 @@
-module Distributed
+module DistributedComputations
 
 export
     MultiProcess, child_architecture, reconstruct_global_grid, 
@@ -10,7 +10,7 @@ using MPI
 using Oceananigans.Utils
 using Oceananigans.Grids
 
-include("multi_architectures.jl")
+include("distributed_architectures.jl")
 include("partition_assemble.jl")
 include("distributed_grids.jl")
 include("distributed_kernel_launching.jl")
diff --git a/src/Distributed/multi_architectures.jl b/src/DistributedComputations/distributed_architectures.jl
similarity index 100%
rename from src/Distributed/multi_architectures.jl
rename to src/DistributedComputations/distributed_architectures.jl
diff --git a/src/Distributed/distributed_fft_based_poisson_solver.jl b/src/DistributedComputations/distributed_fft_based_poisson_solver.jl
similarity index 100%
rename from src/Distributed/distributed_fft_based_poisson_solver.jl
rename to src/DistributedComputations/distributed_fft_based_poisson_solver.jl
diff --git a/src/Distributed/distributed_fields.jl b/src/DistributedComputations/distributed_fields.jl
similarity index 100%
rename from src/Distributed/distributed_fields.jl
rename to src/DistributedComputations/distributed_fields.jl
diff --git a/src/Distributed/distributed_grids.jl b/src/DistributedComputations/distributed_grids.jl
similarity index 100%
rename from src/Distributed/distributed_grids.jl
rename to src/DistributedComputations/distributed_grids.jl
diff --git a/src/Distributed/distributed_kernel_launching.jl b/src/DistributedComputations/distributed_kernel_launching.jl
similarity index 100%
rename from src/Distributed/distributed_kernel_launching.jl
rename to src/DistributedComputations/distributed_kernel_launching.jl
diff --git a/src/Distributed/halo_communication.jl b/src/DistributedComputations/halo_communication.jl
similarity index 100%
rename from src/Distributed/halo_communication.jl
rename to src/DistributedComputations/halo_communication.jl
diff --git a/src/Distributed/halo_communication_bcs.jl b/src/DistributedComputations/halo_communication_bcs.jl
similarity index 100%
rename from src/Distributed/halo_communication_bcs.jl
rename to src/DistributedComputations/halo_communication_bcs.jl
diff --git a/src/Distributed/interleave_communication_and_computation.jl b/src/DistributedComputations/interleave_communication_and_computation.jl
similarity index 100%
rename from src/Distributed/interleave_communication_and_computation.jl
rename to src/DistributedComputations/interleave_communication_and_computation.jl
diff --git a/src/Distributed/partition_assemble.jl b/src/DistributedComputations/partition_assemble.jl
similarity index 100%
rename from src/Distributed/partition_assemble.jl
rename to src/DistributedComputations/partition_assemble.jl
diff --git a/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_boundary_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_boundary_tendencies.jl
index ce019b6701..01a60e1fc9 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_boundary_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_boundary_tendencies.jl
@@ -1,4 +1,4 @@
-import Oceananigans.Distributed: compute_boundary_tendencies!
+import Oceananigans.DistributedComputations: compute_boundary_tendencies!
 using Oceananigans.Utils: worktuple, offsets
 using Oceananigans.TurbulenceClosures: required_halo_size
 using Oceananigans.Models.NonhydrostaticModels: boundary_tendency_kernel_parameters,
@@ -6,7 +6,7 @@ using Oceananigans.Models.NonhydrostaticModels: boundary_tendency_kernel_paramet
                                                 boundary_κ_kernel_parameters,
                                                 boundary_parameters
 
-import Oceananigans.Distributed: compute_boundary_tendencies!
+import Oceananigans.DistributedComputations: compute_boundary_tendencies!
 
 # We assume here that top/bottom BC are always synched (no partitioning in z)
 function compute_boundary_tendencies!(model::HydrostaticFreeSurfaceModel)
diff --git a/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_tendencies.jl b/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_tendencies.jl
index 7691a586c2..10bdd4daae 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_tendencies.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/compute_hydrostatic_free_surface_tendencies.jl
@@ -10,8 +10,8 @@ using Oceananigans.Biogeochemistry: update_tendencies!
 import Oceananigans.TimeSteppers: compute_tendencies!
 import Oceananigans: tracer_tendency_kernel_function
 
-import Oceananigans.Distributed: complete_communication_and_compute_boundary!
-import Oceananigans.Distributed: interior_tendency_kernel_parameters
+import Oceananigans.DistributedComputations: complete_communication_and_compute_boundary!
+import Oceananigans.DistributedComputations: interior_tendency_kernel_parameters
 
 using Oceananigans.ImmersedBoundaries: use_only_active_interior_cells, ActiveCellsIBG, 
                                        InteriorMap, active_linear_index_to_interior_tuple
diff --git a/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl b/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl
index 97ce1cd04c..0a08e7c1d3 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl
@@ -1,6 +1,6 @@
 using Oceananigans.AbstractOperations: GridMetricOperation, Δz
-using Oceananigans.Distributed: DistributedGrid, DistributedField
-using Oceananigans.Distributed: BlockingMultiProcess, complete_halo_communication!
+using Oceananigans.DistributedComputations: DistributedGrid, DistributedField
+using Oceananigans.DistributedComputations: BlockingMultiProcess, complete_halo_communication!
 using Oceananigans.Models.HydrostaticFreeSurfaceModels: SplitExplicitState, SplitExplicitFreeSurface
 
 import Oceananigans.Models.HydrostaticFreeSurfaceModels: FreeSurface, SplitExplicitAuxiliaryFields
diff --git a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl
index 6503b9e64d..f86864ecec 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl
@@ -1,7 +1,7 @@
 using CUDA: has_cuda
 using OrderedCollections: OrderedDict
 
-using Oceananigans.Distributed
+using Oceananigans.DistributedComputations
 using Oceananigans.Architectures: AbstractArchitecture, GPU
 using Oceananigans.Advection: AbstractAdvectionScheme, CenteredSecondOrder, VectorInvariant
 using Oceananigans.BuoyancyModels: validate_buoyancy, regularize_buoyancy, SeawaterBuoyancy, g_Earth
diff --git a/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl b/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl
index bcdcf12fe3..171cda0d0d 100644
--- a/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl
+++ b/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl
@@ -11,7 +11,7 @@ using Oceananigans.Utils
 using Oceananigans.Grids
 using Oceananigans.Grids: XYRegRectilinearGrid, XZRegRectilinearGrid, YZRegRectilinearGrid
 using Oceananigans.Solvers
-using Oceananigans.Distributed: MultiProcess, DistributedFFTBasedPoissonSolver, reconstruct_global_grid   
+using Oceananigans.DistributedComputations: MultiProcess, DistributedFFTBasedPoissonSolver, reconstruct_global_grid   
 using Oceananigans.ImmersedBoundaries: ImmersedBoundaryGrid
 using Oceananigans.Utils: SumOfArrays
 
diff --git a/src/Models/NonhydrostaticModels/compute_nonhydrostatic_boundary_tendencies.jl b/src/Models/NonhydrostaticModels/compute_nonhydrostatic_boundary_tendencies.jl
index 3e2c0c7303..5e941703da 100644
--- a/src/Models/NonhydrostaticModels/compute_nonhydrostatic_boundary_tendencies.jl
+++ b/src/Models/NonhydrostaticModels/compute_nonhydrostatic_boundary_tendencies.jl
@@ -1,4 +1,4 @@
-import Oceananigans.Distributed: compute_boundary_tendencies!
+import Oceananigans.DistributedComputations: compute_boundary_tendencies!
 using Oceananigans.Utils: worktuple, offsets
 using Oceananigans.TurbulenceClosures: required_halo_size
 using Oceananigans.Grids: XFlatGrid, YFlatGrid
diff --git a/src/Models/NonhydrostaticModels/compute_nonhydrostatic_tendencies.jl b/src/Models/NonhydrostaticModels/compute_nonhydrostatic_tendencies.jl
index ea495a76a3..17eced904a 100644
--- a/src/Models/NonhydrostaticModels/compute_nonhydrostatic_tendencies.jl
+++ b/src/Models/NonhydrostaticModels/compute_nonhydrostatic_tendencies.jl
@@ -1,7 +1,7 @@
 using Oceananigans.Biogeochemistry: update_tendencies!
 using Oceananigans: fields, TendencyCallsite
 using Oceananigans.Utils: work_layout
-using Oceananigans.Distributed: complete_communication_and_compute_boundary!, interior_tendency_kernel_parameters
+using Oceananigans.DistributedComputations: complete_communication_and_compute_boundary!, interior_tendency_kernel_parameters
 
 using Oceananigans.ImmersedBoundaries: use_only_active_interior_cells, ActiveCellsIBG, 
                                        InteriorMap, active_linear_index_to_interior_tuple
diff --git a/src/Models/NonhydrostaticModels/nonhydrostatic_model.jl b/src/Models/NonhydrostaticModels/nonhydrostatic_model.jl
index ec9d8345ab..4f3a89f97b 100644
--- a/src/Models/NonhydrostaticModels/nonhydrostatic_model.jl
+++ b/src/Models/NonhydrostaticModels/nonhydrostatic_model.jl
@@ -2,7 +2,7 @@ using CUDA: has_cuda
 using OrderedCollections: OrderedDict
 
 using Oceananigans.Architectures: AbstractArchitecture
-using Oceananigans.Distributed: MultiProcess
+using Oceananigans.DistributedComputations: MultiProcess
 using Oceananigans.Advection: CenteredSecondOrder
 using Oceananigans.BuoyancyModels: validate_buoyancy, regularize_buoyancy, SeawaterBuoyancy
 using Oceananigans.Biogeochemistry: validate_biogeochemistry, AbstractBiogeochemistry, biogeochemical_auxiliary_fields
diff --git a/src/Models/NonhydrostaticModels/solve_for_pressure.jl b/src/Models/NonhydrostaticModels/solve_for_pressure.jl
index b72780b8af..23bd6b2599 100644
--- a/src/Models/NonhydrostaticModels/solve_for_pressure.jl
+++ b/src/Models/NonhydrostaticModels/solve_for_pressure.jl
@@ -1,6 +1,6 @@
 using Oceananigans.Operators
 using Oceananigans.Solvers: FFTBasedPoissonSolver, FourierTridiagonalPoissonSolver, solve!
-using Oceananigans.Distributed: DistributedFFTBasedPoissonSolver
+using Oceananigans.DistributedComputations: DistributedFFTBasedPoissonSolver
 using Oceananigans.Grids: XDirection, YDirection, ZDirection
 
 using PencilArrays: Permutation
diff --git a/src/Models/ShallowWaterModels/shallow_water_model.jl b/src/Models/ShallowWaterModels/shallow_water_model.jl
index 27ff8b6725..22282746db 100644
--- a/src/Models/ShallowWaterModels/shallow_water_model.jl
+++ b/src/Models/ShallowWaterModels/shallow_water_model.jl
@@ -2,7 +2,7 @@ using Oceananigans: AbstractModel, AbstractOutputWriter, AbstractDiagnostic
 
 using Oceananigans.Architectures: AbstractArchitecture, CPU
 using Oceananigans.AbstractOperations: @at, KernelFunctionOperation
-using Oceananigans.Distributed
+using Oceananigans.DistributedComputations
 using Oceananigans.Advection: CenteredSecondOrder, VectorInvariant
 using Oceananigans.BoundaryConditions: regularize_field_boundary_conditions
 using Oceananigans.Fields: Field, tracernames, TracerFields, XFaceField, YFaceField, CenterField, compute!
diff --git a/src/MultiRegion/multi_region_grid.jl b/src/MultiRegion/multi_region_grid.jl
index e1adf4ec7a..4b7072bc45 100644
--- a/src/MultiRegion/multi_region_grid.jl
+++ b/src/MultiRegion/multi_region_grid.jl
@@ -3,7 +3,7 @@ using Oceananigans.ImmersedBoundaries: GridFittedBottom, PartialCellBottom, Grid
 
 import Oceananigans.Grids: architecture, size, new_data, halo_size
 import Oceananigans.Grids: with_halo, on_architecture
-import Oceananigans.Distributed: reconstruct_global_grid
+import Oceananigans.DistributedComputations: reconstruct_global_grid
 
 struct MultiRegionGrid{FT, TX, TY, TZ, P, C, G, D, Arch} <: AbstractMultiRegionGrid{FT, TX, TY, TZ, Arch}
     architecture :: Arch
diff --git a/src/Oceananigans.jl b/src/Oceananigans.jl
index 8e8cc7f69e..9c6e658b64 100644
--- a/src/Oceananigans.jl
+++ b/src/Oceananigans.jl
@@ -213,7 +213,7 @@ include("Forcings/Forcings.jl")
 include("Biogeochemistry.jl")
 
 include("ImmersedBoundaries/ImmersedBoundaries.jl")
-include("Distributed/Distributed.jl")
+include("DistributedComputations/DistributedComputations.jl")
 include("TimeSteppers/TimeSteppers.jl")
 include("Models/Models.jl")
 
@@ -244,7 +244,7 @@ using .TurbulenceClosures
 using .Solvers
 using .Forcings
 using .ImmersedBoundaries
-using .Distributed
+using .DistributedComputations
 using .Models
 using .TimeSteppers
 using .Diagnostics
diff --git a/src/OutputWriters/output_writer_utils.jl b/src/OutputWriters/output_writer_utils.jl
index 1473ff7c6b..46fcd70746 100644
--- a/src/OutputWriters/output_writer_utils.jl
+++ b/src/OutputWriters/output_writer_utils.jl
@@ -1,7 +1,7 @@
 using StructArrays: StructArray, replace_storage
 using Oceananigans.Grids: on_architecture, architecture
-using Oceananigans.Distributed
-using Oceananigans.Distributed: DistributedGrid
+using Oceananigans.DistributedComputations
+using Oceananigans.DistributedComputations: DistributedGrid
 using Oceananigans.Fields: AbstractField, indices, boundary_conditions, instantiated_location
 using Oceananigans.BoundaryConditions: bc_str, FieldBoundaryConditions, ContinuousBoundaryFunction, DiscreteBoundaryFunction
 using Oceananigans.TimeSteppers: QuasiAdamsBashforth2TimeStepper, RungeKutta3TimeStepper
diff --git a/src/Simulations/time_step_wizard.jl b/src/Simulations/time_step_wizard.jl
index 151cae8dc0..ccc6edbaa2 100644
--- a/src/Simulations/time_step_wizard.jl
+++ b/src/Simulations/time_step_wizard.jl
@@ -89,7 +89,7 @@ function TimeStepWizard(FT=Float64;
 end
 
 using Oceananigans.Grids: topology
-using Oceananigans.Distributed: all_reduce
+using Oceananigans.DistributedComputations: all_reduce
 
 """
      new_time_step(old_Δt, wizard, model)
diff --git a/test/dependencies_for_runtests.jl b/test/dependencies_for_runtests.jl
index d51dac156a..ff093734e6 100644
--- a/test/dependencies_for_runtests.jl
+++ b/test/dependencies_for_runtests.jl
@@ -29,7 +29,7 @@ using Oceananigans.Simulations
 using Oceananigans.Diagnostics
 using Oceananigans.OutputWriters
 using Oceananigans.TurbulenceClosures
-using Oceananigans.Distributed
+using Oceananigans.DistributedComputations
 using Oceananigans.Logger
 using Oceananigans.Units
 using Oceananigans.Utils
diff --git a/test/test_distributed_models.jl b/test/test_distributed_models.jl
index 8c3e073c78..7a2ee50d76 100644
--- a/test/test_distributed_models.jl
+++ b/test/test_distributed_models.jl
@@ -26,7 +26,7 @@ MPI.Init()
 # to initialize MPI.
 
 using Oceananigans.BoundaryConditions: fill_halo_regions!, DCBC
-using Oceananigans.Distributed: MultiProcess, index2rank
+using Oceananigans.DistributedComputations: MultiProcess, index2rank
 using Oceananigans.Fields: AbstractField
 using Oceananigans.Grids:
     halo_size,
diff --git a/test/test_distributed_poisson_solvers.jl b/test/test_distributed_poisson_solvers.jl
index 01ff15192b..5f3df4d821 100644
--- a/test/test_distributed_poisson_solvers.jl
+++ b/test/test_distributed_poisson_solvers.jl
@@ -25,8 +25,8 @@ MPI.Init()
 
 # to initialize MPI.
 
-using Oceananigans.Distributed: reconstruct_global_grid
-using Oceananigans.Distributed: ZXYPermutation, ZYXPermutation
+using Oceananigans.DistributedComputations: reconstruct_global_grid
+using Oceananigans.DistributedComputations: ZXYPermutation, ZYXPermutation
 
 @kernel function set_distributed_solver_input!(permuted_ϕ, ϕ, ::ZYXPermutation)
     i, j, k = @index(Global, NTuple)
diff --git a/validation/distributed_simulations/mpi_geostrophic_adjustment.jl b/validation/distributed_simulations/mpi_geostrophic_adjustment.jl
index c7bd31e0e8..38f4d1b731 100644
--- a/validation/distributed_simulations/mpi_geostrophic_adjustment.jl
+++ b/validation/distributed_simulations/mpi_geostrophic_adjustment.jl
@@ -10,7 +10,7 @@
 
 using MPI
 using Oceananigans
-using Oceananigans.Distributed
+using Oceananigans.DistributedComputations
 using Oceananigans.Grids: topology, architecture
 using Oceananigans.Units: kilometers, meters
 using Printf
diff --git a/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl b/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl
index 1194b3e1a7..24a5ef9570 100644
--- a/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl
+++ b/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl
@@ -4,7 +4,7 @@ using Oceananigans.Models.HydrostaticFreeSurfaceModels: VerticalVorticityField
 using Printf
 using Statistics
 using Oceananigans.BoundaryConditions
-using Oceananigans.Distributed    
+using Oceananigans.DistributedComputations    
 using Random
 
 # Run with 
diff --git a/validation/distributed_simulations/mpi_nonhydrostatic_two_dimensional_turbulence.jl b/validation/distributed_simulations/mpi_nonhydrostatic_two_dimensional_turbulence.jl
index 53fc3d33c4..dfb1336c43 100644
--- a/validation/distributed_simulations/mpi_nonhydrostatic_two_dimensional_turbulence.jl
+++ b/validation/distributed_simulations/mpi_nonhydrostatic_two_dimensional_turbulence.jl
@@ -10,7 +10,7 @@
 
 using MPI
 using Oceananigans
-using Oceananigans.Distributed
+using Oceananigans.DistributedComputations
 using Statistics
 using Printf
 using Logging
diff --git a/validation/distributed_simulations/mpi_output_writing.jl b/validation/distributed_simulations/mpi_output_writing.jl
index ff990a7c5b..efab350fcf 100644
--- a/validation/distributed_simulations/mpi_output_writing.jl
+++ b/validation/distributed_simulations/mpi_output_writing.jl
@@ -1,6 +1,6 @@
 using MPI
 using Oceananigans
-using Oceananigans.Distributed
+using Oceananigans.DistributedComputations
 
 MPI.Init()
 
diff --git a/validation/distributed_simulations/mpi_set.jl b/validation/distributed_simulations/mpi_set.jl
index 2c34e219c5..267b47cd78 100644
--- a/validation/distributed_simulations/mpi_set.jl
+++ b/validation/distributed_simulations/mpi_set.jl
@@ -1,6 +1,6 @@
 using MPI
 using Oceananigans
-using Oceananigans.Distributed
+using Oceananigans.DistributedComputations
 
 MPI.Init()
 
diff --git a/validation/distributed_simulations/mpi_shallow_water_turbulence.jl b/validation/distributed_simulations/mpi_shallow_water_turbulence.jl
index 43ba48364e..25a4031d82 100644
--- a/validation/distributed_simulations/mpi_shallow_water_turbulence.jl
+++ b/validation/distributed_simulations/mpi_shallow_water_turbulence.jl
@@ -9,7 +9,7 @@ mpi_ranks = MPI.Comm_size(comm)
 
 using Statistics
 using Oceananigans
-using Oceananigans.Distributed
+using Oceananigans.DistributedComputations
 
 ranks = (2, 2, 1)
 topo = (Periodic, Periodic, Flat)

From cab51e539640339dc28594194679c0d241731b5f Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 19 Sep 2023 15:01:55 +0200
Subject: [PATCH 528/530] DistributedComputations part #2

---
 .../distributed_nonhydrostatic_model_mpi.jl   |  2 +-
 .../distributed_shallow_water_model_mpi.jl    |  2 +-
 .../DistributedComputations.jl                |  2 +-
 .../distributed_architectures.jl              | 34 +++++++++----------
 .../distributed_fft_based_poisson_solver.jl   |  4 +--
 .../distributed_grids.jl                      | 22 ++++++------
 .../distributed_kernel_launching.jl           |  2 +-
 .../halo_communication.jl                     |  8 ++---
 ...nterleave_communication_and_computation.jl |  4 +--
 .../partition_assemble.jl                     | 12 +++----
 ...distributed_split_explicit_free_surface.jl |  4 +--
 .../hydrostatic_free_surface_model.jl         |  4 +--
 .../NonhydrostaticModels.jl                   |  4 +--
 .../nonhydrostatic_model.jl                   |  2 +-
 src/OutputWriters/output_writer_utils.jl      |  4 +--
 test/test_distributed_models.jl               | 30 ++++++++--------
 test/test_distributed_poisson_solvers.jl      |  2 +-
 .../mpi_geostrophic_adjustment.jl             |  2 +-
 .../mpi_hydrostatic_turbulence.jl             |  2 +-
 ...nhydrostatic_two_dimensional_turbulence.jl |  2 +-
 .../mpi_output_writing.jl                     |  2 +-
 validation/distributed_simulations/mpi_set.jl |  2 +-
 .../mpi_shallow_water_turbulence.jl           |  2 +-
 23 files changed, 77 insertions(+), 77 deletions(-)

diff --git a/benchmark/distributed_nonhydrostatic_model_mpi.jl b/benchmark/distributed_nonhydrostatic_model_mpi.jl
index bfe90f8b7d..b56a24b9ae 100644
--- a/benchmark/distributed_nonhydrostatic_model_mpi.jl
+++ b/benchmark/distributed_nonhydrostatic_model_mpi.jl
@@ -28,7 +28,7 @@ local_rank = MPI.Comm_rank(comm)
 @info "Setting up distributed nonhydrostatic model with N=($Nx, $Ny, $Nz) grid points and ranks=($Rx, $Ry, $Rz) on rank $local_rank..."
 
 topo = (Periodic, Periodic, Periodic)
-arch = MultiProcess(CPU(), topology=topo, ranks=(Rx, Ry, Rz), communicator=MPI.COMM_WORLD)
+arch = Distributed(CPU(), topology=topo, ranks=(Rx, Ry, Rz), communicator=MPI.COMM_WORLD)
 distributed_grid = RectilinearGrid(arch, topology=topo, size=(Nx, Ny, Nz), extent=(1, 1, 1))
 model = NonhydrostaticModel(grid=distributed_grid)
 
diff --git a/benchmark/distributed_shallow_water_model_mpi.jl b/benchmark/distributed_shallow_water_model_mpi.jl
index 63e13fb81a..e0f0a6d046 100644
--- a/benchmark/distributed_shallow_water_model_mpi.jl
+++ b/benchmark/distributed_shallow_water_model_mpi.jl
@@ -30,7 +30,7 @@ Ry = parse(Int, ARGS[4])
 @info "Setting up distributed shallow water model with N=($Nx, $Ny) grid points and ranks=($Rx, $Ry) on rank $local_rank..."
 
 topo = (Periodic, Periodic, Flat)
-arch = MultiProcess(CPU(), topology=topo, ranks=(Rx, Ry, 1), communicator=MPI.COMM_WORLD)
+arch = Distributed(CPU(), topology=topo, ranks=(Rx, Ry, 1), communicator=MPI.COMM_WORLD)
 distributed_grid = RectilinearGrid(arch, topology=topo, size=(Nx, Ny), extent=(1, 1))
 model = ShallowWaterModel(grid=distributed_grid, gravitational_acceleration=1.0)
 set!(model, h=1)
diff --git a/src/DistributedComputations/DistributedComputations.jl b/src/DistributedComputations/DistributedComputations.jl
index d4ba9432ac..c5180588ba 100644
--- a/src/DistributedComputations/DistributedComputations.jl
+++ b/src/DistributedComputations/DistributedComputations.jl
@@ -1,7 +1,7 @@
 module DistributedComputations
 
 export
-    MultiProcess, child_architecture, reconstruct_global_grid, 
+    Distributed, child_architecture, reconstruct_global_grid, 
     inject_halo_communication_boundary_conditions,
     DistributedFFTBasedPoissonSolver
 
diff --git a/src/DistributedComputations/distributed_architectures.jl b/src/DistributedComputations/distributed_architectures.jl
index 73ee309e4f..cdd3340656 100644
--- a/src/DistributedComputations/distributed_architectures.jl
+++ b/src/DistributedComputations/distributed_architectures.jl
@@ -6,7 +6,7 @@ import Oceananigans.Architectures: device, arch_array, array_type, child_archite
 import Oceananigans.Grids: zeros
 import Oceananigans.Utils: sync_device!
 
-struct MultiProcess{A, M, R, I, ρ, C, γ, T} <: AbstractArchitecture
+struct Distributed{A, M, R, I, ρ, C, γ, T} <: AbstractArchitecture
   child_architecture :: A
           local_rank :: R
          local_index :: I
@@ -22,7 +22,7 @@ end
 #####
 
 """
-    MultiProcess(child_architecture = CPU(); 
+    Distributed(child_architecture = CPU(); 
                     topology, 
                     ranks, 
                     devices = nothing, 
@@ -57,7 +57,7 @@ Keyword arguments
 - `communicator`: the MPI communicator, `MPI.COMM_WORLD`. This keyword argument should not be tampered with 
                   if not for testing or developing. Change at your own risk!
 """
-function MultiProcess(child_architecture = CPU(); 
+function Distributed(child_architecture = CPU(); 
                          topology, 
                          ranks,
                          devices = nothing, 
@@ -101,28 +101,28 @@ function MultiProcess(child_architecture = CPU();
     M = typeof(mpi_requests)
     T = typeof(Ref(0))
 
-    return MultiProcess{A, M, R, I, ρ, C, γ, T}(child_architecture, local_rank, local_index, ranks, local_connectivity, communicator, mpi_requests, Ref(0))
+    return Distributed{A, M, R, I, ρ, C, γ, T}(child_architecture, local_rank, local_index, ranks, local_connectivity, communicator, mpi_requests, Ref(0))
 end
 
-const MultiCPUProcess = MultiProcess{CPU}
-const MultiGPUProcess = MultiProcess{GPU}
+const DistributedCPU = Distributed{CPU}
+const DistributedGPU = Distributed{GPU}
 
-const BlockingMultiProcess = MultiProcess{<:Any, <:Nothing}
+const BlockingDistributed = Distributed{<:Any, <:Nothing}
 
 #####
 ##### All the architectures
 #####
 
-child_architecture(arch::MultiProcess) = arch.child_architecture
-device(arch::MultiProcess)             = device(child_architecture(arch))
-arch_array(arch::MultiProcess, A)      = arch_array(child_architecture(arch), A)
-zeros(FT, arch::MultiProcess, N...)    = zeros(FT, child_architecture(arch), N...)
-array_type(arch::MultiProcess)         = array_type(child_architecture(arch))
-sync_device!(arch::MultiProcess)       = sync_device!(arch.child_architecture)
+child_architecture(arch::Distributed) = arch.child_architecture
+device(arch::Distributed)             = device(child_architecture(arch))
+arch_array(arch::Distributed, A)      = arch_array(child_architecture(arch), A)
+zeros(FT, arch::Distributed, N...)    = zeros(FT, child_architecture(arch), N...)
+array_type(arch::Distributed)         = array_type(child_architecture(arch))
+sync_device!(arch::Distributed)       = sync_device!(arch.child_architecture)
 
-cpu_architecture(arch::MultiCPUProcess) = arch
-cpu_architecture(arch::MultiGPUProcess) = 
-    MultiProcess(CPU(), arch.local_rank, arch.local_index, arch.ranks, 
+cpu_architecture(arch::DistributedCPU) = arch
+cpu_architecture(arch::DistributedGPU) = 
+    Distributed(CPU(), arch.local_rank, arch.local_index, arch.ranks, 
                            arch.connectivity, arch.communicator, arch.mpi_requests, arch.mpi_tag)
 
 #####
@@ -223,7 +223,7 @@ end
 ##### Pretty printing
 #####
 
-function Base.show(io::IO, arch::MultiProcess)
+function Base.show(io::IO, arch::Distributed)
     c = arch.connectivity
     print(io, "Distributed architecture (rank $(arch.local_rank)/$(prod(arch.ranks)-1)) [index $(arch.local_index) / $(arch.ranks)]\n",
               "└── child architecture: $(typeof(child_architecture(arch))) \n",
diff --git a/src/DistributedComputations/distributed_fft_based_poisson_solver.jl b/src/DistributedComputations/distributed_fft_based_poisson_solver.jl
index 962d01c831..59540792d2 100644
--- a/src/DistributedComputations/distributed_fft_based_poisson_solver.jl
+++ b/src/DistributedComputations/distributed_fft_based_poisson_solver.jl
@@ -33,7 +33,7 @@ Return a FFT-based solver for the Poisson equation,
 ∇²φ = b
 ```
 
-for `MultiProcess`itectures.
+for `Distributed` architectures.
 
 Supported configurations
 ========================
@@ -80,7 +80,7 @@ Restrictions
 ============
 
 The algorithm for two-dimensional decompositions requires that `Nz = size(global_grid, 3)` is larger
-than either `Rx = ranks[1]` or `Ry = ranks[2]`, where `ranks` are configured when building `MultiProcess`.
+than either `Rx = ranks[1]` or `Ry = ranks[2]`, where `ranks` are configured when building `Distributed`.
 If `Nz` does not satisfy this condition, we can only support a one-dimensional decomposition.
 
 Algorithm for one-dimensional decompositions
diff --git a/src/DistributedComputations/distributed_grids.jl b/src/DistributedComputations/distributed_grids.jl
index e31c7f4831..445bf7a8b4 100644
--- a/src/DistributedComputations/distributed_grids.jl
+++ b/src/DistributedComputations/distributed_grids.jl
@@ -13,20 +13,20 @@ using Oceananigans.ImmersedBoundaries
 
 import Oceananigans.Grids: RectilinearGrid, LatitudeLongitudeGrid, with_halo
 
-const DistributedGrid{FT, TX, TY, TZ} = AbstractGrid{FT, TX, TY, TZ, <:MultiProcess}
+const DistributedGrid{FT, TX, TY, TZ} = AbstractGrid{FT, TX, TY, TZ, <:Distributed}
 const DistributedRectilinearGrid{FT, TX, TY, TZ, FX, FY, FZ, VX, VY, VZ} =
-    RectilinearGrid{FT, TX, TY, TZ, FX, FY, FZ, VX, VY, VZ, <:MultiProcess} where {FT, TX, TY, TZ, FX, FY, FZ, VX, VY, VZ}
+    RectilinearGrid{FT, TX, TY, TZ, FX, FY, FZ, VX, VY, VZ, <:Distributed} where {FT, TX, TY, TZ, FX, FY, FZ, VX, VY, VZ}
 const DistributedLatitudeLongitudeGrid{FT, TX, TY, TZ, M, MY, FX, FY, FZ, VX, VY, VZ} = 
-    LatitudeLongitudeGrid{FT, TX, TY, TZ, M, MY, FX, FY, FZ, VX, VY, VZ, <:MultiProcess} where {FT, TX, TY, TZ, M, MY, FX, FY, FZ, VX, VY, VZ}
+    LatitudeLongitudeGrid{FT, TX, TY, TZ, M, MY, FX, FY, FZ, VX, VY, VZ, <:Distributed} where {FT, TX, TY, TZ, M, MY, FX, FY, FZ, VX, VY, VZ}
 
-const DistributedImmersedBoundaryGrid = ImmersedBoundaryGrid{FT, TX, TY, TZ, <:DistributedGrid, I, M, <:MultiProcess} where {FT, TX, TY, TZ, I, M}
+const DistributedImmersedBoundaryGrid = ImmersedBoundaryGrid{FT, TX, TY, TZ, <:DistributedGrid, I, M, <:Distributed} where {FT, TX, TY, TZ, I, M}
 
 """
-    RectilinearGrid(arch::MultiProcess, FT=Float64; kw...)
+    RectilinearGrid(arch::Distributed, FT=Float64; kw...)
 
 Return the rank-local portion of `RectilinearGrid` on `arch`itecture.
 """
-function RectilinearGrid(arch::MultiProcess, 
+function RectilinearGrid(arch::Distributed, 
                          FT::DataType = Float64;
                          size,
                          x = nothing,
@@ -69,11 +69,11 @@ function RectilinearGrid(arch::MultiProcess,
 end
 
 """
-    LatitudeLongitudeGrid(arch::MultiProcess, FT=Float64; kw...)
+    LatitudeLongitudeGrid(arch::Distributed, FT=Float64; kw...)
 
 Return the rank-local portion of `LatitudeLongitudeGrid` on `arch`itecture.
 """
-function LatitudeLongitudeGrid(arch::MultiProcess,
+function LatitudeLongitudeGrid(arch::Distributed,
                                FT::DataType = Float64; 
                                precompute_metrics = true,
                                size,
@@ -321,17 +321,17 @@ function scatter_grid_properties(global_grid)
     return x, y, z, topo, halo
 end
 
-function scatter_local_grids(arch::MultiProcess, global_grid::RectilinearGrid, local_size)
+function scatter_local_grids(arch::Distributed, global_grid::RectilinearGrid, local_size)
     x, y, z, topo, halo = scatter_grid_properties(global_grid)
     return RectilinearGrid(arch, eltype(global_grid); size=local_size, x=x, y=y, z=z, halo=halo, topology=topo)
 end
 
-function scatter_local_grids(arch::MultiProcess, global_grid::LatitudeLongitudeGrid, local_size)
+function scatter_local_grids(arch::Distributed, global_grid::LatitudeLongitudeGrid, local_size)
     x, y, z, topo, halo = scatter_grid_properties(global_grid)
     return LatitudeLongitudeGrid(arch, eltype(global_grid); size=local_size, longitude=x, latitude=y, z=z, halo=halo, topology=topo)
 end
 
-function scatter_local_grids(arch::MultiProcess, global_grid::ImmersedBoundaryGrid, local_size)
+function scatter_local_grids(arch::Distributed, global_grid::ImmersedBoundaryGrid, local_size)
     ib = global_grid.immersed_boundary
     ug = global_grid.underlying_grid
 
diff --git a/src/DistributedComputations/distributed_kernel_launching.jl b/src/DistributedComputations/distributed_kernel_launching.jl
index 27b8107de2..a9e6efe1dc 100644
--- a/src/DistributedComputations/distributed_kernel_launching.jl
+++ b/src/DistributedComputations/distributed_kernel_launching.jl
@@ -1,6 +1,6 @@
 import Oceananigans.Utils: launch!
 
-function launch!(arch::MultiProcess, args...; kwargs...)
+function launch!(arch::Distributed, args...; kwargs...)
     child_arch = child_architecture(arch)
     return launch!(child_arch, args...; kwargs...)
 end
diff --git a/src/DistributedComputations/halo_communication.jl b/src/DistributedComputations/halo_communication.jl
index 6315c40a06..b01b03434a 100644
--- a/src/DistributedComputations/halo_communication.jl
+++ b/src/DistributedComputations/halo_communication.jl
@@ -123,7 +123,7 @@ end
 
     # Overlapping communication and computation, store requests in a `MPI.Request`
     # pool to be waited upon after tendency calculation
-    if async && !(arch isa BlockingMultiProcess)
+    if async && !(arch isa BlockingDistributed)
         push!(arch.mpi_requests, requests...)
         return nothing
     end
@@ -238,7 +238,7 @@ for (side, opposite_side) in zip([:west, :south], [:east, :north])
     fill_opposite_side_send_buffers! = Symbol("fill_$(opposite_side)_send_buffers!")
 
     @eval begin
-        function $fill_both_halo!(c, bc_side::DCBCT, bc_opposite_side::DCBCT, size, offset, loc, arch::MultiProcess, 
+        function $fill_both_halo!(c, bc_side::DCBCT, bc_opposite_side::DCBCT, size, offset, loc, arch::Distributed, 
                                   grid::DistributedGrid, buffers, args...; only_local_halos = false, kwargs...)
 
             only_local_halos && return nothing
@@ -255,7 +255,7 @@ for (side, opposite_side) in zip([:west, :south], [:east, :north])
             return [send_req1, send_req2, recv_req1, recv_req2]
         end
 
-        function $fill_both_halo!(c, bc_side::DCBCT, bc_opposite_side, size, offset, loc, arch::MultiProcess, 
+        function $fill_both_halo!(c, bc_side::DCBCT, bc_opposite_side, size, offset, loc, arch::Distributed, 
                                   grid::DistributedGrid, buffers, args...; only_local_halos = false, kwargs...)
 
             $fill_opposite_side_halo!(c, bc_opposite_side, size, offset, loc, arch, grid, buffers, args...; kwargs...)
@@ -271,7 +271,7 @@ for (side, opposite_side) in zip([:west, :south], [:east, :north])
             return [send_req, recv_req]
         end
 
-        function $fill_both_halo!(c, bc_side, bc_opposite_side::DCBCT, size, offset, loc, arch::MultiProcess, 
+        function $fill_both_halo!(c, bc_side, bc_opposite_side::DCBCT, size, offset, loc, arch::Distributed, 
                                   grid::DistributedGrid, buffers, args...; only_local_halos = false, kwargs...)
 
             $fill_side_halo!(c, bc_side, size, offset, loc, arch, grid, buffers, args...; kwargs...)
diff --git a/src/DistributedComputations/interleave_communication_and_computation.jl b/src/DistributedComputations/interleave_communication_and_computation.jl
index 8194386469..eedab7be9b 100644
--- a/src/DistributedComputations/interleave_communication_and_computation.jl
+++ b/src/DistributedComputations/interleave_communication_and_computation.jl
@@ -16,7 +16,7 @@ function complete_communication_and_compute_boundary!(model, ::DistributedGrid,
 end
 
 # Fallback
-complete_communication_and_compute_boundary!(model, ::DistributedGrid, ::BlockingMultiProcess) = nothing
+complete_communication_and_compute_boundary!(model, ::DistributedGrid, ::BlockingDistributed) = nothing
 complete_communication_and_compute_boundary!(model, grid, arch) = nothing
 
 compute_boundary_tendencies!(model) = nothing
@@ -26,7 +26,7 @@ interior_tendency_kernel_parameters(grid) = :xyz
 interior_tendency_kernel_parameters(grid::DistributedGrid) = 
             interior_tendency_kernel_parameters(grid, architecture(grid))
 
-interior_tendency_kernel_parameters(grid, ::BlockingMultiProcess) = :xyz
+interior_tendency_kernel_parameters(grid, ::BlockingDistributed) = :xyz
 
 function interior_tendency_kernel_parameters(grid, arch)
     Rx, Ry, _ = arch.ranks
diff --git a/src/DistributedComputations/partition_assemble.jl b/src/DistributedComputations/partition_assemble.jl
index d0ac6913b0..1a7e536397 100644
--- a/src/DistributedComputations/partition_assemble.jl
+++ b/src/DistributedComputations/partition_assemble.jl
@@ -1,20 +1,20 @@
 using Oceananigans.Architectures: arch_array
 
-all_reduce(val, arch::MultiProcess; op = +) = 
+all_reduce(val, arch::Distributed; op = +) = 
     MPI.Allreduce(val, op, arch.communicator)
 
 all_reduce(val, arch; kwargs...) = val
 
 """
-    concatenate_local_sizes(n, arch::MultiProcess) 
+    concatenate_local_sizes(n, arch::Distributed) 
 
 Return a 3-Tuple containing a vector of `size(grid, idx)` for each rank in 
 all 3 directions.
 """
-concatenate_local_sizes(n, arch::MultiProcess) = 
+concatenate_local_sizes(n, arch::Distributed) = 
     Tuple(concatenate_local_sizes(n, arch, i) for i in 1:length(n))
 
-function concatenate_local_sizes(n, arch::MultiProcess, idx)
+function concatenate_local_sizes(n, arch::Distributed, idx)
     R = arch.ranks[idx]
     r = arch.local_index[idx]
     n = n isa Number ? n : n[idx]
@@ -106,7 +106,7 @@ partition_global_array(arch, c_global::AbstractArray, n) = c_global
 partition_global_array(arch, c_global::Function, n)      = c_global 
 
 # Here we assume that we cannot partition in z (we should remove support for that)
-function partition_global_array(arch::MultiProcess, c_global::AbstractArray, n) 
+function partition_global_array(arch::Distributed, c_global::AbstractArray, n) 
     c_global = arch_array(CPU(), c_global)
 
     ri, rj, rk = arch.local_index
@@ -141,7 +141,7 @@ construct_global_array(arch, c_local::AbstractArray, n) = c_local
 construct_global_array(arch, c_local::Function, N)      = c_local
 
 # TODO: This does not work for 3D parallelizations!!!
-function construct_global_array(arch::MultiProcess, c_local::AbstractArray, n) 
+function construct_global_array(arch::Distributed, c_local::AbstractArray, n) 
     c_local = arch_array(CPU(), c_local)
 
     ri, rj, rk = arch.local_index
diff --git a/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl b/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl
index 0a08e7c1d3..4dca3a8fdf 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/distributed_split_explicit_free_surface.jl
@@ -1,6 +1,6 @@
 using Oceananigans.AbstractOperations: GridMetricOperation, Δz
 using Oceananigans.DistributedComputations: DistributedGrid, DistributedField
-using Oceananigans.DistributedComputations: BlockingMultiProcess, complete_halo_communication!
+using Oceananigans.DistributedComputations: BlockingDistributed, complete_halo_communication!
 using Oceananigans.Models.HydrostaticFreeSurfaceModels: SplitExplicitState, SplitExplicitFreeSurface
 
 import Oceananigans.Models.HydrostaticFreeSurfaceModels: FreeSurface, SplitExplicitAuxiliaryFields
@@ -93,7 +93,7 @@ end
 
 const DistributedSplitExplicit = SplitExplicitFreeSurface{<:DistributedField}
 
-wait_free_surface_communication!(::DistributedSplitExplicit, ::BlockingMultiProcess) = nothing
+wait_free_surface_communication!(::DistributedSplitExplicit, ::BlockingDistributed) = nothing
     
 function wait_free_surface_communication!(free_surface::DistributedSplitExplicit, arch)
     
diff --git a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl
index f86864ecec..4499c6b3b9 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/hydrostatic_free_surface_model.jl
@@ -206,8 +206,8 @@ function validate_vertical_velocity_boundary_conditions(w)
     return nothing
 end
 
-validate_free_surface(::MultiProcess, free_surface::SplitExplicitFreeSurface) = free_surface
-validate_free_surface(arch::MultiProcess, free_surface) = error("$(typeof(free_surface)) is not supported with $(typeof(arch))")
+validate_free_surface(::Distributed, free_surface::SplitExplicitFreeSurface) = free_surface
+validate_free_surface(arch::Distributed, free_surface) = error("$(typeof(free_surface)) is not supported with $(typeof(arch))")
 validate_free_surface(arch, free_surface) = free_surface
 
 validate_momentum_advection(momentum_advection, ibg::ImmersedBoundaryGrid) = validate_momentum_advection(momentum_advection, ibg.underlying_grid)
diff --git a/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl b/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl
index 171cda0d0d..0bfa7e38f5 100644
--- a/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl
+++ b/src/Models/NonhydrostaticModels/NonhydrostaticModels.jl
@@ -11,7 +11,7 @@ using Oceananigans.Utils
 using Oceananigans.Grids
 using Oceananigans.Grids: XYRegRectilinearGrid, XZRegRectilinearGrid, YZRegRectilinearGrid
 using Oceananigans.Solvers
-using Oceananigans.DistributedComputations: MultiProcess, DistributedFFTBasedPoissonSolver, reconstruct_global_grid   
+using Oceananigans.DistributedComputations: Distributed, DistributedFFTBasedPoissonSolver, reconstruct_global_grid   
 using Oceananigans.ImmersedBoundaries: ImmersedBoundaryGrid
 using Oceananigans.Utils: SumOfArrays
 
@@ -19,7 +19,7 @@ import Oceananigans: fields, prognostic_fields
 import Oceananigans.Advection: cell_advection_timescale
 import Oceananigans.TimeSteppers: step_lagrangian_particles!
 
-function PressureSolver(arch::MultiProcess, local_grid::RegRectilinearGrid)
+function PressureSolver(arch::Distributed, local_grid::RegRectilinearGrid)
     global_grid = reconstruct_global_grid(local_grid)
     return DistributedFFTBasedPoissonSolver(global_grid, local_grid)
 end
diff --git a/src/Models/NonhydrostaticModels/nonhydrostatic_model.jl b/src/Models/NonhydrostaticModels/nonhydrostatic_model.jl
index 4f3a89f97b..841be9a917 100644
--- a/src/Models/NonhydrostaticModels/nonhydrostatic_model.jl
+++ b/src/Models/NonhydrostaticModels/nonhydrostatic_model.jl
@@ -2,7 +2,7 @@ using CUDA: has_cuda
 using OrderedCollections: OrderedDict
 
 using Oceananigans.Architectures: AbstractArchitecture
-using Oceananigans.DistributedComputations: MultiProcess
+using Oceananigans.DistributedComputations: Distributed
 using Oceananigans.Advection: CenteredSecondOrder
 using Oceananigans.BuoyancyModels: validate_buoyancy, regularize_buoyancy, SeawaterBuoyancy
 using Oceananigans.Biogeochemistry: validate_biogeochemistry, AbstractBiogeochemistry, biogeochemical_auxiliary_fields
diff --git a/src/OutputWriters/output_writer_utils.jl b/src/OutputWriters/output_writer_utils.jl
index 46fcd70746..a06302495e 100644
--- a/src/OutputWriters/output_writer_utils.jl
+++ b/src/OutputWriters/output_writer_utils.jl
@@ -44,7 +44,7 @@ saveproperty!(file, address, grid::AbstractGrid)      = _saveproperty!(file, add
 
 function saveproperty!(file, address, grid::DistributedGrid) 
     arch = architecture(grid)
-    cpu_arch = MultiProcess(CPU(); topology = topology(grid),
+    cpu_arch = Distributed(CPU(); topology = topology(grid),
                                       ranks = arch.ranks)
     _saveproperty!(file, address, on_architecture(cpu_arch, grid))
 end
@@ -86,7 +86,7 @@ serializeproperty!(file, address, grid::AbstractGrid) = file[address] = on_archi
 
 function serializeproperty!(file, address, grid::DistributedGrid) 
     arch = architecture(grid)
-    cpu_arch = MultiProcess(CPU(); topology = topology(grid),
+    cpu_arch = Distributed(CPU(); topology = topology(grid),
                                       ranks = arch.ranks)
     file[address] = on_architecture(cpu_arch, grid)
 end
diff --git a/test/test_distributed_models.jl b/test/test_distributed_models.jl
index 7a2ee50d76..c2fefca6ca 100644
--- a/test/test_distributed_models.jl
+++ b/test/test_distributed_models.jl
@@ -26,7 +26,7 @@ MPI.Init()
 # to initialize MPI.
 
 using Oceananigans.BoundaryConditions: fill_halo_regions!, DCBC
-using Oceananigans.DistributedComputations: MultiProcess, index2rank
+using Oceananigans.DistributedComputations: Distributed, index2rank
 using Oceananigans.Fields: AbstractField
 using Oceananigans.Grids:
     halo_size,
@@ -113,7 +113,7 @@ mpi_ranks = MPI.Comm_size(comm)
 
 function test_triply_periodic_rank_connectivity_with_411_ranks()
     topo = (Periodic, Periodic, Periodic)
-    arch = MultiProcess(CPU(), ranks=(4, 1, 1), topology = topo)
+    arch = Distributed(CPU(), ranks=(4, 1, 1), topology = topo)
 
     local_rank = MPI.Comm_rank(MPI.COMM_WORLD)
     @test local_rank == index2rank(arch.local_index..., arch.ranks...)
@@ -147,7 +147,7 @@ end
 
 function test_triply_periodic_rank_connectivity_with_141_ranks()
     topo = (Periodic, Periodic, Periodic)
-    arch = MultiProcess(CPU(), ranks=(1, 4, 1), topology = topo)
+    arch = Distributed(CPU(), ranks=(1, 4, 1), topology = topo)
 
     local_rank = MPI.Comm_rank(MPI.COMM_WORLD)
     @test local_rank == index2rank(arch.local_index..., arch.ranks...)
@@ -187,7 +187,7 @@ end
 
 function test_triply_periodic_rank_connectivity_with_221_ranks()
     topo = (Periodic, Periodic, Periodic)
-    arch = MultiProcess(CPU(), ranks=(2, 2, 1), topology = topo)
+    arch = Distributed(CPU(), ranks=(2, 2, 1), topology = topo)
 
     local_rank = MPI.Comm_rank(MPI.COMM_WORLD)
     @test local_rank == index2rank(arch.local_index..., arch.ranks...)
@@ -231,7 +231,7 @@ end
 
 function test_triply_periodic_local_grid_with_411_ranks()
     topo = (Periodic, Periodic, Periodic)
-    arch = MultiProcess(CPU(), ranks=(4, 1, 1), topology = topo)
+    arch = Distributed(CPU(), ranks=(4, 1, 1), topology = topo)
     local_grid = RectilinearGrid(arch, topology=topo, size=(2, 8, 8), extent=(1, 2, 3))
 
     local_rank = MPI.Comm_rank(MPI.COMM_WORLD)
@@ -249,7 +249,7 @@ end
 
 function test_triply_periodic_local_grid_with_141_ranks()
     topo = (Periodic, Periodic, Periodic)
-    arch = MultiProcess(CPU(), ranks=(1, 4, 1), topology = topo)
+    arch = Distributed(CPU(), ranks=(1, 4, 1), topology = topo)
     local_grid = RectilinearGrid(arch, topology=topo, size=(8, 2, 8), extent=(1, 2, 3))
 
     local_rank = MPI.Comm_rank(MPI.COMM_WORLD)
@@ -267,7 +267,7 @@ end
 
 function test_triply_periodic_local_grid_with_221_ranks()
     topo = (Periodic, Periodic, Periodic)
-    arch = MultiProcess(CPU(), ranks=(2, 2, 1), topology = topo)
+    arch = Distributed(CPU(), ranks=(2, 2, 1), topology = topo)
     local_grid = RectilinearGrid(arch, topology=topo, size=(4, 4, 8), extent=(1, 2, 3))
     
     i, j, k = arch.local_index
@@ -291,7 +291,7 @@ end
 
 function test_triply_periodic_bc_injection_with_411_ranks()
     topo = (Periodic, Periodic, Periodic)
-    arch = MultiProcess(ranks=(4, 1, 1), topology=topo)
+    arch = Distributed(ranks=(4, 1, 1), topology=topo)
     grid = RectilinearGrid(arch, topology=topo, size=(2, 8, 8), extent=(1, 2, 3))
     model = NonhydrostaticModel(grid=grid)
 
@@ -308,7 +308,7 @@ end
 
 function test_triply_periodic_bc_injection_with_141_ranks()
     topo = (Periodic, Periodic, Periodic)
-    arch = MultiProcess(ranks=(1, 4, 1), topology=topo)
+    arch = Distributed(ranks=(1, 4, 1), topology=topo)
     grid = RectilinearGrid(arch, topology=topo, size=(8, 2, 8), extent=(1, 2, 3))
     model = NonhydrostaticModel(grid=grid)
 
@@ -325,7 +325,7 @@ end
 
 function test_triply_periodic_bc_injection_with_221_ranks()
     topo = (Periodic, Periodic, Periodic)
-    arch = MultiProcess(ranks=(2, 2, 1), topology=topo)
+    arch = Distributed(ranks=(2, 2, 1), topology=topo)
     grid = RectilinearGrid(arch, topology=topo, size=(4, 4, 8), extent=(1, 2, 3))
     model = NonhydrostaticModel(grid=grid)
 
@@ -346,7 +346,7 @@ end
 
 function test_triply_periodic_halo_communication_with_411_ranks(halo, child_arch)
     topo = (Periodic, Periodic, Periodic)
-    arch = MultiProcess(child_arch; ranks=(4, 1, 1), topology=topo, devices = (0, 0, 0, 0))
+    arch = Distributed(child_arch; ranks=(4, 1, 1), topology=topo, devices = (0, 0, 0, 0))
     grid = RectilinearGrid(arch, topology=topo, size=(4, 4, 4), extent=(1, 2, 3), halo=halo)
     model = NonhydrostaticModel(grid=grid)
 
@@ -370,7 +370,7 @@ end
 
 function test_triply_periodic_halo_communication_with_141_ranks(halo, child_arch)
     topo  = (Periodic, Periodic, Periodic)
-    arch = MultiProcess(child_arch; ranks=(1, 4, 1), topology=topo, devices = (0, 0, 0, 0))
+    arch = Distributed(child_arch; ranks=(1, 4, 1), topology=topo, devices = (0, 0, 0, 0))
     grid  = RectilinearGrid(arch, topology=topo, size=(4, 4, 4), extent=(1, 2, 3), halo=halo)
     model = NonhydrostaticModel(grid=grid)
 
@@ -392,7 +392,7 @@ end
 
 function test_triply_periodic_halo_communication_with_221_ranks(halo, child_arch)
     topo = (Periodic, Periodic, Periodic)
-    arch = MultiProcess(child_arch; ranks=(2, 2, 1), topology=topo, devices = (0, 0, 0, 0))
+    arch = Distributed(child_arch; ranks=(2, 2, 1), topology=topo, devices = (0, 0, 0, 0))
     grid = RectilinearGrid(arch, topology=topo, size=(4, 4, 3), extent=(1, 2, 3), halo=halo)
     model = NonhydrostaticModel(grid=grid)
 
@@ -464,7 +464,7 @@ end
             for ranks in [(1, 4, 1), (2, 2, 1), (4, 1, 1)]
                 @info "Time-stepping a distributed NonhydrostaticModel with ranks $ranks..."
                 topo = (Periodic, Periodic, Periodic)
-                arch = MultiProcess(; ranks, topology=topo)
+                arch = Distributed(; ranks, topology=topo)
                 grid = RectilinearGrid(arch, topology=topo, size=(8, 2, 8), extent=(1, 2, 3))
                 model = NonhydrostaticModel(; grid)
 
@@ -483,7 +483,7 @@ end
     @testset "Time stepping ShallowWaterModel" begin
         for child_arch in archs
             topo = (Periodic, Periodic, Flat)
-            arch = MultiProcess(child_arch; ranks=(1, 4, 1), topology = topo, devices = (0, 0, 0, 0))
+            arch = Distributed(child_arch; ranks=(1, 4, 1), topology = topo, devices = (0, 0, 0, 0))
             grid = RectilinearGrid(arch, topology=topo, size=(8, 2), extent=(1, 2), halo=(3, 3))
             model = ShallowWaterModel(; momentum_advection=nothing, mass_advection=nothing, tracer_advection=nothing, grid, gravitational_acceleration=1)
 
diff --git a/test/test_distributed_poisson_solvers.jl b/test/test_distributed_poisson_solvers.jl
index 5f3df4d821..927c051876 100644
--- a/test/test_distributed_poisson_solvers.jl
+++ b/test/test_distributed_poisson_solvers.jl
@@ -65,7 +65,7 @@ end
 
 function divergence_free_poisson_solution_triply_periodic(grid_points, ranks)
     topo = (Periodic, Periodic, Periodic)
-    arch = MultiProcess(CPU(), ranks=ranks, topology=topo)
+    arch = Distributed(CPU(), ranks=ranks, topology=topo)
     local_grid = RectilinearGrid(arch, topology=topo, size=grid_points, extent=(1, 2, 3))
 
     bcs = FieldBoundaryConditions(local_grid, (Center, Center, Center))
diff --git a/validation/distributed_simulations/mpi_geostrophic_adjustment.jl b/validation/distributed_simulations/mpi_geostrophic_adjustment.jl
index 38f4d1b731..1d1ffa9deb 100644
--- a/validation/distributed_simulations/mpi_geostrophic_adjustment.jl
+++ b/validation/distributed_simulations/mpi_geostrophic_adjustment.jl
@@ -23,7 +23,7 @@ rank   = MPI.Comm_rank(comm)
 Nranks = MPI.Comm_size(comm)
 
 topo = (Bounded, Periodic, Bounded)
-arch = MultiProcess(CPU(); topology = topo, 
+arch = Distributed(CPU(); topology = topo, 
                  ranks=(Nranks, 1, 1),
                  use_buffers = true)
 
diff --git a/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl b/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl
index 24a5ef9570..731f59a004 100644
--- a/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl
+++ b/validation/distributed_simulations/mpi_hydrostatic_turbulence.jl
@@ -75,7 +75,7 @@ Ry = 1
 @assert Nranks == 4
 
 # Enable overlapped communication!
-arch  = MultiProcess(CPU(), ranks = (Rx, Ry, 1), 
+arch  = Distributed(CPU(), ranks = (Rx, Ry, 1), 
                         topology=topo, 
                         enable_overlapped_computation = true)
 
diff --git a/validation/distributed_simulations/mpi_nonhydrostatic_two_dimensional_turbulence.jl b/validation/distributed_simulations/mpi_nonhydrostatic_two_dimensional_turbulence.jl
index dfb1336c43..31cd84fe5f 100644
--- a/validation/distributed_simulations/mpi_nonhydrostatic_two_dimensional_turbulence.jl
+++ b/validation/distributed_simulations/mpi_nonhydrostatic_two_dimensional_turbulence.jl
@@ -28,7 +28,7 @@ Nranks = MPI.Comm_size(comm)
 Nx = Ny = 256
 Lx = Ly = 2π
 topology = (Periodic, Periodic, Flat)
-arch = MultiProcess(CPU(); topology, ranks=(1, Nranks, 1))
+arch = Distributed(CPU(); topology, ranks=(1, Nranks, 1))
 grid = RectilinearGrid(arch; topology, size=(Nx ÷ Nranks, Ny), halo=(3, 3), x=(0, 2π), y=(0, 2π))
 
 @info "Built $Nranks grids:"
diff --git a/validation/distributed_simulations/mpi_output_writing.jl b/validation/distributed_simulations/mpi_output_writing.jl
index efab350fcf..fd6e7e6e7e 100644
--- a/validation/distributed_simulations/mpi_output_writing.jl
+++ b/validation/distributed_simulations/mpi_output_writing.jl
@@ -9,7 +9,7 @@ rank = MPI.Comm_rank(comm)
 Nranks = MPI.Comm_size(comm)
 
 topology = (Periodic, Periodic, Flat)
-arch = MultiProcess(CPU(); topology, ranks=(Nranks, 1, 1))
+arch = Distributed(CPU(); topology, ranks=(Nranks, 1, 1))
 grid = RectilinearGrid(arch; topology, size=(16 ÷ Nranks, 16), halo=(3, 3), extent=(2π, 2π))
 
 model = NonhydrostaticModel(; grid)
diff --git a/validation/distributed_simulations/mpi_set.jl b/validation/distributed_simulations/mpi_set.jl
index 267b47cd78..97f3182a4f 100644
--- a/validation/distributed_simulations/mpi_set.jl
+++ b/validation/distributed_simulations/mpi_set.jl
@@ -10,7 +10,7 @@ Nranks = MPI.Comm_size(MPI.COMM_WORLD)
 
 # Setup model
 topology = (Periodic, Periodic, Flat)
-arch = MultiProcess(CPU(); topology, ranks=(1, Nranks, 1))
+arch = Distributed(CPU(); topology, ranks=(1, Nranks, 1))
 grid = RectilinearGrid(arch; topology, size=(16 ÷ Nranks, 16), extent=(2π, 2π))
 c = CenterField(grid)
 
diff --git a/validation/distributed_simulations/mpi_shallow_water_turbulence.jl b/validation/distributed_simulations/mpi_shallow_water_turbulence.jl
index 25a4031d82..0f6528970e 100644
--- a/validation/distributed_simulations/mpi_shallow_water_turbulence.jl
+++ b/validation/distributed_simulations/mpi_shallow_water_turbulence.jl
@@ -13,7 +13,7 @@ using Oceananigans.DistributedComputations
 
 ranks = (2, 2, 1)
 topo = (Periodic, Periodic, Flat)
-arch = MultiProcess(CPU(), ranks=ranks, topology=topo)
+arch = Distributed(CPU(), ranks=ranks, topology=topo)
 grid = RectilinearGrid(arch, topology=topo, size=(128 ÷ ranks[1], 128 ÷ ranks[2]), extent=(4π, 4π), halo=(3, 3))
 local_rank = MPI.Comm_rank(MPI.COMM_WORLD)
 

From dfbc048f3db78a1362003c98e8b76b08cf375c4c Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 19 Sep 2023 16:27:35 +0200
Subject: [PATCH 529/530] bugfix

---
 .../single_column_model_mode.jl                             | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Models/HydrostaticFreeSurfaceModels/single_column_model_mode.jl b/src/Models/HydrostaticFreeSurfaceModels/single_column_model_mode.jl
index bf0472e082..43c3b219a6 100644
--- a/src/Models/HydrostaticFreeSurfaceModels/single_column_model_mode.jl
+++ b/src/Models/HydrostaticFreeSurfaceModels/single_column_model_mode.jl
@@ -49,11 +49,11 @@ compute_w_from_continuity!(::PrescribedVelocityFields, arch, ::SingleColumnGrid;
 ##### Time-step optimizations
 #####
 
-calculate_free_surface_tendency!(::SingleColumnGrid, args...) = nothing
+compute_free_surface_tendency!(::SingleColumnGrid, args...) = nothing
 
 # Disambiguation
-calculate_free_surface_tendency!(::SingleColumnGrid, ::ImplicitFreeSurfaceHFSM     , args...) = nothing
-calculate_free_surface_tendency!(::SingleColumnGrid, ::SplitExplicitFreeSurfaceHFSM, args...) = nothing
+compute_free_surface_tendency!(::SingleColumnGrid, ::ImplicitFreeSurfaceHFSM     , args...) = nothing
+compute_free_surface_tendency!(::SingleColumnGrid, ::SplitExplicitFreeSurfaceHFSM, args...) = nothing
 
 # Fast state update and halo filling
 

From 55b92992d8ffe3f51883772b2fc725d07e84a105 Mon Sep 17 00:00:00 2001
From: Simone Silvestri <33547697+simone-silvestri@users.noreply.github.com>
Date: Tue, 19 Sep 2023 17:34:18 +0200
Subject: [PATCH 530/530] fixed the docs

---
 docs/src/appendix/library.md | 2 +-
 src/Architectures.jl         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/src/appendix/library.md b/docs/src/appendix/library.md
index a07ec8909a..a0bc575421 100644
--- a/docs/src/appendix/library.md
+++ b/docs/src/appendix/library.md
@@ -61,7 +61,7 @@ Private = false
 ## Distributed
 
 ```@autodocs
-Modules = [Oceananigans.Distributed]
+Modules = [Oceananigans.DistributedComputations]
 Private = false
 ```
 
diff --git a/src/Architectures.jl b/src/Architectures.jl
index b555c18736..2ae8a1c8bd 100644
--- a/src/Architectures.jl
+++ b/src/Architectures.jl
@@ -32,7 +32,7 @@ Run Oceananigans on a single NVIDIA CUDA GPU.
 struct GPU <: AbstractArchitecture end
 
 #####
-##### These methods are extended in Distributed.jl
+##### These methods are extended in DistributedComputations.jl
 #####
 
 device(::CPU) = KernelAbstractions.CPU()