[Py2F]: add profiling support & optimisations (#449)

Adds CachedProgram to diffusion stencils, as well as other optimisations for py2f and changes so that APE experiments can be run with py2f.
C2SM · May 29, 2024 · 1752f87 · 1752f87
1 parent c918d52
commit 1752f87
Show file tree

Hide file tree

Showing 34 changed files with 602 additions and 165 deletions.
diff --git a/ci/base.yml b/ci/base.yml
@@ -43,3 +43,4 @@ variables:
     VIRTUALENV_SYSTEM_SITE_PACKAGES: 1
     CSCS_NEEDED_DATA: icon4py
     TEST_DATA_PATH: "/project/d121/icon4py/ci/testdata"
+    ICON_GRID_LOC: "/project/d121/icon4py/ci/testdata/grids/mch_ch_r04b09_dsl"
diff --git a/model/atmosphere/diffusion/src/icon4py/model/atmosphere/diffusion/cached.py b/model/atmosphere/diffusion/src/icon4py/model/atmosphere/diffusion/cached.py
@@ -0,0 +1,79 @@
+# ICON4Py - ICON inspired code in Python and GT4Py
+#
+# Copyright (c) 2022, ETH Zurich and MeteoSwiss
+# All rights reserved.
+#
+# This file is free software: you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the
+# Free Software Foundation, either version 3 of the License, or any later
+# version. See the LICENSE.txt file at the top-level directory of this
+# distribution for a copy of the license or check <https://www.gnu.org/licenses/>.
+#
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+from icon4py.model.atmosphere.diffusion.diffusion_utils import (
+    copy_field as copy_field_orig,
+    init_diffusion_local_fields_for_regular_timestep as init_diffusion_local_fields_for_regular_timestep_orig,
+    scale_k as scale_k_orig,
+    setup_fields_for_initial_step as setup_fields_for_initial_step_orig,
+)
+from icon4py.model.atmosphere.diffusion.stencils.apply_diffusion_to_vn import (
+    apply_diffusion_to_vn as apply_diffusion_to_vn_orig,
+)
+from icon4py.model.atmosphere.diffusion.stencils.apply_diffusion_to_w_and_compute_horizontal_gradients_for_turbulence import (
+    apply_diffusion_to_w_and_compute_horizontal_gradients_for_turbulence as apply_diffusion_to_w_and_compute_horizontal_gradients_for_turbulence_orig,
+)
+from icon4py.model.atmosphere.diffusion.stencils.calculate_diagnostic_quantities_for_turbulence import (
+    calculate_diagnostic_quantities_for_turbulence as calculate_diagnostic_quantities_for_turbulence_orig,
+)
+from icon4py.model.atmosphere.diffusion.stencils.calculate_enhanced_diffusion_coefficients_for_grid_point_cold_pools import (
+    calculate_enhanced_diffusion_coefficients_for_grid_point_cold_pools as calculate_enhanced_diffusion_coefficients_for_grid_point_cold_pools_orig,
+)
+from icon4py.model.atmosphere.diffusion.stencils.calculate_nabla2_and_smag_coefficients_for_vn import (
+    calculate_nabla2_and_smag_coefficients_for_vn as calculate_nabla2_and_smag_coefficients_for_vn_orig,
+)
+from icon4py.model.atmosphere.diffusion.stencils.calculate_nabla2_for_theta import (
+    calculate_nabla2_for_theta as calculate_nabla2_for_theta_orig,
+)
+from icon4py.model.atmosphere.diffusion.stencils.truly_horizontal_diffusion_nabla_of_theta_over_steep_points import (
+    truly_horizontal_diffusion_nabla_of_theta_over_steep_points as truly_horizontal_diffusion_nabla_of_theta_over_steep_points_orig,
+)
+from icon4py.model.atmosphere.diffusion.stencils.update_theta_and_exner import (
+    update_theta_and_exner as update_theta_and_exner_orig,
+)
+from icon4py.model.common.caching import CachedProgram
+from icon4py.model.common.interpolation.stencils.mo_intp_rbf_rbf_vec_interpol_vertex import (
+    mo_intp_rbf_rbf_vec_interpol_vertex as mo_intp_rbf_rbf_vec_interpol_vertex_orig,
+)
+
+
+# diffusion run stencils
+# Each name below rebinds a stencil to a CachedProgram wrapper: the wrapped
+# program is compiled on the first call and the compiled callable is reused
+# on later calls. The names match the original stencil names so that callers
+# only have to change their import.
+apply_diffusion_to_vn = CachedProgram(apply_diffusion_to_vn_orig)
+apply_diffusion_to_w_and_compute_horizontal_gradients_for_turbulence = CachedProgram(
+    apply_diffusion_to_w_and_compute_horizontal_gradients_for_turbulence_orig
+)
+calculate_diagnostic_quantities_for_turbulence = CachedProgram(
+    calculate_diagnostic_quantities_for_turbulence_orig
+)
+calculate_enhanced_diffusion_coefficients_for_grid_point_cold_pools = CachedProgram(
+    calculate_enhanced_diffusion_coefficients_for_grid_point_cold_pools_orig
+)
+calculate_nabla2_and_smag_coefficients_for_vn = CachedProgram(
+    calculate_nabla2_and_smag_coefficients_for_vn_orig
+)
+calculate_nabla2_for_theta = CachedProgram(calculate_nabla2_for_theta_orig)
+truly_horizontal_diffusion_nabla_of_theta_over_steep_points = CachedProgram(
+    truly_horizontal_diffusion_nabla_of_theta_over_steep_points_orig
+)
+update_theta_and_exner = CachedProgram(update_theta_and_exner_orig)
+
+# Interpolation stencil from the common package, wrapped the same way.
+mo_intp_rbf_rbf_vec_interpol_vertex = CachedProgram(mo_intp_rbf_rbf_vec_interpol_vertex_orig)
+
+
+# model init stencils
+# with_domain=False makes CachedProgram append the shape of every field
+# argument to the argument list before invoking the compiled program.
+# NOTE(review): presumably these programs are called without explicit domain
+# bounds, so the compiled code needs the field sizes instead -- confirm.
+setup_fields_for_initial_step = CachedProgram(setup_fields_for_initial_step_orig, with_domain=False)
+copy_field = CachedProgram(copy_field_orig, with_domain=False)
+init_diffusion_local_fields_for_regular_timestep = CachedProgram(
+    init_diffusion_local_fields_for_regular_timestep_orig, with_domain=False
+)
+scale_k = CachedProgram(scale_k_orig, with_domain=False)
diff --git a/model/atmosphere/diffusion/src/icon4py/model/atmosphere/diffusion/diffusion.py b/model/atmosphere/diffusion/src/icon4py/model/atmosphere/diffusion/diffusion.py
@@ -28,35 +28,27 @@
     DiffusionMetricState,
 )
 from icon4py.model.atmosphere.diffusion.diffusion_utils import (
-    copy_field,
-    init_diffusion_local_fields_for_regular_timestep,
     init_nabla2_factor_in_upper_damping_zone,
-    scale_k,
-    setup_fields_for_initial_step,
     zero_field,
 )
-from icon4py.model.atmosphere.diffusion.stencils.apply_diffusion_to_vn import apply_diffusion_to_vn
-from icon4py.model.atmosphere.diffusion.stencils.apply_diffusion_to_w_and_compute_horizontal_gradients_for_turbulence import (
-    apply_diffusion_to_w_and_compute_horizontal_gradients_for_turbulence,
-)
-from icon4py.model.atmosphere.diffusion.stencils.calculate_diagnostic_quantities_for_turbulence import (
-    calculate_diagnostic_quantities_for_turbulence,
-)
-from icon4py.model.atmosphere.diffusion.stencils.calculate_enhanced_diffusion_coefficients_for_grid_point_cold_pools import (
-    calculate_enhanced_diffusion_coefficients_for_grid_point_cold_pools,
-)
-from icon4py.model.atmosphere.diffusion.stencils.calculate_nabla2_and_smag_coefficients_for_vn import (
+
+# cached program import
+from icon4py.model.atmosphere.diffusion.cached import (
+    init_diffusion_local_fields_for_regular_timestep,
+    setup_fields_for_initial_step,
+    scale_k,
     calculate_nabla2_and_smag_coefficients_for_vn,
-)
-from icon4py.model.atmosphere.diffusion.stencils.calculate_nabla2_for_theta import (
     calculate_nabla2_for_theta,
-)
-from icon4py.model.atmosphere.diffusion.stencils.truly_horizontal_diffusion_nabla_of_theta_over_steep_points import (
     truly_horizontal_diffusion_nabla_of_theta_over_steep_points,
-)
-from icon4py.model.atmosphere.diffusion.stencils.update_theta_and_exner import (
+    apply_diffusion_to_w_and_compute_horizontal_gradients_for_turbulence,
+    apply_diffusion_to_vn,
+    calculate_diagnostic_quantities_for_turbulence,
+    calculate_enhanced_diffusion_coefficients_for_grid_point_cold_pools,
     update_theta_and_exner,
+    copy_field,
+    mo_intp_rbf_rbf_vec_interpol_vertex,
 )
+
 from icon4py.model.common.constants import (
     CPD,
     DEFAULT_PHYSICS_DYNAMICS_TIMESTEP_RATIO,
@@ -68,9 +60,6 @@
 from icon4py.model.common.grid.horizontal import CellParams, EdgeParams, HorizontalMarkerIndex
 from icon4py.model.common.grid.icon import IconGrid
 from icon4py.model.common.grid.vertical import VerticalModelParams
-from icon4py.model.common.interpolation.stencils.mo_intp_rbf_rbf_vec_interpol_vertex import (
-    mo_intp_rbf_rbf_vec_interpol_vertex,
-)
 from icon4py.model.common.states.prognostic_state import PrognosticState
 from icon4py.model.common.settings import xp
 
@@ -752,6 +741,7 @@ def _do_diffusion_step(
         )
         # TODO (magdalena) get rid of this copying. So far passing an empty buffer instead did not verify?
         copy_field(prognostic_state.w, self.w_tmp, offset_provider={})
+
         apply_diffusion_to_w_and_compute_horizontal_gradients_for_turbulence(
             area=self.cell_params.area,
             geofac_n2s=self.interpolation_state.geofac_n2s,
@@ -784,6 +774,7 @@ def _do_diffusion_step(
         log.debug(
             "running fused stencils 11 12 (calculate_enhanced_diffusion_coefficients_for_grid_point_cold_pools): start"
         )
+
         calculate_enhanced_diffusion_coefficients_for_grid_point_cold_pools(
             theta_v=prognostic_state.theta_v,
             theta_ref_mc=self.metric_state.theta_ref_mc,
@@ -799,6 +790,7 @@ def _do_diffusion_step(
         log.debug(
             "running stencils 11 12 (calculate_enhanced_diffusion_coefficients_for_grid_point_cold_pools): end"
         )
+
         log.debug("running stencils 13 14 (calculate_nabla2_for_theta): start")
         calculate_nabla2_for_theta(
             kh_smag_e=self.kh_smag_e,

diff --git a/...ffusion/tests/diffusion_stencil_tests/test_temporary_fields_for_turbulence_diagnostics.py b/...ffusion/tests/diffusion_stencil_tests/test_temporary_fields_for_turbulence_diagnostics.py
@@ -43,12 +43,15 @@ def reference(
         **kwargs,
     ) -> dict:
         c2e = grid.connectivities[C2EDim]
+        c2ce = grid.get_offset_provider("C2CE").table
+
         geofac_div = np.expand_dims(geofac_div, axis=-1)
-        vn_geofac = vn[c2e] * geofac_div[grid.get_offset_provider("C2CE").table]
-        div = np.sum(vn_geofac, axis=1)
         e_bln_c_s = np.expand_dims(e_bln_c_s, axis=-1)
         diff_multfac_smag = np.expand_dims(diff_multfac_smag, axis=0)
-        mul = kh_smag_ec[c2e] * e_bln_c_s[grid.get_offset_provider("C2CE").table]
+
+        vn_geofac = vn[c2e] * geofac_div[c2ce]
+        div = np.sum(vn_geofac, axis=1)
+        mul = kh_smag_ec[c2e] * e_bln_c_s[c2ce]
         summed = np.sum(mul, axis=1)
         kh_c = summed / diff_multfac_smag
 

diff --git a/...here/dycore/tests/dycore_stencil_tests/test_add_interpolated_horizontal_advection_of_w.py b/...here/dycore/tests/dycore_stencil_tests/test_add_interpolated_horizontal_advection_of_w.py
@@ -27,8 +27,10 @@ def add_interpolated_horizontal_advection_of_w_numpy(
     grid, e_bln_c_s: np.array, z_v_grad_w: np.array, ddt_w_adv: np.array, **kwargs
 ) -> np.array:
     e_bln_c_s = np.expand_dims(e_bln_c_s, axis=-1)
+    c2ce = grid.get_offset_provider("C2CE").table
+
     ddt_w_adv = ddt_w_adv + np.sum(
-        z_v_grad_w[grid.connectivities[C2EDim]] * e_bln_c_s[grid.get_offset_provider("C2CE").table],
+        z_v_grad_w[grid.connectivities[C2EDim]] * e_bln_c_s[c2ce],
         axis=1,
     )
     return ddt_w_adv

diff --git a/model/atmosphere/dycore/tests/dycore_stencil_tests/test_interpolate_to_cell_center.py b/model/atmosphere/dycore/tests/dycore_stencil_tests/test_interpolate_to_cell_center.py
@@ -30,9 +30,10 @@ def interpolate_to_cell_center_numpy(
     grid, interpolant: np.array, e_bln_c_s: np.array, **kwargs
 ) -> np.array:
     e_bln_c_s = np.expand_dims(e_bln_c_s, axis=-1)
+    c2ce = grid.get_offset_provider("C2CE").table
+
     interpolation = np.sum(
-        interpolant[grid.connectivities[C2EDim]]
-        * e_bln_c_s[grid.get_offset_provider("C2CE").table],
+        interpolant[grid.connectivities[C2EDim]] * e_bln_c_s[c2ce],
         axis=1,
     )
     return interpolation

diff --git a/.../dycore/tests/dycore_stencil_tests/test_mcompute_divergence_of_fluxes_of_rho_and_theta.py b/.../dycore/tests/dycore_stencil_tests/test_mcompute_divergence_of_fluxes_of_rho_and_theta.py
@@ -28,7 +28,7 @@
 from icon4py.model.common.type_alias import vpfloat, wpfloat
 
 
-class TestComputeDivergenceOfFluxesOfRhoAndTheta(StencilTest):
+class TestComputeDivergenconnectivityceOfFluxesOfRhoAndTheta(StencilTest):
     PROGRAM = compute_divergence_of_fluxes_of_rho_and_theta
     OUTPUTS = ("z_flxdiv_mass", "z_flxdiv_theta")
 
@@ -42,12 +42,14 @@ def reference(
     ) -> tuple[np.array]:
         c2e = grid.connectivities[C2EDim]
         geofac_div = np.expand_dims(geofac_div, axis=-1)
+        c2ce = grid.get_offset_provider("C2CE").table
+
         z_flxdiv_mass = np.sum(
-            geofac_div[grid.get_offset_provider("C2CE").table] * mass_fl_e[c2e],
+            geofac_div[c2ce] * mass_fl_e[c2e],
             axis=1,
         )
         z_flxdiv_theta = np.sum(
-            geofac_div[grid.get_offset_provider("C2CE").table] * z_theta_v_fl_e[c2e],
+            geofac_div[c2ce] * z_theta_v_fl_e[c2e],
             axis=1,
         )
         return dict(z_flxdiv_mass=z_flxdiv_mass, z_flxdiv_theta=z_flxdiv_theta)

diff --git a/model/common/src/icon4py/model/common/caching.py b/model/common/src/icon4py/model/common/caching.py
@@ -0,0 +1,135 @@
+# ICON4Py - ICON inspired code in Python and GT4Py
+#
+# Copyright (c) 2022, ETH Zurich and MeteoSwiss
+# All rights reserved.
+#
+# This file is free software: you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the
+# Free Software Foundation, either version 3 of the License, or any later
+# version. See the LICENSE.txt file at the top-level directory of this
+# distribution for a copy of the license or check <https://www.gnu.org/licenses/>.
+#
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+import dataclasses
+from typing import Any, Callable, Optional
+
+import numpy as np
+from gt4py import next as gtx
+from gt4py.next.otf import workflow
+from gt4py.next.program_processors.runners.gtfn import extract_connectivity_args
+
+from icon4py.model.common.settings import device
+
+
+try:
+    import cupy as cp
+    from gt4py.next.embedded.nd_array_field import CuPyArrayField
+except ImportError:
+    cp: Optional = None  # type:ignore[no-redef]
+
+from gt4py.next.embedded.nd_array_field import NumPyArrayField
+
+
+def handle_numpy_integer(value):
+    """Return ``value`` converted to a built-in ``int``."""
+    return int(value)
+
+
+def handle_common_field(value, sizes):
+    """Record the shape of a field argument and pass the field through.
+
+    Args:
+        value: Object exposing a ``shape`` attribute (registered for the
+            array field types in ``type_handlers``).
+        sizes: List that is extended in place with every entry of ``value.shape``.
+
+    Returns:
+        ``value`` itself, unmodified.
+    """
+    sizes.extend(value.shape)
+    return value  # Return the value unmodified, but side-effect on sizes
+
+
+def handle_default(value):
+    """Fallback handler for arguments without a registered type: identity."""
+    return value  # Return the value unchanged
+
+
+# Dispatch table consulted by process_arg: argument type -> handler function.
+type_handlers = {
+    np.integer: handle_numpy_integer,
+    NumPyArrayField: handle_common_field,
+}
+if cp:
+    # CuPyArrayField only exists when the cupy import above succeeded.
+    type_handlers[CuPyArrayField] = handle_common_field
+
+
+def process_arg(value, sizes):
+    """Normalise one program argument before it reaches the compiled program.
+
+    Args:
+        value: A positional or keyword argument of the program call.
+        sizes: List collecting field shapes; extended in place when ``value``
+            is dispatched to ``handle_common_field``.
+
+    Returns:
+        The result of the handler registered in ``type_handlers`` for
+        ``value``, or ``value`` unchanged when no handler applies.
+    """
+    handler = type_handlers.get(type(value))
+    if handler is None:
+        # The table is keyed by base classes: ``np.integer`` is abstract, so the
+        # exact type of a numpy scalar (``np.int32``, ``np.int64``, ...) never
+        # equals it. Fall back to isinstance so such values reach their handler.
+        handler = next(
+            (func for cls, func in type_handlers.items() if isinstance(value, cls)),
+            handle_default,
+        )
+    return handler(value, sizes) if handler is handle_common_field else handler(value)
+
+
+@dataclasses.dataclass
+class CachedProgram:
+    """Compile a GT4Py program once and reuse the compiled callable.
+
+    The program is compiled on the first call, using the arguments of that
+    call. The compiled callable and the connectivity arguments extracted from
+    the offset provider are stored on the instance; every later call skips
+    compilation and invokes the stored callable directly.
+
+    Attributes:
+        program (gtx.ffront.decorator.Program): The GT4Py program to be cached and compiled.
+        with_domain (bool): When False, the shapes of all field arguments are
+            appended to the argument list on every call. Defaults to True.
+        _compiled_program (Optional[Callable]): The compiled GT4Py program,
+            None until the first call.
+        _conn_args (Any): Connectivity arguments extracted from the offset provider.
+        _compiled_args (tuple): Arguments used during the compilation of the program.
+
+    Properties:
+        compiled_program (Optional[Callable]): Returns the compiled GT4Py program.
+        conn_args (Any): Returns the connectivity arguments.
+
+    Note:
+        This functionality will be provided by GT4Py in the future.
+    """
+
+    program: gtx.ffront.decorator.Program
+    with_domain: bool = True
+    _compiled_program: Optional[Callable] = None
+    _conn_args: Any = None
+    _compiled_args: tuple = dataclasses.field(default_factory=tuple)
+
+    @property
+    def compiled_program(self) -> Optional[Callable]:
+        """The compiled program, or None if the instance was never called."""
+        return self._compiled_program
+
+    @property
+    def conn_args(self) -> Any:
+        """Connectivity arguments extracted on the first call (None before that)."""
+        return self._conn_args
+
+    def compile_the_program(
+        self, *args, offset_provider: dict[str, gtx.Dimension], **kwargs: Any
+    ) -> Callable:
+        """Run the backend transformations and build the executable program.
+
+        Stores the transformed call arguments in ``_compiled_args`` and returns
+        the callable produced by the backend executor's OTF workflow.
+        """
+        backend = self.program.backend
+        program_call = backend.transforms_prog(
+            workflow.InputWithArgs(
+                data=self.program.definition_stage,
+                args=args,
+                kwargs=kwargs | {"offset_provider": offset_provider},
+            )
+        )
+        self._compiled_args = program_call.args
+        return backend.executor.otf_workflow(program_call)
+
+    def __call__(self, *args, offset_provider: dict[str, gtx.Dimension], **kwargs: Any) -> Any:
+        """Invoke the compiled program, compiling it first if necessary.
+
+        Keyword arguments are flattened to positional ones in the order the
+        caller passed them.
+        NOTE(review): presumably that order has to match the parameter order
+        of the program -- confirm against the call sites.
+
+        Returns:
+            Whatever the compiled program returns.
+        """
+        if self._compiled_program is None:
+            # First call: compile with the actual arguments and keep the result.
+            self._compiled_program = self.compile_the_program(
+                *args, offset_provider=offset_provider, **kwargs
+            )
+            self._conn_args = extract_connectivity_args(offset_provider, device)
+
+        # Convert numpy integers to int and collect the shapes of field arguments.
+        sizes = []
+        program_args = [process_arg(arg, sizes) for arg in (*args, *kwargs.values())]
+
+        if not self.with_domain:
+            program_args.extend(sizes)
+
+        # todo(samkellerhals): if we merge gt4py PR we can also pass connectivity args here conn_args=self.conn_args
+        return self._compiled_program(*program_args, offset_provider=offset_provider)
diff --git a/model/common/src/icon4py/model/common/config.py b/model/common/src/icon4py/model/common/config.py
@@ -69,3 +69,7 @@ def device(self):
         }
         device = device_map[self.icon4py_backend]
         return device
+
+    @cached_property
+    def limited_area(self):
+        """Whether limited-area mode is requested via the ``ICON4PY_LAM`` variable.
+
+        ``os.environ.get`` yields a string whenever the variable is set, so
+        returning it directly made values such as ``"0"`` or ``"false"`` truthy.
+        The value is parsed instead: unset, empty, ``0``, ``false``, ``no`` and
+        ``off`` (case-insensitive) give ``False``; any other value gives ``True``.
+        """
+        raw = os.environ.get("ICON4PY_LAM", "")
+        return raw.strip().lower() not in {"", "0", "false", "no", "off"}