From 158afc9eaf1ffc36894c15e47a99d7c6e03f1f49 Mon Sep 17 00:00:00 2001
From: Alexandru Fikl
Date: Thu, 4 Aug 2022 16:21:49 +0300
Subject: [PATCH] implement hierarchical matrix solver

---
 .gitignore                                 |   2 +
 examples/scaling-study-hmatrix.py          | 198 +++++++
 pytential/linalg/direct_solver_symbolic.py |   8 +-
 pytential/linalg/hmatrix.py                | 544 +++++++++++++++++++
 pytential/linalg/proxy.py                  |   2 +-
 pytential/linalg/skeletonization.py        |  32 +-
 pytential/linalg/utils.py                  |  62 +++
 pytential/symbolic/matrix.py               |  18 +-
 test/extra_matrix_data.py                  |  21 +-
 test/test_linalg_hmatrix.py                | 582 +++++++++++++++++++++
 test/test_linalg_skeletonization.py        |  99 ++++
 11 files changed, 1543 insertions(+), 25 deletions(-)
 create mode 100644 examples/scaling-study-hmatrix.py
 create mode 100644 pytential/linalg/hmatrix.py
 create mode 100644 test/test_linalg_hmatrix.py

diff --git a/.gitignore b/.gitignore
index daf719e6f..c89b827fb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,6 +21,8 @@ examples/*.pdf
 *.vtu
 *.vts
+*.png
+*.gif
 .cache
diff --git a/examples/scaling-study-hmatrix.py b/examples/scaling-study-hmatrix.py
new file mode 100644
index 000000000..d163e7383
--- /dev/null
+++ b/examples/scaling-study-hmatrix.py
@@ -0,0 +1,198 @@
+__copyright__ = "Copyright (C) 2022 Alexandru Fikl"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+""" + +from dataclasses import dataclass + +import numpy as np + +from meshmode.array_context import PyOpenCLArrayContext +from pytools.convergence import EOCRecorder + +from pytential import GeometryCollection, sym + +import logging + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +@dataclass(frozen=True) +class Timings: + build: float + matvec: float + + +def run_hmatrix_matvec( + actx: PyOpenCLArrayContext, + places: GeometryCollection, *, + dofdesc: sym.DOFDescriptor) -> None: + from sumpy.kernel import LaplaceKernel + kernel = LaplaceKernel(places.ambient_dim) + sym_u = sym.var("u") + sym_op = -0.5 * sym_u + sym.D(kernel, sym_u, qbx_forced_limit="avg") + + density_discr = places.get_discretization(dofdesc.geometry, dofdesc.discr_stage) + u = actx.thaw(density_discr.nodes()[0]) + + def build_hmat(): + from pytential.linalg.hmatrix import build_hmatrix_by_proxy + return build_hmatrix_by_proxy( + actx, places, sym_op, sym_u, + domains=[dofdesc], + context={}, + auto_where=dofdesc, + id_eps=1.0e-10, + _tree_kind="adaptive-level-restricted", + _approx_nproxy=64, + _proxy_radius_factor=1.15).get_forward() + + # warmup + from pytools import ProcessTimer + with ProcessTimer() as pt: + hmat = build_hmat() + actx.queue.finish() + + logger.info("build(warmup): %s", pt) + + # build + with ProcessTimer() as pt: + hmat = build_hmat() + actx.queue.finish() + + t_build = pt.wall_elapsed + logger.info("build: %s", pt) + + # matvec + with ProcessTimer() as pt: + du = hmat @ u + assert du is not None + actx.queue.finish() + + t_matvec = pt.wall_elapsed + logger.info("matvec: %s", pt) + + return Timings(t_build, t_matvec) + + +def run_scaling_study( + ambient_dim: int, *, + target_order: int = 4, + source_ovsmp: int = 4, + qbx_order: int = 4, + ) -> None: + dd = sym.DOFDescriptor(f"d{ambient_dim}", discr_stage=sym.QBX_SOURCE_STAGE2) + + import pyopencl as cl + ctx = cl.create_some_context() + queue = cl.CommandQueue(ctx) + actx = PyOpenCLArrayContext(queue, force_device_scalars=True) + + eoc_build = EOCRecorder() + eoc_matvec = EOCRecorder() + + import meshmode.mesh.generation as mgen + import meshmode.discretization.poly_element as mpoly + resolutions = [64, 128, 256, 512, 1024, 1536, 2048, 2560, 3072] + + for n in resolutions: + mesh = mgen.make_curve_mesh( + mgen.NArmedStarfish(5, 0.25), + np.linspace(0, 1, n), + order=target_order) + + from meshmode.discretization import Discretization + pre_density_discr = Discretization(actx, mesh, + mpoly.InterpolatoryQuadratureGroupFactory(target_order)) + + from pytential.qbx import QBXLayerPotentialSource + qbx = QBXLayerPotentialSource( + pre_density_discr, + fine_order=source_ovsmp * target_order, + qbx_order=qbx_order, + fmm_order=False, fmm_backend=None, + ) + places = GeometryCollection(qbx, auto_where=dd.geometry) + density_discr = places.get_discretization(dd.geometry, dd.discr_stage) + + logger.info("ndofs: %d", density_discr.ndofs) + logger.info("nelements: %d", density_discr.mesh.nelements) + + timings = run_hmatrix_matvec(actx, places, dofdesc=dd) + eoc_build.add_data_point(density_discr.ndofs, timings.build) + eoc_matvec.add_data_point(density_discr.ndofs, timings.matvec) + + for name, eoc in [("build", eoc_build), ("matvec", eoc_matvec)]: + logger.info("%s\n%s", + name, eoc.pretty_print( + abscissa_label="dofs", + error_label=f"{name} (s)", + abscissa_format="%d", + error_format="%.3fs", + eoc_format="%.2f", + ) + ) + visualize_eoc(f"scaling-study-hmatrix-{name}", eoc, 1) + + +def visualize_eoc( + filename: str, 
eoc: EOCRecorder, order: int,
+        overwrite: bool = False) -> None:
+    try:
+        import matplotlib.pyplot as plt
+    except ImportError:
+        logger.info("matplotlib not available for plotting")
+        return
+
+    fig = plt.figure(figsize=(10, 10), dpi=300)
+    ax = fig.gca()
+
+    h, error = np.array(eoc.history).T  # type: ignore[no-untyped-call]
+    ax.loglog(h, error, "o-")
+
+    max_h = np.max(h)
+    min_e = np.min(error)
+    max_e = np.max(error)
+    min_h = np.exp(np.log(max_h) + np.log(min_e / max_e) / order)
+
+    ax.loglog(
+        [max_h, min_h], [max_e, min_e], "k-", label=rf"$\mathcal{{O}}(h^{order})$"
+        )
+
+    ax.grid(True, which="major", linestyle="-", alpha=0.75)
+    ax.grid(True, which="minor", linestyle="--", alpha=0.5)
+
+    ax.set_xlabel("$N$")
+    ax.set_ylabel("$T~(s)$")
+
+    import pathlib
+    filename = pathlib.Path(filename)
+    if not overwrite and filename.exists():
+        raise FileExistsError(f"output file '{filename}' already exists")
+
+    fig.savefig(filename)
+    plt.close(fig)
+
+
+if __name__ == "__main__":
+    run_scaling_study(ambient_dim=2)
diff --git a/pytential/linalg/direct_solver_symbolic.py b/pytential/linalg/direct_solver_symbolic.py
index c57752840..5c11eee1d 100644
--- a/pytential/linalg/direct_solver_symbolic.py
+++ b/pytential/linalg/direct_solver_symbolic.py
@@ -54,12 +54,16 @@ def prepare_expr(places, exprs, auto_where=None):
     return _prepare_expr(places, exprs, auto_where=auto_where)
 
 
-def prepare_proxy_expr(places, exprs, auto_where=None):
+def prepare_proxy_expr(
+        places, exprs,
+        auto_where=None,
+        remove_transforms: bool = True):
     def _prepare_expr(expr):
         # remove all diagonal / non-operator terms in the expression
         expr = IntGTermCollector()(expr)
         # ensure all IntGs remove all the kernel derivatives
-        expr = KernelTransformationRemover()(expr)
+        if remove_transforms:
+            expr = KernelTransformationRemover()(expr)
 
         # ensure all IntGs have their source and targets set
         expr = DOFDescriptorReplacer(auto_where[0], auto_where[1])(expr)
diff --git a/pytential/linalg/hmatrix.py b/pytential/linalg/hmatrix.py
new file mode 100644
index 000000000..a72a209cb
--- /dev/null
+++ b/pytential/linalg/hmatrix.py
@@ -0,0 +1,544 @@
+__copyright__ = "Copyright (C) 2022 Alexandru Fikl"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+""" + +from dataclasses import dataclass +from typing import Any, Callable, Dict, Iterable, Optional, Sequence, Tuple, Union + +import numpy as np +import numpy.linalg as la +from scipy.sparse.linalg import LinearOperator + +from arraycontext import PyOpenCLArrayContext, ArrayOrContainerT, flatten, unflatten +from meshmode.dof_array import DOFArray + +from pytential import GeometryCollection, sym +from pytential.linalg.proxy import ProxyGeneratorBase +from pytential.linalg.cluster import ClusterTree, ClusterLevel +from pytential.linalg.skeletonization import ( + SkeletonizationWrangler, SkeletonizationResult) + +import logging +logger = logging.getLogger(__name__) + + +__doc__ = """ +Hierarical Matrix Construction +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: ProxyHierarchicalMatrixWrangler +.. autoclass:: ProxyHierarchicalMatrix +.. autoclass:: ProxyHierarchicalForwardMatrix +.. autoclass:: ProxyHierarchicalBackwardMatrix + +.. autofunction:: build_hmatrix_by_proxy +""" + + +# {{{ error model + +def hmatrix_error_from_param( + ambient_dim: int, + *, + id_eps: float, + id_rank: int, + min_proxy_radius: float, + max_cluster_radius: float, + nproxies: int, + nsources: int, + ntargets: int, c: float = 1.0e-3) -> float: + if ambient_dim == 2: + p = int(0.5 * id_rank) + elif ambient_dim == 3: + p = int((np.sqrt(1 + 4 * id_rank) - 1) / 2) + else: + raise ValueError(f"unsupported ambient dimension: '{ambient_dim}'") + + rho = alpha = max_cluster_radius / min_proxy_radius + return ( + c * rho ** (p + 1) / (1 - rho) + + np.sqrt(nsources / nproxies) + * (1 - alpha ** (p + 1)) / (1 - alpha) * id_eps + ) + +# }}} + + +# {{{ update diagonals + +def _update_skeleton_diagonal( + skeleton: SkeletonizationResult, + parent: Optional[SkeletonizationResult], + clevel: Optional[ClusterLevel], + diagonal: Optional[np.ndarray] = None) -> SkeletonizationResult: + """Due to the evaluation in :func:`_skeletonize_block_by_proxy_with_mats`, + the diagonal matrix in *skeleton* also contains the indices from its + parent. In particular, at a level :math:`l` we need the diagonal block:: + + 0 D_{i, j + 1} D_{i, j + 2} + D_{i + 1, j} 0 D_{i + 1, j + 2} + D_{i + 2, j} D_{i + 2, j + 1} 0 + + but the version in *skeleton* also fills in the 0 blocks in there. This + routine goes through them and zeros them out. + """ + + if clevel is None: + return skeleton + + assert parent is not None + assert skeleton.tgt_src_index.shape == parent.skel_tgt_src_index.shape + + if diagonal is None: + diagonal = np.zeros(parent.nclusters) + + from numbers import Number + if isinstance(diagonal, Number): + diagonal = np.full(parent.nclusters, diagonal) + + assert diagonal.size == parent.nclusters + targets, sources = parent.skel_tgt_src_index + + # FIXME: nicer way to do this? 
+ mat: np.ndarray = np.empty(skeleton.nclusters, dtype=object) + for k in range(skeleton.nclusters): + D = skeleton.D[k].copy() + + i = j = 0 + for icluster in clevel.parent_map[k]: + di = targets.cluster_size(icluster) + dj = sources.cluster_size(icluster) + D[np.s_[i:i + di], np.s_[j:j + dj]] = diagonal[icluster] + + i += di + j += dj + + assert D.shape == (i, j) + mat[k] = D + + from dataclasses import replace + return replace(skeleton, D=mat) + + +def _update_skeletons_diagonal( + wrangler: "ProxyHierarchicalMatrixWrangler", + func: Callable[[SkeletonizationResult], Optional[np.ndarray]], + ) -> np.ndarray: + skeletons: np.ndarray = np.empty(wrangler.skeletons.shape, dtype=object) + skeletons[0] = wrangler.skeletons[0] + + for i in range(1, wrangler.ctree.nlevels): + skeletons[i] = _update_skeleton_diagonal( + wrangler.skeletons[i], + wrangler.skeletons[i - 1], + wrangler.ctree.levels[i - 1], + diagonal=func(skeletons[i - 1])) + + return skeletons + +# }}} + + +# {{{ ProxyHierarchicalMatrix + +@dataclass(frozen=True) +class ProxyHierarchicalMatrixWrangler: + """ + .. automethod:: get_forward + .. automethod:: get_backward + """ + + wrangler: SkeletonizationWrangler + proxy: ProxyGeneratorBase + ctree: ClusterTree + skeletons: np.ndarray + + def get_forward(self) -> "ProxyHierarchicalForwardMatrix": + return ProxyHierarchicalForwardMatrix( + ctree=self.ctree, + skeletons=_update_skeletons_diagonal(self, lambda s: None), + ) + + def get_backward(self) -> "ProxyHierarchicalBackwardMatrix": + return ProxyHierarchicalBackwardMatrix( + ctree=self.ctree, + skeletons=_update_skeletons_diagonal(self, lambda s: s.Dhat)) + + +@dataclass(frozen=True) +class ProxyHierarchicalMatrix(LinearOperator): + """ + .. attribute:: skeletons + + An :class:`~numpy.ndarray` containing skeletonization information + for each level of the hierarchy. For additional details, see + :class:`~pytential.linalg.skeletonization.SkeletonizationResult`. + + This class implements the :class:`scipy.sparse.linalg.LinearOperator` + interface. In particular, the following attributes and methods: + + .. attribute:: shape + + A :class:`tuple` that gives the matrix size ``(m, n)``. + + .. attribute:: dtype + + The data type of the matrix entries. + + .. automethod:: matvec + .. automethod:: __matmul__ + """ + + ctree: ClusterTree + skeletons: np.ndarray + + @property + def shape(self): + return self.skeletons[0].tgt_src_index.shape + + @property + def dtype(self): + # FIXME: assert that everyone has this dtype? 
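+        # NOTE: this assumes all the cluster blocks in the hierarchy share
+        # a dtype, so the first R block is taken as representative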
+        return self.skeletons[0].R[0].dtype
+
+    @property
+    def nlevels(self):
+        return self.skeletons.size
+
+    @property
+    def nclusters(self):
+        return self.skeletons[0].nclusters
+
+    def __matmul__(self, x: ArrayOrContainerT) -> ArrayOrContainerT:
+        """Same as :meth:`matvec`."""
+        return self._matvec(x)
+
+    def _matmat(self, mat):
+        raise NotImplementedError
+
+    def _adjoint(self, x):
+        raise NotImplementedError
+
+# }}}
+
+
+# {{{ forward
+
+@dataclass(frozen=True)
+class ProxyHierarchicalForwardMatrix(ProxyHierarchicalMatrix):
+    def _matvec(self, x: ArrayOrContainerT) -> ArrayOrContainerT:
+        if isinstance(x, DOFArray):
+            from arraycontext import get_container_context_recursively_opt
+            actx = get_container_context_recursively_opt(x)
+            if actx is None:
+                raise ValueError("input array is frozen")
+
+            ary = actx.to_numpy(flatten(x, actx))
+        elif isinstance(x, np.ndarray) and x.dtype.char != "O":
+            # NOTE: plain host arrays do not carry an array context
+            actx = None
+            ary = x
+        else:
+            raise TypeError(f"unsupported input type: {type(x)}")
+
+        assert actx is None or isinstance(actx, PyOpenCLArrayContext)
+        result = apply_skeleton_forward_matvec(self, ary)
+
+        if isinstance(x, DOFArray):
+            assert actx is not None
+            result = unflatten(x, actx.from_numpy(result), actx)
+
+        return result  # type: ignore[return-value]
+
+
+def apply_skeleton_forward_matvec(
+        hmat: ProxyHierarchicalMatrix,
+        ary: ArrayOrContainerT,
+        ) -> ArrayOrContainerT:
+    from pytential.linalg.cluster import split_array
+    targets, sources = hmat.skeletons[0].tgt_src_index
+    x = split_array(ary, sources)  # type: ignore[arg-type]
+
+    # NOTE: this computes a telescoping product of the form
+    #
+    #   A x_0 = (D0 + L0 (D1 + L1 (...) R1) R0) x_0
+    #
+    # with arbitrary numbers of levels. When recursing down, we compute
+    #
+    #   x_{k + 1} = R_k x_k
+    #   z_{k + 1} = D_k x_k
+    #
+    # and, at the root level, we have
+    #
+    #   x_{N + 1} = z_{N + 1} = D_N x_N.
+    #
+    # When recursing back up, we take `b_{N + 1} = x_{N + 1}` and
+    #
+    #   b_{k - 1} = z_k + L_k b_k
+    #
+    # which gives back the desired product when we reach the leaf level again.
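+    #
+    # As an illustrative sketch (not part of the algorithm itself): with two
+    # levels, the recursion below reduces to
+    #
+    #   A x_0 = D_0 x_0 + L_0 (D_1 (R_0 x_0)),
+    #
+    # i.e. D_0 acts on the full density, while the far-field contribution is
+    # compressed by R_0, evaluated at the root by D_1 and mapped back by L_0.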
+
+    d_dot_x: np.ndarray = np.empty(hmat.nlevels, dtype=object)
+
+    # {{{ recurse down
+
+    from pytential.linalg.cluster import cluster
+    for k, clevel in enumerate(hmat.ctree.levels):
+        skeleton = hmat.skeletons[k]
+        assert x.shape == (skeleton.nclusters,)
+        assert skeleton.tgt_src_index.shape[1] == sum(xi.size for xi in x)
+
+        d_dot_x_k: np.ndarray = np.empty(skeleton.nclusters, dtype=object)
+        r_dot_x_k: np.ndarray = np.empty(skeleton.nclusters, dtype=object)
+
+        for i in range(skeleton.nclusters):
+            r_dot_x_k[i] = skeleton.R[i] @ x[i]
+            d_dot_x_k[i] = skeleton.D[i] @ x[i]
+
+        d_dot_x[k] = d_dot_x_k
+        x = cluster(r_dot_x_k, clevel)
+
+    # }}}
+
+    # {{{ root
+
+    # NOTE: at root level, we just multiply with the full diagonal
+    b = d_dot_x[hmat.nlevels - 1]
+    assert b.shape == (1,)
+
+    # }}}
+
+    # {{{ recurse up
+
+    from pytential.linalg.cluster import uncluster
+
+    for k, clevel in reversed(list(enumerate(hmat.ctree.levels[:-1]))):
+        skeleton = hmat.skeletons[k]
+        d_dot_x_k = d_dot_x[k]
+        assert d_dot_x_k.shape == (skeleton.nclusters,)
+
+        b = uncluster(b, skeleton.skel_tgt_src_index.targets, clevel)
+        for i in range(skeleton.nclusters):
+            b[i] = d_dot_x_k[i] + skeleton.L[i] @ b[i]
+
+    assert b.shape == (hmat.nclusters,)
+
+    # }}}
+
+    return np.concatenate(b)[np.argsort(targets.indices)]
+
+# }}}
+
+
+# {{{ backward
+
+@dataclass(frozen=True)
+class ProxyHierarchicalBackwardMatrix(ProxyHierarchicalMatrix):
+    def _matvec(self, x: ArrayOrContainerT) -> ArrayOrContainerT:
+        if isinstance(x, DOFArray):
+            from arraycontext import get_container_context_recursively_opt
+            actx = get_container_context_recursively_opt(x)
+            if actx is None:
+                raise ValueError("input array is frozen")
+
+            ary = actx.to_numpy(flatten(x, actx))
+        elif isinstance(x, np.ndarray) and x.dtype.char != "O":
+            # NOTE: plain host arrays do not carry an array context
+            actx = None
+            ary = x
+        else:
+            raise TypeError(f"unsupported input type: {type(x)}")
+
+        assert actx is None or isinstance(actx, PyOpenCLArrayContext)
+        result = apply_skeleton_backward_matvec(actx, self, ary)
+
+        if isinstance(x, DOFArray):
+            assert actx is not None
+            result = unflatten(x, actx.from_numpy(result), actx)
+
+        return result  # type: ignore[return-value]
+
+
+def apply_skeleton_backward_matvec(
+        actx: Optional[PyOpenCLArrayContext],
+        hmat: ProxyHierarchicalMatrix,
+        ary: ArrayOrContainerT,
+        ) -> ArrayOrContainerT:
+    from pytential.linalg.cluster import split_array
+    targets, sources = hmat.skeletons[0].tgt_src_index
+
+    b = split_array(ary, targets)  # type: ignore[arg-type]
+    r_dot_b: np.ndarray = np.empty(hmat.nlevels, dtype=object)
+
+    # {{{ recurse down
+
+    # NOTE: this solves a telescoping product of the form
+    #
+    #   A x_0 = (D0 + L0 (D1 + L1 (...) R1) R0) x_0 = b_0
+    #
+    # with arbitrary numbers of levels. When recursing down, we compute
+    #
+    #   b_{k + 1} = \hat{D}_k R_k D_k^{-1} b_k
+    #   \hat{D}_k = (R_k D_k^{-1} L_k)^{-1}
+    #
+    # and, at the root level, we solve
+    #
+    #   D_N x_N = b_N.
+    #
+    # When recursing back up, we take `b_{N + 1} = x_{N + 1}` and
+    #
+    #   x_{k} = D_k^{-1} (b_k - L_k b_{k + 1} + L_k \hat{D}_k x_{k + 1})
+    #
+    # which gives back the desired solution when we reach the leaf level again.
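+    #
+    # As an illustrative sketch (assuming a trivial skeleton at the root):
+    # with two levels, the recursion below reduces to
+    #
+    #   b_1 = \hat{D}_0 R_0 D_0^{-1} b_0,
+    #   x_1 = D_1^{-1} b_1,
+    #   x_0 = D_0^{-1} (b_0 - L_0 (b_1 - \hat{D}_0 x_1)).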
+
+    from pytential.linalg.cluster import cluster
+
+    for k, clevel in enumerate(hmat.ctree.levels):
+        skeleton = hmat.skeletons[k]
+        assert b.shape == (skeleton.nclusters,)
+        assert skeleton.tgt_src_index.shape[0] == sum(bi.size for bi in b)
+
+        dhat_dot_b_k: np.ndarray = np.empty(skeleton.nclusters, dtype=object)
+        for i in range(skeleton.nclusters):
+            dhat_dot_b_k[i] = (
+                skeleton.Dhat[i] @ (skeleton.R[i] @ (skeleton.invD[i] @ b[i]))
+                )
+
+        r_dot_b[k] = b
+        b = cluster(dhat_dot_b_k, clevel)
+
+    # }}}
+
+    # {{{ root
+
+    from pytools.obj_array import make_obj_array
+    assert b.shape == (1,)
+    x = make_obj_array([
+        la.solve(D, bi) for D, bi in zip(hmat.skeletons[-1].D, b)
+        ])
+
+    # }}}
+
+    # {{{ recurse up
+
+    from pytential.linalg.cluster import uncluster
+
+    for k, clevel in reversed(list(enumerate(hmat.ctree.levels[:-1]))):
+        skeleton = hmat.skeletons[k]
+        b0 = r_dot_b[k]
+        b1 = r_dot_b[k + 1]
+        assert b0.shape == (skeleton.nclusters,)
+
+        x = uncluster(x, skeleton.skel_tgt_src_index.sources, clevel)
+        b1 = uncluster(b1, skeleton.skel_tgt_src_index.targets, clevel)
+
+        for i in range(skeleton.nclusters):
+            sx = b1[i] - skeleton.Dhat[i] @ x[i]
+            x[i] = skeleton.invD[i] @ (b0[i] - skeleton.L[i] @ sx)
+
+    assert x.shape == (hmat.nclusters,)
+
+    # }}}
+
+    return np.concatenate(x)[np.argsort(sources.indices)]
+
+# }}}
+
+
+# {{{ build_hmatrix_by_proxy
+
+def build_hmatrix_by_proxy(
+        actx: PyOpenCLArrayContext,
+        places: GeometryCollection,
+        exprs: Union[sym.Expression, Iterable[sym.Expression]],
+        input_exprs: Union[sym.Expression, Iterable[sym.Expression]], *,
+        auto_where: Optional[sym.DOFDescriptorLike] = None,
+        domains: Optional[Sequence[sym.DOFDescriptorLike]] = None,
+        context: Optional[Dict[str, Any]] = None,
+        id_eps: float = 1.0e-8,
+
+        # NOTE: these are dev variables and can disappear at any time!
+        _tree_kind: Optional[str] = "adaptive-level-restricted",
+        _weighted_proxy: Optional[Union[bool, Tuple[bool, bool]]] = None,
+
+        # TODO: plug in the error model to get an estimate for:
+        # * how many points we want per cluster?
+        # * how many proxy points we want?
+        # * how far away should the proxy points be?
+        # based on id_eps. How many of these should be user tunable?
+
+ _max_particles_in_box: Optional[int] = None, + _approx_nproxy: Optional[int] = None, + _proxy_radius_factor: Optional[float] = None, + ) -> ProxyHierarchicalMatrixWrangler: + from pytential.symbolic.matrix import P2PClusterMatrixBuilder + from pytential.linalg.skeletonization import make_skeletonization_wrangler + + def P2PClusterMatrixBuilderWithDiagonal(*args, **kwargs): + kwargs["exclude_self"] = True + return P2PClusterMatrixBuilder(*args, **kwargs) + + wrangler = make_skeletonization_wrangler( + places, exprs, input_exprs, + domains=domains, context=context, auto_where=auto_where, + _weighted_proxy=_weighted_proxy, + # _neighbor_cluster_builder=P2PClusterMatrixBuilderWithDiagonal, + # _proxy_source_cluster_builder=P2PClusterMatrixBuilder, + # _proxy_target_cluster_builder=P2PClusterMatrixBuilder, + ) + + if wrangler.nrows != 1 or wrangler.ncols != 1: + raise ValueError("multi-block operators are not supported") + + from pytential.linalg.proxy import QBXProxyGenerator + proxy = QBXProxyGenerator(places, + approx_nproxy=_approx_nproxy, + radius_factor=_proxy_radius_factor) + + from pytential.linalg.cluster import partition_by_nodes + cluster_index, ctree = partition_by_nodes( + actx, places, + dofdesc=wrangler.domains[0], + tree_kind=_tree_kind, + max_particles_in_box=_max_particles_in_box) + + logger.info("tree levels: %d", ctree.nlevels) + logger.info("cluster count: %d", cluster_index.nclusters) + logger.info("root cluster sizes: %s", [ + # NOTE: making into a list so that they all get printed + int(s) for s in np.diff(cluster_index.starts) + ]) + + from pytential.linalg.utils import TargetAndSourceClusterList + tgt_src_index = TargetAndSourceClusterList( + targets=cluster_index, sources=cluster_index) + + from pytential.linalg.skeletonization import rec_skeletonize_by_proxy + skeletons = rec_skeletonize_by_proxy( + actx, places, ctree, tgt_src_index, exprs, input_exprs, + id_eps=id_eps, + max_particles_in_box=_max_particles_in_box, + _proxy=proxy, + _wrangler=wrangler, + ) + + return ProxyHierarchicalMatrixWrangler( + wrangler=wrangler, proxy=proxy, ctree=ctree, skeletons=skeletons + ) + +# }}} + +# vim: foldmethod=marker diff --git a/pytential/linalg/proxy.py b/pytential/linalg/proxy.py index 5e3b9cec4..6eb8c721f 100644 --- a/pytential/linalg/proxy.py +++ b/pytential/linalg/proxy.py @@ -360,7 +360,7 @@ def __call__(self, source_dd.geometry, source_dd.discr_stage) assert isinstance(discr, Discretization) - include_cluster_radii = kwargs.pop("include_cluster_radii", False) + include_cluster_radii = kwargs.pop("include_cluster_radii", True) # {{{ get proxy centers and radii diff --git a/pytential/linalg/skeletonization.py b/pytential/linalg/skeletonization.py index e9ebec2eb..477acf9c8 100644 --- a/pytential/linalg/skeletonization.py +++ b/pytential/linalg/skeletonization.py @@ -26,8 +26,10 @@ TYPE_CHECKING) import numpy as np +import numpy.linalg as la from arraycontext import PyOpenCLArrayContext, Array +from pytools import memoize_method from pytential import GeometryCollection, sym from pytential.linalg.proxy import ProxyGeneratorBase, ProxyClusterGeometryData @@ -425,6 +427,7 @@ def make_skeletonization_wrangler( # internal _weighted_proxy: Optional[Union[bool, Tuple[bool, bool]]] = None, + _remove_source_transforms: bool = False, _proxy_source_cluster_builder: Optional[Callable[..., np.ndarray]] = None, _proxy_target_cluster_builder: Optional[Callable[..., np.ndarray]] = None, _neighbor_cluster_builder: Optional[Callable[..., np.ndarray]] = None, @@ -451,9 +454,13 @@ def 
make_skeletonization_wrangler( exprs = prepare_expr(places, exprs, auto_where) source_proxy_exprs = prepare_proxy_expr( - places, exprs, (auto_where[0], PROXY_SKELETONIZATION_TARGET)) + places, exprs, (auto_where[0], PROXY_SKELETONIZATION_TARGET), + remove_transforms=_remove_source_transforms) target_proxy_exprs = prepare_proxy_expr( - places, exprs, (PROXY_SKELETONIZATION_SOURCE, auto_where[1])) + places, exprs, (PROXY_SKELETONIZATION_SOURCE, auto_where[1]), + # NOTE: transforms are unconditionally removed here because the + # source would be the proxies, where we do not have normals, etc. + remove_transforms=True) # }}} @@ -463,7 +470,7 @@ def make_skeletonization_wrangler( weighted_sources = weighted_targets = True elif isinstance(_weighted_proxy, bool): weighted_sources = weighted_targets = _weighted_proxy - elif isinstance(_weighted_proxy, tuple): + elif isinstance(_weighted_proxy, tuple) and len(_weighted_proxy) == 2: weighted_sources, weighted_targets = _weighted_proxy else: raise ValueError(f"unknown value for weighting: '{_weighted_proxy}'") @@ -661,9 +668,9 @@ def _skeletonize_block_by_proxy_with_mats( if __debug__: isfinite = np.isfinite(tgt_mat) - assert np.all(isfinite), np.where(isfinite) + assert np.all(isfinite), np.where(~isfinite) isfinite = np.isfinite(src_mat) - assert np.all(isfinite), np.where(isfinite) + assert np.all(isfinite), np.where(~isfinite) # skeletonize target points k, idx, interp = interp_decomp(tgt_mat.T, rank=k, eps=id_eps) @@ -813,6 +820,21 @@ def __post_init__(self): def nclusters(self): return self.tgt_src_index.nclusters + @property + @memoize_method + def invD(self) -> np.ndarray: + from pytools.obj_array import make_obj_array + return make_obj_array([la.inv(D) for D in self.D]) + + @property + @memoize_method + def Dhat(self) -> np.ndarray: + from pytools.obj_array import make_obj_array + return make_obj_array([ + la.inv(self.R[i] @ self.invD[i] @ self.L[i]) + for i in range(self.nclusters) + ]) + def skeletonize_by_proxy( actx: PyOpenCLArrayContext, diff --git a/pytential/linalg/utils.py b/pytential/linalg/utils.py index 1a8da49ff..4c6f19d40 100644 --- a/pytential/linalg/utils.py +++ b/pytential/linalg/utils.py @@ -440,6 +440,22 @@ def mnorm(x: np.ndarray, y: np.ndarray) -> "np.floating[Any]": return tgt_error, src_error +def skeletonization_matrix( + mat: np.ndarray, skeleton: "SkeletonizationResult", + ) -> Tuple[np.ndarray, np.ndarray]: + D: np.ndarray = np.empty(skeleton.nclusters, dtype=object) + S: np.ndarray = np.empty((skeleton.nclusters, skeleton.nclusters), dtype=object) + + from itertools import product + for i, j in product(range(skeleton.nclusters), repeat=2): + if i == j: + D[i] = skeleton.tgt_src_index.cluster_take(mat, i, i) + else: + S[i, j] = skeleton.skel_tgt_src_index.cluster_take(mat, i, j) + + return D, S + + def skeletonization_error( mat: np.ndarray, skeleton: "SkeletonizationResult", *, ord: Optional[float] = None, @@ -501,4 +517,50 @@ def skeletonization_error( # }}} + +# {{{ eigenvalues + +def eigs( + mat, *, + k: int = 6, + which: str = "LM", + maxiter: Optional[int] = None, + tol: float = 0.0) -> np.ndarray: + import scipy.sparse.linalg as ssla + + result = ssla.eigs(mat, + k=k, + which=which, + maxiter=maxiter, + tol=tol, + return_eigenvectors=False) + + imag_norm = np.linalg.norm(np.imag(result), ord=np.inf) + if imag_norm > 1.0e-14: + from warnings import warn + warn(f"eigenvalues are not real enough: norm(imag) = {imag_norm:.12e}") + + return result + + +def cond(mat, *, + mat_inv=None, + p: Optional[float] = None, 
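+        # NOTE: only the spectral norm (p = 2) is supported below; the
+        # condition number is estimated from the extremal eigenvalues, with
+        # *mat_inv*, when given, used to obtain the smallest one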
+ tol: float = 1.0e-6) -> float: + if p is None: + p = 2 + + if p != 2: + raise ValueError(f"unsupported norm order: '{p}'") + + lambda_max = eigs(mat, k=1, which="LM", tol=tol) + if mat_inv is None: + lambda_min = eigs(mat, k=1, which="SM", tol=tol) + else: + lambda_min = eigs(mat_inv, k=1, which="LM", tol=tol) + + return np.abs(lambda_max) / np.abs(lambda_min) + +# }}} + # vim: foldmethod=marker diff --git a/pytential/symbolic/matrix.py b/pytential/symbolic/matrix.py index c24353dca..0484527bb 100644 --- a/pytential/symbolic/matrix.py +++ b/pytential/symbolic/matrix.py @@ -461,7 +461,6 @@ def map_int_g(self, expr): expr.target.geometry, expr.target.discr_stage) actx = self.array_context - target_base_kernel = expr.target_kernel.get_base_kernel() result = 0 for density, kernel in zip(expr.densities, expr.source_kernels): @@ -475,12 +474,10 @@ def map_int_g(self, expr): # {{{ generator - base_kernel = kernel.get_base_kernel() - from sumpy.p2p import P2PMatrixGenerator mat_gen = P2PMatrixGenerator(actx.context, - source_kernels=(base_kernel,), - target_kernels=(target_base_kernel,), + source_kernels=(kernel,), + target_kernels=(expr.target_kernel,), exclude_self=self.exclude_self) # }}} @@ -490,7 +487,7 @@ def map_int_g(self, expr): # {{{ kernel args # NOTE: copied from pytential.symbolic.primitives.IntG - kernel_args = base_kernel.get_args() + base_kernel.get_source_args() + kernel_args = kernel.get_args() + kernel.get_source_args() kernel_args = {arg.loopy_arg.name for arg in kernel_args} kernel_args = _get_layer_potential_args( @@ -638,7 +635,6 @@ def map_int_g(self, expr): expr.target.geometry, expr.target.discr_stage) actx = self.array_context - target_base_kernel = expr.target_kernel.get_base_kernel() result = 0 for kernel, density in zip(expr.source_kernels, expr.densities): @@ -659,12 +655,10 @@ def map_int_g(self, expr): # {{{ generator - base_kernel = kernel.get_base_kernel() - from sumpy.p2p import P2PMatrixSubsetGenerator mat_gen = P2PMatrixSubsetGenerator(actx.context, - source_kernels=(base_kernel,), - target_kernels=(target_base_kernel,), + source_kernels=(kernel,), + target_kernels=(expr.target_kernel,), exclude_self=self.exclude_self) # }}} @@ -674,7 +668,7 @@ def map_int_g(self, expr): # {{{ kernel args # NOTE: copied from pytential.symbolic.primitives.IntG - kernel_args = base_kernel.get_args() + base_kernel.get_source_args() + kernel_args = kernel.get_args() + kernel.get_source_args() kernel_args = {arg.loopy_arg.name for arg in kernel_args} kernel_args = _get_layer_potential_args( diff --git a/test/extra_matrix_data.py b/test/extra_matrix_data.py index 70710bec6..a0018ccce 100644 --- a/test/extra_matrix_data.py +++ b/test/extra_matrix_data.py @@ -43,14 +43,23 @@ class MatrixTestCaseMixin: proxy_target_cluster_builder: Optional[Callable[..., Any]] = None neighbor_cluster_builder: Optional[Callable[..., Any]] = None - def get_cluster_index(self, actx, places, dofdesc=None): + def max_particles_in_box_for_discr(self, discr): + max_particles_in_box = self.max_particles_in_box + if max_particles_in_box is None: + max_particles_in_box = discr.ndofs // self.approx_cluster_count + + return max_particles_in_box + + def get_cluster_index( + self, actx, places, dofdesc=None, max_particles_in_box=None): if dofdesc is None: dofdesc = places.auto_source discr = places.get_discretization(dofdesc.geometry) - max_particles_in_box = self.max_particles_in_box if max_particles_in_box is None: - max_particles_in_box = discr.ndofs // self.approx_cluster_count + max_particles_in_box = 
self.max_particles_in_box + if max_particles_in_box is None: + max_particles_in_box = discr.ndofs // self.approx_cluster_count from pytential.linalg.cluster import partition_by_nodes cindex, ctree = partition_by_nodes(actx, places, @@ -77,9 +86,11 @@ def get_cluster_index(self, actx, places, dofdesc=None): return cindex, ctree - def get_tgt_src_cluster_index(self, actx, places, dofdesc=None): + def get_tgt_src_cluster_index( + self, actx, places, dofdesc=None, max_particles_in_box=None): from pytential.linalg import TargetAndSourceClusterList - cindex, ctree = self.get_cluster_index(actx, places, dofdesc=dofdesc) + cindex, ctree = self.get_cluster_index( + actx, places, dofdesc=dofdesc, max_particles_in_box=max_particles_in_box) return TargetAndSourceClusterList(cindex, cindex), ctree def get_operator(self, ambient_dim, qbx_forced_limit=_NoArgSentinel): diff --git a/test/test_linalg_hmatrix.py b/test/test_linalg_hmatrix.py new file mode 100644 index 000000000..1bba7b9d2 --- /dev/null +++ b/test/test_linalg_hmatrix.py @@ -0,0 +1,582 @@ +__copyright__ = "Copyright (C) 2022 Alexandru Fikl" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + +from dataclasses import replace +import pytest + +import numpy as np + +from pytential import bind, sym +from pytential import GeometryCollection + +from meshmode.mesh.generation import NArmedStarfish + +from meshmode import _acf # noqa: F401 +from arraycontext import pytest_generate_tests_for_array_contexts +from meshmode.array_context import PytestPyOpenCLArrayContextFactory + +import extra_matrix_data as extra +import logging +logger = logging.getLogger(__name__) + +pytest_generate_tests = pytest_generate_tests_for_array_contexts([ + PytestPyOpenCLArrayContextFactory, + ]) + + +HMATRIX_TEST_CASES = [ + extra.CurveTestCase( + name="starfish", + op_type="scalar", + target_order=4, + curve_fn=NArmedStarfish(5, 0.25), + resolutions=[512]), + extra.CurveTestCase( + name="starfish", + op_type="double", + target_order=4, + curve_fn=NArmedStarfish(5, 0.25), + resolutions=[512]), + extra.TorusTestCase( + target_order=4, + op_type="scalar", + resolutions=[0]) + ] + + +# {{{ test_hmatrix_forward_matvec_single_level + +def hmatrix_matvec_single_level(mat, x, skeleton): + from pytential.linalg.cluster import split_array + targets, sources = skeleton.tgt_src_index + y = split_array(x, sources) + + y_hat = np.empty(y.shape, dtype=object) + + for i in range(skeleton.nclusters): + y_hat[i] = skeleton.R[i] @ y[i] + + from pytential.linalg.utils import skeletonization_matrix + D, S = skeletonization_matrix(mat, skeleton) + syhat = np.zeros(y.shape, dtype=object) + + from itertools import product + for i, j in product(range(skeleton.nclusters), repeat=2): + if i == j: + continue + + syhat[i] = syhat[i] + S[i, j] @ y_hat[j] + + for i in range(skeleton.nclusters): + y[i] = D[i] @ y[i] + skeleton.L[i] @ syhat[i] + + return np.concatenate(y)[np.argsort(targets.indices)] + + +@pytest.mark.parametrize("case", HMATRIX_TEST_CASES) +@pytest.mark.parametrize("discr_stage", [sym.QBX_SOURCE_STAGE1]) +def test_hmatrix_forward_matvec_single_level( + actx_factory, case, discr_stage, visualize=False): + actx = actx_factory() + + if visualize: + logging.basicConfig(level=logging.INFO) + + if case.ambient_dim == 2: + kwargs = {"proxy_approx_count": 64, "proxy_radius_factor": 1.15} + else: + kwargs = {"proxy_approx_count": 256, "proxy_radius_factor": 1.25} + + case = replace(case, skel_discr_stage=discr_stage, **kwargs) + logger.info("\n%s", case) + + # {{{ geometry + + dd = sym.DOFDescriptor(case.name, discr_stage=case.skel_discr_stage) + qbx = case.get_layer_potential(actx, case.resolutions[-1], case.target_order) + places = GeometryCollection(qbx, auto_where=dd) + + density_discr = places.get_discretization(dd.geometry, dd.discr_stage) + tgt_src_index, _ = case.get_tgt_src_cluster_index(actx, places, dd) + + logger.info("dd %s", dd) + logger.info("nclusters %3d ndofs %7d", + tgt_src_index.nclusters, density_discr.ndofs) + + # }}} + + # {{{ construct reference + + from pytential.linalg.direct_solver_symbolic import prepare_expr + from pytential.symbolic.matrix import MatrixBuilder + sym_u, sym_op = case.get_operator(places.ambient_dim) + mat = MatrixBuilder( + actx, + dep_expr=sym_u, + other_dep_exprs=[], + dep_discr=density_discr, + places=places, + context={}, + )(prepare_expr(places, sym_op, (dd, dd))) + + from arraycontext import flatten, unflatten + x = actx.thaw(density_discr.nodes()[0]) + y = actx.to_numpy(flatten(x, actx)) + r_lpot = unflatten(x, actx.from_numpy(mat @ y), actx) + + # }}} + + # {{{ check matvec + + id_eps = 10.0 ** (-np.arange(2, 16)) + rec_error = np.zeros_like(id_eps) + + from 
pytools.convergence import EOCRecorder + eoc = EOCRecorder() + + from pytential.linalg.skeletonization import skeletonize_by_proxy + for i in range(id_eps.size): + skeleton = skeletonize_by_proxy( + actx, places, tgt_src_index, sym_op, sym_u, + domains=[dd], context={}, + approx_nproxy=case.proxy_approx_count, + proxy_radius_factor=case.proxy_radius_factor, + id_eps=id_eps[i], + ) + r_hmat = hmatrix_matvec_single_level(mat, y, skeleton[0, 0]) + r_hmat = unflatten(x, actx.from_numpy(r_hmat), actx) + + from meshmode.dof_array import flat_norm + rec_error[i] = actx.to_numpy( + flat_norm(r_hmat - r_lpot) / flat_norm(r_lpot) + ) + logger.info("id_eps %.2e error: %.12e", id_eps[i], rec_error[i]) + # assert rec_error[i] < 0.1 + + eoc.add_data_point(id_eps[i], rec_error[i]) + + logger.info("\n%s", eoc.pretty_print( + abscissa_format="%.8e", + error_format="%.8e", + eoc_format="%.2f")) + + # }}} + + if not visualize: + return + + import matplotlib.pyplot as pt + fig = pt.figure(figsize=(10, 10), dpi=300) + ax = fig.gca() + + ax.loglog(id_eps, id_eps, "k--") + ax.loglog(id_eps, rec_error) + + ax.grid(True) + ax.set_xlabel(r"$\epsilon_{id}$") + ax.set_ylabel("$Error$") + ax.set_title(case.name) + + basename = "linalg_hmatrix_single_matvec" + fig.savefig(f"{basename}_{case.name}_{case.op_type}_convergence") + + if case.ambient_dim == 2: + fig.clf() + ax = fig.gca() + + from arraycontext import flatten + r_hmap = actx.to_numpy(flatten(r_hmat, actx)) + r_lpot = actx.to_numpy(flatten(r_lpot, actx)) + + ax.semilogy(r_hmap - r_lpot) + ax.set_ylim([1.0e-16, 1.0]) + fig.savefig(f"{basename}_{case.name}_{case.op_type}_error") + + pt.close(fig) + +# }}} + + +# {{{ test_hmatrix_forward_matvec + +@pytest.mark.parametrize("case", [ + HMATRIX_TEST_CASES[0], + HMATRIX_TEST_CASES[1], + pytest.param(HMATRIX_TEST_CASES[2], marks=pytest.mark.slowtest), + ]) +@pytest.mark.parametrize("discr_stage", [ + sym.QBX_SOURCE_STAGE1, + # sym.QBX_SOURCE_STAGE2 + ]) +def test_hmatrix_forward_matvec( + actx_factory, case, discr_stage, p2p=False, visualize=False): + actx = actx_factory() + + if visualize: + logging.basicConfig(level=logging.INFO) + + if case.ambient_dim == 2: + kwargs = {"proxy_approx_count": 64, "proxy_radius_factor": 1.25} + else: + kwargs = {"proxy_approx_count": 256, "proxy_radius_factor": 1.25} + + case = replace(case, skel_discr_stage=discr_stage, **kwargs) + logger.info("\n%s", case) + + # {{{ geometry + + dd = sym.DOFDescriptor(case.name, discr_stage=case.skel_discr_stage) + qbx = case.get_layer_potential(actx, case.resolutions[-1], case.target_order) + places = GeometryCollection(qbx, auto_where=dd) + + density_discr = places.get_discretization(dd.geometry, dd.discr_stage) + max_particles_in_box = case.max_particles_in_box_for_discr(density_discr) + + tgt_src_index, _ = case.get_tgt_src_cluster_index( + actx, places, dd, max_particles_in_box=max_particles_in_box) + + logger.info("dd %s", dd) + logger.info("nclusters %3d ndofs %7d", + tgt_src_index.nclusters, density_discr.ndofs) + + # }}} + + # {{{ construct hmatrix + + from pytential.linalg.hmatrix import build_hmatrix_by_proxy + sym_u, sym_op = case.get_operator(places.ambient_dim) + + x = actx.thaw(density_discr.nodes()[0]) + + if p2p: + # NOTE: this also needs changed in `build_hmatrix_by_proxy` + # to actually evaluate the p2p interactions instead of qbx + from pytential.linalg.direct_solver_symbolic import prepare_expr + from pytential.symbolic.matrix import P2PMatrixBuilder + mat = P2PMatrixBuilder( + actx, + dep_expr=sym_u, + other_dep_exprs=[], + 
dep_discr=density_discr, + places=places, + context={}, + )(prepare_expr(places, sym_op, (dd, dd))) + + from arraycontext import flatten, unflatten + y = actx.to_numpy(flatten(x, actx)) + r_lpot = unflatten(x, actx.from_numpy(mat @ y), actx) + else: + r_lpot = bind(places, sym_op, auto_where=dd)(actx, u=x) + + from pytential.linalg.hmatrix import hmatrix_error_from_param + id_eps = 10.0 ** (-np.arange(2, 16)) + rec_error = np.zeros_like(id_eps) + model_error = np.zeros_like(id_eps) + + from pytools.convergence import EOCRecorder + eoc = EOCRecorder() + + for i in range(id_eps.size): + wrangler = build_hmatrix_by_proxy(actx, places, sym_op, sym_u, + domains=[dd], + context=case.knl_concrete_kwargs, + id_eps=id_eps[i], + _tree_kind=case.tree_kind, + _max_particles_in_box=max_particles_in_box, + _approx_nproxy=case.proxy_approx_count, + _proxy_radius_factor=case.proxy_radius_factor, + ) + hmat = wrangler.get_forward() + + # {{{ skeletonization error + + from meshmode.dof_array import flat_norm + r_hmap = hmat @ x + rec_error[i] = actx.to_numpy( + flat_norm(r_hmap - r_lpot) / flat_norm(r_lpot) + ) + + # }}} + + # {{{ model error + + skeleton = hmat.skeletons[0] + icluster = np.argmax(np.diff(skeleton.skel_tgt_src_index.targets.starts)) + + proxy_radius = actx.to_numpy( + skeleton._src_eval_result.pxy.radii[icluster] + ) + cluster_radius = actx.to_numpy( + skeleton._src_eval_result.pxy._cluster_radii[icluster] + ) + + model_error[i] = hmatrix_error_from_param( + places.ambient_dim, + id_eps=id_eps[i], + min_proxy_radius=proxy_radius, + max_cluster_radius=cluster_radius, + id_rank=skeleton.skel_tgt_src_index.targets.cluster_size(icluster), + nproxies=skeleton._src_eval_result.pxy.pxyindex.cluster_size(icluster), + ntargets=skeleton.tgt_src_index.targets.cluster_size(icluster), + nsources=skeleton.tgt_src_index.targets.cluster_size(icluster), + c=1.0e-8 + ) + + # }}} + + logger.info("id_eps %.2e error: %.12e (%.12e)", + id_eps[i], rec_error[i], model_error[i]) + eoc.add_data_point(id_eps[i], rec_error[i]) + + logger.info("\n%s", eoc.pretty_print( + abscissa_format="%.8e", + error_format="%.8e", + eoc_format="%.2f")) + + if not visualize: + assert eoc.order_estimate() > 0.6 + + # }}} + + if not visualize: + return + + import matplotlib.pyplot as pt + fig = pt.figure(figsize=(10, 10), dpi=300) + ax = fig.gca() + + ax.loglog(id_eps, id_eps, "k--") + ax.loglog(id_eps, rec_error) + ax.loglog(id_eps, model_error) + + ax.grid(True) + ax.set_xlabel(r"$\epsilon_{id}$") + ax.set_ylabel("$Error$") + ax.set_title(case.name) + + lpot_name = "p2p" if p2p else "qbx" + basename = f"linalg_hmatrix_{lpot_name}_matvec" + fig.savefig(f"{basename}_{case.name}_{case.op_type}_convergence") + + if case.ambient_dim == 2: + fig.clf() + ax = fig.gca() + + from arraycontext import flatten + r_hmap = actx.to_numpy(flatten(r_hmap, actx)) + r_lpot = actx.to_numpy(flatten(r_lpot, actx)) + + ax.semilogy(r_hmap - r_lpot) + ax.set_ylim([1.0e-16, 1.0]) + fig.savefig(f"{basename}_{case.name}_{case.op_type}_error") + + pt.close(fig) + +# }}} + + +# {{{ test_hmatrix_backward_matvec + +@pytest.mark.parametrize("case", [ + HMATRIX_TEST_CASES[0], + HMATRIX_TEST_CASES[1], + pytest.param(HMATRIX_TEST_CASES[2], marks=pytest.mark.slowtest), + ]) +@pytest.mark.parametrize("discr_stage", [ + sym.QBX_SOURCE_STAGE1, + # sym.QBX_SOURCE_STAGE2 + ]) +def test_hmatrix_backward_matvec(actx_factory, case, discr_stage, visualize=False): + actx = actx_factory() + + if visualize: + logging.basicConfig(level=logging.INFO) + + if case.ambient_dim == 2: + 
kwargs = {"proxy_approx_count": 64, "proxy_radius_factor": 1.25} + else: + kwargs = {"proxy_approx_count": 64, "proxy_radius_factor": 1.25} + + case = replace(case, skel_discr_stage=discr_stage, **kwargs) + logger.info("\n%s", case) + + # {{{ geometry + + dd = sym.DOFDescriptor(case.name, discr_stage=case.skel_discr_stage) + qbx = case.get_layer_potential(actx, case.resolutions[-1], case.target_order) + places = GeometryCollection(qbx, auto_where=dd) + + density_discr = places.get_discretization(dd.geometry, dd.discr_stage) + max_particles_in_box = case.max_particles_in_box_for_discr(density_discr) + + tgt_src_index, _ = case.get_tgt_src_cluster_index( + actx, places, dd, max_particles_in_box=max_particles_in_box) + + logger.info("dd %s", dd) + logger.info("nclusters %3d ndofs %7d", + tgt_src_index.nclusters, density_discr.ndofs) + + # }}} + + # {{{ + + sym_u, sym_op = case.get_operator(places.ambient_dim) + + if visualize: + from pytential.linalg.direct_solver_symbolic import prepare_expr + from pytential.symbolic.matrix import MatrixBuilder + mat = MatrixBuilder( + actx, + dep_expr=sym_u, + other_dep_exprs=[], + dep_discr=density_discr, + places=places, + context={}, + )(prepare_expr(places, sym_op, (dd, dd))) + + import pytential.linalg.utils as hla + eigs_ref = hla.eigs(mat, k=5) + kappa_ref = np.linalg.cond(mat, p=2) + + # }}} + + # {{{ construct hmatrix + + from pytential.linalg.hmatrix import build_hmatrix_by_proxy + sym_u, sym_op = case.get_operator(places.ambient_dim) + + x_ref = actx.thaw(density_discr.nodes()[0]) + b_ref = bind(places, sym_op, auto_where=dd)(actx, u=x_ref) + + id_eps = 10.0 ** (-np.arange(2, 16)) + rec_error = np.zeros_like(id_eps) + + if visualize: + rec_eigs = np.zeros((id_eps.size, eigs_ref.size), dtype=np.complex128) + rec_kappa = np.zeros(id_eps.size) + + from pytools.convergence import EOCRecorder + eoc = EOCRecorder() + + for i in range(id_eps.size): + wrangler = build_hmatrix_by_proxy(actx, places, sym_op, sym_u, + domains=[dd], + context=case.knl_concrete_kwargs, + id_eps=id_eps[i], + _tree_kind=case.tree_kind, + _max_particles_in_box=max_particles_in_box, + _approx_nproxy=case.proxy_approx_count, + _proxy_radius_factor=case.proxy_radius_factor, + ) + + hmat_inv = wrangler.get_backward() + x_hmat = hmat_inv @ b_ref + + if visualize: + hmat = wrangler.get_forward() + rec_eigs[i, :] = hla.eigs(hmat, k=5, tol=1.0e-6) + rec_kappa[i] = hla.cond(hmat, p=2, tol=1.0e-6) + + logger.info("eigs: %s %s", eigs_ref, rec_eigs[i]) + logger.info("kappa %.12e %.12e", kappa_ref, rec_kappa[i]) + + from meshmode.dof_array import flat_norm + rec_error[i] = actx.to_numpy( + flat_norm(x_hmat - x_ref) / flat_norm(x_ref) + ) + logger.info("id_eps %.2e error: %.12e", id_eps[i], rec_error[i]) + eoc.add_data_point(id_eps[i], rec_error[i]) + + logger.info("\n%s", eoc.pretty_print( + abscissa_format="%.8e", + error_format="%.8e", + eoc_format="%.2f")) + + if not visualize: + assert eoc.order_estimate() > 0.6 + + # }}} + + if not visualize: + return + + import matplotlib.pyplot as pt + fig = pt.figure(figsize=(10, 10), dpi=300) + + # {{{ convergence + + ax = fig.gca() + ax.loglog(id_eps, id_eps, "k--") + ax.loglog(id_eps, rec_error) + + ax.grid(True) + ax.set_xlabel(r"$\epsilon_{id}$") + ax.set_ylabel("$Error$") + ax.set_title(case.name) + + fig.savefig(f"linalg_hmatrix_inverse_{case.name}_{case.op_type}_convergence") + fig.clf() + + # }}} + + # {{{ eigs + + ax = fig.gca() + ax.plot(np.real(eigs_ref), np.imag(eigs_ref), "ko") + for i in range(id_eps.size): + 
ax.plot(np.real(rec_eigs[i]), np.imag(rec_eigs[i]), "v") + + ax.grid(True) + ax.set_xlabel(r"$\Re \lambda$") + ax.set_ylabel(r"$\Im \lambda$") + + fig.savefig(f"linalg_hmatrix_inverse_{case.name}_{case.op_type}_eigs") + fig.clf() + + # }}} + + if case.ambient_dim == 2: + ax = fig.gca() + + from arraycontext import flatten + x_hmat = actx.to_numpy(flatten(x_hmat, actx)) + x_ref = actx.to_numpy(flatten(x_ref, actx)) + + ax.semilogy(x_hmat - x_ref) + ax.set_ylim([1.0e-16, 1.0]) + fig.savefig(f"linalg_hmatrix_inverse_{case.name}_{case.op_type}_error") + fig.clf() + + pt.close(fig) + +# }}} + + +if __name__ == "__main__": + import sys + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + from pytest import main + main([__file__]) + +# vim: fdm=marker diff --git a/test/test_linalg_skeletonization.py b/test/test_linalg_skeletonization.py index b7be1060f..04da07dea 100644 --- a/test/test_linalg_skeletonization.py +++ b/test/test_linalg_skeletonization.py @@ -138,6 +138,105 @@ def test_skeletonize_symbolic(actx_factory, case, visualize=False): # }}} +# {{{ test_skeletonize_diagonal + +@pytest.mark.parametrize("case", [ + SKELETONIZE_TEST_CASES[0], + SKELETONIZE_TEST_CASES[1], + SKELETONIZE_TEST_CASES[2], + ]) +def test_skeletonize_diagonal(actx_factory, case, visualize=False): + import scipy.linalg.interpolative as sli # pylint:disable=no-name-in-module + sli.seed(42) + + actx = actx_factory() + + if visualize: + logging.basicConfig(level=logging.INFO) + + # {{{ setup + + dd = sym.DOFDescriptor(case.name, discr_stage=case.skel_discr_stage) + resolution = case.resolutions[-1] + + qbx = case.get_layer_potential(actx, resolution, case.target_order) + places = GeometryCollection(qbx, auto_where=dd) + + tgt_src_index, ctree = case.get_tgt_src_cluster_index(actx, places, dd) + + sym_u, sym_op = case.get_operator(places.ambient_dim) + + # }}} + + # {{{ check + + from pytential.linalg.skeletonization import make_skeletonization_wrangler + wrangler = make_skeletonization_wrangler( + places, sym_op, sym_u, + auto_where=dd, context=case.knl_concrete_kwargs) + + from pytential.linalg.skeletonization import rec_skeletonize_by_proxy + skeletons = rec_skeletonize_by_proxy( + actx, places, ctree, tgt_src_index, sym_op, sym_u, + auto_where=dd, + context=case.knl_concrete_kwargs, + approx_nproxy=case.proxy_approx_count, + proxy_radius_factor=case.proxy_radius_factor, + id_eps=case.id_eps, + _wrangler=wrangler, + ) + + from pytential.linalg.hmatrix import _update_skeleton_diagonal + for i in range(1, skeletons.size): + skeletons[i] = _update_skeleton_diagonal( + skeletons[i], skeletons[i - 1], ctree.levels[i - 1], + ) + + from pytential.linalg.cluster import cluster + parent = None + for k, clevel in enumerate(ctree.levels): + from pytential.linalg.utils import make_flat_cluster_diag + + tgt_src_index = skeletons[k].tgt_src_index + D1 = wrangler.evaluate_self(actx, places, tgt_src_index, 0, 0) + D1 = make_flat_cluster_diag(D1, tgt_src_index) + + if k == 0: + D = D0 = D1 + else: + skel_tgt_src_index = skeletons[k - 1].skel_tgt_src_index + assert skel_tgt_src_index.shape == tgt_src_index.shape + + D0 = wrangler.evaluate_self(actx, places, skel_tgt_src_index, 0, 0) + D0 = cluster(make_flat_cluster_diag(D0, skel_tgt_src_index), parent) + + D = D1 - D0 + + parent = clevel + + assert D1.shape == (skeletons[k].nclusters,) + assert D1.shape == D0.shape, (D1.shape, D0.shape) + assert D1.shape == D.shape, (D1.shape, D.shape) + + for i in range(skeletons[k].nclusters): + assert D1[i].shape == 
skeletons[k].tgt_src_index.cluster_shape(i, i) + assert D1[i].shape == D0[i].shape, (D1[i].shape, D0[i].shape) + + error = la.norm(D[i] - skeletons[k].D[i]) / (la.norm(D[i]) + 1.0e-12) + logger.info("level %04d / %04d cluster %3d (%4d, %4d) error %.12e", + k, ctree.nlevels, + ctree.tree_cluster_parent_ids[clevel.box_ids][i], + *skeletons[k].tgt_src_index.cluster_shape(i, i), error) + + assert error < 1.0e-15 + + logger.info("") + + # }}} + +# }}} + + # {{{ test_skeletonize_by_proxy