From 15eee4940318cfbd2baccc5a6a2eba9d4aab67f4 Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Fri, 17 Jun 2022 15:10:34 +0300 Subject: [PATCH 1/4] add recursive clustering and skeletonization --- doc/conf.py | 4 + doc/linalg.rst | 22 +- pytential/linalg/__init__.py | 16 - pytential/linalg/cluster.py | 489 +++++++++++++++++++++ pytential/linalg/direct_solver_symbolic.py | 13 +- pytential/linalg/proxy.py | 255 ++++------- pytential/linalg/skeletonization.py | 282 +++++++++--- pytential/linalg/utils.py | 53 ++- pytential/symbolic/matrix.py | 5 +- test/extra_matrix_data.py | 10 +- test/test_linalg_cluster.py | 129 ++++++ test/test_linalg_proxy.py | 14 +- test/test_linalg_skeletonization.py | 110 ++--- test/test_matrix.py | 8 +- 14 files changed, 1081 insertions(+), 329 deletions(-) create mode 100644 pytential/linalg/cluster.py create mode 100644 test/test_linalg_cluster.py diff --git a/doc/conf.py b/doc/conf.py index 31bc6933f..9d69973a7 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -17,6 +17,10 @@ "DOFDescriptorLike": "pytential.symbolic.dof_desc.DOFDescriptorLike", } +nitpick_ignore_regex = [ + ["py:class", r".*_ProxyNeighborEvaluationResult"], + ] + intersphinx_mapping = { "arraycontext": ("https://documen.tician.de/arraycontext", None), "boxtree": ("https://documen.tician.de/boxtree", None), diff --git a/doc/linalg.rst b/doc/linalg.rst index 97497434f..d3d4870ce 100644 --- a/doc/linalg.rst +++ b/doc/linalg.rst @@ -8,7 +8,7 @@ scheme is used: component of the Stokeslet. * ``cluster`` refers to a piece of a ``block`` as used by the recursive proxy-based skeletonization of the direct solver algorithms. Clusters - are represented by a :class:`~pytential.linalg.TargetAndSourceClusterList`. + are represented by a :class:`~pytential.linalg.utils.TargetAndSourceClusterList`. GMRES ----- @@ -20,17 +20,31 @@ GMRES Hierarchical Direct Solver -------------------------- +.. note:: + + High-level API for direct solvers is in progress. + +Low-level Functionality +----------------------- + .. warning:: All the classes and routines in this module are experimental and the API can change at any point. +.. automodule:: pytential.linalg.skeletonization +.. automodule:: pytential.linalg.cluster .. automodule:: pytential.linalg.proxy -.. automodule:: pytential.linalg.utils -Internal Functionality ----------------------- +Internal Functionality and Utilities +------------------------------------ + +.. warning:: + All the classes and routines in this module are experimental and the + API can change at any point. + +.. automodule:: pytential.linalg.utils .. automodule:: pytential.linalg.direct_solver_symbolic .. 
vim: sw=4:tw=75:fdm=marker diff --git a/pytential/linalg/__init__.py b/pytential/linalg/__init__.py index bfbd4d82e..eed539cc8 100644 --- a/pytential/linalg/__init__.py +++ b/pytential/linalg/__init__.py @@ -25,25 +25,9 @@ make_index_list, make_index_cluster_cartesian_product, interp_decomp, ) -from pytential.linalg.proxy import ( - ProxyClusterGeometryData, ProxyPointTarget, ProxyPointSource, - ProxyGeneratorBase, ProxyGenerator, QBXProxyGenerator, - partition_by_nodes, gather_cluster_neighbor_points, - ) -from pytential.linalg.skeletonization import ( - SkeletonizationWrangler, make_skeletonization_wrangler, - SkeletonizationResult, skeletonize_by_proxy, - ) __all__ = ( "IndexList", "TargetAndSourceClusterList", "make_index_list", "make_index_cluster_cartesian_product", "interp_decomp", - - "ProxyClusterGeometryData", "ProxyPointTarget", "ProxyPointSource", - "ProxyGeneratorBase", "ProxyGenerator", "QBXProxyGenerator", - "partition_by_nodes", "gather_cluster_neighbor_points", - - "SkeletonizationWrangler", "make_skeletonization_wrangler", - "SkeletonizationResult", "skeletonize_by_proxy", ) diff --git a/pytential/linalg/cluster.py b/pytential/linalg/cluster.py new file mode 100644 index 000000000..7a90e2cac --- /dev/null +++ b/pytential/linalg/cluster.py @@ -0,0 +1,489 @@ +from __future__ import annotations + + +__copyright__ = "Copyright (C) 2022 Alexandru Fikl" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import pathlib +from collections.abc import Iterator +from dataclasses import dataclass, replace +from functools import singledispatch +from typing import Any + +import numpy as np + +from pytools import memoize_method +from pytools.obj_array import make_obj_array + +from arraycontext import PyOpenCLArrayContext +from boxtree.tree import Tree +from meshmode.discretization import Discretization +from pytential import sym, GeometryCollection +from pytential.linalg.proxy import ProxyGenerator +from pytential.linalg.utils import IndexList, TargetAndSourceClusterList +from pytential.qbx import QBXLayerPotentialSource + +__doc__ = """ +Clustering +~~~~~~~~~~ + +.. autoclass:: ClusterLevel +.. autoclass:: ClusterTree + +.. autofunction:: split_array +.. autofunction:: cluster +.. autofunction:: uncluster + +.. 
autofunction:: partition_by_nodes +""" + +# FIXME: this is just an arbitrary value +_DEFAULT_MAX_PARTICLES_IN_BOX = 32 + + +# {{{ cluster tree + + +def make_cluster_parent_map(parent_ids: np.ndarray) -> np.ndarray: + """Construct a parent map for :attr:`ClusterLevel.parent_map`.""" + # NOTE: np.unique returns a sorted array + unique_parent_ids = np.unique(parent_ids) + ids = np.arange(parent_ids.size) + + return make_obj_array([ + ids[parent_ids == unique_parent_ids[i]] + for i in range(unique_parent_ids.size) + ]) + + +@dataclass(frozen=True) +class ClusterLevel: + """A level in a :class:`ClusterTree`. + + .. attribute:: level + + Current level that is represented. + + .. attribute:: nclusters + + Number of clusters on the current level (same as number of boxes + in :attr:`box_ids`). + + .. attribute:: box_ids + + Box IDs on the current level. + + .. attribute:: parent_map + + An object :class:`~numpy.ndarray` containing buckets of child indices, + i.e. ``parent_map[i]`` contains all the child indices that will cluster + into the same parent. Note that this indexing is local to this level + and is not related to the tree indexing stored by the :class:`ClusterTree`. + """ + + level: int + box_ids: np.ndarray + parent_map: np.ndarray + + @property + def nclusters(self): + return self.box_ids.size + + +@dataclass(frozen=True) +class ClusterTree: + r"""Hierarchical cluster representation. + + .. attribute:: nlevels + + Total number of levels in the tree. + + .. attribute:: leaf_cluster_box_ids + + Box IDs for each cluster on the leaf level of the tree. + + .. attribute:: tree_cluster_parent_ids + + Parent box IDs for :attr:`leaf_cluster_box_ids`. + + .. attribute:: levels + + An :class:`~numpy.ndarray` of :class:`ClusterLevel`\ s. + """ + + nlevels: int + leaf_cluster_box_ids: np.ndarray + tree_cluster_parent_ids: np.ndarray + + # NOTE: only here to allow easier debugging + testing + _tree: Tree | None + + @property + def nclusters(self): + return self.leaf_cluster_box_ids.size + + @property + @memoize_method + def levels(self) -> np.ndarray: + return make_obj_array(list(self.iter_levels())) + + def iter_levels(self) -> Iterator[ClusterLevel]: + """ + :returns: an iterator over all the :class:`ClusterLevel` levels. + """ + + box_ids = self.leaf_cluster_box_ids + parent_ids = self.tree_cluster_parent_ids[box_ids] + clevel = ClusterLevel( + level=self.nlevels - 1, + box_ids=box_ids, + parent_map=make_cluster_parent_map(parent_ids), + ) + + for _ in range(self.nlevels - 1, -1, -1): + yield clevel + + box_ids = np.unique(self.tree_cluster_parent_ids[clevel.box_ids]) + parent_ids = self.tree_cluster_parent_ids[box_ids] + clevel = ClusterLevel( + level=clevel.level - 1, + box_ids=box_ids, + parent_map=make_cluster_parent_map(parent_ids) + ) + + assert clevel.nclusters == 1 + +# }}} + + +# {{{ cluster + +def split_array(x: np.ndarray, index: IndexList) -> np.ndarray: + """ + :returns: an object :class:`~numpy.ndarray` where each entry contains the + elements of the :math:`i`-th cluster in *index*. + """ + assert x.size == index.nindices + + from pytools.obj_array import make_obj_array + return make_obj_array([ + index.cluster_take(x, i) for i in range(index.nclusters) + ]) + + +@singledispatch +def cluster(obj: Any, clevel: ClusterLevel) -> Any: + """Merge together elements of *obj* into their parent object, as described + by :attr:`ClusterLevel.parent_map`. 
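+
+    For object arrays, the entries in each parent bucket are either
+    concatenated (vectors) or assembled into a block-diagonal matrix
+    (matrices). A minimal sketch (``x`` is an object array produced by
+    :func:`split_array` and ``clevel`` is one level of a :class:`ClusterTree`):
+
+    .. code-block:: python
+
+        y = cluster(x, clevel)
+        # one entry per parent in the next coarser level
+        assert y.shape == (len(clevel.parent_map),)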
+ """ + raise NotImplementedError(type(obj).__name__) + + +@cluster.register(IndexList) +def _cluster_index_list(obj: IndexList, clevel: ClusterLevel) -> IndexList: + assert obj.nclusters == clevel.nclusters + + if clevel.nclusters == 1: + return obj + + from pytential.linalg.utils import make_index_list + indices = make_obj_array([ + np.concatenate([obj.cluster_indices(i) for i in ppm]) + for ppm in clevel.parent_map + ]) + + return make_index_list(indices) + + +@cluster.register(TargetAndSourceClusterList) +def _cluster_target_and_source_cluster_list( + obj: TargetAndSourceClusterList, clevel: ClusterLevel, + ) -> TargetAndSourceClusterList: + assert obj.nclusters == clevel.nclusters + + if clevel.nclusters == 1: + return obj + + return replace(obj, + targets=cluster(obj.targets, clevel), + sources=cluster(obj.sources, clevel)) + + +@cluster.register(np.ndarray) +def _cluster_ndarray(obj: np.ndarray, clevel: ClusterLevel) -> np.ndarray: + assert obj.shape == (clevel.nclusters,) + if clevel.nclusters == 1: + return obj + + def make_block(i: int, j: int): + if i == j: + return obj[i] + + return np.zeros((obj[i].shape[0], obj[j].shape[1]), dtype=obj[i].dtype) + + from pytools import single_valued + ndim = single_valued(block.ndim for block in obj) + + if ndim == 1: + return make_obj_array([ + np.concatenate([obj[i] for i in ppm]) for ppm in clevel.parent_map + ]) + elif ndim == 2: + return make_obj_array([ + np.block([[make_block(i, j) for j in ppm] for i in ppm]) + for ppm in clevel.parent_map + ]) + else: + raise ValueError(f"unsupported ndarray dimension: '{ndim}'") + +# }}} + + +# {{{ uncluster + +def uncluster(ary: np.ndarray, index: IndexList, clevel: ClusterLevel) -> np.ndarray: + """Performs the reverse of :func:`cluster` on object arrays. + + :arg ary: an object :class:`~numpy.ndarray` with a shape that matches + :attr:`ClusterLevel.parent_map`. + :arg index: an :class:`~pytential.linalg.utils.IndexList` for the + current level, as given by :attr:`ClusterLevel.box_ids`. + :returns: an object :class:`~numpy.ndarray` with a shape that matches + :attr:`ClusterLevel.box_ids` of all the elements of *ary* that belong + to each child cluster. 
+ """ + assert ary.dtype.char == "O" + assert ary.shape == (clevel.parent_map.size,) + + if index.nclusters == 1: + return ary + + result: np.ndarray = np.empty(index.nclusters, dtype=object) + for ifrom, ppm in enumerate(clevel.parent_map): + offset = 0 + for ito in ppm: + cluster_size = index.cluster_size(ito) + result[ito] = ary[ifrom][offset:offset + cluster_size] + offset += cluster_size + + assert ary[ifrom].shape == (offset,) + + return result + +# }}} + + +# {{{ cluster generation + +def _build_binary_ish_tree_from_starts(starts: np.ndarray) -> ClusterTree: + partition_box_ids: np.ndarray[tuple[int, ...], Any] = np.arange(starts.size - 1) + + box_ids = partition_box_ids + + box_parent_ids = [] + offset = box_ids.size + while box_ids.size > 1: + # NOTE: this is probably not the most efficient way to do it, but this + # code is mostly meant for debugging using a simple tree + clusters = np.array_split(box_ids, box_ids.size // 2) + parent_ids = offset + np.arange(len(clusters)) + box_parent_ids.append(np.repeat(parent_ids, [len(c) for c in clusters])) + + box_ids = parent_ids + offset += box_ids.size + + # NOTE: make the root point to itself + box_parent_ids.append(np.array([offset - 1])) + nlevels = len(box_parent_ids) + + return ClusterTree( + nlevels=nlevels, + leaf_cluster_box_ids=partition_box_ids, + tree_cluster_parent_ids=np.concatenate(box_parent_ids), + _tree=None) + + +def partition_by_nodes( + actx: PyOpenCLArrayContext, places: GeometryCollection, *, + dofdesc: sym.DOFDescriptorLike | None = None, + tree_kind: str | None = "adaptive-level-restricted", + max_particles_in_box: int | None = None) -> tuple[IndexList, ClusterTree]: + """Generate equally sized ranges of nodes. The partition is created at the + lowest level of granularity, i.e. nodes. This results in balanced ranges + of points, but will split elements across different ranges. + + :arg dofdesc: a :class:`~pytential.symbolic.dof_desc.DOFDescriptor` for + the geometry in *places* which should be partitioned. + :arg tree_kind: if not *None*, it is passed to :class:`boxtree.TreeBuilder`. + :arg max_particles_in_box: value used to control the number of points + in each partition (and thus the number of partitions). See the documentation + in :class:`boxtree.TreeBuilder`. + """ + if dofdesc is None: + dofdesc = places.auto_source + dofdesc = sym.as_dofdesc(dofdesc) + + if max_particles_in_box is None: + max_particles_in_box = _DEFAULT_MAX_PARTICLES_IN_BOX + + lpot_source = places.get_geometry(dofdesc.geometry) + assert isinstance(lpot_source, Discretization | QBXLayerPotentialSource) + + discr = places.get_discretization(dofdesc.geometry, dofdesc.discr_stage) + assert isinstance(discr, Discretization) + + if tree_kind is not None: + setup_actx = lpot_source._setup_actx + assert isinstance(setup_actx, PyOpenCLArrayContext) + + from pytential.qbx.utils import tree_code_container + tcc = tree_code_container(setup_actx) + + from arraycontext import flatten + from meshmode.dof_array import DOFArray + tree, _ = tcc.build_tree()(actx.queue, + particles=flatten( + actx.thaw(discr.nodes()), actx, leaf_class=DOFArray + ), + max_particles_in_box=max_particles_in_box, + kind=tree_kind) + + from boxtree import box_flags_enum + tree = tree.get(actx.queue) + # FIXME maybe this should use IS_LEAF once available? 
+ leaf_boxes, = ( + tree.box_flags & box_flags_enum.HAS_SOURCE_OR_TARGET_CHILD_BOXES == 0 + ).nonzero() + + # FIXME: this annotation is not needed with numpy 2.0 + indices: np.ndarray = np.empty(len(leaf_boxes), dtype=object) + starts = None + + for i, ibox in enumerate(leaf_boxes): + box_start = tree.box_source_starts[ibox] + box_end = box_start + tree.box_source_counts_cumul[ibox] + indices[i] = tree.user_source_ids[box_start:box_end] + + ctree = ClusterTree( + nlevels=tree.nlevels, + leaf_cluster_box_ids=leaf_boxes, + tree_cluster_parent_ids=tree.box_parent_ids, + _tree=tree) + else: + if discr.ambient_dim != 2 and discr.dim == 1: + raise ValueError("only curves are supported for 'tree_kind=None'") + + nclusters = max(discr.ndofs // max_particles_in_box, 2) + indices = np.arange(0, discr.ndofs, dtype=np.int64) + starts = np.linspace(0, discr.ndofs, nclusters + 1, dtype=np.int64) + + # FIXME: mypy seems to be able to figure this out with numpy 2.0 + assert starts is not None + assert starts[-1] == discr.ndofs + + ctree = _build_binary_ish_tree_from_starts(starts) + + from pytential.linalg import make_index_list + return make_index_list(indices, starts=starts), ctree + +# }}} + + +# {{{ visualize clusters + +def visualize_clusters(actx: PyOpenCLArrayContext, + generator: ProxyGenerator, + srcindex: IndexList, + tree: ClusterTree, + filename: str | pathlib.Path, *, + dofdesc: sym.DOFDescriptorLike = None, + overwrite: bool = False) -> None: + filename = pathlib.Path(filename) + + places = generator.places + if dofdesc is None: + dofdesc = places.auto_source + dofdesc = sym.as_dofdesc(dofdesc) + + discr = places.get_discretization(dofdesc.geometry, dofdesc.discr_stage) + assert isinstance(discr, Discretization) + + if discr.ambient_dim != 2: + raise NotImplementedError(f"Unsupported dimension: {discr.ambient_dim}") + + import matplotlib.pyplot as pt + + from arraycontext import flatten + from boxtree.visualization import TreePlotter + from meshmode.dof_array import DOFArray + + x, y = actx.to_numpy(flatten(discr.nodes(), actx, leaf_class=DOFArray)) + for clevel in tree.levels: + outfile = filename.with_stem(f"{filename.stem}-{clevel.level:03d}") + if not overwrite and outfile.exists(): + raise FileExistsError(f"Output file '{outfile}' already exists") + + pxy = generator(actx, dofdesc, srcindex).to_numpy(actx) + pxycenters = pxy.centers + pxyradii = pxy.radii + clsradii = pxy.cluster_radii + + fig = pt.figure() + ax = fig.gca() + + plotter = TreePlotter(tree._tree) + plotter.set_bounding_box() + plotter.draw_tree(fill=False, edgecolor="black", zorder=10) + + ax.plot(x, y, "ko", ms=2.0) + for i in range(srcindex.nclusters): + isrc = srcindex.cluster_indices(i) + ax.plot(x[isrc], y[isrc], "o", ms=2.0) + + from itertools import cycle + colors = cycle(pt.rcParams["axes.prop_cycle"].by_key()["color"]) + + for ppm in clevel.parent_map: + color = next(colors) + for j in ppm: + center = (pxycenters[0, j], pxycenters[1, j]) + c = pt.Circle(center, pxyradii[j], color=color, alpha=0.1) + ax.add_artist(c) + c = pt.Circle(center, clsradii[j], color=color, alpha=0.1) + ax.add_artist(c) + ax.text(*center, f"{j}", fontsize=18) + + ax.set_xlabel("$x$") + ax.set_ylabel("$y$") + ax.relim() + ax.autoscale() + ax.set_aspect("equal") + + fig.savefig(outfile) + pt.close(fig) + + srcindex = cluster(srcindex, clevel) + +# }}} + + +# vim: foldmethod=marker diff --git a/pytential/linalg/direct_solver_symbolic.py b/pytential/linalg/direct_solver_symbolic.py index fc6322f5f..118b94597 100644 --- 
a/pytential/linalg/direct_solver_symbolic.py +++ b/pytential/linalg/direct_solver_symbolic.py @@ -20,6 +20,8 @@ THE SOFTWARE. """ +import numpy as np + from pytools.obj_array import make_obj_array from pytential.symbolic.mappers import ( @@ -44,9 +46,12 @@ class PROXY_SKELETONIZATION_TARGET: # noqa: N801 def prepare_expr(places, exprs, auto_where=None): from pytential.symbolic.execution import _prepare_expr - return make_obj_array([ - _prepare_expr(places, expr, auto_where=auto_where) - for expr in exprs]) + if isinstance(exprs, np.ndarray): + return make_obj_array([ + _prepare_expr(places, expr, auto_where=auto_where) + for expr in exprs]) + + return _prepare_expr(places, exprs, auto_where=auto_where) def prepare_proxy_expr(places, exprs, auto_where=None): @@ -224,3 +229,5 @@ def __init__(self, source, target): self.operand_rec = _LocationReplacer(source, default_source=source) # }}} + +# vim: foldmethod=marker diff --git a/pytential/linalg/proxy.py b/pytential/linalg/proxy.py index 23555cccb..16708b4fb 100644 --- a/pytential/linalg/proxy.py +++ b/pytential/linalg/proxy.py @@ -1,3 +1,6 @@ +from __future__ import annotations + + __copyright__ = "Copyright (C) 2018 Alexandru Fikl" __license__ = """ @@ -27,6 +30,7 @@ import numpy as np import numpy.linalg as la +import arraycontext from arraycontext import PyOpenCLArrayContext, flatten from meshmode.discretization import Discretization from meshmode.dof_array import DOFArray @@ -42,13 +46,14 @@ import loopy as lp from loopy.version import MOST_RECENT_LANGUAGE_VERSION +import logging +logger = logging.getLogger(__name__) + __doc__ = """ Proxy Point Generation ~~~~~~~~~~~~~~~~~~~~~~ -.. currentmodule:: pytential.linalg - .. autoclass:: ProxyPointSource .. autoclass:: ProxyPointTarget .. autoclass:: ProxyClusterGeometryData @@ -57,7 +62,6 @@ .. autoclass:: ProxyGenerator .. autoclass:: QBXProxyGenerator -.. autofunction:: partition_by_nodes .. autofunction:: gather_cluster_neighbor_points """ @@ -65,83 +69,6 @@ _DEFAULT_MAX_PARTICLES_IN_BOX = 32 -# {{{ point index partitioning - -def partition_by_nodes( - actx: PyOpenCLArrayContext, places: GeometryCollection, *, - dofdesc: DOFDescriptorLike | None = None, - tree_kind: str | None = "adaptive-level-restricted", - max_particles_in_box: int | None = None) -> IndexList: - """Generate equally sized ranges of nodes. The partition is created at the - lowest level of granularity, i.e. nodes. This results in balanced ranges - of points, but will split elements across different ranges. - - :arg dofdesc: a :class:`~pytential.symbolic.dof_desc.DOFDescriptor` for - the geometry in *places* which should be partitioned. - :arg tree_kind: if not *None*, it is passed to :class:`boxtree.TreeBuilder`. - :arg max_particles_in_box: value used to control the number of points - in each partition (and thus the number of partitions). See the documentation - in :class:`boxtree.TreeBuilder`. 
- """ - if dofdesc is None: - dofdesc = places.auto_source - dofdesc = sym.as_dofdesc(dofdesc) - - if max_particles_in_box is None: - max_particles_in_box = _DEFAULT_MAX_PARTICLES_IN_BOX - - from pytential.source import LayerPotentialSourceBase - - lpot_source = places.get_geometry(dofdesc.geometry) - assert isinstance(lpot_source, LayerPotentialSourceBase) - - discr = places.get_discretization(dofdesc.geometry, dofdesc.discr_stage) - assert isinstance(discr, Discretization) - - if tree_kind is not None: - from pytential.qbx.utils import tree_code_container - tcc = tree_code_container(lpot_source._setup_actx) - - tree, _ = tcc.build_tree()(actx.queue, - particles=flatten( - actx.thaw(discr.nodes()), actx, leaf_class=DOFArray - ), - max_particles_in_box=max_particles_in_box, - kind=tree_kind) - - from boxtree import box_flags_enum - tree = tree.get(actx.queue) - # FIXME maybe this should use IS_LEAF once available? - leaf_boxes, = ( - tree.box_flags & box_flags_enum.HAS_SOURCE_OR_TARGET_CHILD_BOXES == 0 - ).nonzero() - - indices: np.ndarray = np.empty(len(leaf_boxes), dtype=object) - starts: np.ndarray | None = None - - for i, ibox in enumerate(leaf_boxes): - box_start = tree.box_source_starts[ibox] - box_end = box_start + tree.box_source_counts_cumul[ibox] - indices[i] = tree.user_source_ids[box_start:box_end] - else: - if discr.ambient_dim != 2 and discr.dim == 1: - raise ValueError("only curves are supported for 'tree_kind=None'") - - nclusters = max(discr.ndofs // max_particles_in_box, 2) - indices = np.arange(0, discr.ndofs, dtype=np.int64) - starts = np.linspace(0, discr.ndofs, nclusters + 1, dtype=np.int64) - - # FIXME: I'm not sure why mypy can't figure this out. - assert starts is not None - - assert starts[-1] == discr.ndofs - - from pytential.linalg import make_index_list - return make_index_list(indices, starts=starts) - -# }}} - - # {{{ proxy points class ProxyPointSource(PointPotentialSource): @@ -189,31 +116,17 @@ def __init__(self, @dataclass(frozen=True) class ProxyClusterGeometryData: - """ - .. attribute:: srcindex - - A :class:`~pytential.linalg.IndexList` describing which cluster - of points each proxy ball was created from. - - .. attribute:: pxyindex - - A :class:`~pytential.linalg.IndexList` describing which proxies - belong to which cluster. - - .. attribute:: points - - A concatenated array of all the proxy points. Can be sliced into - using :attr:`pxyindex` (shape ``(dim, nproxies)``). - - .. attribute:: centers - - An array of all the proxy ball centers (shape ``(dim, nclusters)``). - - .. attribute:: radii - - An array of all the proxy ball radii (shape ``(nclusters,)``). - - .. attribute:: nclusters + """The proxy information generated by :class:`ProxyGeneratorBase`. + + .. autoattribute:: nclusters + .. autoattribute:: places + .. autoattribute:: dofdesc + .. autoattribute:: srcindex + .. autoattribute:: pxyindex + .. autoattribute:: points + .. autoattribute:: centers + .. autoattribute:: radii + .. autoattribute:: cluster_radii .. automethod:: __init__ .. automethod:: to_numpy @@ -222,39 +135,55 @@ class ProxyClusterGeometryData: """ places: GeometryCollection + """Geometry collection containing the used :attr:`dofdesc`.""" dofdesc: sym.DOFDescriptor + """A descriptor for the geometry used to compute the proxy points.""" srcindex: IndexList + """A :class:`~pytential.linalg.utils.IndexList` describing which cluster + of points each proxy ball was created from. 
+ """ pxyindex: IndexList + """A :class:`~pytential.linalg.utils.IndexList` describing which proxies + belong to which cluster. + """ points: np.ndarray + """A concatenated array of all the proxy points. Can be sliced into + using :attr:`pxyindex` (shape ``(dim, nproxies)``). + """ centers: np.ndarray + """An array of all the proxy ball centers (shape ``(dim, nclusters)``).""" radii: np.ndarray + """An array of all the proxy ball radii (shape ``(nclusters,)``).""" - _cluster_radii: np.ndarray | None = None + cluster_radii: np.ndarray + """An array of all the cluster ball radii (shape ``(nclusters,)``). The + proxy radii are essentially just the cluster radii multiplied by the + ``radius_factor``. + """ @property def nclusters(self) -> int: + """Number of clusters.""" return self.srcindex.nclusters @property def discr(self): + """The discretization that corresponds to :attr:`dofdesc` in :attr:`places`.""" return self.places.get_discretization( self.dofdesc.geometry, self.dofdesc.discr_stage) - def to_numpy(self, actx: PyOpenCLArrayContext) -> "ProxyClusterGeometryData": - if self._cluster_radii is not None: - cluster_radii = actx.to_numpy(self._cluster_radii) - else: - cluster_radii = None - + def to_numpy(self, actx: PyOpenCLArrayContext) -> ProxyClusterGeometryData: + """Convert the proxy geometry data to :mod:`numpy` arrays.""" from dataclasses import replace return replace(self, points=np.stack(actx.to_numpy(self.points)), centers=np.stack(actx.to_numpy(self.centers)), radii=actx.to_numpy(self.radii), - _cluster_radii=cluster_radii) + cluster_radii=actx.to_numpy(self.cluster_radii), + ) def as_sources(self) -> ProxyPointSource: lpot_source = self.places.get_geometry(self.dofdesc.geometry) @@ -422,7 +351,7 @@ def get_radii_kernel_ex(self, actx: PyOpenCLArrayContext) -> lp.ExecutorBase: raise NotImplementedError def __call__(self, - actx: PyOpenCLArrayContext, + actx: arraycontext.PyOpenCLArrayContext, source_dd: DOFDescriptorLike | None, dof_index: IndexList, **kwargs: Any) -> ProxyClusterGeometryData: @@ -441,8 +370,6 @@ def __call__(self, source_dd.geometry, source_dd.discr_stage) assert isinstance(discr, Discretization) - include_cluster_radii = kwargs.pop("include_cluster_radii", False) - # {{{ get proxy centers and radii sources = flatten(discr.nodes(), actx, leaf_class=DOFArray) @@ -454,25 +381,13 @@ def __call__(self, srcstarts=dof_index.starts) knl = self.get_radii_kernel_ex(actx) - _, (radii_dev,) = knl(actx.queue, + _, (cluster_radii,) = knl(actx.queue, sources=sources, srcindices=dof_index.indices, srcstarts=dof_index.starts, - radius_factor=self.radius_factor, proxy_centers=centers_dev, **kwargs) - - if include_cluster_radii: - knl = make_compute_cluster_radii_kernel_ex(actx, self.ambient_dim) - _, (cluster_radii,) = knl(actx.queue, - sources=sources, - srcindices=dof_index.indices, - srcstarts=dof_index.starts, - radius_factor=1.0, - proxy_centers=centers_dev) - cluster_radii = actx.freeze(cluster_radii) - else: - cluster_radii = None + radii_dev = self.radius_factor * cluster_radii # }}} @@ -505,7 +420,7 @@ def __call__(self, points=actx.freeze(actx.from_numpy(proxies)), centers=actx.freeze(centers_dev), radii=actx.freeze(radii_dev), - _cluster_radii=cluster_radii + cluster_radii=actx.freeze(cluster_radii), ) @@ -527,13 +442,12 @@ def prg(): - sources[idim, srcindices[i + ioffset]]) ** 2) )) - proxy_radius[icluster] = radius_factor * cluster_radius + proxy_radius[icluster] = cluster_radius end """, [ lp.GlobalArg("sources", None, shape=(ndim, "nsources"), dim_tags="sep,C", 
offset=lp.auto), lp.ValueArg("nsources", np.int64), - lp.ValueArg("radius_factor", np.float64), ... ], name="compute_cluster_radii_knl", @@ -587,7 +501,7 @@ def prg(): <> rqbx = rqbx_int if rqbx_ext < rqbx_int else rqbx_ext - proxy_radius[icluster] = radius_factor * rqbx + proxy_radius[icluster] = rqbx end """, [ lp.GlobalArg("sources", None, @@ -597,7 +511,6 @@ def prg(): lp.GlobalArg("center_ext", None, shape=(ndim, "nsources"), dim_tags="sep,C", offset=lp.auto), lp.ValueArg("nsources", np.int64), - lp.ValueArg("radius_factor", np.float64), ... ], name="compute_cluster_qbx_radii_knl", @@ -651,24 +564,40 @@ def __call__(self, # {{{ gather_cluster_neighbor_points def gather_cluster_neighbor_points( - actx: PyOpenCLArrayContext, pxy: ProxyClusterGeometryData, *, + actx: PyOpenCLArrayContext, + pxy: ProxyClusterGeometryData, + tgtindex: IndexList | None = None, + *, max_particles_in_box: int | None = None) -> IndexList: - """Generate a set of neighboring points for each cluster of points in - *pxy*. Neighboring points of a cluster :math:`i` are defined - as all the points inside the proxy ball :math:`i` that do not also - belong to the cluster itself. + r"""Generate a set of neighboring points for each cluster of points in *pxy*. + + Neighboring points of a cluster :math:`i` are defined as all the points + from *tgtindex* that are inside the proxy ball :math:`i` but outside the + cluster itself. For example, given a cluster with radius :math:`r_s` and + proxy radius :math:`r_p > r_s`, we gather all points such that + :math:`r_s < \|\mathbf{x}\| \le r_p`. """ + srcindex = pxy.srcindex + if tgtindex is None: + tgtindex = srcindex + + nclusters = srcindex.nclusters + if tgtindex.nclusters != nclusters: + raise ValueError("'tgtindex' has a different number of clusters: " + f"'{tgtindex.nclusters}' (expected {nclusters})") + if max_particles_in_box is None: max_particles_in_box = _DEFAULT_MAX_PARTICLES_IN_BOX - from pytential.source import LayerPotentialSourceBase - dofdesc = pxy.dofdesc lpot_source = pxy.places.get_geometry(dofdesc.geometry) - assert isinstance(lpot_source, LayerPotentialSourceBase) - discr = pxy.places.get_discretization(dofdesc.geometry, dofdesc.discr_stage) + + assert ( + dofdesc.discr_stage is None + or isinstance(lpot_source, QBXLayerPotentialSource) + ), (dofdesc, type(lpot_source)) assert isinstance(discr, Discretization) # {{{ get only sources in the current cluster set @@ -696,18 +625,23 @@ def prg() -> lp.ExecutorBase: return knl.executor(actx.context) - _, (sources,) = prg()(actx.queue, + _, (targets,) = prg()(actx.queue, ary=flatten(discr.nodes(), actx, leaf_class=DOFArray), - srcindices=pxy.srcindex.indices) + srcindices=tgtindex.indices) # }}} # {{{ perform area query from pytential.qbx.utils import tree_code_container - tcc = tree_code_container(lpot_source._setup_actx) + # NOTE: use the base source's actx for caching the code -- that has + # the best chance of surviving even when updating the lpot_source + setup_actx = discr._setup_actx + assert isinstance(setup_actx, PyOpenCLArrayContext) + + tcc = tree_code_container(setup_actx) + tree, _ = tcc.build_tree()(actx.queue, targets, max_particles_in_box=max_particles_in_box) query, _ = tcc.build_area_query()(actx.queue, tree, pxy.centers, pxy.radii) @@ -720,10 +654,10 @@ def prg() -> lp.ExecutorBase: pxycenters = actx.to_numpy(pxy.centers) pxyradii = actx.to_numpy(pxy.radii) - srcindex = pxy.srcindex - nbrindices: np.ndarray = np.empty(srcindex.nclusters,
dtype=object) - for icluster in range(srcindex.nclusters): + eps = 100 * np.finfo(pxyradii.dtype).eps + nbrindices: np.ndarray = np.empty(nclusters, dtype=object) + for icluster in range(nclusters): # get list of boxes intersecting the current ball istart = query.leaves_near_ball_starts[icluster] iend = query.leaves_near_ball_starts[icluster + 1] @@ -742,16 +676,17 @@ def prg() -> lp.ExecutorBase: isources = tree.user_source_ids[isources] # get nodes inside the ball but outside the current cluster - # FIXME: this assumes that only the points in `pxy.secindex` should - # count as neighbors, not all the nodes in the discretization. - # FIXME: it also assumes that all the indices are sorted? center = pxycenters[:, icluster].reshape(-1, 1) - radius = pxyradii[icluster] - mask = ((la.norm(nodes - center, axis=0) < radius) - & ((isources < srcindex.starts[icluster]) - | (srcindex.starts[icluster + 1] <= isources))) - - nbrindices[icluster] = srcindex.indices[isources[mask]] + radii = la.norm(nodes - center, axis=0) - eps + mask = ( + (radii <= pxyradii[icluster]) + & ((isources < tgtindex.starts[icluster]) + | (tgtindex.starts[icluster + 1] <= isources))) + + nbrindices[icluster] = tgtindex.indices[isources[mask]] + if nbrindices[icluster].size == 0: + logger.warning("Cluster '%d' has no neighbors. You might need to " + "increase the proxy 'radius_factor'.", icluster) # }}} diff --git a/pytential/linalg/skeletonization.py b/pytential/linalg/skeletonization.py index 365a44aa3..b57db6975 100644 --- a/pytential/linalg/skeletonization.py +++ b/pytential/linalg/skeletonization.py @@ -1,3 +1,6 @@ +from __future__ import annotations + + __copyright__ = "Copyright (C) 2018-2022 Alexandru Fikl" __license__ = """ @@ -22,7 +25,7 @@ from collections.abc import Callable, Hashable, Sequence from dataclasses import dataclass -from typing import Any +from typing import TYPE_CHECKING, Any import numpy as np @@ -32,22 +35,28 @@ from pytential.symbolic.matrix import ClusterMatrixBuilderBase from pytential.linalg.utils import IndexList, TargetAndSourceClusterList from pytential.linalg.proxy import ProxyGeneratorBase, ProxyClusterGeometryData +from pytential.linalg.cluster import ClusterTree, cluster from pytential.linalg.direct_solver_symbolic import ( PROXY_SKELETONIZATION_TARGET, PROXY_SKELETONIZATION_SOURCE, prepare_expr, prepare_proxy_expr) +if TYPE_CHECKING: + from pytential.linalg.utils import IndexList, TargetAndSourceClusterList -__doc__ = """ -.. currentmodule:: pytential.linalg +import logging +logger = logging.getLogger(__name__) + +__doc__ = """ Skeletonization ---------------- +~~~~~~~~~~~~~~~ .. autoclass:: SkeletonizationWrangler .. autoclass:: make_skeletonization_wrangler .. autoclass:: SkeletonizationResult .. autofunction:: skeletonize_by_proxy +.. autofunction:: rec_skeletonize_by_proxy """ @@ -107,7 +116,7 @@ def _approximate_geometry_waa_magnitude( actx: PyOpenCLArrayContext, places: GeometryCollection, - cluster_index: "IndexList", + cluster_index: IndexList, domain: sym.DOFDescriptor) -> Array: """ :arg cluster_index: a :class:`~pytential.linalg.IndexList` representing the @@ -132,7 +141,9 @@ def prg(): """ <> ioffset = starts[icluster] <> npoints = starts[icluster + 1] - ioffset - result[icluster] = reduce(sum, i, waa[indices[i + ioffset]]) / npoints + result[icluster] = ( + reduce(sum, i, abs(waa[indices[i + ioffset]])) / npoints + if npoints > 0 else 1.0) """, lang_version=lp.MOST_RECENT_LANGUAGE_VERSION, ) @@ -232,15 +243,17 @@ class SkeletonizationWrangler: .. 
attribute:: proxy_source_cluster_builder .. attribute:: proxy_target_cluster_builder - A callable that is used to evaluate farfield proxy interactions. + A subclass of ``pytential.symbolic.matrix.ClusterMatrixBuilderBase`` + that is used to evaluate farfield proxy interactions. This should follow the calling convention of the constructor to - :class:`pytential.symbolic.matrix.P2PClusterMatrixBuilder`. + ``pytential.symbolic.matrix.P2PClusterMatrixBuilder``. .. attribute:: neighbor_cluster_builder - A callable that is used to evaluate nearfield neighbour interactions. + A subclass of ``pytential.symbolic.matrix.ClusterMatrixBuilderBase`` + that is used to evaluate nearfield neighbour interactions. This should follow the calling convention of the constructor to - :class:`pytential.symbolic.matrix.QBXClusterMatrixBuilder`. + ``pytential.symbolic.matrix.QBXClusterMatrixBuilder``. .. automethod:: evaluate_source_proxy_interaction .. automethod:: evaluate_target_proxy_interaction @@ -291,35 +304,53 @@ def _evaluate_expr(self, context=self.context, **kwargs)(expr) + def evaluate_self( + self, + actx: PyOpenCLArrayContext, + places: GeometryCollection, + tgt_src_index: TargetAndSourceClusterList, + ibrow: int, ibcol: int, + ) -> np.ndarray: + cls = self.neighbor_cluster_builder + return self._evaluate_expr( + actx, places, cls, tgt_src_index, self.exprs[ibrow], + idomain=ibcol, _weighted=True) + # {{{ nearfield - def evaluate_source_neighbor_interaction(self, + def evaluate_source_neighbor_interaction( + self, actx: PyOpenCLArrayContext, places: GeometryCollection, pxy: ProxyClusterGeometryData, nbrindex: IndexList, *, - ibrow: int, ibcol: int) -> tuple[np.ndarray, TargetAndSourceClusterList]: + ibrow: int, ibcol: int, + ) -> tuple[np.ndarray, TargetAndSourceClusterList]: + from pytential.linalg.utils import TargetAndSourceClusterList nbr_src_index = TargetAndSourceClusterList(nbrindex, pxy.srcindex) eval_mapper_cls = self.neighbor_cluster_builder expr = self.exprs[ibrow] mat = self._evaluate_expr( actx, places, eval_mapper_cls, nbr_src_index, expr, - idomain=ibcol, _weighted=self.weighted_sources) + idomain=ibcol, _weighted=True) return mat, nbr_src_index - def evaluate_target_neighbor_interaction(self, + def evaluate_target_neighbor_interaction( + self, actx: PyOpenCLArrayContext, places: GeometryCollection, pxy: ProxyClusterGeometryData, nbrindex: IndexList, *, - ibrow: int, ibcol: int) -> tuple[np.ndarray, TargetAndSourceClusterList]: + ibrow: int, ibcol: int, + ) -> tuple[np.ndarray, TargetAndSourceClusterList]: + from pytential.linalg.utils import TargetAndSourceClusterList tgt_nbr_index = TargetAndSourceClusterList(pxy.srcindex, nbrindex) eval_mapper_cls = self.neighbor_cluster_builder expr = self.exprs[ibrow] mat = self._evaluate_expr( actx, places, eval_mapper_cls, tgt_nbr_index, expr, - idomain=ibcol, _weighted=self.weighted_targets) + idomain=ibcol, _weighted=True) return mat, tgt_nbr_index @@ -327,17 +358,25 @@ def evaluate_target_neighbor_interaction(self, # {{{ proxy - def evaluate_source_proxy_interaction(self, + def evaluate_source_proxy_interaction( + self, actx: PyOpenCLArrayContext, places: GeometryCollection, pxy: ProxyClusterGeometryData, nbrindex: IndexList, *, - ibrow: int, ibcol: int) -> tuple[np.ndarray, TargetAndSourceClusterList]: - from pytential.collection import add_geometry_to_collection + ibrow: int, ibcol: int, + ) -> tuple[np.ndarray, TargetAndSourceClusterList]: + from pytential.linalg.utils import TargetAndSourceClusterList pxy_src_index = 
TargetAndSourceClusterList(pxy.pxyindex, pxy.srcindex) + + from pytential.collection import add_geometry_to_collection places = add_geometry_to_collection( places, {PROXY_SKELETONIZATION_TARGET: pxy.as_targets()} ) + if not self.weighted_sources: + logger.warning("Source-Proxy weighting is turned off. This will not give " + "good results for skeletonization.", stacklevel=3) + eval_mapper_cls = self.proxy_source_cluster_builder expr = self.source_proxy_exprs[ibrow] mat = self._evaluate_expr( @@ -348,13 +387,17 @@ def evaluate_source_proxy_interaction(self, return mat, pxy_src_index - def evaluate_target_proxy_interaction(self, + def evaluate_target_proxy_interaction( + self, actx: PyOpenCLArrayContext, places: GeometryCollection, pxy: ProxyClusterGeometryData, nbrindex: IndexList, *, - ibrow: int, ibcol: int) -> tuple[np.ndarray, TargetAndSourceClusterList]: - from pytential.collection import add_geometry_to_collection + ibrow: int, ibcol: int, + ) -> tuple[np.ndarray, TargetAndSourceClusterList]: + from pytential.linalg.utils import TargetAndSourceClusterList tgt_pxy_index = TargetAndSourceClusterList(pxy.srcindex, pxy.pxyindex) + + from pytential.collection import add_geometry_to_collection places = add_geometry_to_collection( places, {PROXY_SKELETONIZATION_SOURCE: pxy.as_sources()} ) @@ -371,6 +414,9 @@ def evaluate_target_proxy_interaction(self, mat = _apply_weights( actx, mat, places, tgt_pxy_index, nbrindex, self.domains[ibcol]) + else: + logger.warning("Target-Proxy weighting is turned off. This will not give " + "good results for skeletonization.", stacklevel=3) return mat, tgt_pxy_index @@ -541,7 +587,8 @@ def _evaluate_proxy_skeletonization_interaction( places: GeometryCollection, proxy_generator: ProxyGeneratorBase, wrangler: SkeletonizationWrangler, - cluster_index: IndexList, *, + source_index: IndexList, + target_index: IndexList, *, evaluate_proxy: Callable[..., tuple[np.ndarray, TargetAndSourceClusterList]], evaluate_neighbor: Callable[..., @@ -553,23 +600,24 @@ each cluster in *source_index*.
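+
+    Proxy points are generated around the clusters of *source_index*, while
+    the neighboring points are gathered from the points in *target_index*.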
""" - if cluster_index.nclusters == 1: + if source_index.nclusters == 1: raise ValueError("cannot make a proxy skeleton for a single cluster") - from pytential.linalg import gather_cluster_neighbor_points - pxy = proxy_generator(actx, dofdesc, cluster_index) + from pytential.linalg.proxy import gather_cluster_neighbor_points + pxy = proxy_generator(actx, dofdesc, source_index) nbrindex = gather_cluster_neighbor_points( - actx, pxy, + actx, pxy, target_index, max_particles_in_box=max_particles_in_box) pxymat, pxy_cluster_index = evaluate_proxy(actx, places, pxy, nbrindex) nbrmat, nbr_cluster_index = evaluate_neighbor(actx, places, pxy, nbrindex) - - return _ProxyNeighborEvaluationResult( + result = _ProxyNeighborEvaluationResult( pxy=pxy, pxymat=pxymat, pxyindex=pxy_cluster_index, nbrmat=nbrmat, nbrindex=nbr_cluster_index) + return result + def _skeletonize_block_by_proxy_with_mats( actx: PyOpenCLArrayContext, ibrow: int, ibcol: int, @@ -581,7 +629,7 @@ def _skeletonize_block_by_proxy_with_mats( id_rank: int | None = None, max_particles_in_box: int | None = None, rng: np.random.Generator | None = None, - ) -> "SkeletonizationResult": + ) -> SkeletonizationResult: nclusters = tgt_src_index.nclusters if nclusters == 1: raise ValueError("cannot make a proxy skeleton for a single cluster") @@ -595,7 +643,7 @@ def _skeletonize_block_by_proxy_with_mats( max_particles_in_box=max_particles_in_box) src_result = evaluate_skeletonization_interaction( - tgt_src_index.sources, + tgt_src_index.sources, tgt_src_index.targets, evaluate_proxy=partial( wrangler.evaluate_source_proxy_interaction, ibrow=ibrow, ibcol=ibcol), @@ -604,7 +652,7 @@ def _skeletonize_block_by_proxy_with_mats( ibrow=ibrow, ibcol=ibcol), ) tgt_result = evaluate_skeletonization_interaction( - tgt_src_index.targets, + tgt_src_index.targets, tgt_src_index.sources, evaluate_proxy=partial( wrangler.evaluate_target_proxy_interaction, ibrow=ibrow, ibcol=ibcol), @@ -613,8 +661,8 @@ def _skeletonize_block_by_proxy_with_mats( ibrow=ibrow, ibcol=ibcol) ) - src_skl_indices: np.ndarray = np.empty(nclusters, dtype=object) - tgt_skl_indices: np.ndarray = np.empty(nclusters, dtype=object) + skel_src_indices: np.ndarray = np.empty(nclusters, dtype=object) + skel_tgt_indices: np.ndarray = np.empty(nclusters, dtype=object) skel_starts: np.ndarray = np.zeros(nclusters + 1, dtype=np.int32) L: np.ndarray = np.empty(nclusters, dtype=object) @@ -643,7 +691,7 @@ def _skeletonize_block_by_proxy_with_mats( interp = interp[:k, :] L[i] = interp.T - tgt_skl_indices[i] = tgt_src_index.targets.cluster_indices(i)[idx[:k]] + skel_tgt_indices[i] = tgt_src_index.targets.cluster_indices(i)[idx[:k]] assert L[i].shape == (tgt_mat.shape[0], k) # skeletonize source points @@ -651,22 +699,49 @@ def _skeletonize_block_by_proxy_with_mats( assert 0 < k <= len(idx) R[i] = interp - src_skl_indices[i] = tgt_src_index.sources.cluster_indices(i)[idx[:k]] + skel_src_indices[i] = tgt_src_index.sources.cluster_indices(i)[idx[:k]] assert R[i].shape == (k, src_mat.shape[1]) skel_starts[i + 1] = skel_starts[i] + k - assert tgt_skl_indices[i].shape == src_skl_indices[i].shape + assert skel_tgt_indices[i].shape == skel_src_indices[i].shape + + # evaluate diagonal + from pytential.linalg.utils import make_flat_cluster_diag + mat = wrangler.evaluate_self(actx, places, tgt_src_index, ibrow, ibcol) + D = make_flat_cluster_diag(mat, tgt_src_index) - from pytential.linalg import make_index_list - src_skl_index = make_index_list(np.hstack(list(src_skl_indices)), skel_starts) - tgt_skl_index = 
make_index_list(np.hstack(list(tgt_skl_indices)), skel_starts) - skel_tgt_src_index = TargetAndSourceClusterList(tgt_skl_index, src_skl_index) + from pytential.linalg import TargetAndSourceClusterList, make_index_list + skel_src_index = make_index_list(np.hstack(list(skel_src_indices)), skel_starts) + skel_tgt_index = make_index_list(np.hstack(list(skel_tgt_indices)), skel_starts) + skel_tgt_src_index = TargetAndSourceClusterList(skel_tgt_index, skel_src_index) return SkeletonizationResult( - L=L, R=R, + L=L, R=R, D=D, tgt_src_index=tgt_src_index, skel_tgt_src_index=skel_tgt_src_index, _src_eval_result=src_result, _tgt_eval_result=tgt_result) + +def _evaluate_root( + actx: PyOpenCLArrayContext, ibrow: int, ibcol: int, + places: GeometryCollection, + wrangler: SkeletonizationWrangler, + tgt_src_index: TargetAndSourceClusterList + ) -> SkeletonizationResult: + assert tgt_src_index.nclusters == 1 + + from pytential.linalg.utils import make_flat_cluster_diag + mat = wrangler.evaluate_self(actx, places, tgt_src_index, ibrow, ibcol) + D = make_flat_cluster_diag(mat, tgt_src_index) + + from pytools.obj_array import make_obj_array + return SkeletonizationResult( + L=make_obj_array([np.eye(*D[0].shape)]), + R=make_obj_array([np.eye(*D[0].shape)]), + D=D, + tgt_src_index=tgt_src_index, skel_tgt_src_index=tgt_src_index, + _src_eval_result=None, _tgt_eval_result=None, + ) + # }}} @@ -705,21 +780,27 @@ class SkeletonizationResult: An object :class:`~numpy.ndarray` of size ``(nclusters,)``. + .. attribute:: D + + An object :class:`~numpy.ndarray` of size ``(nclusters,)``. This + contains the diagonal blocks for :attr:`tgt_src_index`. + .. attribute:: tgt_src_index - A :class:`~pytential.linalg.TargetAndSourceClusterList` representing the - indices in the original matrix :math:`A` that have been skeletonized. + A :class:`~pytential.linalg.utils.TargetAndSourceClusterList` representing + the indices in the original matrix :math:`A` that has been skeletonized. .. attribute:: skel_tgt_src_index - A :class:`~pytential.linalg.TargetAndSourceClusterList` representing a - subset of :attr:`tgt_src_index`, i.e. the skeleton of each cluster of + A :class:`~pytential.linalg.utils.TargetAndSourceClusterList` representing + a subset of :attr:`tgt_src_index`, i.e. the skeleton of each cluster of :math:`A`. These indices can be used to reconstruct the :math:`S` matrix. """ L: np.ndarray R: np.ndarray + D: np.ndarray tgt_src_index: TargetAndSourceClusterList skel_tgt_src_index: TargetAndSourceClusterList @@ -759,6 +840,7 @@ def skeletonize_by_proxy( input_exprs: sym.var | Sequence[sym.var], *, domains: Sequence[Hashable] | None = None, context: dict[str, Any] | None = None, + auto_where: Any = None, approx_nproxy: int | None = None, proxy_radius_factor: float | None = None, @@ -769,7 +851,7 @@ def skeletonize_by_proxy( max_particles_in_box: int | None = None) -> np.ndarray: r"""Evaluate and skeletonize a symbolic expression using proxy-based methods. - :arg tgt_src_index: a :class:`~pytential.linalg.TargetAndSourceClusterList` + :arg tgt_src_index: a :class:`~pytential.linalg.utils.TargetAndSourceClusterList` indicating which indices participate in the skeletonization. :arg exprs: see :func:`make_skeletonization_wrangler`. @@ -777,8 +859,8 @@ def skeletonize_by_proxy( :arg domains: see :func:`make_skeletonization_wrangler`. :arg context: see :func:`make_skeletonization_wrangler`. - :arg approx_nproxy: see :class:`~pytential.linalg.ProxyGenerator`. 
- :arg proxy_radius_factor: see :class:`~pytential.linalg.ProxyGenerator`. + :arg approx_nproxy: see :class:`~pytential.linalg.proxy.ProxyGenerator`. + :arg proxy_radius_factor: see :class:`~pytential.linalg.proxy.ProxyGenerator`. :arg id_eps: a floating point value used as a tolerance in skeletonizing each block in *tgt_src_index*. @@ -792,21 +874,105 @@ def skeletonize_by_proxy( from pytential.linalg.proxy import QBXProxyGenerator wrangler = make_skeletonization_wrangler( places, exprs, input_exprs, - domains=domains, context=context) + domains=domains, context=context, auto_where=auto_where) proxy = QBXProxyGenerator(places, approx_nproxy=approx_nproxy, radius_factor=proxy_radius_factor) + if wrangler.nrows != 1 or wrangler.ncols != 1: + raise NotImplementedError("support for block matrices") + + from itertools import product + skels: np.ndarray = np.empty((wrangler.nrows, wrangler.ncols), dtype=object) - for ibrow in range(wrangler.nrows): - for ibcol in range(wrangler.ncols): - skels[ibrow, ibcol] = _skeletonize_block_by_proxy_with_mats( - actx, ibrow, ibcol, places, proxy, wrangler, tgt_src_index, - id_eps=id_eps, - id_rank=id_rank, - max_particles_in_box=max_particles_in_box, - rng=rng) + for ibrow, ibcol in product(range(wrangler.nrows), range(wrangler.ncols)): + skels[ibrow, ibcol] = _skeletonize_block_by_proxy_with_mats( + actx, ibrow, ibcol, places, proxy, wrangler, tgt_src_index, + id_eps=id_eps, id_rank=id_rank, + max_particles_in_box=max_particles_in_box, + rng=rng) return skels # }}} + + +# {{{ recursive skeletonization by proxy + +def rec_skeletonize_by_proxy( + actx: PyOpenCLArrayContext, + places: GeometryCollection, + + ctree: ClusterTree, + tgt_src_index: TargetAndSourceClusterList, + exprs: sym.var | Sequence[sym.var], + input_exprs: sym.var | Sequence[sym.var], *, + domains: Sequence[Hashable] | None = None, + context: dict[str, Any] | None = None, + auto_where: Any = None, + + approx_nproxy: int | None = None, + proxy_radius_factor: float | None = None, + + id_eps: float | None = None, + rng: np.random.Generator | None = None, + max_particles_in_box: int | None = None, + + _wrangler: SkeletonizationWrangler | None = None, + _proxy: ProxyGeneratorBase | None = None) -> np.ndarray: + r"""Performs recursive skeletonization based on :func:`skeletonize_by_proxy`. + + :returns: an object :class:`~numpy.ndarray` of :class:`SkeletonizationResult`\ s, + one per level in *ctree*. + """ + + assert ctree.nclusters == tgt_src_index.nclusters + + if id_eps is None: + id_eps = 1.0e-8 + + if _proxy is None: + from pytential.linalg.proxy import QBXProxyGenerator + proxy: ProxyGeneratorBase = QBXProxyGenerator(places, + approx_nproxy=approx_nproxy, + radius_factor=proxy_radius_factor) + else: + proxy = _proxy + + if _wrangler is None: + wrangler = make_skeletonization_wrangler( + places, exprs, input_exprs, + domains=domains, context=context, auto_where=auto_where) + else: + wrangler = _wrangler + + if wrangler.nrows != 1 or wrangler.ncols != 1: + raise NotImplementedError("support for block matrices") + + from itertools import product + skel_per_level: np.ndarray = np.empty(ctree.nlevels, dtype=object) + for i, clevel in enumerate(ctree.levels[:-1]): + for ibrow, ibcol in product(range(wrangler.nrows), range(wrangler.ncols)): + skeleton = _skeletonize_block_by_proxy_with_mats( + actx, ibrow, ibcol, proxy.places, proxy, wrangler, tgt_src_index, + id_eps=id_eps, + # NOTE: we probably never want to set the rank here? 
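+                # (with only 'id_eps' given, the interpolative decomposition
+                # picks the rank of each cluster adaptively)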
+ id_rank=None, + rng=rng, + max_particles_in_box=max_particles_in_box) + + skel_per_level[i] = skeleton + tgt_src_index = cluster(skeleton.skel_tgt_src_index, clevel) + + assert tgt_src_index.nclusters == 1 + assert not isinstance(skel_per_level[-1], SkeletonizationResult) + + # evaluate the full root cluster (no skeletonization or anything) + skeleton = _evaluate_root(actx, 0, 0, places, wrangler, tgt_src_index) + skel_per_level[-1] = skeleton + + return skel_per_level + +# }}} + +# vim: foldmethod=marker diff --git a/pytential/linalg/utils.py b/pytential/linalg/utils.py index eca334568..ee64b1847 100644 --- a/pytential/linalg/utils.py +++ b/pytential/linalg/utils.py @@ -1,3 +1,6 @@ +from __future__ import annotations + + __copyright__ = "Copyright (C) 2018-2022 Alexandru Fikl" __license__ = """ @@ -34,16 +37,14 @@ __doc__ = """ -Misc -~~~~ - -.. currentmodule:: pytential.linalg - .. autoclass:: IndexList .. autoclass:: TargetAndSourceClusterList .. autofunction:: make_index_list .. autofunction:: make_index_cluster_cartesian_product +.. autofunction:: make_flat_cluster_diag + +.. autofunction:: interp_decomp """ @@ -54,6 +55,7 @@ class IndexList: """Convenience class for working with clusters (subsets) of an array. .. attribute:: nclusters + .. attribute:: nindices .. attribute:: indices An :class:`~numpy.ndarray` of not necessarily continuous or increasing @@ -78,6 +80,10 @@ class IndexList: def nclusters(self) -> int: return self.starts.size - 1 + @property + def nindices(self) -> int: + return self.indices.size + def cluster_size(self, i: int) -> int: if not (0 <= i < self.nclusters): raise IndexError( @@ -113,6 +119,11 @@ class TargetAndSourceClusterList: """Convenience class for working with clusters (subsets) of a matrix. .. attribute:: nclusters + + .. attribute:: shape + + Shape of the Cartesian product of the :attr:`targets` and :attr:`sources`. + .. attribute:: targets An :class:`IndexList` encapsulating target cluster indices. @@ -153,6 +164,13 @@ def _flat_cluster_starts(self): def _flat_total_size(self): return self._flat_cluster_starts[-1] + def __iter__(self): + return iter((self.targets, self.sources)) + + @property + def shape(self): + return (self.targets.nindices, self.sources.nindices) + def cluster_shape(self, i: int, j: int) -> tuple[int, int]: r""" :returns: the shape of the cluster ``(i, j)``, where *i* indexes into @@ -308,15 +326,14 @@ def make_flat_cluster_diag( correspondence to the index sets constructed by :func:`make_index_cluster_cartesian_product` for *mindex*. - :returns: a block diagonal object :class:`~numpy.ndarray`, where each - diagonal element :math:`(i, i)` is the reshaped slice of *mat* that - corresponds to the cluster :math:`i`. + :returns: an object :class:`~numpy.ndarray`, where each element represents + the block of a block-diagonal matrix. """ - cluster_mat: np.ndarray = np.full((mindex.nclusters, mindex.nclusters), 0, - dtype=object) + + cluster_mat: np.ndarray = np.empty(mindex.nclusters, dtype=object) for i in range(mindex.nclusters): shape = mindex.cluster_shape(i, i) - cluster_mat[i, i] = mindex.flat_cluster_take(mat, i).reshape(*shape) + cluster_mat[i] = mindex.flat_cluster_take(mat, i).reshape(*shape) return cluster_mat @@ -337,8 +354,8 @@ def interp_decomp( :return: a tuple ``(k, idx, interp)`` containing the numerical rank *k*, the column indices *idx* and the resulting interpolation matrix *interp*. 
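
    A sketch of the intended use, assuming the reconstruction convention of
    :mod:`scipy.linalg.interpolative`:

    .. code-block:: python

        k, idx, interp = interp_decomp(A, rank=None, eps=1.0e-8)
        # the k skeleton columns of A reconstruct it to the given tolerance
        A_approx = A[:, idx[:k]] @ interp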
""" - if rank is not None and eps is not None: - raise ValueError("providing both 'rank' and 'eps' is not supported") + if (rank is not None and eps is not None) or (rank is None and eps is None): + raise ValueError("either 'rank' or 'eps' must be provided (not both)") from scipy import __version__ @@ -366,7 +383,7 @@ def interp_decomp( # {{{ cluster matrix errors def cluster_skeletonization_error( - mat: np.ndarray, skeleton: "SkeletonizationResult", *, + mat: np.ndarray, skeleton: SkeletonizationResult, *, ord: float | None = None, relative: bool = False) -> tuple[np.ndarray, np.ndarray]: r"""Evaluate the cluster-wise skeletonization errors for the given *skeleton*. @@ -402,7 +419,7 @@ def cluster_skeletonization_error( tgt_src_index = skeleton.tgt_src_index nclusters = skeleton.nclusters - def mnorm(x: np.ndarray, y: np.ndarray) -> "np.floating[Any]": + def mnorm(x: np.ndarray, y: np.ndarray) -> np.floating[Any]: result = la.norm(x - y, ord=ord) if relative: result = result / la.norm(x, ord=ord) @@ -440,9 +457,9 @@ def mnorm(x: np.ndarray, y: np.ndarray) -> "np.floating[Any]": def skeletonization_error( - mat: np.ndarray, skeleton: "SkeletonizationResult", *, + mat: np.ndarray, skeleton: SkeletonizationResult, *, ord: float | None = None, - relative: bool = False) -> "np.floating[Any]": + relative: bool = False) -> np.floating[Any]: r"""Computes the skeletonization error for the entire matrix *mat*. The error computed here is given by @@ -499,3 +516,5 @@ def skeletonization_error( return result # }}} + +# vim: foldmethod=marker diff --git a/pytential/symbolic/matrix.py b/pytential/symbolic/matrix.py index 99ca6a174..6ec13c496 100644 --- a/pytential/symbolic/matrix.py +++ b/pytential/symbolic/matrix.py @@ -284,7 +284,7 @@ def map_call(self, expr): class ClusterMatrixBuilderBase(MatrixBuilderBase): """Evaluate individual clusters of a matrix operator, as defined by a - :class:`~pytential.linalg.TargetAndSourceClusterList`. + :class:`~pytential.linalg.utils.TargetAndSourceClusterList`. Unlike, e.g. :class:`MatrixBuilder`, matrix cluster builders are significantly reduced in scope. They are basically just meant @@ -304,7 +304,8 @@ def __init__(self, tgt_src_index: TargetAndSourceClusterList, context: dict[str, Any]) -> None: """ - :arg tgt_src_index: a :class:`~pytential.linalg.TargetAndSourceClusterList` + :arg tgt_src_index: a + :class:`~pytential.linalg.utils.TargetAndSourceClusterList` class describing which clusters are going to be evaluated. 
""" diff --git a/test/extra_matrix_data.py b/test/extra_matrix_data.py index 7c2b8e5a6..d4ab820a6 100644 --- a/test/extra_matrix_data.py +++ b/test/extra_matrix_data.py @@ -53,8 +53,8 @@ def get_cluster_index(self, actx, places, dofdesc=None): if max_particles_in_box is None: max_particles_in_box = discr.ndofs // self.approx_cluster_count - from pytential.linalg import partition_by_nodes - cindex = partition_by_nodes(actx, places, + from pytential.linalg.cluster import partition_by_nodes + cindex, ctree = partition_by_nodes(actx, places, dofdesc=dofdesc, tree_kind=self.tree_kind, max_particles_in_box=max_particles_in_box) @@ -76,12 +76,12 @@ def get_cluster_index(self, actx, places, dofdesc=None): from pytential.linalg import make_index_list cindex = make_index_list(subset) - return cindex + return cindex, ctree def get_tgt_src_cluster_index(self, actx, places, dofdesc=None): from pytential.linalg import TargetAndSourceClusterList - cindex = self.get_cluster_index(actx, places, dofdesc=dofdesc) - return TargetAndSourceClusterList(cindex, cindex) + cindex, ctree = self.get_cluster_index(actx, places, dofdesc=dofdesc) + return TargetAndSourceClusterList(cindex, cindex), ctree def get_operator(self, ambient_dim, qbx_forced_limit=_NoArgSentinel): knl = self.knl_class(ambient_dim) diff --git a/test/test_linalg_cluster.py b/test/test_linalg_cluster.py new file mode 100644 index 000000000..ed8b2045e --- /dev/null +++ b/test/test_linalg_cluster.py @@ -0,0 +1,129 @@ +__copyright__ = "Copyright (C) 2022 Alexandru Fikl" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + +import pytest + +import numpy as np + +from pytential import GeometryCollection +from meshmode.mesh.generation import NArmedStarfish + +from meshmode import _acf # noqa: F401 +from arraycontext import pytest_generate_tests_for_array_contexts +from meshmode.array_context import PytestPyOpenCLArrayContextFactory + +import extra_matrix_data as extra +import logging +logger = logging.getLogger(__name__) + +pytest_generate_tests = pytest_generate_tests_for_array_contexts([ + PytestPyOpenCLArrayContextFactory, + ]) + +CLUSTER_TEST_CASES = [ + extra.CurveTestCase( + name="starfish", + target_order=4, + curve_fn=NArmedStarfish(5, 0.25), + resolutions=[64]), + extra.TorusTestCase( + target_order=4, + resolutions=[1]) + ] + + +# {{{ test_cluster_tree + +@pytest.mark.parametrize(("case", "tree_kind"), [ + (CLUSTER_TEST_CASES[0], None), + (CLUSTER_TEST_CASES[0], "adaptive"), + (CLUSTER_TEST_CASES[0], "adaptive-level-restricted"), + (CLUSTER_TEST_CASES[1], "adaptive"), + ]) +def test_cluster_tree(actx_factory, case, tree_kind, visualize=False): + if visualize: + logging.basicConfig(level=logging.INFO) + + from dataclasses import replace + actx = actx_factory() + case = replace(case, tree_kind=tree_kind) + logger.info("\n%s", case) + + discr = case.get_discretization(actx, case.resolutions[-1], case.target_order) + places = GeometryCollection(discr, auto_where=case.name) + + srcindex, ctree = case.get_cluster_index(actx, places) + assert srcindex.nclusters == ctree.nclusters + + from pytential.linalg.cluster import split_array + rng = np.random.default_rng(42) + x = split_array(rng.random(srcindex.indices.shape), srcindex) + + logger.info("nclusters %4d nlevels %4d", srcindex.nclusters, ctree.nlevels) + + if visualize and ctree._tree is not None: + import matplotlib.pyplot as plt + fig = plt.figure(figsize=(10, 10), dpi=300) + + from boxtree.visualization import TreePlotter + plotter = TreePlotter(ctree._tree) + plotter.draw_tree(fill=False, edgecolor="black", zorder=10) + plotter.draw_box_numbers() + plotter.set_bounding_box() + + fig.savefig("test_cluster_tree") + + from pytential.linalg.cluster import cluster, uncluster + for clevel in ctree.levels: + logger.info("======== Level %d", clevel.level) + logger.info("box_ids %s", clevel.box_ids) + logger.info("sizes %s", np.diff(srcindex.starts)) + logger.info("parent_map %s", clevel.parent_map) + + assert srcindex.nclusters == clevel.nclusters + + next_srcindex = cluster(srcindex, clevel) + for i, ppm in enumerate(clevel.parent_map): + partition = np.concatenate([srcindex.cluster_indices(j) for j in ppm]) + + assert partition.size == next_srcindex.cluster_size(i) + assert np.allclose(partition, next_srcindex.cluster_indices(i)) + + y = cluster(x, clevel) + z = uncluster(y, srcindex, clevel) + assert all(np.allclose(xi, zi) for xi, zi in zip(x, z, strict=True)) + + srcindex = next_srcindex + x = y + +# }}} + + +if __name__ == "__main__": + import sys + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + from pytest import main + main([__file__]) + +# vim: fdm=marker diff --git a/test/test_linalg_proxy.py b/test/test_linalg_proxy.py index 99029b63d..cb1184896 100644 --- a/test/test_linalg_proxy.py +++ b/test/test_linalg_proxy.py @@ -29,7 +29,7 @@ from arraycontext import flatten, unflatten from pytential import bind, sym from pytential import GeometryCollection -from pytential.linalg import ProxyGenerator, QBXProxyGenerator +from pytential.linalg.proxy import ProxyGenerator, QBXProxyGenerator from meshmode.mesh.generation import ellipse, 
NArmedStarfish from meshmode import _acf # noqa: F401 @@ -214,7 +214,7 @@ def test_partition_points(actx_factory, tree_kind, case, visualize=False): places = GeometryCollection(qbx, auto_where=case.name) density_discr = places.get_discretization(case.name) - mindex = case.get_cluster_index(actx, places) + mindex, _ = case.get_cluster_index(actx, places) expected_indices = np.arange(0, density_discr.ndofs) assert mindex.starts[-1] == density_discr.ndofs @@ -261,7 +261,7 @@ def test_proxy_generator(actx_factory, case, places = GeometryCollection(qbx, auto_where=case.name) density_discr = places.get_discretization(case.name) - cindex = case.get_cluster_index(actx, places) + cindex, _ = case.get_cluster_index(actx, places) generator = proxy_generator_cls(places, approx_nproxy=case.proxy_approx_count, @@ -307,7 +307,7 @@ def test_proxy_generator(actx_factory, case, ProxyGenerator, QBXProxyGenerator, ]) @pytest.mark.parametrize("index_sparsity_factor", [1.0, 0.6]) -@pytest.mark.parametrize("proxy_radius_factor", [1, 1.1]) +@pytest.mark.parametrize("proxy_radius_factor", [1.0, 1.1]) def test_neighbor_points(actx_factory, case, proxy_generator_cls, index_sparsity_factor, proxy_radius_factor, visualize=False): @@ -330,7 +330,7 @@ def test_neighbor_points(actx_factory, case, dofdesc = places.auto_source density_discr = places.get_discretization(dofdesc.geometry) - srcindex = case.get_cluster_index(actx, places) + srcindex, _ = case.get_cluster_index(actx, places) # generate proxy points generator = proxy_generator_cls(places, @@ -339,8 +339,8 @@ def test_neighbor_points(actx_factory, case, pxy = generator(actx, dofdesc, srcindex) # get neighboring points - from pytential.linalg import gather_cluster_neighbor_points - nbrindex = gather_cluster_neighbor_points(actx, pxy) + from pytential.linalg.proxy import gather_cluster_neighbor_points + nbrindex = gather_cluster_neighbor_points(actx, pxy, srcindex) pxy = pxy.to_numpy(actx) nodes = actx.to_numpy( diff --git a/test/test_linalg_skeletonization.py b/test/test_linalg_skeletonization.py index bc61e3986..89e0eb091 100644 --- a/test/test_linalg_skeletonization.py +++ b/test/test_linalg_skeletonization.py @@ -107,6 +107,7 @@ def test_skeletonize_symbolic(actx_factory, case, visualize=False): """ actx = actx_factory() + rng = np.random.default_rng(42) if visualize: logging.basicConfig(level=logging.INFO) @@ -119,40 +120,21 @@ def test_skeletonize_symbolic(actx_factory, case, visualize=False): places = GeometryCollection(qbx, auto_where=dd) density_discr = places.get_discretization(dd.geometry, dd.discr_stage) - tgt_src_index = case.get_tgt_src_cluster_index(actx, places, dd) + tgt_src_index, ctree = case.get_tgt_src_cluster_index(actx, places, dd) logger.info("nclusters %3d ndofs %7d", tgt_src_index.nclusters, density_discr.ndofs) # }}} - # {{{ wranglers - - from pytential.linalg import QBXProxyGenerator - from pytential.linalg.skeletonization import make_skeletonization_wrangler - proxy_generator = QBXProxyGenerator(places, - radius_factor=case.proxy_radius_factor, - approx_nproxy=case.proxy_approx_count) - + from pytential.linalg.skeletonization import rec_skeletonize_by_proxy sym_u, sym_op = case.get_operator(places.ambient_dim) - wrangler = make_skeletonization_wrangler(places, sym_op, sym_u, - domains=None, - context=case.knl_concrete_kwargs, - _weighted_proxy=case.weighted_proxy, - _proxy_source_cluster_builder=case.proxy_source_cluster_builder, - _proxy_target_cluster_builder=case.proxy_target_cluster_builder, - 
_neighbor_cluster_builder=case.neighbor_cluster_builder) - - # }}} - - from pytential.linalg.skeletonization import ( - _skeletonize_block_by_proxy_with_mats) - - rng = np.random.default_rng(42) - _skeletonize_block_by_proxy_with_mats( - actx, 0, 0, places, proxy_generator, wrangler, tgt_src_index, + rec_skeletonize_by_proxy( + actx, places, ctree, tgt_src_index, sym_op, sym_u, + context=case.knl_concrete_kwargs, + auto_where=dd, id_eps=1.0e-8, - rng=rng, + rng=rng ) # }}} @@ -165,6 +147,7 @@ def run_skeletonize_by_proxy(actx, case, resolution, places=None, mat=None, ctol=None, rtol=None, rng=None, + tgt_src_index=None, suffix="", visualize=False): from pytools import ProcessTimer @@ -174,9 +157,12 @@ def run_skeletonize_by_proxy(actx, case, resolution, if places is None: qbx = case.get_layer_potential(actx, resolution, case.target_order) places = GeometryCollection(qbx, auto_where=dd) + else: + qbx = places.get_geometry(dd.geometry) density_discr = places.get_discretization(dd.geometry, dd.discr_stage) - tgt_src_index = case.get_tgt_src_cluster_index(actx, places, dd) + if tgt_src_index is None: + tgt_src_index, _ = case.get_tgt_src_cluster_index(actx, places, dd) logger.info("nclusters %3d ndofs %7d", tgt_src_index.nclusters, density_discr.ndofs) @@ -193,7 +179,7 @@ def run_skeletonize_by_proxy(actx, case, resolution, logger.info("proxy factor %.2f count %7d", case.proxy_radius_factor, proxy_approx_count) - from pytential.linalg import QBXProxyGenerator + from pytential.linalg.proxy import QBXProxyGenerator from pytential.linalg.skeletonization import make_skeletonization_wrangler proxy_generator = QBXProxyGenerator(places, radius_factor=case.proxy_radius_factor, @@ -201,8 +187,8 @@ def run_skeletonize_by_proxy(actx, case, resolution, sym_u, sym_op = case.get_operator(places.ambient_dim) wrangler = make_skeletonization_wrangler(places, sym_op, sym_u, - domains=None, context=case.knl_concrete_kwargs, + auto_where=dd, _weighted_proxy=case.weighted_proxy, _proxy_source_cluster_builder=case.proxy_source_cluster_builder, _proxy_target_cluster_builder=case.proxy_target_cluster_builder, @@ -247,10 +233,13 @@ def run_skeletonize_by_proxy(actx, case, resolution, logger.info("[time] skeletonization by proxy: %s", p) + def intersect1d(x, y): + return np.where((x.reshape(1, -1) - y.reshape(-1, 1)) == 0)[1] + L, R = skeleton.L, skeleton.R for i in range(tgt_src_index.nclusters): # targets (rows) - bi = np.searchsorted( + bi = intersect1d( tgt_src_index.targets.cluster_indices(i), skeleton.skel_tgt_src_index.targets.cluster_indices(i), ) @@ -260,7 +249,7 @@ def run_skeletonize_by_proxy(actx, case, resolution, tgt_error = la.norm(A - L[i] @ S, ord=ord) / la.norm(A, ord=ord) # sources (columns) - bj = np.searchsorted( + bj = intersect1d( tgt_src_index.sources.cluster_indices(i), skeleton.skel_tgt_src_index.sources.cluster_indices(i), ) @@ -274,8 +263,8 @@ def run_skeletonize_by_proxy(actx, case, resolution, src_error, tgt_error, R[i].shape[0], R[i].shape[1]) if ctol is not None: - assert src_error < ctol * case.id_eps - assert tgt_error < ctol * case.id_eps + assert src_error < ctol + assert tgt_error < ctol # }}} @@ -306,9 +295,9 @@ def run_skeletonize_by_proxy(actx, case, resolution, rtol if rtol is not None else 0.0) if rtol: - assert err_l < rtol * case.id_eps - assert err_r < rtol * case.id_eps - assert err_f < rtol * case.id_eps + assert err_l < rtol + assert err_r < rtol + assert err_f < rtol # }}} @@ -348,19 +337,18 @@ def run_skeletonize_by_proxy(actx, case, resolution, # }}} - return err_f, 
(places, mat) + return err_f, (places, mat, skeleton) @pytest.mark.parametrize("case", [ - # NOTE: skip 2d tests, since they're better checked for convergence in - # `test_skeletonize_by_proxy_convergence` - # SKELETONIZE_TEST_CASES[0], SKELETONIZE_TEST_CASES[1], + SKELETONIZE_TEST_CASES[0], + SKELETONIZE_TEST_CASES[1], SKELETONIZE_TEST_CASES[2], ]) def test_skeletonize_by_proxy(actx_factory, case, visualize=False): - r"""Test single-level skeletonization accuracy. Checks that the error - satisfies :math:`e < c \epsilon_{id}` for a fixed ID tolerance and an - empirically determined (not too huge) :math:`c`. + r"""Test multilevel skeletonization accuracy. Checks that the error for + every level satisfies :math:`e < c \epsilon_{id}` for a fixed ID tolerance + and an empirically determined (not too huge) :math:`c`. """ import scipy.linalg.interpolative as sli @@ -376,13 +364,27 @@ def test_skeletonize_by_proxy(actx_factory, case, visualize=False): case = replace(case, approx_cluster_count=6, id_eps=1.0e-8) logger.info("\n%s", case) - run_skeletonize_by_proxy( - actx, case, case.resolutions[0], - ctol=10, - # FIXME: why is the 3D error so large? - rtol=10**case.ambient_dim, - rng=rng, - visualize=visualize) + dd = sym.DOFDescriptor(case.name, discr_stage=case.skel_discr_stage) + qbx = case.get_layer_potential(actx, case.resolutions[0], case.target_order) + places = GeometryCollection(qbx, auto_where=dd) + + tgt_src_index, ctree = case.get_tgt_src_cluster_index(actx, places, dd) + mat = None + + from pytential.linalg.cluster import cluster + for clevel in ctree.levels[:-1]: + logger.info("[%2d/%2d] nclusters %3d", + clevel.level, ctree.nlevels, clevel.nclusters) + + _, (_, mat, skeleton) = run_skeletonize_by_proxy( + actx, case, case.resolutions[0], + ctol=10 * case.id_eps, + # FIXME: why is the 3D error so large? 
+ rtol=10**case.ambient_dim * case.id_eps, + places=places, mat=mat, rng=rng, tgt_src_index=tgt_src_index, + visualize=visualize) + + tgt_src_index = cluster(skeleton.skel_tgt_src_index, clevel) # }}} @@ -448,9 +450,11 @@ def test_skeletonize_by_proxy_convergence( places = mat = None for i in range(id_eps.size): case = replace(case, id_eps=id_eps[i], weighted_proxy=weighted) - rec_error[i], (places, mat) = run_skeletonize_by_proxy( - actx, case, r, places=places, mat=mat, - suffix=f"{suffix}_{i:04d}", rng=rng, visualize=False) + + if not was_zero: + rec_error[i], (places, mat, _) = run_skeletonize_by_proxy( + actx, case, r, places=places, mat=mat, rng=rng, + suffix=f"{suffix}_{i:04d}", visualize=False) was_zero = rec_error[i] == 0.0 eoc.add_data_point(id_eps[i], rec_error[i]) diff --git a/test/test_matrix.py b/test/test_matrix.py index 905ae70bf..4d9c416bb 100644 --- a/test/test_matrix.py +++ b/test/test_matrix.py @@ -61,7 +61,7 @@ def max_cluster_error(mat, clusters, mindex, p=None): error = max( error, - la.norm(mat_i - clusters[i, i], ord=p) / norm_mat_i + la.norm(mat_i - clusters[i], ord=p) / norm_mat_i ) return error @@ -346,7 +346,7 @@ def test_cluster_builder(actx_factory, ambient_dim, # {{{ matrix - mindex = case.get_tgt_src_cluster_index(actx, places) + mindex, _ = case.get_tgt_src_cluster_index(actx, places) kwargs = { "dep_expr": sym_u, "other_dep_exprs": [], @@ -468,8 +468,8 @@ def test_build_matrix_fixed_stage(actx_factory, logger.info("ndofs: %d", target_discr.ndofs) from pytential.linalg import TargetAndSourceClusterList - itargets = case.get_cluster_index(actx, places, target_dd) - jsources = case.get_cluster_index(actx, places, source_dd) + itargets, _ = case.get_cluster_index(actx, places, target_dd) + jsources, _ = case.get_cluster_index(actx, places, source_dd) mindex = TargetAndSourceClusterList(itargets, jsources) kwargs = { From 81ef190bbd60bc21cda7d210703b5dad717607af Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Fri, 17 Jun 2022 15:10:34 +0300 Subject: [PATCH 2/4] add recursive clustering and skeletonization --- pytential/linalg/skeletonization.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pytential/linalg/skeletonization.py b/pytential/linalg/skeletonization.py index b57db6975..583500c19 100644 --- a/pytential/linalg/skeletonization.py +++ b/pytential/linalg/skeletonization.py @@ -915,8 +915,8 @@ def rec_skeletonize_by_proxy( proxy_radius_factor: float | None = None, id_eps: float | None = None, - rng: np.random.Generator | None = None, max_particles_in_box: int | None = None, + rng: np.random.Generator | None = None, _wrangler: SkeletonizationWrangler | None = None, _proxy: ProxyGeneratorBase | None = None) -> np.ndarray: @@ -961,7 +961,7 @@ def rec_skeletonize_by_proxy( rng=rng, max_particles_in_box=max_particles_in_box) - skel_per_level[i] = skeleton + skel_per_level[i] = skeleton # type: ignore[call-overload] tgt_src_index = cluster(skeleton.skel_tgt_src_index, clevel) assert tgt_src_index.nclusters == 1 @@ -969,7 +969,7 @@ def rec_skeletonize_by_proxy( # evaluate the full root cluster (no skeletonization or anything) skeleton = _evaluate_root(actx, 0, 0, places, wrangler, tgt_src_index) - skel_per_level[-1] = skeleton + skel_per_level[-1] = skeleton # type: ignore[call-overload] return skel_per_level From e40ef4c2048eb3992efbf426356d062ab8b53f9d Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Fri, 17 Jun 2022 15:10:34 +0300 Subject: [PATCH 3/4] add recursive clustering and skeletonization --- pytential/linalg/cluster.py 
 pytential/linalg/cluster.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pytential/linalg/cluster.py b/pytential/linalg/cluster.py
index 7a90e2cac..fce5c2a26 100644
--- a/pytential/linalg/cluster.py
+++ b/pytential/linalg/cluster.py
@@ -296,7 +296,6 @@ def uncluster(ary: np.ndarray, index: IndexList, clevel: ClusterLevel) -> np.nda

 def _build_binary_ish_tree_from_starts(starts: np.ndarray) -> ClusterTree:
     partition_box_ids: np.ndarray[tuple[int, ...], Any] = np.arange(starts.size - 1)
-
     box_ids = partition_box_ids

     box_parent_ids = []

From bf42a83f335715be76740b58e7e7d36eff2e3c53 Mon Sep 17 00:00:00 2001
From: Alexandru Fikl
Date: Thu, 4 Aug 2022 16:21:49 +0300
Subject: [PATCH 4/4] implement hierarchical matrix solver

---
 .gitignore                                 |   2 +
 examples/scaling-study-hmatrix.py          | 198 +++
 pytential/linalg/direct_solver_symbolic.py |   8 +-
 pytential/linalg/hmatrix.py                | 547 +++++++++++++++++++
 pytential/linalg/skeletonization.py        |  42 +-
 pytential/linalg/utils.py                  |  63 +++
 pytential/symbolic/matrix.py               |  18 +-
 test/extra_matrix_data.py                  |  21 +-
 test/test_linalg_hmatrix.py                | 590 +++++++++++++++++++++
 test/test_linalg_skeletonization.py        | 101 ++++
 10 files changed, 1561 insertions(+), 29 deletions(-)
 create mode 100644 examples/scaling-study-hmatrix.py
 create mode 100644 pytential/linalg/hmatrix.py
 create mode 100644 test/test_linalg_hmatrix.py

diff --git a/.gitignore b/.gitignore
index daf719e6f..c89b827fb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,6 +21,8 @@ examples/*.pdf

 *.vtu
 *.vts
+*.png
+*.gif

 .cache

diff --git a/examples/scaling-study-hmatrix.py b/examples/scaling-study-hmatrix.py
new file mode 100644
index 000000000..d163e7383
--- /dev/null
+++ b/examples/scaling-study-hmatrix.py
@@ -0,0 +1,198 @@
+__copyright__ = "Copyright (C) 2022 Alexandru Fikl"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+""" + +from dataclasses import dataclass + +import numpy as np + +from meshmode.array_context import PyOpenCLArrayContext +from pytools.convergence import EOCRecorder + +from pytential import GeometryCollection, sym + +import logging + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +@dataclass(frozen=True) +class Timings: + build: float + matvec: float + + +def run_hmatrix_matvec( + actx: PyOpenCLArrayContext, + places: GeometryCollection, *, + dofdesc: sym.DOFDescriptor) -> None: + from sumpy.kernel import LaplaceKernel + kernel = LaplaceKernel(places.ambient_dim) + sym_u = sym.var("u") + sym_op = -0.5 * sym_u + sym.D(kernel, sym_u, qbx_forced_limit="avg") + + density_discr = places.get_discretization(dofdesc.geometry, dofdesc.discr_stage) + u = actx.thaw(density_discr.nodes()[0]) + + def build_hmat(): + from pytential.linalg.hmatrix import build_hmatrix_by_proxy + return build_hmatrix_by_proxy( + actx, places, sym_op, sym_u, + domains=[dofdesc], + context={}, + auto_where=dofdesc, + id_eps=1.0e-10, + _tree_kind="adaptive-level-restricted", + _approx_nproxy=64, + _proxy_radius_factor=1.15).get_forward() + + # warmup + from pytools import ProcessTimer + with ProcessTimer() as pt: + hmat = build_hmat() + actx.queue.finish() + + logger.info("build(warmup): %s", pt) + + # build + with ProcessTimer() as pt: + hmat = build_hmat() + actx.queue.finish() + + t_build = pt.wall_elapsed + logger.info("build: %s", pt) + + # matvec + with ProcessTimer() as pt: + du = hmat @ u + assert du is not None + actx.queue.finish() + + t_matvec = pt.wall_elapsed + logger.info("matvec: %s", pt) + + return Timings(t_build, t_matvec) + + +def run_scaling_study( + ambient_dim: int, *, + target_order: int = 4, + source_ovsmp: int = 4, + qbx_order: int = 4, + ) -> None: + dd = sym.DOFDescriptor(f"d{ambient_dim}", discr_stage=sym.QBX_SOURCE_STAGE2) + + import pyopencl as cl + ctx = cl.create_some_context() + queue = cl.CommandQueue(ctx) + actx = PyOpenCLArrayContext(queue, force_device_scalars=True) + + eoc_build = EOCRecorder() + eoc_matvec = EOCRecorder() + + import meshmode.mesh.generation as mgen + import meshmode.discretization.poly_element as mpoly + resolutions = [64, 128, 256, 512, 1024, 1536, 2048, 2560, 3072] + + for n in resolutions: + mesh = mgen.make_curve_mesh( + mgen.NArmedStarfish(5, 0.25), + np.linspace(0, 1, n), + order=target_order) + + from meshmode.discretization import Discretization + pre_density_discr = Discretization(actx, mesh, + mpoly.InterpolatoryQuadratureGroupFactory(target_order)) + + from pytential.qbx import QBXLayerPotentialSource + qbx = QBXLayerPotentialSource( + pre_density_discr, + fine_order=source_ovsmp * target_order, + qbx_order=qbx_order, + fmm_order=False, fmm_backend=None, + ) + places = GeometryCollection(qbx, auto_where=dd.geometry) + density_discr = places.get_discretization(dd.geometry, dd.discr_stage) + + logger.info("ndofs: %d", density_discr.ndofs) + logger.info("nelements: %d", density_discr.mesh.nelements) + + timings = run_hmatrix_matvec(actx, places, dofdesc=dd) + eoc_build.add_data_point(density_discr.ndofs, timings.build) + eoc_matvec.add_data_point(density_discr.ndofs, timings.matvec) + + for name, eoc in [("build", eoc_build), ("matvec", eoc_matvec)]: + logger.info("%s\n%s", + name, eoc.pretty_print( + abscissa_label="dofs", + error_label=f"{name} (s)", + abscissa_format="%d", + error_format="%.3fs", + eoc_format="%.2f", + ) + ) + visualize_eoc(f"scaling-study-hmatrix-{name}", eoc, 1) + + +def visualize_eoc( + filename: str, 
eoc: EOCRecorder, order: int, + overwrite: bool = False) -> None: + try: + import matplotlib.pyplot as plt + except ImportError: + logger.info("matplotlib not available for plotting") + return + + fig = plt.figure(figsize=(10, 10), dpi=300) + ax = fig.gca() + + h, error = np.array(eoc.history).T # type: ignore[no-untyped-call] + ax.loglog(h, error, "o-") + + max_h = np.max(h) + min_e = np.min(error) + max_e = np.max(error) + min_h = np.exp(np.log(max_h) + np.log(min_e / max_e) / order) + + ax.loglog( + [max_h, min_h], [max_e, min_e], "k-", label=rf"$\mathcal{{O}}(h^{order})$" + ) + + # }}} + + ax.grid(True, which="major", linestyle="-", alpha=0.75) + ax.grid(True, which="minor", linestyle="--", alpha=0.5) + + ax.set_xlabel("$N$") + ax.set_ylabel("$T~(s)$") + + import pathlib + filename = pathlib.Path(filename) + if not overwrite and filename.exists(): + raise FileExistsError(f"output file '{filename}' already exists") + + fig.savefig(filename) + plt.close(fig) + + +if __name__ == "__main__": + run_scaling_study(ambient_dim=2) diff --git a/pytential/linalg/direct_solver_symbolic.py b/pytential/linalg/direct_solver_symbolic.py index 118b94597..2902ee2c2 100644 --- a/pytential/linalg/direct_solver_symbolic.py +++ b/pytential/linalg/direct_solver_symbolic.py @@ -54,12 +54,16 @@ def prepare_expr(places, exprs, auto_where=None): return _prepare_expr(places, exprs, auto_where=auto_where) -def prepare_proxy_expr(places, exprs, auto_where=None): +def prepare_proxy_expr( + places, exprs, + auto_where=None, + remove_transforms: bool = True): def _prepare_expr(expr): # remove all diagonal / non-operator terms in the expression expr = IntGTermCollector()(expr) # ensure all IntGs remove all the kernel derivatives - expr = KernelTransformationRemover()(expr) + if remove_transforms: + expr = KernelTransformationRemover()(expr) # ensure all IntGs have their source and targets set expr = DOFDescriptorReplacer(auto_where[0], auto_where[1])(expr) diff --git a/pytential/linalg/hmatrix.py b/pytential/linalg/hmatrix.py new file mode 100644 index 000000000..6dce97c47 --- /dev/null +++ b/pytential/linalg/hmatrix.py @@ -0,0 +1,547 @@ +__copyright__ = "Copyright (C) 2022 Alexandru Fikl" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + +from collections.abc import Callable, Sequence +from dataclasses import dataclass +from typing import Any + +import numpy as np +import numpy.linalg as la +from scipy.sparse.linalg import LinearOperator + +from arraycontext import PyOpenCLArrayContext, ArrayOrContainerT, flatten, unflatten +from meshmode.dof_array import DOFArray + +from pytential import GeometryCollection, sym +from pytential.linalg.proxy import ProxyGeneratorBase +from pytential.linalg.cluster import ClusterTree, ClusterLevel +from pytential.linalg.skeletonization import ( + SkeletonizationWrangler, SkeletonizationResult) + +import logging +logger = logging.getLogger(__name__) + + +__doc__ = """ +Hierarical Matrix Construction +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: ProxyHierarchicalMatrixWrangler +.. autoclass:: ProxyHierarchicalMatrix +.. autoclass:: ProxyHierarchicalForwardMatrix +.. autoclass:: ProxyHierarchicalBackwardMatrix + +.. autofunction:: build_hmatrix_by_proxy +""" + + +# {{{ error model + +def hmatrix_error_from_param( + ambient_dim: int, + *, + id_eps: float, + id_rank: int, + min_proxy_radius: float, + max_cluster_radius: float, + nproxies: int, + nsources: int, + ntargets: int, c: float = 1.0e-3) -> float: + if ambient_dim == 2: + p = int(0.5 * id_rank) + elif ambient_dim == 3: + p = int((np.sqrt(1 + 4 * id_rank) - 1) / 2) + else: + raise ValueError(f"unsupported ambient dimension: '{ambient_dim}'") + + rho = alpha = max_cluster_radius / min_proxy_radius + return ( + c * rho ** (p + 1) / (1 - rho) + + np.sqrt(nsources / nproxies) + * (1 - alpha ** (p + 1)) / (1 - alpha) * id_eps + ) + +# }}} + + +# {{{ update diagonals + +def _update_skeleton_diagonal( + skeleton: SkeletonizationResult, + parent: SkeletonizationResult | None, + clevel: ClusterLevel | None, + diagonal: np.ndarray | None = None) -> SkeletonizationResult: + """Due to the evaluation in :func:`_skeletonize_block_by_proxy_with_mats`, + the diagonal matrix in *skeleton* also contains the indices from its + parent. In particular, at a level :math:`l` we need the diagonal block:: + + 0 D_{i, j + 1} D_{i, j + 2} + D_{i + 1, j} 0 D_{i + 1, j + 2} + D_{i + 2, j} D_{i + 2, j + 1} 0 + + but the version in *skeleton* also fills in the 0 blocks in there. This + routine goes through them and zeros them out. + """ + + if clevel is None: + return skeleton + + assert parent is not None + assert skeleton.tgt_src_index.shape == parent.skel_tgt_src_index.shape + + if diagonal is None: + diagonal = np.zeros(parent.nclusters) + + from numbers import Number + if isinstance(diagonal, Number): + diagonal = np.full(parent.nclusters, diagonal) + + assert diagonal.size == parent.nclusters + targets, sources = parent.skel_tgt_src_index + + # FIXME: nicer way to do this? 
+ mat: np.ndarray = np.empty(skeleton.nclusters, dtype=object) + for k in range(skeleton.nclusters): + D = skeleton.D[k].copy() + + i = j = 0 + for icluster in clevel.parent_map[k]: + di = targets.cluster_size(icluster) + dj = sources.cluster_size(icluster) + D[np.s_[i:i + di], np.s_[j:j + dj]] = diagonal[icluster] + + i += di + j += dj + + assert D.shape == (i, j) + mat[k] = D + + from dataclasses import replace + return replace(skeleton, D=mat) + + +def _update_skeletons_diagonal( + wrangler: "ProxyHierarchicalMatrixWrangler", + func: Callable[[SkeletonizationResult], np.ndarray | None], + ) -> np.ndarray: + skeletons: np.ndarray = np.empty(wrangler.skeletons.shape, dtype=object) + skeletons[0] = wrangler.skeletons[0] + + for i in range(1, wrangler.ctree.nlevels): + skeletons[i] = _update_skeleton_diagonal( + wrangler.skeletons[i], + wrangler.skeletons[i - 1], + wrangler.ctree.levels[i - 1], + diagonal=func(skeletons[i - 1])) + + return skeletons + +# }}} + + +# {{{ ProxyHierarchicalMatrix + +@dataclass(frozen=True) +class ProxyHierarchicalMatrixWrangler: + """ + .. automethod:: get_forward + .. automethod:: get_backward + """ + + wrangler: SkeletonizationWrangler + proxy: ProxyGeneratorBase + ctree: ClusterTree + skeletons: np.ndarray + + def get_forward(self) -> "ProxyHierarchicalForwardMatrix": + return ProxyHierarchicalForwardMatrix( + ctree=self.ctree, + skeletons=_update_skeletons_diagonal(self, lambda s: None), + ) + + def get_backward(self) -> "ProxyHierarchicalBackwardMatrix": + return ProxyHierarchicalBackwardMatrix( + ctree=self.ctree, + skeletons=_update_skeletons_diagonal(self, lambda s: s.Dhat)) + + +@dataclass(frozen=True) +class ProxyHierarchicalMatrix(LinearOperator): + """ + .. attribute:: skeletons + + An :class:`~numpy.ndarray` containing skeletonization information + for each level of the hierarchy. For additional details, see + :class:`~pytential.linalg.skeletonization.SkeletonizationResult`. + + This class implements the :class:`scipy.sparse.linalg.LinearOperator` + interface. In particular, the following attributes and methods: + + .. attribute:: shape + + A :class:`tuple` that gives the matrix size ``(m, n)``. + + .. attribute:: dtype + + The data type of the matrix entries. + + .. automethod:: matvec + .. automethod:: __matmul__ + """ + + ctree: ClusterTree + skeletons: np.ndarray + + @property + def shape(self): + return self.skeletons[0].tgt_src_index.shape + + @property + def dtype(self): + # FIXME: assert that everyone has this dtype? 
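+        # NOTE: all the levels are built from the same symbolic operator, so
+        # the dtype of the first interpolation block is assumed to be
+        # representative of all the cluster matrices in the hierarchy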
+        return self.skeletons[0].R[0].dtype
+
+    @property
+    def nlevels(self):
+        return self.skeletons.size
+
+    @property
+    def nclusters(self):
+        return self.skeletons[0].nclusters
+
+    def __matmul__(self, x: ArrayOrContainerT) -> ArrayOrContainerT:
+        """Same as :meth:`matvec`."""
+        return self._matvec(x)
+
+    def _matmat(self, mat):
+        raise NotImplementedError
+
+    def _adjoint(self, x):
+        raise NotImplementedError
+
+# }}}
+
+
+# {{{ forward
+
+@dataclass(frozen=True)
+class ProxyHierarchicalForwardMatrix(ProxyHierarchicalMatrix):
+    def _matvec(self, x: ArrayOrContainerT) -> ArrayOrContainerT:
+        # NOTE: initialized here so that it is also defined for plain
+        # :class:`~numpy.ndarray` inputs, which carry no array context
+        actx = None
+
+        if isinstance(x, DOFArray):
+            from arraycontext import get_container_context_recursively_opt
+            actx = get_container_context_recursively_opt(x)
+            if actx is None:
+                raise ValueError("input array is frozen")
+
+            ary = actx.to_numpy(flatten(x, actx))
+        elif isinstance(x, np.ndarray) and x.dtype.char != "O":
+            ary = x
+        else:
+            raise TypeError(f"unsupported input type: {type(x)}")
+
+        assert actx is None or isinstance(actx, PyOpenCLArrayContext)
+        result = apply_skeleton_forward_matvec(self, ary)
+
+        if isinstance(x, DOFArray):
+            assert actx is not None
+            result = unflatten(x, actx.from_numpy(result), actx)
+
+        return result
+
+
+def apply_skeleton_forward_matvec(
+        hmat: ProxyHierarchicalMatrix,
+        ary: ArrayOrContainerT,
+        ) -> ArrayOrContainerT:
+    from pytential.linalg.cluster import split_array
+    targets, sources = hmat.skeletons[0].tgt_src_index
+    x = split_array(ary, sources)  # type: ignore[arg-type]
+
+    # NOTE: this computes a telescoping product of the form
+    #
+    #   A x_0 = (D0 + L0 (D1 + L1 (...) R1) R0) x_0
+    #
+    # with arbitrary numbers of levels. When recursing down, we compute
+    #
+    #   x_{k + 1} = R_k x_k
+    #   z_{k + 1} = D_k x_k
+    #
+    # and, at the root level, we have
+    #
+    #   x_{N + 1} = z_{N + 1} = D_N x_N.
+    #
+    # When recursing back up, we take `b_{N + 1} = x_{N + 1}` and
+    #
+    #   b_{k - 1} = z_k + L_k b_k
+    #
+    # which gives back the desired product when we reach the leaf level again.
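+    #
+    # As a concrete (two-level) illustration, the recursion above unrolls to
+    #
+    #   A x_0 ~ D_0 x_0 + L_0 (D_1 (R_0 x_0)),
+    #
+    # i.e. a downward pass forming x_1 = R_0 x_0, a multiplication by the
+    # dense root diagonal D_1, and an upward pass that applies L_0 and adds
+    # back the level-local contribution D_0 x_0.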
+
+    d_dot_x: np.ndarray = np.empty(hmat.nlevels, dtype=object)
+
+    # {{{ recurse down
+
+    from pytential.linalg.cluster import cluster
+    for k, clevel in enumerate(hmat.ctree.levels):
+        skeleton = hmat.skeletons[k]
+        assert x.shape == (skeleton.nclusters,)
+        assert skeleton.tgt_src_index.shape[1] == sum(xi.size for xi in x)
+
+        d_dot_x_k: np.ndarray = np.empty(skeleton.nclusters, dtype=object)
+        r_dot_x_k: np.ndarray = np.empty(skeleton.nclusters, dtype=object)
+
+        for i in range(skeleton.nclusters):
+            r_dot_x_k[i] = skeleton.R[i] @ x[i]
+            d_dot_x_k[i] = skeleton.D[i] @ x[i]
+
+        d_dot_x[k] = d_dot_x_k
+        x = cluster(r_dot_x_k, clevel)
+
+    # }}}
+
+    # {{{ root
+
+    # NOTE: at root level, we just multiply with the full diagonal
+    b = d_dot_x[hmat.nlevels - 1]
+    assert b.shape == (1,)
+
+    # }}}
+
+    # {{{ recurse up
+
+    from pytential.linalg.cluster import uncluster
+
+    for k, clevel in reversed(list(enumerate(hmat.ctree.levels[:-1]))):
+        skeleton = hmat.skeletons[k]
+        d_dot_x_k = d_dot_x[k]
+        assert d_dot_x_k.shape == (skeleton.nclusters,)
+
+        b = uncluster(b, skeleton.skel_tgt_src_index.targets, clevel)
+        for i in range(skeleton.nclusters):
+            b[i] = d_dot_x_k[i] + skeleton.L[i] @ b[i]
+
+    assert b.shape == (hmat.nclusters,)
+
+    # }}}
+
+    return np.concatenate(b)[np.argsort(targets.indices)]
+
+# }}}
+
+
+# {{{ backward
+
+@dataclass(frozen=True)
+class ProxyHierarchicalBackwardMatrix(ProxyHierarchicalMatrix):
+    def _matvec(self, x: ArrayOrContainerT) -> ArrayOrContainerT:
+        # NOTE: initialized here so that it is also defined for plain
+        # :class:`~numpy.ndarray` inputs, which carry no array context
+        actx = None
+
+        if isinstance(x, DOFArray):
+            from arraycontext import get_container_context_recursively_opt
+            actx = get_container_context_recursively_opt(x)
+            if actx is None:
+                raise ValueError("input array is frozen")
+
+            ary = actx.to_numpy(flatten(x, actx))
+        elif isinstance(x, np.ndarray) and x.dtype.char != "O":
+            ary = x
+        else:
+            raise TypeError(f"unsupported input type: {type(x)}")
+
+        assert actx is None or isinstance(actx, PyOpenCLArrayContext)
+        result = apply_skeleton_backward_matvec(actx, self, ary)
+
+        if isinstance(x, DOFArray):
+            assert actx is not None
+            result = unflatten(x, actx.from_numpy(result), actx)
+
+        return result
+
+
+def apply_skeleton_backward_matvec(
+        actx: PyOpenCLArrayContext | None,
+        hmat: ProxyHierarchicalMatrix,
+        ary: ArrayOrContainerT,
+        ) -> ArrayOrContainerT:
+    from pytential.linalg.cluster import split_array
+    targets, sources = hmat.skeletons[0].tgt_src_index
+
+    b = split_array(ary, targets)  # type: ignore[arg-type]
+    r_dot_b: np.ndarray = np.empty(hmat.nlevels, dtype=object)
+
+    # {{{ recurse down
+
+    # NOTE: this solves a telescoping product of the form
+    #
+    #   A x_0 = (D0 + L0 (D1 + L1 (...) R1) R0) x_0 = b_0
+    #
+    # with arbitrary numbers of levels. When recursing down, we compute
+    #
+    #   b_{k + 1} = \hat{D}_k R_k D_k^{-1} b_k
+    #   \hat{D}_k = (R_k D_k^{-1} L_k)^{-1}
+    #
+    # and, at the root level, we solve
+    #
+    #   D_N x_N = b_N.
+    #
+    # When recursing back up, we take `b_{N + 1} = x_{N + 1}` and
+    #
+    #   x_{k} = D_k^{-1} (b_k - L_k b_{k + 1} + L_k \hat{D}_k x_{k + 1})
+    #
+    # which gives back the desired solution when we reach the leaf level again.
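+    #
+    # As a concrete (two-level) illustration, solving
+    #
+    #   (D_0 + L_0 D_1 R_0) x_0 = b_0
+    #
+    # unrolls to forming b_1 = \hat{D}_0 R_0 D_0^{-1} b_0 on the way down,
+    # solving D_1 x_1 = b_1 at the root, and recovering
+    #
+    #   x_0 = D_0^{-1} (b_0 - L_0 (b_1 - \hat{D}_0 x_1))
+    #
+    # on the way back up.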
+
+    from pytential.linalg.cluster import cluster
+
+    for k, clevel in enumerate(hmat.ctree.levels):
+        skeleton = hmat.skeletons[k]
+        assert b.shape == (skeleton.nclusters,)
+        assert skeleton.tgt_src_index.shape[0] == sum(bi.size for bi in b)
+
+        dhat_dot_b_k: np.ndarray = np.empty(skeleton.nclusters, dtype=object)
+        for i in range(skeleton.nclusters):
+            dhat_dot_b_k[i] = (
+                skeleton.Dhat[i] @ (skeleton.R[i] @ (skeleton.invD[i] @ b[i]))
+                )
+
+        r_dot_b[k] = b
+        b = cluster(dhat_dot_b_k, clevel)
+
+    # }}}
+
+    # {{{ root
+
+    from pytools.obj_array import make_obj_array
+    assert b.shape == (1,)
+    x = make_obj_array([
+        la.solve(D, bi) for D, bi in zip(hmat.skeletons[-1].D, b, strict=True)
+        ])
+
+    # }}}
+
+    # {{{ recurse up
+
+    from pytential.linalg.cluster import uncluster
+
+    for k, clevel in reversed(list(enumerate(hmat.ctree.levels[:-1]))):
+        skeleton = hmat.skeletons[k]
+        b0 = r_dot_b[k]
+        b1 = r_dot_b[k + 1]
+        assert b0.shape == (skeleton.nclusters,)
+
+        x = uncluster(x, skeleton.skel_tgt_src_index.sources, clevel)
+        b1 = uncluster(b1, skeleton.skel_tgt_src_index.targets, clevel)
+
+        for i in range(skeleton.nclusters):
+            sx = b1[i] - skeleton.Dhat[i] @ x[i]
+            x[i] = skeleton.invD[i] @ (b0[i] - skeleton.L[i] @ sx)
+
+    assert x.shape == (hmat.nclusters,)
+
+    # }}}
+
+    return np.concatenate(x)[np.argsort(sources.indices)]
+
+# }}}
+
+
+# {{{ build_hmatrix_by_proxy
+
+def build_hmatrix_by_proxy(
+        actx: PyOpenCLArrayContext,
+        places: GeometryCollection,
+        exprs: sym.Expression | Sequence[sym.Expression],
+        input_exprs: sym.Variable | Sequence[sym.Variable], *,
+        auto_where: sym.DOFDescriptorLike | None = None,
+        domains: Sequence[sym.DOFDescriptorLike] | None = None,
+        context: dict[str, Any] | None = None,
+        id_eps: float = 1.0e-8,
+        rng: np.random.Generator | None = None,
+
+        # NOTE: these are dev variables and can disappear at any time!
+        _tree_kind: str | None = "adaptive-level-restricted",
+        _weighted_proxy: bool | tuple[bool, bool] | None = None,
+
+        # TODO: plug in an error model to get an estimate for:
+        #   * how many points we want per cluster?
+        #   * how many proxy points we want?
+        #   * how far away should the proxy points be?
+        # based on id_eps. How many of these should be user tunable?
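+        # (``hmatrix_error_from_param`` above is a first sketch of such an
+        # error model, but it is not used here to pick these defaults)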
+ _max_particles_in_box: int | None = None, + _approx_nproxy: int | None = None, + _proxy_radius_factor: float | None = None, + ) -> ProxyHierarchicalMatrixWrangler: + from pytential.symbolic.matrix import P2PClusterMatrixBuilder + from pytential.linalg.skeletonization import make_skeletonization_wrangler + + def P2PClusterMatrixBuilderWithDiagonal(*args, **kwargs): + kwargs["exclude_self"] = True + return P2PClusterMatrixBuilder(*args, **kwargs) + + wrangler = make_skeletonization_wrangler( + places, exprs, input_exprs, + domains=domains, context=context, auto_where=auto_where, + _weighted_proxy=_weighted_proxy, + # _neighbor_cluster_builder=P2PClusterMatrixBuilderWithDiagonal, + # _proxy_source_cluster_builder=P2PClusterMatrixBuilder, + # _proxy_target_cluster_builder=P2PClusterMatrixBuilder, + ) + + if wrangler.nrows != 1 or wrangler.ncols != 1: + raise ValueError("multi-block operators are not supported") + + from pytential.linalg.proxy import QBXProxyGenerator + proxy = QBXProxyGenerator(places, + approx_nproxy=_approx_nproxy, + radius_factor=_proxy_radius_factor) + + from pytential.linalg.cluster import partition_by_nodes + cluster_index, ctree = partition_by_nodes( + actx, places, + dofdesc=wrangler.domains[0], + tree_kind=_tree_kind, + max_particles_in_box=_max_particles_in_box) + + logger.info("tree levels: %d", ctree.nlevels) + logger.info("cluster count: %d", cluster_index.nclusters) + logger.info("root cluster sizes: %s", [ + # NOTE: making into a list so that they all get printed + int(s) for s in np.diff(cluster_index.starts) + ]) + + from pytential.linalg.utils import TargetAndSourceClusterList + tgt_src_index = TargetAndSourceClusterList( + targets=cluster_index, sources=cluster_index) + + from pytential.linalg.skeletonization import rec_skeletonize_by_proxy + skeletons = rec_skeletonize_by_proxy( + actx, places, ctree, tgt_src_index, exprs, input_exprs, + id_eps=id_eps, + rng=rng, + max_particles_in_box=_max_particles_in_box, + _proxy=proxy, + _wrangler=wrangler, + ) + + return ProxyHierarchicalMatrixWrangler( + wrangler=wrangler, proxy=proxy, ctree=ctree, skeletons=skeletons + ) + +# }}} + +# vim: foldmethod=marker diff --git a/pytential/linalg/skeletonization.py b/pytential/linalg/skeletonization.py index 583500c19..69dc10ab6 100644 --- a/pytential/linalg/skeletonization.py +++ b/pytential/linalg/skeletonization.py @@ -28,8 +28,10 @@ from typing import TYPE_CHECKING, Any import numpy as np +import numpy.linalg as la from arraycontext import PyOpenCLArrayContext, Array +from pytools import memoize_method from pytential import GeometryCollection, sym from pytential.symbolic.matrix import ClusterMatrixBuilderBase @@ -425,7 +427,7 @@ def evaluate_target_proxy_interaction( def make_skeletonization_wrangler( places: GeometryCollection, - exprs: sym.var | Sequence[sym.var], + exprs: sym.Expression | Sequence[sym.Expression], input_exprs: sym.var | Sequence[sym.var], *, domains: Sequence[Hashable] | None = None, context: dict[str, Any] | None = None, @@ -433,6 +435,7 @@ def make_skeletonization_wrangler( # internal _weighted_proxy: bool | tuple[bool, bool] | None = None, + _remove_source_transforms: bool = False, _proxy_source_cluster_builder: type[ClusterMatrixBuilderBase] | None = None, _proxy_target_cluster_builder: type[ClusterMatrixBuilderBase] | None = None, _neighbor_cluster_builder: type[ClusterMatrixBuilderBase] | None = None, @@ -460,9 +463,13 @@ def make_skeletonization_wrangler( prepared_lpot_exprs = prepare_expr(places, lpot_exprs, auto_where) source_proxy_exprs = 
prepare_proxy_expr( - places, prepared_lpot_exprs, (auto_where[0], PROXY_SKELETONIZATION_TARGET)) + places, prepared_lpot_exprs, (auto_where[0], PROXY_SKELETONIZATION_TARGET), + remove_transforms=_remove_source_transforms) target_proxy_exprs = prepare_proxy_expr( - places, prepared_lpot_exprs, (PROXY_SKELETONIZATION_SOURCE, auto_where[1])) + places, prepared_lpot_exprs, (PROXY_SKELETONIZATION_SOURCE, auto_where[1]), + # NOTE: transforms are unconditionally removed here because the + # source would be the proxies, where we do not have normals, etc. + remove_transforms=True) # }}} @@ -472,7 +479,7 @@ def make_skeletonization_wrangler( weighted_sources = weighted_targets = True elif isinstance(_weighted_proxy, bool): weighted_sources = weighted_targets = _weighted_proxy - elif isinstance(_weighted_proxy, tuple): + elif isinstance(_weighted_proxy, tuple) and len(_weighted_proxy) == 2: weighted_sources, weighted_targets = _weighted_proxy else: raise ValueError(f"unknown value for weighting: '{_weighted_proxy}'") @@ -678,9 +685,9 @@ def _skeletonize_block_by_proxy_with_mats( if __debug__: isfinite = np.isfinite(tgt_mat) - assert np.all(isfinite), np.where(isfinite) + assert np.all(isfinite), np.where(~isfinite) isfinite = np.isfinite(src_mat) - assert np.all(isfinite), np.where(isfinite) + assert np.all(isfinite), np.where(~isfinite) # skeletonize target points k, idx, interp = interp_decomp(tgt_mat.T, rank=k, eps=id_eps, rng=rng) @@ -830,13 +837,28 @@ def __post_init__(self): def nclusters(self): return self.tgt_src_index.nclusters + @property + @memoize_method + def invD(self) -> np.ndarray: + from pytools.obj_array import make_obj_array + return make_obj_array([la.inv(D) for D in self.D]) + + @property + @memoize_method + def Dhat(self) -> np.ndarray: + from pytools.obj_array import make_obj_array + return make_obj_array([ + la.inv(self.R[i] @ self.invD[i] @ self.L[i]) + for i in range(self.nclusters) + ]) + def skeletonize_by_proxy( actx: PyOpenCLArrayContext, places: GeometryCollection, tgt_src_index: TargetAndSourceClusterList, - exprs: sym.var | Sequence[sym.var], + exprs: sym.Expression | Sequence[sym.Expression], input_exprs: sym.var | Sequence[sym.var], *, domains: Sequence[Hashable] | None = None, context: dict[str, Any] | None = None, @@ -905,7 +927,7 @@ def rec_skeletonize_by_proxy( ctree: ClusterTree, tgt_src_index: TargetAndSourceClusterList, - exprs: sym.var | Sequence[sym.var], + exprs: sym.Expression | Sequence[sym.Expression], input_exprs: sym.var | Sequence[sym.var], *, domains: Sequence[Hashable] | None = None, context: dict[str, Any] | None = None, @@ -961,7 +983,7 @@ def rec_skeletonize_by_proxy( rng=rng, max_particles_in_box=max_particles_in_box) - skel_per_level[i] = skeleton # type: ignore[call-overload] + skel_per_level[i] = skeleton tgt_src_index = cluster(skeleton.skel_tgt_src_index, clevel) assert tgt_src_index.nclusters == 1 @@ -969,7 +991,7 @@ def rec_skeletonize_by_proxy( # evaluate the full root cluster (no skeletonization or anything) skeleton = _evaluate_root(actx, 0, 0, places, wrangler, tgt_src_index) - skel_per_level[-1] = skeleton # type: ignore[call-overload] + skel_per_level[-1] = skeleton return skel_per_level diff --git a/pytential/linalg/utils.py b/pytential/linalg/utils.py index ee64b1847..8f10ecd0a 100644 --- a/pytential/linalg/utils.py +++ b/pytential/linalg/utils.py @@ -456,6 +456,22 @@ def mnorm(x: np.ndarray, y: np.ndarray) -> np.floating[Any]: return tgt_error, src_error +def skeletonization_matrix( + mat: np.ndarray, skeleton: 
SkeletonizationResult, + ) -> tuple[np.ndarray, np.ndarray]: + D: np.ndarray = np.empty(skeleton.nclusters, dtype=object) + S: np.ndarray = np.empty((skeleton.nclusters, skeleton.nclusters), dtype=object) + + from itertools import product + for i, j in product(range(skeleton.nclusters), repeat=2): + if i == j: + D[i] = skeleton.tgt_src_index.cluster_take(mat, i, i) + else: + S[i, j] = skeleton.skel_tgt_src_index.cluster_take(mat, i, j) + + return D, S + + def skeletonization_error( mat: np.ndarray, skeleton: SkeletonizationResult, *, ord: float | None = None, @@ -517,4 +533,51 @@ def skeletonization_error( # }}} + +# {{{ eigenvalues + +def eigs( + mat, *, + k: int = 6, + which: str = "LM", + maxiter: int | None = None, + tol: float = 0.0) -> np.ndarray: + import scipy.sparse.linalg as ssla + + result = ssla.eigs(mat, + k=k, + which=which, + maxiter=maxiter, + tol=tol, + return_eigenvectors=False) + + imag_norm = np.linalg.norm(np.imag(result), ord=np.inf) + if imag_norm > 1.0e-14: + from warnings import warn + warn(f"eigenvalues are not real enough: norm(imag) = {imag_norm:.12e}", + stacklevel=2) + + return result + + +def cond(mat, *, + mat_inv=None, + p: float | None = None, + tol: float = 1.0e-6) -> float: + if p is None: + p = 2 + + if p != 2: + raise ValueError(f"unsupported norm order: '{p}'") + + lambda_max = eigs(mat, k=1, which="LM", tol=tol) + if mat_inv is None: + lambda_min = eigs(mat, k=1, which="SM", tol=tol) + else: + lambda_min = eigs(mat_inv, k=1, which="LM", tol=tol) + + return np.abs(lambda_max) / np.abs(lambda_min) + +# }}} + # vim: foldmethod=marker diff --git a/pytential/symbolic/matrix.py b/pytential/symbolic/matrix.py index 6ec13c496..1bab396e9 100644 --- a/pytential/symbolic/matrix.py +++ b/pytential/symbolic/matrix.py @@ -547,7 +547,6 @@ def map_int_g(self, expr): expr.target.geometry, expr.target.discr_stage) actx = self.array_context - target_base_kernel = expr.target_kernel.get_base_kernel() result = 0 for density, kernel in zip(expr.densities, expr.source_kernels, strict=True): @@ -561,12 +560,10 @@ def map_int_g(self, expr): # {{{ generator - base_kernel = kernel.get_base_kernel() - from sumpy.p2p import P2PMatrixGenerator mat_gen = P2PMatrixGenerator(actx.context, - source_kernels=(base_kernel,), - target_kernels=(target_base_kernel,), + source_kernels=(kernel,), + target_kernels=(expr.target_kernel,), exclude_self=self.exclude_self) # }}} @@ -576,7 +573,7 @@ def map_int_g(self, expr): # {{{ kernel args # NOTE: copied from pytential.symbolic.primitives.IntG - kernel_args = base_kernel.get_args() + base_kernel.get_source_args() + kernel_args = kernel.get_args() + kernel.get_source_args() kernel_args = {arg.loopy_arg.name for arg in kernel_args} kernel_args = _get_layer_potential_args( @@ -761,7 +758,6 @@ def map_int_g(self, expr): expr.target.geometry, expr.target.discr_stage) actx = self.array_context - target_base_kernel = expr.target_kernel.get_base_kernel() result = 0 for kernel, density in zip(expr.source_kernels, expr.densities, strict=True): @@ -782,12 +778,10 @@ def map_int_g(self, expr): # {{{ generator - base_kernel = kernel.get_base_kernel() - from sumpy.p2p import P2PMatrixSubsetGenerator mat_gen = P2PMatrixSubsetGenerator(actx.context, - source_kernels=(base_kernel,), - target_kernels=(target_base_kernel,), + source_kernels=(kernel,), + target_kernels=(expr.target_kernel,), exclude_self=self.exclude_self) # }}} @@ -797,7 +791,7 @@ def map_int_g(self, expr): # {{{ kernel args # NOTE: copied from pytential.symbolic.primitives.IntG - kernel_args = 
base_kernel.get_args() + base_kernel.get_source_args() + kernel_args = kernel.get_args() + kernel.get_source_args() kernel_args = {arg.loopy_arg.name for arg in kernel_args} kernel_args = _get_layer_potential_args( diff --git a/test/extra_matrix_data.py b/test/extra_matrix_data.py index d4ab820a6..339d40cd8 100644 --- a/test/extra_matrix_data.py +++ b/test/extra_matrix_data.py @@ -44,14 +44,23 @@ class MatrixTestCaseMixin: proxy_target_cluster_builder: Callable[..., Any] | None = None neighbor_cluster_builder: Callable[..., Any] | None = None - def get_cluster_index(self, actx, places, dofdesc=None): + def max_particles_in_box_for_discr(self, discr): + max_particles_in_box = self.max_particles_in_box + if max_particles_in_box is None: + max_particles_in_box = discr.ndofs // self.approx_cluster_count + + return max_particles_in_box + + def get_cluster_index( + self, actx, places, dofdesc=None, max_particles_in_box=None): if dofdesc is None: dofdesc = places.auto_source discr = places.get_discretization(dofdesc.geometry) - max_particles_in_box = self.max_particles_in_box if max_particles_in_box is None: - max_particles_in_box = discr.ndofs // self.approx_cluster_count + max_particles_in_box = self.max_particles_in_box + if max_particles_in_box is None: + max_particles_in_box = discr.ndofs // self.approx_cluster_count from pytential.linalg.cluster import partition_by_nodes cindex, ctree = partition_by_nodes(actx, places, @@ -78,9 +87,11 @@ def get_cluster_index(self, actx, places, dofdesc=None): return cindex, ctree - def get_tgt_src_cluster_index(self, actx, places, dofdesc=None): + def get_tgt_src_cluster_index( + self, actx, places, dofdesc=None, max_particles_in_box=None): from pytential.linalg import TargetAndSourceClusterList - cindex, ctree = self.get_cluster_index(actx, places, dofdesc=dofdesc) + cindex, ctree = self.get_cluster_index( + actx, places, dofdesc=dofdesc, max_particles_in_box=max_particles_in_box) return TargetAndSourceClusterList(cindex, cindex), ctree def get_operator(self, ambient_dim, qbx_forced_limit=_NoArgSentinel): diff --git a/test/test_linalg_hmatrix.py b/test/test_linalg_hmatrix.py new file mode 100644 index 000000000..649a3d6db --- /dev/null +++ b/test/test_linalg_hmatrix.py @@ -0,0 +1,590 @@ +__copyright__ = "Copyright (C) 2022 Alexandru Fikl" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + +from dataclasses import replace +import pytest + +import numpy as np + +from pytential import bind, sym +from pytential import GeometryCollection + +from meshmode.mesh.generation import NArmedStarfish + +from meshmode import _acf # noqa: F401 +from arraycontext import pytest_generate_tests_for_array_contexts +from meshmode.array_context import PytestPyOpenCLArrayContextFactory + +import extra_matrix_data as extra +import logging +logger = logging.getLogger(__name__) + +pytest_generate_tests = pytest_generate_tests_for_array_contexts([ + PytestPyOpenCLArrayContextFactory, + ]) + + +HMATRIX_TEST_CASES = [ + extra.CurveTestCase( + name="starfish", + op_type="scalar", + target_order=4, + curve_fn=NArmedStarfish(5, 0.25), + resolutions=[512]), + extra.CurveTestCase( + name="starfish", + op_type="double", + target_order=4, + curve_fn=NArmedStarfish(5, 0.25), + resolutions=[512]), + extra.TorusTestCase( + target_order=4, + op_type="scalar", + resolutions=[0]) + ] + + +# {{{ test_hmatrix_forward_matvec_single_level + +def hmatrix_matvec_single_level(mat, x, skeleton): + from pytential.linalg.cluster import split_array + targets, sources = skeleton.tgt_src_index + y = split_array(x, sources) + + y_hat = np.empty(y.shape, dtype=object) + + for i in range(skeleton.nclusters): + y_hat[i] = skeleton.R[i] @ y[i] + + from pytential.linalg.utils import skeletonization_matrix + D, S = skeletonization_matrix(mat, skeleton) + syhat = np.zeros(y.shape, dtype=object) + + from itertools import product + for i, j in product(range(skeleton.nclusters), repeat=2): + if i == j: + continue + + syhat[i] = syhat[i] + S[i, j] @ y_hat[j] + + for i in range(skeleton.nclusters): + y[i] = D[i] @ y[i] + skeleton.L[i] @ syhat[i] + + return np.concatenate(y)[np.argsort(targets.indices)] + + +@pytest.mark.parametrize("case", HMATRIX_TEST_CASES) +@pytest.mark.parametrize("discr_stage", [sym.QBX_SOURCE_STAGE1]) +def test_hmatrix_forward_matvec_single_level( + actx_factory, case, discr_stage, visualize=False): + actx = actx_factory() + rng = np.random.default_rng(42) + + if visualize: + logging.basicConfig(level=logging.INFO) + + if case.ambient_dim == 2: + kwargs = {"proxy_approx_count": 64, "proxy_radius_factor": 1.15} + else: + kwargs = {"proxy_approx_count": 256, "proxy_radius_factor": 1.25} + + case = replace(case, skel_discr_stage=discr_stage, **kwargs) + logger.info("\n%s", case) + + # {{{ geometry + + dd = sym.DOFDescriptor(case.name, discr_stage=case.skel_discr_stage) + qbx = case.get_layer_potential(actx, case.resolutions[-1], case.target_order) + places = GeometryCollection(qbx, auto_where=dd) + + density_discr = places.get_discretization(dd.geometry, dd.discr_stage) + tgt_src_index, _ = case.get_tgt_src_cluster_index(actx, places, dd) + + logger.info("dd %s", dd) + logger.info("nclusters %3d ndofs %7d", + tgt_src_index.nclusters, density_discr.ndofs) + + # }}} + + # {{{ construct reference + + from pytential.linalg.direct_solver_symbolic import prepare_expr + from pytential.symbolic.matrix import MatrixBuilder + sym_u, sym_op = case.get_operator(places.ambient_dim) + mat = MatrixBuilder( + actx, + dep_expr=sym_u, + other_dep_exprs=[], + dep_discr=density_discr, + places=places, + context={}, + )(prepare_expr(places, sym_op, (dd, dd))) + + from arraycontext import flatten, unflatten + x = actx.thaw(density_discr.nodes()[0]) + y = actx.to_numpy(flatten(x, actx)) + r_lpot = unflatten(x, actx.from_numpy(mat @ y), actx) + + # }}} + + # {{{ check matvec + + id_eps = 10.0 ** (-np.arange(2, 16)) + rec_error = 
+    id_eps = 10.0 ** (-np.arange(2, 16))
+    rec_error = np.zeros_like(id_eps)
+
+    from pytools.convergence import EOCRecorder
+    eoc = EOCRecorder()
+
+    from pytential.linalg.skeletonization import skeletonize_by_proxy
+    for i in range(id_eps.size):
+        skeleton = skeletonize_by_proxy(
+            actx, places, tgt_src_index, sym_op, sym_u,
+            domains=[dd], context={},
+            approx_nproxy=case.proxy_approx_count,
+            proxy_radius_factor=case.proxy_radius_factor,
+            id_eps=id_eps[i],
+            rng=rng,
+            )
+        r_hmat = hmatrix_matvec_single_level(mat, y, skeleton[0, 0])
+        r_hmat = unflatten(x, actx.from_numpy(r_hmat), actx)
+
+        from meshmode.dof_array import flat_norm
+        rec_error[i] = actx.to_numpy(
+            flat_norm(r_hmat - r_lpot) / flat_norm(r_lpot)
+            )
+        logger.info("id_eps %.2e error: %.12e", id_eps[i], rec_error[i])
+        # assert rec_error[i] < 0.1
+
+        eoc.add_data_point(id_eps[i], rec_error[i])
+
+    logger.info("\n%s", eoc.pretty_print(
+        abscissa_format="%.8e",
+        error_format="%.8e",
+        eoc_format="%.2f"))
+
+    # }}}
+
+    if not visualize:
+        return
+
+    import matplotlib.pyplot as pt
+    fig = pt.figure(figsize=(10, 10), dpi=300)
+    ax = fig.gca()
+
+    ax.loglog(id_eps, id_eps, "k--")
+    ax.loglog(id_eps, rec_error)
+
+    ax.grid(True)
+    ax.set_xlabel(r"$\epsilon_{id}$")
+    ax.set_ylabel("$Error$")
+    ax.set_title(case.name)
+
+    basename = "linalg_hmatrix_single_matvec"
+    fig.savefig(f"{basename}_{case.name}_{case.op_type}_convergence")
+
+    if case.ambient_dim == 2:
+        fig.clf()
+        ax = fig.gca()
+
+        from arraycontext import flatten
+        r_hmap = actx.to_numpy(flatten(r_hmat, actx))
+        r_lpot = actx.to_numpy(flatten(r_lpot, actx))
+
+        ax.semilogy(r_hmap - r_lpot)
+        ax.set_ylim([1.0e-16, 1.0])
+        fig.savefig(f"{basename}_{case.name}_{case.op_type}_error")
+
+    pt.close(fig)
+
+# }}}
+
+
+# {{{ test_hmatrix_forward_matvec
+
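+# NOTE: unlike the single-level check above, this test goes through the full
+# `build_hmatrix_by_proxy` driver and compares the resulting multi-level
+# matvec to a direct evaluation (QBX by default, dense p2p when p2p=True).
+# The measured error is also compared to the a priori estimate given by
+# `hmatrix_error_from_param`.
+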
+@pytest.mark.parametrize("case", [
+    HMATRIX_TEST_CASES[0],
+    HMATRIX_TEST_CASES[1],
+    pytest.param(HMATRIX_TEST_CASES[2], marks=pytest.mark.slowtest),
+    ])
+@pytest.mark.parametrize("discr_stage", [
+    sym.QBX_SOURCE_STAGE1,
+    # sym.QBX_SOURCE_STAGE2
+    ])
+def test_hmatrix_forward_matvec(
+        actx_factory, case, discr_stage, p2p=False, visualize=False):
+    actx = actx_factory()
+    rng = np.random.default_rng(42)
+
+    if visualize:
+        logging.basicConfig(level=logging.INFO)
+
+    if case.ambient_dim == 2:
+        kwargs = {"proxy_approx_count": 64, "proxy_radius_factor": 1.25}
+    else:
+        kwargs = {"proxy_approx_count": 256, "proxy_radius_factor": 1.25}
+
+    case = replace(case, skel_discr_stage=discr_stage, **kwargs)
+    logger.info("\n%s", case)
+
+    # {{{ geometry
+
+    dd = sym.DOFDescriptor(case.name, discr_stage=case.skel_discr_stage)
+    qbx = case.get_layer_potential(actx, case.resolutions[-1], case.target_order)
+    places = GeometryCollection(qbx, auto_where=dd)
+
+    density_discr = places.get_discretization(dd.geometry, dd.discr_stage)
+    max_particles_in_box = case.max_particles_in_box_for_discr(density_discr)
+
+    tgt_src_index, _ = case.get_tgt_src_cluster_index(
+        actx, places, dd, max_particles_in_box=max_particles_in_box)
+
+    logger.info("dd %s", dd)
+    logger.info("nclusters %3d ndofs %7d",
+        tgt_src_index.nclusters, density_discr.ndofs)
+
+    # }}}
+
+    # {{{ construct hmatrix
+
+    from pytential.linalg.hmatrix import build_hmatrix_by_proxy
+    sym_u, sym_op = case.get_operator(places.ambient_dim)
+
+    x = actx.thaw(density_discr.nodes()[0])
+
+    if p2p:
+        # NOTE: this also needs to be changed in `build_hmatrix_by_proxy`
+        # to actually evaluate the p2p interactions instead of qbx
+        from pytential.linalg.direct_solver_symbolic import prepare_expr
+        from pytential.symbolic.matrix import P2PMatrixBuilder
+        mat = P2PMatrixBuilder(
+            actx,
+            dep_expr=sym_u,
+            other_dep_exprs=[],
+            dep_discr=density_discr,
+            places=places,
+            context={},
+            )(prepare_expr(places, sym_op, (dd, dd)))
+
+        from arraycontext import flatten, unflatten
+        y = actx.to_numpy(flatten(x, actx))
+        r_lpot = unflatten(x, actx.from_numpy(mat @ y), actx)
+    else:
+        r_lpot = bind(places, sym_op, auto_where=dd)(actx, u=x)
+
+    from pytential.linalg.hmatrix import hmatrix_error_from_param
+    id_eps = 10.0 ** (-np.arange(2, 16))
+    rec_error = np.zeros_like(id_eps)
+    model_error = np.zeros_like(id_eps)
+
+    from pytools.convergence import EOCRecorder
+    eoc = EOCRecorder()
+
+    for i in range(id_eps.size):
+        wrangler = build_hmatrix_by_proxy(
+            actx, places, sym_op, sym_u,
+            domains=[dd],
+            context=case.knl_concrete_kwargs,
+            id_eps=id_eps[i],
+            rng=rng,
+            _tree_kind=case.tree_kind,
+            _max_particles_in_box=max_particles_in_box,
+            _approx_nproxy=case.proxy_approx_count,
+            _proxy_radius_factor=case.proxy_radius_factor,
+            )
+        hmat = wrangler.get_forward()
+
+        # {{{ skeletonization error
+
+        from meshmode.dof_array import flat_norm
+        r_hmap = hmat @ x
+        rec_error[i] = actx.to_numpy(
+            flat_norm(r_hmap - r_lpot) / flat_norm(r_lpot)
+            )
+
+        # }}}
+
+        # {{{ model error
+
+        skeleton = hmat.skeletons[0]
+        icluster = np.argmax(np.diff(skeleton.skel_tgt_src_index.targets.starts))
+
+        proxy_radius = actx.to_numpy(
+            skeleton._src_eval_result.pxy.radii[icluster]
+            )
+        cluster_radius = actx.to_numpy(
+            skeleton._src_eval_result.pxy.cluster_radii[icluster]
+            )
+
+        model_error[i] = hmatrix_error_from_param(
+            places.ambient_dim,
+            id_eps=id_eps[i],
+            min_proxy_radius=proxy_radius,
+            max_cluster_radius=cluster_radius,
+            id_rank=skeleton.skel_tgt_src_index.targets.cluster_size(icluster),
+            nproxies=skeleton._src_eval_result.pxy.pxyindex.cluster_size(icluster),
+            ntargets=skeleton.tgt_src_index.targets.cluster_size(icluster),
+            nsources=skeleton.tgt_src_index.sources.cluster_size(icluster),
+            c=1.0e-8
+            )
+
+        # }}}
+
+        logger.info("id_eps %.2e error: %.12e (%.12e)",
+            id_eps[i], rec_error[i], model_error[i])
+        eoc.add_data_point(id_eps[i], rec_error[i])
+
+    logger.info("\n%s", eoc.pretty_print(
+        abscissa_format="%.8e",
+        error_format="%.8e",
+        eoc_format="%.2f"))
+
+    if not visualize:
+        assert eoc.order_estimate() > 0.6
+
+    # }}}
+
+    if not visualize:
+        return
+
+    import matplotlib.pyplot as pt
+    fig = pt.figure(figsize=(10, 10), dpi=300)
+    ax = fig.gca()
+
+    ax.loglog(id_eps, id_eps, "k--")
+    ax.loglog(id_eps, rec_error)
+    ax.loglog(id_eps, model_error)
+
+    ax.grid(True)
+    ax.set_xlabel(r"$\epsilon_{id}$")
+    ax.set_ylabel("$Error$")
+    ax.set_title(case.name)
+
+    lpot_name = "p2p" if p2p else "qbx"
+    basename = f"linalg_hmatrix_{lpot_name}_matvec"
+    fig.savefig(f"{basename}_{case.name}_{case.op_type}_convergence")
+
+    if case.ambient_dim == 2:
+        fig.clf()
+        ax = fig.gca()
+
+        from arraycontext import flatten
+        r_hmap = actx.to_numpy(flatten(r_hmap, actx))
+        r_lpot = actx.to_numpy(flatten(r_lpot, actx))
+
+        ax.semilogy(r_hmap - r_lpot)
+        ax.set_ylim([1.0e-16, 1.0])
+        fig.savefig(f"{basename}_{case.name}_{case.op_type}_error")
+
+    pt.close(fig)
+
+# }}}
+
+
+# {{{ test_hmatrix_backward_matvec
+
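+# NOTE: this checks the approximate inverse: for a known density x_ref, the
+# rhs b_ref = op(x_ref) is evaluated directly, the backward H-matrix is
+# applied to it, and x_ref should be recovered as id_eps is tightened.
+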
+@pytest.mark.parametrize("case", [
+    HMATRIX_TEST_CASES[0],
+    HMATRIX_TEST_CASES[1],
+    pytest.param(HMATRIX_TEST_CASES[2], marks=pytest.mark.slowtest),
+    ])
+@pytest.mark.parametrize("discr_stage", [
+    sym.QBX_SOURCE_STAGE1,
+    # sym.QBX_SOURCE_STAGE2
+    ])
+def test_hmatrix_backward_matvec(actx_factory, case, discr_stage, visualize=False):
+    actx = actx_factory()
+    rng = np.random.default_rng(42)
+
+    if visualize:
+        logging.basicConfig(level=logging.INFO)
+
+    # NOTE: the same proxy parameters work in both 2d and 3d here
+    kwargs = {"proxy_approx_count": 64, "proxy_radius_factor": 1.25}
+
+    case = replace(case, skel_discr_stage=discr_stage, **kwargs)
+    logger.info("\n%s", case)
+
+    # {{{ geometry
+
+    dd = sym.DOFDescriptor(case.name, discr_stage=case.skel_discr_stage)
+    qbx = case.get_layer_potential(actx, case.resolutions[-1], case.target_order)
+    places = GeometryCollection(qbx, auto_where=dd)
+
+    density_discr = places.get_discretization(dd.geometry, dd.discr_stage)
+    max_particles_in_box = case.max_particles_in_box_for_discr(density_discr)
+
+    tgt_src_index, _ = case.get_tgt_src_cluster_index(
+        actx, places, dd, max_particles_in_box=max_particles_in_box)
+
+    logger.info("dd %s", dd)
+    logger.info("nclusters %3d ndofs %7d",
+        tgt_src_index.nclusters, density_discr.ndofs)
+
+    # }}}
+
+    # {{{ construct dense reference
+
+    sym_u, sym_op = case.get_operator(places.ambient_dim)
+
+    if visualize:
+        from pytential.linalg.direct_solver_symbolic import prepare_expr
+        from pytential.symbolic.matrix import MatrixBuilder
+        mat = MatrixBuilder(
+            actx,
+            dep_expr=sym_u,
+            other_dep_exprs=[],
+            dep_discr=density_discr,
+            places=places,
+            context={},
+            )(prepare_expr(places, sym_op, (dd, dd)))
+
+        import pytential.linalg.utils as hla
+        eigs_ref = hla.eigs(mat, k=5)
+        kappa_ref = np.linalg.cond(mat, p=2)
+
+    # }}}
+
+    # {{{ construct hmatrix
+
+    from pytential.linalg.hmatrix import build_hmatrix_by_proxy
+
+    x_ref = actx.thaw(density_discr.nodes()[0])
+    b_ref = bind(places, sym_op, auto_where=dd)(actx, u=x_ref)
+
+    id_eps = 10.0 ** (-np.arange(2, 16))
+    rec_error = np.zeros_like(id_eps)
+
+    if visualize:
+        rec_eigs = np.zeros((id_eps.size, eigs_ref.size), dtype=np.complex128)
+        rec_kappa = np.zeros(id_eps.size)
+
+    from pytools.convergence import EOCRecorder
+    eoc = EOCRecorder()
+
+    for i in range(id_eps.size):
+        wrangler = build_hmatrix_by_proxy(
+            actx, places, sym_op, sym_u,
+            domains=[dd],
+            context=case.knl_concrete_kwargs,
+            id_eps=id_eps[i],
+            rng=rng,
+            _tree_kind=case.tree_kind,
+            _max_particles_in_box=max_particles_in_box,
+            _approx_nproxy=case.proxy_approx_count,
+            _proxy_radius_factor=case.proxy_radius_factor,
+            )
+
+        hmat_inv = wrangler.get_backward()
+        x_hmat = hmat_inv @ b_ref
+
+        if visualize:
+            hmat = wrangler.get_forward()
+            rec_eigs[i, :] = hla.eigs(hmat, k=5, tol=1.0e-6)
+            rec_kappa[i] = hla.cond(hmat, p=2, tol=1.0e-6)
+
+            logger.info("eigs: %s %s", eigs_ref, rec_eigs[i])
+            logger.info("kappa %.12e %.12e", kappa_ref, rec_kappa[i])
+
+        from meshmode.dof_array import flat_norm
+        rec_error[i] = actx.to_numpy(
+            flat_norm(x_hmat - x_ref) / flat_norm(x_ref)
+            )
+        logger.info("id_eps %.2e error: %.12e", id_eps[i], rec_error[i])
+        eoc.add_data_point(id_eps[i], rec_error[i])
+
+    logger.info("\n%s", eoc.pretty_print(
+        abscissa_format="%.8e",
+        error_format="%.8e",
+        eoc_format="%.2f"))
+
+    if not visualize:
+        assert eoc.order_estimate() > 0.6
+
+    # }}}
+
+    if not visualize:
+        return
+
+    import matplotlib.pyplot as pt
+    fig = pt.figure(figsize=(10, 10), dpi=300)
+
+    # {{{ convergence
+
+    ax = fig.gca()
+    ax.loglog(id_eps, id_eps, "k--")
+    ax.loglog(id_eps, rec_error)
+
+    ax.grid(True)
+    ax.set_xlabel(r"$\epsilon_{id}$")
+    ax.set_ylabel("$Error$")
+    ax.set_title(case.name)
+
+    fig.savefig(f"linalg_hmatrix_inverse_{case.name}_{case.op_type}_convergence")
+    fig.clf()
+
+    # }}}
+
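+    # (visualize only) compare the spectrum of the compressed operator to
+    # that of the dense reference computed above
+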
+    # {{{ eigs
+
+    ax = fig.gca()
+    ax.plot(np.real(eigs_ref), np.imag(eigs_ref), "ko")
+    for i in range(id_eps.size):
+        ax.plot(np.real(rec_eigs[i]), np.imag(rec_eigs[i]), "v")
+
+    ax.grid(True)
+    ax.set_xlabel(r"$\Re \lambda$")
+    ax.set_ylabel(r"$\Im \lambda$")
+
+    fig.savefig(f"linalg_hmatrix_inverse_{case.name}_{case.op_type}_eigs")
+    fig.clf()
+
+    # }}}
+
+    if case.ambient_dim == 2:
+        ax = fig.gca()
+
+        from arraycontext import flatten
+        x_hmat = actx.to_numpy(flatten(x_hmat, actx))
+        x_ref = actx.to_numpy(flatten(x_ref, actx))
+
+        ax.semilogy(x_hmat - x_ref)
+        ax.set_ylim([1.0e-16, 1.0])
+        fig.savefig(f"linalg_hmatrix_inverse_{case.name}_{case.op_type}_error")
+        fig.clf()
+
+    pt.close(fig)
+
+# }}}
+
+
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        from pytest import main
+        main([__file__])
+
+# vim: fdm=marker
diff --git a/test/test_linalg_skeletonization.py b/test/test_linalg_skeletonization.py
index 89e0eb091..6f33df48b 100644
--- a/test/test_linalg_skeletonization.py
+++ b/test/test_linalg_skeletonization.py
@@ -140,6 +140,113 @@ def test_skeletonize_symbolic(actx_factory, case, visualize=False):
 # }}}
 
 
+# {{{ test_skeletonize_diagonal
+
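+# NOTE: rec_skeletonize_by_proxy computes the diagonal blocks D at each level
+# from the full cluster self-interaction, while `_update_skeleton_diagonal`
+# subtracts the (clustered) skeleton self-interaction of the previous level,
+# so that self-interactions are not double-counted across levels. This test
+# rebuilds that correction from dense evaluations and compares it to D.
+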
+@pytest.mark.parametrize("case", [
+    SKELETONIZE_TEST_CASES[0],
+    SKELETONIZE_TEST_CASES[1],
+    SKELETONIZE_TEST_CASES[2],
+    ])
+def test_skeletonize_diagonal(actx_factory, case, visualize=False):
+    import scipy.linalg.interpolative as sli  # pylint:disable=no-name-in-module
+    sli.seed(42)
+
+    actx = actx_factory()
+    rng = np.random.default_rng(42)
+
+    if visualize:
+        logging.basicConfig(level=logging.INFO)
+
+    # {{{ setup
+
+    dd = sym.DOFDescriptor(case.name, discr_stage=case.skel_discr_stage)
+    resolution = case.resolutions[-1]
+
+    qbx = case.get_layer_potential(actx, resolution, case.target_order)
+    places = GeometryCollection(qbx, auto_where=dd)
+
+    tgt_src_index, ctree = case.get_tgt_src_cluster_index(actx, places, dd)
+
+    sym_u, sym_op = case.get_operator(places.ambient_dim)
+
+    # }}}
+
+    # {{{ check
+
+    from pytential.linalg.skeletonization import make_skeletonization_wrangler
+    wrangler = make_skeletonization_wrangler(
+        places, sym_op, sym_u,
+        auto_where=dd, context=case.knl_concrete_kwargs)
+
+    from pytential.linalg.skeletonization import rec_skeletonize_by_proxy
+    skeletons = rec_skeletonize_by_proxy(
+        actx, places, ctree, tgt_src_index, sym_op, sym_u,
+        auto_where=dd,
+        context=case.knl_concrete_kwargs,
+        approx_nproxy=case.proxy_approx_count,
+        proxy_radius_factor=case.proxy_radius_factor,
+        id_eps=case.id_eps,
+        rng=rng,
+        _wrangler=wrangler,
+        )
+
+    from pytential.linalg.hmatrix import _update_skeleton_diagonal
+    for i in range(1, skeletons.size):
+        skeletons[i] = _update_skeleton_diagonal(
+            skeletons[i], skeletons[i - 1], ctree.levels[i - 1],
+            )
+
+    from pytential.linalg.cluster import cluster
+    from pytential.linalg.utils import make_flat_cluster_diag
+
+    parent = None
+    for k, clevel in enumerate(ctree.levels):
+        tgt_src_index = skeletons[k].tgt_src_index
+        D1 = wrangler.evaluate_self(actx, places, tgt_src_index, 0, 0)
+        D1 = make_flat_cluster_diag(D1, tgt_src_index)
+
+        if k == 0:
+            D = D0 = D1
+        else:
+            skel_tgt_src_index = skeletons[k - 1].skel_tgt_src_index
+            assert skel_tgt_src_index.shape == tgt_src_index.shape
+
+            D0 = wrangler.evaluate_self(actx, places, skel_tgt_src_index, 0, 0)
+            D0 = cluster(make_flat_cluster_diag(D0, skel_tgt_src_index), parent)
+
+            D = D1 - D0
+
+        parent = clevel
+
+        assert D1.shape == (skeletons[k].nclusters,)
+        assert D1.shape == D0.shape, (D1.shape, D0.shape)
+        assert D1.shape == D.shape, (D1.shape, D.shape)
+
+        for i in range(skeletons[k].nclusters):
+            assert D1[i].shape == skeletons[k].tgt_src_index.cluster_shape(i, i)
+            assert D1[i].shape == D0[i].shape, (D1[i].shape, D0[i].shape)
+
+            error = la.norm(D[i] - skeletons[k].D[i]) / (la.norm(D[i]) + 1.0e-12)
+            logger.info("level %04d / %04d cluster %3d (%4d, %4d) error %.12e",
+                k, ctree.nlevels,
+                ctree.tree_cluster_parent_ids[clevel.box_ids][i],
+                *skeletons[k].tgt_src_index.cluster_shape(i, i), error)
+
+            assert error < 1.0e-15
+
+        logger.info("")
+
+    # }}}
+
+# }}}
+
+
 # {{{ test_skeletonize_by_proxy