diff --git a/examples/scaling-study-hmatrix.py b/examples/scaling-study-hmatrix.py
new file mode 100644
index 000000000..2356906e4
--- /dev/null
+++ b/examples/scaling-study-hmatrix.py
@@ -0,0 +1,198 @@
+__copyright__ = "Copyright (C) 2022 Alexandru Fikl"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+from dataclasses import dataclass
+
+import numpy as np
+
+from meshmode.array_context import PyOpenCLArrayContext
+from pytools.convergence import EOCRecorder
+
+from pytential import GeometryCollection, sym
+
+import logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class Timings:
+    build: float
+    matvec: float
+
+
+def run_hmatrix_matvec(
+        actx: PyOpenCLArrayContext,
+        places: GeometryCollection, *,
+        dofdesc: sym.DOFDescriptor) -> Timings:
+    from sumpy.kernel import LaplaceKernel
+    kernel = LaplaceKernel(places.ambient_dim)
+    sym_u = sym.var("u")
+    sym_op = -0.5 * sym_u + sym.D(kernel, sym_u, qbx_forced_limit="avg")
+
+    density_discr = places.get_discretization(dofdesc.geometry, dofdesc.discr_stage)
+    u = actx.thaw(density_discr.nodes()[0])
+
+    def build_hmat():
+        from pytential.linalg.hmatrix import build_hmatrix_matvec_by_proxy
+        return build_hmatrix_matvec_by_proxy(
+            actx, places, sym_op, sym_u,
+            domains=[dofdesc],
+            context={},
+            auto_where=dofdesc,
+            id_eps=1.0e-10,
+            _tree_kind="adaptive-level-restricted",
+            _approx_nproxy=64,
+            _proxy_radius_factor=1.15)
+
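+    # NOTE: the first build below is only a warmup pass: its wall time is
+    # logged for reference but deliberately not recorded, so that (one
+    # assumes) code generation and cache population do not pollute the
+    # scaling data. Only the second build and the matvec enter the EOC
+    # tables reported by run_scaling_study.
+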
+    # warmup
+    from pytools import ProcessTimer
+    with ProcessTimer() as pt:
+        hmat = build_hmat()
+        actx.queue.finish()
+
+    logger.info("build(warmup): %s", pt)
+
+    # build
+    with ProcessTimer() as pt:
+        hmat = build_hmat()
+        actx.queue.finish()
+
+    t_build = pt.wall_elapsed
+    logger.info("build: %s", pt)
+
+    # matvec
+    with ProcessTimer() as pt:
+        du = hmat @ u
+        assert du is not None
+        actx.queue.finish()
+
+    t_matvec = pt.wall_elapsed
+    logger.info("matvec: %s", pt)
+
+    return Timings(t_build, t_matvec)
+
+
+def run_scaling_study(
+        ambient_dim: int, *,
+        target_order: int = 4,
+        source_ovsmp: int = 4,
+        qbx_order: int = 4,
+        ) -> None:
+    dd = sym.DOFDescriptor(f"d{ambient_dim}", discr_stage=sym.QBX_SOURCE_STAGE2)
+
+    import pyopencl as cl
+    ctx = cl.create_some_context()
+    queue = cl.CommandQueue(ctx)
+    actx = PyOpenCLArrayContext(queue, force_device_scalars=True)
+
+    eoc_build = EOCRecorder()
+    eoc_matvec = EOCRecorder()
+
+    import meshmode.mesh.generation as mgen
+    import meshmode.discretization.poly_element as mpoly
+    resolutions = [64, 128, 256, 512, 1024, 1536, 2048, 2560, 3072]
+
+    for n in resolutions:
+        mesh = mgen.make_curve_mesh(
+            mgen.NArmedStarfish(5, 0.25),
+            np.linspace(0, 1, n),
+            order=target_order)
+
+        from meshmode.discretization import Discretization
+        pre_density_discr = Discretization(actx, mesh,
+            mpoly.InterpolatoryQuadratureGroupFactory(target_order))
+
+        from pytential.qbx import QBXLayerPotentialSource
+        qbx = QBXLayerPotentialSource(
+            pre_density_discr,
+            fine_order=source_ovsmp * target_order,
+            qbx_order=qbx_order,
+            fmm_order=False, fmm_backend=None,
+            )
+        places = GeometryCollection(qbx, auto_where=dd.geometry)
+        density_discr = places.get_discretization(dd.geometry, dd.discr_stage)
+
+        logger.info("ndofs: %d", density_discr.ndofs)
+        logger.info("nelements: %d", density_discr.mesh.nelements)
+
+        timings = run_hmatrix_matvec(actx, places, dofdesc=dd)
+        eoc_build.add_data_point(density_discr.ndofs, timings.build)
+        eoc_matvec.add_data_point(density_discr.ndofs, timings.matvec)
+
+    for name, eoc in [("build", eoc_build), ("matvec", eoc_matvec)]:
+        logger.info("%s\n%s",
+            name, eoc.pretty_print(
+                abscissa_label="dofs",
+                error_label=f"{name} (s)",
+                abscissa_format="%d",
+                error_format="%.3fs",
+                eoc_format="%.2f",
+                )
+            )
+        visualize_eoc(f"scaling-study-hmatrix-{name}", eoc, 1)
+
+
+def visualize_eoc(
+        filename: str, eoc: EOCRecorder, order: int,
+        overwrite: bool = False) -> None:
+    try:
+        import matplotlib.pyplot as plt
+    except ImportError:
+        logger.info("matplotlib not available for plotting")
+        return
+
+    fig = plt.figure(figsize=(10, 10), dpi=300)
+    ax = fig.gca()
+
+    h, error = np.array(eoc.history).T     # type: ignore[no-untyped-call]
+    ax.loglog(h, error, "o-")
+
+    max_h = np.max(h)
+    min_e = np.min(error)
+    max_e = np.max(error)
+    min_h = np.exp(np.log(max_h) + np.log(min_e / max_e) / order)
+
+    ax.loglog(
+        [max_h, min_h], [max_e, min_e], "k-", label=rf"$\mathcal{{O}}(h^{order})$"
+        )
+
+    ax.grid(True, which="major", linestyle="-", alpha=0.75)
+    ax.grid(True, which="minor", linestyle="--", alpha=0.5)
+
+    ax.set_xlabel("$N$")
+    ax.set_ylabel("$T~(s)$")
+
+    import pathlib
+    filename = pathlib.Path(filename)
+    if not overwrite and filename.exists():
+        raise FileExistsError(f"output file '{filename}' already exists")
+
+    fig.savefig(filename)
+    plt.close(fig)
+
+
+if __name__ == "__main__":
+    run_scaling_study(ambient_dim=2)
diff --git a/pytential/linalg/cluster.py b/pytential/linalg/cluster.py
index 33bd97857..9d870bcf5 100644
--- a/pytential/linalg/cluster.py
+++ b/pytential/linalg/cluster.py
@@ -220,6 +220,23 @@ def make_block(i: int, j: int):
     else:
         raise ValueError(f"unsupported ndarray dimension: '{ndim}'")
 
+
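+# NOTE: ``uncluster`` undoes the ``cluster`` operation from this module. As
+# a hypothetical example (sizes made up for illustration): with
+# ``clevel.parent_map = [[0, 1], [2]]`` and child cluster sizes ``(2, 3, 1)``
+# in ``index``, ``ary[0]`` is split back into ``result[0] = ary[0][:2]`` and
+# ``result[1] = ary[0][2:5]``, while ``ary[1]`` yields ``result[2]``.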
+def uncluster(ary: np.ndarray, index: IndexList, clevel: ClusterLevel) -> np.ndarray:
+    assert ary.shape == (clevel.parent_map.size,)
+
+    if index.nclusters == 1:
+        return ary
+
+    result = np.empty(index.nclusters, dtype=object)
+    for ifrom, ppm in enumerate(clevel.parent_map):
+        offset = 0
+        for ito in ppm:
+            cluster_size = index.cluster_size(ito)
+            result[ito] = ary[ifrom][offset:offset + cluster_size]
+            offset += cluster_size
+
+    return result
+
 # }}}
diff --git a/pytential/linalg/hmatrix.py b/pytential/linalg/hmatrix.py
new file mode 100644
index 000000000..dea041f4d
--- /dev/null
+++ b/pytential/linalg/hmatrix.py
@@ -0,0 +1,359 @@
+__copyright__ = "Copyright (C) 2022 Alexandru Fikl"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+from dataclasses import dataclass
+from typing import Any, Dict, Iterable, Optional, Union
+
+import numpy as np
+import numpy.linalg as la
+
+from arraycontext import PyOpenCLArrayContext, ArrayOrContainerT, flatten, unflatten
+from meshmode.dof_array import DOFArray
+
+from pytential import GeometryCollection, sym
+from pytential.linalg.cluster import ClusterTree
+from pytential.linalg.skeletonization import SkeletonizationWrangler
+
+__doc__ = """
+Hierarchical Matrix Construction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+"""
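+
+# NOTE: the entry point in this module is ``build_hmatrix_matvec_by_proxy``
+# below. A minimal usage sketch (geometry setup omitted; see
+# ``examples/scaling-study-hmatrix.py`` from this patch for a runnable
+# version):
+#
+#     hmat = build_hmatrix_matvec_by_proxy(
+#         actx, places, sym_op, sym.var("u"),
+#         auto_where=dofdesc, id_eps=1.0e-10)
+#     b = hmat @ u        # u is a DOFArray; equivalent to hmat.matvec(u)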
+
+
+# {{{ ProxyHierarchicalMatrix
+
+@dataclass(frozen=True)
+class ProxyHierarchicalMatrix:
+    """
+    .. attribute:: skeletons
+
+        An :class:`~numpy.ndarray` containing skeletonization information
+        for each level of the hierarchy. For additional details, see
+        :class:`~pytential.linalg.skeletonization.SkeletonizationResult`.
+
+    This class implements the :class:`scipy.sparse.linalg.LinearOperator`
+    interface. In particular, it provides the following attributes and
+    methods:
+
+    .. attribute:: shape
+
+        A :class:`tuple` that gives the matrix size ``(m, n)``.
+
+    .. attribute:: dtype
+
+        The data type of the matrix entries.
+
+    .. automethod:: matvec
+    .. automethod:: __matmul__
+    """
+
+    wrangler: SkeletonizationWrangler
+    ctree: ClusterTree
+    method: str
+
+    skeletons: np.ndarray
+
+    @property
+    def shape(self):
+        return self.skeletons[0].tgt_src_index.shape
+
+    @property
+    def dtype(self):
+        # FIXME: assert that everyone has this dtype?
+        return self.skeletons[0].R[0].dtype
+
+    @property
+    def nlevels(self):
+        return self.skeletons.size
+
+    @property
+    def nclusters(self):
+        return self.skeletons[0].nclusters
+
+    def matvec(self, x: ArrayOrContainerT) -> ArrayOrContainerT:
+        """Implements a matrix-vector multiplication :math:`H x`."""
+        from arraycontext import get_container_context_recursively_opt
+        actx = get_container_context_recursively_opt(x)
+        if actx is None:
+            raise ValueError("input array is frozen")
+
+        if self.method == "forward":
+            return apply_skeleton_matvec(actx, self, x)
+        elif self.method == "backward":
+            return apply_skeleton_inverse_matvec(actx, self, x)
+        else:
+            raise ValueError(f"unknown matvec method: '{self.method}'")
+
+    def __matmul__(self, x: ArrayOrContainerT) -> ArrayOrContainerT:
+        """Same as :meth:`matvec`."""
+        return self.matvec(x)
+
+    def rmatvec(self, x):
+        raise NotImplementedError
+
+    def matmat(self, mat):
+        raise NotImplementedError
+
+    def rmatmat(self, mat):
+        raise NotImplementedError
+
+
+def apply_skeleton_matvec(
+        actx: PyOpenCLArrayContext,
+        hmat: ProxyHierarchicalMatrix,
+        ary: ArrayOrContainerT,
+        ) -> ArrayOrContainerT:
+    if not isinstance(ary, DOFArray):
+        raise TypeError(f"unsupported array container: '{type(ary).__name__}'")
+
+    from pytential.linalg.utils import split_array
+    targets, sources = hmat.skeletons[0].tgt_src_index
+    x = split_array(actx.to_numpy(flatten(ary, actx)), sources)
+
+    # NOTE: this computes a telescoping product of the form
+    #
+    #   A x_0 = (D0 + L0 (D1 + L1 (...) R1) R0) x_0
+    #
+    # with arbitrary numbers of levels. When recursing down, we compute
+    #
+    #   x_{k + 1} = R_k x_k
+    #   z_{k + 1} = D_k x_k
+    #
+    # and, at the root level, we have
+    #
+    #   x_{N + 1} = z_{N + 1} = D_N x_N.
+    #
+    # When recursing back up, we take `b_N = z_{N + 1}` and
+    #
+    #   b_k = z_{k + 1} + L_k b_{k + 1},
+    #
+    # which gives back the desired product `b_0` when we reach the leaf
+    # level again.
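+    #
+    # As a concrete illustration, with three levels (a leaf, one
+    # intermediate level and the root) the factorization reads
+    #
+    #   A x_0 ~ (D_0 + L_0 (D_1 + L_1 D_2 R_1) R_0) x_0,
+    #
+    # so the downward sweep stores z_1 = D_0 x_0 and z_2 = D_1 x_1 along
+    # with x_1 = R_0 x_0 and x_2 = R_1 x_1, the root contributes
+    # b_2 = z_3 = D_2 x_2, and the upward sweep forms b_1 = z_2 + L_1 b_2
+    # and finally b_0 = z_1 + L_0 b_1.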
+
+    d_dot_x = np.empty(hmat.nlevels, dtype=object)
+
+    # {{{ recurse down
+
+    from pytential.linalg.cluster import cluster
+    clevels = list(hmat.ctree.levels(root=True))
+
+    for k, clevel in enumerate(clevels):
+        skeleton = hmat.skeletons[k]
+        assert x.shape == (skeleton.nclusters,)
+        assert skeleton.tgt_src_index.shape[1] == sum([xi.size for xi in x])
+
+        d_dot_x_k = np.empty(skeleton.nclusters, dtype=object)
+        r_dot_x_k = np.empty(skeleton.nclusters, dtype=object)
+
+        for i in range(skeleton.nclusters):
+            r_dot_x_k[i] = skeleton.R[i] @ x[i]
+            d_dot_x_k[i] = skeleton.D[i] @ x[i]
+
+        d_dot_x[k] = d_dot_x_k
+        x = cluster(r_dot_x_k, clevel)
+
+    # }}}
+
+    # {{{ root
+
+    # NOTE: at root level, we just multiply with the full diagonal
+    b = d_dot_x[hmat.nlevels - 1]
+    assert b.shape == (1,)
+
+    # }}}
+
+    # {{{ recurse up
+
+    from pytential.linalg.cluster import uncluster
+
+    for k, clevel in reversed(list(enumerate(clevels[:-1]))):
+        skeleton = hmat.skeletons[k]
+        d_dot_x_k = d_dot_x[k]
+        assert d_dot_x_k.shape == (skeleton.nclusters,)
+
+        b = uncluster(b, skeleton.skel_tgt_src_index.targets, clevel)
+        for i in range(skeleton.nclusters):
+            b[i] = d_dot_x_k[i] + skeleton.L[i] @ b[i]
+
+    assert b.shape == (hmat.nclusters,)
+
+    # }}}
+
+    b = np.concatenate(b)[np.argsort(targets.indices)]
+    return unflatten(ary, actx.from_numpy(b), actx)
+
+
+def apply_skeleton_inverse_matvec(
+        actx: PyOpenCLArrayContext,
+        hmat: ProxyHierarchicalMatrix,
+        ary: ArrayOrContainerT,
+        ) -> ArrayOrContainerT:
+    if not isinstance(ary, DOFArray):
+        raise TypeError(f"unsupported array container: '{type(ary).__name__}'")
+
+    from pytential.linalg.utils import split_array
+    targets, sources = hmat.skeletons[0].tgt_src_index
+
+    b = split_array(actx.to_numpy(flatten(ary, actx)), targets)
+    dhat_dot_b = np.empty(hmat.nlevels, dtype=object)
+
+    # {{{ recurse down
+
+    from pytential.linalg.cluster import cluster
+    clevels = list(hmat.ctree.levels(root=True))
+
+    for k, clevel in enumerate(clevels):
+        skeleton = hmat.skeletons[k]
+        assert b.shape == (skeleton.nclusters,)
+        assert skeleton.tgt_src_index.shape[0] == sum([bi.size for bi in b])
+
+        dhat_dot_b_k = np.empty(skeleton.nclusters, dtype=object)
+        for i in range(skeleton.nclusters):
+            dhat_dot_b_k[i] = (
+                skeleton.Dhat[i] @ (skeleton.R[i] @ (skeleton.invD[i] @ b[i]))
+                )
+
+        dhat_dot_b[k] = dhat_dot_b_k
+        b = cluster(dhat_dot_b_k, clevel)
+
+    # }}}
+
+    # {{{ root
+
+    from pytools.obj_array import make_obj_array
+    assert b.shape == (1,)
+    x = make_obj_array([
+        la.solve(D, bi) for D, bi in zip(hmat.skeletons[-1].D, b)
+        ])
+
+    # }}}
+
+    # {{{ recurse up
+
+    from pytential.linalg.cluster import uncluster
+
+    for k, clevel in reversed(list(enumerate(clevels[:-1]))):
+        skeleton = hmat.skeletons[k]
+        dhat_dot_b_k0 = dhat_dot_b[k]
+        dhat_dot_b_k1 = dhat_dot_b[k + 1]
+        assert dhat_dot_b_k0.shape == (skeleton.nclusters,)
+
+        x = uncluster(x, skeleton.skel_tgt_src_index.sources, clevel)
+        dhat_dot_b_k1 = uncluster(
+            dhat_dot_b_k1, skeleton.skel_tgt_src_index.sources, clevel)
+
+        for i in range(skeleton.nclusters):
+            x[i] = skeleton.invD[i] @ (
+                dhat_dot_b_k0[i]
+                - skeleton.L[i] @ dhat_dot_b_k1[i]
+                + skeleton.L[i] @ (skeleton.Dhat[i] @ x[i])
+                )
+
+    assert x.shape == (hmat.nclusters,)
+
+    # }}}
+
+    x = np.concatenate(x)[np.argsort(sources.indices)]
+    return unflatten(ary, actx.from_numpy(x), actx)
+
+# }}}
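+
+
+# NOTE: ``matvec="forward"`` in ``build_hmatrix_matvec_by_proxy`` below
+# selects ``apply_skeleton_matvec`` (evaluate :math:`H x`), while
+# ``matvec="backward"`` selects ``apply_skeleton_inverse_matvec``, which
+# applies an approximate inverse built from the same skeletons. The backward
+# path is still experimental: ``test_hmatrix_backward_matvec`` is currently
+# skipped.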
+
+
+# {{{ build_hmatrix_matvec_by_proxy
+
+def build_hmatrix_matvec_by_proxy(
+        actx: PyOpenCLArrayContext,
+        places: GeometryCollection,
+        exprs: Union[sym.Expression, Iterable[sym.Expression]],
+        input_exprs: Union[sym.Expression, Iterable[sym.Expression]], *,
+        matvec: str = "forward",
+        auto_where: Optional[sym.DOFDescriptorLike] = None,
+        domains: Optional[Iterable[sym.DOFDescriptorLike]] = None,
+        context: Optional[Dict[str, Any]] = None,
+        id_eps: float = 1.0e-8,
+
+        # NOTE: these are dev variables and can disappear at any time!
+        # TODO: plug in an error model to get an estimate for:
+        #   * how many points we want per cluster?
+        #   * how many proxy points we want?
+        #   * how far away should the proxy points be?
+        # based on id_eps. How many of these should be user tunable?
+        _tree_kind: Optional[str] = "adaptive-level-restricted",
+        _max_particles_in_box: Optional[int] = None,
+
+        _approx_nproxy: Optional[int] = None,
+        _proxy_radius_factor: Optional[float] = None,
+        ):
+    if matvec not in ("forward", "backward"):
+        raise ValueError(f"unknown matvec: '{matvec}'")
+
+    from pytential.linalg.skeletonization import make_skeletonization_wrangler
+    wrangler = make_skeletonization_wrangler(
+        places, exprs, input_exprs,
+        domains=domains, context=context, auto_where=auto_where)
+
+    if wrangler.nrows != 1 or wrangler.ncols != 1:
+        raise ValueError("multi-block operators are not supported")
+
+    from pytential.linalg.proxy import QBXProxyGenerator
+    proxy = QBXProxyGenerator(places,
+        approx_nproxy=_approx_nproxy,
+        radius_factor=_proxy_radius_factor)
+
+    from pytential.linalg.cluster import partition_by_nodes
+    cluster_index, ctree = partition_by_nodes(
+        actx, places,
+        dofdesc=wrangler.domains[0],
+        tree_kind=_tree_kind,
+        max_particles_in_box=_max_particles_in_box)
+
+    from pytential.linalg.utils import TargetAndSourceClusterList
+    tgt_src_index = TargetAndSourceClusterList(
+        targets=cluster_index, sources=cluster_index)
+
+    from pytential.linalg.skeletonization import rec_skeletonize_by_proxy
+    skeletons = rec_skeletonize_by_proxy(
+        actx, places, ctree, tgt_src_index, exprs, input_exprs,
+        id_eps=id_eps,
+        max_particles_in_box=_max_particles_in_box,
+        _proxy=proxy,
+        _wrangler=wrangler,
+        )
+
+    from pytential.linalg.skeletonization import _fix_skeleton_diagonal
+    parent = None
+
+    for i, clevel in enumerate(ctree.levels(root=True)):
+        if i == 0:
+            parent = clevel
+            continue
+
+        D = skeletons[i - 1].Dhat if matvec == "backward" else None
+        skeletons[i] = _fix_skeleton_diagonal(
+            skeletons[i], skeletons[i - 1], parent, diagonal=D)
+
+        parent = clevel
+
+    return ProxyHierarchicalMatrix(
+        wrangler=wrangler, ctree=ctree, method=matvec, skeletons=skeletons)
+
+# }}}
+
+# vim: foldmethod=marker
diff --git a/pytential/linalg/skeletonization.py b/pytential/linalg/skeletonization.py
index b7340965f..4de7703a1 100644
--- a/pytential/linalg/skeletonization.py
+++ b/pytential/linalg/skeletonization.py
@@ -26,8 +26,10 @@
     TYPE_CHECKING)
 
 import numpy as np
+import numpy.linalg as la
 
 from arraycontext import PyOpenCLArrayContext, Array, ArrayT
+from pytools import memoize_method
 
 from pytential import GeometryCollection, sym
 from pytential.linalg.proxy import ProxyGeneratorBase, ProxyClusterGeometryData
@@ -637,8 +639,8 @@ def _skeletonize_block_by_proxy_with_mats(
             ibrow=ibrow, ibcol=ibcol)
     )
 
-    src_skl_indices = np.empty(nclusters, dtype=object)
-    tgt_skl_indices = np.empty(nclusters, dtype=object)
+    src_skel_indices = np.empty(nclusters, dtype=object)
+    tgt_skel_indices = np.empty(nclusters, dtype=object)
     skel_starts = np.zeros(nclusters + 1, dtype=np.int32)
 
     L = np.empty(nclusters, dtype=object)
@@ -661,14 +663,14 @@ def _skeletonize_block_by_proxy_with_mats(
         assert k > 0
 
         L[i] = interp.T
-        tgt_skl_indices[i] = tgt_src_index.targets.cluster_indices(i)[idx[:k]]
+        tgt_skel_indices[i] = tgt_src_index.targets.cluster_indices(i)[idx[:k]]
 
         # skeletonize source points
         k, idx, interp = interp_decomp(src_mat, rank=k, eps=None)
         assert k > 0
 
         R[i] = interp
-        src_skl_indices[i] = tgt_src_index.sources.cluster_indices(i)[idx[:k]]
+        src_skel_indices[i] = tgt_src_index.sources.cluster_indices(i)[idx[:k]]
         skel_starts[i + 1] = skel_starts[i] + k
         assert R[i].shape == (k, src_mat.shape[1])
@@ -679,9 +681,9 @@ def _skeletonize_block_by_proxy_with_mats(
     D = make_flat_cluster_diag(mat, tgt_src_index)
 
     from pytential.linalg import TargetAndSourceClusterList, make_index_list
-    src_skl_index = make_index_list(np.hstack(src_skl_indices), skel_starts)
-    tgt_skl_index = make_index_list(np.hstack(tgt_skl_indices), skel_starts)
-    skel_tgt_src_index = TargetAndSourceClusterList(tgt_skl_index, src_skl_index)
+    src_skel_index = make_index_list(np.hstack(src_skel_indices), skel_starts)
+    tgt_skel_index = make_index_list(np.hstack(tgt_skel_indices), skel_starts)
+    skel_tgt_src_index = TargetAndSourceClusterList(tgt_skel_index, src_skel_index)
 
     return SkeletonizationResult(
         L=L, R=R, D=D,
@@ -776,6 +778,21 @@ def __post_init__(self):
     def nclusters(self):
         return self.tgt_src_index.nclusters
 
+    @property
+    @memoize_method
+    def invD(self) -> np.ndarray:
+        from pytools.obj_array import make_obj_array
+        return make_obj_array([la.inv(D) for D in self.D])
+
+    @property
+    @memoize_method
+    def Dhat(self) -> np.ndarray:
+        from pytools.obj_array import make_obj_array
+        return make_obj_array([
+            la.inv(self.R[i] @ self.invD[i] @ self.L[i])
+            for i in range(self.nclusters)
+            ])
+
 
 def skeletonize_by_proxy(
         actx: PyOpenCLArrayContext,
@@ -866,7 +883,8 @@ def _evaluate_root_diagonal(
 def _fix_skeleton_diagonal(
         skeleton: SkeletonizationResult,
         parent: Optional[SkeletonizationResult],
-        clevel: Optional[ClusterLevel]) -> SkeletonizationResult:
+        clevel: Optional[ClusterLevel],
+        diagonal: Optional[np.ndarray] = None) -> SkeletonizationResult:
     """Due to the evaluation in :func:`_skeletonize_block_by_proxy_with_mats`,
     the diagonal matrix in *skeleton* also contains the indices from its
     parent. In particular, at a level :math:`l` we need the diagonal block::
@@ -885,6 +903,12 @@
     assert parent is not None
     assert skeleton.tgt_src_index.shape == parent.skel_tgt_src_index.shape
 
+    if diagonal is None:
+        diagonal = np.zeros(parent.nclusters)
+
+    assert diagonal.size == parent.nclusters
+    targets, sources = parent.skel_tgt_src_index
+
     # FIXME: nicer way to do this?
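+    # NOTE: the loop below overwrites, for each cluster at this level, the
+    # diagonal sub-blocks that correspond to its children's skeletons: with
+    # zeros in the forward case and with the parent's ``Dhat`` blocks when
+    # *diagonal* is supplied (as done for the backward solve in
+    # ``pytential/linalg/hmatrix.py``).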
mat = np.empty(skeleton.nclusters, dtype=object) for k in range(skeleton.nclusters): @@ -892,9 +916,9 @@ def _fix_skeleton_diagonal( i = j = 0 for icluster in clevel.parent_map[k]: - di = parent.skel_tgt_src_index.targets.cluster_size(icluster) - dj = parent.skel_tgt_src_index.sources.cluster_size(icluster) - D[np.s_[i:i + di], np.s_[j:j + dj]] = 0.0 + di = targets.cluster_size(icluster) + dj = sources.cluster_size(icluster) + D[np.s_[i:i + di], np.s_[j:j + dj]] = diagonal[icluster] i += di j += dj @@ -965,9 +989,6 @@ def rec_skeletonize_by_proxy( id_rank=None, max_particles_in_box=max_particles_in_box) - skeleton = _fix_skeleton_diagonal( - skeleton, skel_per_level[i - 1], parent_level) - skel_per_level[i] = skeleton tgt_src_index = cluster(skeleton.skel_tgt_src_index, clevel) parent_level = clevel @@ -978,7 +999,6 @@ def rec_skeletonize_by_proxy( # evaluate the root skeleton = _evaluate_root_diagonal(actx, 0, 0, places, wrangler, tgt_src_index) - skeleton = _fix_skeleton_diagonal(skeleton, skel_per_level[-2], parent_level) skel_per_level[-1] = skeleton return skel_per_level diff --git a/pytential/linalg/utils.py b/pytential/linalg/utils.py index c1f3115d4..2007d9747 100644 --- a/pytential/linalg/utils.py +++ b/pytential/linalg/utils.py @@ -88,6 +88,9 @@ def cluster_size(self, i: int) -> int: return self.starts[i + 1] - self.starts[i] + def cluster_slice(self, i: int) -> slice: + return np.s_[self.starts[i]:self.starts[i + 1]] + def cluster_indices(self, i: int) -> np.ndarray: """ :returns: a view into the :attr:`indices` array for the range @@ -97,7 +100,7 @@ def cluster_indices(self, i: int) -> np.ndarray: raise IndexError( f"cluster {i} is out of bounds for {self.nclusters} clusters") - return self.indices[self.starts[i]:self.starts[i + 1]] + return self.indices[self.cluster_slice(i)] def cluster_take(self, x: np.ndarray, i: int) -> np.ndarray: """ @@ -334,6 +337,15 @@ def make_flat_cluster_diag( return cluster_mat + +def split_array(x: np.ndarray, index: IndexList) -> np.ndarray: + assert x.size == index.nindices + + from pytools.obj_array import make_obj_array + return make_obj_array([ + index.cluster_take(x, i) for i in range(index.nclusters) + ]) + # }}} @@ -440,6 +452,22 @@ def mnorm(x: np.ndarray, y: np.ndarray) -> float: return tgt_error, src_error +def skeletonization_matrix( + mat: np.ndarray, skeleton: "SkeletonizationResult", + ) -> Tuple[np.ndarray, np.ndarray]: + D = np.empty(skeleton.nclusters, dtype=object) + S = np.empty((skeleton.nclusters, skeleton.nclusters), dtype=object) + + from itertools import product + for i, j in product(range(skeleton.nclusters), repeat=2): + if i == j: + D[i] = skeleton.tgt_src_index.cluster_take(mat, i, i) + else: + S[i, j] = skeleton.skel_tgt_src_index.cluster_take(mat, i, j) + + return D, S + + def skeletonization_error( mat: np.ndarray, skeleton: "SkeletonizationResult", *, ord: Optional[float] = None, diff --git a/test/test_linalg_hmatrix.py b/test/test_linalg_hmatrix.py new file mode 100644 index 000000000..0b9c69709 --- /dev/null +++ b/test/test_linalg_hmatrix.py @@ -0,0 +1,302 @@ +__copyright__ = "Copyright (C) 2022 Alexandru Fikl" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is 
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+from dataclasses import replace
+import pytest
+
+import numpy as np
+
+from pytential import bind, sym
+from pytential import GeometryCollection
+
+from meshmode.mesh.generation import NArmedStarfish
+
+from meshmode import _acf       # noqa: F401
+from arraycontext import pytest_generate_tests_for_array_contexts
+from meshmode.array_context import PytestPyOpenCLArrayContextFactory
+
+import extra_matrix_data as extra
+import logging
+logger = logging.getLogger(__name__)
+
+pytest_generate_tests = pytest_generate_tests_for_array_contexts([
+    PytestPyOpenCLArrayContextFactory,
+    ])
+
+
+HMATRIX_TEST_CASES = [
+        extra.CurveTestCase(
+            name="starfish",
+            op_type="scalar",
+            target_order=4,
+            curve_fn=NArmedStarfish(5, 0.25),
+            resolutions=[256]),
+        extra.CurveTestCase(
+            name="starfish",
+            op_type="double",
+            target_order=6,
+            curve_fn=NArmedStarfish(5, 0.25),
+            resolutions=[256]),
+        extra.TorusTestCase(
+            target_order=4,
+            op_type="scalar",
+            resolutions=[0])
+        ]
+
+
+# {{{ test_hmatrix_forward_matvec
+
+@pytest.mark.parametrize("case", HMATRIX_TEST_CASES)
+@pytest.mark.parametrize("discr_stage", [
+    sym.QBX_SOURCE_STAGE1, sym.QBX_SOURCE_STAGE2
+    ])
+def test_hmatrix_forward_matvec(actx_factory, case, discr_stage, visualize=False):
+    actx = actx_factory()
+
+    if visualize:
+        logging.basicConfig(level=logging.INFO)
+
+    kwargs = {"proxy_approx_count": 64, "proxy_radius_factor": 1.25}
+
+    case = replace(case, skel_discr_stage=discr_stage, **kwargs)
+    logger.info("\n%s", case)
+
+    # {{{ geometry
+
+    dd = sym.DOFDescriptor(case.name, discr_stage=case.skel_discr_stage)
+    qbx = case.get_layer_potential(actx, case.resolutions[-1], case.target_order)
+    places = GeometryCollection(qbx, auto_where=dd)
+
+    density_discr = places.get_discretization(dd.geometry, dd.discr_stage)
+    tgt_src_index, _ = case.get_tgt_src_cluster_index(actx, places, dd)
+
+    logger.info("dd %s", dd)
+    logger.info("nclusters %3d ndofs %7d",
+        tgt_src_index.nclusters, density_discr.ndofs)
+
+    # }}}
+
+    # {{{ construct hmatrix
+
+    from pytential.linalg.hmatrix import build_hmatrix_matvec_by_proxy
+    sym_u, sym_op = case.get_operator(places.ambient_dim)
+
+    x = actx.thaw(density_discr.nodes()[0])
+    r_lpot = bind(places, sym_op, auto_where=dd)(actx, u=x)
+
+    id_eps = 10.0 ** (-np.arange(2, 16))
+    rec_error = np.zeros_like(id_eps)
+
+    from pytools.convergence import EOCRecorder
+    eoc = EOCRecorder()
+
+    for i in range(id_eps.size):
+        hmat = build_hmatrix_matvec_by_proxy(actx, places, sym_op, sym_u,
+            domains=[dd],
+            context=case.knl_concrete_kwargs,
+            id_eps=id_eps[i],
+            matvec="forward",
+            _tree_kind=case.tree_kind,
+            _max_particles_in_box=case.max_particles_in_box,
+            _approx_nproxy=case.proxy_approx_count,
+            _proxy_radius_factor=case.proxy_radius_factor,
+            )
+        r_hmap = hmat @ x
+
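+        # NOTE: the recompression error is expected to roughly track id_eps
+        # (compare the dashed reference line in the plot below); the assert
+        # is only a coarse guard against gross accuracy failures.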
+        from meshmode.dof_array import flat_norm
+        rec_error[i] = actx.to_numpy(
+            flat_norm(r_hmap - r_lpot) / flat_norm(r_lpot)
+            )
+        logger.info("id_eps %.2e error: %.12e", id_eps[i], rec_error[i])
+        assert rec_error[i] < 0.1
+
+        eoc.add_data_point(id_eps[i], rec_error[i])
+
+    logger.info("\n%s", eoc.pretty_print(
+        abscissa_format="%.8e",
+        error_format="%.8e",
+        eoc_format="%.2f"))
+
+    # }}}
+
+    if not visualize:
+        return
+
+    import matplotlib.pyplot as pt
+    fig = pt.figure(figsize=(10, 10), dpi=300)
+    ax = fig.gca()
+
+    ax.loglog(id_eps, id_eps, "k--")
+    ax.loglog(id_eps, rec_error)
+
+    ax.grid(True)
+    ax.set_xlabel(r"$\epsilon_{id}$")
+    ax.set_ylabel("$Error$")
+    ax.set_title(case.name)
+
+    fig.savefig(f"linalg_hmatrix_matvec_{case.name}_{case.op_type}_convergence")
+
+    if case.ambient_dim == 2:
+        fig.clf()
+        ax = fig.gca()
+
+        from arraycontext import flatten
+        r_hmap = actx.to_numpy(flatten(r_hmap, actx))
+        r_lpot = actx.to_numpy(flatten(r_lpot, actx))
+
+        ax.semilogy(r_hmap - r_lpot)
+        ax.set_ylim([1.0e-16, 1.0])
+        fig.savefig(f"linalg_hmatrix_matvec_{case.name}_{case.op_type}_error")
+
+    pt.close(fig)
+
+# }}}
+
+
+# {{{ test_hmatrix_backward_matvec
+
+@pytest.mark.skip
+@pytest.mark.parametrize("case", HMATRIX_TEST_CASES)
+@pytest.mark.parametrize("discr_stage", [
+    sym.QBX_SOURCE_STAGE1, sym.QBX_SOURCE_STAGE2
+    ])
+def test_hmatrix_backward_matvec(actx_factory, case, discr_stage, visualize=False):
+    actx = actx_factory()
+
+    if visualize:
+        logging.basicConfig(level=logging.INFO)
+
+    kwargs = {"proxy_approx_count": 64, "proxy_radius_factor": 1.25}
+
+    case = replace(case, skel_discr_stage=discr_stage, **kwargs)
+    logger.info("\n%s", case)
+
+    # {{{ geometry
+
+    dd = sym.DOFDescriptor(case.name, discr_stage=case.skel_discr_stage)
+    qbx = case.get_layer_potential(actx, case.resolutions[-1], case.target_order)
+    places = GeometryCollection(qbx, auto_where=dd)
+
+    density_discr = places.get_discretization(dd.geometry, dd.discr_stage)
+    tgt_src_index, _ = case.get_tgt_src_cluster_index(actx, places, dd)
+
+    logger.info("dd %s", dd)
+    logger.info("nclusters %3d ndofs %7d",
+        tgt_src_index.nclusters, density_discr.ndofs)
+
+    # }}}
+
+    # {{{ construct hmatrix
+
+    from pytential.linalg.hmatrix import build_hmatrix_matvec_by_proxy
+    sym_u, sym_op = case.get_operator(places.ambient_dim)
+
+    x_ref = actx.thaw(density_discr.nodes()[0])
+    b_ref = bind(places, sym_op, auto_where=dd)(actx, u=x_ref)
+
+    id_eps = 10.0 ** (-np.arange(2, 16))
+    rec_error = np.zeros_like(id_eps)
+
+    from pytools.convergence import EOCRecorder
+    eoc = EOCRecorder()
+
+    for i in range(id_eps.size):
+        hmat = build_hmatrix_matvec_by_proxy(actx, places, sym_op, sym_u,
+            domains=[dd],
+            context=case.knl_concrete_kwargs,
+            id_eps=id_eps[i],
+            matvec="backward",
+            _tree_kind=case.tree_kind,
+            _max_particles_in_box=case.max_particles_in_box,
+            _approx_nproxy=case.proxy_approx_count,
+            _proxy_radius_factor=case.proxy_radius_factor,
+            )
+        x_hmat = hmat @ b_ref
+
+        from meshmode.dof_array import flat_norm
+        rec_error[i] = actx.to_numpy(
+            flat_norm(x_hmat - x_ref) / flat_norm(x_ref)
+            )
+        logger.info("id_eps %.2e error: %.12e", id_eps[i], rec_error[i])
+        assert rec_error[i] < 0.1
+
+        eoc.add_data_point(id_eps[i], rec_error[i])
+
+    logger.info("\n%s", eoc.pretty_print(
+        abscissa_format="%.8e",
+        error_format="%.8e",
+        eoc_format="%.2f"))
+
+    # }}}
+
+    if not visualize:
+        return
+
+    import matplotlib.pyplot as pt
+    fig = pt.figure(figsize=(10, 10), dpi=300)
+    ax = fig.gca()
+
+    ax.loglog(id_eps, id_eps, "k--")
+    ax.loglog(id_eps, rec_error)
+
+    ax.grid(True)
+    ax.set_xlabel(r"$\epsilon_{id}$")
+    ax.set_ylabel("$Error$")
+    ax.set_title(case.name)
+
+    fig.savefig(f"linalg_hmatrix_inverse_{case.name}_{case.op_type}_convergence")
+
+    if case.ambient_dim == 2:
+        fig.clf()
+        ax = fig.gca()
+
+        from arraycontext import flatten
+        x_hmat = actx.to_numpy(flatten(x_hmat, actx))
+        x_ref = actx.to_numpy(flatten(x_ref, actx))
+
+        ax.semilogy(x_hmat - x_ref)
+        ax.set_ylim([1.0e-16, 1.0])
+        fig.savefig(f"linalg_hmatrix_inverse_{case.name}_{case.op_type}_error")
+
+    pt.close(fig)
+
+# }}}
+
+
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        from pytest import main
+        main([__file__])
+
+# vim: fdm=marker