From fead46f49e9bf0c0321b6ac1f19f1213a0f4712e Mon Sep 17 00:00:00 2001
From: Matthew Scroggs <matthew.w.scroggs@gmail.com>
Date: Mon, 19 Aug 2024 09:08:14 +0100
Subject: [PATCH] Reimplement parallel assembly (#277)

* restructure into folders

* kifmm version

* remove unused tests

* reimplement parallel assembly

* use ids in parallel test to account for rearranged cell indices

* ndgrid branch

* clippy

* ndgrid main

* update examples, tests, benches

* clippy
---
 Cargo.toml                                    |  20 +-
 benches/assembly_benchmark.rs                 |   5 +-
 examples/assembly.rs                          |   5 +-
 examples/test_parallel_assembly.rs            | 253 ++++++++++++++++++
 src/assembly.rs                               |   2 +-
 src/assembly/batched.rs                       |  27 +-
 src/assembly/batched/boundary.rs              |  90 ++++---
 .../{ => boundary}/adjoint_double_layer.rs    |   5 +-
 .../batched/{ => boundary}/double_layer.rs    |   5 +-
 .../batched/{ => boundary}/hypersingular.rs   |   5 +-
 .../batched/{ => boundary}/single_layer.rs    |   5 +-
 src/assembly/batched/potential.rs             |   5 +-
 .../double_layer.rs}                          |   3 +-
 .../single_layer.rs}                          |   3 +-
 src/assembly/fmm_tools.rs                     |   2 +-
 src/function/function_space.rs                |  10 +-
 src/function/function_space/parallel.rs       |  64 +++--
 src/function/function_space/serial.rs         |   2 +-
 src/lib.rs                                    |   7 +-
 src/traits.rs                                 |   6 +-
 src/traits/function.rs                        |  32 ++-
 tests/compare_to_bempp_cl.rs                  |   5 +-
 tests/fmm.rs.bak                              |   5 +-
 tests/mixed_assembly.rs.bak                   |   5 +-
 24 files changed, 431 insertions(+), 140 deletions(-)
 create mode 100644 examples/test_parallel_assembly.rs
 rename src/assembly/batched/{ => boundary}/adjoint_double_layer.rs (98%)
 rename src/assembly/batched/{ => boundary}/double_layer.rs (98%)
 rename src/assembly/batched/{ => boundary}/hypersingular.rs (99%)
 rename src/assembly/batched/{ => boundary}/single_layer.rs (97%)
 rename src/assembly/batched/{double_layer_potential.rs => potential/double_layer.rs} (98%)
 rename src/assembly/batched/{single_layer_potential.rs => potential/single_layer.rs} (98%)

diff --git a/Cargo.toml b/Cargo.toml
index 4ccf75b8..d95a28de 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,5 +1,5 @@
 [features]
-mpi = ["dep:mpi"]
+mpi = ["dep:mpi", "ndelement/mpi", "ndgrid/mpi"]
 strict = []
 
 [package]
@@ -20,28 +20,22 @@ name = "bempp"
 crate-type = ["lib", "cdylib"]
 
 [dependencies]
-approx = "0.5"
-cauchy = "0.4.*"
 itertools = "0.13.*"
 mpi = { version = "0.8.*", optional = true }
 num = "0.4"
-paste = "1.*"
 lazy_static = "1.4"
-libc = "0.2"
-log = "0.4"
 ndelement = { git = "https://github.com/bempp/ndelement.git" }
 ndgrid = { git = "https://github.com/bempp/ndgrid.git" }
 rayon = "1.9"
-rand = "0.8.5"
 rlst = { version = "0.2" }
 green-kernels = { git = "https://github.com/bempp/green-kernels.git" }
-thiserror="1.*"
 
 [dev-dependencies]
+approx = "0.5"
+paste = "1.*"
+cauchy = "0.4.*"
 criterion = { version = "0.5.*", features = ["html_reports"]}
-kifmm = { version = "0.1" }
-blas-src = { version = "0.10", features = ["blis"]}
-lapack-src = { version = "0.10", features = ["netlib"]}
+# kifmm = { version = "1.0" }
 
 [[bench]]
 name = "assembly_benchmark"
@@ -52,7 +46,3 @@ cargo-args = ["-Zunstable-options", "-Zrustdoc-scrape-examples"]
 
 [lints.clippy]
 wildcard_imports = "forbid"
-
-[target.aarch64-apple-darwin.dev-dependencies]
-blas-src = { version = "0.10", features = ["accelerate"]}
-lapack-src = { version = "0.10", features = ["accelerate"]}
diff --git a/benches/assembly_benchmark.rs b/benches/assembly_benchmark.rs
index caa62821..ba46e374 100644
--- a/benches/assembly_benchmark.rs
+++ b/benches/assembly_benchmark.rs
@@ -1,15 +1,12 @@
 use bempp::assembly::{batched, batched::BatchedAssembler};
 use bempp::function::SerialFunctionSpace;
-use bempp::traits::function::FunctionSpace;
+use bempp::traits::FunctionSpace;
 use criterion::{criterion_group, criterion_main, Criterion};
 use ndelement::ciarlet::LagrangeElementFamily;
 use ndelement::types::{Continuity, ReferenceCellType};
 use ndgrid::shapes::regular_sphere;
 use rlst::rlst_dynamic_array2;
 
-extern crate blas_src;
-extern crate lapack_src;
-
 pub fn assembly_parts_benchmark(c: &mut Criterion) {
     let mut group = c.benchmark_group("assembly");
     group.sample_size(20);
diff --git a/examples/assembly.rs b/examples/assembly.rs
index 08259e41..5a80449b 100644
--- a/examples/assembly.rs
+++ b/examples/assembly.rs
@@ -1,14 +1,11 @@
 use bempp::assembly::{batched, batched::BatchedAssembler};
 use bempp::function::SerialFunctionSpace;
-use bempp::traits::function::FunctionSpace;
+use bempp::traits::FunctionSpace;
 use ndelement::ciarlet::LagrangeElementFamily;
 use ndelement::types::{Continuity, ReferenceCellType};
 use ndgrid::shapes::regular_sphere;
 use rlst::{rlst_dynamic_array2, RandomAccessByRef};
 
-extern crate blas_src;
-extern crate lapack_src;
-
 fn main() {
     // Create a grid, family of elements, and function space
     let grid = regular_sphere(0);
diff --git a/examples/test_parallel_assembly.rs b/examples/test_parallel_assembly.rs
new file mode 100644
index 00000000..1d650544
--- /dev/null
+++ b/examples/test_parallel_assembly.rs
@@ -0,0 +1,253 @@
+//? mpirun -n {{NPROCESSES}} --features "mpi"
+
+#[cfg(feature = "mpi")]
+use approx::assert_relative_eq;
+#[cfg(feature = "mpi")]
+use bempp::{
+    assembly::batched,
+    assembly::batched::BatchedAssembler,
+    function::{ParallelFunctionSpace, SerialFunctionSpace},
+    traits::FunctionSpace,
+};
+#[cfg(feature = "mpi")]
+use itertools::izip;
+#[cfg(feature = "mpi")]
+use mpi::{
+    environment::Universe,
+    request::WaitGuard,
+    traits::{Communicator, Destination, Source},
+};
+#[cfg(feature = "mpi")]
+use ndelement::{
+    ciarlet::{CiarletElement, LagrangeElementFamily},
+    types::{Continuity, ReferenceCellType},
+};
+#[cfg(feature = "mpi")]
+use ndgrid::{
+    grid::parallel::ParallelGrid,
+    traits::{Builder, Entity, Grid, ParallelBuilder},
+    SingleElementGrid, SingleElementGridBuilder,
+};
+#[cfg(feature = "mpi")]
+use rlst::{CsrMatrix, Shape};
+#[cfg(feature = "mpi")]
+use std::collections::{hash_map::Entry, HashMap};
+
+#[cfg(feature = "mpi")]
+fn create_single_element_grid_data(b: &mut SingleElementGridBuilder<f64>, n: usize) {
+    for y in 0..n {
+        for x in 0..n {
+            b.add_point(
+                y * n + x,
+                &[x as f64 / (n - 1) as f64, y as f64 / (n - 1) as f64, 0.0],
+            );
+        }
+    }
+
+    for i in 0..n - 1 {
+        for j in 0..n - 1 {
+            b.add_cell(
+                i * (n - 1) + j,
+                &[j * n + i, j * n + i + 1, j * n + i + n, j * n + i + n + 1],
+            );
+        }
+    }
+}
+
+#[cfg(feature = "mpi")]
+fn example_single_element_grid<C: Communicator>(
+    comm: &C,
+    n: usize,
+) -> ParallelGrid<'_, C, SingleElementGrid<f64, CiarletElement<f64>>> {
+    let rank = comm.rank();
+
+    let mut b = SingleElementGridBuilder::<f64>::new(3, (ReferenceCellType::Quadrilateral, 1));
+
+    if rank == 0 {
+        create_single_element_grid_data(&mut b, n);
+        b.create_parallel_grid(comm)
+    } else {
+        b.receive_parallel_grid(comm, 0)
+    }
+}
+
+#[cfg(feature = "mpi")]
+fn example_single_element_grid_serial(n: usize) -> SingleElementGrid<f64, CiarletElement<f64>> {
+    let mut b = SingleElementGridBuilder::<f64>::new(3, (ReferenceCellType::Quadrilateral, 1));
+    create_single_element_grid_data(&mut b, n);
+    b.create_grid()
+}
+
+#[cfg(feature = "mpi")]
+fn test_parallel_assembly_single_element_grid<C: Communicator>(
+    comm: &C,
+    degree: usize,
+    cont: Continuity,
+) {
+    let rank = comm.rank();
+    let size = comm.size();
+
+    let n = 10;
+    let grid = example_single_element_grid(comm, n);
+    let element = LagrangeElementFamily::<f64>::new(degree, cont);
+    let space = ParallelFunctionSpace::new(&grid, &element);
+
+    let a = batched::LaplaceSingleLayerAssembler::<f64>::default();
+
+    let matrix = a.parallel_assemble_singular_into_csr(&space, &space);
+
+    if rank == 0 {
+        // Compute the same matrix on a single process
+        let serial_grid = example_single_element_grid_serial(n);
+        let serial_space = SerialFunctionSpace::new(&serial_grid, &element);
+        let serial_matrix = a.assemble_singular_into_csr(&serial_space, &serial_space);
+
+        // Dofs associated with each cell (by cell id)
+        let mut serial_dofmap = HashMap::new();
+        for cell in serial_grid.entity_iter(2) {
+            serial_dofmap.insert(
+                cell.id().unwrap(),
+                serial_space
+                    .cell_dofs(cell.local_index())
+                    .unwrap()
+                    .iter()
+                    .map(|i| serial_space.global_dof_index(*i))
+                    .collect::<Vec<_>>(),
+            );
+        }
+        let mut parallel_dofmap = HashMap::new();
+        for cell in grid.entity_iter(2) {
+            parallel_dofmap.insert(
+                cell.id().unwrap(),
+                space
+                    .cell_dofs(cell.local_index())
+                    .unwrap()
+                    .iter()
+                    .map(|i| space.global_dof_index(*i))
+                    .collect::<Vec<_>>(),
+            );
+        }
+        for p in 1..size {
+            let process = comm.process_at_rank(p);
+            let (cell_ids, _status) = process.receive_vec::<usize>();
+            let (dofs, _status) = process.receive_vec::<usize>();
+            let (dofs_len, _status) = process.receive_vec::<usize>();
+            let mut start = 0;
+            for (id, len) in izip!(cell_ids, dofs_len) {
+                if let Entry::Vacant(e) = parallel_dofmap.entry(id) {
+                    e.insert(dofs[start..start + len].to_vec());
+                } else {
+                    assert_eq!(parallel_dofmap[&id], dofs[start..start + len]);
+                }
+                start += len;
+            }
+        }
+
+        let mut index_map = vec![0; serial_space.global_size()];
+
+        for (id, dofs) in parallel_dofmap {
+            for (i, j) in izip!(&serial_dofmap[&id], dofs) {
+                index_map[j] = *i;
+            }
+        }
+
+        // Gather sparse matrices onto process 0
+        let mut rows = vec![];
+        let mut cols = vec![];
+        let mut data = vec![];
+
+        let mut r = 0;
+        for (i, index) in matrix.indices().iter().enumerate() {
+            while i >= matrix.indptr()[r + 1] {
+                r += 1;
+            }
+            rows.push(index_map[r]);
+            cols.push(index_map[*index]);
+            data.push(matrix.data()[i]);
+        }
+
+        for p in 1..size {
+            let process = comm.process_at_rank(p);
+            let (indices, _status) = process.receive_vec::<usize>();
+            let (indptr, _status) = process.receive_vec::<usize>();
+            let (subdata, _status) = process.receive_vec::<f64>();
+            let mat = CsrMatrix::new(matrix.shape(), indices, indptr, subdata);
+
+            let mut r = 0;
+            for (i, index) in mat.indices().iter().enumerate() {
+                while i >= mat.indptr()[r + 1] {
+                    r += 1;
+                }
+                rows.push(index_map[r]);
+                cols.push(index_map[*index]);
+                data.push(mat.data()[i]);
+            }
+        }
+        let full_matrix = CsrMatrix::from_aij(
+            [space.global_size(), space.global_size()],
+            &rows,
+            &cols,
+            &data,
+        )
+        .unwrap();
+
+        // Compare to matrix assembled on just this process
+        for (i, j) in full_matrix.indices().iter().zip(serial_matrix.indices()) {
+            assert_eq!(i, j);
+        }
+        for (i, j) in full_matrix.indptr().iter().zip(serial_matrix.indptr()) {
+            assert_eq!(i, j);
+        }
+        for (i, j) in full_matrix.data().iter().zip(serial_matrix.data()) {
+            assert_relative_eq!(i, j, epsilon = 1e-10);
+        }
+    } else {
+        let mut cell_ids = vec![];
+        let mut dofs = vec![];
+        let mut dofs_len = vec![];
+        for cell in grid.entity_iter(2) {
+            cell_ids.push(cell.id().unwrap());
+            let cell_dofs = space
+                .cell_dofs(cell.local_index())
+                .unwrap()
+                .iter()
+                .map(|i| space.global_dof_index(*i))
+                .collect::<Vec<_>>();
+            dofs.extend_from_slice(&cell_dofs);
+            dofs_len.push(cell_dofs.len());
+        }
+
+        mpi::request::scope(|scope| {
+            let root = comm.process_at_rank(0);
+            // TODO: send this:
+            let _ = WaitGuard::from(root.immediate_send(scope, &cell_ids));
+            let _ = WaitGuard::from(root.immediate_send(scope, &dofs));
+            let _ = WaitGuard::from(root.immediate_send(scope, &dofs_len));
+            let _ = WaitGuard::from(root.immediate_send(scope, matrix.indices()));
+            let _ = WaitGuard::from(root.immediate_send(scope, matrix.indptr()));
+            let _ = WaitGuard::from(root.process_at_rank(0).immediate_send(scope, matrix.data()));
+        });
+    }
+}
+
+#[cfg(feature = "mpi")]
+fn main() {
+    let universe: Universe = mpi::initialize().unwrap();
+    let world = universe.world();
+    let rank = world.rank();
+
+    for degree in 0..4 {
+        if rank == 0 {
+            println!("Testing assembly with DP{degree} using SingleElementGrid in parallel.");
+        }
+        test_parallel_assembly_single_element_grid(&world, degree, Continuity::Discontinuous);
+    }
+    for degree in 1..4 {
+        if rank == 0 {
+            println!("Testing assembly with P{degree} using SingleElementGrid in parallel.");
+        }
+        test_parallel_assembly_single_element_grid(&world, degree, Continuity::Standard);
+    }
+}
+#[cfg(not(feature = "mpi"))]
+fn main() {}
diff --git a/src/assembly.rs b/src/assembly.rs
index 2cc77636..b578a399 100644
--- a/src/assembly.rs
+++ b/src/assembly.rs
@@ -8,7 +8,7 @@ mod test {
     use super::batched::BatchedAssembler;
     use super::*;
     use crate::function::SerialFunctionSpace;
-    use crate::traits::function::FunctionSpace;
+    use crate::traits::FunctionSpace;
     use cauchy::{c32, c64};
     use ndelement::ciarlet::CiarletElement;
     use ndelement::ciarlet::LagrangeElementFamily;
diff --git a/src/assembly/batched.rs b/src/assembly/batched.rs
index 26e05c84..fecf40c3 100644
--- a/src/assembly/batched.rs
+++ b/src/assembly/batched.rs
@@ -4,29 +4,22 @@ use green_kernels::types::GreenKernelEvalType;
 use rlst::{Array, BaseArray, VectorContainer};
 
 mod boundary;
-pub use boundary::{BatchedAssembler, BatchedAssemblerOptions};
-mod potential;
-pub use potential::{BatchedPotentialAssembler, BatchedPotentialAssemblerOptions};
-
-mod adjoint_double_layer;
-mod double_layer;
-mod hypersingular;
-mod single_layer;
-pub use adjoint_double_layer::{
+pub use boundary::adjoint_double_layer::{
     HelmholtzAdjointDoubleLayerAssembler, LaplaceAdjointDoubleLayerAssembler,
 };
-pub use double_layer::{HelmholtzDoubleLayerAssembler, LaplaceDoubleLayerAssembler};
-pub use hypersingular::{HelmholtzHypersingularAssembler, LaplaceHypersingularAssembler};
-pub use single_layer::{HelmholtzSingleLayerAssembler, LaplaceSingleLayerAssembler};
+pub use boundary::double_layer::{HelmholtzDoubleLayerAssembler, LaplaceDoubleLayerAssembler};
+pub use boundary::hypersingular::{HelmholtzHypersingularAssembler, LaplaceHypersingularAssembler};
+pub use boundary::single_layer::{HelmholtzSingleLayerAssembler, LaplaceSingleLayerAssembler};
+pub use boundary::{BatchedAssembler, BatchedAssemblerOptions};
 
-mod double_layer_potential;
-mod single_layer_potential;
-pub use double_layer_potential::{
+mod potential;
+pub use potential::double_layer::{
     HelmholtzDoubleLayerPotentialAssembler, LaplaceDoubleLayerPotentialAssembler,
 };
-pub use single_layer_potential::{
+pub use potential::single_layer::{
     HelmholtzSingleLayerPotentialAssembler, LaplaceSingleLayerPotentialAssembler,
 };
+pub use potential::{BatchedPotentialAssembler, BatchedPotentialAssemblerOptions};
 
 type RlstArray<T, const DIM: usize> = Array<T, BaseArray<T, VectorContainer<T>, DIM>, DIM>;
 
@@ -34,7 +27,7 @@ type RlstArray<T, const DIM: usize> = Array<T, BaseArray<T, VectorContainer<T>,
 mod test {
     use super::*;
     use crate::function::SerialFunctionSpace;
-    use crate::traits::function::FunctionSpace;
+    use crate::traits::FunctionSpace;
     use approx::*;
     use ndelement::ciarlet::LagrangeElementFamily;
     use ndelement::types::Continuity;
diff --git a/src/assembly/batched/boundary.rs b/src/assembly/batched/boundary.rs
index 7ff6637f..48e77b97 100644
--- a/src/assembly/batched/boundary.rs
+++ b/src/assembly/batched/boundary.rs
@@ -1,13 +1,20 @@
 //! Batched dense assembly of boundary operators
+pub(crate) mod adjoint_double_layer;
+pub(crate) mod double_layer;
+pub(crate) mod hypersingular;
+pub(crate) mod single_layer;
+
 use crate::assembly::common::{equal_grids, RawData2D, SparseMatrixData};
 use crate::quadrature::duffy::{
     quadrilateral_duffy, quadrilateral_triangle_duffy, triangle_duffy, triangle_quadrilateral_duffy,
 };
 use crate::quadrature::simplex_rules::simplex_rule;
 use crate::quadrature::types::{CellToCellConnectivity, TestTrialNumericalQuadratureDefinition};
-use crate::traits::function::FunctionSpace;
-//#[cfg(feature = "mpi")]
-//use crate::traits::function::FunctionSpaceInParallel;
+use crate::traits::FunctionSpace;
+#[cfg(feature = "mpi")]
+use crate::traits::ParallelFunctionSpace;
+#[cfg(feature = "mpi")]
+use mpi::traits::Communicator;
 use ndelement::reference_cell;
 use ndelement::traits::FiniteElement;
 use ndelement::types::ReferenceCellType;
@@ -1031,23 +1038,24 @@ pub trait BatchedAssembler: Sync + Sized {
         .unwrap()
     }
 
-    /*
-        #[cfg(feature = "mpi")]
-        /// Assemble the singular contributions into a CSR sparse matrix, indexed by global DOF numbers
-        fn parallel_assemble_singular_into_csr<
-            TestGrid: Grid<T = <Self::T as RlstScalar>::Real, EntityDescriptor = ReferenceCellType> + Sync,
-            TrialGrid: Grid<T = <Self::T as RlstScalar>::Real, EntityDescriptor = ReferenceCellType> + Sync,
-            Element: FiniteElement<T = Self::T> + Sync,
-            SerialTestSpace: FunctionSpace<Grid = TestGrid, FiniteElement = Element> + Sync,
-            SerialTrialSpace: FunctionSpace<Grid = TrialGrid, FiniteElement = Element> + Sync,
-        >(
-            &self,
-            trial_space: &(impl FunctionSpaceInParallel<SerialSpace = SerialTrialSpace> + FunctionSpace),
-            test_space: &(impl FunctionSpaceInParallel<SerialSpace = SerialTestSpace> + FunctionSpace),
-        ) -> CsrMatrix<Self::T> {
-            self.assemble_singular_into_csr(trial_space.local_space(), test_space.local_space())
-        }
-    */
+    #[cfg(feature = "mpi")]
+    /// Assemble the singular contributions into a CSR sparse matrix, indexed by global DOF numbers
+    fn parallel_assemble_singular_into_csr<
+        'a,
+        C: Communicator,
+        TestGrid: Grid<T = <Self::T as RlstScalar>::Real, EntityDescriptor = ReferenceCellType> + Sync,
+        TrialGrid: Grid<T = <Self::T as RlstScalar>::Real, EntityDescriptor = ReferenceCellType> + Sync,
+        Element: FiniteElement<T = Self::T> + Sync,
+        SerialTestSpace: FunctionSpace<Grid = TestGrid, FiniteElement = Element> + Sync + 'a,
+        SerialTrialSpace: FunctionSpace<Grid = TrialGrid, FiniteElement = Element> + Sync + 'a,
+    >(
+        &self,
+        trial_space: &'a (impl ParallelFunctionSpace<C, LocalSpace<'a> = SerialTrialSpace> + 'a),
+        test_space: &'a (impl ParallelFunctionSpace<C, LocalSpace<'a> = SerialTestSpace> + 'a),
+    ) -> CsrMatrix<Self::T> {
+        self.assemble_singular_into_csr(trial_space.local_space(), test_space.local_space())
+    }
+
     /// Assemble the singular correction into a dense matrix
     ///
     /// The singular correction is the contribution is the terms for adjacent cells are assembled using an (incorrect) non-singular quadrature rule
@@ -1101,26 +1109,28 @@ pub trait BatchedAssembler: Sync + Sized {
         )
         .unwrap()
     }
-    /*
-        #[cfg(feature = "mpi")]
-        /// Assemble the singular contributions into a CSR sparse matrix, indexed by global DOF numbers
-        fn parallel_assemble_singular_correction_into_csr<
-            TestGrid: Grid<T = <Self::T as RlstScalar>::Real, EntityDescriptor = ReferenceCellType> + Sync,
-            TrialGrid: Grid<T = <Self::T as RlstScalar>::Real, EntityDescriptor = ReferenceCellType> + Sync,
-            Element: FiniteElement<T = Self::T> + Sync,
-            SerialTestSpace: FunctionSpace<Grid = TestGrid, FiniteElement = Element> + Sync,
-            SerialTrialSpace: FunctionSpace<Grid = TrialGrid, FiniteElement = Element> + Sync,
-        >(
-            &self,
-            trial_space: &(impl FunctionSpaceInParallel<SerialSpace = SerialTrialSpace> + FunctionSpace),
-            test_space: &(impl FunctionSpaceInParallel<SerialSpace = SerialTestSpace> + FunctionSpace),
-        ) -> CsrMatrix<Self::T> {
-            self.assemble_singular_correction_into_csr(
-                trial_space.local_space(),
-                test_space.local_space(),
-            )
-        }
-    */
+
+    #[cfg(feature = "mpi")]
+    /// Assemble the singular contributions into a CSR sparse matrix, indexed by global DOF numbers
+    fn parallel_assemble_singular_correction_into_csr<
+        'a,
+        C: Communicator,
+        TestGrid: Grid<T = <Self::T as RlstScalar>::Real, EntityDescriptor = ReferenceCellType> + Sync,
+        TrialGrid: Grid<T = <Self::T as RlstScalar>::Real, EntityDescriptor = ReferenceCellType> + Sync,
+        Element: FiniteElement<T = Self::T> + Sync,
+        SerialTestSpace: FunctionSpace<Grid = TestGrid, FiniteElement = Element> + Sync + 'a,
+        SerialTrialSpace: FunctionSpace<Grid = TrialGrid, FiniteElement = Element> + Sync + 'a,
+    >(
+        &self,
+        trial_space: &'a (impl ParallelFunctionSpace<C, LocalSpace<'a> = SerialTrialSpace> + 'a),
+        test_space: &'a (impl ParallelFunctionSpace<C, LocalSpace<'a> = SerialTestSpace> + 'a),
+    ) -> CsrMatrix<Self::T> {
+        self.assemble_singular_correction_into_csr(
+            trial_space.local_space(),
+            test_space.local_space(),
+        )
+    }
+
     /// Assemble into a dense matrix
     fn assemble_into_dense<
         TestGrid: Grid<T = <Self::T as RlstScalar>::Real, EntityDescriptor = ReferenceCellType> + Sync,
diff --git a/src/assembly/batched/adjoint_double_layer.rs b/src/assembly/batched/boundary/adjoint_double_layer.rs
similarity index 98%
rename from src/assembly/batched/adjoint_double_layer.rs
rename to src/assembly/batched/boundary/adjoint_double_layer.rs
index 67abc829..504a7146 100644
--- a/src/assembly/batched/adjoint_double_layer.rs
+++ b/src/assembly/batched/boundary/adjoint_double_layer.rs
@@ -1,5 +1,8 @@
 //! Adjoint double layer assemblers
-use super::{BatchedAssembler, BatchedAssemblerOptions, GreenKernelEvalType, RlstArray};
+use super::{
+    super::{GreenKernelEvalType, RlstArray},
+    BatchedAssembler, BatchedAssemblerOptions,
+};
 use green_kernels::{helmholtz_3d::Helmholtz3dKernel, laplace_3d::Laplace3dKernel, traits::Kernel};
 use rlst::{MatrixInverse, RlstScalar, UnsafeRandomAccessByRef};
 
diff --git a/src/assembly/batched/double_layer.rs b/src/assembly/batched/boundary/double_layer.rs
similarity index 98%
rename from src/assembly/batched/double_layer.rs
rename to src/assembly/batched/boundary/double_layer.rs
index 6238975d..de5c5d0a 100644
--- a/src/assembly/batched/double_layer.rs
+++ b/src/assembly/batched/boundary/double_layer.rs
@@ -1,5 +1,8 @@
 //! Double layer assemblers
-use super::{BatchedAssembler, BatchedAssemblerOptions, GreenKernelEvalType, RlstArray};
+use super::{
+    super::{GreenKernelEvalType, RlstArray},
+    BatchedAssembler, BatchedAssemblerOptions,
+};
 use green_kernels::{helmholtz_3d::Helmholtz3dKernel, laplace_3d::Laplace3dKernel, traits::Kernel};
 use rlst::{MatrixInverse, RlstScalar, UnsafeRandomAccessByRef};
 
diff --git a/src/assembly/batched/hypersingular.rs b/src/assembly/batched/boundary/hypersingular.rs
similarity index 99%
rename from src/assembly/batched/hypersingular.rs
rename to src/assembly/batched/boundary/hypersingular.rs
index 5d58167c..f183292d 100644
--- a/src/assembly/batched/hypersingular.rs
+++ b/src/assembly/batched/boundary/hypersingular.rs
@@ -1,9 +1,10 @@
 //! Hypersingular assemblers
 use super::{
-    BatchedAssembler, BatchedAssemblerOptions, GreenKernelEvalType, RlstArray, SparseMatrixData,
+    super::{GreenKernelEvalType, RlstArray, SparseMatrixData},
+    BatchedAssembler, BatchedAssemblerOptions,
 };
 use crate::assembly::common::equal_grids;
-use crate::traits::function::FunctionSpace;
+use crate::traits::FunctionSpace;
 use green_kernels::{helmholtz_3d::Helmholtz3dKernel, laplace_3d::Laplace3dKernel, traits::Kernel};
 use ndelement::traits::FiniteElement;
 use ndelement::types::ReferenceCellType;
diff --git a/src/assembly/batched/single_layer.rs b/src/assembly/batched/boundary/single_layer.rs
similarity index 97%
rename from src/assembly/batched/single_layer.rs
rename to src/assembly/batched/boundary/single_layer.rs
index 8607e684..4976b491 100644
--- a/src/assembly/batched/single_layer.rs
+++ b/src/assembly/batched/boundary/single_layer.rs
@@ -1,5 +1,8 @@
 //! Single layer assemblers
-use super::{BatchedAssembler, BatchedAssemblerOptions, GreenKernelEvalType, RlstArray};
+use super::{
+    super::{GreenKernelEvalType, RlstArray},
+    BatchedAssembler, BatchedAssemblerOptions,
+};
 use green_kernels::{helmholtz_3d::Helmholtz3dKernel, laplace_3d::Laplace3dKernel, traits::Kernel};
 use rlst::{MatrixInverse, RlstScalar, UnsafeRandomAccessByRef};
 
diff --git a/src/assembly/batched/potential.rs b/src/assembly/batched/potential.rs
index 5892f1a4..077f14ab 100644
--- a/src/assembly/batched/potential.rs
+++ b/src/assembly/batched/potential.rs
@@ -1,7 +1,10 @@
 //! Batched dense assembly of potential operators
+pub(crate) mod double_layer;
+pub(crate) mod single_layer;
+
 use crate::assembly::common::RawData2D;
 use crate::quadrature::simplex_rules::simplex_rule;
-use crate::traits::function::FunctionSpace;
+use crate::traits::FunctionSpace;
 use ndelement::traits::FiniteElement;
 use ndelement::types::ReferenceCellType;
 use ndgrid::traits::{GeometryMap, Grid};
diff --git a/src/assembly/batched/double_layer_potential.rs b/src/assembly/batched/potential/double_layer.rs
similarity index 98%
rename from src/assembly/batched/double_layer_potential.rs
rename to src/assembly/batched/potential/double_layer.rs
index 8e8d93bf..77cb6c4b 100644
--- a/src/assembly/batched/double_layer_potential.rs
+++ b/src/assembly/batched/potential/double_layer.rs
@@ -1,6 +1,7 @@
 //! Batched dense assembly of boundary operators
 use super::{
-    BatchedPotentialAssembler, BatchedPotentialAssemblerOptions, GreenKernelEvalType, RlstArray,
+    super::{GreenKernelEvalType, RlstArray},
+    BatchedPotentialAssembler, BatchedPotentialAssemblerOptions,
 };
 use green_kernels::{helmholtz_3d::Helmholtz3dKernel, laplace_3d::Laplace3dKernel, traits::Kernel};
 use rlst::{MatrixInverse, RlstScalar, UnsafeRandomAccessByRef};
diff --git a/src/assembly/batched/single_layer_potential.rs b/src/assembly/batched/potential/single_layer.rs
similarity index 98%
rename from src/assembly/batched/single_layer_potential.rs
rename to src/assembly/batched/potential/single_layer.rs
index a1f7652e..d777aad5 100644
--- a/src/assembly/batched/single_layer_potential.rs
+++ b/src/assembly/batched/potential/single_layer.rs
@@ -1,6 +1,7 @@
 //! Batched dense assembly of boundary operators
 use super::{
-    BatchedPotentialAssembler, BatchedPotentialAssemblerOptions, GreenKernelEvalType, RlstArray,
+    super::{GreenKernelEvalType, RlstArray},
+    BatchedPotentialAssembler, BatchedPotentialAssemblerOptions,
 };
 use green_kernels::{helmholtz_3d::Helmholtz3dKernel, laplace_3d::Laplace3dKernel, traits::Kernel};
 use rlst::{MatrixInverse, RlstScalar, UnsafeRandomAccessByRef};
diff --git a/src/assembly/fmm_tools.rs b/src/assembly/fmm_tools.rs
index 8de08279..edda0574 100644
--- a/src/assembly/fmm_tools.rs
+++ b/src/assembly/fmm_tools.rs
@@ -2,7 +2,7 @@
 use crate::assembly::common::SparseMatrixData;
 use crate::function::SerialFunctionSpace;
 use crate::quadrature::simplex_rules::simplex_rule;
-use crate::traits::function::FunctionSpace;
+use crate::traits::FunctionSpace;
 use ndelement::traits::FiniteElement;
 use ndelement::types::ReferenceCellType;
 use ndgrid::traits::{GeometryMap, Grid};
diff --git a/src/function/function_space.rs b/src/function/function_space.rs
index 6b468f93..9f6f2d66 100644
--- a/src/function/function_space.rs
+++ b/src/function/function_space.rs
@@ -1,11 +1,11 @@
 //! Function space
 
 mod common;
-//#[cfg(feature = "mpi")]
-//mod parallel;
+#[cfg(feature = "mpi")]
+mod parallel;
 mod serial;
 
-pub(crate) use common::assign_dofs;
-//#[cfg(feature = "mpi")]
-//pub use parallel::ParallelFunctionSpace;
+use common::assign_dofs;
+#[cfg(feature = "mpi")]
+pub use parallel::ParallelFunctionSpace;
 pub use serial::SerialFunctionSpace;
diff --git a/src/function/function_space/parallel.rs b/src/function/function_space/parallel.rs
index b37f15ee..3363609e 100644
--- a/src/function/function_space/parallel.rs
+++ b/src/function/function_space/parallel.rs
@@ -1,7 +1,7 @@
 //! Parallel function space
 
 use crate::function::{function_space::assign_dofs, SerialFunctionSpace};
-use crate::traits::function::{FunctionSpace, FunctionSpaceInParallel};
+use crate::traits::{FunctionSpace, ParallelFunctionSpace as ParallelFunctionSpaceTrait};
 use mpi::{
     point_to_point::{Destination, Source},
     request::WaitGuard,
@@ -10,20 +10,30 @@ use mpi::{
 use ndelement::ciarlet::CiarletElement;
 use ndelement::traits::ElementFamily;
 use ndelement::types::ReferenceCellType;
-use ndgrid::{traits::Grid, types::Ownership};
+use ndgrid::{
+    traits::{Grid, ParallelGrid},
+    types::Ownership,
+};
 use rlst::{MatrixInverse, RlstScalar};
 use std::collections::HashMap;
 
 /// The local function space on a process
-pub struct LocalFunctionSpace<'a, T: RlstScalar + MatrixInverse, GridImpl: GridType<T = T::Real>> {
+pub struct LocalFunctionSpace<
+    'a,
+    T: RlstScalar + MatrixInverse,
+    GridImpl: Grid<T = T::Real, EntityDescriptor = ReferenceCellType>,
+> {
     serial_space: SerialFunctionSpace<'a, T, GridImpl>,
     global_size: usize,
     global_dof_numbers: Vec<usize>,
     ownership: Vec<Ownership>,
 }
 
-impl<'a, T: RlstScalar + MatrixInverse, GridImpl: GridType<T = T::Real>> FunctionSpace
-    for LocalFunctionSpace<'a, T, GridImpl>
+impl<
+        'a,
+        T: RlstScalar + MatrixInverse,
+        GridImpl: Grid<T = T::Real, EntityDescriptor = ReferenceCellType>,
+    > FunctionSpace for LocalFunctionSpace<'a, T, GridImpl>
 {
     type Grid = GridImpl;
     type FiniteElement = CiarletElement<T>;
@@ -57,19 +67,23 @@ impl<'a, T: RlstScalar + MatrixInverse, GridImpl: GridType<T = T::Real>> Functio
         self.ownership[local_dof_index]
     }
 }
-
 /// A parallel function space
 pub struct ParallelFunctionSpace<
     'a,
+    C: Communicator,
     T: RlstScalar + MatrixInverse,
-    GridImpl: ParallelGridType + GridType<T = T::Real>,
+    GridImpl: ParallelGrid<C> + Grid<T = T::Real, EntityDescriptor = ReferenceCellType>,
 > {
     grid: &'a GridImpl,
-    local_space: LocalFunctionSpace<'a, T, <GridImpl as ParallelGridType>::LocalGridType>,
+    local_space: LocalFunctionSpace<'a, T, <GridImpl as ParallelGrid<C>>::LocalGrid<'a>>,
 }
 
-impl<'a, T: RlstScalar + MatrixInverse, GridImpl: ParallelGridType + GridType<T = T::Real>>
-    ParallelFunctionSpace<'a, T, GridImpl>
+impl<
+        'a,
+        C: Communicator,
+        T: RlstScalar + MatrixInverse,
+        GridImpl: ParallelGrid<C> + Grid<T = T::Real, EntityDescriptor = ReferenceCellType>,
+    > ParallelFunctionSpace<'a, C, T, GridImpl>
 {
     /// Create new function space
     pub fn new(
@@ -89,7 +103,7 @@ impl<'a, T: RlstScalar + MatrixInverse, GridImpl: ParallelGridType + GridType<T
             assign_dofs(rank as usize, grid, e_family);
 
         let mut elements = HashMap::new();
-        for cell in grid.cell_types() {
+        for cell in grid.entity_types(grid.topology_dim()) {
             elements.insert(*cell, e_family.element(*cell));
         }
 
@@ -210,22 +224,38 @@ impl<'a, T: RlstScalar + MatrixInverse, GridImpl: ParallelGridType + GridType<T
     }
 }
 
-impl<'a, T: RlstScalar + MatrixInverse, GridImpl: ParallelGridType + GridType<T = T::Real>>
-    FunctionSpaceInParallel for ParallelFunctionSpace<'a, T, GridImpl>
+impl<
+        'g,
+        C: Communicator,
+        T: RlstScalar + MatrixInverse,
+        GridImpl: ParallelGrid<C> + Grid<T = T::Real, EntityDescriptor = ReferenceCellType>,
+    > ParallelFunctionSpaceTrait<C> for ParallelFunctionSpace<'g, C, T, GridImpl>
 {
     type ParallelGrid = GridImpl;
-    type SerialSpace = LocalFunctionSpace<'a, T, <GridImpl as ParallelGridType>::LocalGridType>;
+    type LocalSpace<'a> = LocalFunctionSpace<'a, T, <GridImpl as ParallelGrid<C>>::LocalGrid<'g>> where Self: 'a;
+
+    fn comm(&self) -> &C {
+        self.grid.comm()
+    }
+
+    fn grid(&self) -> &GridImpl {
+        self.grid
+    }
 
     /// Get the local space on the process
     fn local_space(
         &self,
-    ) -> &LocalFunctionSpace<'a, T, <GridImpl as ParallelGridType>::LocalGridType> {
+    ) -> &LocalFunctionSpace<'_, T, <GridImpl as ParallelGrid<C>>::LocalGrid<'g>> {
         &self.local_space
     }
 }
 
-impl<'a, T: RlstScalar + MatrixInverse, GridImpl: ParallelGridType + GridType<T = T::Real>>
-    FunctionSpace for ParallelFunctionSpace<'a, T, GridImpl>
+impl<
+        'a,
+        C: Communicator,
+        T: RlstScalar + MatrixInverse,
+        GridImpl: ParallelGrid<C> + Grid<T = T::Real, EntityDescriptor = ReferenceCellType>,
+    > FunctionSpace for ParallelFunctionSpace<'a, C, T, GridImpl>
 {
     type Grid = GridImpl;
     type FiniteElement = CiarletElement<T>;
diff --git a/src/function/function_space/serial.rs b/src/function/function_space/serial.rs
index 591c1c9f..500b7210 100644
--- a/src/function/function_space/serial.rs
+++ b/src/function/function_space/serial.rs
@@ -1,7 +1,7 @@
 //! Serial function space
 
 use crate::function::function_space::assign_dofs;
-use crate::traits::function::FunctionSpace;
+use crate::traits::FunctionSpace;
 use ndelement::ciarlet::CiarletElement;
 use ndelement::traits::{ElementFamily, FiniteElement};
 use ndelement::types::ReferenceCellType;
diff --git a/src/lib.rs b/src/lib.rs
index f1b0d0c2..807c610a 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,5 +1,5 @@
 //! Bempp
-#![cfg_attr(feature = "strict", deny(warnings))]
+#![cfg_attr(feature = "strict", deny(warnings), deny(unused_crate_dependencies))]
 #![warn(missing_docs)]
 
 #[macro_use]
@@ -9,3 +9,8 @@ pub mod assembly;
 pub mod function;
 pub mod quadrature;
 pub mod traits;
+
+#[cfg(test)]
+mod test {
+    use criterion as _; // Hack to show that criterion is used, as cargo test does not see benches
+}
diff --git a/src/traits.rs b/src/traits.rs
index f0ed057b..6930b5c0 100644
--- a/src/traits.rs
+++ b/src/traits.rs
@@ -1,3 +1,7 @@
 //! Trait definitions
 
-pub mod function;
+mod function;
+
+pub use function::FunctionSpace;
+#[cfg(feature = "mpi")]
+pub use function::ParallelFunctionSpace;
diff --git a/src/traits/function.rs b/src/traits/function.rs
index 7524f321..f5643e1a 100644
--- a/src/traits/function.rs
+++ b/src/traits/function.rs
@@ -1,10 +1,10 @@
 //! Functions and functions spaces
-//#[cfg(feature = "mpi")]
-//use crate::traits::grid::ParallelGridType;
-use ndelement::traits::FiniteElement;
-use ndelement::types::ReferenceCellType;
-use ndgrid::traits::Grid;
-use ndgrid::types::Ownership;
+#[cfg(feature = "mpi")]
+use mpi::traits::Communicator;
+use ndelement::{traits::FiniteElement, types::ReferenceCellType};
+#[cfg(feature = "mpi")]
+use ndgrid::traits::ParallelGrid;
+use ndgrid::{traits::Grid, types::Ownership};
 use std::collections::HashMap;
 
 /// A function space
@@ -48,16 +48,22 @@ pub trait FunctionSpace {
     fn ownership(&self, local_dof_index: usize) -> Ownership;
 }
 
-/*
 #[cfg(feature = "mpi")]
 /// A function space in parallel
-pub trait FunctionSpaceInParallel {
-    /// The parallel grid type
-    type ParallelGrid: Grid + ParallelGrid;
+pub trait ParallelFunctionSpace<C: Communicator>: FunctionSpace {
+    /// Parallel grid type
+    type ParallelGrid: ParallelGrid<C> + Grid;
     /// The type of the serial space on each process
-    type SerialSpace: FunctionSpace<Grid = <Self::ParallelGrid as ParallelGrid>::LocalGrid>;
+    type LocalSpace<'a>: FunctionSpace
+    where
+        Self: 'a;
+
+    /// MPI communicator
+    fn comm(&self) -> &C;
+
+    /// Get the grid
+    fn grid(&self) -> &Self::ParallelGrid;
 
     /// Get the local space on the process
-    fn local_space(&self) -> &Self::SerialSpace;
+    fn local_space(&self) -> &Self::LocalSpace<'_>;
 }
-*/
diff --git a/tests/compare_to_bempp_cl.rs b/tests/compare_to_bempp_cl.rs
index 1e411ffd..de6d5cde 100644
--- a/tests/compare_to_bempp_cl.rs
+++ b/tests/compare_to_bempp_cl.rs
@@ -1,16 +1,13 @@
 use approx::*;
 use bempp::assembly::{batched, batched::BatchedAssembler, batched::BatchedPotentialAssembler};
 use bempp::function::SerialFunctionSpace;
-use bempp::traits::function::FunctionSpace;
+use bempp::traits::FunctionSpace;
 use cauchy::c64;
 use ndelement::ciarlet::LagrangeElementFamily;
 use ndelement::types::Continuity;
 use ndgrid::shapes::regular_sphere;
 use rlst::{rlst_dynamic_array2, RandomAccessByRef, RandomAccessMut};
 
-extern crate blas_src;
-extern crate lapack_src;
-
 #[test]
 fn test_laplace_single_layer_dp0_dp0() {
     let grid = regular_sphere(0);
diff --git a/tests/fmm.rs.bak b/tests/fmm.rs.bak
index 987078c0..77a90f64 100644
--- a/tests/fmm.rs.bak
+++ b/tests/fmm.rs.bak
@@ -3,7 +3,7 @@ use bempp::assembly::batched::BatchedAssembler;
 use bempp::assembly::{batched, fmm_tools};
 use bempp::function::SerialFunctionSpace;
 use ndgrid::{shapes::regular_sphere, traits::Grid};
-use bempp::traits::function::FunctionSpace;
+use bempp::traits::FunctionSpace;
 use green_kernels::laplace_3d::Laplace3dKernel;
 use green_kernels::{traits::Kernel, types::EvalType};
 #[cfg(not(debug_assertions))]
@@ -19,9 +19,6 @@ use rlst::{
     RawAccess, RawAccessMut,
 };
 
-extern crate blas_src;
-extern crate lapack_src;
-
 fn fmm_prototype<TestGrid: Grid<T = f64, EntityDescriptor = ReferenceCellType> + Sync, TrialGrid: Grid<T = f64, EntityDescriptor = ReferenceCellType> + Sync>(
     trial_space: &SerialFunctionSpace<f64, TrialGrid>,
     test_space: &SerialFunctionSpace<f64, TestGrid>,
diff --git a/tests/mixed_assembly.rs.bak b/tests/mixed_assembly.rs.bak
index 64d083f7..b2f18791 100644
--- a/tests/mixed_assembly.rs.bak
+++ b/tests/mixed_assembly.rs.bak
@@ -5,16 +5,13 @@ use ndgrid::{
     grid::{SingleElementGrid, SingleElementGridBuilder},
     traits::Builder,
 };
-use bempp::traits::function::FunctionSpace;
+use bempp::traits::FunctionSpace;
 use cauchy::c64;
 use ndelement::ciarlet::LagrangeElementFamily;
 use ndelement::types::{Continuity, ReferenceCellType};
 use paste::paste;
 use rlst::{rlst_dynamic_array2, RandomAccessByRef};
 
-extern crate blas_src;
-extern crate lapack_src;
-
 fn mixed_grid() -> MixedGrid<f64> {
     let mut b = MixedGridBuilder::<3, f64>::new(());
     b.add_point(0, [0.0, 0.0, 0.0]);