Reimplement parallel assembly (#277)

* restructure into folders * kifmm version * remove unused tests * reimplement parallel assembly * use ids in parallel test to account for rearranged cell indices * ndgrid branch * clippy * ndgrid main * update examples, tests, benches * clippy
bempp · Aug 19, 2024 · fead46f · fead46f
1 parent 3e437e5
commit fead46f
Show file tree

Hide file tree

Showing 24 changed files with 431 additions and 140 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,5 +1,5 @@
 [features]
-mpi = ["dep:mpi"]
+mpi = ["dep:mpi", "ndelement/mpi", "ndgrid/mpi"]
 strict = []
 
 [package]
@@ -20,28 +20,22 @@ name = "bempp"
 crate-type = ["lib", "cdylib"]
 
 [dependencies]
-approx = "0.5"
-cauchy = "0.4.*"
 itertools = "0.13.*"
 mpi = { version = "0.8.*", optional = true }
 num = "0.4"
-paste = "1.*"
 lazy_static = "1.4"
-libc = "0.2"
-log = "0.4"
 ndelement = { git = "https://github.com/bempp/ndelement.git" }
 ndgrid = { git = "https://github.com/bempp/ndgrid.git" }
 rayon = "1.9"
-rand = "0.8.5"
 rlst = { version = "0.2" }
 green-kernels = { git = "https://github.com/bempp/green-kernels.git" }
-thiserror="1.*"
 
 [dev-dependencies]
+approx = "0.5"
+paste = "1.*"
+cauchy = "0.4.*"
 criterion = { version = "0.5.*", features = ["html_reports"]}
-kifmm = { version = "0.1" }
-blas-src = { version = "0.10", features = ["blis"]}
-lapack-src = { version = "0.10", features = ["netlib"]}
+# kifmm = { version = "1.0" }
 
 [[bench]]
 name = "assembly_benchmark"
@@ -52,7 +46,3 @@ cargo-args = ["-Zunstable-options", "-Zrustdoc-scrape-examples"]
 
 [lints.clippy]
 wildcard_imports = "forbid"
-
-[target.aarch64-apple-darwin.dev-dependencies]
-blas-src = { version = "0.10", features = ["accelerate"]}
-lapack-src = { version = "0.10", features = ["accelerate"]}
diff --git a/benches/assembly_benchmark.rs b/benches/assembly_benchmark.rs
@@ -1,15 +1,12 @@
 use bempp::assembly::{batched, batched::BatchedAssembler};
 use bempp::function::SerialFunctionSpace;
-use bempp::traits::function::FunctionSpace;
+use bempp::traits::FunctionSpace;
 use criterion::{criterion_group, criterion_main, Criterion};
 use ndelement::ciarlet::LagrangeElementFamily;
 use ndelement::types::{Continuity, ReferenceCellType};
 use ndgrid::shapes::regular_sphere;
 use rlst::rlst_dynamic_array2;
 
-extern crate blas_src;
-extern crate lapack_src;
-
 pub fn assembly_parts_benchmark(c: &mut Criterion) {
     let mut group = c.benchmark_group("assembly");
     group.sample_size(20);

diff --git a/examples/assembly.rs b/examples/assembly.rs
@@ -1,14 +1,11 @@
 use bempp::assembly::{batched, batched::BatchedAssembler};
 use bempp::function::SerialFunctionSpace;
-use bempp::traits::function::FunctionSpace;
+use bempp::traits::FunctionSpace;
 use ndelement::ciarlet::LagrangeElementFamily;
 use ndelement::types::{Continuity, ReferenceCellType};
 use ndgrid::shapes::regular_sphere;
 use rlst::{rlst_dynamic_array2, RandomAccessByRef};
 
-extern crate blas_src;
-extern crate lapack_src;
-
 fn main() {
     // Create a grid, family of elements, and function space
     let grid = regular_sphere(0);

diff --git a/examples/test_parallel_assembly.rs b/examples/test_parallel_assembly.rs
@@ -0,0 +1,253 @@
+//? mpirun -n {{NPROCESSES}} --features "mpi"
+
+#[cfg(feature = "mpi")]
+use approx::assert_relative_eq;
+#[cfg(feature = "mpi")]
+use bempp::{
+    assembly::batched,
+    assembly::batched::BatchedAssembler,
+    function::{ParallelFunctionSpace, SerialFunctionSpace},
+    traits::FunctionSpace,
+};
+#[cfg(feature = "mpi")]
+use itertools::izip;
+#[cfg(feature = "mpi")]
+use mpi::{
+    environment::Universe,
+    request::WaitGuard,
+    traits::{Communicator, Destination, Source},
+};
+#[cfg(feature = "mpi")]
+use ndelement::{
+    ciarlet::{CiarletElement, LagrangeElementFamily},
+    types::{Continuity, ReferenceCellType},
+};
+#[cfg(feature = "mpi")]
+use ndgrid::{
+    grid::parallel::ParallelGrid,
+    traits::{Builder, Entity, Grid, ParallelBuilder},
+    SingleElementGrid, SingleElementGridBuilder,
+};
+#[cfg(feature = "mpi")]
+use rlst::{CsrMatrix, Shape};
+#[cfg(feature = "mpi")]
+use std::collections::{hash_map::Entry, HashMap};
+
+#[cfg(feature = "mpi")]
+fn create_single_element_grid_data(b: &mut SingleElementGridBuilder<f64>, n: usize) {
+    for y in 0..n {
+        for x in 0..n {
+            b.add_point(
+                y * n + x,
+                &[x as f64 / (n - 1) as f64, y as f64 / (n - 1) as f64, 0.0],
+            );
+        }
+    }
+
+    for i in 0..n - 1 {
+        for j in 0..n - 1 {
+            b.add_cell(
+                i * (n - 1) + j,
+                &[j * n + i, j * n + i + 1, j * n + i + n, j * n + i + n + 1],
+            );
+        }
+    }
+}
+
+#[cfg(feature = "mpi")]
+fn example_single_element_grid<C: Communicator>(
+    comm: &C,
+    n: usize,
+) -> ParallelGrid<'_, C, SingleElementGrid<f64, CiarletElement<f64>>> {
+    let rank = comm.rank();
+
+    let mut b = SingleElementGridBuilder::<f64>::new(3, (ReferenceCellType::Quadrilateral, 1));
+
+    if rank == 0 {
+        create_single_element_grid_data(&mut b, n);
+        b.create_parallel_grid(comm)
+    } else {
+        b.receive_parallel_grid(comm, 0)
+    }
+}
+
+#[cfg(feature = "mpi")]
+fn example_single_element_grid_serial(n: usize) -> SingleElementGrid<f64, CiarletElement<f64>> {
+    let mut b = SingleElementGridBuilder::<f64>::new(3, (ReferenceCellType::Quadrilateral, 1));
+    create_single_element_grid_data(&mut b, n);
+    b.create_grid()
+}
+
+#[cfg(feature = "mpi")]
+fn test_parallel_assembly_single_element_grid<C: Communicator>(
+    comm: &C,
+    degree: usize,
+    cont: Continuity,
+) {
+    let rank = comm.rank();
+    let size = comm.size();
+
+    let n = 10;
+    let grid = example_single_element_grid(comm, n);
+    let element = LagrangeElementFamily::<f64>::new(degree, cont);
+    let space = ParallelFunctionSpace::new(&grid, &element);
+
+    let a = batched::LaplaceSingleLayerAssembler::<f64>::default();
+
+    let matrix = a.parallel_assemble_singular_into_csr(&space, &space);
+
+    if rank == 0 {
+        // Compute the same matrix on a single process
+        let serial_grid = example_single_element_grid_serial(n);
+        let serial_space = SerialFunctionSpace::new(&serial_grid, &element);
+        let serial_matrix = a.assemble_singular_into_csr(&serial_space, &serial_space);
+
+        // Dofs associated with each cell (by cell id)
+        let mut serial_dofmap = HashMap::new();
+        for cell in serial_grid.entity_iter(2) {
+            serial_dofmap.insert(
+                cell.id().unwrap(),
+                serial_space
+                    .cell_dofs(cell.local_index())
+                    .unwrap()
+                    .iter()
+                    .map(|i| serial_space.global_dof_index(*i))
+                    .collect::<Vec<_>>(),
+            );
+        }
+        let mut parallel_dofmap = HashMap::new();
+        for cell in grid.entity_iter(2) {
+            parallel_dofmap.insert(
+                cell.id().unwrap(),
+                space
+                    .cell_dofs(cell.local_index())
+                    .unwrap()
+                    .iter()
+                    .map(|i| space.global_dof_index(*i))
+                    .collect::<Vec<_>>(),
+            );
+        }
+        for p in 1..size {
+            let process = comm.process_at_rank(p);
+            let (cell_ids, _status) = process.receive_vec::<usize>();
+            let (dofs, _status) = process.receive_vec::<usize>();
+            let (dofs_len, _status) = process.receive_vec::<usize>();
+            let mut start = 0;
+            for (id, len) in izip!(cell_ids, dofs_len) {
+                if let Entry::Vacant(e) = parallel_dofmap.entry(id) {
+                    e.insert(dofs[start..start + len].to_vec());
+                } else {
+                    assert_eq!(parallel_dofmap[&id], dofs[start..start + len]);
+                }
+                start += len;
+            }
+        }
+
+        let mut index_map = vec![0; serial_space.global_size()];
+
+        for (id, dofs) in parallel_dofmap {
+            for (i, j) in izip!(&serial_dofmap[&id], dofs) {
+                index_map[j] = *i;
+            }
+        }
+
+        // Gather sparse matrices onto process 0
+        let mut rows = vec![];
+        let mut cols = vec![];
+        let mut data = vec![];
+
+        let mut r = 0;
+        for (i, index) in matrix.indices().iter().enumerate() {
+            while i >= matrix.indptr()[r + 1] {
+                r += 1;
+            }
+            rows.push(index_map[r]);
+            cols.push(index_map[*index]);
+            data.push(matrix.data()[i]);
+        }
+
+        for p in 1..size {
+            let process = comm.process_at_rank(p);
+            let (indices, _status) = process.receive_vec::<usize>();
+            let (indptr, _status) = process.receive_vec::<usize>();
+            let (subdata, _status) = process.receive_vec::<f64>();
+            let mat = CsrMatrix::new(matrix.shape(), indices, indptr, subdata);
+
+            let mut r = 0;
+            for (i, index) in mat.indices().iter().enumerate() {
+                while i >= mat.indptr()[r + 1] {
+                    r += 1;
+                }
+                rows.push(index_map[r]);
+                cols.push(index_map[*index]);
+                data.push(mat.data()[i]);
+            }
+        }
+        let full_matrix = CsrMatrix::from_aij(
+            [space.global_size(), space.global_size()],
+            &rows,
+            &cols,
+            &data,
+        )
+        .unwrap();
+
+        // Compare to matrix assembled on just this process
+        for (i, j) in full_matrix.indices().iter().zip(serial_matrix.indices()) {
+            assert_eq!(i, j);
+        }
+        for (i, j) in full_matrix.indptr().iter().zip(serial_matrix.indptr()) {
+            assert_eq!(i, j);
+        }
+        for (i, j) in full_matrix.data().iter().zip(serial_matrix.data()) {
+            assert_relative_eq!(i, j, epsilon = 1e-10);
+        }
+    } else {
+        let mut cell_ids = vec![];
+        let mut dofs = vec![];
+        let mut dofs_len = vec![];
+        for cell in grid.entity_iter(2) {
+            cell_ids.push(cell.id().unwrap());
+            let cell_dofs = space
+                .cell_dofs(cell.local_index())
+                .unwrap()
+                .iter()
+                .map(|i| space.global_dof_index(*i))
+                .collect::<Vec<_>>();
+            dofs.extend_from_slice(&cell_dofs);
+            dofs_len.push(cell_dofs.len());
+        }
+
+        mpi::request::scope(|scope| {
+            let root = comm.process_at_rank(0);
+            // TODO: send this:
+            let _ = WaitGuard::from(root.immediate_send(scope, &cell_ids));
+            let _ = WaitGuard::from(root.immediate_send(scope, &dofs));
+            let _ = WaitGuard::from(root.immediate_send(scope, &dofs_len));
+            let _ = WaitGuard::from(root.immediate_send(scope, matrix.indices()));
+            let _ = WaitGuard::from(root.immediate_send(scope, matrix.indptr()));
+            let _ = WaitGuard::from(root.process_at_rank(0).immediate_send(scope, matrix.data()));
+        });
+    }
+}
+
+#[cfg(feature = "mpi")]
+fn main() {
+    let universe: Universe = mpi::initialize().unwrap();
+    let world = universe.world();
+    let rank = world.rank();
+
+    for degree in 0..4 {
+        if rank == 0 {
+            println!("Testing assembly with DP{degree} using SingleElementGrid in parallel.");
+        }
+        test_parallel_assembly_single_element_grid(&world, degree, Continuity::Discontinuous);
+    }
+    for degree in 1..4 {
+        if rank == 0 {
+            println!("Testing assembly with P{degree} using SingleElementGrid in parallel.");
+        }
+        test_parallel_assembly_single_element_grid(&world, degree, Continuity::Standard);
+    }
+}
+#[cfg(not(feature = "mpi"))]
+fn main() {}
diff --git a/src/assembly.rs b/src/assembly.rs
@@ -8,7 +8,7 @@ mod test {
     use super::batched::BatchedAssembler;
     use super::*;
     use crate::function::SerialFunctionSpace;
-    use crate::traits::function::FunctionSpace;
+    use crate::traits::FunctionSpace;
     use cauchy::{c32, c64};
     use ndelement::ciarlet::CiarletElement;
     use ndelement::ciarlet::LagrangeElementFamily;

diff --git a/src/assembly/batched.rs b/src/assembly/batched.rs
@@ -4,37 +4,30 @@ use green_kernels::types::GreenKernelEvalType;
 use rlst::{Array, BaseArray, VectorContainer};
 
 mod boundary;
-pub use boundary::{BatchedAssembler, BatchedAssemblerOptions};
-mod potential;
-pub use potential::{BatchedPotentialAssembler, BatchedPotentialAssemblerOptions};
-
-mod adjoint_double_layer;
-mod double_layer;
-mod hypersingular;
-mod single_layer;
-pub use adjoint_double_layer::{
+pub use boundary::adjoint_double_layer::{
     HelmholtzAdjointDoubleLayerAssembler, LaplaceAdjointDoubleLayerAssembler,
 };
-pub use double_layer::{HelmholtzDoubleLayerAssembler, LaplaceDoubleLayerAssembler};
-pub use hypersingular::{HelmholtzHypersingularAssembler, LaplaceHypersingularAssembler};
-pub use single_layer::{HelmholtzSingleLayerAssembler, LaplaceSingleLayerAssembler};
+pub use boundary::double_layer::{HelmholtzDoubleLayerAssembler, LaplaceDoubleLayerAssembler};
+pub use boundary::hypersingular::{HelmholtzHypersingularAssembler, LaplaceHypersingularAssembler};
+pub use boundary::single_layer::{HelmholtzSingleLayerAssembler, LaplaceSingleLayerAssembler};
+pub use boundary::{BatchedAssembler, BatchedAssemblerOptions};
 
-mod double_layer_potential;
-mod single_layer_potential;
-pub use double_layer_potential::{
+mod potential;
+pub use potential::double_layer::{
     HelmholtzDoubleLayerPotentialAssembler, LaplaceDoubleLayerPotentialAssembler,
 };
-pub use single_layer_potential::{
+pub use potential::single_layer::{
     HelmholtzSingleLayerPotentialAssembler, LaplaceSingleLayerPotentialAssembler,
 };
+pub use potential::{BatchedPotentialAssembler, BatchedPotentialAssemblerOptions};
 
 type RlstArray<T, const DIM: usize> = Array<T, BaseArray<T, VectorContainer<T>, DIM>, DIM>;
 
 #[cfg(test)]
 mod test {
     use super::*;
     use crate::function::SerialFunctionSpace;
-    use crate::traits::function::FunctionSpace;
+    use crate::traits::FunctionSpace;
     use approx::*;
     use ndelement::ciarlet::LagrangeElementFamily;
     use ndelement::types::Continuity;