diff --git a/Cargo.toml b/Cargo.toml
index 782f56f..f1388c8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,3 +1,53 @@
-[workspace]
-members = ["boojum-cuda", "criterion-cuda", "cudart", "cudart-sys"]
-resolver = "2"
+[package]
+name = "boojum-cuda"
+version = "0.1.0"
+edition = "2021"
+build = "build/main.rs"
+license = "MIT OR Apache-2.0"
+
+[build-dependencies]
+boojum = { git = "https://github.com/matter-labs/era-boojum.git", branch = "main" }
+cudart-sys = { git = "https://github.com/matter-labs/era-cuda.git", branch = "main", package = "cudart-sys" }
+cmake = "^0"
+itertools = "^0"
+
+[dependencies]
+boojum = { git = "https://github.com/matter-labs/era-boojum.git", branch = "main" }
+cudart = { git = "https://github.com/matter-labs/era-cuda.git", branch = "main", package = "cudart" }
+cudart-sys = { git = "https://github.com/matter-labs/era-cuda.git", branch = "main", package = "cudart-sys" }
+itertools = "^0"
+lazy_static = "^1"
+
+[dev-dependencies]
+blake2 = "^0"
+criterion = "^0"
+criterion-cuda = { git = "https://github.com/matter-labs/era-cuda.git", branch = "main", package = "criterion-cuda" }
+criterion-macro = "^0"
+itertools = "^0"
+rand = "^0"
+rayon = "^1"
+serial_test = "^2"
+
+[[bench]]
+name = "blake2s"
+harness = false
+
+[[bench]]
+name = "gates"
+harness = false
+
+[[bench]]
+name = "goldilocks"
+harness = false
+
+[[bench]]
+name = "ntt"
+harness = false
+
+[[bench]]
+name = "ops_complex"
+harness = false
+
+[[bench]]
+name = "poseidon"
+harness = false
diff --git a/boojum-cuda/benches/blake2s.rs b/benches/blake2s.rs
similarity index 100%
rename from boojum-cuda/benches/blake2s.rs
rename to benches/blake2s.rs
diff --git a/boojum-cuda/benches/gates.rs b/benches/gates.rs
similarity index 100%
rename from boojum-cuda/benches/gates.rs
rename to benches/gates.rs
diff --git a/boojum-cuda/benches/goldilocks.rs b/benches/goldilocks.rs
similarity index 100%
rename from boojum-cuda/benches/goldilocks.rs
rename to benches/goldilocks.rs
diff --git a/boojum-cuda/benches/ntt.rs b/benches/ntt.rs
similarity index 100%
rename from boojum-cuda/benches/ntt.rs
rename to benches/ntt.rs
diff --git a/boojum-cuda/benches/ops_complex.rs b/benches/ops_complex.rs
similarity index 100%
rename from boojum-cuda/benches/ops_complex.rs
rename to benches/ops_complex.rs
diff --git a/boojum-cuda/benches/poseidon.rs b/benches/poseidon.rs
similarity index 100%
rename from boojum-cuda/benches/poseidon.rs
rename to benches/poseidon.rs
diff --git a/boojum-cuda/Cargo.toml b/boojum-cuda/Cargo.toml
deleted file mode 100644
index f0d55f1..0000000
--- a/boojum-cuda/Cargo.toml
+++ /dev/null
@@ -1,53 +0,0 @@
-[package]
-name = "boojum-cuda"
-version = "0.1.0"
-edition = "2021"
-build = "build/main.rs"
-license = "MIT OR Apache-2.0"
-
-[build-dependencies]
-boojum = { git = "https://github.com/matter-labs/era-boojum.git", branch = "main" }
-cudart-sys = { path = "../cudart-sys" }
-cmake = "^0"
-itertools = "^0"
-
-[dependencies]
-boojum = { git = "https://github.com/matter-labs/era-boojum.git", branch = "main" }
-cudart = { path = "../cudart" }
-cudart-sys = { path = "../cudart-sys" }
-itertools = "^0"
-lazy_static = "^1"
-
-[dev-dependencies]
-blake2 = "^0"
-criterion = "^0"
-criterion-cuda = { path = "../criterion-cuda" }
-criterion-macro = "^0"
-itertools = "^0"
-rand = "^0"
-rayon = "^1"
-serial_test = "^2"
-
-[[bench]]
-name = "blake2s"
-harness = false
-
-[[bench]]
-name = "gates"
-harness = false
-
-[[bench]]
-name = "goldilocks"
-harness = false
-
-[[bench]]
-name = "ntt"
-harness = false
-
-[[bench]]
-name = "ops_complex"
-harness = false
-
-[[bench]]
-name = "poseidon"
-harness = false
diff --git a/boojum-cuda/build/gates.rs b/build/gates.rs
similarity index 100%
rename from boojum-cuda/build/gates.rs
rename to build/gates.rs
diff --git a/boojum-cuda/build/main.rs b/build/main.rs
similarity index 100%
rename from boojum-cuda/build/main.rs
rename to build/main.rs
diff --git a/boojum-cuda/build/poseidon_constants.rs b/build/poseidon_constants.rs
similarity index 100%
rename from boojum-cuda/build/poseidon_constants.rs
rename to build/poseidon_constants.rs
diff --git a/boojum-cuda/build/template.rs b/build/template.rs
similarity index 100%
rename from boojum-cuda/build/template.rs
rename to build/template.rs
diff --git a/criterion-cuda/Cargo.toml b/criterion-cuda/Cargo.toml
deleted file mode 100644
index 80ada31..0000000
--- a/criterion-cuda/Cargo.toml
+++ /dev/null
@@ -1,9 +0,0 @@
-[package]
-name = "criterion-cuda"
-version = "0.1.0"
-edition = "2021"
-license = "MIT OR Apache-2.0"
-
-[dependencies]
-criterion = "^0"
-cudart = { path = "../cudart" }
diff --git a/criterion-cuda/src/lib.rs b/criterion-cuda/src/lib.rs
deleted file mode 100644
index c0b4d1c..0000000
--- a/criterion-cuda/src/lib.rs
+++ /dev/null
@@ -1,193 +0,0 @@
-use criterion::measurement::{Measurement, ValueFormatter};
-use criterion::Throughput;
-
-use cudart::event::{elapsed_time, CudaEvent};
-use cudart::stream::CudaStream;
-
-// Allows criterion benchmarks to use CUDA event-based timings.
-// Based on src/lib.rs from https://github.com/theHamsta/criterion-cuda.
-pub struct CudaMeasurement<const INV_ELEMS: bool = false>;
-
-// c.f. https://docs.rs/criterion/latest/criterion/measurement/trait.Measurement.html
-impl<const INV_ELEMS: bool> Measurement for CudaMeasurement<INV_ELEMS> {
-    type Intermediate = (CudaEvent, CudaEvent);
-    type Value = f32;
-
-    fn start(&self) -> Self::Intermediate {
-        let stream = CudaStream::default();
-        let start_event = CudaEvent::create().expect("Failed to create event");
-        let end_event = CudaEvent::create().expect("Failed to create event");
-        start_event
-            .record(&stream)
-            .expect("Could not record CUDA event");
-        (start_event, end_event)
-    }
-
-    fn end(&self, events: Self::Intermediate) -> Self::Value {
-        let (start_event, end_event) = events;
-        let stream = CudaStream::default();
-        end_event
-            .record(&stream)
-            .expect("Could not record CUDA event");
-        stream.synchronize().expect("Failed to synchronize");
-        elapsed_time(&start_event, &end_event).expect("Failed to measure time")
-    }
-
-    fn add(&self, v1: &Self::Value, v2: &Self::Value) -> Self::Value {
-        v1 + v2
-    }
-
-    fn zero(&self) -> Self::Value {
-        0f32
-    }
-
-    fn to_f64(&self, value: &Self::Value) -> f64 {
-        *value as f64
-    }
-
-    fn formatter(&self) -> &dyn ValueFormatter {
-        &CudaEventElapsedTimeFormatter::<INV_ELEMS>
-    }
-}
-
-// based on https://github.com/bheisler/criterion.rs/blob/master/src/measurement.rs
-struct CudaEventElapsedTimeFormatter<const INV_ELEMS: bool = false>;
-
-impl<const INV_ELEMS: bool> CudaEventElapsedTimeFormatter<INV_ELEMS> {
-    fn bytes_per_second(&self, bytes: f64, typical: f64, values: &mut [f64]) -> &'static str {
-        let bytes_per_second = bytes * (1e3 / typical);
-        const K: f64 = 1024.0;
-        let (denominator, unit) = if bytes_per_second < K {
-            (1.0, "  B/s")
-        } else if bytes_per_second < K * K {
-            (K, "KiB/s")
-        } else if bytes_per_second < K * K * K {
-            (K * K, "MiB/s")
-        } else {
-            (K * K * K, "GiB/s")
-        };
-
-        for val in values {
-            let bytes_per_second = bytes * (1e3 / *val);
-            *val = bytes_per_second / denominator;
-        }
-
-        unit
-    }
-
-    fn bytes_per_second_decimal(
-        &self,
-        bytes: f64,
-        typical: f64,
-        values: &mut [f64],
-    ) -> &'static str {
-        let bytes_per_second = bytes * (1e3 / typical);
-        const K: f64 = 1000.0;
-        let (denominator, unit) = if bytes_per_second < K {
-            (1.0, "  B/s")
-        } else if bytes_per_second < K * K {
-            (K, "KB/s")
-        } else if bytes_per_second < K * K * K {
-            (K * K, "MB/s")
-        } else {
-            (K * K * K, "GB/s")
-        };
-
-        for val in values {
-            let bytes_per_second = bytes * (1e3 / *val);
-            *val = bytes_per_second / denominator;
-        }
-
-        unit
-    }
-
-    fn elements_per_second(&self, elems: f64, typical: f64, values: &mut [f64]) -> &'static str {
-        let elems_per_second = elems * (1e3 / typical);
-        const K: f64 = 1000.0;
-        let (denominator, unit) = if elems_per_second < K {
-            (1.0, " elem/s")
-        } else if elems_per_second < K * K {
-            (K, "Kelem/s")
-        } else if elems_per_second < K * K * K {
-            (K * K, "Melem/s")
-        } else {
-            (K * K * K, "Gelem/s")
-        };
-
-        for val in values {
-            let elems_per_second = elems * (1e3 / *val);
-            *val = elems_per_second / denominator;
-        }
-
-        unit
-    }
-
-    fn second_per_element(&self, elems: f64, typical: f64, values: &mut [f64]) -> &'static str {
-        let ms_per_elem = typical / elems;
-        let (factor, unit) = if ms_per_elem < 1e-6 {
-            (1e9, "ps/elem")
-        } else if ms_per_elem < 1e-3 {
-            (1e6, "ns/elem")
-        } else if ms_per_elem < 1e0 {
-            (1e3, "µs/elem")
-        } else if ms_per_elem < 1e3 {
-            (1e0, "ms/elem")
-        } else {
-            (1e-3, "s/elem")
-        };
-
-        for val in values {
-            *val *= factor / elems;
-        }
-
-        unit
-    }
-}
-
-impl<const INV_ELEMS: bool> ValueFormatter for CudaEventElapsedTimeFormatter<INV_ELEMS> {
-    fn scale_values(&self, typical_value: f64, values: &mut [f64]) -> &'static str {
-        let (factor, unit) = if typical_value < 1e-6 {
-            (1e9, "ps")
-        } else if typical_value < 1e-3 {
-            (1e6, "ns")
-        } else if typical_value < 1e0 {
-            (1e3, "µs")
-        } else if typical_value < 1e3 {
-            (1e0, "ms")
-        } else {
-            (1e-3, "s")
-        };
-
-        for val in values {
-            *val *= factor;
-        }
-
-        unit
-    }
-
-    fn scale_throughputs(
-        &self,
-        typical_value: f64,
-        throughput: &Throughput,
-        values: &mut [f64],
-    ) -> &'static str {
-        match *throughput {
-            Throughput::Bytes(bytes) => self.bytes_per_second(bytes as f64, typical_value, values),
-            Throughput::BytesDecimal(bytes) => {
-                self.bytes_per_second_decimal(bytes as f64, typical_value, values)
-            }
-            Throughput::Elements(elems) => {
-                if INV_ELEMS {
-                    self.second_per_element(elems as f64, typical_value, values)
-                } else {
-                    self.elements_per_second(elems as f64, typical_value, values)
-                }
-            }
-        }
-    }
-
-    fn scale_for_machines(&self, _values: &mut [f64]) -> &'static str {
-        // no scaling is needed
-        "ms"
-    }
-}
diff --git a/cudart-sys/Cargo.toml b/cudart-sys/Cargo.toml
deleted file mode 100644
index 419ea6d..0000000
--- a/cudart-sys/Cargo.toml
+++ /dev/null
@@ -1,9 +0,0 @@
-[package]
-name = "cudart-sys"
-version = "0.1.0"
-edition = "2021"
-license = "MIT OR Apache-2.0"
-
-[build-dependencies]
-bindgen = "^0"
-serde_json = "^1"
diff --git a/cudart-sys/build.rs b/cudart-sys/build.rs
deleted file mode 100644
index 74ee3b8..0000000
--- a/cudart-sys/build.rs
+++ /dev/null
@@ -1,265 +0,0 @@
-use std::fs;
-use std::path::PathBuf;
-
-use bindgen::callbacks::{EnumVariantValue, ParseCallbacks};
-
-include!("src/path.rs");
-
-pub fn assert_cuda_version() {
-    let version = option_env!("CUDA_VERSION").map_or_else(
-        || {
-            let file = fs::File::open(concat!(cuda_path!(), "/version.json"))
-                .expect("CUDA Toolkit not found");
-            let reader = std::io::BufReader::new(file);
-            let value: serde_json::Value = serde_json::from_reader(reader).unwrap();
-            dbg!(value["cuda"]["version"].as_str().unwrap().to_string())
-        },
-        |s| s.to_string(),
-    );
-    assert!(
-        version.starts_with("12."),
-        "CUDA Toolkit {version} is not supported. Please install CUDA Toolkit 12.x"
-    );
-}
-
-#[derive(Debug)]
-struct CudaParseCallbacks;
-
-impl ParseCallbacks for CudaParseCallbacks {
-    fn enum_variant_name(
-        &self,
-        enum_name: Option<&str>,
-        original_variant_name: &str,
-        _variant_value: EnumVariantValue,
-    ) -> Option<String> {
-        let strip_prefix = |prefix| {
-            Some(
-                original_variant_name
-                    .strip_prefix(prefix)
-                    .unwrap()
-                    .to_string(),
-            )
-        };
-        if let Some(enum_name) = enum_name {
-            match enum_name {
-                "enum cudaDeviceAttr" => strip_prefix("cudaDevAttr"),
-                "enum cudaLimit" => strip_prefix("cudaLimit"),
-                "enum cudaError" => strip_prefix("cuda"),
-                "enum cudaMemcpyKind" => strip_prefix("cudaMemcpy"),
-                "enum cudaMemPoolAttr" => strip_prefix("cudaMemPool"),
-                "enum cudaMemLocationType" => strip_prefix("cudaMemLocationType"),
-                "enum cudaMemAllocationType" => strip_prefix("cudaMemAllocationType"),
-                "enum cudaMemAllocationHandleType" => strip_prefix("cudaMemHandleType"),
-                "enum cudaMemoryType" => strip_prefix("cudaMemoryType"),
-                "enum cudaMemAccessFlags" => strip_prefix("cudaMemAccessFlagsProt"),
-                "enum cudaFuncAttribute" => strip_prefix("cudaFuncAttribute"),
-                "enum cudaFuncCache" => strip_prefix("cudaFuncCache"),
-                "enum cudaSharedMemConfig" => strip_prefix("cudaSharedMem"),
-                "enum cudaLaunchAttributeID" => strip_prefix("cudaLaunchAttribute"),
-                "enum cudaAccessProperty" => strip_prefix("cudaAccessProperty"),
-                "enum cudaSynchronizationPolicy" => strip_prefix("cudaSyncPolicy"),
-                "enum cudaClusterSchedulingPolicy" => strip_prefix("cudaClusterSchedulingPolicy"),
-                "enum cudaLaunchMemSyncDomain" => strip_prefix("cudaLaunchMemSyncDomain"),
-                _ => None,
-            }
-        } else {
-            None
-        }
-    }
-
-    fn item_name(&self, _original_item_name: &str) -> Option<String> {
-        let from = |s: &str| Some(String::from(s));
-        match _original_item_name {
-            "cudaDeviceAttr" => from("CudaDeviceAttr"),
-            "cudaLimit" => from("CudaLimit"),
-            "cudaError" => from("CudaError"),
-            "cudaDeviceProp" => from("CudaDeviceProperties"),
-            "cudaMemcpyKind" => from("CudaMemoryCopyKind"),
-            "cudaMemPoolProps" => from("CudaMemPoolProperties"),
-            "cudaMemPoolAttr" => from("CudaMemPoolAttribute"),
-            "cudaMemLocation" => from("CudaMemLocation"),
-            "cudaMemLocationType" => from("CudaMemLocationType"),
-            "cudaMemAllocationType" => from("CudaMemAllocationType"),
-            "cudaMemAllocationHandleType" => from("CudaMemAllocationHandleType"),
-            "cudaPointerAttributes" => from("CudaPointerAttributes"),
-            "cudaMemoryType" => from("CudaMemoryType"),
-            "cudaMemAccessFlags" => from("CudaMemAccessFlags"),
-            "cudaMemAccessDesc" => from("CudaMemAccessDesc"),
-            "cudaFuncAttributes" => from("CudaFuncAttributes"),
-            "cudaFuncAttribute" => from("CudaFuncAttribute"),
-            "cudaFuncCache" => from("CudaFuncCache"),
-            "cudaSharedMemConfig" => from("CudaSharedMemConfig"),
-            "cudaLaunchAttributeID" => from("CudaLaunchAttributeID"),
-            "cudaLaunchAttributeValue" => from("CudaLaunchAttributeValue"),
-            "cudaAccessPolicyWindow" => from("CudaAccessPolicyWindow"),
-            "cudaAccessProperty" => from("CudaAccessProperty"),
-            "cudaSynchronizationPolicy" => from("CudaSynchronizationPolicy"),
-            "cudaClusterSchedulingPolicy" => from("CudaClusterSchedulingPolicy"),
-            "cudaLaunchMemSyncDomain" => from("CudaLaunchMemSyncDomain"),
-            _ => None,
-        }
-    }
-}
-
-fn main() {
-    #[cfg(target_os = "macos")]
-    std::process::exit(0);
-    assert_cuda_version();
-    let cuda_lib_path = cuda_lib_path!();
-    let cuda_runtime_api_path = concat!(cuda_include_path!(), "/cuda_runtime_api.h");
-    println!("cargo:rustc-link-search=native={cuda_lib_path}");
-    println!("cargo:rustc-link-lib=cudart");
-    println!("cargo:rerun-if-changed={cuda_runtime_api_path}");
-
-    let bindings = bindgen::Builder::default()
-        .header(cuda_runtime_api_path)
-        .parse_callbacks(Box::new(bindgen::CargoCallbacks))
-        .parse_callbacks(Box::new(CudaParseCallbacks))
-        .size_t_is_usize(true)
-        .generate_comments(false)
-        .layout_tests(false)
-        .allowlist_type("cudaError")
-        .rustified_enum("cudaError")
-        .must_use_type("cudaError")
-        // device management
-        // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html
-        .rustified_enum("cudaDeviceAttr")
-        .allowlist_function("cudaDeviceGetAttribute")
-        .allowlist_function("cudaDeviceGetDefaultMemPool")
-        .rustified_enum("cudaLimit")
-        .allowlist_function("cudaDeviceGetLimit")
-        .allowlist_function("cudaDeviceGetMemPool")
-        .allowlist_function("cudaDeviceReset")
-        .allowlist_function("cudaDeviceSetLimit")
-        .allowlist_function("cudaDeviceSetMemPool")
-        .allowlist_function("cudaDeviceSynchronize")
-        .allowlist_function("cudaGetDevice")
-        .allowlist_function("cudaGetDeviceCount")
-        .allowlist_function("cudaGetDeviceProperties_v2")
-        .allowlist_function("cudaSetDevice")
-        // error handling
-        // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__ERROR.html
-        .allowlist_function("cudaGetErrorName")
-        .allowlist_function("cudaGetLastError")
-        .allowlist_function("cudaPeekAtLastError")
-        // stream management
-        // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html
-        .allowlist_function("cudaStreamCreate")
-        .allowlist_var("cudaStreamDefault")
-        .allowlist_var("cudaStreamNonBlocking")
-        .allowlist_function("cudaStreamCreateWithFlags")
-        .allowlist_function("cudaStreamDestroy")
-        .allowlist_function("cudaStreamGetAttribute")
-        .allowlist_function("cudaStreamQuery")
-        .allowlist_function("cudaStreamSetAttribute")
-        .allowlist_function("cudaStreamSynchronize")
-        .allowlist_var("cudaEventWaitDefault")
-        .allowlist_var("cudaEventWaitExternal")
-        .allowlist_function("cudaStreamWaitEvent")
-        // event management
-        // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html
-        .allowlist_function("cudaEventCreate")
-        .allowlist_var("cudaEventDefault")
-        .allowlist_var("cudaEventBlockingSync")
-        .allowlist_var("cudaEventDisableTiming")
-        .allowlist_var("cudaEventInterprocess")
-        .allowlist_function("cudaEventCreateWithFlags")
-        .allowlist_function("cudaEventDestroy")
-        .allowlist_function("cudaEventElapsedTime")
-        .allowlist_function("cudaEventQuery")
-        .allowlist_function("cudaEventRecord")
-        .allowlist_var("cudaEventRecordDefault")
-        .allowlist_var("cudaEventRecordExternal")
-        .allowlist_function("cudaEventRecordWithFlags")
-        .allowlist_function("cudaEventSynchronize")
-        // execution control
-        // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EXECUTION.html
-        .rustified_enum("cudaFuncAttribute")
-        .allowlist_function("cudaFuncGetAttributes")
-        .allowlist_function("cudaFuncSetAttribute")
-        .rustified_enum("cudaFuncCache")
-        .allowlist_function("cudaFuncSetCacheConfig")
-        .rustified_enum("cudaSharedMemConfig")
-        .allowlist_function("cudaFuncSetSharedMemConfig")
-        .allowlist_function("cudaLaunchHostFunc")
-        .allowlist_function("cudaLaunchKernel")
-        .rustified_enum("cudaLaunchAttributeID")
-        .rustified_enum("cudaAccessProperty")
-        .rustified_enum("cudaSynchronizationPolicy")
-        .rustified_enum("cudaClusterSchedulingPolicy")
-        .rustified_enum("cudaLaunchMemSyncDomain")
-        .allowlist_function("cudaLaunchKernelExC")
-        // occupancy
-        // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__OCCUPANCY.html
-        .allowlist_function("cudaOccupancyAvailableDynamicSMemPerBlock")
-        .allowlist_function("cudaOccupancyMaxActiveBlocksPerMultiprocessor")
-        .allowlist_function("cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags")
-        .allowlist_function("cudaOccupancyMaxActiveClusters")
-        .allowlist_function("cudaOccupancyMaxPotentialClusterSize")
-        // memory management
-        // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html
-        .rustified_enum("cudaMemcpyKind")
-        .allowlist_function("cudaFree")
-        .allowlist_function("cudaFreeHost")
-        .allowlist_function("cudaGetSymbolAddress")
-        .allowlist_function("cudaGetSymbolSize")
-        .allowlist_var("cudaHostAllocDefault")
-        .allowlist_var("cudaHostAllocPortable")
-        .allowlist_var("cudaHostAllocMapped")
-        .allowlist_var("cudaHostAllocWriteCombined")
-        .allowlist_function("cudaHostAlloc")
-        .allowlist_var("cudaHostRegisterDefault")
-        .allowlist_var("cudaHostRegisterPortable")
-        .allowlist_var("cudaHostRegisterMapped")
-        .allowlist_var("cudaHostRegisterIoMemory")
-        .allowlist_var("cudaHostRegisterReadOnly")
-        .allowlist_function("cudaHostRegister")
-        .allowlist_function("cudaHostUnregister")
-        .allowlist_function("cudaMalloc")
-        .allowlist_function("cudaMemGetInfo")
-        .allowlist_function("cudaMemcpy")
-        .allowlist_function("cudaMemcpyAsync")
-        .allowlist_function("cudaMemcpyFromSymbol")
-        .allowlist_function("cudaMemcpyFromSymbolAsync")
-        .allowlist_function("cudaMemcpyPeer")
-        .allowlist_function("cudaMemcpyPeerAsync")
-        .allowlist_function("cudaMemcpyToSymbol")
-        .allowlist_function("cudaMemcpyToSymbolAsync")
-        .allowlist_function("cudaMemset")
-        .allowlist_function("cudaMemsetAsync")
-        // Stream Ordered Memory Allocator
-        // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html
-        .allowlist_function("cudaFreeAsync")
-        .allowlist_function("cudaMallocAsync")
-        .allowlist_function("cudaMallocFromPoolAsync")
-        .rustified_enum("cudaMemLocationType")
-        .rustified_enum("cudaMemAllocationType")
-        .rustified_enum("cudaMemAllocationHandleType")
-        .allowlist_function("cudaMemPoolCreate")
-        .allowlist_function("cudaMemPoolDestroy")
-        .rustified_enum("cudaMemPoolAttr")
-        .rustified_enum("cudaMemAccessFlags")
-        .allowlist_function("cudaMemPoolGetAccess")
-        .allowlist_function("cudaMemPoolGetAttribute")
-        .allowlist_function("cudaMemPoolSetAccess")
-        .allowlist_function("cudaMemPoolSetAttribute")
-        .allowlist_function("cudaMemPoolTrimTo")
-        // Unified Addressing
-        // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__UNIFIED.html
-        .rustified_enum("cudaMemoryType")
-        .allowlist_function("cudaPointerGetAttributes")
-        // Peer Device Memory Access
-        // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__PEER.html
-        .allowlist_function("cudaDeviceCanAccessPeer")
-        .allowlist_function("cudaDeviceDisablePeerAccess")
-        .allowlist_function("cudaDeviceEnablePeerAccess")
-        //
-        .generate()
-        .expect("Unable to generate bindings");
-
-    fs::write(
-        PathBuf::from("src").join("bindings.rs"),
-        bindings.to_string(),
-    )
-    .expect("Couldn't write bindings!");
-}
diff --git a/cudart-sys/src/.gitignore b/cudart-sys/src/.gitignore
deleted file mode 100644
index 55300bd..0000000
--- a/cudart-sys/src/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-bindings.rs
diff --git a/cudart-sys/src/lib.rs b/cudart-sys/src/lib.rs
deleted file mode 100644
index de4027a..0000000
--- a/cudart-sys/src/lib.rs
+++ /dev/null
@@ -1,78 +0,0 @@
-#![allow(non_upper_case_globals)]
-#![allow(non_camel_case_types)]
-#![allow(non_snake_case)]
-
-mod path;
-
-use std::backtrace::Backtrace;
-use std::error::Error;
-use std::ffi::CStr;
-use std::fmt::{Debug, Display, Formatter};
-use std::mem::MaybeUninit;
-
-include!("bindings.rs");
-
-impl CudaError {
-    pub fn eprint_error(self) {
-        if self != CudaError::Success {
-            eprintln!("CUDA Error: {self}");
-        }
-    }
-
-    pub fn eprint_error_and_backtrace(self) {
-        if self != CudaError::Success {
-            self.eprint_error();
-            let backtrace = Backtrace::capture();
-            eprintln!("Backtrace: {backtrace}");
-        }
-    }
-}
-
-impl Display for CudaError {
-    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
-        let name = unsafe { CStr::from_ptr(cudaGetErrorName(*self)) };
-        name.fmt(f)
-    }
-}
-
-impl Error for CudaError {}
-
-impl From<u32> for dim3 {
-    fn from(value: u32) -> Self {
-        Self {
-            x: value,
-            y: 1,
-            z: 1,
-        }
-    }
-}
-
-impl From<(u32, u32)> for dim3 {
-    fn from(value: (u32, u32)) -> Self {
-        Self {
-            x: value.0,
-            y: value.1,
-            z: 1,
-        }
-    }
-}
-
-impl From<(u32, u32, u32)> for dim3 {
-    fn from(value: (u32, u32, u32)) -> Self {
-        Self {
-            x: value.0,
-            y: value.1,
-            z: value.2,
-        }
-    }
-}
-
-impl Default for CudaMemPoolProperties {
-    fn default() -> Self {
-        let mut s = MaybeUninit::<Self>::uninit();
-        unsafe {
-            std::ptr::write_bytes(s.as_mut_ptr(), 0, 1);
-            s.assume_init()
-        }
-    }
-}
diff --git a/cudart-sys/src/path.rs b/cudart-sys/src/path.rs
deleted file mode 100644
index 9268ee5..0000000
--- a/cudart-sys/src/path.rs
+++ /dev/null
@@ -1,54 +0,0 @@
-#[cfg(target_os = "windows")]
-#[macro_export]
-macro_rules! cuda_path {
-    () => {
-        env!("CUDA_PATH")
-    };
-}
-
-#[cfg(target_os = "linux")]
-#[macro_export]
-macro_rules! cuda_path {
-    () => {
-        "/usr/local/cuda"
-    };
-}
-
-#[cfg(not(any(target_os = "windows", target_os = "linux")))]
-#[macro_export]
-macro_rules! cuda_path {
-    () => {
-        unimplemented!()
-    };
-}
-
-#[macro_export]
-macro_rules! cuda_include_path {
-    () => {
-        concat!(cuda_path!(), "/include")
-    };
-}
-
-#[cfg(target_os = "windows")]
-#[macro_export]
-macro_rules! cuda_lib_path {
-    () => {
-        concat!(cuda_path!(), "/lib/x64")
-    };
-}
-
-#[cfg(target_os = "linux")]
-#[macro_export]
-macro_rules! cuda_lib_path {
-    () => {
-        concat!(cuda_path!(), "/lib64")
-    };
-}
-
-#[cfg(not(any(target_os = "windows", target_os = "linux")))]
-#[macro_export]
-macro_rules! cuda_lib_path {
-    () => {
-        unimplemented!()
-    };
-}
diff --git a/cudart/Cargo.toml b/cudart/Cargo.toml
deleted file mode 100644
index 30efa81..0000000
--- a/cudart/Cargo.toml
+++ /dev/null
@@ -1,13 +0,0 @@
-[package]
-name = "cudart"
-version = "0.1.0"
-edition = "2021"
-license = "MIT OR Apache-2.0"
-
-[dependencies]
-cudart-sys = { path = "../cudart-sys" }
-bitflags = "^2"
-criterion = "^0"
-
-[dev-dependencies]
-serial_test = "^2"
diff --git a/cudart/src/device.rs b/cudart/src/device.rs
deleted file mode 100644
index e3d8954..0000000
--- a/cudart/src/device.rs
+++ /dev/null
@@ -1,181 +0,0 @@
-// device management
-// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html
-
-use std::mem::MaybeUninit;
-
-use cudart_sys::*;
-
-use crate::memory_pools::CudaMemPool;
-use crate::result::{CudaResult, CudaResultWrap};
-
-pub fn device_get_attribute(attr: CudaDeviceAttr, device_id: i32) -> CudaResult<i32> {
-    let mut value = MaybeUninit::<i32>::uninit();
-    unsafe { cudaDeviceGetAttribute(value.as_mut_ptr(), attr, device_id).wrap_maybe_uninit(value) }
-}
-
-pub fn device_reset() -> CudaResult<()> {
-    unsafe { cudaDeviceReset().wrap() }
-}
-
-pub fn device_synchronize() -> CudaResult<()> {
-    unsafe { cudaDeviceSynchronize().wrap() }
-}
-
-pub fn get_device_count() -> CudaResult<i32> {
-    let mut count = MaybeUninit::<i32>::uninit();
-    unsafe { cudaGetDeviceCount(count.as_mut_ptr()).wrap_maybe_uninit(count) }
-}
-
-pub fn get_device() -> CudaResult<i32> {
-    let mut device_id = MaybeUninit::<i32>::uninit();
-    unsafe { cudaGetDevice(device_id.as_mut_ptr()).wrap_maybe_uninit(device_id) }
-}
-
-pub fn get_device_properties(device_id: i32) -> CudaResult<CudaDeviceProperties> {
-    let mut props = MaybeUninit::<CudaDeviceProperties>::uninit();
-    unsafe { cudaGetDeviceProperties_v2(props.as_mut_ptr(), device_id).wrap_maybe_uninit(props) }
-}
-
-pub fn set_device(device_id: i32) -> CudaResult<()> {
-    unsafe { cudaSetDevice(device_id).wrap() }
-}
-
-pub fn device_get_default_mem_pool(device_id: i32) -> CudaResult<CudaMemPool> {
-    let mut handle = MaybeUninit::<cudaMemPool_t>::uninit();
-    unsafe {
-        cudaDeviceGetDefaultMemPool(handle.as_mut_ptr(), device_id)
-            .wrap_maybe_uninit(handle)
-            .map(CudaMemPool::from_handle)
-    }
-}
-
-pub fn device_get_mem_pool(device_id: i32) -> CudaResult<CudaMemPool> {
-    let mut handle = MaybeUninit::<cudaMemPool_t>::uninit();
-    unsafe {
-        cudaDeviceGetMemPool(handle.as_mut_ptr(), device_id)
-            .wrap_maybe_uninit(handle)
-            .map(CudaMemPool::from_handle)
-    }
-}
-
-pub fn device_set_mem_pool(device_id: i32, pool: &CudaMemPool) -> CudaResult<()> {
-    unsafe { cudaDeviceSetMemPool(device_id, pool.into()).wrap() }
-}
-
-pub fn device_get_limit(limit: CudaLimit) -> CudaResult<usize> {
-    let mut value = MaybeUninit::<usize>::uninit();
-    unsafe { cudaDeviceGetLimit(value.as_mut_ptr(), limit).wrap_maybe_uninit(value) }
-}
-
-pub fn device_set_limit(limit: CudaLimit, value: usize) -> CudaResult<()> {
-    unsafe { cudaDeviceSetLimit(limit, value).wrap() }
-}
-
-#[cfg(test)]
-mod tests {
-    use std::ffi::CStr;
-
-    use serial_test::serial;
-
-    use super::*;
-
-    #[test]
-    #[serial]
-    fn device_get_attribute_is_ok() {
-        let result = device_get_attribute(CudaDeviceAttr::MaxBlocksPerMultiprocessor, 0);
-        assert!(result.is_ok());
-    }
-
-    #[test]
-    #[serial]
-    fn device_get_attribute_max_blocks_per_multiprocessor_is_gt_zero() {
-        let result = device_get_attribute(CudaDeviceAttr::MaxBlocksPerMultiprocessor, 0).unwrap();
-        assert!(result > 0);
-    }
-
-    #[test]
-    #[serial]
-    fn device_reset_is_ok() {
-        let result = device_reset();
-        assert_eq!(result, Ok(()));
-    }
-
-    #[test]
-    #[serial]
-    fn device_synchronize_is_ok() {
-        let result = device_synchronize();
-        assert_eq!(result, Ok(()));
-    }
-
-    #[test]
-    #[serial]
-    fn get_device_count_is_not_zero() {
-        let count = get_device_count().unwrap();
-        assert_ne!(count, 0);
-    }
-
-    #[test]
-    #[serial]
-    fn device_id_is_smaller_than_device_count() {
-        let device_id = get_device().unwrap();
-        let count = get_device_count().unwrap();
-        assert!(device_id < count);
-    }
-
-    #[test]
-    #[serial]
-    fn device_properties_name_is_not_empty_for_all_devices() {
-        let count = get_device_count().unwrap();
-        for i in 0..count {
-            let props = get_device_properties(i).unwrap();
-            let name = unsafe { CStr::from_ptr(props.name.as_ptr()) }
-                .to_str()
-                .unwrap();
-            assert!(!name.is_empty());
-        }
-    }
-
-    #[test]
-    #[serial]
-    fn set_device_works_for_all_devices() {
-        let count = get_device_count().unwrap();
-        let original_device_id = get_device().unwrap();
-        for i in 0..count {
-            set_device(i).unwrap();
-            let current_device_id = get_device().unwrap();
-            assert_eq!(i, current_device_id);
-        }
-        set_device(original_device_id).unwrap();
-    }
-
-    #[test]
-    #[serial]
-    fn device_get_default_mem_pool_is_ok_for_all_devices() {
-        let count = get_device_count().unwrap();
-        for i in 0..count {
-            let result = device_get_default_mem_pool(i);
-            assert!(result.is_ok());
-        }
-    }
-
-    #[test]
-    #[serial]
-    fn device_get_mem_pool_is_ok_for_all_devices() {
-        let count = get_device_count().unwrap();
-        for i in 0..count {
-            let result = device_get_mem_pool(i);
-            assert!(result.is_ok());
-        }
-    }
-
-    #[test]
-    #[serial]
-    fn device_set_mem_pool_is_ok_for_all_devices() {
-        let count = get_device_count().unwrap();
-        for i in 0..count {
-            let pool = device_get_mem_pool(i).unwrap();
-            let result = device_set_mem_pool(i, &pool);
-            assert!(result.is_ok());
-        }
-    }
-}
diff --git a/cudart/src/error.rs b/cudart/src/error.rs
deleted file mode 100644
index aa09e52..0000000
--- a/cudart/src/error.rs
+++ /dev/null
@@ -1,33 +0,0 @@
-// error handling
-// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__ERROR.html
-
-use cudart_sys::*;
-
-pub fn get_last_error() -> CudaError {
-    unsafe { cudaGetLastError() }
-}
-
-pub fn peek_at_last_error() -> CudaError {
-    unsafe { cudaPeekAtLastError() }
-}
-
-#[cfg(test)]
-mod tests {
-    use serial_test::serial;
-
-    use super::*;
-
-    #[test]
-    #[serial]
-    fn get_last_error_equals_success() {
-        let result = get_last_error();
-        assert_eq!(result, CudaError::Success)
-    }
-
-    #[test]
-    #[serial]
-    fn peek_at_last_error_equals_success() {
-        let result = peek_at_last_error();
-        assert_eq!(result, CudaError::Success)
-    }
-}
diff --git a/cudart/src/event.rs b/cudart/src/event.rs
deleted file mode 100644
index 5c244c7..0000000
--- a/cudart/src/event.rs
+++ /dev/null
@@ -1,237 +0,0 @@
-// event management
-// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html
-
-use std::mem::{self, MaybeUninit};
-use std::ptr::NonNull;
-
-use bitflags::bitflags;
-
-use cudart_sys::*;
-
-use crate::result::{CudaResult, CudaResultWrap};
-use crate::stream::CudaStream;
-
-#[repr(transparent)]
-#[derive(Debug)]
-pub struct CudaEvent {
-    handle: NonNull<CUevent_st>,
-}
-
-bitflags! {
-    #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
-    pub struct CudaEventCreateFlags: u32 {
-        const DEFAULT = cudart_sys::cudaEventDefault;
-        const BLOCKING_SYNC = cudart_sys::cudaEventBlockingSync;
-        const DISABLE_TIMING = cudart_sys::cudaEventDisableTiming;
-        const INTERPROCESS = cudart_sys::cudaEventInterprocess;
-    }
-}
-
-impl Default for CudaEventCreateFlags {
-    fn default() -> Self {
-        Self::DEFAULT
-    }
-}
-
-bitflags! {
-    #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
-    pub struct CudaEventRecordFlags: u32 {
-        const DEFAULT = cudart_sys::cudaEventRecordDefault;
-        const EXTERNAL = cudart_sys::cudaEventRecordExternal;
-    }
-}
-
-impl Default for CudaEventRecordFlags {
-    fn default() -> Self {
-        Self::DEFAULT
-    }
-}
-
-impl CudaEvent {
-    fn from_handle(handle: cudaEvent_t) -> Self {
-        Self {
-            handle: NonNull::new(handle).unwrap(),
-        }
-    }
-
-    pub fn create() -> CudaResult<Self> {
-        let mut handle = MaybeUninit::<cudaEvent_t>::uninit();
-        unsafe {
-            cudaEventCreate(handle.as_mut_ptr())
-                .wrap_maybe_uninit(handle)
-                .map(Self::from_handle)
-        }
-    }
-
-    pub fn create_with_flags(flags: CudaEventCreateFlags) -> CudaResult<Self> {
-        let mut handle = MaybeUninit::<cudaEvent_t>::uninit();
-        unsafe {
-            cudaEventCreateWithFlags(handle.as_mut_ptr(), flags.bits())
-                .wrap_maybe_uninit(handle)
-                .map(Self::from_handle)
-        }
-    }
-
-    pub fn destroy(self) -> CudaResult<()> {
-        let handle = self.handle;
-        mem::forget(self);
-        unsafe { cudaEventDestroy(handle.as_ptr()).wrap() }
-    }
-
-    pub fn query(&self) -> CudaResult<bool> {
-        let error = unsafe { cudaEventQuery(self.handle.as_ptr()) };
-        match error {
-            CudaError::Success => Ok(true),
-            CudaError::ErrorNotReady => Ok(false),
-            _ => Err(error),
-        }
-    }
-
-    pub fn record(&self, stream: &CudaStream) -> CudaResult<()> {
-        unsafe { cudaEventRecord(self.handle.as_ptr(), stream.into()).wrap() }
-    }
-
-    pub fn record_with_flags(
-        &self,
-        stream: &CudaStream,
-        flags: CudaEventRecordFlags,
-    ) -> CudaResult<()> {
-        unsafe {
-            cudaEventRecordWithFlags(self.handle.as_ptr(), stream.into(), flags.bits()).wrap()
-        }
-    }
-
-    pub fn synchronize(&self) -> CudaResult<()> {
-        unsafe { cudaEventSynchronize(self.handle.as_ptr()).wrap() }
-    }
-}
-
-impl Drop for CudaEvent {
-    fn drop(&mut self) {
-        unsafe { cudaEventDestroy(self.handle.as_ptr()).eprint_error_and_backtrace() };
-    }
-}
-
-impl From<&CudaEvent> for cudaEvent_t {
-    fn from(event: &CudaEvent) -> Self {
-        event.handle.as_ptr()
-    }
-}
-
-pub fn elapsed_time(start: &CudaEvent, end: &CudaEvent) -> CudaResult<f32> {
-    let mut ms = MaybeUninit::<f32>::uninit();
-    unsafe {
-        cudaEventElapsedTime(ms.as_mut_ptr(), start.handle.as_ptr(), end.handle.as_ptr())
-            .wrap_maybe_uninit(ms)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use std::ptr::null_mut;
-    use std::thread;
-    use std::time::Duration;
-
-    use serial_test::serial;
-
-    use crate::execution::{self, HostFn};
-
-    use super::*;
-
-    #[test]
-    #[serial]
-    fn create_is_ok() {
-        let result = CudaEvent::create();
-        assert!(result.is_ok());
-    }
-
-    #[test]
-    #[serial]
-    fn create_handle_is_not_null() {
-        let event = CudaEvent::create().unwrap();
-        assert_ne!(event.handle.as_ptr(), null_mut());
-    }
-
-    #[test]
-    #[serial]
-    fn create_with_flags_is_ok() {
-        let result = CudaEvent::create_with_flags(
-            CudaEventCreateFlags::DISABLE_TIMING | CudaEventCreateFlags::BLOCKING_SYNC,
-        );
-        assert!(result.is_ok());
-    }
-
-    #[test]
-    #[serial]
-    fn create_with_flags_handle_is_not_null() {
-        let event = CudaEvent::create_with_flags(CudaEventCreateFlags::DISABLE_TIMING).unwrap();
-        assert_ne!(event.handle.as_ptr(), null_mut());
-    }
-
-    #[test]
-    #[serial]
-    fn destroy_is_ok() {
-        let event = CudaEvent::create().unwrap();
-        let result = event.destroy();
-        assert_eq!(result, Ok(()));
-    }
-
-    #[test]
-    #[serial]
-    fn query_is_true() {
-        let stream = CudaStream::create().unwrap();
-        let event = CudaEvent::create().unwrap();
-        event.record(&stream).unwrap();
-        stream.synchronize().unwrap();
-        let result = event.query();
-        assert_eq!(result, Ok(true));
-    }
-
-    #[test]
-    #[serial]
-    fn query_is_false() {
-        let stream = CudaStream::create().unwrap();
-        let event = CudaEvent::create().unwrap();
-        let func = HostFn::new(|| thread::sleep(Duration::from_millis(100)));
-        execution::launch_host_fn(&stream, &func).unwrap();
-        event.record(&stream).unwrap();
-        let result = event.query();
-        stream.synchronize().unwrap();
-        assert_eq!(result, Ok(false));
-    }
-
-    #[test]
-    #[serial]
-    fn record_is_ok() {
-        let stream = CudaStream::create().unwrap();
-        let event = CudaEvent::create().unwrap();
-        let result = event.record(&stream);
-        stream.synchronize().unwrap();
-        assert_eq!(result, Ok(()));
-    }
-
-    #[test]
-    #[serial]
-    fn synchronize_is_ok() {
-        let stream = CudaStream::create().unwrap();
-        let event = CudaEvent::create().unwrap();
-        event.record(&stream).unwrap();
-        let result = event.synchronize();
-        assert_eq!(result, Ok(()));
-    }
-
-    #[test]
-    #[serial]
-    fn elapsed_time_in_range() {
-        let stream = CudaStream::create().unwrap();
-        let start = CudaEvent::create().unwrap();
-        let end = CudaEvent::create().unwrap();
-        let func = HostFn::new(|| thread::sleep(Duration::from_millis(10)));
-        start.record(&stream).unwrap();
-        execution::launch_host_fn(&stream, &func).unwrap();
-        end.record(&stream).unwrap();
-        stream.synchronize().unwrap();
-        let elapsed = elapsed_time(&start, &end).unwrap();
-        assert!(elapsed > 10.0 && elapsed < 100.0);
-    }
-}
diff --git a/cudart/src/execution.rs b/cudart/src/execution.rs
deleted file mode 100644
index 928069b..0000000
--- a/cudart/src/execution.rs
+++ /dev/null
@@ -1,512 +0,0 @@
-// execution control
-// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EXECUTION.html
-
-use core::ffi::c_void;
-use std::marker::PhantomData;
-use std::os::raw::{c_char, c_int, c_uint};
-use std::sync::{Arc, Weak};
-
-use cudart_sys::*;
-
-use crate::result::{CudaResult, CudaResultWrap};
-use crate::stream::CudaStream;
-
-pub struct KernelArguments<'a> {
-    vec: Vec<*mut c_void>,
-    phantom: PhantomData<&'a c_void>,
-}
-
-impl<'a> KernelArguments<'a> {
-    pub fn new() -> Self {
-        Self {
-            vec: vec![],
-            phantom: PhantomData,
-        }
-    }
-
-    pub fn push<T>(&mut self, value: &T) {
-        self.vec.push(value as *const T as *mut c_void);
-    }
-
-    pub fn as_mut_ptr(&mut self) -> *mut *mut c_void {
-        self.vec.as_mut_ptr()
-    }
-}
-
-impl<'a> Default for KernelArguments<'a> {
-    fn default() -> Self {
-        KernelArguments::new()
-    }
-}
-
-#[macro_export]
-macro_rules! kernel_args {
-    ($($x:expr),* $(,)?) => {
-        {
-            let mut args = $crate::execution::KernelArguments::new();
-            $(
-            args.push($x);
-            )*
-            args
-        }
-    };
-}
-
-pub use kernel_args;
-
-#[derive(Debug, Copy, Clone)]
-pub enum CudaLaunchAttribute {
-    Ignore,
-    AccessPolicyWindow(CudaAccessPolicyWindow),
-    Cooperative(bool),
-    SynchronizationPolicy(CudaSynchronizationPolicy),
-    ClusterDimension(dim3),
-    ClusterSchedulingPolicyPreference(CudaClusterSchedulingPolicy),
-    ProgrammaticStreamSerialization(bool),
-    ProgrammaticEvent(cudaLaunchAttributeValue__bindgen_ty_2),
-    Priority(i32),
-    MemSyncDomainMap(cudaLaunchMemSyncDomainMap),
-    MemSyncDomain(CudaLaunchMemSyncDomain),
-}
-
-impl CudaLaunchAttribute {
-    pub(crate) fn from_id_and_value(
-        id: CudaLaunchAttributeID,
-        value: CudaLaunchAttributeValue,
-    ) -> Self {
-        unsafe {
-            match id {
-                CudaLaunchAttributeID::Ignore => Self::Ignore,
-                CudaLaunchAttributeID::AccessPolicyWindow => {
-                    Self::AccessPolicyWindow(value.accessPolicyWindow)
-                }
-                CudaLaunchAttributeID::Cooperative => Self::Cooperative(value.cooperative != 0),
-                CudaLaunchAttributeID::SynchronizationPolicy => {
-                    Self::SynchronizationPolicy(value.syncPolicy)
-                }
-                CudaLaunchAttributeID::ClusterDimension => {
-                    let d = value.clusterDim;
-                    Self::ClusterDimension(dim3 {
-                        x: d.x,
-                        y: d.y,
-                        z: d.z,
-                    })
-                }
-                CudaLaunchAttributeID::ClusterSchedulingPolicyPreference => {
-                    Self::ClusterSchedulingPolicyPreference(value.clusterSchedulingPolicyPreference)
-                }
-                CudaLaunchAttributeID::ProgrammaticStreamSerialization => {
-                    Self::ProgrammaticStreamSerialization(
-                        value.programmaticStreamSerializationAllowed != 0,
-                    )
-                }
-                CudaLaunchAttributeID::ProgrammaticEvent => {
-                    Self::ProgrammaticEvent(value.programmaticEvent)
-                }
-                CudaLaunchAttributeID::Priority => Self::Priority(value.priority),
-                CudaLaunchAttributeID::MemSyncDomainMap => {
-                    Self::MemSyncDomainMap(value.memSyncDomainMap)
-                }
-                CudaLaunchAttributeID::MemSyncDomain => Self::MemSyncDomain(value.memSyncDomain),
-            }
-        }
-    }
-
-    pub(crate) fn into_id_and_value(self) -> (CudaLaunchAttributeID, CudaLaunchAttributeValue) {
-        match self {
-            CudaLaunchAttribute::Ignore => (
-                CudaLaunchAttributeID::Ignore,
-                CudaLaunchAttributeValue { pad: [0; 64] },
-            ),
-            CudaLaunchAttribute::AccessPolicyWindow(access_policy_window) => (
-                CudaLaunchAttributeID::AccessPolicyWindow,
-                CudaLaunchAttributeValue {
-                    accessPolicyWindow: access_policy_window,
-                },
-            ),
-            CudaLaunchAttribute::Cooperative(cooperative) => (
-                CudaLaunchAttributeID::Cooperative,
-                CudaLaunchAttributeValue {
-                    cooperative: cooperative as c_int,
-                },
-            ),
-            CudaLaunchAttribute::SynchronizationPolicy(sync_policy) => (
-                CudaLaunchAttributeID::SynchronizationPolicy,
-                CudaLaunchAttributeValue {
-                    syncPolicy: sync_policy,
-                },
-            ),
-            CudaLaunchAttribute::ClusterDimension(cluster_dim) => (
-                CudaLaunchAttributeID::ClusterDimension,
-                CudaLaunchAttributeValue {
-                    clusterDim: cudaLaunchAttributeValue__bindgen_ty_1 {
-                        x: cluster_dim.x,
-                        y: cluster_dim.y,
-                        z: cluster_dim.z,
-                    },
-                },
-            ),
-            CudaLaunchAttribute::ClusterSchedulingPolicyPreference(
-                cluster_scheduling_policy_preference,
-            ) => (
-                CudaLaunchAttributeID::ClusterSchedulingPolicyPreference,
-                CudaLaunchAttributeValue {
-                    clusterSchedulingPolicyPreference: cluster_scheduling_policy_preference,
-                },
-            ),
-            CudaLaunchAttribute::ProgrammaticStreamSerialization(
-                programmatic_stream_serialization_allowed,
-            ) => (
-                CudaLaunchAttributeID::ProgrammaticStreamSerialization,
-                CudaLaunchAttributeValue {
-                    programmaticStreamSerializationAllowed:
-                        programmatic_stream_serialization_allowed as c_int,
-                },
-            ),
-            CudaLaunchAttribute::ProgrammaticEvent(programmatic_event) => (
-                CudaLaunchAttributeID::ProgrammaticEvent,
-                CudaLaunchAttributeValue {
-                    programmaticEvent: programmatic_event,
-                },
-            ),
-            CudaLaunchAttribute::Priority(priority) => (
-                CudaLaunchAttributeID::Priority,
-                CudaLaunchAttributeValue { priority },
-            ),
-            CudaLaunchAttribute::MemSyncDomainMap(mem_sync_domain_map) => (
-                CudaLaunchAttributeID::MemSyncDomainMap,
-                CudaLaunchAttributeValue {
-                    memSyncDomainMap: mem_sync_domain_map,
-                },
-            ),
-            CudaLaunchAttribute::MemSyncDomain(mem_sync_domain) => (
-                CudaLaunchAttributeID::MemSyncDomain,
-                CudaLaunchAttributeValue {
-                    memSyncDomain: mem_sync_domain,
-                },
-            ),
-        }
-    }
-
-    fn into_raw(self) -> cudaLaunchAttribute {
-        let (id, val) = self.into_id_and_value();
-        cudaLaunchAttribute {
-            id,
-            pad: [c_char::default(); 4],
-            val,
-        }
-    }
-}
-
-pub trait Kernel: Sized {
-    fn get_kernel_raw(self) -> *const c_void;
-}
-
-pub trait KernelLaunch<'a>: Kernel {
-    type Args: Into<KernelArguments<'a>>;
-
-    #[allow(clippy::missing_safety_doc)]
-    unsafe fn launch(
-        self,
-        grid_dim: dim3,
-        block_dim: dim3,
-        args: Self::Args,
-        shared_mem: usize,
-        stream: &CudaStream,
-    ) -> CudaResult<()> {
-        cudaLaunchKernel(
-            self.get_kernel_raw(),
-            grid_dim,
-            block_dim,
-            args.into().as_mut_ptr(),
-            shared_mem,
-            stream.into(),
-        )
-        .wrap()
-    }
-
-    #[allow(clippy::missing_safety_doc)]
-    unsafe fn launch_ex(
-        self,
-        grid_dim: dim3,
-        block_dim: dim3,
-        args: Self::Args,
-        shared_mem: usize,
-        stream: &CudaStream,
-        attributes: &[CudaLaunchAttribute],
-    ) -> CudaResult<()> {
-        let mut attributes = attributes
-            .iter()
-            .map(|attribute| (*attribute).into_raw())
-            .collect::<Vec<_>>();
-        let config = cudaLaunchConfig_t {
-            gridDim: grid_dim,
-            blockDim: block_dim,
-            dynamicSmemBytes: shared_mem,
-            stream: stream.into(),
-            attrs: attributes.as_mut_ptr(),
-            numAttrs: attributes.len() as c_uint,
-        };
-        cudaLaunchKernelExC(
-            &config as *const _,
-            self.get_kernel_raw(),
-            args.into().as_mut_ptr(),
-        )
-        .wrap()
-    }
-}
-
-impl<'a> From<()> for KernelArguments<'a> {
-    fn from(_value: ()) -> Self {
-        KernelArguments::default()
-    }
-}
-
-pub type KernelNoArgs = unsafe extern "C" fn();
-
-impl Kernel for KernelNoArgs {
-    fn get_kernel_raw(self) -> *const c_void {
-        self as *const c_void
-    }
-}
-
-impl<'a> KernelLaunch<'a> for KernelNoArgs {
-    type Args = ();
-}
-
-impl<'a, T> From<(&T,)> for KernelArguments<'a> {
-    fn from(value: (&T,)) -> Self {
-        kernel_args![value.0]
-    }
-}
-
-pub type KernelOneArg<T0> = unsafe extern "C" fn(T0);
-
-impl<T0> Kernel for KernelOneArg<T0> {
-    fn get_kernel_raw(self) -> *const c_void {
-        self as *const c_void
-    }
-}
-
-impl<'a, T0: 'a> KernelLaunch<'a> for KernelOneArg<T0> {
-    type Args = (&'a T0,);
-}
-
-impl<'a, T0, T1> From<(&T0, &T1)> for KernelArguments<'a> {
-    fn from(value: (&T0, &T1)) -> Self {
-        kernel_args![value.0, value.1]
-    }
-}
-
-pub type KernelTwoArgs<T0, T1> = unsafe extern "C" fn(T0, T1);
-
-impl<'a, T0: 'a, T1: 'a> Kernel for KernelTwoArgs<T0, T1> {
-    fn get_kernel_raw(self) -> *const c_void {
-        self as *const c_void
-    }
-}
-
-impl<'a, T0: 'a, T1: 'a> KernelLaunch<'a> for KernelTwoArgs<T0, T1> {
-    type Args = (&'a T0, &'a T1);
-}
-
-impl<'a, T0, T1, T2> From<(&T0, &T1, &T2)> for KernelArguments<'a> {
-    fn from(value: (&T0, &T1, &T2)) -> Self {
-        kernel_args![value.0, value.1, value.2]
-    }
-}
-
-pub type KernelThreeArgs<T0, T1, T2> = unsafe extern "C" fn(T0, T1, T2);
-
-impl<'a, T0: 'a, T1: 'a, T2: 'a> Kernel for KernelThreeArgs<T0, T1, T2> {
-    fn get_kernel_raw(self) -> *const c_void {
-        self as *const c_void
-    }
-}
-
-impl<'a, T0: 'a, T1: 'a, T2: 'a> KernelLaunch<'a> for KernelThreeArgs<T0, T1, T2> {
-    type Args = (&'a T0, &'a T1, &'a T2);
-}
-
-impl<'a, T0, T1, T2, T3> From<(&T0, &T1, &T2, &T3)> for KernelArguments<'a> {
-    fn from(value: (&T0, &T1, &T2, &T3)) -> Self {
-        kernel_args![value.0, value.1, value.2, value.3]
-    }
-}
-
-pub type KernelFourArgs<T0, T1, T2, T3> = unsafe extern "C" fn(T0, T1, T2, T3);
-
-impl<'a, T0: 'a, T1: 'a, T2: 'a, T3: 'a> Kernel for KernelFourArgs<T0, T1, T2, T3> {
-    fn get_kernel_raw(self) -> *const c_void {
-        self as *const c_void
-    }
-}
-
-impl<'a, T0: 'a, T1: 'a, T2: 'a, T3: 'a> KernelLaunch<'a> for KernelFourArgs<T0, T1, T2, T3> {
-    type Args = (&'a T0, &'a T1, &'a T2, &'a T3);
-}
-
-impl<'a, T0, T1, T2, T3, T4> From<(&T0, &T1, &T2, &T3, &T4)> for KernelArguments<'a> {
-    fn from(value: (&T0, &T1, &T2, &T3, &T4)) -> Self {
-        kernel_args![value.0, value.1, value.2, value.3, value.4]
-    }
-}
-
-pub type KernelFiveArgs<T0, T1, T2, T3, T4> = unsafe extern "C" fn(T0, T1, T2, T3, T4);
-
-impl<'a, T0: 'a, T1: 'a, T2: 'a, T3: 'a, T4: 'a> Kernel for KernelFiveArgs<T0, T1, T2, T3, T4> {
-    fn get_kernel_raw(self) -> *const c_void {
-        self as *const c_void
-    }
-}
-
-impl<'a, T0: 'a, T1: 'a, T2: 'a, T3: 'a, T4: 'a> KernelLaunch<'a>
-    for KernelFiveArgs<T0, T1, T2, T3, T4>
-{
-    type Args = (&'a T0, &'a T1, &'a T2, &'a T3, &'a T4);
-}
-
-impl<'a, T0, T1, T2, T3, T4, T5> From<(&T0, &T1, &T2, &T3, &T4, &T5)> for KernelArguments<'a> {
-    fn from(value: (&T0, &T1, &T2, &T3, &T4, &T5)) -> Self {
-        kernel_args![value.0, value.1, value.2, value.3, value.4, value.5]
-    }
-}
-
-pub type KernelSixArgs<T0, T1, T2, T3, T4, T5> = unsafe extern "C" fn(T0, T1, T2, T3, T4, T5);
-
-impl<'a, T0: 'a, T1: 'a, T2: 'a, T3: 'a, T4: 'a, T5: 'a> Kernel
-    for KernelSixArgs<T0, T1, T2, T3, T4, T5>
-{
-    fn get_kernel_raw(self) -> *const c_void {
-        self as *const c_void
-    }
-}
-
-impl<'a, T0: 'a, T1: 'a, T2: 'a, T3: 'a, T4: 'a, T5: 'a> KernelLaunch<'a>
-    for KernelSixArgs<T0, T1, T2, T3, T4, T5>
-{
-    type Args = (&'a T0, &'a T1, &'a T2, &'a T3, &'a T4, &'a T5);
-}
-
-impl<'a, T0, T1, T2, T3, T4, T5, T6> From<(&T0, &T1, &T2, &T3, &T4, &T5, &T6)>
-    for KernelArguments<'a>
-{
-    fn from(value: (&T0, &T1, &T2, &T3, &T4, &T5, &T6)) -> Self {
-        kernel_args![value.0, value.1, value.2, value.3, value.4, value.5, value.6]
-    }
-}
-
-pub type KernelSevenArgs<T0, T1, T2, T3, T4, T5, T6> =
-    unsafe extern "C" fn(T0, T1, T2, T3, T4, T5, T6);
-
-impl<'a, T0: 'a, T1: 'a, T2: 'a, T3: 'a, T4: 'a, T5: 'a, T6: 'a> Kernel
-    for KernelSevenArgs<T0, T1, T2, T3, T4, T5, T6>
-{
-    fn get_kernel_raw(self) -> *const c_void {
-        self as *const c_void
-    }
-}
-
-impl<'a, T0: 'a, T1: 'a, T2: 'a, T3: 'a, T4: 'a, T5: 'a, T6: 'a> KernelLaunch<'a>
-    for KernelSevenArgs<T0, T1, T2, T3, T4, T5, T6>
-{
-    type Args = (&'a T0, &'a T1, &'a T2, &'a T3, &'a T4, &'a T5, &'a T6);
-}
-
-impl<'a, T0, T1, T2, T3, T4, T5, T6, T7> From<(&T0, &T1, &T2, &T3, &T4, &T5, &T6, &T7)>
-    for KernelArguments<'a>
-{
-    fn from(value: (&T0, &T1, &T2, &T3, &T4, &T5, &T6, &T7)) -> Self {
-        kernel_args![value.0, value.1, value.2, value.3, value.4, value.5, value.6, value.7]
-    }
-}
-
-pub type KernelEightArgs<T0, T1, T2, T3, T4, T5, T6, T7> =
-    unsafe extern "C" fn(T0, T1, T2, T3, T4, T5, T6, T7);
-
-impl<'a, T0: 'a, T1: 'a, T2: 'a, T3: 'a, T4: 'a, T5: 'a, T6: 'a, T7: 'a> Kernel
-    for KernelEightArgs<T0, T1, T2, T3, T4, T5, T6, T7>
-{
-    fn get_kernel_raw(self) -> *const c_void {
-        self as *const c_void
-    }
-}
-
-impl<'a, T0: 'a, T1: 'a, T2: 'a, T3: 'a, T4: 'a, T5: 'a, T6: 'a, T7: 'a> KernelLaunch<'a>
-    for KernelEightArgs<T0, T1, T2, T3, T4, T5, T6, T7>
-{
-    type Args = (
-        &'a T0,
-        &'a T1,
-        &'a T2,
-        &'a T3,
-        &'a T4,
-        &'a T5,
-        &'a T6,
-        &'a T7,
-    );
-}
-
-pub struct HostFn<'a> {
-    arc: Arc<Box<dyn Fn() + Send + 'a>>,
-}
-
-impl<'a> HostFn<'a> {
-    pub fn new(func: impl Fn() + Send + 'a) -> Self {
-        Self {
-            arc: Arc::new(Box::new(func) as Box<dyn Fn() + Send>),
-        }
-    }
-}
-
-unsafe extern "C" fn launch_host_fn_callback(data: *mut c_void) {
-    let raw = data as *const Box<dyn Fn() + Send>;
-    let weak = Weak::from_raw(raw);
-    if let Some(func) = weak.upgrade() {
-        func();
-    }
-}
-
-pub fn get_raw_fn_and_data(host_fn: &HostFn) -> (cudaHostFn_t, *mut c_void) {
-    let weak = Arc::downgrade(&host_fn.arc);
-    let raw = weak.into_raw();
-    let data = raw as *mut c_void;
-    (Some(launch_host_fn_callback), data)
-}
-
-pub fn launch_host_fn(stream: &CudaStream, host_fn: &HostFn) -> CudaResult<()> {
-    let (func, data) = get_raw_fn_and_data(host_fn);
-    unsafe { cudaLaunchHostFunc(stream.into(), func, data).wrap() }
-}
-
-#[cfg(test)]
-mod tests {
-    use std::sync::Mutex;
-    use std::thread;
-    use std::time::Duration;
-
-    use serial_test::serial;
-
-    use super::*;
-
-    #[test]
-    #[serial]
-    fn host_fn_add_executes_one_time() {
-        let stream = CudaStream::create().unwrap();
-        let mut a = 0;
-        let add = || {
-            a += 1;
-            thread::sleep(Duration::from_millis(10));
-        };
-        let add_mutex = Mutex::new(add);
-        let add_fn = HostFn::new(move || add_mutex.lock().unwrap()());
-        let sleep_fn = HostFn::new(|| thread::sleep(Duration::from_millis(10)));
-        launch_host_fn(&stream, &add_fn).unwrap();
-        stream.synchronize().unwrap();
-        launch_host_fn(&stream, &sleep_fn).unwrap();
-        launch_host_fn(&stream, &add_fn).unwrap();
-        drop(add_fn);
-        stream.synchronize().unwrap();
-        assert_eq!(a, 1);
-    }
-}
diff --git a/cudart/src/lib.rs b/cudart/src/lib.rs
deleted file mode 100644
index a0a79bc..0000000
--- a/cudart/src/lib.rs
+++ /dev/null
@@ -1,19 +0,0 @@
-#![feature(min_specialization)]
-#![feature(ptr_metadata)]
-#![feature(trusted_len)]
-#![feature(trusted_random_access)]
-
-extern crate core;
-
-pub mod device;
-pub mod error;
-pub mod event;
-pub mod execution;
-pub mod memory;
-pub mod memory_pools;
-pub mod occupancy;
-pub mod peer;
-pub mod result;
-pub mod slice;
-pub mod stream;
-pub mod unified;
diff --git a/cudart/src/memory.rs b/cudart/src/memory.rs
deleted file mode 100644
index ec186b3..0000000
--- a/cudart/src/memory.rs
+++ /dev/null
@@ -1,818 +0,0 @@
-// memory management
-// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html
-
-use core::ffi::c_void;
-use std::alloc::Layout;
-use std::mem::{self, MaybeUninit};
-use std::ops::{Deref, DerefMut};
-
-use bitflags::bitflags;
-
-use cudart_sys::*;
-
-use crate::result::{CudaResult, CudaResultWrap};
-use crate::slice::{AllocationData, CudaSlice, CudaSliceMut, DeviceSlice};
-use crate::stream::CudaStream;
-
-#[repr(transparent)]
-#[derive(Debug)]
-pub struct DeviceAllocation<T>(AllocationData<T>);
-
-impl<T> DeviceAllocation<T> {
-    pub fn alloc(length: usize) -> CudaResult<Self> {
-        let layout = Layout::array::<T>(length).unwrap();
-        let mut dev_ptr = MaybeUninit::<*mut c_void>::uninit();
-        unsafe {
-            cudaMalloc(dev_ptr.as_mut_ptr(), layout.size())
-                .wrap_maybe_uninit(dev_ptr)
-                .map(|ptr| {
-                    Self(AllocationData {
-                        ptr: ptr as *mut T,
-                        len: length,
-                    })
-                })
-        }
-    }
-
-    pub fn free(self) -> CudaResult<()> {
-        unsafe {
-            let ptr = self.0.ptr as *mut c_void;
-            mem::forget(self);
-            cudaFree(ptr).wrap()
-        }
-    }
-
-    /// # Safety
-    ///
-    /// The caller must ensure that the inputs are valid.
-    pub unsafe fn from_raw_parts(ptr: *mut T, len: usize) -> Self {
-        Self(AllocationData { ptr, len })
-    }
-
-    pub fn into_raw_parts(self) -> (*mut T, usize) {
-        let result = (self.0.ptr, self.0.len);
-        mem::forget(self);
-        result
-    }
-}
-
-impl<T> Drop for DeviceAllocation<T> {
-    fn drop(&mut self) {
-        unsafe { cudaFree(self.as_mut_c_void_ptr()).eprint_error_and_backtrace() };
-    }
-}
-
-impl<T> Deref for DeviceAllocation<T> {
-    type Target = DeviceSlice<T>;
-
-    fn deref(&self) -> &Self::Target {
-        Self::Target::from_allocation_data(&self.0)
-    }
-}
-
-impl<T> DerefMut for DeviceAllocation<T> {
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        Self::Target::from_mut_allocation_data(&mut self.0)
-    }
-}
-
-impl<T> AsRef<DeviceSlice<T>> for DeviceAllocation<T> {
-    fn as_ref(&self) -> &DeviceSlice<T> {
-        self.deref()
-    }
-}
-
-impl<T> AsMut<DeviceSlice<T>> for DeviceAllocation<T> {
-    fn as_mut(&mut self) -> &mut DeviceSlice<T> {
-        self.deref_mut()
-    }
-}
-
-impl<T> CudaSlice<T> for DeviceAllocation<T> {
-    unsafe fn as_slice(&self) -> &[T] {
-        self.0.as_slice()
-    }
-}
-
-impl<T> CudaSliceMut<T> for DeviceAllocation<T> {
-    unsafe fn as_mut_slice(&mut self) -> &mut [T] {
-        self.0.as_mut_slice()
-    }
-}
-
-bitflags! {
-    #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
-    pub struct CudaHostAllocFlags: u32 {
-        const DEFAULT = cudart_sys::cudaHostAllocDefault;
-        const PORTABLE = cudart_sys::cudaHostAllocPortable;
-        const MAPPED = cudart_sys::cudaHostAllocMapped;
-        const WRITE_COMBINED = cudart_sys::cudaHostAllocWriteCombined;
-    }
-}
-
-impl Default for CudaHostAllocFlags {
-    fn default() -> Self {
-        Self::DEFAULT
-    }
-}
-
-#[repr(transparent)]
-#[derive(Debug)]
-pub struct HostAllocation<T>(AllocationData<T>);
-
-impl<T> HostAllocation<T> {
-    pub fn alloc(length: usize, flags: CudaHostAllocFlags) -> CudaResult<Self> {
-        let layout = Layout::array::<T>(length).unwrap();
-        let mut ptr = MaybeUninit::<*mut c_void>::uninit();
-        unsafe {
-            cudaHostAlloc(ptr.as_mut_ptr(), layout.size(), flags.bits())
-                .wrap_maybe_uninit(ptr)
-                .map(|ptr| {
-                    Self(AllocationData {
-                        ptr: ptr as *mut T,
-                        len: length,
-                    })
-                })
-        }
-    }
-
-    pub fn free(self) -> CudaResult<()> {
-        unsafe {
-            let ptr = self.0.ptr as *mut c_void;
-            mem::forget(self);
-            cudaFreeHost(ptr).wrap()
-        }
-    }
-
-    /// # Safety
-    ///
-    /// The caller must ensure that the inputs are valid.
-    pub unsafe fn from_raw_parts(ptr: *mut T, len: usize) -> Self {
-        Self(AllocationData { ptr, len })
-    }
-
-    pub fn into_raw_parts(self) -> (*mut T, usize) {
-        let result = (self.0.ptr, self.0.len);
-        mem::forget(self);
-        result
-    }
-}
-
-impl<T> Drop for HostAllocation<T> {
-    fn drop(&mut self) {
-        unsafe { cudaFreeHost(self.0.ptr as *mut c_void).eprint_error_and_backtrace() };
-    }
-}
-
-impl<T> Deref for HostAllocation<T> {
-    type Target = [T];
-    fn deref(&self) -> &Self::Target {
-        unsafe { self.0.as_slice() }
-    }
-}
-
-impl<T> DerefMut for HostAllocation<T> {
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        unsafe { self.0.as_mut_slice() }
-    }
-}
-
-impl<T> AsRef<[T]> for HostAllocation<T> {
-    fn as_ref(&self) -> &[T] {
-        self.deref()
-    }
-}
-
-impl<T> AsMut<[T]> for HostAllocation<T> {
-    fn as_mut(&mut self) -> &mut [T] {
-        self.deref_mut()
-    }
-}
-
-bitflags! {
-    #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
-    pub struct CudaHostRegisterFlags: u32 {
-        const DEFAULT = cudart_sys::cudaHostRegisterDefault;
-        const PORTABLE = cudart_sys::cudaHostRegisterPortable;
-        const MAPPED = cudart_sys::cudaHostRegisterMapped;
-        const IO_MEMORY = cudart_sys::cudaHostRegisterIoMemory;
-        const READ_ONLY = cudart_sys::cudaHostRegisterReadOnly;
-    }
-}
-
-impl Default for CudaHostRegisterFlags {
-    fn default() -> Self {
-        Self::DEFAULT
-    }
-}
-
-#[repr(transparent)]
-#[derive(Debug)]
-pub struct HostRegistration<'a, T>(&'a [T]);
-
-impl<'a, T> HostRegistration<'a, T> {
-    pub fn register(slice: &'a [T], flags: CudaHostRegisterFlags) -> CudaResult<Self> {
-        let length = slice.len();
-        let layout = Layout::array::<T>(length).unwrap();
-        unsafe {
-            cudaHostRegister(
-                slice.as_c_void_ptr() as *mut c_void,
-                layout.size(),
-                flags.bits(),
-            )
-            .wrap_value(Self(slice))
-        }
-    }
-
-    pub fn unregister(self) -> CudaResult<()> {
-        unsafe { cudaHostUnregister(self.0.as_c_void_ptr() as *mut c_void).wrap() }
-    }
-}
-
-impl<T> Drop for HostRegistration<'_, T> {
-    fn drop(&mut self) {
-        unsafe {
-            cudaHostUnregister(self.0.as_c_void_ptr() as *mut c_void).eprint_error_and_backtrace()
-        };
-    }
-}
-
-impl<T> Deref for HostRegistration<'_, T> {
-    type Target = [T];
-
-    fn deref(&self) -> &Self::Target {
-        self.0
-    }
-}
-
-impl<T> AsRef<[T]> for HostRegistration<'_, T> {
-    fn as_ref(&self) -> &[T] {
-        self.0
-    }
-}
-
-#[repr(transparent)]
-#[derive(Debug)]
-pub struct HostRegistrationMut<'a, T>(&'a mut [T]);
-
-impl<'a, T> HostRegistrationMut<'a, T> {
-    pub fn register(slice: &'a mut [T], flags: CudaHostRegisterFlags) -> CudaResult<Self> {
-        let length = slice.len();
-        let layout = Layout::array::<T>(length).unwrap();
-        unsafe {
-            cudaHostRegister(slice.as_mut_c_void_ptr(), layout.size(), flags.bits())
-                .wrap_value(Self(slice))
-        }
-    }
-
-    pub fn unregister(self) -> CudaResult<()> {
-        unsafe { cudaHostUnregister(self.0.as_mut_c_void_ptr()).wrap() }
-    }
-}
-
-impl<T> Drop for HostRegistrationMut<'_, T> {
-    fn drop(&mut self) {
-        unsafe { cudaHostUnregister(self.0.as_mut_c_void_ptr()).eprint_error_and_backtrace() };
-    }
-}
-
-impl<T> Deref for HostRegistrationMut<'_, T> {
-    type Target = [T];
-
-    fn deref(&self) -> &Self::Target {
-        self.0
-    }
-}
-
-impl<T> DerefMut for HostRegistrationMut<'_, T> {
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        self.0
-    }
-}
-
-impl<T> AsRef<[T]> for HostRegistrationMut<'_, T> {
-    fn as_ref(&self) -> &[T] {
-        self.0
-    }
-}
-
-impl<T> AsMut<[T]> for HostRegistrationMut<'_, T> {
-    fn as_mut(&mut self) -> &mut [T] {
-        self.0
-    }
-}
-
-pub fn memory_copy<T>(
-    dst: &mut (impl CudaSliceMut<T> + ?Sized),
-    src: &(impl CudaSlice<T> + ?Sized),
-) -> CudaResult<()> {
-    memory_copy_with_kind(dst, src, CudaMemoryCopyKind::Default)
-}
-
-pub fn memory_copy_with_kind<T>(
-    dst: &mut (impl CudaSliceMut<T> + ?Sized),
-    src: &(impl CudaSlice<T> + ?Sized),
-    kind: CudaMemoryCopyKind,
-) -> CudaResult<()> {
-    unsafe {
-        assert_eq!(
-            dst.len(),
-            src.len(),
-            "dst length and src length must be equal"
-        );
-        let layout = Layout::array::<T>(dst.len()).unwrap();
-        cudaMemcpy(
-            dst.as_mut_c_void_ptr(),
-            src.as_c_void_ptr(),
-            layout.size(),
-            kind,
-        )
-        .wrap()
-    }
-}
-
-pub fn memory_copy_async<T>(
-    dst: &mut (impl CudaSliceMut<T> + ?Sized),
-    src: &(impl CudaSlice<T> + ?Sized),
-    stream: &CudaStream,
-) -> CudaResult<()> {
-    memory_copy_with_kind_async(dst, src, CudaMemoryCopyKind::Default, stream)
-}
-
-pub fn memory_copy_with_kind_async<T>(
-    dst: &mut (impl CudaSliceMut<T> + ?Sized),
-    src: &(impl CudaSlice<T> + ?Sized),
-    kind: CudaMemoryCopyKind,
-    stream: &CudaStream,
-) -> CudaResult<()> {
-    unsafe {
-        assert_eq!(
-            dst.len(),
-            src.len(),
-            "dst length and src length must be equal"
-        );
-        let layout = Layout::array::<T>(dst.len()).unwrap();
-        cudaMemcpyAsync(
-            dst.as_mut_c_void_ptr(),
-            src.as_c_void_ptr(),
-            layout.size(),
-            kind,
-            stream.into(),
-        )
-        .wrap()
-    }
-}
-
-pub fn memory_set(dst: &mut (impl CudaSliceMut<u8> + ?Sized), value: u8) -> CudaResult<()> {
-    unsafe {
-        let layout = Layout::array::<u8>(dst.len()).unwrap();
-        cudaMemset(dst.as_mut_c_void_ptr(), value as i32, layout.size()).wrap()
-    }
-}
-
-pub fn memory_set_async(
-    dst: &mut (impl CudaSliceMut<u8> + ?Sized),
-    value: u8,
-    stream: &CudaStream,
-) -> CudaResult<()> {
-    unsafe {
-        let layout = Layout::array::<u8>(dst.len()).unwrap();
-        cudaMemsetAsync(
-            dst.as_mut_c_void_ptr(),
-            value as i32,
-            layout.size(),
-            stream.into(),
-        )
-        .wrap()
-    }
-}
-
-pub fn memory_get_info() -> CudaResult<(usize, usize)> {
-    let mut free = MaybeUninit::<usize>::uninit();
-    let mut total = MaybeUninit::<usize>::uninit();
-    unsafe {
-        let error = cudaMemGetInfo(free.as_mut_ptr(), total.as_mut_ptr());
-        if error == CudaError::Success {
-            Ok((free.assume_init(), total.assume_init()))
-        } else {
-            Err(error)
-        }
-    }
-}
-
-#[derive(Copy, Clone, Default, Debug, PartialEq, Eq)]
-pub struct HostAllocator {
-    flags: CudaHostAllocFlags,
-}
-
-impl HostAllocator {
-    pub fn new(flags: CudaHostAllocFlags) -> Self {
-        Self { flags }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use serial_test::serial;
-
-    use super::*;
-
-    const LENGTH: usize = 1024;
-
-    #[test]
-    #[serial]
-    fn device_allocation_alloc_is_ok() {
-        let result = DeviceAllocation::<u32>::alloc(LENGTH);
-        assert!(result.is_ok());
-    }
-
-    #[test]
-    #[serial]
-    fn device_allocation_free_is_ok() {
-        let allocation = DeviceAllocation::<u32>::alloc(LENGTH).unwrap();
-        let result = allocation.free();
-        assert_eq!(result, Ok(()));
-    }
-
-    #[test]
-    #[serial]
-    fn device_allocation_alloc_len_eq_length() {
-        let allocation = DeviceAllocation::<u32>::alloc(LENGTH).unwrap();
-        assert_eq!(allocation.len(), LENGTH);
-    }
-
-    #[test]
-    #[serial]
-    fn device_allocation_alloc_is_empty_is_false() {
-        let allocation = DeviceAllocation::<u32>::alloc(LENGTH).unwrap();
-        assert!(!allocation.is_empty());
-    }
-
-    #[test]
-    #[serial]
-    fn device_allocation_deref_len_eq_length() {
-        let allocation = DeviceAllocation::<u32>::alloc(LENGTH).unwrap();
-        let slice = allocation.deref();
-        assert_eq!(slice.len(), LENGTH);
-    }
-
-    #[test]
-    #[serial]
-    fn device_allocation_deref_mut_len_eq_length() {
-        let mut allocation = DeviceAllocation::<u32>::alloc(LENGTH).unwrap();
-        let slice = allocation.deref_mut();
-        assert_eq!(slice.len(), LENGTH);
-    }
-
-    #[test]
-    #[serial]
-    fn device_allocation_slice_index_len_eq_length() {
-        let allocation = DeviceAllocation::<u32>::alloc(LENGTH).unwrap();
-        let slice = &allocation[..];
-        assert_eq!(slice.len(), LENGTH);
-    }
-
-    #[test]
-    #[serial]
-    fn device_allocation_mut_slice_index_mut_len_eq_length() {
-        let mut allocation = DeviceAllocation::<u32>::alloc(LENGTH).unwrap();
-        let slice = &mut allocation[..];
-        assert_eq!(slice.len(), LENGTH);
-    }
-
-    #[test]
-    #[serial]
-    fn host_allocation_alloc_is_ok() {
-        let result = HostAllocation::<u32>::alloc(LENGTH, CudaHostAllocFlags::DEFAULT);
-        assert!(result.is_ok());
-    }
-
-    #[test]
-    #[serial]
-    fn host_allocation_free_is_ok() {
-        let allocation = HostAllocation::<u32>::alloc(LENGTH, CudaHostAllocFlags::DEFAULT).unwrap();
-        let result = allocation.free();
-        assert_eq!(result, Ok(()));
-    }
-
-    #[test]
-    #[serial]
-    fn host_allocation_alloc_len_eq_length() {
-        let allocation = HostAllocation::<u32>::alloc(LENGTH, CudaHostAllocFlags::DEFAULT).unwrap();
-        assert_eq!(allocation.len(), LENGTH);
-    }
-
-    #[test]
-    #[serial]
-    fn host_allocation_alloc_is_empty_is_false() {
-        let allocation = HostAllocation::<u32>::alloc(LENGTH, CudaHostAllocFlags::DEFAULT).unwrap();
-        assert!(!allocation.is_empty());
-    }
-
-    #[test]
-    #[serial]
-    fn host_allocation_deref_len_eq_length() {
-        let allocation = HostAllocation::<u32>::alloc(LENGTH, CudaHostAllocFlags::DEFAULT).unwrap();
-        let slice = allocation.deref();
-        assert_eq!(slice.len(), LENGTH);
-    }
-
-    #[test]
-    #[serial]
-    fn host_allocation_deref_mut_len_eq_length() {
-        let mut allocation =
-            HostAllocation::<u32>::alloc(LENGTH, CudaHostAllocFlags::DEFAULT).unwrap();
-        let slice = allocation.deref_mut();
-        assert_eq!(slice.len(), LENGTH);
-    }
-
-    #[test]
-    #[serial]
-    fn host_allocation_index_len_eq_length() {
-        let allocation = HostAllocation::<u32>::alloc(LENGTH, CudaHostAllocFlags::DEFAULT).unwrap();
-        let slice = &allocation[..];
-        assert_eq!(slice.len(), LENGTH);
-    }
-
-    #[test]
-    #[serial]
-    fn host_allocation_index_mut_len_eq_length() {
-        let mut allocation =
-            HostAllocation::<u32>::alloc(LENGTH, CudaHostAllocFlags::DEFAULT).unwrap();
-        let slice = &mut allocation[..];
-        assert_eq!(slice.len(), LENGTH);
-    }
-
-    #[test]
-    #[serial]
-    fn host_allocation_deref_ptrs_are_equal() {
-        let allocation = HostAllocation::<u32>::alloc(LENGTH, CudaHostAllocFlags::DEFAULT).unwrap();
-        let ptr = allocation.deref().as_ptr();
-        assert_eq!(allocation.as_ptr(), ptr);
-    }
-
-    #[test]
-    #[serial]
-    fn host_allocation_deref_mut_ptrs_are_equal() {
-        let mut allocation =
-            HostAllocation::<u32>::alloc(LENGTH, CudaHostAllocFlags::DEFAULT).unwrap();
-        let ptr = allocation.deref_mut().as_mut_ptr();
-        assert_eq!(allocation.as_mut_ptr(), ptr);
-    }
-
-    #[test]
-    #[serial]
-    fn host_registration_register_is_ok() {
-        let values = [0u32; LENGTH];
-        let result = HostRegistration::<u32>::register(&values, CudaHostRegisterFlags::DEFAULT);
-        assert!(result.is_ok());
-    }
-
-    #[test]
-    #[serial]
-    fn host_registration_register_empty_error_invalid_value() {
-        let values = [0u32; 0];
-        let result = HostRegistration::<u32>::register(&values, CudaHostRegisterFlags::DEFAULT);
-        assert_eq!(result.err(), Some(CudaError::ErrorInvalidValue));
-    }
-
-    #[test]
-    #[serial]
-    fn host_registration_unregister_is_ok() {
-        let values = [0u32; LENGTH];
-        let registration =
-            HostRegistration::<u32>::register(&values, CudaHostRegisterFlags::DEFAULT).unwrap();
-        let result = registration.unregister();
-        assert_eq!(result, Ok(()));
-    }
-
-    #[test]
-    #[serial]
-    fn host_registration_register_len_eq_length() {
-        let values = [0u32; LENGTH];
-        let registration =
-            HostRegistration::<u32>::register(&values, CudaHostRegisterFlags::DEFAULT).unwrap();
-        assert_eq!(registration.len(), LENGTH);
-    }
-
-    #[test]
-    #[serial]
-    fn host_registration_register_is_empty_is_false() {
-        let values = [0u32; LENGTH];
-        let registration =
-            HostRegistration::<u32>::register(&values, CudaHostRegisterFlags::DEFAULT).unwrap();
-        assert!(!registration.is_empty());
-    }
-
-    #[test]
-    #[serial]
-    fn host_registration_deref_len_eq_length() {
-        let values = [0u32; LENGTH];
-        let registration =
-            HostRegistration::<u32>::register(&values, CudaHostRegisterFlags::DEFAULT).unwrap();
-        let slice = registration.deref();
-        assert_eq!(slice.len(), LENGTH);
-    }
-
-    #[test]
-    #[serial]
-    fn host_registration_mut_deref_mut_len_eq_length() {
-        let mut values = [0u32; LENGTH];
-        let mut registration =
-            HostRegistrationMut::<u32>::register(&mut values, CudaHostRegisterFlags::DEFAULT)
-                .unwrap();
-        let slice = registration.deref_mut();
-        assert_eq!(slice.len(), LENGTH);
-    }
-
-    #[test]
-    #[serial]
-    fn host_registration_index_len_eq_length() {
-        let values = [0u32; LENGTH];
-        let registration =
-            HostRegistration::<u32>::register(&values, CudaHostRegisterFlags::DEFAULT).unwrap();
-        let slice = &registration[..];
-        assert_eq!(slice.len(), LENGTH);
-    }
-
-    #[test]
-    #[serial]
-    fn host_registration_mut_index_mut_len_eq_length() {
-        let mut values = [0u32; LENGTH];
-        let mut registration =
-            HostRegistrationMut::<u32>::register(&mut values, CudaHostRegisterFlags::DEFAULT)
-                .unwrap();
-        let slice = &mut registration[..];
-        assert_eq!(slice.len(), LENGTH);
-    }
-
-    #[test]
-    #[serial]
-    fn host_registration_deref_ptrs_are_equal() {
-        let values = [0u32; LENGTH];
-        let registration =
-            HostRegistration::<u32>::register(&values, CudaHostRegisterFlags::DEFAULT).unwrap();
-        let ptr = registration.deref().as_ptr();
-        assert_eq!(registration.as_ptr(), ptr);
-    }
-
-    #[test]
-    #[serial]
-    fn host_registration_mut_deref_mut_ptrs_are_equal() {
-        let mut values = [0u32; LENGTH];
-        let mut registration =
-            HostRegistrationMut::<u32>::register(&mut values, CudaHostRegisterFlags::DEFAULT)
-                .unwrap();
-        let ptr = registration.deref_mut().as_mut_ptr();
-        assert_eq!(registration.as_mut_ptr(), ptr);
-    }
-
-    #[test]
-    #[serial]
-    fn memory_copy_device_slice_to_device_slice() {
-        let values1 = [42u32; LENGTH];
-        let mut values2 = [0u32; LENGTH];
-        let mut a1 = DeviceAllocation::<u32>::alloc(LENGTH).unwrap();
-        let mut a2 = DeviceAllocation::<u32>::alloc(LENGTH).unwrap();
-        let a1_slice = a1.deref_mut();
-        let a2_slice = a2.deref_mut();
-        memory_copy(a1_slice, &values1).unwrap();
-        memory_copy(a2_slice, a1_slice).unwrap();
-        memory_copy(&mut values2, a2_slice).unwrap();
-        assert!(values2.iter().all(|&x| x == 42u32));
-    }
-
-    #[test]
-    #[serial]
-    fn memory_copy_device_allocation_to_device_allocation() {
-        let values1 = [42u32; LENGTH];
-        let mut values2 = [0u32; LENGTH];
-        let mut a1 = DeviceAllocation::<u32>::alloc(LENGTH).unwrap();
-        let mut a2 = DeviceAllocation::<u32>::alloc(LENGTH).unwrap();
-        memory_copy(&mut a1, &values1).unwrap();
-        memory_copy(&mut a2, &a1).unwrap();
-        memory_copy(&mut values2, &a2).unwrap();
-        assert!(values2.iter().all(|&x| x == 42u32));
-    }
-
-    #[test]
-    #[serial]
-    fn memory_copy_host_allocation_to_host_allocation() {
-        let mut a1 = HostAllocation::<u32>::alloc(LENGTH, CudaHostAllocFlags::DEFAULT).unwrap();
-        let mut a2 = HostAllocation::<u32>::alloc(LENGTH, CudaHostAllocFlags::DEFAULT).unwrap();
-        a2.iter_mut().for_each(|x| {
-            *x = 42u32;
-        });
-        memory_copy(&mut a1, &a2).unwrap();
-        assert!(a1.iter().all(|&x| x == 42u32));
-    }
-
-    #[test]
-    #[serial]
-    fn memory_copy_host_registration_to_host_registration_mut() {
-        let mut values1 = [0u32; LENGTH];
-        let values2 = [42u32; LENGTH];
-        let mut r1 =
-            HostRegistrationMut::register(&mut values1, CudaHostRegisterFlags::DEFAULT).unwrap();
-        let r2 = HostRegistration::register(&values2, CudaHostRegisterFlags::DEFAULT).unwrap();
-        memory_copy(&mut r1, &r2).unwrap();
-        assert!(r1.iter().all(|&x| x == 42u32));
-    }
-
-    #[test]
-    #[serial]
-    fn memory_copy_slice_to_slice() {
-        let mut values1 = [0u32; LENGTH];
-        let values2 = [42u32; LENGTH];
-        memory_copy(&mut values1, &values2).unwrap();
-        assert!(values1.iter().all(|&x| x == 42u32));
-    }
-
-    #[test]
-    #[serial]
-    fn memory_copy_async_device_allocation_to_device_allocation() {
-        let stream = CudaStream::create().unwrap();
-        let values1 = [42u32; LENGTH];
-        let mut values2 = [0u32; LENGTH];
-        let mut a1 = DeviceAllocation::<u32>::alloc(LENGTH).unwrap();
-        let mut a2 = DeviceAllocation::<u32>::alloc(LENGTH).unwrap();
-        memory_copy_async(&mut a1, &values1, &stream).unwrap();
-        memory_copy_async(&mut a2, &a1, &stream).unwrap();
-        memory_copy_async(&mut values2, &a2, &stream).unwrap();
-        stream.synchronize().unwrap();
-        assert!(values2.iter().all(|&x| x == 42u32));
-    }
-
-    #[test]
-    #[serial]
-    fn memory_copy_async_host_allocation_to_host_allocation() {
-        let stream = CudaStream::create().unwrap();
-        let mut a1 = HostAllocation::<u32>::alloc(LENGTH, CudaHostAllocFlags::DEFAULT).unwrap();
-        let mut a2 = HostAllocation::<u32>::alloc(LENGTH, CudaHostAllocFlags::DEFAULT).unwrap();
-        a2.iter_mut().for_each(|x| {
-            *x = 42u32;
-        });
-        memory_copy_async(&mut a1, &a2, &stream).unwrap();
-        stream.synchronize().unwrap();
-        assert!(a1.iter().all(|&x| x == 42u32));
-    }
-
-    #[test]
-    #[serial]
-    fn memory_copy_async_host_registration_to_host_registration_mut() {
-        let stream = CudaStream::create().unwrap();
-        let mut values1 = [0u32; LENGTH];
-        let values2 = [42u32; LENGTH];
-        let mut r1 =
-            HostRegistrationMut::register(&mut values1, CudaHostRegisterFlags::DEFAULT).unwrap();
-        let r2 = HostRegistration::register(&values2, CudaHostRegisterFlags::DEFAULT).unwrap();
-        memory_copy_async(&mut r1, &r2, &stream).unwrap();
-        stream.synchronize().unwrap();
-        assert!(r1.iter().all(|&x| x == 42u32));
-    }
-
-    #[test]
-    #[serial]
-    fn memory_copy_async_slice_to_slice() {
-        let stream = CudaStream::create().unwrap();
-        let mut values1 = [0u32; LENGTH];
-        let values2 = [42u32; LENGTH];
-        memory_copy_async(&mut values1, &values2, &stream).unwrap();
-        stream.synchronize().unwrap();
-        assert!(values1.iter().all(|&x| x == 42u32));
-    }
-
-    #[test]
-    #[serial]
-    fn memory_set_is_correct() {
-        let mut h_values =
-            HostAllocation::<u8>::alloc(LENGTH, CudaHostAllocFlags::DEFAULT).unwrap();
-        let mut d_values = DeviceAllocation::<u8>::alloc(LENGTH).unwrap();
-        memory_set(&mut d_values, 42u8).unwrap();
-        memory_copy(&mut h_values, &d_values).unwrap();
-        assert!(h_values.iter().all(|&x| x == 42u8));
-    }
-
-    #[test]
-    #[serial]
-    fn memory_set_async_is_correct() {
-        let stream = CudaStream::create().unwrap();
-        let mut h_values =
-            HostAllocation::<u8>::alloc(LENGTH, CudaHostAllocFlags::DEFAULT).unwrap();
-        let mut d_values = DeviceAllocation::<u8>::alloc(LENGTH).unwrap();
-        memory_set_async(&mut d_values, 42u8, &stream).unwrap();
-        memory_copy_async(&mut h_values, &d_values, &stream).unwrap();
-        stream.synchronize().unwrap();
-        assert!(h_values.iter().all(|&x| x == 42u8));
-    }
-
-    #[test]
-    #[serial]
-    fn memory_get_info_is_correct() {
-        let result = memory_get_info();
-        assert!(result.is_ok());
-        let (free, total) = result.unwrap();
-        assert!(total > 0);
-        assert!(free <= total);
-    }
-}
diff --git a/cudart/src/memory_pools.rs b/cudart/src/memory_pools.rs
deleted file mode 100644
index ba83eff..0000000
--- a/cudart/src/memory_pools.rs
+++ /dev/null
@@ -1,533 +0,0 @@
-// Stream Ordered Memory Allocator
-// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html
-
-use core::ffi::c_void;
-use std::alloc::Layout;
-use std::mem;
-use std::mem::MaybeUninit;
-use std::ops::{Deref, DerefMut};
-
-use cudart_sys::*;
-
-use crate::result::{CudaResult, CudaResultWrap};
-use crate::slice::{AllocationData, CudaSlice, CudaSliceMut, DeviceSlice};
-use crate::stream::CudaStream;
-
-#[repr(i32)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub enum CudaMemPoolAttributeI32 {
-    ReuseFollowEventDependencies = CudaMemPoolAttribute::ReuseFollowEventDependencies as i32,
-    ReuseAllowOpportunistic = CudaMemPoolAttribute::ReuseAllowOpportunistic as i32,
-    ReuseAllowInternalDependencies = CudaMemPoolAttribute::ReuseAllowInternalDependencies as i32,
-}
-
-impl From<CudaMemPoolAttributeI32> for i32 {
-    fn from(attribute: CudaMemPoolAttributeI32) -> Self {
-        attribute as i32
-    }
-}
-
-#[repr(i32)]
-#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)]
-pub enum CudaMemPoolAttributeU64 {
-    AttrReleaseThreshold = CudaMemPoolAttribute::AttrReleaseThreshold as i32,
-    AttrReservedMemCurrent = CudaMemPoolAttribute::AttrReservedMemCurrent as i32,
-    AttrReservedMemHigh = CudaMemPoolAttribute::AttrReservedMemHigh as i32,
-    AttrUsedMemCurrent = CudaMemPoolAttribute::AttrUsedMemCurrent as i32,
-    AttrUsedMemHigh = CudaMemPoolAttribute::AttrUsedMemHigh as i32,
-}
-
-impl From<CudaMemPoolAttributeU64> for i32 {
-    fn from(attribute: CudaMemPoolAttributeU64) -> Self {
-        attribute as i32
-    }
-}
-
-#[repr(transparent)]
-#[derive(Debug)]
-pub struct CudaMemPool {
-    handle: cudaMemPool_t,
-}
-
-impl CudaMemPool {
-    pub(crate) fn from_handle(handle: cudaMemPool_t) -> Self {
-        Self { handle }
-    }
-
-    pub fn get_access(&self, location: CudaMemLocation) -> CudaResult<CudaMemAccessFlags> {
-        let mut result = MaybeUninit::<CudaMemAccessFlags>::uninit();
-        unsafe {
-            cudaMemPoolGetAccess(
-                result.as_mut_ptr(),
-                self.handle,
-                &location as *const CudaMemLocation as *mut CudaMemLocation,
-            )
-            .wrap_maybe_uninit(result)
-        }
-    }
-
-    pub fn get_attribute_value<T: Into<i32>, U>(&self, attribute: T) -> CudaResult<U> {
-        let mut value = MaybeUninit::<U>::uninit();
-        unsafe {
-            cudaMemPoolGetAttribute(
-                self.handle,
-                mem::transmute(attribute.into()),
-                value.as_mut_ptr() as *mut c_void,
-            )
-            .wrap_maybe_uninit(value)
-        }
-    }
-
-    pub fn set_access(&self, descriptors: &[CudaMemAccessDesc]) -> CudaResult<()> {
-        unsafe { cudaMemPoolSetAccess(self.handle, descriptors.as_ptr(), descriptors.len()).wrap() }
-    }
-
-    pub fn set_attribute_value<T: Into<i32>, U>(&self, attribute: T, value: U) -> CudaResult<()> {
-        unsafe {
-            cudaMemPoolSetAttribute(
-                self.handle,
-                mem::transmute(attribute.into()),
-                &value as *const _ as *mut c_void,
-            )
-            .wrap()
-        }
-    }
-
-    pub fn trim_to(&self, min_bytes_to_keep: usize) -> CudaResult<()> {
-        unsafe { cudaMemPoolTrimTo(self.handle, min_bytes_to_keep).wrap() }
-    }
-}
-
-impl From<&CudaMemPool> for cudaMemPool_t {
-    fn from(pool: &CudaMemPool) -> Self {
-        pool.handle
-    }
-}
-
-pub trait AttributeHandler<T> {
-    type Value;
-    fn get_attribute(&self, attribute: T) -> CudaResult<Self::Value>;
-    fn set_attribute(&self, attribute: T, value: Self::Value) -> CudaResult<()>;
-}
-
-impl AttributeHandler<CudaMemPoolAttributeI32> for CudaMemPool {
-    type Value = i32;
-
-    fn get_attribute(&self, attribute: CudaMemPoolAttributeI32) -> CudaResult<Self::Value> {
-        self.get_attribute_value(attribute)
-    }
-
-    fn set_attribute(
-        &self,
-        attribute: CudaMemPoolAttributeI32,
-        value: Self::Value,
-    ) -> CudaResult<()> {
-        self.set_attribute_value(attribute, value)
-    }
-}
-
-impl AttributeHandler<CudaMemPoolAttributeU64> for CudaMemPool {
-    type Value = u64;
-
-    fn get_attribute(&self, attribute: CudaMemPoolAttributeU64) -> CudaResult<Self::Value> {
-        self.get_attribute_value(attribute)
-    }
-
-    fn set_attribute(
-        &self,
-        attribute: CudaMemPoolAttributeU64,
-        value: Self::Value,
-    ) -> CudaResult<()> {
-        self.set_attribute_value(attribute, value)
-    }
-}
-
-#[repr(transparent)]
-#[derive(Debug)]
-pub struct CudaOwnedMemPool {
-    pool: CudaMemPool,
-}
-
-impl CudaOwnedMemPool {
-    fn from_handle(handle: cudaMemPool_t) -> Self {
-        Self {
-            pool: CudaMemPool::from_handle(handle),
-        }
-    }
-
-    pub fn create(properties: &CudaMemPoolProperties) -> CudaResult<Self> {
-        let mut handle = MaybeUninit::<cudaMemPool_t>::uninit();
-        unsafe {
-            cudaMemPoolCreate(handle.as_mut_ptr(), properties)
-                .wrap_maybe_uninit(handle)
-                .map(Self::from_handle)
-        }
-    }
-
-    pub fn create_for_device(device_id: i32) -> CudaResult<Self> {
-        let props = CudaMemPoolProperties {
-            allocType: CudaMemAllocationType::Pinned,
-            handleTypes: CudaMemAllocationHandleType::None,
-            location: CudaMemLocation {
-                type_: CudaMemLocationType::Device,
-                id: device_id,
-            },
-            ..Default::default()
-        };
-        Self::create(&props)
-    }
-
-    pub fn destroy(self) -> CudaResult<()> {
-        let pool = self.pool.handle;
-        mem::forget(self);
-        unsafe { cudaMemPoolDestroy(pool).wrap() }
-    }
-}
-
-impl Drop for CudaOwnedMemPool {
-    fn drop(&mut self) {
-        unsafe { cudaMemPoolDestroy(self.pool.handle).eprint_error_and_backtrace() };
-    }
-}
-
-impl Deref for CudaOwnedMemPool {
-    type Target = CudaMemPool;
-
-    fn deref(&self) -> &Self::Target {
-        &self.pool
-    }
-}
-
-#[derive(Debug)]
-pub struct DevicePoolAllocation<'a, T> {
-    data: AllocationData<T>,
-    stream: &'a CudaStream,
-}
-
-impl<'a, T> DevicePoolAllocation<'a, T> {
-    pub fn alloc_async(length: usize, stream: &'a CudaStream) -> CudaResult<Self> {
-        let layout = Layout::array::<T>(length).unwrap();
-        let mut dev_ptr = MaybeUninit::<*mut c_void>::uninit();
-        unsafe {
-            cudaMallocAsync(dev_ptr.as_mut_ptr(), layout.size(), stream.into())
-                .wrap_maybe_uninit(dev_ptr)
-                .map(|ptr| Self {
-                    data: AllocationData {
-                        ptr: ptr as *mut T,
-                        len: length,
-                    },
-                    stream,
-                })
-        }
-    }
-
-    pub fn alloc_from_pool_async(
-        length: usize,
-        pool: &CudaMemPool,
-        stream: &'a CudaStream,
-    ) -> CudaResult<Self> {
-        let layout = Layout::array::<T>(length).unwrap();
-        let mut dev_ptr = MaybeUninit::<*mut c_void>::uninit();
-        unsafe {
-            cudaMallocFromPoolAsync(
-                dev_ptr.as_mut_ptr(),
-                layout.size(),
-                pool.handle,
-                stream.into(),
-            )
-            .wrap_maybe_uninit(dev_ptr)
-            .map(|ptr| Self {
-                data: AllocationData {
-                    ptr: ptr as *mut T,
-                    len: length,
-                },
-                stream,
-            })
-        }
-    }
-
-    pub fn free_async(self, stream: &CudaStream) -> CudaResult<()> {
-        unsafe {
-            let ptr = self.as_c_void_ptr() as *mut c_void;
-            mem::forget(self);
-            cudaFreeAsync(ptr, stream.into()).wrap()
-        }
-    }
-
-    pub fn swap_stream(self, stream: &CudaStream) -> DevicePoolAllocation<T> {
-        let data = AllocationData {
-            ptr: self.data.ptr,
-            len: self.data.len,
-        };
-        mem::forget(self);
-        DevicePoolAllocation { data, stream }
-    }
-
-    /// # Safety
-    ///
-    /// The caller must ensure that the inputs are valid.
-    pub unsafe fn from_raw_parts(ptr: *mut T, len: usize, stream: &'a CudaStream) -> Self {
-        Self {
-            data: AllocationData { ptr, len },
-            stream,
-        }
-    }
-
-    pub fn into_raw_parts(self) -> (*mut T, usize, &'a CudaStream) {
-        let result = (self.data.ptr, self.data.len, self.stream);
-        mem::forget(self);
-        result
-    }
-}
-
-impl<'a, T> Drop for DevicePoolAllocation<'a, T> {
-    fn drop(&mut self) {
-        unsafe {
-            cudaFreeAsync(self.as_mut_c_void_ptr(), self.stream.into()).eprint_error_and_backtrace()
-        };
-    }
-}
-
-impl<'a, T> Deref for DevicePoolAllocation<'a, T> {
-    type Target = DeviceSlice<T>;
-
-    fn deref(&self) -> &Self::Target {
-        Self::Target::from_allocation_data(&self.data)
-    }
-}
-
-impl<'a, T> DerefMut for DevicePoolAllocation<'a, T> {
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        Self::Target::from_mut_allocation_data(&mut self.data)
-    }
-}
-
-impl<'a, T> CudaSlice<T> for DevicePoolAllocation<'a, T> {
-    unsafe fn as_slice(&self) -> &[T] {
-        self.data.as_slice()
-    }
-}
-
-impl<'a, T> CudaSliceMut<T> for DevicePoolAllocation<'a, T> {
-    unsafe fn as_mut_slice(&mut self) -> &mut [T] {
-        self.data.as_mut_slice()
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use serial_test::serial;
-
-    use crate::memory::memory_copy_async;
-
-    use super::*;
-
-    const LENGTH: usize = 1024;
-
-    #[test]
-    #[serial]
-    fn mem_pool_for_device_is_ok() {
-        let result = CudaOwnedMemPool::create_for_device(0);
-        assert!(result.is_ok());
-    }
-
-    #[test]
-    #[serial]
-    fn mem_pool_destroy_is_ok() {
-        let pool = CudaOwnedMemPool::create_for_device(0).unwrap();
-        let result = pool.destroy();
-        assert!(result.is_ok());
-    }
-
-    #[test]
-    #[serial]
-    fn device_pool_allocation_alloc_async_is_ok() {
-        let stream = CudaStream::create().unwrap();
-        let result = DevicePoolAllocation::<u32>::alloc_async(LENGTH, &stream);
-        assert!(result.is_ok());
-    }
-
-    #[test]
-    #[serial]
-    fn device_pool_allocation_alloc_from_pool_async_is_ok() {
-        let pool = CudaOwnedMemPool::create_for_device(0).unwrap();
-        let stream = CudaStream::create().unwrap();
-        let result = DevicePoolAllocation::<u32>::alloc_from_pool_async(LENGTH, &pool, &stream);
-        assert!(result.is_ok());
-    }
-
-    #[test]
-    #[serial]
-    fn device_pool_allocation_free_is_ok() {
-        let stream = CudaStream::create().unwrap();
-        let allocation = DevicePoolAllocation::<u32>::alloc_async(LENGTH, &stream).unwrap();
-        let result = allocation.free_async(&stream);
-        stream.synchronize().unwrap();
-        assert_eq!(result, Ok(()));
-    }
-
-    #[test]
-    #[serial]
-    fn device_pool_allocation_alloc_async_len_eq_length() {
-        let stream = CudaStream::create().unwrap();
-        let allocation = DevicePoolAllocation::<u32>::alloc_async(LENGTH, &stream).unwrap();
-        assert_eq!(allocation.len(), LENGTH);
-    }
-
-    #[test]
-    #[serial]
-    fn device_pool_allocation_alloc_async_is_empty_is_false() {
-        let stream = CudaStream::create().unwrap();
-        let allocation = DevicePoolAllocation::<u32>::alloc_async(LENGTH, &stream).unwrap();
-        assert!(!allocation.is_empty());
-    }
-
-    #[test]
-    #[serial]
-    fn device_pool_allocation_deref_len_eq_length() {
-        let stream = CudaStream::create().unwrap();
-        let allocation = DevicePoolAllocation::<u32>::alloc_async(LENGTH, &stream).unwrap();
-        let slice = allocation.deref();
-        assert_eq!(slice.len(), LENGTH);
-    }
-
-    #[test]
-    #[serial]
-    fn device_pool_allocation_deref_mut_len_eq_length() {
-        let stream = CudaStream::create().unwrap();
-        let mut allocation = DevicePoolAllocation::<u32>::alloc_async(LENGTH, &stream).unwrap();
-        let slice = allocation.deref_mut();
-        assert_eq!(slice.len(), LENGTH);
-    }
-
-    #[test]
-    #[serial]
-    fn device_pool_allocation_slice_index_len_eq_length() {
-        let stream = CudaStream::create().unwrap();
-        let allocation = DevicePoolAllocation::<u32>::alloc_async(LENGTH, &stream).unwrap();
-        let slice = &allocation[..];
-        assert_eq!(slice.len(), LENGTH);
-    }
-
-    #[test]
-    #[serial]
-    fn device_pool_allocation_mut_slice_index_mut_len_eq_length() {
-        let stream = CudaStream::create().unwrap();
-        let mut allocation = DevicePoolAllocation::<u32>::alloc_async(LENGTH, &stream).unwrap();
-        let slice = &mut allocation[..];
-        assert_eq!(slice.len(), LENGTH);
-    }
-
-    #[test]
-    #[serial]
-    fn device_pool_allocation_drop_frees_memory() {
-        let pool = CudaOwnedMemPool::create_for_device(0).unwrap();
-        let stream = CudaStream::create().unwrap();
-        let allocation =
-            DevicePoolAllocation::<u32>::alloc_from_pool_async(LENGTH, &pool, &stream).unwrap();
-        drop(allocation);
-        let used = pool
-            .get_attribute(CudaMemPoolAttributeU64::AttrUsedMemCurrent)
-            .unwrap() as usize;
-        assert_eq!(used, 0);
-    }
-
-    #[test]
-    #[serial]
-    fn device_pool_allocation_swap_stream() {
-        let pool = CudaOwnedMemPool::create_for_device(0).unwrap();
-        let stream1 = CudaStream::create().unwrap();
-        let allocation =
-            DevicePoolAllocation::<u32>::alloc_from_pool_async(LENGTH, &pool, &stream1).unwrap();
-        let stream2 = CudaStream::create().unwrap();
-        let allocation = allocation.swap_stream(&stream2);
-        drop(stream1);
-        drop(allocation);
-        let used = pool
-            .get_attribute(CudaMemPoolAttributeU64::AttrUsedMemCurrent)
-            .unwrap() as usize;
-        assert_eq!(used, 0);
-    }
-
-    #[test]
-    #[serial]
-    fn memory_copy_device_pool_allocation_to_device_pool_allocation() {
-        let values1 = [42u32; LENGTH];
-        let mut values2 = [0u32; LENGTH];
-        let stream = CudaStream::create().unwrap();
-        let mut a1 = DevicePoolAllocation::<u32>::alloc_async(LENGTH, &stream).unwrap();
-        let mut a2 = DevicePoolAllocation::<u32>::alloc_async(LENGTH, &stream).unwrap();
-        memory_copy_async(&mut a1, &values1, &stream).unwrap();
-        memory_copy_async(&mut a2, &a1, &stream).unwrap();
-        memory_copy_async(&mut values2, &a2, &stream).unwrap();
-        stream.synchronize().unwrap();
-        assert!(values2.iter().all(|&x| x == 42u32));
-    }
-
-    #[test]
-    #[serial]
-    fn get_attribute_i32() {
-        let pool = CudaOwnedMemPool::create_for_device(0).unwrap();
-        let result = pool.get_attribute(CudaMemPoolAttributeI32::ReuseAllowOpportunistic);
-        assert_eq!(result, Ok(1));
-    }
-
-    #[test]
-    #[serial]
-    fn get_attribute_u64() {
-        let pool = CudaOwnedMemPool::create_for_device(0).unwrap();
-        let result = pool.get_attribute(CudaMemPoolAttributeU64::AttrReleaseThreshold);
-        assert_eq!(result, Ok(0));
-    }
-
-    #[test]
-    #[serial]
-    fn set_attribute_i32() {
-        let pool = CudaOwnedMemPool::create_for_device(0).unwrap();
-        let attribute = CudaMemPoolAttributeI32::ReuseAllowOpportunistic;
-        let result = pool.set_attribute(attribute, 0);
-        assert_eq!(result, Ok(()));
-        assert_eq!(pool.get_attribute(attribute), Ok(0));
-    }
-
-    #[test]
-    #[serial]
-    fn set_attribute_u64() {
-        let pool = CudaOwnedMemPool::create_for_device(0).unwrap();
-        let attribute = CudaMemPoolAttributeU64::AttrReleaseThreshold;
-        let result = pool.set_attribute(attribute, u64::MAX);
-        assert_eq!(result, Ok(()));
-        assert_eq!(pool.get_attribute(attribute), Ok(u64::MAX));
-    }
-
-    #[test]
-    #[serial]
-    fn trim_to_works_correctly() {
-        let pool = CudaOwnedMemPool::create_for_device(0).unwrap();
-        pool.set_attribute(CudaMemPoolAttributeU64::AttrReleaseThreshold, u64::MAX)
-            .unwrap();
-        let stream = CudaStream::create().unwrap();
-        let allocation =
-            DevicePoolAllocation::<u32>::alloc_from_pool_async(LENGTH, &pool, &stream).unwrap();
-        let size = mem::size_of::<u32>() * LENGTH;
-        let used = pool
-            .get_attribute(CudaMemPoolAttributeU64::AttrUsedMemCurrent)
-            .unwrap() as usize;
-        assert_eq!(used, size);
-        allocation.free_async(&stream).unwrap();
-        stream.synchronize().unwrap();
-        let used = pool
-            .get_attribute(CudaMemPoolAttributeU64::AttrUsedMemCurrent)
-            .unwrap() as usize;
-        assert_eq!(used, 0);
-        let reserved = pool
-            .get_attribute(CudaMemPoolAttributeU64::AttrReservedMemCurrent)
-            .unwrap() as usize;
-        assert!(reserved >= size);
-        pool.trim_to(0).unwrap();
-        let reserved = pool
-            .get_attribute(CudaMemPoolAttributeU64::AttrReservedMemCurrent)
-            .unwrap() as usize;
-        assert_eq!(reserved, 0);
-    }
-}
diff --git a/cudart/src/occupancy.rs b/cudart/src/occupancy.rs
deleted file mode 100644
index 9e4a74b..0000000
--- a/cudart/src/occupancy.rs
+++ /dev/null
@@ -1,62 +0,0 @@
-use std::mem::MaybeUninit;
-
-use cudart_sys::{
-    cudaOccupancyAvailableDynamicSMemPerBlock, cudaOccupancyMaxActiveBlocksPerMultiprocessor,
-    cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags,
-};
-
-use crate::execution::Kernel;
-use crate::result::{CudaResult, CudaResultWrap};
-
-pub fn available_dynamic_smem_per_block(
-    kernel: impl Kernel,
-    num_blocks: i32,
-    block_size: i32,
-) -> CudaResult<usize> {
-    let mut result = MaybeUninit::<usize>::uninit();
-    unsafe {
-        cudaOccupancyAvailableDynamicSMemPerBlock(
-            result.as_mut_ptr(),
-            kernel.get_kernel_raw(),
-            num_blocks,
-            block_size,
-        )
-        .wrap_maybe_uninit(result)
-    }
-}
-
-pub fn max_active_blocks_per_multiprocessor(
-    kernel: impl Kernel,
-    block_size: i32,
-    dynamic_smem_size: usize,
-) -> CudaResult<i32> {
-    let mut result = MaybeUninit::<i32>::uninit();
-    unsafe {
-        cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-            result.as_mut_ptr(),
-            kernel.get_kernel_raw(),
-            block_size,
-            dynamic_smem_size,
-        )
-        .wrap_maybe_uninit(result)
-    }
-}
-
-pub fn max_active_blocks_per_multiprocessor_with_flags(
-    kernel: impl Kernel,
-    block_size: i32,
-    dynamic_smem_size: usize,
-    flags: u32,
-) -> CudaResult<i32> {
-    let mut result = MaybeUninit::<i32>::uninit();
-    unsafe {
-        cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
-            result.as_mut_ptr(),
-            kernel.get_kernel_raw(),
-            block_size,
-            dynamic_smem_size,
-            flags,
-        )
-        .wrap_maybe_uninit(result)
-    }
-}
diff --git a/cudart/src/peer.rs b/cudart/src/peer.rs
deleted file mode 100644
index ac50f8d..0000000
--- a/cudart/src/peer.rs
+++ /dev/null
@@ -1,62 +0,0 @@
-// Peer Device Memory Access
-// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__PEER.html
-
-use std::mem::MaybeUninit;
-
-use cudart_sys::*;
-
-use crate::result::{CudaResult, CudaResultWrap};
-
-pub fn device_can_access_peer(device_id: i32, device_peer_id: i32) -> CudaResult<bool> {
-    let mut can_access_peer = MaybeUninit::<i32>::uninit();
-    unsafe {
-        cudaDeviceCanAccessPeer(can_access_peer.as_mut_ptr(), device_id, device_peer_id)
-            .wrap_maybe_uninit(can_access_peer)
-            .map(|value| value != 0)
-    }
-}
-
-pub fn device_disable_peer_access(device_peer_id: i32) -> CudaResult<()> {
-    unsafe { cudaDeviceDisablePeerAccess(device_peer_id).wrap() }
-}
-
-pub fn device_enable_peer_access(device_peer_id: i32) -> CudaResult<()> {
-    unsafe { cudaDeviceEnablePeerAccess(device_peer_id, 0).wrap() }
-}
-
-#[cfg(test)]
-mod tests {
-    use serial_test::serial;
-
-    use crate::device::device_reset;
-
-    use super::*;
-
-    #[test]
-    #[serial]
-    #[ignore = "needs multiple peer-enabled GPUs"]
-    fn device_can_access_peer_is_true() {
-        let result = device_can_access_peer(0, 1);
-        assert_eq!(result, Ok(true));
-    }
-
-    #[test]
-    #[serial]
-    #[ignore = "needs multiple peer-enabled GPUs"]
-    fn device_disable_peer_access_is_ok() {
-        device_reset().unwrap();
-        device_enable_peer_access(1).unwrap();
-        let result = device_disable_peer_access(1);
-        assert_eq!(result, Ok(()));
-    }
-
-    #[test]
-    #[serial]
-    #[ignore = "needs multiple peer-enabled GPUs"]
-    fn device_enable_peer_access_is_ok() {
-        device_reset().unwrap();
-        let result = device_enable_peer_access(1);
-        assert_eq!(result, Ok(()));
-        device_disable_peer_access(1).unwrap();
-    }
-}
diff --git a/cudart/src/result.rs b/cudart/src/result.rs
deleted file mode 100644
index a3ce68b..0000000
--- a/cudart/src/result.rs
+++ /dev/null
@@ -1,29 +0,0 @@
-use std::mem::MaybeUninit;
-
-use cudart_sys::CudaError;
-
-pub type CudaResult<T> = Result<T, CudaError>;
-
-pub trait CudaResultWrap {
-    fn wrap(self) -> CudaResult<()>;
-    fn wrap_value<T>(self, value: T) -> CudaResult<T>;
-    fn wrap_maybe_uninit<T>(self, value: MaybeUninit<T>) -> CudaResult<T>;
-}
-
-impl CudaResultWrap for CudaError {
-    fn wrap(self) -> CudaResult<()> {
-        self.wrap_value(())
-    }
-
-    fn wrap_value<T>(self, value: T) -> CudaResult<T> {
-        if self == CudaError::Success {
-            Ok(value)
-        } else {
-            Err(self)
-        }
-    }
-
-    fn wrap_maybe_uninit<T>(self, value: MaybeUninit<T>) -> CudaResult<T> {
-        self.wrap_value(value).map(|x| unsafe { x.assume_init() })
-    }
-}
diff --git a/cudart/src/slice/allocation_data.rs b/cudart/src/slice/allocation_data.rs
deleted file mode 100644
index 4b11763..0000000
--- a/cudart/src/slice/allocation_data.rs
+++ /dev/null
@@ -1,25 +0,0 @@
-use std::slice;
-
-use crate::slice::{CudaSlice, CudaSliceMut};
-
-#[derive(Debug)]
-pub(crate) struct AllocationData<T> {
-    pub ptr: *mut T,
-    pub len: usize,
-}
-
-unsafe impl<T> Send for AllocationData<T> where Vec<T>: Send {}
-
-unsafe impl<T> Sync for AllocationData<T> where Vec<T>: Sync {}
-
-impl<T> CudaSlice<T> for AllocationData<T> {
-    unsafe fn as_slice(&self) -> &[T] {
-        slice::from_raw_parts(self.ptr, self.len)
-    }
-}
-
-impl<T> CudaSliceMut<T> for AllocationData<T> {
-    unsafe fn as_mut_slice(&mut self) -> &mut [T] {
-        slice::from_raw_parts_mut(self.ptr, self.len)
-    }
-}
diff --git a/cudart/src/slice/cuda_slice.rs b/cudart/src/slice/cuda_slice.rs
deleted file mode 100644
index d3bb5b3..0000000
--- a/cudart/src/slice/cuda_slice.rs
+++ /dev/null
@@ -1,67 +0,0 @@
-use std::ffi::c_void;
-
-pub trait CudaSlice<T> {
-    /// # Safety
-    /// do not dereference if the memory is located on the device
-    unsafe fn as_slice(&self) -> &[T];
-
-    fn as_ptr(&self) -> *const T {
-        unsafe { self.as_slice().as_ptr() }
-    }
-
-    fn as_c_void_ptr(&self) -> *const c_void {
-        self.as_ptr() as *const c_void
-    }
-
-    fn is_empty(&self) -> bool {
-        unsafe { self.as_slice().is_empty() }
-    }
-
-    fn len(&self) -> usize {
-        unsafe { self.as_slice().len() }
-    }
-}
-
-pub trait CudaSliceMut<T>: CudaSlice<T> {
-    /// # Safety
-    /// do not dereference if the memory is located on the device
-    unsafe fn as_mut_slice(&mut self) -> &mut [T];
-
-    fn as_mut_ptr(&mut self) -> *mut T {
-        unsafe { self.as_mut_slice().as_mut_ptr() }
-    }
-
-    fn as_mut_c_void_ptr(&mut self) -> *mut c_void {
-        self.as_mut_ptr() as *mut c_void
-    }
-}
-
-impl<T> CudaSlice<T> for [T] {
-    unsafe fn as_slice(&self) -> &[T] {
-        self
-    }
-}
-
-impl<T> CudaSliceMut<T> for [T] {
-    unsafe fn as_mut_slice(&mut self) -> &mut [T] {
-        self
-    }
-}
-
-impl<T, U> CudaSlice<T> for U
-where
-    Self: AsRef<[T]>,
-{
-    unsafe fn as_slice(&self) -> &[T] {
-        self.as_ref()
-    }
-}
-
-impl<T, U> CudaSliceMut<T> for U
-where
-    Self: AsMut<[T]> + AsRef<[T]>,
-{
-    unsafe fn as_mut_slice(&mut self) -> &mut [T] {
-        self.as_mut()
-    }
-}
diff --git a/cudart/src/slice/cuda_variable.rs b/cudart/src/slice/cuda_variable.rs
deleted file mode 100644
index 08920de..0000000
--- a/cudart/src/slice/cuda_variable.rs
+++ /dev/null
@@ -1,41 +0,0 @@
-use std::ffi::c_void;
-
-pub trait CudaVariable<T> {
-    /// # Safety
-    /// do not dereference if the memory is located on the device
-    unsafe fn as_ref(&self) -> &T;
-
-    fn as_ptr(&self) -> *const T {
-        unsafe { self.as_ref() }
-    }
-
-    fn as_c_void_ptr(&self) -> *const c_void {
-        self.as_ptr() as *const c_void
-    }
-}
-
-pub trait CudaVariableMut<T>: CudaVariable<T> {
-    /// # Safety
-    /// do not dereference if the memory is located on the device
-    unsafe fn as_mut(&mut self) -> &mut T;
-
-    fn as_mut_ptr(&mut self) -> *mut T {
-        unsafe { self.as_mut() }
-    }
-
-    fn as_mut_c_void_ptr(&mut self) -> *mut c_void {
-        self.as_mut_ptr() as *mut c_void
-    }
-}
-
-impl<T> CudaVariable<T> for T {
-    unsafe fn as_ref(&self) -> &T {
-        self
-    }
-}
-
-impl<T> CudaVariableMut<T> for T {
-    unsafe fn as_mut(&mut self) -> &mut T {
-        self
-    }
-}
diff --git a/cudart/src/slice/device_slice.rs b/cudart/src/slice/device_slice.rs
deleted file mode 100644
index 846f6e2..0000000
--- a/cudart/src/slice/device_slice.rs
+++ /dev/null
@@ -1,135 +0,0 @@
-use std::fmt::{Debug, Formatter};
-use std::mem::size_of;
-use std::ptr;
-use std::ptr::{null, null_mut};
-
-use crate::slice::iter::{Chunks, ChunksMut};
-use crate::slice::AllocationData;
-use crate::slice::{CudaSlice, CudaSliceMut};
-
-#[repr(transparent)]
-pub struct DeviceSlice<T>([T]);
-
-impl<T> DeviceSlice<T> {
-    /// # Safety
-    /// make sure data_address is pointing to memory located on the device
-    pub unsafe fn from_raw_parts<'a>(data_address: *const T, len: usize) -> &'a Self {
-        &*ptr::from_raw_parts(data_address as *const (), len)
-    }
-
-    /// # Safety
-    /// make sure data_address is pointing to memory located on the device
-    pub unsafe fn from_raw_parts_mut<'a>(data_address: *mut T, len: usize) -> &'a mut Self {
-        &mut *ptr::from_raw_parts_mut(data_address as *mut (), len)
-    }
-
-    /// # Safety
-    /// make sure the slice is pointing to memory located on the device
-    pub unsafe fn from_slice(slice: &[T]) -> &Self {
-        Self::from_raw_parts(slice.as_ptr(), slice.len())
-    }
-
-    /// # Safety
-    /// make sure the slice is pointing to memory located on the device
-    pub unsafe fn from_mut_slice(slice: &mut [T]) -> &mut Self {
-        Self::from_raw_parts_mut(slice.as_mut_ptr(), slice.len())
-    }
-
-    pub(crate) fn from_allocation_data(data: &AllocationData<T>) -> &Self {
-        unsafe { Self::from_slice(data.as_slice()) }
-    }
-
-    pub(crate) fn from_mut_allocation_data(data: &mut AllocationData<T>) -> &mut Self {
-        unsafe { Self::from_mut_slice(data.as_mut_slice()) }
-    }
-
-    pub fn empty<'a>() -> &'a Self {
-        unsafe { Self::from_raw_parts(null(), 0) }
-    }
-
-    pub fn empty_mut<'a>() -> &'a mut Self {
-        unsafe { Self::from_raw_parts_mut(null_mut(), 0) }
-    }
-
-    pub fn as_ptr(&self) -> *const T {
-        self.0.as_ptr()
-    }
-
-    pub fn as_mut_ptr(&mut self) -> *mut T {
-        self.0.as_mut_ptr()
-    }
-
-    pub fn len(&self) -> usize {
-        self.0.len()
-    }
-
-    pub fn is_empty(&self) -> bool {
-        self.0.is_empty()
-    }
-
-    /// # Safety
-    /// only use if you know that treating the memory as [U] is valid
-    pub unsafe fn transmute<U>(&self) -> &DeviceSlice<U> {
-        let size_of_t = size_of::<T>();
-        let size_of_u = size_of::<U>();
-        assert_eq!((self.len() * size_of_t) % size_of_u, 0);
-        let len = self.len() * size_of_t / size_of_u;
-        DeviceSlice::from_raw_parts(self.as_ptr() as *const U, len)
-    }
-
-    /// # Safety
-    /// only use if you know that treating the memory as [U] is valid
-    pub unsafe fn transmute_mut<U>(&mut self) -> &mut DeviceSlice<U> {
-        let size_of_t = size_of::<T>();
-        let size_of_u = size_of::<U>();
-        assert_eq!((self.len() * size_of_t) % size_of_u, 0);
-        let len = self.len() * size_of_t / size_of_u;
-        DeviceSlice::from_raw_parts_mut(self.as_mut_ptr() as *mut U, len)
-    }
-
-    pub fn split_at(&self, mid: usize) -> (&Self, &Self) {
-        unsafe {
-            let (left, right) = self.as_slice().split_at(mid);
-            (Self::from_slice(left), Self::from_slice(right))
-        }
-    }
-
-    pub fn split_at_mut(&mut self, mid: usize) -> (&mut Self, &mut Self) {
-        unsafe {
-            let (left, right) = self.as_mut_slice().split_at_mut(mid);
-            (Self::from_mut_slice(left), Self::from_mut_slice(right))
-        }
-    }
-
-    pub fn chunks(&self, chunk_size: usize) -> Chunks<T> {
-        assert_ne!(chunk_size, 0, "chunk size must be non-zero");
-        Chunks::new(self, chunk_size)
-    }
-
-    pub fn chunks_mut(&mut self, chunk_size: usize) -> ChunksMut<T> {
-        assert_ne!(chunk_size, 0, "chunk size must be non-zero");
-        ChunksMut::new(self, chunk_size)
-    }
-}
-
-impl<T> Debug for DeviceSlice<T> {
-    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
-        let slice = &self.0;
-        f.debug_struct("DeviceSlice")
-            .field("ptr", &slice.as_ptr())
-            .field("len", &slice.len())
-            .finish()
-    }
-}
-
-impl<T> CudaSlice<T> for DeviceSlice<T> {
-    unsafe fn as_slice(&self) -> &[T] {
-        &self.0
-    }
-}
-
-impl<T> CudaSliceMut<T> for DeviceSlice<T> {
-    unsafe fn as_mut_slice(&mut self) -> &mut [T] {
-        &mut self.0
-    }
-}
diff --git a/cudart/src/slice/device_variable.rs b/cudart/src/slice/device_variable.rs
deleted file mode 100644
index 5dc5238..0000000
--- a/cudart/src/slice/device_variable.rs
+++ /dev/null
@@ -1,103 +0,0 @@
-use std::fmt::{Debug, Formatter};
-use std::ops::{Deref, DerefMut};
-use std::ptr;
-
-use crate::slice::{CudaSlice, CudaSliceMut, CudaVariable, CudaVariableMut, DeviceSlice};
-
-#[repr(transparent)]
-pub struct DeviceVariable<T>([T]);
-
-impl<T> DeviceVariable<T> {
-    /// # Safety
-    /// make sure data_address is pointing to memory located on the device
-    pub unsafe fn from_raw_parts<'a>(data_address: *const T) -> &'a Self {
-        &*ptr::from_raw_parts(data_address as *const (), 1)
-    }
-
-    /// # Safety
-    /// make sure data_address is pointing to memory located on the device
-    pub unsafe fn from_raw_parts_mut<'a>(data_address: *mut T) -> &'a mut Self {
-        &mut *ptr::from_raw_parts_mut(data_address as *mut (), 1)
-    }
-
-    /// # Safety
-    /// make sure the ref is pointing to memory located on the device
-    pub unsafe fn from_ref(s: &T) -> &Self {
-        Self::from_raw_parts(s.as_ptr())
-    }
-
-    /// # Safety
-    /// make sure the ref is pointing to memory located on the device
-    pub unsafe fn from_mut(s: &mut T) -> &mut Self {
-        Self::from_raw_parts_mut(s.as_mut_ptr())
-    }
-
-    /// # Safety
-    /// make sure the slice is pointing to memory located on the device
-    pub unsafe fn from_slice(slice: &[T]) -> &Self {
-        assert_eq!(slice.len(), 1);
-        Self::from_raw_parts(slice.as_ptr())
-    }
-
-    /// # Safety
-    /// make sure the slice is pointing to memory located on the device
-    pub unsafe fn from_mut_slice(slice: &mut [T]) -> &mut Self {
-        assert_eq!(slice.len(), 1);
-        Self::from_raw_parts_mut(slice.as_mut_ptr())
-    }
-
-    pub fn as_ptr(&self) -> *const T {
-        self.0.as_ptr()
-    }
-
-    pub fn as_mut_ptr(&mut self) -> *mut T {
-        self.0.as_mut_ptr()
-    }
-}
-
-impl<T> Deref for DeviceVariable<T> {
-    type Target = DeviceSlice<T>;
-
-    fn deref(&self) -> &Self::Target {
-        unsafe { DeviceSlice::from_slice(&self.0) }
-    }
-}
-
-impl<T> DerefMut for DeviceVariable<T> {
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        unsafe { DeviceSlice::from_mut_slice(&mut self.0) }
-    }
-}
-
-impl<T> Debug for DeviceVariable<T> {
-    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
-        let slice = &self.0;
-        f.debug_struct("DeviceVariable")
-            .field("ptr", &slice.as_ptr())
-            .finish()
-    }
-}
-
-impl<T> CudaSlice<T> for DeviceVariable<T> {
-    unsafe fn as_slice(&self) -> &[T] {
-        &self.0
-    }
-}
-
-impl<T> CudaSliceMut<T> for DeviceVariable<T> {
-    unsafe fn as_mut_slice(&mut self) -> &mut [T] {
-        &mut self.0
-    }
-}
-
-impl<T> CudaVariable<T> for DeviceVariable<T> {
-    unsafe fn as_ref(&self) -> &T {
-        &self.0[0]
-    }
-}
-
-impl<T> CudaVariableMut<T> for DeviceVariable<T> {
-    unsafe fn as_mut(&mut self) -> &mut T {
-        &mut self.0[0]
-    }
-}
diff --git a/cudart/src/slice/index.rs b/cudart/src/slice/index.rs
deleted file mode 100644
index 721cd73..0000000
--- a/cudart/src/slice/index.rs
+++ /dev/null
@@ -1,78 +0,0 @@
-use std::ops;
-use std::ops::{Index, IndexMut};
-
-use crate::slice::{CudaSlice, CudaSliceMut, DeviceSlice, DeviceVariable};
-
-impl<T, I> Index<I> for DeviceSlice<T>
-where
-    I: DeviceSliceIndex<T>,
-{
-    type Output = I::Output;
-
-    #[inline]
-    fn index(&self, index: I) -> &I::Output {
-        index.index(self)
-    }
-}
-
-impl<T, I> IndexMut<I> for DeviceSlice<T>
-where
-    I: DeviceSliceIndex<T>,
-{
-    #[inline]
-    fn index_mut(&mut self, index: I) -> &mut I::Output {
-        index.index_mut(self)
-    }
-}
-
-pub trait DeviceSliceIndex<T> {
-    type Output: ?Sized;
-
-    fn index(self, slice: &DeviceSlice<T>) -> &Self::Output;
-
-    fn index_mut(self, slice: &mut DeviceSlice<T>) -> &mut Self::Output;
-}
-
-impl<T> DeviceSliceIndex<T> for usize {
-    type Output = DeviceVariable<T>;
-
-    fn index(self, slice: &DeviceSlice<T>) -> &Self::Output {
-        unsafe { DeviceVariable::from_ref(slice.as_slice().index(self)) }
-    }
-
-    fn index_mut(self, slice: &mut DeviceSlice<T>) -> &mut Self::Output {
-        unsafe { DeviceVariable::from_mut(slice.as_mut_slice().index_mut(self)) }
-    }
-}
-
-trait DeviceSliceToSliceIndex {}
-
-impl<T, I> DeviceSliceIndex<T> for I
-where
-    I: DeviceSliceToSliceIndex,
-    [T]: Index<I, Output = [T]> + IndexMut<I, Output = [T]>,
-{
-    type Output = DeviceSlice<T>;
-
-    fn index(self, slice: &DeviceSlice<T>) -> &Self::Output {
-        unsafe { DeviceSlice::from_slice(slice.as_slice().index(self)) }
-    }
-
-    fn index_mut(self, slice: &mut DeviceSlice<T>) -> &mut Self::Output {
-        unsafe { DeviceSlice::from_mut_slice(slice.as_mut_slice().index_mut(self)) }
-    }
-}
-
-impl DeviceSliceToSliceIndex for ops::RangeFull {}
-
-impl DeviceSliceToSliceIndex for ops::Range<usize> {}
-
-impl DeviceSliceToSliceIndex for ops::RangeFrom<usize> {}
-
-impl DeviceSliceToSliceIndex for ops::RangeTo<usize> {}
-
-impl DeviceSliceToSliceIndex for ops::RangeInclusive<usize> {}
-
-impl DeviceSliceToSliceIndex for ops::RangeToInclusive<usize> {}
-
-impl DeviceSliceToSliceIndex for (ops::Bound<usize>, ops::Bound<usize>) {}
diff --git a/cudart/src/slice/iter.rs b/cudart/src/slice/iter.rs
deleted file mode 100644
index a6e7c14..0000000
--- a/cudart/src/slice/iter.rs
+++ /dev/null
@@ -1,163 +0,0 @@
-use std::iter::{FusedIterator, TrustedLen, TrustedRandomAccess, TrustedRandomAccessNoCoerce};
-use std::slice;
-
-use crate::slice::cuda_slice::{CudaSlice, CudaSliceMut};
-use crate::slice::device_slice::DeviceSlice;
-
-#[repr(transparent)]
-pub struct Chunks<'a, T>(slice::Chunks<'a, T>);
-
-impl<'a, T: 'a> Chunks<'a, T> {
-    #[inline]
-    pub(super) fn new(slice: &'a DeviceSlice<T>, size: usize) -> Self {
-        unsafe { Self(slice.as_slice().chunks(size)) }
-    }
-
-    #[inline]
-    fn map_option(option: Option<&'a [T]>) -> Option<&'a DeviceSlice<T>> {
-        unsafe { option.map(|s| DeviceSlice::from_slice(s)) }
-    }
-}
-
-impl<T> Clone for Chunks<'_, T> {
-    fn clone(&self) -> Self {
-        Self(self.0.clone())
-    }
-}
-
-impl<'a, T> Iterator for Chunks<'a, T> {
-    type Item = &'a DeviceSlice<T>;
-
-    #[inline]
-    fn next(&mut self) -> Option<Self::Item> {
-        Self::map_option(self.0.next())
-    }
-
-    #[inline]
-    fn size_hint(&self) -> (usize, Option<usize>) {
-        self.0.size_hint()
-    }
-
-    #[inline]
-    fn count(self) -> usize {
-        self.0.len()
-    }
-
-    #[inline]
-    fn last(self) -> Option<Self::Item> {
-        Self::map_option(self.0.last())
-    }
-
-    #[inline]
-    fn nth(&mut self, n: usize) -> Option<Self::Item> {
-        Self::map_option(self.0.nth(n))
-    }
-
-    unsafe fn __iterator_get_unchecked(&mut self, idx: usize) -> Self::Item {
-        DeviceSlice::from_slice(self.0.__iterator_get_unchecked(idx))
-    }
-}
-
-impl<'a, T> DoubleEndedIterator for Chunks<'a, T> {
-    #[inline]
-    fn next_back(&mut self) -> Option<Self::Item> {
-        Self::map_option(self.0.next_back())
-    }
-
-    #[inline]
-    fn nth_back(&mut self, n: usize) -> Option<Self::Item> {
-        Self::map_option(self.0.nth_back(n))
-    }
-}
-
-impl<T> ExactSizeIterator for Chunks<'_, T> {}
-
-unsafe impl<T> TrustedLen for Chunks<'_, T> {}
-
-impl<T> FusedIterator for Chunks<'_, T> {}
-
-#[doc(hidden)]
-unsafe impl<'a, T> TrustedRandomAccess for Chunks<'a, T> {}
-
-#[doc(hidden)]
-unsafe impl<'a, T> TrustedRandomAccessNoCoerce for Chunks<'a, T> {
-    const MAY_HAVE_SIDE_EFFECT: bool = false;
-}
-
-#[repr(transparent)]
-pub struct ChunksMut<'a, T>(slice::ChunksMut<'a, T>);
-
-impl<'a, T: 'a> ChunksMut<'a, T> {
-    #[inline]
-    pub(super) fn new(slice: &'a mut DeviceSlice<T>, size: usize) -> Self {
-        unsafe { Self(slice.as_mut_slice().chunks_mut(size)) }
-    }
-
-    #[inline]
-    fn map_option(option: Option<&'a mut [T]>) -> Option<&'a mut DeviceSlice<T>> {
-        unsafe { option.map(|s| DeviceSlice::from_mut_slice(s)) }
-    }
-}
-
-impl<'a, T> Iterator for ChunksMut<'a, T> {
-    type Item = &'a mut DeviceSlice<T>;
-
-    #[inline]
-    fn next(&mut self) -> Option<Self::Item> {
-        Self::map_option(self.0.next())
-    }
-
-    #[inline]
-    fn size_hint(&self) -> (usize, Option<usize>) {
-        self.0.size_hint()
-    }
-
-    #[inline]
-    fn count(self) -> usize {
-        self.len()
-    }
-
-    #[inline]
-    fn last(self) -> Option<Self::Item> {
-        Self::map_option(self.0.last())
-    }
-
-    #[inline]
-    fn nth(&mut self, n: usize) -> Option<Self::Item> {
-        Self::map_option(self.0.nth(n))
-    }
-
-    unsafe fn __iterator_get_unchecked(&mut self, idx: usize) -> Self::Item {
-        DeviceSlice::from_mut_slice(self.0.__iterator_get_unchecked(idx))
-    }
-}
-
-impl<'a, T> DoubleEndedIterator for ChunksMut<'a, T> {
-    #[inline]
-    fn next_back(&mut self) -> Option<Self::Item> {
-        Self::map_option(self.0.next_back())
-    }
-
-    #[inline]
-    fn nth_back(&mut self, n: usize) -> Option<Self::Item> {
-        Self::map_option(self.0.nth_back(n))
-    }
-}
-
-impl<T> ExactSizeIterator for ChunksMut<'_, T> {}
-
-unsafe impl<T> TrustedLen for ChunksMut<'_, T> {}
-
-impl<T> FusedIterator for ChunksMut<'_, T> {}
-
-#[doc(hidden)]
-unsafe impl<'a, T> TrustedRandomAccess for ChunksMut<'a, T> {}
-
-#[doc(hidden)]
-unsafe impl<'a, T> TrustedRandomAccessNoCoerce for ChunksMut<'a, T> {
-    const MAY_HAVE_SIDE_EFFECT: bool = false;
-}
-
-unsafe impl<T> Send for ChunksMut<'_, T> where T: Send {}
-
-unsafe impl<T> Sync for ChunksMut<'_, T> where T: Sync {}
diff --git a/cudart/src/slice/mod.rs b/cudart/src/slice/mod.rs
deleted file mode 100644
index 0b80b89..0000000
--- a/cudart/src/slice/mod.rs
+++ /dev/null
@@ -1,15 +0,0 @@
-pub(crate) use allocation_data::AllocationData;
-pub use cuda_slice::CudaSlice;
-pub use cuda_slice::CudaSliceMut;
-pub use cuda_variable::CudaVariable;
-pub use cuda_variable::CudaVariableMut;
-pub use device_slice::DeviceSlice;
-pub use device_variable::DeviceVariable;
-
-mod allocation_data;
-mod cuda_slice;
-mod cuda_variable;
-mod device_slice;
-mod device_variable;
-mod index;
-mod iter;
diff --git a/cudart/src/stream.rs b/cudart/src/stream.rs
deleted file mode 100644
index a696db5..0000000
--- a/cudart/src/stream.rs
+++ /dev/null
@@ -1,218 +0,0 @@
-// stream management
-// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html
-
-use std::mem::{self, MaybeUninit};
-use std::ptr::null_mut;
-
-use bitflags::bitflags;
-
-use cudart_sys::*;
-
-use crate::event::CudaEvent;
-use crate::execution::CudaLaunchAttribute;
-use crate::result::{CudaResult, CudaResultWrap};
-
-#[repr(transparent)]
-#[derive(Debug)]
-pub struct CudaStream {
-    handle: cudaStream_t,
-}
-
-bitflags! {
-    #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
-    pub struct CudaStreamCreateFlags: u32 {
-        const DEFAULT = cudart_sys::cudaStreamDefault;
-        const NON_BLOCKING = cudart_sys::cudaStreamNonBlocking;
-    }
-}
-
-impl Default for CudaStreamCreateFlags {
-    fn default() -> Self {
-        Self::DEFAULT
-    }
-}
-
-bitflags! {
-    #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
-    pub struct CudaStreamWaitEventFlags: u32 {
-        const DEFAULT = cudart_sys::cudaEventWaitDefault;
-        const WAIT_EXTERNAL = cudart_sys::cudaEventWaitExternal;
-    }
-}
-
-impl Default for CudaStreamWaitEventFlags {
-    fn default() -> Self {
-        Self::DEFAULT
-    }
-}
-
-impl CudaStream {
-    fn from_handle(handle: cudaStream_t) -> Self {
-        Self { handle }
-    }
-
-    pub fn create() -> CudaResult<Self> {
-        let mut handle = MaybeUninit::<cudaStream_t>::uninit();
-        unsafe {
-            cudaStreamCreate(handle.as_mut_ptr())
-                .wrap_maybe_uninit(handle)
-                .map(CudaStream::from_handle)
-        }
-    }
-
-    pub fn create_with_flags(flags: CudaStreamCreateFlags) -> CudaResult<Self> {
-        let mut handle = MaybeUninit::<cudaStream_t>::uninit();
-        unsafe {
-            cudaStreamCreateWithFlags(handle.as_mut_ptr(), flags.bits())
-                .wrap_maybe_uninit(handle)
-                .map(CudaStream::from_handle)
-        }
-    }
-
-    pub fn destroy(self) -> CudaResult<()> {
-        let handle = self.handle;
-        mem::forget(self);
-        if handle.is_null() {
-            Ok(())
-        } else {
-            unsafe { cudaStreamDestroy(handle).wrap() }
-        }
-    }
-
-    pub fn get_attribute(&self, id: CudaLaunchAttributeID) -> CudaResult<CudaLaunchAttribute> {
-        let mut value = MaybeUninit::<CudaLaunchAttributeValue>::uninit();
-        unsafe {
-            cudaStreamGetAttribute(self.handle, id, value.as_mut_ptr())
-                .wrap_maybe_uninit(value)
-                .map(|val| CudaLaunchAttribute::from_id_and_value(id, val))
-        }
-    }
-
-    pub fn query(&self) -> CudaResult<bool> {
-        let error = unsafe { cudaStreamQuery(self.handle) };
-        match error {
-            CudaError::Success => Ok(true),
-            CudaError::ErrorNotReady => Ok(false),
-            _ => Err(error),
-        }
-    }
-
-    pub fn set_attribute(&self, attribute: CudaLaunchAttribute) -> CudaResult<()> {
-        let (id, value) = attribute.into_id_and_value();
-        unsafe { cudaStreamSetAttribute(self.handle, id, &value as *const _).wrap() }
-    }
-
-    pub fn synchronize(&self) -> CudaResult<()> {
-        unsafe { cudaStreamSynchronize(self.handle).wrap() }
-    }
-
-    pub fn wait_event(&self, event: &CudaEvent, flags: CudaStreamWaitEventFlags) -> CudaResult<()> {
-        unsafe { cudaStreamWaitEvent(self.handle, event.into(), flags.bits()).wrap() }
-    }
-}
-
-impl Default for CudaStream {
-    fn default() -> Self {
-        Self { handle: null_mut() }
-    }
-}
-
-impl Drop for CudaStream {
-    fn drop(&mut self) {
-        let handle = self.handle;
-        if handle.is_null() {
-            return;
-        }
-        unsafe { cudaStreamDestroy(handle).eprint_error_and_backtrace() };
-    }
-}
-
-impl From<&CudaStream> for cudaStream_t {
-    fn from(stream: &CudaStream) -> Self {
-        stream.handle
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use std::thread;
-    use std::time::Duration;
-
-    use serial_test::serial;
-
-    use crate::execution::{launch_host_fn, HostFn};
-
-    use super::*;
-
-    #[test]
-    #[serial]
-    fn create_is_ok() {
-        let result = CudaStream::create();
-        assert!(result.is_ok());
-    }
-
-    #[test]
-    #[serial]
-    fn create_handle_is_not_null() {
-        let stream = CudaStream::create().unwrap();
-        assert_ne!(stream.handle, null_mut());
-    }
-
-    #[test]
-    #[serial]
-    fn create_with_flags_is_ok() {
-        let result = CudaStream::create_with_flags(CudaStreamCreateFlags::NON_BLOCKING);
-        assert!(result.is_ok());
-    }
-
-    #[test]
-    #[serial]
-    fn create_with_flags_handle_is_not_null() {
-        let stream = CudaStream::create_with_flags(CudaStreamCreateFlags::NON_BLOCKING).unwrap();
-        assert_ne!(stream.handle, null_mut());
-    }
-
-    #[test]
-    #[serial]
-    fn destroy_is_ok() {
-        let stream = CudaStream::create().unwrap();
-        let result = stream.destroy();
-        assert!(result.is_ok());
-    }
-
-    #[test]
-    #[serial]
-    fn query_is_true() {
-        let stream = CudaStream::create().unwrap();
-        let result = stream.query();
-        assert_eq!(result, Ok(true));
-    }
-
-    #[test]
-    #[serial]
-    fn query_is_false() {
-        let stream = CudaStream::create().unwrap();
-        let func = HostFn::new(|| thread::sleep(Duration::from_millis(100)));
-        launch_host_fn(&stream, &func).unwrap();
-        let result = stream.query();
-        assert_eq!(result, Ok(false));
-    }
-
-    #[test]
-    #[serial]
-    fn synchronize_is_ok() {
-        let stream = CudaStream::create().unwrap();
-        let result = stream.synchronize();
-        assert_eq!(result, Ok(()));
-    }
-
-    #[test]
-    #[serial]
-    fn wait_event_is_ok() {
-        let stream = CudaStream::create().unwrap();
-        let event = CudaEvent::create().unwrap();
-        event.record(&stream).unwrap();
-        let result = stream.wait_event(&event, CudaStreamWaitEventFlags::DEFAULT);
-        assert_eq!(result, Ok(()));
-    }
-}
diff --git a/cudart/src/unified.rs b/cudart/src/unified.rs
deleted file mode 100644
index 78ea2ee..0000000
--- a/cudart/src/unified.rs
+++ /dev/null
@@ -1,58 +0,0 @@
-// Unified Addressing
-// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__UNIFIED.html
-
-use core::ffi::c_void;
-use std::mem::MaybeUninit;
-
-use cudart_sys::*;
-
-use crate::result::{CudaResult, CudaResultWrap};
-use crate::slice::CudaSlice;
-
-pub fn pointer_get_attributes<T>(
-    slice: &(impl CudaSlice<T> + ?Sized),
-) -> CudaResult<CudaPointerAttributes> {
-    let mut attributes = MaybeUninit::<CudaPointerAttributes>::uninit();
-    unsafe {
-        cudaPointerGetAttributes(
-            attributes.as_mut_ptr(),
-            slice.as_c_void_ptr() as *mut c_void,
-        )
-        .wrap_maybe_uninit(attributes)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use serial_test::serial;
-
-    use crate::memory::*;
-
-    use super::*;
-
-    #[test]
-    #[serial]
-    fn pointer_is_unregistered() {
-        let values = [0u32];
-        let attributes = pointer_get_attributes(&values).unwrap();
-        assert_eq!(attributes.type_, CudaMemoryType::Unregistered);
-    }
-
-    #[test]
-    #[serial]
-    fn pointer_is_host() {
-        let values = [0u32];
-        let registration =
-            HostRegistration::register(&values, CudaHostRegisterFlags::DEFAULT).unwrap();
-        let attributes = pointer_get_attributes(&registration).unwrap();
-        assert_eq!(attributes.type_, CudaMemoryType::Host);
-    }
-
-    #[test]
-    #[serial]
-    fn pointer_is_device() {
-        let allocation = DeviceAllocation::<u32>::alloc(1).unwrap();
-        let attributes = pointer_get_attributes(&allocation).unwrap();
-        assert_eq!(attributes.type_, CudaMemoryType::Device);
-    }
-}
diff --git a/boojum-cuda/native/.clang-format b/native/.clang-format
similarity index 100%
rename from boojum-cuda/native/.clang-format
rename to native/.clang-format
diff --git a/boojum-cuda/native/.gitignore b/native/.gitignore
similarity index 100%
rename from boojum-cuda/native/.gitignore
rename to native/.gitignore
diff --git a/boojum-cuda/native/CMakeLists.txt b/native/CMakeLists.txt
similarity index 100%
rename from boojum-cuda/native/CMakeLists.txt
rename to native/CMakeLists.txt
diff --git a/boojum-cuda/native/barycentric.cu b/native/barycentric.cu
similarity index 100%
rename from boojum-cuda/native/barycentric.cu
rename to native/barycentric.cu
diff --git a/boojum-cuda/native/blake2s.cu b/native/blake2s.cu
similarity index 100%
rename from boojum-cuda/native/blake2s.cu
rename to native/blake2s.cu
diff --git a/boojum-cuda/native/carry_chain.cuh b/native/carry_chain.cuh
similarity index 100%
rename from boojum-cuda/native/carry_chain.cuh
rename to native/carry_chain.cuh
diff --git a/boojum-cuda/native/common.cuh b/native/common.cuh
similarity index 100%
rename from boojum-cuda/native/common.cuh
rename to native/common.cuh
diff --git a/boojum-cuda/native/context.cu b/native/context.cu
similarity index 100%
rename from boojum-cuda/native/context.cu
rename to native/context.cu
diff --git a/boojum-cuda/native/context.cuh b/native/context.cuh
similarity index 100%
rename from boojum-cuda/native/context.cuh
rename to native/context.cuh
diff --git a/boojum-cuda/native/gates.cuh b/native/gates.cuh
similarity index 100%
rename from boojum-cuda/native/gates.cuh
rename to native/gates.cuh
diff --git a/boojum-cuda/native/gates_poseidon.cuh b/native/gates_poseidon.cuh
similarity index 100%
rename from boojum-cuda/native/gates_poseidon.cuh
rename to native/gates_poseidon.cuh
diff --git a/boojum-cuda/native/gates_template.cu b/native/gates_template.cu
similarity index 100%
rename from boojum-cuda/native/gates_template.cu
rename to native/gates_template.cu
diff --git a/boojum-cuda/native/goldilocks.cuh b/native/goldilocks.cuh
similarity index 100%
rename from boojum-cuda/native/goldilocks.cuh
rename to native/goldilocks.cuh
diff --git a/boojum-cuda/native/goldilocks_extension.cu b/native/goldilocks_extension.cu
similarity index 100%
rename from boojum-cuda/native/goldilocks_extension.cu
rename to native/goldilocks_extension.cu
diff --git a/boojum-cuda/native/goldilocks_extension.cuh b/native/goldilocks_extension.cuh
similarity index 100%
rename from boojum-cuda/native/goldilocks_extension.cuh
rename to native/goldilocks_extension.cuh
diff --git a/boojum-cuda/native/memory.cuh b/native/memory.cuh
similarity index 100%
rename from boojum-cuda/native/memory.cuh
rename to native/memory.cuh
diff --git a/boojum-cuda/native/ntt.cu b/native/ntt.cu
similarity index 100%
rename from boojum-cuda/native/ntt.cu
rename to native/ntt.cu
diff --git a/boojum-cuda/native/ntt_b2n.cuh b/native/ntt_b2n.cuh
similarity index 100%
rename from boojum-cuda/native/ntt_b2n.cuh
rename to native/ntt_b2n.cuh
diff --git a/boojum-cuda/native/ntt_n2b.cuh b/native/ntt_n2b.cuh
similarity index 100%
rename from boojum-cuda/native/ntt_n2b.cuh
rename to native/ntt_n2b.cuh
diff --git a/boojum-cuda/native/ops_complex.cu b/native/ops_complex.cu
similarity index 100%
rename from boojum-cuda/native/ops_complex.cu
rename to native/ops_complex.cu
diff --git a/boojum-cuda/native/ops_complex.cuh b/native/ops_complex.cuh
similarity index 100%
rename from boojum-cuda/native/ops_complex.cuh
rename to native/ops_complex.cuh
diff --git a/boojum-cuda/native/ops_cub/common.cuh b/native/ops_cub/common.cuh
similarity index 100%
rename from boojum-cuda/native/ops_cub/common.cuh
rename to native/ops_cub/common.cuh
diff --git a/boojum-cuda/native/ops_cub/device_radix_sort.cu b/native/ops_cub/device_radix_sort.cu
similarity index 100%
rename from boojum-cuda/native/ops_cub/device_radix_sort.cu
rename to native/ops_cub/device_radix_sort.cu
diff --git a/boojum-cuda/native/ops_cub/device_reduce.cu b/native/ops_cub/device_reduce.cu
similarity index 100%
rename from boojum-cuda/native/ops_cub/device_reduce.cu
rename to native/ops_cub/device_reduce.cu
diff --git a/boojum-cuda/native/ops_cub/device_run_length_encode.cu b/native/ops_cub/device_run_length_encode.cu
similarity index 100%
rename from boojum-cuda/native/ops_cub/device_run_length_encode.cu
rename to native/ops_cub/device_run_length_encode.cu
diff --git a/boojum-cuda/native/ops_cub/device_scan.cu b/native/ops_cub/device_scan.cu
similarity index 100%
rename from boojum-cuda/native/ops_cub/device_scan.cu
rename to native/ops_cub/device_scan.cu
diff --git a/boojum-cuda/native/ops_simple.cu b/native/ops_simple.cu
similarity index 100%
rename from boojum-cuda/native/ops_simple.cu
rename to native/ops_simple.cu
diff --git a/boojum-cuda/native/poseidon2_cooperative.cu b/native/poseidon2_cooperative.cu
similarity index 100%
rename from boojum-cuda/native/poseidon2_cooperative.cu
rename to native/poseidon2_cooperative.cu
diff --git a/boojum-cuda/native/poseidon2_single_thread.cu b/native/poseidon2_single_thread.cu
similarity index 100%
rename from boojum-cuda/native/poseidon2_single_thread.cu
rename to native/poseidon2_single_thread.cu
diff --git a/boojum-cuda/native/poseidon2_single_thread.cuh b/native/poseidon2_single_thread.cuh
similarity index 100%
rename from boojum-cuda/native/poseidon2_single_thread.cuh
rename to native/poseidon2_single_thread.cuh
diff --git a/boojum-cuda/native/poseidon_common.cu b/native/poseidon_common.cu
similarity index 54%
rename from boojum-cuda/native/poseidon_common.cu
rename to native/poseidon_common.cu
index b68fbc6..9bec51d 100644
--- a/boojum-cuda/native/poseidon_common.cu
+++ b/native/poseidon_common.cu
@@ -1,9 +1,11 @@
 #include "goldilocks.cuh"
+#include "poseidon_constants.cuh"
 
 namespace poseidon {
 
 using namespace goldilocks;
 using namespace memory;
+using namespace poseidon_common;
 
 EXTERN __global__ void gather_rows_kernel(const unsigned *indexes, const unsigned indexes_count, const matrix_getter<base_field, ld_modifier::cs> values,
                                           matrix_setter<base_field, st_modifier::cs> results) {
@@ -17,20 +19,20 @@ EXTERN __global__ void gather_rows_kernel(const unsigned *indexes, const unsigne
   results.set(dst_row, col, values.get(src_row, col));
 }
 
-EXTERN __global__ void gather_merkle_paths_kernel(const unsigned *indexes, const unsigned indexes_count,
-                                                  const matrix_getter<base_field, ld_modifier::cs> values, matrix_setter<base_field, st_modifier::cs> results) {
+EXTERN __global__ void gather_merkle_paths_kernel(const unsigned *indexes, const unsigned indexes_count, const base_field *values,
+                                                  const unsigned log_leaves_count, base_field *results) {
   const unsigned idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx >= indexes_count)
     return;
-  const unsigned col = threadIdx.y;
-  const unsigned layers_count = gridDim.y;
-  const unsigned layer_from_leaves = blockIdx.y;
+  const unsigned col = blockIdx.y;
+  const unsigned layer_index = blockIdx.z;
+  const unsigned layer_offset = (CAPACITY << (log_leaves_count + 1)) - (CAPACITY << (log_leaves_count + 1 - layer_index));
+  const unsigned col_offset = col << (log_leaves_count - layer_index);
   const unsigned leaf_index = indexes[idx];
-  const unsigned layer_offset = (1 << (layers_count + 1)) - (1 << (layers_count + 1 - layer_from_leaves));
-  const unsigned hash_index = (leaf_index >> layer_from_leaves) ^ 1;
-  const unsigned src_row = layer_offset + hash_index;
-  const unsigned dst_row = layer_from_leaves * indexes_count + idx;
-  results.set(dst_row, col, values.get(src_row, col));
+  const unsigned hash_index = (leaf_index >> layer_index) ^ 1;
+  const unsigned src_index = layer_offset + col_offset + hash_index;
+  const unsigned dst_index = layer_index * indexes_count * CAPACITY + indexes_count * col + idx;
+  results[dst_index] = values[src_index];
 }
 
 } // namespace poseidon
\ No newline at end of file
diff --git a/boojum-cuda/native/poseidon_constants_template.cuh b/native/poseidon_constants_template.cuh
similarity index 100%
rename from boojum-cuda/native/poseidon_constants_template.cuh
rename to native/poseidon_constants_template.cuh
diff --git a/boojum-cuda/native/poseidon_cooperative.cu b/native/poseidon_cooperative.cu
similarity index 100%
rename from boojum-cuda/native/poseidon_cooperative.cu
rename to native/poseidon_cooperative.cu
diff --git a/boojum-cuda/native/poseidon_single_thread.cu b/native/poseidon_single_thread.cu
similarity index 100%
rename from boojum-cuda/native/poseidon_single_thread.cu
rename to native/poseidon_single_thread.cu
diff --git a/boojum-cuda/native/poseidon_single_thread.cuh b/native/poseidon_single_thread.cuh
similarity index 100%
rename from boojum-cuda/native/poseidon_single_thread.cuh
rename to native/poseidon_single_thread.cuh
diff --git a/boojum-cuda/native/poseidon_utils.cuh b/native/poseidon_utils.cuh
similarity index 100%
rename from boojum-cuda/native/poseidon_utils.cuh
rename to native/poseidon_utils.cuh
diff --git a/boojum-cuda/native/ptx.cuh b/native/ptx.cuh
similarity index 100%
rename from boojum-cuda/native/ptx.cuh
rename to native/ptx.cuh
diff --git a/boojum-cuda/src/.gitignore b/src/.gitignore
similarity index 100%
rename from boojum-cuda/src/.gitignore
rename to src/.gitignore
diff --git a/boojum-cuda/src/barycentric.rs b/src/barycentric.rs
similarity index 99%
rename from boojum-cuda/src/barycentric.rs
rename to src/barycentric.rs
index 2415524..5a0e5e6 100644
--- a/boojum-cuda/src/barycentric.rs
+++ b/src/barycentric.rs
@@ -91,6 +91,7 @@ pub trait PrecomputeImpl {
         GoldilocksField,
         u32,
     );
+    #[allow(clippy::type_complexity)]
     fn get_precompute_kernel() -> unsafe extern "C" fn(
         *const <Self::X as DeviceRepr>::Type,
         *const <Self::X as DeviceRepr>::Type,
@@ -181,6 +182,7 @@ pub trait EvalImpl {
     type XVec: DeviceRepr;
     type YsVec: DeviceRepr;
     fn get_partial_reduce_elems_per_thread() -> u32;
+    #[allow(clippy::type_complexity)]
     fn get_partial_reduce_kernel() -> unsafe extern "C" fn(
         PtrAndStride<<Self::YsVec as DeviceRepr>::Type>,
         PtrAndStride<<Self::XVec as DeviceRepr>::Type>,
diff --git a/boojum-cuda/src/blake2s.rs b/src/blake2s.rs
similarity index 100%
rename from boojum-cuda/src/blake2s.rs
rename to src/blake2s.rs
diff --git a/boojum-cuda/src/context.rs b/src/context.rs
similarity index 100%
rename from boojum-cuda/src/context.rs
rename to src/context.rs
diff --git a/boojum-cuda/src/device_structures.rs b/src/device_structures.rs
similarity index 100%
rename from boojum-cuda/src/device_structures.rs
rename to src/device_structures.rs
diff --git a/boojum-cuda/src/extension_field.rs b/src/extension_field.rs
similarity index 100%
rename from boojum-cuda/src/extension_field.rs
rename to src/extension_field.rs
diff --git a/boojum-cuda/src/gates.rs b/src/gates.rs
similarity index 100%
rename from boojum-cuda/src/gates.rs
rename to src/gates.rs
diff --git a/boojum-cuda/src/gates_data_template.rs b/src/gates_data_template.rs
similarity index 100%
rename from boojum-cuda/src/gates_data_template.rs
rename to src/gates_data_template.rs
diff --git a/boojum-cuda/src/lib.rs b/src/lib.rs
similarity index 100%
rename from boojum-cuda/src/lib.rs
rename to src/lib.rs
diff --git a/boojum-cuda/src/ntt.rs b/src/ntt.rs
similarity index 100%
rename from boojum-cuda/src/ntt.rs
rename to src/ntt.rs
diff --git a/boojum-cuda/src/ops_complex.rs b/src/ops_complex.rs
similarity index 99%
rename from boojum-cuda/src/ops_complex.rs
rename to src/ops_complex.rs
index 2032c28..fed079b 100644
--- a/boojum-cuda/src/ops_complex.rs
+++ b/src/ops_complex.rs
@@ -1694,7 +1694,7 @@ mod tests {
             .into_iter()
             .enumerate()
             .map(|(i, x)| x * generator.pow_u64(SHIFT as u64 * i as u64))
-            .zip(h_dst.into_iter())
+            .zip(h_dst)
             .for_each(assert_equal);
         context.destroy().unwrap();
     }
@@ -1755,9 +1755,8 @@ mod tests {
             .into_iter()
             .chunks(ROWS)
             .into_iter()
-            .zip(h_dst.into_iter().chunks(ROWS).into_iter())
+            .zip(h_dst.chunks(ROWS))
             .for_each(|(s, d)| {
-                let d = d.collect_vec();
                 s.enumerate()
                     .map(|(i, x)| (x, d[i.reverse_bits() >> (usize::BITS - LOG_ROWS as u32)]))
                     .for_each(assert_equal);
@@ -1817,7 +1816,7 @@ mod tests {
     }
 
     fn test_batch_inv_ef(in_place: bool) {
-        type VEF = VectorizedExtensionField;
+        type Vef = VectorizedExtensionField;
         const LOG_N: usize = 16;
         const N: usize = 1 << LOG_N;
         let h_src_bf = Uniform::new(0, GoldilocksField::ORDER)
@@ -1830,16 +1829,16 @@ mod tests {
         if in_place {
             let mut d_values_bf = DeviceAllocation::alloc(2 * N).unwrap();
             memory_copy_async(&mut d_values_bf, &h_src_bf, &stream).unwrap();
-            let mut d_values_ef = unsafe { d_values_bf.transmute_mut::<VEF>() };
-            super::batch_inv_in_place::<VEF>(&mut d_values_ef, &stream).unwrap();
+            let d_values_ef = unsafe { d_values_bf.transmute_mut::<Vef>() };
+            super::batch_inv_in_place::<Vef>(d_values_ef, &stream).unwrap();
             memory_copy_async(&mut h_dst_bf, &d_values_bf, &stream).unwrap();
         } else {
             let mut d_src_bf = DeviceAllocation::alloc(2 * N).unwrap();
             let mut d_dst_bf = DeviceAllocation::alloc(2 * N).unwrap();
             memory_copy_async(&mut d_src_bf, &h_src_bf, &stream).unwrap();
-            let d_src_ef = unsafe { d_src_bf.transmute::<VEF>() };
-            let mut d_dst_ef = unsafe { d_dst_bf.transmute_mut::<VEF>() };
-            super::batch_inv::<VEF>(&d_src_ef, &mut d_dst_ef, &stream).unwrap();
+            let d_src_ef = unsafe { d_src_bf.transmute::<Vef>() };
+            let d_dst_ef = unsafe { d_dst_bf.transmute_mut::<Vef>() };
+            super::batch_inv::<Vef>(d_src_ef, d_dst_ef, &stream).unwrap();
             memory_copy_async(&mut h_dst_bf, &d_dst_bf, &stream).unwrap();
         }
         stream.synchronize().unwrap();
diff --git a/boojum-cuda/src/ops_cub/device_radix_sort.rs b/src/ops_cub/device_radix_sort.rs
similarity index 98%
rename from boojum-cuda/src/ops_cub/device_radix_sort.rs
rename to src/ops_cub/device_radix_sort.rs
index e369c4f..6ecaf84 100644
--- a/boojum-cuda/src/ops_cub/device_radix_sort.rs
+++ b/src/ops_cub/device_radix_sort.rs
@@ -591,14 +591,8 @@ mod tests {
         memory_copy_async(&mut h_keys_out, &d_keys_out, &stream).unwrap();
         memory_copy_async(&mut h_values_out, &d_values_out, &stream).unwrap();
         stream.synchronize().unwrap();
-        let mut pairs_in = h_keys_in
-            .into_iter()
-            .zip(h_values_in.into_iter())
-            .collect_vec();
-        let pairs_out = h_keys_out
-            .into_iter()
-            .zip(h_values_out.into_iter())
-            .collect_vec();
+        let mut pairs_in = h_keys_in.into_iter().zip(h_values_in).collect_vec();
+        let pairs_out = h_keys_out.into_iter().zip(h_values_out).collect_vec();
         pairs_in.sort_by_key(|(k, _)| k.clone());
         if descending {
             pairs_in.reverse()
diff --git a/boojum-cuda/src/ops_cub/device_reduce.rs b/src/ops_cub/device_reduce.rs
similarity index 100%
rename from boojum-cuda/src/ops_cub/device_reduce.rs
rename to src/ops_cub/device_reduce.rs
diff --git a/boojum-cuda/src/ops_cub/device_run_length_encode.rs b/src/ops_cub/device_run_length_encode.rs
similarity index 100%
rename from boojum-cuda/src/ops_cub/device_run_length_encode.rs
rename to src/ops_cub/device_run_length_encode.rs
diff --git a/boojum-cuda/src/ops_cub/device_scan.rs b/src/ops_cub/device_scan.rs
similarity index 99%
rename from boojum-cuda/src/ops_cub/device_scan.rs
rename to src/ops_cub/device_scan.rs
index 810fd3b..f668cf7 100644
--- a/boojum-cuda/src/ops_cub/device_scan.rs
+++ b/src/ops_cub/device_scan.rs
@@ -844,10 +844,10 @@ mod tests {
         h_in.into_iter()
             .chunks(NUM_ITEMS)
             .into_iter()
-            .zip(h_out.into_iter().chunks(NUM_ITEMS).into_iter())
+            .zip(h_out.chunks(NUM_ITEMS))
             .for_each(|(h_in, h_out)| {
                 let h_in = h_in.collect_vec();
-                let h_out = h_out.collect_vec();
+                let h_out = Vec::from(h_out);
                 verify(operation, inclusive, reverse, h_in, h_out);
             });
     }
diff --git a/boojum-cuda/src/ops_cub/mod.rs b/src/ops_cub/mod.rs
similarity index 100%
rename from boojum-cuda/src/ops_cub/mod.rs
rename to src/ops_cub/mod.rs
diff --git a/boojum-cuda/src/ops_simple.rs b/src/ops_simple.rs
similarity index 100%
rename from boojum-cuda/src/ops_simple.rs
rename to src/ops_simple.rs
diff --git a/boojum-cuda/src/poseidon.rs b/src/poseidon.rs
similarity index 92%
rename from boojum-cuda/src/poseidon.rs
rename to src/poseidon.rs
index a8de969..e3fdee4 100644
--- a/boojum-cuda/src/poseidon.rs
+++ b/src/poseidon.rs
@@ -1,7 +1,9 @@
 use boojum::field::goldilocks::GoldilocksField;
 use boojum::implementations::poseidon_goldilocks_params::*;
 
-use cudart::execution::{KernelFourArgs, KernelLaunch, KernelSevenArgs, KernelThreeArgs};
+use cudart::execution::{
+    KernelFiveArgs, KernelFourArgs, KernelLaunch, KernelSevenArgs, KernelThreeArgs,
+};
 use cudart::result::CudaResult;
 use cudart::slice::DeviceSlice;
 use cudart::stream::CudaStream;
@@ -79,8 +81,9 @@ extern "C" {
     fn gather_merkle_paths_kernel(
         indexes: *const u32,
         indexes_count: u32,
-        values: PtrAndStride<BaseFieldDeviceType>,
-        results: MutPtrAndStride<BaseFieldDeviceType>,
+        values: *const GoldilocksField,
+        log_leaves_count: u32,
+        results: *mut GoldilocksField,
     );
 }
 
@@ -417,31 +420,38 @@ pub fn gather_rows(
 
 pub fn gather_merkle_paths(
     indexes: &DeviceSlice<u32>,
-    values: &(impl DeviceMatrixChunkImpl<BaseField> + ?Sized),
-    result: &mut (impl DeviceMatrixChunkMutImpl<BaseField> + ?Sized),
+    values: &DeviceSlice<GoldilocksField>,
+    results: &mut DeviceSlice<GoldilocksField>,
+    layers_count: u32,
     stream: &CudaStream,
 ) -> CudaResult<()> {
-    assert_eq!(values.cols(), CAPACITY);
-    assert_eq!(result.cols(), CAPACITY);
-    let indexes_len = indexes.len();
-    let values_rows = values.rows();
-    let result_rows = result.rows();
-    assert_eq!(result_rows % indexes_len, 0);
-    let layers_count = result_rows / indexes_len;
-    assert_eq!(values_rows, 1 << (layers_count + 1));
-    assert_eq!(WARP_SIZE % CAPACITY as u32, 0);
-    assert!(indexes_len <= u32::MAX as usize);
-    let indexes_count = indexes_len as u32;
-    let (grid_dim, block_dim) =
-        get_grid_block_dims_for_threads_count(WARP_SIZE / CAPACITY as u32, indexes_count);
-    let grid_dim = (grid_dim.x, layers_count as u32).into();
-    let block_dim = (block_dim.x, CAPACITY as u32).into();
+    assert!(indexes.len() <= u32::MAX as usize);
+    let indexes_count = indexes.len() as u32;
+    assert_eq!(values.len() % CAPACITY, 0);
+    let values_count = values.len() / CAPACITY;
+    assert!(values_count.is_power_of_two());
+    let log_values_count = values_count.trailing_zeros();
+    assert_ne!(log_values_count, 0);
+    let log_leaves_count = log_values_count - 1;
+    assert!(layers_count < log_leaves_count);
+    assert_eq!(
+        indexes.len() * layers_count as usize * CAPACITY,
+        results.len()
+    );
+    let (grid_dim, block_dim) = get_grid_block_dims_for_threads_count(WARP_SIZE, indexes_count);
+    let grid_dim = (grid_dim.x, CAPACITY as u32, layers_count).into();
     let indexes = indexes.as_ptr();
-    let values = values.as_ptr_and_stride();
-    let result = result.as_mut_ptr_and_stride();
-    let args = (&indexes, &indexes_count, &values, &result);
+    let values = values.as_ptr();
+    let result = results.as_mut_ptr();
+    let args = (
+        &indexes,
+        &indexes_count,
+        &values,
+        &log_leaves_count,
+        &result,
+    );
     unsafe {
-        KernelFourArgs::launch(
+        KernelFiveArgs::launch(
             gather_merkle_paths_kernel,
             grid_dim,
             block_dim,
@@ -458,9 +468,11 @@ mod tests {
 
     use boojum::field::{Field, U64Representable};
     use boojum::implementations::poseidon2::state_generic_impl::State;
+    use itertools::Itertools;
     use rand::Rng;
 
     use cudart::memory::{memory_copy_async, DeviceAllocation};
+    use cudart::slice::CudaSlice;
 
     // use boojum::implementations::poseidon_goldilocks::poseidon_permutation_optimized;
     use crate::device_structures::{DeviceMatrix, DeviceMatrixMut};
@@ -927,13 +939,13 @@ mod tests {
     fn gather_merkle_paths() {
         const LOG_LEAVES_COUNT: usize = 12;
         const INDEXES_COUNT: usize = 42;
+        const LAYERS_COUNT: usize = LOG_LEAVES_COUNT - 4;
         let mut rng = rand::thread_rng();
         let mut indexes_host = vec![0; INDEXES_COUNT];
-        indexes_host.fill_with(|| rng.gen_range(0..INDEXES_COUNT as u32));
+        indexes_host.fill_with(|| rng.gen_range(0..1u32 << LOG_LEAVES_COUNT));
         let mut values_host = vec![GoldilocksField::ZERO; CAPACITY << (LOG_LEAVES_COUNT + 1)];
         values_host.fill_with(|| GoldilocksField(rng.gen()));
-        let mut results_host =
-            vec![GoldilocksField::ZERO; CAPACITY * INDEXES_COUNT * LOG_LEAVES_COUNT];
+        let mut results_host = vec![GoldilocksField::ZERO; CAPACITY * INDEXES_COUNT * LAYERS_COUNT];
         let stream = CudaStream::default();
         let mut indexes_device = DeviceAllocation::<u32>::alloc(indexes_host.len()).unwrap();
         let mut values_device =
@@ -944,40 +956,37 @@ mod tests {
         memory_copy_async(&mut values_device, &values_host, &stream).unwrap();
         super::gather_merkle_paths(
             &indexes_device,
-            &DeviceMatrix::new(&values_device, 1 << (LOG_LEAVES_COUNT + 1)),
-            &mut DeviceMatrixMut::new(&mut results_device, INDEXES_COUNT * LOG_LEAVES_COUNT),
+            &values_device,
+            &mut results_device,
+            LAYERS_COUNT as u32,
             &stream,
         )
         .unwrap();
         memory_copy_async(&mut results_host, &results_device, &stream).unwrap();
         stream.synchronize().unwrap();
         fn verify_merkle_path(
+            indexes: &[u32],
             values: &[GoldilocksField],
             results: &[GoldilocksField],
-            row_index: usize,
-            leaf_index: usize,
         ) {
-            let log_leaves_count = values.len().trailing_zeros() - 1;
-            let sibling_index = leaf_index ^ 1;
-            let expected = values[sibling_index];
-            let actual = results[row_index];
-            assert_eq!(expected, actual);
-            if log_leaves_count > 1 {
-                verify_merkle_path(
-                    &values[values.len() >> 1..],
-                    &results[INDEXES_COUNT..],
-                    row_index,
-                    leaf_index >> 1,
-                );
+            let (values, values_next) = values.split_at(values.len() >> 1);
+            let (results, results_next) = results.split_at(INDEXES_COUNT * CAPACITY);
+            values
+                .chunks(values.len() / CAPACITY)
+                .zip(results.chunks(results.len() / CAPACITY))
+                .for_each(|(values, results)| {
+                    for (row_index, &index) in indexes.iter().enumerate() {
+                        let sibling_index = index ^ 1;
+                        let expected = values[sibling_index as usize];
+                        let actual = results[row_index];
+                        assert_eq!(expected, actual);
+                    }
+                });
+            if !results_next.is_empty() {
+                let indexes_next = indexes.iter().map(|&x| x >> 1).collect_vec();
+                verify_merkle_path(&indexes_next, &values_next, &results_next);
             }
         }
-        values_host
-            .chunks(1 << (LOG_LEAVES_COUNT + 1))
-            .zip(results_host.chunks(INDEXES_COUNT * LOG_LEAVES_COUNT))
-            .for_each(|(values, results)| {
-                for (row_index, &leaf_index) in indexes_host.iter().enumerate() {
-                    verify_merkle_path(values, results, row_index, leaf_index as usize);
-                }
-            });
+        verify_merkle_path(&indexes_host, &values_host, &results_host);
     }
 }
diff --git a/boojum-cuda/src/tests_helpers.rs b/src/tests_helpers.rs
similarity index 100%
rename from boojum-cuda/src/tests_helpers.rs
rename to src/tests_helpers.rs
diff --git a/boojum-cuda/src/utils.rs b/src/utils.rs
similarity index 100%
rename from boojum-cuda/src/utils.rs
rename to src/utils.rs