diff --git a/Cargo.toml b/Cargo.toml index 782f56f..f1388c8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,3 +1,53 @@ -[workspace] -members = ["boojum-cuda", "criterion-cuda", "cudart", "cudart-sys"] -resolver = "2" +[package] +name = "boojum-cuda" +version = "0.1.0" +edition = "2021" +build = "build/main.rs" +license = "MIT OR Apache-2.0" + +[build-dependencies] +boojum = { git = "https://github.com/matter-labs/era-boojum.git", branch = "main" } +cudart-sys = { git = "https://github.com/matter-labs/era-cuda.git", branch = "main", package = "cudart-sys" } +cmake = "^0" +itertools = "^0" + +[dependencies] +boojum = { git = "https://github.com/matter-labs/era-boojum.git", branch = "main" } +cudart = { git = "https://github.com/matter-labs/era-cuda.git", branch = "main", package = "cudart" } +cudart-sys = { git = "https://github.com/matter-labs/era-cuda.git", branch = "main", package = "cudart-sys" } +itertools = "^0" +lazy_static = "^1" + +[dev-dependencies] +blake2 = "^0" +criterion = "^0" +criterion-cuda = { git = "https://github.com/matter-labs/era-cuda.git", branch = "main", package = "criterion-cuda" } +criterion-macro = "^0" +itertools = "^0" +rand = "^0" +rayon = "^1" +serial_test = "^2" + +[[bench]] +name = "blake2s" +harness = false + +[[bench]] +name = "gates" +harness = false + +[[bench]] +name = "goldilocks" +harness = false + +[[bench]] +name = "ntt" +harness = false + +[[bench]] +name = "ops_complex" +harness = false + +[[bench]] +name = "poseidon" +harness = false diff --git a/boojum-cuda/benches/blake2s.rs b/benches/blake2s.rs similarity index 100% rename from boojum-cuda/benches/blake2s.rs rename to benches/blake2s.rs diff --git a/boojum-cuda/benches/gates.rs b/benches/gates.rs similarity index 100% rename from boojum-cuda/benches/gates.rs rename to benches/gates.rs diff --git a/boojum-cuda/benches/goldilocks.rs b/benches/goldilocks.rs similarity index 100% rename from boojum-cuda/benches/goldilocks.rs rename to benches/goldilocks.rs diff --git a/boojum-cuda/benches/ntt.rs b/benches/ntt.rs similarity index 100% rename from boojum-cuda/benches/ntt.rs rename to benches/ntt.rs diff --git a/boojum-cuda/benches/ops_complex.rs b/benches/ops_complex.rs similarity index 100% rename from boojum-cuda/benches/ops_complex.rs rename to benches/ops_complex.rs diff --git a/boojum-cuda/benches/poseidon.rs b/benches/poseidon.rs similarity index 100% rename from boojum-cuda/benches/poseidon.rs rename to benches/poseidon.rs diff --git a/boojum-cuda/Cargo.toml b/boojum-cuda/Cargo.toml deleted file mode 100644 index f0d55f1..0000000 --- a/boojum-cuda/Cargo.toml +++ /dev/null @@ -1,53 +0,0 @@ -[package] -name = "boojum-cuda" -version = "0.1.0" -edition = "2021" -build = "build/main.rs" -license = "MIT OR Apache-2.0" - -[build-dependencies] -boojum = { git = "https://github.com/matter-labs/era-boojum.git", branch = "main" } -cudart-sys = { path = "../cudart-sys" } -cmake = "^0" -itertools = "^0" - -[dependencies] -boojum = { git = "https://github.com/matter-labs/era-boojum.git", branch = "main" } -cudart = { path = "../cudart" } -cudart-sys = { path = "../cudart-sys" } -itertools = "^0" -lazy_static = "^1" - -[dev-dependencies] -blake2 = "^0" -criterion = "^0" -criterion-cuda = { path = "../criterion-cuda" } -criterion-macro = "^0" -itertools = "^0" -rand = "^0" -rayon = "^1" -serial_test = "^2" - -[[bench]] -name = "blake2s" -harness = false - -[[bench]] -name = "gates" -harness = false - -[[bench]] -name = "goldilocks" -harness = false - -[[bench]] -name = "ntt" -harness = false - -[[bench]] -name = "ops_complex" -harness = false - -[[bench]] -name = "poseidon" -harness = false diff --git a/boojum-cuda/build/gates.rs b/build/gates.rs similarity index 100% rename from boojum-cuda/build/gates.rs rename to build/gates.rs diff --git a/boojum-cuda/build/main.rs b/build/main.rs similarity index 100% rename from boojum-cuda/build/main.rs rename to build/main.rs diff --git a/boojum-cuda/build/poseidon_constants.rs b/build/poseidon_constants.rs similarity index 100% rename from boojum-cuda/build/poseidon_constants.rs rename to build/poseidon_constants.rs diff --git a/boojum-cuda/build/template.rs b/build/template.rs similarity index 100% rename from boojum-cuda/build/template.rs rename to build/template.rs diff --git a/criterion-cuda/Cargo.toml b/criterion-cuda/Cargo.toml deleted file mode 100644 index 80ada31..0000000 --- a/criterion-cuda/Cargo.toml +++ /dev/null @@ -1,9 +0,0 @@ -[package] -name = "criterion-cuda" -version = "0.1.0" -edition = "2021" -license = "MIT OR Apache-2.0" - -[dependencies] -criterion = "^0" -cudart = { path = "../cudart" } diff --git a/criterion-cuda/src/lib.rs b/criterion-cuda/src/lib.rs deleted file mode 100644 index c0b4d1c..0000000 --- a/criterion-cuda/src/lib.rs +++ /dev/null @@ -1,193 +0,0 @@ -use criterion::measurement::{Measurement, ValueFormatter}; -use criterion::Throughput; - -use cudart::event::{elapsed_time, CudaEvent}; -use cudart::stream::CudaStream; - -// Allows criterion benchmarks to use CUDA event-based timings. -// Based on src/lib.rs from https://github.com/theHamsta/criterion-cuda. -pub struct CudaMeasurement; - -// c.f. https://docs.rs/criterion/latest/criterion/measurement/trait.Measurement.html -impl Measurement for CudaMeasurement { - type Intermediate = (CudaEvent, CudaEvent); - type Value = f32; - - fn start(&self) -> Self::Intermediate { - let stream = CudaStream::default(); - let start_event = CudaEvent::create().expect("Failed to create event"); - let end_event = CudaEvent::create().expect("Failed to create event"); - start_event - .record(&stream) - .expect("Could not record CUDA event"); - (start_event, end_event) - } - - fn end(&self, events: Self::Intermediate) -> Self::Value { - let (start_event, end_event) = events; - let stream = CudaStream::default(); - end_event - .record(&stream) - .expect("Could not record CUDA event"); - stream.synchronize().expect("Failed to synchronize"); - elapsed_time(&start_event, &end_event).expect("Failed to measure time") - } - - fn add(&self, v1: &Self::Value, v2: &Self::Value) -> Self::Value { - v1 + v2 - } - - fn zero(&self) -> Self::Value { - 0f32 - } - - fn to_f64(&self, value: &Self::Value) -> f64 { - *value as f64 - } - - fn formatter(&self) -> &dyn ValueFormatter { - &CudaEventElapsedTimeFormatter:: - } -} - -// based on https://github.com/bheisler/criterion.rs/blob/master/src/measurement.rs -struct CudaEventElapsedTimeFormatter; - -impl CudaEventElapsedTimeFormatter { - fn bytes_per_second(&self, bytes: f64, typical: f64, values: &mut [f64]) -> &'static str { - let bytes_per_second = bytes * (1e3 / typical); - const K: f64 = 1024.0; - let (denominator, unit) = if bytes_per_second < K { - (1.0, " B/s") - } else if bytes_per_second < K * K { - (K, "KiB/s") - } else if bytes_per_second < K * K * K { - (K * K, "MiB/s") - } else { - (K * K * K, "GiB/s") - }; - - for val in values { - let bytes_per_second = bytes * (1e3 / *val); - *val = bytes_per_second / denominator; - } - - unit - } - - fn bytes_per_second_decimal( - &self, - bytes: f64, - typical: f64, - values: &mut [f64], - ) -> &'static str { - let bytes_per_second = bytes * (1e3 / typical); - const K: f64 = 1000.0; - let (denominator, unit) = if bytes_per_second < K { - (1.0, " B/s") - } else if bytes_per_second < K * K { - (K, "KB/s") - } else if bytes_per_second < K * K * K { - (K * K, "MB/s") - } else { - (K * K * K, "GB/s") - }; - - for val in values { - let bytes_per_second = bytes * (1e3 / *val); - *val = bytes_per_second / denominator; - } - - unit - } - - fn elements_per_second(&self, elems: f64, typical: f64, values: &mut [f64]) -> &'static str { - let elems_per_second = elems * (1e3 / typical); - const K: f64 = 1000.0; - let (denominator, unit) = if elems_per_second < K { - (1.0, " elem/s") - } else if elems_per_second < K * K { - (K, "Kelem/s") - } else if elems_per_second < K * K * K { - (K * K, "Melem/s") - } else { - (K * K * K, "Gelem/s") - }; - - for val in values { - let elems_per_second = elems * (1e3 / *val); - *val = elems_per_second / denominator; - } - - unit - } - - fn second_per_element(&self, elems: f64, typical: f64, values: &mut [f64]) -> &'static str { - let ms_per_elem = typical / elems; - let (factor, unit) = if ms_per_elem < 1e-6 { - (1e9, "ps/elem") - } else if ms_per_elem < 1e-3 { - (1e6, "ns/elem") - } else if ms_per_elem < 1e0 { - (1e3, "µs/elem") - } else if ms_per_elem < 1e3 { - (1e0, "ms/elem") - } else { - (1e-3, "s/elem") - }; - - for val in values { - *val *= factor / elems; - } - - unit - } -} - -impl ValueFormatter for CudaEventElapsedTimeFormatter { - fn scale_values(&self, typical_value: f64, values: &mut [f64]) -> &'static str { - let (factor, unit) = if typical_value < 1e-6 { - (1e9, "ps") - } else if typical_value < 1e-3 { - (1e6, "ns") - } else if typical_value < 1e0 { - (1e3, "µs") - } else if typical_value < 1e3 { - (1e0, "ms") - } else { - (1e-3, "s") - }; - - for val in values { - *val *= factor; - } - - unit - } - - fn scale_throughputs( - &self, - typical_value: f64, - throughput: &Throughput, - values: &mut [f64], - ) -> &'static str { - match *throughput { - Throughput::Bytes(bytes) => self.bytes_per_second(bytes as f64, typical_value, values), - Throughput::BytesDecimal(bytes) => { - self.bytes_per_second_decimal(bytes as f64, typical_value, values) - } - Throughput::Elements(elems) => { - if INV_ELEMS { - self.second_per_element(elems as f64, typical_value, values) - } else { - self.elements_per_second(elems as f64, typical_value, values) - } - } - } - } - - fn scale_for_machines(&self, _values: &mut [f64]) -> &'static str { - // no scaling is needed - "ms" - } -} diff --git a/cudart-sys/Cargo.toml b/cudart-sys/Cargo.toml deleted file mode 100644 index 419ea6d..0000000 --- a/cudart-sys/Cargo.toml +++ /dev/null @@ -1,9 +0,0 @@ -[package] -name = "cudart-sys" -version = "0.1.0" -edition = "2021" -license = "MIT OR Apache-2.0" - -[build-dependencies] -bindgen = "^0" -serde_json = "^1" diff --git a/cudart-sys/build.rs b/cudart-sys/build.rs deleted file mode 100644 index 74ee3b8..0000000 --- a/cudart-sys/build.rs +++ /dev/null @@ -1,265 +0,0 @@ -use std::fs; -use std::path::PathBuf; - -use bindgen::callbacks::{EnumVariantValue, ParseCallbacks}; - -include!("src/path.rs"); - -pub fn assert_cuda_version() { - let version = option_env!("CUDA_VERSION").map_or_else( - || { - let file = fs::File::open(concat!(cuda_path!(), "/version.json")) - .expect("CUDA Toolkit not found"); - let reader = std::io::BufReader::new(file); - let value: serde_json::Value = serde_json::from_reader(reader).unwrap(); - dbg!(value["cuda"]["version"].as_str().unwrap().to_string()) - }, - |s| s.to_string(), - ); - assert!( - version.starts_with("12."), - "CUDA Toolkit {version} is not supported. Please install CUDA Toolkit 12.x" - ); -} - -#[derive(Debug)] -struct CudaParseCallbacks; - -impl ParseCallbacks for CudaParseCallbacks { - fn enum_variant_name( - &self, - enum_name: Option<&str>, - original_variant_name: &str, - _variant_value: EnumVariantValue, - ) -> Option { - let strip_prefix = |prefix| { - Some( - original_variant_name - .strip_prefix(prefix) - .unwrap() - .to_string(), - ) - }; - if let Some(enum_name) = enum_name { - match enum_name { - "enum cudaDeviceAttr" => strip_prefix("cudaDevAttr"), - "enum cudaLimit" => strip_prefix("cudaLimit"), - "enum cudaError" => strip_prefix("cuda"), - "enum cudaMemcpyKind" => strip_prefix("cudaMemcpy"), - "enum cudaMemPoolAttr" => strip_prefix("cudaMemPool"), - "enum cudaMemLocationType" => strip_prefix("cudaMemLocationType"), - "enum cudaMemAllocationType" => strip_prefix("cudaMemAllocationType"), - "enum cudaMemAllocationHandleType" => strip_prefix("cudaMemHandleType"), - "enum cudaMemoryType" => strip_prefix("cudaMemoryType"), - "enum cudaMemAccessFlags" => strip_prefix("cudaMemAccessFlagsProt"), - "enum cudaFuncAttribute" => strip_prefix("cudaFuncAttribute"), - "enum cudaFuncCache" => strip_prefix("cudaFuncCache"), - "enum cudaSharedMemConfig" => strip_prefix("cudaSharedMem"), - "enum cudaLaunchAttributeID" => strip_prefix("cudaLaunchAttribute"), - "enum cudaAccessProperty" => strip_prefix("cudaAccessProperty"), - "enum cudaSynchronizationPolicy" => strip_prefix("cudaSyncPolicy"), - "enum cudaClusterSchedulingPolicy" => strip_prefix("cudaClusterSchedulingPolicy"), - "enum cudaLaunchMemSyncDomain" => strip_prefix("cudaLaunchMemSyncDomain"), - _ => None, - } - } else { - None - } - } - - fn item_name(&self, _original_item_name: &str) -> Option { - let from = |s: &str| Some(String::from(s)); - match _original_item_name { - "cudaDeviceAttr" => from("CudaDeviceAttr"), - "cudaLimit" => from("CudaLimit"), - "cudaError" => from("CudaError"), - "cudaDeviceProp" => from("CudaDeviceProperties"), - "cudaMemcpyKind" => from("CudaMemoryCopyKind"), - "cudaMemPoolProps" => from("CudaMemPoolProperties"), - "cudaMemPoolAttr" => from("CudaMemPoolAttribute"), - "cudaMemLocation" => from("CudaMemLocation"), - "cudaMemLocationType" => from("CudaMemLocationType"), - "cudaMemAllocationType" => from("CudaMemAllocationType"), - "cudaMemAllocationHandleType" => from("CudaMemAllocationHandleType"), - "cudaPointerAttributes" => from("CudaPointerAttributes"), - "cudaMemoryType" => from("CudaMemoryType"), - "cudaMemAccessFlags" => from("CudaMemAccessFlags"), - "cudaMemAccessDesc" => from("CudaMemAccessDesc"), - "cudaFuncAttributes" => from("CudaFuncAttributes"), - "cudaFuncAttribute" => from("CudaFuncAttribute"), - "cudaFuncCache" => from("CudaFuncCache"), - "cudaSharedMemConfig" => from("CudaSharedMemConfig"), - "cudaLaunchAttributeID" => from("CudaLaunchAttributeID"), - "cudaLaunchAttributeValue" => from("CudaLaunchAttributeValue"), - "cudaAccessPolicyWindow" => from("CudaAccessPolicyWindow"), - "cudaAccessProperty" => from("CudaAccessProperty"), - "cudaSynchronizationPolicy" => from("CudaSynchronizationPolicy"), - "cudaClusterSchedulingPolicy" => from("CudaClusterSchedulingPolicy"), - "cudaLaunchMemSyncDomain" => from("CudaLaunchMemSyncDomain"), - _ => None, - } - } -} - -fn main() { - #[cfg(target_os = "macos")] - std::process::exit(0); - assert_cuda_version(); - let cuda_lib_path = cuda_lib_path!(); - let cuda_runtime_api_path = concat!(cuda_include_path!(), "/cuda_runtime_api.h"); - println!("cargo:rustc-link-search=native={cuda_lib_path}"); - println!("cargo:rustc-link-lib=cudart"); - println!("cargo:rerun-if-changed={cuda_runtime_api_path}"); - - let bindings = bindgen::Builder::default() - .header(cuda_runtime_api_path) - .parse_callbacks(Box::new(bindgen::CargoCallbacks)) - .parse_callbacks(Box::new(CudaParseCallbacks)) - .size_t_is_usize(true) - .generate_comments(false) - .layout_tests(false) - .allowlist_type("cudaError") - .rustified_enum("cudaError") - .must_use_type("cudaError") - // device management - // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html - .rustified_enum("cudaDeviceAttr") - .allowlist_function("cudaDeviceGetAttribute") - .allowlist_function("cudaDeviceGetDefaultMemPool") - .rustified_enum("cudaLimit") - .allowlist_function("cudaDeviceGetLimit") - .allowlist_function("cudaDeviceGetMemPool") - .allowlist_function("cudaDeviceReset") - .allowlist_function("cudaDeviceSetLimit") - .allowlist_function("cudaDeviceSetMemPool") - .allowlist_function("cudaDeviceSynchronize") - .allowlist_function("cudaGetDevice") - .allowlist_function("cudaGetDeviceCount") - .allowlist_function("cudaGetDeviceProperties_v2") - .allowlist_function("cudaSetDevice") - // error handling - // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__ERROR.html - .allowlist_function("cudaGetErrorName") - .allowlist_function("cudaGetLastError") - .allowlist_function("cudaPeekAtLastError") - // stream management - // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html - .allowlist_function("cudaStreamCreate") - .allowlist_var("cudaStreamDefault") - .allowlist_var("cudaStreamNonBlocking") - .allowlist_function("cudaStreamCreateWithFlags") - .allowlist_function("cudaStreamDestroy") - .allowlist_function("cudaStreamGetAttribute") - .allowlist_function("cudaStreamQuery") - .allowlist_function("cudaStreamSetAttribute") - .allowlist_function("cudaStreamSynchronize") - .allowlist_var("cudaEventWaitDefault") - .allowlist_var("cudaEventWaitExternal") - .allowlist_function("cudaStreamWaitEvent") - // event management - // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html - .allowlist_function("cudaEventCreate") - .allowlist_var("cudaEventDefault") - .allowlist_var("cudaEventBlockingSync") - .allowlist_var("cudaEventDisableTiming") - .allowlist_var("cudaEventInterprocess") - .allowlist_function("cudaEventCreateWithFlags") - .allowlist_function("cudaEventDestroy") - .allowlist_function("cudaEventElapsedTime") - .allowlist_function("cudaEventQuery") - .allowlist_function("cudaEventRecord") - .allowlist_var("cudaEventRecordDefault") - .allowlist_var("cudaEventRecordExternal") - .allowlist_function("cudaEventRecordWithFlags") - .allowlist_function("cudaEventSynchronize") - // execution control - // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EXECUTION.html - .rustified_enum("cudaFuncAttribute") - .allowlist_function("cudaFuncGetAttributes") - .allowlist_function("cudaFuncSetAttribute") - .rustified_enum("cudaFuncCache") - .allowlist_function("cudaFuncSetCacheConfig") - .rustified_enum("cudaSharedMemConfig") - .allowlist_function("cudaFuncSetSharedMemConfig") - .allowlist_function("cudaLaunchHostFunc") - .allowlist_function("cudaLaunchKernel") - .rustified_enum("cudaLaunchAttributeID") - .rustified_enum("cudaAccessProperty") - .rustified_enum("cudaSynchronizationPolicy") - .rustified_enum("cudaClusterSchedulingPolicy") - .rustified_enum("cudaLaunchMemSyncDomain") - .allowlist_function("cudaLaunchKernelExC") - // occupancy - // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__OCCUPANCY.html - .allowlist_function("cudaOccupancyAvailableDynamicSMemPerBlock") - .allowlist_function("cudaOccupancyMaxActiveBlocksPerMultiprocessor") - .allowlist_function("cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags") - .allowlist_function("cudaOccupancyMaxActiveClusters") - .allowlist_function("cudaOccupancyMaxPotentialClusterSize") - // memory management - // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html - .rustified_enum("cudaMemcpyKind") - .allowlist_function("cudaFree") - .allowlist_function("cudaFreeHost") - .allowlist_function("cudaGetSymbolAddress") - .allowlist_function("cudaGetSymbolSize") - .allowlist_var("cudaHostAllocDefault") - .allowlist_var("cudaHostAllocPortable") - .allowlist_var("cudaHostAllocMapped") - .allowlist_var("cudaHostAllocWriteCombined") - .allowlist_function("cudaHostAlloc") - .allowlist_var("cudaHostRegisterDefault") - .allowlist_var("cudaHostRegisterPortable") - .allowlist_var("cudaHostRegisterMapped") - .allowlist_var("cudaHostRegisterIoMemory") - .allowlist_var("cudaHostRegisterReadOnly") - .allowlist_function("cudaHostRegister") - .allowlist_function("cudaHostUnregister") - .allowlist_function("cudaMalloc") - .allowlist_function("cudaMemGetInfo") - .allowlist_function("cudaMemcpy") - .allowlist_function("cudaMemcpyAsync") - .allowlist_function("cudaMemcpyFromSymbol") - .allowlist_function("cudaMemcpyFromSymbolAsync") - .allowlist_function("cudaMemcpyPeer") - .allowlist_function("cudaMemcpyPeerAsync") - .allowlist_function("cudaMemcpyToSymbol") - .allowlist_function("cudaMemcpyToSymbolAsync") - .allowlist_function("cudaMemset") - .allowlist_function("cudaMemsetAsync") - // Stream Ordered Memory Allocator - // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html - .allowlist_function("cudaFreeAsync") - .allowlist_function("cudaMallocAsync") - .allowlist_function("cudaMallocFromPoolAsync") - .rustified_enum("cudaMemLocationType") - .rustified_enum("cudaMemAllocationType") - .rustified_enum("cudaMemAllocationHandleType") - .allowlist_function("cudaMemPoolCreate") - .allowlist_function("cudaMemPoolDestroy") - .rustified_enum("cudaMemPoolAttr") - .rustified_enum("cudaMemAccessFlags") - .allowlist_function("cudaMemPoolGetAccess") - .allowlist_function("cudaMemPoolGetAttribute") - .allowlist_function("cudaMemPoolSetAccess") - .allowlist_function("cudaMemPoolSetAttribute") - .allowlist_function("cudaMemPoolTrimTo") - // Unified Addressing - // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__UNIFIED.html - .rustified_enum("cudaMemoryType") - .allowlist_function("cudaPointerGetAttributes") - // Peer Device Memory Access - // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__PEER.html - .allowlist_function("cudaDeviceCanAccessPeer") - .allowlist_function("cudaDeviceDisablePeerAccess") - .allowlist_function("cudaDeviceEnablePeerAccess") - // - .generate() - .expect("Unable to generate bindings"); - - fs::write( - PathBuf::from("src").join("bindings.rs"), - bindings.to_string(), - ) - .expect("Couldn't write bindings!"); -} diff --git a/cudart-sys/src/.gitignore b/cudart-sys/src/.gitignore deleted file mode 100644 index 55300bd..0000000 --- a/cudart-sys/src/.gitignore +++ /dev/null @@ -1 +0,0 @@ -bindings.rs diff --git a/cudart-sys/src/lib.rs b/cudart-sys/src/lib.rs deleted file mode 100644 index de4027a..0000000 --- a/cudart-sys/src/lib.rs +++ /dev/null @@ -1,78 +0,0 @@ -#![allow(non_upper_case_globals)] -#![allow(non_camel_case_types)] -#![allow(non_snake_case)] - -mod path; - -use std::backtrace::Backtrace; -use std::error::Error; -use std::ffi::CStr; -use std::fmt::{Debug, Display, Formatter}; -use std::mem::MaybeUninit; - -include!("bindings.rs"); - -impl CudaError { - pub fn eprint_error(self) { - if self != CudaError::Success { - eprintln!("CUDA Error: {self}"); - } - } - - pub fn eprint_error_and_backtrace(self) { - if self != CudaError::Success { - self.eprint_error(); - let backtrace = Backtrace::capture(); - eprintln!("Backtrace: {backtrace}"); - } - } -} - -impl Display for CudaError { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - let name = unsafe { CStr::from_ptr(cudaGetErrorName(*self)) }; - name.fmt(f) - } -} - -impl Error for CudaError {} - -impl From for dim3 { - fn from(value: u32) -> Self { - Self { - x: value, - y: 1, - z: 1, - } - } -} - -impl From<(u32, u32)> for dim3 { - fn from(value: (u32, u32)) -> Self { - Self { - x: value.0, - y: value.1, - z: 1, - } - } -} - -impl From<(u32, u32, u32)> for dim3 { - fn from(value: (u32, u32, u32)) -> Self { - Self { - x: value.0, - y: value.1, - z: value.2, - } - } -} - -impl Default for CudaMemPoolProperties { - fn default() -> Self { - let mut s = MaybeUninit::::uninit(); - unsafe { - std::ptr::write_bytes(s.as_mut_ptr(), 0, 1); - s.assume_init() - } - } -} diff --git a/cudart-sys/src/path.rs b/cudart-sys/src/path.rs deleted file mode 100644 index 9268ee5..0000000 --- a/cudart-sys/src/path.rs +++ /dev/null @@ -1,54 +0,0 @@ -#[cfg(target_os = "windows")] -#[macro_export] -macro_rules! cuda_path { - () => { - env!("CUDA_PATH") - }; -} - -#[cfg(target_os = "linux")] -#[macro_export] -macro_rules! cuda_path { - () => { - "/usr/local/cuda" - }; -} - -#[cfg(not(any(target_os = "windows", target_os = "linux")))] -#[macro_export] -macro_rules! cuda_path { - () => { - unimplemented!() - }; -} - -#[macro_export] -macro_rules! cuda_include_path { - () => { - concat!(cuda_path!(), "/include") - }; -} - -#[cfg(target_os = "windows")] -#[macro_export] -macro_rules! cuda_lib_path { - () => { - concat!(cuda_path!(), "/lib/x64") - }; -} - -#[cfg(target_os = "linux")] -#[macro_export] -macro_rules! cuda_lib_path { - () => { - concat!(cuda_path!(), "/lib64") - }; -} - -#[cfg(not(any(target_os = "windows", target_os = "linux")))] -#[macro_export] -macro_rules! cuda_lib_path { - () => { - unimplemented!() - }; -} diff --git a/cudart/Cargo.toml b/cudart/Cargo.toml deleted file mode 100644 index 30efa81..0000000 --- a/cudart/Cargo.toml +++ /dev/null @@ -1,13 +0,0 @@ -[package] -name = "cudart" -version = "0.1.0" -edition = "2021" -license = "MIT OR Apache-2.0" - -[dependencies] -cudart-sys = { path = "../cudart-sys" } -bitflags = "^2" -criterion = "^0" - -[dev-dependencies] -serial_test = "^2" diff --git a/cudart/src/device.rs b/cudart/src/device.rs deleted file mode 100644 index e3d8954..0000000 --- a/cudart/src/device.rs +++ /dev/null @@ -1,181 +0,0 @@ -// device management -// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html - -use std::mem::MaybeUninit; - -use cudart_sys::*; - -use crate::memory_pools::CudaMemPool; -use crate::result::{CudaResult, CudaResultWrap}; - -pub fn device_get_attribute(attr: CudaDeviceAttr, device_id: i32) -> CudaResult { - let mut value = MaybeUninit::::uninit(); - unsafe { cudaDeviceGetAttribute(value.as_mut_ptr(), attr, device_id).wrap_maybe_uninit(value) } -} - -pub fn device_reset() -> CudaResult<()> { - unsafe { cudaDeviceReset().wrap() } -} - -pub fn device_synchronize() -> CudaResult<()> { - unsafe { cudaDeviceSynchronize().wrap() } -} - -pub fn get_device_count() -> CudaResult { - let mut count = MaybeUninit::::uninit(); - unsafe { cudaGetDeviceCount(count.as_mut_ptr()).wrap_maybe_uninit(count) } -} - -pub fn get_device() -> CudaResult { - let mut device_id = MaybeUninit::::uninit(); - unsafe { cudaGetDevice(device_id.as_mut_ptr()).wrap_maybe_uninit(device_id) } -} - -pub fn get_device_properties(device_id: i32) -> CudaResult { - let mut props = MaybeUninit::::uninit(); - unsafe { cudaGetDeviceProperties_v2(props.as_mut_ptr(), device_id).wrap_maybe_uninit(props) } -} - -pub fn set_device(device_id: i32) -> CudaResult<()> { - unsafe { cudaSetDevice(device_id).wrap() } -} - -pub fn device_get_default_mem_pool(device_id: i32) -> CudaResult { - let mut handle = MaybeUninit::::uninit(); - unsafe { - cudaDeviceGetDefaultMemPool(handle.as_mut_ptr(), device_id) - .wrap_maybe_uninit(handle) - .map(CudaMemPool::from_handle) - } -} - -pub fn device_get_mem_pool(device_id: i32) -> CudaResult { - let mut handle = MaybeUninit::::uninit(); - unsafe { - cudaDeviceGetMemPool(handle.as_mut_ptr(), device_id) - .wrap_maybe_uninit(handle) - .map(CudaMemPool::from_handle) - } -} - -pub fn device_set_mem_pool(device_id: i32, pool: &CudaMemPool) -> CudaResult<()> { - unsafe { cudaDeviceSetMemPool(device_id, pool.into()).wrap() } -} - -pub fn device_get_limit(limit: CudaLimit) -> CudaResult { - let mut value = MaybeUninit::::uninit(); - unsafe { cudaDeviceGetLimit(value.as_mut_ptr(), limit).wrap_maybe_uninit(value) } -} - -pub fn device_set_limit(limit: CudaLimit, value: usize) -> CudaResult<()> { - unsafe { cudaDeviceSetLimit(limit, value).wrap() } -} - -#[cfg(test)] -mod tests { - use std::ffi::CStr; - - use serial_test::serial; - - use super::*; - - #[test] - #[serial] - fn device_get_attribute_is_ok() { - let result = device_get_attribute(CudaDeviceAttr::MaxBlocksPerMultiprocessor, 0); - assert!(result.is_ok()); - } - - #[test] - #[serial] - fn device_get_attribute_max_blocks_per_multiprocessor_is_gt_zero() { - let result = device_get_attribute(CudaDeviceAttr::MaxBlocksPerMultiprocessor, 0).unwrap(); - assert!(result > 0); - } - - #[test] - #[serial] - fn device_reset_is_ok() { - let result = device_reset(); - assert_eq!(result, Ok(())); - } - - #[test] - #[serial] - fn device_synchronize_is_ok() { - let result = device_synchronize(); - assert_eq!(result, Ok(())); - } - - #[test] - #[serial] - fn get_device_count_is_not_zero() { - let count = get_device_count().unwrap(); - assert_ne!(count, 0); - } - - #[test] - #[serial] - fn device_id_is_smaller_than_device_count() { - let device_id = get_device().unwrap(); - let count = get_device_count().unwrap(); - assert!(device_id < count); - } - - #[test] - #[serial] - fn device_properties_name_is_not_empty_for_all_devices() { - let count = get_device_count().unwrap(); - for i in 0..count { - let props = get_device_properties(i).unwrap(); - let name = unsafe { CStr::from_ptr(props.name.as_ptr()) } - .to_str() - .unwrap(); - assert!(!name.is_empty()); - } - } - - #[test] - #[serial] - fn set_device_works_for_all_devices() { - let count = get_device_count().unwrap(); - let original_device_id = get_device().unwrap(); - for i in 0..count { - set_device(i).unwrap(); - let current_device_id = get_device().unwrap(); - assert_eq!(i, current_device_id); - } - set_device(original_device_id).unwrap(); - } - - #[test] - #[serial] - fn device_get_default_mem_pool_is_ok_for_all_devices() { - let count = get_device_count().unwrap(); - for i in 0..count { - let result = device_get_default_mem_pool(i); - assert!(result.is_ok()); - } - } - - #[test] - #[serial] - fn device_get_mem_pool_is_ok_for_all_devices() { - let count = get_device_count().unwrap(); - for i in 0..count { - let result = device_get_mem_pool(i); - assert!(result.is_ok()); - } - } - - #[test] - #[serial] - fn device_set_mem_pool_is_ok_for_all_devices() { - let count = get_device_count().unwrap(); - for i in 0..count { - let pool = device_get_mem_pool(i).unwrap(); - let result = device_set_mem_pool(i, &pool); - assert!(result.is_ok()); - } - } -} diff --git a/cudart/src/error.rs b/cudart/src/error.rs deleted file mode 100644 index aa09e52..0000000 --- a/cudart/src/error.rs +++ /dev/null @@ -1,33 +0,0 @@ -// error handling -// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__ERROR.html - -use cudart_sys::*; - -pub fn get_last_error() -> CudaError { - unsafe { cudaGetLastError() } -} - -pub fn peek_at_last_error() -> CudaError { - unsafe { cudaPeekAtLastError() } -} - -#[cfg(test)] -mod tests { - use serial_test::serial; - - use super::*; - - #[test] - #[serial] - fn get_last_error_equals_success() { - let result = get_last_error(); - assert_eq!(result, CudaError::Success) - } - - #[test] - #[serial] - fn peek_at_last_error_equals_success() { - let result = peek_at_last_error(); - assert_eq!(result, CudaError::Success) - } -} diff --git a/cudart/src/event.rs b/cudart/src/event.rs deleted file mode 100644 index 5c244c7..0000000 --- a/cudart/src/event.rs +++ /dev/null @@ -1,237 +0,0 @@ -// event management -// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html - -use std::mem::{self, MaybeUninit}; -use std::ptr::NonNull; - -use bitflags::bitflags; - -use cudart_sys::*; - -use crate::result::{CudaResult, CudaResultWrap}; -use crate::stream::CudaStream; - -#[repr(transparent)] -#[derive(Debug)] -pub struct CudaEvent { - handle: NonNull, -} - -bitflags! { - #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] - pub struct CudaEventCreateFlags: u32 { - const DEFAULT = cudart_sys::cudaEventDefault; - const BLOCKING_SYNC = cudart_sys::cudaEventBlockingSync; - const DISABLE_TIMING = cudart_sys::cudaEventDisableTiming; - const INTERPROCESS = cudart_sys::cudaEventInterprocess; - } -} - -impl Default for CudaEventCreateFlags { - fn default() -> Self { - Self::DEFAULT - } -} - -bitflags! { - #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] - pub struct CudaEventRecordFlags: u32 { - const DEFAULT = cudart_sys::cudaEventRecordDefault; - const EXTERNAL = cudart_sys::cudaEventRecordExternal; - } -} - -impl Default for CudaEventRecordFlags { - fn default() -> Self { - Self::DEFAULT - } -} - -impl CudaEvent { - fn from_handle(handle: cudaEvent_t) -> Self { - Self { - handle: NonNull::new(handle).unwrap(), - } - } - - pub fn create() -> CudaResult { - let mut handle = MaybeUninit::::uninit(); - unsafe { - cudaEventCreate(handle.as_mut_ptr()) - .wrap_maybe_uninit(handle) - .map(Self::from_handle) - } - } - - pub fn create_with_flags(flags: CudaEventCreateFlags) -> CudaResult { - let mut handle = MaybeUninit::::uninit(); - unsafe { - cudaEventCreateWithFlags(handle.as_mut_ptr(), flags.bits()) - .wrap_maybe_uninit(handle) - .map(Self::from_handle) - } - } - - pub fn destroy(self) -> CudaResult<()> { - let handle = self.handle; - mem::forget(self); - unsafe { cudaEventDestroy(handle.as_ptr()).wrap() } - } - - pub fn query(&self) -> CudaResult { - let error = unsafe { cudaEventQuery(self.handle.as_ptr()) }; - match error { - CudaError::Success => Ok(true), - CudaError::ErrorNotReady => Ok(false), - _ => Err(error), - } - } - - pub fn record(&self, stream: &CudaStream) -> CudaResult<()> { - unsafe { cudaEventRecord(self.handle.as_ptr(), stream.into()).wrap() } - } - - pub fn record_with_flags( - &self, - stream: &CudaStream, - flags: CudaEventRecordFlags, - ) -> CudaResult<()> { - unsafe { - cudaEventRecordWithFlags(self.handle.as_ptr(), stream.into(), flags.bits()).wrap() - } - } - - pub fn synchronize(&self) -> CudaResult<()> { - unsafe { cudaEventSynchronize(self.handle.as_ptr()).wrap() } - } -} - -impl Drop for CudaEvent { - fn drop(&mut self) { - unsafe { cudaEventDestroy(self.handle.as_ptr()).eprint_error_and_backtrace() }; - } -} - -impl From<&CudaEvent> for cudaEvent_t { - fn from(event: &CudaEvent) -> Self { - event.handle.as_ptr() - } -} - -pub fn elapsed_time(start: &CudaEvent, end: &CudaEvent) -> CudaResult { - let mut ms = MaybeUninit::::uninit(); - unsafe { - cudaEventElapsedTime(ms.as_mut_ptr(), start.handle.as_ptr(), end.handle.as_ptr()) - .wrap_maybe_uninit(ms) - } -} - -#[cfg(test)] -mod tests { - use std::ptr::null_mut; - use std::thread; - use std::time::Duration; - - use serial_test::serial; - - use crate::execution::{self, HostFn}; - - use super::*; - - #[test] - #[serial] - fn create_is_ok() { - let result = CudaEvent::create(); - assert!(result.is_ok()); - } - - #[test] - #[serial] - fn create_handle_is_not_null() { - let event = CudaEvent::create().unwrap(); - assert_ne!(event.handle.as_ptr(), null_mut()); - } - - #[test] - #[serial] - fn create_with_flags_is_ok() { - let result = CudaEvent::create_with_flags( - CudaEventCreateFlags::DISABLE_TIMING | CudaEventCreateFlags::BLOCKING_SYNC, - ); - assert!(result.is_ok()); - } - - #[test] - #[serial] - fn create_with_flags_handle_is_not_null() { - let event = CudaEvent::create_with_flags(CudaEventCreateFlags::DISABLE_TIMING).unwrap(); - assert_ne!(event.handle.as_ptr(), null_mut()); - } - - #[test] - #[serial] - fn destroy_is_ok() { - let event = CudaEvent::create().unwrap(); - let result = event.destroy(); - assert_eq!(result, Ok(())); - } - - #[test] - #[serial] - fn query_is_true() { - let stream = CudaStream::create().unwrap(); - let event = CudaEvent::create().unwrap(); - event.record(&stream).unwrap(); - stream.synchronize().unwrap(); - let result = event.query(); - assert_eq!(result, Ok(true)); - } - - #[test] - #[serial] - fn query_is_false() { - let stream = CudaStream::create().unwrap(); - let event = CudaEvent::create().unwrap(); - let func = HostFn::new(|| thread::sleep(Duration::from_millis(100))); - execution::launch_host_fn(&stream, &func).unwrap(); - event.record(&stream).unwrap(); - let result = event.query(); - stream.synchronize().unwrap(); - assert_eq!(result, Ok(false)); - } - - #[test] - #[serial] - fn record_is_ok() { - let stream = CudaStream::create().unwrap(); - let event = CudaEvent::create().unwrap(); - let result = event.record(&stream); - stream.synchronize().unwrap(); - assert_eq!(result, Ok(())); - } - - #[test] - #[serial] - fn synchronize_is_ok() { - let stream = CudaStream::create().unwrap(); - let event = CudaEvent::create().unwrap(); - event.record(&stream).unwrap(); - let result = event.synchronize(); - assert_eq!(result, Ok(())); - } - - #[test] - #[serial] - fn elapsed_time_in_range() { - let stream = CudaStream::create().unwrap(); - let start = CudaEvent::create().unwrap(); - let end = CudaEvent::create().unwrap(); - let func = HostFn::new(|| thread::sleep(Duration::from_millis(10))); - start.record(&stream).unwrap(); - execution::launch_host_fn(&stream, &func).unwrap(); - end.record(&stream).unwrap(); - stream.synchronize().unwrap(); - let elapsed = elapsed_time(&start, &end).unwrap(); - assert!(elapsed > 10.0 && elapsed < 100.0); - } -} diff --git a/cudart/src/execution.rs b/cudart/src/execution.rs deleted file mode 100644 index 928069b..0000000 --- a/cudart/src/execution.rs +++ /dev/null @@ -1,512 +0,0 @@ -// execution control -// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EXECUTION.html - -use core::ffi::c_void; -use std::marker::PhantomData; -use std::os::raw::{c_char, c_int, c_uint}; -use std::sync::{Arc, Weak}; - -use cudart_sys::*; - -use crate::result::{CudaResult, CudaResultWrap}; -use crate::stream::CudaStream; - -pub struct KernelArguments<'a> { - vec: Vec<*mut c_void>, - phantom: PhantomData<&'a c_void>, -} - -impl<'a> KernelArguments<'a> { - pub fn new() -> Self { - Self { - vec: vec![], - phantom: PhantomData, - } - } - - pub fn push(&mut self, value: &T) { - self.vec.push(value as *const T as *mut c_void); - } - - pub fn as_mut_ptr(&mut self) -> *mut *mut c_void { - self.vec.as_mut_ptr() - } -} - -impl<'a> Default for KernelArguments<'a> { - fn default() -> Self { - KernelArguments::new() - } -} - -#[macro_export] -macro_rules! kernel_args { - ($($x:expr),* $(,)?) => { - { - let mut args = $crate::execution::KernelArguments::new(); - $( - args.push($x); - )* - args - } - }; -} - -pub use kernel_args; - -#[derive(Debug, Copy, Clone)] -pub enum CudaLaunchAttribute { - Ignore, - AccessPolicyWindow(CudaAccessPolicyWindow), - Cooperative(bool), - SynchronizationPolicy(CudaSynchronizationPolicy), - ClusterDimension(dim3), - ClusterSchedulingPolicyPreference(CudaClusterSchedulingPolicy), - ProgrammaticStreamSerialization(bool), - ProgrammaticEvent(cudaLaunchAttributeValue__bindgen_ty_2), - Priority(i32), - MemSyncDomainMap(cudaLaunchMemSyncDomainMap), - MemSyncDomain(CudaLaunchMemSyncDomain), -} - -impl CudaLaunchAttribute { - pub(crate) fn from_id_and_value( - id: CudaLaunchAttributeID, - value: CudaLaunchAttributeValue, - ) -> Self { - unsafe { - match id { - CudaLaunchAttributeID::Ignore => Self::Ignore, - CudaLaunchAttributeID::AccessPolicyWindow => { - Self::AccessPolicyWindow(value.accessPolicyWindow) - } - CudaLaunchAttributeID::Cooperative => Self::Cooperative(value.cooperative != 0), - CudaLaunchAttributeID::SynchronizationPolicy => { - Self::SynchronizationPolicy(value.syncPolicy) - } - CudaLaunchAttributeID::ClusterDimension => { - let d = value.clusterDim; - Self::ClusterDimension(dim3 { - x: d.x, - y: d.y, - z: d.z, - }) - } - CudaLaunchAttributeID::ClusterSchedulingPolicyPreference => { - Self::ClusterSchedulingPolicyPreference(value.clusterSchedulingPolicyPreference) - } - CudaLaunchAttributeID::ProgrammaticStreamSerialization => { - Self::ProgrammaticStreamSerialization( - value.programmaticStreamSerializationAllowed != 0, - ) - } - CudaLaunchAttributeID::ProgrammaticEvent => { - Self::ProgrammaticEvent(value.programmaticEvent) - } - CudaLaunchAttributeID::Priority => Self::Priority(value.priority), - CudaLaunchAttributeID::MemSyncDomainMap => { - Self::MemSyncDomainMap(value.memSyncDomainMap) - } - CudaLaunchAttributeID::MemSyncDomain => Self::MemSyncDomain(value.memSyncDomain), - } - } - } - - pub(crate) fn into_id_and_value(self) -> (CudaLaunchAttributeID, CudaLaunchAttributeValue) { - match self { - CudaLaunchAttribute::Ignore => ( - CudaLaunchAttributeID::Ignore, - CudaLaunchAttributeValue { pad: [0; 64] }, - ), - CudaLaunchAttribute::AccessPolicyWindow(access_policy_window) => ( - CudaLaunchAttributeID::AccessPolicyWindow, - CudaLaunchAttributeValue { - accessPolicyWindow: access_policy_window, - }, - ), - CudaLaunchAttribute::Cooperative(cooperative) => ( - CudaLaunchAttributeID::Cooperative, - CudaLaunchAttributeValue { - cooperative: cooperative as c_int, - }, - ), - CudaLaunchAttribute::SynchronizationPolicy(sync_policy) => ( - CudaLaunchAttributeID::SynchronizationPolicy, - CudaLaunchAttributeValue { - syncPolicy: sync_policy, - }, - ), - CudaLaunchAttribute::ClusterDimension(cluster_dim) => ( - CudaLaunchAttributeID::ClusterDimension, - CudaLaunchAttributeValue { - clusterDim: cudaLaunchAttributeValue__bindgen_ty_1 { - x: cluster_dim.x, - y: cluster_dim.y, - z: cluster_dim.z, - }, - }, - ), - CudaLaunchAttribute::ClusterSchedulingPolicyPreference( - cluster_scheduling_policy_preference, - ) => ( - CudaLaunchAttributeID::ClusterSchedulingPolicyPreference, - CudaLaunchAttributeValue { - clusterSchedulingPolicyPreference: cluster_scheduling_policy_preference, - }, - ), - CudaLaunchAttribute::ProgrammaticStreamSerialization( - programmatic_stream_serialization_allowed, - ) => ( - CudaLaunchAttributeID::ProgrammaticStreamSerialization, - CudaLaunchAttributeValue { - programmaticStreamSerializationAllowed: - programmatic_stream_serialization_allowed as c_int, - }, - ), - CudaLaunchAttribute::ProgrammaticEvent(programmatic_event) => ( - CudaLaunchAttributeID::ProgrammaticEvent, - CudaLaunchAttributeValue { - programmaticEvent: programmatic_event, - }, - ), - CudaLaunchAttribute::Priority(priority) => ( - CudaLaunchAttributeID::Priority, - CudaLaunchAttributeValue { priority }, - ), - CudaLaunchAttribute::MemSyncDomainMap(mem_sync_domain_map) => ( - CudaLaunchAttributeID::MemSyncDomainMap, - CudaLaunchAttributeValue { - memSyncDomainMap: mem_sync_domain_map, - }, - ), - CudaLaunchAttribute::MemSyncDomain(mem_sync_domain) => ( - CudaLaunchAttributeID::MemSyncDomain, - CudaLaunchAttributeValue { - memSyncDomain: mem_sync_domain, - }, - ), - } - } - - fn into_raw(self) -> cudaLaunchAttribute { - let (id, val) = self.into_id_and_value(); - cudaLaunchAttribute { - id, - pad: [c_char::default(); 4], - val, - } - } -} - -pub trait Kernel: Sized { - fn get_kernel_raw(self) -> *const c_void; -} - -pub trait KernelLaunch<'a>: Kernel { - type Args: Into>; - - #[allow(clippy::missing_safety_doc)] - unsafe fn launch( - self, - grid_dim: dim3, - block_dim: dim3, - args: Self::Args, - shared_mem: usize, - stream: &CudaStream, - ) -> CudaResult<()> { - cudaLaunchKernel( - self.get_kernel_raw(), - grid_dim, - block_dim, - args.into().as_mut_ptr(), - shared_mem, - stream.into(), - ) - .wrap() - } - - #[allow(clippy::missing_safety_doc)] - unsafe fn launch_ex( - self, - grid_dim: dim3, - block_dim: dim3, - args: Self::Args, - shared_mem: usize, - stream: &CudaStream, - attributes: &[CudaLaunchAttribute], - ) -> CudaResult<()> { - let mut attributes = attributes - .iter() - .map(|attribute| (*attribute).into_raw()) - .collect::>(); - let config = cudaLaunchConfig_t { - gridDim: grid_dim, - blockDim: block_dim, - dynamicSmemBytes: shared_mem, - stream: stream.into(), - attrs: attributes.as_mut_ptr(), - numAttrs: attributes.len() as c_uint, - }; - cudaLaunchKernelExC( - &config as *const _, - self.get_kernel_raw(), - args.into().as_mut_ptr(), - ) - .wrap() - } -} - -impl<'a> From<()> for KernelArguments<'a> { - fn from(_value: ()) -> Self { - KernelArguments::default() - } -} - -pub type KernelNoArgs = unsafe extern "C" fn(); - -impl Kernel for KernelNoArgs { - fn get_kernel_raw(self) -> *const c_void { - self as *const c_void - } -} - -impl<'a> KernelLaunch<'a> for KernelNoArgs { - type Args = (); -} - -impl<'a, T> From<(&T,)> for KernelArguments<'a> { - fn from(value: (&T,)) -> Self { - kernel_args![value.0] - } -} - -pub type KernelOneArg = unsafe extern "C" fn(T0); - -impl Kernel for KernelOneArg { - fn get_kernel_raw(self) -> *const c_void { - self as *const c_void - } -} - -impl<'a, T0: 'a> KernelLaunch<'a> for KernelOneArg { - type Args = (&'a T0,); -} - -impl<'a, T0, T1> From<(&T0, &T1)> for KernelArguments<'a> { - fn from(value: (&T0, &T1)) -> Self { - kernel_args![value.0, value.1] - } -} - -pub type KernelTwoArgs = unsafe extern "C" fn(T0, T1); - -impl<'a, T0: 'a, T1: 'a> Kernel for KernelTwoArgs { - fn get_kernel_raw(self) -> *const c_void { - self as *const c_void - } -} - -impl<'a, T0: 'a, T1: 'a> KernelLaunch<'a> for KernelTwoArgs { - type Args = (&'a T0, &'a T1); -} - -impl<'a, T0, T1, T2> From<(&T0, &T1, &T2)> for KernelArguments<'a> { - fn from(value: (&T0, &T1, &T2)) -> Self { - kernel_args![value.0, value.1, value.2] - } -} - -pub type KernelThreeArgs = unsafe extern "C" fn(T0, T1, T2); - -impl<'a, T0: 'a, T1: 'a, T2: 'a> Kernel for KernelThreeArgs { - fn get_kernel_raw(self) -> *const c_void { - self as *const c_void - } -} - -impl<'a, T0: 'a, T1: 'a, T2: 'a> KernelLaunch<'a> for KernelThreeArgs { - type Args = (&'a T0, &'a T1, &'a T2); -} - -impl<'a, T0, T1, T2, T3> From<(&T0, &T1, &T2, &T3)> for KernelArguments<'a> { - fn from(value: (&T0, &T1, &T2, &T3)) -> Self { - kernel_args![value.0, value.1, value.2, value.3] - } -} - -pub type KernelFourArgs = unsafe extern "C" fn(T0, T1, T2, T3); - -impl<'a, T0: 'a, T1: 'a, T2: 'a, T3: 'a> Kernel for KernelFourArgs { - fn get_kernel_raw(self) -> *const c_void { - self as *const c_void - } -} - -impl<'a, T0: 'a, T1: 'a, T2: 'a, T3: 'a> KernelLaunch<'a> for KernelFourArgs { - type Args = (&'a T0, &'a T1, &'a T2, &'a T3); -} - -impl<'a, T0, T1, T2, T3, T4> From<(&T0, &T1, &T2, &T3, &T4)> for KernelArguments<'a> { - fn from(value: (&T0, &T1, &T2, &T3, &T4)) -> Self { - kernel_args![value.0, value.1, value.2, value.3, value.4] - } -} - -pub type KernelFiveArgs = unsafe extern "C" fn(T0, T1, T2, T3, T4); - -impl<'a, T0: 'a, T1: 'a, T2: 'a, T3: 'a, T4: 'a> Kernel for KernelFiveArgs { - fn get_kernel_raw(self) -> *const c_void { - self as *const c_void - } -} - -impl<'a, T0: 'a, T1: 'a, T2: 'a, T3: 'a, T4: 'a> KernelLaunch<'a> - for KernelFiveArgs -{ - type Args = (&'a T0, &'a T1, &'a T2, &'a T3, &'a T4); -} - -impl<'a, T0, T1, T2, T3, T4, T5> From<(&T0, &T1, &T2, &T3, &T4, &T5)> for KernelArguments<'a> { - fn from(value: (&T0, &T1, &T2, &T3, &T4, &T5)) -> Self { - kernel_args![value.0, value.1, value.2, value.3, value.4, value.5] - } -} - -pub type KernelSixArgs = unsafe extern "C" fn(T0, T1, T2, T3, T4, T5); - -impl<'a, T0: 'a, T1: 'a, T2: 'a, T3: 'a, T4: 'a, T5: 'a> Kernel - for KernelSixArgs -{ - fn get_kernel_raw(self) -> *const c_void { - self as *const c_void - } -} - -impl<'a, T0: 'a, T1: 'a, T2: 'a, T3: 'a, T4: 'a, T5: 'a> KernelLaunch<'a> - for KernelSixArgs -{ - type Args = (&'a T0, &'a T1, &'a T2, &'a T3, &'a T4, &'a T5); -} - -impl<'a, T0, T1, T2, T3, T4, T5, T6> From<(&T0, &T1, &T2, &T3, &T4, &T5, &T6)> - for KernelArguments<'a> -{ - fn from(value: (&T0, &T1, &T2, &T3, &T4, &T5, &T6)) -> Self { - kernel_args![value.0, value.1, value.2, value.3, value.4, value.5, value.6] - } -} - -pub type KernelSevenArgs = - unsafe extern "C" fn(T0, T1, T2, T3, T4, T5, T6); - -impl<'a, T0: 'a, T1: 'a, T2: 'a, T3: 'a, T4: 'a, T5: 'a, T6: 'a> Kernel - for KernelSevenArgs -{ - fn get_kernel_raw(self) -> *const c_void { - self as *const c_void - } -} - -impl<'a, T0: 'a, T1: 'a, T2: 'a, T3: 'a, T4: 'a, T5: 'a, T6: 'a> KernelLaunch<'a> - for KernelSevenArgs -{ - type Args = (&'a T0, &'a T1, &'a T2, &'a T3, &'a T4, &'a T5, &'a T6); -} - -impl<'a, T0, T1, T2, T3, T4, T5, T6, T7> From<(&T0, &T1, &T2, &T3, &T4, &T5, &T6, &T7)> - for KernelArguments<'a> -{ - fn from(value: (&T0, &T1, &T2, &T3, &T4, &T5, &T6, &T7)) -> Self { - kernel_args![value.0, value.1, value.2, value.3, value.4, value.5, value.6, value.7] - } -} - -pub type KernelEightArgs = - unsafe extern "C" fn(T0, T1, T2, T3, T4, T5, T6, T7); - -impl<'a, T0: 'a, T1: 'a, T2: 'a, T3: 'a, T4: 'a, T5: 'a, T6: 'a, T7: 'a> Kernel - for KernelEightArgs -{ - fn get_kernel_raw(self) -> *const c_void { - self as *const c_void - } -} - -impl<'a, T0: 'a, T1: 'a, T2: 'a, T3: 'a, T4: 'a, T5: 'a, T6: 'a, T7: 'a> KernelLaunch<'a> - for KernelEightArgs -{ - type Args = ( - &'a T0, - &'a T1, - &'a T2, - &'a T3, - &'a T4, - &'a T5, - &'a T6, - &'a T7, - ); -} - -pub struct HostFn<'a> { - arc: Arc>, -} - -impl<'a> HostFn<'a> { - pub fn new(func: impl Fn() + Send + 'a) -> Self { - Self { - arc: Arc::new(Box::new(func) as Box), - } - } -} - -unsafe extern "C" fn launch_host_fn_callback(data: *mut c_void) { - let raw = data as *const Box; - let weak = Weak::from_raw(raw); - if let Some(func) = weak.upgrade() { - func(); - } -} - -pub fn get_raw_fn_and_data(host_fn: &HostFn) -> (cudaHostFn_t, *mut c_void) { - let weak = Arc::downgrade(&host_fn.arc); - let raw = weak.into_raw(); - let data = raw as *mut c_void; - (Some(launch_host_fn_callback), data) -} - -pub fn launch_host_fn(stream: &CudaStream, host_fn: &HostFn) -> CudaResult<()> { - let (func, data) = get_raw_fn_and_data(host_fn); - unsafe { cudaLaunchHostFunc(stream.into(), func, data).wrap() } -} - -#[cfg(test)] -mod tests { - use std::sync::Mutex; - use std::thread; - use std::time::Duration; - - use serial_test::serial; - - use super::*; - - #[test] - #[serial] - fn host_fn_add_executes_one_time() { - let stream = CudaStream::create().unwrap(); - let mut a = 0; - let add = || { - a += 1; - thread::sleep(Duration::from_millis(10)); - }; - let add_mutex = Mutex::new(add); - let add_fn = HostFn::new(move || add_mutex.lock().unwrap()()); - let sleep_fn = HostFn::new(|| thread::sleep(Duration::from_millis(10))); - launch_host_fn(&stream, &add_fn).unwrap(); - stream.synchronize().unwrap(); - launch_host_fn(&stream, &sleep_fn).unwrap(); - launch_host_fn(&stream, &add_fn).unwrap(); - drop(add_fn); - stream.synchronize().unwrap(); - assert_eq!(a, 1); - } -} diff --git a/cudart/src/lib.rs b/cudart/src/lib.rs deleted file mode 100644 index a0a79bc..0000000 --- a/cudart/src/lib.rs +++ /dev/null @@ -1,19 +0,0 @@ -#![feature(min_specialization)] -#![feature(ptr_metadata)] -#![feature(trusted_len)] -#![feature(trusted_random_access)] - -extern crate core; - -pub mod device; -pub mod error; -pub mod event; -pub mod execution; -pub mod memory; -pub mod memory_pools; -pub mod occupancy; -pub mod peer; -pub mod result; -pub mod slice; -pub mod stream; -pub mod unified; diff --git a/cudart/src/memory.rs b/cudart/src/memory.rs deleted file mode 100644 index ec186b3..0000000 --- a/cudart/src/memory.rs +++ /dev/null @@ -1,818 +0,0 @@ -// memory management -// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html - -use core::ffi::c_void; -use std::alloc::Layout; -use std::mem::{self, MaybeUninit}; -use std::ops::{Deref, DerefMut}; - -use bitflags::bitflags; - -use cudart_sys::*; - -use crate::result::{CudaResult, CudaResultWrap}; -use crate::slice::{AllocationData, CudaSlice, CudaSliceMut, DeviceSlice}; -use crate::stream::CudaStream; - -#[repr(transparent)] -#[derive(Debug)] -pub struct DeviceAllocation(AllocationData); - -impl DeviceAllocation { - pub fn alloc(length: usize) -> CudaResult { - let layout = Layout::array::(length).unwrap(); - let mut dev_ptr = MaybeUninit::<*mut c_void>::uninit(); - unsafe { - cudaMalloc(dev_ptr.as_mut_ptr(), layout.size()) - .wrap_maybe_uninit(dev_ptr) - .map(|ptr| { - Self(AllocationData { - ptr: ptr as *mut T, - len: length, - }) - }) - } - } - - pub fn free(self) -> CudaResult<()> { - unsafe { - let ptr = self.0.ptr as *mut c_void; - mem::forget(self); - cudaFree(ptr).wrap() - } - } - - /// # Safety - /// - /// The caller must ensure that the inputs are valid. - pub unsafe fn from_raw_parts(ptr: *mut T, len: usize) -> Self { - Self(AllocationData { ptr, len }) - } - - pub fn into_raw_parts(self) -> (*mut T, usize) { - let result = (self.0.ptr, self.0.len); - mem::forget(self); - result - } -} - -impl Drop for DeviceAllocation { - fn drop(&mut self) { - unsafe { cudaFree(self.as_mut_c_void_ptr()).eprint_error_and_backtrace() }; - } -} - -impl Deref for DeviceAllocation { - type Target = DeviceSlice; - - fn deref(&self) -> &Self::Target { - Self::Target::from_allocation_data(&self.0) - } -} - -impl DerefMut for DeviceAllocation { - fn deref_mut(&mut self) -> &mut Self::Target { - Self::Target::from_mut_allocation_data(&mut self.0) - } -} - -impl AsRef> for DeviceAllocation { - fn as_ref(&self) -> &DeviceSlice { - self.deref() - } -} - -impl AsMut> for DeviceAllocation { - fn as_mut(&mut self) -> &mut DeviceSlice { - self.deref_mut() - } -} - -impl CudaSlice for DeviceAllocation { - unsafe fn as_slice(&self) -> &[T] { - self.0.as_slice() - } -} - -impl CudaSliceMut for DeviceAllocation { - unsafe fn as_mut_slice(&mut self) -> &mut [T] { - self.0.as_mut_slice() - } -} - -bitflags! { - #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] - pub struct CudaHostAllocFlags: u32 { - const DEFAULT = cudart_sys::cudaHostAllocDefault; - const PORTABLE = cudart_sys::cudaHostAllocPortable; - const MAPPED = cudart_sys::cudaHostAllocMapped; - const WRITE_COMBINED = cudart_sys::cudaHostAllocWriteCombined; - } -} - -impl Default for CudaHostAllocFlags { - fn default() -> Self { - Self::DEFAULT - } -} - -#[repr(transparent)] -#[derive(Debug)] -pub struct HostAllocation(AllocationData); - -impl HostAllocation { - pub fn alloc(length: usize, flags: CudaHostAllocFlags) -> CudaResult { - let layout = Layout::array::(length).unwrap(); - let mut ptr = MaybeUninit::<*mut c_void>::uninit(); - unsafe { - cudaHostAlloc(ptr.as_mut_ptr(), layout.size(), flags.bits()) - .wrap_maybe_uninit(ptr) - .map(|ptr| { - Self(AllocationData { - ptr: ptr as *mut T, - len: length, - }) - }) - } - } - - pub fn free(self) -> CudaResult<()> { - unsafe { - let ptr = self.0.ptr as *mut c_void; - mem::forget(self); - cudaFreeHost(ptr).wrap() - } - } - - /// # Safety - /// - /// The caller must ensure that the inputs are valid. - pub unsafe fn from_raw_parts(ptr: *mut T, len: usize) -> Self { - Self(AllocationData { ptr, len }) - } - - pub fn into_raw_parts(self) -> (*mut T, usize) { - let result = (self.0.ptr, self.0.len); - mem::forget(self); - result - } -} - -impl Drop for HostAllocation { - fn drop(&mut self) { - unsafe { cudaFreeHost(self.0.ptr as *mut c_void).eprint_error_and_backtrace() }; - } -} - -impl Deref for HostAllocation { - type Target = [T]; - fn deref(&self) -> &Self::Target { - unsafe { self.0.as_slice() } - } -} - -impl DerefMut for HostAllocation { - fn deref_mut(&mut self) -> &mut Self::Target { - unsafe { self.0.as_mut_slice() } - } -} - -impl AsRef<[T]> for HostAllocation { - fn as_ref(&self) -> &[T] { - self.deref() - } -} - -impl AsMut<[T]> for HostAllocation { - fn as_mut(&mut self) -> &mut [T] { - self.deref_mut() - } -} - -bitflags! { - #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] - pub struct CudaHostRegisterFlags: u32 { - const DEFAULT = cudart_sys::cudaHostRegisterDefault; - const PORTABLE = cudart_sys::cudaHostRegisterPortable; - const MAPPED = cudart_sys::cudaHostRegisterMapped; - const IO_MEMORY = cudart_sys::cudaHostRegisterIoMemory; - const READ_ONLY = cudart_sys::cudaHostRegisterReadOnly; - } -} - -impl Default for CudaHostRegisterFlags { - fn default() -> Self { - Self::DEFAULT - } -} - -#[repr(transparent)] -#[derive(Debug)] -pub struct HostRegistration<'a, T>(&'a [T]); - -impl<'a, T> HostRegistration<'a, T> { - pub fn register(slice: &'a [T], flags: CudaHostRegisterFlags) -> CudaResult { - let length = slice.len(); - let layout = Layout::array::(length).unwrap(); - unsafe { - cudaHostRegister( - slice.as_c_void_ptr() as *mut c_void, - layout.size(), - flags.bits(), - ) - .wrap_value(Self(slice)) - } - } - - pub fn unregister(self) -> CudaResult<()> { - unsafe { cudaHostUnregister(self.0.as_c_void_ptr() as *mut c_void).wrap() } - } -} - -impl Drop for HostRegistration<'_, T> { - fn drop(&mut self) { - unsafe { - cudaHostUnregister(self.0.as_c_void_ptr() as *mut c_void).eprint_error_and_backtrace() - }; - } -} - -impl Deref for HostRegistration<'_, T> { - type Target = [T]; - - fn deref(&self) -> &Self::Target { - self.0 - } -} - -impl AsRef<[T]> for HostRegistration<'_, T> { - fn as_ref(&self) -> &[T] { - self.0 - } -} - -#[repr(transparent)] -#[derive(Debug)] -pub struct HostRegistrationMut<'a, T>(&'a mut [T]); - -impl<'a, T> HostRegistrationMut<'a, T> { - pub fn register(slice: &'a mut [T], flags: CudaHostRegisterFlags) -> CudaResult { - let length = slice.len(); - let layout = Layout::array::(length).unwrap(); - unsafe { - cudaHostRegister(slice.as_mut_c_void_ptr(), layout.size(), flags.bits()) - .wrap_value(Self(slice)) - } - } - - pub fn unregister(self) -> CudaResult<()> { - unsafe { cudaHostUnregister(self.0.as_mut_c_void_ptr()).wrap() } - } -} - -impl Drop for HostRegistrationMut<'_, T> { - fn drop(&mut self) { - unsafe { cudaHostUnregister(self.0.as_mut_c_void_ptr()).eprint_error_and_backtrace() }; - } -} - -impl Deref for HostRegistrationMut<'_, T> { - type Target = [T]; - - fn deref(&self) -> &Self::Target { - self.0 - } -} - -impl DerefMut for HostRegistrationMut<'_, T> { - fn deref_mut(&mut self) -> &mut Self::Target { - self.0 - } -} - -impl AsRef<[T]> for HostRegistrationMut<'_, T> { - fn as_ref(&self) -> &[T] { - self.0 - } -} - -impl AsMut<[T]> for HostRegistrationMut<'_, T> { - fn as_mut(&mut self) -> &mut [T] { - self.0 - } -} - -pub fn memory_copy( - dst: &mut (impl CudaSliceMut + ?Sized), - src: &(impl CudaSlice + ?Sized), -) -> CudaResult<()> { - memory_copy_with_kind(dst, src, CudaMemoryCopyKind::Default) -} - -pub fn memory_copy_with_kind( - dst: &mut (impl CudaSliceMut + ?Sized), - src: &(impl CudaSlice + ?Sized), - kind: CudaMemoryCopyKind, -) -> CudaResult<()> { - unsafe { - assert_eq!( - dst.len(), - src.len(), - "dst length and src length must be equal" - ); - let layout = Layout::array::(dst.len()).unwrap(); - cudaMemcpy( - dst.as_mut_c_void_ptr(), - src.as_c_void_ptr(), - layout.size(), - kind, - ) - .wrap() - } -} - -pub fn memory_copy_async( - dst: &mut (impl CudaSliceMut + ?Sized), - src: &(impl CudaSlice + ?Sized), - stream: &CudaStream, -) -> CudaResult<()> { - memory_copy_with_kind_async(dst, src, CudaMemoryCopyKind::Default, stream) -} - -pub fn memory_copy_with_kind_async( - dst: &mut (impl CudaSliceMut + ?Sized), - src: &(impl CudaSlice + ?Sized), - kind: CudaMemoryCopyKind, - stream: &CudaStream, -) -> CudaResult<()> { - unsafe { - assert_eq!( - dst.len(), - src.len(), - "dst length and src length must be equal" - ); - let layout = Layout::array::(dst.len()).unwrap(); - cudaMemcpyAsync( - dst.as_mut_c_void_ptr(), - src.as_c_void_ptr(), - layout.size(), - kind, - stream.into(), - ) - .wrap() - } -} - -pub fn memory_set(dst: &mut (impl CudaSliceMut + ?Sized), value: u8) -> CudaResult<()> { - unsafe { - let layout = Layout::array::(dst.len()).unwrap(); - cudaMemset(dst.as_mut_c_void_ptr(), value as i32, layout.size()).wrap() - } -} - -pub fn memory_set_async( - dst: &mut (impl CudaSliceMut + ?Sized), - value: u8, - stream: &CudaStream, -) -> CudaResult<()> { - unsafe { - let layout = Layout::array::(dst.len()).unwrap(); - cudaMemsetAsync( - dst.as_mut_c_void_ptr(), - value as i32, - layout.size(), - stream.into(), - ) - .wrap() - } -} - -pub fn memory_get_info() -> CudaResult<(usize, usize)> { - let mut free = MaybeUninit::::uninit(); - let mut total = MaybeUninit::::uninit(); - unsafe { - let error = cudaMemGetInfo(free.as_mut_ptr(), total.as_mut_ptr()); - if error == CudaError::Success { - Ok((free.assume_init(), total.assume_init())) - } else { - Err(error) - } - } -} - -#[derive(Copy, Clone, Default, Debug, PartialEq, Eq)] -pub struct HostAllocator { - flags: CudaHostAllocFlags, -} - -impl HostAllocator { - pub fn new(flags: CudaHostAllocFlags) -> Self { - Self { flags } - } -} - -#[cfg(test)] -mod tests { - use serial_test::serial; - - use super::*; - - const LENGTH: usize = 1024; - - #[test] - #[serial] - fn device_allocation_alloc_is_ok() { - let result = DeviceAllocation::::alloc(LENGTH); - assert!(result.is_ok()); - } - - #[test] - #[serial] - fn device_allocation_free_is_ok() { - let allocation = DeviceAllocation::::alloc(LENGTH).unwrap(); - let result = allocation.free(); - assert_eq!(result, Ok(())); - } - - #[test] - #[serial] - fn device_allocation_alloc_len_eq_length() { - let allocation = DeviceAllocation::::alloc(LENGTH).unwrap(); - assert_eq!(allocation.len(), LENGTH); - } - - #[test] - #[serial] - fn device_allocation_alloc_is_empty_is_false() { - let allocation = DeviceAllocation::::alloc(LENGTH).unwrap(); - assert!(!allocation.is_empty()); - } - - #[test] - #[serial] - fn device_allocation_deref_len_eq_length() { - let allocation = DeviceAllocation::::alloc(LENGTH).unwrap(); - let slice = allocation.deref(); - assert_eq!(slice.len(), LENGTH); - } - - #[test] - #[serial] - fn device_allocation_deref_mut_len_eq_length() { - let mut allocation = DeviceAllocation::::alloc(LENGTH).unwrap(); - let slice = allocation.deref_mut(); - assert_eq!(slice.len(), LENGTH); - } - - #[test] - #[serial] - fn device_allocation_slice_index_len_eq_length() { - let allocation = DeviceAllocation::::alloc(LENGTH).unwrap(); - let slice = &allocation[..]; - assert_eq!(slice.len(), LENGTH); - } - - #[test] - #[serial] - fn device_allocation_mut_slice_index_mut_len_eq_length() { - let mut allocation = DeviceAllocation::::alloc(LENGTH).unwrap(); - let slice = &mut allocation[..]; - assert_eq!(slice.len(), LENGTH); - } - - #[test] - #[serial] - fn host_allocation_alloc_is_ok() { - let result = HostAllocation::::alloc(LENGTH, CudaHostAllocFlags::DEFAULT); - assert!(result.is_ok()); - } - - #[test] - #[serial] - fn host_allocation_free_is_ok() { - let allocation = HostAllocation::::alloc(LENGTH, CudaHostAllocFlags::DEFAULT).unwrap(); - let result = allocation.free(); - assert_eq!(result, Ok(())); - } - - #[test] - #[serial] - fn host_allocation_alloc_len_eq_length() { - let allocation = HostAllocation::::alloc(LENGTH, CudaHostAllocFlags::DEFAULT).unwrap(); - assert_eq!(allocation.len(), LENGTH); - } - - #[test] - #[serial] - fn host_allocation_alloc_is_empty_is_false() { - let allocation = HostAllocation::::alloc(LENGTH, CudaHostAllocFlags::DEFAULT).unwrap(); - assert!(!allocation.is_empty()); - } - - #[test] - #[serial] - fn host_allocation_deref_len_eq_length() { - let allocation = HostAllocation::::alloc(LENGTH, CudaHostAllocFlags::DEFAULT).unwrap(); - let slice = allocation.deref(); - assert_eq!(slice.len(), LENGTH); - } - - #[test] - #[serial] - fn host_allocation_deref_mut_len_eq_length() { - let mut allocation = - HostAllocation::::alloc(LENGTH, CudaHostAllocFlags::DEFAULT).unwrap(); - let slice = allocation.deref_mut(); - assert_eq!(slice.len(), LENGTH); - } - - #[test] - #[serial] - fn host_allocation_index_len_eq_length() { - let allocation = HostAllocation::::alloc(LENGTH, CudaHostAllocFlags::DEFAULT).unwrap(); - let slice = &allocation[..]; - assert_eq!(slice.len(), LENGTH); - } - - #[test] - #[serial] - fn host_allocation_index_mut_len_eq_length() { - let mut allocation = - HostAllocation::::alloc(LENGTH, CudaHostAllocFlags::DEFAULT).unwrap(); - let slice = &mut allocation[..]; - assert_eq!(slice.len(), LENGTH); - } - - #[test] - #[serial] - fn host_allocation_deref_ptrs_are_equal() { - let allocation = HostAllocation::::alloc(LENGTH, CudaHostAllocFlags::DEFAULT).unwrap(); - let ptr = allocation.deref().as_ptr(); - assert_eq!(allocation.as_ptr(), ptr); - } - - #[test] - #[serial] - fn host_allocation_deref_mut_ptrs_are_equal() { - let mut allocation = - HostAllocation::::alloc(LENGTH, CudaHostAllocFlags::DEFAULT).unwrap(); - let ptr = allocation.deref_mut().as_mut_ptr(); - assert_eq!(allocation.as_mut_ptr(), ptr); - } - - #[test] - #[serial] - fn host_registration_register_is_ok() { - let values = [0u32; LENGTH]; - let result = HostRegistration::::register(&values, CudaHostRegisterFlags::DEFAULT); - assert!(result.is_ok()); - } - - #[test] - #[serial] - fn host_registration_register_empty_error_invalid_value() { - let values = [0u32; 0]; - let result = HostRegistration::::register(&values, CudaHostRegisterFlags::DEFAULT); - assert_eq!(result.err(), Some(CudaError::ErrorInvalidValue)); - } - - #[test] - #[serial] - fn host_registration_unregister_is_ok() { - let values = [0u32; LENGTH]; - let registration = - HostRegistration::::register(&values, CudaHostRegisterFlags::DEFAULT).unwrap(); - let result = registration.unregister(); - assert_eq!(result, Ok(())); - } - - #[test] - #[serial] - fn host_registration_register_len_eq_length() { - let values = [0u32; LENGTH]; - let registration = - HostRegistration::::register(&values, CudaHostRegisterFlags::DEFAULT).unwrap(); - assert_eq!(registration.len(), LENGTH); - } - - #[test] - #[serial] - fn host_registration_register_is_empty_is_false() { - let values = [0u32; LENGTH]; - let registration = - HostRegistration::::register(&values, CudaHostRegisterFlags::DEFAULT).unwrap(); - assert!(!registration.is_empty()); - } - - #[test] - #[serial] - fn host_registration_deref_len_eq_length() { - let values = [0u32; LENGTH]; - let registration = - HostRegistration::::register(&values, CudaHostRegisterFlags::DEFAULT).unwrap(); - let slice = registration.deref(); - assert_eq!(slice.len(), LENGTH); - } - - #[test] - #[serial] - fn host_registration_mut_deref_mut_len_eq_length() { - let mut values = [0u32; LENGTH]; - let mut registration = - HostRegistrationMut::::register(&mut values, CudaHostRegisterFlags::DEFAULT) - .unwrap(); - let slice = registration.deref_mut(); - assert_eq!(slice.len(), LENGTH); - } - - #[test] - #[serial] - fn host_registration_index_len_eq_length() { - let values = [0u32; LENGTH]; - let registration = - HostRegistration::::register(&values, CudaHostRegisterFlags::DEFAULT).unwrap(); - let slice = ®istration[..]; - assert_eq!(slice.len(), LENGTH); - } - - #[test] - #[serial] - fn host_registration_mut_index_mut_len_eq_length() { - let mut values = [0u32; LENGTH]; - let mut registration = - HostRegistrationMut::::register(&mut values, CudaHostRegisterFlags::DEFAULT) - .unwrap(); - let slice = &mut registration[..]; - assert_eq!(slice.len(), LENGTH); - } - - #[test] - #[serial] - fn host_registration_deref_ptrs_are_equal() { - let values = [0u32; LENGTH]; - let registration = - HostRegistration::::register(&values, CudaHostRegisterFlags::DEFAULT).unwrap(); - let ptr = registration.deref().as_ptr(); - assert_eq!(registration.as_ptr(), ptr); - } - - #[test] - #[serial] - fn host_registration_mut_deref_mut_ptrs_are_equal() { - let mut values = [0u32; LENGTH]; - let mut registration = - HostRegistrationMut::::register(&mut values, CudaHostRegisterFlags::DEFAULT) - .unwrap(); - let ptr = registration.deref_mut().as_mut_ptr(); - assert_eq!(registration.as_mut_ptr(), ptr); - } - - #[test] - #[serial] - fn memory_copy_device_slice_to_device_slice() { - let values1 = [42u32; LENGTH]; - let mut values2 = [0u32; LENGTH]; - let mut a1 = DeviceAllocation::::alloc(LENGTH).unwrap(); - let mut a2 = DeviceAllocation::::alloc(LENGTH).unwrap(); - let a1_slice = a1.deref_mut(); - let a2_slice = a2.deref_mut(); - memory_copy(a1_slice, &values1).unwrap(); - memory_copy(a2_slice, a1_slice).unwrap(); - memory_copy(&mut values2, a2_slice).unwrap(); - assert!(values2.iter().all(|&x| x == 42u32)); - } - - #[test] - #[serial] - fn memory_copy_device_allocation_to_device_allocation() { - let values1 = [42u32; LENGTH]; - let mut values2 = [0u32; LENGTH]; - let mut a1 = DeviceAllocation::::alloc(LENGTH).unwrap(); - let mut a2 = DeviceAllocation::::alloc(LENGTH).unwrap(); - memory_copy(&mut a1, &values1).unwrap(); - memory_copy(&mut a2, &a1).unwrap(); - memory_copy(&mut values2, &a2).unwrap(); - assert!(values2.iter().all(|&x| x == 42u32)); - } - - #[test] - #[serial] - fn memory_copy_host_allocation_to_host_allocation() { - let mut a1 = HostAllocation::::alloc(LENGTH, CudaHostAllocFlags::DEFAULT).unwrap(); - let mut a2 = HostAllocation::::alloc(LENGTH, CudaHostAllocFlags::DEFAULT).unwrap(); - a2.iter_mut().for_each(|x| { - *x = 42u32; - }); - memory_copy(&mut a1, &a2).unwrap(); - assert!(a1.iter().all(|&x| x == 42u32)); - } - - #[test] - #[serial] - fn memory_copy_host_registration_to_host_registration_mut() { - let mut values1 = [0u32; LENGTH]; - let values2 = [42u32; LENGTH]; - let mut r1 = - HostRegistrationMut::register(&mut values1, CudaHostRegisterFlags::DEFAULT).unwrap(); - let r2 = HostRegistration::register(&values2, CudaHostRegisterFlags::DEFAULT).unwrap(); - memory_copy(&mut r1, &r2).unwrap(); - assert!(r1.iter().all(|&x| x == 42u32)); - } - - #[test] - #[serial] - fn memory_copy_slice_to_slice() { - let mut values1 = [0u32; LENGTH]; - let values2 = [42u32; LENGTH]; - memory_copy(&mut values1, &values2).unwrap(); - assert!(values1.iter().all(|&x| x == 42u32)); - } - - #[test] - #[serial] - fn memory_copy_async_device_allocation_to_device_allocation() { - let stream = CudaStream::create().unwrap(); - let values1 = [42u32; LENGTH]; - let mut values2 = [0u32; LENGTH]; - let mut a1 = DeviceAllocation::::alloc(LENGTH).unwrap(); - let mut a2 = DeviceAllocation::::alloc(LENGTH).unwrap(); - memory_copy_async(&mut a1, &values1, &stream).unwrap(); - memory_copy_async(&mut a2, &a1, &stream).unwrap(); - memory_copy_async(&mut values2, &a2, &stream).unwrap(); - stream.synchronize().unwrap(); - assert!(values2.iter().all(|&x| x == 42u32)); - } - - #[test] - #[serial] - fn memory_copy_async_host_allocation_to_host_allocation() { - let stream = CudaStream::create().unwrap(); - let mut a1 = HostAllocation::::alloc(LENGTH, CudaHostAllocFlags::DEFAULT).unwrap(); - let mut a2 = HostAllocation::::alloc(LENGTH, CudaHostAllocFlags::DEFAULT).unwrap(); - a2.iter_mut().for_each(|x| { - *x = 42u32; - }); - memory_copy_async(&mut a1, &a2, &stream).unwrap(); - stream.synchronize().unwrap(); - assert!(a1.iter().all(|&x| x == 42u32)); - } - - #[test] - #[serial] - fn memory_copy_async_host_registration_to_host_registration_mut() { - let stream = CudaStream::create().unwrap(); - let mut values1 = [0u32; LENGTH]; - let values2 = [42u32; LENGTH]; - let mut r1 = - HostRegistrationMut::register(&mut values1, CudaHostRegisterFlags::DEFAULT).unwrap(); - let r2 = HostRegistration::register(&values2, CudaHostRegisterFlags::DEFAULT).unwrap(); - memory_copy_async(&mut r1, &r2, &stream).unwrap(); - stream.synchronize().unwrap(); - assert!(r1.iter().all(|&x| x == 42u32)); - } - - #[test] - #[serial] - fn memory_copy_async_slice_to_slice() { - let stream = CudaStream::create().unwrap(); - let mut values1 = [0u32; LENGTH]; - let values2 = [42u32; LENGTH]; - memory_copy_async(&mut values1, &values2, &stream).unwrap(); - stream.synchronize().unwrap(); - assert!(values1.iter().all(|&x| x == 42u32)); - } - - #[test] - #[serial] - fn memory_set_is_correct() { - let mut h_values = - HostAllocation::::alloc(LENGTH, CudaHostAllocFlags::DEFAULT).unwrap(); - let mut d_values = DeviceAllocation::::alloc(LENGTH).unwrap(); - memory_set(&mut d_values, 42u8).unwrap(); - memory_copy(&mut h_values, &d_values).unwrap(); - assert!(h_values.iter().all(|&x| x == 42u8)); - } - - #[test] - #[serial] - fn memory_set_async_is_correct() { - let stream = CudaStream::create().unwrap(); - let mut h_values = - HostAllocation::::alloc(LENGTH, CudaHostAllocFlags::DEFAULT).unwrap(); - let mut d_values = DeviceAllocation::::alloc(LENGTH).unwrap(); - memory_set_async(&mut d_values, 42u8, &stream).unwrap(); - memory_copy_async(&mut h_values, &d_values, &stream).unwrap(); - stream.synchronize().unwrap(); - assert!(h_values.iter().all(|&x| x == 42u8)); - } - - #[test] - #[serial] - fn memory_get_info_is_correct() { - let result = memory_get_info(); - assert!(result.is_ok()); - let (free, total) = result.unwrap(); - assert!(total > 0); - assert!(free <= total); - } -} diff --git a/cudart/src/memory_pools.rs b/cudart/src/memory_pools.rs deleted file mode 100644 index ba83eff..0000000 --- a/cudart/src/memory_pools.rs +++ /dev/null @@ -1,533 +0,0 @@ -// Stream Ordered Memory Allocator -// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html - -use core::ffi::c_void; -use std::alloc::Layout; -use std::mem; -use std::mem::MaybeUninit; -use std::ops::{Deref, DerefMut}; - -use cudart_sys::*; - -use crate::result::{CudaResult, CudaResultWrap}; -use crate::slice::{AllocationData, CudaSlice, CudaSliceMut, DeviceSlice}; -use crate::stream::CudaStream; - -#[repr(i32)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub enum CudaMemPoolAttributeI32 { - ReuseFollowEventDependencies = CudaMemPoolAttribute::ReuseFollowEventDependencies as i32, - ReuseAllowOpportunistic = CudaMemPoolAttribute::ReuseAllowOpportunistic as i32, - ReuseAllowInternalDependencies = CudaMemPoolAttribute::ReuseAllowInternalDependencies as i32, -} - -impl From for i32 { - fn from(attribute: CudaMemPoolAttributeI32) -> Self { - attribute as i32 - } -} - -#[repr(i32)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] -pub enum CudaMemPoolAttributeU64 { - AttrReleaseThreshold = CudaMemPoolAttribute::AttrReleaseThreshold as i32, - AttrReservedMemCurrent = CudaMemPoolAttribute::AttrReservedMemCurrent as i32, - AttrReservedMemHigh = CudaMemPoolAttribute::AttrReservedMemHigh as i32, - AttrUsedMemCurrent = CudaMemPoolAttribute::AttrUsedMemCurrent as i32, - AttrUsedMemHigh = CudaMemPoolAttribute::AttrUsedMemHigh as i32, -} - -impl From for i32 { - fn from(attribute: CudaMemPoolAttributeU64) -> Self { - attribute as i32 - } -} - -#[repr(transparent)] -#[derive(Debug)] -pub struct CudaMemPool { - handle: cudaMemPool_t, -} - -impl CudaMemPool { - pub(crate) fn from_handle(handle: cudaMemPool_t) -> Self { - Self { handle } - } - - pub fn get_access(&self, location: CudaMemLocation) -> CudaResult { - let mut result = MaybeUninit::::uninit(); - unsafe { - cudaMemPoolGetAccess( - result.as_mut_ptr(), - self.handle, - &location as *const CudaMemLocation as *mut CudaMemLocation, - ) - .wrap_maybe_uninit(result) - } - } - - pub fn get_attribute_value, U>(&self, attribute: T) -> CudaResult { - let mut value = MaybeUninit::::uninit(); - unsafe { - cudaMemPoolGetAttribute( - self.handle, - mem::transmute(attribute.into()), - value.as_mut_ptr() as *mut c_void, - ) - .wrap_maybe_uninit(value) - } - } - - pub fn set_access(&self, descriptors: &[CudaMemAccessDesc]) -> CudaResult<()> { - unsafe { cudaMemPoolSetAccess(self.handle, descriptors.as_ptr(), descriptors.len()).wrap() } - } - - pub fn set_attribute_value, U>(&self, attribute: T, value: U) -> CudaResult<()> { - unsafe { - cudaMemPoolSetAttribute( - self.handle, - mem::transmute(attribute.into()), - &value as *const _ as *mut c_void, - ) - .wrap() - } - } - - pub fn trim_to(&self, min_bytes_to_keep: usize) -> CudaResult<()> { - unsafe { cudaMemPoolTrimTo(self.handle, min_bytes_to_keep).wrap() } - } -} - -impl From<&CudaMemPool> for cudaMemPool_t { - fn from(pool: &CudaMemPool) -> Self { - pool.handle - } -} - -pub trait AttributeHandler { - type Value; - fn get_attribute(&self, attribute: T) -> CudaResult; - fn set_attribute(&self, attribute: T, value: Self::Value) -> CudaResult<()>; -} - -impl AttributeHandler for CudaMemPool { - type Value = i32; - - fn get_attribute(&self, attribute: CudaMemPoolAttributeI32) -> CudaResult { - self.get_attribute_value(attribute) - } - - fn set_attribute( - &self, - attribute: CudaMemPoolAttributeI32, - value: Self::Value, - ) -> CudaResult<()> { - self.set_attribute_value(attribute, value) - } -} - -impl AttributeHandler for CudaMemPool { - type Value = u64; - - fn get_attribute(&self, attribute: CudaMemPoolAttributeU64) -> CudaResult { - self.get_attribute_value(attribute) - } - - fn set_attribute( - &self, - attribute: CudaMemPoolAttributeU64, - value: Self::Value, - ) -> CudaResult<()> { - self.set_attribute_value(attribute, value) - } -} - -#[repr(transparent)] -#[derive(Debug)] -pub struct CudaOwnedMemPool { - pool: CudaMemPool, -} - -impl CudaOwnedMemPool { - fn from_handle(handle: cudaMemPool_t) -> Self { - Self { - pool: CudaMemPool::from_handle(handle), - } - } - - pub fn create(properties: &CudaMemPoolProperties) -> CudaResult { - let mut handle = MaybeUninit::::uninit(); - unsafe { - cudaMemPoolCreate(handle.as_mut_ptr(), properties) - .wrap_maybe_uninit(handle) - .map(Self::from_handle) - } - } - - pub fn create_for_device(device_id: i32) -> CudaResult { - let props = CudaMemPoolProperties { - allocType: CudaMemAllocationType::Pinned, - handleTypes: CudaMemAllocationHandleType::None, - location: CudaMemLocation { - type_: CudaMemLocationType::Device, - id: device_id, - }, - ..Default::default() - }; - Self::create(&props) - } - - pub fn destroy(self) -> CudaResult<()> { - let pool = self.pool.handle; - mem::forget(self); - unsafe { cudaMemPoolDestroy(pool).wrap() } - } -} - -impl Drop for CudaOwnedMemPool { - fn drop(&mut self) { - unsafe { cudaMemPoolDestroy(self.pool.handle).eprint_error_and_backtrace() }; - } -} - -impl Deref for CudaOwnedMemPool { - type Target = CudaMemPool; - - fn deref(&self) -> &Self::Target { - &self.pool - } -} - -#[derive(Debug)] -pub struct DevicePoolAllocation<'a, T> { - data: AllocationData, - stream: &'a CudaStream, -} - -impl<'a, T> DevicePoolAllocation<'a, T> { - pub fn alloc_async(length: usize, stream: &'a CudaStream) -> CudaResult { - let layout = Layout::array::(length).unwrap(); - let mut dev_ptr = MaybeUninit::<*mut c_void>::uninit(); - unsafe { - cudaMallocAsync(dev_ptr.as_mut_ptr(), layout.size(), stream.into()) - .wrap_maybe_uninit(dev_ptr) - .map(|ptr| Self { - data: AllocationData { - ptr: ptr as *mut T, - len: length, - }, - stream, - }) - } - } - - pub fn alloc_from_pool_async( - length: usize, - pool: &CudaMemPool, - stream: &'a CudaStream, - ) -> CudaResult { - let layout = Layout::array::(length).unwrap(); - let mut dev_ptr = MaybeUninit::<*mut c_void>::uninit(); - unsafe { - cudaMallocFromPoolAsync( - dev_ptr.as_mut_ptr(), - layout.size(), - pool.handle, - stream.into(), - ) - .wrap_maybe_uninit(dev_ptr) - .map(|ptr| Self { - data: AllocationData { - ptr: ptr as *mut T, - len: length, - }, - stream, - }) - } - } - - pub fn free_async(self, stream: &CudaStream) -> CudaResult<()> { - unsafe { - let ptr = self.as_c_void_ptr() as *mut c_void; - mem::forget(self); - cudaFreeAsync(ptr, stream.into()).wrap() - } - } - - pub fn swap_stream(self, stream: &CudaStream) -> DevicePoolAllocation { - let data = AllocationData { - ptr: self.data.ptr, - len: self.data.len, - }; - mem::forget(self); - DevicePoolAllocation { data, stream } - } - - /// # Safety - /// - /// The caller must ensure that the inputs are valid. - pub unsafe fn from_raw_parts(ptr: *mut T, len: usize, stream: &'a CudaStream) -> Self { - Self { - data: AllocationData { ptr, len }, - stream, - } - } - - pub fn into_raw_parts(self) -> (*mut T, usize, &'a CudaStream) { - let result = (self.data.ptr, self.data.len, self.stream); - mem::forget(self); - result - } -} - -impl<'a, T> Drop for DevicePoolAllocation<'a, T> { - fn drop(&mut self) { - unsafe { - cudaFreeAsync(self.as_mut_c_void_ptr(), self.stream.into()).eprint_error_and_backtrace() - }; - } -} - -impl<'a, T> Deref for DevicePoolAllocation<'a, T> { - type Target = DeviceSlice; - - fn deref(&self) -> &Self::Target { - Self::Target::from_allocation_data(&self.data) - } -} - -impl<'a, T> DerefMut for DevicePoolAllocation<'a, T> { - fn deref_mut(&mut self) -> &mut Self::Target { - Self::Target::from_mut_allocation_data(&mut self.data) - } -} - -impl<'a, T> CudaSlice for DevicePoolAllocation<'a, T> { - unsafe fn as_slice(&self) -> &[T] { - self.data.as_slice() - } -} - -impl<'a, T> CudaSliceMut for DevicePoolAllocation<'a, T> { - unsafe fn as_mut_slice(&mut self) -> &mut [T] { - self.data.as_mut_slice() - } -} - -#[cfg(test)] -mod tests { - use serial_test::serial; - - use crate::memory::memory_copy_async; - - use super::*; - - const LENGTH: usize = 1024; - - #[test] - #[serial] - fn mem_pool_for_device_is_ok() { - let result = CudaOwnedMemPool::create_for_device(0); - assert!(result.is_ok()); - } - - #[test] - #[serial] - fn mem_pool_destroy_is_ok() { - let pool = CudaOwnedMemPool::create_for_device(0).unwrap(); - let result = pool.destroy(); - assert!(result.is_ok()); - } - - #[test] - #[serial] - fn device_pool_allocation_alloc_async_is_ok() { - let stream = CudaStream::create().unwrap(); - let result = DevicePoolAllocation::::alloc_async(LENGTH, &stream); - assert!(result.is_ok()); - } - - #[test] - #[serial] - fn device_pool_allocation_alloc_from_pool_async_is_ok() { - let pool = CudaOwnedMemPool::create_for_device(0).unwrap(); - let stream = CudaStream::create().unwrap(); - let result = DevicePoolAllocation::::alloc_from_pool_async(LENGTH, &pool, &stream); - assert!(result.is_ok()); - } - - #[test] - #[serial] - fn device_pool_allocation_free_is_ok() { - let stream = CudaStream::create().unwrap(); - let allocation = DevicePoolAllocation::::alloc_async(LENGTH, &stream).unwrap(); - let result = allocation.free_async(&stream); - stream.synchronize().unwrap(); - assert_eq!(result, Ok(())); - } - - #[test] - #[serial] - fn device_pool_allocation_alloc_async_len_eq_length() { - let stream = CudaStream::create().unwrap(); - let allocation = DevicePoolAllocation::::alloc_async(LENGTH, &stream).unwrap(); - assert_eq!(allocation.len(), LENGTH); - } - - #[test] - #[serial] - fn device_pool_allocation_alloc_async_is_empty_is_false() { - let stream = CudaStream::create().unwrap(); - let allocation = DevicePoolAllocation::::alloc_async(LENGTH, &stream).unwrap(); - assert!(!allocation.is_empty()); - } - - #[test] - #[serial] - fn device_pool_allocation_deref_len_eq_length() { - let stream = CudaStream::create().unwrap(); - let allocation = DevicePoolAllocation::::alloc_async(LENGTH, &stream).unwrap(); - let slice = allocation.deref(); - assert_eq!(slice.len(), LENGTH); - } - - #[test] - #[serial] - fn device_pool_allocation_deref_mut_len_eq_length() { - let stream = CudaStream::create().unwrap(); - let mut allocation = DevicePoolAllocation::::alloc_async(LENGTH, &stream).unwrap(); - let slice = allocation.deref_mut(); - assert_eq!(slice.len(), LENGTH); - } - - #[test] - #[serial] - fn device_pool_allocation_slice_index_len_eq_length() { - let stream = CudaStream::create().unwrap(); - let allocation = DevicePoolAllocation::::alloc_async(LENGTH, &stream).unwrap(); - let slice = &allocation[..]; - assert_eq!(slice.len(), LENGTH); - } - - #[test] - #[serial] - fn device_pool_allocation_mut_slice_index_mut_len_eq_length() { - let stream = CudaStream::create().unwrap(); - let mut allocation = DevicePoolAllocation::::alloc_async(LENGTH, &stream).unwrap(); - let slice = &mut allocation[..]; - assert_eq!(slice.len(), LENGTH); - } - - #[test] - #[serial] - fn device_pool_allocation_drop_frees_memory() { - let pool = CudaOwnedMemPool::create_for_device(0).unwrap(); - let stream = CudaStream::create().unwrap(); - let allocation = - DevicePoolAllocation::::alloc_from_pool_async(LENGTH, &pool, &stream).unwrap(); - drop(allocation); - let used = pool - .get_attribute(CudaMemPoolAttributeU64::AttrUsedMemCurrent) - .unwrap() as usize; - assert_eq!(used, 0); - } - - #[test] - #[serial] - fn device_pool_allocation_swap_stream() { - let pool = CudaOwnedMemPool::create_for_device(0).unwrap(); - let stream1 = CudaStream::create().unwrap(); - let allocation = - DevicePoolAllocation::::alloc_from_pool_async(LENGTH, &pool, &stream1).unwrap(); - let stream2 = CudaStream::create().unwrap(); - let allocation = allocation.swap_stream(&stream2); - drop(stream1); - drop(allocation); - let used = pool - .get_attribute(CudaMemPoolAttributeU64::AttrUsedMemCurrent) - .unwrap() as usize; - assert_eq!(used, 0); - } - - #[test] - #[serial] - fn memory_copy_device_pool_allocation_to_device_pool_allocation() { - let values1 = [42u32; LENGTH]; - let mut values2 = [0u32; LENGTH]; - let stream = CudaStream::create().unwrap(); - let mut a1 = DevicePoolAllocation::::alloc_async(LENGTH, &stream).unwrap(); - let mut a2 = DevicePoolAllocation::::alloc_async(LENGTH, &stream).unwrap(); - memory_copy_async(&mut a1, &values1, &stream).unwrap(); - memory_copy_async(&mut a2, &a1, &stream).unwrap(); - memory_copy_async(&mut values2, &a2, &stream).unwrap(); - stream.synchronize().unwrap(); - assert!(values2.iter().all(|&x| x == 42u32)); - } - - #[test] - #[serial] - fn get_attribute_i32() { - let pool = CudaOwnedMemPool::create_for_device(0).unwrap(); - let result = pool.get_attribute(CudaMemPoolAttributeI32::ReuseAllowOpportunistic); - assert_eq!(result, Ok(1)); - } - - #[test] - #[serial] - fn get_attribute_u64() { - let pool = CudaOwnedMemPool::create_for_device(0).unwrap(); - let result = pool.get_attribute(CudaMemPoolAttributeU64::AttrReleaseThreshold); - assert_eq!(result, Ok(0)); - } - - #[test] - #[serial] - fn set_attribute_i32() { - let pool = CudaOwnedMemPool::create_for_device(0).unwrap(); - let attribute = CudaMemPoolAttributeI32::ReuseAllowOpportunistic; - let result = pool.set_attribute(attribute, 0); - assert_eq!(result, Ok(())); - assert_eq!(pool.get_attribute(attribute), Ok(0)); - } - - #[test] - #[serial] - fn set_attribute_u64() { - let pool = CudaOwnedMemPool::create_for_device(0).unwrap(); - let attribute = CudaMemPoolAttributeU64::AttrReleaseThreshold; - let result = pool.set_attribute(attribute, u64::MAX); - assert_eq!(result, Ok(())); - assert_eq!(pool.get_attribute(attribute), Ok(u64::MAX)); - } - - #[test] - #[serial] - fn trim_to_works_correctly() { - let pool = CudaOwnedMemPool::create_for_device(0).unwrap(); - pool.set_attribute(CudaMemPoolAttributeU64::AttrReleaseThreshold, u64::MAX) - .unwrap(); - let stream = CudaStream::create().unwrap(); - let allocation = - DevicePoolAllocation::::alloc_from_pool_async(LENGTH, &pool, &stream).unwrap(); - let size = mem::size_of::() * LENGTH; - let used = pool - .get_attribute(CudaMemPoolAttributeU64::AttrUsedMemCurrent) - .unwrap() as usize; - assert_eq!(used, size); - allocation.free_async(&stream).unwrap(); - stream.synchronize().unwrap(); - let used = pool - .get_attribute(CudaMemPoolAttributeU64::AttrUsedMemCurrent) - .unwrap() as usize; - assert_eq!(used, 0); - let reserved = pool - .get_attribute(CudaMemPoolAttributeU64::AttrReservedMemCurrent) - .unwrap() as usize; - assert!(reserved >= size); - pool.trim_to(0).unwrap(); - let reserved = pool - .get_attribute(CudaMemPoolAttributeU64::AttrReservedMemCurrent) - .unwrap() as usize; - assert_eq!(reserved, 0); - } -} diff --git a/cudart/src/occupancy.rs b/cudart/src/occupancy.rs deleted file mode 100644 index 9e4a74b..0000000 --- a/cudart/src/occupancy.rs +++ /dev/null @@ -1,62 +0,0 @@ -use std::mem::MaybeUninit; - -use cudart_sys::{ - cudaOccupancyAvailableDynamicSMemPerBlock, cudaOccupancyMaxActiveBlocksPerMultiprocessor, - cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, -}; - -use crate::execution::Kernel; -use crate::result::{CudaResult, CudaResultWrap}; - -pub fn available_dynamic_smem_per_block( - kernel: impl Kernel, - num_blocks: i32, - block_size: i32, -) -> CudaResult { - let mut result = MaybeUninit::::uninit(); - unsafe { - cudaOccupancyAvailableDynamicSMemPerBlock( - result.as_mut_ptr(), - kernel.get_kernel_raw(), - num_blocks, - block_size, - ) - .wrap_maybe_uninit(result) - } -} - -pub fn max_active_blocks_per_multiprocessor( - kernel: impl Kernel, - block_size: i32, - dynamic_smem_size: usize, -) -> CudaResult { - let mut result = MaybeUninit::::uninit(); - unsafe { - cudaOccupancyMaxActiveBlocksPerMultiprocessor( - result.as_mut_ptr(), - kernel.get_kernel_raw(), - block_size, - dynamic_smem_size, - ) - .wrap_maybe_uninit(result) - } -} - -pub fn max_active_blocks_per_multiprocessor_with_flags( - kernel: impl Kernel, - block_size: i32, - dynamic_smem_size: usize, - flags: u32, -) -> CudaResult { - let mut result = MaybeUninit::::uninit(); - unsafe { - cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - result.as_mut_ptr(), - kernel.get_kernel_raw(), - block_size, - dynamic_smem_size, - flags, - ) - .wrap_maybe_uninit(result) - } -} diff --git a/cudart/src/peer.rs b/cudart/src/peer.rs deleted file mode 100644 index ac50f8d..0000000 --- a/cudart/src/peer.rs +++ /dev/null @@ -1,62 +0,0 @@ -// Peer Device Memory Access -// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__PEER.html - -use std::mem::MaybeUninit; - -use cudart_sys::*; - -use crate::result::{CudaResult, CudaResultWrap}; - -pub fn device_can_access_peer(device_id: i32, device_peer_id: i32) -> CudaResult { - let mut can_access_peer = MaybeUninit::::uninit(); - unsafe { - cudaDeviceCanAccessPeer(can_access_peer.as_mut_ptr(), device_id, device_peer_id) - .wrap_maybe_uninit(can_access_peer) - .map(|value| value != 0) - } -} - -pub fn device_disable_peer_access(device_peer_id: i32) -> CudaResult<()> { - unsafe { cudaDeviceDisablePeerAccess(device_peer_id).wrap() } -} - -pub fn device_enable_peer_access(device_peer_id: i32) -> CudaResult<()> { - unsafe { cudaDeviceEnablePeerAccess(device_peer_id, 0).wrap() } -} - -#[cfg(test)] -mod tests { - use serial_test::serial; - - use crate::device::device_reset; - - use super::*; - - #[test] - #[serial] - #[ignore = "needs multiple peer-enabled GPUs"] - fn device_can_access_peer_is_true() { - let result = device_can_access_peer(0, 1); - assert_eq!(result, Ok(true)); - } - - #[test] - #[serial] - #[ignore = "needs multiple peer-enabled GPUs"] - fn device_disable_peer_access_is_ok() { - device_reset().unwrap(); - device_enable_peer_access(1).unwrap(); - let result = device_disable_peer_access(1); - assert_eq!(result, Ok(())); - } - - #[test] - #[serial] - #[ignore = "needs multiple peer-enabled GPUs"] - fn device_enable_peer_access_is_ok() { - device_reset().unwrap(); - let result = device_enable_peer_access(1); - assert_eq!(result, Ok(())); - device_disable_peer_access(1).unwrap(); - } -} diff --git a/cudart/src/result.rs b/cudart/src/result.rs deleted file mode 100644 index a3ce68b..0000000 --- a/cudart/src/result.rs +++ /dev/null @@ -1,29 +0,0 @@ -use std::mem::MaybeUninit; - -use cudart_sys::CudaError; - -pub type CudaResult = Result; - -pub trait CudaResultWrap { - fn wrap(self) -> CudaResult<()>; - fn wrap_value(self, value: T) -> CudaResult; - fn wrap_maybe_uninit(self, value: MaybeUninit) -> CudaResult; -} - -impl CudaResultWrap for CudaError { - fn wrap(self) -> CudaResult<()> { - self.wrap_value(()) - } - - fn wrap_value(self, value: T) -> CudaResult { - if self == CudaError::Success { - Ok(value) - } else { - Err(self) - } - } - - fn wrap_maybe_uninit(self, value: MaybeUninit) -> CudaResult { - self.wrap_value(value).map(|x| unsafe { x.assume_init() }) - } -} diff --git a/cudart/src/slice/allocation_data.rs b/cudart/src/slice/allocation_data.rs deleted file mode 100644 index 4b11763..0000000 --- a/cudart/src/slice/allocation_data.rs +++ /dev/null @@ -1,25 +0,0 @@ -use std::slice; - -use crate::slice::{CudaSlice, CudaSliceMut}; - -#[derive(Debug)] -pub(crate) struct AllocationData { - pub ptr: *mut T, - pub len: usize, -} - -unsafe impl Send for AllocationData where Vec: Send {} - -unsafe impl Sync for AllocationData where Vec: Sync {} - -impl CudaSlice for AllocationData { - unsafe fn as_slice(&self) -> &[T] { - slice::from_raw_parts(self.ptr, self.len) - } -} - -impl CudaSliceMut for AllocationData { - unsafe fn as_mut_slice(&mut self) -> &mut [T] { - slice::from_raw_parts_mut(self.ptr, self.len) - } -} diff --git a/cudart/src/slice/cuda_slice.rs b/cudart/src/slice/cuda_slice.rs deleted file mode 100644 index d3bb5b3..0000000 --- a/cudart/src/slice/cuda_slice.rs +++ /dev/null @@ -1,67 +0,0 @@ -use std::ffi::c_void; - -pub trait CudaSlice { - /// # Safety - /// do not dereference if the memory is located on the device - unsafe fn as_slice(&self) -> &[T]; - - fn as_ptr(&self) -> *const T { - unsafe { self.as_slice().as_ptr() } - } - - fn as_c_void_ptr(&self) -> *const c_void { - self.as_ptr() as *const c_void - } - - fn is_empty(&self) -> bool { - unsafe { self.as_slice().is_empty() } - } - - fn len(&self) -> usize { - unsafe { self.as_slice().len() } - } -} - -pub trait CudaSliceMut: CudaSlice { - /// # Safety - /// do not dereference if the memory is located on the device - unsafe fn as_mut_slice(&mut self) -> &mut [T]; - - fn as_mut_ptr(&mut self) -> *mut T { - unsafe { self.as_mut_slice().as_mut_ptr() } - } - - fn as_mut_c_void_ptr(&mut self) -> *mut c_void { - self.as_mut_ptr() as *mut c_void - } -} - -impl CudaSlice for [T] { - unsafe fn as_slice(&self) -> &[T] { - self - } -} - -impl CudaSliceMut for [T] { - unsafe fn as_mut_slice(&mut self) -> &mut [T] { - self - } -} - -impl CudaSlice for U -where - Self: AsRef<[T]>, -{ - unsafe fn as_slice(&self) -> &[T] { - self.as_ref() - } -} - -impl CudaSliceMut for U -where - Self: AsMut<[T]> + AsRef<[T]>, -{ - unsafe fn as_mut_slice(&mut self) -> &mut [T] { - self.as_mut() - } -} diff --git a/cudart/src/slice/cuda_variable.rs b/cudart/src/slice/cuda_variable.rs deleted file mode 100644 index 08920de..0000000 --- a/cudart/src/slice/cuda_variable.rs +++ /dev/null @@ -1,41 +0,0 @@ -use std::ffi::c_void; - -pub trait CudaVariable { - /// # Safety - /// do not dereference if the memory is located on the device - unsafe fn as_ref(&self) -> &T; - - fn as_ptr(&self) -> *const T { - unsafe { self.as_ref() } - } - - fn as_c_void_ptr(&self) -> *const c_void { - self.as_ptr() as *const c_void - } -} - -pub trait CudaVariableMut: CudaVariable { - /// # Safety - /// do not dereference if the memory is located on the device - unsafe fn as_mut(&mut self) -> &mut T; - - fn as_mut_ptr(&mut self) -> *mut T { - unsafe { self.as_mut() } - } - - fn as_mut_c_void_ptr(&mut self) -> *mut c_void { - self.as_mut_ptr() as *mut c_void - } -} - -impl CudaVariable for T { - unsafe fn as_ref(&self) -> &T { - self - } -} - -impl CudaVariableMut for T { - unsafe fn as_mut(&mut self) -> &mut T { - self - } -} diff --git a/cudart/src/slice/device_slice.rs b/cudart/src/slice/device_slice.rs deleted file mode 100644 index 846f6e2..0000000 --- a/cudart/src/slice/device_slice.rs +++ /dev/null @@ -1,135 +0,0 @@ -use std::fmt::{Debug, Formatter}; -use std::mem::size_of; -use std::ptr; -use std::ptr::{null, null_mut}; - -use crate::slice::iter::{Chunks, ChunksMut}; -use crate::slice::AllocationData; -use crate::slice::{CudaSlice, CudaSliceMut}; - -#[repr(transparent)] -pub struct DeviceSlice([T]); - -impl DeviceSlice { - /// # Safety - /// make sure data_address is pointing to memory located on the device - pub unsafe fn from_raw_parts<'a>(data_address: *const T, len: usize) -> &'a Self { - &*ptr::from_raw_parts(data_address as *const (), len) - } - - /// # Safety - /// make sure data_address is pointing to memory located on the device - pub unsafe fn from_raw_parts_mut<'a>(data_address: *mut T, len: usize) -> &'a mut Self { - &mut *ptr::from_raw_parts_mut(data_address as *mut (), len) - } - - /// # Safety - /// make sure the slice is pointing to memory located on the device - pub unsafe fn from_slice(slice: &[T]) -> &Self { - Self::from_raw_parts(slice.as_ptr(), slice.len()) - } - - /// # Safety - /// make sure the slice is pointing to memory located on the device - pub unsafe fn from_mut_slice(slice: &mut [T]) -> &mut Self { - Self::from_raw_parts_mut(slice.as_mut_ptr(), slice.len()) - } - - pub(crate) fn from_allocation_data(data: &AllocationData) -> &Self { - unsafe { Self::from_slice(data.as_slice()) } - } - - pub(crate) fn from_mut_allocation_data(data: &mut AllocationData) -> &mut Self { - unsafe { Self::from_mut_slice(data.as_mut_slice()) } - } - - pub fn empty<'a>() -> &'a Self { - unsafe { Self::from_raw_parts(null(), 0) } - } - - pub fn empty_mut<'a>() -> &'a mut Self { - unsafe { Self::from_raw_parts_mut(null_mut(), 0) } - } - - pub fn as_ptr(&self) -> *const T { - self.0.as_ptr() - } - - pub fn as_mut_ptr(&mut self) -> *mut T { - self.0.as_mut_ptr() - } - - pub fn len(&self) -> usize { - self.0.len() - } - - pub fn is_empty(&self) -> bool { - self.0.is_empty() - } - - /// # Safety - /// only use if you know that treating the memory as [U] is valid - pub unsafe fn transmute(&self) -> &DeviceSlice { - let size_of_t = size_of::(); - let size_of_u = size_of::(); - assert_eq!((self.len() * size_of_t) % size_of_u, 0); - let len = self.len() * size_of_t / size_of_u; - DeviceSlice::from_raw_parts(self.as_ptr() as *const U, len) - } - - /// # Safety - /// only use if you know that treating the memory as [U] is valid - pub unsafe fn transmute_mut(&mut self) -> &mut DeviceSlice { - let size_of_t = size_of::(); - let size_of_u = size_of::(); - assert_eq!((self.len() * size_of_t) % size_of_u, 0); - let len = self.len() * size_of_t / size_of_u; - DeviceSlice::from_raw_parts_mut(self.as_mut_ptr() as *mut U, len) - } - - pub fn split_at(&self, mid: usize) -> (&Self, &Self) { - unsafe { - let (left, right) = self.as_slice().split_at(mid); - (Self::from_slice(left), Self::from_slice(right)) - } - } - - pub fn split_at_mut(&mut self, mid: usize) -> (&mut Self, &mut Self) { - unsafe { - let (left, right) = self.as_mut_slice().split_at_mut(mid); - (Self::from_mut_slice(left), Self::from_mut_slice(right)) - } - } - - pub fn chunks(&self, chunk_size: usize) -> Chunks { - assert_ne!(chunk_size, 0, "chunk size must be non-zero"); - Chunks::new(self, chunk_size) - } - - pub fn chunks_mut(&mut self, chunk_size: usize) -> ChunksMut { - assert_ne!(chunk_size, 0, "chunk size must be non-zero"); - ChunksMut::new(self, chunk_size) - } -} - -impl Debug for DeviceSlice { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - let slice = &self.0; - f.debug_struct("DeviceSlice") - .field("ptr", &slice.as_ptr()) - .field("len", &slice.len()) - .finish() - } -} - -impl CudaSlice for DeviceSlice { - unsafe fn as_slice(&self) -> &[T] { - &self.0 - } -} - -impl CudaSliceMut for DeviceSlice { - unsafe fn as_mut_slice(&mut self) -> &mut [T] { - &mut self.0 - } -} diff --git a/cudart/src/slice/device_variable.rs b/cudart/src/slice/device_variable.rs deleted file mode 100644 index 5dc5238..0000000 --- a/cudart/src/slice/device_variable.rs +++ /dev/null @@ -1,103 +0,0 @@ -use std::fmt::{Debug, Formatter}; -use std::ops::{Deref, DerefMut}; -use std::ptr; - -use crate::slice::{CudaSlice, CudaSliceMut, CudaVariable, CudaVariableMut, DeviceSlice}; - -#[repr(transparent)] -pub struct DeviceVariable([T]); - -impl DeviceVariable { - /// # Safety - /// make sure data_address is pointing to memory located on the device - pub unsafe fn from_raw_parts<'a>(data_address: *const T) -> &'a Self { - &*ptr::from_raw_parts(data_address as *const (), 1) - } - - /// # Safety - /// make sure data_address is pointing to memory located on the device - pub unsafe fn from_raw_parts_mut<'a>(data_address: *mut T) -> &'a mut Self { - &mut *ptr::from_raw_parts_mut(data_address as *mut (), 1) - } - - /// # Safety - /// make sure the ref is pointing to memory located on the device - pub unsafe fn from_ref(s: &T) -> &Self { - Self::from_raw_parts(s.as_ptr()) - } - - /// # Safety - /// make sure the ref is pointing to memory located on the device - pub unsafe fn from_mut(s: &mut T) -> &mut Self { - Self::from_raw_parts_mut(s.as_mut_ptr()) - } - - /// # Safety - /// make sure the slice is pointing to memory located on the device - pub unsafe fn from_slice(slice: &[T]) -> &Self { - assert_eq!(slice.len(), 1); - Self::from_raw_parts(slice.as_ptr()) - } - - /// # Safety - /// make sure the slice is pointing to memory located on the device - pub unsafe fn from_mut_slice(slice: &mut [T]) -> &mut Self { - assert_eq!(slice.len(), 1); - Self::from_raw_parts_mut(slice.as_mut_ptr()) - } - - pub fn as_ptr(&self) -> *const T { - self.0.as_ptr() - } - - pub fn as_mut_ptr(&mut self) -> *mut T { - self.0.as_mut_ptr() - } -} - -impl Deref for DeviceVariable { - type Target = DeviceSlice; - - fn deref(&self) -> &Self::Target { - unsafe { DeviceSlice::from_slice(&self.0) } - } -} - -impl DerefMut for DeviceVariable { - fn deref_mut(&mut self) -> &mut Self::Target { - unsafe { DeviceSlice::from_mut_slice(&mut self.0) } - } -} - -impl Debug for DeviceVariable { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - let slice = &self.0; - f.debug_struct("DeviceVariable") - .field("ptr", &slice.as_ptr()) - .finish() - } -} - -impl CudaSlice for DeviceVariable { - unsafe fn as_slice(&self) -> &[T] { - &self.0 - } -} - -impl CudaSliceMut for DeviceVariable { - unsafe fn as_mut_slice(&mut self) -> &mut [T] { - &mut self.0 - } -} - -impl CudaVariable for DeviceVariable { - unsafe fn as_ref(&self) -> &T { - &self.0[0] - } -} - -impl CudaVariableMut for DeviceVariable { - unsafe fn as_mut(&mut self) -> &mut T { - &mut self.0[0] - } -} diff --git a/cudart/src/slice/index.rs b/cudart/src/slice/index.rs deleted file mode 100644 index 721cd73..0000000 --- a/cudart/src/slice/index.rs +++ /dev/null @@ -1,78 +0,0 @@ -use std::ops; -use std::ops::{Index, IndexMut}; - -use crate::slice::{CudaSlice, CudaSliceMut, DeviceSlice, DeviceVariable}; - -impl Index for DeviceSlice -where - I: DeviceSliceIndex, -{ - type Output = I::Output; - - #[inline] - fn index(&self, index: I) -> &I::Output { - index.index(self) - } -} - -impl IndexMut for DeviceSlice -where - I: DeviceSliceIndex, -{ - #[inline] - fn index_mut(&mut self, index: I) -> &mut I::Output { - index.index_mut(self) - } -} - -pub trait DeviceSliceIndex { - type Output: ?Sized; - - fn index(self, slice: &DeviceSlice) -> &Self::Output; - - fn index_mut(self, slice: &mut DeviceSlice) -> &mut Self::Output; -} - -impl DeviceSliceIndex for usize { - type Output = DeviceVariable; - - fn index(self, slice: &DeviceSlice) -> &Self::Output { - unsafe { DeviceVariable::from_ref(slice.as_slice().index(self)) } - } - - fn index_mut(self, slice: &mut DeviceSlice) -> &mut Self::Output { - unsafe { DeviceVariable::from_mut(slice.as_mut_slice().index_mut(self)) } - } -} - -trait DeviceSliceToSliceIndex {} - -impl DeviceSliceIndex for I -where - I: DeviceSliceToSliceIndex, - [T]: Index + IndexMut, -{ - type Output = DeviceSlice; - - fn index(self, slice: &DeviceSlice) -> &Self::Output { - unsafe { DeviceSlice::from_slice(slice.as_slice().index(self)) } - } - - fn index_mut(self, slice: &mut DeviceSlice) -> &mut Self::Output { - unsafe { DeviceSlice::from_mut_slice(slice.as_mut_slice().index_mut(self)) } - } -} - -impl DeviceSliceToSliceIndex for ops::RangeFull {} - -impl DeviceSliceToSliceIndex for ops::Range {} - -impl DeviceSliceToSliceIndex for ops::RangeFrom {} - -impl DeviceSliceToSliceIndex for ops::RangeTo {} - -impl DeviceSliceToSliceIndex for ops::RangeInclusive {} - -impl DeviceSliceToSliceIndex for ops::RangeToInclusive {} - -impl DeviceSliceToSliceIndex for (ops::Bound, ops::Bound) {} diff --git a/cudart/src/slice/iter.rs b/cudart/src/slice/iter.rs deleted file mode 100644 index a6e7c14..0000000 --- a/cudart/src/slice/iter.rs +++ /dev/null @@ -1,163 +0,0 @@ -use std::iter::{FusedIterator, TrustedLen, TrustedRandomAccess, TrustedRandomAccessNoCoerce}; -use std::slice; - -use crate::slice::cuda_slice::{CudaSlice, CudaSliceMut}; -use crate::slice::device_slice::DeviceSlice; - -#[repr(transparent)] -pub struct Chunks<'a, T>(slice::Chunks<'a, T>); - -impl<'a, T: 'a> Chunks<'a, T> { - #[inline] - pub(super) fn new(slice: &'a DeviceSlice, size: usize) -> Self { - unsafe { Self(slice.as_slice().chunks(size)) } - } - - #[inline] - fn map_option(option: Option<&'a [T]>) -> Option<&'a DeviceSlice> { - unsafe { option.map(|s| DeviceSlice::from_slice(s)) } - } -} - -impl Clone for Chunks<'_, T> { - fn clone(&self) -> Self { - Self(self.0.clone()) - } -} - -impl<'a, T> Iterator for Chunks<'a, T> { - type Item = &'a DeviceSlice; - - #[inline] - fn next(&mut self) -> Option { - Self::map_option(self.0.next()) - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - self.0.size_hint() - } - - #[inline] - fn count(self) -> usize { - self.0.len() - } - - #[inline] - fn last(self) -> Option { - Self::map_option(self.0.last()) - } - - #[inline] - fn nth(&mut self, n: usize) -> Option { - Self::map_option(self.0.nth(n)) - } - - unsafe fn __iterator_get_unchecked(&mut self, idx: usize) -> Self::Item { - DeviceSlice::from_slice(self.0.__iterator_get_unchecked(idx)) - } -} - -impl<'a, T> DoubleEndedIterator for Chunks<'a, T> { - #[inline] - fn next_back(&mut self) -> Option { - Self::map_option(self.0.next_back()) - } - - #[inline] - fn nth_back(&mut self, n: usize) -> Option { - Self::map_option(self.0.nth_back(n)) - } -} - -impl ExactSizeIterator for Chunks<'_, T> {} - -unsafe impl TrustedLen for Chunks<'_, T> {} - -impl FusedIterator for Chunks<'_, T> {} - -#[doc(hidden)] -unsafe impl<'a, T> TrustedRandomAccess for Chunks<'a, T> {} - -#[doc(hidden)] -unsafe impl<'a, T> TrustedRandomAccessNoCoerce for Chunks<'a, T> { - const MAY_HAVE_SIDE_EFFECT: bool = false; -} - -#[repr(transparent)] -pub struct ChunksMut<'a, T>(slice::ChunksMut<'a, T>); - -impl<'a, T: 'a> ChunksMut<'a, T> { - #[inline] - pub(super) fn new(slice: &'a mut DeviceSlice, size: usize) -> Self { - unsafe { Self(slice.as_mut_slice().chunks_mut(size)) } - } - - #[inline] - fn map_option(option: Option<&'a mut [T]>) -> Option<&'a mut DeviceSlice> { - unsafe { option.map(|s| DeviceSlice::from_mut_slice(s)) } - } -} - -impl<'a, T> Iterator for ChunksMut<'a, T> { - type Item = &'a mut DeviceSlice; - - #[inline] - fn next(&mut self) -> Option { - Self::map_option(self.0.next()) - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - self.0.size_hint() - } - - #[inline] - fn count(self) -> usize { - self.len() - } - - #[inline] - fn last(self) -> Option { - Self::map_option(self.0.last()) - } - - #[inline] - fn nth(&mut self, n: usize) -> Option { - Self::map_option(self.0.nth(n)) - } - - unsafe fn __iterator_get_unchecked(&mut self, idx: usize) -> Self::Item { - DeviceSlice::from_mut_slice(self.0.__iterator_get_unchecked(idx)) - } -} - -impl<'a, T> DoubleEndedIterator for ChunksMut<'a, T> { - #[inline] - fn next_back(&mut self) -> Option { - Self::map_option(self.0.next_back()) - } - - #[inline] - fn nth_back(&mut self, n: usize) -> Option { - Self::map_option(self.0.nth_back(n)) - } -} - -impl ExactSizeIterator for ChunksMut<'_, T> {} - -unsafe impl TrustedLen for ChunksMut<'_, T> {} - -impl FusedIterator for ChunksMut<'_, T> {} - -#[doc(hidden)] -unsafe impl<'a, T> TrustedRandomAccess for ChunksMut<'a, T> {} - -#[doc(hidden)] -unsafe impl<'a, T> TrustedRandomAccessNoCoerce for ChunksMut<'a, T> { - const MAY_HAVE_SIDE_EFFECT: bool = false; -} - -unsafe impl Send for ChunksMut<'_, T> where T: Send {} - -unsafe impl Sync for ChunksMut<'_, T> where T: Sync {} diff --git a/cudart/src/slice/mod.rs b/cudart/src/slice/mod.rs deleted file mode 100644 index 0b80b89..0000000 --- a/cudart/src/slice/mod.rs +++ /dev/null @@ -1,15 +0,0 @@ -pub(crate) use allocation_data::AllocationData; -pub use cuda_slice::CudaSlice; -pub use cuda_slice::CudaSliceMut; -pub use cuda_variable::CudaVariable; -pub use cuda_variable::CudaVariableMut; -pub use device_slice::DeviceSlice; -pub use device_variable::DeviceVariable; - -mod allocation_data; -mod cuda_slice; -mod cuda_variable; -mod device_slice; -mod device_variable; -mod index; -mod iter; diff --git a/cudart/src/stream.rs b/cudart/src/stream.rs deleted file mode 100644 index a696db5..0000000 --- a/cudart/src/stream.rs +++ /dev/null @@ -1,218 +0,0 @@ -// stream management -// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html - -use std::mem::{self, MaybeUninit}; -use std::ptr::null_mut; - -use bitflags::bitflags; - -use cudart_sys::*; - -use crate::event::CudaEvent; -use crate::execution::CudaLaunchAttribute; -use crate::result::{CudaResult, CudaResultWrap}; - -#[repr(transparent)] -#[derive(Debug)] -pub struct CudaStream { - handle: cudaStream_t, -} - -bitflags! { - #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] - pub struct CudaStreamCreateFlags: u32 { - const DEFAULT = cudart_sys::cudaStreamDefault; - const NON_BLOCKING = cudart_sys::cudaStreamNonBlocking; - } -} - -impl Default for CudaStreamCreateFlags { - fn default() -> Self { - Self::DEFAULT - } -} - -bitflags! { - #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] - pub struct CudaStreamWaitEventFlags: u32 { - const DEFAULT = cudart_sys::cudaEventWaitDefault; - const WAIT_EXTERNAL = cudart_sys::cudaEventWaitExternal; - } -} - -impl Default for CudaStreamWaitEventFlags { - fn default() -> Self { - Self::DEFAULT - } -} - -impl CudaStream { - fn from_handle(handle: cudaStream_t) -> Self { - Self { handle } - } - - pub fn create() -> CudaResult { - let mut handle = MaybeUninit::::uninit(); - unsafe { - cudaStreamCreate(handle.as_mut_ptr()) - .wrap_maybe_uninit(handle) - .map(CudaStream::from_handle) - } - } - - pub fn create_with_flags(flags: CudaStreamCreateFlags) -> CudaResult { - let mut handle = MaybeUninit::::uninit(); - unsafe { - cudaStreamCreateWithFlags(handle.as_mut_ptr(), flags.bits()) - .wrap_maybe_uninit(handle) - .map(CudaStream::from_handle) - } - } - - pub fn destroy(self) -> CudaResult<()> { - let handle = self.handle; - mem::forget(self); - if handle.is_null() { - Ok(()) - } else { - unsafe { cudaStreamDestroy(handle).wrap() } - } - } - - pub fn get_attribute(&self, id: CudaLaunchAttributeID) -> CudaResult { - let mut value = MaybeUninit::::uninit(); - unsafe { - cudaStreamGetAttribute(self.handle, id, value.as_mut_ptr()) - .wrap_maybe_uninit(value) - .map(|val| CudaLaunchAttribute::from_id_and_value(id, val)) - } - } - - pub fn query(&self) -> CudaResult { - let error = unsafe { cudaStreamQuery(self.handle) }; - match error { - CudaError::Success => Ok(true), - CudaError::ErrorNotReady => Ok(false), - _ => Err(error), - } - } - - pub fn set_attribute(&self, attribute: CudaLaunchAttribute) -> CudaResult<()> { - let (id, value) = attribute.into_id_and_value(); - unsafe { cudaStreamSetAttribute(self.handle, id, &value as *const _).wrap() } - } - - pub fn synchronize(&self) -> CudaResult<()> { - unsafe { cudaStreamSynchronize(self.handle).wrap() } - } - - pub fn wait_event(&self, event: &CudaEvent, flags: CudaStreamWaitEventFlags) -> CudaResult<()> { - unsafe { cudaStreamWaitEvent(self.handle, event.into(), flags.bits()).wrap() } - } -} - -impl Default for CudaStream { - fn default() -> Self { - Self { handle: null_mut() } - } -} - -impl Drop for CudaStream { - fn drop(&mut self) { - let handle = self.handle; - if handle.is_null() { - return; - } - unsafe { cudaStreamDestroy(handle).eprint_error_and_backtrace() }; - } -} - -impl From<&CudaStream> for cudaStream_t { - fn from(stream: &CudaStream) -> Self { - stream.handle - } -} - -#[cfg(test)] -mod tests { - use std::thread; - use std::time::Duration; - - use serial_test::serial; - - use crate::execution::{launch_host_fn, HostFn}; - - use super::*; - - #[test] - #[serial] - fn create_is_ok() { - let result = CudaStream::create(); - assert!(result.is_ok()); - } - - #[test] - #[serial] - fn create_handle_is_not_null() { - let stream = CudaStream::create().unwrap(); - assert_ne!(stream.handle, null_mut()); - } - - #[test] - #[serial] - fn create_with_flags_is_ok() { - let result = CudaStream::create_with_flags(CudaStreamCreateFlags::NON_BLOCKING); - assert!(result.is_ok()); - } - - #[test] - #[serial] - fn create_with_flags_handle_is_not_null() { - let stream = CudaStream::create_with_flags(CudaStreamCreateFlags::NON_BLOCKING).unwrap(); - assert_ne!(stream.handle, null_mut()); - } - - #[test] - #[serial] - fn destroy_is_ok() { - let stream = CudaStream::create().unwrap(); - let result = stream.destroy(); - assert!(result.is_ok()); - } - - #[test] - #[serial] - fn query_is_true() { - let stream = CudaStream::create().unwrap(); - let result = stream.query(); - assert_eq!(result, Ok(true)); - } - - #[test] - #[serial] - fn query_is_false() { - let stream = CudaStream::create().unwrap(); - let func = HostFn::new(|| thread::sleep(Duration::from_millis(100))); - launch_host_fn(&stream, &func).unwrap(); - let result = stream.query(); - assert_eq!(result, Ok(false)); - } - - #[test] - #[serial] - fn synchronize_is_ok() { - let stream = CudaStream::create().unwrap(); - let result = stream.synchronize(); - assert_eq!(result, Ok(())); - } - - #[test] - #[serial] - fn wait_event_is_ok() { - let stream = CudaStream::create().unwrap(); - let event = CudaEvent::create().unwrap(); - event.record(&stream).unwrap(); - let result = stream.wait_event(&event, CudaStreamWaitEventFlags::DEFAULT); - assert_eq!(result, Ok(())); - } -} diff --git a/cudart/src/unified.rs b/cudart/src/unified.rs deleted file mode 100644 index 78ea2ee..0000000 --- a/cudart/src/unified.rs +++ /dev/null @@ -1,58 +0,0 @@ -// Unified Addressing -// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__UNIFIED.html - -use core::ffi::c_void; -use std::mem::MaybeUninit; - -use cudart_sys::*; - -use crate::result::{CudaResult, CudaResultWrap}; -use crate::slice::CudaSlice; - -pub fn pointer_get_attributes( - slice: &(impl CudaSlice + ?Sized), -) -> CudaResult { - let mut attributes = MaybeUninit::::uninit(); - unsafe { - cudaPointerGetAttributes( - attributes.as_mut_ptr(), - slice.as_c_void_ptr() as *mut c_void, - ) - .wrap_maybe_uninit(attributes) - } -} - -#[cfg(test)] -mod tests { - use serial_test::serial; - - use crate::memory::*; - - use super::*; - - #[test] - #[serial] - fn pointer_is_unregistered() { - let values = [0u32]; - let attributes = pointer_get_attributes(&values).unwrap(); - assert_eq!(attributes.type_, CudaMemoryType::Unregistered); - } - - #[test] - #[serial] - fn pointer_is_host() { - let values = [0u32]; - let registration = - HostRegistration::register(&values, CudaHostRegisterFlags::DEFAULT).unwrap(); - let attributes = pointer_get_attributes(®istration).unwrap(); - assert_eq!(attributes.type_, CudaMemoryType::Host); - } - - #[test] - #[serial] - fn pointer_is_device() { - let allocation = DeviceAllocation::::alloc(1).unwrap(); - let attributes = pointer_get_attributes(&allocation).unwrap(); - assert_eq!(attributes.type_, CudaMemoryType::Device); - } -} diff --git a/boojum-cuda/native/.clang-format b/native/.clang-format similarity index 100% rename from boojum-cuda/native/.clang-format rename to native/.clang-format diff --git a/boojum-cuda/native/.gitignore b/native/.gitignore similarity index 100% rename from boojum-cuda/native/.gitignore rename to native/.gitignore diff --git a/boojum-cuda/native/CMakeLists.txt b/native/CMakeLists.txt similarity index 100% rename from boojum-cuda/native/CMakeLists.txt rename to native/CMakeLists.txt diff --git a/boojum-cuda/native/barycentric.cu b/native/barycentric.cu similarity index 100% rename from boojum-cuda/native/barycentric.cu rename to native/barycentric.cu diff --git a/boojum-cuda/native/blake2s.cu b/native/blake2s.cu similarity index 100% rename from boojum-cuda/native/blake2s.cu rename to native/blake2s.cu diff --git a/boojum-cuda/native/carry_chain.cuh b/native/carry_chain.cuh similarity index 100% rename from boojum-cuda/native/carry_chain.cuh rename to native/carry_chain.cuh diff --git a/boojum-cuda/native/common.cuh b/native/common.cuh similarity index 100% rename from boojum-cuda/native/common.cuh rename to native/common.cuh diff --git a/boojum-cuda/native/context.cu b/native/context.cu similarity index 100% rename from boojum-cuda/native/context.cu rename to native/context.cu diff --git a/boojum-cuda/native/context.cuh b/native/context.cuh similarity index 100% rename from boojum-cuda/native/context.cuh rename to native/context.cuh diff --git a/boojum-cuda/native/gates.cuh b/native/gates.cuh similarity index 100% rename from boojum-cuda/native/gates.cuh rename to native/gates.cuh diff --git a/boojum-cuda/native/gates_poseidon.cuh b/native/gates_poseidon.cuh similarity index 100% rename from boojum-cuda/native/gates_poseidon.cuh rename to native/gates_poseidon.cuh diff --git a/boojum-cuda/native/gates_template.cu b/native/gates_template.cu similarity index 100% rename from boojum-cuda/native/gates_template.cu rename to native/gates_template.cu diff --git a/boojum-cuda/native/goldilocks.cuh b/native/goldilocks.cuh similarity index 100% rename from boojum-cuda/native/goldilocks.cuh rename to native/goldilocks.cuh diff --git a/boojum-cuda/native/goldilocks_extension.cu b/native/goldilocks_extension.cu similarity index 100% rename from boojum-cuda/native/goldilocks_extension.cu rename to native/goldilocks_extension.cu diff --git a/boojum-cuda/native/goldilocks_extension.cuh b/native/goldilocks_extension.cuh similarity index 100% rename from boojum-cuda/native/goldilocks_extension.cuh rename to native/goldilocks_extension.cuh diff --git a/boojum-cuda/native/memory.cuh b/native/memory.cuh similarity index 100% rename from boojum-cuda/native/memory.cuh rename to native/memory.cuh diff --git a/boojum-cuda/native/ntt.cu b/native/ntt.cu similarity index 100% rename from boojum-cuda/native/ntt.cu rename to native/ntt.cu diff --git a/boojum-cuda/native/ntt_b2n.cuh b/native/ntt_b2n.cuh similarity index 100% rename from boojum-cuda/native/ntt_b2n.cuh rename to native/ntt_b2n.cuh diff --git a/boojum-cuda/native/ntt_n2b.cuh b/native/ntt_n2b.cuh similarity index 100% rename from boojum-cuda/native/ntt_n2b.cuh rename to native/ntt_n2b.cuh diff --git a/boojum-cuda/native/ops_complex.cu b/native/ops_complex.cu similarity index 100% rename from boojum-cuda/native/ops_complex.cu rename to native/ops_complex.cu diff --git a/boojum-cuda/native/ops_complex.cuh b/native/ops_complex.cuh similarity index 100% rename from boojum-cuda/native/ops_complex.cuh rename to native/ops_complex.cuh diff --git a/boojum-cuda/native/ops_cub/common.cuh b/native/ops_cub/common.cuh similarity index 100% rename from boojum-cuda/native/ops_cub/common.cuh rename to native/ops_cub/common.cuh diff --git a/boojum-cuda/native/ops_cub/device_radix_sort.cu b/native/ops_cub/device_radix_sort.cu similarity index 100% rename from boojum-cuda/native/ops_cub/device_radix_sort.cu rename to native/ops_cub/device_radix_sort.cu diff --git a/boojum-cuda/native/ops_cub/device_reduce.cu b/native/ops_cub/device_reduce.cu similarity index 100% rename from boojum-cuda/native/ops_cub/device_reduce.cu rename to native/ops_cub/device_reduce.cu diff --git a/boojum-cuda/native/ops_cub/device_run_length_encode.cu b/native/ops_cub/device_run_length_encode.cu similarity index 100% rename from boojum-cuda/native/ops_cub/device_run_length_encode.cu rename to native/ops_cub/device_run_length_encode.cu diff --git a/boojum-cuda/native/ops_cub/device_scan.cu b/native/ops_cub/device_scan.cu similarity index 100% rename from boojum-cuda/native/ops_cub/device_scan.cu rename to native/ops_cub/device_scan.cu diff --git a/boojum-cuda/native/ops_simple.cu b/native/ops_simple.cu similarity index 100% rename from boojum-cuda/native/ops_simple.cu rename to native/ops_simple.cu diff --git a/boojum-cuda/native/poseidon2_cooperative.cu b/native/poseidon2_cooperative.cu similarity index 100% rename from boojum-cuda/native/poseidon2_cooperative.cu rename to native/poseidon2_cooperative.cu diff --git a/boojum-cuda/native/poseidon2_single_thread.cu b/native/poseidon2_single_thread.cu similarity index 100% rename from boojum-cuda/native/poseidon2_single_thread.cu rename to native/poseidon2_single_thread.cu diff --git a/boojum-cuda/native/poseidon2_single_thread.cuh b/native/poseidon2_single_thread.cuh similarity index 100% rename from boojum-cuda/native/poseidon2_single_thread.cuh rename to native/poseidon2_single_thread.cuh diff --git a/boojum-cuda/native/poseidon_common.cu b/native/poseidon_common.cu similarity index 54% rename from boojum-cuda/native/poseidon_common.cu rename to native/poseidon_common.cu index b68fbc6..9bec51d 100644 --- a/boojum-cuda/native/poseidon_common.cu +++ b/native/poseidon_common.cu @@ -1,9 +1,11 @@ #include "goldilocks.cuh" +#include "poseidon_constants.cuh" namespace poseidon { using namespace goldilocks; using namespace memory; +using namespace poseidon_common; EXTERN __global__ void gather_rows_kernel(const unsigned *indexes, const unsigned indexes_count, const matrix_getter values, matrix_setter results) { @@ -17,20 +19,20 @@ EXTERN __global__ void gather_rows_kernel(const unsigned *indexes, const unsigne results.set(dst_row, col, values.get(src_row, col)); } -EXTERN __global__ void gather_merkle_paths_kernel(const unsigned *indexes, const unsigned indexes_count, - const matrix_getter values, matrix_setter results) { +EXTERN __global__ void gather_merkle_paths_kernel(const unsigned *indexes, const unsigned indexes_count, const base_field *values, + const unsigned log_leaves_count, base_field *results) { const unsigned idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx >= indexes_count) return; - const unsigned col = threadIdx.y; - const unsigned layers_count = gridDim.y; - const unsigned layer_from_leaves = blockIdx.y; + const unsigned col = blockIdx.y; + const unsigned layer_index = blockIdx.z; + const unsigned layer_offset = (CAPACITY << (log_leaves_count + 1)) - (CAPACITY << (log_leaves_count + 1 - layer_index)); + const unsigned col_offset = col << (log_leaves_count - layer_index); const unsigned leaf_index = indexes[idx]; - const unsigned layer_offset = (1 << (layers_count + 1)) - (1 << (layers_count + 1 - layer_from_leaves)); - const unsigned hash_index = (leaf_index >> layer_from_leaves) ^ 1; - const unsigned src_row = layer_offset + hash_index; - const unsigned dst_row = layer_from_leaves * indexes_count + idx; - results.set(dst_row, col, values.get(src_row, col)); + const unsigned hash_index = (leaf_index >> layer_index) ^ 1; + const unsigned src_index = layer_offset + col_offset + hash_index; + const unsigned dst_index = layer_index * indexes_count * CAPACITY + indexes_count * col + idx; + results[dst_index] = values[src_index]; } } // namespace poseidon \ No newline at end of file diff --git a/boojum-cuda/native/poseidon_constants_template.cuh b/native/poseidon_constants_template.cuh similarity index 100% rename from boojum-cuda/native/poseidon_constants_template.cuh rename to native/poseidon_constants_template.cuh diff --git a/boojum-cuda/native/poseidon_cooperative.cu b/native/poseidon_cooperative.cu similarity index 100% rename from boojum-cuda/native/poseidon_cooperative.cu rename to native/poseidon_cooperative.cu diff --git a/boojum-cuda/native/poseidon_single_thread.cu b/native/poseidon_single_thread.cu similarity index 100% rename from boojum-cuda/native/poseidon_single_thread.cu rename to native/poseidon_single_thread.cu diff --git a/boojum-cuda/native/poseidon_single_thread.cuh b/native/poseidon_single_thread.cuh similarity index 100% rename from boojum-cuda/native/poseidon_single_thread.cuh rename to native/poseidon_single_thread.cuh diff --git a/boojum-cuda/native/poseidon_utils.cuh b/native/poseidon_utils.cuh similarity index 100% rename from boojum-cuda/native/poseidon_utils.cuh rename to native/poseidon_utils.cuh diff --git a/boojum-cuda/native/ptx.cuh b/native/ptx.cuh similarity index 100% rename from boojum-cuda/native/ptx.cuh rename to native/ptx.cuh diff --git a/boojum-cuda/src/.gitignore b/src/.gitignore similarity index 100% rename from boojum-cuda/src/.gitignore rename to src/.gitignore diff --git a/boojum-cuda/src/barycentric.rs b/src/barycentric.rs similarity index 99% rename from boojum-cuda/src/barycentric.rs rename to src/barycentric.rs index 2415524..5a0e5e6 100644 --- a/boojum-cuda/src/barycentric.rs +++ b/src/barycentric.rs @@ -91,6 +91,7 @@ pub trait PrecomputeImpl { GoldilocksField, u32, ); + #[allow(clippy::type_complexity)] fn get_precompute_kernel() -> unsafe extern "C" fn( *const ::Type, *const ::Type, @@ -181,6 +182,7 @@ pub trait EvalImpl { type XVec: DeviceRepr; type YsVec: DeviceRepr; fn get_partial_reduce_elems_per_thread() -> u32; + #[allow(clippy::type_complexity)] fn get_partial_reduce_kernel() -> unsafe extern "C" fn( PtrAndStride<::Type>, PtrAndStride<::Type>, diff --git a/boojum-cuda/src/blake2s.rs b/src/blake2s.rs similarity index 100% rename from boojum-cuda/src/blake2s.rs rename to src/blake2s.rs diff --git a/boojum-cuda/src/context.rs b/src/context.rs similarity index 100% rename from boojum-cuda/src/context.rs rename to src/context.rs diff --git a/boojum-cuda/src/device_structures.rs b/src/device_structures.rs similarity index 100% rename from boojum-cuda/src/device_structures.rs rename to src/device_structures.rs diff --git a/boojum-cuda/src/extension_field.rs b/src/extension_field.rs similarity index 100% rename from boojum-cuda/src/extension_field.rs rename to src/extension_field.rs diff --git a/boojum-cuda/src/gates.rs b/src/gates.rs similarity index 100% rename from boojum-cuda/src/gates.rs rename to src/gates.rs diff --git a/boojum-cuda/src/gates_data_template.rs b/src/gates_data_template.rs similarity index 100% rename from boojum-cuda/src/gates_data_template.rs rename to src/gates_data_template.rs diff --git a/boojum-cuda/src/lib.rs b/src/lib.rs similarity index 100% rename from boojum-cuda/src/lib.rs rename to src/lib.rs diff --git a/boojum-cuda/src/ntt.rs b/src/ntt.rs similarity index 100% rename from boojum-cuda/src/ntt.rs rename to src/ntt.rs diff --git a/boojum-cuda/src/ops_complex.rs b/src/ops_complex.rs similarity index 99% rename from boojum-cuda/src/ops_complex.rs rename to src/ops_complex.rs index 2032c28..fed079b 100644 --- a/boojum-cuda/src/ops_complex.rs +++ b/src/ops_complex.rs @@ -1694,7 +1694,7 @@ mod tests { .into_iter() .enumerate() .map(|(i, x)| x * generator.pow_u64(SHIFT as u64 * i as u64)) - .zip(h_dst.into_iter()) + .zip(h_dst) .for_each(assert_equal); context.destroy().unwrap(); } @@ -1755,9 +1755,8 @@ mod tests { .into_iter() .chunks(ROWS) .into_iter() - .zip(h_dst.into_iter().chunks(ROWS).into_iter()) + .zip(h_dst.chunks(ROWS)) .for_each(|(s, d)| { - let d = d.collect_vec(); s.enumerate() .map(|(i, x)| (x, d[i.reverse_bits() >> (usize::BITS - LOG_ROWS as u32)])) .for_each(assert_equal); @@ -1817,7 +1816,7 @@ mod tests { } fn test_batch_inv_ef(in_place: bool) { - type VEF = VectorizedExtensionField; + type Vef = VectorizedExtensionField; const LOG_N: usize = 16; const N: usize = 1 << LOG_N; let h_src_bf = Uniform::new(0, GoldilocksField::ORDER) @@ -1830,16 +1829,16 @@ mod tests { if in_place { let mut d_values_bf = DeviceAllocation::alloc(2 * N).unwrap(); memory_copy_async(&mut d_values_bf, &h_src_bf, &stream).unwrap(); - let mut d_values_ef = unsafe { d_values_bf.transmute_mut::() }; - super::batch_inv_in_place::(&mut d_values_ef, &stream).unwrap(); + let d_values_ef = unsafe { d_values_bf.transmute_mut::() }; + super::batch_inv_in_place::(d_values_ef, &stream).unwrap(); memory_copy_async(&mut h_dst_bf, &d_values_bf, &stream).unwrap(); } else { let mut d_src_bf = DeviceAllocation::alloc(2 * N).unwrap(); let mut d_dst_bf = DeviceAllocation::alloc(2 * N).unwrap(); memory_copy_async(&mut d_src_bf, &h_src_bf, &stream).unwrap(); - let d_src_ef = unsafe { d_src_bf.transmute::() }; - let mut d_dst_ef = unsafe { d_dst_bf.transmute_mut::() }; - super::batch_inv::(&d_src_ef, &mut d_dst_ef, &stream).unwrap(); + let d_src_ef = unsafe { d_src_bf.transmute::() }; + let d_dst_ef = unsafe { d_dst_bf.transmute_mut::() }; + super::batch_inv::(d_src_ef, d_dst_ef, &stream).unwrap(); memory_copy_async(&mut h_dst_bf, &d_dst_bf, &stream).unwrap(); } stream.synchronize().unwrap(); diff --git a/boojum-cuda/src/ops_cub/device_radix_sort.rs b/src/ops_cub/device_radix_sort.rs similarity index 98% rename from boojum-cuda/src/ops_cub/device_radix_sort.rs rename to src/ops_cub/device_radix_sort.rs index e369c4f..6ecaf84 100644 --- a/boojum-cuda/src/ops_cub/device_radix_sort.rs +++ b/src/ops_cub/device_radix_sort.rs @@ -591,14 +591,8 @@ mod tests { memory_copy_async(&mut h_keys_out, &d_keys_out, &stream).unwrap(); memory_copy_async(&mut h_values_out, &d_values_out, &stream).unwrap(); stream.synchronize().unwrap(); - let mut pairs_in = h_keys_in - .into_iter() - .zip(h_values_in.into_iter()) - .collect_vec(); - let pairs_out = h_keys_out - .into_iter() - .zip(h_values_out.into_iter()) - .collect_vec(); + let mut pairs_in = h_keys_in.into_iter().zip(h_values_in).collect_vec(); + let pairs_out = h_keys_out.into_iter().zip(h_values_out).collect_vec(); pairs_in.sort_by_key(|(k, _)| k.clone()); if descending { pairs_in.reverse() diff --git a/boojum-cuda/src/ops_cub/device_reduce.rs b/src/ops_cub/device_reduce.rs similarity index 100% rename from boojum-cuda/src/ops_cub/device_reduce.rs rename to src/ops_cub/device_reduce.rs diff --git a/boojum-cuda/src/ops_cub/device_run_length_encode.rs b/src/ops_cub/device_run_length_encode.rs similarity index 100% rename from boojum-cuda/src/ops_cub/device_run_length_encode.rs rename to src/ops_cub/device_run_length_encode.rs diff --git a/boojum-cuda/src/ops_cub/device_scan.rs b/src/ops_cub/device_scan.rs similarity index 99% rename from boojum-cuda/src/ops_cub/device_scan.rs rename to src/ops_cub/device_scan.rs index 810fd3b..f668cf7 100644 --- a/boojum-cuda/src/ops_cub/device_scan.rs +++ b/src/ops_cub/device_scan.rs @@ -844,10 +844,10 @@ mod tests { h_in.into_iter() .chunks(NUM_ITEMS) .into_iter() - .zip(h_out.into_iter().chunks(NUM_ITEMS).into_iter()) + .zip(h_out.chunks(NUM_ITEMS)) .for_each(|(h_in, h_out)| { let h_in = h_in.collect_vec(); - let h_out = h_out.collect_vec(); + let h_out = Vec::from(h_out); verify(operation, inclusive, reverse, h_in, h_out); }); } diff --git a/boojum-cuda/src/ops_cub/mod.rs b/src/ops_cub/mod.rs similarity index 100% rename from boojum-cuda/src/ops_cub/mod.rs rename to src/ops_cub/mod.rs diff --git a/boojum-cuda/src/ops_simple.rs b/src/ops_simple.rs similarity index 100% rename from boojum-cuda/src/ops_simple.rs rename to src/ops_simple.rs diff --git a/boojum-cuda/src/poseidon.rs b/src/poseidon.rs similarity index 92% rename from boojum-cuda/src/poseidon.rs rename to src/poseidon.rs index a8de969..e3fdee4 100644 --- a/boojum-cuda/src/poseidon.rs +++ b/src/poseidon.rs @@ -1,7 +1,9 @@ use boojum::field::goldilocks::GoldilocksField; use boojum::implementations::poseidon_goldilocks_params::*; -use cudart::execution::{KernelFourArgs, KernelLaunch, KernelSevenArgs, KernelThreeArgs}; +use cudart::execution::{ + KernelFiveArgs, KernelFourArgs, KernelLaunch, KernelSevenArgs, KernelThreeArgs, +}; use cudart::result::CudaResult; use cudart::slice::DeviceSlice; use cudart::stream::CudaStream; @@ -79,8 +81,9 @@ extern "C" { fn gather_merkle_paths_kernel( indexes: *const u32, indexes_count: u32, - values: PtrAndStride, - results: MutPtrAndStride, + values: *const GoldilocksField, + log_leaves_count: u32, + results: *mut GoldilocksField, ); } @@ -417,31 +420,38 @@ pub fn gather_rows( pub fn gather_merkle_paths( indexes: &DeviceSlice, - values: &(impl DeviceMatrixChunkImpl + ?Sized), - result: &mut (impl DeviceMatrixChunkMutImpl + ?Sized), + values: &DeviceSlice, + results: &mut DeviceSlice, + layers_count: u32, stream: &CudaStream, ) -> CudaResult<()> { - assert_eq!(values.cols(), CAPACITY); - assert_eq!(result.cols(), CAPACITY); - let indexes_len = indexes.len(); - let values_rows = values.rows(); - let result_rows = result.rows(); - assert_eq!(result_rows % indexes_len, 0); - let layers_count = result_rows / indexes_len; - assert_eq!(values_rows, 1 << (layers_count + 1)); - assert_eq!(WARP_SIZE % CAPACITY as u32, 0); - assert!(indexes_len <= u32::MAX as usize); - let indexes_count = indexes_len as u32; - let (grid_dim, block_dim) = - get_grid_block_dims_for_threads_count(WARP_SIZE / CAPACITY as u32, indexes_count); - let grid_dim = (grid_dim.x, layers_count as u32).into(); - let block_dim = (block_dim.x, CAPACITY as u32).into(); + assert!(indexes.len() <= u32::MAX as usize); + let indexes_count = indexes.len() as u32; + assert_eq!(values.len() % CAPACITY, 0); + let values_count = values.len() / CAPACITY; + assert!(values_count.is_power_of_two()); + let log_values_count = values_count.trailing_zeros(); + assert_ne!(log_values_count, 0); + let log_leaves_count = log_values_count - 1; + assert!(layers_count < log_leaves_count); + assert_eq!( + indexes.len() * layers_count as usize * CAPACITY, + results.len() + ); + let (grid_dim, block_dim) = get_grid_block_dims_for_threads_count(WARP_SIZE, indexes_count); + let grid_dim = (grid_dim.x, CAPACITY as u32, layers_count).into(); let indexes = indexes.as_ptr(); - let values = values.as_ptr_and_stride(); - let result = result.as_mut_ptr_and_stride(); - let args = (&indexes, &indexes_count, &values, &result); + let values = values.as_ptr(); + let result = results.as_mut_ptr(); + let args = ( + &indexes, + &indexes_count, + &values, + &log_leaves_count, + &result, + ); unsafe { - KernelFourArgs::launch( + KernelFiveArgs::launch( gather_merkle_paths_kernel, grid_dim, block_dim, @@ -458,9 +468,11 @@ mod tests { use boojum::field::{Field, U64Representable}; use boojum::implementations::poseidon2::state_generic_impl::State; + use itertools::Itertools; use rand::Rng; use cudart::memory::{memory_copy_async, DeviceAllocation}; + use cudart::slice::CudaSlice; // use boojum::implementations::poseidon_goldilocks::poseidon_permutation_optimized; use crate::device_structures::{DeviceMatrix, DeviceMatrixMut}; @@ -927,13 +939,13 @@ mod tests { fn gather_merkle_paths() { const LOG_LEAVES_COUNT: usize = 12; const INDEXES_COUNT: usize = 42; + const LAYERS_COUNT: usize = LOG_LEAVES_COUNT - 4; let mut rng = rand::thread_rng(); let mut indexes_host = vec![0; INDEXES_COUNT]; - indexes_host.fill_with(|| rng.gen_range(0..INDEXES_COUNT as u32)); + indexes_host.fill_with(|| rng.gen_range(0..1u32 << LOG_LEAVES_COUNT)); let mut values_host = vec![GoldilocksField::ZERO; CAPACITY << (LOG_LEAVES_COUNT + 1)]; values_host.fill_with(|| GoldilocksField(rng.gen())); - let mut results_host = - vec![GoldilocksField::ZERO; CAPACITY * INDEXES_COUNT * LOG_LEAVES_COUNT]; + let mut results_host = vec![GoldilocksField::ZERO; CAPACITY * INDEXES_COUNT * LAYERS_COUNT]; let stream = CudaStream::default(); let mut indexes_device = DeviceAllocation::::alloc(indexes_host.len()).unwrap(); let mut values_device = @@ -944,40 +956,37 @@ mod tests { memory_copy_async(&mut values_device, &values_host, &stream).unwrap(); super::gather_merkle_paths( &indexes_device, - &DeviceMatrix::new(&values_device, 1 << (LOG_LEAVES_COUNT + 1)), - &mut DeviceMatrixMut::new(&mut results_device, INDEXES_COUNT * LOG_LEAVES_COUNT), + &values_device, + &mut results_device, + LAYERS_COUNT as u32, &stream, ) .unwrap(); memory_copy_async(&mut results_host, &results_device, &stream).unwrap(); stream.synchronize().unwrap(); fn verify_merkle_path( + indexes: &[u32], values: &[GoldilocksField], results: &[GoldilocksField], - row_index: usize, - leaf_index: usize, ) { - let log_leaves_count = values.len().trailing_zeros() - 1; - let sibling_index = leaf_index ^ 1; - let expected = values[sibling_index]; - let actual = results[row_index]; - assert_eq!(expected, actual); - if log_leaves_count > 1 { - verify_merkle_path( - &values[values.len() >> 1..], - &results[INDEXES_COUNT..], - row_index, - leaf_index >> 1, - ); + let (values, values_next) = values.split_at(values.len() >> 1); + let (results, results_next) = results.split_at(INDEXES_COUNT * CAPACITY); + values + .chunks(values.len() / CAPACITY) + .zip(results.chunks(results.len() / CAPACITY)) + .for_each(|(values, results)| { + for (row_index, &index) in indexes.iter().enumerate() { + let sibling_index = index ^ 1; + let expected = values[sibling_index as usize]; + let actual = results[row_index]; + assert_eq!(expected, actual); + } + }); + if !results_next.is_empty() { + let indexes_next = indexes.iter().map(|&x| x >> 1).collect_vec(); + verify_merkle_path(&indexes_next, &values_next, &results_next); } } - values_host - .chunks(1 << (LOG_LEAVES_COUNT + 1)) - .zip(results_host.chunks(INDEXES_COUNT * LOG_LEAVES_COUNT)) - .for_each(|(values, results)| { - for (row_index, &leaf_index) in indexes_host.iter().enumerate() { - verify_merkle_path(values, results, row_index, leaf_index as usize); - } - }); + verify_merkle_path(&indexes_host, &values_host, &results_host); } } diff --git a/boojum-cuda/src/tests_helpers.rs b/src/tests_helpers.rs similarity index 100% rename from boojum-cuda/src/tests_helpers.rs rename to src/tests_helpers.rs diff --git a/boojum-cuda/src/utils.rs b/src/utils.rs similarity index 100% rename from boojum-cuda/src/utils.rs rename to src/utils.rs