Skip to content
This repository has been archived by the owner on Aug 16, 2024. It is now read-only.

build refactoring #36

Merged
merged 7 commits into from
Aug 6, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
428 changes: 139 additions & 289 deletions Cargo.lock

Large diffs are not rendered by default.

12 changes: 6 additions & 6 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "boojum-cuda"
version = "0.1.1"
name = "era_boojum_cuda"
robik75 marked this conversation as resolved.
Show resolved Hide resolved
version = "0.2.0"
edition = "2021"
build = "build/main.rs"
authors = ["The Matter Labs Team <[email protected]>"]
Expand All @@ -13,21 +13,21 @@ description = "Boojum-CUDA is a library implementing GPU-accelerated cryptograph

[build-dependencies]
boojum = "=0.2.1"
cudart-sys = { version = "=0.1.0", package = "era_cudart_sys" }
era_cudart_sys = { git = "https://github.com/matter-labs/era-cuda.git", branch = "rr-build-refactor", version = "=0.2.0", package = "era_cudart_sys" }
robik75 marked this conversation as resolved.
Show resolved Hide resolved
cmake = "0.1"
itertools = "0.13"

[dependencies]
boojum = "=0.2.1"
cudart = { version = "=0.1.0", package = "era_cudart" }
cudart-sys = { version = "=0.1.0", package = "era_cudart_sys" }
era_cudart = { git = "https://github.com/matter-labs/era-cuda.git", branch = "rr-build-refactor", version = "=0.2.0", package = "era_cudart" }
era_cudart_sys = { git = "https://github.com/matter-labs/era-cuda.git", branch = "rr-build-refactor", version = "=0.2.0" , package = "era_cudart_sys" }
itertools = "0.13"
lazy_static = "1.4"

[dev-dependencies]
blake2 = "0.10"
criterion = "0.5"
criterion-cuda = { git = "https://github.com/matter-labs/era-cuda.git", branch = "main", package = "criterion-cuda" }
era_criterion_cuda = { git = "https://github.com/matter-labs/era-cuda.git", branch = "rr-build-refactor", version = "=0.2.0", package = "era_criterion_cuda" }
criterion-macro = "0.4"
itertools = "0.13"
rand = "0.8"
Expand Down
8 changes: 4 additions & 4 deletions benches/blake2s.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@

use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};

use boojum_cuda::blake2s::blake2s_pow;
use criterion_cuda::CudaMeasurement;
use cudart::memory::{memory_set_async, DeviceAllocation};
use cudart::stream::CudaStream;
use era_boojum_cuda::blake2s::blake2s_pow;
use era_criterion_cuda::CudaMeasurement;
use era_cudart::memory::{memory_set_async, DeviceAllocation};
use era_cudart::stream::CudaStream;

fn blake2s(c: &mut Criterion<CudaMeasurement>) {
const MIN_BITS_COUNT: u32 = 17;
Expand Down
12 changes: 6 additions & 6 deletions benches/gates.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@ use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Samplin
use rand::prelude::*;
use rayon::prelude::*;

use boojum_cuda::device_structures::{DeviceMatrixChunk, DeviceMatrixChunkMut};
use boojum_cuda::gates::*;
use criterion_cuda::CudaMeasurement;
use cudart::memory::{memory_copy, DeviceAllocation};
use cudart::slice::DeviceSlice;
use cudart::stream::CudaStream;
use era_boojum_cuda::device_structures::{DeviceMatrixChunk, DeviceMatrixChunkMut};
use era_boojum_cuda::gates::*;
use era_criterion_cuda::CudaMeasurement;
use era_cudart::memory::{memory_copy, DeviceAllocation};
use era_cudart::slice::DeviceSlice;
use era_cudart::stream::CudaStream;

fn poseidon_group(c: &mut Criterion<CudaMeasurement>, group_name: &str, gate_name: &str) {
const VARIABLES_COUNT: usize = 140;
Expand Down
8 changes: 4 additions & 4 deletions benches/goldilocks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ use criterion::{
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};

use boojum_cuda::ops_simple;
use criterion_cuda::CudaMeasurement;
use cudart::memory::{memory_copy, DeviceAllocation};
use cudart::stream::CudaStream;
use era_boojum_cuda::ops_simple;
use era_criterion_cuda::CudaMeasurement;
use era_cudart::memory::{memory_copy, DeviceAllocation};
use era_cudart::stream::CudaStream;

fn goldilocks_inv(c: &mut Criterion<CudaMeasurement>) {
const MIN_LOG_N: usize = 17;
Expand Down
10 changes: 5 additions & 5 deletions benches/ntt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@ use criterion::{criterion_group, criterion_main, Criterion, SamplingMode, Throug
use rand::{thread_rng, Rng};
use rayon::prelude::*;

use boojum_cuda::context::Context;
use boojum_cuda::ntt::*;
use criterion_cuda::CudaMeasurement;
use cudart::memory::{memory_copy, DeviceAllocation};
use cudart::stream::CudaStream;
use era_boojum_cuda::context::Context;
use era_boojum_cuda::ntt::*;
use era_criterion_cuda::CudaMeasurement;
use era_cudart::memory::{memory_copy, DeviceAllocation};
use era_cudart::stream::CudaStream;

type CudaMeasurementInvElems = CudaMeasurement<true>;

Expand Down
10 changes: 5 additions & 5 deletions benches/ops_complex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@

use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};

use boojum_cuda::device_structures::DeviceMatrixMut;
use boojum_cuda::ops_complex::bit_reverse_in_place;
use criterion_cuda::CudaMeasurement;
use cudart::memory::DeviceAllocation;
use cudart::stream::CudaStream;
use era_boojum_cuda::device_structures::DeviceMatrixMut;
use era_boojum_cuda::ops_complex::bit_reverse_in_place;
use era_criterion_cuda::CudaMeasurement;
use era_cudart::memory::DeviceAllocation;
use era_cudart::stream::CudaStream;

fn bit_reverse(c: &mut Criterion<CudaMeasurement>) {
const LOG_MIN_BATCH_SIZE: usize = 0;
Expand Down
12 changes: 6 additions & 6 deletions benches/poseidon.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@ use criterion::{
use rand::{thread_rng, Rng};
use rayon::prelude::*;

use boojum_cuda::poseidon::*;
use criterion_cuda::CudaMeasurement;
use cudart::memory::{memory_copy, DeviceAllocation};
use cudart::result::CudaResult;
use cudart::slice::DeviceSlice;
use cudart::stream::CudaStream;
use era_boojum_cuda::poseidon::*;
use era_criterion_cuda::CudaMeasurement;
use era_cudart::memory::{memory_copy, DeviceAllocation};
use era_cudart::result::CudaResult;
use era_cudart::slice::DeviceSlice;
use era_cudart::stream::CudaStream;

#[allow(clippy::type_complexity)]
fn leaves_group(
Expand Down
6 changes: 3 additions & 3 deletions build/gates.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,8 @@ pub(super) fn generate() {
}

fn generate_cuda(descriptions: &[Description]) {
const TEMPLATE_PATH: &str = "native/gates_template.cu";
const RESULT_PATH: &str = "native/gates.cu";
const TEMPLATE_PATH: &str = "native/gate_kernels_template.cuh";
const RESULT_PATH: &str = "gate_kernels.cuh";
let mut code = String::new();
let s = &mut code;
new_line(s);
Expand Down Expand Up @@ -180,7 +180,7 @@ fn generate_cuda(descriptions: &[Description]) {

fn generate_rust(descriptions: &[Description]) {
const TEMPLATE_PATH: &str = "src/gates_data_template.rs";
const RESULT_PATH: &str = "src/gates_data.rs";
const RESULT_PATH: &str = "gates_data.rs";
let mut hash_map = String::new();
let mut bindings = String::new();
let mut mappings = String::new();
Expand Down
46 changes: 29 additions & 17 deletions build/main.rs
Original file line number Diff line number Diff line change
@@ -1,28 +1,40 @@
#![allow(incomplete_features)]
#![allow(unexpected_cfgs)]
#![feature(generic_const_exprs)]

use cudart_sys::{cuda_lib_path, cuda_path};

mod gates;
mod poseidon_constants;
mod template;

fn main() {
gates::generate();
poseidon_constants::generate();
#[cfg(target_os = "macos")]
std::process::exit(0);
let dst = cmake::Config::new("native")
.profile("Release")
.define(
"CMAKE_CUDA_ARCHITECTURES",
std::env::var("CUDAARCHS").unwrap_or("native".to_string()),
)
.build();
println!("cargo:rustc-link-search=native={}", dst.display());
println!("cargo:rustc-link-lib=static=boojum-cuda-native");
println!("cargo:rustc-link-search=native={}", cuda_lib_path!());
println!("cargo:rustc-link-lib=cudart");
#[cfg(target_os = "linux")]
println!("cargo:rustc-link-lib=stdc++");
println!("cargo::rustc-check-cfg=cfg(no_cuda)");
#[cfg(no_cuda)]
{
println!("cargo::warning={}", era_cudart_sys::no_cuda_message!());
}
#[cfg(not(no_cuda))]
{
use era_cudart_sys::{get_cuda_lib_path, get_cuda_version};
use std::env::var;
let cuda_version = get_cuda_version().expect("Failed to determine the CUDA version.");
if !cuda_version.starts_with("12.") {
println!("cargo::warning=CUDA version {cuda_version} detected. This crate is only tested with CUDA 12.*.");
}
let cudaarchs = var("CUDAARCHS").unwrap_or("native".to_string());
let dst = cmake::Config::new("native")
.profile("Release")
.define("CMAKE_CUDA_ARCHITECTURES", cudaarchs)
.build();
let boojum_lib_path = dst.to_str().unwrap();
println!("cargo:rustc-link-search=native={boojum_lib_path}");
println!("cargo:rustc-link-lib=static=era_boojum_cuda_native");
let cuda_lib_path = get_cuda_lib_path().unwrap();
let cuda_lib_path_str = cuda_lib_path.to_str().unwrap();
println!("cargo:rustc-link-search=native={cuda_lib_path_str}");
println!("cargo:rustc-link-lib=cudart");
#[cfg(target_os = "linux")]
println!("cargo:rustc-link-lib=stdc++");
}
}
2 changes: 1 addition & 1 deletion build/poseidon_constants.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use boojum::implementations::poseidon_goldilocks_params::*;
// use itertools::Itertools;

const TEMPLATE_PATH: &str = "native/poseidon_constants_template.cuh";
const RESULT_PATH: &str = "native/poseidon_constants.cuh";
const RESULT_PATH: &str = "poseidon_constants.cuh";

fn split_u64(value: u64) -> (u32, u32) {
let lo = value as u32;
Expand Down
8 changes: 6 additions & 2 deletions build/template.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
use std::env::var;
use std::fs;
use std::path::Path;

const PREFIX: char = '%';
const SUFFIX: char = '%';
Expand All @@ -11,8 +13,10 @@ pub(crate) fn generate(replacements: &[(&str, String)], template_path: &str, res
from.push(SUFFIX);
text = text.replace(&from, value);
}
let current = fs::read_to_string(result_path).unwrap_or_default();
let out_dir = var("OUT_DIR").unwrap();
let result_path = Path::new(&out_dir).join(result_path);
let current = fs::read_to_string(&result_path).unwrap_or_default();
if !text.eq(&current) {
fs::write(result_path, text).unwrap();
fs::write(&result_path, text).unwrap();
}
}
2 changes: 0 additions & 2 deletions native/.gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1 @@
/cmake-build-*/
/gates.cu
/poseidon_constants.cuh
36 changes: 19 additions & 17 deletions native/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,50 +1,52 @@
cmake_minimum_required(VERSION 3.24)
project(boojum-cuda-native)
project(era_boojum_cuda_native)
enable_language(CUDA)
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
set(CMAKE_CUDA_ARCHITECTURES native)
endif ()
add_library(boojum-cuda-native STATIC
add_library(era_boojum_cuda_native STATIC
ops_cub/common.cuh
ops_cub/device_radix_sort.cu
ops_cub/device_reduce.cu
ops_cub/device_run_length_encode.cu
ops_cub/device_scan.cu
barycentric.cu
blake2s.cu
carry_chain.cuh
common.cuh
context.cu
context.cuh
${CMAKE_INSTALL_PREFIX}/gate_kernels.cuh
gates.cu
gates.cuh
gates_poseidon.cuh
goldilocks.cuh
goldilocks_extension.cuh
goldilocks_extension.cu
goldilocks_extension.cuh
memory.cuh
ntt.cu
ntt_b2n.cuh
ntt_n2b.cuh
ops_complex.cu
ops_complex.cuh
ops_cub/common.cuh
ops_cub/device_radix_sort.cu
ops_cub/device_reduce.cu
ops_cub/device_run_length_encode.cu
ops_cub/device_scan.cu
ops_simple.cu
poseidon2_cooperative.cu
poseidon2_single_thread.cu
poseidon2_single_thread.cuh
poseidon_common.cu
poseidon_constants.cuh
${CMAKE_INSTALL_PREFIX}/poseidon_constants.cuh
poseidon_cooperative.cu
poseidon_single_thread.cu
poseidon_single_thread.cuh
poseidon_utils.cuh
ptx.cuh
)
set_target_properties(boojum-cuda-native PROPERTIES CUDA_STANDARD 17)
set_target_properties(boojum-cuda-native PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
set_target_properties(boojum-cuda-native PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_compile_options(boojum-cuda-native PRIVATE --expt-relaxed-constexpr)
target_compile_options(boojum-cuda-native PRIVATE --ptxas-options=-v)
#target_compile_options(boojum-cuda-native PRIVATE -lineinfo)
#target_compile_options(boojum-cuda-native PRIVATE --keep)
install(TARGETS boojum-cuda-native DESTINATION .)
target_include_directories(era_boojum_cuda_native PRIVATE ${CMAKE_INSTALL_PREFIX})
set_target_properties(era_boojum_cuda_native PROPERTIES CUDA_STANDARD 17)
set_target_properties(era_boojum_cuda_native PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
set_target_properties(era_boojum_cuda_native PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_compile_options(era_boojum_cuda_native PRIVATE --expt-relaxed-constexpr)
target_compile_options(era_boojum_cuda_native PRIVATE --ptxas-options=-v)
#target_compile_options(era_boojum_cuda_native PRIVATE -lineinfo)
#target_compile_options(era_boojum_cuda_native PRIVATE --keep)
install(TARGETS era_boojum_cuda_native DESTINATION .)
3 changes: 3 additions & 0 deletions native/gate_kernels_template.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
namespace gates {
%CODE%
} // namespace gates
4 changes: 4 additions & 0 deletions native/gates.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#include "gates.cuh"
#include "gates_poseidon.cuh"
// do not reorder includes
#include "gate_kernels.cuh"
6 changes: 0 additions & 6 deletions native/gates_template.cu

This file was deleted.

2 changes: 0 additions & 2 deletions native/poseidon_constants_template.cuh
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
// clang-format off
#pragma once

#include "goldilocks.cuh"

namespace poseidon_common {

using namespace goldilocks;
Expand Down
1 change: 0 additions & 1 deletion src/.gitignore

This file was deleted.

16 changes: 8 additions & 8 deletions src/barycentric.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,12 @@ use crate::utils::WARP_SIZE;
use crate::BaseField;
use boojum::cs::implementations::utils::domain_generator_for_size;
use boojum::field::{Field, PrimeField};
use cudart::execution::{CudaLaunchConfig, Dim3, KernelFunction};
use cudart::paste::paste;
use cudart::result::CudaResult;
use cudart::slice::{DeviceSlice, DeviceVariable};
use cudart::stream::CudaStream;
use cudart::{cuda_kernel_declaration, cuda_kernel_signature_arguments_and_function};
use era_cudart::execution::{CudaLaunchConfig, Dim3, KernelFunction};
use era_cudart::paste::paste;
use era_cudart::result::CudaResult;
use era_cudart::slice::{DeviceSlice, DeviceVariable};
use era_cudart::stream::CudaStream;
use era_cudart::{cuda_kernel_declaration, cuda_kernel_signature_arguments_and_function};
use std::cmp;

type BF = BaseField;
Expand Down Expand Up @@ -294,8 +294,8 @@ mod tests {
use boojum::field::goldilocks::GoldilocksExt2;
use boojum::field::{rand_from_rng, Field, PrimeField, U64Representable};
use boojum::worker::Worker;
use cudart::memory::{memory_copy_async, DeviceAllocation};
use cudart::stream::CudaStream;
use era_cudart::memory::{memory_copy_async, DeviceAllocation};
use era_cudart::stream::CudaStream;
use rand::{thread_rng, Rng};
use serial_test::serial;
use std::alloc::Global;
Expand Down
Loading
Loading