Skip to content
This repository has been archived by the owner on Aug 16, 2024. It is now read-only.

Commit

Permalink
fix gather_merkle_paths function (#18)
Browse files Browse the repository at this point in the history
# What ❔

This PR fixes the `gather_merkle_paths` function.

## Why ❔

The function assumed the wrong structure of the node values array, this
fixes the mistake.

## Checklist

- [x] PR title corresponds to the body of PR (we generate changelog
entries from PRs).
- [x] Tests for the changes have been added / updated.
- [x] Code has been formatted via `cargo fmt`.
  • Loading branch information
robik75 authored Oct 28, 2023
1 parent f44a0fb commit af269f6
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 61 deletions.
22 changes: 12 additions & 10 deletions boojum-cuda/native/poseidon_common.cu
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
#include "goldilocks.cuh"
#include "poseidon_constants.cuh"

namespace poseidon {

using namespace goldilocks;
using namespace memory;
using namespace poseidon_common;

EXTERN __global__ void gather_rows_kernel(const unsigned *indexes, const unsigned indexes_count, const matrix_getter<base_field, ld_modifier::cs> values,
matrix_setter<base_field, st_modifier::cs> results) {
Expand All @@ -17,20 +19,20 @@ EXTERN __global__ void gather_rows_kernel(const unsigned *indexes, const unsigne
results.set(dst_row, col, values.get(src_row, col));
}

EXTERN __global__ void gather_merkle_paths_kernel(const unsigned *indexes, const unsigned indexes_count,
const matrix_getter<base_field, ld_modifier::cs> values, matrix_setter<base_field, st_modifier::cs> results) {
EXTERN __global__ void gather_merkle_paths_kernel(const unsigned *indexes, const unsigned indexes_count, const base_field *values,
const unsigned log_leaves_count, base_field *results) {
const unsigned idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx >= indexes_count)
return;
const unsigned col = threadIdx.y;
const unsigned layers_count = gridDim.y;
const unsigned layer_from_leaves = blockIdx.y;
const unsigned col = blockIdx.y;
const unsigned layer_index = blockIdx.z;
const unsigned layer_offset = (CAPACITY << (log_leaves_count + 1)) - (CAPACITY << (log_leaves_count + 1 - layer_index));
const unsigned col_offset = col << (log_leaves_count - layer_index);
const unsigned leaf_index = indexes[idx];
const unsigned layer_offset = (1 << (layers_count + 1)) - (1 << (layers_count + 1 - layer_from_leaves));
const unsigned hash_index = (leaf_index >> layer_from_leaves) ^ 1;
const unsigned src_row = layer_offset + hash_index;
const unsigned dst_row = layer_from_leaves * indexes_count + idx;
results.set(dst_row, col, values.get(src_row, col));
const unsigned hash_index = (leaf_index >> layer_index) ^ 1;
const unsigned src_index = layer_offset + col_offset + hash_index;
const unsigned dst_index = layer_index * indexes_count * CAPACITY + indexes_count * col + idx;
results[dst_index] = values[src_index];
}

} // namespace poseidon
111 changes: 60 additions & 51 deletions boojum-cuda/src/poseidon.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
use boojum::field::goldilocks::GoldilocksField;
use boojum::implementations::poseidon_goldilocks_params::*;

use cudart::execution::{KernelFourArgs, KernelLaunch, KernelSevenArgs, KernelThreeArgs};
use cudart::execution::{
KernelFiveArgs, KernelFourArgs, KernelLaunch, KernelSevenArgs, KernelThreeArgs,
};
use cudart::result::CudaResult;
use cudart::slice::DeviceSlice;
use cudart::stream::CudaStream;
Expand Down Expand Up @@ -79,8 +81,9 @@ extern "C" {
fn gather_merkle_paths_kernel(
indexes: *const u32,
indexes_count: u32,
values: PtrAndStride<BaseFieldDeviceType>,
results: MutPtrAndStride<BaseFieldDeviceType>,
values: *const GoldilocksField,
log_leaves_count: u32,
results: *mut GoldilocksField,
);
}

Expand Down Expand Up @@ -417,31 +420,38 @@ pub fn gather_rows(

pub fn gather_merkle_paths(
indexes: &DeviceSlice<u32>,
values: &(impl DeviceMatrixChunkImpl<BaseField> + ?Sized),
result: &mut (impl DeviceMatrixChunkMutImpl<BaseField> + ?Sized),
values: &DeviceSlice<GoldilocksField>,
results: &mut DeviceSlice<GoldilocksField>,
layers_count: u32,
stream: &CudaStream,
) -> CudaResult<()> {
assert_eq!(values.cols(), CAPACITY);
assert_eq!(result.cols(), CAPACITY);
let indexes_len = indexes.len();
let values_rows = values.rows();
let result_rows = result.rows();
assert_eq!(result_rows % indexes_len, 0);
let layers_count = result_rows / indexes_len;
assert_eq!(values_rows, 1 << (layers_count + 1));
assert_eq!(WARP_SIZE % CAPACITY as u32, 0);
assert!(indexes_len <= u32::MAX as usize);
let indexes_count = indexes_len as u32;
let (grid_dim, block_dim) =
get_grid_block_dims_for_threads_count(WARP_SIZE / CAPACITY as u32, indexes_count);
let grid_dim = (grid_dim.x, layers_count as u32).into();
let block_dim = (block_dim.x, CAPACITY as u32).into();
assert!(indexes.len() <= u32::MAX as usize);
let indexes_count = indexes.len() as u32;
assert_eq!(values.len() % CAPACITY, 0);
let values_count = values.len() / CAPACITY;
assert!(values_count.is_power_of_two());
let log_values_count = values_count.trailing_zeros();
assert_ne!(log_values_count, 0);
let log_leaves_count = log_values_count - 1;
assert!(layers_count < log_leaves_count);
assert_eq!(
indexes.len() * layers_count as usize * CAPACITY,
results.len()
);
let (grid_dim, block_dim) = get_grid_block_dims_for_threads_count(WARP_SIZE, indexes_count);
let grid_dim = (grid_dim.x, CAPACITY as u32, layers_count).into();
let indexes = indexes.as_ptr();
let values = values.as_ptr_and_stride();
let result = result.as_mut_ptr_and_stride();
let args = (&indexes, &indexes_count, &values, &result);
let values = values.as_ptr();
let result = results.as_mut_ptr();
let args = (
&indexes,
&indexes_count,
&values,
&log_leaves_count,
&result,
);
unsafe {
KernelFourArgs::launch(
KernelFiveArgs::launch(
gather_merkle_paths_kernel,
grid_dim,
block_dim,
Expand All @@ -458,9 +468,11 @@ mod tests {

use boojum::field::{Field, U64Representable};
use boojum::implementations::poseidon2::state_generic_impl::State;
use itertools::Itertools;
use rand::Rng;

use cudart::memory::{memory_copy_async, DeviceAllocation};
use cudart::slice::CudaSlice;

// use boojum::implementations::poseidon_goldilocks::poseidon_permutation_optimized;
use crate::device_structures::{DeviceMatrix, DeviceMatrixMut};
Expand Down Expand Up @@ -927,13 +939,13 @@ mod tests {
fn gather_merkle_paths() {
const LOG_LEAVES_COUNT: usize = 12;
const INDEXES_COUNT: usize = 42;
const LAYERS_COUNT: usize = LOG_LEAVES_COUNT - 4;
let mut rng = rand::thread_rng();
let mut indexes_host = vec![0; INDEXES_COUNT];
indexes_host.fill_with(|| rng.gen_range(0..INDEXES_COUNT as u32));
indexes_host.fill_with(|| rng.gen_range(0..1u32 << LOG_LEAVES_COUNT));
let mut values_host = vec![GoldilocksField::ZERO; CAPACITY << (LOG_LEAVES_COUNT + 1)];
values_host.fill_with(|| GoldilocksField(rng.gen()));
let mut results_host =
vec![GoldilocksField::ZERO; CAPACITY * INDEXES_COUNT * LOG_LEAVES_COUNT];
let mut results_host = vec![GoldilocksField::ZERO; CAPACITY * INDEXES_COUNT * LAYERS_COUNT];
let stream = CudaStream::default();
let mut indexes_device = DeviceAllocation::<u32>::alloc(indexes_host.len()).unwrap();
let mut values_device =
Expand All @@ -944,40 +956,37 @@ mod tests {
memory_copy_async(&mut values_device, &values_host, &stream).unwrap();
super::gather_merkle_paths(
&indexes_device,
&DeviceMatrix::new(&values_device, 1 << (LOG_LEAVES_COUNT + 1)),
&mut DeviceMatrixMut::new(&mut results_device, INDEXES_COUNT * LOG_LEAVES_COUNT),
&values_device,
&mut results_device,
LAYERS_COUNT as u32,
&stream,
)
.unwrap();
memory_copy_async(&mut results_host, &results_device, &stream).unwrap();
stream.synchronize().unwrap();
fn verify_merkle_path(
indexes: &[u32],
values: &[GoldilocksField],
results: &[GoldilocksField],
row_index: usize,
leaf_index: usize,
) {
let log_leaves_count = values.len().trailing_zeros() - 1;
let sibling_index = leaf_index ^ 1;
let expected = values[sibling_index];
let actual = results[row_index];
assert_eq!(expected, actual);
if log_leaves_count > 1 {
verify_merkle_path(
&values[values.len() >> 1..],
&results[INDEXES_COUNT..],
row_index,
leaf_index >> 1,
);
let (values, values_next) = values.split_at(values.len() >> 1);
let (results, results_next) = results.split_at(INDEXES_COUNT * CAPACITY);
values
.chunks(values.len() / CAPACITY)
.zip(results.chunks(results.len() / CAPACITY))
.for_each(|(values, results)| {
for (row_index, &index) in indexes.iter().enumerate() {
let sibling_index = index ^ 1;
let expected = values[sibling_index as usize];
let actual = results[row_index];
assert_eq!(expected, actual);
}
});
if !results_next.is_empty() {
let indexes_next = indexes.iter().map(|&x| x >> 1).collect_vec();
verify_merkle_path(&indexes_next, &values_next, &results_next);
}
}
values_host
.chunks(1 << (LOG_LEAVES_COUNT + 1))
.zip(results_host.chunks(INDEXES_COUNT * LOG_LEAVES_COUNT))
.for_each(|(values, results)| {
for (row_index, &leaf_index) in indexes_host.iter().enumerate() {
verify_merkle_path(values, results, row_index, leaf_index as usize);
}
});
verify_merkle_path(&indexes_host, &values_host, &results_host);
}
}

0 comments on commit af269f6

Please sign in to comment.