From bd28d22208188d78f2d7ab28ace28e77f1752113 Mon Sep 17 00:00:00 2001 From: tmontaigu Date: Thu, 3 Oct 2024 15:44:49 +0200 Subject: [PATCH] feat(hlapi): add erc20 bench This adds benchmarks of both latency and throughput of 4 variants of the erc20 transfer. It also prints the PBS count of each version. - "whitepaper" is the variant written in the fhevm whitepaper - "no_cmux" is similar to whitepaper, but uses a "boolean multiplication" instead of a cmux - "overflow" uses an overflowing_sub to remove the need for comparison - "safe" uses both overflowing_sub and overflowing_add to make sure both the sender has enough money and that the transfer won't overflow the receiver's money "overflow" has the lowest latency, and second best throughput "no_cmux" has the second lowest latency and the best throughput --- tfhe/Cargo.toml | 6 + tfhe/benches/high_level_api/bench.rs | 8 +- tfhe/benches/high_level_api/erc20.rs | 301 +++++++++++++++++++ tfhe/src/integer/gpu/server_key/radix/mul.rs | 1 - 4 files changed, 309 insertions(+), 7 deletions(-) create mode 100644 tfhe/benches/high_level_api/erc20.rs diff --git a/tfhe/Cargo.toml b/tfhe/Cargo.toml index 8e92de46aa..627fbf41d3 100644 --- a/tfhe/Cargo.toml +++ b/tfhe/Cargo.toml @@ -246,6 +246,12 @@ path = "benches/high_level_api/bench.rs" harness = false required-features = ["integer", "internal-keycache"] +[[bench]] +name = "hlapi-erc20" +path = "benches/high_level_api/erc20.rs" +harness = false +required-features = ["integer", "internal-keycache"] + [[bench]] name = "keygen" path = "benches/keygen/bench.rs" diff --git a/tfhe/benches/high_level_api/bench.rs b/tfhe/benches/high_level_api/bench.rs index 8361b0587f..97234e5b8b 100644 --- a/tfhe/benches/high_level_api/bench.rs +++ b/tfhe/benches/high_level_api/bench.rs @@ -1,7 +1,8 @@ +use std::ops::*; + use criterion::{black_box, Criterion}; use rand::prelude::*; use std::fmt::Write; -use std::ops::*; use tfhe::prelude::*; use tfhe::shortint::parameters::*; use tfhe::{ @@ 
-33,11 +34,6 @@ where let rhs = FheType::encrypt(rng.gen(), client_key); let mut name = String::with_capacity(255); - - write!(name, "add({type_name}, {type_name})").unwrap(); - bench_group.bench_function(&name, |b| b.iter(|| black_box(&lhs + &rhs))); - name.clear(); - write!(name, "overflowing_add({type_name}, {type_name})").unwrap(); bench_group.bench_function(&name, |b| { b.iter(|| black_box((&lhs).overflowing_add(&rhs))) diff --git a/tfhe/benches/high_level_api/erc20.rs b/tfhe/benches/high_level_api/erc20.rs new file mode 100644 index 0000000000..c94f235716 --- /dev/null +++ b/tfhe/benches/high_level_api/erc20.rs @@ -0,0 +1,301 @@ +use criterion::measurement::WallTime; +use criterion::{BenchmarkGroup, Criterion, Throughput}; +use rand::prelude::*; +use rand::thread_rng; +use rayon::prelude::*; +use std::ops::{Add, Mul, Sub}; +use tfhe::prelude::*; +use tfhe::shortint::parameters::*; +use tfhe::{set_server_key, ClientKey, CompressedServerKey, ConfigBuilder, FheBool, FheUint64}; + +/// Transfer as written in the original FHEvm white-paper, +/// it uses a comparison to check if the sender has enough, +/// and cmuxes based on the comparison result +fn transfer_whitepaper( + from_amount: &FheType, + to_amount: &FheType, + amount: &FheType, +) -> (FheType, FheType) +where + FheType: Add + for<'a> FheOrd<&'a FheType>, + FheBool: IfThenElse, + for<'a> &'a FheType: Add + Sub, +{ + let has_enough_funds = (from_amount).ge(amount); + + let mut new_to_amount = to_amount + amount; + new_to_amount = has_enough_funds.if_then_else(&new_to_amount, to_amount); + + let mut new_from_amount = from_amount - amount; + new_from_amount = has_enough_funds.if_then_else(&new_from_amount, from_amount); + + (new_from_amount, new_to_amount) +} + +/// This one also uses a comparison, but it leverages the 'boolean' multiplication +/// instead of cmuxes, so it is faster +fn transfer_no_cmux( + from_amount: &FheType, + to_amount: &FheType, + amount: &FheType, +) -> (FheType, FheType) +where + 
FheType: Add + CastFrom + for<'a> FheOrd<&'a FheType>, + FheBool: IfThenElse, + for<'a> &'a FheType: + Add + Sub + Mul, +{ + let has_enough_funds = (from_amount).ge(amount); + + let amount = amount * FheType::cast_from(has_enough_funds); + + let new_to_amount = to_amount + &amount; + let new_from_amount = from_amount - &amount; + + (new_from_amount, new_to_amount) +} + +/// This one uses overflowing sub to remove the need for comparison +/// it also uses the 'boolean' multiplication +fn transfer_overflow( + from_amount: &FheType, + to_amount: &FheType, + amount: &FheType, +) -> (FheType, FheType) +where + FheType: CastFrom + for<'a> FheOrd<&'a FheType>, + FheBool: IfThenElse, + for<'a> &'a FheType: Add + + OverflowingSub<&'a FheType, Output = FheType> + + Mul, +{ + let (new_from, did_not_have_enough) = (from_amount).overflowing_sub(amount); + + let new_from_amount = did_not_have_enough.if_then_else(from_amount, &new_from); + + let had_enough_funds = !did_not_have_enough; + let new_to_amount = to_amount + (amount * FheType::cast_from(had_enough_funds)); + + (new_from_amount, new_to_amount) +} + +/// This one uses both overflowing_add/sub to check that both +/// the sender has enough funds, and the receiver will not overflow its balance +fn transfer_safe( + from_amount: &FheType, + to_amount: &FheType, + amount: &FheType, +) -> (FheType, FheType) +where + for<'a> &'a FheType: OverflowingSub<&'a FheType, Output = FheType> + + OverflowingAdd<&'a FheType, Output = FheType>, + FheBool: IfThenElse, +{ + let (new_from, did_not_have_enough_funds) = (from_amount).overflowing_sub(amount); + let (new_to, did_not_have_enough_space) = (to_amount).overflowing_add(amount); + + let something_not_ok = did_not_have_enough_funds | did_not_have_enough_space; + + let new_from_amount = something_not_ok.if_then_else(from_amount, &new_from); + let new_to_amount = something_not_ok.if_then_else(to_amount, &new_to); + + (new_from_amount, new_to_amount) +} + +#[cfg(feature = "pbs-stats")] +fn 
print_transfer_pbs_counts( + client_key: &ClientKey, + type_name: &str, + fn_name: &str, + transfer_func: F, +) where + FheType: FheEncrypt, + F: for<'a> Fn(&'a FheType, &'a FheType, &'a FheType) -> (FheType, FheType), +{ + let mut rng = thread_rng(); + + let from_amount = FheType::encrypt(rng.gen::(), client_key); + let to_amount = FheType::encrypt(rng.gen::(), client_key); + let amount = FheType::encrypt(rng.gen::(), client_key); + + tfhe::reset_pbs_count(); + let (_, _) = transfer_func(&from_amount, &to_amount, &amount); + let count = tfhe::get_pbs_count(); + + println!("ERC20 transfer/{fn_name}::{type_name}: {count} PBS"); +} + +fn bench_transfer_latency( + c: &mut BenchmarkGroup<'_, WallTime>, + client_key: &ClientKey, + type_name: &str, + fn_name: &str, + transfer_func: F, +) where + FheType: FheEncrypt, + F: for<'a> Fn(&'a FheType, &'a FheType, &'a FheType) -> (FheType, FheType), +{ + let id_name = format!("{fn_name}::{type_name}"); + c.bench_function(&id_name, |b| { + let mut rng = thread_rng(); + + let from_amount = FheType::encrypt(rng.gen::(), client_key); + let to_amount = FheType::encrypt(rng.gen::(), client_key); + let amount = FheType::encrypt(rng.gen::(), client_key); + + b.iter(|| { + let (_, _) = transfer_func(&from_amount, &to_amount, &amount); + }) + }); +} + +fn bench_transfer_throughput( + group: &mut BenchmarkGroup<'_, WallTime>, + client_key: &ClientKey, + type_name: &str, + fn_name: &str, + transfer_func: F, +) where + FheType: FheEncrypt + Send + Sync, + F: for<'a> Fn(&'a FheType, &'a FheType, &'a FheType) -> (FheType, FheType) + Sync, +{ + let mut rng = thread_rng(); + + for num_elems in [10, 100, 500] { + group.throughput(Throughput::Elements(num_elems)); + let id_name = format!("{fn_name}::{type_name}::{num_elems}"); + group.bench_with_input(id_name, &num_elems, |b, &num_elems| { + let from_amounts = (0..num_elems) + .map(|_| FheType::encrypt(rng.gen::(), client_key)) + .collect::>(); + let to_amounts = (0..num_elems) + .map(|_| 
FheType::encrypt(rng.gen::(), client_key)) + .collect::>(); + let amounts = (0..num_elems) + .map(|_| FheType::encrypt(rng.gen::(), client_key)) + .collect::>(); + + b.iter(|| { + from_amounts + .par_iter() + .zip(to_amounts.par_iter().zip(amounts.par_iter())) + .for_each(|(from_amount, (to_amount, amount))| { + let (_, _) = transfer_func(from_amount, to_amount, amount); + }) + }) + }); + } +} + +fn main() { + #[cfg(not(feature = "gpu"))] + let params = PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; + #[cfg(feature = "gpu")] + let params = PARAM_GPU_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_3_KS_PBS; + + let config = ConfigBuilder::with_custom_parameters(params).build(); + let cks = ClientKey::generate(config); + let compressed_sks = CompressedServerKey::new(&cks); + + #[cfg(not(feature = "gpu"))] + let sks = compressed_sks.decompress(); + #[cfg(feature = "gpu")] + let sks = compressed_sks.decompress_to_gpu(); + + rayon::broadcast(|_| set_server_key(sks.clone())); + set_server_key(sks); + + let mut c = Criterion::default().sample_size(10).configure_from_args(); + + // FheUint64 PBS counts + // We don't run multiple times: since every input is encrypted, + // the PBS count is always the same + #[cfg(feature = "pbs-stats")] + { + print_transfer_pbs_counts( + &cks, + "FheUint64", + "whitepaper", + transfer_whitepaper::, + ); + print_transfer_pbs_counts(&cks, "FheUint64", "no_cmux", transfer_no_cmux::); + print_transfer_pbs_counts( + &cks, + "FheUint64", + "overflow", + transfer_overflow::, + ); + print_transfer_pbs_counts(&cks, "FheUint64", "safe", transfer_safe::); + } + + // FheUint64 latency + { + let mut group = c.benchmark_group("ERC20 latency"); + bench_transfer_latency( + &mut group, + &cks, + "FheUint64", + "whitepaper", + transfer_whitepaper::, + ); + bench_transfer_latency( + &mut group, + &cks, + "FheUint64", + "no_cmux", + transfer_no_cmux::, + ); + bench_transfer_latency( + &mut group, + &cks, + "FheUint64", + "overflow", + transfer_overflow::, + ); + 
bench_transfer_latency( + &mut group, + &cks, + "FheUint64", + "safe", + transfer_safe::, + ); + + group.finish(); + } + + // FheUint64 Throughput + { + let mut group = c.benchmark_group("ERC20 throughput"); + bench_transfer_throughput( + &mut group, + &cks, + "FheUint64", + "whitepaper", + transfer_whitepaper::, + ); + bench_transfer_throughput( + &mut group, + &cks, + "FheUint64", + "no_cmux", + transfer_no_cmux::, + ); + bench_transfer_throughput( + &mut group, + &cks, + "FheUint64", + "overflow", + transfer_overflow::, + ); + bench_transfer_throughput( + &mut group, + &cks, + "FheUint64", + "safe", + transfer_safe::, + ); + group.finish(); + } + + c.final_summary(); +} diff --git a/tfhe/src/integer/gpu/server_key/radix/mul.rs b/tfhe/src/integer/gpu/server_key/radix/mul.rs index 3f741d57f9..7b76cd78b7 100644 --- a/tfhe/src/integer/gpu/server_key/radix/mul.rs +++ b/tfhe/src/integer/gpu/server_key/radix/mul.rs @@ -71,7 +71,6 @@ impl CudaServerKey { stream: &CudaStreams, ) { let num_blocks = ct_left.as_ref().d_blocks.lwe_ciphertext_count().0 as u32; - match &self.bootstrapping_key { CudaBootstrappingKey::Classic(d_bsk) => { unchecked_mul_integer_radix_kb_assign_async(