Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

VCLAMP ASAN fix for F32 and F16 Neon #5918

Merged
merged 1 commit into from
Jan 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4009,6 +4009,11 @@ IF(XNNPACK_BUILD_BENCHMARKS)
TARGET_LINK_LIBRARIES(f32-vsqrt-bench PRIVATE benchmark::benchmark pthreadpool)
TARGET_LINK_LIBRARIES(f32-vsqrt-bench PRIVATE bench-utils hardware-config logging microkernels-all microparams-init)

ADD_EXECUTABLE(f32-vclamp-bench bench/f32-vclamp.cc)
TARGET_INCLUDE_DIRECTORIES(f32-vclamp-bench PRIVATE . include src)
TARGET_LINK_LIBRARIES(f32-vclamp-bench PRIVATE benchmark::benchmark pthreadpool)
TARGET_LINK_LIBRARIES(f32-vclamp-bench PRIVATE bench-utils hardware-config logging microkernels-all microparams-init)

ADD_EXECUTABLE(f32-vtanh-bench bench/f32-vtanh.cc)
TARGET_INCLUDE_DIRECTORIES(f32-vtanh-bench PRIVATE . include src)
TARGET_LINK_LIBRARIES(f32-vtanh-bench PRIVATE benchmark::benchmark pthreadpool)
Expand Down
8 changes: 8 additions & 0 deletions bench/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -930,6 +930,14 @@ xnnpack_benchmark(
deps = MICROKERNEL_BENCHMARK_DEPS,
)

xnnpack_benchmark(
name = "f32_vclamp_bench",
srcs = [
"f32-vclamp.cc",
],
deps = MICROKERNEL_BENCHMARK_DEPS,
)

xnnpack_benchmark(
name = "f32_vtanh_bench",
srcs = [
Expand Down
194 changes: 194 additions & 0 deletions bench/f32-vclamp.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
// Copyright 2024 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cmath>
#include <functional>
#include <random>
#include <vector>

#include <benchmark/benchmark.h>
#include "bench/utils.h"

#include <xnnpack.h>
#include <xnnpack/aligned-allocator.h>
#include <xnnpack/common.h>
#include <xnnpack/microfnptr.h>
#include <xnnpack/microparams-init.h>
#include <xnnpack/vunary.h>


static void f32_vclamp(
benchmark::State& state,
xnn_f32_vclamp_ukernel_fn vclamp,
xnn_init_f32_minmax_params_fn init_params = nullptr,
benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
if (isa_check != nullptr && !isa_check(state)) {
return;
}

const size_t num_elements = state.range(0);
std::vector<float, AlignedAllocator<float, 64>> input(num_elements);
std::vector<float, AlignedAllocator<float, 64>> output(num_elements);

std::random_device random_device;
auto rng = std::mt19937(random_device());
auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 10.0f), std::ref(rng));
std::generate(input.begin(), input.end(), std::ref(f32rng));
std::fill(output.begin(), output.end(), std::nanf(""));

union xnn_f32_minmax_params params;
if (init_params != nullptr) {
init_params(&params, -INFINITY, INFINITY);
}
for (auto _ : state) {
vclamp(num_elements * sizeof(float), input.data(), output.data(), &params);
}

const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
if (cpu_frequency != 0) {
state.counters["cpufreq"] = cpu_frequency;
}

const size_t elements_per_iteration = num_elements;
state.counters["elements"] =
benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);

const size_t bytes_per_iteration = 2 * num_elements * sizeof(float);
state.counters["bytes"] =
benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
}

#if XNN_ARCH_ARM || XNN_ARCH_ARM64
BENCHMARK_CAPTURE(f32_vclamp, neon_u4,
xnn_f32_vclamp_ukernel__neon_u4,
xnn_init_f32_minmax_scalar_params,
benchmark::utils::CheckNEON)
->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
->UseRealTime();
BENCHMARK_CAPTURE(f32_vclamp, neon_u8,
xnn_f32_vclamp_ukernel__neon_u8,
xnn_init_f32_minmax_scalar_params,
benchmark::utils::CheckNEON)
->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
->UseRealTime();
BENCHMARK_CAPTURE(f32_vclamp, neon_u16,
xnn_f32_vclamp_ukernel__neon_u16,
xnn_init_f32_minmax_scalar_params,
benchmark::utils::CheckNEON)
->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
->UseRealTime();
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64

#if XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV
BENCHMARK_CAPTURE(f32_vclamp, rvv_u1v,
xnn_f32_vclamp_ukernel__rvv_u1v,
xnn_init_f32_minmax_scalar_params,
benchmark::utils::CheckRVV)
->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
->UseRealTime();
BENCHMARK_CAPTURE(f32_vclamp, rvv_u2v,
xnn_f32_vclamp_ukernel__rvv_u2v,
xnn_init_f32_minmax_scalar_params,
benchmark::utils::CheckRVV)
->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
->UseRealTime();
BENCHMARK_CAPTURE(f32_vclamp, rvv_u4v,
xnn_f32_vclamp_ukernel__rvv_u4v,
xnn_init_f32_minmax_scalar_params,
benchmark::utils::CheckRVV)
->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
->UseRealTime();
BENCHMARK_CAPTURE(f32_vclamp, rvv_u8v,
xnn_f32_vclamp_ukernel__rvv_u8v,
xnn_init_f32_minmax_scalar_params,
benchmark::utils::CheckRVV)
->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
->UseRealTime();
#endif // XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV

#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE(f32_vclamp, avx512f_u16,
xnn_f32_vclamp_ukernel__avx512f_u16,
xnn_init_f32_minmax_scalar_params,
benchmark::utils::CheckAVX512F)
->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
->UseRealTime();
BENCHMARK_CAPTURE(f32_vclamp, avx512f_u32,
xnn_f32_vclamp_ukernel__avx512f_u32,
xnn_init_f32_minmax_scalar_params,
benchmark::utils::CheckAVX512F)
->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
->UseRealTime();

BENCHMARK_CAPTURE(f32_vclamp, avx_u8,
xnn_f32_vclamp_ukernel__avx_u8,
xnn_init_f32_minmax_avx_params,
benchmark::utils::CheckAVX)
->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
->UseRealTime();
BENCHMARK_CAPTURE(f32_vclamp, avx_u16,
xnn_f32_vclamp_ukernel__avx_u16,
xnn_init_f32_minmax_avx_params,
benchmark::utils::CheckAVX)
->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
->UseRealTime();

BENCHMARK_CAPTURE(f32_vclamp, sse_u4,
xnn_f32_vclamp_ukernel__sse_u4,
xnn_init_f32_minmax_sse_params)
->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
->UseRealTime();
BENCHMARK_CAPTURE(f32_vclamp, sse_u8,
xnn_f32_vclamp_ukernel__sse_u8,
xnn_init_f32_minmax_sse_params)
->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
->UseRealTime();
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64

#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
BENCHMARK_CAPTURE(f32_vclamp, wasmsimd_arm_u4,
xnn_f32_vclamp_ukernel__wasmsimd_arm_u4,
xnn_init_f32_minmax_wasmsimd_params)
->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
->UseRealTime();
BENCHMARK_CAPTURE(f32_vclamp, wasmsimd_arm_u8,
xnn_f32_vclamp_ukernel__wasmsimd_arm_u8,
xnn_init_f32_minmax_wasmsimd_params)
->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
->UseRealTime();

BENCHMARK_CAPTURE(f32_vclamp, wasmsimd_x86_u4,
xnn_f32_vclamp_ukernel__wasmsimd_x86_u4,
xnn_init_f32_minmax_wasmsimd_params)
->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
->UseRealTime();
BENCHMARK_CAPTURE(f32_vclamp, wasmsimd_x86_u8,
xnn_f32_vclamp_ukernel__wasmsimd_x86_u8,
xnn_init_f32_minmax_wasmsimd_params)
->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
->UseRealTime();
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD

BENCHMARK_CAPTURE(f32_vclamp, scalar_u1,
xnn_f32_vclamp_ukernel__scalar_u1,
xnn_init_f32_minmax_scalar_params)
->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
->UseRealTime();
BENCHMARK_CAPTURE(f32_vclamp, scalar_u2,
xnn_f32_vclamp_ukernel__scalar_u2,
xnn_init_f32_minmax_scalar_params)
->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
->UseRealTime();
BENCHMARK_CAPTURE(f32_vclamp, scalar_u4,
xnn_f32_vclamp_ukernel__scalar_u4,
xnn_init_f32_minmax_scalar_params)
->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
->UseRealTime();

#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif
1 change: 1 addition & 0 deletions cmake/microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -2865,6 +2865,7 @@ SET(ALL_NEON_MICROKERNEL_SRCS
src/f32-vbinary/gen/f32-vsubc-minmax-neon-u8.c
src/f32-vclamp/gen/f32-vclamp-neon-u4.c
src/f32-vclamp/gen/f32-vclamp-neon-u8.c
src/f32-vclamp/gen/f32-vclamp-neon-u16.c
src/f32-vcmul/gen/f32-vcmul-neon-u4.c
src/f32-vcmul/gen/f32-vcmul-neon-u8.c
src/f32-vcmul/gen/f32-vcmul-neon-u12.c
Expand Down
1 change: 1 addition & 0 deletions microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -2875,6 +2875,7 @@ ALL_NEON_MICROKERNEL_SRCS = [
"src/f32-vbinary/gen/f32-vsubc-minmax-neon-u8.c",
"src/f32-vclamp/gen/f32-vclamp-neon-u4.c",
"src/f32-vclamp/gen/f32-vclamp-neon-u8.c",
"src/f32-vclamp/gen/f32-vclamp-neon-u16.c",
"src/f32-vcmul/gen/f32-vcmul-neon-u4.c",
"src/f32-vcmul/gen/f32-vcmul-neon-u8.c",
"src/f32-vcmul/gen/f32-vcmul-neon-u12.c",
Expand Down
5 changes: 3 additions & 2 deletions scripts/generate-f32-vclamp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@ tools/xngen src/f32-vclamp/wasmsimd.c.in -D BATCH_TILE=4 -D X86=1 -o src/f32-vcl
tools/xngen src/f32-vclamp/wasmsimd.c.in -D BATCH_TILE=8 -D X86=1 -o src/f32-vclamp/gen/f32-vclamp-wasmsimd-x86-u8.c &

################################### ARM NEON ##################################
tools/xngen src/f32-vclamp/neon.c.in -D BATCH_TILE=4 -o src/f32-vclamp/gen/f32-vclamp-neon-u4.c &
tools/xngen src/f32-vclamp/neon.c.in -D BATCH_TILE=8 -o src/f32-vclamp/gen/f32-vclamp-neon-u8.c &
tools/xngen src/f32-vclamp/neon.c.in -D BATCH_TILE=4 -o src/f32-vclamp/gen/f32-vclamp-neon-u4.c &
tools/xngen src/f32-vclamp/neon.c.in -D BATCH_TILE=8 -o src/f32-vclamp/gen/f32-vclamp-neon-u8.c &
tools/xngen src/f32-vclamp/neon.c.in -D BATCH_TILE=16 -o src/f32-vclamp/gen/f32-vclamp-neon-u16.c &

################################ RISC-V Vector ################################
tools/xngen src/f32-vclamp/rvv.c.in -D LMUL=1 -o src/f32-vclamp/gen/f32-vclamp-rvv-u1v.c &
Expand Down
43 changes: 29 additions & 14 deletions src/amalgam/gen/neon.c
Original file line number Diff line number Diff line change
Expand Up @@ -9198,7 +9198,7 @@ void xnn_f32_vsubc_minmax_ukernel__neon_u8(
}
}

void xnn_f32_vclamp_ukernel__neon_u8(
void xnn_f32_vclamp_ukernel__neon_u16(
size_t batch,
const float* input,
float* output,
Expand All @@ -9209,39 +9209,54 @@ void xnn_f32_vclamp_ukernel__neon_u8(
assert(input != NULL);
assert(output != NULL);

const float32x4_t vy_min = vld1q_dup_f32(&params->scalar.min);
const float32x4_t vy_max = vld1q_dup_f32(&params->scalar.max);
#if XNN_ARCH_ARM64
const float32x4x2_t vminmax = vld2q_dup_f32(&params->scalar.min);
const float32x4_t vmin = vminmax.val[0];
const float32x4_t vmax = vminmax.val[1];
#else
const float32x2x2_t vminmax = vld2_dup_f32(&params->scalar.min);
const float32x4_t vmin = vcombine_f32(vminmax.val[0], vminmax.val[0]);
const float32x4_t vmax = vcombine_f32(vminmax.val[1], vminmax.val[1]);
#endif

for (; batch >= 8 * sizeof(float); batch -= 8 * sizeof(float)) {
for (; batch >= 16 * sizeof(float); batch -= 16 * sizeof(float)) {
float32x4_t vacc0123 = vld1q_f32(input); input += 4;
float32x4_t vacc4567 = vld1q_f32(input); input += 4;
float32x4_t vacc89AB = vld1q_f32(input); input += 4;
float32x4_t vaccCDEF = vld1q_f32(input); input += 4;

vacc0123 = vmaxq_f32(vacc0123, vy_min);
vacc4567 = vmaxq_f32(vacc4567, vy_min);
vacc0123 = vmaxq_f32(vacc0123, vmin);
vacc4567 = vmaxq_f32(vacc4567, vmin);
vacc89AB = vmaxq_f32(vacc89AB, vmin);
vaccCDEF = vmaxq_f32(vaccCDEF, vmin);

vacc0123 = vminq_f32(vacc0123, vy_max);
vacc4567 = vminq_f32(vacc4567, vy_max);
vacc0123 = vminq_f32(vacc0123, vmax);
vacc4567 = vminq_f32(vacc4567, vmax);
vacc89AB = vminq_f32(vacc89AB, vmax);
vaccCDEF = vminq_f32(vaccCDEF, vmax);

vst1q_f32(output, vacc0123); output += 4;
vst1q_f32(output, vacc4567); output += 4;
vst1q_f32(output, vacc89AB); output += 4;
vst1q_f32(output, vaccCDEF); output += 4;
}
for (; batch >= 4 * sizeof(float); batch -= 4 * sizeof(float)) {
float32x4_t vacc = vld1q_f32(input); input += 4;
vacc = vmaxq_f32(vacc, vy_min);
vacc = vminq_f32(vacc, vy_max);
vacc = vmaxq_f32(vacc, vmin);
vacc = vminq_f32(vacc, vmax);
vst1q_f32(output, vacc); output += 4;
}
if XNN_UNLIKELY(batch != 0) {
if (batch & (2 * sizeof(float))) {
float32x2_t vacc = vld1_f32(input); input += 2;
vacc = vmax_f32(vacc, vget_low_f32(vy_min));
vacc = vmin_f32(vacc, vget_low_f32(vy_max));
vacc = vmax_f32(vacc, vget_low_f32(vmin));
vacc = vmin_f32(vacc, vget_low_f32(vmax));
vst1_f32(output, vacc); output += 2;
}
if (batch & (1 * sizeof(float))) {
float32x2_t vacc = vld1_dup_f32(input);
vacc = vmax_f32(vacc, vget_low_f32(vy_min));
vacc = vmin_f32(vacc, vget_low_f32(vy_max));
vacc = vmax_f32(vacc, vget_low_f32(vmin));
vacc = vmin_f32(vacc, vget_low_f32(vmax));
vst1_lane_f32(output, vacc, 0);
}
}
Expand Down
Loading
Loading