From c6a4bd345fa4858a49011fffabcab34578204433 Mon Sep 17 00:00:00 2001 From: Sirui Lu Date: Fri, 27 Sep 2024 13:11:58 -0700 Subject: [PATCH] improve MulEven performance --- hwy/ops/rvv-inl.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/hwy/ops/rvv-inl.h b/hwy/ops/rvv-inl.h index fecc8bc1f5..ca8618b924 100644 --- a/hwy/ops/rvv-inl.h +++ b/hwy/ops/rvv-inl.h @@ -5655,9 +5655,13 @@ HWY_API V64 BitShuffle(V64 values, VI idx) { template , class DW = RepartitionToWide> HWY_API VFromD MulEven(const V a, const V b) { - const auto lo = Mul(a, b); - const auto hi = MulHigh(a, b); - return BitCast(DW(), OddEven(detail::Slide1Up(hi), lo)); + constexpr int maskVal = sizeof(TFromD) == 4 ? 5 + : sizeof(TFromD) == 2 ? 0x55 + : 0x5555; + const auto mask = Dup128MaskFromMaskBits(D(), maskVal); + const auto hi = Slide1Up(D(), MulHigh(a, b)); + const auto res = MaskedMulOr(hi, mask, a, b); + return BitCast(DW(), res); } template MulOdd(const V a, const V b) { // There is no 64x64 vwmul. template HWY_INLINE V MulEven(const V a, const V b) { - const auto lo = Mul(a, b); - const auto hi = MulHigh(a, b); - return OddEven(detail::Slide1Up(hi), lo); + const auto mask = Dup128MaskFromMaskBits(DFromV(), 1); + const auto hi = Slide1Up(DFromV(), MulHigh(a, b)); + return MaskedMulOr(hi, mask, a, b); } template