Skip to content

Commit

Permalink
[AArch64] Improve bfcvtn2 and remove aarch64_neon_bfcvt intrinsics (ll…
Browse files Browse the repository at this point in the history
…vm#120363)

This started out as trying to combine bf16 fpround to BFCVTN2
instructions, but ended up removing the aarch64.neon.bfcvt intrinsics in
favour of generating fpround instructions directly. This simplifies the
patterns and can lead to other optimizations. The BFCVTN2 instruction is
adjusted to make sure the types are valid, and a bfcvtn2 is now
generated in more places. The old intrinsics are auto-upgraded to fptrunc
instructions too.
  • Loading branch information
davemgreen authored Jan 21, 2025
1 parent c22364a commit 547bfda
Show file tree
Hide file tree
Showing 10 changed files with 190 additions and 165 deletions.
10 changes: 2 additions & 8 deletions clang/include/clang/Basic/arm_neon.td
Original file line number Diff line number Diff line change
Expand Up @@ -259,11 +259,6 @@ def OP_VCVT_F32_BF16_LO
def OP_VCVT_F32_BF16_HI
: Op<(call "vcvt_f32_bf16", (call "vget_high", $p0))>;

def OP_VCVT_BF16_F32_LO_A64
: Op<(call "__a64_vcvtq_low_bf16", $p0)>;
def OP_VCVT_BF16_F32_A64
: Op<(call "vget_low", (call "__a64_vcvtq_low_bf16", $p0))>;

def OP_VCVT_BF16_F32_A32
: Op<(call "__a32_vcvt_bf16", $p0)>;

Expand Down Expand Up @@ -2061,10 +2056,9 @@ let ArchGuard = "!defined(__aarch64__) && !defined(__arm64ec__)", TargetGuard =
}

let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "bf16,neon" in {
def VCVT_LOW_BF16_F32_A64_INTERNAL : WInst<"__a64_vcvtq_low_bf16", "BQ", "Hf">;
def VCVT_LOW_BF16_F32_A64 : SOpInst<"vcvt_low_bf16", "BQ", "Qf", OP_VCVT_BF16_F32_LO_A64>;
def VCVT_LOW_BF16_F32_A64 : SInst<"vcvt_low_bf16", "BQ", "Qf">;
def VCVT_HIGH_BF16_F32_A64 : SInst<"vcvt_high_bf16", "BBQ", "Qf">;
def VCVT_BF16_F32 : SOpInst<"vcvt_bf16", "BQ", "f", OP_VCVT_BF16_F32_A64>;
def VCVT_BF16_F32 : SInst<"vcvt_bf16", "BQ", "f">;

def COPY_LANE_BF16 : IOpInst<"vcopy_lane", "..I.I", "b", OP_COPY_LN>;
def COPYQ_LANE_BF16 : IOpInst<"vcopy_lane", "..IqI", "Qb", OP_COPY_LN>;
Expand Down
41 changes: 38 additions & 3 deletions clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7307,7 +7307,6 @@ static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = {
};

static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
NEONMAP1(__a64_vcvtq_low_bf16_f32, aarch64_neon_bfcvtn, 0),
NEONMAP0(splat_lane_v),
NEONMAP0(splat_laneq_v),
NEONMAP0(splatq_lane_v),
Expand Down Expand Up @@ -7407,7 +7406,8 @@ static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
NEONMAP0(vcvtq_f16_s16),
NEONMAP0(vcvtq_f16_u16),
NEONMAP0(vcvtq_f32_v),
NEONMAP1(vcvtq_high_bf16_f32, aarch64_neon_bfcvtn2, 0),
NEONMAP0(vcvtq_high_bf16_f32),
NEONMAP0(vcvtq_low_bf16_f32),
NEONMAP1(vcvtq_n_f16_s16, aarch64_neon_vcvtfxs2fp, 0),
NEONMAP1(vcvtq_n_f16_u16, aarch64_neon_vcvtfxu2fp, 0),
NEONMAP2(vcvtq_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
Expand Down Expand Up @@ -7616,7 +7616,7 @@ static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = {
NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
NEONMAP1(vcvtd_s64_f64, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
NEONMAP1(vcvtd_u64_f64, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
NEONMAP1(vcvth_bf16_f32, aarch64_neon_bfcvt, 0),
NEONMAP0(vcvth_bf16_f32),
NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
NEONMAP1(vcvtmd_u64_f64, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
NEONMAP1(vcvtms_s32_f32, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
Expand Down Expand Up @@ -12083,6 +12083,12 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
return ConstantInt::get(Builder.getInt32Ty(), 0);
}

if (BuiltinID == NEON::BI__builtin_neon_vcvth_bf16_f32)
return Builder.CreateFPTrunc(
Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)),
Builder.getFloatTy()),
Builder.getBFloatTy());

// Handle MSVC intrinsics before argument evaluation to prevent double
// evaluation.
if (std::optional<MSVCIntrin> MsvcIntId =
Expand Down Expand Up @@ -12808,6 +12814,35 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
"vgetq_lane");
}
case NEON::BI__builtin_neon_vcvt_bf16_f32: {
llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4);
llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4);
return Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[0], V4F32), V4BF16);
}
case NEON::BI__builtin_neon_vcvtq_low_bf16_f32: {
SmallVector<int, 16> ConcatMask(8);
std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4);
llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4);
llvm::Value *Trunc =
Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[0], V4F32), V4BF16);
return Builder.CreateShuffleVector(
Trunc, ConstantAggregateZero::get(V4BF16), ConcatMask);
}
case NEON::BI__builtin_neon_vcvtq_high_bf16_f32: {
SmallVector<int, 16> ConcatMask(8);
std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
SmallVector<int, 16> LoMask(4);
std::iota(LoMask.begin(), LoMask.end(), 0);
llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4);
llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4);
llvm::Type *V8BF16 = FixedVectorType::get(Builder.getBFloatTy(), 8);
llvm::Value *Inactive = Builder.CreateShuffleVector(
Builder.CreateBitCast(Ops[0], V8BF16), LoMask);
llvm::Value *Trunc =
Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[1], V4F32), V4BF16);
return Builder.CreateShuffleVector(Inactive, Trunc, ConcatMask);
}

case clang::AArch64::BI_InterlockedAdd:
case clang::AArch64::BI_InterlockedAdd64: {
Expand Down
23 changes: 11 additions & 12 deletions clang/test/CodeGen/arm-bf16-convert-intrinsics.c
Original file line number Diff line number Diff line change
Expand Up @@ -223,10 +223,8 @@ float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
// CHECK-A64-LABEL: @test_vcvt_bf16_f32(
// CHECK-A64-NEXT: entry:
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F321_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]])
// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]] to <16 x i8>
// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]], <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK-A64-NEXT: ret <4 x bfloat> [[SHUFFLE_I]]
// CHECK-A64-NEXT: [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
// CHECK-A64-NEXT: ret <4 x bfloat> [[TMP1]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvt_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
Expand Down Expand Up @@ -263,9 +261,9 @@ bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) {
// CHECK-A64-LABEL: @test_vcvtq_low_bf16_f32(
// CHECK-A64-NEXT: entry:
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F321_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]])
// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]] to <16 x i8>
// CHECK-A64-NEXT: ret <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]]
// CHECK-A64-NEXT: [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP2]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
Expand Down Expand Up @@ -323,9 +321,10 @@ bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
// CHECK-A64-NEXT: entry:
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[INACTIVE:%.*]] to <16 x i8>
// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_F322_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat> [[INACTIVE]], <4 x float> [[A]])
// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_F323_I:%.*]] = bitcast <8 x bfloat> [[VCVTQ_HIGH_BF16_F322_I]] to <16 x i8>
// CHECK-A64-NEXT: ret <8 x bfloat> [[VCVTQ_HIGH_BF16_F322_I]]
// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <8 x bfloat> [[INACTIVE]], <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK-A64-NEXT: [[TMP3:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
// CHECK-A64-NEXT: [[TMP4:%.*]] = shufflevector <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP4]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
Expand Down Expand Up @@ -404,8 +403,8 @@ bfloat16x8_t test_vcvtq_high_bf16_f32(bfloat16x8_t inactive, float32x4_t a) {

// CHECK-A64-LABEL: @test_vcvth_bf16_f32(
// CHECK-A64-NEXT: entry:
// CHECK-A64-NEXT: [[VCVTH_BF16_F32_I:%.*]] = call bfloat @llvm.aarch64.neon.bfcvt(float [[A:%.*]])
// CHECK-A64-NEXT: ret bfloat [[VCVTH_BF16_F32_I]]
// CHECK-A64-NEXT: [[TMP0:%.*]] = fptrunc float [[A:%.*]] to bfloat
// CHECK-A64-NEXT: ret bfloat [[TMP0]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvth_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
Expand Down
11 changes: 0 additions & 11 deletions llvm/include/llvm/IR/IntrinsicsAArch64.td
Original file line number Diff line number Diff line change
Expand Up @@ -538,17 +538,6 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
def int_aarch64_neon_bfmlalb : AdvSIMD_BF16FML_Intrinsic;
def int_aarch64_neon_bfmlalt : AdvSIMD_BF16FML_Intrinsic;


// v8.6-A Bfloat Intrinsics
def int_aarch64_neon_bfcvt
: DefaultAttrsIntrinsic<[llvm_bfloat_ty], [llvm_float_ty], [IntrNoMem]>;
def int_aarch64_neon_bfcvtn
: DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v4f32_ty], [IntrNoMem]>;
def int_aarch64_neon_bfcvtn2
: DefaultAttrsIntrinsic<[llvm_v8bf16_ty],
[llvm_v8bf16_ty, llvm_v4f32_ty],
[IntrNoMem]>;

// v8.2-A FP16 Fused Multiply-Add Long
def int_aarch64_neon_fmlal : AdvSIMD_FP16FML_Intrinsic;
def int_aarch64_neon_fmlsl : AdvSIMD_FP16FML_Intrinsic;
Expand Down
86 changes: 61 additions & 25 deletions llvm/lib/IR/AutoUpgrade.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
#include "llvm/Support/Regex.h"
#include "llvm/TargetParser/Triple.h"
#include <cstring>
#include <numeric>

using namespace llvm;

Expand Down Expand Up @@ -828,6 +829,13 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F,
return true;
}
}

// Changed in 20.0: bfcvt/bfcvtn/bfcvtn2 have been replaced with fptrunc.
if (Name.starts_with("bfcvt")) {
NewFn = nullptr;
return true;
}

return false; // No other 'aarch64.neon.*'.
}
if (Name.consume_front("sve.")) {
Expand Down Expand Up @@ -4028,31 +4036,59 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,

/// Builds the replacement IR for a call to a retired AArch64 intrinsic.
///
/// Two families are recognised, selected by prefix of \p Name:
///   * "neon.bfcvt", "neon.bfcvtn" and "neon.bfcvtn2" are lowered to a plain
///     fptrunc, combined with shufflevector for the two vector forms.
///   * "sve.fcvt.bf16f32" and "sve.fcvtnt.bf16f32" are redirected to their
///     "_v2" intrinsics with the predicate operand converted to the
///     4-element predicate type.
///
/// \param Name    Intrinsic name used for prefix matching.
/// \param CI      The call being upgraded; its operands feed the new IR.
/// \param F       The called function; only its LLVMContext is used here.
/// \param Builder Builder used to emit the replacement instructions.
/// \returns The value that replaces the result of \p CI.
///
/// Any name outside the two families reaches llvm_unreachable.
static Value *upgradeAArch64IntrinsicCall(StringRef Name, CallBase *CI,
                                          Function *F, IRBuilder<> &Builder) {
  if (Name.starts_with("neon.bfcvt")) {
    // The longest prefix is tested first: "neon.bfcvtn2" also starts with
    // "neon.bfcvtn", which in turn starts with "neon.bfcvt".
    if (Name.starts_with("neon.bfcvtn2")) {
      // Lanes 0-3 of operand 0 are kept as the low half of the result.
      SmallVector<int, 32> LoMask(4);
      std::iota(LoMask.begin(), LoMask.end(), 0);
      // Indices 0-7 concatenate two 4-lane vectors in order.
      SmallVector<int, 32> ConcatMask(8);
      std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
      Value *Inactive = Builder.CreateShuffleVector(CI->getOperand(0), LoMask);
      // Operand 1 is truncated to the same 4-lane type as the kept half.
      Value *Trunc =
          Builder.CreateFPTrunc(CI->getOperand(1), Inactive->getType());
      return Builder.CreateShuffleVector(Inactive, Trunc, ConcatMask);
    }

    if (Name.starts_with("neon.bfcvtn")) {
      SmallVector<int, 32> ConcatMask(8);
      std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
      Type *V4BF16 =
          FixedVectorType::get(Type::getBFloatTy(F->getContext()), 4);
      Value *Trunc = Builder.CreateFPTrunc(CI->getOperand(0), V4BF16);
      // The truncated lanes form the low half; the high half is zero.
      return Builder.CreateShuffleVector(
          Trunc, ConstantAggregateZero::get(V4BF16), ConcatMask);
    }

    // Remaining "neon.bfcvt" form: truncate operand 0 to a scalar bfloat.
    return Builder.CreateFPTrunc(CI->getOperand(0),
                                 Type::getBFloatTy(F->getContext()));
  }

  if (Name.starts_with("sve.fcvt")) {
    Intrinsic::ID NewID =
        StringSwitch<Intrinsic::ID>(Name)
            .Case("sve.fcvt.bf16f32", Intrinsic::aarch64_sve_fcvt_bf16f32_v2)
            .Case("sve.fcvtnt.bf16f32",
                  Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2)
            .Default(Intrinsic::not_intrinsic);
    if (NewID == Intrinsic::not_intrinsic)
      llvm_unreachable("Unhandled Intrinsic!");

    SmallVector<Value *, 3> Args(CI->args());

    // The original intrinsics incorrectly used a predicate based on the
    // smallest element type rather than the largest.
    Type *BadPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 8);
    Type *GoodPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 4);

    if (Args[1]->getType() != BadPredTy)
      llvm_unreachable("Unexpected predicate type!");

    // Convert the old predicate to the new type via svbool.
    Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool,
                                      BadPredTy, Args[1]);
    Args[1] = Builder.CreateIntrinsic(
        Intrinsic::aarch64_sve_convert_from_svbool, GoodPredTy, Args[1]);

    return Builder.CreateIntrinsic(NewID, {}, Args, /*FMFSource=*/nullptr,
                                   CI->getName());
  }

  llvm_unreachable("Unhandled Intrinsic!");
}

static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F,
Expand Down
11 changes: 4 additions & 7 deletions llvm/lib/Target/AArch64/AArch64InstrFormats.td
Original file line number Diff line number Diff line change
Expand Up @@ -9053,22 +9053,19 @@ class SIMDThreeSameVectorBF16MatrixMul<string asm>

let mayRaiseFPException = 1, Uses = [FPCR] in
class SIMD_BFCVTN
: BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V128,
: BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V64,
"bfcvtn", ".4h", ".4s",
[(set (v8bf16 V128:$Rd),
(int_aarch64_neon_bfcvtn (v4f32 V128:$Rn)))]>;
[(set (v4bf16 V64:$Rd), (any_fpround (v4f32 V128:$Rn)))]>;

let mayRaiseFPException = 1, Uses = [FPCR] in
class SIMD_BFCVTN2
: BaseSIMDMixedTwoVectorTied<1, 0, 0b10, 0b10110, V128, V128,
"bfcvtn2", ".8h", ".4s",
[(set (v8bf16 V128:$dst),
(int_aarch64_neon_bfcvtn2 (v8bf16 V128:$Rd), (v4f32 V128:$Rn)))]>;
"bfcvtn2", ".8h", ".4s", []>;

let mayRaiseFPException = 1, Uses = [FPCR] in
class BF16ToSinglePrecision<string asm>
: I<(outs FPR16:$Rd), (ins FPR32:$Rn), asm, "\t$Rd, $Rn", "",
[(set (bf16 FPR16:$Rd), (int_aarch64_neon_bfcvt (f32 FPR32:$Rn)))]>,
[(set (bf16 FPR16:$Rd), (any_fpround (f32 FPR32:$Rn)))]>,
Sched<[WriteFCvt]> {
bits<5> Rd;
bits<5> Rn;
Expand Down
24 changes: 13 additions & 11 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -1454,8 +1454,8 @@ def BFMLALTIdx : SIMDBF16MLALIndex<1, "bfmlalt", int_aarch64_neon_bfmlalt>;
def BFCVTN : SIMD_BFCVTN;
def BFCVTN2 : SIMD_BFCVTN2;

def : Pat<(v4bf16 (any_fpround (v4f32 V128:$Rn))),
(EXTRACT_SUBREG (BFCVTN V128:$Rn), dsub)>;
def : Pat<(concat_vectors (v4bf16 V64:$Rd), (any_fpround (v4f32 V128:$Rn))),
(BFCVTN2 (v8bf16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub)), V128:$Rn)>;

// Vector-scalar BFDOT:
// The second source operand of the 64-bit variant of BF16DOTlane is a 128-bit
Expand All @@ -1477,8 +1477,6 @@ def : Pat<(v2f32 (int_aarch64_neon_bfdot

let Predicates = [HasNEONandIsStreamingSafe, HasBF16] in {
def BFCVT : BF16ToSinglePrecision<"bfcvt">;
// Round FP32 to BF16.
def : Pat<(bf16 (any_fpround (f32 FPR32:$Rn))), (BFCVT $Rn)>;
}

// ARMv8.6A AArch64 matrix multiplication
Expand Down Expand Up @@ -10412,9 +10410,11 @@ multiclass PromoteUnaryv8f16Tov4f32<SDPatternOperator InOp, Instruction OutInst>
let Predicates = [HasBF16] in
def : Pat<(InOp (v8bf16 V128:$Rn)),
(v8bf16 (BFCVTN2
(v8bf16 (BFCVTN
(v4f32 (OutInst
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))))))),
(INSERT_SUBREG (IMPLICIT_DEF),
(v4bf16 (BFCVTN
(v4f32 (OutInst
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))))))),
dsub),
(v4f32 (OutInst (v4f32 (SHLLv8i16 V128:$Rn))))))>;

let Predicates = [HasNoBF16] in
Expand Down Expand Up @@ -10449,10 +10449,12 @@ multiclass PromoteBinaryv8f16Tov4f32<SDPatternOperator InOp, Instruction OutInst
let Predicates = [HasBF16] in
def : Pat<(InOp (v8bf16 V128:$Rn), (v8bf16 V128:$Rm)),
(v8bf16 (BFCVTN2
(v8bf16 (BFCVTN
(v4f32 (OutInst
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))),
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rm, dsub)))))))),
(INSERT_SUBREG (IMPLICIT_DEF),
(v4bf16 (BFCVTN
(v4f32 (OutInst
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))),
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rm, dsub)))))))),
dsub),
(v4f32 (OutInst (v4f32 (SHLLv8i16 V128:$Rn)),
(v4f32 (SHLLv8i16 V128:$Rm))))))>;

Expand Down
3 changes: 3 additions & 0 deletions llvm/test/CodeGen/AArch64/bf16-convert-intrinsics.ll
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64 -mattr=+neon -mattr=+bf16 | FileCheck %s

; This test exercises the old neon.bfcvt intrinsics, which are now
; auto-upgraded to fptrunc operations.

declare bfloat @llvm.aarch64.neon.bfcvt(float)
declare <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float>)
declare <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat>, <4 x float>)
Expand Down
Loading

0 comments on commit 547bfda

Please sign in to comment.