Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AArch64][SME] Add intrinsics for multi-vector BFCLAMP #88251

Merged
merged 8 commits into from
May 28, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions clang/include/clang/Basic/arm_sve.td
Original file line number Diff line number Diff line change
Expand Up @@ -2151,6 +2151,11 @@ let TargetGuard = "sme2" in {
def SVFCLAMP_X4 : SInst<"svclamp[_single_{d}_x4]", "44dd", "hfd", MergeNone, "aarch64_sve_fclamp_single_x4", [IsStreaming], []>;
}

let TargetGuard = "sme2,b16b16"in {
def SVBFCLAMP_X2 : SInst<"svclamp[_single_{d}_x2]", "22dd", "b", MergeNone, "aarch64_sve_bfclamp_single_x2", [IsStreaming], []>;
def SVBFCLAMP_X4 : SInst<"svclamp[_single_{d}_x4]", "44dd", "b", MergeNone, "aarch64_sve_bfclamp_single_x4", [IsStreaming], []>;
}

let TargetGuard = "sme2" in {
// == ADD (vectors) ==
def SVADD_SINGLE_X2 : SInst<"svadd[_single_{d}_x2]", "22d", "cUcsUsiUilUl", MergeNone, "aarch64_sve_add_single_x2", [IsStreaming], []>;
Expand Down
74 changes: 69 additions & 5 deletions clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// REQUIRES: aarch64-registered-target
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 \
// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 \
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 \
// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 \
// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 \
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 \
// RUN: -S -Werror -emit-llvm -disable-O0-optnone -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +b16b16 \
// RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s

#include <arm_sme.h>
Expand Down Expand Up @@ -745,3 +745,67 @@ svfloat32x4_t test_svclamp_single_f32_x4(svfloat32x4_t op1, svfloat32_t op2, svf
svfloat64x4_t test_svclamp_single_f64_x4(svfloat64x4_t op1, svfloat64_t op2, svfloat64_t op3) __arm_streaming {
return SVE_ACLE_FUNC(svclamp, _single_f64_x4, , )(op1, op2, op3);
}

// CHECK-LABEL: @test_svclamp_single_bf16_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[OP1:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[OP1]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.bfclamp.single.x2.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 0
// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> poison, <vscale x 8 x bfloat> [[TMP3]], i64 0)
// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 1
// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], i64 8)
// CHECK-NEXT: ret <vscale x 16 x bfloat> [[TMP6]]
//
// CPP-CHECK-LABEL: @_Z27test_svclamp_single_bf16_x214svbfloat16x2_tu14__SVBfloat16_tS0_(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[OP1:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[OP1]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.bfclamp.single.x2.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 0
// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> poison, <vscale x 8 x bfloat> [[TMP3]], i64 0)
// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP2]], 1
// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x bfloat> @llvm.vector.insert.nxv16bf16.nxv8bf16(<vscale x 16 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], i64 8)
// CPP-CHECK-NEXT: ret <vscale x 16 x bfloat> [[TMP6]]
//
svbfloat16x2_t test_svclamp_single_bf16_x2(svbfloat16x2_t op1, svbfloat16_t op2, svbfloat16_t op3) __arm_streaming {
return SVE_ACLE_FUNC(svclamp, _single_bf16_x2, , )(op1, op2, op3);
}

// CHECK-LABEL: @test_svclamp_single_bf16_x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1:%.*]], i64 0)
// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1]], i64 8)
// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1]], i64 16)
// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1]], i64 24)
// CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.bfclamp.single.x4.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 0
// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> poison, <vscale x 8 x bfloat> [[TMP5]], i64 0)
// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 1
// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 8)
// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 2
// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP8]], <vscale x 8 x bfloat> [[TMP9]], i64 16)
// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 3
// CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP10]], <vscale x 8 x bfloat> [[TMP11]], i64 24)
// CHECK-NEXT: ret <vscale x 32 x bfloat> [[TMP12]]
//
// CPP-CHECK-LABEL: @_Z27test_svclamp_single_bf16_x414svbfloat16x4_tu14__SVBfloat16_tS0_(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1:%.*]], i64 0)
// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1]], i64 8)
// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1]], i64 16)
// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[OP1]], i64 24)
// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.bfclamp.single.x4.nxv8bf16(<vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[OP2:%.*]], <vscale x 8 x bfloat> [[OP3:%.*]])
// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 0
// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> poison, <vscale x 8 x bfloat> [[TMP5]], i64 0)
// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 1
// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]], i64 8)
// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 2
// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP8]], <vscale x 8 x bfloat> [[TMP9]], i64 16)
// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP4]], 3
// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 32 x bfloat> @llvm.vector.insert.nxv32bf16.nxv8bf16(<vscale x 32 x bfloat> [[TMP10]], <vscale x 8 x bfloat> [[TMP11]], i64 24)
// CPP-CHECK-NEXT: ret <vscale x 32 x bfloat> [[TMP12]]
//
svbfloat16x4_t test_svclamp_single_bf16_x4(svbfloat16x4_t op1, svbfloat16_t op2, svbfloat16_t op3) __arm_streaming {
return SVE_ACLE_FUNC(svclamp, _single_bf16_x4, , )(op1, op2, op3);
}
13 changes: 13 additions & 0 deletions clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_b16b16.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// REQUIRES: aarch64-registered-target

// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -fsyntax-only -verify -verify-ignore-unexpected=error,note -emit-llvm -o - %s

#include <arm_sme.h>

void test_b16b16(svbool_t pg, uint64_t u64, int64_t i64, const bfloat16_t *const_bf16_ptr, bfloat16_t *bf16_ptr, svbfloat16_t bf16, svbfloat16x2_t bf16x2, svbfloat16x3_t bf16x3, svbfloat16x4_t bf16x4) __arm_streaming
{
// expected-error@+1 {{'svclamp_single_bf16_x2' needs target feature sme2,b16b16}}
svclamp_single_bf16_x2(bf16x2, bf16, bf16);
// expected-error@+1 {{'svclamp_single_bf16_x4' needs target feature sme2,b16b16}}
svclamp_single_bf16_x4(bf16x4, bf16, bf16);
}
2 changes: 2 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAArch64.td
Original file line number Diff line number Diff line change
Expand Up @@ -3472,10 +3472,12 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_sve_sclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic;
def int_aarch64_sve_uclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic;
def int_aarch64_sve_fclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic;
def int_aarch64_sve_bfclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic;

def int_aarch64_sve_sclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
def int_aarch64_sve_uclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
def int_aarch64_sve_fclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
def int_aarch64_sve_bfclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;

//
// Multi-vector add/sub and accumulate into ZA
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5734,6 +5734,9 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
AArch64::FCLAMP_VG2_2Z2Z_D}))
SelectClamp(Node, 2, Op);
return;
case Intrinsic::aarch64_sve_bfclamp_single_x2:
SelectClamp(Node, 2, AArch64::BFCLAMP_VG2_2ZZZ_H);
return;
case Intrinsic::aarch64_sve_sclamp_single_x4:
if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int>(
Node->getValueType(0),
Expand All @@ -5755,6 +5758,9 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
AArch64::FCLAMP_VG4_4Z4Z_D}))
SelectClamp(Node, 4, Op);
return;
case Intrinsic::aarch64_sve_bfclamp_single_x4:
SelectClamp(Node, 4, AArch64::BFCLAMP_VG4_4ZZZ_H);
return;
case Intrinsic::aarch64_sve_add_single_x2:
if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int>(
Node->getValueType(0),
Expand Down
26 changes: 25 additions & 1 deletion llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfclamp.ll
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -mattr=+sme2 -mattr=+b16b16 -verify-machineinstrs < %s | FileCheck %s

define <vscale x 8 x bfloat> @bfclamp(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c){
; CHECK-LABEL: bfclamp:
Expand All @@ -11,3 +11,27 @@ define <vscale x 8 x bfloat> @bfclamp(<vscale x 8 x bfloat> %a, <vscale x 8 x bf
}

declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fclamp.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_bfclamp_single_x2_f16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x bfloat> %d){
; CHECK-LABEL: test_bfclamp_single_x2_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: bfclamp { z0.h, z1.h }, z2.h, z3.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.bfclamp.single.x2.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x bfloat> %d)
ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
}

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @test_bfclamp_single_x4_f16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x bfloat> %d, <vscale x 8 x bfloat> %e, <vscale x 8 x bfloat> %f){
; CHECK-LABEL: test_bfclamp_single_x4_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT: bfclamp { z0.h - z3.h }, z4.h, z5.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.bfclamp.single.x4.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, <vscale x 8 x bfloat> %d, <vscale x 8 x bfloat> %e, <vscale x 8 x bfloat> %f)
ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
}
Loading