diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index 1ac6d5170ea283..000bd97a4b25d5 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -298,6 +298,16 @@ multiclass ZAAddSub { def NAME # _ZA64_VG1X2_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x2", "vm2", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x2", [IsStreaming, IsInOutZA], []>; def NAME # _ZA64_VG1X4_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", "vm4", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", [IsStreaming, IsInOutZA], []>; } + + let TargetGuard = "sme-f16f16|sme-f8f16" in { + def NAME # _ZA16_VG1X2_F16 : Inst<"sv" # n_suffix # "_za16[_{d}]_vg1x2", "vm2", "h", MergeNone, "aarch64_sme_" # n_suffix # "_za16_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _ZA16_VG1X4_F16 : Inst<"sv" # n_suffix # "_za16[_{d}]_vg1x4", "vm4", "h", MergeNone, "aarch64_sme_" # n_suffix # "_za16_vg1x4", [IsStreaming, IsInOutZA], []>; + } + + let TargetGuard = "sme2,b16b16" in { + def NAME # _ZA16_VG1X2_BF16 : Inst<"sv" # n_suffix # "_za16[_{d}]_vg1x2", "vm2", "b", MergeNone, "aarch64_sme_" # n_suffix # "_za16_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _ZA16_VG1X4_BF16 : Inst<"sv" # n_suffix # "_za16[_{d}]_vg1x4", "vm4", "b", MergeNone, "aarch64_sme_" # n_suffix # "_za16_vg1x4", [IsStreaming, IsInOutZA], []>; + } } defm SVADD : ZAAddSub<"add">; diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add_sub_za16.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add_sub_za16.c new file mode 100644 index 00000000000000..d98427fac610b8 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add_sub_za16.c @@ -0,0 +1,193 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +b16b16 -O2 -S -Werror -Wall -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-CXX +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +b16b16 -O2 -S -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -x c++ -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK-CXX + +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f16f16 -target-feature +b16b16 -O2 -S -Werror -Wall -o /dev/null + +// REQUIRES: aarch64-registered-target + +#include + +#ifdef SVE_OVERLOADED_FORMS +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +// CHECK-LABEL: define dso_local void @test_svadd_za16_vg1x2_f16( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za16.vg1x2.nxv8f16(i32 [[SLICE]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z25test_svadd_za16_vg1x2_f16j13svfloat16x2_t( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.add.za16.vg1x2.nxv8f16(i32 [[SLICE]], [[TMP0]], [[TMP1]]) +// CHECK-CXX-NEXT: ret void +// +void test_svadd_za16_vg1x2_f16(uint32_t slice, svfloat16x2_t zn) __arm_streaming __arm_inout("za") { + SVE_ACLE_FUNC(svadd_za16,_f16,_vg1x2)(slice, zn); +} + +// CHECK-LABEL: define dso_local void @test_svadd_za16_vg1x4_f16( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za16.vg1x4.nxv8f16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z25test_svadd_za16_vg1x4_f16j13svfloat16x4_t( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.add.za16.vg1x4.nxv8f16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-CXX-NEXT: ret void +// +void test_svadd_za16_vg1x4_f16(uint32_t slice, svfloat16x4_t zn) __arm_streaming __arm_inout("za") { + SVE_ACLE_FUNC(svadd_za16,_f16,_vg1x4)(slice, zn); +} + +// CHECK-LABEL: define dso_local void @test_svsub_za16_vg1x2_f16( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za16.vg1x2.nxv8f16(i32 [[SLICE]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z25test_svsub_za16_vg1x2_f16j13svfloat16x2_t( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv16f16( [[ZN]], i64 8) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.sub.za16.vg1x2.nxv8f16(i32 [[SLICE]], [[TMP0]], [[TMP1]]) +// CHECK-CXX-NEXT: ret void +// +void test_svsub_za16_vg1x2_f16(uint32_t slice, svfloat16x2_t zn) __arm_streaming __arm_inout("za") { + SVE_ACLE_FUNC(svsub_za16,_f16,_vg1x2)(slice, zn); +} + +// CHECK-LABEL: define dso_local void @test_svsub_za16_vg1x4_f16( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za16.vg1x4.nxv8f16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z25test_svsub_za16_vg1x4_f16j13svfloat16x4_t( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 8) +// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 16) +// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8f16.nxv32f16( [[ZN]], i64 24) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.sub.za16.vg1x4.nxv8f16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-CXX-NEXT: ret void +// +void test_svsub_za16_vg1x4_f16(uint32_t slice, svfloat16x4_t zn) __arm_streaming __arm_inout("za") { + SVE_ACLE_FUNC(svsub_za16,_f16,_vg1x4)(slice, zn); +} + +// CHECK-LABEL: define dso_local void @test_svadd_za16_vg1x2_bf16( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za16.vg1x2.nxv8bf16(i32 [[SLICE]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z26test_svadd_za16_vg1x2_bf16j14svbfloat16x2_t( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.add.za16.vg1x2.nxv8bf16(i32 [[SLICE]], [[TMP0]], [[TMP1]]) +// CHECK-CXX-NEXT: ret void +// +void test_svadd_za16_vg1x2_bf16(uint32_t slice, svbfloat16x2_t zn) __arm_streaming __arm_inout("za") { + SVE_ACLE_FUNC(svadd_za16,_bf16,_vg1x2)(slice, zn); +} + +// CHECK-LABEL: define dso_local void @test_svadd_za16_vg1x4_bf16( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.add.za16.vg1x4.nxv8bf16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z26test_svadd_za16_vg1x4_bf16j14svbfloat16x4_t( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.add.za16.vg1x4.nxv8bf16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-CXX-NEXT: ret void +// +void test_svadd_za16_vg1x4_bf16(uint32_t slice, svbfloat16x4_t zn) __arm_streaming __arm_inout("za") { + SVE_ACLE_FUNC(svadd_za16,_bf16,_vg1x4)(slice, zn); +} + +// CHECK-LABEL: define dso_local void @test_svsub_za16_vg1x2_bf16( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za16.vg1x2.nxv8bf16(i32 [[SLICE]], [[TMP0]], [[TMP1]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z26test_svsub_za16_vg1x2_bf16j14svbfloat16x2_t( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv16bf16( [[ZN]], i64 8) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.sub.za16.vg1x2.nxv8bf16(i32 [[SLICE]], [[TMP0]], [[TMP1]]) +// CHECK-CXX-NEXT: ret void +// +void test_svsub_za16_vg1x2_bf16(uint32_t slice, svbfloat16x2_t zn) __arm_streaming __arm_inout("za") { + SVE_ACLE_FUNC(svsub_za16,_bf16,_vg1x2)(slice, zn); +} + +// CHECK-LABEL: define dso_local void @test_svsub_za16_vg1x4_bf16( +// CHECK-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sub.za16.vg1x4.nxv8bf16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z26test_svsub_za16_vg1x4_bf16j14svbfloat16x4_t( +// CHECK-CXX-SAME: i32 noundef [[SLICE:%.*]], [[ZN:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-CXX-NEXT: entry: +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 0) +// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 8) +// CHECK-CXX-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 16) +// CHECK-CXX-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv8bf16.nxv32bf16( [[ZN]], i64 24) +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.sub.za16.vg1x4.nxv8bf16(i32 [[SLICE]], [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-CXX-NEXT: ret void +// +void test_svsub_za16_vg1x4_bf16(uint32_t slice, svbfloat16x4_t zn) __arm_streaming __arm_inout("za") { + SVE_ACLE_FUNC(svsub_za16,_bf16,_vg1x4)(slice, zn); +} diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_add_sub_za16.c b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_add_sub_za16.c new file mode 100644 index 00000000000000..0eeeac5a50469c --- /dev/null +++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_add_sub_za16.c @@ -0,0 +1,29 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -fsyntax-only -verify -emit-llvm %s + +// REQUIRES: aarch64-registered-target + +#include + +void test_features(uint32_t slice, svfloat16x2_t zn2, svfloat16x4_t zn4, + svbfloat16x2_t bzn2, svbfloat16x4_t bzn4) __arm_streaming __arm_inout("za") { + // expected-error@+1 {{'svadd_za16_f16_vg1x2' needs target feature sme-f16f16|sme-f8f16}} + svadd_za16_f16_vg1x2(slice, zn2); + // expected-error@+1 {{'svadd_za16_f16_vg1x4' needs target feature sme-f16f16|sme-f8f16}} + svadd_za16_f16_vg1x4(slice, zn4); + // expected-error@+1 {{'svsub_za16_f16_vg1x2' needs target feature sme-f16f16|sme-f8f16}} + svsub_za16_f16_vg1x2(slice, zn2); + // expected-error@+1 {{'svsub_za16_f16_vg1x4' needs target feature sme-f16f16|sme-f8f16}} + svsub_za16_f16_vg1x4(slice, zn4); + + // expected-error@+1 {{'svadd_za16_bf16_vg1x2' needs target feature sme2,b16b16}} + svadd_za16_bf16_vg1x2(slice, bzn2); + // expected-error@+1 {{'svadd_za16_bf16_vg1x4' needs target feature sme2,b16b16}} + svadd_za16_bf16_vg1x4(slice, bzn4); + // expected-error@+1 {{'svsub_za16_bf16_vg1x2' needs target feature sme2,b16b16}} + svsub_za16_bf16_vg1x2(slice, bzn2); + // expected-error@+1 {{'svsub_za16_bf16_vg1x4' needs target feature sme2,b16b16}} + svsub_za16_bf16_vg1x4(slice, bzn4); +} + + + diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index e31e00a9c76f31..04571faf130679 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -3481,7 +3481,7 @@ let TargetPrefix = "aarch64" in { // Multi-vector add/sub and accumulate into ZA // foreach intr = ["add", "sub"] in { - foreach za = ["za32", "za64"] in { + foreach za = ["za16","za32", "za64"] in { def int_aarch64_sme_ # intr # _ # za # _vg1x2 : SME2_ZA_Write_VG2_Intrinsic; def int_aarch64_sme_ # intr # _ # za # _vg1x4 : SME2_ZA_Write_VG4_Intrinsic; } diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 574178c8d5244c..5102c602d54e13 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -793,10 +793,10 @@ defm LUTI4_S_4ZTZI : sme2p1_luti4_vector_vg4_index<"luti4">; } let Predicates = [HasSMEF16F16orSMEF8F16] in { -defm FADD_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"fadd", 0b0100, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>; -defm FADD_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"fadd", 0b0100, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>; -defm FSUB_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"fsub", 0b0101, MatrixOp16, ZZ_h_mul_r, nxv8f16, null_frag>; -defm FSUB_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"fsub", 0b0101, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, null_frag>; +defm FADD_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"fadd", 0b0100, MatrixOp16, ZZ_h_mul_r, nxv8f16, int_aarch64_sme_add_za16_vg1x2>; +defm FADD_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"fadd", 0b0100, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, int_aarch64_sme_add_za16_vg1x4>; +defm FSUB_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"fsub", 0b0101, MatrixOp16, ZZ_h_mul_r, nxv8f16, int_aarch64_sme_sub_za16_vg1x2>; +defm FSUB_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"fsub", 0b0101, MatrixOp16, ZZZZ_h_mul_r, nxv8f16, int_aarch64_sme_sub_za16_vg1x4>; } let Predicates = [HasSMEF16F16] in { @@ -822,10 +822,10 @@ defm FMOPS_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmops", 0b0, 0b1, 0b11, ZPR16>; } let Predicates = [HasSME2, HasB16B16] in { -defm BFADD_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"bfadd", 0b1100, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>; -defm BFADD_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"bfadd", 0b1100, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>; -defm BFSUB_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"bfsub", 0b1101, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>; -defm BFSUB_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"bfsub", 0b1101, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>; +defm BFADD_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"bfadd", 0b1100, MatrixOp16, ZZ_h_mul_r, nxv8bf16, int_aarch64_sme_add_za16_vg1x2>; +defm BFADD_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"bfadd", 0b1100, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, int_aarch64_sme_add_za16_vg1x4>; +defm BFSUB_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"bfsub", 0b1101, MatrixOp16, ZZ_h_mul_r, nxv8bf16, int_aarch64_sme_sub_za16_vg1x2>; +defm BFSUB_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"bfsub", 0b1101, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, int_aarch64_sme_sub_za16_vg1x4>; defm BFMLA_VG2_M2ZZI : sme2p1_multi_vec_array_vg2_index_16b<"bfmla", 0b00, 0b110, ZZ_h_mul_r, ZPR4b16>; defm BFMLA_VG4_M4ZZI : sme2p1_multi_vec_array_vg4_index_16b<"bfmla", 0b010, ZZZZ_h_mul_r, ZPR4b16>; diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-add-sub-za16.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-add-sub-za16.ll new file mode 100644 index 00000000000000..e7a6c0d6c549be --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-add-sub-za16.ll @@ -0,0 +1,148 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -verify-machineinstrs < %s | FileCheck %s + +target triple = "aarch64-linux" + +define void @add_f16_vg1x2(i32 %slice, %zn0, %zn1) #0 { +; CHECK-LABEL: add_f16_vg1x2: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fadd za.h[w8, 0, vgx2], { z0.h, z1.h } +; CHECK-NEXT: fadd za.h[w8, 7, vgx2], { z0.h, z1.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.add.za16.vg1x2.nxv8f16(i32 %slice, %zn0, %zn1) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.za16.vg1x2.nxv8f16(i32 %slice.7, %zn0, %zn1) + ret void +} + +define void @add_f16_vg1x4(i32 %slice, %zn0, %zn1, +; CHECK-LABEL: add_f16_vg1x4: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: fadd za.h[w8, 0, vgx4], { z0.h - z3.h } +; CHECK-NEXT: fadd za.h[w8, 7, vgx4], { z0.h - z3.h } +; CHECK-NEXT: ret + %zn2, %zn3) #1 { + call void @llvm.aarch64.sme.add.za16.vg1x4.nxv8f16(i32 %slice, %zn0, %zn1, + %zn2, %zn3); + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.za16.vg1x4.nxv8f16(i32 %slice.7, %zn0, %zn1, + %zn2, %zn3); + ret void +} + +define void @sub_f16_vg1x2(i32 %slice, %zn0, %zn1) #1 { +; CHECK-LABEL: sub_f16_vg1x2: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fsub za.h[w8, 0, vgx2], { z0.h, z1.h } +; CHECK-NEXT: fsub za.h[w8, 7, vgx2], { z0.h, z1.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.sub.za16.vg1x2.nxv8f16(i32 %slice, %zn0, %zn1) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.sub.za16.vg1x2.nxv8f16(i32 %slice.7, %zn0, %zn1) + ret void +} + +define void @sub_f16_vg1x4(i32 %slice, %zn0, %zn1, +; CHECK-LABEL: sub_f16_vg1x4: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: fsub za.h[w8, 0, vgx4], { z0.h - z3.h } +; CHECK-NEXT: fsub za.h[w8, 7, vgx4], { z0.h - z3.h } +; CHECK-NEXT: ret + %zn2, %zn3) #0 { + call void @llvm.aarch64.sme.sub.za16.vg1x4.nxv8f16(i32 %slice, %zn0, %zn1, + %zn2, %zn3); + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.sub.za16.vg1x4.nxv8f16(i32 %slice.7, %zn0, %zn1, + %zn2, %zn3); + ret void +} + +define void @add_bf16_vg1x2(i32 %slice, %zn0, %zn1) #2 { +; CHECK-LABEL: add_bf16_vg1x2: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: bfadd za.h[w8, 0, vgx2], { z0.h, z1.h } +; CHECK-NEXT: bfadd za.h[w8, 7, vgx2], { z0.h, z1.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.add.za16.vg1x2.nxv8bf16(i32 %slice, %zn0, %zn1) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.za16.vg1x2.nxv8bf16(i32 %slice.7, %zn0, %zn1) + ret void +} + +define void @add_bf16_vg1x4(i32 %slice, %zn0, %zn1, +; CHECK-LABEL: add_bf16_vg1x4: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: bfadd za.h[w8, 0, vgx4], { z0.h - z3.h } +; CHECK-NEXT: bfadd za.h[w8, 7, vgx4], { z0.h - z3.h } +; CHECK-NEXT: ret + %zn2, %zn3) #2 { + call void @llvm.aarch64.sme.add.za16.vg1x4.nxv8bf16(i32 %slice, %zn0, %zn1, + %zn2, %zn3); + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.add.za16.vg1x4.nxv8bf16(i32 %slice.7, %zn0, %zn1, + %zn2, %zn3); + ret void +} + +define void @sub_bf16_vg1x2(i32 %slice, %zn0, %zn1) #2 { +; CHECK-LABEL: sub_bf16_vg1x2: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: bfsub za.h[w8, 0, vgx2], { z0.h, z1.h } +; CHECK-NEXT: bfsub za.h[w8, 7, vgx2], { z0.h, z1.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.sub.za16.vg1x2.nxv8bf16(i32 %slice, %zn0, %zn1) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.sub.za16.vg1x2.nxv8bf16(i32 %slice.7, %zn0, %zn1) + ret void +} + +define void @sub_bf16_vg1x4(i32 %slice, %zn0, %zn1, +; CHECK-LABEL: sub_bf16_vg1x4: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: bfsub za.h[w8, 0, vgx4], { z0.h - z3.h } +; CHECK-NEXT: bfsub za.h[w8, 7, vgx4], { z0.h - z3.h } +; CHECK-NEXT: ret + %zn2, %zn3) #2 { + call void @llvm.aarch64.sme.sub.za16.vg1x4.nxv8bf16(i32 %slice, %zn0, %zn1, + %zn2, %zn3); + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.sub.za16.vg1x4.nxv8bf16(i32 %slice.7, %zn0, %zn1, + %zn2, %zn3); + ret void +} + +attributes #0 = { nounwind "target-features"="+sme-f16f16" } +attributes #1 = { nounwind "target-features"="+sme-f8f16" } +attributes #2 = { nounwind "target-features"="+sme2,+bf16,+b16b16" }