From 3a96a9bd363be318dc25e8e9a9428cce364125a3 Mon Sep 17 00:00:00 2001 From: CarolineConcatto Date: Wed, 26 Jun 2024 12:54:37 +0100 Subject: [PATCH] [CLANG][LLVM][AArch64]SME2.1 intrinsics for MOVAZ tile to 2/4 vectors (#88710) According to the specification in ARM-software/acle#309 this adds the intrinsics // Variants are also available for _za8_u8, _za16_s16, _za16_u16, // _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32, // _za64_s64, _za64_u64 and _za64_f64 svint8x2_t svreadz_hor_za8_s8_vg2(uint64_t tile, uint32_t slice) __arm_streaming __arm_inout("za"); // Variants are also available for _za8_u8, _za16_s16, _za16_u16, // _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32, // _za64_s64, _za64_u64 and _za64_f64 svint8x4_t svreadz_hor_za8_s8_vg4(uint64_t tile, uint32_t slice) __arm_streaming __arm_inout("za"); // Variants are also available for _za8_u8, _za16_s16, _za16_u16, // _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32, // _za64_s64, _za64_u64 and _za64_f64 svint8x2_t svreadz_ver_za8_s8_vg2(uint64_t tile, uint32_t slice) __arm_streaming __arm_inout("za"); // Variants are also available for _za8_u8, _za16_s16, _za16_u16, // _za16_f16, _za16_bf16, _za32_s32, _za32_u32, _za32_f32, // _za64_s64, _za64_u64 and _za64_f64 svint8x4_t svreadz_ver_za8_s8_vg4(uint64_t tile, uint32_t slice) __arm_streaming __arm_inout("za"); --- clang/include/clang/Basic/arm_sme.td | 23 + .../acle_sme2p1_movaz.c | 1414 +++++++++++++++++ llvm/include/llvm/IR/IntrinsicsAArch64.td | 18 + .../Target/AArch64/AArch64ISelDAGToDAG.cpp | 99 +- .../Target/AArch64/AArch64ISelLowering.cpp | 31 +- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 3 +- llvm/lib/Target/AArch64/SMEInstrFormats.td | 43 +- .../AArch64/sme2p1-intrinsics-movaz.ll | 459 ++++++ 8 files changed, 2067 insertions(+), 23 deletions(-) create mode 100644 clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_movaz.c create mode 100644 llvm/test/CodeGen/AArch64/sme2p1-intrinsics-movaz.ll diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index 1580331ea603c5..5f757b40e8fd9f 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -764,4 +764,27 @@ let SMETargetGuard = "sme-f16f16" in { [ImmCheck<0, ImmCheck0_1>]>; } + +multiclass ZAReadz ch> { + let SMETargetGuard = "sme2p1" in { + def NAME # _H : SInst<"svreadz_hor_" # n_suffix # "_{d}_vg" # vg_num, vg_num # "im", t, + MergeNone, i_prefix # "_horiz_x" # vg_num, + [IsStreaming, IsInOutZA], ch>; + + def NAME # _V : SInst<"svreadz_ver_" # n_suffix # "_{d}_vg" # vg_num, vg_num # "im", t, + MergeNone, i_prefix # "_vert_x" #vg_num, + [IsStreaming, IsInOutZA], ch>; + } +} + +defm SVREADZ_ZA8_X2 : ZAReadz<"za8", "2", "cUc", "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_0>]>; +defm SVREADZ_ZA16_X2 : ZAReadz<"za16", "2", "sUshb", "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_1>]>; +defm SVREADZ_ZA32_X2 : ZAReadz<"za32", "2", "iUif", "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_3>]>; +defm SVREADZ_ZA64_X2 : ZAReadz<"za64", "2", "lUld", "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_7>]>; + +defm SVREADZ_ZA8_X4 : ZAReadz<"za8", "4", "cUc", "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_0>]>; +defm SVREADZ_ZA16_X4 : ZAReadz<"za16", "4", "sUshb", "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_1>]>; +defm SVREADZ_ZA32_X4 : ZAReadz<"za32", "4", "iUif", "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_3>]>; +defm SVREADZ_ZA64_X4 : ZAReadz<"za64", "4", "lUld", "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_7>]>; + } // let SVETargetGuard = InvalidMode diff --git a/clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_movaz.c b/clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_movaz.c new file mode 100644 index 00000000000000..3f5337c2b23d2d --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_movaz.c @@ -0,0 +1,1414 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 + //RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +sme -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +// +// X2- hor +// CHECK-LABEL: define dso_local @test_svreadz_hor_za8_s8_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv16i8(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z26test_svreadz_hor_za8_s8_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0:[0-9]+]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv16i8(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint8x2_t test_svreadz_hor_za8_s8_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za8_s8_vg2(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za8_u8_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv16i8(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z26test_svreadz_hor_za8_u8_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv16i8(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8x2_t test_svreadz_hor_za8_u8_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za8_u8_vg2(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za16_s16_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv8i16(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za16_s16_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv8i16(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint16x2_t test_svreadz_hor_za16_s16_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za16_s16_vg2(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za16_u16_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv8i16(i32 1, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za16_u16_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv8i16(i32 1, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16x2_t test_svreadz_hor_za16_u16_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za16_u16_vg2(1, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za16_f16_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv8f16(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za16_f16_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv8f16(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat16x2_t test_svreadz_hor_za16_f16_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za16_f16_vg2(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za16_bf16_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv8bf16(i32 1, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z29test_svreadz_hor_za16_bf16_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv8bf16(i32 1, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svbfloat16x2_t test_svreadz_hor_za16_bf16_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za16_bf16_vg2(1, slice); +} + + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za32_s32_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv4i32(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za32_s32_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv4i32(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint32x2_t test_svreadz_hor_za32_s32_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za32_s32_vg2(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za32_u32_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv4i32(i32 2, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za32_u32_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv4i32(i32 2, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint32x2_t test_svreadz_hor_za32_u32_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za32_u32_vg2(2, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za32_f32_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv4f32(i32 3, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za32_f32_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv4f32(i32 3, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat32x2_t test_svreadz_hor_za32_f32_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za32_f32_vg2(3, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za64_s64_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv2i64(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za64_s64_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv2i64(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint64x2_t test_svreadz_hor_za64_s64_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za64_s64_vg2(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za64_u64_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv2i64(i32 4, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za64_u64_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv2i64(i32 4, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint64x2_t test_svreadz_hor_za64_u64_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za64_u64_vg2(4, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za64_f64_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv2f64(i32 7, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za64_f64_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.horiz.x2.nxv2f64(i32 7, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat64x2_t test_svreadz_hor_za64_f64_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za64_f64_vg2(7, slice); +} + + +// +// X2- ver +// + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za8_s8_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv16i8(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z26test_svreadz_ver_za8_s8_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv16i8(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint8x2_t test_svreadz_ver_za8_s8_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za8_s8_vg2(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za8_u8_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv16i8(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z26test_svreadz_ver_za8_u8_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv16i8(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint8x2_t test_svreadz_ver_za8_u8_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za8_u8_vg2(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za16_s16_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv8i16(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za16_s16_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv8i16(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint16x2_t test_svreadz_ver_za16_s16_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za16_s16_vg2(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za16_u16_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv8i16(i32 1, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za16_u16_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv8i16(i32 1, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint16x2_t test_svreadz_ver_za16_u16_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za16_u16_vg2(1, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za16_f16_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv8f16(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za16_f16_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv8f16(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat16x2_t test_svreadz_ver_za16_f16_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za16_f16_vg2(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za16_bf16_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv8bf16(i32 1, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z29test_svreadz_ver_za16_bf16_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv8bf16(i32 1, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svbfloat16x2_t test_svreadz_ver_za16_bf16_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za16_bf16_vg2(1, slice); +} + + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za32_s32_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv4i32(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za32_s32_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv4i32(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint32x2_t test_svreadz_ver_za32_s32_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za32_s32_vg2(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za32_u32_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv4i32(i32 2, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za32_u32_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv4i32(i32 2, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint32x2_t test_svreadz_ver_za32_u32_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za32_u32_vg2(2, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za32_f32_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv4f32(i32 3, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za32_f32_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv4f32(i32 3, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat32x2_t test_svreadz_ver_za32_f32_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za32_f32_vg2(3, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za64_s64_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv2i64(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za64_s64_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv2i64(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svint64x2_t test_svreadz_ver_za64_s64_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za64_s64_vg2(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za64_u64_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv2i64(i32 4, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za64_u64_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv2i64(i32 4, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svuint64x2_t test_svreadz_ver_za64_u64_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za64_u64_vg2(4, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za64_f64_x2( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv2f64(i32 7, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: ret [[TMP4]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za64_f64_x2j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , } @llvm.aarch64.sme.readz.vert.x2.nxv2f64(i32 7, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: ret [[TMP4]] +// +svfloat64x2_t test_svreadz_ver_za64_f64_x2(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za64_f64_vg2(7, slice); +} + + +// +// X4 - hor +// CHECK-LABEL: define dso_local @test_svreadz_hor_za8_s8_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv16i8(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z26test_svreadz_hor_za8_s8_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv16i8(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint8x4_t test_svreadz_hor_za8_s8_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za8_s8_vg4(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za8_u8_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv16i8(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z26test_svreadz_hor_za8_u8_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv16i8(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint8x4_t test_svreadz_hor_za8_u8_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za8_u8_vg4(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za16_s16_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv8i16(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za16_s16_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv8i16(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint16x4_t test_svreadz_hor_za16_s16_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za16_s16_vg4(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za16_u16_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv8i16(i32 1, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za16_u16_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv8i16(i32 1, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint16x4_t test_svreadz_hor_za16_u16_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za16_u16_vg4(1, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za16_f16_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv8f16(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za16_f16_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv8f16(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat16x4_t test_svreadz_hor_za16_f16_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za16_f16_vg4(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za16_bf16_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv8bf16(i32 1, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z29test_svreadz_hor_za16_bf16_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv8bf16(i32 1, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svbfloat16x4_t test_svreadz_hor_za16_bf16_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za16_bf16_vg4(1, slice); +} + + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za32_s32_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv4i32(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za32_s32_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv4i32(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint32x4_t test_svreadz_hor_za32_s32_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za32_s32_vg4(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za32_u32_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv4i32(i32 2, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za32_u32_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv4i32(i32 2, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x4_t test_svreadz_hor_za32_u32_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za32_u32_vg4(2, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za32_f32_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv4f32(i32 3, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za32_f32_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv4f32(i32 3, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x4_t test_svreadz_hor_za32_f32_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za32_f32_vg4(3, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za64_s64_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv2i64(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za64_s64_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv2i64(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint64x4_t test_svreadz_hor_za64_s64_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za64_s64_vg4(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za64_u64_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv2i64(i32 4, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za64_u64_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv2i64(i32 4, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint64x4_t test_svreadz_hor_za64_u64_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za64_u64_vg4(4, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_hor_za64_f64_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv2f64(i32 7, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_hor_za64_f64_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.horiz.x4.nxv2f64(i32 7, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x4_t test_svreadz_hor_za64_f64_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_hor_za64_f64_vg4(7, slice); +} + +// +// X4 - ver +// CHECK-LABEL: define dso_local @test_svreadz_ver_za8_s8_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv16i8(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z26test_svreadz_ver_za8_s8_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv16i8(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint8x4_t test_svreadz_ver_za8_s8_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za8_s8_vg4(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za8_u8_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv16i8(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z26test_svreadz_ver_za8_u8_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv16i8(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint8x4_t test_svreadz_ver_za8_u8_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za8_u8_vg4(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za16_s16_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv8i16(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za16_s16_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv8i16(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint16x4_t test_svreadz_ver_za16_s16_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za16_s16_vg4(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za16_u16_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv8i16(i32 1, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za16_u16_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv8i16(i32 1, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint16x4_t test_svreadz_ver_za16_u16_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za16_u16_vg4(1, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za16_f16_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv8f16(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za16_f16_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv8f16(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat16x4_t test_svreadz_ver_za16_f16_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za16_f16_vg4(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za16_bf16_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv8bf16(i32 1, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z29test_svreadz_ver_za16_bf16_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv8bf16(i32 1, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svbfloat16x4_t test_svreadz_ver_za16_bf16_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za16_bf16_vg4(1, slice); +} + + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za32_s32_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv4i32(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za32_s32_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv4i32(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint32x4_t test_svreadz_ver_za32_s32_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za32_s32_vg4(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za32_u32_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv4i32(i32 2, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za32_u32_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv4i32(i32 2, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint32x4_t test_svreadz_ver_za32_u32_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za32_u32_vg4(2, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za32_f32_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv4f32(i32 3, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za32_f32_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv4f32(i32 3, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP4]], [[TMP5]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat32x4_t test_svreadz_ver_za32_f32_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za32_f32_vg4(3, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za64_s64_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv2i64(i32 0, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za64_s64_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv2i64(i32 0, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svint64x4_t test_svreadz_ver_za64_s64_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za64_s64_vg4(0, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za64_u64_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv2i64(i32 4, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za64_u64_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv2i64(i32 4, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8i64.nxv2i64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svuint64x4_t test_svreadz_ver_za64_u64_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za64_u64_vg4(4, slice); +} + +// CHECK-LABEL: define dso_local @test_svreadz_ver_za64_f64_x4( +// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv2f64(i32 7, i32 [[SLICE]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CHECK-NEXT: ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z28test_svreadz_ver_za64_f64_x4j( +// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.readz.vert.x4.nxv2f64(i32 7, i32 [[SLICE]]) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP2]], [[TMP3]], i64 2) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP4]], [[TMP5]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv8f64.nxv2f64( [[TMP6]], [[TMP7]], i64 6) +// CPP-CHECK-NEXT: ret [[TMP8]] +// +svfloat64x4_t test_svreadz_ver_za64_f64_x4(uint32_t slice) __arm_streaming __arm_inout("za") +{ + return svreadz_ver_za64_f64_vg4(7, slice); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 9a71aaa9f44349..38d71b17b476d5 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -2841,6 +2841,24 @@ let TargetPrefix = "aarch64" in { def int_aarch64_sme_writeq_horiz : SME_VectorToTile_Intrinsic; def int_aarch64_sme_writeq_vert : SME_VectorToTile_Intrinsic; + class SME_MOVAZ_TileToVector_X2_Intrinsic + : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], + [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrHasSideEffects, ImmArg>]>; + + class SME_MOVAZ_TileToVector_X4_Intrinsic + : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, + LLVMMatchType<0>,LLVMMatchType<0>], + [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrHasSideEffects, ImmArg>]>; + + def int_aarch64_sme_readz_horiz_x2 : SME_MOVAZ_TileToVector_X2_Intrinsic; + def int_aarch64_sme_readz_vert_x2 : SME_MOVAZ_TileToVector_X2_Intrinsic; + + def int_aarch64_sme_readz_horiz_x4 : SME_MOVAZ_TileToVector_X4_Intrinsic; + def int_aarch64_sme_readz_vert_x4 : SME_MOVAZ_TileToVector_X4_Intrinsic; + + def int_aarch64_sme_zero : DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg>]>; class SME_OuterProduct_Intrinsic diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 5bad1da7da15d2..544eec3ab9cecf 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -395,7 +395,8 @@ class AArch64DAGToDAGISel : public SelectionDAGISel { template void SelectMultiVectorMove(SDNode *N, unsigned NumVecs, unsigned BaseReg, unsigned Op); - + void SelectMultiVectorMoveZ(SDNode *N, unsigned NumVecs, unsigned Op, + unsigned MaxIdx, unsigned Scale); bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm); /// SVE Reg+Imm addressing mode. template @@ -2003,6 +2004,34 @@ void AArch64DAGToDAGISel::SelectMultiVectorMove(SDNode *N, unsigned NumVecs, CurDAG->RemoveDeadNode(N); } +void AArch64DAGToDAGISel::SelectMultiVectorMoveZ(SDNode *N, unsigned NumVecs, + unsigned Op, unsigned MaxIdx, + unsigned Scale) { + + SDValue SliceBase = N->getOperand(3); + SDValue Base, Offset; + if (!SelectSMETileSlice(SliceBase, MaxIdx, Base, Offset, Scale)) + return; + // The correct Za tile number is computed in Machine Instruction + // See EmitZAInstr + // DAG cannot select Za tile as an output register with ZReg + SDLoc DL(N); + SDValue Ops[] = {/*TileNum*/ N->getOperand(2), Base, Offset, + /*Chain*/ N->getOperand(0)}; + SDNode *Mov = CurDAG->getMachineNode(Op, DL, {MVT::Untyped, MVT::Other}, Ops); + + EVT VT = N->getValueType(0); + for (unsigned I = 0; I < NumVecs; ++I) + ReplaceUses(SDValue(N, I), + CurDAG->getTargetExtractSubreg(AArch64::zsub0 + I, DL, VT, + SDValue(Mov, 0))); + + // Copy chain + unsigned ChainIdx = NumVecs; + ReplaceUses(SDValue(N, ChainIdx), SDValue(Mov, 1)); + CurDAG->RemoveDeadNode(N); +} + void AArch64DAGToDAGISel::SelectUnaryMultiIntrinsic(SDNode *N, unsigned NumOutVecs, bool IsTupleInput, @@ -5245,6 +5274,74 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { AArch64::MOVA_VG4_4ZMXI); return; } + case Intrinsic::aarch64_sme_readz_horiz_x2: { + if (VT == MVT::nxv16i8) { + SelectMultiVectorMoveZ(Node, 2, AArch64::MOVAZ_2ZMI_H_B_PSEUDO, 14, 2); + return; + } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || + VT == MVT::nxv8bf16) { + SelectMultiVectorMoveZ(Node, 2, AArch64::MOVAZ_2ZMI_H_H_PSEUDO, 6, 2); + return; + } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { + SelectMultiVectorMoveZ(Node, 2, AArch64::MOVAZ_2ZMI_H_S_PSEUDO, 2, 2); + return; + } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { + SelectMultiVectorMoveZ(Node, 2, AArch64::MOVAZ_2ZMI_H_D_PSEUDO, 0, 2); + return; + } + break; + } + case Intrinsic::aarch64_sme_readz_vert_x2: { + if (VT == MVT::nxv16i8) { + SelectMultiVectorMoveZ(Node, 2, AArch64::MOVAZ_2ZMI_V_B_PSEUDO, 14, 2); + return; + } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || + VT == MVT::nxv8bf16) { + SelectMultiVectorMoveZ(Node, 2, AArch64::MOVAZ_2ZMI_V_H_PSEUDO, 6, 2); + return; + } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { + SelectMultiVectorMoveZ(Node, 2, AArch64::MOVAZ_2ZMI_V_S_PSEUDO, 2, 2); + return; + } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { + SelectMultiVectorMoveZ(Node, 2, AArch64::MOVAZ_2ZMI_V_D_PSEUDO, 0, 2); + return; + } + break; + } + case Intrinsic::aarch64_sme_readz_horiz_x4: { + if (VT == MVT::nxv16i8) { + SelectMultiVectorMoveZ(Node, 4, AArch64::MOVAZ_4ZMI_H_B_PSEUDO, 12, 4); + return; + } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || + VT == MVT::nxv8bf16) { + SelectMultiVectorMoveZ(Node, 4, AArch64::MOVAZ_4ZMI_H_H_PSEUDO, 4, 4); + return; + } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { + SelectMultiVectorMoveZ(Node, 4, AArch64::MOVAZ_4ZMI_H_S_PSEUDO, 0, 4); + return; + } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { + SelectMultiVectorMoveZ(Node, 4, AArch64::MOVAZ_4ZMI_H_D_PSEUDO, 0, 4); + return; + } + break; + } + case Intrinsic::aarch64_sme_readz_vert_x4: { + if (VT == MVT::nxv16i8) { + SelectMultiVectorMoveZ(Node, 4, AArch64::MOVAZ_4ZMI_V_B_PSEUDO, 12, 4); + return; + } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || + VT == MVT::nxv8bf16) { + SelectMultiVectorMoveZ(Node, 4, AArch64::MOVAZ_4ZMI_V_H_PSEUDO, 4, 4); + return; + } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { + SelectMultiVectorMoveZ(Node, 4, AArch64::MOVAZ_4ZMI_V_S_PSEUDO, 0, 4); + return; + } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { + SelectMultiVectorMoveZ(Node, 4, AArch64::MOVAZ_4ZMI_V_D_PSEUDO, 0, 4); + return; + } + break; + } case Intrinsic::swift_async_context_addr: { SDLoc DL(Node); SDValue Chain = Node->getOperand(0); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 8b1107f9e08af0..b32e67a6e2cdcc 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2976,18 +2976,25 @@ MachineBasicBlock *AArch64TargetLowering::EmitZTInstr(MachineInstr &MI, MachineBasicBlock * AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg, MachineInstr &MI, - MachineBasicBlock *BB, bool HasTile) const { + MachineBasicBlock *BB) const { const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc)); unsigned StartIdx = 0; + bool HasTile = BaseReg != AArch64::ZA; + bool HasZPROut = HasTile && MI.getOperand(0).isReg(); + if (HasZPROut) { + MIB.add(MI.getOperand(0)); // Output ZPR + ++StartIdx; + } if (HasTile) { - MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define); - MIB.addReg(BaseReg + MI.getOperand(0).getImm()); - StartIdx = 1; - } else + MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm(), + RegState::Define); // Output ZA Tile + MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm()); // Input Za Tile + StartIdx++; + } else { MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg); - + } for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I) MIB.add(MI.getOperand(I)); @@ -3096,17 +3103,17 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask; switch (SMEMatrixType) { case (AArch64::SMEMatrixArray): - return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB, /*HasTile*/ false); + return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB); case (AArch64::SMEMatrixTileB): - return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB, /*HasTile*/ true); + return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB); case (AArch64::SMEMatrixTileH): - return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB, /*HasTile*/ true); + return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB); case (AArch64::SMEMatrixTileS): - return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB, /*HasTile*/ true); + return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB); case (AArch64::SMEMatrixTileD): - return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB, /*HasTile*/ true); + return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB); case (AArch64::SMEMatrixTileQ): - return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB, /*HasTile*/ true); + return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB); } } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 986f1b67ee5136..5200b24d1388a5 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -655,8 +655,7 @@ class AArch64TargetLowering : public TargetLowering { MachineBasicBlock *BB) const; MachineBasicBlock *EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock *EmitZAInstr(unsigned Opc, unsigned BaseReg, - MachineInstr &MI, MachineBasicBlock *BB, - bool HasTile) const; + MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock *EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode, bool Op0IsDef) const; MachineBasicBlock *EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const; diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index b21b1faf5c9622..3087f6090379aa 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -111,6 +111,12 @@ class sem2p1_zero_matrix_pseudo + : SMEPseudo2Instr, + Pseudo<(outs vector_ty:$Zn), (ins tile_imm:$tile, MatrixIndexGPR32Op12_15:$Rs, imm_ty:$imm), []> { + let SMEMatrixType = za_flag; + let usesCustomInserter = 1; +} //===----------------------------------------------------------------------===// // SME pattern match helpers. //===----------------------------------------------------------------------===// @@ -4000,7 +4006,7 @@ multiclass sme2_mova_tile_to_vec_vg2_multi_inst opc, string mnemo def _B : sme2_mova_tile_to_vec_vg2_multi_base<0b00, v, opc, ZZ_b_mul_r, !if(v, TileVectorOpV8, TileVectorOpH8), - uimm3s2range, mnemonic> { + uimm3s2range, mnemonic>, SMEPseudo2Instr { bits<3> imm; let Inst{7-5} = imm; } @@ -4008,7 +4014,7 @@ multiclass sme2_mova_tile_to_vec_vg2_multi_inst opc, string mnemo def _H : sme2_mova_tile_to_vec_vg2_multi_base<0b01, v, opc, ZZ_h_mul_r, !if(v, TileVectorOpV16, TileVectorOpH16), - uimm2s2range, mnemonic> { + uimm2s2range, mnemonic>, SMEPseudo2Instr { bits<1> ZAn; bits<2> imm; let Inst{7} = ZAn; @@ -4018,7 +4024,7 @@ multiclass sme2_mova_tile_to_vec_vg2_multi_inst opc, string mnemo def _S : sme2_mova_tile_to_vec_vg2_multi_base<0b10, v, opc, ZZ_s_mul_r, !if(v, TileVectorOpV32, TileVectorOpH32), - uimm1s2range, mnemonic> { + uimm1s2range, mnemonic>, SMEPseudo2Instr { bits<2> ZAn; bits<1> imm; let Inst{7-6} = ZAn; @@ -4028,7 +4034,7 @@ multiclass sme2_mova_tile_to_vec_vg2_multi_inst opc, string mnemo def _D : sme2_mova_tile_to_vec_vg2_multi_base<0b11, v, opc, ZZ_d_mul_r, !if(v, TileVectorOpV64, TileVectorOpH64), - uimm0s2range, mnemonic> { + uimm0s2range, mnemonic>, SMEPseudo2Instr { bits<3> ZAn; let Inst{7-5} = ZAn; } @@ -4097,6 +4103,17 @@ multiclass sme2_mova_tile_to_vec_vg2_multi{ multiclass sme2p1_movaz_tile_to_vec_vg2{ defm _H : sme2_mova_tile_to_vec_vg2_multi_inst<0b0, 0b010, mnemonic>; defm _V : sme2_mova_tile_to_vec_vg2_multi_inst<0b1, 0b010, mnemonic>; + + + def NAME # _H_B_PSEUDO : sme2_movez_to_tile_multi_pseudo; + def NAME # _H_H_PSEUDO : sme2_movez_to_tile_multi_pseudo; + def NAME # _H_S_PSEUDO : sme2_movez_to_tile_multi_pseudo; + def NAME # _H_D_PSEUDO : sme2_movez_to_tile_multi_pseudo; + + def NAME # _V_B_PSEUDO : sme2_movez_to_tile_multi_pseudo; + def NAME # _V_H_PSEUDO : sme2_movez_to_tile_multi_pseudo; + def NAME # _V_S_PSEUDO : sme2_movez_to_tile_multi_pseudo; + def NAME # _V_D_PSEUDO : sme2_movez_to_tile_multi_pseudo; } class sme2_mova_tile_to_vec_vg4_multi_base sz, bit v, bits<6> op, @@ -4130,7 +4147,7 @@ multiclass sme2_mova_tile_to_vec_vg4_multi_base opc, string mnemo ZZZZ_b_mul_r, !if(v, TileVectorOpV8, TileVectorOpH8), - uimm2s4range, mnemonic> { + uimm2s4range, mnemonic>, SMEPseudo2Instr { bits<2> imm; let Inst{6-5} = imm; } @@ -4139,7 +4156,7 @@ multiclass sme2_mova_tile_to_vec_vg4_multi_base opc, string mnemo ZZZZ_h_mul_r, !if(v, TileVectorOpV16, TileVectorOpH16), - uimm1s4range, mnemonic> { + uimm1s4range, mnemonic>, SMEPseudo2Instr { bits<1> ZAn; bits<1> imm; let Inst{6} = ZAn; @@ -4150,7 +4167,7 @@ multiclass sme2_mova_tile_to_vec_vg4_multi_base opc, string mnemo ZZZZ_s_mul_r, !if(v, TileVectorOpV32, TileVectorOpH32), - uimm0s4range, mnemonic> { + uimm0s4range, mnemonic>, SMEPseudo2Instr { bits<2> ZAn; let Inst{6-5} = ZAn; } @@ -4159,7 +4176,7 @@ multiclass sme2_mova_tile_to_vec_vg4_multi_base opc, string mnemo ZZZZ_d_mul_r, !if(v, TileVectorOpV64, TileVectorOpH64), - uimm0s4range, mnemonic> { + uimm0s4range, mnemonic>, SMEPseudo2Instr { bits<3> ZAn; let Inst{7-5} = ZAn; } @@ -4228,6 +4245,16 @@ multiclass sme2_mova_tile_to_vec_vg4_multi{ multiclass sme2p1_movaz_tile_to_vec_vg4{ defm _H : sme2_mova_tile_to_vec_vg4_multi_base<0b0, 0b110, mnemonic>; defm _V : sme2_mova_tile_to_vec_vg4_multi_base<0b1, 0b110, mnemonic>; + + def NAME # _H_B_PSEUDO : sme2_movez_to_tile_multi_pseudo; + def NAME # _H_H_PSEUDO : sme2_movez_to_tile_multi_pseudo; + def NAME # _H_S_PSEUDO : sme2_movez_to_tile_multi_pseudo; + def NAME # _H_D_PSEUDO : sme2_movez_to_tile_multi_pseudo; + + def NAME # _V_B_PSEUDO : sme2_movez_to_tile_multi_pseudo; + def NAME # _V_H_PSEUDO : sme2_movez_to_tile_multi_pseudo; + def NAME # _V_S_PSEUDO : sme2_movez_to_tile_multi_pseudo; + def NAME # _V_D_PSEUDO : sme2_movez_to_tile_multi_pseudo; } diff --git a/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-movaz.ll b/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-movaz.ll new file mode 100644 index 00000000000000..f76cd6d1f5a175 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-movaz.ll @@ -0,0 +1,459 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2p1 -verify-machineinstrs < %s | FileCheck %s + +;MOVAZ (tile to vector, Multi) + + +;; +; X2 - Horiz +;; + +define {, } @test_readz_hor_z8_i8_x2(i32 %tile, i32 %slice) #0 { +; CHECK-LABEL: test_readz_hor_z8_i8_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w1 +; CHECK-NEXT: movaz { z0.b, z1.b }, za0h.b[w12, 0:1] +; CHECK-NEXT: movaz { z0.b, z1.b }, za0h.b[w12, 14:15] +; CHECK-NEXT: ret + %res = call {, } @llvm.aarch64.sme.readz.horiz.x2.nxv16i8(i32 0, i32 %slice) + %slice.max = add i32 %slice, 14 + %res2 = call {, } @llvm.aarch64.sme.readz.horiz.x2.nxv16i8(i32 0, i32 %slice.max) + ret {, } %res2 +} +define {, } @test_readz_hor_z16_i16_x2(i32 %slice) #0 { +; CHECK-LABEL: test_readz_hor_z16_i16_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: movaz { z0.h, z1.h }, za0h.h[w12, 0:1] +; CHECK-NEXT: movaz { z0.h, z1.h }, za1h.h[w12, 6:7] +; CHECK-NEXT: ret + %res = call {, } @llvm.aarch64.sme.readz.horiz.x2.nxv8i16(i32 0, i32 %slice) + %slice.max = add i32 %slice, 6 + %res2 = call {, } @llvm.aarch64.sme.readz.horiz.x2.nxv8i16(i32 1, i32 %slice.max) + ret {, } %res2 +} + +define {, } @test_readz_hor_z32_i32_x2(i32 %slice) #0 { +; CHECK-LABEL: test_readz_hor_z32_i32_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: movaz { z0.s, z1.s }, za0h.s[w12, 0:1] +; CHECK-NEXT: movaz { z0.s, z1.s }, za3h.s[w12, 2:3] +; CHECK-NEXT: ret + %res = call {, } @llvm.aarch64.sme.readz.horiz.x2.nxv4i32(i32 0, i32 %slice) + %slice.max = add i32 %slice, 2 + %res2 = call {, } @llvm.aarch64.sme.readz.horiz.x2.nxv4i32(i32 3, i32 %slice.max) + ret {, } %res2 +} + +define {, } @test_readz_hor_z64_i64_x2(i32 %slice) #0 { +; CHECK-LABEL: test_readz_hor_z64_i64_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: movaz { z0.d, z1.d }, za0h.d[w12, 0:1] +; CHECK-NEXT: movaz { z2.d, z3.d }, za7h.d[w12, 0:1] +; CHECK-NEXT: ret + %res = call {, } @llvm.aarch64.sme.readz.horiz.x2.nxv2i64(i32 0, i32 %slice) + %res2 = call {, } @llvm.aarch64.sme.readz.horiz.x2.nxv2i64(i32 7, i32 %slice) + ret {, } %res +} + +define {, } @test_readz_hor_z16_bf16_x2(i32 %slice) #0 { +; CHECK-LABEL: test_readz_hor_z16_bf16_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: movaz { z0.h, z1.h }, za0h.h[w12, 0:1] +; CHECK-NEXT: movaz { z0.h, z1.h }, za1h.h[w12, 6:7] +; CHECK-NEXT: ret + %res = call {, } @llvm.aarch64.sme.readz.horiz.x2.nxv8bf16(i32 0, i32 %slice) + %slice.max = add i32 %slice, 6 + %res2 = call {, } @llvm.aarch64.sme.readz.horiz.x2.nxv8bf16(i32 1, i32 %slice.max) + ret {, } %res2 +} + +define {, } @test_readz_hor_z16_f16_x2(i32 %slice) #0 { +; CHECK-LABEL: test_readz_hor_z16_f16_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: movaz { z0.h, z1.h }, za0h.h[w12, 0:1] +; CHECK-NEXT: movaz { z0.h, z1.h }, za1h.h[w12, 6:7] +; CHECK-NEXT: ret + %res = call {, } @llvm.aarch64.sme.readz.horiz.x2.nxv8f16(i32 0, i32 %slice) + %slice.max = add i32 %slice, 6 + %res2 = call {, } @llvm.aarch64.sme.readz.horiz.x2.nxv8f16(i32 1, i32 %slice.max) + ret {, } %res2 +} + +define {, } @test_readz_hor_z32_f32_x2(i32 %slice) #0 { +; CHECK-LABEL: test_readz_hor_z32_f32_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: movaz { z0.s, z1.s }, za0h.s[w12, 0:1] +; CHECK-NEXT: movaz { z0.s, z1.s }, za3h.s[w12, 2:3] +; CHECK-NEXT: ret + %res = call {, } @llvm.aarch64.sme.readz.horiz.x2.nxv4f32(i32 0, i32 %slice) + %slice.max = add i32 %slice, 2 + %res2 = call {, } @llvm.aarch64.sme.readz.horiz.x2.nxv4f32(i32 3, i32 %slice.max) + ret {, } %res2 +} + +define {, } @test_readz_hor_z64_f64_x2(i32 %slice) #0 { +; CHECK-LABEL: test_readz_hor_z64_f64_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: movaz { z0.d, z1.d }, za0h.d[w12, 0:1] +; CHECK-NEXT: movaz { z2.d, z3.d }, za7h.d[w12, 0:1] +; CHECK-NEXT: ret + %res = call {, } @llvm.aarch64.sme.readz.horiz.x2.nxv2f64(i32 0, i32 %slice) + %res2 = call {, } @llvm.aarch64.sme.readz.horiz.x2.nxv2f64(i32 7, i32 %slice) + ret {, } %res +} + +;; +; X2- Vert +;; + +define {, } @test_readz_ver_z8_i8_x2(i32 %tile, i32 %slice) #0 { +; CHECK-LABEL: test_readz_ver_z8_i8_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w1 +; CHECK-NEXT: movaz { z0.b, z1.b }, za0v.b[w12, 0:1] +; CHECK-NEXT: movaz { z0.b, z1.b }, za0v.b[w12, 14:15] +; CHECK-NEXT: ret + %res = call {, } @llvm.aarch64.sme.readz.vert.x2.nxv16i8(i32 0, i32 %slice) + %slice.max = add i32 %slice, 14 + %res2 = call {, } @llvm.aarch64.sme.readz.vert.x2.nxv16i8(i32 0, i32 %slice.max) + ret {, } %res2 +} +define {, } @test_readz_ver_z16_i16_x2(i32 %slice) #0 { +; CHECK-LABEL: test_readz_ver_z16_i16_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: movaz { z0.h, z1.h }, za0v.h[w12, 0:1] +; CHECK-NEXT: movaz { z0.h, z1.h }, za1v.h[w12, 6:7] +; CHECK-NEXT: ret + %res = call {, } @llvm.aarch64.sme.readz.vert.x2.nxv8i16(i32 0, i32 %slice) + %slice.max = add i32 %slice, 6 + %res2 = call {, } @llvm.aarch64.sme.readz.vert.x2.nxv8i16(i32 1, i32 %slice.max) + ret {, } %res2 +} + +define {, } @test_readz_ver_z32_i32_x2(i32 %slice) #0 { +; CHECK-LABEL: test_readz_ver_z32_i32_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: movaz { z0.s, z1.s }, za0v.s[w12, 0:1] +; CHECK-NEXT: movaz { z0.s, z1.s }, za3v.s[w12, 2:3] +; CHECK-NEXT: ret + %res = call {, } @llvm.aarch64.sme.readz.vert.x2.nxv4i32(i32 0, i32 %slice) + %slice.max = add i32 %slice, 2 + %res2 = call {, } @llvm.aarch64.sme.readz.vert.x2.nxv4i32(i32 3, i32 %slice.max) + ret {, } %res2 +} + +define {, } @test_readz_ver_z64_i64_x2(i32 %slice) #0 { +; CHECK-LABEL: test_readz_ver_z64_i64_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: movaz { z0.d, z1.d }, za0v.d[w12, 0:1] +; CHECK-NEXT: movaz { z2.d, z3.d }, za7v.d[w12, 0:1] +; CHECK-NEXT: ret + %res = call {, } @llvm.aarch64.sme.readz.vert.x2.nxv2i64(i32 0, i32 %slice) + %res2 = call {, } @llvm.aarch64.sme.readz.vert.x2.nxv2i64(i32 7, i32 %slice) + ret {, } %res +} + +define {, } @test_readz_ver_z16_bf16_x2(i32 %slice) #0 { +; CHECK-LABEL: test_readz_ver_z16_bf16_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: movaz { z0.h, z1.h }, za0v.h[w12, 0:1] +; CHECK-NEXT: movaz { z0.h, z1.h }, za1v.h[w12, 6:7] +; CHECK-NEXT: ret + %res = call {, } @llvm.aarch64.sme.readz.vert.x2.nxv8bf16(i32 0, i32 %slice) + %slice.max = add i32 %slice, 6 + %res2 = call {, } @llvm.aarch64.sme.readz.vert.x2.nxv8bf16(i32 1, i32 %slice.max) + ret {, } %res2 +} + +define {, } @test_readz_ver_z16_f16_x2(i32 %slice) #0 { +; CHECK-LABEL: test_readz_ver_z16_f16_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: movaz { z0.h, z1.h }, za0v.h[w12, 0:1] +; CHECK-NEXT: movaz { z0.h, z1.h }, za1v.h[w12, 6:7] +; CHECK-NEXT: ret + %res = call {, } @llvm.aarch64.sme.readz.vert.x2.nxv8f16(i32 0, i32 %slice) + %slice.max = add i32 %slice, 6 + %res2 = call {, } @llvm.aarch64.sme.readz.vert.x2.nxv8f16(i32 1, i32 %slice.max) + ret {, } %res2 +} + +define {, } @test_readz_ver_z32_f32_x2(i32 %slice) #0 { +; CHECK-LABEL: test_readz_ver_z32_f32_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: movaz { z0.s, z1.s }, za0v.s[w12, 0:1] +; CHECK-NEXT: movaz { z0.s, z1.s }, za3v.s[w12, 2:3] +; CHECK-NEXT: ret + %res = call {, } @llvm.aarch64.sme.readz.vert.x2.nxv4f32(i32 0, i32 %slice) + %slice.max = add i32 %slice, 2 + %res2 = call {, } @llvm.aarch64.sme.readz.vert.x2.nxv4f32(i32 3, i32 %slice.max) + ret {, } %res2 +} + +define {, } @test_readz_ver_z64_f64_x2(i32 %slice) #0 { +; CHECK-LABEL: test_readz_ver_z64_f64_x2: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: movaz { z0.d, z1.d }, za0v.d[w12, 0:1] +; CHECK-NEXT: movaz { z2.d, z3.d }, za7v.d[w12, 0:1] +; CHECK-NEXT: ret + %res = call {, } @llvm.aarch64.sme.readz.vert.x2.nxv2f64(i32 0, i32 %slice) + %res2 = call {, } @llvm.aarch64.sme.readz.vert.x2.nxv2f64(i32 7, i32 %slice) + ret {, } %res +} + +;; +; X4 - Horiz +;; + +define {, ,, } @test_readz_hor_z8_i8_x4(i32 %tile, i32 %slice) #0 { +; CHECK-LABEL: test_readz_hor_z8_i8_x4: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w1 +; CHECK-NEXT: movaz { z0.b - z3.b }, za0h.b[w12, 0:3] +; CHECK-NEXT: movaz { z0.b - z3.b }, za0h.b[w12, 12:15] +; CHECK-NEXT: ret + %res = call {, ,, } @llvm.aarch64.sme.readz.horiz.x4.nxv16i8(i32 0, i32 %slice) + %slice.max = add i32 %slice, 12 + %res2 = call {, ,, } @llvm.aarch64.sme.readz.horiz.x4.nxv16i8(i32 0, i32 %slice.max) + ret {, ,, } %res2 +} +define {, ,, } @test_readz_hor_z16_i16_x4(i32 %slice) #0 { +; CHECK-LABEL: test_readz_hor_z16_i16_x4: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: movaz { z0.h - z3.h }, za0h.h[w12, 0:3] +; CHECK-NEXT: movaz { z0.h - z3.h }, za1h.h[w12, 4:7] +; CHECK-NEXT: ret + %res = call {, ,, } @llvm.aarch64.sme.readz.horiz.x4.nxv8i16(i32 0, i32 %slice) + %slice.max = add i32 %slice, 4 + %res2 = call {, ,, } @llvm.aarch64.sme.readz.horiz.x4.nxv8i16(i32 1, i32 %slice.max) + ret {, ,, } %res2 +} + +define {, ,, } @test_readz_hor_z32_i32_x4(i32 %slice) #0 { +; CHECK-LABEL: test_readz_hor_z32_i32_x4: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: movaz { z0.s - z3.s }, za0h.s[w12, 0:3] +; CHECK-NEXT: movaz { z0.s - z3.s }, za3h.s[w12, 0:3] +; CHECK-NEXT: ret + %res = call {, ,, } @llvm.aarch64.sme.readz.horiz.x4.nxv4i32(i32 0, i32 %slice) + %res2 = call {, ,, } @llvm.aarch64.sme.readz.horiz.x4.nxv4i32(i32 3, i32 %slice) + ret {, ,, } %res2 +} + +define {, ,, } @test_readz_hor_z64_i64_x4(i32 %slice) #0 { +; CHECK-LABEL: test_readz_hor_z64_i64_x4: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: movaz { z0.d - z3.d }, za0h.d[w12, 0:3] +; CHECK-NEXT: movaz { z4.d - z7.d }, za7h.d[w12, 0:3] +; CHECK-NEXT: ret + %res = call {, ,, } @llvm.aarch64.sme.readz.horiz.x4.nxv2i64(i32 0, i32 %slice) + %res2 = call {, ,, } @llvm.aarch64.sme.readz.horiz.x4.nxv2i64(i32 7, i32 %slice) + ret {, ,, } %res +} + +define {, , , } @test_readz_hor_z16_bf16_x4(i32 %slice) #0 { +; CHECK-LABEL: test_readz_hor_z16_bf16_x4: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: movaz { z0.h - z3.h }, za0h.h[w12, 0:3] +; CHECK-NEXT: movaz { z0.h - z3.h }, za1h.h[w12, 4:7] +; CHECK-NEXT: ret + %res = call {, , , } @llvm.aarch64.sme.readz.horiz.x4.nxv8bf16(i32 0, i32 %slice) + %slice.max = add i32 %slice, 4 + %res2 = call {, , , } @llvm.aarch64.sme.readz.horiz.x4.nxv8bf16(i32 1, i32 %slice.max) + ret {, , , } %res2 +} + +define {, , , } @test_readz_hor_z16_f16_x4(i32 %slice) #0 { +; CHECK-LABEL: test_readz_hor_z16_f16_x4: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: movaz { z0.h - z3.h }, za0h.h[w12, 0:3] +; CHECK-NEXT: movaz { z0.h - z3.h }, za1h.h[w12, 4:7] +; CHECK-NEXT: ret + %res = call {, , , } @llvm.aarch64.sme.readz.horiz.x4.nxv8f16(i32 0, i32 %slice) + %slice.max = add i32 %slice, 4 + %res2 = call {, , , } @llvm.aarch64.sme.readz.horiz.x4.nxv8f16(i32 1, i32 %slice.max) + ret {, , , } %res2 +} + +define {, ,, } @test_readz_hor_z32_f32_x4(i32 %slice) #0 { +; CHECK-LABEL: test_readz_hor_z32_f32_x4: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: movaz { z0.s - z3.s }, za0h.s[w12, 0:3] +; CHECK-NEXT: movaz { z0.s - z3.s }, za3h.s[w12, 0:3] +; CHECK-NEXT: ret + %res = call {, ,, } @llvm.aarch64.sme.readz.horiz.x4.nxv4f32(i32 0, i32 %slice) + %res2 = call {, ,, } @llvm.aarch64.sme.readz.horiz.x4.nxv4f32(i32 3, i32 %slice) + ret {, ,, } %res2 +} + +define {, ,, } @test_readz_hor_z64_f64_x4(i32 %slice) #0 { +; CHECK-LABEL: test_readz_hor_z64_f64_x4: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: movaz { z0.d - z3.d }, za0h.d[w12, 0:3] +; CHECK-NEXT: movaz { z4.d - z7.d }, za7h.d[w12, 0:3] +; CHECK-NEXT: ret + %res = call {, ,, } @llvm.aarch64.sme.readz.horiz.x4.nxv2f64(i32 0, i32 %slice) + %res2 = call {, ,, } @llvm.aarch64.sme.readz.horiz.x4.nxv2f64(i32 7, i32 %slice) + ret {, ,, } %res +} + +;; +; X4 - Vert +;; + +define {, ,, } @test_readz_ver_z8_i8_x4(i32 %tile, i32 %slice) #0 { +; CHECK-LABEL: test_readz_ver_z8_i8_x4: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w1 +; CHECK-NEXT: movaz { z0.b - z3.b }, za0v.b[w12, 0:3] +; CHECK-NEXT: movaz { z0.b - z3.b }, za0v.b[w12, 12:15] +; CHECK-NEXT: ret + %res = call {, ,, } @llvm.aarch64.sme.readz.vert.x4.nxv16i8(i32 0, i32 %slice) + %slice.max = add i32 %slice, 12 + %res2 = call {, ,, } @llvm.aarch64.sme.readz.vert.x4.nxv16i8(i32 0, i32 %slice.max) + ret {, ,, } %res2 +} +define {, ,, } @test_readz_ver_z16_i16_x4(i32 %slice) #0 { +; CHECK-LABEL: test_readz_ver_z16_i16_x4: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: movaz { z0.h - z3.h }, za0v.h[w12, 0:3] +; CHECK-NEXT: movaz { z0.h - z3.h }, za1v.h[w12, 4:7] +; CHECK-NEXT: ret + %res = call {, ,, } @llvm.aarch64.sme.readz.vert.x4.nxv8i16(i32 0, i32 %slice) + %slice.max = add i32 %slice, 4 + %res2 = call {, ,, } @llvm.aarch64.sme.readz.vert.x4.nxv8i16(i32 1, i32 %slice.max) + ret {, ,, } %res2 +} + +define {, ,, } @test_readz_ver_z32_i32_x4(i32 %slice) #0 { +; CHECK-LABEL: test_readz_ver_z32_i32_x4: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: movaz { z0.s - z3.s }, za0v.s[w12, 0:3] +; CHECK-NEXT: movaz { z0.s - z3.s }, za3v.s[w12, 0:3] +; CHECK-NEXT: ret + %res = call {, ,, } @llvm.aarch64.sme.readz.vert.x4.nxv4i32(i32 0, i32 %slice) + %res2 = call {, ,, } @llvm.aarch64.sme.readz.vert.x4.nxv4i32(i32 3, i32 %slice) + ret {, ,, } %res2 +} + +define {, ,, } @test_readz_ver_z64_i64_x4(i32 %slice) #0 { +; CHECK-LABEL: test_readz_ver_z64_i64_x4: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: movaz { z0.d - z3.d }, za0v.d[w12, 0:3] +; CHECK-NEXT: movaz { z4.d - z7.d }, za7v.d[w12, 0:3] +; CHECK-NEXT: ret + %res = call {, ,, } @llvm.aarch64.sme.readz.vert.x4.nxv2i64(i32 0, i32 %slice) + %res2 = call {, ,, } @llvm.aarch64.sme.readz.vert.x4.nxv2i64(i32 7, i32 %slice) + ret {, ,, } %res +} + +define {, , , } @test_readz_ver_z16_bf16_x4(i32 %slice) #0 { +; CHECK-LABEL: test_readz_ver_z16_bf16_x4: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: movaz { z0.h - z3.h }, za0v.h[w12, 0:3] +; CHECK-NEXT: movaz { z0.h - z3.h }, za1v.h[w12, 4:7] +; CHECK-NEXT: ret + %res = call {, , , } @llvm.aarch64.sme.readz.vert.x4.nxv8bf16(i32 0, i32 %slice) + %slice.max = add i32 %slice, 4 + %res2 = call {, , , } @llvm.aarch64.sme.readz.vert.x4.nxv8bf16(i32 1, i32 %slice.max) + ret {, , , } %res2 +} + +define {, , , } @test_readz_ver_z16_f16_x4(i32 %slice) #0 { +; CHECK-LABEL: test_readz_ver_z16_f16_x4: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: movaz { z0.h - z3.h }, za0v.h[w12, 0:3] +; CHECK-NEXT: movaz { z0.h - z3.h }, za1v.h[w12, 4:7] +; CHECK-NEXT: ret + %res = call {, , , } @llvm.aarch64.sme.readz.vert.x4.nxv8f16(i32 0, i32 %slice) + %slice.max = add i32 %slice, 4 + %res2 = call {, , , } @llvm.aarch64.sme.readz.vert.x4.nxv8f16(i32 1, i32 %slice.max) + ret {, , , } %res2 +} + +define {, ,, } @test_readz_ver_z32_f32_x4(i32 %slice) #0 { +; CHECK-LABEL: test_readz_ver_z32_f32_x4: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: movaz { z0.s - z3.s }, za0v.s[w12, 0:3] +; CHECK-NEXT: movaz { z0.s - z3.s }, za3v.s[w12, 0:3] +; CHECK-NEXT: ret + %res = call {, ,, } @llvm.aarch64.sme.readz.vert.x4.nxv4f32(i32 0, i32 %slice) + %res2 = call {, ,, } @llvm.aarch64.sme.readz.vert.x4.nxv4f32(i32 3, i32 %slice) + ret {, ,, } %res2 +} + +define {, ,, } @test_readz_ver_z64_f64_x4(i32 %slice) #0 { +; CHECK-LABEL: test_readz_ver_z64_f64_x4: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: movaz { z0.d - z3.d }, za0v.d[w12, 0:3] +; CHECK-NEXT: movaz { z4.d - z7.d }, za7v.d[w12, 0:3] +; CHECK-NEXT: ret + %res = call {, ,, } @llvm.aarch64.sme.readz.vert.x4.nxv2f64(i32 0, i32 %slice) + %res2 = call {, ,, } @llvm.aarch64.sme.readz.vert.x4.nxv2f64(i32 7, i32 %slice) + ret {, ,, } %res +} + +attributes #0 = { "target-features"="+sve" } + +declare {, } @llvm.aarch64.sme.readz.horiz.za8.x2.nxv16i8(i32, i32) +declare {, } @llvm.aarch64.sme.readz.horiz.x2.nxv8i16(i32, i32) +declare {, } @llvm.aarch64.sme.readz.horiz.x2.nxv4i32(i32, i32) +declare {, } @llvm.aarch64.sme.readz.horiz.x2.nxv2i64(i32, i32) +declare {, } @llvm.aarch64.sme.readz.horiz.x2.nxv8bf16(i32, i32) +declare {, } @llvm.aarch64.sme.readz.horiz.x2.nxv8f16(i32, i32) +declare {, } @llvm.aarch64.sme.readz.horiz.x2.nxv4f32(i32, i32) +declare {, } @llvm.aarch64.sme.readz.horiz.x2.nxv2f64(i32, i32) + +declare {, } @llvm.aarch64.sme.readz.vert.za8.x2.nxv16i8(i32, i32) +declare {, } @llvm.aarch64.sme.readz.vert.x2.nxv8i16(i32, i32) +declare {, } @llvm.aarch64.sme.readz.vert.x2.nxv4i32(i32, i32) +declare {, } @llvm.aarch64.sme.readz.vert.x2.nxv2i64(i32, i32) +declare {, } @llvm.aarch64.sme.readz.vert.x2.nxv8bf16(i32, i32) +declare {, } @llvm.aarch64.sme.readz.vert.x2.nxv8f16(i32, i32) +declare {, } @llvm.aarch64.sme.readz.vert.x2.nxv4f32(i32, i32) +declare {, } @llvm.aarch64.sme.readz.vert.x2.nxv2f64(i32, i32) + +declare {, ,, } @llvm.aarch64.sme.readz.horiz.za8.x4.nxv16i8(i32, i32) +declare {, ,, } @llvm.aarch64.sme.readz.horiz.x4.nxv8i16(i32, i32) +declare {, ,, } @llvm.aarch64.sme.readz.horiz.x4.nxv4i32(i32, i32) +declare {, ,, } @llvm.aarch64.sme.readz.horiz.x4.nxv2i64(i32, i32) +declare {, , , } @llvm.aarch64.sme.readz.horiz.x4.nxv8bf16(i32, i32) +declare {, , , } @llvm.aarch64.sme.readz.horiz.x4.nxv8f16(i32, i32) +declare {, ,, } @llvm.aarch64.sme.readz.horiz.x4.nxv4f32(i32, i32) +declare {, ,, } @llvm.aarch64.sme.readz.horiz.x4.nxv2f64(i32, i32) + +declare {, ,, } @llvm.aarch64.sme.readz.vert.za8.x4.nxv16i8(i32, i32) +declare {, ,, } @llvm.aarch64.sme.readz.vert.x4.nxv8i16(i32, i32) +declare {, ,, } @llvm.aarch64.sme.readz.vert.x4.nxv4i32(i32, i32) +declare {, ,, } @llvm.aarch64.sme.readz.vert.x4.nxv2i64(i32, i32) +declare {, , , } @llvm.aarch64.sme.readz.vert.x4.nxv8bf16(i32, i32) +declare {, , , } @llvm.aarch64.sme.readz.vert.x4.nxv8f16(i32, i32) +declare {, ,, } @llvm.aarch64.sme.readz.vert.x4.nxv4f32(i32, i32) +declare {, ,, } @llvm.aarch64.sme.readz.vert.x4.nxv2f64(i32, i32)