diff --git a/llvm/lib/Target/AIE/AIE2.h b/llvm/lib/Target/AIE/AIE2.h index 144c1d86ceab..2dfc00b04157 100644 --- a/llvm/lib/Target/AIE/AIE2.h +++ b/llvm/lib/Target/AIE/AIE2.h @@ -34,7 +34,6 @@ class MachineInstr; class MachineOperand; class PassRegistry; -FunctionPass *createAIE2ISelDag(TargetMachine &TM); FunctionPass *createAIE2PreLegalizerCombiner(); FunctionPass *createAIE2PostLegalizerCustomCombiner(); FunctionPass *createAIE2PostLegalizerGenericCombiner(); diff --git a/llvm/lib/Target/AIE/AIE2ISelDAGToDAG.cpp b/llvm/lib/Target/AIE/AIE2ISelDAGToDAG.cpp deleted file mode 100644 index fd6a6e1ea5bb..000000000000 --- a/llvm/lib/Target/AIE/AIE2ISelDAGToDAG.cpp +++ /dev/null @@ -1,74 +0,0 @@ -//===--AIE2ISelDAGToDAG.cpp -A dag to dag inst selector for AIEngine V2 ---===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its affiliates -// -//===----------------------------------------------------------------------===// -// -// This file defines an instruction selector for the AIEngine V2 target. -// -//===----------------------------------------------------------------------===// -#include "AIE2Subtarget.h" -#include "AIEISelDAGToDAG.h" -#include "MCTargetDesc/AIE2MCTargetDesc.h" -using namespace llvm; -// AIEngine V2-specific code to select AIEngine V2 machine instructions for -// SelectionDAG operations. -class AIE2DAGToDAGISel : public AIEDAGToDAGISel { -public: - explicit AIE2DAGToDAGISel(TargetMachine &TM) : AIEDAGToDAGISel(TM) {} - - StringRef getPassName() const override { - return "AIE2 DAG->DAG Pattern Instruction Selection"; - } - - void Select(SDNode *Node) override; - - // Complex Pattern Selectors. Each one corresponds to a - // ComplexPattern<> in AIEInstrInfo.td - bool SelectFrameIndex(SDValue &N, SDValue &R); - -// Include the pieces autogenerated from the target description. -#include "AIE2GenDAGISel.inc" -}; - -void AIE2DAGToDAGISel::Select(SDNode *Node) { - // If we have a custom node, we have already selected. - if (Node->isMachineOpcode()) { - LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << "\n"); - Node->setNodeId(-1); - return; - } - // Instruction Selection not handled by the auto-generated tablegen selection - // should be handled here. - unsigned Opcode = Node->getOpcode(); - SDLoc DL(Node); - // EVT VT = Node->getValueType(0); -// TODO Add code here - switch (Opcode) { - case ISD::Constant: { - break; - } - } - // Select the default instruction. - SelectCode(Node); -} - -// Match a frame index that can be used in an addressing mode. -bool AIE2DAGToDAGISel::SelectFrameIndex(SDValue &N, SDValue &R) { - if (N.getOpcode() != ISD::FrameIndex) - return false; - int FI = cast(N)->getIndex(); - LLVM_DEBUG(dbgs() << "SelectFrameIndex: " << FI << "\n"); - R = CurDAG->getTargetFrameIndex(FI, MVT::i32); - return true; -} - -// This pass converts a legalized DAG into a AIE-specific DAG, ready -// for instruction scheduling. -FunctionPass *llvm::createAIE2ISelDag(TargetMachine &TM) { - return new AIE2DAGToDAGISel(TM); -} diff --git a/llvm/lib/Target/AIE/AIE2InstrPatterns.td b/llvm/lib/Target/AIE/AIE2InstrPatterns.td index d6c7db3766b2..bff0201bc062 100644 --- a/llvm/lib/Target/AIE/AIE2InstrPatterns.td +++ b/llvm/lib/Target/AIE/AIE2InstrPatterns.td @@ -1068,3 +1068,8 @@ def : PatInaccessibleMem<(int_aie2_clr16f_conf), // DIVS def : Pat<(int_aie2_divs eR31:$sd_in, eR:$src0, eR:$src1), (DIVS eR31:$sd_in, eR:$src0, eR:$src1)>; + +// G_AIE_[SZ]EXT_EXTRACT_VECTOR_ELT +defm : Extract_512; +defm : Extract_512; +defm : Extract_512; diff --git a/llvm/lib/Target/AIE/AIE2InstructionSelector.cpp b/llvm/lib/Target/AIE/AIE2InstructionSelector.cpp index 75606807153b..a00e41b6c4f3 100644 --- a/llvm/lib/Target/AIE/AIE2InstructionSelector.cpp +++ b/llvm/lib/Target/AIE/AIE2InstructionSelector.cpp @@ -103,8 +103,6 @@ class AIE2InstructionSelector : public AIEBaseInstructionSelector { bool selectG_AIE_STORE_CONV(MachineInstr &StoreI, MachineRegisterInfo &MRI); bool selectG_AIE_STORE_PACK(MachineInstr &StoreI, MachineRegisterInfo &MRI); bool selectStartLoop(MachineInstr &I, MachineRegisterInfo &MRI); - bool selectG_AIE_EXTRACT_VECTOR_ELT(MachineInstr &I, - MachineRegisterInfo &MRI); bool selectG_AIE_INSERT_VECTOR_ELT(MachineInstr &I, MachineRegisterInfo &MRI); bool selectG_AIE_PAD_VECTOR_UNDEF(MachineInstr &I, MachineOperand &DstReg, MachineOperand &SrcReg, @@ -452,9 +450,6 @@ bool AIE2InstructionSelector::select(MachineInstr &I) { case AIE2::G_AIE_POSTINC_3D_SEXTLOAD: case AIE2::G_AIE_POSTINC_3D_ZEXTLOAD: return selectG_AIE_LOAD_STORE(I, MRI); - case AIE2::G_AIE_ZEXT_EXTRACT_VECTOR_ELT: - case AIE2::G_AIE_SEXT_EXTRACT_VECTOR_ELT: - return selectG_AIE_EXTRACT_VECTOR_ELT(I, MRI); case AIE2::G_AIE_INSERT_VECTOR_ELT: return selectG_AIE_INSERT_VECTOR_ELT(I, MRI); case AIE2::G_AIE_PAD_VECTOR_UNDEF: @@ -3755,28 +3750,6 @@ createOpcodeCondRegPair(unsigned EltSize, Register LtReg, MachineIRBuilder &MIB, return std::make_pair(Opcode, SelReg); } -static unsigned getExtractVecEltOpcode(unsigned EltSize, unsigned InstOpcode) { - unsigned Opcode = 0; - bool IsZextExtVecElt = InstOpcode == AIE2::G_AIE_ZEXT_EXTRACT_VECTOR_ELT; - switch (EltSize) { - case 8: - Opcode = IsZextExtVecElt ? AIE2::VEXTRACT_D8 : AIE2::VEXTRACT_S8; - break; - case 16: - Opcode = IsZextExtVecElt ? AIE2::VEXTRACT_D16 : AIE2::VEXTRACT_S16; - break; - case 32: - Opcode = IsZextExtVecElt ? AIE2::VEXTRACT_D32 : AIE2::VEXTRACT_S32; - break; - // there is no AIE vector with elt size 64, VEXTRACT_D64/VEXTRACT_S64 is - // selected only when the extracted value is another vector of size 64-bit. - default: - llvm_unreachable("Unexpected Extracted Vector Element Size"); - } - assert(Opcode != 0 && "Expected a NonZero Opcode"); - return Opcode; -} - static unsigned getInsertVecEltOpcode(unsigned EltSize, unsigned InstOpcode) { switch (EltSize) { case 8: @@ -3931,24 +3904,6 @@ static SelSrcAndIdx getExtractOrInsertVectorEltInputs( return SelSrcIdx; } -bool AIE2InstructionSelector::selectG_AIE_EXTRACT_VECTOR_ELT( - MachineInstr &I, MachineRegisterInfo &MRI) { - MachineOperand &RegOp0 = I.getOperand(1); - Register DstReg = I.getOperand(0).getReg(); - Register SrcReg0 = RegOp0.getReg(); - LLT SrcVecTy = MRI.getType(SrcReg0); - LLT SrcEltTy = SrcVecTy.getElementType(); - unsigned EltSize = SrcEltTy.getSizeInBits(); - SelSrcAndIdx SelSrcIdx = - getExtractOrInsertVectorEltInputs(I, TRI, MRI, TII, RBI, MIB); - unsigned Opcode = getExtractVecEltOpcode(EltSize, I.getOpcode()); - MachineInstrBuilder MI = MIB.buildInstr(Opcode, {DstReg}, {}) - .addReg(SelSrcIdx.SrcReg) - .addReg(SelSrcIdx.IdxReg); - I.eraseFromParent(); - return constrainSelectedInstRegOperands(*MI, TII, TRI, RBI); -} - bool AIE2InstructionSelector::selectG_AIE_INSERT_VECTOR_ELT( MachineInstr &I, MachineRegisterInfo &MRI) { Register DstVecReg = I.getOperand(0).getReg(); diff --git a/llvm/lib/Target/AIE/AIE2TargetMachine.cpp b/llvm/lib/Target/AIE/AIE2TargetMachine.cpp index 3402e5eec199..c43a7b095d0c 100644 --- a/llvm/lib/Target/AIE/AIE2TargetMachine.cpp +++ b/llvm/lib/Target/AIE/AIE2TargetMachine.cpp @@ -220,15 +220,6 @@ AIE2TargetMachine::getTargetTransformInfo(const Function &F) const { return TargetTransformInfo(AIE2TTIImpl(this, F)); } -bool AIE2PassConfig::addInstSelector() { - if (AIEDumpArtifacts) - addPass(createMachineFunctionDumperPass(/*Suffix=*/"before-isel")); - addPass(createAIE2ISelDag(getAIETargetMachine())); - if (AIEDumpArtifacts) - addPass(createMachineFunctionDumperPass(/*Suffix=*/"after-isel")); - return false; -} - unsigned AIE2TargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const { switch (Kind) { diff --git a/llvm/lib/Target/AIE/AIE2TargetMachine.h b/llvm/lib/Target/AIE/AIE2TargetMachine.h index a0928f19fef0..ce7ec36389ce 100644 --- a/llvm/lib/Target/AIE/AIE2TargetMachine.h +++ b/llvm/lib/Target/AIE/AIE2TargetMachine.h @@ -57,7 +57,6 @@ class AIE2PassConfig : public AIEBasePassConfig { bool addPreISel() override; void addPreEmitPass() override; - bool addInstSelector() override; bool addGlobalInstructionSelect() override; void addPreRegAlloc() override; bool addRegAssignAndRewriteOptimized() override; diff --git a/llvm/lib/Target/AIE/AIEBaseInstrPatterns.td b/llvm/lib/Target/AIE/AIEBaseInstrPatterns.td index 261366bc279b..3528c412725f 100644 --- a/llvm/lib/Target/AIE/AIEBaseInstrPatterns.td +++ b/llvm/lib/Target/AIE/AIEBaseInstrPatterns.td @@ -77,3 +77,21 @@ foreach vec512Ty = [v64i8, v32i16, v16i32] in { def : Pat<(vec512Ty (select (i32 eR:$rs1), VEC512:$rs2, VEC512:$rs3)), (vec512Ty (VSEL_32 VEC512:$rs2, VEC512:$rs3, (ADD_add_r_ri eR:$rs1, (i32 -1))))>; } + +// Make our generic extract vector elt instructions available to TableGen patterns. +def vextract_zext : SDNode<"G_AIE_ZEXT_EXTRACT_VECTOR_ELT", + SDTypeProfile<1, 2, [SDTCisInt<0>,SDTCisVec<1>, SDTCisInt<2>]>>; +def : GINodeEquiv; + +def vextract_sext : SDNode<"G_AIE_SEXT_EXTRACT_VECTOR_ELT", + SDTypeProfile<1, 2, [SDTCisInt<0>,SDTCisVec<1>, SDTCisInt<2>]>>; +def : GINodeEquiv; + +class Extr512Pat : + Pat<(DstTy (Op SrcTy:$src1, Idx)), + (Inst SrcTy:$src1, Idx)>; + +multiclass Extract_512 { + def : Extr512Pat; + def : Extr512Pat; +} diff --git a/llvm/lib/Target/AIE/CMakeLists.txt b/llvm/lib/Target/AIE/CMakeLists.txt index f9f06ea5fa25..54296a8b667b 100644 --- a/llvm/lib/Target/AIE/CMakeLists.txt +++ b/llvm/lib/Target/AIE/CMakeLists.txt @@ -33,7 +33,6 @@ tablegen(LLVM AIE2GenPostLegalizerGIGenericCombiner.inc -gen-global-isel-combine tablegen(LLVM AIE2GenPostLegalizerGICustomCombiner.inc -gen-global-isel-combiner -combiners="AIE2PostLegalizerCustomCombiner") tablegen(LLVM AIE2GenCallingConv.inc -gen-callingconv) -tablegen(LLVM AIE2GenDAGISel.inc -gen-dag-isel) tablegen(LLVM AIE2GenDisassemblerTables.inc -gen-disassembler) tablegen(LLVM AIE2GenFormats.inc -gen-instr-format) tablegen(LLVM AIE2GenInstrInfo.inc -gen-instr-info -base-instrinfo-class AIEBaseInstrInfo) @@ -129,7 +128,6 @@ add_llvm_target(AIECodeGen AIE2FrameLowering.cpp AIE2InstrInfo.cpp AIE2InstructionSelector.cpp - AIE2ISelDAGToDAG.cpp AIE2ISelLowering.cpp AIE2LegalizerInfo.cpp AIE2PostLegalizerCustomCombiner.cpp diff --git a/llvm/lib/Target/AIE/aie2p/AIE2PInstrPatterns.td b/llvm/lib/Target/AIE/aie2p/AIE2PInstrPatterns.td index a3aecfc37657..9a9b514d6b7c 100644 --- a/llvm/lib/Target/AIE/aie2p/AIE2PInstrPatterns.td +++ b/llvm/lib/Target/AIE/aie2p/AIE2PInstrPatterns.td @@ -1096,3 +1096,14 @@ foreach Ty = [v64i8, v32i16, v16i32, v8i64] in { def : Pat<(Ty (vshift_node VEC512:$src1, VEC512:$src2, (i32 eR:$shift))), (VSHIFT VEC512:$src1, VEC512:$src2, eR:$shift)>; } + +// G_AIE_[SZ]EXT_EXTRACT_VECTOR_ELT +defm : Extract_512; +defm : Extract_512; +defm : Extract_512; +defm : Extract_512; + +defm : Extract_512; +defm : Extract_512; +defm : Extract_512; +defm : Extract_512; diff --git a/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp b/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp index 9dc986fa0011..a89add811f1a 100644 --- a/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp +++ b/llvm/lib/Target/AIE/aie2p/AIE2PInstructionSelector.cpp @@ -58,8 +58,6 @@ class AIE2PInstructionSelector : public AIEBaseInstructionSelector { unsigned crUPSModeVal); bool selectG_AIE_ADD_VECTOR_ELT_HI(MachineInstr &I, MachineRegisterInfo &MRI); bool selectVCONVbfp16(MachineInstr &I, MachineRegisterInfo &MRI); - bool selectG_AIE_EXTRACT_VECTOR_ELT(MachineInstr &I, - MachineRegisterInfo &MRI); bool selectG_AIE_INSERT_VECTOR_ELT(MachineInstr &I, MachineRegisterInfo &MRI); bool selectG_AIE_PAD_VECTOR_UNDEF(MachineInstr &I, MachineOperand &DstReg, MachineOperand &SrcReg, @@ -427,9 +425,6 @@ bool AIE2PInstructionSelector::select(MachineInstr &I) { return selectG_UNMERGE_VALUES(MIB, I, MRI); case AIE2P::G_AIE_ADD_VECTOR_ELT_HI: return selectG_AIE_ADD_VECTOR_ELT_HI(I, MRI); - case AIE2P::G_AIE_ZEXT_EXTRACT_VECTOR_ELT: - case AIE2P::G_AIE_SEXT_EXTRACT_VECTOR_ELT: - return selectG_AIE_EXTRACT_VECTOR_ELT(I, MRI); case AIE2P::G_AIE_INSERT_VECTOR_ELT: return selectG_AIE_INSERT_VECTOR_ELT(I, MRI); case AIE2P::G_AIE_BROADCAST_VECTOR: @@ -817,32 +812,6 @@ struct SelSrcAndIdx { }; } // end anonymous namespace -static unsigned getExtractVecEltOpcode(unsigned EltSize, unsigned InstOpcode) { - unsigned Opcode = 0; - bool IsZextExtVecElt = InstOpcode == AIE2P::G_AIE_ZEXT_EXTRACT_VECTOR_ELT; - switch (EltSize) { - case 8: - Opcode = IsZextExtVecElt ? AIE2P::VEXTRACT_8_vec_extract_r_vaddSign0 - : AIE2P::VEXTRACT_8_vec_extract_r_vaddSign1; - break; - case 16: - Opcode = IsZextExtVecElt ? AIE2P::VEXTRACT_16_vec_extract_r_vaddSign0 - : AIE2P::VEXTRACT_16_vec_extract_r_vaddSign1; - break; - case 32: - Opcode = IsZextExtVecElt ? AIE2P::VEXTRACT_32_vec_extract_r_vaddSign0 - : AIE2P::VEXTRACT_32_vec_extract_r_vaddSign1; - break; - case 64: - Opcode = IsZextExtVecElt ? AIE2P::VEXTRACT_64_vec_extract_r_vaddSign0 - : AIE2P::VEXTRACT_64_vec_extract_r_vaddSign1; - break; - default: - llvm_unreachable("Unexpected Extracted Vector Element Size"); - } - assert(Opcode != 0 && "Expected a NonZero Opcode"); - return Opcode; -} static unsigned getInsertVecEltOpcode(unsigned EltSize, unsigned InstOpcode) { switch (EltSize) { @@ -1188,24 +1157,6 @@ static SelSrcAndIdx getExtractOrInsertVectorEltInputs( return SelSrcIdx; } -bool AIE2PInstructionSelector::selectG_AIE_EXTRACT_VECTOR_ELT( - MachineInstr &I, MachineRegisterInfo &MRI) { - MachineOperand &RegOp0 = I.getOperand(1); - Register DstReg = I.getOperand(0).getReg(); - Register SrcReg0 = RegOp0.getReg(); - LLT SrcVecTy = MRI.getType(SrcReg0); - LLT SrcEltTy = SrcVecTy.getElementType(); - unsigned EltSize = SrcEltTy.getSizeInBits(); - SelSrcAndIdx SelSrcIdx = - getExtractOrInsertVectorEltInputs(I, TRI, MRI, TII, RBI, MIB); - unsigned Opcode = getExtractVecEltOpcode(EltSize, I.getOpcode()); - MachineInstrBuilder MI = MIB.buildInstr(Opcode, {DstReg}, {}) - .addReg(SelSrcIdx.SrcReg) - .addReg(SelSrcIdx.IdxReg); - I.eraseFromParent(); - return constrainSelectedInstRegOperands(*MI, TII, TRI, RBI); -} - bool AIE2PInstructionSelector::selectG_AIE_INSERT_VECTOR_ELT( MachineInstr &I, MachineRegisterInfo &MRI) { Register DstVecReg = I.getOperand(0).getReg(); diff --git a/llvm/test/CodeGen/AIE/GlobalISel/inst-select-extract-vector-elem.mir b/llvm/test/CodeGen/AIE/GlobalISel/inst-select-extract-vector-elem.mir index 858920759ed1..9946733dc431 100644 --- a/llvm/test/CodeGen/AIE/GlobalISel/inst-select-extract-vector-elem.mir +++ b/llvm/test/CodeGen/AIE/GlobalISel/inst-select-extract-vector-elem.mir @@ -30,9 +30,8 @@ body: | ; AIE2P: liveins: $x0 ; AIE2P-NEXT: {{ $}} ; AIE2P-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY $x0 - ; AIE2P-NEXT: [[MOV_RLC_imm11_pseudo:%[0-9]+]]:er = MOV_RLC_imm11_pseudo 1 - ; AIE2P-NEXT: [[VEXTRACT_16_vec_extract_r_vaddSign0_:%[0-9]+]]:er = VEXTRACT_16_vec_extract_r_vaddSign0 [[COPY]], [[MOV_RLC_imm11_pseudo]], implicit $vaddsign0 - ; AIE2P-NEXT: $r0 = COPY [[VEXTRACT_16_vec_extract_r_vaddSign0_]] + ; AIE2P-NEXT: [[VEXTRACT_16_vec_extract_imm_vaddSign0_:%[0-9]+]]:er = VEXTRACT_16_vec_extract_imm_vaddSign0 [[COPY]], 1, implicit $vaddsign0 + ; AIE2P-NEXT: $r0 = COPY [[VEXTRACT_16_vec_extract_imm_vaddSign0_]] ; AIE2P-NEXT: PseudoRET implicit $lr, implicit $r0 %1:vregbank(<32 x s16>) = COPY $x0 %2:gprregbank(s32) = G_CONSTANT i32 1 @@ -63,9 +62,8 @@ body: | ; AIE2P: liveins: $x0 ; AIE2P-NEXT: {{ $}} ; AIE2P-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY $x0 - ; AIE2P-NEXT: [[MOV_RLC_imm11_pseudo:%[0-9]+]]:er = MOV_RLC_imm11_pseudo 1 - ; AIE2P-NEXT: [[VEXTRACT_16_vec_extract_r_vaddSign1_:%[0-9]+]]:er = VEXTRACT_16_vec_extract_r_vaddSign1 [[COPY]], [[MOV_RLC_imm11_pseudo]], implicit $vaddsign1 - ; AIE2P-NEXT: $r0 = COPY [[VEXTRACT_16_vec_extract_r_vaddSign1_]] + ; AIE2P-NEXT: [[VEXTRACT_16_vec_extract_imm_vaddSign1_:%[0-9]+]]:er = VEXTRACT_16_vec_extract_imm_vaddSign1 [[COPY]], 1, implicit $vaddsign1 + ; AIE2P-NEXT: $r0 = COPY [[VEXTRACT_16_vec_extract_imm_vaddSign1_]] ; AIE2P-NEXT: PseudoRET implicit $lr, implicit $r0 %1:vregbank(<32 x s16>) = COPY $x0 %2:gprregbank(s32) = G_CONSTANT i32 1 @@ -96,9 +94,8 @@ body: | ; AIE2P: liveins: $x0 ; AIE2P-NEXT: {{ $}} ; AIE2P-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY $x0 - ; AIE2P-NEXT: [[MOV_RLC_imm11_pseudo:%[0-9]+]]:er = MOV_RLC_imm11_pseudo 1 - ; AIE2P-NEXT: [[VEXTRACT_8_vec_extract_r_vaddSign1_:%[0-9]+]]:er = VEXTRACT_8_vec_extract_r_vaddSign1 [[COPY]], [[MOV_RLC_imm11_pseudo]], implicit $vaddsign1 - ; AIE2P-NEXT: $r0 = COPY [[VEXTRACT_8_vec_extract_r_vaddSign1_]] + ; AIE2P-NEXT: [[VEXTRACT_8_vec_extract_imm_vaddSign1_:%[0-9]+]]:er = VEXTRACT_8_vec_extract_imm_vaddSign1 [[COPY]], 1, implicit $vaddsign1 + ; AIE2P-NEXT: $r0 = COPY [[VEXTRACT_8_vec_extract_imm_vaddSign1_]] ; AIE2P-NEXT: PseudoRET implicit $lr, implicit $r0 %1:vregbank(<64 x s8>) = COPY $x0 %2:gprregbank(s32) = G_CONSTANT i32 1 @@ -129,9 +126,8 @@ body: | ; AIE2P: liveins: $x0 ; AIE2P-NEXT: {{ $}} ; AIE2P-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY $x0 - ; AIE2P-NEXT: [[MOV_RLC_imm11_pseudo:%[0-9]+]]:er = MOV_RLC_imm11_pseudo 1 - ; AIE2P-NEXT: [[VEXTRACT_8_vec_extract_r_vaddSign0_:%[0-9]+]]:er = VEXTRACT_8_vec_extract_r_vaddSign0 [[COPY]], [[MOV_RLC_imm11_pseudo]], implicit $vaddsign0 - ; AIE2P-NEXT: $r0 = COPY [[VEXTRACT_8_vec_extract_r_vaddSign0_]] + ; AIE2P-NEXT: [[VEXTRACT_8_vec_extract_imm_vaddSign0_:%[0-9]+]]:er = VEXTRACT_8_vec_extract_imm_vaddSign0 [[COPY]], 1, implicit $vaddsign0 + ; AIE2P-NEXT: $r0 = COPY [[VEXTRACT_8_vec_extract_imm_vaddSign0_]] ; AIE2P-NEXT: PseudoRET implicit $lr, implicit $r0 %1:vregbank(<64 x s8>) = COPY $x0 %2:gprregbank(s32) = G_CONSTANT i32 1 @@ -162,9 +158,8 @@ body: | ; AIE2P: liveins: $x0 ; AIE2P-NEXT: {{ $}} ; AIE2P-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY $x0 - ; AIE2P-NEXT: [[MOV_RLC_imm11_pseudo:%[0-9]+]]:er = MOV_RLC_imm11_pseudo 1 - ; AIE2P-NEXT: [[VEXTRACT_32_vec_extract_r_vaddSign1_:%[0-9]+]]:er = VEXTRACT_32_vec_extract_r_vaddSign1 [[COPY]], [[MOV_RLC_imm11_pseudo]], implicit $vaddsign1 - ; AIE2P-NEXT: $r0 = COPY [[VEXTRACT_32_vec_extract_r_vaddSign1_]] + ; AIE2P-NEXT: [[VEXTRACT_32_vec_extract_imm_vaddSign1_:%[0-9]+]]:er = VEXTRACT_32_vec_extract_imm_vaddSign1 [[COPY]], 1, implicit $vaddsign1 + ; AIE2P-NEXT: $r0 = COPY [[VEXTRACT_32_vec_extract_imm_vaddSign1_]] ; AIE2P-NEXT: PseudoRET implicit $lr, implicit $r0 %1:vregbank(<16 x s32>) = COPY $x0 %2:gprregbank(s32) = G_CONSTANT i32 1 diff --git a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-extract-vector-elem.mir b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-extract-vector-elem.mir index 694837d39ad8..b5326820a03f 100644 --- a/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-extract-vector-elem.mir +++ b/llvm/test/CodeGen/AIE/aie2p/GlobalIsel/inst-select-extract-vector-elem.mir @@ -21,9 +21,8 @@ body: | ; AIE2P: liveins: $x0 ; AIE2P-NEXT: {{ $}} ; AIE2P-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY $x0 - ; AIE2P-NEXT: [[MOV_RLC_imm11_pseudo:%[0-9]+]]:er = MOV_RLC_imm11_pseudo 1 - ; AIE2P-NEXT: [[VEXTRACT_64_vec_extract_r_vaddSign1_:%[0-9]+]]:el = VEXTRACT_64_vec_extract_r_vaddSign1 [[COPY]], [[MOV_RLC_imm11_pseudo]], implicit $vaddsign1 - ; AIE2P-NEXT: PseudoRET implicit $lr, implicit [[VEXTRACT_64_vec_extract_r_vaddSign1_]] + ; AIE2P-NEXT: [[VEXTRACT_64_vec_extract_imm_vaddSign1_:%[0-9]+]]:el = VEXTRACT_64_vec_extract_imm_vaddSign1 [[COPY]], 1, implicit $vaddsign1 + ; AIE2P-NEXT: PseudoRET implicit $lr, implicit [[VEXTRACT_64_vec_extract_imm_vaddSign1_]] %1:vregbank(<8 x s64>) = COPY $x0 %2:gprregbank(s32) = G_CONSTANT i32 1 %0:gprregbank(s64) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %1(<8 x s64>), %2(s32) @@ -63,10 +62,9 @@ body: | ; AIE2P: liveins: $bmll0 ; AIE2P-NEXT: {{ $}} ; AIE2P-NEXT: [[COPY:%[0-9]+]]:acc512 = COPY $bmll0 - ; AIE2P-NEXT: [[MOV_RLC_imm11_pseudo:%[0-9]+]]:er = MOV_RLC_imm11_pseudo 1 ; AIE2P-NEXT: [[COPY1:%[0-9]+]]:mxm = COPY [[COPY]] - ; AIE2P-NEXT: [[VEXTRACT_64_vec_extract_r_vaddSign1_:%[0-9]+]]:el = VEXTRACT_64_vec_extract_r_vaddSign1 [[COPY1]], [[MOV_RLC_imm11_pseudo]], implicit $vaddsign1 - ; AIE2P-NEXT: PseudoRET implicit $lr, implicit [[VEXTRACT_64_vec_extract_r_vaddSign1_]] + ; AIE2P-NEXT: [[VEXTRACT_64_vec_extract_imm_vaddSign1_:%[0-9]+]]:el = VEXTRACT_64_vec_extract_imm_vaddSign1 [[COPY1]], 1, implicit $vaddsign1 + ; AIE2P-NEXT: PseudoRET implicit $lr, implicit [[VEXTRACT_64_vec_extract_imm_vaddSign1_]] %1:accregbank(<8 x s64>) = COPY $bmll0 %2:gprregbank(s32) = G_CONSTANT i32 1 %0:gprregbank(s64) = G_AIE_SEXT_EXTRACT_VECTOR_ELT %1(<8 x s64>), %2(s32) @@ -84,9 +82,8 @@ body: | ; AIE2P: liveins: $x0 ; AIE2P-NEXT: {{ $}} ; AIE2P-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY $x0 - ; AIE2P-NEXT: [[MOV_RLC_imm11_pseudo:%[0-9]+]]:er = MOV_RLC_imm11_pseudo 1 - ; AIE2P-NEXT: [[VEXTRACT_64_vec_extract_r_vaddSign0_:%[0-9]+]]:el = VEXTRACT_64_vec_extract_r_vaddSign0 [[COPY]], [[MOV_RLC_imm11_pseudo]], implicit $vaddsign0 - ; AIE2P-NEXT: PseudoRET implicit $lr, implicit [[VEXTRACT_64_vec_extract_r_vaddSign0_]] + ; AIE2P-NEXT: [[VEXTRACT_64_vec_extract_imm_vaddSign0_:%[0-9]+]]:el = VEXTRACT_64_vec_extract_imm_vaddSign0 [[COPY]], 1, implicit $vaddsign0 + ; AIE2P-NEXT: PseudoRET implicit $lr, implicit [[VEXTRACT_64_vec_extract_imm_vaddSign0_]] %1:vregbank(<8 x s64>) = COPY $x0 %2:gprregbank(s32) = G_CONSTANT i32 1 %0:gprregbank(s64) = G_AIE_ZEXT_EXTRACT_VECTOR_ELT %1(<8 x s64>), %2(s32) diff --git a/llvm/test/CodeGen/AIE/aie2p/extractelement.ll b/llvm/test/CodeGen/AIE/aie2p/extractelement.ll index e3085508f508..5a4f32e168f2 100644 --- a/llvm/test/CodeGen/AIE/aie2p/extractelement.ll +++ b/llvm/test/CodeGen/AIE/aie2p/extractelement.ll @@ -13,10 +13,10 @@ define i64 @extract_v4i64(<4 x i64> inreg %v) nounwind { ; AIE2P: .p2align 4 ; AIE2P-NEXT: // %bb.0: ; AIE2P-NEXT: nopa ; nopb ; nops ; ret lr; nopm ; nopv -; AIE2P-NEXT: nop // Delay Slot 5 -; AIE2P-NEXT: mova r0, #3 // Delay Slot 4 +; AIE2P-NEXT: nopx // Delay Slot 5 +; AIE2P-NEXT: nop // Delay Slot 4 ; AIE2P-NEXT: vmov x0, bmll0 // Delay Slot 3 -; AIE2P-NEXT: vextract.64 r1:r0, x0, r0, vaddsign1 // Delay Slot 2 +; AIE2P-NEXT: vextract.64 r1:r0, x0, #3, vaddsign1 // Delay Slot 2 ; AIE2P-NEXT: nop // Delay Slot 1 %1 = extractelement <4 x i64> %v, i32 3 ret i64 %1 @@ -42,10 +42,10 @@ define i64 @extract_v8i64(<8 x i64> inreg %v) nounwind { ; AIE2P: .p2align 4 ; AIE2P-NEXT: // %bb.0: ; AIE2P-NEXT: nopa ; nopb ; nops ; ret lr; nopm ; nopv -; AIE2P-NEXT: nop // Delay Slot 5 -; AIE2P-NEXT: mova r0, #7 // Delay Slot 4 +; AIE2P-NEXT: nopx // Delay Slot 5 +; AIE2P-NEXT: nop // Delay Slot 4 ; AIE2P-NEXT: vmov x0, bmll0 // Delay Slot 3 -; AIE2P-NEXT: vextract.64 r1:r0, x0, r0, vaddsign1 // Delay Slot 2 +; AIE2P-NEXT: vextract.64 r1:r0, x0, #7, vaddsign1 // Delay Slot 2 ; AIE2P-NEXT: nop // Delay Slot 1 %1 = extractelement <8 x i64> %v, i32 7 ret i64 %1 @@ -71,10 +71,10 @@ define i64 @extract_v16i64(<16 x i64> inreg %v) nounwind { ; AIE2P: .p2align 4 ; AIE2P-NEXT: // %bb.0: ; AIE2P-NEXT: nopa ; nopb ; nops ; ret lr; nopm ; nopv -; AIE2P-NEXT: nop // Delay Slot 5 -; AIE2P-NEXT: mova r0, #7 // Delay Slot 4 +; AIE2P-NEXT: nopx // Delay Slot 5 +; AIE2P-NEXT: nop // Delay Slot 4 ; AIE2P-NEXT: vmov x0, bmll0 // Delay Slot 3 -; AIE2P-NEXT: vextract.64 r1:r0, x0, r0, vaddsign1 // Delay Slot 2 +; AIE2P-NEXT: vextract.64 r1:r0, x0, #7, vaddsign1 // Delay Slot 2 ; AIE2P-NEXT: nop // Delay Slot 1 %1 = extractelement <16 x i64> %v, i32 7 ret i64 %1 @@ -106,10 +106,10 @@ define i32 @extract_v64i32(<64 x i32> inreg %v) nounwind { ; AIE2P: .p2align 4 ; AIE2P-NEXT: // %bb.0: ; AIE2P-NEXT: nopa ; nopb ; nops ; ret lr; nopm ; nopv -; AIE2P-NEXT: nop // Delay Slot 5 -; AIE2P-NEXT: mova r0, #7 // Delay Slot 4 +; AIE2P-NEXT: nopx // Delay Slot 5 +; AIE2P-NEXT: nop // Delay Slot 4 ; AIE2P-NEXT: vmov x0, bmll0 // Delay Slot 3 -; AIE2P-NEXT: vextract.32 r0, x0, r0, vaddsign1 // Delay Slot 2 +; AIE2P-NEXT: vextract.32 r0, x0, #7, vaddsign1 // Delay Slot 2 ; AIE2P-NEXT: nop // Delay Slot 1 %1 = extractelement <64 x i32> %v, i32 7 ret i32 %1 @@ -152,10 +152,10 @@ define i64 @extract_v32i64(<32 x i64> inreg %v) nounwind { ; AIE2P: .p2align 4 ; AIE2P-NEXT: // %bb.0: ; AIE2P-NEXT: nopa ; nopb ; nops ; ret lr; nopm ; nopv -; AIE2P-NEXT: nop // Delay Slot 5 -; AIE2P-NEXT: mova r0, #7 // Delay Slot 4 +; AIE2P-NEXT: nopx // Delay Slot 5 +; AIE2P-NEXT: nop // Delay Slot 4 ; AIE2P-NEXT: vmov x0, bmll0 // Delay Slot 3 -; AIE2P-NEXT: vextract.64 r1:r0, x0, r0, vaddsign1 // Delay Slot 2 +; AIE2P-NEXT: vextract.64 r1:r0, x0, #7, vaddsign1 // Delay Slot 2 ; AIE2P-NEXT: nop // Delay Slot 1 %1 = extractelement <32 x i64> %v, i32 7 ret i64 %1 diff --git a/llvm/test/CodeGen/AIE/aie2p/load-store-unaligned.ll b/llvm/test/CodeGen/AIE/aie2p/load-store-unaligned.ll index 7e9a3fa85496..63f0c74d7687 100644 --- a/llvm/test/CodeGen/AIE/aie2p/load-store-unaligned.ll +++ b/llvm/test/CodeGen/AIE/aie2p/load-store-unaligned.ll @@ -16,423 +16,406 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-LABEL: test_load_store_unaligned: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: mova m0, #-600; nopb ; nopx -; CHECK-NEXT: paddxm [sp], #640 -; CHECK-NEXT: st p7, [sp, #-640] // 4-byte Folded Spill +; CHECK-NEXT: mova m0, #-560 +; CHECK-NEXT: paddxm [sp], #576 +; CHECK-NEXT: st p7, [sp, #-576] // 4-byte Folded Spill ; CHECK-NEXT: mov p7, sp -; CHECK-NEXT: st r8, [sp, #-604] // 4-byte Folded Spill -; CHECK-NEXT: st r9, [sp, #-608] // 4-byte Folded Spill -; CHECK-NEXT: st r10, [sp, #-612] // 4-byte Folded Spill -; CHECK-NEXT: st r11, [sp, #-616] // 4-byte Folded Spill -; CHECK-NEXT: st r12, [sp, #-620] // 4-byte Folded Spill -; CHECK-NEXT: st r13, [sp, #-624] // 4-byte Folded Spill -; CHECK-NEXT: st r14, [sp, #-628] // 4-byte Folded Spill -; CHECK-NEXT: st r15, [sp, #-632] // 4-byte Folded Spill -; CHECK-NEXT: st p6, [sp, #-636] // 4-byte Folded Spill ; CHECK-NEXT: padda [p7], m0 -; CHECK-NEXT: st.s16 r16, [p7, #0] +; CHECK-NEXT: st.s16 r0, [p7, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: mova r0, #0 -; CHECK-NEXT: vextract.32 r16, x0, r0, vaddsign1 ; CHECK-NEXT: nop +; CHECK-NEXT: vextract.32 r0, x0, #0, vaddsign1 ; CHECK-NEXT: nop -; CHECK-NEXT: st.s16 r17, [p7, #2] ; CHECK-NEXT: nop +; CHECK-NEXT: st.s16 r1, [p7, #2] ; CHECK-NEXT: nop -; CHECK-NEXT: mova r1, #1 -; CHECK-NEXT: vextract.32 r17, x0, r1, vaddsign1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st.s16 r18, [p7, #4] +; CHECK-NEXT: vextract.32 r1, x0, #1, vaddsign1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: mova r2, #2 -; CHECK-NEXT: vextract.32 r18, x0, r2, vaddsign1 +; CHECK-NEXT: st.s16 r2, [p7, #4] ; CHECK-NEXT: nop -; CHECK-NEXT: mova r4, #4 -; CHECK-NEXT: st.s16 r19, [p7, #6] -; CHECK-NEXT: mova r3, #3 +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: vextract.32 r2, x0, #2, vaddsign1 +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: st.s16 r3, [p7, #6] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: vextract.32 r3, x0, #3, vaddsign1 +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: st.s16 r4, [p7, #8] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: vextract.32 r4, x0, #4, vaddsign1 +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: st.s16 r5, [p7, #10] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: vmov q0, wl2 +; CHECK-NEXT: vextract.32 r5, x0, #5, vaddsign1 +; CHECK-NEXT: vextract.32 r6, x0, #6, vaddsign1 +; CHECK-NEXT: vextract.32 r7, x0, #7, vaddsign1 +; CHECK-NEXT: st.s16 r6, [p7, #12] ; CHECK-NEXT: mov p0, sp -; CHECK-NEXT: mov p2, sp -; CHECK-NEXT: vextract.32 r19, x0, r3, vaddsign1 -; CHECK-NEXT: mova r5, #5 -; CHECK-NEXT: mov p1, sp -; CHECK-NEXT: st.s16 r20, [p7, #8] -; CHECK-NEXT: vextract.32 r20, x0, r4, vaddsign1 -; CHECK-NEXT: vextract.32 r21, x0, r5, vaddsign1 -; CHECK-NEXT: mova m0, #-584 -; CHECK-NEXT: padda [p1], #-512 +; CHECK-NEXT: vmov wl0, q0 +; CHECK-NEXT: mova m0, #-544 ; CHECK-NEXT: padda [p0], m0 -; CHECK-NEXT: mova m0, #-568 -; CHECK-NEXT: st p1, [sp, #-24] // 4-byte Folded Spill -; CHECK-NEXT: mov p1, sp -; CHECK-NEXT: st p7, [sp, #-28] // 4-byte Folded Spill -; CHECK-NEXT: mov r24, p0 -; CHECK-NEXT: st.s16 r21, [p7, #10] +; CHECK-NEXT: mova m0, #-528 +; CHECK-NEXT: mov r27, p7 +; CHECK-NEXT: st.s16 r7, [p7, #14] +; CHECK-NEXT: mov r16, p0 ; CHECK-NEXT: mov p0, sp ; CHECK-NEXT: padda [p0], m0 -; CHECK-NEXT: mova m0, #-544 -; CHECK-NEXT: padda [p2], m0 -; CHECK-NEXT: mova m0, #-480 -; CHECK-NEXT: padda [p1], m0 -; CHECK-NEXT: st p1, [sp, #-16] // 4-byte Folded Spill -; CHECK-NEXT: st.s16 r22, [p7, #12] +; CHECK-NEXT: vextract.32 r0, x0, #0, vaddsign1 +; CHECK-NEXT: vextract.32 r1, x0, #1, vaddsign1 +; CHECK-NEXT: mov p7, r16 +; CHECK-NEXT: st r1, [p7, #4] +; CHECK-NEXT: st r0, [p7, #0] +; CHECK-NEXT: st.s8 r0, [p0, #0] ; CHECK-NEXT: nop -; CHECK-NEXT: mova r6, #6 -; CHECK-NEXT: vextract.32 r22, x0, r6, vaddsign1 -; CHECK-NEXT: vmov q0, wl2 -; CHECK-NEXT: mov p1, sp -; CHECK-NEXT: padda [p1], #-448 -; CHECK-NEXT: st p1, [sp, #-20] // 4-byte Folded Spill -; CHECK-NEXT: st.s16 r23, [p7, #14] -; CHECK-NEXT: mova r7, #7 -; CHECK-NEXT: vextract.32 r23, x0, r7, vaddsign1 -; CHECK-NEXT: vmov wl0, q0 -; CHECK-NEXT: vextract.32 r16, x0, r0, vaddsign1 -; CHECK-NEXT: vextract.32 r17, x0, r1, vaddsign1 -; CHECK-NEXT: mov p7, r24 -; CHECK-NEXT: st r17, [p7, #4] -; CHECK-NEXT: st r16, [p7, #0] -; CHECK-NEXT: st p7, [sp, #-32] // 4-byte Folded Spill -; CHECK-NEXT: st.s8 r24, [p0, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: mov p1, sp -; CHECK-NEXT: vextract.16 r24, x4, r0, vaddsign1 -; CHECK-NEXT: vextract.32 r18, x0, r2, vaddsign1 -; CHECK-NEXT: padda [p1], #-320 -; CHECK-NEXT: st r18, [p7, #8] -; CHECK-NEXT: st p1, [sp, #-36] // 4-byte Folded Spill -; CHECK-NEXT: st.s8 r25, [p0, #1] +; CHECK-NEXT: vextract.16 r0, x4, #0, vaddsign1 +; CHECK-NEXT: vextract.32 r2, x0, #2, vaddsign1 ; CHECK-NEXT: nop +; CHECK-NEXT: st r2, [p7, #8] +; CHECK-NEXT: st.s8 r1, [p0, #1] ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vextract.16 r25, x4, r1, vaddsign1 -; CHECK-NEXT: vextract.32 r19, x0, r3, vaddsign1 ; CHECK-NEXT: nop -; CHECK-NEXT: st r19, [p7, #12] -; CHECK-NEXT: lda p7, [sp, #-24] // 4-byte Folded Reload -; CHECK-NEXT: st.s8 r26, [p0, #2] +; CHECK-NEXT: vextract.16 r1, x4, #1, vaddsign1 +; CHECK-NEXT: vextract.32 r3, x0, #3, vaddsign1 ; CHECK-NEXT: nop -; CHECK-NEXT: vextract.16 r26, x4, r2, vaddsign1 +; CHECK-NEXT: st r3, [p7, #12] +; CHECK-NEXT: st.s8 r2, [p0, #2] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: vextract.16 r2, x4, #2, vaddsign1 ; CHECK-NEXT: nop -; CHECK-NEXT: mov p1, sp -; CHECK-NEXT: mova m0, #-304 -; CHECK-NEXT: padda [p1], m0 -; CHECK-NEXT: st p1, [sp, #-40] // 4-byte Folded Spill -; CHECK-NEXT: st.s8 r27, [p0, #3] ; CHECK-NEXT: nop +; CHECK-NEXT: st.s8 r3, [p0, #3] ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vextract.16 r27, x4, r3, vaddsign1 ; CHECK-NEXT: nop +; CHECK-NEXT: vextract.16 r3, x4, #3, vaddsign1 ; CHECK-NEXT: nop -; CHECK-NEXT: st.s8 r28, [p0, #4] ; CHECK-NEXT: nop +; CHECK-NEXT: st.s8 r4, [p0, #4] ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vextract.16 r28, x4, r4, vaddsign1 ; CHECK-NEXT: nop +; CHECK-NEXT: vextract.16 r4, x4, #4, vaddsign1 ; CHECK-NEXT: nop -; CHECK-NEXT: st.s8 r29, [p0, #5] ; CHECK-NEXT: nop +; CHECK-NEXT: st r8, [sp, #-568] // 4-byte Folded Spill +; CHECK-NEXT: st p6, [sp, #-572] // 4-byte Folded Spill +; CHECK-NEXT: st.s8 r5, [p0, #5] ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vextract.16 r29, x4, r5, vaddsign1 ; CHECK-NEXT: nop +; CHECK-NEXT: vextract.16 r5, x4, #5, vaddsign1 ; CHECK-NEXT: nop -; CHECK-NEXT: st.s8 r30, [p0, #6] ; CHECK-NEXT: nop +; CHECK-NEXT: st.s8 r6, [p0, #6] ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vextract.16 r30, x4, r6, vaddsign1 ; CHECK-NEXT: nop +; CHECK-NEXT: vextract.16 r6, x4, #6, vaddsign1 ; CHECK-NEXT: nop -; CHECK-NEXT: st.s8 r31, [p0, #7] ; CHECK-NEXT: nop +; CHECK-NEXT: st.s8 r7, [p0, #7] ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vextract.16 r31, x4, r7, vaddsign1 +; CHECK-NEXT: nop +; CHECK-NEXT: vextract.16 r7, x4, #7, vaddsign1 ; CHECK-NEXT: nop ; CHECK-NEXT: mova dj0, #8 -; CHECK-NEXT: st.s8 r8, [p0, dj0] +; CHECK-NEXT: st.s8 r16, [p0, dj0] +; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: mova r16, #8 -; CHECK-NEXT: vextract.16 r8, x4, r16, vaddsign1 +; CHECK-NEXT: vextract.16 r16, x4, #8, vaddsign1 ; CHECK-NEXT: nop ; CHECK-NEXT: mova dj0, #9 -; CHECK-NEXT: st.s8 r9, [p0, dj0] +; CHECK-NEXT: st.s8 r17, [p0, dj0] +; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: mova r17, #9 -; CHECK-NEXT: vextract.16 r9, x4, r17, vaddsign1 +; CHECK-NEXT: vextract.16 r17, x4, #9, vaddsign1 ; CHECK-NEXT: nop ; CHECK-NEXT: mova dj2, #10 -; CHECK-NEXT: st.s8 r10, [p0, dj2] +; CHECK-NEXT: st.s8 r18, [p0, dj2] ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: mova r18, #10 -; CHECK-NEXT: vextract.16 r10, x4, r18, vaddsign1 -; CHECK-NEXT: mova dj1, #12 +; CHECK-NEXT: nop +; CHECK-NEXT: vextract.16 r18, x4, #10, vaddsign1 +; CHECK-NEXT: nop ; CHECK-NEXT: mova dj0, #11 -; CHECK-NEXT: st.s8 r11, [p0, dj0] -; CHECK-NEXT: mova dj3, #14 -; CHECK-NEXT: mova dj4, #32 -; CHECK-NEXT: mova r19, #11 -; CHECK-NEXT: vextract.16 r11, x4, r19, vaddsign1 -; CHECK-NEXT: mova dj5, #36 -; CHECK-NEXT: mov p5, sp -; CHECK-NEXT: st.s8 r12, [p0, dj1] -; CHECK-NEXT: mov p3, sp -; CHECK-NEXT: mov p4, sp -; CHECK-NEXT: mova r20, #12 -; CHECK-NEXT: vextract.16 r12, x4, r20, vaddsign1 -; CHECK-NEXT: padda [p5], #-384 -; CHECK-NEXT: mova dj0, #13 -; CHECK-NEXT: st.s8 r13, [p0, dj0] -; CHECK-NEXT: mova m0, #-288 -; CHECK-NEXT: mov p1, sp -; CHECK-NEXT: mova r21, #13 -; CHECK-NEXT: vextract.16 r13, x4, r21, vaddsign1 -; CHECK-NEXT: padda [p1], m0 -; CHECK-NEXT: mova m0, #-272 -; CHECK-NEXT: st.s8 r14, [p0, dj3] -; CHECK-NEXT: mova r22, #14 -; CHECK-NEXT: padda [p3], m0 -; CHECK-NEXT: mova m0, #-240 -; CHECK-NEXT: vextract.16 r14, x4, r22, vaddsign1 -; CHECK-NEXT: padda [p4], m0 -; CHECK-NEXT: mova m0, #-208 -; CHECK-NEXT: st p4, [sp, #-44] // 4-byte Folded Spill -; CHECK-NEXT: mov p4, sp -; CHECK-NEXT: padda [p4], m0 -; CHECK-NEXT: st p4, [sp, #-48] // 4-byte Folded Spill -; CHECK-NEXT: mova dj0, #15 -; CHECK-NEXT: st.s8 r15, [p0, dj0] -; CHECK-NEXT: mova r23, #15 -; CHECK-NEXT: vextract.16 r15, x4, r23, vaddsign1 -; CHECK-NEXT: vextract.16 r8, x6, r16, vaddsign1 -; CHECK-NEXT: vextract.32 r16, x10, r16, vaddsign1 -; CHECK-NEXT: vextract.16 r9, x6, r17, vaddsign1 -; CHECK-NEXT: vextract.32 r17, x10, r17, vaddsign1 -; CHECK-NEXT: st r16, [p5, dj4] -; CHECK-NEXT: st r17, [p5, dj5] -; CHECK-NEXT: st.s16 r24, [p2, #0] +; CHECK-NEXT: st.s8 r19, [p0, dj0] ; CHECK-NEXT: nop -; CHECK-NEXT: mova dj6, #40 -; CHECK-NEXT: vextract.16 r24, x6, r0, vaddsign1 -; CHECK-NEXT: vextract.16 r10, x6, r18, vaddsign1 -; CHECK-NEXT: vextract.32 r18, x10, r18, vaddsign1 -; CHECK-NEXT: mova dj5, #8 -; CHECK-NEXT: lda.s8 r16, [p0, dj5] -; CHECK-NEXT: st r18, [p5, dj6] -; CHECK-NEXT: st.s16 r25, [p2, #2] ; CHECK-NEXT: nop -; CHECK-NEXT: mova dj7, #44 -; CHECK-NEXT: vextract.16 r25, x6, r1, vaddsign1 -; CHECK-NEXT: vextract.16 r11, x6, r19, vaddsign1 -; CHECK-NEXT: vextract.32 r19, x10, r19, vaddsign1 -; CHECK-NEXT: mova dj5, #9 -; CHECK-NEXT: lda.s8 r17, [p0, dj5] -; CHECK-NEXT: st r19, [p5, dj7] -; CHECK-NEXT: st.s16 r26, [p2, #4] ; CHECK-NEXT: nop -; CHECK-NEXT: vextract.16 r26, x6, r2, vaddsign1 -; CHECK-NEXT: mova dj1, #48 -; CHECK-NEXT: vextract.16 r12, x6, r20, vaddsign1 -; CHECK-NEXT: vextract.32 r20, x10, r20, vaddsign1 -; CHECK-NEXT: mova dj5, #10 -; CHECK-NEXT: lda.s8 r18, [p0, dj5] -; CHECK-NEXT: st r20, [p5, dj1] -; CHECK-NEXT: st.s16 r27, [p2, #6] +; CHECK-NEXT: vextract.16 r19, x4, #11, vaddsign1 ; CHECK-NEXT: nop -; CHECK-NEXT: vextract.16 r27, x6, r3, vaddsign1 -; CHECK-NEXT: mova dj3, #52 -; CHECK-NEXT: vextract.16 r13, x6, r21, vaddsign1 -; CHECK-NEXT: vextract.32 r21, x10, r21, vaddsign1 -; CHECK-NEXT: mova dj5, #11 -; CHECK-NEXT: lda.s8 r19, [p0, dj5] -; CHECK-NEXT: st r21, [p5, dj3] -; CHECK-NEXT: st.s16 r28, [p2, #8] +; CHECK-NEXT: mova dj1, #12 +; CHECK-NEXT: st.s8 r20, [p0, dj1] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: vextract.16 r28, x6, r4, vaddsign1 +; CHECK-NEXT: vextract.16 r20, x4, #12, vaddsign1 +; CHECK-NEXT: nop +; CHECK-NEXT: mova dj0, #13 +; CHECK-NEXT: st.s8 r21, [p0, dj0] ; CHECK-NEXT: nop -; CHECK-NEXT: mova dj5, #12 -; CHECK-NEXT: lda.s8 r20, [p0, dj5] -; CHECK-NEXT: st.s16 r29, [p2, #10] ; CHECK-NEXT: nop -; CHECK-NEXT: mova dj2, #60 -; CHECK-NEXT: vextract.16 r29, x6, r5, vaddsign1 -; CHECK-NEXT: vextract.16 r15, x6, r23, vaddsign1 -; CHECK-NEXT: vextract.32 r23, x10, r23, vaddsign1 -; CHECK-NEXT: mova dj5, #13 -; CHECK-NEXT: lda.s8 r21, [p0, dj5] -; CHECK-NEXT: st r23, [p5, dj2] -; CHECK-NEXT: st.s16 r30, [p2, #12] ; CHECK-NEXT: nop +; CHECK-NEXT: vextract.16 r21, x4, #13, vaddsign1 ; CHECK-NEXT: nop +; CHECK-NEXT: mova dj3, #14 +; CHECK-NEXT: st.s8 r22, [p0, dj3] ; CHECK-NEXT: nop -; CHECK-NEXT: vextract.16 r30, x6, r6, vaddsign1 -; CHECK-NEXT: vextract.32 r24, x8, r0, vaddsign1 ; CHECK-NEXT: nop -; CHECK-NEXT: st r24, [p7, #0] -; CHECK-NEXT: st.s16 r31, [p2, #14] ; CHECK-NEXT: nop +; CHECK-NEXT: vextract.16 r22, x4, #14, vaddsign1 ; CHECK-NEXT: nop +; CHECK-NEXT: mova dj0, #15 +; CHECK-NEXT: st.s8 r23, [p0, dj0] ; CHECK-NEXT: nop -; CHECK-NEXT: vextract.16 r31, x6, r7, vaddsign1 -; CHECK-NEXT: vextract.32 r25, x8, r1, vaddsign1 -; CHECK-NEXT: mova dj0, #16 -; CHECK-NEXT: st r25, [p7, #4] -; CHECK-NEXT: st.s16 r8, [p2, dj0] ; CHECK-NEXT: nop +; CHECK-NEXT: vextract.16 r23, x4, #15, vaddsign1 ; CHECK-NEXT: nop +; CHECK-NEXT: mov p2, sp +; CHECK-NEXT: padda [p2], #-512 +; CHECK-NEXT: st.s16 r0, [p2, #0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: mova dj0, #18 -; CHECK-NEXT: st.s16 r9, [p2, dj0] +; CHECK-NEXT: vextract.16 r0, x6, #0, vaddsign1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: st.s16 r1, [p2, #2] ; CHECK-NEXT: nop -; CHECK-NEXT: vextract.32 r26, x8, r2, vaddsign1 -; CHECK-NEXT: vextract.32 r27, x8, r3, vaddsign1 -; CHECK-NEXT: mova dj0, #20 -; CHECK-NEXT: st r26, [p7, #8] -; CHECK-NEXT: st r27, [p7, #12] -; CHECK-NEXT: st.s16 r10, [p2, dj0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: vextract.16 r1, x6, #1, vaddsign1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: st.s16 r2, [p2, #4] ; CHECK-NEXT: nop -; CHECK-NEXT: mova dj0, #22 -; CHECK-NEXT: st.s16 r11, [p2, dj0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: vextract.16 r2, x6, #2, vaddsign1 ; CHECK-NEXT: nop -; CHECK-NEXT: vextract.32 r28, x8, r4, vaddsign1 -; CHECK-NEXT: vextract.32 r29, x8, r5, vaddsign1 -; CHECK-NEXT: mova dj0, #24 -; CHECK-NEXT: st r28, [p7, #16] -; CHECK-NEXT: st r29, [p7, #20] -; CHECK-NEXT: st.s16 r12, [p2, dj0] +; CHECK-NEXT: nop +; CHECK-NEXT: st.s16 r3, [p2, #6] +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: vextract.16 r3, x6, #3, vaddsign1 +; CHECK-NEXT: nop +; CHECK-NEXT: mova dj4, #32 +; CHECK-NEXT: st.s16 r4, [p2, #8] +; CHECK-NEXT: mova dj5, #36 +; CHECK-NEXT: mova dj6, #40 +; CHECK-NEXT: mova dj7, #44 +; CHECK-NEXT: vextract.16 r4, x6, #4, vaddsign1 ; CHECK-NEXT: vmov x2, bmll0 +; CHECK-NEXT: mov p1, sp +; CHECK-NEXT: st.s16 r5, [p2, #10] +; CHECK-NEXT: mov p5, sp ; CHECK-NEXT: vmov bmll0, x2 -; CHECK-NEXT: nop +; CHECK-NEXT: mova m0, #-480 +; CHECK-NEXT: vextract.16 r5, x6, #5, vaddsign1 +; CHECK-NEXT: padda [p1], m0 +; CHECK-NEXT: mova m0, #-416 +; CHECK-NEXT: st.s16 r6, [p2, #12] +; CHECK-NEXT: mov r30, p1 +; CHECK-NEXT: mov p1, sp +; CHECK-NEXT: mov r29, p7 +; CHECK-NEXT: vextract.16 r6, x6, #6, vaddsign1 +; CHECK-NEXT: vextract.32 r0, x8, #0, vaddsign1 +; CHECK-NEXT: mov p7, r30 +; CHECK-NEXT: st r0, [p7, #0] +; CHECK-NEXT: st.s16 r7, [p2, #14] +; CHECK-NEXT: padda [p1], #-448 +; CHECK-NEXT: mov r31, p1 +; CHECK-NEXT: mov p1, sp +; CHECK-NEXT: vextract.16 r7, x6, #7, vaddsign1 +; CHECK-NEXT: vextract.32 r1, x8, #1, vaddsign1 +; CHECK-NEXT: mova dj0, #16 +; CHECK-NEXT: st r1, [p7, #4] +; CHECK-NEXT: st.s16 r16, [p2, dj0] +; CHECK-NEXT: padda [p1], m0 +; CHECK-NEXT: mova m0, #-352 +; CHECK-NEXT: mov r8, p1 +; CHECK-NEXT: vextract.16 r16, x6, #8, vaddsign1 ; CHECK-NEXT: vmov x0, bmll0 +; CHECK-NEXT: mova dj0, #18 +; CHECK-NEXT: st.s16 r17, [p2, dj0] +; CHECK-NEXT: mov p1, sp +; CHECK-NEXT: padda [p5], m0 +; CHECK-NEXT: vextract.16 r17, x6, #9, vaddsign1 +; CHECK-NEXT: vextract.32 r2, x8, #2, vaddsign1 +; CHECK-NEXT: vextract.32 r3, x8, #3, vaddsign1 +; CHECK-NEXT: mova dj0, #20 +; CHECK-NEXT: st r2, [p7, #8] +; CHECK-NEXT: st r3, [p7, #12] +; CHECK-NEXT: st.s16 r18, [p2, dj0] +; CHECK-NEXT: mova m0, #-288 +; CHECK-NEXT: padda [p1], m0 +; CHECK-NEXT: mov r28, p1 +; CHECK-NEXT: vextract.16 r18, x6, #10, vaddsign1 ; CHECK-NEXT: vmov bmll0, x2 +; CHECK-NEXT: mova dj0, #22 +; CHECK-NEXT: st.s16 r19, [p2, dj0] +; CHECK-NEXT: mov r30, p7 +; CHECK-NEXT: mova dj2, #60 +; CHECK-NEXT: vextract.16 r19, x6, #11, vaddsign1 +; CHECK-NEXT: vextract.32 r4, x8, #4, vaddsign1 +; CHECK-NEXT: vextract.32 r5, x8, #5, vaddsign1 +; CHECK-NEXT: mova dj0, #24 +; CHECK-NEXT: st r4, [p7, #16] +; CHECK-NEXT: st r5, [p7, #20] +; CHECK-NEXT: st.s16 r20, [p2, dj0] +; CHECK-NEXT: mova dj1, #48 +; CHECK-NEXT: mova dj3, #52 +; CHECK-NEXT: vextract.16 r20, x6, #12, vaddsign1 +; CHECK-NEXT: vextract.16 r21, x6, #13, vaddsign1 +; CHECK-NEXT: vextract.16 r22, x6, #14, vaddsign1 ; CHECK-NEXT: mova dj0, #26 -; CHECK-NEXT: st.s16 r13, [p2, dj0] -; CHECK-NEXT: vextract.16 r14, x6, r22, vaddsign1 -; CHECK-NEXT: vextract.32 r22, x10, r22, vaddsign1 -; CHECK-NEXT: mova dj5, #14 -; CHECK-NEXT: vextract.32 r30, x8, r6, vaddsign1 -; CHECK-NEXT: vextract.32 r31, x8, r7, vaddsign1 -; CHECK-NEXT: mova dj0, #28 -; CHECK-NEXT: st r30, [p7, #24] -; CHECK-NEXT: st r31, [p7, #28] -; CHECK-NEXT: st.s16 r14, [p2, dj0] -; CHECK-NEXT: vextract.64 r25:r24, x0, r0, vaddsign1 +; CHECK-NEXT: st.s16 r21, [p2, dj0] +; CHECK-NEXT: vextract.16 r23, x6, #15, vaddsign1 +; CHECK-NEXT: vextract.64 r1:r0, x0, #0, vaddsign1 ; CHECK-NEXT: vmov x0, bmll0 +; CHECK-NEXT: vextract.32 r6, x8, #6, vaddsign1 +; CHECK-NEXT: vextract.32 r7, x8, #7, vaddsign1 +; CHECK-NEXT: mova dj0, #28 +; CHECK-NEXT: st r6, [p7, #24] +; CHECK-NEXT: st r7, [p7, #28] +; CHECK-NEXT: st.s16 r22, [p2, dj0] ; CHECK-NEXT: vmov bmll0, x2 -; CHECK-NEXT: vextract.64 r27:r26, x0, r1, vaddsign1 +; CHECK-NEXT: vextract.64 r3:r2, x0, #1, vaddsign1 ; CHECK-NEXT: vmov x0, bmll0 -; CHECK-NEXT: mova dj0, #30 -; CHECK-NEXT: lda p7, [sp, #-16] // 4-byte Folded Reload -; CHECK-NEXT: st.s16 r15, [p2, dj0] ; CHECK-NEXT: vmov bmll0, x2 -; CHECK-NEXT: vextract.64 r29:r28, x0, r2, vaddsign1 +; CHECK-NEXT: vextract.64 r5:r4, x0, #2, vaddsign1 +; CHECK-NEXT: mova dj0, #30 +; CHECK-NEXT: st.s16 r23, [p2, dj0] ; CHECK-NEXT: vmov x0, bmll0 -; CHECK-NEXT: vextract.64 r31:r30, x0, r3, vaddsign1 +; CHECK-NEXT: mov p7, r31 +; CHECK-NEXT: vextract.64 r7:r6, x0, #3, vaddsign1 ; CHECK-NEXT: vmov x0, bmll1 +; CHECK-NEXT: mov r31, p7 ; CHECK-NEXT: mova dj0, #36 -; CHECK-NEXT: st r24, [p7, #0] -; CHECK-NEXT: st r25, [p7, #4] -; CHECK-NEXT: st r26, [p7, #8] -; CHECK-NEXT: st r27, [p7, #12] -; CHECK-NEXT: st r28, [p7, #16] -; CHECK-NEXT: st r29, [p7, #20] -; CHECK-NEXT: st r30, [p7, #24] -; CHECK-NEXT: st r31, [p7, #28] -; CHECK-NEXT: vextract.64 r29:r28, x0, r0, vaddsign1 +; CHECK-NEXT: st r0, [p7, #0] +; CHECK-NEXT: st r1, [p7, #4] +; CHECK-NEXT: st r2, [p7, #8] +; CHECK-NEXT: st r3, [p7, #12] +; CHECK-NEXT: st r4, [p7, #16] +; CHECK-NEXT: st r5, [p7, #20] +; CHECK-NEXT: vextract.64 r5:r4, x0, #0, vaddsign1 ; CHECK-NEXT: vmov x0, bmll1 -; CHECK-NEXT: vextract.32 r0, x10, r0, vaddsign1 -; CHECK-NEXT: lda p7, [sp, #-20] // 4-byte Folded Reload -; CHECK-NEXT: vextract.64 r31:r30, x0, r1, vaddsign1 +; CHECK-NEXT: st r6, [p7, #24] +; CHECK-NEXT: st r7, [p7, #28] +; CHECK-NEXT: mov p7, r8 +; CHECK-NEXT: vextract.64 r7:r6, x0, #1, vaddsign1 ; CHECK-NEXT: vmov x0, bmll1 -; CHECK-NEXT: vextract.32 r1, x10, r1, vaddsign1 -; CHECK-NEXT: vextract.64 r9:r8, x0, r2, vaddsign1 +; CHECK-NEXT: mov r8, p7 +; CHECK-NEXT: vextract.64 r17:r16, x0, #2, vaddsign1 ; CHECK-NEXT: vmov x0, bmll1 -; CHECK-NEXT: vextract.32 r2, x10, r2, vaddsign1 -; CHECK-NEXT: st r0, [p5, #0] -; CHECK-NEXT: vextract.64 r11:r10, x0, r3, vaddsign1 +; CHECK-NEXT: st r4, [p7, #0] +; CHECK-NEXT: st r5, [p7, #4] +; CHECK-NEXT: vextract.32 r4, x10, #4, vaddsign1 +; CHECK-NEXT: vextract.32 r5, x10, #5, vaddsign1 +; CHECK-NEXT: vextract.64 r19:r18, x0, #3, vaddsign1 ; CHECK-NEXT: vmov x0, bmll1 -; CHECK-NEXT: vextract.32 r3, x10, r3, vaddsign1 -; CHECK-NEXT: st r1, [p5, #4] -; CHECK-NEXT: vextract.64 r13:r12, x0, r4, vaddsign1 -; CHECK-NEXT: vmov x0, bmll1 -; CHECK-NEXT: vextract.32 r4, x10, r4, vaddsign1 -; CHECK-NEXT: st r2, [p5, #8] -; CHECK-NEXT: lda r2, [sp, #-28] // 4-byte Folded Reload -; CHECK-NEXT: vextract.64 r15:r14, x0, r5, vaddsign1 -; CHECK-NEXT: vmov x0, bmll1 -; CHECK-NEXT: vextract.32 r5, x10, r5, vaddsign1 -; CHECK-NEXT: st r3, [p5, #12] -; CHECK-NEXT: vextract.64 r27:r26, x0, r6, vaddsign1 +; CHECK-NEXT: st r6, [p7, #8] +; CHECK-NEXT: st r7, [p7, #12] +; CHECK-NEXT: vextract.32 r6, x10, #6, vaddsign1 +; CHECK-NEXT: vextract.32 r7, x10, #7, vaddsign1 +; CHECK-NEXT: vextract.64 r21:r20, x0, #4, vaddsign1 ; CHECK-NEXT: vmov x0, bmll1 -; CHECK-NEXT: vextract.32 r6, x10, r6, vaddsign1 +; CHECK-NEXT: st r16, [p7, #16] +; CHECK-NEXT: st r17, [p7, #20] +; CHECK-NEXT: vextract.32 r16, x10, #8, vaddsign1 +; CHECK-NEXT: vextract.32 r17, x10, #9, vaddsign1 ; CHECK-NEXT: st r4, [p5, #16] -; CHECK-NEXT: st r13, [p7, dj0] -; CHECK-NEXT: mova dj0, #56 -; CHECK-NEXT: vextract.64 r25:r24, x0, r7, vaddsign1 -; CHECK-NEXT: st r28, [p7, #0] -; CHECK-NEXT: st r29, [p7, #4] -; CHECK-NEXT: st r30, [p7, #8] -; CHECK-NEXT: st r31, [p7, #12] -; CHECK-NEXT: st r8, [p7, #16] -; CHECK-NEXT: st r9, [p7, #20] -; CHECK-NEXT: st r10, [p7, #24] -; CHECK-NEXT: st r11, [p7, #28] -; CHECK-NEXT: st r12, [p7, dj4] -; CHECK-NEXT: st r14, [p7, dj6] -; CHECK-NEXT: st r15, [p7, dj7] -; CHECK-NEXT: vextract.32 r7, x10, r7, vaddsign1 ; CHECK-NEXT: st r5, [p5, #20] -; CHECK-NEXT: lda r15, [sp, #-632] // 4-byte Folded Reload -; CHECK-NEXT: lda r14, [sp, #-628] // 4-byte Folded Reload -; CHECK-NEXT: lda r13, [sp, #-624] // 4-byte Folded Reload -; CHECK-NEXT: lda r12, [sp, #-620] // 4-byte Folded Reload -; CHECK-NEXT: lda r11, [sp, #-616] // 4-byte Folded Reload -; CHECK-NEXT: lda r10, [sp, #-612] // 4-byte Folded Reload -; CHECK-NEXT: lda r9, [sp, #-608] // 4-byte Folded Reload -; CHECK-NEXT: lda r8, [sp, #-604] // 4-byte Folded Reload -; CHECK-NEXT: st r22, [p5, dj0] -; CHECK-NEXT: lda.s8 r22, [p0, dj5] -; CHECK-NEXT: mova dj5, #15 -; CHECK-NEXT: st r26, [p7, dj1] -; CHECK-NEXT: st r27, [p7, dj3] +; CHECK-NEXT: vextract.64 r23:r22, x0, #5, vaddsign1 +; CHECK-NEXT: vmov x0, bmll1 +; CHECK-NEXT: st r18, [p7, #24] +; CHECK-NEXT: st r19, [p7, #28] +; CHECK-NEXT: vextract.32 r18, x10, #10, vaddsign1 +; CHECK-NEXT: vextract.32 r19, x10, #11, vaddsign1 ; CHECK-NEXT: st r6, [p5, #24] -; CHECK-NEXT: st r24, [p7, dj0] -; CHECK-NEXT: mov r24, p7 -; CHECK-NEXT: st r25, [p7, dj2] ; CHECK-NEXT: st r7, [p5, #28] -; CHECK-NEXT: lda.s8 r23, [p0, dj5] -; CHECK-NEXT: mov p7, r2 +; CHECK-NEXT: st r21, [p7, dj0] +; CHECK-NEXT: mova dj0, #56 +; CHECK-NEXT: st r17, [p5, dj5] +; CHECK-NEXT: mova dj5, #8 +; CHECK-NEXT: vextract.64 r3:r2, x0, #6, vaddsign1 +; CHECK-NEXT: vmov x0, bmll1 +; CHECK-NEXT: st r20, [p7, dj4] +; CHECK-NEXT: vextract.32 r20, x10, #12, vaddsign1 +; CHECK-NEXT: vextract.32 r21, x10, #13, vaddsign1 +; CHECK-NEXT: st r16, [p5, dj4] +; CHECK-NEXT: lda.s8 r16, [p0, dj5] +; CHECK-NEXT: mova dj5, #9 +; CHECK-NEXT: vextract.64 r1:r0, x0, #7, vaddsign1 +; CHECK-NEXT: st r22, [p7, dj6] +; CHECK-NEXT: st r23, [p7, dj7] +; CHECK-NEXT: vextract.32 r22, x10, #14, vaddsign1 +; CHECK-NEXT: vextract.32 r23, x10, #15, vaddsign1 +; CHECK-NEXT: st r18, [p5, dj6] +; CHECK-NEXT: st r19, [p5, dj7] +; CHECK-NEXT: lda.s8 r17, [p0, dj5] +; CHECK-NEXT: mova dj5, #10 +; CHECK-NEXT: st r2, [p7, dj1] +; CHECK-NEXT: st r3, [p7, dj3] +; CHECK-NEXT: vextract.32 r2, x10, #2, vaddsign1 +; CHECK-NEXT: vextract.32 r3, x10, #3, vaddsign1 +; CHECK-NEXT: st r20, [p5, dj1] +; CHECK-NEXT: st r21, [p5, dj3] +; CHECK-NEXT: st r0, [p7, dj0] +; CHECK-NEXT: st r1, [p7, dj2] +; CHECK-NEXT: mov p7, r27 +; CHECK-NEXT: lda.s8 r18, [p0, dj5] +; CHECK-NEXT: mova dj5, #11 +; CHECK-NEXT: vextract.32 r0, x10, #0, vaddsign1 +; CHECK-NEXT: vextract.32 r1, x10, #1, vaddsign1 +; CHECK-NEXT: st r22, [p5, dj0] +; CHECK-NEXT: st r23, [p5, dj2] +; CHECK-NEXT: lda.s8 r19, [p0, dj5] +; CHECK-NEXT: mova dj5, #12 +; CHECK-NEXT: st r2, [p5, #8] +; CHECK-NEXT: st r3, [p5, #12] +; CHECK-NEXT: lda.s8 r20, [p0, dj5] +; CHECK-NEXT: mova dj5, #13 +; CHECK-NEXT: st r0, [p5, #0] +; CHECK-NEXT: st r1, [p5, #4] ; CHECK-NEXT: lda.s16 r0, [p7], #2 ; CHECK-NEXT: lda.s16 r1, [p7, #0] -; CHECK-NEXT: mov p7, r2 -; CHECK-NEXT: lda p7, [sp, #-36] // 4-byte Folded Reload +; CHECK-NEXT: mov p7, r27 +; CHECK-NEXT: lda.s8 r21, [p0, dj5] +; CHECK-NEXT: mova dj5, #14 ; CHECK-NEXT: lda.s16 r2, [p7, #4] ; CHECK-NEXT: lda.s16 r3, [p7, #6] ; CHECK-NEXT: lda.s16 r4, [p7, #8] ; CHECK-NEXT: lda.s16 r5, [p7, #10] ; CHECK-NEXT: lda.s16 r6, [p7, #12] ; CHECK-NEXT: lda.s16 r7, [p7, #14] +; CHECK-NEXT: mov p7, r28 +; CHECK-NEXT: lda.s8 r22, [p0, dj5] +; CHECK-NEXT: mova dj5, #15 +; CHECK-NEXT: lda.s8 r23, [p0, dj5] ; CHECK-NEXT: st.s16 r0, [p7], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -454,7 +437,6 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda r2, [sp, #-32] // 4-byte Folded Reload ; CHECK-NEXT: st.s16 r3, [p7], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -477,47 +459,48 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: lda.s8 r5, [p0, #5] ; CHECK-NEXT: st.s16 r6, [p7], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: nop +; CHECK-NEXT: mova m0, #-272 ; CHECK-NEXT: st.s16 r7, [p7, #0] -; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: mov p7, r2 +; CHECK-NEXT: mov p1, sp +; CHECK-NEXT: padda [p1], m0 +; CHECK-NEXT: mov r26, p1 +; CHECK-NEXT: mov p1, sp +; CHECK-NEXT: padda [p1], #-256 +; CHECK-NEXT: mov p7, r29 ; CHECK-NEXT: lda r0, [p7], #4 ; CHECK-NEXT: lda r1, [p7, #0] -; CHECK-NEXT: lda p7, [sp, #-40] // 4-byte Folded Reload -; CHECK-NEXT: mov p7, r2 +; CHECK-NEXT: mov p7, r29 ; CHECK-NEXT: lda r2, [p7, #8] -; CHECK-NEXT: lda.s8 r5, [p0, #5] ; CHECK-NEXT: lda r3, [p7, #12] ; CHECK-NEXT: lda.s8 r6, [p0, #6] ; CHECK-NEXT: lda.s8 r7, [p0, #7] +; CHECK-NEXT: mov p7, r26 ; CHECK-NEXT: st r0, [p7], #4 ; CHECK-NEXT: st r1, [p7], #4 ; CHECK-NEXT: st r2, [p7], #4 -; CHECK-NEXT: lda.s8 r2, [p0, #2] ; CHECK-NEXT: st r3, [p7, #0] ; CHECK-NEXT: mov p7, p0 -; CHECK-NEXT: lda.s8 r3, [p0, #3] ; CHECK-NEXT: lda.s8 r0, [p7], #1 -; CHECK-NEXT: lda.s8 r1, [p7, #0] ; CHECK-NEXT: st.s8 r0, [p1], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: lda.s8 r2, [p0, #2] +; CHECK-NEXT: lda.s8 r3, [p0, #3] ; CHECK-NEXT: mov p0, p2 +; CHECK-NEXT: lda.s8 r1, [p7, #0] ; CHECK-NEXT: lda.s16 r0, [p0], #2 -; CHECK-NEXT: lda p7, [sp, #-640] // 4-byte Folded Reload +; CHECK-NEXT: lda p7, [sp, #-576] // 4-byte Folded Reload ; CHECK-NEXT: st.s8 r1, [p1], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -596,16 +579,14 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: mova dj5, #20 -; CHECK-NEXT: lda.s16 r18, [p2, dj5] +; CHECK-NEXT: nop ; CHECK-NEXT: st.s8 r19, [p1], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: mova dj5, #22 -; CHECK-NEXT: lda.s16 r19, [p2, dj5] +; CHECK-NEXT: nop ; CHECK-NEXT: st.s8 r20, [p1], #1 ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -631,15 +612,26 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: mov p3, sp +; CHECK-NEXT: mova m0, #-240 +; CHECK-NEXT: padda [p3], m0 +; CHECK-NEXT: st.s16 r0, [p3], #2 +; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: st.s16 r0, [p3], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: st.s16 r1, [p3], #2 +; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: nop +; CHECK-NEXT: mova dj5, #20 +; CHECK-NEXT: lda.s16 r18, [p2, dj5] +; CHECK-NEXT: mova dj5, #22 +; CHECK-NEXT: lda.s16 r19, [p2, dj5] ; CHECK-NEXT: mova dj5, #24 ; CHECK-NEXT: lda.s16 r20, [p2, dj5] ; CHECK-NEXT: mova dj5, #26 @@ -647,15 +639,11 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: mova dj5, #28 ; CHECK-NEXT: lda.s16 r22, [p2, dj5] ; CHECK-NEXT: mova dj5, #30 +; CHECK-NEXT: mov p1, r30 ; CHECK-NEXT: lda.s16 r23, [p2, dj5] -; CHECK-NEXT: lda p1, [sp, #-24] // 4-byte Folded Reload -; CHECK-NEXT: st.s16 r1, [p3], #2 -; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: nop +; CHECK-NEXT: mov p0, p1 +; CHECK-NEXT: lda r0, [p0], #4 +; CHECK-NEXT: lda r1, [p0, #0] ; CHECK-NEXT: st.s16 r2, [p3], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -663,6 +651,7 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: lda r2, [p1, #8] ; CHECK-NEXT: st.s16 r3, [p3], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -670,6 +659,7 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: lda r3, [p1, #12] ; CHECK-NEXT: st.s16 r4, [p3], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -677,6 +667,7 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: lda r4, [p1, #16] ; CHECK-NEXT: st.s16 r5, [p3], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -684,9 +675,6 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: lda r2, [p1, #8] -; CHECK-NEXT: lda r3, [p1, #12] -; CHECK-NEXT: lda r4, [p1, #16] ; CHECK-NEXT: lda r5, [p1, #20] ; CHECK-NEXT: st.s16 r6, [p3], #2 ; CHECK-NEXT: nop @@ -694,27 +682,26 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: mov p0, p1 +; CHECK-NEXT: nop ; CHECK-NEXT: lda r6, [p1, #24] -; CHECK-NEXT: lda r0, [p0], #4 ; CHECK-NEXT: st.s16 r7, [p3], #2 ; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: nop +; CHECK-NEXT: mov p4, sp +; CHECK-NEXT: mova m0, #-208 +; CHECK-NEXT: padda [p4], m0 +; CHECK-NEXT: mov r25, p4 +; CHECK-NEXT: mov p0, r25 ; CHECK-NEXT: lda r7, [p1, #28] -; CHECK-NEXT: lda p1, [sp, #-16] // 4-byte Folded Reload -; CHECK-NEXT: lda r1, [p0, #0] -; CHECK-NEXT: lda p0, [sp, #-44] // 4-byte Folded Reload +; CHECK-NEXT: st r0, [p0], #4 ; CHECK-NEXT: st.s16 r16, [p3], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: nop +; CHECK-NEXT: mov p1, r31 +; CHECK-NEXT: st r1, [p0], #4 +; CHECK-NEXT: lda r1, [p1, #4] ; CHECK-NEXT: st.s16 r17, [p3], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -722,6 +709,8 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: st r2, [p0], #4 +; CHECK-NEXT: lda r2, [p1, #8] ; CHECK-NEXT: st.s16 r18, [p3], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -729,6 +718,7 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: st r3, [p0], #4 ; CHECK-NEXT: st.s16 r19, [p3], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -736,6 +726,8 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: st r4, [p0], #4 +; CHECK-NEXT: lda r4, [p1, #16] ; CHECK-NEXT: st.s16 r20, [p3], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -743,6 +735,7 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: st r5, [p0], #4 ; CHECK-NEXT: st.s16 r21, [p3], #2 ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -750,43 +743,33 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: st r6, [p0], #4 +; CHECK-NEXT: lda r6, [p1, #24] ; CHECK-NEXT: st.s16 r22, [p3], #2 -; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: st r0, [p0], #4 -; CHECK-NEXT: st.s16 r23, [p3, #0] ; CHECK-NEXT: mov p6, sp ; CHECK-NEXT: mova m0, #-176 ; CHECK-NEXT: mov p4, sp ; CHECK-NEXT: padda [p4], m0 -; CHECK-NEXT: mova m0, #-112 -; CHECK-NEXT: padda [p6], m0 -; CHECK-NEXT: st r1, [p0], #4 -; CHECK-NEXT: lda r1, [p1, #4] -; CHECK-NEXT: st r2, [p0], #4 -; CHECK-NEXT: lda r2, [p1, #8] -; CHECK-NEXT: st r3, [p0], #4 -; CHECK-NEXT: st r4, [p0], #4 -; CHECK-NEXT: lda r4, [p1, #16] -; CHECK-NEXT: st r5, [p0], #4 -; CHECK-NEXT: st r6, [p0], #4 -; CHECK-NEXT: lda r6, [p1, #24] +; CHECK-NEXT: mova m0, #-144 +; CHECK-NEXT: mov r24, p4 ; CHECK-NEXT: st r7, [p0, #0] +; CHECK-NEXT: lda r8, [sp, #-568] // 4-byte Folded Reload +; CHECK-NEXT: st.s16 r23, [p3, #0] +; CHECK-NEXT: mov p4, sp +; CHECK-NEXT: padda [p4], m0 +; CHECK-NEXT: mova m0, #-80 ; CHECK-NEXT: mov p0, p1 -; CHECK-NEXT: mov p1, r24 +; CHECK-NEXT: mov p1, r8 +; CHECK-NEXT: padda [p6], m0 ; CHECK-NEXT: lda r16, [p1, dj4] -; CHECK-NEXT: lda r0, [p0], #12 -; CHECK-NEXT: lda p0, [sp, #-48] // 4-byte Folded Reload ; CHECK-NEXT: lda r18, [p1, dj6] +; CHECK-NEXT: lda r0, [p0], #12 ; CHECK-NEXT: lda r20, [p1, dj1] ; CHECK-NEXT: lda r22, [p1, dj0] ; CHECK-NEXT: lda r3, [p0], #8 ; CHECK-NEXT: lda r5, [p0], #8 ; CHECK-NEXT: lda r7, [p0, #0] +; CHECK-NEXT: mov p0, r24 ; CHECK-NEXT: st r0, [p0], #4 ; CHECK-NEXT: st r1, [p0], #4 ; CHECK-NEXT: lda r1, [p1, #4] @@ -855,12 +838,12 @@ define dso_local void @test_load_store_unaligned(<8 x i16> noundef %a, <4 x i32> ; CHECK-NEXT: st r18, [p6], #4 ; CHECK-NEXT: st r19, [p6], #4 ; CHECK-NEXT: st r20, [p6], #4 -; CHECK-NEXT: lda p6, [sp, #-636] // 4-byte Folded Reload +; CHECK-NEXT: lda p6, [sp, #-572] // 4-byte Folded Reload ; CHECK-NEXT: ret lr ; CHECK-NEXT: st r21, [p6], #4 // Delay Slot 5 ; CHECK-NEXT: st r22, [p6], #4 // Delay Slot 4 ; CHECK-NEXT: st r23, [p6, #0] // Delay Slot 3 -; CHECK-NEXT: paddxm [sp], #-640 // Delay Slot 2 +; CHECK-NEXT: paddxm [sp], #-576 // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: %a.addr = alloca <8 x i16>, align 8 diff --git a/llvm/test/CodeGen/AIE/aie2p/streams.ll b/llvm/test/CodeGen/AIE/aie2p/streams.ll index b60b3e354b80..100f16ce3693 100644 --- a/llvm/test/CodeGen/AIE/aie2p/streams.ll +++ b/llvm/test/CodeGen/AIE/aie2p/streams.ll @@ -138,41 +138,41 @@ define dso_local void @_Z19test_put_ms_v64bf16Dv64_u6__bf16ii(<64 x bfloat> noun ; CHECK-LABEL: _Z19test_put_ms_v64bf16Dv64_u6__bf16ii: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: mova r2, #0; nopb ; nopx ; mov r28, r1 -; CHECK-NEXT: mova r0, #1; vextract.32 r5, x4, r2, vaddsign1 -; CHECK-NEXT: mova r3, #2; vextract.32 r6, x4, r0, vaddsign1 -; CHECK-NEXT: mova r4, #3; mov ms, r5; vextract.32 r7, x4, r3, vaddsign1 -; CHECK-NEXT: mova r5, #4; mov ms, r6; vextract.32 r6, x4, r4, vaddsign1 -; CHECK-NEXT: mova r16, #5; mov ms, r7; vextract.32 r7, x4, r5, vaddsign1 -; CHECK-NEXT: mova r17, #6; mov ms, r6; vextract.32 r6, x4, r16, vaddsign1 -; CHECK-NEXT: mova r18, #7; mov ms, r7; vextract.32 r7, x4, r17, vaddsign1 -; CHECK-NEXT: mova r19, #8; mov ms, r6; vextract.32 r6, x4, r18, vaddsign1 -; CHECK-NEXT: mova r20, #9; mov ms, r7; vextract.32 r7, x4, r19, vaddsign1 -; CHECK-NEXT: mova r21, #10; mov ms, r6; vextract.32 r6, x4, r20, vaddsign1 -; CHECK-NEXT: mova r22, #11; mov ms, r7; vextract.32 r7, x4, r21, vaddsign1 -; CHECK-NEXT: mova r23, #12; mov ms, r6; vextract.32 r6, x4, r22, vaddsign1 -; CHECK-NEXT: mova r24, #13; mov ms, r7; vextract.32 r7, x4, r23, vaddsign1 -; CHECK-NEXT: mova r25, #14; mov ms, r6; vextract.32 r6, x4, r24, vaddsign1 -; CHECK-NEXT: mova r26, #15; mov ms, r7; vextract.32 r7, x4, r25, vaddsign1 -; CHECK-NEXT: mov ms, r6; vextract.32 r6, x4, r26, vaddsign1 -; CHECK-NEXT: mov ms, r7; vextract.32 r2, x5, r2, vaddsign1 -; CHECK-NEXT: mov ms, r6; vextract.32 r0, x5, r0, vaddsign1 -; CHECK-NEXT: mov ms, r2; vextract.32 r2, x5, r3, vaddsign1 -; CHECK-NEXT: mov ms, r0; vextract.32 r0, x5, r4, vaddsign1 -; CHECK-NEXT: mov ms, r2; vextract.32 r2, x5, r5, vaddsign1 -; CHECK-NEXT: mov ms, r0; vextract.32 r0, x5, r16, vaddsign1 -; CHECK-NEXT: mov ms, r2; vextract.32 r2, x5, r17, vaddsign1 -; CHECK-NEXT: mov ms, r0; vextract.32 r0, x5, r18, vaddsign1 -; CHECK-NEXT: mov ms, r2; vextract.32 r2, x5, r19, vaddsign1 -; CHECK-NEXT: mov ms, r0; vextract.32 r0, x5, r20, vaddsign1 -; CHECK-NEXT: mov ms, r2; vextract.32 r2, x5, r21, vaddsign1 -; CHECK-NEXT: mov ms, r0; vextract.32 r0, x5, r22, vaddsign1 -; CHECK-NEXT: mov ms, r2; vextract.32 r2, x5, r23, vaddsign1 -; CHECK-NEXT: mov ms, r0; vextract.32 r0, x5, r24, vaddsign1 -; CHECK-NEXT: mov ms, r2; ret lr; vextract.32 r2, x5, r25, vaddsign1 -; CHECK-NEXT: mov ms, r0; vextract.32 r0, x5, r26, vaddsign1 // Delay Slot 5 -; CHECK-NEXT: mov ms, r2 // Delay Slot 4 -; CHECK-NEXT: mov ms, r0, r28 // Delay Slot 3 +; CHECK-NEXT: nopa ; nopx ; mov r28, r1 +; CHECK-NEXT: vextract.32 r0, x4, #0, vaddsign1 +; CHECK-NEXT: vextract.32 r2, x4, #1, vaddsign1 +; CHECK-NEXT: mov ms, r0; vextract.32 r0, x4, #2, vaddsign1 +; CHECK-NEXT: mov ms, r2; vextract.32 r2, x4, #3, vaddsign1 +; CHECK-NEXT: mov ms, r0; vextract.32 r0, x4, #4, vaddsign1 +; CHECK-NEXT: mov ms, r2; vextract.32 r2, x4, #5, vaddsign1 +; CHECK-NEXT: mov ms, r0; vextract.32 r0, x4, #6, vaddsign1 +; CHECK-NEXT: mov ms, r2; vextract.32 r2, x4, #7, vaddsign1 +; CHECK-NEXT: mov ms, r0; vextract.32 r0, x4, #8, vaddsign1 +; CHECK-NEXT: mov ms, r2; vextract.32 r2, x4, #9, vaddsign1 +; CHECK-NEXT: mov ms, r0; vextract.32 r0, x4, #10, vaddsign1 +; CHECK-NEXT: mov ms, r2; vextract.32 r2, x4, #11, vaddsign1 +; CHECK-NEXT: mov ms, r0; vextract.32 r0, x4, #12, vaddsign1 +; CHECK-NEXT: mov ms, r2; vextract.32 r2, x4, #13, vaddsign1 +; CHECK-NEXT: mov ms, r0; vextract.32 r0, x4, #14, vaddsign1 +; CHECK-NEXT: mov ms, r2; vextract.32 r2, x4, #15, vaddsign1 +; CHECK-NEXT: mov ms, r0; vextract.32 r0, x5, #0, vaddsign1 +; CHECK-NEXT: mov ms, r2; vextract.32 r2, x5, #1, vaddsign1 +; CHECK-NEXT: mov ms, r0; vextract.32 r0, x5, #2, vaddsign1 +; CHECK-NEXT: mov ms, r2; vextract.32 r2, x5, #3, vaddsign1 +; CHECK-NEXT: mov ms, r0; vextract.32 r0, x5, #4, vaddsign1 +; CHECK-NEXT: mov ms, r2; vextract.32 r2, x5, #5, vaddsign1 +; CHECK-NEXT: mov ms, r0; vextract.32 r0, x5, #6, vaddsign1 +; CHECK-NEXT: mov ms, r2; vextract.32 r2, x5, #7, vaddsign1 +; CHECK-NEXT: mov ms, r0; vextract.32 r0, x5, #8, vaddsign1 +; CHECK-NEXT: mov ms, r2; vextract.32 r2, x5, #9, vaddsign1 +; CHECK-NEXT: mov ms, r0; vextract.32 r0, x5, #10, vaddsign1 +; CHECK-NEXT: mov ms, r2; vextract.32 r2, x5, #11, vaddsign1 +; CHECK-NEXT: mov ms, r0; vextract.32 r0, x5, #12, vaddsign1 +; CHECK-NEXT: mov ms, r2; vextract.32 r2, x5, #13, vaddsign1 +; CHECK-NEXT: mov ms, r0; ret lr; vextract.32 r0, x5, #14, vaddsign1 +; CHECK-NEXT: mov ms, r2; vextract.32 r2, x5, #15, vaddsign1 // Delay Slot 5 +; CHECK-NEXT: mov ms, r0 // Delay Slot 4 +; CHECK-NEXT: mov ms, r2, r28 // Delay Slot 3 ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: diff --git a/llvm/test/CodeGen/AIE/aie2p/vmac.ll b/llvm/test/CodeGen/AIE/aie2p/vmac.ll index c315920d4b12..e512da792d18 100644 --- a/llvm/test/CodeGen/AIE/aie2p/vmac.ll +++ b/llvm/test/CodeGen/AIE/aie2p/vmac.ll @@ -105,115 +105,117 @@ define dso_local inreg noundef <64 x i32> @_Z27test_addmac_4x16_16x16_confDv64_h ; CHECK-LABEL: _Z27test_addmac_4x16_16x16_confDv64_hiDv128_DB8_iDv64_u7__acc32S2_iiiii: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: mova r7, #50; mov crunpacksize, #0 -; CHECK-NEXT: mova r23, #5; vshuffle x0, x0, x0, r7 -; CHECK-NEXT: mova r28, #1; vextract.32 r29, x4, r23, vaddsign1 -; CHECK-NEXT: mova r20, #2; vextract.32 r16, x0, r28, vaddsign1 -; CHECK-NEXT: mova r21, #3; vextract.32 r17, x0, r20, vaddsign1 -; CHECK-NEXT: mova r22, #4; vextract.32 r18, x0, r21, vaddsign1 -; CHECK-NEXT: vextract.32 r19, x0, r22, vaddsign1 -; CHECK-NEXT: mova r24, #0; vextract.32 r25, x0, r23, vaddsign1 -; CHECK-NEXT: vextract.32 r7, x0, r24, vaddsign1 -; CHECK-NEXT: vextract.32 r23, x5, r23, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x2, x0, r7 -; CHECK-NEXT: vpush.hi.32 x2, x2, r16 +; CHECK-NEXT: nopa ; nopb ; nops ; nopx ; mov crunpacksize, #0; nopv +; CHECK-NEXT: paddxm [sp], #64 +; CHECK-NEXT: vextract.32 r25, x4, #3, vaddsign1 +; CHECK-NEXT: vextract.32 r26, x4, #4, vaddsign1 +; CHECK-NEXT: vextract.32 r27, x4, #5, vaddsign1 +; CHECK-NEXT: vextract.32 r28, x4, #6, vaddsign1 +; CHECK-NEXT: vextract.32 r29, x4, #7, vaddsign1 +; CHECK-NEXT: vextract.32 r30, x4, #8, vaddsign1 +; CHECK-NEXT: mova r7, #50; vextract.32 r31, x4, #9, vaddsign1 +; CHECK-NEXT: vshuffle x2, x0, x0, r7 +; CHECK-NEXT: vextract.32 r8, x4, #10, vaddsign1 +; CHECK-NEXT: st r8, [sp, #-48]; vextract.32 r9, x4, #11, vaddsign1 // 4-byte Folded Spill +; CHECK-NEXT: st r9, [sp, #-52]; vextract.32 r10, x4, #12, vaddsign1 // 4-byte Folded Spill +; CHECK-NEXT: st r10, [sp, #-56]; vextract.32 r11, x4, #13, vaddsign1 // 4-byte Folded Spill +; CHECK-NEXT: st r11, [sp, #-60]; vextract.32 r12, x4, #14, vaddsign1 // 4-byte Folded Spill +; CHECK-NEXT: st r12, [sp, #-64]; vextract.32 r18, x2, #1, vaddsign1 // 4-byte Folded Spill +; CHECK-NEXT: vextract.32 r19, x2, #2, vaddsign1 +; CHECK-NEXT: vextract.32 r20, x2, #3, vaddsign1 +; CHECK-NEXT: vextract.32 r21, x2, #4, vaddsign1 +; CHECK-NEXT: vextract.32 r22, x2, #5, vaddsign1 +; CHECK-NEXT: vextract.32 r16, x2, #6, vaddsign1 +; CHECK-NEXT: vextract.32 r7, x2, #7, vaddsign1 +; CHECK-NEXT: vextract.32 r23, x2, #9, vaddsign1 +; CHECK-NEXT: vextract.32 r17, x2, #0, vaddsign1 +; CHECK-NEXT: vextract.32 r24, x2, #10, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x0, x0, r17 +; CHECK-NEXT: vextract.32 r17, x2, #15, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x0, x0, r18 +; CHECK-NEXT: vextract.32 r18, x2, #14, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x0, x0, r19 +; CHECK-NEXT: vextract.32 r19, x2, #13, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x0, x0, r20 +; CHECK-NEXT: vextract.32 r20, x2, #12, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x0, x0, r21 +; CHECK-NEXT: vextract.32 r21, x2, #11, vaddsign1 +; CHECK-NEXT: vextract.32 r22, x2, #8, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x0, x0, r22 +; CHECK-NEXT: vpush.hi.32 x2, x0, r22 +; CHECK-NEXT: vextract.32 r22, x4, #0, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x2, x2, r23 +; CHECK-NEXT: vextract.32 r23, x4, #1, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x2, x2, r24 +; CHECK-NEXT: vpush.hi.32 x6, x0, r22 +; CHECK-NEXT: vextract.32 r24, x4, #2, vaddsign1 +; CHECK-NEXT: vextract.32 r22, x4, #15, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x4, x6, r23 +; CHECK-NEXT: vpush.hi.32 x2, x2, r21 +; CHECK-NEXT: vextract.32 r23, x5, #1, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x4, x4, r24 +; CHECK-NEXT: vpush.hi.32 x2, x2, r20 +; CHECK-NEXT: vextract.32 r24, x5, #2, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x4, x4, r25 +; CHECK-NEXT: vpush.hi.32 x2, x2, r19 +; CHECK-NEXT: vextract.32 r25, x5, #3, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x4, x4, r26 +; CHECK-NEXT: vpush.hi.32 x2, x2, r18 +; CHECK-NEXT: vextract.32 r26, x5, #4, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x4, x4, r27 ; CHECK-NEXT: vpush.hi.32 x2, x2, r17 -; CHECK-NEXT: mova r17, #11; vpush.hi.32 x2, x2, r18 -; CHECK-NEXT: vextract.32 r27, x0, r17, vaddsign1 -; CHECK-NEXT: mova r7, #8; vpush.hi.32 x2, x2, r19 -; CHECK-NEXT: mova r18, #10; vextract.32 r19, x0, r7, vaddsign1 -; CHECK-NEXT: vextract.32 r26, x0, r18, vaddsign1 -; CHECK-NEXT: mova r16, #9; vpush.hi.32 x2, x2, r25 -; CHECK-NEXT: vextract.32 r25, x0, r16, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x6, x0, r19 -; CHECK-NEXT: mova r19, #12; vpush.hi.32 x6, x6, r25 -; CHECK-NEXT: vextract.32 r25, x0, r19, vaddsign1 +; CHECK-NEXT: vextract.32 r27, x5, #5, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x4, x4, r28 +; CHECK-NEXT: vextract.32 r28, x5, #6, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x4, x4, r29 +; CHECK-NEXT: vextract.32 r29, x5, #7, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x4, x4, r30 +; CHECK-NEXT: vextract.32 r30, x5, #8, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x4, x4, r31 +; CHECK-NEXT: vextract.32 r31, x5, #9, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x4, x4, r8 +; CHECK-NEXT: vextract.32 r8, x5, #10, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x4, x4, r9 +; CHECK-NEXT: vextract.32 r9, x5, #11, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x4, x4, r10 +; CHECK-NEXT: vextract.32 r10, x5, #12, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x4, x4, r11 +; CHECK-NEXT: vextract.32 r11, x5, #13, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x4, x4, r12 +; CHECK-NEXT: vextract.32 r12, x5, #14, vaddsign1 +; CHECK-NEXT: vextract.32 r22, x5, #0, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x4, x4, r22 +; CHECK-NEXT: vpush.hi.32 x6, x0, r22 +; CHECK-NEXT: vpush.hi.32 x0, x0, r16 +; CHECK-NEXT: vextract.32 r22, x5, #15, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x6, x6, r23 +; CHECK-NEXT: vpush.hi.32 x0, x0, r7 +; CHECK-NEXT: nez r23, r1; vpush.hi.32 x6, x6, r24 +; CHECK-NEXT: mov unpacksign0, r23 +; CHECK-NEXT: vmov wl0, wh2 +; CHECK-NEXT: vunpack y2, x4, unpacksign0; vpush.hi.32 x6, x6, r25 ; CHECK-NEXT: vpush.hi.32 x6, x6, r26 -; CHECK-NEXT: vextract.32 r26, x4, r28, vaddsign1 -; CHECK-NEXT: vextract.32 r28, x5, r28, vaddsign1 ; CHECK-NEXT: vpush.hi.32 x6, x6, r27 -; CHECK-NEXT: vextract.32 r27, x4, r20, vaddsign1 -; CHECK-NEXT: vextract.32 r20, x5, r20, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x6, x6, r25 -; CHECK-NEXT: vextract.32 r25, x4, r24, vaddsign1 -; CHECK-NEXT: vextract.32 r24, x5, r24, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x8, x0, r25 -; CHECK-NEXT: vextract.32 r25, x4, r21, vaddsign1 -; CHECK-NEXT: vextract.32 r21, x5, r21, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x8, x8, r26 -; CHECK-NEXT: vextract.32 r26, x4, r22, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x8, x8, r27 -; CHECK-NEXT: vpush.hi.32 x8, x8, r25 -; CHECK-NEXT: mova r27, #6; vpush.hi.32 x8, x8, r26 -; CHECK-NEXT: vextract.32 r26, x4, r27, vaddsign1 -; CHECK-NEXT: mova r25, #13; vpush.hi.32 x8, x8, r29 -; CHECK-NEXT: vextract.32 r29, x0, r25, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x8, x8, r26 -; CHECK-NEXT: mova r26, #7; vpush.hi.32 x6, x6, r29 -; CHECK-NEXT: vextract.32 r29, x4, r26, vaddsign1 -; CHECK-NEXT: vextract.32 r29, x4, r7, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x8, x8, r29 -; CHECK-NEXT: vpush.hi.32 x8, x8, r29 -; CHECK-NEXT: vextract.32 r29, x4, r16, vaddsign1 -; CHECK-NEXT: vextract.32 r29, x4, r18, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x8, x8, r29 -; CHECK-NEXT: vpush.hi.32 x8, x8, r29 -; CHECK-NEXT: vextract.32 r29, x4, r17, vaddsign1 -; CHECK-NEXT: vextract.32 r29, x4, r19, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x8, x8, r29 -; CHECK-NEXT: vpush.hi.32 x8, x8, r29 -; CHECK-NEXT: mova r30, #14; vextract.32 r29, x4, r25, vaddsign1 -; CHECK-NEXT: vextract.32 r29, x4, r30, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x8, x8, r29 -; CHECK-NEXT: mova r29, #15; vpush.hi.32 x8, x8, r29 -; CHECK-NEXT: vextract.32 r31, x4, r29, vaddsign1 -; CHECK-NEXT: vextract.32 r22, x5, r22, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x4, x8, r31 -; CHECK-NEXT: vpush.hi.32 x8, x0, r24 -; CHECK-NEXT: vpush.hi.32 x8, x8, r28 -; CHECK-NEXT: vpush.hi.32 x8, x8, r20 -; CHECK-NEXT: vextract.32 r20, x5, r27, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x8, x8, r21 -; CHECK-NEXT: vextract.32 r21, x5, r26, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x8, x8, r22 -; CHECK-NEXT: vextract.32 r22, x5, r7, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x8, x8, r23 -; CHECK-NEXT: vextract.32 r23, x5, r16, vaddsign1 -; CHECK-NEXT: vextract.32 r16, x0, r30, vaddsign1 -; CHECK-NEXT: lshl r0, r0, r16; vpush.hi.32 x8, x8, r20 -; CHECK-NEXT: vextract.32 r20, x5, r18, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x8, x8, r21 -; CHECK-NEXT: vpush.hi.32 x6, x6, r16 -; CHECK-NEXT: vextract.32 r21, x5, r17, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x8, x8, r22 -; CHECK-NEXT: vextract.32 r22, x5, r19, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x8, x8, r23 -; CHECK-NEXT: vextract.32 r23, x5, r25, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x8, x8, r20 -; CHECK-NEXT: vextract.32 r20, x5, r30, vaddsign1 -; CHECK-NEXT: lshl r4, r4, r17; vpush.hi.32 x8, x8, r21 -; CHECK-NEXT: nez r21, r1; vpush.hi.32 x8, x8, r22 -; CHECK-NEXT: lshl r3, r3, r18; mov unpacksign0, r21 -; CHECK-NEXT: lshl r1, r1, r7; vextract.32 r22, x5, r29, vaddsign1 -; CHECK-NEXT: or r0, r1, r0; vextract.32 r2, x0, r27, vaddsign1 -; CHECK-NEXT: or r1, r3, r2; vextract.32 r3, x0, r29, vaddsign1 -; CHECK-NEXT: lshl r5, r5, r19; vpush.hi.32 x8, x8, r23 -; CHECK-NEXT: vunpack y2, x4, unpacksign0; or r1, r1, r4; vextract.32 r5, x0, r26, vaddsign1 -; CHECK-NEXT: or r1, r1, r5; vpush.hi.32 x0, x6, r3 -; CHECK-NEXT: vunpack y4, x8, unpacksign0; vpush.hi.32 x8, x8, r20 -; CHECK-NEXT: vpush.hi.32 x2, x2, r2 -; CHECK-NEXT: vmov wl0, wh0 -; CHECK-NEXT: vpush.hi.32 x8, x8, r22 -; CHECK-NEXT: lshl r4, r6, r25; mov unpacksign0, #0 -; CHECK-NEXT: mova r3, #200; or r1, r1, r0; mov unpacksign0, r21 -; CHECK-NEXT: or r1, r1, r3; vpush.hi.32 x2, x2, r5 -; CHECK-NEXT: or r0, r0, r4; vmov wl0, wh2; vmac dm0, dm1, x0, y4,r1 -; CHECK-NEXT: or r0, r0, r3 +; CHECK-NEXT: mova r24, #10; vpush.hi.32 x6, x6, r28 +; CHECK-NEXT: mova r24, #12; lshl r3, r3, r24; vpush.hi.32 x6, x6, r29 +; CHECK-NEXT: mova r25, #11; lshl r5, r5, r24; vpush.hi.32 x6, x6, r30 +; CHECK-NEXT: mova r25, #9; lshl r4, r4, r25; mov unpacksign0, #0 +; CHECK-NEXT: mova r24, #8; lshl r0, r0, r25; vpush.hi.32 x6, x6, r31 +; CHECK-NEXT: lshl r1, r1, r24; vpush.hi.32 x6, x6, r8 +; CHECK-NEXT: vunpack y3, x6, unpacksign0; or r0, r1, r0; vpush.hi.32 x6, x6, r9 +; CHECK-NEXT: or r1, r3, r2; vpush.hi.32 x6, x6, r10 +; CHECK-NEXT: mova r3, #13; or r1, r1, r4; vpush.hi.32 x6, x6, r11 +; CHECK-NEXT: mova r2, #200; lshl r3, r6, r3; vpush.hi.32 x6, x6, r12 +; CHECK-NEXT: lda r8, [sp, #-48]; or r1, r1, r5; vpush.hi.32 x6, x6, r22 // 4-byte Folded Reload +; CHECK-NEXT: lda r9, [sp, #-52]; or r1, r1, r0; mov unpacksign0, r23 // 4-byte Folded Reload +; CHECK-NEXT: lda r10, [sp, #-56]; or r1, r1, r2 // 4-byte Folded Reload +; CHECK-NEXT: lda r11, [sp, #-60]; or r0, r0, r3; vmov wl0, wh0; vmac dm0, dm1, x0, y3,r1 // 4-byte Folded Reload +; CHECK-NEXT: lda r12, [sp, #-64]; or r0, r0, r2 // 4-byte Folded Reload ; CHECK-NEXT: ret lr; vaddmac dm0, dm0, dm2, x0, y2,r0 ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: nop // Delay Slot 2 +; CHECK-NEXT: paddxm [sp], #-64 // Delay Slot 2 ; CHECK-NEXT: mov unpacksign0, #0 // Delay Slot 1 entry: %0 = bitcast <64 x i8> %a to <16 x i32> @@ -258,115 +260,117 @@ define dso_local inreg noundef <64 x i32> @_Z27test_addmsc_4x16_16x16_confDv64_h ; CHECK-LABEL: _Z27test_addmsc_4x16_16x16_confDv64_hiDv128_DB8_iDv64_u7__acc32S2_iiiii: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: mova r7, #50; mov crunpacksize, #0 -; CHECK-NEXT: mova r23, #5; vshuffle x0, x0, x0, r7 -; CHECK-NEXT: mova r28, #1; vextract.32 r29, x4, r23, vaddsign1 -; CHECK-NEXT: mova r20, #2; vextract.32 r16, x0, r28, vaddsign1 -; CHECK-NEXT: mova r21, #3; vextract.32 r17, x0, r20, vaddsign1 -; CHECK-NEXT: mova r22, #4; vextract.32 r18, x0, r21, vaddsign1 -; CHECK-NEXT: vextract.32 r19, x0, r22, vaddsign1 -; CHECK-NEXT: mova r24, #0; vextract.32 r25, x0, r23, vaddsign1 -; CHECK-NEXT: vextract.32 r7, x0, r24, vaddsign1 -; CHECK-NEXT: vextract.32 r23, x5, r23, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x2, x0, r7 -; CHECK-NEXT: vpush.hi.32 x2, x2, r16 +; CHECK-NEXT: nopa ; nopb ; nops ; nopx ; mov crunpacksize, #0; nopv +; CHECK-NEXT: paddxm [sp], #64 +; CHECK-NEXT: vextract.32 r25, x4, #3, vaddsign1 +; CHECK-NEXT: vextract.32 r26, x4, #4, vaddsign1 +; CHECK-NEXT: vextract.32 r27, x4, #5, vaddsign1 +; CHECK-NEXT: vextract.32 r28, x4, #6, vaddsign1 +; CHECK-NEXT: vextract.32 r29, x4, #7, vaddsign1 +; CHECK-NEXT: vextract.32 r30, x4, #8, vaddsign1 +; CHECK-NEXT: mova r7, #50; vextract.32 r31, x4, #9, vaddsign1 +; CHECK-NEXT: vshuffle x2, x0, x0, r7 +; CHECK-NEXT: vextract.32 r8, x4, #10, vaddsign1 +; CHECK-NEXT: st r8, [sp, #-48]; vextract.32 r9, x4, #11, vaddsign1 // 4-byte Folded Spill +; CHECK-NEXT: st r9, [sp, #-52]; vextract.32 r10, x4, #12, vaddsign1 // 4-byte Folded Spill +; CHECK-NEXT: st r10, [sp, #-56]; vextract.32 r11, x4, #13, vaddsign1 // 4-byte Folded Spill +; CHECK-NEXT: st r11, [sp, #-60]; vextract.32 r12, x4, #14, vaddsign1 // 4-byte Folded Spill +; CHECK-NEXT: st r12, [sp, #-64]; vextract.32 r18, x2, #1, vaddsign1 // 4-byte Folded Spill +; CHECK-NEXT: vextract.32 r19, x2, #2, vaddsign1 +; CHECK-NEXT: vextract.32 r20, x2, #3, vaddsign1 +; CHECK-NEXT: vextract.32 r21, x2, #4, vaddsign1 +; CHECK-NEXT: vextract.32 r22, x2, #5, vaddsign1 +; CHECK-NEXT: vextract.32 r16, x2, #6, vaddsign1 +; CHECK-NEXT: vextract.32 r7, x2, #7, vaddsign1 +; CHECK-NEXT: vextract.32 r23, x2, #9, vaddsign1 +; CHECK-NEXT: vextract.32 r17, x2, #0, vaddsign1 +; CHECK-NEXT: vextract.32 r24, x2, #10, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x0, x0, r17 +; CHECK-NEXT: vextract.32 r17, x2, #15, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x0, x0, r18 +; CHECK-NEXT: vextract.32 r18, x2, #14, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x0, x0, r19 +; CHECK-NEXT: vextract.32 r19, x2, #13, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x0, x0, r20 +; CHECK-NEXT: vextract.32 r20, x2, #12, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x0, x0, r21 +; CHECK-NEXT: vextract.32 r21, x2, #11, vaddsign1 +; CHECK-NEXT: vextract.32 r22, x2, #8, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x0, x0, r22 +; CHECK-NEXT: vpush.hi.32 x2, x0, r22 +; CHECK-NEXT: vextract.32 r22, x4, #0, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x2, x2, r23 +; CHECK-NEXT: vextract.32 r23, x4, #1, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x2, x2, r24 +; CHECK-NEXT: vpush.hi.32 x6, x0, r22 +; CHECK-NEXT: vextract.32 r24, x4, #2, vaddsign1 +; CHECK-NEXT: vextract.32 r22, x4, #15, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x4, x6, r23 +; CHECK-NEXT: vpush.hi.32 x2, x2, r21 +; CHECK-NEXT: vextract.32 r23, x5, #1, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x4, x4, r24 +; CHECK-NEXT: vpush.hi.32 x2, x2, r20 +; CHECK-NEXT: vextract.32 r24, x5, #2, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x4, x4, r25 +; CHECK-NEXT: vpush.hi.32 x2, x2, r19 +; CHECK-NEXT: vextract.32 r25, x5, #3, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x4, x4, r26 +; CHECK-NEXT: vpush.hi.32 x2, x2, r18 +; CHECK-NEXT: vextract.32 r26, x5, #4, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x4, x4, r27 ; CHECK-NEXT: vpush.hi.32 x2, x2, r17 -; CHECK-NEXT: mova r17, #11; vpush.hi.32 x2, x2, r18 -; CHECK-NEXT: vextract.32 r27, x0, r17, vaddsign1 -; CHECK-NEXT: mova r7, #8; vpush.hi.32 x2, x2, r19 -; CHECK-NEXT: mova r18, #10; vextract.32 r19, x0, r7, vaddsign1 -; CHECK-NEXT: vextract.32 r26, x0, r18, vaddsign1 -; CHECK-NEXT: mova r16, #9; vpush.hi.32 x2, x2, r25 -; CHECK-NEXT: vextract.32 r25, x0, r16, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x6, x0, r19 -; CHECK-NEXT: mova r19, #12; vpush.hi.32 x6, x6, r25 -; CHECK-NEXT: vextract.32 r25, x0, r19, vaddsign1 +; CHECK-NEXT: vextract.32 r27, x5, #5, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x4, x4, r28 +; CHECK-NEXT: vextract.32 r28, x5, #6, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x4, x4, r29 +; CHECK-NEXT: vextract.32 r29, x5, #7, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x4, x4, r30 +; CHECK-NEXT: vextract.32 r30, x5, #8, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x4, x4, r31 +; CHECK-NEXT: vextract.32 r31, x5, #9, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x4, x4, r8 +; CHECK-NEXT: vextract.32 r8, x5, #10, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x4, x4, r9 +; CHECK-NEXT: vextract.32 r9, x5, #11, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x4, x4, r10 +; CHECK-NEXT: vextract.32 r10, x5, #12, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x4, x4, r11 +; CHECK-NEXT: vextract.32 r11, x5, #13, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x4, x4, r12 +; CHECK-NEXT: vextract.32 r12, x5, #14, vaddsign1 +; CHECK-NEXT: vextract.32 r22, x5, #0, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x4, x4, r22 +; CHECK-NEXT: vpush.hi.32 x6, x0, r22 +; CHECK-NEXT: vpush.hi.32 x0, x0, r16 +; CHECK-NEXT: vextract.32 r22, x5, #15, vaddsign1 +; CHECK-NEXT: vpush.hi.32 x6, x6, r23 +; CHECK-NEXT: vpush.hi.32 x0, x0, r7 +; CHECK-NEXT: nez r23, r1; vpush.hi.32 x6, x6, r24 +; CHECK-NEXT: mov unpacksign0, r23 +; CHECK-NEXT: vmov wl0, wh2 +; CHECK-NEXT: vunpack y2, x4, unpacksign0; vpush.hi.32 x6, x6, r25 ; CHECK-NEXT: vpush.hi.32 x6, x6, r26 -; CHECK-NEXT: vextract.32 r26, x4, r28, vaddsign1 -; CHECK-NEXT: vextract.32 r28, x5, r28, vaddsign1 ; CHECK-NEXT: vpush.hi.32 x6, x6, r27 -; CHECK-NEXT: vextract.32 r27, x4, r20, vaddsign1 -; CHECK-NEXT: vextract.32 r20, x5, r20, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x6, x6, r25 -; CHECK-NEXT: vextract.32 r25, x4, r24, vaddsign1 -; CHECK-NEXT: vextract.32 r24, x5, r24, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x8, x0, r25 -; CHECK-NEXT: vextract.32 r25, x4, r21, vaddsign1 -; CHECK-NEXT: vextract.32 r21, x5, r21, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x8, x8, r26 -; CHECK-NEXT: vextract.32 r26, x4, r22, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x8, x8, r27 -; CHECK-NEXT: vpush.hi.32 x8, x8, r25 -; CHECK-NEXT: mova r27, #6; vpush.hi.32 x8, x8, r26 -; CHECK-NEXT: vextract.32 r26, x4, r27, vaddsign1 -; CHECK-NEXT: mova r25, #13; vpush.hi.32 x8, x8, r29 -; CHECK-NEXT: vextract.32 r29, x0, r25, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x8, x8, r26 -; CHECK-NEXT: mova r26, #7; vpush.hi.32 x6, x6, r29 -; CHECK-NEXT: vextract.32 r29, x4, r26, vaddsign1 -; CHECK-NEXT: vextract.32 r29, x4, r7, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x8, x8, r29 -; CHECK-NEXT: vpush.hi.32 x8, x8, r29 -; CHECK-NEXT: vextract.32 r29, x4, r16, vaddsign1 -; CHECK-NEXT: vextract.32 r29, x4, r18, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x8, x8, r29 -; CHECK-NEXT: vpush.hi.32 x8, x8, r29 -; CHECK-NEXT: vextract.32 r29, x4, r17, vaddsign1 -; CHECK-NEXT: vextract.32 r29, x4, r19, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x8, x8, r29 -; CHECK-NEXT: vpush.hi.32 x8, x8, r29 -; CHECK-NEXT: mova r30, #14; vextract.32 r29, x4, r25, vaddsign1 -; CHECK-NEXT: vextract.32 r29, x4, r30, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x8, x8, r29 -; CHECK-NEXT: mova r29, #15; vpush.hi.32 x8, x8, r29 -; CHECK-NEXT: vextract.32 r31, x4, r29, vaddsign1 -; CHECK-NEXT: vextract.32 r22, x5, r22, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x4, x8, r31 -; CHECK-NEXT: vpush.hi.32 x8, x0, r24 -; CHECK-NEXT: vpush.hi.32 x8, x8, r28 -; CHECK-NEXT: vpush.hi.32 x8, x8, r20 -; CHECK-NEXT: vextract.32 r20, x5, r27, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x8, x8, r21 -; CHECK-NEXT: vextract.32 r21, x5, r26, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x8, x8, r22 -; CHECK-NEXT: vextract.32 r22, x5, r7, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x8, x8, r23 -; CHECK-NEXT: vextract.32 r23, x5, r16, vaddsign1 -; CHECK-NEXT: vextract.32 r16, x0, r30, vaddsign1 -; CHECK-NEXT: lshl r0, r0, r16; vpush.hi.32 x8, x8, r20 -; CHECK-NEXT: vextract.32 r20, x5, r18, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x8, x8, r21 -; CHECK-NEXT: vpush.hi.32 x6, x6, r16 -; CHECK-NEXT: vextract.32 r21, x5, r17, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x8, x8, r22 -; CHECK-NEXT: vextract.32 r22, x5, r19, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x8, x8, r23 -; CHECK-NEXT: vextract.32 r23, x5, r25, vaddsign1 -; CHECK-NEXT: vpush.hi.32 x8, x8, r20 -; CHECK-NEXT: vextract.32 r20, x5, r30, vaddsign1 -; CHECK-NEXT: lshl r4, r4, r17; vpush.hi.32 x8, x8, r21 -; CHECK-NEXT: nez r21, r1; vpush.hi.32 x8, x8, r22 -; CHECK-NEXT: lshl r3, r3, r18; mov unpacksign0, r21 -; CHECK-NEXT: lshl r1, r1, r7; vextract.32 r22, x5, r29, vaddsign1 -; CHECK-NEXT: or r0, r1, r0; vextract.32 r2, x0, r27, vaddsign1 -; CHECK-NEXT: or r1, r3, r2; vextract.32 r3, x0, r29, vaddsign1 -; CHECK-NEXT: lshl r5, r5, r19; vpush.hi.32 x8, x8, r23 -; CHECK-NEXT: vunpack y2, x4, unpacksign0; or r1, r1, r4; vextract.32 r5, x0, r26, vaddsign1 -; CHECK-NEXT: or r1, r1, r5; vpush.hi.32 x0, x6, r3 -; CHECK-NEXT: vunpack y4, x8, unpacksign0; vpush.hi.32 x8, x8, r20 -; CHECK-NEXT: vpush.hi.32 x2, x2, r2 -; CHECK-NEXT: vmov wl0, wh0 -; CHECK-NEXT: vpush.hi.32 x8, x8, r22 -; CHECK-NEXT: lshl r4, r6, r25; mov unpacksign0, #0 -; CHECK-NEXT: mova r3, #200; or r1, r1, r0; mov unpacksign0, r21 -; CHECK-NEXT: or r1, r1, r3; vpush.hi.32 x2, x2, r5 -; CHECK-NEXT: or r0, r0, r4; vmov wl0, wh2; vmsc dm0, dm1, x0, y4,r1 -; CHECK-NEXT: or r0, r0, r3 +; CHECK-NEXT: mova r24, #10; vpush.hi.32 x6, x6, r28 +; CHECK-NEXT: mova r24, #12; lshl r3, r3, r24; vpush.hi.32 x6, x6, r29 +; CHECK-NEXT: mova r25, #11; lshl r5, r5, r24; vpush.hi.32 x6, x6, r30 +; CHECK-NEXT: mova r25, #9; lshl r4, r4, r25; mov unpacksign0, #0 +; CHECK-NEXT: mova r24, #8; lshl r0, r0, r25; vpush.hi.32 x6, x6, r31 +; CHECK-NEXT: lshl r1, r1, r24; vpush.hi.32 x6, x6, r8 +; CHECK-NEXT: vunpack y3, x6, unpacksign0; or r0, r1, r0; vpush.hi.32 x6, x6, r9 +; CHECK-NEXT: or r1, r3, r2; vpush.hi.32 x6, x6, r10 +; CHECK-NEXT: mova r3, #13; or r1, r1, r4; vpush.hi.32 x6, x6, r11 +; CHECK-NEXT: mova r2, #200; lshl r3, r6, r3; vpush.hi.32 x6, x6, r12 +; CHECK-NEXT: lda r8, [sp, #-48]; or r1, r1, r5; vpush.hi.32 x6, x6, r22 // 4-byte Folded Reload +; CHECK-NEXT: lda r9, [sp, #-52]; or r1, r1, r0; mov unpacksign0, r23 // 4-byte Folded Reload +; CHECK-NEXT: lda r10, [sp, #-56]; or r1, r1, r2 // 4-byte Folded Reload +; CHECK-NEXT: lda r11, [sp, #-60]; or r0, r0, r3; vmov wl0, wh0; vmsc dm0, dm1, x0, y3,r1 // 4-byte Folded Reload +; CHECK-NEXT: lda r12, [sp, #-64]; or r0, r0, r2 // 4-byte Folded Reload ; CHECK-NEXT: ret lr; vaddmsc dm0, dm0, dm2, x0, y2,r0 ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: nop // Delay Slot 2 +; CHECK-NEXT: paddxm [sp], #-64 // Delay Slot 2 ; CHECK-NEXT: mov unpacksign0, #0 // Delay Slot 1 entry: %0 = bitcast <64 x i8> %a to <16 x i32> diff --git a/llvm/test/CodeGen/AIE/extractelement.ll b/llvm/test/CodeGen/AIE/extractelement.ll index d8396d2ba51e..6083cc4f8bf5 100644 --- a/llvm/test/CodeGen/AIE/extractelement.ll +++ b/llvm/test/CodeGen/AIE/extractelement.ll @@ -72,11 +72,11 @@ define signext i8 @extract_v16i8_signext(<16 x i8> %v) nounwind { ; AIE2P-LABEL: extract_v16i8_signext: ; AIE2P: .p2align 4 ; AIE2P-NEXT: // %bb.0: -; AIE2P-NEXT: nopa ; nopb ; nops ; ret lr; nopm ; nopv -; AIE2P-NEXT: nopx // Delay Slot 5 +; AIE2P-NEXT: ret lr +; AIE2P-NEXT: nop // Delay Slot 5 ; AIE2P-NEXT: nop // Delay Slot 4 -; AIE2P-NEXT: mova r0, #0 // Delay Slot 3 -; AIE2P-NEXT: vextract.8 r0, x0, r0, vaddsign1 // Delay Slot 2 +; AIE2P-NEXT: nop // Delay Slot 3 +; AIE2P-NEXT: vextract.8 r0, x0, #0, vaddsign1 // Delay Slot 2 ; AIE2P-NEXT: nop // Delay Slot 1 %1 = extractelement <16 x i8> %v, i32 0 ret i8 %1 @@ -96,11 +96,11 @@ define zeroext i8 @extract_v16i8_zeroext(<16 x i8> %v) nounwind { ; AIE2P-LABEL: extract_v16i8_zeroext: ; AIE2P: .p2align 4 ; AIE2P-NEXT: // %bb.0: -; AIE2P-NEXT: nopa ; nopb ; nops ; ret lr; nopm ; nopv -; AIE2P-NEXT: nopx // Delay Slot 5 +; AIE2P-NEXT: ret lr +; AIE2P-NEXT: nop // Delay Slot 5 ; AIE2P-NEXT: nop // Delay Slot 4 -; AIE2P-NEXT: mova r0, #0 // Delay Slot 3 -; AIE2P-NEXT: vextract.8 r0, x0, r0, vaddsign0 // Delay Slot 2 +; AIE2P-NEXT: nop // Delay Slot 3 +; AIE2P-NEXT: vextract.8 r0, x0, #0, vaddsign0 // Delay Slot 2 ; AIE2P-NEXT: nop // Delay Slot 1 %1 = extractelement <16 x i8> %v, i32 0 ret i8 %1 @@ -144,11 +144,11 @@ define signext i16 @extract_v8i16_signext(<8 x i16> %v) nounwind { ; AIE2P-LABEL: extract_v8i16_signext: ; AIE2P: .p2align 4 ; AIE2P-NEXT: // %bb.0: -; AIE2P-NEXT: nopa ; nopb ; nops ; ret lr; nopm ; nopv -; AIE2P-NEXT: nopx // Delay Slot 5 +; AIE2P-NEXT: ret lr +; AIE2P-NEXT: nop // Delay Slot 5 ; AIE2P-NEXT: nop // Delay Slot 4 -; AIE2P-NEXT: mova r0, #0 // Delay Slot 3 -; AIE2P-NEXT: vextract.16 r0, x0, r0, vaddsign1 // Delay Slot 2 +; AIE2P-NEXT: nop // Delay Slot 3 +; AIE2P-NEXT: vextract.16 r0, x0, #0, vaddsign1 // Delay Slot 2 ; AIE2P-NEXT: nop // Delay Slot 1 %1 = extractelement <8 x i16> %v, i32 0 ret i16 %1 @@ -168,11 +168,11 @@ define zeroext i16 @extract_v8i16_zeroext(<8 x i16> %v) nounwind { ; AIE2P-LABEL: extract_v8i16_zeroext: ; AIE2P: .p2align 4 ; AIE2P-NEXT: // %bb.0: -; AIE2P-NEXT: nopa ; nopb ; nops ; ret lr; nopm ; nopv -; AIE2P-NEXT: nopx // Delay Slot 5 +; AIE2P-NEXT: ret lr +; AIE2P-NEXT: nop // Delay Slot 5 ; AIE2P-NEXT: nop // Delay Slot 4 -; AIE2P-NEXT: mova r0, #0 // Delay Slot 3 -; AIE2P-NEXT: vextract.16 r0, x0, r0, vaddsign0 // Delay Slot 2 +; AIE2P-NEXT: nop // Delay Slot 3 +; AIE2P-NEXT: vextract.16 r0, x0, #0, vaddsign0 // Delay Slot 2 ; AIE2P-NEXT: nop // Delay Slot 1 %1 = extractelement <8 x i16> %v, i32 0 ret i16 %1 @@ -234,25 +234,21 @@ define i32 @extract_v4i32(<4 x i32> %v) nounwind { ; AIE2P-LABEL: extract_v4i32: ; AIE2P: .p2align 4 ; AIE2P-NEXT: // %bb.0: -; AIE2P-NEXT: mova r0, #0; nopx -; AIE2P-NEXT: mova r2, #1 -; AIE2P-NEXT: mova r3, #2 -; AIE2P-NEXT: mova r4, #3 -; AIE2P-NEXT: vextract.32 r2, x0, r2, vaddsign1 -; AIE2P-NEXT: vextract.32 r3, x0, r3, vaddsign1 -; AIE2P-NEXT: vextract.32 r1, x0, r0, vaddsign1 -; AIE2P-NEXT: vextract.32 r4, x0, r4, vaddsign1 +; AIE2P-NEXT: nopx ; vextract.32 r1, x0, #1, vaddsign1 +; AIE2P-NEXT: vextract.32 r2, x0, #2, vaddsign1 +; AIE2P-NEXT: vextract.32 r0, x0, #0, vaddsign1 +; AIE2P-NEXT: vextract.32 r3, x0, #3, vaddsign1 +; AIE2P-NEXT: vpush.hi.32 x0, x0, r0 ; AIE2P-NEXT: vpush.hi.32 x0, x0, r1 ; AIE2P-NEXT: vpush.hi.32 x0, x0, r2 ; AIE2P-NEXT: vpush.hi.32 x0, x0, r3 -; AIE2P-NEXT: vpush.hi.32 x0, x0, r4 ; AIE2P-NEXT: vpush.hi.32 x0, x0, r0 ; AIE2P-NEXT: vpush.hi.32 x0, x0, r0 ; AIE2P-NEXT: ret lr ; AIE2P-NEXT: vpush.hi.32 x0, x0, r0 // Delay Slot 5 ; AIE2P-NEXT: vpush.hi.32 x0, x0, r0 // Delay Slot 4 ; AIE2P-NEXT: vmov wl0, wh0 // Delay Slot 3 -; AIE2P-NEXT: vextract.32 r0, x0, r0, vaddsign1 // Delay Slot 2 +; AIE2P-NEXT: vextract.32 r0, x0, #0, vaddsign1 // Delay Slot 2 ; AIE2P-NEXT: nop // Delay Slot 1 %1 = extractelement <4 x i32> %v, i32 0 ret i32 %1 @@ -289,14 +285,10 @@ define i32 @extract_v4i32_dyn(<4 x i32> %v, i32 %idx) nounwind { ; AIE2P-LABEL: extract_v4i32_dyn: ; AIE2P: .p2align 4 ; AIE2P-NEXT: // %bb.0: -; AIE2P-NEXT: mova r0, #0; nopx -; AIE2P-NEXT: mova r2, #1 -; AIE2P-NEXT: mova r3, #2 -; AIE2P-NEXT: mova r4, #3 -; AIE2P-NEXT: vextract.32 r2, x0, r2, vaddsign1 -; AIE2P-NEXT: vextract.32 r3, x0, r3, vaddsign1 -; AIE2P-NEXT: vextract.32 r0, x0, r0, vaddsign1 -; AIE2P-NEXT: vextract.32 r4, x0, r4, vaddsign1 +; AIE2P-NEXT: nopx ; vextract.32 r2, x0, #1, vaddsign1 +; AIE2P-NEXT: vextract.32 r3, x0, #2, vaddsign1 +; AIE2P-NEXT: vextract.32 r0, x0, #0, vaddsign1 +; AIE2P-NEXT: vextract.32 r4, x0, #3, vaddsign1 ; AIE2P-NEXT: vpush.hi.32 x0, x0, r0 ; AIE2P-NEXT: vpush.hi.32 x0, x0, r2 ; AIE2P-NEXT: vpush.hi.32 x0, x0, r3 @@ -328,11 +320,11 @@ define signext i8 @extract_v32i8_signext(<32 x i8> %v) nounwind { ; AIE2P-LABEL: extract_v32i8_signext: ; AIE2P: .p2align 4 ; AIE2P-NEXT: // %bb.0: -; AIE2P-NEXT: nopa ; nopb ; nops ; ret lr; nopm ; nopv -; AIE2P-NEXT: nopx // Delay Slot 5 +; AIE2P-NEXT: ret lr +; AIE2P-NEXT: nop // Delay Slot 5 ; AIE2P-NEXT: nop // Delay Slot 4 -; AIE2P-NEXT: mova r0, #1 // Delay Slot 3 -; AIE2P-NEXT: vextract.8 r0, x0, r0, vaddsign1 // Delay Slot 2 +; AIE2P-NEXT: nop // Delay Slot 3 +; AIE2P-NEXT: vextract.8 r0, x0, #1, vaddsign1 // Delay Slot 2 ; AIE2P-NEXT: nop // Delay Slot 1 %1 = extractelement <32 x i8> %v, i32 1 ret i8 %1 @@ -352,11 +344,11 @@ define zeroext i8 @extract_v32i8_zeroext(<32 x i8> %v) nounwind { ; AIE2P-LABEL: extract_v32i8_zeroext: ; AIE2P: .p2align 4 ; AIE2P-NEXT: // %bb.0: -; AIE2P-NEXT: nopa ; nopb ; nops ; ret lr; nopm ; nopv -; AIE2P-NEXT: nopx // Delay Slot 5 +; AIE2P-NEXT: ret lr +; AIE2P-NEXT: nop // Delay Slot 5 ; AIE2P-NEXT: nop // Delay Slot 4 -; AIE2P-NEXT: mova r0, #1 // Delay Slot 3 -; AIE2P-NEXT: vextract.8 r0, x0, r0, vaddsign0 // Delay Slot 2 +; AIE2P-NEXT: nop // Delay Slot 3 +; AIE2P-NEXT: vextract.8 r0, x0, #1, vaddsign0 // Delay Slot 2 ; AIE2P-NEXT: nop // Delay Slot 1 %1 = extractelement <32 x i8> %v, i32 1 ret i8 %1 @@ -400,11 +392,11 @@ define signext i16 @extract_v16i16_signext(<16 x i16> %v) nounwind { ; AIE2P-LABEL: extract_v16i16_signext: ; AIE2P: .p2align 4 ; AIE2P-NEXT: // %bb.0: -; AIE2P-NEXT: nopa ; nopb ; nops ; ret lr; nopm ; nopv -; AIE2P-NEXT: nopx // Delay Slot 5 +; AIE2P-NEXT: ret lr +; AIE2P-NEXT: nop // Delay Slot 5 ; AIE2P-NEXT: nop // Delay Slot 4 -; AIE2P-NEXT: mova r0, #11 // Delay Slot 3 -; AIE2P-NEXT: vextract.16 r0, x0, r0, vaddsign1 // Delay Slot 2 +; AIE2P-NEXT: nop // Delay Slot 3 +; AIE2P-NEXT: vextract.16 r0, x0, #11, vaddsign1 // Delay Slot 2 ; AIE2P-NEXT: nop // Delay Slot 1 %1 = extractelement <16 x i16> %v, i32 11 ret i16 %1 @@ -424,11 +416,11 @@ define zeroext i16 @extract_v16i16_zeroext(<16 x i16> %v) nounwind { ; AIE2P-LABEL: extract_v16i16_zeroext: ; AIE2P: .p2align 4 ; AIE2P-NEXT: // %bb.0: -; AIE2P-NEXT: nopa ; nopb ; nops ; ret lr; nopm ; nopv -; AIE2P-NEXT: nopx // Delay Slot 5 +; AIE2P-NEXT: ret lr +; AIE2P-NEXT: nop // Delay Slot 5 ; AIE2P-NEXT: nop // Delay Slot 4 -; AIE2P-NEXT: mova r0, #11 // Delay Slot 3 -; AIE2P-NEXT: vextract.16 r0, x0, r0, vaddsign0 // Delay Slot 2 +; AIE2P-NEXT: nop // Delay Slot 3 +; AIE2P-NEXT: vextract.16 r0, x0, #11, vaddsign0 // Delay Slot 2 ; AIE2P-NEXT: nop // Delay Slot 1 %1 = extractelement <16 x i16> %v, i32 11 ret i16 %1 @@ -472,11 +464,11 @@ define i32 @extract_v8i32(<8 x i32> %v) nounwind { ; AIE2P-LABEL: extract_v8i32: ; AIE2P: .p2align 4 ; AIE2P-NEXT: // %bb.0: -; AIE2P-NEXT: nopa ; nopb ; nops ; ret lr; nopm ; nopv -; AIE2P-NEXT: nopx // Delay Slot 5 +; AIE2P-NEXT: ret lr +; AIE2P-NEXT: nop // Delay Slot 5 ; AIE2P-NEXT: nop // Delay Slot 4 -; AIE2P-NEXT: mova r0, #7 // Delay Slot 3 -; AIE2P-NEXT: vextract.32 r0, x0, r0, vaddsign1 // Delay Slot 2 +; AIE2P-NEXT: nop // Delay Slot 3 +; AIE2P-NEXT: vextract.32 r0, x0, #7, vaddsign1 // Delay Slot 2 ; AIE2P-NEXT: nop // Delay Slot 1 %1 = extractelement <8 x i32> %v, i32 7 ret i32 %1 @@ -521,11 +513,11 @@ define signext i8 @extract_v64i8_signext(<64 x i8> %v) nounwind { ; AIE2P-LABEL: extract_v64i8_signext: ; AIE2P: .p2align 4 ; AIE2P-NEXT: // %bb.0: -; AIE2P-NEXT: nopa ; nopb ; nops ; ret lr; nopm ; nopv -; AIE2P-NEXT: nopx // Delay Slot 5 +; AIE2P-NEXT: ret lr +; AIE2P-NEXT: nop // Delay Slot 5 ; AIE2P-NEXT: nop // Delay Slot 4 -; AIE2P-NEXT: mova r0, #1 // Delay Slot 3 -; AIE2P-NEXT: vextract.8 r0, x0, r0, vaddsign1 // Delay Slot 2 +; AIE2P-NEXT: nop // Delay Slot 3 +; AIE2P-NEXT: vextract.8 r0, x0, #1, vaddsign1 // Delay Slot 2 ; AIE2P-NEXT: nop // Delay Slot 1 %1 = extractelement <64 x i8> %v, i32 1 ret i8 %1 @@ -545,11 +537,11 @@ define zeroext i8 @extract_v64i8_zeroext(<64 x i8> %v) nounwind { ; AIE2P-LABEL: extract_v64i8_zeroext: ; AIE2P: .p2align 4 ; AIE2P-NEXT: // %bb.0: -; AIE2P-NEXT: nopa ; nopb ; nops ; ret lr; nopm ; nopv -; AIE2P-NEXT: nopx // Delay Slot 5 +; AIE2P-NEXT: ret lr +; AIE2P-NEXT: nop // Delay Slot 5 ; AIE2P-NEXT: nop // Delay Slot 4 -; AIE2P-NEXT: mova r0, #1 // Delay Slot 3 -; AIE2P-NEXT: vextract.8 r0, x0, r0, vaddsign0 // Delay Slot 2 +; AIE2P-NEXT: nop // Delay Slot 3 +; AIE2P-NEXT: vextract.8 r0, x0, #1, vaddsign0 // Delay Slot 2 ; AIE2P-NEXT: nop // Delay Slot 1 %1 = extractelement <64 x i8> %v, i32 1 ret i8 %1 @@ -593,11 +585,11 @@ define signext i16 @extract_v32i16_signext(<32 x i16> %v) nounwind { ; AIE2P-LABEL: extract_v32i16_signext: ; AIE2P: .p2align 4 ; AIE2P-NEXT: // %bb.0: -; AIE2P-NEXT: nopa ; nopb ; nops ; ret lr; nopm ; nopv -; AIE2P-NEXT: nopx // Delay Slot 5 +; AIE2P-NEXT: ret lr +; AIE2P-NEXT: nop // Delay Slot 5 ; AIE2P-NEXT: nop // Delay Slot 4 -; AIE2P-NEXT: mova r0, #11 // Delay Slot 3 -; AIE2P-NEXT: vextract.16 r0, x0, r0, vaddsign1 // Delay Slot 2 +; AIE2P-NEXT: nop // Delay Slot 3 +; AIE2P-NEXT: vextract.16 r0, x0, #11, vaddsign1 // Delay Slot 2 ; AIE2P-NEXT: nop // Delay Slot 1 %1 = extractelement <32 x i16> %v, i32 11 ret i16 %1 @@ -617,11 +609,11 @@ define zeroext i16 @extract_v32i16_zeroext(<32 x i16> %v) nounwind { ; AIE2P-LABEL: extract_v32i16_zeroext: ; AIE2P: .p2align 4 ; AIE2P-NEXT: // %bb.0: -; AIE2P-NEXT: nopa ; nopb ; nops ; ret lr; nopm ; nopv -; AIE2P-NEXT: nopx // Delay Slot 5 +; AIE2P-NEXT: ret lr +; AIE2P-NEXT: nop // Delay Slot 5 ; AIE2P-NEXT: nop // Delay Slot 4 -; AIE2P-NEXT: mova r0, #11 // Delay Slot 3 -; AIE2P-NEXT: vextract.16 r0, x0, r0, vaddsign0 // Delay Slot 2 +; AIE2P-NEXT: nop // Delay Slot 3 +; AIE2P-NEXT: vextract.16 r0, x0, #11, vaddsign0 // Delay Slot 2 ; AIE2P-NEXT: nop // Delay Slot 1 %1 = extractelement <32 x i16> %v, i32 11 ret i16 %1 @@ -665,11 +657,11 @@ define i32 @extract_v16i32(<16 x i32> %v) nounwind { ; AIE2P-LABEL: extract_v16i32: ; AIE2P: .p2align 4 ; AIE2P-NEXT: // %bb.0: -; AIE2P-NEXT: nopa ; nopb ; nops ; ret lr; nopm ; nopv -; AIE2P-NEXT: nopx // Delay Slot 5 +; AIE2P-NEXT: ret lr +; AIE2P-NEXT: nop // Delay Slot 5 ; AIE2P-NEXT: nop // Delay Slot 4 -; AIE2P-NEXT: mova r0, #7 // Delay Slot 3 -; AIE2P-NEXT: vextract.32 r0, x0, r0, vaddsign1 // Delay Slot 2 +; AIE2P-NEXT: nop // Delay Slot 3 +; AIE2P-NEXT: vextract.32 r0, x0, #7, vaddsign1 // Delay Slot 2 ; AIE2P-NEXT: nop // Delay Slot 1 %1 = extractelement <16 x i32> %v, i32 7 ret i32 %1 @@ -714,11 +706,11 @@ define signext i8 @extract_v128i8_signext(<128 x i8> %v) nounwind { ; AIE2P-LABEL: extract_v128i8_signext: ; AIE2P: .p2align 4 ; AIE2P-NEXT: // %bb.0: -; AIE2P-NEXT: nopa ; nopb ; nops ; ret lr; nopm ; nopv -; AIE2P-NEXT: nopx // Delay Slot 5 +; AIE2P-NEXT: ret lr +; AIE2P-NEXT: nop // Delay Slot 5 ; AIE2P-NEXT: nop // Delay Slot 4 -; AIE2P-NEXT: mova r0, #1 // Delay Slot 3 -; AIE2P-NEXT: vextract.8 r0, x4, r0, vaddsign1 // Delay Slot 2 +; AIE2P-NEXT: nop // Delay Slot 3 +; AIE2P-NEXT: vextract.8 r0, x4, #1, vaddsign1 // Delay Slot 2 ; AIE2P-NEXT: nop // Delay Slot 1 %1 = extractelement <128 x i8> %v, i32 1 ret i8 %1 @@ -738,11 +730,11 @@ define zeroext i8 @extract_v128i8_zeroext(<128 x i8> %v) nounwind { ; AIE2P-LABEL: extract_v128i8_zeroext: ; AIE2P: .p2align 4 ; AIE2P-NEXT: // %bb.0: -; AIE2P-NEXT: nopa ; nopb ; nops ; ret lr; nopm ; nopv -; AIE2P-NEXT: nopx // Delay Slot 5 +; AIE2P-NEXT: ret lr +; AIE2P-NEXT: nop // Delay Slot 5 ; AIE2P-NEXT: nop // Delay Slot 4 -; AIE2P-NEXT: mova r0, #1 // Delay Slot 3 -; AIE2P-NEXT: vextract.8 r0, x4, r0, vaddsign0 // Delay Slot 2 +; AIE2P-NEXT: nop // Delay Slot 3 +; AIE2P-NEXT: vextract.8 r0, x4, #1, vaddsign0 // Delay Slot 2 ; AIE2P-NEXT: nop // Delay Slot 1 %1 = extractelement <128 x i8> %v, i32 1 ret i8 %1 @@ -796,11 +788,11 @@ define signext i16 @extract_v64i16_signext(<64 x i16> %v) nounwind { ; AIE2P-LABEL: extract_v64i16_signext: ; AIE2P: .p2align 4 ; AIE2P-NEXT: // %bb.0: -; AIE2P-NEXT: nopa ; nopb ; nops ; ret lr; nopm ; nopv -; AIE2P-NEXT: nopx // Delay Slot 5 +; AIE2P-NEXT: ret lr +; AIE2P-NEXT: nop // Delay Slot 5 ; AIE2P-NEXT: nop // Delay Slot 4 -; AIE2P-NEXT: mova r0, #11 // Delay Slot 3 -; AIE2P-NEXT: vextract.16 r0, x4, r0, vaddsign1 // Delay Slot 2 +; AIE2P-NEXT: nop // Delay Slot 3 +; AIE2P-NEXT: vextract.16 r0, x4, #11, vaddsign1 // Delay Slot 2 ; AIE2P-NEXT: nop // Delay Slot 1 %1 = extractelement <64 x i16> %v, i32 11 ret i16 %1 @@ -820,11 +812,11 @@ define zeroext i16 @extract_v64i16_zeroext(<64 x i16> %v) nounwind { ; AIE2P-LABEL: extract_v64i16_zeroext: ; AIE2P: .p2align 4 ; AIE2P-NEXT: // %bb.0: -; AIE2P-NEXT: nopa ; nopb ; nops ; ret lr; nopm ; nopv -; AIE2P-NEXT: nopx // Delay Slot 5 +; AIE2P-NEXT: ret lr +; AIE2P-NEXT: nop // Delay Slot 5 ; AIE2P-NEXT: nop // Delay Slot 4 -; AIE2P-NEXT: mova r0, #11 // Delay Slot 3 -; AIE2P-NEXT: vextract.16 r0, x4, r0, vaddsign0 // Delay Slot 2 +; AIE2P-NEXT: nop // Delay Slot 3 +; AIE2P-NEXT: vextract.16 r0, x4, #11, vaddsign0 // Delay Slot 2 ; AIE2P-NEXT: nop // Delay Slot 1 %1 = extractelement <64 x i16> %v, i32 11 ret i16 %1 @@ -878,11 +870,11 @@ define i32 @extract_v32i32(<32 x i32> %v) nounwind { ; AIE2P-LABEL: extract_v32i32: ; AIE2P: .p2align 4 ; AIE2P-NEXT: // %bb.0: -; AIE2P-NEXT: nopa ; nopb ; nops ; ret lr; nopm ; nopv -; AIE2P-NEXT: nopx // Delay Slot 5 +; AIE2P-NEXT: ret lr +; AIE2P-NEXT: nop // Delay Slot 5 ; AIE2P-NEXT: nop // Delay Slot 4 -; AIE2P-NEXT: mova r0, #7 // Delay Slot 3 -; AIE2P-NEXT: vextract.32 r0, x4, r0, vaddsign1 // Delay Slot 2 +; AIE2P-NEXT: nop // Delay Slot 3 +; AIE2P-NEXT: vextract.32 r0, x4, #7, vaddsign1 // Delay Slot 2 ; AIE2P-NEXT: nop // Delay Slot 1 %1 = extractelement <32 x i32> %v, i32 7 ret i32 %1