From 09583dec15c5100fd5f26f81a1b011bce3f9b21f Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 8 Jan 2025 22:33:54 +0700 Subject: [PATCH] AMDGPU: Reduce 64-bit add width if low bits are known 0 (#122049) If one of the inputs has all 0 bits, the low part cannot carry and we can just pass through the original value. Add case: https://alive2.llvm.org/ce/z/TNc7hf Sub case: https://alive2.llvm.org/ce/z/AjH2-J We could do this in the general case with computeKnownBits, but add is so common this could be potentially expensive for something which will fire infrequently. One potential concern is this could break the 64-bit add we expect to see for addressing mode matching, but these constants shouldn't appear often in addressing expressions. One test for large offset expressions changes but isn't worse. Fixes https://github.com/ROCm/llvm-project/issues/237 --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 47 ++++++ llvm/lib/Target/AMDGPU/SIISelLowering.h | 3 + .../AMDGPU/add64-low-32-bits-known-zero.ll | 56 +++---- llvm/test/CodeGen/AMDGPU/global-saddr-load.ll | 54 +++---- .../AMDGPU/promote-constOffset-to-imm.ll | 144 +++++++++--------- .../AMDGPU/sub64-low-32-bits-known-zero.ll | 56 +++---- 6 files changed, 177 insertions(+), 183 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index b3cfa398d9b5f6..0ac84f4e1f02af 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -13985,6 +13985,43 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, return Accum; } +SDValue +SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N, + DAGCombinerInfo &DCI) const { + SDValue RHS = N->getOperand(1); + auto *CRHS = dyn_cast(RHS); + if (!CRHS) + return SDValue(); + + // TODO: Worth using computeKnownBits? Maybe expensive since it's so + // common. + uint64_t Val = CRHS->getZExtValue(); + if (countr_zero(Val) >= 32) { + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + SDValue LHS = N->getOperand(0); + + // Avoid carry machinery if we know the low half of the add does not + // contribute to the final result. + // + // add i64:x, K if computeTrailingZeros(K) >= 32 + // => build_pair (add x.hi, K.hi), x.lo + + // Breaking the 64-bit add here with this strange constant is unlikely + // to interfere with addressing mode patterns. + + SDValue Hi = getHiHalf64(LHS, DAG); + SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32); + SDValue AddHi = + DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags()); + + SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); + return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi); + } + + return SDValue(); +} + // Collect the ultimate src of each of the mul node's operands, and confirm // each operand is 8 bytes. static std::optional> @@ -14261,6 +14298,11 @@ SDValue SITargetLowering::performAddCombine(SDNode *N, return V; } + if (VT == MVT::i64) { + if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI)) + return Folded; + } + if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() && (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) { SDValue TempNode(N, 0); @@ -14446,6 +14488,11 @@ SDValue SITargetLowering::performSubCombine(SDNode *N, SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); + if (VT == MVT::i64) { + if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI)) + return Folded; + } + if (VT != MVT::i32) return SDValue(); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index f4641e7a659907..299c8f5f739235 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -212,6 +212,9 @@ class SITargetLowering final : public AMDGPUTargetLowering { unsigned getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const; SDValue tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue foldAddSub64WithZeroLowBitsTo32(SDNode *N, + DAGCombinerInfo &DCI) const; + SDValue performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performAddCarrySubCarryCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const; diff --git a/llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll b/llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll index 981e33f89d956e..52259c4c2e6e12 100644 --- a/llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll +++ b/llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll @@ -10,8 +10,7 @@ define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_0(i64 inreg %reg) { ; GFX9-LABEL: s_add_i64_const_low_bits_known0_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, 0 -; GFX9-NEXT: s_addc_u32 s1, s1, 0x40000 +; GFX9-NEXT: s_add_i32 s1, s1, 0x40000 ; GFX9-NEXT: ; return to shader part epilog %add = add i64 %reg, 1125899906842624 ; (1 << 50) ret i64 %add @@ -20,8 +19,7 @@ define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_0(i64 inreg %reg) { define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_1(i64 inreg %reg) { ; GFX9-LABEL: s_add_i64_const_low_bits_known0_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, 0 -; GFX9-NEXT: s_addc_u32 s1, s1, 1 +; GFX9-NEXT: s_add_i32 s1, s1, 1 ; GFX9-NEXT: ; return to shader part epilog %add = add i64 %reg, 4294967296 ; (1 << 32) ret i64 %add @@ -30,8 +28,7 @@ define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_1(i64 inreg %reg) { define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_2(i64 inreg %reg) { ; GFX9-LABEL: s_add_i64_const_low_bits_known0_2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, 0 -; GFX9-NEXT: s_addc_u32 s1, s1, 2 +; GFX9-NEXT: s_add_i32 s1, s1, 2 ; GFX9-NEXT: ; return to shader part epilog %add = add i64 %reg, 8589934592 ; (1 << 33) ret i64 %add @@ -40,8 +37,7 @@ define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_2(i64 inreg %reg) { define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_3(i64 inreg %reg) { ; GFX9-LABEL: s_add_i64_const_low_bits_known0_3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, 0 -; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX9-NEXT: s_add_i32 s1, s1, 0x80000000 ; GFX9-NEXT: ; return to shader part epilog %add = add i64 %reg, -9223372036854775808 ; (1 << 63) ret i64 %add @@ -50,8 +46,7 @@ define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_3(i64 inreg %reg) { define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_4(i64 inreg %reg) { ; GFX9-LABEL: s_add_i64_const_low_bits_known0_4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, 0 -; GFX9-NEXT: s_addc_u32 s1, s1, -1 +; GFX9-NEXT: s_add_i32 s1, s1, -1 ; GFX9-NEXT: ; return to shader part epilog %add = add i64 %reg, -4294967296 ; 0xffffffff00000000 ret i64 %add @@ -61,9 +56,7 @@ define i64 @v_add_i64_const_low_bits_known0_0(i64 %reg) { ; GFX9-LABEL: v_add_i64_const_low_bits_known0_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, 0x40000 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v1, 0x40000, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %add = add i64 %reg, 1125899906842624 ; (1 << 50) ret i64 %add @@ -73,8 +66,7 @@ define i64 @v_add_i64_const_low_bits_known0_1(i64 %reg) { ; GFX9-LABEL: v_add_i64_const_low_bits_known0_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc +; GFX9-NEXT: v_add_u32_e32 v1, 1, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %add = add i64 %reg, 4294967296 ; (1 << 32) ret i64 %add @@ -84,8 +76,7 @@ define i64 @v_add_i64_const_low_bits_known0_2(i64 %reg) { ; GFX9-LABEL: v_add_i64_const_low_bits_known0_2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc +; GFX9-NEXT: v_add_u32_e32 v1, 2, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %add = add i64 %reg, 8589934592 ; (1 << 33) ret i64 %add @@ -95,9 +86,7 @@ define i64 @v_add_i64_const_low_bits_known0_3(i64 %reg) { ; GFX9-LABEL: v_add_i64_const_low_bits_known0_3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %add = add i64 %reg, -9223372036854775808 ; (1 << 63) ret i64 %add @@ -107,8 +96,7 @@ define i64 @v_add_i64_const_low_bits_known0_4(i64 %reg) { ; GFX9-LABEL: v_add_i64_const_low_bits_known0_4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: v_add_u32_e32 v1, -1, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %add = add i64 %reg, -4294967296 ; 0xffffffff00000000 ret i64 %add @@ -139,10 +127,8 @@ define <2 x i64> @v_add_v2i64_splat_const_low_bits_known0_0(<2 x i64> %reg) { ; GFX9-LABEL: v_add_v2i64_splat_const_low_bits_known0_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 1, v3, vcc +; GFX9-NEXT: v_add_u32_e32 v1, 1, v1 +; GFX9-NEXT: v_add_u32_e32 v3, 1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %add = add <2 x i64> %reg, ; (1 << 32) ret <2 x i64> %add @@ -152,10 +138,8 @@ define <2 x i64> @v_add_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> %reg) { ; GFX9-LABEL: v_add_v2i64_nonsplat_const_low_bits_known0_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 2, v3, vcc +; GFX9-NEXT: v_add_u32_e32 v1, 1, v1 +; GFX9-NEXT: v_add_u32_e32 v3, 2, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %add = add <2 x i64> %reg, ; (1 << 32), (1 << 33) ret <2 x i64> %add @@ -164,10 +148,8 @@ define <2 x i64> @v_add_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> %reg) { define amdgpu_ps <2 x i64> @s_add_v2i64_splat_const_low_bits_known0_0(<2 x i64> inreg %reg) { ; GFX9-LABEL: s_add_v2i64_splat_const_low_bits_known0_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, 0 -; GFX9-NEXT: s_addc_u32 s1, s1, 1 -; GFX9-NEXT: s_add_u32 s2, s2, 0 -; GFX9-NEXT: s_addc_u32 s3, s3, 1 +; GFX9-NEXT: s_add_i32 s1, s1, 1 +; GFX9-NEXT: s_add_i32 s3, s3, 1 ; GFX9-NEXT: ; return to shader part epilog %add = add <2 x i64> %reg, ; (1 << 32) ret <2 x i64> %add @@ -176,10 +158,8 @@ define amdgpu_ps <2 x i64> @s_add_v2i64_splat_const_low_bits_known0_0(<2 x i64> define amdgpu_ps <2 x i64> @s_add_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> inreg %reg) { ; GFX9-LABEL: s_add_v2i64_nonsplat_const_low_bits_known0_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, 0 -; GFX9-NEXT: s_addc_u32 s1, s1, 1 -; GFX9-NEXT: s_add_u32 s2, s2, 0 -; GFX9-NEXT: s_addc_u32 s3, s3, 2 +; GFX9-NEXT: s_add_i32 s1, s1, 1 +; GFX9-NEXT: s_add_i32 s3, s3, 2 ; GFX9-NEXT: ; return to shader part epilog %add = add <2 x i64> %reg, ; (1 << 32), (1 << 33) ret <2 x i64> %add diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll index 157f91ccc6b1c5..b2f113f08a9166 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -668,37 +668,32 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0xFFFFFFFF(ptr addrspace(1) define amdgpu_ps float @global_load_saddr_i8_offset_0x100000000(ptr addrspace(1) inreg %sbase) { ; GFX9-LABEL: global_load_saddr_i8_offset_0x100000000: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_add_i32 s3, s3, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_load_saddr_i8_offset_0x100000000: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1] -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_add_i32 s3, s3, 1 +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_load_saddr_i8_offset_0x100000000: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1] -; GFX11-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_add_i32 s3, s3, 1 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100000000: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_mov_b32 s0, 0 -; GFX12-SDAG-NEXT: s_mov_b32 s1, 1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] -; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0 +; GFX12-SDAG-NEXT: s_add_co_i32 s3, s3, 1 +; GFX12-SDAG-NEXT: s_load_u8 s0, s[2:3], 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-SDAG-NEXT: ; return to shader part epilog @@ -934,37 +929,32 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg0xFFFFFFFF(ptr addrspace( define amdgpu_ps float @global_load_saddr_i8_offset_neg0x100000000(ptr addrspace(1) inreg %sbase) { ; GFX9-LABEL: global_load_saddr_i8_offset_neg0x100000000: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_add_i32 s3, s3, -1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_load_saddr_i8_offset_neg0x100000000: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_add_i32 s3, s3, -1 +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_load_saddr_i8_offset_neg0x100000000: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1] -; GFX11-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_add_i32 s3, s3, -1 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg0x100000000: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_mov_b32 s0, 0 -; GFX12-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] -; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0 +; GFX12-SDAG-NEXT: s_add_co_i32 s3, s3, -1 +; GFX12-SDAG-NEXT: s_load_u8 s0, s[2:3], 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-SDAG-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index 98d5f3097153d9..a2a0107a6f7d81 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -1372,20 +1372,19 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v1, v0 ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc ; GFX8-NEXT: s_movk_i32 s0, 0xf000 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v3 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc ; GFX8-NEXT: s_movk_i32 s0, 0xf800 -; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[3:4] -; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6] +; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[3:4] +; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8] ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v3 ; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc ; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10] -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 1, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v4 ; GFX8-NEXT: flat_load_dwordx2 v[3:4], v[3:4] ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v7 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v6, v8, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v5 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v8, v6, vcc ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v9, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v10, v5, vcc @@ -1416,32 +1415,32 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff8000, v1 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff8000, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s35 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v12 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v10 ; GFX9-NEXT: v_mov_b32_e32 v3, 3 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 1, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX9-NEXT: global_load_dwordx2 v[6:7], v[4:5], off offset:-4096 ; GFX9-NEXT: s_movk_i32 s0, 0xf000 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[8:9], v[4:5], off -; GFX9-NEXT: global_load_dwordx2 v[10:11], v[0:1], off offset:2048 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:2048 +; GFX9-NEXT: v_add_u32_e32 v1, 1, v1 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v7, vcc +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v9, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v1, vcc -; GFX9-NEXT: global_store_dwordx2 v12, v[0:1], s[34:35] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: global_store_dwordx2 v10, v[0:1], s[34:35] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: Offset64: @@ -1477,8 +1476,7 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX10-NEXT: global_load_dwordx2 v[6:7], v[2:3], off offset:-2048 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 1, v1, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1 ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off ; GFX10-NEXT: global_load_dwordx2 v[10:11], v[0:1], off @@ -1517,25 +1515,25 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, 0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 1, v1, vcc_lo -; GFX11-NEXT: global_load_b64 v[2:3], v[0:1], off -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0xfffff000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_load_b64 v[6:7], v[4:5], off offset:-4096 -; GFX11-NEXT: global_load_b64 v[4:5], v[4:5], off -; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:2048 +; GFX11-NEXT: global_load_b64 v[4:5], v[2:3], off +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off +; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off offset:2048 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 1, v1 +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v6, v2 -; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v7, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v4, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo ; GFX11-NEXT: global_store_b64 v8, v[0:1], s[34:35] ; GFX11-NEXT: s_endpgm entry: @@ -2408,18 +2406,17 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX8-NEXT: v_mov_b32_e32 v3, 3 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v1, v0 +; GFX8-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x800 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, -1, v6, vcc -; GFX8-NEXT: flat_load_dwordx2 v[3:4], v[3:4] +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v0 ; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6] +; GFX8-NEXT: flat_load_dwordx2 v[3:4], v[3:4] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc ; GFX8-NEXT: flat_store_dwordx2 v[1:2], v[3:4] ; GFX8-NEXT: s_endpgm ; @@ -2450,14 +2447,13 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX9-NEXT: v_mov_b32_e32 v3, 3 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 -; GFX9-NEXT: global_load_dwordx2 v[6:7], v[2:3], off +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0x1000, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GFX9-NEXT: v_add_u32_e32 v1, -1, v1 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:-2048 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v5, vcc @@ -2490,15 +2486,14 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v8 ; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s35, 0, s0 -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v1, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v2, vcc_lo -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v3 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v4, vcc_lo -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0, v3 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v4, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v1, -1, v1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX10-NEXT: global_load_dwordx2 v[6:7], v[2:3], off +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v6, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v5, vcc_lo @@ -2525,19 +2520,18 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX11-NEXT: v_add_co_u32 v1, s0, s34, v4 ; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, v1, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v2, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v3 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0, v3 -; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0x1000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v1, -1, v1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:-2048 -; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off +; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off offset:-2048 +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[34:35] ; GFX11-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/sub64-low-32-bits-known-zero.ll b/llvm/test/CodeGen/AMDGPU/sub64-low-32-bits-known-zero.ll index 779c4aef647b41..f52f1164f2ba24 100644 --- a/llvm/test/CodeGen/AMDGPU/sub64-low-32-bits-known-zero.ll +++ b/llvm/test/CodeGen/AMDGPU/sub64-low-32-bits-known-zero.ll @@ -10,8 +10,7 @@ define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_0(i64 inreg %reg) { ; GFX9-LABEL: s_sub_i64_const_low_bits_known0_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, 0 -; GFX9-NEXT: s_addc_u32 s1, s1, 0xfffc0000 +; GFX9-NEXT: s_add_i32 s1, s1, 0xfffc0000 ; GFX9-NEXT: ; return to shader part epilog %sub = sub i64 %reg, 1125899906842624 ; (1 << 50) ret i64 %sub @@ -20,8 +19,7 @@ define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_0(i64 inreg %reg) { define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_1(i64 inreg %reg) { ; GFX9-LABEL: s_sub_i64_const_low_bits_known0_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, 0 -; GFX9-NEXT: s_addc_u32 s1, s1, -1 +; GFX9-NEXT: s_add_i32 s1, s1, -1 ; GFX9-NEXT: ; return to shader part epilog %sub = sub i64 %reg, 4294967296 ; (1 << 32) ret i64 %sub @@ -30,8 +28,7 @@ define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_1(i64 inreg %reg) { define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_2(i64 inreg %reg) { ; GFX9-LABEL: s_sub_i64_const_low_bits_known0_2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, 0 -; GFX9-NEXT: s_addc_u32 s1, s1, -2 +; GFX9-NEXT: s_add_i32 s1, s1, -2 ; GFX9-NEXT: ; return to shader part epilog %sub = sub i64 %reg, 8589934592 ; (1 << 33) ret i64 %sub @@ -40,8 +37,7 @@ define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_2(i64 inreg %reg) { define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_3(i64 inreg %reg) { ; GFX9-LABEL: s_sub_i64_const_low_bits_known0_3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, 0 -; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX9-NEXT: s_add_i32 s1, s1, 0x80000000 ; GFX9-NEXT: ; return to shader part epilog %sub = sub i64 %reg, -9223372036854775808 ; (1 << 63) ret i64 %sub @@ -50,8 +46,7 @@ define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_3(i64 inreg %reg) { define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_4(i64 inreg %reg) { ; GFX9-LABEL: s_sub_i64_const_low_bits_known0_4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, 0 -; GFX9-NEXT: s_addc_u32 s1, s1, 1 +; GFX9-NEXT: s_add_i32 s1, s1, 1 ; GFX9-NEXT: ; return to shader part epilog %sub = sub i64 %reg, -4294967296 ; 0xffffffff00000000 ret i64 %sub @@ -61,9 +56,7 @@ define i64 @v_sub_i64_const_low_bits_known0_0(i64 %reg) { ; GFX9-LABEL: v_sub_i64_const_low_bits_known0_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, 0xfffc0000 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v1, 0xfffc0000, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %sub = sub i64 %reg, 1125899906842624 ; (1 << 50) ret i64 %sub @@ -73,8 +66,7 @@ define i64 @v_sub_i64_const_low_bits_known0_1(i64 %reg) { ; GFX9-LABEL: v_sub_i64_const_low_bits_known0_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: v_add_u32_e32 v1, -1, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %sub = sub i64 %reg, 4294967296 ; (1 << 32) ret i64 %sub @@ -84,8 +76,7 @@ define i64 @v_sub_i64_const_low_bits_known0_2(i64 %reg) { ; GFX9-LABEL: v_sub_i64_const_low_bits_known0_2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -2, v1, vcc +; GFX9-NEXT: v_add_u32_e32 v1, -2, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %sub = sub i64 %reg, 8589934592 ; (1 << 33) ret i64 %sub @@ -95,9 +86,7 @@ define i64 @v_sub_i64_const_low_bits_known0_3(i64 %reg) { ; GFX9-LABEL: v_sub_i64_const_low_bits_known0_3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %sub = sub i64 %reg, -9223372036854775808 ; (1 << 63) ret i64 %sub @@ -107,8 +96,7 @@ define i64 @v_sub_i64_const_low_bits_known0_4(i64 %reg) { ; GFX9-LABEL: v_sub_i64_const_low_bits_known0_4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc +; GFX9-NEXT: v_add_u32_e32 v1, 1, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %sub = sub i64 %reg, -4294967296 ; 0xffffffff00000000 ret i64 %sub @@ -139,10 +127,8 @@ define <2 x i64> @v_sub_v2i64_splat_const_low_bits_known0_0(<2 x i64> %reg) { ; GFX9-LABEL: v_sub_v2i64_splat_const_low_bits_known0_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX9-NEXT: v_add_u32_e32 v1, -1, v1 +; GFX9-NEXT: v_add_u32_e32 v3, -1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %sub = sub <2 x i64> %reg, ; (1 << 32) ret <2 x i64> %sub @@ -152,10 +138,8 @@ define <2 x i64> @v_sub_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> %reg) { ; GFX9-LABEL: v_sub_v2i64_nonsplat_const_low_bits_known0_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, -2, v3, vcc +; GFX9-NEXT: v_add_u32_e32 v1, -1, v1 +; GFX9-NEXT: v_add_u32_e32 v3, -2, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %sub = sub <2 x i64> %reg, ; (1 << 32), (1 << 33) ret <2 x i64> %sub @@ -164,10 +148,8 @@ define <2 x i64> @v_sub_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> %reg) { define amdgpu_ps <2 x i64> @s_sub_v2i64_splat_const_low_bits_known0_0(<2 x i64> inreg %reg) { ; GFX9-LABEL: s_sub_v2i64_splat_const_low_bits_known0_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, 0 -; GFX9-NEXT: s_addc_u32 s1, s1, -1 -; GFX9-NEXT: s_add_u32 s2, s2, 0 -; GFX9-NEXT: s_addc_u32 s3, s3, -1 +; GFX9-NEXT: s_add_i32 s1, s1, -1 +; GFX9-NEXT: s_add_i32 s3, s3, -1 ; GFX9-NEXT: ; return to shader part epilog %sub = sub <2 x i64> %reg, ; (1 << 32) ret <2 x i64> %sub @@ -176,10 +158,8 @@ define amdgpu_ps <2 x i64> @s_sub_v2i64_splat_const_low_bits_known0_0(<2 x i64> define amdgpu_ps <2 x i64> @s_sub_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> inreg %reg) { ; GFX9-LABEL: s_sub_v2i64_nonsplat_const_low_bits_known0_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, 0 -; GFX9-NEXT: s_addc_u32 s1, s1, -1 -; GFX9-NEXT: s_add_u32 s2, s2, 0 -; GFX9-NEXT: s_addc_u32 s3, s3, -2 +; GFX9-NEXT: s_add_i32 s1, s1, -1 +; GFX9-NEXT: s_add_i32 s3, s3, -2 ; GFX9-NEXT: ; return to shader part epilog %sub = sub <2 x i64> %reg, ; (1 << 32), (1 << 33) ret <2 x i64> %sub