diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index b3cfa398d9b5f6..0ac84f4e1f02af 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -13985,6 +13985,43 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, return Accum; } +SDValue +SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N, + DAGCombinerInfo &DCI) const { + SDValue RHS = N->getOperand(1); + auto *CRHS = dyn_cast<ConstantSDNode>(RHS); + if (!CRHS) + return SDValue(); + + // TODO: Worth using computeKnownBits? Maybe expensive since it's so + // common. + uint64_t Val = CRHS->getZExtValue(); + if (countr_zero(Val) >= 32) { + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + SDValue LHS = N->getOperand(0); + + // Avoid carry machinery if we know the low half of the add does not + // contribute to the final result. + // + // add i64:x, K if computeTrailingZeros(K) >= 32 + // => build_pair (add x.hi, K.hi), x.lo + + // Breaking the 64-bit add here with this strange constant is unlikely + // to interfere with addressing mode patterns. + + SDValue Hi = getHiHalf64(LHS, DAG); + SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32); + SDValue AddHi = + DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags()); + + SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); + return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi); + } + + return SDValue(); +} + // Collect the ultimate src of each of the mul node's operands, and confirm // each operand is 8 bytes. 
static std::optional<ByteProvider<SDValue>> @@ -14261,6 +14298,11 @@ SDValue SITargetLowering::performAddCombine(SDNode *N, return V; } + if (VT == MVT::i64) { + if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI)) + return Folded; + } + if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() && (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) { SDValue TempNode(N, 0); @@ -14446,6 +14488,11 @@ SDValue SITargetLowering::performSubCombine(SDNode *N, SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); + if (VT == MVT::i64) { + if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI)) + return Folded; + } + if (VT != MVT::i32) return SDValue(); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index f4641e7a659907..299c8f5f739235 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -212,6 +212,9 @@ class SITargetLowering final : public AMDGPUTargetLowering { unsigned getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const; SDValue tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue foldAddSub64WithZeroLowBitsTo32(SDNode *N, + DAGCombinerInfo &DCI) const; + SDValue performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performAddCarrySubCarryCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const; diff --git a/llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll b/llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll index 981e33f89d956e..52259c4c2e6e12 100644 --- a/llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll +++ b/llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll @@ -10,8 +10,7 @@ define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_0(i64 inreg %reg) { ; GFX9-LABEL: s_add_i64_const_low_bits_known0_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, 0 -; GFX9-NEXT: s_addc_u32 s1, s1, 0x40000 +; GFX9-NEXT: s_add_i32 s1, s1, 0x40000 ; 
GFX9-NEXT: ; return to shader part epilog %add = add i64 %reg, 1125899906842624 ; (1 << 50) ret i64 %add @@ -20,8 +19,7 @@ define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_0(i64 inreg %reg) { define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_1(i64 inreg %reg) { ; GFX9-LABEL: s_add_i64_const_low_bits_known0_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, 0 -; GFX9-NEXT: s_addc_u32 s1, s1, 1 +; GFX9-NEXT: s_add_i32 s1, s1, 1 ; GFX9-NEXT: ; return to shader part epilog %add = add i64 %reg, 4294967296 ; (1 << 32) ret i64 %add @@ -30,8 +28,7 @@ define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_1(i64 inreg %reg) { define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_2(i64 inreg %reg) { ; GFX9-LABEL: s_add_i64_const_low_bits_known0_2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, 0 -; GFX9-NEXT: s_addc_u32 s1, s1, 2 +; GFX9-NEXT: s_add_i32 s1, s1, 2 ; GFX9-NEXT: ; return to shader part epilog %add = add i64 %reg, 8589934592 ; (1 << 33) ret i64 %add @@ -40,8 +37,7 @@ define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_2(i64 inreg %reg) { define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_3(i64 inreg %reg) { ; GFX9-LABEL: s_add_i64_const_low_bits_known0_3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, 0 -; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX9-NEXT: s_add_i32 s1, s1, 0x80000000 ; GFX9-NEXT: ; return to shader part epilog %add = add i64 %reg, -9223372036854775808 ; (1 << 63) ret i64 %add @@ -50,8 +46,7 @@ define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_3(i64 inreg %reg) { define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_4(i64 inreg %reg) { ; GFX9-LABEL: s_add_i64_const_low_bits_known0_4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, 0 -; GFX9-NEXT: s_addc_u32 s1, s1, -1 +; GFX9-NEXT: s_add_i32 s1, s1, -1 ; GFX9-NEXT: ; return to shader part epilog %add = add i64 %reg, -4294967296 ; 0xffffffff00000000 ret i64 %add @@ -61,9 +56,7 @@ define i64 @v_add_i64_const_low_bits_known0_0(i64 %reg) { ; GFX9-LABEL: 
v_add_i64_const_low_bits_known0_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, 0x40000 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v1, 0x40000, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %add = add i64 %reg, 1125899906842624 ; (1 << 50) ret i64 %add @@ -73,8 +66,7 @@ define i64 @v_add_i64_const_low_bits_known0_1(i64 %reg) { ; GFX9-LABEL: v_add_i64_const_low_bits_known0_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc +; GFX9-NEXT: v_add_u32_e32 v1, 1, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %add = add i64 %reg, 4294967296 ; (1 << 32) ret i64 %add @@ -84,8 +76,7 @@ define i64 @v_add_i64_const_low_bits_known0_2(i64 %reg) { ; GFX9-LABEL: v_add_i64_const_low_bits_known0_2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc +; GFX9-NEXT: v_add_u32_e32 v1, 2, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %add = add i64 %reg, 8589934592 ; (1 << 33) ret i64 %add @@ -95,9 +86,7 @@ define i64 @v_add_i64_const_low_bits_known0_3(i64 %reg) { ; GFX9-LABEL: v_add_i64_const_low_bits_known0_3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %add = add i64 %reg, -9223372036854775808 ; (1 << 63) ret i64 %add @@ -107,8 +96,7 @@ define i64 @v_add_i64_const_low_bits_known0_4(i64 %reg) { ; GFX9-LABEL: v_add_i64_const_low_bits_known0_4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, 
vcc +; GFX9-NEXT: v_add_u32_e32 v1, -1, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %add = add i64 %reg, -4294967296 ; 0xffffffff00000000 ret i64 %add @@ -139,10 +127,8 @@ define <2 x i64> @v_add_v2i64_splat_const_low_bits_known0_0(<2 x i64> %reg) { ; GFX9-LABEL: v_add_v2i64_splat_const_low_bits_known0_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 1, v3, vcc +; GFX9-NEXT: v_add_u32_e32 v1, 1, v1 +; GFX9-NEXT: v_add_u32_e32 v3, 1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %add = add <2 x i64> %reg, <i64 4294967296, i64 4294967296> ; (1 << 32) ret <2 x i64> %add @@ -152,10 +138,8 @@ define <2 x i64> @v_add_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> %reg) { ; GFX9-LABEL: v_add_v2i64_nonsplat_const_low_bits_known0_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 2, v3, vcc +; GFX9-NEXT: v_add_u32_e32 v1, 1, v1 +; GFX9-NEXT: v_add_u32_e32 v3, 2, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %add = add <2 x i64> %reg, <i64 4294967296, i64 8589934592> ; (1 << 32), (1 << 33) ret <2 x i64> %add @@ -164,10 +148,8 @@ define <2 x i64> @v_add_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> %reg) { define amdgpu_ps <2 x i64> @s_add_v2i64_splat_const_low_bits_known0_0(<2 x i64> inreg %reg) { ; GFX9-LABEL: s_add_v2i64_splat_const_low_bits_known0_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, 0 -; GFX9-NEXT: s_addc_u32 s1, s1, 1 -; GFX9-NEXT: s_add_u32 s2, s2, 0 -; GFX9-NEXT: s_addc_u32 s3, s3, 1 +; GFX9-NEXT: s_add_i32 s1, s1, 1 +; GFX9-NEXT: s_add_i32 s3, s3, 1 ; GFX9-NEXT: ; return to shader part epilog %add = add <2 x i64> %reg, <i64 4294967296, i64 4294967296> ; (1 << 32) ret <2 x i64> %add @@ -176,10 +158,8 @@ define amdgpu_ps <2 x i64> 
@s_add_v2i64_splat_const_low_bits_known0_0(<2 x i64> define amdgpu_ps <2 x i64> @s_add_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> inreg %reg) { ; GFX9-LABEL: s_add_v2i64_nonsplat_const_low_bits_known0_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, 0 -; GFX9-NEXT: s_addc_u32 s1, s1, 1 -; GFX9-NEXT: s_add_u32 s2, s2, 0 -; GFX9-NEXT: s_addc_u32 s3, s3, 2 +; GFX9-NEXT: s_add_i32 s1, s1, 1 +; GFX9-NEXT: s_add_i32 s3, s3, 2 ; GFX9-NEXT: ; return to shader part epilog %add = add <2 x i64> %reg, <i64 4294967296, i64 8589934592> ; (1 << 32), (1 << 33) ret <2 x i64> %add diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll index 157f91ccc6b1c5..b2f113f08a9166 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -668,37 +668,32 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0xFFFFFFFF(ptr addrspace(1) define amdgpu_ps float @global_load_saddr_i8_offset_0x100000000(ptr addrspace(1) inreg %sbase) { ; GFX9-LABEL: global_load_saddr_i8_offset_0x100000000: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_add_i32 s3, s3, 1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_load_saddr_i8_offset_0x100000000: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1] -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_add_i32 s3, s3, 1 +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: global_load_saddr_i8_offset_0x100000000: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1] -; GFX11-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_add_i32 s3, s3, 1 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100000000: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_mov_b32 s0, 0 -; GFX12-SDAG-NEXT: s_mov_b32 s1, 1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] -; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0 +; GFX12-SDAG-NEXT: s_add_co_i32 s3, s3, 1 +; GFX12-SDAG-NEXT: s_load_u8 s0, s[2:3], 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-SDAG-NEXT: ; return to shader part epilog @@ -934,37 +929,32 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg0xFFFFFFFF(ptr addrspace( define amdgpu_ps float @global_load_saddr_i8_offset_neg0x100000000(ptr addrspace(1) inreg %sbase) { ; GFX9-LABEL: global_load_saddr_i8_offset_neg0x100000000: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_add_i32 s3, s3, -1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_load_saddr_i8_offset_neg0x100000000: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1] -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_add_i32 s3, s3, -1 +; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: 
global_load_saddr_i8_offset_neg0x100000000: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1] -; GFX11-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_add_i32 s3, s3, -1 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg0x100000000: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_mov_b32 s0, 0 -; GFX12-SDAG-NEXT: s_mov_b32 s1, -1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] -; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0 +; GFX12-SDAG-NEXT: s_add_co_i32 s3, s3, -1 +; GFX12-SDAG-NEXT: s_load_u8 s0, s[2:3], 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-SDAG-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index 98d5f3097153d9..a2a0107a6f7d81 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -1372,20 +1372,19 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v1, v0 ; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc ; GFX8-NEXT: s_movk_i32 s0, 0xf000 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v3 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc ; GFX8-NEXT: s_movk_i32 s0, 0xf800 -; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[3:4] -; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6] +; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[3:4] +; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8] ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v3 ; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc ; 
GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10] -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 1, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v4 ; GFX8-NEXT: flat_load_dwordx2 v[3:4], v[3:4] ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v7 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v6, v8, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v5 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v8, v6, vcc ; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v9, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v10, v5, vcc @@ -1416,32 +1415,32 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff8000, v1 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff8000, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s35 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v12 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v10 ; GFX9-NEXT: v_mov_b32_e32 v3, 3 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 1, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX9-NEXT: global_load_dwordx2 v[6:7], v[4:5], off offset:-4096 ; GFX9-NEXT: s_movk_i32 s0, 0xf000 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[8:9], v[4:5], off -; GFX9-NEXT: global_load_dwordx2 v[10:11], v[0:1], off offset:2048 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: 
global_load_dwordx2 v[8:9], v[2:3], off offset:2048 +; GFX9-NEXT: v_add_u32_e32 v1, 1, v1 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v7, vcc +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v9, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v1, vcc -; GFX9-NEXT: global_store_dwordx2 v12, v[0:1], s[34:35] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: global_store_dwordx2 v10, v[0:1], s[34:35] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: Offset64: @@ -1477,8 +1476,7 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX10-NEXT: global_load_dwordx2 v[6:7], v[2:3], off offset:-2048 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 1, v1, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1 ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off ; GFX10-NEXT: global_load_dwordx2 v[10:11], v[0:1], off @@ -1517,25 +1515,25 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, 0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 1, v1, vcc_lo -; GFX11-NEXT: global_load_b64 v[2:3], v[0:1], off -; GFX11-NEXT: v_add_co_u32 v0, 
vcc_lo, 0xfffff000, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0xfffff000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_load_b64 v[6:7], v[4:5], off offset:-4096 -; GFX11-NEXT: global_load_b64 v[4:5], v[4:5], off -; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:2048 +; GFX11-NEXT: global_load_b64 v[4:5], v[2:3], off +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off +; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off offset:2048 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 1, v1 +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v6, v2 -; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v7, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v4, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo ; GFX11-NEXT: global_store_b64 v8, v[0:1], s[34:35] ; GFX11-NEXT: s_endpgm entry: @@ -2408,18 +2406,17 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX8-NEXT: v_mov_b32_e32 v3, 3 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 -; GFX8-NEXT: 
v_addc_u32_e32 v6, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v1, v0 +; GFX8-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x800 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, -1, v6, vcc -; GFX8-NEXT: flat_load_dwordx2 v[3:4], v[3:4] +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, -1, v0, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v0 ; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6] +; GFX8-NEXT: flat_load_dwordx2 v[3:4], v[3:4] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc ; GFX8-NEXT: flat_store_dwordx2 v[1:2], v[3:4] ; GFX8-NEXT: s_endpgm ; @@ -2450,14 +2447,13 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX9-NEXT: v_mov_b32_e32 v3, 3 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 -; GFX9-NEXT: global_load_dwordx2 v[6:7], v[2:3], off +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0x1000, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GFX9-NEXT: v_add_u32_e32 v1, -1, v1 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:-2048 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], 
off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v5, vcc @@ -2490,15 +2486,14 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v8 ; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s35, 0, s0 -; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v1, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v2, vcc_lo -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v3 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v4, vcc_lo -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0, v3 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v4, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v1, -1, v1 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX10-NEXT: global_load_dwordx2 v[6:7], v[2:3], off +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v6, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v5, vcc_lo @@ -2525,19 +2520,18 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf ; GFX11-NEXT: v_add_co_u32 v1, s0, s34, v4 ; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, v1, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v2, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v3 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0, v3 -; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0x1000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v1, -1, v1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:-2048 -; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off +; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off offset:-2048 +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[34:35] ; GFX11-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/sub64-low-32-bits-known-zero.ll b/llvm/test/CodeGen/AMDGPU/sub64-low-32-bits-known-zero.ll index 779c4aef647b41..f52f1164f2ba24 100644 --- a/llvm/test/CodeGen/AMDGPU/sub64-low-32-bits-known-zero.ll +++ b/llvm/test/CodeGen/AMDGPU/sub64-low-32-bits-known-zero.ll @@ -10,8 +10,7 @@ define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_0(i64 inreg %reg) { ; GFX9-LABEL: s_sub_i64_const_low_bits_known0_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, 0 -; GFX9-NEXT: s_addc_u32 s1, s1, 0xfffc0000 +; GFX9-NEXT: s_add_i32 s1, s1, 0xfffc0000 ; GFX9-NEXT: ; return to shader part epilog %sub = sub i64 %reg, 1125899906842624 ; (1 << 50) ret i64 %sub @@ -20,8 +19,7 @@ define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_0(i64 inreg %reg) { define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_1(i64 inreg %reg) { ; GFX9-LABEL: s_sub_i64_const_low_bits_known0_1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, 0 -; GFX9-NEXT: s_addc_u32 s1, s1, -1 +; GFX9-NEXT: s_add_i32 
s1, s1, -1 ; GFX9-NEXT: ; return to shader part epilog %sub = sub i64 %reg, 4294967296 ; (1 << 32) ret i64 %sub @@ -30,8 +28,7 @@ define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_1(i64 inreg %reg) { define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_2(i64 inreg %reg) { ; GFX9-LABEL: s_sub_i64_const_low_bits_known0_2: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, 0 -; GFX9-NEXT: s_addc_u32 s1, s1, -2 +; GFX9-NEXT: s_add_i32 s1, s1, -2 ; GFX9-NEXT: ; return to shader part epilog %sub = sub i64 %reg, 8589934592 ; (1 << 33) ret i64 %sub @@ -40,8 +37,7 @@ define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_2(i64 inreg %reg) { define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_3(i64 inreg %reg) { ; GFX9-LABEL: s_sub_i64_const_low_bits_known0_3: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, 0 -; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX9-NEXT: s_add_i32 s1, s1, 0x80000000 ; GFX9-NEXT: ; return to shader part epilog %sub = sub i64 %reg, -9223372036854775808 ; (1 << 63) ret i64 %sub @@ -50,8 +46,7 @@ define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_3(i64 inreg %reg) { define amdgpu_ps i64 @s_sub_i64_const_low_bits_known0_4(i64 inreg %reg) { ; GFX9-LABEL: s_sub_i64_const_low_bits_known0_4: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, 0 -; GFX9-NEXT: s_addc_u32 s1, s1, 1 +; GFX9-NEXT: s_add_i32 s1, s1, 1 ; GFX9-NEXT: ; return to shader part epilog %sub = sub i64 %reg, -4294967296 ; 0xffffffff00000000 ret i64 %sub @@ -61,9 +56,7 @@ define i64 @v_sub_i64_const_low_bits_known0_0(i64 %reg) { ; GFX9-LABEL: v_sub_i64_const_low_bits_known0_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, 0xfffc0000 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v1, 0xfffc0000, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %sub = sub i64 %reg, 1125899906842624 ; (1 << 50) ret i64 %sub @@ -73,8 +66,7 @@ define i64 
@v_sub_i64_const_low_bits_known0_1(i64 %reg) { ; GFX9-LABEL: v_sub_i64_const_low_bits_known0_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: v_add_u32_e32 v1, -1, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %sub = sub i64 %reg, 4294967296 ; (1 << 32) ret i64 %sub @@ -84,8 +76,7 @@ define i64 @v_sub_i64_const_low_bits_known0_2(i64 %reg) { ; GFX9-LABEL: v_sub_i64_const_low_bits_known0_2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -2, v1, vcc +; GFX9-NEXT: v_add_u32_e32 v1, -2, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %sub = sub i64 %reg, 8589934592 ; (1 << 33) ret i64 %sub @@ -95,9 +86,7 @@ define i64 @v_sub_i64_const_low_bits_known0_3(i64 %reg) { ; GFX9-LABEL: v_sub_i64_const_low_bits_known0_3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %sub = sub i64 %reg, -9223372036854775808 ; (1 << 63) ret i64 %sub @@ -107,8 +96,7 @@ define i64 @v_sub_i64_const_low_bits_known0_4(i64 %reg) { ; GFX9-LABEL: v_sub_i64_const_low_bits_known0_4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc +; GFX9-NEXT: v_add_u32_e32 v1, 1, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %sub = sub i64 %reg, -4294967296 ; 0xffffffff00000000 ret i64 %sub @@ -139,10 +127,8 @@ define <2 x i64> @v_sub_v2i64_splat_const_low_bits_known0_0(<2 x i64> %reg) { ; GFX9-LABEL: v_sub_v2i64_splat_const_low_bits_known0_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, 
v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX9-NEXT: v_add_u32_e32 v1, -1, v1 +; GFX9-NEXT: v_add_u32_e32 v3, -1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %sub = sub <2 x i64> %reg, <i64 4294967296, i64 4294967296> ; (1 << 32) ret <2 x i64> %sub @@ -152,10 +138,8 @@ define <2 x i64> @v_sub_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> %reg) { ; GFX9-LABEL: v_sub_v2i64_nonsplat_const_low_bits_known0_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, -2, v3, vcc +; GFX9-NEXT: v_add_u32_e32 v1, -1, v1 +; GFX9-NEXT: v_add_u32_e32 v3, -2, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %sub = sub <2 x i64> %reg, <i64 4294967296, i64 8589934592> ; (1 << 32), (1 << 33) ret <2 x i64> %sub @@ -164,10 +148,8 @@ define <2 x i64> @v_sub_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> %reg) { define amdgpu_ps <2 x i64> @s_sub_v2i64_splat_const_low_bits_known0_0(<2 x i64> inreg %reg) { ; GFX9-LABEL: s_sub_v2i64_splat_const_low_bits_known0_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, 0 -; GFX9-NEXT: s_addc_u32 s1, s1, -1 -; GFX9-NEXT: s_add_u32 s2, s2, 0 -; GFX9-NEXT: s_addc_u32 s3, s3, -1 +; GFX9-NEXT: s_add_i32 s1, s1, -1 +; GFX9-NEXT: s_add_i32 s3, s3, -1 ; GFX9-NEXT: ; return to shader part epilog %sub = sub <2 x i64> %reg, <i64 4294967296, i64 4294967296> ; (1 << 32) ret <2 x i64> %sub @@ -176,10 +158,8 @@ define amdgpu_ps <2 x i64> @s_sub_v2i64_splat_const_low_bits_known0_0(<2 x i64> define amdgpu_ps <2 x i64> @s_sub_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> inreg %reg) { ; GFX9-LABEL: s_sub_v2i64_nonsplat_const_low_bits_known0_0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_add_u32 s0, s0, 0 -; GFX9-NEXT: s_addc_u32 s1, s1, -1 -; GFX9-NEXT: s_add_u32 s2, s2, 0 -; GFX9-NEXT: s_addc_u32 s3, s3, -2 +; GFX9-NEXT: s_add_i32 s1, s1, -1 +; GFX9-NEXT: s_add_i32 s3, s3, 
-2 ; GFX9-NEXT: ; return to shader part epilog %sub = sub <2 x i64> %reg, <i64 4294967296, i64 8589934592> ; (1 << 32), (1 << 33) ret <2 x i64> %sub