Skip to content

Commit

Permalink
AMDGPU: Reduce 64-bit add width if low bits are known 0 (llvm#122049)
Browse files Browse the repository at this point in the history
If one of the inputs has all 0 bits in its low 32 bits, the low half of
the add cannot carry and we can just pass through the low half of the
other input unchanged.

Add case: https://alive2.llvm.org/ce/z/TNc7hf
Sub case: https://alive2.llvm.org/ce/z/AjH2-J

We could do this in the general case with computeKnownBits,
but add is so common this could be potentially expensive for
something which will fire infrequently.

One potential concern is this could break the 64-bit add
we expect to see for addressing mode matching, but these
constants shouldn't appear often in addressing expressions.
One test for large offset expressions changes but isn't worse.

Fixes ROCm#237
  • Loading branch information
arsenm authored and shenhanc78 committed Jan 8, 2025
1 parent ab452fe commit c4e3251
Show file tree
Hide file tree
Showing 6 changed files with 177 additions and 183 deletions.
47 changes: 47 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13985,6 +13985,43 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
return Accum;
}

/// Narrow a 64-bit add/sub with a constant right-hand side to a 32-bit
/// operation on the high half when the low 32 bits of the constant are zero.
///
/// With a zero low half in the constant, the low half of the result is just
/// the low half of operand 0, and nothing carries into the high half, so the
/// carry chain can be dropped.
///
/// \param N   The add/sub node; its opcode and flags are reused for the
///            narrowed 32-bit node.
/// \param DCI Combiner state providing the SelectionDAG to build nodes in.
/// \returns The replacement i64 value, or an empty SDValue when operand 1 is
///          not a constant with at least 32 trailing zero bits.
SDValue
SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  const auto *ConstRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!ConstRHS)
    return SDValue();

  // TODO: Worth using computeKnownBits? Maybe expensive since it's so
  // common.
  const uint64_t Imm = ConstRHS->getZExtValue();
  if (countr_zero(Imm) < 32)
    return SDValue();

  // Avoid carry machinery if we know the low half of the add does not
  // contribute to the final result.
  //
  // add i64:x, K if K has at least 32 trailing zero bits
  //   => build_pair x.lo, (add x.hi, K.hi)
  //
  // Breaking the 64-bit add here with this strange constant is unlikely
  // to interfere with addressing mode patterns.
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  SDValue Src = N->getOperand(0);

  // High half: same opcode and flags as the original node, at 32 bits.
  SDValue SrcHi = getHiHalf64(Src, DAG);
  SDValue ImmHi = DAG.getConstant(Hi_32(Imm), DL, MVT::i32);
  SDValue NewHi = DAG.getNode(N->getOpcode(), DL, MVT::i32, SrcHi, ImmHi,
                              N->getFlags());

  // Low half: operand 0 truncated, passed through untouched.
  SDValue SrcLo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
  return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, SrcLo, NewHi);
}

// Collect the ultimate src of each of the mul node's operands, and confirm
// each operand is 8 bytes.
static std::optional<ByteProvider<SDValue>>
Expand Down Expand Up @@ -14261,6 +14298,11 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
return V;
}

if (VT == MVT::i64) {
if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
return Folded;
}

if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
(Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
SDValue TempNode(N, 0);
Expand Down Expand Up @@ -14446,6 +14488,11 @@ SDValue SITargetLowering::performSubCombine(SDNode *N,
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);

if (VT == MVT::i64) {
if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
return Folded;
}

if (VT != MVT::i32)
return SDValue();

Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,9 @@ class SITargetLowering final : public AMDGPUTargetLowering {
unsigned getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0, const SDNode *N1) const;
SDValue tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue foldAddSub64WithZeroLowBitsTo32(SDNode *N,
DAGCombinerInfo &DCI) const;

SDValue performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performAddCarrySubCarryCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
Expand Down
56 changes: 18 additions & 38 deletions llvm/test/CodeGen/AMDGPU/add64-low-32-bits-known-zero.ll
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@
define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_0(i64 inreg %reg) {
; GFX9-LABEL: s_add_i64_const_low_bits_known0_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s0, s0, 0
; GFX9-NEXT: s_addc_u32 s1, s1, 0x40000
; GFX9-NEXT: s_add_i32 s1, s1, 0x40000
; GFX9-NEXT: ; return to shader part epilog
%add = add i64 %reg, 1125899906842624 ; (1 << 50)
ret i64 %add
Expand All @@ -20,8 +19,7 @@ define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_0(i64 inreg %reg) {
define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_1(i64 inreg %reg) {
; GFX9-LABEL: s_add_i64_const_low_bits_known0_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s0, s0, 0
; GFX9-NEXT: s_addc_u32 s1, s1, 1
; GFX9-NEXT: s_add_i32 s1, s1, 1
; GFX9-NEXT: ; return to shader part epilog
%add = add i64 %reg, 4294967296 ; (1 << 32)
ret i64 %add
Expand All @@ -30,8 +28,7 @@ define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_1(i64 inreg %reg) {
define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_2(i64 inreg %reg) {
; GFX9-LABEL: s_add_i64_const_low_bits_known0_2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s0, s0, 0
; GFX9-NEXT: s_addc_u32 s1, s1, 2
; GFX9-NEXT: s_add_i32 s1, s1, 2
; GFX9-NEXT: ; return to shader part epilog
%add = add i64 %reg, 8589934592 ; (1 << 33)
ret i64 %add
Expand All @@ -40,8 +37,7 @@ define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_2(i64 inreg %reg) {
define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_3(i64 inreg %reg) {
; GFX9-LABEL: s_add_i64_const_low_bits_known0_3:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s0, s0, 0
; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000
; GFX9-NEXT: s_add_i32 s1, s1, 0x80000000
; GFX9-NEXT: ; return to shader part epilog
%add = add i64 %reg, -9223372036854775808 ; (1 << 63)
ret i64 %add
Expand All @@ -50,8 +46,7 @@ define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_3(i64 inreg %reg) {
define amdgpu_ps i64 @s_add_i64_const_low_bits_known0_4(i64 inreg %reg) {
; GFX9-LABEL: s_add_i64_const_low_bits_known0_4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s0, s0, 0
; GFX9-NEXT: s_addc_u32 s1, s1, -1
; GFX9-NEXT: s_add_i32 s1, s1, -1
; GFX9-NEXT: ; return to shader part epilog
%add = add i64 %reg, -4294967296 ; 0xffffffff00000000
ret i64 %add
Expand All @@ -61,9 +56,7 @@ define i64 @v_add_i64_const_low_bits_known0_0(i64 %reg) {
; GFX9-LABEL: v_add_i64_const_low_bits_known0_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, 0x40000
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; GFX9-NEXT: v_add_u32_e32 v1, 0x40000, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
%add = add i64 %reg, 1125899906842624 ; (1 << 50)
ret i64 %add
Expand All @@ -73,8 +66,7 @@ define i64 @v_add_i64_const_low_bits_known0_1(i64 %reg) {
; GFX9-LABEL: v_add_i64_const_low_bits_known0_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc
; GFX9-NEXT: v_add_u32_e32 v1, 1, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
%add = add i64 %reg, 4294967296 ; (1 << 32)
ret i64 %add
Expand All @@ -84,8 +76,7 @@ define i64 @v_add_i64_const_low_bits_known0_2(i64 %reg) {
; GFX9-LABEL: v_add_i64_const_low_bits_known0_2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-NEXT: v_add_u32_e32 v1, 2, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
%add = add i64 %reg, 8589934592 ; (1 << 33)
ret i64 %add
Expand All @@ -95,9 +86,7 @@ define i64 @v_add_i64_const_low_bits_known0_3(i64 %reg) {
; GFX9-LABEL: v_add_i64_const_low_bits_known0_3:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
%add = add i64 %reg, -9223372036854775808 ; (1 << 63)
ret i64 %add
Expand All @@ -107,8 +96,7 @@ define i64 @v_add_i64_const_low_bits_known0_4(i64 %reg) {
; GFX9-LABEL: v_add_i64_const_low_bits_known0_4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT: v_add_u32_e32 v1, -1, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
%add = add i64 %reg, -4294967296 ; 0xffffffff00000000
ret i64 %add
Expand Down Expand Up @@ -139,10 +127,8 @@ define <2 x i64> @v_add_v2i64_splat_const_low_bits_known0_0(<2 x i64> %reg) {
; GFX9-LABEL: v_add_v2i64_splat_const_low_bits_known0_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 1, v3, vcc
; GFX9-NEXT: v_add_u32_e32 v1, 1, v1
; GFX9-NEXT: v_add_u32_e32 v3, 1, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
%add = add <2 x i64> %reg, <i64 4294967296, i64 4294967296> ; (1 << 32)
ret <2 x i64> %add
Expand All @@ -152,10 +138,8 @@ define <2 x i64> @v_add_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> %reg) {
; GFX9-LABEL: v_add_v2i64_nonsplat_const_low_bits_known0_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 2, v3, vcc
; GFX9-NEXT: v_add_u32_e32 v1, 1, v1
; GFX9-NEXT: v_add_u32_e32 v3, 2, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
%add = add <2 x i64> %reg, <i64 4294967296, i64 8589934592> ; (1 << 32), (1 << 33)
ret <2 x i64> %add
Expand All @@ -164,10 +148,8 @@ define <2 x i64> @v_add_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> %reg) {
define amdgpu_ps <2 x i64> @s_add_v2i64_splat_const_low_bits_known0_0(<2 x i64> inreg %reg) {
; GFX9-LABEL: s_add_v2i64_splat_const_low_bits_known0_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s0, s0, 0
; GFX9-NEXT: s_addc_u32 s1, s1, 1
; GFX9-NEXT: s_add_u32 s2, s2, 0
; GFX9-NEXT: s_addc_u32 s3, s3, 1
; GFX9-NEXT: s_add_i32 s1, s1, 1
; GFX9-NEXT: s_add_i32 s3, s3, 1
; GFX9-NEXT: ; return to shader part epilog
%add = add <2 x i64> %reg, <i64 4294967296, i64 4294967296> ; (1 << 32)
ret <2 x i64> %add
Expand All @@ -176,10 +158,8 @@ define amdgpu_ps <2 x i64> @s_add_v2i64_splat_const_low_bits_known0_0(<2 x i64>
define amdgpu_ps <2 x i64> @s_add_v2i64_nonsplat_const_low_bits_known0_0(<2 x i64> inreg %reg) {
; GFX9-LABEL: s_add_v2i64_nonsplat_const_low_bits_known0_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s0, s0, 0
; GFX9-NEXT: s_addc_u32 s1, s1, 1
; GFX9-NEXT: s_add_u32 s2, s2, 0
; GFX9-NEXT: s_addc_u32 s3, s3, 2
; GFX9-NEXT: s_add_i32 s1, s1, 1
; GFX9-NEXT: s_add_i32 s3, s3, 2
; GFX9-NEXT: ; return to shader part epilog
%add = add <2 x i64> %reg, <i64 4294967296, i64 8589934592> ; (1 << 32), (1 << 33)
ret <2 x i64> %add
Expand Down
54 changes: 22 additions & 32 deletions llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
Original file line number Diff line number Diff line change
Expand Up @@ -668,37 +668,32 @@ define amdgpu_ps float @global_load_saddr_i8_offset_0xFFFFFFFF(ptr addrspace(1)
define amdgpu_ps float @global_load_saddr_i8_offset_0x100000000(ptr addrspace(1) inreg %sbase) {
; GFX9-LABEL: global_load_saddr_i8_offset_0x100000000:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_add_i32 s3, s3, 1
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_offset_0x100000000:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_add_i32 s3, s3, 1
; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_offset_0x100000000:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1]
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_add_i32 s3, s3, 1
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_0x100000000:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_mov_b32 s0, 0
; GFX12-SDAG-NEXT: s_mov_b32 s1, 1
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0
; GFX12-SDAG-NEXT: s_add_co_i32 s3, s3, 1
; GFX12-SDAG-NEXT: s_load_u8 s0, s[2:3], 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX12-SDAG-NEXT: ; return to shader part epilog
Expand Down Expand Up @@ -934,37 +929,32 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg0xFFFFFFFF(ptr addrspace(
define amdgpu_ps float @global_load_saddr_i8_offset_neg0x100000000(ptr addrspace(1) inreg %sbase) {
; GFX9-LABEL: global_load_saddr_i8_offset_neg0x100000000:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_add_i32 s3, s3, -1
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_offset_neg0x100000000:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_add_i32 s3, s3, -1
; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_offset_neg0x100000000:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1]
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_add_i32 s3, s3, -1
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-SDAG-LABEL: global_load_saddr_i8_offset_neg0x100000000:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_mov_b32 s0, 0
; GFX12-SDAG-NEXT: s_mov_b32 s1, -1
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
; GFX12-SDAG-NEXT: s_load_u8 s0, s[0:1], 0x0
; GFX12-SDAG-NEXT: s_add_co_i32 s3, s3, -1
; GFX12-SDAG-NEXT: s_load_u8 s0, s[2:3], 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
; GFX12-SDAG-NEXT: ; return to shader part epilog
Expand Down
Loading

0 comments on commit c4e3251

Please sign in to comment.