Skip to content

Commit

Permalink
AMDGPU: Handle gfx950 global_load_lds_* instructions
Browse files Browse the repository at this point in the history
Define global_load_lds_dwordx3 and global_load_lds_dwordx4.
Oddly it seems dwordx2 was skipped.
  • Loading branch information
arsenm committed Nov 18, 2024
1 parent 330427f commit 0443398
Show file tree
Hide file tree
Showing 8 changed files with 236 additions and 1 deletion.
2 changes: 1 addition & 1 deletion llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -2452,7 +2452,7 @@ class AMDGPUGlobalLoadLDS :
[],
[LLVMQualPointerType<1>, // Base global pointer to load from
LLVMQualPointerType<3>, // LDS base pointer to store to
llvm_i32_ty, // Data byte size: 1/2/4
llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950)
llvm_i32_ty, // imm offset (applied to both global and LDS address)
llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = sc0,
// bit 1 = sc1,
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3329,6 +3329,16 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
case 4:
Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
break;
case 12:
if (!Subtarget->hasLDSLoadB96_B128())
return false;
Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
break;
case 16:
if (!Subtarget->hasLDSLoadB96_B128())
return false;
Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
break;
}

MachineBasicBlock *MBB = MI.getParent();
Expand Down
9 changes: 9 additions & 0 deletions llvm/lib/Target/AMDGPU/FLATInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -934,6 +934,11 @@ defm GLOBAL_LOAD_LDS_USHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_usho
defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sshort">;
defm GLOBAL_LOAD_LDS_DWORD : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dword">;

// dwordx3 and dwordx4 variants of the global_load_lds_* pseudos. They are
// gated on HasGFX950Insts, unlike the narrower variants defined above.
let SubtargetPredicate = HasGFX950Insts in {
defm GLOBAL_LOAD_LDS_DWORDX3 : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwordx3">;
defm GLOBAL_LOAD_LDS_DWORDX4 : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwordx4">;
}

let SubtargetPredicate = isGFX12Plus in {
defm GLOBAL_ATOMIC_COND_SUB_U32 : FLAT_Global_Atomic_Pseudo <"global_atomic_cond_sub_u32", VGPR_32, i32>;
defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : FLAT_Global_Atomic_Pseudo <"global_atomic_ordered_add_b64", VReg_64, i64>;
Expand Down Expand Up @@ -1980,6 +1985,10 @@ defm GLOBAL_LOAD_LDS_USHORT : FLAT_Real_AllAddr_LDS <0x028, 0x12>;
defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Real_AllAddr_LDS <0x029, 0x13>;
defm GLOBAL_LOAD_LDS_DWORD : FLAT_Real_AllAddr_LDS <0x02a, 0x14>;

// Real encodings for the dwordx3/dwordx4 LDS loads. Note that the values are
// not in size order: dwordx4 uses 0x07d while dwordx3 uses 0x07e.
defm GLOBAL_LOAD_LDS_DWORDX3 : FLAT_Real_AllAddr_LDS <0x07e, 0x07e>;
defm GLOBAL_LOAD_LDS_DWORDX4 : FLAT_Real_AllAddr_LDS <0x07d, 0x07d>;


defm GLOBAL_ATOMIC_SWAP : FLAT_Global_Real_Atomics_vi <0x40>;
defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Global_Real_Atomics_vi <0x41>;
defm GLOBAL_ATOMIC_ADD : FLAT_Global_Real_Atomics_vi <0x42>;
Expand Down
7 changes: 7 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -1289,6 +1289,13 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
// hasGFX940Insts and hasGFX90AInsts are also true.
bool hasGFX950Insts() const { return GFX950Insts; }

/// Whether the dwordx3 and dwordx4 load-to-LDS forms are available, i.e.
/// global_load_lds_dwordx3/global_load_lds_dwordx4 and
/// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit. Currently this
/// is exactly the set of targets with the gfx950 instructions.
bool hasLDSLoadB96_B128() const { return hasGFX950Insts(); }

bool hasSALUFloatInsts() const { return HasSALUFloatInsts; }

bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9894,6 +9894,16 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
case 4:
Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
break;
case 12:
if (!Subtarget->hasLDSLoadB96_B128())
return SDValue();
Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
break;
case 16:
if (!Subtarget->hasLDSLoadB96_B128())
return SDValue();
Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
break;
}

auto *M = cast<MemSDNode>(Op);
Expand Down
137 changes: 137 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.gfx950.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-GISEL %s

declare void @llvm.amdgcn.global.load.lds(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux)

;---------------------------------------------------------------------
; dwordx3
;---------------------------------------------------------------------

define amdgpu_ps void @global_load_lds_dwordx3_vaddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr) {
; GFX950-SDAG-LABEL: global_load_lds_dwordx3_vaddr:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: v_readfirstlane_b32 s0, v2
; GFX950-SDAG-NEXT: s_mov_b32 m0, s0
; GFX950-SDAG-NEXT: s_nop 0
; GFX950-SDAG-NEXT: global_load_lds_dwordx3 v[0:1], off offset:16 sc0
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: global_load_lds_dwordx3_vaddr:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: v_readfirstlane_b32 m0, v2
; GFX950-GISEL-NEXT: s_nop 4
; GFX950-GISEL-NEXT: global_load_lds_dwordx3 v[0:1], off offset:16 sc0
; GFX950-GISEL-NEXT: s_endpgm
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 12, i32 16, i32 1)
ret void
}

define amdgpu_ps void @global_load_lds_dwordx3_saddr(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) nocapture %lptr) {
; GFX950-SDAG-LABEL: global_load_lds_dwordx3_saddr:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: v_readfirstlane_b32 s2, v0
; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX950-SDAG-NEXT: s_mov_b32 m0, s2
; GFX950-SDAG-NEXT: s_nop 0
; GFX950-SDAG-NEXT: global_load_lds_dwordx3 v1, s[0:1] offset:32 nt
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: global_load_lds_dwordx3_saddr:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: v_readfirstlane_b32 m0, v0
; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-GISEL-NEXT: s_nop 3
; GFX950-GISEL-NEXT: global_load_lds_dwordx3 v0, s[0:1] offset:32 nt
; GFX950-GISEL-NEXT: s_endpgm
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 12, i32 32, i32 2)
ret void
}

define amdgpu_ps void @global_load_lds_dwordx3_saddr_and_vaddr(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) nocapture %lptr, i32 %voffset) {
; GFX950-SDAG-LABEL: global_load_lds_dwordx3_saddr_and_vaddr:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: v_readfirstlane_b32 s2, v0
; GFX950-SDAG-NEXT: s_mov_b32 m0, s2
; GFX950-SDAG-NEXT: s_nop 0
; GFX950-SDAG-NEXT: global_load_lds_dwordx3 v1, s[0:1] offset:48 sc1
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: global_load_lds_dwordx3_saddr_and_vaddr:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: v_readfirstlane_b32 m0, v0
; GFX950-GISEL-NEXT: s_nop 4
; GFX950-GISEL-NEXT: global_load_lds_dwordx3 v1, s[0:1] offset:48 sc1
; GFX950-GISEL-NEXT: s_endpgm
%voffset.64 = zext i32 %voffset to i64
%gep = getelementptr i8, ptr addrspace(1) %gptr, i64 %voffset.64
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gep, ptr addrspace(3) %lptr, i32 12, i32 48, i32 16)
ret void
}

;---------------------------------------------------------------------
; dwordx4
;---------------------------------------------------------------------

define amdgpu_ps void @global_load_lds_dwordx4_vaddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr) {
; GFX950-SDAG-LABEL: global_load_lds_dwordx4_vaddr:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: v_readfirstlane_b32 s0, v2
; GFX950-SDAG-NEXT: s_mov_b32 m0, s0
; GFX950-SDAG-NEXT: s_nop 0
; GFX950-SDAG-NEXT: global_load_lds_dwordx4 v[0:1], off offset:16 sc0
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: global_load_lds_dwordx4_vaddr:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: v_readfirstlane_b32 m0, v2
; GFX950-GISEL-NEXT: s_nop 4
; GFX950-GISEL-NEXT: global_load_lds_dwordx4 v[0:1], off offset:16 sc0
; GFX950-GISEL-NEXT: s_endpgm
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 16, i32 16, i32 1)
ret void
}

define amdgpu_ps void @global_load_lds_dwordx4_saddr(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) nocapture %lptr) {
; GFX950-SDAG-LABEL: global_load_lds_dwordx4_saddr:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: v_readfirstlane_b32 s2, v0
; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX950-SDAG-NEXT: s_mov_b32 m0, s2
; GFX950-SDAG-NEXT: s_nop 0
; GFX950-SDAG-NEXT: global_load_lds_dwordx4 v1, s[0:1] offset:32 nt
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: global_load_lds_dwordx4_saddr:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: v_readfirstlane_b32 m0, v0
; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-GISEL-NEXT: s_nop 3
; GFX950-GISEL-NEXT: global_load_lds_dwordx4 v0, s[0:1] offset:32 nt
; GFX950-GISEL-NEXT: s_endpgm
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 16, i32 32, i32 2)
ret void
}

define amdgpu_ps void @global_load_lds_dwordx4_saddr_and_vaddr(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) nocapture %lptr, i32 %voffset) {
; GFX950-SDAG-LABEL: global_load_lds_dwordx4_saddr_and_vaddr:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: v_readfirstlane_b32 s2, v0
; GFX950-SDAG-NEXT: s_mov_b32 m0, s2
; GFX950-SDAG-NEXT: s_nop 0
; GFX950-SDAG-NEXT: global_load_lds_dwordx4 v1, s[0:1] offset:48 sc1
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: global_load_lds_dwordx4_saddr_and_vaddr:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: v_readfirstlane_b32 m0, v0
; GFX950-GISEL-NEXT: s_nop 4
; GFX950-GISEL-NEXT: global_load_lds_dwordx4 v1, s[0:1] offset:48 sc1
; GFX950-GISEL-NEXT: s_endpgm
%voffset.64 = zext i32 %voffset to i64
%gep = getelementptr i8, ptr addrspace(1) %gptr, i64 %voffset.64
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gep, ptr addrspace(3) %lptr, i32 16, i32 48, i32 16)
ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX950: {{.*}}
37 changes: 37 additions & 0 deletions llvm/test/MC/AMDGPU/gfx950_asm_features.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
// RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck --check-prefix=GFX950 --strict-whitespace %s
// xUN: not llvm-mc -triple=amdgcn -mcpu=gfx940 %s 2>&1 | FileCheck --check-prefixes=NOT-GFX950,GFX940 --implicit-check-not=error: %s
// xUN: not llvm-mc -triple=amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefixes=NOT-GFX950,GFX90A --implicit-check-not=error: %s
// xUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=NOT-GFX950,GFX10 --implicit-check-not=error: %s

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: instruction not supported on this GPU
// GFX950: global_load_lds_dwordx3 v[2:3], off ; encoding: [0x00,0x80,0xf8,0xdd,0x02,0x00,0x7f,0x00]

global_load_lds_dwordx3 v[2:3], off

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: global_load_lds_dwordx3 v[2:3], off sc0 nt sc1 ; encoding: [0x00,0x80,0xfb,0xdf,0x02,0x00,0x7f,0x00]
global_load_lds_dwordx3 v[2:3], off sc0 nt sc1

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: global_load_lds_dwordx3 v[2:3], off offset:4 ; encoding: [0x04,0x80,0xf8,0xdd,0x02,0x00,0x7f,0x00]
global_load_lds_dwordx3 v[2:3], off offset:4

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: global_load_lds_dwordx3 v2, s[4:5] offset:4 ; encoding: [0x04,0x80,0xf8,0xdd,0x02,0x00,0x04,0x00]
global_load_lds_dwordx3 v2, s[4:5] offset:4

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: instruction not supported on this GPU
// GFX950: global_load_lds_dwordx4 v[2:3], off ; encoding: [0x00,0x80,0xf4,0xdd,0x02,0x00,0x7f,0x00]
global_load_lds_dwordx4 v[2:3], off

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: global_load_lds_dwordx4 v[2:3], off sc0 nt sc1 ; encoding: [0x00,0x80,0xf7,0xdf,0x02,0x00,0x7f,0x00]
global_load_lds_dwordx4 v[2:3], off sc0 nt sc1

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: global_load_lds_dwordx4 v[2:3], off offset:4 ; encoding: [0x04,0x80,0xf4,0xdd,0x02,0x00,0x7f,0x00]
global_load_lds_dwordx4 v[2:3], off offset:4

// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error:
// GFX950: global_load_lds_dwordx4 v2, s[4:5] offset:4 ; encoding: [0x04,0x80,0xf4,0xdd,0x02,0x00,0x04,0x00]
global_load_lds_dwordx4 v2, s[4:5] offset:4
25 changes: 25 additions & 0 deletions llvm/test/MC/Disassembler/AMDGPU/gfx950.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding -disassemble %s | FileCheck -check-prefix=GFX950 %s

# GFX950: global_load_lds_dwordx3 v2, s[4:5] offset:4 ; encoding: [0x04,0x80,0xf8,0xdd,0x02,0x00,0x04,0x00]
0x04,0x80,0xf8,0xdd,0x02,0x00,0x04,0x00

# GFX950: global_load_lds_dwordx3 v[2:3], off ; encoding: [0x00,0x80,0xf8,0xdd,0x02,0x00,0x7f,0x00]
0x00,0x80,0xf8,0xdd,0x02,0x00,0x7f,0x00

# GFX950: global_load_lds_dwordx3 v[2:3], off offset:4 ; encoding: [0x04,0x80,0xf8,0xdd,0x02,0x00,0x7f,0x00]
0x04,0x80,0xf8,0xdd,0x02,0x00,0x7f,0x00

# GFX950: global_load_lds_dwordx3 v[2:3], off sc0 nt sc1 ; encoding: [0x00,0x80,0xfb,0xdf,0x02,0x00,0x7f,0x00]
0x00,0x80,0xfb,0xdf,0x02,0x00,0x7f,0x00

# GFX950: global_load_lds_dwordx4 v2, s[4:5] offset:4 ; encoding: [0x04,0x80,0xf4,0xdd,0x02,0x00,0x04,0x00]
0x04,0x80,0xf4,0xdd,0x02,0x00,0x04,0x00

# GFX950: global_load_lds_dwordx4 v[2:3], off ; encoding: [0x00,0x80,0xf4,0xdd,0x02,0x00,0x7f,0x00]
0x00,0x80,0xf4,0xdd,0x02,0x00,0x7f,0x00

# GFX950: global_load_lds_dwordx4 v[2:3], off offset:4 ; encoding: [0x04,0x80,0xf4,0xdd,0x02,0x00,0x7f,0x00]
0x04,0x80,0xf4,0xdd,0x02,0x00,0x7f,0x00

# GFX950: global_load_lds_dwordx4 v[2:3], off sc0 nt sc1 ; encoding: [0x00,0x80,0xf7,0xdf,0x02,0x00,0x7f,0x00]
0x00,0x80,0xf7,0xdf,0x02,0x00,0x7f,0x00

0 comments on commit 0443398

Please sign in to comment.