From 0443398b73f18791598db1bf6ab2274a46ac649f Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 18 Jan 2024 14:44:03 +0700 Subject: [PATCH] AMDGPU: Handle gfx950 global_load_lds_* instructions Define global_load_lds_dwordx3 and global_load_lds_dwordx4. Oddly it seems dwordx2 was skipped. --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 2 +- .../AMDGPU/AMDGPUInstructionSelector.cpp | 10 ++ llvm/lib/Target/AMDGPU/FLATInstructions.td | 9 ++ llvm/lib/Target/AMDGPU/GCNSubtarget.h | 7 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 10 ++ .../llvm.amdgcn.global.load.lds.gfx950.ll | 137 ++++++++++++++++++ llvm/test/MC/AMDGPU/gfx950_asm_features.s | 37 +++++ llvm/test/MC/Disassembler/AMDGPU/gfx950.txt | 25 ++++ 8 files changed, 236 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.gfx950.ll create mode 100644 llvm/test/MC/AMDGPU/gfx950_asm_features.s create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx950.txt diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 15f33cdbf92e6e..f43ab50d2ea441 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2452,7 +2452,7 @@ class AMDGPUGlobalLoadLDS : [], [LLVMQualPointerType<1>, // Base global pointer to load from LLVMQualPointerType<3>, // LDS base pointer to store to - llvm_i32_ty, // Data byte size: 1/2/4 + llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950) llvm_i32_ty, // imm offset (applied to both global and LDS address) llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = sc0, // bit 1 = sc1, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 13de93e829fab2..a6ef0069f134bd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3329,6 +3329,16 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) 
const{ case 4: Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD; break; + case 12: + if (!Subtarget->hasLDSLoadB96_B128()) + return false; + Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3; + break; + case 16: + if (!Subtarget->hasLDSLoadB96_B128()) + return false; + Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4; + break; } MachineBasicBlock *MBB = MI.getParent(); diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index db74372e9db452..861fcf017d9e4d 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -934,6 +934,11 @@ defm GLOBAL_LOAD_LDS_USHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_usho defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sshort">; defm GLOBAL_LOAD_LDS_DWORD : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dword">; +let SubtargetPredicate = HasGFX950Insts in { +defm GLOBAL_LOAD_LDS_DWORDX3 : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwordx3">; +defm GLOBAL_LOAD_LDS_DWORDX4 : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwordx4">; +} + let SubtargetPredicate = isGFX12Plus in { defm GLOBAL_ATOMIC_COND_SUB_U32 : FLAT_Global_Atomic_Pseudo <"global_atomic_cond_sub_u32", VGPR_32, i32>; defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : FLAT_Global_Atomic_Pseudo <"global_atomic_ordered_add_b64", VReg_64, i64>; @@ -1980,6 +1985,10 @@ defm GLOBAL_LOAD_LDS_USHORT : FLAT_Real_AllAddr_LDS <0x028, 0x12>; defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Real_AllAddr_LDS <0x029, 0x13>; defm GLOBAL_LOAD_LDS_DWORD : FLAT_Real_AllAddr_LDS <0x02a, 0x14>; +defm GLOBAL_LOAD_LDS_DWORDX3 : FLAT_Real_AllAddr_LDS <0x07e, 0x07e>; +defm GLOBAL_LOAD_LDS_DWORDX4 : FLAT_Real_AllAddr_LDS <0x07d, 0x07d>; + + defm GLOBAL_ATOMIC_SWAP : FLAT_Global_Real_Atomics_vi <0x40>; defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Global_Real_Atomics_vi <0x41>; defm GLOBAL_ATOMIC_ADD : FLAT_Global_Real_Atomics_vi <0x42>; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 
4a6efe533230b1..f3f96940c1f44b 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1289,6 +1289,13 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, // hasGFX940Insts and hasGFX90AInsts are also true. bool hasGFX950Insts() const { return GFX950Insts; } + /// Returns true if the target supports + /// global_load_lds_dwordx3/global_load_lds_dwordx4 or + /// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit. + bool hasLDSLoadB96_B128() const { + return hasGFX950Insts(); + } + bool hasSALUFloatInsts() const { return HasSALUFloatInsts; } bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index ad89812558d25c..0f7764906527d0 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -9894,6 +9894,16 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, case 4: Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD; break; + case 12: + if (!Subtarget->hasLDSLoadB96_B128()) + return SDValue(); + Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3; + break; + case 16: + if (!Subtarget->hasLDSLoadB96_B128()) + return SDValue(); + Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4; + break; } auto *M = cast(Op); diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.gfx950.ll new file mode 100644 index 00000000000000..b7819ea0431588 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.gfx950.ll @@ -0,0 +1,137 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-GISEL %s + +declare void @llvm.amdgcn.global.load.lds(ptr addrspace(1) 
nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux) + +;--------------------------------------------------------------------- +; dwordx3 +;--------------------------------------------------------------------- + +define amdgpu_ps void @global_load_lds_dwordx3_vaddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr) { +; GFX950-SDAG-LABEL: global_load_lds_dwordx3_vaddr: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_readfirstlane_b32 s0, v2 +; GFX950-SDAG-NEXT: s_mov_b32 m0, s0 +; GFX950-SDAG-NEXT: s_nop 0 +; GFX950-SDAG-NEXT: global_load_lds_dwordx3 v[0:1], off offset:16 sc0 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: global_load_lds_dwordx3_vaddr: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_readfirstlane_b32 m0, v2 +; GFX950-GISEL-NEXT: s_nop 4 +; GFX950-GISEL-NEXT: global_load_lds_dwordx3 v[0:1], off offset:16 sc0 +; GFX950-GISEL-NEXT: s_endpgm + call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 12, i32 16, i32 1) + ret void +} + +define amdgpu_ps void @global_load_lds_dwordx3_saddr(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) nocapture %lptr) { +; GFX950-SDAG-LABEL: global_load_lds_dwordx3_saddr: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_mov_b32 m0, s2 +; GFX950-SDAG-NEXT: s_nop 0 +; GFX950-SDAG-NEXT: global_load_lds_dwordx3 v1, s[0:1] offset:32 nt +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: global_load_lds_dwordx3_saddr: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_readfirstlane_b32 m0, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-GISEL-NEXT: s_nop 3 +; GFX950-GISEL-NEXT: global_load_lds_dwordx3 v0, s[0:1] offset:32 nt +; GFX950-GISEL-NEXT: s_endpgm + call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 12, i32 32, i32 2) + ret void +} + +define amdgpu_ps void 
@global_load_lds_dwordx3_saddr_and_vaddr(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) nocapture %lptr, i32 %voffset) { +; GFX950-SDAG-LABEL: global_load_lds_dwordx3_saddr_and_vaddr: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX950-SDAG-NEXT: s_mov_b32 m0, s2 +; GFX950-SDAG-NEXT: s_nop 0 +; GFX950-SDAG-NEXT: global_load_lds_dwordx3 v1, s[0:1] offset:48 sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: global_load_lds_dwordx3_saddr_and_vaddr: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_readfirstlane_b32 m0, v0 +; GFX950-GISEL-NEXT: s_nop 4 +; GFX950-GISEL-NEXT: global_load_lds_dwordx3 v1, s[0:1] offset:48 sc1 +; GFX950-GISEL-NEXT: s_endpgm + %voffset.64 = zext i32 %voffset to i64 + %gep = getelementptr i8, ptr addrspace(1) %gptr, i64 %voffset.64 + call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gep, ptr addrspace(3) %lptr, i32 12, i32 48, i32 16) + ret void +} + +;--------------------------------------------------------------------- +; dwordx4 +;--------------------------------------------------------------------- + +define amdgpu_ps void @global_load_lds_dwordx4_vaddr(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr) { +; GFX950-SDAG-LABEL: global_load_lds_dwordx4_vaddr: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_readfirstlane_b32 s0, v2 +; GFX950-SDAG-NEXT: s_mov_b32 m0, s0 +; GFX950-SDAG-NEXT: s_nop 0 +; GFX950-SDAG-NEXT: global_load_lds_dwordx4 v[0:1], off offset:16 sc0 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: global_load_lds_dwordx4_vaddr: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_readfirstlane_b32 m0, v2 +; GFX950-GISEL-NEXT: s_nop 4 +; GFX950-GISEL-NEXT: global_load_lds_dwordx4 v[0:1], off offset:16 sc0 +; GFX950-GISEL-NEXT: s_endpgm + call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 16, i32 16, i32 1) + ret void +} + +define amdgpu_ps void @global_load_lds_dwordx4_saddr(ptr addrspace(1) nocapture inreg 
%gptr, ptr addrspace(3) nocapture %lptr) { +; GFX950-SDAG-LABEL: global_load_lds_dwordx4_saddr: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_mov_b32 m0, s2 +; GFX950-SDAG-NEXT: s_nop 0 +; GFX950-SDAG-NEXT: global_load_lds_dwordx4 v1, s[0:1] offset:32 nt +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: global_load_lds_dwordx4_saddr: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_readfirstlane_b32 m0, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-GISEL-NEXT: s_nop 3 +; GFX950-GISEL-NEXT: global_load_lds_dwordx4 v0, s[0:1] offset:32 nt +; GFX950-GISEL-NEXT: s_endpgm + call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) %lptr, i32 16, i32 32, i32 2) + ret void +} + +define amdgpu_ps void @global_load_lds_dwordx4_saddr_and_vaddr(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) nocapture %lptr, i32 %voffset) { +; GFX950-SDAG-LABEL: global_load_lds_dwordx4_saddr_and_vaddr: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX950-SDAG-NEXT: s_mov_b32 m0, s2 +; GFX950-SDAG-NEXT: s_nop 0 +; GFX950-SDAG-NEXT: global_load_lds_dwordx4 v1, s[0:1] offset:48 sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: global_load_lds_dwordx4_saddr_and_vaddr: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_readfirstlane_b32 m0, v0 +; GFX950-GISEL-NEXT: s_nop 4 +; GFX950-GISEL-NEXT: global_load_lds_dwordx4 v1, s[0:1] offset:48 sc1 +; GFX950-GISEL-NEXT: s_endpgm + %voffset.64 = zext i32 %voffset to i64 + %gep = getelementptr i8, ptr addrspace(1) %gptr, i64 %voffset.64 + call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gep, ptr addrspace(3) %lptr, i32 16, i32 48, i32 16) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX950: {{.*}} diff --git a/llvm/test/MC/AMDGPU/gfx950_asm_features.s b/llvm/test/MC/AMDGPU/gfx950_asm_features.s new file mode 100644 index 00000000000000..405d152c93d867 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx950_asm_features.s @@ -0,0 +1,37 @@ +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck --check-prefix=GFX950 --strict-whitespace %s +// xUN: not llvm-mc -triple=amdgcn -mcpu=gfx940 %s 2>&1 | FileCheck --check-prefixes=NOT-GFX950,GFX940 --implicit-check-not=error: %s +// xUN: not llvm-mc -triple=amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefixes=NOT-GFX950,GFX90A --implicit-check-not=error: %s +// xUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=NOT-GFX950,GFX10 --implicit-check-not=error: %s + +// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: instruction not supported on this GPU +// GFX950: global_load_lds_dwordx3 v[2:3], off ; encoding: [0x00,0x80,0xf8,0xdd,0x02,0x00,0x7f,0x00] + +global_load_lds_dwordx3 v[2:3], off + +// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: +// GFX950: global_load_lds_dwordx3 v[2:3], off sc0 nt sc1 ; encoding: [0x00,0x80,0xfb,0xdf,0x02,0x00,0x7f,0x00] +global_load_lds_dwordx3 v[2:3], off sc0 nt sc1 + +// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: +// GFX950: global_load_lds_dwordx3 v[2:3], off offset:4 ; encoding: [0x04,0x80,0xf8,0xdd,0x02,0x00,0x7f,0x00] +global_load_lds_dwordx3 v[2:3], off offset:4 + +// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: +// GFX950: global_load_lds_dwordx3 v2, s[4:5] offset:4 ; encoding: [0x04,0x80,0xf8,0xdd,0x02,0x00,0x04,0x00] +global_load_lds_dwordx3 v2, s[4:5] offset:4 + +// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: instruction not supported on this GPU +// GFX950: global_load_lds_dwordx4 v[2:3], off ; encoding: [0x00,0x80,0xf4,0xdd,0x02,0x00,0x7f,0x00] +global_load_lds_dwordx4 v[2:3], off + +// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: +// GFX950: global_load_lds_dwordx4 v[2:3], off sc0 nt 
sc1 ; encoding: [0x00,0x80,0xf7,0xdf,0x02,0x00,0x7f,0x00] +global_load_lds_dwordx4 v[2:3], off sc0 nt sc1 + +// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: +// GFX950: global_load_lds_dwordx4 v[2:3], off offset:4 ; encoding: [0x04,0x80,0xf4,0xdd,0x02,0x00,0x7f,0x00] +global_load_lds_dwordx4 v[2:3], off offset:4 + +// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: +// GFX950: global_load_lds_dwordx4 v2, s[4:5] offset:4 ; encoding: [0x04,0x80,0xf4,0xdd,0x02,0x00,0x04,0x00] +global_load_lds_dwordx4 v2, s[4:5] offset:4 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950.txt new file mode 100644 index 00000000000000..a9f28332860ee5 --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950.txt @@ -0,0 +1,25 @@ +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding -disassemble %s | FileCheck -check-prefix=GFX950 %s + +# GFX950: global_load_lds_dwordx3 v2, s[4:5] offset:4 ; encoding: [0x04,0x80,0xf8,0xdd,0x02,0x00,0x04,0x00] +0x04,0x80,0xf8,0xdd,0x02,0x00,0x04,0x00 + +# GFX950: global_load_lds_dwordx3 v[2:3], off ; encoding: [0x00,0x80,0xf8,0xdd,0x02,0x00,0x7f,0x00] +0x00,0x80,0xf8,0xdd,0x02,0x00,0x7f,0x00 + +# GFX950: global_load_lds_dwordx3 v[2:3], off offset:4 ; encoding: [0x04,0x80,0xf8,0xdd,0x02,0x00,0x7f,0x00] +0x04,0x80,0xf8,0xdd,0x02,0x00,0x7f,0x00 + +# GFX950: global_load_lds_dwordx3 v[2:3], off sc0 nt sc1 ; encoding: [0x00,0x80,0xfb,0xdf,0x02,0x00,0x7f,0x00] +0x00,0x80,0xfb,0xdf,0x02,0x00,0x7f,0x00 + +# GFX950: global_load_lds_dwordx4 v2, s[4:5] offset:4 ; encoding: [0x04,0x80,0xf4,0xdd,0x02,0x00,0x04,0x00] +0x04,0x80,0xf4,0xdd,0x02,0x00,0x04,0x00 + +# GFX950: global_load_lds_dwordx4 v[2:3], off ; encoding: [0x00,0x80,0xf4,0xdd,0x02,0x00,0x7f,0x00] +0x00,0x80,0xf4,0xdd,0x02,0x00,0x7f,0x00 + +# GFX950: global_load_lds_dwordx4 v[2:3], off offset:4 ; encoding: [0x04,0x80,0xf4,0xdd,0x02,0x00,0x7f,0x00] +0x04,0x80,0xf4,0xdd,0x02,0x00,0x7f,0x00 + +# GFX950: global_load_lds_dwordx4 v[2:3], off sc0 nt sc1 ; encoding: 
[0x00,0x80,0xf7,0xdf,0x02,0x00,0x7f,0x00] +0x00,0x80,0xf7,0xdf,0x02,0x00,0x7f,0x00