From a5454b4ff78dc094b6ee1e8fe60da6a2cd540cd5 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 5 Jan 2024 16:11:00 +0700 Subject: [PATCH] AMDGPU: Add v_mfma_ld_scale_b32 for gfx950 --- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 26 ++-- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 13 ++ llvm/lib/Target/AMDGPU/VOPInstructions.td | 6 +- llvm/test/MC/AMDGPU/mai-gfx950-err.s | 31 +++++ llvm/test/MC/AMDGPU/mai-gfx950.s | 117 ++++++++++++++++ .../MC/Disassembler/AMDGPU/gfx950_mai.txt | 131 ++++++++++++++---- llvm/test/tools/llvm-mca/AMDGPU/gfx950.s | 9 +- 7 files changed, 288 insertions(+), 45 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/mai-gfx950-err.s diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 2079b34d0448f4..d2024cf915874d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1991,13 +1991,14 @@ class getInsVOP3Base { dag base = getInsVOP3Base.ret; dag vop3pOpsel = (ins op_sel_hi0:$op_sel_hi); - dag vop3p_neg = (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi); + dag vop3p_neg = !if(HasNeg, (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi), (ins)); dag vop3pFields = !con(!if(HasOpSel, vop3pOpsel, (ins)), vop3p_neg); dag ret = !con(base, vop3pFields); @@ -2191,22 +2192,22 @@ class getAsmVOPDPart { // Returns the assembly string for the inputs and outputs of a VOP3P // instruction. -class getAsmVOP3P { - string dst = "$vdst"; - string src0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,"); + string dst = !if(HasDst, "$vdst"# !if(!gt(NumSrcArgs, 0), ",", ""), ""); + string src0 = !if(!eq(NumSrcArgs, 1), " $src0", " $src0,"); string src1 = !if(!eq(NumSrcArgs, 1), "", !if(!eq(NumSrcArgs, 2), " $src1", " $src1,")); string src2 = !if(!eq(NumSrcArgs, 3), " $src2", ""); - string mods = !if(HasModifiers, "$neg_lo$neg_hi", ""); + string mods = !if(HasNeg, "$neg_lo$neg_hi", ""); string clamp = !if(HasClamp, "$clamp", ""); string opsel = !if(HasOpSel, "$op_sel$op_sel_hi", ""); // Each modifier is printed as an array of bits for each operand, so // all operands are printed as part of src0_modifiers. - string ret = dst#", "#src0#src1#src2#opsel#mods#clamp; + string ret = dst#src0#src1#src2#opsel#mods#clamp; } // FIXME-TRUE16 AsmVOP3OpSel will be deprecated after all @@ -2267,7 +2268,7 @@ class getAsmDPP8 { string dst = !if(HasDst, @@ -2294,7 +2295,7 @@ class getAsmVOP3Base _ArgVT, bit _EnableClamp = 0> { isModifierType.ret, isModifierType.ret, HasOMod); + field bit HasNeg = HasModifiers; field bit HasSrc0Mods = HasModifiers; field bit HasSrc1Mods = !if(HasModifiers, !or(HasSrc1FloatMods, HasSrc1IntMods), 0); @@ -2589,7 +2591,7 @@ class VOPProfile _ArgVT, bit _EnableClamp = 0> { HasClamp, HasModifiers, HasSrc2Mods, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret; field dag InsVOP3P = getInsVOP3P.ret; field dag InsVOP3OpSel = getInsVOP3OpSel _ArgVT, bit _EnableClamp = 0> { Src2VOP3DPP, NumSrcArgs, HasClamp, HasModifiers, HasSrc2Mods, HasOMod, Src0ModVOP3DPP, Src1ModVOP3DPP, Src2ModVOP3DPP, HasOpSel>.ret; defvar InsVOP3PDPPBase = getInsVOP3P.ret; field dag InsVOP3Base = !if(IsVOP3P, InsVOP3PDPPBase, InsVOP3DPPBase); @@ -2635,10 +2637,10 @@ class VOPProfile _ArgVT, bit _EnableClamp = 0> { // the asm operand name via this HasModifiers flag field string AsmDPP8 = getAsmDPP8.ret; field string AsmVOP3Base = getAsmVOP3Base.ret; field string Asm64 = AsmVOP3Base; - field string AsmVOP3P = getAsmVOP3P.ret; + field string AsmVOP3P = getAsmVOP3P.ret; field string AsmVOP3OpSel = getAsmVOP3OpSel, VOP3P_LD_SCALE> { + let HasModifiers = 1; + let HasNeg = 0; +} + // Used for FMA_MIX* and MAD_MIX* insts // Their operands are only sort of f16 operands. Depending on // op_sel_hi, these may be interpreted as f32. The inline immediate @@ -753,6 +758,10 @@ defm V_MFMA_F32_32X32X16_F16 : MAIInst<"v_mfma_f32_32x32x16f16", "F32_V8F16 defm V_MFMA_F32_32X32X16_BF16 : MAIInst<"v_mfma_f32_32x32x16bf16", "F32_V8BF16_X16", int_amdgcn_mfma_f32_32x32x16_bf16>; } +let SubtargetPredicate = HasGFX950Insts in { +defm V_MFMA_LD_SCALE_B32 : VOP3PInst<"v_mfma_ld_scale_b32", VOP_MFMA_LD_SCALE>; +} + let Predicates = [isGFX90APlus] in { let is_gfx940_xdl = 1 in { defm V_MFMA_F32_32X32X4BF16_1K : MAIInst<"v_mfma_f32_32x32x4bf16_1k", "F32_V4I16_X32", int_amdgcn_mfma_f32_32x32x4bf16_1k>; @@ -1792,6 +1801,10 @@ defm V_MFMA_F32_16X16X32_F16 : VOP3P_Real_MFMA_gfx950 <0x54, "v_mfma_f32_16x defm V_MFMA_F32_32X32X16_F16 : VOP3P_Real_MFMA_gfx950 <0x55, "v_mfma_f32_32x32x16_f16">; defm V_MFMA_F32_32X32X16_BF16 : VOP3P_Real_MFMA_gfx950 <0x37, "v_mfma_f32_32x32x16_bf16">; +let SubtargetPredicate = HasGFX950Insts in { +defm V_MFMA_LD_SCALE_B32 : VOP3P_Real_vi <0x2c>; +} + defm V_MFMA_I32_32X32X16I8 : VOP3P_Real_MFMA_gfx940 <0x56, "v_mfma_i32_32x32x16_i8">; defm V_MFMA_I32_16X16X32I8 : VOP3P_Real_MFMA_gfx940 <0x57, "v_mfma_i32_16x16x32_i8">; let SubtargetPredicate = HasXF32Insts in { diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 1be434c2c11f7e..a6e6adac04e5a9 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -423,7 +423,7 @@ class VOP3Pe op, VOPProfile P> : Enc64 { bits<2> index_key_8bit; bits<1> index_key_16bit; - let Inst{7-0} = vdst; + let Inst{7-0} = !if(P.HasDst, vdst, 0); let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0 let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // neg_hi src1 let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); // neg_hi src2 @@ -1365,6 +1365,10 @@ def VOP3_OPSEL : VOP3Features<1, 1, 0, 0>; def VOP3_PACKED : VOP3Features<1, 1, 1, 0>; def VOP3_MAI : VOP3Features<0, 0, 0, 1>; +// Packed is misleading, but it enables the appropriate op_sel +// modifiers. +def VOP3P_LD_SCALE : VOP3Features<0, 1, 1, 0>; + class VOP3_Profile_Base : VOPProfile { let HasClamp = !if(Features.HasClamp, 1, P.HasClamp); diff --git a/llvm/test/MC/AMDGPU/mai-gfx950-err.s b/llvm/test/MC/AMDGPU/mai-gfx950-err.s new file mode 100644 index 00000000000000..a6dff076392c85 --- /dev/null +++ b/llvm/test/MC/AMDGPU/mai-gfx950-err.s @@ -0,0 +1,31 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx950 %s 2>&1 | FileCheck --implicit-check-not=error: %s + +v_mfma_ld_scale_b32 v0, 65 +// CHECK: :[[@LINE-1]]:25: error: literal operands are not supported + +v_mfma_ld_scale_b32 65, v0 +// CHECK: :[[@LINE-1]]:21: error: literal operands are not supported + +v_mfma_ld_scale_b32 65, 65 +// CHECK: :[[@LINE-1]]:25: error: literal operands are not supported + +v_mfma_ld_scale_b32 s0, s1 +// CHECK: :[[@LINE-1]]:25: error: invalid operand (violates constant bus restrictions) + +v_mfma_ld_scale_b32 v0, v0 clamp +// CHECK: :[[@LINE-1]]:28: error: invalid operand for instruction + +v_mfma_ld_scale_b32 v0, v0 neg_lo:[0,1] +// CHECK: :[[@LINE-1]]:28: error: not a valid operand + +v_mfma_ld_scale_b32 v0, v0 neg_lo:[1,1] +// CHECK: :[[@LINE-1]]:28: error: not a valid operand + +v_mfma_ld_scale_b32 v0, v0 neg_hi:[1,1] +// CHECK: :[[@LINE-1]]:28: error: not a valid operand + +v_mfma_ld_scale_b32 v0, v0 neg_hi:[0,1] +// CHECK: :[[@LINE-1]]:28: error: not a valid operand + +v_mfma_ld_scale_b32 v0, v0 neg_lo:[0,1] neg_hi:[0,1] +// CHECK: :[[@LINE-1]]:28: error: not a valid operand diff --git a/llvm/test/MC/AMDGPU/mai-gfx950.s b/llvm/test/MC/AMDGPU/mai-gfx950.s index 1d4902e293bb10..a692693638c692 100644 --- a/llvm/test/MC/AMDGPU/mai-gfx950.s +++ b/llvm/test/MC/AMDGPU/mai-gfx950.s @@ -158,3 +158,120 @@ v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], v[0:15] abid:1 // GFX950: v_mfma_f32_32x32x16_bf16 a[0:15], a[0:3], a[0:3], a[0:15] cbsz:3 abid:1 ; encoding: [0x00,0x8b,0xb7,0xd3,0x00,0x01,0x02,0x1c] // ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU v_mfma_f32_32x32x16_bf16 a[0:15], a[0:3], a[0:3], a[0:15] cbsz:3 abid:1 + +//===----------------------------------------------------------------------===// +// v_mfma_ld_scale_b32 +//===----------------------------------------------------------------------===// + +// GFX950: v_mfma_ld_scale_b32 v0, 64 ; encoding: [0x00,0x40,0xac,0xd3,0x00,0x81,0x01,0x18] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_ld_scale_b32 v0, 64 + +// GFX950: v_mfma_ld_scale_b32 64, v0 ; encoding: [0x00,0x40,0xac,0xd3,0xc0,0x00,0x02,0x18] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_ld_scale_b32 64, v0 + +// GFX950: v_mfma_ld_scale_b32 64, 64 ; encoding: [0x00,0x40,0xac,0xd3,0xc0,0x80,0x01,0x18] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_ld_scale_b32 64, 64 + +// GFX950: v_mfma_ld_scale_b32 s0, s0 ; encoding: [0x00,0x40,0xac,0xd3,0x00,0x00,0x00,0x18] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_ld_scale_b32 s0, s0 + +// GFX950: v_mfma_ld_scale_b32 s0, v0 ; encoding: [0x00,0x40,0xac,0xd3,0x00,0x00,0x02,0x18] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_ld_scale_b32 s0, v0 + +// GFX950: v_mfma_ld_scale_b32 v0, s0 ; encoding: [0x00,0x40,0xac,0xd3,0x00,0x01,0x00,0x18] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_ld_scale_b32 v0, s0 + +// GFX950: v_mfma_ld_scale_b32 vcc_lo, vcc_lo ; encoding: [0x00,0x40,0xac,0xd3,0x6a,0xd4,0x00,0x18] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_ld_scale_b32 vcc_lo, vcc_lo + +// GFX950: v_mfma_ld_scale_b32 m0, m0 ; encoding: [0x00,0x40,0xac,0xd3,0x7c,0xf8,0x00,0x18] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_ld_scale_b32 m0, m0 + +// GFX950: v_mfma_ld_scale_b32 src_vccz, src_vccz ; encoding: [0x00,0x40,0xac,0xd3,0xfb,0xf6,0x01,0x18] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_ld_scale_b32 vccz, vccz + +// GFX950: v_mfma_ld_scale_b32 src_execz, src_execz ; encoding: [0x00,0x40,0xac,0xd3,0xfc,0xf8,0x01,0x18] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_ld_scale_b32 execz, execz + +// GFX950: v_mfma_ld_scale_b32 v0, v0 ; encoding: [0x00,0x40,0xac,0xd3,0x00,0x01,0x02,0x18] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_ld_scale_b32 v0, v0 + +// GFX950: v_mfma_ld_scale_b32 v1, v1 ; encoding: [0x00,0x40,0xac,0xd3,0x01,0x03,0x02,0x18] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_ld_scale_b32 v1, v1 + +// GFX950: v_mfma_ld_scale_b32 0, 0 ; encoding: [0x00,0x40,0xac,0xd3,0x80,0x00,0x01,0x18] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_ld_scale_b32 0, 0 + +// GFX950: v_mfma_ld_scale_b32 1, 0 ; encoding: [0x00,0x40,0xac,0xd3,0x81,0x00,0x01,0x18] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_ld_scale_b32 1, 0 + +// GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel:[1,0] ; encoding: [0x00,0x48,0xac,0xd3,0x01,0x03,0x02,0x18] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_ld_scale_b32 v1, v1 op_sel:[1, 0] + +// GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel:[0,1] ; encoding: [0x00,0x50,0xac,0xd3,0x01,0x03,0x02,0x18] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_ld_scale_b32 v1, v1 op_sel:[0, 1] + +// GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel:[1,1] ; encoding: [0x00,0x58,0xac,0xd3,0x01,0x03,0x02,0x18] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_ld_scale_b32 v1, v1 op_sel:[1, 1] + +// GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0xac,0xd3,0x01,0x03,0x02,0x08] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_ld_scale_b32 v1, v1 op_sel_hi:[1, 0] + +// GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel_hi:[0,1] ; encoding: [0x00,0x40,0xac,0xd3,0x01,0x03,0x02,0x10] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_ld_scale_b32 v1, v1 op_sel_hi:[0, 1] + +// GFX950: v_mfma_ld_scale_b32 v1, v1 ; encoding: [0x00,0x40,0xac,0xd3,0x01,0x03,0x02,0x18] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_ld_scale_b32 v1, v1 op_sel_hi:[1, 1] + +// GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel_hi:[0,0] ; encoding: [0x00,0x40,0xac,0xd3,0x01,0x03,0x02,0x00] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_ld_scale_b32 v1, v1 op_sel:[0,0] op_sel_hi:[0,0] + +// GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel:[1,0] op_sel_hi:[1,0] ; encoding: [0x00,0x48,0xac,0xd3,0x01,0x03,0x02,0x08] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_ld_scale_b32 v1, v1 op_sel:[1,0] op_sel_hi:[1,0] + +// GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel:[0,1] op_sel_hi:[0,1] ; encoding: [0x00,0x50,0xac,0xd3,0x01,0x03,0x02,0x10] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_ld_scale_b32 v1, v1 op_sel:[0,1] op_sel_hi:[0,1] + +// GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel:[0,1] ; encoding: [0x00,0x50,0xac,0xd3,0x01,0x03,0x02,0x18] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_ld_scale_b32 v1, v1 op_sel:[0,1] op_sel_hi:[1,1] + +// GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel:[0,1] ; encoding: [0x00,0x50,0xac,0xd3,0x01,0x03,0x02,0x18] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_ld_scale_b32 v1, v1 op_sel:[0,1] op_sel_hi:[1,1] + +// GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel:[1,1] ; encoding: [0x00,0x58,0xac,0xd3,0x01,0x03,0x02,0x18] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_ld_scale_b32 v1, v1 op_sel:[1,1] op_sel_hi:[1,1] + +// GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x00,0x48,0xac,0xd3,0x01,0x03,0x02,0x10] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_ld_scale_b32 v1, v1 op_sel:[1,0] op_sel_hi:[0,1] + +// GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel:[0,1] op_sel_hi:[1,0] ; encoding: [0x00,0x50,0xac,0xd3,0x01,0x03,0x02,0x08] +// ERR: :[[@LINE+1]]:{{[0-9]+}}: error: instruction not supported on this GPU +v_mfma_ld_scale_b32 v1, v1 op_sel:[0,1] op_sel_hi:[1,0] + diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt index 292f2a348df2ef..1fa48fca80fb45 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950_mai.txt @@ -1,88 +1,161 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding -disassemble %s | FileCheck -check-prefix=GFX950 %s -# GFX950: v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x01,0x02,0x1c] +# GFX950: v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x01,0x02,0x1c] 0x00,0x80,0xd4,0xd3,0x00,0x01,0x02,0x1c -# GFX950: v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] abid:1 ; encoding: [0x00,0x88,0xd4,0xd3,0x00,0x01,0x02,0x1c] +# GFX950: v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] abid:1 ; encoding: [0x00,0x88,0xd4,0xd3,0x00,0x01,0x02,0x1c] 0x00,0x88,0xd4,0xd3,0x00,0x01,0x02,0x1c -# GFX950: v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1 ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x01,0x02,0x3c] +# GFX950: v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1 ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x01,0x02,0x3c] 0x00,0x80,0xd4,0xd3,0x00,0x01,0x02,0x3c -# GFX950: v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 ; encoding: [0x00,0x83,0xd4,0xd3,0x00,0x01,0x02,0x1c] +# GFX950: v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 ; encoding: [0x00,0x83,0xd4,0xd3,0x00,0x01,0x02,0x1c] 0x00,0x83,0xd4,0xd3,0x00,0x01,0x02,0x1c -# GFX950: v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 abid:1 ; encoding: [0x00,0x8b,0xd4,0xd3,0x00,0x01,0x02,0x1c] +# GFX950: v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 abid:1 ; encoding: [0x00,0x8b,0xd4,0xd3,0x00,0x01,0x02,0x1c] 0x00,0x8b,0xd4,0xd3,0x00,0x01,0x02,0x1c -# GFX950: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], a[0:3], 1.0 ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x01,0xca,0x13] +# GFX950: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], a[0:3], 1.0 ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x01,0xca,0x13] 0x00,0x80,0xd4,0xd3,0x00,0x01,0xca,0x13 -# GFX950: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[0:3], a[4:7] ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x01,0x12,0x04] +# GFX950: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[0:3], a[4:7] ; encoding: [0x00,0x80,0xd4,0xd3,0x00,0x01,0x12,0x04] 0x00,0x80,0xd4,0xd3,0x00,0x01,0x12,0x04 -# GFX950: v_mfma_f32_16x16x32_f16 v[0:3], a[0:3], a[0:3], v[4:7] ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x01,0x12,0x1c] +# GFX950: v_mfma_f32_16x16x32_f16 v[0:3], a[0:3], a[0:3], v[4:7] ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x01,0x12,0x1c] 0x00,0x00,0xd4,0xd3,0x00,0x01,0x12,0x1c -# GFX950: v_mfma_f32_16x16x32_f16 v[0:3], a[0:3], v[0:3], 1.0 ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x01,0xca,0x0b] +# GFX950: v_mfma_f32_16x16x32_f16 v[0:3], a[0:3], v[0:3], 1.0 ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x01,0xca,0x0b] 0x00,0x00,0xd4,0xd3,0x00,0x01,0xca,0x0b -# GFX950: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[0:3], v[0:3] ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x01,0x02,0x04] +# GFX950: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[0:3], v[0:3] ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x01,0x02,0x04] 0x00,0x00,0xd4,0xd3,0x00,0x01,0x02,0x04 -# GFX950: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[0:3], v[0:3] blgp:5 ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x01,0x02,0xa4] +# GFX950: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[0:3], v[0:3] blgp:5 ; encoding: [0x00,0x00,0xd4,0xd3,0x00,0x01,0x02,0xa4] 0x00,0x00,0xd4,0xd3,0x00,0x01,0x02,0xa4 -# GFX950: v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], 1.0 ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x01,0xca,0x1b] +# GFX950: v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], 1.0 ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x01,0xca,0x1b] 0x00,0x80,0xd5,0xd3,0x00,0x01,0xca,0x1b -# GFX950: v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x01,0x02,0x1c] +# GFX950: v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x01,0x02,0x1c] 0x00,0x80,0xd5,0xd3,0x00,0x01,0x02,0x1c -# GFX950: v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] blgp:2 ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x01,0x02,0x5c] +# GFX950: v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] blgp:2 ; encoding: [0x00,0x80,0xd5,0xd3,0x00,0x01,0x02,0x5c] 0x00,0x80,0xd5,0xd3,0x00,0x01,0x02,0x5c -# GFX950: v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] cbsz:3 abid:1 ; encoding: [0x00,0x8b,0xd5,0xd3,0x00,0x01,0x02,0x1c] +# GFX950: v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] cbsz:3 abid:1 ; encoding: [0x00,0x8b,0xd5,0xd3,0x00,0x01,0x02,0x1c] 0x00,0x8b,0xd5,0xd3,0x00,0x01,0x02,0x1c -# GFX950: v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], 1.0 ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x01,0xca,0x03] +# GFX950: v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], 1.0 ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x01,0xca,0x03] 0x00,0x00,0xd5,0xd3,0x00,0x01,0xca,0x03 -# GFX950: v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x01,0x02,0x04] +# GFX950: v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x01,0x02,0x04] 0x00,0x00,0xd5,0xd3,0x00,0x01,0x02,0x04 -# GFX950: v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] abid:1 ; encoding: [0x00,0x08,0xd5,0xd3,0x00,0x01,0x02,0x04] +# GFX950: v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] abid:1 ; encoding: [0x00,0x08,0xd5,0xd3,0x00,0x01,0x02,0x04] 0x00,0x08,0xd5,0xd3,0x00,0x01,0x02,0x04 -# GFX950: v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] blgp:5 ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x01,0x02,0xa4] +# GFX950: v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] blgp:5 ; encoding: [0x00,0x00,0xd5,0xd3,0x00,0x01,0x02,0xa4] 0x00,0x00,0xd5,0xd3,0x00,0x01,0x02,0xa4 -# GFX950: v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] cbsz:3 ; encoding: [0x00,0x03,0xd5,0xd3,0x00,0x01,0x02,0x04] +# GFX950: v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15] cbsz:3 ; encoding: [0x00,0x03,0xd5,0xd3,0x00,0x01,0x02,0x04] 0x00,0x03,0xd5,0xd3,0x00,0x01,0x02,0x04 -# GFX950: v_mfma_f32_32x32x16_bf16 a[0:15], a[0:3], a[0:3], 1.0 ; encoding: [0x00,0x80,0xb7,0xd3,0x00,0x01,0xca,0x1b] +# GFX950: v_mfma_f32_32x32x16_bf16 a[0:15], a[0:3], a[0:3], 1.0 ; encoding: [0x00,0x80,0xb7,0xd3,0x00,0x01,0xca,0x1b] 0x00,0x80,0xb7,0xd3,0x00,0x01,0xca,0x1b -# GFX950: v_mfma_f32_32x32x16_bf16 a[0:15], a[0:3], a[0:3], a[0:15] ; encoding: [0x00,0x80,0xb7,0xd3,0x00,0x01,0x02,0x1c] +# GFX950: v_mfma_f32_32x32x16_bf16 a[0:15], a[0:3], a[0:3], a[0:15] ; encoding: [0x00,0x80,0xb7,0xd3,0x00,0x01,0x02,0x1c] 0x00,0x80,0xb7,0xd3,0x00,0x01,0x02,0x1c -# GFX950: v_mfma_f32_32x32x16_bf16 a[0:15], a[0:3], a[0:3], a[0:15] blgp:2 ; encoding: [0x00,0x80,0xb7,0xd3,0x00,0x01,0x02,0x5c] +# GFX950: v_mfma_f32_32x32x16_bf16 a[0:15], a[0:3], a[0:3], a[0:15] blgp:2 ; encoding: [0x00,0x80,0xb7,0xd3,0x00,0x01,0x02,0x5c] 0x00,0x80,0xb7,0xd3,0x00,0x01,0x02,0x5c -# GFX950: v_mfma_f32_32x32x16_bf16 a[0:15], a[0:3], a[0:3], a[0:15] cbsz:3 abid:1 ; encoding: [0x00,0x8b,0xb7,0xd3,0x00,0x01,0x02,0x1c] +# GFX950: v_mfma_f32_32x32x16_bf16 a[0:15], a[0:3], a[0:3], a[0:15] cbsz:3 abid:1 ; encoding: [0x00,0x8b,0xb7,0xd3,0x00,0x01,0x02,0x1c] 0x00,0x8b,0xb7,0xd3,0x00,0x01,0x02,0x1c -# GFX950: v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], 1.0 ; encoding: [0x00,0x00,0xb7,0xd3,0x00,0x01,0xca,0x03] +# GFX950: v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], 1.0 ; encoding: [0x00,0x00,0xb7,0xd3,0x00,0x01,0xca,0x03] 0x00,0x00,0xb7,0xd3,0x00,0x01,0xca,0x03 -# GFX950: v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], v[0:15] ; encoding: [0x00,0x00,0xb7,0xd3,0x00,0x01,0x02,0x04] +# GFX950: v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], v[0:15] ; encoding: [0x00,0x00,0xb7,0xd3,0x00,0x01,0x02,0x04] 0x00,0x00,0xb7,0xd3,0x00,0x01,0x02,0x04 -# GFX950: v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], v[0:15] abid:1 ; encoding: [0x00,0x08,0xb7,0xd3,0x00,0x01,0x02,0x04] +# GFX950: v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], v[0:15] abid:1 ; encoding: [0x00,0x08,0xb7,0xd3,0x00,0x01,0x02,0x04] 0x00,0x08,0xb7,0xd3,0x00,0x01,0x02,0x04 -# GFX950: v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], v[0:15] blgp:5 ; encoding: [0x00,0x00,0xb7,0xd3,0x00,0x01,0x02,0xa4] +# GFX950: v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], v[0:15] blgp:5 ; encoding: [0x00,0x00,0xb7,0xd3,0x00,0x01,0x02,0xa4] 0x00,0x00,0xb7,0xd3,0x00,0x01,0x02,0xa4 -# GFX950: v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], v[0:15] cbsz:3 ; encoding: [0x00,0x03,0xb7,0xd3,0x00,0x01,0x02,0x04] +# GFX950: v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], v[0:15] cbsz:3 ; encoding: [0x00,0x03,0xb7,0xd3,0x00,0x01,0x02,0x04] 0x00,0x03,0xb7,0xd3,0x00,0x01,0x02,0x04 + + +# GFX950: v_mfma_ld_scale_b32 0, 0 ; encoding: [0x00,0x40,0xac,0xd3,0x80,0x00,0x01,0x18] +0x00,0x40,0xac,0xd3,0x80,0x00,0x01,0x18 + +# GFX950: v_mfma_ld_scale_b32 1, 0 ; encoding: [0x00,0x40,0xac,0xd3,0x81,0x00,0x01,0x18] +0x00,0x40,0xac,0xd3,0x81,0x00,0x01,0x18 + +# GFX950: v_mfma_ld_scale_b32 64, 64 ; encoding: [0x00,0x40,0xac,0xd3,0xc0,0x80,0x01,0x18] +0x00,0x40,0xac,0xd3,0xc0,0x80,0x01,0x18 + +# GFX950: v_mfma_ld_scale_b32 64, v0 ; encoding: [0x00,0x40,0xac,0xd3,0xc0,0x00,0x02,0x18] +0x00,0x40,0xac,0xd3,0xc0,0x00,0x02,0x18 + +# GFX950: v_mfma_ld_scale_b32 m0, m0 ; encoding: [0x00,0x40,0xac,0xd3,0x7c,0xf8,0x00,0x18] +0x00,0x40,0xac,0xd3,0x7c,0xf8,0x00,0x18 + +# GFX950: v_mfma_ld_scale_b32 s0, s0 ; encoding: [0x00,0x40,0xac,0xd3,0x00,0x00,0x00,0x18] +0x00,0x40,0xac,0xd3,0x00,0x00,0x00,0x18 + +# GFX950: v_mfma_ld_scale_b32 s0, v0 ; encoding: [0x00,0x40,0xac,0xd3,0x00,0x00,0x02,0x18] +0x00,0x40,0xac,0xd3,0x00,0x00,0x02,0x18 + +# GFX950: v_mfma_ld_scale_b32 src_execz, src_execz ; encoding: [0x00,0x40,0xac,0xd3,0xfc,0xf8,0x01,0x18] +0x00,0x40,0xac,0xd3,0xfc,0xf8,0x01,0x18 + +# GFX950: v_mfma_ld_scale_b32 src_vccz, src_vccz ; encoding: [0x00,0x40,0xac,0xd3,0xfb,0xf6,0x01,0x18] +0x00,0x40,0xac,0xd3,0xfb,0xf6,0x01,0x18 + +# GFX950: v_mfma_ld_scale_b32 v0, 64 ; encoding: [0x00,0x40,0xac,0xd3,0x00,0x81,0x01,0x18] +0x00,0x40,0xac,0xd3,0x00,0x81,0x01,0x18 + +# GFX950: v_mfma_ld_scale_b32 v0, s0 ; encoding: [0x00,0x40,0xac,0xd3,0x00,0x01,0x00,0x18] +0x00,0x40,0xac,0xd3,0x00,0x01,0x00,0x18 + +# GFX950: v_mfma_ld_scale_b32 v0, v0 ; encoding: [0x00,0x40,0xac,0xd3,0x00,0x01,0x02,0x18] +0x00,0x40,0xac,0xd3,0x00,0x01,0x02,0x18 + +# GFX950: v_mfma_ld_scale_b32 v1, v1 ; encoding: [0x00,0x40,0xac,0xd3,0x01,0x03,0x02,0x18] +0x00,0x40,0xac,0xd3,0x01,0x03,0x02,0x18 + +# GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel:[0,1] ; encoding: [0x00,0x50,0xac,0xd3,0x01,0x03,0x02,0x18] +0x00,0x50,0xac,0xd3,0x01,0x03,0x02,0x18 + +# GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel:[0,1] op_sel_hi:[0,1] ; encoding: [0x00,0x50,0xac,0xd3,0x01,0x03,0x02,0x10] +0x00,0x50,0xac,0xd3,0x01,0x03,0x02,0x10 + +# GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel:[0,1] op_sel_hi:[1,0] ; encoding: [0x00,0x50,0xac,0xd3,0x01,0x03,0x02,0x08] +0x00,0x50,0xac,0xd3,0x01,0x03,0x02,0x08 + +# GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel:[1,0] ; encoding: [0x00,0x48,0xac,0xd3,0x01,0x03,0x02,0x18] +0x00,0x48,0xac,0xd3,0x01,0x03,0x02,0x18 + +# GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x00,0x48,0xac,0xd3,0x01,0x03,0x02,0x10] +0x00,0x48,0xac,0xd3,0x01,0x03,0x02,0x10 + +# GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel:[1,0] op_sel_hi:[1,0] ; encoding: [0x00,0x48,0xac,0xd3,0x01,0x03,0x02,0x08] +0x00,0x48,0xac,0xd3,0x01,0x03,0x02,0x08 + +# GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel:[1,1] ; encoding: [0x00,0x58,0xac,0xd3,0x01,0x03,0x02,0x18] +0x00,0x58,0xac,0xd3,0x01,0x03,0x02,0x18 + +# GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel_hi:[0,0] ; encoding: [0x00,0x40,0xac,0xd3,0x01,0x03,0x02,0x00] +0x00,0x40,0xac,0xd3,0x01,0x03,0x02,0x00 + +# GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel_hi:[0,1] ; encoding: [0x00,0x40,0xac,0xd3,0x01,0x03,0x02,0x10] +0x00,0x40,0xac,0xd3,0x01,0x03,0x02,0x10 + +# GFX950: v_mfma_ld_scale_b32 v1, v1 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0xac,0xd3,0x01,0x03,0x02,0x08] +0x00,0x40,0xac,0xd3,0x01,0x03,0x02,0x08 + +# GFX950: v_mfma_ld_scale_b32 vcc_lo, vcc_lo ; encoding: [0x00,0x40,0xac,0xd3,0x6a,0xd4,0x00,0x18] +0x00,0x40,0xac,0xd3,0x6a,0xd4,0x00,0x18 diff --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx950.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx950.s index 667fb7d78a87bd..e601de8d706b44 100644 --- a/llvm/test/tools/llvm-mca/AMDGPU/gfx950.s +++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx950.s @@ -1,10 +1,11 @@ # RUN: llvm-mca -mtriple=amdgcn -mcpu=gfx950 --timeline --iterations=1 --timeline-max-cycles=0 < %s | FileCheck %s # CHECK: Iterations: 1 -# CHECK: Instructions: 6 -# CHECK: Total Cycles: 41 -# CHECK: Total uOps: 6 +# CHECK: Instructions: 7 +# CHECK: Total Cycles: 42 +# CHECK: Total uOps: 7 +v_mfma_ld_scale_b32 v0, v0 v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1 v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[0:3], a[4:7] @@ -13,7 +14,9 @@ v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], a[0:15] blgp:2 v_mfma_f32_32x32x16_bf16 v[0:15], v[0:3], v[0:3], v[0:15] v_mfma_f32_32x32x16_bf16 a[0:15], a[0:3], a[0:3], a[0:15] blgp:2 + # CHECK: [0] [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: - - - - 1.00 - - v_mfma_ld_scale_b32 v0, v0 # CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1 # CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[0:3], a[4:7] # CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x16_f16 v[0:15], v[0:3], v[0:3], v[0:15]