From 1d491a5c57a164f0c0c89225826449a538f81b2f Mon Sep 17 00:00:00 2001
From: Marian Lukac
Date: Wed, 24 Jul 2024 11:16:20 +0000
Subject: [PATCH 1/3] [AArch64] Implement NEON vscale intrinsics

---
 clang/include/clang/Basic/arm_neon.td          |  6 ++
 clang/lib/CodeGen/CGBuiltin.cpp                |  8 +++
 .../acle_neon_fscale.c                         | 58 +++++++++++++++++++
 llvm/include/llvm/IR/IntrinsicsAArch64.td      |  7 +++
 .../lib/Target/AArch64/AArch64InstrFormats.td  | 21 +++++++
 llvm/lib/Target/AArch64/AArch64InstrInfo.td    |  2 +-
 llvm/test/CodeGen/AArch64/neon-fp8-fscale.ll   | 54 +++++++++++++++++
 7 files changed, 155 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/CodeGen/aarch64-neon-fp8-intrinsics/acle_neon_fscale.c
 create mode 100644 llvm/test/CodeGen/AArch64/neon-fp8-fscale.ll

diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index 92f39744f3d084..83e7c8ec65eb17 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -2126,3 +2126,9 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "neon,faminmax" in {
   def FAMIN : WInst<"vamin", "...", "fhQdQfQh">;
   def FAMAX : WInst<"vamax", "...", "fhQdQfQh">;
 }
+
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8" in {
+  // fscale
+  def FSCALE_V128 : WInst<"vscale", "..(.S)", "QdQfQh">;
+  def FSCALE_V64 : WInst<"vscale", "(.q)(.q)(.qS)", "fh">;
+}
\ No newline at end of file
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 249aead33ad73d..9033cd1ccd781d 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -13573,6 +13573,14 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Int = Intrinsic::aarch64_neon_famax;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "famax");
   }
+  case NEON::BI__builtin_neon_vscale_f16:
+  case NEON::BI__builtin_neon_vscaleq_f16:
+  case NEON::BI__builtin_neon_vscale_f32:
+  case NEON::BI__builtin_neon_vscaleq_f32:
+  case NEON::BI__builtin_neon_vscaleq_f64: {
+    Int = Intrinsic::aarch64_neon_fp8_fscale;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "fscale");
+  }
   }
 }

diff --git a/clang/test/CodeGen/aarch64-neon-fp8-intrinsics/acle_neon_fscale.c b/clang/test/CodeGen/aarch64-neon-fp8-intrinsics/acle_neon_fscale.c
new file mode 100644
index 00000000000000..b50d30876a7c51
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-neon-fp8-intrinsics/acle_neon_fscale.c
@@ -0,0 +1,58 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+#include <arm_neon.h>
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +fp8 -O3 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +fp8 -S -O3 -o /dev/null %s
+
+// CHECK-LABEL: define dso_local <4 x half> @test_vscale_f16(
+// CHECK-SAME: <4 x half> noundef [[VN:%.*]], <4 x i16> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FSCALE2_I:%.*]] = tail call <4 x half> @llvm.aarch64.neon.fp8.fscale.v4f16(<4 x half> [[VN]], <4 x i16> [[VM]])
+// CHECK-NEXT:    ret <4 x half> [[FSCALE2_I]]
+//
+float16x4_t test_vscale_f16(float16x4_t vn, int16x4_t vm) {
+  return vscale_f16(vn, vm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vscaleq_f16(
+// CHECK-SAME: <8 x half> noundef [[VN:%.*]], <8 x i16> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FSCALE2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fscale.v8f16(<8 x half> [[VN]], <8 x i16> [[VM]])
+// CHECK-NEXT:    ret <8 x half> [[FSCALE2_I]]
+//
+float16x8_t test_vscaleq_f16(float16x8_t vn, int16x8_t vm) {
+  return vscaleq_f16(vn, vm);
+
+}
+
+// CHECK-LABEL: define dso_local <2 x float> @test_vscale_f32(
+// CHECK-SAME: <2 x float> noundef [[VN:%.*]], <2 x i32> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FSCALE2_I:%.*]] = tail call <2 x float> @llvm.aarch64.neon.fp8.fscale.v2f32(<2 x float> [[VN]], <2 x i32> [[VM]])
+// CHECK-NEXT:    ret <2 x float> [[FSCALE2_I]]
+//
+float32x2_t test_vscale_f32(float32x2_t vn, int32x2_t vm) {
+  return vscale_f32(vn, vm);
+
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vscaleq_f32(
+// CHECK-SAME: <4 x float> noundef [[VN:%.*]], <4 x i32> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FSCALE2_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fscale.v4f32(<4 x float> [[VN]], <4 x i32> [[VM]])
+// CHECK-NEXT:    ret <4 x float> [[FSCALE2_I]]
+//
+float32x4_t test_vscaleq_f32(float32x4_t vn, int32x4_t vm) {
+  return vscaleq_f32(vn, vm);
+
+}
+
+// CHECK-LABEL: define dso_local <2 x double> @test_vscale_f64(
+// CHECK-SAME: <2 x double> noundef [[VN:%.*]], <2 x i64> noundef [[VM:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FSCALE2_I:%.*]] = tail call <2 x double> @llvm.aarch64.neon.fp8.fscale.v2f64(<2 x double> [[VN]], <2 x i64> [[VM]])
+// CHECK-NEXT:    ret <2 x double> [[FSCALE2_I]]
+//
+float64x2_t test_vscale_f64(float64x2_t vn, int64x2_t vm) {
+  return vscaleq_f64(vn, vm);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index b2a2e11240186d..2d8ce66f53ba8a 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -563,6 +563,13 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
   def int_aarch64_neon_vcmla_rot90 : AdvSIMD_3VectorArg_Intrinsic;
   def int_aarch64_neon_vcmla_rot180 : AdvSIMD_3VectorArg_Intrinsic;
   def int_aarch64_neon_vcmla_rot270 : AdvSIMD_3VectorArg_Intrinsic;
+
+  // FP8 fscale
+  def int_aarch64_neon_fp8_fscale : DefaultAttrsIntrinsic<
+                                      [llvm_anyvector_ty],
+                                      [LLVMMatchType<0>,
+                                       LLVMVectorOfBitcastsToInt<0>],
+                                      [IntrNoMem]>;
 }
 
 let TargetPrefix = "aarch64" in {
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 46b462de5071cb..4c90449925f107 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -5997,6 +5997,27 @@ multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<3> opc,
                         [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
 }
 
+// As above, but only floating point elements supported.
+let mayRaiseFPException = 1, Uses = [FPCR] in
+multiclass SIMDThreeVectorFP<bit U, bit S, bits<3> opc,
+                             string asm, SDPatternOperator OpNode> {
+  def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64,
+                                      asm, ".4h",
+        [(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+  def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128,
+                                      asm, ".8h",
+        [(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8i16 V128:$Rm)))]>;
+  def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64,
+                                      asm, ".2s",
+        [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+  def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128,
+                                      asm, ".4s",
+        [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4i32 V128:$Rm)))]>;
+  def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128,
+                                      asm, ".2d",
+        [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2i64 V128:$Rm)))]>;
+}
+
 let mayRaiseFPException = 1, Uses = [FPCR] in
 multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<3> opc,
                                     string asm,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index c70e835d1619ff..f40aadb2500c94 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -10136,7 +10136,7 @@ let Uses = [FPMR, FPCR], Predicates = [HasFP8] in {
   defm BF2CVTL : SIMDMixedTwoVectorFP8<0b11, "bf2cvtl">;
   defm FCVTN_F16_F8 : SIMDThreeSameSizeVectorCvt<"fcvtn">;
   defm FCVTN_F32_F8 : SIMDThreeVectorCvt<"fcvtn">;
-  defm FSCALE : SIMDThreeSameVectorFP<0b1, 0b1, 0b111, "fscale", null_frag>;
+  defm FSCALE : SIMDThreeVectorFP<0b1, 0b1, 0b111, "fscale", int_aarch64_neon_fp8_fscale>;
 } // End let Predicates = [HasFP8]
 
 // fminimum(abs(a), abs(b)) -> famin(a, b)
diff --git a/llvm/test/CodeGen/AArch64/neon-fp8-fscale.ll b/llvm/test/CodeGen/AArch64/neon-fp8-fscale.ll
new file mode 100644
index 00000000000000..da0e365db2d314
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-fp8-fscale.ll
@@ -0,0 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-linux -mattr=+neon,+fp8 < %s | FileCheck %s
+
+
+define <4 x half> @test_fscale_f16(<4 x half> %vn, <4 x i16> %vm) {
+; CHECK-LABEL: test_fscale_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fscale v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
+  %res = tail call <4 x half> @llvm.aarch64.neon.fp8.fscale.v4f16(<4 x half> %vn, <4 x i16> %vm)
+  ret <4 x half> %res
+}
+
+define <8 x half> @test_fscaleq_f16(<8 x half> %vn, <8 x i16> %vm) {
+; CHECK-LABEL: test_fscaleq_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fscale v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
+  %res = tail call <8 x half> @llvm.aarch64.neon.fp8.fscale.v8f16(<8 x half> %vn, <8 x i16> %vm)
+  ret <8 x half> %res
+}
+
+define <2 x float> @test_fscale_f32(<2 x float> %vn, <2 x i32> %vm) {
+; CHECK-LABEL: test_fscale_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fscale v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
+  %res = tail call <2 x float> @llvm.aarch64.neon.fp8.fscale.v2f32(<2 x float> %vn, <2 x i32> %vm)
+  ret <2 x float> %res
+}
+
+define <4 x float> @test_fscaleq_f32(<4 x float> %vn, <4 x i32> %vm) {
+; CHECK-LABEL: test_fscaleq_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fscale v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
+  %res = tail call <4 x float> @llvm.aarch64.neon.fp8.fscale.v4f32(<4 x float> %vn, <4 x i32> %vm)
+  ret <4 x float> %res
+}
+
+define <2 x double> @test_fscaleq_f64(<2 x double> %vn, <2 x i64> %vm) {
+; CHECK-LABEL: test_fscaleq_f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fscale v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
+  %res = tail call <2 x double> @llvm.aarch64.neon.fp8.fscale.v2f64(<2 x double> %vn, <2 x i64> %vm)
+  ret <2 x double> %res
+}
+
+declare <4 x half> @llvm.aarch64.neon.fp8.fscale.v4f16(<4 x half>, <4 x i16>)
+declare <8 x half> @llvm.aarch64.neon.fp8.fscale.v8f16(<8 x half>, <8 x i16>)
+declare <2 x float> @llvm.aarch64.neon.fp8.fscale.v2f32(<2 x float>, <2 x i32>)
+declare <4 x float> @llvm.aarch64.neon.fp8.fscale.v4f32(<4 x float>, <4 x i32>)
+declare <2 x double> @llvm.aarch64.neon.fp8.fscale.v2f64(<2 x double>, <2 x i64>)

From cbf1133b82951cc410244413a312dc3f20627c20 Mon Sep 17 00:00:00 2001
From: Lukacma
Date: Fri, 16 Aug 2024 12:37:27 +0100
Subject: [PATCH 2/3] Update AArch64InstrFormats.td

---
 llvm/lib/Target/AArch64/AArch64InstrFormats.td | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 4c90449925f107..31858abd0700c1 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -5974,7 +5974,6 @@ multiclass SIMDThreeSameVectorB<bit U, bits<5> opc, string asm,
                         (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
 }
 
-// As above, but only floating point elements supported.
 let mayRaiseFPException = 1, Uses = [FPCR] in
 multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<3> opc,
                                  string asm, SDPatternOperator OpNode> {

From 1d1c54ae8be42ce4a48df81b0f7e8dc0dbef26de Mon Sep 17 00:00:00 2001
From: Marian Lukac
Date: Mon, 2 Sep 2024 13:32:59 +0000
Subject: [PATCH 3/3] Moved definition of instruction class

---
 clang/include/clang/Basic/arm_neon.td          |  2 +-
 .../lib/Target/AArch64/AArch64InstrFormats.td  | 42 +++++++++----------
 llvm/lib/Target/AArch64/AArch64InstrInfo.td    |  2 +-
 3 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index 83e7c8ec65eb17..8652b5e3a9c901 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -2127,7 +2127,7 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "neon,faminmax" in {
   def FAMAX : WInst<"vamax", "...", "fhQdQfQh">;
 }
 
-let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8" in {
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in {
   // fscale
   def FSCALE_V128 : WInst<"vscale", "..(.S)", "QdQfQh">;
   def FSCALE_V64 : WInst<"vscale", "(.q)(.q)(.qS)", "fh">;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 31858abd0700c1..1d1d9b5512cfc7 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -5974,6 +5974,7 @@ multiclass SIMDThreeSameVectorB<bit U, bits<5> opc, string asm,
                         (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
 }
 
+// As above, but only floating point elements supported.
 let mayRaiseFPException = 1, Uses = [FPCR] in
 multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<3> opc,
                                  string asm, SDPatternOperator OpNode> {
@@ -5996,27 +5997,6 @@ multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<3> opc,
                        [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
 }
 
-// As above, but only floating point elements supported.
-let mayRaiseFPException = 1, Uses = [FPCR] in
-multiclass SIMDThreeVectorFP<bit U, bit S, bits<3> opc,
-                             string asm, SDPatternOperator OpNode> {
-  def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64,
-                                      asm, ".4h",
-        [(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4i16 V64:$Rm)))]>;
-  def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128,
-                                      asm, ".8h",
-        [(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8i16 V128:$Rm)))]>;
-  def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64,
-                                      asm, ".2s",
-        [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2i32 V64:$Rm)))]>;
-  def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128,
-                                      asm, ".4s",
-        [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4i32 V128:$Rm)))]>;
-  def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128,
-                                      asm, ".2d",
-        [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2i64 V128:$Rm)))]>;
-}
-
 let mayRaiseFPException = 1, Uses = [FPCR] in
 multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<3> opc,
                                     string asm,
@@ -6263,6 +6243,26 @@ multiclass SIMDThreeSameVectorDOT4 {
                                   V128, v4f32, v16i8, null_frag>;
 }
 
+let mayRaiseFPException = 1, Uses = [FPCR] in
+multiclass SIMDThreeVectorFscale<bit U, bit S, bits<3> opc,
+                                 string asm, SDPatternOperator OpNode> {
+  def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64,
+                                      asm, ".4h",
+        [(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+  def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128,
+                                      asm, ".8h",
+        [(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8i16 V128:$Rm)))]>;
+  def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64,
+                                      asm, ".2s",
+        [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+  def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128,
+                                      asm, ".4s",
+        [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4i32 V128:$Rm)))]>;
+  def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128,
+                                      asm, ".2d",
+        [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2i64 V128:$Rm)))]>;
+}
+
 //----------------------------------------------------------------------------
 // AdvSIMD two register vector instructions.
 //----------------------------------------------------------------------------
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index f40aadb2500c94..943c48c0f230a5 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -10136,7 +10136,7 @@ let Uses = [FPMR, FPCR], Predicates = [HasFP8] in {
   defm BF2CVTL : SIMDMixedTwoVectorFP8<0b11, "bf2cvtl">;
   defm FCVTN_F16_F8 : SIMDThreeSameSizeVectorCvt<"fcvtn">;
   defm FCVTN_F32_F8 : SIMDThreeVectorCvt<"fcvtn">;
-  defm FSCALE : SIMDThreeVectorFP<0b1, 0b1, 0b111, "fscale", int_aarch64_neon_fp8_fscale>;
+  defm FSCALE : SIMDThreeVectorFscale<0b1, 0b1, 0b111, "fscale", int_aarch64_neon_fp8_fscale>;
 } // End let Predicates = [HasFP8]
 
 // fminimum(abs(a), abs(b)) -> famin(a, b)
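
Usage sketch (not part of the patch series above): the example below shows how the new ACLE intrinsics added by this series would be called from C. It assumes a compiler that carries these patches, built with the same -target-feature +neon -target-feature +fp8 configuration used in the RUN lines above; the helper name scale_by_pow2 is hypothetical. As wired up here, vscaleq_f32 maps to llvm.aarch64.neon.fp8.fscale.v4f32 and then to a single FSCALE instruction, which is expected to multiply each lane of the first operand by 2 raised to the corresponding signed integer lane of the second operand.

  #include <arm_neon.h>

  // Hypothetical helper: scales each float lane of x by 2^e[i].
  // With this series applied and the fp8 feature enabled, the call below
  // should compile to "fscale v0.4s, v0.4s, v1.4s" (see neon-fp8-fscale.ll).
  float32x4_t scale_by_pow2(float32x4_t x, int32x4_t e) {
    return vscaleq_f32(x, e);
  }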