diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 43659564d5ace7..37a56e12efcc3d 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -865,6 +865,13 @@ class CombinerHelper {
   /// By default, it erases the instruction def'd on \p MO from the function.
   void applyBuildFnMO(const MachineOperand &MO, BuildFnTy &MatchInfo);
 
+  /// Match FPOWI if it's beneficial to expand it into multiplications.
+  bool matchFPowIExpansion(MachineInstr &MI, int64_t Exponent);
+
+  /// Expands FPOWI into a series of multiplications, followed by a reciprocal
+  /// division when \p Exponent is negative (or the constant 1.0 when it is 0).
+  void applyExpandFPowI(MachineInstr &MI, int64_t Exponent);
+
   /// Combine insert vector element OOB.
   bool matchInsertVectorElementOOB(MachineInstr &MI, BuildFnTy &MatchInfo);
 
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index bd43b958990302..b0789fca630e8d 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1636,6 +1636,13 @@ def sub_of_vscale : GICombineRule<
    [{ return Helper.matchSubOfVScale(${root}, ${matchinfo}); }]),
    (apply [{ Helper.applyBuildFnMO(${root}, ${matchinfo}); }])>;
 
+def expand_const_fpowi : GICombineRule<
+  (defs root:$root),
+  (match (G_CONSTANT $int, $imm),
+         (G_FPOWI $dst, $float, $int):$root,
+         [{ return Helper.matchFPowIExpansion(*${root}, ${imm}.getCImm()->getSExtValue()); }]),
+  (apply [{ Helper.applyExpandFPowI(*${root}, ${imm}.getCImm()->getSExtValue()); }])>;
+
 // match_extract_of_element and insert_vector_elt_oob must be the first!
 def vector_ops_combines: GICombineGroup<[
 match_extract_of_element_undef_vector,
@@ -1786,7 +1793,7 @@ def identity_combines : GICombineGroup<[select_same_val, right_identity_zero,
 def const_combines : GICombineGroup<[constant_fold_fp_ops, const_ptradd_to_i2p,
                                      overlapping_and, mulo_by_2, mulo_by_0,
                                      adde_to_addo,
-                                     combine_minmax_nan]>;
+                                     combine_minmax_nan, expand_const_fpowi]>;
 
 def known_bits_simplifications : GICombineGroup<[
   redundant_and, redundant_sext_inreg, redundant_or, urem_pow2_to_mask,
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index f99d2aa284404e..d57dd6fca01403 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -7349,6 +7349,61 @@ void CombinerHelper::applyBuildFnMO(const MachineOperand &MO,
   Root->eraseFromParent();
 }
 
+bool CombinerHelper::matchFPowIExpansion(MachineInstr &MI, int64_t Exponent) {
+  // Defer the profitability decision to the target; it is told whether the
+  // enclosing function is optimized for size.
+  bool OptForSize = MI.getMF()->getFunction().hasOptSize();
+  return getTargetLowering().isBeneficialToExpandPowI(Exponent, OptForSize);
+}
+
+void CombinerHelper::applyExpandFPowI(MachineInstr &MI, int64_t Exponent) {
+  auto [Dst, Base] = MI.getFirst2Regs();
+  LLT Ty = MRI.getType(Dst);
+  int64_t ExpVal = Exponent;
+
+  // powi(x, 0) is materialized as the constant 1.0 written straight to Dst.
+  if (ExpVal == 0) {
+    Builder.buildFConstant(Dst, 1.0);
+    // The expanded G_FPOWI is dead now; delete it as the general path does.
+    MI.eraseFromParent();
+    return;
+  }
+
+  if (ExpVal < 0)
+    ExpVal = -ExpVal;
+
+  // We use the simple binary decomposition method from SelectionDAG ExpandPowI
+  // to generate the multiply sequence. There are more optimal ways to do this
+  // (for example, powi(x,15) generates one more multiply than it should), but
+  // this has the benefit of being both really simple and much better than a
+  // libcall.
+  std::optional<SrcOp> Res;
+  SrcOp CurSquare = Base;
+  while (ExpVal > 0) {
+    if (ExpVal & 1) {
+      if (!Res)
+        Res = CurSquare;
+      else
+        Res = Builder.buildFMul(Ty, *Res, CurSquare);
+    }
+
+    // Only square again while exponent bits remain, so no unused multiply is
+    // emitted after the final bit has been consumed.
+    ExpVal >>= 1;
+    if (ExpVal > 0)
+      CurSquare = Builder.buildFMul(Ty, CurSquare, CurSquare);
+  }
+
+  // If the original exponent was negative, invert the result, producing
+  // 1/(x*x*x).
+  if (Exponent < 0)
+    Res = Builder.buildFDiv(Ty, Builder.buildFConstant(Ty, 1.0), *Res,
+                            MI.getFlags());
+
+  Builder.buildCopy(Dst, *Res);
+  MI.eraseFromParent();
+}
+
 bool CombinerHelper::matchSextOfTrunc(const MachineOperand &MO,
                                       BuildFnTy &MatchInfo) {
   GSext *Sext = cast<GSext>(getDefIgnoringCopies(MO.getReg(), MRI));
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 430fcae7316891..7087265f335f95 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -7140,8 +7140,6 @@ LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
   return UnableToLegalize;
 }
 
-// TODO: If RHS is a constant SelectionDAGBuilder expands this into a
-// multiplication tree.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) { auto [Dst, Src0, Src1] = MI.getFirst3Regs(); LLT Ty = MRI.getType(Dst); diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-fpowi-optsize.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine-fpowi-optsize.ll new file mode 100644 index 00000000000000..1add4a86aee277 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-fpowi-optsize.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-unknown-unknown -global-isel | FileCheck %s + +define double @pow_optsize(double %x) nounwind optsize { +; CHECK-LABEL: pow_optsize: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w0, #15 // =0xf +; CHECK-NEXT: b __powidf2 +entry: + %0 = call double @llvm.powi.f64.i32(double %x, i32 15) + ret double %0 +} + +define double @pow_optsize_expand(double %x) nounwind optsize { +; CHECK-LABEL: pow_optsize_expand: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmul d0, d0, d0 +; CHECK-NEXT: fmul d0, d0, d0 +; CHECK-NEXT: fmul d0, d0, d0 +; CHECK-NEXT: fmul d0, d0, d0 +; CHECK-NEXT: ret +entry: + %0 = call double @llvm.powi.f64.i32(double %x, i32 16) + ret double %0 +} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-fpowi.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-fpowi.mir new file mode 100644 index 00000000000000..8b8158348e399a --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-fpowi.mir @@ -0,0 +1,124 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s | FileCheck %s + +--- +name: fpowi_s64_zero +body: | + bb.0: + liveins: $d0, $w0 + + ; CHECK-LABEL: name: fpowi_s64_zero + ; CHECK: liveins: $d0, $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 
1.000000e+00 + ; CHECK-NEXT: $d0 = COPY [[C]](s64) + %0:_(s64) = COPY $d0 + %1:_(s32) = COPY $w0 + %2:_(s32) = G_CONSTANT i32 0 + %3:_(s64) = nnan ninf nsz arcp contract afn reassoc G_FPOWI %0, %2(s32) + $d0 = COPY %3(s64) +... + +--- +name: fpowi_s32_zero +body: | + bb.0: + liveins: $d0, $w0 + + ; CHECK-LABEL: name: fpowi_s32_zero + ; CHECK: liveins: $d0, $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; CHECK-NEXT: $s0 = COPY [[C]](s32) + %0:_(s32) = COPY $s0 + %1:_(s32) = COPY $w0 + %2:_(s32) = G_CONSTANT i32 0 + %3:_(s32) = nnan ninf nsz arcp contract afn reassoc G_FPOWI %0, %2(s32) + $s0 = COPY %3(s32) +... + +--- +name: fpowi_positive +body: | + bb.0: + liveins: $d0, $w0 + + ; CHECK-LABEL: name: fpowi_positive + ; CHECK: liveins: $d0, $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0 + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[COPY]], [[COPY]] + ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[FMUL]], [[FMUL]] + ; CHECK-NEXT: [[FMUL2:%[0-9]+]]:_(s64) = G_FMUL [[COPY]], [[FMUL1]] + ; CHECK-NEXT: $d0 = COPY [[FMUL2]](s64) + %0:_(s64) = COPY $d0 + %1:_(s32) = COPY $w0 + %2:_(s32) = G_CONSTANT i32 5 + %3:_(s64) = nnan ninf nsz arcp contract afn reassoc G_FPOWI %0, %2(s32) + $d0 = COPY %3(s64) +... 
+ +--- +name: fpowi_s64_negative +body: | + bb.0: + liveins: $d0, $w0 + + ; CHECK-LABEL: name: fpowi_s64_negative + ; CHECK: liveins: $d0, $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0 + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[COPY]], [[COPY]] + ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[FMUL]], [[FMUL]] + ; CHECK-NEXT: [[FMUL2:%[0-9]+]]:_(s64) = G_FMUL [[COPY]], [[FMUL1]] + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00 + ; CHECK-NEXT: [[FDIV:%[0-9]+]]:_(s64) = nnan ninf nsz arcp contract afn reassoc G_FDIV [[C]], [[FMUL2]] + ; CHECK-NEXT: $d0 = COPY [[FDIV]](s64) + %0:_(s64) = COPY $d0 + %1:_(s32) = COPY $w0 + %2:_(s32) = G_CONSTANT i32 -5 + %3:_(s64) = nnan ninf nsz arcp contract afn reassoc G_FPOWI %0, %2(s32) + $d0 = COPY %3(s64) +... + +--- +name: fpowi_s32_negative +body: | + bb.0: + liveins: $d0, $w0 + + ; CHECK-LABEL: name: fpowi_s32_negative + ; CHECK: liveins: $d0, $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[COPY]] + ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMUL]], [[FMUL]] + ; CHECK-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[FMUL1]] + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; CHECK-NEXT: [[FDIV:%[0-9]+]]:_(s32) = nnan ninf nsz arcp contract afn reassoc G_FDIV [[C]], [[FMUL2]] + ; CHECK-NEXT: $s0 = COPY [[FDIV]](s32) + %0:_(s32) = COPY $s0 + %1:_(s32) = COPY $w0 + %2:_(s32) = G_CONSTANT i32 -5 + %3:_(s32) = nnan ninf nsz arcp contract afn reassoc G_FPOWI %0, %2(s32) + $s0 = COPY %3(s32) +... 
+ +--- +name: fpowi_libcall +body: | + bb.0: + liveins: $d0, $w0 + + ; CHECK-LABEL: name: fpowi_libcall + ; CHECK: liveins: $d0, $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[FPOWI:%[0-9]+]]:_(s64) = nnan ninf nsz arcp contract afn reassoc G_FPOWI [[COPY]], [[COPY1]](s32) + ; CHECK-NEXT: $d0 = COPY [[FPOWI]](s64) + %0:_(s64) = COPY $d0 + %1:_(s32) = COPY $w0 + %2:_(s64) = nnan ninf nsz arcp contract afn reassoc G_FPOWI %0, %1(s32) + $d0 = COPY %2(s64) +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll index b169063d67872d..9d586e3e4a09a4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll @@ -141,51 +141,57 @@ define float @v_powi_1_f32(float %l) { } define float @v_powi_neg1_f32(float %l) { -; GFX78-LABEL: v_powi_neg1_f32: -; GFX78: ; %bb.0: -; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX78-NEXT: v_log_f32_e32 v0, v0 -; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX78-NEXT: v_mul_legacy_f32_e32 v0, -1.0, v0 -; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 -; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GFX78-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX78-NEXT: v_exp_f32_e32 v0, v0 -; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX78-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: v_powi_neg1_f32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX7-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX7-NEXT: v_rcp_f32_e32 v2, v1 +; GFX7-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX7-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX7-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX7-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX7-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX7-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX7-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX7-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_powi_neg1_f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 +; GFX8-NEXT: v_rcp_f32_e32 v3, v1 +; GFX8-NEXT: v_fma_f32 v4, -v1, v3, 1.0 +; GFX8-NEXT: v_fma_f32 v3, v4, v3, v3 +; GFX8-NEXT: v_mul_f32_e32 v4, v2, v3 +; GFX8-NEXT: v_fma_f32 v5, -v1, v4, v2 +; GFX8-NEXT: v_fma_f32 v4, v5, v3, v4 +; GFX8-NEXT: v_fma_f32 v1, -v1, v4, v2 +; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; GFX8-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_powi_neg1_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo -; GFX11-NEXT: v_log_f32_e32 v0, v0 +; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, 1.0 +; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f32_e32 v2, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0 +; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2 ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, -1.0, v0 -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo -; GFX11-NEXT: v_exp_f32_e32 v0, v0 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_mul_f32_e32 v3, v4, v2 +; GFX11-NEXT: v_fma_f32 v5, -v1, v3, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v2 +; GFX11-NEXT: v_fma_f32 v1, -v1, v3, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v3 +; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 -1) ret float %res @@ -195,99 +201,74 @@ define float @v_powi_2_f32(float %l) { ; GFX78-LABEL: v_powi_2_f32: ; GFX78: ; %bb.0: ; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX78-NEXT: v_log_f32_e32 v0, v0 -; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 2.0, v0 -; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 -; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GFX78-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX78-NEXT: v_exp_f32_e32 v0, v0 -; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, 
v1, vcc -; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0 ; GFX78-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_powi_2_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo -; GFX11-NEXT: v_log_f32_e32 v0, v0 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 2.0, v0 -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo -; GFX11-NEXT: v_exp_f32_e32 v0, v0 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 2) ret float %res } define float @v_powi_neg2_f32(float %l) { -; GFX78-LABEL: v_powi_neg2_f32: -; GFX78: ; %bb.0: -; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX78-NEXT: v_log_f32_e32 v0, v0 -; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX78-NEXT: v_mul_legacy_f32_e32 v0, -2.0, v0 -; GFX78-NEXT: v_mov_b32_e32 
v1, 0xc2fc0000 -; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GFX78-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX78-NEXT: v_exp_f32_e32 v0, v0 -; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX78-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: v_powi_neg2_f32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX7-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX7-NEXT: v_rcp_f32_e32 v2, v1 +; GFX7-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX7-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX7-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX7-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX7-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX7-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX7-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX7-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_powi_neg2_f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 +; GFX8-NEXT: v_rcp_f32_e32 v3, v1 +; GFX8-NEXT: v_fma_f32 v4, -v1, v3, 1.0 +; GFX8-NEXT: v_fma_f32 v3, v4, v3, v3 +; GFX8-NEXT: v_mul_f32_e32 v4, v2, v3 +; GFX8-NEXT: v_fma_f32 v5, -v1, v4, v2 +; GFX8-NEXT: v_fma_f32 v4, v5, v3, v4 +; GFX8-NEXT: v_fma_f32 v1, -v1, v4, v2 +; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; GFX8-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_powi_neg2_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; 
GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo -; GFX11-NEXT: v_log_f32_e32 v0, v0 +; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, 1.0 +; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0 +; GFX11-NEXT: v_rcp_f32_e32 v2, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, -2.0, v0 -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo -; GFX11-NEXT: v_exp_f32_e32 v0, v0 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2 +; GFX11-NEXT: v_mul_f32_e32 v3, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f32 v5, -v1, v3, v4 +; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f32 v1, -v1, v3, v4 +; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 -2) ret float %res @@ -297,48 +278,16 @@ define float @v_powi_4_f32(float %l) { ; GFX78-LABEL: v_powi_4_f32: ; GFX78: ; %bb.0: ; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX78-NEXT: v_log_f32_e32 v0, v0 -; 
GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 4.0, v0 -; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 -; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GFX78-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX78-NEXT: v_exp_f32_e32 v0, v0 -; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0 ; GFX78-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_powi_4_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo -; GFX11-NEXT: v_log_f32_e32 v0, v0 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 4.0, v0 -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo -; GFX11-NEXT: v_exp_f32_e32 v0, v0 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 4) ret float %res @@ -348,48 
+297,18 @@ define float @v_powi_8_f32(float %l) { ; GFX78-LABEL: v_powi_8_f32: ; GFX78: ; %bb.0: ; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX78-NEXT: v_log_f32_e32 v0, v0 -; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 0x41000000, v0 -; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 -; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GFX78-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX78-NEXT: v_exp_f32_e32 v0, v0 -; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0 ; GFX78-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_powi_8_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo -; GFX11-NEXT: v_log_f32_e32 v0, v0 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 0x41000000, v0 -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 
| instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo -; GFX11-NEXT: v_exp_f32_e32 v0, v0 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 8) ret float %res @@ -399,48 +318,21 @@ define float @v_powi_16_f32(float %l) { ; GFX78-LABEL: v_powi_16_f32: ; GFX78: ; %bb.0: ; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX78-NEXT: v_log_f32_e32 v0, v0 -; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 0x41800000, v0 -; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 -; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GFX78-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX78-NEXT: v_exp_f32_e32 v0, v0 -; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0 ; GFX78-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_powi_16_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX11-NEXT: 
v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo -; GFX11-NEXT: v_log_f32_e32 v0, v0 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 0x41800000, v0 -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo -; GFX11-NEXT: v_exp_f32_e32 v0, v0 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 16) ret float %res @@ -450,99 +342,110 @@ define float @v_powi_128_f32(float %l) { ; GFX78-LABEL: v_powi_128_f32: ; GFX78: ; %bb.0: ; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX78-NEXT: v_log_f32_e32 v0, v0 -; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 0x43000000, v0 -; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 -; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GFX78-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX78-NEXT: v_exp_f32_e32 v0, v0 -; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; 
GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX78-NEXT: v_mul_f32_e32 v0, v0, v0 ; GFX78-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_powi_128_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo -; GFX11-NEXT: v_log_f32_e32 v0, v0 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 0x43000000, v0 -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo -; GFX11-NEXT: v_exp_f32_e32 v0, v0 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 128) ret float %res } define 
float @v_powi_neg128_f32(float %l) { -; GFX78-LABEL: v_powi_neg128_f32: -; GFX78: ; %bb.0: -; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX78-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX78-NEXT: v_mov_b32_e32 v2, 0x4f800000 -; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX78-NEXT: v_log_f32_e32 v0, v0 -; GFX78-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX78-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GFX78-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 0xc3000000, v0 -; GFX78-NEXT: v_mov_b32_e32 v1, 0xc2fc0000 -; GFX78-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GFX78-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX78-NEXT: v_exp_f32_e32 v0, v0 -; GFX78-NEXT: v_mov_b32_e32 v1, 0x1f800000 -; GFX78-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc -; GFX78-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX78-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: v_powi_neg128_f32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX7-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX7-NEXT: v_rcp_f32_e32 v2, v1 +; GFX7-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX7-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX7-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX7-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX7-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX7-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX7-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX7-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_powi_neg128_f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 +; GFX8-NEXT: v_rcp_f32_e32 v3, v1 +; GFX8-NEXT: v_fma_f32 v4, -v1, v3, 1.0 +; GFX8-NEXT: v_fma_f32 v3, v4, v3, v3 +; GFX8-NEXT: v_mul_f32_e32 v4, v2, v3 +; GFX8-NEXT: v_fma_f32 v5, -v1, v4, v2 +; GFX8-NEXT: v_fma_f32 v4, v5, v3, v4 +; GFX8-NEXT: v_fma_f32 v1, -v1, v4, v2 +; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; GFX8-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_powi_neg128_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo -; GFX11-NEXT: v_log_f32_e32 v0, v0 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 0xc3000000, v0 -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v0 +; GFX11-NEXT: 
v_mul_f32_e32 v0, v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo -; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, 1.0 +; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0 +; GFX11-NEXT: v_rcp_f32_e32 v2, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2 +; GFX11-NEXT: v_mul_f32_e32 v3, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f32 v5, -v1, v3, v4 +; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f32 v1, -v1, v3, v4 +; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 -128) ret float %res