diff --git a/doc/running.html b/doc/running.html
index 7868efab7e..1cf41f1bb8 100644
--- a/doc/running.html
+++ b/doc/running.html
@@ -226,6 +226,12 @@ <h3 id="opt_O"><tt>-O[level]</tt><br>
 overrides all earlier flags.
 </p>
 <p>
+Note that <tt>-Ofma</tt> is not enabled by default at any level,
+because it affects floating-point result accuracy. Only enable this,
+if you fully understand the trade-offs of FMA for performance (higher),
+determinism (lower) and numerical accuracy (higher).
+</p>
+<p>
 Here are the available flags and at what optimization levels they
 are enabled:
 </p>
@@ -257,6 +263,8 @@ <h3 id="opt_O"><tt>-O[level]</tt><br>
 <td class="flag_name">sink</td><td class="flag_level">&nbsp;</td><td class="flag_level">&nbsp;</td><td class="flag_level">&bull;</td><td class="flag_desc">Allocation/Store Sinking</td></tr>
 <tr class="even">
 <td class="flag_name">fuse</td><td class="flag_level">&nbsp;</td><td class="flag_level">&nbsp;</td><td class="flag_level">&bull;</td><td class="flag_desc">Fusion of operands into instructions</td></tr>
+<tr class="odd">
+<td class="flag_name">fma </td><td class="flag_level">&nbsp;</td><td class="flag_level">&nbsp;</td><td class="flag_level">&nbsp;</td><td class="flag_desc">Fused multiply-add</td></tr>
 </table>
 <p>
 Here are the parameters and their default settings:
diff --git a/src/lj_asm_arm.h b/src/lj_asm_arm.h
index 5a0f925fe1..041cd79448 100644
--- a/src/lj_asm_arm.h
+++ b/src/lj_asm_arm.h
@@ -310,7 +310,11 @@ static void asm_fusexref(ASMState *as, ARMIns ai, Reg rd, IRRef ref,
 }
 
 #if !LJ_SOFTFP
-/* Fuse to multiply-add/sub instruction. */
+/*
+** Fuse to multiply-add/sub instruction.
+** VMLA rounds twice (UMA, not FMA) -- no need to check for JIT_F_OPT_FMA.
+** VFMA needs VFPv4, which is uncommon on the remaining ARM32 targets.
+*/
 static int asm_fusemadd(ASMState *as, IRIns *ir, ARMIns ai, ARMIns air)
 {
   IRRef lref = ir->op1, rref = ir->op2;
diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h
index 88b47ceb3a..554bb60a54 100644
--- a/src/lj_asm_arm64.h
+++ b/src/lj_asm_arm64.h
@@ -334,7 +334,8 @@ static int asm_fusemadd(ASMState *as, IRIns *ir, A64Ins ai, A64Ins air)
 {
   IRRef lref = ir->op1, rref = ir->op2;
   IRIns *irm;
-  if (lref != rref &&
+  if ((as->flags & JIT_F_OPT_FMA) &&
+      lref != rref &&
       ((mayfuse(as, lref) && (irm = IR(lref), irm->o == IR_MUL) &&
        ra_noreg(irm->r)) ||
        (mayfuse(as, rref) && (irm = IR(rref), irm->o == IR_MUL) &&
diff --git a/src/lj_asm_ppc.h b/src/lj_asm_ppc.h
index 7bba71b3cc..52db292692 100644
--- a/src/lj_asm_ppc.h
+++ b/src/lj_asm_ppc.h
@@ -232,7 +232,8 @@ static int asm_fusemadd(ASMState *as, IRIns *ir, PPCIns pi, PPCIns pir)
 {
   IRRef lref = ir->op1, rref = ir->op2;
   IRIns *irm;
-  if (lref != rref &&
+  if ((as->flags & JIT_F_OPT_FMA) &&
+      lref != rref &&
       ((mayfuse(as, lref) && (irm = IR(lref), irm->o == IR_MUL) &&
 	ra_noreg(irm->r)) ||
        (mayfuse(as, rref) && (irm = IR(rref), irm->o == IR_MUL) &&
diff --git a/src/lj_jit.h b/src/lj_jit.h
index 47df85c6c1..73c355b913 100644
--- a/src/lj_jit.h
+++ b/src/lj_jit.h
@@ -86,10 +86,11 @@
 #define JIT_F_OPT_ABC		(JIT_F_OPT << 7)
 #define JIT_F_OPT_SINK		(JIT_F_OPT << 8)
 #define JIT_F_OPT_FUSE		(JIT_F_OPT << 9)
+#define JIT_F_OPT_FMA		(JIT_F_OPT << 10)
 
 /* Optimizations names for -O. Must match the order above. */
 #define JIT_F_OPTSTRING	\
-  "\4fold\3cse\3dce\3fwd\3dse\6narrow\4loop\3abc\4sink\4fuse"
+  "\4fold\3cse\3dce\3fwd\3dse\6narrow\4loop\3abc\4sink\4fuse\3fma"
 
 /* Optimization levels set a fixed combination of flags. */
 #define JIT_F_OPT_0	0
@@ -98,6 +99,7 @@
 #define JIT_F_OPT_3	(JIT_F_OPT_2|\
   JIT_F_OPT_FWD|JIT_F_OPT_DSE|JIT_F_OPT_ABC|JIT_F_OPT_SINK|JIT_F_OPT_FUSE)
 #define JIT_F_OPT_DEFAULT	JIT_F_OPT_3
+/* Note: FMA is not set by default. */
 
 /* -- JIT engine parameters ----------------------------------------------- */
 
diff --git a/src/lj_vmmath.c b/src/lj_vmmath.c
index faebe719ef..29b72e0ca0 100644
--- a/src/lj_vmmath.c
+++ b/src/lj_vmmath.c
@@ -36,6 +36,17 @@ LJ_FUNCA double lj_wrap_fmod(double x, double y) { return fmod(x, y); }
 
 /* -- Helper functions ---------------------------------------------------- */
 
+/* Required to prevent the C compiler from applying FMA optimizations.
+**
+** Yes, there's -ffp-contract and the FP_CONTRACT pragma ... in theory.
+** But the current state of C compilers is a mess in this regard.
+** Also, this function is not performance sensitive at all.
+*/
+LJ_NOINLINE static double lj_vm_floormul(double x, double y)
+{
+  return lj_vm_floor(x / y) * y;
+}
+
 double lj_vm_foldarith(double x, double y, int op)
 {
   switch (op) {
@@ -43,7 +54,7 @@ double lj_vm_foldarith(double x, double y, int op)
   case IR_SUB - IR_ADD: return x-y; break;
   case IR_MUL - IR_ADD: return x*y; break;
   case IR_DIV - IR_ADD: return x/y; break;
-  case IR_MOD - IR_ADD: return x-lj_vm_floor(x/y)*y; break;
+  case IR_MOD - IR_ADD: return x-lj_vm_floormul(x, y); break;
   case IR_POW - IR_ADD: return pow(x, y); break;
   case IR_NEG - IR_ADD: return -x; break;
   case IR_ABS - IR_ADD: return fabs(x); break;
diff --git a/src/vm_arm64.dasc b/src/vm_arm64.dasc
index 1cf1ea519e..c5f0a7a736 100644
--- a/src/vm_arm64.dasc
+++ b/src/vm_arm64.dasc
@@ -2581,7 +2581,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |.macro ins_arithmod, res, reg1, reg2
     |  fdiv d2, reg1, reg2
     |  frintm d2, d2
-    |  fmsub res, d2, reg2, reg1
+    |  // Cannot use fmsub, because FMA is not enabled by default.
+    |  fmul d2, d2, reg2
+    |  fsub res, reg1, d2
     |.endmacro
     |
     |.macro ins_arithdn, intins, fpins
diff --git a/test/tarantool-tests/lj-918-fma-numerical-accuracy-jit.test.lua b/test/tarantool-tests/lj-918-fma-numerical-accuracy-jit.test.lua
new file mode 100644
index 0000000000..8b16d4c3e7
--- /dev/null
+++ b/test/tarantool-tests/lj-918-fma-numerical-accuracy-jit.test.lua
@@ -0,0 +1,48 @@
+local tap = require('tap')
+
+-- Test file to demonstrate consistent behaviour for JIT and the
+-- VM regarding FMA optimization (disabled by default).
+-- XXX: The VM behaviour is checked in the
+-- <lj-918-fma-numerical-accuracy.test.lua>.
+-- See also: https://github.com/LuaJIT/LuaJIT/issues/918.
+local test = tap.test('lj-918-fma-numerical-accuracy-jit'):skipcond({
+  ['Test requires JIT enabled'] = not jit.status(),
+})
+
+test:plan(1)
+
+local _2pow52 = 2 ^ 52
+
+-- XXX: Before this commit the LuaJIT arm64 VM uses `fmsub` [1]
+-- instruction for the modulo operation, which is the fused
+-- multiply-add (FMA [2]) operation (more precisely,
+-- multiply-sub). Hence, it may produce different results compared
+-- to the unfused one. For the test, let's just use 2 numbers in
+-- modulo for which the single rounding is different from the
+-- double rounding. The numbers from the original issue are good
+-- enough.
+--
+-- [1]:https://developer.arm.com/documentation/dui0801/g/A64-Floating-point-Instructions/FMSUB
+-- [2]:https://en.wikipedia.org/wiki/Multiply%E2%80%93accumulate_operation
+--
+-- IEEE754 components to double:
+-- sign * (2 ^ (exp - 1023)) * (mantissa / _2pow52 + normal).
+local a = 1 * (2 ^ (1083 - 1023)) * (4080546448249347 / _2pow52 + 1)
+assert(a == 2197541395358679800)
+
+local b = -1 * (2 ^ (1052 - 1023)) * (3927497732209973 / _2pow52 + 1)
+assert(b == -1005065126.3690554)
+
+local results = {}
+
+jit.opt.start('hotloop=1')
+for i = 1, 4 do
+  results[i] = a % b
+end
+
+-- XXX: The test doesn't fail before this commit. But it is
+-- required to be sure that there are no inconsistencies after the
+-- commit.
+test:samevalues(results, 'consistent behaviour between the JIT and the VM')
+
+test:done(true)
diff --git a/test/tarantool-tests/lj-918-fma-numerical-accuracy.test.lua b/test/tarantool-tests/lj-918-fma-numerical-accuracy.test.lua
new file mode 100644
index 0000000000..25b5970756
--- /dev/null
+++ b/test/tarantool-tests/lj-918-fma-numerical-accuracy.test.lua
@@ -0,0 +1,43 @@
+local tap = require('tap')
+
+-- Test file to demonstrate possible numerical inaccuracy if FMA
+-- optimization takes place.
+-- XXX: The JIT consistency is checked in the
+-- <lj-918-fma-numerical-accuracy-jit.test.lua>.
+-- See also: https://github.com/LuaJIT/LuaJIT/issues/918.
+local test = tap.test('lj-918-fma-numerical-accuracy')
+
+test:plan(2)
+
+local _2pow52 = 2 ^ 52
+
+-- XXX: Before this commit the LuaJIT arm64 VM uses `fmsub` [1]
+-- instruction for the modulo operation, which is the fused
+-- multiply-add (FMA [2]) operation (more precisely,
+-- multiply-sub). Hence, it may produce different results compared
+-- to the unfused one. For the test, let's just use 2 numbers in
+-- modulo for which the single rounding is different from the
+-- double rounding. The numbers from the original issue are good
+-- enough.
+--
+-- [1]:https://developer.arm.com/documentation/dui0801/g/A64-Floating-point-Instructions/FMSUB
+-- [2]:https://en.wikipedia.org/wiki/Multiply%E2%80%93accumulate_operation
+--
+-- IEEE754 components to double:
+-- sign * (2 ^ (exp - 1023)) * (mantissa / _2pow52 + normal).
+local a = 1 * (2 ^ (1083 - 1023)) * (4080546448249347 / _2pow52 + 1)
+assert(a == 2197541395358679800)
+
+local b = -1 * (2 ^ (1052 - 1023)) * (3927497732209973 / _2pow52 + 1)
+assert(b == -1005065126.3690554)
+
+-- These tests fail on ARM64 before this patch or with FMA
+-- optimization enabled.
+-- The first test may not fail if the compiler doesn't generate
+-- an ARM64 FMA operation in `lj_vm_foldarith()`.
+test:is(2197541395358679800 % -1005065126.3690554, -606337536,
+        'FMA in the lj_vm_foldarith() during parsing')
+
+test:is(a % b, -606337536, 'FMA in the VM')
+
+test:done(true)
diff --git a/test/tarantool-tests/lj-918-fma-optimization.test.lua b/test/tarantool-tests/lj-918-fma-optimization.test.lua
new file mode 100644
index 0000000000..9396e5584f
--- /dev/null
+++ b/test/tarantool-tests/lj-918-fma-optimization.test.lua
@@ -0,0 +1,25 @@
+local tap = require('tap')
+local test = tap.test('lj-918-fma-optimization'):skipcond({
+  ['Test requires JIT enabled'] = not jit.status(),
+})
+
+test:plan(3)
+
+local function jit_opt_is_on(flag)
+  for _, opt in ipairs({jit.status()}) do
+    if opt == flag then
+      return true
+    end
+  end
+  return false
+end
+
+test:ok(not jit_opt_is_on('fma'), 'FMA is disabled by default')
+
+local ok, _ = pcall(jit.opt.start, '+fma')
+
+test:ok(ok, 'fma flag is recognized')
+
+test:ok(jit_opt_is_on('fma'), 'FMA is enabled after jit.opt.start()')
+
+test:done(true)