Skip to content

Commit

Permalink
Cleanup CPU detection and tuning for old CPUs.
Browse files Browse the repository at this point in the history
(cherry picked from commit 0eddcbe)

This patch does the following refactoring:
1) Drops optimizations for the Intel Atom CPU [1]: removes the
   `JIT_F_LEA_AGU` flag and related optimizations. The considerations
   for the use of LEA are complex and very CPU-specific, mostly
   dependent on the number of operands. In most cases, using LEA isn't
   worth it due to the extra register pressure and/or extra instructions.
   Note that this concerns the original, now obsolete, Atom
   architecture. Today "Intel Atom" is just a trade name for
   reduced-performance implementations of the current Intel
   architecture.
2) Drops optimizations for the AMD K8 and K10 CPUs [2][3]: removes the
   `JIT_F_PREFER_IMUL` flag and related optimizations.
3) Refactors the JIT flags defined in <lj_jit.h>. Now all CPU-specific
   JIT flags are defined as left shifts of `JIT_F_CPU` instead of
   hardcoded constants; the optimization flags are likewise defined
   as left shifts of `JIT_F_OPT`.
4) Adds detection of ARMv8 CPUs.
5) Drops the check for SSE2 since the VM already presumes that the CPU
   supports it.
6) Adds checks for the `__ARM_ARCH` [4] macro in <lj_arch.h>.
7) Drops the outdated comment in the amalgamation file about memory
   requirements.

Sergey Kaplun:
* added the description for the patch

[1]: https://en.wikipedia.org/wiki/Intel_Atom
[2]: https://en.wikipedia.org/wiki/AMD_K8
[3]: https://en.wikipedia.org/wiki/AMD_K10
[4]: https://developer.arm.com/documentation/dui0774/l/Other-Compiler-specific-Features/Predefined-macros

Part of tarantool/tarantool#10709

Reviewed-by: Sergey Bronnikov <[email protected]>
Signed-off-by: Sergey Kaplun <[email protected]>
  • Loading branch information
Mike Pall authored and Buristan committed Jan 20, 2025
1 parent ffede1b commit 1d988a8
Show file tree
Hide file tree
Showing 9 changed files with 87 additions and 138 deletions.
1 change: 0 additions & 1 deletion src/Makefile.original
Original file line number Diff line number Diff line change
Expand Up @@ -621,7 +621,6 @@ E= @echo
default all: $(TARGET_T)

amalg:
@grep "^[+|]" ljamalg.c
$(MAKE) -f Makefile.original all "LJCORE_O=ljamalg.o"

clean:
Expand Down
65 changes: 23 additions & 42 deletions src/lib_jit.c
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,8 @@ LJLIB_CF(jit_status)
jit_State *J = L2J(L);
L->top = L->base;
setboolV(L->top++, (J->flags & JIT_F_ON) ? 1 : 0);
flagbits_to_strings(L, J->flags, JIT_F_CPU_FIRST, JIT_F_CPUSTRING);
flagbits_to_strings(L, J->flags, JIT_F_OPT_FIRST, JIT_F_OPTSTRING);
flagbits_to_strings(L, J->flags, JIT_F_CPU, JIT_F_CPUSTRING);
flagbits_to_strings(L, J->flags, JIT_F_OPT, JIT_F_OPTSTRING);
return (int)(L->top - L->base);
#else
setboolV(L->top++, 0);
Expand Down Expand Up @@ -467,7 +467,7 @@ static int jitopt_flag(jit_State *J, const char *str)
str += str[2] == '-' ? 3 : 2;
set = 0;
}
for (opt = JIT_F_OPT_FIRST; ; opt <<= 1) {
for (opt = JIT_F_OPT; ; opt <<= 1) {
size_t len = *(const uint8_t *)lst;
if (len == 0)
break;
Expand Down Expand Up @@ -636,80 +636,64 @@ JIT_PARAMDEF(JIT_PARAMINIT)
#undef JIT_PARAMINIT
0
};
#endif

#if LJ_TARGET_ARM && LJ_TARGET_LINUX
#include <sys/utsname.h>
#endif

/* Arch-dependent CPU detection. */
static uint32_t jit_cpudetect(lua_State *L)
/* Arch-dependent CPU feature detection. */
static uint32_t jit_cpudetect(void)
{
uint32_t flags = 0;
#if LJ_TARGET_X86ORX64

uint32_t vendor[4];
uint32_t features[4];
if (lj_vm_cpuid(0, vendor) && lj_vm_cpuid(1, features)) {
#if !LJ_HASJIT
#define JIT_F_SSE2 2
#endif
flags |= ((features[3] >> 26)&1) * JIT_F_SSE2;
#if LJ_HASJIT
flags |= ((features[2] >> 0)&1) * JIT_F_SSE3;
flags |= ((features[2] >> 19)&1) * JIT_F_SSE4_1;
if (vendor[2] == 0x6c65746e) { /* Intel. */
if ((features[0] & 0x0fff0ff0) == 0x000106c0) /* Atom. */
flags |= JIT_F_LEA_AGU;
} else if (vendor[2] == 0x444d4163) { /* AMD. */
uint32_t fam = (features[0] & 0x0ff00f00);
if (fam >= 0x00000f00) /* K8, K10. */
flags |= JIT_F_PREFER_IMUL;
}
if (vendor[0] >= 7) {
uint32_t xfeatures[4];
lj_vm_cpuid(7, xfeatures);
flags |= ((xfeatures[1] >> 8)&1) * JIT_F_BMI2;
}
#endif
}
/* Check for required instruction set support on x86 (unnecessary on x64). */
#if LJ_TARGET_X86
if (!(flags & JIT_F_SSE2))
luaL_error(L, "CPU with SSE2 required");
#endif
/* Don't bother checking for SSE2 -- the VM will crash before getting here. */

#elif LJ_TARGET_ARM
#if LJ_HASJIT

int ver = LJ_ARCH_VERSION; /* Compile-time ARM CPU detection. */
#if LJ_TARGET_LINUX
if (ver < 70) { /* Runtime ARM CPU detection. */
struct utsname ut;
uname(&ut);
if (strncmp(ut.machine, "armv", 4) == 0) {
if (ut.machine[4] >= '7')
ver = 70;
else if (ut.machine[4] == '6')
ver = 60;
if (ut.machine[4] >= '8') ver = 80;
else if (ut.machine[4] == '7') ver = 70;
else if (ut.machine[4] == '6') ver = 60;
}
}
#endif
flags |= ver >= 70 ? JIT_F_ARMV7 :
ver >= 61 ? JIT_F_ARMV6T2_ :
ver >= 60 ? JIT_F_ARMV6_ : 0;
flags |= LJ_ARCH_HASFPU == 0 ? 0 : ver >= 70 ? JIT_F_VFPV3 : JIT_F_VFPV2;
#endif

#elif LJ_TARGET_ARM64

/* No optional CPU features to detect (for now). */

#elif LJ_TARGET_PPC
#if LJ_HASJIT

#if LJ_ARCH_SQRT
flags |= JIT_F_SQRT;
#endif
#if LJ_ARCH_ROUND
flags |= JIT_F_ROUND;
#endif
#endif

#elif LJ_TARGET_MIPS
#if LJ_HASJIT

/* Compile-time MIPS CPU detection. */
#if LJ_ARCH_VERSION >= 20
flags |= JIT_F_MIPSXXR2;
Expand All @@ -727,31 +711,28 @@ static uint32_t jit_cpudetect(lua_State *L)
if (x) flags |= JIT_F_MIPSXXR2; /* Either 0x80000000 (R2) or 0 (R1). */
}
#endif
#endif

#else
#error "Missing CPU detection for this architecture"
#endif
UNUSED(L);
return flags;
}

/* Initialize JIT compiler. */
static void jit_init(lua_State *L)
{
uint32_t flags = jit_cpudetect(L);
#if LJ_HASJIT
jit_State *J = L2J(L);
J->flags = flags | JIT_F_ON | JIT_F_OPT_DEFAULT;
J->flags = jit_cpudetect() | JIT_F_ON | JIT_F_OPT_DEFAULT;
memcpy(J->param, jit_param_default, sizeof(J->param));
lj_dispatch_update(G(L));
#else
UNUSED(flags);
#endif
}
#endif

LUALIB_API int luaopen_jit(lua_State *L)
{
#if LJ_HASJIT
jit_init(L);
#endif
lua_pushliteral(L, LJ_OS_NAME);
lua_pushliteral(L, LJ_ARCH_NAME);
lua_pushinteger(L, LUAJIT_VERSION_NUM);
Expand Down
6 changes: 3 additions & 3 deletions src/lj_arch.h
Original file line number Diff line number Diff line change
Expand Up @@ -209,13 +209,13 @@
#define LJ_TARGET_UNIFYROT 2 /* Want only IR_BROR. */
#define LJ_ARCH_NUMMODE LJ_NUMMODE_DUAL

#if __ARM_ARCH_8__ || __ARM_ARCH_8A__
#if __ARM_ARCH == 8 || __ARM_ARCH_8__ || __ARM_ARCH_8A__
#define LJ_ARCH_VERSION 80
#elif __ARM_ARCH_7__ || __ARM_ARCH_7A__ || __ARM_ARCH_7R__ || __ARM_ARCH_7S__ || __ARM_ARCH_7VE__
#elif __ARM_ARCH == 7 || __ARM_ARCH_7__ || __ARM_ARCH_7A__ || __ARM_ARCH_7R__ || __ARM_ARCH_7S__ || __ARM_ARCH_7VE__
#define LJ_ARCH_VERSION 70
#elif __ARM_ARCH_6T2__
#define LJ_ARCH_VERSION 61
#elif __ARM_ARCH_6__ || __ARM_ARCH_6J__ || __ARM_ARCH_6K__ || __ARM_ARCH_6Z__ || __ARM_ARCH_6ZK__
#elif __ARM_ARCH == 6 || __ARM_ARCH_6__ || __ARM_ARCH_6J__ || __ARM_ARCH_6K__ || __ARM_ARCH_6Z__ || __ARM_ARCH_6ZK__
#define LJ_ARCH_VERSION 60
#else
#define LJ_ARCH_VERSION 50
Expand Down
33 changes: 9 additions & 24 deletions src/lj_asm_x86.h
Original file line number Diff line number Diff line change
Expand Up @@ -1224,13 +1224,8 @@ static void asm_href(ASMState *as, IRIns *ir, IROp merge)
emit_rmro(as, XO_MOV, dest|REX_GC64, tab, offsetof(GCtab, node));
} else {
emit_rmro(as, XO_ARITH(XOg_ADD), dest|REX_GC64, tab, offsetof(GCtab,node));
if ((as->flags & JIT_F_PREFER_IMUL)) {
emit_i8(as, sizeof(Node));
emit_rr(as, XO_IMULi8, dest, dest);
} else {
emit_shifti(as, XOg_SHL, dest, 3);
emit_rmrxo(as, XO_LEA, dest, dest, dest, XM_SCALE2, 0);
}
emit_shifti(as, XOg_SHL, dest, 3);
emit_rmrxo(as, XO_LEA, dest, dest, dest, XM_SCALE2, 0);
if (isk) {
emit_gri(as, XG_ARITHi(XOg_AND), dest, (int32_t)khash);
emit_rmro(as, XO_MOV, dest, tab, offsetof(GCtab, hmask));
Expand Down Expand Up @@ -1289,7 +1284,7 @@ static void asm_hrefk(ASMState *as, IRIns *ir)
lj_assertA(ofs % sizeof(Node) == 0, "unaligned HREFK slot");
if (ra_hasreg(dest)) {
if (ofs != 0) {
if (dest == node && !(as->flags & JIT_F_LEA_AGU))
if (dest == node)
emit_gri(as, XG_ARITHi(XOg_ADD), dest|REX_GC64, ofs);
else
emit_rmro(as, XO_LEA, dest|REX_GC64, node, ofs);
Expand Down Expand Up @@ -2183,8 +2178,7 @@ static void asm_add(ASMState *as, IRIns *ir)
{
if (irt_isnum(ir->t))
asm_fparith(as, ir, XO_ADDSD);
else if ((as->flags & JIT_F_LEA_AGU) || as->flagmcp == as->mcp ||
irt_is64(ir->t) || !asm_lea(as, ir))
else if (as->flagmcp == as->mcp || irt_is64(ir->t) || !asm_lea(as, ir))
asm_intarith(as, ir, XOg_ADD);
}

Expand Down Expand Up @@ -2889,7 +2883,7 @@ static void asm_tail_fixup(ASMState *as, TraceNo lnk)
MCode *target, *q;
int32_t spadj = as->T->spadjust;
if (spadj == 0) {
p -= ((as->flags & JIT_F_LEA_AGU) ? 7 : 6) + (LJ_64 ? 1 : 0);
p -= LJ_64 ? 7 : 6;
} else {
MCode *p1;
/* Patch stack adjustment. */
Expand All @@ -2901,20 +2895,11 @@ static void asm_tail_fixup(ASMState *as, TraceNo lnk)
p1 = p-9;
*(int32_t *)p1 = spadj;
}
if ((as->flags & JIT_F_LEA_AGU)) {
#if LJ_64
p1[-4] = 0x48;
#endif
p1[-3] = (MCode)XI_LEA;
p1[-2] = MODRM(checki8(spadj) ? XM_OFS8 : XM_OFS32, RID_ESP, RID_ESP);
p1[-1] = MODRM(XM_SCALE1, RID_ESP, RID_ESP);
} else {
#if LJ_64
p1[-3] = 0x48;
p1[-3] = 0x48;
#endif
p1[-2] = (MCode)(checki8(spadj) ? XI_ARITHi8 : XI_ARITHi);
p1[-1] = MODRM(XM_REG, XOg_ADD, RID_ESP);
}
p1[-2] = (MCode)(checki8(spadj) ? XI_ARITHi8 : XI_ARITHi);
p1[-1] = MODRM(XM_REG, XOg_ADD, RID_ESP);
}
/* Patch exit branch. */
target = lnk ? traceref(as->J, lnk)->mcode : (MCode *)lj_vm_exit_interp;
Expand Down Expand Up @@ -2945,7 +2930,7 @@ static void asm_tail_prep(ASMState *as)
as->invmcp = as->mcp = p;
} else {
/* Leave room for ESP adjustment: add esp, imm or lea esp, [esp+imm] */
as->mcp = p - (((as->flags & JIT_F_LEA_AGU) ? 7 : 6) + (LJ_64 ? 1 : 0));
as->mcp = p - (LJ_64 ? 7 : 6);
as->invmcp = NULL;
}
}
Expand Down
7 changes: 0 additions & 7 deletions src/lj_dispatch.c
Original file line number Diff line number Diff line change
Expand Up @@ -258,15 +258,8 @@ int luaJIT_setmode(lua_State *L, int idx, int mode)
} else {
if (!(mode & LUAJIT_MODE_ON))
G2J(g)->flags &= ~(uint32_t)JIT_F_ON;
#if LJ_TARGET_X86ORX64
else if ((G2J(g)->flags & JIT_F_SSE2))
G2J(g)->flags |= (uint32_t)JIT_F_ON;
else
return 0; /* Don't turn on JIT compiler without SSE2 support. */
#else
else
G2J(g)->flags |= (uint32_t)JIT_F_ON;
#endif
lj_dispatch_update(g);
}
break;
Expand Down
5 changes: 1 addition & 4 deletions src/lj_emit_x86.h
Original file line number Diff line number Diff line change
Expand Up @@ -561,10 +561,7 @@ static void emit_storeofs(ASMState *as, IRIns *ir, Reg r, Reg base, int32_t ofs)
static void emit_addptr(ASMState *as, Reg r, int32_t ofs)
{
if (ofs) {
if ((as->flags & JIT_F_LEA_AGU))
emit_rmro(as, XO_LEA, r|REX_GC64, r, ofs);
else
emit_gri(as, XG_ARITHi(XOg_ADD), r|REX_GC64, ofs);
emit_gri(as, XG_ARITHi(XOg_ADD), r|REX_GC64, ofs);
}
}

Expand Down
4 changes: 0 additions & 4 deletions src/lj_errmsg.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,11 +101,7 @@ ERRDEF(STRGSRV, "invalid replacement value (a %s)")
ERRDEF(BADMODN, "name conflict for module " LUA_QS)
#if LJ_HASJIT
ERRDEF(JITPROT, "runtime code generation failed, restricted kernel?")
#if LJ_TARGET_X86ORX64
ERRDEF(NOJIT, "JIT compiler disabled, CPU does not support SSE2")
#else
ERRDEF(NOJIT, "JIT compiler disabled")
#endif
#elif defined(LJ_ARCH_NOJIT)
ERRDEF(NOJIT, "no JIT compiler for this architecture (yet)")
#else
Expand Down
Loading

0 comments on commit 1d988a8

Please sign in to comment.