From 7494620e4959232615261fc0b937d815b08dd717 Mon Sep 17 00:00:00 2001 From: Akira Saitoh Date: Thu, 28 Sep 2023 11:23:51 +0900 Subject: [PATCH] (0.41) AArch64: Add space for outgoing JNI argument to J9CInterpreterStackFrame This commit adds outgoingArguments array to J9CInterpreterStackFrame for AArch64 VM to avoid overwriting preservedGPRs and preservedFPRs when calling out to jitReleaseVMAccess helper from JNI invocation sequence. This commit also updates ARM64JNILinkage to store JNI method arguments that are passed by stack to the area where the outgoingArguments array resides. Adding outgoingArguments makes the size of J9CInterpreterStackFrame large and some slots are not accessible with ldp/stp instructions using sp as a base register. Thus, this commit also updates m4 macros in arm64helpers.m4. They use the start of slots for saving JIT GPRs as the base address and x16 as the base register. Issue: https://github.com/eclipse-openj9/openj9/issues/18149 Signed-off-by: Akira Saitoh --- runtime/codert_vm/arm64nathelp.m4 | 6 +- .../aarch64/codegen/ARM64JNILinkage.cpp | 48 ++------ .../aarch64/codegen/J9CodeGenerator.cpp | 3 + runtime/compiler/env/VMJ9.cpp | 2 +- runtime/oti/arm64helpers.m4 | 114 ++++++++++-------- runtime/oti/j9nonbuilder.h | 1 + 6 files changed, 85 insertions(+), 89 deletions(-) diff --git a/runtime/codert_vm/arm64nathelp.m4 b/runtime/codert_vm/arm64nathelp.m4 index 7958aacdee1..f8693d01d0e 100644 --- a/runtime/codert_vm/arm64nathelp.m4 +++ b/runtime/codert_vm/arm64nathelp.m4 @@ -165,8 +165,7 @@ define({OLD_DUAL_MODE_HELPER_NO_RETURN_VALUE},{ define({NEW_DUAL_MODE_HELPER},{ DECLARE_EXTERN(fast_$1) - START_PROC($1) - SAVE_FPLR + BEGIN_HELPER($1) CALL_DIRECT(fast_$1) cbz x0,.L_done_$1 ldr x30,JIT_GPR_SAVE_SLOT(30) @@ -190,8 +189,7 @@ define({NEW_DUAL_MODE_HELPER},{ define({NEW_DUAL_MODE_HELPER_NO_RETURN_VALUE},{ DECLARE_EXTERN(fast_$1) - START_PROC($1) - SAVE_FPLR + BEGIN_HELPER($1) CALL_DIRECT(fast_$1) cbz x0,.L_done_$1 ldr x30,JIT_GPR_SAVE_SLOT(30) diff --git a/runtime/compiler/aarch64/codegen/ARM64JNILinkage.cpp b/runtime/compiler/aarch64/codegen/ARM64JNILinkage.cpp index 8ec958015d8..0d9487253bb 100644 --- a/runtime/compiler/aarch64/codegen/ARM64JNILinkage.cpp +++ b/runtime/compiler/aarch64/codegen/ARM64JNILinkage.cpp @@ -395,7 +395,6 @@ size_t J9::ARM64::JNILinkage::buildJNIArgs(TR::Node *callNode, TR::RegisterDepen { const TR::ARM64LinkageProperties &properties = _systemLinkage->getProperties(); TR::ARM64MemoryArgument *pushToMemory = NULL; - TR::Register *argMemReg; TR::Register *tempReg; int32_t argIndex = 0; int32_t numMemArgs = 0; @@ -503,12 +502,18 @@ size_t J9::ARM64::JNILinkage::buildJNIArgs(TR::Node *callNode, TR::RegisterDepen if (numMemArgs > 0) { pushToMemory = new (trStackMemory()) TR::ARM64MemoryArgument[numMemArgs]; - - argMemReg = cg()->allocateRegister(); } // align to 16-byte boundary totalSize = (totalSize + 15) & (~15); + /* + * We have J9_INLINE_JNI_MAX_ARG_COUNT (32) slots in interpreter frame. + * If the size of the native stack used for passing parameters exceeds this size, + * it will overwrite preserved registers. + * The number of parameters is guaranteed to be <= J9_INLINE_JNI_MAX_ARG_COUNT + * when HasFixedFrameC_CallingConvention codegen flag is set. + */ + TR_ASSERT_FATAL(totalSize <= sizeof(UDATA) * J9_INLINE_JNI_MAX_ARG_COUNT, "totalSize must not be larger than J9_INLINE_JNI_MAX_ARG_COUNT slots"); numIntegerArgs = 0; numFloatArgs = 0; @@ -522,6 +527,7 @@ size_t J9::ARM64::JNILinkage::buildJNIArgs(TR::Node *callNode, TR::RegisterDepen #if defined(OSX) sigCursor = (char *)sig; #endif + TR::RealRegister *sp = cg()->machine()->getRealRegister(properties.getStackPointerRegister()); for (i = firstArgumentChild; i < callNode->getNumChildren(); i++) { @@ -611,7 +617,7 @@ size_t J9::ARM64::JNILinkage::buildJNIArgs(TR::Node *callNode, TR::RegisterDepen #else #error Unsupported platform #endif - getOutgoingArgumentMemRef(argMemReg, argOffset, argRegister, op, pushToMemory[argIndex++]); + getOutgoingArgumentMemRef(sp, argOffset, argRegister, op, pushToMemory[argIndex++]); argOffset += offsetInc; } numIntegerArgs++; @@ -672,7 +678,7 @@ size_t J9::ARM64::JNILinkage::buildJNIArgs(TR::Node *callNode, TR::RegisterDepen #error Unsupported platform #endif } - getOutgoingArgumentMemRef(argMemReg, argOffset, argRegister, op, pushToMemory[argIndex++]); + getOutgoingArgumentMemRef(sp, argOffset, argRegister, op, pushToMemory[argIndex++]); argOffset += offsetInc; } numFloatArgs++; @@ -746,17 +752,12 @@ size_t J9::ARM64::JNILinkage::buildJNIArgs(TR::Node *callNode, TR::RegisterDepen if (numMemArgs > 0) { - TR::RealRegister *sp = cg()->machine()->getRealRegister(properties.getStackPointerRegister()); - generateTrg1Src1ImmInstruction(cg(), TR::InstOpCode::subimmx, callNode, argMemReg, sp, totalSize); - for (argIndex = 0; argIndex < numMemArgs; argIndex++) { TR::Register *aReg = pushToMemory[argIndex].argRegister; generateMemSrc1Instruction(cg(), pushToMemory[argIndex].opCode, callNode, pushToMemory[argIndex].argMemory, aReg); cg()->stopUsingRegister(aReg); } - - cg()->stopUsingRegister(argMemReg); } return totalSize; @@ -1058,20 +1059,7 @@ TR::Register *J9::ARM64::JNILinkage::buildDirectDispatch(TR::Node *callNode) #endif TR::RegisterDependencyConditions *deps = new (trHeapMemory()) TR::RegisterDependencyConditions(maxRegisters, maxPostRegisters, trMemory()); - size_t spSize = buildJNIArgs(callNode, deps, passThread, passReceiver, killNonVolatileGPRs); - TR::RealRegister *sp = machine()->getRealRegister(_systemLinkage->getProperties().getStackPointerRegister()); - - if (spSize > 0) - { - if (constantIsUnsignedImm12(spSize)) - { - generateTrg1Src1ImmInstruction(cg(), TR::InstOpCode::subimmx, callNode, sp, sp, spSize); - } - else - { - TR_ASSERT_FATAL(false, "Too many arguments."); - } - } + buildJNIArgs(callNode, deps, passThread, passReceiver, killNonVolatileGPRs); TR::Register *returnRegister = getReturnRegisterFromDeps(callNode, deps); @@ -1111,18 +1099,6 @@ TR::Register *J9::ARM64::JNILinkage::buildDirectDispatch(TR::Node *callNode) TR::Instruction *callInstr = generateMethodDispatch(callNode, isJNIGCPoint, deps, targetAddress, x9Reg); generateLabelInstruction(cg(), TR::InstOpCode::label, callNode, returnLabel, callInstr); - if (spSize > 0) - { - if (constantIsUnsignedImm12(spSize)) - { - generateTrg1Src1ImmInstruction(cg(), TR::InstOpCode::addimmx, callNode, sp, sp, spSize); - } - else - { - TR_ASSERT_FATAL(false, "Too many arguments."); - } - } - if (dropVMAccess) { #ifdef J9VM_INTERP_ATOMIC_FREE_JNI diff --git a/runtime/compiler/aarch64/codegen/J9CodeGenerator.cpp b/runtime/compiler/aarch64/codegen/J9CodeGenerator.cpp index 5840292427b..14a39b11fbe 100644 --- a/runtime/compiler/aarch64/codegen/J9CodeGenerator.cpp +++ b/runtime/compiler/aarch64/codegen/J9CodeGenerator.cpp @@ -76,6 +76,9 @@ J9::ARM64::CodeGenerator::initialize() { comp->setOption(TR_EnableMonitorCacheLookup); } + + if (comp->fej9()->hasFixedFrameC_CallingConvention()) + cg->setHasFixedFrameC_CallingConvention(); } TR::Linkage * diff --git a/runtime/compiler/env/VMJ9.cpp b/runtime/compiler/env/VMJ9.cpp index 357f9dfd280..64dbf72f90f 100644 --- a/runtime/compiler/env/VMJ9.cpp +++ b/runtime/compiler/env/VMJ9.cpp @@ -9662,7 +9662,7 @@ portLibCall_sysinfo_has_fixed_frame_C_calling_convention() /* AArch64 */ #if defined(TR_HOST_ARM64) && defined(TR_TARGET_ARM64) - return false; + return true; #endif /* X86 */ diff --git a/runtime/oti/arm64helpers.m4 b/runtime/oti/arm64helpers.m4 index 4ecf75cc3e4..1a12e932b98 100644 --- a/runtime/oti/arm64helpers.m4 +++ b/runtime/oti/arm64helpers.m4 @@ -75,10 +75,23 @@ define({JIT_GPR_SAVE_OFFSET},{eval(J9TR_cframe_jitGPRs+(($1)*ALen))}) define({JIT_GPR_SAVE_SLOT},{[sp,{#}JIT_GPR_SAVE_OFFSET($1)]}) define({JIT_FPR_SAVE_OFFSET},{eval(J9TR_cframe_jitFPRs+(($1)*8))}) define({JIT_FPR_SAVE_SLOT},{[sp,{#}JIT_FPR_SAVE_OFFSET($1)]}) +dnl The size of J9CInterpreterStackFrame is now larger than 504 bytes, so we cannot simply use sp as the base register for ldp/stp instructions +dnl because the offset would become too large and invalid for ldp/stp. +dnl Thus, we use sp + JIT_GPR_SAVE_OFFSET(0) as the base address for ldp/stp. We chose x16 as the base register. +dnl It is intra procedure call register and JIT does not use it, so it is safe to clobber x16. +define({JIT_GPR_SAVE_SLOT_X16},{[x16,{#}eval(($1)*ALen)]}) +define({JIT_FPR_SAVE_SLOT_X16},{[x16,{#}eval(JIT_FPR_SAVE_OFFSET($1)-JIT_GPR_SAVE_OFFSET(0))]}) -define({SAVE_FPLR},{stp x29,x30,JIT_GPR_SAVE_SLOT(29)}) +dnl JIT_GPR_SAVE_OFFSET is too large to fit in ldp/stp instructions. +define({SAVE_FPLR},{ + str x29,JIT_GPR_SAVE_SLOT(29) + str x30,JIT_GPR_SAVE_SLOT(30) +}) -define({RESTORE_FPLR},{ldp x29,x30,JIT_GPR_SAVE_SLOT(29)}) +define({RESTORE_FPLR},{ + ldr x29,JIT_GPR_SAVE_SLOT(29) + ldr x30,JIT_GPR_SAVE_SLOT(30) +}) define({BEGIN_HELPER},{ START_PROC($1) @@ -95,22 +108,23 @@ define({CALL_C_WITH_VMTHREAD},{ CALL_DIRECT($1) }) +dnl x16 is inter-procedure-call register define({SAVE_C_VOLATILE_REGS},{ - stp x0,x1,JIT_GPR_SAVE_SLOT(0) - stp x2,x3,JIT_GPR_SAVE_SLOT(2) - stp x4,x5,JIT_GPR_SAVE_SLOT(4) - stp x6,x7,JIT_GPR_SAVE_SLOT(6) - stp x8,x9,JIT_GPR_SAVE_SLOT(8) - stp x10,x11,JIT_GPR_SAVE_SLOT(10) - stp x12,x13,JIT_GPR_SAVE_SLOT(12) - stp x14,x15,JIT_GPR_SAVE_SLOT(14) - stp x16,x17,JIT_GPR_SAVE_SLOT(16) - str x18,JIT_GPR_SAVE_SLOT(18) + add x16, sp, JIT_GPR_SAVE_OFFSET(0) + stp x0,x1,JIT_GPR_SAVE_SLOT_X16(0) + stp x2,x3,JIT_GPR_SAVE_SLOT_X16(2) + stp x4,x5,JIT_GPR_SAVE_SLOT_X16(4) + stp x6,x7,JIT_GPR_SAVE_SLOT_X16(6) + stp x8,x9,JIT_GPR_SAVE_SLOT_X16(8) + stp x10,x11,JIT_GPR_SAVE_SLOT_X16(10) + stp x12,x13,JIT_GPR_SAVE_SLOT_X16(12) + stp x14,x15,JIT_GPR_SAVE_SLOT_X16(14) + stp x17,x18,JIT_GPR_SAVE_SLOT_X16(17) ifdef({METHOD_INVOCATION},{ - stp d0,d1,JIT_FPR_SAVE_SLOT(0) - stp d2,d3,JIT_FPR_SAVE_SLOT(2) - stp d4,d5,JIT_FPR_SAVE_SLOT(4) - stp d6,d7,JIT_FPR_SAVE_SLOT(6) + stp d0,d1,JIT_FPR_SAVE_SLOT_X16(0) + stp d2,d3,JIT_FPR_SAVE_SLOT_X16(2) + stp d4,d5,JIT_FPR_SAVE_SLOT_X16(4) + stp d6,d7,JIT_FPR_SAVE_SLOT_X16(6) },{ dnl METHOD_INVOCATION add x15, sp, JIT_FPR_SAVE_OFFSET(0) st1 {{v0.4s, v1.4s, v2.4s, v3.4s}}, [x15], #64 @@ -125,11 +139,12 @@ ifdef({METHOD_INVOCATION},{ }) define({RESTORE_C_VOLATILE_REGS},{ + add x16, sp, JIT_GPR_SAVE_OFFSET(0) ifdef({METHOD_INVOCATION},{ - ldp d0,d1,JIT_FPR_SAVE_SLOT(0) - ldp d2,d3,JIT_FPR_SAVE_SLOT(2) - ldp d4,d5,JIT_FPR_SAVE_SLOT(4) - ldp d6,d7,JIT_FPR_SAVE_SLOT(6) + ldp d0,d1,JIT_FPR_SAVE_SLOT_X16(0) + ldp d2,d3,JIT_FPR_SAVE_SLOT_X16(2) + ldp d4,d5,JIT_FPR_SAVE_SLOT_X16(4) + ldp d6,d7,JIT_FPR_SAVE_SLOT_X16(6) },{ dnl METHOD_INVOCATION add x15, sp, JIT_FPR_SAVE_OFFSET(0) ld1 {{v0.4s, v1.4s, v2.4s, v3.4s}}, [x15], #64 @@ -141,37 +156,38 @@ ifdef({METHOD_INVOCATION},{ ld1 {{v24.4s, v25.4s, v26.4s, v27.4s}}, [x15], #64 ld1 {{v28.4s, v29.4s, v30.4s, v31.4s}}, [x15] }) dnl METHOD_INVOCATION - ldp x0,x1,JIT_GPR_SAVE_SLOT(0) - ldp x2,x3,JIT_GPR_SAVE_SLOT(2) - ldp x4,x5,JIT_GPR_SAVE_SLOT(4) - ldp x6,x7,JIT_GPR_SAVE_SLOT(6) - ldp x8,x9,JIT_GPR_SAVE_SLOT(8) - ldp x10,x11,JIT_GPR_SAVE_SLOT(10) - ldp x12,x13,JIT_GPR_SAVE_SLOT(12) - ldp x14,x15,JIT_GPR_SAVE_SLOT(14) - ldp x16,x17,JIT_GPR_SAVE_SLOT(16) - ldr x18,JIT_GPR_SAVE_SLOT(18) + ldp x0,x1,JIT_GPR_SAVE_SLOT_X16(0) + ldp x2,x3,JIT_GPR_SAVE_SLOT_X16(2) + ldp x4,x5,JIT_GPR_SAVE_SLOT_X16(4) + ldp x6,x7,JIT_GPR_SAVE_SLOT_X16(6) + ldp x8,x9,JIT_GPR_SAVE_SLOT_X16(8) + ldp x10,x11,JIT_GPR_SAVE_SLOT_X16(10) + ldp x12,x13,JIT_GPR_SAVE_SLOT_X16(12) + ldp x14,x15,JIT_GPR_SAVE_SLOT_X16(14) + ldp x17,x18,JIT_GPR_SAVE_SLOT_X16(17) }) dnl No need to save/restore d8-15 - the stack walker will never need to read dnl or modify them (no preserved FPRs in the JIT private linkage). define({SAVE_C_NONVOLATILE_REGS},{ + add x16, sp, JIT_GPR_SAVE_OFFSET(0) str x19,JIT_GPR_SAVE_SLOT(19) - stp x20,x21,JIT_GPR_SAVE_SLOT(20) - stp x22,x23,JIT_GPR_SAVE_SLOT(22) - stp x24,x25,JIT_GPR_SAVE_SLOT(24) - stp x26,x27,JIT_GPR_SAVE_SLOT(26) - stp x28,x29,JIT_GPR_SAVE_SLOT(28) + stp x20,x21,JIT_GPR_SAVE_SLOT_X16(20) + stp x22,x23,JIT_GPR_SAVE_SLOT_X16(22) + stp x24,x25,JIT_GPR_SAVE_SLOT_X16(24) + stp x26,x27,JIT_GPR_SAVE_SLOT_X16(26) + stp x28,x29,JIT_GPR_SAVE_SLOT_X16(28) }) define({RESTORE_C_NONVOLATILE_REGS},{ + add x16, sp, JIT_GPR_SAVE_OFFSET(0) ldr x19,JIT_GPR_SAVE_SLOT(19) - ldp x20,x21,JIT_GPR_SAVE_SLOT(20) - ldp x22,x23,JIT_GPR_SAVE_SLOT(22) - ldp x24,x25,JIT_GPR_SAVE_SLOT(24) - ldp x26,x27,JIT_GPR_SAVE_SLOT(26) - ldp x28,x29,JIT_GPR_SAVE_SLOT(28) + ldp x20,x21,JIT_GPR_SAVE_SLOT_X16(20) + ldp x22,x23,JIT_GPR_SAVE_SLOT_X16(22) + ldp x24,x25,JIT_GPR_SAVE_SLOT_X16(24) + ldp x26,x27,JIT_GPR_SAVE_SLOT_X16(26) + ldp x28,x29,JIT_GPR_SAVE_SLOT_X16(28) }) define({SAVE_ALL_REGS},{ @@ -185,21 +201,23 @@ define({RESTORE_ALL_REGS},{ }) define({SAVE_PRESERVED_REGS},{ + add x16, sp, JIT_GPR_SAVE_OFFSET(0) str x18,JIT_GPR_SAVE_SLOT(18) str x21,JIT_GPR_SAVE_SLOT(21) - stp x22,x23,JIT_GPR_SAVE_SLOT(22) - stp x24,x25,JIT_GPR_SAVE_SLOT(24) - stp x26,x27,JIT_GPR_SAVE_SLOT(26) - stp x28,x29,JIT_GPR_SAVE_SLOT(28) + stp x22,x23,JIT_GPR_SAVE_SLOT_X16(22) + stp x24,x25,JIT_GPR_SAVE_SLOT_X16(24) + stp x26,x27,JIT_GPR_SAVE_SLOT_X16(26) + stp x28,x29,JIT_GPR_SAVE_SLOT_X16(28) }) define({RESTORE_PRESERVED_REGS},{ + add x16, sp, JIT_GPR_SAVE_OFFSET(0) ldr x18,JIT_GPR_SAVE_SLOT(18) ldr x21,JIT_GPR_SAVE_SLOT(21) - ldp x22,x23,JIT_GPR_SAVE_SLOT(22) - ldp x24,x25,JIT_GPR_SAVE_SLOT(24) - ldp x26,x27,JIT_GPR_SAVE_SLOT(26) - ldp x28,x29,JIT_GPR_SAVE_SLOT(28) + ldp x22,x23,JIT_GPR_SAVE_SLOT_X16(22) + ldp x24,x25,JIT_GPR_SAVE_SLOT_X16(24) + ldp x26,x27,JIT_GPR_SAVE_SLOT_X16(26) + ldp x28,x29,JIT_GPR_SAVE_SLOT_X16(28) }) define({BRANCH_VIA_VMTHREAD},{ diff --git a/runtime/oti/j9nonbuilder.h b/runtime/oti/j9nonbuilder.h index d1b67fc15a7..535ad6f4677 100644 --- a/runtime/oti/j9nonbuilder.h +++ b/runtime/oti/j9nonbuilder.h @@ -6287,6 +6287,7 @@ typedef struct J9CInterpreterStackFrame { U_8 jitFPRs[16 * 8]; /* fpr0-15 */ #endif /* J9VM_ENV_DATA64 */ #elif defined(J9VM_ARCH_AARCH64) /* J9VM_ARCH_ARM */ + UDATA outgoingArguments[J9_INLINE_JNI_MAX_ARG_COUNT]; UDATA preservedGPRs[12]; /* x19-x30 */ U_8 preservedFPRs[8 * 8]; /* v8-15 */ J9JITGPRSpillArea jitGPRs;