From 58bcf691d677e93de201a9d1c4b5a97712e8bee0 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Sun, 22 Sep 2024 10:45:21 -0700 Subject: [PATCH 1/2] Linux: Optimize CreateNewThread stack usage We were creating a copy of the FEXCore::Core::CPUState object when we didn't need to. We can pass the host thread's CPUState frame through to the creation handlers since it's read-only (so modify it to be const). We then just move the RAX and RSP setting to /after/ the CreateThread handling instead of before. This reduces stack usage from ~1392 bytes to ~80 bytes. --- FEXCore/Source/Interface/Context/Context.h | 2 +- FEXCore/Source/Interface/Core/Core.cpp | 2 +- FEXCore/include/FEXCore/Core/Context.h | 4 ++-- .../LinuxSyscalls/Syscalls/Thread.cpp | 14 +++++--------- .../LinuxEmulation/LinuxSyscalls/ThreadManager.cpp | 2 +- .../LinuxEmulation/LinuxSyscalls/ThreadManager.h | 2 +- 6 files changed, 11 insertions(+), 15 deletions(-) diff --git a/FEXCore/Source/Interface/Context/Context.h b/FEXCore/Source/Interface/Context/Context.h index 7863ed6d4f..009b02c840 100644 --- a/FEXCore/Source/Interface/Context/Context.h +++ b/FEXCore/Source/Interface/Context/Context.h @@ -137,7 +137,7 @@ class ContextImpl final : public FEXCore::Context::Context { */ FEXCore::Core::InternalThreadState* - CreateThread(uint64_t InitialRIP, uint64_t StackPointer, FEXCore::Core::CPUState* NewThreadState, uint64_t ParentTID) override; + CreateThread(uint64_t InitialRIP, uint64_t StackPointer, const FEXCore::Core::CPUState* NewThreadState, uint64_t ParentTID) override; // Public for threading void ExecutionThread(FEXCore::Core::InternalThreadState* Thread) override; diff --git a/FEXCore/Source/Interface/Core/Core.cpp b/FEXCore/Source/Interface/Core/Core.cpp index 46cedca69b..611497b1f8 100644 --- a/FEXCore/Source/Interface/Core/Core.cpp +++ b/FEXCore/Source/Interface/Core/Core.cpp @@ -426,7 +426,7 @@ void ContextImpl::InitializeCompiler(FEXCore::Core::InternalThreadState* Thread) } FEXCore::Core::InternalThreadState* -ContextImpl::CreateThread(uint64_t InitialRIP, uint64_t StackPointer, FEXCore::Core::CPUState* NewThreadState, uint64_t ParentTID) { +ContextImpl::CreateThread(uint64_t InitialRIP, uint64_t StackPointer, const FEXCore::Core::CPUState* NewThreadState, uint64_t ParentTID) { FEXCore::Core::InternalThreadState* Thread = new FEXCore::Core::InternalThreadState {}; Thread->CurrentFrame->State.gregs[X86State::REG_RSP] = StackPointer; diff --git a/FEXCore/include/FEXCore/Core/Context.h b/FEXCore/include/FEXCore/Core/Context.h index 8921955ce8..387aada428 100644 --- a/FEXCore/include/FEXCore/Core/Context.h +++ b/FEXCore/include/FEXCore/Core/Context.h @@ -164,8 +164,8 @@ class Context { * @return A new InternalThreadState object for using with a new guest thread. */ - FEX_DEFAULT_VISIBILITY virtual FEXCore::Core::InternalThreadState* - CreateThread(uint64_t InitialRIP, uint64_t StackPointer, FEXCore::Core::CPUState* NewThreadState = nullptr, uint64_t ParentTID = 0) = 0; + FEX_DEFAULT_VISIBILITY virtual FEXCore::Core::InternalThreadState* CreateThread( + uint64_t InitialRIP, uint64_t StackPointer, const FEXCore::Core::CPUState* NewThreadState = nullptr, uint64_t ParentTID = 0) = 0; FEX_DEFAULT_VISIBILITY virtual void ExecutionThread(FEXCore::Core::InternalThreadState* Thread) = 0; FEX_DEFAULT_VISIBILITY virtual void DestroyThread(FEXCore::Core::InternalThreadState* Thread, bool NeedsTLSUninstall = false) = 0; diff --git a/Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls/Thread.cpp b/Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls/Thread.cpp index 59b40dd568..ffae7b1a83 100644 --- a/Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls/Thread.cpp +++ b/Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls/Thread.cpp @@ -66,22 +66,18 @@ static void* ThreadHandler(void* Data) { FEX::HLE::ThreadStateObject* CreateNewThread(FEXCore::Context::Context* CTX, FEXCore::Core::CpuStateFrame* Frame, FEX::HLE::clone3_args* args) { uint64_t flags = args->args.flags; - FEXCore::Core::CPUState NewThreadState {}; - // Clone copies the parent thread's state - memcpy(&NewThreadState, Frame, sizeof(FEXCore::Core::CPUState)); + auto NewThread = FEX::HLE::_SyscallHandler->TM.CreateThread(0, 0, &Frame->State, args->args.parent_tid, + FEX::HLE::ThreadManager::GetStateObjectFromCPUState(Frame)); - NewThreadState.gregs[FEXCore::X86State::REG_RAX] = 0; + NewThread->Thread->CurrentFrame->State.gregs[FEXCore::X86State::REG_RAX] = 0; if (args->Type == TYPE_CLONE3) { // stack pointer points to the lowest address to the stack // set RSP to stack + size - NewThreadState.gregs[FEXCore::X86State::REG_RSP] = args->args.stack + args->args.stack_size; + NewThread->Thread->CurrentFrame->State.gregs[FEXCore::X86State::REG_RSP] = args->args.stack + args->args.stack_size; } else { - NewThreadState.gregs[FEXCore::X86State::REG_RSP] = args->args.stack; + NewThread->Thread->CurrentFrame->State.gregs[FEXCore::X86State::REG_RSP] = args->args.stack; } - auto NewThread = FEX::HLE::_SyscallHandler->TM.CreateThread(0, 0, &NewThreadState, args->args.parent_tid, - FEX::HLE::ThreadManager::GetStateObjectFromCPUState(Frame)); - if (FEX::HLE::_SyscallHandler->Is64BitMode()) { if (flags & CLONE_SETTLS) { x64::SetThreadArea(NewThread->Thread->CurrentFrame, reinterpret_cast(args->args.tls)); diff --git a/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.cpp b/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.cpp index 8c905ed0e3..68a050ecb5 100644 --- a/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.cpp +++ b/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.cpp @@ -6,7 +6,7 @@ #include namespace FEX::HLE { -FEX::HLE::ThreadStateObject* ThreadManager::CreateThread(uint64_t InitialRIP, uint64_t StackPointer, FEXCore::Core::CPUState* NewThreadState, +FEX::HLE::ThreadStateObject* ThreadManager::CreateThread(uint64_t InitialRIP, uint64_t StackPointer, const FEXCore::Core::CPUState* NewThreadState, uint64_t ParentTID, FEX::HLE::ThreadStateObject* InheritThread) { auto ThreadStateObject = new FEX::HLE::ThreadStateObject; diff --git a/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.h b/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.h index ebc02c80f2..a52c5b6625 100644 --- a/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.h +++ b/Source/Tools/LinuxEmulation/LinuxSyscalls/ThreadManager.h @@ -75,7 +75,7 @@ class ThreadManager final { return static_cast(Thread->FrontendPtr); } - FEX::HLE::ThreadStateObject* CreateThread(uint64_t InitialRIP, uint64_t StackPointer, FEXCore::Core::CPUState* NewThreadState = nullptr, + FEX::HLE::ThreadStateObject* CreateThread(uint64_t InitialRIP, uint64_t StackPointer, const FEXCore::Core::CPUState* NewThreadState = nullptr, uint64_t ParentTID = 0, FEX::HLE::ThreadStateObject* InheritThread = nullptr); void TrackThread(FEX::HLE::ThreadStateObject* Thread) { std::lock_guard lk(ThreadCreationMutex); From f5def7ae1c0238e10759a07d3af396a39f57ca20 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Sun, 22 Sep 2024 10:51:25 -0700 Subject: [PATCH 2/2] Linux: Also optimize HandleNewClone stack usage Drops from ~1392 bytes of stack usage to ~80 bytes --- .../LinuxSyscalls/Syscalls/Thread.cpp | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls/Thread.cpp b/Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls/Thread.cpp index ffae7b1a83..0b8fe142b5 100644 --- a/Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls/Thread.cpp +++ b/Source/Tools/LinuxEmulation/LinuxSyscalls/Syscalls/Thread.cpp @@ -157,21 +157,17 @@ uint64_t HandleNewClone(FEX::HLE::ThreadStateObject* Thread, FEXCore::Context::C bool CreatedNewThreadObject {}; if (flags & CLONE_THREAD) { - FEXCore::Core::CPUState NewThreadState {}; - // Clone copies the parent thread's state - memcpy(&NewThreadState, Frame, sizeof(FEXCore::Core::CPUState)); + // Overwrite thread + NewThread = FEX::HLE::_SyscallHandler->TM.CreateThread(0, 0, &Frame->State, GuestArgs->parent_tid, + FEX::HLE::ThreadManager::GetStateObjectFromCPUState(Frame)); - NewThreadState.gregs[FEXCore::X86State::REG_RAX] = 0; + NewThread->Thread->CurrentFrame->State.gregs[FEXCore::X86State::REG_RAX] = 0; if (GuestArgs->stack == 0) { // Copies in the original thread's stack } else { - NewThreadState.gregs[FEXCore::X86State::REG_RSP] = GuestArgs->stack; + NewThread->Thread->CurrentFrame->State.gregs[FEXCore::X86State::REG_RSP] = GuestArgs->stack; } - // Overwrite thread - NewThread = FEX::HLE::_SyscallHandler->TM.CreateThread(0, 0, &NewThreadState, GuestArgs->parent_tid, - FEX::HLE::ThreadManager::GetStateObjectFromCPUState(Frame)); - // CLONE_PARENT_SETTID, CLONE_CHILD_SETTID, CLONE_CHILD_CLEARTID, CLONE_PIDFD will be handled by kernel // Call execution thread directly since we already are on the new thread NewThread->Thread->StartRunning.NotifyAll(); // Clear the start running flag