From a90cfd27f939a8012885ff451bb6ee9726a15ea0 Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Wed, 19 Feb 2020 13:20:48 -0500 Subject: [PATCH] partr: Fix deadlock (#34807) When there is no work to do, the first thread to be idle will attempt to run the event loop once, waiting for any notifications (which will usually create new work). However, there is an interesting corner case where a notification arrives, but no work was scheduled. That doesn't usually happen, but there are a few situations where it does: 1) Somebody used a libuv primitive outside of julia, so the callback doesn't schedule any julia work. 2) Another thread forbily interrupted the waiting thread because it wants to take over the event loop for various reasons 3) On Windows, we occasionally get spurious wake ups of the event loop. The existing code in partr assumed that we were in situation 2, i.e. that there was another thread waiting to take over the event loop, so it released the event loop and simply put the current thread to sleep in the expectation that another thread will pick it up. However, if we instead are in one of the other two conditions, there may not be another thread there to pick up the event loop. Thus, with no thread owning the event loop, julia will stop responding to events and effectively deadlock. Since both 1 and 3 are rare, and we don't actually enter the event loop until there was no work for 4 milliseconds (which is fairly rare), this condition rarely happens, but is occasionally observable on Windows, where it caused #34769. To test that this fix works, we manually create situation 1 in the test by creating an idle callback, which will prevent the event loop from blocking, but only schedules julia work after it's been called 100 times. This reproduces the observed failure from the issue and is fixed by this PR. Fixes #34769 Co-authored-by: Jeff Bezanson Co-authored-by: Jameson Nash (cherry picked from commit f36edc2c2429140663baeedddd4cf11712c4469d) --- src/partr.c | 3 ++ test/threads.jl | 117 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+) diff --git a/src/partr.c b/src/partr.c index c457643797446..f6cac067f9222 100644 --- a/src/partr.c +++ b/src/partr.c @@ -461,6 +461,9 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q) // TODO: this relinquishes responsibility for all event // to the last thread to do an explicit operation, // which may starve other threads of critical work + if (jl_atomic_load(&jl_uv_n_waiters) == 0) { + continue; + } } if (!_threadedregion && active && ptls->tid == 0) { // thread 0 is the only thread permitted to run the event loop diff --git a/test/threads.jl b/test/threads.jl index 54e11c8df3f19..c555363cf9d62 100644 --- a/test/threads.jl +++ b/test/threads.jl @@ -20,3 +20,120 @@ if Sys.islinux() && Sys.CPU_THREADS > 1 && Sys.which("taskset") !== nothing @test endswith(run_with_affinity("1"), "2") @test endswith(run_with_affinity("0,1"), "3") end + +# issue #34769 +function idle_callback(handle) + idle = @Base.handle_as handle UvTestIdle + if idle.active + idle.count += 1 + if idle.count == 1 + # We want to hit the case where we're allowing + # the thread to go to sleep, which only happens + # after some default amount of time (DEFAULT_THREAD_SLEEP_THRESHOLD) + # so spend that amount of time here. + Libc.systemsleep(0.004) + elseif idle.count >= 10 + lock(idle.cond) + try + notify(idle.cond, true) + finally + unlock(idle.cond) + end + idle.active = false + end + end + nothing +end + +mutable struct UvTestIdle + handle::Ptr{Cvoid} + cond::Base.ThreadSynchronizer + isopen::Bool + active::Bool + count::Int + + function UvTestIdle() + this = new(Libc.malloc(Base._sizeof_uv_idle), Base.ThreadSynchronizer(), true, false, 0) + Base.iolock_begin() + Base.associate_julia_struct(this.handle, this) + err = ccall(:uv_idle_init, Cint, (Ptr{Cvoid}, Ptr{Cvoid}), + Base.eventloop(), this.handle) + if err != 0 + Libc.free(this.handle) + this.handle = C_NULL + throw(_UVError("uv_idle_init", err)) + end + err = ccall(:uv_idle_start, Cint, (Ptr{Cvoid}, Ptr{Cvoid}), + this.handle, @cfunction(idle_callback, Cvoid, (Ptr{Cvoid},))) + if err != 0 + Libc.free(this.handle) + this.handle = C_NULL + throw(_UVError("uv_idle_start", err)) + end + finalizer(Base.uvfinalize, this) + Base.iolock_end() + return this + end +end +Base.unsafe_convert(::Type{Ptr{Cvoid}}, idle::UvTestIdle) = idle.handle + +function Base.uvfinalize(t::UvTestIdle) + Base.iolock_begin() + Base.lock(t.cond) + try + if t.handle != C_NULL + Base.disassociate_julia_struct(t.handle) # not going to call the usual close hooks + if t.isopen + t.isopen = false + ccall(:jl_close_uv, Cvoid, (Ptr{Cvoid},), t) + end + t.handle = C_NULL + notify(t.cond, false) + end + finally + unlock(t.cond) + end + Base.iolock_end() + nothing +end + +function Base.wait(idle::UvTestIdle) + Base.iolock_begin() + Base.preserve_handle(idle) + Base.lock(idle.cond) + try + idle.active = true + wait(idle.cond) + finally + Base.unlock(idle.cond) + Base.unpreserve_handle(idle) + Base.iolock_end() + end +end + +# Spawn another process as a watchdog. If this test fails, it'll unrecoverably +# hang in the event loop. Another process needs to kill it +cmd = """ + @async (Base.wait_readnb(stdin, 1); exit()) + sleep(100) + isopen(stdin) || exit() + println(stderr, "ERROR: Killing threads test due to watchdog expiry") + ccall(:uv_kill, Cint, (Cint, Cint), $(getpid()), Base.SIGTERM) +""" +proc = open(pipeline(`$(Base.julia_cmd()) -e $cmd`; stderr=stderr); write=true) + +let idle=UvTestIdle() + wait(idle) +end + +using Base.Threads +@threads for i = 1:1 + let idle=UvTestIdle() + wait(idle) + end +end + +@test process_running(proc) + +# We don't need the watchdog anymore +close(proc.in)