From dcbf7b6cd3aa45f06d64ceccea70d2be8383ddbe Mon Sep 17 00:00:00 2001 From: Julian Oes Date: Mon, 18 Apr 2022 08:22:48 +1200 Subject: [PATCH] core: fix deadlock on discovery This fixes a deadlock that can happen rarely when trying to connect to a system. It's a typical deadlock where two different mutexes are locked in opposite order, so what happens is that: - the first thread locks mutex A and then tries to lock mutex B, and - the second thread locks mutex B and then tries to lock mutex A. At this point both threads are waiting on the other thread and neither can continue. In this case the two relevant locks are: - _new_system_callback_mutex, and - _systems_mutex 1. The user calls subscribe_on_new_system() which locks _new_system_callback_mutex, and then checks whether there is already a system that they should be notified about. In is_any_system_connected(), the systems() are accessed requiring the _systems_mutex. 2. At the same time, a message from a system arrives which grabs the _systems_mutex and creates the System. It then forwards the first heartbeat to the new system which in turn calls notify_on_discover() which, of course, requires the _new_system_callback_mutex, so the lock that the first thread has already taken. We deadlock. The possible fixes I thought of were: 1. Always lock these locks in the same order. This can make sense for strictly hierarchical data but doesn't really seem right here. 2. Make scope of where locks are used smaller, potentially preventing that multiple locks need to be locked at once. I didn't find a way to do that in this case. For both cases the scope seemed correct, and making it smaller would introduce new problems (invalidating iterator or calling the callback twice by mistake). 2. Collapse/merge the two conflicting locks. This had the drawback that I had to switch to the recursive_mutex to prevent any stalls when we're trying to lock the same mutex twice. We could have added methods with and without lock and then call the correct one, however, when calling the methods out of SystemImpl the context is not so obvious and it's not clear which one we should call. Basically, it makes it rather convoluted so recursive_mutex seems slightly cleaner. --- src/mavsdk/core/mavsdk_impl.cpp | 14 +++++++------- src/mavsdk/core/mavsdk_impl.h | 5 +---- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/src/mavsdk/core/mavsdk_impl.cpp b/src/mavsdk/core/mavsdk_impl.cpp index df78420c26..6543520cd2 100644 --- a/src/mavsdk/core/mavsdk_impl.cpp +++ b/src/mavsdk/core/mavsdk_impl.cpp @@ -67,7 +67,7 @@ MavsdkImpl::~MavsdkImpl() } { - std::lock_guard lock(_systems_mutex); + std::lock_guard lock(_systems_mutex); _systems.clear(); } @@ -109,7 +109,7 @@ std::vector> MavsdkImpl::systems() const { std::vector> systems_result{}; - std::lock_guard lock(_systems_mutex); + std::lock_guard lock(_systems_mutex); for (auto& system : _systems) { // We ignore the 0 entry because it's just a null system. // It's only created because the older, deprecated API needs a @@ -212,7 +212,7 @@ void MavsdkImpl::receive_message(mavlink_message_t& message, Connection* connect return; } - std::lock_guard lock(_systems_mutex); + std::lock_guard lock(_systems_mutex); // The only situation where we create a system with sysid 0 is when we initialize the connection // to the remote. @@ -371,7 +371,7 @@ ConnectionResult MavsdkImpl::setup_udp_remote( if (ret == ConnectionResult::Success) { new_conn->add_remote(remote_ip, remote_port); add_connection(new_conn); - std::lock_guard lock(_systems_mutex); + std::lock_guard lock(_systems_mutex); make_system_with_component(0, 0, true); } return ret; @@ -500,7 +500,7 @@ void MavsdkImpl::make_system_with_component( void MavsdkImpl::notify_on_discover() { - std::lock_guard lock(_new_system_callback_mutex); + std::lock_guard lock(_systems_mutex); if (_new_system_callback) { auto temp_callback = _new_system_callback; call_user_callback([temp_callback]() { temp_callback(); }); @@ -509,7 +509,7 @@ void MavsdkImpl::notify_on_discover() void MavsdkImpl::notify_on_timeout() { - std::lock_guard lock(_new_system_callback_mutex); + std::lock_guard lock(_systems_mutex); if (_new_system_callback) { auto temp_callback = _new_system_callback; call_user_callback([temp_callback]() { temp_callback(); }); @@ -518,7 +518,7 @@ void MavsdkImpl::notify_on_timeout() void MavsdkImpl::subscribe_on_new_system(const Mavsdk::NewSystemCallback& callback) { - std::lock_guard lock(_new_system_callback_mutex); + std::lock_guard lock(_systems_mutex); _new_system_callback = callback; if (_new_system_callback != nullptr && is_any_system_connected()) { diff --git a/src/mavsdk/core/mavsdk_impl.h b/src/mavsdk/core/mavsdk_impl.h index 8592c61cd2..0ef8708507 100644 --- a/src/mavsdk/core/mavsdk_impl.h +++ b/src/mavsdk/core/mavsdk_impl.h @@ -111,11 +111,8 @@ class MavsdkImpl { std::mutex _connections_mutex{}; std::vector> _connections{}; - mutable std::mutex _systems_mutex{}; - + mutable std::recursive_mutex _systems_mutex{}; std::vector>> _systems{}; - - std::mutex _new_system_callback_mutex{}; Mavsdk::NewSystemCallback _new_system_callback{nullptr}; Time _time{};