From 5213b02541cdcfb1273c29771b1857acd791e0e0 Mon Sep 17 00:00:00 2001 From: William Emfinger Date: Fri, 30 Aug 2024 09:50:13 -0500 Subject: [PATCH] feat(task): Add support for enabling task watchdog on an `espp::Task` (#322) * feat(task): Add support for enabling task watchdog on an espp::Task * fix xplat build --- components/task/example/main/task_example.cpp | 158 ++++++++++++------ components/task/include/task.hpp | 86 ++++++++-- components/task/src/task.cpp | 103 +++++++++++- 3 files changed, 283 insertions(+), 64 deletions(-) diff --git a/components/task/example/main/task_example.cpp b/components/task/example/main/task_example.cpp index 159d9a82d..6203fdfc5 100644 --- a/components/task/example/main/task_example.cpp +++ b/components/task/example/main/task_example.cpp @@ -1,6 +1,7 @@ #include #include +#include "logger.hpp" #include "run_on_core.hpp" #include "task.hpp" @@ -33,6 +34,7 @@ using namespace std::chrono_literals; * */ extern "C" void app_main(void) { + espp::Logger logger({.tag = "TaskExample", .level = espp::Logger::Verbosity::DEBUG}); /** * Set up some variables we'll re-use to control and measure our tests. */ @@ -47,8 +49,9 @@ extern "C" void app_main(void) { */ auto test_start = std::chrono::high_resolution_clock::now(); { - fmt::print("Spawning 1 task for {} seconds!\n", num_seconds_to_run); + logger.info("Basic task example: spawning 1 task for {} seconds!", num_seconds_to_run); //! [Task example] + espp::Task::configure_task_watchdog(1000ms); auto task_fn = [](std::mutex &m, std::condition_variable &cv) { static size_t task_iterations{0}; fmt::print("Task: #iterations = {}\n", task_iterations); @@ -65,15 +68,69 @@ extern "C" void app_main(void) { auto task = espp::Task( {.name = "Task 1", .callback = task_fn, .log_level = espp::Logger::Verbosity::DEBUG}); task.start(); + task.start_watchdog(); // start the watchdog timer for this task std::this_thread::sleep_for(num_seconds_to_run * 1s); + task.stop_watchdog(); // stop the watchdog timer for this task // show explicitly stopping the task (though the destructor called at the // end of this scope would do it for us) task.stop(); + std::error_code ec; + std::string watchdog_info = espp::Task::get_watchdog_info(ec); + if (ec) { + fmt::print("Error getting watchdog info: {}\n", ec.message()); + } else if (!watchdog_info.empty()) { + fmt::print("Watchdog info: {}\n", watchdog_info); + } else { + fmt::print("No watchdog info available\n"); + } //! [Task example] } auto test_end = std::chrono::high_resolution_clock::now(); auto test_duration = std::chrono::duration(test_end - test_start).count(); - fmt::print("Test ran for {:.03f} seconds\n", test_duration); + logger.debug("Test ran for {:.03f} seconds", test_duration); + + /** + * Show a simple task triggering a wathdog timeout (but not panicing), and + * then printing out the watchdog info. + */ + test_start = std::chrono::high_resolution_clock::now(); + { + logger.info("Task watchdog example:"); + //! [task watchdog example] + static constexpr bool panic_on_watchdog_timeout = false; + espp::Task::configure_task_watchdog(300ms, panic_on_watchdog_timeout); + auto task_fn = [](std::mutex &m, std::condition_variable &cv) { + static size_t task_iterations{0}; + fmt::print("Task: #iterations = {}\n", task_iterations); + task_iterations++; + std::unique_lock lk(m); + // note our sleep here is longer than the watchdog timeout, so we should + // trigger the watchdog timeout + cv.wait_for(lk, 500ms); + // we don't want to stop, so return false + return false; + }; + auto task = espp::Task( + {.name = "Task 1", .callback = task_fn, .log_level = espp::Logger::Verbosity::DEBUG}); + task.start(); + task.start_watchdog(); // start the watchdog timer for this task + std::this_thread::sleep_for(500ms); + std::error_code ec; + std::string watchdog_info = espp::Task::get_watchdog_info(ec); + if (ec) { + fmt::print("Error getting watchdog info: {}\n", ec.message()); + } else if (!watchdog_info.empty()) { + fmt::print("Watchdog info: {}\n", watchdog_info); + } else { + fmt::print("No watchdog info available\n"); + } + // NOTE: the task and the watchdog will both automatically get stopped when + // the task goes out of scope and is destroyed. + //! [task watchdog example] + } + test_end = std::chrono::high_resolution_clock::now(); + test_duration = std::chrono::duration(test_end - test_start).count(); + logger.debug("Test ran for {:.03f} seconds", test_duration); /** * Show the most efficient way to wait in a task, using provided mutex / @@ -86,7 +143,7 @@ extern "C" void app_main(void) { //! [ManyTask example] std::vector> tasks; size_t num_tasks = 10; - fmt::print("Spawning {} tasks!\n", num_tasks); + logger.info("Many task example: spawning {} tasks!", num_tasks); tasks.resize(num_tasks); auto start = std::chrono::high_resolution_clock::now(); for (size_t i = 0; i < num_tasks; i++) { @@ -115,7 +172,7 @@ extern "C" void app_main(void) { } test_end = std::chrono::high_resolution_clock::now(); test_duration = std::chrono::duration(test_end - test_start).count(); - fmt::print("Test ran for {:.03f} seconds\n", test_duration); + logger.debug("Test ran for {:.03f} seconds", test_duration); /** * Show the more convenient way to wait in a task, simply using sleep_for or @@ -126,7 +183,8 @@ extern "C" void app_main(void) { { std::vector> tasks; size_t num_tasks = 10; - fmt::print("Spawning {} tasks!\n", num_tasks); + logger.info("Convenient, but inefficient / blocking sleep example: spawning {} tasks!", + num_tasks); tasks.resize(num_tasks); auto start = std::chrono::high_resolution_clock::now(); for (size_t i = 0; i < num_tasks; i++) { @@ -157,7 +215,11 @@ extern "C" void app_main(void) { } test_end = std::chrono::high_resolution_clock::now(); test_duration = std::chrono::duration(test_end - test_start).count(); - fmt::print("Test ran for {:.03f} seconds\n", test_duration); + logger.debug("Test ran for {:.03f} seconds", test_duration); + logger.debug( + "Note: some tasks ran for multiple iterations, and not all tasks stopped at the same time!"); + logger.debug("This is because the sleep_for() function cannot be interrupted / notified when the " + "task is stopped / destroyed."); /** * Show an example of a long running task that has multiple steps per @@ -165,7 +227,7 @@ extern "C" void app_main(void) { */ test_start = std::chrono::high_resolution_clock::now(); { - fmt::print("Spawning complex task for {} seconds!\n", num_seconds_to_run); + logger.info("Spawning long-running / complex task for {} seconds!", num_seconds_to_run); //! [LongRunningTask example] auto task_fn = [](std::mutex &m, std::condition_variable &cv) { static size_t task_iterations{0}; @@ -207,14 +269,14 @@ extern "C" void app_main(void) { } test_end = std::chrono::high_resolution_clock::now(); test_duration = std::chrono::duration(test_end - test_start).count(); - fmt::print("Test ran for {:.03f} seconds\n", test_duration); + logger.debug("Test ran for {:.03f} seconds", test_duration); /** * Show an example of printing out the task info from another thread. */ test_start = std::chrono::high_resolution_clock::now(); { - fmt::print("Spawning 1 task for {} seconds!\n", num_seconds_to_run); + logger.info("Print task info example: spawning 1 task for {} seconds!", num_seconds_to_run); //! [Task Info example] auto task_fn = [](std::mutex &m, std::condition_variable &cv) { static size_t task_iterations{0}; @@ -248,7 +310,7 @@ extern "C" void app_main(void) { } test_end = std::chrono::high_resolution_clock::now(); test_duration = std::chrono::duration(test_end - test_start).count(); - fmt::print("Test ran for {:.03f} seconds\n", test_duration); + logger.debug("Test ran for {:.03f} seconds", test_duration); /** * Show an example of the task auto-stopping itself from within the task @@ -256,7 +318,7 @@ extern "C" void app_main(void) { */ test_start = std::chrono::high_resolution_clock::now(); { - fmt::print("Spawning 1 task for {} seconds!\n", num_seconds_to_run); + logger.info("Task request stop example: spawning 1 task for {} seconds!", num_seconds_to_run); //! [Task Request Stop example] auto task_fn = [&num_seconds_to_run](std::mutex &m, std::condition_variable &cv) { static auto begin = std::chrono::high_resolution_clock::now(); @@ -286,7 +348,7 @@ extern "C" void app_main(void) { } test_end = std::chrono::high_resolution_clock::now(); test_duration = std::chrono::duration(test_end - test_start).count(); - fmt::print("Test ran for {:.03f} seconds\n", test_duration); + logger.debug("Test ran for {:.03f} seconds", test_duration); /** * Show an example of the task auto-stopping itself from within the task @@ -294,7 +356,8 @@ extern "C" void app_main(void) { */ test_start = std::chrono::high_resolution_clock::now(); { - fmt::print("Spawning 1 task for {} seconds!\n", num_seconds_to_run); + logger.info("Task request stop, then restart example: spawning 1 task for {} seconds!", + num_seconds_to_run); //! [Task Request Stop Then Restart example] auto task_fn = [&num_seconds_to_run](std::mutex &m, std::condition_variable &cv) { static auto begin = std::chrono::high_resolution_clock::now(); @@ -332,7 +395,7 @@ extern "C" void app_main(void) { } test_end = std::chrono::high_resolution_clock::now(); test_duration = std::chrono::duration(test_end - test_start).count(); - fmt::print("Test ran for {:.03f} seconds\n", test_duration); + logger.debug("Test ran for {:.03f} seconds", test_duration); /** * Show an example of a task which is stopped by multiple other tasks @@ -341,7 +404,7 @@ extern "C" void app_main(void) { */ test_start = std::chrono::high_resolution_clock::now(); { - fmt::print("Task Request Stop From Multiple Threads example\n"); + logger.info("Task Request Stop From Multiple Threads example"); //! [Task Request Stop From Multiple Threads example] auto task_fn = [&num_seconds_to_run](std::mutex &m, std::condition_variable &cv) { static auto begin = std::chrono::high_resolution_clock::now(); @@ -384,54 +447,55 @@ extern "C" void app_main(void) { for (auto &t : threads) { t.join(); } - fmt::print("Task successfully stopped by multiple threads!\n"); + logger.debug("Task successfully stopped by multiple threads!"); //! [Task Request Stop From Multiple Threads example] } /** - * Show an example of a task which calls stop on itself from within the task - */ + * Show an example of a task which calls stop on itself from within the task + */ test_start = std::chrono::high_resolution_clock::now(); { - fmt::print("Task Request Stop From Within Task example\n"); + logger.info("Task Request Stop From Within Task example"); //! [Task Request Stop From Within Task example] - espp::Task task = espp::Task({.name = "Self Stopping Task", - .callback = - [&num_seconds_to_run, &task](std::mutex &m, std::condition_variable &cv) { - static auto begin = std::chrono::high_resolution_clock::now(); - auto now = std::chrono::high_resolution_clock::now(); - auto elapsed = std::chrono::duration(now - begin).count(); - fmt::print("Task has run for {:.03f} seconds\n", elapsed); - // NOTE: sleeping in this way allows the sleep to exit early when the - // task is being stopped / destroyed - { - std::unique_lock lk(m); - cv.wait_for(lk, 100ms); - } - if (elapsed > num_seconds_to_run) { - fmt::print("Stopping task from within task...\n"); - task.stop(); - } - // do some other work here which can't be preempted, this helps force the - // stopping threads to try to contend on the thread join within the stop - // call - std::this_thread::sleep_for(50ms); - // we don't want to stop yet, so return false - return false; - }, + espp::Task task = + espp::Task({.name = "Self Stopping Task", + .callback = + [&num_seconds_to_run, &task](std::mutex &m, std::condition_variable &cv) { + static auto begin = std::chrono::high_resolution_clock::now(); + auto now = std::chrono::high_resolution_clock::now(); + auto elapsed = std::chrono::duration(now - begin).count(); + fmt::print("Task has run for {:.03f} seconds\n", elapsed); + // NOTE: sleeping in this way allows the sleep to exit early when the + // task is being stopped / destroyed + { + std::unique_lock lk(m); + cv.wait_for(lk, 100ms); + } + if (elapsed > num_seconds_to_run) { + fmt::print("Stopping task from within task...\n"); + task.stop(); + } + // do some other work here which can't be preempted, this helps force the + // stopping threads to try to contend on the thread join within the stop + // call + std::this_thread::sleep_for(50ms); + // we don't want to stop yet, so return false + return false; + }, - .log_level = espp::Logger::Verbosity::DEBUG}); + .log_level = espp::Logger::Verbosity::DEBUG}); task.start(); while (task.is_started()) { std::this_thread::sleep_for(50ms); } - fmt::print("Task successfully stopped by itself!\n"); + logger.debug("Task successfully stopped by itself!"); //! [Task Request Stop From Within Task example] } { //! [run on core example] - fmt::print("Example main running on core {}\n", xPortGetCoreID()); + logger.info("espp::task::run_on_core example: main thread core ID: {}", xPortGetCoreID()); // NOTE: in these examples, because we're logging with libfmt in the // function to be run, we need a little more than the default 2k stack size, // so we're using 3k. @@ -466,7 +530,7 @@ extern "C" void app_main(void) { //! [run on core example] } - fmt::print("Task example complete!\n"); + logger.info("Task example complete!"); while (true) { std::this_thread::sleep_for(1s); diff --git a/components/task/include/task.hpp b/components/task/include/task.hpp index a6baa7a55..0ca322b59 100644 --- a/components/task/include/task.hpp +++ b/components/task/include/task.hpp @@ -8,6 +8,7 @@ #if defined(ESP_PLATFORM) #include +#include #include #include #endif @@ -30,13 +31,15 @@ namespace espp { * * \section task_ex1 Basic Task Example * \snippet task_example.cpp Task example - * \section task_ex2 Many Task Example + * \section task_ex2 Task Watchdog Example + * \snippet task_example.cpp task watchdog example + * \section task_ex3 Many Task Example * \snippet task_example.cpp ManyTask example - * \section task_ex3 Long Running Task Example + * \section task_ex4 Long Running Task Example * \snippet task_example.cpp LongRunningTask example - * \section task_ex4 Task Info Example + * \section task_ex5 Task Info Example * \snippet task_example.cpp Task Info example - * \section task_ex5 Task Request Stop Example + * \section task_ex6 Task Request Stop Example * \snippet task_example.cpp Task Request Stop example * * \section run_on_core_ex1 Run on Core Example @@ -44,11 +47,11 @@ namespace espp { */ class Task : public espp::BaseComponent { public: - #if defined(ESP_PLATFORM) - typedef void* task_id_t; - #else +#if defined(ESP_PLATFORM) + typedef void *task_id_t; +#else typedef std::thread::id task_id_t; - #endif +#endif /** * @brief Task callback function signature. @@ -221,6 +224,56 @@ class Task : public espp::BaseComponent { bool is_running() const; #if defined(ESP_PLATFORM) || defined(_DOXYGEN_) + /** + * @brief Start the task watchdog for this task. + * @return true if the watchdog was started, false otherwise. + */ + bool start_watchdog(); + + /** + * @brief Stop the task watchdog for this task. + * @return true if the watchdog was stopped, false otherwise. + */ + bool stop_watchdog(); + + /** + * @brief Initialize/Configure the task watchdog for the current task. + * @param timeout_ms Timeout in milliseconds for the watchdog. + * @param panic_on_timeout Whether or not to panic on timeout. + * @return true if the watchdog was initialized, false otherwise. + * @note This function will not monitor the idle tasks. + * @note If the watchdog has not been configured, then this function will call + * `esp_task_wdt_init`, otherwise it will then call + * `esp_task_wdt_reconfigure`. + */ + static bool configure_task_watchdog(uint32_t timeout_ms, bool panic_on_timeout = true); + + /** + * @brief Initialize/Configure the task watchdog for the current task. + * @param timeout The timeout for the watchdog. + * @param panic_on_timeout Whether or not to panic on timeout. + * @return true if the watchdog was initialized, false otherwise. + * @note This function will not monitor the idle tasks. + * @note If the watchdog has not been configured, then this function will call + * `esp_task_wdt_init`, otherwise it will then call + * `esp_task_wdt_reconfigure`. + */ + static bool configure_task_watchdog(const std::chrono::milliseconds &timeout, + bool panic_on_timeout = true); + + /** + * @brief Retrieve the info about tasks / users which triggered the task + * watchdog timeout. + * @param ec Error code to set if there was an error retrieving the info. + * @return std::string containing the task watchdog info, or an empty string + * if there was no timeout or there was an error retrieving the info. + * @note This function will only return info for tasks which are still + * registered with the watchdog. If you call this after you have called + * stop_watchdog() for a task, then even if the task triggered the + * watchdog timeout you will not get that information. + */ + static std::string get_watchdog_info(std::error_code &ec); + /** * @brief Get the info (as a string) for the task of the current context. * @return std::string containing name, core ID, priority, and stack high @@ -244,11 +297,11 @@ class Task : public espp::BaseComponent { * @return ID for this Task's thread / task context. */ task_id_t get_id() const { - #if defined(ESP_PLATFORM) +#if defined(ESP_PLATFORM) return task_handle_; - #else +#else return thread_.get_id(); - #endif +#endif } /** @@ -256,11 +309,11 @@ class Task : public espp::BaseComponent { * @return ID for the current thread / task context. */ static task_id_t get_current_id() { - #if defined(ESP_PLATFORM) +#if defined(ESP_PLATFORM) return static_cast(xTaskGetCurrentTaskHandle()); - #else +#else return std::this_thread::get_id(); - #endif +#endif } protected: @@ -288,9 +341,10 @@ class Task : public espp::BaseComponent { std::mutex cv_m_; std::mutex thread_mutex_; std::thread thread_; - #if defined(ESP_PLATFORM) +#if defined(ESP_PLATFORM) + std::atomic watchdog_started_{false}; task_id_t task_handle_; - #endif +#endif }; } // namespace espp diff --git a/components/task/src/task.cpp b/components/task/src/task.cpp index 1cfd947e4..6b2488f07 100644 --- a/components/task/src/task.cpp +++ b/components/task/src/task.cpp @@ -94,6 +94,9 @@ bool Task::stop() { if (started_) { started_ = false; logger_.debug("Stopping task"); +#if defined(ESP_PLATFORM) + stop_watchdog(); +#endif notify_and_join(); logger_.debug("Task stopped"); return true; @@ -103,6 +106,95 @@ bool Task::stop() { } } +#if defined(ESP_PLATFORM) +bool Task::start_watchdog() { + if (!started_) { + logger_.warn("Task not started, cannot start watchdog!"); + return false; + } + if (watchdog_started_) { + logger_.debug("Watchdog already started!"); + return false; + } + logger_.debug("Starting watchdog for task '{}'", name_); + // subscribe to the watchdog + auto task_handle = static_cast(get_id()); + if (task_handle == nullptr) { + logger_.error("Failed to get task handle for task '{}'", name_); + return false; + } + auto err = esp_task_wdt_add(task_handle); + if (err != ESP_OK) { + logger_.error("Failed to start watchdog for task '{}'", name_); + return false; + } + // everything is good, set the flag + watchdog_started_ = true; + return true; +} + +bool Task::stop_watchdog() { + if (!watchdog_started_) { + logger_.debug("Watchdog already stopped!"); + return false; + } + logger_.debug("Stopping watchdog for task '{}'", name_); + // update the flag + watchdog_started_ = false; + // unsubscribe from the watchdog + auto task_handle = static_cast(get_id()); + if (task_handle == nullptr) { + logger_.error("Failed to get task handle for task '{}'", name_); + return false; + } + auto err = esp_task_wdt_delete(task_handle); + if (err != ESP_OK) { + logger_.error("Failed to stop watchdog for task '{}'", name_); + } + return err == ESP_OK; +} + +bool Task::configure_task_watchdog(uint32_t timeout_ms, bool panic_on_timeout) { + esp_task_wdt_config_t config; + memset(&config, 0, sizeof(config)); + config.timeout_ms = timeout_ms; + config.trigger_panic = panic_on_timeout; + auto err = esp_task_wdt_status(nullptr); + if (err == ESP_ERR_INVALID_STATE) { + // the watchdog was not initialized yet, so initialize it + err = esp_task_wdt_init(&config); + } else if (err == ESP_OK || err == ESP_ERR_NOT_FOUND) { + // the watchdog is already initialized, so reconfigure it + err = esp_task_wdt_reconfigure(&config); + } else { + // some other error occurred + return false; + } + return err == ESP_OK; +} + +bool Task::configure_task_watchdog(const std::chrono::milliseconds &timeout, + bool panic_on_timeout) { + return configure_task_watchdog(timeout.count(), panic_on_timeout); +} + +std::string Task::get_watchdog_info(std::error_code &ec) { + std::string info = ""; + auto err = esp_task_wdt_print_triggered_tasks( + [](void *arg, const char *msg) { + std::string *info = static_cast(arg); + *info += msg; + }, + &info, nullptr); + if (err == ESP_FAIL) { + // no triggered tasks were found, no information was printed + } else if (err != ESP_OK) { + ec = std::make_error_code(std::errc::io_error); + } + return info; +} +#endif + void Task::notify_and_join() { { std::lock_guard lock(cv_m_); @@ -142,7 +234,7 @@ std::string Task::get_info(const Task &task) { void Task::thread_function() { #if defined(ESP_PLATFORM) task_handle_ = get_current_id(); -#endif +#endif // ESP_PLATFORM while (started_) { if (callback_) { bool should_stop = callback_(cv_m_, cv_); @@ -164,5 +256,14 @@ void Task::thread_function() { started_ = false; break; } +#if defined(ESP_PLATFORM) + // check if the watchdog is enabled + if (watchdog_started_) { + auto err = esp_task_wdt_reset(); + if (err != ESP_OK) { + logger_.error("Watchdog reset failed for task '{}'", name_); + } + } +#endif // ESP_PLATFORM } }