From ab8d798aea8a69b1a7178085d9e799431fbe2a36 Mon Sep 17 00:00:00 2001
From: Imran Khan
Date: Wed, 25 Dec 2024 22:07:07 +1100
Subject: [PATCH 1/2] workqueue: Add helper and corelens module to show unsubmitted pending works.

delayed_work(s) get their pending bit set, but are actually submitted to
a workqueue only upon expiration of the corresponding timer(s). Recently
we have found some cases where a delayed work submitted to an already
offlined CPU was never getting executed, because the underlying timers
were not firing in the first place. Since the pending bit was set, this
gave the impression that the work item had been lost by the workqueue
subsystem (which was not the case here).

Add a helper and a corelens module to dump delayed_work(s) whose timer
has not yet expired. This is of interest mainly for offline CPUs, because
ideally we should not see any delayed_work timer lying on an offlined
CPU. So by default the helper and the corelens module dump this info for
offlined CPUs only, as shown in the snippet below:

python3 -m drgn_tools.corelens vmcore -d ~/v5.4/ -M offlined_delayed_works
CPU: 4 state: offline
timer: ffff8ce6bd7b3a40 tte(jiffies): 289126 work: ffff8ce6bd7b3a20 func: UNKNOWN: 0xffffffffc0327000
timer: ffff8ce6bd7b39e0 tte(jiffies): 289125 work: ffff8ce6bd7b39c0 func: UNKNOWN: 0xffffffffc0327000
timer: ffff8ce6bd7b3980 tte(jiffies): 289125 work: ffff8ce6bd7b3960 func: UNKNOWN: 0xffffffffc0327000
timer: ffff8ce6bd7b3920 tte(jiffies): 289125 work: ffff8ce6bd7b3900 func: UNKNOWN: 0xffffffffc0327000
timer: ffff8ce6bd7b38c0 tte(jiffies): 289124 work: ffff8ce6bd7b38a0 func: UNKNOWN: 0xffffffffc0327000
timer: ffff8ce6bd7b3860 tte(jiffies): 289124 work: ffff8ce6bd7b3840 func: UNKNOWN: 0xffffffffc0327000
timer: ffff8ce6bd7b3800 tte(jiffies): 289124 work: ffff8ce6bd7b37e0 func: UNKNOWN: 0xffffffffc0327000
timer: ffff8ce6bd7b37a0 tte(jiffies): 289124 work: ffff8ce6bd7b3780 func: UNKNOWN: 0xffffffffc0327000
timer: ffff8ce6bd7b3740 tte(jiffies): 289124 work: ffff8ce6bd7b3720 func: UNKNOWN: 
0xffffffffc0327000
timer: ffff8ce6bd7b36e0 tte(jiffies): 289124 work: ffff8ce6bd7b36c0 func: UNKNOWN: 0xffffffffc0327000

Signed-off-by: Imran Khan
---
 drgn_tools/workqueue.py | 96 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 96 insertions(+)

diff --git a/drgn_tools/workqueue.py b/drgn_tools/workqueue.py
index 5f59918..a8f580a 100644
--- a/drgn_tools/workqueue.py
+++ b/drgn_tools/workqueue.py
@@ -13,11 +13,17 @@
 from typing import Union
 
 from drgn import cast
+from drgn import container_of
+from drgn import FaultError
 from drgn import IntegerLike
 from drgn import NULL
 from drgn import Object
 from drgn import Program
+from drgn import sizeof
 from drgn.helpers.common.format import escape_ascii_string
+from drgn.helpers.linux.bitops import for_each_set_bit
+from drgn.helpers.linux.cpumask import for_each_online_cpu
+from drgn.helpers.linux.cpumask import for_each_possible_cpu
 from drgn.helpers.linux.idr import idr_find
 from drgn.helpers.linux.idr import idr_for_each
 from drgn.helpers.linux.list import hlist_for_each_entry
@@ -52,6 +58,7 @@
     "is_task_a_worker",
     "find_worker_executing_work",
     "workqueue_get_pwq",
+    "show_unexpired_delayed_works",
 )
 
 
@@ -614,6 +621,95 @@ def find_worker_executing_work(work: Object) -> Object:
     return NULL(prog, "struct worker *")
 
 
+def show_unexpired_delayed_works(
+    prog: Program, only_offline_cpus: bool = True
+) -> None:
+    """
+    Show delayed_work(s) whose timers have not yet expired.
+
+    delayed_work(s) get their ``WORK_STRUCT_PENDING_BIT`` set, but get
+    submitted only at expiration of the corresponding timer. This helper
+    walks the pending timers of the selected CPUs and prints every timer
+    whose callback is ``delayed_work_timer_fn``, i.e. every delayed_work
+    that has not yet made it to any worker_pool because its timer has
+    not fired for one reason or another.
+
+    :param prog: drgn program
+    :param only_offline_cpus: if True (the default), only delayed_works
+      on offlined CPUs are shown; otherwise all possible CPUs are walked.
+    """
+    try:
+        timer_bases = prog["timer_bases"]
+    except KeyError:
+        # Kernels that predate the non-cascading timer wheel do not have
+        # the per-cpu ``timer_bases`` array, so there is nothing to walk.
+        print("timer_bases not found: cannot walk timers on this kernel")
+        return
+
+    # Look these up once instead of once per timer.
+    dwork_timer_fn = prog["delayed_work_timer_fn"].address_of_()
+    jiffies = prog["jiffies"].value_()
+    online_cpus = set(for_each_online_cpu(prog))
+
+    for cpu in for_each_possible_cpu(prog):
+        is_online = cpu in online_cpus
+        if only_offline_cpus and is_online:
+            continue
+        cpu_state = "online" if is_online else "offline"
+        print(f"CPU: {cpu} state: {cpu_state}")
+        try:
+            for timer_base in per_cpu(timer_bases, cpu):
+                nr_bits = sizeof(timer_base.pending_map) * 8
+                for idx in for_each_set_bit(timer_base.pending_map, nr_bits):
+                    for timer in hlist_for_each_entry(
+                        "struct timer_list",
+                        timer_base.vectors[idx].address_of_(),
+                        "entry",
+                    ):
+                        # Skip timers that do not belong to a delayed_work.
+                        if timer.function != dwork_timer_fn:
+                            continue
+                        dwork = container_of(
+                            timer, "struct delayed_work", "timer"
+                        )
+                        # Time to expiry; negative if already overdue.
+                        tte = timer.expires.value_() - jiffies
+                        work = dwork.work.address_
+                        func_addr = dwork.work.func.value_()
+                        try:
+                            func = prog.symbol(func_addr).name
+                        except LookupError:
+                            func = f"UNKNOWN: 0x{func_addr:x}"
+                        print(
+                            f"timer: {timer.value_():x} "
+                            f"tte(jiffies): {tte} "
+                            f"work: {work:x} func: {func}"
+                        )
+        except FaultError as err:
+            # Best effort: report the fault and move on to the next CPU.
+            print(f"CPU: {cpu} timers could not be read: {err}")
+
+
+class OfflinedDelayedWorksModule(CorelensModule):
+    """
+    Show delayed works from offlined CPUs.
+    Delayed works (with non zero delay), rely on timer-wheel timers for
+    their submission. If these timers don't fire the work does not get
+    submitted. So delayed works submitted to an offlined CPU, don't get
+    executed even after specified delay because timer-wheel timers on
+    offlined CPUs don't get fired in first place.
+
+    This corelens module list delayed works on offlined CPUs, so that
+    one can know if a delayed work was left unexececuted, due to the fact
+    that it was submitted on an offlined CPU.
+    """
+
+    name = "offlined_delayed_works"
+
+    def run(self, prog: Program, args: argparse.Namespace) -> None:
+        show_unexpired_delayed_works(prog)
+
+
 class WorkqueueModule(CorelensModule):
     """Show details about all workqueues"""
 
From 003bdbbba26896aa7e43bf046666bd5861819b41 Mon Sep 17 00:00:00 2001
From: Imran Khan
Date: Wed, 25 Dec 2024 22:12:11 +1100
Subject: [PATCH 2/2] workqueue: add test for helper of previous commit.

Signed-off-by: Imran Khan
---
 drgn_tools/workqueue.py | 4 ++--
 tests/test_workqueue.py | 9 +++++++++
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/drgn_tools/workqueue.py b/drgn_tools/workqueue.py
index a8f580a..3537af1 100644
--- a/drgn_tools/workqueue.py
+++ b/drgn_tools/workqueue.py
@@ -694,13 +694,13 @@ class OfflinedDelayedWorksModule(CorelensModule):
     """
     Show delayed works from offlined CPUs.
     Delayed works (with non zero delay), rely on timer-wheel timers for
-    their submission. If these timers don't fire the work does not get
+    their submission. If these timers don't fire, the work does not get
     submitted. So delayed works submitted to an offlined CPU, don't get
     executed even after specified delay because timer-wheel timers on
     offlined CPUs don't get fired in first place.
 
     This corelens module list delayed works on offlined CPUs, so that
-    one can know if a delayed work was left unexececuted, due to the fact
+    one can know if a delayed work was left unexecuted, due to the fact
     that it was submitted on an offlined CPU.
     """
 
diff --git a/tests/test_workqueue.py b/tests/test_workqueue.py
index b165b01..57e5350 100644
--- a/tests/test_workqueue.py
+++ b/tests/test_workqueue.py
@@ -105,3 +105,12 @@ def test_for_each_pending_work_of_pwq(prog: drgn.Program) -> None:
 
 def test_show_all_workqueues(prog: drgn.Program) -> None:
     wq.show_all_workqueues(prog)
+
+
+def test_show_unexpired_delayed_works(
+    prog: drgn.Program,
+) -> None:
+    wq.show_unexpired_delayed_works(prog)
+    # The default call only visits offlined CPUs; also walk the online
+    # ones so the timer scan is exercised even when every CPU is online.
+    wq.show_unexpired_delayed_works(prog, only_offline_cpus=False)