RelationalAI · d-netto · Sep 30, 2024 · NHDaly · Sep 25, 2024
diff --git a/base/timing.jl b/base/timing.jl
@@ -84,6 +84,10 @@ function cumulative_compile_timing(b::Bool)
     return
 end
 
+function cumulative_scheduler_time_ns()
+    return ccall(:jl_cumulative_scheduler_time_ns, UInt64, ())
+end
+
 # total time spend in garbage collection, in nanoseconds
 gc_time_ns() = ccall(:jl_gc_total_hrtime, UInt64, ())
 

diff --git a/src/gc.c b/src/gc.c
@@ -3762,7 +3762,11 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection)
     int8_t old_state = jl_atomic_load_relaxed(&ptls->gc_state);
     jl_atomic_store_release(&ptls->gc_state, JL_GC_STATE_WAITING);
     // `jl_safepoint_start_gc()` makes sure only one thread can run the GC.
-    uint64_t t0 = jl_hrtime();
+
+    // time we started the GC stop-the-world phase
+    uint64_t safepoint_start_wall_time = jl_hrtime();
+    uint64_t safepoint_start_for_tls_timing = jl_record_time_for_tls_metric();
+
     if (!jl_safepoint_start_gc()) {
         // either another thread is running GC, or the GC got disabled just now.
         jl_gc_state_set(ptls, old_state, JL_GC_STATE_WAITING);
@@ -3792,8 +3796,11 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection)
     jl_gc_wait_for_the_world(gc_all_tls_states, gc_n_threads);
     JL_PROBE_GC_STOP_THE_WORLD();
 
-    uint64_t t1 = jl_hrtime();
-    uint64_t duration = t1 - t0;
+    // time we ended the GC stop-the-world phase
+    uint64_t safepoint_end_wall_time = jl_hrtime();
+    uint64_t safepoint_end_for_tls_timing = jl_record_time_for_tls_metric();
+
+    uint64_t duration = safepoint_end_wall_time - safepoint_start_wall_time;
     if (duration > gc_num.max_time_to_safepoint)
         gc_num.max_time_to_safepoint = duration;
     gc_num.time_to_safepoint = duration;
@@ -3822,6 +3829,10 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection)
     jl_gc_state_set(ptls, old_state, JL_GC_STATE_WAITING);
     JL_PROBE_GC_END();
 
+    // Measure how long safepoint (stop-the-world) and GC took
+    jl_increment_timing_tls_metric(ptls, safepoint_time, safepoint_end_for_tls_timing - safepoint_start_for_tls_timing);
+    jl_increment_timing_tls_metric(ptls, gc_time, jl_record_time_for_tls_metric() - safepoint_end_for_tls_timing);
+
     // Only disable finalizers on current thread
     // Doing this on all threads is racy (it's impossible to check
     // or wait for finalizers on other threads without dead lock).

diff --git a/src/julia_internal.h b/src/julia_internal.h
@@ -192,6 +192,7 @@ int jl_running_under_rr(int recheck) JL_NOTSAFEPOINT;
 // timers
 // Returns time in nanosec
 JL_DLLEXPORT uint64_t jl_hrtime(void) JL_NOTSAFEPOINT;
+JL_DLLEXPORT uint64_t jl_record_time_for_tls_metric(void) JL_NOTSAFEPOINT;
 
 JL_DLLEXPORT void jl_set_peek_cond(uintptr_t);
 JL_DLLEXPORT double jl_get_profile_peek_duration(void);

diff --git a/src/julia_locks.h b/src/julia_locks.h
@@ -22,10 +22,14 @@ JL_DLLEXPORT int _jl_mutex_trylock_nogc(jl_task_t *self, jl_mutex_t *lock) JL_NO
 JL_DLLEXPORT int _jl_mutex_trylock(jl_task_t *self, jl_mutex_t *lock);
 JL_DLLEXPORT void _jl_mutex_unlock(jl_task_t *self, jl_mutex_t *lock);
 JL_DLLEXPORT void _jl_mutex_unlock_nogc(jl_mutex_t *lock) JL_NOTSAFEPOINT;
+// Unfortunately we can't include `julia_internal.h` here, so we need to forward declare this
+JL_DLLEXPORT uint64_t jl_record_time_for_tls_metric(void) JL_NOTSAFEPOINT;
 
 static inline void jl_mutex_wait(jl_mutex_t *lock, int safepoint)
 {
+    uint64_t t0 = jl_record_time_for_tls_metric();
     _jl_mutex_wait(jl_current_task, lock, safepoint);
+    jl_increment_timing_tls_metric(jl_current_task->ptls, internal_spinlock_time, jl_record_time_for_tls_metric() - t0);
 }
 
 static inline void jl_mutex_lock_nogc(jl_mutex_t *lock) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER

diff --git a/src/julia_threads.h b/src/julia_threads.h
@@ -124,6 +124,19 @@ typedef struct {
 
 struct _jl_bt_element_t;
 
+typedef struct {
+    uint64_t internal_spinlock_time;
+    uint64_t scheduler_time;
+    uint64_t safepoint_time;
+    uint64_t gc_time;
+} jl_timing_tls_states_t;
+
+#define jl_increment_timing_tls_metric(ptls, metric, dt) do { \
+    if (ptls) { \
+        ptls->timing_tls.metric += dt; \
+    } \
+} while (0)
+
 // This includes all the thread local states we care about for a thread.
 // Changes to TLS field types must be reflected in codegen.
 #define JL_MAX_BT_SIZE 80000
@@ -155,6 +168,7 @@ typedef struct _jl_tls_states_t {
     // Counter to disable finalizer **on the current thread**
     int finalizers_inhibited;
     jl_gc_tls_states_t gc_tls; // this is very large, and the offset of the first member is baked into codegen
+    jl_timing_tls_states_t timing_tls;
     volatile sig_atomic_t defer_signal;
     _Atomic(struct _jl_task_t*) current_task;
     struct _jl_task_t *next_task;

diff --git a/src/partr.c b/src/partr.c
@@ -385,7 +385,7 @@ static int may_sleep(jl_ptls_t ptls) JL_NOTSAFEPOINT
 
 extern _Atomic(unsigned) _threadedregion;
 
-JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q, jl_value_t *checkempty)
+static jl_task_t *_jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q, jl_value_t *checkempty)
 {
     jl_task_t *ct = jl_current_task;
     uint64_t start_cycles = 0;
@@ -538,6 +538,14 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q,
     }
 }
 
+JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q, jl_value_t *checkempty)
+{
+    uint64_t t0 = jl_record_time_for_tls_metric();
+    jl_task_t *task = _jl_task_get_next(trypoptask, q, checkempty);
+    jl_increment_timing_tls_metric(jl_current_task->ptls, scheduler_time, jl_record_time_for_tls_metric() - t0);
+    return task;
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/safepoint.c b/src/safepoint.c
@@ -156,7 +156,7 @@ void jl_safepoint_end_gc(void)
     uv_cond_broadcast(&safepoint_cond);
 }
 
-void jl_safepoint_wait_gc(void)
+void _jl_safepoint_wait_gc(void)
 {
     jl_task_t *ct = jl_current_task; (void)ct;
     JL_TIMING_SUSPEND_TASK(GC_SAFEPOINT, ct);
@@ -175,6 +175,13 @@ void jl_safepoint_wait_gc(void)
     }
 }
 
+void jl_safepoint_wait_gc(void)
+{
+    uint64_t t0 = jl_record_time_for_tls_metric();
+    _jl_safepoint_wait_gc();
+    jl_increment_timing_tls_metric(jl_current_task->ptls, gc_time, jl_record_time_for_tls_metric() - t0);
+}
+
 void jl_safepoint_enable_sigint(void)
 {
     uv_mutex_lock(&safepoint_lock);

diff --git a/src/sys.c b/src/sys.c
@@ -464,6 +464,28 @@ JL_DLLEXPORT uint64_t jl_hrtime(void) JL_NOTSAFEPOINT
     return uv_hrtime();
 }
 
+JL_DLLEXPORT uint64_t jl_cpu_time(void) JL_NOTSAFEPOINT
+{
+#if defined(_OS_LINUX_)
+    struct rusage usage;
+    if (getrusage(RUSAGE_THREAD, &usage) == 0) {
+        return (uint64_t)usage.ru_utime.tv_sec * 1000000000 + (uint64_t)usage.ru_utime.tv_usec * 1000;
+    }
+    return 0;
+#else
+    return 0;
+#endif
+}
+
+JL_DLLEXPORT uint64_t jl_record_time_for_tls_metric(void) JL_NOTSAFEPOINT
+{
+#ifdef USE_CPU_TIMING_FOR_TLS_METRIC
+    return jl_cpu_time();
+#else
+    return jl_hrtime();
+#endif
+}
+
 // -- iterating the environment --
 
 #ifdef __APPLE__

diff --git a/src/threading.c b/src/threading.c
@@ -49,6 +49,20 @@ JL_DLLEXPORT _Atomic(uint8_t) jl_measure_compile_time_enabled = 0;
 JL_DLLEXPORT _Atomic(uint64_t) jl_cumulative_compile_time = 0;
 JL_DLLEXPORT _Atomic(uint64_t) jl_cumulative_recompile_time = 0;
 
+JL_DLLEXPORT uint64_t jl_cumulative_scheduler_time_ns(void)
+{
+    int nthreads = jl_atomic_load_acquire(&jl_n_threads);
+    jl_ptls_t *all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states);
+    uint64_t scheduler_time = 0;
+    for (int i = 0; i < nthreads; i++) {
+        jl_ptls_t ptls = all_tls_states[i];
+        if (ptls == NULL)
+            continue;
+        scheduler_time += ptls->timing_tls.scheduler_time;
+    }
+    return scheduler_time;
+}
+
 JL_DLLEXPORT void *jl_get_ptls_states(void)
 {
     // mostly deprecated: use current_task instead