Skip to content

Commit

Permalink
per-thread user time
Browse files Browse the repository at this point in the history
  • Loading branch information
d-netto committed Oct 1, 2024
1 parent 75cf8dc commit 1cbe632
Show file tree
Hide file tree
Showing 9 changed files with 90 additions and 5 deletions.
4 changes: 4 additions & 0 deletions base/timing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,10 @@ function cumulative_compile_timing(b::Bool)
return
end

function cumulative_scheduler_time_ns()
return ccall(:jl_cumulative_scheduler_time_ns, UInt64, ())
end

# total time spend in garbage collection, in nanoseconds
gc_time_ns() = ccall(:jl_gc_total_hrtime, UInt64, ())

Expand Down
17 changes: 14 additions & 3 deletions src/gc.c
Original file line number Diff line number Diff line change
Expand Up @@ -3762,7 +3762,11 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection)
int8_t old_state = jl_atomic_load_relaxed(&ptls->gc_state);
jl_atomic_store_release(&ptls->gc_state, JL_GC_STATE_WAITING);
// `jl_safepoint_start_gc()` makes sure only one thread can run the GC.
uint64_t t0 = jl_hrtime();

// time we started the GC stop-the-world phase
uint64_t safepoint_start_wall_time = jl_hrtime();
uint64_t safepoint_start_for_tls_timing = jl_record_time_for_tls_metric();

if (!jl_safepoint_start_gc()) {
// either another thread is running GC, or the GC got disabled just now.
jl_gc_state_set(ptls, old_state, JL_GC_STATE_WAITING);
Expand Down Expand Up @@ -3792,8 +3796,11 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection)
jl_gc_wait_for_the_world(gc_all_tls_states, gc_n_threads);
JL_PROBE_GC_STOP_THE_WORLD();

uint64_t t1 = jl_hrtime();
uint64_t duration = t1 - t0;
// time we ended the GC stop-the-world phase
uint64_t safepoint_end_wall_time = jl_hrtime();
uint64_t safepoint_end_for_tls_timing = jl_record_time_for_tls_metric();

uint64_t duration = safepoint_end_wall_time - safepoint_start_wall_time;
if (duration > gc_num.max_time_to_safepoint)
gc_num.max_time_to_safepoint = duration;
gc_num.time_to_safepoint = duration;
Expand Down Expand Up @@ -3822,6 +3829,10 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection)
jl_gc_state_set(ptls, old_state, JL_GC_STATE_WAITING);
JL_PROBE_GC_END();

// Measure how long safepoint (stop-the-world) and GC took
jl_increment_timing_tls_metric(ptls, safepoint_time, safepoint_end_for_tls_timing - safepoint_start_for_tls_timing);
jl_increment_timing_tls_metric(ptls, gc_time, jl_record_time_for_tls_metric() - safepoint_end_for_tls_timing);

// Only disable finalizers on current thread
// Doing this on all threads is racy (it's impossible to check
// or wait for finalizers on other threads without dead lock).
Expand Down
1 change: 1 addition & 0 deletions src/julia_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ int jl_running_under_rr(int recheck) JL_NOTSAFEPOINT;
// timers
// Returns time in nanosec
JL_DLLEXPORT uint64_t jl_hrtime(void) JL_NOTSAFEPOINT;
JL_DLLEXPORT uint64_t jl_record_time_for_tls_metric(void) JL_NOTSAFEPOINT;

JL_DLLEXPORT void jl_set_peek_cond(uintptr_t);
JL_DLLEXPORT double jl_get_profile_peek_duration(void);
Expand Down
4 changes: 4 additions & 0 deletions src/julia_locks.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,14 @@ JL_DLLEXPORT int _jl_mutex_trylock_nogc(jl_task_t *self, jl_mutex_t *lock) JL_NO
JL_DLLEXPORT int _jl_mutex_trylock(jl_task_t *self, jl_mutex_t *lock);
JL_DLLEXPORT void _jl_mutex_unlock(jl_task_t *self, jl_mutex_t *lock);
JL_DLLEXPORT void _jl_mutex_unlock_nogc(jl_mutex_t *lock) JL_NOTSAFEPOINT;
// Unfortunately we can't include `julia_internal.h` here, so we need to forward declare this
JL_DLLEXPORT uint64_t jl_record_time_for_tls_metric(void) JL_NOTSAFEPOINT;

static inline void jl_mutex_wait(jl_mutex_t *lock, int safepoint)
{
uint64_t t0 = jl_record_time_for_tls_metric();
_jl_mutex_wait(jl_current_task, lock, safepoint);
jl_increment_timing_tls_metric(jl_current_task->ptls, internal_spinlock_time, jl_record_time_for_tls_metric() - t0);
}

static inline void jl_mutex_lock_nogc(jl_mutex_t *lock) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER
Expand Down
14 changes: 14 additions & 0 deletions src/julia_threads.h
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,19 @@ typedef struct {

struct _jl_bt_element_t;

typedef struct {
uint64_t internal_spinlock_time;
uint64_t scheduler_time;
uint64_t safepoint_time;
uint64_t gc_time;
} jl_timing_tls_states_t;

#define jl_increment_timing_tls_metric(ptls, metric, dt) do { \
if (ptls) { \
ptls->timing_tls.metric += dt; \
} \
} while (0)

// This includes all the thread local states we care about for a thread.
// Changes to TLS field types must be reflected in codegen.
#define JL_MAX_BT_SIZE 80000
Expand Down Expand Up @@ -155,6 +168,7 @@ typedef struct _jl_tls_states_t {
// Counter to disable finalizer **on the current thread**
int finalizers_inhibited;
jl_gc_tls_states_t gc_tls; // this is very large, and the offset of the first member is baked into codegen
jl_timing_tls_states_t timing_tls;
volatile sig_atomic_t defer_signal;
_Atomic(struct _jl_task_t*) current_task;
struct _jl_task_t *next_task;
Expand Down
10 changes: 9 additions & 1 deletion src/partr.c
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,7 @@ static int may_sleep(jl_ptls_t ptls) JL_NOTSAFEPOINT

extern _Atomic(unsigned) _threadedregion;

JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q, jl_value_t *checkempty)
static jl_task_t *_jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q, jl_value_t *checkempty)
{
jl_task_t *ct = jl_current_task;
uint64_t start_cycles = 0;
Expand Down Expand Up @@ -538,6 +538,14 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q,
}
}

JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q, jl_value_t *checkempty)
{
uint64_t t0 = jl_record_time_for_tls_metric();
jl_task_t *task = _jl_task_get_next(trypoptask, q, checkempty);
jl_increment_timing_tls_metric(jl_current_task->ptls, scheduler_time, jl_record_time_for_tls_metric() - t0);
return task;
}

#ifdef __cplusplus
}
#endif
9 changes: 8 additions & 1 deletion src/safepoint.c
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ void jl_safepoint_end_gc(void)
uv_cond_broadcast(&safepoint_cond);
}

void jl_safepoint_wait_gc(void)
void _jl_safepoint_wait_gc(void)
{
jl_task_t *ct = jl_current_task; (void)ct;
JL_TIMING_SUSPEND_TASK(GC_SAFEPOINT, ct);
Expand All @@ -175,6 +175,13 @@ void jl_safepoint_wait_gc(void)
}
}

void jl_safepoint_wait_gc(void)
{
uint64_t t0 = jl_record_time_for_tls_metric();
_jl_safepoint_wait_gc();
jl_increment_timing_tls_metric(jl_current_task->ptls, gc_time, jl_record_time_for_tls_metric() - t0);
}

void jl_safepoint_enable_sigint(void)
{
uv_mutex_lock(&safepoint_lock);
Expand Down
22 changes: 22 additions & 0 deletions src/sys.c
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,28 @@ JL_DLLEXPORT uint64_t jl_hrtime(void) JL_NOTSAFEPOINT
return uv_hrtime();
}

JL_DLLEXPORT uint64_t jl_cpu_time(void) JL_NOTSAFEPOINT
{
#if defined(_OS_LINUX_)
struct rusage usage;
if (getrusage(RUSAGE_THREAD, &usage) == 0) {
return (uint64_t)usage.ru_utime.tv_sec * 1000000000 + (uint64_t)usage.ru_utime.tv_usec * 1000;
}
return 0;
#else
return 0;
#endif
}

JL_DLLEXPORT uint64_t jl_record_time_for_tls_metric(void) JL_NOTSAFEPOINT
{
#ifdef USE_CPU_TIMING_FOR_TLS_METRIC
return jl_cpu_time();
#else
return jl_hrtime();
#endif
}

// -- iterating the environment --

#ifdef __APPLE__
Expand Down
14 changes: 14 additions & 0 deletions src/threading.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,20 @@ JL_DLLEXPORT _Atomic(uint8_t) jl_measure_compile_time_enabled = 0;
JL_DLLEXPORT _Atomic(uint64_t) jl_cumulative_compile_time = 0;
JL_DLLEXPORT _Atomic(uint64_t) jl_cumulative_recompile_time = 0;

JL_DLLEXPORT uint64_t jl_cumulative_scheduler_time_ns(void)
{
int nthreads = jl_atomic_load_acquire(&jl_n_threads);
jl_ptls_t *all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states);
uint64_t scheduler_time = 0;
for (int i = 0; i < nthreads; i++) {
jl_ptls_t ptls = all_tls_states[i];
if (ptls == NULL)
continue;
scheduler_time += ptls->timing_tls.scheduler_time;
}
return scheduler_time;
}

JL_DLLEXPORT void *jl_get_ptls_states(void)
{
// mostly deprecated: use current_task instead
Expand Down

0 comments on commit 1cbe632

Please sign in to comment.