Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Per-thread cpu time #181

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions base/timing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,10 @@ function cumulative_compile_timing(b::Bool)
return
end

function cumulative_scheduler_time_ns()
return ccall(:jl_cumulative_scheduler_time_ns, UInt64, ())
end

# total time spend in garbage collection, in nanoseconds
gc_time_ns() = ccall(:jl_gc_total_hrtime, UInt64, ())

Expand Down
17 changes: 14 additions & 3 deletions src/gc.c
Original file line number Diff line number Diff line change
Expand Up @@ -3762,7 +3762,11 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection)
int8_t old_state = jl_atomic_load_relaxed(&ptls->gc_state);
jl_atomic_store_release(&ptls->gc_state, JL_GC_STATE_WAITING);
// `jl_safepoint_start_gc()` makes sure only one thread can run the GC.
uint64_t t0 = jl_hrtime();

// time we started the GC stop-the-world phase
uint64_t safepoint_start_wall_time = jl_hrtime();
uint64_t safepoint_start_for_tls_timing = jl_record_time_for_tls_metric();

if (!jl_safepoint_start_gc()) {
// either another thread is running GC, or the GC got disabled just now.
jl_gc_state_set(ptls, old_state, JL_GC_STATE_WAITING);
Expand Down Expand Up @@ -3792,8 +3796,11 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection)
jl_gc_wait_for_the_world(gc_all_tls_states, gc_n_threads);
JL_PROBE_GC_STOP_THE_WORLD();

uint64_t t1 = jl_hrtime();
uint64_t duration = t1 - t0;
// time we ended the GC stop-the-world phase
uint64_t safepoint_end_wall_time = jl_hrtime();
uint64_t safepoint_end_for_tls_timing = jl_record_time_for_tls_metric();

uint64_t duration = safepoint_end_wall_time - safepoint_start_wall_time;
if (duration > gc_num.max_time_to_safepoint)
gc_num.max_time_to_safepoint = duration;
gc_num.time_to_safepoint = duration;
Expand Down Expand Up @@ -3822,6 +3829,10 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection)
jl_gc_state_set(ptls, old_state, JL_GC_STATE_WAITING);
JL_PROBE_GC_END();

// Measure how long safepoint (stop-the-world) and GC took
jl_increment_timing_tls_metric(ptls, safepoint_time, safepoint_end_for_tls_timing - safepoint_start_for_tls_timing);
jl_increment_timing_tls_metric(ptls, gc_time, jl_record_time_for_tls_metric() - safepoint_end_for_tls_timing);

// Only disable finalizers on current thread
// Doing this on all threads is racy (it's impossible to check
// or wait for finalizers on other threads without dead lock).
Expand Down
1 change: 1 addition & 0 deletions src/julia_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ int jl_running_under_rr(int recheck) JL_NOTSAFEPOINT;
// timers
// Returns time in nanosec
JL_DLLEXPORT uint64_t jl_hrtime(void) JL_NOTSAFEPOINT;
JL_DLLEXPORT uint64_t jl_record_time_for_tls_metric(void) JL_NOTSAFEPOINT;

JL_DLLEXPORT void jl_set_peek_cond(uintptr_t);
JL_DLLEXPORT double jl_get_profile_peek_duration(void);
Expand Down
4 changes: 4 additions & 0 deletions src/julia_locks.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,14 @@ JL_DLLEXPORT int _jl_mutex_trylock_nogc(jl_task_t *self, jl_mutex_t *lock) JL_NO
JL_DLLEXPORT int _jl_mutex_trylock(jl_task_t *self, jl_mutex_t *lock);
JL_DLLEXPORT void _jl_mutex_unlock(jl_task_t *self, jl_mutex_t *lock);
JL_DLLEXPORT void _jl_mutex_unlock_nogc(jl_mutex_t *lock) JL_NOTSAFEPOINT;
// Unfortunately we can't include `julia_internal.h` here, so we need to forward declare this
JL_DLLEXPORT uint64_t jl_record_time_for_tls_metric(void) JL_NOTSAFEPOINT;

static inline void jl_mutex_wait(jl_mutex_t *lock, int safepoint)
{
uint64_t t0 = jl_record_time_for_tls_metric();
_jl_mutex_wait(jl_current_task, lock, safepoint);
jl_increment_timing_tls_metric(jl_current_task->ptls, internal_spinlock_time, jl_record_time_for_tls_metric() - t0);
d-netto marked this conversation as resolved.
Show resolved Hide resolved
}

static inline void jl_mutex_lock_nogc(jl_mutex_t *lock) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER
Expand Down
14 changes: 14 additions & 0 deletions src/julia_threads.h
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,19 @@ typedef struct {

struct _jl_bt_element_t;

typedef struct {
uint64_t internal_spinlock_time;
uint64_t scheduler_time;
uint64_t safepoint_time;
uint64_t gc_time;
} jl_timing_tls_states_t;

#define jl_increment_timing_tls_metric(ptls, metric, dt) do { \
if (ptls) { \
ptls->timing_tls.metric += dt; \
} \
} while (0)

// This includes all the thread local states we care about for a thread.
// Changes to TLS field types must be reflected in codegen.
#define JL_MAX_BT_SIZE 80000
Expand Down Expand Up @@ -155,6 +168,7 @@ typedef struct _jl_tls_states_t {
// Counter to disable finalizer **on the current thread**
int finalizers_inhibited;
jl_gc_tls_states_t gc_tls; // this is very large, and the offset of the first member is baked into codegen
jl_timing_tls_states_t timing_tls;
volatile sig_atomic_t defer_signal;
_Atomic(struct _jl_task_t*) current_task;
struct _jl_task_t *next_task;
Expand Down
10 changes: 9 additions & 1 deletion src/partr.c
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,7 @@ static int may_sleep(jl_ptls_t ptls) JL_NOTSAFEPOINT

extern _Atomic(unsigned) _threadedregion;

JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q, jl_value_t *checkempty)
static jl_task_t *_jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q, jl_value_t *checkempty)
{
jl_task_t *ct = jl_current_task;
uint64_t start_cycles = 0;
Expand Down Expand Up @@ -538,6 +538,14 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q,
}
}

JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q, jl_value_t *checkempty)
{
uint64_t t0 = jl_record_time_for_tls_metric();
jl_task_t *task = _jl_task_get_next(trypoptask, q, checkempty);
jl_increment_timing_tls_metric(jl_current_task->ptls, scheduler_time, jl_record_time_for_tls_metric() - t0);
return task;
}
Comment on lines +541 to +547
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this got lost in the handoff, but the original plan was to not record the current time twice for the "fast-path" of task switching.

The concern is that adding two syscalls can meaningfully slow down task switching, and the idea was that we probably don't need to do this anyway since the time we'd be measuring in the fast-path case is trivial (we believe).

So ideally I think we'd want to only do this in the slow paths inside jl_task_get_next?

BUT to be honest, I find your approach much cleaner and clearer - and it's nice that we would get an accurate measure of that switch time which is supposed to be cheap......... AND we are going to need something like this anyway for the per-task CPU time in https://relationalai.atlassian.net/browse/RAI-30482...

So maybe if we could solve the vDSO / fast instruction problem, then we could do it this way after all?
Something to resolve CC @kpamnany.


#ifdef __cplusplus
}
#endif
9 changes: 8 additions & 1 deletion src/safepoint.c
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ void jl_safepoint_end_gc(void)
uv_cond_broadcast(&safepoint_cond);
}

void jl_safepoint_wait_gc(void)
void _jl_safepoint_wait_gc(void)
{
jl_task_t *ct = jl_current_task; (void)ct;
JL_TIMING_SUSPEND_TASK(GC_SAFEPOINT, ct);
Expand All @@ -175,6 +175,13 @@ void jl_safepoint_wait_gc(void)
}
}

void jl_safepoint_wait_gc(void)
{
uint64_t t0 = jl_record_time_for_tls_metric();
_jl_safepoint_wait_gc();
jl_increment_timing_tls_metric(jl_current_task->ptls, gc_time, jl_record_time_for_tls_metric() - t0);
}

void jl_safepoint_enable_sigint(void)
{
uv_mutex_lock(&safepoint_lock);
Expand Down
22 changes: 22 additions & 0 deletions src/sys.c
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,28 @@ JL_DLLEXPORT uint64_t jl_hrtime(void) JL_NOTSAFEPOINT
return uv_hrtime();
}

JL_DLLEXPORT uint64_t jl_cpu_time(void) JL_NOTSAFEPOINT
{
#if defined(_OS_LINUX_)
struct rusage usage;
if (getrusage(RUSAGE_THREAD, &usage) == 0) {
return (uint64_t)usage.ru_utime.tv_sec * 1000000000 + (uint64_t)usage.ru_utime.tv_usec * 1000;
}
return 0;
#else
return 0;
#endif
}

JL_DLLEXPORT uint64_t jl_record_time_for_tls_metric(void) JL_NOTSAFEPOINT
{
#ifdef USE_CPU_TIMING_FOR_TLS_METRIC
return jl_cpu_time();
#else
return jl_hrtime();
#endif
}

// -- iterating the environment --

#ifdef __APPLE__
Expand Down
14 changes: 14 additions & 0 deletions src/threading.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,20 @@ JL_DLLEXPORT _Atomic(uint8_t) jl_measure_compile_time_enabled = 0;
JL_DLLEXPORT _Atomic(uint64_t) jl_cumulative_compile_time = 0;
JL_DLLEXPORT _Atomic(uint64_t) jl_cumulative_recompile_time = 0;

JL_DLLEXPORT uint64_t jl_cumulative_scheduler_time_ns(void)
{
int nthreads = jl_atomic_load_acquire(&jl_n_threads);
jl_ptls_t *all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states);
uint64_t scheduler_time = 0;
for (int i = 0; i < nthreads; i++) {
jl_ptls_t ptls = all_tls_states[i];
if (ptls == NULL)
continue;
scheduler_time += ptls->timing_tls.scheduler_time;
}
return scheduler_time;
}

JL_DLLEXPORT void *jl_get_ptls_states(void)
{
// mostly deprecated: use current_task instead
Expand Down