Skip to content

Commit

Permalink
Optimize load time and TTFX for non-comparison cases. (#137)
Browse files Browse the repository at this point in the history
- Separate _benchmark_1 from benchmark and eliminate keyword arguments
- Move bench out of _benchmark_1 into its own toplevel function _benchmark_2
- Inline a constant
- Update precompile directives

Comparison of PR to 1.2.2 

```julia-repl
julia> f(expr; old=false, show=false) = eval(Meta.parse(read(`julia $(old ? () : "--project") $(show ? "--trace-compile=stderr" : ()) -E $("a = @elapsed using Chairmarks; a,@elapsed @eval $expr")`, String)))
f (generic function with 3 methods)

julia> comp(x) = f(x, old=true) => f(x)
comp (generic function with 1 method)

julia> comp("@b rand evals=1 samples=1")
(0.008544065, 0.038519668) => (0.008942193, 0.033227878)

julia> comp("@b rand hash seconds=.001")
(0.00854119, 0.046308936) => (0.009106361, 0.038525668)

julia> comp("@b rand hash(_) seconds=.001")
(0.008300188, 0.051189014) => (0.008772066, 0.038625961)

julia> comp("@b rand hash evals=1 samples=1")
(0.008669608, 0.041850652) => (0.008984026, 0.037786705)

julia> comp("@b rand hash(_),hash evals=1 samples=1") # This has a major behavior change
(0.008611357, 0.048080741) => (0.008755858, 0.336857771)
```
  • Loading branch information
LilithHafner authored Nov 24, 2024
1 parent dfb6b6b commit ea0fe66
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 30 deletions.
65 changes: 37 additions & 28 deletions src/benchmarking.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,15 @@ function benchmark(init, setup, fs::Tuple{Vararg{Any, N}}, teardown;
samples::Union{Int, Nothing}=nothing,
seconds::Union{Real, Nothing}=(samples===nothing ? DEFAULTS.seconds : 10*DEFAULTS.seconds)*N,
gc::Bool=DEFAULTS.gc) where N
_benchmark_1(init, setup, teardown, evals, samples, seconds, gc, fs...)
end
# Convenience method for a `seconds` budget given as any `Real`: convert it to `Float64`
# and forward every other argument unchanged to the `_benchmark_1` method that accepts
# `seconds::Union{Float64, Nothing}`.
function _benchmark_1(init, setup, teardown, evals::Union{Int, Nothing}, samples::Union{Int, Nothing}, seconds::Real, gc::Bool, fs...)
    budget = Float64(seconds)
    return _benchmark_1(init, setup, teardown, evals, samples, budget, gc, fs...)
end
function _benchmark_1(init, setup, teardown, evals::Union{Int, Nothing}, samples::Union{Int, Nothing}, seconds::Union{Float64, Nothing}, gc::Bool, fs...)
@nospecialize
N = length(fs)

if seconds !== nothing && seconds >= 2.0^63*1e-9
if seconds !== nothing && seconds >= 9.223372036854776e9 # 2.0^63*1e-9
samples === nothing && throw(ArgumentError("samples must be specified if seconds is infinite or nearly infinite (more than 292 years)"))
seconds = nothing
end
Expand All @@ -36,27 +42,9 @@ function benchmark(init, setup, fs::Tuple{Vararg{Any, N}}, teardown;

args1 = maybecall(init, ())

function bench(evals, warmup=true)
p = N == 1 ? (1,) : N == 2 ? rand() < .5 ? (1,2) : (2,1) : randperm(N)
t = Ref(zero(UInt64))
args2 = maybecall(setup, args1)
rp = ntuple(N) do i
old_gc = gc || GC.enable(false)
sample, ti, args3 = try
_benchmark(fs[p[i]], args2, evals, warmup)
finally
gc || GC.enable(old_gc)
end
maybecall(teardown, (args3,))
t[] = ti
sample
end
ntuple(i -> rp[p[i]], N), t[]
end

samples == 0 && return ntuple(i -> Benchmark([bench(evals, false)[1][i]]), N)
samples == 0 && return ntuple(i -> Benchmark([_benchmark_2(args1, setup, teardown, gc, evals, false, fs...)[1][i]]), N)

warmup, start_time = bench(1, false)
warmup, start_time = _benchmark_2(args1, setup, teardown, gc, 1, false, fs...)

seconds == 0 && return ntuple(i -> Benchmark([warmup[i]]), N)
new_evals = if evals === nothing
Expand All @@ -68,7 +56,7 @@ function benchmark(init, setup, fs::Tuple{Vararg{Any, N}}, teardown;
return ntuple(i -> Benchmark([warmup[i]]), N)
end

calibration1, time = bench(1)
calibration1, time = _benchmark_2(args1, setup, teardown, gc, 1, true, fs...)

# We should be spending about 5% of runtime on calibration.
# If we spent less than 1% then recalibrate with more evals.
Expand All @@ -77,14 +65,14 @@ function benchmark(init, setup, fs::Tuple{Vararg{Any, N}}, teardown;
calibration2 = nothing
calibration2time = nothing
if calibration1time < .00015seconds # This branch protects us against cases where runtime is dominated by the reduction.
calibration2, time = bench(10)
calibration2, time = _benchmark_2(args1, setup, teardown, gc, 10, true, fs...)
calibration2time = sum(s.time for s in calibration2)
trials = floor(Int, .05seconds/(calibration2time+1e-9))
if trials > 20
calibration2, time = bench(trials)
calibration2, time = _benchmark_2(args1, setup, teardown, gc, trials, true, fs...)
end
elseif calibration1time < .01seconds
calibration2, time = bench(floor(Int, .05seconds/(calibration1time+1e-9)))
calibration2, time = _benchmark_2(args1, setup, teardown, gc, floor(Int, .05seconds/(calibration1time+1e-9)), true, fs...)
end
if calibration2 !== nothing
calibration2time = sum(s.time for s in calibration2)
Expand Down Expand Up @@ -116,22 +104,43 @@ function benchmark(init, setup, fs::Tuple{Vararg{Any, N}}, teardown;
elseif evals === nothing && calibration2 !== nothing && first(calibration2).evals == new_evals # Can't match both
data[1] = calibration2
else
data[1], time = bench(new_evals)
data[1], time = _benchmark_2(args1, setup, teardown, gc, new_evals, true, fs...)
end

i = 1
stop_time = seconds === nothing ? nothing : start_time + round(UInt64, 1e9seconds)
while (seconds === nothing || signed(stop_time - time) >= 0) && (samples === nothing || i < samples)
sample, time = bench(new_evals)
sample, time = _benchmark_2(args1, setup, teardown, gc, new_evals, true, fs...)
samples === nothing ? push!(data, sample) : (data[i += 1] = sample)
end

samples === nothing || resize!(data, i)

ntuple(i -> Benchmark([s[i] for s in data]), N)
end

# Collect one sample from every function in `fs`, running them in a random order.
#
# `setup` is applied once (via `maybecall`) to `args1`; its result is shared by all of the
# functions in this round. Each function is run through `_benchmark_3` with the given
# `evals` and `warmup`. When `gc` is false the garbage collector is disabled around that
# call and its previous state is restored afterwards, even if the call throws. `teardown`
# is applied (via `maybecall`) to the third value returned by `_benchmark_3`.
#
# Returns `(samples, t)` where `samples[j]` is the sample taken for `fs[j]` and `t` is the
# second value returned by the last `_benchmark_3` call (presumably a timestamp -- confirm
# against `_benchmark_3`).
function _benchmark_2(args1, setup, teardown, gc::Bool, evals::Int, warmup::Bool, fs...)
    @nospecialize
    N = length(fs)
    # Execution order for this round; position `i` runs `fs[p[i]]`.
    p = N == 1 ? (1,) : N == 2 ? (rand() < .5 ? (1, 2) : (2, 1)) : randperm(N)
    # `rp[i]` below holds the sample for `fs[p[i]]`, so restoring the original order needs
    # the inverse of `p`. Permutations of one or two elements are their own inverse; longer
    # ones generally are not (e.g. `[2, 3, 1]`), and indexing with `p` again would attribute
    # samples to the wrong functions.
    ip = N <= 2 ? p : invperm(p)
    t = Ref(zero(UInt64))
    args2 = maybecall(setup, args1)
    rp = ntuple(N) do i
        # `gc || ...` short-circuits: the collector is only touched when `gc` is false.
        old_gc = gc || GC.enable(false)
        sample, ti, args3 = try
            _benchmark_3(fs[p[i]], args2, evals, warmup)
        finally
            gc || GC.enable(old_gc)
        end
        maybecall(teardown, (args3,))
        t[] = ti
        sample
    end
    return ntuple(j -> rp[ip[j]], N), t[]
end

# Division that maps the 0/0 case to a zero of the quotient's type instead of the raw
# quotient; every other input is divided normally.
function _div(a, b)
    if (a == b) && (b == 0)
        return zero(a / b)
    else
        return a / b
    end
end
function _benchmark(f::F, args::A, evals::Int, warmup::Bool) where {F, A}
function _benchmark_3(f::F, args::A, evals::Int, warmup::Bool) where {F, A}
gcstats = Base.gc_num()
cumulative_compile_timing(true)
ctime, time0, time1, res = try
Expand Down
7 changes: 5 additions & 2 deletions src/precompile.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
precompile(minimum, (Benchmark,))
precompile(summarize, (Benchmark,))
precompile(process_args, (Any,))
precompile(create_function, (Symbol,))
precompile(benchmark, (Any,Any,Tuple{Any},Any))
precompile(create_first_function, (Symbol,))
precompile(create_function, (Expr,))
precompile(create_first_function, (Expr,))
precompile(_benchmark_1, (Any,Any,Any,Int,Int,Float64,Bool,Any))
precompile(Base.show, (IO, MIME"text/plain", Sample))

0 comments on commit ea0fe66

Please sign in to comment.