Optimize load time and TTFX for non-comparison cases. (#137)

- Separate _benchmark_1 from benchmark and eliminate keyword arguments
- Move bench out of _benchmark_1 into its own toplevel function _benchmark_2
- Inline a constant
- Update precompile directives

Comparison of PR to 1.2.2

```julia-repl
julia> f(expr; old=false, show=false) = eval(Meta.parse(read(`julia $(old ? () : "--project") $(show ? "--trace-compile=stderr" : ()) -E $("a = @elapsed using Chairmarks; a,@elapsed @eval $expr")`, String)))
f (generic function with 3 methods)

julia> comp(x) = f(x, old=true) => f(x)
comp (generic function with 1 method)

julia> comp("@b rand evals=1 samples=1")
(0.008544065, 0.038519668) => (0.008942193, 0.033227878)

julia> comp("@b rand hash seconds=.001")
(0.00854119, 0.046308936) => (0.009106361, 0.038525668)

julia> comp("@b rand hash(_) seconds=.001")
(0.008300188, 0.051189014) => (0.008772066, 0.038625961)

julia> comp("@b rand hash evals=1 samples=1")
(0.008669608, 0.041850652) => (0.008984026, 0.037786705)

julia> comp("@b rand hash(_),hash evals=1 samples=1") # This has a major behavior change
(0.008611357, 0.048080741) => (0.008755858, 0.336857771)
```
LilithHafner · Nov 24, 2024 · ea0fe66 · ea0fe66
1 parent dfb6b6b
commit ea0fe66
Show file tree

Hide file tree

Showing 2 changed files with 42 additions and 30 deletions.
diff --git a/src/benchmarking.jl b/src/benchmarking.jl
@@ -21,9 +21,15 @@ function benchmark(init, setup, fs::Tuple{Vararg{Any, N}}, teardown;
         samples::Union{Int, Nothing}=nothing,
         seconds::Union{Real, Nothing}=(samples===nothing ? DEFAULTS.seconds : 10*DEFAULTS.seconds)*N,
         gc::Bool=DEFAULTS.gc) where N
+    _benchmark_1(init, setup, teardown, evals, samples, seconds, gc, fs...)
+end
+_benchmark_1(init, setup, teardown, evals::Union{Int, Nothing}, samples::Union{Int, Nothing}, seconds::Real, gc::Bool, fs...) =
+    _benchmark_1(init, setup, teardown, evals, samples, Float64(seconds), gc, fs...)
+function _benchmark_1(init, setup, teardown, evals::Union{Int, Nothing}, samples::Union{Int, Nothing}, seconds::Union{Float64, Nothing}, gc::Bool, fs...)
     @nospecialize
+    N = length(fs)
 
-    if seconds !== nothing && seconds >= 2.0^63*1e-9
+    if seconds !== nothing && seconds >= 9.223372036854776e9 # 2.0^63*1e-9
         samples === nothing && throw(ArgumentError("samples must be specified if seconds is infinite or nearly infinite (more than 292 years)"))
         seconds = nothing
     end
@@ -36,27 +42,9 @@ function benchmark(init, setup, fs::Tuple{Vararg{Any, N}}, teardown;
 
     args1 = maybecall(init, ())
 
-    function bench(evals, warmup=true)
-        p = N == 1 ? (1,) : N == 2 ? rand() < .5 ? (1,2) : (2,1) : randperm(N)
-        t = Ref(zero(UInt64))
-        args2 = maybecall(setup, args1)
-        rp = ntuple(N) do i
-            old_gc = gc || GC.enable(false)
-            sample, ti, args3 = try
-                _benchmark(fs[p[i]], args2, evals, warmup)
-            finally
-                gc || GC.enable(old_gc)
-            end
-            maybecall(teardown, (args3,))
-            t[] = ti
-            sample
-        end
-        ntuple(i -> rp[p[i]], N), t[]
-    end
-
-    samples == 0 && return ntuple(i -> Benchmark([bench(evals, false)[1][i]]), N)
+    samples == 0 && return ntuple(i -> Benchmark([_benchmark_2(args1, setup, teardown, gc, evals, false, fs...)[1][i]]), N)
 
-    warmup, start_time = bench(1, false)
+    warmup, start_time = _benchmark_2(args1, setup, teardown, gc, 1, false, fs...)
 
     seconds == 0 && return ntuple(i -> Benchmark([warmup[i]]), N)
     new_evals = if evals === nothing
@@ -68,7 +56,7 @@ function benchmark(init, setup, fs::Tuple{Vararg{Any, N}}, teardown;
             return ntuple(i -> Benchmark([warmup[i]]), N)
         end
 
-        calibration1, time = bench(1)
+        calibration1, time = _benchmark_2(args1, setup, teardown, gc, 1, true, fs...)
 
         # We should be spending about 5% of runtime on calibration.
         # If we spent less than 1% then recalibrate with more evals.
@@ -77,14 +65,14 @@ function benchmark(init, setup, fs::Tuple{Vararg{Any, N}}, teardown;
         calibration2 = nothing
         calibration2time = nothing
         if calibration1time < .00015seconds # This branch protects us against cases where runtime is dominated by the reduction.
-            calibration2, time = bench(10)
+            calibration2, time = _benchmark_2(args1, setup, teardown, gc, 10, true, fs...)
             calibration2time = sum(s.time for s in calibration2)
             trials = floor(Int, .05seconds/(calibration2time+1e-9))
             if trials > 20
-                calibration2, time = bench(trials)
+                calibration2, time = _benchmark_2(args1, setup, teardown, gc, trials, true, fs...)
             end
         elseif calibration1time < .01seconds
-            calibration2, time = bench(floor(Int, .05seconds/(calibration1time+1e-9)))
+            calibration2, time = _benchmark_2(args1, setup, teardown, gc, floor(Int, .05seconds/(calibration1time+1e-9)), true, fs...)
         end
         if calibration2 !== nothing
             calibration2time = sum(s.time for s in calibration2)
@@ -116,22 +104,43 @@ function benchmark(init, setup, fs::Tuple{Vararg{Any, N}}, teardown;
     elseif evals === nothing && calibration2 !== nothing && first(calibration2).evals == new_evals # Can't match both
         data[1] = calibration2
     else
-        data[1], time = bench(new_evals)
+        data[1], time = _benchmark_2(args1, setup, teardown, gc, new_evals, true, fs...)
     end
 
     i = 1
     stop_time = seconds === nothing ? nothing : start_time + round(UInt64, 1e9seconds)
     while (seconds === nothing || signed(stop_time - time) >= 0) && (samples === nothing || i < samples)
-        sample, time = bench(new_evals)
+        sample, time = _benchmark_2(args1, setup, teardown, gc, new_evals, true, fs...)
         samples === nothing ? push!(data, sample) : (data[i += 1] = sample)
     end
 
     samples === nothing || resize!(data, i)
 
     ntuple(i -> Benchmark([s[i] for s in data]), N)
 end
+
+function _benchmark_2(args1, setup, teardown, gc::Bool, evals::Int, warmup::Bool, fs...)
+    @nospecialize
+    N = length(fs)
+    p = N == 1 ? (1,) : N == 2 ? rand() < .5 ? (1,2) : (2,1) : randperm(N)
+    t = Ref(zero(UInt64))
+    args2 = maybecall(setup, args1)
+    rp = ntuple(N) do i
+        old_gc = gc || GC.enable(false)
+        sample, ti, args3 = try
+            _benchmark_3(fs[p[i]], args2, evals, warmup)
+        finally
+            gc || GC.enable(old_gc)
+        end
+        maybecall(teardown, (args3,))
+        t[] = ti
+        sample
+    end
+    ntuple(i -> rp[p[i]], N), t[]
+end
+
 _div(a, b) = a == b == 0 ? zero(a/b) : a/b
-function _benchmark(f::F, args::A, evals::Int, warmup::Bool) where {F, A}
+function _benchmark_3(f::F, args::A, evals::Int, warmup::Bool) where {F, A}
     gcstats = Base.gc_num()
     cumulative_compile_timing(true)
     ctime, time0, time1, res = try

diff --git a/src/precompile.jl b/src/precompile.jl
@@ -1,5 +1,8 @@
-precompile(minimum, (Benchmark,))
+precompile(summarize, (Benchmark,))
 precompile(process_args, (Any,))
 precompile(create_function, (Symbol,))
-precompile(benchmark, (Any,Any,Tuple{Any},Any))
+precompile(create_first_function, (Symbol,))
+precompile(create_function, (Expr,))
+precompile(create_first_function, (Expr,))
+precompile(_benchmark_1, (Any,Any,Any,Int,Int,Float64,Bool,Any))
 precompile(Base.show, (IO, MIME"text/plain", Sample))