Skip to content

Commit

Permalink
added benchmark system details, tested benchmarks on CUDA
Browse files Browse the repository at this point in the history
  • Loading branch information
Andrei Leonard Nicusan committed Jun 18, 2024
1 parent 817f06f commit 3945382
Show file tree
Hide file tree
Showing 6 changed files with 27 additions and 331 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ AcceleratedKernels.jl will also be a fundamental building block of applications


## Benchmark
See `prototype/sort_benchmark.jl` for the benchmark code and `prototype/thrust_sort` for the Thrust wrapper.
See `prototype/sort_benchmark.jl` for the benchmark code and `prototype/thrust_sort` for the Thrust wrapper. The results below are from a system with Linux 6.6.30-2-MANJARO, Intel Core i9-10885H CPU, Nvidia Quadro RTX 4000 with Max-Q Design GPU, Thrust 1.17.1-1, Julia Version 1.10.4.

![Sorting benchmark](https://github.com/anicusan/AcceleratedKernels.jl/blob/main/docs/src/static/sort_benchmark.png?raw=true)

Expand Down
6 changes: 3 additions & 3 deletions prototype/reduce_test.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ using Profile
using PProf

using KernelAbstractions
using oneAPI
using CUDA

import AcceleratedKernels as AK

Expand All @@ -19,12 +19,12 @@ function redmin(s)
s;
init=typemax(eltype(s)),
block_size=256,
switch_below=10_000,
switch_below=0,
)
end


s = oneArray(shuffle(1:1_000_000))
s = CuArray(shuffle(1:1_000_000))
d = redmin(s)
@assert d == 1
println("Simple correctness check passed")
Expand Down
2 changes: 1 addition & 1 deletion prototype/sort_benchmark.jl
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ display(@benchmark sort!($d) setup=(rand!(d)))

println("AcceleratedKernels Sort:")
temp = similar(d)
display(@benchmark AK.merge_sort!($d, temp=temp) setup=(rand!(d)))
display(@benchmark AK.merge_sort!($d, temp=temp, block_size=256) setup=(rand!(d)))

println("BUC / CUDA Thrust Sort:")
display(@benchmark buc_sort!($d) setup=(rand!(d)))
Expand Down
28 changes: 21 additions & 7 deletions prototype/truth_test.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,16 @@ using Profile
using PProf

using KernelAbstractions
using oneAPI
# using oneAPI
using CUDA

import AcceleratedKernels as AK


Random.seed!(0)


v = oneArray(1:100)
v = CuArray(1:100)

@assert AK.any(x->x<0, v, cooperative=false) === false
@assert AK.any(x->x>99, v, cooperative=false) === true
Expand All @@ -23,16 +24,29 @@ println("simple any tests passed")
@assert AK.all(x->x<100, v, cooperative=false) === false
println("simple all tests passed")

@assert AK.any(x->x<0, v, cooperative=true) === false
@assert AK.any(x->x>99, v, cooperative=true) === true
println("simple any tests passed")

@assert AK.all(x->x>0, v, cooperative=true) === true
@assert AK.all(x->x<100, v, cooperative=true) === false
println("simple all tests passed")




v = CuArray(1:10_000_000)

v = oneArray(1:10_000_000)
println("AcceleratedKernels any (reduce based):")
display(@benchmark(AK.any(x->x>9_999, v, cooperative=false)))

println("AcceleratedKernels any:")
display(@benchmark(AK.any(x->x>9_999_999, v, cooperative=false)))
println("AcceleratedKernels any (coop based):")
display(@benchmark(AK.any(x->x>9_999, v, cooperative=true)))

println("oneAPI minimum:")
display(@benchmark(any(x->x>9_999_999, v)))
display(@benchmark(any(x->x>9_999, v)))

println("CPU minimum:")
vh = Array(v)
display(@benchmark(any(x->x>9_999_999, vh)))
display(@benchmark(any(x->x>9_999, vh)))

2 changes: 1 addition & 1 deletion src/accumulate.jl
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ end
offset = 1
next_pow2 = block_size * 2
d = next_pow2 >> 1
while d > 0
while d > 0 # TODO: unroll this like in reduce.jl ?
@synchronize()

if ithread < d
Expand Down
Loading

0 comments on commit 3945382

Please sign in to comment.