added benchmark system details, tested benchmarks on CUDA

JuliaGPU · Jun 18, 2024 · 3945382 · 3945382
1 parent 817f06f
commit 3945382
Show file tree

Hide file tree

Showing 6 changed files with 27 additions and 331 deletions.
diff --git a/README.md b/README.md
@@ -23,7 +23,7 @@ AcceleratedKernels.jl will also be a fundamental building block of applications
 
 
 ## Benchmark
-See `protoype/sort_benchmark.jl` for the benchmark code and `prototype/thrust_sort` for the Thrust wrapper.
+See `prototype/sort_benchmark.jl` for the benchmark code and `prototype/thrust_sort` for the Thrust wrapper. The results below are from a system with Linux 6.6.30-2-MANJARO, Intel Core i9-10885H CPU, Nvidia Quadro RTX 4000 with Max-Q Design GPU, Thrust 1.17.1-1, Julia Version 1.10.4.
 
 ![Sorting benchmark](https://github.com/anicusan/AcceleratedKernels.jl/blob/main/docs/src/static/sort_benchmark.png?raw=true)
 

diff --git a/prototype/reduce_test.jl b/prototype/reduce_test.jl
@@ -5,7 +5,7 @@ using Profile
 using PProf
 
 using KernelAbstractions
-using oneAPI
+using CUDA
 
 import AcceleratedKernels as AK
 
@@ -19,12 +19,12 @@ function redmin(s)
         s;
         init=typemax(eltype(s)),
         block_size=256,
-        switch_below=10_000,
+        switch_below=0,
     )
 end
 
 
-s = oneArray(shuffle(1:1_000_000))
+s = CuArray(shuffle(1:1_000_000))
 d = redmin(s)
 @assert d == 1
 println("Simple correctness check passed")

diff --git a/prototype/sort_benchmark.jl b/prototype/sort_benchmark.jl
@@ -22,7 +22,7 @@ display(@benchmark sort!($d) setup=(rand!(d)))
 
 println("AcceleratedKernels Sort:")
 temp = similar(d)
-display(@benchmark AK.merge_sort!($d, temp=temp) setup=(rand!(d)))
+display(@benchmark AK.merge_sort!($d, temp=temp, block_size=256) setup=(rand!(d)))
 
 println("BUC / CUDA Thrust Sort:")
 display(@benchmark buc_sort!($d) setup=(rand!(d)))

diff --git a/prototype/truth_test.jl b/prototype/truth_test.jl
@@ -5,15 +5,16 @@ using Profile
 using PProf
 
 using KernelAbstractions
-using oneAPI
+# using oneAPI
+using CUDA
 
 import AcceleratedKernels as AK
 
 
 Random.seed!(0)
 
 
-v = oneArray(1:100)
+v = CuArray(1:100)
 
 @assert AK.any(x->x<0, v, cooperative=false) === false
 @assert AK.any(x->x>99, v, cooperative=false) === true
@@ -23,16 +24,29 @@ println("simple any tests passed")
 @assert AK.all(x->x<100, v, cooperative=false) === false
 println("simple all tests passed")
 
+@assert AK.any(x->x<0, v, cooperative=true) === false
+@assert AK.any(x->x>99, v, cooperative=true) === true
+println("simple any tests passed")
+
+@assert AK.all(x->x>0, v, cooperative=true) === true
+@assert AK.all(x->x<100, v, cooperative=true) === false
+println("simple all tests passed")
+
+
+
+
+v = CuArray(1:10_000_000)
 
-v = oneArray(1:10_000_000)
+println("AcceleratedKernels any (reduce based):")
+display(@benchmark(AK.any(x->x>9_999, v, cooperative=false)))
 
-println("AcceleratedKernels any:")
-display(@benchmark(AK.any(x->x>9_999_999, v, cooperative=false)))
+println("AcceleratedKernels any (coop based):")
+display(@benchmark(AK.any(x->x>9_999, v, cooperative=true)))
 
 println("oneAPI minimum:")
-display(@benchmark(any(x->x>9_999_999, v)))
+display(@benchmark(any(x->x>9_999, v)))
 
 println("CPU minimum:")
 vh = Array(v)
-display(@benchmark(any(x->x>9_999_999, vh)))
+display(@benchmark(any(x->x>9_999, vh)))
 
diff --git a/src/accumulate.jl b/src/accumulate.jl
@@ -57,7 +57,7 @@ end
     offset = 1
     next_pow2 = block_size * 2
     d = next_pow2 >> 1
-    while d > 0
+    while d > 0             # TODO: unroll this like in reduce.jl ?
         @synchronize()
 
         if ithread < d