From 3b97437277bdededc084f8046a6a27765db8280f Mon Sep 17 00:00:00 2001
From: Tim Besard
Date: Wed, 6 Dec 2023 17:57:55 +0100
Subject: [PATCH] Support more kwarg syntax with kernel launches (#380)

Also clean-up KA.jl back-end.
---
 Project.toml              |   4 +-
 src/compiler/execution.jl |  10 ++-
 src/mapreduce.jl          |   4 +-
 src/oneAPIKernels.jl      | 172 ++++++++++++++++++--------------------
 test/execution.jl         |   4 +
 5 files changed, 100 insertions(+), 94 deletions(-)

diff --git a/Project.toml b/Project.toml
index 13ce41eb..7e0b0645 100644
--- a/Project.toml
+++ b/Project.toml
@@ -19,7 +19,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 SPIRV_LLVM_Translator_unified_jll = "85f0d8ed-5b39-5caa-b1ae-7472de402361"
 SPIRV_Tools_jll = "6ac6d60f-d740-5983-97d7-a4482c0689f4"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
-UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
+StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 oneAPI_Level_Zero_Headers_jll = "f4bc562b-d309-54f8-9efb-476e56f0410d"
 oneAPI_Level_Zero_Loader_jll = "13eca655-d68d-5b81-8367-6d99d727ab01"
 oneAPI_Support_jll = "b049733a-a71d-5ed3-8eba-7d323ac00b36"
@@ -36,7 +36,7 @@ NEO_jll = "=23.17.26241"
 Preferences = "1"
 SPIRV_LLVM_Translator_unified_jll = "0.3"
 SpecialFunctions = "1.3, 2"
-UnsafeAtomicsLLVM = "0.1"
+StaticArrays = "1"
 julia = "1.8"
 oneAPI_Level_Zero_Loader_jll = "1.9"
 oneAPI_Support_jll = "~0.2.2"
diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl
index 508c7c70..0d7f8fb7 100644
--- a/src/compiler/execution.jl
+++ b/src/compiler/execution.jl
@@ -9,7 +9,15 @@ const LAUNCH_KWARGS = [:groups, :items, :queue]
 
 macro oneapi(ex...)
     call = ex[end]
-    kwargs = ex[1:end-1]
+    kwargs = map(ex[1:end-1]) do kwarg
+        if kwarg isa Symbol
+            :($kwarg = $kwarg)
+        elseif Meta.isexpr(kwarg, :(=))
+            kwarg
+        else
+            throw(ArgumentError("Invalid keyword argument '$kwarg'"))
+        end
+    end
 
     # destructure the kernel call
     Meta.isexpr(call, :call) || throw(ArgumentError("second argument to @oneapi should be a function call"))
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index b807b285..8e9f8c7c 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -163,7 +163,7 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::oneWrappedArray{T},
     # perform the actual reduction
     if reduce_groups == 1
         # we can cover the dimensions to reduce using a single group
-        @oneapi items=items groups=groups partial_mapreduce_device(
+        @oneapi items groups partial_mapreduce_device(
             f, op, init, Val(items), Rreduce, Rother, R′, A)
     else
         # we need multiple steps to cover all values to reduce
@@ -172,7 +172,7 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::oneWrappedArray{T},
             # without an explicit initializer we need to copy from the output container
             partial .= R
         end
-        @oneapi items=items groups=groups partial_mapreduce_device(
+        @oneapi items groups partial_mapreduce_device(
             f, op, init, Val(items), Rreduce, Rother, partial, A)
 
         GPUArrays.mapreducedim!(identity, op, R′, partial; init=init)
diff --git a/src/oneAPIKernels.jl b/src/oneAPIKernels.jl
index f8427568..c2208f28 100644
--- a/src/oneAPIKernels.jl
+++ b/src/oneAPIKernels.jl
@@ -1,49 +1,55 @@
 module oneAPIKernels
 
-import KernelAbstractions
-import oneAPI
-import oneAPI: oneL0, @device_override
-import GPUCompiler
+using ..oneAPI
+using ..oneAPI: @device_override
 
-import UnsafeAtomicsLLVM
+
+import KernelAbstractions as KA
+
+import StaticArrays
+
+import Adapt
+
+
+## Back-end Definition
 
-struct oneAPIBackend <: KernelAbstractions.GPU
-end
 export oneAPIBackend
 
-KernelAbstractions.allocate(::oneAPIBackend, ::Type{T}, dims::Tuple) where T = oneAPI.oneArray{T}(undef, dims)
-KernelAbstractions.zeros(::oneAPIBackend, ::Type{T}, dims::Tuple) where T = oneAPI.zeros(T, dims)
-KernelAbstractions.ones(::oneAPIBackend, ::Type{T}, dims::Tuple) where T = oneAPI.ones(T, dims)
+struct oneAPIBackend <: KA.GPU
+end
 
-# Import through parent
-import KernelAbstractions: StaticArrays, Adapt
-import .StaticArrays: MArray
+KA.allocate(::oneAPIBackend, ::Type{T}, dims::Tuple) where T = oneArray{T}(undef, dims)
+KA.zeros(::oneAPIBackend, ::Type{T}, dims::Tuple) where T = oneAPI.zeros(T, dims)
+KA.ones(::oneAPIBackend, ::Type{T}, dims::Tuple) where T = oneAPI.ones(T, dims)
 
-KernelAbstractions.get_backend(::oneAPI.oneArray) = oneAPIBackend()
+KA.get_backend(::oneArray) = oneAPIBackend()
 # TODO should be non-blocking
-KernelAbstractions.synchronize(::oneAPIBackend) = oneL0.synchronize()
-KernelAbstractions.supports_float64(::oneAPIBackend) = false # TODO is this device dependent?
+KA.synchronize(::oneAPIBackend) = oneL0.synchronize()
+KA.supports_float64(::oneAPIBackend) = false # TODO: Check if this is device dependent
 
-Adapt.adapt_storage(::oneAPIBackend, a::Array) = Adapt.adapt(oneAPI.oneArray, a)
-Adapt.adapt_storage(::oneAPIBackend, a::oneAPI.oneArray) = a
-Adapt.adapt_storage(::KernelAbstractions.CPU, a::oneAPI.oneArray) = convert(Array, a)
+Adapt.adapt_storage(::oneAPIBackend, a::Array) = Adapt.adapt(oneArray, a)
+Adapt.adapt_storage(::oneAPIBackend, a::oneArray) = a
+Adapt.adapt_storage(::KA.CPU, a::oneArray) = convert(Array, a)
 
-##
-# copyto!
-##
+
+## Memory Operations
 
-function KernelAbstractions.copyto!(::oneAPIBackend, A, B)
+function KA.copyto!(::oneAPIBackend, A, B)
     copyto!(A, B)
-    # TODO device to host copies in oneAPI.jl are synchronizing.
+    # TODO: Address device to host copies in jl being synchronizing
 end
 
-import KernelAbstractions: Kernel, StaticSize, DynamicSize, partition, blocks, workitems, launch_config
 
-###
-# Kernel launch
-###
-function launch_config(kernel::Kernel{oneAPIBackend}, ndrange, workgroupsize)
+## Kernel Launch
+
+function KA.mkcontext(kernel::KA.Kernel{oneAPIBackend}, _ndrange, iterspace)
+    KA.CompilerMetadata{KA.ndrange(kernel), KA.DynamicCheck}(_ndrange, iterspace)
+end
+function KA.mkcontext(kernel::KA.Kernel{oneAPIBackend}, I, _ndrange, iterspace,
+                      ::Dynamic) where Dynamic
+    KA.CompilerMetadata{KA.ndrange(kernel), Dynamic}(I, _ndrange, iterspace)
+end
+
+function KA.launch_config(kernel::KA.Kernel{oneAPIBackend}, ndrange, workgroupsize)
     if ndrange isa Integer
         ndrange = (ndrange,)
     end
@@ -52,16 +58,16 @@ function launch_config(kernel::Kernel{oneAPIBackend}, ndrange, workgroupsize)
     end
 
     # partition checked that the ndrange's agreed
-    if KernelAbstractions.ndrange(kernel) <: StaticSize
+    if KA.ndrange(kernel) <: KA.StaticSize
        ndrange = nothing
     end
 
-    iterspace, dynamic = if KernelAbstractions.workgroupsize(kernel) <: DynamicSize &&
+    iterspace, dynamic = if KA.workgroupsize(kernel) <: KA.DynamicSize &&
                             workgroupsize === nothing
         # use ndrange as preliminary workgroupsize for autotuning
-        partition(kernel, ndrange, ndrange)
+        KA.partition(kernel, ndrange, ndrange)
     else
-        partition(kernel, ndrange, workgroupsize)
+        KA.partition(kernel, ndrange, workgroupsize)
     end
 
     return ndrange, workgroupsize, iterspace, dynamic
@@ -76,108 +82,96 @@ function threads_to_workgroupsize(threads, ndrange)
     end
 end
 
-function (obj::Kernel{oneAPIBackend})(args...; ndrange=nothing, workgroupsize=nothing)
-    ndrange, workgroupsize, iterspace, dynamic = launch_config(obj, ndrange, workgroupsize)
+function (obj::KA.Kernel{oneAPIBackend})(args...; ndrange=nothing, workgroupsize=nothing)
+    ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, ndrange, workgroupsize)
 
     # this might not be the final context, since we may tune the workgroupsize
-    ctx = mkcontext(obj, ndrange, iterspace)
-    kernel = oneAPI.@oneapi launch=false obj.f(ctx, args...)
+    ctx = KA.mkcontext(obj, ndrange, iterspace)
+    kernel = @oneapi launch=false obj.f(ctx, args...)
 
     # figure out the optimal workgroupsize automatically
-    if KernelAbstractions.workgroupsize(obj) <: DynamicSize && workgroupsize === nothing
+    if KA.workgroupsize(obj) <: KA.DynamicSize && workgroupsize === nothing
         items = oneAPI.suggest_groupsize(kernel.fun, prod(ndrange)).x
         # XXX: the z dimension of the suggested group size is often non-zero. use this?
         workgroupsize = threads_to_workgroupsize(items, ndrange)
-        iterspace, dynamic = partition(obj, ndrange, workgroupsize)
-        ctx = mkcontext(obj, ndrange, iterspace)
+        iterspace, dynamic = KA.partition(obj, ndrange, workgroupsize)
+        ctx = KA.mkcontext(obj, ndrange, iterspace)
     end
 
-    nblocks = length(blocks(iterspace))
-    threads = length(workitems(iterspace))
+    groups = length(KA.blocks(iterspace))
+    items = length(KA.workitems(iterspace))
 
-    if nblocks == 0
+    if groups == 0
         return nothing
     end
 
     # Launch kernel
-    kernel(ctx, args...; items=threads, groups=nblocks)
+    kernel(ctx, args...; items, groups)
 
     return nothing
 end
 
-import KernelAbstractions: CompilerMetadata, DynamicCheck, LinearIndices
-import KernelAbstractions: __index_Local_Linear, __index_Group_Linear, __index_Global_Linear, __index_Local_Cartesian, __index_Group_Cartesian, __index_Global_Cartesian, __validindex, __print
-import KernelAbstractions: mkcontext, expand, __iterspace, __ndrange, __dynamic_checkbounds
-function mkcontext(kernel::Kernel{oneAPIBackend}, _ndrange, iterspace)
-    metadata = CompilerMetadata{KernelAbstractions.ndrange(kernel), DynamicCheck}(_ndrange, iterspace)
-end
-function mkcontext(kernel::Kernel{oneAPIBackend}, I, _ndrange, iterspace, ::Dynamic) where Dynamic
-    metadata = CompilerMetadata{KernelAbstractions.ndrange(kernel), Dynamic}(I, _ndrange, iterspace)
-end
+## Indexing Functions
 
-@device_override @inline function __index_Local_Linear(ctx)
-    return oneAPI.get_local_id(0)
+@device_override @inline function KA.__index_Local_Linear(ctx)
+    return get_local_id(0)
 end
 
-@device_override @inline function __index_Group_Linear(ctx)
-    return oneAPI.get_group_id(0)
+@device_override @inline function KA.__index_Group_Linear(ctx)
+    return get_group_id(0)
 end
 
-@device_override @inline function __index_Global_Linear(ctx)
-    I = @inbounds expand(__iterspace(ctx), oneAPI.get_group_id(0), oneAPI.get_local_id(0))
-    # TODO: This is unfortunate, can we get the linear index cheaper
-    @inbounds LinearIndices(__ndrange(ctx))[I]
+@device_override @inline function KA.__index_Global_Linear(ctx)
+    return get_global_id(0)
 end
 
-@device_override @inline function __index_Local_Cartesian(ctx)
-    @inbounds workitems(__iterspace(ctx))[oneAPI.get_local_id(0)]
+@device_override @inline function KA.__index_Local_Cartesian(ctx)
+    @inbounds KA.workitems(KA.__iterspace(ctx))[get_local_id(0)]
end
 
-@device_override @inline function __index_Group_Cartesian(ctx)
-    @inbounds blocks(__iterspace(ctx))[oneAPI.get_group_id(0)]
+@device_override @inline function KA.__index_Group_Cartesian(ctx)
+    @inbounds KA.blocks(KA.__iterspace(ctx))[get_group_id(0)]
 end
 
-@device_override @inline function __index_Global_Cartesian(ctx)
-    return @inbounds expand(__iterspace(ctx), oneAPI.get_group_id(0), oneAPI.get_local_id(0))
+@device_override @inline function KA.__index_Global_Cartesian(ctx)
+    return @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(0), get_local_id(0))
 end
 
-@device_override @inline function __validindex(ctx)
-    if __dynamic_checkbounds(ctx)
-        I = @inbounds expand(__iterspace(ctx), oneAPI.get_group_id(0), oneAPI.get_local_id(0))
-        return I in __ndrange(ctx)
+@device_override @inline function KA.__validindex(ctx)
+    if KA.__dynamic_checkbounds(ctx)
+        I = @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(0), get_local_id(0))
+        return I in KA.__ndrange(ctx)
     else
         return true
     end
 end
 
-import KernelAbstractions: groupsize, __groupsize, __workitems_iterspace
-import KernelAbstractions: SharedMemory, Scratchpad, __synchronize, __size
 
-###
-# GPU implementation of shared memory
-###
-@device_override @inline function SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
+## Shared and Scratch Memory
+
+@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
     ptr = oneAPI.emit_localmemory(T, Val(prod(Dims)))
-    oneAPI.oneDeviceArray(Dims, ptr)
+    oneDeviceArray(Dims, ptr)
 end
 
-###
-# GPU implementation of scratch memory
-# - private memory for each workitem
-###
-
-@device_override @inline function Scratchpad(ctx, ::Type{T}, ::Val{Dims}) where {T, Dims}
-    StaticArrays.MArray{__size(Dims), T}(undef)
+@device_override @inline function KA.Scratchpad(ctx, ::Type{T}, ::Val{Dims}) where {T, Dims}
+    StaticArrays.MArray{KA.__size(Dims), T}(undef)
 end
 
-@device_override @inline function __synchronize()
-    oneAPI.barrier()
+
+## Synchronization and Printing
+
+@device_override @inline function KA.__synchronize()
+    barrier()
 end
 
-@device_override @inline function __print(args...)
+@device_override @inline function KA.__print(args...)
     oneAPI._print(args...)
 end
 
-KernelAbstractions.argconvert(::Kernel{oneAPIBackend}, arg) = oneAPI.kernel_convert(arg)
+
+## Other
+
+KA.argconvert(::KA.Kernel{oneAPIBackend}, arg) = kernel_convert(arg)
 
 end
diff --git a/test/execution.jl b/test/execution.jl
index 2910ce0b..cddfc8ab 100644
--- a/test/execution.jl
+++ b/test/execution.jl
@@ -20,10 +20,14 @@ end
 @testset "launch configuration" begin
     @oneapi dummy()
 
+    items = 1
+    @oneapi items dummy()
     @oneapi items=1 dummy()
     @oneapi items=(1,1) dummy()
     @oneapi items=(1,1,1) dummy()
 
+    groups = 1
+    @oneapi groups dummy()
     @oneapi groups=1 dummy()
     @oneapi groups=(1,1) dummy()
     @oneapi groups=(1,1,1) dummy()
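
For reference, a sketch of the launch syntax this patch accepts (not part of the commit; it
reuses the `dummy` kernel from test/execution.jl with arbitrary launch sizes):

    using oneAPI

    function dummy()
        return
    end

    items = 2
    groups = 1

    # already supported: explicit key=value pairs
    @oneapi items=2 groups=1 dummy()

    # new in this patch: a bare symbol is expanded to `name = name` by the macro,
    # so this launch is equivalent to `@oneapi items=items groups=groups dummy()`
    @oneapi items groups dummy()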
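
The reworked KernelAbstractions.jl back-end can be exercised in the same spirit; a minimal
sketch, assuming an Intel GPU is available (the `scale!` kernel, array size, and scaling
factor are made up for illustration):

    using oneAPI, KernelAbstractions

    @kernel function scale!(a, b)
        i = @index(Global)
        @inbounds a[i] *= b
    end

    a = oneArray(ones(Float32, 1024))
    backend = KernelAbstractions.get_backend(a)   # oneAPIBackend()

    scale!(backend)(a, 2f0; ndrange=length(a))    # items/groups are picked automatically
    KernelAbstractions.synchronize(backend)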