Implemented Callback sample (#87)

* Implemented callback sample * Minor fixes from code review * Minor fixes from code review II.
KhronosGroup · Dec 5, 2023 · cc5e561 · cc5e561
1 parent de7cb8a
commit cc5e561
Show file tree

Hide file tree

Showing 10 changed files with 1,301 additions and 4 deletions.
diff --git a/lib/include/CL/Utils/Context.hpp b/lib/include/CL/Utils/Context.hpp
@@ -13,5 +13,7 @@ namespace util {
     Context UTILSCPP_EXPORT get_context(cl_uint plat_id, cl_uint dev_id,
                                         cl_device_type type,
                                         cl_int* error = nullptr);
+
+    void UTILSCPP_EXPORT print_device_info(const cl::Device& device);
 }
 }
diff --git a/lib/src/Utils/Context.cpp b/lib/src/Utils/Context.cpp
@@ -1,6 +1,9 @@
 // OpenCL SDK includes
 #include <CL/Utils/Context.hpp>
 
+#include <iostream>
+#include <string>
+
 cl::Context cl::util::get_context(cl_uint plat_id, cl_uint dev_id,
                                   cl_device_type type, cl_int* error)
 {
@@ -40,3 +43,16 @@ cl::Context cl::util::get_context(cl_uint plat_id, cl_uint dev_id,
 
     return cl::Context{};
 }
+
+void cl::util::print_device_info(const cl::Device& device)
+{
+    const cl::Platform platform(device.getInfo<CL_DEVICE_PLATFORM>());
+    const std::string platform_vendor = platform.getInfo<CL_PLATFORM_VENDOR>();
+    const std::string device_name = device.getInfo<CL_DEVICE_NAME>();
+    const std::string device_opencl_c_version =
+        device.getInfo<CL_DEVICE_OPENCL_C_VERSION>();
+    std::cout << "Selected platform by " << platform_vendor
+              << "\nSelected device: " << device_name << '\n'
+              << device_opencl_c_version << '\n'
+              << std::endl;
+}
diff --git a/samples/core/CMakeLists.txt b/samples/core/CMakeLists.txt
@@ -12,10 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-add_subdirectory(enumopencl)
+add_subdirectory(binaries)
+add_subdirectory(blur)
+add_subdirectory(callback)
 add_subdirectory(copybuffer)
 add_subdirectory(copybufferkernel)
-add_subdirectory(saxpy)
+add_subdirectory(enumopencl)
 add_subdirectory(reduce)
-add_subdirectory(blur)
-add_subdirectory(binaries)
+add_subdirectory(saxpy)
diff --git a/samples/core/callback/CMakeLists.txt b/samples/core/callback/CMakeLists.txt
@@ -0,0 +1,53 @@
+# Copyright (c) 2023 The Khronos Group Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+find_package(Threads)
+
+file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/threads.c"
+"#include <threads.h>
+#include <time.h>
+int main(void) { thrd_sleep(&(struct timespec){.tv_nsec=1}, NULL); }
+")
+
+# Signature can be modernized at CMake version 3.25
+try_compile(
+    HAS_C11_THREADS
+    "${CMAKE_CURRENT_BINARY_DIR}"
+    SOURCES "${CMAKE_CURRENT_BINARY_DIR}/threads.c"
+    C_STANDARD 11
+    C_STANDARD_REQUIRED ON
+)
+
+if (HAS_C11_THREADS)
+    add_sample(
+        TEST
+        TARGET callback
+        VERSION 300
+        SOURCES main.c
+        KERNELS reaction_diffusion.cl)
+    target_link_libraries(callback PRIVATE
+        $<TARGET_NAME_IF_EXISTS:Threads::Threads>)
+else()
+    message(WARNING
+    "Skipping callback sample, C11 standard threads are not supported with the current toolset")
+endif()
+
+add_sample(
+    TEST
+    TARGET callbackcpp
+    VERSION 300
+    SOURCES main.cpp
+    KERNELS reaction_diffusion.cl)
+target_link_libraries(callbackcpp PRIVATE
+    $<TARGET_NAME_IF_EXISTS:Threads::Threads>)
diff --git a/samples/core/callback/README.md b/samples/core/callback/README.md
@@ -0,0 +1,104 @@
+# Callback sample - synchronizing Command Queues with Events
+
+## Sample purpose
+
+This sample demonstrates how to synchronize the execution of multiple command queues with each other and with host-side calculations.
+
+## Key APIs and Concepts
+
+If all operations are enqueued on a single command queue, the execution of the operations is sequential, and the ordering of the operations matches the order in which they were enqueued (unless `CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE` was specified).
+
+However, sometimes it is more performant to enqueue unrelated operations to separate command queues, enabling the parallel execution capabilities of the device. The primer utility of synchronization across command queues are **OpenCL event objects**. Each enqueue operation can take a list of events which it will synchronize with. Also, each enqueue operation can emit a single event object that can be used to synchronize other operations with that one. Additionally, using the **callback functionality**, host code can be executed at various points in the event's lifetime.
+
+### Kernel logic
+
+The sample implements the simulation of a theoretical chemical reaction, namely the [Gray-Scott model](https://groups.csail.mit.edu/mac/projects/amorphous/GrayScott/) of the [reaction-diffusion system](https://en.wikipedia.org/wiki/Reaction%E2%80%93diffusion_system). Explaining the algorithm is not the goal of this example. We only need to know, that the simulation progresses iteratively: given a 2D lattice of the concentration of the $U$ and $V$ chemicals (stored in the R and G channels of an image texture respectively), the subsequent state is calculated by the kernel. The algorithm results in visually interesting shapes, with the order being obvious to the viewer: the more "spread" chemical $V$ is, the later iteration step we observe.
+
+## Application flow
+
+### Overview
+
+From an initial state, the iteration progresses through a fixed number of steps. The number of steps can be specified on the command line. Two 2D image objects are used to store the state of the simulation, one for the input and another for the output state, and the images are swapped after each step. At every Nth iteration (N being controlled from the command line), the state of the simulation is dispatched for writing to the disk as a PNG image.
+
+### Command queues and synchronization
+
+<p align="center">
+    <img alt="Sequence diagram of the sample's execution flow" src="callback_sample_sequence.svg"/>
+</p>
+
+In this example, kernel launches, image-to-buffer copies (device-to-device copy) and buffer reads (device-to-host copy) are performed. Some systems allow the overlapping of these operations, thereby it makes sense to enqueue all three of these operation types to separate command queues. The journey of each iteration state is the following:
+
+1. The state is calculated from the previous state by the `reaction_diffusion_step` kernel. The kernel launch synchronizes with the previous image-to-buffer copy, which could have happened in the previous iteration, but also quite a few iterations ago.
+2. If the iteration index is a multiple of N, a device buffer object and a host vector of the same size are allocated. Otherwise, the next iteration starts calculating at step 1.
+3.  A copy of the output image to the allocated buffer object is enqueued. Eventually the simulation state is read to host memory, but since device-to-device copy is usually faster than device-to-host copy, first the previous output image is copied to a buffer and this buffer is read to the host in step 4. Note, that the copy can be performed concurrently with the kernel launch of the next iteration, since they both read from the same image object. Only the subsequent iteration's compute launch has to synchronize with this copy.
+4. The read of the buffer (i.e. device-to-host copy) is enqueued on the read queue. If the device has concurrent copy capabilities, this read potentially overlaps with a previous copy (step 3.) operation.
+5. After the read of the buffer is completed, its contents need to be written to an image file. This has to synchronize with the read operation, but this time, we need to execute host code instead of an OpenCL enqueue. For that, we set the completion callback of the event produced by the read enqueue. A `void*` argument is passed to the callback, which is used to identify the host vector containing the data. 
+6. The callback is executed on a thread used by the OpenCL runtime. Therefore it is advised that the callback returns as quickly as possible. To achieve this, the image write is dispatched to a different thread, using `std::async` in the C++ version of the sample, and `thrd_create` in the C version.
+7. When the image write has finished, the completion is signaled back to the waiting main thread, otherwise the executable would possibly exit before completion. For this purpose `std::future::wait` is used in the C++ version, and a conditional variable in the C version.
+
+## Used API surface (C++)
+
+```c++
+cl::Buffer::Buffer(cl::Context, cl_mem_flags, std::size_t size)
+cl::CommandQueue::enqueueCopyImageToBuffer(cl::Image2D, cl::Buffer, std::array<size_type, 3>,
+                                           std::array<size_type, 3>, std::size_t,
+                                           std::vector<cl::Event>*, cl::Event*)
+cl::CommandQueue::enqueueFillImage(cl::Image2D, cl_float4, std::array<size_type, 3>, std::array<size_type, 3>)
+cl::CommandQueue::enqueueReadBuffer(cl::Buffer, bool, std::size_t, std::size_t,
+                                    void*, std::vector<cl::Event>*, cl::Event*)
+cl::Context::getInfo<CL_CONTEXT_DEVICES>()
+cl::Context::getSupportedImageFormats(cl_mem_flags, cl_mem_object_type, std::vector<cl::ImageFormat>*)
+cl::Device::getInfo<CL_DEVICE_NAME>()
+cl::Device::getInfo<CL_DEVICE_PLATFORM>()
+cl::EnqueueArgs::EnqueueArgs(cl::CommandQueue, cl::Event, cl::NDRange)
+cl::Event::Event()
+cl::Event::Event(cl::Event)
+cl::Event::setCallback(cl_int, void(*)(cl_event, cl_int, void*), void*)
+cl::Image2D::Image2D(cl::Context, cl_mem_flags, cl::ImageFormat, std::size_t, std::size_t)
+cl::ImageFormat::ImageFormat(cl_channel_order, cl_channel_type)
+cl::KernelFunctor<cl::Image2D, cl::Image2D>(cl::Program, std::string)
+cl::NDRange(std::size_t, std::size_t)
+cl::Platform::getInfo<CL_PLATFORM_VENDOR>()
+cl::Platform::Platform(cl_platform)
+cl::Program::build(cl::Device)
+cl::Program::Program(cl::Context, std::string)
+cl::sdk::comprehend()
+cl::sdk::parse()
+cl::sdk::parse_cli()
+cl::UserEvent::setStatus(cl_int)
+cl::UserEvent::UserEvent(cl::Context)
+```
+
+## Used API surface (C)
+
+```c
+clCreateBuffer
+clCreateCommandQueueWithProperties
+clCreateContext
+clCreateImage
+clCreateKernel
+clCreateProgramWithSource
+clEnqueueCopyImageToBuffer
+clEnqueueFillImage
+clEnqueueNDRangeKernel
+clEnqueueReadBuffer
+clGetDeviceInfo
+clGetSupportedImageFormats
+clReleaseCommandQueue
+clReleaseContext
+clReleaseDevice
+clReleaseEvent
+clReleaseEvent
+clReleaseKernel
+clReleaseMemObject
+clReleaseProgram
+clSetEventCallback
+clSetKernelArg
+cnd_destroy
+cnd_signal
+mtx_destroy
+mtx_lock
+mtx_unlock
+thrd_create
+thrd_detach
+```