diff --git a/samples/core/multi-device/CMakeLists.txt b/samples/core/multi-device/CMakeLists.txt index 449caba2..ccf8933b 100644 --- a/samples/core/multi-device/CMakeLists.txt +++ b/samples/core/multi-device/CMakeLists.txt @@ -12,12 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -#add_sample( -# TEST -# TARGET multidevice -# VERSION 300 -# SOURCES main.c -# KERNELS convolution.cl) +add_sample( + TEST + TARGET multidevice + VERSION 300 + SOURCES main.c + KERNELS convolution.cl) add_sample( TEST diff --git a/samples/core/multi-device/main.c b/samples/core/multi-device/main.c new file mode 100644 index 00000000..9292b57b --- /dev/null +++ b/samples/core/multi-device/main.c @@ -0,0 +1,693 @@ +/* + * Copyright (c) 2023 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// OpenCL SDK includes. +#include +#include +#include +#include + +// OpenCL Utils includes. +#include +#include +#include + +// Standard header includes. +#include +#include +#include +#include + +// Sample-specific options. +struct convolution_options +{ + size_t x_dim; + size_t y_dim; +}; + +// Add option to CLI-parsing SDK utility for input dimensions. +cag_option ConvolutionOptions[] = { { .identifier = 'x', + .access_letters = "x", + .access_name = "x_dim", + .value_name = "(positive integral)", + .description = "x dimension of input" }, + + { .identifier = 'y', + .access_letters = "y", + .access_name = "y_dim", + .value_name = "(positive integral)", + .description = "y dimension of input" } }; + +ParseState parse_ConvolutionOptions(const char identifier, + cag_option_context* cag_context, + struct convolution_options* opts) +{ + const char* value; + + switch (identifier) + { + case 'x': + if ((value = cag_option_get_value(cag_context))) + { + opts->x_dim = strtoull(value, NULL, 0); + return ParsedOK; + } + else + return ParseError; + case 'y': + if ((value = cag_option_get_value(cag_context))) + { + opts->y_dim = strtoull(value, NULL, 0); + return ParsedOK; + } + else + return ParseError; + } + return NotParsed; +} + +// Add option to CLI parsing SDK utility for multi-device type. +cag_option MultiDeviceOptions[] = { { .identifier = 't', + .access_letters = "t", + .access_name = "type", + .value_name = "(all|cpu|gpu|acc|cus|def)", + .description = + "Type of device to use" } }; + +ParseState parse_MultiDeviceOptions(const char identifier, + cag_option_context* cag_context, + struct cl_sdk_options_MultiDevice* opts) +{ + const char* value; + + switch (identifier) + { + case 't': + if ((value = cag_option_get_value(cag_context))) + { + // If the user selects a device type, query all devices and + // filter the ones of that type. + struct cl_sdk_options_DeviceTriplet* triplets; + size_t number = 0; + cl_device_type dev_type = get_dev_type(value); + + // Get platforms IDs. 
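+ // The usual two-step OpenCL query idiom follows: call clGetPlatformIDs once
+ // to obtain the number of platforms, allocate storage for that many IDs,
+ // then call it again to fetch the actual platform IDs.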
+ cl_uint num_platforms = 0; + clGetPlatformIDs(0, NULL, &num_platforms); + + cl_platform_id* platforms = (cl_platform_id*)malloc( + num_platforms * sizeof(cl_platform_id)); + clGetPlatformIDs(num_platforms, platforms, NULL); + + // Calculate total number of triplets to add. + for (cl_uint platform_id = 0; platform_id < num_platforms; + ++platform_id) + { + // Get devices IDs. + cl_uint num_devices = 0; + clGetDeviceIDs(platforms[platform_id], dev_type, 0, NULL, + &num_devices); + number += num_devices; + } + + if (!number) + { + fprintf(stderr, + "Error: No OpenCL devices of type %s available", + value); + return ParseError; + } + else if (number < 2) + { + printf("Not enough OpenCL devices of type %s available for " + "multi-device. Using only one device.", + value); + } + + // Register triplets {platform, device, device_type} for each + // device in each platform. + triplets = malloc( + number * sizeof(struct cl_sdk_options_DeviceTriplet)); + number = 0; + for (cl_uint platform_id = 0; platform_id < num_platforms; + ++platform_id) + { + cl_uint num_devices; + clGetDeviceIDs(platforms[platform_id], dev_type, 0, NULL, + &num_devices); + + // Register triplets. + for (cl_uint device_id = 0; device_id < num_devices; + ++device_id, ++number) + { + struct cl_sdk_options_DeviceTriplet triplet = { + platform_id, device_id, dev_type + }; + triplets[number] = triplet; + } + } + + opts->triplets = triplets; + opts->number = number; + + return ParsedOK; + } + else + { + return ParseError; + } + } + return NotParsed; +} + +cl_int parse_options(int argc, char* argv[], + struct cl_sdk_options_Diagnostic* diag_opts, + struct cl_sdk_options_MultiDevice* devs_opts, + struct convolution_options* convolution_opts) +{ + cl_int error = CL_SUCCESS; + struct cag_option *opts = NULL, *tmp = NULL; + size_t n = 0; + + // Prepare options array. + MEM_CHECK(opts = add_CLI_options(opts, &n, DiagnosticOptions, + CAG_ARRAY_SIZE(DiagnosticOptions)), + error, end); + MEM_CHECK(tmp = add_CLI_options(opts, &n, MultiDeviceOptions, + CAG_ARRAY_SIZE(MultiDeviceOptions)), + error, end); + opts = tmp; + MEM_CHECK(tmp = add_CLI_options(opts, &n, ConvolutionOptions, + CAG_ARRAY_SIZE(ConvolutionOptions)), + error, end); + opts = tmp; + + char identifier; + cag_option_context cag_context; + + // Prepare the context and iterate over all options. + cag_option_prepare(&cag_context, opts, n, argc, argv); + while (cag_option_fetch(&cag_context)) + { + ParseState state = NotParsed; + identifier = cag_option_get(&cag_context); + + PARS_OPTIONS(parse_DiagnosticOptions(identifier, diag_opts), state); + PARS_OPTIONS( + parse_MultiDeviceOptions(identifier, &cag_context, devs_opts), + state); + PARS_OPTIONS(parse_ConvolutionOptions(identifier, &cag_context, + convolution_opts), + state); + + if (identifier == 'h') + { + printf("Usage: multidevice [OPTION]...\n"); + printf("Option name and value should be separated by '=' or a " + "space\n"); + printf( + "Demonstrates convolution calculation with two devices.\n\n"); + cag_option_print(opts, n, stdout); + exit((state == ParseError) ? CL_INVALID_ARG_VALUE : CL_SUCCESS); + } + } + +end: + free(opts); + return error; +} + +// Host-side implementation of the convolution for verification. Padded input +// assumed. 
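+ // For every output element the 3x3 mask is applied to the padded input:
+ //   out[x + y * x_dim] = sum over (i, j) in [0, 3) x [0, 3) of
+ //       mask[i + j * mask_dim] * in[(x + i) + (y + j) * pad_x_dim]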
+void host_convolution(cl_float* in, cl_float* out, cl_float* mask, size_t x_dim, + size_t y_dim) +{ + const cl_uint mask_dim = 3; + const cl_uint pad_width = mask_dim / 2; + const size_t pad_x_dim = x_dim + 2 * pad_width; + for (size_t x = 0; x < x_dim; ++x) + { + for (size_t y = 0; y < y_dim; ++y) + { + float result = 0.f; + for (size_t grid_column = x, mask_column = 0; + mask_column < mask_dim; ++grid_column, ++mask_column) + { + for (size_t grid_row = y, mask_row = 0; mask_row < mask_dim; + ++grid_row, ++mask_row) + { + result += mask[mask_column + mask_row * mask_dim] + * in[grid_column + grid_row * pad_x_dim]; + } + } + out[x + y * x_dim] = result; + } + } +} + +int main(int argc, char* argv[]) +{ + cl_int error = CL_SUCCESS; + cl_int end_error = CL_SUCCESS; + cl_platform_id platform1, platform2; + cl_device_id dev1, dev2; + cl_context context1, context2; + cl_command_queue queue1, queue2 = NULL; + + cl_program program1, program2; + + // Parse command-line options. + struct cl_sdk_options_Diagnostic diag_opts = { .quiet = false, + .verbose = false }; + + // By default assume that there is only one device available. + // dev_opts->number is set to 1 so that when calling to cl_util_get_device + // for the second device there is no index out of range. + struct cl_sdk_options_MultiDevice devs_opts = { + .triplets = + (struct cl_sdk_options_DeviceTriplet[]){ + { 0, 0, CL_DEVICE_TYPE_ALL }, { 0, 0, CL_DEVICE_TYPE_ALL } }, + .number = 1 + }; + struct convolution_options convolution_opts = { .x_dim = 4000, + .y_dim = 4000 }; + + OCLERROR_RET( + parse_options(argc, argv, &diag_opts, &devs_opts, &convolution_opts), + error, end); + + // Create runtime objects based on user preference or default. + OCLERROR_PAR(dev1 = + cl_util_get_device(devs_opts.triplets[0].plat_index, + devs_opts.triplets[0].dev_index, + devs_opts.triplets[0].dev_type, &error), + error, end); + OCLERROR_PAR(dev2 = cl_util_get_device( + devs_opts.triplets[(devs_opts.number > 1)].plat_index, + devs_opts.triplets[(devs_opts.number > 1)].dev_index, + devs_opts.triplets[(devs_opts.number > 1)].dev_type, + &error), + error, end); + OCLERROR_PAR(context1 = clCreateContext(NULL, 1, &dev1, NULL, NULL, &error), + error, end); + OCLERROR_PAR(context2 = clCreateContext(NULL, 1, &dev2, NULL, NULL, &error), + error, end); + OCLERROR_RET(clGetDeviceInfo(dev1, CL_DEVICE_PLATFORM, + sizeof(cl_platform_id), &platform1, NULL), + error, cont); + OCLERROR_RET(clGetDeviceInfo(dev2, CL_DEVICE_PLATFORM, + sizeof(cl_platform_id), &platform2, NULL), + error, cont); + + if (!diag_opts.quiet) + { + cl_util_print_device_info(dev1); + cl_util_print_device_info(dev2); + } + + // Compile kernels. + const char* kernel_location = "./convolution.cl"; + char *kernel = NULL, *tmp = NULL; + size_t program_size = 0; + OCLERROR_PAR( + kernel = cl_util_read_text_file(kernel_location, &program_size, &error), + error, cont); + MEM_CHECK(tmp = (char*)realloc(kernel, program_size), error, ker); + kernel = tmp; + OCLERROR_PAR(program1 = clCreateProgramWithSource( + context1, 1, (const char**)&kernel, &program_size, &error), + error, ker); + OCLERROR_PAR(program2 = clCreateProgramWithSource( + context2, 1, (const char**)&kernel, &program_size, &error), + error, ker); + + // If no -cl-std option is specified then the highest 1.x version + // supported by each device is used to compile the program. Therefore, + // it's only necessary to add the -cl-std option for 2.0 and 3.0 OpenCL + // versions. 
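+ // Note that passing -cl-std=CL3.0 (or -cl-std=CL2.0) assumes the devices
+ // actually expose that OpenCL C version; on a device that does not, the
+ // program build fails.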
+ char compiler_options[1023] = ""; +#if CL_HPP_TARGET_OPENCL_VERSION >= 300 + strcat(compiler_options, "-cl-std=CL3.0 "); +#elif CL_HPP_TARGET_OPENCL_VERSION >= 200 + strcat(compiler_options, "-cl-std=CL2.0 "); +#endif + + OCLERROR_RET(cl_util_build_program(program1, dev1, compiler_options), error, + prg); + OCLERROR_RET(cl_util_build_program(program2, dev2, compiler_options), error, + prg); + + // Query maximum workgroup size (WGS) of kernel supported on each device + // based on private mem (registers) constraints. + size_t wgs1, wgs2; + cl_kernel convolution1, convolution2; + OCLERROR_PAR(convolution1 = + clCreateKernel(program1, "convolution_3x3", &error), + error, prg); + OCLERROR_PAR(convolution2 = + clCreateKernel(program2, "convolution_3x3", &error), + error, prg); + OCLERROR_RET(clGetKernelWorkGroupInfo(convolution1, dev1, + CL_KERNEL_WORK_GROUP_SIZE, + sizeof(size_t), &wgs1, NULL), + error, conv); + OCLERROR_RET(clGetKernelWorkGroupInfo(convolution2, dev2, + CL_KERNEL_WORK_GROUP_SIZE, + sizeof(size_t), &wgs2, NULL), + error, conv); + + // Initialize host-side storage. + const cl_uint mask_dim = 3; + const cl_uint pad_width = mask_dim / 2; + const size_t x_dim = convolution_opts.x_dim; + const size_t y_dim = convolution_opts.y_dim; + const size_t pad_x_dim = x_dim + 2 * pad_width; + const size_t pad_y_dim = y_dim + 2 * pad_width; + + // Check that the WGSs can divide the global size (MacOS reports + // CL_INVALID_WORK_GROUP_SIZE otherwise). If WGS is smaller than the x + // dimension, then a NULL pointer will be used when calling + // clEnqueueNDRangeKernel for enqueuing the kernels. + if (pad_x_dim % wgs1 && pad_x_dim > wgs1) + { + size_t div = pad_x_dim / wgs1; + wgs1 = sqrt(div * wgs1); + } + + if (pad_x_dim % wgs2 && pad_x_dim > wgs2) + { + size_t div = pad_x_dim / wgs2; + wgs2 = sqrt(div * wgs2); + } + + // Random number generator. + pcg32_random_t rng; + pcg32_srandom_r(&rng, 11111, -2222); + + // Initialize input matrix. The input will be padded to remove + // conditional branches from the convolution kernel for determining + // out-of-bounds. + cl_float* h_input_grid; + MEM_CHECK(h_input_grid = + (cl_float*)malloc(sizeof(cl_float) * pad_x_dim * pad_y_dim), + error, conv); + if (diag_opts.verbose) + { + printf("Generating %zu random numbers for convolution input grid.\n", + x_dim * y_dim); + } + cl_sdk_fill_with_random_ints_range(&rng, (cl_int*)h_input_grid, + pad_x_dim * pad_y_dim, -1000, 1000); + + // Fill with 0s the extra rows and columns added for padding. + for (size_t j = 0; j < pad_x_dim; ++j) + { + for (size_t i = 0; i < pad_y_dim; ++i) + { + if (i == 0 || j == 0 || i == (pad_y_dim - 1) + || j == (pad_x_dim - 1)) + { + h_input_grid[j + i * pad_x_dim] = 0; + } + } + } + + // Declare output matrix. Output will not be padded. + cl_float* h_output_grid; + MEM_CHECK(h_output_grid = + (cl_float*)malloc(sizeof(cl_float) * x_dim * y_dim), + error, hinput); + + // Initialize convolution mask. + cl_float* h_mask; + MEM_CHECK(h_mask = + (cl_float*)malloc(sizeof(cl_float) * mask_dim * mask_dim), + error, hinput); + if (diag_opts.verbose) + { + printf("Generating %u random numbers for convolution mask.\n", + mask_dim * mask_dim); + } + cl_sdk_fill_with_random_ints_range(&rng, (cl_int*)h_mask, + mask_dim * mask_dim, -1000, 1000); + + /// Initialize device-side storage. 
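+ // The grid is split at its vertical midpoint: the first device computes the
+ // upper half of the output and the second device the lower half. The two
+ // device input buffers overlap by a few padded rows around the midpoint so
+ // that outputs next to the split still see all of their neighbours.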
+ const size_t grid_midpoint = y_dim / 2; + const size_t pad_grid_midpoint = pad_y_dim / 2; + + if (diag_opts.verbose) + { + printf("Initializing device-side storage..."); + } + + // Initialize queues for command execution on each device. +#if CL_HPP_TARGET_OPENCL_VERSION >= 200 + cl_command_queue_properties props[] = { CL_QUEUE_PROPERTIES, + CL_QUEUE_PROFILING_ENABLE, 0 }; + OCLERROR_PAR(queue1 = clCreateCommandQueueWithProperties(context1, dev1, + props, &error), + error, hinput); + OCLERROR_PAR(queue2 = clCreateCommandQueueWithProperties(context2, dev2, + props, &error), + error, que1); +#else + OCLERROR_PAR(queue1 = clCreateCommandQueue( + context1, dev1, CL_QUEUE_PROFILING_ENABLE, &error), + error, hinput); + OCLERROR_PAR(queue2 = clCreateCommandQueue( + context2, dev2, CL_QUEUE_PROFILING_ENABLE, &error), + error, que1); +#endif + + // First device performs the convolution in the upper half and second device + // in the lower half. + cl_mem dev1_input_grid; + OCLERROR_PAR(dev1_input_grid = clCreateBuffer( + context1, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(cl_float) * pad_x_dim * (pad_grid_midpoint + 1), + h_input_grid, &error), + error, que2); + // Second device performs the convolution in the lower half (middle + // border included). + cl_mem dev2_input_grid; + OCLERROR_PAR(dev2_input_grid = clCreateBuffer( + context2, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(cl_float) * pad_x_dim * (pad_grid_midpoint + 1), + h_input_grid + pad_x_dim * (pad_grid_midpoint - 1), + &error), + error, bufin1); + + cl_mem dev1_output_grid; + OCLERROR_PAR(dev1_output_grid = clCreateBuffer( + context1, CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY, + sizeof(cl_float) * x_dim * y_dim, NULL, &error), + error, bufin2); + cl_mem dev2_output_grid; + OCLERROR_PAR(dev2_output_grid = clCreateBuffer( + context2, CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY, + sizeof(cl_float) * x_dim * y_dim, NULL, &error), + error, bufout1); + + cl_mem dev1_mask; + OCLERROR_PAR(dev1_mask = clCreateBuffer( + context1, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(cl_float) * mask_dim * mask_dim, h_mask, &error), + error, bufout2); + cl_mem dev2_mask; + OCLERROR_PAR(dev2_mask = clCreateBuffer( + context2, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + sizeof(cl_float) * mask_dim * mask_dim, h_mask, &error), + error, bufmask1); + + // Set kernels arguments. + OCLERROR_RET( + clSetKernelArg(convolution1, 0, sizeof(cl_mem), &dev1_input_grid), + error, bufmask2); + OCLERROR_RET( + clSetKernelArg(convolution1, 1, sizeof(cl_mem), &dev1_output_grid), + error, bufmask2); + OCLERROR_RET(clSetKernelArg(convolution1, 2, sizeof(cl_mem), &dev1_mask), + error, bufmask2); + OCLERROR_RET(clSetKernelArg(convolution1, 3, sizeof(size_t), &x_dim), error, + bufmask2); + OCLERROR_RET( + clSetKernelArg(convolution1, 4, sizeof(size_t), &grid_midpoint), error, + bufmask2); + + OCLERROR_RET( + clSetKernelArg(convolution2, 0, sizeof(cl_mem), &dev2_input_grid), + error, bufmask2); + OCLERROR_RET( + clSetKernelArg(convolution2, 1, sizeof(cl_mem), &dev2_output_grid), + error, bufmask2); + OCLERROR_RET(clSetKernelArg(convolution2, 2, sizeof(cl_mem), &dev2_mask), + error, bufmask2); + OCLERROR_RET(clSetKernelArg(convolution2, 3, sizeof(size_t), &x_dim), error, + bufmask2); + OCLERROR_RET( + clSetKernelArg(convolution2, 4, sizeof(size_t), &grid_midpoint), error, + bufmask2); + + // Launch kernels. + if (diag_opts.verbose) + { + printf("done.\nExecuting on device... "); + fflush(stdout); + } + + // Initialize global and local buffers for device execution. 
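+ // "global" is the 2D ND-range spanning the padded grid, while "local1" and
+ // "local2" are the per-device work-group sizes. If the padded row width is
+ // smaller than the queried work-group size, NULL is passed to
+ // clEnqueueNDRangeKernel instead and the runtime chooses the local size.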
+ const size_t* global = (size_t[]){ pad_x_dim, pad_y_dim }; + const size_t* local1 = (size_t[]){ wgs1, 1 }; + const size_t* local2 = (size_t[]){ wgs2, 1 }; + + // Enqueue kernel calls and wait for them to finish. + cl_event *dev1_kernel_run = NULL, *dev2_kernel_run = NULL; + MEM_CHECK(dev1_kernel_run = (cl_event*)malloc(sizeof(cl_event)), error, + bufmask2); + MEM_CHECK(dev2_kernel_run = (cl_event*)malloc(sizeof(cl_event)), error, + run1); + GET_CURRENT_TIMER(dev_start) + OCLERROR_RET(clEnqueueNDRangeKernel(queue1, convolution1, 2, NULL, global, + (pad_x_dim < wgs1) ? NULL : local1, 0, + NULL, dev1_kernel_run), + error, run2); + OCLERROR_RET(clEnqueueNDRangeKernel(queue2, convolution2, 2, NULL, global, + (pad_x_dim < wgs2) ? NULL : local2, 0, + NULL, dev2_kernel_run), + error, run2); + + OCLERROR_RET(clWaitForEvents(1, dev1_kernel_run), error, run2); + OCLERROR_RET(clWaitForEvents(1, dev2_kernel_run), error, run2); + GET_CURRENT_TIMER(dev_end) + cl_ulong dev_time; + TIMER_DIFFERENCE(dev_time, dev_start, dev_end) + + // Compute reference host-side convolution. + if (diag_opts.verbose) + { + printf("done.\nExecuting on host... "); + } + + GET_CURRENT_TIMER(host_start) + host_convolution(h_input_grid, h_output_grid, h_mask, x_dim, y_dim); + GET_CURRENT_TIMER(host_end) + cl_ulong host_time; + TIMER_DIFFERENCE(host_time, host_start, host_end) + + if (diag_opts.verbose) + { + printf("done.\n"); + } + + // Fetch and combine results from devices. + cl_float* concatenated_results; + MEM_CHECK(concatenated_results = + (cl_float*)malloc(sizeof(cl_float) * x_dim * y_dim), + error, run2); + OCLERROR_RET(clEnqueueReadBuffer(queue1, dev1_output_grid, CL_BLOCKING, 0, + sizeof(cl_float) * x_dim * grid_midpoint, + concatenated_results, 0, NULL, NULL), + error, run2); + OCLERROR_RET( + clEnqueueReadBuffer(queue2, dev2_output_grid, CL_BLOCKING, 0, + sizeof(cl_float) * x_dim * grid_midpoint, + concatenated_results + x_dim * grid_midpoint, 0, + NULL, NULL), + error, run2); + + // Validate device-side solution. + cl_float deviation = 0.f; + const cl_float tolerance = 1e-6; + + for (size_t i = 0; i < x_dim * y_dim; ++i) + { + deviation += fabs(concatenated_results[i] - h_output_grid[i]); + } + deviation /= (x_dim * y_dim); + + if (deviation > tolerance) + { + printf("Failed convolution! 
Normalized deviation %.6f between host and " + "device exceeds tolerance %.6f\n", + deviation, tolerance); + } + else + { + printf("Successful convolution!\n"); + } + + if (!diag_opts.quiet) + { + printf("Kernels execution time as seen by host: %llu us.\n", + (unsigned long long)(dev_time + 500) / 1000); + + printf("Kernels execution time as measured by devices :\n"); + printf("\t%llu us.\n", + (unsigned long long)(cl_util_get_event_duration( + *dev1_kernel_run, + CL_PROFILING_COMMAND_START, + CL_PROFILING_COMMAND_END, &error) + + 500) + / 1000); + printf("\t%llu us.\n", + (unsigned long long)(cl_util_get_event_duration( + *dev2_kernel_run, + CL_PROFILING_COMMAND_START, + CL_PROFILING_COMMAND_END, &error) + + 500) + / 1000); + + printf("Reference execution as seen by host: %llu us.\n", + (unsigned long long)(host_time + 500) / 1000); + } + +run2: + free(dev2_kernel_run); +run1: + free(dev1_kernel_run); +bufmask2: + OCLERROR_RET(clReleaseMemObject(dev2_mask), end_error, bufmask1); +bufmask1: + OCLERROR_RET(clReleaseMemObject(dev1_mask), end_error, bufout2); +bufout2: + OCLERROR_RET(clReleaseMemObject(dev2_output_grid), end_error, bufout1); +bufout1: + OCLERROR_RET(clReleaseMemObject(dev1_output_grid), end_error, bufin2); +bufin2: + OCLERROR_RET(clReleaseMemObject(dev2_input_grid), end_error, bufin1); +bufin1: + OCLERROR_RET(clReleaseMemObject(dev1_input_grid), end_error, hinput); +que2: + OCLERROR_RET(clReleaseCommandQueue(queue2), end_error, que1); +que1: + OCLERROR_RET(clReleaseCommandQueue(queue1), end_error, cont); +hinput: + free(h_input_grid); +conv: + OCLERROR_RET(clReleaseKernel(convolution1), end_error, prg); + OCLERROR_RET(clReleaseKernel(convolution2), end_error, prg); +prg: + OCLERROR_RET(clReleaseProgram(program1), end_error, ker); + OCLERROR_RET(clReleaseProgram(program2), end_error, ker); +ker: + free(kernel); +cont: + OCLERROR_RET(clReleaseContext(context1), end_error, end); + OCLERROR_RET(clReleaseContext(context2), end_error, end); +end: + if (error) cl_util_print_error(error); + return error; +}
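Example invocation, assuming the sample is built under the target name set in
CMakeLists.txt (multidevice) and is run from a directory that also contains
convolution.cl (the kernel source is loaded from "./convolution.cl" at runtime).
The --type filter selects all devices of that type across platforms; at least
two such devices are needed for a true multi-device run:

    ./multidevice --type gpu --x_dim 2048 --y_dim 2048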