Skip to content

Commit

Permalink
More updates and fix compilation errors
Browse files Browse the repository at this point in the history
  • Loading branch information
uumesh committed Oct 8, 2024
1 parent 77317b1 commit d214710
Show file tree
Hide file tree
Showing 7 changed files with 50 additions and 67 deletions.
18 changes: 9 additions & 9 deletions backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ static constexpr SpecID BASIS_Q_1D_ID;
// Interpolation kernel - tensor
//------------------------------------------------------------------------------
template <int is_transpose>
static int CeedBasisApplyInterp_Sycl(sycl::queue &sycl_queue, const SyclModule_t &sycl_module, CeedInt num_elem, const CeedBasis_Sycl *impl,
static int CeedBasisApplyInterp_Sycl(sycl::queue &sycl_queue, const SyclBundle_t &sycl_bundle, CeedInt num_elem, const CeedBasis_Sycl *impl,
const CeedScalar *u, CeedScalar *v) {
const CeedInt buf_len = impl->buf_len;
const CeedInt op_len = impl->op_len;
Expand All @@ -55,7 +55,7 @@ static int CeedBasisApplyInterp_Sycl(sycl::queue &sycl_queue, const SyclModule_t

sycl_queue.submit([&](sycl::handler &cgh) {
cgh.depends_on(e);
cgh.use_kernel_bundle(sycl_module);
cgh.use_kernel_bundle(sycl_bundle);

sycl::local_accessor<CeedScalar> s_mem(op_len + 2 * buf_len, cgh);

Expand Down Expand Up @@ -139,7 +139,7 @@ static int CeedBasisApplyInterp_Sycl(sycl::queue &sycl_queue, const SyclModule_t
// Gradient kernel - tensor
//------------------------------------------------------------------------------
template <int is_transpose>
static int CeedBasisApplyGrad_Sycl(sycl::queue &sycl_queue, const SyclModule_t &sycl_module, CeedInt num_elem, const CeedBasis_Sycl *impl,
static int CeedBasisApplyGrad_Sycl(sycl::queue &sycl_queue, const SyclBundle_t &sycl_bundle, CeedInt num_elem, const CeedBasis_Sycl *impl,
const CeedScalar *u, CeedScalar *v) {
const CeedInt buf_len = impl->buf_len;
const CeedInt op_len = impl->op_len;
Expand All @@ -158,7 +158,7 @@ static int CeedBasisApplyGrad_Sycl(sycl::queue &sycl_queue, const SyclModule_t &

sycl_queue.submit([&](sycl::handler &cgh) {
cgh.depends_on(e);
cgh.use_kernel_bundle(sycl_module);
cgh.use_kernel_bundle(sycl_bundle);

sycl::local_accessor<CeedScalar> s_mem(2 * (op_len + buf_len), cgh);

Expand Down Expand Up @@ -299,16 +299,16 @@ static int CeedBasisApply_Sycl(CeedBasis basis, const CeedInt num_elem, CeedTran
switch (eval_mode) {
case CEED_EVAL_INTERP:
if (is_transpose) {
CeedCallBackend(CeedBasisApplyInterp_Sycl<true>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v));
CeedCallBackend(CeedBasisApplyInterp_Sycl<true>(data->sycl_queue, *impl->sycl_bundle, num_elem, impl, d_u, d_v));
} else {
CeedCallBackend(CeedBasisApplyInterp_Sycl<false>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v));
CeedCallBackend(CeedBasisApplyInterp_Sycl<false>(data->sycl_queue, *impl->sycl_bundle, num_elem, impl, d_u, d_v));
}
break;
case CEED_EVAL_GRAD:
if (is_transpose) {
CeedCallBackend(CeedBasisApplyGrad_Sycl<true>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v));
CeedCallBackend(CeedBasisApplyGrad_Sycl<true>(data->sycl_queue, *impl->sycl_bundle, num_elem, impl, d_u, d_v));
} else {
CeedCallBackend(CeedBasisApplyGrad_Sycl<false>(data->sycl_queue, *impl->sycl_module, num_elem, impl, d_u, d_v));
CeedCallBackend(CeedBasisApplyGrad_Sycl<false>(data->sycl_queue, *impl->sycl_bundle, num_elem, impl, d_u, d_v));
}
break;
case CEED_EVAL_WEIGHT:
Expand Down Expand Up @@ -610,7 +610,7 @@ int CeedBasisCreateTensorH1_Sycl(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const
input_bundle.set_specialization_constant<BASIS_Q_1D_ID>(Q_1d);
input_bundle.set_specialization_constant<BASIS_P_1D_ID>(P_1d);

CeedCallSycl(ceed, impl->sycl_module = new SyclModule_t(sycl::build(input_bundle)));
CeedCallSycl(ceed, impl->sycl_bundle = new SyclBundle_t(sycl::build(input_bundle)));

CeedCallBackend(CeedBasisSetData(basis, impl));

Expand Down
34 changes: 16 additions & 18 deletions backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) {
Ceed ceed;
Ceed_Sycl *data;
const char *read_write_kernel_path, *read_write_kernel_source;
const char *qfunction_name, *qfunction_source;
CeedInt num_input_fields, num_output_fields;
CeedQFunctionField *input_fields, *output_fields;
CeedQFunction_Sycl *impl;
Expand Down Expand Up @@ -60,21 +59,21 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) {
std::string_view rw_source_view(read_write_kernel_source);
const std::string kernel_name = "CeedKernelSyclRefQFunction_" + std::string(qf_name_view);

// std::vector<CeedInt> input_sizes(num_input_fields);
// CeedQFunctionField *input_i = input_fields;
std::vector<CeedInt> input_sizes(num_input_fields);
CeedQFunctionField *input_i = input_fields;

// for (auto &size_i : input_sizes) {
// CeedCallBackend(CeedQFunctionFieldGetSize(*input_i, &size_i));
// ++input_i;
// }
for (auto &size_i : input_sizes) {
CeedCallBackend(CeedQFunctionFieldGetSize(*input_i, &size_i));
++input_i;
}

// std::vector<CeedInt> output_sizes(num_output_fields);
// CeedQFunctionField *output_i = output_fields;
std::vector<CeedInt> output_sizes(num_output_fields);
CeedQFunctionField *output_i = output_fields;

// for (auto &size_i : output_sizes) {
// CeedCallBackend(CeedQFunctionFieldGetSize(*output_i, &size_i));
// ++output_i;
// }
for (auto &size_i : output_sizes) {
CeedCallBackend(CeedQFunctionFieldGetSize(*output_i, &size_i));
++output_i;
}

// Defintions
std::ostringstream code;
Expand All @@ -88,7 +87,7 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) {
// code << "__attribute__((intel_reqd_sub_group_size(" << SUB_GROUP_SIZE_QF << "))) extern \"C\" void " << kernel_name
code << "#include <vector>\n\n";
code << "extern \"C\" void " << kernel_name
<< "(sycl::queue &sycl_queue, sycl::nd_range<1> kernel_range, void *ctx, CeedInt Q, Fields_Sycl fields) {\n";
<< "(sycl::queue &sycl_queue, sycl::nd_range<1> kernel_range, void *ctx, CeedInt Q, Fields_Sycl *fields) {\n";

// OpenCL doesn't allow for structs with pointers.
// We will need to pass all of the arguments individually.
Expand All @@ -97,15 +96,15 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) {
<< "const CeedScalar *fields_inputs[" << num_input_fields << "];\n";
for (CeedInt i = 0; i < num_input_fields; ++i) {
code << " "
<< "fields_inputs[" << i << "] = fields.inputs[" << i << "];\n";
<< "fields_inputs[" << i << "] = fields->inputs[" << i << "];\n";
}

// Output parameters
code << " "
<< "const CeedScalar *fields_outputs[" << num_output_fields << "];\n";
for (CeedInt i = 0; i < num_output_fields; ++i) {
code << " "
<< "fields_outputs[" << i << "] = fields.outputs[" << i << "];\n";
<< "fields_outputs[" << i << "] = fields->outputs[" << i << "];\n";
}
code << "\n";

Expand Down Expand Up @@ -174,11 +173,10 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) {
CeedDebug(ceed, code.str().c_str());

// Compile kernel
CeedCallBackend(CeedBuildModule_Sycl(ceed, code.str(), &impl->sycl_module));
CeedCallBackend(CeedBuildModule_Sycl(ceed, code.str(), impl->sycl_module));
CeedCallBackend(CeedGetKernel_Sycl(ceed, impl->sycl_module, kernel_name, &impl->QFunction));

// Cleanup
CeedCallBackend(CeedFree(&qfunction_source));
CeedCallBackend(CeedFree(&read_write_kernel_path));
CeedCallBackend(CeedFree(&read_write_kernel_source));
return CEED_ERROR_SUCCESS;
Expand Down
34 changes: 7 additions & 27 deletions backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,44 +58,24 @@ static int CeedQFunctionApply_Sycl(CeedQFunction qf, CeedInt Q, CeedVector *U, C
CeedCallBackend(CeedVectorGetArrayRead(U[i], CEED_MEM_DEVICE, &impl->fields.inputs[i]));
}
for (CeedInt i = 0; i < num_output_fields; i++) {
CeedCallBackend(CeedVectorGetArrayRead(V[i], CEED_MEM_DEVICE, &impl->fields.outputs[i]));
CeedCallBackend(CeedVectorGetArrayWrite(V[i], CEED_MEM_DEVICE, &impl->fields.outputs[i]));
}

// Get context data
CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &context_data));

std::vector<sycl::event> e;

if (!ceed_Sycl->sycl_queue.is_in_order()) e = {ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier()};

// Launch as a basic parallel_for over Q quadrature points
ceed_Sycl->sycl_queue.submit([&](sycl::handler &cgh) {
cgh.depends_on(e);

int iarg{};
cgh.set_arg(iarg, context_data);
++iarg;
cgh.set_arg(iarg, Q);
++iarg;
for (auto &input_i : inputs) {
cgh.set_arg(iarg, input_i);
++iarg;
}
for (auto &output_i : outputs) {
cgh.set_arg(iarg, output_i);
++iarg;
}
// Hard-coding the work-group size for now
// We could use the Level Zero API to query and set an appropriate size in future
// Equivalent of CUDA Occupancy Calculator
int wg_size = WG_SIZE_QF;
sycl::range<1> rounded_Q = ((Q + (wg_size - 1)) / wg_size) * wg_size;
sycl::nd_range<1> kernel_range(rounded_Q, wg_size);
cgh.parallel_for(kernel_range, *(impl->QFunction));
});
int wg_size = WG_SIZE_QF;
sycl::range<1> rounded_Q = ((Q + (wg_size - 1)) / wg_size) * wg_size;
sycl::nd_range<1> kernel_range(rounded_Q, wg_size);

// Call launcher function that executes kernel
*(impl->QFunction)(sycl_queue, context_data, Q, fields);
// Pass in nd_range as second argument
// Pass in vector of events as third argument
(*impl->QFunction)(ceed_Sycl->sycl_queue, kernel_range, context_data, Q, &impl->fields);

// Restore vectors
// U_i = U;
Expand Down
4 changes: 2 additions & 2 deletions backends/sycl-ref/ceed-sycl-ref.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ typedef struct {
CeedInt num_qpts;
CeedInt buf_len;
CeedInt op_len;
SyclModule_t *sycl_module;
SyclBundle_t *sycl_bundle;
CeedScalar *d_interp_1d;
CeedScalar *d_grad_1d;
CeedScalar *d_q_weight_1d;
Expand All @@ -68,7 +68,7 @@ typedef struct {
CeedScalar *d_q_weight;
} CeedBasisNonTensor_Sycl;

using SyclQfunctionKernel_t = std::function<void(sycl::queue&, sycl::nd_range<1>, void*, CeedInt, Fields_Sycl)>;
using SyclQfunctionKernel_t = std::function<void(sycl::queue&, sycl::nd_range<1>, void*, CeedInt, Fields_Sycl*)>;

typedef struct {
SyclModule_t *sycl_module;
Expand Down
9 changes: 5 additions & 4 deletions backends/sycl/ceed-sycl-compile.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,17 @@
#include <map>
#include <sycl/sycl.hpp>

#include <libprtc/prtc.h>
#include "libprtc/prtc.h"

using SyclModule_t = std::shared_ptr<prtc::DynamicLibrary>;
using SyclBundle_t = sycl::kernel_bundle<sycl::bundle_state::executable>;

CEED_INTERN int CeedBuildModule_Sycl(Ceed ceed, const std::string &kernel_source, SyclModule_t **sycl_module,
CEED_INTERN int CeedBuildModule_Sycl(Ceed ceed, const std::string &kernel_source, SyclModule_t *sycl_module,
const std::map<std::string, CeedInt> &constants = {});

template <class SyclKernel_t>
CEED_INTERN int CeedGetKernel_Sycl(Ceed ceed, const SyclModule_t *sycl_module, const std::string &kernel_name, SyclKernel_t **sycl_kernel);
int CeedGetKernel_Sycl(Ceed ceed, const SyclModule_t *sycl_module, const std::string &kernel_name, SyclKernel_t **sycl_kernel);

template <class SyclKernel_t>
CEED_INTERN int CeedRunKernelDimSharedSycl(Ceed ceed, SyclKernel_t *kernel, const int grid_size, const int block_size_x, const int block_size_y,
int CeedRunKernelDimSharedSycl(Ceed ceed, SyclKernel_t *kernel, const int grid_size, const int block_size_x, const int block_size_y,
const int block_size_z, const int shared_mem_size, void **args);
16 changes: 9 additions & 7 deletions backends/sycl/ceed-sycl-compile.sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <string>
#include <sstream>
#include <fstream>
#include <algorithm>
#include <sycl/sycl.hpp>

#include "ceed-sycl-common.hpp"
Expand Down Expand Up @@ -77,12 +78,12 @@ static inline int CeedJitGetFlags_Sycl(std::vector<std::string> &flags) {
// TODO: Check if source, module, etc. already exists
//------------------------------------------------------------------------------
static inline int CeedJitCompileSource_Sycl(Ceed ceed, const sycl::device &sycl_device, const std::string &kernel_source, std::string& output_path,
const std::vector<std::string> &flags = {}) {
const std::vector<std::string> flags = {}) {

// Get cache path from env variable
std::string cache_root;
// TODO: Add default directory to current working directory
if(std::getenv()"CEED_CACHE_DIR")) {
if(std::getenv("CEED_CACHE_DIR")) {
cache_root = std::string(std::getenv("CEED_CACHE_DIR")) + "/.ceed/cache";
} else {
cache_root = std::string(std::getenv("PWD")) + "/.ceed/cache";
Expand All @@ -101,14 +102,15 @@ static inline int CeedJitCompileSource_Sycl(Ceed ceed, const sycl::device &sycl_
size_t kernel_source_hash = string_hash(kernel_source);

// Hash compilation flags
std::sort(flags.begin(), flags.end());
std::string all_flags = prtc::concatenateFlags(flags);
std::vector<std::string> copy_flags = flags;
std::sort(copy_flags.begin(), copy_flags.end());
std::string all_flags = prtc::concatenateFlags(copy_flags);
size_t build_options_hash = string_hash(all_flags);

// Hash compiler version
prtc::ShellCommand command("icpx --version");
const auto [success, compiler_version] = command.result();
if (!success) return CeedError((ceed), CEED_ERROR_BACKEND, compiler_version);
if (!success) return CeedError((ceed), CEED_ERROR_BACKEND, compiler_version.c_str());
size_t compiler_hash = string_hash(compiler_version);

// Determine file paths for source and binaries based on hashes
Expand All @@ -124,9 +126,9 @@ static inline int CeedJitCompileSource_Sycl(Ceed ceed, const sycl::device &sycl_

// TODO: Get compiler-path and flags from env or some other means
prtc::ShellCompiler compiler("icpx","-o","-c","-fPIC","-shared");
const auto [success, message] = compiler.compileAndLink(source_file_path,object_file_path,flags);
const auto [build_success, message] = compiler.compileAndLink(source_file_path,object_file_path,flags);
// Q: Should we always output the compiler output in verbose/debug mode?
if (!success) return CeedError((ceed), CEED_ERROR_BACKEND, message);
if (!build_success) return CeedError((ceed), CEED_ERROR_BACKEND, message.c_str());
return CEED_ERROR_SUCCESS;
}

Expand Down
2 changes: 2 additions & 0 deletions backends/sycl/libprtc/shell_compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

namespace prtc {

std::string concatenateFlags(const std::vector<std::string>& flags);

class ShellCompiler {
public:
ShellCompiler(const std::string& executable, const std::string& output_flag,
Expand Down

0 comments on commit d214710

Please sign in to comment.