Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add CUDA wrapper capability. #714

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion ffcx/codegeneration/C/integrals.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,14 +69,21 @@ def generator(ir: IntegralIR, options):
else:
code["tabulate_tensor_complex64"] = ".tabulate_tensor_complex64 = NULL,"
code["tabulate_tensor_complex128"] = ".tabulate_tensor_complex128 = NULL,"
# Designated-initializer entry for the optional NVRTC wrapper. The integral
# template places this entry on its own line directly before further struct
# members, so it needs the same trailing comma as the other
# tabulate_tensor_* entries; without it the generated C does not compile.
if options.get("cuda"):
    code["tabulate_tensor_cuda_nvrtc"] = (
        f".tabulate_tensor_cuda_nvrtc = tabulate_tensor_cuda_nvrtc_{factory_name},"
    )
else:
    # No CUDA wrapper requested: emit nothing for this member.
    code["tabulate_tensor_cuda_nvrtc"] = ""

np_scalar_type = np.dtype(options["scalar_type"]).name
code[f"tabulate_tensor_{np_scalar_type}"] = (
f".tabulate_tensor_{np_scalar_type} = tabulate_tensor_{factory_name},"
)

element_hash = 0 if ir.coordinate_element_hash is None else ir.coordinate_element_hash

implementation = ufcx_integrals.factory.format(
implementation = ufcx_integrals.get_factory(options).format(
factory_name=factory_name,
enabled_coefficients=code["enabled_coefficients"],
enabled_coefficients_init=code["enabled_coefficients_init"],
Expand All @@ -89,6 +96,7 @@ def generator(ir: IntegralIR, options):
tabulate_tensor_float64=code["tabulate_tensor_float64"],
tabulate_tensor_complex64=code["tabulate_tensor_complex64"],
tabulate_tensor_complex128=code["tabulate_tensor_complex128"],
tabulate_tensor_cuda_nvrtc=code["tabulate_tensor_cuda_nvrtc"],
)

return declaration, implementation
51 changes: 51 additions & 0 deletions ffcx/codegeneration/C/integrals_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,60 @@
{tabulate_tensor_float64}
{tabulate_tensor_complex64}
{tabulate_tensor_complex128}
{tabulate_tensor_cuda_nvrtc}
.needs_facet_permutations = {needs_facet_permutations},
.coordinate_element_hash = {coordinate_element_hash},
}};

// End of code for integral {factory_name}
"""

# ``str.format`` template for the NVRTC CUDA wrapper of one integral.
# Expected fields: ``factory_name``, ``scalar_type``, ``geom_type`` and
# ``tabulate_tensor_quoted`` (presumably the kernel body already rendered as
# quoted C string-literal lines -- confirm against the caller). Literal C
# braces are doubled so that ``format`` leaves them in place.
cuda_wrapper = """

// Begin NVRTC CUDA wrapper for integral {factory_name}
// The wrapper is compiled with a standard C++ compiler, and is called at runtime to generate
// source code which is then compiled into a CUDA kernel at runtime via NVRTC.
void tabulate_tensor_cuda_nvrtc_{factory_name}(int* num_program_headers,
                                               const char*** program_headers,
                                               const char*** program_include_names,
                                               const char** out_program_src,
                                               const char** tabulate_tensor_function_name)
{{
  // The below typedefs are needed due to issues with including stdint.h in NVRTC source code
  const char* program_src = ""
    "#define alignas(x)\\n"
    "#define restrict __restrict__\\n"
    "\\n"
    "typedef unsigned char uint8_t;\\n"
    "typedef unsigned int uint32_t;\\n"
    "typedef double ufc_scalar_t;\\n"
    "\\n"
    "extern \\"C\\" __global__\\n"
    "void tabulate_tensor_{factory_name}({scalar_type}* restrict A,\\n"
    " const {scalar_type}* restrict w,\\n"
    " const {scalar_type}* restrict c,\\n"
    " const {geom_type}* restrict coordinate_dofs,\\n"
    " const int* restrict entity_local_index,\\n"
    " const uint8_t* restrict quadrature_permutation\\n"
    " )\\n"
    "{{\\n"
    "{tabulate_tensor_quoted}\\n"
    "}}";
  *num_program_headers = 0;
  *program_headers = NULL;
  *program_include_names = NULL;
  *out_program_src = program_src;
  *tabulate_tensor_function_name = "tabulate_tensor_{factory_name}";
}}

// End NVRTC CUDA wrapper for integral {factory_name}

"""


def get_factory(options):
    """Pick the template string used to build a form integral.

    Returns the plain ``factory`` template unless ``options.get("cuda")`` is
    truthy, in which case the ``cuda_wrapper`` template is placed in front
    of it.
    """
    if not options.get("cuda"):
        return factory
    return cuda_wrapper + factory
3 changes: 3 additions & 0 deletions ffcx/codegeneration/jit.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,9 @@
UFC_INTEGRAL_DECL += "\n".join(
re.findall(r"typedef void ?\(ufcx_tabulate_tensor_complex128\).*?\);", ufcx_h, re.DOTALL)
)
UFC_INTEGRAL_DECL += "\n".join(
re.findall(r"typedef void ?\(ufcx_tabulate_tensor_cuda_nvrtc\).*?\);", ufcx_h, re.DOTALL)
)

UFC_INTEGRAL_DECL += "\n".join(
re.findall("typedef struct ufcx_integral.*?ufcx_integral;", ufcx_h, re.DOTALL)
Expand Down
23 changes: 23 additions & 0 deletions ffcx/codegeneration/ufcx.h
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,28 @@ extern "C"
const uint8_t* restrict quadrature_permutation);
#endif // __STDC_NO_COMPLEX__

/// Return CUDA C++ source code for the tabulate_tensor kernel.
/// The resulting source code is passed to NVRTC for runtime compilation
///
/// @param[out] num_program_headers
/// The number of headers required by the program
/// @param[out] program_headers
/// Entire contents of each header file
/// @param[out] program_include_names
/// Names of each header file
/// @param[out] program_src
/// CUDA C++ source code for the program containing the
/// tabulate_tensor function.
/// @param[out] tabulate_tensor_function_name
/// The name of the device-side function.
///
typedef void(ufcx_tabulate_tensor_cuda_nvrtc)(
int* num_program_headers,
const char*** program_headers,
const char*** program_include_names,
const char** program_src,
const char** tabulate_tensor_function_name);

typedef struct ufcx_integral
{
const bool* enabled_coefficients;
Expand All @@ -134,6 +156,7 @@ extern "C"
ufcx_tabulate_tensor_complex64* tabulate_tensor_complex64;
ufcx_tabulate_tensor_complex128* tabulate_tensor_complex128;
#endif // __STDC_NO_COMPLEX__
ufcx_tabulate_tensor_cuda_nvrtc* tabulate_tensor_cuda_nvrtc;
bool needs_facet_permutations;

/// Get the hash of the coordinate element associated with the geometry of the mesh.
Expand Down
1 change: 1 addition & 0 deletions ffcx/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
logger = logging.getLogger("ffcx")

FFCX_DEFAULT_OPTIONS = {
"cuda": (bool, False, "generate CUDA wrapped versions of tabulate tensor functions", None),
jhale marked this conversation as resolved.
Show resolved Hide resolved
"epsilon": (float, 1e-14, "machine precision, used for dropping zero terms in tables.", None),
"scalar_type": (
str,
Expand Down
Loading