From 349b09cec107077ebb5b33ce413983c54b9965b7 Mon Sep 17 00:00:00 2001
From: "Mr.Nineteen"
Date: Mon, 3 Apr 2023 11:35:27 +0800
Subject: [PATCH] support bazel build

---
 .bazelrc                                      |   19 +
 README.md                                     |   12 +
 WORKSPACE                                     |   14 +
 benchmark/BUILD                               |   24 +
 build_deps/gpus/BUILD                         |    0
 build_deps/gpus/check_cuda_libs.py            |   86 ++
 build_deps/gpus/configure.bzl                 | 1320 +++++++++++++++++
 build_deps/gpus/crosstool/BUILD               |    0
 build_deps/gpus/crosstool/BUILD.tpl           |  108 ++
 .../crosstool/cc_toolchain_config.bzl.tpl     |  645 ++++++++
 .../crosstool/crosstool_compiler_wrapper.tpl  |  313 ++++
 build_deps/gpus/cuda/BUILD                    |    0
 build_deps/gpus/cuda/BUILD.tpl                |  229 +++
 build_deps/gpus/cuda/build_defs.bzl.tpl       |   56 +
 build_deps/gpus/cuda/cuda_config.h.tpl        |   34 +
 build_deps/gpus/cuda/cuda_config.py.tpl       |   16 +
 build_deps/gpus/find_cuda_config.py           |  638 ++++++++
 build_deps/remote_config/BUILD                |    0
 build_deps/remote_config/BUILD.tpl            |   26 +
 build_deps/remote_config/common.bzl           |  294 ++++
 .../remote_platform_configure.bzl             |   55 +
 include/BUILD                                 |   29 +
 include/merlin/BUILD                          |   24 +
 23 files changed, 3942 insertions(+)
 create mode 100644 .bazelrc
 create mode 100644 WORKSPACE
 create mode 100644 benchmark/BUILD
 create mode 100644 build_deps/gpus/BUILD
 create mode 100644 build_deps/gpus/check_cuda_libs.py
 create mode 100644 build_deps/gpus/configure.bzl
 create mode 100644 build_deps/gpus/crosstool/BUILD
 create mode 100644 build_deps/gpus/crosstool/BUILD.tpl
 create mode 100644 build_deps/gpus/crosstool/cc_toolchain_config.bzl.tpl
 create mode 100755 build_deps/gpus/crosstool/crosstool_compiler_wrapper.tpl
 create mode 100644 build_deps/gpus/cuda/BUILD
 create mode 100644 build_deps/gpus/cuda/BUILD.tpl
 create mode 100644 build_deps/gpus/cuda/build_defs.bzl.tpl
 create mode 100644 build_deps/gpus/cuda/cuda_config.h.tpl
 create mode 100644 build_deps/gpus/cuda/cuda_config.py.tpl
 create mode 100644 build_deps/gpus/find_cuda_config.py
 create mode 100644 build_deps/remote_config/BUILD
 create mode 100644 build_deps/remote_config/BUILD.tpl
 create mode 100644 build_deps/remote_config/common.bzl
 create mode 100644 build_deps/remote_config/remote_platform_configure.bzl
 create mode 100644 include/BUILD
 create mode 100644 include/merlin/BUILD

diff --git a/.bazelrc b/.bazelrc
new file mode 100644
index 000000000..f7374e348
--- /dev/null
+++ b/.bazelrc
@@ -0,0 +1,19 @@
+build -c opt
+build --copt -O3
+build --copt -pthread
+build --linkopt -pthread
+build --linkopt -ldl
+build --incompatible_linkopts_to_linklibs
+build --copt -g --strip=never
+build --experimental_repo_remote_exec
+
+# This config refers to building CUDA kernels with nvcc.
+build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain
+
+# CUDA options
+build:cuda --action_env GCC_HOST_COMPILER_PATH="/opt/rh/devtoolset-9/root/usr/bin/gcc"
+build:cuda --action_env CUDA_TOOLKIT_PATH="/usr/local/cuda"
+build:cuda --action_env CUDA_VERSION="11"
+build:cuda --action_env CUDNN_VERSION="8"
+build:cuda --action_env CUDNN_INSTALL_PATH="/usr/local/cuda"
+build:cuda --action_env CUDA_COMPUTE_CAPABILITIES="7.5"
diff --git a/README.md b/README.md
index 8185af173..374efe2aa 100644
--- a/README.md
+++ b/README.md
@@ -57,6 +57,7 @@ and also open for public contributions, bug fixes, and documentation. [[Contribu
 Basically, HierarchicalKV is a headers only library, the commands below only create binaries for benchmark and unit testing.
 
+### With CMake
 ```shell
 git clone --recursive https://github.com/NVIDIA-Merlin/HierarchicalKV.git
 cd HierarchicalKV && mkdir -p build && cd build
 cmake .. && make -j
@@ -73,6 +74,17 @@ For Unit Test:
 ./merlin_hashtable_test
 ```
 
+### With Bazel
+```shell
+git clone --recursive https://github.com/NVIDIA-Merlin/HierarchicalKV.git
+cd HierarchicalKV && bazel build --config=cuda //...
+```
+
+For Benchmark:
+```shell
+./bazel-bin/benchmark/benchmark_util
+```
+
 Your environment must meet the following requirements:
 
 - CUDA version >= 11.2
diff --git a/WORKSPACE b/WORKSPACE
new file mode 100644
index 000000000..9d57b33f5
--- /dev/null
+++ b/WORKSPACE
@@ -0,0 +1,14 @@
+workspace(name = "HierarchicalKV")
+
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+load("//build_deps/gpus:configure.bzl", "cuda_configure")
+
+http_archive(
+    name = "bazel_skylib",
+    sha256 = "1dde365491125a3db70731e25658dfdd3bc5dbdfd11b840b3e987ecf043c7ca0",
+    urls = [
+        "https://github.com/bazelbuild/bazel-skylib/releases/download/0.9.0/bazel_skylib-0.9.0.tar.gz",
+    ],
+)
+
+cuda_configure(name = "local_config_cuda")
diff --git a/benchmark/BUILD b/benchmark/BUILD
new file mode 100644
index 000000000..a7221a33b
--- /dev/null
+++ b/benchmark/BUILD
@@ -0,0 +1,24 @@
+load("@local_config_cuda//cuda:build_defs.bzl", "cuda_library")
+
+cc_binary(
+    name = "benchmark_util",
+    deps = [
+        ":benchmark_lib",
+    ],
+)
+
+cuda_library(
+    name = "benchmark_lib",
+    srcs = [
+        "merlin_hashtable_benchmark.cc.cu",
+    ],
+    hdrs = [
+        "benchmark_util.cuh",
+    ],
+    copts = ["-Iinclude/"],
+    linkopts = ["-pthread"],
+    deps = [
+        "//include:merlin_hashtable",
+        "@local_config_cuda//cuda",
+    ],
+)
diff --git a/build_deps/gpus/BUILD b/build_deps/gpus/BUILD
new file mode 100644
index 000000000..e69de29bb
diff --git a/build_deps/gpus/check_cuda_libs.py b/build_deps/gpus/check_cuda_libs.py
new file mode 100644
index 000000000..216a10e5b
--- /dev/null
+++ b/build_deps/gpus/check_cuda_libs.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""Verifies that a list of libraries is installed on the system.
+
+Takes a list of arguments where every two subsequent arguments form a logical
+tuple of (path, check_soname): the path to the library, and either True or
+False to indicate whether to check the soname field on the shared library.
+
+Example Usage:
+./check_cuda_libs.py /path/to/lib1.so True /path/to/lib2.so False
+"""
+import os
+import os.path
+import platform
+import subprocess
+import sys
+
+# pylint: disable=g-import-not-at-top,g-importing-member
+try:
+    from shutil import which
+except ImportError:
+    from distutils.spawn import find_executable as which
+# pylint: enable=g-import-not-at-top,g-importing-member
+
+
+class ConfigError(Exception):
+    pass
+
+
+def check_cuda_lib(path, check_soname=True):
+    """Tests if a library exists on disk and whether its soname matches the filename.
+
+    Args:
+        path: the path to the library.
+        check_soname: whether to check the soname as well.
+ + Raises: + ConfigError: If the library does not exist or if its soname does not match + the filename. + """ + if not os.path.isfile(path): + raise ConfigError("No library found under: " + path) + objdump = which("objdump") + if check_soname and objdump is not None: + # Decode is necessary as in py3 the return type changed from str to bytes + output = subprocess.check_output([objdump, "-p", path]).decode("utf-8") + output = [line for line in output.splitlines() if "SONAME" in line] + sonames = [line.strip().split(" ")[-1] for line in output] + if not any(soname == os.path.basename(path) for soname in sonames): + raise ConfigError("None of the libraries match their SONAME: " + + path) + + +def main(): + try: + args = [argv for argv in sys.argv[1:]] + if len(args) % 2 == 1: + raise ConfigError("Expected even number of arguments") + checked_paths = [] + for i in range(0, len(args), 2): + path = args[i] + check_cuda_lib(path, check_soname=args[i + 1] == "True") + checked_paths.append(path) + # pylint: disable=superfluous-parens + print(os.linesep.join(checked_paths)) + # pylint: enable=superfluous-parens + except ConfigError as e: + sys.stderr.write(str(e)) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/build_deps/gpus/configure.bzl b/build_deps/gpus/configure.bzl new file mode 100644 index 000000000..326096419 --- /dev/null +++ b/build_deps/gpus/configure.bzl @@ -0,0 +1,1320 @@ +"""Repository rule for CUDA autoconfiguration. + +`cuda_configure` depends on the following environment variables: + + * `NEED_CUDA`: Whether to enable building with CUDA. + * `GCC_HOST_COMPILER_PATH`: The GCC host compiler path + * `SYSROOT`: The sysroot to use when compiling. + * `CUDA_PATHS`: The base paths to look for CUDA and cuDNN. Default is + `/usr/local/cuda,usr/`. + * `CUDA_TOOLKIT_PATH` (deprecated): The path to the CUDA toolkit. Default is + `/usr/local/cuda`. + * `CUDA_VERSION`: The version of the CUDA toolkit. If this is blank, then + use the system default. + * `CUDNN_VERSION`: The version of the cuDNN library. + * `CUDNN_INSTALL_PATH` (deprecated): The path to the cuDNN library. Default is + `/usr/local/cuda`. + * `CUDA_COMPUTE_CAPABILITIES`: The CUDA compute capabilities. Default is + `3.5,5.2`. 
+ * `PYTHON_BIN_PATH`: The python binary path +""" + +load( + "@bazel_tools//tools/cpp:lib_cc_configure.bzl", + "escape_string", + "get_env_var", +) +load( + "//build_deps/remote_config:common.bzl", + "config_repo_label", + "err_out", + "execute", + "get_bash_bin", + "get_cpu_value", + "get_host_environ", + "get_python_bin", + "raw_exec", + "read_dir", + "realpath", + "which", +) + +_GCC_HOST_COMPILER_PATH = "GCC_HOST_COMPILER_PATH" +_GCC_HOST_COMPILER_PREFIX = "GCC_HOST_COMPILER_PREFIX" +_SYSROOT = "SYSROOT" +_CUDA_TOOLKIT_PATH = "CUDA_TOOLKIT_PATH" +_CUDA_VERSION = "CUDA_VERSION" +_CUDNN_VERSION = "CUDNN_VERSION" +_CUDNN_INSTALL_PATH = "CUDNN_INSTALL_PATH" +_CUDA_COMPUTE_CAPABILITIES = "CUDA_COMPUTE_CAPABILITIES" +_CUDA_CONFIG_REPO = "CUDA_CONFIG_REPO" +_PYTHON_BIN_PATH = "PYTHON_BIN_PATH" + +_TENSORRT_VERSION = "TENSORRT_VERSION" +_TENSORRT_INSTALL_PATH = "TENSORRT_INSTALL_PATH" +_TENSORRT_STATIC_PATH = "TENSORRT_STATIC_PATH" +_TENSORRT_LIBS = [ + "nvinfer", + "nvinfer_plugin", + "nvonnxparser", + "nvparsers", +] +_TENSORRT_HEADERS = [ + "NvInfer.h", + "NvUtils.h", + "NvInferPlugin.h", +] +_TENSORRT_HEADERS_V6 = [ + "NvInfer.h", + "NvUtils.h", + "NvInferPlugin.h", + "NvInferVersion.h", + "NvInferRuntime.h", + "NvInferRuntimeCommon.h", + "NvInferPluginUtils.h", + "NvOnnxParser.h", + "NvOnnxConfig.h", +] +_TENSORRT_HEADERS_V8 = [ + "NvInfer.h", + "NvInferLegacyDims.h", + "NvInferImpl.h", + "NvUtils.h", + "NvInferPlugin.h", + "NvInferVersion.h", + "NvInferRuntime.h", + "NvInferRuntimeCommon.h", + "NvInferPluginUtils.h", + "NvOnnxParser.h", + "NvOnnxConfig.h", +] + +def _at_least_version(actual_version, required_version): + actual = [int(v) for v in actual_version.split(".")] + required = [int(v) for v in required_version.split(".")] + return actual >= required + +def _get_tensorrt_headers(tensorrt_version): + if _at_least_version(tensorrt_version, "8"): + return _TENSORRT_HEADERS_V8 + if _at_least_version(tensorrt_version, "6"): + return _TENSORRT_HEADERS_V6 + return _TENSORRT_HEADERS + +def to_list_of_strings(elements): + """Convert the list of ["a", "b", "c"] into '"a", "b", "c"'. + + This is to be used to put a list of strings into the bzl file templates + so it gets interpreted as list of strings in Starlark. + + Args: + elements: list of string elements + + Returns: + single string of elements wrapped in quotes separated by a comma.""" + quoted_strings = ["\"" + element + "\"" for element in elements] + return ", ".join(quoted_strings) + +def verify_build_defines(params): + """Verify all variables that crosstool/BUILD.tpl expects are substituted. + + Args: + params: dict of variables that will be passed to the BUILD.tpl template. + """ + missing = [] + for param in [ + "cxx_builtin_include_directories", + "extra_no_canonical_prefixes_flags", + "host_compiler_path", + "host_compiler_prefix", + "host_compiler_warnings", + "linker_bin_path", + "compiler_deps", + "unfiltered_compile_flags", + ]: + if ("%{" + param + "}") not in params: + missing.append(param) + + if missing: + auto_configure_fail( + "BUILD.tpl template is missing these variables: " + str(missing) + + ".\nWe only got: " + str(params) + ".", + ) + +# TODO(dzc): Once these functions have been factored out of Bazel's +# cc_configure.bzl, load them from @bazel_tools instead. +# BEGIN cc_configure common functions. 
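The configure.bzl helpers introduced above (`_at_least_version`, `to_list_of_strings`) are easiest to read with concrete values. The snippet below is an editor's illustrative sketch only, not part of the patch; it restates those two helpers in plain Python so their behavior can be checked outside Bazel.

```python
# Illustrative sketch only -- restates two helpers from
# build_deps/gpus/configure.bzl above; not part of the patch.

def _at_least_version(actual_version, required_version):
    # Component-wise numeric comparison, e.g. "8.4.1" >= "8".
    actual = [int(v) for v in actual_version.split(".")]
    required = [int(v) for v in required_version.split(".")]
    return actual >= required

def to_list_of_strings(elements):
    # Quote each element and join with ", " for splicing into *.tpl templates.
    return ", ".join(['"' + element + '"' for element in elements])

print(_at_least_version("8.4.1", "8"))   # True  -> selects _TENSORRT_HEADERS_V8
print(_at_least_version("5.1.5", "6"))   # False -> falls back to _TENSORRT_HEADERS
print(to_list_of_strings(["nvinfer", "nvparsers"]))  # "nvinfer", "nvparsers"
```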
+def find_cc(repository_ctx): + """Find the C++ compiler.""" + target_cc_name = "gcc" + cc_path_envvar = _GCC_HOST_COMPILER_PATH + cc_name = target_cc_name + + cc_name_from_env = get_host_environ(repository_ctx, cc_path_envvar) + if cc_name_from_env: + cc_name = cc_name_from_env + if cc_name.startswith("/"): + # Absolute path, maybe we should make this supported by our which function. + return cc_name + cc = which(repository_ctx, cc_name) + if cc == None: + fail(("Cannot find {}, either correct your path or set the {}" + + " environment variable").format(target_cc_name, cc_path_envvar)) + return cc + +_INC_DIR_MARKER_BEGIN = "#include <...>" + +# OSX add " (framework directory)" at the end of line, strip it. +_OSX_FRAMEWORK_SUFFIX = " (framework directory)" +_OSX_FRAMEWORK_SUFFIX_LEN = len(_OSX_FRAMEWORK_SUFFIX) + +def _cxx_inc_convert(path): + """Convert path returned by cc -E xc++ in a complete path.""" + path = path.strip() + if path.endswith(_OSX_FRAMEWORK_SUFFIX): + path = path[:-_OSX_FRAMEWORK_SUFFIX_LEN].strip() + return path + +def _normalize_include_path(repository_ctx, path): + """Normalizes include paths before writing them to the crosstool. + + If path points inside the 'crosstool' folder of the repository, a relative + path is returned. + If path points outside the 'crosstool' folder, an absolute path is returned. + """ + path = str(repository_ctx.path(path)) + crosstool_folder = str(repository_ctx.path(".").get_child("crosstool")) + + if path.startswith(crosstool_folder): + # We drop the path to "$REPO/crosstool" and a trailing path separator. + return path[len(crosstool_folder) + 1:] + return path + +def _is_compiler_option_supported(repository_ctx, cc, option): + """Checks that `option` is supported by the C compiler. Doesn't %-escape the option.""" + result = repository_ctx.execute([ + cc, + option, + "-o", + "/dev/null", + "-c", + str(repository_ctx.path("tools/cpp/empty.cc")), + ]) + return result.stderr.find(option) == -1 + +def _get_cxx_inc_directories_impl(repository_ctx, cc, lang_is_cpp, tf_sysroot): + """Compute the list of default C or C++ include directories.""" + if lang_is_cpp: + lang = "c++" + else: + lang = "c" + sysroot = [] + if tf_sysroot: + sysroot += ["--sysroot", tf_sysroot] + result = raw_exec( + repository_ctx, + [cc, "-E", "-x" + lang, "-", "-v"] + sysroot, + ) + stderr = err_out(result) + index1 = stderr.find(_INC_DIR_MARKER_BEGIN) + if index1 == -1: + return [] + index1 = stderr.find("\n", index1) + if index1 == -1: + return [] + index2 = stderr.rfind("\n ") + if index2 == -1 or index2 < index1: + return [] + index2 = stderr.find("\n", index2 + 1) + if index2 == -1: + inc_dirs = stderr[index1 + 1:] + else: + inc_dirs = stderr[index1 + 1:index2].strip() + + print_resource_dir_supported = _is_compiler_option_supported( + repository_ctx, + cc, + "-print-resource-dir", + ) + + if print_resource_dir_supported: + resource_dir = repository_ctx.execute( + [cc, "-print-resource-dir"], + ).stdout.strip() + "/share" + inc_dirs += "\n" + resource_dir + + return [ + _normalize_include_path(repository_ctx, _cxx_inc_convert(p)) + for p in inc_dirs.split("\n") + ] + +def get_cxx_inc_directories(repository_ctx, cc, tf_sysroot): + """Compute the list of default C and C++ include directories.""" + + includes_cpp = _get_cxx_inc_directories_impl( + repository_ctx, + cc, + True, + tf_sysroot, + ) + includes_c = _get_cxx_inc_directories_impl( + repository_ctx, + cc, + False, + tf_sysroot, + ) + + return includes_cpp + [ + inc + for inc in includes_c + if inc not in 
includes_cpp + ] + +def auto_configure_fail(msg): + """Output failure message when cuda configuration fails.""" + red = "\033[0;31m" + no_color = "\033[0m" + fail("\n%sCuda Configuration Error:%s %s\n" % (red, no_color, msg)) + +# END cc_configure common functions (see TODO above). + +def _cuda_include_path(repository_ctx, cuda_config): + """Generates the Starlark string with cuda include directories. + + Args: + repository_ctx: The repository context. + cc: The path to the gcc host compiler. + + Returns: + A list of the gcc host compiler include directories. + """ + nvcc_path = repository_ctx.path("%s/bin/nvcc%s" % ( + cuda_config.cuda_toolkit_path, + ".exe" if cuda_config.cpu_value == "Windows" else "", + )) + + # The expected exit code of this command is non-zero. Bazel remote execution + # only caches commands with zero exit code. So force a zero exit code. + cmd = "%s -v /dev/null -o /dev/null ; [ $? -eq 1 ]" % str(nvcc_path) + result = raw_exec( + repository_ctx, + [get_bash_bin(repository_ctx), "-c", cmd], + ) + target_dir = "" + for one_line in err_out(result).splitlines(): + if one_line.startswith("#$ _TARGET_DIR_="): + target_dir = (cuda_config.cuda_toolkit_path + "/" + + one_line.replace( + "#$ _TARGET_DIR_=", + "", + ) + "/include") + inc_entries = [] + if target_dir != "": + inc_entries.append(realpath(repository_ctx, target_dir)) + inc_entries.append( + realpath(repository_ctx, cuda_config.cuda_toolkit_path + "/include"), + ) + return inc_entries + +def matches_version(environ_version, detected_version): + """Checks whether the user-specified version matches the detected version. + + This function performs a weak matching so that if the user specifies only + the + major or major and minor versions, the versions are still considered + matching + if the version parts match. To illustrate: + + environ_version detected_version result + ----------------------------------------- + 5.1.3 5.1.3 True + 5.1 5.1.3 True + 5 5.1 True + 5.1.3 5.1 False + 5.2.3 5.1.3 False + + Args: + environ_version: The version specified by the user via environment + variables. + detected_version: The version autodetected from the CUDA installation on + the system. + Returns: True if user-specified version matches detected version and False + otherwise. + """ + environ_version_parts = environ_version.split(".") + detected_version_parts = detected_version.split(".") + if len(detected_version_parts) < len(environ_version_parts): + return False + for i, part in enumerate(detected_version_parts): + if i >= len(environ_version_parts): + break + if part != environ_version_parts[i]: + return False + return True + +_NVCC_VERSION_PREFIX = "Cuda compilation tools, release " + +_DEFINE_CUDNN_MAJOR = "#define CUDNN_MAJOR" + +def compute_capabilities(repository_ctx): + """Returns a list of strings representing cuda compute capabilities. + + Args: + repository_ctx: the repo rule's context. + Returns: list of cuda architectures to compile for. 'compute_xy' refers to + both PTX and SASS, 'sm_xy' refers to SASS only. + """ + capabilities = get_host_environ( + repository_ctx, + _CUDA_COMPUTE_CAPABILITIES, + "compute_35,compute_52", + ).split(",") + + # Map old 'x.y' capabilities to 'compute_xy'. + if len(capabilities) > 0 and all( + [len(x.split(".")) == 2 for x in capabilities], + ): + # If all capabilities are in 'x.y' format, only include PTX for the + # highest capability. 
+ cc_list = sorted([x.replace(".", "") for x in capabilities]) + capabilities = [ + "sm_%s" % x + for x in cc_list[:-1] + ] + ["compute_%s" % cc_list[-1]] + for i, capability in enumerate(capabilities): + parts = capability.split(".") + if len(parts) != 2: + continue + capabilities[i] = "compute_%s%s" % (parts[0], parts[1]) + + # Make list unique + capabilities = dict(zip(capabilities, capabilities)).keys() + + # Validate capabilities. + for capability in capabilities: + if not capability.startswith(("compute_", "sm_")): + auto_configure_fail("Invalid compute capability: %s" % capability) + for prefix in ["compute_", "sm_"]: + if not capability.startswith(prefix): + continue + if len(capability) == len(prefix) + 2 and capability[-2:].isdigit( + ): + continue + auto_configure_fail("Invalid compute capability: %s" % capability) + + return capabilities + +def lib_name(base_name, cpu_value, version = None, static = False): + """Constructs the platform-specific name of a library. + + Args: + base_name: The name of the library, such as "cudart" + cpu_value: The name of the host operating system. + version: The version of the library. + static: True the library is static or False if it is a shared object. + + Returns: + The platform-specific name of the library. + """ + version = "" if not version else "." + version + if cpu_value in ("Linux", "FreeBSD"): + if static: + return "lib%s.a" % base_name + return "lib%s.so%s" % (base_name, version) + elif cpu_value == "Windows": + return "%s.lib" % base_name + elif cpu_value == "Darwin": + if static: + return "lib%s.a" % base_name + return "lib%s%s.dylib" % (base_name, version) + else: + auto_configure_fail("Invalid cpu_value: %s" % cpu_value) + +def _lib_path(lib, cpu_value, basedir, version, static): + file_name = lib_name(lib, cpu_value, version, static) + return "%s/%s" % (basedir, file_name) + +def _should_check_soname(version, static): + return version and not static + +def _check_cuda_lib_params(lib, cpu_value, basedir, version, static = False): + return ( + _lib_path(lib, cpu_value, basedir, version, static), + _should_check_soname(version, static), + ) + +def _check_cuda_libs(repository_ctx, script_path, libs): + python_bin = get_python_bin(repository_ctx) + contents = repository_ctx.read(script_path).splitlines() + + cmd = "from os import linesep;" + cmd += "f = open('script.py', 'w');" + for line in contents: + cmd += "f.write('%s' + linesep);" % line + cmd += "f.close();" + cmd += "from os import system;" + args = " ".join(["\"" + path + "\" " + str(check) for path, check in libs]) + cmd += "system('%s script.py %s');" % (python_bin, args) + + all_paths = [path for path, _ in libs] + checked_paths = execute( + repository_ctx, + [python_bin, "-c", cmd], + ).stdout.splitlines() + + # Filter out empty lines from splitting on '\r\n' on Windows + checked_paths = [path for path in checked_paths if len(path) > 0] + if all_paths != checked_paths: + auto_configure_fail( + "Error with installed CUDA libs. Expected '%s'. Actual '%s'." % + (all_paths, checked_paths), + ) + +def _find_libs(repository_ctx, check_cuda_libs_script, cuda_config): + """Returns the CUDA and cuDNN libraries on the system. + + Also, verifies that the script actually exist. + + Args: + repository_ctx: The repository context. + check_cuda_libs_script: The path to a script verifying that the cuda + libraries exist on the system. + cuda_config: The CUDA config as returned by _get_cuda_config + + Returns: + Map of library names to structs of filename and path. 
+ """ + cpu_value = cuda_config.cpu_value + stub_dir = "/stubs" + + check_cuda_libs_params = { + "cuda": _check_cuda_lib_params( + "cuda", + cpu_value, + cuda_config.config["cuda_library_dir"] + stub_dir, + version = None, + static = False, + ), + "cudart": _check_cuda_lib_params( + "cudart", + cpu_value, + cuda_config.config["cuda_library_dir"], + cuda_config.cudart_version, + static = False, + ), + "cudart_static": _check_cuda_lib_params( + "cudart_static", + cpu_value, + cuda_config.config["cuda_library_dir"], + cuda_config.cudart_version, + static = True, + ), + "cublas": _check_cuda_lib_params( + "cublas", + cpu_value, + cuda_config.config["cublas_library_dir"], + cuda_config.cublas_version, + static = False, + ), + "cublasLt": _check_cuda_lib_params( + "cublasLt", + cpu_value, + cuda_config.config["cublas_library_dir"], + cuda_config.cublas_version, + static = False, + ), + "cusolver": _check_cuda_lib_params( + "cusolver", + cpu_value, + cuda_config.config["cusolver_library_dir"], + cuda_config.cusolver_version, + static = False, + ), + "curand": _check_cuda_lib_params( + "curand", + cpu_value, + cuda_config.config["curand_library_dir"], + cuda_config.curand_version, + static = False, + ), + "cufft": _check_cuda_lib_params( + "cufft", + cpu_value, + cuda_config.config["cufft_library_dir"], + cuda_config.cufft_version, + static = False, + ), + "cudnn": _check_cuda_lib_params( + "cudnn", + cpu_value, + cuda_config.config["cudnn_library_dir"], + cuda_config.cudnn_version, + static = False, + ), + "cupti": _check_cuda_lib_params( + "cupti", + cpu_value, + cuda_config.config["cupti_library_dir"], + cuda_config.cupti_version, + static = False, + ), + "cusparse": _check_cuda_lib_params( + "cusparse", + cpu_value, + cuda_config.config["cusparse_library_dir"], + cuda_config.cusparse_version, + static = False, + ), + } + + # Verify that the libs actually exist at their locations. + _check_cuda_libs( + repository_ctx, + check_cuda_libs_script, + check_cuda_libs_params.values(), + ) + + paths = { + filename: v[0] + for (filename, v) in check_cuda_libs_params.items() + } + return paths + +def _cudart_static_linkopt(cpu_value): + """Returns additional platform-specific linkopts for cudart.""" + return "" if cpu_value == "Darwin" else "\"-lrt\"," + +def _exec_find_cuda_config(repository_ctx, script_path, cuda_libraries): + python_bin = get_python_bin(repository_ctx) + cmd = "from os import system;" + "system('\"%s\" %s %s');" % ( + python_bin, + script_path, + " ".join(cuda_libraries), + ) + return execute(repository_ctx, [python_bin, "-c", cmd]) + +# TODO(csigg): Only call once instead of from here, tensorrt_configure.bzl, +# and nccl_configure.bzl. +def find_cuda_config(repository_ctx, script_path, cuda_libraries): + """Returns CUDA config dictionary from running find_cuda_config.py""" + exec_result = _exec_find_cuda_config( + repository_ctx, + script_path, + cuda_libraries, + ) + + if exec_result.return_code: + auto_configure_fail("Failed to run find_cuda_config.py: %s" % + err_out(exec_result)) + + # Parse the dict from stdout. + return dict( + [tuple(x.split(": ")) for x in exec_result.stdout.splitlines()], + ) + +def _get_cuda_config(repository_ctx, find_cuda_config_script): + """Detects and returns information about the CUDA installation on the system. + + Args: + repository_ctx: The repository context. + + Returns: + A struct containing the following fields: + cuda_toolkit_path: The CUDA toolkit installation directory. + cudnn_install_basedir: The cuDNN installation directory. 
+ cuda_version: The version of CUDA on the system. + cudart_version: The CUDA runtime version on the system. + cudnn_version: The version of cuDNN on the system. + compute_capabilities: A list of the system's CUDA compute capabilities. + cpu_value: The name of the host operating system. + """ + config = find_cuda_config( + repository_ctx, + find_cuda_config_script, + ["cuda", "cudnn"], + ) + + cpu_value = get_cpu_value(repository_ctx) + toolkit_path = config["cuda_toolkit_path"] + + cuda_version = config["cuda_version"].split(".") + cuda_major = cuda_version[0] + cuda_minor = cuda_version[1] + + cuda_version = "%s.%s" % (cuda_major, cuda_minor) + cudnn_version = "%s" % config["cudnn_version"] + + if int(cuda_major) >= 11: + # The libcudart soname in CUDA 11.x is versioned as 11.0 for backward compatability. + if int(cuda_major) == 11: + cudart_version = "11.0" + cupti_version = cuda_version + else: + cudart_version = ("%s") % cuda_major + cupti_version = cudart_version + cublas_version = ("%s") % config["cublas_version"].split(".")[0] + cusolver_version = ("%s") % config["cusolver_version"].split(".")[0] + curand_version = ("%s") % config["curand_version"].split(".")[0] + cufft_version = ("%s") % config["cufft_version"].split(".")[0] + cusparse_version = ("%s") % config["cusparse_version"].split(".")[0] + elif (int(cuda_major), int(cuda_minor)) >= (10, 1): + # cuda_lib_version is for libraries like cuBLAS, cuFFT, cuSOLVER, etc. + # It changed from 'x.y' to just 'x' in CUDA 10.1. + cuda_lib_version = ("%s") % cuda_major + cudart_version = cuda_version + cupti_version = cuda_version + cublas_version = cuda_lib_version + cusolver_version = cuda_lib_version + curand_version = cuda_lib_version + cufft_version = cuda_lib_version + cusparse_version = cuda_lib_version + else: + cudart_version = cuda_version + cupti_version = cuda_version + cublas_version = cuda_version + cusolver_version = cuda_version + curand_version = cuda_version + cufft_version = cuda_version + cusparse_version = cuda_version + + return struct( + cuda_toolkit_path = toolkit_path, + cuda_version = cuda_version, + cupti_version = cupti_version, + cuda_version_major = cuda_major, + cudart_version = cudart_version, + cublas_version = cublas_version, + cusolver_version = cusolver_version, + curand_version = curand_version, + cufft_version = cufft_version, + cusparse_version = cusparse_version, + cudnn_version = cudnn_version, + compute_capabilities = compute_capabilities(repository_ctx), + cpu_value = cpu_value, + config = config, + ) + +def _tpl(repository_ctx, tpl, substitutions = {}, out = None): + if not out: + out = tpl.replace(":", "/") + repository_ctx.template( + out, + Label("//build_deps/gpus/%s.tpl" % tpl), + substitutions, + ) + +def _file(repository_ctx, label): + repository_ctx.template( + label.replace(":", "/"), + Label("//build_deps/gpus/%s.tpl" % label), + {}, + ) + +_DUMMY_CROSSTOOL_BZL_FILE = """ +def error_gpu_disabled(): + fail("ERROR: Building with --config=cuda but TensorFlow is not configured " + + "to build with GPU support. Please re-run ./configure and enter 'Y' " + + "at the prompt to build with GPU support.") + + native.genrule( + name = "error_gen_crosstool", + outs = ["CROSSTOOL"], + cmd = "echo 'Should not be run.' 
&& exit 1", + ) + + native.filegroup( + name = "crosstool", + srcs = [":CROSSTOOL"], + output_licenses = ["unencumbered"], + ) +""" + +_DUMMY_CROSSTOOL_BUILD_FILE = """ +load("//crosstool:error_gpu_disabled.bzl", "error_gpu_disabled") + +error_gpu_disabled() +""" + +def _norm_path(path): + """Returns a path with '/' and remove the trailing slash.""" + path = path.replace("\\", "/") + if path[-1] == "/": + path = path[:-1] + return path + +def make_copy_files_rule(repository_ctx, name, srcs, outs): + """Returns a rule to copy a set of files.""" + cmds = [] + + # Copy files. + for src, out in zip(srcs, outs): + cmds.append('cp -f "%s" "$(location %s)"' % (src, out)) + outs = [(' "%s",' % out) for out in outs] + return """genrule( + name = "%s", + outs = [ +%s + ], + cmd = \"""%s \""", +)""" % (name, "\n".join(outs), " && \\\n".join(cmds)) + +def make_copy_dir_rule( + repository_ctx, + name, + src_dir, + out_dir, + exceptions = None): + """Returns a rule to recursively copy a directory. + If exceptions is not None, it must be a list of files or directories in + 'src_dir'; these will be excluded from copying. + """ + src_dir = _norm_path(src_dir) + out_dir = _norm_path(out_dir) + outs = read_dir(repository_ctx, src_dir) + post_cmd = "" + if exceptions != None: + outs = [ + x + for x in outs + if not any([x.startswith(src_dir + "/" + y) for y in exceptions]) + ] + outs = [(' "%s",' % out.replace(src_dir, out_dir)) for out in outs] + + # '@D' already contains the relative path for a single file, see + # http://docs.bazel.build/versions/master/be/make-variables.html#predefined_genrule_variables + out_dir = "$(@D)/%s" % out_dir if len(outs) > 1 else "$(@D)" + if exceptions != None: + for x in exceptions: + post_cmd += " ; rm -fR " + out_dir + "/" + x + return """genrule( + name = "%s", + outs = [ +%s + ], + cmd = \"""cp -rLf "%s/." "%s/" %s\""", +)""" % (name, "\n".join(outs), src_dir, out_dir, post_cmd) + +def _flag_enabled(repository_ctx, flag_name): + return get_host_environ(repository_ctx, flag_name) == "1" + +def _tf_sysroot(repository_ctx): + return get_host_environ(repository_ctx, _SYSROOT, "") + +def _compute_cuda_extra_copts(repository_ctx, compute_capabilities): + copts = [] + for capability in compute_capabilities: + if capability.startswith("compute_"): + capability = capability.replace("compute_", "sm_") + copts.append("--cuda-include-ptx=%s" % capability) + copts.append("--cuda-gpu-arch=%s" % capability) + + return str(copts) + +def _tpl_path(repository_ctx, filename): + return repository_ctx.path(Label("//build_deps/gpus/%s.tpl" % filename)) + +def _basename(repository_ctx, path_str): + """Returns the basename of a path of type string. 
+ """ + + num_chars = len(path_str) + for i in range(num_chars): + r_i = num_chars - 1 - i + if path_str[r_i] == "/": + return path_str[r_i + 1:] + return path_str + +def _create_local_cuda_repository(repository_ctx): + """Creates the repository containing files set up to build with CUDA.""" + tpl_paths = { + filename: _tpl_path(repository_ctx, filename) + for filename in [ + "cuda:build_defs.bzl", + "crosstool:crosstool_compiler_wrapper", + "crosstool:BUILD", + "crosstool:cc_toolchain_config.bzl", + "cuda:cuda_config.h", + "cuda:cuda_config.py", + ] + } + tpl_paths["cuda:BUILD"] = _tpl_path(repository_ctx, "cuda:BUILD") + find_cuda_config_script = repository_ctx.path( + Label("//build_deps/gpus:find_cuda_config.py"), + ) + + cuda_config = _get_cuda_config(repository_ctx, find_cuda_config_script) + + cuda_include_path = cuda_config.config["cuda_include_dir"] + cublas_include_path = cuda_config.config["cublas_include_dir"] + cudnn_header_dir = cuda_config.config["cudnn_include_dir"] + cupti_header_dir = cuda_config.config["cupti_include_dir"] + nvvm_libdevice_dir = cuda_config.config["nvvm_library_dir"] + + # Create genrule to copy files from the installed CUDA toolkit into execroot. + copy_rules = [ + make_copy_dir_rule( + repository_ctx, + name = "cuda-include", + src_dir = cuda_include_path, + out_dir = "cuda/include", + ), + make_copy_dir_rule( + repository_ctx, + name = "cuda-nvvm", + src_dir = nvvm_libdevice_dir, + out_dir = "cuda/nvvm/libdevice", + ), + make_copy_dir_rule( + repository_ctx, + name = "cuda-extras", + src_dir = cupti_header_dir, + out_dir = "cuda/extras/CUPTI/include", + ), + ] + + copy_rules.append( + make_copy_files_rule( + repository_ctx, + name = "cublas-include", + srcs = [ + cublas_include_path + "/cublas.h", + cublas_include_path + "/cublas_v2.h", + cublas_include_path + "/cublas_api.h", + cublas_include_path + "/cublasLt.h", + ], + outs = [ + "cublas/include/cublas.h", + "cublas/include/cublas_v2.h", + "cublas/include/cublas_api.h", + "cublas/include/cublasLt.h", + ], + ), + ) + + cusolver_include_path = cuda_config.config["cusolver_include_dir"] + copy_rules.append( + make_copy_files_rule( + repository_ctx, + name = "cusolver-include", + srcs = [ + cusolver_include_path + "/cusolver_common.h", + cusolver_include_path + "/cusolverDn.h", + ], + outs = [ + "cusolver/include/cusolver_common.h", + "cusolver/include/cusolverDn.h", + ], + ), + ) + + cufft_include_path = cuda_config.config["cufft_include_dir"] + copy_rules.append( + make_copy_files_rule( + repository_ctx, + name = "cufft-include", + srcs = [ + cufft_include_path + "/cufft.h", + ], + outs = [ + "cufft/include/cufft.h", + ], + ), + ) + + cusparse_include_path = cuda_config.config["cusparse_include_dir"] + copy_rules.append( + make_copy_files_rule( + repository_ctx, + name = "cusparse-include", + srcs = [ + cusparse_include_path + "/cusparse.h", + ], + outs = [ + "cusparse/include/cusparse.h", + ], + ), + ) + + curand_include_path = cuda_config.config["curand_include_dir"] + copy_rules.append( + make_copy_files_rule( + repository_ctx, + name = "curand-include", + srcs = [ + curand_include_path + "/curand.h", + ], + outs = [ + "curand/include/curand.h", + ], + ), + ) + + check_cuda_libs_script = repository_ctx.path( + Label("//build_deps/gpus:check_cuda_libs.py"), + ) + cuda_libs = _find_libs(repository_ctx, check_cuda_libs_script, cuda_config) + cuda_lib_srcs = [] + cuda_lib_outs = [] + for path in cuda_libs.values(): + cuda_lib_srcs.append(path) + cuda_lib_outs.append("cuda/lib/" + 
_basename(repository_ctx, path)) + copy_rules.append( + make_copy_files_rule( + repository_ctx, + name = "cuda-lib", + srcs = cuda_lib_srcs, + outs = cuda_lib_outs, + ), + ) + + file_ext = "" + bin_files = ( + ["crt/link.stub"] + + [f + file_ext for f in ["bin2c", "fatbinary", "nvlink", "nvprune"]] + ) + copy_rules.append( + make_copy_files_rule( + repository_ctx, + name = "cuda-bin", + srcs = [ + cuda_config.cuda_toolkit_path + "/bin/" + f + for f in bin_files + ], + outs = ["cuda/bin/" + f for f in bin_files], + ), + ) + + # Select the headers based on the cuDNN version (strip '64_' for Windows). + cudnn_headers = ["cudnn.h"] + if cuda_config.cudnn_version.rsplit("_", 1)[-1] >= "8": + cudnn_headers += [ + "cudnn_backend.h", + "cudnn_adv_infer.h", + "cudnn_adv_train.h", + "cudnn_cnn_infer.h", + "cudnn_cnn_train.h", + "cudnn_ops_infer.h", + "cudnn_ops_train.h", + "cudnn_version.h", + ] + + cudnn_srcs = [] + cudnn_outs = [] + for header in cudnn_headers: + cudnn_srcs.append(cudnn_header_dir + "/" + header) + cudnn_outs.append("cudnn/include/" + header) + + copy_rules.append( + make_copy_files_rule( + repository_ctx, + name = "cudnn-include", + srcs = cudnn_srcs, + outs = cudnn_outs, + ), + ) + + # Set up BUILD file for cuda/ + repository_ctx.template( + "cuda/build_defs.bzl", + tpl_paths["cuda:build_defs.bzl"], + { + "%{cuda_is_configured}": "True", + "%{cuda_extra_copts}": _compute_cuda_extra_copts( + repository_ctx, + cuda_config.compute_capabilities, + ), + "%{cuda_gpu_architectures}": str(cuda_config.compute_capabilities), + }, + ) + + cub_actual = "@cub_archive//:cub" + if int(cuda_config.cuda_version_major) >= 11: + cub_actual = ":cuda_headers" + + repository_ctx.template( + "cuda/BUILD", + tpl_paths["cuda:BUILD"], + { + "%{cuda_driver_lib}": _basename(repository_ctx, cuda_libs["cuda"]), + "%{cudart_static_lib}": _basename(repository_ctx, cuda_libs["cudart_static"]), + "%{cudart_static_linkopt}": _cudart_static_linkopt(cuda_config.cpu_value), + "%{cudart_lib}": _basename(repository_ctx, cuda_libs["cudart"]), + "%{cublas_lib}": _basename(repository_ctx, cuda_libs["cublas"]), + "%{cublasLt_lib}": _basename(repository_ctx, cuda_libs["cublasLt"]), + "%{cusolver_lib}": _basename(repository_ctx, cuda_libs["cusolver"]), + "%{cudnn_lib}": _basename(repository_ctx, cuda_libs["cudnn"]), + "%{cufft_lib}": _basename(repository_ctx, cuda_libs["cufft"]), + "%{curand_lib}": _basename(repository_ctx, cuda_libs["curand"]), + "%{cupti_lib}": _basename(repository_ctx, cuda_libs["cupti"]), + "%{cusparse_lib}": _basename(repository_ctx, cuda_libs["cusparse"]), + "%{cub_actual}": cub_actual, + "%{copy_rules}": "\n".join(copy_rules), + }, + ) + + tf_sysroot = _tf_sysroot(repository_ctx) + + # Set up crosstool/ + cc = find_cc(repository_ctx) + cc_fullpath = cc + + host_compiler_includes = get_cxx_inc_directories( + repository_ctx, + cc_fullpath, + tf_sysroot, + ) + cuda_defines = {} + cuda_defines["%{builtin_sysroot}"] = tf_sysroot + cuda_defines["%{cuda_toolkit_path}"] = "" + cuda_defines["%{compiler}"] = "unknown" + + host_compiler_prefix = get_host_environ( + repository_ctx, + _GCC_HOST_COMPILER_PREFIX, + ) + if not host_compiler_prefix: + host_compiler_prefix = "/usr/bin" + + cuda_defines["%{host_compiler_prefix}"] = host_compiler_prefix + cuda_defines["%{linker_bin_path}"] = host_compiler_prefix + cuda_defines["%{extra_no_canonical_prefixes_flags}"] = "" + cuda_defines["%{unfiltered_compile_flags}"] = "" + + cuda_defines["%{host_compiler_path}"] = "crosstool_compiler_wrapper" + 
cuda_defines["%{host_compiler_warnings}"] = "" + + # nvcc has the system include paths built in and will automatically + # search them; we cannot work around that, so we add the relevant cuda + # system paths to the allowed compiler specific include paths. + cuda_defines["%{cxx_builtin_include_directories}"] = to_list_of_strings( + host_compiler_includes + _cuda_include_path( + repository_ctx, + cuda_config, + ) + [cupti_header_dir, cudnn_header_dir], + ) + + # For gcc, do not canonicalize system header paths; some versions of gcc + # pick the shortest possible path for system includes when creating the + # .d file - given that includes that are prefixed with "../" multiple + # time quickly grow longer than the root of the tree, this can lead to + # bazel's header check failing. + cuda_defines["%{extra_no_canonical_prefixes_flags}"] = "\"-fno-canonical-system-headers\"" + + file_ext = "" + nvcc_path = "%s/nvcc%s" % (cuda_config.config["cuda_binary_dir"], file_ext) + cuda_defines["%{compiler_deps}"] = ":crosstool_compiler" + + wrapper_defines = { + "%{cpu_compiler}": str(cc), + "%{cuda_version}": cuda_config.cuda_version, + "%{nvcc_path}": nvcc_path, + "%{gcc_host_compiler_path}": str(cc), + } + repository_ctx.template( + "crosstool/crosstool_compiler_wrapper", + tpl_paths["crosstool:crosstool_compiler_wrapper"], + wrapper_defines, + ) + + verify_build_defines(cuda_defines) + + # Only expand template variables in the BUILD file + repository_ctx.template( + "crosstool/BUILD", + tpl_paths["crosstool:BUILD"], + cuda_defines, + ) + + # No templating of cc_toolchain_config - use attributes and templatize the + # BUILD file. + repository_ctx.template( + "crosstool/cc_toolchain_config.bzl", + tpl_paths["crosstool:cc_toolchain_config.bzl"], + {}, + ) + + # Set up cuda_config.h + repository_ctx.template( + "cuda/cuda/cuda_config.h", + tpl_paths["cuda:cuda_config.h"], + { + "%{cuda_version}": cuda_config.cuda_version, + "%{cudart_version}": cuda_config.cudart_version, + "%{cupti_version}": cuda_config.cupti_version, + "%{cublas_version}": cuda_config.cublas_version, + "%{cusolver_version}": cuda_config.cusolver_version, + "%{curand_version}": cuda_config.curand_version, + "%{cufft_version}": cuda_config.cufft_version, + "%{cusparse_version}": cuda_config.cusparse_version, + "%{cudnn_version}": cuda_config.cudnn_version, + "%{cuda_toolkit_path}": cuda_config.cuda_toolkit_path, + "%{cuda_compute_capabilities}": ", ".join( + [cc.split("_")[1] for cc in cuda_config.compute_capabilities], + ), + }, + ) + + # Set up cuda_config.py, which is used by gen_build_info to provide + # static build environment info to the API + repository_ctx.template( + "cuda/cuda/cuda_config.py", + tpl_paths["cuda:cuda_config.py"], + _py_tmpl_dict({ + "cuda_version": cuda_config.cuda_version, + "cudnn_version": cuda_config.cudnn_version, + "cuda_compute_capabilities": cuda_config.compute_capabilities, + "cpu_compiler": str(cc), + }), + ) + +def _get_tensorrt_static_path(repository_ctx): + return get_host_environ(repository_ctx, _TENSORRT_STATIC_PATH, None) + +def _create_local_tensorrt_repository(repository_ctx): + find_cuda_config_path = repository_ctx.path( + Label("//build_deps/gpus:find_cuda_config.py"), + ) + config = find_cuda_config( + repository_ctx, + find_cuda_config_path, + ["tensorrt"], + ) + tensorrt_version = config["tensorrt_version"] + cpu_value = get_cpu_value(repository_ctx) + + # Copy the library and header files + libraries = [ + lib_name(lib, cpu_value, tensorrt_version) + for lib in _TENSORRT_LIBS + ] + 
library_dir = config["tensorrt_library_dir"] + "/" + headers = _get_tensorrt_headers(tensorrt_version) + include_dir = config["tensorrt_include_dir"] + "/" + copy_rules = [ + make_copy_files_rule( + repository_ctx, + name = "tensorrt_lib", + srcs = [library_dir + library for library in libraries], + outs = ["tensorrt/lib/" + library for library in libraries], + ), + make_copy_files_rule( + repository_ctx, + name = "tensorrt_include", + srcs = [include_dir + header for header in headers], + outs = ["tensorrt/include/" + header for header in headers], + ), + ] + + tensorrt_static_path = _get_tensorrt_static_path(repository_ctx) + if tensorrt_static_path: + tensorrt_static_path = tensorrt_static_path + "/" + if _at_least_version(tensorrt_version, "8"): + raw_static_library_names = _TENSORRT_LIBS + else: + raw_static_library_names = _TENSORRT_LIBS + [ + "nvrtc", + "myelin_compiler", + "myelin_executor", + "myelin_pattern_library", + "myelin_pattern_runtime", + ] + + static_library_names = [ + "%s_static" % name + for name in raw_static_library_names + ] + static_libraries = [ + lib_name(lib, cpu_value, tensorrt_version, static = True) + for lib in static_library_names + ] + copy_rules = copy_rules + [ + make_copy_files_rule( + repository_ctx, + name = "tensorrt_static_lib", + srcs = [ + tensorrt_static_path + library + for library in static_libraries + ], + outs = [ + "tensorrt/lib/" + library + for library in static_libraries + ], + ), + ] + + tpl_paths = { + "tensorrt/build_defs.bzl": _tpl_path(repository_ctx, "tensorrt:build_defs.bzl"), + "tensorrt/BUILD": _tpl_path(repository_ctx, "tensorrt:BUILD"), + "tensorrt/tensorrt_config.h": _tpl_path(repository_ctx, "tensorrt:tensorrt_config.h"), + "tensorrt/tensorrt_config.py": _tpl_path(repository_ctx, "tensorrt:tensorrt_config.py"), + } + + # Set up config file. + repository_ctx.template( + "tensorrt/build_defs.bzl", + tpl_paths["tensorrt/build_defs.bzl"], + {"%{if_tensorrt}": "if_true"}, + ) + + # Set up BUILD file. + repository_ctx.template( + "tensorrt/BUILD", + tpl_paths["tensorrt/BUILD"], + { + "%{copy_rules}": "\n".join(copy_rules), + }, + ) + + # Set up tensorrt_config.h, which is used by + # tensorflow/stream_executor/dso_loader.cc. 
+ repository_ctx.template( + "tensorrt/tensorrt_config.h", + tpl_paths["tensorrt/tensorrt_config.h"], + {"%{tensorrt_version}": tensorrt_version}, + ) + + # Set up tensorrt_config.py, which is used by gen_build_info to provide + # build environment info to the API + repository_ctx.template( + "tensorrt/tensorrt_config.py", + tpl_paths["tensorrt/tensorrt_config.py"], + _py_tmpl_dict({ + "tensorrt_version": tensorrt_version, + }), + ) + +def _py_tmpl_dict(d): + return {"%{cuda_config}": str(d)} + +_CUDA_ENVIRONS = [ + _GCC_HOST_COMPILER_PATH, + _GCC_HOST_COMPILER_PREFIX, + "NEED_CUDA", + _CUDA_TOOLKIT_PATH, + _CUDNN_INSTALL_PATH, + _CUDA_VERSION, + _CUDNN_VERSION, + _CUDA_COMPUTE_CAPABILITIES, + "NVVMIR_LIBRARY_DIR", + _PYTHON_BIN_PATH, + "TMP", + "TMPDIR", + "CUDA_PATHS", +] + +cuda_configure = repository_rule( + implementation = _create_local_cuda_repository, + environ = _CUDA_ENVIRONS, +) + +_TENSORRT_ENVIRONS = [ + _TENSORRT_INSTALL_PATH, + _TENSORRT_VERSION, + _TENSORRT_STATIC_PATH, + "CUDA_PATHS", +] + +tensorrt_configure = repository_rule( + implementation = _create_local_tensorrt_repository, + environ = _TENSORRT_ENVIRONS, +) diff --git a/build_deps/gpus/crosstool/BUILD b/build_deps/gpus/crosstool/BUILD new file mode 100644 index 000000000..e69de29bb diff --git a/build_deps/gpus/crosstool/BUILD.tpl b/build_deps/gpus/crosstool/BUILD.tpl new file mode 100644 index 000000000..e95d18bc6 --- /dev/null +++ b/build_deps/gpus/crosstool/BUILD.tpl @@ -0,0 +1,108 @@ +# This file is expanded from a template by cuda_configure.bzl +# Update cuda_configure.bzl#verify_build_defines when adding new variables. + +load(":cc_toolchain_config.bzl", "cc_toolchain_config") + +licenses(["restricted"]) + +package(default_visibility = ["//visibility:public"]) + +toolchain( + name = "toolchain-linux-x86_64", + exec_compatible_with = [ + "@platforms//os:linux", + "@platforms//cpu:x86_64", + ], + target_compatible_with = [ + "@platforms//os:linux", + "@platforms//cpu:x86_64", + ], + toolchain = ":cc-compiler-local", + toolchain_type = "@bazel_tools//tools/cpp:toolchain_type", +) + +cc_toolchain_suite( + name = "toolchain", + toolchains = { + "local|compiler": ":cc-compiler-local", + "darwin|compiler": ":cc-compiler-darwin", + "arm": ":cc-compiler-local", + "aarch64": ":cc-compiler-local", + "k8": ":cc-compiler-local", + "piii": ":cc-compiler-local", + "ppc": ":cc-compiler-local", + "darwin": ":cc-compiler-darwin", + }, +) + +cc_toolchain( + name = "cc-compiler-local", + all_files = "%{compiler_deps}", + compiler_files = "%{compiler_deps}", + ar_files = "%{compiler_deps}", + as_files = "%{compiler_deps}", + dwp_files = ":empty", + linker_files = "%{compiler_deps}", + objcopy_files = ":empty", + strip_files = ":empty", + # To support linker flags that need to go to the start of command line + # we need the toolchain to support parameter files. Parameter files are + # last on the command line and contain all shared libraries to link, so all + # regular options will be left of them. 
+ supports_param_files = 1, + toolchain_identifier = "local_linux", + toolchain_config = ":cc-compiler-local-config", +) + +cc_toolchain_config( + name = "cc-compiler-local-config", + cpu = "local", + builtin_include_directories = [%{cxx_builtin_include_directories}], + extra_no_canonical_prefixes_flags = [%{extra_no_canonical_prefixes_flags}], + host_compiler_path = "%{host_compiler_path}", + host_compiler_prefix = "%{host_compiler_prefix}", + host_compiler_warnings = [%{host_compiler_warnings}], + host_unfiltered_compile_flags = [%{unfiltered_compile_flags}], + linker_bin_path = "%{linker_bin_path}", + builtin_sysroot = "%{builtin_sysroot}", + cuda_path = "%{cuda_toolkit_path}", + compiler = "%{compiler}", +) + +cc_toolchain( + name = "cc-compiler-darwin", + all_files = "%{compiler_deps}", + compiler_files = "%{compiler_deps}", + ar_files = "%{compiler_deps}", + as_files = "%{compiler_deps}", + dwp_files = ":empty", + linker_files = "%{compiler_deps}", + objcopy_files = ":empty", + strip_files = ":empty", + supports_param_files = 0, + toolchain_identifier = "local_darwin", + toolchain_config = ":cc-compiler-local-darwin", +) + +cc_toolchain_config( + name = "cc-compiler-local-darwin", + cpu = "darwin", + builtin_include_directories = [%{cxx_builtin_include_directories}], + extra_no_canonical_prefixes_flags = [%{extra_no_canonical_prefixes_flags}], + host_compiler_path = "%{host_compiler_path}", + host_compiler_prefix = "%{host_compiler_prefix}", + host_compiler_warnings = [%{host_compiler_warnings}], + host_unfiltered_compile_flags = [%{unfiltered_compile_flags}], + linker_bin_path = "%{linker_bin_path}", +) + + +filegroup( + name = "empty", + srcs = [], +) + +filegroup( + name = "crosstool_compiler", + srcs = ["crosstool_compiler_wrapper"], +) diff --git a/build_deps/gpus/crosstool/cc_toolchain_config.bzl.tpl b/build_deps/gpus/crosstool/cc_toolchain_config.bzl.tpl new file mode 100644 index 000000000..9c429754a --- /dev/null +++ b/build_deps/gpus/crosstool/cc_toolchain_config.bzl.tpl @@ -0,0 +1,645 @@ +"""cc_toolchain_config rule for configuring CUDA toolchains on Linux, Mac, and Windows.""" + +load( + "@bazel_tools//tools/cpp:cc_toolchain_config_lib.bzl", + "action_config", + "artifact_name_pattern", + "env_entry", + "env_set", + "feature", + "feature_set", + "flag_group", + "flag_set", + "tool", + "tool_path", + "variable_with_value", + "with_feature_set", +) +load("@bazel_tools//tools/build_defs/cc:action_names.bzl", "ACTION_NAMES") + +def all_assembly_actions(): + return [ + ACTION_NAMES.assemble, + ACTION_NAMES.preprocess_assemble, + ] + +def all_compile_actions(): + return [ + ACTION_NAMES.assemble, + ACTION_NAMES.c_compile, + ACTION_NAMES.cpp_compile, + ACTION_NAMES.cpp_header_parsing, + ACTION_NAMES.cpp_module_codegen, + ACTION_NAMES.cpp_module_compile, + ACTION_NAMES.linkstamp_compile, + ACTION_NAMES.preprocess_assemble, + ] + +def all_c_compile_actions(): + return [ + ACTION_NAMES.c_compile, + ] + +def all_cpp_compile_actions(): + return [ + ACTION_NAMES.cpp_compile, + ACTION_NAMES.cpp_header_parsing, + ACTION_NAMES.cpp_module_codegen, + ACTION_NAMES.cpp_module_compile, + ACTION_NAMES.linkstamp_compile, + ] + +def all_preprocessed_actions(): + return [ + ACTION_NAMES.c_compile, + ACTION_NAMES.cpp_compile, + ACTION_NAMES.cpp_header_parsing, + ACTION_NAMES.cpp_module_codegen, + ACTION_NAMES.cpp_module_compile, + ACTION_NAMES.linkstamp_compile, + ACTION_NAMES.preprocess_assemble, + ] + +def all_link_actions(): + return [ + ACTION_NAMES.cpp_link_executable, + 
ACTION_NAMES.cpp_link_dynamic_library, + ACTION_NAMES.cpp_link_nodeps_dynamic_library, + ] + +def all_executable_link_actions(): + return [ + ACTION_NAMES.cpp_link_executable, + ] + +def all_shared_library_link_actions(): + return [ + ACTION_NAMES.cpp_link_dynamic_library, + ACTION_NAMES.cpp_link_nodeps_dynamic_library, + ] + +def all_archive_actions(): + return [ACTION_NAMES.cpp_link_static_library] + +def all_strip_actions(): + return [ACTION_NAMES.strip] + +def _library_to_link(flag_prefix, value, iterate = None): + return flag_group( + flags = [ + "{}%{{libraries_to_link.{}}}".format( + flag_prefix, + iterate if iterate else "name", + ), + ], + iterate_over = ("libraries_to_link." + iterate if iterate else None), + expand_if_equal = variable_with_value( + name = "libraries_to_link.type", + value = value, + ), + ) + +def _surround_static_library(prefix, suffix): + return [ + flag_group( + flags = [prefix, "%{libraries_to_link.name}", suffix], + expand_if_true = "libraries_to_link.is_whole_archive", + ), + flag_group( + flags = ["%{libraries_to_link.name}"], + expand_if_false = "libraries_to_link.is_whole_archive", + ), + ] + +def _prefix_static_library(prefix): + return [ + flag_group( + flags = ["%{libraries_to_link.name}"], + expand_if_false = "libraries_to_link.is_whole_archive", + ), + flag_group( + flags = [prefix + "%{libraries_to_link.name}"], + expand_if_true = "libraries_to_link.is_whole_archive", + ), + ] + +def _static_library_to_link(alwayslink_prefix, alwayslink_suffix = None): + if alwayslink_suffix: + flag_groups = _surround_static_library(alwayslink_prefix, alwayslink_suffix) + else: + flag_groups = _prefix_static_library(alwayslink_prefix) + return flag_group( + flag_groups = flag_groups, + expand_if_equal = variable_with_value( + name = "libraries_to_link.type", + value = "static_library", + ), + ) + +def _iterate_flag_group(iterate_over, flags = [], flag_groups = []): + return flag_group( + iterate_over = iterate_over, + expand_if_available = iterate_over, + flag_groups = flag_groups, + flags = flags, + ) + +def _libraries_to_link_group(flavour): + if flavour == "linux": + return _iterate_flag_group( + iterate_over = "libraries_to_link", + flag_groups = [ + flag_group( + flags = ["-Wl,--start-lib"], + expand_if_equal = variable_with_value( + name = "libraries_to_link.type", + value = "object_file_group", + ), + ), + _library_to_link("", "object_file_group", "object_files"), + flag_group( + flags = ["-Wl,--end-lib"], + expand_if_equal = variable_with_value( + name = "libraries_to_link.type", + value = "object_file_group", + ), + ), + _library_to_link("", "object_file"), + _library_to_link("", "interface_library"), + _static_library_to_link("-Wl,-whole-archive", "-Wl,-no-whole-archive"), + _library_to_link("-l", "dynamic_library"), + _library_to_link("-l:", "versioned_dynamic_library"), + ], + ) + elif flavour == "darwin": + return _iterate_flag_group( + iterate_over = "libraries_to_link", + flag_groups = [ + _library_to_link("", "object_file_group", "object_files"), + _library_to_link("", "object_file"), + _library_to_link("", "interface_library"), + _static_library_to_link("-Wl,-force_load,"), + _library_to_link("-l", "dynamic_library"), + _library_to_link("-l:", "versioned_dynamic_library"), + ], + ) + +def _action_configs_with_tool(path, actions): + return [ + action_config( + action_name = name, + enabled = True, + tools = [tool(path = path)], + ) + for name in actions + ] + +def _action_configs(assembly_path, c_compiler_path, cc_compiler_path, archiver_path, 
linker_path, strip_path): + return _action_configs_with_tool( + assembly_path, + all_assembly_actions(), + ) + _action_configs_with_tool( + c_compiler_path, + all_c_compile_actions(), + ) + _action_configs_with_tool( + cc_compiler_path, + all_cpp_compile_actions(), + ) + _action_configs_with_tool( + archiver_path, + all_archive_actions(), + ) + _action_configs_with_tool( + linker_path, + all_link_actions(), + ) + _action_configs_with_tool( + strip_path, + all_strip_actions(), + ) + +def _tool_paths(cpu, ctx): + if cpu in ["local", "darwin"]: + return [ + tool_path(name = "gcc", path = ctx.attr.host_compiler_path), + tool_path(name = "ar", path = ctx.attr.host_compiler_prefix + ( + "/ar" if cpu == "local" else "/libtool" + )), + tool_path(name = "compat-ld", path = ctx.attr.host_compiler_prefix + "/ld"), + tool_path(name = "cpp", path = ctx.attr.host_compiler_prefix + "/cpp"), + tool_path(name = "dwp", path = ctx.attr.host_compiler_prefix + "/dwp"), + tool_path(name = "gcov", path = ctx.attr.host_compiler_prefix + "/gcov"), + tool_path(name = "ld", path = ctx.attr.host_compiler_prefix + "/ld"), + tool_path(name = "nm", path = ctx.attr.host_compiler_prefix + "/nm"), + tool_path(name = "objcopy", path = ctx.attr.host_compiler_prefix + "/objcopy"), + tool_path(name = "objdump", path = ctx.attr.host_compiler_prefix + "/objdump"), + tool_path(name = "strip", path = ctx.attr.host_compiler_prefix + "/strip"), + ] + else: + fail("Unreachable") + +def _sysroot_group(): + return flag_group( + flags = ["--sysroot=%{sysroot}"], + expand_if_available = "sysroot", + ) + +def _no_canonical_prefixes_group(extra_flags): + return flag_group( + flags = [ + "-no-canonical-prefixes", + ] + extra_flags, + ) + +def _cuda_set(cuda_path, actions): + if cuda_path: + return [flag_set( + actions = actions, + flag_groups = [ + flag_group( + flags = ["--cuda-path=" + cuda_path], + ), + ], + )] + else: + return [] + +def _nologo(): + return flag_group(flags = ["/nologo"]) + +def _features(cpu, compiler, ctx): + if cpu in ["local", "darwin"]: + return [ + feature(name = "no_legacy_features"), + feature( + name = "all_compile_flags", + enabled = True, + flag_sets = [ + flag_set( + actions = all_compile_actions(), + flag_groups = [ + flag_group( + flags = ["-MD", "-MF", "%{dependency_file}"], + expand_if_available = "dependency_file", + ), + flag_group( + flags = ["-gsplit-dwarf"], + expand_if_available = "per_object_debug_info_file", + ), + ], + ), + flag_set( + actions = all_preprocessed_actions(), + flag_groups = [ + flag_group( + flags = ["-frandom-seed=%{output_file}"], + expand_if_available = "output_file", + ), + _iterate_flag_group( + flags = ["-D%{preprocessor_defines}"], + iterate_over = "preprocessor_defines", + ), + _iterate_flag_group( + flags = ["-include", "%{includes}"], + iterate_over = "includes", + ), + _iterate_flag_group( + flags = ["-iquote", "%{quote_include_paths}"], + iterate_over = "quote_include_paths", + ), + _iterate_flag_group( + flags = ["-I%{include_paths}"], + iterate_over = "include_paths", + ), + _iterate_flag_group( + flags = ["-isystem", "%{system_include_paths}"], + iterate_over = "system_include_paths", + ), + _iterate_flag_group( + flags = ["-F", "%{framework_include_paths}"], + iterate_over = "framework_include_paths", + ), + ], + ), + flag_set( + actions = all_cpp_compile_actions(), + flag_groups = [], + ), + flag_set( + actions = all_compile_actions(), + flag_groups = [ + flag_group( + flags = [ + "-Wno-builtin-macro-redefined", + "-D__DATE__=\"redacted\"", + 
"-D__TIMESTAMP__=\"redacted\"", + "-D__TIME__=\"redacted\"", + ], + ), + flag_group( + flags = ["-fPIC"], + expand_if_available = "pic", + ), + flag_group( + flags = ["-fPIE"], + expand_if_not_available = "pic", + ), + flag_group( + flags = [ + "-U_FORTIFY_SOURCE", + "-D_FORTIFY_SOURCE=1", + "-fstack-protector", + "-Wall", + ] + ctx.attr.host_compiler_warnings + [ + "-fno-omit-frame-pointer", + ], + ), + _no_canonical_prefixes_group( + ctx.attr.extra_no_canonical_prefixes_flags, + ), + ], + ), + flag_set( + actions = all_compile_actions(), + flag_groups = [flag_group(flags = ["-DNDEBUG"])], + with_features = [with_feature_set(features = ["disable-assertions"])], + ), + flag_set( + actions = all_compile_actions(), + flag_groups = [ + flag_group( + flags = [ + "-g0", + "-O2", + "-ffunction-sections", + "-fdata-sections", + ], + ), + ], + with_features = [with_feature_set(features = ["opt"])], + ), + flag_set( + actions = all_compile_actions(), + flag_groups = [flag_group(flags = ["-g"])], + with_features = [with_feature_set(features = ["dbg"])], + ), + ] + _cuda_set( + ctx.attr.cuda_path, + all_compile_actions(), + ) + [ + flag_set( + actions = all_compile_actions(), + flag_groups = [ + _iterate_flag_group( + flags = ["%{user_compile_flags}"], + iterate_over = "user_compile_flags", + ), + _sysroot_group(), + flag_group( + expand_if_available = "source_file", + flags = ["-c", "%{source_file}"], + ), + flag_group( + expand_if_available = "output_assembly_file", + flags = ["-S"], + ), + flag_group( + expand_if_available = "output_preprocess_file", + flags = ["-E"], + ), + flag_group( + expand_if_available = "output_file", + flags = ["-o", "%{output_file}"], + ), + ], + ), + ], + ), + feature( + name = "all_archive_flags", + enabled = True, + flag_sets = [ + flag_set( + actions = all_archive_actions(), + flag_groups = [ + flag_group( + expand_if_available = "linker_param_file", + flags = ["@%{linker_param_file}"], + ), + flag_group(flags = ["rcsD"]), + flag_group( + flags = ["%{output_execpath}"], + expand_if_available = "output_execpath", + ), + flag_group( + iterate_over = "libraries_to_link", + flag_groups = [ + flag_group( + flags = ["%{libraries_to_link.name}"], + expand_if_equal = variable_with_value( + name = "libraries_to_link.type", + value = "object_file", + ), + ), + flag_group( + flags = ["%{libraries_to_link.object_files}"], + iterate_over = "libraries_to_link.object_files", + expand_if_equal = variable_with_value( + name = "libraries_to_link.type", + value = "object_file_group", + ), + ), + ], + expand_if_available = "libraries_to_link", + ), + ], + ), + ], + ), + feature( + name = "all_link_flags", + enabled = True, + flag_sets = [ + flag_set( + actions = all_shared_library_link_actions(), + flag_groups = [flag_group(flags = ["-shared"])], + ), + flag_set( + actions = all_link_actions(), + flag_groups = ([ + flag_group(flags = ["-Wl,-no-as-needed"]) + ] if cpu == "local" else []) + ([ + flag_group(flags = ["-B" + ctx.attr.linker_bin_path]) + ] if ctx.attr.linker_bin_path else []) + [ + flag_group( + flags = ["@%{linker_param_file}"], + expand_if_available = "linker_param_file", + ), + _iterate_flag_group( + flags = ["%{linkstamp_paths}"], + iterate_over = "linkstamp_paths", + ), + flag_group( + flags = ["-o", "%{output_execpath}"], + expand_if_available = "output_execpath", + ), + _iterate_flag_group( + flags = ["-L%{library_search_directories}"], + iterate_over = "library_search_directories", + ), + _iterate_flag_group( + iterate_over = "runtime_library_search_directories", + 
flags = [ + "-Wl,-rpath,$ORIGIN/%{runtime_library_search_directories}", + ] if cpu == "local" else [ + "-Wl,-rpath,@loader_path/%{runtime_library_search_directories}", + ], + ), + _libraries_to_link_group("darwin" if cpu == "darwin" else "linux"), + _iterate_flag_group( + flags = ["%{user_link_flags}"], + iterate_over = "user_link_flags", + ), + flag_group( + flags = ["-Wl,--gdb-index"], + expand_if_available = "is_using_fission", + ), + flag_group( + flags = ["-Wl,-S"], + expand_if_available = "strip_debug_symbols", + ), + flag_group(flags = ["-lc++" if cpu == "darwin" else "-lstdc++"]), + _no_canonical_prefixes_group( + ctx.attr.extra_no_canonical_prefixes_flags, + ), + ], + ), + flag_set( + actions = all_executable_link_actions(), + flag_groups = [flag_group(flags = ["-pie"])], + ), + ] + ([ + flag_set( + actions = all_link_actions(), + flag_groups = [flag_group(flags = [ + "-Wl,-z,relro,-z,now", + ])], + ), + ] if cpu == "local" else []) + ([ + flag_set( + actions = all_link_actions(), + flag_groups = [ + flag_group(flags = ["-Wl,--gc-sections"]), + flag_group( + flags = ["-Wl,--build-id=md5", "-Wl,--hash-style=gnu"], + ), + ], + ), + ] if cpu == "local" else []) + ([ + flag_set( + actions = all_link_actions(), + flag_groups = [flag_group(flags = ["-undefined", "dynamic_lookup"])], + ), + ] if cpu == "darwin" else []) + _cuda_set( + ctx.attr.cuda_path, + all_link_actions(), + ) + [ + flag_set( + actions = all_link_actions(), + flag_groups = [ + _sysroot_group(), + ], + ), + ], + ), + feature(name = "disable-assertions"), + feature( + name = "opt", + implies = ["disable-assertions"], + ), + feature(name = "fastbuild"), + feature(name = "dbg"), + feature(name = "supports_dynamic_linker", enabled = True), + feature(name = "pic", enabled = True), + feature(name = "supports_pic", enabled = True), + feature(name = "has_configured_linker_path", enabled = True), + ] + else: + fail("Unreachable") + +def _impl(ctx): + cpu = ctx.attr.cpu + compiler = ctx.attr.compiler + + if (cpu == "darwin"): + toolchain_identifier = "local_darwin" + target_cpu = "darwin" + target_libc = "macosx" + compiler = "compiler" + action_configs = _action_configs( + assembly_path = ctx.attr.host_compiler_path, + c_compiler_path = ctx.attr.host_compiler_path, + cc_compiler_path = ctx.attr.host_compiler_path, + archiver_path = ctx.attr.host_compiler_prefix + "/libtool", + linker_path = ctx.attr.host_compiler_path, + strip_path = ctx.attr.host_compiler_prefix + "/strip", + ) + artifact_name_patterns = [] + elif (cpu == "local"): + toolchain_identifier = "local_linux" + target_cpu = "local" + target_libc = "local" + action_configs = _action_configs( + assembly_path = ctx.attr.host_compiler_path, + c_compiler_path = ctx.attr.host_compiler_path, + cc_compiler_path = ctx.attr.host_compiler_path, + archiver_path = ctx.attr.host_compiler_prefix + "/ar", + linker_path = ctx.attr.host_compiler_path, + strip_path = ctx.attr.host_compiler_prefix + "/strip", + ) + artifact_name_patterns = [] + else: + fail("Unreachable") + + out = ctx.actions.declare_file(ctx.label.name) + ctx.actions.write(out, "Fake executable") + return [ + cc_common.create_cc_toolchain_config_info( + ctx = ctx, + features = _features(cpu, compiler, ctx), + action_configs = action_configs, + artifact_name_patterns = artifact_name_patterns, + cxx_builtin_include_directories = ctx.attr.builtin_include_directories, + toolchain_identifier = toolchain_identifier, + host_system_name = "local", + target_system_name = "local", + target_cpu = target_cpu, + target_libc = 
target_libc, + compiler = compiler, + abi_version = "local", + abi_libc_version = "local", + tool_paths = _tool_paths(cpu, ctx), + make_variables = [], + builtin_sysroot = ctx.attr.builtin_sysroot, + cc_target_os = None, + ), + DefaultInfo( + executable = out, + ), + ] + +cc_toolchain_config = rule( + implementation = _impl, + attrs = { + "cpu": attr.string(mandatory = True, values = ["darwin", "local"]), + "compiler": attr.string(values = ["unknown"], default = "unknown"), + "builtin_include_directories": attr.string_list(), + "extra_no_canonical_prefixes_flags": attr.string_list(), + "host_compiler_path": attr.string(), + "host_compiler_prefix": attr.string(), + "host_compiler_warnings": attr.string_list(), + "host_unfiltered_compile_flags": attr.string_list(), + "linker_bin_path": attr.string(), + "builtin_sysroot": attr.string(), + "cuda_path": attr.string(), + }, + provides = [CcToolchainConfigInfo], + executable = True, +) diff --git a/build_deps/gpus/crosstool/crosstool_compiler_wrapper.tpl b/build_deps/gpus/crosstool/crosstool_compiler_wrapper.tpl new file mode 100755 index 000000000..f504a5669 --- /dev/null +++ b/build_deps/gpus/crosstool/crosstool_compiler_wrapper.tpl @@ -0,0 +1,313 @@ +#!/usr/bin/env python +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Crosstool wrapper for compiling CUDA programs. + +SYNOPSIS: + crosstool_compiler_wrapper [options passed in by cc_library() + or cc_binary() rule] + +DESCRIPTION: + This script is expected to be called by the cc_library() or cc_binary() bazel + rules. When the option "-x cuda" is present in the list of arguments passed + to this script, it invokes the nvcc CUDA compiler. Most arguments are passed + as is as a string to --compiler-options of nvcc. When "-x cuda" is not + present, this wrapper invokes hybrid_driver_is_not_gcc with the input + arguments as is. +""" + +__author__ = 'keveman@google.com (Manjunath Kudlur)' + +import os +import pipes +import re +import subprocess +import sys +from argparse import ArgumentParser + +# Template values set by cuda_autoconf. +CPU_COMPILER = ('%{cpu_compiler}') +GCC_HOST_COMPILER_PATH = ('%{gcc_host_compiler_path}') + +NVCC_PATH = '%{nvcc_path}' +PREFIX_DIR = os.path.dirname(GCC_HOST_COMPILER_PATH) +NVCC_VERSION = '%{cuda_version}' + + +def Log(s): + print('gpus/crosstool: {0}'.format(s)) + + +def GetOptionValue(argv, option): + """Extract the list of values for option from the argv list. + + Args: + argv: A list of strings, possibly the argv passed to main(). + option: The option whose value to extract, with the leading '-'. + + Returns: + A list of values, either directly following the option, + (eg., -opt val1 val2) or values collected from multiple occurrences of + the option (eg., -opt val1 -opt val2). 
+ """ + + parser = ArgumentParser() + parser.add_argument(option, nargs='*', action='append') + option = option.lstrip('-').replace('-', '_') + args, _ = parser.parse_known_args(argv) + if not args or not vars(args)[option]: + return [] + else: + return sum(vars(args)[option], []) + + +def GetHostCompilerOptions(argv): + """Collect the -isystem, -iquote, and --sysroot option values from argv. + + Args: + argv: A list of strings, possibly the argv passed to main(). + + Returns: + The string that can be used as the --compiler-options to nvcc. + """ + + parser = ArgumentParser() + parser.add_argument('-isystem', nargs='*', action='append') + parser.add_argument('-iquote', nargs='*', action='append') + parser.add_argument('--sysroot', nargs=1) + parser.add_argument('-g', nargs='*', action='append') + parser.add_argument('-fno-canonical-system-headers', action='store_true') + parser.add_argument('-no-canonical-prefixes', action='store_true') + + args, _ = parser.parse_known_args(argv) + + opts = '' + + if args.isystem: + opts += ' -isystem ' + ' -isystem '.join(sum(args.isystem, [])) + if args.iquote: + opts += ' -iquote ' + ' -iquote '.join(sum(args.iquote, [])) + if args.g: + opts += ' -g' + ' -g'.join(sum(args.g, [])) + if args.fno_canonical_system_headers: + opts += ' -fno-canonical-system-headers' + if args.no_canonical_prefixes: + opts += ' -no-canonical-prefixes' + if args.sysroot: + opts += ' --sysroot ' + args.sysroot[0] + + return opts + + +def _update_options(nvcc_options): + if NVCC_VERSION in ("7.0", ): + return nvcc_options + + update_options = {"relaxed-constexpr": "expt-relaxed-constexpr"} + return [ + update_options[opt] if opt in update_options else opt + for opt in nvcc_options + ] + + +def GetNvccOptions(argv): + """Collect the -nvcc_options values from argv. + + Args: + argv: A list of strings, possibly the argv passed to main(). + + Returns: + The string that can be passed directly to nvcc. + """ + + parser = ArgumentParser() + parser.add_argument('-nvcc_options', nargs='*', action='append') + + args, _ = parser.parse_known_args(argv) + + if args.nvcc_options: + options = _update_options(sum(args.nvcc_options, [])) + return ' '.join(['--' + a for a in options]) + return '' + + +def system(cmd): + """Invokes cmd with os.system(). + + Args: + cmd: The command. + + Returns: + The exit code if the process exited with exit() or -signal + if the process was terminated by a signal. + """ + retv = os.system(cmd) + if os.WIFEXITED(retv): + return os.WEXITSTATUS(retv) + else: + return -os.WTERMSIG(retv) + + +def InvokeNvcc(argv, log=False): + """Call nvcc with arguments assembled from argv. + + Args: + argv: A list of strings, possibly the argv passed to main(). + log: True if logging is requested. 
+ + Returns: + The return value of calling system('nvcc ' + args) + """ + + host_compiler_options = GetHostCompilerOptions(argv) + nvcc_compiler_options = GetNvccOptions(argv) + opt_option = GetOptionValue(argv, '-O') + m_options = GetOptionValue(argv, '-m') + m_options = ''.join([' -m' + m for m in m_options if m in ['32', '64']]) + include_options = GetOptionValue(argv, '-I') + out_file = GetOptionValue(argv, '-o') + depfiles = GetOptionValue(argv, '-MF') + defines = GetOptionValue(argv, '-D') + defines = ''.join([' -D' + define for define in defines]) + undefines = GetOptionValue(argv, '-U') + undefines = ''.join([' -U' + define for define in undefines]) + std_options = GetOptionValue(argv, '-std') + nvcc_allowed_std_options = ["c++03", "c++11", "c++14"] + nvcc_std_map = {} + if int(NVCC_VERSION.split('.')[0]) >= 11: + nvcc_std_map["c++1z"] = "c++17" + nvcc_allowed_std_options += ["c++17", "c++1z"] + std_options = ''.join([ + ' -std=' + (nvcc_std_map[define] if define in nvcc_std_map else define) + for define in std_options if define in nvcc_allowed_std_options + ][-1:]) + fatbin_options = ''.join([ + ' --fatbin-options=' + option + for option in GetOptionValue(argv, '-Xcuda-fatbinary') + ]) + + # The list of source files get passed after the -c option. I don't know of + # any other reliable way to just get the list of source files to be compiled. + src_files = GetOptionValue(argv, '-c') + + # Pass -w through from host to nvcc, but don't do anything fancier with + # warnings-related flags, since they're not necessarily the same across + # compilers. + warning_options = ' -w' if '-w' in argv else '' + + if len(src_files) == 0: + return 1 + if len(out_file) != 1: + return 1 + + opt = (' -O2' if + (len(opt_option) > 0 and int(opt_option[0]) > 0) else ' -g') + + includes = (' -I ' + ' -I '.join(include_options) + if len(include_options) > 0 else '') + + # Unfortunately, there are other options that have -c prefix too. + # So allowing only those look like C/C++ files. + src_files = [ + f for f in src_files + if re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C|\.cu|\.cuh$', f) + ] + srcs = ' '.join(src_files) + out = ' -o ' + out_file[0] + + nvccopts = '-D_FORCE_INLINES ' + capabilities_sm = set(GetOptionValue(argv, "--cuda-gpu-arch")) + capabilities_compute = set(GetOptionValue(argv, '--cuda-include-ptx')) + # When both "code=sm_xy" and "code=compute_xy" are requested for a single + # arch, they can be combined using "code=xy,compute_xy" which avoids a + # redundant PTX generation during compilation. + capabilities_both = capabilities_sm.intersection(capabilities_compute) + for capability in capabilities_both: + capability = capability[len('sm_'):] + nvccopts += r'-gencode=arch=compute_%s,code=\"sm_%s,compute_%s\" ' % ( + capability, capability, capability) + for capability in capabilities_sm - capabilities_both: + capability = capability[len('sm_'):] + nvccopts += r'-gencode=arch=compute_%s,\"code=sm_%s\" ' % (capability, + capability) + for capability in capabilities_compute - capabilities_both: + capability = capability[len('sm_'):] + nvccopts += r'-gencode=arch=compute_%s,\"code=compute_%s\" ' % ( + capability, capability) + nvccopts += nvcc_compiler_options + nvccopts += undefines + nvccopts += defines + nvccopts += std_options + nvccopts += m_options + nvccopts += warning_options + # Force C++17 dialect (note, everything in just one string!) 
+ nvccopts += ' --std c++17 ' + nvccopts += fatbin_options + + if depfiles: + # Generate the dependency file + depfile = depfiles[0] + cmd = (NVCC_PATH + ' ' + nvccopts + ' --compiler-options "' + + host_compiler_options + '"' + ' --compiler-bindir=' + + GCC_HOST_COMPILER_PATH + ' -I .' + ' -x cu ' + opt + includes + + ' ' + srcs + ' -M -o ' + depfile) + if log: + Log(cmd) + exit_status = system(cmd) + if exit_status != 0: + return exit_status + + cmd = (NVCC_PATH + ' ' + nvccopts + ' --compiler-options "' + + host_compiler_options + ' -fPIC"' + ' --compiler-bindir=' + + GCC_HOST_COMPILER_PATH + ' -I .' + ' -x cu ' + opt + includes + + ' -c ' + srcs + out) + + # TODO(zhengxq): for some reason, 'gcc' needs this help to find 'as'. + # Need to investigate and fix. + cmd = 'PATH=' + PREFIX_DIR + ':$PATH ' + cmd + if log: + Log(cmd) + return system(cmd) + + +def main(): + parser = ArgumentParser() + parser.add_argument('-x', nargs=1) + parser.add_argument('--cuda_log', action='store_true') + args, leftover = parser.parse_known_args(sys.argv[1:]) + + if args.x and args.x[0] == 'cuda': + if args.cuda_log: + Log('-x cuda') + leftover = [pipes.quote(s) for s in leftover] + if args.cuda_log: + Log('using nvcc') + return InvokeNvcc(leftover, log=args.cuda_log) + + # Strip our flags before passing through to the CPU compiler for files which + # are not -x cuda. We can't just pass 'leftover' because it also strips -x. + # We not only want to pass -x to the CPU compiler, but also keep it in its + # relative location in the argv list (the compiler is actually sensitive to + # this). + cpu_compiler_flags = [ + flag for flag in sys.argv[1:] if not flag.startswith(('--cuda_log')) + ] + + return subprocess.call([CPU_COMPILER] + cpu_compiler_flags) + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/build_deps/gpus/cuda/BUILD b/build_deps/gpus/cuda/BUILD new file mode 100644 index 000000000..e69de29bb diff --git a/build_deps/gpus/cuda/BUILD.tpl b/build_deps/gpus/cuda/BUILD.tpl new file mode 100644 index 000000000..05a750ab2 --- /dev/null +++ b/build_deps/gpus/cuda/BUILD.tpl @@ -0,0 +1,229 @@ +load(":build_defs.bzl", "cuda_header_library") +load("@bazel_skylib//:bzl_library.bzl", "bzl_library") +load("@bazel_skylib//lib:selects.bzl", "selects") +load("@bazel_skylib//rules:common_settings.bzl", "bool_flag") + +licenses(["restricted"]) # MPL2, portions GPL v3, LGPL v3, BSD-like + +package(default_visibility = ["//visibility:public"]) + +bool_flag( + name = "enable_cuda", + build_setting_default = False, +) + +config_setting( + name = "is_cuda_enabled", + flag_values = {":enable_cuda": "True"}, +) + + +# Config setting whether built with CUDA support using nvcc. +# +# TODO(b/174244321), DEPRECATED: this target will be removed when all users +# have been converted to :is_cuda_enabled (most) or :is_cuda_compiler_nvcc. +selects.config_setting_group( + name = "using_nvcc", + match_all = [ + "//:is_cuda_enabled", + "//:is_cuda_compiler_nvcc", + ], +) + +config_setting( + name = "_opt", + values = {"compilation_mode": "opt"}, + visibility = ["//visibility:private"], +) + +# Provides CUDA headers for '#include "third_party/gpus/cuda/include/cuda.h"' +# All clients including TensorFlow should use these directives. 
+cuda_header_library( + name = "cuda_headers", + hdrs = [ + "cuda/cuda_config.h", + ":cuda-include", + ], + include_prefix = "third_party/gpus", + includes = [ + ".", # required to include cuda/cuda/cuda_config.h as cuda/config.h + "cuda/include", + ], +) + +cc_library( + name = "cudart_static", + srcs = ["cuda/lib/%{cudart_static_lib}"], + linkopts = [ + "-ldl", + "-lpthread", + %{cudart_static_linkopt} + ], +) + +cc_library( + name = "cuda_driver", + srcs = ["cuda/lib/%{cuda_driver_lib}"], +) + +cc_library( + name = "cudart", + srcs = ["cuda/lib/%{cudart_lib}"], + data = ["cuda/lib/%{cudart_lib}"], + linkstatic = 1, +) + +cuda_header_library( + name = "cublas_headers", + hdrs = [":cublas-include"], + include_prefix = "third_party/gpus/cuda/include", + includes = ["cublas/include"], + strip_include_prefix = "cublas/include", + deps = [":cuda_headers"], +) + +cuda_header_library( + name = "cusolver_headers", + hdrs = [":cusolver-include"], + include_prefix = "third_party/gpus/cuda/include", + includes = ["cusolver/include"], + strip_include_prefix = "cusolver/include", + deps = [":cuda_headers"], +) + +cuda_header_library( + name = "cufft_headers", + hdrs = [":cufft-include"], + include_prefix = "third_party/gpus/cuda/include", + includes = ["cufft/include"], + strip_include_prefix = "cufft/include", + deps = [":cuda_headers"], +) + +cuda_header_library( + name = "cusparse_headers", + hdrs = [":cusparse-include"], + include_prefix = "third_party/gpus/cuda/include", + includes = ["cusparse/include"], + strip_include_prefix = "cusparse/include", + deps = [":cuda_headers"], +) + +cuda_header_library( + name = "curand_headers", + hdrs = [":curand-include"], + include_prefix = "third_party/gpus/cuda/include", + includes = ["curand/include"], + strip_include_prefix = "curand/include", + deps = [":cuda_headers"], +) + +cc_library( + name = "cublas", + srcs = ["cuda/lib/%{cublas_lib}"], + data = ["cuda/lib/%{cublas_lib}"], + linkstatic = 1, +) + +cc_library( + name = "cublasLt", + srcs = ["cuda/lib/%{cublasLt_lib}"], + data = ["cuda/lib/%{cublasLt_lib}"], + linkstatic = 1, +) + +cc_library( + name = "cusolver", + srcs = ["cuda/lib/%{cusolver_lib}"], + data = ["cuda/lib/%{cusolver_lib}"], + linkopts = ["-lgomp"], + linkstatic = 1, +) + +cc_library( + name = "cudnn", + srcs = ["cuda/lib/%{cudnn_lib}"], + data = ["cuda/lib/%{cudnn_lib}"], + linkstatic = 1, +) + +cc_library( + name = "cudnn_header", + hdrs = [":cudnn-include"], + include_prefix = "third_party/gpus/cudnn", + strip_include_prefix = "cudnn/include", + deps = [":cuda_headers"], +) + +cc_library( + name = "cufft", + srcs = ["cuda/lib/%{cufft_lib}"], + data = ["cuda/lib/%{cufft_lib}"], + linkstatic = 1, +) + +cc_library( + name = "curand", + srcs = ["cuda/lib/%{curand_lib}"], + data = ["cuda/lib/%{curand_lib}"], + linkstatic = 1, +) + +cc_library( + name = "cuda", + deps = [ + ":cublas", + ":cublasLt", + ":cuda_headers", + ":cudart", + ":cudnn", + ":cufft", + ":curand", + ], +) + +alias( + name = "cub_headers", + actual = "%{cub_actual}", +) + +cuda_header_library( + name = "cupti_headers", + hdrs = [":cuda-extras"], + include_prefix = "third_party/gpus", + includes = ["cuda/extras/CUPTI/include/"], + deps = [":cuda_headers"], +) + +cc_library( + name = "cupti_dsos", + data = ["cuda/lib/%{cupti_lib}"], +) + +cc_library( + name = "cusparse", + srcs = ["cuda/lib/%{cusparse_lib}"], + data = ["cuda/lib/%{cusparse_lib}"], + linkopts = ["-lgomp"], + linkstatic = 1, +) + +cc_library( + name = "libdevice_root", + data = [":cuda-nvvm"], +) + 
+bzl_library( + name = "build_defs_bzl", + srcs = ["build_defs.bzl"], + deps = [ + "@bazel_skylib//lib:selects", + ], +) + +py_library( + name = "cuda_config_py", + srcs = ["cuda/cuda_config.py"], +) + +%{copy_rules} diff --git a/build_deps/gpus/cuda/build_defs.bzl.tpl b/build_deps/gpus/cuda/build_defs.bzl.tpl new file mode 100644 index 000000000..7ab1304fa --- /dev/null +++ b/build_deps/gpus/cuda/build_defs.bzl.tpl @@ -0,0 +1,56 @@ +# Macros for building CUDA code. +def cuda_default_copts(): + """Default options for all CUDA compilations.""" + return [ + "-x", + "cuda", + "-DUSE_CUDA=1", + "-Xcuda-fatbinary=--compress-all", + ] + %{cuda_extra_copts} + + +def cuda_gpu_architectures(): + """Returns a list of supported GPU architectures.""" + return %{cuda_gpu_architectures} + + +def cuda_header_library(name, + hdrs, + include_prefix=None, + strip_include_prefix=None, + deps=[], + **kwargs): + """Generates a cc_library containing both virtual and system include paths. + + Generates both a header-only target with virtual includes plus the full + target without virtual includes. This works around the fact that bazel can't + mix 'includes' and 'include_prefix' in the same target.""" + + native.cc_library( + name=name + "_virtual", + hdrs=hdrs, + include_prefix=include_prefix, + strip_include_prefix=strip_include_prefix, + deps=deps, + visibility=["//visibility:private"], + ) + + native.cc_library(name=name, + textual_hdrs=hdrs, + deps=deps + [":%s_virtual" % name], + **kwargs) + + +def cuda_library(copts=[], **kwargs): + """Wrapper over cc_library which adds default CUDA options.""" + native.cc_library(copts=cuda_default_copts() + copts, **kwargs) + + +def cuda_binary(copts=[], **kwargs): + """Wrapper over cc_library which adds default CUDA options.""" + native.cc_binary(copts=cuda_default_copts() + copts, **kwargs) + + +def cuda_cc_test(copts=[], **kwargs): + """Wrapper over cc_test which adds default CUDA options.""" + native.cc_test(copts=copts, **kwargs) diff --git a/build_deps/gpus/cuda/cuda_config.h.tpl b/build_deps/gpus/cuda/cuda_config.h.tpl new file mode 100644 index 000000000..a92871e71 --- /dev/null +++ b/build_deps/gpus/cuda/cuda_config.h.tpl @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef CUDA_CUDA_CONFIG_H_ +#define CUDA_CUDA_CONFIG_H_ + +#define CUDA_VERSION "%{cuda_version}" +#define CUDART_VERSION "%{cudart_version}" +#define CUPTI_VERSION "%{cupti_version}" +#define CUBLAS_VERSION "%{cublas_version}" +#define CUSOLVER_VERSION "%{cusolver_version}" +#define CURAND_VERSION "%{curand_version}" +#define CUFFT_VERSION "%{cufft_version}" +#define CUSPARSE_VERSION "%{cusparse_version}" +#define CUDNN_VERSION "%{cudnn_version}" + +#define CUDA_TOOLKIT_PATH "%{cuda_toolkit_path}" + +#define CUDA_COMPUTE_CAPABILITIES %{cuda_compute_capabilities} + +#endif // CUDA_CUDA_CONFIG_H_ diff --git a/build_deps/gpus/cuda/cuda_config.py.tpl b/build_deps/gpus/cuda/cuda_config.py.tpl new file mode 100644 index 000000000..328558e12 --- /dev/null +++ b/build_deps/gpus/cuda/cuda_config.py.tpl @@ -0,0 +1,16 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +config = %{cuda_config} diff --git a/build_deps/gpus/find_cuda_config.py b/build_deps/gpus/find_cuda_config.py new file mode 100644 index 000000000..e384feaaf --- /dev/null +++ b/build_deps/gpus/find_cuda_config.py @@ -0,0 +1,638 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Prints CUDA library and header directories and versions found on the system. + +The script searches for CUDA library and header files on the system, inspects +them to determine their version and prints the configuration to stdout. +The paths to inspect and the required versions are specified through environment +variables. If no valid configuration is found, the script prints to stderr and +returns an error code. + +The list of libraries to find is specified as arguments. Supported libraries are +CUDA (includes cuBLAS), cuDNN, NCCL, and TensorRT. + +The script takes a list of base directories specified by the CUDA_PATHS +environment variable as comma-separated glob list. The script looks for headers +and library files in a hard-coded set of subdirectories from these base paths. +If CUDA_PATHS is not specified, a OS specific default is used: + + Linux: /usr/local/cuda, /usr, and paths from 'ldconfig -p'. + Windows: CUDA_PATH environment variable, or + C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\* + +For backwards compatibility, some libraries also use alternative base +directories from other environment variables if they are specified. 
List of +library-specific environment variables: + + Library Version env variable Additional base directories + ---------------------------------------------------------------- + CUDA CUDA_VERSION CUDA_TOOLKIT_PATH + cuBLAS CUBLAS_VERSION CUDA_TOOLKIT_PATH + cuDNN CUDNN_VERSION CUDNN_INSTALL_PATH + NCCL NCCL_VERSION NCCL_INSTALL_PATH, NCCL_HDR_PATH + TensorRT TENSORRT_VERSION TENSORRT_INSTALL_PATH + +Versions environment variables can be of the form 'x' or 'x.y' to request a +specific version, empty or unspecified to accept any version. + +The output of a found library is of the form: +tf__version: x.y.z +tf__header_dir: ... +tf__library_dir: ... +""" + +import glob +import io +import os +import platform +import re +import subprocess +import sys + +# pylint: disable=g-import-not-at-top +try: + from shutil import which +except ImportError: + from distutils.spawn import find_executable as which +# pylint: enable=g-import-not-at-top + + +class ConfigError(Exception): + pass + + +def _is_linux(): + return platform.system() == "Linux" + + +def _is_macos(): + return platform.system() == "Darwin" + + +def _matches_version(actual_version, required_version): + """Checks whether some version meets the requirements. + + All elements of the required_version need to be present in the + actual_version. + + required_version actual_version result + ----------------------------------------- + 1 1.1 True + 1.2 1 False + 1.2 1.3 False + 1 True + + Args: + required_version: The version specified by the user. + actual_version: The version detected from the CUDA installation. + Returns: Whether the actual version matches the required one. + """ + if actual_version is None: + return False + + # Strip spaces from the versions. + actual_version = actual_version.strip() + required_version = required_version.strip() + return actual_version.startswith(required_version) + + +def _at_least_version(actual_version, required_version): + actual = [int(v) for v in actual_version.split(".")] + required = [int(v) for v in required_version.split(".")] + return actual >= required + + +def _get_header_version(path, name): + """Returns preprocessor defines in C header file.""" + for line in io.open(path, "r", encoding="utf-8").readlines(): + match = re.match("\s*#\s*define %s\s+(\d+)" % name, line) + if match: + return match.group(1) + return "" + + +def _cartesian_product(first, second): + """Returns all path combinations of first and second.""" + return [os.path.join(f, s) for f in first for s in second] + + +def _get_ld_config_paths(): + """Returns all directories from 'ldconfig -p'.""" + if not _is_linux(): + return [] + ldconfig_path = which("ldconfig") or "/sbin/ldconfig" + output = subprocess.check_output([ldconfig_path, "-p"]) + pattern = re.compile(".* => (.*)") + result = set() + for line in output.splitlines(): + try: + match = pattern.match(line.decode("ascii")) + except UnicodeDecodeError: + match = False + if match: + result.add(os.path.dirname(match.group(1))) + return sorted(list(result)) + + +def _get_default_cuda_paths(cuda_version): + if not cuda_version: + cuda_version = "*" + elif not "." 
in cuda_version: + cuda_version = cuda_version + ".*" + + return [ + "/usr/local/cuda-%s" % cuda_version, "/usr/local/cuda", "/usr", + "/usr/local/cudnn" + ] + _get_ld_config_paths() + + +def _header_paths(): + """Returns hard-coded set of relative paths to look for header files.""" + return [ + "", + "include", + "include/cuda", + "include/*-linux-gnu", + "extras/CUPTI/include", + "include/cuda/CUPTI", + "local/cuda/extras/CUPTI/include", + ] + + +def _library_paths(): + """Returns hard-coded set of relative paths to look for library files.""" + return [ + "", + "lib64", + "lib", + "lib/*-linux-gnu", + "lib/x64", + "extras/CUPTI/*", + "local/cuda/lib64", + "local/cuda/extras/CUPTI/lib64", + ] + + +def _not_found_error(base_paths, relative_paths, filepattern): + base_paths = "".join( + ["\n '%s'" % path for path in sorted(base_paths)]) + relative_paths = "".join( + ["\n '%s'" % path for path in relative_paths]) + return ConfigError( + "Could not find any %s in any subdirectory:%s\nof:%s\n" % + (filepattern, relative_paths, base_paths)) + + +def _find_file(base_paths, relative_paths, filepattern): + for path in _cartesian_product(base_paths, relative_paths): + for file in glob.glob(os.path.join(path, filepattern)): + return file + raise _not_found_error(base_paths, relative_paths, filepattern) + + +def _find_library(base_paths, library_name, required_version): + """Returns first valid path to the requested library.""" + if _is_macos(): + filepattern = "%s*.dylib" % (".".join(["lib" + library_name] + + required_version.split(".")[:1])) + else: + filepattern = ".".join(["lib" + library_name, "so"] + + required_version.split(".")[:1]) + "*" + return _find_file(base_paths, _library_paths(), filepattern) + + +def _find_versioned_file(base_paths, relative_paths, filepatterns, + required_version, get_version): + """Returns first valid path to a file that matches the requested version.""" + if type(filepatterns) not in [list, tuple]: + filepatterns = [filepatterns] + for path in _cartesian_product(base_paths, relative_paths): + for filepattern in filepatterns: + for file in glob.glob(os.path.join(path, filepattern)): + actual_version = get_version(file) + if _matches_version(actual_version, required_version): + return file, actual_version + raise _not_found_error( + base_paths, relative_paths, + ", ".join(filepatterns) + " matching version '%s'" % required_version) + + +def _find_header(base_paths, header_name, required_version, get_version): + """Returns first valid path to a header that matches the requested version.""" + return _find_versioned_file(base_paths, _header_paths(), header_name, + required_version, get_version) + + +def _find_cuda_config(base_paths, required_version): + + def get_header_version(path): + version = int(_get_header_version(path, "CUDA_VERSION")) + if not version: + return None + return "%d.%d" % (version // 1000, version % 1000 // 10) + + cuda_header_path, header_version = _find_header(base_paths, "cuda.h", + required_version, + get_header_version) + cuda_version = header_version # x.y, see above. 
+ + cuda_library_path = _find_library(base_paths, "cudart", cuda_version) + + def get_nvcc_version(path): + pattern = "Cuda compilation tools, release \d+\.\d+, V(\d+\.\d+\.\d+)" + for line in subprocess.check_output([path, "--version"]).splitlines(): + match = re.match(pattern, line.decode("ascii")) + if match: + return match.group(1) + return None + + nvcc_name = "nvcc" + nvcc_path, nvcc_version = _find_versioned_file(base_paths, [ + "", + "bin", + "local/cuda/bin", + ], nvcc_name, cuda_version, get_nvcc_version) + + nvvm_path = _find_file(base_paths, [ + "nvvm/libdevice", + "share/cuda", + "lib/nvidia-cuda-toolkit/libdevice", + "local/cuda/nvvm/libdevice", + ], "libdevice*.10.bc") + + cupti_header_path = _find_file(base_paths, _header_paths(), "cupti.h") + cupti_library_path = _find_library(base_paths, "cupti", required_version) + + cuda_binary_dir = os.path.dirname(nvcc_path) + nvvm_library_dir = os.path.dirname(nvvm_path) + + # XLA requires the toolkit path to find ptxas and libdevice. + # TODO(csigg): pass in both directories instead. + cuda_toolkit_paths = ( + os.path.normpath(os.path.join(cuda_binary_dir, "..")), + os.path.normpath(os.path.join(nvvm_library_dir, "../..")), + ) + if cuda_toolkit_paths[0] != cuda_toolkit_paths[1]: + raise ConfigError("Inconsistent CUDA toolkit path: %s vs %s" % + cuda_toolkit_paths) + + return { + "cuda_version": cuda_version, + "cuda_include_dir": os.path.dirname(cuda_header_path), + "cuda_library_dir": os.path.dirname(cuda_library_path), + "cuda_binary_dir": cuda_binary_dir, + "nvvm_library_dir": nvvm_library_dir, + "cupti_include_dir": os.path.dirname(cupti_header_path), + "cupti_library_dir": os.path.dirname(cupti_library_path), + "cuda_toolkit_path": cuda_toolkit_paths[0], + } + + +def _find_cublas_config(base_paths, required_version, cuda_version): + + if _at_least_version(cuda_version, "10.1"): + + def get_header_version(path): + version = (_get_header_version(path, name) + for name in ("CUBLAS_VER_MAJOR", "CUBLAS_VER_MINOR", + "CUBLAS_VER_PATCH")) + return ".".join(version) + + header_path, header_version = _find_header(base_paths, "cublas_api.h", + required_version, + get_header_version) + # cuBLAS uses the major version only. + cublas_version = header_version.split(".")[0] + + else: + # There is no version info available before CUDA 10.1, just find the file. + header_version = cuda_version + header_path = _find_file(base_paths, _header_paths(), "cublas_api.h") + # cuBLAS version is the same as CUDA version (x.y). 
+ cublas_version = required_version + + library_path = _find_library(base_paths, "cublas", cublas_version) + + return { + "cublas_version": header_version, + "cublas_include_dir": os.path.dirname(header_path), + "cublas_library_dir": os.path.dirname(library_path), + } + + +def _find_cusolver_config(base_paths, required_version, cuda_version): + + if _at_least_version(cuda_version, "11.0"): + + def get_header_version(path): + version = (_get_header_version(path, name) + for name in ("CUSOLVER_VER_MAJOR", "CUSOLVER_VER_MINOR", + "CUSOLVER_VER_PATCH")) + return ".".join(version) + + header_path, header_version = _find_header(base_paths, + "cusolver_common.h", + required_version, + get_header_version) + cusolver_version = header_version.split(".")[0] + + else: + header_version = cuda_version + header_path = _find_file(base_paths, _header_paths(), + "cusolver_common.h") + cusolver_version = required_version + + library_path = _find_library(base_paths, "cusolver", cusolver_version) + + return { + "cusolver_version": header_version, + "cusolver_include_dir": os.path.dirname(header_path), + "cusolver_library_dir": os.path.dirname(library_path), + } + + +def _find_curand_config(base_paths, required_version, cuda_version): + + if _at_least_version(cuda_version, "11.0"): + + def get_header_version(path): + version = (_get_header_version(path, name) + for name in ("CURAND_VER_MAJOR", "CURAND_VER_MINOR", + "CURAND_VER_PATCH")) + return ".".join(version) + + header_path, header_version = _find_header(base_paths, "curand.h", + required_version, + get_header_version) + curand_version = header_version.split(".")[0] + + else: + header_version = cuda_version + header_path = _find_file(base_paths, _header_paths(), "curand.h") + curand_version = required_version + + library_path = _find_library(base_paths, "curand", curand_version) + + return { + "curand_version": header_version, + "curand_include_dir": os.path.dirname(header_path), + "curand_library_dir": os.path.dirname(library_path), + } + + +def _find_cufft_config(base_paths, required_version, cuda_version): + + if _at_least_version(cuda_version, "11.0"): + + def get_header_version(path): + version = (_get_header_version(path, name) + for name in ("CUFFT_VER_MAJOR", "CUFFT_VER_MINOR", + "CUFFT_VER_PATCH")) + return ".".join(version) + + header_path, header_version = _find_header(base_paths, "cufft.h", + required_version, + get_header_version) + cufft_version = header_version.split(".")[0] + + else: + header_version = cuda_version + header_path = _find_file(base_paths, _header_paths(), "cufft.h") + cufft_version = required_version + + library_path = _find_library(base_paths, "cufft", cufft_version) + + return { + "cufft_version": header_version, + "cufft_include_dir": os.path.dirname(header_path), + "cufft_library_dir": os.path.dirname(library_path), + } + + +def _find_cudnn_config(base_paths, required_version): + + def get_header_version(path): + version = [ + _get_header_version(path, name) + for name in ("CUDNN_MAJOR", "CUDNN_MINOR", "CUDNN_PATCHLEVEL") + ] + return ".".join(version) if version[0] else None + + header_path, header_version = _find_header(base_paths, + ("cudnn.h", "cudnn_version.h"), + required_version, + get_header_version) + cudnn_version = header_version.split(".")[0] + + library_path = _find_library(base_paths, "cudnn", cudnn_version) + + return { + "cudnn_version": cudnn_version, + "cudnn_include_dir": os.path.dirname(header_path), + "cudnn_library_dir": os.path.dirname(library_path), + } + + +def _find_cusparse_config(base_paths, 
required_version, cuda_version): + + if _at_least_version(cuda_version, "11.0"): + + def get_header_version(path): + version = (_get_header_version(path, name) + for name in ("CUSPARSE_VER_MAJOR", "CUSPARSE_VER_MINOR", + "CUSPARSE_VER_PATCH")) + return ".".join(version) + + header_path, header_version = _find_header(base_paths, "cusparse.h", + required_version, + get_header_version) + cusparse_version = header_version.split(".")[0] + + else: + header_version = cuda_version + header_path = _find_file(base_paths, _header_paths(), "cusparse.h") + cusparse_version = required_version + + library_path = _find_library(base_paths, "cusparse", cusparse_version) + + return { + "cusparse_version": header_version, + "cusparse_include_dir": os.path.dirname(header_path), + "cusparse_library_dir": os.path.dirname(library_path), + } + + +def _find_nccl_config(base_paths, required_version): + + def get_header_version(path): + version = (_get_header_version(path, name) + for name in ("NCCL_MAJOR", "NCCL_MINOR", "NCCL_PATCH")) + return ".".join(version) + + header_path, header_version = _find_header(base_paths, "nccl.h", + required_version, + get_header_version) + nccl_version = header_version.split(".")[0] + + library_path = _find_library(base_paths, "nccl", nccl_version) + + return { + "nccl_version": nccl_version, + "nccl_include_dir": os.path.dirname(header_path), + "nccl_library_dir": os.path.dirname(library_path), + } + + +def _find_tensorrt_config(base_paths, required_version): + + def get_header_version(path): + version = (_get_header_version(path, name) + for name in ("NV_TENSORRT_MAJOR", "NV_TENSORRT_MINOR", + "NV_TENSORRT_PATCH")) + # `version` is a generator object, so we convert it to a list before using + # it (muitiple times below). + version = list(version) + if not all(version): + # Versions not found, make _matches_version returns False. + return None + return ".".join(version) + + header_path, header_version = _find_header(base_paths, "NvInferVersion.h", + required_version, + get_header_version) + + tensorrt_version = header_version.split(".")[0] + library_path = _find_library(base_paths, "nvinfer", tensorrt_version) + + return { + "tensorrt_version": tensorrt_version, + "tensorrt_include_dir": os.path.dirname(header_path), + "tensorrt_library_dir": os.path.dirname(library_path), + } + + +def _list_from_env(env_name, default=[]): + """Returns comma-separated list from environment variable.""" + if env_name in os.environ: + return os.environ[env_name].split(",") + return default + + +def _get_legacy_path(env_name, default=[]): + """Returns a path specified by a legacy environment variable. + + CUDNN_INSTALL_PATH, NCCL_INSTALL_PATH, TENSORRT_INSTALL_PATH set to + '/usr/lib/x86_64-linux-gnu' would previously find both library and header + paths. Detect those and return '/usr', otherwise forward to _list_from_env(). 
+ """ + if env_name in os.environ: + match = re.match("^(/[^/ ]*)+/lib/\w+-linux-gnu/?$", + os.environ[env_name]) + if match: + return [match.group(1)] + return _list_from_env(env_name, default) + + +def _normalize_path(path): + """Returns normalized path, with forward slashes on Windows.""" + return os.path.realpath(path) + + +def find_cuda_config(): + """Returns a dictionary of CUDA library and header file paths.""" + libraries = [argv.lower() for argv in sys.argv[1:]] + cuda_version = os.environ.get("CUDA_VERSION", "") + base_paths = _list_from_env("CUDA_PATHS", + _get_default_cuda_paths(cuda_version)) + base_paths = [path for path in base_paths if os.path.exists(path)] + + result = {} + if "cuda" in libraries: + cuda_paths = _list_from_env("CUDA_TOOLKIT_PATH", base_paths) + result.update(_find_cuda_config(cuda_paths, cuda_version)) + + cuda_version = result["cuda_version"] + cublas_paths = base_paths + if tuple(int(v) for v in cuda_version.split(".")) < (10, 1): + # Before CUDA 10.1, cuBLAS was in the same directory as the toolkit. + cublas_paths = cuda_paths + cublas_version = os.environ.get("CUBLAS_VERSION", "") + result.update( + _find_cublas_config(cublas_paths, cublas_version, cuda_version)) + + cusolver_paths = base_paths + if tuple(int(v) for v in cuda_version.split(".")) < (11, 0): + cusolver_paths = cuda_paths + cusolver_version = os.environ.get("CUSOLVER_VERSION", "") + result.update( + _find_cusolver_config(cusolver_paths, cusolver_version, + cuda_version)) + + curand_paths = base_paths + if tuple(int(v) for v in cuda_version.split(".")) < (11, 0): + curand_paths = cuda_paths + curand_version = os.environ.get("CURAND_VERSION", "") + result.update( + _find_curand_config(curand_paths, curand_version, cuda_version)) + + cufft_paths = base_paths + if tuple(int(v) for v in cuda_version.split(".")) < (11, 0): + cufft_paths = cuda_paths + cufft_version = os.environ.get("CUFFT_VERSION", "") + result.update( + _find_cufft_config(cufft_paths, cufft_version, cuda_version)) + + cusparse_paths = base_paths + if tuple(int(v) for v in cuda_version.split(".")) < (11, 0): + cusparse_paths = cuda_paths + cusparse_version = os.environ.get("CUSPARSE_VERSION", "") + result.update( + _find_cusparse_config(cusparse_paths, cusparse_version, + cuda_version)) + + if "cudnn" in libraries: + cudnn_paths = _get_legacy_path("CUDNN_INSTALL_PATH", base_paths) + cudnn_version = os.environ.get("CUDNN_VERSION", "") + result.update(_find_cudnn_config(cudnn_paths, cudnn_version)) + + if "nccl" in libraries: + nccl_paths = _get_legacy_path("NCCL_INSTALL_PATH", base_paths) + nccl_version = os.environ.get("NCCL_VERSION", "") + result.update(_find_nccl_config(nccl_paths, nccl_version)) + + if "tensorrt" in libraries: + tensorrt_paths = _get_legacy_path("TENSORRT_INSTALL_PATH", base_paths) + tensorrt_version = os.environ.get("TENSORRT_VERSION", "") + result.update(_find_tensorrt_config(tensorrt_paths, tensorrt_version)) + + for k, v in result.items(): + if k.endswith("_dir") or k.endswith("_path"): + result[k] = _normalize_path(v) + + return result + + +def main(): + try: + for key, value in sorted(find_cuda_config().items()): + print("%s: %s" % (key, value)) + except ConfigError as e: + sys.stderr.write(str(e) + '\n') + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/build_deps/remote_config/BUILD b/build_deps/remote_config/BUILD new file mode 100644 index 000000000..e69de29bb diff --git a/build_deps/remote_config/BUILD.tpl b/build_deps/remote_config/BUILD.tpl new file mode 100644 index 
000000000..d97eb9701 --- /dev/null +++ b/build_deps/remote_config/BUILD.tpl @@ -0,0 +1,26 @@ +# Each platform creates a constraint @//:platform_constraint that +# is listed in its constraint_values; rule that want to select a specific +# platform to run on can put @//:platform_constraing into their +# exec_compatible_with attribute. +# Toolchains can similarly be marked with target_compatible_with or +# exec_compatible_with to bind them to this platform. +constraint_setting( + name = "platform_setting" +) + +constraint_value( + name = "platform_constraint", + constraint_setting = ":platform_setting", + visibility = ["//visibility:public"], +) + +platform( + name = "platform", + visibility = ["//visibility:public"], + constraint_values = [ + "@platforms//cpu:%{cpu}", + "@platforms//os:%{platform}", + ":platform_constraint", + ], + exec_properties = %{exec_properties}, +) diff --git a/build_deps/remote_config/common.bzl b/build_deps/remote_config/common.bzl new file mode 100644 index 000000000..47df004e2 --- /dev/null +++ b/build_deps/remote_config/common.bzl @@ -0,0 +1,294 @@ +"""Functions common across configure rules.""" + +BAZEL_SH = "BAZEL_SH" +PYTHON_BIN_PATH = "PYTHON_BIN_PATH" +PYTHON_LIB_PATH = "PYTHON_LIB_PATH" +PYTHON_CONFIG_REPO = "PYTHON_CONFIG_REPO" + + +def auto_config_fail(msg): + """Output failure message when auto configuration fails.""" + red = "\033[0;31m" + no_color = "\033[0m" + fail("%sConfiguration Error:%s %s\n" % (red, no_color, msg)) + + +def which(repository_ctx, program_name, allow_failure=False): + """Returns the full path to a program on the execution platform. + + Args: + repository_ctx: the repository_ctx + program_name: name of the program on the PATH + + Returns: + The full path to a program on the execution platform. + """ + out = execute( + repository_ctx, + ["which", program_name], + allow_failure=allow_failure, + ).stdout + if out != None: + out = out.replace("\\", "\\\\").rstrip() + return out + + +def get_python_bin(repository_ctx): + """Gets the python bin path. + + Args: + repository_ctx: the repository_ctx + + Returns: + The python bin path. + """ + python_bin = get_host_environ(repository_ctx, PYTHON_BIN_PATH) + if python_bin: + return python_bin + + # First check for an explicit "python3" + python_bin = which(repository_ctx, "python3", True) + if python_bin: + return python_bin + + # Some systems just call pythone3 "python" + python_bin = which(repository_ctx, "python", True) + if python_bin: + return python_bin + + auto_config_fail( + "Cannot find python in PATH, please make sure " + + "python is installed and add its directory in PATH, or --define " + + "%s='/something/else'.\nPATH=%s" % ( + PYTHON_BIN_PATH, + get_environ(repository_ctx, "PATH"), + )) + return python_bin # unreachable + + +def get_bash_bin(repository_ctx): + """Gets the bash bin path. + + Args: + repository_ctx: the repository_ctx + + Returns: + The bash bin path. + """ + bash_bin = get_host_environ(repository_ctx, BAZEL_SH) + if bash_bin != None: + return bash_bin + bash_bin_path = which(repository_ctx, "bash") + if bash_bin_path == None: + auto_config_fail( + "Cannot find bash in PATH, please make sure " + + "bash is installed and add its directory in PATH, or --define " + + "%s='/path/to/bash'.\nPATH=%s" % ( + BAZEL_SH, + get_environ(repository_ctx, "PATH"), + )) + return bash_bin_path + + +def read_dir(repository_ctx, src_dir): + """Returns a sorted list with all files in a directory. + + Finds all files inside a directory, traversing subfolders and following + symlinks. 
+ + Args: + repository_ctx: the repository_ctx + src_dir: the directory to traverse + + Returns: + A sorted list with all files in a directory. + """ + find_result = execute( + repository_ctx, + ["find", src_dir, "-follow", "-type", "f"], + allow_failure=True, + ) + result = find_result.stdout + return sorted(result.splitlines()) + + +def get_environ(repository_ctx, name, default_value=None): + """Returns the value of an environment variable on the execution platform. + + Args: + repository_ctx: the repository_ctx + name: the name of environment variable + default_value: the value to return if not set + + Returns: + The value of the environment variable 'name' on the execution platform + or 'default_value' if it's not set. + """ + cmd = "echo -n \"$%s\"" % name + result = execute( + repository_ctx, + [get_bash_bin(repository_ctx), "-c", cmd], + allow_failure=True, + ) + if len(result.stdout) == 0: + return default_value + return result.stdout + + +def get_host_environ(repository_ctx, name, default_value=None): + """Returns the value of an environment variable on the host platform. + + The host platform is the machine that Bazel runs on. + + Args: + repository_ctx: the repository_ctx + name: the name of environment variable + + Returns: + The value of the environment variable 'name' on the host platform. + """ + if name in repository_ctx.os.environ: + return repository_ctx.os.environ.get(name).strip() + + if hasattr(repository_ctx.attr, + "environ") and name in repository_ctx.attr.environ: + return repository_ctx.attr.environ.get(name).strip() + + return default_value + + +def get_cpu_value(repository_ctx): + """Returns the name of the host operating system. + + Args: + repository_ctx: The repository context. + Returns: + A string containing the name of the host operating system. + """ + result = raw_exec(repository_ctx, ["uname", "-s"]) + return result.stdout.strip() + + +def execute(repository_ctx, + cmdline, + error_msg=None, + error_details=None, + allow_failure=False): + """Executes an arbitrary shell command. + + Args: + repository_ctx: the repository_ctx object + cmdline: list of strings, the command to execute + error_msg: string, a summary of the error if the command fails + error_details: string, details about the error or steps to fix it + allow_failure: bool, if True, an empty stdout result or output to stderr + is fine, otherwise either of these is an error + Returns: + The result of repository_ctx.execute(cmdline) + """ + result = raw_exec(repository_ctx, cmdline) + if (result.stderr or not result.stdout) and not allow_failure: + fail( + "\n".join([ + error_msg.strip() + if error_msg else "Repository command failed", + result.stderr.strip(), + error_details if error_details else "", + ]), ) + return result + + +def raw_exec(repository_ctx, cmdline): + """Executes a command via repository_ctx.execute() and returns the result. + + This method is useful for debugging purposes. For example, to print all + commands executed as well as their return code. + + Args: + repository_ctx: the repository_ctx + cmdline: the list of args + + Returns: + The 'exec_result' of repository_ctx.execute(). + """ + return repository_ctx.execute(cmdline) + + +def files_exist(repository_ctx, paths, bash_bin=None): + """Checks which files in paths exists. + + Args: + repository_ctx: the repository_ctx + paths: a list of paths + bash_bin: path to the bash interpreter + + Returns: + Returns a list of Bool. True means that the path at the + same position in the paths list exists. 
+ """ + if bash_bin == None: + bash_bin = get_bash_bin(repository_ctx) + + cmd_tpl = "[ -e \"%s\" ] && echo True || echo False" + cmds = [cmd_tpl % path for path in paths] + cmd = " ; ".join(cmds) + + stdout = execute(repository_ctx, [bash_bin, "-c", cmd]).stdout.strip() + return [val == "True" for val in stdout.splitlines()] + + +def realpath(repository_ctx, path, bash_bin=None): + """Returns the result of "realpath path". + + Args: + repository_ctx: the repository_ctx + path: a path on the file system + bash_bin: path to the bash interpreter + + Returns: + Returns the result of "realpath path" + """ + if bash_bin == None: + bash_bin = get_bash_bin(repository_ctx) + + return execute(repository_ctx, + [bash_bin, "-c", "realpath \"%s\"" % path]).stdout.strip() + + +def err_out(result): + """Returns stderr if set, else stdout. + + This function is a workaround for a bug in RBE where stderr is returned as stdout. Instead + of using result.stderr use err_out(result) instead. + + Args: + result: the exec_result. + + Returns: + The stderr if set, else stdout + """ + if len(result.stderr) == 0: + return result.stdout + return result.stderr + + +def config_repo_label(config_repo, target): + """Construct a label from config_repo and target. + + This function exists to ease the migration from preconfig to remote config. In preconfig + the *_CONFIG_REPO environ variables are set to packages in the main repo while in + remote config they will point to remote repositories. + + Args: + config_repo: a remote repository or package. + target: a target + Returns: + A label constructed from config_repo and target. + """ + if config_repo.startswith("@") and not config_repo.find("//") > 0: + # remote config is being used. + return Label(config_repo + "//" + target) + elif target.startswith(":"): + return Label(config_repo + target) + else: + return Label(config_repo + "/" + target) diff --git a/build_deps/remote_config/remote_platform_configure.bzl b/build_deps/remote_config/remote_platform_configure.bzl new file mode 100644 index 000000000..780de4e7d --- /dev/null +++ b/build_deps/remote_config/remote_platform_configure.bzl @@ -0,0 +1,55 @@ +"""Repository rule to create a platform for a docker image to be used with RBE.""" + + +def _remote_platform_configure_impl(repository_ctx): + platform = repository_ctx.attr.platform + if platform == "local": + os = repository_ctx.os.name.lower() + if os.startswith("mac os"): + platform = "osx" + else: + platform = "linux" + + cpu = "x86_64" + machine_type = repository_ctx.execute(["bash", "-c", + "echo $MACHTYPE"]).stdout + if (machine_type.startswith("ppc") or machine_type.startswith("powerpc")): + cpu = "ppc" + elif machine_type.startswith("s390x"): + cpu = "s390x" + elif machine_type.startswith("aarch64"): + cpu = "aarch64" + elif machine_type.startswith("arm64"): + cpu = "aarch64" + elif machine_type.startswith("arm"): + cpu = "arm" + elif machine_type.startswith("mips64"): + cpu = "mips64" + elif machine_type.startswith("riscv64"): + cpu = "riscv64" + + exec_properties = repository_ctx.attr.platform_exec_properties + + serialized_exec_properties = "{" + for k, v in exec_properties.items(): + serialized_exec_properties += "\"%s\" : \"%s\"," % (k, v) + serialized_exec_properties += "}" + + repository_ctx.template( + "BUILD", + Label("//remote_config:BUILD.tpl"), + { + "%{platform}": platform, + "%{exec_properties}": serialized_exec_properties, + "%{cpu}": cpu, + }, + ) + + +remote_platform_configure = repository_rule( + implementation=_remote_platform_configure_impl, 
+ attrs={ + "platform_exec_properties": attr.string_dict(mandatory=True), + "platform": attr.string(default="linux", values=["linux", "local"]), + }, +) diff --git a/include/BUILD b/include/BUILD new file mode 100644 index 000000000..cbacc911e --- /dev/null +++ b/include/BUILD @@ -0,0 +1,29 @@ +load("@local_config_cuda//cuda:build_defs.bzl", "cuda_library") + +cuda_library( + name = "merlin_localfile", + hdrs = [ + "merlin_localfile.hpp", + ], + visibility = [ + "//visibility:public", + ], + deps = [ + "//include/merlin", + "@local_config_cuda//cuda", + ], +) + +cuda_library( + name = "merlin_hashtable", + hdrs = [ + "merlin_hashtable.cuh", + ], + visibility = [ + "//visibility:public", + ], + deps = [ + "//include/merlin", + "@local_config_cuda//cuda", + ], +) diff --git a/include/merlin/BUILD b/include/merlin/BUILD new file mode 100644 index 000000000..2057bee4a --- /dev/null +++ b/include/merlin/BUILD @@ -0,0 +1,24 @@ +load("@local_config_cuda//cuda:build_defs.bzl", "cuda_library") + +cuda_library( + name = "merlin", + srcs = [ + ], + hdrs = [ + "array_kernels.cuh", + "core_kernels.cuh", + "debug.hpp", + "flexible_buffer.cuh", + "initializers.cuh", + "memory_pool.cuh", + "optimizers.cuh", + "types.cuh", + "utils.cuh", + ], + visibility = [ + "//visibility:public", + ], + deps = [ + "@local_config_cuda//cuda", + ], +)
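
The two Python helpers added above (the crosstool compiler wrapper and find_cuda_config.py) contain a few pieces of logic that are easier to follow with concrete inputs. The sketches below are illustrative only: the sample argv values, paths, and version numbers are assumptions rather than values taken from the patch, and the lower-case helper names are local to the examples. First, the wrapper's GetOptionValue() collects the values of every occurrence of a flag into one flat list; a minimal stand-alone version of that parsing:

```python
# Minimal stand-alone version of GetOptionValue() from
# build_deps/gpus/crosstool/crosstool_compiler_wrapper.tpl.
# The sample argv below is assumed, not taken from the patch.
from argparse import ArgumentParser


def get_option_value(argv, option):
    """Collects the values of every occurrence of `option` into one flat list."""
    parser = ArgumentParser()
    parser.add_argument(option, nargs='*', action='append')
    dest = option.lstrip('-').replace('-', '_')
    args, _ = parser.parse_known_args(argv)
    values = vars(args)[dest]
    # action='append' yields one sub-list per occurrence, e.g.
    # [['include'], ['third_party/gpus']]; sum(..., []) flattens them.
    return sum(values, []) if values else []


if __name__ == '__main__':
    argv = ['-I', 'include', '-I', 'third_party/gpus', '-o', 'hashtable.o']
    print(get_option_value(argv, '-I'))   # ['include', 'third_party/gpus']
    print(get_option_value(argv, '-MF'))  # []
```

This flattening is what lets repeated flags such as -I, -D, or -U be re-emitted to nvcc as a single group.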
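
The wrapper's main() only routes a compilation to nvcc when Bazel passes '-x cuda'; otherwise it forwards the arguments to the host compiler unchanged (keeping -x and its position) after stripping the wrapper's own --cuda_log flag. A condensed sketch of that dispatch, using shlex.quote where the wrapper itself uses the older pipes.quote:

```python
# Condensed sketch of main() in the crosstool wrapper: '-x cuda' routes the
# compilation to nvcc, everything else goes to the host compiler unchanged
# except for the wrapper-specific --cuda_log flag.
import shlex
from argparse import ArgumentParser


def dispatch(argv):
    parser = ArgumentParser()
    parser.add_argument('-x', nargs=1)
    parser.add_argument('--cuda_log', action='store_true')
    args, leftover = parser.parse_known_args(argv)

    if args.x and args.x[0] == 'cuda':
        # nvcc path: the remaining flags are shell-quoted and assembled into
        # an nvcc command line by InvokeNvcc().
        return 'nvcc', [shlex.quote(s) for s in leftover]

    # Host path: keep '-x' (and its position) for the host compiler,
    # drop only the wrapper's own --cuda_log.
    return 'host', [flag for flag in argv if not flag.startswith('--cuda_log')]


print(dispatch(['-x', 'cuda', '-c', 'kernel.cu', '-o', 'kernel.o']))
print(dispatch(['-x', 'c++', '-c', 'util.cc', '-o', 'util.o']))
```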
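
InvokeNvcc() also filters repeated -std values down to the ones nvcc accepts, renames c++1z to c++17 on CUDA 11+, and keeps only the last survivor (the wrapper later appends an unconditional '--std c++17' as well). A small sketch of the selection step, with the allowed list copied from the wrapper and the inputs assumed:

```python
# Sketch of the '-std' handling in InvokeNvcc(): only values nvcc accepts are
# kept, 'c++1z' is renamed to 'c++17' on CUDA 11+, and at most the last
# surviving flag is emitted.
def pick_std_option(std_values, nvcc_major=11):
    allowed = ["c++03", "c++11", "c++14"]
    rename = {}
    if nvcc_major >= 11:
        rename["c++1z"] = "c++17"
        allowed += ["c++17", "c++1z"]
    kept = [' -std=' + rename.get(v, v) for v in std_values if v in allowed]
    return ''.join(kept[-1:])   # [-1:] keeps at most one entry, the last one


print(repr(pick_std_option(['c++11', 'c++1z'])))  # ' -std=c++17'
print(repr(pick_std_option(['gnu++20'])))         # '' (not in the allowed list)
```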
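
The -gencode flags are built from the --cuda-gpu-arch and --cuda-include-ptx sets so that an architecture requested for both SASS and PTX is emitted as a single flag, avoiding a redundant PTX generation pass. The sketch below mirrors that set arithmetic; it drops the backslash-escaped quotes the wrapper needs because it runs the command through os.system(), and sm_70/sm_75 are example inputs:

```python
# Sketch of the -gencode construction in InvokeNvcc(). An architecture that
# appears in both --cuda-gpu-arch and --cuda-include-ptx gets a single
# 'code="sm_xy,compute_xy"' flag.
def gencode_flags(capabilities_sm, capabilities_compute):
    flags = []
    both = capabilities_sm & capabilities_compute
    for cap in sorted(both):
        xy = cap[len('sm_'):]
        flags.append('-gencode=arch=compute_%s,code="sm_%s,compute_%s"' % (xy, xy, xy))
    for cap in sorted(capabilities_sm - both):
        xy = cap[len('sm_'):]
        flags.append('-gencode=arch=compute_%s,code="sm_%s"' % (xy, xy))
    for cap in sorted(capabilities_compute - both):
        xy = cap[len('sm_'):]
        flags.append('-gencode=arch=compute_%s,code="compute_%s"' % (xy, xy))
    return flags


print(gencode_flags({'sm_70', 'sm_75'}, {'sm_75'}))
# ['-gencode=arch=compute_75,code="sm_75,compute_75"',
#  '-gencode=arch=compute_70,code="sm_70"']
```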
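
On the find_cuda_config.py side, version requirements are matched by string prefix, and "at least" checks compare integer components. A stand-alone restatement of those two helpers, with the docstring's truth table as assertions:

```python
# Stand-alone restatement of the two version helpers in
# build_deps/gpus/find_cuda_config.py.
def matches_version(actual_version, required_version):
    """Prefix match, as used for CUDA_VERSION, CUDNN_VERSION, etc."""
    if actual_version is None:
        return False
    return actual_version.strip().startswith(required_version.strip())


def at_least_version(actual_version, required_version):
    """Component-wise integer comparison, e.g. for the CUDA 10.1 cutoff."""
    actual = [int(v) for v in actual_version.split(".")]
    required = [int(v) for v in required_version.split(".")]
    return actual >= required


assert matches_version("1.1", "1")        # '1' accepts any 1.x ...
assert matches_version("11.2", "1")       # ... and, being a prefix match, 11.x too
assert not matches_version("1", "1.2")    # requirement more specific than install
assert not matches_version("1.3", "1.2")
assert matches_version("1.1", "")         # empty requirement accepts anything
assert at_least_version("10.1", "10.1")
assert not at_least_version("9.2", "10.1")
```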
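
The CUDA version itself is derived from the integer CUDA_VERSION define in cuda.h: major = value // 1000, minor = (value % 1000) // 10. For example, assuming a header defining CUDA_VERSION 11020 (an assumed value, i.e. CUDA 11.2):

```python
# How find_cuda_config.py turns the integer CUDA_VERSION define from cuda.h
# into an 'x.y' string. The header text is an assumed example (CUDA 11.2).
import re


def parse_define(header_text, name):
    """Mirrors _get_header_version(), but reads from a string instead of a file."""
    for line in header_text.splitlines():
        match = re.match(r"\s*#\s*define %s\s+(\d+)" % name, line)
        if match:
            return match.group(1)
    return ""


header_text = "#define CUDA_VERSION 11020\n"
version = int(parse_define(header_text, "CUDA_VERSION"))
print("%d.%d" % (version // 1000, version % 1000 // 10))  # -> 11.2
```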
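
Libraries and headers are located by combining every base path (CUDA_PATHS, the defaults, directories from 'ldconfig -p') with a fixed list of relative subdirectories and globbing the result; the first hit wins. A rough sketch with the documented Linux defaults as the assumed base paths:

```python
# Rough sketch of the path search in find_cuda_config.py: base paths are
# combined with fixed relative subdirectories and globbed; the first match is
# taken. The real script also adds directories reported by 'ldconfig -p'.
import glob
import os


def cartesian_product(first, second):
    return [os.path.join(f, s) for f in first for s in second]


def find_file(base_paths, relative_paths, filepattern):
    """Mirrors _find_file(); the original raises ConfigError instead of returning None."""
    for path in cartesian_product(base_paths, relative_paths):
        for file in glob.glob(os.path.join(path, filepattern)):
            return file
    return None


base_paths = ["/usr/local/cuda", "/usr"]                        # assumed defaults
header_dirs = ["include", "include/cuda", "extras/CUPTI/include"]
print(find_file(base_paths, header_dirs, "cuda.h"))
```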
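
Finally, the legacy *_INSTALL_PATH handling reduces a multiarch lib directory to its prefix so that both headers and libraries can be found beneath it, as described in the _get_legacy_path() docstring. The regex behaves like this on the docstring's example value:

```python
# The legacy *_INSTALL_PATH handling in find_cuda_config.py: a value pointing
# at a multiarch lib directory is reduced to its prefix. Example value taken
# from the _get_legacy_path() docstring.
import re

legacy_value = "/usr/lib/x86_64-linux-gnu"   # e.g. an old CUDNN_INSTALL_PATH
match = re.match(r"^(/[^/ ]*)+/lib/\w+-linux-gnu/?$", legacy_value)
print(match.group(1) if match else legacy_value)   # -> /usr
```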