Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make eager parameters in ep config as a domain array. #28

Open
wants to merge 16 commits into
base: cuda-domain-1
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 86 additions & 1 deletion config/m4/cuda.m4
Original file line number Diff line number Diff line change
@@ -1 +1,86 @@
AM_CONDITIONAL([HAVE_CUDA], [true])
#
# Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED.
# See file LICENSE for terms.
#

#
# Check for CUDA support
#
# --with-cuda=DIR selects the CUDA installation prefix. The library directory
# defaults to DIR/lib64 and can be overridden through with_cuda_libdir.
# On success HAVE_CUDA is defined to 1 and CUDA_CPPFLAGS, CUDA_CFLAGS and
# CUDA_LDFLAGS are substituted and appended to the global flags.
#
# NOTE(review): the header check runs before the CUDA include path is appended
# to CPPFLAGS, so headers that exist only under DIR/include may not be found.
# Confirm whether the include path should be added before AC_CHECK_HEADERS.
#
cuda_happy="no"
gdrcopy_happy="no"
ucx_check_cuda_dir=""
ucx_check_cuda_libdir=""

AC_ARG_WITH([cuda],
            [AS_HELP_STRING([--with-cuda=(DIR)], [Enable the use of CUDA (default is no).])],
            [], [with_cuda=no])

AS_IF([test "x$with_cuda" != "xno"],
      [AS_IF([test -n "$with_cuda" -a "x$with_cuda" != "xyes"],
             [ucx_check_cuda_dir="$with_cuda"
              ucx_check_cuda_libdir="$with_cuda/lib64"])
       AS_IF([test -n "$with_cuda_libdir" -a "x$with_cuda_libdir" != "xyes"],
             [ucx_check_cuda_libdir="$with_cuda_libdir"])

       cuda_incflags=""
       cuda_ldflags="-lcudart -lcuda"
       AS_IF([test -n "$ucx_check_cuda_dir"],
             [cuda_incflags="-I$ucx_check_cuda_dir/include"])
       AS_IF([test -n "$ucx_check_cuda_libdir"],
             [cuda_ldflags="$cuda_ldflags -L$ucx_check_cuda_libdir"])

       AC_CHECK_HEADERS([cuda.h cuda_runtime.h],
           [AC_CHECK_DECLS([cuPointerGetAttribute],
               [cuda_happy="yes"],
               [AC_MSG_WARN([CUDA runtime not detected. Disable.])
                cuda_happy="no"],
               [#include <cuda.h>])
            AS_IF([test "x$cuda_happy" = "xyes"],
                [AC_DEFINE([HAVE_CUDA], 1, [Enable CUDA support])
                 AC_SUBST([CUDA_CPPFLAGS], ["$cuda_incflags"])
                 AC_SUBST([CUDA_CFLAGS], ["$cuda_incflags"])
                 AC_SUBST([CUDA_LDFLAGS], ["$cuda_ldflags"])
                 CFLAGS="$CFLAGS $CUDA_CFLAGS"
                 CPPFLAGS="$CPPFLAGS $CUDA_CPPFLAGS"
                 LDFLAGS="$LDFLAGS $CUDA_LDFLAGS"],
                [])],
           [AC_MSG_WARN([CUDA not found])
            AC_DEFINE([HAVE_CUDA], [0], [Disable the use of CUDA])])],
      [AC_MSG_WARN([CUDA was explicitly disabled])
       AC_DEFINE([HAVE_CUDA], [0], [Disable the use of CUDA])]
)


AM_CONDITIONAL([HAVE_CUDA], [test "x$cuda_happy" != xno])

#
# Check for GDRCopy support
#
# --with-gdrcopy=DIR selects the GDRCopy installation prefix. The library
# directory defaults to DIR/lib64 and can be overridden through
# with_gdrcopy_libdir. Requesting GDRCopy without a detected CUDA is an error.
#
ucx_check_gdrcopy_dir=""
ucx_check_gdrcopy_libdir=""

AC_ARG_WITH([gdrcopy],
            [AS_HELP_STRING([--with-gdrcopy=(DIR)], [Enable the use of GDR_COPY (default is no).])],
            [], [with_gdrcopy=no])

AS_IF([test "x$with_gdrcopy" != "xno"],

      [AS_IF([test "x$cuda_happy" = "xno"],
             [AC_MSG_ERROR([--with-cuda not specified ...])], [:])
       AS_IF([test -n "$with_gdrcopy" -a "x$with_gdrcopy" != "xyes"],
             [ucx_check_gdrcopy_dir="$with_gdrcopy"
              ucx_check_gdrcopy_libdir="$with_gdrcopy/lib64"])
       AS_IF([test -n "$with_gdrcopy_libdir" -a "x$with_gdrcopy_libdir" != "xyes"],
             [ucx_check_gdrcopy_libdir="$with_gdrcopy_libdir"])

       gdrcopy_incflags=""
       gdrcopy_ldflags="-lgdrapi"
       AS_IF([test -n "$ucx_check_gdrcopy_dir"],
             [gdrcopy_incflags="-I$ucx_check_gdrcopy_dir/include"])
       AS_IF([test -n "$ucx_check_gdrcopy_libdir"],
             [gdrcopy_ldflags="$gdrcopy_ldflags -L$ucx_check_gdrcopy_libdir"])

       AC_CHECK_HEADERS([gdrapi.h],
           [AC_CHECK_DECLS([gdr_pin_buffer],
               [gdrcopy_happy="yes"],
               [AC_MSG_WARN([GDR_COPY runtime not detected. Disable.])
                gdrcopy_happy="no"],
               [#include <gdrapi.h>])
            AS_IF([test "x$gdrcopy_happy" = "xyes"],
                [AC_DEFINE([HAVE_GDR_COPY], 1, [Enable GDR_COPY support])
                 AC_SUBST([GDR_COPY_CPPFLAGS], ["$gdrcopy_incflags"])
                 AC_SUBST([GDR_COPY_CFLAGS], ["$gdrcopy_incflags"])
                 AC_SUBST([GDR_COPY_LDFLAGS], ["$gdrcopy_ldflags"])
                 CFLAGS="$CFLAGS $GDR_COPY_CFLAGS"
                 CPPFLAGS="$CPPFLAGS $GDR_COPY_CPPFLAGS"
                 LDFLAGS="$LDFLAGS $GDR_COPY_LDFLAGS"],
                [])],
           [AC_MSG_WARN([GDR_COPY not found])
            AC_DEFINE([HAVE_GDR_COPY], [0], [Disable the use of GDR_COPY])])],
      [AC_MSG_WARN([GDR_COPY was explicitly disabled])
       AC_DEFINE([HAVE_GDR_COPY], [0], [Disable the use of GDR_COPY])]
)

AM_CONDITIONAL([HAVE_GDR_COPY], [test "x$gdrcopy_happy" != xno])
1 change: 1 addition & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ AS_IF([test "x$with_docs_only" == xyes],
AM_CONDITIONAL([HAVE_IBV_EX_HW_TM], [false])
AM_CONDITIONAL([HAVE_CRAY_UGNI], [false])
AM_CONDITIONAL([HAVE_CUDA], [false])
AM_CONDITIONAL([HAVE_GDR_COPY], [false])
AM_CONDITIONAL([HAVE_ROCM], [false])
AM_CONDITIONAL([HAVE_XPMEM], [false])
AM_CONDITIONAL([HAVE_CMA], [false])
Expand Down
3 changes: 3 additions & 0 deletions src/tools/perf/libperf.c
Original file line number Diff line number Diff line change
Expand Up @@ -1173,6 +1173,9 @@ static ucs_status_t uct_perf_setup(ucx_perf_context_t *perf, ucx_perf_params_t *
goto out_free_mem;
}

uct_iface_progress_enable(perf->uct.iface,
UCT_PROGRESS_SEND | UCT_PROGRESS_RECV);

return UCS_OK;

out_free_mem:
Expand Down
7 changes: 7 additions & 0 deletions src/ucm/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,13 @@ libucm_la_SOURCES = \
util/reloc.c \
util/sys.c

# Build the CUDA memory hook sources only when CUDA support was detected
# at configure time (HAVE_CUDA automake conditional).
if HAVE_CUDA
libucm_la_SOURCES += \
cuda/install.c \
cuda/replace.c

endif

if HAVE_UCM_PTMALLOC283
libucm_la_CPPFLAGS += \
-I$(srcdir)/ptmalloc283/sysdeps/pthread \
Expand Down
20 changes: 20 additions & 0 deletions src/ucm/cuda/cudamem.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/**
* Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED.
*
* See file LICENSE for terms.
*/

#ifndef UCM_CUDAMEM_H_
#define UCM_CUDAMEM_H_

#include <ucm/api/ucm.h>
#include <cuda.h>
#include <cuda_runtime.h>

/**
 * Install the relocation patches for the CUDA memory hooks.
 *
 * @return UCS_OK if the patches are installed (or were installed earlier),
 *         UCS_ERR_UNSUPPORTED if CUDA hooks are disabled by configuration,
 *         otherwise the error reported while patching the relocation table.
 */
ucs_status_t ucm_cudamem_install(void);

/**
 * Replacement entry point which is patched in place of cudaFree().
 *
 * @param addr  Pointer passed by the caller of cudaFree().
 */
cudaError_t ucm_override_cudaFree(void *addr);

/**
 * Call the original cudaFree() implementation.
 *
 * @param addr  Pointer to release.
 */
cudaError_t ucm_orig_cudaFree(void *addr);

/**
 * Dispatch a memory-unmap event for @a addr and then call the original
 * cudaFree().
 *
 * @param addr  Pointer to release.
 */
cudaError_t ucm_cudaFree(void *addr);

#endif
62 changes: 62 additions & 0 deletions src/ucm/cuda/install.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/**
* Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED.
*
* See file LICENSE for terms.
*/

#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#include "cudamem.h"

#include <ucm/api/ucm.h>
#include <ucm/event/event.h>
#include <ucm/util/log.h>
#include <ucm/util/reloc.h>
#include <ucm/util/ucm_config.h>
#include <ucs/sys/math.h>

#include <cuda.h>
#include <cuda_runtime.h>
#include <unistd.h>
#include <pthread.h>



/*
 * Relocation patches installed by ucm_cudamem_install(): each entry pairs a
 * symbol name with its replacement function. The list is terminated by an
 * entry whose symbol is NULL.
 */
static ucm_reloc_patch_t ucm_cudamem_symbol_patches[] = {
{"cudaFree", ucm_override_cudaFree},
{NULL, NULL}
};

/**
 * Install the relocation patches listed in ucm_cudamem_symbol_patches.
 *
 * Installation is done at most once; later calls return UCS_OK without
 * patching again. If a patch fails, the function stops at that entry and
 * the installation is retried on the next call.
 *
 * @return UCS_OK if the patches are installed, UCS_ERR_UNSUPPORTED if CUDA
 *         hooks are disabled by configuration, otherwise the status returned
 *         by ucm_reloc_modify() for the entry which failed.
 */
ucs_status_t ucm_cudamem_install()
{
    static int ucm_cudamem_installed = 0;
    static pthread_mutex_t install_mutex = PTHREAD_MUTEX_INITIALIZER;
    ucm_reloc_patch_t *patch;
    ucs_status_t status = UCS_OK;

    if (!ucm_global_config.enable_cuda_hooks) {
        ucm_debug("installing cudamem relocations is disabled by configuration");
        return UCS_ERR_UNSUPPORTED;
    }

    pthread_mutex_lock(&install_mutex);

    /* Test the flag only while holding the mutex: it is written under the
     * same mutex, so an unlocked read would race with a concurrent install. */
    if (ucm_cudamem_installed) {
        goto out_unlock;
    }

    for (patch = ucm_cudamem_symbol_patches; patch->symbol != NULL; ++patch) {
        status = ucm_reloc_modify(patch);
        if (status != UCS_OK) {
            ucm_warn("failed to install relocation table entry for '%s'",
                     patch->symbol);
            goto out_unlock;
        }
    }
    ucm_cudamem_installed = 1;

out_unlock:
    pthread_mutex_unlock(&install_mutex);
    return status;
}
98 changes: 98 additions & 0 deletions src/ucm/cuda/replace.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
/**
* Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED.
*
* See file LICENSE for terms.
*/

#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#include "cudamem.h"

#include <ucm/event/event.h>
#include <ucm/util/log.h>
#include <ucm/util/reloc.h>
#include <ucs/sys/compiler.h>
#include <ucs/sys/preprocessor.h>
#include <ucs/type/component.h>
#include <pthread.h>

/*
 * State used while looking up an original function pointer:
 * - ucm_cudamem_get_orig_lock is held around the lookup (recursive mutex).
 * - ucm_cudamem_get_orig_thread is set to the id of the thread doing the
 *   lookup and reset to -1 afterwards; the override functions compare it
 *   with pthread_self() to detect being called from inside the lookup.
 */
static pthread_mutex_t ucm_cudamem_get_orig_lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
static pthread_t volatile ucm_cudamem_get_orig_thread = -1;


/**
 * Define the replacement machinery for the CUDA function _name:
 *
 *  - ucm_orig_<name>() looks up the original implementation through
 *    ucm_reloc_get_orig() the first time it is called, keeps the result in a
 *    static pointer, and forwards the call to it.
 *  - ucm_override_<name>() forwards the call to ucm_<name>(), unless it is
 *    invoked on the thread which is currently performing the lookup above,
 *    in which case it returns _fail_val without calling anything.
 *
 * @param _name      Name of the function being replaced.
 * @param _rettype   Return type of the function.
 * @param _fail_val  Value returned by the override when it is entered from
 *                   the thread performing the original-symbol lookup.
 * @param ...        Argument types of the function.
 */
#define UCM_DEFINE_CUDA_FUNC(_name, _rettype, _fail_val, ...) \
\
_rettype ucm_override_##_name(UCM_FUNC_DEFINE_ARGS(__VA_ARGS__)); \
\
/* Call the original function using dlsym(RTLD_NEXT) */ \
_rettype ucm_orig_##_name(UCM_FUNC_DEFINE_ARGS(__VA_ARGS__)) \
{ \
typedef _rettype (*func_ptr_t) (__VA_ARGS__); \
static func_ptr_t orig_func_ptr = NULL; \
\
ucm_trace("%s()", __FUNCTION__); \
\
if (ucs_unlikely(orig_func_ptr == NULL)) { \
pthread_mutex_lock(&ucm_cudamem_get_orig_lock); \
ucm_cudamem_get_orig_thread = pthread_self(); \
orig_func_ptr = ucm_reloc_get_orig(UCS_PP_QUOTE(_name), \
ucm_override_##_name); \
ucm_cudamem_get_orig_thread = -1; \
pthread_mutex_unlock(&ucm_cudamem_get_orig_lock); \
} \
return orig_func_ptr(UCM_FUNC_PASS_ARGS(__VA_ARGS__)); \
} \
\
/* Define a symbol which goes to the replacement - in case we are loaded first */ \
_rettype ucm_override_##_name(UCM_FUNC_DEFINE_ARGS(__VA_ARGS__)) \
{ \
ucm_trace("%s()", __FUNCTION__); \
\
if (ucs_unlikely(ucm_cudamem_get_orig_thread == pthread_self())) { \
return _fail_val; \
} \
return ucm_##_name(UCM_FUNC_PASS_ARGS(__VA_ARGS__)); \
}

/*
 * Declare the symbol _name as an alias of ucm_override_<name>, so that the
 * library itself provides _name. The declared return type is cudaError_t.
 */
#define UCM_OVERRIDE_CUDA_FUNC(_name) \
cudaError_t _name() __attribute__ ((alias ("ucm_override_" UCS_PP_QUOTE(_name)))); \


/*
 * Define argument list with given types.
 * Each type in the variadic list is paired with an index and turned into a
 * parameter declaration "<type> arg<index>" by _UCM_FUNC_ARG_DEFINE.
 * NOTE(review): presumably the result is a comma-separated list and indices
 * follow UCS_PP_SEQ numbering - confirm against ucs/sys/preprocessor.h.
 */
#define UCM_FUNC_DEFINE_ARGS(...) \
UCS_PP_FOREACH_SEP(_UCM_FUNC_ARG_DEFINE, _, \
UCS_PP_ZIP((UCS_PP_SEQ(UCS_PP_NUM_ARGS(__VA_ARGS__))), \
(__VA_ARGS__)))

/*
 * Pass auto-generated arguments to a function call.
 * Produces one "arg<index>" name per type in the variadic list, matching the
 * parameter names generated by UCM_FUNC_DEFINE_ARGS.
 */
#define UCM_FUNC_PASS_ARGS(...) \
UCS_PP_FOREACH_SEP(_UCM_FUNC_ARG_PASS, _, UCS_PP_SEQ(UCS_PP_NUM_ARGS(__VA_ARGS__)))


/*
 * Helpers
 *
 * _UCM_FUNC_ARG_DEFINE   unpacks an (index, type) bundle and forwards both
 *                        parts to __UCM_FUNC_ARG_DEFINE.
 * __UCM_FUNC_ARG_DEFINE  emits the parameter declaration "<type> arg<index>".
 * _UCM_FUNC_ARG_PASS     emits the parameter name "arg<index>".
 */
#define _UCM_FUNC_ARG_DEFINE(_, _bundle) \
__UCM_FUNC_ARG_DEFINE(_, UCS_PP_TUPLE_0 _bundle, UCS_PP_TUPLE_1 _bundle)
#define __UCM_FUNC_ARG_DEFINE(_, _index, _type) \
_type UCS_PP_TOKENPASTE(arg, _index)
#define _UCM_FUNC_ARG_PASS(_, _index) \
UCS_PP_TOKENPASTE(arg, _index)


/*
 * Generate ucm_orig_cudaFree() and ucm_override_cudaFree() for
 * cudaError_t cudaFree(void*). The override returns -1 when it is entered
 * from the thread which is looking up the original symbol.
 */
UCM_DEFINE_CUDA_FUNC(cudaFree, cudaError_t, -1, void*)

/* When symbol override is enabled, also provide cudaFree itself as an alias
 * of ucm_override_cudaFree. */
#if ENABLE_SYMBOL_OVERRIDE
UCM_OVERRIDE_CUDA_FUNC(cudaFree)
#endif
32 changes: 32 additions & 0 deletions src/ucm/event/event.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@
#include <ucm/api/ucm.h>
#include <ucm/mmap/mmap.h>
#include <ucm/malloc/malloc_hook.h>
#if HAVE_CUDA
#include <ucm/cuda/cudamem.h>
#endif
#include <ucm/util/ucm_config.h>
#include <ucm/util/log.h>
#include <ucm/util/sys.h>
Expand Down Expand Up @@ -334,6 +337,25 @@ void *ucm_sbrk(intptr_t increment)
return event.sbrk.result;
}

#if HAVE_CUDA
/**
 * cudaFree() hook: dispatch a VM_MUNMAP event for the pointer and then
 * release it through the original cudaFree().
 *
 * The whole sequence runs between ucm_event_enter() and ucm_event_leave().
 * The event is dispatched with a length of 0.
 *
 * @param addr  Pointer passed to cudaFree().
 *
 * @return The value returned by the original cudaFree().
 */
cudaError_t ucm_cudaFree(void *addr)
{
    cudaError_t result;

    ucm_event_enter();
    ucm_trace("ucm_cudaFree(addr=%p )", addr);

    /* Notify the handlers first, then actually free the memory */
    ucm_dispatch_vm_munmap(addr, 0);
    result = ucm_orig_cudaFree(addr);

    ucm_event_leave();
    return result;
}
#endif

void ucm_event_handler_add(ucm_event_handler_t *handler)
{
ucm_event_handler_t *elem;
Expand Down Expand Up @@ -390,6 +412,16 @@ static ucs_status_t ucm_event_install(int events)
}

ucm_debug("malloc hooks are ready");

#if HAVE_CUDA
status = ucm_cudamem_install();
if (status != UCS_OK) {
ucm_debug("failed to install cudamem events");
goto out_unlock;
}
ucm_debug("cudaFree hooks are ready");
#endif

status = UCS_OK;

out_unlock:
Expand Down
14 changes: 13 additions & 1 deletion src/ucm/util/ucm_config.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#define UCM_EN_MMAP_RELOC_VAR "MMAP_RELOC"
#define UCM_EN_MALLOC_HOOKS_VAR "MALLOC_HOOKS"
#define UCM_EN_MALLOC_RELOC_VAR "MALLOC_RELOC"
#define UCM_EN_CUDA_HOOKS_VAR "CUDA_HOOKS"


ucm_config_t ucm_global_config = {
Expand All @@ -28,7 +29,10 @@ ucm_config_t ucm_global_config = {
.enable_events = 1,
.enable_mmap_reloc = 1,
.enable_malloc_hooks = 1,
.enable_malloc_reloc = 0
.enable_malloc_reloc = 0,
#if HAVE_CUDA
.enable_cuda_hooks = 1
#endif
};

static const char *ucm_config_bool_to_string(int value)
Expand Down Expand Up @@ -107,6 +111,10 @@ void ucm_config_print(FILE *stream, ucs_config_print_flags_t print_flags)
print_flags);
fprintf(stream, "%s%s=%s\n", UCM_ENV_PREFIX, UCM_EN_MALLOC_RELOC_VAR,
ucm_config_bool_to_string(ucm_global_config.enable_malloc_reloc));
#if HAVE_CUDA
fprintf(stream, "%s%s=%s\n", UCM_ENV_PREFIX, UCM_EN_CUDA_HOOKS_VAR,
ucm_config_bool_to_string(ucm_global_config.enable_cuda_hooks));
#endif
}

static void ucm_config_set_value_table(const char *str_value, const char **table,
Expand Down Expand Up @@ -157,6 +165,10 @@ ucs_status_t ucm_config_modify(const char *name, const char *value)
ucm_config_set_value_bool(value, &ucm_global_config.enable_malloc_hooks);
} else if (!strcmp(name, UCM_EN_MALLOC_RELOC_VAR)) {
ucm_config_set_value_bool(value, &ucm_global_config.enable_malloc_reloc);
#if HAVE_CUDA
} else if (!strcmp(name, UCM_EN_CUDA_HOOKS_VAR)) {
ucm_config_set_value_bool(value, &ucm_global_config.enable_cuda_hooks);
#endif
} else {
return UCS_ERR_INVALID_PARAM;
}
Expand Down
Loading