From f0691d8d9f42f175903959ef521c7f3861f6ce7c Mon Sep 17 00:00:00 2001 From: Yossi Itigin Date: Sun, 8 Oct 2017 13:51:20 +0300 Subject: [PATCH 01/16] TEST/UCP: Fix checking send request status after failure. Request may complete successfully before the error is detected. --- test/gtest/ucp/test_ucp_peer_failure.cc | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/test/gtest/ucp/test_ucp_peer_failure.cc b/test/gtest/ucp/test_ucp_peer_failure.cc index 64f1f9c63bb..d5b50f6906a 100644 --- a/test/gtest/ucp/test_ucp_peer_failure.cc +++ b/test/gtest/ucp/test_ucp_peer_failure.cc @@ -80,7 +80,7 @@ class test_ucp_peer_failure : virtual void init(); virtual void cleanup(); - void test_status_after(); + void test_status_after(bool request_must_fail); void test_force_close(); protected: @@ -134,7 +134,7 @@ void test_ucp_peer_failure::cleanup() { test_ucp_tag::cleanup(); } -void test_ucp_peer_failure::test_status_after() +void test_ucp_peer_failure::test_status_after(bool request_must_fail) { fail_receiver(); @@ -144,8 +144,17 @@ void test_ucp_peer_failure::test_status_after() wait_err(); EXPECT_NE(UCS_OK, m_err_status); if (UCS_PTR_IS_PTR(req)) { + /* The request may either succeed or fail, even though the data is not + * delivered - depends on when the error is detected on sender side and + * if zcopy/bcopy protocol is used. In any case, the request must + * complete, and all resources have to be released. + */ EXPECT_TRUE(req->completed); - EXPECT_EQ(m_err_status, req->status); + if (request_must_fail) { + EXPECT_EQ(m_err_status, req->status); + } else { + EXPECT_TRUE((m_err_status == req->status) || (UCS_OK == req->status)); + } request_release(req); } @@ -231,7 +240,7 @@ UCS_TEST_P(test_ucp_peer_failure, disable_sync_send) { } UCS_TEST_P(test_ucp_peer_failure, status_after_error) { - test_status_after(); + test_status_after(false); } UCP_INSTANTIATE_TEST_CASE(test_ucp_peer_failure) @@ -247,7 +256,7 @@ class test_ucp_peer_failure_zcopy : public test_ucp_peer_failure }; UCS_TEST_P(test_ucp_peer_failure_zcopy, status_after_error) { - test_status_after(); + test_status_after(true); } UCP_INSTANTIATE_TEST_CASE(test_ucp_peer_failure_zcopy) @@ -268,7 +277,7 @@ class test_ucp_peer_failure_zcopy_multi : public test_ucp_peer_failure_zcopy }; UCS_TEST_P(test_ucp_peer_failure_zcopy_multi, status_after_error) { - test_status_after(); + test_status_after(true); } UCP_INSTANTIATE_TEST_CASE(test_ucp_peer_failure_zcopy_multi) From eea96e95bf8e8ab86999522f1997367bcb8c7ca2 Mon Sep 17 00:00:00 2001 From: Yossi Itigin Date: Sun, 8 Oct 2017 04:05:27 +0300 Subject: [PATCH 02/16] UCS/RCACHE: Fix race condition in rcache region put. 
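
The race being fixed: the refcount was decremented atomically and tested
outside rcache->lock, so the zero-test and the invalidation were not atomic
with respect to a concurrent put. An illustrative interleaving (a sketch,
not code from this patch), with two threads releasing the last two
references to a region marked UCS_RCACHE_REGION_FLAG_INVALID:

    T1: ucs_atomic_add32(&region->refcount, -1);  /* refcount -> 1 */
    T2: ucs_atomic_add32(&region->refcount, -1);  /* refcount -> 0 */
    T1: if (region->refcount == 0)                /* true: reads 0  */
    T2: if (region->refcount == 0)                /* true: both may */
                                                  /* invalidate     */

Holding the write lock around both the decrement and the check, as done
below, serializes the two puts so that exactly one of them observes the
transition to zero.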
--- src/ucs/sys/rcache.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/ucs/sys/rcache.c b/src/ucs/sys/rcache.c index 69392b556f4..099917a6bad 100644 --- a/src/ucs/sys/rcache.c +++ b/src/ucs/sys/rcache.c @@ -523,16 +523,15 @@ void ucs_rcache_region_put(ucs_rcache_t *rcache, ucs_rcache_region_t *region) { ucs_rcache_region_trace(rcache, region, "put"); + pthread_rwlock_wrlock(&rcache->lock); + ucs_assert(region->refcount > 0); ucs_atomic_add32(®ion->refcount, -1); - if (ucs_unlikely(region->flags & UCS_RCACHE_REGION_FLAG_INVALID) && - (region->refcount == 0)) - { - pthread_rwlock_wrlock(&rcache->lock); + if (region->refcount == 0) { ucs_rcache_region_invalidate(rcache, region, 0, 1); - pthread_rwlock_unlock(&rcache->lock); } + pthread_rwlock_unlock(&rcache->lock); } static UCS_CLASS_INIT_FUNC(ucs_rcache_t, const ucs_rcache_params_t *params, From 948909ec48485780e5dc1cada6dc03cac046d585 Mon Sep 17 00:00:00 2001 From: Yossi Itigin Date: Tue, 3 Oct 2017 20:28:50 +0300 Subject: [PATCH 03/16] UCT/UCP: Disable interface progress by default. --- src/tools/perf/libperf.c | 3 +++ src/ucp/core/ucp_worker.c | 4 --- src/uct/api/uct.h | 2 +- src/uct/base/uct_iface.c | 11 ++++++-- src/uct/base/uct_iface.h | 3 +++ src/uct/ib/dc/accel/dc_mlx5.c | 3 --- src/uct/ib/dc/verbs/dc_verbs.c | 38 ++++++++++++++++++--------- src/uct/ib/rc/accel/rc_mlx5_ep.c | 6 ----- src/uct/ib/rc/accel/rc_mlx5_iface.c | 6 +++-- src/uct/ib/rc/verbs/rc_verbs.h | 5 ---- src/uct/ib/rc/verbs/rc_verbs_common.c | 11 ++++++-- src/uct/ib/rc/verbs/rc_verbs_common.h | 8 ++++++ src/uct/ib/rc/verbs/rc_verbs_ep.c | 7 +---- src/uct/ib/rc/verbs/rc_verbs_iface.c | 24 +++++++++++------ src/uct/ib/ud/base/ud_iface.c | 2 -- src/uct/sm/mm/mm_iface.c | 3 --- test/examples/uct_hello_world.c | 4 +++ test/gtest/uct/uct_test.cc | 2 ++ 18 files changed, 85 insertions(+), 57 deletions(-) diff --git a/src/tools/perf/libperf.c b/src/tools/perf/libperf.c index f3c2ebb08bb..c8d968327a8 100644 --- a/src/tools/perf/libperf.c +++ b/src/tools/perf/libperf.c @@ -1173,6 +1173,9 @@ static ucs_status_t uct_perf_setup(ucx_perf_context_t *perf, ucx_perf_params_t * goto out_free_mem; } + uct_iface_progress_enable(perf->uct.iface, + UCT_PROGRESS_SEND | UCT_PROGRESS_RECV); + return UCS_OK; out_free_mem: diff --git a/src/ucp/core/ucp_worker.c b/src/ucp/core/ucp_worker.c index e8e2ad34107..ea5063b44d6 100644 --- a/src/ucp/core/ucp_worker.c +++ b/src/ucp/core/ucp_worker.c @@ -682,10 +682,6 @@ ucp_worker_add_iface(ucp_worker_h worker, ucp_rsc_index_t tl_id, tl_id, wiface->iface, UCT_TL_RESOURCE_DESC_ARG(&resource->tl_rsc), worker); - /* Disable progress until we know better */ - uct_iface_progress_disable(wiface->iface, UCT_PROGRESS_SEND | - UCT_PROGRESS_RECV); - VALGRIND_MAKE_MEM_UNDEFINED(&wiface->attr, sizeof(wiface->attr)); status = uct_iface_query(wiface->iface, &wiface->attr); if (status != UCS_OK) { diff --git a/src/uct/api/uct.h b/src/uct/api/uct.h index 131fcd13715..0e9e579e5fa 100644 --- a/src/uct/api/uct.h +++ b/src/uct/api/uct.h @@ -2306,7 +2306,7 @@ UCT_INLINE_API ucs_status_t uct_iface_tag_recv_cancel(uct_iface_h iface, * Notify the transport that it should actively progress communications during * @ref uct_worker_progress(). * - * When the interface is created, its progress is enabled. + * When the interface is created, its progress is initially disabled. * * @param [in] iface The interface to enable progress. 
* @param [in] flags The type of progress to enable as defined by diff --git a/src/uct/base/uct_iface.c b/src/uct/base/uct_iface.c index 8ff721b9ce6..64b5f0e76d0 100644 --- a/src/uct/base/uct_iface.c +++ b/src/uct/base/uct_iface.c @@ -185,6 +185,14 @@ void uct_iface_close(uct_iface_h iface) void uct_base_iface_progress_enable(uct_iface_h tl_iface, unsigned flags) { uct_base_iface_t *iface = ucs_derived_of(tl_iface, uct_base_iface_t); + uct_base_iface_progress_enable_cb(iface, + (ucs_callback_t)iface->super.ops.iface_progress, + flags); +} + +void uct_base_iface_progress_enable_cb(uct_base_iface_t *iface, + ucs_callback_t cb, unsigned flags) +{ uct_priv_worker_t *worker = iface->worker; UCS_ASYNC_BLOCK(worker->async); @@ -192,8 +200,7 @@ void uct_base_iface_progress_enable(uct_iface_h tl_iface, unsigned flags) /* Add callback only if previous flags are 0 and new flags != 0 */ if (!iface->progress_flags && flags) { if (iface->prog.id == UCS_CALLBACKQ_ID_NULL) { - iface->prog.id = ucs_callbackq_add(&worker->super.progress_q, - (ucs_callback_t)iface->super.ops.iface_progress, + iface->prog.id = ucs_callbackq_add(&worker->super.progress_q, cb, iface, UCS_CALLBACKQ_FLAG_FAST); } } diff --git a/src/uct/base/uct_iface.h b/src/uct/base/uct_iface.h index 12abee28d0e..b52ad540b7a 100644 --- a/src/uct/base/uct_iface.h +++ b/src/uct/base/uct_iface.h @@ -481,6 +481,9 @@ ucs_status_t uct_base_iface_fence(uct_iface_h tl_iface, unsigned flags); void uct_base_iface_progress_enable(uct_iface_h tl_iface, unsigned flags); +void uct_base_iface_progress_enable_cb(uct_base_iface_t *iface, + ucs_callback_t cb, unsigned flags); + void uct_base_iface_progress_disable(uct_iface_h tl_iface, unsigned flags); ucs_status_t uct_base_ep_flush(uct_ep_h tl_ep, unsigned flags, diff --git a/src/uct/ib/dc/accel/dc_mlx5.c b/src/uct/ib/dc/accel/dc_mlx5.c index e7f390cd8cd..60a85a19f00 100644 --- a/src/uct/ib/dc/accel/dc_mlx5.c +++ b/src/uct/ib/dc/accel/dc_mlx5.c @@ -801,9 +801,6 @@ static UCS_CLASS_INIT_FUNC(uct_dc_mlx5_iface_t, uct_md_h md, uct_worker_h worker UCT_IB_MLX5_AV_FULL_SIZE) / sizeof(struct mlx5_wqe_data_seg)); - /* TODO: only register progress when we have a connection */ - uct_base_iface_progress_enable(&self->super.super.super.super.super, - UCT_PROGRESS_SEND | UCT_PROGRESS_RECV); ucs_debug("created dc iface %p", self); return UCS_OK; diff --git a/src/uct/ib/dc/verbs/dc_verbs.c b/src/uct/ib/dc/verbs/dc_verbs.c index 882769178a9..3308f847e00 100644 --- a/src/uct/ib/dc/verbs/dc_verbs.c +++ b/src/uct/ib/dc/verbs/dc_verbs.c @@ -767,9 +767,9 @@ uct_dc_verbs_poll_tx(uct_dc_verbs_iface_t *iface) return num_wcs; } -static unsigned uct_dc_verbs_iface_progress(uct_iface_h tl_iface) +static unsigned uct_dc_verbs_iface_progress(void *arg) { - uct_dc_verbs_iface_t *iface = ucs_derived_of(tl_iface, uct_dc_verbs_iface_t); + uct_dc_verbs_iface_t *iface = arg; unsigned count; count = uct_rc_verbs_iface_poll_rx_common(&iface->super.super); @@ -978,9 +978,9 @@ static ucs_status_t uct_dc_verbs_iface_tag_recv_cancel(uct_iface_h tl_iface, ctx, force); } -static unsigned uct_dc_verbs_iface_progress_tm(uct_iface_h tl_iface) +static unsigned uct_dc_verbs_iface_progress_tm(void *arg) { - uct_dc_verbs_iface_t *iface = ucs_derived_of(tl_iface, uct_dc_verbs_iface_t); + uct_dc_verbs_iface_t *iface = arg; unsigned count; count = uct_rc_verbs_iface_poll_rx_tm(&iface->verbs_common, @@ -997,9 +997,8 @@ static ucs_status_t uct_dc_verbs_iface_tag_init(uct_dc_verbs_iface_t *iface, uct_dc_verbs_iface_config_t *config) { - uct_iface_t *tl_iface = 
&iface->super.super.super.super.super; - #if IBV_EXP_HW_TM_DC + if (UCT_RC_VERBS_TM_ENABLED(&iface->verbs_common)) { struct ibv_exp_create_srq_attr srq_init_attr = {}; struct ibv_exp_srq_dc_offload_params dc_op = {}; @@ -1032,13 +1031,13 @@ uct_dc_verbs_iface_tag_init(uct_dc_verbs_iface_t *iface, return status; } - - tl_iface->ops.iface_progress = uct_dc_verbs_iface_progress_tm; - } + iface->verbs_common.progress = uct_dc_verbs_iface_progress_tm; + } else #endif + { + iface->verbs_common.progress = uct_dc_verbs_iface_progress; + } - uct_base_iface_progress_enable(tl_iface, - UCT_PROGRESS_SEND | UCT_PROGRESS_RECV); return UCS_OK; } @@ -1096,6 +1095,19 @@ uct_dc_verbs_iface_event_arm(uct_iface_h tl_iface, unsigned events) UCT_RC_VERBS_TM_ENABLED(&iface->verbs_common)); } +static void uct_dc_verbs_iface_progress_enable(uct_iface_h tl_iface, unsigned flags) +{ + uct_dc_verbs_iface_t *iface = ucs_derived_of(tl_iface, uct_dc_verbs_iface_t); + uct_rc_verbs_iface_common_progress_enable(&iface->verbs_common, + &iface->super.super, flags); +} + +static unsigned uct_dc_verbs_iface_do_progress(uct_iface_h tl_iface) +{ + uct_dc_verbs_iface_t *iface = ucs_derived_of(tl_iface, uct_dc_verbs_iface_t); + return iface->verbs_common.progress(iface); +} + static void UCS_CLASS_DELETE_FUNC_NAME(uct_dc_verbs_iface_t)(uct_iface_t*); static uct_dc_iface_ops_t uct_dc_verbs_iface_ops = { @@ -1136,9 +1148,9 @@ static uct_dc_iface_ops_t uct_dc_verbs_iface_ops = { #endif .iface_flush = uct_dc_iface_flush, .iface_fence = uct_base_iface_fence, - .iface_progress_enable = uct_base_iface_progress_enable, + .iface_progress_enable = uct_dc_verbs_iface_progress_enable, .iface_progress_disable = uct_base_iface_progress_disable, - .iface_progress = uct_dc_verbs_iface_progress, + .iface_progress = uct_dc_verbs_iface_do_progress, .iface_event_fd_get = uct_ib_iface_event_fd_get, .iface_event_arm = uct_dc_verbs_iface_event_arm, .iface_close = UCS_CLASS_DELETE_FUNC_NAME(uct_dc_verbs_iface_t), diff --git a/src/uct/ib/rc/accel/rc_mlx5_ep.c b/src/uct/ib/rc/accel/rc_mlx5_ep.c index 719f314e5d2..28e142acb3e 100644 --- a/src/uct/ib/rc/accel/rc_mlx5_ep.c +++ b/src/uct/ib/rc/accel/rc_mlx5_ep.c @@ -443,10 +443,6 @@ UCS_CLASS_INIT_FUNC(uct_rc_mlx5_ep_t, uct_iface_h tl_iface) self->qp_num = self->super.txqp.qp->qp_num; self->tx.wq.bb_max = ucs_min(self->tx.wq.bb_max, iface->tx.bb_max); uct_rc_txqp_available_set(&self->super.txqp, self->tx.wq.bb_max); - - uct_worker_progress_add_safe(iface->super.super.super.worker, - uct_rc_mlx5_iface_progress, iface, - &iface->super.super.super.prog); return UCS_OK; } @@ -455,8 +451,6 @@ static UCS_CLASS_CLEANUP_FUNC(uct_rc_mlx5_ep_t) uct_rc_mlx5_iface_t *iface = ucs_derived_of(self->super.super.super.iface, uct_rc_mlx5_iface_t); - uct_worker_progress_remove(iface->super.super.super.worker, - &iface->super.super.super.prog); uct_ib_mlx5_txwq_cleanup(&self->tx.wq); /* Modify QP to error to make HW generate CQEs for all in-progress SRQ diff --git a/src/uct/ib/rc/accel/rc_mlx5_iface.c b/src/uct/ib/rc/accel/rc_mlx5_iface.c index 162110a8ef1..d47962ed9e4 100644 --- a/src/uct/ib/rc/accel/rc_mlx5_iface.c +++ b/src/uct/ib/rc/accel/rc_mlx5_iface.c @@ -174,6 +174,8 @@ static UCS_CLASS_INIT_FUNC(uct_rc_mlx5_iface_t, uct_md_h md, uct_worker_h worker static UCS_CLASS_CLEANUP_FUNC(uct_rc_mlx5_iface_t) { + uct_base_iface_progress_disable(&self->super.super.super.super, + UCT_PROGRESS_SEND | UCT_PROGRESS_RECV); uct_rc_mlx5_iface_common_cleanup(&self->mlx5_common); } @@ -213,8 +215,8 @@ static uct_rc_iface_ops_t 
uct_rc_mlx5_iface_ops = { .ep_connect_to_ep = uct_rc_ep_connect_to_ep, .iface_flush = uct_rc_iface_flush, .iface_fence = uct_base_iface_fence, - .iface_progress_enable = ucs_empty_function, - .iface_progress_disable = ucs_empty_function, + .iface_progress_enable = uct_base_iface_progress_enable, + .iface_progress_disable = uct_base_iface_progress_disable, .iface_progress = (void*)uct_rc_mlx5_iface_progress, .iface_event_fd_get = uct_ib_iface_event_fd_get, .iface_event_arm = uct_rc_iface_event_arm, diff --git a/src/uct/ib/rc/verbs/rc_verbs.h b/src/uct/ib/rc/verbs/rc_verbs.h index e3e7067a526..750de26e8af 100644 --- a/src/uct/ib/rc/verbs/rc_verbs.h +++ b/src/uct/ib/rc/verbs/rc_verbs.h @@ -57,9 +57,6 @@ typedef struct uct_rc_verbs_iface { struct { unsigned tx_max_wr; } config; - - /* Progress function (either regular or TM aware) */ - ucs_callback_t progress; } uct_rc_verbs_iface_t; @@ -198,8 +195,6 @@ ucs_status_t uct_rc_verbs_ep_connect_to_ep(uct_ep_h tl_ep, ucs_status_t uct_rc_verbs_ep_get_address(uct_ep_h tl_ep, uct_ep_addr_t *addr); -unsigned uct_rc_verbs_iface_progress(void *arg); - ucs_status_t uct_rc_verbs_ep_fc_ctrl(uct_ep_t *tl_ep, unsigned op, uct_rc_fc_request_t *req); diff --git a/src/uct/ib/rc/verbs/rc_verbs_common.c b/src/uct/ib/rc/verbs/rc_verbs_common.c index badc2b9d974..4e7bed1496f 100644 --- a/src/uct/ib/rc/verbs/rc_verbs_common.c +++ b/src/uct/ib/rc/verbs/rc_verbs_common.c @@ -165,8 +165,13 @@ ucs_status_t uct_rc_verbs_iface_prepost_recvs_common(uct_rc_iface_t *iface) return UCS_OK; } -#if IBV_EXP_HW_TM - +void uct_rc_verbs_iface_common_progress_enable(uct_rc_verbs_iface_common_t *iface, + uct_rc_iface_t *rc_iface, + unsigned flags) +{ + uct_base_iface_progress_enable_cb(&rc_iface->super.super, iface->progress, + flags); +} static void uct_rc_verbs_iface_release_desc(uct_recv_desc_t *self, void *desc) { @@ -176,6 +181,8 @@ static void uct_rc_verbs_iface_release_desc(uct_recv_desc_t *self, void *desc) ucs_mpool_put_inline(ib_desc); } +#if IBV_EXP_HW_TM + ucs_status_t uct_rc_verbs_iface_common_tag_init(uct_rc_verbs_iface_common_t *iface, uct_rc_iface_t *rc_iface, diff --git a/src/uct/ib/rc/verbs/rc_verbs_common.h b/src/uct/ib/rc/verbs/rc_verbs_common.h index 8463e87adbf..9a6602f001e 100644 --- a/src/uct/ib/rc/verbs/rc_verbs_common.h +++ b/src/uct/ib/rc/verbs/rc_verbs_common.h @@ -124,6 +124,10 @@ typedef struct uct_rc_verbs_iface_common { uct_rc_verbs_release_desc_t rndv_desc; } tm; #endif + + /* Progress function (either regular or TM aware) */ + ucs_callback_t progress; + /* TODO: make a separate datatype */ struct { size_t notag_hdr_size; @@ -172,6 +176,10 @@ void uct_rc_verbs_iface_common_tag_cleanup(uct_rc_verbs_iface_common_t *iface); ucs_status_t uct_rc_verbs_iface_prepost_recvs_common(uct_rc_iface_t *iface); +void uct_rc_verbs_iface_common_progress_enable(uct_rc_verbs_iface_common_t *iface, + uct_rc_iface_t *rc_iface, + unsigned flags); + void uct_rc_verbs_iface_common_query(uct_rc_verbs_iface_common_t *verbs_iface, uct_rc_iface_t *rc_iface, uct_iface_attr_t *iface_attr); diff --git a/src/uct/ib/rc/verbs/rc_verbs_ep.c b/src/uct/ib/rc/verbs/rc_verbs_ep.c index 49cba9e0270..eb02d40ec4a 100644 --- a/src/uct/ib/rc/verbs/rc_verbs_ep.c +++ b/src/uct/ib/rc/verbs/rc_verbs_ep.c @@ -831,10 +831,6 @@ UCS_CLASS_INIT_FUNC(uct_rc_verbs_ep_t, uct_iface_h tl_iface) uct_rc_txqp_available_set(&self->super.txqp, iface->config.tx_max_wr); uct_rc_verbs_txcnt_init(&self->txcnt); - uct_worker_progress_add_safe(iface->super.super.super.worker, - iface->progress, iface, - 
&iface->super.super.super.prog); - return uct_rc_verbs_ep_tag_qp_create(iface, self); } @@ -842,8 +838,7 @@ static UCS_CLASS_CLEANUP_FUNC(uct_rc_verbs_ep_t) { uct_rc_verbs_iface_t *iface = ucs_derived_of(self->super.super.super.iface, uct_rc_verbs_iface_t); - uct_worker_progress_remove(iface->super.super.super.worker, - &iface->super.super.super.prog); + /* NOTE: usually, ci == pi here, but if user calls * flush(UCT_FLUSH_FLAG_CANCEL) then ep_destroy without next progress, * TX-completion handler is not able to return CQ credits because diff --git a/src/uct/ib/rc/verbs/rc_verbs_iface.c b/src/uct/ib/rc/verbs/rc_verbs_iface.c index 2cfd15bc715..a77d9737f6a 100644 --- a/src/uct/ib/rc/verbs/rc_verbs_iface.c +++ b/src/uct/ib/rc/verbs/rc_verbs_iface.c @@ -108,7 +108,7 @@ uct_rc_verbs_iface_poll_tx(uct_rc_verbs_iface_t *iface) return num_wcs; } -unsigned uct_rc_verbs_iface_progress(void *arg) +static unsigned uct_rc_verbs_iface_progress(void *arg) { uct_rc_verbs_iface_t *iface = arg; unsigned count; @@ -121,11 +121,10 @@ unsigned uct_rc_verbs_iface_progress(void *arg) return uct_rc_verbs_iface_poll_tx(iface); } -unsigned uct_rc_verbs_iface_do_progress(uct_iface_h tl_iface) +static unsigned uct_rc_verbs_iface_do_progress(uct_iface_h tl_iface) { uct_rc_verbs_iface_t *iface = ucs_derived_of(tl_iface, uct_rc_verbs_iface_t); - - return iface->progress(iface); + return iface->verbs_common.progress(iface); } #if IBV_EXP_HW_TM @@ -185,7 +184,7 @@ uct_rc_verbs_iface_tag_init(uct_rc_verbs_iface_t *iface, if (UCT_RC_VERBS_TM_ENABLED(&iface->verbs_common)) { struct ibv_exp_create_srq_attr srq_init_attr = {}; - iface->progress = uct_rc_verbs_iface_progress_tm; + iface->verbs_common.progress = uct_rc_verbs_iface_progress_tm; return uct_rc_verbs_iface_common_tag_init(&iface->verbs_common, &iface->super, @@ -195,7 +194,7 @@ uct_rc_verbs_iface_tag_init(uct_rc_verbs_iface_t *iface, sizeof(struct ibv_exp_tmh_rvh)); } #endif - iface->progress = uct_rc_verbs_iface_progress; + iface->verbs_common.progress = uct_rc_verbs_iface_progress; return UCS_OK; } @@ -261,6 +260,13 @@ static ucs_status_t uct_rc_verbs_iface_query(uct_iface_h tl_iface, uct_iface_att return UCS_OK; } +static void uct_rc_verbs_iface_progress_enable(uct_iface_h tl_iface, unsigned flags) +{ + uct_rc_verbs_iface_t *iface = ucs_derived_of(tl_iface, uct_rc_verbs_iface_t); + uct_rc_verbs_iface_common_progress_enable(&iface->verbs_common, &iface->super, + flags); +} + static UCS_CLASS_INIT_FUNC(uct_rc_verbs_iface_t, uct_md_h md, uct_worker_h worker, const uct_iface_params_t *params, const uct_iface_config_t *tl_config) @@ -334,6 +340,8 @@ static UCS_CLASS_INIT_FUNC(uct_rc_verbs_iface_t, uct_md_h md, uct_worker_h worke static UCS_CLASS_CLEANUP_FUNC(uct_rc_verbs_iface_t) { + uct_base_iface_progress_disable(&self->super.super.super.super, + UCT_PROGRESS_SEND | UCT_PROGRESS_RECV); uct_rc_verbs_iface_common_tag_cleanup(&self->verbs_common); uct_rc_verbs_iface_common_cleanup(&self->verbs_common); } @@ -373,8 +381,8 @@ static uct_rc_iface_ops_t uct_rc_verbs_iface_ops = { .ep_connect_to_ep = uct_rc_verbs_ep_connect_to_ep, .iface_flush = uct_rc_iface_flush, .iface_fence = uct_base_iface_fence, - .iface_progress_enable = ucs_empty_function, - .iface_progress_disable = ucs_empty_function, + .iface_progress_enable = uct_rc_verbs_iface_progress_enable, + .iface_progress_disable = uct_base_iface_progress_disable, .iface_progress = uct_rc_verbs_iface_do_progress, #if IBV_EXP_HW_TM .iface_tag_recv_zcopy = uct_rc_verbs_iface_tag_recv_zcopy, diff --git 
a/src/uct/ib/ud/base/ud_iface.c b/src/uct/ib/ud/base/ud_iface.c
index 6737e4a8bc9..bfb5bccecb5 100644
--- a/src/uct/ib/ud/base/ud_iface.c
+++ b/src/uct/ib/ud/base/ud_iface.c
@@ -367,8 +367,6 @@ ucs_status_t uct_ud_iface_complete_init(uct_ud_iface_t *iface)
         goto err_twheel_cleanup;
     }
 
-    uct_base_iface_progress_enable(&iface->super.super.super,
-                                   UCT_PROGRESS_SEND | UCT_PROGRESS_RECV);
     return UCS_OK;
 
 err_twheel_cleanup:
diff --git a/src/uct/sm/mm/mm_iface.c b/src/uct/sm/mm/mm_iface.c
index 1a2b331747b..3b45622c94e 100644
--- a/src/uct/sm/mm/mm_iface.c
+++ b/src/uct/sm/mm/mm_iface.c
@@ -550,9 +550,6 @@ static UCS_CLASS_INIT_FUNC(uct_mm_iface_t, uct_md_h md, uct_worker_h worker,
 
     ucs_arbiter_init(&self->arbiter);
 
-    uct_base_iface_progress_enable(&self->super.super,
-                                   UCT_PROGRESS_SEND | UCT_PROGRESS_RECV);
-
     ucs_debug("Created an MM iface. FIFO mm id: %zu", self->fifo_mm_id);
 
     return UCS_OK;
diff --git a/test/examples/uct_hello_world.c b/test/examples/uct_hello_world.c
index 43381c5525e..8e1e8c0014b 100644
--- a/test/examples/uct_hello_world.c
+++ b/test/examples/uct_hello_world.c
@@ -231,6 +231,10 @@ static ucs_status_t init_iface(char *dev_name, char *tl_name,
     uct_config_release(config);
     CHKERR_JUMP(UCS_OK != status, "open temporary interface", error_ret);
 
+    /* Enable progress on the interface */
+    uct_iface_progress_enable(iface_p->iface,
+                              UCT_PROGRESS_SEND | UCT_PROGRESS_RECV);
+
     /* Get interface attributes */
     status = uct_iface_query(iface_p->iface, &iface_p->attr);
     CHKERR_JUMP(UCS_OK != status, "query iface", error_iface);
diff --git a/test/gtest/uct/uct_test.cc b/test/gtest/uct/uct_test.cc
index 049224f3f05..745e3cd8e05 100644
--- a/test/gtest/uct/uct_test.cc
+++ b/test/gtest/uct/uct_test.cc
@@ -272,6 +272,8 @@ uct_test::entity::entity(const resource& resource, uct_iface_config_t *iface_con
 
     status = uct_iface_query(m_iface, &m_iface_attr);
     ASSERT_UCS_OK(status);
+
+    uct_iface_progress_enable(m_iface, UCT_PROGRESS_SEND | UCT_PROGRESS_RECV);
 }

From f9c1a31cd49ab35b5dede3d4abeadbe22c3b8c64 Mon Sep 17 00:00:00 2001
From: Yossi Itigin
Date: Sat, 7 Oct 2017 16:07:57 +0300
Subject: [PATCH 04/16] UCT/RC/VERBS: Compilation fix when HW TM not supported.

---
 src/uct/ib/rc/verbs/rc_verbs_common.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/uct/ib/rc/verbs/rc_verbs_common.c b/src/uct/ib/rc/verbs/rc_verbs_common.c
index 4e7bed1496f..106564a1e4e 100644
--- a/src/uct/ib/rc/verbs/rc_verbs_common.c
+++ b/src/uct/ib/rc/verbs/rc_verbs_common.c
@@ -173,6 +173,8 @@ void uct_rc_verbs_iface_common_progress_enable(uct_rc_verbs_iface_common_t *ifac
                                        flags);
 }
 
+#if IBV_EXP_HW_TM
+
 static void uct_rc_verbs_iface_release_desc(uct_recv_desc_t *self, void *desc)
 {
     uct_rc_verbs_release_desc_t *release = ucs_derived_of(self,
@@ -181,8 +183,6 @@ static void uct_rc_verbs_iface_release_desc(uct_recv_desc_t *self, void *desc)
     ucs_mpool_put_inline(ib_desc);
 }
 
-#if IBV_EXP_HW_TM
-
 ucs_status_t
 uct_rc_verbs_iface_common_tag_init(uct_rc_verbs_iface_common_t *iface,
                                    uct_rc_iface_t *rc_iface,

From 7eaa6e7efa17d3043bd9fc2eeb501ccc12d4d256 Mon Sep 17 00:00:00 2001
From: Yossi Itigin
Date: Sun, 8 Oct 2017 00:01:50 +0300
Subject: [PATCH 05/16] UCT/API/UCP: Add thread-safety flag for UCT progress control.

UCT progress control may be called when processing a wireup message, for
p2p transports such as RC. In this case, enabling progress on the
interface must be thread-safe with respect to the main thread, which may
be calling ucp_worker_progress().
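
For example (an illustrative sketch, not code from this series), a wireup
path running on the async thread, while the application thread may be
inside ucp_worker_progress(), would enable progress as:

    /* async thread: request a thread-safe modification of the progress
     * callbacks */
    uct_iface_progress_enable(iface, UCT_PROGRESS_SEND |
                                     UCT_PROGRESS_RECV |
                                     UCT_PROGRESS_THREAD_SAFE);

Internally, the flag makes uct_base_iface_progress_enable_cb() use
ucs_callbackq_add_safe() instead of ucs_callbackq_add(), and likewise for
removal.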
--- src/ucp/core/ucp_context.c | 2 +- src/ucp/core/ucp_worker.c | 15 ++++++++++----- src/ucp/core/ucp_worker.h | 2 +- src/uct/api/uct.h | 17 ++++++++++++++--- src/uct/base/uct_iface.c | 26 +++++++++++++++++++++----- 5 files changed, 47 insertions(+), 15 deletions(-) diff --git a/src/ucp/core/ucp_context.c b/src/ucp/core/ucp_context.c index a10dd627e38..298f55d7279 100644 --- a/src/ucp/core/ucp_context.c +++ b/src/ucp/core/ucp_context.c @@ -978,7 +978,7 @@ void ucp_context_tag_offload_enable(ucp_context_h context) offload_iface = ucs_queue_head_elem_non_empty(&context->tm.offload.ifaces, ucp_worker_iface_t, queue); - ucp_worker_iface_activate(offload_iface); + ucp_worker_iface_activate(offload_iface, 0); ucs_debug("Enable TM offload: thresh %zu, zcopy_thresh %zu", context->tm.offload.thresh, context->tm.offload.zcopy_thresh); diff --git a/src/ucp/core/ucp_worker.c b/src/ucp/core/ucp_worker.c index ea5063b44d6..70c1e27e7ad 100644 --- a/src/ucp/core/ucp_worker.c +++ b/src/ucp/core/ucp_worker.c @@ -381,7 +381,7 @@ ucp_worker_iface_error_handler(void *arg, uct_ep_h uct_ep, ucs_status_t status) } } -void ucp_worker_iface_activate(ucp_worker_iface_t *wiface) +void ucp_worker_iface_activate(ucp_worker_iface_t *wiface, unsigned uct_flags) { ucp_worker_h worker = wiface->worker; ucs_status_t status; @@ -408,7 +408,7 @@ void ucp_worker_iface_activate(ucp_worker_iface_t *wiface) } uct_iface_progress_enable(wiface->iface, - UCT_PROGRESS_SEND | UCT_PROGRESS_RECV); + UCT_PROGRESS_SEND | UCT_PROGRESS_RECV | uct_flags); } static void ucp_worker_iface_deactivate(ucp_worker_iface_t *wiface, int force) @@ -451,7 +451,12 @@ void ucp_worker_iface_progress_ep(ucp_worker_iface_t *wiface) ucs_trace_func("iface=%p", wiface->iface); UCS_ASYNC_BLOCK(&wiface->worker->async); - ucp_worker_iface_activate(wiface); + + /* This function may be called from progress thread (such as when processing + * wireup messages), so ask UCT to be thread-safe. 
+ */ + ucp_worker_iface_activate(wiface, UCT_PROGRESS_THREAD_SAFE); + UCS_ASYNC_UNBLOCK(&wiface->worker->async); } @@ -488,7 +493,7 @@ static ucs_status_t ucp_worker_iface_check_events_do(ucp_worker_iface_t *wiface, *progress_count = uct_iface_progress(wiface->iface); if (prev_am_count != wiface->proxy_am_count) { /* Received relevant active messages, activate the interface */ - ucp_worker_iface_activate(wiface); + ucp_worker_iface_activate(wiface, 0); return UCS_OK; } else if (*progress_count == 0) { /* Arm the interface to wait for next event */ @@ -721,7 +726,7 @@ ucp_worker_add_iface(ucp_worker_h worker, ucp_rsc_index_t tl_id, { ucp_worker_iface_deactivate(wiface, 1); } else { - ucp_worker_iface_activate(wiface); + ucp_worker_iface_activate(wiface, 0); } } diff --git a/src/ucp/core/ucp_worker.h b/src/ucp/core/ucp_worker.h index 916755910a5..dba1dca46ca 100644 --- a/src/ucp/core/ucp_worker.h +++ b/src/ucp/core/ucp_worker.h @@ -169,7 +169,7 @@ void ucp_worker_iface_unprogress_ep(ucp_worker_iface_t *wiface); void ucp_worker_signal_internal(ucp_worker_h worker); -void ucp_worker_iface_activate(ucp_worker_iface_t *wiface); +void ucp_worker_iface_activate(ucp_worker_iface_t *wiface, unsigned uct_flags); static inline const char* ucp_worker_get_name(ucp_worker_h worker) { diff --git a/src/uct/api/uct.h b/src/uct/api/uct.h index 0e9e579e5fa..e2eca43c932 100644 --- a/src/uct/api/uct.h +++ b/src/uct/api/uct.h @@ -290,8 +290,11 @@ enum uct_flush_flags { * @brief UCT progress types */ enum uct_progress_types { - UCT_PROGRESS_SEND = UCS_BIT(0), /**< Progress send operations */ - UCT_PROGRESS_RECV = UCS_BIT(1) /**< Progress receive operations */ + UCT_PROGRESS_SEND = UCS_BIT(0), /**< Progress send operations */ + UCT_PROGRESS_RECV = UCS_BIT(1), /**< Progress receive operations */ + UCT_PROGRESS_THREAD_SAFE = UCS_BIT(7) /**< Enable/disable progress while + another thread may be calling + @ref ucp_worker_progress(). */ }; @@ -2310,7 +2313,11 @@ UCT_INLINE_API ucs_status_t uct_iface_tag_recv_cancel(uct_iface_h iface, * * @param [in] iface The interface to enable progress. * @param [in] flags The type of progress to enable as defined by - * @ref uct_progress_types. + * @ref uct_progress_types + * + * @note This function is not thread safe with respect to + * @ref ucp_worker_progress(), unless the flag + * @ref UCT_PROGRESS_THREAD_SAFE is specified. * */ UCT_INLINE_API void uct_iface_progress_enable(uct_iface_h iface, unsigned flags) @@ -2333,6 +2340,10 @@ UCT_INLINE_API void uct_iface_progress_enable(uct_iface_h iface, unsigned flags) * @param [in] flags The type of progress to disable as defined by * @ref uct_progress_types. * + * @note This function is not thread safe with respect to + * @ref ucp_worker_progress(), unless the flag + * @ref UCT_PROGRESS_THREAD_SAFE is specified. 
+ * */ UCT_INLINE_API void uct_iface_progress_disable(uct_iface_h iface, unsigned flags) { diff --git a/src/uct/base/uct_iface.c b/src/uct/base/uct_iface.c index 64b5f0e76d0..fe794255c2c 100644 --- a/src/uct/base/uct_iface.c +++ b/src/uct/base/uct_iface.c @@ -194,12 +194,21 @@ void uct_base_iface_progress_enable_cb(uct_base_iface_t *iface, ucs_callback_t cb, unsigned flags) { uct_priv_worker_t *worker = iface->worker; + unsigned thread_safe; UCS_ASYNC_BLOCK(worker->async); + thread_safe = flags & UCT_PROGRESS_THREAD_SAFE; + flags &= ~UCT_PROGRESS_THREAD_SAFE; + /* Add callback only if previous flags are 0 and new flags != 0 */ - if (!iface->progress_flags && flags) { - if (iface->prog.id == UCS_CALLBACKQ_ID_NULL) { + if ((!iface->progress_flags && flags) && + (iface->prog.id == UCS_CALLBACKQ_ID_NULL)) { + if (thread_safe) { + iface->prog.id = ucs_callbackq_add_safe(&worker->super.progress_q, + cb, iface, + UCS_CALLBACKQ_FLAG_FAST); + } else { iface->prog.id = ucs_callbackq_add(&worker->super.progress_q, cb, iface, UCS_CALLBACKQ_FLAG_FAST); } @@ -213,17 +222,24 @@ void uct_base_iface_progress_disable(uct_iface_h tl_iface, unsigned flags) { uct_base_iface_t *iface = ucs_derived_of(tl_iface, uct_base_iface_t); uct_priv_worker_t *worker = iface->worker; + unsigned thread_safe; UCS_ASYNC_BLOCK(worker->async); + thread_safe = flags & UCT_PROGRESS_THREAD_SAFE; + flags &= ~UCT_PROGRESS_THREAD_SAFE; + /* Remove callback only if previous flags != 0, and removing the given * flags makes it become 0. */ - if (iface->progress_flags && !(iface->progress_flags & ~flags)) { - if (iface->prog.id != UCS_CALLBACKQ_ID_NULL) { + if ((iface->progress_flags && !(iface->progress_flags & ~flags)) && + (iface->prog.id != UCS_CALLBACKQ_ID_NULL)) { + if (thread_safe) { + ucs_callbackq_remove_safe(&worker->super.progress_q, iface->prog.id); + } else { ucs_callbackq_remove(&worker->super.progress_q, iface->prog.id); - iface->prog.id = UCS_CALLBACKQ_ID_NULL; } + iface->prog.id = UCS_CALLBACKQ_ID_NULL; } iface->progress_flags &= ~flags; From d24e5b25ee1758ce58adca0a6007995a4dd9b7a9 Mon Sep 17 00:00:00 2001 From: Yossi Itigin Date: Tue, 10 Oct 2017 19:45:25 +0300 Subject: [PATCH 06/16] UCT/API: Fix comment for uct_iface_progress_disable(). --- src/uct/api/uct.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/uct/api/uct.h b/src/uct/api/uct.h index e2eca43c932..4c747a5497b 100644 --- a/src/uct/api/uct.h +++ b/src/uct/api/uct.h @@ -2334,7 +2334,7 @@ UCT_INLINE_API void uct_iface_progress_enable(uct_iface_h iface, unsigned flags) * @ref uct_worker_progress(). Thus the latency of other transports may be * improved. * - * By default, progress is enabled when the interface is created. + * By default, progress is disabled when the interface is created. * * @param [in] iface The interface to disable progress. * @param [in] flags The type of progress to disable as defined by From 869706af8d71951400425f447efdb3a7b2485288 Mon Sep 17 00:00:00 2001 From: Devendar Bureddy Date: Tue, 26 Sep 2017 15:10:59 -0700 Subject: [PATCH 07/16] CUDA: configuration flags --- config/m4/cuda.m4 | 87 ++++++++++++++++++++++++++++++++++++++++++++++- configure.ac | 1 + 2 files changed, 87 insertions(+), 1 deletion(-) diff --git a/config/m4/cuda.m4 b/config/m4/cuda.m4 index 8fc12fa38e6..55cab62ee05 100644 --- a/config/m4/cuda.m4 +++ b/config/m4/cuda.m4 @@ -1 +1,86 @@ -AM_CONDITIONAL([HAVE_CUDA], [true]) +# +# Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED. +# See file LICENSE for terms. 
+# + +# +# Check for CUDA support +# +cuda_happy="no" +gdrcopy_happy="no" + +AC_ARG_WITH([cuda], + [AS_HELP_STRING([--with-cuda=(DIR)], [Enable the use of CUDA (default is no).])], + [], [with_cuda=no]) + +AS_IF([test "x$with_cuda" != "xno"], + [AS_IF([test ! -z "$with_cuda" -a "x$with_cuda" != "xyes"], + [ + ucx_check_cuda_dir="$with_cuda" + ucx_check_cuda_libdir="$with_cuda/lib64 " + ]) + AS_IF([test ! -z "$with_cuda_libdir" -a "x$with_cuda_libdir" != "xyes"], + [ucx_check_cuda_libdir="$with_nccl_libdir"]) + + AC_CHECK_HEADERS([cuda.h cuda_runtime.h], + [AC_CHECK_DECLS([cuPointerGetAttribute], + [cuda_happy="yes"], + [AC_MSG_WARN([CUDA runtime not detected. Disable.]) + cuda_happy="no"], + [#include ]) + AS_IF([test "x$cuda_happy" == "xyes"], + [AC_DEFINE([HAVE_CUDA], 1, [Enable CUDA support]) + AC_SUBST(CUDA_CPPFLAGS, "-I$ucx_check_cuda_dir/include ") + AC_SUBST(CUDA_CFLAGS, "-I$ucx_check_cuda_dir/include ") + AC_SUBST(CUDA_LDFLAGS, "-lcudart -lcuda -L$ucs_check_cuda_libdir/ ") + CFLAGS="$CFLAGS $CUDA_CFLAGS" + CPPFLAGS="$CPPFLAGS $CUDA_CPPFLAGS" + LDFLAGS="$LDFLAGS $CUDA_LDFLAGS"], + [])], + [AC_MSG_WARN([CUDA not found]) + AC_DEFINE([HAVE_CUDA], [0], [Disable the use of CUDA])])], + [AC_MSG_WARN([CUDA was explicitly disabled]) + AC_DEFINE([HAVE_CUDA], [0], [Disable the use of CUDA])] +) + + +AM_CONDITIONAL([HAVE_CUDA], [test "x$cuda_happy" != xno]) + +AC_ARG_WITH([gdrcopy], + [AS_HELP_STRING([--with-gdrcopy=(DIR)], [Enable the use of GDR_COPY (default is no).])], + [], [with_gdrcopy=no]) + +AS_IF([test "x$with_gdrcopy" != "xno"], + + [AS_IF([test "x$cuda_happy" == "xno"], + [AC_MSG_ERROR([--with-cuda not specified ...])],[:]) + AS_IF([test ! -z "$with_gdrcopy" -a "x$with_gdrcopy" != "xyes"], + [ + ucx_check_gdrcopy_dir="$with_gdrcopy" + ucx_check_gdrcopy_libdir="$with_gdrcopy/lib64 " + ]) + AS_IF([test ! -z "$with_gdrcopy_libdir" -a "x$with_gdrcopy_libdir" != "xyes"], + [ucx_check_gdrcopy_libdir="$with_nccl_libdir"]) + + AC_CHECK_HEADERS([gdrapi.h], + [AC_CHECK_DECLS([gdr_pin_buffer], + [gdrcopy_happy="yes"], + [AC_MSG_WARN([GDR_COPY runtime not detected. 
Disable.])
+                                         gdrcopy_happy="no"],
+                                        [#include <gdrapi.h>])
+                         AS_IF([test "x$gdrcopy_happy" == "xyes"],
+                               [AC_DEFINE([HAVE_GDR_COPY], 1, [Enable GDR_COPY support])
+                                AC_SUBST(GDR_COPY_CPPFLAGS, "-I$ucx_check_gdrcopy_dir/include/")
+                                AC_SUBST(GDR_COPY_CFLAGS, "-I$ucx_check_gdrcopy_dir/include/")
+                                AC_SUBST(GDR_COPY_LDFLAGS, "-lgdrapi -L$ucx_check_gdrcopy_dir/lib64")
+                                CFLAGS="$CFLAGS $GDR_COPY_CFLAGS"
+                                CPPFLAGS="$CPPFLAGS $GDR_COPY_CPPFLAGS"
+                                LDFLAGS="$LDFLAGS $GDR_COPY_LDFLAGS"],
+                               [])],
+                        [AC_MSG_WARN([GDR_COPY not found])
+                         AC_DEFINE([HAVE_GDR_COPY], [0], [Disable the use of GDR_COPY])])],
+      [AC_MSG_WARN([GDR_COPY was explicitly disabled])
+       AC_DEFINE([HAVE_GDR_COPY], [0], [Disable the use of GDR_COPY])]
+)
+
+AM_CONDITIONAL([HAVE_GDR_COPY], [test "x$gdrcopy_happy" != xno])
diff --git a/configure.ac b/configure.ac
index affa3cee6b6..f8f21b628eb 100644
--- a/configure.ac
+++ b/configure.ac
@@ -119,6 +119,7 @@ AS_IF([test "x$with_docs_only" == xyes],
         AM_CONDITIONAL([HAVE_IBV_EX_HW_TM], [false])
         AM_CONDITIONAL([HAVE_CRAY_UGNI], [false])
         AM_CONDITIONAL([HAVE_CUDA], [false])
+        AM_CONDITIONAL([HAVE_GDR_COPY], [false])
         AM_CONDITIONAL([HAVE_ROCM], [false])
         AM_CONDITIONAL([HAVE_XPMEM], [false])
         AM_CONDITIONAL([HAVE_CMA], [false])

From 35dd1fe537ea684af84a72a0ff0ae56f4999f445 Mon Sep 17 00:00:00 2001
From: Devendar Bureddy
Date: Tue, 26 Sep 2017 15:20:39 -0700
Subject: [PATCH 08/16] CUDA: UCT interface changes for supporting memory types

---
 src/uct/api/uct.h           | 23 +++++++++++++++++++++
 src/uct/base/uct_md.c       |  5 +++++
 src/uct/base/uct_md.h       |  1 +
 src/uct/ib/base/ib_md.c     |  2 ++
 src/uct/rocm/rocm_cma_md.c  |  4 +++-
 src/uct/sm/cma/cma_md.c     |  4 +++-
 src/uct/sm/knem/knem_md.c   |  4 +++-
 src/uct/sm/mm/mm_md.c       |  2 ++
 src/uct/sm/self/self_md.c   |  4 +++-
 src/uct/tcp/tcp_md.c        |  3 ++-
 src/uct/ugni/base/ugni_md.c |  4 +++-
 11 files changed, 50 insertions(+), 6 deletions(-)

diff --git a/src/uct/api/uct.h b/src/uct/api/uct.h
index 4c747a5497b..6e2831f85f8 100644
--- a/src/uct/api/uct.h
+++ b/src/uct/api/uct.h
@@ -385,6 +385,16 @@ enum {
                                                sockaddr */
 };
 
+/*
+ * @ingroup UCT_MD
+ * @brief  Memory types
+ */
+typedef enum {
+    UCT_MD_MEM_TYPE_DEFAULT = 0, /**< Default system memory */
+    UCT_MD_MEM_TYPE_CUDA,        /**< NVIDIA CUDA memory */
+    UCT_MD_MEM_TYPE_LAST    = UCT_MD_MEM_TYPE_CUDA
+} uct_memory_type_t;
+
 
 /**
  * @ingroup UCT_MD
@@ -633,6 +643,7 @@ struct uct_md_attr {
         size_t               max_alloc; /**< Maximal allocation size */
         size_t               max_reg;   /**< Maximal registration size */
         uint64_t             flags;     /**< UCT_MD_FLAG_xx */
+        uct_memory_type_t    mem_type;  /**< Supported memory type */
     } cap;
 
     uct_linear_growth_t      reg_cost;  /**< Memory registration cost estimation
@@ -1416,6 +1427,18 @@ ucs_status_t uct_md_mem_reg(uct_md_h md, void *address, size_t length,
 ucs_status_t uct_md_mem_dereg(uct_md_h md, uct_mem_h memh);
 
 
+/**
+ * @ingroup UCT_MD
+ * @brief Detect memory type.
+ *
+ * Detect memory type.
+ * Return UCS_OK if the address belongs to the MD's supported memory type.
+ *
+ * @param [in]  md       Memory domain to detect if memory belongs to.
+ * @param [in]  addr     Memory address to detect.
+ */
+ucs_status_t uct_md_mem_type_detect(uct_md_h md, void *addr);
+
+
 /**
  * @ingroup UCT_MD
  * @brief Allocate memory for zero-copy communications and remote access.
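
An illustrative usage sketch for the new query (based on the declaration
above; not code from this series):

    ucs_status_t status;

    status = uct_md_mem_type_detect(md, buffer);
    if (status == UCS_OK) {
        /* 'buffer' belongs to the memory type this MD supports, as
         * advertised in md_attr->cap.mem_type (e.g. UCT_MD_MEM_TYPE_CUDA) */
    } else {
        /* e.g. UCS_ERR_UNSUPPORTED - treat 'buffer' as default (host)
         * memory */
    }

MDs with no detection capability set the new mem_type_detect op to
ucs_empty_function_return_unsupported, as the hunks below show.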
diff --git a/src/uct/base/uct_md.c b/src/uct/base/uct_md.c index 7876eb602da..6566803c6f5 100644 --- a/src/uct/base/uct_md.c +++ b/src/uct/base/uct_md.c @@ -543,3 +543,8 @@ int uct_md_is_sockaddr_accessible(uct_md_h md, const ucs_sock_addr_t *sockaddr, { return md->ops->is_sockaddr_accessible(md, sockaddr, mode); } + +ucs_status_t uct_md_mem_type_detect(uct_md_h md, void *addr) +{ + return md->ops->mem_type_detect(md, addr); +} diff --git a/src/uct/base/uct_md.h b/src/uct/base/uct_md.h index 1cf33670017..a6fd717bf49 100644 --- a/src/uct/base/uct_md.h +++ b/src/uct/base/uct_md.h @@ -136,6 +136,7 @@ struct uct_md_ops { int (*is_sockaddr_accessible)(uct_md_h md, const ucs_sock_addr_t *sockaddr, uct_sockaddr_accessibility_t mode); + ucs_status_t (*mem_type_detect)(uct_md_h md, void *addr); }; diff --git a/src/uct/ib/base/ib_md.c b/src/uct/ib/base/ib_md.c index 7811bcece3d..1c3ca648bad 100644 --- a/src/uct/ib/base/ib_md.c +++ b/src/uct/ib/base/ib_md.c @@ -158,6 +158,7 @@ static ucs_status_t uct_ib_md_query(uct_md_h uct_md, uct_md_attr_t *md_attr) UCT_MD_FLAG_NEED_MEMH | UCT_MD_FLAG_NEED_RKEY | UCT_MD_FLAG_ADVISE; + md_attr->cap.mem_type = UCT_MD_MEM_TYPE_DEFAULT; md_attr->rkey_packed_size = sizeof(uint64_t); if (md->config.enable_contig_pages && @@ -869,6 +870,7 @@ static uct_md_ops_t uct_ib_md_ops = { .mem_dereg = uct_ib_mem_dereg, .mem_advise = uct_ib_mem_advise, .mkey_pack = uct_ib_mkey_pack, + .mem_type_detect = ucs_empty_function_return_unsupported, }; static inline uct_ib_rcache_region_t* uct_ib_rache_region_from_memh(uct_mem_h memh) diff --git a/src/uct/rocm/rocm_cma_md.c b/src/uct/rocm/rocm_cma_md.c index 3ba3d6ad10c..3c6ff1c9a34 100644 --- a/src/uct/rocm/rocm_cma_md.c +++ b/src/uct/rocm/rocm_cma_md.c @@ -30,6 +30,7 @@ static ucs_status_t uct_rocm_cma_md_query(uct_md_h md, uct_md_attr_t *md_attr) md_attr->rkey_packed_size = sizeof(uct_rocm_cma_key_t); md_attr->cap.flags = UCT_MD_FLAG_REG | UCT_MD_FLAG_NEED_RKEY; + md_attr->cap.mem_type = UCT_MD_MEM_TYPE_DEFAULT; md_attr->cap.max_alloc = 0; md_attr->cap.max_reg = ULONG_MAX; @@ -211,7 +212,8 @@ static ucs_status_t uct_rocm_cma_md_open(const char *md_name, .query = uct_rocm_cma_md_query, .mkey_pack = uct_rocm_cma_rkey_pack, .mem_reg = uct_rocm_cma_mem_reg, - .mem_dereg = uct_rocm_cma_mem_dereg + .mem_dereg = uct_rocm_cma_mem_dereg, + .mem_type_detect = ucs_empty_function_return_unsupported, }; ucs_trace("deal with any memory = %d\n", md_config->any_memory); diff --git a/src/uct/sm/cma/cma_md.c b/src/uct/sm/cma/cma_md.c index 71676b4d6bd..6c62bfeaa1d 100644 --- a/src/uct/sm/cma/cma_md.c +++ b/src/uct/sm/cma/cma_md.c @@ -59,7 +59,8 @@ static ucs_status_t uct_cma_md_open(const char *md_name, const uct_md_config_t * .mem_free = (void*)ucs_empty_function_return_success, .mkey_pack = (void*)ucs_empty_function_return_success, .mem_reg = uct_cma_mem_reg, - .mem_dereg = (void*)ucs_empty_function_return_success + .mem_dereg = (void*)ucs_empty_function_return_success, + .mem_type_detect = ucs_empty_function_return_unsupported, }; static uct_md_t md = { .ops = &md_ops, @@ -80,6 +81,7 @@ ucs_status_t uct_cma_md_query(uct_md_h md, uct_md_attr_t *md_attr) { md_attr->rkey_packed_size = 0; md_attr->cap.flags = UCT_MD_FLAG_REG; + md_attr->cap.mem_type = UCT_MD_MEM_TYPE_DEFAULT; md_attr->cap.max_alloc = 0; md_attr->cap.max_reg = ULONG_MAX; md_attr->reg_cost.overhead = 9e-9; diff --git a/src/uct/sm/knem/knem_md.c b/src/uct/sm/knem/knem_md.c index 4a1cb24ffe8..d3cf063d098 100644 --- a/src/uct/sm/knem/knem_md.c +++ b/src/uct/sm/knem/knem_md.c @@ -13,6 +13,7 @@ 
ucs_status_t uct_knem_md_query(uct_md_h md, uct_md_attr_t *md_attr) md_attr->rkey_packed_size = sizeof(uct_knem_key_t); md_attr->cap.flags = UCT_MD_FLAG_REG | UCT_MD_FLAG_NEED_RKEY; + md_attr->cap.mem_type = UCT_MD_MEM_TYPE_DEFAULT; md_attr->cap.max_alloc = 0; md_attr->cap.max_reg = ULONG_MAX; md_attr->reg_cost.overhead = 1200.0e-9; @@ -186,7 +187,8 @@ static ucs_status_t uct_knem_md_open(const char *md_name, const uct_md_config_t .mem_free = (void*)ucs_empty_function_return_success, .mkey_pack = uct_knem_rkey_pack, .mem_reg = uct_knem_mem_reg, - .mem_dereg = uct_knem_mem_dereg + .mem_dereg = uct_knem_mem_dereg, + .mem_type_detect = ucs_empty_function_return_unsupported, }; knem_md = ucs_malloc(sizeof(uct_knem_md_t), "uct_knem_md_t"); diff --git a/src/uct/sm/mm/mm_md.c b/src/uct/sm/mm/mm_md.c index c849dfd6412..c4aef94e493 100644 --- a/src/uct/sm/mm/mm_md.c +++ b/src/uct/sm/mm/mm_md.c @@ -124,6 +124,7 @@ ucs_status_t uct_mm_md_query(uct_md_h md, uct_md_attr_t *md_attr) md_attr->reg_cost.growth = 0.007e-9; } md_attr->cap.flags |= UCT_MD_FLAG_NEED_RKEY; + md_attr->cap.mem_type = UCT_MD_MEM_TYPE_DEFAULT; /* all mm md(s) support fixed memory alloc */ md_attr->cap.flags |= UCT_MD_FLAG_FIXED; md_attr->cap.max_alloc = ULONG_MAX; @@ -228,6 +229,7 @@ uct_md_ops_t uct_mm_md_ops = { .mem_reg = uct_mm_mem_reg, .mem_dereg = uct_mm_mem_dereg, .mkey_pack = uct_mm_mkey_pack, + .mem_type_detect = ucs_empty_function_return_unsupported, }; ucs_status_t uct_mm_md_open(const char *md_name, const uct_md_config_t *md_config, diff --git a/src/uct/sm/self/self_md.c b/src/uct/sm/self/self_md.c index ede57407be5..ad1d91d7767 100644 --- a/src/uct/sm/self/self_md.c +++ b/src/uct/sm/self/self_md.c @@ -10,6 +10,7 @@ static ucs_status_t uct_self_md_query(uct_md_h md, uct_md_attr_t *attr) { /* Dummy memory registration provided. 
No real memory handling exists */
     attr->cap.flags         = UCT_MD_FLAG_REG;
+    attr->cap.mem_type      = UCT_MD_MEM_TYPE_DEFAULT;
     attr->cap.max_alloc     = 0;
     attr->cap.max_reg       = ULONG_MAX;
     attr->rkey_packed_size  = 0; /* uct_md_query adds UCT_MD_COMPONENT_NAME_MAX to this */
@@ -41,7 +42,8 @@ static ucs_status_t uct_self_md_open(const char *md_name, const uct_md_config_t
         .query           = uct_self_md_query,
         .mkey_pack       = ucs_empty_function_return_success,
         .mem_reg         = uct_self_mem_reg,
-        .mem_dereg       = ucs_empty_function_return_success
+        .mem_dereg       = ucs_empty_function_return_success,
+        .mem_type_detect = ucs_empty_function_return_unsupported,
     };
     static uct_md_t md = {
         .ops          = &md_ops,
diff --git a/src/uct/tcp/tcp_md.c b/src/uct/tcp/tcp_md.c
index 008ea5f1d60..94b8fd4441a 100644
--- a/src/uct/tcp/tcp_md.c
+++ b/src/uct/tcp/tcp_md.c
@@ -32,7 +32,8 @@ static ucs_status_t uct_tcp_md_open(const char *md_name, const uct_md_config_t *
         .query           = uct_tcp_md_query,
         .mkey_pack       = ucs_empty_function_return_unsupported,
         .mem_reg         = ucs_empty_function_return_unsupported,
-        .mem_dereg       = ucs_empty_function_return_unsupported
+        .mem_dereg       = ucs_empty_function_return_unsupported,
+        .mem_type_detect = ucs_empty_function_return_unsupported,
     };
     static uct_md_t md = {
         .ops          = &md_ops,
diff --git a/src/uct/ugni/base/ugni_md.c b/src/uct/ugni/base/ugni_md.c
index a0b671544f7..b2a4443b369 100644
--- a/src/uct/ugni/base/ugni_md.c
+++ b/src/uct/ugni/base/ugni_md.c
@@ -34,6 +34,7 @@ static ucs_status_t uct_ugni_md_query(uct_md_h md, uct_md_attr_t *md_attr)
     md_attr->cap.flags      = UCT_MD_FLAG_REG       |
                               UCT_MD_FLAG_NEED_MEMH |
                               UCT_MD_FLAG_NEED_RKEY;
+    md_attr->cap.mem_type   = UCT_MD_MEM_TYPE_DEFAULT;
     md_attr->cap.max_alloc  = 0;
     md_attr->cap.max_reg    = ULONG_MAX;
     md_attr->reg_cost.overhead = 1000.0e-9;
@@ -180,7 +181,8 @@ static ucs_status_t uct_ugni_md_open(const char *md_name, const uct_md_config_t
         .mem_free        = (void*)ucs_empty_function,
         .mem_reg         = uct_ugni_mem_reg,
         .mem_dereg       = uct_ugni_mem_dereg,
-        .mkey_pack       = uct_ugni_rkey_pack
+        .mkey_pack       = uct_ugni_rkey_pack,
+        .mem_type_detect = ucs_empty_function_return_unsupported,
     };
     static uct_ugni_md_t md = {

From afe690b2242d353e602989e7219fbc8dc16e0ff0 Mon Sep 17 00:00:00 2001
From: Devendar Bureddy
Date: Wed, 11 Oct 2017 00:59:37 +0300
Subject: [PATCH 09/16] CUDA: UCP memory type detection

---
 src/ucp/api/ucp_def.h      |  7 +++++++
 src/ucp/core/ucp_mm.c      | 36 ++++++++++++++++++++++++++++++++++++
 src/ucp/core/ucp_mm.h      | 21 +++++++++++++++++++++
 src/ucp/core/ucp_request.h |  1 +
 4 files changed, 65 insertions(+)

diff --git a/src/ucp/api/ucp_def.h b/src/ucp/api/ucp_def.h
index d094541e5e4..127d340ac5a 100644
--- a/src/ucp/api/ucp_def.h
+++ b/src/ucp/api/ucp_def.h
@@ -141,6 +141,13 @@ typedef struct ucp_rkey *ucp_rkey_h;
  */
 typedef struct ucp_mem *ucp_mem_h;
 
+/*
+ * @ingroup UCP_MEM
+ * @brief UCP memory type
+ *
+ * Memory type handle is an opaque object representing a memory address type
+ */
+typedef struct ucp_mem_type *ucp_mem_type_h;
 
 /**
  * @ingroup UCP_WORKER
diff --git a/src/ucp/core/ucp_mm.c b/src/ucp/core/ucp_mm.c
index 9c89adec845..b8f256d67f7 100644
--- a/src/ucp/core/ucp_mm.c
+++ b/src/ucp/core/ucp_mm.c
@@ -24,6 +24,12 @@ static ucp_mem_t ucp_mem_dummy_handle = {
     .md_map  = 0
 };
 
+ucp_mem_type_t ucp_mem_type_dummy_handle = {
+    .md_map = 0,
+    .id     = UCT_MD_MEM_TYPE_DEFAULT
+};
+
+
 /**
  * Unregister memory from all memory domains.
  * Save in *alloc_md_memh_p the memory handle of the allocating MD, if such exists.
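
The next hunk adds ucp_addr_domain_detect_mds(): the first MD whose
uct_md_mem_type_detect() claims the address fixes the memory type id, and
every other MD advertising the same memory type is OR-ed into the md_map
bitmap. A consumer sketch (illustrative only, using the helpers defined
later in ucp_mm.h):

    ucp_mem_type_t mem_type;

    ucp_addr_domain_detect_mds(context, buffer, &mem_type);
    if (!UCP_IS_DEFAULT_MEMORY_TYPE(mem_type.id)) {
        /* 'buffer' is, e.g., CUDA memory; mem_type.md_map marks the MDs
         * able to handle this address domain */
    }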
@@ -106,6 +112,36 @@ static ucs_status_t ucp_memh_reg_mds(ucp_context_h context, ucp_mem_h memh,
     return UCS_OK;
 }
 
+ucs_status_t ucp_addr_domain_detect_mds(ucp_context_h context, void *addr,
+                                        ucp_mem_type_h mem_type_h)
+{
+    ucs_status_t status;
+    unsigned md_index;
+    uct_memory_type_t mem_type = UCT_MD_MEM_TYPE_DEFAULT;
+
+    mem_type_h->md_map = 0;
+    mem_type_h->id     = UCT_MD_MEM_TYPE_DEFAULT;
+
+    /* TODO: return if no MDs with address domain detect */
+
+    for (md_index = 0; md_index < context->num_mds; ++md_index) {
+        if (context->tl_mds[md_index].attr.cap.mem_type != UCT_MD_MEM_TYPE_DEFAULT) {
+            if (mem_type == UCT_MD_MEM_TYPE_DEFAULT) {
+                status = uct_md_mem_type_detect(context->tl_mds[md_index].md, addr);
+                if (status == UCS_OK) {
+                    mem_type = context->tl_mds[md_index].attr.cap.mem_type;
+
+                    mem_type_h->id     = mem_type;
+                    mem_type_h->md_map = UCS_BIT(md_index);
+                }
+            } else {
+                if (mem_type == context->tl_mds[md_index].attr.cap.mem_type) {
+                    mem_type_h->md_map |= UCS_BIT(md_index);
+                }
+            }
+        }
+    }
+
+    return UCS_OK;
+}
 
 /**
  * @return Whether MD number 'md_index' is selected by the configuration as part
  *         of allocation method number 'config_method_index'.
diff --git a/src/ucp/core/ucp_mm.h b/src/ucp/core/ucp_mm.h
index 3ddfbbfc842..82f035f3efa 100644
--- a/src/ucp/core/ucp_mm.h
+++ b/src/ucp/core/ucp_mm.h
@@ -64,6 +64,15 @@ typedef struct ucp_mem_desc {
 } ucp_mem_desc_t;
 
 
+/**
+ * Memory type descriptor.
+ * Contains memory type information.
+ */
+typedef struct ucp_mem_type {
+    ucp_md_map_t      md_map; /* Which MDs own this address domain */
+    uct_memory_type_t id;     /* Memory type */
+} ucp_mem_type_t;
+
 void ucp_rkey_resolve_inner(ucp_rkey_h rkey, ucp_ep_h ep);
 
 ucs_status_t ucp_mpool_malloc(ucs_mpool_t *mp, size_t *size_p, void **chunk_p);
@@ -72,6 +81,18 @@ void ucp_mpool_free(ucs_mpool_t *mp, void *chunk);
 
 void ucp_mpool_obj_init(ucs_mpool_t *mp, void *obj, void *chunk);
 
+#define UCP_IS_DEFAULT_MEMORY_TYPE(_id) ((_id) == UCT_MD_MEM_TYPE_DEFAULT)
+
+/**
+ * Detects the address domain on all MDs. Skips detection on subsequent MDs
+ * once it has been successfully detected by one MD.
+**/ +ucs_status_t ucp_addr_domain_detect_mds(ucp_context_h context, void *addr, + ucp_mem_type_h mem_type_h); + + +extern ucp_mem_type_t ucp_mem_type_dummy_handle; + static UCS_F_ALWAYS_INLINE uct_mem_h ucp_memh2uct(ucp_mem_h memh, ucp_md_index_t md_idx) { diff --git a/src/ucp/core/ucp_request.h b/src/ucp/core/ucp_request.h index 52c92dbddc2..7fd5357922e 100644 --- a/src/ucp/core/ucp_request.h +++ b/src/ucp/core/ucp_request.h @@ -78,6 +78,7 @@ typedef void (*ucp_request_callback_t)(ucp_request_t *req); struct ucp_request { ucs_status_t status; /* Operation status */ uint16_t flags; /* Request flags */ + ucp_mem_type_t mem_type; /* Memory type handle */ union { struct { From b2eff43ae445e45aef1adfa9d0851824b9981ec9 Mon Sep 17 00:00:00 2001 From: Devendar Bureddy Date: Tue, 26 Sep 2017 15:23:04 -0700 Subject: [PATCH 10/16] CUDA: cudaFree() hook --- src/ucm/Makefile.am | 7 +++ src/ucm/cuda/cudamem.h | 20 ++++++++ src/ucm/cuda/install.c | 62 +++++++++++++++++++++++++ src/ucm/cuda/replace.c | 98 +++++++++++++++++++++++++++++++++++++++ src/ucm/event/event.c | 32 +++++++++++++ src/ucm/util/ucm_config.c | 14 +++++- src/ucm/util/ucm_config.h | 3 ++ 7 files changed, 235 insertions(+), 1 deletion(-) create mode 100644 src/ucm/cuda/cudamem.h create mode 100644 src/ucm/cuda/install.c create mode 100644 src/ucm/cuda/replace.c diff --git a/src/ucm/Makefile.am b/src/ucm/Makefile.am index 2215ddd2111..7db3ea7ccef 100644 --- a/src/ucm/Makefile.am +++ b/src/ucm/Makefile.am @@ -47,6 +47,13 @@ libucm_la_SOURCES = \ util/reloc.c \ util/sys.c +if HAVE_CUDA +libucm_la_SOURCES += \ + cuda/install.c \ + cuda/replace.c + +endif + if HAVE_UCM_PTMALLOC283 libucm_la_CPPFLAGS += \ -I$(srcdir)/ptmalloc283/sysdeps/pthread \ diff --git a/src/ucm/cuda/cudamem.h b/src/ucm/cuda/cudamem.h new file mode 100644 index 00000000000..76ae80eabda --- /dev/null +++ b/src/ucm/cuda/cudamem.h @@ -0,0 +1,20 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifndef UCM_CUDAMEM_H_ +#define UCM_CUDAMEM_H_ + +#include +#include +#include + +ucs_status_t ucm_cudamem_install(); + +cudaError_t ucm_override_cudaFree(void *addr); +cudaError_t ucm_orig_cudaFree(void *address); +cudaError_t ucm_cudaFree(void *address); + +#endif diff --git a/src/ucm/cuda/install.c b/src/ucm/cuda/install.c new file mode 100644 index 00000000000..ca34e5023bb --- /dev/null +++ b/src/ucm/cuda/install.c @@ -0,0 +1,62 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. 
+ */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "cudamem.h" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + + + +static ucm_reloc_patch_t ucm_cudamem_symbol_patches[] = { + {"cudaFree", ucm_override_cudaFree}, + {NULL, NULL} +}; + +ucs_status_t ucm_cudamem_install() +{ + static int ucm_cudamem_installed = 0; + static pthread_mutex_t install_mutex = PTHREAD_MUTEX_INITIALIZER; + ucm_reloc_patch_t *patch; + ucs_status_t status = UCS_OK; + + if (!ucm_global_config.enable_cuda_hooks) { + ucm_debug("installing cudamem relocations is disabled by configuration"); + return UCS_ERR_UNSUPPORTED; + } + if (ucm_cudamem_installed) { + return UCS_OK; + } + + pthread_mutex_lock(&install_mutex); + + for (patch = ucm_cudamem_symbol_patches; patch->symbol != NULL; ++patch) { + status = ucm_reloc_modify(patch); + if (status != UCS_OK) { + ucm_warn("failed to install relocation table entry for '%s'", + patch->symbol); + goto out_unlock; + } + } + ucm_cudamem_installed = 1; + +out_unlock: + pthread_mutex_unlock(&install_mutex); + return status; +} diff --git a/src/ucm/cuda/replace.c b/src/ucm/cuda/replace.c new file mode 100644 index 00000000000..c41b204b670 --- /dev/null +++ b/src/ucm/cuda/replace.c @@ -0,0 +1,98 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "cudamem.h" + +#include +#include +#include +#include +#include +#include +#include + +static pthread_mutex_t ucm_cudamem_get_orig_lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; +static pthread_t volatile ucm_cudamem_get_orig_thread = -1; + + +/** + * Define a replacement function to a memory-mapping function call, which calls + * the event handler, and if event handler returns error code - calls the original + * function. + */ +#define UCM_DEFINE_CUDA_FUNC(_name, _rettype, _fail_val, ...) \ + \ + _rettype ucm_override_##_name(UCM_FUNC_DEFINE_ARGS(__VA_ARGS__)); \ + \ + /* Call the original function using dlsym(RTLD_NEXT) */ \ + _rettype ucm_orig_##_name(UCM_FUNC_DEFINE_ARGS(__VA_ARGS__)) \ + { \ + typedef _rettype (*func_ptr_t) (__VA_ARGS__); \ + static func_ptr_t orig_func_ptr = NULL; \ + \ + ucm_trace("%s()", __FUNCTION__); \ + \ + if (ucs_unlikely(orig_func_ptr == NULL)) { \ + pthread_mutex_lock(&ucm_cudamem_get_orig_lock); \ + ucm_cudamem_get_orig_thread = pthread_self(); \ + orig_func_ptr = ucm_reloc_get_orig(UCS_PP_QUOTE(_name), \ + ucm_override_##_name); \ + ucm_cudamem_get_orig_thread = -1; \ + pthread_mutex_unlock(&ucm_cudamem_get_orig_lock); \ + } \ + return orig_func_ptr(UCM_FUNC_PASS_ARGS(__VA_ARGS__)); \ + } \ + \ + /* Define a symbol which goes to the replacement - in case we are loaded first */ \ + _rettype ucm_override_##_name(UCM_FUNC_DEFINE_ARGS(__VA_ARGS__)) \ + { \ + ucm_trace("%s()", __FUNCTION__); \ + \ + if (ucs_unlikely(ucm_cudamem_get_orig_thread == pthread_self())) { \ + return _fail_val; \ + } \ + return ucm_##_name(UCM_FUNC_PASS_ARGS(__VA_ARGS__)); \ + } + +#define UCM_OVERRIDE_CUDA_FUNC(_name) \ + cudaError_t _name() __attribute__ ((alias ("ucm_override_" UCS_PP_QUOTE(_name)))); \ + + +/* + * Define argument list with given types. + */ +#define UCM_FUNC_DEFINE_ARGS(...) \ + UCS_PP_FOREACH_SEP(_UCM_FUNC_ARG_DEFINE, _, \ + UCS_PP_ZIP((UCS_PP_SEQ(UCS_PP_NUM_ARGS(__VA_ARGS__))), \ + (__VA_ARGS__))) + +/* + * Pass auto-generated arguments to a function call. + */ +#define UCM_FUNC_PASS_ARGS(...) 
\ + UCS_PP_FOREACH_SEP(_UCM_FUNC_ARG_PASS, _, UCS_PP_SEQ(UCS_PP_NUM_ARGS(__VA_ARGS__))) + + +/* + * Helpers + */ +#define _UCM_FUNC_ARG_DEFINE(_, _bundle) \ + __UCM_FUNC_ARG_DEFINE(_, UCS_PP_TUPLE_0 _bundle, UCS_PP_TUPLE_1 _bundle) +#define __UCM_FUNC_ARG_DEFINE(_, _index, _type) \ + _type UCS_PP_TOKENPASTE(arg, _index) +#define _UCM_FUNC_ARG_PASS(_, _index) \ + UCS_PP_TOKENPASTE(arg, _index) + + +UCM_DEFINE_CUDA_FUNC(cudaFree, cudaError_t, -1, void*) + +#if ENABLE_SYMBOL_OVERRIDE +UCM_OVERRIDE_CUDA_FUNC(cudaFree) +#endif diff --git a/src/ucm/event/event.c b/src/ucm/event/event.c index f572dae6bbe..65fe0059667 100644 --- a/src/ucm/event/event.c +++ b/src/ucm/event/event.c @@ -13,6 +13,9 @@ #include #include #include +#if HAVE_CUDA +#include +#endif #include #include #include @@ -334,6 +337,25 @@ void *ucm_sbrk(intptr_t increment) return event.sbrk.result; } +#if HAVE_CUDA +cudaError_t ucm_cudaFree(void *addr) +{ + cudaError_t ret; + + ucm_event_enter(); + + ucm_trace("ucm_cudaFree(addr=%p )", addr); + + ucm_dispatch_vm_munmap(addr, 0); + + ret = ucm_orig_cudaFree(addr); + + ucm_event_leave(); + + return ret; +} +#endif + void ucm_event_handler_add(ucm_event_handler_t *handler) { ucm_event_handler_t *elem; @@ -390,6 +412,16 @@ static ucs_status_t ucm_event_install(int events) } ucm_debug("malloc hooks are ready"); + +#if HAVE_CUDA + status = ucm_cudamem_install(); + if (status != UCS_OK) { + ucm_debug("failed to install cudamem events"); + goto out_unlock; + } + ucm_debug("cudaFree hooks are ready"); +#endif + status = UCS_OK; out_unlock: diff --git a/src/ucm/util/ucm_config.c b/src/ucm/util/ucm_config.c index 2d9d2b43301..fd2081ff37d 100644 --- a/src/ucm/util/ucm_config.c +++ b/src/ucm/util/ucm_config.c @@ -20,6 +20,7 @@ #define UCM_EN_MMAP_RELOC_VAR "MMAP_RELOC" #define UCM_EN_MALLOC_HOOKS_VAR "MALLOC_HOOKS" #define UCM_EN_MALLOC_RELOC_VAR "MALLOC_RELOC" +#define UCM_EN_CUDA_HOOKS_VAR "CUDA_HOOKS" ucm_config_t ucm_global_config = { @@ -28,7 +29,10 @@ ucm_config_t ucm_global_config = { .enable_events = 1, .enable_mmap_reloc = 1, .enable_malloc_hooks = 1, - .enable_malloc_reloc = 0 + .enable_malloc_reloc = 0, +#if HAVE_CUDA + .enable_cuda_hooks = 1 +#endif }; static const char *ucm_config_bool_to_string(int value) @@ -107,6 +111,10 @@ void ucm_config_print(FILE *stream, ucs_config_print_flags_t print_flags) print_flags); fprintf(stream, "%s%s=%s\n", UCM_ENV_PREFIX, UCM_EN_MALLOC_RELOC_VAR, ucm_config_bool_to_string(ucm_global_config.enable_malloc_reloc)); +#if HAVE_CUDA + fprintf(stream, "%s%s=%s\n", UCM_ENV_PREFIX, UCM_EN_CUDA_HOOKS_VAR, + ucm_config_bool_to_string(ucm_global_config.enable_cuda_hooks)); +#endif } static void ucm_config_set_value_table(const char *str_value, const char **table, @@ -157,6 +165,10 @@ ucs_status_t ucm_config_modify(const char *name, const char *value) ucm_config_set_value_bool(value, &ucm_global_config.enable_malloc_hooks); } else if (!strcmp(name, UCM_EN_MALLOC_RELOC_VAR)) { ucm_config_set_value_bool(value, &ucm_global_config.enable_malloc_reloc); +#if HAVE_CUDA + } else if (!strcmp(name, UCM_EN_CUDA_HOOKS_VAR)) { + ucm_config_set_value_bool(value, &ucm_global_config.enable_cuda_hooks); +#endif } else { return UCS_ERR_INVALID_PARAM; } diff --git a/src/ucm/util/ucm_config.h b/src/ucm/util/ucm_config.h index 317608a57c6..bc42a600b12 100644 --- a/src/ucm/util/ucm_config.h +++ b/src/ucm/util/ucm_config.h @@ -18,6 +18,9 @@ typedef struct ucm_config { int enable_mmap_reloc; int enable_malloc_hooks; int enable_malloc_reloc; +#if HAVE_CUDA + int enable_cuda_hooks; 
+#endif size_t alloc_alignment; } ucm_config_t; From c289fece20fd5ed3a59d4f55c94dd8f7528a5ecc Mon Sep 17 00:00:00 2001 From: Devendar Bureddy Date: Tue, 26 Sep 2017 15:12:39 -0700 Subject: [PATCH 11/16] remove CUDA UCT --- src/uct/cuda/cuda_ep.c | 47 ----------- src/uct/cuda/cuda_ep.h | 32 -------- src/uct/cuda/cuda_iface.c | 164 -------------------------------------- src/uct/cuda/cuda_iface.h | 27 ------- src/uct/cuda/cuda_md.c | 102 ------------------------ src/uct/cuda/cuda_md.h | 16 ---- 6 files changed, 388 deletions(-) delete mode 100644 src/uct/cuda/cuda_ep.c delete mode 100644 src/uct/cuda/cuda_ep.h delete mode 100644 src/uct/cuda/cuda_iface.c delete mode 100644 src/uct/cuda/cuda_iface.h delete mode 100644 src/uct/cuda/cuda_md.c delete mode 100644 src/uct/cuda/cuda_md.h diff --git a/src/uct/cuda/cuda_ep.c b/src/uct/cuda/cuda_ep.c deleted file mode 100644 index 7150a26ca71..00000000000 --- a/src/uct/cuda/cuda_ep.c +++ /dev/null @@ -1,47 +0,0 @@ -/** - * Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED. - * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. - * See file LICENSE for terms. - */ - -#include "cuda_ep.h" -#include "cuda_iface.h" - -#include -#include -#include - - -static UCS_CLASS_INIT_FUNC(uct_cuda_ep_t, uct_iface_t *tl_iface, - const uct_device_addr_t *dev_addr, - const uct_iface_addr_t *iface_addr) -{ - uct_cuda_iface_t *iface = ucs_derived_of(tl_iface, uct_cuda_iface_t); - UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super) - return UCS_OK; -} - -static UCS_CLASS_CLEANUP_FUNC(uct_cuda_ep_t) -{ -} - -UCS_CLASS_DEFINE(uct_cuda_ep_t, uct_base_ep_t) -UCS_CLASS_DEFINE_NEW_FUNC(uct_cuda_ep_t, uct_ep_t, uct_iface_t*, - const uct_device_addr_t *, const uct_iface_addr_t *); -UCS_CLASS_DEFINE_DELETE_FUNC(uct_cuda_ep_t, uct_ep_t); - - -ucs_status_t uct_cuda_ep_put_short(uct_ep_h tl_ep, const void *buffer, - unsigned length, uint64_t remote_addr, - uct_rkey_t rkey) -{ - /* Code for PUT here */ - return UCS_ERR_UNSUPPORTED; -} - -ucs_status_t uct_cuda_ep_am_short(uct_ep_h ep, uint8_t id, uint64_t header, - const void *payload, unsigned length) -{ - return UCS_ERR_UNSUPPORTED; -} - diff --git a/src/uct/cuda/cuda_ep.h b/src/uct/cuda/cuda_ep.h deleted file mode 100644 index 686d7a65455..00000000000 --- a/src/uct/cuda/cuda_ep.h +++ /dev/null @@ -1,32 +0,0 @@ -/** -* Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED. -* Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. -* See file LICENSE for terms. -*/ - -#ifndef UCT_SYSV_EP_H -#define UCT_SYSV_EP_H - -#include -#include -#include - - -typedef struct uct_cuda_ep_addr { - int ep_id; -} uct_cuda_ep_addr_t; - -typedef struct uct_cuda_ep { - uct_base_ep_t super; - struct uct_cuda_ep *next; -} uct_cuda_ep_t; - -UCS_CLASS_DECLARE_NEW_FUNC(uct_cuda_ep_t, uct_ep_t, uct_iface_t*, - const uct_device_addr_t *, const uct_iface_addr_t *); -UCS_CLASS_DECLARE_DELETE_FUNC(uct_cuda_ep_t, uct_ep_t); - -ucs_status_t uct_cuda_ep_put_short(uct_ep_h tl_ep, const void *buffer, unsigned length, - uint64_t remote_addr, uct_rkey_t rkey); -ucs_status_t uct_cuda_ep_am_short(uct_ep_h ep, uint8_t id, uint64_t header, - const void *payload, unsigned length); -#endif diff --git a/src/uct/cuda/cuda_iface.c b/src/uct/cuda/cuda_iface.c deleted file mode 100644 index ee6441709e8..00000000000 --- a/src/uct/cuda/cuda_iface.c +++ /dev/null @@ -1,164 +0,0 @@ -/** - * Copyright (c) UT-Battelle, LLC. 2014-2015. ALL RIGHTS RESERVED. - * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. 
- * See file LICENSE for terms. - */ - -#include "cuda_iface.h" -#include "cuda_md.h" -#include "cuda_ep.h" - -#include -#include - - -static ucs_config_field_t uct_cuda_iface_config_table[] = { - - {"", "", NULL, - ucs_offsetof(uct_cuda_iface_config_t, super), - UCS_CONFIG_TYPE_TABLE(uct_iface_config_table)}, - - {NULL} -}; - - -/* Forward declaration for the delete function */ -static void UCS_CLASS_DELETE_FUNC_NAME(uct_cuda_iface_t)(uct_iface_t*); - - -static ucs_status_t uct_cuda_iface_get_address(uct_iface_h tl_iface, - uct_iface_addr_t *iface_addr) -{ - int *cuda_addr = (int*)iface_addr; - *cuda_addr = 0; - return UCS_OK; -} - -static int uct_cuda_iface_is_reachable(const uct_iface_h iface, const uct_device_addr_t *dev_addr, - const uct_iface_addr_t *iface_addr) -{ - return 0; -} - -static ucs_status_t uct_cuda_iface_query(uct_iface_h iface, - uct_iface_attr_t *iface_attr) -{ - memset(iface_attr, 0, sizeof(uct_iface_attr_t)); - - /* FIXME all of these values */ - iface_attr->iface_addr_len = sizeof(int); - iface_attr->device_addr_len = 0; - iface_attr->ep_addr_len = 0; - iface_attr->max_conn_priv = 0; - iface_attr->cap.flags = 0; - - iface_attr->cap.put.max_short = 0; - iface_attr->cap.put.max_bcopy = 0; - iface_attr->cap.put.min_zcopy = 0; - iface_attr->cap.put.max_zcopy = 0; - iface_attr->cap.put.opt_zcopy_align = 1; - iface_attr->cap.put.align_mtu = iface_attr->cap.put.opt_zcopy_align; - iface_attr->cap.put.max_iov = 1; - - iface_attr->cap.get.max_bcopy = 0; - iface_attr->cap.get.min_zcopy = 0; - iface_attr->cap.get.max_zcopy = 0; - iface_attr->cap.get.opt_zcopy_align = 1; - iface_attr->cap.get.align_mtu = iface_attr->cap.get.opt_zcopy_align; - iface_attr->cap.get.max_iov = 1; - - iface_attr->cap.am.max_short = 0; - iface_attr->cap.am.max_bcopy = 0; - iface_attr->cap.am.min_zcopy = 0; - iface_attr->cap.am.max_zcopy = 0; - iface_attr->cap.am.opt_zcopy_align = 1; - iface_attr->cap.am.align_mtu = iface_attr->cap.am.opt_zcopy_align; - iface_attr->cap.am.max_hdr = 0; - iface_attr->cap.am.max_iov = 1; - - iface_attr->latency.overhead = 1e-9; - iface_attr->latency.growth = 0; - iface_attr->bandwidth = 6911 * 1024.0 * 1024.0; - iface_attr->overhead = 0; - iface_attr->priority = 0; - - return UCS_OK; -} - -static uct_iface_ops_t uct_cuda_iface_ops = { - .ep_put_short = uct_cuda_ep_put_short, - .ep_am_short = uct_cuda_ep_am_short, - .ep_flush = uct_base_ep_flush, - .ep_fence = uct_base_ep_fence, - .ep_create_connected = UCS_CLASS_NEW_FUNC_NAME(uct_cuda_ep_t), - .ep_destroy = UCS_CLASS_DELETE_FUNC_NAME(uct_cuda_ep_t), - .iface_flush = uct_base_iface_flush, - .iface_fence = uct_base_iface_fence, - .iface_progress_enable = ucs_empty_function, - .iface_progress_disable = ucs_empty_function, - .iface_progress = ucs_empty_function_return_zero, - .iface_close = UCS_CLASS_DELETE_FUNC_NAME(uct_cuda_iface_t), - .iface_query = uct_cuda_iface_query, - .iface_get_device_address = (void*)ucs_empty_function_return_success, - .iface_get_address = uct_cuda_iface_get_address, - .iface_is_reachable = uct_cuda_iface_is_reachable, -}; - -static UCS_CLASS_INIT_FUNC(uct_cuda_iface_t, uct_md_h md, uct_worker_h worker, - const uct_iface_params_t *params, - const uct_iface_config_t *tl_config) -{ - UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &uct_cuda_iface_ops, md, worker, - params, tl_config UCS_STATS_ARG(params->stats_root) - UCS_STATS_ARG(UCT_CUDA_TL_NAME)); - - if (strcmp(params->mode.device.dev_name, UCT_CUDA_DEV_NAME) != 0) { - ucs_error("No device was found: %s", params->mode.device.dev_name); - return 
UCS_ERR_NO_DEVICE; - } - - return UCS_OK; -} - -static UCS_CLASS_CLEANUP_FUNC(uct_cuda_iface_t) -{ - /* tasks to tear down the domain */ -} - -UCS_CLASS_DEFINE(uct_cuda_iface_t, uct_base_iface_t); -UCS_CLASS_DEFINE_NEW_FUNC(uct_cuda_iface_t, uct_iface_t, uct_md_h, uct_worker_h, - const uct_iface_params_t*, const uct_iface_config_t*); -static UCS_CLASS_DEFINE_DELETE_FUNC(uct_cuda_iface_t, uct_iface_t); - - -static ucs_status_t uct_cuda_query_tl_resources(uct_md_h md, - uct_tl_resource_desc_t **resource_p, - unsigned *num_resources_p) -{ - uct_tl_resource_desc_t *resource; - - resource = ucs_calloc(1, sizeof(uct_tl_resource_desc_t), "resource desc"); - if (NULL == resource) { - ucs_error("Failed to allocate memory"); - return UCS_ERR_NO_MEMORY; - } - - ucs_snprintf_zero(resource->tl_name, sizeof(resource->tl_name), "%s", - UCT_CUDA_TL_NAME); - ucs_snprintf_zero(resource->dev_name, sizeof(resource->dev_name), "%s", - UCT_CUDA_DEV_NAME); - resource->dev_type = UCT_DEVICE_TYPE_ACC; - - *num_resources_p = 1; - *resource_p = resource; - return UCS_OK; -} - -UCT_TL_COMPONENT_DEFINE(uct_cuda_tl, - uct_cuda_query_tl_resources, - uct_cuda_iface_t, - UCT_CUDA_TL_NAME, - "CUDA_", - uct_cuda_iface_config_table, - uct_cuda_iface_config_t); -UCT_MD_REGISTER_TL(&uct_cuda_md, &uct_cuda_tl); diff --git a/src/uct/cuda/cuda_iface.h b/src/uct/cuda/cuda_iface.h deleted file mode 100644 index 4c6bc352352..00000000000 --- a/src/uct/cuda/cuda_iface.h +++ /dev/null @@ -1,27 +0,0 @@ -/** - * Copyright (c) UT-Battelle, LLC. 2014-2015. ALL RIGHTS RESERVED. - * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. - * See file LICENSE for terms. - */ - -#ifndef UCT_CUDA_IFACE_H -#define UCT_CUDA_IFACE_H - -#include - - -#define UCT_CUDA_TL_NAME "cuda" -#define UCT_CUDA_DEV_NAME "gpu0" - - -typedef struct uct_cuda_iface { - uct_base_iface_t super; -} uct_cuda_iface_t; - - -typedef struct uct_cuda_iface_config { - uct_iface_config_t super; -} uct_cuda_iface_config_t; - - -#endif diff --git a/src/uct/cuda/cuda_md.c b/src/uct/cuda/cuda_md.c deleted file mode 100644 index 19596500f93..00000000000 --- a/src/uct/cuda/cuda_md.c +++ /dev/null @@ -1,102 +0,0 @@ -/** - * Copyright (c) UT-Battelle, LLC. 2014-2015. ALL RIGHTS RESERVED. - * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. - * See file LICENSE for terms. 
- */ - -#include "cuda_md.h" - -#include -#include -#include -#include -#include -#include - - -static ucs_status_t uct_cuda_md_query(uct_md_h md, uct_md_attr_t *md_attr) -{ - md_attr->cap.flags = UCT_MD_FLAG_REG; - md_attr->cap.max_alloc = 0; - md_attr->cap.max_reg = ULONG_MAX; - md_attr->rkey_packed_size = 0; - md_attr->reg_cost.overhead = 0; - md_attr->reg_cost.growth = 0; - memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus)); - return UCS_OK; -} - -static ucs_status_t uct_cuda_mkey_pack(uct_md_h md, uct_mem_h memh, - void *rkey_buffer) -{ - return UCS_OK; -} - -static ucs_status_t uct_cuda_rkey_unpack(uct_md_component_t *mdc, - const void *rkey_buffer, uct_rkey_t *rkey_p, - void **handle_p) -{ - *rkey_p = 0xdeadbeef; - *handle_p = NULL; - return UCS_OK; -} - -static ucs_status_t uct_cuda_rkey_release(uct_md_component_t *mdc, uct_rkey_t rkey, - void *handle) -{ - return UCS_OK; -} - -static ucs_status_t uct_cuda_mem_reg(uct_md_h md, void *address, size_t length, - unsigned flags, uct_mem_h *memh_p) -{ - ucs_status_t rc; - uct_mem_h * mem_hndl = NULL; - mem_hndl = ucs_malloc(sizeof(void *), "cuda handle for test passing"); - if (NULL == mem_hndl) { - ucs_error("Failed to allocate memory for gni_mem_handle_t"); - rc = UCS_ERR_NO_MEMORY; - goto mem_err; - } - *memh_p = mem_hndl; - return UCS_OK; - mem_err: - return rc; -} - -static ucs_status_t uct_cuda_mem_dereg(uct_md_h md, uct_mem_h memh) -{ - ucs_free(memh); - return UCS_OK; -} - -static ucs_status_t uct_cuda_query_md_resources(uct_md_resource_desc_t **resources_p, - unsigned *num_resources_p) -{ - return uct_single_md_resource(&uct_cuda_md, resources_p, num_resources_p); -} - -static ucs_status_t uct_cuda_md_open(const char *md_name, const uct_md_config_t *md_config, - uct_md_h *md_p) -{ - static uct_md_ops_t md_ops = { - .close = (void*)ucs_empty_function, - .query = uct_cuda_md_query, - .mkey_pack = uct_cuda_mkey_pack, - .mem_reg = uct_cuda_mem_reg, - .mem_dereg = uct_cuda_mem_dereg - }; - static uct_md_t md = { - .ops = &md_ops, - .component = &uct_cuda_md - }; - - *md_p = &md; - return UCS_OK; -} - -UCT_MD_COMPONENT_DEFINE(uct_cuda_md, UCT_CUDA_MD_NAME, - uct_cuda_query_md_resources, uct_cuda_md_open, NULL, - uct_cuda_rkey_unpack, uct_cuda_rkey_release, "CUDA_", - uct_md_config_table, uct_md_config_t); - diff --git a/src/uct/cuda/cuda_md.h b/src/uct/cuda/cuda_md.h deleted file mode 100644 index 4c947e7d5af..00000000000 --- a/src/uct/cuda/cuda_md.h +++ /dev/null @@ -1,16 +0,0 @@ -/** - * Copyright (c) UT-Battelle, LLC. 2014-2015. ALL RIGHTS RESERVED. - * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. - * See file LICENSE for terms. 
- */
-
-#ifndef UCT_CUDA_CONTEXT_H
-#define UCT_CUDA_CONTEXT_H
-
-#include
-
-#define UCT_CUDA_MD_NAME "gpu"
-
-extern uct_md_component_t uct_cuda_md;
-
-#endif

From deb9c6474c9b525dc4ac2776788b47fa70349928 Mon Sep 17 00:00:00 2001
From: Devendar Bureddy
Date: Tue, 26 Sep 2017 15:13:34 -0700
Subject: [PATCH 12/16] CUDA: GDR COPY UCT implementation

- copy to CUDA memory using the GDRCopy utility
- GDRCopy: https://github.com/NVIDIA/gdrcopy
---
 src/uct/cuda/gdr_copy/gdr_copy_ep.c    |  55 ++++
 src/uct/cuda/gdr_copy/gdr_copy_ep.h    |  32 +++
 src/uct/cuda/gdr_copy/gdr_copy_iface.c | 165 +++++++++++
 src/uct/cuda/gdr_copy/gdr_copy_iface.h |  25 ++
 src/uct/cuda/gdr_copy/gdr_copy_md.c    | 366 +++++++++++++++++++++++++
 src/uct/cuda/gdr_copy/gdr_copy_md.h    |  64 +++++
 6 files changed, 707 insertions(+)
 create mode 100644 src/uct/cuda/gdr_copy/gdr_copy_ep.c
 create mode 100644 src/uct/cuda/gdr_copy/gdr_copy_ep.h
 create mode 100644 src/uct/cuda/gdr_copy/gdr_copy_iface.c
 create mode 100644 src/uct/cuda/gdr_copy/gdr_copy_iface.h
 create mode 100644 src/uct/cuda/gdr_copy/gdr_copy_md.c
 create mode 100644 src/uct/cuda/gdr_copy/gdr_copy_md.h

diff --git a/src/uct/cuda/gdr_copy/gdr_copy_ep.c b/src/uct/cuda/gdr_copy/gdr_copy_ep.c
new file mode 100644
index 00000000000..c5f817f8016
--- /dev/null
+++ b/src/uct/cuda/gdr_copy/gdr_copy_ep.c
@@ -0,0 +1,55 @@
+/**
+ * Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED.
+ * See file LICENSE for terms.
+ */
+
+#include "gdr_copy_ep.h"
+#include "gdr_copy_md.h"
+#include "gdr_copy_iface.h"
+
+#include
+#include
+#include
+
+
+static UCS_CLASS_INIT_FUNC(uct_gdr_copy_ep_t, uct_iface_t *tl_iface,
+                           const uct_device_addr_t *dev_addr,
+                           const uct_iface_addr_t *iface_addr)
+{
+    uct_gdr_copy_iface_t *iface = ucs_derived_of(tl_iface, uct_gdr_copy_iface_t);
+    UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super)
+    return UCS_OK;
+}
+
+static UCS_CLASS_CLEANUP_FUNC(uct_gdr_copy_ep_t)
+{
+}
+
+UCS_CLASS_DEFINE(uct_gdr_copy_ep_t, uct_base_ep_t)
+UCS_CLASS_DEFINE_NEW_FUNC(uct_gdr_copy_ep_t, uct_ep_t, uct_iface_t*,
+                          const uct_device_addr_t *, const uct_iface_addr_t *);
+UCS_CLASS_DEFINE_DELETE_FUNC(uct_gdr_copy_ep_t, uct_ep_t);
+
+
+ucs_status_t uct_gdr_copy_ep_put_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, size_t iovcnt,
+                                       uint64_t remote_addr, uct_rkey_t rkey,
+                                       uct_completion_t *comp)
+{
+    uct_gdr_copy_iface_t *iface    = ucs_derived_of(tl_ep->iface, uct_gdr_copy_iface_t);
+    uct_gdr_copy_md_t    *md       = (uct_gdr_copy_md_t *)iface->super.md;
+    uct_gdr_copy_mem_t   *mem_hndl = (uct_gdr_copy_mem_t *)rkey;
+    gdr_info_t gdr_info;
+    size_t bar_off;
+
+    assert(iovcnt == 1);
+
+    if (gdr_get_info(md->gdrcpy_ctx, mem_hndl->mh, &gdr_info) != 0) {
+        ucs_error("gdr_get_info failed");
+        return UCS_ERR_IO_ERROR;
+    }
+
+    /* translate the remote VA into an offset within the pinned BAR mapping */
+    bar_off = remote_addr - gdr_info.va;
+
+    gdr_copy_to_bar(mem_hndl->bar_ptr + bar_off, iov[0].buffer, iov[0].length);
+
+    return UCS_OK;
+}
diff --git a/src/uct/cuda/gdr_copy/gdr_copy_ep.h b/src/uct/cuda/gdr_copy/gdr_copy_ep.h
new file mode 100644
index 00000000000..f2c1093a548
--- /dev/null
+++ b/src/uct/cuda/gdr_copy/gdr_copy_ep.h
@@ -0,0 +1,32 @@
+/**
+* Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED.
+* See file LICENSE for terms.
+*/ + +#ifndef UCT_SYSV_EP_H +#define UCT_SYSV_EP_H + +#include +#include +#include + + +typedef struct uct_gdr_copy_ep_addr { + int ep_id; +} uct_gdr_copy_ep_addr_t; + +typedef struct uct_gdr_copy_ep { + uct_base_ep_t super; + struct uct_gdr_copy_ep *next; +} uct_gdr_copy_ep_t; + +UCS_CLASS_DECLARE_NEW_FUNC(uct_gdr_copy_ep_t, uct_ep_t, uct_iface_t*, + const uct_device_addr_t *, const uct_iface_addr_t *); +UCS_CLASS_DECLARE_DELETE_FUNC(uct_gdr_copy_ep_t, uct_ep_t); + +ucs_status_t uct_gdr_copy_ep_put_zcopy(uct_ep_h tl_ep, + const uct_iov_t *iov, size_t iovcnt, + uint64_t remote_addr, uct_rkey_t rkey, + uct_completion_t *comp); + +#endif diff --git a/src/uct/cuda/gdr_copy/gdr_copy_iface.c b/src/uct/cuda/gdr_copy/gdr_copy_iface.c new file mode 100644 index 00000000000..21c56bfc810 --- /dev/null +++ b/src/uct/cuda/gdr_copy/gdr_copy_iface.c @@ -0,0 +1,165 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ + +#include "gdr_copy_iface.h" +#include "gdr_copy_md.h" +#include "gdr_copy_ep.h" + +#include +#include + + +static ucs_config_field_t uct_gdr_copy_iface_config_table[] = { + + {"", "", NULL, + ucs_offsetof(uct_gdr_copy_iface_config_t, super), + UCS_CONFIG_TYPE_TABLE(uct_iface_config_table)}, + + {NULL} +}; + + +/* Forward declaration for the delete function */ +static void UCS_CLASS_DELETE_FUNC_NAME(uct_gdr_copy_iface_t)(uct_iface_t*); + + +static ucs_status_t uct_gdr_copy_iface_get_address(uct_iface_h tl_iface, + uct_iface_addr_t *iface_addr) +{ + int *gdr_copy_addr = (int*)iface_addr; + *gdr_copy_addr = 0; + return UCS_OK; +} + +static int uct_gdr_copy_iface_is_reachable(const uct_iface_h iface, const uct_device_addr_t *dev_addr, + const uct_iface_addr_t *iface_addr) +{ + return 1; +} + +static ucs_status_t uct_gdr_copy_iface_query(uct_iface_h iface, + uct_iface_attr_t *iface_attr) +{ + memset(iface_attr, 0, sizeof(uct_iface_attr_t)); + + /* FIXME all of these values */ + iface_attr->iface_addr_len = sizeof(int); + iface_attr->device_addr_len = 0; + iface_attr->ep_addr_len = 0; + iface_attr->cap.flags = UCT_IFACE_FLAG_CONNECT_TO_IFACE | + UCT_IFACE_FLAG_PUT_ZCOPY | + UCT_IFACE_FLAG_PENDING; + + iface_attr->cap.put.max_short = 0; + iface_attr->cap.put.max_bcopy = 0; + iface_attr->cap.put.min_zcopy = 0; + iface_attr->cap.put.max_zcopy = 0; + iface_attr->cap.put.opt_zcopy_align = 1; + iface_attr->cap.put.align_mtu = iface_attr->cap.put.opt_zcopy_align; + iface_attr->cap.put.max_iov = 1; + + iface_attr->cap.get.max_bcopy = 0; + iface_attr->cap.get.min_zcopy = 0; + iface_attr->cap.get.max_zcopy = 0; + iface_attr->cap.get.opt_zcopy_align = 1; + iface_attr->cap.get.align_mtu = iface_attr->cap.get.opt_zcopy_align; + iface_attr->cap.get.max_iov = 1; + + iface_attr->cap.am.max_short = -1; + iface_attr->cap.am.max_bcopy = 0; + iface_attr->cap.am.min_zcopy = 0; + iface_attr->cap.am.max_zcopy = 0; + iface_attr->cap.am.opt_zcopy_align = 1; + iface_attr->cap.am.align_mtu = iface_attr->cap.am.opt_zcopy_align; + iface_attr->cap.am.max_hdr = 0; + iface_attr->cap.am.max_iov = 1; + + iface_attr->latency.overhead = 2e-6; + iface_attr->latency.growth = 0; + iface_attr->bandwidth = 6911 * 1024.0 * 1024.0; + iface_attr->overhead = 100; + iface_attr->priority = 0; + + return UCS_OK; +} + +static uct_iface_ops_t uct_gdr_copy_iface_ops = { + .ep_put_zcopy = uct_gdr_copy_ep_put_zcopy, + .ep_pending_add = ucs_empty_function_return_busy, + .ep_pending_purge = ucs_empty_function, + .ep_flush = uct_base_ep_flush, + .ep_fence = uct_base_ep_fence, + 
.ep_create_connected = UCS_CLASS_NEW_FUNC_NAME(uct_gdr_copy_ep_t), + .ep_destroy = UCS_CLASS_DELETE_FUNC_NAME(uct_gdr_copy_ep_t), + .iface_flush = uct_base_iface_flush, + .iface_fence = uct_base_iface_fence, + .iface_progress_enable = ucs_empty_function, + .iface_progress_disable = ucs_empty_function, + .iface_progress = ucs_empty_function_return_zero, + .iface_close = UCS_CLASS_DELETE_FUNC_NAME(uct_gdr_copy_iface_t), + .iface_query = uct_gdr_copy_iface_query, + .iface_get_device_address = (void*)ucs_empty_function_return_success, + .iface_get_address = uct_gdr_copy_iface_get_address, + .iface_is_reachable = uct_gdr_copy_iface_is_reachable, +}; + +static UCS_CLASS_INIT_FUNC(uct_gdr_copy_iface_t, uct_md_h md, uct_worker_h worker, + const uct_iface_params_t *params, + const uct_iface_config_t *tl_config) +{ + UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &uct_gdr_copy_iface_ops, md, worker, + params, tl_config UCS_STATS_ARG(params->stats_root) + UCS_STATS_ARG(UCT_GDR_COPY_TL_NAME)); + + if (strcmp(params->mode.device.dev_name, UCT_CUDA_DEV_NAME) != 0) { + ucs_error("No device was found: %s", params->mode.device.dev_name); + return UCS_ERR_NO_DEVICE; + } + + return UCS_OK; +} + +static UCS_CLASS_CLEANUP_FUNC(uct_gdr_copy_iface_t) +{ + /* tasks to tear down the domain */ +} + +UCS_CLASS_DEFINE(uct_gdr_copy_iface_t, uct_base_iface_t); +UCS_CLASS_DEFINE_NEW_FUNC(uct_gdr_copy_iface_t, uct_iface_t, uct_md_h, uct_worker_h, + const uct_iface_params_t*, const uct_iface_config_t*); +static UCS_CLASS_DEFINE_DELETE_FUNC(uct_gdr_copy_iface_t, uct_iface_t); + + +static ucs_status_t uct_gdr_copy_query_tl_resources(uct_md_h md, + uct_tl_resource_desc_t **resource_p, + unsigned *num_resources_p) +{ + uct_tl_resource_desc_t *resource; + + resource = ucs_calloc(1, sizeof(uct_tl_resource_desc_t), "resource desc"); + if (NULL == resource) { + ucs_error("Failed to allocate memory"); + return UCS_ERR_NO_MEMORY; + } + + ucs_snprintf_zero(resource->tl_name, sizeof(resource->tl_name), "%s", + UCT_GDR_COPY_TL_NAME); + ucs_snprintf_zero(resource->dev_name, sizeof(resource->dev_name), "%s", + UCT_CUDA_DEV_NAME); + resource->dev_type = UCT_DEVICE_TYPE_ACC; + + *num_resources_p = 1; + *resource_p = resource; + return UCS_OK; +} + +UCT_TL_COMPONENT_DEFINE(uct_gdr_copy_tl, + uct_gdr_copy_query_tl_resources, + uct_gdr_copy_iface_t, + UCT_GDR_COPY_TL_NAME, + "GDR_COPY_", + uct_gdr_copy_iface_config_table, + uct_gdr_copy_iface_config_t); +UCT_MD_REGISTER_TL(&uct_gdr_copy_md_component, &uct_gdr_copy_tl); diff --git a/src/uct/cuda/gdr_copy/gdr_copy_iface.h b/src/uct/cuda/gdr_copy/gdr_copy_iface.h new file mode 100644 index 00000000000..9cea8e6c031 --- /dev/null +++ b/src/uct/cuda/gdr_copy/gdr_copy_iface.h @@ -0,0 +1,25 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ + +#ifndef UCT_CUDA_IFACE_H +#define UCT_CUDA_IFACE_H + +#include + + +#define UCT_GDR_COPY_TL_NAME "gdr_copy" +#define UCT_CUDA_DEV_NAME "gdrcopy0" + + +typedef struct uct_gdr_copy_iface { + uct_base_iface_t super; +} uct_gdr_copy_iface_t; + + +typedef struct uct_gdr_copy_iface_config { + uct_iface_config_t super; +} uct_gdr_copy_iface_config_t; + +#endif diff --git a/src/uct/cuda/gdr_copy/gdr_copy_md.c b/src/uct/cuda/gdr_copy/gdr_copy_md.c new file mode 100644 index 00000000000..e94b5d31382 --- /dev/null +++ b/src/uct/cuda/gdr_copy/gdr_copy_md.c @@ -0,0 +1,366 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. + * See file LICENSE for terms. 
+ */ + +#include "gdr_copy_md.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#define UCT_GDR_COPY_MD_RCACHE_DEFAULT_ALIGN 4096 + +static ucs_config_field_t uct_gdr_copy_md_config_table[] = { + {"", "", NULL, + ucs_offsetof(uct_gdr_copy_md_config_t, super), UCS_CONFIG_TYPE_TABLE(uct_md_config_table)}, + + {"RCACHE", "try", "Enable using memory registration cache", + ucs_offsetof(uct_gdr_copy_md_config_t, rcache.enable), UCS_CONFIG_TYPE_TERNARY}, + + {"RCACHE_ADDR_ALIGN", UCS_PP_MAKE_STRING(UCT_GDR_COPY_MD_RCACHE_DEFAULT_ALIGN), + "Registration cache address alignment, must be power of 2\n" + "between "UCS_PP_MAKE_STRING(UCS_PGT_ADDR_ALIGN)"and system page size", + ucs_offsetof(uct_gdr_copy_md_config_t, rcache.alignment), UCS_CONFIG_TYPE_UINT}, + + {"RCACHE_MEM_PRIO", "1000", "Registration cache memory event priority", + ucs_offsetof(uct_gdr_copy_md_config_t, rcache.event_prio), UCS_CONFIG_TYPE_UINT}, + + {"RCACHE_OVERHEAD", "90ns", "Registration cache lookup overhead", + ucs_offsetof(uct_gdr_copy_md_config_t, rcache.overhead), UCS_CONFIG_TYPE_TIME}, + + {"MEM_REG_OVERHEAD", "16us", "Memory registration overhead", /* TODO take default from device */ + ucs_offsetof(uct_gdr_copy_md_config_t, uc_reg_cost.overhead), UCS_CONFIG_TYPE_TIME}, + + {"MEM_REG_GROWTH", "0.06ns", "Memory registration growth rate", /* TODO take default from device */ + ucs_offsetof(uct_gdr_copy_md_config_t, uc_reg_cost.growth), UCS_CONFIG_TYPE_TIME}, + + {NULL} +}; + +static ucs_status_t uct_gdr_copy_md_query(uct_md_h md, uct_md_attr_t *md_attr) +{ + md_attr->cap.flags = UCT_MD_FLAG_REG; + md_attr->cap.mem_type = UCT_MD_MEM_TYPE_CUDA; + md_attr->cap.max_alloc = 0; + md_attr->cap.max_reg = ULONG_MAX; + md_attr->rkey_packed_size = 0; + md_attr->reg_cost.overhead = 0; + md_attr->reg_cost.growth = 0; + memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus)); + return UCS_OK; +} + +static ucs_status_t uct_gdr_copy_mkey_pack(uct_md_h md, uct_mem_h memh, + void *rkey_buffer) +{ + return UCS_OK; +} + +static ucs_status_t uct_gdr_copy_rkey_unpack(uct_md_component_t *mdc, + const void *rkey_buffer, uct_rkey_t *rkey_p, + void **handle_p) +{ + *rkey_p = 0xdeadbeef; + *handle_p = NULL; + return UCS_OK; +} + +static ucs_status_t uct_gdr_copy_rkey_release(uct_md_component_t *mdc, uct_rkey_t rkey, + void *handle) +{ + return UCS_OK; +} + +static ucs_status_t uct_gdr_copy_mem_reg_internal(uct_md_h uct_md, void *address, size_t length, + unsigned flags, uct_gdr_copy_mem_t *mem_hndl) +{ + uct_gdr_copy_md_t *md = ucs_derived_of(uct_md, uct_gdr_copy_md_t); + CUdeviceptr d_ptr = ((CUdeviceptr )(char *) address); + gdr_mh_t mh; + void *bar_ptr; + + if (gdr_pin_buffer(md->gdrcpy_ctx, d_ptr, length, 0, 0, &mh) != 0) { + ucs_error("gdr_pin_buffer Failed. length :%lu ", length); + return UCS_ERR_IO_ERROR; + + } + if (mh == 0) { + ucs_error("gdr_pin_buffer Failed. length :%lu ", length); + return UCS_ERR_IO_ERROR; + } + + if (gdr_map(md->gdrcpy_ctx, mh, &bar_ptr, length) !=0) { + ucs_error("gdr_map failed. length :%lu ", length); + return UCS_ERR_IO_ERROR; + } + + mem_hndl->mh = mh; + mem_hndl->bar_ptr = bar_ptr; + mem_hndl->reg_size = length; + + return UCS_OK; + +} + +static ucs_status_t uct_gdr_copy_mem_dereg_internal(uct_md_h uct_md, uct_gdr_copy_mem_t *mem_hndl) +{ + + uct_gdr_copy_md_t *md = ucs_derived_of(uct_md, uct_gdr_copy_md_t); + + if (gdr_unmap(md->gdrcpy_ctx, mem_hndl->mh, mem_hndl->bar_ptr, mem_hndl->reg_size) !=0) { + ucs_error("gdr_unmap Failed. 
unpin_size:%lu ", mem_hndl->reg_size); + return UCS_ERR_IO_ERROR; + } + if (gdr_unpin_buffer(md->gdrcpy_ctx, mem_hndl->mh) !=0) { + ucs_error("gdr_unpin_buffer failed "); + return UCS_ERR_IO_ERROR; + } + return UCS_OK; +} + +static ucs_status_t uct_gdr_copy_mem_reg(uct_md_h uct_md, void *address, size_t length, + unsigned flags, uct_mem_h *memh_p) +{ + uct_gdr_copy_mem_t * mem_hndl = NULL; + size_t reg_size; + void *ptr; + ucs_status_t status; + + + mem_hndl = ucs_malloc(sizeof(uct_gdr_copy_mem_t), "gdr_copy handle"); + if (NULL == mem_hndl) { + ucs_error("Failed to allocate memory for uct_gdr_copy_mem_t"); + return UCS_ERR_NO_MEMORY; + } + + reg_size = (length + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK; + ptr = (void *) ((uintptr_t)address & GPU_PAGE_MASK); + + status = uct_gdr_copy_mem_reg_internal(uct_md, ptr, reg_size, 0, mem_hndl); + if (status != UCS_OK) { + free(mem_hndl); + return status; + } + + *memh_p = mem_hndl; + return UCS_OK; +} + +static ucs_status_t uct_gdr_copy_mem_dereg(uct_md_h uct_md, uct_mem_h memh) +{ + uct_gdr_copy_mem_t *mem_hndl = memh; + ucs_status_t status; + + status = uct_gdr_copy_mem_dereg_internal(uct_md, mem_hndl); + free(mem_hndl); + return status; +} + +static ucs_status_t uct_gdr_copy_mem_type_detect(uct_md_h md, void *addr) +{ + int memory_type; + cudaError_t cuda_err = cudaSuccess; + struct cudaPointerAttributes attributes; + CUresult cu_err = CUDA_SUCCESS; + + if (addr == NULL) { + return UCS_ERR_INVALID_ADDR; + } + + cu_err = cuPointerGetAttribute(&memory_type, + CU_POINTER_ATTRIBUTE_MEMORY_TYPE, + (CUdeviceptr)addr); + if (cu_err != CUDA_SUCCESS) { + cuda_err = cudaPointerGetAttributes (&attributes, addr); + if (cuda_err == cudaSuccess) { + if (attributes.memoryType == cudaMemoryTypeDevice) { + return UCS_OK; + } + } + } else if (memory_type == CU_MEMORYTYPE_DEVICE) { + return UCS_OK; + } + + return UCS_ERR_INVALID_ADDR; +} + +static ucs_status_t uct_gdr_copy_query_md_resources(uct_md_resource_desc_t **resources_p, + unsigned *num_resources_p) +{ + + return uct_single_md_resource(&uct_gdr_copy_md_component, resources_p, num_resources_p); +} + +static void uct_gdr_copy_md_close(uct_md_h uct_md) +{ + uct_gdr_copy_md_t *md = ucs_derived_of(uct_md, uct_gdr_copy_md_t); + + if (md->rcache != NULL) { + ucs_rcache_destroy(md->rcache); + } + + if (gdr_close(md->gdrcpy_ctx) != 0) { + ucs_error("Failed to close gdrcopy"); + } + + ucs_free(md); +} + +static uct_md_ops_t md_ops = { + .close = uct_gdr_copy_md_close, + .query = uct_gdr_copy_md_query, + .mkey_pack = uct_gdr_copy_mkey_pack, + .mem_reg = uct_gdr_copy_mem_reg, + .mem_dereg = uct_gdr_copy_mem_dereg, + .mem_type_detect = uct_gdr_copy_mem_type_detect, +}; + +static inline uct_gdr_copy_rcache_region_t* uct_gdr_copy_rache_region_from_memh(uct_mem_h memh) +{ + return ucs_container_of(memh, uct_gdr_copy_rcache_region_t, memh); +} + +static ucs_status_t uct_gdr_copy_mem_rcache_reg(uct_md_h uct_md, void *address, + size_t length, unsigned flags, + uct_mem_h *memh_p) +{ + uct_gdr_copy_md_t *md = ucs_derived_of(uct_md, uct_gdr_copy_md_t); + ucs_rcache_region_t *rregion; + ucs_status_t status; + uct_gdr_copy_mem_t *memh; + + status = ucs_rcache_get(md->rcache, address, length, PROT_READ|PROT_WRITE, + &flags, &rregion); + if (status != UCS_OK) { + return status; + } + + ucs_assert(rregion->refcount > 0); + memh = &ucs_derived_of(rregion, uct_gdr_copy_rcache_region_t)->memh; + *memh_p = memh; + return UCS_OK; +} + +static ucs_status_t uct_gdr_copy_mem_rcache_dereg(uct_md_h uct_md, uct_mem_h memh) +{ + uct_gdr_copy_md_t 
*md = ucs_derived_of(uct_md, uct_gdr_copy_md_t); + uct_gdr_copy_rcache_region_t *region = uct_gdr_copy_rache_region_from_memh(memh); + + ucs_rcache_region_put(md->rcache, ®ion->super); + return UCS_OK; +} + +static uct_md_ops_t md_rcache_ops = { + .close = uct_gdr_copy_md_close, + .query = uct_gdr_copy_md_query, + .mkey_pack = uct_gdr_copy_mkey_pack, + .mem_reg = uct_gdr_copy_mem_rcache_reg, + .mem_dereg = uct_gdr_copy_mem_rcache_dereg, + .mem_type_detect = uct_gdr_copy_mem_type_detect +}; +static ucs_status_t uct_gdr_copy_rcache_mem_reg_cb(void *context, ucs_rcache_t *rcache, + void *arg, ucs_rcache_region_t *rregion) +{ + uct_gdr_copy_rcache_region_t *region = ucs_derived_of(rregion, uct_gdr_copy_rcache_region_t); + uct_gdr_copy_md_t *md = context; + int *flags = arg; + ucs_status_t status; + + status = uct_gdr_copy_mem_reg_internal(&md->super, (void*)region->super.super.start, + region->super.super.end - region->super.super.start, + *flags, ®ion->memh); + if (status != UCS_OK) { + return status; + } + + return UCS_OK; +} + +static void uct_gdr_copy_rcache_mem_dereg_cb(void *context, ucs_rcache_t *rcache, + ucs_rcache_region_t *rregion) +{ + uct_gdr_copy_rcache_region_t *region = ucs_derived_of(rregion, uct_gdr_copy_rcache_region_t); + uct_gdr_copy_md_t *md = context; + + (void)uct_gdr_copy_mem_dereg_internal(&md->super, ®ion->memh); +} + +static void uct_gdr_copy_rcache_dump_region_cb(void *context, ucs_rcache_t *rcache, + ucs_rcache_region_t *rregion, char *buf, + size_t max) +{ + +} + +static ucs_rcache_ops_t uct_gdr_copy_rcache_ops = { + .mem_reg = uct_gdr_copy_rcache_mem_reg_cb, + .mem_dereg = uct_gdr_copy_rcache_mem_dereg_cb, + .dump_region = uct_gdr_copy_rcache_dump_region_cb +}; + +static ucs_status_t uct_gdr_copy_md_open(const char *md_name, const uct_md_config_t *uct_md_config, + uct_md_h *md_p) +{ + ucs_status_t status; + uct_gdr_copy_md_t *md; + const uct_gdr_copy_md_config_t *md_config = ucs_derived_of(uct_md_config, uct_gdr_copy_md_config_t); + ucs_rcache_params_t rcache_params; + + md = ucs_malloc(sizeof(uct_gdr_copy_md_t), "uct_gdr_copy_md_t"); + if (NULL == md) { + ucs_error("Failed to allocate memory for uct_gdr_copy_md_t"); + return UCS_ERR_NO_MEMORY; + } + + md->super.ops = &md_ops; + md->super.component = &uct_gdr_copy_md_component; + md->rcache = NULL; + md->reg_cost = md_config->uc_reg_cost; + + + + md->gdrcpy_ctx = gdr_open(); + if (md->gdrcpy_ctx == (void *)0) { + ucs_error("Failed to open gdrcopy "); + return UCS_ERR_IO_ERROR; + } + + if (md_config->rcache.enable != UCS_NO) { + // UCS_STATIC_ASSERT(UCS_PGT_ADDR_ALIGN >= UCT_GDR_COPY_MD_RCACHE_DEFAULT_ALIGN); + rcache_params.region_struct_size = sizeof(uct_gdr_copy_rcache_region_t); + rcache_params.alignment = md_config->rcache.alignment; + rcache_params.ucm_event_priority = md_config->rcache.event_prio; + rcache_params.context = md; + rcache_params.ops = &uct_gdr_copy_rcache_ops; + status = ucs_rcache_create(&rcache_params, "gdr_copy" UCS_STATS_ARG(NULL), &md->rcache); + if (status == UCS_OK) { + md->super.ops = &md_rcache_ops; + md->reg_cost.overhead = 0; + md->reg_cost.growth = 0; /* It's close enough to 0 */ + } else { + ucs_assert(md->rcache == NULL); + if (md_config->rcache.enable == UCS_YES) { + ucs_error("Failed to create registration cache: %s", + ucs_status_string(status)); + return UCS_ERR_IO_ERROR; + } else { + ucs_debug("Could not create registration cache for: %s", + ucs_status_string(status)); + } + } + } + + *md_p = (uct_md_h) md; + return UCS_OK; +} + 
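+/*
+ * Illustration, not part of the transport itself: the registration and copy
+ * paths above reduce to the GDRCopy sequence below, composed end-to-end. The
+ * function name and parameters are invented for the example, error handling
+ * is abbreviated, and d_ptr/size are assumed GPU-page aligned, as
+ * uct_gdr_copy_mem_reg() ensures.
+ */
+static ucs_status_t uct_gdr_copy_usage_sketch(CUdeviceptr d_ptr, const void *src,
+                                              size_t size)
+{
+    gdr_t      g = gdr_open();    /* one context per MD, as in md_open() */
+    gdr_info_t info;
+    void       *bar_ptr;
+    gdr_mh_t   mh;
+
+    if (g == NULL) {
+        return UCS_ERR_IO_ERROR;
+    }
+    if (gdr_pin_buffer(g, d_ptr, size, 0, 0, &mh) != 0) {  /* pin GPU pages */
+        goto err_close;
+    }
+    if (gdr_map(g, mh, &bar_ptr, size) != 0) {     /* map the BAR to CPU VA */
+        goto err_unpin;
+    }
+    if (gdr_get_info(g, mh, &info) != 0) {         /* info.va = pinned base */
+        goto err_unmap;
+    }
+
+    /* same VA-to-BAR-offset translation as uct_gdr_copy_ep_put_zcopy() */
+    gdr_copy_to_bar((char*)bar_ptr + (d_ptr - info.va), src, size);
+
+    gdr_unmap(g, mh, bar_ptr, size);
+    gdr_unpin_buffer(g, mh);
+    gdr_close(g);
+    return UCS_OK;
+
+err_unmap:
+    gdr_unmap(g, mh, bar_ptr, size);
+err_unpin:
+    gdr_unpin_buffer(g, mh);
+err_close:
+    gdr_close(g);
+    return UCS_ERR_IO_ERROR;
+}
+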
+UCT_MD_COMPONENT_DEFINE(uct_gdr_copy_md_component, UCT_GDR_COPY_MD_NAME, + uct_gdr_copy_query_md_resources, uct_gdr_copy_md_open, NULL, + uct_gdr_copy_rkey_unpack, uct_gdr_copy_rkey_release, "GDR_COPY_", + uct_gdr_copy_md_config_table, uct_gdr_copy_md_config_t); diff --git a/src/uct/cuda/gdr_copy/gdr_copy_md.h b/src/uct/cuda/gdr_copy/gdr_copy_md.h new file mode 100644 index 00000000000..2f8e98a5d4b --- /dev/null +++ b/src/uct/cuda/gdr_copy/gdr_copy_md.h @@ -0,0 +1,64 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ + +#ifndef UCT_GDR_COPY_MD_H +#define UCT_GDR_COPY_MD_H + +#include +#include +#include "gdrapi.h" + +#define UCT_GDR_COPY_MD_NAME "gdr_copy" + +extern uct_md_component_t uct_gdr_copy_md_component; + +/** + * @brief gdr_copy MD descriptor + */ +typedef struct uct_gdr_copy_md { + struct uct_md super; /**< Domain info */ + gdr_t gdrcpy_ctx; /**< gdr copy context */ + ucs_rcache_t *rcache; /**< Registration cache (can be NULL) */ + uct_linear_growth_t reg_cost; /**< Memory registration cost */ +} uct_gdr_copy_md_t; + +/** + * gdr copy domain configuration. + */ +typedef struct uct_gdr_copy_md_config { + uct_md_config_t super; + struct { + ucs_ternary_value_t enable; /**< Enable registration cache */ + size_t alignment; /**< Force address alignment */ + unsigned event_prio; /**< Memory events priority */ + double overhead; /**< Lookup overhead estimation */ + } rcache; + + uct_linear_growth_t uc_reg_cost; /**< Memory registration cost estimation + without using the cache */ + + +} uct_gdr_copy_md_config_t; + + +/** + * @brief gdr copy mem handle + */ +typedef struct uct_gdr_copy_mem { + gdr_mh_t mh; + void *bar_ptr; + size_t reg_size; +} uct_gdr_copy_mem_t; + +/** + * cuda memory region in the registration cache. 
+ */ +typedef struct uct_gdr_copy_rcache_region { + ucs_rcache_region_t super; + uct_gdr_copy_mem_t memh; /**< mr exposed to the user as the memh */ +} uct_gdr_copy_rcache_region_t; + + +#endif From 51b140509f05d388e1eb3caf561ea6037a6d50a5 Mon Sep 17 00:00:00 2001 From: Devendar Bureddy Date: Tue, 26 Sep 2017 15:14:29 -0700 Subject: [PATCH 13/16] CUDA: CUDA COPY UCT - copy to/from cuda memory to system memory - for pipeline rndv transfers --- src/uct/Makefile.am | 25 ++- src/uct/cuda/cuda_copy/cuda_copy_ep.c | 60 ++++++ src/uct/cuda/cuda_copy/cuda_copy_ep.h | 32 +++ src/uct/cuda/cuda_copy/cuda_copy_iface.c | 241 +++++++++++++++++++++++ src/uct/cuda/cuda_copy/cuda_copy_iface.h | 50 +++++ src/uct/cuda/cuda_copy/cuda_copy_md.c | 163 +++++++++++++++ src/uct/cuda/cuda_copy/cuda_copy_md.h | 30 +++ 7 files changed, 595 insertions(+), 6 deletions(-) create mode 100644 src/uct/cuda/cuda_copy/cuda_copy_ep.c create mode 100644 src/uct/cuda/cuda_copy/cuda_copy_ep.h create mode 100644 src/uct/cuda/cuda_copy/cuda_copy_iface.c create mode 100644 src/uct/cuda/cuda_copy/cuda_copy_iface.h create mode 100644 src/uct/cuda/cuda_copy/cuda_copy_md.c create mode 100644 src/uct/cuda/cuda_copy/cuda_copy_md.h diff --git a/src/uct/Makefile.am b/src/uct/Makefile.am index 780b494a843..97318ecaab5 100644 --- a/src/uct/Makefile.am +++ b/src/uct/Makefile.am @@ -195,14 +195,27 @@ endif if HAVE_CUDA noinst_HEADERS += \ - cuda/cuda_md.h \ - cuda/cuda_iface.h \ - cuda/cuda_ep.h + cuda/cuda_copy/cuda_copy_md.h \ + cuda/cuda_copy/cuda_copy_iface.h \ + cuda/cuda_copy/cuda_copy_ep.h libuct_la_SOURCES += \ - cuda/cuda_md.c \ - cuda/cuda_iface.c \ - cuda/cuda_ep.c + cuda/cuda_copy/cuda_copy_md.c \ + cuda/cuda_copy/cuda_copy_iface.c \ + cuda/cuda_copy/cuda_copy_ep.c + +if HAVE_GDR_COPY +noinst_HEADERS += \ + cuda/gdr_copy/gdr_copy_md.h \ + cuda/gdr_copy/gdr_copy_iface.h \ + cuda/gdr_copy/gdr_copy_ep.h + +libuct_la_SOURCES += \ + cuda/gdr_copy/gdr_copy_md.c \ + cuda/gdr_copy/gdr_copy_iface.c \ + cuda/gdr_copy/gdr_copy_ep.c +endif + endif if HAVE_ROCM diff --git a/src/uct/cuda/cuda_copy/cuda_copy_ep.c b/src/uct/cuda/cuda_copy/cuda_copy_ep.c new file mode 100644 index 00000000000..fc36d66fc08 --- /dev/null +++ b/src/uct/cuda/cuda_copy/cuda_copy_ep.c @@ -0,0 +1,60 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED. + * See file LICENSE for terms. 
+ */ + +#include "cuda_copy_ep.h" +#include "cuda_copy_iface.h" + +#include +#include +#include + + +static UCS_CLASS_INIT_FUNC(uct_cuda_copy_ep_t, uct_iface_t *tl_iface, + const uct_device_addr_t *dev_addr, + const uct_iface_addr_t *iface_addr) +{ + uct_cuda_copy_iface_t *iface = ucs_derived_of(tl_iface, uct_cuda_copy_iface_t); + UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super) + return UCS_OK; +} + +static UCS_CLASS_CLEANUP_FUNC(uct_cuda_copy_ep_t) +{ +} + +UCS_CLASS_DEFINE(uct_cuda_copy_ep_t, uct_base_ep_t) +UCS_CLASS_DEFINE_NEW_FUNC(uct_cuda_copy_ep_t, uct_ep_t, uct_iface_t*, + const uct_device_addr_t *, const uct_iface_addr_t *); +UCS_CLASS_DEFINE_DELETE_FUNC(uct_cuda_copy_ep_t, uct_ep_t); + + +ucs_status_t uct_cuda_copy_ep_get_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, size_t iovcnt, + uint64_t remote_addr, uct_rkey_t rkey, + uct_completion_t *comp) +{ + ucs_status_t status; + uct_cuda_copy_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_cuda_copy_iface_t); + + uct_cuda_copy_event_desc_t *cuda_event = ucs_mpool_get(&iface->cuda_event_desc); + + status = CUDA_FUNC(cudaMemcpyAsync((void *)remote_addr, iov[0].buffer, iov[0].length, cudaMemcpyDeviceToHost, iface->stream_d2h)); + if (UCS_OK != status) { + ucs_error("cudaMemcpyAsync Failed "); + return UCS_ERR_IO_ERROR; + } + status = CUDA_FUNC(cudaEventRecord(cuda_event->event, iface->stream_d2h)); + if (UCS_OK != status) { + ucs_error("cudaEventRecord Failed "); + return UCS_ERR_IO_ERROR; + } + cuda_event->comp = comp; + + ucs_queue_push(&iface->pending_event_q, &cuda_event->queue); + + ucs_info("cuda async issued :%p buffer:%p len:%ld", cuda_event, iov[0].buffer, iov[0].length); + + return UCS_INPROGRESS; + +} diff --git a/src/uct/cuda/cuda_copy/cuda_copy_ep.h b/src/uct/cuda/cuda_copy/cuda_copy_ep.h new file mode 100644 index 00000000000..9335d3142b0 --- /dev/null +++ b/src/uct/cuda/cuda_copy/cuda_copy_ep.h @@ -0,0 +1,32 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED. +* See file LICENSE for terms. +*/ + +#ifndef UCT_SYSV_EP_H +#define UCT_SYSV_EP_H + +#include +#include +#include + + +typedef struct uct_cuda_copy_ep_addr { + int ep_id; +} uct_cuda_copy_ep_addr_t; + +typedef struct uct_cuda_copy_ep { + uct_base_ep_t super; + struct uct_cuda_copy_ep *next; +} uct_cuda_copy_ep_t; + +UCS_CLASS_DECLARE_NEW_FUNC(uct_cuda_copy_ep_t, uct_ep_t, uct_iface_t*, + const uct_device_addr_t *, const uct_iface_addr_t *); +UCS_CLASS_DECLARE_DELETE_FUNC(uct_cuda_copy_ep_t, uct_ep_t); + +ucs_status_t uct_cuda_copy_ep_get_zcopy(uct_ep_h tl_ep, + const uct_iov_t *iov, size_t iovcnt, + uint64_t remote_addr, uct_rkey_t rkey, + uct_completion_t *comp); + +#endif diff --git a/src/uct/cuda/cuda_copy/cuda_copy_iface.c b/src/uct/cuda/cuda_copy/cuda_copy_iface.c new file mode 100644 index 00000000000..264f0309e56 --- /dev/null +++ b/src/uct/cuda/cuda_copy/cuda_copy_iface.c @@ -0,0 +1,241 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED. + * See file LICENSE for terms. 
+ */ + +#include "cuda_copy_iface.h" +#include "cuda_copy_md.h" +#include "cuda_copy_ep.h" + +#include +#include + + +static ucs_config_field_t uct_cuda_copy_iface_config_table[] = { + + {"", "", NULL, + ucs_offsetof(uct_cuda_copy_iface_config_t, super), + UCS_CONFIG_TYPE_TABLE(uct_iface_config_table)}, + + {NULL} +}; + + +/* Forward declaration for the delete function */ +static void UCS_CLASS_DELETE_FUNC_NAME(uct_cuda_copy_iface_t)(uct_iface_t*); + + +static ucs_status_t uct_cuda_copy_iface_get_address(uct_iface_h tl_iface, + uct_iface_addr_t *iface_addr) +{ + int *cuda_copy_addr = (int*)iface_addr; + *cuda_copy_addr = 0; + return UCS_OK; +} + +static int uct_cuda_copy_iface_is_reachable(const uct_iface_h iface, const uct_device_addr_t *dev_addr, + const uct_iface_addr_t *iface_addr) +{ + return 1; +} + +static ucs_status_t uct_cuda_copy_iface_query(uct_iface_h iface, + uct_iface_attr_t *iface_attr) +{ + memset(iface_attr, 0, sizeof(uct_iface_attr_t)); + + /* FIXME all of these values */ + iface_attr->iface_addr_len = sizeof(int); + iface_attr->device_addr_len = 0; + iface_attr->ep_addr_len = 0; + iface_attr->cap.flags = UCT_IFACE_FLAG_CONNECT_TO_IFACE | + UCT_IFACE_FLAG_GET_ZCOPY | + UCT_IFACE_FLAG_PENDING; + + iface_attr->cap.put.max_short = 0; + iface_attr->cap.put.max_bcopy = 0; + iface_attr->cap.put.min_zcopy = 0; + iface_attr->cap.put.max_zcopy = 0; + iface_attr->cap.put.opt_zcopy_align = 1; + iface_attr->cap.put.align_mtu = iface_attr->cap.put.opt_zcopy_align; + iface_attr->cap.put.max_iov = 1; + + iface_attr->cap.get.max_bcopy = 0; + iface_attr->cap.get.min_zcopy = 0; + iface_attr->cap.get.max_zcopy = 0; + iface_attr->cap.get.opt_zcopy_align = 1; + iface_attr->cap.get.align_mtu = iface_attr->cap.get.opt_zcopy_align; + iface_attr->cap.get.max_iov = 1; + + iface_attr->cap.am.max_short = -1; + iface_attr->cap.am.max_bcopy = 0; + iface_attr->cap.am.min_zcopy = 0; + iface_attr->cap.am.max_zcopy = 0; + iface_attr->cap.am.opt_zcopy_align = 1; + iface_attr->cap.am.align_mtu = iface_attr->cap.am.opt_zcopy_align; + iface_attr->cap.am.max_hdr = 0; + iface_attr->cap.am.max_iov = 1; + + iface_attr->latency.overhead = 10e-6; + iface_attr->latency.growth = 0; + iface_attr->bandwidth = 6911 * 1024.0 * 1024.0; + iface_attr->overhead = 1000; + iface_attr->priority = 0; + + return UCS_OK; +} +static unsigned uct_cuda_copy_iface_progress(uct_iface_h tl_iface) +{ + uct_cuda_copy_iface_t *iface = ucs_derived_of(tl_iface, uct_cuda_copy_iface_t); + unsigned count = 0; + + ucs_queue_iter_t iter; + uct_cuda_copy_event_desc_t *cuda_event; + cudaError_t result = cudaSuccess; + + ucs_queue_for_each_safe(cuda_event, iter, &iface->pending_event_q, queue) { + result = cudaEventQuery(cuda_event->event); + if (cudaSuccess == result) { + ucs_queue_del_iter(&iface->pending_event_q, iter); + cuda_event->comp->func(cuda_event->comp, UCS_OK); + ucs_info("Event Done :%p", cuda_event); + ucs_mpool_put(cuda_event); + count++; + } + } + return count; +} + +static uct_iface_ops_t uct_cuda_copy_iface_ops = { + .ep_get_zcopy = uct_cuda_copy_ep_get_zcopy, + .ep_pending_add = ucs_empty_function_return_busy, + .ep_pending_purge = ucs_empty_function, + .ep_flush = uct_base_ep_flush, + .ep_fence = uct_base_ep_fence, + .ep_create_connected = UCS_CLASS_NEW_FUNC_NAME(uct_cuda_copy_ep_t), + .ep_destroy = UCS_CLASS_DELETE_FUNC_NAME(uct_cuda_copy_ep_t), + .iface_flush = uct_base_iface_flush, + .iface_fence = uct_base_iface_fence, + .iface_progress_enable = ucs_empty_function, + .iface_progress_disable = ucs_empty_function, + 
.iface_progress = uct_cuda_copy_iface_progress, + .iface_close = UCS_CLASS_DELETE_FUNC_NAME(uct_cuda_copy_iface_t), + .iface_query = uct_cuda_copy_iface_query, + .iface_get_device_address = (void*)ucs_empty_function_return_success, + .iface_get_address = uct_cuda_copy_iface_get_address, + .iface_is_reachable = uct_cuda_copy_iface_is_reachable, +}; + +static void uct_cuda_copy_event_desc_init(ucs_mpool_t *mp, void *obj, void *chunk) +{ + uct_cuda_copy_event_desc_t *base = (uct_cuda_copy_event_desc_t *) obj; + ucs_status_t status; + memset(base, 0 , sizeof(*base)); + status = CUDA_FUNC(cudaEventCreateWithFlags(&(base->event), cudaEventDisableTiming)); + if (UCS_OK != status) { + ucs_error("cudaEventCreateWithFlags Failed"); + } +} +static void uct_cuda_copy_event_desc_cleanup(ucs_mpool_t *mp, void *obj) +{ + ucs_status_t status; + uct_cuda_copy_event_desc_t *base = (uct_cuda_copy_event_desc_t *) obj; + status = CUDA_FUNC(cudaEventDestroy(base->event)); + if (UCS_OK != status) { + ucs_error("cudaEventDestroy Failed"); + } +} + +static ucs_mpool_ops_t uct_cuda_copy_event_desc_mpool_ops = { + .chunk_alloc = ucs_mpool_hugetlb_malloc, + .chunk_release = ucs_mpool_hugetlb_free, + .obj_init = uct_cuda_copy_event_desc_init, + .obj_cleanup = uct_cuda_copy_event_desc_cleanup, +}; + +static UCS_CLASS_INIT_FUNC(uct_cuda_copy_iface_t, uct_md_h md, uct_worker_h worker, + const uct_iface_params_t *params, + const uct_iface_config_t *tl_config) +{ + ucs_status_t status; + UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &uct_cuda_copy_iface_ops, md, worker, + params, tl_config UCS_STATS_ARG(params->stats_root) + UCS_STATS_ARG(UCT_CUDA_COPY_TL_NAME)); + + if (strcmp(params->mode.device.dev_name, UCT_CUDA_DEV_NAME) != 0) { + ucs_error("No device was found: %s", params->mode.device.dev_name); + return UCS_ERR_NO_DEVICE; + } + + + status = ucs_mpool_init(&self->cuda_event_desc, + 0, + sizeof(uct_cuda_copy_event_desc_t), + 0, /* alignment offset */ + UCS_SYS_CACHE_LINE_SIZE, /* alignment */ + 128, /* grow */ + 1024, /* max desc */ + &uct_cuda_copy_event_desc_mpool_ops, + "CUDA EVENT objects"); + + if (UCS_OK != status) { + ucs_error("Mpool creation failed"); + return UCS_ERR_IO_ERROR; + } + + status = CUDA_FUNC(cudaStreamCreateWithFlags(&self->stream_d2h, cudaStreamNonBlocking)); + if (UCS_OK != status) { + ucs_error("cudaStreamCreateWithFlags creation failed"); + return UCS_ERR_IO_ERROR; + } + status = CUDA_FUNC(cudaStreamCreateWithFlags(&self->stream_h2d, cudaStreamNonBlocking)); + if (UCS_OK != status) { + ucs_error("cudaStreamCreateWithFlags creation failed"); + return UCS_ERR_IO_ERROR; + } + + return UCS_OK; +} + +static UCS_CLASS_CLEANUP_FUNC(uct_cuda_copy_iface_t) +{ + ucs_mpool_cleanup(&self->cuda_event_desc, 1); +} + +UCS_CLASS_DEFINE(uct_cuda_copy_iface_t, uct_base_iface_t); +UCS_CLASS_DEFINE_NEW_FUNC(uct_cuda_copy_iface_t, uct_iface_t, uct_md_h, uct_worker_h, + const uct_iface_params_t*, const uct_iface_config_t*); +static UCS_CLASS_DEFINE_DELETE_FUNC(uct_cuda_copy_iface_t, uct_iface_t); + + +static ucs_status_t uct_cuda_copy_query_tl_resources(uct_md_h md, + uct_tl_resource_desc_t **resource_p, + unsigned *num_resources_p) +{ + uct_tl_resource_desc_t *resource; + + resource = ucs_calloc(1, sizeof(uct_tl_resource_desc_t), "resource desc"); + if (NULL == resource) { + ucs_error("Failed to allocate memory"); + return UCS_ERR_NO_MEMORY; + } + + ucs_snprintf_zero(resource->tl_name, sizeof(resource->tl_name), "%s", + UCT_CUDA_COPY_TL_NAME); + ucs_snprintf_zero(resource->dev_name, sizeof(resource->dev_name), 
"%s", + UCT_CUDA_DEV_NAME); + resource->dev_type = UCT_DEVICE_TYPE_ACC; + + *num_resources_p = 1; + *resource_p = resource; + return UCS_OK; +} + +UCT_TL_COMPONENT_DEFINE(uct_cuda_copy_tl, + uct_cuda_copy_query_tl_resources, + uct_cuda_copy_iface_t, + UCT_CUDA_COPY_TL_NAME, + "CUDA_COPY_", + uct_cuda_copy_iface_config_table, + uct_cuda_copy_iface_config_t); +UCT_MD_REGISTER_TL(&uct_cuda_copy_md_component, &uct_cuda_copy_tl); diff --git a/src/uct/cuda/cuda_copy/cuda_copy_iface.h b/src/uct/cuda/cuda_copy/cuda_copy_iface.h new file mode 100644 index 00000000000..9bca99cd249 --- /dev/null +++ b/src/uct/cuda/cuda_copy/cuda_copy_iface.h @@ -0,0 +1,50 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ + +#ifndef UCT_CUDA_IFACE_H +#define UCT_CUDA_IFACE_H + +#include +#include +#include +#include + + +#define UCT_CUDA_COPY_TL_NAME "cuda_copy" +#define UCT_CUDA_DEV_NAME "cudacopy0" + +#define CUDA_FUNC(func) ({ \ +ucs_status_t _status = UCS_OK; \ +do { \ + CUresult _result = (func); \ + if (CUDA_SUCCESS != _result) { \ + ucs_error("[%s:%d] cuda failed with %d \n", \ + __FILE__, __LINE__,_result); \ + _status = UCS_ERR_IO_ERROR; \ + } \ +} while (0); \ +_status; \ +}) + +typedef struct uct_cuda_copy_iface { + uct_base_iface_t super; + ucs_mpool_t cuda_event_desc; + ucs_queue_head_t pending_event_q; + cudaStream_t stream_d2h; + cudaStream_t stream_h2d; +} uct_cuda_copy_iface_t; + + +typedef struct uct_cuda_copy_iface_config { + uct_iface_config_t super; +} uct_cuda_copy_iface_config_t; + +typedef struct uct_cuda_copy_event_desc { + cudaEvent_t event; + uct_completion_t *comp; + ucs_queue_elem_t queue; +} uct_cuda_copy_event_desc_t; + +#endif diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c new file mode 100644 index 00000000000..157107c784e --- /dev/null +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -0,0 +1,163 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED. + * See file LICENSE for terms. 
+ */ + +#include "cuda_copy_md.h" + +#include +#include +#include +#include +#include +#include +#include +#include + + +static ucs_config_field_t uct_cuda_copy_md_config_table[] = { + {"", "", NULL, + ucs_offsetof(uct_cuda_copy_md_config_t, super), UCS_CONFIG_TYPE_TABLE(uct_md_config_table)}, + + {NULL} +}; + +static ucs_status_t uct_cuda_copy_md_query(uct_md_h md, uct_md_attr_t *md_attr) +{ + md_attr->cap.flags = UCT_MD_FLAG_REG; + md_attr->cap.mem_type = UCT_MD_MEM_TYPE_CUDA; + md_attr->cap.max_alloc = 0; + md_attr->cap.max_reg = ULONG_MAX; + md_attr->rkey_packed_size = 0; + md_attr->reg_cost.overhead = 0; + md_attr->reg_cost.growth = 0; + memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus)); + return UCS_OK; +} + +static ucs_status_t uct_cuda_copy_mkey_pack(uct_md_h md, uct_mem_h memh, + void *rkey_buffer) +{ + return UCS_OK; +} + +static ucs_status_t uct_cuda_copy_rkey_unpack(uct_md_component_t *mdc, + const void *rkey_buffer, uct_rkey_t *rkey_p, + void **handle_p) +{ + *rkey_p = 0xdeadbeef; + *handle_p = NULL; + return UCS_OK; +} + +static ucs_status_t uct_cuda_copy_rkey_release(uct_md_component_t *mdc, uct_rkey_t rkey, + void *handle) +{ + return UCS_OK; +} + +static ucs_status_t uct_cuda_copy_mem_reg(uct_md_h md, void *address, size_t length, + unsigned flags, uct_mem_h *memh_p) +{ + cudaError_t cuerr = cudaSuccess; + + if(address == NULL) { + *memh_p = address; + return UCS_OK; + } + + cuerr = cudaHostRegister(address, length, cudaHostRegisterPortable); + if (cuerr != cudaSuccess) { + return UCS_ERR_IO_ERROR; + } + + *memh_p = address; + return UCS_OK; +} + +static ucs_status_t uct_cuda_copy_mem_dereg(uct_md_h md, uct_mem_h memh) +{ + void *address = (void *)memh; + cudaError_t cuerr = cudaSuccess; + if (address == NULL) { + return UCS_OK; + } + cuerr = cudaHostUnregister(address); + if (cuerr != cudaSuccess) { + return UCS_ERR_IO_ERROR; + } + + return UCS_OK; +} + +static ucs_status_t uct_cuda_copy_mem_type_detect(uct_md_h md, void *addr) +{ + int memory_type; + cudaError_t cuda_err = cudaSuccess; + struct cudaPointerAttributes attributes; + CUresult cu_err = CUDA_SUCCESS; + + if (addr == NULL) { + return UCS_ERR_INVALID_ADDR; + } + + cu_err = cuPointerGetAttribute(&memory_type, + CU_POINTER_ATTRIBUTE_MEMORY_TYPE, + (CUdeviceptr)addr); + if (cu_err != CUDA_SUCCESS) { + cuda_err = cudaPointerGetAttributes (&attributes, addr); + if (cuda_err == cudaSuccess) { + if (attributes.memoryType == cudaMemoryTypeDevice) { + return UCS_OK; + } + } + } else if (memory_type == CU_MEMORYTYPE_DEVICE) { + return UCS_OK; + } + return UCS_ERR_INVALID_ADDR; +} + +static ucs_status_t uct_cuda_copy_query_md_resources(uct_md_resource_desc_t **resources_p, + unsigned *num_resources_p) +{ + return uct_single_md_resource(&uct_cuda_copy_md_component, resources_p, num_resources_p); +} + +static void uct_cuda_copy_md_close(uct_md_h uct_md) { + uct_cuda_copy_md_t *md = ucs_derived_of(uct_md, uct_cuda_copy_md_t); + ucs_free(md); + +} +static uct_md_ops_t md_ops = { + .close = uct_cuda_copy_md_close, + .query = uct_cuda_copy_md_query, + .mkey_pack = uct_cuda_copy_mkey_pack, + .mem_reg = uct_cuda_copy_mem_reg, + .mem_dereg = uct_cuda_copy_mem_dereg, + .mem_type_detect = uct_cuda_copy_mem_type_detect, +}; + +static ucs_status_t uct_cuda_copy_md_open(const char *md_name, const uct_md_config_t *uct_md_config, + uct_md_h *md_p) +{ + uct_cuda_copy_md_t *md; + // ucs_status_t status; + // const uct_cuda_copy_md_config_t *md_config = ucs_derived_of(uct_md_config, uct_cuda_copy_md_config_t); + + md = 
ucs_malloc(sizeof(uct_cuda_copy_md_t), "uct_cuda_copy_md_t"); + if (NULL == md) { + ucs_error("Failed to allocate memory for uct_cuda_copy_md_t"); + return UCS_ERR_NO_MEMORY; + } + + md->super.ops = &md_ops; + md->super.component = &uct_cuda_copy_md_component; + + *md_p = (uct_md_h) md; + return UCS_OK; +} + +UCT_MD_COMPONENT_DEFINE(uct_cuda_copy_md_component, UCT_CUDA_COPY_MD_NAME, + uct_cuda_copy_query_md_resources, uct_cuda_copy_md_open, NULL, + uct_cuda_copy_rkey_unpack, uct_cuda_copy_rkey_release, "CUDA_COPY_", + uct_cuda_copy_md_config_table, uct_cuda_copy_md_config_t); diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.h b/src/uct/cuda/cuda_copy/cuda_copy_md.h new file mode 100644 index 00000000000..fedba9d012e --- /dev/null +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.h @@ -0,0 +1,30 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ + +#ifndef UCT_CUDA_COPY_H +#define UCT_CUDA_COPY_H + +#include + +#define UCT_CUDA_COPY_MD_NAME "cuda_cpy" + +extern uct_md_component_t uct_cuda_copy_md_component; + +/** + * @brief cuda_copy MD descriptor + */ +typedef struct uct_cuda_copy_md { + struct uct_md super; /**< Domain info */ +} uct_cuda_copy_md_t; + +/** + * gdr copy domain configuration. + */ +typedef struct uct_cuda_copy_md_config { + uct_md_config_t super; + +} uct_cuda_copy_md_config_t; + +#endif From 71fa1ed5bc3e792d405817afca5af5330ec5ba64 Mon Sep 17 00:00:00 2001 From: Devendar Bureddy Date: Tue, 26 Sep 2017 15:34:43 -0700 Subject: [PATCH 14/16] CUDA: mpool for pipeline staging for domain memory --- src/ucp/core/ucp_context.c | 4 ++ src/ucp/core/ucp_context.h | 2 + src/ucp/core/ucp_worker.c | 102 +++++++++++++++++++++++++++++++++++++ src/ucp/core/ucp_worker.h | 1 + 4 files changed, 109 insertions(+) diff --git a/src/ucp/core/ucp_context.c b/src/ucp/core/ucp_context.c index 298f55d7279..82e7acaafaf 100644 --- a/src/ucp/core/ucp_context.c +++ b/src/ucp/core/ucp_context.c @@ -150,6 +150,10 @@ static ucs_config_field_t ucp_config_table[] = { "Also the value has to be bigger than UCX_TM_THRESH to take an effect." 
, ucs_offsetof(ucp_config_t, ctx.tm_max_bcopy), UCS_CONFIG_TYPE_MEMUNITS}, + {"RNDV_FRAG_SIZE", "65536", + "RNDV fragment size \n", + ucs_offsetof(ucp_config_t, ctx.rndv_frag_size), UCS_CONFIG_TYPE_MEMUNITS}, + {NULL} }; diff --git a/src/ucp/core/ucp_context.h b/src/ucp/core/ucp_context.h index 827f486ef87..4e848c2c618 100644 --- a/src/ucp/core/ucp_context.h +++ b/src/ucp/core/ucp_context.h @@ -51,6 +51,8 @@ typedef struct ucp_context_config { ucp_atomic_mode_t atomic_mode; /** If use mutex for MT support or not */ int use_mt_mutex; + /** RNDV pipeline fragment size */ + size_t rndv_frag_size; /** On-demand progress */ int adaptive_progress; } ucp_context_config_t; diff --git a/src/ucp/core/ucp_worker.c b/src/ucp/core/ucp_worker.c index 70c1e27e7ad..2daad2ad000 100644 --- a/src/ucp/core/ucp_worker.c +++ b/src/ucp/core/ucp_worker.c @@ -36,6 +36,88 @@ static ucs_stats_class_t ucp_worker_stats_class = { #endif +static ucs_status_t ucp_mpool_dereg_mds(ucp_context_h context, ucp_mem_h memh) { + unsigned md_index, uct_index; + ucs_status_t status; + + uct_index = 0; + + for (md_index = 0; md_index < context->num_mds; ++md_index) { + if (!(memh->md_map & UCS_BIT(md_index))) { + continue; + } + + status = uct_md_mem_dereg(context->tl_mds[md_index].md, + memh->uct[uct_index]); + if (status != UCS_OK) { + ucs_error("Failed to dereg address %p with md %s", memh->address, + context->tl_mds[md_index].rsc.md_name); + return status; + } + + ++uct_index; + } + + return UCS_OK; +} + +static ucs_status_t ucp_mpool_reg_mds(ucp_context_h context, ucp_mem_h memh) { + unsigned md_index, uct_memh_count; + ucs_status_t status; + + uct_memh_count = 0; + memh->md_map = 0; + + for (md_index = 0; md_index < context->num_mds; ++md_index) { + if (context->tl_mds[md_index].attr.cap.flags & UCT_MD_FLAG_REG) { + status = uct_md_mem_reg(context->tl_mds[md_index].md, memh->address, + memh->length, 0, memh->uct[uct_memh_count]); + if (status != UCS_OK) { + ucs_error("Failed to register memory pool chunk %p with md %s", + memh->address, context->tl_mds[md_index].rsc.md_name); + return status; + } + + memh->md_map |= UCS_BIT(md_index); + uct_memh_count++; + } + } + + return UCS_OK; +} + + +static ucs_status_t ucp_mpool_rndv_malloc(ucs_mpool_t *mp, size_t *size_p, void **chunk_p) { + ucp_worker_h worker = ucs_container_of(mp, ucp_worker_t, reg_mp); + ucp_mem_desc_t *chunk_hdr; + ucs_status_t status; + + status = ucp_mpool_malloc(mp, size_p, chunk_p); + if (status != UCS_OK) { + ucs_error("Failed to allocate memory pool chunk: %s", ucs_status_string(status)); + return UCS_ERR_NO_MEMORY; + } + + chunk_hdr = (ucp_mem_desc_t *)(*chunk_p) - 1; + + status = ucp_mpool_reg_mds(worker->context, chunk_hdr->memh); + if (status != UCS_OK) { + ucp_mpool_dereg_mds(worker->context, chunk_hdr->memh); + return status; + } + + return UCS_OK; +} + + +static void ucp_mpool_rndv_free(ucs_mpool_t *mp, void *chunk) { + ucp_worker_h worker = ucs_container_of(mp, ucp_worker_t, reg_mp); + ucp_mem_desc_t *chunk_hdr = (ucp_mem_desc_t *)chunk - 1; + ucp_mpool_dereg_mds(worker->context, chunk_hdr->memh); + ucp_mpool_free(mp, chunk); +} + + ucs_mpool_ops_t ucp_am_mpool_ops = { .chunk_alloc = ucs_mpool_hugetlb_malloc, .chunk_release = ucs_mpool_hugetlb_free, @@ -52,6 +134,14 @@ ucs_mpool_ops_t ucp_reg_mpool_ops = { }; +ucs_mpool_ops_t ucp_rndv_frag_mpool_ops = { + .chunk_alloc = ucp_mpool_rndv_malloc, + .chunk_release = ucp_mpool_rndv_free, + .obj_init = ucs_empty_function, + .obj_cleanup = ucs_empty_function +}; + + void 
 ucp_worker_iface_check_events(ucp_worker_iface_t *wiface, int force);
@@ -917,8 +1007,19 @@ static ucs_status_t ucp_worker_init_mpools(ucp_worker_h worker,
         goto err_release_am_mpool;
     }

+
+    status = ucs_mpool_init(&worker->rndv_frag_mp, 0,
+                            context->config.ext.rndv_frag_size,
+                            0, 128, 128, UINT_MAX,
+                            &ucp_rndv_frag_mpool_ops, "ucp_rndv_frags");
+    if (status != UCS_OK) {
+        goto err_release_reg_mpool;
+    }
+
     return UCS_OK;

+err_release_reg_mpool:
+    ucs_mpool_cleanup(&worker->reg_mp, 0);
 err_release_am_mpool:
     ucs_mpool_cleanup(&worker->am_mp, 0);
 out:
@@ -1140,6 +1241,7 @@ void ucp_worker_destroy(ucp_worker_h worker)
     ucp_worker_destroy_eps(worker);
     ucs_mpool_cleanup(&worker->am_mp, 1);
     ucs_mpool_cleanup(&worker->reg_mp, 1);
+    ucs_mpool_cleanup(&worker->rndv_frag_mp, 1);
     ucp_worker_close_ifaces(worker);
     ucp_worker_wakeup_cleanup(worker);
     ucs_mpool_cleanup(&worker->req_mp, 1);
diff --git a/src/ucp/core/ucp_worker.h b/src/ucp/core/ucp_worker.h
index dba1dca46ca..c8b370a68da 100644
--- a/src/ucp/core/ucp_worker.h
+++ b/src/ucp/core/ucp_worker.h
@@ -147,6 +147,7 @@ typedef struct ucp_worker {
     ucs_mpool_t      am_mp;        /* Memory pool for AM receives */
     ucs_mpool_t      reg_mp;       /* Registered memory pool */
     ucp_mt_lock_t    mt_lock;      /* Configuration of multi-threading support */
+    ucs_mpool_t      rndv_frag_mp; /* Memory pool for RNDV fragments */

     UCS_STATS_NODE_DECLARE(stats);

From ac2cc56fdd9d9c0256a3a390423aadeb93c6f87a Mon Sep 17 00:00:00 2001
From: Devendar Bureddy
Date: Tue, 26 Sep 2017 15:33:47 -0700
Subject: [PATCH 15/16] CUDA: protocol changes for eager support for domain
 memory

---
 src/ucp/core/ucp_ep.c    |  19 ++++
 src/ucp/core/ucp_ep.h    |  25 ++++-
 src/ucp/core/ucp_mm.h    |   1 +
 src/ucp/dt/dt.c          | 121 ++++++++++++++++++++++++
 src/ucp/dt/dt.h          |  50 +---------
 src/ucp/tag/eager.h      |   4 +-
 src/ucp/tag/eager_rcv.c  |   2 +-
 src/ucp/tag/offload.c    |   2 +-
 src/ucp/tag/rndv.c       |   8 +-
 src/ucp/tag/tag_recv.c   |   4 +-
 src/ucp/tag/tag_send.c   |  27 ++++--
 src/ucp/wireup/address.c |   2 +-
 src/ucp/wireup/select.c  | 192 ++++++++++++++++++++++++++++++++++++++-
 13 files changed, 389 insertions(+), 68 deletions(-)

diff --git a/src/ucp/core/ucp_ep.c b/src/ucp/core/ucp_ep.c
index f378a95a212..d9bd15404a2 100644
--- a/src/ucp/core/ucp_ep.c
+++ b/src/ucp/core/ucp_ep.c
@@ -899,6 +899,7 @@ void ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config)
 {
     ucp_context_h context = worker->context;
     ucp_ep_rma_config_t *rma_config;
+    ucp_ep_addr_domain_config_t *domain_config;
     uct_iface_attr_t *iface_attr;
     uct_md_attr_t *md_attr;
     ucp_rsc_index_t rsc_index;
@@ -917,6 +918,7 @@ void ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config)
     config->tag.eager.zcopy_auto_thresh = 0;
     config->am.zcopy_auto_thresh        = 0;
     config->p2p_lanes                   = 0;
+    config->domain_lanes                = 0;
     config->bcopy_thresh                = context->config.ext.bcopy_thresh;
     config->tag.lane                    = UCP_NULL_LANE;
     config->tag.proto                   = &ucp_tag_eager_proto;
@@ -1004,6 +1006,23 @@ void ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config)
         }
     }

+    /* Configuration for memory domains */
+    for (lane = 0; lane < config->key.num_lanes; ++lane) {
+        if (config->key.domain_lanes[lane] == UCP_NULL_LANE) {
+            continue;
+        }
+        config->domain_lanes |= UCS_BIT(lane);
+
+        domain_config = &config->domain[lane];
+        rsc_index     = config->key.lanes[lane].rsc_index;
+        iface_attr    = &worker->ifaces[rsc_index].attr;
+
+        domain_config->tag.eager.max_short = iface_attr->cap.am.max_short;
+        /* TODO: the zcopy threshold should be based on the ep AM lane capability with domain addresses (i.e. can UCT do zcopy from the domain) */
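+        /* a zero threshold sends every eager message from domain memory down
+         * the zcopy path, since the bcopy branch tests length < threshold */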
+        memset(domain_config->tag.eager.zcopy_thresh, 0, UCP_MAX_IOV * sizeof(size_t));
+
+    }
+
     /* Configuration for remote memory access */
     for (lane = 0; lane < config->key.num_lanes; ++lane) {
         if (ucp_ep_config_get_rma_prio(config->key.rma_lanes, lane) == -1) {
diff --git a/src/ucp/core/ucp_ep.h b/src/ucp/core/ucp_ep.h
index f6fd896ee1b..4dd92a1b678 100644
--- a/src/ucp/core/ucp_ep.h
+++ b/src/ucp/core/ucp_ep.h
@@ -81,6 +81,9 @@ typedef struct ucp_ep_config_key {
     /* Lanes for atomic operations, sorted by priority, highest first */
     ucp_lane_index_t amo_lanes[UCP_MAX_LANES];

+    /* Lanes for domain operations, sorted by priority, highest first */
+    ucp_lane_index_t domain_lanes[UCP_MAX_LANES];
+
     /* Bitmap of remote mds which are reachable from this endpoint (with any set
      * of transports which could be selected in the future).
      */
@@ -106,6 +109,15 @@ typedef struct ucp_ep_rma_config {
 } ucp_ep_rma_config_t;

+typedef struct ucp_ep_addr_domain_config {
+    struct {
+        struct {
+            ssize_t max_short;
+            size_t  zcopy_thresh[UCP_MAX_IOV];
+        } eager;
+    } tag;
+} ucp_ep_addr_domain_config_t;
+
 /*
  * Configuration for AM and tag offload protocols
  */
@@ -136,6 +148,10 @@ typedef struct ucp_ep_config {
      */
     ucp_lane_map_t p2p_lanes;

+    /* Bitmap of which lanes are domain lanes
+     */
+    ucp_lane_map_t domain_lanes;
+
     /* Configuration for each lane that provides RMA */
     ucp_ep_rma_config_t rma[UCP_MAX_LANES];
     /* Threshold for switching from put_short to put_bcopy */
@@ -179,8 +195,11 @@ typedef struct ucp_ep_config {
          * (currently it's only AM based). */
         const ucp_proto_t   *proto;
     } stream;
-} ucp_ep_config_t;

+    /* Configuration of all domains */
+    ucp_ep_addr_domain_config_t domain[UCP_MAX_LANES];
+
+} ucp_ep_config_t;

 /**
  * Remote protocol layer endpoint
@@ -245,4 +264,8 @@ size_t ucp_ep_config_get_zcopy_auto_thresh(size_t iovcnt,
                                            const ucp_context_h context,
                                            double bandwidth);

+ucp_lane_index_t ucp_config_find_domain_lane(const ucp_ep_config_t *config,
+                                             const ucp_lane_index_t *lanes,
+                                             ucp_md_map_t dn_md_map);
+ucs_status_t ucp_ep_set_domain_lanes(ucp_ep_h ep, ucp_mem_type_h mem_type_h);
 #endif
diff --git a/src/ucp/core/ucp_mm.h b/src/ucp/core/ucp_mm.h
index 82f035f3efa..c08c281f6fd 100644
--- a/src/ucp/core/ucp_mm.h
+++ b/src/ucp/core/ucp_mm.h
@@ -71,6 +71,7 @@ typedef struct ucp_mem_desc {
 typedef struct ucp_mem_type {
     ucp_md_map_t      md_map;     /* Which MDs own this addr domain */
     uct_memory_type_t id;         /* memory type */
+    ucp_lane_index_t  eager_lane; /* lane used for eager sends to this memory type */
 } ucp_mem_type_t;

 void ucp_rkey_resolve_inner(ucp_rkey_h rkey, ucp_ep_h ep);
diff --git a/src/ucp/dt/dt.c b/src/ucp/dt/dt.c
index 418b9775446..44d8b88ec72 100644
--- a/src/ucp/dt/dt.c
+++ b/src/ucp/dt/dt.c
@@ -5,6 +5,7 @@
  */

 #include "dt.h"
+#include

 size_t ucp_dt_pack(ucp_datatype_t datatype, void *dest, const void *src,
@@ -44,3 +45,123 @@ size_t ucp_dt_pack(ucp_datatype_t datatype, void *dest, const void *src,
     state->offset += result_len;
     return result_len;
 }
+
+static UCS_F_ALWAYS_INLINE ucs_status_t ucp_dn_dt_unpack(ucp_request_t *req, void *buffer, size_t buffer_size,
+                                                         const void *recv_data, size_t recv_length)
+{
+    ucs_status_t status;
+    ucp_worker_h worker     = req->recv.worker;
+    ucp_context_h context   = worker->context;
+    ucp_ep_h ep             = ucp_worker_ep_find(worker, worker->uuid);
+    ucp_ep_config_t *config = ucp_ep_config(ep);
+    ucp_md_map_t dn_md_map  = req->mem_type.md_map;
+    ucp_lane_index_t dn_lane;
+    ucp_rsc_index_t rsc_index;
+    uct_iface_attr_t *iface_attr;
+    unsigned md_index;
+    uct_mem_h memh;
+    uct_iov_t iov;
+
+    if (recv_length == 0) {
+        return UCS_OK;
+    }
+
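+    /* loopback flow: find a domain lane whose destination MD owns the buffer
+     * and supports PUT zcopy, then PUT the received data into the domain
+     * buffer through an ep to ourselves; MDs with no such lane are dropped
+     * from the search map and the lookup is retried */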
+    while (1) {
+        dn_lane = ucp_config_find_domain_lane(config, config->key.domain_lanes, dn_md_map);
+        if (dn_lane == UCP_NULL_LANE) {
+            ucs_error("cannot find address domain lane");
+            return UCS_ERR_IO_ERROR;
+        }
+        rsc_index  = ucp_ep_get_rsc_index(ep, dn_lane);
+        iface_attr = &worker->ifaces[rsc_index].attr;
+        md_index   = config->key.lanes[dn_lane].dst_md_index;
+        if (!(iface_attr->cap.flags & UCT_IFACE_FLAG_PUT_ZCOPY)) {
+            dn_md_map &= ~UCS_BIT(md_index);
+            continue;
+        }
+        break;
+    }
+
+    status = uct_md_mem_reg(context->tl_mds[md_index].md, buffer, buffer_size,
+                            UCT_MD_MEM_ACCESS_REMOTE_PUT, &memh);
+    if (status != UCS_OK) {
+        ucs_error("Failed to reg address %p with md %s", buffer,
+                  context->tl_mds[md_index].rsc.md_name);
+        return status;
+    }
+
+    ucs_assert(buffer_size >= recv_length);
+    iov.buffer = (void *)recv_data;
+    iov.length = recv_length;
+    iov.count  = 1;
+    iov.memh   = UCT_MEM_HANDLE_NULL;
+
+    /* on the loopback ep the local registration is passed as the remote key */
+    status = uct_ep_put_zcopy(ep->uct_eps[dn_lane], &iov, 1, (uint64_t)buffer,
+                              (uct_rkey_t)memh, NULL);
+    if (status != UCS_OK) {
+        uct_md_mem_dereg(context->tl_mds[md_index].md, memh);
+        ucs_error("Failed to perform uct_ep_put_zcopy to address %p", recv_data);
+        return status;
+    }
+
+    status = uct_md_mem_dereg(context->tl_mds[md_index].md, memh);
+    if (status != UCS_OK) {
+        ucs_error("Failed to dereg address %p with md %s", buffer,
+                  context->tl_mds[md_index].rsc.md_name);
+        return status;
+    }
+
+    return UCS_OK;
+}
+
+ucs_status_t ucp_dt_unpack(ucp_request_t *req, ucp_datatype_t datatype, void *buffer, size_t buffer_size,
+                           ucp_dt_state_t *state, const void *recv_data, size_t recv_length, int last)
+{
+    ucp_dt_generic_t *dt_gen;
+    size_t offset = state->offset;
+    ucs_status_t status;
+
+    if (ucs_unlikely((recv_length + offset) > buffer_size)) {
+        ucs_trace_req("message truncated: recv_length %zu offset %zu buffer_size %zu",
+                      recv_length, offset, buffer_size);
+        if (UCP_DT_IS_GENERIC(datatype) && last) {
+            ucp_dt_generic(datatype)->ops.finish(state->dt.generic.state);
+        }
+        return UCS_ERR_MESSAGE_TRUNCATED;
+    }
+
+    switch (datatype & UCP_DATATYPE_CLASS_MASK) {
+    case UCP_DATATYPE_CONTIG:
+        if (ucs_likely(UCP_IS_DEFAULT_MEMORY_TYPE(req->mem_type.id))) {
+            UCS_PROFILE_NAMED_CALL("memcpy_recv", memcpy, buffer + offset,
+                                   recv_data, recv_length);
+            return UCS_OK;
+        } else {
+            return ucp_dn_dt_unpack(req, buffer, buffer_size, recv_data, recv_length);
+        }
+
+    case UCP_DATATYPE_IOV:
+        UCS_PROFILE_CALL(ucp_dt_iov_scatter, buffer, state->dt.iov.iovcnt,
+                         recv_data, recv_length, &state->dt.iov.iov_offset,
+                         &state->dt.iov.iovcnt_offset);
+        return UCS_OK;
+
+    case UCP_DATATYPE_GENERIC:
+        dt_gen = ucp_dt_generic(datatype);
+        status = UCS_PROFILE_NAMED_CALL("dt_unpack", dt_gen->ops.unpack,
+                                        state->dt.generic.state, offset,
+                                        recv_data, recv_length);
+        if (last) {
+            UCS_PROFILE_NAMED_CALL_VOID("dt_finish", dt_gen->ops.finish,
+                                        state->dt.generic.state);
+        }
+        return status;
+
+    default:
+        ucs_error("unexpected datatype=%lx", datatype);
+        return UCS_ERR_INVALID_PARAM;
+    }
+}
diff --git a/src/ucp/dt/dt.h b/src/ucp/dt/dt.h
index f35b03f75b3..82c83a84ce1 100644
--- a/src/ucp/dt/dt.h
+++ b/src/ucp/dt/dt.h
@@ -15,6 +15,7 @@
 #include
 #include
 #include
+#include

 /**
@@ -72,51 +73,8 @@ size_t ucp_dt_length(ucp_datatype_t datatype, size_t count,
 size_t ucp_dt_pack(ucp_datatype_t datatype, void *dest, const void *src,
                    ucp_dt_state_t *state, size_t length);

-static UCS_F_ALWAYS_INLINE ucs_status_t
-ucp_dt_unpack(ucp_datatype_t datatype, void *buffer, size_t buffer_size,
-              ucp_dt_state_t *state, const void *recv_data,
-              size_t recv_length, int last)
-{
-    ucp_dt_generic_t *dt_gen;
-    size_t offset = state->offset;
-    ucs_status_t status;
-
-    if (ucs_unlikely((recv_length + offset) > buffer_size)) {
-        ucs_trace_req("message truncated: recv_length %zu offset %zu buffer_size %zu",
-                      recv_length, offset, buffer_size);
-        if (UCP_DT_IS_GENERIC(datatype) && last) {
-            ucp_dt_generic(datatype)->ops.finish(state->dt.generic.state);
-        }
-        return UCS_ERR_MESSAGE_TRUNCATED;
-    }
-
-    switch (datatype & UCP_DATATYPE_CLASS_MASK) {
-    case UCP_DATATYPE_CONTIG:
-        UCS_PROFILE_NAMED_CALL("memcpy_recv", memcpy, buffer + offset,
-                               recv_data, recv_length);
-        return UCS_OK;
-
-    case UCP_DATATYPE_IOV:
-        UCS_PROFILE_CALL(ucp_dt_iov_scatter, buffer, state->dt.iov.iovcnt,
-                         recv_data, recv_length, &state->dt.iov.iov_offset,
-                         &state->dt.iov.iovcnt_offset);
-        return UCS_OK;
-
-    case UCP_DATATYPE_GENERIC:
-        dt_gen = ucp_dt_generic(datatype);
-        status = UCS_PROFILE_NAMED_CALL("dt_unpack", dt_gen->ops.unpack,
-                                        state->dt.generic.state, offset,
-                                        recv_data, recv_length);
-        if (last) {
-            UCS_PROFILE_NAMED_CALL_VOID("dt_finish", dt_gen->ops.finish,
-                                        state->dt.generic.state);
-        }
-        return status;
-
-    default:
-        ucs_error("unexpected datatype=%lx", datatype);
-        return UCS_ERR_INVALID_PARAM;
-    }
-}
+ucs_status_t ucp_dt_unpack(ucp_request_t *req, ucp_datatype_t datatype,
+                           void *buffer, size_t buffer_size, ucp_dt_state_t *state,
+                           const void *recv_data, size_t recv_length, int last);

 #endif
diff --git a/src/ucp/tag/eager.h b/src/ucp/tag/eager.h
index 94202c2f477..2e1580cc326 100644
--- a/src/ucp/tag/eager.h
+++ b/src/ucp/tag/eager.h
@@ -101,7 +101,7 @@ static UCS_F_ALWAYS_INLINE ucs_status_t
 ucp_eager_unexp_match(ucp_worker_h worker, ucp_recv_desc_t *rdesc, ucp_tag_t tag,
                       unsigned flags, void *buffer, size_t count,
                       ucp_datatype_t datatype, ucp_dt_state_t *state,
-                      ucp_tag_recv_info_t *info)
+                      ucp_request_t *req, ucp_tag_recv_info_t *info)
 {
     size_t recv_len, hdr_len;
     ucs_status_t status;
@@ -110,7 +110,7 @@ ucp_eager_unexp_match(ucp_worker_h worker, ucp_recv_desc_t *rdesc, ucp_tag_t tag
     UCP_WORKER_STAT_EAGER_CHUNK(worker, UNEXP);
     hdr_len  = rdesc->hdr_len;
     recv_len = rdesc->length - hdr_len;
-    status   = ucp_dt_unpack(datatype, buffer, count, state, data + hdr_len,
+    status   = ucp_dt_unpack(req, datatype, buffer, count, state, data + hdr_len,
                              recv_len, flags & UCP_RECV_DESC_FLAG_LAST);
     state->offset += recv_len;

diff --git a/src/ucp/tag/eager_rcv.c b/src/ucp/tag/eager_rcv.c
index 742c43e18a0..1a2138aee4c 100644
--- a/src/ucp/tag/eager_rcv.c
+++ b/src/ucp/tag/eager_rcv.c
@@ -71,7 +71,7 @@ ucp_eager_handler(void *arg, void *data, size_t length, unsigned am_flags,
     if (req != NULL) {
         UCS_PROFILE_REQUEST_EVENT(req, "eager_recv", recv_len);

-        status = ucp_dt_unpack(req->recv.datatype, req->recv.buffer,
+        status = ucp_dt_unpack(req, req->recv.datatype, req->recv.buffer,
                                req->recv.length, &req->recv.state,
                                data + hdr_len, recv_len,
                                flags & UCP_RECV_DESC_FLAG_LAST);
diff --git a/src/ucp/tag/offload.c b/src/ucp/tag/offload.c
index 91142df2d6d..550c4f30e1e 100644
--- a/src/ucp/tag/offload.c
+++ b/src/ucp/tag/offload.c
@@ -64,7 +64,7 @@ void ucp_tag_offload_completed(uct_tag_context_t *self, uct_tag_t stag,
     }

     if (req->recv.rdesc != NULL) {
-        status = ucp_dt_unpack(req->recv.datatype, req->recv.buffer, req->recv.length,
+        status = ucp_dt_unpack(req, req->recv.datatype, req->recv.buffer, req->recv.length,
                                &req->recv.state, req->recv.rdesc + 1, length, 1);
         ucs_mpool_put_inline(req->recv.rdesc);
     } else {
diff --git a/src/ucp/tag/rndv.c b/src/ucp/tag/rndv.c
index 548d3c4c10a..05cad2ce386 100644
--- a/src/ucp/tag/rndv.c
+++ b/src/ucp/tag/rndv.c
@@ -731,7 +731,7 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_data_handler,
     }

     UCS_PROFILE_REQUEST_EVENT(rreq, "rndv_data_recv", recv_len);
-    status = ucp_dt_unpack(rreq->recv.datatype, rreq->recv.buffer,
+    status = ucp_dt_unpack(rreq, rreq->recv.datatype, rreq->recv.buffer,
                            rreq->recv.length, &rreq->recv.state,
                            data + hdr_len, recv_len, 0);
     if ((status == UCS_OK) || (status == UCS_INPROGRESS)) {
@@ -764,9 +764,9 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_data_last_handler,
         /* Check that total received length matches RTS->length */
         ucs_assert(rreq->recv.info.length == rreq->recv.state.offset + recv_len);
         UCS_PROFILE_REQUEST_EVENT(rreq, "rndv_data_last_recv", recv_len);
-        status = ucp_dt_unpack(rreq->recv.datatype, rreq->recv.buffer,
-                               rreq->recv.length, &rreq->recv.state,
-                               data + hdr_len, recv_len, 1);
+        status = ucp_dt_unpack(rreq, rreq->recv.datatype, rreq->recv.buffer,
+                               rreq->recv.length, &rreq->recv.state,
+                               data + hdr_len, recv_len, 1);
     } else {
         ucs_trace_data("drop last segment for rreq %p, length %zu, status %s",
                        rreq, recv_len, ucs_status_string(rreq->status));
diff --git a/src/ucp/tag/tag_recv.c b/src/ucp/tag/tag_recv.c
index e9413eefd11..14d772a8cdf 100644
--- a/src/ucp/tag/tag_recv.c
+++ b/src/ucp/tag/tag_recv.c
@@ -85,7 +85,7 @@ ucp_tag_search_unexp(ucp_worker_h worker, void *buffer, size_t buffer_size,
             UCS_PROFILE_REQUEST_EVENT(req, "eager_match", 0);
             status = ucp_eager_unexp_match(worker, rdesc, recv_tag, flags,
                                            buffer, buffer_size, datatype,
-                                           &req->recv.state, info);
+                                           &req->recv.state, req, info);
             ucs_trace_req("release receive descriptor %p", rdesc);
             if (status != UCS_INPROGRESS) {
                 goto out_release_desc;
@@ -128,6 +128,8 @@ ucp_tag_recv_request_init(ucp_request_t *req, ucp_worker_h worker, void* buffer,
     req->recv.state.offset = 0;
     req->recv.worker       = worker;

+    ucp_addr_domain_detect_mds(worker->context, buffer, &req->mem_type);
+
     switch (datatype & UCP_DATATYPE_CLASS_MASK) {
     case UCP_DATATYPE_IOV:
         req->recv.state.dt.iov.iov_offset = 0;
diff --git a/src/ucp/tag/tag_send.c b/src/ucp/tag/tag_send.c
index 9b7326e3d98..86bec096422 100644
--- a/src/ucp/tag/tag_send.c
+++ b/src/ucp/tag/tag_send.c
@@ -202,7 +202,8 @@ ucp_tag_send_req(ucp_request_t *req, size_t count, ssize_t max_short,

 static void ucp_tag_send_req_init(ucp_request_t* req, ucp_ep_h ep,
                                   const void* buffer, uintptr_t datatype,
-                                  ucp_tag_t tag, uint16_t flags)
+                                  ucp_tag_t tag, uint16_t flags,
+                                  ucp_mem_type_t mem_type)
 {
     req->flags             = flags;
     req->send.ep           = ep;
@@ -211,6 +212,7 @@ static void ucp_tag_send_req_init(ucp_request_t* req, ucp_ep_h ep,
     req->send.tag          = tag;
     req->send.reg_rsc      = UCP_NULL_RESOURCE;
     req->send.state.offset = 0;
+    req->mem_type          = mem_type;

     VALGRIND_MAKE_MEM_UNDEFINED(&req->send.uct_comp.count,
                                 sizeof(req->send.uct_comp.count));
@@ -228,13 +230,20 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_send_nb,
     ucp_request_t *req;
     size_t length;
     ucs_status_ptr_t ret;
+    ucp_mem_type_t mem_type;

     UCP_THREAD_CS_ENTER_CONDITIONAL(&ep->worker->mt_lock);

     ucs_trace_req("send_nb buffer %p count %zu tag %"PRIx64" to %s cb %p",
                   buffer, count, tag, ucp_ep_peer_name(ep), cb);

-    if (ucs_likely(UCP_DT_IS_CONTIG(datatype))) {
+    ucp_addr_domain_detect_mds(ep->worker->context, (void *)buffer, &mem_type);
+    if (ucs_unlikely(!UCP_IS_DEFAULT_MEMORY_TYPE(mem_type.id))) {
+        ucp_ep_set_domain_lanes(ep, &mem_type);
+    }
+
+    if (ucs_likely(UCP_IS_DEFAULT_MEMORY_TYPE(mem_type.id)) &&
+        ucs_likely(UCP_DT_IS_CONTIG(datatype))) {
         length = ucp_contig_dt_length(datatype, count);
         if (ucs_likely((ssize_t)length <= ucp_ep_config(ep)->tag.eager.max_short)) {
             status = UCS_PROFILE_CALL(ucp_tag_send_eager_short, ep, tag, buffer,
@@ -253,11 +262,15 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_send_nb,
         goto out;
     }

-    ucp_tag_send_req_init(req, ep, buffer, datatype, tag, 0);
+    ucp_tag_send_req_init(req, ep, buffer, datatype, tag, 0, mem_type);

     ret = ucp_tag_send_req(req, count,
-                           ucp_ep_config(ep)->tag.eager.max_short,
-                           ucp_ep_config(ep)->tag.eager.zcopy_thresh,
+                           ucs_likely(UCP_IS_DEFAULT_MEMORY_TYPE(mem_type.id)) ?
+                           ucp_ep_config(ep)->tag.eager.max_short :
+                           ucp_ep_config(ep)->domain[mem_type.eager_lane].tag.eager.max_short,
+                           ucs_likely(UCP_IS_DEFAULT_MEMORY_TYPE(mem_type.id)) ?
+                           ucp_ep_config(ep)->tag.eager.zcopy_thresh :
+                           ucp_ep_config(ep)->domain[mem_type.eager_lane].tag.eager.zcopy_thresh,
                            ucp_ep_config(ep)->tag.rndv.rma_thresh,
                            ucp_ep_config(ep)->tag.rndv.am_thresh,
                            cb, ucp_ep_config(ep)->tag.proto);
@@ -293,7 +306,9 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_send_sync_nb,
     /* Remote side needs to send reply, so have it connect to us */
     ucp_ep_connect_remote(ep);

-    ucp_tag_send_req_init(req, ep, buffer, datatype, tag, UCP_REQUEST_FLAG_SYNC);
+
+    ucp_tag_send_req_init(req, ep, buffer, datatype, tag, UCP_REQUEST_FLAG_SYNC,
+                          ucp_mem_type_dummy_handle);

     ret = ucp_tag_send_req(req, count,
                            -1, /* disable short method */
diff --git a/src/ucp/wireup/address.c b/src/ucp/wireup/address.c
index 43cdaca9de3..5509608ca05 100644
--- a/src/ucp/wireup/address.c
+++ b/src/ucp/wireup/address.c
@@ -325,7 +325,7 @@ static ucs_status_t ucp_address_do_pack(ucp_worker_h worker, ucp_ep_h ep,
             *(uint8_t*)ptr = md_index |
                              ((dev->tl_bitmap == 0)          ? UCP_ADDRESS_FLAG_EMPTY    : 0) |
                              ((md_flags & UCT_MD_FLAG_ALLOC) ? UCP_ADDRESS_FLAG_MD_ALLOC : 0) |
-                             ((md_flags & UCT_MD_FLAG_REG)   ? UCP_ADDRESS_FLAG_MD_REG   : 0);
+                             ((md_flags & UCT_MD_FLAG_REG)   ? UCP_ADDRESS_FLAG_MD_REG   : 0) ;
             ++ptr;

             /* Device address length */
diff --git a/src/ucp/wireup/select.c b/src/ucp/wireup/select.c
index 1f6ae4457b1..b86fa92a7f2 100644
--- a/src/ucp/wireup/select.c
+++ b/src/ucp/wireup/select.c
@@ -10,17 +10,19 @@
 #include
 #include
 #include
+#include
 #include
 #include

 #define UCP_WIREUP_RNDV_TEST_MSG_SIZE 262144

 enum {
-    UCP_WIREUP_LANE_USAGE_AM   = UCS_BIT(0),
-    UCP_WIREUP_LANE_USAGE_RMA  = UCS_BIT(1),
-    UCP_WIREUP_LANE_USAGE_AMO  = UCS_BIT(2),
-    UCP_WIREUP_LANE_USAGE_RNDV = UCS_BIT(3),
-    UCP_WIREUP_LANE_USAGE_TAG  = UCS_BIT(4)
+    UCP_WIREUP_LANE_USAGE_AM     = UCS_BIT(0),
+    UCP_WIREUP_LANE_USAGE_RMA    = UCS_BIT(1),
+    UCP_WIREUP_LANE_USAGE_AMO    = UCS_BIT(2),
+    UCP_WIREUP_LANE_USAGE_RNDV   = UCS_BIT(3),
+    UCP_WIREUP_LANE_USAGE_TAG    = UCS_BIT(4),
+    UCP_WIREUP_LANE_USAGE_DOMAIN = UCS_BIT(5)
 };

@@ -32,6 +34,7 @@ typedef struct {
     uint32_t         usage;
     double           rma_score;
     double           amo_score;
+    double           domain_score;
 } ucp_wireup_lane_desc_t;

@@ -361,6 +364,7 @@ ucp_wireup_add_lane_desc(ucp_wireup_lane_desc_t *lane_descs,
     lane_desc->usage        = usage;
     lane_desc->rma_score    = 0.0;
     lane_desc->amo_score    = 0.0;
+    lane_desc->domain_score = 0.0;

 out_update_score:
     if (usage & UCP_WIREUP_LANE_USAGE_RMA) {
@@ -369,6 +373,9 @@ ucp_wireup_add_lane_desc(ucp_wireup_lane_desc_t *lane_descs,
     if (usage & UCP_WIREUP_LANE_USAGE_AMO) {
         lane_desc->amo_score = score;
     }
+    if (usage & UCP_WIREUP_LANE_USAGE_DOMAIN) {
+        lane_desc->domain_score = score;
+    }
 }

 #define UCP_WIREUP_COMPARE_SCORE(_elem1, _elem2, _arg, _token) \
@@ -396,6 +403,12 @@ static int ucp_wireup_compare_lane_amo_score(const void *elem1, const void *elem2,
     return UCP_WIREUP_COMPARE_SCORE(elem1, elem2, arg, amo);
 }

+static int ucp_wireup_compare_lane_domain_score(const void *elem1, const void *elem2,
+                                                void *arg)
+{
+    return UCP_WIREUP_COMPARE_SCORE(elem1, elem2, arg, domain);
+}
+
 static UCS_F_NOINLINE ucs_status_t
 ucp_wireup_add_memaccess_lanes(ucp_ep_h ep, unsigned address_count,
                                const ucp_address_entry_t *address_list,
@@ -548,6 +561,163 @@ static ucs_status_t ucp_wireup_add_rma_lanes(ucp_ep_h ep, const ucp_ep_params_t
                                           -1, UCP_WIREUP_LANE_USAGE_RMA);
 }

+ucp_lane_index_t ucp_config_find_domain_lane(const ucp_ep_config_t *config,
+                                             const ucp_lane_index_t *lanes,
+                                             ucp_md_map_t dn_md_map)
+{
+    ucp_md_index_t dst_md_index;
+    ucp_lane_index_t lane;
+    ucp_md_map_t dst_md_mask;
+    int prio;
+
+    for (prio = 0; prio < UCP_MAX_LANES; ++prio) {
+        lane = lanes[prio];
+        if (lane == UCP_NULL_LANE) {
+            return UCP_NULL_LANE; /* No more lanes */
+        }
+
+        dst_md_index = config->key.lanes[lane].dst_md_index;
+        dst_md_mask  = UCS_BIT(dst_md_index);
+        if (dn_md_map & dst_md_mask) {
+            return lane;
+        }
+    }
+    return UCP_NULL_LANE;
+}
+
+ucs_status_t ucp_ep_set_domain_lanes(ucp_ep_h ep, ucp_mem_type_h mem_type_h)
+{
+    ucp_rsc_index_t rsc_index;
+    uct_iface_attr_t *iface_attr;
+    ucp_md_map_t dn_md_map;
+    ucp_lane_index_t dn_lane;
+    ucp_md_index_t md_index;
+
+    dn_md_map = mem_type_h->md_map;
+
+    while (1) {
+        dn_lane = ucp_config_find_domain_lane(ucp_ep_config(ep),
+                                              ucp_ep_config(ep)->key.domain_lanes, dn_md_map);
+        if (dn_lane == UCP_NULL_LANE) {
+            ucs_error("cannot find address domain lane");
+            return UCS_ERR_IO_ERROR;
+        }
+        rsc_index  = ucp_ep_get_rsc_index(ep, dn_lane);
+        iface_attr = &ep->worker->ifaces[rsc_index].attr;
+        md_index   = ucp_ep_config(ep)->key.lanes[dn_lane].dst_md_index;
+        if (iface_attr->cap.flags & UCT_IFACE_FLAG_PUT_ZCOPY) {
+            mem_type_h->eager_lane = dn_lane;
+        }
+        /* TODO: revisit cap flags for the rndv lane:
+         * if (iface_attr->cap.flags & UCT_IFACE_FLAG_GET_ZCOPY) {
+         *     mem_type_h->rndv_lane = dn_lane;
+         * } */
+        dn_md_map &= ~UCS_BIT(md_index);
+        if (mem_type_h->eager_lane != UCP_NULL_LANE || dn_md_map == 0) {
+            break;
+        }
+    }
+
+    return UCS_OK;
+}
+
+double ucp_wireup_addr_domain_score_func(ucp_context_h context,
+                                         const uct_md_attr_t *md_attr,
+                                         const uct_iface_attr_t *iface_attr,
+                                         const ucp_address_iface_attr_t *remote_iface_attr)
+{
+    /* best end-to-end latency */
+    return (1e-3 / (ucp_wireup_tl_iface_latency(context, iface_attr, remote_iface_attr) +
+                    iface_attr->overhead + remote_iface_attr->overhead));
+}
+
+static UCS_F_NOINLINE ucs_status_t
+ucp_wireup_add_addr_domain_lanes(ucp_ep_h ep, unsigned address_count,
+                                 const ucp_address_entry_t *address_list,
+                                 ucp_wireup_lane_desc_t *lane_descs,
+                                 ucp_lane_index_t *num_lanes_p,
+                                 const ucp_wireup_criteria_t *criteria,
+                                 uint64_t tl_bitmap, uint32_t usage)
+{
+    ucp_address_entry_t *address_list_copy;
+    ucp_rsc_index_t rsc_index, dst_md_index;
+    size_t address_list_size;
+    double score;
+    uint64_t remote_md_map;
+    unsigned addr_index;
+    ucs_status_t status;
+
+    remote_md_map = -1;
+
+    /* Create a copy of the address list */
+    address_list_size = sizeof(*address_list_copy) * address_count;
+    address_list_copy = ucs_malloc(address_list_size, "domain address list");
+    if (address_list_copy == NULL) {
+        status = UCS_ERR_NO_MEMORY;
+        goto out;
+    }
+
+    memcpy(address_list_copy, address_list, address_list_size);
+
+    status = ucp_wireup_select_transport(ep, address_list_copy, address_count,
+                                         criteria, tl_bitmap, remote_md_map,
+                                         0, &rsc_index, &addr_index, &score);
+    if (status != UCS_OK) {
+        /* domain lanes are optional - ignore failure to find one */
+        status = UCS_OK;
+        goto out_free_address_list;
+    }
+
+    dst_md_index = address_list_copy[addr_index].md_index;
+
+    /* Add to the list of lanes and remove all occurrences of the remote md
+     * from the address list, to avoid selecting the same remote md again.*/
+    ucp_wireup_add_lane_desc(lane_descs, num_lanes_p, rsc_index, addr_index,
+                             dst_md_index, score, usage, 0);
+    remote_md_map &= ~UCS_BIT(dst_md_index);
+
+    while (address_count > 0) {
+        status = ucp_wireup_select_transport(ep, address_list_copy, address_count,
+                                             criteria, tl_bitmap, remote_md_map,
+                                             0, &rsc_index, &addr_index, &score);
+        if (status != UCS_OK) {
+            break;
+        }
+
+        /* Add lane description and remove all occurrences of the remote md */
+        dst_md_index = address_list_copy[addr_index].md_index;
+        ucp_wireup_add_lane_desc(lane_descs, num_lanes_p, rsc_index, addr_index,
+                                 dst_md_index, score, usage, 0);
+        remote_md_map &= ~UCS_BIT(dst_md_index);
+    }
+
+    status = UCS_OK;
+
+out_free_address_list:
+    ucs_free(address_list_copy);
+out:
+    return status;
+}
+
+static ucs_status_t ucp_wireup_add_domain_lane(ucp_ep_h ep, const ucp_ep_params_t *params,
+                                               unsigned address_count,
+                                               const ucp_address_entry_t *address_list,
+                                               ucp_wireup_lane_desc_t *lane_descs,
+                                               ucp_lane_index_t *num_lanes_p)
+{
+    ucp_wireup_criteria_t criteria;
+
+    criteria.title              = "address domain";
+    criteria.local_md_flags     = 0;
+    criteria.remote_md_flags    = 0;
+    criteria.remote_iface_flags = UCT_IFACE_FLAG_CONNECT_TO_IFACE;
+    criteria.local_iface_flags  = criteria.remote_iface_flags;
+    criteria.calc_score         = ucp_wireup_addr_domain_score_func;
+    ucp_wireup_fill_ep_params_criteria(&criteria, params);
+
+    return ucp_wireup_add_addr_domain_lanes(ep, address_count, address_list,
+                                            lane_descs, num_lanes_p, &criteria,
+                                            -1, UCP_WIREUP_LANE_USAGE_DOMAIN);
+}
+
 double ucp_wireup_amo_score_func(ucp_context_h context,
                                  const uct_md_attr_t *md_attr,
                                  const uct_iface_attr_t *iface_attr,
@@ -899,6 +1069,12 @@ ucs_status_t ucp_wireup_select_lanes(ucp_ep_h ep, const ucp_ep_params_t *params,
         return status;
     }

+    status = ucp_wireup_add_domain_lane(ep, params, address_count, address_list,
+                                        lane_descs, &key->num_lanes);
+    if (status != UCS_OK) {
+        return status;
+    }
+
     /* User should not create endpoints unless requested communication features */
     if (key->num_lanes == 0) {
         ucs_error("No transports selected to %s (features: 0x%lx)",
@@ -936,6 +1112,9 @@ ucs_status_t ucp_wireup_select_lanes(ucp_ep_h ep, const ucp_ep_params_t *params,
             ucs_assert(key->tag_lane == UCP_NULL_LANE);
             key->tag_lane = lane;
         }
+        if (lane_descs[lane].usage & UCP_WIREUP_LANE_USAGE_DOMAIN) {
+            key->domain_lanes[lane] = lane;
+        }
     }

     /* Sort RMA and AMO lanes according to score */
@@ -943,6 +1122,9 @@ ucs_status_t ucp_wireup_select_lanes(ucp_ep_h ep, const ucp_ep_params_t *params,
                 ucp_wireup_compare_lane_rma_score, lane_descs);
     ucs_qsort_r(key->amo_lanes, UCP_MAX_LANES, sizeof(ucp_lane_index_t),
                 ucp_wireup_compare_lane_amo_score, lane_descs);
+    ucs_qsort_r(key->domain_lanes, UCP_MAX_LANES, sizeof(ucp_lane_index_t),
+                ucp_wireup_compare_lane_domain_score, lane_descs);
+

     /* Get all reachable MDs from full remote address list */
     key->reachable_md_map = ucp_wireup_get_reachable_mds(worker, address_count,

From 400bc0451396c10cc9680d105d3cfb424c142557 Mon Sep 17 00:00:00 2001
From: Xin Zhao
Date: Tue, 17 Oct 2017 21:15:39 +0300
Subject: [PATCH 16/16] Make eager parameters in ep config a per-memory-type
 array.

---
 src/ucp/core/ucp_ep.c   | 30 ++++++++++++++++++++++++------
 src/ucp/core/ucp_ep.h   |  4 ++--
 src/ucp/tag/offload.c   |  4 ++--
 src/ucp/tag/tag_send.c  | 25 +++++++++----------------
 src/ucp/wireup/select.c | 14 ++++++++------
 src/uct/api/uct.h       |  2 +-
 6 files changed, 46 insertions(+), 33 deletions(-)

diff --git a/src/ucp/core/ucp_ep.c b/src/ucp/core/ucp_ep.c
index d9bd15404a2..a32e508c118 100644
--- a/src/ucp/core/ucp_ep.c
+++ b/src/ucp/core/ucp_ep.c
@@ -912,10 +912,10 @@ void ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config)
     for (it = 0; it < UCP_MAX_IOV; ++it) {
         config->am.zcopy_thresh[it]      = SIZE_MAX;
         config->am.sync_zcopy_thresh[it] = SIZE_MAX;
-        config->tag.eager.zcopy_thresh[it]      = SIZE_MAX;
-        config->tag.eager.sync_zcopy_thresh[it] = SIZE_MAX;
+        config->tag.eager[UCT_MD_MEM_TYPE_DEFAULT].zcopy_thresh[it]      = SIZE_MAX;
+        config->tag.eager[UCT_MD_MEM_TYPE_DEFAULT].sync_zcopy_thresh[it] = SIZE_MAX;
     }
-    config->tag.eager.zcopy_auto_thresh = 0;
+    config->tag.eager[UCT_MD_MEM_TYPE_DEFAULT].zcopy_auto_thresh = 0;
     config->am.zcopy_auto_thresh = 0;
     config->p2p_lanes = 0;
     config->domain_lanes = 0;
@@ -946,7 +946,7 @@ void ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config)
         rsc_index = config->key.lanes[lane].rsc_index;
         if (rsc_index != UCP_NULL_RESOURCE) {
             iface_attr = &worker->ifaces[rsc_index].attr;
-            ucp_ep_config_init_attrs(worker, rsc_index, &config->tag.eager,
+            ucp_ep_config_init_attrs(worker, rsc_index, &(config->tag.eager[UCT_MD_MEM_TYPE_DEFAULT]),
                                      iface_attr->cap.tag.eager.max_short,
                                      iface_attr->cap.tag.eager.max_bcopy,
                                      iface_attr->cap.tag.eager.max_zcopy,
@@ -997,7 +997,7 @@ void ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config)
             ucp_ep_config_set_rndv_thresh(worker, config, config->key.rndv_lane,
                                           UCT_IFACE_FLAG_GET_ZCOPY,
                                           max_rndv_thresh);
-            config->tag.eager = config->am;
+            config->tag.eager[UCT_MD_MEM_TYPE_DEFAULT] = config->am;
             config->tag.lane  = lane;
         }
     } else {
@@ -1023,6 +1023,24 @@ void ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config)
     }

+    for (it = 0; it < UCT_MD_MEM_TYPE_LAST; it++) {
+        config->tag.eager[it] = config->tag.eager[UCT_MD_MEM_TYPE_DEFAULT];
+        if (!UCP_IS_DEFAULT_MEMORY_TYPE(it)) {
+            ucp_mem_type_t mem_type;
+            unsigned md_index;
+            mem_type.id     = it;
+            mem_type.md_map = 0;
+            for (md_index = 0; md_index < context->num_mds; md_index++) {
+                if (context->tl_mds[md_index].attr.cap.mem_type == it) {
+                    mem_type.md_map |= UCS_BIT(md_index);
+                }
+            }
+            ucp_ep_config_set_domain_lanes(worker, config, &mem_type);
+            config->tag.eager[it].max_short = config->domain[mem_type.eager_lane].tag.eager.max_short;
+            memcpy(config->tag.eager[it].zcopy_thresh,
+                   config->domain[mem_type.eager_lane].tag.eager.zcopy_thresh,
+                   UCP_MAX_IOV * sizeof(size_t));
+        }
+    }
+
     /* Configuration for remote memory access */
     for (lane = 0; lane < config->key.num_lanes; ++lane) {
         if (ucp_ep_config_get_rma_prio(config->key.rma_lanes, lane) == -1) {
@@ -1245,7 +1263,7 @@ static void ucp_ep_config_print(FILE *stream, ucp_worker_h worker,

     if (context->config.features & UCP_FEATURE_TAG) {
         tag_config = (ucp_ep_is_tag_offload_enabled((ucp_ep_config_t *)config)) ?
-                     &config->tag.eager : &config->am;
+                     &config->tag.eager[UCT_MD_MEM_TYPE_DEFAULT] : &config->am;
         ucp_ep_config_print_tag_proto(stream, "tag_send",
                                       tag_config->max_short,
                                       tag_config->zcopy_thresh[0],
diff --git a/src/ucp/core/ucp_ep.h b/src/ucp/core/ucp_ep.h
index 4dd92a1b678..8f9303ca932 100644
--- a/src/ucp/core/ucp_ep.h
+++ b/src/ucp/core/ucp_ep.h
@@ -171,7 +171,7 @@ typedef struct ucp_ep_config {
     /* Configuration of the lane used for eager protocols
      * (can be AM or tag offload). */
-    ucp_ep_msg_config_t eager;
+    ucp_ep_msg_config_t eager[UCT_MD_MEM_TYPE_LAST];

     struct {
         /* Maximal total size of rndv_get_zcopy */
@@ -267,5 +267,5 @@ size_t ucp_ep_config_get_zcopy_auto_thresh(size_t iovcnt,
 ucp_lane_index_t ucp_config_find_domain_lane(const ucp_ep_config_t *config,
                                              const ucp_lane_index_t *lanes,
                                              ucp_md_map_t dn_md_map);
-ucs_status_t ucp_ep_set_domain_lanes(ucp_ep_h ep, ucp_mem_type_h mem_type_h);
+ucs_status_t ucp_ep_config_set_domain_lanes(ucp_worker_h worker, ucp_ep_config_t *ep_config, ucp_mem_type_h mem_type_h);
 #endif
diff --git a/src/ucp/tag/offload.c b/src/ucp/tag/offload.c
index 550c4f30e1e..f6d5660de27 100644
--- a/src/ucp/tag/offload.c
+++ b/src/ucp/tag/offload.c
@@ -348,7 +348,7 @@ ucp_do_tag_offload_zcopy(uct_pending_req_t *self, uint64_t imm_data,
 {
     ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct);
     ucp_ep_t *ep       = req->send.ep;
-    size_t max_iov     = ucp_ep_config(ep)->tag.eager.max_iov;
+    size_t max_iov     = ucp_ep_config(ep)->tag.eager[UCT_MD_MEM_TYPE_DEFAULT].max_iov;
     uct_iov_t *iov     = ucs_alloca(max_iov * sizeof(uct_iov_t));
     size_t iovcnt      = 0;
     ucs_status_t status;
@@ -438,7 +438,7 @@ ucs_status_t ucp_tag_offload_rndv_zcopy(uct_pending_req_t *self)
 {
     ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct);
     ucp_ep_t *ep       = req->send.ep;
-    size_t max_iov     = ucp_ep_config(ep)->tag.eager.max_iov;
+    size_t max_iov     = ucp_ep_config(ep)->tag.eager[UCT_MD_MEM_TYPE_DEFAULT].max_iov;
     uct_iov_t *iov     = ucs_alloca(max_iov * sizeof(uct_iov_t));
     size_t iovcnt      = 0;
     ucp_request_hdr_t rndv_hdr = {
diff --git a/src/ucp/tag/tag_send.c b/src/ucp/tag/tag_send.c
index 86bec096422..aeef1d4145e 100644
--- a/src/ucp/tag/tag_send.c
+++ b/src/ucp/tag/tag_send.c
@@ -41,7 +41,7 @@ static ucs_status_t ucp_tag_req_start(ucp_request_t *req, size_t count,
     req->send.state.dt.iov.iovcnt_offset = 0;
     req->send.state.dt.iov.iov_offset    = 0;
     req->send.state.dt.iov.iovcnt        = count;
-    flag_iov_single = (count <= config->tag.eager.max_iov);
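+    /* max_iov is identical across memory types (the per-type eager configs
+     * copy it from the UCT_MD_MEM_TYPE_DEFAULT entry), so indexing by the
+     * default type is correct for any buffer */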
+    flag_iov_single = (count <= config->tag.eager[UCT_MD_MEM_TYPE_DEFAULT].max_iov);

     if (!flag_iov_single && ucp_ep_is_tag_offload_enabled(config)) {
         /* Make sure SW RNDV will be used, because tag offload does
@@ -52,7 +52,7 @@ static ucs_status_t ucp_tag_req_start(ucp_request_t *req, size_t count,
     if (0 == count) {
         /* disable zcopy */
         zcopy_thresh = SIZE_MAX;
-    } else if (!config->tag.eager.zcopy_auto_thresh) {
+    } else if (!config->tag.eager[UCT_MD_MEM_TYPE_DEFAULT].zcopy_auto_thresh) {
         /* The user defined threshold or no zcopy enabled */
         zcopy_thresh = zcopy_thresh_arr[0];
     } else if (count <= UCP_MAX_IOV) {
@@ -94,7 +94,7 @@ static ucs_status_t ucp_tag_req_start(ucp_request_t *req, size_t count,
         UCS_PROFILE_REQUEST_EVENT(req, "start_rndv", req->send.length);
     } else if (length < zcopy_thresh) {
         /* bcopy */
-        if (length <= (config->tag.eager.max_bcopy - only_hdr_size)) {
+        if (length <= (config->tag.eager[UCT_MD_MEM_TYPE_DEFAULT].max_bcopy - only_hdr_size)) {
             req->send.uct.func = proto->bcopy_single;
             UCS_PROFILE_REQUEST_EVENT(req, "start_egr_bcopy_single", req->send.length);
         } else {
@@ -111,7 +111,7 @@ static ucs_status_t ucp_tag_req_start(ucp_request_t *req, size_t count,
         req->send.uct_comp.func  = proto->zcopy_completion;
         req->send.uct_comp.count = 0;

-        if ((length <= (config->tag.eager.max_zcopy - only_hdr_size)) &&
+        if ((length <= (config->tag.eager[UCT_MD_MEM_TYPE_DEFAULT].max_zcopy - only_hdr_size)) &&
             flag_iov_single) {
             req->send.uct.func = proto->zcopy_single;
             UCS_PROFILE_REQUEST_EVENT(req, "start_egr_zcopy_single", req->send.length);
@@ -138,7 +138,7 @@ static void ucp_tag_req_start_generic(ucp_request_t *req, size_t count,
     req->send.state.dt.generic.state = state;
     req->send.length = length = dt_gen->ops.packed_size(state);

-    if (length <= config->tag.eager.max_bcopy - proto->only_hdr_size) {
+    if (length <= config->tag.eager[UCT_MD_MEM_TYPE_DEFAULT].max_bcopy - proto->only_hdr_size) {
         /* bcopy single */
         req->send.uct.func = proto->bcopy_single;
         UCS_PROFILE_REQUEST_EVENT(req, "start_gen_bcopy_single", req->send.length);
@@ -238,14 +238,11 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_send_nb,
                   buffer, count, tag, ucp_ep_peer_name(ep), cb);

     ucp_addr_domain_detect_mds(ep->worker->context, (void *)buffer, &mem_type);
-    if (ucs_unlikely(!UCP_IS_DEFAULT_MEMORY_TYPE(mem_type.id))) {
-        ucp_ep_set_domain_lanes(ep, &mem_type);
-    }

     if (ucs_likely(UCP_IS_DEFAULT_MEMORY_TYPE(mem_type.id)) &&
         ucs_likely(UCP_DT_IS_CONTIG(datatype))) {
         length = ucp_contig_dt_length(datatype, count);
-        if (ucs_likely((ssize_t)length <= ucp_ep_config(ep)->tag.eager.max_short)) {
+        if (ucs_likely((ssize_t)length <= ucp_ep_config(ep)->tag.eager[UCT_MD_MEM_TYPE_DEFAULT].max_short)) {
             status = UCS_PROFILE_CALL(ucp_tag_send_eager_short, ep, tag, buffer,
@@ -265,12 +262,8 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_send_nb,
     ucp_tag_send_req_init(req, ep, buffer, datatype, tag, 0, mem_type);

     ret = ucp_tag_send_req(req, count,
-                           ucs_likely(UCP_IS_DEFAULT_MEMORY_TYPE(mem_type.id)) ?
-                           ucp_ep_config(ep)->tag.eager.max_short :
-                           ucp_ep_config(ep)->domain[mem_type.eager_lane].tag.eager.max_short,
-                           ucs_likely(UCP_IS_DEFAULT_MEMORY_TYPE(mem_type.id)) ?
-                           ucp_ep_config(ep)->tag.eager.zcopy_thresh :
-                           ucp_ep_config(ep)->domain[mem_type.eager_lane].tag.eager.zcopy_thresh,
+                           ucp_ep_config(ep)->tag.eager[mem_type.id].max_short,
+                           ucp_ep_config(ep)->tag.eager[mem_type.id].zcopy_thresh,
                            ucp_ep_config(ep)->tag.rndv.rma_thresh,
                            ucp_ep_config(ep)->tag.rndv.am_thresh,
                            cb, ucp_ep_config(ep)->tag.proto);
@@ -312,7 +305,7 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_send_sync_nb,
     ret = ucp_tag_send_req(req, count,
                            -1, /* disable short method */
-                           ucp_ep_config(ep)->tag.eager.sync_zcopy_thresh,
+                           ucp_ep_config(ep)->tag.eager[UCT_MD_MEM_TYPE_DEFAULT].sync_zcopy_thresh,
                            ucp_ep_config(ep)->tag.rndv.rma_thresh,
                            ucp_ep_config(ep)->tag.rndv.am_thresh,
                            cb, ucp_ep_config(ep)->tag.sync_proto);
diff --git a/src/ucp/wireup/select.c b/src/ucp/wireup/select.c
index b86fa92a7f2..3ac3c85c9dd 100644
--- a/src/ucp/wireup/select.c
+++ b/src/ucp/wireup/select.c
@@ -585,7 +585,9 @@ ucp_lane_index_t ucp_config_find_domain_lane(const ucp_ep_config_t *config,
     return UCP_NULL_LANE;
 }

-ucs_status_t ucp_ep_set_domain_lanes(ucp_ep_h ep, ucp_mem_type_h mem_type_h)
+ucs_status_t ucp_ep_config_set_domain_lanes(ucp_worker_h worker,
+                                            ucp_ep_config_t *ep_config,
+                                            ucp_mem_type_h mem_type_h)
 {
     ucp_rsc_index_t rsc_index;
     uct_iface_attr_t *iface_attr;
@@ -596,15 +598,15 @@ ucs_status_t ucp_ep_set_domain_lanes(ucp_ep_h ep, ucp_mem_type_h mem_type_h)
     dn_md_map = mem_type_h->md_map;

     while (1) {
-        dn_lane = ucp_config_find_domain_lane(ucp_ep_config(ep),
-                                              ucp_ep_config(ep)->key.domain_lanes, dn_md_map);
+        dn_lane = ucp_config_find_domain_lane(ep_config,
+                                              ep_config->key.domain_lanes, dn_md_map);
         if (dn_lane == UCP_NULL_LANE) {
             ucs_error("cannot find address domain lane");
             return UCS_ERR_IO_ERROR;
         }
-        rsc_index  = ucp_ep_get_rsc_index(ep, dn_lane);
-        iface_attr = &ep->worker->ifaces[rsc_index].attr;
-        md_index   = ucp_ep_config(ep)->key.lanes[dn_lane].dst_md_index;
+        rsc_index  = ep_config->key.lanes[dn_lane].rsc_index;
+        iface_attr = &worker->ifaces[rsc_index].attr;
+        md_index   = ep_config->key.lanes[dn_lane].dst_md_index;
         if (iface_attr->cap.flags & UCT_IFACE_FLAG_PUT_ZCOPY) {
             mem_type_h->eager_lane = dn_lane;
         }
diff --git a/src/uct/api/uct.h b/src/uct/api/uct.h
index 6e2831f85f8..f3d50481a24 100644
--- a/src/uct/api/uct.h
+++ b/src/uct/api/uct.h
@@ -392,7 +392,7 @@ enum {
 typedef enum {
     UCT_MD_MEM_TYPE_DEFAULT = 0,  /**< Default system memory */
     UCT_MD_MEM_TYPE_CUDA,         /**< NVIDIA CUDA memory */
-    UCT_MD_MEM_TYPE_LAST    = UCT_MD_MEM_TYPE_CUDA
+    UCT_MD_MEM_TYPE_LAST
 } uct_memory_type_t;
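Note on the last hunk: with UCT_MD_MEM_TYPE_LAST no longer aliased to
UCT_MD_MEM_TYPE_CUDA, it acts as an open-ended count of memory types, which is
what lets ucp_ep_config_t size its per-memory-type tag.eager[] table. A minimal
standalone sketch of that sizing pattern (the struct and helper below are
illustrative only, not part of these patches):

    #include <uct/api/uct.h>

    /* one eager limit per memory type, mirroring the new
     * tag.eager[UCT_MD_MEM_TYPE_LAST] array in ucp_ep_config_t */
    typedef struct example_eager_limits {
        ssize_t max_short[UCT_MD_MEM_TYPE_LAST];
    } example_eager_limits_t;

    static void example_limits_init(example_eager_limits_t *limits,
                                    ssize_t host_max_short,
                                    ssize_t cuda_max_short)
    {
        int mt;

        /* seed every type from the host/default value, then override the
         * non-host entries, the same way patch 16 copies the default eager
         * config and then patches in the per-domain values */
        for (mt = 0; mt < UCT_MD_MEM_TYPE_LAST; ++mt) {
            limits->max_short[mt] = host_max_short;
        }
        limits->max_short[UCT_MD_MEM_TYPE_CUDA] = cuda_max_short;
    }

Because the enum's count is derived automatically, adding a new memory type
after UCT_MD_MEM_TYPE_CUDA grows every such table without further edits.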