diff --git a/doc/changelog.rst b/doc/changelog.rst
index 29f7d6c9c..60e1e786e 100644
--- a/doc/changelog.rst
+++ b/doc/changelog.rst
@@ -1,6 +1,12 @@
 Changelog
 =========
 
+.. rubric:: 2.1.2
+
+- Make verbs acceleration work when run against MLNX OFED 5.x, including with
+  Python wheels. Note that it will not use multi-packet receive queues, so
+  receive performance may still be better on MLNX OFED 4.9.
+
 .. rubric:: 2.1.1
 
 - Update pybind to 2.5.0.
diff --git a/doc/py-ibverbs.rst b/doc/py-ibverbs.rst
index b97ddd37d..cdea4a139 100644
--- a/doc/py-ibverbs.rst
+++ b/doc/py-ibverbs.rst
@@ -3,8 +3,7 @@ Support for ibverbs
 
 Receiver performance can be significantly improved by using the Infiniband
 Verbs API instead of the BSD sockets API. This is currently only tested on
 Linux with Mellanox ConnectX®-3 and ConnectX®-5 NICs. It depends on device-
-managed flow steering (DMFS), which may require using the Mellanox OFED version
-of libibverbs.
+managed flow steering (DMFS).
 
 There are a number of limitations in the current implementation:
@@ -13,27 +12,24 @@ There are a number of limitations in the current implementation:
 - For sending, only multicast is supported.
 
 Within these limitations, it is quite easy to take advantage of this faster
-code path. The main difficulty is that one *must* specify the IP address of
-the interface that will send or receive the packets. The netifaces_ module can
+code path. The main difficulties are that one *must* specify the IP address of
+the interface that will send or receive the packets, and that the
+``CAP_NET_RAW`` capability may be needed. The netifaces_ module can
 help find the IP address for an interface by name.
 
 .. _netifaces: https://pypi.python.org/pypi/netifaces
 
 System configuration
 --------------------
-It is likely that some system configuration will be needed to allow this mode
-to work correctly. For ConnectX®-3, add the following to
-:file:`/etc/modprobe.d/mlnx.conf`::
+
+ConnectX®-3
+^^^^^^^^^^^
+Add the following to :file:`/etc/modprobe.d/mlnx.conf`::
 
     options ib_uverbs disable_raw_qp_enforcement=1
     options mlx4_core fast_drop=1
     options mlx4_core log_num_mgm_entry_size=-1
 
-For more information, see the `libvma documentation`_. For ConnectX®-4/5, only
-the first of these lines is required.
-
-.. _libvma documentation: https://github.com/Mellanox/libvma
-
 .. note::
 
    Setting ``log_num_mgm_entry_size`` to -7 instead of -1 will activate faster
@@ -43,6 +39,18 @@ the first of these lines is required.
 
 .. _manual: http://www.mellanox.com/related-docs/prod_software/Mellanox_EN_for_Linux_User_Manual_v4_3.pdf
 
+ConnectX®-4+, MLNX OFED up to 4.9
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Add the following to :file:`/etc/modprobe.d/mlnx.conf`::
+
+    options ib_uverbs disable_raw_qp_enforcement=1
+
+All other cases
+^^^^^^^^^^^^^^^
+No system configuration is needed, but the ``CAP_NET_RAW`` capability is
+required. Running as root will achieve this; a full discussion of Linux
+capabilities is beyond the scope of this manual.
+
 Receiving
 ---------
 The ibverbs API can be used programmatically by using an extra method of
@@ -81,10 +91,10 @@ The ibverbs API can be used programmatically by using an extra method of
 non-negative) or letting other code run on the thread (if `comp_vector` is
 negative).
 
-If supported by the NIC, the receive code will automatically use a
-"multi-packet receive queue", which allows each packet to consume only the
-amount of space needed in the buffer. This is not supported by all NICs (for
-example, ConnectX-3 does not, but ConnectX-5 does). When in use, the
+If supported by the NIC and the drivers, the receive code will automatically
+use a "multi-packet receive queue", which allows each packet to consume only
+the amount of space needed in the buffer. This is currently only supported on
+ConnectX®-4+ with MLNX OFED drivers prior to 5.0. When in use, the
 `max_size` parameter has little impact on performance, and is used only to
 reject larger packets.
 
@@ -96,7 +106,7 @@ efficiency by keeping data more-or-less contiguous.
 
 Environment variables
 ^^^^^^^^^^^^^^^^^^^^^
-An existing application can be forced to use ibverbs for all multicast IPv4
+An existing application can be forced to use ibverbs for all IPv4
 readers, by setting the environment variable :envvar:`SPEAD2_IBV_INTERFACE` to
 the IP address of the interface to receive the packets. Note that calls to
 :py:meth:`spead2.recv.Stream.add_udp_reader` that pass an explicit interface
diff --git a/include/spead2/common_ibv.h b/include/spead2/common_ibv.h
index 98d3c4960..52de1dd99 100644
--- a/include/spead2/common_ibv.h
+++ b/include/spead2/common_ibv.h
@@ -114,10 +114,7 @@ struct ibv_comp_channel_deleter
 
 struct ibv_flow_deleter
 {
-    void operator()(ibv_flow *flow)
-    {
-        ibv_destroy_flow(flow);
-    }
+    void operator()(ibv_flow *flow);
 };
 
 #if SPEAD2_USE_IBV_MPRQ
diff --git a/include/spead2/recv_udp_ibv.h b/include/spead2/recv_udp_ibv.h
index 0abbde2a4..50a6d8d80 100644
--- a/include/spead2/recv_udp_ibv.h
+++ b/include/spead2/recv_udp_ibv.h
@@ -377,7 +377,8 @@ struct reader_factory
         }
         catch (std::system_error &e)
         {
-            if (e.code() != std::errc::not_supported)
+            if (e.code() != std::errc::not_supported               // ENOTSUP
+                && e.code() != std::errc::function_not_supported)  // ENOSYS
                 throw;
             log_debug("Multi-packet receive queues not supported (%1%), falling back", e.what());
             return std::unique_ptr<reader>(new udp_ibv_reader(
diff --git a/spead2/_version.py b/spead2/_version.py
index 58039f505..4eabd0b3f 100644
--- a/spead2/_version.py
+++ b/spead2/_version.py
@@ -1 +1 @@
-__version__ = "2.1.1"
+__version__ = "2.1.2"
diff --git a/src/common_ibv.cpp b/src/common_ibv.cpp
index 8186a5b81..bf2fe9614 100644
--- a/src/common_ibv.cpp
+++ b/src/common_ibv.cpp
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include <cstdlib>
 #include 
 #include 
 #include 
@@ -66,6 +67,79 @@ void ibv_exp_res_domain_deleter::operator()(ibv_exp_res_domain *res_domain)
 
 #endif
 
+/* At some point (I think v12 or v13) rdma-core changed the ABI for
+ * ibv_create_flow, and the backwards-compatibility shim in place was
+ * flawed and only fixed in v30. See
+ * https://github.com/linux-rdma/rdma-core/commit/88789b7.
+ *
+ * We can work around it by mimicking the internals of verbs.h and
+ * selecting the correct slot.
+ */
+struct verbs_context_fixed
+{
+    int (*ibv_destroy_flow)(ibv_flow *flow);
+    int (*old_ibv_destroy_flow)(ibv_flow *flow);
+    ibv_flow * (*ibv_create_flow)(ibv_qp *qp, ibv_flow_attr *flow_attr);
+    ibv_flow * (*old_ibv_create_flow)(ibv_qp *qp, ibv_flow_attr *flow_attr);
+    void (*padding[6])(void);
+    std::uint64_t has_comp_mask;
+    std::size_t sz;
+    ibv_context context;
+};
+
+// Returns the wrapping verbs_context_fixed if it seems like there is one,
+// otherwise NULL.
+static const verbs_context_fixed *get_verbs_context_fixed(ibv_context *ctx)
+{
+    if (!ctx || ctx->abi_compat != __VERBS_ABI_IS_EXTENDED)
+        return NULL;
+    const verbs_context_fixed *vctx = (const verbs_context_fixed *)(
+        (const char *)(ctx) - offsetof(verbs_context_fixed, context));
+    if (vctx->sz >= sizeof(*vctx))
+        return vctx;
+    else
+        return NULL;
+}
+
+static ibv_flow *wrap_ibv_create_flow(ibv_qp *qp, ibv_flow_attr *flow_attr)
+{
+    errno = 0;
+    ibv_flow *flow = ibv_create_flow(qp, flow_attr);
+    if (!flow && (errno == 0 || errno == EOPNOTSUPP))
+    {
+        const verbs_context_fixed *vctx = get_verbs_context_fixed(qp->context);
+        if (vctx && !vctx->old_ibv_create_flow && vctx->ibv_create_flow)
+            flow = vctx->ibv_create_flow(qp, flow_attr);
+        else if (errno == 0)
+            errno = EOPNOTSUPP; // old versions of ibv_create_flow neglect to set errno
+    }
+    return flow;
+}
+
+static int wrap_ibv_destroy_flow(ibv_flow *flow)
+{
+    errno = 0;
+    int result = ibv_destroy_flow(flow);
+    /* While ibv_destroy_flow is supposed to return an errno on failure, the
+     * header files have in the past returned negated error numbers.
+     */
+    if (result != 0)
+    {
+        if (std::abs(result) == ENOSYS || std::abs(result) == EOPNOTSUPP)
+        {
+            const verbs_context_fixed *vctx = get_verbs_context_fixed(flow->context);
+            if (vctx && !vctx->old_ibv_destroy_flow && vctx->ibv_destroy_flow)
+                result = vctx->ibv_destroy_flow(flow);
+        }
+    }
+    return result;
+}
+
+void ibv_flow_deleter::operator()(ibv_flow *flow)
+{
+    wrap_ibv_destroy_flow(flow);
+}
+
 } // namespace detail
 
 rdma_event_channel_t::rdma_event_channel_t()
@@ -296,7 +370,13 @@ ibv_qp_t::ibv_qp_t(const ibv_pd_t &pd, ibv_qp_init_attr *init_attr)
     errno = 0;
     ibv_qp *qp = ibv_create_qp(pd.get(), init_attr);
     if (!qp)
-        throw_errno("ibv_create_qp failed");
+    {
+        if (errno == EINVAL && init_attr->qp_type == IBV_QPT_RAW_PACKET)
+            throw_errno(
+                "ibv_create_qp failed (could be a permission problem - do you have CAP_NET_RAW?)");
+        else
+            throw_errno("ibv_create_qp failed");
+    }
     reset(qp);
 }
 
@@ -365,8 +445,7 @@ void ibv_qp_t::post_send(ibv_send_wr *wr)
 
 ibv_flow_t::ibv_flow_t(const ibv_qp_t &qp, ibv_flow_attr *flow_attr)
 {
-    errno = 0;
-    ibv_flow *flow = ibv_create_flow(qp.get(), flow_attr);
+    ibv_flow *flow = detail::wrap_ibv_create_flow(qp.get(), flow_attr);
     if (!flow)
         throw_errno("ibv_create_flow failed");
     reset(flow);
diff --git a/src/recv_udp_ibv.cpp b/src/recv_udp_ibv.cpp
index 58c97fabe..259d4b509 100644
--- a/src/recv_udp_ibv.cpp
+++ b/src/recv_udp_ibv.cpp
@@ -126,6 +126,8 @@ static std::size_t compute_n_slots(const rdma_cm_id_t &cm_id, std::size_t buffer
 {
     bool reduced = false;
     ibv_device_attr attr = cm_id.query_device();
+    if (!(attr.device_cap_flags & IBV_DEVICE_MANAGED_FLOW_STEERING))
+        throw std::invalid_argument("This device does not support flow steering");
     if (attr.max_mr_size < max_raw_size)
         throw std::invalid_argument("Packet size is larger than biggest MR supported by device");
     if (attr.max_mr_size < buffer_size)
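
Usage sketch: the doc changes above describe how to drive this receive path
from Python. For reference, a receiver using it looks roughly like the
following. This is a minimal sketch assuming the spead2 2.1 Python API as
documented above; the interface name ``eth0``, the multicast group
``239.1.2.3`` and port 8888 are placeholders::

    import netifaces
    import spead2
    import spead2.recv

    # add_udp_ibv_reader needs the interface's IP address, not its name;
    # netifaces can look it up ('eth0' is a placeholder).
    interface_address = netifaces.ifaddresses('eth0')[netifaces.AF_INET][0]['addr']

    thread_pool = spead2.ThreadPool()
    stream = spead2.recv.Stream(thread_pool)
    # Subscribe to the multicast group and receive it through the verbs API.
    # Without the MLNX OFED module options described above, the process needs
    # CAP_NET_RAW for the raw-packet QP creation to succeed.
    stream.add_udp_ibv_reader('239.1.2.3', 8888, interface_address)

    item_group = spead2.ItemGroup()
    for heap in stream:
        # Apply each heap's descriptors and values to the item group.
        for item in item_group.update(heap).values():
            print(item.name, item.value)

Alternatively, per the "Environment variables" section, an unmodified
application that calls ``add_udp_reader`` can be switched to this code path
by setting :envvar:`SPEAD2_IBV_INTERFACE` to the interface's IP address.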