diff --git a/doc/changelog.rst b/doc/changelog.rst index c5b6a4dc9..edcb47316 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -8,6 +8,8 @@ Changelog - Make the ibverbs sender compatible with `PeerDirect`_. - Add examples programs showing integration with `gdrcopy`_ and `PeerDirect`_. +- Always use SFENCE at the end of :cpp:func:`memcpy_nontemporal` so that it is + appropriate for use with `gdrcopy`_. - Fix a memory leak when receiving with ibverbs. .. _gdrcopy: https://github.com/NVIDIA/gdrcopy diff --git a/src/common_memcpy.cpp b/src/common_memcpy.cpp index e957f702d..7295eaa52 100644 --- a/src/common_memcpy.cpp +++ b/src/common_memcpy.cpp @@ -43,7 +43,14 @@ void *memcpy_nontemporal(void * __restrict__ dest, const void * __restrict__ src { if (head >= n) { - return std::memcpy(dest_c, src_c, n); + std::memcpy(dest_c, src_c, n); + /* Not normally required, but if the destination is + * write-combining memory then this will flush the combining + * buffers. That may be necessary if the memory is actually on + * a GPU or other accelerator. + */ + _mm_sfence(); + return dest; } std::memcpy(dest_c, src_c, head); dest_c += head;