From 5182f7bea641eba633c6496463fce09a194e4ea6 Mon Sep 17 00:00:00 2001 From: Bruce Merry Date: Mon, 23 Nov 2020 15:58:51 +0200 Subject: [PATCH] Always SFENCE at end of memcpy_nontemporal It might be needed for correct ordering with write-combining memory, such as that used by gdrcopy. --- doc/changelog.rst | 2 ++ src/common_memcpy.cpp | 9 ++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/doc/changelog.rst b/doc/changelog.rst index c5b6a4dc9..edcb47316 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -8,6 +8,8 @@ Changelog - Make the ibverbs sender compatible with `PeerDirect`_. - Add examples programs showing integration with `gdrcopy`_ and `PeerDirect`_. +- Always use SFENCE at end of :cpp:func:`memcpy_nontemporal` so that it is + appropriate for use with `gdrcopy`_. - Fix a memory leak when receiving with ibverbs. .. _gdrcopy: https://github.com/NVIDIA/gdrcopy diff --git a/src/common_memcpy.cpp b/src/common_memcpy.cpp index e957f702d..7295eaa52 100644 --- a/src/common_memcpy.cpp +++ b/src/common_memcpy.cpp @@ -43,7 +43,14 @@ void *memcpy_nontemporal(void * __restrict__ dest, const void * __restrict__ src { if (head >= n) { - return std::memcpy(dest_c, src_c, n); + std::memcpy(dest_c, src_c, n); + /* Not normally required, but if the destination is + * write-combining memory then this will flush the combining + * buffers. That may be necessary if the memory is actually on + * a GPU or other accelerator. + */ + _mm_sfence(); + return dest; } std::memcpy(dest_c, src_c, head); dest_c += head;