Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add non-temporal memcpy support for ARM #361

Merged
merged 8 commits into from
Nov 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .ci/all-builds.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ for python in true false; do
meson compile
done
done
# Similar recvmmsg and gro don't interact with either of the above
# Similarly recvmmsg and gro don't interact with either of the above
for recvmmsg in auto disabled; do
for gro in auto disabled; do
meson configure -Dpython=$python -Drecvmmsg=$recvmmsg -Dgro=$gro
Expand Down
58 changes: 58 additions & 0 deletions .ci/setup-flags.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/bin/bash

# Produce flags to meson to force certain features. This ensures that the CI
# builds are actually testing all the optional features, and not failing to
# include them because the detection code is broken.
#
# Usage:
#   setup-flags.sh            Print the flags separated by spaces, suitable
#                             for passing straight to `meson setup`.
#   setup-flags.sh --python   Print each flag wrapped as
#                             --config-settings=setup-args=<flag>, suitable
#                             for passing to `pip install`.
#
# The selected flags are also reported on stderr so that they appear in the
# CI log without polluting the captured stdout.

# Flags that are forced on every platform.
flags=(
    "--native-file=ci.ini"
    "-Dwerror=true"
    "-Dtools=enabled"
    "-Dpcap=enabled"
    "-Dunit_test=enabled"
)

# Flags that are only forced on Linux builds. (-Dpcap=enabled is not repeated
# here because it is already part of the unconditional list above.)
if [ "$(uname)" = "Linux" ]; then
    flags+=(
        "-Dibv=enabled"
        "-Dmlx5dv=enabled"
        "-Dibv_hw_rate_limit=enabled"
        "-Dcap=enabled"
        "-Drecvmmsg=enabled"
        "-Dsendmmsg=enabled"
        "-Dgso=enabled"
        "-Dgro=enabled"
        "-Dpthread_setaffinity_np=enabled"
        "-Dposix_semaphores=enabled"
        "-Deventfd=enabled"
    )
fi

# Architecture-specific non-temporal copy implementations.
case "$(arch)" in
    x86_64)
        flags+=(
            "-Dsse2_stream=enabled"
            "-Davx_stream=enabled"
            "-Davx512_stream=enabled"
        )
        ;;
    aarch64)
        # Note: Apple uses "arm64" while Linux uses "aarch64". Apple doesn't
        # seem to support SVE in any hardware (up to M4) and our detection
        # code is Linux-specific, so we don't try to force this for macOS
        # builds.
        flags+=("-Dsve_stream=enabled")
        ;;
esac

echo "Setting flags ${flags[*]}" 1>&2

if [ "$1" = "--python" ]; then
    # printf re-applies the format to every array element, so each flag is
    # emitted with its own prefix and a trailing space, followed by a newline.
    printf -- '--config-settings=setup-args=%s ' "${flags[@]}"
    printf '\n'
else
    echo "${flags[*]}"
fi
37 changes: 4 additions & 33 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,14 @@ jobs:
include:
- os: ubuntu-20.04
cxx: g++-9
extras: enabled
- os: ubuntu-22.04
cxx: g++-12
extras: enabled
- os: ubuntu-20.04
cxx: clang++-10
extras: enabled
- os: ubuntu-22.04
cxx: clang++-14
extras: enabled
- os: macos-14
cxx: clang++
extras: disabled
runs-on: ${{ matrix.os }}
env:
SCCACHE_GHA_ENABLED: "true"
Expand All @@ -44,14 +39,7 @@ jobs:
- name: Install build requirements
run: ./.ci/py-build-requirements.sh
- name: Set up build directory
run: >-
meson setup build
--native-file=ci.ini
-Dwerror=true
-Dauto_features=${{ matrix.extras }}
-Dcuda=disabled
-Dgdrapi=disabled
-Dfmv="$(if [[ '${{ matrix.cxx }}' == g++* ]]; then echo enabled; else echo disabled; fi)"
run: meson setup build $(.ci/setup-flags.sh)
- name: Build
run: meson compile -C build
- name: Run tests
Expand All @@ -66,7 +54,6 @@ jobs:
cc: gcc-9
cxx: g++-9
python-version: '3.9'
extras: enabled
# Disabled for now because numba/llvmlite (needed for tests)
# doesn't have a pypy wheel, and it's not worth the effort to
# install all the dependencies needed to make that work.
Expand All @@ -77,22 +64,18 @@ jobs:
cc: gcc-12
cxx: g++-12
python-version: '3.12'
extras: enabled
- os: ubuntu-20.04
cc: clang-10
cxx: clang++-10
python-version: '3.12'
extras: enabled
- os: ubuntu-22.04
cc: clang-14
cxx: clang++-14
python-version: '3.12'
extras: enabled
- os: macos-14
cc: clang
cxx: clang++
python-version: '3.12'
extras: disabled
runs-on: ${{ matrix.os }}
env:
SCCACHE_GHA_ENABLED: "true"
Expand All @@ -114,13 +97,7 @@ jobs:
env:
CC: ${{ matrix.cc }} # Do not pass -Werror when building dependencies
- name: Install Python package
run: >-
pip install -v
--config-settings=setup-args=--native-file=ci.ini
--config-settings=setup-args=-Dwerror=true
--config-settings=setup-args=-Dauto_features=${{ matrix.extras }}
--config-settings=setup-args=-Dfmv="$(if [[ '${{ matrix.cxx }}' == g++* ]]; then echo enabled; else echo disabled; fi)"
.
run: pip install -v $(.ci/setup-flags.sh --python) .
- name: Run tests
run: pytest -v -ra # -ra summarises the reasons for skipping or failing tests
- name: Run shutdown tests
Expand Down Expand Up @@ -168,9 +145,7 @@ jobs:
# the ephemeral build directory. So do a non-isolated editable
# install instead.
run: >-
pip install -v
--config-settings=setup-args=-Dwerror=true
--config-settings=setup-args=-Dauto_features=enabled
pip install -v $(.ci/setup-flags.sh --python)
--config-settings=setup-args=-Dbuildtype=debug
--config-settings=setup-args=-Db_coverage=true
--no-build-isolation
Expand All @@ -179,11 +154,7 @@ jobs:
run: pytest -v -ra && ./.ci/py-tests-shutdown.sh
- name: Set up C++ build
run: >-
meson setup build
-Dwerror=true
-Dauto_features=enabled
-Dcuda=disabled
-Dgdrapi=disabled
meson setup build $(.ci/setup-flags.sh)
-Dbuildtype=debug
-Db_coverage=true
- name: Build C++
Expand Down
2 changes: 1 addition & 1 deletion doc/py-recv.rst
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ properties after construction.
Set the method used to copy data from the network to the heap. The
default is :py:const:`~spead2.MEMCPY_STD`. This can be changed to
:py:const:`~spead2.MEMCPY_NONTEMPORAL`, which writes to the destination with a
non-temporal cache hint (if SSE2 is enabled at compile time). This can
non-temporal cache hint (if CPU support is available). This can
improve performance with large heaps if the data is not going to be used
immediately, by reducing cache pollution. Be careful when benchmarking:
receiving heaps will generally appear faster, but it can slow down
Expand Down
7 changes: 7 additions & 0 deletions include/spead2/common_features.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,13 @@
# define SPEAD2_USE_AVX512_STREAM 0
#endif

/* Similarly for AArch64 features: the value substituted into this template
 * is only honoured when compiling for AArch64 (__aarch64__ is defined). On
 * any other target SVE streaming is forced off.
 */
#if defined(__aarch64__)
# define SPEAD2_USE_SVE_STREAM @SPEAD2_USE_SVE_STREAM@
#else
# define SPEAD2_USE_SVE_STREAM 0
#endif

#define SPEAD2_USE_POSIX_SEMAPHORES @SPEAD2_USE_POSIX_SEMAPHORES@
#define SPEAD2_USE_PCAP @SPEAD2_USE_PCAP@

Expand Down
11 changes: 11 additions & 0 deletions include/spead2/common_memcpy.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,17 @@
*
* If compiler and run-time support is not available, this falls back to
* regular memcpy.
*
* On AArch64, this does not carry a dependency from the source address to
* the source data. In other words, if you do the following, proper ordering
* is not guaranteed:
*
* 1. Write data to an array;
* 2. Write the address of the array to an atomic pointer;
* 3. In another thread, read the pointer with @c std::memory_order_consume and
* pass it to this function.
*
* Rather use @c std::memory_order_acquire in this case.
*/
namespace spead2
{
Expand Down
17 changes: 17 additions & 0 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,22 @@ use_avx512_stream = get_option('avx512_stream').require(
name : 'AVX-512 streaming intrinsic'
)
).allowed()
# Decide whether SVE streaming support can be used. The 'sve_stream' option is
# only allowed when the compiler accepts a probe that:
#   - includes <arm_sve.h> and <sys/auxv.h>;
#   - compiles a function with the "+sve" target attribute;
#   - queries getauxval(AT_HWCAP) for HWCAP_SVE (run-time detection); and
#   - uses the SVE non-temporal load intrinsic svldnt1_u8.
# The probe is only compiled, never run, so the null pointer is harmless.
use_sve_stream = get_option('sve_stream').require(
compiler.compiles(
'''
#include <arm_sve.h>
#include <sys/auxv.h>

[[gnu::target("+sve")]]
void foo()
{
bool have_sve = getauxval(AT_HWCAP) & HWCAP_SVE;
svldnt1_u8(svptrue_b8(), (const unsigned char *) 0);
}
''',
name : 'SVE streaming intrinsic'
)
).allowed()

# Write configuration data
conf = configuration_data()
Expand All @@ -283,6 +299,7 @@ conf.set10('SPEAD2_USE_FMV', use_fmv)
conf.set10('SPEAD2_USE_SSE2_STREAM', use_sse2_stream)
conf.set10('SPEAD2_USE_AVX_STREAM', use_avx_stream)
conf.set10('SPEAD2_USE_AVX512_STREAM', use_avx512_stream)
conf.set10('SPEAD2_USE_SVE_STREAM', use_sve_stream)
conf.set10('SPEAD2_USE_PCAP', pcap_dep.found())
conf.set('SPEAD2_MAX_LOG_LEVEL', '(spead2::log_level::' + get_option('max_log_level') + ')')

Expand Down
1 change: 1 addition & 0 deletions meson.options
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ option('fmv', type : 'feature', description : 'Use function multi-versioning')
option('sse2_stream', type : 'feature', description : 'Use SSE2 for non-temporal stores')
option('avx_stream', type : 'feature', description : 'Use AVX for non-temporal stores')
option('avx512_stream', type : 'feature', description : 'Use AVX-512 for non-temporal stores')
option('sve_stream', type : 'feature', description : 'Use SVE for non-temporal stores')
option('cuda', type : 'feature', description : 'Build CUDA examples')
option('gdrapi', type : 'feature', description : 'Build gdrcopy examples')
option('unit_test', type : 'feature', description : 'Build the unit tests')
Expand Down
Loading
Loading