-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
393 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
// Copyright 2010-2022 Google LLC | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
// Provides functions and data structures that make it easier to work with | ||
// aligned memory: | ||
// | ||
// - AlignedAllocator<T, n>, an extension of std::allocator<T> that also takes | ||
// an explicit memory alignment parameter. The memory blocks returned by the | ||
// allocator are aligned to this number of bytes, i.e. the address of the | ||
// beginning of the block will be N * alignment_bytes for some N. | ||
// - AlignedVector<>, a specialization of std::vector<> that uses the aligned
// allocator to create blocks with explicit alignment.
// | ||
// - AlignUp and AlignDown are functions that align a pointer to the given | ||
// number of bytes. | ||
|
||
#ifndef OR_TOOLS_UTIL_ALIGNED_MEMORY_H_ | ||
#define OR_TOOLS_UTIL_ALIGNED_MEMORY_H_ | ||
|
||
#include <cstddef> | ||
#include <cstdint> | ||
#include <vector> | ||
|
||
#include "ortools/util/aligned_memory_internal.h" | ||
|
||
namespace operations_research { | ||
|
||
// Functions for working with pointers and rounding them up and down to a given | ||
// alignment. | ||
|
||
// Returns the nearest greater or equal address that is a multiple of
// alignment_bytes. When ptr is already aligned to alignment_bytes, returns it
// unchanged.
//
// The address arithmetic is done on std::uintptr_t (an unsigned type), the
// same representation that IsAligned() uses, so that addresses are never
// routed through a signed integer.
template <size_t alignment_bytes, typename Value>
inline Value* AlignUp(Value* ptr) {
  const std::uintptr_t int_ptr = reinterpret_cast<std::uintptr_t>(ptr);
  // Number of bytes by which `ptr` exceeds the previous aligned address.
  const std::uintptr_t misalignment = int_ptr % alignment_bytes;
  if (misalignment == 0) return ptr;
  // Step back to the previous aligned address, then forward by one alignment.
  return reinterpret_cast<Value*>(int_ptr - misalignment + alignment_bytes);
}
|
||
// Returns the nearest smaller or equal address that is a multiple of
// alignment_bytes. When ptr is already aligned to alignment_bytes, returns it
// unchanged.
//
// The address arithmetic is done on std::uintptr_t (an unsigned type) so that
// the remainder below is never computed on a mix of signed and unsigned
// operands.
template <size_t alignment_bytes, typename Value>
inline Value* AlignDown(Value* ptr) {
  const std::uintptr_t int_ptr = reinterpret_cast<std::uintptr_t>(ptr);
  // Number of bytes by which `ptr` exceeds the previous aligned address.
  const std::uintptr_t misalignment = int_ptr % alignment_bytes;
  return reinterpret_cast<Value*>(int_ptr - misalignment);
}
|
||
// Checks whether the address held by `ptr` is a multiple of `alignment_bytes`.
// Returns true when it is, false otherwise.
template <size_t alignment_bytes, typename Value>
inline bool IsAligned(Value* ptr) {
  const auto address = reinterpret_cast<std::uintptr_t>(ptr);
  const auto remainder = address % alignment_bytes;
  return remainder == 0;
}
|
||
// Support for aligned containers in STL. | ||
|
||
// An allocator that always aligns its memory to `alignment_bytes`.
//
// This is internal::AllocatorWithAlignment instantiated with a misalignment of
// zero bytes (the last template argument below), so the addresses it returns
// are multiples of `alignment_bytes`.
template <typename T, size_t alignment_bytes> | ||
using AlignedAllocator = | ||
internal::AllocatorWithAlignment<T, alignment_bytes, 0>; | ||
|
||
// A version of std::vector<T> whose data() pointer is always aligned to
// `alignment_bytes`.
//
// The alignment comes from using AlignedAllocator<T, alignment_bytes> as the
// vector's allocator. Because the allocator is part of the vector type, an
// AlignedVector<T, n> is a different type from a plain std::vector<T>.
template <typename T, size_t alignment_bytes> | ||
using AlignedVector = std::vector<T, AlignedAllocator<T, alignment_bytes>>; | ||
|
||
// Intentionally misaligned containers for testing correctness and performance
// of code that may depend on a certain alignment.
namespace use_only_in_tests { | ||
|
||
// A version of AlignedAllocator for testing purposes that adds intentional
// misalignment. The returned address has the form
// alignment_bytes * N + misalignment_bytes.
//
// internal::AllocatorWithAlignment static_asserts that `misalignment_bytes` is
// smaller than `alignment_bytes` (unless `alignment_bytes` is zero).
template <typename T, size_t alignment_bytes, size_t misalignment_bytes> | ||
using MisalignedAllocator = | ||
internal::AllocatorWithAlignment<T, alignment_bytes, misalignment_bytes>; | ||
|
||
// A specialization of std::vector<> that uses MisalignedAllocator with the
// given parameters, i.e. its storage is obtained from an allocator that
// offsets every block by `misalignment_bytes`.
template <typename T, size_t alignment_bytes, size_t misalignment_bytes> | ||
using MisalignedVector = | ||
std::vector<T, MisalignedAllocator<T, alignment_bytes, misalignment_bytes>>; | ||
|
||
} // namespace use_only_in_tests | ||
|
||
} // namespace operations_research | ||
|
||
#endif // OR_TOOLS_UTIL_ALIGNED_MEMORY_H_ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
// Copyright 2010-2022 Google LLC | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
#ifndef OR_TOOLS_UTIL_ALIGNED_MEMORY_INTERNAL_H_ | ||
#define OR_TOOLS_UTIL_ALIGNED_MEMORY_INTERNAL_H_ | ||
|
||
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <memory>
#include <new>
|
||
#include "ortools/base/mathutil.h" | ||
|
||
namespace operations_research { | ||
|
||
namespace internal { | ||
|
||
template <typename T, size_t alignment_bytes, size_t misalignment_bytes> | ||
struct AllocatorWithAlignment : public std::allocator<T> { | ||
// Allocates memory for num_items items of type T. The memory must be freed | ||
// using deallocate(); using it with free() or `delete` might cause unexpected | ||
// behavior when misalignment is used. | ||
T* allocate(size_t num_items) { | ||
// Having misalignment_bytes >= alignment_bytes is useless, because all | ||
// misalignments are equivalent modulo `alignment_bytes`. Disallowing it | ||
// allows us to simplify the code below. | ||
static_assert(alignment_bytes == 0 || misalignment_bytes < alignment_bytes); | ||
|
||
// `std::aligned_alloc(alignment, size)` requires that `size` is a multiple | ||
// of `alignment`, and might return a nullptr when this is not respected. To | ||
// be safe, we round the number of bytes up to alignment. | ||
const size_t num_required_bytes = | ||
misalignment_bytes + num_items * sizeof(T); | ||
|
||
const size_t num_allocated_bytes = | ||
MathUtil::RoundUpTo(num_required_bytes, alignment_bytes); | ||
|
||
std::uintptr_t ptr = reinterpret_cast<std::uintptr_t>( | ||
std::aligned_alloc(alignment_bytes, num_allocated_bytes)); | ||
return reinterpret_cast<T*>(ptr + misalignment_bytes); | ||
} | ||
// A version of allocate() that takes a hint; we just ignore the hint. | ||
T* allocate(size_t n, const void*) { return allocate(n); } | ||
|
||
// Frees memory allocated by allocate(). | ||
void deallocate(T* p, size_t) { | ||
std::uintptr_t aligned_pointer = | ||
reinterpret_cast<std::uintptr_t>(p) - misalignment_bytes; | ||
free(reinterpret_cast<void*>(aligned_pointer)); | ||
} | ||
|
||
// Rebind must be specialized to produce AllocatorWithAlignment and not | ||
// std::allocator. It uses the same alignment and misalignment as its source. | ||
template <typename U> | ||
struct rebind { | ||
using other = | ||
AllocatorWithAlignment<U, alignment_bytes, misalignment_bytes>; | ||
}; | ||
}; | ||
|
||
} // namespace internal | ||
|
||
} // namespace operations_research | ||
|
||
#endif // OR_TOOLS_UTIL_ALIGNED_MEMORY_INTERNAL_H_ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
// Copyright 2010-2022 Google LLC | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
// Fast summation of arrays (vectors, spans) of numbers. | ||
// | ||
// Speed: up to 2x faster than Eigen for float arrays with ~100 elements or more | ||
// (as of 2023-05). | ||
// Precision: Better or comparable precision to std::accumulate<> on the same | ||
// value type. That said, the precision is inferior to precise sum | ||
// algorithm such as ::AccurateSum. | ||
|
||
#ifndef OR_TOOLS_UTIL_VECTOR_SUM_H_ | ||
#define OR_TOOLS_UTIL_VECTOR_SUM_H_ | ||
|
||
#include "absl/types/span.h" | ||
#include "ortools/util/vector_sum_internal.h" | ||
|
||
namespace operations_research { | ||
|
||
// Returns the sum of all elements of `values`. The caller guarantees that the
// first element of `values` is aligned to 16 bytes; that guarantee is
// forwarded to the internal implementation as `assume_aligned_at_start`.
inline float AlignedVectorSum(absl::Span<const float> values) {
  constexpr bool kAssumeAlignedAtStart = true;
  return internal::VectorSum</*block_size=*/4, /*num_blocks_per_iteration=*/4,
                             kAssumeAlignedAtStart>(values);
}
|
||
// Returns the sum of all elements of `values`. No assumption is made about the
// alignment of the data; `assume_aligned_at_start` is passed as false to the
// internal implementation.
inline float VectorSum(absl::Span<const float> values) {
  constexpr bool kAssumeAlignedAtStart = false;
  return internal::VectorSum</*block_size=*/4, /*num_blocks_per_iteration=*/4,
                             kAssumeAlignedAtStart>(values);
}
|
||
} // namespace operations_research | ||
|
||
#endif // OR_TOOLS_UTIL_VECTOR_SUM_H_ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,175 @@ | ||
// Copyright 2010-2022 Google LLC | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
#ifndef OR_TOOLS_UTIL_VECTOR_SUM_INTERNAL_H_ | ||
#define OR_TOOLS_UTIL_VECTOR_SUM_INTERNAL_H_ | ||
|
||
#include <algorithm> | ||
#include <cstddef> | ||
#include <iterator> | ||
#include <numeric> | ||
|
||
#include "absl/base/attributes.h" | ||
#include "absl/types/span.h" | ||
#include "ortools/util/aligned_memory.h" | ||
|
||
namespace operations_research { | ||
namespace internal { | ||
|
||
// A contiguous block of memory that contains `size` values of type `Value`, and
// the first value is aligned to `alignment` bytes. The values are
// zero-initialized by default.
template <typename Value, size_t size, size_t alignment = sizeof(Value) * size>
struct alignas(alignment) AlignedBlock {
  Value values[size] = {};

  // Returns the sum of all `size` values of the block.
  //
  // The values are copied to a scratch array that is then folded in halves:
  // each round adds the upper half of the active range onto the lower half.
  // When the active range has an odd width, its last element has no partner
  // in the lower half, so it is added to sum[0] explicitly; without this, the
  // element would be dropped for sizes that are not a power of two. For
  // power-of-two sizes the order of additions is the plain halving order.
  Value Sum() const {
    alignas(alignment) Value sum[size];
    std::copy(std::begin(values), std::end(values), std::begin(sum));
    for (size_t width = size; width > 1; width /= 2) {
      const size_t middle = width / 2;
      for (size_t j = 0; j < middle; ++j) {
        sum[j] += sum[middle + j];
      }
      // Odd width: index `width - 1` was not covered by the loop above.
      if (width % 2 != 0) {
        sum[0] += sum[width - 1];
      }
    }
    return sum[0];
  }
};
|
||
// In-place addition for two AlignedBlock values. Adds `in` to `out`, storing | ||
// the value in `out`. Unless something steps in, this compiles into a single | ||
// `*addp*` instruction. | ||
template <typename Value, size_t size> | ||
void AddInPlace(AlignedBlock<Value, size>& out, | ||
const AlignedBlock<Value, size>& in) { | ||
for (int i = 0; i < size; ++i) { | ||
out.values[i] += in.values[i]; | ||
} | ||
} | ||
|
||
// Computes the sum of `num_blocks` aligned blocks. Proceeds in three phases:
// 1. Parallel sum with N = `num_blocks_per_iteration` independent block-sized
//    accumulators. At the end, accumulator i contains partial sums at indices
//    i, i + N, i + 2*N, ..., i + M*N, where M is the largest number of blocks
//    such that i+M*N < num_blocks for all i.
// 2. Parallel addition of remaining blocks. All remaining blocks are added to
//    accumulators 0, ..., num_remaining_blocks - 1.
// 3. Sum of accumulators: all accumulators are added together. The result is a
//    single block-sized accumulator that is returned to the caller.
//
// Parameters:
//   blocks:     pointer to the first of `num_blocks` consecutive blocks.
//   num_blocks: the number of blocks to add; may be zero, in which case the
//               returned block holds the zero-initialized accumulators.
// Returns a block whose value at index k is the sum of the values at index k
// of all input blocks; the caller reduces it to a scalar (e.g. with Sum()).
//
// The code of the function was specifically tuned for 32-bit floating point
// values, and works best with block_size = 4 and num_blocks_per_iteration = 4.
// It will likely work with other types, but may require extra care and possibly
// different parameters.
//
// NOTE(user): As of 2023-04-28, Clang's auto-vectorizer is *very* brittle
// and most attempts to reduce the last accumulator from step 3 into a single
// value prevents the rest of the function from being vectorized. To mitigate
// this behavior we return the whole block (which should normally fit into a
// single vector register). We also mark the function with
// `ABSL_ATTRIBUTE_NOINLINE` to prevent the inliner from merging this
// function with any additional code that would prevent vectorization.
template <size_t block_size, size_t num_blocks_per_iteration, typename Value> | ||
AlignedBlock<Value, block_size> ABSL_ATTRIBUTE_NOINLINE AlignedBlockSum( | ||
const AlignedBlock<Value, block_size>* blocks, size_t num_blocks) { | ||
using Block = AlignedBlock<Value, block_size>; | ||
// Consecutive blocks must form one gap-free array of values, i.e. a block
// must not contain padding after its values.
static_assert(sizeof(Block[2]) == sizeof(Block::values) * 2, | ||
"The values in the block are not packed."); | ||
|
||
// The accumulators. They start out as zeros, because AlignedBlock
// value-initializes its `values` member.
AlignedBlock<Value, block_size> sum[num_blocks_per_iteration]; | ||
|
||
// `packed_blocks` is the largest multiple of num_blocks_per_iteration that is
// not greater than num_blocks; `leftover_blocks` is the rest.
// NOTE(review): both counts are narrowed from size_t to int; this presumably
// relies on num_blocks fitting into an int - confirm against callers.
const int leftover_blocks = num_blocks % num_blocks_per_iteration; | ||
const int packed_blocks = num_blocks - leftover_blocks; | ||
|
||
// Points one past the last block handled by phase 1, which is the first block
// handled by phase 2.
const AlignedBlock<Value, block_size>* aligned_block_end = | ||
blocks + packed_blocks; | ||
|
||
// Phase 1: Parallel sum of the bulk of the data.
// The first group of blocks is copied into the accumulators rather than
// added to them; the loop below therefore starts at the second group.
if (packed_blocks >= num_blocks_per_iteration) { | ||
std::copy(blocks, blocks + num_blocks_per_iteration, sum); | ||
} | ||
for (int i = num_blocks_per_iteration; i < packed_blocks; | ||
i += num_blocks_per_iteration) { | ||
for (int j = 0; j < num_blocks_per_iteration; ++j) { | ||
AddInPlace(sum[j], blocks[i + j]); | ||
} | ||
} | ||
|
||
// Phase 2: Semi-parallel sum of the remaining up to
// num_blocks_per_iteration - 1 blocks.
for (int i = 0; i < leftover_blocks; ++i) { | ||
AddInPlace(sum[i], aligned_block_end[i]); | ||
} | ||
|
||
// Phase 3: Reduce the accumulator blocks to a single block.
// NOTE(user): When this code is auto-vectorized correctly, the initial
// copy is a no-op, and the for loop below translates to
// num_blocks_per_iteration - 1 vector add instructions. In 2023-05, I
// experimented with other versions (e.g. using sum[0] as the target or making
// res a const reference to sum[0], but in most cases they broke vectorization
// of the whole function).
AlignedBlock<Value, block_size> res = sum[0]; | ||
for (int i = 1; i < num_blocks_per_iteration; ++i) { | ||
AddInPlace(res, sum[i]); | ||
} | ||
|
||
return res; | ||
} | ||
|
||
// Computes the sum of values in `values`, by adding `num_blocks_per_iteration` | ||
// blocks of `block_size` values. | ||
// By default, the sum does not make any assumptions about the size or alignment | ||
// of `values`. When the first item of `values` is known to be aligned to | ||
// `block_size * sizeof(Value)` bytes, `assume_aligned_at_start` can be used to | ||
// save a small amount of time. | ||
template <size_t block_size = 4, size_t num_blocks_per_iteration = 4, | ||
bool assume_aligned_at_start = false, typename Value> | ||
Value VectorSum(absl::Span<const Value> values) { | ||
using Block = AlignedBlock<Value, block_size>; | ||
const Value* start_ptr = values.data(); | ||
const int size = values.size(); | ||
// With less than two blocks, there's not a lot to vectorize, and a simple | ||
// sequential sum is usually faster. | ||
if (size == 0) return Value{0}; | ||
if (size < 2 * block_size) { | ||
return std::accumulate(start_ptr + 1, start_ptr + size, *start_ptr); | ||
} | ||
|
||
if (assume_aligned_at_start) { | ||
ABSL_ASSUME(reinterpret_cast<std::uintptr_t>(start_ptr) % alignof(Block) == | ||
0); | ||
} | ||
const Value* aligned_start_ptr = | ||
assume_aligned_at_start ? start_ptr : AlignUp<alignof(Block)>(start_ptr); | ||
const Block* blocks = reinterpret_cast<const Block*>(aligned_start_ptr); | ||
const Value* end_ptr = start_ptr + size; | ||
const Value* aligned_end_ptr = AlignDown<alignof(Block)>(end_ptr); | ||
ABSL_ASSUME(aligned_start_ptr <= aligned_end_ptr); | ||
const size_t num_blocks = (aligned_end_ptr - aligned_start_ptr) / block_size; | ||
ABSL_ASSUME( | ||
reinterpret_cast<std::uintptr_t>(aligned_end_ptr) % alignof(Block) == 0); | ||
|
||
Value leading_items_sum{}; | ||
if (!assume_aligned_at_start) { | ||
leading_items_sum = std::accumulate(start_ptr, aligned_start_ptr, Value{}); | ||
} | ||
Block res = | ||
AlignedBlockSum<block_size, num_blocks_per_iteration>(blocks, num_blocks); | ||
Value block_sum = res.Sum(); | ||
return std::accumulate(aligned_end_ptr, end_ptr, | ||
block_sum + leading_items_sum); | ||
} | ||
|
||
} // namespace internal | ||
} // namespace operations_research | ||
|
||
#endif // OR_TOOLS_UTIL_VECTOR_SUM_INTERNAL_H_ |