Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[libc++] Take advantage of trivial relocation in std::vector::erase #116268

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions libcxx/docs/ReleaseNotes/20.rst
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ Improvements and New Features
- The ``lexicographical_compare`` and ``ranges::lexicographical_compare`` algorithms have been optimized for trivially
equality comparable types, resulting in a performance improvement of up to 40x.

- The ``std::vector::erase`` function has been optimized for types that can be relocated trivially (such as ``std::string``),
yielding speed ups witnessed to be around 2x for these types (but subject to the use case).

- The ``_LIBCPP_ENABLE_CXX20_REMOVED_TEMPORARY_BUFFER`` macro has been added to make ``std::get_temporary_buffer`` and
``std::return_temporary_buffer`` available.

Expand Down
1 change: 1 addition & 0 deletions libcxx/include/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -551,6 +551,7 @@ set(files
__memory/construct_at.h
__memory/destruct_n.h
__memory/inout_ptr.h
__memory/is_trivially_allocator_relocatable.h
__memory/noexcept_move_assign_container.h
__memory/out_ptr.h
__memory/pointer_traits.h
Expand Down
49 changes: 49 additions & 0 deletions libcxx/include/__memory/is_trivially_allocator_relocatable.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef _LIBCPP___MEMORY_IS_TRIVIALLY_ALLOCATOR_RELOCATABLE_H
#define _LIBCPP___MEMORY_IS_TRIVIALLY_ALLOCATOR_RELOCATABLE_H

#include <__config>
#include <__memory/allocator_traits.h>
#include <__type_traits/integral_constant.h>
#include <__type_traits/is_trivially_relocatable.h>
#include <__type_traits/negation.h>

#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
#endif

_LIBCPP_BEGIN_NAMESPACE_STD

// A type is trivially allocator relocatable if the allocator's move construction and destruction
// don't do anything beyond calling the type's move constructor and destructor, and if the type
// itself is trivially relocatable.

template <class _Alloc, class _Type>
struct __allocator_has_trivial_move_construct : _Not<__has_construct<_Alloc, _Type*, _Type&&> > {};

template <class _Type>
struct __allocator_has_trivial_move_construct<allocator<_Type>, _Type> : true_type {};

template <class _Alloc, class _Tp>
struct __allocator_has_trivial_destroy : _Not<__has_destroy<_Alloc, _Tp*> > {};

template <class _Tp, class _Up>
struct __allocator_has_trivial_destroy<allocator<_Tp>, _Up> : true_type {};

template <class _Alloc, class _Tp>
struct __is_trivially_allocator_relocatable
: integral_constant<bool,
__allocator_has_trivial_move_construct<_Alloc, _Tp>::value &&
__allocator_has_trivial_destroy<_Alloc, _Tp>::value &&
__libcpp_is_trivially_relocatable<_Tp>::value> {};

_LIBCPP_END_NAMESPACE_STD

#endif // _LIBCPP___MEMORY_IS_TRIVIALLY_ALLOCATOR_RELOCATABLE_H
53 changes: 16 additions & 37 deletions libcxx/include/__memory/uninitialized_algorithms.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <__memory/addressof.h>
#include <__memory/allocator_traits.h>
#include <__memory/construct_at.h>
#include <__memory/is_trivially_allocator_relocatable.h>
#include <__memory/pointer_traits.h>
#include <__type_traits/enable_if.h>
#include <__type_traits/extent.h>
Expand Down Expand Up @@ -591,60 +592,38 @@ __uninitialized_allocator_copy(_Alloc& __alloc, _Iter1 __first1, _Sent1 __last1,
return std::__rewrap_iter(__first2, __result);
}

template <class _Alloc, class _Type>
struct __allocator_has_trivial_move_construct : _Not<__has_construct<_Alloc, _Type*, _Type&&> > {};

template <class _Type>
struct __allocator_has_trivial_move_construct<allocator<_Type>, _Type> : true_type {};

template <class _Alloc, class _Tp>
struct __allocator_has_trivial_destroy : _Not<__has_destroy<_Alloc, _Tp*> > {};

template <class _Tp, class _Up>
struct __allocator_has_trivial_destroy<allocator<_Tp>, _Up> : true_type {};

// __uninitialized_allocator_relocate relocates the objects in [__first, __last) into __result.
//
// Relocation means that the objects in [__first, __last) are placed into __result as-if by move-construct and destroy,
// except that the move constructor and destructor may never be called if they are known to be equivalent to a memcpy.
//
// Preconditions: __result doesn't contain any objects and [__first, __last) contains objects
// This algorithm works even if part of the resulting range overlaps with [__first, __last), as long as __result itself
// is not in [__first, last).
//
// Preconditions:
// - __result doesn't contain any objects and [__first, __last) contains objects
// - __result is not in [__first, __last) sd
// Postconditions: __result contains the objects from [__first, __last) and
// [__first, __last) doesn't contain any objects
//
// The strong exception guarantee is provided if any of the following are true:
// - is_nothrow_move_constructible<_ValueType>
// - is_copy_constructible<_ValueType>
// - __libcpp_is_trivially_relocatable<_ValueType>
template <class _Alloc, class _ContiguousIterator>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void __uninitialized_allocator_relocate(
_Alloc& __alloc, _ContiguousIterator __first, _ContiguousIterator __last, _ContiguousIterator __result) {
static_assert(__libcpp_is_contiguous_iterator<_ContiguousIterator>::value, "");
using _ValueType = typename iterator_traits<_ContiguousIterator>::value_type;
static_assert(__is_cpp17_move_insertable<_Alloc>::value,
"The specified type does not meet the requirements of Cpp17MoveInsertable");
if (__libcpp_is_constant_evaluated() || !__libcpp_is_trivially_relocatable<_ValueType>::value ||
!__allocator_has_trivial_move_construct<_Alloc, _ValueType>::value ||
!__allocator_has_trivial_destroy<_Alloc, _ValueType>::value) {
auto __destruct_first = __result;
auto __guard = std::__make_exception_guard(
_AllocatorDestroyRangeReverse<_Alloc, _ContiguousIterator>(__alloc, __destruct_first, __result));
auto __iter = __first;
while (__iter != __last) {
#if _LIBCPP_HAS_EXCEPTIONS
allocator_traits<_Alloc>::construct(__alloc, std::__to_address(__result), std::move_if_noexcept(*__iter));
#else
allocator_traits<_Alloc>::construct(__alloc, std::__to_address(__result), std::move(*__iter));
#endif
++__iter;
if (__libcpp_is_constant_evaluated() || !__is_trivially_allocator_relocatable<_Alloc, _ValueType>::value) {
while (__first != __last) {
allocator_traits<_Alloc>::construct(__alloc, std::__to_address(__result), std::move(*__first));
allocator_traits<_Alloc>::destroy(__alloc, std::__to_address(__first));
++__first;
++__result;
}
__guard.__complete();
std::__allocator_destroy(__alloc, __first, __last);
} else {
// Casting to void* to suppress clang complaining that this is technically UB.
__builtin_memcpy(static_cast<void*>(std::__to_address(__result)),
std::__to_address(__first),
sizeof(_ValueType) * (__last - __first));
__builtin_memmove(static_cast<void*>(std::__to_address(__result)),
std::__to_address(__first),
sizeof(_ValueType) * (__last - __first));
}
}

Expand Down
10 changes: 4 additions & 6 deletions libcxx/include/__type_traits/is_trivially_relocatable.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,12 @@ _LIBCPP_BEGIN_NAMESPACE_STD

// A type is trivially relocatable if a move construct + destroy of the original object is equivalent to
// `memcpy(dst, src, sizeof(T))`.

#if __has_builtin(__is_trivially_relocatable)
template <class _Tp, class = void>
struct __libcpp_is_trivially_relocatable : integral_constant<bool, __is_trivially_relocatable(_Tp)> {};
#else
//
// Note that we don't use Clang's __is_trivially_relocatable builtin because it doesn't honor the presence
// of non-trivial special members like assignment operators, or even a copy constructor, making it possible
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In particular, this change is required or else we fail the test I added in erase_iter.pass.cpp which counts the number of assignments. Because of this, and since the semantics of what it means to be trivially relocatable are still being actively debated, I think it makes sense for us to err on the safer side and use is_trivially_copyable for now.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TrackedAssignment has a trivial destructor and a trivial copy constructor, so it's trivially relocatable by destroying it and copy-constructing a new instance. Is the library code not permitted to do that? If not, would it make sense to have separate internal traits for "trivially assignment-relocatable" versus "trivially construction-relocatable" that only detect the case where you could relocate specifically using assignment versus specifically using construction?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for the delay, the last days have been busy. I believe what you are suggesting is essentially what http://wg21.link/P2687R9 introduced with the replaceable concept -- or were you aware of that concept but you had something else in mind?

Since I wrote this patch, my understanding has evolved and so did P2687, so I created this draft PR to discuss with the P2687 authors. That PR implements the currently proposed semantics for P2687 (although with a IMO better library API).

// to incorrectly optimize operations that should call user-provided operations instead.
template <class _Tp, class = void>
struct __libcpp_is_trivially_relocatable : is_trivially_copyable<_Tp> {};
#endif

template <class _Tp>
struct __libcpp_is_trivially_relocatable<_Tp,
Expand Down
56 changes: 32 additions & 24 deletions libcxx/include/__vector/vector.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include <__memory/allocator.h>
#include <__memory/allocator_traits.h>
#include <__memory/compressed_pair.h>
#include <__memory/is_trivially_allocator_relocatable.h>
#include <__memory/noexcept_move_assign_container.h>
#include <__memory/pointer_traits.h>
#include <__memory/swap_allocator.h>
Expand Down Expand Up @@ -515,8 +516,37 @@ class _LIBCPP_TEMPLATE_VIS vector {
}
#endif

_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __position);
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __first, const_iterator __last);
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __position) {
_LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
__position != end(), "vector::erase(iterator) called with a non-dereferenceable iterator");
return erase(__position, __position + 1);
}
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __cfirst, const_iterator __clast) {
_LIBCPP_ASSERT_VALID_INPUT_RANGE(__cfirst <= __clast, "vector::erase(first, last) called with invalid range");

iterator __first = begin() + std::distance(cbegin(), __cfirst);
iterator __last = begin() + std::distance(cbegin(), __clast);
if (__first == __last)
return __last;

auto __n = std::distance(__first, __last);

// When the value_type is trivially relocatable, we know that move-assignment followed by a destruction
// is equivalent to a memcpy, and we can elide calls to the move-assignment operator (which are mandated
// by the Standard) under the as-if rule. So instead, we destroy the range being erased and we relocate the
// tail of the vector into the created gap.
if (__is_trivially_allocator_relocatable<_Allocator, value_type>::value) {
std::__allocator_destroy(this->__alloc_, __first, __last);
std::__uninitialized_allocator_relocate(this->__alloc_, __last, end(), __first);
} else {
auto __new_end = std::move(__last, end(), __first);
std::__allocator_destroy(this->__alloc_, __new_end, end());
}

this->__end_ -= __n;
__annotate_shrink(size() + __n);
return __first;
}

_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT {
size_type __old_size = size();
Expand Down Expand Up @@ -1125,28 +1155,6 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 inline
#endif
}

template <class _Tp, class _Allocator>
_LIBCPP_CONSTEXPR_SINCE_CXX20 inline _LIBCPP_HIDE_FROM_ABI typename vector<_Tp, _Allocator>::iterator
vector<_Tp, _Allocator>::erase(const_iterator __position) {
_LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
__position != end(), "vector::erase(iterator) called with a non-dereferenceable iterator");
difference_type __ps = __position - cbegin();
pointer __p = this->__begin_ + __ps;
this->__destruct_at_end(std::move(__p + 1, this->__end_, __p));
return __make_iter(__p);
}

template <class _Tp, class _Allocator>
_LIBCPP_CONSTEXPR_SINCE_CXX20 typename vector<_Tp, _Allocator>::iterator
vector<_Tp, _Allocator>::erase(const_iterator __first, const_iterator __last) {
_LIBCPP_ASSERT_VALID_INPUT_RANGE(__first <= __last, "vector::erase(first, last) called with invalid range");
pointer __p = this->__begin_ + (__first - begin());
if (__first != __last) {
this->__destruct_at_end(std::move(__p + (__last - __first), this->__end_, __p));
}
return __make_iter(__p);
}

template <class _Tp, class _Allocator>
_LIBCPP_CONSTEXPR_SINCE_CXX20 void
vector<_Tp, _Allocator>::__move_range(pointer __from_s, pointer __from_e, pointer __to) {
Expand Down
1 change: 1 addition & 0 deletions libcxx/include/module.modulemap
Original file line number Diff line number Diff line change
Expand Up @@ -1533,6 +1533,7 @@ module std [system] {
module destruct_n { header "__memory/destruct_n.h" }
module fwd { header "__fwd/memory.h" }
module inout_ptr { header "__memory/inout_ptr.h" }
module is_trivially_allocator_relocatable { header "__memory/is_trivially_allocator_relocatable.h" }
module noexcept_move_assign_container { header "__memory/noexcept_move_assign_container.h" }
module out_ptr { header "__memory/out_ptr.h" }
module pointer_traits { header "__memory/pointer_traits.h" }
Expand Down
30 changes: 30 additions & 0 deletions libcxx/test/benchmarks/ContainerBenchmarks.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
#define BENCHMARK_CONTAINER_BENCHMARKS_H

#include <cassert>
#include <iterator>
#include <utility>

#include "Utilities.h"
#include "benchmark/benchmark.h"
Expand Down Expand Up @@ -149,6 +151,34 @@ void BM_EmplaceDuplicate(benchmark::State& st, Container c, GenInputs gen) {
}
}

template <class Container, class GenInputs>
void BM_erase_iter_in_middle(benchmark::State& st, Container, GenInputs gen) {
auto in = gen(st.range(0));
Container c(in.begin(), in.end());
assert(c.size() > 2);
for (auto _ : st) {
auto mid = std::next(c.begin(), c.size() / 2);
auto tmp = *mid;
auto result = c.erase(mid); // erase an element in the middle
benchmark::DoNotOptimize(result);
c.push_back(std::move(tmp)); // and then push it back at the end to avoid needing a new container
}
}

template <class Container, class GenInputs>
void BM_erase_iter_at_start(benchmark::State& st, Container, GenInputs gen) {
auto in = gen(st.range(0));
Container c(in.begin(), in.end());
assert(c.size() > 2);
for (auto _ : st) {
auto it = c.begin();
auto tmp = *it;
auto result = c.erase(it); // erase the first element
benchmark::DoNotOptimize(result);
c.push_back(std::move(tmp)); // and then push it back at the end to avoid needing a new container
}
}

template <class Container, class GenInputs>
void BM_Find(benchmark::State& st, Container c, GenInputs gen) {
auto in = gen(st.range(0));
Expand Down
11 changes: 11 additions & 0 deletions libcxx/test/benchmarks/deque.bench.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20

#include <deque>
#include <string>

#include "benchmark/benchmark.h"

Expand Down Expand Up @@ -41,4 +42,14 @@ BENCHMARK_CAPTURE(BM_ConstructFromRange, deque_size_t, std::deque<size_t>{}, get
BENCHMARK_CAPTURE(BM_ConstructFromRange, deque_string, std::deque<std::string>{}, getRandomStringInputs)
->Arg(TestNumInputs);

BENCHMARK_CAPTURE(BM_erase_iter_in_middle, deque_int, std::deque<int>{}, getRandomIntegerInputs<int>)
->Range(TestNumInputs, TestNumInputs * 10);
BENCHMARK_CAPTURE(BM_erase_iter_in_middle, deque_string, std::deque<std::string>{}, getRandomStringInputs)
->Range(TestNumInputs, TestNumInputs * 10);

BENCHMARK_CAPTURE(BM_erase_iter_at_start, deque_int, std::deque<int>{}, getRandomIntegerInputs<int>)
->Range(TestNumInputs, TestNumInputs * 10);
BENCHMARK_CAPTURE(BM_erase_iter_at_start, deque_string, std::deque<std::string>{}, getRandomStringInputs)
->Range(TestNumInputs, TestNumInputs * 10);

BENCHMARK_MAIN();
10 changes: 10 additions & 0 deletions libcxx/test/benchmarks/vector_operations.bench.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,16 @@ BENCHMARK_CAPTURE(BM_ConstructFromRange, vector_string, std::vector<std::string>

BENCHMARK_CAPTURE(BM_Pushback_no_grow, vector_int, std::vector<int>{})->Arg(TestNumInputs);

BENCHMARK_CAPTURE(BM_erase_iter_in_middle, vector_int, std::vector<int>{}, getRandomIntegerInputs<int>)
->Range(TestNumInputs, TestNumInputs * 10);
BENCHMARK_CAPTURE(BM_erase_iter_in_middle, vector_string, std::vector<std::string>{}, getRandomStringInputs)
->Range(TestNumInputs, TestNumInputs * 10);

BENCHMARK_CAPTURE(BM_erase_iter_at_start, vector_int, std::vector<int>{}, getRandomIntegerInputs<int>)
->Range(TestNumInputs, TestNumInputs * 10);
BENCHMARK_CAPTURE(BM_erase_iter_at_start, vector_string, std::vector<std::string>{}, getRandomStringInputs)
->Range(TestNumInputs, TestNumInputs * 10);

template <class T>
void bm_grow(benchmark::State& state) {
for (auto _ : state) {
Expand Down
Loading
Loading