Skip to content

Commit

Permalink
Speed up creating and extending packed arrays from iterators up to 63×
Browse files Browse the repository at this point in the history
This uses the iterator size hint to pre-allocate, which leads to 63×
speedup in the best case. If the hint is pessimistic, it reads into a
buffer to avoid repeated push() calls, which is still 44x as fast as the
previous implementation.
  • Loading branch information
ttencate committed Jan 20, 2025
1 parent 40186f3 commit f2e267d
Show file tree
Hide file tree
Showing 3 changed files with 159 additions and 9 deletions.
105 changes: 98 additions & 7 deletions godot-core/src/builtin/collections/packed_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use godot_ffi as sys;

use crate::builtin::*;
use crate::meta::{AsArg, ToGodot};
use std::mem::{size_of, MaybeUninit};
use std::{fmt, ops, ptr};
use sys::types::*;
use sys::{ffi_methods, interface_fn, GodotFfi};
Expand Down Expand Up @@ -483,6 +484,15 @@ macro_rules! impl_packed_array {
}

#[doc = concat!("Creates a `", stringify!($PackedArray), "` from an iterator.")]
///
/// # Performance note
/// This uses the lower bound from `Iterator::size_hint()` to allocate memory up front. If
/// the iterator returns more than that number of elements, it falls back to reading
/// elements into a fixed-size buffer before adding them all efficiently as a batch.
///
/// # Panics
/// - If the iterator's `size_hint()` returns an incorrect lower bound (which is a breach
/// of the `Iterator` protocol).
impl FromIterator<$Element> for $PackedArray {
fn from_iter<I: IntoIterator<Item = $Element>>(iter: I) -> Self {
let mut array = $PackedArray::default();
Expand All @@ -491,16 +501,88 @@ macro_rules! impl_packed_array {
}
}

#[doc = concat!("Extends a`", stringify!($PackedArray), "` with the contents of an iterator")]
#[doc = concat!("Extends a`", stringify!($PackedArray), "` with the contents of an iterator.")]
///
/// # Performance note
/// This uses the lower bound from `Iterator::size_hint()` to allocate memory up front. If
/// the iterator returns more than that number of elements, it falls back to reading
/// elements into a fixed-size buffer before adding them all efficiently as a batch.
///
/// # Panics
/// - If the iterator's `size_hint()` returns an incorrect lower bound (which is a breach
/// of the `Iterator` protocol).
impl Extend<$Element> for $PackedArray {
fn extend<I: IntoIterator<Item = $Element>>(&mut self, iter: I) {
// Unfortunately the GDExtension API does not offer the equivalent of `Vec::reserve`.
// Otherwise we could use it to pre-allocate based on `iter.size_hint()`.
// Naive implementation:
//
// for item in iter.into_iter() {
// self.push(meta::ParamType::owned_to_arg(item));
// }
//
// This takes 6.1 µs for 1000 i32 elements in release mode.

let mut iter = iter.into_iter();
// Cache the length to avoid repeated Godot API calls.
let mut len = self.len();

// Fast part.
//
// Use `Iterator::size_hint()` to pre-allocate the minimum number of elements in
// the iterator, then write directly to the resulting slice. We can do this because
// `size_hint()` is required by the `Iterator` contract to return correct bounds.
// Note that any bugs in it must not result in UB.
//
// This takes 0.097 µs: 63× as fast as the naive approach.
let (size_hint_min, _size_hint_max) = iter.size_hint();
if size_hint_min > 0 {
let capacity = len + size_hint_min;
self.resize(capacity);
for out_ref in &mut self.as_mut_slice()[len..] {
*out_ref = iter.next().expect("iterator returned fewer than size_hint().0 elements");
}
len = capacity;
}

// Slower part.
//
// While the iterator is still not finished, gather elements into a fixed-size
// buffer, then add them all at once.
//
// A faster implementation using `resize()` and direct pointer writes might still be
// possible.
for item in iter.into_iter() {
self.push(meta::ParamType::owned_to_arg(item));
// This takes 0.14 µs: still 44× as fast as the naive approach.
//
// Note that we can't get by with simple memcpys, because `PackedStringArray`
// contains `GString`, which does not implement `Copy`.
//
// Buffer size: 2 kB is enough for the performance win, without needlessly blowing
// up the stack size.
const BUFFER_SIZE_BYTES: usize = 2048;
const BUFFER_CAPACITY: usize = const_max(
1,
BUFFER_SIZE_BYTES / size_of::<$Element>(),
);
let mut buf = [const { MaybeUninit::<$Element>::uninit() }; BUFFER_CAPACITY];
while let Some(item) = iter.next() {
buf[0].write(item);
let mut buf_len = 1;
while buf_len < BUFFER_CAPACITY {
if let Some(item) = iter.next() {
buf[buf_len].write(item);
buf_len += 1;
} else {
break;
}
}
let capacity = len + buf_len;
self.resize(capacity);
let out_slice = &mut self.as_mut_slice()[len..];
for i in 0..buf_len {
// SAFETY: We called `write()` on items `0..buf_len`, so they are all
// initialized.
let item = unsafe { buf[i].assume_init_read() };
out_slice[i] = item;
}
// At this point, we have moved all initialized values out of the buffer, so
// all of them are uninitialized again, and there are no leaks.
}
}
}
Expand Down Expand Up @@ -1071,3 +1153,12 @@ fn populated_or_err(array: PackedByteArray) -> Result<PackedByteArray, ()> {
Ok(array)
}
}

/// Helper because `usize::max()` is not const.
const fn const_max(a: usize, b: usize) -> usize {
if a > b {
a
} else {
b
}
}
20 changes: 19 additions & 1 deletion itest/rust/src/benchmarks/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
use std::hint::black_box;

use godot::builtin::inner::InnerRect2i;
use godot::builtin::{GString, Rect2i, StringName, Vector2i};
use godot::builtin::{GString, PackedInt32Array, Rect2i, StringName, Vector2i};
use godot::classes::{Node3D, Os, RefCounted};
use godot::obj::{Gd, InstanceId, NewAlloc, NewGd};
use godot::register::GodotClass;
Expand Down Expand Up @@ -93,6 +93,24 @@ fn utilities_ffi_call() -> f64 {
godot::global::pow(base, exponent)
}

#[bench(repeat = 25)]
fn packed_array_from_iter_known_size() -> PackedInt32Array {
PackedInt32Array::from_iter(0..100)
}

#[bench(repeat = 25)]
fn packed_array_from_iter_unknown_size() -> PackedInt32Array {
let mut item = 0;
PackedInt32Array::from_iter(std::iter::from_fn(|| {
item += 1;
if item <= 100 {
Some(item)
} else {
None
}
}))
}

// ----------------------------------------------------------------------------------------------------------------------------------------------
// Helpers for benchmarks above

Expand Down
43 changes: 42 additions & 1 deletion itest/rust/src/builtin_tests/containers/packed_array_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,48 @@ fn packed_array_insert() {
}

#[itest]
fn packed_array_extend() {
fn packed_array_extend_known_size() {
// The logic in `extend()` is not trivial, so we test it for a wide range of sizes.
for len_a in 0..32i32 {
for len_b in 0..32i32 {
let mut array = PackedInt32Array::from_iter(0..len_a);
array.extend(len_a..len_a + len_b);
assert_eq!(
array.to_vec(),
(0..len_a + len_b).collect::<Vec<_>>(),
"len_a = {len_a}, len_b = {len_b}",
);
}
}
}

#[itest]
fn packed_array_extend_unknown_size() {
// The logic in `extend()` is not trivial, so we test it for a wide range of sizes.
for len_a in 0..32i32 {
for len_b in 0..32i32 {
let mut array = PackedInt32Array::from_iter(0..len_a);
let mut item = len_a;
array.extend(std::iter::from_fn(|| {
let result = if item < len_a + len_b {
Some(item)
} else {
None
};
item += 1;
result
}));
assert_eq!(
array.to_vec(),
(0..len_a + len_b).collect::<Vec<_>>(),
"len_a = {len_a}, len_b = {len_b}",
);
}
}
}

#[itest]
fn packed_array_extend_array() {
let mut array = PackedByteArray::from(&[1, 2]);
let other = PackedByteArray::from(&[3, 4]);
array.extend_array(&other);
Expand Down

0 comments on commit f2e267d

Please sign in to comment.