From c20bad7bce6be0b0d57e486f8f60df32d71825d7 Mon Sep 17 00:00:00 2001 From: Andreas Molzer Date: Fri, 15 Mar 2024 19:55:42 +0100 Subject: [PATCH 01/14] Add Atomic equivalent of MaxAlign --- texel/src/buf.rs | 14 +++++- texel/src/lib.rs | 1 + texel/src/texel.rs | 113 ++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 121 insertions(+), 7 deletions(-) diff --git a/texel/src/buf.rs b/texel/src/buf.rs index 24c1d80..2294a35 100644 --- a/texel/src/buf.rs +++ b/texel/src/buf.rs @@ -6,7 +6,7 @@ use core::{borrow, cmp, mem, ops}; use alloc::borrow::ToOwned; use alloc::vec::Vec; -use crate::texel::{constants::MAX, MaxAligned, Texel, MAX_ALIGN}; +use crate::texel::{constants::MAX, AtomicPart, MaxAligned, MaxAtomic, Texel, MAX_ALIGN}; /// Allocates and manages raw bytes. /// @@ -37,6 +37,18 @@ pub(crate) struct Buffer { #[allow(non_camel_case_types)] pub(crate) struct buf([u8]); +/// An aligned slice of atomic memory. +/// +/// This is a wrapper around a byte slice that additionally requires the slice to be highly +/// aligned. It's usually created by first allocating an owned `buf`, and sharing it. +/// +/// Note: Contrary to `buf`, this type __can not__ be sliced at arbitrary locations. +/// +/// See `pixel.rs` for the only constructors. +#[repr(transparent)] +#[allow(non_camel_case_types)] +pub(crate) struct atomic_buf([AtomicPart]); + /// A copy-on-grow version of a buffer. pub(crate) enum Cog<'buf> { Owned(Buffer), diff --git a/texel/src/lib.rs b/texel/src/lib.rs index 93fd532..8b5741c 100644 --- a/texel/src/lib.rs +++ b/texel/src/lib.rs @@ -105,4 +105,5 @@ pub mod texels { pub use crate::texel::constants::*; pub use crate::texel::IsTransparentWrapper; pub use crate::texel::MaxAligned; + pub use crate::texel::MaxAtomic; } diff --git a/texel/src/texel.rs b/texel/src/texel.rs index a71b05f..182ebaf 100644 --- a/texel/src/texel.rs +++ b/texel/src/texel.rs @@ -7,7 +7,7 @@ use core::cmp::{Eq, Ord, Ordering, PartialEq, PartialOrd}; use core::marker::PhantomData; use core::{fmt, hash, mem, num, ptr, slice}; -use crate::buf::buf; +use crate::buf::{atomic_buf, buf}; /// Marker struct to denote a texel type. /// @@ -49,8 +49,15 @@ pub trait AsTexel { macro_rules! def_max_align { ( + match cfg(target) { + $($($arch:literal)|* => $num:literal),*, + } + $(#[$common_attr:meta])* - $($($arch:literal),* = $num:literal),* + struct MaxAligned(..); + + $(#[$atomic_attr:meta])* + struct MaxAtomic(..); ) => { /// A byte-like-type that is aligned to the required max alignment. /// @@ -65,6 +72,47 @@ macro_rules! 
def_max_align { )* pub struct MaxAligned(pub(crate) [u8; MAX_ALIGN]); + #[cfg(all( + not(target_has_atomic = "8"), + not(target_has_atomic = "16"), + not(target_has_atomic = "32"), + not(target_has_atomic = "64"), + ))] + compile_error!("Synchronous buffer API requires one atomic unsigned type"); + + #[cfg(all( + target_has_atomic = "8", + not(target_has_atomic = "16"), + not(target_has_atomic = "32"), + not(target_has_atomic = "64"), + ))] + pub(crate) type AtomicPart = core::sync::atomic::AtomicU8; + #[cfg(all( + target_has_atomic = "16", + not(target_has_atomic = "32"), + not(target_has_atomic = "64"), + ))] + pub(crate) type AtomicPart = core::sync::atomic::AtomicU16; + #[cfg(all( + target_has_atomic = "32", + not(target_has_atomic = "64"), + ))] + pub(crate) type AtomicPart = core::sync::atomic::AtomicU32; + #[cfg(all( + target_has_atomic = "64", + ))] + pub(crate) type AtomicPart = core::sync::atomic::AtomicU64; + + const ATOMIC_PARTS: usize = MAX_ALIGN / core::mem::size_of::(); + + $( + #[cfg_attr( + any($(target_arch = $arch),*), + repr(align($num)) + )] + )* + pub struct MaxAtomic(pub(crate) [AtomicPart; ATOMIC_PARTS]); + $( #[cfg( any($(target_arch = $arch),*), @@ -82,17 +130,23 @@ macro_rules! def_max_align { } def_max_align! { + match cfg(target) { + "x86" | "x86_64" => 32, + "arm" => 16, + "aarch64" => 16, + "wasm32" => 16, + } + /// A byte-like-type that is aligned to the required max alignment. /// /// This type does not contain padding and implements `Pod`. Generally, the alignment and size /// requirement is kept small to avoid overhead. #[derive(Clone, Copy)] #[repr(C)] + struct MaxAligned(..); - "x86", "x86_64" = 32, - "arm" = 16, - "aarch64" = 16, - "wasm32" = 16 + /// Atomic equivalence of [`MaxAligned`]. + struct MaxAtomic(..); } unsafe impl bytemuck::Zeroable for MaxAligned {} @@ -544,6 +598,53 @@ impl
<P> Texel<P>
{ } } +const _: () = { + const fn atomic_is_size_equivalent_of_aligned() {} + const fn atomic_is_align_equivalent_of_aligned() {} + + [atomic_is_size_equivalent_of_aligned()] + [!(core::mem::size_of::() == core::mem::size_of::()) as usize]; + + [atomic_is_align_equivalent_of_aligned()] + [!(core::mem::align_of::() == core::mem::align_of::()) as usize]; +}; + +impl MaxAtomic { + /// Create a vector of atomic zero-bytes. + pub const fn zero() -> Self { + const Z: AtomicPart = AtomicPart::new(0); + MaxAtomic([Z; ATOMIC_PARTS]) + } + + /// Create a vector from values initialized synchronously. + pub fn new(contents: MaxAligned) -> Self { + let mut result = Self::zero(); + let from = bytemuck::bytes_of(&contents); + let from = from.chunks_exact(core::mem::size_of::()); + + for (part, src) in result.0.iter_mut().zip(from) { + let to = bytemuck::bytes_of_mut(AtomicPart::get_mut(part)); + to.copy_from_slice(src); + } + + result + } + + /// Unwrap an owned value. + pub fn into_inner(mut self) -> MaxAligned { + let mut result = MaxAligned([0; MAX_ALIGN]); + let from = bytemuck::bytes_of_mut(&mut result); + let from = from.chunks_exact_mut(core::mem::size_of::()); + + for (part, to) in self.0.iter_mut().zip(from) { + let src = bytemuck::bytes_of(AtomicPart::get_mut(part)); + to.copy_from_slice(src); + } + + result + } +} + /// This is a pure marker type. impl
<P> Clone for Texel<P>
{ fn clone(&self) -> Self { From 65e1e29fc74e82495208c5f629b3b0b80f8e427e Mon Sep 17 00:00:00 2001 From: Andreas Molzer Date: Sat, 16 Mar 2024 00:24:51 +0100 Subject: [PATCH 02/14] Add MaxCell and cell_buf, atomic_buf Those abstract the two main shared access memory slices: unsynchronous and synchronous access to memory. The goal here is to find out which actual operations will be possible under these two models. One main limit has already been observed where atomic reads can not be trivially broken up for the slice. Due to this reason we stabilize the Cell types first, and deal with synchronization at a later point. --- texel/src/buf.rs | 80 +++++++++++++++++++++++++++++++++++--- texel/src/lib.rs | 3 ++ texel/src/rec.rs | 2 +- texel/src/texel.rs | 97 +++++++++++++++++++++++++++++++++++++--------- 4 files changed, 156 insertions(+), 26 deletions(-) diff --git a/texel/src/buf.rs b/texel/src/buf.rs index 2294a35..829e6f8 100644 --- a/texel/src/buf.rs +++ b/texel/src/buf.rs @@ -1,12 +1,13 @@ // Distributed under The MIT License (MIT) // // Copyright (c) 2019 The `image-rs` developers -use core::{borrow, cmp, mem, ops}; +use core::{borrow, cell, cmp, mem, ops}; +use alloc::sync::Arc; use alloc::borrow::ToOwned; use alloc::vec::Vec; -use crate::texel::{constants::MAX, AtomicPart, MaxAligned, MaxAtomic, Texel, MAX_ALIGN}; +use crate::texel::{constants::MAX, AtomicPart, MaxAligned, MaxAtomic, MaxCell, Texel, MAX_ALIGN}; /// Allocates and manages raw bytes. /// @@ -22,11 +23,49 @@ use crate::texel::{constants::MAX, AtomicPart, MaxAligned, MaxAtomic, Texel, MAX /// there are also no operations which explicitely uncouple length and capacity. All operations /// simply work on best effort of making some number of bytes available. #[derive(Clone, Default)] -pub(crate) struct Buffer { +pub struct Buffer { /// The backing memory. inner: Vec, } +/// Allocates and manages atomically shared bytes. +/// +/// Provides a utility to allocate a slice of bytes aligned to the maximally required alignment. +/// Since the elements are much larger than single bytes the inner storage will **not** have exact +/// sizes as one would be used from by using a `Vec` as an allocator. This is instead more close to +/// a `RawVec` and most operations have the same drawback as `Vec::reserve_exact` in not actually +/// being exact. +/// +/// Since exact length and capacity semantics are hard to guarantee for most operations, no effort +/// is made to uphold them. Instead. keeping track of the exact, wanted logical length of the +/// requested byte slice is the obligation of the user *under all circumstances*. As a consequence, +/// there are also no operations which explicitely uncouple length and capacity. All operations +/// simply work on best effort of making some number of bytes available. +#[derive(Clone)] +pub(crate) struct AtomicBuffer { + /// The backing memory. + inner: Arc, +} + +/// Allocates and manages unsynchronized shared bytes. +/// +/// Provides a utility to allocate a slice of bytes aligned to the maximally required alignment. +/// Since the elements are much larger than single bytes the inner storage will **not** have exact +/// sizes as one would be used from by using a `Vec` as an allocator. This is instead more close to +/// a `RawVec` and most operations have the same drawback as `Vec::reserve_exact` in not actually +/// being exact. +/// +/// Since exact length and capacity semantics are hard to guarantee for most operations, no effort +/// is made to uphold them. Instead. 
keeping track of the exact, wanted logical length of the +/// requested byte slice is the obligation of the user *under all circumstances*. As a consequence, +/// there are also no operations which explicitely uncouple length and capacity. All operations +/// simply work on best effort of making some number of bytes available. +#[derive(Clone)] +pub struct CellBuffer { + /// The backing memory. + inner: Arc, +} + /// An aligned slice of memory. /// /// This is a wrapper around a byte slice that additionally requires the slice to be highly @@ -35,7 +74,7 @@ pub(crate) struct Buffer { /// See `pixel.rs` for the only constructors. #[repr(transparent)] #[allow(non_camel_case_types)] -pub(crate) struct buf([u8]); +pub struct buf([u8]); /// An aligned slice of atomic memory. /// @@ -47,8 +86,21 @@ pub(crate) struct buf([u8]); /// See `pixel.rs` for the only constructors. #[repr(transparent)] #[allow(non_camel_case_types)] +// FIXME: in contrast to other types, this can not be slice at arbitrary byte ends since we must +// still utilize potentially full atomic instructions for the underlying interaction? At least we +// do not know.. Have to figure this out. pub(crate) struct atomic_buf([AtomicPart]); +/// An aligned slice of shared-access memory. +/// +/// This is a wrapper around a cell of a byte slice that additionally requires the slice to be +/// highly aligned. +/// +/// See `pixel.rs` for the only constructors. +#[repr(transparent)] +#[allow(non_camel_case_types)] +pub struct cell_buf(cell::Cell<[u8]>); + /// A copy-on-grow version of a buffer. pub(crate) enum Cog<'buf> { Owned(Buffer), @@ -148,7 +200,7 @@ impl buf { where T: AsRef<[MaxAligned]> + ?Sized, { - let bytes = MAX.cast_bytes(data.as_ref()); + let bytes = MAX.to_bytes(data.as_ref()); Self::from_bytes(bytes).unwrap() } @@ -159,7 +211,7 @@ impl buf { where T: AsMut<[MaxAligned]> + ?Sized, { - let bytes = MAX.cast_mut_bytes(data.as_mut()); + let bytes = MAX.to_mut_bytes(data.as_mut()); Self::from_bytes_mut(bytes).unwrap() } @@ -499,6 +551,22 @@ impl ops::DerefMut for Buffer { } } +impl ops::Deref for AtomicBuffer { + type Target = atomic_buf; + + fn deref(&self) -> &atomic_buf { + self.inner.as_ref() + } +} + +impl ops::Deref for CellBuffer { + type Target = cell_buf; + + fn deref(&self) -> &cell_buf { + self.inner.as_ref() + } +} + impl ops::Deref for buf { type Target = [u8]; diff --git a/texel/src/lib.rs b/texel/src/lib.rs index 8b5741c..da7dce2 100644 --- a/texel/src/lib.rs +++ b/texel/src/lib.rs @@ -106,4 +106,7 @@ pub mod texels { pub use crate::texel::IsTransparentWrapper; pub use crate::texel::MaxAligned; pub use crate::texel::MaxAtomic; + pub use crate::texel::MaxCell; + + pub use crate::buf::{buf, cell_buf, Buffer, CellBuffer}; } diff --git a/texel/src/rec.rs b/texel/src/rec.rs index 0b3e80f..9d40335 100644 --- a/texel/src/rec.rs +++ b/texel/src/rec.rs @@ -114,7 +114,7 @@ impl
<P> TexelBuffer<P>
{ /// /// This function will panic if the allocation fails. pub fn with_elements_for_texel(texel: Texel
<P>
, elements: &[P]) -> Self { - let src = texel.cast_bytes(elements); + let src = texel.to_bytes(elements); let mut buffer = TexelBuffer::from_buffer(Buffer::from(src), texel); // Will be treated as empty, so adjust to be filled up to count. buffer.length = src.len(); diff --git a/texel/src/texel.rs b/texel/src/texel.rs index 182ebaf..7156a67 100644 --- a/texel/src/texel.rs +++ b/texel/src/texel.rs @@ -3,11 +3,12 @@ // Copyright (c) 2019, 2020 The `image-rs` developers #![allow(unsafe_code)] +use core::cell::Cell; use core::cmp::{Eq, Ord, Ordering, PartialEq, PartialOrd}; use core::marker::PhantomData; use core::{fmt, hash, mem, num, ptr, slice}; -use crate::buf::{atomic_buf, buf}; +use crate::buf::{buf, cell_buf}; /// Marker struct to denote a texel type. /// @@ -58,6 +59,9 @@ macro_rules! def_max_align { $(#[$atomic_attr:meta])* struct MaxAtomic(..); + + $(#[$cell_attr:meta])* + struct MaxCell(..); ) => { /// A byte-like-type that is aligned to the required max alignment. /// @@ -113,6 +117,14 @@ macro_rules! def_max_align { )* pub struct MaxAtomic(pub(crate) [AtomicPart; ATOMIC_PARTS]); + $( + #[cfg_attr( + any($(target_arch = $arch),*), + repr(align($num)) + )] + )* + pub struct MaxCell(pub(crate) Cell<[u8; MAX_ALIGN]>); + $( #[cfg( any($(target_arch = $arch),*), @@ -147,6 +159,9 @@ def_max_align! { /// Atomic equivalence of [`MaxAligned`]. struct MaxAtomic(..); + + /// A cell of a byte array equivalent to [`MaxAligned`]. + struct MaxCell(..); } unsafe impl bytemuck::Zeroable for MaxAligned {} @@ -363,6 +378,19 @@ impl buf { } } +impl cell_buf { + pub const ALIGNMENT: usize = MAX_ALIGN; + + pub fn from_bytes(bytes: &[Cell]) -> Option<&Self> { + if bytes.as_ptr() as usize % Self::ALIGNMENT == 0 { + // Safety: these types are binary compatible + Some(unsafe { &*(bytes as *const [_] as *const Cell<[u8]> as *const cell_buf) }) + } else { + None + } + } +} + impl
<P> Texel<P>
{ /// Create a witness certifying `P` as a texel without checks. /// @@ -486,6 +514,12 @@ impl
<P> Texel<P>
{ unsafe { ptr::read(val) } } + pub fn copy_cell(self, val: &Cell
<P>
) -> P { + // SAFETY: by the constructor, this inner type can be copied byte-by-byte. And `Cell` is a + // transparent wrapper so it can be read byte-by-byte as well. + unsafe { ptr::read(val) }.into_inner() + } + /// Reinterpret a slice of aligned bytes as a slice of the texel. /// /// Note that the size (in bytes) of the slice will be shortened if the size of `P` is not a @@ -533,18 +567,59 @@ impl
<P> Texel<P>
{ } } + /// Reinterpret a slice of texel as memory. + /// + /// Note that you can convert a reference to a single value by [`core::slice::from_ref`]. + pub fn try_to_cell<'buf>(self, bytes: &'buf [Cell]) -> Option<&'buf Cell<[P]>> { + // Safety: + // - The `pod`-ness certified by `self` ensures the cast of the contents of the memory is + // valid. All representations are a valid P and conversely and P is valid as bytes. Since + // Cell is a transparent wrapper the types are compatible. + // - We uphold the share invariants of `Cell`, which are trivial (less than those required + // and provided by a shared reference). + if bytes.as_ptr() as usize % mem::align_of::
<P>
() == 0 { + let len = bytes.len() / mem::size_of::
<P>
(); + let ptr = ptr::slice_from_raw_parts(bytes.as_ptr() as *const P, len); + Some(unsafe { &*(ptr as *const Cell<[P]>) }) + } else { + None + } + } + /// Reinterpret a slice of texel as memory. /// /// Note that you can convert a reference to a single value by [`core::slice::from_ref`]. pub fn to_bytes<'buf>(self, texel: &'buf [P]) -> &'buf [u8] { - self.cast_bytes(texel) + // Safety: + // * lifetime is not changed + // * keeps the exact same size + // * validity for byte reading checked by Texel constructor + unsafe { slice::from_raw_parts(texel.as_ptr() as *const u8, mem::size_of_val(texel)) } } /// Reinterpret a mutable slice of texel as memory. /// /// Note that you can convert a reference to a single value by [`core::slice::from_mut`]. pub fn to_mut_bytes<'buf>(self, texel: &'buf mut [P]) -> &'buf mut [u8] { - self.cast_mut_bytes(texel) + // Safety: + // * lifetime is not changed + // * keeps the exact same size + // * validity as bytes checked by Texel constructor + unsafe { slice::from_raw_parts_mut(texel.as_mut_ptr() as *mut u8, mem::size_of_val(texel)) } + } + + /// Reinterpret a slice of texel as memory. + /// + /// Note that you can convert a reference to a single value by [`core::slice::from_ref`]. + pub fn cell_bytes<'buf>(self, texel: &'buf [Cell
<P>
]) -> &'buf Cell<[u8]> { + let ptr: *const [u8] = + { ptr::slice_from_raw_parts(texel.as_ptr() as *const u8, mem::size_of_val(texel)) }; + + // Safety: + // * lifetime is not changed + // * kept the exact same size + // * validity for byte representations both ways checked by Texel constructor + unsafe { &*(ptr as *const Cell<[u8]>) } } pub(crate) fn cast_buf<'buf>(self, buffer: &'buf buf) -> &'buf [P] { @@ -580,22 +655,6 @@ impl
<P> Texel<P>
{ ) } } - - pub(crate) fn cast_bytes<'buf>(self, texel: &'buf [P]) -> &'buf [u8] { - // Safety: - // * lifetime is not changed - // * keeps the exact same size - // * validity for byte reading checked by Texel constructor - unsafe { slice::from_raw_parts(texel.as_ptr() as *const u8, mem::size_of_val(texel)) } - } - - pub(crate) fn cast_mut_bytes<'buf>(self, texel: &'buf mut [P]) -> &'buf mut [u8] { - // Safety: - // * lifetime is not changed - // * keeps the exact same size - // * validity as bytes checked by Texel constructor - unsafe { slice::from_raw_parts_mut(texel.as_mut_ptr() as *mut u8, mem::size_of_val(texel)) } - } } const _: () = { From 960c567613beb6901831003e4c35dcc5687850c9 Mon Sep 17 00:00:00 2001 From: Andreas Molzer Date: Sat, 16 Mar 2024 01:07:00 +0100 Subject: [PATCH 03/14] Add constructions for MaxCell --- texel/src/rec.rs | 4 ++++ texel/src/texel.rs | 29 +++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/texel/src/rec.rs b/texel/src/rec.rs index 9d40335..bfb9480 100644 --- a/texel/src/rec.rs +++ b/texel/src/rec.rs @@ -247,10 +247,12 @@ impl
<P> TexelBuffer<P>
{ self.length = exact_size; } + /// View the valid portion of the buffer as a slice of the texel type. pub fn as_slice(&self) -> &[P] { self.buf().as_texels(self.texel) } + /// View the valid portion of the buffer as a mutable slice of the texel type. pub fn as_mut_slice(&mut self) -> &mut [P] { let texel = self.texel; self.buf_mut().as_mut_texels(texel) @@ -266,10 +268,12 @@ impl
<P> TexelBuffer<P>
{ self.inner.capacity() / self.texel.size_nz().get() } + /// View the raw bytes representing the buffer, in the native memory layout. pub fn as_bytes(&self) -> &[u8] { self.buf().as_bytes() } + /// View the mutable raw bytes representing the buffer, in the native memory layout. pub fn as_bytes_mut(&mut self) -> &mut [u8] { self.buf_mut().as_bytes_mut() } diff --git a/texel/src/texel.rs b/texel/src/texel.rs index 7156a67..e6330cc 100644 --- a/texel/src/texel.rs +++ b/texel/src/texel.rs @@ -115,6 +115,7 @@ macro_rules! def_max_align { repr(align($num)) )] )* + $(#[$atomic_attr])* pub struct MaxAtomic(pub(crate) [AtomicPart; ATOMIC_PARTS]); $( @@ -123,6 +124,7 @@ macro_rules! def_max_align { repr(align($num)) )] )* + $(#[$cell_attr])* pub struct MaxCell(pub(crate) Cell<[u8; MAX_ALIGN]>); $( @@ -704,6 +706,33 @@ impl MaxAtomic { } } +impl MaxCell { + /// Create a vector of atomic zero-bytes. + pub const fn zero() -> Self { + MaxCell(Cell::new([0; MAX_ALIGN])) + } + + /// Create a vector from values initialized synchronously. + pub fn new(contents: MaxAligned) -> Self { + MaxCell(Cell::new(contents.0)) + } + + /// Overwrite the contents with new information from another cell. + pub fn set(&self, newval: &Self) { + self.0.set(newval.0.get()) + } + + /// Read the current contents from this cell into an owned value. + pub fn get(&self) -> MaxAligned { + MaxAligned(self.0.get()) + } + + /// Unwrap an owned value. + pub fn into_inner(self) -> MaxAligned { + MaxAligned(self.0.into_inner()) + } +} + /// This is a pure marker type. impl
<P> Clone for Texel<P>
{ fn clone(&self) -> Self { From 94f36e3acc007ae85993012f608a53dd2a1b4cd8 Mon Sep 17 00:00:00 2001 From: Andreas Molzer Date: Sat, 16 Mar 2024 01:07:20 +0100 Subject: [PATCH 04/14] WIP: planned features --- texel/src/arc.rs | 6 ++++ texel/src/buf.rs | 76 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) create mode 100644 texel/src/arc.rs diff --git a/texel/src/arc.rs b/texel/src/arc.rs new file mode 100644 index 0000000..50e0ed8 --- /dev/null +++ b/texel/src/arc.rs @@ -0,0 +1,6 @@ + + +#[derive(Clone, Default)] +pub(crate) struct ArcBuffer { + inner: Arc<[AlignedAtomic]>, +} diff --git a/texel/src/buf.rs b/texel/src/buf.rs index 829e6f8..bbe414a 100644 --- a/texel/src/buf.rs +++ b/texel/src/buf.rs @@ -498,6 +498,18 @@ impl From<&'_ [u8]> for Buffer { } } +impl From<&'_ [u8]> for AtomicBuffer { + fn from(_: &'_ [u8]) -> Self { + todo!() + } +} + +impl From<&'_ [u8]> for CellBuffer { + fn from(_: &'_ [u8]) -> Self { + todo!() + } +} + impl From<&'_ buf> for Buffer { fn from(content: &'_ buf) -> Self { content.to_owned() @@ -651,6 +663,70 @@ impl ops::IndexMut> for buf { } } +#[allow(dead_code)] +impl atomic_buf { + /// Overwrite bytes within the vector with new data. + fn copy_within(&self, _from: core::ops::Range, _to: usize) { + todo!() + } + + /// Overwrite the whole vector with new data. + fn copy_from( + &self, + _from: core::ops::Range, + _source: &[MaxAtomic], + _to: usize, + ) { + todo!() + } +} + +impl cell_buf { + /// Wraps an aligned buffer into `buf`. + /// + /// This method will never panic, as the alignment of the data is guaranteed. + pub fn new(_data: &T) -> &Self + where + T: AsRef<[MaxCell]> + ?Sized, + { + // We can't use `bytemuck` here. + todo!() + } + + pub fn truncate(&self, at: usize) -> &Self { + // We promise this does not panic since the buffer is in fact aligned. + Self::from_bytes(&self.0.as_slice_of_cells()[..at]).unwrap() + } + + pub fn split_at(&self, at: usize) -> (&Self, &Self) { + assert!(at % MAX_ALIGN == 0); + let (a, b) = self.0.as_slice_of_cells().split_at(at); + let a = Self::from_bytes(a).expect("was previously aligned"); + let b = Self::from_bytes(b).expect("asserted to be aligned"); + (a, b) + } + + /// Reinterpret the buffer for the specific texel type. + /// + /// The alignment of `P` is already checked to be smaller than `MAX_ALIGN` through the + /// constructor of `Texel`. The slice will have the maximum length possible but may leave + /// unused bytes in the end. + pub fn as_texels
<P>
(&self, _pixel: Texel
<P>
) -> &cell::Cell<[P]> { + todo!() + } + + pub fn map_within( + &mut self, + _src: impl ops::RangeBounds, + _dest: usize, + _f: impl Fn(P) -> Q, + _p: Texel
<P>
, + _q: Texel, + ) { + todo!() + } +} + #[cfg(test)] mod tests { use super::*; From ad4d96eb9c4b099ecc899a4abbb90dbb0803a3ab Mon Sep 17 00:00:00 2001 From: Andreas Molzer Date: Fri, 5 Apr 2024 22:35:02 +0200 Subject: [PATCH 05/14] Implement the Atomic buffer types --- texel/src/buf.rs | 59 +++++++++++++++++++++++++++++----------- texel/src/texel.rs | 68 ++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 109 insertions(+), 18 deletions(-) diff --git a/texel/src/buf.rs b/texel/src/buf.rs index bbe414a..e0a6ff7 100644 --- a/texel/src/buf.rs +++ b/texel/src/buf.rs @@ -44,7 +44,7 @@ pub struct Buffer { #[derive(Clone)] pub(crate) struct AtomicBuffer { /// The backing memory. - inner: Arc, + inner: Arc<[MaxAtomic]>, } /// Allocates and manages unsynchronized shared bytes. @@ -62,8 +62,8 @@ pub(crate) struct AtomicBuffer { /// simply work on best effort of making some number of bytes available. #[derive(Clone)] pub struct CellBuffer { - /// The backing memory. - inner: Arc, + /// The backing memory, aligned by allocating it with the proper type. + inner: Arc<[MaxCell]>, } /// An aligned slice of memory. @@ -76,19 +76,15 @@ pub struct CellBuffer { #[allow(non_camel_case_types)] pub struct buf([u8]); -/// An aligned slice of atomic memory. -/// -/// This is a wrapper around a byte slice that additionally requires the slice to be highly -/// aligned. It's usually created by first allocating an owned `buf`, and sharing it. -/// -/// Note: Contrary to `buf`, this type __can not__ be sliced at arbitrary locations. +/// In contrast to other types, this can not be slice at arbitrary byte ends since we must +/// still utilize potentially full atomic instructions for the underlying interaction! Until we get +/// custom metadata, we have our own 'reference type' here. This makes interfaces slightly less +/// convenient but it is internal to the library anyways. /// -/// See `pixel.rs` for the only constructors. +/// Note: Contrary to `buf`, this type __can not__ be sliced at arbitrary locations. Use the +/// conversion to `atomic_ref` for this. #[repr(transparent)] #[allow(non_camel_case_types)] -// FIXME: in contrast to other types, this can not be slice at arbitrary byte ends since we must -// still utilize potentially full atomic instructions for the underlying interaction? At least we -// do not know.. Have to figure this out. pub(crate) struct atomic_buf([AtomicPart]); /// An aligned slice of shared-access memory. @@ -101,9 +97,28 @@ pub(crate) struct atomic_buf([AtomicPart]); #[allow(non_camel_case_types)] pub struct cell_buf(cell::Cell<[u8]>); +/// A logical reference to a byte slice from some atomic memory. +/// +/// This is a wrapper around a slice of the underlying atomics. From this buffer, in contrast to +/// the shared and the unsynchronized slices, you can _not_ retrieve slices to typed memory. +/// +/// See `pixel.rs` for the only constructors. +#[derive(Clone, Copy)] +pub(crate) struct AtomicRef<'lt> { + buf: &'lt atomic_buf, + /// The first byte referred to by this slice. + /// + /// Not using `core::ops::Range` since we want to be Copy! + start: usize, + /// The past-the-end byte referred to by this slice. + end: usize, +} + /// A copy-on-grow version of a buffer. pub(crate) enum Cog<'buf> { Owned(Buffer), + // May be used later.. 
+ #[allow(dead_code)] Borrowed(&'buf mut buf), } @@ -567,7 +582,7 @@ impl ops::Deref for AtomicBuffer { type Target = atomic_buf; fn deref(&self) -> &atomic_buf { - self.inner.as_ref() + atomic_buf::from_slice(&self.inner) } } @@ -575,7 +590,7 @@ impl ops::Deref for CellBuffer { type Target = cell_buf; fn deref(&self) -> &cell_buf { - self.inner.as_ref() + cell_buf::from_slice(&self.inner) } } @@ -664,7 +679,7 @@ impl ops::IndexMut> for buf { } #[allow(dead_code)] -impl atomic_buf { +impl AtomicRef<'_> { /// Overwrite bytes within the vector with new data. fn copy_within(&self, _from: core::ops::Range, _to: usize) { todo!() @@ -727,6 +742,18 @@ impl cell_buf { } } +impl<'lt> From<&'lt atomic_buf> for AtomicRef<'lt> { + fn from(value: &'lt atomic_buf) -> AtomicRef<'lt> { + let end = core::mem::size_of_val(value); + + AtomicRef { + buf: value, + start: 0, + end, + } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/texel/src/texel.rs b/texel/src/texel.rs index e6330cc..fa45304 100644 --- a/texel/src/texel.rs +++ b/texel/src/texel.rs @@ -8,7 +8,8 @@ use core::cmp::{Eq, Ord, Ordering, PartialEq, PartialOrd}; use core::marker::PhantomData; use core::{fmt, hash, mem, num, ptr, slice}; -use crate::buf::{buf, cell_buf}; +use alloc::sync::Arc; +use crate::buf::{buf, atomic_buf, cell_buf}; /// Marker struct to denote a texel type. /// @@ -76,6 +77,26 @@ macro_rules! def_max_align { )* pub struct MaxAligned(pub(crate) [u8; MAX_ALIGN]); + /* Note: We need to be really careful to avoid peril for several reasons. + * + * Firstly, the Rust atomic model forbids us from doing unsynchronized access (stores _or_ + * loads) with differing sizes to the same memory location. For now, and for the + * foreseeable future. Since we do not synchronize the access to the buffer, we must use + * the same size everywhere. + * + * Secondly, using any type other than `AtomicU8` for these makes it hard for us to slice + * the buffer at arbitrary points. For true references we might work around this by custom + * metadata, yet this is not stable. Hence, we _must_ use a non-reference type wrapper for + * the kind of access we need. Or rather, the initial buffer allocation can deref into a + * reference to a slice of atomics but to slice it we must use our own type. And all + * operations are implemented to work on full units of this atomic type. + * + * At least for relaxed operations, the larger unit is somewhat equivalent. It's certainly + * at bit of a balance. Larger units might be more costly from destructive interference + * between different accesses, but small units are costly due to added instructions. + * + * View the selection below as a 'best-effort' really. + **/ #[cfg(all( not(target_has_atomic = "8"), not(target_has_atomic = "16"), @@ -160,6 +181,8 @@ def_max_align! { struct MaxAligned(..); /// Atomic equivalence of [`MaxAligned`]. + /// + /// This contains some instance of [`core::sync::atomic::AtomicU8`]. struct MaxAtomic(..); /// A cell of a byte array equivalent to [`MaxAligned`]. @@ -380,12 +403,53 @@ impl buf { } } +impl atomic_buf { + pub const ALIGNMENT: usize = MAX_ALIGN; + + pub fn from_slice(values: &[MaxAtomic]) -> &Self { + debug_assert_eq!(values.as_ptr() as usize % Self::ALIGNMENT, 0); + let ptr = values.as_ptr() as *const AtomicPart; + let count = values.len() * ATOMIC_PARTS; + // Safety: these types are binary compatible, they wrap atomics of the same size, and + // starting at the same address, with a pointer of the same provenance which will be valid + // for the whole lifetime. 
+ // + // This case relaxes the alignment requirements from `MaxAtomic` to that of the underlying + // atomic, which allows us to go beyond the public interface. + // + // The new size covered by the slice is the same as the input slice, since there are + // `ATOMIC_PARTS` units within each `MaxAtomic`. The memory invariants of the new type are + // the same as the old type, which is that we access only with atomics instructions of the + // size of the `AtomicPart` type. + let atomics = core::ptr::slice_from_raw_parts::(ptr, count); + // Safety: `atomic_buf` has the same layout as a `[MaxAtomic]` and wraps it transparently. + unsafe { &*(atomics as *const Self) } + } +} + impl cell_buf { pub const ALIGNMENT: usize = MAX_ALIGN; + pub fn from_slice(values: &[MaxCell]) -> &Self { + debug_assert_eq!(values.as_ptr() as usize % Self::ALIGNMENT, 0); + let ptr = values.as_ptr() as *const Cell; + let count = core::mem::size_of_val(values); + // Safety: constructs a pointer to a slice validly covering exactly the values in the + // input. The byte length is determined by `size_of_val` and starting at the same address, + // with a pointer of the same provenance which will be valid for the whole lifetime. The + // memory invariants of the new type are the same as the old type, which is that we access + // only with atomics instructions of the size of the `AtomicPart` type. + let memory = core::ptr::slice_from_raw_parts::>(ptr, count); + // Safety: these types are binary compatible, they wrap memory of the same size. + // This case relaxes the alignment requirements from `MaxAtomic` to that of the underlying + // atomic, which allows us to go beyond the public interface. + unsafe { &*(memory as *const Self) } + } + pub fn from_bytes(bytes: &[Cell]) -> Option<&Self> { if bytes.as_ptr() as usize % Self::ALIGNMENT == 0 { - // Safety: these types are binary compatible + // Safety: these types are binary compatible. The metadata is also the same, as both + // types encapsulate a slice of `u8`-sized types. Some(unsafe { &*(bytes as *const [_] as *const Cell<[u8]> as *const cell_buf) }) } else { None From 231c18fa5b34ca846f8e15893a47d64b25a8e585 Mon Sep 17 00:00:00 2001 From: Andreas Molzer Date: Sat, 6 Apr 2024 00:16:32 +0200 Subject: [PATCH 06/14] Implement {Cell,Atomic}Buffer initialization --- texel/src/buf.rs | 95 +++++++++++++++++++++++++++++++++++++++------- texel/src/lib.rs | 2 +- texel/src/texel.rs | 1 - 3 files changed, 82 insertions(+), 16 deletions(-) diff --git a/texel/src/buf.rs b/texel/src/buf.rs index e0a6ff7..289aa0b 100644 --- a/texel/src/buf.rs +++ b/texel/src/buf.rs @@ -3,8 +3,8 @@ // Copyright (c) 2019 The `image-rs` developers use core::{borrow, cell, cmp, mem, ops}; -use alloc::sync::Arc; use alloc::borrow::ToOwned; +use alloc::sync::Arc; use alloc::vec::Vec; use crate::texel::{constants::MAX, AtomicPart, MaxAligned, MaxAtomic, MaxCell, Texel, MAX_ALIGN}; @@ -42,7 +42,7 @@ pub struct Buffer { /// there are also no operations which explicitely uncouple length and capacity. All operations /// simply work on best effort of making some number of bytes available. #[derive(Clone)] -pub(crate) struct AtomicBuffer { +pub struct AtomicBuffer { /// The backing memory. inner: Arc<[MaxAtomic]>, } @@ -76,16 +76,20 @@ pub struct CellBuffer { #[allow(non_camel_case_types)] pub struct buf([u8]); +/// An aligned slice of atomic memory. 
+/// /// In contrast to other types, this can not be slice at arbitrary byte ends since we must /// still utilize potentially full atomic instructions for the underlying interaction! Until we get -/// custom metadata, we have our own 'reference type' here. This makes interfaces slightly less +/// custom metadata, we have our own 'reference type' here. +/// +/// This type is relatively useless in the public interface, this makes interfaces slightly less /// convenient but it is internal to the library anyways. /// /// Note: Contrary to `buf`, this type __can not__ be sliced at arbitrary locations. Use the /// conversion to `atomic_ref` for this. #[repr(transparent)] #[allow(non_camel_case_types)] -pub(crate) struct atomic_buf([AtomicPart]); +pub struct atomic_buf([AtomicPart]); /// An aligned slice of shared-access memory. /// @@ -179,6 +183,18 @@ impl Buffer { } } +impl CellBuffer { + pub fn capacity(&self) -> usize { + core::mem::size_of_val(&*self.inner) + } +} + +impl AtomicBuffer { + pub fn capacity(&self) -> usize { + core::mem::size_of_val(&*self.inner) + } +} + impl Cog<'_> { pub(crate) fn to_owned(this: &mut Self) -> &'_ mut Buffer { match this { @@ -514,14 +530,54 @@ impl From<&'_ [u8]> for Buffer { } impl From<&'_ [u8]> for AtomicBuffer { - fn from(_: &'_ [u8]) -> Self { - todo!() + fn from(values: &'_ [u8]) -> Self { + let chunks = values.chunks_exact(MAX_ALIGN); + let remainder = chunks.remainder(); + + let capacity = Buffer::alloc_len(values.len()); + let mut buffer = Vec::with_capacity(capacity); + + buffer.extend(chunks.map(|arr| { + let mut data = MaxAligned([0; MAX_ALIGN]); + data.0.copy_from_slice(arr); + MaxAtomic::new(data) + })); + + if !remainder.is_empty() { + let mut data = MaxAligned([0; MAX_ALIGN]); + data.0[..remainder.len()].copy_from_slice(remainder); + buffer.push(MaxAtomic::new(data)); + } + + AtomicBuffer { + inner: buffer.into(), + } } } impl From<&'_ [u8]> for CellBuffer { - fn from(_: &'_ [u8]) -> Self { - todo!() + fn from(values: &'_ [u8]) -> Self { + let chunks = values.chunks_exact(MAX_ALIGN); + let remainder = chunks.remainder(); + + let capacity = Buffer::alloc_len(values.len()); + let mut buffer = Vec::with_capacity(capacity); + + buffer.extend(chunks.map(|arr| { + let mut data = [0; MAX_ALIGN]; + data.copy_from_slice(arr); + MaxCell(cell::Cell::new(data)) + })); + + if !remainder.is_empty() { + let mut data = [0; MAX_ALIGN]; + data[..remainder.len()].copy_from_slice(remainder); + buffer.push(MaxCell(cell::Cell::new(data))); + } + + CellBuffer { + inner: buffer.into(), + } } } @@ -686,12 +742,7 @@ impl AtomicRef<'_> { } /// Overwrite the whole vector with new data. - fn copy_from( - &self, - _from: core::ops::Range, - _source: &[MaxAtomic], - _to: usize, - ) { + fn copy_from(&self, _from: core::ops::Range, _source: &[MaxAtomic], _to: usize) { todo!() } } @@ -835,4 +886,20 @@ mod tests { (0..LEN as u32).collect::>() ); } + + #[test] + fn cell_buffer() { + let data = [0, 0, 255, 0, 255, 0, 255, 0, 0]; + let buffer = CellBuffer::from(&data[..]); + // Gets rounded up to the next alignment. + assert_eq!(buffer.capacity(), Buffer::alloc_len(data.len()) * MAX_ALIGN); + } + + #[test] + fn atomic_buffer() { + let data = [0, 0, 255, 0, 255, 0, 255, 0, 0]; + let buffer = AtomicBuffer::from(&data[..]); + // Gets rounded up to the next alignment. 
+ assert_eq!(buffer.capacity(), Buffer::alloc_len(data.len()) * MAX_ALIGN); + } } diff --git a/texel/src/lib.rs b/texel/src/lib.rs index da7dce2..6e6afbc 100644 --- a/texel/src/lib.rs +++ b/texel/src/lib.rs @@ -108,5 +108,5 @@ pub mod texels { pub use crate::texel::MaxAtomic; pub use crate::texel::MaxCell; - pub use crate::buf::{buf, cell_buf, Buffer, CellBuffer}; + pub use crate::buf::{atomic_buf, buf, cell_buf, AtomicBuffer, Buffer, CellBuffer}; } diff --git a/texel/src/texel.rs b/texel/src/texel.rs index fa45304..562ae54 100644 --- a/texel/src/texel.rs +++ b/texel/src/texel.rs @@ -8,7 +8,6 @@ use core::cmp::{Eq, Ord, Ordering, PartialEq, PartialOrd}; use core::marker::PhantomData; use core::{fmt, hash, mem, num, ptr, slice}; -use alloc::sync::Arc; use crate::buf::{buf, atomic_buf, cell_buf}; /// Marker struct to denote a texel type. From 59e04b2d5a68edb9851969e2aa7897bdffdc32d4 Mon Sep 17 00:00:00 2001 From: Andreas Molzer Date: Sat, 6 Apr 2024 03:00:10 +0200 Subject: [PATCH 07/14] Implement conversion between buffers --- texel/src/buf.rs | 88 +++++++++++++++++++++++++++++++++++++++++++++- texel/src/texel.rs | 17 ++++++++- 2 files changed, 103 insertions(+), 2 deletions(-) diff --git a/texel/src/buf.rs b/texel/src/buf.rs index 289aa0b..64cbcb3 100644 --- a/texel/src/buf.rs +++ b/texel/src/buf.rs @@ -1,7 +1,7 @@ // Distributed under The MIT License (MIT) // // Copyright (c) 2019 The `image-rs` developers -use core::{borrow, cell, cmp, mem, ops}; +use core::{borrow, cell, cmp, mem, ops, sync::atomic}; use alloc::borrow::ToOwned; use alloc::sync::Arc; @@ -184,15 +184,95 @@ impl Buffer { } impl CellBuffer { + const ELEMENT: MaxCell = MaxCell::zero(); + + /// Allocate a new [`CellBuffer`] with a number of bytes. + /// + /// Panics if the length is too long to find a properly aligned subregion. + pub fn new(length: usize) -> Self { + let alloc_len = Buffer::alloc_len(length); + let inner: Vec<_> = (0..alloc_len).map(|_| Self::ELEMENT).collect(); + + CellBuffer { + inner: inner.into(), + } + } + + /// Share an existing buffer. + /// + /// The library will try, to an extent, to avoid an allocation here. However, it can only do so + /// if the capacity of the underlying buffer is the same as the logical length of the shared + /// buffer. Ultimately we rely on the standard libraries guarantees for constructing a + /// reference counted allocation from an owned vector. + pub fn with_buffer(buffer: Buffer) -> Self { + let inner: Vec<_> = buffer.inner.into_iter().map(MaxCell::new).collect(); + + CellBuffer { + inner: inner.into(), + } + } + + /// Retrieve the byte capacity of the allocated storage. pub fn capacity(&self) -> usize { core::mem::size_of_val(&*self.inner) } + + /// Copy the data into an owned buffer. + pub fn to_owned(&self) -> Buffer { + let inner = self.inner.iter().map(|cell| cell.get()).collect(); + + Buffer { inner } + } } impl AtomicBuffer { + const ELEMENT: MaxAtomic = MaxAtomic::zero(); + + /// Allocate a new [`AtomicBuffer`] with a number of bytes. + /// + /// Panics if the length is too long to find a properly aligned subregion. + pub fn new(length: usize) -> Self { + let alloc_len = Buffer::alloc_len(length); + let inner: Vec<_> = (0..alloc_len).map(|_| Self::ELEMENT).collect(); + + AtomicBuffer { + inner: inner.into(), + } + } + + /// Share an existing buffer. + /// + /// The library will try, to an extent, to avoid an allocation here. However, it can only do so + /// if the capacity of the underlying buffer is the same as the logical length of the shared + /// buffer. 
Ultimately we rely on the standard libraries guarantees for constructing a + /// reference counted allocation from an owned vector. + pub fn with_buffer(buffer: Buffer) -> Self { + let inner: Vec<_> = buffer.inner.into_iter().map(MaxAtomic::new).collect(); + + AtomicBuffer { + inner: inner.into(), + } + } + + /// Retrieve the byte capacity of the allocated storage. pub fn capacity(&self) -> usize { core::mem::size_of_val(&*self.inner) } + + /// Copy the data into an owned buffer. + /// + /// The load will always be relaxed. If more guarantees are required, insert your owned memory + /// barrier instructions before or after the access or otherwise synchronize the call to this + /// function. + pub fn to_owned(&self) -> Buffer { + let inner = self + .inner + .iter() + .map(|cell| cell.load(atomic::Ordering::Relaxed)) + .collect(); + + Buffer { inner } + } } impl Cog<'_> { @@ -893,6 +973,9 @@ mod tests { let buffer = CellBuffer::from(&data[..]); // Gets rounded up to the next alignment. assert_eq!(buffer.capacity(), Buffer::alloc_len(data.len()) * MAX_ALIGN); + + let alternative = CellBuffer::with_buffer(buffer.to_owned()); + assert_eq!(buffer.capacity(), alternative.capacity()); } #[test] @@ -901,5 +984,8 @@ mod tests { let buffer = AtomicBuffer::from(&data[..]); // Gets rounded up to the next alignment. assert_eq!(buffer.capacity(), Buffer::alloc_len(data.len()) * MAX_ALIGN); + + let alternative = CellBuffer::with_buffer(buffer.to_owned()); + assert_eq!(buffer.capacity(), alternative.capacity()); } } diff --git a/texel/src/texel.rs b/texel/src/texel.rs index 562ae54..8eb8653 100644 --- a/texel/src/texel.rs +++ b/texel/src/texel.rs @@ -6,7 +6,7 @@ use core::cell::Cell; use core::cmp::{Eq, Ord, Ordering, PartialEq, PartialOrd}; use core::marker::PhantomData; -use core::{fmt, hash, mem, num, ptr, slice}; +use core::{fmt, hash, mem, num, ptr, slice, sync::atomic}; use crate::buf::{buf, atomic_buf, cell_buf}; @@ -767,6 +767,21 @@ impl MaxAtomic { result } + + /// Load the data into an owned value. + pub fn load(&self, ordering: atomic::Ordering) -> MaxAligned { + let mut result = MaxAligned([0; MAX_ALIGN]); + let from = bytemuck::bytes_of_mut(&mut result); + let from = from.chunks_exact_mut(core::mem::size_of::()); + + for (part, to) in self.0.iter().zip(from) { + let data = part.load(ordering); + let src = bytemuck::bytes_of(&data); + to.copy_from_slice(src); + } + + result + } } impl MaxCell { From ec0c00fb3285fa37ea84c81d2b436034587a8812 Mon Sep 17 00:00:00 2001 From: Andreas Molzer Date: Sat, 6 Apr 2024 04:35:22 +0200 Subject: [PATCH 08/14] Implement casts of cell/atomic bufs --- texel/src/buf.rs | 60 +++++++++++++++++++++++++++++++++------------- texel/src/texel.rs | 42 +++++++++++++++++++++++++++++++- 2 files changed, 84 insertions(+), 18 deletions(-) diff --git a/texel/src/buf.rs b/texel/src/buf.rs index 64cbcb3..5c830e5 100644 --- a/texel/src/buf.rs +++ b/texel/src/buf.rs @@ -89,7 +89,7 @@ pub struct buf([u8]); /// conversion to `atomic_ref` for this. #[repr(transparent)] #[allow(non_camel_case_types)] -pub struct atomic_buf([AtomicPart]); +pub struct atomic_buf(pub(crate) [AtomicPart]); /// An aligned slice of shared-access memory. /// @@ -103,19 +103,24 @@ pub struct cell_buf(cell::Cell<[u8]>); /// A logical reference to a byte slice from some atomic memory. /// -/// This is a wrapper around a slice of the underlying atomics. From this buffer, in contrast to -/// the shared and the unsynchronized slices, you can _not_ retrieve slices to typed memory. 
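As an aside, the sharing model these patches build up is easiest to see in a small sketch. The following is illustrative only and not part of the patch series; the `image_texel::texels` path and the exact assertions are assumptions based on the re-exports and semantics introduced here (relaxed loads in `to_owned`, zero padding in `From<&[u8]>`).

    use image_texel::texels::AtomicBuffer;

    fn snapshot_shared_bytes() {
        let data = [1u8, 2, 3, 4, 5, 6, 7, 8];
        // Bytes are copied into freshly allocated `MaxAtomic` units, zero-padded
        // up to the maximum alignment.
        let shared = AtomicBuffer::from(&data[..]);
        // Handles clone cheaply; they all refer to the same `Arc<[MaxAtomic]>`.
        let alias = shared.clone();
        // `to_owned` performs relaxed per-part loads: each `AtomicPart` is read
        // atomically, but a logical value straddling two parts may still tear.
        let snapshot = alias.to_owned();
        assert_eq!(&snapshot.as_bytes()[..data.len()], &data[..]);
    }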
+/// The analogue of this is `&[P]` or `&[Cell
<P>
]` respectively. This is a wrapper around a slice +/// of the underlying atomics. However, note we promise soundness but _not_ absence of tears in the +/// logical data type if the data straddles different underlying atomic representation types. We +/// simply can not promise this. Of course, an external synchronization might be used enforce this +/// additional guarantee. /// -/// See `pixel.rs` for the only constructors. +/// For consistency with slices, casting of this type is done via an instance of [`Texel`]. #[derive(Clone, Copy)] -pub(crate) struct AtomicRef<'lt> { - buf: &'lt atomic_buf, +pub struct AtomicRef<'lt, P = u8> { + pub(crate) buf: &'lt atomic_buf, + /// The underlying logical texel type this is bound to. + pub(crate) texel: Texel
<P>
, /// The first byte referred to by this slice. /// /// Not using `core::ops::Range` since we want to be Copy! - start: usize, + pub(crate) start: usize, /// The past-the-end byte referred to by this slice. - end: usize, + pub(crate) end: usize, } /// A copy-on-grow version of a buffer. @@ -857,8 +862,11 @@ impl cell_buf { /// The alignment of `P` is already checked to be smaller than `MAX_ALIGN` through the /// constructor of `Texel`. The slice will have the maximum length possible but may leave /// unused bytes in the end. - pub fn as_texels
<P>
(&self, _pixel: Texel
<P>
) -> &cell::Cell<[P]> { - todo!() + pub fn as_texels
<P>
(&self, texel: Texel
<P>
) -> &cell::Cell<[P]> { + let slice = self.0.as_slice_of_cells(); + texel + .try_to_cell(slice) + .expect("A cell_buf is always aligned") } pub fn map_within( @@ -873,15 +881,25 @@ impl cell_buf { } } -impl<'lt> From<&'lt atomic_buf> for AtomicRef<'lt> { - fn from(value: &'lt atomic_buf) -> AtomicRef<'lt> { - let end = core::mem::size_of_val(value); +impl atomic_buf { + /// Reinterpret the buffer for the specific texel type. + /// + /// The alignment of `P` is already checked to be smaller than `MAX_ALIGN` through the + /// constructor of `Texel`. The slice will have the maximum length possible but may leave + /// unused bytes in the end. + pub fn as_texels
<P>
(&self, texel: Texel
<P>
) -> AtomicRef
<P>
{ + use crate::texels::U8; - AtomicRef { - buf: value, + let buffer = AtomicRef { + buf: self, start: 0, - end, - } + end: core::mem::size_of_val(self), + texel: U8, + }; + + texel + .try_to_atomic(buffer) + .expect("An atomic_buf is always aligned") } } @@ -976,6 +994,10 @@ mod tests { let alternative = CellBuffer::with_buffer(buffer.to_owned()); assert_eq!(buffer.capacity(), alternative.capacity()); + + let contents: &cell_buf = &*buffer; + let slice: &[cell::Cell] = contents.as_texels(U8).as_slice_of_cells(); + assert!(cell_buf::from_bytes(slice).is_some()); } #[test] @@ -987,5 +1009,9 @@ mod tests { let alternative = CellBuffer::with_buffer(buffer.to_owned()); assert_eq!(buffer.capacity(), alternative.capacity()); + + let contents: &atomic_buf = &*buffer; + let slice: AtomicRef = contents.as_texels(U8); + assert!(atomic_buf::from_bytes(slice).is_some()); } } diff --git a/texel/src/texel.rs b/texel/src/texel.rs index 8eb8653..ee5830f 100644 --- a/texel/src/texel.rs +++ b/texel/src/texel.rs @@ -8,7 +8,7 @@ use core::cmp::{Eq, Ord, Ordering, PartialEq, PartialOrd}; use core::marker::PhantomData; use core::{fmt, hash, mem, num, ptr, slice, sync::atomic}; -use crate::buf::{buf, atomic_buf, cell_buf}; +use crate::buf::{atomic_buf, buf, cell_buf, AtomicRef}; /// Marker struct to denote a texel type. /// @@ -424,6 +424,21 @@ impl atomic_buf { // Safety: `atomic_buf` has the same layout as a `[MaxAtomic]` and wraps it transparently. unsafe { &*(atomics as *const Self) } } + + /// Wrap a sub-slice of bytes from an atomic buffer into a new `atomic_buf`. + /// + /// The bytes need to be aligned to `ALIGNMENT`. + pub fn from_bytes(bytes: AtomicRef) -> Option<&Self> { + if bytes.start & Self::ALIGNMENT == 0 { + let offset = bytes.start / core::mem::size_of::(); + let buffer = &bytes.buf.0[offset..]; + // Safety: these types are binary compatible. The metadata is also the same, as both + // types encapsulate a slice of `AtomicPart`-sized types. + Some(unsafe { &*(buffer as *const _ as *const Self) }) + } else { + None + } + } } impl cell_buf { @@ -651,6 +666,21 @@ impl
<P> Texel<P>
{ } } + /// Reinterpret a slice of atomically accessed memory with a type annotation. + pub fn try_to_atomic<'buf>(self, bytes: AtomicRef<'buf, u8>) -> Option<AtomicRef<'buf, P>> { + if bytes.start % mem::align_of::
<P>
() == 0 { + let end = bytes.end - bytes.end % mem::align_of::
<P>
(); + Some(AtomicRef { + buf: bytes.buf, + start: bytes.start, + end, + texel: self, + }) + } else { + None + } + } + /// Reinterpret a slice of texel as memory. /// /// Note that you can convert a reference to a single value by [`core::slice::from_ref`]. @@ -687,6 +717,16 @@ impl
<P> Texel<P>
{ unsafe { &*(ptr as *const Cell<[u8]>) } } + /// Reinterpret a slice of atomically modified texels as atomic bytes. + pub fn atomic_bytes<'buf>(self, texel: AtomicRef<'buf, P>) -> AtomicRef<'buf, u8> { + AtomicRef { + buf: texel.buf, + start: texel.start, + end: texel.end, + texel: constants::U8, + } + } + pub(crate) fn cast_buf<'buf>(self, buffer: &'buf buf) -> &'buf [P] { debug_assert_eq!(buffer.as_ptr() as usize % mem::align_of::(), 0); debug_assert_eq!(buffer.as_ptr() as usize % mem::align_of::
<P>
(), 0); From 73ee76f0a360846e848faa6f49fbcc3b61de6bb5 Mon Sep 17 00:00:00 2001 From: Andreas Molzer Date: Sat, 6 Apr 2024 17:24:30 +0200 Subject: [PATCH 09/14] Remove CopyOnGrow This experiment wasn't public, and adds some complexity. Remove it until request. We should balance the need for its implementation with the abilities of concurrent buffers. --- texel/src/buf.rs | 76 ---------------------------------------------- texel/src/image.rs | 54 +------------------------------- 2 files changed, 1 insertion(+), 129 deletions(-) diff --git a/texel/src/buf.rs b/texel/src/buf.rs index 5c830e5..b3db804 100644 --- a/texel/src/buf.rs +++ b/texel/src/buf.rs @@ -123,14 +123,6 @@ pub struct AtomicRef<'lt, P = u8> { pub(crate) end: usize, } -/// A copy-on-grow version of a buffer. -pub(crate) enum Cog<'buf> { - Owned(Buffer), - // May be used later.. - #[allow(dead_code)] - Borrowed(&'buf mut buf), -} - impl Buffer { const ELEMENT: MaxAligned = MaxAligned([0; MAX_ALIGN]); @@ -280,34 +272,6 @@ impl AtomicBuffer { } } -impl Cog<'_> { - pub(crate) fn to_owned(this: &mut Self) -> &'_ mut Buffer { - match this { - Cog::Owned(buffer) => buffer, - Cog::Borrowed(buffer) => { - let buffer = buffer.to_owned(); - *this = Cog::Owned(buffer); - Cog::to_owned(this) - } - } - } - - pub(crate) fn into_owned(this: Self) -> Buffer { - match this { - Cog::Owned(buffer) => buffer, - Cog::Borrowed(buffer) => buffer.to_owned(), - } - } - - pub(crate) fn grow_to(this: &mut Self, bytes: usize) -> &mut buf { - if this.len() < bytes { - Cog::to_owned(this).grow_to(bytes); - } - - &mut **this - } -} - impl buf { /// Wraps an aligned buffer into `buf`. /// @@ -749,46 +713,6 @@ impl ops::DerefMut for buf { } } -impl ops::Deref for Cog<'_> { - type Target = buf; - - fn deref(&self) -> &buf { - match self { - Cog::Owned(buffer) => buffer, - Cog::Borrowed(buffer) => buffer, - } - } -} - -impl ops::DerefMut for Cog<'_> { - fn deref_mut(&mut self) -> &mut buf { - match self { - Cog::Owned(buffer) => buffer, - Cog::Borrowed(buffer) => buffer, - } - } -} - -impl borrow::Borrow for Cog<'_> { - fn borrow(&self) -> &buf { - &**self - } -} - -impl borrow::BorrowMut for Cog<'_> { - fn borrow_mut(&mut self) -> &mut buf { - &mut **self - } -} - -impl cmp::PartialEq> for Cog<'_> { - fn eq(&self, other: &Cog<'_>) -> bool { - **self == **other - } -} - -impl cmp::Eq for Cog<'_> {} - impl cmp::PartialEq for buf { fn eq(&self, other: &buf) -> bool { self.as_bytes() == other.as_bytes() diff --git a/texel/src/image.rs b/texel/src/image.rs index 6b729f4..5470b34 100644 --- a/texel/src/image.rs +++ b/texel/src/image.rs @@ -13,7 +13,7 @@ // Copyright (c) 2019, 2020 The `image-rs` developers use core::{fmt, ops}; -use crate::buf::{buf, Buffer, Cog}; +use crate::buf::{buf, Buffer}; use crate::layout::{ Bytes, Decay, DynLayout, Layout, Mend, Raster, RasterMut, SliceLayout, Take, TryMend, }; @@ -82,19 +82,6 @@ pub struct Image { inner: RawImage, } -/// An owned or borrowed image, parameterized over the layout. -/// -/// The buffer is either owned or _mutably_ borrowed from another `Image`. Some allocating methods -/// may lead to an implicit change from a borrowed to an owned buffer. These methods are documented -/// as performing a fallible allocation. Other method calls on the previously borrowing image will -/// afterwards no longer change the bytes of the image it was borrowed from. -/// -/// FIXME: figure out if this is 'right' to expose in this crate. 
-#[derive(Clone, PartialEq, Eq)] -pub(crate) struct CopyOnGrow<'buf, Layout = Bytes> { - inner: RawImage, Layout>, -} - /// A read-only view of an image. /// /// Note that this requires its underlying buffer to be highly aligned! For that reason it is not @@ -1323,16 +1310,6 @@ impl<'lt, L> From> for ImageMut<'lt, L> { } } -impl BufferLike for Cog<'_> { - fn into_owned(self) -> Buffer { - Cog::into_owned(self) - } - - fn take(&mut self) -> Self { - core::mem::replace(self, Cog::Owned(Default::default())) - } -} - impl BufferLike for Buffer { fn into_owned(self) -> Self { self @@ -1353,34 +1330,16 @@ impl BufferLike for &'_ mut buf { } } -impl Growable for Cog<'_> { - fn grow_to(&mut self, bytes: usize) { - Cog::grow_to(self, bytes); - } -} - impl Growable for Buffer { fn grow_to(&mut self, bytes: usize) { Buffer::grow_to(self, bytes); } } -impl BufferMut for Cog<'_> {} - impl BufferMut for Buffer {} impl BufferMut for &'_ mut buf {} -impl Clone for RawImage, Layout> { - fn clone(&self) -> Self { - use alloc::borrow::ToOwned; - RawImage { - buffer: Cog::Owned(self.buffer.to_owned()), - layout: self.layout.clone(), - } - } -} - impl Default for Image { fn default() -> Self { Image { @@ -1392,17 +1351,6 @@ impl Default for Image { } } -impl Default for CopyOnGrow<'_, Layout> { - fn default() -> Self { - CopyOnGrow { - inner: RawImage { - buffer: Cog::Owned(Buffer::default()), - layout: Layout::default(), - }, - } - } -} - impl fmt::Debug for Image where L: SliceLayout + fmt::Debug, From d1806685bbdc6abd6c245bc124de91bac8be00de Mon Sep 17 00:00:00 2001 From: Andreas Molzer Date: Sat, 6 Apr 2024 17:26:02 +0200 Subject: [PATCH 10/14] Switch CellBuffer to use Rc --- texel/src/buf.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/texel/src/buf.rs b/texel/src/buf.rs index b3db804..f6fc413 100644 --- a/texel/src/buf.rs +++ b/texel/src/buf.rs @@ -4,6 +4,7 @@ use core::{borrow, cell, cmp, mem, ops, sync::atomic}; use alloc::borrow::ToOwned; +use alloc::rc::Rc; use alloc::sync::Arc; use alloc::vec::Vec; @@ -63,7 +64,7 @@ pub struct AtomicBuffer { #[derive(Clone)] pub struct CellBuffer { /// The backing memory, aligned by allocating it with the proper type. - inner: Arc<[MaxCell]>, + inner: Rc<[MaxCell]>, } /// An aligned slice of memory. From d56812b18ec2c8af12ed3e553be1c24666e4a53e Mon Sep 17 00:00:00 2001 From: Andreas Molzer Date: Sat, 6 Apr 2024 18:35:37 +0200 Subject: [PATCH 11/14] Implement map_within for CellBuffer --- texel/src/buf.rs | 298 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 232 insertions(+), 66 deletions(-) diff --git a/texel/src/buf.rs b/texel/src/buf.rs index f6fc413..c5bbcd5 100644 --- a/texel/src/buf.rs +++ b/texel/src/buf.rs @@ -210,6 +210,11 @@ impl CellBuffer { } } + /// Query if two buffers share the same memory region. + pub fn ptr_eq(&self, other: &Self) -> bool { + Rc::ptr_eq(&self.inner, &other.inner) + } + /// Retrieve the byte capacity of the allocated storage. pub fn capacity(&self) -> usize { core::mem::size_of_val(&*self.inner) @@ -221,6 +226,16 @@ impl CellBuffer { Buffer { inner } } + + /// Create an independent copy of the buffer, with a new length. + /// + /// The prefix contents of the new buffer will be the same as the current buffer. The new + /// buffer will _never_ share memory with the current buffer. 
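The `to_resized` method defined just below rounds out the `CellBuffer` API; a rough, illustrative usage sketch follows (not part of the patch — the `image_texel::texels` path is an assumption based on the re-exports added earlier in this series):

    use image_texel::texels::{Buffer, CellBuffer};

    fn share_and_resize() {
        // Two handles to the same unsynchronized (`Rc`-backed) allocation.
        let shared = CellBuffer::new(64);
        let alias = shared.clone();
        assert!(shared.ptr_eq(&alias));

        // `to_resized` copies into a fresh allocation; it never aliases the original.
        let grown = shared.to_resized(128);
        assert!(!shared.ptr_eq(&grown));
        assert!(grown.capacity() >= 128);

        // Round-tripping through an owned `Buffer` keeps the capacity.
        let owned: Buffer = shared.to_owned();
        let back = CellBuffer::with_buffer(owned);
        assert_eq!(back.capacity(), shared.capacity());
    }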
+ pub fn to_resized(&self, bytes: usize) -> Self { + let mut working_copy = self.to_owned(); + working_copy.resize_to(bytes); + Self::with_buffer(working_copy) + } } impl AtomicBuffer { @@ -271,6 +286,16 @@ impl AtomicBuffer { Buffer { inner } } + + /// Create an independent copy of the buffer, with a new length. + /// + /// The prefix contents of the new buffer will be the same as the current buffer. The new + /// buffer will _never_ share memory with the current buffer. + pub fn to_resized(&self, bytes: usize) -> Self { + let mut working_copy = self.to_owned(); + working_copy.resize_to(bytes); + Self::with_buffer(working_copy) + } } impl buf { @@ -297,12 +322,10 @@ impl buf { } pub fn truncate(&self, at: usize) -> &Self { - // TODO: worth it to use unsafe for avoiding unwrap checks? Self::from_bytes(&self.as_bytes()[..at]).unwrap() } pub fn truncate_mut(&mut self, at: usize) -> &mut Self { - // TODO: worth it to use unsafe for avoiding unwrap checks? Self::from_bytes_mut(&mut self.as_bytes_mut()[..at]).unwrap() } @@ -369,6 +392,86 @@ impl buf { f: impl Fn(P) -> Q, p: Texel
<P>
, q: Texel, + ) { + TexelMappingBuffer::map_within(self, src, dest, f, p, q) + } +} + +impl TexelMappingBuffer for buf { + /// Internally mapping function when the mapping can be done forwards. + fn map_forward( + &mut self, + src: usize, + dest: usize, + len: usize, + f: impl Fn(P) -> Q, + p: Texel
<P>
, + q: Texel, + ) { + for idx in 0..len { + let source_idx = idx + src; + let target_idx = idx + dest; + let source = p.copy_val(&self.as_texels(p)[source_idx]); + let target = f(source); + self.as_mut_texels(q)[target_idx] = target; + } + } + + /// Internally mapping function when the mapping can be done backwards. + fn map_backward( + &mut self, + src: usize, + dest: usize, + len: usize, + f: impl Fn(P) -> Q, + p: Texel
<P>
, + q: Texel, + ) { + for idx in (0..len).rev() { + let source_idx = idx + src; + let target_idx = idx + dest; + let source = p.copy_val(&self.as_texels(p)[source_idx]); + let target = f(source); + self.as_mut_texels(q)[target_idx] = target; + } + } + + fn texel_len

<P>(&self, texel: Texel<P>
) -> usize { + self.as_texels(texel).len() + } +} + +/// A buffer in which we can copy, apply a transform, and write back. +trait TexelMappingBuffer { + fn map_forward( + &mut self, + src: usize, + dest: usize, + len: usize, + f: impl Fn(P) -> Q, + p: Texel
<P>
, + q: Texel, + ); + + fn map_backward( + &mut self, + src: usize, + dest: usize, + len: usize, + f: impl Fn(P) -> Q, + p: Texel
<P>
, + q: Texel, + ); + + fn texel_len

<P>(&self, texel: Texel<P>
) -> usize; + + fn map_within( + &mut self, + src: impl ops::RangeBounds, + dest: usize, + f: impl Fn(P) -> Q, + p: Texel
<P>
, + q: Texel, ) { // By symmetry, a write sequence that map `src` to `dest` without clobbering any values // that need to be read later can be applied in reverse to map `dest` to `src` instead. @@ -446,7 +549,7 @@ impl buf { ops::Bound::Included(&bound) => bound .checked_add(1) .expect("Range does not specify a valid bound end"), - ops::Bound::Unbounded => self.as_texels(p).len(), + ops::Bound::Unbounded => self.texel_len(p), }; let len = p_end.checked_sub(p_start).expect("Bound violates order"); @@ -454,15 +557,15 @@ impl buf { let q_start = dest; let _ = self - .as_texels(p) - .get(p_start..) - .and_then(|slice| slice.get(..len)) + .texel_len(p) + .checked_sub(p_start) + .and_then(|slice| slice.checked_sub(len)) .expect("Source out of bounds"); let _ = self - .as_texels(q) - .get(q_start..) - .and_then(|slice| slice.get(..len)) + .texel_len(q) + .checked_sub(q_start) + .and_then(|slice| slice.checked_sub(len)) .expect("Destination out of bounds"); // Due to both being Texels. @@ -505,44 +608,6 @@ impl buf { self.map_forward(p_start, q_start, backwards_end, &f, p, q); } } - - /// Internally mapping function when the mapping can be done forwards. - fn map_forward( - &mut self, - src: usize, - dest: usize, - len: usize, - f: impl Fn(P) -> Q, - p: Texel
<P>
, - q: Texel, - ) { - for idx in 0..len { - let source_idx = idx + src; - let target_idx = idx + dest; - let source = p.copy_val(&self.as_texels(p)[source_idx]); - let target = f(source); - self.as_mut_texels(q)[target_idx] = target; - } - } - - /// Internally mapping function when the mapping can be done backwards. - fn map_backward( - &mut self, - src: usize, - dest: usize, - len: usize, - f: impl Fn(P) -> Q, - p: Texel
<P>
, - q: Texel, - ) { - for idx in (0..len).rev() { - let source_idx = idx + src; - let target_idx = idx + dest; - let source = p.copy_val(&self.as_texels(p)[source_idx]); - let target = f(source); - self.as_mut_texels(q)[target_idx] = target; - } - } } trait ByteSlice: Sized { @@ -744,19 +809,6 @@ impl ops::IndexMut> for buf { } } -#[allow(dead_code)] -impl AtomicRef<'_> { - /// Overwrite bytes within the vector with new data. - fn copy_within(&self, _from: core::ops::Range, _to: usize) { - todo!() - } - - /// Overwrite the whole vector with new data. - fn copy_from(&self, _from: core::ops::Range, _source: &[MaxAtomic], _to: usize) { - todo!() - } -} - impl cell_buf { /// Wraps an aligned buffer into `buf`. /// @@ -795,14 +847,65 @@ impl cell_buf { } pub fn map_within( + &self, + src: impl ops::RangeBounds, + dest: usize, + f: impl Fn(P) -> Q, + p: Texel
<P>
, + q: Texel, + ) { + let mut that = self; + TexelMappingBuffer::map_within(&mut that, src, dest, f, p, q) + } +} + +impl TexelMappingBuffer for &'_ cell_buf { + /// Internally mapping function when the mapping can be done forwards. + fn map_forward( &mut self, - _src: impl ops::RangeBounds, - _dest: usize, - _f: impl Fn(P) -> Q, - _p: Texel
<P>
, - _q: Texel, + src: usize, + dest: usize, + len: usize, + f: impl Fn(P) -> Q, + p: Texel
<P>
, + q: Texel, ) { - todo!() + let src_buffer = self.as_texels(p).as_slice_of_cells(); + let target_buffer = self.as_texels(q).as_slice_of_cells(); + + for idx in 0..len { + let source_idx = idx + src; + let target_idx = idx + dest; + let source = p.copy_cell(&src_buffer[source_idx]); + let target = f(source); + target_buffer[target_idx].set(target); + } + } + + /// Internally mapping function when the mapping can be done backwards. + fn map_backward( + &mut self, + src: usize, + dest: usize, + len: usize, + f: impl Fn(P) -> Q, + p: Texel
<P>
, + q: Texel, + ) { + let src_buffer = self.as_texels(p).as_slice_of_cells(); + let target_buffer = self.as_texels(q).as_slice_of_cells(); + + for idx in (0..len).rev() { + let source_idx = idx + src; + let target_idx = idx + dest; + let source = p.copy_cell(&src_buffer[source_idx]); + let target = f(source); + target_buffer[target_idx].set(target); + } + } + + fn texel_len

<P>(&self, texel: Texel<P>
) -> usize { + self.as_texels(texel).as_slice_of_cells().len() } } @@ -826,6 +929,27 @@ impl atomic_buf { .try_to_atomic(buffer) .expect("An atomic_buf is always aligned") } + + pub fn map_within( + &mut self, + _src: impl ops::RangeBounds, + _dest: usize, + _f: impl Fn(P) -> Q, + _p: Texel
<P>
, + _q: Texel, + ) { + todo!() + } + + /// Overwrite bytes within the vector with new data. + fn _copy_within(&self, _from: core::ops::Range, _to: usize) { + todo!() + } + + /// Overwrite the whole vector with new data. + fn _copy_from(&self, _from: core::ops::Range, _source: &[MaxAtomic], _to: usize) { + todo!() + } } #[cfg(test)] @@ -939,4 +1063,46 @@ mod tests { let slice: AtomicRef = contents.as_texels(U8); assert!(atomic_buf::from_bytes(slice).is_some()); } + + #[test] + fn mapping_cells() { + const LEN: usize = 10; + // Look, we can actually map over this buffer while it is *not* mutable. + let buffer = CellBuffer::new(LEN * mem::size_of::()); + // And receive all the results in this shared copy of our buffer. + let output_tap = buffer.clone(); + assert!(buffer.ptr_eq(&output_tap)); + + buffer + .as_texels(U32) + .as_slice_of_cells() + .iter() + .enumerate() + .for_each(|(idx, p)| p.set(idx as u32)); + + // Map those numbers in-place. + buffer.map_within(..LEN, 0, |n: u32| n as u8, U32, U8); + buffer.map_within(..LEN, 0, |n: u8| n as u32, U8, U32); + + // Back to where we started. + assert_eq!( + output_tap.as_texels(U32).as_slice_of_cells()[..LEN] + .iter() + .map(cell::Cell::get) + .collect::>(), + (0..LEN as u32).collect::>() + ); + + // This should work even if we don't map to index 0. + buffer.map_within(0..LEN, 3 * LEN, |n: u32| n as u8, U32, U8); + buffer.map_within(3 * LEN..4 * LEN, 0, |n: u8| n as u32, U8, U32); + + assert_eq!( + output_tap.as_texels(U32).as_slice_of_cells()[..LEN] + .iter() + .map(cell::Cell::get) + .collect::>(), + (0..LEN as u32).collect::>() + ); + } } From 846ba2e824e4f697b54b8259e701302b17ac36bd Mon Sep 17 00:00:00 2001 From: Andreas Molzer Date: Sat, 6 Apr 2024 21:34:07 +0200 Subject: [PATCH 12/14] Implement Atomic map_within --- texel/src/buf.rs | 194 ++++++++++++++++++++++++++++++++++++++++++--- texel/src/texel.rs | 91 +++++++++++++++++++-- 2 files changed, 267 insertions(+), 18 deletions(-) diff --git a/texel/src/buf.rs b/texel/src/buf.rs index c5bbcd5..83aecb0 100644 --- a/texel/src/buf.rs +++ b/texel/src/buf.rs @@ -111,8 +111,7 @@ pub struct cell_buf(cell::Cell<[u8]>); /// additional guarantee. /// /// For consistency with slices, casting of this type is done via an instance of [`Texel`]. -#[derive(Clone, Copy)] -pub struct AtomicRef<'lt, P = u8> { +pub struct AtomicSliceRef<'lt, P = u8> { pub(crate) buf: &'lt atomic_buf, /// The underlying logical texel type this is bound to. pub(crate) texel: Texel
<P>
, @@ -124,6 +123,20 @@ pub struct AtomicRef<'lt, P = u8> { pub(crate) end: usize, } +/// A logical reference to a typed element from some atomic memory. +/// +/// The analogue of this is `&P` or `&Cell
<P>
` respectively. Note we promise soundness but _not_ +/// absence of tears in the logical data type if the data straddles different underlying atomic +/// representation types. We simply can not promise this. Of course, an external synchronization +/// might be used enforce this additional guarantee. +pub struct AtomicRef<'lt, P = u8> { + pub(crate) buf: &'lt atomic_buf, + /// The underlying logical texel type this is bound to. + pub(crate) texel: Texel
<P>
, + /// The first byte referred to by this slice. + pub(crate) start: usize, +} + impl Buffer { const ELEMENT: MaxAligned = MaxAligned([0; MAX_ALIGN]); @@ -846,6 +859,20 @@ impl cell_buf { .expect("A cell_buf is always aligned") } + /// Apply a mapping function to some elements. + /// + /// The indices `src` and `dest` are indices as if the slice were interpreted as `[P]` or `[Q]` + /// respectively. + /// + /// The types may differ which allows the use of this function to prepare a reinterpretation + /// cast of a typed buffer. This function chooses the order of function applications such that + /// values are not overwritten before they are used, i.e. the function arguments are exactly + /// the previously visible values. This is even less trivial than for copy if the parameter + /// types differ in size. + /// + /// # Panics + /// + /// This function panics if `src` or the implied range of `dest` are out of bounds. pub fn map_within( &self, src: impl ops::RangeBounds, @@ -915,10 +942,10 @@ impl atomic_buf { /// The alignment of `P` is already checked to be smaller than `MAX_ALIGN` through the /// constructor of `Texel`. The slice will have the maximum length possible but may leave /// unused bytes in the end. - pub fn as_texels

<P>(&self, texel: Texel<P>) -> AtomicRef<P>
{ + pub fn as_texels

<P>(&self, texel: Texel<P>) -> AtomicSliceRef<P>
{ use crate::texels::U8; - let buffer = AtomicRef { + let buffer = AtomicSliceRef { buf: self, start: 0, end: core::mem::size_of_val(self), @@ -930,15 +957,30 @@ impl atomic_buf { .expect("An atomic_buf is always aligned") } + /// Apply a mapping function to some elements. + /// + /// The indices `src` and `dest` are indices as if the slice were interpreted as `[P]` or `[Q]` + /// respectively. + /// + /// The types may differ which allows the use of this function to prepare a reinterpretation + /// cast of a typed buffer. This function chooses the order of function applications such that + /// values are not overwritten before they are used, i.e. the function arguments are exactly + /// the previously visible values. This is even less trivial than for copy if the parameter + /// types differ in size. + /// + /// # Panics + /// + /// This function panics if `src` or the implied range of `dest` are out of bounds. pub fn map_within( - &mut self, - _src: impl ops::RangeBounds, - _dest: usize, - _f: impl Fn(P) -> Q, - _p: Texel
<P>
, - _q: Texel, + &self, + src: impl ops::RangeBounds, + dest: usize, + f: impl Fn(P) -> Q, + p: Texel
<P>
, + q: Texel, ) { - todo!() + let mut that = self; + TexelMappingBuffer::map_within(&mut that, src, dest, f, p, q) } /// Overwrite bytes within the vector with new data. @@ -952,6 +994,97 @@ impl atomic_buf { } } +impl TexelMappingBuffer for &'_ atomic_buf { + /// Internally mapping function when the mapping can be done forwards. + fn map_forward( + &mut self, + src: usize, + dest: usize, + len: usize, + f: impl Fn(P) -> Q, + p: Texel
<P>
, + q: Texel, + ) { + let src_buffer = self.as_texels(p); + let target_buffer = self.as_texels(q); + + // FIXME: isn't it particularly inefficient to load values one-by-one? But we offer that + // primitive. A stack buffer for a statically sized burst of values would be better though. + + for idx in 0..len { + let source_idx = idx + src; + let target_idx = idx + dest; + let source = p.load_atomic(src_buffer.idx(source_idx)); + let target = f(source); + q.store_atomic(target_buffer.idx(target_idx), target); + } + } + + /// Internally mapping function when the mapping can be done backwards. + fn map_backward( + &mut self, + src: usize, + dest: usize, + len: usize, + f: impl Fn(P) -> Q, + p: Texel
<P>
, + q: Texel, + ) { + let src_buffer = self.as_texels(p); + let target_buffer = self.as_texels(q); + + for idx in (0..len).rev() { + let source_idx = idx + src; + let target_idx = idx + dest; + let source = p.load_atomic(src_buffer.idx(source_idx)); + let target = f(source); + q.store_atomic(target_buffer.idx(target_idx), target); + } + } + + fn texel_len

<P>(&self, texel: Texel<P>
) -> usize { + self.as_texels(texel).len() + } +} + +impl<'lt, P> AtomicSliceRef<'lt, P> { + /// Grab a single element. + /// + /// Not `get` since it does not return a reference, and we can not use the standard SliceIndex + /// trait anyways. Also we do not implement the assertion outside of debug for now, it is also + /// not used for unsafe code. + pub(crate) fn idx(self, idx: usize) -> AtomicRef<'lt, P> { + assert!(idx < self.len()); + + AtomicRef { + buf: self.buf, + start: self.start + idx * self.texel.size(), + texel: self.texel, + } + } + + /// Get the number of elements referenced by this slice. + pub fn len(&self) -> usize { + self.end.saturating_sub(self.start) / self.texel.size() + } +} + +impl
<P>
Clone for AtomicSliceRef<'_, P> { + fn clone(&self) -> Self { + AtomicSliceRef { ..*self } + } +} + +impl
<P>
Copy for AtomicSliceRef<'_, P> {} + +impl
<P>
Clone for AtomicRef<'_, P> { + fn clone(&self) -> Self { + AtomicRef { ..*self } + } +} + +impl
<P>
Copy for AtomicRef<'_, P> {} + #[cfg(test)] mod tests { use super::*; @@ -1060,7 +1193,7 @@ mod tests { assert_eq!(buffer.capacity(), alternative.capacity()); let contents: &atomic_buf = &*buffer; - let slice: AtomicRef = contents.as_texels(U8); + let slice: AtomicSliceRef = contents.as_texels(U8); assert!(atomic_buf::from_bytes(slice).is_some()); } @@ -1105,4 +1238,41 @@ mod tests { (0..LEN as u32).collect::>() ); } + + #[test] + fn mapping_atomics() { + const LEN: usize = 10; + let mut initial_state = Buffer::new(LEN * mem::size_of::()); + + initial_state + .as_mut_texels(U32) + .iter_mut() + .enumerate() + .for_each(|(idx, p)| *p = idx as u32); + + // Look, we can actually map over this buffer while it is *not* mutable. + let buffer = AtomicBuffer::with_buffer(initial_state); + // And receive all the results in this shared copy of our buffer. + let output_tap = buffer.clone(); + // assert!(buffer.ptr_eq(&output_tap)); + + // Map those numbers in-place. + buffer.map_within(..LEN, 0, |n: u32| n as u8, U32, U8); + buffer.map_within(..LEN, 0, |n: u8| n as u32, U8, U32); + + // Back to where we started. + assert_eq!( + output_tap.to_owned().as_texels(U32)[..LEN].to_vec(), + (0..LEN as u32).collect::>() + ); + + // This should work even if we don't map to index 0. + buffer.map_within(0..LEN, 3 * LEN, |n: u32| n as u8, U32, U8); + buffer.map_within(3 * LEN..4 * LEN, 0, |n: u8| n as u32, U8, U32); + + assert_eq!( + output_tap.to_owned().as_texels(U32)[..LEN].to_vec(), + (0..LEN as u32).collect::>() + ); + } } diff --git a/texel/src/texel.rs b/texel/src/texel.rs index ee5830f..12dcacd 100644 --- a/texel/src/texel.rs +++ b/texel/src/texel.rs @@ -8,7 +8,7 @@ use core::cmp::{Eq, Ord, Ordering, PartialEq, PartialOrd}; use core::marker::PhantomData; use core::{fmt, hash, mem, num, ptr, slice, sync::atomic}; -use crate::buf::{atomic_buf, buf, cell_buf, AtomicRef}; +use crate::buf::{atomic_buf, buf, cell_buf, AtomicRef, AtomicSliceRef}; /// Marker struct to denote a texel type. /// @@ -428,7 +428,7 @@ impl atomic_buf { /// Wrap a sub-slice of bytes from an atomic buffer into a new `atomic_buf`. /// /// The bytes need to be aligned to `ALIGNMENT`. - pub fn from_bytes(bytes: AtomicRef) -> Option<&Self> { + pub fn from_bytes(bytes: AtomicSliceRef) -> Option<&Self> { if bytes.start & Self::ALIGNMENT == 0 { let offset = bytes.start / core::mem::size_of::(); let buffer = &bytes.buf.0[offset..]; @@ -600,6 +600,82 @@ impl

<P> Texel<P>
{ unsafe { ptr::read(val) }.into_inner() } + /// Load a value from an atomic slice. + /// + /// The results is only correct if no concurrent modification occurs. The library promises + /// *basic soundness* but no particular defined behaviour under parallel modifications to the + /// memory bytes which describe the value to be loaded. + /// + /// Each atomic unit is touched at most once. + pub fn load_atomic(self, val: AtomicRef
<P>
) -> P { + // SAFETY: by `Texel` being a POD this is a valid representation. + let mut value = unsafe { core::mem::zeroed::
<P>
() }; + + let offset = val.start / core::mem::size_of::(); + let mut initial_skip = val.start % core::mem::size_of::(); + let mut target = self.to_mut_bytes(core::slice::from_mut(&mut value)); + + let mut buffer = val.buf.0[offset..].iter(); + // By the invariants of `AtomicRef`, that number of bytes is in-bounds. + let mut load = buffer.next().unwrap().load(atomic::Ordering::Relaxed); + + loop { + let input = &bytemuck::bytes_of(&load)[initial_skip..]; + let copy_len = input.len().min(target.len()); + target[..copy_len].copy_from_slice(&input[..copy_len]); + target = &mut target[copy_len..]; + + if target.is_empty() { + break; + } + + load = buffer.next().unwrap().load(atomic::Ordering::Relaxed); + initial_skip = 0; + } + + value + } + + /// Store a value to an atomic slice. + /// + /// The results is only correct if no concurrent modification occurs. The library promises + /// *basic soundness* but no particular defined behaviour under parallel modifications to the + /// memory bytes which describe the value to be store. + /// + /// Provides the same wait-freeness as the underlying platform for `fetch_*` instructions, that + /// is this does not use `compare_exchange_weak`. This implies that concurrent modifications to + /// bytes *not* covered by this particular representation will not inherently block progress. + pub fn store_atomic(self, val: AtomicRef
<P>
, value: P) { + let offset = val.start / core::mem::size_of::(); + let mut initial_skip = val.start % core::mem::size_of::(); + + let mut source = self.to_bytes(core::slice::from_ref(&value)); + let mut buffer = val.buf.0[offset..].iter(); + + loop { + let mut value = 0; + let mut mask = !0; + + let target = &mut bytemuck::bytes_of_mut(&mut value)[initial_skip..]; + let copy_len = source.len().min(source.len()); + target[..copy_len].copy_from_slice(&source[..copy_len]); + for b in &mut bytemuck::bytes_of_mut(&mut mask)[initial_skip..][..copy_len] { + *b = 0; + } + source = &source[copy_len..]; + + let into = buffer.next().unwrap(); + into.fetch_and(mask, atomic::Ordering::Relaxed); + into.fetch_or(value, atomic::Ordering::Relaxed); + + if source.is_empty() { + break; + } + + initial_skip = 0; + } + } + /// Reinterpret a slice of aligned bytes as a slice of the texel. /// /// Note that the size (in bytes) of the slice will be shortened if the size of `P` is not a @@ -667,10 +743,13 @@ impl

<P> Texel<P>
{ } /// Reinterpret a slice of atomically access memory with a type annotation. - pub fn try_to_atomic<'buf>(self, bytes: AtomicRef<'buf, u8>) -> Option> { + pub fn try_to_atomic<'buf>( + self, + bytes: AtomicSliceRef<'buf, u8>, + ) -> Option> { if bytes.start % mem::align_of::
<P>
() == 0 { let end = bytes.end - bytes.end % mem::align_of::
<P>
(); - Some(AtomicRef { + Some(AtomicSliceRef { buf: bytes.buf, start: bytes.start, end, @@ -718,8 +797,8 @@ impl

<P> Texel<P>
{ } /// Reinterpret a slice of atomically modified texels as atomic bytes. - pub fn atomic_bytes<'buf>(self, texel: AtomicRef<'buf, P>) -> AtomicRef<'buf, u8> { - AtomicRef { + pub fn atomic_bytes<'buf>(self, texel: AtomicSliceRef<'buf, P>) -> AtomicSliceRef<'buf, u8> { + AtomicSliceRef { buf: texel.buf, start: texel.start, end: texel.end, From 21051f2a3cb491a0cee53f112df082132a9dfb36 Mon Sep 17 00:00:00 2001 From: Andreas Molzer Date: Sat, 6 Apr 2024 21:36:06 +0200 Subject: [PATCH 13/14] Update atomic values via XOR --- texel/src/texel.rs | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/texel/src/texel.rs b/texel/src/texel.rs index 12dcacd..90b999f 100644 --- a/texel/src/texel.rs +++ b/texel/src/texel.rs @@ -653,20 +653,18 @@ impl

<P> Texel<P>
{ let mut buffer = val.buf.0[offset..].iter(); loop { - let mut value = 0; - let mut mask = !0; + let into = buffer.next().unwrap(); + let original = into.load(atomic::Ordering::Relaxed); + let mut value = original; let target = &mut bytemuck::bytes_of_mut(&mut value)[initial_skip..]; let copy_len = source.len().min(source.len()); target[..copy_len].copy_from_slice(&source[..copy_len]); - for b in &mut bytemuck::bytes_of_mut(&mut mask)[initial_skip..][..copy_len] { - *b = 0; - } source = &source[copy_len..]; - let into = buffer.next().unwrap(); - into.fetch_and(mask, atomic::Ordering::Relaxed); - into.fetch_or(value, atomic::Ordering::Relaxed); + // Any bits we did not modify, including those outside our own range, will not get + // modified by this instruction. This provides the basic conflict guarantee. + into.fetch_xor(original ^ value, atomic::Ordering::Relaxed); if source.is_empty() { break; From 900ce9d0db4e59b1b00ef593a1550167f03efe43 Mon Sep 17 00:00:00 2001 From: Andreas Molzer Date: Mon, 8 Apr 2024 10:45:01 +0200 Subject: [PATCH 14/14] Add test for atomic parallel modification --- texel/tests/atomic.rs | 53 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 texel/tests/atomic.rs diff --git a/texel/tests/atomic.rs b/texel/tests/atomic.rs new file mode 100644 index 0000000..a262615 --- /dev/null +++ b/texel/tests/atomic.rs @@ -0,0 +1,53 @@ +use image_texel::texels::{AtomicBuffer, U32}; +use std::{mem, thread}; + +#[test] +fn mapping_atomics_parallel() { + const LEN: usize = 128; + let buffer = AtomicBuffer::new(LEN * mem::size_of::()); + // And receive all the results in this shared copy of our buffer. + let output_tap = buffer.clone(); + + const SPLIT_MAX: usize = 1 << 6; + // Proxy for whether we run with optimization. Makes execution time bearable. + #[cfg(debug_assertions)] + const REPEAT: usize = 1 << 6; + #[cfg(not(debug_assertions))] + const REPEAT: usize = 1 << 12; + + for split in 0..SPLIT_MAX { + // We want the modifying loops to overlap as much as possible for the strongest test, so + // ensure they do not run early. + let barrier = &std::sync::Barrier::new(2); + + // Concurrently and repeatedly increment non-overlapping parts of the image. + thread::scope(|join| { + let img_a = buffer.clone(); + let img_b = buffer.clone(); + + join.spawn(move || { + let _ = barrier.wait(); + for _ in 0..REPEAT { + img_a.map_within(..split, 0, |n: u32| n + 1, U32, U32); + } + }); + + join.spawn(move || { + let _ = barrier.wait(); + for _ in 0..REPEAT { + img_b.map_within(split.., split, |n: u32| n + 1, U32, U32); + } + }); + }); + } + + // Each individual `u32` has been incremented precisely as often as each other. Since the + // individual transforms are synchronized with thread-scope and within they do not overlap with + // each other, we must expect that the values have each been touched precisely how we intended + // them to. + let expected = (SPLIT_MAX * REPEAT) as u32; + assert_eq!( + output_tap.to_owned().as_texels(U32)[..LEN].to_vec(), + (0..LEN as u32).map(|_| expected).collect::>() + ); +}
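A minimal usage sketch of the map_within API exercised by the tests above, assuming only the public exports visible in this series (image_texel::texels::{AtomicBuffer, U32, U8}); the buffer length and values are illustrative and not taken from the patches:

use image_texel::texels::{AtomicBuffer, U32, U8};
use std::mem;

fn main() {
    const LEN: usize = 4;
    // Shared, atomically accessed storage for LEN u32 texels (zero-initialized).
    let buffer = AtomicBuffer::new(LEN * mem::size_of::<u32>());
    // A second handle onto the same allocation; writes through `buffer` become
    // observable through it.
    let reader = buffer.clone();

    // Narrow each u32 texel to a u8 in place. Indices are counted in units of
    // the source and destination texel types respectively.
    buffer.map_within(..LEN, 0, |n: u32| n as u8, U32, U8);

    // Take an owned snapshot through the other handle and read the bytes back.
    let snapshot = reader.to_owned();
    let narrowed = &snapshot.as_texels(U8)[..LEN];
    assert_eq!(narrowed, &[0u8; LEN]);
}

The same call shape applies to CellBuffer for single-threaded sharing, as in the mapping_cells test; only the handle type changes.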
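The XOR update from [PATCH 13/14] relies on x ^ (x ^ v) == v, together with the fact that original ^ value is zero on every bit that was not spliced in, so a single fetch_xor writes the new bytes while leaving concurrent changes to neighbouring bytes of the same atomic unit intact (tearing within the written bytes remains possible, as the doc comments state). A standalone sketch of that idea on one AtomicU32; the helper below is illustrative and not part of the crate:

use core::sync::atomic::{AtomicU32, Ordering};

// Overwrite only the low byte of a shared 32-bit unit, in the spirit of
// `store_atomic` when a texel covers part of an atomic unit.
fn store_low_byte(unit: &AtomicU32, byte: u8) {
    let original = unit.load(Ordering::Relaxed);
    // Splice the new byte into a copy of the loaded value.
    let value = (original & !0xFF) | u32::from(byte);
    // `original ^ value` has zero bits everywhere we did not touch, so the XOR
    // below cannot clobber concurrent changes to the other three bytes.
    unit.fetch_xor(original ^ value, Ordering::Relaxed);
}

fn main() {
    let unit = AtomicU32::new(0xAABB_CCDD);
    store_low_byte(&unit, 0x11);
    assert_eq!(unit.load(Ordering::Relaxed), 0xAABB_CC11);
}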