diff --git a/CREDITS.md b/CREDITS.md index 0a5bc31..707e2cc 100644 --- a/CREDITS.md +++ b/CREDITS.md @@ -1,7 +1,7 @@ # Project Dependencies Package: flaca - Version: 3.1.2 - Generated: 2024-06-18 04:53:29 UTC + Version: 3.1.3 + Generated: 2024-07-07 07:18:53 UTC | Package | Version | Author(s) | License | | ---- | ---- | ---- | ---- | @@ -9,7 +9,7 @@ | [argyle](https://github.com/Blobfolio/argyle) | 0.7.2 | [Blobfolio, LLC.](mailto:hello@blobfolio.com) | WTFPL | | [bitvec](https://github.com/bitvecto-rs/bitvec) | 1.0.1 | | MIT | | [bytecount](https://github.com/llogiq/bytecount) | 0.6.8 | [Andre Bogus](mailto:bogusandre@gmail.de) and [Joshua Landau](mailto:joshua@landau.ws) | Apache-2.0 or MIT | -| [bytemuck](https://github.com/Lokathor/bytemuck) | 1.16.0 | [Lokathor](mailto:zefria@gmail.com) | Apache-2.0, MIT, or Zlib | +| [bytemuck](https://github.com/Lokathor/bytemuck) | 1.16.1 | [Lokathor](mailto:zefria@gmail.com) | Apache-2.0, MIT, or Zlib | | [cfg-if](https://github.com/alexcrichton/cfg-if) | 1.0.0 | [Alex Crichton](mailto:alex@alexcrichton.com) | Apache-2.0 or MIT | | [crc32fast](https://github.com/srijs/rust-crc32fast) | 1.4.2 | [Sam Rijs](mailto:srijs@airpost.net) and [Alex Crichton](mailto:alex@alexcrichton.com) | Apache-2.0 or MIT | | [crossbeam-channel](https://github.com/crossbeam-rs/crossbeam) | 0.5.13 | | Apache-2.0 or MIT | @@ -19,7 +19,7 @@ | [dowser](https://github.com/Blobfolio/dowser) | 0.9.1 | [Blobfolio, LLC.](mailto:hello@blobfolio.com) | WTFPL | | [equivalent](https://github.com/cuviper/equivalent) | 1.0.1 | | Apache-2.0 or MIT | | [fastrand](https://github.com/smol-rs/fastrand) | 2.1.0 | [Stjepan Glavina](mailto:stjepang@gmail.com) | Apache-2.0 or MIT | -| flapfli | 3.1.2 | [Josh Stoik](mailto:josh@blobfolio.com) | WTFPL | +| flapfli | 3.1.3 | [Josh Stoik](mailto:josh@blobfolio.com) | WTFPL | | [funty](https://github.com/myrrlyn/funty) | 2.0.0 | [myrrlyn](mailto:self@myrrlyn.dev) | MIT | | [fyi_msg](https://github.com/Blobfolio/fyi) | 0.13.6 | [Blobfolio, LLC.](mailto:hello@blobfolio.com) | WTFPL | | [hashbrown](https://github.com/rust-lang/hashbrown) | 0.14.5 | [Amanieu d'Antras](mailto:amanieu@gmail.com) | Apache-2.0 or MIT | @@ -27,11 +27,11 @@ | [libc](https://github.com/rust-lang/libc) | 0.2.155 | The Rust Project Developers | Apache-2.0 or MIT | | [libdeflate-sys](https://github.com/adamkewley/libdeflater) | 1.20.0 | [Adam Kewley](mailto:contact@adamkewley.com) | Apache-2.0 | | [libdeflater](https://github.com/adamkewley/libdeflater) | 1.20.0 | [Adam Kewley](mailto:contact@adamkewley.com) | Apache-2.0 | -| [log](https://github.com/rust-lang/log) | 0.4.21 | The Rust Project Developers | Apache-2.0 or MIT | +| [log](https://github.com/rust-lang/log) | 0.4.22 | The Rust Project Developers | Apache-2.0 or MIT | | [mozjpeg-sys](https://github.com/kornelski/mozjpeg-sys.git) | 2.2.0 | [Kornel](mailto:kornel@geekhood.net) | IJG AND Zlib AND BSD-3-Clause | | [oxipng](https://github.com/shssoichiro/oxipng) | 9.1.1 | [Joshua Holmer](mailto:jholmer.in@gmail.com) | MIT | | [radium](https://github.com/bitvecto-rs/radium) | 0.7.0 | [Nika Layzell](mailto:nika@thelayzells.com) and [myrrlyn](mailto:self@myrrlyn.dev) | MIT | -| [rgb](https://github.com/kornelski/rust-rgb) | 0.8.37 | [Kornel Lesiński](mailto:kornel@geekhood.net) | MIT | +| [rgb](https://github.com/kornelski/rust-rgb) | 0.8.40 | [Kornel Lesiński](mailto:kornel@geekhood.net) | MIT | | [rustc-hash](https://github.com/rust-lang-nursery/rustc-hash) | 1.1.0 | The Rust Project Developers | Apache-2.0 or MIT | | 
[tap](https://github.com/myrrlyn/tap) | 1.0.1 | [Elliott Linder](mailto:elliott.darfink@gmail.com) and [myrrlyn](mailto:self@myrrlyn.dev) | MIT | | [tempfile](https://github.com/Stebalien/tempfile) | 3.10.1 | [Steven Allen](mailto:steven@stebalien.com), The Rust Project Developers, [Ashley Mannix](mailto:ashleymannix@live.com.au), and [Jason White](mailto:me@jasonwhite.io) | Apache-2.0 or MIT | @@ -39,4 +39,4 @@ | [unicode-width](https://github.com/unicode-rs/unicode-width) | 0.1.13 | [kwantam](mailto:kwantam@gmail.com) and [Manish Goregaokar](mailto:manishsmail@gmail.com) | Apache-2.0 or MIT | | [write_atomic](https://github.com/Blobfolio/write_atomic) | 0.5.0 | [Blobfolio, LLC.](mailto:hello@blobfolio.com) | WTFPL | | [wyz](https://github.com/myrrlyn/wyz) | 0.5.1 | [myrrlyn](mailto:self@myrrlyn.dev) | MIT | -| [zerocopy](https://github.com/google/zerocopy) | 0.7.34 | [Joshua Liebow-Feeser](mailto:joshlf@google.com) | Apache-2.0, BSD-2-Clause, or MIT | +| [zerocopy](https://github.com/google/zerocopy) | 0.7.35 | [Joshua Liebow-Feeser](mailto:joshlf@google.com) | Apache-2.0, BSD-2-Clause, or MIT | diff --git a/flaca/Cargo.toml b/flaca/Cargo.toml index 878fd00..8ad3b50 100644 --- a/flaca/Cargo.toml +++ b/flaca/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "flaca" -version = "3.1.2" +version = "3.1.3" license = "WTFPL" authors = ["Josh Stoik <josh@blobfolio.com>"] edition = "2021" diff --git a/flaca/src/main.rs b/flaca/src/main.rs index 4f06bce..4f6f6fa 100644 --- a/flaca/src/main.rs +++ b/flaca/src/main.rs @@ -54,7 +54,6 @@ use dactyl::{ NiceElapsed, NiceU64, traits::{ - BytesToSigned, BytesToUnsigned, NiceInflection, }, @@ -140,9 +139,7 @@ fn _main() -> Result<(), FlacaError> { // Zopfli iterations. if let Some(n) = args.option(b"-z") { - let n = i32::btoi(n) - .filter(|n| n.is_positive()) - .ok_or(FlacaError::ZopfliIterations)?; + let n = u32::btou(n).ok_or(FlacaError::ZopfliIterations)?; flapfli::ZOPFLI_ITERATIONS.store(n, Relaxed); } diff --git a/flapfli/Cargo.toml b/flapfli/Cargo.toml index 51ee11d..9d796cd 100644 --- a/flapfli/Cargo.toml +++ b/flapfli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "flapfli" -version = "3.1.2" +version = "3.1.3" license = "WTFPL" authors = ["Josh Stoik <josh@blobfolio.com>"] edition = "2021" @@ -17,11 +17,6 @@ exclude = [ [dependencies] crc32fast = "=1.4.*" -dactyl = "0.7.*" - -[dependencies.ahash] -version = "=0.8.*" -default-features = false [build-dependencies] bindgen = "0.69.*" diff --git a/flapfli/build.rs b/flapfli/build.rs index be3b738..f15b696 100644 --- a/flapfli/build.rs +++ b/flapfli/build.rs @@ -20,6 +20,27 @@ const DISTANCE_EXTRA_BITS_MASK: [(u32, u32); 16] = [ (8193, 4095), (16_385, 8191), (32_769, 16_383), ]; +/// # Distance Extra Bits (by Symbol). +const DISTANCE_BITS: [u8; 32] = [ + 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, + 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 0, 0, +]; + +/// # Length Symbol Bits (by Litlen).
+const LENGTH_SYMBOL_BITS: [u8; 259] = [ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, +]; + const ZOPFLI_WINDOW_SIZE: u16 = 32_768; @@ -80,11 +101,12 @@ fn build_symbols() { use std::fmt::Write; let mut out = format!( - "{}{}{}{}{}{}", - NumEnum::new(0..19_u8, "Whackadoodle Deflate Indices.", "DeflateSym") + "{}{}{}{}{}{}{}", + NumEnum::new(0..19_u8, "Extended Deflate Indices.", "DeflateSym") .with_debug() .with_eq() .with_iter(), + NumEnum::new(0..16_u8, "Basic Deflate Indices.", "DeflateSymBasic").with_eq(), NumEnum::new(0..32_u16, "Distance Symbols.", "Dsym"), NumEnum::new(0..259_u16, "Lit/Lengths.", "LitLen").with_eq().with_iter(), NumEnum::new(0..286_u16, "Lit/Length Symbols.", "Lsym"), @@ -135,6 +157,26 @@ pub(crate) const DISTANCE_VALUES: &[u16; 32_768] = &["); } out.push_str("\n];\n"); + /// # Distance and length bits. + /// + /// Generate integer and float constants for our bit arrays. + fn bits_and_bobs(title: &str, name: &str, arr: [u8; N]) -> String { + format!( + "/// # {title}. +pub(crate) const {name}: [u8; {N}] = {arr:?}; + +/// # {title} (Float). +/// +/// This is identical to the `u8` version, but avoids a lot of `f64::from` calls. +pub(crate) const {name}_F: [f64; {N}] = {:?}; +", + arr.map(f64::from), + ) + } + + out.push_str(&bits_and_bobs("Distance Bits (by Symbol)", "DISTANCE_BITS", DISTANCE_BITS)); + out.push_str(&bits_and_bobs("Length Bits (by Symbol)", "LENGTH_SYMBOL_BITS", LENGTH_SYMBOL_BITS)); + // Save it! write(&out_path("symbols.rs"), out.as_bytes()); } @@ -328,6 +370,7 @@ pub(crate) struct {name}Iter({kind}); impl Iterator for {name}Iter {{ type Item = {name}; + fn next(&mut self) -> Option {{ let old = self.0; if old < {end} {{ @@ -337,6 +380,11 @@ impl Iterator for {name}Iter {{ }} else {{ None }} }} + + fn size_hint(&self) -> (usize, Option) {{ + let len = self.len(); + (len, Some(len)) + }} }} impl ExactSizeIterator for {name}Iter {{ diff --git a/flapfli/src/deflate.rs b/flapfli/src/deflate.rs new file mode 100644 index 0000000..05e74d2 --- /dev/null +++ b/flapfli/src/deflate.rs @@ -0,0 +1,351 @@ +/*! +# Flapfli: Deflate. + +This module contains the custom lodepng callback (that uses zopfli), and +supporting components. +*/ + +use std::{ + cell::RefCell, + ffi::{ + c_uchar, + c_uint, + }, + num::{ + NonZeroUsize, + NonZeroU32, + }, + sync::atomic::Ordering::Relaxed, +}; +use super::{ + deflate_part, + ffi::flapfli_allocate, + lodepng::LodePNGCompressSettings, + ZOPFLI_ITERATIONS, + ZOPFLI_MASTER_BLOCK_SIZE, + ZopfliChunk, + ZopfliState, +}; + + + +#[allow(unsafe_code)] +/// # Twenty is Non-Zero. +const NZ20: NonZeroU32 = unsafe { NonZeroU32::new_unchecked(20) }; + +#[allow(unsafe_code)] +/// # Sixty is Non-Zero. 
+const NZ60: NonZeroU32 = unsafe { NonZeroU32::new_unchecked(60) }; + +#[allow(unsafe_code)] +/// # Max Iterations. +const MAX_ITERATIONS: NonZeroU32 = unsafe { NonZeroU32::new_unchecked(i32::MAX as u32) }; + + + +#[no_mangle] +#[allow(unsafe_code)] +/// # Custom PNG Deflate. +/// +/// This is a custom deflate callback for lodepng. When set, image blocks are +/// compressed using zopfli instead of basic-ass deflate. +/// +/// Zopfli is a monster, though, so this is only actually used for the final +/// pass. (Brute force strategizing uses cheaper compression.) +/// +/// Following C convention, this returns `0` for success, `1` for sadness. +/// +/// ## Safety +/// +/// The mutable pointers may or may not initially be null. Allocations are +/// handled on the Rust side, though, and those methods are aware of the fact +/// and will later act (or not act) on these pointer accordingly. +/// +/// The `arr`/`insize` values, on the other hand, _should_ definitely be +/// initialized and valid. We can't verify that, but their existence is the +/// whole point of this callback, so it's probably fine… +/// +/// Flaca processes images in parallel, but the lodepng/zopfli operations are +/// single-threaded. (All work for a given image happens on a single thread.) +/// This is why we can leverage local statics like `STATE` without fear of +/// access contention. +pub(crate) extern "C" fn flaca_png_deflate( + out: *mut *mut c_uchar, + outsize: *mut usize, + arr: *const c_uchar, + insize: usize, + _settings: *const LodePNGCompressSettings, +) -> c_uint { + thread_local!( + static STATE: RefCell> = RefCell::new(ZopfliState::new()) + ); + + // Group the pointer crap to cut down on the number of args being + // passed around. + let mut dst = ZopfliOut { + bp: 0, + out, + outsize, + }; + + // Make a proper slice out of the data. + let arr = unsafe { std::slice::from_raw_parts(arr, insize) }; + + // Figure out how many iterations to use. + let numiterations = NonZeroU32::new(ZOPFLI_ITERATIONS.load(Relaxed)).map_or( + if arr.len() < 200_000 { NZ60 } else { NZ20 }, + |custom| NonZeroU32::min(custom, MAX_ITERATIONS) + ); + + // Compress in chunks, à la ZopfliDeflate. + for chunk in DeflateIter::new(arr) { + #[cfg(not(debug_assertions))] + if STATE.with_borrow_mut(|state| deflate_part( + state, + numiterations, + chunk.total_len().get() == arr.len(), + chunk, + &mut dst, + )).is_err() { return 1; }; + + #[cfg(debug_assertions)] + if let Err(e) = STATE.with_borrow_mut(|state| deflate_part( + state, + numiterations, + chunk.total_len().get() == arr.len(), + chunk, + &mut dst, + )) { panic!("{e}"); }; + } + + // All clear! + 0 +} + + + +/// # Lodepng Output Pointers. +/// +/// This struct serves as a convenience wrapper for the various lodepng/zopfli +/// output pointers, saving us the trouble of passing each of them individually +/// down the rabbit hole. +/// +/// This struct also enables us to centralize the convoluted bit-writing +/// methods used to record data, minimizing — as much as possible — the use of +/// `unsafe` everywhere else. +pub(super) struct ZopfliOut { + bp: u8, + out: *mut *mut u8, + outsize: *mut usize, +} + +impl ZopfliOut { + #[allow(unsafe_code)] + #[inline] + /// # Append Data. + /// + /// This adds a single byte to the output array, re-allocating as + /// necessary. The `outsize` value is incremented accordingly. + /// + /// In practice, most data is written bit-by-bite rather than byte-by-byte. 
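The iteration fallback used by this callback (sixty iterations for inputs under 200,000 bytes, twenty otherwise, with explicit values clamped to `i32::MAX`) can be sketched as a standalone function. `pick_iterations` here is illustrative only, not part of the crate:

use std::num::NonZeroU32;

/// Mirror of the selection logic in `flaca_png_deflate`: a non-zero user
/// preference wins (clamped to `i32::MAX`), otherwise small inputs get 60
/// iterations and large ones get 20.
fn pick_iterations(user_pref: u32, input_len: usize) -> NonZeroU32 {
    let cap = NonZeroU32::new(i32::MAX as u32).unwrap();
    let fallback = NonZeroU32::new(if input_len < 200_000 { 60 } else { 20 }).unwrap();
    match NonZeroU32::new(user_pref) {
        Some(custom) => custom.min(cap),
        None => fallback,
    }
}

fn main() {
    assert_eq!(pick_iterations(0, 100_000).get(), 60); // small default
    assert_eq!(pick_iterations(0, 500_000).get(), 20); // large default
    assert_eq!(pick_iterations(5, 500_000).get(), 5);  // explicit -z 5
}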
+ /// As such, most calls to this method simply write a zero and bit-OR it a + /// few times afterwards. + fn append_data(&mut self, value: u8) { + #[cold] + /// # Allocate. + /// + /// Re/allocation is (potentially) necessary whenever `outsize` reaches + /// a power of two, but since that value represents the length written + /// rather than the actual capacity, this is often a no-op (after some + /// checking). + /// + /// As such, we don't want all this stuff affecting the compiler's + /// inlining decisions, hence the cold wrapper. + unsafe fn alloc_cold(ptr: *mut u8, size: usize) -> *mut u8 { + flapfli_allocate( + ptr, + NonZeroUsize::new(size * 2).unwrap_or(NonZeroUsize::MIN), + ) + } + + unsafe { + // Dereference the size once to save some sanity. + let size = *self.outsize; + + // (Re)allocate if size is a power of two, or empty. + if 0 == (size & size.wrapping_sub(1)) { + *self.out = alloc_cold(*self.out, size); + } + + // Write the value and bump the outside length counter. + (*self.out).add(size).write(value); + self.outsize.write(size + 1); + } + } +} + +impl ZopfliOut { + #[allow(clippy::doc_markdown)] + #[inline] + /// # Add Bit. + /// + /// This adds a single bit to the output array. When the internal `bp` + /// counter is zero that bit gets added on top of a new zero byte, + /// otherwise it is ORed on top of the last one. + pub(crate) fn add_bit(&mut self, bit: u8) { + if self.bp == 0 { self.append_data(0); } + #[allow(unsafe_code)] + unsafe { + // Safety: `append_data` writes a byte to `outsize` and then + // increments it, so to reach and modify that same position we need + // to use `outsize - 1` instead. + *(*self.out).add(*self.outsize - 1) |= bit << self.bp; + } + self.bp = self.bp.wrapping_add(1) & 7; + } + + /// # Add Multiple Bits. + /// + /// This method is used to write multiple bits — `length` of them — at + /// once, shifting on each pass. + pub(crate) fn add_bits(&mut self, symbol: u32, length: u32) { + for i in 0..length { + let bit = (symbol >> i) & 1; + self.add_bit(bit as u8); + } + } + + #[inline] + /// # Add Multiple Bits. + /// + /// Same as `ZopfliOut::add_bits`, but optimized for lengths known at + /// compile-time. + /// + /// ## Panics + /// + /// This will panic at compile-time if `N` is less than two. + pub(crate) fn add_fixed_bits(&mut self, symbol: u32) { + const { assert!(1 < N); } + for i in const { 0..N } { + let bit = (symbol >> i) & 1; + self.add_bit(bit as u8); + } + } + + #[inline] + /// # Add Type Bits Header. + /// + /// This writes the three-bit block type header. In practice, there are + /// only three possible values: + /// * 0 for uncompressed; + /// * 1 for fixed; + /// * 2 for dynamic; + pub(crate) fn add_header(&mut self, last_block: bool) { + self.add_bit(u8::from(last_block)); + self.add_bit(const { BLOCK_BIT & 1 }); + self.add_bit(const { (BLOCK_BIT & 2) >> 1 }); + } + + /// # Add Huffman Bits. + /// + /// Same as `ZopfliOut::add_bits`, but the bits are written in the + /// reverse order to keep life interesting. + pub(crate) fn add_huffman_bits(&mut self, symbol: u32, length: u32) { + // Same as add_bits, except we're doing it backwards. + for i in (0..length).rev() { + let bit = (symbol >> i) & 1; + self.add_bit(bit as u8); + } + } + + #[allow(clippy::cast_possible_truncation)] + /// # Add Non-Compressed Block. + /// + /// As one might suspect, uncompressed blocks are virtually never smaller + /// than compressed blocks, so this method is included more for + /// completeness than anything else. + /// + /// But who knows? 
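The `ZopfliOut` bit-writing scheme is easier to see with an owned buffer. The sketch below (a hypothetical `BitWriter`, not the crate's type) packs bits into each byte from the least-significant position up and writes Huffman codes most-significant bit first, as the methods above describe:

/// Vec-backed sketch of the `ZopfliOut` bit-writing scheme: bits fill each
/// byte from the least-significant position up, and Huffman codes go in
/// most-significant bit first.
struct BitWriter {
    out: Vec<u8>,
    bp: u8, // next bit position within the current byte (0..=7)
}

impl BitWriter {
    fn new() -> Self { Self { out: Vec::new(), bp: 0 } }

    /// Append one bit, starting a fresh byte whenever `bp` has wrapped to zero.
    fn add_bit(&mut self, bit: u8) {
        if self.bp == 0 { self.out.push(0); }
        *self.out.last_mut().unwrap() |= bit << self.bp;
        self.bp = (self.bp + 1) & 7;
    }

    /// Append `length` bits of `symbol`, least-significant first.
    fn add_bits(&mut self, symbol: u32, length: u32) {
        for i in 0..length { self.add_bit(((symbol >> i) & 1) as u8); }
    }

    /// Append `length` bits of `symbol` in reverse (Huffman) order.
    fn add_huffman_bits(&mut self, symbol: u32, length: u32) {
        for i in (0..length).rev() { self.add_bit(((symbol >> i) & 1) as u8); }
    }
}

fn main() {
    let mut w = BitWriter::new();
    w.add_bits(0b101, 3);         // lands in bits 0..3
    w.add_huffman_bits(0b110, 3); // reversed, lands in bits 3..6
    assert_eq!(w.out, vec![0b0001_1101]);
}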
+ /// + /// Implementation-wise, this requires no statistical data; it merely + /// loops through the raw data in chunks of `u16::MAX`, writes some + /// header/size data, then copies the bytes over. + pub(crate) fn add_uncompressed_block( + &mut self, + last_block: bool, + chunk: ZopfliChunk<'_>, + ) { + // We need to proceed u16::MAX bytes at a time. + let iter = chunk.block().chunks(usize::from(u16::MAX)); + let len = iter.len() - 1; + for (i, block) in iter.enumerate() { + let blocksize = block.len(); + let nlen = ! blocksize; + let really_last_block = i == len; + + // Each chunk gets its own header. + self.add_header::<0>(last_block && really_last_block); + + // Ignore bits of input up to the next byte boundary. + self.bp = 0; + + // Some size details. + self.append_data((blocksize % 256) as u8); + self.append_data((blocksize.wrapping_div(256) % 256) as u8); + self.append_data((nlen % 256) as u8); + self.append_data((nlen.wrapping_div(256) % 256) as u8); + + // And finally the data! + for byte in block.iter().copied() { self.append_data(byte); } + } + } +} + + + +/// # Deflate Chunk Iterator. +/// +/// Zopfli processes image data in chunks of (up to) a million bytes, but for +/// some reason it needs to see any previously-seen data on each pass too. +/// +/// This iterator thus yields increasingly larger slices of `arr`, until +/// eventually the whole thing is returned. The internal `pos` value tracks the +/// start of the "active" portion. +/// +/// See `ZopfliChunk` for more information. Haha. +struct DeflateIter<'a> { + arr: &'a [u8], + pos: usize, +} + +impl<'a> Iterator for DeflateIter<'a> { + type Item = ZopfliChunk<'a>; + + fn next(&mut self) -> Option { + if self.pos < self.arr.len() { + let pos = self.pos; + let chunk = self.arr.get(..pos + ZOPFLI_MASTER_BLOCK_SIZE).unwrap_or(self.arr); + self.pos = chunk.len(); + ZopfliChunk::new(chunk, pos).ok() + } + else { None } + } + + fn size_hint(&self) -> (usize, Option) { + let len = self.len(); + (len, Some(len)) + } +} + +impl<'a> ExactSizeIterator for DeflateIter<'a> { + fn len(&self) -> usize { + (self.arr.len() - self.pos).div_ceil(ZOPFLI_MASTER_BLOCK_SIZE) + } +} + +impl<'a> DeflateIter<'a> { + /// # New. + const fn new(arr: &'a [u8]) -> Self { + Self { arr, pos: 0 } + } +} diff --git a/flapfli/src/ffi.rs b/flapfli/src/ffi.rs index ab2afd9..b766150 100644 --- a/flapfli/src/ffi.rs +++ b/flapfli/src/ffi.rs @@ -1,7 +1,7 @@ /*! # Flapfli: FFI Image Wrapper. -This module contains custom allocation wrappers for `lodepng`, allowing Rust +This module contains custom allocation wrappers for lodepng, allowing Rust to (more or less) manage the memory. */ @@ -14,6 +14,7 @@ use std::{ realloc, }, ffi::c_void, + num::NonZeroUsize, ops::Deref, ptr::NonNull, }; @@ -28,10 +29,13 @@ const USIZE_SIZE: usize = std::mem::size_of::(); #[derive(Debug)] /// # Encoded Image. /// -/// This is a convenience wrapper for an image encoded by `lodepng`, allowing +/// This is a convenience wrapper for an image encoded by lodepng, allowing /// for easy slice dereferencing and automatic drop cleanup. /// -/// Note the initial state is null/empty. +/// Note the initial state will be null/empty. +/// +/// Allocations are handled by Rust, at least, and are aware of that fact so +/// will act (or not act) on the pointers accordingly. pub struct EncodedPNG { /// # Buffer. 
pub(crate) buf: *mut u8, @@ -46,7 +50,7 @@ impl Deref for EncodedPNG { #[allow(unsafe_code)] #[inline] fn deref(&self) -> &Self::Target { - if self.is_empty() { &[] } + if self.is_null() { &[] } else { unsafe { std::slice::from_raw_parts(self.buf, self.size) } } @@ -70,10 +74,14 @@ impl EncodedPNG { } } - /// # Is Empty? + /// # Is Null? + /// + /// This is essentially an `is_empty`, returning `true` if the length value + /// is zero or the buffer pointer is literally null. /// - /// Returns true if the instance is empty. - fn is_empty(&self) -> bool { self.size == 0 || self.buf.is_null() } + /// (The name was chosen to help avoid conflicts with dereferenced slice + /// methods.) + pub(crate) fn is_null(&self) -> bool { self.size == 0 || self.buf.is_null() } } @@ -83,17 +91,20 @@ impl EncodedPNG { /// # (Re)Allocate! /// /// Allocate (or reallocate) and return a new pointer for `size` bytes that can -/// be used by the crate or `lodepng` or both. +/// be used by the crate or lodepng or both. /// /// Since C can't be trusted to keep track of allocation sizes, we use the same /// trick the [`libdeflater`](https://github.com/adamkewley/libdeflater/blob/master/src/malloc_wrapper.rs) crate does; -/// we over-allocate by `size_of::()` bytes, use that extra space to -/// hold the length details, and return the rest so the caller gets what it -/// expects. +/// we over-allocate by `size_of::()` bytes, using that extra space to +/// hold the length details. +/// +/// The caller then gets `ptr.add(size_of::())` sized as they expect it +/// to be, and when that pointer is returned to us, we can subtract the same +/// amount to find the length. Rinse and repeat. /// /// This still requires a lot of unsafe, but at least it lives on this side of /// the FFI divide! -pub(crate) unsafe fn flapfli_allocate(ptr: *mut u8, new_size: usize) -> *mut u8 { +pub(crate) unsafe fn flapfli_allocate(ptr: *mut u8, new_size: NonZeroUsize) -> *mut u8 { let real_ptr = // If null, allocate it fresh. if ptr.is_null() { @@ -105,26 +116,23 @@ pub(crate) unsafe fn flapfli_allocate(ptr: *mut u8, new_size: usize) -> *mut u8 // Otherwise resize! else { let (real_ptr, old_size) = size_and_ptr(ptr); - realloc(real_ptr, layout_for(old_size), new_size + USIZE_SIZE) + // Return it as-was if the allocation is already sufficient. + if old_size >= new_size { return ptr; } + realloc(real_ptr, layout_for(old_size), USIZE_SIZE + new_size.get()) }; // Safety: the layout is aligned to usize. - real_ptr.cast::().write(new_size); // Write the length. - real_ptr.add(USIZE_SIZE) // Return the rest. + real_ptr.cast::().write(new_size.get()); // Write the length. + real_ptr.add(USIZE_SIZE) // Return the rest. } #[allow(unsafe_code, clippy::inline_always)] #[inline(always)] -/// # (Re)Allocate! +/// # Freedom! /// -/// Allocate (or reallocate) and return a new pointer for `size` bytes that can -/// be used by the crate or C or both. -/// -/// The trick — courtesy of the [`libdeflater`](https://github.com/adamkewley/libdeflater/blob/master/src/malloc_wrapper.rs) crate — -/// is we over-allocate by `size_of::()`, using that extra space to hold -/// the length so that later on, we can de- or re-allocate correctly. -/// -/// This still requires a lot of unsafe, but at least that unsafe lives here! +/// This method deallocates a pointer previously allocated by +/// `flapfli_allocate`. Refer to that method's documentation for the how and +/// why. pub(crate) unsafe fn flapfli_free(ptr: *mut u8) { if ! 
ptr.is_null() { let (ptr, size) = size_and_ptr(ptr); @@ -136,23 +144,36 @@ pub(crate) unsafe fn flapfli_free(ptr: *mut u8) { #[no_mangle] #[allow(unsafe_code)] -/// # Free Willy. +/// # Lodepng-specific Free. +/// +/// This override allows lodepng to use `flapfli_free` for pointer +/// deallocation. unsafe extern "C" fn lodepng_free(ptr: *mut c_void) { flapfli_free(ptr.cast()); } #[no_mangle] #[allow(unsafe_code)] /// # Lodepng-specific Malloc. /// -/// This is the same as ours, but casts to `c_void` for the ABI. +/// This override allows lodepng to use `flapfli_allocate` for pointer +/// allocation. unsafe extern "C" fn lodepng_malloc(size: usize) -> *mut c_void { - flapfli_allocate(std::ptr::null_mut(), size).cast() + flapfli_allocate( + std::ptr::null_mut(), + NonZeroUsize::new(size).unwrap_or(NonZeroUsize::MIN), + ).cast() } #[no_mangle] #[allow(unsafe_code)] -/// # Re-allocate! +/// # Lodepng-specific Realloc. +/// +/// This override allows lodepng to use `flapfli_allocate` for pointer +/// resizing. unsafe extern "C" fn lodepng_realloc(ptr: *mut c_void, new_size: usize) -> *mut c_void { - flapfli_allocate(ptr.cast(), new_size).cast() + flapfli_allocate( + ptr.cast(), + NonZeroUsize::new(new_size).unwrap_or(NonZeroUsize::MIN), + ).cast() } @@ -163,8 +184,8 @@ unsafe extern "C" fn lodepng_realloc(ptr: *mut c_void, new_size: usize) -> *mut /// /// This returns an appropriately sized and aligned layout with room at the /// beginning to hold our "secret" length information. -const unsafe fn layout_for(size: usize) -> Layout { - Layout::from_size_align_unchecked(USIZE_SIZE + size, std::mem::align_of::()) +const unsafe fn layout_for(size: NonZeroUsize) -> Layout { + Layout::from_size_align_unchecked(USIZE_SIZE + size.get(), std::mem::align_of::()) } #[allow(unsafe_code, clippy::cast_ptr_alignment, clippy::inline_always)] @@ -172,10 +193,12 @@ const unsafe fn layout_for(size: usize) -> Layout { /// # Derive Real Pointer and User Size. /// /// This method takes the `size`-sized pointer shared with the rest of the -/// crate (and `lodepng`) and converts it to the "real" one containing the -/// extra length information, returning it along with said length. -const unsafe fn size_and_ptr(ptr: *mut u8) -> (*mut u8, usize) { +/// crate (and lodepng) and converts it to the "real" one (with the leading +/// length details), returning it and the logical size (i.e. minus eight bytes +/// or whatever). +const unsafe fn size_and_ptr(ptr: *mut u8) -> (*mut u8, NonZeroUsize) { let size_and_data_ptr = ptr.sub(USIZE_SIZE); - let size = *(size_and_data_ptr as *const usize); + // Safety: the size is written from a NonZeroUsize. + let size = NonZeroUsize::new_unchecked(*(size_and_data_ptr as *const usize)); (size_and_data_ptr, size) } diff --git a/flapfli/src/lib.rs b/flapfli/src/lib.rs index 300ef65..15e9407 100644 --- a/flapfli/src/lib.rs +++ b/flapfli/src/lib.rs @@ -34,23 +34,128 @@ heavily optimized flaca's specific use cases (hence "fla" + "pfli"). clippy::redundant_pub_crate, )] +mod deflate; mod ffi; mod lodepng; mod zopflipng; use ffi::EncodedPNG; -pub use zopflipng::optimize; - -use std::sync::atomic::AtomicI32; +use lodepng::{ + DecodedImage, + LodePNGColorType, + LodePNGFilterStrategy, + LodePNGState, +}; +use std::sync::atomic::AtomicU32; use zopflipng::{ deflate_part, - reset_dynamic_length_cache, - SplitPoints, ZOPFLI_MASTER_BLOCK_SIZE, + ZopfliChunk, ZopfliState, }; /// # Number of Zopfli Iterations. 
-pub static ZOPFLI_ITERATIONS: AtomicI32 = AtomicI32::new(0); +/// +/// A non-zero value indicates a fixed user preference (capped at `i32::MAX`, +/// though anything above a few thousand is usually terrible). If zero, the +/// number of iterations will vary by file size. +/// +/// This is only actually written to once, if ever, but is atomic to make it +/// easier to read the value from within the callback. (That callback is Rust, +/// but called from C.) +pub static ZOPFLI_ITERATIONS: AtomicU32 = AtomicU32::new(0); + +#[must_use] +/// # Optimize! +/// +/// This will attempt to losslessly recompress the source PNG with the +/// strongest Zopfli filter strategy, and return a new PNG image if the result +/// is smaller than the original. +/// +/// Note: 16-bit transformations are not lossless; such images will have their +/// bit depths reduced to a more typical 8 bits. +pub fn optimize(src: &[u8]) -> Option { + // Start by decoding the source. + let mut dec = LodePNGState::default(); + let img = dec.decode(src)?; + + // Find the right strategy. + let mut enc = LodePNGState::encoder(&dec)?; + let mut out = EncodedPNG::new(); + let strategy = best_strategy(&img, &mut enc, &mut out); + + // Now re-re-encode with zopfli and the best strategy. + enc.set_strategy(strategy); + enc.set_zopfli(); + if enc.encode(&img, &mut out) { + // For really small images, we might be able to save even more by + // nuking the palette. + if out.size < 4096 && LodePNGColorType::LCT_PALETTE.is_match(&out) { + if let Some(out2) = enc.try_small(&img) { + if out2.size < out.size && out2.size < src.len() { + // We improved again! + return Some(out2); + } + } + } + + // We improved! + if out.size < src.len() { return Some(out); } + } + + None +} + +#[track_caller] +#[allow(unsafe_code)] +/// # Unreachable Hint. +/// +/// This is a simple unreachability wrapper that calls `unreachable!` when +/// debug assertions are enabled, or the quieter `hint::unreachable_unchecked` +/// when not. +/// +/// Especially since the latter is unsafe, this helps prevent the compiler +/// from making stupid inlining decisions in hot blocks. Haha. +pub(crate) const fn unreachable() { + #[cfg(debug_assertions)] unreachable!(); + #[cfg(not(debug_assertions))] unsafe { core::hint::unreachable_unchecked(); } +} + + + +/// # Best Strategy. +/// +/// This re-encodes the image (quickly) using each strategy, returning +/// whichever produced the smallest output. +/// +/// Skipping zopfli here saves _a ton_ of processing time and (almost) never +/// changes the answer, so it's a shortcut worth taking. +fn best_strategy( + img: &DecodedImage, + enc: &mut LodePNGState, + out: &mut EncodedPNG, +) -> LodePNGFilterStrategy { + let mut best_size = usize::MAX; + let mut best_strategy = LodePNGFilterStrategy::LFS_ZERO; + + for strategy in [ + LodePNGFilterStrategy::LFS_ZERO, + LodePNGFilterStrategy::LFS_ONE, + LodePNGFilterStrategy::LFS_TWO, + LodePNGFilterStrategy::LFS_THREE, + LodePNGFilterStrategy::LFS_FOUR, + LodePNGFilterStrategy::LFS_MINSUM, + LodePNGFilterStrategy::LFS_ENTROPY, + LodePNGFilterStrategy::LFS_BRUTE_FORCE, + ] { + enc.set_strategy(strategy); + if enc.encode(img, out) && out.size < best_size { + best_size = out.size; + best_strategy = strategy; + } + } + + best_strategy +} diff --git a/flapfli/src/lodepng.rs b/flapfli/src/lodepng.rs index 61449ef..b20d608 100644 --- a/flapfli/src/lodepng.rs +++ b/flapfli/src/lodepng.rs @@ -6,30 +6,17 @@ This module contains FFI bindings to `lodepng.c`. 
#![allow(non_camel_case_types, non_upper_case_globals)] -use crate::{ - ffi::{ - flapfli_allocate, - flapfli_free, - }, - ZOPFLI_ITERATIONS, -}; use std::{ - cell::RefCell, ffi::{ c_uchar, c_uint, }, mem::MaybeUninit, - ops::Range, - sync::atomic::Ordering::Relaxed, }; use super::{ - deflate_part, + deflate::flaca_png_deflate, EncodedPNG, - reset_dynamic_length_cache, - SplitPoints, - ZopfliState, - ZOPFLI_MASTER_BLOCK_SIZE, + ffi::flapfli_free, }; @@ -37,82 +24,17 @@ use super::{ // Generated by build.rs. include!(concat!(env!("OUT_DIR"), "/lodepng-bindgen.rs")); -thread_local!( - static STATES: RefCell<(ZopfliState, SplitPoints)> = RefCell::new(( - ZopfliState::new(), - SplitPoints::new(), - )) -); - - - -#[no_mangle] -#[allow(unsafe_code)] -/// # Custom PNG Deflate. -/// -/// This tells lodepng to use zopfli for encoding. -pub(crate) extern "C" fn flaca_png_deflate( - out: *mut *mut c_uchar, - outsize: *mut usize, - arr: *const c_uchar, - insize: usize, - _settings: *const LodePNGCompressSettings, -) -> c_uint { - // Figure out how many iterations to use. - let mut numiterations = ZOPFLI_ITERATIONS.load(Relaxed); - if numiterations <= 0 { - numiterations = if insize < 200_000 { 60 } else { 20 }; - } - - // Compact the pointers. - let mut dst = ZopfliOut { - bp: 0, - out, - outsize, - }; - - // Compress in chunks, à la ZopfliDeflate. - reset_dynamic_length_cache(); - let mut i: usize = 0; - while i < insize { - // Each pass needs to know if it is the last, and how much data to - // handle. - let (last_part, size) = - if i + ZOPFLI_MASTER_BLOCK_SIZE >= insize { (true, insize - i) } - else { (false, ZOPFLI_MASTER_BLOCK_SIZE) }; - - // Crunch the part! - let res = STATES.with_borrow_mut(|(state, splits)| deflate_part( - state, - splits, - numiterations, - last_part, - unsafe { std::slice::from_raw_parts(arr, i + size) }, - i, - &mut dst, - )); - - #[cfg(debug_assertions)] if let Err(e) = res { panic!("{e}"); } - - // Errors shouldn't be possible, but if something happens to go wrong, - // return one so lodepng can abandon its efforts. - if res.is_err() { return 1; } - - // Onward and upward! - i += size; - } - // Errors panic, so if we're here everything must be fine. - 0 -} #[no_mangle] #[inline(always)] #[allow(unsafe_code, clippy::inline_always)] /// # Lodepng CRC32. /// -/// Replace lodepng's native CRC32 hashing method with Rust's (faster) -/// `crc32fast`. +/// This override allows lodepng to use `crc32fast` for CRC hashing. +/// +/// Note: this is more about relative safety than performance; CRC processing +/// times are negligible compared to everything else. Haha. pub(crate) extern "C" fn lodepng_crc32(buf: *const c_uchar, len: usize) -> c_uint { let mut h = crc32fast::Hasher::new(); h.update(unsafe { std::slice::from_raw_parts(buf, len) }); @@ -123,6 +45,10 @@ pub(crate) extern "C" fn lodepng_crc32(buf: *const c_uchar, len: usize) -> c_uin #[derive(Debug)] /// # Decoded Image. +/// +/// This is a simple wrapper holding a pointer to a decoded image along with +/// the image dimensions. It enables us to hold one thing instead of three +/// while also ensuring the memory is freed correctly on drop. pub(super) struct DecodedImage { pub(super) buf: *mut c_uchar, pub(super) w: c_uint, @@ -137,111 +63,6 @@ impl Drop for DecodedImage { } } - - -/// # Lodepng Output Pointers. 
-/// -/// This struct provides a wrapper around the lingering bit-writing zopfli C -/// methods, saving us the trouble of having to pass down three different -/// pointers (and using a bunch of unsafe blocks) just to get the data saved. -pub(super) struct ZopfliOut { - bp: u8, - out: *mut *mut u8, - outsize: *mut usize, -} - -impl ZopfliOut { - #[allow(unsafe_code)] - #[inline(never)] - /// # Append Data. - fn append_data(&mut self, value: u8) { - unsafe { - // Dereferencing this size gets annoying quick! Haha. - let size = *self.outsize; - - // (Re)allocate if size is a power of two, or empty. - if 0 == (size & size.wrapping_sub(1)) { - *self.out = flapfli_allocate(*self.out, usize::max(size * 2, 1)); - } - - (*self.out).add(size).write(value); - self.outsize.write(size + 1); - } - } -} - -impl ZopfliOut { - #[allow(unsafe_code)] - /// # Add Bit. - pub(crate) fn add_bit(&mut self, bit: u8) { - if self.bp == 0 { self.append_data(0); } - unsafe { - // Safety: `append_data` writes a byte to `outsize` and then - // increments it, so to reach and modify that same position we need - // to use `outsize - 1` instead. - *(*self.out).add(*self.outsize - 1) |= bit << self.bp; - } - self.bp = self.bp.wrapping_add(1) & 7; - } - - /// # Add Multiple Bits. - pub(crate) fn add_bits(&mut self, symbol: u32, length: u32) { - for i in 0..length { - let bit = (symbol >> i) & 1; - self.add_bit(bit as u8); - } - } - - /// # Add Huffman Bits. - pub(crate) fn add_huffman_bits(&mut self, symbol: u32, length: u32) { - // Same as add_bits, except we're doing it backwards. - for i in (0..length).rev() { - let bit = (symbol >> i) & 1; - self.add_bit(bit as u8); - } - } - - #[allow(clippy::cast_possible_truncation)] - /// # Add Non-Compressed Block. - pub(crate) fn add_uncompressed_block( - &mut self, - last_block: bool, - arr: &[u8], - rng: Range, - ) { - let mut pos = rng.start; - loop { - let mut blocksize = usize::from(u16::MAX); - if pos + blocksize > rng.end { blocksize = rng.end - pos; } - let really_last_block = pos + blocksize >= rng.end; - let nlen = ! blocksize; - - self.add_bit(u8::from(last_block && really_last_block)); - - // BTYPE 00. - self.add_bit(0); - self.add_bit(0); - - // Ignore bits of input up to th enext byte boundary. - self.bp = 0; - - self.append_data((blocksize % 256) as u8); - self.append_data((blocksize.wrapping_div(256) % 256) as u8); - self.append_data((nlen % 256) as u8); - self.append_data((nlen.wrapping_div(256) % 256) as u8); - - for bit in arr.iter().copied().skip(pos).take(blocksize) { - self.append_data(bit); - } - - if really_last_block { break; } - pos += blocksize; - } - } -} - - - impl Default for LodePNGColorStats { #[allow(unsafe_code)] fn default() -> Self { @@ -287,6 +108,9 @@ impl Drop for LodePNGState { impl LodePNGState { #[allow(unsafe_code)] /// # Decode! + /// + /// This attempts to decode a raw image byte slice, returning the details + /// if successful. pub(super) fn decode(&mut self, src: &[u8]) -> Option { let mut buf = std::ptr::null_mut(); let mut w = 0; @@ -306,27 +130,28 @@ impl LodePNGState { #[allow(unsafe_code)] /// # Encode! - pub(super) fn encode(&mut self, img: &DecodedImage) -> Option { + /// + /// Encode the image, returning `true` if lodepng was happy and the output + /// is non-empty. + pub(super) fn encode(&mut self, img: &DecodedImage, out: &mut EncodedPNG) -> bool { + // Reset the size. + out.size = 0; + // Safety: a non-zero response is an error. 
- let mut out = EncodedPNG::new(); let res = unsafe { lodepng_encode(&mut out.buf, &mut out.size, img.buf, img.w, img.h, self) }; - // Return it if we got it. - if 0 == res && ! out.is_empty() { Some(out) } - else { None } + 0 == res && ! out.is_null() } #[allow(unsafe_code)] /// # Set Up Encoder. /// - /// This configures and returns a new state for encoding purposes. - pub(super) fn encoder( - dec: &Self, - strategy: LodePNGFilterStrategy, - slow: bool - ) -> Option { + /// This configures and returns a new state for general encoding purposes. + /// As this is recycled across runs, separate methods are used to configure + /// the strategy and zopfliness. + pub(super) fn encoder(dec: &Self) -> Option { let mut enc = Self::default(); // Copy palette details over to the encoder. @@ -341,31 +166,43 @@ impl LodePNGState { } enc.encoder.filter_palette_zero = 0; - enc.encoder.filter_strategy = strategy; - - // For final compression, enable the custom zopfli deflater. - if slow { - enc.encoder.zlibsettings.windowsize = 32_768; - enc.encoder.zlibsettings.custom_deflate = Some(flaca_png_deflate); - } - else { - enc.encoder.zlibsettings.windowsize = 8_192; - } + enc.encoder.filter_strategy = LodePNGFilterStrategy::LFS_ZERO; + enc.encoder.zlibsettings.windowsize = 8_192; Some(enc) } + /// # Change Strategies. + pub(super) fn set_strategy(&mut self, strategy: LodePNGFilterStrategy) { + self.encoder.filter_strategy = strategy; + } + + /// # Prepare for Zopfli. + /// + /// Increase the window size and enable our custom zopfli deflate callback. + /// For performance reasons, this is only called before the final + /// encoding pass; everything else is run with saner tunings. + pub(super) fn set_zopfli(&mut self) { + self.encoder.zlibsettings.windowsize = 32_768; + self.encoder.zlibsettings.custom_deflate = Some(flaca_png_deflate); + } + #[allow(unsafe_code)] - /// # Prepare Encoder for Encoding (a small image). + #[inline(never)] + /// # Paletteless Encode (for small images). /// - /// This updates an existing encoder to potentially further optimize a - /// really small image. - pub(super) fn prepare_encoder_small(&mut self, img: &DecodedImage) -> bool { + /// Patch the encoder settings to see if we can squeeze even more savings + /// out of the (small) image, reencode it, and return the result if there + /// are no errors. + /// + /// Note: the caller will need to check the resulting size to see if + /// savings were actually achieved, and keep whichever version was better. + pub(super) fn try_small(&mut self, img: &DecodedImage) -> Option { // Safety: a non-zero response is an error. let mut stats = LodePNGColorStats::default(); if 0 != unsafe { lodepng_compute_color_stats(&mut stats, img.buf, img.w, img.h, &self.info_raw) - } { return false; } + } { return None; } // The image is too small for tRNS chunk overhead. if img.w * img.h <= 16 && 0 != stats.key { stats.alpha = 1; } @@ -389,7 +226,10 @@ impl LodePNGState { } else { self.info_png.color.key_defined = 0; } - true + // Re-encode it and see what happens! + let mut out = EncodedPNG::new(); + if self.encode(img, &mut out) { Some(out) } + else { None } } } diff --git a/flapfli/src/zopflipng/blocks.rs b/flapfli/src/zopflipng/blocks.rs index c454fab..c315e8f 100644 --- a/flapfli/src/zopflipng/blocks.rs +++ b/flapfli/src/zopflipng/blocks.rs @@ -5,28 +5,25 @@ This module contains the deflate entrypoint and all of the block-related odds and ends that didn't make it into other modules. 
*/ -use dactyl::NoHash; -use std::{ - collections::HashSet, - num::NonZeroU32, - ops::Range, -}; +use std::num::NonZeroU32; use super::{ ArrayD, ArrayLL, DeflateSym, DISTANCE_BITS, DISTANCE_VALUES, + DynamicLengths, encode_tree, FIXED_SYMBOLS_D, FIXED_SYMBOLS_LL, FIXED_TREE_D, FIXED_TREE_LL, - get_dynamic_lengths, LENGTH_SYMBOL_BIT_VALUES, LENGTH_SYMBOL_BITS, LengthLimitedCodeLengths, LZ77Store, + LZ77StoreRange, + SplitCache, SplitLen, SplitPIdx, SymbolIteration, @@ -35,8 +32,10 @@ use super::{ SymbolStats, }, zopfli_error, + ZopfliChunk, ZopfliError, ZopfliOut, + ZopfliRange, ZopfliState, }; @@ -49,234 +48,21 @@ const BLOCK_TYPE_DYNAMIC: u8 = 2; const MINIMUM_SPLIT_DISTANCE: usize = 10; #[allow(unsafe_code)] +/// # Ten is Non-Zero. const NZ10: NonZeroU32 = unsafe { NonZeroU32::new_unchecked(10) }; + #[allow(unsafe_code)] +/// # Eleven is Non-Zero. const NZ11: NonZeroU32 = unsafe { NonZeroU32::new_unchecked(11) }; - - -/// # Split Point Scratch. -/// -/// This holds two sets of block split points for use during the deflate -/// passes. Each set can hold up to 14 points (one less than -/// `BLOCKSPLITTING_MAX`), but we're overallocating to 15 to cheaply elide -/// bounds checks. +/// # Block Split Points. /// -/// A single instance of this struct is (re)used for all deflate passes on a -/// given image to reduce allocation overhead. -pub(crate) struct SplitPoints { - slice1: [usize; 15], - slice2: [usize; 15], - done: HashSet, -} - -impl SplitPoints { - /// # New Instance. - pub(crate) fn new() -> Self { - Self { - slice1: [0; 15], - slice2: [0; 15], - done: HashSet::with_hasher(NoHash::default()), - } - } -} - -impl SplitPoints { - /// # Uncompressed Split Pass. - /// - /// This sets the uncompressed split points, by way of first setting the - /// LZ77 split points. - /// - /// In terms of order-of-operations, this must be called _before_ the - /// second-stage LZ77 pass as it would otherwise blow away that data. - fn split_raw(&mut self, arr: &[u8], instart: usize, state: &mut ZopfliState, store: &mut LZ77Store) - -> Result { - // Populate an LZ77 store from a greedy pass. This results in better - // block choices than a full optimal pass. - state.greedy_cold(arr, instart, store, None)?; - - // Do an LZ77 pass. - let len = self.split_lz77(store)?; - - // Find the corresponding uncompressed positions. - if len.is_zero() { Ok(len) } - else { - let mut pos = instart; - let mut j = SplitLen::S00; - for (i, e) in store.entries.iter().enumerate().take(self.slice2[len as usize - 1] + 1) { - if i == self.slice2[j as usize] { - self.slice1[j as usize] = pos; - j = j.increment(); - if (j as u8) == (len as u8) { return Ok(len); } - } - pos += e.length() as usize; - } - - Err(zopfli_error!()) - } - } - - /// # LZ77 Split Pass. - /// - /// This sets the LZ77 split points according to convoluted cost - /// evaluations. - fn split_lz77(&mut self, store: &LZ77Store) -> Result { - /// # Find Largest Splittable Block. - /// - /// This finds the largest available block for splitting, evenly spreading the - /// load if a limited number of blocks are requested. - /// - /// Returns `false` if no blocks are found. - fn find_largest( - lz77size: usize, - done: &HashSet, - splitpoints: &[usize], - rng: &mut Range, - ) -> bool { - let mut best = 0; - for i in 0..=splitpoints.len() { - let start = - if i == 0 { 0 } - else { splitpoints[i - 1] }; - let end = - if i < splitpoints.len() { splitpoints[i] } - else { lz77size - 1 }; - - // We found a match! - if best < end - start && ! 
done.contains(&start) { - rng.start = start; - rng.end = end; - best = end - start; - } - } - MINIMUM_SPLIT_DISTANCE <= best - } - - // This won't work on tiny files. - if store.len() < MINIMUM_SPLIT_DISTANCE { return Ok(SplitLen::S00); } - - // Get started! - self.done.clear(); - let mut rng = 0..store.len(); - let mut last = 0; - let mut len = SplitLen::S00; - loop { - let (llpos, llcost) = find_minimum_cost(store, rng.start + 1..rng.end)?; - if llpos <= rng.start || llpos >= rng.end { - return Err(zopfli_error!()); - } +/// This array holds up to fourteen middle points as well as the absolute start +/// and end indices. +type SplitPoints = [usize; 16]; - // Ignore points we've already covered. - if llpos == rng.start + 1 || calculate_block_size_auto_type(store, rng.clone())? < llcost { - self.done.insert(rng.start); - } - else { - // Mark it as a split point and add it sorted. - self.slice2[len as usize] = llpos; - len = len.increment(); - - // Keep the list sorted. - if last > llpos { self.slice2[..len as usize].sort_unstable(); } - else { last = llpos; } - - // Stop if we've split the maximum number of times. - if len.is_max() { break; } - } - - // Look for a split and adjust the start/end accordingly. If we don't - // find one or the remaining distance is too small to continue, we're - // done! - if ! find_largest( - store.len(), - &self.done, - &self.slice2[..len as usize], - &mut rng, - ) { break; } - } - - Ok(len) - } - - /// # (Re)split Best. - /// - /// If there's enough data, resplit with optimized LZ77 paths and return - /// whichever best is better. - fn split_again( - &mut self, - store: &LZ77Store, - limit1: SplitLen, - cost1: u32, - ) -> Result<&[usize], ZopfliError> { - if 1 < (limit1 as u8) { - // Move slice2 over to slice1 so we can repopulate slice2. - self.slice1.copy_from_slice(self.slice2.as_slice()); - - let limit2 = self.split_lz77(store)?; - let mut cost2 = 0; - for i in 0..=limit2 as usize { - let start = if i == 0 { 0 } else { self.slice2[i - 1] }; - let end = if i < (limit2 as usize) { self.slice2[i] } else { store.len() }; - cost2 += calculate_block_size_auto_type(store, start..end)?.get(); - } - - // It's better! - if cost2 < cost1 { Ok(&self.slice2[..limit2 as usize]) } - else { Ok(&self.slice1[..limit1 as usize]) } - } - else { Ok(&self.slice2[..limit1 as usize]) } - } - - /// # Split Best. - /// - /// Compare the optimal raw split points with a dedicated lz77 pass and - /// return whichever is predicted to compress better. - fn split( - &mut self, - numiterations: i32, - arr: &[u8], - instart: usize, - store: &mut LZ77Store, - store2: &mut LZ77Store, - state: &mut ZopfliState, - ) -> Result<&[usize], ZopfliError> { - // Start by splitting uncompressed. - let limit = self.split_raw(arr, instart, state, store2)?; - store2.clear(); - - // Now some LZ77 funny business. - let mut cost1 = 0; - let mut store3 = LZ77Store::new(); - for i in 0..=limit as usize { - let start = if i == 0 { instart } else { self.slice1[i - 1] }; - let end = if i < (limit as usize) { self.slice1[i] } else { arr.len() }; - - // This assertion is redundant as we explicitly check range sanity - // earlier and later in the pipeline. - debug_assert!(start <= end && end <= arr.len()); - - // Make another store. - lz77_optimal( - arr.get(..end).ok_or(zopfli_error!())?, - start, - numiterations, - store2, - &mut store3, - state, - )?; - cost1 += calculate_block_size_auto_type(store2, 0..store2.len())?.get(); - - // Append its data to our main store. 
- store.steal_entries(store2); - - // Save the chunk size to our best. - if i < (limit as usize) { self.slice2[i] = store.len(); } - } - - // Try a second pass, recalculating the LZ77 splits with the updated - // store details. - self.split_again(store, limit, cost1) - } -} +/// # Zero-Filled Split Points. +const ZEROED_SPLIT_POINTS: SplitPoints = [0; 16]; @@ -289,39 +75,53 @@ impl SplitPoints { /// chunk, then writes the resulting blocks to the output file. pub(crate) fn deflate_part( state: &mut ZopfliState, - splits: &mut SplitPoints, - numiterations: i32, + numiterations: NonZeroU32, last_block: bool, - arr: &[u8], - instart: usize, + chunk: ZopfliChunk<'_>, out: &mut ZopfliOut, ) -> Result<(), ZopfliError> { + #[inline(never)] + fn empty_fixed(last_block: bool, out: &mut ZopfliOut) { + out.add_header::(last_block); + out.add_fixed_bits::<7>(0); + } + let mut store = LZ77Store::new(); let mut store2 = LZ77Store::new(); // Find the split points. - let best = splits.split( + let (best, best_len) = split_points( numiterations, - arr, - instart, + chunk, &mut store, &mut store2, state, )?; // Write the data! - for i in 0..=best.len() { - let start = if i == 0 { 0 } else { best[i - 1] }; - let end = if i < best.len() { best[i] } else { store.len() }; - add_lz77_block( - last_block && i == best.len(), - &store, - &mut store2, - state, - arr, - start..end, - out, - )?; + let store_len = best[best_len as usize + 1]; + for pair in best[..best_len as usize + 2].windows(2) { + let really_last_block = last_block && pair[1] == store_len; + + if let Ok(rng) = ZopfliRange::new(pair[0], pair[1]) { + let store_rng = store.ranged(rng)?; + add_lz77_block( + really_last_block, + store_rng, + store_len, + &mut store2, + state, + chunk, + out, + )?; + } + + // This shouldn't be reachable, but the original zopfli seemed to think + // empty blocks are possible and imply fixed-tree layouts, so maybe? + else { + debug_assert_eq!(pair[0], pair[1]); + empty_fixed(really_last_block, out); + } } Ok(()) @@ -330,33 +130,32 @@ pub(crate) fn deflate_part( #[allow(clippy::cast_precision_loss, clippy::cast_sign_loss)] +#[inline] /// # Add LZ77 Block (Automatic Type). /// /// This calculates the expected output sizes for all three block types, then /// writes the best one to the output file. fn add_lz77_block( last_block: bool, - store: &LZ77Store, + store: LZ77StoreRange, + store_len: usize, fixed_store: &mut LZ77Store, state: &mut ZopfliState, - arr: &[u8], - rng: Range, + chunk: ZopfliChunk<'_>, out: &mut ZopfliOut ) -> Result<(), ZopfliError> { + #[inline(never)] /// # Add LZ77 Block (Dynamic). fn add_dynamic( last_block: bool, - store: &LZ77Store, - rng: Range, + store: LZ77StoreRange, out: &mut ZopfliOut, extra: u8, ll_lengths: &ArrayLL, d_lengths: &ArrayD, ) -> Result<(), ZopfliError> { // Type Bits. - out.add_bit(u8::from(last_block)); - out.add_bit(BLOCK_TYPE_DYNAMIC & 1); - out.add_bit((BLOCK_TYPE_DYNAMIC & 2) >> 1); + out.add_header::(last_block); // Build the lengths first. encode_tree(ll_lengths, d_lengths, extra, out)?; @@ -366,118 +165,99 @@ fn add_lz77_block( let d_symbols = ArrayD::::llcl_symbols(d_lengths); // Write all the data! - add_lz77_data( - store, rng, &ll_symbols, ll_lengths, &d_symbols, d_lengths, out - )?; - - // Finish up by writting the end symbol. - out.add_huffman_bits(ll_symbols[256], ll_lengths[256] as u32); - Ok(()) + add_lz77_data(store, &ll_symbols, ll_lengths, &d_symbols, d_lengths, out) } + #[inline(never)] /// # Add LZ77 Block (Fixed). 
fn add_fixed( last_block: bool, - store: &LZ77Store, - rng: Range, + store: LZ77StoreRange, out: &mut ZopfliOut, ) -> Result<(), ZopfliError> { // Type Bits. - out.add_bit(u8::from(last_block)); - out.add_bit(BLOCK_TYPE_FIXED & 1); - out.add_bit((BLOCK_TYPE_FIXED & 2) >> 1); + out.add_header::(last_block); // Write all the data! add_lz77_data( - store, rng, + store, &FIXED_SYMBOLS_LL, &FIXED_TREE_LL, &FIXED_SYMBOLS_D, &FIXED_TREE_D, out - )?; - - // Finish up by writting the end symbol. - out.add_huffman_bits(FIXED_SYMBOLS_LL[256], FIXED_TREE_LL[256] as u32); - Ok(()) + ) } #[inline(never)] - fn dynamic_details(store: &LZ77Store, rng: Range) - -> Result<(u8, NonZeroU32, ArrayLL, ArrayD), ZopfliError> { - get_dynamic_lengths(store, rng) + /// # Add Uncompressed. + /// + /// It is extremely unlikely this will ever be called. Haha. + fn add_uncompressed( + last_block: bool, + store: LZ77StoreRange, + chunk: ZopfliChunk<'_>, + out: &mut ZopfliOut, + ) -> Result<(), ZopfliError> { + let rng = store.byte_range()?; + let chunk2 = chunk.reslice_rng(rng)?; + out.add_uncompressed_block(last_block, chunk2); + Ok(()) } #[inline(never)] - fn fixed_cost_cold(store: &LZ77Store, rng: Range) -> NonZeroU32 { - calculate_block_size_fixed(store, rng) - } - - // If the block is empty, we can assume a fixed-tree layout. - if rng.is_empty() { - out.add_bits(u32::from(last_block), 1); - out.add_bits(1, 2); - out.add_bits(0, 7); - return Ok(()); - } + fn dynamic_details(store: LZ77StoreRange) + -> Result { DynamicLengths::new(store) } // Calculate the three costs. - let uncompressed_cost = calculate_block_size_uncompressed(store, rng.clone())?; - let (dynamic_extra, dynamic_cost, dynamic_ll, dynamic_d) = dynamic_details(store, rng.clone())?; + let uncompressed_cost = store.block_size_uncompressed()?; + let dynamic = dynamic_details(store)?; // Most blocks won't benefit from a fixed tree layout, but if we've got a // tiny one or the unoptimized-fixed size is within 10% of the dynamic size // we should check it out. if - store.len() <= 1000 || - calculate_block_size_fixed(store, rng.clone()).saturating_mul(NZ10) <= dynamic_cost.saturating_mul(NZ11) + store_len <= 1000 || + store.block_size_fixed().saturating_mul(NZ10) <= dynamic.cost().saturating_mul(NZ11) { - let rng2 = store.byte_range(rng.clone())?; - state.init_lmc(rng2.len()); + let rng = store.byte_range()?; + let fixed_chunk = chunk.reslice_rng(rng)?; + state.init_lmc(&fixed_chunk); // Perform an optimal run. - state.optimal_run_cold( - arr.get(..rng2.end).ok_or(zopfli_error!())?, - rng2.start, - None, - fixed_store, - )?; + state.optimal_run_fixed(fixed_chunk, fixed_store)?; // And finally, the cost! - let fixed_cost = fixed_cost_cold(fixed_store, 0..fixed_store.len()); - if fixed_cost < dynamic_cost && fixed_cost <= uncompressed_cost { - return add_fixed(last_block, fixed_store, 0..fixed_store.len(), out); + let fixed_rng = ZopfliRange::new(0, fixed_store.len())?; + let fixed_store_rng = fixed_store.ranged(fixed_rng)?; + let fixed_cost = fixed_store_rng.block_size_fixed(); + if fixed_cost < dynamic.cost() && fixed_cost <= uncompressed_cost { + return add_fixed(last_block, fixed_store_rng, out); } } // Dynamic is best! 
- if dynamic_cost <= uncompressed_cost { - add_dynamic( - last_block, store, rng, out, - dynamic_extra, &dynamic_ll, &dynamic_d, - ) + if dynamic.cost() <= uncompressed_cost { + add_dynamic(last_block, store, out, dynamic.extra(), dynamic.ll_lengths(), dynamic.d_lengths()) } - // All the work we did earlier was fruitless; the block works best in an - // uncompressed form. + // Nothing is everything! else { - let rng = store.byte_range(rng)?; - out.add_uncompressed_block(last_block, arr, rng); - Ok(()) + add_uncompressed(last_block, store, chunk, out) } } #[allow(clippy::cast_sign_loss)] +#[inline] /// # Add LZ77 Data. /// /// This adds all lit/len/dist codes from the lists as huffman symbols, but not /// the end code (256). fn add_lz77_data( - store: &LZ77Store, - rng: Range, + store: LZ77StoreRange, ll_symbols: &ArrayLL, ll_lengths: &ArrayLL, d_symbols: &ArrayD, d_lengths: &ArrayD, out: &mut ZopfliOut ) -> Result<(), ZopfliError> { - for e in store.entries.get(rng).ok_or(zopfli_error!())? { + for e in store.entries { // Always add the length symbol (or literal). if ll_lengths[e.ll_symbol as usize].is_zero() { return Err(zopfli_error!()); } out.add_huffman_bits( @@ -507,116 +287,81 @@ fn add_lz77_data( else if (e.litlen as u16) >= 256 { return Err(zopfli_error!()); } } - Ok(()) -} - -#[allow(clippy::cast_possible_truncation)] // The maximum blocksize is only 1 million. -/// # Calculate Block Size (Uncompressed). -fn calculate_block_size_uncompressed(store: &LZ77Store, rng: Range) --> Result { - let rng = store.byte_range(rng)?; - let blocksize = rng.len() as u32; - - // Blocks larger than u16::MAX need to be split. - let blocks = blocksize.div_ceil(65_535); - NonZeroU32::new(blocks * 40 + blocksize * 8).ok_or(zopfli_error!()) -} - -/// # Calculate Block Size (Fixed). -fn calculate_block_size_fixed(store: &LZ77Store, rng: Range) -> NonZeroU32 { - // The end symbol is always included. - let mut size = FIXED_TREE_LL[256] as u32; - - // Loop the store if we have data to loop. - let slice = store.entries.as_slice(); - if rng.start < rng.end && rng.end <= slice.len() { - // Make sure the end does not exceed the store! - for e in &slice[rng] { - size += FIXED_TREE_LL[e.ll_symbol as usize] as u32; - if 0 < e.dist { - size += u32::from(LENGTH_SYMBOL_BITS[e.litlen as usize]); - size += u32::from(DISTANCE_BITS[e.d_symbol as usize]); - size += FIXED_TREE_D[e.d_symbol as usize] as u32; - } - } - } - - // This can't really fail, but fixed models are bullshit anyway so we can - // fall back to an unbeatably large number. - NonZeroU32::new(size).unwrap_or(NonZeroU32::MAX) -} + // Finish up by writting the end symbol. + out.add_huffman_bits(ll_symbols[256], ll_lengths[256] as u32); -/// # Calculate Block Size (Dynamic). -fn calculate_block_size_dynamic(store: &LZ77Store, rng: Range) --> Result { - get_dynamic_lengths(store, rng).map(|(_, size, _, _)| size) + Ok(()) } /// # Calculate Best Block Size (in Bits). -fn calculate_block_size_auto_type(store: &LZ77Store, rng: Range) +fn calculate_block_size_auto(store: &LZ77Store, rng: ZopfliRange) -> Result { - let uncompressed_cost = calculate_block_size_uncompressed(store, rng.clone())?; - - // We can skip the expensive fixed-cost calculations for large blocks since - // they're unlikely ever to use it. - let fixed_cost = - if 1000 < store.len() { uncompressed_cost } - else { calculate_block_size_fixed(store, rng.clone()) }; - - let dynamic_cost = calculate_block_size_dynamic(store, rng)?; - - // If uncompressed is better than everything, return it. 
- if uncompressed_cost < fixed_cost && uncompressed_cost < dynamic_cost { - Ok(uncompressed_cost) - } - // Otherwise choose the smaller of fixed and dynamic. - else if fixed_cost < dynamic_cost { Ok(fixed_cost) } - else { Ok(dynamic_cost) } + let small = store.len() <= 1000; + let store = store.ranged(rng)?; + store.block_size_auto(small) } /// # Minimum Split Cost. /// /// Return the index of the smallest split cost between `start..end`. -fn find_minimum_cost(store: &LZ77Store, mut rng: Range) +fn find_minimum_cost(store: &LZ77Store, full_rng: ZopfliRange) -> Result<(usize, NonZeroU32), ZopfliError> { + #[cold] + /// # Small Cost. + /// + /// For small ranges, skip the logic and compare all possible splits. This + /// will return an error if no splits are possible. + fn small_cost(store: LZ77StoreRange, offset: usize, small: bool) + -> Result<(usize, NonZeroU32), ZopfliError> { + let mut best_cost = NonZeroU32::MAX; + let mut best_idx = 1; + let mut mid = 1; + for (a, b) in store.splits()? { + let cost = split_cost(a, b, small)?; + if cost < best_cost { + best_cost = cost; + best_idx = mid; // The split point. + } + mid += 1; + } + Ok((offset + best_idx, best_cost)) + } + /// # Split Block Cost. /// /// Sum the left and right halves of the range. - fn split_cost(store: &LZ77Store, start: usize, mid: usize, end: usize) -> Result { - let a = calculate_block_size_auto_type(store, start..mid)?; - let b = calculate_block_size_auto_type(store, mid..end)?; + fn split_cost(a: LZ77StoreRange, b: LZ77StoreRange, small: bool) -> Result { + let a = a.block_size_auto(small)?; + let b = b.block_size_auto(small)?; Ok(a.saturating_add(b.get())) } - // Keep track of the original start/end points. - let split_start = rng.start - 1; - let split_end = rng.end; + // Break it down a bit. + let offset = full_rng.start(); + let small = store.len() <= 1000; + let store_rng = store.ranged(full_rng)?; - let mut best_cost = NonZeroU32::MAX; - let mut best_idx = rng.start; + // Short circuit. + if store_rng.len().get() <= 1024 { return small_cost(store_rng, offset, small); } - // Small chunks don't need much. - if rng.len() < 1024 { - for i in rng { - let cost = split_cost(store, split_start, i, split_end)?; - if cost < best_cost { - best_cost = cost; - best_idx = i; - } - } - return Ok((best_idx, best_cost)); - } + // Split range, relative to the length of the ranged store. + let mut split_rng = 1..store_rng.len().get(); // Divide and conquer. + let mut best_cost = NonZeroU32::MAX; + let mut best_idx = 1; let mut p = [0_usize; MINIMUM_SPLIT_DISTANCE - 1]; let mut last_best_cost = NonZeroU32::MAX; - while MINIMUM_SPLIT_DISTANCE <= rng.len() { + loop { let mut best_p_idx = SplitPIdx::S0; for (i, pp) in SplitPIdx::all().zip(p.iter_mut()) { - *pp = rng.start + (i as usize + 1) * (rng.len().wrapping_div(MINIMUM_SPLIT_DISTANCE)); + *pp = split_rng.start + (i as usize + 1) * (split_rng.len().wrapping_div(MINIMUM_SPLIT_DISTANCE)); let line_cost = if best_idx == *pp { last_best_cost } - else { split_cost(store, split_start, *pp, split_end)? }; + else { + let (a, b) = store_rng.split(*pp)?; + split_cost(a, b, small)? + }; if (i as usize) == 0 || line_cost < best_cost { best_cost = line_cost; @@ -629,15 +374,17 @@ fn find_minimum_cost(store: &LZ77Store, mut rng: Range) // Nudge the boundaries and back again. 
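The probe-and-narrow loop above is easier to follow stripped of the store plumbing: pick a handful of evenly spaced candidate split points, price each one, then shrink the search window around the winner and repeat until the window is too small to split further. A generic, self-contained approximation (the probe count and the cost closure are stand-ins, not the patch's actual types):

```rust
// Narrowing search over a convex-ish cost curve. `cost_at` stands in for the
// real split_cost(); nine probes mirror MINIMUM_SPLIT_DISTANCE - 1 but are
// purely illustrative here.
fn find_min_split(mut start: usize, mut end: usize, cost_at: impl Fn(usize) -> u64) -> (usize, u64) {
    const PROBES: usize = 9;
    let mut best_idx = start;
    let mut best_cost = u64::MAX;
    while end - start >= PROBES + 1 {
        let step = (end - start) / (PROBES + 1);
        // Price each evenly spaced probe and keep the cheapest for this round.
        let mut round_best = (start + step, cost_at(start + step));
        for i in 2..=PROBES {
            let p = start + i * step;
            let c = cost_at(p);
            if c < round_best.1 { round_best = (p, c); }
        }
        if round_best.1 < best_cost { best_idx = round_best.0; best_cost = round_best.1; }
        // Narrow the window to one step on either side of the winner.
        start = round_best.0.saturating_sub(step).max(start);
        end = (round_best.0 + step).min(end);
    }
    (best_idx, best_cost)
}

fn main() {
    // A toy cost curve with its minimum at index 123.
    let (idx, _) = find_min_split(0, 1_000, |i| (i as i64 - 123).unsigned_abs());
    assert!(idx.abs_diff(123) <= 1);
}
```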
best_idx = p[best_p_idx as usize]; - if 0 != (best_p_idx as usize) { rng.start = p[best_p_idx as usize - 1]; } - if (best_p_idx as usize) + 1 < p.len() { rng.end = p[best_p_idx as usize + 1]; } + if 0 != (best_p_idx as usize) { split_rng.start = p[best_p_idx as usize - 1]; } + if (best_p_idx as usize) + 1 < p.len() { split_rng.end = p[best_p_idx as usize + 1]; } last_best_cost = best_cost; + if split_rng.len() < MINIMUM_SPLIT_DISTANCE { break; } } - Ok((best_idx, last_best_cost)) + Ok((offset + best_idx, last_best_cost)) } +#[inline] /// # Optimal LZ77. /// /// Calculate lit/len and dist pairs for the dataset. @@ -645,46 +392,42 @@ fn find_minimum_cost(store: &LZ77Store, mut rng: Range) /// Note: this incorporates the functionality of `ZopfliLZ77OptimalRun` /// directly. fn lz77_optimal( - arr: &[u8], - instart: usize, - numiterations: i32, + chunk: ZopfliChunk<'_>, + numiterations: NonZeroU32, store: &mut LZ77Store, scratch_store: &mut LZ77Store, state: &mut ZopfliState, -) -> Result<(), ZopfliError> { - // Easy abort. - if instart >= arr.len() || numiterations < 1 { return Ok(()); } - +) -> Result { // Reset the main cache for the current blocksize. - state.init_lmc(arr.len() - instart); + state.init_lmc(&chunk); // Greedy run. - state.greedy(arr, instart, scratch_store, Some(instart))?; - - // Create new stats with the store (updated by the greedy pass). - let mut current_stats = SymbolStats::new(); - current_stats.load_store(scratch_store); + state.greedy(chunk, scratch_store, Some(chunk.pos()))?; - // Set up dummy stats we can use to track best and last. + // Set up the PRNG and two sets of stats, populating one with the greedy- + // crunched store. let mut ran = RanState::new(); let mut best_stats = SymbolStats::new(); + let mut current_stats = SymbolStats::new(); + current_stats.load_store(scratch_store); // We'll also want dummy best and last costs. - let mut last_cost = NonZeroU32::MIN; + let mut last_cost = NonZeroU32::MAX; let mut best_cost = NonZeroU32::MAX; // Repeat statistics with the cost model from the previous // stat run. - let mut last_ran = -1; - for i in 0..numiterations { + let mut weighted = false; + for i in 0..numiterations.get() { + // Rebuild the symbols. + current_stats.crunch(); + // Optimal run. - state.optimal_run(arr, instart, Some(¤t_stats), scratch_store)?; + state.optimal_run(chunk, ¤t_stats, scratch_store)?; // This is the cost we actually care about. - let current_cost = calculate_block_size_dynamic( - scratch_store, - 0..scratch_store.len(), - )?; + let current_cost = scratch_store.ranged_full() + .and_then(LZ77StoreRange::block_size_dynamic)?; // We have a new best! if current_cost < best_cost { @@ -693,31 +436,221 @@ fn lz77_optimal( best_cost = current_cost; } - // Copy the stats to last_stats, clear them, and repopulate - // with the current store. - let (last_litlens, last_dists) = current_stats.clear(); - current_stats.load_store(scratch_store); - - // Once the randomness has kicked in, improve convergence by - // weighting the current and previous stats. - if last_ran != -1 { - current_stats.add_last(&last_litlens, &last_dists); - current_stats.crunch(); - } + // Repopulate the counts from the current store, and if the randomness + // has "warmed up" sufficiently, combine them with half the previous + // values to create a sorted of weighted average. + current_stats.reload_store(scratch_store, weighted); - // Replace the current stats with the best stats, randomize, - // and see what happens. 
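Condensed to its bookkeeping, the iteration strategy works like this: run a squeeze pass with the current statistical cost model, keep the cheapest result seen so far, and once the cost stops improving (after the fifth iteration) fall back to the best stats, randomize them, and switch to weighted reloading. A toy, self-contained rendition where a closure stands in for the real "optimal run plus dynamic block size" step:

```rust
// Simplified shape of the iteration loop; names and types are illustrative.
fn iterate_costs(iterations: u32, mut run_once: impl FnMut(bool) -> u32) -> u32 {
    let mut best_cost = u32::MAX;
    let mut last_cost = u32::MAX;
    let mut randomized = false;
    for i in 0..iterations {
        // One squeeze pass with the current statistical cost model.
        let current_cost = run_once(randomized);
        // Keep the cheapest encoding seen so far.
        if current_cost < best_cost { best_cost = current_cost; }
        // If the cost has plateaued, shuffle the stats and try again.
        if 5 < i && current_cost == last_cost { randomized = true; }
        else { last_cost = current_cost; }
    }
    best_cost
}

fn main() {
    // A toy cost series that sits at 100 until the randomization kicks in.
    let mut calls = 0_u32;
    let best = iterate_costs(15, |randomized| {
        calls += 1;
        if randomized { 90 } else { 100 }
    });
    assert_eq!(best, 90);
    assert_eq!(calls, 15);
}
```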
+ // If nothing changed, replace the current stats with the best stats, + // reorder the counts, and see what happens. if 5 < i && current_cost == last_cost { current_stats = best_stats; current_stats.randomize(&mut ran); - current_stats.crunch(); - last_ran = i; + weighted = true; } + else { last_cost = current_cost; } + } + + // Find and return the current (best) cost of the store. + let store_rng = store.ranged_full()?; + store_rng.block_size_auto(store_rng.len().get() <= 1000) +} - last_cost = current_cost; +#[inline(never)] +/// # Best Split Points. +/// +/// Compare the optimal raw and LZ77 split points, returning whichever is +/// predicted to compress better. +/// +/// Note the returned length corresponds to the number of points in the middle; +/// it excludes the absolute start and end points. +fn split_points( + numiterations: NonZeroU32, + chunk: ZopfliChunk<'_>, + store: &mut LZ77Store, + store2: &mut LZ77Store, + state: &mut ZopfliState, +) -> Result<(SplitPoints, SplitLen), ZopfliError> { + // We'll need two sets of split points. + let mut split_a = ZEROED_SPLIT_POINTS; + let mut split_b = ZEROED_SPLIT_POINTS; + + // Start by splitting uncompressed. + let raw_len = split_points_raw(chunk, store2, state, &mut split_a, &mut split_b)?; + store2.clear(); + + // Calculate the costs associated with that split and update the store with + // the symbol information encountered. + let mut cost1 = 0; + let mut store3 = LZ77Store::new(); + for i in 0..=raw_len as usize { + let start = if i == 0 { chunk.pos() } else { split_a[i - 1] }; + let end = if i < (raw_len as usize) { split_a[i] } else { chunk.total_len().get() }; + + // Crunch this chunk into a clean store. + cost1 += lz77_optimal( + chunk.reslice(start, end)?, + numiterations, + store2, + &mut store3, + state, + )?.get(); + + // Append its data to our main store. + store.steal_entries(store2); + + // Save the chunk size to our split_b as the defacto best. + split_b[i] = store.len(); } - Ok(()) + // If we have at least two split points, do one further LZ77 pass using the + // updated store details to see if the big picture changes anything. + if 1 < (raw_len as u8) { + let two_len = split_points_lz77_cold(state, store, &mut split_a)?; + split_a[two_len as usize] = store.len(); + split_a.rotate_right(1); + debug_assert!(split_a[0] == 0); // We don't write to the last byte. + let mut cost2 = 0; + for pair in split_a[..two_len as usize + 2].windows(2) { + cost2 += calculate_block_size_auto( + store, + ZopfliRange::new(pair[0], pair[1])?, + )?.get(); + } + + // It's better! + if cost2 < cost1 { return Ok((split_a, two_len)) } + } + + split_b.rotate_right(1); + debug_assert!(split_b[0] == 0); // We don't write to the last byte. + Ok((split_b, raw_len)) +} + +#[inline(never)] +/// # Split Points: Uncompressed. +fn split_points_raw( + chunk: ZopfliChunk<'_>, + store: &mut LZ77Store, + state: &mut ZopfliState, + split_a: &mut SplitPoints, + split_b: &mut SplitPoints, +) -> Result { + // Populate an LZ77 store from a greedy pass. This results in better + // block choices than a full optimal pass. + state.greedy_cold(chunk, store, None)?; + + // Do an LZ77 pass. + let len = split_points_lz77(state, store, split_b)?; + + // Find the corresponding uncompressed positions. 
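The walk that follows converts LZ77 entry indices back into byte offsets by accumulating each entry's length (one byte for a literal, the match length otherwise). A simplified standalone version with plain integer lengths in place of the real store entries:

```rust
// Illustrative only: turn entry-index split points into byte-offset split
// points by summing entry lengths while walking the store.
fn entry_splits_to_byte_splits(entry_lengths: &[usize], splits: &[usize], start_pos: usize) -> Vec<usize> {
    let mut out = Vec::with_capacity(splits.len());
    let mut pos = start_pos;
    let mut next = splits.iter().copied().peekable();
    for (i, len) in entry_lengths.iter().copied().enumerate() {
        if next.peek() == Some(&i) {
            out.push(pos);
            next.next();
        }
        pos += len;
    }
    out
}

fn main() {
    // Entries: literal (1 byte), match of 5, literal, match of 4, literal.
    let lengths = [1, 5, 1, 4, 1];
    // Splitting before entries 2 and 4 corresponds to byte offsets 6 and 11.
    assert_eq!(entry_splits_to_byte_splits(&lengths, &[2, 4], 0), vec![6, 11]);
}
```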
+ if len.is_zero() { Ok(len) } + else { + let mut pos = chunk.pos(); + let mut j = SplitLen::S00; + for (i, e) in store.entries.iter().enumerate().take(split_b[len as usize - 1] + 1) { + if i == split_b[j as usize] { + split_a[j as usize] = pos; + j = j.increment(); + if (j as u8) == (len as u8) { return Ok(len); } + } + pos += e.length() as usize; + } + + Err(zopfli_error!()) + } +} + +#[inline(never)] +fn split_points_lz77_cold( + state: &mut ZopfliState, + store: &LZ77Store, + split_b: &mut SplitPoints, +) -> Result { split_points_lz77(state, store, split_b) } + +#[inline] +/// # LZ77 Split Pass. +/// +/// This sets the LZ77 split points according to convoluted cost +/// evaluations. +fn split_points_lz77( + state: &mut ZopfliState, + store: &LZ77Store, + split_b: &mut SplitPoints, +) -> Result { + /// # Find Largest Splittable Block. + /// + /// This finds the largest available block for splitting, evenly spreading the + /// load if a limited number of blocks are requested. + /// + /// Returns `false` if no blocks are found. + fn find_largest( + lz77size: usize, + done: &SplitCache, + splitpoints: &[usize], + rng: &mut ZopfliRange, + ) -> Result { + let mut best = 0; + for i in 0..=splitpoints.len() { + let start = + if i == 0 { 0 } + else { splitpoints[i - 1] }; + let end = + if i < splitpoints.len() { splitpoints[i] } + else { lz77size - 1 }; + + // We found a match! + if best < end - start && done.is_unset(start) { + rng.set(start, end)?; + best = end - start; + } + } + Ok(MINIMUM_SPLIT_DISTANCE <= best) + } + + // This won't work on tiny files. + if store.len() < MINIMUM_SPLIT_DISTANCE { return Ok(SplitLen::S00); } + + // Get started! + let mut rng = ZopfliRange::new(0, store.len())?; + let done = state.split_cache(rng); + let mut last = 0; + let mut len = SplitLen::S00; + loop { + // Safety: find_minimum_cost will return an error if the block doesn't + // have a midpoint. + let (llpos, llcost) = find_minimum_cost(store, rng)?; + if rng.start() >= llpos || rng.end() <= llpos { crate::unreachable(); } + + // Ignore points we've already covered. + if llpos == rng.start() + 1 || calculate_block_size_auto(store, rng)? < llcost { + done.set(rng.start()); + } + else { + // Mark it as a split point and add it sorted. + split_b[len as usize] = llpos; + len = len.increment(); + + // Keep the list sorted. + if last > llpos { split_b[..len as usize].sort_unstable(); } + else { last = llpos; } + + // Stop if we've split the maximum number of times. + if len.is_max() { break; } + } + + // Look for a split and adjust the start/end accordingly. If we don't + // find one or the remaining distance is too small to continue, we're + // done! + if ! find_largest( + store.len(), + done, + &split_b[..len as usize], + &mut rng, + )? { break; } + } + + Ok(len) } diff --git a/flapfli/src/zopflipng/cache.rs b/flapfli/src/zopflipng/cache.rs index ac98710..019551b 100644 --- a/flapfli/src/zopflipng/cache.rs +++ b/flapfli/src/zopflipng/cache.rs @@ -1,16 +1,11 @@ /*! -# Flapfli: Longest Match Cache. +# Flapfli: Caches. -The LMC is used to eleviate some of the burden that would otherwise result from -calling `ZopfliHash::find` a hundred million times in a row. Haha. +This module contains the Longest Match cache along with several smaller caching +structures that aren't big enough to warrant their own dedicated modules. 
*/ use std::{ - alloc::{ - alloc, - handle_alloc_error, - Layout, - }, cell::Cell, ptr::{ addr_of_mut, @@ -23,7 +18,10 @@ use super::{ zopfli_error, ZOPFLI_MASTER_BLOCK_SIZE, ZOPFLI_MIN_MATCH, + ZopfliChunk, ZopfliError, + ZopfliRange, + ZopfliStateInit, }; @@ -32,83 +30,97 @@ use super::{ /// /// Length and distance are always fetched/stored together, so are grouped into /// a single value to reduce indexing/bounds overhead. +/// +/// A tuple would be friendlier, but doesn't scale particularly well, so +/// whatever. The `join_ld` and `split_ld` helper methods fill the ergonomic +/// gaps. const DEFAULT_LD: u32 = u32::from_le_bytes([1, 0, 0, 0]); /// # Sublength Cache Entries. +/// +/// This is the total number of "entries" a given sublength cache record +/// contains. const ZOPFLI_CACHE_LENGTH: usize = 8; /// # Sublength Cache Total Length. /// -/// Each entry uses three bytes, so the total size is… +/// Each entry uses three bytes, so the total length of a sublength cache +/// collection is thus… const SUBLEN_CACHED_LEN: usize = ZOPFLI_CACHE_LENGTH * 3; +/// # Length of Split Cache. +/// +/// The split cache is mercifully boolean, so we can pack it into a bit array, +/// reducing its size to one eighth what it otherwise would be. +const SPLIT_CACHE_LEN: usize = ZOPFLI_MASTER_BLOCK_SIZE.div_ceil(8); + +#[repr(C)] /// # Longest Match Cache. /// /// This structure holds cached length/distance details for individual -/// sublengths. Its memory usage is no joke, but the performance savings more -/// than make up for it. +/// "sublengths" — chunks of chunks of data processed by `ZopfliHash` — +/// mitigating the overhead of doing the same shit over and over and over +/// again. +/// +/// As with most of this library's caches, the memory usage is no joke, but +/// trying to get by without without it is downright _miserable_. +/// +/// On the bright side, we only need one instance per thread for the duration +/// of the program run, and thanks to some clever boxing, it winds up on the +/// heap instead of the stack. pub(crate) struct MatchCache { ld: [u32; ZOPFLI_MASTER_BLOCK_SIZE], - sublen: [[u8; SUBLEN_CACHED_LEN]; ZOPFLI_MASTER_BLOCK_SIZE], + sublen: [u8; SUBLEN_CACHED_LEN * ZOPFLI_MASTER_BLOCK_SIZE], } -impl MatchCache { +impl ZopfliStateInit for MatchCache { #[allow(unsafe_code)] - /// # New. - /// - /// Arrays holding a million elements are obviously less than ideal, but - /// because these are referenced repeatedly with different sub-slice sizes, - /// it is much better for performance than vectors that have to be - /// continuously resized/reallocated. + #[inline] + /// # State Initialization. /// - /// Still, these are too big for the stack, so we're initializing them via - /// raw pointers and jamming them straight into a `Box`. - pub(super) fn new() -> Box { - // Reserve the space. - const LAYOUT: Layout = Layout::new::(); - let out = NonNull::new(unsafe { alloc(LAYOUT).cast() }) - .unwrap_or_else(|| handle_alloc_error(LAYOUT)); - let ptr: *mut Self = out.as_ptr(); - - unsafe { - // The arrays can be zero-filled to start with; they'll get reset - // prior to use anyway. - addr_of_mut!((*ptr).ld).write_bytes(0, 1); - addr_of_mut!((*ptr).sublen).write_bytes(0, 1); - - // All set! - Box::from_raw(ptr) - } + /// See `ZopfliState` for more details. + unsafe fn state_init(nn: NonNull) { + let ptr = nn.as_ptr(); + + // The proper defaults for both members are _mostly_ zeroes, so let's + // roll with that since it's cheap and easy. 
(The values will be reset + // properly before each use anyway.) + addr_of_mut!((*ptr).ld).write_bytes(0, 1); + addr_of_mut!((*ptr).sublen).write_bytes(0, 1); } +} +impl MatchCache { /// # Initialize. /// - /// This resizes the cache buffers and resets their values to their default - /// states — one for length, zero for everything else. + /// Reset (enough of) the cache to its initial/default state for any + /// subsequent processing of `chunk` we might need to do. (Most chunks will + /// be smaller than `ZOPFLI_MASTER_BLOCK_SIZE` so we won't normally need to + /// reset _everything_.) /// - /// Because this is a shared buffer, allocations persist for the duration - /// of the program run so they can be reused. - pub(crate) fn init(&mut self, mut blocksize: usize) { - // Lodepng will never pass along more than ZOPFLI_MASTER_BLOCK_SIZE - // bytes, but this lets the compiler know we won't go over. - if ZOPFLI_MASTER_BLOCK_SIZE < blocksize { - blocksize = ZOPFLI_MASTER_BLOCK_SIZE; - } + /// The length half of `ld` defaults to one; everything else defaults to + /// zero. + pub(crate) fn init(&mut self, chunk: &ZopfliChunk<'_>) { + // Safety: ZopfliChunk verifies the block size is under the limit. + let blocksize = chunk.block_size().get(); + if blocksize > ZOPFLI_MASTER_BLOCK_SIZE { crate::unreachable(); } // Lengths default to one, everything else to zero. self.ld[..blocksize].fill(DEFAULT_LD); - self.sublen[..blocksize].fill([0; SUBLEN_CACHED_LEN]); + self.sublen[..blocksize * SUBLEN_CACHED_LEN].fill(0); } - #[allow(clippy::cast_possible_truncation)] + #[allow(unsafe_code, clippy::cast_possible_truncation)] /// # Find Match. /// - /// Find the sublength, distance, and length from cache, if possible. + /// Find the sublength, distance, and length from cache, if present, and + /// (possibly) add it to the cache if not. /// - /// Values are written directly to the passed arguments. A bool is returned - /// to indicate whether or not the find was successful. + /// The results are written back to the mutable arguments passed to the + /// method. A bool is returned to indicate whether or not the search was + /// successful. pub(crate) fn find( &self, pos: usize, @@ -123,22 +135,25 @@ impl MatchCache { // If we have no distance, we have no cache. let (cache_len, cache_dist) = ld_split(self.ld[pos]); if ! cache_len.is_zero() && cache_dist == 0 { return Ok(false); } - let cache_sublen: &[u8; SUBLEN_CACHED_LEN] = &self.sublen[pos]; + let cache_sublen: &[u8; SUBLEN_CACHED_LEN] = unsafe { + // Safety: the slice has the same length as the array. + &* self.sublen[pos * SUBLEN_CACHED_LEN..(pos + 1) * SUBLEN_CACHED_LEN].as_ptr().cast() + }; // Find the max sublength once, if ever. let maxlength = - if sublen.is_none() { 0 } + if sublen.is_none() { LitLen::L000 } else { max_sublen(cache_sublen) }; // Proceed if our cached length or max sublength are under the limit. if limit.is_max() || (cache_len as u16) <= (*limit as u16) || - (sublen.is_some() && maxlength >= (*limit as usize)) + (sublen.is_some() && (maxlength as u16) >= (*limit as u16)) { // Update length and distance if the sublength pointer is null or // the cached sublength is bigger than the cached length. - if sublen.is_none() || (cache_len as usize) <= maxlength { + if sublen.is_none() || (cache_len as u16) <= (maxlength as u16) { // Cap the length. 
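For reference, each cached sublength record described above is three bytes: the length minus three (DEFLATE matches start at `ZOPFLI_MIN_MATCH`), followed by the distance as a little-endian `u16`, with the last record in a slot doubling as the maximum-length marker. A minimal sketch of that packing (helper names are hypothetical):

```rust
// Pack/unpack one 3-byte sublength record: [length - 3, dist_lo, dist_hi].
fn pack_record(length: u16, dist: u16) -> [u8; 3] {
    debug_assert!((3..=258).contains(&length)); // DEFLATE match lengths run 3..=258.
    let [d0, d1] = dist.to_le_bytes();
    [(length - 3) as u8, d0, d1]
}

fn unpack_record(rec: [u8; 3]) -> (u16, u16) {
    (u16::from(rec[0]) + 3, u16::from_le_bytes([rec[1], rec[2]]))
}

fn main() {
    let rec = pack_record(258, 32_768);
    assert_eq!(rec, [255, 0, 128]);
    assert_eq!(unpack_record(rec), (258, 32_768));
}
```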
*length = cache_len; if (*length as u16) > (*limit as u16) { *length = *limit; } @@ -175,7 +190,7 @@ impl MatchCache { Ok(false) } - #[allow(clippy::cast_possible_truncation)] + #[cold] /// # Set Sublength. /// /// Save the provided sublength data to the cache. @@ -219,7 +234,7 @@ impl MatchCache { // The cache gets written three bytes at a time; this iterator will // help us eliminate the bounds checks we'd otherwise run into. - let mut dst = self.sublen[pos].chunks_exact_mut(3); + let mut dst = self.sublen.chunks_exact_mut(3).skip(pos * ZOPFLI_CACHE_LENGTH).take(ZOPFLI_CACHE_LENGTH); // Start by writing all mismatched pairs, up to the limit. for (i, pair) in (0_u8..=u8::MAX).zip(slice.windows(2)) { @@ -233,12 +248,12 @@ impl MatchCache { // The final value is implicitly "mismatched"; if we haven't hit the // limit we should write it too. if let Some([d0, d1, d2]) = dst.next() { - *d0 = (length as u16 - 3) as u8; + *d0 = length.to_packed_u8(); [*d1, *d2] = slice[slice.len() - 1].to_le_bytes(); // If we're still below the limit, copy (only) the length to the // last slot to simplify any subsequent max_length lookups. - if let Some([d0, _, _]) = dst.last() { *d0 = (length as u16 - 3) as u8; } + if let Some([d0, _, _]) = dst.last() { *d0 = length.to_packed_u8(); } } Ok(()) @@ -247,6 +262,69 @@ impl MatchCache { +/// # Split Cache. +/// +/// This structure holds a sort of bit-array used for keeping track of which +/// split points (indices) have already been tested to avoid the overhead of +/// testing them again. +/// +/// As with `MatchCache`, we only need one instance of this struct per thread +/// for the duration of the program run. +pub(crate) struct SplitCache { + set: [u8; SPLIT_CACHE_LEN], +} + +impl ZopfliStateInit for SplitCache { + #[allow(unsafe_code)] + #[inline] + /// # State Initialization. + /// + /// See `ZopfliState` for more details. + unsafe fn state_init(nn: NonNull) { + // False is zeroes all the way down. + addr_of_mut!((*nn.as_ptr()).set).write_bytes(0, 1); + } +} + +impl SplitCache { + /// # Initialize. + /// + /// Reset the first `rng.len()` bits — these ranges always start at zero — + /// to false so we can track a new set of indices. + pub(crate) fn init(&mut self, rng: ZopfliRange) { + // Safety: ZopfliRange checks the range is non-empty and within the + // limit. + let blocksize = rng.len().get(); + if ZOPFLI_MASTER_BLOCK_SIZE < blocksize { crate::unreachable(); } + + // Fill uses bytes rather than bits, so we need to round up to ensure + // complete coverage for our range. + let bitsize = blocksize.div_ceil(8); + self.set[..bitsize].fill(0); + } + + #[inline] + /// # Not Checked? + /// + /// Returns true if the value is currently _unchecked_. (The caller takes + /// action on the negative rather than the positive.) + pub(crate) const fn is_unset(&self, pos: usize) -> bool { + let idx = pos.wrapping_div(8); // The byte. + let mask: u8 = 1 << (pos % 8); // The bit. + SPLIT_CACHE_LEN <= idx || 0 == self.set[idx] & mask + } + + #[inline] + /// # Mark as Checked. + pub(crate) fn set(&mut self, pos: usize) { + let idx = pos.wrapping_div(8); // The byte. + let mask: u8 = 1 << (pos % 8); // The bit. + if idx < SPLIT_CACHE_LEN { self.set[idx] |= mask; } + } +} + + + /// # Squeeze Cache. /// /// This struct stores LZ77 length costs and paths. @@ -255,54 +333,43 @@ impl MatchCache { /// to-block, but can actually go as high as a million and one! 
/// /// Lest that sound like a terrible waste, this struct only exists as part of -/// a thread-local static so will be reused as many times as needed. +/// a thread-local static so will be reused as many times as needed. That +/// static is also boxed to ensure the data winds up on the heap instead of the +/// stack. pub(crate) struct SqueezeCache { costs: [(f32, LitLen); ZOPFLI_MASTER_BLOCK_SIZE + 1], paths: [LitLen; ZOPFLI_MASTER_BLOCK_SIZE], costs_len: Cell, } -impl SqueezeCache { +impl ZopfliStateInit for SqueezeCache { #[allow(unsafe_code)] - /// # New (Boxed) Instance. - /// - /// Arrays holding a million+ elements is obviously less than ideal, but - /// because these are referenced repeatedly with different sub-slice sizes, - /// it is much better for performance than vectors that have to be - /// continuously resized/reallocated. + #[inline] + /// # State Initialization. /// - /// Still, these are too big for the stack, so we're initializing them via - /// raw pointers and jamming them straight into a `Box`. - pub(crate) fn new() -> Box { - // Reserve the space. - const LAYOUT: Layout = Layout::new::(); - let out = NonNull::new(unsafe { alloc(LAYOUT).cast() }) - .unwrap_or_else(|| handle_alloc_error(LAYOUT)); - let ptr: *mut Self = out.as_ptr(); - - unsafe { - // The arrays can be zero-filled to start with; they'll be reset - // or overwritten before use anyway. - addr_of_mut!((*ptr).costs).write_bytes(0, 1); - addr_of_mut!((*ptr).paths).write_bytes(0, 1); - - // Zero works equally well for the initial length, especially - // because it's true! Haha. - addr_of_mut!((*ptr).costs_len).write(Cell::new(0)); - - // All set! - Box::from_raw(ptr) - } + /// See `ZopfliState` for more details. + unsafe fn state_init(nn: NonNull) { + let ptr = nn.as_ptr(); + + // The arrays can be zero-filled to start with; they'll be reset + // or overwritten before each use anyway. + addr_of_mut!((*ptr).costs).write_bytes(0, 1); + addr_of_mut!((*ptr).paths).write_bytes(0, 1); + + // Zero works equally well for the initial length, especially since + // that happens to be true! + addr_of_mut!((*ptr).costs_len).write(Cell::new(0)); } +} +impl SqueezeCache { /// # Resize Costs. /// - /// This sets the internal costs length to match the desired blocksize, but - /// does _not_ reset their values. (Unlike the LMC, which more or less - /// persists for the duration of a given block, costs are calculated and - /// discarded and recalculated and discarded… several times.) - pub(crate) fn resize_costs(&self, blocksize: usize) { - self.costs_len.set(blocksize); + /// This method merely sets the internal cost-length variable to match + /// `chunk`'s block size (plus one). (It does _not_ reset the actual + /// cost data or anything like that.) + pub(crate) fn resize_costs(&self, chunk: &ZopfliChunk<'_>) { + self.costs_len.set(chunk.block_size().get() + 1); } /// # Reset Costs. @@ -311,14 +378,16 @@ impl SqueezeCache { /// `resize_costs` call. /// /// Note that only the costs themselves are reset; the lengths and paths - /// are dealt with _in situ_ during crunching (without being read). + /// are dealt with _in situ_ during crunching (without first being read). pub(crate) fn reset_costs(&mut self) -> &mut [(f32, LitLen)] { - let costs = self.costs.get_mut(..self.costs_len.get()).unwrap_or(&mut []); - if ! costs.is_empty() { - // The first cost needs to be zero; the rest need to be infinity. 
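The cost reset that follows is the usual shortest-path seeding: reaching position zero costs nothing, every later position starts at positive infinity and gets relaxed downward during the squeeze pass. A trivial standalone sketch (names are illustrative):

```rust
// Seed a cost table for a forward shortest-path pass.
fn seed_costs(len: usize) -> Vec<f32> {
    let mut costs = vec![f32::INFINITY; len];
    if let Some(first) = costs.first_mut() { *first = 0.0; }
    costs
}

fn main() {
    let costs = seed_costs(4);
    assert_eq!(costs[0], 0.0);
    assert!(costs[1..].iter().all(|c| c.is_infinite()));
}
```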
- costs[0].0 = 0.0; - for c in costs.iter_mut().skip(1) { c.0 = f32::INFINITY; } - } + // Safety: ZopfliChunk verifies the block size is under the limit and + // non-empty, and since costs is always blocks+1, the minimum is 2. + let len = self.costs_len.get(); + if ! (2..=ZOPFLI_MASTER_BLOCK_SIZE + 1).contains(&len) { crate::unreachable(); } + + let costs = &mut self.costs[..len]; + costs[0].0 = 0.0; + for c in &mut costs[1..] { c.0 = f32::INFINITY; } costs } @@ -373,12 +442,17 @@ const fn ld_split(ld: u32) -> (LitLen, u16) { /// # Max Sublength. /// -/// Return the maximum sublength length for a given chunk. -const fn max_sublen(slice: &[u8; SUBLEN_CACHED_LEN]) -> usize { +/// Return the maximum sublength length for a given cache chunk. +/// +/// Each three-byte cache-entry has its length recorded in the first byte; the +/// last such entry holds the maximum. +const fn max_sublen(slice: &[u8; SUBLEN_CACHED_LEN]) -> LitLen { // If the first chunk has no distance, assume a zero length. - if slice[1] == 0 && slice[2] == 0 { 0 } + if slice[1] == 0 && slice[2] == 0 { LitLen::L000 } // Otherwise the "max" is stored as the first value of the last chunk. - else { slice[SUBLEN_CACHED_LEN - 3] as usize + 3 } + // Since lengths are stored `-3`, we have to add three back to the stored + // value to make it a real length. + else { LitLen::from_packed_u8(slice[SUBLEN_CACHED_LEN - 3]) } } /// # Write Sublength. @@ -388,13 +462,13 @@ fn write_sublen(src: &[u8; SUBLEN_CACHED_LEN], dst: &mut [u16; SUBLEN_LEN]) { let maxlength = max_sublen(src); let mut old = 0; for chunk in src.chunks_exact(3) { - let length = usize::from(chunk[0]) + ZOPFLI_MIN_MATCH; - if old <= length { + let length = LitLen::from_packed_u8(chunk[0]); + if old <= (length as usize) { let value = u16::from_le_bytes([chunk[1], chunk[2]]); - dst[old..=length].fill(value); + dst[old..=length as usize].fill(value); } - if length == maxlength { return; } - old = length + 1; + if (length as u16) >= (maxlength as u16) { return; } + old = (length as usize) + 1; } } @@ -414,4 +488,52 @@ mod tests { // Joining should get us back where we started. assert_eq!(DEFAULT_LD, ld_join(len, dist)); } + + #[test] + fn t_split_mask() { + // What we expect our masks to look like. + const fn split_cache_mask(pos: usize) -> u8 { + match pos % 8 { + 0 => 0b0000_0001, + 1 => 0b0000_0010, + 2 => 0b0000_0100, + 3 => 0b0000_1000, + 4 => 0b0001_0000, + 5 => 0b0010_0000, + 6 => 0b0100_0000, + _ => 0b1000_0000, + } + } + + for pos in 0..255_usize { + let mask: u8 = 1 << (pos % 8); + assert_eq!(mask, split_cache_mask(pos)); + } + } + + #[test] + fn t_split_cache() { + let mut cache = SplitCache { + set: [0_u8; SPLIT_CACHE_LEN], + }; + + // Check that positions are false to start, true after set. + for i in 0..ZOPFLI_MASTER_BLOCK_SIZE { + assert!(cache.is_unset(i)); + cache.set(i); + assert!(! cache.is_unset(i)); + } + + // Everything should be set now. + assert!(cache.set.iter().all(|&b| b == u8::MAX)); + + // If we initialize with a small value, only those bits should be + // affected. + cache.init(ZopfliRange::new(0, 32).unwrap()); + assert_eq!(cache.set[0], 0); + assert_eq!(cache.set[1], 0); + assert_eq!(cache.set[2], 0); + assert_eq!(cache.set[3], 0); + assert_eq!(cache.set[4], u8::MAX); + } } diff --git a/flapfli/src/zopflipng/chunk.rs b/flapfli/src/zopflipng/chunk.rs new file mode 100644 index 0000000..c866139 --- /dev/null +++ b/flapfli/src/zopflipng/chunk.rs @@ -0,0 +1,314 @@ +/*! +# Flapfli: Slice Chunks. 
+*/ + +use std::num::NonZeroUsize; +use super::{ + zopfli_error, + ZOPFLI_MASTER_BLOCK_SIZE, + ZOPFLI_WINDOW_SIZE, + ZopfliError, + ZopfliRange, +}; + + + +#[derive(Debug, Clone, Copy)] +/// # Deflate Chunk. +/// +/// The deflate/zopfli process is weird. The data is sliced in +/// `ZOPFLI_MASTER_BLOCK_SIZE` windows, kinda, but the previous data is +/// included for the ride because it is sometimes relevant for hashing and +/// caching. +/// +/// Similar to `ZopfliRange`, this struct mainly exists to help enforce the +/// logical constraints so we don't have to repeat sanity checks every five +/// seconds. +/// +/// The struct's `from` value may or may not be zero — on the first pass there +/// won't be any previous data — but it will always be less than `arr.len()`, +/// and `arr.len() - from` will always be less than or equal to +/// `ZOPFLI_MASTER_BLOCK_SIZE`, i.e. one million. +pub(crate) struct ZopfliChunk<'a> { + arr: &'a [u8], + from: usize, +} + +impl<'a> ZopfliChunk<'a> { + /// # New. + /// + /// Define a new chunk with the given source and starting point. + /// + /// ## Errors. + /// + /// This will return an error if the slice is empty, `from` is out of + /// range, or the length from `from` is greater than + /// `ZOPFLI_MASTER_BLOCK_SIZE`. + pub(crate) const fn new(arr: &'a [u8], from: usize) -> Result { + if from < arr.len() && arr.len() - from <= ZOPFLI_MASTER_BLOCK_SIZE { + Ok(Self { arr, from }) + } + else { Err(zopfli_error!()) } + } + + /// # Reslice. + /// + /// Return a new instance capped to the range, or an error if the range is + /// out of bounds or otherwise violates the struct's requirements. + /// + /// The `start` serves as the new instances `from`. If `end` is less than + /// `arr.len()`, the new chunk's slice will be truncated accordingly. + pub(crate) fn reslice(&self, start: usize, end: usize) -> Result { + if start < end && end - start <= ZOPFLI_MASTER_BLOCK_SIZE && end <= self.arr.len() { + let arr = &self.arr[..end]; + Ok(Self { arr, from: start }) + } + else { Err(zopfli_error!()) } + } + + /// # Reslice to Range. + /// + /// Same as `ZopfliChunk::reslice`, but with the range specified as a + /// `ZopfliRange`. + /// + /// This version should be preferred in cases where such a range has + /// already been constructed since it moots all but one of the conditions + /// we'd otherwise need to verify before giving the `Ok()`. + pub(crate) fn reslice_rng(&self, rng: ZopfliRange) -> Result { + let arr = self.arr.get(..rng.end()).ok_or(zopfli_error!())?; + Ok(Self { arr, from: rng.start() }) + } +} + +impl<'a> ZopfliChunk<'a> { + /// # Full Slice. + /// + /// Return the entire data slice, including the prelude, if any. + /// + /// Note: this will never be empty. + pub(crate) const fn arr(&self) -> &[u8] { self.arr } + + /// # Block Slice. + /// + /// Return the "active" portion of the data slice, i.e. everything from + /// `from`. + /// + /// Note: this will never be empty. + pub(crate) fn block(&self) -> &[u8] { + #[allow(unsafe_code)] + // Safety: from is verified during construction. + unsafe { self.arr.get_unchecked(self.from..) } + } + + /// # First Value. + /// + /// Return the first value from the "active" portion of the data slice, + /// i.e. `arr[from]`. + /// + /// Because the current block may never be empty, there will always be at + /// least one value. + pub(crate) const fn first(&self) -> u8 { + // Safety: from is verified during construction. 
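A scaled-down model of the chunk-with-prelude idea may help here: the slice carries everything seen so far, `from` marks where the active block begins, and hashing can warm up on as much as one window's worth of prelude (32,768 bytes in the real code; a tiny window below purely for illustration):

```rust
// Stand-in types and constants; a sketch of the prelude/active-block split.
const WINDOW: usize = 8; // The real ZOPFLI_WINDOW_SIZE is 32,768.

struct Chunk<'a> { arr: &'a [u8], from: usize }

impl<'a> Chunk<'a> {
    fn block(&self) -> &'a [u8] { &self.arr[self.from..] }
    fn window_start(&self) -> usize { self.from.saturating_sub(WINDOW) }
}

fn main() {
    let data = [7_u8; 20];
    let chunk = Chunk { arr: &data, from: 12 };
    assert_eq!(chunk.block().len(), 8);  // Active block: bytes 12..20.
    assert_eq!(chunk.window_start(), 4); // Prelude available for hashing: bytes 4..12.
}
```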
+ if self.from >= self.arr.len() { crate::unreachable(); } + self.arr[self.from] + } + + /// # Active Length. + /// + /// Return the length of the "active" slice, e.g. its block size. + pub(crate) const fn block_size(&self) -> NonZeroUsize { + #[allow(unsafe_code)] + // Safety: the length is verified during construction. + unsafe { NonZeroUsize::new_unchecked(self.arr.len() - self.from) } + } + + /// # Current Position. + /// + /// Return the `from` index that marks the starting point of the "active" + /// portion of the data slice. + pub(crate) const fn pos(&self) -> usize { self.from } + + /// # Total Length. + /// + /// Return the length of the entire data slice, prelude and all. + pub(crate) const fn total_len(&self) -> NonZeroUsize { + #[allow(unsafe_code)] + // Safety: slices are verified non-empty at construction. + unsafe { NonZeroUsize::new_unchecked(self.arr.len()) } + } + + #[allow(unsafe_code)] + /// # Warmup Values. + /// + /// This returns the first one or two values from `window_start`, used for + /// warming up the `ZopfliHash` cache. + /// + /// Note: it is probably impossible for there to not be a second value, but + /// since we don't explicitly require lengths of two, it's safer to treat + /// it as optional. + pub(crate) const fn warmup_values(&self) -> (u8, Option) { + // Safety: from (and by association window_start) is verified at + // construction. + let window_start = self.window_start(); + if window_start >= self.arr.len() { crate::unreachable(); } + + let a = self.arr[window_start]; + + // There will usually be a second value, but not always! + let b = + if window_start + 1 < self.arr.len() { Some(self.arr[window_start + 1]) } + else { None }; + + (a, b) + } + + /// # Window Start. + /// + /// If we're at the beginning of a chunk, this is equivalent to + /// `ZopfliChunk::pos` (e.g. `self.from`), otherwise it reaches back up to + /// `ZOPFLI_WINDOW_SIZE` slots into the prelude, returning that index + /// instead. + pub(crate) const fn window_start(&self) -> usize { + self.from.saturating_sub(ZOPFLI_WINDOW_SIZE) + } +} + +impl<'a> ZopfliChunk<'a> { + /// # Reducing Prelude Iterator. + /// + /// Same as `ZopfliChunk::reducing_block_iter`, except the chunks are + /// restricted to the range of the prelude — `window_start..from` — if any. + /// + /// If there is no prelude, `None` is returned instead. + /// + /// Note: the internal slice will be truncated if needed to uphold the + /// maximum length constraint, but that loss doesn't actually matter since + /// prelude hashing never looks at more than `u16::MAX` bytes anyway. (A + /// million minus thirty-odd thousand is still much more than that!) + pub(crate) fn reducing_prelude_iter(self) -> Option>> { + // If we're at the start of the slice, there is no prelude. + if self.from == 0 { None } + else { + // Safety: from (and by association window_start) is verified at + // construction. + let window_start = self.window_start(); + if window_start >= self.arr.len() { crate::unreachable(); } + + let arr = + if self.arr.len() - window_start <= ZOPFLI_MASTER_BLOCK_SIZE { self.arr } + else { &self.arr[..window_start + ZOPFLI_MASTER_BLOCK_SIZE] }; + + let chunk = Self { arr, from: window_start }; + Some(ZopfliChunkIter(chunk).take(self.from - window_start)) + } + } + + /// # Reducing Block Chunk Iterator. + /// + /// Return an iterator that increases the block's starting position (`from`) + /// after each pass, stopping once the chunk would be empty/invalid. 
+ /// + /// Similar to the more generic `ReducingSlice` iterator, this starts with + /// the current value, so there will always be at least one valid result + /// before `None`. + pub(crate) const fn reducing_block_iter(self) -> ZopfliChunkIter<'a> { + ZopfliChunkIter(self) + } +} + + + +/// # Chunk Iterator. +/// +/// This iterator yields increasingly smaller chunks until empty, incrementing +/// the starting position by one after each cycle, beginning with the seed +/// chunk. +pub(crate) struct ZopfliChunkIter<'a>(ZopfliChunk<'a>); + +impl<'a> Iterator for ZopfliChunkIter<'a> { + type Item = ZopfliChunk<'a>; + + fn next(&mut self) -> Option { + // We potentially break the constraints during iteration so need to + // explicitly check from is still in range and non-empty before + // returning. + if self.0.from < self.0.arr.len() { + let next = Some(self.0); + self.0.from += 1; + next + } + else { None } + } + + fn size_hint(&self) -> (usize, Option) { + let len = self.0.arr.len() - self.0.from; + (len, Some(len)) + } +} + +impl<'a> ExactSizeIterator for ZopfliChunkIter<'a> { + fn len(&self) -> usize { self.0.arr.len() - self.0.from } +} + + + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn t_reducing_block_iter() { + let arr: &[u8] = &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]; + let chunk = ZopfliChunk { arr, from: 1 }; + let mut iter = chunk.reducing_block_iter(); + + let mut len = 9; + let mut from = 1; + loop { + // Check the iterator's presumed length. + assert_eq!(iter.len(), len); + if len == 0 { break; } + + // Pull the next entry and check the result. + let next = iter.next().expect("reducing block iter terminated early"); + assert_eq!(next.block(), &arr[from..]); + assert_eq!(next.pos(), from); + + len -= 1; + from += 1; + } + + // It should be empty. + assert!(iter.next().is_none()); + } + + #[test] + fn t_reducing_prelude_iter() { + let arr: &[u8] = &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]; + let chunk = ZopfliChunk { arr, from: 1 }; + let mut iter = chunk.reducing_prelude_iter().expect("missing prelude iter"); + + assert_eq!(iter.len(), 1); + let next = iter.next().expect("reducing prelude iter terminated early"); + assert_eq!(next.block(), arr); + assert_eq!(next.pos(), 0); + + assert_eq!(iter.len(), 0); + assert!(iter.next().is_none()); + + // Let's try it again with a chunk that has no prelude. + let chunk = ZopfliChunk { arr, from: 0 }; + assert!(chunk.reducing_prelude_iter().is_none()); + + // And let's try one that is too big. + let arr: &[u8] = &[0; ZOPFLI_MASTER_BLOCK_SIZE + 10]; + let chunk = ZopfliChunk::new(arr, 10).expect("Chunk failed."); + let mut iter = chunk.reducing_prelude_iter().expect("missing prelude iter"); + + assert_eq!(iter.len(), 10); + let next = iter.next().expect("reducing prelude iter terminated early"); + + // The slice should be truncated to fit the constraint. + assert_eq!(next.block_size().get(), ZOPFLI_MASTER_BLOCK_SIZE); + } +} diff --git a/flapfli/src/zopflipng/error.rs b/flapfli/src/zopflipng/error.rs index 250c467..021a443 100644 --- a/flapfli/src/zopflipng/error.rs +++ b/flapfli/src/zopflipng/error.rs @@ -26,6 +26,9 @@ pub(crate) type ZopfliError = (); /// /// When debug assertions are _enabled_, error responses panic with the /// relevant source details to aid further investigation. +/// +/// This struct stores those details, allowing us to delay the panicking until +/// the error has bubbled back to lodepng. 
pub(crate) struct ZopfliError { file: &'static str, line: u32, @@ -58,9 +61,7 @@ impl fmt::Display for ZopfliError { /// /// The debug version of this macro panics with a message indicating the file /// and line number to aid further investigation. -macro_rules! zopfli_error { - () => (ZopfliError::new(file!(), line!())); -} +macro_rules! zopfli_error { () => (ZopfliError::new(file!(), line!())); } #[cfg(not(debug_assertions))] /// # Error Macro (Release). @@ -68,5 +69,5 @@ macro_rules! zopfli_error { /// The non-debug version simply returns a `()`. macro_rules! zopfli_error { () => (()); } -/// # Expose it to the rest of the module. +/// # Expose the macro to the rest of the module. pub(super) use zopfli_error; diff --git a/flapfli/src/zopflipng/hash.rs b/flapfli/src/zopflipng/hash.rs index 1b55ebe..d0f751e 100644 --- a/flapfli/src/zopflipng/hash.rs +++ b/flapfli/src/zopflipng/hash.rs @@ -19,23 +19,27 @@ use std::{ }, }; use super::{ - DISTANCE_BITS, + DISTANCE_BITS_F, DISTANCE_SYMBOLS, - LENGTH_SYMBOL_BITS, + LENGTH_SYMBOL_BITS_F, LENGTH_SYMBOLS, LitLen, LZ77Store, MatchCache, + ReducingSlices, + SplitCache, SqueezeCache, stats::SymbolStats, SUBLEN_LEN, zopfli_error, ZOPFLI_MAX_MATCH, ZOPFLI_MIN_MATCH, + ZOPFLI_WINDOW_SIZE, + ZopfliChunk, ZopfliError, + ZopfliRange, }; -const ZOPFLI_WINDOW_SIZE: usize = 32_768; const ZOPFLI_WINDOW_MASK: usize = ZOPFLI_WINDOW_SIZE - 1; const HASH_SHIFT: i32 = 5; const HASH_MASK: i16 = 32_767; @@ -55,57 +59,105 @@ const ZEROED_SUBLEN: [u16; SUBLEN_LEN] = [0; SUBLEN_LEN]; /// # Zopfli State. /// -/// This consolidates the Longest Match, Squeeze, and Hash caches into a single -/// structure, cutting down on the number of references being bounced around -/// from method to method. +/// This consolidates the Longest Match, Squeeze, Split, and Hash caches into a +/// single gratuitous structure, cutting down on the number of references we +/// need to bounce from method to method. +/// +/// Each member is big and terrible in its own right, but on the bright side we +/// only need a single instance per thread for the duration of the program run, +/// so the allocations are a one-and-done affair. +/// +/// (That local lives in `deflate.rs`.) pub(crate) struct ZopfliState { - lmc: Box, - hash: Box, - squeeze: Box, + lmc: MatchCache, + hash: ZopfliHash, + split: SplitCache, + squeeze: SqueezeCache, } impl ZopfliState { + #[allow(unsafe_code)] + #[inline(never)] /// # New. - pub(crate) fn new() -> Self { - Self { - lmc: MatchCache::new(), - hash: ZopfliHash::new(), - squeeze: SqueezeCache::new(), + /// + /// This struct's members are mostly large and terrible arrays. To keep + /// them off the stack, it is necessary to initialize everything from raw + /// pointers and box them up. + /// + /// This unfortunately requires a lot of upfront unsafe code during + /// construction, but everything can be accessed normally thereafter. + /// + /// To cut down on some of the complexity, the manual layout allocation and + /// boxing is done once, here, instead of separately for each individual + /// member. + /// + /// See `ZopfliStateInit` below for a few more details. + pub(crate) fn new() -> Box { + // Reserve the space. + const LAYOUT: Layout = Layout::new::(); + let out: NonNull = NonNull::new(unsafe { alloc(LAYOUT).cast() }) + .unwrap_or_else(|| handle_alloc_error(LAYOUT)); + let ptr = out.as_ptr(); + + unsafe { + // Initialize the members. 
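The construction pattern used here, allocating the layout directly, initializing each field through raw pointers, and then adopting the allocation with `Box::from_raw`, is what keeps the huge arrays from ever existing on the stack. A reduced, self-contained sketch of the same trick with a stand-in type:

```rust
use std::alloc::{alloc, handle_alloc_error, Layout};
use std::ptr::{addr_of_mut, NonNull};

// `Big` is a stand-in for the real cache structs; a megabyte of payload is
// plenty to make a plain `Box::new` stack copy undesirable.
struct Big { bytes: [u8; 1 << 20], len: usize }

fn new_boxed_big() -> Box<Big> {
    const LAYOUT: Layout = Layout::new::<Big>();
    let nn: NonNull<Big> = NonNull::new(unsafe { alloc(LAYOUT).cast() })
        .unwrap_or_else(|| handle_alloc_error(LAYOUT));
    let ptr = nn.as_ptr();
    unsafe {
        // Fill each field in place; nothing is read before being written.
        addr_of_mut!((*ptr).bytes).write_bytes(0, 1);
        addr_of_mut!((*ptr).len).write(0);
        // Hand ownership of the heap allocation to a Box.
        Box::from_raw(ptr)
    }
}

fn main() {
    let big = new_boxed_big();
    assert_eq!(big.len, 0);
    assert!(big.bytes.iter().all(|&b| b == 0));
}
```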
+ MatchCache::state_init(NonNull::new_unchecked(addr_of_mut!((*ptr).lmc))); + ZopfliHash::state_init(NonNull::new_unchecked(addr_of_mut!((*ptr).hash))); + SplitCache::state_init(NonNull::new_unchecked(addr_of_mut!((*ptr).split))); + SqueezeCache::state_init(NonNull::new_unchecked(addr_of_mut!((*ptr).squeeze))); + + // Done! + Box::from_raw(ptr) } } /// # Initialize LMC/Squeeze Caches. - pub(crate) fn init_lmc(&mut self, blocksize: usize) { - self.lmc.init(blocksize); - self.squeeze.resize_costs(blocksize + 1); + /// + /// This prepares the Longest Match Cache and Squeeze caches for subsequent + /// work on `chunk`, if any. + pub(crate) fn init_lmc(&mut self, chunk: &ZopfliChunk<'_>) { + self.lmc.init(chunk); + self.squeeze.resize_costs(chunk); + } + + /// # Split Cache. + /// + /// Clear the split cache and return a mutable reference to it so the + /// split points within `rng` can be tracked. + pub(crate) fn split_cache(&mut self, rng: ZopfliRange) -> &mut SplitCache { + self.split.init(rng); + &mut self.split } } impl ZopfliState { #[inline(never)] /// # Greedy LZ77 Run (No Inlining). + /// + /// Same as `greedy`, but the compiler is given an `inline(never)` hint to + /// (hopefully) keep all this code from affecting its inlining decisions + /// about the caller. pub(crate) fn greedy_cold( &mut self, - arr: &[u8], - instart: usize, + chunk: ZopfliChunk<'_>, store: &mut LZ77Store, cache: Option, ) -> Result<(), ZopfliError> { - self.greedy(arr, instart, store, cache) + self.greedy(chunk, store, cache) } - #[allow(unsafe_code, clippy::cast_possible_truncation)] + #[allow(clippy::cast_possible_truncation)] #[inline] /// # Greedy LZ77 Run. /// /// This method looks for best-length matches in the data (and/or cache), /// updating the store with the results. /// - /// This is one of two entrypoints into the inner `ZopfliHash` data. + /// This is very similar to `ZopfliState::optimal_run`, but better suited + /// for general-purpose store population. pub(crate) fn greedy( &mut self, - arr: &[u8], - instart: usize, + chunk: ZopfliChunk<'_>, store: &mut LZ77Store, cache: Option, ) -> Result<(), ZopfliError> { @@ -117,7 +169,10 @@ impl ZopfliState { // Reset the store and hash. store.clear(); - self.hash.reset(arr, instart); + self.hash.reset(chunk); + + // Short circuit. + let mut iter = chunk.reducing_block_iter(); // We'll need a few more variables… let mut sublen = ZEROED_SUBLEN; @@ -126,17 +181,16 @@ impl ZopfliState { let mut prev_length = LitLen::L000; let mut prev_distance: u16 = 0; let mut match_available = false; + let mut prev_value = 0_u8; // Loop the data! - let mut i = instart; - while i < arr.len() { - // Update the hash. - self.hash.update_hash(&arr[i..], i); + while let Some(chunk2) = iter.next() { + self.hash.update_hash(chunk2); + let prev_prev_value = std::mem::replace(&mut prev_value, chunk2.first()); // Run the finder. self.hash.find( - arr, - i, + chunk2, LitLen::MAX_MATCH, &mut Some(&mut sublen), &mut distance, @@ -152,20 +206,15 @@ impl ZopfliState { match_available = false; if length_score > prev_length_score + 1 { - // Safety: match_available starts false so even if instart - // is zero, we won't reach this part until we've iterated - // at least once. store.push( - LitLen::from_u8(unsafe { *arr.get_unchecked(i - 1) }), + LitLen::from_u8(prev_prev_value), 0, - i - 1, + chunk2.pos() - 1, ); if length_score >= ZOPFLI_MIN_MATCH as u16 && ! 
length.is_max() { match_available = true; prev_length = length; prev_distance = distance; - - i += 1; continue; } } @@ -175,16 +224,13 @@ impl ZopfliState { distance = prev_distance; // Write the values! - store.push(length, distance, i - 1); + store.push(length, distance, chunk2.pos() - 1); // Update the hash up through length and increment the loop // position accordingly. - for _ in 2..(length as u16) { - i += 1; - self.hash.update_hash(&arr[i..], i); + for chunk2 in iter.by_ref().take(length as usize - 2) { + self.hash.update_hash(chunk2); } - - i += 1; continue; } } @@ -194,58 +240,73 @@ impl ZopfliState { match_available = true; prev_length = length; prev_distance = distance; - - i += 1; continue; } // Write the current length/distance. if length_score >= ZOPFLI_MIN_MATCH as u16 { - store.push(length, distance, i); + store.push(length, distance, chunk2.pos()); } // Write from the source with no distance and reset the length to // one. else { length = LitLen::L001; - store.push(LitLen::from_u8(arr[i]), 0, i); + store.push(LitLen::from_u8(chunk2.first()), 0, chunk2.pos()); } // Update the hash up through length and increment the loop // position accordingly. - for _ in 1..(length as u16) { - i += 1; - self.hash.update_hash(&arr[i..], i); + for chunk2 in iter.by_ref().take(length as usize - 1) { + self.hash.update_hash(chunk2); } - - i += 1; } Ok(()) } #[inline(never)] - /// # Optimal Run (No Inlining). - pub(crate) fn optimal_run_cold( + /// # Optimal Run (Fixed). + /// + /// Same as `ZopfliHash::optimal_run`, but fixed tree counts and symbols + /// are used instead of the store's actual histogram. + pub(crate) fn optimal_run_fixed( &mut self, - arr: &[u8], - instart: usize, - stats: Option<&SymbolStats>, + chunk: ZopfliChunk<'_>, store: &mut LZ77Store, - ) -> Result<(), ZopfliError> { self.optimal_run(arr, instart, stats, store) } + ) -> Result<(), ZopfliError> { + // Reset the store and costs. + store.clear(); + let costs = self.squeeze.reset_costs(); + if ! costs.is_empty() { + // Reset and warm the hash. + self.hash.reset(chunk); - #[inline] + // Forward and backward squeeze passes. + self.hash.get_best_lengths_fixed(chunk, costs, &mut self.lmc)?; + let paths = self.squeeze.trace_paths()?; + if ! paths.is_empty() { + self.hash.follow_paths( + chunk, + paths, + store, + &mut self.lmc, + )?; + } + } + + Ok(()) + } + + #[inline(never)] /// # Optimal Run. /// - /// This performs backward/forward squeeze passes on the data, optionally - /// considering existing histogram data. The `store` is updated with the - /// best-length match data. - /// - /// This is one of two entrypoints into the inner `ZopfliHash` data. + /// This performs backward/forward squeeze passes on the data with + /// existing histogram data. The `store` is updated with the best-length + /// match data. pub(crate) fn optimal_run( &mut self, - arr: &[u8], - instart: usize, - stats: Option<&SymbolStats>, + chunk: ZopfliChunk<'_>, + stats: &SymbolStats, store: &mut LZ77Store, ) -> Result<(), ZopfliError> { // Reset the store and costs. @@ -253,15 +314,14 @@ impl ZopfliState { let costs = self.squeeze.reset_costs(); if ! costs.is_empty() { // Reset and warm the hash. - self.hash.reset(arr, instart); + self.hash.reset(chunk); // Forward and backward squeeze passes. - self.hash.get_best_lengths(arr, instart, stats, costs, &mut self.lmc)?; + self.hash.get_best_lengths(chunk, stats, costs, &mut self.lmc)?; let paths = self.squeeze.trace_paths()?; if ! 
paths.is_empty() { self.hash.follow_paths( - arr, - instart, + chunk, paths, store, &mut self.lmc, @@ -275,88 +335,85 @@ impl ZopfliState { +/// # State Init. +/// +/// The `ZopfliState` struct is initialized from a raw pointer to prevent +/// stack allocations. This trait exposes — in as limited a way as possible — +/// raw initialization methods for its members. (`ZopfliState::new` is the only +/// place that calls these methods.) +/// +/// The `state_init` invocations do not necessarily populate _default_ values +/// since they'll be re(reset) prior to use anyway, but the values will at +/// least be valid for their types, preventing accidental UB. +pub(crate) trait ZopfliStateInit { + #[allow(unsafe_code)] + /// # State Initialization. + unsafe fn state_init(nn: NonNull); +} + + + #[derive(Clone, Copy)] /// # Zopfli Hash. /// /// This structure tracks byte values and hashes by position, facilitating /// match-finding (length and distance) at various offsets. -/// -/// It is functionally equivalent to the original `hash.c` structure, but with -/// more consistent member typing, sizing, and naming. struct ZopfliHash { chain1: ZopfliHashChain, chain2: ZopfliHashChain, - /// Repetitions of the same byte after this. + /// # Repetitions of the same byte after this. same: [u16; ZOPFLI_WINDOW_SIZE], } -impl ZopfliHash { +impl ZopfliStateInit for ZopfliHash { #[allow(unsafe_code)] - /// # New (Boxed) Instance. - /// - /// The fixed arrays holding this structure's data are monstrous — 458,756 - /// bytes per instance! — but absolutely critical for performance. + #[inline] + /// # State Initialization. /// - /// To keep Rust from placing all that shit on the stack — as it would - /// normally try to do — this method manually initializes everything from - /// raw pointers, then boxes it up for delivery à la [`zopfli-rs`](https://github.com/zopfli-rs/zopfli). - fn new() -> Box { - // Reserve the space. - const LAYOUT: Layout = Layout::new::(); - let out = NonNull::new(unsafe { alloc(LAYOUT).cast() }) - .unwrap_or_else(|| handle_alloc_error(LAYOUT)); - let ptr: *mut Self = out.as_ptr(); - - // Safety: all this pointer business is necessary to keep the content - // off the stack. Once it's boxed we can breathe easier. ;) - unsafe { - // All the hash/index arrays default to `-1_i16` for `None`, which - // we can do efficiently by setting all bits to one. - addr_of_mut!((*ptr).chain1.hash_idx).write_bytes(u8::MAX, 1); - addr_of_mut!((*ptr).chain1.idx_hash).write_bytes(u8::MAX, 1); - addr_of_mut!((*ptr).chain1.idx_prev).write_bytes(u8::MAX, 1); + /// See `ZopfliState` for more details. + unsafe fn state_init(nn: NonNull) { + let ptr = nn.as_ptr(); - // The initial hash value is just plain zero. - addr_of_mut!((*ptr).chain1.val).write(0); + // All the hash/index arrays default to `-1_i16` for `None`, which + // we can do efficiently by flipping all bits on. + addr_of_mut!((*ptr).chain1.hash_idx).write_bytes(u8::MAX, 1); + addr_of_mut!((*ptr).chain1.idx_hash).write_bytes(u8::MAX, 1); + addr_of_mut!((*ptr).chain1.idx_prev).write_bytes(u8::MAX, 1); - // The second chain is the same as the first, so we can simply copy - // it wholesale. - addr_of_mut!((*ptr).chain2).copy_from_nonoverlapping(addr_of!((*ptr).chain1), 1); + // The initial hash value is just plain zero. + addr_of_mut!((*ptr).chain1.val).write(0); - // The repetition counts default to zero. - addr_of_mut!((*ptr).same).write_bytes(0, 1); + // The second chain is the same as the first, so we can simply copy + // it wholesale. 
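The "flip all bits on" shortcut mentioned above works because a two's-complement `-1` is all ones, so byte-filling an `i16` array with `0xFF` is the same as filling it with the `-1` sentinel the hash chains use for "no entry yet". A tiny demonstration:

```rust
fn main() {
    let mut idx = [0_i16; 4];
    // Equivalent in effect to write_bytes(u8::MAX, 1) over the whole array.
    for v in idx.iter_mut() { *v = i16::from_ne_bytes([u8::MAX, u8::MAX]); }
    assert!(idx.iter().all(|&v| v == -1));
}
```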
+ addr_of_mut!((*ptr).chain2).copy_from_nonoverlapping(addr_of!((*ptr).chain1), 1); - // All set! - Box::from_raw(ptr) - } + // The repetition counts default to zero. + addr_of_mut!((*ptr).same).write_bytes(0, 1); } +} +impl ZopfliHash { /// # Reset/Warm Up. /// /// This sets all values to their defaults, then cycles the first chain's /// hash value once or twice, then hashes the bits between the start of the /// window and the start of the slice we're actually interested in, if any. - fn reset( - &mut self, - arr: &[u8], - instart: usize, - ) { + fn reset(&mut self, chunk: ZopfliChunk<'_>) { // Reset the data. self.chain1.reset(); self.chain2.reset(); self.same.fill(0); // Cycle the hash once or twice. - if instart >= arr.len() { return; } - let windowstart = instart.saturating_sub(ZOPFLI_WINDOW_SIZE); - self.update_hash_value(arr[windowstart]); - if windowstart + 1 < arr.len() { - self.update_hash_value(arr[windowstart + 1]); - } + let (a, b) = chunk.warmup_values(); + self.update_hash_value(a); + if let Some(b) = b { self.update_hash_value(b); } - // Process the values between windowstart and instart. - for i in windowstart..instart { self.update_hash(&arr[i..], i); } + // Process the values between windowstart and instart, if any. + if let Some(iter) = chunk.reducing_prelude_iter() { + for chunk2 in iter { self.update_hash(chunk2); } + } } #[allow( @@ -366,23 +423,24 @@ impl ZopfliHash { )] /// # Update Hash. /// - /// This updates the hash tables using the data from `arr`. The `pos` value - /// marks the position of `arr` within the original block slice. (That is, - /// `arr` is pre-sliced to `arr[pos..]` before being passed to this method.) - fn update_hash(&mut self, arr: &[u8], pos: usize) { + /// This updates the hash tables using the chunk's block data. + fn update_hash(&mut self, chunk: ZopfliChunk<'_>) { + let pos = chunk.pos(); let hpos = pos & ZOPFLI_WINDOW_MASK; // Cycle the first hash. + let arr = chunk.block(); self.update_hash_value(arr.get(ZOPFLI_MIN_MATCH - 1).map_or(0, |v| *v)); self.chain1.update_hash(pos); // Count up the repetitions (and update sameness). + let current = chunk.first(); let mut amount = self.same[pos.wrapping_sub(1) & ZOPFLI_WINDOW_MASK] .saturating_sub(1); while amount < u16::MAX && usize::from(amount) + 1 < arr.len() && - arr[0] == arr[usize::from(amount) + 1] + current == arr[usize::from(amount) + 1] { amount += 1; } @@ -396,6 +454,8 @@ impl ZopfliHash { /// # Update Hash Value. /// /// This updates the rotating (chain1) hash value. + /// + /// Note: the value will always fit within the equivalent of `u15`. fn update_hash_value(&mut self, c: u8) { self.chain1.val = ((self.chain1.val << HASH_SHIFT) ^ i16::from(c)) & HASH_MASK; } @@ -412,40 +472,107 @@ impl ZopfliHash { /// /// Note: the costs really do need to be calculated in 64 bits, truncated /// to 32 bits for storage, then widened back to 64 bits for comparison. + /// Zopfli is evil! fn get_best_lengths( &mut self, - arr: &[u8], - instart: usize, - stats: Option<&SymbolStats>, + chunk: ZopfliChunk<'_>, + stats: &SymbolStats, costs: &mut [(f32, LitLen)], lmc: &mut MatchCache, ) -> Result<(), ZopfliError> { - // Costs and lengths are resized prior to this point; they should be - // one larger than the data of interest (and equal to each other). - debug_assert!(costs.len() == arr.len() - instart + 1); + /// # Minimum Cost Model (Non-Zero Distances). + fn minimum_cost(stats: &SymbolStats) -> f64 { + // Find the minimum length cost. 
+ let mut length_cost = f64::INFINITY; + for (lsym, lbits) in LENGTH_SYMBOLS.iter().copied().zip(LENGTH_SYMBOL_BITS_F.into_iter()).skip(3) { + let cost = lbits + stats.ll_symbols[lsym as usize]; + if cost < length_cost { length_cost = cost; } + } + + // Now find the minimum distance cost. + let mut dist_cost = f64::INFINITY; + for (bits, v) in MIN_COST_DISTANCES.iter().copied().zip(stats.d_symbols) { + let cost = f64::from(bits) + v; + if cost < dist_cost { dist_cost = cost; } + } + + // Add them together and we have our minimum. + length_cost + dist_cost + } + + /// # Adjusted Cost. + fn stat_cost(dist: u16, k: LitLen, stats: &SymbolStats) -> f64 { + if dist == 0 { stats.ll_symbols[k as usize] } + else { + let dsym = DISTANCE_SYMBOLS[(dist & 32_767) as usize]; + DISTANCE_BITS_F[dsym as usize] + + stats.d_symbols[dsym as usize] + + stats.ll_symbols[LENGTH_SYMBOLS[k as usize] as usize] + + LENGTH_SYMBOL_BITS_F[k as usize] + } + } + + // The costs are sized according to the (relevant) array slice; they + // should always be exactly one larger. + if costs.len() != chunk.block_size().get() + 1 { + return Err(zopfli_error!()); + } + + // Iterators will help us avoid a bunch of unsafe. + let instart = chunk.pos(); + let mut iter = chunk.reducing_block_iter().zip( + ReducingSlices::new(Cell::from_mut(costs).as_slice_of_cells()) + ); let mut length = LitLen::L000; let mut distance = 0_u16; let mut sublen = ZEROED_SUBLEN; - // Find the minimum and maximum cost. - let min_cost = stats.map_or(12.0, get_minimum_cost); + // Find the minimum and symbol costs, which we'll need to reference + // repeatedly in the loop. + let min_cost = minimum_cost(stats); + let symbol_cost = stats.ll_symbols[285] + stats.d_symbols[0]; + + while let Some((mut chunk2, mut cost2)) = iter.next() { + debug_assert_eq!(chunk2.block_size().get() + 1, cost2.len()); - let mut i = instart; - while i < arr.len() { // Hash the remainder. - self.update_hash(&arr[i..], i); + self.update_hash(chunk2); + + let pos = chunk2.pos(); + if + // We have more than ZOPFLI_MAX_MATCH entries behind us, and twice + // twice as many ahead of us. + pos > instart + ZOPFLI_MAX_MATCH + 1 && + chunk2.block_size().get() > ZOPFLI_MAX_MATCH * 2 + 1 && + // The current and max-match-ago positions have long repetitions. + self.same[pos & ZOPFLI_WINDOW_MASK] > ZOPFLI_MAX_MATCH as u16 * 2 && + self.same[(pos - ZOPFLI_MAX_MATCH) & ZOPFLI_WINDOW_MASK] > ZOPFLI_MAX_MATCH as u16 + { + // Fast forward! + let before = pos; + for (chunk3, cost3) in iter.by_ref().take(ZOPFLI_MAX_MATCH) { + // Safety: arr2.len() has at least ZOPFLI_MAX_MATCH*2+1 + // remaining entries; cost2.len() will be at least one + // more than that. + if cost2.len() <= ZOPFLI_MAX_MATCH { crate::unreachable(); } + cost2[ZOPFLI_MAX_MATCH].set(( + (f64::from(cost2[0].get().0) + symbol_cost) as f32, + LitLen::MAX_MATCH, + )); + cost2 = cost3; // The costs are rotated _after_ updating… + + chunk2 = chunk3; // …but the array is rotated beforehand. + self.update_hash(chunk2); + } - // We're in a long repetition of the same character and have more - // than ZOPFLI_MAX_MATCH ahead of and behind us. - if self._get_best_lengths_max_match(instart, i, stats, arr, costs) { - i += ZOPFLI_MAX_MATCH; + debug_assert_eq!(chunk2.pos() - before, ZOPFLI_MAX_MATCH); + debug_assert_eq!(chunk2.block_size().get() + 1, cost2.len()); } // Find the longest remaining match. 
self.find( - arr, - i, + chunk2, LitLen::MAX_MATCH, &mut Some(&mut sublen), &mut distance, @@ -454,44 +581,40 @@ impl ZopfliHash { Some(instart), )?; - // Relative position for the costs and lengths, which have - // (iend - istart + 1) entries, so j is always in range when i is. - let j = i - instart; - - // This should never trigger; it is mainly a reminder to the - // compiler that our i/j indices are still applicable. - if i >= arr.len() || j + 1 >= costs.len() { break; } - - let cost_j = f64::from(costs[j].0); - let new_cost = stats.map_or_else( - || if arr[i] <= 143 { 8.0 } else { 9.0 }, - |s| s.ll_symbols[usize::from(arr[i])], - ) + cost_j; - debug_assert!(0.0 <= new_cost); + // Safety: the MAX loop (if it ran at all) only advanced the + // slices ZOPFLI_MAX_MATCH; we have more work to do! + if cost2.len() < 2 { crate::unreachable(); } // Update it if lower. - if new_cost < f64::from(costs[j + 1].0) { - costs[j + 1].0 = new_cost as f32; - costs[j + 1].1 = LitLen::L001; + let cost_j = f64::from(cost2[0].get().0); + let new_cost = stats.ll_symbols[usize::from(chunk2.first())] + cost_j; + if new_cost < f64::from(cost2[1].get().0) { + cost2[1].set((new_cost as f32, LitLen::L001)); } // If a long match was found, peek forward to recalculate those // costs, at least the ones who could benefit from the expense of // all that effort. - let limit = length.min_usize(costs.len().saturating_sub(j + 1)); + let limit = length.min_usize(cost2.len() - 1); if limit.is_matchable() { - let sublen2 = &sublen[ZOPFLI_MIN_MATCH..=limit as usize]; - let costs2 = &mut costs[j + ZOPFLI_MIN_MATCH..]; - if let Some(s) = stats { - peek_ahead_stats(cost_j, min_cost, sublen2, costs2, s); - } - else { - peek_ahead_fixed(cost_j, min_cost, sublen2, costs2); + let min_cost_add = min_cost + cost_j; + + // Safety: limit is capped to cost2.len() - 1. + if cost2.len() <= (limit as usize) { crate::unreachable(); } + + for ((dist, c), k) in sublen[ZOPFLI_MIN_MATCH..=limit as usize].iter() + .copied() + .zip(&cost2[ZOPFLI_MIN_MATCH..=limit as usize]) + .zip(LitLen::matchable_iter()) + { + let current_cost = f64::from(c.get().0); + if min_cost_add < current_cost { + // Update it if lower. + let new_cost = cost_j + stat_cost(dist, k, stats); + if new_cost < current_cost { c.set((new_cost as f32, k)); } + } } } - - // Back around again! - i += 1; } // All costs should have been updated… @@ -500,64 +623,136 @@ impl ZopfliHash { } #[allow(clippy::cast_possible_truncation)] - /// # Best Length Max Match. - /// - /// This fast-forwards through long repetitions in the middle of a - /// `ZopfliHash::get_best_lengths` block, processing `ZOPFLI_MAX_MATCH` - /// `arr` and `costs` entries in one go. + #[inline(never)] + /// # Get Best Lengths (Fixed). /// - /// Returns `true` if such a match was found so the indices can be - /// incremented accordingly on the caller's side. - fn _get_best_lengths_max_match( + /// Same as `ZopfliHash::get_best_lengths`, but simpler fixed-tree lengths + /// and symbols are used instead of variable store-specific data. + fn get_best_lengths_fixed( &mut self, - instart: usize, - mut pos: usize, - stats: Option<&SymbolStats>, - arr: &[u8], + chunk: ZopfliChunk<'_>, costs: &mut [(f32, LitLen)], - ) -> bool { - if - // We have more than ZOPFLI_MAX_MATCH entries behind us, and twice - // twice as many ahead of us. - pos > instart + ZOPFLI_MAX_MATCH + 1 && - arr.len() > pos + ZOPFLI_MAX_MATCH * 2 + 1 && - // The current and max-match-ago positions have long repetitions. 
- self.same[pos & ZOPFLI_WINDOW_MASK] > ZOPFLI_MAX_MATCH as u16 * 2 && - self.same[(pos - ZOPFLI_MAX_MATCH) & ZOPFLI_WINDOW_MASK] > ZOPFLI_MAX_MATCH as u16 - { - // The symbol cost for ZOPFLI_MAX_LENGTH (and a distance of 1) doesn't - // need mutch calculation. - let symbol_cost = stats.map_or( - 13.0, - |s| s.ll_symbols[285] + s.d_symbols[0], - ); + lmc: &mut MatchCache, + ) -> Result<(), ZopfliError> { + /// # Adjusted Cost. + /// + /// These are really tiny so we might as well use single-byte math. + const fn fixed_cost(dist: u16, k: LitLen) -> u8 { + use super::{DISTANCE_BITS, LENGTH_SYMBOL_BITS}; + + if dist == 0 { 8 + (143 < (k as u16)) as u8 } + else { + let dsym = DISTANCE_SYMBOLS[(dist & 32_767) as usize]; + DISTANCE_BITS[dsym as usize] + + LENGTH_SYMBOL_BITS[k as usize] + + (114 < (k as u16)) as u8 + + 12 + } + } + + // The costs are sized according to the (relevant) array slice; they + // should always be exactly one larger. + if costs.len() != chunk.block_size().get() + 1 { + return Err(zopfli_error!()); + } + + // Iterators will help us avoid a bunch of unsafe. + let instart = chunk.pos(); + let mut iter = chunk.reducing_block_iter().zip( + ReducingSlices::new(Cell::from_mut(costs).as_slice_of_cells()) + ); + + let mut length = LitLen::L000; + let mut distance = 0_u16; + let mut sublen = ZEROED_SUBLEN; + + while let Some((mut chunk2, mut cost2)) = iter.next() { + debug_assert_eq!(chunk2.block_size().get() + 1, cost2.len()); + + // Hash the remainder. + self.update_hash(chunk2); + + let pos = chunk2.pos(); + if + // We have more than ZOPFLI_MAX_MATCH entries behind us, and twice + // twice as many ahead of us. + pos > instart + ZOPFLI_MAX_MATCH + 1 && + chunk2.block_size().get() > ZOPFLI_MAX_MATCH * 2 + 1 && + // The current and max-match-ago positions have long repetitions. + self.same[pos & ZOPFLI_WINDOW_MASK] > ZOPFLI_MAX_MATCH as u16 * 2 && + self.same[(pos - ZOPFLI_MAX_MATCH) & ZOPFLI_WINDOW_MASK] > ZOPFLI_MAX_MATCH as u16 + { + // Fast forward! + let before = pos; + for (chunk3, cost3) in iter.by_ref().take(ZOPFLI_MAX_MATCH) { + // Safety: arr2.len() has at least ZOPFLI_MAX_MATCH*2+1 + // remaining entries; cost2.len() will be at least one + // more than that. + if cost2.len() <= ZOPFLI_MAX_MATCH { crate::unreachable(); } + cost2[ZOPFLI_MAX_MATCH].set(( + (f64::from(cost2[0].get().0) + 13.0) as f32, + LitLen::MAX_MATCH, + )); + cost2 = cost3; // The costs are rotated _after_ updating… + + chunk2 = chunk3; // …but the array is rotated beforehand. + self.update_hash(chunk2); + } - // We'll need to read data from one portion of the slice and add it - // to data in another portion. Index-based access confusing the - // compiler, so to avoid a bunch of "unsafe", we'll work with a - // slice-of-cells representation instead. - let costs = Cell::from_mut(costs).as_slice_of_cells(); - - // Fast forward! - let before = pos; - let mut iter = costs.windows(ZOPFLI_MAX_MATCH + 1).skip(pos - instart).take(ZOPFLI_MAX_MATCH); - while let Some([a, _rest @ .., z]) = iter.next() { - z.set(( - (f64::from(a.get().0) + symbol_cost) as f32, - LitLen::MAX_MATCH, - )); - pos += 1; - self.update_hash(&arr[pos..], pos); + debug_assert_eq!(chunk2.pos() - before, ZOPFLI_MAX_MATCH); + debug_assert_eq!(chunk2.block_size().get() + 1, cost2.len()); } - // We should never not hit our desired take() because the lengths - // of arr and cost are fixed and intertwined, but it's a good debug - // sort of thing to check. - debug_assert_eq!(pos - before, ZOPFLI_MAX_MATCH); + // Find the longest remaining match. 
+ self.find( + chunk2, + LitLen::MAX_MATCH, + &mut Some(&mut sublen), + &mut distance, + &mut length, + lmc, + Some(instart), + )?; - true + // Safety: the MAX loop (if it ran at all) only advanced the + // slices ZOPFLI_MAX_MATCH; we have more work to do! + if cost2.len() < 2 { crate::unreachable(); } + + // Update it if lower. + let cost_j = f64::from(cost2[0].get().0); + let new_cost = if chunk2.first() <= 143 { 8.0 } else { 9.0 } + cost_j; + if new_cost < f64::from(cost2[1].get().0) { + cost2[1].set((new_cost as f32, LitLen::L001)); + } + + // If a long match was found, peek forward to recalculate those + // costs, at least the ones who could benefit from the expense of + // all that effort. + let limit = length.min_usize(cost2.len() - 1); + if limit.is_matchable() { + let min_cost_add = 8.0 + cost_j; + + // Safety: limit is capped to cost2.len() - 1. + if cost2.len() <= (limit as usize) { crate::unreachable(); } + + for ((dist, c), k) in sublen[ZOPFLI_MIN_MATCH..=limit as usize].iter() + .copied() + .zip(&cost2[ZOPFLI_MIN_MATCH..=limit as usize]) + .zip(LitLen::matchable_iter()) + { + let current_cost = f64::from(c.get().0); + if min_cost_add < current_cost { + // Update it if lower. + let new_cost = cost_j + f64::from(fixed_cost(dist, k)); + if new_cost < current_cost { c.set((new_cost as f32, k)); } + } + } + } } - else { false } + + // All costs should have been updated… + debug_assert!(costs.iter().all(|(cost, _)| (0.0..1E30).contains(cost))); + Ok(()) } #[allow(clippy::cast_possible_truncation)] @@ -567,22 +762,21 @@ impl ZopfliHash { /// squeeze-based path lengths. The store is updated with the results. fn follow_paths( &mut self, - arr: &[u8], - instart: usize, + chunk: ZopfliChunk<'_>, paths: &[LitLen], store: &mut LZ77Store, lmc: &mut MatchCache, ) -> Result<(), ZopfliError> { - // Easy abort. - if instart >= arr.len() { return Ok(()); } - // Reset the hash. - self.reset(arr, instart); + self.reset(chunk); // Hash the path symbols. - let mut i = instart; - for length in paths.iter().copied() { - self.update_hash(&arr[i..], i); + let instart = chunk.pos(); + let mut len_iter = paths.iter().copied(); + let mut arr_iter = chunk.reducing_block_iter(); + while let Some((length, chunk2)) = len_iter.next().zip(arr_iter.next()) { + // Hash it. + self.update_hash(chunk2); // Follow the matches! if length.is_matchable() { @@ -592,8 +786,7 @@ impl ZopfliHash { let mut test_length = LitLen::L000; let mut dist = 0; self.find( - arr, - i, + chunk2, length, &mut None, &mut dist, @@ -608,20 +801,17 @@ impl ZopfliHash { } // Add it to the store. - store.push(length, dist, i); + store.push(length, dist, chunk2.pos()); // Hash the rest of the match. - for _ in 1..(length as u16) { - i += 1; - self.update_hash(&arr[i..], i); + for chunk2 in arr_iter.by_ref().take(length as usize - 1) { + self.update_hash(chunk2); } } // It isn't matchable; add it directly to the store. else { - store.push(LitLen::from_u8(arr[i]), 0, i); + store.push(LitLen::from_u8(chunk2.first()), 0, chunk2.pos()); } - - i += 1; } Ok(()) @@ -632,16 +822,15 @@ impl ZopfliHash { #[allow(clippy::too_many_arguments)] /// # Find Longest Match. /// - /// This finds the longest match in `arr` (and/or the cache), setting the - /// passed `sublen`/`distance`/`length` values accordingly. + /// This finds the longest match in the chunk (and/or the cache), setting + /// the provided `sublen`/`distance`/`length` values accordingly. 
/// /// Lengths will never exceed `limit` nor `ZOPFLI_MAX_MATCH`, but they - /// might be _less_ than `ZOPFLI_MIN_MATCH`, especially near the end of a - /// slice. + /// might be _less_ than `ZOPFLI_MIN_MATCH`, especially as we near the end + /// of the block slice. fn find( &self, - arr: &[u8], - pos: usize, + chunk: ZopfliChunk<'_>, mut limit: LitLen, sublen: &mut Option<&mut [u16; SUBLEN_LEN]>, distance: &mut u16, @@ -650,6 +839,7 @@ impl ZopfliHash { cache: Option, ) -> Result<(), ZopfliError> { // Check the longest match cache first! + let pos = chunk.pos(); if let Some(blockstart) = cache { if lmc.find( pos - blockstart, @@ -658,14 +848,14 @@ impl ZopfliHash { distance, length, )? { - if pos + (*length as usize) <= arr.len() { return Ok(()); } + if (*length as usize) <= chunk.block_size().get() { return Ok(()); } return Err(zopfli_error!()); } } // We'll need at least ZOPFLI_MIN_MATCH bytes for a search; if we don't // have it, zero everything out and call it a day. - if pos + ZOPFLI_MIN_MATCH > arr.len() { + if ZOPFLI_MIN_MATCH > chunk.block_size().get() { *length = LitLen::L000; *distance = 0; return Ok(()); @@ -673,10 +863,10 @@ impl ZopfliHash { // Cap the limit to fit if needed. Note that limit will always be at // least one even if capped since pos < size. - limit = limit.min_usize(arr.len() - pos); + limit = limit.min_usize(chunk.block_size().get()); // Calculate the best distance and length. - let (bestdist, bestlength) = self.find_loop(arr, pos, limit, sublen)?; + let (bestdist, bestlength) = self.find_loop(chunk, limit, sublen); // Cache the results for next time, maybe. if limit.is_max() { @@ -690,7 +880,7 @@ impl ZopfliHash { // Update the values. *distance = bestdist; *length = bestlength; - if pos + (*length as usize) <= arr.len() { Ok(()) } + if (*length as usize) <= chunk.block_size().get() { Ok(()) } else { Err(zopfli_error!()) } } @@ -703,26 +893,26 @@ impl ZopfliHash { )] /// # Find Longest Match Loop. /// - /// This method is the (nasty-looking) workhorse of the above + /// This method is a (nasty-looking) workhorse for the above /// `ZopfliHash::find` method. It finds and returns the matching distance /// and length, or `(0, 1)` if none. fn find_loop( &self, - arr: &[u8], - pos: usize, + chunk: ZopfliChunk<'_>, limit: LitLen, sublen: &mut Option<&mut [u16; SUBLEN_LEN]>, - ) -> Result<(u16, LitLen), ZopfliError> { + ) -> (u16, LitLen) { /// # Distance Given Positions. const fn ppp_distance(p: usize, pp: usize) -> usize { if p < pp { pp - p } else { ZOPFLI_WINDOW_SIZE + pp - p } } - // This is asserted by find() too, but it's a good reminder. - if arr.len() <= pos { return Err(zopfli_error!()); } - let right = &arr[pos..]; + // Prepopulate some slices to work with directly later on. + let arr = chunk.arr(); + let right = chunk.block(); + let pos = chunk.pos(); let hpos = pos & ZOPFLI_WINDOW_MASK; // The default distance and length. We'll be wanting 16-bit values for @@ -771,9 +961,7 @@ impl ZopfliHash { // verified it was non-empty, but the compiler will have // forgotten that by now. let left = unsafe { arr.get_unchecked(pos - dist..pos - dist + right.len()) }; - if right.is_empty() || left.len() != right.len() { - unsafe { core::hint::unreachable_unchecked(); } - } + if right.is_empty() || left.len() != right.len() { crate::unreachable(); } // Check to see if we can do better than we've already done. 
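
// Aside: a standalone sketch (not part of the patch) of the ring-buffer
// arithmetic in `ppp_distance` above. Window positions are indices into a
// 32,768-slot ring (`ZOPFLI_WINDOW_SIZE`), so the gap between two indices is
// either their plain difference or, when that difference would be negative
// because the ring has wrapped, the same difference shifted up by one full
// window.
fn main() {
    const WINDOW: usize = 32_768; // ZOPFLI_WINDOW_SIZE

    const fn ppp_distance(p: usize, pp: usize) -> usize {
        if p < pp { pp - p } else { WINDOW + pp - p }
    }

    // Plain difference.
    assert_eq!(ppp_distance(10, 15), 5);

    // Wrapped: 15 -> 10 would be "negative" going this direction, so the
    // distance lands a full window higher.
    assert_eq!(ppp_distance(15, 10), WINDOW - 5);
}
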
if (bestlength as usize) >= right.len() || right[bestlength as usize] == left[bestlength as usize] { @@ -842,8 +1030,8 @@ impl ZopfliHash { } // Thus concludes the long-ass loop! // Return the distance and length values. - if (bestlength as u16) <= (limit as u16) { Ok((bestdist as u16, bestlength)) } - else { Ok((0, LitLen::L001)) } + if (bestlength as u16) <= (limit as u16) { (bestdist as u16, bestlength) } + else { (0, LitLen::L001) } } } @@ -856,11 +1044,13 @@ impl ZopfliHash { /// positions. /// /// Written values are all in the range of `0..=i16::MAX`, matching the array -/// sizes, elminating bounds checking on the upper end. +/// sizes, elminating bounds checking on the upper end. (They're effectively +/// `u15`.) /// /// The remaining "sign" bit is logically repurposed to serve as a sort of -/// `None`, allowing us to cheaply identify unwritten values. (Testing for that -/// takes care of bounds checking on the lower end.) +/// `None` flag, allowing us to cheaply identify uninitialized values. +/// (And by testing for that, we eliminate bounds checks on the lower end of +/// the range.) struct ZopfliHashChain { /// Hash value to (most recent) index. /// @@ -925,98 +1115,6 @@ impl ZopfliHashChain { -/// # Minimum Cost Model. -/// -/// This returns the minimum _statistical_ cost, which is the sum of the -/// minimum length cost and minimum distance cost. -fn get_minimum_cost(stats: &SymbolStats) -> f64 { - // Find the minimum length cost. - let mut length_cost = f64::INFINITY; - for (lsym, lbits) in LENGTH_SYMBOLS.into_iter().zip(LENGTH_SYMBOL_BITS.into_iter()).skip(3) { - let cost = f64::from(lbits) + stats.ll_symbols[lsym as usize]; - if cost < length_cost { length_cost = cost; } - } - - // Now find the minimum distance cost. - let mut dist_cost = f64::INFINITY; - for (bits, v) in MIN_COST_DISTANCES.iter().copied().zip(stats.d_symbols) { - let cost = f64::from(bits) + v; - if cost < dist_cost { dist_cost = cost; } - } - - // Add them together and we have our minimum. - length_cost + dist_cost -} - -#[allow(clippy::cast_possible_truncation)] -/// # Get Best Lengths Peek Ahead (Fixed). -fn peek_ahead_fixed( - cost_j: f64, - min_cost: f64, - sublen: &[u16], - costs: &mut [(f32, LitLen)], -) { - let min_cost_add = min_cost + cost_j; - for ((dist, c), k) in sublen.iter().copied().zip(costs).zip(LitLen::matchable_iter()) { - if min_cost_add < f64::from(c.0) { - let mut new_cost = cost_j; - if dist == 0 { - if (k as u16) <= 143 { new_cost += 8.0; } - else { new_cost += 9.0; } - } - else { - if 114 < (k as u16) { new_cost += 13.0; } - else { new_cost += 12.0; } - - let dsym = DISTANCE_SYMBOLS[(dist & 32_767) as usize]; - new_cost += f64::from(DISTANCE_BITS[dsym as usize]); - new_cost += f64::from(LENGTH_SYMBOL_BITS[k as usize]); - } - - // Update it if lower. - if (0.0..f64::from(c.0)).contains(&new_cost) { - c.0 = new_cost as f32; - c.1 = k; - } - } - } -} - -#[allow(clippy::cast_possible_truncation)] -/// # Get Best Lengths Peek Ahead (Dynamic). 
-fn peek_ahead_stats( - cost_j: f64, - min_cost: f64, - sublen: &[u16], - costs: &mut [(f32, LitLen)], - stats: &SymbolStats, -) { - let min_cost_add = min_cost + cost_j; - for ((dist, c), k) in sublen.iter().copied().zip(costs).zip(LitLen::matchable_iter()) { - if min_cost_add < f64::from(c.0) { - let mut new_cost = cost_j; - if dist == 0 { - new_cost += stats.ll_symbols[k as usize]; - } - else { - let dsym = DISTANCE_SYMBOLS[(dist & 32_767) as usize]; - new_cost += f64::from(DISTANCE_BITS[dsym as usize]); - new_cost += stats.d_symbols[dsym as usize]; - new_cost += stats.ll_symbols[LENGTH_SYMBOLS[k as usize] as usize]; - new_cost += f64::from(LENGTH_SYMBOL_BITS[k as usize]); - } - - // Update it if lower. - if (0.0..f64::from(c.0)).contains(&new_cost) { - c.0 = new_cost as f32; - c.1 = k; - } - } - } -} - - - #[cfg(test)] mod tests { use super::*; @@ -1024,8 +1122,8 @@ mod tests { #[test] fn t_fixed_cost() { // Get the largest dbit and lbit values. - let d_max: u8 = DISTANCE_BITS.into_iter().max().unwrap(); - let l_max: u8 = LENGTH_SYMBOL_BITS.into_iter().max().unwrap(); + let d_max: u8 = super::super::DISTANCE_BITS.into_iter().max().unwrap(); + let l_max: u8 = super::super::LENGTH_SYMBOL_BITS.into_iter().max().unwrap(); // Make sure their sum (along with the largest base) fits within // the u8 space, since that's what we're using at runtime. diff --git a/flapfli/src/zopflipng/iter.rs b/flapfli/src/zopflipng/iter.rs new file mode 100644 index 0000000..1dc5f2b --- /dev/null +++ b/flapfli/src/zopflipng/iter.rs @@ -0,0 +1,74 @@ +/*! +# Flapfli: Miscellaneous Iterators. +*/ + + + +/// # Reducing Slice Iterator. +/// +/// This iterator yields all non-empty slices spanning `n..`, incrementing `n` +/// by one after each cycle. +/// +/// In other words, this will start by returning the original slice, then `1..`, +/// `2..`, `3..`, etc., stopping when empty. +pub(super) struct ReducingSlices<'a, T>(&'a [T]); + +impl<'a, T> ReducingSlices<'a, T> { + /// # New. + pub(super) const fn new(arr: &'a [T]) -> Self { Self(arr) } +} + +impl<'a, T> Iterator for ReducingSlices<'a, T> { + type Item = &'a [T]; + + fn next(&mut self) -> Option { + if let [_, rest @ ..] 
= &self.0 { + Some(std::mem::replace(&mut self.0, rest)) + } + else { None } + } + + fn size_hint(&self) -> (usize, Option) { + let len = self.0.len(); + (len, Some(len)) + } +} + +impl<'a, T> ExactSizeIterator for ReducingSlices<'a, T> { + #[inline] + fn len(&self) -> usize { self.0.len() } +} + + + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn t_reducing_slices() { + let slice: &[u8] = &[0, 1, 2, 3, 4, 5]; + let mut reducing = ReducingSlices::new(slice); + + assert_eq!(reducing.len(), slice.len()); + assert_eq!(reducing.next(), Some(slice)); + + assert_eq!(reducing.len(), 5); + assert_eq!(reducing.next(), Some(&slice[1..])); + + assert_eq!(reducing.len(), 4); + assert_eq!(reducing.next(), Some(&slice[2..])); + + assert_eq!(reducing.len(), 3); + assert_eq!(reducing.next(), Some(&slice[3..])); + + assert_eq!(reducing.len(), 2); + assert_eq!(reducing.next(), Some(&slice[4..])); + + assert_eq!(reducing.len(), 1); + assert_eq!(reducing.next(), Some(&slice[5..])); + + assert_eq!(reducing.len(), 0); + assert_eq!(reducing.next(), None); + } +} diff --git a/flapfli/src/zopflipng/kat.rs b/flapfli/src/zopflipng/kat.rs index 04fa013..4514419 100644 --- a/flapfli/src/zopflipng/kat.rs +++ b/flapfli/src/zopflipng/kat.rs @@ -12,10 +12,7 @@ use std::{ handle_alloc_error, Layout, }, - cell::{ - Cell, - RefCell, - }, + cell::Cell, cmp::Ordering, num::{ NonZeroU32, @@ -27,6 +24,7 @@ use super::{ ArrayD, ArrayLL, DeflateSym, + DeflateSymBasic, zopfli_error, ZOPFLI_NUM_D, ZOPFLI_NUM_LL, @@ -37,12 +35,15 @@ use super::{ #[allow(unsafe_code)] -const NZ1: NonZeroU32 = unsafe { NonZeroU32::new_unchecked(1) }; +/// # One is Non-Zero. +const NZ01: NonZeroU32 = unsafe { NonZeroU32::new_unchecked(1) }; #[allow(unsafe_code)] -const NZ2: NonZeroU32 = unsafe { NonZeroU32::new_unchecked(2) }; +/// # Two is Non-Zero. +const NZ02: NonZeroU32 = unsafe { NonZeroU32::new_unchecked(2) }; #[allow(unsafe_code)] +/// # Fourteen is Non-Zero. const NZ14: NonZeroU32 = unsafe { NonZeroU32::new_unchecked(14) }; /// # Zero-Filled Tree Counts. @@ -56,12 +57,7 @@ thread_local!( /// The length-limited-code-length methods need to temporarily store /// thousands of `Node` objects. Using a thread-local share for that cuts /// way down on the number of allocations we'd otherwise have to make! - static KATSCRATCH: KatScratch = KatScratch::new(); - - /// # Shared Tree Scratch. - /// - /// Similar idea as above, but for tree sizing/writing. - static TREESCRATCH: RefCell = const { RefCell::new(TreeScratch::new()) }; + static KATSCRATCH: KatScratch = KatScratch::new() ); @@ -83,7 +79,8 @@ mod sealed { /// # Crunch the Code Lengths. /// /// This method serves as the closure for the caller's call to - /// `KATSCRATCH.with_borrow_mut()`. + /// `KATSCRATCH.with_borrow_mut()`. It does all that needs doing to get + /// the desired length-limited data into the provided `bitlengths`. fn _llcl<'a>( frequencies: &'a [u32; N], bitlengths: &'a [Cell; N], @@ -101,14 +98,11 @@ mod sealed { leaves[0].frequency, leaves[1].frequency, ); - #[allow(unsafe_code)] - if lists.len() < 2 { - // Safety: `usize::min(MAXBITS, leaves.len() - 1)` (above) is - // how many lists we'll have, and since MAXBITS is at least - // seven and leaves.len() at least three, we'll always have at - // least two lists to work with. 
- unsafe { core::hint::unreachable_unchecked(); } - } + // Safety: `usize::min(MAXBITS, leaves.len() - 1)` (above) is + // how many lists we'll have, and since MAXBITS is at least + // seven and leaves.len() at least three, we'll always have at + // least two lists to work with. + if lists.len() < 2 { crate::unreachable(); } // In the last list, (2 * len_leaves - 2) active chains need to be // created. We have two already from initialization; each boundary_pm run @@ -124,6 +118,9 @@ mod sealed { #[inline] /// # Write Code Lengths! + /// + /// This is the final stage of the LLCL chain, where the results are + /// finally recorded! fn llcl_write(mut node: Node, leaves: &[Leaf<'_>]) -> Result<(), ZopfliError> { // Make sure we counted correctly before doing anything else. let mut last_count = node.count; @@ -251,24 +248,117 @@ pub(crate) fn best_tree_size( ll_lengths: &ArrayLL, d_lengths: &ArrayD, ) -> Result<(u8, NonZeroU32), ZopfliError> { - TREESCRATCH.with_borrow_mut(|t| - t.with_symbols(ll_lengths, d_lengths).best_tree() - ) + // Drop the last two zeroes plus any trailing zeroes, then merge them + // together into a single collection. + let all: Vec = { + let mut ll_lengths = &ll_lengths[..286]; + while let [rest @ .., DeflateSym::D00] = ll_lengths { + ll_lengths = rest; + if ll_lengths.len() == 257 { break; } // Keep all literals. + } + + let mut d_lengths = &d_lengths[..30]; + while let [rest @ .., DeflateSym::D00] = d_lengths { d_lengths = rest; } + + [ll_lengths, d_lengths].concat() + }; + + // Our targets! + let mut best_extra = 0; + let mut best_size = NonZeroU32::MAX; + + for extra in 0..8 { + let cl_counts = best_tree_size_counts(&all, extra); + let cl_lengths = cl_counts.llcl()?; + let hclen = tree_hclen(&cl_counts); + + // We can finally calculate the size! + let mut size = (hclen as u32 + 4) * 3; + size += cl_lengths.iter() + .copied() + .zip(cl_counts.iter().copied()) + .map(|(a, b)| (a as u32) * b) + .sum::(); + size += cl_counts[16] * 2; // Extra bits. + size += cl_counts[17] * 3; + size += cl_counts[18] * 7; + let size = NZ14.saturating_add(size); + + // If better, keep it! + if size < best_size { + best_extra = extra; + best_size = size; + } + } + + // Done! + Ok((best_extra, best_size)) } /// # Encode Tree. /// -/// This finds the index that produces the smallest tree size, then writes -/// that table's bits to the output. +/// This writes the best-found tree data to `out`. pub(crate) fn encode_tree( ll_lengths: &ArrayLL, d_lengths: &ArrayD, extra: u8, out: &mut ZopfliOut, ) -> Result<(), ZopfliError> { - TREESCRATCH.with_borrow_mut(|t| - t.with_symbols(ll_lengths, d_lengths).write_tree(extra, out) - ) + // Drop the last two zeroes plus any trailing zeroes, then merge them + // together into a single collection. + let mut hlit: u32 = 29; + let mut hdist: u32 = 29; + let all: Vec = { + let mut ll_lengths = &ll_lengths[..286]; + while let [rest @ .., DeflateSym::D00] = ll_lengths { + ll_lengths = rest; + hlit -= 1; + if ll_lengths.len() == 257 { break; } // Keep all literals. + } + + let mut d_lengths = &d_lengths[..30]; + while let [rest @ .., DeflateSym::D00] = d_lengths { + d_lengths = rest; + hdist -= 1; + } + + [ll_lengths, d_lengths].concat() + }; + + // We'll need to store some RLE symbols and positions too. 
+ let mut rle: Vec<(DeflateSym, u16)> = Vec::new(); + + let cl_counts = encode_tree_counts(&all, &mut rle, extra); + let cl_lengths = cl_counts.llcl()?; + let hclen = tree_hclen(&cl_counts); + let cl_symbols = <[u32; 19]>::llcl_symbols(&cl_lengths); + + // Write the main lengths. + out.add_fixed_bits::<5>(hlit); + out.add_fixed_bits::<5>(hdist); + out.add_fixed_bits::<4>(hclen as u32); + + // Write each cl_length in the jumbled DEFLATE order. + for &o in &DeflateSym::TREE[..hclen as usize + 4] { + out.add_fixed_bits::<3>(cl_lengths[o as usize] as u32); + } + + // Write each symbol in order of appearance along with its extra bits, + // if any. + for (a, b) in rle { + let symbol = cl_symbols[a as usize]; + out.add_huffman_bits(symbol, cl_lengths[a as usize] as u32); + + // Extra bits. + match a { + DeflateSym::D16 => { out.add_fixed_bits::<2>(u32::from(b)); }, + DeflateSym::D17 => { out.add_fixed_bits::<3>(u32::from(b)); }, + DeflateSym::D18 => { out.add_fixed_bits::<7>(u32::from(b)); }, + _ => {}, + } + } + + Ok(()) } @@ -277,6 +367,10 @@ pub(crate) fn encode_tree( /// /// This is a super-cheap arena-like structure for holding all the temporary /// data required for length-limited-code-length calculations. +/// +/// This requires doing some fairly un-Rust-like things, but that would be +/// equally true of any third-party structure as well, and since we know the +/// particulars in advance, we can do it leaner and meaner ourselves. struct KatScratch { leaves: NonNull, lists: NonNull, @@ -303,7 +397,8 @@ impl KatScratch { /// # Max Nodes. /// /// This represents the theoretical maximum number of nodes a length- - /// limiting pass might generate. + /// limiting pass might generate, though it is unlikely to ever be reached + /// in practice. (Better safe than sorry!) const MAX: usize = (2 * ZOPFLI_NUM_LL - 2) * 15; /// # Leaves Array Layout. @@ -318,12 +413,19 @@ impl KatScratch { #[allow(unsafe_code)] /// # New! /// - /// Return a new instance of self, allocated but uninitialized. + /// Return a new instance of self, allocated but **uninitialized**. /// /// Similar to other mega-array structures like `ZopfliHash`, its members /// are manually allocated from pointers to keep them off the stack. Unlike /// the others, though, the `KatScratch` members remain in pointer form to - /// prevent lifetime/borrow-checker confusion. + /// prevent subsequent lifetime/borrow-checker confusion. + /// + /// ## Safety + /// + /// New values are written from pointers without first reading or dropping + /// the previous values at that position, and references to the new values + /// are only made available after said write, eliminating any UB weirdness + /// from possibly-uninitialized data. fn new() -> Self { let leaves: NonNull = NonNull::new(unsafe { alloc(Self::LEAVES_LAYOUT) }) .unwrap_or_else(|| handle_alloc_error(Self::LEAVES_LAYOUT)); @@ -347,7 +449,7 @@ impl KatScratch { /// # Make Leaves. /// /// Join the non-zero frequencies with their corresponding bitlengths into - /// a collection of leaves, then return it sorted. + /// a collection of leaves. That collection is then sorted and returned. 
/// /// ## Safety /// @@ -408,7 +510,7 @@ impl KatScratch { let ptr = self.nodes.cast::().as_ptr(); ptr.write(Node { weight: weight1, - count: NZ1, + count: NZ01, tail: None, }); let lookahead0 = &*ptr; @@ -417,7 +519,7 @@ impl KatScratch { let ptr = ptr.add(1); ptr.write(Node { weight: weight2, - count: NZ2, + count: NZ02, tail: None, }); let lookahead1 = &*ptr; @@ -547,7 +649,10 @@ impl<'a> PartialOrd for Leaf<'a> { #[derive(Clone, Copy)] /// # List. /// -/// This struct holds a pair of recursive node chains. +/// This struct holds a pair of recursive node chains. The lifetimes are +/// technically static, but in practice are always scoped to the more limited +/// lifetime of the borrow. (`List`s are never accessible once the session that +/// birthed them has closed.) struct List { lookahead0: &'static Node, lookahead1: &'static Node, @@ -556,10 +661,14 @@ struct List { impl List { #[inline] /// # Rotate. + /// + /// Replace the first chain with a copy of the second. fn rotate(&mut self) { self.lookahead0 = self.lookahead1; } #[inline] /// # Weight Sum. + /// + /// Add and return the sum of the weights of the two chains. const fn weight_sum(&self) -> NonZeroU32 { self.lookahead0.weight.saturating_add(self.lookahead1.weight.get()) } @@ -569,6 +678,12 @@ impl List { #[derive(Clone, Copy)] /// # Node. +/// +/// This holds a weight and frequency pair, and possibly a reference to the +/// previous `Node` this one replaced. +/// +/// As with `List`, the static lifetime is technically true, but in practice +/// references will never extend beyond the current borrow. struct Node { weight: NonZeroU32, count: NonZeroU32, @@ -579,22 +694,22 @@ impl Node { #[inline] /// # Finish Last Node! /// - /// This method establishes the final tail that the subsequent writing - /// will start with. + /// This method creates and returns the final tail to be used as the + /// starting point for the subsequent `llcl_write` call. fn last(list_y: &List, list_z: &List, leaves: &[Leaf<'_>]) -> Self { // Figure out the final node! let last_count = list_z.lookahead1.count; let weight_sum = list_y.weight_sum(); if (last_count.get() as usize) < leaves.len() && leaves[last_count.get() as usize].frequency < weight_sum { Self { - weight: NZ1, // We'll never look at this value. + weight: NZ01, // We'll never look at this value. count: last_count.saturating_add(1), tail: list_z.lookahead1.tail, } } else { Self { - weight: NZ1, // We'll never look at this value. + weight: NZ01, // We'll never look at this value. count: last_count, tail: Some(list_y.lookahead1), } @@ -604,291 +719,150 @@ impl Node { -/// # Tree Scratch. +#[allow(unsafe_code)] +/// Array of Cells. /// -/// This holds a merged length-and-distance symbol set for tree sizing and -/// writing purposes. +/// Revisualize a mutable array as an array of cells. /// -/// This isn't nearly as large as most of the other zopfli structures, but -/// referenced frequently enough to justify storing it as a thread-local static -/// that can be reused willynilly. -struct TreeScratch { - symbols: [DeflateSym; Self::MAX], - hlit: usize, - hdist: usize, - - // Note: this should really be an array with the same count as `symbols`, - // but the compiler doesn't seem to like that, so whatever. - rle: Vec<(DeflateSym, u16)>, +/// TODO: use `Cell::as_array_of_cells` once that method is stabilized. +fn array_of_cells(arr: &mut [T; N]) -> &[Cell; N] { + let cells = Cell::from_mut(arr); + // Safety: `Cell` has the same memory layout as `T`. 
+ unsafe { &*(std::ptr::from_ref(cells).cast::<[Cell; N]>()) } } -impl TreeScratch { - /// The maximum number of symbols. - const MAX: usize = 29 + 257 + 29 + 1; - - /// # New. - const fn new() -> Self { - Self { - symbols: [DeflateSym::D00; Self::MAX], - hlit: 0, - hdist: 0, - rle: Vec::new(), +/// # Tree Counts. +/// +/// Populate and return the tree counts for `best_tree_size`. +fn best_tree_size_counts(all: &[DeflateSym], extra: u8) -> [u32; 19] { + let mut cl_counts = ZEROED_COUNTS_TREE; + let (use_16, use_17, use_18) = extra_bools(extra); + + let mut i = 0; + while i < all.len() { + let mut count = 1_u32; + let symbol = all[i]; + + macro_rules! special { + ($step:literal, $max:literal, $symbol:ident) => ( + while count >= $step { + let count2 = if count < $max { count } else { $max }; + cl_counts[DeflateSym::$symbol as usize] += 1; + count -= count2; + } + ); } - } - /// # Total Length. - /// - /// Returning a slice would be more useful, but Rust's borrow checker - /// gets confused because we'll still need to write to RLE. - const fn len(&self) -> usize { self.hlit + 257 + self.hdist + 1 } - - /// # Load Symbols (and Reset). - fn with_symbols( - &mut self, - ll_lengths: &ArrayLL, - d_lengths: &ArrayD - ) -> &mut Self { - // Find the last non-zero length symbol, starting from 285. (The offset - // (256) marks the boundary between literals and symbols; we'll use - // both literals and symbols in some places, but only the latter in - // others.) - self.hlit = 29; - while self.hlit > 0 && ll_lengths[256 + self.hlit].is_zero() { self.hlit -= 1; } - - // Now the same for distance, starting at 29 proper. - self.hdist = 29; - while self.hdist > 0 && d_lengths[self.hdist].is_zero() { self.hdist -= 1; } - - // Copy both into place. Note that both hlit and hdist are inclusive, - // so we need to +1 both for exclusivity. - let ll_end = self.hlit + 257; - self.symbols[..ll_end].copy_from_slice(&ll_lengths[..ll_end]); - self.symbols[ll_end..=ll_end + self.hdist].copy_from_slice(&d_lengths[..=self.hdist]); - - self - } -} - -impl TreeScratch { - /// # Best Tree. - /// - /// Crunch all special symbol combinations and return the "extra" key - /// (0..8) that achieved the smallest output, along with its size. - fn best_tree(&self) -> Result<(u8, NonZeroU32), ZopfliError> { - let mut best_extra = 0; - let mut best_size = NonZeroU32::MAX; - - for extra in 0..8 { - let size = self.crunch_size(extra)?; - if size < best_size { - best_extra = extra; - best_size = size; + // Peek ahead to maybe save some iteration! + if use_16 || ((use_17 || use_18) && symbol.is_zero()) { + let mut j = i + 1; + while j < all.len() && symbol == all[j] { + count += 1; + j += 1; + i += 1; } } - Ok((best_extra, best_size)) - } - - #[allow(clippy::cast_possible_truncation)] - /// # Calculate Tree Size. - fn crunch_size(&self, extra: u8) -> Result { - let (use_16, use_17, use_18) = extra_bools(extra); - - // Hold the counts. - let mut cl_counts = ZEROED_COUNTS_TREE; - - let mut i = 0; - let all = &self.symbols[..usize::min(self.len(), Self::MAX)]; - while i < all.len() { - let mut count = 1_u32; - let symbol = all[i]; - - macro_rules! special { - ($step:literal, $max:literal, $symbol:ident) => ( - while count >= $step { - let count2 = if count < $max { count } else { $max }; - cl_counts[DeflateSym::$symbol as usize] += 1; - count -= count2; - } - ); + // Repetitions of zeroes. + if symbol.is_zero() && count >= 3 { + if use_18 { + special!(11, 138, D18); } - - // Peek ahead to maybe save some iteration! 
- if use_16 || ((use_17 || use_18) && symbol.is_zero()) { - let mut j = i + 1; - while j < all.len() && symbol == all[j] { - count += 1; - j += 1; - i += 1; - } - } - - // Repetitions of zeroes. - if symbol.is_zero() && count >= 3 { - if use_18 { - special!(11, 138, D18); - } - if use_17 { - special!(3, 10, D17); - } - } - - // Other symbol repetitions. - if use_16 && count >= 4 { - // Always count the first one as itself. - count -= 1; - cl_counts[symbol as usize] += 1; - - special!(3, 6, D16); + if use_17 { + special!(3, 10, D17); } - - // Count the current symbol and move on. - cl_counts[symbol as usize] += count; - i += 1; } - // Update the lengths and symbols given the counts. - let cl_lengths = cl_counts.llcl()?; + // Other symbol repetitions. + if use_16 && count >= 4 { + // Always count the first one as itself. + count -= 1; + cl_counts[symbol as usize] += 1; - // Find the last non-zero count. - let mut hclen = 15; - while hclen > 0 && cl_counts[DeflateSym::TREE[hclen + 3] as usize] == 0 { - hclen -= 1; + special!(3, 6, D16); } - // We can finally calculate the size! - let mut size = (hclen as u32 + 4) * 3; - for (a, b) in cl_lengths.into_iter().zip(cl_counts.iter().copied()) { - size += (a as u32) * b; - } - size += cl_counts[16] * 2; // Extra bits. - size += cl_counts[17] * 3; - size += cl_counts[18] * 7; - Ok(NZ14.saturating_add(size)) + // Count the current symbol and move on. + cl_counts[symbol as usize] += count; + i += 1; } -} - -impl TreeScratch { - #[allow(clippy::cast_possible_truncation)] - /// # Write Tree. - fn write_tree(&mut self, extra: u8, out: &mut ZopfliOut) -> Result<(), ZopfliError> { - let (use_16, use_17, use_18) = extra_bools(extra); - - // Hold the counts. - let mut cl_counts = ZEROED_COUNTS_TREE; - self.rle.truncate(0); - - let mut i = 0; - let all = &self.symbols[..usize::min(self.len(), Self::MAX)]; - while i < all.len() { - let mut count = 1_u16; - let symbol = all[i]; - - macro_rules! special { - ($step:literal, $max:literal, $symbol:ident) => ( - while count >= $step { - let count2 = if count < $max { count } else { $max }; - self.rle.push((DeflateSym::$symbol, count2 - $step)); - cl_counts[DeflateSym::$symbol as usize] += 1; - count -= count2; - } - ); - } - // Peek ahead to maybe save some iteration! - if use_16 || ((use_17 || use_18) && symbol.is_zero()) { - let mut j = i + 1; - while j < all.len() && symbol == all[j] { - count += 1; - j += 1; - i += 1; - } - } + cl_counts +} - // Repetitions of zeroes. - if count >= 3 && symbol.is_zero() { - if use_18 { - special!(11, 138, D18); - } - if use_17 { - special!(3, 10, D17); +/// # Tree Counts (Writing). +/// +/// Populate and return the tree counts for `encode_tree`, as well as the RLE +/// symbol and position details. +fn encode_tree_counts( + all: &[DeflateSym], + rle: &mut Vec<(DeflateSym, u16)>, + extra: u8, +) -> [u32; 19] { + let mut cl_counts = ZEROED_COUNTS_TREE; + let (use_16, use_17, use_18) = extra_bools(extra); + + let mut i = 0; + while i < all.len() { + let mut count = 1_u16; + let symbol = all[i]; + + macro_rules! special { + ($step:literal, $max:literal, $symbol:ident) => ( + while count >= $step { + let count2 = if count < $max { count } else { $max }; + rle.push((DeflateSym::$symbol, count2 - $step)); + cl_counts[DeflateSym::$symbol as usize] += 1; + count -= count2; } - } - - // Other symbol repetitions. - if use_16 && count >= 4 { - // Always count the first one as itself. 
- count -= 1; - self.rle.push((symbol, 0)); - cl_counts[symbol as usize] += 1; + ); + } - special!(3, 6, D16); + // Peek ahead to maybe save some iteration! + if use_16 || ((use_17 || use_18) && symbol.is_zero()) { + let mut j = i + 1; + while j < all.len() && symbol == all[j] { + count += 1; + j += 1; + i += 1; } - - // Count the current symbol and move on. - for _ in 0..count { self.rle.push((symbol, 0)); } - cl_counts[symbol as usize] += u32::from(count); - i += 1; } - // Update the lengths and symbols given the counts. - let cl_lengths = cl_counts.llcl()?; - - // Find the last non-zero count. - let mut hclen = 15; - while hclen > 0 && cl_counts[DeflateSym::TREE[hclen + 3] as usize] == 0 { - hclen -= 1; + // Repetitions of zeroes. + if count >= 3 && symbol.is_zero() { + if use_18 { + special!(11, 138, D18); + } + if use_17 { + special!(3, 10, D17); + } } - // Convert the lengths to (different) symbols. - let cl_symbols = <[u32; 19]>::llcl_symbols(&cl_lengths); - - // Write the main lengths. - out.add_bits(self.hlit as u32, 5); - out.add_bits(self.hdist as u32, 5); - out.add_bits(hclen as u32, 4); - - // Write each cl_length in the jumbled DEFLATE order. - for &o in &DeflateSym::TREE[..hclen + 4] { - out.add_bits(cl_lengths[o as usize] as u32, 3); - } + // Other symbol repetitions. + if use_16 && count >= 4 { + // Always count the first one as itself. + count -= 1; + rle.push((symbol, 0)); + cl_counts[symbol as usize] += 1; - // Write each symbol in order of appearance along with its extra bits, - // if any. - for (a, b) in self.rle.drain(..) { - let symbol = cl_symbols[a as usize]; - out.add_huffman_bits(symbol, cl_lengths[a as usize] as u32); - - // Extra bits. - match a { - DeflateSym::D16 => { out.add_bits(u32::from(b), 2); }, - DeflateSym::D17 => { out.add_bits(u32::from(b), 3); }, - DeflateSym::D18 => { out.add_bits(u32::from(b), 7); }, - _ => {}, - } + special!(3, 6, D16); } - Ok(()) + // Count the current symbol and move on. + for _ in 0..count { rle.push((symbol, 0)); } + cl_counts[symbol as usize] += u32::from(count); + i += 1; } -} - - -#[allow(unsafe_code)] -/// Array of Cells. -/// -/// Revisualize a mutable array as an array of cells. -/// -/// TODO: use `Cell::as_array_of_cells` once stabilized. -fn array_of_cells(arr: &mut [T; N]) -> &[Cell; N] { - let cells = Cell::from_mut(arr); - // Safety: `Cell` has the same memory layout as `T`. - unsafe { &*(std::ptr::from_ref(cells).cast::<[Cell; N]>()) } + // Done! + cl_counts } /// # Extra Boolification. /// -/// Extra the use-16/17/18 bools (for tree business) from a given byte. +/// Extract the use-16/17/18 bools (for tree business) from a given byte. This +/// is easy enough, but easy enough to screw up, so handy to keep in just one +/// place. ;) const fn extra_bools(extra: u8) -> (bool, bool, bool) { (0 != extra & 1, 0 != extra & 2, 0 != extra & 4) } @@ -897,6 +871,11 @@ const fn extra_bools(extra: u8) -> (bool, bool, bool) { /// /// Add a new chain to the list, using either a leaf or combination of /// two chains from the previous list. +/// +/// Note: it would probably be more appropriate to make this a trait member or +/// at least scope it to the sealed trait's module, but doing either leads the +/// compiler to change its inlining decisions for the worse, so best to leave +/// it where it is! fn llcl_boundary_pm(leaves: &[Leaf<'_>], lists: &mut [List], nodes: &KatScratch) -> Result<(), ZopfliError> { // This method should never be called with an empty list. 
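
// Aside: a standalone sketch (not part of the patch) of the slice-of-cells
// pattern behind the `array_of_cells` helper above and the cost loops
// earlier in this patch. `Cell::from_mut` plus `as_slice_of_cells` lets
// overlapping reads and writes share one buffer without `unsafe` and without
// upsetting the borrow checker; the toy running sum below is illustrative
// only.
use std::cell::Cell;

fn main() {
    let mut costs = [1.0_f32, 2.0, 3.0, 4.0];
    let cells = Cell::from_mut(&mut costs[..]).as_slice_of_cells();

    // Each window reads one element and rewrites its neighbour, even though
    // both views alias the same array.
    for pair in cells.windows(2) {
        pair[1].set(pair[0].get() + pair[1].get());
    }

    assert_eq!(costs, [1.0, 3.0, 6.0, 10.0]);
}
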
@@ -948,6 +927,22 @@ fn llcl_boundary_pm(leaves: &[Leaf<'_>], lists: &mut [List], nodes: &KatScratch) llcl_boundary_pm(leaves, rest, nodes) } +/// # Last Non-Zero, Non-Special Count. +/// +/// This method loops through the counts in the jumbled DEFLATE tree order, +/// returning the last index with a non-zero count. (The extended symbols are +/// ignored.) +const fn tree_hclen(cl_counts: &[u32; 19]) -> DeflateSymBasic { + let mut hclen = 15; + while cl_counts[DeflateSym::TREE[hclen + 3] as usize] == 0 { + hclen -= 1; + if hclen == 0 { break; } + } + #[allow(unsafe_code)] + // Safety: DeflateSymBasic covers all values between 0..=15. + unsafe { std::mem::transmute::(hclen as u8) } +} + #[cfg(test)] @@ -967,6 +962,14 @@ mod tests { } } + #[test] + /// # Tree Max. + /// + /// Make sure our math correctly aligns with `TreeRleIdx`. + fn t_tree_max() { + assert_eq!(TreeScratch::MAX - 1, TreeRleIdx::T315 as usize); + } + // The following tests have been adapted from the zopfli-rs crate: // diff --git a/flapfli/src/zopflipng/lz77.rs b/flapfli/src/zopflipng/lz77.rs index ce789ad..37374eb 100644 --- a/flapfli/src/zopflipng/lz77.rs +++ b/flapfli/src/zopflipng/lz77.rs @@ -4,33 +4,53 @@ This module defines the LZ77 store structures. */ -use std::ops::Range; +use std::{ + num::{ + NonZeroU32, + NonZeroUsize, + }, + ops::Range, +}; use super::{ ArrayD, ArrayLL, + DISTANCE_BITS, DISTANCE_SYMBOLS, Dsym, + DynamicLengths, + FIXED_TREE_LL, + LENGTH_SYMBOL_BITS, LENGTH_SYMBOLS, LitLen, Lsym, ZEROED_COUNTS_D, ZEROED_COUNTS_LL, zopfli_error, + ZOPFLI_MASTER_BLOCK_SIZE, ZopfliError, + ZopfliRange, }; -/// # Shared `LZ77Store` Pool. -/// -/// Each `deflate_part` run can use as many as three of these; we might as well -/// reuse the objects to cut down on the number of allocations being made. -// static POOL: Pool = Pool::new(); +#[allow(unsafe_code)] +/// # Seven is Non-Zero. +const NZ07: NonZeroU32 = unsafe { NonZeroU32::new_unchecked(7) }; + +#[allow(unsafe_code)] +/// # Eight is Non-Zero. +const NZ08: NonZeroU32 = unsafe { NonZeroU32::new_unchecked(8) }; #[derive(Clone)] /// # LZ77 Data Store. +/// +/// This struct holds litlen, dist, and symbol information for LZ77 block +/// compression. +/// +/// This can be thought of as the owned version of `LZ77StoreRange`, useful +/// while the data is still being gathered and manipulated. pub(crate) struct LZ77Store { pub(crate) entries: Vec, } @@ -41,33 +61,48 @@ impl LZ77Store { Self { entries: Vec::new() } } - /// # Symbol Span Range. + /// # Ranged. + /// + /// Return an immutable ranged view of the data, or an error if the range + /// is invalid. + pub(crate) fn ranged(&self, rng: ZopfliRange) -> Result { + let entries = self.entries.get(rng.rng()).ok_or(zopfli_error!())?; + Ok(LZ77StoreRange { entries }) + } + + /// # Ranged (Full). /// - /// Convert an LZ77 range to the start/end positions of the block. - pub(crate) fn byte_range(&self, rng: Range) -> Result, ZopfliError> { - let slice = self.entries.as_slice(); - if rng.start < rng.end && rng.end <= slice.len() { - let instart = slice[rng.start].pos; - let e = slice[rng.end - 1]; - Ok(instart..e.length() as usize + e.pos) + /// Same as `LZ77Store::range`, except the range is everything. This will + /// return an error if the store is empty or too large. 
+ pub(crate) fn ranged_full(&self) -> Result { + let entries = self.entries.as_slice(); + if entries.is_empty() || ZOPFLI_MASTER_BLOCK_SIZE < entries.len() { + Err(zopfli_error!()) } - else { Err(zopfli_error!()) } + else { Ok(LZ77StoreRange { entries }) } } /// # Clear. + /// + /// Remove all previously-collected entries, allowing the store to be + /// re-used for a new set of data. pub(crate) fn clear(&mut self) { self.entries.truncate(0); } /// # Push Values. + /// + /// Create an entry from the arguments, then insert it into the store. pub(crate) fn push(&mut self, litlen: LitLen, dist: u16, pos: usize) { self.push_entry(LZ77StoreEntry::new(litlen, dist, pos)); } /// # Push Entry. + /// + /// Push an existing entry directly to the store. fn push_entry(&mut self, entry: LZ77StoreEntry) { self.entries.push(entry); } /// # Replace Store. /// - /// Replace the current content with some other store's content. + /// Replace the current store's data with what the other guy's got. pub(crate) fn replace(&mut self, other: &Self) { self.entries.truncate(0); self.entries.extend_from_slice(&other.entries); @@ -75,7 +110,9 @@ impl LZ77Store { /// # Steal/Append Entries. /// - /// Drain the entires from other and append them to self. + /// Drain the entires from `other` and append them to `self`. (This is a + /// more efficient alternative to calling `LZ77Store::replace` and + /// `LZ77Store::clear` separately.) pub(crate) fn steal_entries(&mut self, other: &mut Self) { self.entries.append(&mut other.entries); } @@ -83,31 +120,203 @@ impl LZ77Store { impl LZ77Store { /// # Length. + /// + /// Return the number of entries in the store. Unlike `LZ77StoreRange`, + /// this can return zero. pub(crate) fn len(&self) -> usize { self.entries.len() } +} + + + +#[repr(transparent)] +#[derive(Clone, Copy)] +/// # Ranged LZ77 Data Store. +/// +/// Same as `LZ77Store`, but immutable and non-empty, offering a more +/// const-friendly and performant view into some or all of the former's +/// data. +pub(crate) struct LZ77StoreRange<'a> { + pub(crate) entries: &'a [LZ77StoreEntry], +} + +impl<'a> LZ77StoreRange<'a> { + /// # Uncompressed Range. + /// + /// Return the original uncompressed range — from e.g. a `ZopfliChunk` — + /// used to build this store. If for some reason that range cannot be + /// recreated, an error will be returned instead. + pub(crate) const fn byte_range(self) -> Result { + // Safety: ranged stores are never empty. + let len = self.entries.len(); + if 0 == len { crate::unreachable(); } + + let first = self.entries[0]; + let last = self.entries[len - 1]; + ZopfliRange::new(first.pos, last.length() as usize + last.pos) + } /// # Histogram. - pub(crate) fn histogram(&self, rng: Range) - -> (ArrayLL, ArrayD) { + /// + /// Count up and return the litlen and distance symbols included in this + /// range. + pub(crate) fn histogram(self) -> (ArrayLL, ArrayD) { let mut ll_counts = ZEROED_COUNTS_LL; let mut d_counts = ZEROED_COUNTS_D; - for e in self.entries.iter().take(rng.end).skip(rng.start) { + for e in self.entries { ll_counts[e.ll_symbol as usize] += 1; if 0 < e.dist { d_counts[e.d_symbol as usize] += 1; } } (ll_counts, d_counts) } + + /// # Length. + /// + /// Return the total number of entries included in this store. Unlike + /// `LZ77Store`, this cannot be empty, so the result will always be + /// non-zero. + pub(crate) const fn len(self) -> NonZeroUsize { + #[allow(unsafe_code)] + // Safety: we verified the store is non-empty at construction. 
+ unsafe { NonZeroUsize::new_unchecked(self.entries.len()) } + } + + #[allow(unsafe_code)] + /// # Split. + /// + /// Split the range into two at `mid`, unless that would leave either side + /// empty, in which case an error will be returned instead. + /// + /// Note: this returns two new instances; `self` is left unchanged. + pub(crate) const fn split(self, mid: usize) -> Result<(Self, Self), ZopfliError> { + if 0 == mid || self.entries.len() <= mid { Err(zopfli_error!()) } + else { + // Safety: we have checked mid is between the start and end of our + // entries. + let (a, b) = unsafe { self.entries.split_at_unchecked(mid) }; + Ok((Self { entries: a }, Self { entries: b })) + } + } + + /// # Split Iterator. + /// + /// Return an iterator that yields every possible split combination in + /// order, unless `self` has only one entry and cannot be split, in which + /// case an error is returned instead. + pub(crate) const fn splits(self) -> Result, ZopfliError> { + let len = self.entries.len(); + if 1 < len { + Ok(LZ77StoreRangeSplits { + entries: self.entries, + splits: 1..len, + }) + } + // Not big enough to split! + else { Err(zopfli_error!()) } + } +} + +impl<'a> LZ77StoreRange<'a> { + /// # Calculate Block Size (Auto). + /// + /// Return the smallest of the uncompressed, fixed, and dynamic sizes. + /// (When `try_fixed` is false, only uncompressed and dynamic sizes are + /// calculated and compared.) + pub(crate) fn block_size_auto(self, try_fixed: bool) -> Result { + // Take the smaller of the uncompressed and dynamic costs. + let cost = NonZeroU32::min( + self.block_size_uncompressed()?, + self.block_size_dynamic()?, + ); + + // Counter-intuitively, we'll usually get better block-splitting decisions + // by ignoring fixed costs entirely unless the store is really small. This + // condition is also necessary to maintain parity with the original zopfli. + if try_fixed { + let cost2 = self.block_size_fixed(); + if cost2 < cost { return Ok(cost2); } + } + + Ok(cost) + } + + /// # Calculate Block Size (Dynamic). + /// + /// This calculation is… a lot. See the `rle` module for more information. + pub(crate) fn block_size_dynamic(self) -> Result { + DynamicLengths::new(self).map(DynamicLengths::take_size) + } + + /// # Calculate Block Size (Fixed). + pub(crate) fn block_size_fixed(self) -> NonZeroU32 { + // Loop the store if we have data to loop. + let size = self.entries.iter() + .map(LZ77StoreEntry::fixed_cost) + .sum::(); + + NZ07.saturating_add(size) // FIXED_TREE_LL[256] + } + + /// # Calculate Block Size (Uncompressed). + pub(crate) fn block_size_uncompressed(self) -> Result { + let blocksize = self.byte_range()?.len32(); + + // Uncompressed blocks are split at u16::MAX. + let chunks = blocksize.get().div_ceil(u32::from(u16::MAX)); + + Ok(NZ08.saturating_mul(blocksize).saturating_add(chunks * 40)) + } +} + + + +/// # Ranged Store Splits. +/// +/// This iterator yields all non-empty split pairs of a ranged store. +pub(crate) struct LZ77StoreRangeSplits<'a> { + entries: &'a [LZ77StoreEntry], + splits: Range, +} + +impl<'a> Iterator for LZ77StoreRangeSplits<'a> { + type Item = (LZ77StoreRange<'a>, LZ77StoreRange<'a>); + + #[allow(unsafe_code)] + fn next(&mut self) -> Option { + let mid = self.splits.next()?; + // Safety: we verified splits was in between the start and end points + // of our entries. 
+ let (a, b) = unsafe { self.entries.split_at_unchecked(mid) }; + Some(( + LZ77StoreRange { entries: a }, + LZ77StoreRange { entries: b }, + )) + } + + fn size_hint(&self) -> (usize, Option) { + let len = self.splits.len(); + (len, Some(len)) + } +} + +impl<'a> ExactSizeIterator for LZ77StoreRangeSplits<'a> { + fn len(&self) -> usize { self.splits.len() } } #[derive(Clone, Copy)] +/// # LZ77 Store Entry. +/// +/// This struct holds all of the relevant details for a given entry, including +/// its index in the original uncompressed chunk, the length and distance pair, +/// and the corresponding length and distance symbols. pub(crate) struct LZ77StoreEntry { - pub(crate) pos: usize, + pub(crate) pos: usize, // The original uncompressed chunk index. pub(crate) litlen: LitLen, pub(crate) dist: i16, - pub(crate) ll_symbol: Lsym, + pub(crate) ll_symbol: Lsym, // A symbol or literal depending on distance. pub(crate) d_symbol: Dsym, } @@ -122,8 +331,9 @@ impl LZ77StoreEntry { debug_assert!(dist < 32_768); // Using the signed type helps the compiler understand the upper - // range fits ZOPFLI_WINDOW_MAX and wraps (impossible) bad values to - // boot. + // range fits ZOPFLI_WINDOW_MAX. Impossibly large values would also + // get neatly tucked away in negative-land and ignored, but that'd be + // impossible! let dist = dist as i16; let (ll_symbol, d_symbol) = if 0 < dist {( @@ -141,6 +351,23 @@ impl LZ77StoreEntry { } } + /// # Fixed Cost. + /// + /// Note: these values all fit comfortably within `u8`, but we never just + /// want one cost, so the result is widened to `u32` to simplify + /// `LZ77StoreRange::block_size_fixed`'s efforts. + const fn fixed_cost(&self) -> u32 { + let base = FIXED_TREE_LL[self.ll_symbol as usize] as u8; + let extra = + if 0 < self.dist { + LENGTH_SYMBOL_BITS[self.litlen as usize] + + DISTANCE_BITS[self.d_symbol as usize] + + 5 // FIXED_TREE_D. + } + else { 0 }; + (base + extra) as u32 + } + /// # Length. /// /// If the distance is zero, 1, otherwise the litlen. @@ -149,3 +376,83 @@ impl LZ77StoreEntry { else { LitLen::L001 } } } + + + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn t_fixed_tree_256() { + // Our use of this particular index is hardcoded for simplicity; let's + // triple-check we chose correctly! + assert_eq!(FIXED_TREE_LL[256] as u32, NZ07.get()); + } + + #[test] + fn t_fixed_tree_d5() { + // Our use of this particular index is hardcoded for simplicity; let's + // triple-check we chose correctly! + assert!(super::super::FIXED_TREE_D.iter().all(|&d| d as u32 == 5)); + } + + #[test] + fn t_ranged_splits() { + /// # Poor Man's Equal Impl. + /// + /// Most of these types do not implement (or need) `Eq`, but since + /// we're only setting `pos` and `dist` uniquely here anyway, we can + /// limit matching to those two. + fn entry_eq((a, b): (&LZ77StoreEntry, &LZ77StoreEntry)) -> bool { + a.pos == b.pos && a.dist == b.dist + } + + // Generate an entry with the given pos and dist. + macro_rules! entry { + ($i:literal) => ( + LZ77StoreEntry { + pos: $i, + litlen: LitLen::L000, + dist: $i, + ll_symbol: Lsym::L000, + d_symbol: Dsym::D00, + } + ); + } + + // These entries are nonsensical, but all we're looking to do is check + // that splits are happening in the right place, so they only really + // need to be unique from one another. + let arr: &[LZ77StoreEntry] = &[ + entry!(0), + entry!(1), + entry!(2), + entry!(3), + entry!(4), + entry!(5), + ]; + let store = LZ77StoreRange { entries: arr }; + + // Do the splits. 
+ let mut splits = store.splits().expect("failed to split store"); + for i in 1..arr.len() { + assert_eq!(splits.len(), arr.len() - i); + let (a, b) = splits.next().expect("expected next split"); + let c = &arr[..i]; // Expected A. + let d = &arr[i..]; // Expected B. + + assert_eq!(a.len().get(), a.entries.len()); + assert_eq!(a.entries.len(), c.len()); + assert!(a.entries.iter().zip(c.iter()).all(entry_eq)); + + assert_eq!(b.len().get(), b.entries.len()); + assert_eq!(b.entries.len(), d.len()); + assert!(b.entries.iter().zip(d.iter()).all(entry_eq)); + } + + // We should be empty. + assert_eq!(splits.len(), 0); + assert!(splits.next().is_none()); + } +} diff --git a/flapfli/src/zopflipng/mod.rs b/flapfli/src/zopflipng/mod.rs index cff95ae..7852218 100644 --- a/flapfli/src/zopflipng/mod.rs +++ b/flapfli/src/zopflipng/mod.rs @@ -16,53 +16,54 @@ performant. mod blocks; mod cache; +mod chunk; mod error; mod hash; +mod iter; mod kat; mod lz77; mod rle; +mod rng; mod stats; mod symbols; -pub(crate) use blocks::{ - deflate_part, - SplitPoints, -}; +pub(crate) use blocks::deflate_part; use cache::{ MatchCache, + SplitCache, SqueezeCache, }; +pub(crate) use chunk::ZopfliChunk; use error::{ zopfli_error, ZopfliError, }; +use hash::ZopfliStateInit; pub(crate) use hash::ZopfliState; +use iter::ReducingSlices; use kat::{ best_tree_size, encode_tree, LengthLimitedCodeLengths, }; -pub(crate) use lz77::LZ77Store; -use rle::get_dynamic_lengths; -pub(crate) use rle::reset_dynamic_length_cache; -use super::{ - EncodedPNG, - lodepng::{ - DecodedImage, - LodePNGColorType, - LodePNGFilterStrategy, - LodePNGState, - ZopfliOut, - }, +use lz77::{ + LZ77Store, + LZ77StoreRange, }; +use rng::ZopfliRange; +use rle::DynamicLengths; +use super::deflate::ZopfliOut; use symbols::{ DeflateSym, + DeflateSymBasic, DISTANCE_BITS, + DISTANCE_BITS_F, DISTANCE_SYMBOLS, DISTANCE_VALUES, Dsym, LENGTH_SYMBOL_BIT_VALUES, LENGTH_SYMBOL_BITS, + LENGTH_SYMBOL_BITS_F, LENGTH_SYMBOLS, LitLen, Lsym, @@ -123,16 +124,28 @@ const FIXED_SYMBOLS_D: ArrayD = [ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, ]; -// This is the biggest chunk-o-data that can be passed to deflate. +/// # Step Size for Deflate Parts. +/// +/// The "active" portion of the `ZopfliChunk` passed from lodepng will never +/// exceed a million bytes. pub(super) const ZOPFLI_MASTER_BLOCK_SIZE: usize = 1_000_000; -// The matchable hash cache range. +/// # Hash/LZ77 Window Size. +/// +/// This is the window size used by lodepng when zopfli processing is enabled, +/// and the amount expected by structs like `ZopfliHash`. +const ZOPFLI_WINDOW_SIZE: usize = 32_768; + +/// # Minimum Matchable Distance. const ZOPFLI_MIN_MATCH: usize = 3; + +/// # Maximum Matchable Distance. const ZOPFLI_MAX_MATCH: usize = 258; /// # Length of Sublength Array. /// -/// This is hardcoded in `squeeze.c`. +/// The squeeze sublength array slices have indices spanning +/// `0..=ZOPFLI_MAX_MATCH`. const SUBLEN_LEN: usize = ZOPFLI_MAX_MATCH + 1; /// # Array with `ZOPFLI_NUM_LL` Entries. @@ -140,82 +153,3 @@ type ArrayLL = [T; ZOPFLI_NUM_LL]; /// # Array with `ZOPFLI_NUM_D` Entries. type ArrayD = [T; ZOPFLI_NUM_D]; - - - -#[must_use] -/// # Optimize! -/// -/// This will attempt to losslessly recompress the source PNG with the -/// strongest Zopfli filter strategy, and return a new PNG image if the result -/// is smaller than the original. -/// -/// Note: 16-bit transformations are not lossless; such images will have their -/// bit depths reduced to a more typical 8 bits. 
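(Editorial aside, not part of the diff.) The `LZ77StoreRange::block_size_uncompressed` helper in the lz77 changes above reduces to simple arithmetic once the `NonZeroU32` plumbing is stripped away. The sketch below restates it for a plain byte count; the name `uncompressed_bits` is illustrative only.

    /// Stored-block cost in bits: eight bits per byte of payload, plus a
    /// 40-bit header for every 65,535-byte piece the block must be split into.
    fn uncompressed_bits(blocksize: u32) -> u32 {
        let chunks = blocksize.div_ceil(u32::from(u16::MAX));
        blocksize.saturating_mul(8).saturating_add(chunks * 40)
    }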
-pub fn optimize(src: &[u8]) -> Option { - let mut dec = LodePNGState::default(); - let img = dec.decode(src)?; - - // Encode! - let strategy = best_strategy(&dec, &img); - let out = encode(&dec, &img, strategy, true)?; - - // Return it if better and nonzero! - if out.size < src.len() { Some(out) } - else { None } -} - - - -/// # Best Strategy. -/// -/// This attempts to find the best filtering strategy for the image by trying -/// all of them in fast mode, and picking whichever produces the smallest -/// output. -fn best_strategy(dec: &LodePNGState, img: &DecodedImage) -> LodePNGFilterStrategy { - [ - LodePNGFilterStrategy::LFS_ZERO, - LodePNGFilterStrategy::LFS_ONE, - LodePNGFilterStrategy::LFS_TWO, - LodePNGFilterStrategy::LFS_THREE, - LodePNGFilterStrategy::LFS_FOUR, - LodePNGFilterStrategy::LFS_MINSUM, - LodePNGFilterStrategy::LFS_ENTROPY, - LodePNGFilterStrategy::LFS_BRUTE_FORCE, - ] - .into_iter() - .filter_map(|s| encode(dec, img, s, false).map(|out| (out.size, s))) - .min_by(|a, b| a.0.cmp(&b.0)) - .map_or(LodePNGFilterStrategy::LFS_ZERO, |(_, s)| s) -} - -/// # Apply Optimizations. -/// -/// This attempts to re-encode an image using the provided filter strategy, -/// returning an `EncodedPNG` object if it all works out. -fn encode( - dec: &LodePNGState, - img: &DecodedImage, - strategy: LodePNGFilterStrategy, - slow: bool, -) -> Option { - // Encode and write to the buffer if it worked. - let mut enc = LodePNGState::encoder(dec, strategy, slow)?; - let out = enc.encode(img)?; - - // We might be able to save a couple bytes by nuking the palette if the - // image is already really small. - if - out.size < 4096 && - LodePNGColorType::LCT_PALETTE.is_match(&out) && - enc.prepare_encoder_small(img) - { - if let Some(out2) = enc.encode(img) { - if out2.size < out.size { - return Some(out2); - } - } - } - - Some(out) -} diff --git a/flapfli/src/zopflipng/rle.rs b/flapfli/src/zopflipng/rle.rs index 0170b4c..8de329a 100644 --- a/flapfli/src/zopflipng/rle.rs +++ b/flapfli/src/zopflipng/rle.rs @@ -2,18 +2,9 @@ # Flapfli: Huffman RLE Optimization. */ -use dactyl::NoHash; use std::{ - cell::{ - Cell, - RefCell, - }, - collections::{ - hash_map::Entry, - HashMap, - }, + cell::Cell, num::NonZeroU32, - ops::Range, }; use super::{ ArrayD, @@ -22,7 +13,7 @@ use super::{ DeflateSym, DISTANCE_BITS, LengthLimitedCodeLengths, - LZ77Store, + LZ77StoreRange, ZopfliError, }; @@ -34,112 +25,117 @@ const LENGTH_EXTRA_BITS: [u32; 29] = [ 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0, ]; -type RleCache = HashMap; - -thread_local!( - /// # Best Tree Cache. - /// - /// The dynamic length calculations are pretty terrible and can wind up - /// being repeated several times for a given block. To take out some of the - /// sting from that repetition, the results are statically cached. - /// - /// To prevent endless reallocation and minimize lookup times, the cache is - /// cleared for each new image. - static CACHE: RefCell = RefCell::new(HashMap::default()) -); - -/// # Get Dynamic Lengths. +/// # Dynamic Lengths. /// -/// This method calculates the dynamic tree symbols and size using both the -/// existing and optimized counts, then returns whichever set produces the -/// smallest output. +/// This struct is used to perform brute-force length-limited-code-length +/// calculations to determine the best (smallest) DEFLATE configuration for the +/// data. /// -/// Note: the returned size does not include the 3-bit block header. 
-pub(super) fn get_dynamic_lengths(store: &LZ77Store, rng: Range) --> Result<(u8, NonZeroU32, ArrayLL, ArrayD), ZopfliError> { - fn fetch( - cache: &mut RleCache, - ll_counts: &ArrayLL, - d_counts: &ArrayD, - ) -> Result<(u8, NonZeroU32, ArrayLL, ArrayD), ZopfliError> { +/// This is done in two passes: the first using the previously-collected LZ77 +/// histogram data, the second using RLE-optimized counts derived from same. +/// The best of the best is kept, the rest are forgotten. +pub(crate) struct DynamicLengths { + extra: u8, + size: NonZeroU32, + ll_lengths: ArrayLL, + d_lengths: ArrayD, +} + +impl DynamicLengths { + /// # New. + pub(crate) fn new(store: LZ77StoreRange) -> Result { + // Pull the counts from the store. + let (mut ll_counts, d_counts) = store.histogram(); + ll_counts[256] = 1; + // Pull the symbols, then get the sizes. let ll_lengths = ll_counts.llcl()?; - let d_lengths = d_llcl(d_counts)?; - let (data1, hash1) = calculate_size(cache, ll_counts, d_counts, &ll_lengths, &d_lengths)?; - - // Unless we've been here before and found optimization useless, repeat - // the process using optimized counts and symbols. - if ! data1.noop() { - let (ll_lengths2, d_lengths2) = optimized_symbols(ll_counts, d_counts)?; - let (data2, _) = calculate_size(cache, ll_counts, d_counts, &ll_lengths2, &d_lengths2)?; - - // Return this version if better. - if data2.size < data1.size { - return Ok((data2.extra(), data2.size, ll_lengths2, d_lengths2)); - } + let d_lengths = d_llcl(&d_counts)?; - // Update the original's cache to reflect that optimization didn't - // help so that we can skip all this the next time around. - if let Some(e) = cache.get_mut(&hash1) { e.set_noop(); } - } + // Calculate the sizes. + let (extra, treesize) = best_tree_size(&ll_lengths, &d_lengths)?; + let datasize = calculate_size_data(&ll_counts, &d_counts, &ll_lengths, &d_lengths); + let size = treesize.saturating_add(datasize); - // The first version was better! - Ok((data1.extra(), data1.size, ll_lengths, d_lengths)) - } + // Build the response. + let mut out = Self { extra, size, ll_lengths, d_lengths }; - // Pull the counts from the store. - let (mut ll_counts, d_counts) = store.histogram(rng); - ll_counts[256] = 1; + // But wait, there's more! Optimize the counts and repeat the process + // to see if that helps. + out.try_optimized(&ll_counts, &d_counts)?; - // Do all the work! - CACHE.with_borrow_mut(|cache| fetch(cache, &ll_counts, &d_counts)) -} + // Done! + Ok(out) + } -/// # Reset Dynamic Length Cache. -/// -/// To prevent endless reallocation and minimize lookup times, the cache is -/// cleared each time a new image is loaded. -pub(crate) fn reset_dynamic_length_cache() { CACHE.with_borrow_mut(HashMap::clear); } + #[inline(never)] + /// # Unique Symbols? + /// + /// Returns true if any of the symbols are different than the ones we + /// already have. (They wind up the same often enough that it is worth + /// checking to reduce the potential workload.) + fn is_unique(&self, ll_lengths: &ArrayLL, d_lengths: &ArrayD) -> bool { + #[allow(unsafe_code)] + /// # As Bytes. + /// + /// Reimagine a symbol array as raw bytes for more optimal comparison. + const fn deflate_bytes(arr: &[DeflateSym; N]) -> &[u8; N] { + // Safety: DeflateSym has the same size and alignment as u8. + unsafe { &* arr.as_ptr().cast() } + } + *deflate_bytes(&self.d_lengths) != *deflate_bytes(d_lengths) || + deflate_bytes(&self.ll_lengths) != deflate_bytes(ll_lengths) + } + /// # Try Optimized. 
+ /// + /// Optimize the counts and fetch new symbols, calculate their cost, and + /// keep them if better. + fn try_optimized(&mut self, ll_counts: &ArrayLL, d_counts: &ArrayD) + -> Result<(), ZopfliError> { + let (ll_lengths2, d_lengths2) = optimized_symbols(ll_counts, d_counts)?; + + // It is only worth calculating the new sizes if the lengths are + // different than the ones we already have. + if self.is_unique(&ll_lengths2, &d_lengths2) { + // Calculate the sizes. + let (extra, treesize) = best_tree_size(&ll_lengths2, &d_lengths2)?; + let datasize = calculate_size_data(ll_counts, d_counts, &ll_lengths2, &d_lengths2); + let size = treesize.saturating_add(datasize); + + // Update our values if the new cost is lower. + if size < self.size { + self.extra = extra; + self.size = size; + self.ll_lengths = ll_lengths2; + self.d_lengths = d_lengths2; + } + } -#[derive(Clone, Copy)] -/// # Cache Entry. -struct CacheEntry { - extra: u8, // Extended alphabet used. - size: NonZeroU32, // Combined tree/data size. + Ok(()) + } } -impl CacheEntry { - /// # Extra Bits. - /// - /// The first three bits comprise the extended alphabet details. - const MASK_EXTRA: u8 = 0b0000_0111; - - /// # Fruitless Optimization Mask. - /// - /// The fourth bit is used to indicate when the secondary optimization pass - /// failed to result in better output. - const MASK_NOOP: u8 = 0b0000_1000; +impl DynamicLengths { + /// # Cost. + pub(crate) const fn cost(&self) -> NonZeroU32 { self.size } /// # Extra. - /// - /// Return the true "extra" value, without the noop bit. - const fn extra(self) -> u8 { self.extra & Self::MASK_EXTRA } + pub(crate) const fn extra(&self) -> u8 { self.extra } - /// # Fruitless Optimization? - /// - /// Returns true if optimizing the counts made no positive difference - /// during the previous pass. - const fn noop(self) -> bool { Self::MASK_NOOP == self.extra & Self::MASK_NOOP } + /// # Litlen Lengths. + pub(crate) const fn ll_lengths(&self) -> &ArrayLL { &self.ll_lengths } + + /// # Distance Lengths. + pub(crate) const fn d_lengths(&self) -> &ArrayD { &self.d_lengths } - /// # Set Fruitless Optimization. + /// # Take Size. /// - /// This sets the noop flag so the optimization pass can be skipped on - /// subsequent calls. - fn set_noop(&mut self) { self.extra |= Self::MASK_NOOP; } + /// Same as `DynamicLengths::cost`, but drop `self` in the process. + pub(crate) const fn take_size(self) -> NonZeroU32 { self.size } } @@ -150,7 +146,7 @@ impl CacheEntry { /// `true` for distance codes in a sequence of 5+ zeroes or 7+ (identical) /// non-zeroes, `false` otherwise. /// -/// This moots the need to collect such values into a vector in advance and +/// This moots the need to collect the values into a vector in advance and /// reduces the number of passes required to optimize Huffman codes. struct GoodForRle<'a> { counts: &'a [Cell], @@ -229,64 +225,38 @@ impl<'a> ExactSizeIterator for GoodForRle<'a> { -/// # Calculate Size. +/// # Calculate Dynamic Data Block Size. /// -/// Pull the best tree details from the cache, or calculate them fresh (and -/// cache them for next time). -fn calculate_size( - cache: &mut RleCache, +/// This returns the size of the data itself, basically just a sum of sums. +fn calculate_size_data( ll_counts: &ArrayLL, d_counts: &ArrayD, ll_lengths: &ArrayLL, d_lengths: &ArrayD, -) -> Result<(CacheEntry, u64), ZopfliError> { - #[inline(never)] - /// # Calculate Dynamic Block Size. 
- fn data_size( - ll_counts: &ArrayLL, - d_counts: &ArrayD, - ll_lengths: &ArrayLL, - d_lengths: &ArrayD, - ) -> u32 { - // The end symbol is always included. - let mut size = ll_lengths[256] as u32; - - // The early lengths and counts. - for (ll, lc) in ll_lengths.iter().copied().zip(ll_counts).take(256) { - size += (ll as u32) * lc; - } - - // The lengths and counts with extra bits. - for (i, lbit) in (257..257 + LENGTH_EXTRA_BITS.len()).zip(LENGTH_EXTRA_BITS) { - size += (ll_lengths[i] as u32 + lbit) * ll_counts[i]; - } - - // The distance lengths, counts, and extra bits. - for (i, dbit) in DISTANCE_BITS.iter().copied().enumerate().take(30) { - size += (d_lengths[i] as u32 + u32::from(dbit)) * d_counts[i]; - } - - size - } - - // Hash the symbols. - let hash = deflate_hash(ll_counts, d_counts, ll_lengths, d_lengths); - - // Check the cache first. - let entry = match cache.entry(hash) { - Entry::Occupied(e) => return Ok((*e.get(), hash)), - Entry::Vacant(e) => e, - }; - - // Calculate the sizes. - let (extra, treesize) = best_tree_size(ll_lengths, d_lengths)?; - let datasize = data_size(ll_counts, d_counts, ll_lengths, d_lengths); - let size = treesize.saturating_add(datasize); - let out = CacheEntry { extra, size }; - - // Save to cache for later, then return. - entry.insert(out); - Ok((out, hash)) +) -> u32 { + // The early lengths and counts. + let a = ll_lengths.iter().copied() + .zip(ll_counts.iter().copied()) + .take(256) + .map(|(ll, lc)| (ll as u32) * lc) + .sum::(); + + // The lengths and counts with extra bits. + let b = ll_lengths[257..].iter().copied() + .zip(ll_counts[257..].iter().copied()) + .zip(LENGTH_EXTRA_BITS) + .map(|((ll, lc), lbit)| (ll as u32 + lbit) * lc) + .sum::(); + + // The distance lengths, counts, and extra bits. + let c = d_lengths.iter().copied() + .zip(d_counts.iter().copied()) + .zip(DISTANCE_BITS) + .take(30) + .map(|((dl, dc), dbit)| (dl as u32 + u32::from(dbit)) * dc) + .sum::(); + + a + b + c + ll_lengths[256] as u32 } /// # Dynamic Length-Limited Code Lengths. @@ -296,18 +266,20 @@ fn d_llcl(d_counts: &ArrayD) -> Result, ZopfliError> { let mut d_lengths = d_counts.llcl()?; - // Buggy decoders require at least two non-zero distances. Let's see - // what we've got! + // Buggy decoders require at least two non-zero distances. Let's make sure + // we have at least that many. let mut one: Option = None; for (i, dist) in d_lengths.iter().copied().enumerate().take(30) { // We have (at least) two non-zero entries; no patching needed! if ! dist.is_zero() && one.replace(i == 0).is_some() { return Ok(d_lengths); } } + // If we're here, fewer than two non-zero distances are in the collection; + // we'll need to fake the counts to reach our quota. Haha. match one { // The first entry had a code, so patching the second gives us two. Some(true) => { d_lengths[1] = DeflateSym::D01; }, - // The first entry didn't have a code, so patching it gives us two. + // The first entry did not have a code, so patching it gives us two. Some(false) => { d_lengths[0] = DeflateSym::D01; }, // There were no codes at all, so we can just patch the first two. None => { @@ -318,33 +290,21 @@ fn d_llcl(d_counts: &ArrayD) Ok(d_lengths) } - -/// # Hash Counts and Symbols. +/* +#[inline(never)] +/// # Compare Two Symbol Sets for Uniqueness. /// -/// Calculate and return a hash for the set. This is done independently of the -/// map to reduce its signature and enable us to quickly repeat lookups if -/// necessary. 
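(Editorial aside, not part of the diff.) The `GoodForRle` iterator documented above encodes a simple run rule. The standalone sketch below restates that rule over a plain slice; the name `good_for_rle` is illustrative, and the real iterator may treat edge cases such as the trailing run differently.

    /// A position is "good for RLE" when it belongs to a run of five or more
    /// zeros, or a run of seven or more identical non-zero counts.
    fn good_for_rle(counts: &[u32]) -> Vec<bool> {
        let mut out = vec![false; counts.len()];
        let mut i = 0;
        while i < counts.len() {
            // Measure the run of identical values starting at i.
            let mut j = i;
            while j < counts.len() && counts[j] == counts[i] { j += 1; }
            let good = if counts[i] == 0 { j - i >= 5 } else { j - i >= 7 };
            for slot in &mut out[i..j] { *slot = good; }
            i = j;
        }
        out
    }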
-/// -/// Note: both passes from a given dynamic lengths call will have the same -/// counts, but they hash quickly enough there's no performance benefit from -/// over-complicated the formula. -fn deflate_hash( - ll_counts: &ArrayLL, - d_counts: &ArrayD, - ll_lengths: &ArrayLL, - d_lengths: &ArrayD, -) -> u64 { - use ahash::RandomState; - use std::hash::{BuildHasher, Hash, Hasher}; - +/// This compares two sets of symbols, returning `true` if they're different +/// from one another. +fn diff_symbols(a: &[DeflateSym; N], b: &[DeflateSym; N]) -> bool { #[allow(unsafe_code)] /// # As Bytes. /// - /// Convert a `DeflateSym` array into an equivalent byte array for faster - /// hashing. + /// Transform a `DeflateSym` array into an equivalent byte array for more + /// efficient comparison. (Bytes get all the love!) const fn deflate_bytes(arr: &[DeflateSym; N]) -> &[u8; N] { // Safety: DeflateSym has the same size and alignment as u8, and if - // for some reason that isn't true, this code won't compile. + // for some reason that isn't true, this code won't compile! const { assert!(std::mem::size_of::<[DeflateSym; N]>() == std::mem::size_of::<[u8; N]>()); assert!(std::mem::align_of::<[DeflateSym; N]>() == std::mem::align_of::<[u8; N]>()); @@ -352,20 +312,8 @@ fn deflate_hash( unsafe { &* arr.as_ptr().cast() } } - let mut h = RandomState::with_seeds( - 0x8596_cc44_bef0_1aa0, - 0x98d4_0948_da60_19ae, - 0x49f1_3013_c503_a6aa, - 0xc4d7_82ff_3c9f_7bef, - ).build_hasher(); - - ll_counts.hash(&mut h); - d_counts.hash(&mut h); - deflate_bytes(ll_lengths).hash(&mut h); - deflate_bytes(d_lengths).hash(&mut h); - - h.finish() -} + deflate_bytes(a) != deflate_bytes(b) +}*/ /// # Get RLE-Optimized Symbols. /// diff --git a/flapfli/src/zopflipng/rng.rs b/flapfli/src/zopflipng/rng.rs new file mode 100644 index 0000000..598d77b --- /dev/null +++ b/flapfli/src/zopflipng/rng.rs @@ -0,0 +1,125 @@ +/*! +# Flapfli: Ranges. +*/ + +use std::{ + num::{ + NonZeroU32, + NonZeroUsize, + }, + ops::Range, +}; +use super::{ + zopfli_error, + ZOPFLI_MASTER_BLOCK_SIZE, + ZopfliError, +}; + + + +#[derive(Debug, Clone, Copy)] +/// # Block Range. +/// +/// This struct exists primarily to guarantee a range is non-empty and no +/// larger than `ZOPFLI_MASTER_BLOCK_SIZE`. +/// +/// It also implements `Copy`, so there's that too! Haha. +pub(crate) struct ZopfliRange { + start: usize, + end: usize, +} + +impl ZopfliRange { + /// # New. + /// + /// Return a new instance spanning `start..end` so long as the struct's + /// requirements are met, otherwise an error. + pub(crate) const fn new(start: usize, end: usize) -> Result { + if start < end && end - start <= ZOPFLI_MASTER_BLOCK_SIZE { + Ok(Self { start, end }) + } + else { Err(zopfli_error!()) } + } + + /// # Update. + /// + /// Adjust the start and end positions of the range so long as the new + /// values satisfy the struct's requirements, otherwise an error. + pub(crate) fn set(&mut self, start: usize, end: usize) -> Result<(), ZopfliError> { + if start < end && end - start <= ZOPFLI_MASTER_BLOCK_SIZE { + self.start = start; + self.end = end; + Ok(()) + } + else { Err(zopfli_error!()) } + } +} + +impl ZopfliRange { + /// # Start. + pub(crate) const fn start(&self) -> usize { self.start } + + /// # End. + pub(crate) const fn end(&self) -> usize { self.end } + + /// # As (Traditional) Range. + pub(crate) const fn rng(&self) -> Range { self.start..self.end } + + #[allow(unsafe_code)] + /// # Length. 
+ pub(crate) const fn len(&self) -> NonZeroUsize { + // Safety: we verified start is less than end during construction. + unsafe { NonZeroUsize::new_unchecked(self.end - self.start) } + } + + #[allow(unsafe_code, clippy::cast_possible_truncation)] + /// # Length (32-bit). + /// + /// Same as `ZopfliRange::len`, but more convenient in cases where 32-bit + /// values are needed (such as cost/size calculations). + /// + /// Because our ranges are capped at a million, the lengths will always fit + /// without truncation. + pub(crate) const fn len32(&self) -> NonZeroU32 { + // Safety: we verified start is less than end during construction, and + // the total is within a million. + unsafe { NonZeroU32::new_unchecked((self.end - self.start) as u32) } + } +} + + + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn t_range() { + // Some simple bad ranges. + assert!(ZopfliRange::new(0, 0).is_err()); + assert!(ZopfliRange::new(3, 2).is_err()); + assert!(ZopfliRange::new(0, ZOPFLI_MASTER_BLOCK_SIZE + 1).is_err()); + + // This should fit! + assert!(ZopfliRange::new(0, ZOPFLI_MASTER_BLOCK_SIZE).is_ok()); + + // Let's test the getters. + let mut rng = ZopfliRange::new(1, 5).expect("Range failed!"); + assert_eq!(rng.start(), 1); + assert_eq!(rng.end(), 5); + assert_eq!(rng.len(), NonZeroUsize::new(4).unwrap()); + assert_eq!(rng.rng(), 1..5); + + // And the setters. + assert!(rng.set(2, 6).is_ok()); + assert_eq!(rng.start(), 2); + assert_eq!(rng.end(), 6); + assert_eq!(rng.len(), NonZeroUsize::new(4).unwrap()); + assert_eq!(rng.rng(), 2..6); + + // This should fail. + assert!(rng.set(0, 0).is_err()); + assert!(rng.set(3, 2).is_err()); + assert!(rng.set(0, ZOPFLI_MASTER_BLOCK_SIZE + 1).is_err()); + } +} diff --git a/flapfli/src/zopflipng/stats.rs b/flapfli/src/zopflipng/stats.rs index e96c0bf..8e4c18b 100644 --- a/flapfli/src/zopflipng/stats.rs +++ b/flapfli/src/zopflipng/stats.rs @@ -19,7 +19,8 @@ use super::{ #[derive(Clone, Copy)] /// # Randomness. /// -/// This struct is only used to cheaply randomize stat frequencies. +/// This struct is only used to cheaply (and predictably) shuffle stat +/// frequencies. pub(crate) struct RanState { m_w: u32, m_z: u32, @@ -50,12 +51,11 @@ impl RanState { #[derive(Clone, Copy)] /// # Symbol Stats. /// -/// This holds the length and distance symbols and costs for a given block, -/// data that can be used to improve compression on subsequent passes. +/// This holds the length and distance symbols and costs for a given block, +/// data which can be used to improve compression on subsequent passes. pub(crate) struct SymbolStats { ll_counts: ArrayLL, d_counts: ArrayD, - pub(crate) ll_symbols: ArrayLL, pub(crate) d_symbols: ArrayD, } @@ -74,70 +74,47 @@ impl SymbolStats { - /// # Add Previous Stats (Weighted). + /// # Crunch Symbols. /// - /// This is essentially an `AddAssign` for `ll_counts` and `d_counts`. Each - /// previous value is halved and added to the corresponding current value. - pub(crate) fn add_last( - &mut self, - ll_counts: &ArrayLL, - d_counts: &ArrayD, - ) { - for (l, r) in self.ll_counts.iter_mut().zip(ll_counts.iter().copied()) { - *l += r.wrapping_div(2); - } - for (l, r) in self.d_counts.iter_mut().zip(d_counts.iter().copied()) { - *l += r.wrapping_div(2); - } - - // Set the end symbol. - self.ll_counts[256] = 1; - } - - /// # Clear Frequencies.
+ /// This calculates the "entropy" of the `ll_counts` and `d_counts` — a + /// fancy way of saying the difference between the log2 of everything and + /// the log2 of self — storing the results in the corresponding symbol + /// arrays. /// - /// Set all `ll_counts` and `d_counts` to zero and return the originals. - pub(crate) fn clear(&mut self) -> (ArrayLL, ArrayD) { - ( - std::mem::replace(&mut self.ll_counts, ZEROED_COUNTS_LL), - std::mem::replace(&mut self.d_counts, ZEROED_COUNTS_D), - ) - } - - /// # Calculate/Set Statistics. - /// - /// This calculates the "entropy" of the `ll_counts` and `d_counts`, storing the - /// results in the corresponding symbols arrays. + /// Note: the symbols are only valid for the _current_ counts, but they + /// don't need to be rebuilt after each and every little change because + /// they're only ever referenced during `ZopfliState::optimal_run` passes; + /// so long as they're (re)crunched before that method is called, life is + /// grand. pub(crate) fn crunch(&mut self) { - #[allow(clippy::cast_precision_loss)] - fn calculate_entropy(count: &[u32; N], bitlengths: &mut [f64; N]) { - let sum = count.iter().sum::(); - - if sum == 0 { - let log2sum = (N as f64).log2(); - bitlengths.fill(log2sum); - } - else { - let log2sum = f64::from(sum).log2(); - - for (c, b) in count.iter().copied().zip(bitlengths.iter_mut()) { - if c == 0 { *b = log2sum; } - else { - *b = log2sum - f64::from(c).log2(); - if b.is_sign_negative() { *b = 0.0; } - } - } - } + // Distances first. + let sum = self.d_counts.iter().copied().sum::(); + let log2sum = + if sum == 0 { 5.0 } // 32.log2() + else { f64::from(sum).log2() }; + self.d_symbols.fill(log2sum); + for (c, b) in self.d_counts.iter().copied().zip(&mut self.d_symbols) { + if c != 0 { *b -= f64::from(c).log2(); } } - calculate_entropy(&self.ll_counts, &mut self.ll_symbols); - calculate_entropy(&self.d_counts, &mut self.d_symbols); + // Lengths second. + let sum = self.ll_counts.iter().copied().sum::(); + // Safety: ll_counts[256] is always 1 — (re)load_store and randomize + // both force it — so this sum will always be nonzero. + if sum == 0 { crate::unreachable(); } + let log2sum = f64::from(sum).log2(); + self.ll_symbols.fill(log2sum); + for (c, b) in self.ll_counts.iter().copied().zip(&mut self.ll_symbols) { + if c != 0 { *b -= f64::from(c).log2(); } + } } /// # Load Statistics. /// /// This updates the `ll_counts` and `d_counts` stats using the data from the - /// `ZopfliLZ77Store` store, then crunches the results. + /// `LZ77Store` store. + /// + /// Note: this does _not_ rebuild the symbol tables. pub(crate) fn load_store(&mut self, store: &LZ77Store) { for e in &store.entries { self.ll_counts[e.ll_symbol as usize] += 1; @@ -146,26 +123,67 @@ impl SymbolStats { // Set the end symbol and crunch. self.ll_counts[256] = 1; - self.crunch(); } /// # Randomize Stat Frequencies. /// /// This randomizes the stat frequencies to allow things to maybe turn out /// different on subsequent squeeze passes. + /// + /// For this to work properly, a single `RanState` must be used for all + /// iterations, and because shuffling advances the `RanState`, litlens must + /// be processed before distances. + /// + /// Yeah… this is super weird. Haha. + /// + /// Note: this does _not_ rebuild the symbol tables. 
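(Editorial aside, not part of the diff.) The `crunch` method above charges each symbol log2(total) minus log2(count) bits, with unused symbols paying the full log2(total). The sketch below restates that cost for an arbitrary histogram; the name `entropy_costs` is illustrative, and the real method special-cases an all-zero distance histogram with the precomputed log2(32) = 5.0.

    /// Per-symbol bit costs for a histogram of counts.
    fn entropy_costs(counts: &[u32]) -> Vec<f64> {
        let total: u32 = counts.iter().copied().sum();
        let log2sum =
            if total == 0 { (counts.len() as f64).log2() }
            else { f64::from(total).log2() };
        counts.iter()
            .map(|&c| if c == 0 { log2sum } else { log2sum - f64::from(c).log2() })
            .collect()
    }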
pub(crate) fn randomize(&mut self, state: &mut RanState) { - fn randomize_freqs(freqs: &mut [u32; N], state: &mut RanState) { - for i in 0..N { + fn shuffle_counts(counts: &mut [u32; N], state: &mut RanState) { + const { assert!(N == ZOPFLI_NUM_D || N == ZOPFLI_NUM_LL); } + for i in const { 0..N } { if (state.randomize() >> 4) % 3 == 0 { let index = state.randomize() as usize % N; - freqs[i] = freqs[index]; + counts[i] = counts[index]; } } } - randomize_freqs(&mut self.ll_counts, state); - randomize_freqs(&mut self.d_counts, state); + shuffle_counts(&mut self.ll_counts, state); // Lengths need to go first. + shuffle_counts(&mut self.d_counts, state); // Set the end symbol. self.ll_counts[256] = 1; } + + /// # Reload Store. + /// + /// Like `SymbolStats::load_store`, but reset or halve the counts first. + /// (Halving creates a sort of weighted average, useful once a few + /// iterations have occurred.) + /// + /// Note: this does _not_ rebuild the symbols. + pub(crate) fn reload_store(&mut self, store: &LZ77Store, weighted: bool) { + if weighted { + for c in &mut self.d_counts { *c /= 2; } + for c in &mut self.ll_counts { *c /= 2; } + } + else { + self.d_counts.fill(0); + self.ll_counts.fill(0); + } + + self.load_store(store); + } +} + + + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn t_d_log2() { + // Make sure we precomputed the 32.log2() correctly! + assert_eq!((ZOPFLI_NUM_D as f64).log2(), 5.0); + } } diff --git a/flapfli/src/zopflipng/symbols.rs b/flapfli/src/zopflipng/symbols.rs index 453b395..3eb8c21 100644 --- a/flapfli/src/zopflipng/symbols.rs +++ b/flapfli/src/zopflipng/symbols.rs @@ -10,16 +10,6 @@ via `build.rs`. // terrible DISTANCE_SYMBOLS and DISTANCE_VALUES lookup tables. include!(concat!(env!("OUT_DIR"), "/symbols.rs")); -/// # Distance Extra Bits (by Symbol). -/// -/// Note only the first `30` values have meaning, but the compiler doesn't -/// understand distances are only using 15 bits. Padding the table to `32` -/// helps eliminate superfluous bounds checks. -pub(crate) const DISTANCE_BITS: [u8; 32] = [ - 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, - 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 0, 0, -]; - /// # Length Symbols by Litlen. pub(crate) const LENGTH_SYMBOLS: [Lsym; 259] = [ Lsym::L000, Lsym::L000, Lsym::L000, @@ -57,21 +47,6 @@ pub(crate) const LENGTH_SYMBOLS: [Lsym; 259] = [ Lsym::L284, Lsym::L284, Lsym::L284, Lsym::L284, Lsym::L284, Lsym::L284, Lsym::L284, Lsym::L285, ]; -/// # Length Symbol Bits by Litlen. -pub(crate) const LENGTH_SYMBOL_BITS: [u8; 259] = [ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, -]; - /// # Length Symbol Bit Values by Litlen. 
pub(crate) const LENGTH_SYMBOL_BIT_VALUES: [u8; 259] = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, @@ -88,7 +63,11 @@ pub(crate) const LENGTH_SYMBOL_BIT_VALUES: [u8; 259] = [ ]; /// # Symbol Iterator. +/// +/// This trait exposes a single `all` method that returns an iterator over the +/// enum's variants. pub(crate) trait SymbolIteration>: Sized { + /// # Iterate All Variants! fn all() -> U; } @@ -127,7 +106,7 @@ impl LitLen { /// # Is Max? /// - /// Returns `true` if `self` is `Self::MAX_MATCH`. + /// Returns `true` if `self` is exactly `Self::MAX_MATCH`. pub(crate) const fn is_max(self) -> bool { matches!(self, Self::MAX_MATCH) } /// # Is Zero? @@ -146,6 +125,15 @@ impl LitLen { unsafe { std::mem::transmute::(n as u16) } } + #[allow(unsafe_code)] + /// # From U8+3. + /// + /// This reverses the work done by `LitLen::to_packed_u8`, returning the + /// `LitLen` equivalent of `n + 3`. + pub(crate) const fn from_packed_u8(n: u8) -> Self { + unsafe { std::mem::transmute::(n as u16 + 3) } + } + #[allow(unsafe_code)] /// # Min w/ U16. /// @@ -181,6 +169,20 @@ impl LitLen { pub(crate) const fn next_iter(after: Self) -> LitLenIter { LitLenIter(after as u16 + 1) } + + #[allow(clippy::cast_possible_truncation)] + /// # To Packed U8. + /// + /// This method packs (a matcheable) `LitLen` into a `u8` by subtracting + /// three. (This works because `LitLen::MAX_MATCH - 3 == u8::MAX`.) + /// + /// Values less than three shouldn't ever find their way here, but if they + /// do zero is returned. + pub(crate) const fn to_packed_u8(self) -> u8 { + let n = self as u16; + if 3 < n { (n - 3) as u8 } + else { 0 } + } } impl Lsym { @@ -196,12 +198,23 @@ impl Lsym { impl SplitLen { /// # Is Zero? + /// + /// Returns `true` if `self` is zero. pub(crate) const fn is_zero(self) -> bool { matches!(self, Self::S00) } /// # Is Max? + /// + /// Returns `true` if `self` is the maximum value (`SplitLen::S14`). pub(crate) const fn is_max(self) -> bool { matches!(self, Self::S14) } /// # Increment. + /// + /// Returns `self + 1`. + /// + /// ## Safety + /// + /// This would be UB if `self.is_max()`; the caller must explicitly check + /// that is not the case before incrementing. pub(crate) const fn increment(self) -> Self { #[allow(unsafe_code)] unsafe { @@ -209,7 +222,7 @@ impl SplitLen { // `split_lz77` and `split_raw` — both of which explicitly check // the current value, breaking their loops if/when the maximum is // reached. - if self.is_max() { core::hint::unreachable_unchecked(); } + if self.is_max() { crate::unreachable(); } // Safety: SplitLen has the same size and alignment as u8. std::mem::transmute::(self as u8 + 1) @@ -223,6 +236,18 @@ impl SplitLen { mod tests { use super::*; + #[test] + fn t_symbol_bits() { + // The DISTANCE_BITS/_F and LENGTH_SYMBOL_BITS/_F constants should have + // equivalent values. + for (f, i) in DISTANCE_BITS_F.iter().copied().zip(DISTANCE_BITS) { + assert_eq!(f, f64::from(i)); + } + for (f, i) in LENGTH_SYMBOL_BITS_F.iter().copied().zip(LENGTH_SYMBOL_BITS) { + assert_eq!(f, f64::from(i)); + } + } + #[test] /// # Deflate Symbol Size and Alignment. fn t_deflate_size_align() { diff --git a/justfile b/justfile index a3208e5..9766b99 100644 --- a/justfile +++ b/justfile @@ -105,7 +105,15 @@ export CXXFLAGS := "-Wall -Wextra -flto -march=x86-64-v3" # Make the docs. 
cargo rustdoc \ --release \ - --target-dir "{{ cargo_dir }}" + --manifest-path "{{ pkg_dir1 }}/Cargo.toml" \ + --target-dir "{{ cargo_dir }}" \ + -- --document-private-items + + cargo rustdoc \ + --release \ + --manifest-path "{{ pkg_dir2 }}/Cargo.toml" \ + --target-dir "{{ cargo_dir }}" \ + -- --document-private-items # Move the docs and clean up ownership. [ ! -d "{{ doc_dir }}" ] || rm -rf "{{ doc_dir }}" diff --git a/release/man/flaca.1 b/release/man/flaca.1 index 47fc9df..6685f30 100644 --- a/release/man/flaca.1 +++ b/release/man/flaca.1 @@ -1,6 +1,6 @@ -.TH "FLACA" "1" "June 2024" "Flaca v3.1.2" "User Commands" +.TH "FLACA" "1" "July 2024" "Flaca v3.1.3" "User Commands" .SH NAME -Flaca \- Manual page for flaca v3.1.2. +Flaca \- Manual page for flaca v3.1.3. .SH DESCRIPTION Brute\-force, lossless JPEG and PNG compression. .SS USAGE: diff --git a/skel/assets/pgo.b3 b/skel/assets/pgo.b3 index 9373934..675c524 100644 --- a/skel/assets/pgo.b3 +++ b/skel/assets/pgo.b3 @@ -10,6 +10,7 @@ e4ff0642f0f19d91b28125d166ad4691d66ea039896a70c45a1c40f9644b90b3 ./pgo/periodic 347c34094723a02f3b6432aa00de53ef0a47c8b735f4d8bb7a564f6890a08db9 ./pgo/smile.png 9ce27ed293bfc346b5f6a20b2e7fabb0f8d15f8fce6ac60b3669b56778fbe616 ./pgo/sr.png ed450bcfae1bb62505c9a6375b3458046942ba85b7cc9c7e12e10c906c0ecaae ./pgo/suck.png +772c152fa637e451ac1bf7352ec54ed91c824f42b0f1be4e39ceb3501e9cdb99 ./pgo/tiny.png 39db2fcb57e60439467a78b03cf9bf984e0f562ef1d676ee8537a4f71f669b52 ./pgo/ubuntu.png ba2e1f129bddbbf614553a028d24ff2d1d7b49b2d5eddf9542c1c32af8e7d739 ./pgo/venn256.png 5782a9a860dbccd78d2bb061e7facd599a5f603650187c4612dadbf316bd42fb ./pgo/venn2048.png diff --git a/skel/pgo/tiny.png b/skel/pgo/tiny.png new file mode 100644 index 0000000..0874f6a Binary files /dev/null and b/skel/pgo/tiny.png differ
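(Editorial aside, not part of the diff.) A closing note on the `LitLen::to_packed_u8` / `from_packed_u8` pair added in the symbols.rs changes above: match lengths span 3..=258, so subtracting the minimum match of three makes every one of them fit a `u8` exactly (258 - 3 == 255). The free functions below restate the round trip with plain integers; their names are illustrative only.

    /// Pack a match length (3..=258) into a byte by dropping the minimum match.
    fn pack_len(len: u16) -> u8 { (len.clamp(3, 258) - 3) as u8 }

    /// Recover the match length from its packed form.
    fn unpack_len(packed: u8) -> u16 { u16::from(packed) + 3 }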