diff --git a/CREDITS.md b/CREDITS.md index 0a5bc31..707e2cc 100644 --- a/CREDITS.md +++ b/CREDITS.md @@ -1,7 +1,7 @@ # Project Dependencies Package: flaca - Version: 3.1.2 - Generated: 2024-06-18 04:53:29 UTC + Version: 3.1.3 + Generated: 2024-07-07 07:18:53 UTC | Package | Version | Author(s) | License | | ---- | ---- | ---- | ---- | @@ -9,7 +9,7 @@ | [argyle](https://github.com/Blobfolio/argyle) | 0.7.2 | [Blobfolio, LLC.](mailto:hello@blobfolio.com) | WTFPL | | [bitvec](https://github.com/bitvecto-rs/bitvec) | 1.0.1 | | MIT | | [bytecount](https://github.com/llogiq/bytecount) | 0.6.8 | [Andre Bogus](mailto:bogusandre@gmail.de) and [Joshua Landau](mailto:joshua@landau.ws) | Apache-2.0 or MIT | -| [bytemuck](https://github.com/Lokathor/bytemuck) | 1.16.0 | [Lokathor](mailto:zefria@gmail.com) | Apache-2.0, MIT, or Zlib | +| [bytemuck](https://github.com/Lokathor/bytemuck) | 1.16.1 | [Lokathor](mailto:zefria@gmail.com) | Apache-2.0, MIT, or Zlib | | [cfg-if](https://github.com/alexcrichton/cfg-if) | 1.0.0 | [Alex Crichton](mailto:alex@alexcrichton.com) | Apache-2.0 or MIT | | [crc32fast](https://github.com/srijs/rust-crc32fast) | 1.4.2 | [Sam Rijs](mailto:srijs@airpost.net) and [Alex Crichton](mailto:alex@alexcrichton.com) | Apache-2.0 or MIT | | [crossbeam-channel](https://github.com/crossbeam-rs/crossbeam) | 0.5.13 | | Apache-2.0 or MIT | @@ -19,7 +19,7 @@ | [dowser](https://github.com/Blobfolio/dowser) | 0.9.1 | [Blobfolio, LLC.](mailto:hello@blobfolio.com) | WTFPL | | [equivalent](https://github.com/cuviper/equivalent) | 1.0.1 | | Apache-2.0 or MIT | | [fastrand](https://github.com/smol-rs/fastrand) | 2.1.0 | [Stjepan Glavina](mailto:stjepang@gmail.com) | Apache-2.0 or MIT | -| flapfli | 3.1.2 | [Josh Stoik](mailto:josh@blobfolio.com) | WTFPL | +| flapfli | 3.1.3 | [Josh Stoik](mailto:josh@blobfolio.com) | WTFPL | | [funty](https://github.com/myrrlyn/funty) | 2.0.0 | [myrrlyn](mailto:self@myrrlyn.dev) | MIT | | [fyi_msg](https://github.com/Blobfolio/fyi) | 0.13.6 | [Blobfolio, LLC.](mailto:hello@blobfolio.com) | WTFPL | | [hashbrown](https://github.com/rust-lang/hashbrown) | 0.14.5 | [Amanieu d'Antras](mailto:amanieu@gmail.com) | Apache-2.0 or MIT | @@ -27,11 +27,11 @@ | [libc](https://github.com/rust-lang/libc) | 0.2.155 | The Rust Project Developers | Apache-2.0 or MIT | | [libdeflate-sys](https://github.com/adamkewley/libdeflater) | 1.20.0 | [Adam Kewley](mailto:contact@adamkewley.com) | Apache-2.0 | | [libdeflater](https://github.com/adamkewley/libdeflater) | 1.20.0 | [Adam Kewley](mailto:contact@adamkewley.com) | Apache-2.0 | -| [log](https://github.com/rust-lang/log) | 0.4.21 | The Rust Project Developers | Apache-2.0 or MIT | +| [log](https://github.com/rust-lang/log) | 0.4.22 | The Rust Project Developers | Apache-2.0 or MIT | | [mozjpeg-sys](https://github.com/kornelski/mozjpeg-sys.git) | 2.2.0 | [Kornel](mailto:kornel@geekhood.net) | IJG AND Zlib AND BSD-3-Clause | | [oxipng](https://github.com/shssoichiro/oxipng) | 9.1.1 | [Joshua Holmer](mailto:jholmer.in@gmail.com) | MIT | | [radium](https://github.com/bitvecto-rs/radium) | 0.7.0 | [Nika Layzell](mailto:nika@thelayzells.com) and [myrrlyn](mailto:self@myrrlyn.dev) | MIT | -| [rgb](https://github.com/kornelski/rust-rgb) | 0.8.37 | [Kornel Lesiński](mailto:kornel@geekhood.net) | MIT | +| [rgb](https://github.com/kornelski/rust-rgb) | 0.8.40 | [Kornel Lesiński](mailto:kornel@geekhood.net) | MIT | | [rustc-hash](https://github.com/rust-lang-nursery/rustc-hash) | 1.1.0 | The Rust Project Developers | Apache-2.0 or MIT | | 
[tap](https://github.com/myrrlyn/tap) | 1.0.1 | [Elliott Linder](mailto:elliott.darfink@gmail.com) and [myrrlyn](mailto:self@myrrlyn.dev) | MIT | | [tempfile](https://github.com/Stebalien/tempfile) | 3.10.1 | [Steven Allen](mailto:steven@stebalien.com), The Rust Project Developers, [Ashley Mannix](mailto:ashleymannix@live.com.au), and [Jason White](mailto:me@jasonwhite.io) | Apache-2.0 or MIT | @@ -39,4 +39,4 @@ | [unicode-width](https://github.com/unicode-rs/unicode-width) | 0.1.13 | [kwantam](mailto:kwantam@gmail.com) and [Manish Goregaokar](mailto:manishsmail@gmail.com) | Apache-2.0 or MIT | | [write_atomic](https://github.com/Blobfolio/write_atomic) | 0.5.0 | [Blobfolio, LLC.](mailto:hello@blobfolio.com) | WTFPL | | [wyz](https://github.com/myrrlyn/wyz) | 0.5.1 | [myrrlyn](mailto:self@myrrlyn.dev) | MIT | -| [zerocopy](https://github.com/google/zerocopy) | 0.7.34 | [Joshua Liebow-Feeser](mailto:joshlf@google.com) | Apache-2.0, BSD-2-Clause, or MIT | +| [zerocopy](https://github.com/google/zerocopy) | 0.7.35 | [Joshua Liebow-Feeser](mailto:joshlf@google.com) | Apache-2.0, BSD-2-Clause, or MIT | diff --git a/flaca/Cargo.toml b/flaca/Cargo.toml index 878fd00..8ad3b50 100644 --- a/flaca/Cargo.toml +++ b/flaca/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "flaca" -version = "3.1.2" +version = "3.1.3" license = "WTFPL" authors = ["Josh Stoik <josh@blobfolio.com>"] edition = "2021" diff --git a/flaca/src/main.rs b/flaca/src/main.rs index 4f06bce..4f6f6fa 100644 --- a/flaca/src/main.rs +++ b/flaca/src/main.rs @@ -54,7 +54,6 @@ use dactyl::{ NiceElapsed, NiceU64, traits::{ - BytesToSigned, BytesToUnsigned, NiceInflection, }, @@ -140,9 +139,7 @@ fn _main() -> Result<(), FlacaError> { // Zopfli iterations. if let Some(n) = args.option(b"-z") { - let n = i32::btoi(n) - .filter(|n| n.is_positive()) - .ok_or(FlacaError::ZopfliIterations)?; + let n = u32::btou(n).ok_or(FlacaError::ZopfliIterations)?; flapfli::ZOPFLI_ITERATIONS.store(n, Relaxed); } diff --git a/flapfli/Cargo.toml b/flapfli/Cargo.toml index 51ee11d..9d796cd 100644 --- a/flapfli/Cargo.toml +++ b/flapfli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "flapfli" -version = "3.1.2" +version = "3.1.3" license = "WTFPL" authors = ["Josh Stoik <josh@blobfolio.com>"] edition = "2021" @@ -17,11 +17,6 @@ exclude = [ [dependencies] crc32fast = "=1.4.*" -dactyl = "0.7.*" - -[dependencies.ahash] -version = "=0.8.*" -default-features = false [build-dependencies] bindgen = "0.69.*" diff --git a/flapfli/build.rs b/flapfli/build.rs index be3b738..f15b696 100644 --- a/flapfli/build.rs +++ b/flapfli/build.rs @@ -20,6 +20,27 @@ const DISTANCE_EXTRA_BITS_MASK: [(u32, u32); 16] = [ (8193, 4095), (16_385, 8191), (32_769, 16_383), ]; +/// # Distance Extra Bits (by Symbol). +const DISTANCE_BITS: [u8; 32] = [ + 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, + 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 0, 0, +]; + +/// # Length Symbol Bits (by Litlen).
+const LENGTH_SYMBOL_BITS: [u8; 259] = [ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, +]; + const ZOPFLI_WINDOW_SIZE: u16 = 32_768; @@ -80,11 +101,12 @@ fn build_symbols() { use std::fmt::Write; let mut out = format!( - "{}{}{}{}{}{}", - NumEnum::new(0..19_u8, "Whackadoodle Deflate Indices.", "DeflateSym") + "{}{}{}{}{}{}{}", + NumEnum::new(0..19_u8, "Extended Deflate Indices.", "DeflateSym") .with_debug() .with_eq() .with_iter(), + NumEnum::new(0..16_u8, "Basic Deflate Indices.", "DeflateSymBasic").with_eq(), NumEnum::new(0..32_u16, "Distance Symbols.", "Dsym"), NumEnum::new(0..259_u16, "Lit/Lengths.", "LitLen").with_eq().with_iter(), NumEnum::new(0..286_u16, "Lit/Length Symbols.", "Lsym"), @@ -135,6 +157,26 @@ pub(crate) const DISTANCE_VALUES: &[u16; 32_768] = &["); } out.push_str("\n];\n"); + /// # Distance and length bits. + /// + /// Generate integer and float constants for our bit arrays. + fn bits_and_bobs(title: &str, name: &str, arr: [u8; N]) -> String { + format!( + "/// # {title}. +pub(crate) const {name}: [u8; {N}] = {arr:?}; + +/// # {title} (Float). +/// +/// This is identical to the `u8` version, but avoids a lot of `f64::from` calls. +pub(crate) const {name}_F: [f64; {N}] = {:?}; +", + arr.map(f64::from), + ) + } + + out.push_str(&bits_and_bobs("Distance Bits (by Symbol)", "DISTANCE_BITS", DISTANCE_BITS)); + out.push_str(&bits_and_bobs("Length Bits (by Symbol)", "LENGTH_SYMBOL_BITS", LENGTH_SYMBOL_BITS)); + // Save it! write(&out_path("symbols.rs"), out.as_bytes()); } @@ -328,6 +370,7 @@ pub(crate) struct {name}Iter({kind}); impl Iterator for {name}Iter {{ type Item = {name}; + fn next(&mut self) -> Option {{ let old = self.0; if old < {end} {{ @@ -337,6 +380,11 @@ impl Iterator for {name}Iter {{ }} else {{ None }} }} + + fn size_hint(&self) -> (usize, Option) {{ + let len = self.len(); + (len, Some(len)) + }} }} impl ExactSizeIterator for {name}Iter {{ diff --git a/flapfli/src/deflate.rs b/flapfli/src/deflate.rs new file mode 100644 index 0000000..05e74d2 --- /dev/null +++ b/flapfli/src/deflate.rs @@ -0,0 +1,351 @@ +/*! +# Flapfli: Deflate. + +This module contains the custom lodepng callback (that uses zopfli), and +supporting components. +*/ + +use std::{ + cell::RefCell, + ffi::{ + c_uchar, + c_uint, + }, + num::{ + NonZeroUsize, + NonZeroU32, + }, + sync::atomic::Ordering::Relaxed, +}; +use super::{ + deflate_part, + ffi::flapfli_allocate, + lodepng::LodePNGCompressSettings, + ZOPFLI_ITERATIONS, + ZOPFLI_MASTER_BLOCK_SIZE, + ZopfliChunk, + ZopfliState, +}; + + + +#[allow(unsafe_code)] +/// # Twenty is Non-Zero. +const NZ20: NonZeroU32 = unsafe { NonZeroU32::new_unchecked(20) }; + +#[allow(unsafe_code)] +/// # Sixty is Non-Zero. 
+const NZ60: NonZeroU32 = unsafe { NonZeroU32::new_unchecked(60) }; + +#[allow(unsafe_code)] +/// # Max Iterations. +const MAX_ITERATIONS: NonZeroU32 = unsafe { NonZeroU32::new_unchecked(i32::MAX as u32) }; + + + +#[no_mangle] +#[allow(unsafe_code)] +/// # Custom PNG Deflate. +/// +/// This is a custom deflate callback for lodepng. When set, image blocks are +/// compressed using zopfli instead of basic-ass deflate. +/// +/// Zopfli is a monster, though, so this is only actually used for the final +/// pass. (Brute force strategizing uses cheaper compression.) +/// +/// Following C convention, this returns `0` for success, `1` for sadness. +/// +/// ## Safety +/// +/// The mutable pointers may or may not initially be null. Allocations are +/// handled on the Rust side, though, and those methods are aware of the fact +/// and will later act (or not act) on these pointer accordingly. +/// +/// The `arr`/`insize` values, on the other hand, _should_ definitely be +/// initialized and valid. We can't verify that, but their existence is the +/// whole point of this callback, so it's probably fine… +/// +/// Flaca processes images in parallel, but the lodepng/zopfli operations are +/// single-threaded. (All work for a given image happens on a single thread.) +/// This is why we can leverage local statics like `STATE` without fear of +/// access contention. +pub(crate) extern "C" fn flaca_png_deflate( + out: *mut *mut c_uchar, + outsize: *mut usize, + arr: *const c_uchar, + insize: usize, + _settings: *const LodePNGCompressSettings, +) -> c_uint { + thread_local!( + static STATE: RefCell> = RefCell::new(ZopfliState::new()) + ); + + // Group the pointer crap to cut down on the number of args being + // passed around. + let mut dst = ZopfliOut { + bp: 0, + out, + outsize, + }; + + // Make a proper slice out of the data. + let arr = unsafe { std::slice::from_raw_parts(arr, insize) }; + + // Figure out how many iterations to use. + let numiterations = NonZeroU32::new(ZOPFLI_ITERATIONS.load(Relaxed)).map_or( + if arr.len() < 200_000 { NZ60 } else { NZ20 }, + |custom| NonZeroU32::min(custom, MAX_ITERATIONS) + ); + + // Compress in chunks, à la ZopfliDeflate. + for chunk in DeflateIter::new(arr) { + #[cfg(not(debug_assertions))] + if STATE.with_borrow_mut(|state| deflate_part( + state, + numiterations, + chunk.total_len().get() == arr.len(), + chunk, + &mut dst, + )).is_err() { return 1; }; + + #[cfg(debug_assertions)] + if let Err(e) = STATE.with_borrow_mut(|state| deflate_part( + state, + numiterations, + chunk.total_len().get() == arr.len(), + chunk, + &mut dst, + )) { panic!("{e}"); }; + } + + // All clear! + 0 +} + + + +/// # Lodepng Output Pointers. +/// +/// This struct serves as a convenience wrapper for the various lodepng/zopfli +/// output pointers, saving us the trouble of passing each of them individually +/// down the rabbit hole. +/// +/// This struct also enables us to centralize the convoluted bit-writing +/// methods used to record data, minimizing — as much as possible — the use of +/// `unsafe` everywhere else. +pub(super) struct ZopfliOut { + bp: u8, + out: *mut *mut u8, + outsize: *mut usize, +} + +impl ZopfliOut { + #[allow(unsafe_code)] + #[inline] + /// # Append Data. + /// + /// This adds a single byte to the output array, re-allocating as + /// necessary. The `outsize` value is incremented accordingly. + /// + /// In practice, most data is written bit-by-bite rather than byte-by-byte. 
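The iteration fallback used by this callback (sixty iterations for inputs under 200,000 bytes, twenty otherwise, with explicit values clamped to `i32::MAX`) can be sketched as a standalone function. `pick_iterations` here is illustrative only, not part of the crate:

use std::num::NonZeroU32;

/// Mirror of the selection logic in `flaca_png_deflate`: a non-zero user
/// preference wins (clamped to `i32::MAX`), otherwise small inputs get 60
/// iterations and large ones get 20.
fn pick_iterations(user_pref: u32, input_len: usize) -> NonZeroU32 {
    let cap = NonZeroU32::new(i32::MAX as u32).unwrap();
    let fallback = NonZeroU32::new(if input_len < 200_000 { 60 } else { 20 }).unwrap();
    match NonZeroU32::new(user_pref) {
        Some(custom) => custom.min(cap),
        None => fallback,
    }
}

fn main() {
    assert_eq!(pick_iterations(0, 100_000).get(), 60); // small default
    assert_eq!(pick_iterations(0, 500_000).get(), 20); // large default
    assert_eq!(pick_iterations(5, 500_000).get(), 5);  // explicit -z 5
}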
+ /// As such, most calls to this method simply write a zero and bit-OR it a + /// few times afterwards. + fn append_data(&mut self, value: u8) { + #[cold] + /// # Allocate. + /// + /// Re/allocation is (potentially) necessary whenever `outsize` reaches + /// a power of two, but since that value represents the length written + /// rather than the actual capacity, this is often a no-op (after some + /// checking). + /// + /// As such, we don't want all this stuff affecting the compiler's + /// inlining decisions, hence the cold wrapper. + unsafe fn alloc_cold(ptr: *mut u8, size: usize) -> *mut u8 { + flapfli_allocate( + ptr, + NonZeroUsize::new(size * 2).unwrap_or(NonZeroUsize::MIN), + ) + } + + unsafe { + // Dereference the size once to save some sanity. + let size = *self.outsize; + + // (Re)allocate if size is a power of two, or empty. + if 0 == (size & size.wrapping_sub(1)) { + *self.out = alloc_cold(*self.out, size); + } + + // Write the value and bump the outside length counter. + (*self.out).add(size).write(value); + self.outsize.write(size + 1); + } + } +} + +impl ZopfliOut { + #[allow(clippy::doc_markdown)] + #[inline] + /// # Add Bit. + /// + /// This adds a single bit to the output array. When the internal `bp` + /// counter is zero that bit gets added on top of a new zero byte, + /// otherwise it is ORed on top of the last one. + pub(crate) fn add_bit(&mut self, bit: u8) { + if self.bp == 0 { self.append_data(0); } + #[allow(unsafe_code)] + unsafe { + // Safety: `append_data` writes a byte to `outsize` and then + // increments it, so to reach and modify that same position we need + // to use `outsize - 1` instead. + *(*self.out).add(*self.outsize - 1) |= bit << self.bp; + } + self.bp = self.bp.wrapping_add(1) & 7; + } + + /// # Add Multiple Bits. + /// + /// This method is used to write multiple bits — `length` of them — at + /// once, shifting on each pass. + pub(crate) fn add_bits(&mut self, symbol: u32, length: u32) { + for i in 0..length { + let bit = (symbol >> i) & 1; + self.add_bit(bit as u8); + } + } + + #[inline] + /// # Add Multiple Bits. + /// + /// Same as `ZopfliOut::add_bits`, but optimized for lengths known at + /// compile-time. + /// + /// ## Panics + /// + /// This will panic at compile-time if `N` is less than two. + pub(crate) fn add_fixed_bits(&mut self, symbol: u32) { + const { assert!(1 < N); } + for i in const { 0..N } { + let bit = (symbol >> i) & 1; + self.add_bit(bit as u8); + } + } + + #[inline] + /// # Add Type Bits Header. + /// + /// This writes the three-bit block type header. In practice, there are + /// only three possible values: + /// * 0 for uncompressed; + /// * 1 for fixed; + /// * 2 for dynamic; + pub(crate) fn add_header(&mut self, last_block: bool) { + self.add_bit(u8::from(last_block)); + self.add_bit(const { BLOCK_BIT & 1 }); + self.add_bit(const { (BLOCK_BIT & 2) >> 1 }); + } + + /// # Add Huffman Bits. + /// + /// Same as `ZopfliOut::add_bits`, but the bits are written in the + /// reverse order to keep life interesting. + pub(crate) fn add_huffman_bits(&mut self, symbol: u32, length: u32) { + // Same as add_bits, except we're doing it backwards. + for i in (0..length).rev() { + let bit = (symbol >> i) & 1; + self.add_bit(bit as u8); + } + } + + #[allow(clippy::cast_possible_truncation)] + /// # Add Non-Compressed Block. + /// + /// As one might suspect, uncompressed blocks are virtually never smaller + /// than compressed blocks, so this method is included more for + /// completeness than anything else. + /// + /// But who knows? 
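The `ZopfliOut` bit-writing scheme is easier to see with an owned buffer. The sketch below (a hypothetical `BitWriter`, not the crate's type) packs bits into each byte from the least-significant position up and writes Huffman codes most-significant bit first, as the methods above describe:

/// Vec-backed sketch of the `ZopfliOut` bit-writing scheme: bits fill each
/// byte from the least-significant position up, and Huffman codes go in
/// most-significant bit first.
struct BitWriter {
    out: Vec<u8>,
    bp: u8, // next bit position within the current byte (0..=7)
}

impl BitWriter {
    fn new() -> Self { Self { out: Vec::new(), bp: 0 } }

    /// Append one bit, starting a fresh byte whenever `bp` has wrapped to zero.
    fn add_bit(&mut self, bit: u8) {
        if self.bp == 0 { self.out.push(0); }
        *self.out.last_mut().unwrap() |= bit << self.bp;
        self.bp = (self.bp + 1) & 7;
    }

    /// Append `length` bits of `symbol`, least-significant first.
    fn add_bits(&mut self, symbol: u32, length: u32) {
        for i in 0..length { self.add_bit(((symbol >> i) & 1) as u8); }
    }

    /// Append `length` bits of `symbol` in reverse (Huffman) order.
    fn add_huffman_bits(&mut self, symbol: u32, length: u32) {
        for i in (0..length).rev() { self.add_bit(((symbol >> i) & 1) as u8); }
    }
}

fn main() {
    let mut w = BitWriter::new();
    w.add_bits(0b101, 3);         // lands in bits 0..3
    w.add_huffman_bits(0b110, 3); // reversed, lands in bits 3..6
    assert_eq!(w.out, vec![0b0001_1101]);
}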
+ /// + /// Implementation-wise, this requires no statistical data; it merely + /// loops through the raw data in chunks of `u16::MAX`, writes some + /// header/size data, then copies the bytes over. + pub(crate) fn add_uncompressed_block( + &mut self, + last_block: bool, + chunk: ZopfliChunk<'_>, + ) { + // We need to proceed u16::MAX bytes at a time. + let iter = chunk.block().chunks(usize::from(u16::MAX)); + let len = iter.len() - 1; + for (i, block) in iter.enumerate() { + let blocksize = block.len(); + let nlen = ! blocksize; + let really_last_block = i == len; + + // Each chunk gets its own header. + self.add_header::<0>(last_block && really_last_block); + + // Ignore bits of input up to the next byte boundary. + self.bp = 0; + + // Some size details. + self.append_data((blocksize % 256) as u8); + self.append_data((blocksize.wrapping_div(256) % 256) as u8); + self.append_data((nlen % 256) as u8); + self.append_data((nlen.wrapping_div(256) % 256) as u8); + + // And finally the data! + for byte in block.iter().copied() { self.append_data(byte); } + } + } +} + + + +/// # Deflate Chunk Iterator. +/// +/// Zopfli processes image data in chunks of (up to) a million bytes, but for +/// some reason it needs to see any previously-seen data on each pass too. +/// +/// This iterator thus yields increasingly larger slices of `arr`, until +/// eventually the whole thing is returned. The internal `pos` value tracks the +/// start of the "active" portion. +/// +/// See `ZopfliChunk` for more information. Haha. +struct DeflateIter<'a> { + arr: &'a [u8], + pos: usize, +} + +impl<'a> Iterator for DeflateIter<'a> { + type Item = ZopfliChunk<'a>; + + fn next(&mut self) -> Option { + if self.pos < self.arr.len() { + let pos = self.pos; + let chunk = self.arr.get(..pos + ZOPFLI_MASTER_BLOCK_SIZE).unwrap_or(self.arr); + self.pos = chunk.len(); + ZopfliChunk::new(chunk, pos).ok() + } + else { None } + } + + fn size_hint(&self) -> (usize, Option) { + let len = self.len(); + (len, Some(len)) + } +} + +impl<'a> ExactSizeIterator for DeflateIter<'a> { + fn len(&self) -> usize { + (self.arr.len() - self.pos).div_ceil(ZOPFLI_MASTER_BLOCK_SIZE) + } +} + +impl<'a> DeflateIter<'a> { + /// # New. + const fn new(arr: &'a [u8]) -> Self { + Self { arr, pos: 0 } + } +} diff --git a/flapfli/src/ffi.rs b/flapfli/src/ffi.rs index ab2afd9..b766150 100644 --- a/flapfli/src/ffi.rs +++ b/flapfli/src/ffi.rs @@ -1,7 +1,7 @@ /*! # Flapfli: FFI Image Wrapper. -This module contains custom allocation wrappers for `lodepng`, allowing Rust +This module contains custom allocation wrappers for lodepng, allowing Rust to (more or less) manage the memory. */ @@ -14,6 +14,7 @@ use std::{ realloc, }, ffi::c_void, + num::NonZeroUsize, ops::Deref, ptr::NonNull, }; @@ -28,10 +29,13 @@ const USIZE_SIZE: usize = std::mem::size_of::(); #[derive(Debug)] /// # Encoded Image. /// -/// This is a convenience wrapper for an image encoded by `lodepng`, allowing +/// This is a convenience wrapper for an image encoded by lodepng, allowing /// for easy slice dereferencing and automatic drop cleanup. /// -/// Note the initial state is null/empty. +/// Note the initial state will be null/empty. +/// +/// Allocations are handled by Rust, at least, and are aware of that fact so +/// will act (or not act) on the pointers accordingly. pub struct EncodedPNG { /// # Buffer. 
pub(crate) buf: *mut u8, @@ -46,7 +50,7 @@ impl Deref for EncodedPNG { #[allow(unsafe_code)] #[inline] fn deref(&self) -> &Self::Target { - if self.is_empty() { &[] } + if self.is_null() { &[] } else { unsafe { std::slice::from_raw_parts(self.buf, self.size) } } @@ -70,10 +74,14 @@ impl EncodedPNG { } } - /// # Is Empty? + /// # Is Null? + /// + /// This is essentially an `is_empty`, returning `true` if the length value + /// is zero or the buffer pointer is literally null. /// - /// Returns true if the instance is empty. - fn is_empty(&self) -> bool { self.size == 0 || self.buf.is_null() } + /// (The name was chosen to help avoid conflicts with dereferenced slice + /// methods.) + pub(crate) fn is_null(&self) -> bool { self.size == 0 || self.buf.is_null() } } @@ -83,17 +91,20 @@ impl EncodedPNG { /// # (Re)Allocate! /// /// Allocate (or reallocate) and return a new pointer for `size` bytes that can -/// be used by the crate or `lodepng` or both. +/// be used by the crate or lodepng or both. /// /// Since C can't be trusted to keep track of allocation sizes, we use the same /// trick the [`libdeflater`](https://github.com/adamkewley/libdeflater/blob/master/src/malloc_wrapper.rs) crate does; -/// we over-allocate by `size_of::()` bytes, use that extra space to -/// hold the length details, and return the rest so the caller gets what it -/// expects. +/// we over-allocate by `size_of::()` bytes, using that extra space to +/// hold the length details. +/// +/// The caller then gets `ptr.add(size_of::())` sized as they expect it +/// to be, and when that pointer is returned to us, we can subtract the same +/// amount to find the length. Rinse and repeat. /// /// This still requires a lot of unsafe, but at least it lives on this side of /// the FFI divide! -pub(crate) unsafe fn flapfli_allocate(ptr: *mut u8, new_size: usize) -> *mut u8 { +pub(crate) unsafe fn flapfli_allocate(ptr: *mut u8, new_size: NonZeroUsize) -> *mut u8 { let real_ptr = // If null, allocate it fresh. if ptr.is_null() { @@ -105,26 +116,23 @@ pub(crate) unsafe fn flapfli_allocate(ptr: *mut u8, new_size: usize) -> *mut u8 // Otherwise resize! else { let (real_ptr, old_size) = size_and_ptr(ptr); - realloc(real_ptr, layout_for(old_size), new_size + USIZE_SIZE) + // Return it as-was if the allocation is already sufficient. + if old_size >= new_size { return ptr; } + realloc(real_ptr, layout_for(old_size), USIZE_SIZE + new_size.get()) }; // Safety: the layout is aligned to usize. - real_ptr.cast::().write(new_size); // Write the length. - real_ptr.add(USIZE_SIZE) // Return the rest. + real_ptr.cast::().write(new_size.get()); // Write the length. + real_ptr.add(USIZE_SIZE) // Return the rest. } #[allow(unsafe_code, clippy::inline_always)] #[inline(always)] -/// # (Re)Allocate! +/// # Freedom! /// -/// Allocate (or reallocate) and return a new pointer for `size` bytes that can -/// be used by the crate or C or both. -/// -/// The trick — courtesy of the [`libdeflater`](https://github.com/adamkewley/libdeflater/blob/master/src/malloc_wrapper.rs) crate — -/// is we over-allocate by `size_of::()`, using that extra space to hold -/// the length so that later on, we can de- or re-allocate correctly. -/// -/// This still requires a lot of unsafe, but at least that unsafe lives here! +/// This method deallocates a pointer previously allocated by +/// `flapfli_allocate`. Refer to that method's documentation for the how and +/// why. pub(crate) unsafe fn flapfli_free(ptr: *mut u8) { if ! 
ptr.is_null() { let (ptr, size) = size_and_ptr(ptr); @@ -136,23 +144,36 @@ pub(crate) unsafe fn flapfli_free(ptr: *mut u8) { #[no_mangle] #[allow(unsafe_code)] -/// # Free Willy. +/// # Lodepng-specific Free. +/// +/// This override allows lodepng to use `flapfli_free` for pointer +/// deallocation. unsafe extern "C" fn lodepng_free(ptr: *mut c_void) { flapfli_free(ptr.cast()); } #[no_mangle] #[allow(unsafe_code)] /// # Lodepng-specific Malloc. /// -/// This is the same as ours, but casts to `c_void` for the ABI. +/// This override allows lodepng to use `flapfli_allocate` for pointer +/// allocation. unsafe extern "C" fn lodepng_malloc(size: usize) -> *mut c_void { - flapfli_allocate(std::ptr::null_mut(), size).cast() + flapfli_allocate( + std::ptr::null_mut(), + NonZeroUsize::new(size).unwrap_or(NonZeroUsize::MIN), + ).cast() } #[no_mangle] #[allow(unsafe_code)] -/// # Re-allocate! +/// # Lodepng-specific Realloc. +/// +/// This override allows lodepng to use `flapfli_allocate` for pointer +/// resizing. unsafe extern "C" fn lodepng_realloc(ptr: *mut c_void, new_size: usize) -> *mut c_void { - flapfli_allocate(ptr.cast(), new_size).cast() + flapfli_allocate( + ptr.cast(), + NonZeroUsize::new(new_size).unwrap_or(NonZeroUsize::MIN), + ).cast() } @@ -163,8 +184,8 @@ unsafe extern "C" fn lodepng_realloc(ptr: *mut c_void, new_size: usize) -> *mut /// /// This returns an appropriately sized and aligned layout with room at the /// beginning to hold our "secret" length information. -const unsafe fn layout_for(size: usize) -> Layout { - Layout::from_size_align_unchecked(USIZE_SIZE + size, std::mem::align_of::()) +const unsafe fn layout_for(size: NonZeroUsize) -> Layout { + Layout::from_size_align_unchecked(USIZE_SIZE + size.get(), std::mem::align_of::()) } #[allow(unsafe_code, clippy::cast_ptr_alignment, clippy::inline_always)] @@ -172,10 +193,12 @@ const unsafe fn layout_for(size: usize) -> Layout { /// # Derive Real Pointer and User Size. /// /// This method takes the `size`-sized pointer shared with the rest of the -/// crate (and `lodepng`) and converts it to the "real" one containing the -/// extra length information, returning it along with said length. -const unsafe fn size_and_ptr(ptr: *mut u8) -> (*mut u8, usize) { +/// crate (and lodepng) and converts it to the "real" one (with the leading +/// length details), returning it and the logical size (i.e. minus eight bytes +/// or whatever). +const unsafe fn size_and_ptr(ptr: *mut u8) -> (*mut u8, NonZeroUsize) { let size_and_data_ptr = ptr.sub(USIZE_SIZE); - let size = *(size_and_data_ptr as *const usize); + // Safety: the size is written from a NonZeroUsize. + let size = NonZeroUsize::new_unchecked(*(size_and_data_ptr as *const usize)); (size_and_data_ptr, size) } diff --git a/flapfli/src/lib.rs b/flapfli/src/lib.rs index 300ef65..15e9407 100644 --- a/flapfli/src/lib.rs +++ b/flapfli/src/lib.rs @@ -34,23 +34,128 @@ heavily optimized flaca's specific use cases (hence "fla" + "pfli"). clippy::redundant_pub_crate, )] +mod deflate; mod ffi; mod lodepng; mod zopflipng; use ffi::EncodedPNG; -pub use zopflipng::optimize; - -use std::sync::atomic::AtomicI32; +use lodepng::{ + DecodedImage, + LodePNGColorType, + LodePNGFilterStrategy, + LodePNGState, +}; +use std::sync::atomic::AtomicU32; use zopflipng::{ deflate_part, - reset_dynamic_length_cache, - SplitPoints, ZOPFLI_MASTER_BLOCK_SIZE, + ZopfliChunk, ZopfliState, }; /// # Number of Zopfli Iterations. 
-pub static ZOPFLI_ITERATIONS: AtomicI32 = AtomicI32::new(0); +/// +/// A non-zero value indicates a fixed user preference (capped at `i32::MAX`, +/// though anything above a few thousand is usually terrible). If zero, the +/// number of iterations will vary by file size. +/// +/// This is only actually written to once, if ever, but is atomic to make it +/// easier to read the value from within the callback. (That callback is Rust, +/// but called from C.) +pub static ZOPFLI_ITERATIONS: AtomicU32 = AtomicU32::new(0); + +#[must_use] +/// # Optimize! +/// +/// This will attempt to losslessly recompress the source PNG with the +/// strongest Zopfli filter strategy, and return a new PNG image if the result +/// is smaller than the original. +/// +/// Note: 16-bit transformations are not lossless; such images will have their +/// bit depths reduced to a more typical 8 bits. +pub fn optimize(src: &[u8]) -> Option { + // Start by decoding the source. + let mut dec = LodePNGState::default(); + let img = dec.decode(src)?; + + // Find the right strategy. + let mut enc = LodePNGState::encoder(&dec)?; + let mut out = EncodedPNG::new(); + let strategy = best_strategy(&img, &mut enc, &mut out); + + // Now re-re-encode with zopfli and the best strategy. + enc.set_strategy(strategy); + enc.set_zopfli(); + if enc.encode(&img, &mut out) { + // For really small images, we might be able to save even more by + // nuking the palette. + if out.size < 4096 && LodePNGColorType::LCT_PALETTE.is_match(&out) { + if let Some(out2) = enc.try_small(&img) { + if out2.size < out.size && out2.size < src.len() { + // We improved again! + return Some(out2); + } + } + } + + // We improved! + if out.size < src.len() { return Some(out); } + } + + None +} + +#[track_caller] +#[allow(unsafe_code)] +/// # Unreachable Hint. +/// +/// This is a simple unreachability wrapper that calls `unreachable!` when +/// debug assertions are enabled, or the quieter `hint::unreachable_unchecked` +/// when not. +/// +/// Especially since the latter is unsafe, this helps prevent the compiler +/// from making stupid inlining decisions in hot blocks. Haha. +pub(crate) const fn unreachable() { + #[cfg(debug_assertions)] unreachable!(); + #[cfg(not(debug_assertions))] unsafe { core::hint::unreachable_unchecked(); } +} + + + +/// # Best Strategy. +/// +/// This re-encodes the image (quickly) using each strategy, returning +/// whichever produced the smallest output. +/// +/// Skipping zopfli here saves _a ton_ of processing time and (almost) never +/// changes the answer, so it's a shortcut worth taking. +fn best_strategy( + img: &DecodedImage, + enc: &mut LodePNGState, + out: &mut EncodedPNG, +) -> LodePNGFilterStrategy { + let mut best_size = usize::MAX; + let mut best_strategy = LodePNGFilterStrategy::LFS_ZERO; + + for strategy in [ + LodePNGFilterStrategy::LFS_ZERO, + LodePNGFilterStrategy::LFS_ONE, + LodePNGFilterStrategy::LFS_TWO, + LodePNGFilterStrategy::LFS_THREE, + LodePNGFilterStrategy::LFS_FOUR, + LodePNGFilterStrategy::LFS_MINSUM, + LodePNGFilterStrategy::LFS_ENTROPY, + LodePNGFilterStrategy::LFS_BRUTE_FORCE, + ] { + enc.set_strategy(strategy); + if enc.encode(img, out) && out.size < best_size { + best_size = out.size; + best_strategy = strategy; + } + } + + best_strategy +} diff --git a/flapfli/src/lodepng.rs b/flapfli/src/lodepng.rs index 61449ef..b20d608 100644 --- a/flapfli/src/lodepng.rs +++ b/flapfli/src/lodepng.rs @@ -6,30 +6,17 @@ This module contains FFI bindings to `lodepng.c`. 
#![allow(non_camel_case_types, non_upper_case_globals)] -use crate::{ - ffi::{ - flapfli_allocate, - flapfli_free, - }, - ZOPFLI_ITERATIONS, -}; use std::{ - cell::RefCell, ffi::{ c_uchar, c_uint, }, mem::MaybeUninit, - ops::Range, - sync::atomic::Ordering::Relaxed, }; use super::{ - deflate_part, + deflate::flaca_png_deflate, EncodedPNG, - reset_dynamic_length_cache, - SplitPoints, - ZopfliState, - ZOPFLI_MASTER_BLOCK_SIZE, + ffi::flapfli_free, }; @@ -37,82 +24,17 @@ use super::{ // Generated by build.rs. include!(concat!(env!("OUT_DIR"), "/lodepng-bindgen.rs")); -thread_local!( - static STATES: RefCell<(ZopfliState, SplitPoints)> = RefCell::new(( - ZopfliState::new(), - SplitPoints::new(), - )) -); - - - -#[no_mangle] -#[allow(unsafe_code)] -/// # Custom PNG Deflate. -/// -/// This tells lodepng to use zopfli for encoding. -pub(crate) extern "C" fn flaca_png_deflate( - out: *mut *mut c_uchar, - outsize: *mut usize, - arr: *const c_uchar, - insize: usize, - _settings: *const LodePNGCompressSettings, -) -> c_uint { - // Figure out how many iterations to use. - let mut numiterations = ZOPFLI_ITERATIONS.load(Relaxed); - if numiterations <= 0 { - numiterations = if insize < 200_000 { 60 } else { 20 }; - } - - // Compact the pointers. - let mut dst = ZopfliOut { - bp: 0, - out, - outsize, - }; - - // Compress in chunks, à la ZopfliDeflate. - reset_dynamic_length_cache(); - let mut i: usize = 0; - while i < insize { - // Each pass needs to know if it is the last, and how much data to - // handle. - let (last_part, size) = - if i + ZOPFLI_MASTER_BLOCK_SIZE >= insize { (true, insize - i) } - else { (false, ZOPFLI_MASTER_BLOCK_SIZE) }; - - // Crunch the part! - let res = STATES.with_borrow_mut(|(state, splits)| deflate_part( - state, - splits, - numiterations, - last_part, - unsafe { std::slice::from_raw_parts(arr, i + size) }, - i, - &mut dst, - )); - - #[cfg(debug_assertions)] if let Err(e) = res { panic!("{e}"); } - - // Errors shouldn't be possible, but if something happens to go wrong, - // return one so lodepng can abandon its efforts. - if res.is_err() { return 1; } - - // Onward and upward! - i += size; - } - // Errors panic, so if we're here everything must be fine. - 0 -} #[no_mangle] #[inline(always)] #[allow(unsafe_code, clippy::inline_always)] /// # Lodepng CRC32. /// -/// Replace lodepng's native CRC32 hashing method with Rust's (faster) -/// `crc32fast`. +/// This override allows lodepng to use `crc32fast` for CRC hashing. +/// +/// Note: this is more about relative safety than performance; CRC processing +/// times are negligible compared to everything else. Haha. pub(crate) extern "C" fn lodepng_crc32(buf: *const c_uchar, len: usize) -> c_uint { let mut h = crc32fast::Hasher::new(); h.update(unsafe { std::slice::from_raw_parts(buf, len) }); @@ -123,6 +45,10 @@ pub(crate) extern "C" fn lodepng_crc32(buf: *const c_uchar, len: usize) -> c_uin #[derive(Debug)] /// # Decoded Image. +/// +/// This is a simple wrapper holding a pointer to a decoded image along with +/// the image dimensions. It enables us to hold one thing instead of three +/// while also ensuring the memory is freed correctly on drop. pub(super) struct DecodedImage { pub(super) buf: *mut c_uchar, pub(super) w: c_uint, @@ -137,111 +63,6 @@ impl Drop for DecodedImage { } } - - -/// # Lodepng Output Pointers. 
-/// -/// This struct provides a wrapper around the lingering bit-writing zopfli C -/// methods, saving us the trouble of having to pass down three different -/// pointers (and using a bunch of unsafe blocks) just to get the data saved. -pub(super) struct ZopfliOut { - bp: u8, - out: *mut *mut u8, - outsize: *mut usize, -} - -impl ZopfliOut { - #[allow(unsafe_code)] - #[inline(never)] - /// # Append Data. - fn append_data(&mut self, value: u8) { - unsafe { - // Dereferencing this size gets annoying quick! Haha. - let size = *self.outsize; - - // (Re)allocate if size is a power of two, or empty. - if 0 == (size & size.wrapping_sub(1)) { - *self.out = flapfli_allocate(*self.out, usize::max(size * 2, 1)); - } - - (*self.out).add(size).write(value); - self.outsize.write(size + 1); - } - } -} - -impl ZopfliOut { - #[allow(unsafe_code)] - /// # Add Bit. - pub(crate) fn add_bit(&mut self, bit: u8) { - if self.bp == 0 { self.append_data(0); } - unsafe { - // Safety: `append_data` writes a byte to `outsize` and then - // increments it, so to reach and modify that same position we need - // to use `outsize - 1` instead. - *(*self.out).add(*self.outsize - 1) |= bit << self.bp; - } - self.bp = self.bp.wrapping_add(1) & 7; - } - - /// # Add Multiple Bits. - pub(crate) fn add_bits(&mut self, symbol: u32, length: u32) { - for i in 0..length { - let bit = (symbol >> i) & 1; - self.add_bit(bit as u8); - } - } - - /// # Add Huffman Bits. - pub(crate) fn add_huffman_bits(&mut self, symbol: u32, length: u32) { - // Same as add_bits, except we're doing it backwards. - for i in (0..length).rev() { - let bit = (symbol >> i) & 1; - self.add_bit(bit as u8); - } - } - - #[allow(clippy::cast_possible_truncation)] - /// # Add Non-Compressed Block. - pub(crate) fn add_uncompressed_block( - &mut self, - last_block: bool, - arr: &[u8], - rng: Range, - ) { - let mut pos = rng.start; - loop { - let mut blocksize = usize::from(u16::MAX); - if pos + blocksize > rng.end { blocksize = rng.end - pos; } - let really_last_block = pos + blocksize >= rng.end; - let nlen = ! blocksize; - - self.add_bit(u8::from(last_block && really_last_block)); - - // BTYPE 00. - self.add_bit(0); - self.add_bit(0); - - // Ignore bits of input up to th enext byte boundary. - self.bp = 0; - - self.append_data((blocksize % 256) as u8); - self.append_data((blocksize.wrapping_div(256) % 256) as u8); - self.append_data((nlen % 256) as u8); - self.append_data((nlen.wrapping_div(256) % 256) as u8); - - for bit in arr.iter().copied().skip(pos).take(blocksize) { - self.append_data(bit); - } - - if really_last_block { break; } - pos += blocksize; - } - } -} - - - impl Default for LodePNGColorStats { #[allow(unsafe_code)] fn default() -> Self { @@ -287,6 +108,9 @@ impl Drop for LodePNGState { impl LodePNGState { #[allow(unsafe_code)] /// # Decode! + /// + /// This attempts to decode a raw image byte slice, returning the details + /// if successful. pub(super) fn decode(&mut self, src: &[u8]) -> Option { let mut buf = std::ptr::null_mut(); let mut w = 0; @@ -306,27 +130,28 @@ impl LodePNGState { #[allow(unsafe_code)] /// # Encode! - pub(super) fn encode(&mut self, img: &DecodedImage) -> Option { + /// + /// Encode the image, returning `true` if lodepng was happy and the output + /// is non-empty. + pub(super) fn encode(&mut self, img: &DecodedImage, out: &mut EncodedPNG) -> bool { + // Reset the size. + out.size = 0; + // Safety: a non-zero response is an error. 
- let mut out = EncodedPNG::new(); let res = unsafe { lodepng_encode(&mut out.buf, &mut out.size, img.buf, img.w, img.h, self) }; - // Return it if we got it. - if 0 == res && ! out.is_empty() { Some(out) } - else { None } + 0 == res && ! out.is_null() } #[allow(unsafe_code)] /// # Set Up Encoder. /// - /// This configures and returns a new state for encoding purposes. - pub(super) fn encoder( - dec: &Self, - strategy: LodePNGFilterStrategy, - slow: bool - ) -> Option { + /// This configures and returns a new state for general encoding purposes. + /// As this is recycled across runs, separate methods are used to configure + /// the strategy and zopfliness. + pub(super) fn encoder(dec: &Self) -> Option { let mut enc = Self::default(); // Copy palette details over to the encoder. @@ -341,31 +166,43 @@ impl LodePNGState { } enc.encoder.filter_palette_zero = 0; - enc.encoder.filter_strategy = strategy; - - // For final compression, enable the custom zopfli deflater. - if slow { - enc.encoder.zlibsettings.windowsize = 32_768; - enc.encoder.zlibsettings.custom_deflate = Some(flaca_png_deflate); - } - else { - enc.encoder.zlibsettings.windowsize = 8_192; - } + enc.encoder.filter_strategy = LodePNGFilterStrategy::LFS_ZERO; + enc.encoder.zlibsettings.windowsize = 8_192; Some(enc) } + /// # Change Strategies. + pub(super) fn set_strategy(&mut self, strategy: LodePNGFilterStrategy) { + self.encoder.filter_strategy = strategy; + } + + /// # Prepare for Zopfli. + /// + /// Increase the window size and enable our custom zopfli deflate callback. + /// For performance reasons, this is only called before the final + /// encoding pass; everything else is run with saner tunings. + pub(super) fn set_zopfli(&mut self) { + self.encoder.zlibsettings.windowsize = 32_768; + self.encoder.zlibsettings.custom_deflate = Some(flaca_png_deflate); + } + #[allow(unsafe_code)] - /// # Prepare Encoder for Encoding (a small image). + #[inline(never)] + /// # Paletteless Encode (for small images). /// - /// This updates an existing encoder to potentially further optimize a - /// really small image. - pub(super) fn prepare_encoder_small(&mut self, img: &DecodedImage) -> bool { + /// Patch the encoder settings to see if we can squeeze even more savings + /// out of the (small) image, reencode it, and return the result if there + /// are no errors. + /// + /// Note: the caller will need to check the resulting size to see if + /// savings were actually achieved, and keep whichever version was better. + pub(super) fn try_small(&mut self, img: &DecodedImage) -> Option { // Safety: a non-zero response is an error. let mut stats = LodePNGColorStats::default(); if 0 != unsafe { lodepng_compute_color_stats(&mut stats, img.buf, img.w, img.h, &self.info_raw) - } { return false; } + } { return None; } // The image is too small for tRNS chunk overhead. if img.w * img.h <= 16 && 0 != stats.key { stats.alpha = 1; } @@ -389,7 +226,10 @@ impl LodePNGState { } else { self.info_png.color.key_defined = 0; } - true + // Re-encode it and see what happens! + let mut out = EncodedPNG::new(); + if self.encode(img, &mut out) { Some(out) } + else { None } } } diff --git a/flapfli/src/zopflipng/blocks.rs b/flapfli/src/zopflipng/blocks.rs index c454fab..c315e8f 100644 --- a/flapfli/src/zopflipng/blocks.rs +++ b/flapfli/src/zopflipng/blocks.rs @@ -5,28 +5,25 @@ This module contains the deflate entrypoint and all of the block-related odds and ends that didn't make it into other modules. 
*/ -use dactyl::NoHash; -use std::{ - collections::HashSet, - num::NonZeroU32, - ops::Range, -}; +use std::num::NonZeroU32; use super::{ ArrayD, ArrayLL, DeflateSym, DISTANCE_BITS, DISTANCE_VALUES, + DynamicLengths, encode_tree, FIXED_SYMBOLS_D, FIXED_SYMBOLS_LL, FIXED_TREE_D, FIXED_TREE_LL, - get_dynamic_lengths, LENGTH_SYMBOL_BIT_VALUES, LENGTH_SYMBOL_BITS, LengthLimitedCodeLengths, LZ77Store, + LZ77StoreRange, + SplitCache, SplitLen, SplitPIdx, SymbolIteration, @@ -35,8 +32,10 @@ use super::{ SymbolStats, }, zopfli_error, + ZopfliChunk, ZopfliError, ZopfliOut, + ZopfliRange, ZopfliState, }; @@ -49,234 +48,21 @@ const BLOCK_TYPE_DYNAMIC: u8 = 2; const MINIMUM_SPLIT_DISTANCE: usize = 10; #[allow(unsafe_code)] +/// # Ten is Non-Zero. const NZ10: NonZeroU32 = unsafe { NonZeroU32::new_unchecked(10) }; + #[allow(unsafe_code)] +/// # Eleven is Non-Zero. const NZ11: NonZeroU32 = unsafe { NonZeroU32::new_unchecked(11) }; - - -/// # Split Point Scratch. -/// -/// This holds two sets of block split points for use during the deflate -/// passes. Each set can hold up to 14 points (one less than -/// `BLOCKSPLITTING_MAX`), but we're overallocating to 15 to cheaply elide -/// bounds checks. +/// # Block Split Points. /// -/// A single instance of this struct is (re)used for all deflate passes on a -/// given image to reduce allocation overhead. -pub(crate) struct SplitPoints { - slice1: [usize; 15], - slice2: [usize; 15], - done: HashSet, -} - -impl SplitPoints { - /// # New Instance. - pub(crate) fn new() -> Self { - Self { - slice1: [0; 15], - slice2: [0; 15], - done: HashSet::with_hasher(NoHash::default()), - } - } -} - -impl SplitPoints { - /// # Uncompressed Split Pass. - /// - /// This sets the uncompressed split points, by way of first setting the - /// LZ77 split points. - /// - /// In terms of order-of-operations, this must be called _before_ the - /// second-stage LZ77 pass as it would otherwise blow away that data. - fn split_raw(&mut self, arr: &[u8], instart: usize, state: &mut ZopfliState, store: &mut LZ77Store) - -> Result { - // Populate an LZ77 store from a greedy pass. This results in better - // block choices than a full optimal pass. - state.greedy_cold(arr, instart, store, None)?; - - // Do an LZ77 pass. - let len = self.split_lz77(store)?; - - // Find the corresponding uncompressed positions. - if len.is_zero() { Ok(len) } - else { - let mut pos = instart; - let mut j = SplitLen::S00; - for (i, e) in store.entries.iter().enumerate().take(self.slice2[len as usize - 1] + 1) { - if i == self.slice2[j as usize] { - self.slice1[j as usize] = pos; - j = j.increment(); - if (j as u8) == (len as u8) { return Ok(len); } - } - pos += e.length() as usize; - } - - Err(zopfli_error!()) - } - } - - /// # LZ77 Split Pass. - /// - /// This sets the LZ77 split points according to convoluted cost - /// evaluations. - fn split_lz77(&mut self, store: &LZ77Store) -> Result { - /// # Find Largest Splittable Block. - /// - /// This finds the largest available block for splitting, evenly spreading the - /// load if a limited number of blocks are requested. - /// - /// Returns `false` if no blocks are found. - fn find_largest( - lz77size: usize, - done: &HashSet, - splitpoints: &[usize], - rng: &mut Range, - ) -> bool { - let mut best = 0; - for i in 0..=splitpoints.len() { - let start = - if i == 0 { 0 } - else { splitpoints[i - 1] }; - let end = - if i < splitpoints.len() { splitpoints[i] } - else { lz77size - 1 }; - - // We found a match! - if best < end - start && ! 
done.contains(&start) { - rng.start = start; - rng.end = end; - best = end - start; - } - } - MINIMUM_SPLIT_DISTANCE <= best - } - - // This won't work on tiny files. - if store.len() < MINIMUM_SPLIT_DISTANCE { return Ok(SplitLen::S00); } - - // Get started! - self.done.clear(); - let mut rng = 0..store.len(); - let mut last = 0; - let mut len = SplitLen::S00; - loop { - let (llpos, llcost) = find_minimum_cost(store, rng.start + 1..rng.end)?; - if llpos <= rng.start || llpos >= rng.end { - return Err(zopfli_error!()); - } +/// This array holds up to fourteen middle points as well as the absolute start +/// and end indices. +type SplitPoints = [usize; 16]; - // Ignore points we've already covered. - if llpos == rng.start + 1 || calculate_block_size_auto_type(store, rng.clone())? < llcost { - self.done.insert(rng.start); - } - else { - // Mark it as a split point and add it sorted. - self.slice2[len as usize] = llpos; - len = len.increment(); - - // Keep the list sorted. - if last > llpos { self.slice2[..len as usize].sort_unstable(); } - else { last = llpos; } - - // Stop if we've split the maximum number of times. - if len.is_max() { break; } - } - - // Look for a split and adjust the start/end accordingly. If we don't - // find one or the remaining distance is too small to continue, we're - // done! - if ! find_largest( - store.len(), - &self.done, - &self.slice2[..len as usize], - &mut rng, - ) { break; } - } - - Ok(len) - } - - /// # (Re)split Best. - /// - /// If there's enough data, resplit with optimized LZ77 paths and return - /// whichever best is better. - fn split_again( - &mut self, - store: &LZ77Store, - limit1: SplitLen, - cost1: u32, - ) -> Result<&[usize], ZopfliError> { - if 1 < (limit1 as u8) { - // Move slice2 over to slice1 so we can repopulate slice2. - self.slice1.copy_from_slice(self.slice2.as_slice()); - - let limit2 = self.split_lz77(store)?; - let mut cost2 = 0; - for i in 0..=limit2 as usize { - let start = if i == 0 { 0 } else { self.slice2[i - 1] }; - let end = if i < (limit2 as usize) { self.slice2[i] } else { store.len() }; - cost2 += calculate_block_size_auto_type(store, start..end)?.get(); - } - - // It's better! - if cost2 < cost1 { Ok(&self.slice2[..limit2 as usize]) } - else { Ok(&self.slice1[..limit1 as usize]) } - } - else { Ok(&self.slice2[..limit1 as usize]) } - } - - /// # Split Best. - /// - /// Compare the optimal raw split points with a dedicated lz77 pass and - /// return whichever is predicted to compress better. - fn split( - &mut self, - numiterations: i32, - arr: &[u8], - instart: usize, - store: &mut LZ77Store, - store2: &mut LZ77Store, - state: &mut ZopfliState, - ) -> Result<&[usize], ZopfliError> { - // Start by splitting uncompressed. - let limit = self.split_raw(arr, instart, state, store2)?; - store2.clear(); - - // Now some LZ77 funny business. - let mut cost1 = 0; - let mut store3 = LZ77Store::new(); - for i in 0..=limit as usize { - let start = if i == 0 { instart } else { self.slice1[i - 1] }; - let end = if i < (limit as usize) { self.slice1[i] } else { arr.len() }; - - // This assertion is redundant as we explicitly check range sanity - // earlier and later in the pipeline. - debug_assert!(start <= end && end <= arr.len()); - - // Make another store. - lz77_optimal( - arr.get(..end).ok_or(zopfli_error!())?, - start, - numiterations, - store2, - &mut store3, - state, - )?; - cost1 += calculate_block_size_auto_type(store2, 0..store2.len())?.get(); - - // Append its data to our main store. 
- store.steal_entries(store2); - - // Save the chunk size to our best. - if i < (limit as usize) { self.slice2[i] = store.len(); } - } - - // Try a second pass, recalculating the LZ77 splits with the updated - // store details. - self.split_again(store, limit, cost1) - } -} +/// # Zero-Filled Split Points. +const ZEROED_SPLIT_POINTS: SplitPoints = [0; 16]; @@ -289,39 +75,53 @@ impl SplitPoints { /// chunk, then writes the resulting blocks to the output file. pub(crate) fn deflate_part( state: &mut ZopfliState, - splits: &mut SplitPoints, - numiterations: i32, + numiterations: NonZeroU32, last_block: bool, - arr: &[u8], - instart: usize, + chunk: ZopfliChunk<'_>, out: &mut ZopfliOut, ) -> Result<(), ZopfliError> { + #[inline(never)] + fn empty_fixed(last_block: bool, out: &mut ZopfliOut) { + out.add_header::(last_block); + out.add_fixed_bits::<7>(0); + } + let mut store = LZ77Store::new(); let mut store2 = LZ77Store::new(); // Find the split points. - let best = splits.split( + let (best, best_len) = split_points( numiterations, - arr, - instart, + chunk, &mut store, &mut store2, state, )?; // Write the data! - for i in 0..=best.len() { - let start = if i == 0 { 0 } else { best[i - 1] }; - let end = if i < best.len() { best[i] } else { store.len() }; - add_lz77_block( - last_block && i == best.len(), - &store, - &mut store2, - state, - arr, - start..end, - out, - )?; + let store_len = best[best_len as usize + 1]; + for pair in best[..best_len as usize + 2].windows(2) { + let really_last_block = last_block && pair[1] == store_len; + + if let Ok(rng) = ZopfliRange::new(pair[0], pair[1]) { + let store_rng = store.ranged(rng)?; + add_lz77_block( + really_last_block, + store_rng, + store_len, + &mut store2, + state, + chunk, + out, + )?; + } + + // This shouldn't be reachable, but the original zopfli seemed to think + // empty blocks are possible and imply fixed-tree layouts, so maybe? + else { + debug_assert_eq!(pair[0], pair[1]); + empty_fixed(really_last_block, out); + } } Ok(()) @@ -330,33 +130,32 @@ pub(crate) fn deflate_part( #[allow(clippy::cast_precision_loss, clippy::cast_sign_loss)] +#[inline] /// # Add LZ77 Block (Automatic Type). /// /// This calculates the expected output sizes for all three block types, then /// writes the best one to the output file. fn add_lz77_block( last_block: bool, - store: &LZ77Store, + store: LZ77StoreRange, + store_len: usize, fixed_store: &mut LZ77Store, state: &mut ZopfliState, - arr: &[u8], - rng: Range, + chunk: ZopfliChunk<'_>, out: &mut ZopfliOut ) -> Result<(), ZopfliError> { + #[inline(never)] /// # Add LZ77 Block (Dynamic). fn add_dynamic( last_block: bool, - store: &LZ77Store, - rng: Range, + store: LZ77StoreRange, out: &mut ZopfliOut, extra: u8, ll_lengths: &ArrayLL, d_lengths: &ArrayD, ) -> Result<(), ZopfliError> { // Type Bits. - out.add_bit(u8::from(last_block)); - out.add_bit(BLOCK_TYPE_DYNAMIC & 1); - out.add_bit((BLOCK_TYPE_DYNAMIC & 2) >> 1); + out.add_header::(last_block); // Build the lengths first. encode_tree(ll_lengths, d_lengths, extra, out)?; @@ -366,118 +165,99 @@ fn add_lz77_block( let d_symbols = ArrayD::::llcl_symbols(d_lengths); // Write all the data! - add_lz77_data( - store, rng, &ll_symbols, ll_lengths, &d_symbols, d_lengths, out - )?; - - // Finish up by writting the end symbol. - out.add_huffman_bits(ll_symbols[256], ll_lengths[256] as u32); - Ok(()) + add_lz77_data(store, &ll_symbols, ll_lengths, &d_symbols, d_lengths, out) } + #[inline(never)] /// # Add LZ77 Block (Fixed). 
fn add_fixed( last_block: bool, - store: &LZ77Store, - rng: Range, + store: LZ77StoreRange, out: &mut ZopfliOut, ) -> Result<(), ZopfliError> { // Type Bits. - out.add_bit(u8::from(last_block)); - out.add_bit(BLOCK_TYPE_FIXED & 1); - out.add_bit((BLOCK_TYPE_FIXED & 2) >> 1); + out.add_header::(last_block); // Write all the data! add_lz77_data( - store, rng, + store, &FIXED_SYMBOLS_LL, &FIXED_TREE_LL, &FIXED_SYMBOLS_D, &FIXED_TREE_D, out - )?; - - // Finish up by writting the end symbol. - out.add_huffman_bits(FIXED_SYMBOLS_LL[256], FIXED_TREE_LL[256] as u32); - Ok(()) + ) } #[inline(never)] - fn dynamic_details(store: &LZ77Store, rng: Range) - -> Result<(u8, NonZeroU32, ArrayLL, ArrayD), ZopfliError> { - get_dynamic_lengths(store, rng) + /// # Add Uncompressed. + /// + /// It is extremely unlikely this will ever be called. Haha. + fn add_uncompressed( + last_block: bool, + store: LZ77StoreRange, + chunk: ZopfliChunk<'_>, + out: &mut ZopfliOut, + ) -> Result<(), ZopfliError> { + let rng = store.byte_range()?; + let chunk2 = chunk.reslice_rng(rng)?; + out.add_uncompressed_block(last_block, chunk2); + Ok(()) } #[inline(never)] - fn fixed_cost_cold(store: &LZ77Store, rng: Range) -> NonZeroU32 { - calculate_block_size_fixed(store, rng) - } - - // If the block is empty, we can assume a fixed-tree layout. - if rng.is_empty() { - out.add_bits(u32::from(last_block), 1); - out.add_bits(1, 2); - out.add_bits(0, 7); - return Ok(()); - } + fn dynamic_details(store: LZ77StoreRange) + -> Result { DynamicLengths::new(store) } // Calculate the three costs. - let uncompressed_cost = calculate_block_size_uncompressed(store, rng.clone())?; - let (dynamic_extra, dynamic_cost, dynamic_ll, dynamic_d) = dynamic_details(store, rng.clone())?; + let uncompressed_cost = store.block_size_uncompressed()?; + let dynamic = dynamic_details(store)?; // Most blocks won't benefit from a fixed tree layout, but if we've got a // tiny one or the unoptimized-fixed size is within 10% of the dynamic size // we should check it out. if - store.len() <= 1000 || - calculate_block_size_fixed(store, rng.clone()).saturating_mul(NZ10) <= dynamic_cost.saturating_mul(NZ11) + store_len <= 1000 || + store.block_size_fixed().saturating_mul(NZ10) <= dynamic.cost().saturating_mul(NZ11) { - let rng2 = store.byte_range(rng.clone())?; - state.init_lmc(rng2.len()); + let rng = store.byte_range()?; + let fixed_chunk = chunk.reslice_rng(rng)?; + state.init_lmc(&fixed_chunk); // Perform an optimal run. - state.optimal_run_cold( - arr.get(..rng2.end).ok_or(zopfli_error!())?, - rng2.start, - None, - fixed_store, - )?; + state.optimal_run_fixed(fixed_chunk, fixed_store)?; // And finally, the cost! - let fixed_cost = fixed_cost_cold(fixed_store, 0..fixed_store.len()); - if fixed_cost < dynamic_cost && fixed_cost <= uncompressed_cost { - return add_fixed(last_block, fixed_store, 0..fixed_store.len(), out); + let fixed_rng = ZopfliRange::new(0, fixed_store.len())?; + let fixed_store_rng = fixed_store.ranged(fixed_rng)?; + let fixed_cost = fixed_store_rng.block_size_fixed(); + if fixed_cost < dynamic.cost() && fixed_cost <= uncompressed_cost { + return add_fixed(last_block, fixed_store_rng, out); } } // Dynamic is best! 
- if dynamic_cost <= uncompressed_cost { - add_dynamic( - last_block, store, rng, out, - dynamic_extra, &dynamic_ll, &dynamic_d, - ) + if dynamic.cost() <= uncompressed_cost { + add_dynamic(last_block, store, out, dynamic.extra(), dynamic.ll_lengths(), dynamic.d_lengths()) } - // All the work we did earlier was fruitless; the block works best in an - // uncompressed form. + // Nothing is everything! else { - let rng = store.byte_range(rng)?; - out.add_uncompressed_block(last_block, arr, rng); - Ok(()) + add_uncompressed(last_block, store, chunk, out) } } #[allow(clippy::cast_sign_loss)] +#[inline] /// # Add LZ77 Data. /// /// This adds all lit/len/dist codes from the lists as huffman symbols, but not /// the end code (256). fn add_lz77_data( - store: &LZ77Store, - rng: Range, + store: LZ77StoreRange, ll_symbols: &ArrayLL, ll_lengths: &ArrayLL, d_symbols: &ArrayD, d_lengths: &ArrayD, out: &mut ZopfliOut ) -> Result<(), ZopfliError> { - for e in store.entries.get(rng).ok_or(zopfli_error!())? { + for e in store.entries { // Always add the length symbol (or literal). if ll_lengths[e.ll_symbol as usize].is_zero() { return Err(zopfli_error!()); } out.add_huffman_bits( @@ -507,116 +287,81 @@ fn add_lz77_data( else if (e.litlen as u16) >= 256 { return Err(zopfli_error!()); } } - Ok(()) -} - -#[allow(clippy::cast_possible_truncation)] // The maximum blocksize is only 1 million. -/// # Calculate Block Size (Uncompressed). -fn calculate_block_size_uncompressed(store: &LZ77Store, rng: Range) --> Result { - let rng = store.byte_range(rng)?; - let blocksize = rng.len() as u32; - - // Blocks larger than u16::MAX need to be split. - let blocks = blocksize.div_ceil(65_535); - NonZeroU32::new(blocks * 40 + blocksize * 8).ok_or(zopfli_error!()) -} - -/// # Calculate Block Size (Fixed). -fn calculate_block_size_fixed(store: &LZ77Store, rng: Range) -> NonZeroU32 { - // The end symbol is always included. - let mut size = FIXED_TREE_LL[256] as u32; - - // Loop the store if we have data to loop. - let slice = store.entries.as_slice(); - if rng.start < rng.end && rng.end <= slice.len() { - // Make sure the end does not exceed the store! - for e in &slice[rng] { - size += FIXED_TREE_LL[e.ll_symbol as usize] as u32; - if 0 < e.dist { - size += u32::from(LENGTH_SYMBOL_BITS[e.litlen as usize]); - size += u32::from(DISTANCE_BITS[e.d_symbol as usize]); - size += FIXED_TREE_D[e.d_symbol as usize] as u32; - } - } - } - - // This can't really fail, but fixed models are bullshit anyway so we can - // fall back to an unbeatably large number. - NonZeroU32::new(size).unwrap_or(NonZeroU32::MAX) -} + // Finish up by writting the end symbol. + out.add_huffman_bits(ll_symbols[256], ll_lengths[256] as u32); -/// # Calculate Block Size (Dynamic). -fn calculate_block_size_dynamic(store: &LZ77Store, rng: Range) --> Result { - get_dynamic_lengths(store, rng).map(|(_, size, _, _)| size) + Ok(()) } /// # Calculate Best Block Size (in Bits). -fn calculate_block_size_auto_type(store: &LZ77Store, rng: Range) +fn calculate_block_size_auto(store: &LZ77Store, rng: ZopfliRange) -> Result { - let uncompressed_cost = calculate_block_size_uncompressed(store, rng.clone())?; - - // We can skip the expensive fixed-cost calculations for large blocks since - // they're unlikely ever to use it. - let fixed_cost = - if 1000 < store.len() { uncompressed_cost } - else { calculate_block_size_fixed(store, rng.clone()) }; - - let dynamic_cost = calculate_block_size_dynamic(store, rng)?; - - // If uncompressed is better than everything, return it. 
- if uncompressed_cost < fixed_cost && uncompressed_cost < dynamic_cost { - Ok(uncompressed_cost) - } - // Otherwise choose the smaller of fixed and dynamic. - else if fixed_cost < dynamic_cost { Ok(fixed_cost) } - else { Ok(dynamic_cost) } + let small = store.len() <= 1000; + let store = store.ranged(rng)?; + store.block_size_auto(small) } /// # Minimum Split Cost. /// /// Return the index of the smallest split cost between `start..end`. -fn find_minimum_cost(store: &LZ77Store, mut rng: Range) +fn find_minimum_cost(store: &LZ77Store, full_rng: ZopfliRange) -> Result<(usize, NonZeroU32), ZopfliError> { + #[cold] + /// # Small Cost. + /// + /// For small ranges, skip the logic and compare all possible splits. This + /// will return an error if no splits are possible. + fn small_cost(store: LZ77StoreRange, offset: usize, small: bool) + -> Result<(usize, NonZeroU32), ZopfliError> { + let mut best_cost = NonZeroU32::MAX; + let mut best_idx = 1; + let mut mid = 1; + for (a, b) in store.splits()? { + let cost = split_cost(a, b, small)?; + if cost < best_cost { + best_cost = cost; + best_idx = mid; // The split point. + } + mid += 1; + } + Ok((offset + best_idx, best_cost)) + } + /// # Split Block Cost. /// /// Sum the left and right halves of the range. - fn split_cost(store: &LZ77Store, start: usize, mid: usize, end: usize) -> Result { - let a = calculate_block_size_auto_type(store, start..mid)?; - let b = calculate_block_size_auto_type(store, mid..end)?; + fn split_cost(a: LZ77StoreRange, b: LZ77StoreRange, small: bool) -> Result { + let a = a.block_size_auto(small)?; + let b = b.block_size_auto(small)?; Ok(a.saturating_add(b.get())) } - // Keep track of the original start/end points. - let split_start = rng.start - 1; - let split_end = rng.end; + // Break it down a bit. + let offset = full_rng.start(); + let small = store.len() <= 1000; + let store_rng = store.ranged(full_rng)?; - let mut best_cost = NonZeroU32::MAX; - let mut best_idx = rng.start; + // Short circuit. + if store_rng.len().get() <= 1024 { return small_cost(store_rng, offset, small); } - // Small chunks don't need much. - if rng.len() < 1024 { - for i in rng { - let cost = split_cost(store, split_start, i, split_end)?; - if cost < best_cost { - best_cost = cost; - best_idx = i; - } - } - return Ok((best_idx, best_cost)); - } + // Split range, relative to the length of the ranged store. + let mut split_rng = 1..store_rng.len().get(); // Divide and conquer. + let mut best_cost = NonZeroU32::MAX; + let mut best_idx = 1; let mut p = [0_usize; MINIMUM_SPLIT_DISTANCE - 1]; let mut last_best_cost = NonZeroU32::MAX; - while MINIMUM_SPLIT_DISTANCE <= rng.len() { + loop { let mut best_p_idx = SplitPIdx::S0; for (i, pp) in SplitPIdx::all().zip(p.iter_mut()) { - *pp = rng.start + (i as usize + 1) * (rng.len().wrapping_div(MINIMUM_SPLIT_DISTANCE)); + *pp = split_rng.start + (i as usize + 1) * (split_rng.len().wrapping_div(MINIMUM_SPLIT_DISTANCE)); let line_cost = if best_idx == *pp { last_best_cost } - else { split_cost(store, split_start, *pp, split_end)? }; + else { + let (a, b) = store_rng.split(*pp)?; + split_cost(a, b, small)? + }; if (i as usize) == 0 || line_cost < best_cost { best_cost = line_cost; @@ -629,15 +374,17 @@ fn find_minimum_cost(store: &LZ77Store, mut rng: Range) // Nudge the boundaries and back again. 
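The probe-and-narrow loop above is easier to follow stripped of the store plumbing: pick a handful of evenly spaced candidate split points, price each one, then shrink the search window around the winner and repeat until the window is too small to split further. A generic, self-contained approximation (the probe count and the cost closure are stand-ins, not the patch's actual types):

```rust
// Narrowing search over a convex-ish cost curve. `cost_at` stands in for the
// real split_cost(); nine probes mirror MINIMUM_SPLIT_DISTANCE - 1 but are
// purely illustrative here.
fn find_min_split(mut start: usize, mut end: usize, cost_at: impl Fn(usize) -> u64) -> (usize, u64) {
    const PROBES: usize = 9;
    let mut best_idx = start;
    let mut best_cost = u64::MAX;
    while end - start >= PROBES + 1 {
        let step = (end - start) / (PROBES + 1);
        // Price each evenly spaced probe and keep the cheapest for this round.
        let mut round_best = (start + step, cost_at(start + step));
        for i in 2..=PROBES {
            let p = start + i * step;
            let c = cost_at(p);
            if c < round_best.1 { round_best = (p, c); }
        }
        if round_best.1 < best_cost { best_idx = round_best.0; best_cost = round_best.1; }
        // Narrow the window to one step on either side of the winner.
        start = round_best.0.saturating_sub(step).max(start);
        end = (round_best.0 + step).min(end);
    }
    (best_idx, best_cost)
}

fn main() {
    // A toy cost curve with its minimum at index 123.
    let (idx, _) = find_min_split(0, 1_000, |i| (i as i64 - 123).unsigned_abs());
    assert!(idx.abs_diff(123) <= 1);
}
```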
best_idx = p[best_p_idx as usize]; - if 0 != (best_p_idx as usize) { rng.start = p[best_p_idx as usize - 1]; } - if (best_p_idx as usize) + 1 < p.len() { rng.end = p[best_p_idx as usize + 1]; } + if 0 != (best_p_idx as usize) { split_rng.start = p[best_p_idx as usize - 1]; } + if (best_p_idx as usize) + 1 < p.len() { split_rng.end = p[best_p_idx as usize + 1]; } last_best_cost = best_cost; + if split_rng.len() < MINIMUM_SPLIT_DISTANCE { break; } } - Ok((best_idx, last_best_cost)) + Ok((offset + best_idx, last_best_cost)) } +#[inline] /// # Optimal LZ77. /// /// Calculate lit/len and dist pairs for the dataset. @@ -645,46 +392,42 @@ fn find_minimum_cost(store: &LZ77Store, mut rng: Range) /// Note: this incorporates the functionality of `ZopfliLZ77OptimalRun` /// directly. fn lz77_optimal( - arr: &[u8], - instart: usize, - numiterations: i32, + chunk: ZopfliChunk<'_>, + numiterations: NonZeroU32, store: &mut LZ77Store, scratch_store: &mut LZ77Store, state: &mut ZopfliState, -) -> Result<(), ZopfliError> { - // Easy abort. - if instart >= arr.len() || numiterations < 1 { return Ok(()); } - +) -> Result { // Reset the main cache for the current blocksize. - state.init_lmc(arr.len() - instart); + state.init_lmc(&chunk); // Greedy run. - state.greedy(arr, instart, scratch_store, Some(instart))?; - - // Create new stats with the store (updated by the greedy pass). - let mut current_stats = SymbolStats::new(); - current_stats.load_store(scratch_store); + state.greedy(chunk, scratch_store, Some(chunk.pos()))?; - // Set up dummy stats we can use to track best and last. + // Set up the PRNG and two sets of stats, populating one with the greedy- + // crunched store. let mut ran = RanState::new(); let mut best_stats = SymbolStats::new(); + let mut current_stats = SymbolStats::new(); + current_stats.load_store(scratch_store); // We'll also want dummy best and last costs. - let mut last_cost = NonZeroU32::MIN; + let mut last_cost = NonZeroU32::MAX; let mut best_cost = NonZeroU32::MAX; // Repeat statistics with the cost model from the previous // stat run. - let mut last_ran = -1; - for i in 0..numiterations { + let mut weighted = false; + for i in 0..numiterations.get() { + // Rebuild the symbols. + current_stats.crunch(); + // Optimal run. - state.optimal_run(arr, instart, Some(¤t_stats), scratch_store)?; + state.optimal_run(chunk, ¤t_stats, scratch_store)?; // This is the cost we actually care about. - let current_cost = calculate_block_size_dynamic( - scratch_store, - 0..scratch_store.len(), - )?; + let current_cost = scratch_store.ranged_full() + .and_then(LZ77StoreRange::block_size_dynamic)?; // We have a new best! if current_cost < best_cost { @@ -693,31 +436,221 @@ fn lz77_optimal( best_cost = current_cost; } - // Copy the stats to last_stats, clear them, and repopulate - // with the current store. - let (last_litlens, last_dists) = current_stats.clear(); - current_stats.load_store(scratch_store); - - // Once the randomness has kicked in, improve convergence by - // weighting the current and previous stats. - if last_ran != -1 { - current_stats.add_last(&last_litlens, &last_dists); - current_stats.crunch(); - } + // Repopulate the counts from the current store, and if the randomness + // has "warmed up" sufficiently, combine them with half the previous + // values to create a sorted of weighted average. + current_stats.reload_store(scratch_store, weighted); - // Replace the current stats with the best stats, randomize, - // and see what happens. 
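Condensed to its bookkeeping, the iteration strategy works like this: run a squeeze pass with the current statistical cost model, keep the cheapest result seen so far, and once the cost stops improving (after the fifth iteration) fall back to the best stats, randomize them, and switch to weighted reloading. A toy, self-contained rendition where a closure stands in for the real "optimal run plus dynamic block size" step:

```rust
// Simplified shape of the iteration loop; names and types are illustrative.
fn iterate_costs(iterations: u32, mut run_once: impl FnMut(bool) -> u32) -> u32 {
    let mut best_cost = u32::MAX;
    let mut last_cost = u32::MAX;
    let mut randomized = false;
    for i in 0..iterations {
        // One squeeze pass with the current statistical cost model.
        let current_cost = run_once(randomized);
        // Keep the cheapest encoding seen so far.
        if current_cost < best_cost { best_cost = current_cost; }
        // If the cost has plateaued, shuffle the stats and try again.
        if 5 < i && current_cost == last_cost { randomized = true; }
        else { last_cost = current_cost; }
    }
    best_cost
}

fn main() {
    // A toy cost series that sits at 100 until the randomization kicks in.
    let mut calls = 0_u32;
    let best = iterate_costs(15, |randomized| {
        calls += 1;
        if randomized { 90 } else { 100 }
    });
    assert_eq!(best, 90);
    assert_eq!(calls, 15);
}
```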
+ // If nothing changed, replace the current stats with the best stats, + // reorder the counts, and see what happens. if 5 < i && current_cost == last_cost { current_stats = best_stats; current_stats.randomize(&mut ran); - current_stats.crunch(); - last_ran = i; + weighted = true; } + else { last_cost = current_cost; } + } + + // Find and return the current (best) cost of the store. + let store_rng = store.ranged_full()?; + store_rng.block_size_auto(store_rng.len().get() <= 1000) +} - last_cost = current_cost; +#[inline(never)] +/// # Best Split Points. +/// +/// Compare the optimal raw and LZ77 split points, returning whichever is +/// predicted to compress better. +/// +/// Note the returned length corresponds to the number of points in the middle; +/// it excludes the absolute start and end points. +fn split_points( + numiterations: NonZeroU32, + chunk: ZopfliChunk<'_>, + store: &mut LZ77Store, + store2: &mut LZ77Store, + state: &mut ZopfliState, +) -> Result<(SplitPoints, SplitLen), ZopfliError> { + // We'll need two sets of split points. + let mut split_a = ZEROED_SPLIT_POINTS; + let mut split_b = ZEROED_SPLIT_POINTS; + + // Start by splitting uncompressed. + let raw_len = split_points_raw(chunk, store2, state, &mut split_a, &mut split_b)?; + store2.clear(); + + // Calculate the costs associated with that split and update the store with + // the symbol information encountered. + let mut cost1 = 0; + let mut store3 = LZ77Store::new(); + for i in 0..=raw_len as usize { + let start = if i == 0 { chunk.pos() } else { split_a[i - 1] }; + let end = if i < (raw_len as usize) { split_a[i] } else { chunk.total_len().get() }; + + // Crunch this chunk into a clean store. + cost1 += lz77_optimal( + chunk.reslice(start, end)?, + numiterations, + store2, + &mut store3, + state, + )?.get(); + + // Append its data to our main store. + store.steal_entries(store2); + + // Save the chunk size to our split_b as the defacto best. + split_b[i] = store.len(); } - Ok(()) + // If we have at least two split points, do one further LZ77 pass using the + // updated store details to see if the big picture changes anything. + if 1 < (raw_len as u8) { + let two_len = split_points_lz77_cold(state, store, &mut split_a)?; + split_a[two_len as usize] = store.len(); + split_a.rotate_right(1); + debug_assert!(split_a[0] == 0); // We don't write to the last byte. + let mut cost2 = 0; + for pair in split_a[..two_len as usize + 2].windows(2) { + cost2 += calculate_block_size_auto( + store, + ZopfliRange::new(pair[0], pair[1])?, + )?.get(); + } + + // It's better! + if cost2 < cost1 { return Ok((split_a, two_len)) } + } + + split_b.rotate_right(1); + debug_assert!(split_b[0] == 0); // We don't write to the last byte. + Ok((split_b, raw_len)) +} + +#[inline(never)] +/// # Split Points: Uncompressed. +fn split_points_raw( + chunk: ZopfliChunk<'_>, + store: &mut LZ77Store, + state: &mut ZopfliState, + split_a: &mut SplitPoints, + split_b: &mut SplitPoints, +) -> Result { + // Populate an LZ77 store from a greedy pass. This results in better + // block choices than a full optimal pass. + state.greedy_cold(chunk, store, None)?; + + // Do an LZ77 pass. + let len = split_points_lz77(state, store, split_b)?; + + // Find the corresponding uncompressed positions. 
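The walk that follows converts LZ77 entry indices back into byte offsets by accumulating each entry's length (one byte for a literal, the match length otherwise). A simplified standalone version with plain integer lengths in place of the real store entries:

```rust
// Illustrative only: turn entry-index split points into byte-offset split
// points by summing entry lengths while walking the store.
fn entry_splits_to_byte_splits(entry_lengths: &[usize], splits: &[usize], start_pos: usize) -> Vec<usize> {
    let mut out = Vec::with_capacity(splits.len());
    let mut pos = start_pos;
    let mut next = splits.iter().copied().peekable();
    for (i, len) in entry_lengths.iter().copied().enumerate() {
        if next.peek() == Some(&i) {
            out.push(pos);
            next.next();
        }
        pos += len;
    }
    out
}

fn main() {
    // Entries: literal (1 byte), match of 5, literal, match of 4, literal.
    let lengths = [1, 5, 1, 4, 1];
    // Splitting before entries 2 and 4 corresponds to byte offsets 6 and 11.
    assert_eq!(entry_splits_to_byte_splits(&lengths, &[2, 4], 0), vec![6, 11]);
}
```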
+ if len.is_zero() { Ok(len) } + else { + let mut pos = chunk.pos(); + let mut j = SplitLen::S00; + for (i, e) in store.entries.iter().enumerate().take(split_b[len as usize - 1] + 1) { + if i == split_b[j as usize] { + split_a[j as usize] = pos; + j = j.increment(); + if (j as u8) == (len as u8) { return Ok(len); } + } + pos += e.length() as usize; + } + + Err(zopfli_error!()) + } +} + +#[inline(never)] +fn split_points_lz77_cold( + state: &mut ZopfliState, + store: &LZ77Store, + split_b: &mut SplitPoints, +) -> Result { split_points_lz77(state, store, split_b) } + +#[inline] +/// # LZ77 Split Pass. +/// +/// This sets the LZ77 split points according to convoluted cost +/// evaluations. +fn split_points_lz77( + state: &mut ZopfliState, + store: &LZ77Store, + split_b: &mut SplitPoints, +) -> Result { + /// # Find Largest Splittable Block. + /// + /// This finds the largest available block for splitting, evenly spreading the + /// load if a limited number of blocks are requested. + /// + /// Returns `false` if no blocks are found. + fn find_largest( + lz77size: usize, + done: &SplitCache, + splitpoints: &[usize], + rng: &mut ZopfliRange, + ) -> Result { + let mut best = 0; + for i in 0..=splitpoints.len() { + let start = + if i == 0 { 0 } + else { splitpoints[i - 1] }; + let end = + if i < splitpoints.len() { splitpoints[i] } + else { lz77size - 1 }; + + // We found a match! + if best < end - start && done.is_unset(start) { + rng.set(start, end)?; + best = end - start; + } + } + Ok(MINIMUM_SPLIT_DISTANCE <= best) + } + + // This won't work on tiny files. + if store.len() < MINIMUM_SPLIT_DISTANCE { return Ok(SplitLen::S00); } + + // Get started! + let mut rng = ZopfliRange::new(0, store.len())?; + let done = state.split_cache(rng); + let mut last = 0; + let mut len = SplitLen::S00; + loop { + // Safety: find_minimum_cost will return an error if the block doesn't + // have a midpoint. + let (llpos, llcost) = find_minimum_cost(store, rng)?; + if rng.start() >= llpos || rng.end() <= llpos { crate::unreachable(); } + + // Ignore points we've already covered. + if llpos == rng.start() + 1 || calculate_block_size_auto(store, rng)? < llcost { + done.set(rng.start()); + } + else { + // Mark it as a split point and add it sorted. + split_b[len as usize] = llpos; + len = len.increment(); + + // Keep the list sorted. + if last > llpos { split_b[..len as usize].sort_unstable(); } + else { last = llpos; } + + // Stop if we've split the maximum number of times. + if len.is_max() { break; } + } + + // Look for a split and adjust the start/end accordingly. If we don't + // find one or the remaining distance is too small to continue, we're + // done! + if ! find_largest( + store.len(), + done, + &split_b[..len as usize], + &mut rng, + )? { break; } + } + + Ok(len) } diff --git a/flapfli/src/zopflipng/cache.rs b/flapfli/src/zopflipng/cache.rs index ac98710..019551b 100644 --- a/flapfli/src/zopflipng/cache.rs +++ b/flapfli/src/zopflipng/cache.rs @@ -1,16 +1,11 @@ /*! -# Flapfli: Longest Match Cache. +# Flapfli: Caches. -The LMC is used to eleviate some of the burden that would otherwise result from -calling `ZopfliHash::find` a hundred million times in a row. Haha. +This module contains the Longest Match cache along with several smaller caching +structures that aren't big enough to warrant their own dedicated modules. 
*/ use std::{ - alloc::{ - alloc, - handle_alloc_error, - Layout, - }, cell::Cell, ptr::{ addr_of_mut, @@ -23,7 +18,10 @@ use super::{ zopfli_error, ZOPFLI_MASTER_BLOCK_SIZE, ZOPFLI_MIN_MATCH, + ZopfliChunk, ZopfliError, + ZopfliRange, + ZopfliStateInit, }; @@ -32,83 +30,97 @@ use super::{ /// /// Length and distance are always fetched/stored together, so are grouped into /// a single value to reduce indexing/bounds overhead. +/// +/// A tuple would be friendlier, but doesn't scale particularly well, so +/// whatever. The `join_ld` and `split_ld` helper methods fill the ergonomic +/// gaps. const DEFAULT_LD: u32 = u32::from_le_bytes([1, 0, 0, 0]); /// # Sublength Cache Entries. +/// +/// This is the total number of "entries" a given sublength cache record +/// contains. const ZOPFLI_CACHE_LENGTH: usize = 8; /// # Sublength Cache Total Length. /// -/// Each entry uses three bytes, so the total size is… +/// Each entry uses three bytes, so the total length of a sublength cache +/// collection is thus… const SUBLEN_CACHED_LEN: usize = ZOPFLI_CACHE_LENGTH * 3; +/// # Length of Split Cache. +/// +/// The split cache is mercifully boolean, so we can pack it into a bit array, +/// reducing its size to one eighth what it otherwise would be. +const SPLIT_CACHE_LEN: usize = ZOPFLI_MASTER_BLOCK_SIZE.div_ceil(8); + +#[repr(C)] /// # Longest Match Cache. /// /// This structure holds cached length/distance details for individual -/// sublengths. Its memory usage is no joke, but the performance savings more -/// than make up for it. +/// "sublengths" — chunks of chunks of data processed by `ZopfliHash` — +/// mitigating the overhead of doing the same shit over and over and over +/// again. +/// +/// As with most of this library's caches, the memory usage is no joke, but +/// trying to get by without without it is downright _miserable_. +/// +/// On the bright side, we only need one instance per thread for the duration +/// of the program run, and thanks to some clever boxing, it winds up on the +/// heap instead of the stack. pub(crate) struct MatchCache { ld: [u32; ZOPFLI_MASTER_BLOCK_SIZE], - sublen: [[u8; SUBLEN_CACHED_LEN]; ZOPFLI_MASTER_BLOCK_SIZE], + sublen: [u8; SUBLEN_CACHED_LEN * ZOPFLI_MASTER_BLOCK_SIZE], } -impl MatchCache { +impl ZopfliStateInit for MatchCache { #[allow(unsafe_code)] - /// # New. - /// - /// Arrays holding a million elements are obviously less than ideal, but - /// because these are referenced repeatedly with different sub-slice sizes, - /// it is much better for performance than vectors that have to be - /// continuously resized/reallocated. + #[inline] + /// # State Initialization. /// - /// Still, these are too big for the stack, so we're initializing them via - /// raw pointers and jamming them straight into a `Box`. - pub(super) fn new() -> Box { - // Reserve the space. - const LAYOUT: Layout = Layout::new::(); - let out = NonNull::new(unsafe { alloc(LAYOUT).cast() }) - .unwrap_or_else(|| handle_alloc_error(LAYOUT)); - let ptr: *mut Self = out.as_ptr(); - - unsafe { - // The arrays can be zero-filled to start with; they'll get reset - // prior to use anyway. - addr_of_mut!((*ptr).ld).write_bytes(0, 1); - addr_of_mut!((*ptr).sublen).write_bytes(0, 1); - - // All set! - Box::from_raw(ptr) - } + /// See `ZopfliState` for more details. + unsafe fn state_init(nn: NonNull) { + let ptr = nn.as_ptr(); + + // The proper defaults for both members are _mostly_ zeroes, so let's + // roll with that since it's cheap and easy. 
(The values will be reset + // properly before each use anyway.) + addr_of_mut!((*ptr).ld).write_bytes(0, 1); + addr_of_mut!((*ptr).sublen).write_bytes(0, 1); } +} +impl MatchCache { /// # Initialize. /// - /// This resizes the cache buffers and resets their values to their default - /// states — one for length, zero for everything else. + /// Reset (enough of) the cache to its initial/default state for any + /// subsequent processing of `chunk` we might need to do. (Most chunks will + /// be smaller than `ZOPFLI_MASTER_BLOCK_SIZE` so we won't normally need to + /// reset _everything_.) /// - /// Because this is a shared buffer, allocations persist for the duration - /// of the program run so they can be reused. - pub(crate) fn init(&mut self, mut blocksize: usize) { - // Lodepng will never pass along more than ZOPFLI_MASTER_BLOCK_SIZE - // bytes, but this lets the compiler know we won't go over. - if ZOPFLI_MASTER_BLOCK_SIZE < blocksize { - blocksize = ZOPFLI_MASTER_BLOCK_SIZE; - } + /// The length half of `ld` defaults to one; everything else defaults to + /// zero. + pub(crate) fn init(&mut self, chunk: &ZopfliChunk<'_>) { + // Safety: ZopfliChunk verifies the block size is under the limit. + let blocksize = chunk.block_size().get(); + if blocksize > ZOPFLI_MASTER_BLOCK_SIZE { crate::unreachable(); } // Lengths default to one, everything else to zero. self.ld[..blocksize].fill(DEFAULT_LD); - self.sublen[..blocksize].fill([0; SUBLEN_CACHED_LEN]); + self.sublen[..blocksize * SUBLEN_CACHED_LEN].fill(0); } - #[allow(clippy::cast_possible_truncation)] + #[allow(unsafe_code, clippy::cast_possible_truncation)] /// # Find Match. /// - /// Find the sublength, distance, and length from cache, if possible. + /// Find the sublength, distance, and length from cache, if present, and + /// (possibly) add it to the cache if not. /// - /// Values are written directly to the passed arguments. A bool is returned - /// to indicate whether or not the find was successful. + /// The results are written back to the mutable arguments passed to the + /// method. A bool is returned to indicate whether or not the search was + /// successful. pub(crate) fn find( &self, pos: usize, @@ -123,22 +135,25 @@ impl MatchCache { // If we have no distance, we have no cache. let (cache_len, cache_dist) = ld_split(self.ld[pos]); if ! cache_len.is_zero() && cache_dist == 0 { return Ok(false); } - let cache_sublen: &[u8; SUBLEN_CACHED_LEN] = &self.sublen[pos]; + let cache_sublen: &[u8; SUBLEN_CACHED_LEN] = unsafe { + // Safety: the slice has the same length as the array. + &* self.sublen[pos * SUBLEN_CACHED_LEN..(pos + 1) * SUBLEN_CACHED_LEN].as_ptr().cast() + }; // Find the max sublength once, if ever. let maxlength = - if sublen.is_none() { 0 } + if sublen.is_none() { LitLen::L000 } else { max_sublen(cache_sublen) }; // Proceed if our cached length or max sublength are under the limit. if limit.is_max() || (cache_len as u16) <= (*limit as u16) || - (sublen.is_some() && maxlength >= (*limit as usize)) + (sublen.is_some() && (maxlength as u16) >= (*limit as u16)) { // Update length and distance if the sublength pointer is null or // the cached sublength is bigger than the cached length. - if sublen.is_none() || (cache_len as usize) <= maxlength { + if sublen.is_none() || (cache_len as u16) <= (maxlength as u16) { // Cap the length. 
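For reference, each cached sublength record described above is three bytes: the length minus three (DEFLATE matches start at `ZOPFLI_MIN_MATCH`), followed by the distance as a little-endian `u16`, with the last record in a slot doubling as the maximum-length marker. A minimal sketch of that packing (helper names are hypothetical):

```rust
// Pack/unpack one 3-byte sublength record: [length - 3, dist_lo, dist_hi].
fn pack_record(length: u16, dist: u16) -> [u8; 3] {
    debug_assert!((3..=258).contains(&length)); // DEFLATE match lengths run 3..=258.
    let [d0, d1] = dist.to_le_bytes();
    [(length - 3) as u8, d0, d1]
}

fn unpack_record(rec: [u8; 3]) -> (u16, u16) {
    (u16::from(rec[0]) + 3, u16::from_le_bytes([rec[1], rec[2]]))
}

fn main() {
    let rec = pack_record(258, 32_768);
    assert_eq!(rec, [255, 0, 128]);
    assert_eq!(unpack_record(rec), (258, 32_768));
}
```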
*length = cache_len; if (*length as u16) > (*limit as u16) { *length = *limit; } @@ -175,7 +190,7 @@ impl MatchCache { Ok(false) } - #[allow(clippy::cast_possible_truncation)] + #[cold] /// # Set Sublength. /// /// Save the provided sublength data to the cache. @@ -219,7 +234,7 @@ impl MatchCache { // The cache gets written three bytes at a time; this iterator will // help us eliminate the bounds checks we'd otherwise run into. - let mut dst = self.sublen[pos].chunks_exact_mut(3); + let mut dst = self.sublen.chunks_exact_mut(3).skip(pos * ZOPFLI_CACHE_LENGTH).take(ZOPFLI_CACHE_LENGTH); // Start by writing all mismatched pairs, up to the limit. for (i, pair) in (0_u8..=u8::MAX).zip(slice.windows(2)) { @@ -233,12 +248,12 @@ impl MatchCache { // The final value is implicitly "mismatched"; if we haven't hit the // limit we should write it too. if let Some([d0, d1, d2]) = dst.next() { - *d0 = (length as u16 - 3) as u8; + *d0 = length.to_packed_u8(); [*d1, *d2] = slice[slice.len() - 1].to_le_bytes(); // If we're still below the limit, copy (only) the length to the // last slot to simplify any subsequent max_length lookups. - if let Some([d0, _, _]) = dst.last() { *d0 = (length as u16 - 3) as u8; } + if let Some([d0, _, _]) = dst.last() { *d0 = length.to_packed_u8(); } } Ok(()) @@ -247,6 +262,69 @@ impl MatchCache { +/// # Split Cache. +/// +/// This structure holds a sort of bit-array used for keeping track of which +/// split points (indices) have already been tested to avoid the overhead of +/// testing them again. +/// +/// As with `MatchCache`, we only need one instance of this struct per thread +/// for the duration of the program run. +pub(crate) struct SplitCache { + set: [u8; SPLIT_CACHE_LEN], +} + +impl ZopfliStateInit for SplitCache { + #[allow(unsafe_code)] + #[inline] + /// # State Initialization. + /// + /// See `ZopfliState` for more details. + unsafe fn state_init(nn: NonNull) { + // False is zeroes all the way down. + addr_of_mut!((*nn.as_ptr()).set).write_bytes(0, 1); + } +} + +impl SplitCache { + /// # Initialize. + /// + /// Reset the first `rng.len()` bits — these ranges always start at zero — + /// to false so we can track a new set of indices. + pub(crate) fn init(&mut self, rng: ZopfliRange) { + // Safety: ZopfliRange checks the range is non-empty and within the + // limit. + let blocksize = rng.len().get(); + if ZOPFLI_MASTER_BLOCK_SIZE < blocksize { crate::unreachable(); } + + // Fill uses bytes rather than bits, so we need to round up to ensure + // complete coverage for our range. + let bitsize = blocksize.div_ceil(8); + self.set[..bitsize].fill(0); + } + + #[inline] + /// # Not Checked? + /// + /// Returns true if the value is currently _unchecked_. (The caller takes + /// action on the negative rather than the positive.) + pub(crate) const fn is_unset(&self, pos: usize) -> bool { + let idx = pos.wrapping_div(8); // The byte. + let mask: u8 = 1 << (pos % 8); // The bit. + SPLIT_CACHE_LEN <= idx || 0 == self.set[idx] & mask + } + + #[inline] + /// # Mark as Checked. + pub(crate) fn set(&mut self, pos: usize) { + let idx = pos.wrapping_div(8); // The byte. + let mask: u8 = 1 << (pos % 8); // The bit. + if idx < SPLIT_CACHE_LEN { self.set[idx] |= mask; } + } +} + + + /// # Squeeze Cache. /// /// This struct stores LZ77 length costs and paths. @@ -255,54 +333,43 @@ impl MatchCache { /// to-block, but can actually go as high as a million and one! 
/// /// Lest that sound like a terrible waste, this struct only exists as part of -/// a thread-local static so will be reused as many times as needed. +/// a thread-local static so will be reused as many times as needed. That +/// static is also boxed to ensure the data winds up on the heap instead of the +/// stack. pub(crate) struct SqueezeCache { costs: [(f32, LitLen); ZOPFLI_MASTER_BLOCK_SIZE + 1], paths: [LitLen; ZOPFLI_MASTER_BLOCK_SIZE], costs_len: Cell, } -impl SqueezeCache { +impl ZopfliStateInit for SqueezeCache { #[allow(unsafe_code)] - /// # New (Boxed) Instance. - /// - /// Arrays holding a million+ elements is obviously less than ideal, but - /// because these are referenced repeatedly with different sub-slice sizes, - /// it is much better for performance than vectors that have to be - /// continuously resized/reallocated. + #[inline] + /// # State Initialization. /// - /// Still, these are too big for the stack, so we're initializing them via - /// raw pointers and jamming them straight into a `Box`. - pub(crate) fn new() -> Box { - // Reserve the space. - const LAYOUT: Layout = Layout::new::(); - let out = NonNull::new(unsafe { alloc(LAYOUT).cast() }) - .unwrap_or_else(|| handle_alloc_error(LAYOUT)); - let ptr: *mut Self = out.as_ptr(); - - unsafe { - // The arrays can be zero-filled to start with; they'll be reset - // or overwritten before use anyway. - addr_of_mut!((*ptr).costs).write_bytes(0, 1); - addr_of_mut!((*ptr).paths).write_bytes(0, 1); - - // Zero works equally well for the initial length, especially - // because it's true! Haha. - addr_of_mut!((*ptr).costs_len).write(Cell::new(0)); - - // All set! - Box::from_raw(ptr) - } + /// See `ZopfliState` for more details. + unsafe fn state_init(nn: NonNull) { + let ptr = nn.as_ptr(); + + // The arrays can be zero-filled to start with; they'll be reset + // or overwritten before each use anyway. + addr_of_mut!((*ptr).costs).write_bytes(0, 1); + addr_of_mut!((*ptr).paths).write_bytes(0, 1); + + // Zero works equally well for the initial length, especially since + // that happens to be true! + addr_of_mut!((*ptr).costs_len).write(Cell::new(0)); } +} +impl SqueezeCache { /// # Resize Costs. /// - /// This sets the internal costs length to match the desired blocksize, but - /// does _not_ reset their values. (Unlike the LMC, which more or less - /// persists for the duration of a given block, costs are calculated and - /// discarded and recalculated and discarded… several times.) - pub(crate) fn resize_costs(&self, blocksize: usize) { - self.costs_len.set(blocksize); + /// This method merely sets the internal cost-length variable to match + /// `chunk`'s block size (plus one). (It does _not_ reset the actual + /// cost data or anything like that.) + pub(crate) fn resize_costs(&self, chunk: &ZopfliChunk<'_>) { + self.costs_len.set(chunk.block_size().get() + 1); } /// # Reset Costs. @@ -311,14 +378,16 @@ impl SqueezeCache { /// `resize_costs` call. /// /// Note that only the costs themselves are reset; the lengths and paths - /// are dealt with _in situ_ during crunching (without being read). + /// are dealt with _in situ_ during crunching (without first being read). pub(crate) fn reset_costs(&mut self) -> &mut [(f32, LitLen)] { - let costs = self.costs.get_mut(..self.costs_len.get()).unwrap_or(&mut []); - if ! costs.is_empty() { - // The first cost needs to be zero; the rest need to be infinity. 
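The cost reset that follows is the usual shortest-path seeding: reaching position zero costs nothing, every later position starts at positive infinity and gets relaxed downward during the squeeze pass. A trivial standalone sketch (names are illustrative):

```rust
// Seed a cost table for a forward shortest-path pass.
fn seed_costs(len: usize) -> Vec<f32> {
    let mut costs = vec![f32::INFINITY; len];
    if let Some(first) = costs.first_mut() { *first = 0.0; }
    costs
}

fn main() {
    let costs = seed_costs(4);
    assert_eq!(costs[0], 0.0);
    assert!(costs[1..].iter().all(|c| c.is_infinite()));
}
```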
- costs[0].0 = 0.0; - for c in costs.iter_mut().skip(1) { c.0 = f32::INFINITY; } - } + // Safety: ZopfliChunk verifies the block size is under the limit and + // non-empty, and since costs is always blocks+1, the minimum is 2. + let len = self.costs_len.get(); + if ! (2..=ZOPFLI_MASTER_BLOCK_SIZE + 1).contains(&len) { crate::unreachable(); } + + let costs = &mut self.costs[..len]; + costs[0].0 = 0.0; + for c in &mut costs[1..] { c.0 = f32::INFINITY; } costs } @@ -373,12 +442,17 @@ const fn ld_split(ld: u32) -> (LitLen, u16) { /// # Max Sublength. /// -/// Return the maximum sublength length for a given chunk. -const fn max_sublen(slice: &[u8; SUBLEN_CACHED_LEN]) -> usize { +/// Return the maximum sublength length for a given cache chunk. +/// +/// Each three-byte cache-entry has its length recorded in the first byte; the +/// last such entry holds the maximum. +const fn max_sublen(slice: &[u8; SUBLEN_CACHED_LEN]) -> LitLen { // If the first chunk has no distance, assume a zero length. - if slice[1] == 0 && slice[2] == 0 { 0 } + if slice[1] == 0 && slice[2] == 0 { LitLen::L000 } // Otherwise the "max" is stored as the first value of the last chunk. - else { slice[SUBLEN_CACHED_LEN - 3] as usize + 3 } + // Since lengths are stored `-3`, we have to add three back to the stored + // value to make it a real length. + else { LitLen::from_packed_u8(slice[SUBLEN_CACHED_LEN - 3]) } } /// # Write Sublength. @@ -388,13 +462,13 @@ fn write_sublen(src: &[u8; SUBLEN_CACHED_LEN], dst: &mut [u16; SUBLEN_LEN]) { let maxlength = max_sublen(src); let mut old = 0; for chunk in src.chunks_exact(3) { - let length = usize::from(chunk[0]) + ZOPFLI_MIN_MATCH; - if old <= length { + let length = LitLen::from_packed_u8(chunk[0]); + if old <= (length as usize) { let value = u16::from_le_bytes([chunk[1], chunk[2]]); - dst[old..=length].fill(value); + dst[old..=length as usize].fill(value); } - if length == maxlength { return; } - old = length + 1; + if (length as u16) >= (maxlength as u16) { return; } + old = (length as usize) + 1; } } @@ -414,4 +488,52 @@ mod tests { // Joining should get us back where we started. assert_eq!(DEFAULT_LD, ld_join(len, dist)); } + + #[test] + fn t_split_mask() { + // What we expect our masks to look like. + const fn split_cache_mask(pos: usize) -> u8 { + match pos % 8 { + 0 => 0b0000_0001, + 1 => 0b0000_0010, + 2 => 0b0000_0100, + 3 => 0b0000_1000, + 4 => 0b0001_0000, + 5 => 0b0010_0000, + 6 => 0b0100_0000, + _ => 0b1000_0000, + } + } + + for pos in 0..255_usize { + let mask: u8 = 1 << (pos % 8); + assert_eq!(mask, split_cache_mask(pos)); + } + } + + #[test] + fn t_split_cache() { + let mut cache = SplitCache { + set: [0_u8; SPLIT_CACHE_LEN], + }; + + // Check that positions are false to start, true after set. + for i in 0..ZOPFLI_MASTER_BLOCK_SIZE { + assert!(cache.is_unset(i)); + cache.set(i); + assert!(! cache.is_unset(i)); + } + + // Everything should be set now. + assert!(cache.set.iter().all(|&b| b == u8::MAX)); + + // If we initialize with a small value, only those bits should be + // affected. + cache.init(ZopfliRange::new(0, 32).unwrap()); + assert_eq!(cache.set[0], 0); + assert_eq!(cache.set[1], 0); + assert_eq!(cache.set[2], 0); + assert_eq!(cache.set[3], 0); + assert_eq!(cache.set[4], u8::MAX); + } } diff --git a/flapfli/src/zopflipng/chunk.rs b/flapfli/src/zopflipng/chunk.rs new file mode 100644 index 0000000..c866139 --- /dev/null +++ b/flapfli/src/zopflipng/chunk.rs @@ -0,0 +1,314 @@ +/*! +# Flapfli: Slice Chunks. 
+*/ + +use std::num::NonZeroUsize; +use super::{ + zopfli_error, + ZOPFLI_MASTER_BLOCK_SIZE, + ZOPFLI_WINDOW_SIZE, + ZopfliError, + ZopfliRange, +}; + + + +#[derive(Debug, Clone, Copy)] +/// # Deflate Chunk. +/// +/// The deflate/zopfli process is weird. The data is sliced in +/// `ZOPFLI_MASTER_BLOCK_SIZE` windows, kinda, but the previous data is +/// included for the ride because it is sometimes relevant for hashing and +/// caching. +/// +/// Similar to `ZopfliRange`, this struct mainly exists to help enforce the +/// logical constraints so we don't have to repeat sanity checks every five +/// seconds. +/// +/// The struct's `from` value may or may not be zero — on the first pass there +/// won't be any previous data — but it will always be less than `arr.len()`, +/// and `arr.len() - from` will always be less than or equal to +/// `ZOPFLI_MASTER_BLOCK_SIZE`, i.e. one million. +pub(crate) struct ZopfliChunk<'a> { + arr: &'a [u8], + from: usize, +} + +impl<'a> ZopfliChunk<'a> { + /// # New. + /// + /// Define a new chunk with the given source and starting point. + /// + /// ## Errors. + /// + /// This will return an error if the slice is empty, `from` is out of + /// range, or the length from `from` is greater than + /// `ZOPFLI_MASTER_BLOCK_SIZE`. + pub(crate) const fn new(arr: &'a [u8], from: usize) -> Result { + if from < arr.len() && arr.len() - from <= ZOPFLI_MASTER_BLOCK_SIZE { + Ok(Self { arr, from }) + } + else { Err(zopfli_error!()) } + } + + /// # Reslice. + /// + /// Return a new instance capped to the range, or an error if the range is + /// out of bounds or otherwise violates the struct's requirements. + /// + /// The `start` serves as the new instances `from`. If `end` is less than + /// `arr.len()`, the new chunk's slice will be truncated accordingly. + pub(crate) fn reslice(&self, start: usize, end: usize) -> Result { + if start < end && end - start <= ZOPFLI_MASTER_BLOCK_SIZE && end <= self.arr.len() { + let arr = &self.arr[..end]; + Ok(Self { arr, from: start }) + } + else { Err(zopfli_error!()) } + } + + /// # Reslice to Range. + /// + /// Same as `ZopfliChunk::reslice`, but with the range specified as a + /// `ZopfliRange`. + /// + /// This version should be preferred in cases where such a range has + /// already been constructed since it moots all but one of the conditions + /// we'd otherwise need to verify before giving the `Ok()`. + pub(crate) fn reslice_rng(&self, rng: ZopfliRange) -> Result { + let arr = self.arr.get(..rng.end()).ok_or(zopfli_error!())?; + Ok(Self { arr, from: rng.start() }) + } +} + +impl<'a> ZopfliChunk<'a> { + /// # Full Slice. + /// + /// Return the entire data slice, including the prelude, if any. + /// + /// Note: this will never be empty. + pub(crate) const fn arr(&self) -> &[u8] { self.arr } + + /// # Block Slice. + /// + /// Return the "active" portion of the data slice, i.e. everything from + /// `from`. + /// + /// Note: this will never be empty. + pub(crate) fn block(&self) -> &[u8] { + #[allow(unsafe_code)] + // Safety: from is verified during construction. + unsafe { self.arr.get_unchecked(self.from..) } + } + + /// # First Value. + /// + /// Return the first value from the "active" portion of the data slice, + /// i.e. `arr[from]`. + /// + /// Because the current block may never be empty, there will always be at + /// least one value. + pub(crate) const fn first(&self) -> u8 { + // Safety: from is verified during construction. 
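A scaled-down model of the chunk-with-prelude idea may help here: the slice carries everything seen so far, `from` marks where the active block begins, and hashing can warm up on as much as one window's worth of prelude (32,768 bytes in the real code; a tiny window below purely for illustration):

```rust
// Stand-in types and constants; a sketch of the prelude/active-block split.
const WINDOW: usize = 8; // The real ZOPFLI_WINDOW_SIZE is 32,768.

struct Chunk<'a> { arr: &'a [u8], from: usize }

impl<'a> Chunk<'a> {
    fn block(&self) -> &'a [u8] { &self.arr[self.from..] }
    fn window_start(&self) -> usize { self.from.saturating_sub(WINDOW) }
}

fn main() {
    let data = [7_u8; 20];
    let chunk = Chunk { arr: &data, from: 12 };
    assert_eq!(chunk.block().len(), 8);  // Active block: bytes 12..20.
    assert_eq!(chunk.window_start(), 4); // Prelude available for hashing: bytes 4..12.
}
```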
+ if self.from >= self.arr.len() { crate::unreachable(); } + self.arr[self.from] + } + + /// # Active Length. + /// + /// Return the length of the "active" slice, e.g. its block size. + pub(crate) const fn block_size(&self) -> NonZeroUsize { + #[allow(unsafe_code)] + // Safety: the length is verified during construction. + unsafe { NonZeroUsize::new_unchecked(self.arr.len() - self.from) } + } + + /// # Current Position. + /// + /// Return the `from` index that marks the starting point of the "active" + /// portion of the data slice. + pub(crate) const fn pos(&self) -> usize { self.from } + + /// # Total Length. + /// + /// Return the length of the entire data slice, prelude and all. + pub(crate) const fn total_len(&self) -> NonZeroUsize { + #[allow(unsafe_code)] + // Safety: slices are verified non-empty at construction. + unsafe { NonZeroUsize::new_unchecked(self.arr.len()) } + } + + #[allow(unsafe_code)] + /// # Warmup Values. + /// + /// This returns the first one or two values from `window_start`, used for + /// warming up the `ZopfliHash` cache. + /// + /// Note: it is probably impossible for there to not be a second value, but + /// since we don't explicitly require lengths of two, it's safer to treat + /// it as optional. + pub(crate) const fn warmup_values(&self) -> (u8, Option) { + // Safety: from (and by association window_start) is verified at + // construction. + let window_start = self.window_start(); + if window_start >= self.arr.len() { crate::unreachable(); } + + let a = self.arr[window_start]; + + // There will usually be a second value, but not always! + let b = + if window_start + 1 < self.arr.len() { Some(self.arr[window_start + 1]) } + else { None }; + + (a, b) + } + + /// # Window Start. + /// + /// If we're at the beginning of a chunk, this is equivalent to + /// `ZopfliChunk::pos` (e.g. `self.from`), otherwise it reaches back up to + /// `ZOPFLI_WINDOW_SIZE` slots into the prelude, returning that index + /// instead. + pub(crate) const fn window_start(&self) -> usize { + self.from.saturating_sub(ZOPFLI_WINDOW_SIZE) + } +} + +impl<'a> ZopfliChunk<'a> { + /// # Reducing Prelude Iterator. + /// + /// Same as `ZopfliChunk::reducing_block_iter`, except the chunks are + /// restricted to the range of the prelude — `window_start..from` — if any. + /// + /// If there is no prelude, `None` is returned instead. + /// + /// Note: the internal slice will be truncated if needed to uphold the + /// maximum length constraint, but that loss doesn't actually matter since + /// prelude hashing never looks at more than `u16::MAX` bytes anyway. (A + /// million minus thirty-odd thousand is still much more than that!) + pub(crate) fn reducing_prelude_iter(self) -> Option>> { + // If we're at the start of the slice, there is no prelude. + if self.from == 0 { None } + else { + // Safety: from (and by association window_start) is verified at + // construction. + let window_start = self.window_start(); + if window_start >= self.arr.len() { crate::unreachable(); } + + let arr = + if self.arr.len() - window_start <= ZOPFLI_MASTER_BLOCK_SIZE { self.arr } + else { &self.arr[..window_start + ZOPFLI_MASTER_BLOCK_SIZE] }; + + let chunk = Self { arr, from: window_start }; + Some(ZopfliChunkIter(chunk).take(self.from - window_start)) + } + } + + /// # Reducing Block Chunk Iterator. + /// + /// Return an iterator that increases the block's starting position (`from`) + /// after each pass, stopping once the chunk would be empty/invalid. 
+ /// + /// Similar to the more generic `ReducingSlice` iterator, this starts with + /// the current value, so there will always be at least one valid result + /// before `None`. + pub(crate) const fn reducing_block_iter(self) -> ZopfliChunkIter<'a> { + ZopfliChunkIter(self) + } +} + + + +/// # Chunk Iterator. +/// +/// This iterator yields increasingly smaller chunks until empty, incrementing +/// the starting position by one after each cycle, beginning with the seed +/// chunk. +pub(crate) struct ZopfliChunkIter<'a>(ZopfliChunk<'a>); + +impl<'a> Iterator for ZopfliChunkIter<'a> { + type Item = ZopfliChunk<'a>; + + fn next(&mut self) -> Option { + // We potentially break the constraints during iteration so need to + // explicitly check from is still in range and non-empty before + // returning. + if self.0.from < self.0.arr.len() { + let next = Some(self.0); + self.0.from += 1; + next + } + else { None } + } + + fn size_hint(&self) -> (usize, Option) { + let len = self.0.arr.len() - self.0.from; + (len, Some(len)) + } +} + +impl<'a> ExactSizeIterator for ZopfliChunkIter<'a> { + fn len(&self) -> usize { self.0.arr.len() - self.0.from } +} + + + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn t_reducing_block_iter() { + let arr: &[u8] = &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]; + let chunk = ZopfliChunk { arr, from: 1 }; + let mut iter = chunk.reducing_block_iter(); + + let mut len = 9; + let mut from = 1; + loop { + // Check the iterator's presumed length. + assert_eq!(iter.len(), len); + if len == 0 { break; } + + // Pull the next entry and check the result. + let next = iter.next().expect("reducing block iter terminated early"); + assert_eq!(next.block(), &arr[from..]); + assert_eq!(next.pos(), from); + + len -= 1; + from += 1; + } + + // It should be empty. + assert!(iter.next().is_none()); + } + + #[test] + fn t_reducing_prelude_iter() { + let arr: &[u8] = &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]; + let chunk = ZopfliChunk { arr, from: 1 }; + let mut iter = chunk.reducing_prelude_iter().expect("missing prelude iter"); + + assert_eq!(iter.len(), 1); + let next = iter.next().expect("reducing prelude iter terminated early"); + assert_eq!(next.block(), arr); + assert_eq!(next.pos(), 0); + + assert_eq!(iter.len(), 0); + assert!(iter.next().is_none()); + + // Let's try it again with a chunk that has no prelude. + let chunk = ZopfliChunk { arr, from: 0 }; + assert!(chunk.reducing_prelude_iter().is_none()); + + // And let's try one that is too big. + let arr: &[u8] = &[0; ZOPFLI_MASTER_BLOCK_SIZE + 10]; + let chunk = ZopfliChunk::new(arr, 10).expect("Chunk failed."); + let mut iter = chunk.reducing_prelude_iter().expect("missing prelude iter"); + + assert_eq!(iter.len(), 10); + let next = iter.next().expect("reducing prelude iter terminated early"); + + // The slice should be truncated to fit the constraint. + assert_eq!(next.block_size().get(), ZOPFLI_MASTER_BLOCK_SIZE); + } +} diff --git a/flapfli/src/zopflipng/error.rs b/flapfli/src/zopflipng/error.rs index 250c467..021a443 100644 --- a/flapfli/src/zopflipng/error.rs +++ b/flapfli/src/zopflipng/error.rs @@ -26,6 +26,9 @@ pub(crate) type ZopfliError = (); /// /// When debug assertions are _enabled_, error responses panic with the /// relevant source details to aid further investigation. +/// +/// This struct stores those details, allowing us to delay the panicking until +/// the error has bubbled back to lodepng. 
pub(crate) struct ZopfliError { file: &'static str, line: u32, @@ -58,9 +61,7 @@ impl fmt::Display for ZopfliError { /// /// The debug version of this macro panics with a message indicating the file /// and line number to aid further investigation. -macro_rules! zopfli_error { - () => (ZopfliError::new(file!(), line!())); -} +macro_rules! zopfli_error { () => (ZopfliError::new(file!(), line!())); } #[cfg(not(debug_assertions))] /// # Error Macro (Release). @@ -68,5 +69,5 @@ macro_rules! zopfli_error { /// The non-debug version simply returns a `()`. macro_rules! zopfli_error { () => (()); } -/// # Expose it to the rest of the module. +/// # Expose the macro to the rest of the module. pub(super) use zopfli_error; diff --git a/flapfli/src/zopflipng/hash.rs b/flapfli/src/zopflipng/hash.rs index 1b55ebe..d0f751e 100644 --- a/flapfli/src/zopflipng/hash.rs +++ b/flapfli/src/zopflipng/hash.rs @@ -19,23 +19,27 @@ use std::{ }, }; use super::{ - DISTANCE_BITS, + DISTANCE_BITS_F, DISTANCE_SYMBOLS, - LENGTH_SYMBOL_BITS, + LENGTH_SYMBOL_BITS_F, LENGTH_SYMBOLS, LitLen, LZ77Store, MatchCache, + ReducingSlices, + SplitCache, SqueezeCache, stats::SymbolStats, SUBLEN_LEN, zopfli_error, ZOPFLI_MAX_MATCH, ZOPFLI_MIN_MATCH, + ZOPFLI_WINDOW_SIZE, + ZopfliChunk, ZopfliError, + ZopfliRange, }; -const ZOPFLI_WINDOW_SIZE: usize = 32_768; const ZOPFLI_WINDOW_MASK: usize = ZOPFLI_WINDOW_SIZE - 1; const HASH_SHIFT: i32 = 5; const HASH_MASK: i16 = 32_767; @@ -55,57 +59,105 @@ const ZEROED_SUBLEN: [u16; SUBLEN_LEN] = [0; SUBLEN_LEN]; /// # Zopfli State. /// -/// This consolidates the Longest Match, Squeeze, and Hash caches into a single -/// structure, cutting down on the number of references being bounced around -/// from method to method. +/// This consolidates the Longest Match, Squeeze, Split, and Hash caches into a +/// single gratuitous structure, cutting down on the number of references we +/// need to bounce from method to method. +/// +/// Each member is big and terrible in its own right, but on the bright side we +/// only need a single instance per thread for the duration of the program run, +/// so the allocations are a one-and-done affair. +/// +/// (That local lives in `deflate.rs`.) pub(crate) struct ZopfliState { - lmc: Box, - hash: Box, - squeeze: Box, + lmc: MatchCache, + hash: ZopfliHash, + split: SplitCache, + squeeze: SqueezeCache, } impl ZopfliState { + #[allow(unsafe_code)] + #[inline(never)] /// # New. - pub(crate) fn new() -> Self { - Self { - lmc: MatchCache::new(), - hash: ZopfliHash::new(), - squeeze: SqueezeCache::new(), + /// + /// This struct's members are mostly large and terrible arrays. To keep + /// them off the stack, it is necessary to initialize everything from raw + /// pointers and box them up. + /// + /// This unfortunately requires a lot of upfront unsafe code during + /// construction, but everything can be accessed normally thereafter. + /// + /// To cut down on some of the complexity, the manual layout allocation and + /// boxing is done once, here, instead of separately for each individual + /// member. + /// + /// See `ZopfliStateInit` below for a few more details. + pub(crate) fn new() -> Box { + // Reserve the space. + const LAYOUT: Layout = Layout::new::(); + let out: NonNull = NonNull::new(unsafe { alloc(LAYOUT).cast() }) + .unwrap_or_else(|| handle_alloc_error(LAYOUT)); + let ptr = out.as_ptr(); + + unsafe { + // Initialize the members. 
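The construction pattern used here, allocating the layout directly, initializing each field through raw pointers, and then adopting the allocation with `Box::from_raw`, is what keeps the huge arrays from ever existing on the stack. A reduced, self-contained sketch of the same trick with a stand-in type:

```rust
use std::alloc::{alloc, handle_alloc_error, Layout};
use std::ptr::{addr_of_mut, NonNull};

// `Big` is a stand-in for the real cache structs; a megabyte of payload is
// plenty to make a plain `Box::new` stack copy undesirable.
struct Big { bytes: [u8; 1 << 20], len: usize }

fn new_boxed_big() -> Box<Big> {
    const LAYOUT: Layout = Layout::new::<Big>();
    let nn: NonNull<Big> = NonNull::new(unsafe { alloc(LAYOUT).cast() })
        .unwrap_or_else(|| handle_alloc_error(LAYOUT));
    let ptr = nn.as_ptr();
    unsafe {
        // Fill each field in place; nothing is read before being written.
        addr_of_mut!((*ptr).bytes).write_bytes(0, 1);
        addr_of_mut!((*ptr).len).write(0);
        // Hand ownership of the heap allocation to a Box.
        Box::from_raw(ptr)
    }
}

fn main() {
    let big = new_boxed_big();
    assert_eq!(big.len, 0);
    assert!(big.bytes.iter().all(|&b| b == 0));
}
```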
+ MatchCache::state_init(NonNull::new_unchecked(addr_of_mut!((*ptr).lmc))); + ZopfliHash::state_init(NonNull::new_unchecked(addr_of_mut!((*ptr).hash))); + SplitCache::state_init(NonNull::new_unchecked(addr_of_mut!((*ptr).split))); + SqueezeCache::state_init(NonNull::new_unchecked(addr_of_mut!((*ptr).squeeze))); + + // Done! + Box::from_raw(ptr) } } /// # Initialize LMC/Squeeze Caches. - pub(crate) fn init_lmc(&mut self, blocksize: usize) { - self.lmc.init(blocksize); - self.squeeze.resize_costs(blocksize + 1); + /// + /// This prepares the Longest Match Cache and Squeeze caches for subsequent + /// work on `chunk`, if any. + pub(crate) fn init_lmc(&mut self, chunk: &ZopfliChunk<'_>) { + self.lmc.init(chunk); + self.squeeze.resize_costs(chunk); + } + + /// # Split Cache. + /// + /// Clear the split cache and return a mutable reference to it so the + /// split points within `rng` can be tracked. + pub(crate) fn split_cache(&mut self, rng: ZopfliRange) -> &mut SplitCache { + self.split.init(rng); + &mut self.split } } impl ZopfliState { #[inline(never)] /// # Greedy LZ77 Run (No Inlining). + /// + /// Same as `greedy`, but the compiler is given an `inline(never)` hint to + /// (hopefully) keep all this code from affecting its inlining decisions + /// about the caller. pub(crate) fn greedy_cold( &mut self, - arr: &[u8], - instart: usize, + chunk: ZopfliChunk<'_>, store: &mut LZ77Store, cache: Option, ) -> Result<(), ZopfliError> { - self.greedy(arr, instart, store, cache) + self.greedy(chunk, store, cache) } - #[allow(unsafe_code, clippy::cast_possible_truncation)] + #[allow(clippy::cast_possible_truncation)] #[inline] /// # Greedy LZ77 Run. /// /// This method looks for best-length matches in the data (and/or cache), /// updating the store with the results. /// - /// This is one of two entrypoints into the inner `ZopfliHash` data. + /// This is very similar to `ZopfliState::optimal_run`, but better suited + /// for general-purpose store population. pub(crate) fn greedy( &mut self, - arr: &[u8], - instart: usize, + chunk: ZopfliChunk<'_>, store: &mut LZ77Store, cache: Option, ) -> Result<(), ZopfliError> { @@ -117,7 +169,10 @@ impl ZopfliState { // Reset the store and hash. store.clear(); - self.hash.reset(arr, instart); + self.hash.reset(chunk); + + // Short circuit. + let mut iter = chunk.reducing_block_iter(); // We'll need a few more variables… let mut sublen = ZEROED_SUBLEN; @@ -126,17 +181,16 @@ impl ZopfliState { let mut prev_length = LitLen::L000; let mut prev_distance: u16 = 0; let mut match_available = false; + let mut prev_value = 0_u8; // Loop the data! - let mut i = instart; - while i < arr.len() { - // Update the hash. - self.hash.update_hash(&arr[i..], i); + while let Some(chunk2) = iter.next() { + self.hash.update_hash(chunk2); + let prev_prev_value = std::mem::replace(&mut prev_value, chunk2.first()); // Run the finder. self.hash.find( - arr, - i, + chunk2, LitLen::MAX_MATCH, &mut Some(&mut sublen), &mut distance, @@ -152,20 +206,15 @@ impl ZopfliState { match_available = false; if length_score > prev_length_score + 1 { - // Safety: match_available starts false so even if instart - // is zero, we won't reach this part until we've iterated - // at least once. store.push( - LitLen::from_u8(unsafe { *arr.get_unchecked(i - 1) }), + LitLen::from_u8(prev_prev_value), 0, - i - 1, + chunk2.pos() - 1, ); if length_score >= ZOPFLI_MIN_MATCH as u16 && ! 
length.is_max() { match_available = true; prev_length = length; prev_distance = distance; - - i += 1; continue; } } @@ -175,16 +224,13 @@ impl ZopfliState { distance = prev_distance; // Write the values! - store.push(length, distance, i - 1); + store.push(length, distance, chunk2.pos() - 1); // Update the hash up through length and increment the loop // position accordingly. - for _ in 2..(length as u16) { - i += 1; - self.hash.update_hash(&arr[i..], i); + for chunk2 in iter.by_ref().take(length as usize - 2) { + self.hash.update_hash(chunk2); } - - i += 1; continue; } } @@ -194,58 +240,73 @@ impl ZopfliState { match_available = true; prev_length = length; prev_distance = distance; - - i += 1; continue; } // Write the current length/distance. if length_score >= ZOPFLI_MIN_MATCH as u16 { - store.push(length, distance, i); + store.push(length, distance, chunk2.pos()); } // Write from the source with no distance and reset the length to // one. else { length = LitLen::L001; - store.push(LitLen::from_u8(arr[i]), 0, i); + store.push(LitLen::from_u8(chunk2.first()), 0, chunk2.pos()); } // Update the hash up through length and increment the loop // position accordingly. - for _ in 1..(length as u16) { - i += 1; - self.hash.update_hash(&arr[i..], i); + for chunk2 in iter.by_ref().take(length as usize - 1) { + self.hash.update_hash(chunk2); } - - i += 1; } Ok(()) } #[inline(never)] - /// # Optimal Run (No Inlining). - pub(crate) fn optimal_run_cold( + /// # Optimal Run (Fixed). + /// + /// Same as `ZopfliHash::optimal_run`, but fixed tree counts and symbols + /// are used instead of the store's actual histogram. + pub(crate) fn optimal_run_fixed( &mut self, - arr: &[u8], - instart: usize, - stats: Option<&SymbolStats>, + chunk: ZopfliChunk<'_>, store: &mut LZ77Store, - ) -> Result<(), ZopfliError> { self.optimal_run(arr, instart, stats, store) } + ) -> Result<(), ZopfliError> { + // Reset the store and costs. + store.clear(); + let costs = self.squeeze.reset_costs(); + if ! costs.is_empty() { + // Reset and warm the hash. + self.hash.reset(chunk); - #[inline] + // Forward and backward squeeze passes. + self.hash.get_best_lengths_fixed(chunk, costs, &mut self.lmc)?; + let paths = self.squeeze.trace_paths()?; + if ! paths.is_empty() { + self.hash.follow_paths( + chunk, + paths, + store, + &mut self.lmc, + )?; + } + } + + Ok(()) + } + + #[inline(never)] /// # Optimal Run. /// - /// This performs backward/forward squeeze passes on the data, optionally - /// considering existing histogram data. The `store` is updated with the - /// best-length match data. - /// - /// This is one of two entrypoints into the inner `ZopfliHash` data. + /// This performs backward/forward squeeze passes on the data with + /// existing histogram data. The `store` is updated with the best-length + /// match data. pub(crate) fn optimal_run( &mut self, - arr: &[u8], - instart: usize, - stats: Option<&SymbolStats>, + chunk: ZopfliChunk<'_>, + stats: &SymbolStats, store: &mut LZ77Store, ) -> Result<(), ZopfliError> { // Reset the store and costs. @@ -253,15 +314,14 @@ impl ZopfliState { let costs = self.squeeze.reset_costs(); if ! costs.is_empty() { // Reset and warm the hash. - self.hash.reset(arr, instart); + self.hash.reset(chunk); // Forward and backward squeeze passes. - self.hash.get_best_lengths(arr, instart, stats, costs, &mut self.lmc)?; + self.hash.get_best_lengths(chunk, stats, costs, &mut self.lmc)?; let paths = self.squeeze.trace_paths()?; if ! 
paths.is_empty() { self.hash.follow_paths( - arr, - instart, + chunk, paths, store, &mut self.lmc, @@ -275,88 +335,85 @@ impl ZopfliState { +/// # State Init. +/// +/// The `ZopfliState` struct is initialized from a raw pointer to prevent +/// stack allocations. This trait exposes — in as limited a way as possible — +/// raw initialization methods for its members. (`ZopfliState::new` is the only +/// place that calls these methods.) +/// +/// The `state_init` invocations do not necessarily populate _default_ values +/// since they'll be re(reset) prior to use anyway, but the values will at +/// least be valid for their types, preventing accidental UB. +pub(crate) trait ZopfliStateInit { + #[allow(unsafe_code)] + /// # State Initialization. + unsafe fn state_init(nn: NonNull); +} + + + #[derive(Clone, Copy)] /// # Zopfli Hash. /// /// This structure tracks byte values and hashes by position, facilitating /// match-finding (length and distance) at various offsets. -/// -/// It is functionally equivalent to the original `hash.c` structure, but with -/// more consistent member typing, sizing, and naming. struct ZopfliHash { chain1: ZopfliHashChain, chain2: ZopfliHashChain, - /// Repetitions of the same byte after this. + /// # Repetitions of the same byte after this. same: [u16; ZOPFLI_WINDOW_SIZE], } -impl ZopfliHash { +impl ZopfliStateInit for ZopfliHash { #[allow(unsafe_code)] - /// # New (Boxed) Instance. - /// - /// The fixed arrays holding this structure's data are monstrous — 458,756 - /// bytes per instance! — but absolutely critical for performance. + #[inline] + /// # State Initialization. /// - /// To keep Rust from placing all that shit on the stack — as it would - /// normally try to do — this method manually initializes everything from - /// raw pointers, then boxes it up for delivery à la [`zopfli-rs`](https://github.com/zopfli-rs/zopfli). - fn new() -> Box { - // Reserve the space. - const LAYOUT: Layout = Layout::new::(); - let out = NonNull::new(unsafe { alloc(LAYOUT).cast() }) - .unwrap_or_else(|| handle_alloc_error(LAYOUT)); - let ptr: *mut Self = out.as_ptr(); - - // Safety: all this pointer business is necessary to keep the content - // off the stack. Once it's boxed we can breathe easier. ;) - unsafe { - // All the hash/index arrays default to `-1_i16` for `None`, which - // we can do efficiently by setting all bits to one. - addr_of_mut!((*ptr).chain1.hash_idx).write_bytes(u8::MAX, 1); - addr_of_mut!((*ptr).chain1.idx_hash).write_bytes(u8::MAX, 1); - addr_of_mut!((*ptr).chain1.idx_prev).write_bytes(u8::MAX, 1); + /// See `ZopfliState` for more details. + unsafe fn state_init(nn: NonNull) { + let ptr = nn.as_ptr(); - // The initial hash value is just plain zero. - addr_of_mut!((*ptr).chain1.val).write(0); + // All the hash/index arrays default to `-1_i16` for `None`, which + // we can do efficiently by flipping all bits on. + addr_of_mut!((*ptr).chain1.hash_idx).write_bytes(u8::MAX, 1); + addr_of_mut!((*ptr).chain1.idx_hash).write_bytes(u8::MAX, 1); + addr_of_mut!((*ptr).chain1.idx_prev).write_bytes(u8::MAX, 1); - // The second chain is the same as the first, so we can simply copy - // it wholesale. - addr_of_mut!((*ptr).chain2).copy_from_nonoverlapping(addr_of!((*ptr).chain1), 1); + // The initial hash value is just plain zero. + addr_of_mut!((*ptr).chain1.val).write(0); - // The repetition counts default to zero. - addr_of_mut!((*ptr).same).write_bytes(0, 1); + // The second chain is the same as the first, so we can simply copy + // it wholesale. 
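The "flip all bits on" shortcut mentioned above works because a two's-complement `-1` is all ones, so byte-filling an `i16` array with `0xFF` is the same as filling it with the `-1` sentinel the hash chains use for "no entry yet". A tiny demonstration:

```rust
fn main() {
    let mut idx = [0_i16; 4];
    // Equivalent in effect to write_bytes(u8::MAX, 1) over the whole array.
    for v in idx.iter_mut() { *v = i16::from_ne_bytes([u8::MAX, u8::MAX]); }
    assert!(idx.iter().all(|&v| v == -1));
}
```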
+ addr_of_mut!((*ptr).chain2).copy_from_nonoverlapping(addr_of!((*ptr).chain1), 1); - // All set! - Box::from_raw(ptr) - } + // The repetition counts default to zero. + addr_of_mut!((*ptr).same).write_bytes(0, 1); } +} +impl ZopfliHash { /// # Reset/Warm Up. /// /// This sets all values to their defaults, then cycles the first chain's /// hash value once or twice, then hashes the bits between the start of the /// window and the start of the slice we're actually interested in, if any. - fn reset( - &mut self, - arr: &[u8], - instart: usize, - ) { + fn reset(&mut self, chunk: ZopfliChunk<'_>) { // Reset the data. self.chain1.reset(); self.chain2.reset(); self.same.fill(0); // Cycle the hash once or twice. - if instart >= arr.len() { return; } - let windowstart = instart.saturating_sub(ZOPFLI_WINDOW_SIZE); - self.update_hash_value(arr[windowstart]); - if windowstart + 1 < arr.len() { - self.update_hash_value(arr[windowstart + 1]); - } + let (a, b) = chunk.warmup_values(); + self.update_hash_value(a); + if let Some(b) = b { self.update_hash_value(b); } - // Process the values between windowstart and instart. - for i in windowstart..instart { self.update_hash(&arr[i..], i); } + // Process the values between windowstart and instart, if any. + if let Some(iter) = chunk.reducing_prelude_iter() { + for chunk2 in iter { self.update_hash(chunk2); } + } } #[allow( @@ -366,23 +423,24 @@ impl ZopfliHash { )] /// # Update Hash. /// - /// This updates the hash tables using the data from `arr`. The `pos` value - /// marks the position of `arr` within the original block slice. (That is, - /// `arr` is pre-sliced to `arr[pos..]` before being passed to this method.) - fn update_hash(&mut self, arr: &[u8], pos: usize) { + /// This updates the hash tables using the chunk's block data. + fn update_hash(&mut self, chunk: ZopfliChunk<'_>) { + let pos = chunk.pos(); let hpos = pos & ZOPFLI_WINDOW_MASK; // Cycle the first hash. + let arr = chunk.block(); self.update_hash_value(arr.get(ZOPFLI_MIN_MATCH - 1).map_or(0, |v| *v)); self.chain1.update_hash(pos); // Count up the repetitions (and update sameness). + let current = chunk.first(); let mut amount = self.same[pos.wrapping_sub(1) & ZOPFLI_WINDOW_MASK] .saturating_sub(1); while amount < u16::MAX && usize::from(amount) + 1 < arr.len() && - arr[0] == arr[usize::from(amount) + 1] + current == arr[usize::from(amount) + 1] { amount += 1; } @@ -396,6 +454,8 @@ impl ZopfliHash { /// # Update Hash Value. /// /// This updates the rotating (chain1) hash value. + /// + /// Note: the value will always fit within the equivalent of `u15`. fn update_hash_value(&mut self, c: u8) { self.chain1.val = ((self.chain1.val << HASH_SHIFT) ^ i16::from(c)) & HASH_MASK; } @@ -412,40 +472,107 @@ impl ZopfliHash { /// /// Note: the costs really do need to be calculated in 64 bits, truncated /// to 32 bits for storage, then widened back to 64 bits for comparison. + /// Zopfli is evil! fn get_best_lengths( &mut self, - arr: &[u8], - instart: usize, - stats: Option<&SymbolStats>, + chunk: ZopfliChunk<'_>, + stats: &SymbolStats, costs: &mut [(f32, LitLen)], lmc: &mut MatchCache, ) -> Result<(), ZopfliError> { - // Costs and lengths are resized prior to this point; they should be - // one larger than the data of interest (and equal to each other). - debug_assert!(costs.len() == arr.len() - instart + 1); + /// # Minimum Cost Model (Non-Zero Distances). + fn minimum_cost(stats: &SymbolStats) -> f64 { + // Find the minimum length cost. 
+ let mut length_cost = f64::INFINITY; + for (lsym, lbits) in LENGTH_SYMBOLS.iter().copied().zip(LENGTH_SYMBOL_BITS_F.into_iter()).skip(3) { + let cost = lbits + stats.ll_symbols[lsym as usize]; + if cost < length_cost { length_cost = cost; } + } + + // Now find the minimum distance cost. + let mut dist_cost = f64::INFINITY; + for (bits, v) in MIN_COST_DISTANCES.iter().copied().zip(stats.d_symbols) { + let cost = f64::from(bits) + v; + if cost < dist_cost { dist_cost = cost; } + } + + // Add them together and we have our minimum. + length_cost + dist_cost + } + + /// # Adjusted Cost. + fn stat_cost(dist: u16, k: LitLen, stats: &SymbolStats) -> f64 { + if dist == 0 { stats.ll_symbols[k as usize] } + else { + let dsym = DISTANCE_SYMBOLS[(dist & 32_767) as usize]; + DISTANCE_BITS_F[dsym as usize] + + stats.d_symbols[dsym as usize] + + stats.ll_symbols[LENGTH_SYMBOLS[k as usize] as usize] + + LENGTH_SYMBOL_BITS_F[k as usize] + } + } + + // The costs are sized according to the (relevant) array slice; they + // should always be exactly one larger. + if costs.len() != chunk.block_size().get() + 1 { + return Err(zopfli_error!()); + } + + // Iterators will help us avoid a bunch of unsafe. + let instart = chunk.pos(); + let mut iter = chunk.reducing_block_iter().zip( + ReducingSlices::new(Cell::from_mut(costs).as_slice_of_cells()) + ); let mut length = LitLen::L000; let mut distance = 0_u16; let mut sublen = ZEROED_SUBLEN; - // Find the minimum and maximum cost. - let min_cost = stats.map_or(12.0, get_minimum_cost); + // Find the minimum and symbol costs, which we'll need to reference + // repeatedly in the loop. + let min_cost = minimum_cost(stats); + let symbol_cost = stats.ll_symbols[285] + stats.d_symbols[0]; + + while let Some((mut chunk2, mut cost2)) = iter.next() { + debug_assert_eq!(chunk2.block_size().get() + 1, cost2.len()); - let mut i = instart; - while i < arr.len() { // Hash the remainder. - self.update_hash(&arr[i..], i); + self.update_hash(chunk2); + + let pos = chunk2.pos(); + if + // We have more than ZOPFLI_MAX_MATCH entries behind us, and twice + // twice as many ahead of us. + pos > instart + ZOPFLI_MAX_MATCH + 1 && + chunk2.block_size().get() > ZOPFLI_MAX_MATCH * 2 + 1 && + // The current and max-match-ago positions have long repetitions. + self.same[pos & ZOPFLI_WINDOW_MASK] > ZOPFLI_MAX_MATCH as u16 * 2 && + self.same[(pos - ZOPFLI_MAX_MATCH) & ZOPFLI_WINDOW_MASK] > ZOPFLI_MAX_MATCH as u16 + { + // Fast forward! + let before = pos; + for (chunk3, cost3) in iter.by_ref().take(ZOPFLI_MAX_MATCH) { + // Safety: arr2.len() has at least ZOPFLI_MAX_MATCH*2+1 + // remaining entries; cost2.len() will be at least one + // more than that. + if cost2.len() <= ZOPFLI_MAX_MATCH { crate::unreachable(); } + cost2[ZOPFLI_MAX_MATCH].set(( + (f64::from(cost2[0].get().0) + symbol_cost) as f32, + LitLen::MAX_MATCH, + )); + cost2 = cost3; // The costs are rotated _after_ updating… + + chunk2 = chunk3; // …but the array is rotated beforehand. + self.update_hash(chunk2); + } - // We're in a long repetition of the same character and have more - // than ZOPFLI_MAX_MATCH ahead of and behind us. - if self._get_best_lengths_max_match(instart, i, stats, arr, costs) { - i += ZOPFLI_MAX_MATCH; + debug_assert_eq!(chunk2.pos() - before, ZOPFLI_MAX_MATCH); + debug_assert_eq!(chunk2.block_size().get() + 1, cost2.len()); } // Find the longest remaining match. 
self.find( - arr, - i, + chunk2, LitLen::MAX_MATCH, &mut Some(&mut sublen), &mut distance, @@ -454,44 +581,40 @@ impl ZopfliHash { Some(instart), )?; - // Relative position for the costs and lengths, which have - // (iend - istart + 1) entries, so j is always in range when i is. - let j = i - instart; - - // This should never trigger; it is mainly a reminder to the - // compiler that our i/j indices are still applicable. - if i >= arr.len() || j + 1 >= costs.len() { break; } - - let cost_j = f64::from(costs[j].0); - let new_cost = stats.map_or_else( - || if arr[i] <= 143 { 8.0 } else { 9.0 }, - |s| s.ll_symbols[usize::from(arr[i])], - ) + cost_j; - debug_assert!(0.0 <= new_cost); + // Safety: the MAX loop (if it ran at all) only advanced the + // slices ZOPFLI_MAX_MATCH; we have more work to do! + if cost2.len() < 2 { crate::unreachable(); } // Update it if lower. - if new_cost < f64::from(costs[j + 1].0) { - costs[j + 1].0 = new_cost as f32; - costs[j + 1].1 = LitLen::L001; + let cost_j = f64::from(cost2[0].get().0); + let new_cost = stats.ll_symbols[usize::from(chunk2.first())] + cost_j; + if new_cost < f64::from(cost2[1].get().0) { + cost2[1].set((new_cost as f32, LitLen::L001)); } // If a long match was found, peek forward to recalculate those // costs, at least the ones who could benefit from the expense of // all that effort. - let limit = length.min_usize(costs.len().saturating_sub(j + 1)); + let limit = length.min_usize(cost2.len() - 1); if limit.is_matchable() { - let sublen2 = &sublen[ZOPFLI_MIN_MATCH..=limit as usize]; - let costs2 = &mut costs[j + ZOPFLI_MIN_MATCH..]; - if let Some(s) = stats { - peek_ahead_stats(cost_j, min_cost, sublen2, costs2, s); - } - else { - peek_ahead_fixed(cost_j, min_cost, sublen2, costs2); + let min_cost_add = min_cost + cost_j; + + // Safety: limit is capped to cost2.len() - 1. + if cost2.len() <= (limit as usize) { crate::unreachable(); } + + for ((dist, c), k) in sublen[ZOPFLI_MIN_MATCH..=limit as usize].iter() + .copied() + .zip(&cost2[ZOPFLI_MIN_MATCH..=limit as usize]) + .zip(LitLen::matchable_iter()) + { + let current_cost = f64::from(c.get().0); + if min_cost_add < current_cost { + // Update it if lower. + let new_cost = cost_j + stat_cost(dist, k, stats); + if new_cost < current_cost { c.set((new_cost as f32, k)); } + } } } - - // Back around again! - i += 1; } // All costs should have been updated… @@ -500,64 +623,136 @@ impl ZopfliHash { } #[allow(clippy::cast_possible_truncation)] - /// # Best Length Max Match. - /// - /// This fast-forwards through long repetitions in the middle of a - /// `ZopfliHash::get_best_lengths` block, processing `ZOPFLI_MAX_MATCH` - /// `arr` and `costs` entries in one go. + #[inline(never)] + /// # Get Best Lengths (Fixed). /// - /// Returns `true` if such a match was found so the indices can be - /// incremented accordingly on the caller's side. - fn _get_best_lengths_max_match( + /// Same as `ZopfliHash::get_best_lengths`, but simpler fixed-tree lengths + /// and symbols are used instead of variable store-specific data. + fn get_best_lengths_fixed( &mut self, - instart: usize, - mut pos: usize, - stats: Option<&SymbolStats>, - arr: &[u8], + chunk: ZopfliChunk<'_>, costs: &mut [(f32, LitLen)], - ) -> bool { - if - // We have more than ZOPFLI_MAX_MATCH entries behind us, and twice - // twice as many ahead of us. - pos > instart + ZOPFLI_MAX_MATCH + 1 && - arr.len() > pos + ZOPFLI_MAX_MATCH * 2 + 1 && - // The current and max-match-ago positions have long repetitions. 
- self.same[pos & ZOPFLI_WINDOW_MASK] > ZOPFLI_MAX_MATCH as u16 * 2 && - self.same[(pos - ZOPFLI_MAX_MATCH) & ZOPFLI_WINDOW_MASK] > ZOPFLI_MAX_MATCH as u16 - { - // The symbol cost for ZOPFLI_MAX_LENGTH (and a distance of 1) doesn't - // need mutch calculation. - let symbol_cost = stats.map_or( - 13.0, - |s| s.ll_symbols[285] + s.d_symbols[0], - ); + lmc: &mut MatchCache, + ) -> Result<(), ZopfliError> { + /// # Adjusted Cost. + /// + /// These are really tiny so we might as well use single-byte math. + const fn fixed_cost(dist: u16, k: LitLen) -> u8 { + use super::{DISTANCE_BITS, LENGTH_SYMBOL_BITS}; + + if dist == 0 { 8 + (143 < (k as u16)) as u8 } + else { + let dsym = DISTANCE_SYMBOLS[(dist & 32_767) as usize]; + DISTANCE_BITS[dsym as usize] + + LENGTH_SYMBOL_BITS[k as usize] + + (114 < (k as u16)) as u8 + + 12 + } + } + + // The costs are sized according to the (relevant) array slice; they + // should always be exactly one larger. + if costs.len() != chunk.block_size().get() + 1 { + return Err(zopfli_error!()); + } + + // Iterators will help us avoid a bunch of unsafe. + let instart = chunk.pos(); + let mut iter = chunk.reducing_block_iter().zip( + ReducingSlices::new(Cell::from_mut(costs).as_slice_of_cells()) + ); + + let mut length = LitLen::L000; + let mut distance = 0_u16; + let mut sublen = ZEROED_SUBLEN; + + while let Some((mut chunk2, mut cost2)) = iter.next() { + debug_assert_eq!(chunk2.block_size().get() + 1, cost2.len()); + + // Hash the remainder. + self.update_hash(chunk2); + + let pos = chunk2.pos(); + if + // We have more than ZOPFLI_MAX_MATCH entries behind us, and twice + // twice as many ahead of us. + pos > instart + ZOPFLI_MAX_MATCH + 1 && + chunk2.block_size().get() > ZOPFLI_MAX_MATCH * 2 + 1 && + // The current and max-match-ago positions have long repetitions. + self.same[pos & ZOPFLI_WINDOW_MASK] > ZOPFLI_MAX_MATCH as u16 * 2 && + self.same[(pos - ZOPFLI_MAX_MATCH) & ZOPFLI_WINDOW_MASK] > ZOPFLI_MAX_MATCH as u16 + { + // Fast forward! + let before = pos; + for (chunk3, cost3) in iter.by_ref().take(ZOPFLI_MAX_MATCH) { + // Safety: arr2.len() has at least ZOPFLI_MAX_MATCH*2+1 + // remaining entries; cost2.len() will be at least one + // more than that. + if cost2.len() <= ZOPFLI_MAX_MATCH { crate::unreachable(); } + cost2[ZOPFLI_MAX_MATCH].set(( + (f64::from(cost2[0].get().0) + 13.0) as f32, + LitLen::MAX_MATCH, + )); + cost2 = cost3; // The costs are rotated _after_ updating… + + chunk2 = chunk3; // …but the array is rotated beforehand. + self.update_hash(chunk2); + } - // We'll need to read data from one portion of the slice and add it - // to data in another portion. Index-based access confusing the - // compiler, so to avoid a bunch of "unsafe", we'll work with a - // slice-of-cells representation instead. - let costs = Cell::from_mut(costs).as_slice_of_cells(); - - // Fast forward! - let before = pos; - let mut iter = costs.windows(ZOPFLI_MAX_MATCH + 1).skip(pos - instart).take(ZOPFLI_MAX_MATCH); - while let Some([a, _rest @ .., z]) = iter.next() { - z.set(( - (f64::from(a.get().0) + symbol_cost) as f32, - LitLen::MAX_MATCH, - )); - pos += 1; - self.update_hash(&arr[pos..], pos); + debug_assert_eq!(chunk2.pos() - before, ZOPFLI_MAX_MATCH); + debug_assert_eq!(chunk2.block_size().get() + 1, cost2.len()); } - // We should never not hit our desired take() because the lengths - // of arr and cost are fixed and intertwined, but it's a good debug - // sort of thing to check. - debug_assert_eq!(pos - before, ZOPFLI_MAX_MATCH); + // Find the longest remaining match. 
+ self.find( + chunk2, + LitLen::MAX_MATCH, + &mut Some(&mut sublen), + &mut distance, + &mut length, + lmc, + Some(instart), + )?; - true + // Safety: the MAX loop (if it ran at all) only advanced the + // slices ZOPFLI_MAX_MATCH; we have more work to do! + if cost2.len() < 2 { crate::unreachable(); } + + // Update it if lower. + let cost_j = f64::from(cost2[0].get().0); + let new_cost = if chunk2.first() <= 143 { 8.0 } else { 9.0 } + cost_j; + if new_cost < f64::from(cost2[1].get().0) { + cost2[1].set((new_cost as f32, LitLen::L001)); + } + + // If a long match was found, peek forward to recalculate those + // costs, at least the ones who could benefit from the expense of + // all that effort. + let limit = length.min_usize(cost2.len() - 1); + if limit.is_matchable() { + let min_cost_add = 8.0 + cost_j; + + // Safety: limit is capped to cost2.len() - 1. + if cost2.len() <= (limit as usize) { crate::unreachable(); } + + for ((dist, c), k) in sublen[ZOPFLI_MIN_MATCH..=limit as usize].iter() + .copied() + .zip(&cost2[ZOPFLI_MIN_MATCH..=limit as usize]) + .zip(LitLen::matchable_iter()) + { + let current_cost = f64::from(c.get().0); + if min_cost_add < current_cost { + // Update it if lower. + let new_cost = cost_j + f64::from(fixed_cost(dist, k)); + if new_cost < current_cost { c.set((new_cost as f32, k)); } + } + } + } } - else { false } + + // All costs should have been updated… + debug_assert!(costs.iter().all(|(cost, _)| (0.0..1E30).contains(cost))); + Ok(()) } #[allow(clippy::cast_possible_truncation)] @@ -567,22 +762,21 @@ impl ZopfliHash { /// squeeze-based path lengths. The store is updated with the results. fn follow_paths( &mut self, - arr: &[u8], - instart: usize, + chunk: ZopfliChunk<'_>, paths: &[LitLen], store: &mut LZ77Store, lmc: &mut MatchCache, ) -> Result<(), ZopfliError> { - // Easy abort. - if instart >= arr.len() { return Ok(()); } - // Reset the hash. - self.reset(arr, instart); + self.reset(chunk); // Hash the path symbols. - let mut i = instart; - for length in paths.iter().copied() { - self.update_hash(&arr[i..], i); + let instart = chunk.pos(); + let mut len_iter = paths.iter().copied(); + let mut arr_iter = chunk.reducing_block_iter(); + while let Some((length, chunk2)) = len_iter.next().zip(arr_iter.next()) { + // Hash it. + self.update_hash(chunk2); // Follow the matches! if length.is_matchable() { @@ -592,8 +786,7 @@ impl ZopfliHash { let mut test_length = LitLen::L000; let mut dist = 0; self.find( - arr, - i, + chunk2, length, &mut None, &mut dist, @@ -608,20 +801,17 @@ impl ZopfliHash { } // Add it to the store. - store.push(length, dist, i); + store.push(length, dist, chunk2.pos()); // Hash the rest of the match. - for _ in 1..(length as u16) { - i += 1; - self.update_hash(&arr[i..], i); + for chunk2 in arr_iter.by_ref().take(length as usize - 1) { + self.update_hash(chunk2); } } // It isn't matchable; add it directly to the store. else { - store.push(LitLen::from_u8(arr[i]), 0, i); + store.push(LitLen::from_u8(chunk2.first()), 0, chunk2.pos()); } - - i += 1; } Ok(()) @@ -632,16 +822,15 @@ impl ZopfliHash { #[allow(clippy::too_many_arguments)] /// # Find Longest Match. /// - /// This finds the longest match in `arr` (and/or the cache), setting the - /// passed `sublen`/`distance`/`length` values accordingly. + /// This finds the longest match in the chunk (and/or the cache), setting + /// the provided `sublen`/`distance`/`length` values accordingly. 
/// /// Lengths will never exceed `limit` nor `ZOPFLI_MAX_MATCH`, but they - /// might be _less_ than `ZOPFLI_MIN_MATCH`, especially near the end of a - /// slice. + /// might be _less_ than `ZOPFLI_MIN_MATCH`, especially as we near the end + /// of the block slice. fn find( &self, - arr: &[u8], - pos: usize, + chunk: ZopfliChunk<'_>, mut limit: LitLen, sublen: &mut Option<&mut [u16; SUBLEN_LEN]>, distance: &mut u16, @@ -650,6 +839,7 @@ impl ZopfliHash { cache: Option, ) -> Result<(), ZopfliError> { // Check the longest match cache first! + let pos = chunk.pos(); if let Some(blockstart) = cache { if lmc.find( pos - blockstart, @@ -658,14 +848,14 @@ impl ZopfliHash { distance, length, )? { - if pos + (*length as usize) <= arr.len() { return Ok(()); } + if (*length as usize) <= chunk.block_size().get() { return Ok(()); } return Err(zopfli_error!()); } } // We'll need at least ZOPFLI_MIN_MATCH bytes for a search; if we don't // have it, zero everything out and call it a day. - if pos + ZOPFLI_MIN_MATCH > arr.len() { + if ZOPFLI_MIN_MATCH > chunk.block_size().get() { *length = LitLen::L000; *distance = 0; return Ok(()); @@ -673,10 +863,10 @@ impl ZopfliHash { // Cap the limit to fit if needed. Note that limit will always be at // least one even if capped since pos < size. - limit = limit.min_usize(arr.len() - pos); + limit = limit.min_usize(chunk.block_size().get()); // Calculate the best distance and length. - let (bestdist, bestlength) = self.find_loop(arr, pos, limit, sublen)?; + let (bestdist, bestlength) = self.find_loop(chunk, limit, sublen); // Cache the results for next time, maybe. if limit.is_max() { @@ -690,7 +880,7 @@ impl ZopfliHash { // Update the values. *distance = bestdist; *length = bestlength; - if pos + (*length as usize) <= arr.len() { Ok(()) } + if (*length as usize) <= chunk.block_size().get() { Ok(()) } else { Err(zopfli_error!()) } } @@ -703,26 +893,26 @@ impl ZopfliHash { )] /// # Find Longest Match Loop. /// - /// This method is the (nasty-looking) workhorse of the above + /// This method is a (nasty-looking) workhorse for the above /// `ZopfliHash::find` method. It finds and returns the matching distance /// and length, or `(0, 1)` if none. fn find_loop( &self, - arr: &[u8], - pos: usize, + chunk: ZopfliChunk<'_>, limit: LitLen, sublen: &mut Option<&mut [u16; SUBLEN_LEN]>, - ) -> Result<(u16, LitLen), ZopfliError> { + ) -> (u16, LitLen) { /// # Distance Given Positions. const fn ppp_distance(p: usize, pp: usize) -> usize { if p < pp { pp - p } else { ZOPFLI_WINDOW_SIZE + pp - p } } - // This is asserted by find() too, but it's a good reminder. - if arr.len() <= pos { return Err(zopfli_error!()); } - let right = &arr[pos..]; + // Prepopulate some slices to work with directly later on. + let arr = chunk.arr(); + let right = chunk.block(); + let pos = chunk.pos(); let hpos = pos & ZOPFLI_WINDOW_MASK; // The default distance and length. We'll be wanting 16-bit values for @@ -771,9 +961,7 @@ impl ZopfliHash { // verified it was non-empty, but the compiler will have // forgotten that by now. let left = unsafe { arr.get_unchecked(pos - dist..pos - dist + right.len()) }; - if right.is_empty() || left.len() != right.len() { - unsafe { core::hint::unreachable_unchecked(); } - } + if right.is_empty() || left.len() != right.len() { crate::unreachable(); } // Check to see if we can do better than we've already done. 
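
// Aside: a standalone sketch (not part of the patch) of the ring-buffer
// arithmetic in `ppp_distance` above. Window positions are indices into a
// 32,768-slot ring (`ZOPFLI_WINDOW_SIZE`), so the gap between two indices is
// either their plain difference or, when that difference would be negative
// because the ring has wrapped, the same difference shifted up by one full
// window.
fn main() {
    const WINDOW: usize = 32_768; // ZOPFLI_WINDOW_SIZE

    const fn ppp_distance(p: usize, pp: usize) -> usize {
        if p < pp { pp - p } else { WINDOW + pp - p }
    }

    // Plain difference.
    assert_eq!(ppp_distance(10, 15), 5);

    // Wrapped: 15 -> 10 would be "negative" going this direction, so the
    // distance lands a full window higher.
    assert_eq!(ppp_distance(15, 10), WINDOW - 5);
}
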
if (bestlength as usize) >= right.len() || right[bestlength as usize] == left[bestlength as usize] { @@ -842,8 +1030,8 @@ impl ZopfliHash { } // Thus concludes the long-ass loop! // Return the distance and length values. - if (bestlength as u16) <= (limit as u16) { Ok((bestdist as u16, bestlength)) } - else { Ok((0, LitLen::L001)) } + if (bestlength as u16) <= (limit as u16) { (bestdist as u16, bestlength) } + else { (0, LitLen::L001) } } } @@ -856,11 +1044,13 @@ impl ZopfliHash { /// positions. /// /// Written values are all in the range of `0..=i16::MAX`, matching the array -/// sizes, elminating bounds checking on the upper end. +/// sizes, elminating bounds checking on the upper end. (They're effectively +/// `u15`.) /// /// The remaining "sign" bit is logically repurposed to serve as a sort of -/// `None`, allowing us to cheaply identify unwritten values. (Testing for that -/// takes care of bounds checking on the lower end.) +/// `None` flag, allowing us to cheaply identify uninitialized values. +/// (And by testing for that, we eliminate bounds checks on the lower end of +/// the range.) struct ZopfliHashChain { /// Hash value to (most recent) index. /// @@ -925,98 +1115,6 @@ impl ZopfliHashChain { -/// # Minimum Cost Model. -/// -/// This returns the minimum _statistical_ cost, which is the sum of the -/// minimum length cost and minimum distance cost. -fn get_minimum_cost(stats: &SymbolStats) -> f64 { - // Find the minimum length cost. - let mut length_cost = f64::INFINITY; - for (lsym, lbits) in LENGTH_SYMBOLS.into_iter().zip(LENGTH_SYMBOL_BITS.into_iter()).skip(3) { - let cost = f64::from(lbits) + stats.ll_symbols[lsym as usize]; - if cost < length_cost { length_cost = cost; } - } - - // Now find the minimum distance cost. - let mut dist_cost = f64::INFINITY; - for (bits, v) in MIN_COST_DISTANCES.iter().copied().zip(stats.d_symbols) { - let cost = f64::from(bits) + v; - if cost < dist_cost { dist_cost = cost; } - } - - // Add them together and we have our minimum. - length_cost + dist_cost -} - -#[allow(clippy::cast_possible_truncation)] -/// # Get Best Lengths Peek Ahead (Fixed). -fn peek_ahead_fixed( - cost_j: f64, - min_cost: f64, - sublen: &[u16], - costs: &mut [(f32, LitLen)], -) { - let min_cost_add = min_cost + cost_j; - for ((dist, c), k) in sublen.iter().copied().zip(costs).zip(LitLen::matchable_iter()) { - if min_cost_add < f64::from(c.0) { - let mut new_cost = cost_j; - if dist == 0 { - if (k as u16) <= 143 { new_cost += 8.0; } - else { new_cost += 9.0; } - } - else { - if 114 < (k as u16) { new_cost += 13.0; } - else { new_cost += 12.0; } - - let dsym = DISTANCE_SYMBOLS[(dist & 32_767) as usize]; - new_cost += f64::from(DISTANCE_BITS[dsym as usize]); - new_cost += f64::from(LENGTH_SYMBOL_BITS[k as usize]); - } - - // Update it if lower. - if (0.0..f64::from(c.0)).contains(&new_cost) { - c.0 = new_cost as f32; - c.1 = k; - } - } - } -} - -#[allow(clippy::cast_possible_truncation)] -/// # Get Best Lengths Peek Ahead (Dynamic). 
-fn peek_ahead_stats( - cost_j: f64, - min_cost: f64, - sublen: &[u16], - costs: &mut [(f32, LitLen)], - stats: &SymbolStats, -) { - let min_cost_add = min_cost + cost_j; - for ((dist, c), k) in sublen.iter().copied().zip(costs).zip(LitLen::matchable_iter()) { - if min_cost_add < f64::from(c.0) { - let mut new_cost = cost_j; - if dist == 0 { - new_cost += stats.ll_symbols[k as usize]; - } - else { - let dsym = DISTANCE_SYMBOLS[(dist & 32_767) as usize]; - new_cost += f64::from(DISTANCE_BITS[dsym as usize]); - new_cost += stats.d_symbols[dsym as usize]; - new_cost += stats.ll_symbols[LENGTH_SYMBOLS[k as usize] as usize]; - new_cost += f64::from(LENGTH_SYMBOL_BITS[k as usize]); - } - - // Update it if lower. - if (0.0..f64::from(c.0)).contains(&new_cost) { - c.0 = new_cost as f32; - c.1 = k; - } - } - } -} - - - #[cfg(test)] mod tests { use super::*; @@ -1024,8 +1122,8 @@ mod tests { #[test] fn t_fixed_cost() { // Get the largest dbit and lbit values. - let d_max: u8 = DISTANCE_BITS.into_iter().max().unwrap(); - let l_max: u8 = LENGTH_SYMBOL_BITS.into_iter().max().unwrap(); + let d_max: u8 = super::super::DISTANCE_BITS.into_iter().max().unwrap(); + let l_max: u8 = super::super::LENGTH_SYMBOL_BITS.into_iter().max().unwrap(); // Make sure their sum (along with the largest base) fits within // the u8 space, since that's what we're using at runtime. diff --git a/flapfli/src/zopflipng/iter.rs b/flapfli/src/zopflipng/iter.rs new file mode 100644 index 0000000..1dc5f2b --- /dev/null +++ b/flapfli/src/zopflipng/iter.rs @@ -0,0 +1,74 @@ +/*! +# Flapfli: Miscellaneous Iterators. +*/ + + + +/// # Reducing Slice Iterator. +/// +/// This iterator yields all non-empty slices spanning `n..`, incrementing `n` +/// by one after each cycle. +/// +/// In other words, this will start by returning the original slice, then `1..`, +/// `2..`, `3..`, etc., stopping when empty. +pub(super) struct ReducingSlices<'a, T>(&'a [T]); + +impl<'a, T> ReducingSlices<'a, T> { + /// # New. + pub(super) const fn new(arr: &'a [T]) -> Self { Self(arr) } +} + +impl<'a, T> Iterator for ReducingSlices<'a, T> { + type Item = &'a [T]; + + fn next(&mut self) -> Option { + if let [_, rest @ ..] 
= &self.0 { + Some(std::mem::replace(&mut self.0, rest)) + } + else { None } + } + + fn size_hint(&self) -> (usize, Option) { + let len = self.0.len(); + (len, Some(len)) + } +} + +impl<'a, T> ExactSizeIterator for ReducingSlices<'a, T> { + #[inline] + fn len(&self) -> usize { self.0.len() } +} + + + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn t_reducing_slices() { + let slice: &[u8] = &[0, 1, 2, 3, 4, 5]; + let mut reducing = ReducingSlices::new(slice); + + assert_eq!(reducing.len(), slice.len()); + assert_eq!(reducing.next(), Some(slice)); + + assert_eq!(reducing.len(), 5); + assert_eq!(reducing.next(), Some(&slice[1..])); + + assert_eq!(reducing.len(), 4); + assert_eq!(reducing.next(), Some(&slice[2..])); + + assert_eq!(reducing.len(), 3); + assert_eq!(reducing.next(), Some(&slice[3..])); + + assert_eq!(reducing.len(), 2); + assert_eq!(reducing.next(), Some(&slice[4..])); + + assert_eq!(reducing.len(), 1); + assert_eq!(reducing.next(), Some(&slice[5..])); + + assert_eq!(reducing.len(), 0); + assert_eq!(reducing.next(), None); + } +} diff --git a/flapfli/src/zopflipng/kat.rs b/flapfli/src/zopflipng/kat.rs index 04fa013..4514419 100644 --- a/flapfli/src/zopflipng/kat.rs +++ b/flapfli/src/zopflipng/kat.rs @@ -12,10 +12,7 @@ use std::{ handle_alloc_error, Layout, }, - cell::{ - Cell, - RefCell, - }, + cell::Cell, cmp::Ordering, num::{ NonZeroU32, @@ -27,6 +24,7 @@ use super::{ ArrayD, ArrayLL, DeflateSym, + DeflateSymBasic, zopfli_error, ZOPFLI_NUM_D, ZOPFLI_NUM_LL, @@ -37,12 +35,15 @@ use super::{ #[allow(unsafe_code)] -const NZ1: NonZeroU32 = unsafe { NonZeroU32::new_unchecked(1) }; +/// # One is Non-Zero. +const NZ01: NonZeroU32 = unsafe { NonZeroU32::new_unchecked(1) }; #[allow(unsafe_code)] -const NZ2: NonZeroU32 = unsafe { NonZeroU32::new_unchecked(2) }; +/// # Two is Non-Zero. +const NZ02: NonZeroU32 = unsafe { NonZeroU32::new_unchecked(2) }; #[allow(unsafe_code)] +/// # Fourteen is Non-Zero. const NZ14: NonZeroU32 = unsafe { NonZeroU32::new_unchecked(14) }; /// # Zero-Filled Tree Counts. @@ -56,12 +57,7 @@ thread_local!( /// The length-limited-code-length methods need to temporarily store /// thousands of `Node` objects. Using a thread-local share for that cuts /// way down on the number of allocations we'd otherwise have to make! - static KATSCRATCH: KatScratch = KatScratch::new(); - - /// # Shared Tree Scratch. - /// - /// Similar idea as above, but for tree sizing/writing. - static TREESCRATCH: RefCell = const { RefCell::new(TreeScratch::new()) }; + static KATSCRATCH: KatScratch = KatScratch::new() ); @@ -83,7 +79,8 @@ mod sealed { /// # Crunch the Code Lengths. /// /// This method serves as the closure for the caller's call to - /// `KATSCRATCH.with_borrow_mut()`. + /// `KATSCRATCH.with_borrow_mut()`. It does all that needs doing to get + /// the desired length-limited data into the provided `bitlengths`. fn _llcl<'a>( frequencies: &'a [u32; N], bitlengths: &'a [Cell; N], @@ -101,14 +98,11 @@ mod sealed { leaves[0].frequency, leaves[1].frequency, ); - #[allow(unsafe_code)] - if lists.len() < 2 { - // Safety: `usize::min(MAXBITS, leaves.len() - 1)` (above) is - // how many lists we'll have, and since MAXBITS is at least - // seven and leaves.len() at least three, we'll always have at - // least two lists to work with. 
- unsafe { core::hint::unreachable_unchecked(); } - } + // Safety: `usize::min(MAXBITS, leaves.len() - 1)` (above) is + // how many lists we'll have, and since MAXBITS is at least + // seven and leaves.len() at least three, we'll always have at + // least two lists to work with. + if lists.len() < 2 { crate::unreachable(); } // In the last list, (2 * len_leaves - 2) active chains need to be // created. We have two already from initialization; each boundary_pm run @@ -124,6 +118,9 @@ mod sealed { #[inline] /// # Write Code Lengths! + /// + /// This is the final stage of the LLCL chain, where the results are + /// finally recorded! fn llcl_write(mut node: Node, leaves: &[Leaf<'_>]) -> Result<(), ZopfliError> { // Make sure we counted correctly before doing anything else. let mut last_count = node.count; @@ -251,24 +248,117 @@ pub(crate) fn best_tree_size( ll_lengths: &ArrayLL, d_lengths: &ArrayD, ) -> Result<(u8, NonZeroU32), ZopfliError> { - TREESCRATCH.with_borrow_mut(|t| - t.with_symbols(ll_lengths, d_lengths).best_tree() - ) + // Drop the last two zeroes plus any trailing zeroes, then merge them + // together into a single collection. + let all: Vec = { + let mut ll_lengths = &ll_lengths[..286]; + while let [rest @ .., DeflateSym::D00] = ll_lengths { + ll_lengths = rest; + if ll_lengths.len() == 257 { break; } // Keep all literals. + } + + let mut d_lengths = &d_lengths[..30]; + while let [rest @ .., DeflateSym::D00] = d_lengths { d_lengths = rest; } + + [ll_lengths, d_lengths].concat() + }; + + // Our targets! + let mut best_extra = 0; + let mut best_size = NonZeroU32::MAX; + + for extra in 0..8 { + let cl_counts = best_tree_size_counts(&all, extra); + let cl_lengths = cl_counts.llcl()?; + let hclen = tree_hclen(&cl_counts); + + // We can finally calculate the size! + let mut size = (hclen as u32 + 4) * 3; + size += cl_lengths.iter() + .copied() + .zip(cl_counts.iter().copied()) + .map(|(a, b)| (a as u32) * b) + .sum::(); + size += cl_counts[16] * 2; // Extra bits. + size += cl_counts[17] * 3; + size += cl_counts[18] * 7; + let size = NZ14.saturating_add(size); + + // If better, keep it! + if size < best_size { + best_extra = extra; + best_size = size; + } + } + + // Done! + Ok((best_extra, best_size)) } /// # Encode Tree. /// -/// This finds the index that produces the smallest tree size, then writes -/// that table's bits to the output. +/// This writes the best-found tree data to `out`. pub(crate) fn encode_tree( ll_lengths: &ArrayLL, d_lengths: &ArrayD, extra: u8, out: &mut ZopfliOut, ) -> Result<(), ZopfliError> { - TREESCRATCH.with_borrow_mut(|t| - t.with_symbols(ll_lengths, d_lengths).write_tree(extra, out) - ) + // Drop the last two zeroes plus any trailing zeroes, then merge them + // together into a single collection. + let mut hlit: u32 = 29; + let mut hdist: u32 = 29; + let all: Vec = { + let mut ll_lengths = &ll_lengths[..286]; + while let [rest @ .., DeflateSym::D00] = ll_lengths { + ll_lengths = rest; + hlit -= 1; + if ll_lengths.len() == 257 { break; } // Keep all literals. + } + + let mut d_lengths = &d_lengths[..30]; + while let [rest @ .., DeflateSym::D00] = d_lengths { + d_lengths = rest; + hdist -= 1; + } + + [ll_lengths, d_lengths].concat() + }; + + // We'll need to store some RLE symbols and positions too. 
+ let mut rle: Vec<(DeflateSym, u16)> = Vec::new(); + + let cl_counts = encode_tree_counts(&all, &mut rle, extra); + let cl_lengths = cl_counts.llcl()?; + let hclen = tree_hclen(&cl_counts); + let cl_symbols = <[u32; 19]>::llcl_symbols(&cl_lengths); + + // Write the main lengths. + out.add_fixed_bits::<5>(hlit); + out.add_fixed_bits::<5>(hdist); + out.add_fixed_bits::<4>(hclen as u32); + + // Write each cl_length in the jumbled DEFLATE order. + for &o in &DeflateSym::TREE[..hclen as usize + 4] { + out.add_fixed_bits::<3>(cl_lengths[o as usize] as u32); + } + + // Write each symbol in order of appearance along with its extra bits, + // if any. + for (a, b) in rle { + let symbol = cl_symbols[a as usize]; + out.add_huffman_bits(symbol, cl_lengths[a as usize] as u32); + + // Extra bits. + match a { + DeflateSym::D16 => { out.add_fixed_bits::<2>(u32::from(b)); }, + DeflateSym::D17 => { out.add_fixed_bits::<3>(u32::from(b)); }, + DeflateSym::D18 => { out.add_fixed_bits::<7>(u32::from(b)); }, + _ => {}, + } + } + + Ok(()) } @@ -277,6 +367,10 @@ pub(crate) fn encode_tree( /// /// This is a super-cheap arena-like structure for holding all the temporary /// data required for length-limited-code-length calculations. +/// +/// This requires doing some fairly un-Rust-like things, but that would be +/// equally true of any third-party structure as well, and since we know the +/// particulars in advance, we can do it leaner and meaner ourselves. struct KatScratch { leaves: NonNull, lists: NonNull, @@ -303,7 +397,8 @@ impl KatScratch { /// # Max Nodes. /// /// This represents the theoretical maximum number of nodes a length- - /// limiting pass might generate. + /// limiting pass might generate, though it is unlikely to ever be reached + /// in practice. (Better safe than sorry!) const MAX: usize = (2 * ZOPFLI_NUM_LL - 2) * 15; /// # Leaves Array Layout. @@ -318,12 +413,19 @@ impl KatScratch { #[allow(unsafe_code)] /// # New! /// - /// Return a new instance of self, allocated but uninitialized. + /// Return a new instance of self, allocated but **uninitialized**. /// /// Similar to other mega-array structures like `ZopfliHash`, its members /// are manually allocated from pointers to keep them off the stack. Unlike /// the others, though, the `KatScratch` members remain in pointer form to - /// prevent lifetime/borrow-checker confusion. + /// prevent subsequent lifetime/borrow-checker confusion. + /// + /// ## Safety + /// + /// New values are written from pointers without first reading or dropping + /// the previous values at that position, and references to the new values + /// are only made available after said write, eliminating any UB weirdness + /// from possibly-uninitialized data. fn new() -> Self { let leaves: NonNull = NonNull::new(unsafe { alloc(Self::LEAVES_LAYOUT) }) .unwrap_or_else(|| handle_alloc_error(Self::LEAVES_LAYOUT)); @@ -347,7 +449,7 @@ impl KatScratch { /// # Make Leaves. /// /// Join the non-zero frequencies with their corresponding bitlengths into - /// a collection of leaves, then return it sorted. + /// a collection of leaves. That collection is then sorted and returned. 
/// /// ## Safety /// @@ -408,7 +510,7 @@ impl KatScratch { let ptr = self.nodes.cast::().as_ptr(); ptr.write(Node { weight: weight1, - count: NZ1, + count: NZ01, tail: None, }); let lookahead0 = &*ptr; @@ -417,7 +519,7 @@ impl KatScratch { let ptr = ptr.add(1); ptr.write(Node { weight: weight2, - count: NZ2, + count: NZ02, tail: None, }); let lookahead1 = &*ptr; @@ -547,7 +649,10 @@ impl<'a> PartialOrd for Leaf<'a> { #[derive(Clone, Copy)] /// # List. /// -/// This struct holds a pair of recursive node chains. +/// This struct holds a pair of recursive node chains. The lifetimes are +/// technically static, but in practice are always scoped to the more limited +/// lifetime of the borrow. (`List`s are never accessible once the session that +/// birthed them has closed.) struct List { lookahead0: &'static Node, lookahead1: &'static Node, @@ -556,10 +661,14 @@ struct List { impl List { #[inline] /// # Rotate. + /// + /// Replace the first chain with a copy of the second. fn rotate(&mut self) { self.lookahead0 = self.lookahead1; } #[inline] /// # Weight Sum. + /// + /// Add and return the sum of the weights of the two chains. const fn weight_sum(&self) -> NonZeroU32 { self.lookahead0.weight.saturating_add(self.lookahead1.weight.get()) } @@ -569,6 +678,12 @@ impl List { #[derive(Clone, Copy)] /// # Node. +/// +/// This holds a weight and frequency pair, and possibly a reference to the +/// previous `Node` this one replaced. +/// +/// As with `List`, the static lifetime is technically true, but in practice +/// references will never extend beyond the current borrow. struct Node { weight: NonZeroU32, count: NonZeroU32, @@ -579,22 +694,22 @@ impl Node { #[inline] /// # Finish Last Node! /// - /// This method establishes the final tail that the subsequent writing - /// will start with. + /// This method creates and returns the final tail to be used as the + /// starting point for the subsequent `llcl_write` call. fn last(list_y: &List, list_z: &List, leaves: &[Leaf<'_>]) -> Self { // Figure out the final node! let last_count = list_z.lookahead1.count; let weight_sum = list_y.weight_sum(); if (last_count.get() as usize) < leaves.len() && leaves[last_count.get() as usize].frequency < weight_sum { Self { - weight: NZ1, // We'll never look at this value. + weight: NZ01, // We'll never look at this value. count: last_count.saturating_add(1), tail: list_z.lookahead1.tail, } } else { Self { - weight: NZ1, // We'll never look at this value. + weight: NZ01, // We'll never look at this value. count: last_count, tail: Some(list_y.lookahead1), } @@ -604,291 +719,150 @@ impl Node { -/// # Tree Scratch. +#[allow(unsafe_code)] +/// Array of Cells. /// -/// This holds a merged length-and-distance symbol set for tree sizing and -/// writing purposes. +/// Revisualize a mutable array as an array of cells. /// -/// This isn't nearly as large as most of the other zopfli structures, but -/// referenced frequently enough to justify storing it as a thread-local static -/// that can be reused willynilly. -struct TreeScratch { - symbols: [DeflateSym; Self::MAX], - hlit: usize, - hdist: usize, - - // Note: this should really be an array with the same count as `symbols`, - // but the compiler doesn't seem to like that, so whatever. - rle: Vec<(DeflateSym, u16)>, +/// TODO: use `Cell::as_array_of_cells` once that method is stabilized. +fn array_of_cells(arr: &mut [T; N]) -> &[Cell; N] { + let cells = Cell::from_mut(arr); + // Safety: `Cell` has the same memory layout as `T`. 
+ unsafe { &*(std::ptr::from_ref(cells).cast::<[Cell; N]>()) } } -impl TreeScratch { - /// The maximum number of symbols. - const MAX: usize = 29 + 257 + 29 + 1; - - /// # New. - const fn new() -> Self { - Self { - symbols: [DeflateSym::D00; Self::MAX], - hlit: 0, - hdist: 0, - rle: Vec::new(), +/// # Tree Counts. +/// +/// Populate and return the tree counts for `best_tree_size`. +fn best_tree_size_counts(all: &[DeflateSym], extra: u8) -> [u32; 19] { + let mut cl_counts = ZEROED_COUNTS_TREE; + let (use_16, use_17, use_18) = extra_bools(extra); + + let mut i = 0; + while i < all.len() { + let mut count = 1_u32; + let symbol = all[i]; + + macro_rules! special { + ($step:literal, $max:literal, $symbol:ident) => ( + while count >= $step { + let count2 = if count < $max { count } else { $max }; + cl_counts[DeflateSym::$symbol as usize] += 1; + count -= count2; + } + ); } - } - /// # Total Length. - /// - /// Returning a slice would be more useful, but Rust's borrow checker - /// gets confused because we'll still need to write to RLE. - const fn len(&self) -> usize { self.hlit + 257 + self.hdist + 1 } - - /// # Load Symbols (and Reset). - fn with_symbols( - &mut self, - ll_lengths: &ArrayLL, - d_lengths: &ArrayD - ) -> &mut Self { - // Find the last non-zero length symbol, starting from 285. (The offset - // (256) marks the boundary between literals and symbols; we'll use - // both literals and symbols in some places, but only the latter in - // others.) - self.hlit = 29; - while self.hlit > 0 && ll_lengths[256 + self.hlit].is_zero() { self.hlit -= 1; } - - // Now the same for distance, starting at 29 proper. - self.hdist = 29; - while self.hdist > 0 && d_lengths[self.hdist].is_zero() { self.hdist -= 1; } - - // Copy both into place. Note that both hlit and hdist are inclusive, - // so we need to +1 both for exclusivity. - let ll_end = self.hlit + 257; - self.symbols[..ll_end].copy_from_slice(&ll_lengths[..ll_end]); - self.symbols[ll_end..=ll_end + self.hdist].copy_from_slice(&d_lengths[..=self.hdist]); - - self - } -} - -impl TreeScratch { - /// # Best Tree. - /// - /// Crunch all special symbol combinations and return the "extra" key - /// (0..8) that achieved the smallest output, along with its size. - fn best_tree(&self) -> Result<(u8, NonZeroU32), ZopfliError> { - let mut best_extra = 0; - let mut best_size = NonZeroU32::MAX; - - for extra in 0..8 { - let size = self.crunch_size(extra)?; - if size < best_size { - best_extra = extra; - best_size = size; + // Peek ahead to maybe save some iteration! + if use_16 || ((use_17 || use_18) && symbol.is_zero()) { + let mut j = i + 1; + while j < all.len() && symbol == all[j] { + count += 1; + j += 1; + i += 1; } } - Ok((best_extra, best_size)) - } - - #[allow(clippy::cast_possible_truncation)] - /// # Calculate Tree Size. - fn crunch_size(&self, extra: u8) -> Result { - let (use_16, use_17, use_18) = extra_bools(extra); - - // Hold the counts. - let mut cl_counts = ZEROED_COUNTS_TREE; - - let mut i = 0; - let all = &self.symbols[..usize::min(self.len(), Self::MAX)]; - while i < all.len() { - let mut count = 1_u32; - let symbol = all[i]; - - macro_rules! special { - ($step:literal, $max:literal, $symbol:ident) => ( - while count >= $step { - let count2 = if count < $max { count } else { $max }; - cl_counts[DeflateSym::$symbol as usize] += 1; - count -= count2; - } - ); + // Repetitions of zeroes. + if symbol.is_zero() && count >= 3 { + if use_18 { + special!(11, 138, D18); } - - // Peek ahead to maybe save some iteration! 
- if use_16 || ((use_17 || use_18) && symbol.is_zero()) { - let mut j = i + 1; - while j < all.len() && symbol == all[j] { - count += 1; - j += 1; - i += 1; - } - } - - // Repetitions of zeroes. - if symbol.is_zero() && count >= 3 { - if use_18 { - special!(11, 138, D18); - } - if use_17 { - special!(3, 10, D17); - } - } - - // Other symbol repetitions. - if use_16 && count >= 4 { - // Always count the first one as itself. - count -= 1; - cl_counts[symbol as usize] += 1; - - special!(3, 6, D16); + if use_17 { + special!(3, 10, D17); } - - // Count the current symbol and move on. - cl_counts[symbol as usize] += count; - i += 1; } - // Update the lengths and symbols given the counts. - let cl_lengths = cl_counts.llcl()?; + // Other symbol repetitions. + if use_16 && count >= 4 { + // Always count the first one as itself. + count -= 1; + cl_counts[symbol as usize] += 1; - // Find the last non-zero count. - let mut hclen = 15; - while hclen > 0 && cl_counts[DeflateSym::TREE[hclen + 3] as usize] == 0 { - hclen -= 1; + special!(3, 6, D16); } - // We can finally calculate the size! - let mut size = (hclen as u32 + 4) * 3; - for (a, b) in cl_lengths.into_iter().zip(cl_counts.iter().copied()) { - size += (a as u32) * b; - } - size += cl_counts[16] * 2; // Extra bits. - size += cl_counts[17] * 3; - size += cl_counts[18] * 7; - Ok(NZ14.saturating_add(size)) + // Count the current symbol and move on. + cl_counts[symbol as usize] += count; + i += 1; } -} - -impl TreeScratch { - #[allow(clippy::cast_possible_truncation)] - /// # Write Tree. - fn write_tree(&mut self, extra: u8, out: &mut ZopfliOut) -> Result<(), ZopfliError> { - let (use_16, use_17, use_18) = extra_bools(extra); - - // Hold the counts. - let mut cl_counts = ZEROED_COUNTS_TREE; - self.rle.truncate(0); - - let mut i = 0; - let all = &self.symbols[..usize::min(self.len(), Self::MAX)]; - while i < all.len() { - let mut count = 1_u16; - let symbol = all[i]; - - macro_rules! special { - ($step:literal, $max:literal, $symbol:ident) => ( - while count >= $step { - let count2 = if count < $max { count } else { $max }; - self.rle.push((DeflateSym::$symbol, count2 - $step)); - cl_counts[DeflateSym::$symbol as usize] += 1; - count -= count2; - } - ); - } - // Peek ahead to maybe save some iteration! - if use_16 || ((use_17 || use_18) && symbol.is_zero()) { - let mut j = i + 1; - while j < all.len() && symbol == all[j] { - count += 1; - j += 1; - i += 1; - } - } + cl_counts +} - // Repetitions of zeroes. - if count >= 3 && symbol.is_zero() { - if use_18 { - special!(11, 138, D18); - } - if use_17 { - special!(3, 10, D17); +/// # Tree Counts (Writing). +/// +/// Populate and return the tree counts for `encode_tree`, as well as the RLE +/// symbol and position details. +fn encode_tree_counts( + all: &[DeflateSym], + rle: &mut Vec<(DeflateSym, u16)>, + extra: u8, +) -> [u32; 19] { + let mut cl_counts = ZEROED_COUNTS_TREE; + let (use_16, use_17, use_18) = extra_bools(extra); + + let mut i = 0; + while i < all.len() { + let mut count = 1_u16; + let symbol = all[i]; + + macro_rules! special { + ($step:literal, $max:literal, $symbol:ident) => ( + while count >= $step { + let count2 = if count < $max { count } else { $max }; + rle.push((DeflateSym::$symbol, count2 - $step)); + cl_counts[DeflateSym::$symbol as usize] += 1; + count -= count2; } - } - - // Other symbol repetitions. - if use_16 && count >= 4 { - // Always count the first one as itself. 
- count -= 1; - self.rle.push((symbol, 0)); - cl_counts[symbol as usize] += 1; + ); + } - special!(3, 6, D16); + // Peek ahead to maybe save some iteration! + if use_16 || ((use_17 || use_18) && symbol.is_zero()) { + let mut j = i + 1; + while j < all.len() && symbol == all[j] { + count += 1; + j += 1; + i += 1; } - - // Count the current symbol and move on. - for _ in 0..count { self.rle.push((symbol, 0)); } - cl_counts[symbol as usize] += u32::from(count); - i += 1; } - // Update the lengths and symbols given the counts. - let cl_lengths = cl_counts.llcl()?; - - // Find the last non-zero count. - let mut hclen = 15; - while hclen > 0 && cl_counts[DeflateSym::TREE[hclen + 3] as usize] == 0 { - hclen -= 1; + // Repetitions of zeroes. + if count >= 3 && symbol.is_zero() { + if use_18 { + special!(11, 138, D18); + } + if use_17 { + special!(3, 10, D17); + } } - // Convert the lengths to (different) symbols. - let cl_symbols = <[u32; 19]>::llcl_symbols(&cl_lengths); - - // Write the main lengths. - out.add_bits(self.hlit as u32, 5); - out.add_bits(self.hdist as u32, 5); - out.add_bits(hclen as u32, 4); - - // Write each cl_length in the jumbled DEFLATE order. - for &o in &DeflateSym::TREE[..hclen + 4] { - out.add_bits(cl_lengths[o as usize] as u32, 3); - } + // Other symbol repetitions. + if use_16 && count >= 4 { + // Always count the first one as itself. + count -= 1; + rle.push((symbol, 0)); + cl_counts[symbol as usize] += 1; - // Write each symbol in order of appearance along with its extra bits, - // if any. - for (a, b) in self.rle.drain(..) { - let symbol = cl_symbols[a as usize]; - out.add_huffman_bits(symbol, cl_lengths[a as usize] as u32); - - // Extra bits. - match a { - DeflateSym::D16 => { out.add_bits(u32::from(b), 2); }, - DeflateSym::D17 => { out.add_bits(u32::from(b), 3); }, - DeflateSym::D18 => { out.add_bits(u32::from(b), 7); }, - _ => {}, - } + special!(3, 6, D16); } - Ok(()) + // Count the current symbol and move on. + for _ in 0..count { rle.push((symbol, 0)); } + cl_counts[symbol as usize] += u32::from(count); + i += 1; } -} - - -#[allow(unsafe_code)] -/// Array of Cells. -/// -/// Revisualize a mutable array as an array of cells. -/// -/// TODO: use `Cell::as_array_of_cells` once stabilized. -fn array_of_cells(arr: &mut [T; N]) -> &[Cell; N] { - let cells = Cell::from_mut(arr); - // Safety: `Cell` has the same memory layout as `T`. - unsafe { &*(std::ptr::from_ref(cells).cast::<[Cell; N]>()) } + // Done! + cl_counts } /// # Extra Boolification. /// -/// Extra the use-16/17/18 bools (for tree business) from a given byte. +/// Extract the use-16/17/18 bools (for tree business) from a given byte. This +/// is easy enough, but easy enough to screw up, so handy to keep in just one +/// place. ;) const fn extra_bools(extra: u8) -> (bool, bool, bool) { (0 != extra & 1, 0 != extra & 2, 0 != extra & 4) } @@ -897,6 +871,11 @@ const fn extra_bools(extra: u8) -> (bool, bool, bool) { /// /// Add a new chain to the list, using either a leaf or combination of /// two chains from the previous list. +/// +/// Note: it would probably be more appropriate to make this a trait member or +/// at least scope it to the sealed trait's module, but doing either leads the +/// compiler to change its inlining decisions for the worse, so best to leave +/// it where it is! fn llcl_boundary_pm(leaves: &[Leaf<'_>], lists: &mut [List], nodes: &KatScratch) -> Result<(), ZopfliError> { // This method should never be called with an empty list. 
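
// Aside: a standalone sketch (not part of the patch) of the slice-of-cells
// pattern behind the `array_of_cells` helper above and the cost loops
// earlier in this patch. `Cell::from_mut` plus `as_slice_of_cells` lets
// overlapping reads and writes share one buffer without `unsafe` and without
// upsetting the borrow checker; the toy running sum below is illustrative
// only.
use std::cell::Cell;

fn main() {
    let mut costs = [1.0_f32, 2.0, 3.0, 4.0];
    let cells = Cell::from_mut(&mut costs[..]).as_slice_of_cells();

    // Each window reads one element and rewrites its neighbour, even though
    // both views alias the same array.
    for pair in cells.windows(2) {
        pair[1].set(pair[0].get() + pair[1].get());
    }

    assert_eq!(costs, [1.0, 3.0, 6.0, 10.0]);
}
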
@@ -948,6 +927,22 @@ fn llcl_boundary_pm(leaves: &[Leaf<'_>], lists: &mut [List], nodes: &KatScratch) llcl_boundary_pm(leaves, rest, nodes) } +/// # Last Non-Zero, Non-Special Count. +/// +/// This method loops through the counts in the jumbled DEFLATE tree order, +/// returning the last index with a non-zero count. (The extended symbols are +/// ignored.) +const fn tree_hclen(cl_counts: &[u32; 19]) -> DeflateSymBasic { + let mut hclen = 15; + while cl_counts[DeflateSym::TREE[hclen + 3] as usize] == 0 { + hclen -= 1; + if hclen == 0 { break; } + } + #[allow(unsafe_code)] + // Safety: DeflateSymBasic covers all values between 0..=15. + unsafe { std::mem::transmute::(hclen as u8) } +} + #[cfg(test)] @@ -967,6 +962,14 @@ mod tests { } } + #[test] + /// # Tree Max. + /// + /// Make sure our math correctly aligns with `TreeRleIdx`. + fn t_tree_max() { + assert_eq!(TreeScratch::MAX - 1, TreeRleIdx::T315 as usize); + } + // The following tests have been adapted from the zopfli-rs crate: // diff --git a/flapfli/src/zopflipng/lz77.rs b/flapfli/src/zopflipng/lz77.rs index ce789ad..37374eb 100644 --- a/flapfli/src/zopflipng/lz77.rs +++ b/flapfli/src/zopflipng/lz77.rs @@ -4,33 +4,53 @@ This module defines the LZ77 store structures. */ -use std::ops::Range; +use std::{ + num::{ + NonZeroU32, + NonZeroUsize, + }, + ops::Range, +}; use super::{ ArrayD, ArrayLL, + DISTANCE_BITS, DISTANCE_SYMBOLS, Dsym, + DynamicLengths, + FIXED_TREE_LL, + LENGTH_SYMBOL_BITS, LENGTH_SYMBOLS, LitLen, Lsym, ZEROED_COUNTS_D, ZEROED_COUNTS_LL, zopfli_error, + ZOPFLI_MASTER_BLOCK_SIZE, ZopfliError, + ZopfliRange, }; -/// # Shared `LZ77Store` Pool. -/// -/// Each `deflate_part` run can use as many as three of these; we might as well -/// reuse the objects to cut down on the number of allocations being made. -// static POOL: Pool = Pool::new(); +#[allow(unsafe_code)] +/// # Seven is Non-Zero. +const NZ07: NonZeroU32 = unsafe { NonZeroU32::new_unchecked(7) }; + +#[allow(unsafe_code)] +/// # Eight is Non-Zero. +const NZ08: NonZeroU32 = unsafe { NonZeroU32::new_unchecked(8) }; #[derive(Clone)] /// # LZ77 Data Store. +/// +/// This struct holds litlen, dist, and symbol information for LZ77 block +/// compression. +/// +/// This can be thought of as the owned version of `LZ77StoreRange`, useful +/// while the data is still being gathered and manipulated. pub(crate) struct LZ77Store { pub(crate) entries: Vec, } @@ -41,33 +61,48 @@ impl LZ77Store { Self { entries: Vec::new() } } - /// # Symbol Span Range. + /// # Ranged. + /// + /// Return an immutable ranged view of the data, or an error if the range + /// is invalid. + pub(crate) fn ranged(&self, rng: ZopfliRange) -> Result { + let entries = self.entries.get(rng.rng()).ok_or(zopfli_error!())?; + Ok(LZ77StoreRange { entries }) + } + + /// # Ranged (Full). /// - /// Convert an LZ77 range to the start/end positions of the block. - pub(crate) fn byte_range(&self, rng: Range) -> Result, ZopfliError> { - let slice = self.entries.as_slice(); - if rng.start < rng.end && rng.end <= slice.len() { - let instart = slice[rng.start].pos; - let e = slice[rng.end - 1]; - Ok(instart..e.length() as usize + e.pos) + /// Same as `LZ77Store::range`, except the range is everything. This will + /// return an error if the store is empty or too large. 
+ pub(crate) fn ranged_full(&self) -> Result { + let entries = self.entries.as_slice(); + if entries.is_empty() || ZOPFLI_MASTER_BLOCK_SIZE < entries.len() { + Err(zopfli_error!()) } - else { Err(zopfli_error!()) } + else { Ok(LZ77StoreRange { entries }) } } /// # Clear. + /// + /// Remove all previously-collected entries, allowing the store to be + /// re-used for a new set of data. pub(crate) fn clear(&mut self) { self.entries.truncate(0); } /// # Push Values. + /// + /// Create an entry from the arguments, then insert it into the store. pub(crate) fn push(&mut self, litlen: LitLen, dist: u16, pos: usize) { self.push_entry(LZ77StoreEntry::new(litlen, dist, pos)); } /// # Push Entry. + /// + /// Push an existing entry directly to the store. fn push_entry(&mut self, entry: LZ77StoreEntry) { self.entries.push(entry); } /// # Replace Store. /// - /// Replace the current content with some other store's content. + /// Replace the current store's data with what the other guy's got. pub(crate) fn replace(&mut self, other: &Self) { self.entries.truncate(0); self.entries.extend_from_slice(&other.entries); @@ -75,7 +110,9 @@ impl LZ77Store { /// # Steal/Append Entries. /// - /// Drain the entires from other and append them to self. + /// Drain the entires from `other` and append them to `self`. (This is a + /// more efficient alternative to calling `LZ77Store::replace` and + /// `LZ77Store::clear` separately.) pub(crate) fn steal_entries(&mut self, other: &mut Self) { self.entries.append(&mut other.entries); } @@ -83,31 +120,203 @@ impl LZ77Store { impl LZ77Store { /// # Length. + /// + /// Return the number of entries in the store. Unlike `LZ77StoreRange`, + /// this can return zero. pub(crate) fn len(&self) -> usize { self.entries.len() } +} + + + +#[repr(transparent)] +#[derive(Clone, Copy)] +/// # Ranged LZ77 Data Store. +/// +/// Same as `LZ77Store`, but immutable and non-empty, offering a more +/// const-friendly and performant view into some or all of the former's +/// data. +pub(crate) struct LZ77StoreRange<'a> { + pub(crate) entries: &'a [LZ77StoreEntry], +} + +impl<'a> LZ77StoreRange<'a> { + /// # Uncompressed Range. + /// + /// Return the original uncompressed range — from e.g. a `ZopfliChunk` — + /// used to build this store. If for some reason that range cannot be + /// recreated, an error will be returned instead. + pub(crate) const fn byte_range(self) -> Result { + // Safety: ranged stores are never empty. + let len = self.entries.len(); + if 0 == len { crate::unreachable(); } + + let first = self.entries[0]; + let last = self.entries[len - 1]; + ZopfliRange::new(first.pos, last.length() as usize + last.pos) + } /// # Histogram. - pub(crate) fn histogram(&self, rng: Range) - -> (ArrayLL, ArrayD) { + /// + /// Count up and return the litlen and distance symbols included in this + /// range. + pub(crate) fn histogram(self) -> (ArrayLL, ArrayD) { let mut ll_counts = ZEROED_COUNTS_LL; let mut d_counts = ZEROED_COUNTS_D; - for e in self.entries.iter().take(rng.end).skip(rng.start) { + for e in self.entries { ll_counts[e.ll_symbol as usize] += 1; if 0 < e.dist { d_counts[e.d_symbol as usize] += 1; } } (ll_counts, d_counts) } + + /// # Length. + /// + /// Return the total number of entries included in this store. Unlike + /// `LZ77Store`, this cannot be empty, so the result will always be + /// non-zero. + pub(crate) const fn len(self) -> NonZeroUsize { + #[allow(unsafe_code)] + // Safety: we verified the store is non-empty at construction. 
+ unsafe { NonZeroUsize::new_unchecked(self.entries.len()) } + } + + #[allow(unsafe_code)] + /// # Split. + /// + /// Split the range into two at `mid`, unless that would leave either side + /// empty, in which case an error will be returned instead. + /// + /// Note: this returns two new instances; `self` is left unchanged. + pub(crate) const fn split(self, mid: usize) -> Result<(Self, Self), ZopfliError> { + if 0 == mid || self.entries.len() <= mid { Err(zopfli_error!()) } + else { + // Safety: we have checked mid is between the start and end of our + // entries. + let (a, b) = unsafe { self.entries.split_at_unchecked(mid) }; + Ok((Self { entries: a }, Self { entries: b })) + } + } + + /// # Split Iterator. + /// + /// Return an iterator that yields every possible split combination in + /// order, unless `self` has only one entry and cannot be split, in which + /// case an error is returned instead. + pub(crate) const fn splits(self) -> Result, ZopfliError> { + let len = self.entries.len(); + if 1 < len { + Ok(LZ77StoreRangeSplits { + entries: self.entries, + splits: 1..len, + }) + } + // Not big enough to split! + else { Err(zopfli_error!()) } + } +} + +impl<'a> LZ77StoreRange<'a> { + /// # Calculate Block Size (Auto). + /// + /// Return the smallest of the uncompressed, fixed, and dynamic sizes. + /// (When `try_fixed` is false, only uncompressed and dynamic sizes are + /// calculated and compared.) + pub(crate) fn block_size_auto(self, try_fixed: bool) -> Result { + // Take the smaller of the uncompressed and dynamic costs. + let cost = NonZeroU32::min( + self.block_size_uncompressed()?, + self.block_size_dynamic()?, + ); + + // Counter-intuitively, we'll usually get better block-splitting decisions + // by ignoring fixed costs entirely unless the store is really small. This + // condition is also necessary to maintain parity with the original zopfli. + if try_fixed { + let cost2 = self.block_size_fixed(); + if cost2 < cost { return Ok(cost2); } + } + + Ok(cost) + } + + /// # Calculate Block Size (Dynamic). + /// + /// This calculation is… a lot. See the `rle` module for more information. + pub(crate) fn block_size_dynamic(self) -> Result { + DynamicLengths::new(self).map(DynamicLengths::take_size) + } + + /// # Calculate Block Size (Fixed). + pub(crate) fn block_size_fixed(self) -> NonZeroU32 { + // Loop the store if we have data to loop. + let size = self.entries.iter() + .map(LZ77StoreEntry::fixed_cost) + .sum::(); + + NZ07.saturating_add(size) // FIXED_TREE_LL[256] + } + + /// # Calculate Block Size (Uncompressed). + pub(crate) fn block_size_uncompressed(self) -> Result { + let blocksize = self.byte_range()?.len32(); + + // Uncompressed blocks are split at u16::MAX. + let chunks = blocksize.get().div_ceil(u32::from(u16::MAX)); + + Ok(NZ08.saturating_mul(blocksize).saturating_add(chunks * 40)) + } +} + + + +/// # Ranged Store Splits. +/// +/// This iterator yields all non-empty split pairs of a ranged store. +pub(crate) struct LZ77StoreRangeSplits<'a> { + entries: &'a [LZ77StoreEntry], + splits: Range, +} + +impl<'a> Iterator for LZ77StoreRangeSplits<'a> { + type Item = (LZ77StoreRange<'a>, LZ77StoreRange<'a>); + + #[allow(unsafe_code)] + fn next(&mut self) -> Option { + let mid = self.splits.next()?; + // Safety: we verified splits was in between the start and end points + // of our entries. 
+ let (a, b) = unsafe { self.entries.split_at_unchecked(mid) }; + Some(( + LZ77StoreRange { entries: a }, + LZ77StoreRange { entries: b }, + )) + } + + fn size_hint(&self) -> (usize, Option) { + let len = self.splits.len(); + (len, Some(len)) + } +} + +impl<'a> ExactSizeIterator for LZ77StoreRangeSplits<'a> { + fn len(&self) -> usize { self.splits.len() } } #[derive(Clone, Copy)] +/// # LZ77 Store Entry. +/// +/// This struct holds all of the relevant details for a given entry, including +/// its index in the original uncompressed chunk, the length and distance pair, +/// and the corresponding length and distance symbols. pub(crate) struct LZ77StoreEntry { - pub(crate) pos: usize, + pub(crate) pos: usize, // The original uncompressed chunk index. pub(crate) litlen: LitLen, pub(crate) dist: i16, - pub(crate) ll_symbol: Lsym, + pub(crate) ll_symbol: Lsym, // A symbol or literal depending on distance. pub(crate) d_symbol: Dsym, } @@ -122,8 +331,9 @@ impl LZ77StoreEntry { debug_assert!(dist < 32_768); // Using the signed type helps the compiler understand the upper - // range fits ZOPFLI_WINDOW_MAX and wraps (impossible) bad values to - // boot. + // range fits ZOPFLI_WINDOW_MAX. Impossibly large values would also + // get neatly tucked away in negative-land and ignored, but that'd be + // impossible! let dist = dist as i16; let (ll_symbol, d_symbol) = if 0 < dist {( @@ -141,6 +351,23 @@ impl LZ77StoreEntry { } } + /// # Fixed Cost. + /// + /// Note: these values all fit comfortably within `u8`, but we never just + /// want one cost, so the result is widened to `u32` to simplify + /// `LZ77StoreRange::block_size_fixed`'s efforts. + const fn fixed_cost(&self) -> u32 { + let base = FIXED_TREE_LL[self.ll_symbol as usize] as u8; + let extra = + if 0 < self.dist { + LENGTH_SYMBOL_BITS[self.litlen as usize] + + DISTANCE_BITS[self.d_symbol as usize] + + 5 // FIXED_TREE_D. + } + else { 0 }; + (base + extra) as u32 + } + /// # Length. /// /// If the distance is zero, 1, otherwise the litlen. @@ -149,3 +376,83 @@ impl LZ77StoreEntry { else { LitLen::L001 } } } + + + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn t_fixed_tree_256() { + // Our use of this particular index is hardcoded for simplicity; let's + // triple-check we chose correctly! + assert_eq!(FIXED_TREE_LL[256] as u32, NZ07.get()); + } + + #[test] + fn t_fixed_tree_d5() { + // Our use of this particular index is hardcoded for simplicity; let's + // triple-check we chose correctly! + assert!(super::super::FIXED_TREE_D.iter().all(|&d| d as u32 == 5)); + } + + #[test] + fn t_ranged_splits() { + /// # Poor Man's Equal Impl. + /// + /// Most of these types do not implement (or need) `Eq`, but since + /// we're only setting `pos` and `dist` uniquely here anyway, we can + /// limit matching to those two. + fn entry_eq((a, b): (&LZ77StoreEntry, &LZ77StoreEntry)) -> bool { + a.pos == b.pos && a.dist == b.dist + } + + // Generate an entry with the given pos and dist. + macro_rules! entry { + ($i:literal) => ( + LZ77StoreEntry { + pos: $i, + litlen: LitLen::L000, + dist: $i, + ll_symbol: Lsym::L000, + d_symbol: Dsym::D00, + } + ); + } + + // These entries are nonsensical, but all we're looking to do is check + // that splits are happening in the right place, so they only really + // need to be unique from one another. + let arr: &[LZ77StoreEntry] = &[ + entry!(0), + entry!(1), + entry!(2), + entry!(3), + entry!(4), + entry!(5), + ]; + let store = LZ77StoreRange { entries: arr }; + + // Do the splits. 
+ let mut splits = store.splits().expect("failed to split store"); + for i in 1..arr.len() { + assert_eq!(splits.len(), arr.len() - i); + let (a, b) = splits.next().expect("expected next split"); + let c = &arr[..i]; // Expected A. + let d = &arr[i..]; // Expected B. + + assert_eq!(a.len().get(), a.entries.len()); + assert_eq!(a.entries.len(), c.len()); + assert!(a.entries.iter().zip(c.iter()).all(entry_eq)); + + assert_eq!(b.len().get(), b.entries.len()); + assert_eq!(b.entries.len(), d.len()); + assert!(b.entries.iter().zip(d.iter()).all(entry_eq)); + } + + // We should be empty. + assert_eq!(splits.len(), 0); + assert!(splits.next().is_none()); + } +} diff --git a/flapfli/src/zopflipng/mod.rs b/flapfli/src/zopflipng/mod.rs index cff95ae..7852218 100644 --- a/flapfli/src/zopflipng/mod.rs +++ b/flapfli/src/zopflipng/mod.rs @@ -16,53 +16,54 @@ performant. mod blocks; mod cache; +mod chunk; mod error; mod hash; +mod iter; mod kat; mod lz77; mod rle; +mod rng; mod stats; mod symbols; -pub(crate) use blocks::{ - deflate_part, - SplitPoints, -}; +pub(crate) use blocks::deflate_part; use cache::{ MatchCache, + SplitCache, SqueezeCache, }; +pub(crate) use chunk::ZopfliChunk; use error::{ zopfli_error, ZopfliError, }; +use hash::ZopfliStateInit; pub(crate) use hash::ZopfliState; +use iter::ReducingSlices; use kat::{ best_tree_size, encode_tree, LengthLimitedCodeLengths, }; -pub(crate) use lz77::LZ77Store; -use rle::get_dynamic_lengths; -pub(crate) use rle::reset_dynamic_length_cache; -use super::{ - EncodedPNG, - lodepng::{ - DecodedImage, - LodePNGColorType, - LodePNGFilterStrategy, - LodePNGState, - ZopfliOut, - }, +use lz77::{ + LZ77Store, + LZ77StoreRange, }; +use rng::ZopfliRange; +use rle::DynamicLengths; +use super::deflate::ZopfliOut; use symbols::{ DeflateSym, + DeflateSymBasic, DISTANCE_BITS, + DISTANCE_BITS_F, DISTANCE_SYMBOLS, DISTANCE_VALUES, Dsym, LENGTH_SYMBOL_BIT_VALUES, LENGTH_SYMBOL_BITS, + LENGTH_SYMBOL_BITS_F, LENGTH_SYMBOLS, LitLen, Lsym, @@ -123,16 +124,28 @@ const FIXED_SYMBOLS_D: ArrayD = [ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, ]; -// This is the biggest chunk-o-data that can be passed to deflate. +/// # Step Size for Deflate Parts. +/// +/// The "active" portion of the `ZopfliChunk` passed from lodepng will never +/// exceed a million bytes. pub(super) const ZOPFLI_MASTER_BLOCK_SIZE: usize = 1_000_000; -// The matchable hash cache range. +/// # Hash/LZ77 Window Size. +/// +/// This is the window size used by lodepng when zopfli processing is enabled, +/// and the amount expected by structs like `ZopfliHash`. +const ZOPFLI_WINDOW_SIZE: usize = 32_768; + +/// # Minimum Matchable Distance. const ZOPFLI_MIN_MATCH: usize = 3; + +/// # Maximum Matchable Distance. const ZOPFLI_MAX_MATCH: usize = 258; /// # Length of Sublength Array. /// -/// This is hardcoded in `squeeze.c`. +/// The squeeze sublength array slices have indices spanning +/// `0..=ZOPFLI_MAX_MATCH`. const SUBLEN_LEN: usize = ZOPFLI_MAX_MATCH + 1; /// # Array with `ZOPFLI_NUM_LL` Entries. @@ -140,82 +153,3 @@ type ArrayLL = [T; ZOPFLI_NUM_LL]; /// # Array with `ZOPFLI_NUM_D` Entries. type ArrayD = [T; ZOPFLI_NUM_D]; - - - -#[must_use] -/// # Optimize! -/// -/// This will attempt to losslessly recompress the source PNG with the -/// strongest Zopfli filter strategy, and return a new PNG image if the result -/// is smaller than the original. -/// -/// Note: 16-bit transformations are not lossless; such images will have their -/// bit depths reduced to a more typical 8 bits. 
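(Editorial aside, not part of the diff.) The `LZ77StoreRange::block_size_uncompressed` helper in the lz77 changes above reduces to simple arithmetic once the `NonZeroU32` plumbing is stripped away. The sketch below restates it for a plain byte count; the name `uncompressed_bits` is illustrative only.

    /// Stored-block cost in bits: eight bits per byte of payload, plus a
    /// 40-bit header for every 65,535-byte piece the block must be split into.
    fn uncompressed_bits(blocksize: u32) -> u32 {
        let chunks = blocksize.div_ceil(u32::from(u16::MAX));
        blocksize.saturating_mul(8).saturating_add(chunks * 40)
    }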
-pub fn optimize(src: &[u8]) -> Option { - let mut dec = LodePNGState::default(); - let img = dec.decode(src)?; - - // Encode! - let strategy = best_strategy(&dec, &img); - let out = encode(&dec, &img, strategy, true)?; - - // Return it if better and nonzero! - if out.size < src.len() { Some(out) } - else { None } -} - - - -/// # Best Strategy. -/// -/// This attempts to find the best filtering strategy for the image by trying -/// all of them in fast mode, and picking whichever produces the smallest -/// output. -fn best_strategy(dec: &LodePNGState, img: &DecodedImage) -> LodePNGFilterStrategy { - [ - LodePNGFilterStrategy::LFS_ZERO, - LodePNGFilterStrategy::LFS_ONE, - LodePNGFilterStrategy::LFS_TWO, - LodePNGFilterStrategy::LFS_THREE, - LodePNGFilterStrategy::LFS_FOUR, - LodePNGFilterStrategy::LFS_MINSUM, - LodePNGFilterStrategy::LFS_ENTROPY, - LodePNGFilterStrategy::LFS_BRUTE_FORCE, - ] - .into_iter() - .filter_map(|s| encode(dec, img, s, false).map(|out| (out.size, s))) - .min_by(|a, b| a.0.cmp(&b.0)) - .map_or(LodePNGFilterStrategy::LFS_ZERO, |(_, s)| s) -} - -/// # Apply Optimizations. -/// -/// This attempts to re-encode an image using the provided filter strategy, -/// returning an `EncodedPNG` object if it all works out. -fn encode( - dec: &LodePNGState, - img: &DecodedImage, - strategy: LodePNGFilterStrategy, - slow: bool, -) -> Option { - // Encode and write to the buffer if it worked. - let mut enc = LodePNGState::encoder(dec, strategy, slow)?; - let out = enc.encode(img)?; - - // We might be able to save a couple bytes by nuking the palette if the - // image is already really small. - if - out.size < 4096 && - LodePNGColorType::LCT_PALETTE.is_match(&out) && - enc.prepare_encoder_small(img) - { - if let Some(out2) = enc.encode(img) { - if out2.size < out.size { - return Some(out2); - } - } - } - - Some(out) -} diff --git a/flapfli/src/zopflipng/rle.rs b/flapfli/src/zopflipng/rle.rs index 0170b4c..8de329a 100644 --- a/flapfli/src/zopflipng/rle.rs +++ b/flapfli/src/zopflipng/rle.rs @@ -2,18 +2,9 @@ # Flapfli: Huffman RLE Optimization. */ -use dactyl::NoHash; use std::{ - cell::{ - Cell, - RefCell, - }, - collections::{ - hash_map::Entry, - HashMap, - }, + cell::Cell, num::NonZeroU32, - ops::Range, }; use super::{ ArrayD, @@ -22,7 +13,7 @@ use super::{ DeflateSym, DISTANCE_BITS, LengthLimitedCodeLengths, - LZ77Store, + LZ77StoreRange, ZopfliError, }; @@ -34,112 +25,117 @@ const LENGTH_EXTRA_BITS: [u32; 29] = [ 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0, ]; -type RleCache = HashMap; - -thread_local!( - /// # Best Tree Cache. - /// - /// The dynamic length calculations are pretty terrible and can wind up - /// being repeated several times for a given block. To take out some of the - /// sting from that repetition, the results are statically cached. - /// - /// To prevent endless reallocation and minimize lookup times, the cache is - /// cleared for each new image. - static CACHE: RefCell = RefCell::new(HashMap::default()) -); - -/// # Get Dynamic Lengths. +/// # Dynamic Lengths. /// -/// This method calculates the dynamic tree symbols and size using both the -/// existing and optimized counts, then returns whichever set produces the -/// smallest output. +/// This struct is used to perform brute-force length-limited-code-length +/// calculations to determine the best (smallest) DEFLATE configuration for the +/// data. /// -/// Note: the returned size does not include the 3-bit block header. 
-pub(super) fn get_dynamic_lengths(store: &LZ77Store, rng: Range) --> Result<(u8, NonZeroU32, ArrayLL, ArrayD), ZopfliError> { - fn fetch( - cache: &mut RleCache, - ll_counts: &ArrayLL, - d_counts: &ArrayD, - ) -> Result<(u8, NonZeroU32, ArrayLL, ArrayD), ZopfliError> { +/// This is done in two passes: the first using the previously-collected LZ77 +/// histogram data, the second using RLE-optimized counts derived from same. +/// The best of the best is kept, the rest are forgotten. +pub(crate) struct DynamicLengths { + extra: u8, + size: NonZeroU32, + ll_lengths: ArrayLL, + d_lengths: ArrayD, +} + +impl DynamicLengths { + /// # New. + pub(crate) fn new(store: LZ77StoreRange) -> Result { + // Pull the counts from the store. + let (mut ll_counts, d_counts) = store.histogram(); + ll_counts[256] = 1; + // Pull the symbols, then get the sizes. let ll_lengths = ll_counts.llcl()?; - let d_lengths = d_llcl(d_counts)?; - let (data1, hash1) = calculate_size(cache, ll_counts, d_counts, &ll_lengths, &d_lengths)?; - - // Unless we've been here before and found optimization useless, repeat - // the process using optimized counts and symbols. - if ! data1.noop() { - let (ll_lengths2, d_lengths2) = optimized_symbols(ll_counts, d_counts)?; - let (data2, _) = calculate_size(cache, ll_counts, d_counts, &ll_lengths2, &d_lengths2)?; - - // Return this version if better. - if data2.size < data1.size { - return Ok((data2.extra(), data2.size, ll_lengths2, d_lengths2)); - } + let d_lengths = d_llcl(&d_counts)?; - // Update the original's cache to reflect that optimization didn't - // help so that we can skip all this the next time around. - if let Some(e) = cache.get_mut(&hash1) { e.set_noop(); } - } + // Calculate the sizes. + let (extra, treesize) = best_tree_size(&ll_lengths, &d_lengths)?; + let datasize = calculate_size_data(&ll_counts, &d_counts, &ll_lengths, &d_lengths); + let size = treesize.saturating_add(datasize); - // The first version was better! - Ok((data1.extra(), data1.size, ll_lengths, d_lengths)) - } + // Build the response. + let mut out = Self { extra, size, ll_lengths, d_lengths }; - // Pull the counts from the store. - let (mut ll_counts, d_counts) = store.histogram(rng); - ll_counts[256] = 1; + // But wait, there's more! Optimize the counts and repeat the process + // to see if that helps. + out.try_optimized(&ll_counts, &d_counts)?; - // Do all the work! - CACHE.with_borrow_mut(|cache| fetch(cache, &ll_counts, &d_counts)) -} + // Done! + Ok(out) + } -/// # Reset Dynamic Length Cache. -/// -/// To prevent endless reallocation and minimize lookup times, the cache is -/// cleared each time a new image is loaded. -pub(crate) fn reset_dynamic_length_cache() { CACHE.with_borrow_mut(HashMap::clear); } + #[inline(never)] + /// # Unique Symbols? + /// + /// Returns true if any of the symbols are different than the ones we + /// already have. (They wind up the same often enough that it is worth + /// checking to reduce the potential workload.) + fn is_unique(&self, ll_lengths: &ArrayLL, d_lengths: &ArrayD) -> bool { + #[allow(unsafe_code)] + /// # As Bytes. + /// + /// Reimagine a symbol array as raw bytes for more optimal comparison. + const fn deflate_bytes(arr: &[DeflateSym; N]) -> &[u8; N] { + // Safety: DeflateSym has the same size and alignment as u8. + unsafe { &* arr.as_ptr().cast() } + } + *deflate_bytes(&self.d_lengths) != *deflate_bytes(d_lengths) || + deflate_bytes(&self.ll_lengths) != deflate_bytes(ll_lengths) + } + /// # Try Optimized. 
+ /// + /// Optimize the counts and fetch new symbols, calculate their cost, and + /// keep them if better. + fn try_optimized(&mut self, ll_counts: &ArrayLL, d_counts: &ArrayD) + -> Result<(), ZopfliError> { + let (ll_lengths2, d_lengths2) = optimized_symbols(ll_counts, d_counts)?; + + // It is only worth calculating the new sizes if the lengths are + // different than the ones we already have. + if self.is_unique(&ll_lengths2, &d_lengths2) { + // Calculate the sizes. + let (extra, treesize) = best_tree_size(&ll_lengths2, &d_lengths2)?; + let datasize = calculate_size_data(ll_counts, d_counts, &ll_lengths2, &d_lengths2); + let size = treesize.saturating_add(datasize); + + // Update our values if the new cost is lower. + if size < self.size { + self.extra = extra; + self.size = size; + self.ll_lengths = ll_lengths2; + self.d_lengths = d_lengths2; + } + } -#[derive(Clone, Copy)] -/// # Cache Entry. -struct CacheEntry { - extra: u8, // Extended alphabet used. - size: NonZeroU32, // Combined tree/data size. + Ok(()) + } } -impl CacheEntry { - /// # Extra Bits. - /// - /// The first three bits comprise the extended alphabet details. - const MASK_EXTRA: u8 = 0b0000_0111; - - /// # Fruitless Optimization Mask. - /// - /// The fourth bit is used to indicate when the secondary optimization pass - /// failed to result in better output. - const MASK_NOOP: u8 = 0b0000_1000; +impl DynamicLengths { + /// # Cost. + pub(crate) const fn cost(&self) -> NonZeroU32 { self.size } /// # Extra. - /// - /// Return the true "extra" value, without the noop bit. - const fn extra(self) -> u8 { self.extra & Self::MASK_EXTRA } + pub(crate) const fn extra(&self) -> u8 { self.extra } - /// # Fruitless Optimization? - /// - /// Returns true if optimizing the counts made no positive difference - /// during the previous pass. - const fn noop(self) -> bool { Self::MASK_NOOP == self.extra & Self::MASK_NOOP } + /// # Litlen Lengths. + pub(crate) const fn ll_lengths(&self) -> &ArrayLL { &self.ll_lengths } + + /// # Distance Lengths. + pub(crate) const fn d_lengths(&self) -> &ArrayD { &self.d_lengths } - /// # Set Fruitless Optimization. + /// # Take Size. /// - /// This sets the noop flag so the optimization pass can be skipped on - /// subsequent calls. - fn set_noop(&mut self) { self.extra |= Self::MASK_NOOP; } + /// Same as `DynamicLengths::cost`, but drop `self` in the process. + pub(crate) const fn take_size(self) -> NonZeroU32 { self.size } } @@ -150,7 +146,7 @@ impl CacheEntry { /// `true` for distance codes in a sequence of 5+ zeroes or 7+ (identical) /// non-zeroes, `false` otherwise. /// -/// This moots the need to collect such values into a vector in advance and +/// This moots the need to collect the values into a vector in advance and /// reduces the number of passes required to optimize Huffman codes. struct GoodForRle<'a> { counts: &'a [Cell], @@ -229,64 +225,38 @@ impl<'a> ExactSizeIterator for GoodForRle<'a> { -/// # Calculate Size. +/// # Calculate Dynamic Data Block Size. /// -/// Pull the best tree details from the cache, or calculate them fresh (and -/// cache them for next time). -fn calculate_size( - cache: &mut RleCache, +/// This returns the size of the data itself, basically just a sum of sums. +fn calculate_size_data( ll_counts: &ArrayLL, d_counts: &ArrayD, ll_lengths: &ArrayLL, d_lengths: &ArrayD, -) -> Result<(CacheEntry, u64), ZopfliError> { - #[inline(never)] - /// # Calculate Dynamic Block Size. 
- fn data_size( - ll_counts: &ArrayLL, - d_counts: &ArrayD, - ll_lengths: &ArrayLL, - d_lengths: &ArrayD, - ) -> u32 { - // The end symbol is always included. - let mut size = ll_lengths[256] as u32; - - // The early lengths and counts. - for (ll, lc) in ll_lengths.iter().copied().zip(ll_counts).take(256) { - size += (ll as u32) * lc; - } - - // The lengths and counts with extra bits. - for (i, lbit) in (257..257 + LENGTH_EXTRA_BITS.len()).zip(LENGTH_EXTRA_BITS) { - size += (ll_lengths[i] as u32 + lbit) * ll_counts[i]; - } - - // The distance lengths, counts, and extra bits. - for (i, dbit) in DISTANCE_BITS.iter().copied().enumerate().take(30) { - size += (d_lengths[i] as u32 + u32::from(dbit)) * d_counts[i]; - } - - size - } - - // Hash the symbols. - let hash = deflate_hash(ll_counts, d_counts, ll_lengths, d_lengths); - - // Check the cache first. - let entry = match cache.entry(hash) { - Entry::Occupied(e) => return Ok((*e.get(), hash)), - Entry::Vacant(e) => e, - }; - - // Calculate the sizes. - let (extra, treesize) = best_tree_size(ll_lengths, d_lengths)?; - let datasize = data_size(ll_counts, d_counts, ll_lengths, d_lengths); - let size = treesize.saturating_add(datasize); - let out = CacheEntry { extra, size }; - - // Save to cache for later, then return. - entry.insert(out); - Ok((out, hash)) +) -> u32 { + // The early lengths and counts. + let a = ll_lengths.iter().copied() + .zip(ll_counts.iter().copied()) + .take(256) + .map(|(ll, lc)| (ll as u32) * lc) + .sum::(); + + // The lengths and counts with extra bits. + let b = ll_lengths[257..].iter().copied() + .zip(ll_counts[257..].iter().copied()) + .zip(LENGTH_EXTRA_BITS) + .map(|((ll, lc), lbit)| (ll as u32 + lbit) * lc) + .sum::(); + + // The distance lengths, counts, and extra bits. + let c = d_lengths.iter().copied() + .zip(d_counts.iter().copied()) + .zip(DISTANCE_BITS) + .take(30) + .map(|((dl, dc), dbit)| (dl as u32 + u32::from(dbit)) * dc) + .sum::(); + + a + b + c + ll_lengths[256] as u32 } /// # Dynamic Length-Limited Code Lengths. @@ -296,18 +266,20 @@ fn d_llcl(d_counts: &ArrayD) -> Result, ZopfliError> { let mut d_lengths = d_counts.llcl()?; - // Buggy decoders require at least two non-zero distances. Let's see - // what we've got! + // Buggy decoders require at least two non-zero distances. Let's make sure + // we have at least that many. let mut one: Option = None; for (i, dist) in d_lengths.iter().copied().enumerate().take(30) { // We have (at least) two non-zero entries; no patching needed! if ! dist.is_zero() && one.replace(i == 0).is_some() { return Ok(d_lengths); } } + // If we're here, fewer than two non-zero distances are in the collection; + // we'll need to fake the counts to reach our quota. Haha. match one { // The first entry had a code, so patching the second gives us two. Some(true) => { d_lengths[1] = DeflateSym::D01; }, - // The first entry didn't have a code, so patching it gives us two. + // The first entry did not have a code, so patching it gives us two. Some(false) => { d_lengths[0] = DeflateSym::D01; }, // There were no codes at all, so we can just patch the first two. None => { @@ -318,33 +290,21 @@ fn d_llcl(d_counts: &ArrayD) Ok(d_lengths) } - -/// # Hash Counts and Symbols. +/* +#[inline(never)] +/// # Compare Two Symbol Sets for Uniqueness. /// -/// Calculate and return a hash for the set. This is done independently of the -/// map to reduce its signature and enable us to quickly repeat lookups if -/// necessary. 
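(Editorial aside, not part of the diff.) The `GoodForRle` iterator documented above encodes a simple run rule. The standalone sketch below restates that rule over a plain slice; the name `good_for_rle` is illustrative, and the real iterator may treat edge cases such as the trailing run differently.

    /// A position is "good for RLE" when it belongs to a run of five or more
    /// zeros, or a run of seven or more identical non-zero counts.
    fn good_for_rle(counts: &[u32]) -> Vec<bool> {
        let mut out = vec![false; counts.len()];
        let mut i = 0;
        while i < counts.len() {
            // Measure the run of identical values starting at i.
            let mut j = i;
            while j < counts.len() && counts[j] == counts[i] { j += 1; }
            let good = if counts[i] == 0 { j - i >= 5 } else { j - i >= 7 };
            for slot in &mut out[i..j] { *slot = good; }
            i = j;
        }
        out
    }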
-/// -/// Note: both passes from a given dynamic lengths call will have the same -/// counts, but they hash quickly enough there's no performance benefit from -/// over-complicated the formula. -fn deflate_hash( - ll_counts: &ArrayLL, - d_counts: &ArrayD, - ll_lengths: &ArrayLL, - d_lengths: &ArrayD, -) -> u64 { - use ahash::RandomState; - use std::hash::{BuildHasher, Hash, Hasher}; - +/// This compares two sets of symbols, returning `true` if they're different +/// from one another. +fn diff_symbols(a: &[DeflateSym; N], b: &[DeflateSym; N]) -> bool { #[allow(unsafe_code)] /// # As Bytes. /// - /// Convert a `DeflateSym` array into an equivalent byte array for faster - /// hashing. + /// Transform a `DeflateSym` array into an equivalent byte array for more + /// efficient comparison. (Bytes get all the love!) const fn deflate_bytes(arr: &[DeflateSym; N]) -> &[u8; N] { // Safety: DeflateSym has the same size and alignment as u8, and if - // for some reason that isn't true, this code won't compile. + // for some reason that isn't true, this code won't compile! const { assert!(std::mem::size_of::<[DeflateSym; N]>() == std::mem::size_of::<[u8; N]>()); assert!(std::mem::align_of::<[DeflateSym; N]>() == std::mem::align_of::<[u8; N]>()); @@ -352,20 +312,8 @@ fn deflate_hash( unsafe { &* arr.as_ptr().cast() } } - let mut h = RandomState::with_seeds( - 0x8596_cc44_bef0_1aa0, - 0x98d4_0948_da60_19ae, - 0x49f1_3013_c503_a6aa, - 0xc4d7_82ff_3c9f_7bef, - ).build_hasher(); - - ll_counts.hash(&mut h); - d_counts.hash(&mut h); - deflate_bytes(ll_lengths).hash(&mut h); - deflate_bytes(d_lengths).hash(&mut h); - - h.finish() -} + deflate_bytes(a) != deflate_bytes(b) +}*/ /// # Get RLE-Optimized Symbols. /// diff --git a/flapfli/src/zopflipng/rng.rs b/flapfli/src/zopflipng/rng.rs new file mode 100644 index 0000000..598d77b --- /dev/null +++ b/flapfli/src/zopflipng/rng.rs @@ -0,0 +1,125 @@ +/*! +# Flapfli: Ranges. +*/ + +use std::{ + num::{ + NonZeroU32, + NonZeroUsize, + }, + ops::Range, +}; +use super::{ + zopfli_error, + ZOPFLI_MASTER_BLOCK_SIZE, + ZopfliError, +}; + + + +#[derive(Debug, Clone, Copy)] +/// # Block Range. +/// +/// This struct exists primarily to guarantee a range is non-empty and no +/// larger than `ZOPFLI_MASTER_BLOCK_SIZE`. +/// +/// It also implements `Copy`, so there's that too! Haha. +pub(crate) struct ZopfliRange { + start: usize, + end: usize, +} + +impl ZopfliRange { + /// # New. + /// + /// Return a new instance spanning `start..end` so long as the struct's + /// requirements are met, otherwise an error. + pub(crate) const fn new(start: usize, end: usize) -> Result { + if start < end && end - start <= ZOPFLI_MASTER_BLOCK_SIZE { + Ok(Self { start, end }) + } + else { Err(zopfli_error!()) } + } + + /// # Update. + /// + /// Adjust the start and end positions of the range so long as the new + /// values satisfy the struct's requirements, otherwise an error. + pub(crate) fn set(&mut self, start: usize, end: usize) -> Result<(), ZopfliError> { + if start < end && end - start <= ZOPFLI_MASTER_BLOCK_SIZE { + self.start = start; + self.end = end; + Ok(()) + } + else { Err(zopfli_error!()) } + } +} + +impl ZopfliRange { + /// # Start. + pub(crate) const fn start(&self) -> usize { self.start } + + /// # End. + pub(crate) const fn end(&self) -> usize { self.end } + + /// # As (Traditional) Range. + pub(crate) const fn rng(&self) -> Range { self.start..self.end } + + #[allow(unsafe_code)] + /// # Length. 
+ pub(crate) const fn len(&self) -> NonZeroUsize { + // Safety: we verified start is less than end during construction. + unsafe { NonZeroUsize::new_unchecked(self.end - self.start) } + } + + #[allow(unsafe_code, clippy::cast_possible_truncation)] + /// # Length (32-bit). + /// + /// Same as `ZopfliRange::len`, but more convenient in cases where 32-bit + /// values are needed (such as cost/size calculations). + /// + /// Because our ranges are capped at a million, the lengths will always fit + /// without truncation. + pub(crate) const fn len32(&self) -> NonZeroU32 { + // Safety: we verified start is less than end during construction, and + // the total is within a million. + unsafe { NonZeroU32::new_unchecked((self.end - self.start) as u32) } + } +} + + + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn t_range() { + // Some simple bad ranges. + assert!(ZopfliRange::new(0, 0).is_err()); + assert!(ZopfliRange::new(3, 2).is_err()); + assert!(ZopfliRange::new(0, ZOPFLI_MASTER_BLOCK_SIZE + 1).is_err()); + + // This should fit! + assert!(ZopfliRange::new(0, ZOPFLI_MASTER_BLOCK_SIZE).is_ok()); + + // Let's test the getters. + let mut rng = ZopfliRange::new(1, 5).expect("Range failed!"); + assert_eq!(rng.start(), 1); + assert_eq!(rng.end(), 5); + assert_eq!(rng.len(), NonZeroUsize::new(4).unwrap()); + assert_eq!(rng.rng(), 1..5); + + // And the setters. + assert!(rng.set(2, 6).is_ok()); + assert_eq!(rng.start(), 2); + assert_eq!(rng.end(), 6); + assert_eq!(rng.len(), NonZeroUsize::new(4).unwrap()); + assert_eq!(rng.rng(), 2..6); + + // This should fail. + assert!(rng.set(0, 0).is_err()); + assert!(rng.set(3, 2).is_err()); + assert!(rng.set(0, ZOPFLI_MASTER_BLOCK_SIZE + 1).is_err()); + } +} diff --git a/flapfli/src/zopflipng/stats.rs b/flapfli/src/zopflipng/stats.rs index e96c0bf..8e4c18b 100644 --- a/flapfli/src/zopflipng/stats.rs +++ b/flapfli/src/zopflipng/stats.rs @@ -19,7 +19,8 @@ use super::{ #[derive(Clone, Copy)] /// # Randomness. /// -/// This struct is only used to cheaply randomize stat frequencies. +/// This struct is only used to cheaply (and predictably) shuffle stat +/// frequencies. pub(crate) struct RanState { m_w: u32, m_z: u32, @@ -50,12 +51,11 @@ impl RanState { #[derive(Clone, Copy)] /// # Symbol Stats. /// -/// This holds the length and distance symbols and costs for a given block, -/// data that can be used to improve compression on subsequent passes. +/// This holds the length and distance symbols and costs for a given block, +/// data which can be used to improve compression on subsequent passes. pub(crate) struct SymbolStats { ll_counts: ArrayLL, d_counts: ArrayD, - pub(crate) ll_symbols: ArrayLL, pub(crate) d_symbols: ArrayD, } @@ -74,70 +74,47 @@ impl SymbolStats { - /// # Add Previous Stats (Weighted). + /// # Crunch Symbols. /// - /// This is essentially an `AddAssign` for `ll_counts` and `d_counts`. Each - /// previous value is halved and added to the corresponding current value. - pub(crate) fn add_last( - &mut self, - ll_counts: &ArrayLL, - d_counts: &ArrayD, - ) { - for (l, r) in self.ll_counts.iter_mut().zip(ll_counts.iter().copied()) { - *l += r.wrapping_div(2); - } - for (l, r) in self.d_counts.iter_mut().zip(d_counts.iter().copied()) { - *l += r.wrapping_div(2); - } - - // Set the end symbol. - self.ll_counts[256] = 1; - } - - /// # Clear Frequencies.
+ /// This calculates the "entropy" of the `ll_counts` and `d_counts` — a + /// fancy way of saying the difference between the log2 of everything and + /// the log2 of self — storing the results in the corresponding symbol + /// arrays. /// - /// Set all `ll_counts` and `d_counts` to zero and return the originals. - pub(crate) fn clear(&mut self) -> (ArrayLL, ArrayD) { - ( - std::mem::replace(&mut self.ll_counts, ZEROED_COUNTS_LL), - std::mem::replace(&mut self.d_counts, ZEROED_COUNTS_D), - ) - } - - /// # Calculate/Set Statistics. - /// - /// This calculates the "entropy" of the `ll_counts` and `d_counts`, storing the - /// results in the corresponding symbols arrays. + /// Note: the symbols are only valid for the _current_ counts, but they + /// don't need to be rebuilt after each and every little change because + /// they're only ever referenced during `ZopfliState::optimal_run` passes; + /// so long as they're (re)crunched before that method is called, life is + /// grand. pub(crate) fn crunch(&mut self) { - #[allow(clippy::cast_precision_loss)] - fn calculate_entropy(count: &[u32; N], bitlengths: &mut [f64; N]) { - let sum = count.iter().sum::(); - - if sum == 0 { - let log2sum = (N as f64).log2(); - bitlengths.fill(log2sum); - } - else { - let log2sum = f64::from(sum).log2(); - - for (c, b) in count.iter().copied().zip(bitlengths.iter_mut()) { - if c == 0 { *b = log2sum; } - else { - *b = log2sum - f64::from(c).log2(); - if b.is_sign_negative() { *b = 0.0; } - } - } - } + // Distances first. + let sum = self.d_counts.iter().copied().sum::(); + let log2sum = + if sum == 0 { 5.0 } // 32.log2() + else { f64::from(sum).log2() }; + self.d_symbols.fill(log2sum); + for (c, b) in self.d_counts.iter().copied().zip(&mut self.d_symbols) { + if c != 0 { *b -= f64::from(c).log2(); } } - calculate_entropy(&self.ll_counts, &mut self.ll_symbols); - calculate_entropy(&self.d_counts, &mut self.d_symbols); + // Lengths second. + let sum = self.ll_counts.iter().copied().sum::(); + // Safety: ll_counts[256] is always 1 — (re)load_store and randomize + // both force it — so this sum will always be nonzero. + if sum == 0 { crate::unreachable(); } + let log2sum = f64::from(sum).log2(); + self.ll_symbols.fill(log2sum); + for (c, b) in self.ll_counts.iter().copied().zip(&mut self.ll_symbols) { + if c != 0 { *b -= f64::from(c).log2(); } + } } /// # Load Statistics. /// /// This updates the `ll_counts` and `d_counts` stats using the data from the - /// `ZopfliLZ77Store` store, then crunches the results. + /// `LZ77Store` store. + /// + /// Note: this does _not_ rebuild the symbol tables. pub(crate) fn load_store(&mut self, store: &LZ77Store) { for e in &store.entries { self.ll_counts[e.ll_symbol as usize] += 1; @@ -146,26 +123,67 @@ impl SymbolStats { // Set the end symbol and crunch. self.ll_counts[256] = 1; - self.crunch(); } /// # Randomize Stat Frequencies. /// /// This randomizes the stat frequencies to allow things to maybe turn out /// different on subsequent squeeze passes. + /// + /// For this to work properly, a single `RanState` must be used for all + /// iterations, and because shuffling advances the `RanState`, litlens must + /// be processed before distances. + /// + /// Yeah… this is super weird. Haha. + /// + /// Note: this does _not_ rebuild the symbol tables. 
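(Editorial aside, not part of the diff.) The `crunch` method above charges each symbol log2(total) minus log2(count) bits, with unused symbols paying the full log2(total). The sketch below restates that cost for an arbitrary histogram; the name `entropy_costs` is illustrative, and the real method special-cases an all-zero distance histogram with the precomputed log2(32) = 5.0.

    /// Per-symbol bit costs for a histogram of counts.
    fn entropy_costs(counts: &[u32]) -> Vec<f64> {
        let total: u32 = counts.iter().copied().sum();
        let log2sum =
            if total == 0 { (counts.len() as f64).log2() }
            else { f64::from(total).log2() };
        counts.iter()
            .map(|&c| if c == 0 { log2sum } else { log2sum - f64::from(c).log2() })
            .collect()
    }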
pub(crate) fn randomize(&mut self, state: &mut RanState) { - fn randomize_freqs(freqs: &mut [u32; N], state: &mut RanState) { - for i in 0..N { + fn shuffle_counts(counts: &mut [u32; N], state: &mut RanState) { + const { assert!(N == ZOPFLI_NUM_D || N == ZOPFLI_NUM_LL); } + for i in const { 0..N } { if (state.randomize() >> 4) % 3 == 0 { let index = state.randomize() as usize % N; - freqs[i] = freqs[index]; + counts[i] = counts[index]; } } } - randomize_freqs(&mut self.ll_counts, state); - randomize_freqs(&mut self.d_counts, state); + shuffle_counts(&mut self.ll_counts, state); // Lengths need to go first. + shuffle_counts(&mut self.d_counts, state); // Set the end symbol. self.ll_counts[256] = 1; } + + /// # Reload Store. + /// + /// Like `SymbolStats::load_store`, but reset or halve the counts first. + /// (Halving creates a sort of weighted average, useful once a few + /// iterations have occurred.) + /// + /// Note: this does _not_ rebuild the symbols. + pub(crate) fn reload_store(&mut self, store: &LZ77Store, weighted: bool) { + if weighted { + for c in &mut self.d_counts { *c /= 2; } + for c in &mut self.ll_counts { *c /= 2; } + } + else { + self.d_counts.fill(0); + self.ll_counts.fill(0); + } + + self.load_store(store); + } +} + + + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn t_d_log2() { + // Make sure we precomputed the 32.log2() correctly! + assert_eq!((ZOPFLI_NUM_D as f64).log2(), 5.0); + } } diff --git a/flapfli/src/zopflipng/symbols.rs b/flapfli/src/zopflipng/symbols.rs index 453b395..3eb8c21 100644 --- a/flapfli/src/zopflipng/symbols.rs +++ b/flapfli/src/zopflipng/symbols.rs @@ -10,16 +10,6 @@ via `build.rs`. // terrible DISTANCE_SYMBOLS and DISTANCE_VALUES lookup tables. include!(concat!(env!("OUT_DIR"), "/symbols.rs")); -/// # Distance Extra Bits (by Symbol). -/// -/// Note only the first `30` values have meaning, but the compiler doesn't -/// understand distances are only using 15 bits. Padding the table to `32` -/// helps eliminate superfluous bounds checks. -pub(crate) const DISTANCE_BITS: [u8; 32] = [ - 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, - 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 0, 0, -]; - /// # Length Symbols by Litlen. pub(crate) const LENGTH_SYMBOLS: [Lsym; 259] = [ Lsym::L000, Lsym::L000, Lsym::L000, @@ -57,21 +47,6 @@ pub(crate) const LENGTH_SYMBOLS: [Lsym; 259] = [ Lsym::L284, Lsym::L284, Lsym::L284, Lsym::L284, Lsym::L284, Lsym::L284, Lsym::L284, Lsym::L285, ]; -/// # Length Symbol Bits by Litlen. -pub(crate) const LENGTH_SYMBOL_BITS: [u8; 259] = [ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, -]; - /// # Length Symbol Bit Values by Litlen. 
pub(crate) const LENGTH_SYMBOL_BIT_VALUES: [u8; 259] = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, @@ -88,7 +63,11 @@ pub(crate) const LENGTH_SYMBOL_BIT_VALUES: [u8; 259] = [ ]; /// # Symbol Iterator. +/// +/// This trait exposes a single `all` method that returns an iterator over the +/// enum's variants. pub(crate) trait SymbolIteration>: Sized { + /// # Iterate All Variants! fn all() -> U; } @@ -127,7 +106,7 @@ impl LitLen { /// # Is Max? /// - /// Returns `true` if `self` is `Self::MAX_MATCH`. + /// Returns `true` if `self` is exactly `Self::MAX_MATCH`. pub(crate) const fn is_max(self) -> bool { matches!(self, Self::MAX_MATCH) } /// # Is Zero? @@ -146,6 +125,15 @@ impl LitLen { unsafe { std::mem::transmute::(n as u16) } } + #[allow(unsafe_code)] + /// # From U8+3. + /// + /// This reverses the work done by `LitLen::to_packed_u8`, returning the + /// `LitLen` equivalent of `n + 3`. + pub(crate) const fn from_packed_u8(n: u8) -> Self { + unsafe { std::mem::transmute::(n as u16 + 3) } + } + #[allow(unsafe_code)] /// # Min w/ U16. /// @@ -181,6 +169,20 @@ impl LitLen { pub(crate) const fn next_iter(after: Self) -> LitLenIter { LitLenIter(after as u16 + 1) } + + #[allow(clippy::cast_possible_truncation)] + /// # To Packed U8. + /// + /// This method packs (a matcheable) `LitLen` into a `u8` by subtracting + /// three. (This works because `LitLen::MAX_MATCH - 3 == u8::MAX`.) + /// + /// Values less than three shouldn't ever find their way here, but if they + /// do zero is returned. + pub(crate) const fn to_packed_u8(self) -> u8 { + let n = self as u16; + if 3 < n { (n - 3) as u8 } + else { 0 } + } } impl Lsym { @@ -196,12 +198,23 @@ impl Lsym { impl SplitLen { /// # Is Zero? + /// + /// Returns `true` if `self` is zero. pub(crate) const fn is_zero(self) -> bool { matches!(self, Self::S00) } /// # Is Max? + /// + /// Returns `true` if `self` is the maximum value (`SplitLen::S14`). pub(crate) const fn is_max(self) -> bool { matches!(self, Self::S14) } /// # Increment. + /// + /// Returns `self + 1`. + /// + /// ## Safety + /// + /// This would be UB if `self.is_max()`; the caller must explicitly check + /// that is not the case before incrementing. pub(crate) const fn increment(self) -> Self { #[allow(unsafe_code)] unsafe { @@ -209,7 +222,7 @@ impl SplitLen { // `split_lz77` and `split_raw` — both of which explicitly check // the current value, breaking their loops if/when the maximum is // reached. - if self.is_max() { core::hint::unreachable_unchecked(); } + if self.is_max() { crate::unreachable(); } // Safety: SplitLen has the same size and alignment as u8. std::mem::transmute::(self as u8 + 1) @@ -223,6 +236,18 @@ impl SplitLen { mod tests { use super::*; + #[test] + fn t_symbol_bits() { + // The DISTANCE_BITS/_F and LENGTH_SYMBOL_BITS/_F constants should have + // equivalent values. + for (f, i) in DISTANCE_BITS_F.iter().copied().zip(DISTANCE_BITS) { + assert_eq!(f, f64::from(i)); + } + for (f, i) in LENGTH_SYMBOL_BITS_F.iter().copied().zip(LENGTH_SYMBOL_BITS) { + assert_eq!(f, f64::from(i)); + } + } + #[test] /// # Deflate Symbol Size and Alignment. fn t_deflate_size_align() { diff --git a/justfile b/justfile index a3208e5..9766b99 100644 --- a/justfile +++ b/justfile @@ -105,7 +105,15 @@ export CXXFLAGS := "-Wall -Wextra -flto -march=x86-64-v3" # Make the docs. 
cargo rustdoc \ --release \ - --target-dir "{{ cargo_dir }}" + --manifest-path "{{ pkg_dir1 }}/Cargo.toml" \ + --target-dir "{{ cargo_dir }}" \ + -- --document-private-items + + cargo rustdoc \ + --release \ + --manifest-path "{{ pkg_dir2 }}/Cargo.toml" \ + --target-dir "{{ cargo_dir }}" \ + -- --document-private-items # Move the docs and clean up ownership. [ ! -d "{{ doc_dir }}" ] || rm -rf "{{ doc_dir }}" diff --git a/release/man/flaca.1 b/release/man/flaca.1 index 47fc9df..6685f30 100644 --- a/release/man/flaca.1 +++ b/release/man/flaca.1 @@ -1,6 +1,6 @@ -.TH "FLACA" "1" "June 2024" "Flaca v3.1.2" "User Commands" +.TH "FLACA" "1" "July 2024" "Flaca v3.1.3" "User Commands" .SH NAME -Flaca \- Manual page for flaca v3.1.2. +Flaca \- Manual page for flaca v3.1.3. .SH DESCRIPTION Brute\-force, lossless JPEG and PNG compression. .SS USAGE: diff --git a/skel/assets/pgo.b3 b/skel/assets/pgo.b3 index 9373934..675c524 100644 --- a/skel/assets/pgo.b3 +++ b/skel/assets/pgo.b3 @@ -10,6 +10,7 @@ e4ff0642f0f19d91b28125d166ad4691d66ea039896a70c45a1c40f9644b90b3 ./pgo/periodic 347c34094723a02f3b6432aa00de53ef0a47c8b735f4d8bb7a564f6890a08db9 ./pgo/smile.png 9ce27ed293bfc346b5f6a20b2e7fabb0f8d15f8fce6ac60b3669b56778fbe616 ./pgo/sr.png ed450bcfae1bb62505c9a6375b3458046942ba85b7cc9c7e12e10c906c0ecaae ./pgo/suck.png +772c152fa637e451ac1bf7352ec54ed91c824f42b0f1be4e39ceb3501e9cdb99 ./pgo/tiny.png 39db2fcb57e60439467a78b03cf9bf984e0f562ef1d676ee8537a4f71f669b52 ./pgo/ubuntu.png ba2e1f129bddbbf614553a028d24ff2d1d7b49b2d5eddf9542c1c32af8e7d739 ./pgo/venn256.png 5782a9a860dbccd78d2bb061e7facd599a5f603650187c4612dadbf316bd42fb ./pgo/venn2048.png diff --git a/skel/pgo/tiny.png b/skel/pgo/tiny.png new file mode 100644 index 0000000..0874f6a Binary files /dev/null and b/skel/pgo/tiny.png differ
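(Editorial aside, not part of the diff.) A closing note on the `LitLen::to_packed_u8` / `from_packed_u8` pair added in the symbols.rs changes above: match lengths span 3..=258, so subtracting the minimum match of three makes every one of them fit a `u8` exactly (258 - 3 == 255). The free functions below restate the round trip with plain integers; their names are illustrative only.

    /// Pack a match length (3..=258) into a byte by dropping the minimum match.
    fn pack_len(len: u16) -> u8 { (len.clamp(3, 258) - 3) as u8 }

    /// Recover the match length from its packed form.
    fn unpack_len(packed: u8) -> u16 { u16::from(packed) + 3 }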