From b3e2c545938e53d15091f9eb4ea5cbe2277500b2 Mon Sep 17 00:00:00 2001 From: BlueCube3310 <53150244+BlueCube3310@users.noreply.github.com> Date: Sat, 28 Sep 2024 12:05:45 +0200 Subject: [PATCH] BasisU: Update to 1.50.0 and add HDR support --- editor/import/resource_importer_texture.cpp | 5 - modules/basis_universal/SCsub | 4 + .../basis_universal/image_compress_basisu.cpp | 206 +- .../basis_universal/image_compress_basisu.h | 9 + thirdparty/README.md | 2 +- .../encoder/3rdparty/android_astc_decomp.cpp | 2052 ++++++++++ .../encoder/3rdparty/android_astc_decomp.h | 45 + .../encoder/basisu_astc_hdr_enc.cpp | 3310 +++++++++++++++ .../encoder/basisu_astc_hdr_enc.h | 224 + .../encoder/basisu_backend.cpp | 2 +- .../basis_universal/encoder/basisu_backend.h | 2 +- .../encoder/basisu_basis_file.cpp | 2 +- .../encoder/basisu_basis_file.h | 2 +- .../basis_universal/encoder/basisu_bc7enc.cpp | 5 +- .../basis_universal/encoder/basisu_bc7enc.h | 2 +- .../basis_universal/encoder/basisu_comp.cpp | 2128 ++++++++-- .../basis_universal/encoder/basisu_comp.h | 104 +- .../basis_universal/encoder/basisu_enc.cpp | 1908 ++++++++- .../basis_universal/encoder/basisu_enc.h | 653 ++- .../basis_universal/encoder/basisu_etc.cpp | 2 +- .../basis_universal/encoder/basisu_etc.h | 2 +- .../encoder/basisu_frontend.cpp | 3 +- .../basis_universal/encoder/basisu_frontend.h | 2 +- .../encoder/basisu_gpu_texture.cpp | 561 ++- .../encoder/basisu_gpu_texture.h | 50 +- .../encoder/basisu_kernels_declares.h | 2 +- .../encoder/basisu_kernels_imp.h | 2 +- .../encoder/basisu_kernels_sse.cpp | 18 +- .../basis_universal/encoder/basisu_miniz.h | 10 +- .../basis_universal/encoder/basisu_opencl.cpp | 2 +- .../basis_universal/encoder/basisu_opencl.h | 2 +- .../encoder/basisu_pvrtc1_4.cpp | 2 +- .../basis_universal/encoder/basisu_pvrtc1_4.h | 13 +- .../encoder/basisu_resample_filters.cpp | 2 +- .../encoder/basisu_resampler.cpp | 2 +- .../encoder/basisu_resampler.h | 2 +- .../encoder/basisu_resampler_filters.h | 2 +- 
.../basis_universal/encoder/basisu_ssim.cpp | 2 +- .../basis_universal/encoder/basisu_ssim.h | 2 +- .../encoder/basisu_uastc_enc.cpp | 21 +- .../encoder/basisu_uastc_enc.h | 2 +- .../basis_universal/encoder/cppspmd_flow.h | 2 +- .../basis_universal/encoder/cppspmd_math.h | 4 +- .../encoder/cppspmd_math_declares.h | 2 +- .../basis_universal/encoder/cppspmd_sse.h | 28 +- .../encoder/cppspmd_type_aliases.h | 2 +- .../basis_universal/encoder/pvpngreader.cpp | 18 +- .../patches/external-jpgd.patch | 0 .../patches/external-tinyexr.patch | 23 + .../patches/remove-tinydds-qoi.patch | 446 ++ .../basis_universal/transcoder/basisu.h | 105 +- .../transcoder/basisu_astc_hdr_core.h | 102 + .../transcoder/basisu_astc_helpers.h | 3587 +++++++++++++++++ .../transcoder/basisu_containers.h | 62 +- .../transcoder/basisu_containers_impl.h | 47 +- .../transcoder/basisu_file_headers.h | 5 +- .../transcoder/basisu_transcoder.cpp | 2057 +++++++++- .../transcoder/basisu_transcoder.h | 80 +- .../transcoder/basisu_transcoder_internal.h | 216 +- .../basisu_transcoder_tables_dxt1_5.inc | 2 +- .../basisu_transcoder_tables_dxt1_6.inc | 2 +- .../transcoder/basisu_transcoder_uastc.h | 1 + 62 files changed, 17244 insertions(+), 918 deletions(-) create mode 100644 thirdparty/basis_universal/encoder/3rdparty/android_astc_decomp.cpp create mode 100644 thirdparty/basis_universal/encoder/3rdparty/android_astc_decomp.h create mode 100644 thirdparty/basis_universal/encoder/basisu_astc_hdr_enc.cpp create mode 100644 thirdparty/basis_universal/encoder/basisu_astc_hdr_enc.h rename {modules => thirdparty}/basis_universal/patches/external-jpgd.patch (100%) create mode 100644 thirdparty/basis_universal/patches/external-tinyexr.patch create mode 100644 thirdparty/basis_universal/patches/remove-tinydds-qoi.patch create mode 100644 thirdparty/basis_universal/transcoder/basisu_astc_hdr_core.h create mode 100644 thirdparty/basis_universal/transcoder/basisu_astc_helpers.h diff --git 
a/editor/import/resource_importer_texture.cpp b/editor/import/resource_importer_texture.cpp index a205123df1ba..ff2778a64041 100644 --- a/editor/import/resource_importer_texture.cpp +++ b/editor/import/resource_importer_texture.cpp @@ -593,11 +593,6 @@ Error ResourceImporterTexture::import(const String &p_source_file, const String } } - if (compress_mode == COMPRESS_BASIS_UNIVERSAL && image->get_format() >= Image::FORMAT_RF) { - // Basis universal does not support float formats, fallback. - compress_mode = COMPRESS_VRAM_COMPRESSED; - } - bool detect_3d = int(p_options["detect_3d/compress_to"]) > 0; bool detect_roughness = roughness == 0; bool detect_normal = normal == 0; diff --git a/modules/basis_universal/SCsub b/modules/basis_universal/SCsub index 0142317e1ef4..986c23b7d81e 100644 --- a/modules/basis_universal/SCsub +++ b/modules/basis_universal/SCsub @@ -14,6 +14,8 @@ thirdparty_obj = [] thirdparty_dir = "#thirdparty/basis_universal/" # Sync list with upstream CMakeLists.txt encoder_sources = [ + "3rdparty/android_astc_decomp.cpp", + "basisu_astc_hdr_enc.cpp", "basisu_backend.cpp", "basisu_basis_file.cpp", "basisu_bc7enc.cpp", @@ -45,6 +47,8 @@ else: if env["builtin_zstd"]: env_basisu.Prepend(CPPPATH=["#thirdparty/zstd"]) +env_basisu.Prepend(CPPPATH=["#thirdparty/tinyexr"]) + if env.dev_build: env_basisu.Append(CPPDEFINES=[("BASISU_DEVEL_MESSAGES", 1), ("BASISD_ENABLE_DEBUG_FLAGS", 1)]) diff --git a/modules/basis_universal/image_compress_basisu.cpp b/modules/basis_universal/image_compress_basisu.cpp index ab20d00b5b17..d48ea363a7b5 100644 --- a/modules/basis_universal/image_compress_basisu.cpp +++ b/modules/basis_universal/image_compress_basisu.cpp @@ -30,6 +30,8 @@ #include "image_compress_basisu.h" +#include "core/os/os.h" +#include "core/string/print_string.h" #include "servers/rendering_server.h" #include @@ -46,9 +48,48 @@ void basis_universal_init() { } #ifdef TOOLS_ENABLED +template +inline void _basisu_pad_mipmap(const uint8_t *p_image_mip_data, Vector 
&r_mip_data_padded, int p_next_width, int p_next_height, int p_width, int p_height, int64_t p_size) { + // Source mip's data interpreted as 32-bit RGBA blocks to help with copying pixel data. + const T *mip_src_data = reinterpret_cast(p_image_mip_data); + + // Reserve space in the padded buffer. + r_mip_data_padded.resize(p_next_width * p_next_height * sizeof(T)); + T *data_padded_ptr = reinterpret_cast(r_mip_data_padded.ptrw()); + + // Pad mipmap to the nearest block by smearing. + int x = 0, y = 0; + for (y = 0; y < p_height; y++) { + for (x = 0; x < p_width; x++) { + data_padded_ptr[p_next_width * y + x] = mip_src_data[p_width * y + x]; + } + + // First, smear in x. + for (; x < p_next_width; x++) { + data_padded_ptr[p_next_width * y + x] = data_padded_ptr[p_next_width * y + x - 1]; + } + } + + // Then, smear in y. + for (; y < p_next_height; y++) { + for (x = 0; x < p_next_width; x++) { + data_padded_ptr[p_next_width * y + x] = data_padded_ptr[p_next_width * y + x - p_next_width]; + } + } +} + Vector basis_universal_packer(const Ref &p_image, Image::UsedChannels p_channels) { + uint64_t start_time = OS::get_singleton()->get_ticks_msec(); + Ref image = p_image->duplicate(); - image->convert(Image::FORMAT_RGBA8); + bool is_hdr = false; + + if (image->get_format() <= Image::FORMAT_RGB565) { + image->convert(Image::FORMAT_RGBA8); + } else if (image->get_format() <= Image::FORMAT_RGBE9995) { + image->convert(Image::FORMAT_RGBAF); + is_hdr = true; + } basisu::basis_compressor_params params; @@ -74,32 +115,42 @@ Vector basis_universal_packer(const Ref &p_image, Image::UsedCha basisu::job_pool job_pool(OS::get_singleton()->get_processor_count()); params.m_pJob_pool = &job_pool; - BasisDecompressFormat decompress_format = BASIS_DECOMPRESS_RG; - switch (p_channels) { - case Image::USED_CHANNELS_L: { - decompress_format = BASIS_DECOMPRESS_RGB; - } break; - case Image::USED_CHANNELS_LA: { - params.m_force_alpha = true; - decompress_format = BASIS_DECOMPRESS_RGBA; - } 
break; - case Image::USED_CHANNELS_R: { - decompress_format = BASIS_DECOMPRESS_R; - } break; - case Image::USED_CHANNELS_RG: { - params.m_force_alpha = true; - image->convert_rg_to_ra_rgba8(); - decompress_format = BASIS_DECOMPRESS_RG; - } break; - case Image::USED_CHANNELS_RGB: { - decompress_format = BASIS_DECOMPRESS_RGB; - } break; - case Image::USED_CHANNELS_RGBA: { - params.m_force_alpha = true; - decompress_format = BASIS_DECOMPRESS_RGBA; - } break; + BasisDecompressFormat decompress_format = BASIS_DECOMPRESS_MAX; + + if (is_hdr) { + decompress_format = BASIS_DECOMPRESS_HDR_RGB; + params.m_hdr = true; + params.m_uastc_hdr_options.set_quality_level(0); + + } else { + switch (p_channels) { + case Image::USED_CHANNELS_L: { + decompress_format = BASIS_DECOMPRESS_RGB; + } break; + case Image::USED_CHANNELS_LA: { + params.m_force_alpha = true; + decompress_format = BASIS_DECOMPRESS_RGBA; + } break; + case Image::USED_CHANNELS_R: { + decompress_format = BASIS_DECOMPRESS_R; + } break; + case Image::USED_CHANNELS_RG: { + params.m_force_alpha = true; + image->convert_rg_to_ra_rgba8(); + decompress_format = BASIS_DECOMPRESS_RG; + } break; + case Image::USED_CHANNELS_RGB: { + decompress_format = BASIS_DECOMPRESS_RGB; + } break; + case Image::USED_CHANNELS_RGBA: { + params.m_force_alpha = true; + decompress_format = BASIS_DECOMPRESS_RGBA; + } break; + } } + ERR_FAIL_COND_V(decompress_format == BASIS_DECOMPRESS_MAX, Vector()); + // Copy the source image data with mipmaps into BasisU. { const int orig_width = image->get_width(); @@ -113,9 +164,10 @@ Vector basis_universal_packer(const Ref &p_image, Image::UsedCha Vector image_data = image->get_data(); basisu::vector basisu_mipmaps; + basisu::vector basisu_mipmaps_hdr; // Buffer for storing padded mipmap data. 
- Vector mip_data_padded; + Vector mip_data_padded; for (int32_t i = 0; i <= image->get_mipmap_count(); i++) { int64_t ofs, size; @@ -126,31 +178,10 @@ Vector basis_universal_packer(const Ref &p_image, Image::UsedCha // Pad the mipmap's data if its resolution isn't divisible by 4. if (image->has_mipmaps() && !is_res_div_4 && (width > 2 && height > 2) && (width != next_width || height != next_height)) { - // Source mip's data interpreted as 32-bit RGBA blocks to help with copying pixel data. - const uint32_t *mip_src_data = reinterpret_cast(image_mip_data); - - // Reserve space in the padded buffer. - mip_data_padded.resize(next_width * next_height); - uint32_t *data_padded_ptr = mip_data_padded.ptrw(); - - // Pad mipmap to the nearest block by smearing. - int x = 0, y = 0; - for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) { - data_padded_ptr[next_width * y + x] = mip_src_data[width * y + x]; - } - - // First, smear in x. - for (; x < next_width; x++) { - data_padded_ptr[next_width * y + x] = data_padded_ptr[next_width * y + x - 1]; - } - } - - // Then, smear in y. - for (; y < next_height; y++) { - for (x = 0; x < next_width; x++) { - data_padded_ptr[next_width * y + x] = data_padded_ptr[next_width * y + x - next_width]; - } + if (is_hdr) { + _basisu_pad_mipmap(image_mip_data, mip_data_padded, next_width, next_height, width, height, size); + } else { + _basisu_pad_mipmap(image_mip_data, mip_data_padded, next_width, next_height, width, height, size); } // Override the image_mip_data pointer with our temporary Vector. @@ -159,7 +190,7 @@ Vector basis_universal_packer(const Ref &p_image, Image::UsedCha // Override the mipmap's properties. width = next_width; height = next_height; - size = mip_data_padded.size() * 4; + size = mip_data_padded.size(); } // Get the next mipmap's resolution. @@ -167,44 +198,61 @@ Vector basis_universal_packer(const Ref &p_image, Image::UsedCha next_height /= 2; // Copy the source mipmap's data to a BasisU image. 
- basisu::image basisu_image(width, height); - memcpy(basisu_image.get_ptr(), image_mip_data, size); + if (is_hdr) { + basisu::imagef basisu_image(width, height); + memcpy(reinterpret_cast(basisu_image.get_ptr()), image_mip_data, size); + + if (i == 0) { + params.m_source_images_hdr.push_back(basisu_image); + } else { + basisu_mipmaps_hdr.push_back(basisu_image); + } - if (i == 0) { - params.m_source_images.push_back(basisu_image); } else { - basisu_mipmaps.push_back(basisu_image); + basisu::image basisu_image(width, height); + memcpy(basisu_image.get_ptr(), image_mip_data, size); + + if (i == 0) { + params.m_source_images.push_back(basisu_image); + } else { + basisu_mipmaps.push_back(basisu_image); + } } } - params.m_source_mipmap_images.push_back(basisu_mipmaps); + if (is_hdr) { + params.m_source_mipmap_images_hdr.push_back(basisu_mipmaps_hdr); + } else { + params.m_source_mipmap_images.push_back(basisu_mipmaps); + } } // Encode the image data. - Vector basisu_data; - basisu::basis_compressor compressor; compressor.init(params); int basisu_err = compressor.process(); - ERR_FAIL_COND_V(basisu_err != basisu::basis_compressor::cECSuccess, basisu_data); + ERR_FAIL_COND_V(basisu_err != basisu::basis_compressor::cECSuccess, Vector()); - const basisu::uint8_vec &basisu_out = compressor.get_output_basis_file(); - basisu_data.resize(basisu_out.size() + 4); + const basisu::uint8_vec &basisu_encoded = compressor.get_output_basis_file(); - // Copy the encoded data to the buffer. - { - uint8_t *wb = basisu_data.ptrw(); - *(uint32_t *)wb = decompress_format; + Vector basisu_data; + basisu_data.resize(basisu_encoded.size() + 4); + uint8_t *basisu_data_ptr = basisu_data.ptrw(); - memcpy(wb + 4, basisu_out.get_ptr(), basisu_out.size()); - } + // Copy the encoded BasisU data into the output buffer. 
+ *(uint32_t *)basisu_data_ptr = decompress_format; + memcpy(basisu_data_ptr + 4, basisu_encoded.get_ptr(), basisu_encoded.size()); + + print_verbose(vformat("BasisU: Encoding a %dx%d image with %d mipmaps took %d ms.", p_image->get_width(), p_image->get_height(), p_image->get_mipmap_count(), OS::get_singleton()->get_ticks_msec() - start_time)); return basisu_data; } #endif // TOOLS_ENABLED Ref basis_universal_unpacker_ptr(const uint8_t *p_data, int p_size) { + uint64_t start_time = OS::get_singleton()->get_ticks_msec(); + Ref image; ERR_FAIL_NULL_V_MSG(p_data, image, "Cannot unpack invalid BasisUniversal data."); @@ -320,6 +368,23 @@ Ref basis_universal_unpacker_ptr(const uint8_t *p_data, int p_size) { } } break; + case BASIS_DECOMPRESS_HDR_RGB: { + if (bptc_supported) { + basisu_format = basist::transcoder_texture_format::cTFBC6H; + image_format = Image::FORMAT_BPTC_RGBFU; + } else if (astc_supported) { + basisu_format = basist::transcoder_texture_format::cTFASTC_HDR_4x4_RGBA; + image_format = Image::FORMAT_ASTC_4x4_HDR; + } else { + // No supported VRAM compression formats, decompress. 
+ basisu_format = basist::transcoder_texture_format::cTFRGB_9E5; + image_format = Image::FORMAT_RGBE9995; + } + + } break; + default: { + ERR_FAIL_V(image); + } break; } src_ptr += 4; @@ -371,6 +436,9 @@ Ref basis_universal_unpacker_ptr(const uint8_t *p_data, int p_size) { } } + print_verbose(vformat("BasisU: Transcoding a %dx%d image with %d mipmaps into %s took %d ms.", + image->get_width(), image->get_height(), image->get_mipmap_count(), Image::get_format_name(image_format), OS::get_singleton()->get_ticks_msec() - start_time)); + return image; } diff --git a/modules/basis_universal/image_compress_basisu.h b/modules/basis_universal/image_compress_basisu.h index 5e36d448f670..81c8511f603c 100644 --- a/modules/basis_universal/image_compress_basisu.h +++ b/modules/basis_universal/image_compress_basisu.h @@ -39,11 +39,20 @@ enum BasisDecompressFormat { BASIS_DECOMPRESS_RGBA, BASIS_DECOMPRESS_RG_AS_RA, BASIS_DECOMPRESS_R, + BASIS_DECOMPRESS_HDR_RGB, + BASIS_DECOMPRESS_MAX }; void basis_universal_init(); #ifdef TOOLS_ENABLED +struct BasisRGBAF { + uint32_t r; + uint32_t g; + uint32_t b; + uint32_t a; +}; + Vector basis_universal_packer(const Ref &p_image, Image::UsedChannels p_channels); #endif diff --git a/thirdparty/README.md b/thirdparty/README.md index 58226261f4b5..c9a0cefb051d 100644 --- a/thirdparty/README.md +++ b/thirdparty/README.md @@ -59,7 +59,7 @@ Files extracted from upstream source: ## basis_universal - Upstream: https://github.com/BinomialLLC/basis_universal -- Version: 1.16.4 (900e40fb5d2502927360fe2f31762bdbb624455f, 2023) +- Version: 1.50.0 (051ad6d8a64bb95a79e8601c317055fd1782ad3e, 2024) - License: Apache 2.0 Files extracted from upstream source: diff --git a/thirdparty/basis_universal/encoder/3rdparty/android_astc_decomp.cpp b/thirdparty/basis_universal/encoder/3rdparty/android_astc_decomp.cpp new file mode 100644 index 000000000000..5abfe2faf922 --- /dev/null +++ b/thirdparty/basis_universal/encoder/3rdparty/android_astc_decomp.cpp @@ -0,0 +1,2052 
@@ +// File: android_astc_decomp.cpp + +/*------------------------------------------------------------------------- + * drawElements Quality Program Tester Core + * ---------------------------------------- + * + * Copyright 2016 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * rg: Removed external dependencies, minor fix to decompress() so it converts non-sRGB + * output to 8-bits correctly. I've compared this decoder's output + * vs. astc-codec with random inputs. + * + *//*! + * \file + * \brief ASTC Utilities. + *//*--------------------------------------------------------------------*/ +#include "android_astc_decomp.h" +#include +#include +#include +#include + +#define DE_LENGTH_OF_ARRAY(x) (sizeof(x)/sizeof(x[0])) +#define DE_UNREF(x) (void)x + +typedef uint8_t deUint8; +typedef int8_t deInt8; +typedef uint32_t deUint32; +typedef int32_t deInt32; +typedef uint16_t deUint16; +typedef int16_t deInt16; +typedef int64_t deInt64; +typedef uint64_t deUint64; + +#define DE_ASSERT assert + +#ifdef _MSC_VER +#pragma warning (disable:4505) // unreferenced local function has been removed +#elif defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-function" +#endif + +namespace basisu_astc +{ + template inline S maximum(S a, S b) { return (a > b) ? 
a : b; } + template inline S maximum(S a, S b, S c) { return maximum(maximum(a, b), c); } + template inline S maximum(S a, S b, S c, S d) { return maximum(maximum(maximum(a, b), c), d); } + + static bool inBounds(int v, int l, int h) + { + return (v >= l) && (v < h); + } + + static bool inRange(int v, int l, int h) + { + return (v >= l) && (v <= h); + } + + template + static inline T max(T a, T b) + { + return (a > b) ? a : b; + } + + template + static inline T min(T a, T b) + { + return (a < b) ? a : b; + } + + template + static inline T clamp(T a, T l, T h) + { + if (a < l) + return l; + else if (a > h) + return h; + return a; + } + + struct UVec4 + { + uint32_t m_c[4]; + + UVec4() + { + m_c[0] = 0; + m_c[1] = 0; + m_c[2] = 0; + m_c[3] = 0; + } + + UVec4(uint32_t x, uint32_t y, uint32_t z, uint32_t w) + { + m_c[0] = x; + m_c[1] = y; + m_c[2] = z; + m_c[3] = w; + } + + uint32_t x() const { return m_c[0]; } + uint32_t y() const { return m_c[1]; } + uint32_t z() const { return m_c[2]; } + uint32_t w() const { return m_c[3]; } + + uint32_t& x() { return m_c[0]; } + uint32_t& y() { return m_c[1]; } + uint32_t& z() { return m_c[2]; } + uint32_t& w() { return m_c[3]; } + + uint32_t operator[] (uint32_t idx) const { assert(idx < 4); return m_c[idx]; } + uint32_t& operator[] (uint32_t idx) { assert(idx < 4); return m_c[idx]; } + }; + + struct IVec4 + { + int32_t m_c[4]; + + IVec4() + { + m_c[0] = 0; + m_c[1] = 0; + m_c[2] = 0; + m_c[3] = 0; + } + + IVec4(int32_t x, int32_t y, int32_t z, int32_t w) + { + m_c[0] = x; + m_c[1] = y; + m_c[2] = z; + m_c[3] = w; + } + + int32_t x() const { return m_c[0]; } + int32_t y() const { return m_c[1]; } + int32_t z() const { return m_c[2]; } + int32_t w() const { return m_c[3]; } + + int32_t& x() { return m_c[0]; } + int32_t& y() { return m_c[1]; } + int32_t& z() { return m_c[2]; } + int32_t& w() { return m_c[3]; } + + UVec4 asUint() const + { + return UVec4(maximum(0, m_c[0]), maximum(0, m_c[1]), maximum(0, m_c[2]), maximum(0, 
m_c[3])); + } + + int32_t operator[] (uint32_t idx) const { assert(idx < 4); return m_c[idx]; } + int32_t& operator[] (uint32_t idx) { assert(idx < 4); return m_c[idx]; } + }; + + struct IVec3 + { + int32_t m_c[3]; + + IVec3() + { + m_c[0] = 0; + m_c[1] = 0; + m_c[2] = 0; + } + + IVec3(int32_t x, int32_t y, int32_t z) + { + m_c[0] = x; + m_c[1] = y; + m_c[2] = z; + } + + int32_t x() const { return m_c[0]; } + int32_t y() const { return m_c[1]; } + int32_t z() const { return m_c[2]; } + + int32_t& x() { return m_c[0]; } + int32_t& y() { return m_c[1]; } + int32_t& z() { return m_c[2]; } + + int32_t operator[] (uint32_t idx) const { assert(idx < 3); return m_c[idx]; } + int32_t& operator[] (uint32_t idx) { assert(idx < 3); return m_c[idx]; } + }; + + static uint32_t deDivRoundUp32(uint32_t a, uint32_t b) + { + return (a + b - 1) / b; + } + + static bool deInBounds32(uint32_t v, uint32_t l, uint32_t h) + { + return (v >= l) && (v < h); + } + +namespace astc +{ + +using std::vector; + +namespace +{ + +// Common utilities +enum +{ + MAX_BLOCK_WIDTH = 12, + MAX_BLOCK_HEIGHT = 12 +}; + +inline deUint32 getBit (deUint32 src, int ndx) +{ + DE_ASSERT(basisu_astc::inBounds(ndx, 0, 32)); + return (src >> ndx) & 1; +} + +inline deUint32 getBits (deUint32 src, int low, int high) +{ + const int numBits = (high-low) + 1; + DE_ASSERT(basisu_astc::inRange(numBits, 1, 32)); + + if (numBits < 32) + return (deUint32)((src >> low) & ((1u<> low) & 0xFFFFFFFFu); +} + +inline bool isBitSet (deUint32 src, int ndx) +{ + return getBit(src, ndx) != 0; +} + +inline deUint32 reverseBits (deUint32 src, int numBits) +{ + DE_ASSERT(basisu_astc::inRange(numBits, 0, 32)); + + deUint32 result = 0; + for (int i = 0; i < numBits; i++) + result |= ((src >> i) & 1) << (numBits-1-i); + + return result; +} + +inline deUint32 bitReplicationScale (deUint32 src, int numSrcBits, int numDstBits) +{ + DE_ASSERT(numSrcBits <= numDstBits); + DE_ASSERT((src & ((1< -numSrcBits; shift -= numSrcBits) + dst |= (shift >= 
0) ? (src << shift) : (src >> -shift); + + return dst; +} + +inline deInt32 signExtend (deInt32 src, int numSrcBits) +{ + DE_ASSERT(basisu_astc::inRange(numSrcBits, 2, 31)); + + const bool negative = (src & (1 << (numSrcBits-1))) != 0; + return src | (negative ? ~((1 << numSrcBits) - 1) : 0); +} + +typedef uint16_t deFloat16; + +inline bool isFloat16InfOrNan (deFloat16 v) +{ + return getBits(v, 10, 14) == 31; +} + +float deFloat16To32(deFloat16 val16) +{ + deUint32 sign; + deUint32 expotent; + deUint32 mantissa; + + union + { + float f; + deUint32 u; + } x; + + x.u = 0u; + + sign = ((deUint32)val16 >> 15u) & 0x00000001u; + expotent = ((deUint32)val16 >> 10u) & 0x0000001fu; + mantissa = (deUint32)val16 & 0x000003ffu; + + if (expotent == 0u) + { + if (mantissa == 0u) + { + /* +/- 0 */ + x.u = sign << 31u; + return x.f; + } + else + { + /* Denormalized, normalize it. */ + + while (!(mantissa & 0x00000400u)) + { + mantissa <<= 1u; + expotent -= 1u; + } + + expotent += 1u; + mantissa &= ~0x00000400u; + } + } + else if (expotent == 31u) + { + if (mantissa == 0u) + { + /* +/- InF */ + x.u = (sign << 31u) | 0x7f800000u; + return x.f; + } + else + { + /* +/- NaN */ + x.u = (sign << 31u) | 0x7f800000u | (mantissa << 13u); + return x.f; + } + } + + expotent = expotent + (127u - 15u); + mantissa = mantissa << 13u; + + x.u = (sign << 31u) | (expotent << 23u) | mantissa; + return x.f; +} + +enum ISEMode +{ + ISEMODE_TRIT = 0, + ISEMODE_QUINT, + ISEMODE_PLAIN_BIT, + ISEMODE_LAST +}; + +struct ISEParams +{ + ISEMode mode; + int numBits; + ISEParams (ISEMode mode_, int numBits_) : mode(mode_), numBits(numBits_) {} +}; + +inline int computeNumRequiredBits (const ISEParams& iseParams, int numValues) +{ + switch (iseParams.mode) + { + case ISEMODE_TRIT: return deDivRoundUp32(numValues*8, 5) + numValues*iseParams.numBits; + case ISEMODE_QUINT: return deDivRoundUp32(numValues*7, 3) + numValues*iseParams.numBits; + case ISEMODE_PLAIN_BIT: return numValues*iseParams.numBits; + default: + 
DE_ASSERT(false); + return -1; + } +} + +ISEParams computeMaximumRangeISEParams (int numAvailableBits, int numValuesInSequence) +{ + int curBitsForTritMode = 6; + int curBitsForQuintMode = 5; + int curBitsForPlainBitMode = 8; + + while (true) + { + DE_ASSERT(curBitsForTritMode > 0 || curBitsForQuintMode > 0 || curBitsForPlainBitMode > 0); + const int tritRange = (curBitsForTritMode > 0) ? (3 << curBitsForTritMode) - 1 : -1; + const int quintRange = (curBitsForQuintMode > 0) ? (5 << curBitsForQuintMode) - 1 : -1; + const int plainBitRange = (curBitsForPlainBitMode > 0) ? (1 << curBitsForPlainBitMode) - 1 : -1; + const int maxRange = basisu_astc::max(basisu_astc::max(tritRange, quintRange), plainBitRange); + + if (maxRange == tritRange) + { + const ISEParams params(ISEMODE_TRIT, curBitsForTritMode); + + if (computeNumRequiredBits(params, numValuesInSequence) <= numAvailableBits) + return ISEParams(ISEMODE_TRIT, curBitsForTritMode); + + curBitsForTritMode--; + } + else if (maxRange == quintRange) + { + const ISEParams params(ISEMODE_QUINT, curBitsForQuintMode); + + if (computeNumRequiredBits(params, numValuesInSequence) <= numAvailableBits) + return ISEParams(ISEMODE_QUINT, curBitsForQuintMode); + + curBitsForQuintMode--; + } + else + { + const ISEParams params(ISEMODE_PLAIN_BIT, curBitsForPlainBitMode); + DE_ASSERT(maxRange == plainBitRange); + + if (computeNumRequiredBits(params, numValuesInSequence) <= numAvailableBits) + return ISEParams(ISEMODE_PLAIN_BIT, curBitsForPlainBitMode); + + curBitsForPlainBitMode--; + } + } +} + +inline int computeNumColorEndpointValues (deUint32 endpointMode) +{ + DE_ASSERT(endpointMode < 16); + return (endpointMode/4 + 1) * 2; +} + +// Decompression utilities +enum DecompressResult +{ + DECOMPRESS_RESULT_VALID_BLOCK = 0, //!< Decompressed valid block + DECOMPRESS_RESULT_ERROR, //!< Encountered error while decompressing, error color written + DECOMPRESS_RESULT_LAST +}; + +// A helper for getting bits from a 128-bit block. 
+class Block128 +{ +private: + typedef deUint64 Word; + + enum + { + WORD_BYTES = sizeof(Word), + WORD_BITS = 8*WORD_BYTES, + NUM_WORDS = 128 / WORD_BITS + }; + //DE_STATIC_ASSERT(128 % WORD_BITS == 0); + +public: + Block128 (const deUint8* src) + { + for (int wordNdx = 0; wordNdx < NUM_WORDS; wordNdx++) + { + m_words[wordNdx] = 0; + for (int byteNdx = 0; byteNdx < WORD_BYTES; byteNdx++) + m_words[wordNdx] |= (Word)src[wordNdx*WORD_BYTES + byteNdx] << (8*byteNdx); + } + } + + deUint32 getBit (int ndx) const + { + DE_ASSERT(basisu_astc::inBounds(ndx, 0, 128)); + return (m_words[ndx / WORD_BITS] >> (ndx % WORD_BITS)) & 1; + } + + deUint32 getBits (int low, int high) const + { + DE_ASSERT(basisu_astc::inBounds(low, 0, 128)); + DE_ASSERT(basisu_astc::inBounds(high, 0, 128)); + DE_ASSERT(basisu_astc::inRange(high-low+1, 0, 32)); + + if (high-low+1 == 0) + return 0; + + const int word0Ndx = low / WORD_BITS; + const int word1Ndx = high / WORD_BITS; + // \note "foo << bar << 1" done instead of "foo << (bar+1)" to avoid overflow, i.e. shift amount being too big. + if (word0Ndx == word1Ndx) + return (deUint32)((m_words[word0Ndx] & ((((Word)1 << high%WORD_BITS << 1) - 1))) >> ((Word)low % WORD_BITS)); + else + { + DE_ASSERT(word1Ndx == word0Ndx + 1); + return (deUint32)(m_words[word0Ndx] >> (low%WORD_BITS)) | + (deUint32)((m_words[word1Ndx] & (((Word)1 << high%WORD_BITS << 1) - 1)) << (high-low - high%WORD_BITS)); + } + } + + bool isBitSet (int ndx) const + { + DE_ASSERT(basisu_astc::inBounds(ndx, 0, 128)); + return getBit(ndx) != 0; + } + +private: + Word m_words[NUM_WORDS]; +}; + +// A helper for sequential access into a Block128. +class BitAccessStream +{ +public: + BitAccessStream (const Block128& src, int startNdxInSrc, int length, bool forward) + : m_src (src) + , m_startNdxInSrc (startNdxInSrc) + , m_length (length) + , m_forward (forward) + , m_ndx (0) + { + } + + // Get the next num bits. Bits at positions greater than or equal to m_length are zeros. 
+ deUint32 getNext (int num) + { + if (num == 0 || m_ndx >= m_length) + return 0; + const int end = m_ndx + num; + const int numBitsFromSrc = basisu_astc::max(0, basisu_astc::min(m_length, end) - m_ndx); + const int low = m_ndx; + const int high = m_ndx + numBitsFromSrc - 1; + + m_ndx += num; + + return m_forward ? m_src.getBits(m_startNdxInSrc + low, m_startNdxInSrc + high) + : reverseBits(m_src.getBits(m_startNdxInSrc - high, m_startNdxInSrc - low), numBitsFromSrc); + } + +private: + const Block128& m_src; + const int m_startNdxInSrc; + const int m_length; + const bool m_forward; + int m_ndx; +}; + +struct ISEDecodedResult +{ + deUint32 m; + deUint32 tq; //!< Trit or quint value, depending on ISE mode. + deUint32 v; +}; + +// Data from an ASTC block's "block mode" part (i.e. bits [0,10]). +struct ASTCBlockMode +{ + bool isError; + // \note Following fields only relevant if !isError. + bool isVoidExtent; + // \note Following fields only relevant if !isVoidExtent. + bool isDualPlane; + int weightGridWidth; + int weightGridHeight; + ISEParams weightISEParams; + + ASTCBlockMode (void) + : isError (true) + , isVoidExtent (true) + , isDualPlane (true) + , weightGridWidth (-1) + , weightGridHeight (-1) + , weightISEParams (ISEMODE_LAST, -1) + { + } +}; + +inline int computeNumWeights (const ASTCBlockMode& mode) +{ + return mode.weightGridWidth * mode.weightGridHeight * (mode.isDualPlane ? 2 : 1); +} + +struct ColorEndpointPair +{ + UVec4 e0; + UVec4 e1; +}; + +struct TexelWeightPair +{ + deUint32 w[2]; +}; + +ASTCBlockMode getASTCBlockMode (deUint32 blockModeData) +{ + ASTCBlockMode blockMode; + blockMode.isError = true; // \note Set to false later, if not error. + blockMode.isVoidExtent = getBits(blockModeData, 0, 8) == 0x1fc; + if (!blockMode.isVoidExtent) + { + if ((getBits(blockModeData, 0, 1) == 0 && getBits(blockModeData, 6, 8) == 7) || getBits(blockModeData, 0, 3) == 0) + return blockMode; // Invalid ("reserved"). 
+ + deUint32 r = (deUint32)-1; // \note Set in the following branches. + + if (getBits(blockModeData, 0, 1) == 0) + { + const deUint32 r0 = getBit(blockModeData, 4); + const deUint32 r1 = getBit(blockModeData, 2); + const deUint32 r2 = getBit(blockModeData, 3); + const deUint32 i78 = getBits(blockModeData, 7, 8); + + r = (r2 << 2) | (r1 << 1) | (r0 << 0); + + if (i78 == 3) + { + const bool i5 = isBitSet(blockModeData, 5); + blockMode.weightGridWidth = i5 ? 10 : 6; + blockMode.weightGridHeight = i5 ? 6 : 10; + } + else + { + const deUint32 a = getBits(blockModeData, 5, 6); + + switch (i78) + { + case 0: blockMode.weightGridWidth = 12; blockMode.weightGridHeight = a + 2; break; + case 1: blockMode.weightGridWidth = a + 2; blockMode.weightGridHeight = 12; break; + case 2: blockMode.weightGridWidth = a + 6; blockMode.weightGridHeight = getBits(blockModeData, 9, 10) + 6; break; + default: DE_ASSERT(false); + } + } + } + else + { + const deUint32 r0 = getBit(blockModeData, 4); + const deUint32 r1 = getBit(blockModeData, 0); + const deUint32 r2 = getBit(blockModeData, 1); + const deUint32 i23 = getBits(blockModeData, 2, 3); + const deUint32 a = getBits(blockModeData, 5, 6); + + r = (r2 << 2) | (r1 << 1) | (r0 << 0); + if (i23 == 3) + { + const deUint32 b = getBit(blockModeData, 7); + const bool i8 = isBitSet(blockModeData, 8); + blockMode.weightGridWidth = i8 ? b+2 : a+2; + blockMode.weightGridHeight = i8 ? a+2 : b+6; + } + else + { + const deUint32 b = getBits(blockModeData, 7, 8); + switch (i23) + { + case 0: blockMode.weightGridWidth = b + 4; blockMode.weightGridHeight = a + 2; break; + case 1: blockMode.weightGridWidth = b + 8; blockMode.weightGridHeight = a + 2; break; + case 2: blockMode.weightGridWidth = a + 2; blockMode.weightGridHeight = b + 8; break; + default: DE_ASSERT(false); + } + } + } + + const bool zeroDH = getBits(blockModeData, 0, 1) == 0 && getBits(blockModeData, 7, 8) == 2; + const bool h = zeroDH ? 
0 : isBitSet(blockModeData, 9); + blockMode.isDualPlane = zeroDH ? 0 : isBitSet(blockModeData, 10); + + { + ISEMode& m = blockMode.weightISEParams.mode; + int& b = blockMode.weightISEParams.numBits; + m = ISEMODE_PLAIN_BIT; + b = 0; + if (h) + { + switch (r) + { + case 2: m = ISEMODE_QUINT; b = 1; break; + case 3: m = ISEMODE_TRIT; b = 2; break; + case 4: b = 4; break; + case 5: m = ISEMODE_QUINT; b = 2; break; + case 6: m = ISEMODE_TRIT; b = 3; break; + case 7: b = 5; break; + default: DE_ASSERT(false); + } + } + else + { + switch (r) + { + case 2: b = 1; break; + case 3: m = ISEMODE_TRIT; break; + case 4: b = 2; break; + case 5: m = ISEMODE_QUINT; break; + case 6: m = ISEMODE_TRIT; b = 1; break; + case 7: b = 3; break; + default: DE_ASSERT(false); + } + } + } + } + + blockMode.isError = false; + return blockMode; +} + +inline void setASTCErrorColorBlock (void* dst, int blockWidth, int blockHeight, bool isSRGB) +{ + if (isSRGB) + { + deUint8* const dstU = (deUint8*)dst; + for (int i = 0; i < blockWidth*blockHeight; i++) + { + dstU[4*i + 0] = 0xff; + dstU[4*i + 1] = 0; + dstU[4*i + 2] = 0xff; + dstU[4*i + 3] = 0xff; + } + } + else + { + float* const dstF = (float*)dst; + for (int i = 0; i < blockWidth*blockHeight; i++) + { + dstF[4*i + 0] = 1.0f; + dstF[4*i + 1] = 0.0f; + dstF[4*i + 2] = 1.0f; + dstF[4*i + 3] = 1.0f; + } + } +} + +DecompressResult decodeVoidExtentBlock (void* dst, const Block128& blockData, int blockWidth, int blockHeight, bool isSRGB, bool isLDRMode) +{ + const deUint32 minSExtent = blockData.getBits(12, 24); + const deUint32 maxSExtent = blockData.getBits(25, 37); + const deUint32 minTExtent = blockData.getBits(38, 50); + const deUint32 maxTExtent = blockData.getBits(51, 63); + const bool allExtentsAllOnes = (minSExtent == 0x1fff) && (maxSExtent == 0x1fff) && (minTExtent == 0x1fff) && (maxTExtent == 0x1fff); + const bool isHDRBlock = blockData.isBitSet(9); + + if ((isLDRMode && isHDRBlock) || (!allExtentsAllOnes && (minSExtent >= maxSExtent || 
minTExtent >= maxTExtent))) + { + setASTCErrorColorBlock(dst, blockWidth, blockHeight, isSRGB); + return DECOMPRESS_RESULT_ERROR; + } + + const deUint32 rgba[4] = + { + blockData.getBits(64, 79), + blockData.getBits(80, 95), + blockData.getBits(96, 111), + blockData.getBits(112, 127) + }; + + if (isSRGB) + { + deUint8* const dstU = (deUint8*)dst; + for (int i = 0; i < blockWidth * blockHeight; i++) + { + for (int c = 0; c < 4; c++) + dstU[i * 4 + c] = (deUint8)((rgba[c] & 0xff00) >> 8); + } + } + else + { + float* const dstF = (float*)dst; + + if (isHDRBlock) + { + for (int c = 0; c < 4; c++) + { + if (isFloat16InfOrNan((deFloat16)rgba[c])) + { + //throw InternalError("Infinity or NaN color component in HDR void extent block in ASTC texture (behavior undefined by ASTC specification)"); + setASTCErrorColorBlock(dst, blockWidth, blockHeight, isSRGB); + return DECOMPRESS_RESULT_ERROR; + } + } + + for (int i = 0; i < blockWidth * blockHeight; i++) + { + for (int c = 0; c < 4; c++) + dstF[i * 4 + c] = deFloat16To32((deFloat16)rgba[c]); + } + } + else + { + for (int i = 0; i < blockWidth * blockHeight; i++) + { + for (int c = 0; c < 4; c++) + dstF[i * 4 + c] = (rgba[c] == 65535) ? 1.0f : ((float)rgba[c] / 65536.0f); + } + } + } + + return DECOMPRESS_RESULT_VALID_BLOCK; +} + +void decodeColorEndpointModes (deUint32* endpointModesDst, const Block128& blockData, int numPartitions, int extraCemBitsStart) +{ + if (numPartitions == 1) + endpointModesDst[0] = blockData.getBits(13, 16); + else + { + const deUint32 highLevelSelector = blockData.getBits(23, 24); + + if (highLevelSelector == 0) + { + const deUint32 mode = blockData.getBits(25, 28); + + for (int i = 0; i < numPartitions; i++) + endpointModesDst[i] = mode; + } + else + { + for (int partNdx = 0; partNdx < numPartitions; partNdx++) + { + const deUint32 cemClass = highLevelSelector - (blockData.isBitSet(25 + partNdx) ? 
0 : 1); + const deUint32 lowBit0Ndx = numPartitions + 2*partNdx; + const deUint32 lowBit1Ndx = numPartitions + 2*partNdx + 1; + const deUint32 lowBit0 = blockData.getBit(lowBit0Ndx < 4 ? 25+lowBit0Ndx : extraCemBitsStart+lowBit0Ndx-4); + const deUint32 lowBit1 = blockData.getBit(lowBit1Ndx < 4 ? 25+lowBit1Ndx : extraCemBitsStart+lowBit1Ndx-4); + + endpointModesDst[partNdx] = (cemClass << 2) | (lowBit1 << 1) | lowBit0; + } + } + } +} + +int computeNumColorEndpointValues (const deUint32* endpointModes, int numPartitions) +{ + int result = 0; + + for (int i = 0; i < numPartitions; i++) + result += computeNumColorEndpointValues(endpointModes[i]); + + return result; +} + +void decodeISETritBlock (ISEDecodedResult* dst, int numValues, BitAccessStream& data, int numBits) +{ + DE_ASSERT(basisu_astc::inRange(numValues, 1, 5)); + + deUint32 m[5]; + m[0] = data.getNext(numBits); + deUint32 T01 = data.getNext(2); + m[1] = data.getNext(numBits); + deUint32 T23 = data.getNext(2); + m[2] = data.getNext(numBits); + deUint32 T4 = data.getNext(1); + m[3] = data.getNext(numBits); + deUint32 T56 = data.getNext(2); + m[4] = data.getNext(numBits); + deUint32 T7 = data.getNext(1); + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wimplicit-fallthrough=" +#endif + switch (numValues) + { + // \note Fall-throughs. 
+ case 1: T23 = 0; + case 2: T4 = 0; + case 3: T56 = 0; + case 4: T7 = 0; + case 5: break; + default: + DE_ASSERT(false); + } +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + + const deUint32 T = (T7 << 7) | (T56 << 5) | (T4 << 4) | (T23 << 2) | (T01 << 0); + + static const deUint32 tritsFromT[256][5] = + { + { 0,0,0,0,0 }, { 1,0,0,0,0 }, { 2,0,0,0,0 }, { 0,0,2,0,0 }, { 0,1,0,0,0 }, { 1,1,0,0,0 }, { 2,1,0,0,0 }, { 1,0,2,0,0 }, { 0,2,0,0,0 }, { 1,2,0,0,0 }, { 2,2,0,0,0 }, { 2,0,2,0,0 }, { 0,2,2,0,0 }, { 1,2,2,0,0 }, { 2,2,2,0,0 }, { 2,0,2,0,0 }, + { 0,0,1,0,0 }, { 1,0,1,0,0 }, { 2,0,1,0,0 }, { 0,1,2,0,0 }, { 0,1,1,0,0 }, { 1,1,1,0,0 }, { 2,1,1,0,0 }, { 1,1,2,0,0 }, { 0,2,1,0,0 }, { 1,2,1,0,0 }, { 2,2,1,0,0 }, { 2,1,2,0,0 }, { 0,0,0,2,2 }, { 1,0,0,2,2 }, { 2,0,0,2,2 }, { 0,0,2,2,2 }, + { 0,0,0,1,0 }, { 1,0,0,1,0 }, { 2,0,0,1,0 }, { 0,0,2,1,0 }, { 0,1,0,1,0 }, { 1,1,0,1,0 }, { 2,1,0,1,0 }, { 1,0,2,1,0 }, { 0,2,0,1,0 }, { 1,2,0,1,0 }, { 2,2,0,1,0 }, { 2,0,2,1,0 }, { 0,2,2,1,0 }, { 1,2,2,1,0 }, { 2,2,2,1,0 }, { 2,0,2,1,0 }, + { 0,0,1,1,0 }, { 1,0,1,1,0 }, { 2,0,1,1,0 }, { 0,1,2,1,0 }, { 0,1,1,1,0 }, { 1,1,1,1,0 }, { 2,1,1,1,0 }, { 1,1,2,1,0 }, { 0,2,1,1,0 }, { 1,2,1,1,0 }, { 2,2,1,1,0 }, { 2,1,2,1,0 }, { 0,1,0,2,2 }, { 1,1,0,2,2 }, { 2,1,0,2,2 }, { 1,0,2,2,2 }, + { 0,0,0,2,0 }, { 1,0,0,2,0 }, { 2,0,0,2,0 }, { 0,0,2,2,0 }, { 0,1,0,2,0 }, { 1,1,0,2,0 }, { 2,1,0,2,0 }, { 1,0,2,2,0 }, { 0,2,0,2,0 }, { 1,2,0,2,0 }, { 2,2,0,2,0 }, { 2,0,2,2,0 }, { 0,2,2,2,0 }, { 1,2,2,2,0 }, { 2,2,2,2,0 }, { 2,0,2,2,0 }, + { 0,0,1,2,0 }, { 1,0,1,2,0 }, { 2,0,1,2,0 }, { 0,1,2,2,0 }, { 0,1,1,2,0 }, { 1,1,1,2,0 }, { 2,1,1,2,0 }, { 1,1,2,2,0 }, { 0,2,1,2,0 }, { 1,2,1,2,0 }, { 2,2,1,2,0 }, { 2,1,2,2,0 }, { 0,2,0,2,2 }, { 1,2,0,2,2 }, { 2,2,0,2,2 }, { 2,0,2,2,2 }, + { 0,0,0,0,2 }, { 1,0,0,0,2 }, { 2,0,0,0,2 }, { 0,0,2,0,2 }, { 0,1,0,0,2 }, { 1,1,0,0,2 }, { 2,1,0,0,2 }, { 1,0,2,0,2 }, { 0,2,0,0,2 }, { 1,2,0,0,2 }, { 2,2,0,0,2 }, { 2,0,2,0,2 }, { 0,2,2,0,2 }, { 1,2,2,0,2 }, { 2,2,2,0,2 }, { 
2,0,2,0,2 }, + { 0,0,1,0,2 }, { 1,0,1,0,2 }, { 2,0,1,0,2 }, { 0,1,2,0,2 }, { 0,1,1,0,2 }, { 1,1,1,0,2 }, { 2,1,1,0,2 }, { 1,1,2,0,2 }, { 0,2,1,0,2 }, { 1,2,1,0,2 }, { 2,2,1,0,2 }, { 2,1,2,0,2 }, { 0,2,2,2,2 }, { 1,2,2,2,2 }, { 2,2,2,2,2 }, { 2,0,2,2,2 }, + { 0,0,0,0,1 }, { 1,0,0,0,1 }, { 2,0,0,0,1 }, { 0,0,2,0,1 }, { 0,1,0,0,1 }, { 1,1,0,0,1 }, { 2,1,0,0,1 }, { 1,0,2,0,1 }, { 0,2,0,0,1 }, { 1,2,0,0,1 }, { 2,2,0,0,1 }, { 2,0,2,0,1 }, { 0,2,2,0,1 }, { 1,2,2,0,1 }, { 2,2,2,0,1 }, { 2,0,2,0,1 }, + { 0,0,1,0,1 }, { 1,0,1,0,1 }, { 2,0,1,0,1 }, { 0,1,2,0,1 }, { 0,1,1,0,1 }, { 1,1,1,0,1 }, { 2,1,1,0,1 }, { 1,1,2,0,1 }, { 0,2,1,0,1 }, { 1,2,1,0,1 }, { 2,2,1,0,1 }, { 2,1,2,0,1 }, { 0,0,1,2,2 }, { 1,0,1,2,2 }, { 2,0,1,2,2 }, { 0,1,2,2,2 }, + { 0,0,0,1,1 }, { 1,0,0,1,1 }, { 2,0,0,1,1 }, { 0,0,2,1,1 }, { 0,1,0,1,1 }, { 1,1,0,1,1 }, { 2,1,0,1,1 }, { 1,0,2,1,1 }, { 0,2,0,1,1 }, { 1,2,0,1,1 }, { 2,2,0,1,1 }, { 2,0,2,1,1 }, { 0,2,2,1,1 }, { 1,2,2,1,1 }, { 2,2,2,1,1 }, { 2,0,2,1,1 }, + { 0,0,1,1,1 }, { 1,0,1,1,1 }, { 2,0,1,1,1 }, { 0,1,2,1,1 }, { 0,1,1,1,1 }, { 1,1,1,1,1 }, { 2,1,1,1,1 }, { 1,1,2,1,1 }, { 0,2,1,1,1 }, { 1,2,1,1,1 }, { 2,2,1,1,1 }, { 2,1,2,1,1 }, { 0,1,1,2,2 }, { 1,1,1,2,2 }, { 2,1,1,2,2 }, { 1,1,2,2,2 }, + { 0,0,0,2,1 }, { 1,0,0,2,1 }, { 2,0,0,2,1 }, { 0,0,2,2,1 }, { 0,1,0,2,1 }, { 1,1,0,2,1 }, { 2,1,0,2,1 }, { 1,0,2,2,1 }, { 0,2,0,2,1 }, { 1,2,0,2,1 }, { 2,2,0,2,1 }, { 2,0,2,2,1 }, { 0,2,2,2,1 }, { 1,2,2,2,1 }, { 2,2,2,2,1 }, { 2,0,2,2,1 }, + { 0,0,1,2,1 }, { 1,0,1,2,1 }, { 2,0,1,2,1 }, { 0,1,2,2,1 }, { 0,1,1,2,1 }, { 1,1,1,2,1 }, { 2,1,1,2,1 }, { 1,1,2,2,1 }, { 0,2,1,2,1 }, { 1,2,1,2,1 }, { 2,2,1,2,1 }, { 2,1,2,2,1 }, { 0,2,1,2,2 }, { 1,2,1,2,2 }, { 2,2,1,2,2 }, { 2,1,2,2,2 }, + { 0,0,0,1,2 }, { 1,0,0,1,2 }, { 2,0,0,1,2 }, { 0,0,2,1,2 }, { 0,1,0,1,2 }, { 1,1,0,1,2 }, { 2,1,0,1,2 }, { 1,0,2,1,2 }, { 0,2,0,1,2 }, { 1,2,0,1,2 }, { 2,2,0,1,2 }, { 2,0,2,1,2 }, { 0,2,2,1,2 }, { 1,2,2,1,2 }, { 2,2,2,1,2 }, { 2,0,2,1,2 }, + { 0,0,1,1,2 }, { 1,0,1,1,2 }, { 2,0,1,1,2 }, { 
0,1,2,1,2 }, { 0,1,1,1,2 }, { 1,1,1,1,2 }, { 2,1,1,1,2 }, { 1,1,2,1,2 }, { 0,2,1,1,2 }, { 1,2,1,1,2 }, { 2,2,1,1,2 }, { 2,1,2,1,2 }, { 0,2,2,2,2 }, { 1,2,2,2,2 }, { 2,2,2,2,2 }, { 2,1,2,2,2 } + }; + + const deUint32 (& trits)[5] = tritsFromT[T]; + for (int i = 0; i < numValues; i++) + { + dst[i].m = m[i]; + dst[i].tq = trits[i]; + dst[i].v = (trits[i] << numBits) + m[i]; + } +} + +void decodeISEQuintBlock (ISEDecodedResult* dst, int numValues, BitAccessStream& data, int numBits) +{ + DE_ASSERT(basisu_astc::inRange(numValues, 1, 3)); + + deUint32 m[3]; + m[0] = data.getNext(numBits); + deUint32 Q012 = data.getNext(3); + m[1] = data.getNext(numBits); + deUint32 Q34 = data.getNext(2); + m[2] = data.getNext(numBits); + deUint32 Q56 = data.getNext(2); + +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wimplicit-fallthrough=" +#endif + switch (numValues) + { + // \note Fall-throughs. + case 1: Q34 = 0; + case 2: Q56 = 0; + case 3: break; + default: + DE_ASSERT(false); + } +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + + const deUint32 Q = (Q56 << 5) | (Q34 << 3) | (Q012 << 0); + + static const deUint32 quintsFromQ[256][3] = + { + { 0,0,0 }, { 1,0,0 }, { 2,0,0 }, { 3,0,0 }, { 4,0,0 }, { 0,4,0 }, { 4,4,0 }, { 4,4,4 }, { 0,1,0 }, { 1,1,0 }, { 2,1,0 }, { 3,1,0 }, { 4,1,0 }, { 1,4,0 }, { 4,4,1 }, { 4,4,4 }, + { 0,2,0 }, { 1,2,0 }, { 2,2,0 }, { 3,2,0 }, { 4,2,0 }, { 2,4,0 }, { 4,4,2 }, { 4,4,4 }, { 0,3,0 }, { 1,3,0 }, { 2,3,0 }, { 3,3,0 }, { 4,3,0 }, { 3,4,0 }, { 4,4,3 }, { 4,4,4 }, + { 0,0,1 }, { 1,0,1 }, { 2,0,1 }, { 3,0,1 }, { 4,0,1 }, { 0,4,1 }, { 4,0,4 }, { 0,4,4 }, { 0,1,1 }, { 1,1,1 }, { 2,1,1 }, { 3,1,1 }, { 4,1,1 }, { 1,4,1 }, { 4,1,4 }, { 1,4,4 }, + { 0,2,1 }, { 1,2,1 }, { 2,2,1 }, { 3,2,1 }, { 4,2,1 }, { 2,4,1 }, { 4,2,4 }, { 2,4,4 }, { 0,3,1 }, { 1,3,1 }, { 2,3,1 }, { 3,3,1 }, { 4,3,1 }, { 3,4,1 }, { 4,3,4 }, { 3,4,4 }, + { 0,0,2 }, { 1,0,2 }, { 2,0,2 }, { 3,0,2 }, { 4,0,2 }, { 0,4,2 }, { 2,0,4 }, { 3,0,4 }, { 0,1,2 }, { 1,1,2 
}, { 2,1,2 }, { 3,1,2 }, { 4,1,2 }, { 1,4,2 }, { 2,1,4 }, { 3,1,4 }, + { 0,2,2 }, { 1,2,2 }, { 2,2,2 }, { 3,2,2 }, { 4,2,2 }, { 2,4,2 }, { 2,2,4 }, { 3,2,4 }, { 0,3,2 }, { 1,3,2 }, { 2,3,2 }, { 3,3,2 }, { 4,3,2 }, { 3,4,2 }, { 2,3,4 }, { 3,3,4 }, + { 0,0,3 }, { 1,0,3 }, { 2,0,3 }, { 3,0,3 }, { 4,0,3 }, { 0,4,3 }, { 0,0,4 }, { 1,0,4 }, { 0,1,3 }, { 1,1,3 }, { 2,1,3 }, { 3,1,3 }, { 4,1,3 }, { 1,4,3 }, { 0,1,4 }, { 1,1,4 }, + { 0,2,3 }, { 1,2,3 }, { 2,2,3 }, { 3,2,3 }, { 4,2,3 }, { 2,4,3 }, { 0,2,4 }, { 1,2,4 }, { 0,3,3 }, { 1,3,3 }, { 2,3,3 }, { 3,3,3 }, { 4,3,3 }, { 3,4,3 }, { 0,3,4 }, { 1,3,4 } + }; + + const deUint32 (& quints)[3] = quintsFromQ[Q]; + for (int i = 0; i < numValues; i++) + { + dst[i].m = m[i]; + dst[i].tq = quints[i]; + dst[i].v = (quints[i] << numBits) + m[i]; + } +} + +inline void decodeISEBitBlock (ISEDecodedResult* dst, BitAccessStream& data, int numBits) +{ + dst[0].m = data.getNext(numBits); + dst[0].v = dst[0].m; +} + +void decodeISE (ISEDecodedResult* dst, int numValues, BitAccessStream& data, const ISEParams& params) +{ + if (params.mode == ISEMODE_TRIT) + { + const int numBlocks = deDivRoundUp32(numValues, 5); + for (int blockNdx = 0; blockNdx < numBlocks; blockNdx++) + { + const int numValuesInBlock = blockNdx == numBlocks-1 ? numValues - 5*(numBlocks-1) : 5; + decodeISETritBlock(&dst[5*blockNdx], numValuesInBlock, data, params.numBits); + } + } + else if (params.mode == ISEMODE_QUINT) + { + const int numBlocks = deDivRoundUp32(numValues, 3); + for (int blockNdx = 0; blockNdx < numBlocks; blockNdx++) + { + const int numValuesInBlock = blockNdx == numBlocks-1 ? 
numValues - 3*(numBlocks-1) : 3; + decodeISEQuintBlock(&dst[3*blockNdx], numValuesInBlock, data, params.numBits); + } + } + else + { + DE_ASSERT(params.mode == ISEMODE_PLAIN_BIT); + for (int i = 0; i < numValues; i++) + decodeISEBitBlock(&dst[i], data, params.numBits); + } +} + +void unquantizeColorEndpoints (deUint32* dst, const ISEDecodedResult* iseResults, int numEndpoints, const ISEParams& iseParams) +{ + if ((iseParams.mode == ISEMODE_TRIT) || (iseParams.mode == ISEMODE_QUINT)) + { + const int rangeCase = iseParams.numBits*2 - (iseParams.mode == ISEMODE_TRIT ? 2 : 1); + DE_ASSERT(basisu_astc::inRange(rangeCase, 0, 10)); + + static const deUint32 Ca[11] = { 204, 113, 93, 54, 44, 26, 22, 13, 11, 6, 5 }; + const deUint32 C = Ca[rangeCase]; + + for (int endpointNdx = 0; endpointNdx < numEndpoints; endpointNdx++) + { + const deUint32 a = getBit(iseResults[endpointNdx].m, 0); + const deUint32 b = getBit(iseResults[endpointNdx].m, 1); + const deUint32 c = getBit(iseResults[endpointNdx].m, 2); + const deUint32 d = getBit(iseResults[endpointNdx].m, 3); + const deUint32 e = getBit(iseResults[endpointNdx].m, 4); + const deUint32 f = getBit(iseResults[endpointNdx].m, 5); + const deUint32 A = (a == 0) ? 0 : (1<<9)-1; + + const deUint32 B = (rangeCase == 0) ? 0 + : (rangeCase == 1) ? 0 + : (rangeCase == 2) ? ((b << 8) | (b << 4) | (b << 2) | (b << 1)) + : (rangeCase == 3) ? ((b << 8) | (b << 3) | (b << 2)) + : (rangeCase == 4) ? ((c << 8) | (b << 7) | (c << 3) | (b << 2) | (c << 1) | (b << 0)) + : (rangeCase == 5) ? ((c << 8) | (b << 7) | (c << 2) | (b << 1) | (c << 0)) + : (rangeCase == 6) ? ((d << 8) | (c << 7) | (b << 6) | (d << 2) | (c << 1) | (b << 0)) + : (rangeCase == 7) ? ((d << 8) | (c << 7) | (b << 6) | (d << 1) | (c << 0)) + : (rangeCase == 8) ? ((e << 8) | (d << 7) | (c << 6) | (b << 5) | (e << 1) | (d << 0)) + : (rangeCase == 9) ? ((e << 8) | (d << 7) | (c << 6) | (b << 5) | (e << 0)) + : (rangeCase == 10) ? 
((f << 8) | (e << 7) | (d << 6) | (c << 5) | (b << 4) | (f << 0)) + : (deUint32)-1; + + DE_ASSERT(B != (deUint32)-1); + dst[endpointNdx] = (((iseResults[endpointNdx].tq*C + B) ^ A) >> 2) | (A & 0x80); + } + } + else + { + DE_ASSERT(iseParams.mode == ISEMODE_PLAIN_BIT); + for (int endpointNdx = 0; endpointNdx < numEndpoints; endpointNdx++) + dst[endpointNdx] = bitReplicationScale(iseResults[endpointNdx].v, iseParams.numBits, 8); + } +} + +inline void bitTransferSigned (deInt32& a, deInt32& b) +{ + b >>= 1; + b |= a & 0x80; + a >>= 1; + a &= 0x3f; + if (isBitSet(a, 5)) + a -= 0x40; +} + +inline UVec4 clampedRGBA (const IVec4& rgba) +{ + return UVec4(basisu_astc::clamp(rgba.x(), 0, 0xff), + basisu_astc::clamp(rgba.y(), 0, 0xff), + basisu_astc::clamp(rgba.z(), 0, 0xff), + basisu_astc::clamp(rgba.w(), 0, 0xff)); +} + +inline IVec4 blueContract (int r, int g, int b, int a) +{ + return IVec4((r+b)>>1, (g+b)>>1, b, a); +} + +inline bool isColorEndpointModeHDR (deUint32 mode) +{ + return (mode == 2) || + (mode == 3) || + (mode == 7) || + (mode == 11) || + (mode == 14) || + (mode == 15); +} + +void decodeHDREndpointMode7 (UVec4& e0, UVec4& e1, deUint32 v0, deUint32 v1, deUint32 v2, deUint32 v3) +{ + const deUint32 m10 = getBit(v1, 7) | (getBit(v2, 7) << 1); + const deUint32 m23 = getBits(v0, 6, 7); + + const deUint32 majComp = (m10 != 3) ? m10 + : (m23 != 3) ? m23 + : 0; + + const deUint32 mode = (m10 != 3) ? m23 + : (m23 != 3) ? 
4 + : 5; + + deInt32 red = (deInt32)getBits(v0, 0, 5); + deInt32 green = (deInt32)getBits(v1, 0, 4); + deInt32 blue = (deInt32)getBits(v2, 0, 4); + deInt32 scale = (deInt32)getBits(v3, 0, 4); + + { +#define SHOR(DST_VAR, SHIFT, BIT_VAR) (DST_VAR) |= (BIT_VAR) << (SHIFT) +#define ASSIGN_X_BITS(V0,S0, V1,S1, V2,S2, V3,S3, V4,S4, V5,S5, V6,S6) do { SHOR(V0,S0,x0); SHOR(V1,S1,x1); SHOR(V2,S2,x2); SHOR(V3,S3,x3); SHOR(V4,S4,x4); SHOR(V5,S5,x5); SHOR(V6,S6,x6); } while (false) + + const deUint32 x0 = getBit(v1, 6); + const deUint32 x1 = getBit(v1, 5); + const deUint32 x2 = getBit(v2, 6); + const deUint32 x3 = getBit(v2, 5); + const deUint32 x4 = getBit(v3, 7); + const deUint32 x5 = getBit(v3, 6); + const deUint32 x6 = getBit(v3, 5); + + deInt32& R = red; + deInt32& G = green; + deInt32& B = blue; + deInt32& S = scale; + + switch (mode) + { + case 0: ASSIGN_X_BITS(R,9, R,8, R,7, R,10, R,6, S,6, S,5); break; + case 1: ASSIGN_X_BITS(R,8, G,5, R,7, B,5, R,6, R,10, R,9); break; + case 2: ASSIGN_X_BITS(R,9, R,8, R,7, R,6, S,7, S,6, S,5); break; + case 3: ASSIGN_X_BITS(R,8, G,5, R,7, B,5, R,6, S,6, S,5); break; + case 4: ASSIGN_X_BITS(G,6, G,5, B,6, B,5, R,6, R,7, S,5); break; + case 5: ASSIGN_X_BITS(G,6, G,5, B,6, B,5, R,6, S,6, S,5); break; + default: + DE_ASSERT(false); + } +#undef ASSIGN_X_BITS +#undef SHOR + } + + static const int shiftAmounts[] = { 1, 1, 2, 3, 4, 5 }; + DE_ASSERT(mode < DE_LENGTH_OF_ARRAY(shiftAmounts)); + + red <<= shiftAmounts[mode]; + green <<= shiftAmounts[mode]; + blue <<= shiftAmounts[mode]; + scale <<= shiftAmounts[mode]; + + if (mode != 5) + { + green = red - green; + blue = red - blue; + } + + if (majComp == 1) + std::swap(red, green); + else if (majComp == 2) + std::swap(red, blue); + + e0 = UVec4(basisu_astc::clamp(red - scale, 0, 0xfff), + basisu_astc::clamp(green - scale, 0, 0xfff), + basisu_astc::clamp(blue - scale, 0, 0xfff), + 0x780); + + e1 = UVec4(basisu_astc::clamp(red, 0, 0xfff), + basisu_astc::clamp(green, 0, 0xfff), + 
basisu_astc::clamp(blue, 0, 0xfff), + 0x780); +} + +void decodeHDREndpointMode11 (UVec4& e0, UVec4& e1, deUint32 v0, deUint32 v1, deUint32 v2, deUint32 v3, deUint32 v4, deUint32 v5) +{ + const deUint32 major = (getBit(v5, 7) << 1) | getBit(v4, 7); + + if (major == 3) + { + e0 = UVec4(v0<<4, v2<<4, getBits(v4,0,6)<<5, 0x780); + e1 = UVec4(v1<<4, v3<<4, getBits(v5,0,6)<<5, 0x780); + } + else + { + const deUint32 mode = (getBit(v3, 7) << 2) | (getBit(v2, 7) << 1) | getBit(v1, 7); + + deInt32 a = (deInt32)((getBit(v1, 6) << 8) | v0); + deInt32 c = (deInt32)(getBits(v1, 0, 5)); + deInt32 b0 = (deInt32)(getBits(v2, 0, 5)); + deInt32 b1 = (deInt32)(getBits(v3, 0, 5)); + deInt32 d0 = (deInt32)(getBits(v4, 0, 4)); + deInt32 d1 = (deInt32)(getBits(v5, 0, 4)); + + { +#define SHOR(DST_VAR, SHIFT, BIT_VAR) (DST_VAR) |= (BIT_VAR) << (SHIFT) +#define ASSIGN_X_BITS(V0,S0, V1,S1, V2,S2, V3,S3, V4,S4, V5,S5) do { SHOR(V0,S0,x0); SHOR(V1,S1,x1); SHOR(V2,S2,x2); SHOR(V3,S3,x3); SHOR(V4,S4,x4); SHOR(V5,S5,x5); } while (false) + const deUint32 x0 = getBit(v2, 6); + const deUint32 x1 = getBit(v3, 6); + const deUint32 x2 = getBit(v4, 6); + const deUint32 x3 = getBit(v5, 6); + const deUint32 x4 = getBit(v4, 5); + const deUint32 x5 = getBit(v5, 5); + + switch (mode) + { + case 0: ASSIGN_X_BITS(b0,6, b1,6, d0,6, d1,6, d0,5, d1,5); break; + case 1: ASSIGN_X_BITS(b0,6, b1,6, b0,7, b1,7, d0,5, d1,5); break; + case 2: ASSIGN_X_BITS(a,9, c,6, d0,6, d1,6, d0,5, d1,5); break; + case 3: ASSIGN_X_BITS(b0,6, b1,6, a,9, c,6, d0,5, d1,5); break; + case 4: ASSIGN_X_BITS(b0,6, b1,6, b0,7, b1,7, a,9, a,10); break; + case 5: ASSIGN_X_BITS(a,9, a,10, c,7, c,6, d0,5, d1,5); break; + case 6: ASSIGN_X_BITS(b0,6, b1,6, a,11, c,6, a,9, a,10); break; + case 7: ASSIGN_X_BITS(a,9, a,10, a,11, c,6, d0,5, d1,5); break; + default: + DE_ASSERT(false); + } +#undef ASSIGN_X_BITS +#undef SHOR + } + + static const int numDBits[] = { 7, 6, 7, 6, 5, 6, 5, 6 }; + DE_ASSERT(mode < DE_LENGTH_OF_ARRAY(numDBits)); + d0 = 
signExtend(d0, numDBits[mode]); + d1 = signExtend(d1, numDBits[mode]); + + const int shiftAmount = (mode >> 1) ^ 3; + a = (uint32_t)a << shiftAmount; + c = (uint32_t)c << shiftAmount; + b0 = (uint32_t)b0 << shiftAmount; + b1 = (uint32_t)b1 << shiftAmount; + d0 = (uint32_t)d0 << shiftAmount; + d1 = (uint32_t)d1 << shiftAmount; + + e0 = UVec4(basisu_astc::clamp(a-c, 0, 0xfff), basisu_astc::clamp(a-b0-c-d0, 0, 0xfff), basisu_astc::clamp(a-b1-c-d1, 0, 0xfff), 0x780); + e1 = UVec4(basisu_astc::clamp(a, 0, 0xfff), basisu_astc::clamp(a-b0, 0, 0xfff), basisu_astc::clamp(a-b1, 0, 0xfff), 0x780); + + if (major == 1) + { + std::swap(e0.x(), e0.y()); + std::swap(e1.x(), e1.y()); + } + else if (major == 2) + { + std::swap(e0.x(), e0.z()); + std::swap(e1.x(), e1.z()); + } + } +} + +void decodeHDREndpointMode15(UVec4& e0, UVec4& e1, deUint32 v0, deUint32 v1, deUint32 v2, deUint32 v3, deUint32 v4, deUint32 v5, deUint32 v6In, deUint32 v7In) +{ + decodeHDREndpointMode11(e0, e1, v0, v1, v2, v3, v4, v5); + + const deUint32 mode = (getBit(v7In, 7) << 1) | getBit(v6In, 7); + deInt32 v6 = (deInt32)getBits(v6In, 0, 6); + deInt32 v7 = (deInt32)getBits(v7In, 0, 6); + + if (mode == 3) + { + e0.w() = v6 << 5; + e1.w() = v7 << 5; + } + else + { + v6 |= (v7 << (mode+1)) & 0x780; + v7 &= (0x3f >> mode); + v7 ^= 0x20 >> mode; + v7 -= 0x20 >> mode; + v6 <<= 4-mode; + v7 <<= 4-mode; + v7 += v6; + v7 = basisu_astc::clamp(v7, 0, 0xfff); + e0.w() = v6; + e1.w() = v7; + } +} + +void decodeColorEndpoints (ColorEndpointPair* dst, const deUint32* unquantizedEndpoints, const deUint32* endpointModes, int numPartitions) +{ + int unquantizedNdx = 0; + + for (int partitionNdx = 0; partitionNdx < numPartitions; partitionNdx++) + { + const deUint32 endpointMode = endpointModes[partitionNdx]; + const deUint32* v = &unquantizedEndpoints[unquantizedNdx]; + + UVec4& e0 = dst[partitionNdx].e0; + UVec4& e1 = dst[partitionNdx].e1; + unquantizedNdx += computeNumColorEndpointValues(endpointMode); + + switch 
(endpointMode) + { + case 0: + { + e0 = UVec4(v[0], v[0], v[0], 0xff); + e1 = UVec4(v[1], v[1], v[1], 0xff); + break; + } + case 1: + { + const deUint32 L0 = (v[0] >> 2) | (getBits(v[1], 6, 7) << 6); + const deUint32 L1 = basisu_astc::min(0xffu, L0 + getBits(v[1], 0, 5)); + e0 = UVec4(L0, L0, L0, 0xff); + e1 = UVec4(L1, L1, L1, 0xff); + break; + } + case 2: + { + const deUint32 v1Gr = v[1] >= v[0]; + const deUint32 y0 = v1Gr ? v[0]<<4 : (v[1]<<4) + 8; + const deUint32 y1 = v1Gr ? v[1]<<4 : (v[0]<<4) - 8; + e0 = UVec4(y0, y0, y0, 0x780); + e1 = UVec4(y1, y1, y1, 0x780); + break; + } + case 3: + { + const bool m = isBitSet(v[0], 7); + const deUint32 y0 = m ? (getBits(v[1], 5, 7) << 9) | (getBits(v[0], 0, 6) << 2) + : (getBits(v[1], 4, 7) << 8) | (getBits(v[0], 0, 6) << 1); + const deUint32 d = m ? getBits(v[1], 0, 4) << 2 + : getBits(v[1], 0, 3) << 1; + const deUint32 y1 = basisu_astc::min(0xfffu, y0+d); + e0 = UVec4(y0, y0, y0, 0x780); + e1 = UVec4(y1, y1, y1, 0x780); + break; + } + case 4: + { + e0 = UVec4(v[0], v[0], v[0], v[2]); + e1 = UVec4(v[1], v[1], v[1], v[3]); + break; + } + case 5: + { + deInt32 v0 = (deInt32)v[0]; + deInt32 v1 = (deInt32)v[1]; + deInt32 v2 = (deInt32)v[2]; + deInt32 v3 = (deInt32)v[3]; + bitTransferSigned(v1, v0); + bitTransferSigned(v3, v2); + e0 = clampedRGBA(IVec4(v0, v0, v0, v2)); + e1 = clampedRGBA(IVec4(v0+v1, v0+v1, v0+v1, v2+v3)); + break; + } + case 6: + e0 = UVec4((v[0]*v[3]) >> 8, (v[1]*v[3]) >> 8, (v[2]*v[3]) >> 8, 0xff); + e1 = UVec4(v[0], v[1], v[2], 0xff); + break; + case 7: + decodeHDREndpointMode7(e0, e1, v[0], v[1], v[2], v[3]); + break; + case 8: + { + if (v[1]+v[3]+v[5] >= v[0]+v[2]+v[4]) + { + e0 = UVec4(v[0], v[2], v[4], 0xff); + e1 = UVec4(v[1], v[3], v[5], 0xff); + } + else + { + e0 = blueContract(v[1], v[3], v[5], 0xff).asUint(); + e1 = blueContract(v[0], v[2], v[4], 0xff).asUint(); + } + break; + } + case 9: + { + deInt32 v0 = (deInt32)v[0]; + deInt32 v1 = (deInt32)v[1]; + deInt32 v2 = (deInt32)v[2]; + deInt32 v3 
= (deInt32)v[3]; + deInt32 v4 = (deInt32)v[4]; + deInt32 v5 = (deInt32)v[5]; + bitTransferSigned(v1, v0); + bitTransferSigned(v3, v2); + bitTransferSigned(v5, v4); + if (v1+v3+v5 >= 0) + { + e0 = clampedRGBA(IVec4(v0, v2, v4, 0xff)); + e1 = clampedRGBA(IVec4(v0+v1, v2+v3, v4+v5, 0xff)); + } + else + { + e0 = clampedRGBA(blueContract(v0+v1, v2+v3, v4+v5, 0xff)); + e1 = clampedRGBA(blueContract(v0, v2, v4, 0xff)); + } + break; + } + case 10: + { + e0 = UVec4((v[0]*v[3]) >> 8, (v[1]*v[3]) >> 8, (v[2]*v[3]) >> 8, v[4]); + e1 = UVec4(v[0], v[1], v[2], v[5]); + break; + } + case 11: + { + decodeHDREndpointMode11(e0, e1, v[0], v[1], v[2], v[3], v[4], v[5]); + break; + } + case 12: + { + if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4]) + { + e0 = UVec4(v[0], v[2], v[4], v[6]); + e1 = UVec4(v[1], v[3], v[5], v[7]); + } + else + { + e0 = clampedRGBA(blueContract(v[1], v[3], v[5], v[7])); + e1 = clampedRGBA(blueContract(v[0], v[2], v[4], v[6])); + } + break; + } + case 13: + { + deInt32 v0 = (deInt32)v[0]; + deInt32 v1 = (deInt32)v[1]; + deInt32 v2 = (deInt32)v[2]; + deInt32 v3 = (deInt32)v[3]; + deInt32 v4 = (deInt32)v[4]; + deInt32 v5 = (deInt32)v[5]; + deInt32 v6 = (deInt32)v[6]; + deInt32 v7 = (deInt32)v[7]; + bitTransferSigned(v1, v0); + bitTransferSigned(v3, v2); + bitTransferSigned(v5, v4); + bitTransferSigned(v7, v6); + if (v1+v3+v5 >= 0) + { + e0 = clampedRGBA(IVec4(v0, v2, v4, v6)); + e1 = clampedRGBA(IVec4(v0+v1, v2+v3, v4+v5, v6+v7)); + } + else + { + e0 = clampedRGBA(blueContract(v0+v1, v2+v3, v4+v5, v6+v7)); + e1 = clampedRGBA(blueContract(v0, v2, v4, v6)); + } + break; + } + case 14: + decodeHDREndpointMode11(e0, e1, v[0], v[1], v[2], v[3], v[4], v[5]); + e0.w() = v[6]; + e1.w() = v[7]; + break; + case 15: + { + decodeHDREndpointMode15(e0, e1, v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]); + break; + } + default: + DE_ASSERT(false); + } + } +} + +void computeColorEndpoints (ColorEndpointPair* dst, const Block128& blockData, const deUint32* endpointModes, int 
numPartitions, int numColorEndpointValues, const ISEParams& iseParams, int numBitsAvailable) +{ + const int colorEndpointDataStart = (numPartitions == 1) ? 17 : 29; + ISEDecodedResult colorEndpointData[18]; + + { + BitAccessStream dataStream(blockData, colorEndpointDataStart, numBitsAvailable, true); + decodeISE(&colorEndpointData[0], numColorEndpointValues, dataStream, iseParams); + } + + { + deUint32 unquantizedEndpoints[18]; + unquantizeColorEndpoints(&unquantizedEndpoints[0], &colorEndpointData[0], numColorEndpointValues, iseParams); + decodeColorEndpoints(dst, &unquantizedEndpoints[0], &endpointModes[0], numPartitions); + } +} + +void unquantizeWeights (deUint32 dst[64], const ISEDecodedResult* weightGrid, const ASTCBlockMode& blockMode) +{ + const int numWeights = computeNumWeights(blockMode); + const ISEParams& iseParams = blockMode.weightISEParams; + + if ((iseParams.mode == ISEMODE_TRIT) || (iseParams.mode == ISEMODE_QUINT)) + { + const int rangeCase = iseParams.numBits*2 + (iseParams.mode == ISEMODE_QUINT ? 1 : 0); + + if ((rangeCase == 0) || (rangeCase == 1)) + { + static const deUint32 map0[3] = { 0, 32, 63 }; + static const deUint32 map1[5] = { 0, 16, 32, 47, 63 }; + const deUint32* const map = (rangeCase == 0) ? &map0[0] : &map1[0]; + + for (int i = 0; i < numWeights; i++) + { + DE_ASSERT(weightGrid[i].v < (rangeCase == 0 ? 3u : 5u)); + dst[i] = map[weightGrid[i].v]; + } + } + else + { + DE_ASSERT(rangeCase <= 6); + static const deUint32 Ca[5] = { 50, 28, 23, 13, 11 }; + const deUint32 C = Ca[rangeCase-2]; + + for (int weightNdx = 0; weightNdx < numWeights; weightNdx++) + { + const deUint32 a = getBit(weightGrid[weightNdx].m, 0); + const deUint32 b = getBit(weightGrid[weightNdx].m, 1); + const deUint32 c = getBit(weightGrid[weightNdx].m, 2); + + const deUint32 A = (a == 0) ? 0 : (1<<7)-1; + const deUint32 B = (rangeCase == 2) ? 0 + : (rangeCase == 3) ? 0 + : (rangeCase == 4) ? (b << 6) | (b << 2) | (b << 0) + : (rangeCase == 5) ? 
(b << 6) | (b << 1) + : (rangeCase == 6) ? (c << 6) | (b << 5) | (c << 1) | (b << 0) + : (deUint32)-1; + + dst[weightNdx] = (((weightGrid[weightNdx].tq*C + B) ^ A) >> 2) | (A & 0x20); + } + } + } + else + { + DE_ASSERT(iseParams.mode == ISEMODE_PLAIN_BIT); + for (int weightNdx = 0; weightNdx < numWeights; weightNdx++) + dst[weightNdx] = bitReplicationScale(weightGrid[weightNdx].v, iseParams.numBits, 6); + } + + for (int weightNdx = 0; weightNdx < numWeights; weightNdx++) + dst[weightNdx] += dst[weightNdx] > 32 ? 1 : 0; + + // Initialize nonexistent weights to poison values + for (int weightNdx = numWeights; weightNdx < 64; weightNdx++) + dst[weightNdx] = ~0u; +} + +void interpolateWeights (TexelWeightPair* dst, const deUint32 (&unquantizedWeights) [64], int blockWidth, int blockHeight, const ASTCBlockMode& blockMode) +{ + const int numWeightsPerTexel = blockMode.isDualPlane ? 2 : 1; + const deUint32 scaleX = (1024 + blockWidth/2) / (blockWidth-1); + const deUint32 scaleY = (1024 + blockHeight/2) / (blockHeight-1); + DE_ASSERT(blockMode.weightGridWidth*blockMode.weightGridHeight*numWeightsPerTexel <= (int)DE_LENGTH_OF_ARRAY(unquantizedWeights)); + + for (int texelY = 0; texelY < blockHeight; texelY++) + { + for (int texelX = 0; texelX < blockWidth; texelX++) + { + const deUint32 gX = (scaleX*texelX*(blockMode.weightGridWidth-1) + 32) >> 6; + const deUint32 gY = (scaleY*texelY*(blockMode.weightGridHeight-1) + 32) >> 6; + const deUint32 jX = gX >> 4; + const deUint32 jY = gY >> 4; + const deUint32 fX = gX & 0xf; + const deUint32 fY = gY & 0xf; + const deUint32 w11 = (fX*fY + 8) >> 4; + const deUint32 w10 = fY - w11; + const deUint32 w01 = fX - w11; + const deUint32 w00 = 16 - fX - fY + w11; + const deUint32 i00 = jY*blockMode.weightGridWidth + jX; + const deUint32 i01 = i00 + 1; + const deUint32 i10 = i00 + blockMode.weightGridWidth; + const deUint32 i11 = i00 + blockMode.weightGridWidth + 1; + + // These addresses can be out of bounds, but respective weights will be 
0 then. + DE_ASSERT(deInBounds32(i00, 0, blockMode.weightGridWidth*blockMode.weightGridHeight) || w00 == 0); + DE_ASSERT(deInBounds32(i01, 0, blockMode.weightGridWidth*blockMode.weightGridHeight) || w01 == 0); + DE_ASSERT(deInBounds32(i10, 0, blockMode.weightGridWidth*blockMode.weightGridHeight) || w10 == 0); + DE_ASSERT(deInBounds32(i11, 0, blockMode.weightGridWidth*blockMode.weightGridHeight) || w11 == 0); + + for (int texelWeightNdx = 0; texelWeightNdx < numWeightsPerTexel; texelWeightNdx++) + { + // & 0x3f clamps address to bounds of unquantizedWeights + const deUint32 p00 = unquantizedWeights[(i00 * numWeightsPerTexel + texelWeightNdx) & 0x3f]; + const deUint32 p01 = unquantizedWeights[(i01 * numWeightsPerTexel + texelWeightNdx) & 0x3f]; + const deUint32 p10 = unquantizedWeights[(i10 * numWeightsPerTexel + texelWeightNdx) & 0x3f]; + const deUint32 p11 = unquantizedWeights[(i11 * numWeightsPerTexel + texelWeightNdx) & 0x3f]; + + dst[texelY*blockWidth + texelX].w[texelWeightNdx] = (p00*w00 + p01*w01 + p10*w10 + p11*w11 + 8) >> 4; + } + } + } +} + +void computeTexelWeights (TexelWeightPair* dst, const Block128& blockData, int blockWidth, int blockHeight, const ASTCBlockMode& blockMode) +{ + ISEDecodedResult weightGrid[64]; + + { + BitAccessStream dataStream(blockData, 127, computeNumRequiredBits(blockMode.weightISEParams, computeNumWeights(blockMode)), false); + decodeISE(&weightGrid[0], computeNumWeights(blockMode), dataStream, blockMode.weightISEParams); + } + + { + deUint32 unquantizedWeights[64]; + unquantizeWeights(&unquantizedWeights[0], &weightGrid[0], blockMode); + + interpolateWeights(dst, unquantizedWeights, blockWidth, blockHeight, blockMode); + } +} + +inline deUint32 hash52 (deUint32 v) +{ + deUint32 p = v; + p ^= p >> 15; p -= p << 17; p += p << 7; p += p << 4; + p ^= p >> 5; p += p << 16; p ^= p >> 7; p ^= p >> 3; + p ^= p << 6; p ^= p >> 17; + return p; +} + +int computeTexelPartition (deUint32 seedIn, deUint32 xIn, deUint32 yIn, deUint32 zIn, int 
numPartitions, bool smallBlock) +{ + DE_ASSERT(zIn == 0); + + const deUint32 x = smallBlock ? xIn << 1 : xIn; + const deUint32 y = smallBlock ? yIn << 1 : yIn; + const deUint32 z = smallBlock ? zIn << 1 : zIn; + const deUint32 seed = seedIn + 1024*(numPartitions-1); + const deUint32 rnum = hash52(seed); + + deUint8 seed1 = (deUint8)( rnum & 0xf); + deUint8 seed2 = (deUint8)((rnum >> 4) & 0xf); + deUint8 seed3 = (deUint8)((rnum >> 8) & 0xf); + deUint8 seed4 = (deUint8)((rnum >> 12) & 0xf); + deUint8 seed5 = (deUint8)((rnum >> 16) & 0xf); + deUint8 seed6 = (deUint8)((rnum >> 20) & 0xf); + deUint8 seed7 = (deUint8)((rnum >> 24) & 0xf); + deUint8 seed8 = (deUint8)((rnum >> 28) & 0xf); + deUint8 seed9 = (deUint8)((rnum >> 18) & 0xf); + deUint8 seed10 = (deUint8)((rnum >> 22) & 0xf); + deUint8 seed11 = (deUint8)((rnum >> 26) & 0xf); + deUint8 seed12 = (deUint8)(((rnum >> 30) | (rnum << 2)) & 0xf); + + seed1 = (deUint8)(seed1 * seed1 ); + seed2 = (deUint8)(seed2 * seed2 ); + seed3 = (deUint8)(seed3 * seed3 ); + seed4 = (deUint8)(seed4 * seed4 ); + seed5 = (deUint8)(seed5 * seed5 ); + seed6 = (deUint8)(seed6 * seed6 ); + seed7 = (deUint8)(seed7 * seed7 ); + seed8 = (deUint8)(seed8 * seed8 ); + seed9 = (deUint8)(seed9 * seed9 ); + seed10 = (deUint8)(seed10 * seed10); + seed11 = (deUint8)(seed11 * seed11); + seed12 = (deUint8)(seed12 * seed12); + + const int shA = (seed & 2) != 0 ? 4 : 5; + const int shB = numPartitions == 3 ? 6 : 5; + const int sh1 = (seed & 1) != 0 ? shA : shB; + const int sh2 = (seed & 1) != 0 ? shB : shA; + const int sh3 = (seed & 0x10) != 0 ? 
sh1 : sh2; + + seed1 = (deUint8)(seed1 >> sh1); + seed2 = (deUint8)(seed2 >> sh2); + seed3 = (deUint8)(seed3 >> sh1); + seed4 = (deUint8)(seed4 >> sh2); + seed5 = (deUint8)(seed5 >> sh1); + seed6 = (deUint8)(seed6 >> sh2); + seed7 = (deUint8)(seed7 >> sh1); + seed8 = (deUint8)(seed8 >> sh2); + seed9 = (deUint8)(seed9 >> sh3); + seed10 = (deUint8)(seed10 >> sh3); + seed11 = (deUint8)(seed11 >> sh3); + seed12 = (deUint8)(seed12 >> sh3); + + const int a = 0x3f & (seed1*x + seed2*y + seed11*z + (rnum >> 14)); + const int b = 0x3f & (seed3*x + seed4*y + seed12*z + (rnum >> 10)); + const int c = (numPartitions >= 3) ? 0x3f & (seed5*x + seed6*y + seed9*z + (rnum >> 6)) : 0; + const int d = (numPartitions >= 4) ? 0x3f & (seed7*x + seed8*y + seed10*z + (rnum >> 2)) : 0; + + return (a >= b && a >= c && a >= d) ? 0 + : (b >= c && b >= d) ? 1 + : (c >= d) ? 2 + : 3; +} + +DecompressResult setTexelColors (void* dst, ColorEndpointPair* colorEndpoints, TexelWeightPair* texelWeights, int ccs, deUint32 partitionIndexSeed, + int numPartitions, int blockWidth, int blockHeight, bool isSRGB, bool isLDRMode, const deUint32* colorEndpointModes) +{ + const bool smallBlock = blockWidth*blockHeight < 31; + DecompressResult result = DECOMPRESS_RESULT_VALID_BLOCK; + bool isHDREndpoint[4]; + + for (int i = 0; i < numPartitions; i++) + { + isHDREndpoint[i] = isColorEndpointModeHDR(colorEndpointModes[i]); + } + + for (int texelY = 0; texelY < blockHeight; texelY++) + { + for (int texelX = 0; texelX < blockWidth; texelX++) + { + const int texelNdx = texelY * blockWidth + texelX; + const int colorEndpointNdx = (numPartitions == 1) ? 
0 : computeTexelPartition(partitionIndexSeed, texelX, texelY, 0, numPartitions, smallBlock); + + DE_ASSERT(colorEndpointNdx < numPartitions); + const UVec4& e0 = colorEndpoints[colorEndpointNdx].e0; + const UVec4& e1 = colorEndpoints[colorEndpointNdx].e1; + const TexelWeightPair& weight = texelWeights[texelNdx]; + + if (isLDRMode && isHDREndpoint[colorEndpointNdx]) + { + if (isSRGB) + { + ((deUint8*)dst)[texelNdx * 4 + 0] = 0xff; + ((deUint8*)dst)[texelNdx * 4 + 1] = 0; + ((deUint8*)dst)[texelNdx * 4 + 2] = 0xff; + ((deUint8*)dst)[texelNdx * 4 + 3] = 0xff; + } + else + { + ((float*)dst)[texelNdx * 4 + 0] = 1.0f; + ((float*)dst)[texelNdx * 4 + 1] = 0; + ((float*)dst)[texelNdx * 4 + 2] = 1.0f; + ((float*)dst)[texelNdx * 4 + 3] = 1.0f; + } + result = DECOMPRESS_RESULT_ERROR; + } + else + { + for (int channelNdx = 0; channelNdx < 4; channelNdx++) + { + if (!isHDREndpoint[colorEndpointNdx] || (channelNdx == 3 && colorEndpointModes[colorEndpointNdx] == 14)) // \note Alpha for mode 14 is treated the same as LDR. + { + const deUint32 c0 = (e0[channelNdx] << 8) | (isSRGB ? 0x80 : e0[channelNdx]); + const deUint32 c1 = (e1[channelNdx] << 8) | (isSRGB ? 0x80 : e1[channelNdx]); + const deUint32 w = weight.w[ccs == channelNdx ? 1 : 0]; + const deUint32 c = (c0 * (64 - w) + c1 * w + 32) / 64; + + if (isSRGB) + ((deUint8*)dst)[texelNdx * 4 + channelNdx] = (deUint8)((c & 0xff00) >> 8); + else + ((float*)dst)[texelNdx * 4 + channelNdx] = (c == 65535) ? 1.0f : (float)c / 65536.0f; + } + else + { + DE_ASSERT(!isSRGB); + //DE_STATIC_ASSERT((basisu_astc::meta::TypesSame::Value)); + + const deUint32 c0 = e0[channelNdx] << 4; + const deUint32 c1 = e1[channelNdx] << 4; + const deUint32 w = weight.w[(ccs == channelNdx) ? 1 : 0]; + const deUint32 c = (c0 * (64 - w) + c1 * w + 32) / 64; + const deUint32 e = getBits(c, 11, 15); + const deUint32 m = getBits(c, 0, 10); + const deUint32 mt = (m < 512) ? (3 * m) + : (m >= 1536) ? 
(5 * m - 2048) + : (4 * m - 512); + + const deFloat16 cf = (deFloat16)((e << 10) + (mt >> 3)); + + ((float*)dst)[texelNdx * 4 + channelNdx] = deFloat16To32(isFloat16InfOrNan(cf) ? 0x7bff : cf); + } + + } // channelNdx + } + } // texelX + } // texelY + + return result; +} + +DecompressResult decompressBlock (void* dst, const Block128& blockData, int blockWidth, int blockHeight, bool isSRGB, bool isLDR) +{ + DE_ASSERT(isLDR || !isSRGB); + + // Decode block mode. + const ASTCBlockMode blockMode = getASTCBlockMode(blockData.getBits(0, 10)); + + // Check for block mode errors. + if (blockMode.isError) + { + setASTCErrorColorBlock(dst, blockWidth, blockHeight, isSRGB); + return DECOMPRESS_RESULT_ERROR; + } + + // Separate path for void-extent. + if (blockMode.isVoidExtent) + return decodeVoidExtentBlock(dst, blockData, blockWidth, blockHeight, isSRGB, isLDR); + + // Compute weight grid values. + const int numWeights = computeNumWeights(blockMode); + const int numWeightDataBits = computeNumRequiredBits(blockMode.weightISEParams, numWeights); + const int numPartitions = (int)blockData.getBits(11, 12) + 1; + + // Check for errors in weight grid, partition and dual-plane parameters. + if ((numWeights > 64) || + (numWeightDataBits > 96) || + (numWeightDataBits < 24) || + (blockMode.weightGridWidth > blockWidth) || + (blockMode.weightGridHeight > blockHeight) || + ((numPartitions == 4) && blockMode.isDualPlane)) + { + setASTCErrorColorBlock(dst, blockWidth, blockHeight, isSRGB); + return DECOMPRESS_RESULT_ERROR; + } + + // Compute number of bits available for color endpoint data. + const bool isSingleUniqueCem = (numPartitions == 1) || (blockData.getBits(23, 24) == 0); + + const int numConfigDataBits = ((numPartitions == 1) ? 17 : isSingleUniqueCem ? 29 : 25 + 3*numPartitions) + + (blockMode.isDualPlane ? 2 : 0); + + const int numBitsForColorEndpoints = 128 - numWeightDataBits - numConfigDataBits; + + const int extraCemBitsStart = 127 - numWeightDataBits - (isSingleUniqueCem ? 
-1 + : (numPartitions == 4) ? 7 + : (numPartitions == 3) ? 4 + : (numPartitions == 2) ? 1 + : 0); + + // Decode color endpoint modes. + deUint32 colorEndpointModes[4]; + decodeColorEndpointModes(&colorEndpointModes[0], blockData, numPartitions, extraCemBitsStart); + const int numColorEndpointValues = computeNumColorEndpointValues(colorEndpointModes, numPartitions); + + // Check for errors in color endpoint value count. + if ((numColorEndpointValues > 18) || (numBitsForColorEndpoints < (int)deDivRoundUp32(13*numColorEndpointValues, 5))) + { + setASTCErrorColorBlock(dst, blockWidth, blockHeight, isSRGB); + return DECOMPRESS_RESULT_ERROR; + } + + // Compute color endpoints. + ColorEndpointPair colorEndpoints[4]; + computeColorEndpoints(&colorEndpoints[0], blockData, &colorEndpointModes[0], numPartitions, numColorEndpointValues, + computeMaximumRangeISEParams(numBitsForColorEndpoints, numColorEndpointValues), numBitsForColorEndpoints); + + // Compute texel weights. + TexelWeightPair texelWeights[MAX_BLOCK_WIDTH*MAX_BLOCK_HEIGHT]; + computeTexelWeights(&texelWeights[0], blockData, blockWidth, blockHeight, blockMode); + + // Set texel colors. + const int ccs = blockMode.isDualPlane ? (int)blockData.getBits(extraCemBitsStart-2, extraCemBitsStart-1) : -1; + const deUint32 partitionIndexSeed = (numPartitions > 1) ? blockData.getBits(13, 22) : (deUint32)-1; + + return setTexelColors(dst, &colorEndpoints[0], &texelWeights[0], ccs, partitionIndexSeed, numPartitions, blockWidth, blockHeight, isSRGB, isLDR, &colorEndpointModes[0]); +} + +// Returns -1 on error, 0 if LDR, 1 if HDR +int isHDR(const Block128& blockData, int blockWidth, int blockHeight) +{ + // Decode block mode. + const ASTCBlockMode blockMode = getASTCBlockMode(blockData.getBits(0, 10)); + + // Check for block mode errors. + if (blockMode.isError) + return -1; + + // Separate path for void-extent. + if (blockMode.isVoidExtent) + { + const bool isHDRBlock = blockData.isBitSet(9); + return isHDRBlock ? 
1 : 0; + } + + // Compute weight grid values. + const int numWeights = computeNumWeights(blockMode); + const int numWeightDataBits = computeNumRequiredBits(blockMode.weightISEParams, numWeights); + const int numPartitions = (int)blockData.getBits(11, 12) + 1; + + // Check for errors in weight grid, partition and dual-plane parameters. + if ((numWeights > 64) || + (numWeightDataBits > 96) || + (numWeightDataBits < 24) || + (blockMode.weightGridWidth > blockWidth) || + (blockMode.weightGridHeight > blockHeight) || + ((numPartitions == 4) && blockMode.isDualPlane)) + { + return -1; + } + + // Compute number of bits available for color endpoint data. + const bool isSingleUniqueCem = (numPartitions == 1) || (blockData.getBits(23, 24) == 0); + + const int extraCemBitsStart = 127 - numWeightDataBits - (isSingleUniqueCem ? -1 + : (numPartitions == 4) ? 7 + : (numPartitions == 3) ? 4 + : (numPartitions == 2) ? 1 + : 0); + + // Decode color endpoint modes. + deUint32 colorEndpointModes[4]; + decodeColorEndpointModes(&colorEndpointModes[0], blockData, numPartitions, extraCemBitsStart); + + for (int i = 0; i < numPartitions; i++) + { + if (isColorEndpointModeHDR(colorEndpointModes[i])) + return 1; + } + + return 0; +} + +typedef uint16_t half_float; + +half_float float_to_half(float val, bool toward_zero) +{ + union { float f; int32_t i; uint32_t u; } fi = { val }; + const int flt_m = fi.i & 0x7FFFFF, flt_e = (fi.i >> 23) & 0xFF, flt_s = (fi.i >> 31) & 0x1; + int s = flt_s, e = 0, m = 0; + + // inf/NaN + if (flt_e == 0xff) + { + e = 31; + if (flt_m != 0) // NaN + m = 1; + } + // not zero or denormal + else if (flt_e != 0) + { + int new_exp = flt_e - 127; + if (new_exp > 15) + e = 31; + else if (new_exp < -14) + { + if (toward_zero) + m = (int)truncf((1 << 24) * fabsf(fi.f)); + else + m = lrintf((1 << 24) * fabsf(fi.f)); + } + else + { + e = new_exp + 15; + if (toward_zero) + m = (int)truncf((float)flt_m * (1.0f / (float)(1 << 13))); + else + m = lrintf((float)flt_m * (1.0f / 
(float)(1 << 13))); + } + } + + assert((0 <= m) && (m <= 1024)); + if (m == 1024) + { + e++; + m = 0; + } + + assert((s >= 0) && (s <= 1)); + assert((e >= 0) && (e <= 31)); + assert((m >= 0) && (m <= 1023)); + + half_float result = (half_float)((s << 15) | (e << 10) | m); + return result; +} + +float half_to_float(half_float hval) +{ + union { float f; uint32_t u; } x = { 0 }; + + uint32_t s = ((uint32_t)hval >> 15) & 1; + uint32_t e = ((uint32_t)hval >> 10) & 0x1F; + uint32_t m = (uint32_t)hval & 0x3FF; + + if (!e) + { + if (!m) + { + // +- 0 + x.u = s << 31; + return x.f; + } + else + { + // denormalized + while (!(m & 0x00000400)) + { + m <<= 1; + --e; + } + + ++e; + m &= ~0x00000400; + } + } + else if (e == 31) + { + if (m == 0) + { + // +/- INF + x.u = (s << 31) | 0x7f800000; + return x.f; + } + else + { + // +/- NaN + x.u = (s << 31) | 0x7f800000 | (m << 13); + return x.f; + } + } + + e = e + (127 - 15); + m = m << 13; + + assert(s <= 1); + assert(m <= 0x7FFFFF); + assert(e <= 255); + + x.u = m | (e << 23) | (s << 31); + return x.f; +} + +} // anonymous + +// See https://registry.khronos.org/DataFormat/specs/1.3/dataformat.1.3.inline.html#_hdr_endpoint_decoding +static void convert_to_half_prec(uint32_t n, float* pVals) +{ +#if 0 + const int prev_dir = fesetround(FE_TOWARDZERO); + + for (uint32_t i = 0; i < n; i++) + pVals[i] = half_to_float(float_to_half(pVals[i])); + + fesetround(prev_dir); + + for (uint32_t i = 0; i < n; i++) + { + assert(pVals[i] == half_to_float(float_to_half(pVals[i], true))); + } +#else + // This ensures the values are rounded towards zero as half floats. + for (uint32_t i = 0; i < n; i++) + { + pVals[i] = half_to_float(float_to_half(pVals[i], true)); + } +#endif +} + +bool decompress_ldr(uint8_t *pDst, const uint8_t * data, bool isSRGB, int blockWidth, int blockHeight) +{ + float linear[MAX_BLOCK_WIDTH * MAX_BLOCK_HEIGHT * 4]; + + const Block128 blockData(data); + + // isSRGB is true, this writes uint8_t's. Otherwise it writes floats. 
+ if (decompressBlock(isSRGB ? (void*)pDst : (void*)&linear[0], blockData, blockWidth, blockHeight, isSRGB, true) != DECOMPRESS_RESULT_VALID_BLOCK) + { + return false; + } + + if (!isSRGB) + { + // Convert the floats to 8-bits with rounding. + int pix = 0; + for (int i = 0; i < blockHeight; i++) + { + for (int j = 0; j < blockWidth; j++, pix++) + { + pDst[4 * pix + 0] = (uint8_t)(basisu_astc::clamp((int)(linear[pix * 4 + 0] * 65536.0f + .5f), 0, 65535) >> 8); + pDst[4 * pix + 1] = (uint8_t)(basisu_astc::clamp((int)(linear[pix * 4 + 1] * 65536.0f + .5f), 0, 65535) >> 8); + pDst[4 * pix + 2] = (uint8_t)(basisu_astc::clamp((int)(linear[pix * 4 + 2] * 65536.0f + .5f), 0, 65535) >> 8); + pDst[4 * pix + 3] = (uint8_t)(basisu_astc::clamp((int)(linear[pix * 4 + 3] * 65536.0f + .5f), 0, 65535) >> 8); + } + } + } + + return true; +} + +bool decompress_hdr(float* pDstRGBA, const uint8_t* data, int blockWidth, int blockHeight) +{ + const Block128 blockData(data); + + if (decompressBlock(pDstRGBA, blockData, blockWidth, blockHeight, false, false) != DECOMPRESS_RESULT_VALID_BLOCK) + { + return false; + } + + convert_to_half_prec(blockWidth * blockHeight * 4, pDstRGBA); + + return true; +} + +bool is_hdr(const uint8_t* data, int blockWidth, int blockHeight, bool &is_hdr) +{ + is_hdr = false; + + const Block128 blockData(data); + + int status = isHDR(blockData, blockWidth, blockHeight); + if (status < 0) + { + return false; + } + + is_hdr = (status == 1); + + return true; +} + +} // astc + +} // basisu_astc + +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif diff --git a/thirdparty/basis_universal/encoder/3rdparty/android_astc_decomp.h b/thirdparty/basis_universal/encoder/3rdparty/android_astc_decomp.h new file mode 100644 index 000000000000..ad13093a6c7d --- /dev/null +++ b/thirdparty/basis_universal/encoder/3rdparty/android_astc_decomp.h @@ -0,0 +1,45 @@ +// File: android_astc_decomp.h +#ifndef _TCUASTCUTIL_HPP +#define _TCUASTCUTIL_HPP 
+/*------------------------------------------------------------------------- + * drawElements Quality Program Tester Core + * ---------------------------------------- + * + * Copyright 2016 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + *//*! + * \file + * \brief ASTC Utilities. + *//*--------------------------------------------------------------------*/ + +#include +#include + +namespace basisu_astc +{ +namespace astc +{ + +// Unpacks a single ASTC block to pDst +// If isSRGB is true, the spec requires the decoder to scale the LDR 8-bit endpoints to 16-bit before interpolation slightly differently, +// which will lead to different outputs. So be sure to set it correctly (ideally it should match whatever the encoder did). 
+bool decompress_ldr(uint8_t* pDst, const uint8_t* data, bool isSRGB, int blockWidth, int blockHeight); +bool decompress_hdr(float* pDstRGBA, const uint8_t* data, int blockWidth, int blockHeight); +bool is_hdr(const uint8_t* data, int blockWidth, int blockHeight, bool& is_hdr); + +} // astc +} // basisu_astc + +#endif diff --git a/thirdparty/basis_universal/encoder/basisu_astc_hdr_enc.cpp b/thirdparty/basis_universal/encoder/basisu_astc_hdr_enc.cpp new file mode 100644 index 000000000000..d698a7ff872b --- /dev/null +++ b/thirdparty/basis_universal/encoder/basisu_astc_hdr_enc.cpp @@ -0,0 +1,3310 @@ +// basisu_astc_hdr_enc.cpp +#include "basisu_astc_hdr_enc.h" +#include "../transcoder/basisu_transcoder.h" + +using namespace basist; + +namespace basisu +{ +// Default red/green error scales; astc_hdr_codec_options::init() copies them into m_r_err_scale and m_g_err_scale. +const float DEF_R_ERROR_SCALE = 2.0f; +const float DEF_G_ERROR_SCALE = 3.0f; +// Maps a qlog bit width (7-12 or 16) to the matching MAX_QLOGn constant; any other width asserts and returns 0. +static inline uint32_t get_max_qlog(uint32_t bits) +{ + switch (bits) + { + case 7: return MAX_QLOG7; + case 8: return MAX_QLOG8; + case 9: return MAX_QLOG9; + case 10: return MAX_QLOG10; + case 11: return MAX_QLOG11; + case 12: return MAX_QLOG12; + case 16: return MAX_QLOG16; + default: assert(0); break; + } + return 0; +} + +#if 0 +static inline float get_max_qlog_val(uint32_t bits) +{ + switch (bits) + { + case 7: return MAX_QLOG7_VAL; + case 8: return MAX_QLOG8_VAL; + case 9: return MAX_QLOG9_VAL; + case 10: return MAX_QLOG10_VAL; + case 11: return MAX_QLOG11_VAL; + case 12: return MAX_QLOG12_VAL; + case 16: return MAX_QLOG16_VAL; + default: assert(0); break; + } + return 0; +} +#endif +// Extracts bit src_bit (0-31) of src_val and returns it as 0 or 1. +static inline int get_bit( + int src_val, int src_bit) +{ + assert(src_bit >= 0 && src_bit <= 31); + int bit = (src_val >> src_bit) & 1; + return bit; +} +// ORs bit src_bit of src_val into bit position dst_bit (0-31) of dst; bits already set in dst are never cleared. +static inline void pack_bit( + int& dst, int dst_bit, + int src_val, int src_bit = 0) +{ + assert(dst_bit >= 0 && dst_bit <= 31); + int bit = get_bit(src_val, src_bit); + dst |= (bit << dst_bit); +} +
+//-------------------------------------------------------------------------------------------------------------------------- + +astc_hdr_codec_options::astc_hdr_codec_options() +{ + init(); +} + +void astc_hdr_codec_options::init() +{ + m_bc6h_err_weight = .85f; + m_r_err_scale = DEF_R_ERROR_SCALE; + m_g_err_scale = DEF_G_ERROR_SCALE; + + // Disabling by default to avoid transcoding outliers (try kodim26). The quality lost is very low. TODO: Could include the uber result in the output. + m_allow_uber_mode = false; + + // Must set best quality level first to set defaults. + set_quality_best(); + + set_quality_level(cDefaultLevel); +} + +void astc_hdr_codec_options::set_quality_best() +{ + m_mode11_direct_only = false; + + // highest achievable quality + m_use_solid = true; + + m_use_mode11 = true; + m_mode11_uber_mode = true; + m_first_mode11_weight_ise_range = MODE11_FIRST_ISE_RANGE; + m_last_mode11_weight_ise_range = MODE11_LAST_ISE_RANGE; + m_first_mode11_submode = -1; + m_last_mode11_submode = 7; + + m_use_mode7_part1 = true; + m_first_mode7_part1_weight_ise_range = MODE7_PART1_FIRST_ISE_RANGE; + m_last_mode7_part1_weight_ise_range = MODE7_PART1_LAST_ISE_RANGE; + + m_use_mode7_part2 = true; + m_mode7_part2_part_masks = UINT32_MAX; + m_first_mode7_part2_weight_ise_range = MODE7_PART2_FIRST_ISE_RANGE; + m_last_mode7_part2_weight_ise_range = MODE7_PART2_LAST_ISE_RANGE; + + m_use_mode11_part2 = true; + m_mode11_part2_part_masks = UINT32_MAX; + m_first_mode11_part2_weight_ise_range = MODE11_PART2_FIRST_ISE_RANGE; + m_last_mode11_part2_weight_ise_range = MODE11_PART2_LAST_ISE_RANGE; + + m_refine_weights = true; + + m_use_estimated_partitions = false; + m_max_estimated_partitions = 0; +} + +void astc_hdr_codec_options::set_quality_normal() +{ + m_use_solid = true; + + // We'll allow uber mode in normal if the user allows it. 
+ m_use_mode11 = true; + m_mode11_uber_mode = true; + m_first_mode11_weight_ise_range = 6; + m_last_mode11_weight_ise_range = MODE11_LAST_ISE_RANGE; + + m_use_mode7_part1 = true; + m_first_mode7_part1_weight_ise_range = MODE7_PART1_LAST_ISE_RANGE; + m_last_mode7_part1_weight_ise_range = MODE7_PART1_LAST_ISE_RANGE; + + m_use_mode7_part2 = true; + m_mode7_part2_part_masks = UINT32_MAX; + m_first_mode7_part2_weight_ise_range = MODE7_PART2_LAST_ISE_RANGE; + m_last_mode7_part2_weight_ise_range = MODE7_PART2_LAST_ISE_RANGE; + + m_use_mode11_part2 = true; + m_mode11_part2_part_masks = UINT32_MAX; + m_first_mode11_part2_weight_ise_range = MODE11_PART2_LAST_ISE_RANGE; + m_last_mode11_part2_weight_ise_range = MODE11_PART2_LAST_ISE_RANGE; + + m_refine_weights = true; +} + +void astc_hdr_codec_options::set_quality_fastest() +{ + m_use_solid = true; + + m_use_mode11 = true; + m_mode11_uber_mode = false; + m_first_mode11_weight_ise_range = MODE11_LAST_ISE_RANGE; + m_last_mode11_weight_ise_range = MODE11_LAST_ISE_RANGE; + + m_use_mode7_part1 = false; + m_use_mode7_part2 = false; + m_use_mode11_part2 = false; + + m_refine_weights = false; +} + +//-------------------------------------------------------------------------------------------------------------------------- + +void astc_hdr_codec_options::set_quality_level(int level) +{ + level = clamp(level, cMinLevel, cMaxLevel); + + m_level = level; + + switch (level) + { + case 0: + { + set_quality_fastest(); + break; + } + case 1: + { + set_quality_normal(); + + m_first_mode11_weight_ise_range = MODE11_LAST_ISE_RANGE - 1; + m_last_mode11_weight_ise_range = MODE11_LAST_ISE_RANGE; + + m_use_mode7_part1 = false; + m_use_mode7_part2 = false; + + m_use_estimated_partitions = true; + m_max_estimated_partitions = 1; + + m_mode11_part2_part_masks = 1 | 2; + m_mode7_part2_part_masks = 1 | 2; + break; + } + case 2: + { + set_quality_normal(); + + m_use_estimated_partitions = true; + m_max_estimated_partitions = 2; + + 
m_mode11_part2_part_masks = 1 | 2; + m_mode7_part2_part_masks = 1 | 2; + + break; + } + case 3: + { + set_quality_best(); + + m_use_estimated_partitions = true; + m_max_estimated_partitions = 2; + + m_mode11_part2_part_masks = 1 | 2 | 4 | 8; + m_mode7_part2_part_masks = 1 | 2 | 4 | 8; + + break; + } + case 4: + { + set_quality_best(); + + break; + } + } +} + +//-------------------------------------------------------------------------------------------------------------------------- + +#if 0 +static inline half_float qlog12_to_half_slow(uint32_t qlog12) +{ + return qlog_to_half_slow(qlog12, 12); +} +#endif + +// max usable qlog8 value is 247, 248=inf, >=249 is nan +// max usable qlog7 value is 123, 124=inf, >=125 is nan + +// To go from a smaller qlog to a larger one, shift left by the difference in bit counts. + +//const uint32_t TOTAL_USABLE_QLOG8 = 248; // 0-247 are usable, 0=0, 247=60416.0, 246=55296.0 + +// for qlog7's shift left by 1 +//half_float g_qlog8_to_half[256]; +//float g_qlog8_to_float[256]; + +//half_float g_qlog12_to_half[4096]; +//float g_qlog12_to_float[4096]; + +static half_float g_qlog16_to_half[65536]; + +inline half_float qlog_to_half(uint32_t val, uint32_t bits) +{ + assert((bits >= 5) && (bits <= 16)); + assert(val < (1U << bits)); + return g_qlog16_to_half[val << (16 - bits)]; +} + +// nearest values given a positive half float value (only) +static uint16_t g_half_to_qlog7[32768], g_half_to_qlog8[32768], g_half_to_qlog9[32768], g_half_to_qlog10[32768], g_half_to_qlog11[32768], g_half_to_qlog12[32768]; + +const uint32_t HALF_TO_QLOG_TABS_BASE = 7; +static uint16_t* g_pHalf_to_qlog_tabs[8] = +{ + g_half_to_qlog7, + g_half_to_qlog8, + + g_half_to_qlog9, + g_half_to_qlog10, + + g_half_to_qlog11, + g_half_to_qlog12 +}; + +static inline uint32_t half_to_qlog7_12(half_float h, uint32_t bits) +{ + assert((bits >= HALF_TO_QLOG_TABS_BASE) && (bits <= 12)); + assert(h < 32768); + + return g_pHalf_to_qlog_tabs[bits - HALF_TO_QLOG_TABS_BASE][h]; +} + +#if 0 +// Input is the
low 11 bits of the qlog +// Returns the 10-bit mantissa of the half float value +static int qlog11_to_half_float_mantissa(int M) +{ + assert(M <= 0x7FF); + int Mt; + if (M < 512) + Mt = 3 * M; + else if (M >= 1536) + Mt = 5 * M - 2048; + else + Mt = 4 * M - 512; + return (Mt >> 3); +} +#endif + +// Input is the 10-bit mantissa of the half float value +// Output is the 11-bit qlog value +// Inverse of qlog11_to_half_float_mantissa() +static inline int half_float_mantissa_to_qlog11(int hf) +{ + int q0 = (hf * 8 + 2) / 3; + int q1 = (hf * 8 + 2048 + 4) / 5; + + if (q0 < 512) + return q0; + else if (q1 >= 1536) + return q1; + + int q2 = (hf * 8 + 512 + 2) / 4; + return q2; +} + +static inline int half_to_qlog16(int hf) +{ + // extract 5 bits exponent, which is carried through to qlog16 unchanged + const int exp = (hf >> 10) & 0x1F; + + // extract and invert the 10 bit mantissa to nearest qlog11 (should be lossless) + const int mantissa = half_float_mantissa_to_qlog11(hf & 0x3FF); + assert(mantissa <= 0x7FF); + + // Now combine to qlog16, which is what ASTC HDR interpolates using the [0-64] weights. 
+ uint32_t qlog16 = (exp << 11) | mantissa; + + // should be a lossless operation + assert(qlog16_to_half_slow(qlog16) == hf); + + return qlog16; +} + +static inline uint32_t quant_qlog16(uint32_t q16, uint32_t desired_bits) +{ + assert((desired_bits >= 7) && (desired_bits <= 12)); + assert(q16 <= 65535); + + const uint32_t shift = 16 - desired_bits; + uint32_t e = (q16 + (1U << (shift - 1U)) - 1U) >> shift; + + uint32_t max_val = (1U << desired_bits) - 1U; + e = minimum(e, max_val); + + return e; +} + +static void compute_half_to_qlog_table(uint32_t bits, uint16_t* pTable, const basisu::vector &qlog16_to_float) +{ + assert(bits >= 5 && bits <= 12); + const uint32_t max_val = (1 << bits) - 1; + + // For all positive half-floats + for (uint32_t h = 0; h < 32768; h++) + { + // Skip invalid values + if (is_half_inf_or_nan((half_float)h)) + continue; + const float desired_val = half_to_float((half_float)h); + + float best_err = 1e+30f; + uint32_t best_qlog = 0; + + // For all possible qlog's + for (uint32_t i = 0; i <= max_val; i++) + { + // Skip invalid values + float v = qlog16_to_float[i << (16 - bits)]; + if (std::isnan(v)) + continue; + + // Compute error + float err = fabs(v - desired_val); + + // Find best + if (err < best_err) + { + best_err = err; + best_qlog = i; + } + } + + pTable[h] = (uint16_t)best_qlog; + } + +#if 0 + uint32_t t = 0; + + const uint32_t nb = 12; + int nb_shift = 16 - nb; + + for (uint32_t q16 = 0; q16 < 65536; q16++) + { + half_float h = qlog16_to_half_slow(q16); + if (is_half_inf_or_nan(h)) + continue; + + int q7 = half_to_qlog7_12(h, nb); + + uint32_t best_err = UINT32_MAX, best_l = 0; + for (int l = 0; l < (1 << nb); l++) + { + int dec_q16 = l << nb_shift; + int err = iabs(dec_q16 - q16); + if (err < best_err) + { + best_err = err; + best_l = l; + } + } + + //int e = (q16 + 253) >> 9; // 345 + + int e = (q16 + (1 << (nb_shift - 1)) - 1) >> nb_shift; // 285 + if (best_l != e) + //if (q7 != best_l) + { + printf("q16=%u, h=%u, q7=%u, e=%u, 
best_l=%u\n", q16, h, q7, e, best_l); + t++; + } + } + + printf("Mismatches: %u\n", t); + exit(0); +#endif +} + +static void init_qlog_tables() +{ + basisu::vector qlog16_to_float(65536); + + // for all possible qlog16, compute the corresponding half float + for (uint32_t i = 0; i <= 65535; i++) + { + half_float h = qlog16_to_half_slow(i); + g_qlog16_to_half[i] = h; + + qlog16_to_float[i] = half_to_float(h); + } + + // for all possible half floats, find the nearest qlog5-12 float + for (uint32_t bits = HALF_TO_QLOG_TABS_BASE; bits <= 12; bits++) + { + compute_half_to_qlog_table(bits, g_pHalf_to_qlog_tabs[bits - HALF_TO_QLOG_TABS_BASE], qlog16_to_float); + } +} + +// [ise_range][0] = # levels +// [ise_range][1...] = lerp value [0,64] +// in ASTC order +// Supported ISE weight ranges: 0 to 10, 11 total +const uint32_t MIN_SUPPORTED_ISE_WEIGHT_INDEX = 1; // ISE 1=3 levels +const uint32_t MAX_SUPPORTED_ISE_WEIGHT_INDEX = 10; // ISE 10=24 levels + +static const uint8_t g_ise_weight_lerps[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][32] = +{ + { 0 }, // ise range=0 is invalid for 4x4 block sizes (<24 weight bits in the block) + { 3, 0, 32, 64 }, // 1 + { 4, 0, 21, 43, 64 }, // 2 + { 5, 0, 16, 32, 48, 64 }, // 3 + { 6, 0, 64, 12, 52, 25, 39 }, // 4 + { 8, 0, 9, 18, 27, 37, 46, 55, 64 }, // 5 + { 10, 0, 64, 7, 57, 14, 50, 21, 43, 28, 36 }, // 6 + { 12, 0, 64, 17, 47, 5, 59, 23, 41, 11, 53, 28, 36 }, // 7 + { 16, 0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64 }, // 8 + { 20, 0, 64, 16, 48, 3, 61, 19, 45, 6, 58, 23, 41, 9, 55, 26, 38, 13, 51, 29, 35 }, // 9 + { 24, 0, 64, 8, 56, 16, 48, 24, 40, 2, 62, 11, 53, 19, 45, 27, 37, 5, 59, 13, 51, 22, 42, 30, 34 } // 10 +}; + +//{ 12, 0, 64, 17, 47, 5, 59, 23, 41, 11, 53, 28, 36 }, // 7 +//static const uint8_t g_weight_order_7[12] = { 0, 4, 8, 2, 6, 10, 11, 7, 3, 9, 5, 1 }; + +static vec3F calc_mean(uint32_t num_pixels, const vec4F* pPixels) +{ + vec3F mean(0.0f); + + for (uint32_t i = 0; i < num_pixels; i++) + { + const vec4F& 
p = pPixels[i]; + + mean[0] += p[0]; + mean[1] += p[1]; + mean[2] += p[2]; + } + + return mean / static_cast(num_pixels); +} + +static vec3F calc_rgb_pca(uint32_t num_pixels, const vec4F* pPixels, const vec3F& mean_color) +{ + float cov[6] = { 0, 0, 0, 0, 0, 0 }; + + for (uint32_t i = 0; i < num_pixels; i++) + { + const vec4F& v = pPixels[i]; + + float r = v[0] - mean_color[0]; + float g = v[1] - mean_color[1]; + float b = v[2] - mean_color[2]; + + cov[0] += r * r; + cov[1] += r * g; + cov[2] += r * b; + cov[3] += g * g; + cov[4] += g * b; + cov[5] += b * b; + } + + float xr = .9f, xg = 1.0f, xb = .7f; + for (uint32_t iter = 0; iter < 3; iter++) + { + float r = xr * cov[0] + xg * cov[1] + xb * cov[2]; + float g = xr * cov[1] + xg * cov[3] + xb * cov[4]; + float b = xr * cov[2] + xg * cov[4] + xb * cov[5]; + + float m = maximumf(maximumf(fabsf(r), fabsf(g)), fabsf(b)); + + if (m > 1e-10f) + { + m = 1.0f / m; + + r *= m; + g *= m; + b *= m; + } + + xr = r; + xg = g; + xb = b; + } + + float len = xr * xr + xg * xg + xb * xb; + + vec3F axis; + if (len < 1e-10f) + axis.set(0.0f); + else + { + len = 1.0f / sqrtf(len); + + xr *= len; + xg *= len; + xb *= len; + + axis.set(xr, xg, xb, 0); + } + + if (axis.dot(axis) < .5f) + { + axis.set(1.0f, 1.0f, 1.0f, 0.0f); + axis.normalize_in_place(); + } + + return axis; +} + +static vec3F interp_color(const vec3F& mean, const vec3F& dir, float df, const aabb3F& colorspace_box, const aabb3F& input_box, bool* pInside = nullptr) +{ +#if 0 + assert(mean[0] >= input_box[0][0]); + assert(mean[1] >= input_box[0][1]); + assert(mean[2] >= input_box[0][2]); + assert(mean[0] <= input_box[1][0]); + assert(mean[1] <= input_box[1][1]); + assert(mean[2] <= input_box[1][2]); +#endif + + if (pInside) + *pInside = false; + + vec3F k(mean + dir * df); + if (colorspace_box.contains(k)) + { + if (pInside) + *pInside = true; + + return k; + } + + // starts inside + vec3F s(mean); + + // ends outside + vec3F e(mean + dir * df); + + // a ray guaranteed to 
go from the outside to inside + ray3F r(e, (s - e).normalize_in_place()); + vec3F c; + float t = 0.0f; + + intersection::result res = intersection::ray_aabb(c, t, r, input_box); + if (res != intersection::cSuccess) + c = k; + + return c; +} + +// all in Q16 space, 0-65535 +static bool compute_least_squares_endpoints_rgb( + uint32_t N, const uint8_t* pSelectors, const vec4F* pSelector_weights, + vec3F* pXl, vec3F* pXh, const vec4F* pColors, const aabb3F& input_box) +{ + // Least squares using normal equations: http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf + // https://web.archive.org/web/20150319232457/http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf + // I did this in matrix form first, expanded out all the ops, then optimized it a bit. + float z00 = 0.0f, z01 = 0.0f, z10 = 0.0f, z11 = 0.0f; + float q00_r = 0.0f, q10_r = 0.0f, t_r = 0.0f; + float q00_g = 0.0f, q10_g = 0.0f, t_g = 0.0f; + float q00_b = 0.0f, q10_b = 0.0f, t_b = 0.0f; + + for (uint32_t i = 0; i < N; i++) + { + const uint32_t sel = pSelectors[i]; + z00 += pSelector_weights[sel][0]; + z10 += pSelector_weights[sel][1]; + z11 += pSelector_weights[sel][2]; + + float w = pSelector_weights[sel][3]; + q00_r += w * pColors[i][0]; + t_r += pColors[i][0]; + + q00_g += w * pColors[i][1]; + t_g += pColors[i][1]; + + q00_b += w * pColors[i][2]; + t_b += pColors[i][2]; + } + + q10_r = t_r - q00_r; + q10_g = t_g - q00_g; + q10_b = t_b - q00_b; + + z01 = z10; + + float det = z00 * z11 - z01 * z10; + if (det == 0.0f) + return false; + + det = 1.0f / det; + + float iz00, iz01, iz10, iz11; + iz00 = z11 * det; + iz01 = -z01 * det; + iz10 = -z10 * det; + iz11 = z00 * det; + + (*pXl)[0] = (float)(iz00 * q00_r + iz01 * q10_r); + (*pXh)[0] = (float)(iz10 * q00_r + iz11 * q10_r); + + (*pXl)[1] = (float)(iz00 * q00_g + iz01 * q10_g); + (*pXh)[1] = (float)(iz10 * q00_g + iz11 * q10_g); + + (*pXl)[2] = (float)(iz00 * q00_b + iz01 * q10_b); + (*pXh)[2] = (float)(iz10 * q00_b + iz11 * q10_b); 
+ + for (uint32_t c = 0; c < 3; c++) + { + float l = (*pXl)[c], h = (*pXh)[c]; + + if (input_box.get_dim(c) < .0000125f) + { + l = input_box[0][c]; + h = input_box[1][c]; + } + + (*pXl)[c] = l; + (*pXh)[c] = h; + } + + vec3F mean((*pXl + *pXh) * .5f); + vec3F dir(*pXh - *pXl); + + float ln = dir.length(); + if (ln) + { + dir /= ln; + + float ld = (*pXl - mean).dot(dir); + float hd = (*pXh - mean).dot(dir); + + aabb3F colorspace_box(vec3F(0.0f), vec3F(MAX_QLOG16_VAL)); + + bool was_inside1 = false; + + vec3F l = interp_color(mean, dir, ld, colorspace_box, input_box, &was_inside1); + if (!was_inside1) + *pXl = l; + + bool was_inside2 = false; + vec3F h = interp_color(mean, dir, hd, colorspace_box, input_box, &was_inside2); + if (!was_inside2) + *pXh = h; + } + + pXl->clamp(0.0f, MAX_QLOG16_VAL); + pXh->clamp(0.0f, MAX_QLOG16_VAL); + + return true; +} + +static vec4F g_astc_ls_weights_ise[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][24]; + +static uint8_t g_map_astc_to_linear_order[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][24]; // [ise_range][astc_index] -> linear index +static uint8_t g_map_linear_to_astc_order[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][24]; // [ise_range][linear_index] -> astc_index + +static void encode_astc_hdr_init() +{ + // Precomputed weight constants used during least fit determination. 
For each entry: w * w, (1.0f - w) * w, (1.0f - w) * (1.0f - w), w + for (uint32_t range = MIN_SUPPORTED_ISE_WEIGHT_INDEX; range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX; range++) + { + const uint32_t num_levels = g_ise_weight_lerps[range][0]; + assert((num_levels >= 3) && (num_levels <= 24)); + + for (uint32_t i = 0; i < num_levels; i++) + { + float w = g_ise_weight_lerps[range][1 + i] * (1.0f / 64.0f); + + g_astc_ls_weights_ise[range][i].set(w * w, (1.0f - w) * w, (1.0f - w) * (1.0f - w), w); + } + } + + for (uint32_t ise_range = MIN_SUPPORTED_ISE_WEIGHT_INDEX; ise_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX; ise_range++) + { + const uint32_t num_levels = g_ise_weight_lerps[ise_range][0]; + assert((num_levels >= 3) && (num_levels <= 24)); + + uint32_t s[32]; + for (uint32_t i = 0; i < num_levels; i++) + s[i] = (g_ise_weight_lerps[ise_range][1 + i] << 8) + i; + + std::sort(s, s + num_levels); + + for (uint32_t i = 0; i < num_levels; i++) + g_map_linear_to_astc_order[ise_range][i] = (uint8_t)(s[i] & 0xFF); + + for (uint32_t i = 0; i < num_levels; i++) + g_map_astc_to_linear_order[ise_range][g_map_linear_to_astc_order[ise_range][i]] = (uint8_t)i; + } +} + +void interpolate_qlog12_colors( + const int e[2][3], + half_float* pDecoded_half, + vec3F* pDecoded_float, + uint32_t n, uint32_t ise_weight_range) +{ + assert((ise_weight_range >= MIN_SUPPORTED_ISE_WEIGHT_INDEX) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX)); + + for (uint32_t i = 0; i < 2; i++) + { + for (uint32_t j = 0; j < 3; j++) + { + assert(in_range(e[i][j], 0, 0xFFF)); + } + } + + for (uint32_t i = 0; i < n; i++) + { + const int c = g_ise_weight_lerps[ise_weight_range][1 + i]; + assert(c == (int)astc_helpers::dequant_bise_weight(i, ise_weight_range)); + + half_float rf, gf, bf; + + { + uint32_t r0 = e[0][0] << 4; + uint32_t r1 = e[1][0] << 4; + int ri = (r0 * (64 - c) + r1 * c + 32) / 64; + rf = qlog16_to_half_slow(ri); + } + + { + uint32_t g0 = e[0][1] << 4; + uint32_t g1 = e[1][1] << 4; + int gi = (g0 * 
(64 - c) + g1 * c + 32) / 64; + gf = qlog16_to_half_slow(gi); + } + + { + uint32_t b0 = e[0][2] << 4; + uint32_t b1 = e[1][2] << 4; + int bi = (b0 * (64 - c) + b1 * c + 32) / 64; + bf = qlog16_to_half_slow(bi); + } + + if (pDecoded_half) + { + pDecoded_half[i * 3 + 0] = rf; + pDecoded_half[i * 3 + 1] = gf; + pDecoded_half[i * 3 + 2] = bf; + } + + if (pDecoded_float) + { + pDecoded_float[i][0] = half_to_float(rf); + pDecoded_float[i][1] = half_to_float(gf); + pDecoded_float[i][2] = half_to_float(bf); + } + } +} + +// decoded in ASTC order, not linear order +// return false if the ISE endpoint quantization leads to non-valid endpoints being decoded +bool get_astc_hdr_mode_11_block_colors( + const uint8_t* pEndpoints, + half_float* pDecoded_half, + vec3F* pDecoded_float, + uint32_t n, uint32_t ise_weight_range, uint32_t ise_endpoint_range) +{ + assert((ise_weight_range >= 1) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX)); + + int e[2][3]; + if (!decode_mode11_to_qlog12(pEndpoints, e, ise_endpoint_range)) + return false; + + interpolate_qlog12_colors(e, pDecoded_half, pDecoded_float, n, ise_weight_range); + + return true; +} + +// decoded in ASTC order, not linear order +// return false if the ISE endpoint quantization leads to non-valid endpoints being decoded +bool get_astc_hdr_mode_7_block_colors( + const uint8_t* pEndpoints, + half_float* pDecoded_half, + vec3F* pDecoded_float, + uint32_t n, uint32_t ise_weight_range, uint32_t ise_endpoint_range) +{ + assert((ise_weight_range >= 1) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX)); + + int e[2][3]; + if (!decode_mode7_to_qlog12(pEndpoints, e, nullptr, ise_endpoint_range)) + return false; + + interpolate_qlog12_colors(e, pDecoded_half, pDecoded_float, n, ise_weight_range); + + return true; +} + +// Fast high precision piecewise linear approximation of log2(bias+x). +// Half may be zero, positive or denormal. No NaN/Inf/negative. 
+static inline double q(half_float x) +{ + union { float f; int32_t i; uint32_t u; } fi; + + fi.f = fast_half_to_float_pos_not_inf_or_nan(x); + + assert(fi.f >= 0.0f); + + fi.f += .125f; + + return (double)fi.u; // approx log2f(fi.f), need to return double for the precision +} + +double eval_selectors( + uint32_t num_pixels, + uint8_t* pWeights, + const half_float* pBlock_pixels_half, + uint32_t num_weight_levels, + const half_float* pDecoded_half, + const astc_hdr_codec_options& coptions, + uint32_t usable_selector_bitmask) +{ + assert((num_pixels >= 1) && (num_pixels <= 16)); + assert(usable_selector_bitmask); + + const float R_WEIGHT = coptions.m_r_err_scale; + const float G_WEIGHT = coptions.m_g_err_scale; + + double total_error = 0; + +#ifdef _DEBUG + for (uint32_t i = 0; i < num_weight_levels; i++) + { + assert(!is_half_inf_or_nan(pDecoded_half[i * 3 + 0])); + assert(!is_half_inf_or_nan(pDecoded_half[i * 3 + 1])); + assert(!is_half_inf_or_nan(pDecoded_half[i * 3 + 2])); + } +#endif + + for (uint32_t p = 0; p < num_pixels; p++) + { + const half_float* pDesired_half = &pBlock_pixels_half[p * 3]; + + double lowest_e = 1e+30f; + + // this is an approximation of MSLE + for (uint32_t i = 0; i < num_weight_levels; i++) + { + if (((1 << i) & usable_selector_bitmask) == 0) + continue; + + // compute piecewise linear approximation of log2(a+eps)-log2(b+eps), for each component, then MSLE + double rd = q(pDecoded_half[i * 3 + 0]) - q(pDesired_half[0]); + double gd = q(pDecoded_half[i * 3 + 1]) - q(pDesired_half[1]); + double bd = q(pDecoded_half[i * 3 + 2]) - q(pDesired_half[2]); + + double e = R_WEIGHT * (rd * rd) + G_WEIGHT * (gd * gd) + bd * bd; + + if (e < lowest_e) + { + lowest_e = e; + pWeights[p] = (uint8_t)i; + } + } + + total_error += lowest_e; + + } // p + + return total_error; +} + +//-------------------------------------------------------------------------------------------------------------------------- + +double compute_block_error(const half_float* 
pOrig_block, const half_float* pPacked_block, const astc_hdr_codec_options& coptions) +{ + const float R_WEIGHT = coptions.m_r_err_scale; + const float G_WEIGHT = coptions.m_g_err_scale; + + double total_error = 0; + + for (uint32_t p = 0; p < 16; p++) + { + double rd = q(pOrig_block[p * 3 + 0]) - q(pPacked_block[p * 3 + 0]); + double gd = q(pOrig_block[p * 3 + 1]) - q(pPacked_block[p * 3 + 1]); + double bd = q(pOrig_block[p * 3 + 2]) - q(pPacked_block[p * 3 + 2]); + + double e = R_WEIGHT * (rd * rd) + G_WEIGHT * (gd * gd) + bd * bd; + + total_error += e; + } + + return total_error; +} + +//-------------------------------------------------------------------------------------------------------------------------- + +static inline int compute_clamped_val(int v, int l, int h, bool& did_clamp, int& max_clamp_mag) +{ + assert(l < h); + + if (v < l) + { + max_clamp_mag = basisu::maximum(max_clamp_mag, l - v); + + v = l; + did_clamp = true; + } + else if (v > h) + { + max_clamp_mag = basisu::maximum(max_clamp_mag, v - h); + + v = h; + did_clamp = true; + } + + return v; +} + +static bool pack_astc_mode11_submode(uint32_t submode, uint8_t* pEndpoints, const vec3F& low_q16, const vec3F& high_q16, int& max_clamp_mag) +{ + assert(submode <= 7); + + const uint8_t s_b_bits[8] = { 7, 8, 6, 7, 8, 6, 7, 6 }; + const uint8_t s_c_bits[8] = { 6, 6, 7, 7, 6, 7, 7, 7 }; + const uint8_t s_d_bits[8] = { 7, 6, 7, 6, 5, 6, 5, 6 }; + + const uint32_t a_bits = 9 + (submode >> 1); + const uint32_t b_bits = s_b_bits[submode]; + const uint32_t c_bits = s_c_bits[submode]; + const uint32_t d_bits = s_d_bits[submode]; + + const int max_a_val = (1 << a_bits) - 1; + const int max_b_val = (1 << b_bits) - 1; + const int max_c_val = (1 << c_bits) - 1; + + // The maximum usable value before it turns to NaN/Inf + const int max_a_qlog = get_max_qlog(a_bits); + + const int min_d_val = -(1 << (d_bits - 1)); + const int max_d_val = -min_d_val - 1; + assert((max_d_val - min_d_val + 1) == (1 << d_bits)); + + 
int val_q[2][3]; + + for (uint32_t c = 0; c < 3; c++) + { +#if 1 + // this is better + const half_float l = qlog16_to_half_slow((uint32_t)std::round(low_q16[c])); + val_q[0][c] = half_to_qlog7_12(l, a_bits); + + const half_float h = qlog16_to_half_slow((uint32_t)std::round(high_q16[c])); + val_q[1][c] = half_to_qlog7_12(h, a_bits); +#else + val_q[0][c] = quant_qlog16((uint32_t)std::round(low_q16[c]), a_bits); + val_q[1][c] = quant_qlog16((uint32_t)std::round(high_q16[c]), a_bits); +#endif + +#if 1 + if (val_q[0][c] == val_q[1][c]) + { +#if 0 + if (l <= h) +#else + if (low_q16[c] < high_q16[c]) +#endif + { + if (val_q[0][c]) + val_q[0][c]--; + + if (val_q[1][c] != max_a_val) + val_q[1][c]++; + } + else + { + if (val_q[0][c] != max_a_val) + val_q[0][c]++; + + if (val_q[1][c]) + val_q[1][c]--; + } + } +#endif + + val_q[0][c] = minimum(val_q[0][c], max_a_qlog); + val_q[1][c] = minimum(val_q[1][c], max_a_qlog); + } + + int highest_q = -1, highest_val = 0, highest_comp = 0; + + for (uint32_t v = 0; v < 2; v++) + { + for (uint32_t c = 0; c < 3; c++) + { + assert(val_q[v][c] >= 0 && val_q[v][c] <= max_a_val); + + if (val_q[v][c] > highest_q) + { + highest_q = val_q[v][c]; + highest_val = v; + highest_comp = c; + } + } + } + + const bool had_tie = (val_q[highest_val ^ 1][highest_comp] == highest_q); + + if (highest_val != 1) + { + for (uint32_t c = 0; c < 3; c++) + { + std::swap(val_q[0][c], val_q[1][c]); + } + } + + if (highest_comp) + { + std::swap(val_q[0][0], val_q[0][highest_comp]); + std::swap(val_q[1][0], val_q[1][highest_comp]); + } + + int orig_q[2][3]; + memcpy(orig_q, val_q, sizeof(val_q)); + + // val[1][0] is now guaranteed to be highest + int best_va = 0, best_vb0 = 0, best_vb1 = 0, best_vc = 0, best_vd0 = 0, best_vd1 = 0; + int best_max_clamp_mag = 0; + bool best_did_clamp = false; + int best_q[2][3] = { { 0, 0, 0}, { 0, 0, 0 } }; + BASISU_NOTE_UNUSED(best_q); + uint32_t best_dist = UINT_MAX; + + for (uint32_t pass = 0; pass < 2; pass++) + { + int trial_va = 
val_q[1][0]; + + assert(trial_va <= max_a_val); + assert(trial_va >= val_q[1][1]); + assert(trial_va >= val_q[1][2]); + + assert(trial_va >= val_q[0][0]); + assert(trial_va >= val_q[0][1]); + assert(trial_va >= val_q[0][2]); + + bool did_clamp = false; + int trial_max_clamp_mag = 0; + + int trial_vb0 = compute_clamped_val(trial_va - val_q[1][1], 0, max_b_val, did_clamp, trial_max_clamp_mag); + int trial_vb1 = compute_clamped_val(trial_va - val_q[1][2], 0, max_b_val, did_clamp, trial_max_clamp_mag); + int trial_vc = compute_clamped_val(trial_va - val_q[0][0], 0, max_c_val, did_clamp, trial_max_clamp_mag); + int trial_vd0 = compute_clamped_val((trial_va - trial_vb0 - trial_vc) - val_q[0][1], min_d_val, max_d_val, did_clamp, trial_max_clamp_mag); + int trial_vd1 = compute_clamped_val((trial_va - trial_vb1 - trial_vc) - val_q[0][2], min_d_val, max_d_val, did_clamp, trial_max_clamp_mag); + + if (!did_clamp) + { + // Make sure decoder gets the expected values + assert(trial_va == val_q[1][0]); + assert(trial_va - trial_vb0 == val_q[1][1]); + assert(trial_va - trial_vb1 == val_q[1][2]); + + assert((trial_va - trial_vc) == val_q[0][0]); + assert((trial_va - trial_vb0 - trial_vc - trial_vd0) == val_q[0][1]); + assert((trial_va - trial_vb1 - trial_vc - trial_vd1) == val_q[0][2]); + } + + const int r_e0 = clamp(trial_va, 0, max_a_val); + const int r_e1 = clamp(trial_va - trial_vb0, 0, max_a_val); + const int r_e2 = clamp(trial_va - trial_vb1, 0, max_a_val); + + const int r_f0 = clamp(trial_va - trial_vc, 0, max_a_val); + const int r_f1 = clamp(trial_va - trial_vb0 - trial_vc - trial_vd0, 0, max_a_val); + const int r_f2 = clamp(trial_va - trial_vb1 - trial_vc - trial_vd1, 0, max_a_val); + + assert(r_e0 <= max_a_qlog); + assert(r_e1 <= max_a_qlog); + assert(r_e2 <= max_a_qlog); + + assert(r_f0 <= max_a_qlog); + assert(r_f1 <= max_a_qlog); + assert(r_f2 <= max_a_qlog); + + if ((!did_clamp) || (!had_tie)) + { + best_va = trial_va; + best_vb0 = trial_vb0; + best_vb1 = trial_vb1; + 
best_vc = trial_vc; + best_vd0 = trial_vd0; + best_vd1 = trial_vd1; + best_max_clamp_mag = trial_max_clamp_mag; + best_did_clamp = did_clamp; + + best_q[1][0] = r_e0; + best_q[1][1] = r_e1; + best_q[1][2] = r_e2; + best_q[0][0] = r_f0; + best_q[0][1] = r_f1; + best_q[0][2] = r_f2; + break; + } + + // we had a tie and it did clamp, try swapping L/H for a potential slight gain + + const uint32_t r_dist1 = basisu::square(r_e0 - val_q[1][0]) + basisu::square(r_e1 - val_q[1][1]) + basisu::square(r_e2 - val_q[1][2]); + const uint32_t r_dist0 = basisu::square(r_f0 - val_q[0][0]) + basisu::square(r_f1 - val_q[0][1]) + basisu::square(r_f2 - val_q[0][2]); + + const uint32_t total_dist = r_dist1 + r_dist0; + + if (total_dist < best_dist) + { + best_dist = total_dist; + + best_va = trial_va; + best_vb0 = trial_vb0; + best_vb1 = trial_vb1; + best_vc = trial_vc; + best_vd0 = trial_vd0; + best_vd1 = trial_vd1; + best_did_clamp = did_clamp; + + best_q[1][0] = r_e0; + best_q[1][1] = r_e1; + best_q[1][2] = r_e2; + best_q[0][0] = r_f0; + best_q[0][1] = r_f1; + best_q[0][2] = r_f2; + } + + for (uint32_t c = 0; c < 3; c++) + std::swap(val_q[0][c], val_q[1][c]); + } + + // pack bits now + int v0 = 0, v1 = 0, v2 = 0, v3 = 0, v4 = 0, v5 = 0; + + int x0 = 0, x1 = 0, x2 = 0, x3 = 0, x4 = 0, x5 = 0; + switch (submode) + { + case 0: + x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_vd0, 6); x3 = get_bit(best_vd1, 6); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5); + break; + case 1: + x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_vb0, 7); x3 = get_bit(best_vb1, 7); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5); + break; + case 2: + x0 = get_bit(best_va, 9); x1 = get_bit(best_vc, 6); x2 = get_bit(best_vd0, 6); x3 = get_bit(best_vd1, 6); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5); + break; + case 3: + x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_va, 9); x3 = get_bit(best_vc, 6); x4 = 
get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5); + break; + case 4: + x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_vb0, 7); x3 = get_bit(best_vb1, 7); x4 = get_bit(best_va, 9); x5 = get_bit(best_va, 10); + break; + case 5: + x0 = get_bit(best_va, 9); x1 = get_bit(best_va, 10); x2 = get_bit(best_vc, 7); x3 = get_bit(best_vc, 6); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5); + break; + case 6: + x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_va, 11); x3 = get_bit(best_vc, 6); x4 = get_bit(best_va, 9); x5 = get_bit(best_va, 10); + break; + case 7: + x0 = get_bit(best_va, 9); x1 = get_bit(best_va, 10); x2 = get_bit(best_va, 11); x3 = get_bit(best_vc, 6); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5); + break; + default: + break; + } + + // write mode + pack_bit(v1, 7, submode, 0); + pack_bit(v2, 7, submode, 1); + pack_bit(v3, 7, submode, 2); + + // highest component + pack_bit(v4, 7, highest_comp, 0); + pack_bit(v5, 7, highest_comp, 1); + + // write bit 8 of va + pack_bit(v1, 6, best_va, 8); + + // extra bits + pack_bit(v2, 6, x0); + pack_bit(v3, 6, x1); + pack_bit(v4, 6, x2); + pack_bit(v5, 6, x3); + pack_bit(v4, 5, x4); + pack_bit(v5, 5, x5); + + v0 = best_va & 0xFF; + v1 |= (best_vc & 63); + v2 |= (best_vb0 & 63); + v3 |= (best_vb1 & 63); + v4 |= (best_vd0 & 31); + v5 |= (best_vd1 & 31); + + assert(in_range(v0, 0, 255) && in_range(v1, 0, 255) && in_range(v2, 0, 255) && in_range(v3, 0, 255) && in_range(v4, 0, 255) && in_range(v5, 0, 255)); + + pEndpoints[0] = (uint8_t)v0; + pEndpoints[1] = (uint8_t)v1; + pEndpoints[2] = (uint8_t)v2; + pEndpoints[3] = (uint8_t)v3; + pEndpoints[4] = (uint8_t)v4; + pEndpoints[5] = (uint8_t)v5; + +#ifdef _DEBUG + // Test for valid pack by unpacking + { + if (highest_comp) + { + std::swap(best_q[0][0], best_q[0][highest_comp]); + std::swap(best_q[1][0], best_q[1][highest_comp]); + + std::swap(orig_q[0][0], orig_q[0][highest_comp]); + std::swap(orig_q[1][0], 
orig_q[1][highest_comp]); + } + + int test_e[2][3]; + decode_mode11_to_qlog12(pEndpoints, test_e, astc_helpers::BISE_256_LEVELS); + for (uint32_t i = 0; i < 2; i++) + { + for (uint32_t j = 0; j < 3; j++) + { + assert(best_q[i][j] == test_e[i][j] >> (12 - a_bits)); + + if (!best_did_clamp) + { + assert((orig_q[i][j] == test_e[i][j] >> (12 - a_bits)) || + (orig_q[1 - i][j] == test_e[i][j] >> (12 - a_bits))); + } + } + } + } +#endif + + max_clamp_mag = best_max_clamp_mag; + + return best_did_clamp; +} + +//-------------------------------------------------------------------------------------------------------------------------- + +static void pack_astc_mode11_direct(uint8_t* pEndpoints, const vec3F& l_q16, const vec3F& h_q16) +{ + for (uint32_t i = 0; i < 3; i++) + { + // TODO: This goes from QLOG16->HALF->QLOG8/7 + half_float l_half = qlog16_to_half_slow(clamp((int)std::round(l_q16[i]), 0, 65535)); + half_float h_half = qlog16_to_half_slow(clamp((int)std::round(h_q16[i]), 0, 65535)); + + int l_q, h_q; + + if (i == 2) + { + l_q = g_half_to_qlog7[bounds_check((uint32_t)l_half, 0U, 32768U)]; + h_q = g_half_to_qlog7[bounds_check((uint32_t)h_half, 0U, 32768U)]; + + l_q = minimum(l_q, MAX_QLOG7); + h_q = minimum(h_q, MAX_QLOG7); + } + else + { + l_q = g_half_to_qlog8[bounds_check((uint32_t)l_half, 0U, 32768U)]; + h_q = g_half_to_qlog8[bounds_check((uint32_t)h_half, 0U, 32768U)]; + + l_q = minimum(l_q, MAX_QLOG8); + h_q = minimum(h_q, MAX_QLOG8); + } + +#if 1 + if (l_q == h_q) + { + const int m = (i == 2) ? 
MAX_QLOG7 : MAX_QLOG8; + + if (l_q16[i] <= h_q16[i]) + { + if (l_q) + l_q--; + + if (h_q != m) + h_q++; + } + else + { + if (h_q) + h_q--; + + if (l_q != m) + l_q++; + } + } +#endif + + if (i == 2) + { + assert(l_q <= (int)MAX_QLOG7 && h_q <= (int)MAX_QLOG7); + l_q |= 128; + h_q |= 128; + } + else + { + assert(l_q <= (int)MAX_QLOG8 && h_q <= (int)MAX_QLOG8); + } + + pEndpoints[2 * i + 0] = (uint8_t)l_q; + pEndpoints[2 * i + 1] = (uint8_t)h_q; + } +} + +//-------------------------------------------------------------------------------------------------------------------------- + +static bool pack_astc_mode7_submode(uint32_t submode, uint8_t* pEndpoints, const vec3F& rgb_q16, float s_q16, int& max_clamp_mag, uint32_t ise_weight_range) +{ + assert((ise_weight_range >= 1) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX)); + + assert(submode <= 5); + max_clamp_mag = 0; + + static const uint8_t s_r_bits[6] = { 11, 11, 10, 9, 8, 7 }; + static const uint8_t s_g_b_bits[6] = { 5, 6, 5, 6, 7, 7 }; + static const uint8_t s_s_bits[6] = { 7, 5, 8, 7, 6, 7 }; + + // The precision of the components + const uint32_t prec_bits = s_r_bits[submode]; + + int qlog[4], pack_bits[4]; + + for (uint32_t i = 0; i < 4; i++) + { + const float f = (i == 3) ? s_q16 : rgb_q16[i]; + + // The # of bits the component is packed into + if (i == 0) + pack_bits[i] = s_r_bits[submode]; + else if (i == 3) + pack_bits[i] = s_s_bits[submode]; + else + pack_bits[i] = s_g_b_bits[submode]; + +#if 0 + // this is slightly worse + // TODO: going from qlog16 to half loses some precision. Then going from half to qlog 7-12 will have extra error. 
+ half_float h = qlog_to_half(clamp((int)std::round(f), 0, MAX_QLOG16), 16); + qlog[i] = half_to_qlog7_12((half_float)bounds_check((uint32_t)h, 0U, 32768U), prec_bits); +#else + qlog[i] = quant_qlog16(clamp((int)std::round(f), 0, MAX_QLOG16), prec_bits); + + // Only bias if there are enough texel weights, 4=6 weights + if (ise_weight_range >= 4) + { + // Explicitly bias the high color, and the scale up, to better exploit the weights. + // The quantized range also then encompasses the complete input range. + const uint32_t max_val = (1 << prec_bits) - 1; + const uint32_t K = 3; + if (i == 3) + { + qlog[i] = minimum(qlog[i] + K * 2, max_val); + } + else + { + qlog[i] = minimum(qlog[i] + K, max_val); + } + } +#endif + + if (i != 3) + qlog[i] = minimum(qlog[i], get_max_qlog(prec_bits)); + + // If S=0, we lose freedom for the texel weights to add any value. + if ((i == 3) && (qlog[i] == 0)) + qlog[i] = 1; + } + + uint32_t maj_index = 0; + + bool did_clamp = false; + + if (submode != 5) + { + int largest_qlog = 0; + for (uint32_t i = 0; i < 3; i++) + { + if (qlog[i] > largest_qlog) + { + largest_qlog = qlog[i]; + maj_index = i; + } + } + + if (maj_index) + { + std::swap(qlog[0], qlog[maj_index]); + } + + assert(qlog[0] >= qlog[1]); + assert(qlog[0] >= qlog[2]); + + qlog[1] = qlog[0] - qlog[1]; + qlog[2] = qlog[0] - qlog[2]; + + for (uint32_t i = 1; i < 4; i++) + { + const int max_val = (1 << pack_bits[i]) - 1; + + if (qlog[i] > max_val) + { + max_clamp_mag = maximum(max_clamp_mag, qlog[i] - max_val); + qlog[i] = max_val; + did_clamp = true; + } + } + } + + for (uint32_t i = 0; i < 4; i++) + { + const int max_val = (1 << pack_bits[i]) - 1; (void)max_val; + + assert(qlog[i] <= max_val); + } + + int mode = 0; + + int r = qlog[0] & 63; // 6-bits + int g = qlog[1] & 31; // 5-bits + int b = qlog[2] & 31; // 5-bits + int s = qlog[3] & 31; // 5-bits + + int x0 = 0, x1 = 0, x2 = 0, x3 = 0, x4 = 0, x5 = 0, x6 = 0; + + switch (submode) + { + case 0: + { + mode = (maj_index << 2) | 0; 
+ assert((mode & 0xC) != 0xC); + + x0 = get_bit(qlog[0], 9); // R9 + x1 = get_bit(qlog[0], 8); // R8 + x2 = get_bit(qlog[0], 7); // R7 + x3 = get_bit(qlog[0], 10); // R10 + x4 = get_bit(qlog[0], 6); // R6 + x5 = get_bit(qlog[3], 6); // S6 + x6 = get_bit(qlog[3], 5); // S5 + break; + } + case 1: + { + mode = (maj_index << 2) | 1; + assert((mode & 0xC) != 0xC); + + x0 = get_bit(qlog[0], 8); // R8 + x1 = get_bit(qlog[1], 5); // G5 + x2 = get_bit(qlog[0], 7); // R7 + x3 = get_bit(qlog[2], 5); // B5 + x4 = get_bit(qlog[0], 6); // R6 + x5 = get_bit(qlog[0], 10); // R10 + x6 = get_bit(qlog[0], 9); // R9 + break; + } + case 2: + { + mode = (maj_index << 2) | 2; + assert((mode & 0xC) != 0xC); + + x0 = get_bit(qlog[0], 9); // R9 + x1 = get_bit(qlog[0], 8); // R8 + x2 = get_bit(qlog[0], 7); // R7 + x3 = get_bit(qlog[0], 6); // R6 + x4 = get_bit(qlog[3], 7); // S7 + x5 = get_bit(qlog[3], 6); // S6 + x6 = get_bit(qlog[3], 5); // S5 + break; + } + case 3: + { + mode = (maj_index << 2) | 3; + assert((mode & 0xC) != 0xC); + + x0 = get_bit(qlog[0], 8); // R8 + x1 = get_bit(qlog[1], 5); // G5 + x2 = get_bit(qlog[0], 7); // R7 + x3 = get_bit(qlog[2], 5); // B5 + x4 = get_bit(qlog[0], 6); // R6 + x5 = get_bit(qlog[3], 6); // S6 + x6 = get_bit(qlog[3], 5); // S5 + break; + } + case 4: + { + mode = maj_index | 0xC; // 0b1100 + assert((mode & 0xC) == 0xC); + assert(mode != 0xF); + + x0 = get_bit(qlog[1], 6); // G6 + x1 = get_bit(qlog[1], 5); // G5 + x2 = get_bit(qlog[2], 6); // B6 + x3 = get_bit(qlog[2], 5); // B5 + x4 = get_bit(qlog[0], 6); // R6 + x5 = get_bit(qlog[0], 7); // R7 + x6 = get_bit(qlog[3], 5); // S5 + break; + } + case 5: + { + mode = 0xF; + + x0 = get_bit(qlog[1], 6); // G6 + x1 = get_bit(qlog[1], 5); // G5 + x2 = get_bit(qlog[2], 6); // B6 + x3 = get_bit(qlog[2], 5); // B5 + x4 = get_bit(qlog[0], 6); // R6 + x5 = get_bit(qlog[3], 6); // S6 + x6 = get_bit(qlog[3], 5); // S5 + break; + } + default: + { + assert(0); + break; + } + } + + pEndpoints[0] = 
(uint8_t)((get_bit(mode, 1) << 7) | (get_bit(mode, 0) << 6) | r); + pEndpoints[1] = (uint8_t)((get_bit(mode, 2) << 7) | (x0 << 6) | (x1 << 5) | g); + pEndpoints[2] = (uint8_t)((get_bit(mode, 3) << 7) | (x2 << 6) | (x3 << 5) | b); + pEndpoints[3] = (uint8_t)((x4 << 7) | (x5 << 6) | (x6 << 5) | s); + +#ifdef _DEBUG + // Test for valid pack by unpacking + { + const int inv_shift = 12 - prec_bits; + + int unpacked_e[2][3]; + if (submode != 5) + { + unpacked_e[1][0] = left_shift32(qlog[0], inv_shift); + unpacked_e[1][1] = clamp(left_shift32((qlog[0] - qlog[1]), inv_shift), 0, 0xFFF); + unpacked_e[1][2] = clamp(left_shift32((qlog[0] - qlog[2]), inv_shift), 0, 0xFFF); + + unpacked_e[0][0] = clamp(left_shift32((qlog[0] - qlog[3]), inv_shift), 0, 0xFFF); + unpacked_e[0][1] = clamp(left_shift32(((qlog[0] - qlog[1]) - qlog[3]), inv_shift), 0, 0xFFF); + unpacked_e[0][2] = clamp(left_shift32(((qlog[0] - qlog[2]) - qlog[3]), inv_shift), 0, 0xFFF); + } + else + { + unpacked_e[1][0] = left_shift32(qlog[0], inv_shift); + unpacked_e[1][1] = left_shift32(qlog[1], inv_shift); + unpacked_e[1][2] = left_shift32(qlog[2], inv_shift); + + unpacked_e[0][0] = clamp(left_shift32((qlog[0] - qlog[3]), inv_shift), 0, 0xFFF); + unpacked_e[0][1] = clamp(left_shift32((qlog[1] - qlog[3]), inv_shift), 0, 0xFFF); + unpacked_e[0][2] = clamp(left_shift32((qlog[2] - qlog[3]), inv_shift), 0, 0xFFF); + } + + if (maj_index) + { + std::swap(unpacked_e[0][0], unpacked_e[0][maj_index]); + std::swap(unpacked_e[1][0], unpacked_e[1][maj_index]); + } + + int e[2][3]; + decode_mode7_to_qlog12_ise20(pEndpoints, e, nullptr); + + for (uint32_t i = 0; i < 3; i++) + { + assert(unpacked_e[0][i] == e[0][i]); + assert(unpacked_e[1][i] == e[1][i]); + } + } +#endif + + return did_clamp; +} + +//-------------------------------------------------------------------------------------------------------------------------- + +static void quantize_ise_endpoints(uint32_t ise_endpoint_range, const uint8_t* pSrc_endpoints, uint8_t 
*pDst_endpoints, uint32_t n) +{ + assert((ise_endpoint_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (ise_endpoint_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE)); + + if (ise_endpoint_range == astc_helpers::BISE_256_LEVELS) + { + memcpy(pDst_endpoints, pSrc_endpoints, n); + } + else + { + for (uint32_t i = 0; i < n; i++) + { + uint32_t v = pSrc_endpoints[i]; + assert(v <= 255); + + pDst_endpoints[i] = astc_helpers::g_dequant_tables.get_endpoint_tab(ise_endpoint_range).m_val_to_ise[v]; + } + } +} + +//-------------------------------------------------------------------------------------------------------------------------- + +// Note this could fail to find any valid solution if ise_endpoint_range!=20. +// Returns true if improved. +static bool try_mode11(uint32_t num_pixels, + uint8_t* pEndpoints, uint8_t* pWeights, double& cur_block_error, uint32_t& submode_used, + vec3F& low_color_q16, const vec3F& high_color_q16, + half_float block_pixels_half[16][3], + uint32_t num_weight_levels, uint32_t ise_weight_range, const astc_hdr_codec_options& coptions, bool direct_only, uint32_t ise_endpoint_range, + bool constrain_ise_weight8_selectors, + int32_t first_submode, int32_t last_submode) // -1, 7 +{ + assert((ise_weight_range >= 1) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX)); + assert((num_weight_levels >= 3) && (num_weight_levels <= 32)); + assert((num_pixels >= 1) && (num_pixels <= 16)); + + bool improved_flag = false; + + half_float decoded_half[32][3]; + vec3F decoded_float[32]; + uint8_t orig_trial_endpoints[NUM_MODE11_ENDPOINTS], trial_endpoints[NUM_MODE11_ENDPOINTS], trial_weights[16]; + + if (direct_only) + { + first_submode = -1; + last_submode = -1; + } + + assert(first_submode <= last_submode); + assert((first_submode >= -1) && (first_submode <= 7)); + assert((last_submode >= -1) && (last_submode <= 7)); + + // TODO: First determine if a submode doesn't clamp first. If one is found, encode to that and we're done. 
+ for (int submode = last_submode; submode >= first_submode; submode--) + { + bool did_clamp = false; + int max_clamp_mag = 0; + if (submode == -1) + { + // If it had to clamp with one of the submodes, try direct which can't clamp, but has low precision. + pack_astc_mode11_direct(orig_trial_endpoints, low_color_q16, high_color_q16); + } + else + { + did_clamp = pack_astc_mode11_submode(submode, orig_trial_endpoints, low_color_q16, high_color_q16, max_clamp_mag); + + // If it had to clamp and the clamp was too high, it'll distort the endpoint colors too much, which could lead to noticeable artifacts. + const int MAX_CLAMP_MAG_ACCEPT_THRESH = 4; + if ((did_clamp) && (max_clamp_mag > MAX_CLAMP_MAG_ACCEPT_THRESH)) + continue; + } + + // This will distort the endpoints if the ISE endpoint range isn't 256 levels (20). + // It could massively distort the endpoints, but still result in a valid encoding. + quantize_ise_endpoints(ise_endpoint_range, orig_trial_endpoints, trial_endpoints, NUM_MODE11_ENDPOINTS); + + if (!get_astc_hdr_mode_11_block_colors(trial_endpoints, &decoded_half[0][0], decoded_float, num_weight_levels, ise_weight_range, ise_endpoint_range)) + continue; + + uint32_t usable_selector_bitmask = UINT32_MAX; + if ((constrain_ise_weight8_selectors) && (ise_weight_range == astc_helpers::BISE_16_LEVELS)) + usable_selector_bitmask = (1 << 0) | (1 << 1) | (1 << 4) | (1 << 5) | (1 << 10) | (1 << 11) | (1 << 14) | (1 << 15); + + double trial_blk_error = eval_selectors(num_pixels, trial_weights, &block_pixels_half[0][0], num_weight_levels, &decoded_half[0][0], coptions, usable_selector_bitmask); + if (trial_blk_error < cur_block_error) + { + cur_block_error = trial_blk_error; + memcpy(pEndpoints, trial_endpoints, NUM_MODE11_ENDPOINTS); + memcpy(pWeights, trial_weights, num_pixels); + submode_used = submode + 1; + improved_flag = true; + } + + // If it didn't clamp it was a lossless encode at this precision, so we can stop early as there's probably no use trying lower 
precision submodes. + // (Although it may be, because a lower precision pack could try nearby voxel coords.) + // However, at lower levels quantization may cause the decoded endpoints to be very distorted, so we need to evaluate up to direct. + if (ise_endpoint_range == astc_helpers::BISE_256_LEVELS) + { + if (!did_clamp) + break; + } + } + + return improved_flag; +} + +//-------------------------------------------------------------------------------------------------------------------------- + +static bool try_mode7( + uint32_t num_pixels, + uint8_t* pEndpoints, uint8_t* pWeights, double& cur_block_error, uint32_t& submode_used, + vec3F& high_color_q16, const float s_q16, + half_float block_pixels_half[16][3], + uint32_t num_weight_levels, uint32_t ise_weight_range, const astc_hdr_codec_options& coptions, + uint32_t ise_endpoint_range) +{ + assert((ise_weight_range >= 1) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX)); + assert((num_pixels >= 1) && (num_pixels <= 16)); + + bool improved_flag = false; + + half_float decoded_half[24][3]; + vec3F decoded_float[24]; + + uint8_t orig_trial_endpoints[NUM_MODE7_ENDPOINTS], trial_endpoints[NUM_MODE7_ENDPOINTS], trial_weights[16]; + + // TODO: First determine if a submode doesn't clamp first. If one is found, encode to that and we're done. + for (int submode = 0; submode <= 5; submode++) + { + int max_clamp_mag = 0; + const bool did_clamp = pack_astc_mode7_submode(submode, orig_trial_endpoints, high_color_q16, s_q16, max_clamp_mag, ise_weight_range); + + if (submode < 5) + { + const int MAX_CLAMP_MAG_ACCEPT_THRESH = 4; + if ((did_clamp) && (max_clamp_mag > MAX_CLAMP_MAG_ACCEPT_THRESH)) + continue; + } + + // This will distort the endpoints if the ISE endpoint range isn't 256 levels (20). + // It could massively distort the endpoints, but still result in a valid encoding. 
+ quantize_ise_endpoints(ise_endpoint_range, orig_trial_endpoints, trial_endpoints, NUM_MODE7_ENDPOINTS); + + if (!get_astc_hdr_mode_7_block_colors(trial_endpoints, &decoded_half[0][0], decoded_float, num_weight_levels, ise_weight_range, ise_endpoint_range)) + continue; + + double trial_blk_error = eval_selectors(num_pixels, trial_weights, &block_pixels_half[0][0], num_weight_levels, &decoded_half[0][0], coptions); + if (trial_blk_error < cur_block_error) + { + cur_block_error = trial_blk_error; + memcpy(pEndpoints, trial_endpoints, NUM_MODE7_ENDPOINTS); + memcpy(pWeights, trial_weights, num_pixels); + submode_used = submode; + improved_flag = true; + } + + if (ise_endpoint_range == astc_helpers::BISE_256_LEVELS) + { + if (!did_clamp) + break; + } + } + + return improved_flag; +} + +//-------------------------------------------------------------------------------------------------------------------------- + +static double encode_astc_hdr_block_mode_11( + uint32_t num_pixels, + const vec4F* pBlock_pixels, + uint32_t ise_weight_range, + uint32_t& best_submode, + double cur_block_error, + uint8_t* blk_endpoints, uint8_t* blk_weights, + const astc_hdr_codec_options& coptions, + bool direct_only, + uint32_t ise_endpoint_range, + bool uber_mode, + bool constrain_ise_weight8_selectors, + int32_t first_submode, int32_t last_submode) +{ + assert((ise_weight_range >= 1) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX)); + assert((ise_endpoint_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (ise_endpoint_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE)); + assert((num_pixels >= 1) && (num_pixels <= 16)); + + best_submode = 0; + + half_float block_pixels_half[16][3]; + vec4F block_pixels_q16[16]; + + // TODO: This is done redundantly. 
+ for (uint32_t i = 0; i < num_pixels; i++) + { + block_pixels_half[i][0] = float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][0]); + block_pixels_q16[i][0] = (float)half_to_qlog16(block_pixels_half[i][0]); + + block_pixels_half[i][1] = float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][1]); + block_pixels_q16[i][1] = (float)half_to_qlog16(block_pixels_half[i][1]); + + block_pixels_half[i][2] = float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][2]); + block_pixels_q16[i][2] = (float)half_to_qlog16(block_pixels_half[i][2]); + + block_pixels_q16[i][3] = 0.0f; + } + + const uint32_t num_weight_levels = astc_helpers::get_ise_levels(ise_weight_range); + + // TODO: should match MAX_SUPPORTED_ISE_WEIGHT_INDEX + const uint32_t MAX_WEIGHT_LEVELS = 32; + (void)MAX_WEIGHT_LEVELS; + assert(num_weight_levels <= MAX_WEIGHT_LEVELS); + + vec3F block_mean_color_q16(calc_mean(num_pixels, block_pixels_q16)); + vec3F block_axis_q16(calc_rgb_pca(num_pixels, block_pixels_q16, block_mean_color_q16)); + + aabb3F color_box_q16(cInitExpand); + + float l = 1e+30f, h = -1e+30f; + vec3F low_color_q16, high_color_q16; + + for (uint32_t i = 0; i < num_pixels; i++) + { + color_box_q16.expand(block_pixels_q16[i]); + + vec3F k(vec3F(block_pixels_q16[i]) - block_mean_color_q16); + float kd = k.dot(block_axis_q16); + + if (kd < l) + { + l = kd; + low_color_q16 = block_pixels_q16[i]; + } + + if (kd > h) + { + h = kd; + high_color_q16 = block_pixels_q16[i]; + } + } + + vec3F old_low_color_q16(low_color_q16), old_high_color_q16(high_color_q16); + for (uint32_t i = 0; i < 3; i++) + { + low_color_q16[i] = lerp(old_low_color_q16[i], old_high_color_q16[i], 1.0f / 64.0f); + high_color_q16[i] = lerp(old_low_color_q16[i], old_high_color_q16[i], 63.0f / 64.0f); + } + + uint8_t trial_blk_endpoints[NUM_MODE11_ENDPOINTS]; + uint8_t trial_blk_weights[16]; + uint32_t trial_best_submode = 0; + + clear_obj(trial_blk_endpoints); + clear_obj(trial_blk_weights); + + double trial_blk_error = 1e+30f; + + bool did_improve = 
try_mode11(num_pixels, trial_blk_endpoints, trial_blk_weights, trial_blk_error, trial_best_submode, + low_color_q16, high_color_q16, + block_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight8_selectors, + first_submode, last_submode); + + // If we couldn't find ANY usable solution due to endpoint quantization, just return. There's nothing we can do. + if (!did_improve) + return cur_block_error; + + // Did the solution improve? + if (trial_blk_error < cur_block_error) + { + cur_block_error = trial_blk_error; + memcpy(blk_endpoints, trial_blk_endpoints, NUM_MODE11_ENDPOINTS); + memcpy(blk_weights, trial_blk_weights, num_pixels); + best_submode = trial_best_submode; + } + +#define USE_LEAST_SQUARES (1) +#if USE_LEAST_SQUARES + // least squares on the most promising trial weight indices found + const uint32_t NUM_LS_PASSES = 3; + + for (uint32_t pass = 0; pass < NUM_LS_PASSES; pass++) + { + vec3F l_q16, h_q16; + if (!compute_least_squares_endpoints_rgb(num_pixels, trial_blk_weights, &g_astc_ls_weights_ise[ise_weight_range][0], &l_q16, &h_q16, block_pixels_q16, color_box_q16)) + break; + + bool was_improved = try_mode11(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode, + l_q16, h_q16, + block_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight8_selectors, + first_submode, last_submode); + + if (!was_improved) + break; + + // It's improved, so let's take the new weight indices. + memcpy(trial_blk_weights, blk_weights, num_pixels); + + } // pass +#endif + + if (uber_mode) + { + // Try varying the current best weight indices. This can be expanded/improved, but at potentially great cost. 
+ + uint8_t temp_astc_weights[16]; + memcpy(temp_astc_weights, trial_blk_weights, num_pixels); + + uint32_t min_lin_sel = 256, max_lin_sel = 0; + for (uint32_t i = 0; i < num_pixels; i++) + { + const uint32_t astc_sel = temp_astc_weights[i]; + + const uint32_t lin_sel = g_map_astc_to_linear_order[ise_weight_range][astc_sel]; + assert(lin_sel < num_weight_levels); + + min_lin_sel = minimumu(min_lin_sel, lin_sel); + max_lin_sel = maximumu(max_lin_sel, lin_sel); + } + + bool was_improved = false; + (void)was_improved; + + { + bool weights_changed = false; + uint8_t trial_weights[16]; + for (uint32_t i = 0; i < num_pixels; i++) + { + uint32_t astc_sel = temp_astc_weights[i]; + uint32_t lin_sel = g_map_astc_to_linear_order[ise_weight_range][astc_sel]; + + if ((lin_sel == min_lin_sel) && (lin_sel < (num_weight_levels - 1))) + { + lin_sel++; + weights_changed = true; + } + + trial_weights[i] = g_map_linear_to_astc_order[ise_weight_range][lin_sel]; + } + + if (weights_changed) + { + vec3F l_q16, h_q16; + if (compute_least_squares_endpoints_rgb(num_pixels, trial_weights, &g_astc_ls_weights_ise[ise_weight_range][0], &l_q16, &h_q16, block_pixels_q16, color_box_q16)) + { + if (try_mode11(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode, + l_q16, h_q16, + block_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight8_selectors, + first_submode, last_submode)) + { + was_improved = true; + } + } + } + } + + { + bool weights_changed = false; + uint8_t trial_weights[16]; + for (uint32_t i = 0; i < num_pixels; i++) + { + uint32_t astc_sel = temp_astc_weights[i]; + uint32_t lin_sel = g_map_astc_to_linear_order[ise_weight_range][astc_sel]; + + if ((lin_sel == max_lin_sel) && (lin_sel > 0)) + { + lin_sel--; + weights_changed = true; + } + + trial_weights[i] = g_map_linear_to_astc_order[ise_weight_range][lin_sel]; + } + + if (weights_changed) + { + vec3F l_q16, h_q16; + if 
(compute_least_squares_endpoints_rgb(num_pixels, trial_weights, &g_astc_ls_weights_ise[ise_weight_range][0], &l_q16, &h_q16, block_pixels_q16, color_box_q16)) + { + if (try_mode11(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode, + l_q16, h_q16, + block_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight8_selectors, + first_submode, last_submode)) + { + was_improved = true; + } + } + } + } + + { + bool weights_changed = false; + uint8_t trial_weights[16]; + for (uint32_t i = 0; i < num_pixels; i++) + { + uint32_t astc_sel = temp_astc_weights[i]; + uint32_t lin_sel = g_map_astc_to_linear_order[ise_weight_range][astc_sel]; + + if ((lin_sel == max_lin_sel) && (lin_sel > 0)) + { + lin_sel--; + weights_changed = true; + } + else if ((lin_sel == min_lin_sel) && (lin_sel < (num_weight_levels - 1))) + { + lin_sel++; + weights_changed = true; + } + + trial_weights[i] = g_map_linear_to_astc_order[ise_weight_range][lin_sel]; + } + + if (weights_changed) + { + vec3F l_q16, h_q16; + if (compute_least_squares_endpoints_rgb(num_pixels, trial_weights, &g_astc_ls_weights_ise[ise_weight_range][0], &l_q16, &h_q16, block_pixels_q16, color_box_q16)) + { + if (try_mode11(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode, + l_q16, h_q16, + block_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight8_selectors, + first_submode, last_submode)) + { + was_improved = true; + } + } + } + } + } // uber_mode + + return cur_block_error; +} + +//-------------------------------------------------------------------------------------------------------------------------- + +static double encode_astc_hdr_block_mode_7( + uint32_t num_pixels, const vec4F* pBlock_pixels, + uint32_t ise_weight_range, + uint32_t& best_submode, + double cur_block_error, + uint8_t* blk_endpoints, //[4] + uint8_t* blk_weights, // [num_pixels] + const 
astc_hdr_codec_options& coptions, + uint32_t ise_endpoint_range) +{ + assert((num_pixels >= 1) && (num_pixels <= 16)); + assert((ise_weight_range >= 1) && (ise_weight_range <= 10)); + assert((ise_endpoint_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (ise_endpoint_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE)); + const uint32_t num_weight_levels = astc_helpers::get_ise_levels(ise_weight_range); + + const uint32_t MAX_WEIGHT_LEVELS = 24; + assert(num_weight_levels <= MAX_WEIGHT_LEVELS); + BASISU_NOTE_UNUSED(MAX_WEIGHT_LEVELS); + + best_submode = 0; + + half_float block_pixels_half[16][3]; + + vec4F block_pixels_q16[16]; + for (uint32_t i = 0; i < num_pixels; i++) + { + block_pixels_half[i][0] = float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][0]); + block_pixels_q16[i][0] = (float)half_to_qlog16(block_pixels_half[i][0]); + + block_pixels_half[i][1] = float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][1]); + block_pixels_q16[i][1] = (float)half_to_qlog16(block_pixels_half[i][1]); + + block_pixels_half[i][2] = float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][2]); + block_pixels_q16[i][2] = (float)half_to_qlog16(block_pixels_half[i][2]); + + block_pixels_q16[i][3] = 0.0f; + } + + vec3F block_mean_color_q16(calc_mean(num_pixels, block_pixels_q16)); + + vec3F block_axis_q16(0.577350259f); + + aabb3F color_box_q16(cInitExpand); + + float l = 1e+30f, h = -1e+30f; + for (uint32_t i = 0; i < num_pixels; i++) + { + color_box_q16.expand(block_pixels_q16[i]); + + vec3F k(vec3F(block_pixels_q16[i]) - block_mean_color_q16); + float kd = k.dot(block_axis_q16); + + l = basisu::minimum(l, kd); + h = basisu::maximum(h, kd); + } + + vec3F low_color_q16(interp_color(block_mean_color_q16, block_axis_q16, l, color_box_q16, color_box_q16)); + vec3F high_color_q16(interp_color(block_mean_color_q16, block_axis_q16, h, color_box_q16, color_box_q16)); + + low_color_q16.clamp(0.0f, MAX_QLOG16_VAL); + high_color_q16.clamp(0.0f, MAX_QLOG16_VAL); + + vec3F diff(high_color_q16 
- low_color_q16); + float s_q16 = diff.dot(block_axis_q16) * block_axis_q16[0]; + + uint8_t trial_blk_endpoints[NUM_MODE7_ENDPOINTS]; + uint8_t trial_blk_weights[16]; + uint32_t trial_best_submode = 0; + + clear_obj(trial_blk_endpoints); + clear_obj(trial_blk_weights); + + double trial_blk_error = 1e+30f; + + bool did_improve = try_mode7(num_pixels, trial_blk_endpoints, trial_blk_weights, trial_blk_error, trial_best_submode, + high_color_q16, ceilf(s_q16), + block_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range); + + // If we couldn't find ANY usable solution due to endpoint quantization, just return. There's nothing we can do. + if (!did_improve) + { + return cur_block_error; + } + + // Did the solution improve? + if (trial_blk_error < cur_block_error) + { + cur_block_error = trial_blk_error; + memcpy(blk_endpoints, trial_blk_endpoints, NUM_MODE7_ENDPOINTS); + memcpy(blk_weights, trial_blk_weights, num_pixels); + best_submode = trial_best_submode; + } + + const float one_over_num_pixels = 1.0f / (float)num_pixels; + + const uint32_t NUM_TRIALS = 2; + for (uint32_t trial = 0; trial < NUM_TRIALS; trial++) + { + // Given a set of selectors and S, try to compute a better high color + vec3F new_high_color_q16(block_mean_color_q16); + + int e[2][3]; + int cur_s = 0; + if (!decode_mode7_to_qlog12(trial_blk_endpoints, e, &cur_s, ise_endpoint_range)) + break; + + cur_s <<= 4; + + for (uint32_t i = 0; i < num_pixels; i++) + { + uint32_t astc_sel = trial_blk_weights[i]; + float lerp = g_ise_weight_lerps[ise_weight_range][astc_sel + 1] * (1.0f / 64.0f); + + float k = (float)cur_s * (1.0f - lerp) * one_over_num_pixels; + new_high_color_q16[0] += k; + new_high_color_q16[1] += k; + new_high_color_q16[2] += k; + } + + bool improved = try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode, + new_high_color_q16, (float)cur_s, + block_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range); + + if 
(improved) + { + memcpy(trial_blk_endpoints, blk_endpoints, NUM_MODE7_ENDPOINTS); + memcpy(trial_blk_weights, blk_weights, num_pixels); + } + + // Given a set of selectors and a high color, try to compute a better S. + float t = 0.0f; + + for (uint32_t i = 0; i < num_pixels; i++) + { + uint32_t astc_sel = trial_blk_weights[i]; + float lerp = g_ise_weight_lerps[ise_weight_range][astc_sel + 1] * (1.0f / 64.0f); + + t += (1.0f) - lerp; + } + + t *= one_over_num_pixels; + + //int e[2][3]; + if (!decode_mode7_to_qlog12(trial_blk_endpoints, e, nullptr, ise_endpoint_range)) + break; + + vec3F cur_h_q16((float)(e[1][0] << 4), (float)(e[1][1] << 4), (float)(e[1][2] << 4)); + + if (fabs(t) > .0000125f) + { + float s_r = (cur_h_q16[0] - block_mean_color_q16[0]) / t; + float s_g = (cur_h_q16[1] - block_mean_color_q16[1]) / t; + float s_b = (cur_h_q16[2] - block_mean_color_q16[2]) / t; + + // TODO: gather statistics on these + if (try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode, + cur_h_q16, ceilf(s_r), + block_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range)) + { + improved = true; + } + + if (try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode, + cur_h_q16, ceilf(s_g), + block_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range)) + { + improved = true; + } + + if (try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode, + cur_h_q16, ceilf(s_b), + block_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range)) + { + improved = true; + } + + if (try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode, + cur_h_q16, ceilf((s_r + s_g + s_b) / 3.0f), + block_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range)) + { + improved = true; + } + } + + if (!improved) + break; + + memcpy(trial_blk_endpoints, blk_endpoints, NUM_MODE7_ENDPOINTS); + memcpy(trial_blk_weights, 
blk_weights, num_pixels); + + } // trial + + return cur_block_error; +} + +//-------------------------------------------------------------------------------------------------------------------------- + +static bool pack_solid(const vec4F* pBlock_linear_colors, basisu::vector& all_results, const astc_hdr_codec_options& coptions) +{ + float r = 0.0f, g = 0.0f, b = 0.0f; + + const float LOG_BIAS = .125f; + + bool solid_block = true; + for (uint32_t i = 0; i < 16; i++) + { + if ((pBlock_linear_colors[0][0] != pBlock_linear_colors[i][0]) || + (pBlock_linear_colors[0][1] != pBlock_linear_colors[i][1]) || + (pBlock_linear_colors[0][2] != pBlock_linear_colors[i][2])) + { + solid_block = false; + } + + r += log2f(pBlock_linear_colors[i][0] + LOG_BIAS); + g += log2f(pBlock_linear_colors[i][1] + LOG_BIAS); + b += log2f(pBlock_linear_colors[i][2] + LOG_BIAS); + } + + if (solid_block) + { + r = pBlock_linear_colors[0][0]; + g = pBlock_linear_colors[0][1]; + b = pBlock_linear_colors[0][2]; + } + else + { + r = maximum(0.0f, powf(2.0f, r * (1.0f / 16.0f)) - LOG_BIAS); + g = maximum(0.0f, powf(2.0f, g * (1.0f / 16.0f)) - LOG_BIAS); + b = maximum(0.0f, powf(2.0f, b * (1.0f / 16.0f)) - LOG_BIAS); + + // for safety + r = minimum(r, MAX_HALF_FLOAT); + g = minimum(g, MAX_HALF_FLOAT); + b = minimum(b, MAX_HALF_FLOAT); + } + + half_float rh = float_to_half_non_neg_no_nan_inf(r), gh = float_to_half_non_neg_no_nan_inf(g), bh = float_to_half_non_neg_no_nan_inf(b), ah = float_to_half_non_neg_no_nan_inf(1.0f); + + astc_hdr_pack_results results; + results.clear(); + + uint8_t* packed_blk = (uint8_t*)&results.m_solid_blk; + results.m_is_solid = true; + + packed_blk[0] = 0b11111100; + packed_blk[1] = 255; + packed_blk[2] = 255; + packed_blk[3] = 255; + packed_blk[4] = 255; + packed_blk[5] = 255; + packed_blk[6] = 255; + packed_blk[7] = 255; + + packed_blk[8] = (uint8_t)rh; + packed_blk[9] = (uint8_t)(rh >> 8); + packed_blk[10] = (uint8_t)gh; + packed_blk[11] = (uint8_t)(gh >> 8); + 
packed_blk[12] = (uint8_t)bh; + packed_blk[13] = (uint8_t)(bh >> 8); + packed_blk[14] = (uint8_t)ah; + packed_blk[15] = (uint8_t)(ah >> 8); + + results.m_best_block_error = 0; + + if (!solid_block) + { + const float R_WEIGHT = coptions.m_r_err_scale; + const float G_WEIGHT = coptions.m_g_err_scale; + + // This MUST match how errors are computed in eval_selectors(). + for (uint32_t i = 0; i < 16; i++) + { + half_float dr = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][0]), dg = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][1]), db = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][2]); + double rd = q(rh) - q(dr); + double gd = q(gh) - q(dg); + double bd = q(bh) - q(db); + + double e = R_WEIGHT * (rd * rd) + G_WEIGHT * (gd * gd) + bd * bd; + + results.m_best_block_error += e; + } + } + + const half_float hc[3] = { rh, gh, bh }; + + bc6h_enc_block_solid_color(&results.m_bc6h_block, hc); + + all_results.push_back(results); + + return solid_block; +} + +//-------------------------------------------------------------------------------------------------------------------------- + +static void pack_mode11( + const vec4F* pBlock_linear_colors, + basisu::vector& all_results, + const astc_hdr_codec_options& coptions, + uint32_t first_weight_ise_range, uint32_t last_weight_ise_range, bool constrain_ise_weight8_selectors) +{ + uint8_t trial_endpoints[NUM_MODE11_ENDPOINTS], trial_weights[16]; + uint32_t trial_submode11 = 0; + + clear_obj(trial_endpoints); + clear_obj(trial_weights); + + for (uint32_t weight_ise_range = first_weight_ise_range; weight_ise_range <= last_weight_ise_range; weight_ise_range++) + { + const bool direct_only = coptions.m_mode11_direct_only; + + uint32_t endpoint_ise_range = astc_helpers::BISE_256_LEVELS; + if (weight_ise_range == astc_helpers::BISE_16_LEVELS) + endpoint_ise_range = astc_helpers::BISE_192_LEVELS; + else + { + assert(weight_ise_range < astc_helpers::BISE_16_LEVELS); + } + + double trial_error = 
encode_astc_hdr_block_mode_11(16, pBlock_linear_colors, weight_ise_range, trial_submode11, 1e+30f, trial_endpoints, trial_weights, coptions, direct_only, + endpoint_ise_range, coptions.m_mode11_uber_mode && (weight_ise_range >= astc_helpers::BISE_4_LEVELS) && coptions.m_allow_uber_mode, constrain_ise_weight8_selectors, coptions.m_first_mode11_submode, coptions.m_last_mode11_submode); + + if (trial_error < 1e+30f) + { + astc_hdr_pack_results results; + results.clear(); + + results.m_best_block_error = trial_error; + + results.m_best_submodes[0] = trial_submode11; + results.m_constrained_weights = constrain_ise_weight8_selectors; + + results.m_best_blk.m_num_partitions = 1; + results.m_best_blk.m_color_endpoint_modes[0] = 11; + results.m_best_blk.m_weight_ise_range = weight_ise_range; + results.m_best_blk.m_endpoint_ise_range = endpoint_ise_range; + + memcpy(results.m_best_blk.m_endpoints, trial_endpoints, NUM_MODE11_ENDPOINTS); + memcpy(results.m_best_blk.m_weights, trial_weights, 16); + +#ifdef _DEBUG + { + half_float block_pixels_half[16][3]; + + vec4F block_pixels_q16[16]; + for (uint32_t i = 0; i < 16; i++) + { + block_pixels_half[i][0] = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][0]); + block_pixels_half[i][1] = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][1]); + block_pixels_half[i][2] = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][2]); + } + + half_float unpacked_astc_blk_rgba[4][4][4]; + bool res = astc_helpers::decode_block(results.m_best_blk, unpacked_astc_blk_rgba, 4, 4, astc_helpers::cDecodeModeHDR16); + assert(res); + + half_float unpacked_astc_blk_rgb[4][4][3]; + for (uint32_t y = 0; y < 4; y++) + for (uint32_t x = 0; x < 4; x++) + for (uint32_t c = 0; c < 3; c++) + unpacked_astc_blk_rgb[y][x][c] = unpacked_astc_blk_rgba[y][x][c]; + + double cmp_err = compute_block_error(&block_pixels_half[0][0], &unpacked_astc_blk_rgb[0][0][0], coptions); + assert(results.m_best_block_error == cmp_err); + } +#endif + + // 
transcode to BC6H + assert(results.m_best_blk.m_color_endpoint_modes[0] == 11); + + // Get qlog12 endpoints + int e[2][3]; + bool success = decode_mode11_to_qlog12(results.m_best_blk.m_endpoints, e, results.m_best_blk.m_endpoint_ise_range); + assert(success); + BASISU_NOTE_UNUSED(success); + + // Transform endpoints to half float + half_float h_e[3][2] = + { + { qlog_to_half(e[0][0], 12), qlog_to_half(e[1][0], 12) }, + { qlog_to_half(e[0][1], 12), qlog_to_half(e[1][1], 12) }, + { qlog_to_half(e[0][2], 12), qlog_to_half(e[1][2], 12) } + }; + + // Transcode to bc6h + success = transcode_bc6h_1subset(h_e, results.m_best_blk, results.m_bc6h_block); + assert(success); + + all_results.push_back(results); + } + } +} + +//-------------------------------------------------------------------------------------------------------------------------- + +static void pack_mode7_single_part(const vec4F* pBlock_linear_colors, basisu::vector& all_results, const astc_hdr_codec_options& coptions) +{ + uint8_t trial_endpoints[NUM_MODE7_ENDPOINTS], trial_weights[16]; + uint32_t trial_submode7 = 0; + + clear_obj(trial_endpoints); + clear_obj(trial_weights); + + for (uint32_t weight_ise_range = coptions.m_first_mode7_part1_weight_ise_range; weight_ise_range <= coptions.m_last_mode7_part1_weight_ise_range; weight_ise_range++) + { + const uint32_t ise_endpoint_range = astc_helpers::BISE_256_LEVELS; + + double trial_error = encode_astc_hdr_block_mode_7(16, pBlock_linear_colors, weight_ise_range, trial_submode7, 1e+30f, trial_endpoints, trial_weights, coptions, ise_endpoint_range); + + if (trial_error < 1e+30f) + { + astc_hdr_pack_results results; + results.clear(); + + results.m_best_block_error = trial_error; + + results.m_best_submodes[0] = trial_submode7; + + results.m_best_blk.m_num_partitions = 1; + results.m_best_blk.m_color_endpoint_modes[0] = 7; + results.m_best_blk.m_weight_ise_range = weight_ise_range; + results.m_best_blk.m_endpoint_ise_range = ise_endpoint_range; + + 
memcpy(results.m_best_blk.m_endpoints, trial_endpoints, NUM_MODE7_ENDPOINTS); + memcpy(results.m_best_blk.m_weights, trial_weights, 16); + + // transcode to BC6H + assert(results.m_best_blk.m_color_endpoint_modes[0] == 7); + + // Get qlog12 endpoints + int e[2][3]; + if (!decode_mode7_to_qlog12(results.m_best_blk.m_endpoints, e, nullptr, results.m_best_blk.m_endpoint_ise_range)) + continue; + + // Transform endpoints to half float + half_float h_e[3][2] = + { + { qlog_to_half(e[0][0], 12), qlog_to_half(e[1][0], 12) }, + { qlog_to_half(e[0][1], 12), qlog_to_half(e[1][1], 12) }, + { qlog_to_half(e[0][2], 12), qlog_to_half(e[1][2], 12) } + }; + + // Transcode to bc6h + bool status = transcode_bc6h_1subset(h_e, results.m_best_blk, results.m_bc6h_block); + assert(status); + (void)status; + + all_results.push_back(results); + } + } +} + +//-------------------------------------------------------------------------------------------------------------------------- + +static bool estimate_partition2(const vec4F* pBlock_pixels, int* pBest_parts, uint32_t num_best_parts) +{ + assert(num_best_parts <= basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2); + + vec3F training_vecs[16], mean(0.0f); + + for (uint32_t i = 0; i < 16; i++) + { + vec3F& v = training_vecs[i]; + + v[0] = (float)float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][0]); + v[1] = (float)float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][1]); + v[2] = (float)float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][2]); + + mean += v; + } + mean *= (1.0f / 16.0f); + + vec3F cluster_centroids[2] = { mean - vec3F(.1f), mean + vec3F(.1f) }; + + uint32_t cluster_pixels[2][16]; + uint32_t num_cluster_pixels[2]; + vec3F new_cluster_means[2]; + + for (uint32_t s = 0; s < 4; s++) + { + num_cluster_pixels[0] = 0; + num_cluster_pixels[1] = 0; + + new_cluster_means[0].clear(); + new_cluster_means[1].clear(); + + for (uint32_t i = 0; i < 16; i++) + { + float d0 = training_vecs[i].squared_distance(cluster_centroids[0]); + float d1 = 
training_vecs[i].squared_distance(cluster_centroids[1]); + + if (d0 < d1) + { + cluster_pixels[0][num_cluster_pixels[0]] = i; + new_cluster_means[0] += training_vecs[i]; + num_cluster_pixels[0]++; + } + else + { + cluster_pixels[1][num_cluster_pixels[1]] = i; + new_cluster_means[1] += training_vecs[i]; + num_cluster_pixels[1]++; + } + } + + if (!num_cluster_pixels[0] || !num_cluster_pixels[1]) + return false; + + cluster_centroids[0] = new_cluster_means[0] / (float)num_cluster_pixels[0]; + cluster_centroids[1] = new_cluster_means[1] / (float)num_cluster_pixels[1]; + } + + int desired_parts[4][4]; // [y][x] + for (uint32_t p = 0; p < 2; p++) + { + for (uint32_t i = 0; i < num_cluster_pixels[p]; i++) + { + const uint32_t pix_index = cluster_pixels[p][i]; + + desired_parts[pix_index >> 2][pix_index & 3] = p; + } + } + + uint32_t part_similarity[basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2]; + + for (uint32_t part_index = 0; part_index < basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2; part_index++) + { + const uint32_t bc7_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_bc7; + + int total_sim_non_inv = 0; + int total_sim_inv = 0; + + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + int part = basist::g_bc7_partition2[16 * bc7_pattern + x + y * 4]; + + if (part == desired_parts[y][x]) + total_sim_non_inv++; + + if ((part ^ 1) == desired_parts[y][x]) + total_sim_inv++; + } + } + + int total_sim = maximum(total_sim_non_inv, total_sim_inv); + + part_similarity[part_index] = (total_sim << 8) | part_index; + + } // part_index; + + std::sort(part_similarity, part_similarity + basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2); + + for (uint32_t i = 0; i < num_best_parts; i++) + pBest_parts[i] = part_similarity[(basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2 - 1) - i] & 0xFF; + + return true; +} + +//-------------------------------------------------------------------------------------------------------------------------- + +static void 
pack_mode7_2part(const vec4F* pBlock_linear_colors, basisu::vector& all_results, const astc_hdr_codec_options& coptions, + int num_estimated_partitions, const int *pEstimated_partitions, + uint32_t first_weight_ise_range, uint32_t last_weight_ise_range) +{ + assert(coptions.m_mode7_part2_part_masks); + + astc_helpers::log_astc_block trial_blk; + clear_obj(trial_blk); + trial_blk.m_grid_width = 4; + trial_blk.m_grid_height = 4; + + trial_blk.m_num_partitions = 2; + trial_blk.m_color_endpoint_modes[0] = 7; + trial_blk.m_color_endpoint_modes[1] = 7; + + uint32_t first_part_index = 0, last_part_index = basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2; + + if (num_estimated_partitions) + { + first_part_index = 0; + last_part_index = num_estimated_partitions; + } + + for (uint32_t part_index_iter = first_part_index; part_index_iter < last_part_index; ++part_index_iter) + { + uint32_t part_index; + if (num_estimated_partitions) + { + part_index = pEstimated_partitions[part_index_iter]; + assert(part_index < basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2); + } + else + { + part_index = part_index_iter; + if (((1U << part_index) & coptions.m_mode7_part2_part_masks) == 0) + continue; + } + + const uint32_t astc_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_astc; + const uint32_t bc7_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_bc7; + const bool invert_flag = basist::g_astc_bc7_common_partitions2[part_index].m_invert; + + vec4F part_pixels[2][16]; + uint32_t pixel_part_index[4][4]; // [y][x] + uint32_t num_part_pixels[2] = { 0, 0 }; + + // Extract each subset's texels for this partition pattern + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + uint32_t part = basist::g_bc7_partition2[16 * bc7_pattern + x + y * 4]; + if (invert_flag) + part = 1 - part; + + pixel_part_index[y][x] = part; + part_pixels[part][num_part_pixels[part]] = pBlock_linear_colors[x + y * 4]; + + num_part_pixels[part]++; + } + } + + 
trial_blk.m_partition_id = astc_pattern; + + for (uint32_t weight_ise_range = first_weight_ise_range; weight_ise_range <= last_weight_ise_range; weight_ise_range++) + { + assert(weight_ise_range <= astc_helpers::BISE_8_LEVELS); + + uint32_t ise_endpoint_range = astc_helpers::BISE_256_LEVELS; + if (weight_ise_range == astc_helpers::BISE_5_LEVELS) + ise_endpoint_range = astc_helpers::BISE_192_LEVELS; + else if (weight_ise_range == astc_helpers::BISE_6_LEVELS) + ise_endpoint_range = astc_helpers::BISE_128_LEVELS; + else if (weight_ise_range == astc_helpers::BISE_8_LEVELS) + ise_endpoint_range = astc_helpers::BISE_80_LEVELS; + + uint8_t trial_endpoints[2][NUM_MODE7_ENDPOINTS], trial_weights[2][16]; + uint32_t trial_submode7[2]; + + clear_obj(trial_endpoints); + clear_obj(trial_weights); + clear_obj(trial_submode7); + + double total_trial_err = 0; + for (uint32_t pack_part_index = 0; pack_part_index < 2; pack_part_index++) + { + total_trial_err += encode_astc_hdr_block_mode_7( + num_part_pixels[pack_part_index], &part_pixels[pack_part_index][0], + weight_ise_range, trial_submode7[pack_part_index], 1e+30f, + &trial_endpoints[pack_part_index][0], &trial_weights[pack_part_index][0], coptions, ise_endpoint_range); + + } // pack_part_index + + if (total_trial_err < 1e+30f) + { + trial_blk.m_weight_ise_range = weight_ise_range; + trial_blk.m_endpoint_ise_range = ise_endpoint_range; + + for (uint32_t pack_part_index = 0; pack_part_index < 2; pack_part_index++) + memcpy(&trial_blk.m_endpoints[pack_part_index * NUM_MODE7_ENDPOINTS], &trial_endpoints[pack_part_index][0], NUM_MODE7_ENDPOINTS); + + uint32_t src_pixel_index[2] = { 0, 0 }; + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + uint32_t p = pixel_part_index[y][x]; + trial_blk.m_weights[x + y * 4] = trial_weights[p][src_pixel_index[p]++]; + } + } + + astc_hdr_pack_results results; + results.clear(); + + results.m_best_block_error = total_trial_err; + results.m_best_submodes[0] = 
trial_submode7[0]; + results.m_best_submodes[1] = trial_submode7[1]; + results.m_best_pat_index = part_index; + + results.m_best_blk = trial_blk; + + bool status = transcode_bc6h_2subsets(part_index, results.m_best_blk, results.m_bc6h_block); + assert(status); + BASISU_NOTE_UNUSED(status); + + all_results.push_back(results); + } + + } // weight_ise_range + + } // part_index +} + +//-------------------------------------------------------------------------------------------------------------------------- + +static void pack_mode11_2part(const vec4F* pBlock_linear_colors, basisu::vector& all_results, const astc_hdr_codec_options& coptions, + int num_estimated_partitions, const int* pEstimated_partitions) +{ + assert(coptions.m_mode11_part2_part_masks); + + astc_helpers::log_astc_block trial_blk; + clear_obj(trial_blk); + trial_blk.m_grid_width = 4; + trial_blk.m_grid_height = 4; + + trial_blk.m_num_partitions = 2; + trial_blk.m_color_endpoint_modes[0] = 11; + trial_blk.m_color_endpoint_modes[1] = 11; + + uint32_t first_part_index = 0, last_part_index = basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2; + + if (num_estimated_partitions) + { + first_part_index = 0; + last_part_index = num_estimated_partitions; + } + + for (uint32_t part_index_iter = first_part_index; part_index_iter < last_part_index; ++part_index_iter) + { + uint32_t part_index; + if (num_estimated_partitions) + { + part_index = pEstimated_partitions[part_index_iter]; + assert(part_index < basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2); + } + else + { + part_index = part_index_iter; + if (((1U << part_index) & coptions.m_mode11_part2_part_masks) == 0) + continue; + } + + const uint32_t astc_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_astc; + const uint32_t bc7_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_bc7; + const bool invert_flag = basist::g_astc_bc7_common_partitions2[part_index].m_invert; + + vec4F part_pixels[2][16]; + uint32_t pixel_part_index[4][4]; // [y][x] + 
uint32_t num_part_pixels[2] = { 0, 0 }; + + // Extract each subset's texels for this partition pattern + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + uint32_t part = basist::g_bc7_partition2[16 * bc7_pattern + x + y * 4]; + if (invert_flag) + part = 1 - part; + + pixel_part_index[y][x] = part; + part_pixels[part][num_part_pixels[part]] = pBlock_linear_colors[x + y * 4]; + + num_part_pixels[part]++; + } + } + + trial_blk.m_partition_id = astc_pattern; + + for (uint32_t weight_ise_range = coptions.m_first_mode11_part2_weight_ise_range; weight_ise_range <= coptions.m_last_mode11_part2_weight_ise_range; weight_ise_range++) + { + bool direct_only = false; + uint32_t ise_endpoint_range = astc_helpers::BISE_64_LEVELS; + if (weight_ise_range == astc_helpers::BISE_4_LEVELS) + ise_endpoint_range = astc_helpers::BISE_40_LEVELS; + + uint8_t trial_endpoints[2][NUM_MODE11_ENDPOINTS], trial_weights[2][16]; + uint32_t trial_submode11[2]; + + clear_obj(trial_endpoints); + clear_obj(trial_weights); + clear_obj(trial_submode11); + + double total_trial_err = 0; + for (uint32_t pack_part_index = 0; pack_part_index < 2; pack_part_index++) + { + total_trial_err += encode_astc_hdr_block_mode_11( + num_part_pixels[pack_part_index], &part_pixels[pack_part_index][0], + weight_ise_range, trial_submode11[pack_part_index], 1e+30f, + &trial_endpoints[pack_part_index][0], &trial_weights[pack_part_index][0], coptions, + direct_only, ise_endpoint_range, coptions.m_mode11_uber_mode && (weight_ise_range >= astc_helpers::BISE_4_LEVELS) && coptions.m_allow_uber_mode, false, + coptions.m_first_mode11_submode, coptions.m_last_mode11_submode); + + } // pack_part_index + + if (total_trial_err < 1e+30f) + { + trial_blk.m_weight_ise_range = weight_ise_range; + trial_blk.m_endpoint_ise_range = ise_endpoint_range; + + for (uint32_t pack_part_index = 0; pack_part_index < 2; pack_part_index++) + memcpy(&trial_blk.m_endpoints[pack_part_index * NUM_MODE11_ENDPOINTS], 
&trial_endpoints[pack_part_index][0], NUM_MODE11_ENDPOINTS); + + uint32_t src_pixel_index[2] = { 0, 0 }; + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + uint32_t p = pixel_part_index[y][x]; + trial_blk.m_weights[x + y * 4] = trial_weights[p][src_pixel_index[p]++]; + } + } + + astc_hdr_pack_results results; + results.clear(); + + results.m_best_block_error = total_trial_err; + results.m_best_submodes[0] = trial_submode11[0]; + results.m_best_submodes[1] = trial_submode11[1]; + results.m_best_pat_index = part_index; + + results.m_best_blk = trial_blk; + + bool status = transcode_bc6h_2subsets(part_index, results.m_best_blk, results.m_bc6h_block); + assert(status); + BASISU_NOTE_UNUSED(status); + + all_results.push_back(results); + } + + } // weight_ise_range + + } // part_index +} + +//-------------------------------------------------------------------------------------------------------------------------- + +bool g_astc_hdr_enc_initialized; + +void astc_hdr_enc_init() +{ + if (g_astc_hdr_enc_initialized) + return; + + astc_hdr_core_init(); + + astc_helpers::init_tables(true); + + init_qlog_tables(); + + encode_astc_hdr_init(); + + g_astc_hdr_enc_initialized = true; +} + +bool astc_hdr_enc_block( + const float* pRGBPixels, + const astc_hdr_codec_options& coptions, + basisu::vector& all_results) +{ + assert(g_astc_hdr_enc_initialized); + if (!g_astc_hdr_enc_initialized) + { + // astc_hdr_enc_init() MUST be called first. + assert(0); + return false; + } + + all_results.resize(0); + + vec4F block_linear_colors[16]; + + // Sanity check the input block. + for (uint32_t i = 0; i < 16; i++) + { + for (uint32_t j = 0; j < 3; j++) + { + float v = pRGBPixels[i * 3 + j]; + + if (std::isinf(v) || std::isnan(v)) + { + // Input pixels cannot be NaN or +-Inf. + assert(0); + return false; + } + + if (v < 0.0f) + { + // Input pixels cannot be signed. + assert(0); + return false; + } + + if (v > MAX_HALF_FLOAT) + { + // Too large for half float. 
+ assert(0); + return false; + } + + block_linear_colors[i][j] = v; + } + + block_linear_colors[i][3] = 1.0f; + } + + assert(coptions.m_use_solid || coptions.m_use_mode11 || coptions.m_use_mode7_part2 || coptions.m_use_mode7_part1 || coptions.m_use_mode11_part2); + + bool is_solid = false; + if (coptions.m_use_solid) + is_solid = pack_solid(block_linear_colors, all_results, coptions); + + if (!is_solid) + { + if (coptions.m_use_mode11) + { + const size_t cur_num_results = all_results.size(); + + pack_mode11(block_linear_colors, all_results, coptions, coptions.m_first_mode11_weight_ise_range, coptions.m_last_mode11_weight_ise_range, false); + + if (coptions.m_last_mode11_weight_ise_range == astc_helpers::BISE_16_LEVELS) + { + pack_mode11(block_linear_colors, all_results, coptions, astc_helpers::BISE_16_LEVELS, astc_helpers::BISE_16_LEVELS, true); + } + + // If we couldn't get any mode 11 results at all, and we were restricted to just trying weight ISE range 8 (which required endpoint quantization) then + // fall back to weight ISE range 7 (which doesn't need any endpoint quantization). + // This is to guarantee we always get at least 1 non-solid result. + if (all_results.size() == cur_num_results) + { + if (coptions.m_first_mode11_weight_ise_range == astc_helpers::BISE_16_LEVELS) + { + pack_mode11(block_linear_colors, all_results, coptions, astc_helpers::BISE_12_LEVELS, astc_helpers::BISE_12_LEVELS, false); + } + } + } + + if (coptions.m_use_mode7_part1) + { + // Mode 7 1-subset never requires endpoint quantization, so it cannot fail to find at least one usable solution. 
+ pack_mode7_single_part(block_linear_colors, all_results, coptions); + } + + bool have_est = false; + int best_parts[basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2]; + + if ((coptions.m_use_mode7_part2) || (coptions.m_use_mode11_part2)) + { + if (coptions.m_use_estimated_partitions) + have_est = estimate_partition2(block_linear_colors, best_parts, coptions.m_max_estimated_partitions); + } + + if (coptions.m_use_mode7_part2) + { + const size_t cur_num_results = all_results.size(); + + pack_mode7_2part(block_linear_colors, all_results, coptions, have_est ? coptions.m_max_estimated_partitions : 0, best_parts, + coptions.m_first_mode7_part2_weight_ise_range, coptions.m_last_mode7_part2_weight_ise_range); + + // If we couldn't find any packable 2-subset mode 7 results at weight levels >= 5 levels (which always requires endpoint quant), then try falling back to + // 4 levels which doesn't require endpoint quantization. + if (all_results.size() == cur_num_results) + { + if (coptions.m_first_mode7_part2_weight_ise_range >= astc_helpers::BISE_5_LEVELS) + { + pack_mode7_2part(block_linear_colors, all_results, coptions, have_est ? coptions.m_max_estimated_partitions : 0, best_parts, + astc_helpers::BISE_4_LEVELS, astc_helpers::BISE_4_LEVELS); + } + } + } + + if (coptions.m_use_mode11_part2) + { + // This always requires endpoint quant, so it could fail to find any usable solutions. + pack_mode11_2part(block_linear_colors, all_results, coptions, have_est ? coptions.m_max_estimated_partitions : 0, best_parts); + } + } + + if (coptions.m_refine_weights) + { + // TODO: Move this above, do it once only. 
+ basist::half_float rgb_pixels_half[16 * 3]; + for (uint32_t i = 0; i < 16; i++) + { + rgb_pixels_half[i * 3 + 0] = float_to_half_non_neg_no_nan_inf(pRGBPixels[i * 3 + 0]); + rgb_pixels_half[i * 3 + 1] = float_to_half_non_neg_no_nan_inf(pRGBPixels[i * 3 + 1]); + rgb_pixels_half[i * 3 + 2] = float_to_half_non_neg_no_nan_inf(pRGBPixels[i * 3 + 2]); + } + + for (uint32_t i = 0; i < all_results.size(); i++) + { + bool status = astc_hdr_refine_weights(rgb_pixels_half, all_results[i], coptions, coptions.m_bc6h_err_weight, &all_results[i].m_improved_via_refinement_flag); + assert(status); + BASISU_NOTE_UNUSED(status); + } + } + + return true; +} + +bool astc_hdr_pack_results_to_block(astc_blk& dst_blk, const astc_hdr_pack_results& results) +{ + assert(g_astc_hdr_enc_initialized); + if (!g_astc_hdr_enc_initialized) + return false; + + if (results.m_is_solid) + { + memcpy(&dst_blk, &results.m_solid_blk, sizeof(results.m_solid_blk)); + } + else + { + bool status = astc_helpers::pack_astc_block((astc_helpers::astc_block&)dst_blk, results.m_best_blk); + if (!status) + { + assert(0); + return false; + } + } + + return true; +} + +// Refines a block's chosen weight indices, balancing BC6H and ASTC HDR error. 
+bool astc_hdr_refine_weights(const half_float *pSource_block, astc_hdr_pack_results& cur_results, const astc_hdr_codec_options& coptions, float bc6h_weight, bool *pImproved_flag) +{ + if (pImproved_flag) + *pImproved_flag = false; + + if (cur_results.m_is_solid) + return true; + + const uint32_t total_weights = astc_helpers::get_ise_levels(cur_results.m_best_blk.m_weight_ise_range); + + assert((total_weights >= 3) && (total_weights <= 16)); + + double best_err[4][4]; + uint8_t best_weight[4][4]; + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + best_err[y][x] = 1e+30f; + best_weight[y][x] = 0; + } + } + + astc_hdr_pack_results temp_results; + + const float c_weights[3] = { coptions.m_r_err_scale, coptions.m_g_err_scale, 1.0f }; + + for (uint32_t weight_index = 0; weight_index < total_weights; weight_index++) + { + temp_results = cur_results; + for (uint32_t i = 0; i < 16; i++) + temp_results.m_best_blk.m_weights[i] = (uint8_t)weight_index; + + half_float unpacked_astc_blk_rgba[4][4][4]; + bool res = astc_helpers::decode_block(temp_results.m_best_blk, unpacked_astc_blk_rgba, 4, 4, astc_helpers::cDecodeModeHDR16); + assert(res); + + basist::bc6h_block trial_bc6h_blk; + res = basist::astc_hdr_transcode_to_bc6h(temp_results.m_best_blk, trial_bc6h_blk); + assert(res); + + half_float unpacked_bc6h_blk[4][4][3]; + res = unpack_bc6h(&trial_bc6h_blk, unpacked_bc6h_blk, false); + assert(res); + BASISU_NOTE_UNUSED(res); + + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + double total_err = 0.0f; + + for (uint32_t c = 0; c < 3; c++) + { + const half_float orig_c = pSource_block[(x + y * 4) * 3 + c]; + const double orig_c_q = q(orig_c); + + const half_float astc_c = unpacked_astc_blk_rgba[y][x][c]; + const double astc_c_q = q(astc_c); + const double astc_e = square(astc_c_q - orig_c_q) * c_weights[c]; + + const half_float bc6h_c = unpacked_bc6h_blk[y][x][c]; + const double bc6h_c_q = q(bc6h_c); + const double bc6h_e = 
square(bc6h_c_q - orig_c_q) * c_weights[c]; + + const double overall_err = astc_e * (1.0f - bc6h_weight) + bc6h_e * bc6h_weight; + + total_err += overall_err; + + } // c + + if (total_err < best_err[y][x]) + { + best_err[y][x] = total_err; + best_weight[y][x] = (uint8_t)weight_index; + } + + } // x + } // y + + } // weight_index + + bool any_changed = false; + for (uint32_t i = 0; i < 16; i++) + { + if (cur_results.m_best_blk.m_weights[i] != best_weight[i >> 2][i & 3]) + { + any_changed = true; + break; + } + } + + if (any_changed) + { + memcpy(cur_results.m_best_blk.m_weights, best_weight, 16); + + { + bool res = basist::astc_hdr_transcode_to_bc6h(cur_results.m_best_blk, cur_results.m_bc6h_block); + assert(res); + BASISU_NOTE_UNUSED(res); + + half_float unpacked_astc_blk_rgba[4][4][4]; + res = astc_helpers::decode_block(cur_results.m_best_blk, unpacked_astc_blk_rgba, 4, 4, astc_helpers::cDecodeModeHDR16); + assert(res); + + half_float unpacked_astc_blk_rgb[4][4][3]; + for (uint32_t y = 0; y < 4; y++) + for (uint32_t x = 0; x < 4; x++) + for (uint32_t c = 0; c < 3; c++) + unpacked_astc_blk_rgb[y][x][c] = unpacked_astc_blk_rgba[y][x][c]; + + cur_results.m_best_block_error = compute_block_error(pSource_block, &unpacked_astc_blk_rgb[0][0][0], coptions); + } + + if (pImproved_flag) + *pImproved_flag = true; + } + + return true; +} + +void astc_hdr_block_stats::update(const astc_hdr_pack_results& log_blk) +{ + std::lock_guard lck(m_mutex); + + m_total_blocks++; + + if (log_blk.m_improved_via_refinement_flag) + m_total_refined++; + + if (log_blk.m_is_solid) + { + m_total_solid++; + } + else + { + int best_weight_range = log_blk.m_best_blk.m_weight_ise_range; + + if (log_blk.m_best_blk.m_color_endpoint_modes[0] == 7) + { + m_mode7_submode_hist[bounds_check(log_blk.m_best_submodes[0], 0U, 6U)]++; + + if (log_blk.m_best_blk.m_num_partitions == 2) + { + m_total_mode7_2part++; + + m_mode7_submode_hist[bounds_check(log_blk.m_best_submodes[1], 0U, 6U)]++; + m_total_2part++; + + 
m_weight_range_hist_7_2part[bounds_check(best_weight_range, 0, 11)]++; + + m_part_hist[bounds_check(log_blk.m_best_pat_index, 0U, 32U)]++; + } + else + { + m_total_mode7_1part++; + + m_weight_range_hist_7[bounds_check(best_weight_range, 0, 11)]++; + } + } + else + { + m_mode11_submode_hist[bounds_check(log_blk.m_best_submodes[0], 0U, 9U)]++; + if (log_blk.m_constrained_weights) + m_total_mode11_1part_constrained_weights++; + + if (log_blk.m_best_blk.m_num_partitions == 2) + { + m_total_mode11_2part++; + + m_mode11_submode_hist[bounds_check(log_blk.m_best_submodes[1], 0U, 9U)]++; + m_total_2part++; + + m_weight_range_hist_11_2part[bounds_check(best_weight_range, 0, 11)]++; + + m_part_hist[bounds_check(log_blk.m_best_pat_index, 0U, 32U)]++; + } + else + { + m_total_mode11_1part++; + + m_weight_range_hist_11[bounds_check(best_weight_range, 0, 11)]++; + } + } + } +} + +void astc_hdr_block_stats::print() +{ + std::lock_guard lck(m_mutex); + + assert(m_total_blocks); + if (!m_total_blocks) + return; + + printf("\nLow-level ASTC Encoder Statistics:\n"); + printf("Total blocks: %u\n", m_total_blocks); + printf("Total solid: %u %3.2f%%\n", m_total_solid, (m_total_solid * 100.0f) / m_total_blocks); + printf("Total refined: %u %3.2f%%\n", m_total_refined, (m_total_refined * 100.0f) / m_total_blocks); + + printf("Total mode 11, 1 partition: %u %3.2f%%\n", m_total_mode11_1part, (m_total_mode11_1part * 100.0f) / m_total_blocks); + printf("Total mode 11, 1 partition, constrained weights: %u %3.2f%%\n", m_total_mode11_1part_constrained_weights, (m_total_mode11_1part_constrained_weights * 100.0f) / m_total_blocks); + printf("Total mode 11, 2 partition: %u %3.2f%%\n", m_total_mode11_2part, (m_total_mode11_2part * 100.0f) / m_total_blocks); + + printf("Total mode 7, 1 partition: %u %3.2f%%\n", m_total_mode7_1part, (m_total_mode7_1part * 100.0f) / m_total_blocks); + printf("Total mode 7, 2 partition: %u %3.2f%%\n", m_total_mode7_2part, (m_total_mode7_2part * 100.0f) / m_total_blocks); 
+ + printf("Total 2 partitions: %u %3.2f%%\n", m_total_2part, (m_total_2part * 100.0f) / m_total_blocks); + printf("\n"); + + printf("ISE texel weight range histogram mode 11:\n"); + for (uint32_t i = 1; i <= MODE11_LAST_ISE_RANGE; i++) + printf("%u %u\n", i, m_weight_range_hist_11[i]); + printf("\n"); + + printf("ISE texel weight range histogram mode 11, 2 partition:\n"); + for (uint32_t i = 1; i <= MODE11_PART2_LAST_ISE_RANGE; i++) + printf("%u %u\n", i, m_weight_range_hist_11_2part[i]); + printf("\n"); + + printf("ISE texel weight range histogram mode 7:\n"); + for (uint32_t i = 1; i <= MODE7_PART1_LAST_ISE_RANGE; i++) + printf("%u %u\n", i, m_weight_range_hist_7[i]); + printf("\n"); + + printf("ISE texel weight range histogram mode 7, 2 partition:\n"); + for (uint32_t i = 1; i <= MODE7_PART2_LAST_ISE_RANGE; i++) + printf("%u %u\n", i, m_weight_range_hist_7_2part[i]); + printf("\n"); + + printf("Mode 11 submode histogram:\n"); + for (uint32_t i = 0; i <= MODE11_TOTAL_SUBMODES; i++) // +1 because of the extra direct encoding + printf("%u %u\n", i, m_mode11_submode_hist[i]); + printf("\n"); + + printf("Mode 7 submode histogram:\n"); + for (uint32_t i = 0; i < MODE7_TOTAL_SUBMODES; i++) + printf("%u %u\n", i, m_mode7_submode_hist[i]); + printf("\n"); + + printf("Partition pattern table usage histogram:\n"); + for (uint32_t i = 0; i < basist::TOTAL_ASTC_BC7_COMMON_PARTITIONS2; i++) + printf("%u:%u ", i, m_part_hist[i]); + printf("\n\n"); +} + +} // namespace basisu + diff --git a/thirdparty/basis_universal/encoder/basisu_astc_hdr_enc.h b/thirdparty/basis_universal/encoder/basisu_astc_hdr_enc.h new file mode 100644 index 000000000000..ee122ff7cee9 --- /dev/null +++ b/thirdparty/basis_universal/encoder/basisu_astc_hdr_enc.h @@ -0,0 +1,224 @@ +// basisu_astc_hdr_enc.h +#pragma once +#include "basisu_enc.h" +#include "basisu_gpu_texture.h" +#include "../transcoder/basisu_astc_helpers.h" +#include "../transcoder/basisu_astc_hdr_core.h" + +namespace basisu +{ + // This 
MUST be called before encoding any blocks. + void astc_hdr_enc_init(); + + const uint32_t MODE11_FIRST_ISE_RANGE = astc_helpers::BISE_3_LEVELS, MODE11_LAST_ISE_RANGE = astc_helpers::BISE_16_LEVELS; + const uint32_t MODE7_PART1_FIRST_ISE_RANGE = astc_helpers::BISE_3_LEVELS, MODE7_PART1_LAST_ISE_RANGE = astc_helpers::BISE_16_LEVELS; + const uint32_t MODE7_PART2_FIRST_ISE_RANGE = astc_helpers::BISE_3_LEVELS, MODE7_PART2_LAST_ISE_RANGE = astc_helpers::BISE_8_LEVELS; + const uint32_t MODE11_PART2_FIRST_ISE_RANGE = astc_helpers::BISE_3_LEVELS, MODE11_PART2_LAST_ISE_RANGE = astc_helpers::BISE_4_LEVELS; + const uint32_t MODE11_TOTAL_SUBMODES = 8; // plus an extra hidden submode, directly encoded, for direct, so really 9 (see tables 99/100 of the ASTC spec) + const uint32_t MODE7_TOTAL_SUBMODES = 6; + + struct astc_hdr_codec_options + { + float m_bc6h_err_weight; + + bool m_use_solid; + + bool m_use_mode11; + bool m_mode11_uber_mode; + uint32_t m_first_mode11_weight_ise_range; + uint32_t m_last_mode11_weight_ise_range; + bool m_mode11_direct_only; + int32_t m_first_mode11_submode; + int32_t m_last_mode11_submode; + + bool m_use_mode7_part1; + uint32_t m_first_mode7_part1_weight_ise_range; + uint32_t m_last_mode7_part1_weight_ise_range; + + bool m_use_mode7_part2; + uint32_t m_mode7_part2_part_masks; + uint32_t m_first_mode7_part2_weight_ise_range; + uint32_t m_last_mode7_part2_weight_ise_range; + + bool m_use_mode11_part2; + uint32_t m_mode11_part2_part_masks; + uint32_t m_first_mode11_part2_weight_ise_range; + uint32_t m_last_mode11_part2_weight_ise_range; + + float m_r_err_scale, m_g_err_scale; + + bool m_refine_weights; + + uint32_t m_level; + + bool m_use_estimated_partitions; + uint32_t m_max_estimated_partitions; + + // If true, the ASTC HDR compressor is allowed to more aggressively vary weight indices for slightly higher compression in non-fastest mode. This will hurt BC6H quality, however. 
+ bool m_allow_uber_mode; + + astc_hdr_codec_options(); + + void init(); + + // TODO: set_quality_level() is preferred to configure the codec for transcoding purposes. + static const int cMinLevel = 0; + static const int cMaxLevel = 4; + static const int cDefaultLevel = 1; + void set_quality_level(int level); + + private: + void set_quality_best(); + void set_quality_normal(); + void set_quality_fastest(); + }; + + struct astc_hdr_pack_results + { + double m_best_block_error; + double m_bc6h_block_error; // note this is not used/set by the encoder, here for convenience + + // Encoder results (logical ASTC block) + astc_helpers::log_astc_block m_best_blk; + + // For statistical use + uint32_t m_best_submodes[2]; + uint32_t m_best_pat_index; + bool m_constrained_weights; + + bool m_improved_via_refinement_flag; + + // Only valid if the block is solid + basist::astc_blk m_solid_blk; + + // The BC6H transcoded block + basist::bc6h_block m_bc6h_block; + + // Solid color/void extent flag + bool m_is_solid; + + void clear() + { + m_best_block_error = 1e+30f; + m_bc6h_block_error = 1e+30f; + + m_best_blk.clear(); + m_best_blk.m_grid_width = 4; + m_best_blk.m_grid_height = 4; + m_best_blk.m_endpoint_ise_range = 20; // 0-255 + + clear_obj(m_best_submodes); + + m_best_pat_index = 0; + m_constrained_weights = false; + + clear_obj(m_bc6h_block); + + m_is_solid = false; + m_improved_via_refinement_flag = false; + } + }; + + void interpolate_qlog12_colors( + const int e[2][3], + basist::half_float* pDecoded_half, + vec3F* pDecoded_float, + uint32_t n, uint32_t ise_weight_range); + + bool get_astc_hdr_mode_11_block_colors( + const uint8_t* pEndpoints, + basist::half_float* pDecoded_half, + vec3F* pDecoded_float, + uint32_t n, uint32_t ise_weight_range, uint32_t ise_endpoint_range); + + bool get_astc_hdr_mode_7_block_colors( + const uint8_t* pEndpoints, + basist::half_float* pDecoded_half, + vec3F* pDecoded_float, + uint32_t n, uint32_t ise_weight_range, uint32_t 
ise_endpoint_range); + + double eval_selectors( + uint32_t num_pixels, + uint8_t* pWeights, + const basist::half_float* pBlock_pixels_half, + uint32_t num_weight_levels, + const basist::half_float* pDecoded_half, + const astc_hdr_codec_options& coptions, + uint32_t usable_selector_bitmask = UINT32_MAX); + + double compute_block_error(const basist::half_float* pOrig_block, const basist::half_float* pPacked_block, const astc_hdr_codec_options& coptions); + + // Encodes a 4x4 ASTC HDR block given a 4x4 array of source block pixels/texels. + // Supports solid color blocks, mode 11 (all submodes), mode 7/1 partition (all submodes), + // and mode 7/2 partitions (all submodes) - 30 patterns, only the ones also in common with the BC6H format. + // The packed ASTC weight grid dimensions are currently always 4x4 texels, but may be also 3x3 in the future. + // This function is thread safe, i.e. it may be called from multiple encoding threads simultaneously with different blocks. + // + // Parameters: + // pRGBPixels - An array of 48 (16 RGB) floats: the 4x4 block to pack + // coptions - Codec options + // all_results - On success, holds one astc_hdr_pack_results per candidate encoding found for the block (previous contents are discarded). + // Pass a chosen result to astc_hdr_pack_results_to_block() to get the packed ASTC HDR block. + // + // Requirements: + // astc_hdr_enc_init() MUST have been called first to initialize the codec. + // Input pixels are checked and cannot be NaN's, Inf's, signed, or too large (greater than MAX_HALF_FLOAT, or 65504). + // Normal values and denormals are okay. 
+ bool astc_hdr_enc_block( + const float* pRGBPixels, + const astc_hdr_codec_options& coptions, + basisu::vector &all_results); + + bool astc_hdr_pack_results_to_block(basist::astc_blk& dst_blk, const astc_hdr_pack_results& results); + + bool astc_hdr_refine_weights(const basist::half_float* pSource_block, astc_hdr_pack_results& cur_results, const astc_hdr_codec_options& coptions, float bc6h_weight, bool* pImproved_flag); + + struct astc_hdr_block_stats + { + std::mutex m_mutex; + + uint32_t m_total_blocks; + uint32_t m_total_2part, m_total_solid; + uint32_t m_total_mode7_1part, m_total_mode7_2part; + uint32_t m_total_mode11_1part, m_total_mode11_2part; + uint32_t m_total_mode11_1part_constrained_weights; + + uint32_t m_weight_range_hist_7[11]; + uint32_t m_weight_range_hist_7_2part[11]; + uint32_t m_mode7_submode_hist[6]; + + uint32_t m_weight_range_hist_11[11]; + uint32_t m_weight_range_hist_11_2part[11]; + uint32_t m_mode11_submode_hist[9]; + + uint32_t m_part_hist[32]; + + uint32_t m_total_refined; + + astc_hdr_block_stats() { clear(); } + + void clear() + { + std::lock_guard lck(m_mutex); + + m_total_blocks = 0; + m_total_mode7_1part = 0, m_total_mode7_2part = 0, m_total_mode11_1part = 0, m_total_2part = 0, m_total_solid = 0, m_total_mode11_2part = 0; + m_total_mode11_1part_constrained_weights = 0; + m_total_refined = 0; + + clear_obj(m_weight_range_hist_11); + clear_obj(m_weight_range_hist_11_2part); + clear_obj(m_weight_range_hist_7); + clear_obj(m_weight_range_hist_7_2part); + clear_obj(m_mode7_submode_hist); + clear_obj(m_mode11_submode_hist); + clear_obj(m_part_hist); + } + + void update(const astc_hdr_pack_results& log_blk); + + void print(); + }; + +} // namespace basisu + diff --git a/thirdparty/basis_universal/encoder/basisu_backend.cpp b/thirdparty/basis_universal/encoder/basisu_backend.cpp index abb61750a6f2..3fa3d8892fed 100644 --- a/thirdparty/basis_universal/encoder/basisu_backend.cpp +++ b/thirdparty/basis_universal/encoder/basisu_backend.cpp @@ 
-1,5 +1,5 @@ // basisu_backend.cpp -// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/thirdparty/basis_universal/encoder/basisu_backend.h b/thirdparty/basis_universal/encoder/basisu_backend.h index 07778aeb9ba2..58a9a8aa0ea7 100644 --- a/thirdparty/basis_universal/encoder/basisu_backend.h +++ b/thirdparty/basis_universal/encoder/basisu_backend.h @@ -1,5 +1,5 @@ // basisu_backend.h -// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/thirdparty/basis_universal/encoder/basisu_basis_file.cpp b/thirdparty/basis_universal/encoder/basisu_basis_file.cpp index f4c77bef23f4..77f467f67070 100644 --- a/thirdparty/basis_universal/encoder/basisu_basis_file.cpp +++ b/thirdparty/basis_universal/encoder/basisu_basis_file.cpp @@ -1,5 +1,5 @@ // basisu_basis_file.cpp -// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/thirdparty/basis_universal/encoder/basisu_basis_file.h b/thirdparty/basis_universal/encoder/basisu_basis_file.h index 98498a012178..57448bccb198 100644 --- a/thirdparty/basis_universal/encoder/basisu_basis_file.h +++ b/thirdparty/basis_universal/encoder/basisu_basis_file.h @@ -1,5 +1,5 @@ // basisu_basis_file.h -// Copyright (C) 2019 Binomial LLC. All Rights Reserved. +// Copyright (C) 2024 Binomial LLC. All Rights Reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/thirdparty/basis_universal/encoder/basisu_bc7enc.cpp b/thirdparty/basis_universal/encoder/basisu_bc7enc.cpp index 22fdfa603fc4..914e7fbbb9ab 100644 --- a/thirdparty/basis_universal/encoder/basisu_bc7enc.cpp +++ b/thirdparty/basis_universal/encoder/basisu_bc7enc.cpp @@ -1,5 +1,5 @@ // File: basisu_bc7enc.cpp -// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -394,6 +394,7 @@ void bc7enc_compress_block_init() static void compute_least_squares_endpoints_rgba(uint32_t N, const uint8_t *pSelectors, const bc7enc_vec4F* pSelector_weights, bc7enc_vec4F* pXl, bc7enc_vec4F* pXh, const color_quad_u8 *pColors) { // Least squares using normal equations: http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf + // https://web.archive.org/web/20150319232457/http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf // I did this in matrix form first, expanded out all the ops, then optimized it a bit. 
double z00 = 0.0f, z01 = 0.0f, z10 = 0.0f, z11 = 0.0f; double q00_r = 0.0f, q10_r = 0.0f, t_r = 0.0f; @@ -1301,6 +1302,7 @@ void check_best_overall_error(const color_cell_compressor_params *pParams, color for (uint32_t c = 0; c < 4; c++) colors[i].m_c[c] = (uint8_t)astc_interpolate_linear(colors[0].m_c[c], colors[n - 1].m_c[c], pParams->m_pSelector_weights[i]); +#ifdef _DEBUG uint64_t total_err = 0; for (uint32_t p = 0; p < pParams->m_num_pixels; p++) { @@ -1313,6 +1315,7 @@ void check_best_overall_error(const color_cell_compressor_params *pParams, color total_err += compute_color_distance_rgb(&orig, &packed, pParams->m_perceptual, pParams->m_weights); } assert(total_err == pResults->m_best_overall_err); +#endif // HACK HACK //if (total_err != pResults->m_best_overall_err) diff --git a/thirdparty/basis_universal/encoder/basisu_bc7enc.h b/thirdparty/basis_universal/encoder/basisu_bc7enc.h index 8d8b7888cac7..925d6b2e8dee 100644 --- a/thirdparty/basis_universal/encoder/basisu_bc7enc.h +++ b/thirdparty/basis_universal/encoder/basisu_bc7enc.h @@ -1,5 +1,5 @@ // File: basisu_bc7enc.h -// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/thirdparty/basis_universal/encoder/basisu_comp.cpp b/thirdparty/basis_universal/encoder/basisu_comp.cpp index 4e69e9e2eecb..81813257cd24 100644 --- a/thirdparty/basis_universal/encoder/basisu_comp.cpp +++ b/thirdparty/basis_universal/encoder/basisu_comp.cpp @@ -1,5 +1,5 @@ // basisu_comp.cpp -// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -16,6 +16,9 @@ #include "basisu_enc.h" #include #include +#include + +//#define UASTC_HDR_DEBUG_SAVE_CATEGORIZED_BLOCKS // basisu_transcoder.cpp is where basisu_miniz lives now, we just need the declarations here. #define MINIZ_NO_ZLIB_COMPATIBLE_NAMES @@ -23,6 +26,8 @@ #include "basisu_opencl.h" +#include "../transcoder/basisu_astc_hdr_core.h" + #if !BASISD_SUPPORT_KTX2 #error BASISD_SUPPORT_KTX2 must be enabled (set to 1). #endif @@ -34,7 +39,7 @@ // Set to 1 to disable the mipPadding alignment workaround (which only seems to be needed when no key-values are written at all) #define BASISU_DISABLE_KTX2_ALIGNMENT_WORKAROUND (0) -// Set to 1 to disable writing all KTX2 key values, triggering the validator bug. +// Set to 1 to disable writing all KTX2 key values, triggering an early validator bug. #define BASISU_DISABLE_KTX2_KEY_VALUES (0) using namespace buminiz; @@ -46,27 +51,143 @@ using namespace buminiz; namespace basisu { - basis_compressor::basis_compressor() : - m_pOpenCL_context(nullptr), + basis_compressor::basis_compressor() : + m_pOpenCL_context(nullptr), m_basis_file_size(0), m_basis_bits_per_texel(0.0f), m_total_blocks(0), m_any_source_image_has_alpha(false), - m_opencl_failed(false) + m_opencl_failed(false) { debug_printf("basis_compressor::basis_compressor\n"); assert(g_library_initialized); } - basis_compressor::~basis_compressor() - { - if (m_pOpenCL_context) - { - opencl_destroy_context(m_pOpenCL_context); - m_pOpenCL_context = nullptr; - } - } + basis_compressor::~basis_compressor() + { + if (m_pOpenCL_context) + { + opencl_destroy_context(m_pOpenCL_context); + m_pOpenCL_context = nullptr; + } + } + + void basis_compressor::check_for_hdr_inputs() + { + if ((!m_params.m_source_filenames.size()) && (!m_params.m_source_images.size())) + { + if (m_params.m_source_images_hdr.size()) + { + // Assume they want UASTC HDR if they've specified any HDR source images. 
+ m_params.m_hdr = true; + } + } + + if (!m_params.m_hdr) + { + // See if any files are .EXR or .HDR, if so switch the compressor to UASTC HDR mode. + for (uint32_t i = 0; i < m_params.m_source_filenames.size(); i++) + { + std::string filename; + string_get_filename(m_params.m_source_filenames[i].c_str(), filename); + + std::string ext(string_get_extension(filename)); + string_tolower(ext); + + if ((ext == "exr") || (ext == "hdr")) + { + m_params.m_hdr = true; + break; + } + } + } + + if (m_params.m_hdr) + { + if (m_params.m_source_alpha_filenames.size()) + { + debug_printf("Warning: Alpha channel image filenames are not supported in UASTC HDR mode.\n"); + m_params.m_source_alpha_filenames.clear(); + } + } + + if (m_params.m_hdr) + m_params.m_uastc = true; + } + + bool basis_compressor::sanity_check_input_params() + { + // Check for no source filenames specified. + if ((m_params.m_read_source_images) && (!m_params.m_source_filenames.size())) + { + assert(0); + return false; + } + + // See if they've specified any source filenames, but didn't tell us to read them. + if ((!m_params.m_read_source_images) && (m_params.m_source_filenames.size())) + { + assert(0); + return false; + } + + // Sanity check the input image parameters. + if (m_params.m_read_source_images) + { + // Caller can't specify their own images if they want us to read source images from files. + if (m_params.m_source_images.size() || m_params.m_source_images_hdr.size()) + { + assert(0); + return false; + } + + if (m_params.m_source_mipmap_images.size() || m_params.m_source_mipmap_images_hdr.size()) + { + assert(0); + return false; + } + } + else + { + // They didn't tell us to read any source files, so check for no LDR/HDR source images. + if (!m_params.m_source_images.size() && !m_params.m_source_images_hdr.size()) + { + assert(0); + return false; + } + + // Now we know we've been supplied LDR and/or HDR source images, check for LDR vs. HDR conflicts. 
+ + if (m_params.m_source_images.size()) + { + // They've supplied LDR images, so make sure they also haven't specified HDR input images. + if (m_params.m_source_images_hdr.size() || m_params.m_source_mipmap_images_hdr.size()) + { + assert(0); + return false; + } + } + else + { + // No LDR images, so make sure they haven't specified any LDR mipmaps. + if (m_params.m_source_mipmap_images.size()) + { + assert(0); + return false; + } + + // No LDR images, so ensure they've supplied some HDR images to process. + if (!m_params.m_source_images_hdr.size()) + { + assert(0); + return false; + } + } + } + + return true; + } bool basis_compressor::init(const basis_compressor_params ¶ms) { @@ -85,7 +206,12 @@ namespace basisu } m_params = params; - + + if ((m_params.m_compute_stats) && (!m_params.m_validate_output_data)) + m_params.m_validate_output_data = true; + + check_for_hdr_inputs(); + if (m_params.m_debug) { debug_printf("basis_compressor::init:\n"); @@ -95,8 +221,10 @@ namespace basisu #define PRINT_UINT_VALUE(v) debug_printf("%s: %u %u\n", BASISU_STRINGIZE2(v), static_cast(m_params.v), m_params.v.was_changed()); #define PRINT_FLOAT_VALUE(v) debug_printf("%s: %f %u\n", BASISU_STRINGIZE2(v), static_cast(m_params.v), m_params.v.was_changed()); - debug_printf("Source images: %u, source filenames: %u, source alpha filenames: %i, Source mipmap images: %u\n", - m_params.m_source_images.size(), m_params.m_source_filenames.size(), m_params.m_source_alpha_filenames.size(), m_params.m_source_mipmap_images.size()); + debug_printf("Source LDR images: %u, HDR images: %u, filenames: %u, alpha filenames: %i, LDR mipmap images: %u, HDR mipmap images: %u\n", + m_params.m_source_images.size(), m_params.m_source_images_hdr.size(), + m_params.m_source_filenames.size(), m_params.m_source_alpha_filenames.size(), + m_params.m_source_mipmap_images.size(), m_params.m_source_mipmap_images_hdr.size()); if (m_params.m_source_mipmap_images.size()) { @@ -106,6 +234,15 @@ namespace basisu 
debug_printf("\n"); } + if (m_params.m_source_mipmap_images_hdr.size()) + { + debug_printf("m_source_mipmap_images_hdr array sizes:\n"); + for (uint32_t i = 0; i < m_params.m_source_mipmap_images_hdr.size(); i++) + debug_printf("%u ", m_params.m_source_mipmap_images_hdr[i].size()); + debug_printf("\n"); + } + + PRINT_BOOL_VALUE(m_hdr); PRINT_BOOL_VALUE(m_uastc); PRINT_BOOL_VALUE(m_use_opencl); PRINT_BOOL_VALUE(m_y_flip); @@ -117,7 +254,7 @@ namespace basisu PRINT_BOOL_VALUE(m_no_endpoint_rdo); PRINT_BOOL_VALUE(m_no_selector_rdo); PRINT_BOOL_VALUE(m_read_source_images); - PRINT_BOOL_VALUE(m_write_output_basis_files); + PRINT_BOOL_VALUE(m_write_output_basis_or_ktx2_files); PRINT_BOOL_VALUE(m_compute_stats); PRINT_BOOL_VALUE(m_check_for_alpha); PRINT_BOOL_VALUE(m_force_alpha); @@ -146,6 +283,7 @@ namespace basisu debug_printf("m_max_endpoint_clusters: %u\n", m_params.m_max_endpoint_clusters); debug_printf("m_max_selector_clusters: %u\n", m_params.m_max_selector_clusters); debug_printf("m_quality_level: %i\n", m_params.m_quality_level); + debug_printf("UASTC HDR quality level: %u\n", m_params.m_uastc_hdr_options.m_level); debug_printf("m_tex_type: %u\n", m_params.m_tex_type); debug_printf("m_userdata0: 0x%X, m_userdata1: 0x%X\n", m_params.m_userdata0, m_params.m_userdata1); @@ -185,6 +323,9 @@ namespace basisu } PRINT_BOOL_VALUE(m_validate_output_data); + PRINT_BOOL_VALUE(m_hdr_ldr_srgb_to_linear_conversion); + debug_printf("Allow UASTC HDR uber mode: %u\n", m_params.m_uastc_hdr_options.m_allow_uber_mode); + PRINT_BOOL_VALUE(m_hdr_favor_astc); #undef PRINT_BOOL_VALUE #undef PRINT_INT_VALUE @@ -192,19 +333,9 @@ namespace basisu #undef PRINT_FLOAT_VALUE } - if ((m_params.m_read_source_images) && (!m_params.m_source_filenames.size())) - { - assert(0); + if (!sanity_check_input_params()) return false; - } - - if ((m_params.m_compute_stats) && (!m_params.m_validate_output_data)) - { - m_params.m_validate_output_data = true; - - debug_printf("Note: m_compute_stats is true, 
so forcing m_validate_output_data to true as well\n"); - } - + if ((m_params.m_use_opencl) && opencl_is_available() && !m_pOpenCL_context && !m_opencl_failed) { m_pOpenCL_context = opencl_create_context(); @@ -219,6 +350,9 @@ namespace basisu { debug_printf("basis_compressor::process\n"); + if (!read_dds_source_images()) + return cECFailedReadingSourceImages; + if (!read_source_images()) return cECFailedReadingSourceImages; @@ -228,20 +362,38 @@ namespace basisu if (m_params.m_create_ktx2_file) { if (!validate_ktx2_constraints()) + { + error_printf("Inputs do not satisfy .KTX2 texture constraints: all source images must be the same resolution and have the same number of mipmap levels.\n"); return cECFailedValidating; + } } if (!extract_source_blocks()) return cECFailedFrontEnd; - if (m_params.m_uastc) + if (m_params.m_hdr) + { + // UASTC HDR + printf("Mode: UASTC HDR Level %u\n", m_params.m_uastc_hdr_options.m_level); + + error_code ec = encode_slices_to_uastc_hdr(); + if (ec != cECSuccess) + return ec; + } + else if (m_params.m_uastc) { + // UASTC + printf("Mode: UASTC LDR Level %u\n", m_params.m_pack_uastc_flags & cPackUASTCLevelMask); + error_code ec = encode_slices_to_uastc(); if (ec != cECSuccess) return ec; } else { + // ETC1S + printf("Mode: ETC1S Quality %i, Level %i\n", m_params.m_quality_level, (int)m_params.m_compression_level); + if (!process_frontend()) return cECFailedFrontEnd; @@ -254,7 +406,7 @@ namespace basisu if (!create_basis_file_and_transcode()) return cECFailedCreateBasisFile; - + if (m_params.m_create_ktx2_file) { if (!create_ktx2_file()) @@ -267,20 +419,89 @@ namespace basisu return cECSuccess; } - basis_compressor::error_code basis_compressor::encode_slices_to_uastc() + basis_compressor::error_code basis_compressor::encode_slices_to_uastc_hdr() { - debug_printf("basis_compressor::encode_slices_to_uastc\n"); + debug_printf("basis_compressor::encode_slices_to_uastc_hdr\n"); + + interval_timer tm; + tm.start(); 
m_uastc_slice_textures.resize(m_slice_descs.size()); for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++) - m_uastc_slice_textures[slice_index].init(texture_format::cUASTC4x4, m_slice_descs[slice_index].m_orig_width, m_slice_descs[slice_index].m_orig_height); + m_uastc_slice_textures[slice_index].init(texture_format::cUASTC_HDR_4x4, m_slice_descs[slice_index].m_orig_width, m_slice_descs[slice_index].m_orig_height); - m_uastc_backend_output.m_tex_format = basist::basis_tex_format::cUASTC4x4; + m_uastc_backend_output.m_tex_format = basist::basis_tex_format::cUASTC_HDR_4x4; m_uastc_backend_output.m_etc1s = false; m_uastc_backend_output.m_slice_desc = m_slice_descs; m_uastc_backend_output.m_slice_image_data.resize(m_slice_descs.size()); m_uastc_backend_output.m_slice_image_crcs.resize(m_slice_descs.size()); + + if (!m_params.m_perceptual) + { + m_params.m_uastc_hdr_options.m_r_err_scale = 1.0f; + m_params.m_uastc_hdr_options.m_g_err_scale = 1.0f; + } + + const float DEFAULT_BC6H_ERROR_WEIGHT = .85f; + const float LOWEST_BC6H_ERROR_WEIGHT = .1f; + m_params.m_uastc_hdr_options.m_bc6h_err_weight = m_params.m_hdr_favor_astc ? 
LOWEST_BC6H_ERROR_WEIGHT : DEFAULT_BC6H_ERROR_WEIGHT; + + std::atomic any_failures; + any_failures = false; + + astc_hdr_block_stats enc_stats; + struct uastc_blk_desc + { + uint32_t m_solid_flag; + uint32_t m_num_partitions; + uint32_t m_cem_index; + uint32_t m_weight_ise_range; + uint32_t m_endpoint_ise_range; + + bool operator< (const uastc_blk_desc& desc) const + { + if (this == &desc) + return false; + +#define COMP(XX) if (XX < desc.XX) return true; else if (XX != desc.XX) return false; + COMP(m_solid_flag) + COMP(m_num_partitions) + COMP(m_cem_index) + COMP(m_weight_ise_range) + COMP(m_endpoint_ise_range) +#undef COMP + + return false; + } + + bool operator== (const uastc_blk_desc& desc) const + { + if (this == &desc) + return true; + if ((*this < desc) || (desc < *this)) + return false; + return true; + } + + bool operator!= (const uastc_blk_desc& desc) const + { + return !(*this == desc); + } + }; + + struct uastc_blk_desc_stats + { + uastc_blk_desc_stats() : m_count(0) { } + uint32_t m_count; +#ifdef UASTC_HDR_DEBUG_SAVE_CATEGORIZED_BLOCKS + basisu::vector m_blks; +#endif + }; + + std::map unique_block_descs; + std::mutex unique_block_desc_mutex; + for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++) { gpu_image& tex = m_uastc_slice_textures[slice_index]; @@ -290,102 +511,387 @@ namespace basisu const uint32_t num_blocks_x = tex.get_blocks_x(); const uint32_t num_blocks_y = tex.get_blocks_y(); const uint32_t total_blocks = tex.get_total_blocks(); - const image& source_image = m_slice_images[slice_index]; - + const imagef& source_image = m_slice_images_hdr[slice_index]; + std::atomic total_blocks_processed; total_blocks_processed = 0; - + const uint32_t N = 256; for (uint32_t block_index_iter = 0; block_index_iter < total_blocks; block_index_iter += N) { const uint32_t first_index = block_index_iter; const uint32_t last_index = minimum(total_blocks, block_index_iter + N); - + // FIXME: This sucks, but we're having a stack size 
related problem with std::function with emscripten. #ifndef __EMSCRIPTEN__ - m_params.m_pJob_pool->add_job([this, first_index, last_index, num_blocks_x, num_blocks_y, total_blocks, &source_image, &tex, &total_blocks_processed] + m_params.m_pJob_pool->add_job([this, first_index, last_index, num_blocks_x, num_blocks_y, total_blocks, &source_image, + &tex, &total_blocks_processed, &any_failures, &enc_stats, &unique_block_descs, &unique_block_desc_mutex] { #endif BASISU_NOTE_UNUSED(num_blocks_y); - - uint32_t uastc_flags = m_params.m_pack_uastc_flags; - if ((m_params.m_rdo_uastc) && (m_params.m_rdo_uastc_favor_simpler_modes_in_rdo_mode)) - uastc_flags |= cPackUASTCFavorSimplerModes; + + basisu::vector all_results; + all_results.reserve(256); for (uint32_t block_index = first_index; block_index < last_index; block_index++) { const uint32_t block_x = block_index % num_blocks_x; const uint32_t block_y = block_index / num_blocks_x; - color_rgba block_pixels[4][4]; + vec4F block_pixels[16]; - source_image.extract_block_clamped((color_rgba*)block_pixels, block_x * 4, block_y * 4, 4, 4); + source_image.extract_block_clamped(&block_pixels[0], block_x * 4, block_y * 4, 4, 4); - basist::uastc_block& dest_block = *(basist::uastc_block*)tex.get_block_ptr(block_x, block_y); + basist::astc_blk& dest_block = *(basist::astc_blk*)tex.get_block_ptr(block_x, block_y); + + float rgb_pixels[16 * 3]; + basist::half_float rgb_pixels_half[16 * 3]; + for (uint32_t i = 0; i < 16; i++) + { + rgb_pixels[i * 3 + 0] = block_pixels[i][0]; + rgb_pixels_half[i * 3 + 0] = float_to_half_non_neg_no_nan_inf(block_pixels[i][0]); - encode_uastc(&block_pixels[0][0].r, dest_block, uastc_flags); + rgb_pixels[i * 3 + 1] = block_pixels[i][1]; + rgb_pixels_half[i * 3 + 1] = float_to_half_non_neg_no_nan_inf(block_pixels[i][1]); - total_blocks_processed++; + rgb_pixels[i * 3 + 2] = block_pixels[i][2]; + rgb_pixels_half[i * 3 + 2] = float_to_half_non_neg_no_nan_inf(block_pixels[i][2]); + } + + bool status = 
astc_hdr_enc_block(&rgb_pixels[0], m_params.m_uastc_hdr_options, all_results); + if (!status) + { + any_failures = true; + continue; + } + + double best_err = 1e+30f; + int best_result_index = -1; + + const double bc6h_err_weight = m_params.m_uastc_hdr_options.m_bc6h_err_weight; + const double astc_err_weight = (1.0f - bc6h_err_weight); + + for (uint32_t i = 0; i < all_results.size(); i++) + { + basist::half_float unpacked_bc6h_block[4 * 4 * 3]; + unpack_bc6h(&all_results[i].m_bc6h_block, unpacked_bc6h_block, false); + + all_results[i].m_bc6h_block_error = compute_block_error(rgb_pixels_half, unpacked_bc6h_block, m_params.m_uastc_hdr_options); + + double overall_err = (all_results[i].m_bc6h_block_error * bc6h_err_weight) + (all_results[i].m_best_block_error * astc_err_weight); + + if ((!i) || (overall_err < best_err)) + { + best_err = overall_err; + best_result_index = i; + } + } + + const astc_hdr_pack_results& best_results = all_results[best_result_index]; - uint32_t val = total_blocks_processed; - if ((val & 16383) == 16383) + astc_hdr_pack_results_to_block(dest_block, best_results); + + // Verify that this block is valid UASTC HDR and we can successfully transcode it to BC6H. + // (Well, except in fastest mode.) 
+ if (m_params.m_uastc_hdr_options.m_level > 0) { - debug_printf("basis_compressor::encode_slices_to_uastc: %3.1f%% done\n", static_cast(val) * 100.0f / total_blocks); + basist::bc6h_block transcoded_bc6h_blk; + bool transcode_results = astc_hdr_transcode_to_bc6h(dest_block, transcoded_bc6h_blk); + assert(transcode_results); + if ((!transcode_results) && (!any_failures)) + { + error_printf("basis_compressor::encode_slices_to_uastc_hdr: UASTC HDR block transcode check failed!\n"); + + any_failures = true; + continue; + } + } + + if (m_params.m_debug) + { + // enc_stats has its own mutex + enc_stats.update(best_results); + + uastc_blk_desc blk_desc; + clear_obj(blk_desc); + + blk_desc.m_solid_flag = best_results.m_is_solid; + if (!blk_desc.m_solid_flag) + { + blk_desc.m_num_partitions = best_results.m_best_blk.m_num_partitions; + blk_desc.m_cem_index = best_results.m_best_blk.m_color_endpoint_modes[0]; + blk_desc.m_weight_ise_range = best_results.m_best_blk.m_weight_ise_range; + blk_desc.m_endpoint_ise_range = best_results.m_best_blk.m_endpoint_ise_range; + } + + { + std::lock_guard lck(unique_block_desc_mutex); + + auto res = unique_block_descs.insert(std::make_pair(blk_desc, uastc_blk_desc_stats())); + + (res.first)->second.m_count++; +#ifdef UASTC_HDR_DEBUG_SAVE_CATEGORIZED_BLOCKS + (res.first)->second.m_blks.push_back(dest_block); +#endif + } } + total_blocks_processed++; + + uint32_t val = total_blocks_processed; + if (((val & 1023) == 1023) && m_params.m_status_output) + { + debug_printf("basis_compressor::encode_slices_to_uastc_hdr: %3.1f%% done\n", static_cast(val) * 100.0f / total_blocks); + } } #ifndef __EMSCRIPTEN__ }); #endif - + } // block_index_iter #ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->wait_for_all(); #endif - if (m_params.m_rdo_uastc) - { - uastc_rdo_params rdo_params; - rdo_params.m_lambda = m_params.m_rdo_uastc_quality_scalar; - rdo_params.m_max_allowed_rms_increase_ratio = m_params.m_rdo_uastc_max_allowed_rms_increase_ratio; - 
rdo_params.m_skip_block_rms_thresh = m_params.m_rdo_uastc_skip_block_rms_thresh; - rdo_params.m_lz_dict_size = m_params.m_rdo_uastc_dict_size; - rdo_params.m_smooth_block_max_error_scale = m_params.m_rdo_uastc_max_smooth_block_error_scale; - rdo_params.m_max_smooth_block_std_dev = m_params.m_rdo_uastc_smooth_block_max_std_dev; - - bool status = uastc_rdo(tex.get_total_blocks(), (basist::uastc_block*)tex.get_ptr(), - (const color_rgba *)m_source_blocks[slice_desc.m_first_block_index].m_pixels, rdo_params, m_params.m_pack_uastc_flags, m_params.m_rdo_uastc_multithreading ? m_params.m_pJob_pool : nullptr, - (m_params.m_rdo_uastc_multithreading && m_params.m_pJob_pool) ? basisu::minimum(4, (uint32_t)m_params.m_pJob_pool->get_total_threads()) : 0); - if (!status) - { - return cECFailedUASTCRDOPostProcess; - } - } + if (any_failures) + return cECFailedEncodeUASTC; m_uastc_backend_output.m_slice_image_data[slice_index].resize(tex.get_size_in_bytes()); memcpy(&m_uastc_backend_output.m_slice_image_data[slice_index][0], tex.get_ptr(), tex.get_size_in_bytes()); - + m_uastc_backend_output.m_slice_image_crcs[slice_index] = basist::crc16(tex.get_ptr(), tex.get_size_in_bytes(), 0); - + } // slice_index - + + debug_printf("basis_compressor::encode_slices_to_uastc_hdr: Total time: %3.3f secs\n", tm.get_elapsed_secs()); + + if (m_params.m_debug) + { + debug_printf("\n----- Total unique UASTC block descs: %u\n", (uint32_t)unique_block_descs.size()); + + uint32_t c = 0; + for (auto it = unique_block_descs.begin(); it != unique_block_descs.end(); ++it) + { + debug_printf("%u. 
Total uses: %u %3.2f%%, solid color: %u\n", c, it->second.m_count, + ((float)it->second.m_count * 100.0f) / enc_stats.m_total_blocks, it->first.m_solid_flag); + + if (!it->first.m_solid_flag) + { + debug_printf(" Num partitions: %u\n", it->first.m_num_partitions); + debug_printf(" CEM index: %u\n", it->first.m_cem_index); + debug_printf(" Weight ISE range: %u (%u levels)\n", it->first.m_weight_ise_range, astc_helpers::get_ise_levels(it->first.m_weight_ise_range)); + debug_printf(" Endpoint ISE range: %u (%u levels)\n", it->first.m_endpoint_ise_range, astc_helpers::get_ise_levels(it->first.m_endpoint_ise_range)); + } + +#ifdef UASTC_HDR_DEBUG_SAVE_CATEGORIZED_BLOCKS + debug_printf(" -- UASTC HDR block bytes:\n"); + for (uint32_t j = 0; j < minimum(4, it->second.m_blks.size()); j++) + { + basist::astc_blk& blk = it->second.m_blks[j]; + + debug_printf(" - UASTC HDR: { "); + for (uint32_t k = 0; k < 16; k++) + debug_printf("%u%s", ((const uint8_t*)&blk)[k], (k != 15) ? ", " : ""); + debug_printf(" }\n"); + + basist::bc6h_block bc6h_blk; + bool res = astc_hdr_transcode_to_bc6h(blk, bc6h_blk); + assert(res); + if (!res) + { + error_printf("astc_hdr_transcode_to_bc6h() failed!\n"); + return cECFailedEncodeUASTC; + } + + debug_printf(" - BC6H: { "); + for (uint32_t k = 0; k < 16; k++) + debug_printf("%u%s", ((const uint8_t*)&bc6h_blk)[k], (k != 15) ? 
", " : ""); + debug_printf(" }\n"); + } +#endif + + c++; + } + printf("\n"); + + enc_stats.print(); + } + return cECSuccess; } - bool basis_compressor::generate_mipmaps(const image &img, basisu::vector &mips, bool has_alpha) + basis_compressor::error_code basis_compressor::encode_slices_to_uastc() { - debug_printf("basis_compressor::generate_mipmaps\n"); + debug_printf("basis_compressor::encode_slices_to_uastc\n"); - interval_timer tm; - tm.start(); + m_uastc_slice_textures.resize(m_slice_descs.size()); + for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++) + m_uastc_slice_textures[slice_index].init(texture_format::cUASTC4x4, m_slice_descs[slice_index].m_orig_width, m_slice_descs[slice_index].m_orig_height); - uint32_t total_levels = 1; - uint32_t w = img.get_width(), h = img.get_height(); - while (maximum(w, h) > (uint32_t)m_params.m_mip_smallest_dimension) + m_uastc_backend_output.m_tex_format = basist::basis_tex_format::cUASTC4x4; + m_uastc_backend_output.m_etc1s = false; + m_uastc_backend_output.m_slice_desc = m_slice_descs; + m_uastc_backend_output.m_slice_image_data.resize(m_slice_descs.size()); + m_uastc_backend_output.m_slice_image_crcs.resize(m_slice_descs.size()); + + for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++) { - w = maximum(w >> 1U, 1U); + gpu_image& tex = m_uastc_slice_textures[slice_index]; + basisu_backend_slice_desc& slice_desc = m_slice_descs[slice_index]; + (void)slice_desc; + + const uint32_t num_blocks_x = tex.get_blocks_x(); + const uint32_t num_blocks_y = tex.get_blocks_y(); + const uint32_t total_blocks = tex.get_total_blocks(); + const image& source_image = m_slice_images[slice_index]; + + std::atomic total_blocks_processed; + total_blocks_processed = 0; + + const uint32_t N = 256; + for (uint32_t block_index_iter = 0; block_index_iter < total_blocks; block_index_iter += N) + { + const uint32_t first_index = block_index_iter; + const uint32_t last_index = 
minimum(total_blocks, block_index_iter + N); + + // FIXME: This sucks, but we're having a stack size related problem with std::function with emscripten. +#ifndef __EMSCRIPTEN__ + m_params.m_pJob_pool->add_job([this, first_index, last_index, num_blocks_x, num_blocks_y, total_blocks, &source_image, &tex, &total_blocks_processed] + { +#endif + BASISU_NOTE_UNUSED(num_blocks_y); + + uint32_t uastc_flags = m_params.m_pack_uastc_flags; + if ((m_params.m_rdo_uastc) && (m_params.m_rdo_uastc_favor_simpler_modes_in_rdo_mode)) + uastc_flags |= cPackUASTCFavorSimplerModes; + + for (uint32_t block_index = first_index; block_index < last_index; block_index++) + { + const uint32_t block_x = block_index % num_blocks_x; + const uint32_t block_y = block_index / num_blocks_x; + + color_rgba block_pixels[4][4]; + + source_image.extract_block_clamped((color_rgba*)block_pixels, block_x * 4, block_y * 4, 4, 4); + + basist::uastc_block& dest_block = *(basist::uastc_block*)tex.get_block_ptr(block_x, block_y); + + encode_uastc(&block_pixels[0][0].r, dest_block, uastc_flags); + + total_blocks_processed++; + + uint32_t val = total_blocks_processed; + if (((val & 16383) == 16383) && m_params.m_status_output) + { + debug_printf("basis_compressor::encode_slices_to_uastc: %3.1f%% done\n", static_cast(val) * 100.0f / total_blocks); + } + + } + +#ifndef __EMSCRIPTEN__ + }); +#endif + + } // block_index_iter + +#ifndef __EMSCRIPTEN__ + m_params.m_pJob_pool->wait_for_all(); +#endif + + if (m_params.m_rdo_uastc) + { + uastc_rdo_params rdo_params; + rdo_params.m_lambda = m_params.m_rdo_uastc_quality_scalar; + rdo_params.m_max_allowed_rms_increase_ratio = m_params.m_rdo_uastc_max_allowed_rms_increase_ratio; + rdo_params.m_skip_block_rms_thresh = m_params.m_rdo_uastc_skip_block_rms_thresh; + rdo_params.m_lz_dict_size = m_params.m_rdo_uastc_dict_size; + rdo_params.m_smooth_block_max_error_scale = m_params.m_rdo_uastc_max_smooth_block_error_scale; + rdo_params.m_max_smooth_block_std_dev = 
m_params.m_rdo_uastc_smooth_block_max_std_dev; + + bool status = uastc_rdo(tex.get_total_blocks(), (basist::uastc_block*)tex.get_ptr(), + (const color_rgba *)m_source_blocks[slice_desc.m_first_block_index].m_pixels, rdo_params, m_params.m_pack_uastc_flags, m_params.m_rdo_uastc_multithreading ? m_params.m_pJob_pool : nullptr, + (m_params.m_rdo_uastc_multithreading && m_params.m_pJob_pool) ? basisu::minimum(4, (uint32_t)m_params.m_pJob_pool->get_total_threads()) : 0); + if (!status) + { + return cECFailedUASTCRDOPostProcess; + } + } + + m_uastc_backend_output.m_slice_image_data[slice_index].resize(tex.get_size_in_bytes()); + memcpy(&m_uastc_backend_output.m_slice_image_data[slice_index][0], tex.get_ptr(), tex.get_size_in_bytes()); + + m_uastc_backend_output.m_slice_image_crcs[slice_index] = basist::crc16(tex.get_ptr(), tex.get_size_in_bytes(), 0); + + } // slice_index + + return cECSuccess; + } + + bool basis_compressor::generate_mipmaps(const imagef& img, basisu::vector& mips, bool has_alpha) + { + debug_printf("basis_compressor::generate_mipmaps\n"); + + interval_timer tm; + tm.start(); + + uint32_t total_levels = 1; + uint32_t w = img.get_width(), h = img.get_height(); + while (maximum(w, h) > (uint32_t)m_params.m_mip_smallest_dimension) + { + w = maximum(w >> 1U, 1U); + h = maximum(h >> 1U, 1U); + total_levels++; + } + + for (uint32_t level = 1; level < total_levels; level++) + { + const uint32_t level_width = maximum(1, img.get_width() >> level); + const uint32_t level_height = maximum(1, img.get_height() >> level); + + imagef& level_img = *enlarge_vector(mips, 1); + level_img.resize(level_width, level_height); + + const imagef* pSource_image = &img; + + if (m_params.m_mip_fast) + { + if (level > 1) + pSource_image = &mips[level - 1]; + } + + bool status = image_resample(*pSource_image, level_img, + //m_params.m_mip_filter.c_str(), + "box", // TODO: negative lobes in the filter are causing negative colors, try Mitchell + m_params.m_mip_scale, 
m_params.m_mip_wrapping, 0, has_alpha ? 4 : 3); + if (!status) + { + error_printf("basis_compressor::generate_mipmaps: image_resample() failed!\n"); + return false; + } + + clean_hdr_image(level_img); + } + + if (m_params.m_debug) + debug_printf("Total mipmap generation time: %3.3f secs\n", tm.get_elapsed_secs()); + + return true; + } + + bool basis_compressor::generate_mipmaps(const image &img, basisu::vector &mips, bool has_alpha) + { + debug_printf("basis_compressor::generate_mipmaps\n"); + + interval_timer tm; + tm.start(); + + uint32_t total_levels = 1; + uint32_t w = img.get_width(), h = img.get_height(); + while (maximum(w, h) > (uint32_t)m_params.m_mip_smallest_dimension) + { + w = maximum(w >> 1U, 1U); h = maximum(h >> 1U, 1U); total_levels++; } @@ -463,17 +969,224 @@ namespace basisu return true; } + void basis_compressor::clean_hdr_image(imagef& src_img) + { + const uint32_t width = src_img.get_width(); + const uint32_t height = src_img.get_height(); + + float max_used_val = 0.0f; + for (uint32_t y = 0; y < height; y++) + { + for (uint32_t x = 0; x < width; x++) + { + vec4F& c = src_img(x, y); + for (uint32_t i = 0; i < 3; i++) + max_used_val = maximum(max_used_val, c[i]); + } + } + + double hdr_image_scale = 1.0f; + if (max_used_val > basist::ASTC_HDR_MAX_VAL) + { + hdr_image_scale = max_used_val / basist::ASTC_HDR_MAX_VAL; + + const double inv_hdr_image_scale = basist::ASTC_HDR_MAX_VAL / max_used_val; + + for (uint32_t y = 0; y < src_img.get_height(); y++) + { + for (uint32_t x = 0; x < src_img.get_width(); x++) + { + vec4F& c = src_img(x, y); + + for (uint32_t i = 0; i < 3; i++) + c[i] = (float)minimum(c[i] * inv_hdr_image_scale, basist::ASTC_HDR_MAX_VAL); + } + } + + printf("Warning: The input HDR image's maximum used float value was %f, which is too high to encode as ASTC HDR. 
The image's components have been linearly scaled so the maximum used value is %f, by multiplying by %f.\n", + max_used_val, basist::ASTC_HDR_MAX_VAL, inv_hdr_image_scale); + + printf("The decoded ASTC HDR texture will have to be scaled up by %f.\n", hdr_image_scale); + } + + // TODO: Determine a constant scale factor, apply if > MAX_HALF_FLOAT + if (!src_img.clean_astc_hdr_pixels(basist::ASTC_HDR_MAX_VAL)) + printf("Warning: clean_astc_hdr_pixels() had to modify the input image to encode to ASTC HDR - see previous warning(s).\n"); + + float lowest_nonzero_val = 1e+30f; + float lowest_val = 1e+30f; + float highest_val = -1e+30f; + + for (uint32_t y = 0; y < src_img.get_height(); y++) + { + for (uint32_t x = 0; x < src_img.get_width(); x++) + { + const vec4F& c = src_img(x, y); + + for (uint32_t i = 0; i < 3; i++) + { + lowest_val = basisu::minimum(lowest_val, c[i]); + + if (c[i] != 0.0f) + lowest_nonzero_val = basisu::minimum(lowest_nonzero_val, c[i]); + + highest_val = basisu::maximum(highest_val, c[i]); + } + } + } + + debug_printf("Lowest image value: %e, lowest non-zero value: %e, highest value: %e, dynamic range: %e\n", lowest_val, lowest_nonzero_val, highest_val, highest_val / lowest_nonzero_val); + } + + bool basis_compressor::read_dds_source_images() + { + debug_printf("basis_compressor::read_dds_source_images\n"); + + // Nothing to do if the caller doesn't want us reading source images. + if ((!m_params.m_read_source_images) || (!m_params.m_source_filenames.size())) + return true; + + // Just bail of the caller has specified their own source images. 
+ if (m_params.m_source_images.size() || m_params.m_source_images_hdr.size()) + return true; + + if (m_params.m_source_mipmap_images.size() || m_params.m_source_mipmap_images_hdr.size()) + return true; + + // See if any input filenames are .DDS + bool any_dds = false, all_dds = true; + for (uint32_t i = 0; i < m_params.m_source_filenames.size(); i++) + { + std::string ext(string_get_extension(m_params.m_source_filenames[i])); + if (strcasecmp(ext.c_str(), "dds") == 0) + any_dds = true; + else + all_dds = false; + } + + // Bail if no .DDS files specified. + if (!any_dds) + return true; + + // If any input is .DDS they all must be .DDS, for simplicity. + if (!all_dds) + { + error_printf("If any filename is DDS, all filenames must be DDS.\n"); + return false; + } + + // Can't jam in alpha channel images if any .DDS files specified. + if (m_params.m_source_alpha_filenames.size()) + { + error_printf("Source alpha filenames are not supported in DDS mode.\n"); + return false; + } + + bool any_mipmaps = false; + + // Read each .DDS texture file + for (uint32_t i = 0; i < m_params.m_source_filenames.size(); i++) + { + basisu::vector ldr_mips; + basisu::vector hdr_mips; + bool status = read_uncompressed_dds_file(m_params.m_source_filenames[i].c_str(), ldr_mips, hdr_mips); + if (!status) + return false; + + assert(ldr_mips.size() || hdr_mips.size()); + + if (m_params.m_status_output) + { + printf("Read DDS file \"%s\", %s, %ux%u, %u mipmap levels\n", + m_params.m_source_filenames[i].c_str(), + ldr_mips.size() ? "LDR" : "HDR", + ldr_mips.size() ? ldr_mips[0].get_width() : hdr_mips[0].get_width(), + ldr_mips.size() ? ldr_mips[0].get_height() : hdr_mips[0].get_height(), + ldr_mips.size() ? 
ldr_mips.size() : hdr_mips.size()); + } + + if (ldr_mips.size()) + { + if (m_params.m_source_images_hdr.size()) + { + error_printf("All DDS files must be of the same type (all LDR, or all HDR)\n"); + return false; + } + + m_params.m_source_images.push_back(ldr_mips[0]); + m_params.m_source_mipmap_images.resize(m_params.m_source_mipmap_images.size() + 1); + + if (ldr_mips.size() > 1) + { + ldr_mips.erase(0U); + + m_params.m_source_mipmap_images.back().swap(ldr_mips); + + any_mipmaps = true; + } + } + else + { + if (m_params.m_source_images.size()) + { + error_printf("All DDS files must be of the same type (all LDR, or all HDR)\n"); + return false; + } + + m_params.m_source_images_hdr.push_back(hdr_mips[0]); + m_params.m_source_mipmap_images_hdr.resize(m_params.m_source_mipmap_images_hdr.size() + 1); + + if (hdr_mips.size() > 1) + { + hdr_mips.erase(0U); + + m_params.m_source_mipmap_images_hdr.back().swap(hdr_mips); + + any_mipmaps = true; + } + + m_params.m_hdr = true; + m_params.m_uastc = true; + } + } + + m_params.m_read_source_images = false; + m_params.m_source_filenames.clear(); + m_params.m_source_alpha_filenames.clear(); + + if (!any_mipmaps) + { + m_params.m_source_mipmap_images.clear(); + m_params.m_source_mipmap_images_hdr.clear(); + } + + if ((m_params.m_hdr) && (!m_params.m_source_images_hdr.size())) + { + error_printf("HDR mode enabled, but only LDR .DDS files were loaded. HDR mode requires half or float (HDR) .DDS inputs.\n"); + return false; + } + + return true; + } + bool basis_compressor::read_source_images() { debug_printf("basis_compressor::read_source_images\n"); - const uint32_t total_source_files = m_params.m_read_source_images ? (uint32_t)m_params.m_source_filenames.size() : (uint32_t)m_params.m_source_images.size(); + const uint32_t total_source_files = m_params.m_read_source_images ? (uint32_t)m_params.m_source_filenames.size() : + (m_params.m_hdr ? 
(uint32_t)m_params.m_source_images_hdr.size() : (uint32_t)m_params.m_source_images.size()); + if (!total_source_files) + { + debug_printf("basis_compressor::read_source_images: No source images to process\n"); + return false; + } m_stats.resize(0); m_slice_descs.resize(0); m_slice_images.resize(0); + m_slice_images_hdr.resize(0); m_total_blocks = 0; uint32_t total_macroblocks = 0; @@ -481,106 +1194,196 @@ namespace basisu m_any_source_image_has_alpha = false; basisu::vector source_images; + basisu::vector source_images_hdr; + basisu::vector source_filenames; + // TODO: Note HDR images don't support alpha here, currently. + // First load all source images, and determine if any have an alpha channel. for (uint32_t source_file_index = 0; source_file_index < total_source_files; source_file_index++) { - const char *pSource_filename = ""; + const char* pSource_filename = ""; image file_image; - + imagef file_image_hdr; + if (m_params.m_read_source_images) { pSource_filename = m_params.m_source_filenames[source_file_index].c_str(); // Load the source image - if (!load_image(pSource_filename, file_image)) + if (m_params.m_hdr) { - error_printf("Failed reading source image: %s\n", pSource_filename); - return false; + if (!load_image_hdr(pSource_filename, file_image_hdr, m_params.m_hdr_ldr_srgb_to_linear_conversion)) + { + error_printf("Failed reading source image: %s\n", pSource_filename); + return false; + } + + // For now, just slam alpha to 1.0f. UASTC HDR doesn't support alpha yet. + for (uint32_t y = 0; y < file_image_hdr.get_height(); y++) + for (uint32_t x = 0; x < file_image_hdr.get_width(); x++) + file_image_hdr(x, y)[3] = 1.0f; } + else + { + if (!load_image(pSource_filename, file_image)) + { + error_printf("Failed reading source image: %s\n", pSource_filename); + return false; + } + } + + const uint32_t width = m_params.m_hdr ? file_image_hdr.get_width() : file_image.get_width(); + const uint32_t height = m_params.m_hdr ? 
file_image_hdr.get_height() : file_image.get_height(); if (m_params.m_status_output) { - printf("Read source image \"%s\", %ux%u\n", pSource_filename, file_image.get_width(), file_image.get_height()); + printf("Read source image \"%s\", %ux%u\n", pSource_filename, width, height); } - // Optionally load another image and put a grayscale version of it into the alpha channel. - if ((source_file_index < m_params.m_source_alpha_filenames.size()) && (m_params.m_source_alpha_filenames[source_file_index].size())) + if (m_params.m_hdr) { - const char *pSource_alpha_image = m_params.m_source_alpha_filenames[source_file_index].c_str(); + clean_hdr_image(file_image_hdr); + } + else + { + // Optionally load another image and put a grayscale version of it into the alpha channel. + if ((source_file_index < m_params.m_source_alpha_filenames.size()) && (m_params.m_source_alpha_filenames[source_file_index].size())) + { + const char* pSource_alpha_image = m_params.m_source_alpha_filenames[source_file_index].c_str(); - image alpha_data; + image alpha_data; - if (!load_image(pSource_alpha_image, alpha_data)) - { - error_printf("Failed reading source image: %s\n", pSource_alpha_image); - return false; - } + if (!load_image(pSource_alpha_image, alpha_data)) + { + error_printf("Failed reading source image: %s\n", pSource_alpha_image); + return false; + } - printf("Read source alpha image \"%s\", %ux%u\n", pSource_alpha_image, alpha_data.get_width(), alpha_data.get_height()); + printf("Read source alpha image \"%s\", %ux%u\n", pSource_alpha_image, alpha_data.get_width(), alpha_data.get_height()); - alpha_data.crop(file_image.get_width(), file_image.get_height()); + alpha_data.crop(width, height); - for (uint32_t y = 0; y < file_image.get_height(); y++) - for (uint32_t x = 0; x < file_image.get_width(); x++) - file_image(x, y).a = (uint8_t)alpha_data(x, y).get_709_luma(); + for (uint32_t y = 0; y < height; y++) + for (uint32_t x = 0; x < width; x++) + file_image(x, y).a = 
(uint8_t)alpha_data(x, y).get_709_luma(); + } } } else { - file_image = m_params.m_source_images[source_file_index]; + if (m_params.m_hdr) + { + file_image_hdr = m_params.m_source_images_hdr[source_file_index]; + clean_hdr_image(file_image_hdr); + } + else + { + file_image = m_params.m_source_images[source_file_index]; + } } - if (m_params.m_renormalize) - file_image.renormalize_normal_map(); + if (!m_params.m_hdr) + { + if (m_params.m_renormalize) + file_image.renormalize_normal_map(); + } bool alpha_swizzled = false; + if (m_params.m_swizzle[0] != 0 || m_params.m_swizzle[1] != 1 || m_params.m_swizzle[2] != 2 || m_params.m_swizzle[3] != 3) { - // Used for XY normal maps in RG - puts X in color, Y in alpha - for (uint32_t y = 0; y < file_image.get_height(); y++) - for (uint32_t x = 0; x < file_image.get_width(); x++) + if (!m_params.m_hdr) + { + // Used for XY normal maps in RG - puts X in color, Y in alpha + for (uint32_t y = 0; y < file_image.get_height(); y++) + { + for (uint32_t x = 0; x < file_image.get_width(); x++) + { + const color_rgba& c = file_image(x, y); + file_image(x, y).set_noclamp_rgba(c[m_params.m_swizzle[0]], c[m_params.m_swizzle[1]], c[m_params.m_swizzle[2]], c[m_params.m_swizzle[3]]); + } + } + + alpha_swizzled = (m_params.m_swizzle[3] != 3); + } + else + { + // Used for XY normal maps in RG - puts X in color, Y in alpha + for (uint32_t y = 0; y < file_image_hdr.get_height(); y++) { - const color_rgba &c = file_image(x, y); - file_image(x, y).set_noclamp_rgba(c[m_params.m_swizzle[0]], c[m_params.m_swizzle[1]], c[m_params.m_swizzle[2]], c[m_params.m_swizzle[3]]); + for (uint32_t x = 0; x < file_image_hdr.get_width(); x++) + { + const vec4F& c = file_image_hdr(x, y); + + // For now, alpha is always 1.0f in UASTC HDR. 
+ file_image_hdr(x, y).set(c[m_params.m_swizzle[0]], c[m_params.m_swizzle[1]], c[m_params.m_swizzle[2]], 1.0f); // c[m_params.m_swizzle[3]]); + } } - alpha_swizzled = m_params.m_swizzle[3] != 3; + } } - + bool has_alpha = false; - if (m_params.m_force_alpha || alpha_swizzled) - has_alpha = true; - else if (!m_params.m_check_for_alpha) - file_image.set_alpha(255); - else if (file_image.has_alpha()) - has_alpha = true; - if (has_alpha) - m_any_source_image_has_alpha = true; + if (!m_params.m_hdr) + { + if (m_params.m_force_alpha || alpha_swizzled) + has_alpha = true; + else if (!m_params.m_check_for_alpha) + file_image.set_alpha(255); + else if (file_image.has_alpha()) + has_alpha = true; + + if (has_alpha) + m_any_source_image_has_alpha = true; + } + + { + const uint32_t width = m_params.m_hdr ? file_image_hdr.get_width() : file_image.get_width(); + const uint32_t height = m_params.m_hdr ? file_image_hdr.get_height() : file_image.get_height(); + + debug_printf("Source image index %u filename %s %ux%u has alpha: %u\n", source_file_index, pSource_filename, width, height, has_alpha); + } - debug_printf("Source image index %u filename %s %ux%u has alpha: %u\n", source_file_index, pSource_filename, file_image.get_width(), file_image.get_height(), has_alpha); - if (m_params.m_y_flip) - file_image.flip_y(); + { + if (m_params.m_hdr) + file_image_hdr.flip_y(); + else + file_image.flip_y(); + } #if DEBUG_EXTRACT_SINGLE_BLOCK - image block_image(4, 4); const uint32_t block_x = 0; const uint32_t block_y = 0; - block_image.blit(block_x * 4, block_y * 4, 4, 4, 0, 0, file_image, 0); - file_image = block_image; + + if (m_params.m_hdr) + { + imagef block_image(4, 4); + block_image_hdr.blit(block_x * 4, block_y * 4, 4, 4, 0, 0, file_image_hdr, 0); + file_image_hdr = block_image; + } + else + { + image block_image(4, 4); + block_image.blit(block_x * 4, block_y * 4, 4, 4, 0, 0, file_image, 0); + file_image = block_image; + } #endif #if DEBUG_CROP_TEXTURE_TO_64x64 - 
file_image.resize(64, 64); + if (m_params.m_hdr) + file_image_hdr.resize(64, 64); + else + file_image.resize(64, 64); #endif - if (m_params.m_resample_width > 0 && m_params.m_resample_height > 0) + if ((m_params.m_resample_width > 0) && (m_params.m_resample_height > 0)) { int new_width = basisu::minimum(m_params.m_resample_width, BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION); int new_height = basisu::minimum(m_params.m_resample_height, BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION); @@ -588,129 +1391,225 @@ namespace basisu debug_printf("Resampling to %ix%i\n", new_width, new_height); // TODO: A box filter - kaiser looks too sharp on video. Let the caller control this. - image temp_img(new_width, new_height); - image_resample(file_image, temp_img, m_params.m_perceptual, "box"); // "kaiser"); - temp_img.swap(file_image); + if (m_params.m_hdr) + { + imagef temp_img(new_width, new_height); + image_resample(file_image_hdr, temp_img, "box"); // "kaiser"); + clean_hdr_image(temp_img); + temp_img.swap(file_image_hdr); + } + else + { + image temp_img(new_width, new_height); + image_resample(file_image, temp_img, m_params.m_perceptual, "box"); // "kaiser"); + temp_img.swap(file_image); + } } else if (m_params.m_resample_factor > 0.0f) { - int new_width = basisu::minimum(basisu::maximum(1, (int)ceilf(file_image.get_width() * m_params.m_resample_factor)), BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION); - int new_height = basisu::minimum(basisu::maximum(1, (int)ceilf(file_image.get_height() * m_params.m_resample_factor)), BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION); + // TODO: A box filter - kaiser looks too sharp on video. Let the caller control this. 
+ if (m_params.m_hdr) + { + int new_width = basisu::minimum(basisu::maximum(1, (int)ceilf(file_image_hdr.get_width() * m_params.m_resample_factor)), BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION); + int new_height = basisu::minimum(basisu::maximum(1, (int)ceilf(file_image_hdr.get_height() * m_params.m_resample_factor)), BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION); - debug_printf("Resampling to %ix%i\n", new_width, new_height); + debug_printf("Resampling to %ix%i\n", new_width, new_height); - // TODO: A box filter - kaiser looks too sharp on video. Let the caller control this. - image temp_img(new_width, new_height); - image_resample(file_image, temp_img, m_params.m_perceptual, "box"); // "kaiser"); - temp_img.swap(file_image); + imagef temp_img(new_width, new_height); + image_resample(file_image_hdr, temp_img, "box"); // "kaiser"); + clean_hdr_image(temp_img); + temp_img.swap(file_image_hdr); + } + else + { + int new_width = basisu::minimum(basisu::maximum(1, (int)ceilf(file_image.get_width() * m_params.m_resample_factor)), BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION); + int new_height = basisu::minimum(basisu::maximum(1, (int)ceilf(file_image.get_height() * m_params.m_resample_factor)), BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION); + + debug_printf("Resampling to %ix%i\n", new_width, new_height); + + image temp_img(new_width, new_height); + image_resample(file_image, temp_img, m_params.m_perceptual, "box"); // "kaiser"); + temp_img.swap(file_image); + } } - if ((!file_image.get_width()) || (!file_image.get_height())) + const uint32_t width = m_params.m_hdr ? file_image_hdr.get_width() : file_image.get_width(); + const uint32_t height = m_params.m_hdr ? 
file_image_hdr.get_height() : file_image.get_height(); + + if ((!width) || (!height)) { error_printf("basis_compressor::read_source_images: Source image has a zero width and/or height!\n"); return false; } - if ((file_image.get_width() > BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION) || (file_image.get_height() > BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION)) + if ((width > BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION) || (height > BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION)) { error_printf("basis_compressor::read_source_images: Source image \"%s\" is too large!\n", pSource_filename); return false; } - source_images.enlarge(1)->swap(file_image); + if (!m_params.m_hdr) + source_images.enlarge(1)->swap(file_image); + else + source_images_hdr.enlarge(1)->swap(file_image_hdr); + source_filenames.push_back(pSource_filename); } // Check if the caller has generated their own mipmaps. - if (m_params.m_source_mipmap_images.size()) + if (m_params.m_hdr) { - // Make sure they've passed us enough mipmap chains. - if ((m_params.m_source_images.size() != m_params.m_source_mipmap_images.size()) || (total_source_files != m_params.m_source_images.size())) + if (m_params.m_source_mipmap_images_hdr.size()) { - error_printf("basis_compressor::read_source_images(): m_params.m_source_mipmap_images.size() must equal m_params.m_source_images.size()!\n"); - return false; + // Make sure they've passed us enough mipmap chains. + if ((m_params.m_source_images_hdr.size() != m_params.m_source_mipmap_images_hdr.size()) || (total_source_files != m_params.m_source_images_hdr.size())) + { + error_printf("basis_compressor::read_source_images(): m_params.m_source_mipmap_images_hdr.size() must equal m_params.m_source_images_hdr.size()!\n"); + return false; + } } - - // Check if any of the user-supplied mipmap levels has alpha. - // We're assuming the user has already preswizzled their mipmap source images. 
- if (!m_any_source_image_has_alpha) + } + else + { + if (m_params.m_source_mipmap_images.size()) { - for (uint32_t source_file_index = 0; source_file_index < total_source_files; source_file_index++) + // Make sure they've passed us enough mipmap chains. + if ((m_params.m_source_images.size() != m_params.m_source_mipmap_images.size()) || (total_source_files != m_params.m_source_images.size())) { - for (uint32_t mip_index = 0; mip_index < m_params.m_source_mipmap_images[source_file_index].size(); mip_index++) - { - const image& mip_img = m_params.m_source_mipmap_images[source_file_index][mip_index]; + error_printf("basis_compressor::read_source_images(): m_params.m_source_mipmap_images.size() must equal m_params.m_source_images.size()!\n"); + return false; + } - if (mip_img.has_alpha()) + // Check if any of the user-supplied mipmap levels has alpha. + if (!m_any_source_image_has_alpha) + { + for (uint32_t source_file_index = 0; source_file_index < total_source_files; source_file_index++) + { + for (uint32_t mip_index = 0; mip_index < m_params.m_source_mipmap_images[source_file_index].size(); mip_index++) { - m_any_source_image_has_alpha = true; - break; + const image& mip_img = m_params.m_source_mipmap_images[source_file_index][mip_index]; + + // Be sure to take into account any swizzling which will be applied. + if (mip_img.has_alpha(m_params.m_swizzle[3])) + { + m_any_source_image_has_alpha = true; + break; + } } - } - if (m_any_source_image_has_alpha) - break; + if (m_any_source_image_has_alpha) + break; + } } } } debug_printf("Any source image has alpha: %u\n", m_any_source_image_has_alpha); + // Now, for each source image, create the slices corresponding to that image. for (uint32_t source_file_index = 0; source_file_index < total_source_files; source_file_index++) { const std::string &source_filename = source_filenames[source_file_index]; - - // Now, for each source image, create the slices corresponding to that image. 
+ basisu::vector slices; + basisu::vector slices_hdr; slices.reserve(32); + slices_hdr.reserve(32); // The first (largest) mipmap level. - image& file_image = source_images[source_file_index]; - + image *pFile_image = source_images.size() ? &source_images[source_file_index] : nullptr; + imagef *pFile_image_hdr = source_images_hdr.size() ? &source_images_hdr[source_file_index] : nullptr; + // Reserve a slot for mip0. - slices.resize(1); - - if (m_params.m_source_mipmap_images.size()) + if (m_params.m_hdr) + slices_hdr.resize(1); + else + slices.resize(1); + + if ((!m_params.m_hdr) && (m_params.m_source_mipmap_images.size())) { // User-provided mipmaps for each layer or image in the texture array. for (uint32_t mip_index = 0; mip_index < m_params.m_source_mipmap_images[source_file_index].size(); mip_index++) { image& mip_img = m_params.m_source_mipmap_images[source_file_index][mip_index]; - if (m_params.m_swizzle[0] != 0 || - m_params.m_swizzle[1] != 1 || - m_params.m_swizzle[2] != 2 || - m_params.m_swizzle[3] != 3) + if ((m_params.m_swizzle[0] != 0) || + (m_params.m_swizzle[1] != 1) || + (m_params.m_swizzle[2] != 2) || + (m_params.m_swizzle[3] != 3)) { // Used for XY normal maps in RG - puts X in color, Y in alpha for (uint32_t y = 0; y < mip_img.get_height(); y++) + { for (uint32_t x = 0; x < mip_img.get_width(); x++) { - const color_rgba &c = mip_img(x, y); + const color_rgba& c = mip_img(x, y); mip_img(x, y).set_noclamp_rgba(c[m_params.m_swizzle[0]], c[m_params.m_swizzle[1]], c[m_params.m_swizzle[2]], c[m_params.m_swizzle[3]]); } + } } slices.push_back(mip_img); } } + else if ((m_params.m_hdr) && (m_params.m_source_mipmap_images_hdr.size())) + { + // User-provided mipmaps for each layer or image in the texture array. 
+ for (uint32_t mip_index = 0; mip_index < m_params.m_source_mipmap_images_hdr[source_file_index].size(); mip_index++) + { + imagef& mip_img = m_params.m_source_mipmap_images_hdr[source_file_index][mip_index]; + + if ((m_params.m_swizzle[0] != 0) || + (m_params.m_swizzle[1] != 1) || + (m_params.m_swizzle[2] != 2) || + (m_params.m_swizzle[3] != 3)) + { + // Used for XY normal maps in RG - puts X in color, Y in alpha + for (uint32_t y = 0; y < mip_img.get_height(); y++) + { + for (uint32_t x = 0; x < mip_img.get_width(); x++) + { + const vec4F& c = mip_img(x, y); + + // For now, HDR alpha is always 1.0f. + mip_img(x, y).set(c[m_params.m_swizzle[0]], c[m_params.m_swizzle[1]], c[m_params.m_swizzle[2]], 1.0f); // c[m_params.m_swizzle[3]]); + } + } + } + + clean_hdr_image(mip_img); + + slices_hdr.push_back(mip_img); + } + } else if (m_params.m_mip_gen) { // Automatically generate mipmaps. - if (!generate_mipmaps(file_image, slices, m_any_source_image_has_alpha)) - return false; + if (m_params.m_hdr) + { + if (!generate_mipmaps(*pFile_image_hdr, slices_hdr, m_any_source_image_has_alpha)) + return false; + } + else + { + if (!generate_mipmaps(*pFile_image, slices, m_any_source_image_has_alpha)) + return false; + } } // Swap in the largest mipmap level here to avoid copying it, because generate_mips() will change the array. // NOTE: file_image is now blank. - slices[0].swap(file_image); + if (m_params.m_hdr) + slices_hdr[0].swap(*pFile_image_hdr); + else + slices[0].swap(*pFile_image); - uint_vec mip_indices(slices.size()); - for (uint32_t i = 0; i < slices.size(); i++) + uint_vec mip_indices(m_params.m_hdr ? slices_hdr.size() : slices.size()); + for (uint32_t i = 0; i < (m_params.m_hdr ? 
slices_hdr.size() : slices.size()); i++) mip_indices[i] = i; - if ((m_any_source_image_has_alpha) && (!m_params.m_uastc)) + if ((!m_params.m_hdr) && (m_any_source_image_has_alpha) && (!m_params.m_uastc)) { // For ETC1S, if source has alpha, then even mips will have RGB, and odd mips will have alpha in RGB. basisu::vector alpha_slices; @@ -745,20 +1644,29 @@ namespace basisu mip_indices.swap(new_mip_indices); } - assert(slices.size() == mip_indices.size()); - - for (uint32_t slice_index = 0; slice_index < slices.size(); slice_index++) + if (m_params.m_hdr) + { + assert(slices_hdr.size() == mip_indices.size()); + } + else + { + assert(slices.size() == mip_indices.size()); + } + + for (uint32_t slice_index = 0; slice_index < (m_params.m_hdr ? slices_hdr.size() : slices.size()); slice_index++) { - image& slice_image = slices[slice_index]; - const uint32_t orig_width = slice_image.get_width(); - const uint32_t orig_height = slice_image.get_height(); + image *pSlice_image = m_params.m_hdr ? nullptr : &slices[slice_index]; + imagef *pSlice_image_hdr = m_params.m_hdr ? &slices_hdr[slice_index] : nullptr; + + const uint32_t orig_width = m_params.m_hdr ? pSlice_image_hdr->get_width() : pSlice_image->get_width(); + const uint32_t orig_height = m_params.m_hdr ? pSlice_image_hdr->get_height() : pSlice_image->get_height(); bool is_alpha_slice = false; - if (m_any_source_image_has_alpha) + if ((!m_params.m_hdr) && (m_any_source_image_has_alpha)) { if (m_params.m_uastc) { - is_alpha_slice = slice_image.has_alpha(); + is_alpha_slice = pSlice_image->has_alpha(); } else { @@ -767,43 +1675,69 @@ namespace basisu } // Enlarge the source image to 4x4 block boundaries, duplicating edge pixels if necessary to avoid introducing extra colors into blocks. 
- slice_image.crop_dup_borders(slice_image.get_block_width(4) * 4, slice_image.get_block_height(4) * 4); + if (m_params.m_hdr) + pSlice_image_hdr->crop_dup_borders(pSlice_image_hdr->get_block_width(4) * 4, pSlice_image_hdr->get_block_height(4) * 4); + else + pSlice_image->crop_dup_borders(pSlice_image->get_block_width(4) * 4, pSlice_image->get_block_height(4) * 4); if (m_params.m_debug_images) { - save_png(string_format("basis_debug_source_image_%u_slice_%u.png", source_file_index, slice_index).c_str(), slice_image); + if (m_params.m_hdr) + write_exr(string_format("basis_debug_source_image_%u_slice_%u.exr", source_file_index, slice_index).c_str(), *pSlice_image_hdr, 3, 0); + else + save_png(string_format("basis_debug_source_image_%u_slice_%u.png", source_file_index, slice_index).c_str(), *pSlice_image); } - const uint32_t dest_image_index = m_slice_images.size(); + const uint32_t dest_image_index = (m_params.m_hdr ? m_slice_images_hdr.size() : m_slice_images.size()); enlarge_vector(m_stats, 1); - enlarge_vector(m_slice_images, 1); + + if (m_params.m_hdr) + enlarge_vector(m_slice_images_hdr, 1); + else + enlarge_vector(m_slice_images, 1); + enlarge_vector(m_slice_descs, 1); - + m_stats[dest_image_index].m_filename = source_filename.c_str(); m_stats[dest_image_index].m_width = orig_width; m_stats[dest_image_index].m_height = orig_height; - - debug_printf("****** Slice %u: mip %u, alpha_slice: %u, filename: \"%s\", original: %ux%u actual: %ux%u\n", m_slice_descs.size() - 1, mip_indices[slice_index], is_alpha_slice, source_filename.c_str(), orig_width, orig_height, slice_image.get_width(), slice_image.get_height()); - basisu_backend_slice_desc &slice_desc = m_slice_descs[dest_image_index]; + debug_printf("****** Slice %u: mip %u, alpha_slice: %u, filename: \"%s\", original: %ux%u actual: %ux%u\n", + m_slice_descs.size() - 1, mip_indices[slice_index], is_alpha_slice, source_filename.c_str(), + orig_width, orig_height, + m_params.m_hdr ? 
pSlice_image_hdr->get_width() : pSlice_image->get_width(), + m_params.m_hdr ? pSlice_image_hdr->get_height() : pSlice_image->get_height()); + + basisu_backend_slice_desc& slice_desc = m_slice_descs[dest_image_index]; slice_desc.m_first_block_index = m_total_blocks; slice_desc.m_orig_width = orig_width; slice_desc.m_orig_height = orig_height; - slice_desc.m_width = slice_image.get_width(); - slice_desc.m_height = slice_image.get_height(); + if (m_params.m_hdr) + { + slice_desc.m_width = pSlice_image_hdr->get_width(); + slice_desc.m_height = pSlice_image_hdr->get_height(); + + slice_desc.m_num_blocks_x = pSlice_image_hdr->get_block_width(4); + slice_desc.m_num_blocks_y = pSlice_image_hdr->get_block_height(4); + } + else + { + slice_desc.m_width = pSlice_image->get_width(); + slice_desc.m_height = pSlice_image->get_height(); - slice_desc.m_num_blocks_x = slice_image.get_block_width(4); - slice_desc.m_num_blocks_y = slice_image.get_block_height(4); + slice_desc.m_num_blocks_x = pSlice_image->get_block_width(4); + slice_desc.m_num_blocks_y = pSlice_image->get_block_height(4); + } slice_desc.m_num_macroblocks_x = (slice_desc.m_num_blocks_x + 1) >> 1; slice_desc.m_num_macroblocks_y = (slice_desc.m_num_blocks_y + 1) >> 1; slice_desc.m_source_file_index = source_file_index; - + slice_desc.m_mip_index = mip_indices[slice_index]; slice_desc.m_alpha = is_alpha_slice; @@ -818,8 +1752,11 @@ namespace basisu // Finally, swap in the slice's image to avoid copying it. // NOTE: slice_image is now blank. 
- m_slice_images[dest_image_index].swap(slice_image); - + if (m_params.m_hdr) + m_slice_images_hdr[dest_image_index].swap(*pSlice_image_hdr); + else + m_slice_images[dest_image_index].swap(*pSlice_image); + } // slice_index } // source_file_index @@ -855,7 +1792,7 @@ namespace basisu if (m_params.m_status_output) { - printf("Total basis file slices: %u\n", (uint32_t)m_slice_descs.size()); + printf("Total slices: %u\n", (uint32_t)m_slice_descs.size()); } for (uint32_t i = 0; i < m_slice_descs.size(); i++) @@ -865,11 +1802,17 @@ namespace basisu if (m_params.m_status_output) { printf("Slice: %u, alpha: %u, orig width/height: %ux%u, width/height: %ux%u, first_block: %u, image_index: %u, mip_level: %u, iframe: %u\n", - i, slice_desc.m_alpha, slice_desc.m_orig_width, slice_desc.m_orig_height, slice_desc.m_width, slice_desc.m_height, slice_desc.m_first_block_index, slice_desc.m_source_file_index, slice_desc.m_mip_index, slice_desc.m_iframe); + i, slice_desc.m_alpha, slice_desc.m_orig_width, slice_desc.m_orig_height, + slice_desc.m_width, slice_desc.m_height, + slice_desc.m_first_block_index, slice_desc.m_source_file_index, slice_desc.m_mip_index, slice_desc.m_iframe); } if (m_any_source_image_has_alpha) { + // HDR doesn't support alpha yet + if (m_params.m_hdr) + return false; + if (!m_params.m_uastc) { // For ETC1S, alpha slices must be at odd slice indices. @@ -903,6 +1846,7 @@ namespace basisu if ((slice_desc.m_orig_width > slice_desc.m_width) || (slice_desc.m_orig_height > slice_desc.m_height)) return false; + if ((slice_desc.m_source_file_index == 0) && (m_params.m_tex_type == basist::cBASISTexTypeVideoFrames)) { if (!slice_desc.m_iframe) @@ -924,7 +1868,7 @@ namespace basisu uint32_t total_basis_images = 0; - for (uint32_t slice_index = 0; slice_index < m_slice_images.size(); slice_index++) + for (uint32_t slice_index = 0; slice_index < (m_params.m_hdr ? 
m_slice_images_hdr.size() : m_slice_images.size()); slice_index++) { const basisu_backend_slice_desc &slice_desc = m_slice_descs[slice_index]; @@ -945,7 +1889,7 @@ namespace basisu uint_vec image_mipmap_levels(total_basis_images); int width = -1, height = -1; - for (uint32_t slice_index = 0; slice_index < m_slice_images.size(); slice_index++) + for (uint32_t slice_index = 0; slice_index < (m_params.m_hdr ? m_slice_images_hdr.size() : m_slice_images.size()); slice_index++) { const basisu_backend_slice_desc &slice_desc = m_slice_descs[slice_index]; @@ -982,20 +1926,52 @@ namespace basisu { debug_printf("basis_compressor::extract_source_blocks\n"); - m_source_blocks.resize(m_total_blocks); + if (m_params.m_hdr) + m_source_blocks_hdr.resize(m_total_blocks); + else + m_source_blocks.resize(m_total_blocks); - for (uint32_t slice_index = 0; slice_index < m_slice_images.size(); slice_index++) + for (uint32_t slice_index = 0; slice_index < (m_params.m_hdr ? m_slice_images_hdr.size() : m_slice_images.size()); slice_index++) { const basisu_backend_slice_desc& slice_desc = m_slice_descs[slice_index]; const uint32_t num_blocks_x = slice_desc.m_num_blocks_x; const uint32_t num_blocks_y = slice_desc.m_num_blocks_y; - const image& source_image = m_slice_images[slice_index]; + const image *pSource_image = m_params.m_hdr ? nullptr : &m_slice_images[slice_index]; + const imagef *pSource_image_hdr = m_params.m_hdr ? 
&m_slice_images_hdr[slice_index] : nullptr; for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++) + { for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++) - source_image.extract_block_clamped(m_source_blocks[slice_desc.m_first_block_index + block_x + block_y * num_blocks_x].get_ptr(), block_x * 4, block_y * 4, 4, 4); + { + if (m_params.m_hdr) + { + vec4F* pBlock = m_source_blocks_hdr[slice_desc.m_first_block_index + block_x + block_y * num_blocks_x].get_ptr(); + + pSource_image_hdr->extract_block_clamped(pBlock, block_x * 4, block_y * 4, 4, 4); + + // Additional (technically optional) early sanity checking of the block texels. + for (uint32_t i = 0; i < 16; i++) + { + for (uint32_t c = 0; c < 3; c++) + { + float v = pBlock[i][c]; + + if (std::isnan(v) || std::isinf(v) || (v < 0.0f) || (v > basist::MAX_HALF_FLOAT)) + { + error_printf("basis_compressor::extract_source_blocks: invalid float component\n"); + return false; + } + } + } + } + else + { + pSource_image->extract_block_clamped(m_source_blocks[slice_desc.m_first_block_index + block_x + block_y * num_blocks_x].get_ptr(), block_x * 4, block_y * 4, 4, 4); + } + } + } } return true; @@ -1304,6 +2280,8 @@ namespace basisu m_output_basis_file = comp_data; uint32_t total_orig_pixels = 0, total_texels = 0, total_orig_texels = 0; + (void)total_texels; + for (uint32_t i = 0; i < m_slice_descs.size(); i++) { const basisu_backend_slice_desc& slice_desc = m_slice_descs[i]; @@ -1335,10 +2313,21 @@ namespace basisu } m_decoded_output_textures.resize(m_slice_descs.size()); - m_decoded_output_textures_unpacked.resize(m_slice_descs.size()); - m_decoded_output_textures_bc7.resize(m_slice_descs.size()); - m_decoded_output_textures_unpacked_bc7.resize(m_slice_descs.size()); + if (m_params.m_hdr) + { + m_decoded_output_textures_bc6h_hdr_unpacked.resize(m_slice_descs.size()); + + m_decoded_output_textures_astc_hdr.resize(m_slice_descs.size()); + 
m_decoded_output_textures_astc_hdr_unpacked.resize(m_slice_descs.size()); + } + else + { + m_decoded_output_textures_unpacked.resize(m_slice_descs.size()); + + m_decoded_output_textures_bc7.resize(m_slice_descs.size()); + m_decoded_output_textures_unpacked_bc7.resize(m_slice_descs.size()); + } tm.start(); if (m_params.m_pGlobal_codebooks) @@ -1360,12 +2349,16 @@ namespace basisu for (uint32_t i = 0; i < m_slice_descs.size(); i++) { + basisu::texture_format tex_format = m_params.m_hdr ? texture_format::cBC6HUnsigned : (m_params.m_uastc ? texture_format::cUASTC4x4 : texture_format::cETC1); + basist::block_format format = m_params.m_hdr ? basist::block_format::cBC6H : (m_params.m_uastc ? basist::block_format::cUASTC_4x4 : basist::block_format::cETC1); + gpu_image decoded_texture; - decoded_texture.init(m_params.m_uastc ? texture_format::cUASTC4x4 : texture_format::cETC1, m_slice_descs[i].m_width, m_slice_descs[i].m_height); + decoded_texture.init( + tex_format, + m_slice_descs[i].m_width, m_slice_descs[i].m_height); tm.start(); - - basist::block_format format = m_params.m_uastc ? basist::block_format::cUASTC_4x4 : basist::block_format::cETC1; + uint32_t bytes_per_block = m_params.m_uastc ? 
16 : 8; if (!decoder.transcode_slice(&comp_data[0], (uint32_t)comp_data.size(), i, @@ -1391,43 +2384,87 @@ namespace basisu m_decoded_output_textures[i] = decoded_texture; } - double total_time_bc7 = 0; + double total_alt_transcode_time = 0; + tm.start(); - if (basist::basis_is_format_supported(basist::transcoder_texture_format::cTFBC7_RGBA, basist::basis_tex_format::cUASTC4x4) && - basist::basis_is_format_supported(basist::transcoder_texture_format::cTFBC7_RGBA, basist::basis_tex_format::cETC1S)) + if (m_params.m_hdr) { + assert(basist::basis_is_format_supported(basist::transcoder_texture_format::cTFASTC_HDR_4x4_RGBA, basist::basis_tex_format::cUASTC_HDR_4x4)); + for (uint32_t i = 0; i < m_slice_descs.size(); i++) { gpu_image decoded_texture; - decoded_texture.init(texture_format::cBC7, m_slice_descs[i].m_width, m_slice_descs[i].m_height); + decoded_texture.init(texture_format::cASTC_HDR_4x4, m_slice_descs[i].m_width, m_slice_descs[i].m_height); tm.start(); if (!decoder.transcode_slice(&comp_data[0], (uint32_t)comp_data.size(), i, - reinterpret_cast(decoded_texture.get_ptr()), m_slice_descs[i].m_num_blocks_x * m_slice_descs[i].m_num_blocks_y, basist::block_format::cBC7, 16)) + reinterpret_cast(decoded_texture.get_ptr()), m_slice_descs[i].m_num_blocks_x * m_slice_descs[i].m_num_blocks_y, basist::block_format::cASTC_HDR_4x4, 16)) { - error_printf("Transcoding failed to BC7 on slice %u!\n", i); + error_printf("Transcoding failed to ASTC HDR on slice %u!\n", i); return false; } - - total_time_bc7 += tm.get_elapsed_secs(); - - m_decoded_output_textures_bc7[i] = decoded_texture; + + m_decoded_output_textures_astc_hdr[i] = decoded_texture; + } + } + else + { + if (basist::basis_is_format_supported(basist::transcoder_texture_format::cTFBC7_RGBA, basist::basis_tex_format::cUASTC4x4) && + basist::basis_is_format_supported(basist::transcoder_texture_format::cTFBC7_RGBA, basist::basis_tex_format::cETC1S)) + { + for (uint32_t i = 0; i < m_slice_descs.size(); i++) + { + 
gpu_image decoded_texture; + decoded_texture.init(texture_format::cBC7, m_slice_descs[i].m_width, m_slice_descs[i].m_height); + + if (!decoder.transcode_slice(&comp_data[0], (uint32_t)comp_data.size(), i, + reinterpret_cast(decoded_texture.get_ptr()), m_slice_descs[i].m_num_blocks_x * m_slice_descs[i].m_num_blocks_y, basist::block_format::cBC7, 16)) + { + error_printf("Transcoding failed to BC7 on slice %u!\n", i); + return false; + } + + m_decoded_output_textures_bc7[i] = decoded_texture; + } } } + total_alt_transcode_time = tm.get_elapsed_secs(); + for (uint32_t i = 0; i < m_slice_descs.size(); i++) { - m_decoded_output_textures[i].unpack(m_decoded_output_textures_unpacked[i]); + if (m_params.m_hdr) + { + // BC6H + bool status = m_decoded_output_textures[i].unpack_hdr(m_decoded_output_textures_bc6h_hdr_unpacked[i]); + assert(status); + BASISU_NOTE_UNUSED(status); + + // ASTC HDR + status = m_decoded_output_textures_astc_hdr[i].unpack_hdr(m_decoded_output_textures_astc_hdr_unpacked[i]); + assert(status); + } + else + { + bool status = m_decoded_output_textures[i].unpack(m_decoded_output_textures_unpacked[i]); + assert(status); + BASISU_NOTE_UNUSED(status); - if (m_decoded_output_textures_bc7[i].get_pixel_width()) - m_decoded_output_textures_bc7[i].unpack(m_decoded_output_textures_unpacked_bc7[i]); + if (m_decoded_output_textures_bc7[i].get_pixel_width()) + { + status = m_decoded_output_textures_bc7[i].unpack(m_decoded_output_textures_unpacked_bc7[i]); + assert(status); + } + } } - debug_printf("Transcoded to %s in %3.3fms, %f texels/sec\n", m_params.m_uastc ? "ASTC" : "ETC1", total_time_etc1s_or_astc * 1000.0f, total_orig_pixels / total_time_etc1s_or_astc); + debug_printf("Transcoded to %s in %3.3fms, %f texels/sec\n", + m_params.m_hdr ? "BC6H" : (m_params.m_uastc ? 
"ASTC" : "ETC1"), + total_time_etc1s_or_astc * 1000.0f, total_orig_pixels / total_time_etc1s_or_astc); - if (total_time_bc7 != 0) - debug_printf("Transcoded to BC7 in %3.3fms, %f texels/sec\n", total_time_bc7 * 1000.0f, total_orig_pixels / total_time_bc7); + if (total_alt_transcode_time != 0) + debug_printf("Alternate transcode in %3.3fms, %f texels/sec\n", total_alt_transcode_time * 1000.0f, total_orig_pixels / total_alt_transcode_time); for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++) { @@ -1438,17 +2475,82 @@ namespace basisu assert(m_decoded_output_textures[slice_index].get_total_blocks() == total_blocks); } + } // if (m_params.m_validate_output_data) return true; } + bool basis_compressor::write_hdr_debug_images(const char* pBasename, const imagef& orig_hdr_img, uint32_t width, uint32_t height) + { + // Copy image to account for 4x4 block expansion + imagef hdr_img(orig_hdr_img); + hdr_img.resize(width, height); + + image srgb_img(width, height); + + for (uint32_t y = 0; y < height; y++) + { + for (uint32_t x = 0; x < width; x++) + { + vec4F p(hdr_img(x, y)); + + p[0] = clamp(p[0], 0.0f, 1.0f); + p[1] = clamp(p[1], 0.0f, 1.0f); + p[2] = clamp(p[2], 0.0f, 1.0f); + + int rc = (int)std::round(linear_to_srgb(p[0]) * 255.0f); + int gc = (int)std::round(linear_to_srgb(p[1]) * 255.0f); + int bc = (int)std::round(linear_to_srgb(p[2]) * 255.0f); + + srgb_img.set_clipped(x, y, color_rgba(rc, gc, bc, 255)); + } + } + + { + const std::string filename(string_format("%s_linear_clamped_to_srgb.png", pBasename)); + save_png(filename.c_str(), srgb_img); + printf("Wrote .PNG file %s\n", filename.c_str()); + } + + { + const std::string filename(string_format("%s_compressive_tonemapped.png", pBasename)); + image compressive_tonemapped_img; + + bool status = tonemap_image_compressive(compressive_tonemapped_img, hdr_img); + if (!status) + { + error_printf("basis_compressor::write_hdr_debug_images: tonemap_image_compressive() failed (invalid 
half-float input)\n"); + } + else + { + save_png(filename.c_str(), compressive_tonemapped_img); + printf("Wrote .PNG file %s\n", filename.c_str()); + } + } + + image tonemapped_img; + + for (int e = -5; e <= 5; e++) + { + const float scale = powf(2.0f, (float)e); + + tonemap_image_reinhard(tonemapped_img, hdr_img, scale); + + std::string filename(string_format("%s_reinhard_tonemapped_scale_%f.png", pBasename, scale)); + save_png(filename.c_str(), tonemapped_img, cImageSaveIgnoreAlpha); + printf("Wrote .PNG file %s\n", filename.c_str()); + } + + return true; + } + bool basis_compressor::write_output_files_and_compute_stats() { debug_printf("basis_compressor::write_output_files_and_compute_stats\n"); const uint8_vec& comp_data = m_params.m_create_ktx2_file ? m_output_ktx2_file : m_basis_file.get_compressed_data(); - if (m_params.m_write_output_basis_files) + if (m_params.m_write_output_basis_or_ktx2_files) { const std::string& output_filename = m_params.m_out_filename; @@ -1458,7 +2560,7 @@ namespace basisu return false; } - if (m_params.m_status_output) + //if (m_params.m_status_output) { printf("Wrote output .basis/.ktx2 file \"%s\"\n", output_filename.c_str()); } @@ -1485,7 +2587,7 @@ namespace basisu m_basis_bits_per_texel = comp_size * 8.0f / total_texels; - debug_printf(".basis file size: %u, LZ compressed file size: %u, %3.2f bits/texel\n", + debug_printf("Output file size: %u, LZ compressed file size: %u, %3.2f bits/texel\n", (uint32_t)comp_data.size(), (uint32_t)comp_size, m_basis_bits_per_texel); @@ -1495,191 +2597,324 @@ namespace basisu if (m_params.m_validate_output_data) { - for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++) + if (m_params.m_hdr) { - const basisu_backend_slice_desc& slice_desc = m_slice_descs[slice_index]; + if (m_params.m_print_stats) + { + printf("ASTC/BC6H half float space error metrics (a piecewise linear approximation of log2 error):\n"); + } - if (m_params.m_compute_stats) + for (uint32_t slice_index 
= 0; slice_index < m_slice_descs.size(); slice_index++) { - if (m_params.m_print_stats) - printf("Slice: %u\n", slice_index); + const basisu_backend_slice_desc& slice_desc = m_slice_descs[slice_index]; - image_stats& s = m_stats[slice_index]; + if (m_params.m_compute_stats) + { + image_stats& s = m_stats[slice_index]; - // TODO: We used to output SSIM (during heavy encoder development), but this slowed down compression too much. We'll be adding it back. + if (m_params.m_print_stats) + { + printf("Slice: %u\n", slice_index); + } - image_metrics em; + image_metrics im; - // ---- .basis stats - em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 3); - if (m_params.m_print_stats) - em.print(".basis RGB Avg: "); - s.m_basis_rgb_avg_psnr = em.m_psnr; + if (m_params.m_print_stats) + { + printf("\nASTC channels:\n"); + for (uint32_t i = 0; i < 3; i++) + { + im.calc_half(m_slice_images_hdr[slice_index], m_decoded_output_textures_astc_hdr_unpacked[slice_index], i, 1, true); - em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 4); - if (m_params.m_print_stats) - em.print(".basis RGBA Avg: "); - s.m_basis_rgba_avg_psnr = em.m_psnr; + printf("%c: ", "RGB"[i]); + im.print_hp(); + } - em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 1); - if (m_params.m_print_stats) - em.print(".basis R Avg: "); + printf("BC6H channels:\n"); + for (uint32_t i = 0; i < 3; i++) + { + im.calc_half(m_slice_images_hdr[slice_index], m_decoded_output_textures_bc6h_hdr_unpacked[slice_index], i, 1, true); - em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 1, 1); - if (m_params.m_print_stats) - em.print(".basis G Avg: "); + printf("%c: ", "RGB"[i]); + im.print_hp(); + } + } - em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 2, 1); - if (m_params.m_print_stats) - em.print(".basis B Avg: "); + 
im.calc_half(m_slice_images_hdr[slice_index], m_decoded_output_textures_astc_hdr_unpacked[slice_index], 0, 3, true); + s.m_basis_rgb_avg_psnr = (float)im.m_psnr; - if (m_params.m_uastc) - { - em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 3, 1); if (m_params.m_print_stats) - em.print(".basis A Avg: "); + { + printf("\nASTC RGB: "); + im.print_hp(); +#if 0 + // Validation + im.calc_half2(m_slice_images_hdr[slice_index], m_decoded_output_textures_astc_hdr_unpacked[slice_index], 0, 3, true); + printf("\nASTC RGB (Alt): "); + im.print_hp(); +#endif + } - s.m_basis_a_avg_psnr = em.m_psnr; + im.calc_half(m_slice_images_hdr[slice_index], m_decoded_output_textures_bc6h_hdr_unpacked[slice_index], 0, 3, true); + s.m_basis_rgb_avg_bc6h_psnr = (float)im.m_psnr; + + if (m_params.m_print_stats) + { + printf("BC6H RGB: "); + im.print_hp(); + printf("\n"); + } } + + if (m_params.m_debug_images) + { + std::string out_basename; + if (m_params.m_out_filename.size()) + string_get_filename(m_params.m_out_filename.c_str(), out_basename); + else if (m_params.m_source_filenames.size()) + string_get_filename(m_params.m_source_filenames[slice_desc.m_source_file_index].c_str(), out_basename); - em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 0); - if (m_params.m_print_stats) - em.print(".basis 709 Luma: "); - s.m_basis_luma_709_psnr = static_cast(em.m_psnr); - s.m_basis_luma_709_ssim = static_cast(em.m_ssim); + string_remove_extension(out_basename); + out_basename = "basis_debug_" + out_basename + string_format("_slice_%u", slice_index); - em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 0, true, true); - if (m_params.m_print_stats) - em.print(".basis 601 Luma: "); - s.m_basis_luma_601_psnr = static_cast(em.m_psnr); + // Write BC6H .DDS file. 
+ { + gpu_image bc6h_tex(m_decoded_output_textures[slice_index]); + bc6h_tex.override_dimensions(slice_desc.m_orig_width, slice_desc.m_orig_height); + + std::string filename(out_basename + "_bc6h.dds"); + write_compressed_texture_file(filename.c_str(), bc6h_tex, true); + printf("Wrote .DDS file %s\n", filename.c_str()); + } - if (m_slice_descs.size() == 1) - { - const uint32_t output_size = comp_size ? (uint32_t)comp_size : (uint32_t)comp_data.size(); - if (m_params.m_print_stats) + // Write ASTC .KTX/.astc files. ("astcenc -dh input.astc output.exr" to decode the astc file.) + { + gpu_image astc_tex(m_decoded_output_textures_astc_hdr[slice_index]); + astc_tex.override_dimensions(slice_desc.m_orig_width, slice_desc.m_orig_height); + + std::string filename1(out_basename + "_astc.astc"); + write_astc_file(filename1.c_str(), astc_tex.get_ptr(), 4, 4, slice_desc.m_orig_width, slice_desc.m_orig_height); + printf("Wrote .ASTC file %s\n", filename1.c_str()); + + std::string filename2(out_basename + "_astc.ktx"); + write_compressed_texture_file(filename2.c_str(), astc_tex, true); + printf("Wrote .KTX file %s\n", filename2.c_str()); + } + + // Write unpacked ASTC image to .EXR + { + imagef astc_img(m_decoded_output_textures_astc_hdr_unpacked[slice_index]); + astc_img.resize(slice_desc.m_orig_width, slice_desc.m_orig_height); + + std::string filename(out_basename + "_unpacked_astc.exr"); + write_exr(filename.c_str(), astc_img, 3, 0); + printf("Wrote .EXR file %s\n", filename.c_str()); + } + + // Write unpacked BC6H image to .EXR { - debug_printf(".basis RGB PSNR per bit/texel*10000: %3.3f\n", 10000.0f * s.m_basis_rgb_avg_psnr / ((output_size * 8.0f) / (slice_desc.m_orig_width * slice_desc.m_orig_height))); - debug_printf(".basis Luma 709 PSNR per bit/texel*10000: %3.3f\n", 10000.0f * s.m_basis_luma_709_psnr / ((output_size * 8.0f) / (slice_desc.m_orig_width * slice_desc.m_orig_height))); + imagef bc6h_img(m_decoded_output_textures_bc6h_hdr_unpacked[slice_index]); + 
bc6h_img.resize(slice_desc.m_orig_width, slice_desc.m_orig_height); + + std::string filename(out_basename + "_unpacked_bc6h.exr"); + write_exr(filename.c_str(), bc6h_img, 3, 0); + printf("Wrote .EXR file %s\n", filename.c_str()); } + + // Write tonemapped/srgb images + write_hdr_debug_images((out_basename + "_source").c_str(), m_slice_images_hdr[slice_index], slice_desc.m_orig_width, slice_desc.m_orig_height); + write_hdr_debug_images((out_basename + "_unpacked_astc").c_str(), m_decoded_output_textures_astc_hdr_unpacked[slice_index], slice_desc.m_orig_width, slice_desc.m_orig_height); + write_hdr_debug_images((out_basename + "_unpacked_bc6h").c_str(), m_decoded_output_textures_bc6h_hdr_unpacked[slice_index], slice_desc.m_orig_width, slice_desc.m_orig_height); } + } + } + else + { + for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++) + { + const basisu_backend_slice_desc& slice_desc = m_slice_descs[slice_index]; - if (m_decoded_output_textures_unpacked_bc7[slice_index].get_width()) + if (m_params.m_compute_stats) { - // ---- BC7 stats - em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 3); if (m_params.m_print_stats) - em.print("BC7 RGB Avg: "); - s.m_bc7_rgb_avg_psnr = em.m_psnr; + printf("Slice: %u\n", slice_index); + + image_stats& s = m_stats[slice_index]; + + image_metrics em; + + // ---- .basis stats + em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 3); + if (m_params.m_print_stats) + em.print(".basis RGB Avg: "); + s.m_basis_rgb_avg_psnr = (float)em.m_psnr; - em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 4); + em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 4); if (m_params.m_print_stats) - em.print("BC7 RGBA Avg: "); - s.m_bc7_rgba_avg_psnr = em.m_psnr; + em.print(".basis RGBA Avg: "); + s.m_basis_rgba_avg_psnr = (float)em.m_psnr; - 
em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 1); + em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 1); if (m_params.m_print_stats) - em.print("BC7 R Avg: "); + em.print(".basis R Avg: "); - em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 1, 1); + em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 1, 1); if (m_params.m_print_stats) - em.print("BC7 G Avg: "); + em.print(".basis G Avg: "); - em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 2, 1); + em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 2, 1); if (m_params.m_print_stats) - em.print("BC7 B Avg: "); + em.print(".basis B Avg: "); if (m_params.m_uastc) { - em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 3, 1); + em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 3, 1); if (m_params.m_print_stats) - em.print("BC7 A Avg: "); + em.print(".basis A Avg: "); - s.m_bc7_a_avg_psnr = em.m_psnr; + s.m_basis_a_avg_psnr = (float)em.m_psnr; } - em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 0); + em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 0); if (m_params.m_print_stats) - em.print("BC7 709 Luma: "); - s.m_bc7_luma_709_psnr = static_cast(em.m_psnr); - s.m_bc7_luma_709_ssim = static_cast(em.m_ssim); + em.print(".basis 709 Luma: "); + s.m_basis_luma_709_psnr = static_cast(em.m_psnr); + s.m_basis_luma_709_ssim = static_cast(em.m_ssim); - em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 0, true, true); + em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 0, true, true); if (m_params.m_print_stats) - em.print("BC7 601 Luma: "); - 
s.m_bc7_luma_601_psnr = static_cast(em.m_psnr); - } + em.print(".basis 601 Luma: "); + s.m_basis_luma_601_psnr = static_cast(em.m_psnr); - if (!m_params.m_uastc) - { - // ---- Nearly best possible ETC1S stats - em.calc(m_slice_images[slice_index], m_best_etc1s_images_unpacked[slice_index], 0, 3); - if (m_params.m_print_stats) - em.print("Unquantized ETC1S RGB Avg: "); - s.m_best_etc1s_rgb_avg_psnr = static_cast(em.m_psnr); + if (m_slice_descs.size() == 1) + { + const uint32_t output_size = comp_size ? (uint32_t)comp_size : (uint32_t)comp_data.size(); + if (m_params.m_print_stats) + { + debug_printf(".basis RGB PSNR per bit/texel*10000: %3.3f\n", 10000.0f * s.m_basis_rgb_avg_psnr / ((output_size * 8.0f) / (slice_desc.m_orig_width * slice_desc.m_orig_height))); + debug_printf(".basis Luma 709 PSNR per bit/texel*10000: %3.3f\n", 10000.0f * s.m_basis_luma_709_psnr / ((output_size * 8.0f) / (slice_desc.m_orig_width * slice_desc.m_orig_height))); + } + } - em.calc(m_slice_images[slice_index], m_best_etc1s_images_unpacked[slice_index], 0, 0); - if (m_params.m_print_stats) - em.print("Unquantized ETC1S 709 Luma: "); - s.m_best_etc1s_luma_709_psnr = static_cast(em.m_psnr); - s.m_best_etc1s_luma_709_ssim = static_cast(em.m_ssim); + if (m_decoded_output_textures_unpacked_bc7[slice_index].get_width()) + { + // ---- BC7 stats + em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 3); + //if (m_params.m_print_stats) + // em.print("BC7 RGB Avg: "); + s.m_bc7_rgb_avg_psnr = (float)em.m_psnr; + + em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 4); + //if (m_params.m_print_stats) + // em.print("BC7 RGBA Avg: "); + s.m_bc7_rgba_avg_psnr = (float)em.m_psnr; + + em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 1); + //if (m_params.m_print_stats) + // em.print("BC7 R Avg: "); + + em.calc(m_slice_images[slice_index], 
m_decoded_output_textures_unpacked_bc7[slice_index], 1, 1); + //if (m_params.m_print_stats) + // em.print("BC7 G Avg: "); + + em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 2, 1); + //if (m_params.m_print_stats) + // em.print("BC7 B Avg: "); + + if (m_params.m_uastc) + { + em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 3, 1); + //if (m_params.m_print_stats) + // em.print("BC7 A Avg: "); - em.calc(m_slice_images[slice_index], m_best_etc1s_images_unpacked[slice_index], 0, 0, true, true); - if (m_params.m_print_stats) - em.print("Unquantized ETC1S 601 Luma: "); - s.m_best_etc1s_luma_601_psnr = static_cast(em.m_psnr); + s.m_bc7_a_avg_psnr = (float)em.m_psnr; + } + + em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 0); + //if (m_params.m_print_stats) + // em.print("BC7 709 Luma: "); + s.m_bc7_luma_709_psnr = static_cast(em.m_psnr); + s.m_bc7_luma_709_ssim = static_cast(em.m_ssim); + + em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 0, true, true); + //if (m_params.m_print_stats) + // em.print("BC7 601 Luma: "); + s.m_bc7_luma_601_psnr = static_cast(em.m_psnr); + } + + if (!m_params.m_uastc) + { + // ---- Nearly best possible ETC1S stats + em.calc(m_slice_images[slice_index], m_best_etc1s_images_unpacked[slice_index], 0, 3); + //if (m_params.m_print_stats) + // em.print("Unquantized ETC1S RGB Avg: "); + s.m_best_etc1s_rgb_avg_psnr = static_cast(em.m_psnr); + + em.calc(m_slice_images[slice_index], m_best_etc1s_images_unpacked[slice_index], 0, 0); + //if (m_params.m_print_stats) + // em.print("Unquantized ETC1S 709 Luma: "); + s.m_best_etc1s_luma_709_psnr = static_cast(em.m_psnr); + s.m_best_etc1s_luma_709_ssim = static_cast(em.m_ssim); + + em.calc(m_slice_images[slice_index], m_best_etc1s_images_unpacked[slice_index], 0, 0, true, true); + //if (m_params.m_print_stats) + // em.print("Unquantized ETC1S 
601 Luma: "); + s.m_best_etc1s_luma_601_psnr = static_cast(em.m_psnr); + } } - } - std::string out_basename; - if (m_params.m_out_filename.size()) - string_get_filename(m_params.m_out_filename.c_str(), out_basename); - else if (m_params.m_source_filenames.size()) - string_get_filename(m_params.m_source_filenames[slice_desc.m_source_file_index].c_str(), out_basename); + std::string out_basename; + if (m_params.m_out_filename.size()) + string_get_filename(m_params.m_out_filename.c_str(), out_basename); + else if (m_params.m_source_filenames.size()) + string_get_filename(m_params.m_source_filenames[slice_desc.m_source_file_index].c_str(), out_basename); - string_remove_extension(out_basename); - out_basename = "basis_debug_" + out_basename + string_format("_slice_%u", slice_index); + string_remove_extension(out_basename); + out_basename = "basis_debug_" + out_basename + string_format("_slice_%u", slice_index); - if ((!m_params.m_uastc) && (m_frontend.get_params().m_debug_images)) - { - // Write "best" ETC1S debug images - if (!m_params.m_uastc) + if ((!m_params.m_uastc) && (m_frontend.get_params().m_debug_images)) { - gpu_image best_etc1s_gpu_image(m_best_etc1s_images[slice_index]); - best_etc1s_gpu_image.override_dimensions(slice_desc.m_orig_width, slice_desc.m_orig_height); - write_compressed_texture_file((out_basename + "_best_etc1s.ktx").c_str(), best_etc1s_gpu_image); + // Write "best" ETC1S debug images + if (!m_params.m_uastc) + { + gpu_image best_etc1s_gpu_image(m_best_etc1s_images[slice_index]); + best_etc1s_gpu_image.override_dimensions(slice_desc.m_orig_width, slice_desc.m_orig_height); + write_compressed_texture_file((out_basename + "_best_etc1s.ktx").c_str(), best_etc1s_gpu_image, true); - image best_etc1s_unpacked; - best_etc1s_gpu_image.unpack(best_etc1s_unpacked); - save_png(out_basename + "_best_etc1s.png", best_etc1s_unpacked); + image best_etc1s_unpacked; + best_etc1s_gpu_image.unpack(best_etc1s_unpacked); + save_png(out_basename + 
"_best_etc1s.png", best_etc1s_unpacked); + } } - } - if (m_params.m_debug_images) - { - // Write decoded ETC1S/ASTC debug images + if (m_params.m_debug_images) { - gpu_image decoded_etc1s_or_astc(m_decoded_output_textures[slice_index]); - decoded_etc1s_or_astc.override_dimensions(slice_desc.m_orig_width, slice_desc.m_orig_height); - write_compressed_texture_file((out_basename + "_transcoded_etc1s_or_astc.ktx").c_str(), decoded_etc1s_or_astc); + // Write decoded ETC1S/ASTC debug images + { + gpu_image decoded_etc1s_or_astc(m_decoded_output_textures[slice_index]); + decoded_etc1s_or_astc.override_dimensions(slice_desc.m_orig_width, slice_desc.m_orig_height); + write_compressed_texture_file((out_basename + "_transcoded_etc1s_or_astc.ktx").c_str(), decoded_etc1s_or_astc, true); - image temp(m_decoded_output_textures_unpacked[slice_index]); - temp.crop(slice_desc.m_orig_width, slice_desc.m_orig_height); - save_png(out_basename + "_transcoded_etc1s_or_astc.png", temp); - } + image temp(m_decoded_output_textures_unpacked[slice_index]); + temp.crop(slice_desc.m_orig_width, slice_desc.m_orig_height); + save_png(out_basename + "_transcoded_etc1s_or_astc.png", temp); + } - // Write decoded BC7 debug images - if (m_decoded_output_textures_bc7[slice_index].get_pixel_width()) - { - gpu_image decoded_bc7(m_decoded_output_textures_bc7[slice_index]); - decoded_bc7.override_dimensions(slice_desc.m_orig_width, slice_desc.m_orig_height); - write_compressed_texture_file((out_basename + "_transcoded_bc7.ktx").c_str(), decoded_bc7); + // Write decoded BC7 debug images + if (m_decoded_output_textures_bc7[slice_index].get_pixel_width()) + { + gpu_image decoded_bc7(m_decoded_output_textures_bc7[slice_index]); + decoded_bc7.override_dimensions(slice_desc.m_orig_width, slice_desc.m_orig_height); + write_compressed_texture_file((out_basename + "_transcoded_bc7.ktx").c_str(), decoded_bc7, true); - image temp(m_decoded_output_textures_unpacked_bc7[slice_index]); - 
temp.crop(slice_desc.m_orig_width, slice_desc.m_orig_height); - save_png(out_basename + "_transcoded_bc7.png", temp); + image temp(m_decoded_output_textures_unpacked_bc7[slice_index]); + temp.crop(slice_desc.m_orig_width, slice_desc.m_orig_height); + save_png(out_basename + "_transcoded_bc7.png", temp); + } } } - } + } // if (m_params.m_hdr) + } // if (m_params.m_validate_output_data) return true; @@ -1727,10 +2962,27 @@ namespace basisu } static uint8_t g_ktx2_etc1s_nonalpha_dfd[44] = { 0x2C,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x2,0x0,0x28,0x0,0xA3,0x1,0x2,0x0,0x3,0x3,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x3F,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xFF,0xFF,0xFF,0xFF }; - static uint8_t g_ktx2_etc1s_alpha_dfd[60] = { 0x3C,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x2,0x0,0x38,0x0,0xA3,0x1,0x2,0x0,0x3,0x3,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x3F,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xFF,0xFF,0xFF,0xFF,0x40,0x0,0x3F,0xF,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xFF,0xFF,0xFF,0xFF }; + static uint8_t g_ktx2_etc1s_alpha_dfd[60] = { 0x3C,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x2,0x0,0x38,0x0,0xA3,0x1,0x2,0x0,0x3,0x3,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x3F,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xFF,0xFF,0xFF,0xFF,0x40,0x0,0x3F,0xF,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xFF,0xFF,0xFF,0xFF }; + static uint8_t g_ktx2_uastc_nonalpha_dfd[44] = { 0x2C,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x2,0x0,0x28,0x0,0xA6,0x1,0x2,0x0,0x3,0x3,0x0,0x0,0x10,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x7F,0x4,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xFF,0xFF,0xFF,0xFF }; - static uint8_t g_ktx2_uastc_alpha_dfd[44] = { 0x2C,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x2,0x0,0x28,0x0,0xA6,0x1,0x2,0x0,0x3,0x3,0x0,0x0,0x10,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x7F,0x3,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xFF,0xFF,0xFF,0xFF }; - + static uint8_t g_ktx2_uastc_alpha_dfd[44] = { 0x2C,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x2,0x0,0x28,0x0,0xA6,0x1,0x2,0x0,0x3,0x3,0x0,0x0,0x10,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x7F,0x3,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xFF,0xFF,0xFF,0xFF 
}; + + // HDR TODO - what is the best Khronos DFD to use for UASTC HDR? + static uint8_t g_ktx2_uastc_hdr_nonalpha_dfd[44] = + { + 0x2C,0x0,0x0,0x0, // 0 totalSize + 0x0,0x0,0x0,0x0, // 1 descriptorType/vendorId + 0x2,0x0,0x28,0x0, // 2 descriptorBlockSize/versionNumber + 0xA7,0x1,0x1,0x0, // 3 flags, transferFunction, colorPrimaries, colorModel + 0x3,0x3,0x0,0x0, // 4 texelBlockDimension0-texelBlockDimension3 + 0x10,0x0,0x0,0x0, // 5 bytesPlane0-bytesPlane3 + 0x0,0x0,0x0,0x0, // 6 bytesPlane4-bytesPlane7 + 0x0,0x0,0x7F,0x80, // 7 bitLength/bitOffset/channelType and Qualifer flags (KHR_DF_SAMPLE_DATATYPE_FLOAT etc.) + 0x0,0x0,0x0,0x0, // 8 samplePosition0-samplePosition3 + 0x0,0x0,0x0,0x0, // 9 sampleLower (0.0) + 0x00, 0x00, 0x80, 0x3F // 10 sampleHigher (1.0) + }; + void basis_compressor::get_dfd(uint8_vec &dfd, const basist::ktx2_header &header) { const uint8_t* pDFD; @@ -1738,7 +2990,12 @@ namespace basisu if (m_params.m_uastc) { - if (m_any_source_image_has_alpha) + if (m_params.m_hdr) + { + pDFD = g_ktx2_uastc_hdr_nonalpha_dfd; + dfd_len = sizeof(g_ktx2_uastc_hdr_nonalpha_dfd); + } + else if (m_any_source_image_has_alpha) { pDFD = g_ktx2_uastc_alpha_dfd; dfd_len = sizeof(g_ktx2_uastc_alpha_dfd); @@ -1772,10 +3029,18 @@ namespace basisu dfd_bits &= ~(0xFF << 16); - if (m_params.m_ktx2_srgb_transfer_func) - dfd_bits |= (basist::KTX2_KHR_DF_TRANSFER_SRGB << 16); - else + if (m_params.m_hdr) + { + // TODO: In HDR mode, always write linear for now. 
dfd_bits |= (basist::KTX2_KHR_DF_TRANSFER_LINEAR << 16); + } + else + { + if (m_params.m_ktx2_srgb_transfer_func) + dfd_bits |= (basist::KTX2_KHR_DF_TRANSFER_SRGB << 16); + else + dfd_bits |= (basist::KTX2_KHR_DF_TRANSFER_LINEAR << 16); + } basisu::write_le_dword(dfd.data() + 3 * sizeof(uint32_t), dfd_bits); @@ -1850,7 +3115,12 @@ namespace basisu header.m_pixel_width = base_width; header.m_pixel_height = base_height; header.m_face_count = total_faces; - header.m_vk_format = basist::KTX2_VK_FORMAT_UNDEFINED; + + if (m_params.m_hdr) + header.m_vk_format = basist::KTX2_FORMAT_UASTC_4x4_SFLOAT_BLOCK; + else + header.m_vk_format = basist::KTX2_VK_FORMAT_UNDEFINED; + header.m_type_size = 1; header.m_level_count = total_levels; header.m_layer_count = (total_layers > 1) ? total_layers : 0; @@ -2061,7 +3331,8 @@ namespace basisu if (bytes_needed_to_pad < 6) bytes_needed_to_pad += 16; - printf("WARNING: Due to a KTX2 validator bug related to mipPadding, we must insert a dummy key into the KTX2 file of %u bytes\n", bytes_needed_to_pad); + // Just add the padding. It's likely not necessary anymore, but can't really hurt. + //printf("WARNING: Due to a KTX2 validator bug related to mipPadding, we must insert a dummy key into the KTX2 file of %u bytes\n", bytes_needed_to_pad); // We're not good - need to add a dummy key large enough to force file alignment so the mip level array gets aligned. // We can't just add some bytes before the mip level array because ktx2check will see that as extra data in the file that shouldn't be there in ktxValidator::validateDataSize(). 
@@ -2258,18 +3529,34 @@ namespace basisu return result; } - void* basis_compress( - const basisu::vector& source_images, + static void* basis_compress( + const basisu::vector *pSource_images, + const basisu::vector *pSource_images_hdr, uint32_t flags_and_quality, float uastc_rdo_quality, size_t* pSize, image_stats* pStats) { + assert((pSource_images != nullptr) || (pSource_images_hdr != nullptr)); + assert(!((pSource_images != nullptr) && (pSource_images_hdr != nullptr))); + // Check input parameters - if ((!source_images.size()) || (!pSize)) + if (pSource_images) { - error_printf("basis_compress: Invalid parameter\n"); - assert(0); - return nullptr; + if ((!pSource_images->size()) || (!pSize)) + { + error_printf("basis_compress: Invalid parameter\n"); + assert(0); + return nullptr; + } + } + else + { + if ((!pSource_images_hdr->size()) || (!pSize)) + { + error_printf("basis_compress: Invalid parameter\n"); + assert(0); + return nullptr; + } } *pSize = 0; @@ -2287,40 +3574,70 @@ namespace basisu comp_params.m_y_flip = (flags_and_quality & cFlagYFlip) != 0; comp_params.m_debug = (flags_and_quality & cFlagDebug) != 0; - + comp_params.m_debug_images = (flags_and_quality & cFlagDebugImages) != 0; + // Copy the largest mipmap level - comp_params.m_source_images.resize(1); - comp_params.m_source_images[0] = source_images[0]; + if (pSource_images) + { + comp_params.m_source_images.resize(1); + comp_params.m_source_images[0] = (*pSource_images)[0]; + + // Copy the smaller mipmap levels, if any + if (pSource_images->size() > 1) + { + comp_params.m_source_mipmap_images.resize(1); + comp_params.m_source_mipmap_images[0].resize(pSource_images->size() - 1); - // Copy the smaller mipmap levels, if any - if (source_images.size() > 1) + for (uint32_t i = 1; i < pSource_images->size(); i++) + comp_params.m_source_mipmap_images[0][i - 1] = (*pSource_images)[i]; + } + } + else { - comp_params.m_source_mipmap_images.resize(1); - 
comp_params.m_source_mipmap_images[0].resize(source_images.size() - 1); + comp_params.m_source_images_hdr.resize(1); + comp_params.m_source_images_hdr[0] = (*pSource_images_hdr)[0]; - for (uint32_t i = 1; i < source_images.size(); i++) - comp_params.m_source_mipmap_images[0][i - 1] = source_images[i]; + // Copy the smaller mipmap levels, if any + if (pSource_images_hdr->size() > 1) + { + comp_params.m_source_mipmap_images_hdr.resize(1); + comp_params.m_source_mipmap_images_hdr[0].resize(pSource_images_hdr->size() - 1); + + for (uint32_t i = 1; i < pSource_images_hdr->size(); i++) // HDR path: pSource_images is null here, so the loop must be bounded by the HDR vector + comp_params.m_source_mipmap_images_hdr[0][i - 1] = (*pSource_images_hdr)[i]; + } } comp_params.m_multithreading = (flags_and_quality & cFlagThreaded) != 0; comp_params.m_use_opencl = (flags_and_quality & cFlagUseOpenCL) != 0; - comp_params.m_write_output_basis_files = false; + comp_params.m_write_output_basis_or_ktx2_files = false; comp_params.m_perceptual = (flags_and_quality & cFlagSRGB) != 0; comp_params.m_mip_srgb = comp_params.m_perceptual; comp_params.m_mip_gen = (flags_and_quality & (cFlagGenMipsWrap | cFlagGenMipsClamp)) != 0; comp_params.m_mip_wrapping = (flags_and_quality & cFlagGenMipsWrap) != 0; - comp_params.m_uastc = (flags_and_quality & cFlagUASTC) != 0; - if (comp_params.m_uastc) + if ((pSource_images_hdr) || (flags_and_quality & cFlagHDR)) { - comp_params.m_pack_uastc_flags = flags_and_quality & cPackUASTCLevelMask; - comp_params.m_rdo_uastc = (flags_and_quality & cFlagUASTCRDO) != 0; - comp_params.m_rdo_uastc_quality_scalar = uastc_rdo_quality; + // In UASTC HDR mode, the compressor will jam this to true anyway. + // And there's no need to set UASTC LDR or ETC1S options. 
+ comp_params.m_uastc = true; } else - comp_params.m_quality_level = basisu::maximum(1, flags_and_quality & 255); + { + comp_params.m_uastc = (flags_and_quality & cFlagUASTC) != 0; + if (comp_params.m_uastc) + { + comp_params.m_pack_uastc_flags = flags_and_quality & cPackUASTCLevelMask; + comp_params.m_rdo_uastc = (flags_and_quality & cFlagUASTCRDO) != 0; + comp_params.m_rdo_uastc_quality_scalar = uastc_rdo_quality; + } + else + { + comp_params.m_quality_level = basisu::maximum(1, flags_and_quality & 255); + } + } comp_params.m_create_ktx2_file = (flags_and_quality & cFlagKTX2) != 0; @@ -2337,6 +3654,15 @@ namespace basisu comp_params.m_print_stats = (flags_and_quality & cFlagPrintStats) != 0; comp_params.m_status_output = (flags_and_quality & cFlagPrintStatus) != 0; + if ((flags_and_quality & cFlagHDR) || (pSource_images_hdr)) + { + comp_params.m_hdr = true; + comp_params.m_uastc_hdr_options.set_quality_level(flags_and_quality & cPackUASTCLevelMask); + } + + if (flags_and_quality & cFlagHDRLDRImageSRGBToLinearConversion) + comp_params.m_hdr_ldr_srgb_to_linear_conversion = true; + // Create the compressor, initialize it, and process the input basis_compressor comp; if (!comp.init(comp_params)) @@ -2380,6 +3706,24 @@ namespace basisu return pFile_data; } + void* basis_compress( + const basisu::vector& source_images, + uint32_t flags_and_quality, float uastc_rdo_quality, + size_t* pSize, + image_stats* pStats) + { + return basis_compress(&source_images, nullptr, flags_and_quality, uastc_rdo_quality, pSize, pStats); + } + + void* basis_compress( + const basisu::vector& source_images_hdr, + uint32_t flags_and_quality, + size_t* pSize, + image_stats* pStats) + { + return basis_compress(nullptr, &source_images_hdr, flags_and_quality, 0.0f, pSize, pStats); + } + void* basis_compress( const uint8_t* pImageRGBA, uint32_t width, uint32_t height, uint32_t pitch_in_pixels, uint32_t flags_and_quality, float uastc_rdo_quality, diff --git 
a/thirdparty/basis_universal/encoder/basisu_comp.h b/thirdparty/basis_universal/encoder/basisu_comp.h index b6c9fef9e251..1cc75fc8a385 100644 --- a/thirdparty/basis_universal/encoder/basisu_comp.h +++ b/thirdparty/basis_universal/encoder/basisu_comp.h @@ -1,5 +1,5 @@ // basisu_comp.h -// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -18,9 +18,10 @@ #include "basisu_basis_file.h" #include "../transcoder/basisu_transcoder.h" #include "basisu_uastc_enc.h" +#include "basisu_astc_hdr_enc.h" -#define BASISU_LIB_VERSION 116 -#define BASISU_LIB_VERSION_STRING "1.16" +#define BASISU_LIB_VERSION 150 +#define BASISU_LIB_VERSION_STRING "1.50" #ifndef BASISD_SUPPORT_KTX2 #error BASISD_SUPPORT_KTX2 is undefined @@ -81,6 +82,8 @@ namespace basisu m_basis_luma_601_psnr = 0.0f; m_basis_luma_709_ssim = 0.0f; + m_basis_rgb_avg_bc6h_psnr = 0.0f; + m_bc7_rgb_avg_psnr = 0.0f; m_bc7_rgba_avg_psnr = 0.0f; m_bc7_a_avg_psnr = 0.0f; @@ -100,7 +103,7 @@ namespace basisu uint32_t m_width; uint32_t m_height; - // .basis compressed (ETC1S or UASTC statistics) + // .basis/.ktx2 compressed (LDR: ETC1S or UASTC statistics, HDR: transcoded BC6H statistics) float m_basis_rgb_avg_psnr; float m_basis_rgba_avg_psnr; float m_basis_a_avg_psnr; @@ -108,7 +111,10 @@ namespace basisu float m_basis_luma_601_psnr; float m_basis_luma_709_ssim; - // BC7 statistics + // UASTC HDR only. 
+ float m_basis_rgb_avg_bc6h_psnr; + + // LDR: BC7 statistics float m_bc7_rgb_avg_psnr; float m_bc7_rgba_avg_psnr; float m_bc7_a_avg_psnr; @@ -116,7 +122,7 @@ namespace basisu float m_bc7_luma_601_psnr; float m_bc7_luma_709_ssim; - // Highest achievable quality ETC1S statistics + // LDR: Highest achievable quality ETC1S statistics float m_best_etc1s_rgb_avg_psnr; float m_best_etc1s_luma_709_psnr; float m_best_etc1s_luma_601_psnr; @@ -256,7 +262,7 @@ namespace basisu m_no_selector_rdo.clear(); m_selector_rdo_thresh.clear(); m_read_source_images.clear(); - m_write_output_basis_files.clear(); + m_write_output_basis_or_ktx2_files.clear(); m_compression_level.clear(); m_compute_stats.clear(); m_print_stats.clear(); @@ -317,27 +323,38 @@ namespace basisu m_validate_output_data.clear(); + m_hdr_ldr_srgb_to_linear_conversion.clear(); + + m_hdr_favor_astc.clear(); + m_pJob_pool = nullptr; } - // True to generate UASTC .basis file data, otherwise ETC1S. + // True to generate UASTC .basis/.KTX2 file data, otherwise ETC1S. bool_param m_uastc; + // Set m_hdr to true to switch to UASTC HDR mode. + bool_param m_hdr; + bool_param m_use_opencl; - // If m_read_source_images is true, m_source_filenames (and optionally m_source_alpha_filenames) contains the filenames of PNG images to read. - // Otherwise, the compressor processes the images in m_source_images. + // If m_read_source_images is true, m_source_filenames (and optionally m_source_alpha_filenames) contains the filenames of PNG etc. images to read. + // Otherwise, the compressor processes the images in m_source_images or m_source_images_hdr. basisu::vector m_source_filenames; basisu::vector m_source_alpha_filenames; basisu::vector m_source_images; + basisu::vector m_source_images_hdr; + // Stores mipmaps starting from level 1. Level 0 is still stored in m_source_images, as usual. // If m_source_mipmaps isn't empty, automatic mipmap generation isn't done. 
m_source_mipmaps.size() MUST equal m_source_images.size() or the compressor returns an error. // The compressor applies the user-provided swizzling (in m_swizzle) to these images. basisu::vector< basisu::vector > m_source_mipmap_images; + + basisu::vector< basisu::vector > m_source_mipmap_images_hdr; - // Filename of the output basis file + // Filename of the output basis/ktx2 file std::string m_out_filename; // The params are done this way so we can detect when the user has explictly changed them. @@ -373,8 +390,8 @@ namespace basisu // Read source images from m_source_filenames/m_source_alpha_filenames bool_param m_read_source_images; - // Write the output basis file to disk using m_out_filename - bool_param m_write_output_basis_files; + // Write the output basis/ktx2 file to disk using m_out_filename + bool_param m_write_output_basis_or_ktx2_files; // Compute and display image metrics bool_param m_compute_stats; @@ -382,15 +399,15 @@ namespace basisu // Print stats to stdout, if m_compute_stats is true. 
bool_param m_print_stats; - // Check to see if any input image has an alpha channel, if so then the output basis file will have alpha channels + // Check to see if any input image has an alpha channel, if so then the output basis/ktx2 file will have alpha channels bool_param m_check_for_alpha; - // Always put alpha slices in the output basis file, even when the input doesn't have alpha + // Always put alpha slices in the output basis/ktx2 file, even when the input doesn't have alpha bool_param m_force_alpha; bool_param m_multithreading; - // Split the R channel to RGB and the G channel to alpha, then write a basis file with alpha channels - char m_swizzle[4]; + // Split the R channel to RGB and the G channel to alpha, then write a basis/ktx2 file with alpha channels + uint8_t m_swizzle[4]; bool_param m_renormalize; @@ -448,8 +465,17 @@ namespace basisu param m_ktx2_zstd_supercompression_level; bool_param m_ktx2_srgb_transfer_func; + astc_hdr_codec_options m_uastc_hdr_options; + bool_param m_validate_output_data; + // If true, LDR images (such as PNG) will be converted to normalized [0,1] linear light (via a sRGB->Linear conversion) and then processed as HDR. + // Otherwise, LDR images will be processed as HDR as-is. + bool_param m_hdr_ldr_srgb_to_linear_conversion; + + // If true, ASTC HDR quality is favored more than BC6H quality. Otherwise it's a rough balance. 
+ bool_param m_hdr_favor_astc; + job_pool *m_pJob_pool; }; @@ -504,6 +530,7 @@ namespace basisu opencl_context_ptr m_pOpenCL_context; basisu::vector m_slice_images; + basisu::vector m_slice_images_hdr; basisu::vector m_stats; @@ -515,7 +542,9 @@ namespace basisu uint32_t m_total_blocks; basisu_frontend m_frontend; + pixel_block_vec m_source_blocks; + pixel_block_hdr_vec m_source_blocks_hdr; basisu::vector m_frontend_output_textures; @@ -526,11 +555,17 @@ namespace basisu basisu_file m_basis_file; - basisu::vector m_decoded_output_textures; + basisu::vector m_decoded_output_textures; // BC6H in HDR mode basisu::vector m_decoded_output_textures_unpacked; + basisu::vector m_decoded_output_textures_bc7; basisu::vector m_decoded_output_textures_unpacked_bc7; + basisu::vector m_decoded_output_textures_bc6h_hdr_unpacked; // BC6H in HDR mode + + basisu::vector m_decoded_output_textures_astc_hdr; + basisu::vector m_decoded_output_textures_astc_hdr_unpacked; + uint8_vec m_output_basis_file; uint8_vec m_output_ktx2_file; @@ -541,14 +576,21 @@ namespace basisu bool m_opencl_failed; + void check_for_hdr_inputs(); + bool sanity_check_input_params(); + void clean_hdr_image(imagef& src_img); + bool read_dds_source_images(); bool read_source_images(); bool extract_source_blocks(); bool process_frontend(); bool extract_frontend_texture_data(); bool process_backend(); bool create_basis_file_and_transcode(); + bool write_hdr_debug_images(const char* pBasename, const imagef& img, uint32_t width, uint32_t height); bool write_output_files_and_compute_stats(); + error_code encode_slices_to_uastc_hdr(); error_code encode_slices_to_uastc(); + bool generate_mipmaps(const imagef& img, basisu::vector& mips, bool has_alpha); bool generate_mipmaps(const image &img, basisu::vector &mips, bool has_alpha); bool validate_texture_type_constraints(); bool validate_ktx2_constraints(); @@ -568,7 +610,8 @@ namespace basisu // // flags_and_quality: Combination of the above flags logically OR'd with the 
ETC1S or UASTC level, i.e. "cFlagSRGB | cFlagGenMipsClamp | cFlagThreaded | 128" or "cFlagSRGB | cFlagGenMipsClamp | cFlagUASTC | cFlagThreaded | cPackUASTCLevelDefault". // In ETC1S mode, the lower 8-bits are the ETC1S quality level which ranges from [1,255] (higher=better quality/larger files) - // In UASTC mode, the lower 8-bits are the UASTC pack level (see cPackUASTCLevelFastest, etc.). Fastest/lowest quality is 0, so be sure to set it correctly. + // In UASTC mode, the lower 8-bits are the UASTC LDR/HDR pack level (see cPackUASTCLevelFastest, etc.). Fastest/lowest quality is 0, so be sure to set it correctly. Valid values are [0,4] for both LDR/HDR. + // In UASTC mode, be sure to set this, otherwise it defaults to 0 (fastest/lowest quality). // // uastc_rdo_quality: Float UASTC RDO quality level (0=no change, higher values lower quality but increase compressibility, initially try .5-1.5) // @@ -594,20 +637,36 @@ namespace basisu cFlagUASTCRDO = 1 << 18, // use RDO postprocessing when generating UASTC files (must set uastc_rdo_quality to the quality scalar) cFlagPrintStats = 1 << 19, // print image stats to stdout - cFlagPrintStatus = 1 << 20 // print status to stdout + cFlagPrintStatus = 1 << 20, // print status to stdout + + cFlagHDR = 1 << 21, // Force encoder into HDR mode, even if source image is LDR. + cFlagHDRLDRImageSRGBToLinearConversion = 1 << 22, // In HDR mode, convert LDR source images to linear before encoding. + + cFlagDebugImages = 1 << 23 // enable debug images (sets m_debug_images in the compressor params) }; // This function accepts an array of source images. // If more than one image is provided, it's assumed the images form a mipmap pyramid and automatic mipmap generation is disabled. - // Returns a pointer to the compressed .basis or .ktx2 file data. *pSize is the size of the compressed data. The returned block must be freed using basis_free_data(). + // Returns a pointer to the compressed .basis or .ktx2 file data. *pSize is the size of the compressed data. 
+ // Important: The returned block MUST be manually freed using basis_free_data(). // basisu_encoder_init() MUST be called first! + // LDR version. To compress the LDR source image as HDR: Use the cFlagHDR flag. void* basis_compress( const basisu::vector &source_images, uint32_t flags_and_quality, float uastc_rdo_quality, size_t* pSize, image_stats* pStats = nullptr); - // This function only accepts a single source image. + // HDR-only version. + // Important: The returned block MUST be manually freed using basis_free_data(). + void* basis_compress( + const basisu::vector& source_images_hdr, + uint32_t flags_and_quality, + size_t* pSize, + image_stats* pStats = nullptr); + + // This function only accepts a single LDR source image. It's just a wrapper for basis_compress() above. + // Important: The returned block MUST be manually freed using basis_free_data(). void* basis_compress( const uint8_t* pImageRGBA, uint32_t width, uint32_t height, uint32_t pitch_in_pixels, uint32_t flags_and_quality, float uastc_rdo_quality, @@ -615,6 +674,7 @@ namespace basisu image_stats* pStats = nullptr); // Frees the dynamically allocated file data returned by basis_compress(). + // This MUST be called on the pointer returned by basis_compress() when you're done with it. void basis_free_data(void* p); // Runs a short benchmark using synthetic image data to time OpenCL encoding vs. CPU encoding, with multithreading enabled. diff --git a/thirdparty/basis_universal/encoder/basisu_enc.cpp b/thirdparty/basis_universal/encoder/basisu_enc.cpp index e87dd636a2fc..fff98e830148 100644 --- a/thirdparty/basis_universal/encoder/basisu_enc.cpp +++ b/thirdparty/basis_universal/encoder/basisu_enc.cpp @@ -1,5 +1,5 @@ // basisu_enc.cpp -// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -21,10 +21,20 @@ #include "jpgd.h" #include "pvpngreader.h" #include "basisu_opencl.h" +#include "basisu_astc_hdr_enc.h" #include +#ifndef TINYEXR_USE_ZFP +#define TINYEXR_USE_ZFP (1) +#endif +#include + +#ifndef MINIZ_HEADER_FILE_ONLY #define MINIZ_HEADER_FILE_ONLY +#endif +#ifndef MINIZ_NO_ZLIB_COMPATIBLE_NAMES #define MINIZ_NO_ZLIB_COMPATIBLE_NAMES +#endif #include "basisu_miniz.h" #if defined(_WIN32) @@ -165,14 +175,14 @@ namespace basisu bool g_library_initialized; std::mutex g_encoder_init_mutex; - + // Encoder library initialization (just call once at startup) - void basisu_encoder_init(bool use_opencl, bool opencl_force_serialization) + bool basisu_encoder_init(bool use_opencl, bool opencl_force_serialization) { std::lock_guard lock(g_encoder_init_mutex); if (g_library_initialized) - return; + return true; detect_sse41(); @@ -189,7 +199,11 @@ namespace basisu interval_timer::init(); // make sure interval_timer globals are initialized from main thread to avoid TSAN reports + astc_hdr_enc_init(); + basist::bc6h_enc_init(); + g_library_initialized = true; + return true; } void basisu_encoder_deinit() @@ -316,6 +330,24 @@ namespace basisu init(); return ticks * g_timer_freq; } + + float linear_to_srgb(float l) + { + assert(l >= 0.0f && l <= 1.0f); + if (l < .0031308f) + return saturate(l * 12.92f); + else + return saturate(1.055f * powf(l, 1.0f / 2.4f) - .055f); + } + + float srgb_to_linear(float s) + { + assert(s >= 0.0f && s <= 1.0f); + if (s < .04045f) + return saturate(s * (1.0f / 12.92f)); + else + return saturate(powf((s + .055f) * (1.0f / 1.055f), 2.4f)); + } const uint32_t MAX_32BIT_ALLOC_SIZE = 250000000; @@ -336,7 +368,7 @@ namespace basisu if (sizeof(void *) == sizeof(uint32_t)) { - if ((w * h * n_chans) > MAX_32BIT_ALLOC_SIZE) + if (((uint64_t)w * h * n_chans) > MAX_32BIT_ALLOC_SIZE) { error_printf("Image \"%s\" is too large (%ux%u) to process in a 32-bit build!\n", pFilename, w, h); @@ -371,6 +403,11 @@ namespace basisu return true; } + bool 
load_qoi(const char* pFilename, image& img) + { + return false; + } + bool load_png(const uint8_t *pBuf, size_t buf_size, image &img, const char *pFilename) { interval_timer tm; @@ -433,11 +470,178 @@ namespace basisu return load_png(pFilename, img); if (strcasecmp(pExt, "tga") == 0) return load_tga(pFilename, img); + if (strcasecmp(pExt, "qoi") == 0) + return load_qoi(pFilename, img); if ( (strcasecmp(pExt, "jpg") == 0) || (strcasecmp(pExt, "jfif") == 0) || (strcasecmp(pExt, "jpeg") == 0) ) return load_jpg(pFilename, img); return false; } + + static void convert_ldr_to_hdr_image(imagef &img, const image &ldr_img, bool ldr_srgb_to_linear) + { + img.resize(ldr_img.get_width(), ldr_img.get_height()); + + for (uint32_t y = 0; y < ldr_img.get_height(); y++) + { + for (uint32_t x = 0; x < ldr_img.get_width(); x++) + { + const color_rgba& c = ldr_img(x, y); + + vec4F& d = img(x, y); + if (ldr_srgb_to_linear) + { + // TODO: Multiply by 100-200 nits? + d[0] = srgb_to_linear(c[0] * (1.0f / 255.0f)); + d[1] = srgb_to_linear(c[1] * (1.0f / 255.0f)); + d[2] = srgb_to_linear(c[2] * (1.0f / 255.0f)); + } + else + { + d[0] = c[0] * (1.0f / 255.0f); + d[1] = c[1] * (1.0f / 255.0f); + d[2] = c[2] * (1.0f / 255.0f); + } + d[3] = c[3] * (1.0f / 255.0f); + } + } + } + + bool load_image_hdr(const void* pMem, size_t mem_size, imagef& img, uint32_t width, uint32_t height, hdr_image_type img_type, bool ldr_srgb_to_linear) + { + if ((!pMem) || (!mem_size)) + { + assert(0); + return false; + } + + switch (img_type) + { + case hdr_image_type::cHITRGBAHalfFloat: + { + if (mem_size != width * height * sizeof(basist::half_float) * 4) + { + assert(0); + return false; + } + + if ((!width) || (!height)) + { + assert(0); + return false; + } + + const basist::half_float* pSrc_image_h = static_cast(pMem); + + img.resize(width, height); + for (uint32_t y = 0; y < height; y++) + { + for (uint32_t x = 0; x < width; x++) + { + const basist::half_float* pSrc_pixel = &pSrc_image_h[x * 4]; + + vec4F& dst = 
img(x, y); + dst[0] = basist::half_to_float(pSrc_pixel[0]); + dst[1] = basist::half_to_float(pSrc_pixel[1]); + dst[2] = basist::half_to_float(pSrc_pixel[2]); + dst[3] = basist::half_to_float(pSrc_pixel[3]); + } + + pSrc_image_h += (width * 4); + } + + break; + } + case hdr_image_type::cHITRGBAFloat: + { + if (mem_size != width * height * sizeof(float) * 4) + { + assert(0); + return false; + } + + if ((!width) || (!height)) + { + assert(0); + return false; + } + + img.resize(width, height); + memcpy(img.get_ptr(), pMem, width * height * sizeof(float) * 4); + + break; + } + case hdr_image_type::cHITPNGImage: + { + image ldr_img; + if (!load_png(static_cast(pMem), mem_size, ldr_img)) + return false; + + convert_ldr_to_hdr_image(img, ldr_img, ldr_srgb_to_linear); + break; + } + case hdr_image_type::cHITEXRImage: + { + if (!read_exr(pMem, mem_size, img)) + return false; + + break; + } + case hdr_image_type::cHITHDRImage: + { + uint8_vec buf(mem_size); + memcpy(buf.get_ptr(), pMem, mem_size); + + rgbe_header_info hdr; + if (!read_rgbe(buf, img, hdr)) + return false; + + break; + } + default: + assert(0); + return false; + } + + return true; + } + + bool load_image_hdr(const char* pFilename, imagef& img, bool ldr_srgb_to_linear) + { + std::string ext(string_get_extension(std::string(pFilename))); + + if (ext.length() == 0) + return false; + + const char* pExt = ext.c_str(); + + if (strcasecmp(pExt, "hdr") == 0) + { + rgbe_header_info rgbe_info; + if (!read_rgbe(pFilename, img, rgbe_info)) + return false; + return true; + } + + if (strcasecmp(pExt, "exr") == 0) + { + int n_chans = 0; + if (!read_exr(pFilename, img, n_chans)) + return false; + return true; + } + + // Try loading image as LDR, then optionally convert to linear light. 
+ { + image ldr_img; + if (!load_image(pFilename, ldr_img)) + return false; + + convert_ldr_to_hdr_image(img, ldr_img, ldr_srgb_to_linear); + } + + return true; + } bool save_png(const char* pFilename, const image &img, uint32_t image_save_flags, uint32_t grayscale_comp) { @@ -559,6 +763,45 @@ namespace basisu return true; } + bool read_file_to_data(const char* pFilename, void *pData, size_t len) + { + assert(pData && len); + if ((!pData) || (!len)) + return false; + + FILE* pFile = nullptr; +#ifdef _WIN32 + fopen_s(&pFile, pFilename, "rb"); +#else + pFile = fopen(pFilename, "rb"); +#endif + if (!pFile) + return false; + + fseek(pFile, 0, SEEK_END); +#ifdef _WIN32 + int64_t filesize = _ftelli64(pFile); +#else + int64_t filesize = ftello(pFile); +#endif + + if ((filesize < 0) || ((size_t)filesize < len)) + { + fclose(pFile); + return false; + } + fseek(pFile, 0, SEEK_SET); + + if (fread(pData, 1, (size_t)len, pFile) != (size_t)len) + { + fclose(pFile); + return false; + } + + fclose(pFile); + return true; + } + bool write_data_to_file(const char* pFilename, const void* pData, size_t len) { FILE* pFile = nullptr; @@ -581,25 +824,7 @@ namespace basisu return fclose(pFile) != EOF; } - - float linear_to_srgb(float l) - { - assert(l >= 0.0f && l <= 1.0f); - if (l < .0031308f) - return saturate(l * 12.92f); - else - return saturate(1.055f * powf(l, 1.0f/2.4f) - .055f); - } - - float srgb_to_linear(float s) - { - assert(s >= 0.0f && s <= 1.0f); - if (s < .04045f) - return saturate(s * (1.0f/12.92f)); - else - return saturate(powf((s + .055f) * (1.0f/1.055f), 2.4f)); - } - + bool image_resample(const image &src, image &dst, bool srgb, const char *pFilter, float filter_scale, bool wrapping, @@ -747,67 +972,182 @@ namespace basisu return true; } - void canonical_huffman_calculate_minimum_redundancy(sym_freq *A, int num_syms) + bool image_resample(const imagef& src, imagef& dst, + const char* pFilter, float filter_scale, + bool wrapping, + uint32_t first_comp, uint32_t 
num_comps) { - // See the paper "In-Place Calculation of Minimum Redundancy Codes" by Moffat and Katajainen - if (!num_syms) - return; + assert((first_comp + num_comps) <= 4); - if (1 == num_syms) + const int cMaxComps = 4; + + const uint32_t src_w = src.get_width(), src_h = src.get_height(); + const uint32_t dst_w = dst.get_width(), dst_h = dst.get_height(); + + if (maximum(src_w, src_h) > BASISU_RESAMPLER_MAX_DIMENSION) { - A[0].m_key = 1; - return; + printf("Image is too large!\n"); + return false; } - - A[0].m_key += A[1].m_key; - - int s = 2, r = 0, next; - for (next = 1; next < (num_syms - 1); ++next) - { - if ((s >= num_syms) || (A[r].m_key < A[s].m_key)) - { - A[next].m_key = A[r].m_key; - A[r].m_key = next; - ++r; - } - else - { - A[next].m_key = A[s].m_key; - ++s; - } - if ((s >= num_syms) || ((r < next) && A[r].m_key < A[s].m_key)) - { - A[next].m_key = A[next].m_key + A[r].m_key; - A[r].m_key = next; - ++r; - } - else - { - A[next].m_key = A[next].m_key + A[s].m_key; - ++s; - } - } - A[num_syms - 2].m_key = 0; + if (!src_w || !src_h || !dst_w || !dst_h) + return false; - for (next = num_syms - 3; next >= 0; --next) + if ((num_comps < 1) || (num_comps > cMaxComps)) + return false; + + if ((minimum(dst_w, dst_h) < 1) || (maximum(dst_w, dst_h) > BASISU_RESAMPLER_MAX_DIMENSION)) { - A[next].m_key = 1 + A[A[next].m_key].m_key; + printf("Image is too large!\n"); + return false; } - int num_avail = 1, num_used = 0, depth = 0; - r = num_syms - 2; - next = num_syms - 1; - while (num_avail > 0) + if ((src_w == dst_w) && (src_h == dst_h)) { - for ( ; (r >= 0) && ((int)A[r].m_key == depth); ++num_used, --r ) - ; + dst = src; + return true; + } - for ( ; num_avail > num_used; --next, --num_avail) - A[next].m_key = depth; + std::vector samples[cMaxComps]; + Resampler* resamplers[cMaxComps]; - num_avail = 2 * num_used; - num_used = 0; + resamplers[0] = new Resampler(src_w, src_h, dst_w, dst_h, + wrapping ? 
Resampler::BOUNDARY_WRAP : Resampler::BOUNDARY_CLAMP, 1.0f, 0.0f, // no clamping + pFilter, nullptr, nullptr, filter_scale, filter_scale, 0, 0); + samples[0].resize(src_w); + + for (uint32_t i = 1; i < num_comps; ++i) + { + resamplers[i] = new Resampler(src_w, src_h, dst_w, dst_h, + wrapping ? Resampler::BOUNDARY_WRAP : Resampler::BOUNDARY_CLAMP, 1.0f, 0.0f, // no clamping + pFilter, resamplers[0]->get_clist_x(), resamplers[0]->get_clist_y(), filter_scale, filter_scale, 0, 0); + samples[i].resize(src_w); + } + + uint32_t dst_y = 0; + + for (uint32_t src_y = 0; src_y < src_h; ++src_y) + { + const vec4F* pSrc = &src(0, src_y); + + // Put source lines into resampler(s) + for (uint32_t x = 0; x < src_w; ++x) + { + for (uint32_t c = 0; c < num_comps; ++c) + { + const uint32_t comp_index = first_comp + c; + const float v = (*pSrc)[comp_index]; + + samples[c][x] = v; + } + + pSrc++; + } + + for (uint32_t c = 0; c < num_comps; ++c) + { + if (!resamplers[c]->put_line(&samples[c][0])) + { + for (uint32_t i = 0; i < num_comps; i++) + delete resamplers[i]; + return false; + } + } + + // Now retrieve any output lines + for (;;) + { + uint32_t c; + for (c = 0; c < num_comps; ++c) + { + const uint32_t comp_index = first_comp + c; + + const float* pOutput_samples = resamplers[c]->get_line(); + if (!pOutput_samples) + break; + + vec4F* pDst = &dst(0, dst_y); + + for (uint32_t x = 0; x < dst_w; x++) + { + (*pDst)[comp_index] = pOutput_samples[x]; + pDst++; + } + } + if (c < num_comps) + break; + + ++dst_y; + } + } + + for (uint32_t i = 0; i < num_comps; ++i) + delete resamplers[i]; + + return true; + } + + void canonical_huffman_calculate_minimum_redundancy(sym_freq *A, int num_syms) + { + // See the paper "In-Place Calculation of Minimum Redundancy Codes" by Moffat and Katajainen + if (!num_syms) + return; + + if (1 == num_syms) + { + A[0].m_key = 1; + return; + } + + A[0].m_key += A[1].m_key; + + int s = 2, r = 0, next; + for (next = 1; next < (num_syms - 1); ++next) + { + if ((s 
>= num_syms) || (A[r].m_key < A[s].m_key)) + { + A[next].m_key = A[r].m_key; + A[r].m_key = next; + ++r; + } + else + { + A[next].m_key = A[s].m_key; + ++s; + } + + if ((s >= num_syms) || ((r < next) && A[r].m_key < A[s].m_key)) + { + A[next].m_key = A[next].m_key + A[r].m_key; + A[r].m_key = next; + ++r; + } + else + { + A[next].m_key = A[next].m_key + A[s].m_key; + ++s; + } + } + A[num_syms - 2].m_key = 0; + + for (next = num_syms - 3; next >= 0; --next) + { + A[next].m_key = 1 + A[A[next].m_key].m_key; + } + + int num_avail = 1, num_used = 0, depth = 0; + r = num_syms - 2; + next = num_syms - 1; + while (num_avail > 0) + { + for ( ; (r >= 0) && ((int)A[r].m_key == depth); ++num_used, --r ) + ; + + for ( ; num_avail > num_used; --next, --num_avail) + A[next].m_key = depth; + + num_avail = 2 * num_used; + num_used = 0; ++depth; } } @@ -1312,11 +1652,13 @@ namespace basisu uint32_t a = max_index / num_syms, b = max_index % num_syms; + const uint32_t ofs = m_entries_picked.size(); + m_entries_picked.push_back(a); m_entries_picked.push_back(b); for (uint32_t i = 0; i < num_syms; i++) - if ((i != b) && (i != a)) + if ((i != m_entries_picked[ofs + 1]) && (i != m_entries_picked[ofs])) m_entries_to_do.push_back(i); for (uint32_t i = 0; i < m_entries_to_do.size(); i++) @@ -1372,48 +1714,161 @@ namespace basisu } return which_side; } - - void image_metrics::calc(const image &a, const image &b, uint32_t first_chan, uint32_t total_chans, bool avg_comp_error, bool use_601_luma) + + void image_metrics::calc(const imagef& a, const imagef& b, uint32_t first_chan, uint32_t total_chans, bool avg_comp_error, bool log) { assert((first_chan < 4U) && (first_chan + total_chans <= 4U)); const uint32_t width = basisu::minimum(a.get_width(), b.get_width()); const uint32_t height = basisu::minimum(a.get_height(), b.get_height()); - double hist[256]; - clear_obj(hist); + double max_e = -1e+30f; + double sum = 0.0f, sum_sqr = 0.0f; + m_has_neg = false; + m_any_abnormal = false; + 
m_hf_mag_overflow = false; + for (uint32_t y = 0; y < height; y++) { for (uint32_t x = 0; x < width; x++) { - const color_rgba &ca = a(x, y), &cb = b(x, y); - + const vec4F& ca = a(x, y), &cb = b(x, y); + if (total_chans) { for (uint32_t c = 0; c < total_chans; c++) - hist[iabs(ca[first_chan + c] - cb[first_chan + c])]++; + { + float fa = ca[first_chan + c], fb = cb[first_chan + c]; + + if ((fabs(fa) > basist::MAX_HALF_FLOAT) || (fabs(fb) > basist::MAX_HALF_FLOAT)) + m_hf_mag_overflow = true; + + if ((fa < 0.0f) || (fb < 0.0f)) + m_has_neg = true; + + if (std::isinf(fa) || std::isinf(fb) || std::isnan(fa) || std::isnan(fb)) + m_any_abnormal = true; + + const double delta = fabs(fa - fb); + max_e = basisu::maximum(max_e, delta); + + if (log) + { + double log2_delta = log2f(basisu::maximum(0.0f, fa) + 1.0f) - log2f(basisu::maximum(0.0f, fb) + 1.0f); + + sum += fabs(log2_delta); + sum_sqr += log2_delta * log2_delta; + } + else + { + sum += fabs(delta); + sum_sqr += delta * delta; + } + } } else { - if (use_601_luma) - hist[iabs(ca.get_601_luma() - cb.get_601_luma())]++; + for (uint32_t c = 0; c < 3; c++) + { + float fa = ca[c], fb = cb[c]; + + if ((fabs(fa) > basist::MAX_HALF_FLOAT) || (fabs(fb) > basist::MAX_HALF_FLOAT)) + m_hf_mag_overflow = true; + + if ((fa < 0.0f) || (fb < 0.0f)) + m_has_neg = true; + + if (std::isinf(fa) || std::isinf(fb) || std::isnan(fa) || std::isnan(fb)) + m_any_abnormal = true; + } + + double ca_l = get_luminance(ca), cb_l = get_luminance(cb); + + double delta = fabs(ca_l - cb_l); + max_e = basisu::maximum(max_e, delta); + + if (log) + { + double log2_delta = log2(basisu::maximum(0.0f, ca_l) + 1.0f) - log2(basisu::maximum(0.0f, cb_l) + 1.0f); + + sum += fabs(log2_delta); + sum_sqr += log2_delta * log2_delta; + } else - hist[iabs(ca.get_709_luma() - cb.get_709_luma())]++; + { + sum += delta; + sum_sqr += delta * delta; + } } } } + m_max = (double)(max_e); + + double total_values = (double)width * (double)height; + if (avg_comp_error) + 
total_values *= (double)clamp(total_chans, 1, 4); + + m_mean = (float)(sum / total_values); + m_mean_squared = (float)(sum_sqr / total_values); + m_rms = (float)sqrt(sum_sqr / total_values); + + const double max_val = 1.0f; + m_psnr = m_rms ? (float)clamp(log10(max_val / m_rms) * 20.0f, 0.0f, 1000.0f) : 1000.0f; + } + + void image_metrics::calc_half(const imagef& a, const imagef& b, uint32_t first_chan, uint32_t total_chans, bool avg_comp_error) + { + assert(total_chans); + assert((first_chan < 4U) && (first_chan + total_chans <= 4U)); + + const uint32_t width = basisu::minimum(a.get_width(), b.get_width()); + const uint32_t height = basisu::minimum(a.get_height(), b.get_height()); + + m_has_neg = false; + m_hf_mag_overflow = false; + m_any_abnormal = false; + + uint_vec hist(65536); + + for (uint32_t y = 0; y < height; y++) + { + for (uint32_t x = 0; x < width; x++) + { + const vec4F& ca = a(x, y), &cb = b(x, y); + + for (uint32_t i = 0; i < 4; i++) + { + if ((ca[i] < 0.0f) || (cb[i] < 0.0f)) + m_has_neg = true; + + if ((fabs(ca[i]) > basist::MAX_HALF_FLOAT) || (fabs(cb[i]) > basist::MAX_HALF_FLOAT)) + m_hf_mag_overflow = true; + + if (std::isnan(ca[i]) || std::isnan(cb[i]) || std::isinf(ca[i]) || std::isinf(cb[i])) + m_any_abnormal = true; + } + + int cah[4] = { basist::float_to_half(ca[0]), basist::float_to_half(ca[1]), basist::float_to_half(ca[2]), basist::float_to_half(ca[3]) }; + int cbh[4] = { basist::float_to_half(cb[0]), basist::float_to_half(cb[1]), basist::float_to_half(cb[2]), basist::float_to_half(cb[3]) }; + + for (uint32_t c = 0; c < total_chans; c++) + hist[iabs(cah[first_chan + c] - cbh[first_chan + c]) & 65535]++; + + } // x + } // y + m_max = 0; double sum = 0.0f, sum2 = 0.0f; - for (uint32_t i = 0; i < 256; i++) + for (uint32_t i = 0; i < 65536; i++) { if (hist[i]) { - m_max = basisu::maximum(m_max, (float)i); - double v = i * hist[i]; + m_max = basisu::maximum(m_max, (double)i); + double v = (double)i * (double)hist[i]; sum += v; - sum2 += i * 
v; + sum2 += (double)i * v; } } @@ -1421,63 +1876,183 @@ namespace basisu if (avg_comp_error) total_values *= (double)clamp(total_chans, 1, 4); - m_mean = (float)clamp(sum / total_values, 0.0f, 255.0); - m_mean_squared = (float)clamp(sum2 / total_values, 0.0f, 255.0f * 255.0f); + const float max_val = 65535.0f; + m_mean = (float)clamp(sum / total_values, 0.0f, max_val); + m_mean_squared = (float)clamp(sum2 / total_values, 0.0f, max_val * max_val); m_rms = (float)sqrt(m_mean_squared); - m_psnr = m_rms ? (float)clamp(log10(255.0 / m_rms) * 20.0f, 0.0f, 100.0f) : 100.0f; + m_psnr = m_rms ? (float)clamp(log10(max_val / m_rms) * 20.0f, 0.0f, 1000.0f) : 1000.0f; } - void fill_buffer_with_random_bytes(void *pBuf, size_t size, uint32_t seed) + // Alt. variant, same as calc_half(), for validation. + void image_metrics::calc_half2(const imagef& a, const imagef& b, uint32_t first_chan, uint32_t total_chans, bool avg_comp_error) { - rand r(seed); + assert(total_chans); + assert((first_chan < 4U) && (first_chan + total_chans <= 4U)); - uint8_t *pDst = static_cast(pBuf); + const uint32_t width = basisu::minimum(a.get_width(), b.get_width()); + const uint32_t height = basisu::minimum(a.get_height(), b.get_height()); - while (size >= sizeof(uint32_t)) - { - *(uint32_t *)pDst = r.urand32(); - pDst += sizeof(uint32_t); - size -= sizeof(uint32_t); - } + m_has_neg = false; + m_hf_mag_overflow = false; + m_any_abnormal = false; + + double sum = 0.0f, sum2 = 0.0f; + m_max = 0; - while (size) + for (uint32_t y = 0; y < height; y++) { - *pDst++ = r.byte(); - size--; - } - } + for (uint32_t x = 0; x < width; x++) + { + const vec4F& ca = a(x, y), & cb = b(x, y); - uint32_t hash_hsieh(const uint8_t *pBuf, size_t len) - { - if (!pBuf || !len) - return 0; + for (uint32_t i = 0; i < 4; i++) + { + if ((ca[i] < 0.0f) || (cb[i] < 0.0f)) + m_has_neg = true; - uint32_t h = static_cast(len); + if ((fabs(ca[i]) > basist::MAX_HALF_FLOAT) || (fabs(cb[i]) > basist::MAX_HALF_FLOAT)) + m_hf_mag_overflow = 
true; - const uint32_t bytes_left = len & 3; - len >>= 2; + if (std::isnan(ca[i]) || std::isnan(cb[i]) || std::isinf(ca[i]) || std::isinf(cb[i])) + m_any_abnormal = true; + } - while (len--) - { - const uint16_t *pWords = reinterpret_cast(pBuf); + int cah[4] = { basist::float_to_half(ca[0]), basist::float_to_half(ca[1]), basist::float_to_half(ca[2]), basist::float_to_half(ca[3]) }; + int cbh[4] = { basist::float_to_half(cb[0]), basist::float_to_half(cb[1]), basist::float_to_half(cb[2]), basist::float_to_half(cb[3]) }; - h += pWords[0]; - - const uint32_t t = (pWords[1] << 11) ^ h; - h = (h << 16) ^ t; - - pBuf += sizeof(uint32_t); - - h += h >> 11; - } + for (uint32_t c = 0; c < total_chans; c++) + { + int diff = iabs(cah[first_chan + c] - cbh[first_chan + c]); + if (diff) + m_max = std::max(m_max, (double)diff); - switch (bytes_left) - { - case 1: - h += *reinterpret_cast(pBuf); - h ^= h << 10; - h += h >> 1; - break; + sum += diff; + sum2 += squarei(cah[first_chan + c] - cbh[first_chan + c]); + } + + } // x + } // y + + double total_values = (double)width * (double)height; + if (avg_comp_error) + total_values *= (double)clamp(total_chans, 1, 4); + + const float max_val = 65535.0f; + m_mean = (float)clamp(sum / total_values, 0.0f, max_val); + m_mean_squared = (float)clamp(sum2 / total_values, 0.0f, max_val * max_val); + m_rms = (float)sqrt(m_mean_squared); + m_psnr = m_rms ? 
(float)clamp(log10(max_val / m_rms) * 20.0f, 0.0f, 1000.0f) : 1000.0f; + } + + void image_metrics::calc(const image &a, const image &b, uint32_t first_chan, uint32_t total_chans, bool avg_comp_error, bool use_601_luma) + { + assert((first_chan < 4U) && (first_chan + total_chans <= 4U)); + + const uint32_t width = basisu::minimum(a.get_width(), b.get_width()); + const uint32_t height = basisu::minimum(a.get_height(), b.get_height()); + + double hist[256]; + clear_obj(hist); + + m_has_neg = false; + m_any_abnormal = false; + m_hf_mag_overflow = false; + + for (uint32_t y = 0; y < height; y++) + { + for (uint32_t x = 0; x < width; x++) + { + const color_rgba &ca = a(x, y), &cb = b(x, y); + + if (total_chans) + { + for (uint32_t c = 0; c < total_chans; c++) + hist[iabs(ca[first_chan + c] - cb[first_chan + c])]++; + } + else + { + if (use_601_luma) + hist[iabs(ca.get_601_luma() - cb.get_601_luma())]++; + else + hist[iabs(ca.get_709_luma() - cb.get_709_luma())]++; + } + } + } + + m_max = 0; + double sum = 0.0f, sum2 = 0.0f; + for (uint32_t i = 0; i < 256; i++) + { + if (hist[i]) + { + m_max = basisu::maximum(m_max, (double)i); + double v = i * hist[i]; + sum += v; + sum2 += i * v; + } + } + + double total_values = (double)width * (double)height; + if (avg_comp_error) + total_values *= (double)clamp(total_chans, 1, 4); + + m_mean = (float)clamp(sum / total_values, 0.0f, 255.0); + m_mean_squared = (float)clamp(sum2 / total_values, 0.0f, 255.0f * 255.0f); + m_rms = (float)sqrt(m_mean_squared); + m_psnr = m_rms ? 
(float)clamp(log10(255.0 / m_rms) * 20.0f, 0.0f, 100.0f) : 100.0f; + } + + void fill_buffer_with_random_bytes(void *pBuf, size_t size, uint32_t seed) + { + rand r(seed); + + uint8_t *pDst = static_cast(pBuf); + + while (size >= sizeof(uint32_t)) + { + *(uint32_t *)pDst = r.urand32(); + pDst += sizeof(uint32_t); + size -= sizeof(uint32_t); + } + + while (size) + { + *pDst++ = r.byte(); + size--; + } + } + + uint32_t hash_hsieh(const uint8_t *pBuf, size_t len) + { + if (!pBuf || !len) + return 0; + + uint32_t h = static_cast(len); + + const uint32_t bytes_left = len & 3; + len >>= 2; + + while (len--) + { + const uint16_t *pWords = reinterpret_cast(pBuf); + + h += pWords[0]; + + const uint32_t t = (pWords[1] << 11) ^ h; + h = (h << 16) ^ t; + + pBuf += sizeof(uint32_t); + + h += h >> 11; + } + + switch (bytes_left) + { + case 1: + h += *reinterpret_cast(pBuf); + h ^= h << 10; + h += h >> 1; + break; case 2: h += *reinterpret_cast(pBuf); h ^= h << 11; @@ -1922,7 +2497,7 @@ namespace basisu } while (pixels_remaining); - assert((pDst - &input_line_buf[0]) == width * tga_bytes_per_pixel); + assert((pDst - &input_line_buf[0]) == (int)(width * tga_bytes_per_pixel)); pLine_data = &input_line_buf[0]; } @@ -2052,56 +2627,1059 @@ namespace basisu return read_tga(&filedata[0], (uint32_t)filedata.size(), width, height, n_chans); } - void image::debug_text(uint32_t x_ofs, uint32_t y_ofs, uint32_t scale_x, uint32_t scale_y, const color_rgba& fg, const color_rgba* pBG, bool alpha_only, const char* pFmt, ...) 
+ static inline void hdr_convert(const color_rgba& rgbe, vec4F& c) { - char buf[2048]; + if (rgbe[3] != 0) + { + float scale = ldexp(1.0f, rgbe[3] - 128 - 8); + c.set((float)rgbe[0] * scale, (float)rgbe[1] * scale, (float)rgbe[2] * scale, 1.0f); + } + else + { + c.set(0.0f, 0.0f, 0.0f, 1.0f); + } + } - va_list args; - va_start(args, pFmt); -#ifdef _WIN32 - vsprintf_s(buf, sizeof(buf), pFmt, args); + bool string_begins_with(const std::string& str, const char* pPhrase) + { + const size_t str_len = str.size(); + + const size_t phrase_len = strlen(pPhrase); + assert(phrase_len); + + if (str_len >= phrase_len) + { +#ifdef _MSC_VER + if (_strnicmp(pPhrase, str.c_str(), phrase_len) == 0) #else - vsnprintf(buf, sizeof(buf), pFmt, args); + if (strncasecmp(pPhrase, str.c_str(), phrase_len) == 0) #endif - va_end(args); + return true; + } - const char* p = buf; + return false; + } - const uint32_t orig_x_ofs = x_ofs; + // Radiance RGBE (.HDR) image reading. + // This code tries to preserve the original logic in Radiance's ray/src/common/color.c code: + // https://www.radiance-online.org/cgi-bin/viewcvs.cgi/ray/src/common/color.c?revision=2.26&view=markup&sortby=log + // Also see: https://flipcode.com/archives/HDR_Image_Reader.shtml. + // https://github.com/LuminanceHDR/LuminanceHDR/blob/master/src/Libpfs/io/rgbereader.cpp. + // https://radsite.lbl.gov/radiance/refer/filefmts.pdf + // Buggy readers: + // stb_image.h: appears to be a clone of rgbe.c, but with goto's (doesn't support old format files, doesn't support mixture of RLE/non-RLE scanlines) + // http://www.graphics.cornell.edu/~bjw/rgbe.html - rgbe.c/h + // http://www.graphics.cornell.edu/online/formats/rgbe/ - rgbe.c/.h - buggy + bool read_rgbe(const uint8_vec &filedata, imagef& img, rgbe_header_info& hdr_info) + { + hdr_info.clear(); - while (*p) + const uint32_t MAX_SUPPORTED_DIM = 65536; + + if (filedata.size() < 4) + return false; + + // stb_image.h checks for the string "#?RADIANCE" or "#?RGBE" in the header. 
+ // The original Radiance header code doesn't care about the specific string. + // opencv's reader only checks for "#?", so that's what we're going to do. + if ((filedata[0] != '#') || (filedata[1] != '?')) + return false; + + //uint32_t width = 0, height = 0; + bool is_rgbe = false; + size_t cur_ofs = 0; + + // Parse the lines until we encounter a blank line. + std::string cur_line; + for (; ; ) { - uint8_t c = *p++; - if ((c < 32) || (c > 127)) - c = '.'; + if (cur_ofs >= filedata.size()) + return false; - const uint8_t* pGlpyh = &g_debug_font8x8_basic[c - 32][0]; + const uint32_t HEADER_TOO_BIG_SIZE = 4096; + if (cur_ofs >= HEADER_TOO_BIG_SIZE) + { + // Header seems too large - something is likely wrong. Return failure. + return false; + } - for (uint32_t y = 0; y < 8; y++) + uint8_t c = filedata[cur_ofs++]; + + if (c == '\n') { - uint32_t row_bits = pGlpyh[y]; - for (uint32_t x = 0; x < 8; x++) + if (!cur_line.size()) + break; + + if ((cur_line[0] == '#') && (!string_begins_with(cur_line, "#?")) && (!hdr_info.m_program.size())) { - const uint32_t q = row_bits & (1 << x); - - const color_rgba* pColor = q ? 
&fg : pBG; - if (!pColor) - continue; + cur_line.erase(0, 1); + while (cur_line.size() && (cur_line[0] == ' ')) + cur_line.erase(0, 1); - if (alpha_only) - fill_box_alpha(x_ofs + x * scale_x, y_ofs + y * scale_y, scale_x, scale_y, *pColor); - else - fill_box(x_ofs + x * scale_x, y_ofs + y * scale_y, scale_x, scale_y, *pColor); + hdr_info.m_program = cur_line; + } + else if (string_begins_with(cur_line, "EXPOSURE=") && (cur_line.size() > 9)) + { + hdr_info.m_exposure = atof(cur_line.c_str() + 9); + hdr_info.m_has_exposure = true; + } + else if (string_begins_with(cur_line, "GAMMA=") && (cur_line.size() > 6)) + { + hdr_info.m_exposure = atof(cur_line.c_str() + 6); + hdr_info.m_has_gamma = true; + } + else if (cur_line == "FORMAT=32-bit_rle_rgbe") + { + is_rgbe = true; } + + cur_line.resize(0); } + else + cur_line.push_back((char)c); + } - x_ofs += 8 * scale_x; - if ((x_ofs + 8 * scale_x) > m_width) + if (!is_rgbe) + return false; + + // Assume and require the final line to have the image's dimensions. We're not supporting flipping. + for (; ; ) + { + if (cur_ofs >= filedata.size()) + return false; + uint8_t c = filedata[cur_ofs++]; + if (c == '\n') + break; + cur_line.push_back((char)c); + } + + int comp[2] = { 1, 0 }; // y, x (major, minor) + int dir[2] = { -1, 1 }; // -1, 1, (major, minor), for y -1=up + uint32_t major_dim = 0, minor_dim = 0; + + // Parse the dimension string, normally it'll be "-Y # +X #" (major, minor), rarely it differs + for (uint32_t d = 0; d < 2; d++) // 0=major, 1=minor + { + const bool is_neg_x = (strncmp(&cur_line[0], "-X ", 3) == 0); + const bool is_pos_x = (strncmp(&cur_line[0], "+X ", 3) == 0); + const bool is_x = is_neg_x || is_pos_x; + + const bool is_neg_y = (strncmp(&cur_line[0], "-Y ", 3) == 0); + const bool is_pos_y = (strncmp(&cur_line[0], "+Y ", 3) == 0); + const bool is_y = is_neg_y || is_pos_y; + + if (cur_line.size() < 3) + return false; + + if (!is_x && !is_y) + return false; + + comp[d] = is_x ? 
0 : 1; + dir[d] = (is_neg_x || is_neg_y) ? -1 : 1; + + uint32_t& dim = d ? minor_dim : major_dim; + + cur_line.erase(0, 3); + + while (cur_line.size()) { - x_ofs = orig_x_ofs; - y_ofs += 8 * scale_y; + char c = cur_line[0]; + if (c != ' ') + break; + cur_line.erase(0, 1); + } + + bool has_digits = false; + while (cur_line.size()) + { + char c = cur_line[0]; + cur_line.erase(0, 1); + + if (c == ' ') + break; + + if ((c < '0') || (c > '9')) + return false; + + const uint32_t prev_dim = dim; + dim = dim * 10 + (c - '0'); + if (dim < prev_dim) + return false; + + has_digits = true; } + if (!has_digits) + return false; + + if ((dim < 1) || (dim > MAX_SUPPORTED_DIM)) + return false; } - } - + + // temp image: width=minor, height=major + img.resize(minor_dim, major_dim); + + std::vector temp_scanline(minor_dim); + + // Read the scanlines. + for (uint32_t y = 0; y < major_dim; y++) + { + vec4F* pDst = &img(0, y); + + if ((filedata.size() - cur_ofs) < 4) + return false; + + // Determine if the line uses the new or old format. See the logic in color.c. + bool old_decrunch = false; + if ((minor_dim < 8) || (minor_dim > 0x7FFF)) + { + // Line is too short or long; must be old format. + old_decrunch = true; + } + else if (filedata[cur_ofs] != 2) + { + // R is not 2, must be old format + old_decrunch = true; + } + else + { + // c[0]/red is 2.Check GB and E for validity. + color_rgba c; + memcpy(&c, &filedata[cur_ofs], 4); + + if ((c[1] != 2) || (c[2] & 0x80)) + { + // G isn't 2, or the high bit of B is set which is impossible (image's > 0x7FFF pixels can't get here). Use old format. + old_decrunch = true; + } + else + { + // Check B and E. If this isn't the minor_dim in network order, something is wrong. The pixel would also be denormalized, and invalid. 
+ uint32_t w = (c[2] << 8) | c[3]; + if (w != minor_dim) + return false; + + cur_ofs += 4; + } + } + + if (old_decrunch) + { + uint32_t rshift = 0, x = 0; + + while (x < minor_dim) + { + if ((filedata.size() - cur_ofs) < 4) + return false; + + color_rgba c; + memcpy(&c, &filedata[cur_ofs], 4); + cur_ofs += 4; + + if ((c[0] == 1) && (c[1] == 1) && (c[2] == 1)) + { + // We'll allow RLE matches to cross scanlines, but not on the very first pixel. + if ((!x) && (!y)) + return false; + + const uint32_t run_len = c[3] << rshift; + const vec4F run_color(pDst[-1]); + + if ((x + run_len) > minor_dim) + return false; + + for (uint32_t i = 0; i < run_len; i++) + *pDst++ = run_color; + + rshift += 8; + x += run_len; + } + else + { + rshift = 0; + + hdr_convert(c, *pDst); + pDst++; + x++; + } + } + continue; + } + + // New format + for (uint32_t s = 0; s < 4; s++) + { + uint32_t x_ofs = 0; + while (x_ofs < minor_dim) + { + uint32_t num_remaining = minor_dim - x_ofs; + + if (cur_ofs >= filedata.size()) + return false; + + uint8_t count = filedata[cur_ofs++]; + if (count > 128) + { + count -= 128; + if (count > num_remaining) + return false; + + if (cur_ofs >= filedata.size()) + return false; + const uint8_t val = filedata[cur_ofs++]; + + for (uint32_t i = 0; i < count; i++) + temp_scanline[x_ofs + i][s] = val; + + x_ofs += count; + } + else + { + if ((!count) || (count > num_remaining)) + return false; + + for (uint32_t i = 0; i < count; i++) + { + if (cur_ofs >= filedata.size()) + return false; + const uint8_t val = filedata[cur_ofs++]; + + temp_scanline[x_ofs + i][s] = val; + } + + x_ofs += count; + } + } // while (x_ofs < minor_dim) + } // c + + // Convert all the RGBE pixels to float now + for (uint32_t x = 0; x < minor_dim; x++, pDst++) + hdr_convert(temp_scanline[x], *pDst); + + assert((pDst - &img(0, y)) == (int)minor_dim); + + } // y + + // at here: + // img(width,height)=image pixels as read from file, x=minor axis, y=major axis + // width=minor axis dimension + // 
height=major axis dimension + // in file, pixels are emitted in minor order, them major (so major=scanlines in the file) + + imagef final_img; + if (comp[0] == 0) // if major axis is X + final_img.resize(major_dim, minor_dim); + else // major axis is Y, minor is X + final_img.resize(minor_dim, major_dim); + + // TODO: optimize the identity case + for (uint32_t major_iter = 0; major_iter < major_dim; major_iter++) + { + for (uint32_t minor_iter = 0; minor_iter < minor_dim; minor_iter++) + { + const vec4F& p = img(minor_iter, major_iter); + + uint32_t dst_x = 0, dst_y = 0; + + // is the minor dim output x? + if (comp[1] == 0) + { + // minor axis is x, major is y + + // is minor axis (which is output x) flipped? + if (dir[1] < 0) + dst_x = minor_dim - 1 - minor_iter; + else + dst_x = minor_iter; + + // is major axis (which is output y) flipped? -1=down in raster order, 1=up + if (dir[0] < 0) + dst_y = major_iter; + else + dst_y = major_dim - 1 - major_iter; + } + else + { + // minor axis is output y, major is output x + + // is minor axis (which is output y) flipped? + if (dir[1] < 0) + dst_y = minor_iter; + else + dst_y = minor_dim - 1 - minor_iter; + + // is major axis (which is output x) flipped? 
+ if (dir[0] < 0) + dst_x = major_dim - 1 - major_iter; + else + dst_x = major_iter; + } + + final_img(dst_x, dst_y) = p; + } + } + + final_img.swap(img); + + return true; + } + + bool read_rgbe(const char* pFilename, imagef& img, rgbe_header_info& hdr_info) + { + uint8_vec filedata; + if (!read_file_to_vec(pFilename, filedata)) + return false; + return read_rgbe(filedata, img, hdr_info); + } + + static uint8_vec& append_string(uint8_vec& buf, const char* pStr) + { + const size_t str_len = strlen(pStr); + if (!str_len) + return buf; + + const size_t ofs = buf.size(); + buf.resize(ofs + str_len); + memcpy(&buf[ofs], pStr, str_len); + + return buf; + } + + static uint8_vec& append_string(uint8_vec& buf, const std::string& str) + { + if (!str.size()) + return buf; + return append_string(buf, str.c_str()); + } + + static inline void float2rgbe(color_rgba &rgbe, const vec4F &c) + { + const float red = c[0], green = c[1], blue = c[2]; + assert(red >= 0.0f && green >= 0.0f && blue >= 0.0f); + + const float max_v = basisu::maximumf(basisu::maximumf(red, green), blue); + + if (max_v < 1e-32f) + rgbe.clear(); + else + { + int e; + const float scale = frexp(max_v, &e) * 256.0f / max_v; + rgbe[0] = (uint8_t)(clamp((int)(red * scale), 0, 255)); + rgbe[1] = (uint8_t)(clamp((int)(green * scale), 0, 255)); + rgbe[2] = (uint8_t)(clamp((int)(blue * scale), 0, 255)); + rgbe[3] = (uint8_t)(e + 128); + } + } + + const bool RGBE_FORCE_RAW = false; + const bool RGBE_FORCE_OLD_CRUNCH = false; // note must readers (particularly stb_image.h's) don't properly support this, when they should + + bool write_rgbe(uint8_vec &file_data, imagef& img, rgbe_header_info& hdr_info) + { + if (!img.get_width() || !img.get_height()) + return false; + + const uint32_t width = img.get_width(), height = img.get_height(); + + file_data.resize(0); + file_data.reserve(1024 + img.get_width() * img.get_height() * 4); + + append_string(file_data, "#?RADIANCE\n"); + + if (hdr_info.m_has_exposure) + 
append_string(file_data, string_format("EXPOSURE=%g\n", hdr_info.m_exposure)); + + if (hdr_info.m_has_gamma) + append_string(file_data, string_format("GAMMA=%g\n", hdr_info.m_gamma)); + + append_string(file_data, "FORMAT=32-bit_rle_rgbe\n\n"); + append_string(file_data, string_format("-Y %u +X %u\n", height, width)); + + if (((width < 8) || (width > 0x7FFF)) || (RGBE_FORCE_RAW)) + { + for (uint32_t y = 0; y < height; y++) + { + for (uint32_t x = 0; x < width; x++) + { + color_rgba rgbe; + float2rgbe(rgbe, img(x, y)); + append_vector(file_data, (const uint8_t *)&rgbe, sizeof(rgbe)); + } + } + } + else if (RGBE_FORCE_OLD_CRUNCH) + { + for (uint32_t y = 0; y < height; y++) + { + int prev_r = -1, prev_g = -1, prev_b = -1, prev_e = -1; + uint32_t cur_run_len = 0; + + for (uint32_t x = 0; x < width; x++) + { + color_rgba rgbe; + float2rgbe(rgbe, img(x, y)); + + if ((rgbe[0] == prev_r) && (rgbe[1] == prev_g) && (rgbe[2] == prev_b) && (rgbe[3] == prev_e)) + { + if (++cur_run_len == 255) + { + // this ensures rshift stays 0, it's lame but this path is only for testing readers + color_rgba f(1, 1, 1, cur_run_len - 1); + append_vector(file_data, (const uint8_t*)&f, sizeof(f)); + append_vector(file_data, (const uint8_t*)&rgbe, sizeof(rgbe)); + cur_run_len = 0; + } + } + else + { + if (cur_run_len > 0) + { + color_rgba f(1, 1, 1, cur_run_len); + append_vector(file_data, (const uint8_t*)&f, sizeof(f)); + + cur_run_len = 0; + } + + append_vector(file_data, (const uint8_t*)&rgbe, sizeof(rgbe)); + + prev_r = rgbe[0]; + prev_g = rgbe[1]; + prev_b = rgbe[2]; + prev_e = rgbe[3]; + } + } // x + + if (cur_run_len > 0) + { + color_rgba f(1, 1, 1, cur_run_len); + append_vector(file_data, (const uint8_t*)&f, sizeof(f)); + } + } // y + } + else + { + uint8_vec temp[4]; + for (uint32_t c = 0; c < 4; c++) + temp[c].resize(width); + + for (uint32_t y = 0; y < height; y++) + { + color_rgba rgbe(2, 2, width >> 8, width & 0xFF); + append_vector(file_data, (const uint8_t*)&rgbe, sizeof(rgbe)); + + 
for (uint32_t x = 0; x < width; x++) + { + float2rgbe(rgbe, img(x, y)); + + for (uint32_t c = 0; c < 4; c++) + temp[c][x] = rgbe[c]; + } + + for (uint32_t c = 0; c < 4; c++) + { + int raw_ofs = -1; + + uint32_t x = 0; + while (x < width) + { + const uint32_t num_bytes_remaining = width - x; + const uint32_t max_run_len = basisu::minimum(num_bytes_remaining, 127); + const uint8_t cur_byte = temp[c][x]; + + uint32_t run_len = 1; + while (run_len < max_run_len) + { + if (temp[c][x + run_len] != cur_byte) + break; + run_len++; + } + + const uint32_t cost_to_keep_raw = ((raw_ofs != -1) ? 0 : 1) + run_len; // 0 or 1 bytes to start a raw run, then the repeated bytes issued as raw + const uint32_t cost_to_take_run = 2 + 1; // 2 bytes to issue the RLE, then 1 bytes to start whatever follows it (raw or RLE) + + if ((run_len >= 3) && (cost_to_take_run < cost_to_keep_raw)) + { + file_data.push_back((uint8_t)(128 + run_len)); + file_data.push_back(cur_byte); + + x += run_len; + raw_ofs = -1; + } + else + { + if (raw_ofs < 0) + { + raw_ofs = (int)file_data.size(); + file_data.push_back(0); + } + + if (++file_data[raw_ofs] == 128) + raw_ofs = -1; + + file_data.push_back(cur_byte); + + x++; + } + } // x + + } // c + } // y + } + + return true; + } + + bool write_rgbe(const char* pFilename, imagef& img, rgbe_header_info& hdr_info) + { + uint8_vec file_data; + if (!write_rgbe(file_data, img, hdr_info)) + return false; + return write_vec_to_file(pFilename, file_data); + } + + bool read_exr(const char* pFilename, imagef& img, int& n_chans) + { + n_chans = 0; + + int width = 0, height = 0; + float* out_rgba = nullptr; + const char* err = nullptr; + + int status = LoadEXRWithLayer(&out_rgba, &width, &height, pFilename, nullptr, &err); + n_chans = 4; + if (status != 0) + { + error_printf("Failed loading .EXR image \"%s\"! (TinyEXR error: %s)\n", pFilename, err ? 
err : "?"); + FreeEXRErrorMessage(err); + free(out_rgba); + return false; + } + + const uint32_t MAX_SUPPORTED_DIM = 65536; + if ((width < 1) || (height < 1) || (width > (int)MAX_SUPPORTED_DIM) || (height > (int)MAX_SUPPORTED_DIM)) + { + error_printf("Invalid dimensions of .EXR image \"%s\"!\n", pFilename); + free(out_rgba); + return false; + } + + img.resize(width, height); + + if (n_chans == 1) + { + const float* pSrc = out_rgba; + vec4F* pDst = img.get_ptr(); + + for (int y = 0; y < height; y++) + { + for (int x = 0; x < width; x++) + { + (*pDst)[0] = pSrc[0]; + (*pDst)[1] = pSrc[1]; + (*pDst)[2] = pSrc[2]; + (*pDst)[3] = 1.0f; + + pSrc += 4; + ++pDst; + } + } + } + else + { + memcpy(img.get_ptr(), out_rgba, sizeof(float) * 4 * img.get_total_pixels()); + } + + free(out_rgba); + return true; + } + + bool read_exr(const void* pMem, size_t mem_size, imagef& img) + { + float* out_rgba = nullptr; + int width = 0, height = 0; + const char* pErr = nullptr; + int res = LoadEXRFromMemory(&out_rgba, &width, &height, (const uint8_t*)pMem, mem_size, &pErr); + if (res < 0) + { + error_printf("Failed loading .EXR image from memory! (TinyEXR error: %s)\n", pErr ? 
pErr : "?"); + FreeEXRErrorMessage(pErr); + free(out_rgba); + return false; + } + + img.resize(width, height); + memcpy(img.get_ptr(), out_rgba, width * height * sizeof(float) * 4); + free(out_rgba); + + return true; + } + + bool write_exr(const char* pFilename, imagef& img, uint32_t n_chans, uint32_t flags) + { + assert((n_chans == 1) || (n_chans == 3) || (n_chans == 4)); + + const bool linear_hint = (flags & WRITE_EXR_LINEAR_HINT) != 0, + store_float = (flags & WRITE_EXR_STORE_FLOATS) != 0, + no_compression = (flags & WRITE_EXR_NO_COMPRESSION) != 0; + + const uint32_t width = img.get_width(), height = img.get_height(); + assert(width && height); + + if (!width || !height) + return false; + + float_vec layers[4]; + float* image_ptrs[4]; + for (uint32_t c = 0; c < n_chans; c++) + { + layers[c].resize(width * height); + image_ptrs[c] = layers[c].get_ptr(); + } + + // ABGR + int chan_order[4] = { 3, 2, 1, 0 }; + + if (n_chans == 1) + { + // Y + chan_order[0] = 0; + } + else if (n_chans == 3) + { + // BGR + chan_order[0] = 2; + chan_order[1] = 1; + chan_order[2] = 0; + } + else if (n_chans != 4) + { + assert(0); + return false; + } + + for (uint32_t y = 0; y < height; y++) + { + for (uint32_t x = 0; x < width; x++) + { + const vec4F& p = img(x, y); + + for (uint32_t c = 0; c < n_chans; c++) + layers[c][x + y * width] = p[chan_order[c]]; + } // x + } // y + + EXRHeader header; + InitEXRHeader(&header); + + EXRImage image; + InitEXRImage(&image); + + image.num_channels = n_chans; + image.images = (unsigned char**)image_ptrs; + image.width = width; + image.height = height; + + header.num_channels = n_chans; + + header.channels = (EXRChannelInfo*)calloc(header.num_channels, sizeof(EXRChannelInfo)); + + // Must be (A)BGR order, since most of EXR viewers expect this channel order. 
+ for (uint32_t i = 0; i < n_chans; i++) + { + char c = 'Y'; + if (n_chans == 3) + c = "BGR"[i]; + else if (n_chans == 4) + c = "ABGR"[i]; + + header.channels[i].name[0] = c; + header.channels[i].name[1] = '\0'; + + header.channels[i].p_linear = linear_hint; + } + + header.pixel_types = (int*)calloc(header.num_channels, sizeof(int)); + header.requested_pixel_types = (int*)calloc(header.num_channels, sizeof(int)); + + if (!no_compression) + header.compression_type = TINYEXR_COMPRESSIONTYPE_ZIP; + + for (int i = 0; i < header.num_channels; i++) + { + // pixel type of input image + header.pixel_types[i] = TINYEXR_PIXELTYPE_FLOAT; + + // pixel type of output image to be stored in .EXR + header.requested_pixel_types[i] = store_float ? TINYEXR_PIXELTYPE_FLOAT : TINYEXR_PIXELTYPE_HALF; + } + + const char* pErr_msg = nullptr; + + int ret = SaveEXRImageToFile(&image, &header, pFilename, &pErr_msg); + if (ret != TINYEXR_SUCCESS) + { + error_printf("Save EXR err: %s\n", pErr_msg); + FreeEXRErrorMessage(pErr_msg); + } + + free(header.channels); + free(header.pixel_types); + free(header.requested_pixel_types); + + return (ret == TINYEXR_SUCCESS); + } + + void image::debug_text(uint32_t x_ofs, uint32_t y_ofs, uint32_t scale_x, uint32_t scale_y, const color_rgba& fg, const color_rgba* pBG, bool alpha_only, const char* pFmt, ...) + { + char buf[2048]; + + va_list args; + va_start(args, pFmt); +#ifdef _WIN32 + vsprintf_s(buf, sizeof(buf), pFmt, args); +#else + vsnprintf(buf, sizeof(buf), pFmt, args); +#endif + va_end(args); + + const char* p = buf; + + const uint32_t orig_x_ofs = x_ofs; + + while (*p) + { + uint8_t c = *p++; + if ((c < 32) || (c > 127)) + c = '.'; + + const uint8_t* pGlpyh = &g_debug_font8x8_basic[c - 32][0]; + + for (uint32_t y = 0; y < 8; y++) + { + uint32_t row_bits = pGlpyh[y]; + for (uint32_t x = 0; x < 8; x++) + { + const uint32_t q = row_bits & (1 << x); + + const color_rgba* pColor = q ? 
&fg : pBG; + if (!pColor) + continue; + + if (alpha_only) + fill_box_alpha(x_ofs + x * scale_x, y_ofs + y * scale_y, scale_x, scale_y, *pColor); + else + fill_box(x_ofs + x * scale_x, y_ofs + y * scale_y, scale_x, scale_y, *pColor); + } + } + + x_ofs += 8 * scale_x; + if ((x_ofs + 8 * scale_x) > m_width) + { + x_ofs = orig_x_ofs; + y_ofs += 8 * scale_y; + } + } + } + + // Very basic global Reinhard tone mapping, output converted to sRGB with no dithering, alpha is carried through unchanged. + // Only used for debugging/development. + void tonemap_image_reinhard(image &ldr_img, const imagef &hdr_img, float exposure) + { + uint32_t width = hdr_img.get_width(), height = hdr_img.get_height(); + + ldr_img.resize(width, height); + + for (uint32_t y = 0; y < height; y++) + { + for (uint32_t x = 0; x < width; x++) + { + vec4F c(hdr_img(x, y)); + + for (uint32_t t = 0; t < 3; t++) + { + if (c[t] <= 0.0f) + { + c[t] = 0.0f; + } + else + { + c[t] *= exposure; + c[t] = c[t] / (1.0f + c[t]); + } + } + + c.clamp(0.0f, 1.0f); + + c[0] = linear_to_srgb(c[0]) * 255.0f; + c[1] = linear_to_srgb(c[1]) * 255.0f; + c[2] = linear_to_srgb(c[2]) * 255.0f; + c[3] = c[3] * 255.0f; + + color_rgba& o = ldr_img(x, y); + + o[0] = (uint8_t)std::round(c[0]); + o[1] = (uint8_t)std::round(c[1]); + o[2] = (uint8_t)std::round(c[2]); + o[3] = (uint8_t)std::round(c[3]); + } + } + } + + bool tonemap_image_compressive(image& dst_img, const imagef& hdr_test_img) + { + const uint32_t width = hdr_test_img.get_width(); + const uint32_t height = hdr_test_img.get_height(); + + uint16_vec orig_half_img(width * 3 * height); + uint16_vec half_img(width * 3 * height); + + int max_shift = 32; + + for (uint32_t y = 0; y < height; y++) + { + for (uint32_t x = 0; x < width; x++) + { + const vec4F& p = hdr_test_img(x, y); + + for (uint32_t i = 0; i < 3; i++) + { + if (p[i] < 0.0f) + return false; + if (p[i] > basist::MAX_HALF_FLOAT) + return false; + + uint32_t h = basist::float_to_half(p[i]); + //uint32_t orig_h = h; + 
+ orig_half_img[(x + y * width) * 3 + i] = (uint16_t)h; + + // Rotate sign bit into LSB + //h = rot_left16((uint16_t)h, 1); + //assert(rot_right16((uint16_t)h, 1) == orig_h); + h <<= 1; + + half_img[(x + y * width) * 3 + i] = (uint16_t)h; + + // Determine # of leading zero bits, ignoring the sign bit + if (h) + { + int lz = clz(h) - 16; + assert(lz >= 0 && lz <= 16); + + assert((h << lz) <= 0xFFFF); + + max_shift = basisu::minimum(max_shift, lz); + } + } // i + } // x + } // y + + //printf("tonemap_image_compressive: Max leading zeros: %i\n", max_shift); + + uint32_t high_hist[256]; + clear_obj(high_hist); + + for (uint32_t y = 0; y < height; y++) + { + for (uint32_t x = 0; x < width; x++) + { + for (uint32_t i = 0; i < 3; i++) + { + uint16_t& hf = half_img[(x + y * width) * 3 + i]; + + assert(((uint32_t)hf << max_shift) <= 65535); + + hf <<= max_shift; + + uint32_t h = (uint8_t)(hf >> 8); + high_hist[h]++; + } + } // x + } // y + + uint32_t total_vals_used = 0; + int remap_old_to_new[256]; + for (uint32_t i = 0; i < 256; i++) + remap_old_to_new[i] = -1; + + for (uint32_t i = 0; i < 256; i++) + { + if (high_hist[i] != 0) + { + remap_old_to_new[i] = total_vals_used; + total_vals_used++; + } + } + + assert(total_vals_used >= 1); + + //printf("tonemap_image_compressive: Total used high byte values: %u, unused: %u\n", total_vals_used, 256 - total_vals_used); + + bool val_used[256]; + clear_obj(val_used); + + int remap_new_to_old[256]; + for (uint32_t i = 0; i < 256; i++) + remap_new_to_old[i] = -1; + BASISU_NOTE_UNUSED(remap_new_to_old); + + int prev_c = -1; + BASISU_NOTE_UNUSED(prev_c); + for (uint32_t i = 0; i < 256; i++) + { + if (remap_old_to_new[i] >= 0) + { + int c; + if (total_vals_used <= 1) + c = remap_old_to_new[i]; + else + { + c = (remap_old_to_new[i] * 255 + ((total_vals_used - 1) / 2)) / (total_vals_used - 1); + + assert(c > prev_c); + } + + assert(!val_used[c]); + + remap_new_to_old[c] = i; + + remap_old_to_new[i] = c; + prev_c = c; + + //printf("%u ", 
c); + + val_used[c] = true; + } + } // i + //printf("\n"); + + dst_img.resize(width, height); + + for (uint32_t y = 0; y < height; y++) + { + for (uint32_t x = 0; x < width; x++) + { + for (uint32_t c = 0; c < 3; c++) + { + uint16_t& v16 = half_img[(x + y * width) * 3 + c]; + + uint32_t hb = v16 >> 8; + //uint32_t lb = v16 & 0xFF; + + assert(remap_old_to_new[hb] != -1); + assert(remap_old_to_new[hb] <= 255); + assert(remap_new_to_old[remap_old_to_new[hb]] == (int)hb); + + hb = remap_old_to_new[hb]; + + //v16 = (uint16_t)((hb << 8) | lb); + + dst_img(x, y)[c] = (uint8_t)hb; + } + } // x + } // y + + return true; + } + } // namespace basisu diff --git a/thirdparty/basis_universal/encoder/basisu_enc.h b/thirdparty/basis_universal/encoder/basisu_enc.h index 0efeaa461fbf..780605e7b861 100644 --- a/thirdparty/basis_universal/encoder/basisu_enc.h +++ b/thirdparty/basis_universal/encoder/basisu_enc.h @@ -1,5 +1,5 @@ // basisu_enc.h -// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -48,7 +48,8 @@ namespace basisu // Encoder library initialization. // This function MUST be called before encoding anything! - void basisu_encoder_init(bool use_opencl = false, bool opencl_force_serialization = false); + // Returns false if library initialization fails. + bool basisu_encoder_init(bool use_opencl = false, bool opencl_force_serialization = false); void basisu_encoder_deinit(); // basisu_kernels_sse.cpp - will be a no-op and g_cpu_supports_sse41 will always be false unless compiled with BASISU_SUPPORT_SSE=1 @@ -70,6 +71,18 @@ namespace basisu return (uint8_t)((i & 0xFFFFFF00U) ? 
(~(i >> 31)) : i); } + inline int left_shift32(int val, int shift) + { + assert((shift >= 0) && (shift < 32)); + return static_cast(static_cast(val) << shift); + } + + inline uint32_t left_shift32(uint32_t val, int shift) + { + assert((shift >= 0) && (shift < 32)); + return val << shift; + } + inline int32_t clampi(int32_t value, int32_t low, int32_t high) { if (value < low) @@ -130,6 +143,31 @@ namespace basisu return bits; } + + // Open interval + inline int bounds_check(int v, int l, int h) { (void)v; (void)l; (void)h; assert(v >= l && v < h); return v; } + inline uint32_t bounds_check(uint32_t v, uint32_t l, uint32_t h) { (void)v; (void)l; (void)h; assert(v >= l && v < h); return v; } + + // Closed interval + inline int bounds_check_incl(int v, int l, int h) { (void)v; (void)l; (void)h; assert(v >= l && v <= h); return v; } + inline uint32_t bounds_check_incl(uint32_t v, uint32_t l, uint32_t h) { (void)v; (void)l; (void)h; assert(v >= l && v <= h); return v; } + + inline uint32_t clz(uint32_t x) + { + if (!x) + return 32; + + uint32_t n = 0; + while ((x & 0x80000000) == 0) + { + x <<= 1u; + n++; + } + + return n; + } + + bool string_begins_with(const std::string& str, const char* pPhrase); // Hashing @@ -268,6 +306,7 @@ namespace basisu public: enum { num_elements = N }; + typedef T scalar_type; inline vec() { } inline vec(eZero) { set_zero(); } @@ -291,6 +330,7 @@ namespace basisu inline bool operator<(const vec &rhs) const { for (uint32_t i = 0; i < N; i++) { if (m_v[i] < rhs.m_v[i]) return true; else if (m_v[i] != rhs.m_v[i]) return false; } return false; } inline void set_zero() { for (uint32_t i = 0; i < N; i++) m_v[i] = 0; } + inline void clear() { set_zero(); } template inline vec &set(const vec &other) @@ -391,7 +431,7 @@ namespace basisu inline T distance(const vec &other) const { return static_cast(sqrt(squared_distance(other))); } inline double distance_d(const vec& other) const { return sqrt(squared_distance_d(other)); } - inline vec 
&normalize_in_place() { T len = length(); if (len != 0.0f) *this *= (1.0f / len); return *this; } + inline vec &normalize_in_place() { T len = length(); if (len != 0.0f) *this *= (1.0f / len); return *this; } inline vec &clamp(T l, T h) { @@ -722,7 +762,7 @@ namespace basisu void job_thread(uint32_t index); }; - // Simple 32-bit color class + // Simple 64-bit color class class color_rgba_i16 { @@ -1116,7 +1156,9 @@ namespace basisu { std::string result(s); for (size_t i = 0; i < result.size(); i++) - result[i] = (char)tolower((int)result[i]); + { + result[i] = (char)tolower((uint8_t)(result[i])); + } return result; } @@ -1408,7 +1450,7 @@ namespace basisu size_t get_total_training_vecs() const { return m_training_vecs.size(); } const array_of_weighted_training_vecs &get_training_vecs() const { return m_training_vecs; } - array_of_weighted_training_vecs &get_training_vecs() { return m_training_vecs; } + array_of_weighted_training_vecs &get_training_vecs() { return m_training_vecs; } void retrieve(basisu::vector< basisu::vector > &codebook) const { @@ -1437,36 +1479,36 @@ namespace basisu } void retrieve(uint32_t max_clusters, basisu::vector &codebook) const - { + { uint_vec node_stack; - node_stack.reserve(512); + node_stack.reserve(512); - codebook.resize(0); - codebook.reserve(max_clusters); + codebook.resize(0); + codebook.reserve(max_clusters); - uint32_t node_index = 0; + uint32_t node_index = 0; - while (true) - { - const tsvq_node& cur = m_nodes[node_index]; + while (true) + { + const tsvq_node& cur = m_nodes[node_index]; - if (cur.is_leaf() || ((2 + cur.m_codebook_index) > (int)max_clusters)) - { - codebook.resize(codebook.size() + 1); - codebook.back() = cur.m_training_vecs; + if (cur.is_leaf() || ((2 + cur.m_codebook_index) > (int)max_clusters)) + { + codebook.resize(codebook.size() + 1); + codebook.back() = cur.m_training_vecs; - if (node_stack.empty()) - break; + if (node_stack.empty()) + break; - node_index = node_stack.back(); - node_stack.pop_back(); 
- continue; - } + node_index = node_stack.back(); + node_stack.pop_back(); + continue; + } - node_stack.push_back(cur.m_right_index); - node_index = cur.m_left_index; - } - } + node_stack.push_back(cur.m_right_index); + node_index = cur.m_left_index; + } + } bool generate(uint32_t max_size) { @@ -2319,6 +2361,14 @@ namespace basisu m_total_bits = 0; } + inline void restart() + { + m_bytes.resize(0); + m_bit_buffer = 0; + m_bit_buffer_size = 0; + m_total_bits = 0; + } + inline const uint8_vec &get_bytes() const { return m_bytes; } inline uint64_t get_total_bits() const { return m_total_bits; } @@ -2920,11 +2970,11 @@ namespace basisu inline const color_rgba *get_ptr() const { return &m_pixels[0]; } inline color_rgba *get_ptr() { return &m_pixels[0]; } - bool has_alpha() const + bool has_alpha(uint32_t channel = 3) const { for (uint32_t y = 0; y < m_height; ++y) for (uint32_t x = 0; x < m_width; ++x) - if ((*this)(x, y).a < 255) + if ((*this)(x, y)[channel] < 255) return true; return false; @@ -3130,6 +3180,31 @@ namespace basisu return *this; } + imagef& crop_dup_borders(uint32_t w, uint32_t h) + { + const uint32_t orig_w = m_width, orig_h = m_height; + + crop(w, h); + + if (orig_w && orig_h) + { + if (m_width > orig_w) + { + for (uint32_t x = orig_w; x < m_width; x++) + for (uint32_t y = 0; y < m_height; y++) + set_clipped(x, y, get_clamped(minimum(x, orig_w - 1U), minimum(y, orig_h - 1U))); + } + + if (m_height > orig_h) + { + for (uint32_t y = orig_h; y < m_height; y++) + for (uint32_t x = 0; x < m_width; x++) + set_clipped(x, y, get_clamped(minimum(x, orig_w - 1U), minimum(y, orig_h - 1U))); + } + } + return *this; + } + inline const vec4F &operator() (uint32_t x, uint32_t y) const { assert(x < m_width && y < m_height); return m_pixels[x + y * m_pitch]; } inline vec4F &operator() (uint32_t x, uint32_t y) { assert(x < m_width && y < m_height); return m_pixels[x + y * m_pitch]; } @@ -3213,19 +3288,128 @@ namespace basisu inline const vec4F *get_ptr() const { 
return &m_pixels[0]; } inline vec4F *get_ptr() { return &m_pixels[0]; } + + bool clean_astc_hdr_pixels(float highest_mag) + { + bool status = true; + bool nan_msg = false; + bool inf_msg = false; + bool neg_zero_msg = false; + bool neg_msg = false; + bool clamp_msg = false; + + for (uint32_t iy = 0; iy < m_height; iy++) + { + for (uint32_t ix = 0; ix < m_width; ix++) + { + vec4F& c = (*this)(ix, iy); + + for (uint32_t s = 0; s < 4; s++) + { + float &p = c[s]; + union { float f; uint32_t u; } x; x.f = p; + + if ((std::isnan(p)) || (std::isinf(p)) || (x.u == 0x80000000)) + { + if (std::isnan(p)) + { + if (!nan_msg) + { + fprintf(stderr, "One or more pixels was NaN, setting to 0.\n"); + nan_msg = true; + } + } + + if (std::isinf(p)) + { + if (!inf_msg) + { + fprintf(stderr, "One or more pixels was INF, setting to 0.\n"); + inf_msg = true; + } + } + + if (x.u == 0x80000000) + { + if (!neg_zero_msg) + { + fprintf(stderr, "One or more pixels was -0, setting them to 0.\n"); + neg_zero_msg = true; + } + } + + p = 0.0f; + status = false; + } + else + { + //const float o = p; + if (p < 0.0f) + { + p = 0.0f; + + if (!neg_msg) + { + fprintf(stderr, "One or more pixels was negative -- setting these pixel components to 0 because ASTC HDR doesn't support signed values.\n"); + neg_msg = true; + } + + status = false; + } + + if (p > highest_mag) + { + p = highest_mag; + + if (!clamp_msg) + { + fprintf(stderr, "One or more pixels had to be clamped to %f.\n", highest_mag); + clamp_msg = true; + } + + status = false; + } + } + } + } + } + + return status; + } + + imagef& flip_y() + { + for (uint32_t y = 0; y < m_height / 2; ++y) + for (uint32_t x = 0; x < m_width; ++x) + std::swap((*this)(x, y), (*this)(x, m_height - 1 - y)); + + return *this; + } private: uint32_t m_width, m_height, m_pitch; // all in pixels vec4F_vec m_pixels; }; + // REC 709 coefficients + const float REC_709_R = 0.212656f, REC_709_G = 0.715158f, REC_709_B = 0.072186f; + + inline float get_luminance(const vec4F &c) 
+ { + return c[0] * REC_709_R + c[1] * REC_709_G + c[2] * REC_709_B; + } + + float linear_to_srgb(float l); + float srgb_to_linear(float s); + // Image metrics class image_metrics { public: // TODO: Add ssim - float m_max, m_mean, m_mean_squared, m_rms, m_psnr, m_ssim; + double m_max, m_mean, m_mean_squared, m_rms, m_psnr, m_ssim; + bool m_has_neg, m_hf_mag_overflow, m_any_abnormal; image_metrics() { @@ -3240,10 +3424,17 @@ namespace basisu m_rms = 0; m_psnr = 0; m_ssim = 0; + m_has_neg = false; + m_hf_mag_overflow = false; + m_any_abnormal = false; } - void print(const char *pPrefix = nullptr) { printf("%sMax: %3.0f Mean: %3.3f RMS: %3.3f PSNR: %2.3f dB\n", pPrefix ? pPrefix : "", m_max, m_mean, m_rms, m_psnr); } + void print(const char *pPrefix = nullptr) { printf("%sMax: %3.3f Mean: %3.3f RMS: %3.3f PSNR: %2.3f dB\n", pPrefix ? pPrefix : "", m_max, m_mean, m_rms, m_psnr); } + void print_hp(const char* pPrefix = nullptr) { printf("%sMax: %3.6f Mean: %3.6f RMS: %3.6f PSNR: %2.6f dB, Any Neg: %u, Half float overflow: %u, Any NaN/Inf: %u\n", pPrefix ? 
pPrefix : "", m_max, m_mean, m_rms, m_psnr, m_has_neg, m_hf_mag_overflow, m_any_abnormal); } + void calc(const imagef& a, const imagef& b, uint32_t first_chan = 0, uint32_t total_chans = 0, bool avg_comp_error = true, bool log = false); + void calc_half(const imagef& a, const imagef& b, uint32_t first_chan, uint32_t total_chans, bool avg_comp_error); + void calc_half2(const imagef& a, const imagef& b, uint32_t first_chan, uint32_t total_chans, bool avg_comp_error); void calc(const image &a, const image &b, uint32_t first_chan = 0, uint32_t total_chans = 0, bool avg_comp_error = true, bool use_601_luma = false); }; @@ -3256,6 +3447,8 @@ namespace basisu bool load_tga(const char* pFilename, image& img); inline bool load_tga(const std::string &filename, image &img) { return load_tga(filename.c_str(), img); } + bool load_qoi(const char* pFilename, image& img); + bool load_jpg(const char *pFilename, image& img); inline bool load_jpg(const std::string &filename, image &img) { return load_jpg(filename.c_str(), img); } @@ -3263,9 +3456,64 @@ namespace basisu bool load_image(const char* pFilename, image& img); inline bool load_image(const std::string &filename, image &img) { return load_image(filename.c_str(), img); } + // Supports .HDR and most (but not all) .EXR's (see TinyEXR). 
+ bool load_image_hdr(const char* pFilename, imagef& img, bool ldr_srgb_to_linear = true); + inline bool load_image_hdr(const std::string& filename, imagef& img, bool ldr_srgb_to_linear = true) { return load_image_hdr(filename.c_str(), img, ldr_srgb_to_linear); } + + enum class hdr_image_type + { + cHITRGBAHalfFloat = 0, + cHITRGBAFloat = 1, + cHITPNGImage = 2, + cHITEXRImage = 3, + cHITHDRImage = 4 + }; + + bool load_image_hdr(const void* pMem, size_t mem_size, imagef& img, uint32_t width, uint32_t height, hdr_image_type img_type, bool ldr_srgb_to_linear); + uint8_t *read_tga(const uint8_t *pBuf, uint32_t buf_size, int &width, int &height, int &n_chans); uint8_t *read_tga(const char *pFilename, int &width, int &height, int &n_chans); + struct rgbe_header_info + { + std::string m_program; + + // Note no validation is done, either gamma or exposure may be 0. + double m_gamma; + bool m_has_gamma; + + double m_exposure; // watts/steradian/m^2. + bool m_has_exposure; + + void clear() + { + m_program.clear(); + m_gamma = 1.0f; + m_has_gamma = false; + m_exposure = 1.0f; + m_has_exposure = false; + } + }; + + bool read_rgbe(const uint8_vec& filedata, imagef& img, rgbe_header_info& hdr_info); + bool read_rgbe(const char* pFilename, imagef& img, rgbe_header_info &hdr_info); + + bool write_rgbe(uint8_vec& file_data, imagef& img, rgbe_header_info& hdr_info); + bool write_rgbe(const char* pFilename, imagef& img, rgbe_header_info& hdr_info); + + bool read_exr(const char* pFilename, imagef& img, int& n_chans); + bool read_exr(const void* pMem, size_t mem_size, imagef& img); + + enum + { + WRITE_EXR_LINEAR_HINT = 1, // hint for lossy comp. methods: exr_perceptual_treatment_t, logarithmic or linear, defaults to logarithmic + WRITE_EXR_STORE_FLOATS = 2, // use 32-bit floats, otherwise it uses half floats + WRITE_EXR_NO_COMPRESSION = 4 // no compression, otherwise it uses ZIP compression (16 scanlines per block) + }; + + // Supports 1 (Y), 3 (RGB), or 4 (RGBA) channel images. 
+ bool write_exr(const char* pFilename, imagef& img, uint32_t n_chans, uint32_t flags); + enum { cImageSaveGrayscale = 1, @@ -3276,19 +3524,22 @@ namespace basisu inline bool save_png(const std::string &filename, const image &img, uint32_t image_save_flags = 0, uint32_t grayscale_comp = 0) { return save_png(filename.c_str(), img, image_save_flags, grayscale_comp); } bool read_file_to_vec(const char* pFilename, uint8_vec& data); - + bool read_file_to_data(const char* pFilename, void *pData, size_t len); + bool write_data_to_file(const char* pFilename, const void* pData, size_t len); inline bool write_vec_to_file(const char* pFilename, const uint8_vec& v) { return v.size() ? write_data_to_file(pFilename, &v[0], v.size()) : write_data_to_file(pFilename, "", 0); } - - float linear_to_srgb(float l); - float srgb_to_linear(float s); - + bool image_resample(const image &src, image &dst, bool srgb = false, const char *pFilter = "lanczos4", float filter_scale = 1.0f, bool wrapping = false, uint32_t first_comp = 0, uint32_t num_comps = 4); + bool image_resample(const imagef& src, imagef& dst, + const char* pFilter = "lanczos4", float filter_scale = 1.0f, + bool wrapping = false, + uint32_t first_comp = 0, uint32_t num_comps = 4); + // Timing typedef uint64_t timer_ticks; @@ -3319,6 +3570,8 @@ namespace basisu bool m_started, m_stopped; }; + inline double get_interval_timer() { return interval_timer::ticks_to_secs(interval_timer::get_ticks()); } + // 2D array template @@ -3372,8 +3625,8 @@ namespace basisu inline const T &operator[] (uint32_t i) const { return m_values[i]; } inline T &operator[] (uint32_t i) { return m_values[i]; } - inline const T &at_clamped(int x, int y) const { return (*this)(clamp(x, 0, m_width), clamp(y, 0, m_height)); } - inline T &at_clamped(int x, int y) { return (*this)(clamp(x, 0, m_width), clamp(y, 0, m_height)); } + inline const T &at_clamped(int x, int y) const { return (*this)(clamp(x, 0, m_width - 1), clamp(y, 0, m_height - 1)); } + inline T 
&at_clamped(int x, int y) { return (*this)(clamp(x, 0, m_width - 1), clamp(y, 0, m_height - 1)); } void clear() { @@ -3450,7 +3703,327 @@ namespace basisu } }; typedef basisu::vector pixel_block_vec; - + + struct pixel_block_hdr + { + vec4F m_pixels[cPixelBlockHeight][cPixelBlockWidth]; // [y][x] + + inline const vec4F& operator() (uint32_t x, uint32_t y) const { assert((x < cPixelBlockWidth) && (y < cPixelBlockHeight)); return m_pixels[y][x]; } + inline vec4F& operator() (uint32_t x, uint32_t y) { assert((x < cPixelBlockWidth) && (y < cPixelBlockHeight)); return m_pixels[y][x]; } + + inline const vec4F* get_ptr() const { return &m_pixels[0][0]; } + inline vec4F* get_ptr() { return &m_pixels[0][0]; } + + inline void clear() { clear_obj(*this); } + + inline bool operator== (const pixel_block& rhs) const + { + return memcmp(m_pixels, rhs.m_pixels, sizeof(m_pixels)) == 0; + } + }; + typedef basisu::vector pixel_block_hdr_vec; + + void tonemap_image_reinhard(image& ldr_img, const imagef& hdr_img, float exposure); + bool tonemap_image_compressive(image& dst_img, const imagef& hdr_test_img); + + // Intersection + enum eClear { cClear = 0 }; + enum eInitExpand { cInitExpand = 0 }; + + template + class ray + { + public: + typedef vector_type vector_t; + typedef typename vector_type::scalar_type scalar_type; + + inline ray() { } + inline ray(eClear) { clear(); } + inline ray(const vector_type& origin, const vector_type& direction) : m_origin(origin), m_direction(direction) { } + + inline void clear() + { + m_origin.clear(); + m_direction.clear(); + } + + inline const vector_type& get_origin(void) const { return m_origin; } + inline void set_origin(const vector_type& origin) { m_origin = origin; } + + inline const vector_type& get_direction(void) const { return m_direction; } + inline void set_direction(const vector_type& direction) { m_direction = direction; } + + inline void set_endpoints(const vector_type& start, const vector_type& end) + { + m_origin = start; + + 
m_direction = end - start; + m_direction.normalize_in_place(); + } + + inline vector_type eval(scalar_type t) const + { + return m_origin + m_direction * t; + } + + private: + vector_type m_origin; + vector_type m_direction; + }; + + typedef ray ray2F; + typedef ray ray3F; + + template + class vec_interval + { + public: + enum { N = T::num_elements }; + typedef typename T::scalar_type scalar_type; + + inline vec_interval(const T& v) { m_bounds[0] = v; m_bounds[1] = v; } + inline vec_interval(const T& low, const T& high) { m_bounds[0] = low; m_bounds[1] = high; } + + inline vec_interval() { } + inline vec_interval(eClear) { clear(); } + inline vec_interval(eInitExpand) { init_expand(); } + + inline void clear() { m_bounds[0].clear(); m_bounds[1].clear(); } + + inline void init_expand() + { + m_bounds[0].set(1e+30f, 1e+30f, 1e+30f); + m_bounds[1].set(-1e+30f, -1e+30f, -1e+30f); + } + + inline vec_interval expand(const T& p) + { + for (uint32_t c = 0; c < N; c++) + { + if (p[c] < m_bounds[0][c]) + m_bounds[0][c] = p[c]; + + if (p[c] > m_bounds[1][c]) + m_bounds[1][c] = p[c]; + } + + return *this; + } + + inline const T& operator[] (uint32_t i) const { assert(i < 2); return m_bounds[i]; } + inline T& operator[] (uint32_t i) { assert(i < 2); return m_bounds[i]; } + + const T& get_low() const { return m_bounds[0]; } + T& get_low() { return m_bounds[0]; } + + const T& get_high() const { return m_bounds[1]; } + T& get_high() { return m_bounds[1]; } + + scalar_type get_dim(uint32_t axis) const { return m_bounds[1][axis] - m_bounds[0][axis]; } + + bool contains(const T& p) const + { + const T& low = get_low(), high = get_high(); + + for (uint32_t i = 0; i < N; i++) + { + if (p[i] < low[i]) + return false; + + if (p[i] > high[i]) + return false; + } + return true; + } + + private: + T m_bounds[2]; + }; + + typedef vec_interval vec_interval1F; + typedef vec_interval vec_interval2F; + typedef vec_interval vec_interval3F; + typedef vec_interval vec_interval4F; + + typedef 
vec_interval2F aabb2F; + typedef vec_interval3F aabb3F; + + namespace intersection + { + enum result + { + cBackfacing = -1, + cFailure = 0, + cSuccess, + cParallel, + cInside, + }; + + // Returns cInside, cSuccess, or cFailure. + // Algorithm: Graphics Gems 1 + template + result ray_aabb(vector_type& coord, scalar_type& t, const ray_type& ray, const aabb_type& box) + { + enum + { + cNumDim = vector_type::num_elements, + cRight = 0, + cLeft = 1, + cMiddle = 2 + }; + + bool inside = true; + int quadrant[cNumDim]; + scalar_type candidate_plane[cNumDim]; + + for (int i = 0; i < cNumDim; i++) + { + if (ray.get_origin()[i] < box[0][i]) + { + quadrant[i] = cLeft; + candidate_plane[i] = box[0][i]; + inside = false; + } + else if (ray.get_origin()[i] > box[1][i]) + { + quadrant[i] = cRight; + candidate_plane[i] = box[1][i]; + inside = false; + } + else + { + quadrant[i] = cMiddle; + } + } + + if (inside) + { + coord = ray.get_origin(); + t = 0.0f; + return cInside; + } + + scalar_type max_t[cNumDim]; + for (int i = 0; i < cNumDim; i++) + { + if ((quadrant[i] != cMiddle) && (ray.get_direction()[i] != 0.0f)) + max_t[i] = (candidate_plane[i] - ray.get_origin()[i]) / ray.get_direction()[i]; + else + max_t[i] = -1.0f; + } + + int which_plane = 0; + for (int i = 1; i < cNumDim; i++) + if (max_t[which_plane] < max_t[i]) + which_plane = i; + + if (max_t[which_plane] < 0.0f) + return cFailure; + + for (int i = 0; i < cNumDim; i++) + { + if (i != which_plane) + { + coord[i] = ray.get_origin()[i] + max_t[which_plane] * ray.get_direction()[i]; + + if ((coord[i] < box[0][i]) || (coord[i] > box[1][i])) + return cFailure; + } + else + { + coord[i] = candidate_plane[i]; + } + + assert(coord[i] >= box[0][i] && coord[i] <= box[1][i]); + } + + t = max_t[which_plane]; + return cSuccess; + } + + template + result ray_aabb(bool& started_within, vector_type& coord, scalar_type& t, const ray_type& ray, const aabb_type& box) + { + if (!box.contains(ray.get_origin())) + { + started_within = false; 
+ return ray_aabb(coord, t, ray, box); + } + + started_within = true; + + typename vector_type::T diag_dist = box.diagonal_length() * 1.5f; + ray_type outside_ray(ray.eval(diag_dist), -ray.get_direction()); + + result res(ray_aabb(coord, t, outside_ray, box)); + if (res != cSuccess) + return res; + + t = basisu::maximum(0.0f, diag_dist - t); + return cSuccess; + } + + } // intersect + + // This float->half conversion matches how "F32TO16" works on Intel GPU's. + // Input cannot be negative, Inf or Nan. + inline basist::half_float float_to_half_non_neg_no_nan_inf(float val) + { + union { float f; int32_t i; uint32_t u; } fi = { val }; + const int flt_m = fi.i & 0x7FFFFF, flt_e = (fi.i >> 23) & 0xFF; + int e = 0, m = 0; + + assert(((fi.i >> 31) == 0) && (flt_e != 0xFF)); + + // not zero or denormal + if (flt_e != 0) + { + int new_exp = flt_e - 127; + if (new_exp > 15) + e = 31; + else if (new_exp < -14) + m = lrintf((1 << 24) * fabsf(fi.f)); + else + { + e = new_exp + 15; + m = lrintf(flt_m * (1.0f / ((float)(1 << 13)))); + } + } + + assert((0 <= m) && (m <= 1024)); + if (m == 1024) + { + e++; + m = 0; + } + + assert((e >= 0) && (e <= 31)); + assert((m >= 0) && (m <= 1023)); + + basist::half_float result = (basist::half_float)((e << 10) | m); + return result; + } + + // Supports positive and denormals only. No NaN or Inf. 
+ inline float fast_half_to_float_pos_not_inf_or_nan(basist::half_float h) + { + assert(!basist::half_is_signed(h) && !basist::is_half_inf_or_nan(h)); + + union fu32 + { + uint32_t u; + float f; + }; + + static const fu32 K = { 0x77800000 }; + + fu32 o; + o.u = h << 13; + o.f *= K.f; + + return o.f; + } + } // namespace basisu diff --git a/thirdparty/basis_universal/encoder/basisu_etc.cpp b/thirdparty/basis_universal/encoder/basisu_etc.cpp index f8bd0f12e5f0..ba1c14231d32 100644 --- a/thirdparty/basis_universal/encoder/basisu_etc.cpp +++ b/thirdparty/basis_universal/encoder/basisu_etc.cpp @@ -1,5 +1,5 @@ // basis_etc.cpp -// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/thirdparty/basis_universal/encoder/basisu_etc.h b/thirdparty/basis_universal/encoder/basisu_etc.h index 208f2aac1b80..5c44bd481212 100644 --- a/thirdparty/basis_universal/encoder/basisu_etc.h +++ b/thirdparty/basis_universal/encoder/basisu_etc.h @@ -1,5 +1,5 @@ // basis_etc.h -// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/thirdparty/basis_universal/encoder/basisu_frontend.cpp b/thirdparty/basis_universal/encoder/basisu_frontend.cpp index 1f30a33c7070..750f706aa538 100644 --- a/thirdparty/basis_universal/encoder/basisu_frontend.cpp +++ b/thirdparty/basis_universal/encoder/basisu_frontend.cpp @@ -1,5 +1,5 @@ // basisu_frontend.cpp -// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -2347,6 +2347,7 @@ namespace basisu continue; uint64_t overall_best_err = 0; + (void)overall_best_err; uint64_t total_err[4][4][4]; clear_obj(total_err); diff --git a/thirdparty/basis_universal/encoder/basisu_frontend.h b/thirdparty/basis_universal/encoder/basisu_frontend.h index cda73f398473..69fc8d8ec589 100644 --- a/thirdparty/basis_universal/encoder/basisu_frontend.h +++ b/thirdparty/basis_universal/encoder/basisu_frontend.h @@ -1,5 +1,5 @@ // basisu_frontend.h -// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp b/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp index dec769d5acbe..342446b8fd43 100644 --- a/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp +++ b/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp @@ -1,5 +1,5 @@ // basisu_gpu_texture.cpp -// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -15,13 +15,15 @@ #include "basisu_gpu_texture.h" #include "basisu_enc.h" #include "basisu_pvrtc1_4.h" -#if BASISU_USE_ASTC_DECOMPRESS -#include "basisu_astc_decomp.h" -#endif +#include "3rdparty/android_astc_decomp.h" #include "basisu_bc7enc.h" +#include "../transcoder/basisu_astc_hdr_core.h" namespace basisu { + //------------------------------------------------------------------------------------------------ + // ETC2 EAC + void unpack_etc2_eac(const void *pBlock_bits, color_rgba *pPixels) { static_assert(sizeof(eac_a8_block) == 8, "sizeof(eac_a8_block) == 8"); @@ -56,6 +58,8 @@ namespace basisu pPixels[15].a = clamp255(base + pTable[pBlock->get_selector(3, 3, selector_bits)] * mul); } + //------------------------------------------------------------------------------------------------ + // BC1 struct bc1_block { enum { cTotalEndpointBytes = 2, cTotalSelectorBytes = 4 }; @@ -274,6 +278,9 @@ namespace basisu return used_punchthrough; } + //------------------------------------------------------------------------------------------------ + // BC3-5 + struct bc4_block { enum { cBC4SelectorBits = 3, cTotalSelectorBytes = 6, cMaxSelectorValues = 8 }; @@ -372,7 +379,8 @@ namespace basisu unpack_bc4(pBlock_bits, &pPixels[0].r, sizeof(color_rgba)); unpack_bc4((const uint8_t *)pBlock_bits + sizeof(bc4_block), &pPixels[0].g, sizeof(color_rgba)); } - + + //------------------------------------------------------------------------------------------------ // ATC isn't officially documented, so I'm assuming these references: // http://www.guildsoftware.com/papers/2012.Converting.DXTC.to.ATC.pdf // https://github.com/Triang3l/S3TConv/blob/master/s3tconv_atitc.c @@ -426,6 +434,7 @@ namespace basisu } } + //------------------------------------------------------------------------------------------------ // BC7 mode 0-7 decompression. 
// Instead of one monster routine to unpack all the BC7 modes, we're lumping the 3 subset, 2 subset, 1 subset, and dual plane modes together into simple shared routines. @@ -742,6 +751,255 @@ namespace basisu return false; } + static inline int bc6h_sign_extend(int val, int bits) + { + assert((bits >= 1) && (bits < 32)); + assert((val >= 0) && (val < (1 << bits))); + return (val << (32 - bits)) >> (32 - bits); + } + + static inline int bc6h_apply_delta(int base, int delta, int num_bits, int is_signed) + { + int bitmask = ((1 << num_bits) - 1); + int v = (base + delta) & bitmask; + return is_signed ? bc6h_sign_extend(v, num_bits) : v; + } + + static int bc6h_dequantize(int val, int bits, int is_signed) + { + int result; + if (is_signed) + { + if (bits >= 16) + result = val; + else + { + int s_flag = 0; + if (val < 0) + { + s_flag = 1; + val = -val; + } + + if (val == 0) + result = 0; + else if (val >= ((1 << (bits - 1)) - 1)) + result = 0x7FFF; + else + result = ((val << 15) + 0x4000) >> (bits - 1); + + if (s_flag) + result = -result; + } + } + else + { + if (bits >= 15) + result = val; + else if (!val) + result = 0; + else if (val == ((1 << bits) - 1)) + result = 0xFFFF; + else + result = ((val << 16) + 0x8000) >> bits; + } + return result; + } + + static inline int bc6h_interpolate(int a, int b, const uint8_t* pWeights, int index) + { + return (a * (64 - (int)pWeights[index]) + b * (int)pWeights[index] + 32) >> 6; + } + + static inline basist::half_float bc6h_convert_to_half(int val, int is_signed) + { + if (!is_signed) + { + // scale by 31/64 + return (basist::half_float)((val * 31) >> 6); + } + + // scale by 31/32 + val = (val < 0) ? 
-(((-val) * 31) >> 5) : (val * 31) >> 5; + + int s = 0; + if (val < 0) + { + s = 0x8000; + val = -val; + } + + return (basist::half_float)(s | val); + } + + static inline uint32_t bc6h_get_bits(uint32_t num_bits, uint64_t& l, uint64_t& h, uint32_t& total_bits) + { + assert((num_bits) && (num_bits <= 63)); + + uint32_t v = (uint32_t)(l & ((1U << num_bits) - 1U)); + + l >>= num_bits; + l |= (h << (64U - num_bits)); + h >>= num_bits; + + total_bits += num_bits; + assert(total_bits <= 128); + + return v; + } + + static inline uint32_t bc6h_reverse_bits(uint32_t v, uint32_t num_bits) + { + uint32_t res = 0; + for (uint32_t i = 0; i < num_bits; i++) + { + uint32_t bit = (v & (1u << i)) != 0u; + res |= (bit << (num_bits - 1u - i)); + } + return res; + } + + static inline uint64_t bc6h_read_le_qword(const void* p) + { + const uint8_t* pSrc = static_cast(p); + return ((uint64_t)read_le_dword(pSrc)) | (((uint64_t)read_le_dword(pSrc + sizeof(uint32_t))) << 32U); + } + + bool unpack_bc6h(const void* pSrc_block, void* pDst_block, bool is_signed, uint32_t dest_pitch_in_halfs) + { + assert(dest_pitch_in_halfs >= 4 * 3); + + const uint32_t MAX_SUBSETS = 2, MAX_COMPS = 3; + + const uint8_t* pSrc = static_cast(pSrc_block); + basist::half_float* pDst = static_cast(pDst_block); + + uint64_t blo = bc6h_read_le_qword(pSrc), bhi = bc6h_read_le_qword(pSrc + sizeof(uint64_t)); + + // Unpack mode + const int mode = basist::g_bc6h_mode_lookup[blo & 31]; + if (mode < 0) + { + for (int y = 0; y < 4; y++) + { + memset(pDst, 0, sizeof(basist::half_float) * 4); + pDst += dest_pitch_in_halfs; + } + return false; + } + + // Skip mode bits + uint32_t total_bits_read = 0; + bc6h_get_bits((mode < 2) ? 2 : 5, blo, bhi, total_bits_read); + + assert(mode < (int)basist::NUM_BC6H_MODES); + + const uint32_t num_subsets = (mode >= 10) ? 
1 : 2; + const bool is_mode_9_or_10 = (mode == 9) || (mode == 10); + + // Unpack endpoint components + int comps[MAX_SUBSETS][MAX_COMPS][2] = { { { 0 } } }; // [subset][comp][l/h] + int part_index = 0; + + uint32_t layout_index = 0; + while (layout_index < basist::MAX_BC6H_LAYOUT_INDEX) + { + const basist::bc6h_bit_layout& layout = basist::g_bc6h_bit_layouts[mode][layout_index]; + + if (layout.m_comp < 0) + break; + + const int subset = layout.m_index >> 1, lh_index = layout.m_index & 1; + assert((layout.m_comp == 3) || ((subset >= 0) && (subset < (int)MAX_SUBSETS))); + + const int last_bit = layout.m_last_bit, first_bit = layout.m_first_bit; + assert(last_bit >= 0); + + int& res = (layout.m_comp == 3) ? part_index : comps[subset][layout.m_comp][lh_index]; + + if (first_bit < 0) + { + res |= (bc6h_get_bits(1, blo, bhi, total_bits_read) << last_bit); + } + else + { + const int total_bits = iabs(last_bit - first_bit) + 1; + const int bit_shift = basisu::minimum(first_bit, last_bit); + + int b = bc6h_get_bits(total_bits, blo, bhi, total_bits_read); + + if (last_bit < first_bit) + b = bc6h_reverse_bits(b, total_bits); + + res |= (b << bit_shift); + } + + layout_index++; + } + assert(layout_index != basist::MAX_BC6H_LAYOUT_INDEX); + + // Sign extend/dequantize endpoints + const int num_sig_bits = basist::g_bc6h_mode_sig_bits[mode][0]; + if (is_signed) + { + for (uint32_t comp = 0; comp < 3; comp++) + comps[0][comp][0] = bc6h_sign_extend(comps[0][comp][0], num_sig_bits); + } + + if (is_signed || !is_mode_9_or_10) + { + for (uint32_t subset = 0; subset < num_subsets; subset++) + for (uint32_t comp = 0; comp < 3; comp++) + for (uint32_t lh = (subset ? 0 : 1); lh < 2; lh++) + comps[subset][comp][lh] = bc6h_sign_extend(comps[subset][comp][lh], basist::g_bc6h_mode_sig_bits[mode][1 + comp]); + } + + if (!is_mode_9_or_10) + { + for (uint32_t subset = 0; subset < num_subsets; subset++) + for (uint32_t comp = 0; comp < 3; comp++) + for (uint32_t lh = (subset ? 
0 : 1); lh < 2; lh++) + comps[subset][comp][lh] = bc6h_apply_delta(comps[0][comp][0], comps[subset][comp][lh], num_sig_bits, is_signed); + } + + for (uint32_t subset = 0; subset < num_subsets; subset++) + for (uint32_t comp = 0; comp < 3; comp++) + for (uint32_t lh = 0; lh < 2; lh++) + comps[subset][comp][lh] = bc6h_dequantize(comps[subset][comp][lh], num_sig_bits, is_signed); + + // Now unpack weights and output texels + const int weight_bits = (mode >= 10) ? 4 : 3; + const uint8_t* pWeights = (mode >= 10) ? basist::g_bc6h_weight4 : basist::g_bc6h_weight3; + + dest_pitch_in_halfs -= 4 * 3; + + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + int subset = (num_subsets == 1) ? ((x | y) ? 0 : 0x80) : basist::g_bc6h_2subset_patterns[part_index][y][x]; + const int num_bits = weight_bits + ((subset & 0x80) ? -1 : 0); + + subset &= 1; + + const int weight_index = bc6h_get_bits(num_bits, blo, bhi, total_bits_read); + + pDst[0] = bc6h_convert_to_half(bc6h_interpolate(comps[subset][0][0], comps[subset][0][1], pWeights, weight_index), is_signed); + pDst[1] = bc6h_convert_to_half(bc6h_interpolate(comps[subset][1][0], comps[subset][1][1], pWeights, weight_index), is_signed); + pDst[2] = bc6h_convert_to_half(bc6h_interpolate(comps[subset][2][0], comps[subset][2][1], pWeights, weight_index), is_signed); + + pDst += 3; + } + + pDst += dest_pitch_in_halfs; + } + + assert(total_bits_read == 128); + return true; + } + //------------------------------------------------------------------------------------------------ + // FXT1 (for fun, and because some modern Intel parts support it, and because a subset is like BC1) + struct fxt1_block { union @@ -901,6 +1159,9 @@ namespace basisu return true; } + //------------------------------------------------------------------------------------------------ + // PVRTC2 (non-interpolated, hard_flag=1 modulation=0 subset only!) 
+ struct pvrtc2_block { uint8_t m_modulation[4]; @@ -1015,6 +1276,9 @@ namespace basisu return true; } + //------------------------------------------------------------------------------------------------ + // ETC2 EAC R11 or RG11 + struct etc2_eac_r11 { uint64_t m_base : 8; @@ -1085,13 +1349,16 @@ namespace basisu unpack_etc2_eac_r(pBlock, pPixels, c); } } - + + //------------------------------------------------------------------------------------------------ + // UASTC + void unpack_uastc(const void* p, color_rgba* pPixels) { basist::unpack_uastc(*static_cast(p), (basist::color32 *)pPixels, false); } - - // Unpacks to RGBA, R, RG, or A + + // Unpacks to RGBA, R, RG, or A. LDR GPU texture formats only. bool unpack_block(texture_format fmt, const void* pBlock, color_rgba* pPixels) { switch (fmt) @@ -1150,14 +1417,24 @@ namespace basisu unpack_etc2_eac(pBlock, pPixels); break; } - case texture_format::cASTC4x4: + case texture_format::cBC6HSigned: + case texture_format::cBC6HUnsigned: + case texture_format::cASTC_HDR_4x4: + case texture_format::cUASTC_HDR_4x4: + { + // Can't unpack HDR blocks in unpack_block() because it returns 32bpp pixel data. 
+ assert(0); + return false; + } + case texture_format::cASTC_LDR_4x4: { -#if BASISU_USE_ASTC_DECOMPRESS const bool astc_srgb = false; - basisu_astc::astc::decompress(reinterpret_cast(pPixels), static_cast(pBlock), astc_srgb, 4, 4); -#else - memset(pPixels, 255, 16 * sizeof(color_rgba)); -#endif + bool status = basisu_astc::astc::decompress_ldr(reinterpret_cast(pPixels), static_cast(pBlock), astc_srgb, 4, 4); + assert(status); + + if (!status) + return false; + break; } case texture_format::cATC_RGB: @@ -1206,6 +1483,66 @@ namespace basisu return true; } + bool unpack_block_hdr(texture_format fmt, const void* pBlock, vec4F* pPixels) + { + switch (fmt) + { + case texture_format::cASTC_HDR_4x4: + case texture_format::cUASTC_HDR_4x4: + { +#if 1 + bool status = basisu_astc::astc::decompress_hdr(&pPixels[0][0], (uint8_t*)pBlock, 4, 4); + assert(status); + if (!status) + return false; +#else + basist::half_float half_block[16][4]; + + astc_helpers::log_astc_block log_blk; + if (!astc_helpers::unpack_block(pBlock, log_blk, 4, 4)) + return false; + if (!astc_helpers::decode_block(log_blk, half_block, 4, 4, astc_helpers::cDecodeModeHDR16)) + return false; + + for (uint32_t p = 0; p < 16; p++) + { + pPixels[p][0] = basist::half_to_float(half_block[p][0]); + pPixels[p][1] = basist::half_to_float(half_block[p][1]); + pPixels[p][2] = basist::half_to_float(half_block[p][2]); + pPixels[p][3] = basist::half_to_float(half_block[p][3]); + } + + //memset(pPixels, 0, sizeof(vec4F) * 16); +#endif + return true; + } + case texture_format::cBC6HSigned: + case texture_format::cBC6HUnsigned: + { + basist::half_float half_block[16][3]; + + unpack_bc6h(pBlock, half_block, fmt == texture_format::cBC6HSigned); + + for (uint32_t p = 0; p < 16; p++) + { + pPixels[p][0] = basist::half_to_float(half_block[p][0]); + pPixels[p][1] = basist::half_to_float(half_block[p][1]); + pPixels[p][2] = basist::half_to_float(half_block[p][2]); + pPixels[p][3] = 1.0f; + } + + return true; + } + default: + { + 
break; + } + } + + assert(0); + return false; + } + bool gpu_image::unpack(image& img) const { img.resize(get_pixel_width(), get_pixel_height()); @@ -1252,7 +1589,48 @@ namespace basisu return success; } + + bool gpu_image::unpack_hdr(imagef& img) const + { + if ((m_fmt != texture_format::cASTC_HDR_4x4) && + (m_fmt != texture_format::cUASTC_HDR_4x4) && + (m_fmt != texture_format::cBC6HUnsigned) && + (m_fmt != texture_format::cBC6HSigned)) + { + // Can't call on LDR images, at least currently. (Could unpack the LDR data and convert to float.) + assert(0); + return false; + } + + img.resize(get_pixel_width(), get_pixel_height()); + img.set_all(vec4F(0.0f)); + + if (!img.get_width() || !img.get_height()) + return true; + + assert((m_block_width <= cMaxBlockSize) && (m_block_height <= cMaxBlockSize)); + vec4F pixels[cMaxBlockSize * cMaxBlockSize]; + clear_obj(pixels); + + bool success = true; + + for (uint32_t by = 0; by < m_blocks_y; by++) + { + for (uint32_t bx = 0; bx < m_blocks_x; bx++) + { + const void* pBlock = get_block_ptr(bx, by); + + if (!unpack_block_hdr(m_fmt, pBlock, pixels)) + success = false; + + img.set_block_clipped(pixels, bx * m_block_width, by * m_block_height, m_block_width, m_block_height); + } // bx + } // by + + return success; + } + // KTX1 texture file writing static const uint8_t g_ktx_file_id[12] = { 0xAB, 0x4B, 0x54, 0x58, 0x20, 0x31, 0x31, 0xBB, 0x0D, 0x0A, 0x1A, 0x0A }; // KTX/GL enums @@ -1273,6 +1651,8 @@ namespace basisu KTX_COMPRESSED_RGBA8_ETC2_EAC = 0x9278, KTX_COMPRESSED_RGBA_BPTC_UNORM = 0x8E8C, KTX_COMPRESSED_SRGB_ALPHA_BPTC_UNORM = 0x8E8D, + KTX_COMPRESSED_RGB_BPTC_SIGNED_FLOAT = 0x8E8E, + KTX_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT = 0x8E8F, KTX_COMPRESSED_RGB_PVRTC_4BPPV1_IMG = 0x8C00, KTX_COMPRESSED_RGBA_PVRTC_4BPPV1_IMG = 0x8C02, KTX_COMPRESSED_RGBA_ASTC_4x4_KHR = 0x93B0, @@ -1319,6 +1699,7 @@ namespace basisu uint32_t width = 0, height = 0, total_levels = 0; basisu::texture_format fmt = texture_format::cInvalidTextureFormat; + 
// Sanity check the input if (cubemap_flag) { if ((gpu_images.size() % 6) != 0) @@ -1327,7 +1708,7 @@ namespace basisu return false; } } - + for (uint32_t array_index = 0; array_index < gpu_images.size(); array_index++) { const gpu_image_vec &levels = gpu_images[array_index]; @@ -1426,6 +1807,18 @@ namespace basisu base_internal_fmt = KTX_RGBA; break; } + case texture_format::cBC6HSigned: + { + internal_fmt = KTX_COMPRESSED_RGB_BPTC_SIGNED_FLOAT; + base_internal_fmt = KTX_RGBA; + break; + } + case texture_format::cBC6HUnsigned: + { + internal_fmt = KTX_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT; + base_internal_fmt = KTX_RGBA; + break; + } case texture_format::cBC7: { internal_fmt = KTX_COMPRESSED_RGBA_BPTC_UNORM; @@ -1443,7 +1836,10 @@ namespace basisu base_internal_fmt = KTX_RGBA; break; } - case texture_format::cASTC4x4: + // We use different enums for HDR vs. LDR ASTC, but internally they are both just ASTC. + case texture_format::cASTC_LDR_4x4: + case texture_format::cASTC_HDR_4x4: + case texture_format::cUASTC_HDR_4x4: // UASTC_HDR is just HDR-only ASTC { internal_fmt = KTX_COMPRESSED_RGBA_ASTC_4x4_KHR; base_internal_fmt = KTX_RGBA; @@ -1496,17 +1892,17 @@ namespace basisu return false; } } - + ktx_header header; header.clear(); memcpy(&header.m_identifier, g_ktx_file_id, sizeof(g_ktx_file_id)); header.m_endianness = KTX_ENDIAN; - + header.m_pixelWidth = width; header.m_pixelHeight = height; - + header.m_glTypeSize = 1; - + header.m_glInternalFormat = internal_fmt; header.m_glBaseInternalFormat = base_internal_fmt; @@ -1517,12 +1913,12 @@ namespace basisu header.m_numberOfMipmapLevels = total_levels; header.m_numberOfFaces = cubemap_flag ? 
6 : 1; - append_vector(ktx_data, (uint8_t *)&header, sizeof(header)); + append_vector(ktx_data, (uint8_t*)&header, sizeof(header)); for (uint32_t level_index = 0; level_index < total_levels; level_index++) { uint32_t img_size = gpu_images[0][level_index].get_size_in_bytes(); - + if ((header.m_numberOfFaces == 1) || (header.m_numberOfArrayElements > 1)) { img_size = img_size * header.m_numberOfFaces * maximum(1, header.m_numberOfArrayElements); @@ -1531,9 +1927,10 @@ namespace basisu assert(img_size && ((img_size & 3) == 0)); packed_uint<4> packed_img_size(img_size); - append_vector(ktx_data, (uint8_t *)&packed_img_size, sizeof(packed_img_size)); + append_vector(ktx_data, (uint8_t*)&packed_img_size, sizeof(packed_img_size)); uint32_t bytes_written = 0; + (void)bytes_written; for (uint32_t array_index = 0; array_index < maximum(1, header.m_numberOfArrayElements); array_index++) { @@ -1541,11 +1938,11 @@ namespace basisu { const gpu_image& img = gpu_images[cubemap_flag ? (array_index * 6 + face_index) : array_index][level_index]; - append_vector(ktx_data, (uint8_t *)img.get_ptr(), img.get_size_in_bytes()); - + append_vector(ktx_data, (uint8_t*)img.get_ptr(), img.get_size_in_bytes()); + bytes_written += img.get_size_in_bytes(); } - + } // array_index } // level_index @@ -1553,7 +1950,58 @@ namespace basisu return true; } - bool write_compressed_texture_file(const char* pFilename, const basisu::vector& g, bool cubemap_flag) + bool does_dds_support_format(texture_format fmt) + { + switch (fmt) + { + case texture_format::cBC1_NV: + case texture_format::cBC1_AMD: + case texture_format::cBC1: + case texture_format::cBC3: + case texture_format::cBC4: + case texture_format::cBC5: + case texture_format::cBC6HSigned: + case texture_format::cBC6HUnsigned: + case texture_format::cBC7: + return true; + default: + break; + } + return false; + } + + // Only supports the basic DirectX BC texture formats. 
+ // gpu_images array is: [face/layer][mipmap level] + // For cubemap arrays, # of face/layers must be a multiple of 6. + // Accepts 2D, 2D mipmapped, 2D array, 2D array mipmapped + // and cubemap, cubemap mipmapped, and cubemap array mipmapped. + bool write_dds_file(uint8_vec &dds_data, const basisu::vector& gpu_images, bool cubemap_flag, bool use_srgb_format) + { + return false; + } + + bool write_dds_file(const char* pFilename, const basisu::vector& gpu_images, bool cubemap_flag, bool use_srgb_format) + { + uint8_vec dds_data; + + if (!write_dds_file(dds_data, gpu_images, cubemap_flag, use_srgb_format)) + return false; + + if (!write_vec_to_file(pFilename, dds_data)) + { + fprintf(stderr, "write_dds_file: Failed writing DDS file data\n"); + return false; + } + + return true; + } + + bool read_uncompressed_dds_file(const char* pFilename, basisu::vector &ldr_mips, basisu::vector& hdr_mips) + { + return false; + } + + bool write_compressed_texture_file(const char* pFilename, const basisu::vector& g, bool cubemap_flag, bool use_srgb_format) { std::string extension(string_tolower(string_get_extension(pFilename))); @@ -1570,8 +2018,8 @@ namespace basisu } else if (extension == "dds") { - // TODO - return false; + if (!write_dds_file(filedata, g, cubemap_flag, use_srgb_format)) + return false; } else { @@ -1583,11 +2031,18 @@ namespace basisu return basisu::write_vec_to_file(pFilename, filedata); } - bool write_compressed_texture_file(const char* pFilename, const gpu_image& g) + bool write_compressed_texture_file(const char* pFilename, const gpu_image_vec& g, bool use_srgb_format) + { + basisu::vector a; + a.push_back(g); + return write_compressed_texture_file(pFilename, a, false, use_srgb_format); + } + + bool write_compressed_texture_file(const char* pFilename, const gpu_image& g, bool use_srgb_format) { basisu::vector v; enlarge_vector(v, 1)->push_back(g); - return write_compressed_texture_file(pFilename, v, false); + return write_compressed_texture_file(pFilename, 
v, false, use_srgb_format); } //const uint32_t OUT_FILE_MAGIC = 'TEXC'; @@ -1626,5 +2081,49 @@ namespace basisu return fclose(pFile) != EOF; } + + // The .astc texture format is readable using ARM's astcenc, AMD Compressonator, and other engines/tools. It oddly doesn't support mipmaps, limiting + // its usefulness/relevance. + // https://github.com/ARM-software/astc-encoder/blob/main/Docs/FileFormat.md + bool write_astc_file(const char* pFilename, const void* pBlocks, uint32_t block_width, uint32_t block_height, uint32_t dim_x, uint32_t dim_y) + { + assert(pBlocks && (block_width >= 4) && (block_height >= 4) && (dim_x > 0) && (dim_y > 0)); + + uint8_vec file_data; + file_data.push_back(0x13); + file_data.push_back(0xAB); + file_data.push_back(0xA1); + file_data.push_back(0x5C); + + file_data.push_back((uint8_t)block_width); + file_data.push_back((uint8_t)block_height); + file_data.push_back(1); + + file_data.push_back((uint8_t)dim_x); + file_data.push_back((uint8_t)(dim_x >> 8)); + file_data.push_back((uint8_t)(dim_x >> 16)); + + file_data.push_back((uint8_t)dim_y); + file_data.push_back((uint8_t)(dim_y >> 8)); + file_data.push_back((uint8_t)(dim_y >> 16)); + + file_data.push_back((uint8_t)1); + file_data.push_back((uint8_t)0); + file_data.push_back((uint8_t)0); + + const uint32_t num_blocks_x = (dim_x + block_width - 1) / block_width; + const uint32_t num_blocks_y = (dim_y + block_height - 1) / block_height; + + const uint32_t total_bytes = num_blocks_x * num_blocks_y * 16; + + const size_t cur_size = file_data.size(); + + file_data.resize(cur_size + total_bytes); + + memcpy(&file_data[cur_size], pBlocks, total_bytes); + + return write_vec_to_file(pFilename, file_data); + } + } // basisu diff --git a/thirdparty/basis_universal/encoder/basisu_gpu_texture.h b/thirdparty/basis_universal/encoder/basisu_gpu_texture.h index 619926f5f95f..67c2a2bc5ec4 100644 --- a/thirdparty/basis_universal/encoder/basisu_gpu_texture.h +++ 
b/thirdparty/basis_universal/encoder/basisu_gpu_texture.h @@ -1,5 +1,5 @@ // basisu_gpu_texture.h -// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -48,6 +48,7 @@ namespace basisu } inline texture_format get_format() const { return m_fmt; } + inline bool is_hdr() const { return is_hdr_texture_format(m_fmt); } // Width/height in pixels inline uint32_t get_pixel_width() const { return m_width; } @@ -100,9 +101,13 @@ namespace basisu m_blocks.resize(m_blocks_x * m_blocks_y * m_qwords_per_block); } + // Unpacks LDR textures only. bool unpack(image& img) const; + + // Unpacks HDR textures only. + bool unpack_hdr(imagef& img) const; - void override_dimensions(uint32_t w, uint32_t h) + inline void override_dimensions(uint32_t w, uint32_t h) { m_width = w; m_height = h; @@ -116,39 +121,50 @@ namespace basisu typedef basisu::vector gpu_image_vec; - // KTX file writing - + // KTX1 file writing bool create_ktx_texture_file(uint8_vec &ktx_data, const basisu::vector& gpu_images, bool cubemap_flag); - - bool write_compressed_texture_file(const char *pFilename, const basisu::vector& g, bool cubemap_flag); - inline bool write_compressed_texture_file(const char *pFilename, const gpu_image_vec &g) - { - basisu::vector a; - a.push_back(g); - return write_compressed_texture_file(pFilename, a, false); - } + bool does_dds_support_format(texture_format fmt); + bool write_dds_file(uint8_vec& dds_data, const basisu::vector& gpu_images, bool cubemap_flag, bool use_srgb_format); + bool write_dds_file(const char* pFilename, const basisu::vector& gpu_images, bool cubemap_flag, bool use_srgb_format); + + // Currently reads 2D 32bpp RGBA, 16-bit HALF RGBA, or 32-bit FLOAT RGBA, with or without mipmaps. No tex arrays or cubemaps, yet. 
+ bool read_uncompressed_dds_file(const char* pFilename, basisu::vector& ldr_mips, basisu::vector& hdr_mips); - bool write_compressed_texture_file(const char *pFilename, const gpu_image &g); + // Supports DDS and KTX + bool write_compressed_texture_file(const char *pFilename, const basisu::vector& g, bool cubemap_flag, bool use_srgb_format); + bool write_compressed_texture_file(const char* pFilename, const gpu_image_vec& g, bool use_srgb_format); + bool write_compressed_texture_file(const char *pFilename, const gpu_image &g, bool use_srgb_format); bool write_3dfx_out_file(const char* pFilename, const gpu_image& gi); // GPU texture block unpacking + // For ETC1, use in basisu_etc.h: bool unpack_etc1(const etc_block& block, color_rgba *pDst, bool preserve_alpha) void unpack_etc2_eac(const void *pBlock_bits, color_rgba *pPixels); bool unpack_bc1(const void *pBlock_bits, color_rgba *pPixels, bool set_alpha); void unpack_bc4(const void *pBlock_bits, uint8_t *pPixels, uint32_t stride); bool unpack_bc3(const void *pBlock_bits, color_rgba *pPixels); void unpack_bc5(const void *pBlock_bits, color_rgba *pPixels); bool unpack_bc7_mode6(const void *pBlock_bits, color_rgba *pPixels); - bool unpack_bc7(const void* pBlock_bits, color_rgba* pPixels); + bool unpack_bc7(const void* pBlock_bits, color_rgba* pPixels); // full format + bool unpack_bc6h(const void* pSrc_block, void* pDst_block, bool is_signed, uint32_t dest_pitch_in_halfs = 4 * 3); // full format, outputs HALF values, RGB texels only (not RGBA) void unpack_atc(const void* pBlock_bits, color_rgba* pPixels); + // We only support CC_MIXED non-alpha blocks here because that's the only mode the transcoder uses at the moment. bool unpack_fxt1(const void* p, color_rgba* pPixels); + // PVRTC2 is currently limited to only what our transcoder outputs (non-interpolated, hard_flag=1 modulation=0). In this mode, PVRTC2 looks much like BC1/ATC. 
bool unpack_pvrtc2(const void* p, color_rgba* pPixels); void unpack_etc2_eac_r(const void *p, color_rgba* pPixels, uint32_t c); void unpack_etc2_eac_rg(const void* p, color_rgba* pPixels); - + // unpack_block() is primarily intended to unpack texture data created by the transcoder. - // For some texture formats (like ETC2 RGB, PVRTC2, FXT1) it's not a complete implementation. + // For some texture formats (like ETC2 RGB, PVRTC2, FXT1) it's not yet a complete implementation. + // Unpacks LDR texture formats only. bool unpack_block(texture_format fmt, const void *pBlock, color_rgba *pPixels); - + + // Unpacks HDR texture formats only. + bool unpack_block_hdr(texture_format fmt, const void* pBlock, vec4F* pPixels); + + bool write_astc_file(const char* pFilename, const void* pBlocks, uint32_t block_width, uint32_t block_height, uint32_t dim_x, uint32_t dim_y); + } // namespace basisu + diff --git a/thirdparty/basis_universal/encoder/basisu_kernels_declares.h b/thirdparty/basis_universal/encoder/basisu_kernels_declares.h index b03e2ea6e85c..9b85a594ee8c 100644 --- a/thirdparty/basis_universal/encoder/basisu_kernels_declares.h +++ b/thirdparty/basis_universal/encoder/basisu_kernels_declares.h @@ -1,5 +1,5 @@ // basisu_kernels_declares.h -// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/thirdparty/basis_universal/encoder/basisu_kernels_imp.h b/thirdparty/basis_universal/encoder/basisu_kernels_imp.h index dcf1ce069a6f..123862b1ddc4 100644 --- a/thirdparty/basis_universal/encoder/basisu_kernels_imp.h +++ b/thirdparty/basis_universal/encoder/basisu_kernels_imp.h @@ -1,5 +1,5 @@ // basisu_kernels_imp.h - Do not directly include -// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/thirdparty/basis_universal/encoder/basisu_kernels_sse.cpp b/thirdparty/basis_universal/encoder/basisu_kernels_sse.cpp index 4f15a5a12b02..36a493d7ed8b 100644 --- a/thirdparty/basis_universal/encoder/basisu_kernels_sse.cpp +++ b/thirdparty/basis_universal/encoder/basisu_kernels_sse.cpp @@ -1,5 +1,5 @@ // basisu_kernels_sse.cpp -// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -22,22 +22,6 @@ #include #endif -#if !defined(_MSC_VER) - #if __AVX__ || __AVX2__ || __AVX512F__ - #error Please check your compiler options - #endif - - #if CPPSPMD_SSE2 - #if __SSE4_1__ || __SSE3__ || __SSE4_2__ || __SSSE3__ - #error SSE4.1/SSE3/SSE4.2/SSSE3 cannot be enabled to use this file - #endif - #else - #if !__SSE4_1__ || !__SSE3__ || !__SSSE3__ - #error Please check your compiler options - #endif - #endif -#endif - #include "cppspmd_sse.h" #include "cppspmd_type_aliases.h" diff --git a/thirdparty/basis_universal/encoder/basisu_miniz.h b/thirdparty/basis_universal/encoder/basisu_miniz.h index 18de9972322f..dab38f9f9248 100644 --- a/thirdparty/basis_universal/encoder/basisu_miniz.h +++ b/thirdparty/basis_universal/encoder/basisu_miniz.h @@ -3,7 +3,7 @@ Forked from the public domain/unlicense version at: https://code.google.com/archive/p/miniz/ - Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. + Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -1973,7 +1973,7 @@ static MZ_FORCEINLINE void tdefl_find_match(tdefl_compressor *d, mz_uint lookahe (TDEFL_READ_UNALIGNED_WORD(++p) == TDEFL_READ_UNALIGNED_WORD(++q)) && (TDEFL_READ_UNALIGNED_WORD(++p) == TDEFL_READ_UNALIGNED_WORD(++q)) && (--probe_len > 0) ); if (!probe_len) { - *pMatch_dist = dist; *pMatch_len = MZ_MIN(max_match_len, TDEFL_MAX_MATCH_LEN); break; + *pMatch_dist = dist; *pMatch_len = MZ_MIN(max_match_len, (mz_uint)TDEFL_MAX_MATCH_LEN); break; } else if ((probe_len = ((mz_uint)(p - s) * 2) + (mz_uint)(*(const mz_uint8*)p == *(const mz_uint8*)q)) > match_len) { @@ -2101,7 +2101,7 @@ static mz_bool tdefl_compress_fast(tdefl_compressor *d) total_lz_bytes += cur_match_len; lookahead_pos += cur_match_len; - dict_size = MZ_MIN(dict_size + cur_match_len, TDEFL_LZ_DICT_SIZE); + dict_size = MZ_MIN(dict_size + cur_match_len, (mz_uint)TDEFL_LZ_DICT_SIZE); cur_pos = (cur_pos + cur_match_len) & TDEFL_LZ_DICT_SIZE_MASK; MZ_ASSERT(lookahead_size >= cur_match_len); lookahead_size -= cur_match_len; @@ -2129,7 +2129,7 @@ static mz_bool tdefl_compress_fast(tdefl_compressor *d) d->m_huff_count[0][lit]++; lookahead_pos++; - dict_size = MZ_MIN(dict_size + 1, TDEFL_LZ_DICT_SIZE); + dict_size = MZ_MIN(dict_size + 1, (mz_uint)TDEFL_LZ_DICT_SIZE); cur_pos = (cur_pos + 1) & TDEFL_LZ_DICT_SIZE_MASK; lookahead_size--; @@ -2283,7 +2283,7 @@ static mz_bool tdefl_compress_normal(tdefl_compressor *d) d->m_lookahead_pos += len_to_move; MZ_ASSERT(d->m_lookahead_size >= len_to_move); d->m_lookahead_size -= len_to_move; - d->m_dict_size = MZ_MIN(d->m_dict_size + len_to_move, TDEFL_LZ_DICT_SIZE); + d->m_dict_size = MZ_MIN(d->m_dict_size + len_to_move, (mz_uint)TDEFL_LZ_DICT_SIZE); // Check if it's time to flush the current LZ codes to the internal output buffer. 
if ( (d->m_pLZ_code_buf > &d->m_lz_code_buf[TDEFL_LZ_CODE_BUF_SIZE - 8]) || ( (d->m_total_lz_bytes > 31*1024) && (((((mz_uint)(d->m_pLZ_code_buf - d->m_lz_code_buf) * 115) >> 7) >= d->m_total_lz_bytes) || (d->m_flags & TDEFL_FORCE_ALL_RAW_BLOCKS))) ) diff --git a/thirdparty/basis_universal/encoder/basisu_opencl.cpp b/thirdparty/basis_universal/encoder/basisu_opencl.cpp index 81e3090a2639..e0611c18eefb 100644 --- a/thirdparty/basis_universal/encoder/basisu_opencl.cpp +++ b/thirdparty/basis_universal/encoder/basisu_opencl.cpp @@ -1,5 +1,5 @@ // basisu_opencl.cpp -// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/thirdparty/basis_universal/encoder/basisu_opencl.h b/thirdparty/basis_universal/encoder/basisu_opencl.h index 4194a0841840..2546a18dabbe 100644 --- a/thirdparty/basis_universal/encoder/basisu_opencl.h +++ b/thirdparty/basis_universal/encoder/basisu_opencl.h @@ -1,5 +1,5 @@ // basisu_opencl.h -// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. // // Note: Undefine or set BASISU_SUPPORT_OPENCL to 0 to completely OpenCL support. // diff --git a/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.cpp b/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.cpp index 596fc197e6d9..4bf9516f90a1 100644 --- a/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.cpp +++ b/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.cpp @@ -1,5 +1,5 @@ // basisu_pvrtc1_4.cpp -// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.h b/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.h index db6985a439bb..a9fe6b27aa01 100644 --- a/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.h +++ b/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.h @@ -1,5 +1,5 @@ // basisu_pvrtc1_4.cpp -// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -231,7 +231,18 @@ namespace basisu inline void set_to_black() { +#ifndef __EMSCRIPTEN__ +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wclass-memaccess" +#endif +#endif memset(m_blocks.get_ptr(), 0, m_blocks.size_in_bytes()); +#ifndef __EMSCRIPTEN__ +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif +#endif } inline bool get_block_uses_transparent_modulation(uint32_t bx, uint32_t by) const diff --git a/thirdparty/basis_universal/encoder/basisu_resample_filters.cpp b/thirdparty/basis_universal/encoder/basisu_resample_filters.cpp index 597cb3f6187e..46cd837376ee 100644 --- a/thirdparty/basis_universal/encoder/basisu_resample_filters.cpp +++ b/thirdparty/basis_universal/encoder/basisu_resample_filters.cpp @@ -1,5 +1,5 @@ // basisu_resampler_filters.cpp -// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/thirdparty/basis_universal/encoder/basisu_resampler.cpp b/thirdparty/basis_universal/encoder/basisu_resampler.cpp index f4cedf0031b2..a00c63335d09 100644 --- a/thirdparty/basis_universal/encoder/basisu_resampler.cpp +++ b/thirdparty/basis_universal/encoder/basisu_resampler.cpp @@ -1,5 +1,5 @@ // basisu_resampler.cpp -// Copyright (C) 2019 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/thirdparty/basis_universal/encoder/basisu_resampler.h b/thirdparty/basis_universal/encoder/basisu_resampler.h index dc0978caebe7..ac1ef73d7f3e 100644 --- a/thirdparty/basis_universal/encoder/basisu_resampler.h +++ b/thirdparty/basis_universal/encoder/basisu_resampler.h @@ -1,5 +1,5 @@ // basisu_resampler.h -// Copyright (C) 2019 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/thirdparty/basis_universal/encoder/basisu_resampler_filters.h b/thirdparty/basis_universal/encoder/basisu_resampler_filters.h index 0ebb51c334b3..4d66ac2c7031 100644 --- a/thirdparty/basis_universal/encoder/basisu_resampler_filters.h +++ b/thirdparty/basis_universal/encoder/basisu_resampler_filters.h @@ -1,5 +1,5 @@ // basisu_resampler_filters.h -// Copyright (C) 2019 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/thirdparty/basis_universal/encoder/basisu_ssim.cpp b/thirdparty/basis_universal/encoder/basisu_ssim.cpp index cceb400b883b..608ce937fcdd 100644 --- a/thirdparty/basis_universal/encoder/basisu_ssim.cpp +++ b/thirdparty/basis_universal/encoder/basisu_ssim.cpp @@ -1,5 +1,5 @@ // basisu_ssim.cpp -// Copyright (C) 2019 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/thirdparty/basis_universal/encoder/basisu_ssim.h b/thirdparty/basis_universal/encoder/basisu_ssim.h index 986ca3bbdf50..51cd2d78fddf 100644 --- a/thirdparty/basis_universal/encoder/basisu_ssim.h +++ b/thirdparty/basis_universal/encoder/basisu_ssim.h @@ -1,5 +1,5 @@ // basisu_ssim.h -// Copyright (C) 2019 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/thirdparty/basis_universal/encoder/basisu_uastc_enc.cpp b/thirdparty/basis_universal/encoder/basisu_uastc_enc.cpp index 271bbc6f1daf..51f6e979d458 100644 --- a/thirdparty/basis_universal/encoder/basisu_uastc_enc.cpp +++ b/thirdparty/basis_universal/encoder/basisu_uastc_enc.cpp @@ -1,5 +1,5 @@ // basisu_uastc_enc.cpp -// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,11 +13,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
#include "basisu_uastc_enc.h" - -#if BASISU_USE_ASTC_DECOMPRESS -#include "basisu_astc_decomp.h" -#endif - +#include "3rdparty/android_astc_decomp.h" #include "basisu_gpu_texture.h" #include "basisu_bc7enc.h" @@ -384,6 +380,7 @@ namespace basisu } uint32_t total_endpoint_bits = 0; + (void)total_endpoint_bits; for (uint32_t i = 0; i < total_tq_values; i++) { @@ -428,6 +425,8 @@ namespace basisu #endif uint32_t total_weight_bits = 0; + (void)total_weight_bits; + const uint32_t plane_shift = (total_planes == 2) ? 1 : 0; for (uint32_t i = 0; i < 16 * total_planes; i++) { @@ -3175,6 +3174,7 @@ namespace basisu const bool favor_bc7_error = !favor_uastc_error && ((flags & cPackUASTCFavorBC7Error) != 0); //const bool etc1_perceptual = true; + // TODO: This uses 64KB of stack space! uastc_encode_results results[MAX_ENCODE_RESULTS]; level = clampi(level, cPackUASTCLevelFastest, cPackUASTCLevelVerySlow); @@ -3567,7 +3567,6 @@ namespace basisu success = basist::unpack_uastc(temp_block, (basist::color32 *)temp_block_unpacked, false); VALIDATE(success); -#if BASISU_USE_ASTC_DECOMPRESS // Now round trip to packed ASTC and back, then decode to pixels. 
uint32_t astc_data[4]; @@ -3580,7 +3579,7 @@ namespace basisu } color_rgba decoded_astc_block[4][4]; - success = basisu_astc::astc::decompress((uint8_t*)decoded_astc_block, (uint8_t*)&astc_data, false, 4, 4); + success = basisu_astc::astc::decompress_ldr((uint8_t*)decoded_astc_block, (uint8_t*)&astc_data, false, 4, 4); VALIDATE(success); for (uint32_t y = 0; y < 4; y++) @@ -3595,7 +3594,6 @@ namespace basisu VALIDATE(temp_block_unpacked[y][x].c[3] == decoded_uastc_block[y][x].a); } } -#endif } #endif @@ -3789,8 +3787,9 @@ namespace basisu { uint64_t m_sel; uint32_t m_ofs; + uint32_t m_pad; // avoid implicit padding for selector_bitsequence_hash selector_bitsequence() { } - selector_bitsequence(uint32_t bit_ofs, uint64_t sel) : m_sel(sel), m_ofs(bit_ofs) { } + selector_bitsequence(uint32_t bit_ofs, uint64_t sel) : m_sel(sel), m_ofs(bit_ofs), m_pad(0) { } bool operator== (const selector_bitsequence& other) const { return (m_ofs == other.m_ofs) && (m_sel == other.m_sel); @@ -3811,7 +3810,7 @@ namespace basisu { std::size_t operator()(selector_bitsequence const& s) const noexcept { - return static_cast(hash_hsieh((uint8_t *)&s, sizeof(s)) ^ s.m_sel); + return hash_hsieh((const uint8_t*)&s, sizeof(s)); } }; diff --git a/thirdparty/basis_universal/encoder/basisu_uastc_enc.h b/thirdparty/basis_universal/encoder/basisu_uastc_enc.h index ba39a558b38b..54d39380e683 100644 --- a/thirdparty/basis_universal/encoder/basisu_uastc_enc.h +++ b/thirdparty/basis_universal/encoder/basisu_uastc_enc.h @@ -1,5 +1,5 @@ // basisu_uastc_enc.h -// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/thirdparty/basis_universal/encoder/cppspmd_flow.h b/thirdparty/basis_universal/encoder/cppspmd_flow.h index f6930476aad1..93934173c4f0 100644 --- a/thirdparty/basis_universal/encoder/cppspmd_flow.h +++ b/thirdparty/basis_universal/encoder/cppspmd_flow.h @@ -1,7 +1,7 @@ // Do not include this header directly. // Control flow functionality in common between all the headers. // -// Copyright 2020-2021 Binomial LLC +// Copyright 2020-2024 Binomial LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/thirdparty/basis_universal/encoder/cppspmd_math.h b/thirdparty/basis_universal/encoder/cppspmd_math.h index e7b3202b8ee2..3032df865f1f 100644 --- a/thirdparty/basis_universal/encoder/cppspmd_math.h +++ b/thirdparty/basis_universal/encoder/cppspmd_math.h @@ -1,6 +1,6 @@ // Do not include this header directly. // -// Copyright 2020-2021 Binomial LLC +// Copyright 2020-2024 Binomial LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -646,7 +646,7 @@ CPPSPMD_FORCE_INLINE vint spmd_kernel::count_set_bits(vint x) { vint v = x - (VUINT_SHIFT_RIGHT(x, 1) & 0x55555555); vint v1 = (v & 0x33333333) + (VUINT_SHIFT_RIGHT(v, 2) & 0x33333333); - return VUINT_SHIFT_RIGHT(((v1 + VUINT_SHIFT_RIGHT(v1, 4) & 0xF0F0F0F) * 0x1010101), 24); + return VUINT_SHIFT_RIGHT(((v1 + (VUINT_SHIFT_RIGHT(v1, 4) & 0xF0F0F0F)) * 0x1010101), 24); } CPPSPMD_FORCE_INLINE vint cmple_epu16(const vint &a, const vint &b) diff --git a/thirdparty/basis_universal/encoder/cppspmd_math_declares.h b/thirdparty/basis_universal/encoder/cppspmd_math_declares.h index cdb6447b62ef..f76c9b7e38ea 100644 --- a/thirdparty/basis_universal/encoder/cppspmd_math_declares.h +++ b/thirdparty/basis_universal/encoder/cppspmd_math_declares.h @@ -1,7 +1,7 @@ // Do not include this header directly. 
// This header defines shared struct spmd_kernel helpers. // -// Copyright 2020-2021 Binomial LLC +// Copyright 2020-2024 Binomial LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/thirdparty/basis_universal/encoder/cppspmd_sse.h b/thirdparty/basis_universal/encoder/cppspmd_sse.h index 4c61bab7b1a9..79dfa1561a0d 100644 --- a/thirdparty/basis_universal/encoder/cppspmd_sse.h +++ b/thirdparty/basis_universal/encoder/cppspmd_sse.h @@ -450,7 +450,7 @@ struct spmd_kernel CPPSPMD_FORCE_INLINE explicit operator vint() const; private: - vbool& operator=(const vbool&); + //vbool& operator=(const vbool&); }; friend vbool operator!(const vbool& v); @@ -481,7 +481,7 @@ struct spmd_kernel CPPSPMD_FORCE_INLINE explicit vfloat(int value) : m_value(_mm_set1_ps((float)value)) { } private: - vfloat& operator=(const vfloat&); + //vfloat& operator=(const vfloat&); }; CPPSPMD_FORCE_INLINE vfloat& store(vfloat& dst, const vfloat& src) @@ -514,7 +514,7 @@ struct spmd_kernel float* m_pValue; private: - float_lref& operator=(const float_lref&); + //float_lref& operator=(const float_lref&); }; CPPSPMD_FORCE_INLINE const float_lref& store(const float_lref& dst, const vfloat& src) @@ -561,7 +561,7 @@ struct spmd_kernel float* m_pValue; private: - float_vref& operator=(const float_vref&); + //float_vref& operator=(const float_vref&); }; // Varying ref to varying float @@ -571,7 +571,7 @@ struct spmd_kernel vfloat* m_pValue; private: - vfloat_vref& operator=(const vfloat_vref&); + //vfloat_vref& operator=(const vfloat_vref&); }; // Varying ref to varying int @@ -581,7 +581,7 @@ struct spmd_kernel vint* m_pValue; private: - vint_vref& operator=(const vint_vref&); + //vint_vref& operator=(const vint_vref&); }; CPPSPMD_FORCE_INLINE const float_vref& store(const float_vref& dst, const vfloat& src); @@ -624,7 +624,7 @@ struct spmd_kernel int* m_pValue; private: - int_lref& operator=(const 
int_lref&); + //int_lref& operator=(const int_lref&); }; CPPSPMD_FORCE_INLINE const int_lref& store(const int_lref& dst, const vint& src) @@ -663,7 +663,7 @@ struct spmd_kernel int16_t* m_pValue; private: - int16_lref& operator=(const int16_lref&); + //int16_lref& operator=(const int16_lref&); }; CPPSPMD_FORCE_INLINE const int16_lref& store(const int16_lref& dst, const vint& src) @@ -720,7 +720,7 @@ struct spmd_kernel const int* m_pValue; private: - cint_lref& operator=(const cint_lref&); + //cint_lref& operator=(const cint_lref&); }; CPPSPMD_FORCE_INLINE vint load(const cint_lref& src) @@ -742,7 +742,7 @@ struct spmd_kernel int* m_pValue; private: - int_vref& operator=(const int_vref&); + //int_vref& operator=(const int_vref&); }; // Varying ref to constant ints @@ -752,7 +752,7 @@ struct spmd_kernel const int* m_pValue; private: - cint_vref& operator=(const cint_vref&); + //cint_vref& operator=(const cint_vref&); }; // Varying int @@ -810,7 +810,7 @@ struct spmd_kernel } private: - vint& operator=(const vint&); + //vint& operator=(const vint&); }; // Load/store linear int @@ -1206,7 +1206,7 @@ struct spmd_kernel CPPSPMD_FORCE_INLINE vint load_all(const vint_vref& src) { // TODO: There's surely a better way - __m128i k; + __m128i k = _mm_setzero_si128(); k = insert_x(k, ((int*)(&src.m_pValue[extract_x(src.m_vindex)]))[0]); k = insert_y(k, ((int*)(&src.m_pValue[extract_y(src.m_vindex)]))[1]); @@ -1261,7 +1261,7 @@ struct spmd_kernel } private: - lint& operator=(const lint&); + //lint& operator=(const lint&); }; CPPSPMD_FORCE_INLINE lint& store_all(lint& dst, const lint& src) diff --git a/thirdparty/basis_universal/encoder/cppspmd_type_aliases.h b/thirdparty/basis_universal/encoder/cppspmd_type_aliases.h index 0dfb28b88f83..26004812395b 100644 --- a/thirdparty/basis_universal/encoder/cppspmd_type_aliases.h +++ b/thirdparty/basis_universal/encoder/cppspmd_type_aliases.h @@ -1,7 +1,7 @@ // cppspmd_type_aliases.h // Do not include this file directly // -// Copyright 
2020-2021 Binomial LLC +// Copyright 2020-2024 Binomial LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/thirdparty/basis_universal/encoder/pvpngreader.cpp b/thirdparty/basis_universal/encoder/pvpngreader.cpp index 46639f2796ea..6b32f66cbe23 100644 --- a/thirdparty/basis_universal/encoder/pvpngreader.cpp +++ b/thirdparty/basis_universal/encoder/pvpngreader.cpp @@ -163,7 +163,7 @@ class png_memory_file : public png_file { if ((sizeof(size_t) == sizeof(uint32_t)) && (new_size > 0x7FFFFFFFUL)) return 0; - m_buf.resize(new_size); + m_buf.resize((size_t)new_size); } memcpy(&m_buf[(size_t)m_ofs], pBuf, len); @@ -178,11 +178,11 @@ class png_memory_file : public png_file return 0; uint64_t max_bytes = minimum(len, m_buf.size() - m_ofs); - memcpy(pBuf, &m_buf[(size_t)m_ofs], max_bytes); + memcpy(pBuf, &m_buf[(size_t)m_ofs], (size_t)max_bytes); m_ofs += max_bytes; - return max_bytes; + return (size_t)max_bytes; } }; @@ -249,11 +249,11 @@ class png_readonly_memory_file : public png_file return 0; uint64_t max_bytes = minimum(len, m_buf_size - m_ofs); - memcpy(pBuf, &m_pBuf[(size_t)m_ofs], max_bytes); + memcpy(pBuf, &m_pBuf[(size_t)m_ofs], (size_t)max_bytes); m_ofs += max_bytes; - return max_bytes; + return (size_t)max_bytes; } }; @@ -1626,8 +1626,8 @@ int png_decoder::png_decode_start() if (m_ihdr.m_ilace_type == 1) { - int i; - uint32_t total_lines, lines_processed; + //int i; + //uint32_t total_lines, lines_processed; m_adam7_pass_size_x[0] = adam7_pass_size(m_ihdr.m_width, 0, 8); m_adam7_pass_size_x[1] = adam7_pass_size(m_ihdr.m_width, 4, 8); @@ -1651,10 +1651,12 @@ int png_decoder::png_decode_start() m_pass_y_left = 0; +#if 0 total_lines = lines_processed = 0; for (i = 0; i < 7; i++) total_lines += m_adam7_pass_size_y[i]; +#endif for (; ; ) { @@ -1675,7 +1677,7 @@ int png_decoder::png_decode_start() } } - lines_processed++; + //lines_processed++; } 
m_adam7_decoded_flag = TRUE; diff --git a/modules/basis_universal/patches/external-jpgd.patch b/thirdparty/basis_universal/patches/external-jpgd.patch similarity index 100% rename from modules/basis_universal/patches/external-jpgd.patch rename to thirdparty/basis_universal/patches/external-jpgd.patch diff --git a/thirdparty/basis_universal/patches/external-tinyexr.patch b/thirdparty/basis_universal/patches/external-tinyexr.patch new file mode 100644 index 000000000000..665af1330075 --- /dev/null +++ b/thirdparty/basis_universal/patches/external-tinyexr.patch @@ -0,0 +1,23 @@ +diff --git a/thirdparty/basis_universal/encoder/basisu_enc.cpp b/thirdparty/basis_universal/encoder/basisu_enc.cpp +index 6c0ac0ad370..2bf486a0287 100644 +--- a/thirdparty/basis_universal/encoder/basisu_enc.cpp ++++ b/thirdparty/basis_universal/encoder/basisu_enc.cpp +@@ -27,7 +27,7 @@ + #ifndef TINYEXR_USE_ZFP + #define TINYEXR_USE_ZFP (1) + #endif +-#include "3rdparty/tinyexr.h" ++#include + + #ifndef MINIZ_HEADER_FILE_ONLY + #define MINIZ_HEADER_FILE_ONLY +@@ -3257,7 +3257,8 @@ namespace basisu + float* out_rgba = nullptr; + const char* err = nullptr; + +- int status = LoadEXRWithLayer(&out_rgba, &width, &height, pFilename, nullptr, &err, &n_chans); ++ int status = LoadEXRWithLayer(&out_rgba, &width, &height, pFilename, nullptr, &err); ++ n_chans = 4; + if (status != 0) + { + error_printf("Failed loading .EXR image \"%s\"! (TinyEXR error: %s)\n", pFilename, err ? 
err : "?"); diff --git a/thirdparty/basis_universal/patches/remove-tinydds-qoi.patch b/thirdparty/basis_universal/patches/remove-tinydds-qoi.patch new file mode 100644 index 000000000000..a4d176602d2a --- /dev/null +++ b/thirdparty/basis_universal/patches/remove-tinydds-qoi.patch @@ -0,0 +1,446 @@ +diff --git a/thirdparty/basis_universal/encoder/basisu_enc.cpp b/thirdparty/basis_universal/encoder/basisu_enc.cpp +index 2bf486a0287..fff98e83014 100644 +--- a/thirdparty/basis_universal/encoder/basisu_enc.cpp ++++ b/thirdparty/basis_universal/encoder/basisu_enc.cpp +@@ -37,9 +37,6 @@ + #endif + #include "basisu_miniz.h" + +-#define QOI_IMPLEMENTATION +-#include "3rdparty/qoi.h" +- + #if defined(_WIN32) + // For QueryPerformanceCounter/QueryPerformanceFrequency + #define WIN32_LEAN_AND_MEAN +@@ -408,16 +405,7 @@ namespace basisu + + bool load_qoi(const char* pFilename, image& img) + { +- qoi_desc desc; +- clear_obj(desc); +- +- void* p = qoi_read(pFilename, &desc, 4); +- if (!p) +- return false; +- +- img.grant_ownership(static_cast(p), desc.width, desc.height); +- +- return true; ++ return false; + } + + bool load_png(const uint8_t *pBuf, size_t buf_size, image &img, const char *pFilename) +diff --git a/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp b/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp +index 000869a5337..342446b8fd4 100644 +--- a/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp ++++ b/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp +@@ -19,9 +19,6 @@ + #include "basisu_bc7enc.h" + #include "../transcoder/basisu_astc_hdr_core.h" + +-#define TINYDDS_IMPLEMENTATION +-#include "3rdparty/tinydds.h" +- + namespace basisu + { + //------------------------------------------------------------------------------------------------ +@@ -1979,208 +1976,8 @@ namespace basisu + // Accepts 2D, 2D mipmapped, 2D array, 2D array mipmapped + // and cubemap, cubemap mipmapped, and cubemap array mipmapped. 
+ bool write_dds_file(uint8_vec &dds_data, const basisu::vector& gpu_images, bool cubemap_flag, bool use_srgb_format) +- { +- if (!gpu_images.size()) +- { +- assert(0); +- return false; +- } +- +- // Sanity check the input +- uint32_t slices = 1; +- if (cubemap_flag) +- { +- if ((gpu_images.size() % 6) != 0) +- { +- assert(0); +- return false; +- } +- slices = gpu_images.size() / 6; +- } +- else +- { +- slices = gpu_images.size(); +- } +- +- uint32_t width = 0, height = 0, total_levels = 0; +- basisu::texture_format fmt = texture_format::cInvalidTextureFormat; +- +- // Sanity check the input for consistent # of dimensions and mip levels +- for (uint32_t array_index = 0; array_index < gpu_images.size(); array_index++) +- { +- const gpu_image_vec& levels = gpu_images[array_index]; +- +- if (!levels.size()) +- { +- // Empty mip chain +- assert(0); +- return false; +- } +- +- if (!array_index) +- { +- width = levels[0].get_pixel_width(); +- height = levels[0].get_pixel_height(); +- total_levels = (uint32_t)levels.size(); +- fmt = levels[0].get_format(); +- } +- else +- { +- if ((width != levels[0].get_pixel_width()) || +- (height != levels[0].get_pixel_height()) || +- (total_levels != levels.size())) +- { +- // All cubemap/texture array faces must be the same dimension +- assert(0); +- return false; +- } +- } +- +- for (uint32_t level_index = 0; level_index < levels.size(); level_index++) +- { +- if (level_index) +- { +- if ((levels[level_index].get_pixel_width() != maximum(1, levels[0].get_pixel_width() >> level_index)) || +- (levels[level_index].get_pixel_height() != maximum(1, levels[0].get_pixel_height() >> level_index))) +- { +- // Malformed mipmap chain +- assert(0); +- return false; +- } +- } +- +- if (fmt != levels[level_index].get_format()) +- { +- // All input textures must use the same GPU format +- assert(0); +- return false; +- } +- } +- } +- +- // No mipmap levels +- if (!total_levels) +- { +- assert(0); +- return false; +- } +- +- // Create the DDS 
mipmap level data +- uint8_vec mipmaps[32]; +- +- // See https://learn.microsoft.com/en-us/windows/win32/direct3ddds/dds-file-layout-for-cubic-environment-maps +- // DDS cubemap organization is cubemap face 0 followed by all mips, then cubemap face 1 followed by all mips, etc. +- // Unfortunately tinydds.h's writer doesn't handle this case correctly, so we work around it here. +- // This also applies with 2D texture arrays, too. RenderDoc and ddsview (DirectXTex) views each type (cubemap array and 2D texture array) correctly. +- // Also see "Using Texture Arrays in Direct3D 10/11": +- // https://learn.microsoft.com/en-us/windows/win32/direct3ddds/dx-graphics-dds-pguide +- for (uint32_t array_index = 0; array_index < gpu_images.size(); array_index++) +- { +- const gpu_image_vec& levels = gpu_images[array_index]; +- +- for (uint32_t level_index = 0; level_index < levels.size(); level_index++) +- { +- append_vector(mipmaps[0], (uint8_t*)levels[level_index].get_ptr(), levels[level_index].get_size_in_bytes()); +- +- } // level_index +- } // array_index +- +-#if 0 +- // This organization, required by tinydds.h's API, is wrong. 
+- { +- for (uint32_t array_index = 0; array_index < gpu_images.size(); array_index++) +- { +- const gpu_image_vec& levels = gpu_images[array_index]; +- +- for (uint32_t level_index = 0; level_index < levels.size(); level_index++) +- { +- append_vector(mipmaps[level_index], (uint8_t*)levels[level_index].get_ptr(), levels[level_index].get_size_in_bytes()); +- +- } // level_index +- } // array_index +- } +-#endif +- +- // Write DDS file using tinydds +- TinyDDS_WriteCallbacks cbs; +- cbs.error = [](void* user, char const* msg) { BASISU_NOTE_UNUSED(user); fprintf(stderr, "tinydds: %s\n", msg); }; +- cbs.alloc = [](void* user, size_t size) -> void* { BASISU_NOTE_UNUSED(user); return malloc(size); }; +- cbs.free = [](void* user, void* memory) { BASISU_NOTE_UNUSED(user); free(memory); }; +- cbs.write = [](void* user, void const* buffer, size_t byteCount) { BASISU_NOTE_UNUSED(user); uint8_vec* pVec = (uint8_vec*)user; append_vector(*pVec, (const uint8_t*)buffer, byteCount); }; +- +- uint32_t mipmap_sizes[32]; +- const void* mipmap_ptrs[32]; +- +- clear_obj(mipmap_sizes); +- clear_obj(mipmap_ptrs); +- +- assert(total_levels < 32); +- for (uint32_t i = 0; i < total_levels; i++) +- { +- mipmap_sizes[i] = mipmaps[i].size_in_bytes(); +- mipmap_ptrs[i] = mipmaps[i].get_ptr(); +- } +- +- // Select tinydds texture format +- uint32_t tinydds_fmt = 0; +- +- switch (fmt) +- { +- case texture_format::cBC1_NV: +- case texture_format::cBC1_AMD: +- case texture_format::cBC1: +- tinydds_fmt = use_srgb_format ? TDDS_BC1_RGBA_SRGB_BLOCK : TDDS_BC1_RGBA_UNORM_BLOCK; +- break; +- case texture_format::cBC3: +- tinydds_fmt = use_srgb_format ? 
TDDS_BC3_SRGB_BLOCK : TDDS_BC3_UNORM_BLOCK; +- break; +- case texture_format::cBC4: +- tinydds_fmt = TDDS_BC4_UNORM_BLOCK; +- break; +- case texture_format::cBC5: +- tinydds_fmt = TDDS_BC5_UNORM_BLOCK; +- break; +- case texture_format::cBC6HSigned: +- tinydds_fmt = TDDS_BC6H_SFLOAT_BLOCK; +- break; +- case texture_format::cBC6HUnsigned: +- tinydds_fmt = TDDS_BC6H_UFLOAT_BLOCK; +- break; +- case texture_format::cBC7: +- tinydds_fmt = use_srgb_format ? TDDS_BC7_SRGB_BLOCK : TDDS_BC7_UNORM_BLOCK; +- break; +- default: +- { +- fprintf(stderr, "Warning: Unsupported format in write_dds_file().\n"); +- return false; +- } +- } +- +- // DirectXTex's DDSView doesn't handle odd sizes textures correctly. RenderDoc loads them fine, however. +- // Trying to work around this here results in invalid mipmaps. +- //width = (width + 3) & ~3; +- //height = (height + 3) & ~3; +- +- bool status = TinyDDS_WriteImage(&cbs, +- &dds_data, +- width, +- height, +- 1, +- slices, +- total_levels, +- (TinyDDS_Format)tinydds_fmt, +- cubemap_flag, +- true, +- mipmap_sizes, +- mipmap_ptrs); +- +- if (!status) +- { +- fprintf(stderr, "write_dds_file: Failed creating DDS file\n"); +- return false; +- } +- +- return true; ++ { ++ return false; + } + + bool write_dds_file(const char* pFilename, const basisu::vector& gpu_images, bool cubemap_flag, bool use_srgb_format) +@@ -2201,188 +1998,6 @@ namespace basisu + + bool read_uncompressed_dds_file(const char* pFilename, basisu::vector &ldr_mips, basisu::vector& hdr_mips) + { +- const uint32_t MAX_IMAGE_DIM = 16384; +- +- TinyDDS_Callbacks cbs; +- +- cbs.errorFn = [](void* user, char const* msg) { BASISU_NOTE_UNUSED(user); fprintf(stderr, "tinydds: %s\n", msg); }; +- cbs.allocFn = [](void* user, size_t size) -> void* { BASISU_NOTE_UNUSED(user); return malloc(size); }; +- cbs.freeFn = [](void* user, void* memory) { BASISU_NOTE_UNUSED(user); free(memory); }; +- cbs.readFn = [](void* user, void* buffer, size_t byteCount) -> size_t { return 
(size_t)fread(buffer, 1, byteCount, (FILE*)user); }; +- +-#ifdef _MSC_VER +- cbs.seekFn = [](void* user, int64_t ofs) -> bool { return _fseeki64((FILE*)user, ofs, SEEK_SET) == 0; }; +- cbs.tellFn = [](void* user) -> int64_t { return _ftelli64((FILE*)user); }; +-#else +- cbs.seekFn = [](void* user, int64_t ofs) -> bool { return fseek((FILE*)user, (long)ofs, SEEK_SET) == 0; }; +- cbs.tellFn = [](void* user) -> int64_t { return (int64_t)ftell((FILE*)user); }; +-#endif +- +- FILE* pFile = fopen_safe(pFilename, "rb"); +- if (!pFile) +- { +- error_printf("Can't open .DDS file \"%s\"\n", pFilename); +- return false; +- } +- +- // These are the formats AMD Compressonator supports in its UI. +- enum dds_fmt +- { +- cRGBA32, +- cRGBA_HALF, +- cRGBA_FLOAT +- }; +- +- bool status = false; +- dds_fmt fmt = cRGBA32; +- uint32_t width = 0, height = 0; +- bool hdr_flag = false; +- TinyDDS_Format tfmt = TDDS_UNDEFINED; +- +- TinyDDS_ContextHandle ctx = TinyDDS_CreateContext(&cbs, pFile); +- if (!ctx) +- goto failure; +- +- status = TinyDDS_ReadHeader(ctx); +- if (!status) +- { +- error_printf("Failed parsing DDS header in file \"%s\"\n", pFilename); +- goto failure; +- } +- +- if ((!TinyDDS_Is2D(ctx)) || (TinyDDS_ArraySlices(ctx) > 1) || (TinyDDS_IsCubemap(ctx))) +- { +- error_printf("Unsupported DDS texture type in file \"%s\"\n", pFilename); +- goto failure; +- } +- +- width = TinyDDS_Width(ctx); +- height = TinyDDS_Height(ctx); +- +- if (!width || !height) +- { +- error_printf("DDS texture dimensions invalid in file \"%s\"\n", pFilename); +- goto failure; +- } +- +- if ((width > MAX_IMAGE_DIM) || (height > MAX_IMAGE_DIM)) +- { +- error_printf("DDS texture dimensions too large in file \"%s\"\n", pFilename); +- goto failure; +- } +- +- tfmt = TinyDDS_GetFormat(ctx); +- switch (tfmt) +- { +- case TDDS_R8G8B8A8_SRGB: +- case TDDS_R8G8B8A8_UNORM: +- case TDDS_B8G8R8A8_SRGB: +- case TDDS_B8G8R8A8_UNORM: +- fmt = cRGBA32; +- break; +- case TDDS_R16G16B16A16_SFLOAT: +- fmt = cRGBA_HALF; 
+- hdr_flag = true; +- break; +- case TDDS_R32G32B32A32_SFLOAT: +- fmt = cRGBA_FLOAT; +- hdr_flag = true; +- break; +- default: +- error_printf("File \"%s\" has an unsupported DDS texture format (only supports RGBA/BGRA 32bpp, RGBA HALF float, or RGBA FLOAT)\n", pFilename); +- goto failure; +- } +- +- if (hdr_flag) +- hdr_mips.resize(TinyDDS_NumberOfMipmaps(ctx)); +- else +- ldr_mips.resize(TinyDDS_NumberOfMipmaps(ctx)); +- +- for (uint32_t level = 0; level < TinyDDS_NumberOfMipmaps(ctx); level++) +- { +- const uint32_t level_width = TinyDDS_MipMapReduce(width, level); +- const uint32_t level_height = TinyDDS_MipMapReduce(height, level); +- const uint32_t total_level_texels = level_width * level_height; +- +- const void* pImage = TinyDDS_ImageRawData(ctx, level); +- const uint32_t image_size = TinyDDS_ImageSize(ctx, level); +- +- if (fmt == cRGBA32) +- { +- ldr_mips[level].resize(level_width, level_height); +- +- if ((ldr_mips[level].get_total_pixels() * sizeof(uint32_t) != image_size)) +- { +- assert(0); +- goto failure; +- } +- +- memcpy(ldr_mips[level].get_ptr(), pImage, image_size); +- +- if ((tfmt == TDDS_B8G8R8A8_SRGB) || (tfmt == TDDS_B8G8R8A8_UNORM)) +- { +- // Swap R and B components. 
+- uint32_t *pTexels = (uint32_t *)ldr_mips[level].get_ptr(); +- for (uint32_t i = 0; i < total_level_texels; i++) +- { +- const uint32_t v = pTexels[i]; +- const uint32_t r = (v >> 16) & 0xFF; +- const uint32_t b = v & 0xFF; +- pTexels[i] = r | (b << 16) | (v & 0xFF00FF00); +- } +- } +- } +- else if (fmt == cRGBA_FLOAT) +- { +- hdr_mips[level].resize(level_width, level_height); +- +- if ((hdr_mips[level].get_total_pixels() * sizeof(float) * 4 != image_size)) +- { +- assert(0); +- goto failure; +- } +- +- memcpy(hdr_mips[level].get_ptr(), pImage, image_size); +- } +- else if (fmt == cRGBA_HALF) +- { +- hdr_mips[level].resize(level_width, level_height); +- +- if ((hdr_mips[level].get_total_pixels() * sizeof(basist::half_float) * 4 != image_size)) +- { +- assert(0); +- goto failure; +- } +- +- // Unpack half to float. +- const basist::half_float* pSrc_comps = static_cast(pImage); +- vec4F* pDst_texels = hdr_mips[level].get_ptr(); +- +- for (uint32_t i = 0; i < total_level_texels; i++) +- { +- (*pDst_texels)[0] = basist::half_to_float(pSrc_comps[0]); +- (*pDst_texels)[1] = basist::half_to_float(pSrc_comps[1]); +- (*pDst_texels)[2] = basist::half_to_float(pSrc_comps[2]); +- (*pDst_texels)[3] = basist::half_to_float(pSrc_comps[3]); +- +- pSrc_comps += 4; +- pDst_texels++; +- } // y +- } +- } // level +- +- TinyDDS_DestroyContext(ctx); +- fclose(pFile); +- +- return true; +- +- failure: +- if (ctx) +- TinyDDS_DestroyContext(ctx); +- +- if (pFile) +- fclose(pFile); +- + return false; + } + diff --git a/thirdparty/basis_universal/transcoder/basisu.h b/thirdparty/basis_universal/transcoder/basisu.h index 1230b59ec618..939ee79e62b9 100644 --- a/thirdparty/basis_universal/transcoder/basisu.h +++ b/thirdparty/basis_universal/transcoder/basisu.h @@ -1,5 +1,5 @@ // basisu.h -// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. 
// Important: If compiling with gcc, be sure strict aliasing is disabled: -fno-strict-aliasing // // Licensed under the Apache License, Version 2.0 (the "License"); @@ -117,13 +117,26 @@ namespace basisu typedef basisu::vector uint64_vec; typedef basisu::vector int_vec; typedef basisu::vector bool_vec; + typedef basisu::vector float_vec; void enable_debug_printf(bool enabled); void debug_printf(const char *pFmt, ...); - +#ifndef __EMSCRIPTEN__ +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wclass-memaccess" +#endif +#endif + template inline void clear_obj(T& obj) { memset(&obj, 0, sizeof(obj)); } +#ifndef __EMSCRIPTEN__ +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif +#endif + template inline T0 lerp(T0 a, T0 b, T1 c) { return a + (b - a) * c; } template inline S maximum(S a, S b) { return (a > b) ? a : b; } @@ -162,10 +175,45 @@ namespace basisu template inline T open_range_check(T v, T minv, T maxv) { assert(v >= minv && v < maxv); BASISU_NOTE_UNUSED(minv); BASISU_NOTE_UNUSED(maxv); return v; } template inline T open_range_check(T v, T maxv) { assert(v < maxv); BASISU_NOTE_UNUSED(maxv); return v; } + // Open interval + inline bool in_bounds(int v, int l, int h) + { + return (v >= l) && (v < h); + } + + // Closed interval + inline bool in_range(int v, int l, int h) + { + return (v >= l) && (v <= h); + } + inline uint32_t total_bits(uint32_t v) { uint32_t l = 0; for ( ; v > 0U; ++l) v >>= 1; return l; } template inline T saturate(T val) { return clamp(val, 0.0f, 1.0f); } + inline uint32_t get_bit(uint32_t src, int ndx) + { + assert(in_bounds(ndx, 0, 32)); + return (src >> ndx) & 1; + } + + inline bool is_bit_set(uint32_t src, int ndx) + { + return get_bit(src, ndx) != 0; + } + + inline uint32_t get_bits(uint32_t val, int low, int high) + { + const int num_bits = (high - low) + 1; + assert(in_range(num_bits, 1, 32)); + + val >>= low; + if (num_bits != 32) + val &= ((1u << num_bits) - 1); + + return val; + } + template inline 
void append_vector(T &vec, const R *pObjs, size_t n) { if (n) @@ -267,6 +315,11 @@ namespace basisu return true; } + static inline uint32_t read_le_word(const uint8_t* pBytes) + { + return (pBytes[1] << 8U) | (pBytes[0]); + } + static inline uint32_t read_le_dword(const uint8_t *pBytes) { return (pBytes[3] << 24U) | (pBytes[2] << 16U) | (pBytes[1] << 8U) | (pBytes[0]); @@ -303,6 +356,10 @@ namespace basisu return *this; } +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Warray-bounds" +#endif inline operator uint32_t() const { switch (NumBytes) @@ -354,6 +411,9 @@ namespace basisu } } } +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif }; enum eZero { cZero }; @@ -402,8 +462,11 @@ namespace basisu cBC3, // DXT5 (BC4/DXT5A block followed by a BC1/DXT1 block) cBC4, // DXT5A cBC5, // 3DC/DXN (two BC4/DXT5A blocks) + cBC6HSigned, // HDR + cBC6HUnsigned, // HDR cBC7, - cASTC4x4, // LDR only + cASTC_LDR_4x4, // ASTC 4x4 LDR only + cASTC_HDR_4x4, // ASTC 4x4 HDR only (but may use LDR ASTC blocks internally) cPVRTC1_4_RGB, cPVRTC1_4_RGBA, cATC_RGB, @@ -413,17 +476,22 @@ namespace basisu cETC2_R11_EAC, cETC2_RG11_EAC, cUASTC4x4, + cUASTC_HDR_4x4, cBC1_NV, cBC1_AMD, - + // Uncompressed/raw pixels cRGBA32, cRGB565, cBGR565, cRGBA4444, - cABGR4444 + cABGR4444, + cRGBA_HALF, + cRGB_HALF, + cRGB_9E5 }; + // This is bytes per block for GPU formats, or bytes per texel for uncompressed formats. 
inline uint32_t get_bytes_per_block(texture_format fmt) { switch (fmt) @@ -443,13 +511,27 @@ namespace basisu case texture_format::cETC2_R11_EAC: return 8; case texture_format::cRGBA32: - return sizeof(uint32_t) * 16; + case texture_format::cRGB_9E5: + return sizeof(uint32_t); + case texture_format::cRGB_HALF: + return sizeof(uint16_t) * 3; + case texture_format::cRGBA_HALF: + return sizeof(uint16_t) * 4; + case texture_format::cRGB565: + case texture_format::cBGR565: + case texture_format::cRGBA4444: + case texture_format::cABGR4444: + return sizeof(uint16_t); + default: break; } + + // Everything else is 16 bytes/block. return 16; } + // This is qwords per block for GPU formats, or not valid for uncompressed formats. inline uint32_t get_qwords_per_block(texture_format fmt) { return get_bytes_per_block(fmt) >> 3; @@ -473,6 +555,17 @@ namespace basisu BASISU_NOTE_UNUSED(fmt); return 4; } + + inline bool is_hdr_texture_format(texture_format fmt) + { + if (fmt == texture_format::cASTC_HDR_4x4) + return true; + if (fmt == texture_format::cUASTC_HDR_4x4) + return true; + if ((fmt == texture_format::cBC6HSigned) || (fmt == texture_format::cBC6HUnsigned)) + return true; + return false; + } } // namespace basisu diff --git a/thirdparty/basis_universal/transcoder/basisu_astc_hdr_core.h b/thirdparty/basis_universal/transcoder/basisu_astc_hdr_core.h new file mode 100644 index 000000000000..82dcd2bfe196 --- /dev/null +++ b/thirdparty/basis_universal/transcoder/basisu_astc_hdr_core.h @@ -0,0 +1,102 @@ +// File: basisu_astc_hdr_core.h +#pragma once +#include "basisu_astc_helpers.h" + +namespace basist +{ + struct astc_blk + { + uint8_t m_vals[16]; + }; + + // ASTC_HDR_MAX_VAL is the maximum color component value that can be encoded. + // If the input has values higher than this, they need to be linearly scaled so all values are between [0,ASTC_HDR_MAX_VAL], and the linear scaling inverted in the shader. 
+ const float ASTC_HDR_MAX_VAL = 65216.0f; // actually MAX_QLOG12_VAL + + // Maximum usable QLOG encodings, and their floating point equivalent values, that don't result in NaN/Inf's. + const uint32_t MAX_QLOG7 = 123; + //const float MAX_QLOG7_VAL = 55296.0f; + + const uint32_t MAX_QLOG8 = 247; + //const float MAX_QLOG8_VAL = 60416.0f; + + const uint32_t MAX_QLOG9 = 495; + //const float MAX_QLOG9_VAL = 62976.0f; + + const uint32_t MAX_QLOG10 = 991; + //const float MAX_QLOG10_VAL = 64256.0f; + + const uint32_t MAX_QLOG11 = 1983; + //const float MAX_QLOG11_VAL = 64896.0f; + + const uint32_t MAX_QLOG12 = 3967; + //const float MAX_QLOG12_VAL = 65216.0f; + + const uint32_t MAX_QLOG16 = 63487; + const float MAX_QLOG16_VAL = 65504.0f; + + const uint32_t NUM_MODE11_ENDPOINTS = 6, NUM_MODE7_ENDPOINTS = 4; + + // Notes: + // qlog16_to_half(half_to_qlog16(half_val_as_int)) == half_val_as_int (is lossless) + // However, this is not lossless in the general sense. + inline half_float qlog16_to_half_slow(uint32_t qlog16) + { + assert(qlog16 <= 0xFFFF); + + int C = qlog16; + + int E = (C & 0xF800) >> 11; + int M = C & 0x7FF; + + int Mt; + if (M < 512) + Mt = 3 * M; + else if (M >= 1536) + Mt = 5 * M - 2048; + else + Mt = 4 * M - 512; + + int Cf = (E << 10) + (Mt >> 3); + return (half_float)Cf; + } + + // This is not lossless + inline half_float qlog_to_half_slow(uint32_t qlog, uint32_t bits) + { + assert((bits >= 7U) && (bits <= 16U)); + assert(qlog < (1U << bits)); + + int C = qlog << (16 - bits); + return qlog16_to_half_slow(C); + } + + void astc_hdr_core_init(); + + void decode_mode7_to_qlog12_ise20( + const uint8_t* pEndpoints, + int e[2][3], + int* pScale); + + bool decode_mode7_to_qlog12( + const uint8_t* pEndpoints, + int e[2][3], + int* pScale, + uint32_t ise_endpoint_range); + + void decode_mode11_to_qlog12_ise20( + const uint8_t* pEndpoints, + int e[2][3]); + + bool decode_mode11_to_qlog12( + const uint8_t* pEndpoints, + int e[2][3], + uint32_t ise_endpoint_range); + + 
bool transcode_bc6h_1subset(half_float h_e[3][2], const astc_helpers::log_astc_block& best_blk, bc6h_block& transcoded_bc6h_blk); + bool transcode_bc6h_2subsets(uint32_t common_part_index, const astc_helpers::log_astc_block& best_blk, bc6h_block& transcoded_bc6h_blk); + + bool astc_hdr_transcode_to_bc6h(const astc_blk& src_blk, bc6h_block& dst_blk); + bool astc_hdr_transcode_to_bc6h(const astc_helpers::log_astc_block& log_blk, bc6h_block& dst_blk); + +} // namespace basist diff --git a/thirdparty/basis_universal/transcoder/basisu_astc_helpers.h b/thirdparty/basis_universal/transcoder/basisu_astc_helpers.h new file mode 100644 index 000000000000..09a234b2ae1f --- /dev/null +++ b/thirdparty/basis_universal/transcoder/basisu_astc_helpers.h @@ -0,0 +1,3587 @@ +// basisu_astc_helpers.h +// Be sure to define ASTC_HELPERS_IMPLEMENTATION somewhere to get the implementation, otherwise you only get the header. +#pragma once +#ifndef BASISU_ASTC_HELPERS_HEADER +#define BASISU_ASTC_HELPERS_HEADER + +#include +#include +#include +#include + +namespace astc_helpers +{ + const uint32_t MAX_WEIGHT_VALUE = 64; // grid texel weights must range from [0,64] + const uint32_t MIN_GRID_DIM = 2; // the minimum dimension of a block's weight grid + const uint32_t MIN_BLOCK_DIM = 4, MAX_BLOCK_DIM = 12; // the valid block dimensions in texels + const uint32_t MAX_GRID_WEIGHTS = 64; // a block may have a maximum of 64 weight grid values + + static const uint32_t NUM_ASTC_BLOCK_SIZES = 14; + extern const uint8_t g_astc_block_sizes[NUM_ASTC_BLOCK_SIZES][2]; + + // The Color Endpoint Modes (CEM's) + enum cems + { + CEM_LDR_LUM_DIRECT = 0, + CEM_LDR_LUM_BASE_PLUS_OFS = 1, + CEM_HDR_LUM_LARGE_RANGE = 2, + CEM_HDR_LUM_SMALL_RANGE = 3, + CEM_LDR_LUM_ALPHA_DIRECT = 4, + CEM_LDR_LUM_ALPHA_BASE_PLUS_OFS = 5, + CEM_LDR_RGB_BASE_SCALE = 6, + CEM_HDR_RGB_BASE_SCALE = 7, + CEM_LDR_RGB_DIRECT = 8, + CEM_LDR_RGB_BASE_PLUS_OFFSET = 9, + CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A = 10, + CEM_HDR_RGB = 11, + 
CEM_LDR_RGBA_DIRECT = 12, + CEM_LDR_RGBA_BASE_PLUS_OFFSET = 13, + CEM_HDR_RGB_LDR_ALPHA = 14, + CEM_HDR_RGB_HDR_ALPHA = 15 + }; + + // All Bounded Integer Sequence Coding (BISE or ISE) ranges. + // Weights: Ranges [0,11] are valid. + // Endpoints: Ranges [4,20] are valid. + enum bise_levels + { + BISE_2_LEVELS = 0, + BISE_3_LEVELS = 1, + BISE_4_LEVELS = 2, + BISE_5_LEVELS = 3, + BISE_6_LEVELS = 4, + BISE_8_LEVELS = 5, + BISE_10_LEVELS = 6, + BISE_12_LEVELS = 7, + BISE_16_LEVELS = 8, + BISE_20_LEVELS = 9, + BISE_24_LEVELS = 10, + BISE_32_LEVELS = 11, + BISE_40_LEVELS = 12, + BISE_48_LEVELS = 13, + BISE_64_LEVELS = 14, + BISE_80_LEVELS = 15, + BISE_96_LEVELS = 16, + BISE_128_LEVELS = 17, + BISE_160_LEVELS = 18, + BISE_192_LEVELS = 19, + BISE_256_LEVELS = 20 + }; + + const uint32_t TOTAL_ISE_RANGES = 21; + + // Valid endpoint ISE ranges + const uint32_t FIRST_VALID_ENDPOINT_ISE_RANGE = BISE_6_LEVELS; // 4 + const uint32_t LAST_VALID_ENDPOINT_ISE_RANGE = BISE_256_LEVELS; // 20 + const uint32_t TOTAL_ENDPOINT_ISE_RANGES = LAST_VALID_ENDPOINT_ISE_RANGE - FIRST_VALID_ENDPOINT_ISE_RANGE + 1; + + // Valid weight ISE ranges + const uint32_t FIRST_VALID_WEIGHT_ISE_RANGE = BISE_2_LEVELS; // 0 + const uint32_t LAST_VALID_WEIGHT_ISE_RANGE = BISE_32_LEVELS; // 11 + const uint32_t TOTAL_WEIGHT_ISE_RANGES = LAST_VALID_WEIGHT_ISE_RANGE - FIRST_VALID_WEIGHT_ISE_RANGE + 1; + + // The ISE range table. + extern const int8_t g_ise_range_table[TOTAL_ISE_RANGES][3]; // 0=bits (0 to 8), 1=trits (0 or 1), 2=quints (0 or 1) + + // Possible Color Component Select values, used in dual plane mode. + // The CCS component will be interpolated using the 2nd weight plane. 
+ enum ccs + { + CCS_GBA_R = 0, + CCS_RBA_G = 1, + CCS_RGA_B = 2, + CCS_RGB_A = 3 + }; + + struct astc_block + { + uint32_t m_vals[4]; + }; + + const uint32_t MAX_PARTITIONS = 4; // Max # of partitions or subsets for single plane mode + const uint32_t MAX_DUAL_PLANE_PARTITIONS = 3; // Max # of partitions or subsets for dual plane mode + const uint32_t NUM_PARTITION_PATTERNS = 1024; // Total # of partition pattern seeds (10-bits) + const uint32_t MAX_ENDPOINTS = 18; // Maximum # of endpoint values in a block + + struct log_astc_block + { + bool m_error_flag; + + bool m_solid_color_flag_ldr, m_solid_color_flag_hdr; + uint16_t m_solid_color[4]; + + // Rest is only valid if !m_solid_color_flag_ldr && !m_solid_color_flag_hdr + uint32_t m_grid_width, m_grid_height; // weight grid dimensions, not the dimension of the block + + bool m_dual_plane; + + uint32_t m_weight_ise_range; // 0-11 + uint32_t m_endpoint_ise_range; // 4-20, this is actually inferred from the size of the other config bits+weights, but this is here for checking + + uint32_t m_color_component_selector; // 0-3, 0=GBA R, 1=RBA G, 2=RGA B, 3=RGB A, only used in dual plane mode + + uint32_t m_num_partitions; // or the # of subsets, 1-4 (1-3 if dual plane mode) + uint32_t m_partition_id; // 10-bits, must be 0 if m_num_partitions==1 + + uint32_t m_color_endpoint_modes[MAX_PARTITIONS]; // each subset's CEM's + + // ISE weight grid values. In dual plane mode, the order is p0,p1, p0,p1, etc. 
+ uint8_t m_weights[MAX_GRID_WEIGHTS]; + + // ISE endpoint values + // Endpoint order examples: + // 1 subset LA : LL0 LH0 AL0 AH0 + // 1 subset RGB : RL0 RH0 GL0 GH0 BL0 BH0 + // 1 subset RGBA : RL0 RH0 GL0 GH0 BL0 BH0 AL0 AH0 + // 2 subset LA : LL0 LH0 AL0 AH0 LL1 LH1 AL1 AH1 + // 2 subset RGB : RL0 RH0 GL0 GH0 BL0 BH0 RL1 RH1 GL1 GH1 BL1 BH1 + // 2 subset RGBA : RL0 RH0 GL0 GH0 BL0 BH0 AL0 AH0 RL1 RH1 GL1 GH1 BL1 BH1 AL1 AH1 + uint8_t m_endpoints[MAX_ENDPOINTS]; + + void clear() + { + memset(this, 0, sizeof(*this)); + } + }; + + // Open interval + inline int bounds_check(int v, int l, int h) { (void)v; (void)l; (void)h; assert(v >= l && v < h); return v; } + inline uint32_t bounds_check(uint32_t v, uint32_t l, uint32_t h) { (void)v; (void)l; (void)h; assert(v >= l && v < h); return v; } + + inline uint32_t get_bits(uint32_t val, int low, int high) + { + const int num_bits = (high - low) + 1; + assert((num_bits >= 1) && (num_bits <= 32)); + + val >>= low; + if (num_bits != 32) + val &= ((1u << num_bits) - 1); + + return val; + } + + // Returns the number of levels in the given ISE range. + inline uint32_t get_ise_levels(uint32_t ise_range) + { + assert(ise_range < TOTAL_ISE_RANGES); + return (1 + 2 * g_ise_range_table[ise_range][1] + 4 * g_ise_range_table[ise_range][2]) << g_ise_range_table[ise_range][0]; + } + + inline int get_ise_sequence_bits(int count, int range) + { + // See 18.22 Data Size Determination + int total_bits = g_ise_range_table[range][0] * count; + total_bits += (g_ise_range_table[range][1] * 8 * count + 4) / 5; + total_bits += (g_ise_range_table[range][2] * 7 * count + 2) / 3; + return total_bits; + } + + inline uint32_t weight_interpolate(uint32_t l, uint32_t h, uint32_t w) + { + assert(w <= MAX_WEIGHT_VALUE); + return (l * (64 - w) + h * w + 32) >> 6; + } + + void encode_bise(uint32_t* pDst, const uint8_t* pSrc_vals, uint32_t bit_pos, int num_vals, int range); + + // Packs a logical to physical ASTC block. 
Note this does not validate the block's dimensions (use is_valid_block_size()), just the grid dimensions. + bool pack_astc_block(astc_block &phys_block, const log_astc_block& log_block, int* pExpected_endpoint_range = nullptr); + + // Pack LDR void extent (really solid color) blocks. For LDR, pass in (val | (val << 8)) for each component. + void pack_void_extent_ldr(astc_block& blk, uint16_t r, uint16_t g, uint16_t b, uint16_t a); + + // Pack HDR void extent (16-bit values are FP16/half floats - no NaN/Inf's) + void pack_void_extent_hdr(astc_block& blk, uint16_t rh, uint16_t gh, uint16_t bh, uint16_t ah); + + // These helpers are all quite slow, but are useful for table preparation. + + // Dequantizes ISE encoded endpoint val to [0,255] + uint32_t dequant_bise_endpoint(uint32_t val, uint32_t ise_range); // ISE ranges 4-11 + + // Dequantizes ISE encoded weight val to [0,64] + uint32_t dequant_bise_weight(uint32_t val, uint32_t ise_range); // ISE ranges 0-10 + + uint32_t find_nearest_bise_endpoint(int v, uint32_t ise_range); + uint32_t find_nearest_bise_weight(int v, uint32_t ise_range); + + void create_quant_tables( + uint8_t* pVal_to_ise, // [0-255] or [0-64] value to nearest ISE symbol, array size is [256] or [65] + uint8_t* pISE_to_val, // ASTC encoded ISE symbol to [0,255] or [0,64] value, [levels] + uint8_t* pISE_to_rank, // returns the level rank index given an ISE symbol, [levels] + uint8_t* pRank_to_ISE, // returns the ISE symbol given a level rank, inverse of pISE_to_rank, [levels] + uint32_t ise_range, // ise range, [4,20] for endpoints, [0,11] for weights + bool weight_flag); // false if block endpoints, true if weights + + // True if the CEM is LDR. + bool is_cem_ldr(uint32_t mode); + inline bool is_cem_hdr(uint32_t mode) { return !is_cem_ldr(mode); } + + // True if the passed in dimensions are a valid ASTC block size. There are 14 supported configs, from 4x4 (8bpp) to 12x12 (.89bpp). 
+ bool is_valid_block_size(uint32_t w, uint32_t h); + + bool block_has_any_hdr_cems(const log_astc_block& log_blk); + bool block_has_any_ldr_cems(const log_astc_block& log_blk); + + // Returns the # of endpoint values for the given CEM. + inline uint32_t get_num_cem_values(uint32_t cem) { assert(cem <= 15); return 2 + 2 * (cem >> 2); } + + struct dequant_table + { + basisu::vector m_val_to_ise; // [0-255] or [0-64] value to nearest ISE symbol, array size is [256] or [65] + basisu::vector m_ISE_to_val; // ASTC encoded ISE symbol to [0,255] or [0,64] value, [levels] + basisu::vector m_ISE_to_rank; // returns the level rank index given an ISE symbol, [levels] + basisu::vector m_rank_to_ISE; // returns the ISE symbol given a level rank, inverse of pISE_to_rank, [levels] + + void init(bool weight_flag, uint32_t num_levels, bool init_rank_tabs) + { + m_val_to_ise.resize(weight_flag ? (MAX_WEIGHT_VALUE + 1) : 256); + m_ISE_to_val.resize(num_levels); + if (init_rank_tabs) + { + m_ISE_to_rank.resize(num_levels); + m_rank_to_ISE.resize(num_levels); + } + } + }; + + struct dequant_tables + { + dequant_table m_weights[TOTAL_WEIGHT_ISE_RANGES]; + dequant_table m_endpoints[TOTAL_ENDPOINT_ISE_RANGES]; + + const dequant_table& get_weight_tab(uint32_t range) const + { + assert((range >= FIRST_VALID_WEIGHT_ISE_RANGE) && (range <= LAST_VALID_WEIGHT_ISE_RANGE)); + return m_weights[range - FIRST_VALID_WEIGHT_ISE_RANGE]; + } + + dequant_table& get_weight_tab(uint32_t range) + { + assert((range >= FIRST_VALID_WEIGHT_ISE_RANGE) && (range <= LAST_VALID_WEIGHT_ISE_RANGE)); + return m_weights[range - FIRST_VALID_WEIGHT_ISE_RANGE]; + } + + const dequant_table& get_endpoint_tab(uint32_t range) const + { + assert((range >= FIRST_VALID_ENDPOINT_ISE_RANGE) && (range <= LAST_VALID_ENDPOINT_ISE_RANGE)); + return m_endpoints[range - FIRST_VALID_ENDPOINT_ISE_RANGE]; + } + + dequant_table& get_endpoint_tab(uint32_t range) + { + assert((range >= FIRST_VALID_ENDPOINT_ISE_RANGE) && (range <= 
LAST_VALID_ENDPOINT_ISE_RANGE)); + return m_endpoints[range - FIRST_VALID_ENDPOINT_ISE_RANGE]; + } + + void init(bool init_rank_tabs) + { + for (uint32_t range = FIRST_VALID_WEIGHT_ISE_RANGE; range <= LAST_VALID_WEIGHT_ISE_RANGE; range++) + { + const uint32_t num_levels = get_ise_levels(range); + dequant_table& tab = get_weight_tab(range); + + tab.init(true, num_levels, init_rank_tabs); + + create_quant_tables(tab.m_val_to_ise.data(), tab.m_ISE_to_val.data(), init_rank_tabs ? tab.m_ISE_to_rank.data() : nullptr, init_rank_tabs ? tab.m_rank_to_ISE.data() : nullptr, range, true); + } + + for (uint32_t range = FIRST_VALID_ENDPOINT_ISE_RANGE; range <= LAST_VALID_ENDPOINT_ISE_RANGE; range++) + { + const uint32_t num_levels = get_ise_levels(range); + dequant_table& tab = get_endpoint_tab(range); + + tab.init(false, num_levels, init_rank_tabs); + + create_quant_tables(tab.m_val_to_ise.data(), tab.m_ISE_to_val.data(), init_rank_tabs ? tab.m_ISE_to_rank.data() : nullptr, init_rank_tabs ? tab.m_rank_to_ISE.data() : nullptr, range, false); + } + } + }; + + extern dequant_tables g_dequant_tables; + void init_tables(bool init_rank_tabs); + + // Procedurally returns the texel partition/subset index given the block coordinate and config. 
+ int compute_texel_partition(uint32_t seedIn, uint32_t xIn, uint32_t yIn, uint32_t zIn, int num_partitions, bool small_block); + + void blue_contract( + int r, int g, int b, int a, + int& dr, int& dg, int& db, int& da); + + void bit_transfer_signed(int& a, int& b); + + void decode_endpoint(uint32_t cem_index, int (*pEndpoints)[2], const uint8_t* pE); + + typedef uint16_t half_float; + half_float float_to_half(float val, bool toward_zero); + float half_to_float(half_float hval); + + const int MAX_RGB9E5 = 0xff80; + void unpack_rgb9e5(uint32_t packed, float& r, float& g, float& b); + uint32_t pack_rgb9e5(float r, float g, float b); + + enum decode_mode + { + cDecodeModeSRGB8 = 0, // returns uint8_t's, not valid on HDR blocks + cDecodeModeLDR8 = 1, // returns uint8_t's, not valid on HDR blocks + cDecodeModeHDR16 = 2, // returns uint16_t's (half floats), valid on all LDR/HDR blocks + cDecodeModeRGB9E5 = 3 // returns uint32_t's, packed as RGB 9E5 (shared exponent), see https://registry.khronos.org/OpenGL/extensions/EXT/EXT_texture_shared_exponent.txt + }; + + // Decodes logical block to output pixels. + // pPixels must point to either 32-bit pixel values (SRGB8/LDR8/9E5) or 64-bit pixel values (HDR16) + bool decode_block(const log_astc_block& log_blk, void* pPixels, uint32_t blk_width, uint32_t blk_height, decode_mode dec_mode); + + void decode_bise(uint32_t ise_range, uint8_t* pVals, uint32_t num_vals, const uint8_t *pBits128, uint32_t bit_ofs); + + // Unpack a physical ASTC encoded GPU texture block to a logical block description. + bool unpack_block(const void* pASTC_block, log_astc_block& log_blk, uint32_t blk_width, uint32_t blk_height); + +} // namespace astc_helpers + +#endif // BASISU_ASTC_HELPERS_HEADER + +//------------------------------------------------------------------ + +#ifdef BASISU_ASTC_HELPERS_IMPLEMENTATION + +namespace astc_helpers +{ + template inline T my_min(T a, T b) { return (a < b) ? 
a : b; } + template inline T my_max(T a, T b) { return (a > b) ? a : b; } + + const uint8_t g_astc_block_sizes[NUM_ASTC_BLOCK_SIZES][2] = { + { 4, 4 }, { 5, 4 }, { 5, 5 }, { 6, 5 }, + { 6, 6 }, { 8, 5 }, { 8, 6 }, { 10, 5 }, + { 10, 6 }, { 8, 8 }, { 10, 8 }, { 10, 10 }, + { 12, 10 }, { 12, 12 } + }; + + const int8_t g_ise_range_table[TOTAL_ISE_RANGES][3] = + { + //b t q + //2 3 5 // rng ise_index notes + { 1, 0, 0 }, // 0..1 0 + { 0, 1, 0 }, // 0..2 1 + { 2, 0, 0 }, // 0..3 2 + { 0, 0, 1 }, // 0..4 3 + { 1, 1, 0 }, // 0..5 4 min endpoint ISE index + { 3, 0, 0 }, // 0..7 5 + { 1, 0, 1 }, // 0..9 6 + { 2, 1, 0 }, // 0..11 7 + { 4, 0, 0 }, // 0..15 8 + { 2, 0, 1 }, // 0..19 9 + { 3, 1, 0 }, // 0..23 10 + { 5, 0, 0 }, // 0..31 11 max weight ISE index + { 3, 0, 1 }, // 0..39 12 + { 4, 1, 0 }, // 0..47 13 + { 6, 0, 0 }, // 0..63 14 + { 4, 0, 1 }, // 0..79 15 + { 5, 1, 0 }, // 0..95 16 + { 7, 0, 0 }, // 0..127 17 + { 5, 0, 1 }, // 0..159 18 + { 6, 1, 0 }, // 0..191 19 + { 8, 0, 0 }, // 0..255 20 + }; + + static inline void astc_set_bits_1_to_9(uint32_t* pDst, uint32_t& bit_offset, uint32_t code, uint32_t codesize) + { + uint8_t* pBuf = reinterpret_cast(pDst); + + assert(codesize <= 9); + if (codesize) + { + uint32_t byte_bit_offset = bit_offset & 7; + uint32_t val = code << byte_bit_offset; + + uint32_t index = bit_offset >> 3; + pBuf[index] |= (uint8_t)val; + + if (codesize > (8 - byte_bit_offset)) + pBuf[index + 1] |= (uint8_t)(val >> 8); + + bit_offset += codesize; + } + } + + static inline uint32_t astc_extract_bits(uint32_t bits, int low, int high) + { + return (bits >> low) & ((1 << (high - low + 1)) - 1); + } + + // Writes bits to output in an endian safe way + static inline void astc_set_bits(uint32_t* pOutput, uint32_t& bit_pos, uint32_t value, uint32_t total_bits) + { + assert(total_bits <= 31); + assert(value < (1u << total_bits)); + + uint8_t* pBytes = reinterpret_cast(pOutput); + + while (total_bits) + { + const uint32_t bits_to_write = my_min(total_bits, 8 - 
(bit_pos & 7)); + + pBytes[bit_pos >> 3] |= static_cast(value << (bit_pos & 7)); + + bit_pos += bits_to_write; + total_bits -= bits_to_write; + value >>= bits_to_write; + } + } + + static const uint8_t g_astc_quint_encode[125] = + { + 0, 1, 2, 3, 4, 8, 9, 10, 11, 12, 16, 17, 18, 19, 20, 24, 25, 26, 27, 28, 5, 13, 21, 29, 6, 32, 33, 34, 35, 36, 40, 41, 42, 43, 44, 48, 49, 50, 51, 52, 56, 57, + 58, 59, 60, 37, 45, 53, 61, 14, 64, 65, 66, 67, 68, 72, 73, 74, 75, 76, 80, 81, 82, 83, 84, 88, 89, 90, 91, 92, 69, 77, 85, 93, 22, 96, 97, 98, 99, 100, 104, + 105, 106, 107, 108, 112, 113, 114, 115, 116, 120, 121, 122, 123, 124, 101, 109, 117, 125, 30, 102, 103, 70, 71, 38, 110, 111, 78, 79, 46, 118, 119, 86, 87, 54, + 126, 127, 94, 95, 62, 39, 47, 55, 63, 7 /*31 - results in the same decode as 7*/ + }; + + // Encodes 3 values to output, usable for any range that uses quints and bits + static inline void astc_encode_quints(uint32_t* pOutput, const uint8_t* pValues, uint32_t& bit_pos, int n) + { + // First extract the quints and the bits from the 3 input values + int quints = 0, bits[3]; + const uint32_t bit_mask = (1 << n) - 1; + for (int i = 0; i < 3; i++) + { + static const int s_muls[3] = { 1, 5, 25 }; + + const int t = pValues[i] >> n; + + quints += t * s_muls[i]; + bits[i] = pValues[i] & bit_mask; + } + + // Encode the quints, by inverting the bit manipulations done by the decoder, converting 3 quints into 7-bits. + // See https://www.khronos.org/registry/DataFormat/specs/1.2/dataformat.1.2.html#astc-integer-sequence-encoding + + assert(quints < 125); + const int T = g_astc_quint_encode[quints]; + + // Now interleave the 7 encoded quint bits with the bits to form the encoded output. See table 95-96. 
+ astc_set_bits(pOutput, bit_pos, bits[0] | (astc_extract_bits(T, 0, 2) << n) | (bits[1] << (3 + n)) | (astc_extract_bits(T, 3, 4) << (3 + n * 2)) | + (bits[2] << (5 + n * 2)) | (astc_extract_bits(T, 5, 6) << (5 + n * 3)), 7 + n * 3); + } + + static const uint8_t g_astc_trit_encode[243] = { 0, 1, 2, 4, 5, 6, 8, 9, 10, 16, 17, 18, 20, 21, 22, 24, 25, 26, 3, 7, 11, 19, 23, 27, 12, 13, 14, 32, 33, 34, 36, 37, 38, 40, 41, 42, 48, 49, 50, 52, 53, 54, 56, 57, 58, 35, 39, + 43, 51, 55, 59, 44, 45, 46, 64, 65, 66, 68, 69, 70, 72, 73, 74, 80, 81, 82, 84, 85, 86, 88, 89, 90, 67, 71, 75, 83, 87, 91, 76, 77, 78, 128, 129, 130, 132, 133, 134, 136, 137, 138, 144, 145, 146, 148, 149, 150, 152, 153, 154, + 131, 135, 139, 147, 151, 155, 140, 141, 142, 160, 161, 162, 164, 165, 166, 168, 169, 170, 176, 177, 178, 180, 181, 182, 184, 185, 186, 163, 167, 171, 179, 183, 187, 172, 173, 174, 192, 193, 194, 196, 197, 198, 200, 201, 202, + 208, 209, 210, 212, 213, 214, 216, 217, 218, 195, 199, 203, 211, 215, 219, 204, 205, 206, 96, 97, 98, 100, 101, 102, 104, 105, 106, 112, 113, 114, 116, 117, 118, 120, 121, 122, 99, 103, 107, 115, 119, 123, 108, 109, 110, 224, + 225, 226, 228, 229, 230, 232, 233, 234, 240, 241, 242, 244, 245, 246, 248, 249, 250, 227, 231, 235, 243, 247, 251, 236, 237, 238, 28, 29, 30, 60, 61, 62, 92, 93, 94, 156, 157, 158, 188, 189, 190, 220, 221, 222, 31, 63, 95, 159, + 191, 223, 124, 125, 126 }; + + // Encodes 5 values to output, usable for any range that uses trits and bits + static void astc_encode_trits(uint32_t* pOutput, const uint8_t* pValues, uint32_t& bit_pos, int n) + { + // First extract the trits and the bits from the 5 input values + int trits = 0, bits[5]; + const uint32_t bit_mask = (1 << n) - 1; + for (int i = 0; i < 5; i++) + { + static const int s_muls[5] = { 1, 3, 9, 27, 81 }; + + const int t = pValues[i] >> n; + + trits += t * s_muls[i]; + bits[i] = pValues[i] & bit_mask; + } + + // Encode the trits, by inverting the bit manipulations done by the 
decoder, converting 5 trits into 8-bits. + // See https://www.khronos.org/registry/DataFormat/specs/1.2/dataformat.1.2.html#astc-integer-sequence-encoding + + assert(trits < 243); + const int T = g_astc_trit_encode[trits]; + + // Now interleave the 8 encoded trit bits with the bits to form the encoded output. See table 94. + astc_set_bits(pOutput, bit_pos, bits[0] | (astc_extract_bits(T, 0, 1) << n) | (bits[1] << (2 + n)), n * 2 + 2); + + astc_set_bits(pOutput, bit_pos, astc_extract_bits(T, 2, 3) | (bits[2] << 2) | (astc_extract_bits(T, 4, 4) << (2 + n)) | (bits[3] << (3 + n)) | (astc_extract_bits(T, 5, 6) << (3 + n * 2)) | + (bits[4] << (5 + n * 2)) | (astc_extract_bits(T, 7, 7) << (5 + n * 3)), n * 3 + 6); + } + + // Packs values using ASTC's BISE to output buffer. + void encode_bise(uint32_t* pDst, const uint8_t* pSrc_vals, uint32_t bit_pos, int num_vals, int range) + { + uint32_t temp[5] = { 0 }; + + const int num_bits = g_ise_range_table[range][0]; + + int group_size = 0; + if (g_ise_range_table[range][1]) + group_size = 5; + else if (g_ise_range_table[range][2]) + group_size = 3; + +#ifndef NDEBUG + const uint32_t num_levels = get_ise_levels(range); + for (int i = 0; i < num_vals; i++) + { + assert(pSrc_vals[i] < num_levels); + } +#endif + + if (group_size) + { + // Range has trits or quints - pack each group of 5 or 3 values + const int total_groups = (group_size == 5) ? 
((num_vals + 4) / 5) : ((num_vals + 2) / 3); + + for (int group_index = 0; group_index < total_groups; group_index++) + { + uint8_t vals[5] = { 0 }; + + const int limit = my_min(group_size, num_vals - group_index * group_size); + for (int i = 0; i < limit; i++) + vals[i] = pSrc_vals[group_index * group_size + i]; + + if (group_size == 5) + astc_encode_trits(temp, vals, bit_pos, num_bits); + else + astc_encode_quints(temp, vals, bit_pos, num_bits); + } + } + else + { + for (int i = 0; i < num_vals; i++) + astc_set_bits_1_to_9(temp, bit_pos, pSrc_vals[i], num_bits); + } + + // TODO: Could this write too many bits on incomplete blocks? + pDst[0] |= temp[0]; pDst[1] |= temp[1]; + pDst[2] |= temp[2]; pDst[3] |= temp[3]; + } + + inline uint32_t rev_dword(uint32_t bits) + { + uint32_t v = (bits << 16) | (bits >> 16); + v = ((v & 0x00ff00ff) << 8) | ((v & 0xff00ff00) >> 8); v = ((v & 0x0f0f0f0f) << 4) | ((v & 0xf0f0f0f0) >> 4); + v = ((v & 0x33333333) << 2) | ((v & 0xcccccccc) >> 2); v = ((v & 0x55555555) << 1) | ((v & 0xaaaaaaaa) >> 1); + return v; + } + + static inline bool is_packable(int value, int num_bits) { assert((num_bits >= 1) && (num_bits < 31)); return (value >= 0) && (value < (1 << num_bits)); } + + static bool get_config_bits(const log_astc_block &log_block, uint32_t &config_bits) + { + config_bits = 0; + + const int W = log_block.m_grid_width, H = log_block.m_grid_height; + + const uint32_t P = log_block.m_weight_ise_range >= 6; // high precision + const uint32_t Dp_P = (log_block.m_dual_plane << 1) | P; // pack dual plane+high precision bits + + // See Tables 81-82 + // Compute p from weight range + uint32_t p = 2 + log_block.m_weight_ise_range - (P ? 6 : 0); + + // Rearrange p's bits to p0 p2 p1 + p = (p >> 1) + ((p & 1) << 2); + + // Try encoding each row of table 82. 
+ + // W+4 H+2 + if (is_packable(W - 4, 2) && is_packable(H - 2, 2)) + { + config_bits = (Dp_P << 9) | ((W - 4) << 7) | ((H - 2) << 5) | ((p & 4) << 2) | (p & 3); + return true; + } + + // W+8 H+2 + if (is_packable(W - 8, 2) && is_packable(H - 2, 2)) + { + config_bits = (Dp_P << 9) | ((W - 8) << 7) | ((H - 2) << 5) | ((p & 4) << 2) | 4 | (p & 3); + return true; + } + + // W+2 H+8 + if (is_packable(W - 2, 2) && is_packable(H - 8, 2)) + { + config_bits = (Dp_P << 9) | ((H - 8) << 7) | ((W - 2) << 5) | ((p & 4) << 2) | 8 | (p & 3); + return true; + } + + // W+2 H+6 + if (is_packable(W - 2, 2) && is_packable(H - 6, 1)) + { + config_bits = (Dp_P << 9) | ((H - 6) << 7) | ((W - 2) << 5) | ((p & 4) << 2) | 12 | (p & 3); + return true; + } + + // W+2 H+2 + if (is_packable(W - 2, 1) && is_packable(H - 2, 2)) + { + config_bits = (Dp_P << 9) | ((W) << 7) | ((H - 2) << 5) | ((p & 4) << 2) | 12 | (p & 3); + return true; + } + + // 12 H+2 + if ((W == 12) && is_packable(H - 2, 2)) + { + config_bits = (Dp_P << 9) | ((H - 2) << 5) | (p << 2); + return true; + } + + // W+2 12 + if ((H == 12) && is_packable(W - 2, 2)) + { + config_bits = (Dp_P << 9) | (1 << 7) | ((W - 2) << 5) | (p << 2); + return true; + } + + // 6 10 + if ((W == 6) && (H == 10)) + { + config_bits = (Dp_P << 9) | (3 << 7) | (p << 2); + return true; + } + + // 10 6 + if ((W == 10) && (H == 6)) + { + config_bits = (Dp_P << 9) | (0b1101 << 5) | (p << 2); + return true; + } + + // W+6 H+6 (no dual plane or high prec) + if ((!Dp_P) && is_packable(W - 6, 2) && is_packable(H - 6, 2)) + { + config_bits = ((H - 6) << 9) | 256 | ((W - 6) << 5) | (p << 2); + return true; + } + + // Failed: unsupported weight grid dimensions or config. 
+ return false; + } + + bool pack_astc_block(astc_block& phys_block, const log_astc_block& log_block, int* pExpected_endpoint_range) + { + memset(&phys_block, 0, sizeof(phys_block)); + + if (pExpected_endpoint_range) + *pExpected_endpoint_range = -1; + + assert(!log_block.m_error_flag); + if (log_block.m_error_flag) + return false; + + if (log_block.m_solid_color_flag_ldr) + { + pack_void_extent_ldr(phys_block, log_block.m_solid_color[0], log_block.m_solid_color[1], log_block.m_solid_color[2], log_block.m_solid_color[3]); + return true; + } + else if (log_block.m_solid_color_flag_hdr) + { + pack_void_extent_hdr(phys_block, log_block.m_solid_color[0], log_block.m_solid_color[1], log_block.m_solid_color[2], log_block.m_solid_color[3]); + return true; + } + + if ((log_block.m_num_partitions < 1) || (log_block.m_num_partitions > MAX_PARTITIONS)) + return false; + + // Max usable weight range is 11 + if (log_block.m_weight_ise_range > LAST_VALID_WEIGHT_ISE_RANGE) + return false; + + // See 23.24 Illegal Encodings, [0,5] is the minimum ISE encoding for endpoints + if ((log_block.m_endpoint_ise_range < FIRST_VALID_ENDPOINT_ISE_RANGE) || (log_block.m_endpoint_ise_range > LAST_VALID_ENDPOINT_ISE_RANGE)) + return false; + + if (log_block.m_color_component_selector > 3) + return false; + + uint32_t config_bits = 0; + if (!get_config_bits(log_block, config_bits)) + return false; + + uint32_t bit_pos = 0; + astc_set_bits(&phys_block.m_vals[0], bit_pos, config_bits, 11); + + const uint32_t total_grid_weights = (log_block.m_dual_plane ? 
2 : 1) * (log_block.m_grid_width * log_block.m_grid_height); + const uint32_t total_weight_bits = get_ise_sequence_bits(total_grid_weights, log_block.m_weight_ise_range); + + // 18.24 Illegal Encodings + if ((!total_grid_weights) || (total_grid_weights > MAX_GRID_WEIGHTS) || (total_weight_bits < 24) || (total_weight_bits > 96)) + return false; + + uint32_t total_extra_bits = 0; + + astc_set_bits(&phys_block.m_vals[0], bit_pos, log_block.m_num_partitions - 1, 2); + + if (log_block.m_num_partitions > 1) + { + if (log_block.m_partition_id >= NUM_PARTITION_PATTERNS) + return false; + + astc_set_bits(&phys_block.m_vals[0], bit_pos, log_block.m_partition_id, 10); + + uint32_t highest_cem = 0, lowest_cem = UINT32_MAX; + for (uint32_t j = 0; j < log_block.m_num_partitions; j++) + { + highest_cem = my_max(highest_cem, log_block.m_color_endpoint_modes[j]); + lowest_cem = my_min(lowest_cem, log_block.m_color_endpoint_modes[j]); + } + + if (highest_cem > 15) + return false; + + // Ensure CEM range is contiguous + if (((highest_cem >> 2) > (1 + (lowest_cem >> 2)))) + return false; + + // See tables 79/80 + uint32_t encoded_cem = log_block.m_color_endpoint_modes[0] << 2; + if (lowest_cem != highest_cem) + { + encoded_cem = my_min(3, 1 + (lowest_cem >> 2)); + + // See tables at 23.11 Color Endpoint Mode + for (uint32_t j = 0; j < log_block.m_num_partitions; j++) + { + const int M = log_block.m_color_endpoint_modes[j] & 3; + + const int C = (log_block.m_color_endpoint_modes[j] >> 2) - ((encoded_cem & 3) - 1); + if ((C & 1) != C) + return false; + + encoded_cem |= (C << (2 + j)) | (M << (2 + log_block.m_num_partitions + 2 * j)); + } + + total_extra_bits = 3 * log_block.m_num_partitions - 4; + + if ((total_weight_bits + total_extra_bits) > 128) + return false; + + uint32_t cem_bit_pos = 128 - total_weight_bits - total_extra_bits; + astc_set_bits(&phys_block.m_vals[0], cem_bit_pos, encoded_cem >> 6, total_extra_bits); + } + + astc_set_bits(&phys_block.m_vals[0], bit_pos, encoded_cem 
& 0x3f, 6); + } + else + { + if (log_block.m_partition_id) + return false; + if (log_block.m_color_endpoint_modes[0] > 15) + return false; + + astc_set_bits(&phys_block.m_vals[0], bit_pos, log_block.m_color_endpoint_modes[0], 4); + } + + if (log_block.m_dual_plane) + { + if (log_block.m_num_partitions > 3) + return false; + + total_extra_bits += 2; + + uint32_t ccs_bit_pos = 128 - (int)total_weight_bits - (int)total_extra_bits; + astc_set_bits(&phys_block.m_vals[0], ccs_bit_pos, log_block.m_color_component_selector, 2); + } + + const uint32_t total_config_bits = bit_pos + total_extra_bits; + const int num_remaining_bits = 128 - (int)total_config_bits - (int)total_weight_bits; + if (num_remaining_bits < 0) + return false; + + uint32_t total_cem_vals = 0; + for (uint32_t j = 0; j < log_block.m_num_partitions; j++) + total_cem_vals += 2 + 2 * (log_block.m_color_endpoint_modes[j] >> 2); + + if (total_cem_vals > MAX_ENDPOINTS) + return false; + + int endpoint_ise_range = -1; + for (int k = 20; k > 0; k--) + { + int bits = get_ise_sequence_bits(total_cem_vals, k); + if (bits <= num_remaining_bits) + { + endpoint_ise_range = k; + break; + } + } + + // See 23.24 Illegal Encodings, [0,5] is the minimum ISE encoding for endpoints + if (endpoint_ise_range < (int)FIRST_VALID_ENDPOINT_ISE_RANGE) + return false; + + // Ensure the caller utilized the right endpoint ISE range. 
+ if ((int)log_block.m_endpoint_ise_range != endpoint_ise_range) + { + if (pExpected_endpoint_range) + *pExpected_endpoint_range = endpoint_ise_range; + return false; + } + + // Pack endpoints forwards + encode_bise(&phys_block.m_vals[0], log_block.m_endpoints, bit_pos, total_cem_vals, endpoint_ise_range); + + // Pack weights backwards + uint32_t weight_data[4] = { 0 }; + encode_bise(weight_data, log_block.m_weights, 0, total_grid_weights, log_block.m_weight_ise_range); + + for (uint32_t i = 0; i < 4; i++) + phys_block.m_vals[i] |= rev_dword(weight_data[3 - i]); + + return true; + } + + static inline uint32_t bit_replication_scale(uint32_t src, int num_src_bits, int num_dst_bits) + { + assert(num_src_bits <= num_dst_bits); + assert((src & ((1 << num_src_bits) - 1)) == src); + + uint32_t dst = 0; + for (int shift = num_dst_bits - num_src_bits; shift > -num_src_bits; shift -= num_src_bits) + dst |= (shift >= 0) ? (src << shift) : (src >> -shift); + + return dst; + } + + uint32_t dequant_bise_endpoint(uint32_t val, uint32_t ise_range) + { + assert((ise_range >= FIRST_VALID_ENDPOINT_ISE_RANGE) && (ise_range <= LAST_VALID_ENDPOINT_ISE_RANGE)); + assert(val < get_ise_levels(ise_range)); + + uint32_t u = 0; + + switch (ise_range) + { + case 5: + { + u = bit_replication_scale(val, 3, 8); + break; + } + case 8: + { + u = bit_replication_scale(val, 4, 8); + break; + } + case 11: + { + u = bit_replication_scale(val, 5, 8); + break; + } + case 14: + { + u = bit_replication_scale(val, 6, 8); + break; + } + case 17: + { + u = bit_replication_scale(val, 7, 8); + break; + } + case 20: + { + u = val; + break; + } + case 4: + case 6: + case 7: + case 9: + case 10: + case 12: + case 13: + case 15: + case 16: + case 18: + case 19: + { + const uint32_t num_bits = g_ise_range_table[ise_range][0]; + const uint32_t num_trits = g_ise_range_table[ise_range][1]; BASISU_NOTE_UNUSED(num_trits); + const uint32_t num_quints = g_ise_range_table[ise_range][2]; BASISU_NOTE_UNUSED(num_quints); + + 
// compute Table 103 row index + const int range_index = (num_bits * 2 + (num_quints ? 1 : 0)) - 2; + + assert(range_index >= 0 && range_index <= 10); + + uint32_t bits = val & ((1 << num_bits) - 1); + uint32_t tval = val >> num_bits; + + assert(tval < (num_trits ? 3U : 5U)); + + uint32_t a = bits & 1; + uint32_t b = (bits >> 1) & 1; + uint32_t c = (bits >> 2) & 1; + uint32_t d = (bits >> 3) & 1; + uint32_t e = (bits >> 4) & 1; + uint32_t f = (bits >> 5) & 1; + + uint32_t A = a ? 511 : 0; + uint32_t B = 0; + + switch (range_index) + { + case 2: + { + // 876543210 + // b000b0bb0 + B = (b << 1) | (b << 2) | (b << 4) | (b << 8); + break; + } + case 3: + { + // 876543210 + // b0000bb00 + B = (b << 2) | (b << 3) | (b << 8); + break; + } + case 4: + { + // 876543210 + // cb000cbcb + B = b | (c << 1) | (b << 2) | (c << 3) | (b << 7) | (c << 8); + break; + } + case 5: + { + // 876543210 + // cb0000cbc + B = c | (b << 1) | (c << 2) | (b << 7) | (c << 8); + break; + } + case 6: + { + // 876543210 + // dcb000dcb + B = b | (c << 1) | (d << 2) | (b << 6) | (c << 7) | (d << 8); + break; + } + case 7: + { + // 876543210 + // dcb0000dc + B = c | (d << 1) | (b << 6) | (c << 7) | (d << 8); + break; + } + case 8: + { + // 876543210 + // edcb000ed + B = d | (e << 1) | (b << 5) | (c << 6) | (d << 7) | (e << 8); + break; + } + case 9: + { + // 876543210 + // edcb0000e + B = e | (b << 5) | (c << 6) | (d << 7) | (e << 8); + break; + } + case 10: + { + // 876543210 + // fedcb000f + B = f | (b << 4) | (c << 5) | (d << 6) | (e << 7) | (f << 8); + break; + } + default: + break; + } + + static uint8_t C_vals[11] = { 204, 113, 93, 54, 44, 26, 22, 13, 11, 6, 5 }; + uint32_t C = C_vals[range_index]; + uint32_t D = tval; + + u = D * C + B; + u = u ^ A; + u = (A & 0x80) | (u >> 2); + + break; + } + default: + { + assert(0); + break; + } + } + + return u; + } + + uint32_t dequant_bise_weight(uint32_t val, uint32_t ise_range) + { + assert(val < get_ise_levels(ise_range)); + + uint32_t u = 0; + switch 
(ise_range) + { + case 0: + { + u = val ? 63 : 0; + break; + } + case 1: // 0-2 + { + const uint8_t s_tab_0_2[3] = { 0, 32, 63 }; + u = s_tab_0_2[val]; + break; + } + case 2: // 0-3 + { + u = bit_replication_scale(val, 2, 6); + break; + } + case 3: // 0-4 + { + const uint8_t s_tab_0_4[5] = { 0, 16, 32, 47, 63 }; + u = s_tab_0_4[val]; + break; + } + case 5: // 0-7 + { + u = bit_replication_scale(val, 3, 6); + break; + } + case 8: // 0-15 + { + u = bit_replication_scale(val, 4, 6); + break; + } + case 11: // 0-31 + { + u = bit_replication_scale(val, 5, 6); + break; + } + case 4: // 0-5 + case 6: // 0-9 + case 7: // 0-11 + case 9: // 0-19 + case 10: // 0-23 + { + const uint32_t num_bits = g_ise_range_table[ise_range][0]; + const uint32_t num_trits = g_ise_range_table[ise_range][1]; BASISU_NOTE_UNUSED(num_trits); + const uint32_t num_quints = g_ise_range_table[ise_range][2]; BASISU_NOTE_UNUSED(num_quints); + + // compute Table 103 row index + const int range_index = num_bits * 2 + (num_quints ? 1 : 0); + + // Extract bits and tris/quints from value + const uint32_t bits = val & ((1u << num_bits) - 1); + const uint32_t D = val >> num_bits; + + assert(D < (num_trits ? 3U : 5U)); + + // Now dequantize + // See Table 103. ASTC weight unquantization parameters + static const uint32_t C_table[5] = { 50, 28, 23, 13, 11 }; + + const uint32_t a = bits & 1, b = (bits >> 1) & 1, c = (bits >> 2) & 1; + + const uint32_t A = (a == 0) ? 0 : 0x7F; + + uint32_t B = 0; + if (range_index == 4) + B = ((b << 6) | (b << 2) | (b << 0)); + else if (range_index == 5) + B = ((b << 6) | (b << 1)); + else if (range_index == 6) + B = ((c << 6) | (b << 5) | (c << 1) | (b << 0)); + + const uint32_t C = C_table[range_index - 2]; + + u = D * C + B; + u = u ^ A; + u = (A & 0x20) | (u >> 2); + break; + } + default: + assert(0); + break; + } + + if (u > 32) + u++; + + return u; + } + + // Returns the nearest ISE symbol given a [0,255] endpoint value. 
+ uint32_t find_nearest_bise_endpoint(int v, uint32_t ise_range) + { + assert(ise_range >= FIRST_VALID_ENDPOINT_ISE_RANGE && ise_range <= LAST_VALID_ENDPOINT_ISE_RANGE); + + const uint32_t total_levels = get_ise_levels(ise_range); + int best_e = INT_MAX, best_index = 0; + for (uint32_t i = 0; i < total_levels; i++) + { + const int qv = dequant_bise_endpoint(i, ise_range); + int e = labs(v - qv); + if (e < best_e) + { + best_e = e; + best_index = i; + if (!best_e) + break; + } + } + return best_index; + } + + // Returns the nearest ISE weight given a [0,64] endpoint value. + uint32_t find_nearest_bise_weight(int v, uint32_t ise_range) + { + assert(ise_range >= FIRST_VALID_WEIGHT_ISE_RANGE && ise_range <= LAST_VALID_WEIGHT_ISE_RANGE); + assert(v <= (int)MAX_WEIGHT_VALUE); + + const uint32_t total_levels = get_ise_levels(ise_range); + int best_e = INT_MAX, best_index = 0; + for (uint32_t i = 0; i < total_levels; i++) + { + const int qv = dequant_bise_weight(i, ise_range); + int e = labs(v - qv); + if (e < best_e) + { + best_e = e; + best_index = i; + if (!best_e) + break; + } + } + return best_index; + } + + void create_quant_tables( + uint8_t* pVal_to_ise, // [0-255] or [0-64] value to nearest ISE symbol, array size is [256] or [65] + uint8_t* pISE_to_val, // ASTC encoded ISE symbol to [0,255] or [0,64] value, [levels] + uint8_t* pISE_to_rank, // returns the level rank index given an ISE symbol, [levels] + uint8_t* pRank_to_ISE, // returns the ISE symbol given a level rank, inverse of pISE_to_rank, [levels] + uint32_t ise_range, // ise range, [4,20] for endpoints, [0,11] for weights + bool weight_flag) // false if block endpoints, true if weights + { + const uint32_t num_dequant_vals = weight_flag ? (MAX_WEIGHT_VALUE + 1) : 256; + + for (uint32_t i = 0; i < num_dequant_vals; i++) + { + uint32_t bise_index = weight_flag ? 
astc_helpers::find_nearest_bise_weight(i, ise_range) : astc_helpers::find_nearest_bise_endpoint(i, ise_range); + + if (pVal_to_ise) + pVal_to_ise[i] = (uint8_t)bise_index; + + if (pISE_to_val) + pISE_to_val[bise_index] = weight_flag ? (uint8_t)astc_helpers::dequant_bise_weight(bise_index, ise_range) : (uint8_t)astc_helpers::dequant_bise_endpoint(bise_index, ise_range); + } + + if (pISE_to_rank || pRank_to_ISE) + { + const uint32_t num_levels = get_ise_levels(ise_range); + + if (!g_ise_range_table[ise_range][1] && !g_ise_range_table[ise_range][2]) + { + // Only bits + for (uint32_t i = 0; i < num_levels; i++) + { + if (pISE_to_rank) + pISE_to_rank[i] = (uint8_t)i; + + if (pRank_to_ISE) + pRank_to_ISE[i] = (uint8_t)i; + } + } + else + { + // Range has trits or quints + uint32_t vals[256]; + for (uint32_t i = 0; i < num_levels; i++) + { + uint32_t v = weight_flag ? astc_helpers::dequant_bise_weight(i, ise_range) : astc_helpers::dequant_bise_endpoint(i, ise_range); + + // Low=ISE value + // High=dequantized value + vals[i] = (v << 16) | i; + } + + // Sorts by dequantized value + std::sort(vals, vals + num_levels); + + for (uint32_t rank = 0; rank < num_levels; rank++) + { + uint32_t ise_val = (uint8_t)vals[rank]; + + if (pISE_to_rank) + pISE_to_rank[ise_val] = (uint8_t)rank; + + if (pRank_to_ISE) + pRank_to_ISE[rank] = (uint8_t)ise_val; + } + } + } + } + + void pack_void_extent_ldr(astc_block &blk, uint16_t rh, uint16_t gh, uint16_t bh, uint16_t ah) + { + uint8_t* pDst = (uint8_t*)&blk.m_vals[0]; + memset(pDst, 0xFF, 16); + + pDst[0] = 0b11111100; + pDst[1] = 0b11111101; + + pDst[8] = (uint8_t)rh; + pDst[9] = (uint8_t)(rh >> 8); + pDst[10] = (uint8_t)gh; + pDst[11] = (uint8_t)(gh >> 8); + pDst[12] = (uint8_t)bh; + pDst[13] = (uint8_t)(bh >> 8); + pDst[14] = (uint8_t)ah; + pDst[15] = (uint8_t)(ah >> 8); + } + + // rh-ah are half-floats + void pack_void_extent_hdr(astc_block& blk, uint16_t rh, uint16_t gh, uint16_t bh, uint16_t ah) + { + uint8_t* pDst = 
(uint8_t*)&blk.m_vals[0]; + memset(pDst, 0xFF, 16); + + pDst[0] = 0b11111100; + + pDst[8] = (uint8_t)rh; + pDst[9] = (uint8_t)(rh >> 8); + pDst[10] = (uint8_t)gh; + pDst[11] = (uint8_t)(gh >> 8); + pDst[12] = (uint8_t)bh; + pDst[13] = (uint8_t)(bh >> 8); + pDst[14] = (uint8_t)ah; + pDst[15] = (uint8_t)(ah >> 8); + } + + bool is_cem_ldr(uint32_t mode) + { + switch (mode) + { + case CEM_LDR_LUM_DIRECT: + case CEM_LDR_LUM_BASE_PLUS_OFS: + case CEM_LDR_LUM_ALPHA_DIRECT: + case CEM_LDR_LUM_ALPHA_BASE_PLUS_OFS: + case CEM_LDR_RGB_BASE_SCALE: + case CEM_LDR_RGB_DIRECT: + case CEM_LDR_RGB_BASE_PLUS_OFFSET: + case CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A: + case CEM_LDR_RGBA_DIRECT: + case CEM_LDR_RGBA_BASE_PLUS_OFFSET: + return true; + default: + break; + } + + return false; + } + + bool is_valid_block_size(uint32_t w, uint32_t h) + { + assert((w >= MIN_BLOCK_DIM) && (w <= MAX_BLOCK_DIM)); + assert((h >= MIN_BLOCK_DIM) && (h <= MAX_BLOCK_DIM)); + +#define SIZECHK(x, y) if ((w == (x)) && (h == (y))) return true; + SIZECHK(4, 4); + SIZECHK(5, 4); + + SIZECHK(5, 5); + + SIZECHK(6, 5); + SIZECHK(6, 6); + + SIZECHK(8, 5); + SIZECHK(8, 6); + SIZECHK(10, 5); + SIZECHK(10, 6); + + SIZECHK(8, 8); + SIZECHK(10, 8); + SIZECHK(10, 10); + + SIZECHK(12, 10); + SIZECHK(12, 12); +#undef SIZECHK + + return false; + } + + bool block_has_any_hdr_cems(const log_astc_block& log_blk) + { + assert((log_blk.m_num_partitions >= 1) && (log_blk.m_num_partitions <= MAX_PARTITIONS)); + + for (uint32_t i = 0; i < log_blk.m_num_partitions; i++) + if (is_cem_hdr(log_blk.m_color_endpoint_modes[i])) + return true; + + return false; + } + + bool block_has_any_ldr_cems(const log_astc_block& log_blk) + { + assert((log_blk.m_num_partitions >= 1) && (log_blk.m_num_partitions <= MAX_PARTITIONS)); + + for (uint32_t i = 0; i < log_blk.m_num_partitions; i++) + if (!is_cem_hdr(log_blk.m_color_endpoint_modes[i])) + return true; + + return false; + } + + dequant_tables g_dequant_tables; + + void 
precompute_texel_partitions_4x4(); + + void init_tables(bool init_rank_tabs) + { + g_dequant_tables.init(init_rank_tabs); + + precompute_texel_partitions_4x4(); + } + + struct weighted_sample + { + uint8_t m_src_x; + uint8_t m_src_y; + uint8_t m_weights[2][2]; // [y][x], scaled by 16, round by adding 8 + }; + + static void compute_upsample_weights( + int block_width, int block_height, + int weight_grid_width, int weight_grid_height, + weighted_sample* pWeights) // there will be block_width * block_height bilinear samples + { + const uint32_t scaleX = (1024 + block_width / 2) / (block_width - 1); + const uint32_t scaleY = (1024 + block_height / 2) / (block_height - 1); + + for (int texelY = 0; texelY < block_height; texelY++) + { + for (int texelX = 0; texelX < block_width; texelX++) + { + const uint32_t gX = (scaleX * texelX * (weight_grid_width - 1) + 32) >> 6; + const uint32_t gY = (scaleY * texelY * (weight_grid_height - 1) + 32) >> 6; + const uint32_t jX = gX >> 4; + const uint32_t jY = gY >> 4; + const uint32_t fX = gX & 0xf; + const uint32_t fY = gY & 0xf; + const uint32_t w11 = (fX * fY + 8) >> 4; + const uint32_t w10 = fY - w11; + const uint32_t w01 = fX - w11; + const uint32_t w00 = 16 - fX - fY + w11; + + weighted_sample& s = pWeights[texelX + texelY * block_width]; + s.m_src_x = (uint8_t)jX; + s.m_src_y = (uint8_t)jY; + s.m_weights[0][0] = (uint8_t)w00; + s.m_weights[0][1] = (uint8_t)w01; + s.m_weights[1][0] = (uint8_t)w10; + s.m_weights[1][1] = (uint8_t)w11; + } + } + } + + // Should be dequantized [0,64] weights + static void upsample_weight_grid( + uint32_t bx, uint32_t by, // destination/to dimension + uint32_t wx, uint32_t wy, // source/from dimension + const uint8_t* pSrc_weights, // these are dequantized [0,64] weights, NOT ISE symbols, [wy][wx] + uint8_t* pDst_weights) // [by][bx] + { + assert((bx >= 2) && (by >= 2) && (bx <= 12) && (by <= 12)); + assert((wx >= 2) && (wy >= 2) && (wx <= bx) && (wy <= by)); + + const uint32_t total_src_weights = 
wx * wy; + const uint32_t total_dst_weights = bx * by; + + if (total_src_weights == total_dst_weights) + { + memcpy(pDst_weights, pSrc_weights, total_src_weights); + return; + } + + weighted_sample weights[12 * 12]; + compute_upsample_weights(bx, by, wx, wy, weights); + + const weighted_sample* pS = weights; + + for (uint32_t y = 0; y < by; y++) + { + for (uint32_t x = 0; x < bx; x++, ++pS) + { + const uint32_t w00 = pS->m_weights[0][0]; + const uint32_t w01 = pS->m_weights[0][1]; + const uint32_t w10 = pS->m_weights[1][0]; + const uint32_t w11 = pS->m_weights[1][1]; + + assert(w00 || w01 || w10 || w11); + + const uint32_t sx = pS->m_src_x, sy = pS->m_src_y; + + uint32_t total = 8; + if (w00) total += pSrc_weights[bounds_check(sx + sy * wx, 0U, total_src_weights)] * w00; + if (w01) total += pSrc_weights[bounds_check(sx + 1 + sy * wx, 0U, total_src_weights)] * w01; + if (w10) total += pSrc_weights[bounds_check(sx + (sy + 1) * wx, 0U, total_src_weights)] * w10; + if (w11) total += pSrc_weights[bounds_check(sx + 1 + (sy + 1) * wx, 0U, total_src_weights)] * w11; + + pDst_weights[x + y * bx] = (uint8_t)(total >> 4); + } + } + } + + inline uint32_t hash52(uint32_t v) + { + uint32_t p = v; + p ^= p >> 15; p -= p << 17; p += p << 7; p += p << 4; + p ^= p >> 5; p += p << 16; p ^= p >> 7; p ^= p >> 3; + p ^= p << 6; p ^= p >> 17; + return p; + } + + int compute_texel_partition(uint32_t seedIn, uint32_t xIn, uint32_t yIn, uint32_t zIn, int num_partitions, bool small_block) + { + assert(zIn == 0); + + const uint32_t x = small_block ? xIn << 1 : xIn; + const uint32_t y = small_block ? yIn << 1 : yIn; + const uint32_t z = small_block ? 
zIn << 1 : zIn; + const uint32_t seed = seedIn + 1024 * (num_partitions - 1); + const uint32_t rnum = hash52(seed); + + uint8_t seed1 = (uint8_t)(rnum & 0xf); + uint8_t seed2 = (uint8_t)((rnum >> 4) & 0xf); + uint8_t seed3 = (uint8_t)((rnum >> 8) & 0xf); + uint8_t seed4 = (uint8_t)((rnum >> 12) & 0xf); + uint8_t seed5 = (uint8_t)((rnum >> 16) & 0xf); + uint8_t seed6 = (uint8_t)((rnum >> 20) & 0xf); + uint8_t seed7 = (uint8_t)((rnum >> 24) & 0xf); + uint8_t seed8 = (uint8_t)((rnum >> 28) & 0xf); + uint8_t seed9 = (uint8_t)((rnum >> 18) & 0xf); + uint8_t seed10 = (uint8_t)((rnum >> 22) & 0xf); + uint8_t seed11 = (uint8_t)((rnum >> 26) & 0xf); + uint8_t seed12 = (uint8_t)(((rnum >> 30) | (rnum << 2)) & 0xf); + + seed1 = (uint8_t)(seed1 * seed1); + seed2 = (uint8_t)(seed2 * seed2); + seed3 = (uint8_t)(seed3 * seed3); + seed4 = (uint8_t)(seed4 * seed4); + seed5 = (uint8_t)(seed5 * seed5); + seed6 = (uint8_t)(seed6 * seed6); + seed7 = (uint8_t)(seed7 * seed7); + seed8 = (uint8_t)(seed8 * seed8); + seed9 = (uint8_t)(seed9 * seed9); + seed10 = (uint8_t)(seed10 * seed10); + seed11 = (uint8_t)(seed11 * seed11); + seed12 = (uint8_t)(seed12 * seed12); + + const int shA = (seed & 2) != 0 ? 4 : 5; + const int shB = (num_partitions == 3) ? 6 : 5; + const int sh1 = (seed & 1) != 0 ? shA : shB; + const int sh2 = (seed & 1) != 0 ? shB : shA; + const int sh3 = (seed & 0x10) != 0 ? 
sh1 : sh2; + + seed1 = (uint8_t)(seed1 >> sh1); + seed2 = (uint8_t)(seed2 >> sh2); + seed3 = (uint8_t)(seed3 >> sh1); + seed4 = (uint8_t)(seed4 >> sh2); + seed5 = (uint8_t)(seed5 >> sh1); + seed6 = (uint8_t)(seed6 >> sh2); + seed7 = (uint8_t)(seed7 >> sh1); + seed8 = (uint8_t)(seed8 >> sh2); + seed9 = (uint8_t)(seed9 >> sh3); + seed10 = (uint8_t)(seed10 >> sh3); + seed11 = (uint8_t)(seed11 >> sh3); + seed12 = (uint8_t)(seed12 >> sh3); + + const int a = 0x3f & (seed1 * x + seed2 * y + seed11 * z + (rnum >> 14)); + const int b = 0x3f & (seed3 * x + seed4 * y + seed12 * z + (rnum >> 10)); + const int c = (num_partitions >= 3) ? 0x3f & (seed5 * x + seed6 * y + seed9 * z + (rnum >> 6)) : 0; + const int d = (num_partitions >= 4) ? 0x3f & (seed7 * x + seed8 * y + seed10 * z + (rnum >> 2)) : 0; + + return (a >= b && a >= c && a >= d) ? 0 + : (b >= c && b >= d) ? 1 + : (c >= d) ? 2 + : 3; + } + + static uint32_t g_texel_partitions_4x4[1024][2]; + + void precompute_texel_partitions_4x4() + { + for (uint32_t p = 0; p < 1024; p++) + { + uint32_t v2 = 0, v3 = 0; + + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + const uint32_t shift = x * 2 + y * 8; + v2 |= (compute_texel_partition(p, x, y, 0, 2, true) << shift); + v3 |= (compute_texel_partition(p, x, y, 0, 3, true) << shift); + } + } + + g_texel_partitions_4x4[p][0] = v2; + g_texel_partitions_4x4[p][1] = v3; + } + } + + static inline int get_precompute_texel_partitions_4x4(uint32_t seed, uint32_t x, uint32_t y, uint32_t num_partitions) + { + assert(g_texel_partitions_4x4[1][0]); + assert(seed < 1024); + assert((x <= 3) && (y <= 3)); + assert((num_partitions >= 2) && (num_partitions <= 3)); + + const uint32_t shift = x * 2 + y * 8; + return (g_texel_partitions_4x4[seed][num_partitions - 2] >> shift) & 3; + } + + void blue_contract( + int r, int g, int b, int a, + int &dr, int &dg, int &db, int &da) + { + dr = (r + b) >> 1; + dg = (g + b) >> 1; + db = b; + da = a; + } + + inline void 
bit_transfer_signed(int& a, int& b) + { + b >>= 1; + b |= (a & 0x80); + a >>= 1; + a &= 0x3F; + if ((a & 0x20) != 0) + a -= 0x40; + } + + static inline int clamp(int a, int l, int h) + { + if (a < l) + a = l; + else if (a > h) + a = h; + return a; + } + + static inline float clampf(float a, float l, float h) + { + if (a < l) + a = l; + else if (a > h) + a = h; + return a; + } + + inline int sign_extend(int src, int num_src_bits) + { + assert((num_src_bits >= 2) && (num_src_bits <= 31)); + + const bool negative = (src & (1 << (num_src_bits - 1))) != 0; + if (negative) + return src | ~((1 << num_src_bits) - 1); + else + return src & ((1 << num_src_bits) - 1); + } + + // endpoints is [4][2] + void decode_endpoint(uint32_t cem_index, int (*pEndpoints)[2], const uint8_t *pE) + { + assert(cem_index <= CEM_HDR_RGB_HDR_ALPHA); + + int v0 = pE[0], v1 = pE[1]; + + int& e0_r = pEndpoints[0][0], &e0_g = pEndpoints[1][0], &e0_b = pEndpoints[2][0], &e0_a = pEndpoints[3][0]; + int& e1_r = pEndpoints[0][1], &e1_g = pEndpoints[1][1], &e1_b = pEndpoints[2][1], &e1_a = pEndpoints[3][1]; + + switch (cem_index) + { + case CEM_LDR_LUM_DIRECT: + { + e0_r = v0; e1_r = v1; + e0_g = v0; e1_g = v1; + e0_b = v0; e1_b = v1; + e0_a = 0xFF; e1_a = 0xFF; + break; + } + case CEM_LDR_LUM_BASE_PLUS_OFS: + { + int l0 = (v0 >> 2) | (v1 & 0xc0); + int l1 = l0 + (v1 & 0x3f); + + if (l1 > 0xFF) + l1 = 0xFF; + + e0_r = l0; e1_r = l1; + e0_g = l0; e1_g = l1; + e0_b = l0; e1_b = l1; + e0_a = 0xFF; e1_a = 0xFF; + break; + } + case CEM_LDR_LUM_ALPHA_DIRECT: + { + int v2 = pE[2], v3 = pE[3]; + + e0_r = v0; e1_r = v1; + e0_g = v0; e1_g = v1; + e0_b = v0; e1_b = v1; + e0_a = v2; e1_a = v3; + break; + } + case CEM_LDR_LUM_ALPHA_BASE_PLUS_OFS: + { + int v2 = pE[2], v3 = pE[3]; + + bit_transfer_signed(v1, v0); + bit_transfer_signed(v3, v2); + + e0_r = v0; e1_r = v0 + v1; + e0_g = v0; e1_g = v0 + v1; + e0_b = v0; e1_b = v0 + v1; + e0_a = v2; e1_a = v2 + v3; + + for (uint32_t c = 0; c < 4; c++) + { + pEndpoints[c][0] 
= clamp(pEndpoints[c][0], 0, 255); + pEndpoints[c][1] = clamp(pEndpoints[c][1], 0, 255); + } + + break; + } + case CEM_LDR_RGB_BASE_SCALE: + { + int v2 = pE[2], v3 = pE[3]; + + e0_r = (v0 * v3) >> 8; e1_r = v0; + e0_g = (v1 * v3) >> 8; e1_g = v1; + e0_b = (v2 * v3) >> 8; e1_b = v2; + e0_a = 0xFF; e1_a = 0xFF; + + break; + } + case CEM_LDR_RGB_DIRECT: + { + int v2 = pE[2], v3 = pE[3], v4 = pE[4], v5 = pE[5]; + + if ((v1 + v3 + v5) >= (v0 + v2 + v4)) + { + e0_r = v0; e1_r = v1; + e0_g = v2; e1_g = v3; + e0_b = v4; e1_b = v5; + e0_a = 0xFF; e1_a = 0xFF; + } + else + { + blue_contract(v1, v3, v5, 0xFF, e0_r, e0_g, e0_b, e0_a); + blue_contract(v0, v2, v4, 0xFF, e1_r, e1_g, e1_b, e1_a); + } + + break; + } + case CEM_LDR_RGB_BASE_PLUS_OFFSET: + { + int v2 = pE[2], v3 = pE[3], v4 = pE[4], v5 = pE[5]; + + bit_transfer_signed(v1, v0); + bit_transfer_signed(v3, v2); + bit_transfer_signed(v5, v4); + + if ((v1 + v3 + v5) >= 0) + { + e0_r = v0; e1_r = v0 + v1; + e0_g = v2; e1_g = v2 + v3; + e0_b = v4; e1_b = v4 + v5; + e0_a = 0xFF; e1_a = 0xFF; + } + else + { + blue_contract(v0 + v1, v2 + v3, v4 + v5, 0xFF, e0_r, e0_g, e0_b, e0_a); + blue_contract(v0, v2, v4, 0xFF, e1_r, e1_g, e1_b, e1_a); + } + + for (uint32_t c = 0; c < 4; c++) + { + pEndpoints[c][0] = clamp(pEndpoints[c][0], 0, 255); + pEndpoints[c][1] = clamp(pEndpoints[c][1], 0, 255); + } + + break; + } + case CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A: + { + int v2 = pE[2], v3 = pE[3], v4 = pE[4], v5 = pE[5]; + + e0_r = (v0 * v3) >> 8; e1_r = v0; + e0_g = (v1 * v3) >> 8; e1_g = v1; + e0_b = (v2 * v3) >> 8; e1_b = v2; + e0_a = v4; e1_a = v5; + + break; + } + case CEM_LDR_RGBA_DIRECT: + { + int v2 = pE[2], v3 = pE[3], v4 = pE[4], v5 = pE[5], v6 = pE[6], v7 = pE[7]; + + if ((v1 + v3 + v5) >= (v0 + v2 + v4)) + { + e0_r = v0; e1_r = v1; + e0_g = v2; e1_g = v3; + e0_b = v4; e1_b = v5; + e0_a = v6; e1_a = v7; + } + else + { + blue_contract(v1, v3, v5, v7, e0_r, e0_g, e0_b, e0_a); + blue_contract(v0, v2, v4, v6, e1_r, e1_g, e1_b, e1_a); + 
} + + break; + } + case CEM_LDR_RGBA_BASE_PLUS_OFFSET: + { + int v2 = pE[2], v3 = pE[3], v4 = pE[4], v5 = pE[5], v6 = pE[6], v7 = pE[7]; + + bit_transfer_signed(v1, v0); + bit_transfer_signed(v3, v2); + bit_transfer_signed(v5, v4); + bit_transfer_signed(v7, v6); + + if ((v1 + v3 + v5) >= 0) + { + e0_r = v0; e1_r = v0 + v1; + e0_g = v2; e1_g = v2 + v3; + e0_b = v4; e1_b = v4 + v5; + e0_a = v6; e1_a = v6 + v7; + } + else + { + blue_contract(v0 + v1, v2 + v3, v4 + v5, v6 + v7, e0_r, e0_g, e0_b, e0_a); + blue_contract(v0, v2, v4, v6, e1_r, e1_g, e1_b, e1_a); + } + + for (uint32_t c = 0; c < 4; c++) + { + pEndpoints[c][0] = clamp(pEndpoints[c][0], 0, 255); + pEndpoints[c][1] = clamp(pEndpoints[c][1], 0, 255); + } + + break; + } + case CEM_HDR_LUM_LARGE_RANGE: + { + int y0, y1; + if (v1 >= v0) + { + y0 = (v0 << 4); + y1 = (v1 << 4); + } + else + { + y0 = (v1 << 4) + 8; + y1 = (v0 << 4) - 8; + } + + e0_r = y0; e1_r = y1; + e0_g = y0; e1_g = y1; + e0_b = y0; e1_b = y1; + e0_a = 0x780; e1_a = 0x780; + + break; + } + case CEM_HDR_LUM_SMALL_RANGE: + { + int y0, y1, d; + + if ((v0 & 0x80) != 0) + { + y0 = ((v1 & 0xE0) << 4) | ((v0 & 0x7F) << 2); + d = (v1 & 0x1F) << 2; + } + else + { + y0 = ((v1 & 0xF0) << 4) | ((v0 & 0x7F) << 1); + d = (v1 & 0x0F) << 1; + } + + y1 = y0 + d; + if (y1 > 0xFFF) + y1 = 0xFFF; + + e0_r = y0; e1_r = y1; + e0_g = y0; e1_g = y1; + e0_b = y0; e1_b = y1; + e0_a = 0x780; e1_a = 0x780; + + break; + } + case CEM_HDR_RGB_BASE_SCALE: + { + int v2 = pE[2], v3 = pE[3]; + + int modeval = ((v0 & 0xC0) >> 6) | ((v1 & 0x80) >> 5) | ((v2 & 0x80) >> 4); + + int majcomp, mode; + if ((modeval & 0xC) != 0xC) + { + majcomp = modeval >> 2; + mode = modeval & 3; + } + else if (modeval != 0xF) + { + majcomp = modeval & 3; + mode = 4; + } + else + { + majcomp = 0; + mode = 5; + } + + int red = v0 & 0x3f; + int green = v1 & 0x1f; + int blue = v2 & 0x1f; + int scale = v3 & 0x1f; + + int x0 = (v1 >> 6) & 1; + int x1 = (v1 >> 5) & 1; + int x2 = (v2 >> 6) & 1; + int x3 = (v2 >> 
5) & 1; + int x4 = (v3 >> 7) & 1; + int x5 = (v3 >> 6) & 1; + int x6 = (v3 >> 5) & 1; + + int ohm = 1 << mode; + if (ohm & 0x30) green |= x0 << 6; + if (ohm & 0x3A) green |= x1 << 5; + if (ohm & 0x30) blue |= x2 << 6; + if (ohm & 0x3A) blue |= x3 << 5; + if (ohm & 0x3D) scale |= x6 << 5; + if (ohm & 0x2D) scale |= x5 << 6; + if (ohm & 0x04) scale |= x4 << 7; + if (ohm & 0x3B) red |= x4 << 6; + if (ohm & 0x04) red |= x3 << 6; + if (ohm & 0x10) red |= x5 << 7; + if (ohm & 0x0F) red |= x2 << 7; + if (ohm & 0x05) red |= x1 << 8; + if (ohm & 0x0A) red |= x0 << 8; + if (ohm & 0x05) red |= x0 << 9; + if (ohm & 0x02) red |= x6 << 9; + if (ohm & 0x01) red |= x3 << 10; + if (ohm & 0x02) red |= x5 << 10; + + static const int s_shamts[6] = { 1,1,2,3,4,5 }; + + const int shamt = s_shamts[mode]; + red <<= shamt; + green <<= shamt; + blue <<= shamt; + scale <<= shamt; + + if (mode != 5) + { + green = red - green; + blue = red - blue; + } + + if (majcomp == 1) + std::swap(red, green); + + if (majcomp == 2) + std::swap(red, blue); + + e1_r = clamp(red, 0, 0xFFF); + e1_g = clamp(green, 0, 0xFFF); + e1_b = clamp(blue, 0, 0xFFF); + e1_a = 0x780; + + e0_r = clamp(red - scale, 0, 0xFFF); + e0_g = clamp(green - scale, 0, 0xFFF); + e0_b = clamp(blue - scale, 0, 0xFFF); + e0_a = 0x780; + + break; + } + case CEM_HDR_RGB_HDR_ALPHA: + case CEM_HDR_RGB_LDR_ALPHA: + case CEM_HDR_RGB: + { + int v2 = pE[2], v3 = pE[3], v4 = pE[4], v5 = pE[5]; + + int majcomp = ((v4 & 0x80) >> 7) | ((v5 & 0x80) >> 6); + + e0_a = 0x780; + e1_a = 0x780; + + if (majcomp == 3) + { + e0_r = v0 << 4; + e0_g = v2 << 4; + e0_b = (v4 & 0x7f) << 5; + + e1_r = v1 << 4; + e1_g = v3 << 4; + e1_b = (v5 & 0x7f) << 5; + } + else + { + int mode = ((v1 & 0x80) >> 7) | ((v2 & 0x80) >> 6) | ((v3 & 0x80) >> 5); + int va = v0 | ((v1 & 0x40) << 2); + int vb0 = v2 & 0x3f; + int vb1 = v3 & 0x3f; + int vc = v1 & 0x3f; + int vd0 = v4 & 0x7f; + int vd1 = v5 & 0x7f; + + static const int s_dbitstab[8] = { 7,6,7,6,5,6,5,6 }; + vd0 = 
sign_extend(vd0, s_dbitstab[mode]); + vd1 = sign_extend(vd1, s_dbitstab[mode]); + + int x0 = (v2 >> 6) & 1; + int x1 = (v3 >> 6) & 1; + int x2 = (v4 >> 6) & 1; + int x3 = (v5 >> 6) & 1; + int x4 = (v4 >> 5) & 1; + int x5 = (v5 >> 5) & 1; + + int ohm = 1 << mode; + if (ohm & 0xA4) va |= x0 << 9; + if (ohm & 0x08) va |= x2 << 9; + if (ohm & 0x50) va |= x4 << 9; + if (ohm & 0x50) va |= x5 << 10; + if (ohm & 0xA0) va |= x1 << 10; + if (ohm & 0xC0) va |= x2 << 11; + if (ohm & 0x04) vc |= x1 << 6; + if (ohm & 0xE8) vc |= x3 << 6; + if (ohm & 0x20) vc |= x2 << 7; + if (ohm & 0x5B) vb0 |= x0 << 6; + if (ohm & 0x5B) vb1 |= x1 << 6; + if (ohm & 0x12) vb0 |= x2 << 7; + if (ohm & 0x12) vb1 |= x3 << 7; + + int shamt = (mode >> 1) ^ 3; + va = (uint32_t)va << shamt; + vb0 = (uint32_t)vb0 << shamt; + vb1 = (uint32_t)vb1 << shamt; + vc = (uint32_t)vc << shamt; + vd0 = (uint32_t)vd0 << shamt; + vd1 = (uint32_t)vd1 << shamt; + + e1_r = clamp(va, 0, 0xFFF); + e1_g = clamp(va - vb0, 0, 0xFFF); + e1_b = clamp(va - vb1, 0, 0xFFF); + + e0_r = clamp(va - vc, 0, 0xFFF); + e0_g = clamp(va - vb0 - vc - vd0, 0, 0xFFF); + e0_b = clamp(va - vb1 - vc - vd1, 0, 0xFFF); + + if (majcomp == 1) + { + std::swap(e0_r, e0_g); + std::swap(e1_r, e1_g); + } + else if (majcomp == 2) + { + std::swap(e0_r, e0_b); + std::swap(e1_r, e1_b); + } + } + + if (cem_index == CEM_HDR_RGB_LDR_ALPHA) + { + int v6 = pE[6], v7 = pE[7]; + + e0_a = v6; + e1_a = v7; + } + else if (cem_index == CEM_HDR_RGB_HDR_ALPHA) + { + int v6 = pE[6], v7 = pE[7]; + + // Extract mode bits + int mode = ((v6 >> 7) & 1) | ((v7 >> 6) & 2); + v6 &= 0x7F; + v7 &= 0x7F; + + if (mode == 3) + { + e0_a = v6 << 5; + e1_a = v7 << 5; + } + else + { + v6 |= (v7 << (mode + 1)) & 0x780; + v7 &= (0x3F >> mode); + v7 ^= (0x20 >> mode); + v7 -= (0x20 >> mode); + v6 <<= (4 - mode); + v7 <<= (4 - mode); + + v7 += v6; + v7 = clamp(v7, 0, 0xFFF); + e0_a = v6; + e1_a = v7; + } + } + + break; + } + default: + { + assert(0); + for (uint32_t c = 0; c < 4; c++) + { + 
pEndpoints[c][0] = 0; + pEndpoints[c][1] = 0; + } + break; + } + } + } + + static inline bool is_half_inf_or_nan(half_float v) + { + return get_bits(v, 10, 14) == 31; + } + + // This float->half conversion matches how "F32TO16" works on Intel GPU's. + half_float float_to_half(float val, bool toward_zero) + { + union { float f; int32_t i; uint32_t u; } fi = { val }; + const int flt_m = fi.i & 0x7FFFFF, flt_e = (fi.i >> 23) & 0xFF, flt_s = (fi.i >> 31) & 0x1; + int s = flt_s, e = 0, m = 0; + + // inf/NaN + if (flt_e == 0xff) + { + e = 31; + if (flt_m != 0) // NaN + m = 1; + } + // not zero or denormal + else if (flt_e != 0) + { + int new_exp = flt_e - 127; + if (new_exp > 15) + e = 31; + else if (new_exp < -14) + { + if (toward_zero) + m = (int)truncf((1 << 24) * fabsf(fi.f)); + else + m = lrintf((1 << 24) * fabsf(fi.f)); + } + else + { + e = new_exp + 15; + if (toward_zero) + m = (int)truncf((float)flt_m * (1.0f / (float)(1 << 13))); + else + m = lrintf((float)flt_m * (1.0f / (float)(1 << 13))); + } + } + + assert((0 <= m) && (m <= 1024)); + if (m == 1024) + { + e++; + m = 0; + } + + assert((s >= 0) && (s <= 1)); + assert((e >= 0) && (e <= 31)); + assert((m >= 0) && (m <= 1023)); + + half_float result = (half_float)((s << 15) | (e << 10) | m); + return result; + } + + float half_to_float(half_float hval) + { + union { float f; uint32_t u; } x = { 0 }; + + uint32_t s = ((uint32_t)hval >> 15) & 1; + uint32_t e = ((uint32_t)hval >> 10) & 0x1F; + uint32_t m = (uint32_t)hval & 0x3FF; + + if (!e) + { + if (!m) + { + // +- 0 + x.u = s << 31; + return x.f; + } + else + { + // denormalized + while (!(m & 0x00000400)) + { + m <<= 1; + --e; + } + + ++e; + m &= ~0x00000400; + } + } + else if (e == 31) + { + if (m == 0) + { + // +/- INF + x.u = (s << 31) | 0x7f800000; + return x.f; + } + else + { + // +/- NaN + x.u = (s << 31) | 0x7f800000 | (m << 13); + return x.f; + } + } + + e = e + (127 - 15); + m = m << 13; + + assert(s <= 1); + assert(m <= 0x7FFFFF); + assert(e <= 255); + 
+ x.u = m | (e << 23) | (s << 31); + return x.f; + } + + static inline half_float qlog16_to_half(int k) + { + assert((k >= 0) && (k <= 0xFFFF)); + + int E = (k & 0xF800) >> 11; + int M = k & 0x7FF; + + int Mt; + if (M < 512) + Mt = 3 * M; + else if (M >= 1536) + Mt = 5 * M - 2048; + else + Mt = 4 * M - 512; + + return (half_float)((E << 10) + (Mt >> 3)); + } + + // See https://registry.khronos.org/OpenGL/extensions/EXT/EXT_texture_shared_exponent.txt + const int RGB9E5_EXPONENT_BITS = 5, RGB9E5_MANTISSA_BITS = 9, RGB9E5_EXP_BIAS = 15, RGB9E5_MAX_VALID_BIASED_EXP = 31; + const int MAX_RGB9E5_EXP = (RGB9E5_MAX_VALID_BIASED_EXP - RGB9E5_EXP_BIAS); + const int RGB9E5_MANTISSA_VALUES = (1 << RGB9E5_MANTISSA_BITS); + const int MAX_RGB9E5_MANTISSA = (RGB9E5_MANTISSA_VALUES - 1); + //const int MAX_RGB9E5 = (int)(((float)MAX_RGB9E5_MANTISSA) / RGB9E5_MANTISSA_VALUES * (1 << MAX_RGB9E5_EXP)); + const int EPSILON_RGB9E5 = (int)((1.0f / (float)RGB9E5_MANTISSA_VALUES) / (float)(1 << RGB9E5_EXP_BIAS)); + + void unpack_rgb9e5(uint32_t packed, float& r, float& g, float& b) + { + int x = packed & 511; + int y = (packed >> 9) & 511; + int z = (packed >> 18) & 511; + int w = (packed >> 27) & 31; + + const float scale = powf(2.0f, static_cast(w - RGB9E5_EXP_BIAS - RGB9E5_MANTISSA_BITS)); + + r = x * scale; + g = y * scale; + b = z * scale; + } + + // floor_log2 is not correct for the denorm and zero values, but we are going to do a max of this value with the minimum rgb9e5 exponent that will hide these problem cases. + static inline int floor_log2(float x) + { + union float754 + { + unsigned int raw; + float value; + }; + + float754 f; + f.value = x; + // Extract float exponent + return ((f.raw >> 23) & 0xFF) - 127; + } + + static inline int maximumi(int a, int b) { return (a > b) ? a : b; } + static inline float maximumf(float a, float b) { return (a > b) ? 
a : b; } + + uint32_t pack_rgb9e5(float r, float g, float b) + { + r = clampf(r, 0.0f, MAX_RGB9E5); + g = clampf(g, 0.0f, MAX_RGB9E5); + b = clampf(b, 0.0f, MAX_RGB9E5); + + float maxrgb = maximumf(maximumf(r, g), b); + int exp_shared = maximumi(-RGB9E5_EXP_BIAS - 1, floor_log2(maxrgb)) + 1 + RGB9E5_EXP_BIAS; + assert((exp_shared >= 0) && (exp_shared <= RGB9E5_MAX_VALID_BIASED_EXP)); + + float denom = powf(2.0f, (float)(exp_shared - RGB9E5_EXP_BIAS - RGB9E5_MANTISSA_BITS)); + + int maxm = (int)floorf((maxrgb / denom) + 0.5f); + if (maxm == (MAX_RGB9E5_MANTISSA + 1)) + { + denom *= 2; + exp_shared += 1; + assert(exp_shared <= RGB9E5_MAX_VALID_BIASED_EXP); + } + else + { + assert(maxm <= MAX_RGB9E5_MANTISSA); + } + + int rm = (int)floorf((r / denom) + 0.5f); + int gm = (int)floorf((g / denom) + 0.5f); + int bm = (int)floorf((b / denom) + 0.5f); + + assert((rm >= 0) && (rm <= MAX_RGB9E5_MANTISSA)); + assert((gm >= 0) && (gm <= MAX_RGB9E5_MANTISSA)); + assert((bm >= 0) && (bm <= MAX_RGB9E5_MANTISSA)); + + return rm | (gm << 9) | (bm << 18) | (exp_shared << 27); + } + + static inline int clz17(uint32_t x) + { + assert(x <= 0x1FFFF); + x &= 0x1FFFF; + + if (!x) + return 17; + + uint32_t n = 0; + while ((x & 0x10000) == 0) + { + x <<= 1u; + n++; + } + + return n; + } + + static inline uint32_t pack_rgb9e5_ldr_astc(int Cr, int Cg, int Cb) + { + int lz = clz17(Cr | Cg | Cb | 1); + if (Cr == 65535) { Cr = 65536; lz = 0; } + if (Cg == 65535) { Cg = 65536; lz = 0; } + if (Cb == 65535) { Cb = 65536; lz = 0; } + Cr <<= lz; Cg <<= lz; Cb <<= lz; + Cr = (Cr >> 8) & 0x1FF; + Cg = (Cg >> 8) & 0x1FF; + Cb = (Cb >> 8) & 0x1FF; + uint32_t exponent = 16 - lz; + uint32_t texel = (exponent << 27) | (Cb << 18) | (Cg << 9) | Cr; + return texel; + } + + static inline uint32_t pack_rgb9e5_hdr_astc(int Cr, int Cg, int Cb) + { + if (Cr > 0x7c00) Cr = 0; else if (Cr == 0x7c00) Cr = 0x7bff; + if (Cg > 0x7c00) Cg = 0; else if (Cg == 0x7c00) Cg = 0x7bff; + if (Cb > 0x7c00) Cb = 0; else if (Cb == 
0x7c00) Cb = 0x7bff; + int Re = (Cr >> 10) & 0x1F; + int Ge = (Cg >> 10) & 0x1F; + int Be = (Cb >> 10) & 0x1F; + int Rex = (Re == 0) ? 1 : Re; + int Gex = (Ge == 0) ? 1 : Ge; + int Bex = (Be == 0) ? 1 : Be; + int Xm = ((Cr | Cg | Cb) & 0x200) >> 9; + int Xe = Re | Ge | Be; + uint32_t rshift, gshift, bshift, expo; + + if (Xe == 0) + { + expo = rshift = gshift = bshift = Xm; + } + else if (Re >= Ge && Re >= Be) + { + expo = Rex + 1; + rshift = 2; + gshift = Rex - Gex + 2; + bshift = Rex - Bex + 2; + } + else if (Ge >= Be) + { + expo = Gex + 1; + rshift = Gex - Rex + 2; + gshift = 2; + bshift = Gex - Bex + 2; + } + else + { + expo = Bex + 1; + rshift = Bex - Rex + 2; + gshift = Bex - Gex + 2; + bshift = 2; + } + + int Rm = (Cr & 0x3FF) | (Re == 0 ? 0 : 0x400); + int Gm = (Cg & 0x3FF) | (Ge == 0 ? 0 : 0x400); + int Bm = (Cb & 0x3FF) | (Be == 0 ? 0 : 0x400); + Rm = (Rm >> rshift) & 0x1FF; + Gm = (Gm >> gshift) & 0x1FF; + Bm = (Bm >> bshift) & 0x1FF; + + uint32_t texel = (expo << 27) | (Bm << 18) | (Gm << 9) | (Rm << 0); + return texel; + } + + // Important: pPixels is either 32-bit/texel or 64-bit/texel. 
+ bool decode_block(const log_astc_block& log_blk, void* pPixels, uint32_t blk_width, uint32_t blk_height, decode_mode dec_mode) + { + assert(is_valid_block_size(blk_width, blk_height)); + + assert(g_dequant_tables.m_endpoints[0].m_ISE_to_val.size()); + if (!g_dequant_tables.m_endpoints[0].m_ISE_to_val.size()) + return false; + + const uint32_t num_blk_pixels = blk_width * blk_height; + + // Write block error color + if (dec_mode == cDecodeModeHDR16) + { + // NaN's + memset(pPixels, 0xFF, num_blk_pixels * sizeof(half_float) * 4); + } + else if (dec_mode == cDecodeModeRGB9E5) + { + const uint32_t purple_9e5 = pack_rgb9e5(1.0f, 0.0f, 1.0f); + + for (uint32_t i = 0; i < num_blk_pixels; i++) + ((uint32_t*)pPixels)[i] = purple_9e5; + } + else + { + for (uint32_t i = 0; i < num_blk_pixels; i++) + ((uint32_t*)pPixels)[i] = 0xFFFF00FF; + } + + if (log_blk.m_error_flag) + { + // Should this return false? It's not an invalid logical block config, though. + return false; + } + + // Handle solid color blocks + if (log_blk.m_solid_color_flag_ldr) + { + // LDR solid block + if (dec_mode == cDecodeModeHDR16) + { + // Convert LDR pixels to half-float + half_float h[4]; + for (uint32_t c = 0; c < 4; c++) + h[c] = (log_blk.m_solid_color[c] == 0xFFFF) ? 0x3C00 : float_to_half((float)log_blk.m_solid_color[c] * (1.0f / 65536.0f), true); + + for (uint32_t i = 0; i < num_blk_pixels; i++) + memcpy((uint16_t*)pPixels + i * 4, h, sizeof(half_float) * 4); + } + else if (dec_mode == cDecodeModeRGB9E5) + { + float r = (log_blk.m_solid_color[0] == 0xFFFF) ? 1.0f : ((float)log_blk.m_solid_color[0] * (1.0f / 65536.0f)); + float g = (log_blk.m_solid_color[1] == 0xFFFF) ? 1.0f : ((float)log_blk.m_solid_color[1] * (1.0f / 65536.0f)); + float b = (log_blk.m_solid_color[2] == 0xFFFF) ? 
1.0f : ((float)log_blk.m_solid_color[2] * (1.0f / 65536.0f)); + + const uint32_t packed = pack_rgb9e5(r, g, b); + + for (uint32_t i = 0; i < num_blk_pixels; i++) + ((uint32_t*)pPixels)[i] = packed; + } + else + { + // Convert LDR pixels to 8-bits + for (uint32_t i = 0; i < num_blk_pixels; i++) + for (uint32_t c = 0; c < 4; c++) + ((uint8_t*)pPixels)[i * 4 + c] = (log_blk.m_solid_color[c] >> 8); + } + + return true; + } + else if (log_blk.m_solid_color_flag_hdr) + { + // HDR solid block, decode mode must be half-float or RGB9E5 + if (dec_mode == cDecodeModeHDR16) + { + for (uint32_t i = 0; i < num_blk_pixels; i++) + memcpy((uint16_t*)pPixels + i * 4, log_blk.m_solid_color, sizeof(half_float) * 4); + } + else if (dec_mode == cDecodeModeRGB9E5) + { + float r = half_to_float(log_blk.m_solid_color[0]); + float g = half_to_float(log_blk.m_solid_color[1]); + float b = half_to_float(log_blk.m_solid_color[2]); + + const uint32_t packed = pack_rgb9e5(r, g, b); + + for (uint32_t i = 0; i < num_blk_pixels; i++) + ((uint32_t*)pPixels)[i] = packed; + } + else + { + return false; + } + + return true; + } + + // Sanity check block's config + if ((log_blk.m_grid_width < 2) || (log_blk.m_grid_height < 2)) + return false; + if ((log_blk.m_grid_width > blk_width) || (log_blk.m_grid_height > blk_height)) + return false; + + if ((log_blk.m_endpoint_ise_range < FIRST_VALID_ENDPOINT_ISE_RANGE) || (log_blk.m_endpoint_ise_range > LAST_VALID_ENDPOINT_ISE_RANGE)) + return false; + if ((log_blk.m_weight_ise_range < FIRST_VALID_WEIGHT_ISE_RANGE) || (log_blk.m_weight_ise_range > LAST_VALID_WEIGHT_ISE_RANGE)) + return false; + if ((log_blk.m_num_partitions < 1) || (log_blk.m_num_partitions > MAX_PARTITIONS)) + return false; + if ((log_blk.m_dual_plane) && (log_blk.m_num_partitions > MAX_DUAL_PLANE_PARTITIONS)) + return false; + if (log_blk.m_partition_id >= NUM_PARTITION_PATTERNS) + return false; + if ((log_blk.m_num_partitions == 1) && (log_blk.m_partition_id > 0)) + return false; + if 
(log_blk.m_color_component_selector > 3) + return false; + + const uint32_t total_endpoint_levels = get_ise_levels(log_blk.m_endpoint_ise_range); + const uint32_t total_weight_levels = get_ise_levels(log_blk.m_weight_ise_range); + + bool is_ldr_endpoints[MAX_PARTITIONS]; + + // Check CEM's + uint32_t total_cem_vals = 0; + for (uint32_t i = 0; i < log_blk.m_num_partitions; i++) + { + if (log_blk.m_color_endpoint_modes[i] > 15) + return false; + + total_cem_vals += get_num_cem_values(log_blk.m_color_endpoint_modes[i]); + + is_ldr_endpoints[i] = is_cem_ldr(log_blk.m_color_endpoint_modes[i]); + } + + if (total_cem_vals > MAX_ENDPOINTS) + return false; + + const dequant_table& endpoint_dequant_tab = g_dequant_tables.get_endpoint_tab(log_blk.m_endpoint_ise_range); + const uint8_t* pEndpoint_dequant = endpoint_dequant_tab.m_ISE_to_val.data(); + + // Dequantized endpoints to [0,255] + uint8_t dequantized_endpoints[MAX_ENDPOINTS]; + for (uint32_t i = 0; i < total_cem_vals; i++) + { + if (log_blk.m_endpoints[i] >= total_endpoint_levels) + return false; + dequantized_endpoints[i] = pEndpoint_dequant[log_blk.m_endpoints[i]]; + } + + // Dequantize weights to [0,64] + uint8_t dequantized_weights[2][12 * 12]; + + const dequant_table& weight_dequant_tab = g_dequant_tables.get_weight_tab(log_blk.m_weight_ise_range); + const uint8_t* pWeight_dequant = weight_dequant_tab.m_ISE_to_val.data(); + + const uint32_t total_weight_vals = (log_blk.m_dual_plane ? 2 : 1) * log_blk.m_grid_width * log_blk.m_grid_height; + for (uint32_t i = 0; i < total_weight_vals; i++) + { + if (log_blk.m_weights[i] >= total_weight_levels) + return false; + + const uint32_t plane_index = log_blk.m_dual_plane ? (i & 1) : 0; + const uint32_t grid_index = log_blk.m_dual_plane ? (i >> 1) : i; + + dequantized_weights[plane_index][grid_index] = pWeight_dequant[log_blk.m_weights[i]]; + } + + // Upsample weight grid. 
[0,64] weights + uint8_t upsampled_weights[2][12 * 12]; + + upsample_weight_grid(blk_width, blk_height, log_blk.m_grid_width, log_blk.m_grid_height, &dequantized_weights[0][0], &upsampled_weights[0][0]); + if (log_blk.m_dual_plane) + upsample_weight_grid(blk_width, blk_height, log_blk.m_grid_width, log_blk.m_grid_height, &dequantized_weights[1][0], &upsampled_weights[1][0]); + + // Decode CEM's + int endpoints[4][4][2]; // [subset][comp][l/h] + + uint32_t endpoint_val_index = 0; + for (uint32_t subset = 0; subset < log_blk.m_num_partitions; subset++) + { + const uint32_t cem_index = log_blk.m_color_endpoint_modes[subset]; + + decode_endpoint(cem_index, &endpoints[subset][0], &dequantized_endpoints[endpoint_val_index]); + + endpoint_val_index += get_num_cem_values(cem_index); + } + + // Decode texels + const bool small_block = num_blk_pixels < 31; + const bool use_precomputed_texel_partitions = (blk_width == 4) && (blk_height == 4) && (log_blk.m_num_partitions >= 2) && (log_blk.m_num_partitions <= 3); + const uint32_t ccs = log_blk.m_dual_plane ? log_blk.m_color_component_selector : UINT32_MAX; + + bool success = true; + + if (dec_mode == cDecodeModeRGB9E5) + { + // returns uint32_t's + for (uint32_t y = 0; y < blk_height; y++) + { + for (uint32_t x = 0; x < blk_width; x++) + { + const uint32_t pixel_index = x + y * blk_width; + const uint32_t subset = (log_blk.m_num_partitions > 1) ? + (use_precomputed_texel_partitions ? get_precompute_texel_partitions_4x4(log_blk.m_partition_id, x, y, log_blk.m_num_partitions) : compute_texel_partition(log_blk.m_partition_id, x, y, 0, log_blk.m_num_partitions, small_block)) + : 0; + + int comp[3]; + + for (uint32_t c = 0; c < 3; c++) + { + const uint32_t w = upsampled_weights[(c == ccs) ? 
1 : 0][pixel_index]; + + if (is_ldr_endpoints[subset]) + { + assert((endpoints[subset][c][0] >= 0) && (endpoints[subset][c][0] <= 0xFF)); + assert((endpoints[subset][c][1] >= 0) && (endpoints[subset][c][1] <= 0xFF)); + + int le = endpoints[subset][c][0]; + int he = endpoints[subset][c][1]; + + le = (le << 8) | le; + he = (he << 8) | he; + + int k = weight_interpolate(le, he, w); + assert((k >= 0) && (k <= 0xFFFF)); + + comp[c] = k; // 1.0 + } + else + { + assert((endpoints[subset][c][0] >= 0) && (endpoints[subset][c][0] <= 0xFFF)); + assert((endpoints[subset][c][1] >= 0) && (endpoints[subset][c][1] <= 0xFFF)); + + int le = endpoints[subset][c][0] << 4; + int he = endpoints[subset][c][1] << 4; + + int qlog16 = weight_interpolate(le, he, w); + + comp[c] = qlog16_to_half(qlog16); + + if (is_half_inf_or_nan((half_float)comp[c])) + comp[c] = 0x7BFF; + } + + } // c + + uint32_t packed; + if (is_ldr_endpoints[subset]) + packed = pack_rgb9e5_ldr_astc(comp[0], comp[1], comp[2]); + else + packed = pack_rgb9e5_hdr_astc(comp[0], comp[1], comp[2]); + + ((uint32_t*)pPixels)[pixel_index] = packed; + + } // x + } // y + } + else if (dec_mode == cDecodeModeHDR16) + { + // Note: must round towards zero when converting float to half for ASTC (18.19 Weight Application) + + // returns half floats + for (uint32_t y = 0; y < blk_height; y++) + { + for (uint32_t x = 0; x < blk_width; x++) + { + const uint32_t pixel_index = x + y * blk_width; + const uint32_t subset = (log_blk.m_num_partitions > 1) ? + (use_precomputed_texel_partitions ? get_precompute_texel_partitions_4x4(log_blk.m_partition_id, x, y, log_blk.m_num_partitions) : compute_texel_partition(log_blk.m_partition_id, x, y, 0, log_blk.m_num_partitions, small_block)) + : 0; + + for (uint32_t c = 0; c < 4; c++) + { + const uint32_t w = upsampled_weights[(c == ccs) ? 
1 : 0][pixel_index]; + + half_float o; + + if ( (is_ldr_endpoints[subset]) || + ((log_blk.m_color_endpoint_modes[subset] == CEM_HDR_RGB_LDR_ALPHA) && (c == 3)) ) + { + assert((endpoints[subset][c][0] >= 0) && (endpoints[subset][c][0] <= 0xFF)); + assert((endpoints[subset][c][1] >= 0) && (endpoints[subset][c][1] <= 0xFF)); + + int le = endpoints[subset][c][0]; + int he = endpoints[subset][c][1]; + + le = (le << 8) | le; + he = (he << 8) | he; + + int k = weight_interpolate(le, he, w); + assert((k >= 0) && (k <= 0xFFFF)); + + if (k == 0xFFFF) + o = 0x3C00; // 1.0 + else + o = float_to_half((float)k * (1.0f / 65536.0f), true); + } + else + { + assert((endpoints[subset][c][0] >= 0) && (endpoints[subset][c][0] <= 0xFFF)); + assert((endpoints[subset][c][1] >= 0) && (endpoints[subset][c][1] <= 0xFFF)); + + int le = endpoints[subset][c][0] << 4; + int he = endpoints[subset][c][1] << 4; + + int qlog16 = weight_interpolate(le, he, w); + + o = qlog16_to_half(qlog16); + + if (is_half_inf_or_nan(o)) + o = 0x7BFF; + } + + ((half_float*)pPixels)[pixel_index * 4 + c] = o; + } + + } // x + } // y + } + else + { + // returns uint8_t's + for (uint32_t y = 0; y < blk_height; y++) + { + for (uint32_t x = 0; x < blk_width; x++) + { + const uint32_t pixel_index = x + y * blk_width; + + const uint32_t subset = (log_blk.m_num_partitions > 1) ? + (use_precomputed_texel_partitions ? get_precompute_texel_partitions_4x4(log_blk.m_partition_id, x, y, log_blk.m_num_partitions) : compute_texel_partition(log_blk.m_partition_id, x, y, 0, log_blk.m_num_partitions, small_block)) + : 0; + + if (!is_ldr_endpoints[subset]) + { + ((uint32_t*)pPixels)[pixel_index * 4] = 0xFFFF00FF; + success = false; + } + else + { + for (uint32_t c = 0; c < 4; c++) + { + const uint32_t w = upsampled_weights[(c == ccs) ? 1 : 0][pixel_index]; + + int le = endpoints[subset][c][0]; + int he = endpoints[subset][c][1]; + + // FIXME: the spec is apparently wrong? 
this matches ARM's and Google's decoder + //if ((dec_mode == cDecodeModeSRGB8) && (c <= 2)) + // See https://github.com/ARM-software/astc-encoder/issues/447 + if (dec_mode == cDecodeModeSRGB8) + { + le = (le << 8) | 0x80; + he = (he << 8) | 0x80; + } + else + { + le = (le << 8) | le; + he = (he << 8) | he; + } + + uint32_t k = weight_interpolate(le, he, w); + + // FIXME: This is what the spec says to do in LDR mode, but this is not what ARM's decoder does + // See decompress_symbolic_block(), decode_texel() and unorm16_to_sf16. + // It seems to effectively divide by 65535.0 and convert to FP16, then back to float, mul by 255.0, add .5 and then convert to 8-bit. + ((uint8_t*)pPixels)[pixel_index * 4 + c] = (uint8_t)(k >> 8); + } + } + + } // x + } // y + } + + return success; + } + + //------------------------------------------------ + // Physical to logical block decoding + + // unsigned 128-bit int, with some signed helpers + class uint128 + { + uint64_t m_lo, m_hi; + + public: + uint128() = default; + inline uint128(uint64_t lo) : m_lo(lo), m_hi(0) { } + inline uint128(uint64_t lo, uint64_t hi) : m_lo(lo), m_hi(hi) { } + inline uint128(const uint128& other) : m_lo(other.m_lo), m_hi(other.m_hi) { } + + inline uint128& set_signed(int64_t lo) { m_lo = lo; m_hi = (lo < 0) ? 
UINT64_MAX : 0; return *this; } + inline uint128& set(uint64_t lo) { m_lo = lo; m_hi = 0; return *this; } + + inline explicit operator uint8_t () const { return (uint8_t)m_lo; } + inline explicit operator uint16_t () const { return (uint16_t)m_lo; } + inline explicit operator uint32_t () const { return (uint32_t)m_lo; } + inline explicit operator uint64_t () const { return m_lo; } + + inline uint128& operator= (const uint128& rhs) { m_lo = rhs.m_lo; m_hi = rhs.m_hi; return *this; } + inline uint128& operator= (const uint64_t val) { m_lo = val; m_hi = 0; return *this; } + + inline uint64_t get_low() const { return m_lo; } + inline uint64_t& get_low() { return m_lo; } + + inline uint64_t get_high() const { return m_hi; } + inline uint64_t& get_high() { return m_hi; } + + inline bool operator== (const uint128& rhs) const { return (m_lo == rhs.m_lo) && (m_hi == rhs.m_hi); } + inline bool operator!= (const uint128& rhs) const { return (m_lo != rhs.m_lo) || (m_hi != rhs.m_hi); } + + inline bool operator< (const uint128& rhs) const + { + if (m_hi < rhs.m_hi) + return true; + + if (m_hi == rhs.m_hi) + { + if (m_lo < rhs.m_lo) + return true; + } + + return false; + } + + inline bool operator> (const uint128& rhs) const { return (rhs < *this); } + + inline bool operator<= (const uint128& rhs) const { return (*this == rhs) || (*this < rhs); } + inline bool operator>= (const uint128& rhs) const { return (*this == rhs) || (*this > rhs); } + + inline bool is_zero() const { return (m_lo == 0) && (m_hi == 0); } + inline bool is_all_ones() const { return (m_lo == UINT64_MAX) && (m_hi == UINT64_MAX); } + inline bool is_non_zero() const { return (m_lo != 0) || (m_hi != 0); } + inline explicit operator bool() const { return is_non_zero(); } + inline bool is_signed() const { return ((int64_t)m_hi) < 0; } + + inline bool signed_less(const uint128& rhs) const + { + const bool l_signed = is_signed(), r_signed = rhs.is_signed(); + + if (l_signed == r_signed) + return *this < rhs; + + if 
(l_signed && !r_signed) + return true; + + assert(!l_signed && r_signed); + return false; + } + + inline bool signed_greater(const uint128& rhs) const { return rhs.signed_less(*this); } + inline bool signed_less_equal(const uint128& rhs) const { return !rhs.signed_less(*this); } + inline bool signed_greater_equal(const uint128& rhs) const { return !signed_less(rhs); } + + double get_double() const + { + double res = 0; + + if (m_hi) + res = (double)m_hi * pow(2.0f, 64.0f); + + res += (double)m_lo; + + return res; + } + + double get_signed_double() const + { + if (is_signed()) + return -(uint128(*this).abs().get_double()); + else + return get_double(); + } + + inline uint128 abs() const + { + uint128 res(*this); + if (res.is_signed()) + res = -res; + return res; + } + + inline uint128& operator<<= (int shift) + { + assert(shift >= 0); + if (shift < 0) + return *this; + + m_hi = (shift >= 64) ? ((shift >= 128) ? 0 : (m_lo << (shift - 64))) : (m_hi << shift); + + if ((shift) && (shift < 64)) + m_hi |= (m_lo >> (64 - shift)); + + m_lo = (shift >= 64) ? 0 : (m_lo << shift); + + return *this; + } + + inline uint128 operator<< (int shift) const { uint128 res(*this); res <<= shift; return res; } + + inline uint128& operator>>= (int shift) + { + assert(shift >= 0); + if (shift < 0) + return *this; + + m_lo = (shift >= 64) ? ((shift >= 128) ? 0 : (m_hi >> (shift - 64))) : (m_lo >> shift); + + if ((shift) && (shift < 64)) + m_lo |= (m_hi << (64 - shift)); + + m_hi = (shift >= 64) ? 
0 : (m_hi >> shift); + + return *this; + } + + inline uint128 operator>> (int shift) const { uint128 res(*this); res >>= shift; return res; } + + inline uint128 signed_shift_right(int shift) const + { + uint128 res(*this); + res >>= shift; + + if (is_signed()) + { + uint128 x(0U); + x = ~x; + x >>= shift; + res |= (~x); + } + + return res; + } + + inline uint128& operator |= (const uint128& rhs) { m_lo |= rhs.m_lo; m_hi |= rhs.m_hi; return *this; } + inline uint128 operator | (const uint128& rhs) const { uint128 res(*this); res |= rhs; return res; } + + inline uint128& operator &= (const uint128& rhs) { m_lo &= rhs.m_lo; m_hi &= rhs.m_hi; return *this; } + inline uint128 operator & (const uint128& rhs) const { uint128 res(*this); res &= rhs; return res; } + + inline uint128& operator ^= (const uint128& rhs) { m_lo ^= rhs.m_lo; m_hi ^= rhs.m_hi; return *this; } + inline uint128 operator ^ (const uint128& rhs) const { uint128 res(*this); res ^= rhs; return res; } + + inline uint128 operator ~() const { return uint128(~m_lo, ~m_hi); } + + inline uint128 operator -() const { uint128 res(~*this); if (++res.m_lo == 0) ++res.m_hi; return res; } + + // prefix + inline uint128 operator ++() + { + if (++m_lo == 0) + ++m_hi; + return *this; + } + + // postfix + inline uint128 operator ++(int) + { + uint128 res(*this); + if (++m_lo == 0) + ++m_hi; + return res; + } + + // prefix + inline uint128 operator --() + { + const uint64_t t = m_lo; + if (--m_lo > t) + --m_hi; + return *this; + } + + // postfix + inline uint128 operator --(int) + { + const uint64_t t = m_lo; + uint128 res(*this); + if (--m_lo > t) + --m_hi; + return res; + } + + inline uint128& operator+= (const uint128& rhs) + { + const uint64_t t = m_lo + rhs.m_lo; + m_hi = m_hi + rhs.m_hi + (t < m_lo); + m_lo = t; + return *this; + } + + inline uint128 operator+ (const uint128& rhs) const { uint128 res(*this); res += rhs; return res; } + + inline uint128& operator-= (const uint128& rhs) + { + const uint64_t t = m_lo 
- rhs.m_lo; + m_hi = m_hi - rhs.m_hi - (t > m_lo); + m_lo = t; + return *this; + } + + inline uint128 operator- (const uint128& rhs) const { uint128 res(*this); res -= rhs; return res; } + + // computes bit by bit, very slow + uint128& operator*=(const uint128& rhs) + { + uint128 temp(*this), result(0U); + + for (uint128 bitmask(rhs); bitmask; bitmask >>= 1, temp <<= 1) + if (bitmask.get_low() & 1) + result += temp; + + *this = result; + return *this; + } + + uint128 operator*(const uint128& rhs) const { uint128 res(*this); res *= rhs; return res; } + + // computes bit by bit, very slow + friend uint128 divide(const uint128& dividend, const uint128& divisor, uint128& remainder) + { + remainder = 0; + + if (!divisor) + { + assert(0); + return ~uint128(0U); + } + + uint128 quotient(0), one(1); + + for (int i = 127; i >= 0; i--) + { + remainder = (remainder << 1) | ((dividend >> i) & one); + if (remainder >= divisor) + { + remainder -= divisor; + quotient |= (one << i); + } + } + + return quotient; + } + + uint128 operator/(const uint128& rhs) const { uint128 remainder, res; res = divide(*this, rhs, remainder); return res; } + uint128 operator/=(const uint128& rhs) { uint128 remainder; *this = divide(*this, rhs, remainder); return *this; } + + uint128 operator%(const uint128& rhs) const { uint128 remainder; divide(*this, rhs, remainder); return remainder; } + uint128 operator%=(const uint128& rhs) { uint128 remainder; divide(*this, rhs, remainder); *this = remainder; return *this; } + + void print_hex(FILE* pFile) const + { + fprintf(pFile, "0x%016llx%016llx", (unsigned long long int)m_hi, (unsigned long long int)m_lo); + } + + void format_unsigned(std::string& res) const + { + basisu::vector digits; + digits.reserve(39 + 1); + + uint128 k(*this), ten(10); + do + { + uint128 r; + k = divide(k, ten, r); + digits.push_back((uint8_t)r); + } while (k); + + for (int i = (int)digits.size() - 1; i >= 0; i--) + res += ('0' + digits[i]); + } + + void format_signed(std::string& 
res) const + { + uint128 val(*this); + + if (val.is_signed()) + { + res.push_back('-'); + val = -val; + } + + val.format_unsigned(res); + } + + void print_unsigned(FILE* pFile) + { + std::string str; + format_unsigned(str); + fprintf(pFile, "%s", str.c_str()); + } + + void print_signed(FILE* pFile) + { + std::string str; + format_signed(str); + fprintf(pFile, "%s", str.c_str()); + } + + uint128 get_reversed_bits() const + { + uint128 res; + + const uint32_t* pSrc = (const uint32_t*)this; + uint32_t* pDst = (uint32_t*)&res; + + pDst[0] = rev_dword(pSrc[3]); + pDst[1] = rev_dword(pSrc[2]); + pDst[2] = rev_dword(pSrc[1]); + pDst[3] = rev_dword(pSrc[0]); + + return res; + } + + uint128 get_byteswapped() const + { + uint128 res; + + const uint8_t* pSrc = (const uint8_t*)this; + uint8_t* pDst = (uint8_t*)&res; + + for (uint32_t i = 0; i < 16; i++) + pDst[i] = pSrc[15 - i]; + + return res; + } + + inline uint64_t get_bits64(uint32_t bit_ofs, uint32_t bit_len) const + { + assert(bit_ofs < 128); + assert(bit_len && (bit_len <= 64) && ((bit_ofs + bit_len) <= 128)); + + uint128 res(*this); + res >>= bit_ofs; + + const uint64_t bitmask = (bit_len == 64) ? 
UINT64_MAX : ((1ull << bit_len) - 1); + return res.get_low() & bitmask; + } + + inline uint32_t get_bits(uint32_t bit_ofs, uint32_t bit_len) const + { + assert(bit_len <= 32); + return (uint32_t)get_bits64(bit_ofs, bit_len); + } + + inline uint32_t next_bits(uint32_t& bit_ofs, uint32_t len) const + { + assert(len && (len <= 32)); + uint32_t x = get_bits(bit_ofs, len); + bit_ofs += len; + return x; + } + + inline uint128& set_bits(uint64_t val, uint32_t bit_ofs, uint32_t num_bits) + { + assert(bit_ofs < 128); + assert(num_bits && (num_bits <= 64) && ((bit_ofs + num_bits) <= 128)); + + uint128 bitmask(1); + bitmask = (bitmask << num_bits) - 1; + assert(uint128(val) <= bitmask); + + bitmask <<= bit_ofs; + *this &= ~bitmask; + + *this = *this | (uint128(val) << bit_ofs); + return *this; + } + }; + + static bool decode_void_extent(const uint128& bits, log_astc_block& log_blk) + { + if (bits.get_bits(10, 2) != 0b11) + return false; + + uint32_t bit_ofs = 12; + const uint32_t min_s = bits.next_bits(bit_ofs, 13); + const uint32_t max_s = bits.next_bits(bit_ofs, 13); + const uint32_t min_t = bits.next_bits(bit_ofs, 13); + const uint32_t max_t = bits.next_bits(bit_ofs, 13); + assert(bit_ofs == 64); + + const bool all_extents_all_ones = (min_s == 0x1FFF) && (max_s == 0x1FFF) && (min_t == 0x1FFF) && (max_t == 0x1FFF); + + if (!all_extents_all_ones && ((min_s >= max_s) || (min_t >= max_t))) + return false; + + const bool hdr_flag = bits.get_bits(9, 1) != 0; + + if (hdr_flag) + log_blk.m_solid_color_flag_hdr = true; + else + log_blk.m_solid_color_flag_ldr = true; + + log_blk.m_solid_color[0] = (uint16_t)bits.get_bits(64, 16); + log_blk.m_solid_color[1] = (uint16_t)bits.get_bits(80, 16); + log_blk.m_solid_color[2] = (uint16_t)bits.get_bits(96, 16); + log_blk.m_solid_color[3] = (uint16_t)bits.get_bits(112, 16); + + if (log_blk.m_solid_color_flag_hdr) + { + for (uint32_t c = 0; c < 4; c++) + if (is_half_inf_or_nan(log_blk.m_solid_color[c])) + return false; + } + + return true; + } 
+ + struct astc_dec_row + { + int8_t Dp_ofs, P_ofs, W_ofs, W_size, H_ofs, H_size, W_bias, H_bias, p0_ofs, p1_ofs, p2_ofs; + }; + + static const astc_dec_row s_dec_rows[10] = + { + // Dp_ofs, P_ofs, W_ofs, W_size, H_ofs, H_size, W_bias, H_bias, p0_ofs, p1_ofs, p2_ofs; + { 10, 9, 7, 2, 5, 2, 4, 2, 4, 0, 1 }, // 4 2 + { 10, 9, 7, 2, 5, 2, 8, 2, 4, 0, 1 }, // 8 2 + { 10, 9, 5, 2, 7, 2, 2, 8, 4, 0, 1 }, // 2 8 + { 10, 9, 5, 2, 7, 1, 2, 6, 4, 0, 1 }, // 2 6 + + { 10, 9, 7, 1, 5, 2, 2, 2, 4, 0, 1 }, // 2 2 + { 10, 9, 0, 0, 5, 2, 12, 2, 4, 2, 3 }, // 12 2 + { 10, 9, 5, 2, 0, 0, 2, 12, 4, 2, 3 }, // 2 12 + { 10, 9, 0, 0, 0, 0, 6, 10, 4, 2, 3 }, // 6 10 + + { 10, 9, 0, 0, 0, 0, 10, 6, 4, 2, 3 }, // 10 6 + { -1, -1, 5, 2, 9, 2, 6, 6, 4, 2, 3 }, // 6 6 + }; + + static bool decode_config(const uint128& bits, log_astc_block& log_blk) + { + // Reserved + if (bits.get_bits(0, 4) == 0) + return false; + + // Reserved + if ((bits.get_bits(0, 2) == 0) && (bits.get_bits(6, 3) == 0b111)) + { + if (bits.get_bits(2, 4) != 0b1111) + return false; + } + + // Void extent + if (bits.get_bits(0, 9) == 0b111111100) + return decode_void_extent(bits, log_blk); + + // Check rows + const uint32_t x0_2 = bits.get_bits(0, 2), x2_2 = bits.get_bits(2, 2); + const uint32_t x5_4 = bits.get_bits(5, 4), x8_1 = bits.get_bits(8, 1); + const uint32_t x7_2 = bits.get_bits(7, 2); + + int row_index = -1; + if (x0_2 == 0) + { + if (x7_2 == 0b00) + row_index = 5; + else if (x7_2 == 0b01) + row_index = 6; + else if (x5_4 == 0b1100) + row_index = 7; + else if (x5_4 == 0b1101) + row_index = 8; + else if (x7_2 == 0b10) + row_index = 9; + } + else + { + if (x2_2 == 0b00) + row_index = 0; + else if (x2_2 == 0b01) + row_index = 1; + else if (x2_2 == 0b10) + row_index = 2; + else if ((x2_2 == 0b11) && (x8_1 == 0)) + row_index = 3; + else if ((x2_2 == 0b11) && (x8_1 == 1)) + row_index = 4; + } + if (row_index < 0) + return false; + + const astc_dec_row& r = s_dec_rows[row_index]; + + bool P = false, Dp = false; + uint32_t 
W = r.W_bias, H = r.H_bias; + + if (r.P_ofs >= 0) + P = bits.get_bits(r.P_ofs, 1) != 0; + + if (r.Dp_ofs >= 0) + Dp = bits.get_bits(r.Dp_ofs, 1) != 0; + + if (r.W_size) + W += bits.get_bits(r.W_ofs, r.W_size); + + if (r.H_size) + H += bits.get_bits(r.H_ofs, r.H_size); + + assert((W >= MIN_GRID_DIM) && (W <= MAX_BLOCK_DIM)); + assert((H >= MIN_GRID_DIM) && (H <= MAX_BLOCK_DIM)); + + int p0 = bits.get_bits(r.p0_ofs, 1); + int p1 = bits.get_bits(r.p1_ofs, 1); + int p2 = bits.get_bits(r.p2_ofs, 1); + + uint32_t p = p0 | (p1 << 1) | (p2 << 2); + if (p < 2) + return false; + + log_blk.m_grid_width = W; + log_blk.m_grid_height = H; + + log_blk.m_weight_ise_range = (p - 2) + (P * BISE_10_LEVELS); + assert(log_blk.m_weight_ise_range <= LAST_VALID_WEIGHT_ISE_RANGE); + + log_blk.m_dual_plane = Dp; + + return true; + } + + static inline uint32_t read_le_dword(const uint8_t* pBytes) + { + return (pBytes[0]) | (pBytes[1] << 8U) | (pBytes[2] << 16U) | (pBytes[3] << 24U); + } + + // See 18.12.Integer Sequence Encoding - tables computed by executing the decoder functions with all possible 8/7-bit inputs. 
+ static const uint8_t s_trit_decode[256][5] = + { + {0,0,0,0,0},{1,0,0,0,0},{2,0,0,0,0},{0,0,2,0,0},{0,1,0,0,0},{1,1,0,0,0},{2,1,0,0,0},{1,0,2,0,0}, + {0,2,0,0,0},{1,2,0,0,0},{2,2,0,0,0},{2,0,2,0,0},{0,2,2,0,0},{1,2,2,0,0},{2,2,2,0,0},{2,0,2,0,0}, + {0,0,1,0,0},{1,0,1,0,0},{2,0,1,0,0},{0,1,2,0,0},{0,1,1,0,0},{1,1,1,0,0},{2,1,1,0,0},{1,1,2,0,0}, + {0,2,1,0,0},{1,2,1,0,0},{2,2,1,0,0},{2,1,2,0,0},{0,0,0,2,2},{1,0,0,2,2},{2,0,0,2,2},{0,0,2,2,2}, + {0,0,0,1,0},{1,0,0,1,0},{2,0,0,1,0},{0,0,2,1,0},{0,1,0,1,0},{1,1,0,1,0},{2,1,0,1,0},{1,0,2,1,0}, + {0,2,0,1,0},{1,2,0,1,0},{2,2,0,1,0},{2,0,2,1,0},{0,2,2,1,0},{1,2,2,1,0},{2,2,2,1,0},{2,0,2,1,0}, + {0,0,1,1,0},{1,0,1,1,0},{2,0,1,1,0},{0,1,2,1,0},{0,1,1,1,0},{1,1,1,1,0},{2,1,1,1,0},{1,1,2,1,0}, + {0,2,1,1,0},{1,2,1,1,0},{2,2,1,1,0},{2,1,2,1,0},{0,1,0,2,2},{1,1,0,2,2},{2,1,0,2,2},{1,0,2,2,2}, + {0,0,0,2,0},{1,0,0,2,0},{2,0,0,2,0},{0,0,2,2,0},{0,1,0,2,0},{1,1,0,2,0},{2,1,0,2,0},{1,0,2,2,0}, + {0,2,0,2,0},{1,2,0,2,0},{2,2,0,2,0},{2,0,2,2,0},{0,2,2,2,0},{1,2,2,2,0},{2,2,2,2,0},{2,0,2,2,0}, + {0,0,1,2,0},{1,0,1,2,0},{2,0,1,2,0},{0,1,2,2,0},{0,1,1,2,0},{1,1,1,2,0},{2,1,1,2,0},{1,1,2,2,0}, + {0,2,1,2,0},{1,2,1,2,0},{2,2,1,2,0},{2,1,2,2,0},{0,2,0,2,2},{1,2,0,2,2},{2,2,0,2,2},{2,0,2,2,2}, + {0,0,0,0,2},{1,0,0,0,2},{2,0,0,0,2},{0,0,2,0,2},{0,1,0,0,2},{1,1,0,0,2},{2,1,0,0,2},{1,0,2,0,2}, + {0,2,0,0,2},{1,2,0,0,2},{2,2,0,0,2},{2,0,2,0,2},{0,2,2,0,2},{1,2,2,0,2},{2,2,2,0,2},{2,0,2,0,2}, + {0,0,1,0,2},{1,0,1,0,2},{2,0,1,0,2},{0,1,2,0,2},{0,1,1,0,2},{1,1,1,0,2},{2,1,1,0,2},{1,1,2,0,2}, + {0,2,1,0,2},{1,2,1,0,2},{2,2,1,0,2},{2,1,2,0,2},{0,2,2,2,2},{1,2,2,2,2},{2,2,2,2,2},{2,0,2,2,2}, + {0,0,0,0,1},{1,0,0,0,1},{2,0,0,0,1},{0,0,2,0,1},{0,1,0,0,1},{1,1,0,0,1},{2,1,0,0,1},{1,0,2,0,1}, + {0,2,0,0,1},{1,2,0,0,1},{2,2,0,0,1},{2,0,2,0,1},{0,2,2,0,1},{1,2,2,0,1},{2,2,2,0,1},{2,0,2,0,1}, + {0,0,1,0,1},{1,0,1,0,1},{2,0,1,0,1},{0,1,2,0,1},{0,1,1,0,1},{1,1,1,0,1},{2,1,1,0,1},{1,1,2,0,1}, + 
{0,2,1,0,1},{1,2,1,0,1},{2,2,1,0,1},{2,1,2,0,1},{0,0,1,2,2},{1,0,1,2,2},{2,0,1,2,2},{0,1,2,2,2}, + {0,0,0,1,1},{1,0,0,1,1},{2,0,0,1,1},{0,0,2,1,1},{0,1,0,1,1},{1,1,0,1,1},{2,1,0,1,1},{1,0,2,1,1}, + {0,2,0,1,1},{1,2,0,1,1},{2,2,0,1,1},{2,0,2,1,1},{0,2,2,1,1},{1,2,2,1,1},{2,2,2,1,1},{2,0,2,1,1}, + {0,0,1,1,1},{1,0,1,1,1},{2,0,1,1,1},{0,1,2,1,1},{0,1,1,1,1},{1,1,1,1,1},{2,1,1,1,1},{1,1,2,1,1}, + {0,2,1,1,1},{1,2,1,1,1},{2,2,1,1,1},{2,1,2,1,1},{0,1,1,2,2},{1,1,1,2,2},{2,1,1,2,2},{1,1,2,2,2}, + {0,0,0,2,1},{1,0,0,2,1},{2,0,0,2,1},{0,0,2,2,1},{0,1,0,2,1},{1,1,0,2,1},{2,1,0,2,1},{1,0,2,2,1}, + {0,2,0,2,1},{1,2,0,2,1},{2,2,0,2,1},{2,0,2,2,1},{0,2,2,2,1},{1,2,2,2,1},{2,2,2,2,1},{2,0,2,2,1}, + {0,0,1,2,1},{1,0,1,2,1},{2,0,1,2,1},{0,1,2,2,1},{0,1,1,2,1},{1,1,1,2,1},{2,1,1,2,1},{1,1,2,2,1}, + {0,2,1,2,1},{1,2,1,2,1},{2,2,1,2,1},{2,1,2,2,1},{0,2,1,2,2},{1,2,1,2,2},{2,2,1,2,2},{2,1,2,2,2}, + {0,0,0,1,2},{1,0,0,1,2},{2,0,0,1,2},{0,0,2,1,2},{0,1,0,1,2},{1,1,0,1,2},{2,1,0,1,2},{1,0,2,1,2}, + {0,2,0,1,2},{1,2,0,1,2},{2,2,0,1,2},{2,0,2,1,2},{0,2,2,1,2},{1,2,2,1,2},{2,2,2,1,2},{2,0,2,1,2}, + {0,0,1,1,2},{1,0,1,1,2},{2,0,1,1,2},{0,1,2,1,2},{0,1,1,1,2},{1,1,1,1,2},{2,1,1,1,2},{1,1,2,1,2}, + {0,2,1,1,2},{1,2,1,1,2},{2,2,1,1,2},{2,1,2,1,2},{0,2,2,2,2},{1,2,2,2,2},{2,2,2,2,2},{2,1,2,2,2} + }; + + static const uint8_t s_quint_decode[128][3] = + { + {0,0,0},{1,0,0},{2,0,0},{3,0,0},{4,0,0},{0,4,0},{4,4,0},{4,4,4}, + {0,1,0},{1,1,0},{2,1,0},{3,1,0},{4,1,0},{1,4,0},{4,4,1},{4,4,4}, + {0,2,0},{1,2,0},{2,2,0},{3,2,0},{4,2,0},{2,4,0},{4,4,2},{4,4,4}, + {0,3,0},{1,3,0},{2,3,0},{3,3,0},{4,3,0},{3,4,0},{4,4,3},{4,4,4}, + {0,0,1},{1,0,1},{2,0,1},{3,0,1},{4,0,1},{0,4,1},{4,0,4},{0,4,4}, + {0,1,1},{1,1,1},{2,1,1},{3,1,1},{4,1,1},{1,4,1},{4,1,4},{1,4,4}, + {0,2,1},{1,2,1},{2,2,1},{3,2,1},{4,2,1},{2,4,1},{4,2,4},{2,4,4}, + {0,3,1},{1,3,1},{2,3,1},{3,3,1},{4,3,1},{3,4,1},{4,3,4},{3,4,4}, + {0,0,2},{1,0,2},{2,0,2},{3,0,2},{4,0,2},{0,4,2},{2,0,4},{3,0,4}, + 
{0,1,2},{1,1,2},{2,1,2},{3,1,2},{4,1,2},{1,4,2},{2,1,4},{3,1,4}, + {0,2,2},{1,2,2},{2,2,2},{3,2,2},{4,2,2},{2,4,2},{2,2,4},{3,2,4}, + {0,3,2},{1,3,2},{2,3,2},{3,3,2},{4,3,2},{3,4,2},{2,3,4},{3,3,4}, + {0,0,3},{1,0,3},{2,0,3},{3,0,3},{4,0,3},{0,4,3},{0,0,4},{1,0,4}, + {0,1,3},{1,1,3},{2,1,3},{3,1,3},{4,1,3},{1,4,3},{0,1,4},{1,1,4}, + {0,2,3},{1,2,3},{2,2,3},{3,2,3},{4,2,3},{2,4,3},{0,2,4},{1,2,4}, + {0,3,3},{1,3,3},{2,3,3},{3,3,3},{4,3,3},{3,4,3},{0,3,4},{1,3,4} + }; + + static void decode_trit_block(uint8_t* pVals, uint32_t num_vals, const uint128& bits, uint32_t& bit_ofs, uint32_t bits_per_val) + { + assert((num_vals >= 1) && (num_vals <= 5)); + uint32_t m[5] = { 0 }, T = 0; + + static const uint8_t s_t_bits[5] = { 2, 2, 1, 2, 1 }; + + for (uint32_t T_ofs = 0, c = 0; c < num_vals; c++) + { + if (bits_per_val) + m[c] = bits.next_bits(bit_ofs, bits_per_val); + T |= (bits.next_bits(bit_ofs, s_t_bits[c]) << T_ofs); + T_ofs += s_t_bits[c]; + } + + const uint8_t (&p_trits)[5] = s_trit_decode[T]; + + for (uint32_t i = 0; i < num_vals; i++) + pVals[i] = (uint8_t)((p_trits[i] << bits_per_val) | m[i]); + } + + static void decode_quint_block(uint8_t* pVals, uint32_t num_vals, const uint128& bits, uint32_t& bit_ofs, uint32_t bits_per_val) + { + assert((num_vals >= 1) && (num_vals <= 3)); + uint32_t m[3] = { 0 }, T = 0; + + static const uint8_t s_t_bits[3] = { 3, 2, 2 }; + + for (uint32_t T_ofs = 0, c = 0; c < num_vals; c++) + { + if (bits_per_val) + m[c] = bits.next_bits(bit_ofs, bits_per_val); + T |= (bits.next_bits(bit_ofs, s_t_bits[c]) << T_ofs); + T_ofs += s_t_bits[c]; + } + + const uint8_t (&p_quints)[3] = s_quint_decode[T]; + + for (uint32_t i = 0; i < num_vals; i++) + pVals[i] = (uint8_t)((p_quints[i] << bits_per_val) | m[i]); + } + + static void decode_bise(uint32_t ise_range, uint8_t* pVals, uint32_t num_vals, const uint128& bits, uint32_t bit_ofs) + { + assert(num_vals && (ise_range < TOTAL_ISE_RANGES)); + + const uint32_t bits_per_val = 
g_ise_range_table[ise_range][0]; + + if (g_ise_range_table[ise_range][1]) + { + // Trits+bits, 5 vals per block, 7 bits extra per block + const uint32_t total_blocks = (num_vals + 4) / 5; + for (uint32_t b = 0; b < total_blocks; b++) + { + const uint32_t num_vals_in_block = std::min(num_vals - 5 * b, 5); + decode_trit_block(pVals + 5 * b, num_vals_in_block, bits, bit_ofs, bits_per_val); + } + } + else if (g_ise_range_table[ise_range][2]) + { + // Quints+bits, 3 vals per block, 8 bits extra per block + const uint32_t total_blocks = (num_vals + 2) / 3; + for (uint32_t b = 0; b < total_blocks; b++) + { + const uint32_t num_vals_in_block = std::min(num_vals - 3 * b, 3); + decode_quint_block(pVals + 3 * b, num_vals_in_block, bits, bit_ofs, bits_per_val); + } + } + else + { + assert(bits_per_val); + + // Only bits + for (uint32_t i = 0; i < num_vals; i++) + pVals[i] = (uint8_t)bits.next_bits(bit_ofs, bits_per_val); + } + } + + void decode_bise(uint32_t ise_range, uint8_t* pVals, uint32_t num_vals, const uint8_t* pBits128, uint32_t bit_ofs) + { + const uint128 bits( + (uint64_t)read_le_dword(pBits128) | (((uint64_t)read_le_dword(pBits128 + sizeof(uint32_t))) << 32), + (uint64_t)read_le_dword(pBits128 + sizeof(uint32_t) * 2) | (((uint64_t)read_le_dword(pBits128 + sizeof(uint32_t) * 3)) << 32)); + + return decode_bise(ise_range, pVals, num_vals, bits, bit_ofs); + } + + // Decodes a physical ASTC block to a logical ASTC block. + // blk_width/blk_height are only used to validate the weight grid's dimensions. 
+ bool unpack_block(const void* pASTC_block, log_astc_block& log_blk, uint32_t blk_width, uint32_t blk_height) + { + assert(is_valid_block_size(blk_width, blk_height)); + + const uint8_t* pS = (uint8_t*)pASTC_block; + + log_blk.clear(); + log_blk.m_error_flag = true; + + const uint128 bits( + (uint64_t)read_le_dword(pS) | (((uint64_t)read_le_dword(pS + sizeof(uint32_t))) << 32), + (uint64_t)read_le_dword(pS + sizeof(uint32_t) * 2) | (((uint64_t)read_le_dword(pS + sizeof(uint32_t) * 3)) << 32)); + + const uint128 rev_bits(bits.get_reversed_bits()); + + if (!decode_config(bits, log_blk)) + return false; + + if (log_blk.m_solid_color_flag_hdr || log_blk.m_solid_color_flag_ldr) + { + // Void extent + log_blk.m_error_flag = false; + return true; + } + + // Check grid dimensions + if ((log_blk.m_grid_width > blk_width) || (log_blk.m_grid_height > blk_height)) + return false; + + // Now we have the grid width/height, dual plane, weight ISE range + + const uint32_t total_grid_weights = (log_blk.m_dual_plane ? 2 : 1) * (log_blk.m_grid_width * log_blk.m_grid_height); + const uint32_t total_weight_bits = get_ise_sequence_bits(total_grid_weights, log_blk.m_weight_ise_range); + + // 18.24 Illegal Encodings + if ((!total_grid_weights) || (total_grid_weights > MAX_GRID_WEIGHTS) || (total_weight_bits < 24) || (total_weight_bits > 96)) + return false; + + const uint32_t end_of_weight_bit_ofs = 128 - total_weight_bits; + + uint32_t total_extra_bits = 0; + + // Right before the weight bits, there may be extra CEM bits, then the 2 CCS bits if dual plane. 
+ + log_blk.m_num_partitions = bits.get_bits(11, 2) + 1; + if (log_blk.m_num_partitions == 1) + log_blk.m_color_endpoint_modes[0] = bits.get_bits(13, 4); // read CEM bits + else + { + // 2 or more partitions + if (log_blk.m_dual_plane && (log_blk.m_num_partitions == 4)) + return false; + + log_blk.m_partition_id = bits.get_bits(13, 10); + + uint32_t cem_bits = bits.get_bits(23, 6); + + if ((cem_bits & 3) == 0) + { + // All CEM's the same + for (uint32_t i = 0; i < log_blk.m_num_partitions; i++) + log_blk.m_color_endpoint_modes[i] = cem_bits >> 2; + } + else + { + // CEM's different, but within up to 2 adjacent classes + const uint32_t first_cem_index = ((cem_bits & 3) - 1) * 4; + + total_extra_bits = 3 * log_blk.m_num_partitions - 4; + + if ((total_weight_bits + total_extra_bits) > 128) + return false; + + uint32_t cem_bit_pos = end_of_weight_bit_ofs - total_extra_bits; + + uint32_t c[4] = { 0 }, m[4] = { 0 }; + + cem_bits >>= 2; + for (uint32_t i = 0; i < log_blk.m_num_partitions; i++, cem_bits >>= 1) + c[i] = cem_bits & 1; + + switch (log_blk.m_num_partitions) + { + case 2: + { + m[0] = cem_bits & 3; + m[1] = bits.next_bits(cem_bit_pos, 2); + break; + } + case 3: + { + m[0] = cem_bits & 1; + m[0] |= (bits.next_bits(cem_bit_pos, 1) << 1); + m[1] = bits.next_bits(cem_bit_pos, 2); + m[2] = bits.next_bits(cem_bit_pos, 2); + break; + } + case 4: + { + for (uint32_t i = 0; i < 4; i++) + m[i] = bits.next_bits(cem_bit_pos, 2); + break; + } + default: + { + assert(0); + break; + } + } + + assert(cem_bit_pos == end_of_weight_bit_ofs); + + for (uint32_t i = 0; i < log_blk.m_num_partitions; i++) + { + log_blk.m_color_endpoint_modes[i] = first_cem_index + (c[i] * 4) + m[i]; + assert(log_blk.m_color_endpoint_modes[i] <= 15); + } + } + } + + // Now we have all the CEM indices. 
+ + if (log_blk.m_dual_plane) + { + // Read CCS bits, beneath any CEM bits + total_extra_bits += 2; + + if (total_extra_bits > end_of_weight_bit_ofs) + return false; + + uint32_t ccs_bit_pos = end_of_weight_bit_ofs - total_extra_bits; + log_blk.m_color_component_selector = bits.get_bits(ccs_bit_pos, 2); + } + + uint32_t config_bit_pos = 11 + 2; // config+num_parts + if (log_blk.m_num_partitions == 1) + config_bit_pos += 4; // CEM bits + else + config_bit_pos += 10 + 6; // part_id+CEM bits + + // config+num_parts+total_extra_bits (CEM extra+CCS) + uint32_t total_config_bits = config_bit_pos + total_extra_bits; + + // Compute number of remaining bits in block + const int num_remaining_bits = 128 - (int)total_config_bits - (int)total_weight_bits; + if (num_remaining_bits < 0) + return false; + + // Compute total number of ISE encoded color endpoint mode values + uint32_t total_cem_vals = 0; + for (uint32_t j = 0; j < log_blk.m_num_partitions; j++) + total_cem_vals += get_num_cem_values(log_blk.m_color_endpoint_modes[j]); + + if (total_cem_vals > MAX_ENDPOINTS) + return false; + + // Infer endpoint ISE range based off the # of values we need to encode, and the # of remaining bits in the block + int endpoint_ise_range = -1; + for (int k = 20; k > 0; k--) + { + int b = get_ise_sequence_bits(total_cem_vals, k); + if (b <= num_remaining_bits) + { + endpoint_ise_range = k; + break; + } + } + + // See 23.24 Illegal Encodings, [0,5] is the minimum ISE encoding for endpoints + if (endpoint_ise_range < (int)FIRST_VALID_ENDPOINT_ISE_RANGE) + return false; + + log_blk.m_endpoint_ise_range = endpoint_ise_range; + + // Decode endpoints forwards in block + decode_bise(log_blk.m_endpoint_ise_range, log_blk.m_endpoints, total_cem_vals, bits, config_bit_pos); + + // Decode grid weights backwards in block + decode_bise(log_blk.m_weight_ise_range, log_blk.m_weights, total_grid_weights, rev_bits, 0); + + log_blk.m_error_flag = false; + + return true; + } + +} // namespace astc_helpers + 
+#endif //BASISU_ASTC_HELPERS_IMPLEMENTATION diff --git a/thirdparty/basis_universal/transcoder/basisu_containers.h b/thirdparty/basis_universal/transcoder/basisu_containers.h index d3e14369ba07..bfc51bb499cc 100644 --- a/thirdparty/basis_universal/transcoder/basisu_containers.h +++ b/thirdparty/basis_universal/transcoder/basisu_containers.h @@ -188,8 +188,9 @@ namespace basisu #define BASISU_IS_SCALAR_TYPE(T) (scalar_type::cFlag) -#if defined(__GNUC__) && __GNUC__<5 - #define BASISU_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__) +#if !defined(BASISU_HAVE_STD_TRIVIALLY_COPYABLE) && defined(__GNUC__) && __GNUC__<5 + //#define BASISU_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__) + #define BASISU_IS_TRIVIALLY_COPYABLE(...) __is_trivially_copyable(__VA_ARGS__) #else #define BASISU_IS_TRIVIALLY_COPYABLE(...) std::is_trivially_copyable<__VA_ARGS__>::value #endif @@ -286,8 +287,19 @@ namespace basisu if (BASISU_IS_BITWISE_COPYABLE(T)) { +#ifndef __EMSCRIPTEN__ +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wclass-memaccess" +#endif +#endif if ((m_p) && (other.m_p)) memcpy(m_p, other.m_p, m_size * sizeof(T)); +#ifndef __EMSCRIPTEN__ +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif +#endif } else { @@ -330,8 +342,19 @@ namespace basisu if (BASISU_IS_BITWISE_COPYABLE(T)) { +#ifndef __EMSCRIPTEN__ +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wclass-memaccess" +#endif +#endif if ((m_p) && (other.m_p)) memcpy(m_p, other.m_p, other.m_size * sizeof(T)); +#ifndef __EMSCRIPTEN__ +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif +#endif } else { @@ -501,7 +524,7 @@ namespace basisu if (new_capacity > m_capacity) { - if (!increase_capacity(new_capacity, false)) + if (!increase_capacity(new_capacity, false, true)) return false; } else if (new_capacity < m_capacity) @@ -509,7 +532,8 @@ namespace basisu // Must work around the lack of a "decrease_capacity()" method. 
// This case is rare enough in practice that it's probably not worth implementing an optimized in-place resize. vector tmp; - tmp.increase_capacity(helpers::maximum(m_size, new_capacity), false); + if (!tmp.increase_capacity(helpers::maximum(m_size, new_capacity), false, true)) + return false; tmp = *this; swap(tmp); } @@ -750,7 +774,21 @@ namespace basisu } // Copy "down" the objects to preserve, filling in the empty slots. + +#ifndef __EMSCRIPTEN__ +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wclass-memaccess" +#endif +#endif + memmove(pDst, pSrc, num_to_move * sizeof(T)); + +#ifndef __EMSCRIPTEN__ +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif +#endif } else { @@ -1003,7 +1041,21 @@ namespace basisu inline void set_all(const T& o) { if ((sizeof(T) == 1) && (scalar_type::cFlag)) + { +#ifndef __EMSCRIPTEN__ +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wclass-memaccess" +#endif +#endif memset(m_p, *reinterpret_cast(&o), m_size); + +#ifndef __EMSCRIPTEN__ +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif +#endif + } else { T* pDst = m_p; @@ -1029,7 +1081,7 @@ namespace basisu // Important: This method is used in Basis Universal. If you change how this container allocates memory, you'll need to change any users of this method. inline bool grant_ownership(T* p, uint32_t size, uint32_t capacity) { - // To to prevent the caller from obviously shooting themselves in the foot. + // To prevent the caller from obviously shooting themselves in the foot. if (((p + capacity) > m_p) && (p < (m_p + m_capacity))) { // Can grant ownership of a block inside the container itself! 
diff --git a/thirdparty/basis_universal/transcoder/basisu_containers_impl.h b/thirdparty/basis_universal/transcoder/basisu_containers_impl.h index d5cb61569b03..60c0b3d89f91 100644 --- a/thirdparty/basis_universal/transcoder/basisu_containers_impl.h +++ b/thirdparty/basis_universal/transcoder/basisu_containers_impl.h @@ -19,23 +19,30 @@ namespace basisu if (m_capacity >= min_new_capacity) return true; - size_t new_capacity = min_new_capacity; - if ((grow_hint) && (!helpers::is_power_of_2((uint64_t)new_capacity))) - { - new_capacity = (size_t)helpers::next_pow2((uint64_t)new_capacity); - - assert(new_capacity && (new_capacity > m_capacity)); + uint64_t new_capacity_u64 = min_new_capacity; + if ((grow_hint) && (!helpers::is_power_of_2(new_capacity_u64))) + new_capacity_u64 = helpers::next_pow2(new_capacity_u64); - if (new_capacity < min_new_capacity) - { - if (nofail) - return false; - fprintf(stderr, "vector too large\n"); - abort(); - } + size_t new_capacity = (size_t)new_capacity_u64; + if (new_capacity != new_capacity_u64) + { + if (nofail) + return false; + fprintf(stderr, "elemental_vector::increase_capacity: vector too large\n"); + abort(); } - const size_t desired_size = element_size * new_capacity; + const uint64_t desired_size_u64 = (uint64_t)element_size * new_capacity; + + const size_t desired_size = (size_t)desired_size_u64; + if (desired_size_u64 != desired_size) + { + if (nofail) + return false; + fprintf(stderr, "elemental_vector::increase_capacity: vector too large\n"); + abort(); + } + size_t actual_size = 0; if (!pMover) { @@ -46,11 +53,7 @@ namespace basisu return false; char buf[256]; -#ifdef _MSC_VER - sprintf_s(buf, sizeof(buf), "vector: realloc() failed allocating %u bytes", (uint32_t)desired_size); -#else - sprintf(buf, "vector: realloc() failed allocating %u bytes", (uint32_t)desired_size); -#endif + snprintf(buf, sizeof(buf), "elemental_vector::increase_capacity: realloc() failed allocating %zu bytes", desired_size); fprintf(stderr, "%s", 
buf); abort(); } @@ -75,11 +78,7 @@ namespace basisu return false; char buf[256]; -#ifdef _MSC_VER - sprintf_s(buf, sizeof(buf), "vector: malloc() failed allocating %u bytes", (uint32_t)desired_size); -#else - sprintf(buf, "vector: malloc() failed allocating %u bytes", (uint32_t)desired_size); -#endif + snprintf(buf, sizeof(buf), "elemental_vector::increase_capacity: malloc() failed allocating %zu bytes", desired_size); fprintf(stderr, "%s", buf); abort(); } diff --git a/thirdparty/basis_universal/transcoder/basisu_file_headers.h b/thirdparty/basis_universal/transcoder/basisu_file_headers.h index 4316d738e6b6..d29e3feb0340 100644 --- a/thirdparty/basis_universal/transcoder/basisu_file_headers.h +++ b/thirdparty/basis_universal/transcoder/basisu_file_headers.h @@ -1,5 +1,5 @@ // basis_file_headers.h -// Copyright (C) 2019-2020 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -89,7 +89,8 @@ namespace basist enum class basis_tex_format { cETC1S = 0, - cUASTC4x4 = 1 + cUASTC4x4 = 1, + cUASTC_HDR_4x4 = 2 }; struct basis_file_header diff --git a/thirdparty/basis_universal/transcoder/basisu_transcoder.cpp b/thirdparty/basis_universal/transcoder/basisu_transcoder.cpp index c698861f3b9f..32018cd282d9 100644 --- a/thirdparty/basis_universal/transcoder/basisu_transcoder.cpp +++ b/thirdparty/basis_universal/transcoder/basisu_transcoder.cpp @@ -1,5 +1,5 @@ // basisu_transcoder.cpp -// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -17,6 +17,11 @@ #include #include "basisu_containers_impl.h" +#define BASISU_ASTC_HELPERS_IMPLEMENTATION +#include "basisu_astc_helpers.h" + +#include "basisu_astc_hdr_core.h" + #ifndef BASISD_IS_BIG_ENDIAN // TODO: This doesn't work on OSX. How can this be so difficult? //#if defined(__BIG_ENDIAN__) || defined(_BIG_ENDIAN) || defined(BIG_ENDIAN) @@ -139,6 +144,10 @@ #endif #endif +#ifndef BASISD_SUPPORT_UASTC_HDR + #define BASISD_SUPPORT_UASTC_HDR 1 +#endif + #define BASISD_WRITE_NEW_BC7_MODE5_TABLES 0 #define BASISD_WRITE_NEW_DXT1_TABLES 0 #define BASISD_WRITE_NEW_ETC2_EAC_A8_TABLES 0 @@ -1908,17 +1917,24 @@ namespace basist void basisu_transcoder_init() { if (g_transcoder_initialized) - { - BASISU_DEVEL_ERROR("basisu_transcoder::basisu_transcoder_init: Called more than once\n"); + { + BASISU_DEVEL_ERROR("basisu_transcoder::basisu_transcoder_init: Called more than once\n"); return; - } + } - BASISU_DEVEL_ERROR("basisu_transcoder::basisu_transcoder_init: Initializing (this is not an error)\n"); + BASISU_DEVEL_ERROR("basisu_transcoder::basisu_transcoder_init: Initializing (this is not an error)\n"); #if BASISD_SUPPORT_UASTC uastc_init(); #endif +#if BASISD_SUPPORT_UASTC_HDR + // TODO: Examine this, optimize for startup time/mem utilization. + astc_helpers::init_tables(false); + + astc_hdr_core_init(); +#endif + #if BASISD_SUPPORT_ASTC transcoder_init_astc(); #endif @@ -2027,6 +2043,10 @@ namespace basist transcoder_init_pvrtc2(); #endif +#if BASISD_SUPPORT_UASTC_HDR + bc6h_enc_init(); +#endif + g_transcoder_initialized = true; } @@ -6928,7 +6948,7 @@ namespace basist static inline int sq(int x) { return x * x; } - // PVRTC2 is a slightly borked format for alpha: In Non-Interpolated mode, the way AlphaB8 is exanded from 4 to 8 bits means it can never be 0. + // PVRTC2 is a slightly borked format for alpha: In Non-Interpolated mode, the way AlphaB8 is expanded from 4 to 8 bits means it can never be 0. 
// This is actually very bad, because on 100% transparent blocks which have non-trivial color pixels, part of the color channel will leak into alpha! // And there's nothing straightforward we can do because using the other modes is too expensive/complex. I can see why Apple didn't adopt it. static void convert_etc1s_to_pvrtc2_rgba(void* pDst, const endpoint* pEndpoints, const selector* pSelector, const endpoint* pEndpoint_codebook, const selector* pSelector_codebook) @@ -7515,6 +7535,8 @@ namespace basist } #endif // BASISD_SUPPORT_PVRTC2 + //------------------------------------------------------------------------------------------------ + basisu_lowlevel_etc1s_transcoder::basisu_lowlevel_etc1s_transcoder() : m_pGlobal_codebook(nullptr), m_selector_history_buf_size(0) @@ -8620,7 +8642,7 @@ namespace basist // Now make sure the output buffer is large enough, or we'll overwrite memory. if (output_blocks_buf_size_in_blocks_or_pixels < (output_rows_in_pixels * output_row_pitch_in_blocks_or_pixels)) { - BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: output_blocks_buf_size_in_blocks_or_pixels < (output_rows_in_pixels * output_row_pitch_in_blocks_or_pixels)\n"); + BASISU_DEVEL_ERROR("basis_validate_output_buffer_size: output_blocks_buf_size_in_blocks_or_pixels < (output_rows_in_pixels * output_row_pitch_in_blocks_or_pixels)\n"); return false; } } @@ -8632,7 +8654,7 @@ namespace basist if (output_blocks_buf_size_in_blocks_or_pixels < total_blocks_fxt1) { - BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: output_blocks_buf_size_in_blocks_or_pixels < total_blocks_fxt1\n"); + BASISU_DEVEL_ERROR("basis_validate_output_buffer_size: output_blocks_buf_size_in_blocks_or_pixels < total_blocks_fxt1\n"); return false; } } @@ -8640,7 +8662,7 @@ namespace basist { if (output_blocks_buf_size_in_blocks_or_pixels < total_slice_blocks) { - BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: 
output_blocks_buf_size_in_blocks_or_pixels < transcode_image\n"); + BASISU_DEVEL_ERROR("basis_validate_output_buffer_size: output_blocks_buf_size_in_blocks_or_pixels < transcode_image\n"); return false; } } @@ -9242,13 +9264,17 @@ namespace basist return status; } + + //------------------------------------------------------------------------------------------------ basisu_lowlevel_uastc_transcoder::basisu_lowlevel_uastc_transcoder() { } - bool basisu_lowlevel_uastc_transcoder::transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt, - uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, bool has_alpha, const uint32_t orig_width, const uint32_t orig_height, uint32_t output_row_pitch_in_blocks_or_pixels, + bool basisu_lowlevel_uastc_transcoder::transcode_slice( + void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt, + uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, bool has_alpha, + const uint32_t orig_width, const uint32_t orig_height, uint32_t output_row_pitch_in_blocks_or_pixels, basisu_transcoder_state* pState, uint32_t output_rows_in_pixels, int channel0, int channel1, uint32_t decode_flags) { BASISU_NOTE_UNUSED(pState); @@ -9784,6 +9810,317 @@ namespace basist return status; } + + //------------------------------------------------------------------------------------------------ + + basisu_lowlevel_uastc_hdr_transcoder::basisu_lowlevel_uastc_hdr_transcoder() + { + } + + bool basisu_lowlevel_uastc_hdr_transcoder::transcode_slice( + void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt, + uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, bool has_alpha, + const uint32_t orig_width, const uint32_t orig_height, 
uint32_t output_row_pitch_in_blocks_or_pixels, + basisu_transcoder_state* pState, uint32_t output_rows_in_pixels, int channel0, int channel1, uint32_t decode_flags) + { + BASISU_NOTE_UNUSED(pState); + BASISU_NOTE_UNUSED(bc1_allow_threecolor_blocks); + BASISU_NOTE_UNUSED(has_alpha); + BASISU_NOTE_UNUSED(channel0); + BASISU_NOTE_UNUSED(channel1); + BASISU_NOTE_UNUSED(decode_flags); + + assert(g_transcoder_initialized); + if (!g_transcoder_initialized) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_slice: Transcoder not globally initialized.\n"); + return false; + } + +#if BASISD_SUPPORT_UASTC_HDR + const uint32_t total_blocks = num_blocks_x * num_blocks_y; + + if (!output_row_pitch_in_blocks_or_pixels) + { + if (basis_block_format_is_uncompressed(fmt)) + output_row_pitch_in_blocks_or_pixels = orig_width; + else + output_row_pitch_in_blocks_or_pixels = num_blocks_x; + } + + if (basis_block_format_is_uncompressed(fmt)) + { + if (!output_rows_in_pixels) + output_rows_in_pixels = orig_height; + } + + uint32_t total_expected_block_bytes = sizeof(astc_blk) * total_blocks; + if (image_data_size < total_expected_block_bytes) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_slice: image_data_size < total_expected_block_bytes The file is corrupted or this is a bug.\n"); + return false; + } + + const astc_blk* pSource_block = reinterpret_cast(pImage_data); + + bool status = false; + + // TODO: Optimize pure memcpy() case. + + for (uint32_t block_y = 0; block_y < num_blocks_y; ++block_y) + { + void* pDst_block = (uint8_t*)pDst_blocks + block_y * output_row_pitch_in_blocks_or_pixels * output_block_or_pixel_stride_in_bytes; + + for (uint32_t block_x = 0; block_x < num_blocks_x; ++block_x, ++pSource_block, pDst_block = (uint8_t*)pDst_block + output_block_or_pixel_stride_in_bytes) + { + switch (fmt) + { + case block_format::cUASTC_HDR_4x4: + case block_format::cASTC_HDR_4x4: + { + // Nothing to do, UASTC HDR is just ASTC. 
+ memcpy(pDst_block, pSource_block, sizeof(uastc_block)); + status = true; + break; + } + case block_format::cBC6H: + { + status = astc_hdr_transcode_to_bc6h(*pSource_block, *(bc6h_block *)pDst_block); + break; + } + case block_format::cRGB_9E5: + { + astc_helpers::log_astc_block log_blk; + status = astc_helpers::unpack_block(pSource_block, log_blk, 4, 4); + if (status) + { + uint32_t* pDst_pixels = reinterpret_cast( + static_cast(pDst_blocks) + (block_x * 4 + block_y * 4 * output_row_pitch_in_blocks_or_pixels) * sizeof(uint32_t) + ); + + uint32_t blk_texels[4][4]; + + status = astc_helpers::decode_block(log_blk, blk_texels, 4, 4, astc_helpers::cDecodeModeRGB9E5); + + if (status) + { + const uint32_t max_x = basisu::minimum(4, (int)output_row_pitch_in_blocks_or_pixels - (int)block_x * 4); + const uint32_t max_y = basisu::minimum(4, (int)output_rows_in_pixels - (int)block_y * 4); + + for (uint32_t y = 0; y < max_y; y++) + { + memcpy(pDst_pixels, &blk_texels[y][0], sizeof(uint32_t) * max_x); + + pDst_pixels += output_row_pitch_in_blocks_or_pixels; + } // y + } + } + + break; + } + case block_format::cRGBA_HALF: + { + astc_helpers::log_astc_block log_blk; + status = astc_helpers::unpack_block(pSource_block, log_blk, 4, 4); + if (status) + { + half_float* pDst_pixels = reinterpret_cast( + static_cast(pDst_blocks) + (block_x * 4 + block_y * 4 * output_row_pitch_in_blocks_or_pixels) * sizeof(half_float) * 4 + ); + + half_float blk_texels[4][4][4]; + status = astc_helpers::decode_block(log_blk, blk_texels, 4, 4, astc_helpers::cDecodeModeHDR16); + + if (status) + { + const uint32_t max_x = basisu::minimum(4, (int)output_row_pitch_in_blocks_or_pixels - (int)block_x * 4); + const uint32_t max_y = basisu::minimum(4, (int)output_rows_in_pixels - (int)block_y * 4); + + for (uint32_t y = 0; y < max_y; y++) + { + for (uint32_t x = 0; x < max_x; x++) + { + pDst_pixels[0 + 4 * x] = blk_texels[y][x][0]; + pDst_pixels[1 + 4 * x] = blk_texels[y][x][1]; + pDst_pixels[2 + 4 * x] = 
blk_texels[y][x][2]; + pDst_pixels[3 + 4 * x] = blk_texels[y][x][3]; + } // x + + pDst_pixels += output_row_pitch_in_blocks_or_pixels * 4; + } // y + } + } + + break; + } + case block_format::cRGB_HALF: + { + astc_helpers:: log_astc_block log_blk; + status = astc_helpers::unpack_block(pSource_block, log_blk, 4, 4); + if (status) + { + half_float* pDst_pixels = + reinterpret_cast(static_cast(pDst_blocks) + (block_x * 4 + block_y * 4 * output_row_pitch_in_blocks_or_pixels) * sizeof(half_float) * 3); + + half_float blk_texels[4][4][4]; + status = astc_helpers::decode_block(log_blk, blk_texels, 4, 4, astc_helpers::cDecodeModeHDR16); + if (status) + { + const uint32_t max_x = basisu::minimum(4, (int)output_row_pitch_in_blocks_or_pixels - (int)block_x * 4); + const uint32_t max_y = basisu::minimum(4, (int)output_rows_in_pixels - (int)block_y * 4); + + for (uint32_t y = 0; y < max_y; y++) + { + for (uint32_t x = 0; x < max_x; x++) + { + pDst_pixels[0 + 3 * x] = blk_texels[y][x][0]; + pDst_pixels[1 + 3 * x] = blk_texels[y][x][1]; + pDst_pixels[2 + 3 * x] = blk_texels[y][x][2]; + } // x + + pDst_pixels += output_row_pitch_in_blocks_or_pixels * 3; + } // y + } + } + + break; + } + default: + assert(0); + break; + + } + + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_slice: Transcoder failed to unpack a UASTC HDR block - this is a bug, or the data was corrupted\n"); return false; + } + + } // block_x + + } // block_y + + return true; +#else + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_slice: UASTC_HDR is unsupported\n"); + + BASISU_NOTE_UNUSED(decode_flags); + BASISU_NOTE_UNUSED(channel0); + BASISU_NOTE_UNUSED(channel1); + BASISU_NOTE_UNUSED(output_rows_in_pixels); + BASISU_NOTE_UNUSED(output_row_pitch_in_blocks_or_pixels); + BASISU_NOTE_UNUSED(output_block_or_pixel_stride_in_bytes); + BASISU_NOTE_UNUSED(fmt); + BASISU_NOTE_UNUSED(image_data_size); + BASISU_NOTE_UNUSED(pImage_data); + 
BASISU_NOTE_UNUSED(num_blocks_x); + BASISU_NOTE_UNUSED(num_blocks_y); + BASISU_NOTE_UNUSED(pDst_blocks); + + return false; +#endif + } + + bool basisu_lowlevel_uastc_hdr_transcoder::transcode_image( + transcoder_texture_format target_format, + void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels, + const uint8_t* pCompressed_data, uint32_t compressed_data_length, + uint32_t num_blocks_x, uint32_t num_blocks_y, uint32_t orig_width, uint32_t orig_height, uint32_t level_index, + uint32_t slice_offset, uint32_t slice_length, + uint32_t decode_flags, + bool has_alpha, + bool is_video, + uint32_t output_row_pitch_in_blocks_or_pixels, + basisu_transcoder_state* pState, + uint32_t output_rows_in_pixels, + int channel0, int channel1) + { + BASISU_NOTE_UNUSED(is_video); + BASISU_NOTE_UNUSED(level_index); + BASISU_NOTE_UNUSED(decode_flags); + + if (((uint64_t)slice_offset + slice_length) > (uint64_t)compressed_data_length) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_image: source data buffer too small\n"); + return false; + } + + const uint32_t bytes_per_block_or_pixel = basis_get_bytes_per_block_or_pixel(target_format); + const uint32_t total_slice_blocks = num_blocks_x * num_blocks_y; + + if (!basis_validate_output_buffer_size(target_format, output_blocks_buf_size_in_blocks_or_pixels, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, output_rows_in_pixels, total_slice_blocks)) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_image: output buffer size too small\n"); + return false; + } + + bool status = false; + + switch (target_format) + { + case transcoder_texture_format::cTFASTC_HDR_4x4_RGBA: + { + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cASTC_HDR_4x4, + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, 
channel0, channel1); + + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_image: transcode_slice() to ASTC_HDR failed\n"); + } + break; + } + case transcoder_texture_format::cTFBC6H: + { + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cBC6H, + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, channel0, channel1); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_image: transcode_slice() to BC6H failed\n"); + } + break; + } + case transcoder_texture_format::cTFRGB_HALF: + { + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cRGB_HALF, + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_image: transcode_slice() to RGB_HALF failed\n"); + } + break; + } + case transcoder_texture_format::cTFRGBA_HALF: + { + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cRGBA_HALF, + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_image: transcode_slice() to RGBA_HALF failed\n"); + } + break; + } + case transcoder_texture_format::cTFRGB_9E5: + { + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cRGB_9E5, + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels); + if 
(!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_image: transcode_slice() to RGBA_HALF failed\n"); + } + break; + } + default: + { + assert(0); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_image: Invalid format\n"); + break; + } + } + + return status; + } + + //------------------------------------------------------------------------------------------------ basisu_transcoder::basisu_transcoder() : m_ready_to_transcode(false) @@ -10390,7 +10727,7 @@ namespace basist } else { - // Nothing special to do for UASTC. + // Nothing special to do for UASTC/UASTC HDR. if (m_lowlevel_etc1s_decoder.m_local_endpoints.size()) { m_lowlevel_etc1s_decoder.clear(); @@ -10510,7 +10847,14 @@ namespace basist return false; } - if (pHeader->m_tex_format == (int)basis_tex_format::cUASTC4x4) + if (pHeader->m_tex_format == (int)basis_tex_format::cUASTC_HDR_4x4) + { + return m_lowlevel_uastc_hdr_decoder.transcode_slice(pOutput_blocks, slice_desc.m_num_blocks_x, slice_desc.m_num_blocks_y, + pDataU8 + slice_desc.m_file_ofs, slice_desc.m_file_size, + fmt, output_block_or_pixel_stride_in_bytes, (decode_flags & cDecodeFlagsBC1ForbidThreeColorBlocks) == 0, *pHeader, slice_desc, output_row_pitch_in_blocks_or_pixels, pState, + output_rows_in_pixels, channel0, channel1, decode_flags); + } + else if (pHeader->m_tex_format == (int)basis_tex_format::cUASTC4x4) { return m_lowlevel_uastc_decoder.transcode_slice(pOutput_blocks, slice_desc.m_num_blocks_x, slice_desc.m_num_blocks_y, pDataU8 + slice_desc.m_file_ofs, slice_desc.m_file_size, @@ -10742,7 +11086,18 @@ namespace basist memset(static_cast(pOutput_blocks) + total_slice_blocks * bytes_per_block_or_pixel, 0, (output_blocks_buf_size_in_blocks_or_pixels - total_slice_blocks) * bytes_per_block_or_pixel); } - if (pHeader->m_tex_format == (int)basis_tex_format::cUASTC4x4) + if (pHeader->m_tex_format == (int)basis_tex_format::cUASTC_HDR_4x4) + { + const basis_slice_desc* pSlice_desc = 
&pSlice_descs[slice_index]; + + // Use the container independent image transcode method. + status = m_lowlevel_uastc_hdr_decoder.transcode_image(fmt, + pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, + (const uint8_t*)pData, data_size, pSlice_desc->m_num_blocks_x, pSlice_desc->m_num_blocks_y, pSlice_desc->m_orig_width, pSlice_desc->m_orig_height, pSlice_desc->m_level_index, + pSlice_desc->m_file_ofs, pSlice_desc->m_file_size, + decode_flags, basis_file_has_alpha_slices, pHeader->m_tex_type == cBASISTexTypeVideoFrames, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels); + } + else if (pHeader->m_tex_format == (int)basis_tex_format::cUASTC4x4) { const basis_slice_desc* pSlice_desc = &pSlice_descs[slice_index]; @@ -10808,20 +11163,27 @@ namespace basist return 8; case transcoder_texture_format::cTFBC7_RGBA: case transcoder_texture_format::cTFBC7_ALT: + case transcoder_texture_format::cTFBC6H: case transcoder_texture_format::cTFETC2_RGBA: case transcoder_texture_format::cTFBC3_RGBA: case transcoder_texture_format::cTFBC5_RG: case transcoder_texture_format::cTFASTC_4x4_RGBA: + case transcoder_texture_format::cTFASTC_HDR_4x4_RGBA: case transcoder_texture_format::cTFATC_RGBA: case transcoder_texture_format::cTFFXT1_RGB: case transcoder_texture_format::cTFETC2_EAC_RG11: return 16; case transcoder_texture_format::cTFRGBA32: + case transcoder_texture_format::cTFRGB_9E5: return sizeof(uint32_t); case transcoder_texture_format::cTFRGB565: case transcoder_texture_format::cTFBGR565: case transcoder_texture_format::cTFRGBA4444: return sizeof(uint16_t); + case transcoder_texture_format::cTFRGB_HALF: + return sizeof(half_float) * 3; + case transcoder_texture_format::cTFRGBA_HALF: + return sizeof(half_float) * 4; default: assert(0); BASISU_DEVEL_ERROR("basis_get_basisu_texture_format: Invalid fmt\n"); @@ -10845,17 +11207,22 @@ namespace basist case transcoder_texture_format::cTFBC3_RGBA: return "BC3_RGBA"; case transcoder_texture_format::cTFBC5_RG: 
return "BC5_RG"; case transcoder_texture_format::cTFASTC_4x4_RGBA: return "ASTC_RGBA"; + case transcoder_texture_format::cTFASTC_HDR_4x4_RGBA: return "ASTC_HDR_RGBA"; case transcoder_texture_format::cTFATC_RGB: return "ATC_RGB"; case transcoder_texture_format::cTFATC_RGBA: return "ATC_RGBA"; case transcoder_texture_format::cTFRGBA32: return "RGBA32"; case transcoder_texture_format::cTFRGB565: return "RGB565"; case transcoder_texture_format::cTFBGR565: return "BGR565"; case transcoder_texture_format::cTFRGBA4444: return "RGBA4444"; + case transcoder_texture_format::cTFRGBA_HALF: return "RGBA_HALF"; + case transcoder_texture_format::cTFRGB_9E5: return "RGB_9E5"; + case transcoder_texture_format::cTFRGB_HALF: return "RGB_HALF"; case transcoder_texture_format::cTFFXT1_RGB: return "FXT1_RGB"; case transcoder_texture_format::cTFPVRTC2_4_RGB: return "PVRTC2_4_RGB"; case transcoder_texture_format::cTFPVRTC2_4_RGBA: return "PVRTC2_4_RGBA"; case transcoder_texture_format::cTFETC2_EAC_R11: return "ETC2_EAC_R11"; case transcoder_texture_format::cTFETC2_EAC_RG11: return "ETC2_EAC_RG11"; + case transcoder_texture_format::cTFBC6H: return "BC6H"; default: assert(0); BASISU_DEVEL_ERROR("basis_get_basisu_texture_format: Invalid fmt\n"); @@ -10881,7 +11248,13 @@ namespace basist case block_format::cRGB565: return "RGB565"; case block_format::cBGR565: return "BGR565"; case block_format::cRGBA4444: return "RGBA4444"; + case block_format::cRGBA_HALF: return "RGBA_HALF"; + case block_format::cRGB_HALF: return "RGB_HALF"; + case block_format::cRGB_9E5: return "RGB_9E5"; case block_format::cUASTC_4x4: return "UASTC_4x4"; + case block_format::cUASTC_HDR_4x4: return "UASTC_HDR_4x4"; + case block_format::cBC6H: return "BC6H"; + case block_format::cASTC_HDR_4x4: return "ASTC_HDR_4x4"; case block_format::cFXT1_RGB: return "FXT1_RGB"; case block_format::cPVRTC2_4_RGB: return "PVRTC2_4_RGB"; case block_format::cPVRTC2_4_RGBA: return "PVRTC2_4_RGBA"; @@ -10914,11 +11287,13 @@ namespace basist bool 
basis_transcoder_format_has_alpha(transcoder_texture_format fmt) { + // TODO: Technically ASTC_HDR does support alpha, but UASTC_HDR doesn't yet support it. Unsure what to do here. switch (fmt) { case transcoder_texture_format::cTFETC2_RGBA: case transcoder_texture_format::cTFBC3_RGBA: case transcoder_texture_format::cTFASTC_4x4_RGBA: + case transcoder_texture_format::cTFASTC_HDR_4x4_RGBA: case transcoder_texture_format::cTFBC7_RGBA: case transcoder_texture_format::cTFBC7_ALT: case transcoder_texture_format::cTFPVRTC1_4_RGBA: @@ -10926,6 +11301,23 @@ namespace basist case transcoder_texture_format::cTFATC_RGBA: case transcoder_texture_format::cTFRGBA32: case transcoder_texture_format::cTFRGBA4444: + case transcoder_texture_format::cTFRGBA_HALF: + return true; + default: + break; + } + return false; + } + + bool basis_transcoder_format_is_hdr(transcoder_texture_format fmt) + { + switch (fmt) + { + case transcoder_texture_format::cTFASTC_HDR_4x4_RGBA: + case transcoder_texture_format::cTFBC6H: + case transcoder_texture_format::cTFRGBA_HALF: + case transcoder_texture_format::cTFRGB_HALF: + case transcoder_texture_format::cTFRGB_9E5: return true; default: break; @@ -10947,13 +11339,18 @@ namespace basist case transcoder_texture_format::cTFETC2_RGBA: return basisu::texture_format::cETC2_RGBA; case transcoder_texture_format::cTFBC3_RGBA: return basisu::texture_format::cBC3; case transcoder_texture_format::cTFBC5_RG: return basisu::texture_format::cBC5; - case transcoder_texture_format::cTFASTC_4x4_RGBA: return basisu::texture_format::cASTC4x4; + case transcoder_texture_format::cTFASTC_4x4_RGBA: return basisu::texture_format::cASTC_LDR_4x4; + case transcoder_texture_format::cTFASTC_HDR_4x4_RGBA: return basisu::texture_format::cASTC_HDR_4x4; + case transcoder_texture_format::cTFBC6H: return basisu::texture_format::cBC6HUnsigned; case transcoder_texture_format::cTFATC_RGB: return basisu::texture_format::cATC_RGB; case transcoder_texture_format::cTFATC_RGBA: return 
basisu::texture_format::cATC_RGBA_INTERPOLATED_ALPHA; case transcoder_texture_format::cTFRGBA32: return basisu::texture_format::cRGBA32; case transcoder_texture_format::cTFRGB565: return basisu::texture_format::cRGB565; case transcoder_texture_format::cTFBGR565: return basisu::texture_format::cBGR565; case transcoder_texture_format::cTFRGBA4444: return basisu::texture_format::cRGBA4444; + case transcoder_texture_format::cTFRGBA_HALF: return basisu::texture_format::cRGBA_HALF; + case transcoder_texture_format::cTFRGB_9E5: return basisu::texture_format::cRGB_9E5; + case transcoder_texture_format::cTFRGB_HALF: return basisu::texture_format::cRGB_HALF; case transcoder_texture_format::cTFFXT1_RGB: return basisu::texture_format::cFXT1_RGB; case transcoder_texture_format::cTFPVRTC2_4_RGB: return basisu::texture_format::cPVRTC2_4_RGBA; case transcoder_texture_format::cTFPVRTC2_4_RGBA: return basisu::texture_format::cPVRTC2_4_RGBA; @@ -10975,6 +11372,9 @@ namespace basist case transcoder_texture_format::cTFRGB565: case transcoder_texture_format::cTFBGR565: case transcoder_texture_format::cTFRGBA4444: + case transcoder_texture_format::cTFRGB_HALF: + case transcoder_texture_format::cTFRGBA_HALF: + case transcoder_texture_format::cTFRGB_9E5: return true; default: break; @@ -10995,6 +11395,9 @@ namespace basist case block_format::cRGBA4444_COLOR: case block_format::cRGBA4444_ALPHA: case block_format::cRGBA4444_COLOR_OPAQUE: + case block_format::cRGBA_HALF: + case block_format::cRGB_HALF: + case block_format::cRGB_9E5: return true; default: break; @@ -11007,11 +11410,16 @@ namespace basist switch (fmt) { case transcoder_texture_format::cTFRGBA32: + case transcoder_texture_format::cTFRGB_9E5: return sizeof(uint32_t); case transcoder_texture_format::cTFRGB565: case transcoder_texture_format::cTFBGR565: case transcoder_texture_format::cTFRGBA4444: return sizeof(uint16_t); + case transcoder_texture_format::cTFRGB_HALF: + return sizeof(half_float) * 3; + case 
transcoder_texture_format::cTFRGBA_HALF: + return sizeof(half_float) * 4; default: break; } @@ -11038,8 +11446,26 @@ namespace basist bool basis_is_format_supported(transcoder_texture_format tex_type, basis_tex_format fmt) { - if (fmt == basis_tex_format::cUASTC4x4) + if (fmt == basis_tex_format::cUASTC_HDR_4x4) + { + // UASTC HDR +#if BASISD_SUPPORT_UASTC_HDR + switch (tex_type) + { + case transcoder_texture_format::cTFASTC_HDR_4x4_RGBA: + case transcoder_texture_format::cTFBC6H: + case transcoder_texture_format::cTFRGBA_HALF: + case transcoder_texture_format::cTFRGB_HALF: + case transcoder_texture_format::cTFRGB_9E5: + return true; + default: + break; + } +#endif + } + else if (fmt == basis_tex_format::cUASTC4x4) { + // UASTC LDR #if BASISD_SUPPORT_UASTC switch (tex_type) { @@ -11049,6 +11475,12 @@ namespace basist case transcoder_texture_format::cTFATC_RGB: case transcoder_texture_format::cTFATC_RGBA: case transcoder_texture_format::cTFFXT1_RGB: + // UASTC LDR doesn't support transcoding to HDR formats + case transcoder_texture_format::cTFASTC_HDR_4x4_RGBA: + case transcoder_texture_format::cTFBC6H: + case transcoder_texture_format::cTFRGBA_HALF: + case transcoder_texture_format::cTFRGB_HALF: + case transcoder_texture_format::cTFRGB_9E5: return false; default: return true; @@ -11057,6 +11489,7 @@ namespace basist } else { + // ETC1S switch (tex_type) { // ETC1 and uncompressed are always supported. 
@@ -11812,7 +12245,7 @@ namespace basist // Encodes 3 values to output, usable for any range that uses quints and bits static inline void astc_encode_quints(uint32_t* pOutput, const uint8_t* pValues, int& bit_pos, int n) { - // First extract the trits and the bits from the 5 input values + // First extract the quints and the bits from the 3 input values int quints = 0, bits[3]; const uint32_t bit_mask = (1 << n) - 1; for (int i = 0; i < 3; i++) @@ -12131,11 +12564,13 @@ namespace basist return bits & ((1U << codesize) - 1U); } - - uint32_t byte_bit_offset = bit_offset & 7U; - const uint16_t w = *(const uint16_t*)(&pBuf[bit_offset >> 3U]); - bit_offset += codesize; - return (w >> byte_bit_offset)& ((1U << codesize) - 1U); + else + { + uint32_t byte_bit_offset = bit_offset & 7U; + const uint16_t w = *(const uint16_t*)(&pBuf[bit_offset >> 3U]); + bit_offset += codesize; + return (w >> byte_bit_offset) & ((1U << codesize) - 1U); + } } bool unpack_uastc(const uastc_block& blk, unpacked_uastc_block& unpacked, bool blue_contract_check, bool read_hints) @@ -12170,6 +12605,7 @@ namespace basist return false; unpacked.m_mode = mode; + unpacked.m_common_pattern = 0; uint32_t bit_ofs = g_uastc_mode_huff_codes[mode][1]; @@ -16663,10 +17099,12 @@ namespace basist memcpy(&m_header, pData, sizeof(m_header)); - // We only support UASTC and ETC1S - if (m_header.m_vk_format != KTX2_VK_FORMAT_UNDEFINED) + // We only support UASTC LDR, UASTC HDR and ETC1S. + // Note the DFD's contents are what we are guided by for decoding the KTX2 file, not this format field (currently). 
+ if ((m_header.m_vk_format != KTX2_VK_FORMAT_UNDEFINED) && + (m_header.m_vk_format != basist::KTX2_FORMAT_UASTC_4x4_SFLOAT_BLOCK)) { - BASISU_DEVEL_ERROR("ktx2_transcoder::init: KTX2 file must be in ETC1S or UASTC format\n"); + BASISU_DEVEL_ERROR("ktx2_transcoder::init: KTX2 file must be in ETC1S or UASTC LDR/HDR format\n"); return false; } @@ -16890,6 +17328,16 @@ namespace basist // We're assuming "DATA" means RGBA so it has alpha. m_has_alpha = (m_dfd_chan0 == KTX2_DF_CHANNEL_UASTC_RGBA) || (m_dfd_chan0 == KTX2_DF_CHANNEL_UASTC_RRRG); } + else if (m_dfd_color_model == KTX2_KDF_DF_MODEL_UASTC_HDR) + { + m_format = basist::basis_tex_format::cUASTC_HDR_4x4; + + m_dfd_samples = 1; + m_dfd_chan0 = (ktx2_df_channel_id)((sample_channel0 >> 24) & 15); + + // We're assuming "DATA" means RGBA so it has alpha. + m_has_alpha = (m_dfd_chan0 == KTX2_DF_CHANNEL_UASTC_RGBA) || (m_dfd_chan0 == KTX2_DF_CHANNEL_UASTC_RRRG); + } else { // Unsupported DFD color model. @@ -17167,7 +17615,8 @@ namespace basist return false; } } - else if (m_format == basist::basis_tex_format::cUASTC4x4) + else if ((m_format == basist::basis_tex_format::cUASTC4x4) || + (m_format == basist::basis_tex_format::cUASTC_HDR_4x4)) { // Compute length and offset to uncompressed 2D UASTC texture data, given the face/layer indices. 
assert(uncomp_level_data_size == m_levels[level_index].m_uncompressed_byte_length); @@ -17188,14 +17637,29 @@ namespace basist return false; } - if (!m_uastc_transcoder.transcode_image(fmt, - pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, - (const uint8_t*)pUncomp_level_data + uncomp_ofs, (uint32_t)total_2D_image_size, num_blocks_x, num_blocks_y, level_width, level_height, level_index, - 0, (uint32_t)total_2D_image_size, - decode_flags, m_has_alpha, m_is_video, output_row_pitch_in_blocks_or_pixels, nullptr, output_rows_in_pixels, channel0, channel1)) + if (m_format == basist::basis_tex_format::cUASTC_HDR_4x4) { - BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: UASTC transcode_image() failed, this is either a bug or the file is corrupted/invalid\n"); - return false; + if (!m_uastc_hdr_transcoder.transcode_image(fmt, + pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, + (const uint8_t*)pUncomp_level_data + uncomp_ofs, (uint32_t)total_2D_image_size, num_blocks_x, num_blocks_y, level_width, level_height, level_index, + 0, (uint32_t)total_2D_image_size, + decode_flags, m_has_alpha, m_is_video, output_row_pitch_in_blocks_or_pixels, nullptr, output_rows_in_pixels, channel0, channel1)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: UASTC HDR transcode_image() failed, this is either a bug or the file is corrupted/invalid\n"); + return false; + } + } + else + { + if (!m_uastc_transcoder.transcode_image(fmt, + pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, + (const uint8_t*)pUncomp_level_data + uncomp_ofs, (uint32_t)total_2D_image_size, num_blocks_x, num_blocks_y, level_width, level_height, level_index, + 0, (uint32_t)total_2D_image_size, + decode_flags, m_has_alpha, m_is_video, output_row_pitch_in_blocks_or_pixels, nullptr, output_rows_in_pixels, channel0, channel1)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: UASTC transcode_image() failed, this is either a bug or the file is 
corrupted/invalid\n"); + return false; + } } } else @@ -17476,4 +17940,1531 @@ namespace basist #endif } + //------------------------------- + +#ifdef BASISD_SUPPORT_UASTC_HDR + // This float->half conversion matches how "F32TO16" works on Intel GPU's. + basist::half_float float_to_half(float val) + { + union { float f; int32_t i; uint32_t u; } fi = { val }; + const int flt_m = fi.i & 0x7FFFFF, flt_e = (fi.i >> 23) & 0xFF, flt_s = (fi.i >> 31) & 0x1; + int s = flt_s, e = 0, m = 0; + + // inf/NaN + if (flt_e == 0xff) + { + e = 31; + if (flt_m != 0) // NaN + m = 1; + } + // not zero or denormal + else if (flt_e != 0) + { + int new_exp = flt_e - 127; + if (new_exp > 15) + e = 31; + else if (new_exp < -14) + m = lrintf((1 << 24) * fabsf(fi.f)); + else + { + e = new_exp + 15; + m = lrintf(flt_m * (1.0f / ((float)(1 << 13)))); + } + } + + assert((0 <= m) && (m <= 1024)); + if (m == 1024) + { + e++; + m = 0; + } + + assert((s >= 0) && (s <= 1)); + assert((e >= 0) && (e <= 31)); + assert((m >= 0) && (m <= 1023)); + + basist::half_float result = (basist::half_float)((s << 15) | (e << 10) | m); + return result; + } + + //------------------------------------------------------------------------------------------------ + // HDR support + // + // Originally from bc6h_enc.cpp + // BC6H decoder fuzzed vs. 
DirectXTex's for unsigned/signed + + const uint8_t g_bc6h_mode_sig_bits[NUM_BC6H_MODES][4] = // base bits, r, g, b + { + // 2 subsets + { 10, 5, 5, 5, }, // 0, mode 1 in MS/D3D docs + { 7, 6, 6, 6, }, // 1 + { 11, 5, 4, 4, }, // 2 + { 11, 4, 5, 4, }, // 3 + { 11, 4, 4, 5, }, // 4 + { 9, 5, 5, 5, }, // 5 + { 8, 6, 5, 5, }, // 6 + { 8, 5, 6, 5, }, // 7 + { 8, 5, 5, 6, }, // 8 + { 6, 6, 6, 6, }, // 9, endpoints not delta encoded, mode 10 in MS/D3D docs + // 1 subset + { 10, 10, 10, 10, }, // 10, endpoints not delta encoded, mode 11 in MS/D3D docs + { 11, 9, 9, 9, }, // 11 + { 12, 8, 8, 8, }, // 12 + { 16, 4, 4, 4, } // 13, also useful for solid blocks + }; + + const int8_t g_bc6h_mode_lookup[32] = { 0, 1, 2, 10, 0, 1, 3, 11, 0, 1, 4, 12, 0, 1, 5, 13, 0, 1, 6, -1, 0, 1, 7, -1, 0, 1, 8, -1, 0, 1, 9, -1 }; + + const bc6h_bit_layout g_bc6h_bit_layouts[NUM_BC6H_MODES][MAX_BC6H_LAYOUT_INDEX] = + { + // comp_index, subset*2+lh_index, last_bit, first_bit + //------------------------ mode 0: 2 subsets, Weight bits: 46 bits, Endpoint bits: 75 bits (10.555, 10.555, 10.555), delta + { { 1, 2, 4, -1 }, { 2, 2, 4, -1 }, { 2, 3, 4, -1 }, { 0, 0, 9, 0 }, { 1, 0, 9, 0 }, { 2, 0, 9, 0 }, { 0, 1, 4, 0 }, + { 1, 3, 4, -1 }, { 1, 2, 3, 0 }, { 1, 1, 4, 0 }, { 2, 3, 0, -1 }, { 1, 3, 3, 0 }, { 2, 1, 4, 0 }, { 2, 3, 1, -1 }, + { 2, 2, 3, 0 }, { 0, 2, 4, 0 }, { 2, 3, 2, -1 }, { 0, 3, 4, 0 }, { 2, 3, 3, -1 }, { 3, -1, 4, 0 }, {-1, 0, 0, 0} }, + //------------------------ mode 1: 2 subsets, Weight bits: 46 bits, Endpoint bits: 75 bits (7.666, 7.666, 7.666), delta + { { 1, 2, 5, -1 },{ 1, 3, 4, -1 },{ 1, 3, 5, -1 },{ 0, 0, 6, 0 },{ 2, 3, 0, -1 },{ 2, 3, 1, -1 },{ 2, 2, 4, -1 }, + { 1, 0, 6, 0 },{ 2, 2, 5, -1 },{ 2, 3, 2, -1 },{ 1, 2, 4, -1 },{ 2, 0, 6, 0 },{ 2, 3, 3, -1 },{ 2, 3, 5, -1 }, + { 2, 3, 4, -1 },{ 0, 1, 5, 0 },{ 1, 2, 3, 0 },{ 1, 1, 5, 0 },{ 1, 3, 3, 0 },{ 2, 1, 5, 0 },{ 2, 2, 3, 0 },{ 0, 2, 5, 0 }, + { 0, 3, 5, 0 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} }, + //------------------------ mode 2: 
2 subsets, Weight bits: 46 bits, Endpoint bits: 72 bits (11.555, 11.444, 11.444), delta + { { 0, 0, 9, 0 },{ 1, 0, 9, 0 },{ 2, 0, 9, 0 },{ 0, 1, 4, 0 },{ 0, 0, 10, -1 },{ 1, 2, 3, 0 },{ 1, 1, 3, 0 },{ 1, 0, 10, -1 }, + { 2, 3, 0, -1 },{ 1, 3, 3, 0 },{ 2, 1, 3, 0 },{ 2, 0, 10, -1 },{ 2, 3, 1, -1 },{ 2, 2, 3, 0 },{ 0, 2, 4, 0 },{ 2, 3, 2, -1 }, + { 0, 3, 4, 0 },{ 2, 3, 3, -1 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} }, + //------------------------ mode 3: 2 subsets, Weight bits: 46 bits, Endpoint bits: 72 bits (11.444, 11.555, 11.444), delta + { { 0, 0, 9, 0 },{ 1, 0, 9, 0 },{ 2, 0, 9, 0 },{ 0, 1, 3, 0 },{ 0, 0, 10, -1 },{ 1, 3, 4, -1 },{ 1, 2, 3, 0 },{ 1, 1, 4, 0 }, + { 1, 0, 10, -1 },{ 1, 3, 3, 0 },{ 2, 1, 3, 0 },{ 2, 0, 10, -1 },{ 2, 3, 1, -1 },{ 2, 2, 3, 0 },{ 0, 2, 3, 0 },{ 2, 3, 0, -1 }, + { 2, 3, 2, -1 },{ 0, 3, 3, 0 },{ 1, 2, 4, -1 },{ 2, 3, 3, -1 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} }, + //------------------------ mode 4: 2 subsets, Weight bits: 46 bits, Endpoint bits: 72 bits (11.444, 11.444, 11.555), delta + { { 0, 0, 9, 0 },{ 1, 0, 9, 0 },{ 2, 0, 9, 0 },{ 0, 1, 3, 0 },{ 0, 0, 10, -1 },{ 2, 2, 4, -1 },{ 1, 2, 3, 0 },{ 1, 1, 3, 0 }, + { 1, 0, 10, -1 },{ 2, 3, 0, -1 },{ 1, 3, 3, 0 },{ 2, 1, 4, 0 },{ 2, 0, 10, -1 },{ 2, 2, 3, 0 },{ 0, 2, 3, 0 },{ 2, 3, 1, -1 }, + { 2, 3, 2, -1 },{ 0, 3, 3, 0 },{ 2, 3, 4, -1 },{ 2, 3, 3, -1 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} }, + //------------------------ mode 5: 2 subsets, Weight bits: 46 bits, Endpoint bits: 72 bits (9.555, 9.555, 9.555), delta + { { 0, 0, 8, 0 },{ 2, 2, 4, -1 },{ 1, 0, 8, 0 },{ 1, 2, 4, -1 },{ 2, 0, 8, 0 },{ 2, 3, 4, -1 },{ 0, 1, 4, 0 },{ 1, 3, 4, -1 }, + { 1, 2, 3, 0 },{ 1, 1, 4, 0 },{ 2, 3, 0, -1 },{ 1, 3, 3, 0 },{ 2, 1, 4, 0 },{ 2, 3, 1, -1 },{ 2, 2, 3, 0 },{ 0, 2, 4, 0 }, + { 2, 3, 2, -1 },{ 0, 3, 4, 0 },{ 2, 3, 3, -1 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} }, + //------------------------ mode 6: 2 subsets, Weight bits: 46 bits, Endpoint bits: 72 bits (8.666, 8.555, 8.555), delta + { { 0, 0, 7, 0 },{ 1, 3, 4, -1 },{ 2, 2, 
4, -1 },{ 1, 0, 7, 0 },{ 2, 3, 2, -1 },{ 1, 2, 4, -1 },{ 2, 0, 7, 0 },{ 2, 3, 3, -1 }, + { 2, 3, 4, -1 },{ 0, 1, 5, 0 },{ 1, 2, 3, 0 },{ 1, 1, 4, 0 },{ 2, 3, 0, -1 },{ 1, 3, 3, 0 },{ 2, 1, 4, 0 },{ 2, 3, 1, -1 }, + { 2, 2, 3, 0 },{ 0, 2, 5, 0 },{ 0, 3, 5, 0 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} }, + //------------------------ mode 7: 2 subsets, Weight bits: 46 bits, Endpoints bits: 72 bits (8.555, 8.666, 8.555), delta + { { 0, 0, 7, 0 },{ 2, 3, 0, -1 },{ 2, 2, 4, -1 },{ 1, 0, 7, 0 },{ 1, 2, 5, -1 },{ 1, 2, 4, -1 },{ 2, 0, 7, 0 },{ 1, 3, 5, -1 }, + { 2, 3, 4, -1 },{ 0, 1, 4, 0 },{ 1, 3, 4, -1 },{ 1, 2, 3, 0 },{ 1, 1, 5, 0 },{ 1, 3, 3, 0 },{ 2, 1, 4, 0 },{ 2, 3, 1, -1 }, + { 2, 2, 3, 0 },{ 0, 2, 4, 0 },{ 2, 3, 2, -1 },{ 0, 3, 4, 0 },{ 2, 3, 3, -1 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} }, + //------------------------ mode 8: 2 subsets, Weight bits: 46 bits, Endpoint bits: 72 bits (8.555, 8.555, 8.666), delta + { { 0, 0, 7, 0 },{ 2, 3, 1, -1 },{ 2, 2, 4, -1 },{ 1, 0, 7, 0 },{ 2, 2, 5, -1 },{ 1, 2, 4, -1 },{ 2, 0, 7, 0 },{ 2, 3, 5, -1 }, + { 2, 3, 4, -1 },{ 0, 1, 4, 0 },{ 1, 3, 4, -1 },{ 1, 2, 3, 0 },{ 1, 1, 4, 0 },{ 2, 3, 0, -1 },{ 1, 3, 3, 0 },{ 2, 1, 5, 0 }, + { 2, 2, 3, 0 },{ 0, 2, 4, 0 },{ 2, 3, 2, -1 },{ 0, 3, 4, 0 },{ 2, 3, 3, -1 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} }, + //------------------------ mode 9: 2 subsets, Weight bits: 46 bits, Endpoint bits: 72 bits (6.6.6.6, 6.6.6.6, 6.6.6.6), NO delta + { { 0, 0, 5, 0 },{ 1, 3, 4, -1 },{ 2, 3, 0, -1 },{ 2, 3, 1, -1 },{ 2, 2, 4, -1 },{ 1, 0, 5, 0 },{ 1, 2, 5, -1 },{ 2, 2, 5, -1 }, + { 2, 3, 2, -1 },{ 1, 2, 4, -1 },{ 2, 0, 5, 0 },{ 1, 3, 5, -1 },{ 2, 3, 3, -1 },{ 2, 3, 5, -1 },{ 2, 3, 4, -1 },{ 0, 1, 5, 0 }, + { 1, 2, 3, 0 },{ 1, 1, 5, 0 },{ 1, 3, 3, 0 },{ 2, 1, 5, 0 },{ 2, 2, 3, 0 },{ 0, 2, 5, 0 },{ 0, 3, 5, 0 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} }, + //------------------------ mode 10: 1 subset, Weight bits: 63 bits, Endpoint bits: 60 bits (10.10, 10.10, 10.10), NO delta + { { 0, 0, 9, 0 },{ 1, 0, 9, 0 },{ 2, 0, 9, 0 },{ 0, 1, 9, 0 },{ 
1, 1, 9, 0 },{ 2, 1, 9, 0 }, {-1, 0, 0, 0} }, + //------------------------ mode 11: 1 subset, Weight bits: 63 bits, Endpoint bits: 60 bits (11.9, 11.9, 11.9), delta + { { 0, 0, 9, 0 },{ 1, 0, 9, 0 },{ 2, 0, 9, 0 },{ 0, 1, 8, 0 },{ 0, 0, 10, -1 },{ 1, 1, 8, 0 },{ 1, 0, 10, -1 },{ 2, 1, 8, 0 },{ 2, 0, 10, -1 }, {-1, 0, 0, 0} }, + //------------------------ mode 12: 1 subset, Weight bits: 63 bits, Endpoint bits: 60 bits (12.8, 12.8, 12.8), delta + { { 0, 0, 9, 0 },{ 1, 0, 9, 0 },{ 2, 0, 9, 0 },{ 0, 1, 7, 0 },{ 0, 0, 10, 11 },{ 1, 1, 7, 0 },{ 1, 0, 10, 11 },{ 2, 1, 7, 0 },{ 2, 0, 10, 11 }, {-1, 0, 0, 0} }, + //------------------------ mode 13: 1 subset, Weight bits: 63 bits, Endpoint bits: 60 bits (16.4, 16.4, 16.4), delta + { { 0, 0, 9, 0 },{ 1, 0, 9, 0 },{ 2, 0, 9, 0 },{ 0, 1, 3, 0 },{ 0, 0, 10, 15 },{ 1, 1, 3, 0 },{ 1, 0, 10, 15 },{ 2, 1, 3, 0 },{ 2, 0, 10, 15 }, {-1, 0, 0, 0} } + }; + + // The same as the first 32 2-subset patterns in BC7. + // Bit 7 is a flag indicating that the weight uses 1 less bit than usual. 
+ const uint8_t g_bc6h_2subset_patterns[TOTAL_BC6H_PARTITION_PATTERNS][4][4] = // [pat][y][x] + { + { {0x80, 0, 1, 1}, { 0, 0, 1, 1 }, { 0, 0, 1, 1 }, { 0, 0, 1, 0x81 }}, { {0x80, 0, 0, 1}, {0, 0, 0, 1}, {0, 0, 0, 1}, {0, 0, 0, 0x81} }, + { {0x80, 1, 1, 1}, {0, 1, 1, 1}, {0, 1, 1, 1}, {0, 1, 1, 0x81} }, { {0x80, 0, 0, 1}, {0, 0, 1, 1}, {0, 0, 1, 1}, {0, 1, 1, 0x81} }, + { {0x80, 0, 0, 0}, {0, 0, 0, 1}, {0, 0, 0, 1}, {0, 0, 1, 0x81} }, { {0x80, 0, 1, 1}, {0, 1, 1, 1}, {0, 1, 1, 1}, {1, 1, 1, 0x81} }, + { {0x80, 0, 0, 1}, {0, 0, 1, 1}, {0, 1, 1, 1}, {1, 1, 1, 0x81} }, { {0x80, 0, 0, 0}, {0, 0, 0, 1}, {0, 0, 1, 1}, {0, 1, 1, 0x81} }, + { {0x80, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 1}, {0, 0, 1, 0x81} }, { {0x80, 0, 1, 1}, {0, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 0x81} }, + { {0x80, 0, 0, 0}, {0, 0, 0, 1}, {0, 1, 1, 1}, {1, 1, 1, 0x81} }, { {0x80, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 1}, {0, 1, 1, 0x81} }, + { {0x80, 0, 0, 1}, {0, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 0x81} }, { {0x80, 0, 0, 0}, {0, 0, 0, 0}, {1, 1, 1, 1}, {1, 1, 1, 0x81} }, + { {0x80, 0, 0, 0}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 0x81} }, { {0x80, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {1, 1, 1, 0x81} }, + { {0x80, 0, 0, 0}, {1, 0, 0, 0}, {1, 1, 1, 0}, {1, 1, 1, 0x81} }, { {0x80, 1, 0x81, 1}, {0, 0, 0, 1}, {0, 0, 0, 0}, {0, 0, 0, 0} }, + { {0x80, 0, 0, 0}, {0, 0, 0, 0}, {0x81, 0, 0, 0}, {1, 1, 1, 0} }, { {0x80, 1, 0x81, 1}, {0, 0, 1, 1}, {0, 0, 0, 1}, {0, 0, 0, 0} }, + { {0x80, 0, 0x81, 1}, {0, 0, 0, 1}, {0, 0, 0, 0}, {0, 0, 0, 0} }, { {0x80, 0, 0, 0}, {1, 0, 0, 0}, {0x81, 1, 0, 0}, {1, 1, 1, 0} }, + { {0x80, 0, 0, 0}, {0, 0, 0, 0}, {0x81, 0, 0, 0}, {1, 1, 0, 0} }, { {0x80, 1, 1, 1}, {0, 0, 1, 1}, { 0, 0, 1, 1}, {0, 0, 0, 0x81} }, + { {0x80, 0, 0x81, 1}, {0, 0, 0, 1}, {0, 0, 0, 1}, {0, 0, 0, 0} }, { {0x80, 0, 0, 0}, {1, 0, 0, 0}, {0x81, 0, 0, 0}, {1, 1, 0, 0} }, + { {0x80, 1, 0x81, 0}, {0, 1, 1, 0}, {0, 1, 1, 0}, {0, 1, 1, 0} }, { {0x80, 0, 0x81, 1}, {0, 1, 1, 0}, {0, 1, 1, 0}, {1, 1, 0, 0} }, + { {0x80, 0, 0, 
// Appends the low num_bits bits of val to a 128-bit little-endian bit buffer held in
// two 64-bit halves: l holds bits 0..63 and h holds bits 64..127. bit_pos is the index
// of the next free bit and is advanced by num_bits.
// Preconditions (asserted): 1 <= num_bits < 64, bit_pos < 128, val fits in num_bits bits,
// and the write does not run past bit 128.
static inline void write_bits(uint64_t val, uint32_t num_bits, uint32_t& bit_pos, uint64_t& l, uint64_t& h)
{
	assert((num_bits) && (num_bits < 64) && (bit_pos < 128));
	assert(val < (1ULL << num_bits));

	const uint32_t first_bit = bit_pos;
	const uint32_t end_bit = first_bit + num_bits;

	if (first_bit >= 64)
	{
		// Entirely inside the high word.
		h |= (val << (first_bit - 64));
	}
	else
	{
		l |= (val << first_bit);

		// Any bits that spill past bit 63 continue at the bottom of the high word.
		if (end_bit > 64)
			h |= (val >> (64 - first_bit));
	}

	bit_pos = end_bit;
	assert(bit_pos <= 128);
}

// Same as write_bits(), but emits the num_bits bits of val in reversed order: the most
// significant of the num_bits bits is written first (at the lowest buffer position).
static inline void write_rev_bits(uint64_t val, uint32_t num_bits, uint32_t& bit_pos, uint64_t& l, uint64_t& h)
{
	assert((num_bits) && (num_bits < 64) && (bit_pos < 128));
	assert(val < (1ULL << num_bits));

	// Walk the source bits from MSB down to LSB, writing one bit at a time.
	for (uint32_t src_bit = num_bits; src_bit-- > 0; )
		write_bits((val >> src_bit) & 1, 1, bit_pos, l, h);
}
5 : 2; + + const uint32_t num_subsets = (mode >= BC6H_FIRST_1SUBSET_MODE_INDEX) ? 1 : 2; + + assert(((num_subsets == 2) && (log_blk.m_partition_pattern < TOTAL_BC6H_PARTITION_PATTERNS)) || + ((num_subsets == 1) && (!log_blk.m_partition_pattern))); + + // Sanity checks + for (uint32_t c = 0; c < 3; c++) + { + assert(log_blk.m_endpoints[c][0] < (1u << g_bc6h_mode_sig_bits[mode][0])); // 1st subset l, base bits + assert(log_blk.m_endpoints[c][1] < (1u << g_bc6h_mode_sig_bits[mode][c + 1])); // 1st subset h, these are deltas except for modes 9,10 + assert(log_blk.m_endpoints[c][2] < (1u << g_bc6h_mode_sig_bits[mode][c + 1])); // 2nd subset l + assert(log_blk.m_endpoints[c][3] < (1u << g_bc6h_mode_sig_bits[mode][c + 1])); // 2nd subset h + } + + const bc6h_bit_layout* pLayout = &g_bc6h_bit_layouts[mode][0]; + + while (pLayout->m_comp != -1) + { + uint32_t v = (pLayout->m_comp == 3) ? log_blk.m_partition_pattern : log_blk.m_endpoints[pLayout->m_comp][pLayout->m_index]; + + if (pLayout->m_first_bit == -1) + { + write_bits((v >> pLayout->m_last_bit) & 1, 1, bit_pos, l, h); + } + else + { + const uint32_t total_bits = basisu::iabs(pLayout->m_last_bit - pLayout->m_first_bit) + 1; + + v >>= basisu::minimum(pLayout->m_first_bit, pLayout->m_last_bit); + v &= ((1 << total_bits) - 1); + + if (pLayout->m_first_bit > pLayout->m_last_bit) + write_rev_bits(v, total_bits, bit_pos, l, h); + else + write_bits(v, total_bits, bit_pos, l, h); + } + + pLayout++; + } + + const uint32_t num_mode_sel_bits = (num_subsets == 1) ? 
4 : 3; + const uint8_t* pPat = &g_bc6h_2subset_patterns[log_blk.m_partition_pattern][0][0]; + + for (uint32_t i = 0; i < 16; i++) + { + const uint32_t sel = log_blk.m_weights[i]; + + uint32_t num_bits = num_mode_sel_bits; + if (num_subsets == 2) + { + const uint32_t subset_index = pPat[i]; + num_bits -= (subset_index >> 7); + } + else if (!i) + { + num_bits--; + } + + assert(sel < (1u << num_bits)); + + write_bits(sel, num_bits, bit_pos, l, h); + } + + assert(bit_pos == 128); + + basisu::write_le_dword(&dst_blk.m_bytes[0], (uint32_t)l); + basisu::write_le_dword(&dst_blk.m_bytes[4], (uint32_t)(l >> 32u)); + basisu::write_le_dword(&dst_blk.m_bytes[8], (uint32_t)h); + basisu::write_le_dword(&dst_blk.m_bytes[12], (uint32_t)(h >> 32u)); + } + +#if 0 + static inline uint32_t bc6h_blog_dequantize_to_blog16(uint32_t comp, uint32_t bits_per_comp) + { + int unq; + + if (bits_per_comp >= 15) + unq = comp; + else if (comp == 0) + unq = 0; + else if (comp == ((1u << bits_per_comp) - 1u)) + unq = 0xFFFFu; + else + unq = ((comp << 16u) + 0x8000u) >> bits_per_comp; + + return unq; + } +#endif + + // Suboptimal, but very close. + static inline uint32_t bc6h_half_to_blog(half_float h, uint32_t num_bits) + { + assert(h <= MAX_BC6H_HALF_FLOAT_AS_UINT); + return (h * 64 + 30) / (31 * (1 << (16 - num_bits))); + } + + // 6,7,8,9,10,11,12 + const uint32_t BC6H_BLOG_TAB_MIN = 6; + const uint32_t BC6H_BLOG_TAB_MAX = 12; + //const uint32_t BC6H_BLOG_TAB_NUM = BC6H_BLOG_TAB_MAX - BC6H_BLOG_TAB_MIN + 1; + + // Handles 16, or 6-12 bits. Others assert. 
+ static inline uint32_t half_to_blog_tab(half_float h, uint32_t num_bits) + { + BASISU_NOTE_UNUSED(BC6H_BLOG_TAB_MIN); + BASISU_NOTE_UNUSED(BC6H_BLOG_TAB_MAX); + + assert(h <= MAX_BC6H_HALF_FLOAT_AS_UINT); + + if (num_bits == 16) + { + return bc6h_half_to_blog(h, 16); + } + else + { + assert((num_bits >= BC6H_BLOG_TAB_MIN) && (num_bits <= BC6H_BLOG_TAB_MAX)); + + // Note: This used to be done using a table lookup, but it required ~224KB of tables. This isn't quite as accurate, but the error is very slight (+-1 half values as ints). + return bc6h_half_to_blog(h, num_bits); + } + } + + bool g_bc6h_enc_initialized; + + void bc6h_enc_init() + { + if (g_bc6h_enc_initialized) + return; + + g_bc6h_enc_initialized = true; + } + + // mode 10, 4-bit weights + void bc6h_enc_block_mode10(bc6h_block* pPacked_block, const half_float pEndpoints[3][2], const uint8_t* pWeights) + { + assert(g_bc6h_enc_initialized); + + for (uint32_t i = 0; i < 16; i++) + { + assert(pWeights[i] <= 15); + } + + bc6h_logical_block log_blk; + log_blk.clear(); + + // Convert half endpoints to blog10 (mode 10 doesn't use delta encoding) + for (uint32_t c = 0; c < 3; c++) + { + log_blk.m_endpoints[c][0] = half_to_blog_tab(pEndpoints[c][0], 10); + log_blk.m_endpoints[c][1] = half_to_blog_tab(pEndpoints[c][1], 10); + } + + memcpy(log_blk.m_weights, pWeights, 16); + + if (log_blk.m_weights[0] & 8) + { + for (uint32_t i = 0; i < 16; i++) + log_blk.m_weights[i] = 15 - log_blk.m_weights[i]; + + for (uint32_t c = 0; c < 3; c++) + { + std::swap(log_blk.m_endpoints[c][0], log_blk.m_endpoints[c][1]); + } + } + + log_blk.m_mode = BC6H_FIRST_1SUBSET_MODE_INDEX; + pack_bc6h_block(*pPacked_block, log_blk); + } + + // Tries modes 11-13 (delta endpoint) encoding, falling back to mode 10 only when necessary, 4-bit weights + void bc6h_enc_block_1subset_4bit_weights(bc6h_block* pPacked_block, const half_float pEndpoints[3][2], const uint8_t* pWeights) + { + assert(g_bc6h_enc_initialized); + + for (uint32_t i = 0; i < 16; 
i++) + { + assert(pWeights[i] <= 15); + } + + bc6h_logical_block log_blk; + log_blk.clear(); + + for (uint32_t mode = BC6H_LAST_MODE_INDEX; mode > BC6H_FIRST_1SUBSET_MODE_INDEX; mode--) + { + const uint32_t num_base_bits = g_bc6h_mode_sig_bits[mode][0], num_delta_bits = g_bc6h_mode_sig_bits[mode][1]; + const int base_bitmask = (1 << num_base_bits) - 1; + const int delta_bitmask = (1 << num_delta_bits) - 1; + BASISU_NOTE_UNUSED(base_bitmask); + + assert(num_delta_bits < num_base_bits); + assert((num_delta_bits == g_bc6h_mode_sig_bits[mode][2]) && (num_delta_bits == g_bc6h_mode_sig_bits[mode][3])); + + uint32_t blog_endpoints[3][2]; + + // Convert half endpoints to blog 16, 12, or 11 + for (uint32_t c = 0; c < 3; c++) + { + blog_endpoints[c][0] = half_to_blog_tab(pEndpoints[c][0], num_base_bits); + assert((int)blog_endpoints[c][0] <= base_bitmask); + + blog_endpoints[c][1] = half_to_blog_tab(pEndpoints[c][1], num_base_bits); + assert((int)blog_endpoints[c][1] <= base_bitmask); + } + + // Copy weights + memcpy(log_blk.m_weights, pWeights, 16); + + // Ensure first weight MSB is 0 + if (log_blk.m_weights[0] & 8) + { + // Invert weights + for (uint32_t i = 0; i < 16; i++) + log_blk.m_weights[i] = 15 - log_blk.m_weights[i]; + + // Swap blog quantized endpoints + for (uint32_t c = 0; c < 3; c++) + { + std::swap(blog_endpoints[c][0], blog_endpoints[c][1]); + } + } + + const int max_delta = (1 << (num_delta_bits - 1)) - 1; + const int min_delta = -(max_delta + 1); + assert((max_delta - min_delta) == delta_bitmask); + + bool failed_flag = false; + for (uint32_t c = 0; c < 3; c++) + { + log_blk.m_endpoints[c][0] = blog_endpoints[c][0]; + + int delta = (int)blog_endpoints[c][1] - (int)blog_endpoints[c][0]; + if ((delta < min_delta) || (delta > max_delta)) + { + failed_flag = true; + break; + } + + log_blk.m_endpoints[c][1] = delta & delta_bitmask; + } + + if (failed_flag) + continue; + + log_blk.m_mode = mode; + pack_bc6h_block(*pPacked_block, log_blk); + + return; + } + + // 
Worst case fall back to mode 10, which can handle any endpoints + bc6h_enc_block_mode10(pPacked_block, pEndpoints, pWeights); + } + + // Mode 9 (direct endpoint encoding), 3-bit weights, but only 1 subset + void bc6h_enc_block_1subset_mode9_3bit_weights(bc6h_block* pPacked_block, const half_float pEndpoints[3][2], const uint8_t* pWeights) + { + assert(g_bc6h_enc_initialized); + + for (uint32_t i = 0; i < 16; i++) + { + assert(pWeights[i] <= 7); + } + + bc6h_logical_block log_blk; + log_blk.clear(); + + // Convert half endpoints to blog6 (mode 9 doesn't use delta encoding) + for (uint32_t c = 0; c < 3; c++) + { + log_blk.m_endpoints[c][0] = half_to_blog_tab(pEndpoints[c][0], 6); + log_blk.m_endpoints[c][2] = log_blk.m_endpoints[c][0]; + + log_blk.m_endpoints[c][1] = half_to_blog_tab(pEndpoints[c][1], 6); + log_blk.m_endpoints[c][3] = log_blk.m_endpoints[c][1]; + } + + memcpy(log_blk.m_weights, pWeights, 16); + + const uint32_t pat_index = 0; + const uint8_t* pPat = &g_bc6h_2subset_patterns[pat_index][0][0]; + + if (log_blk.m_weights[0] & 4) + { + for (uint32_t c = 0; c < 3; c++) + std::swap(log_blk.m_endpoints[c][0], log_blk.m_endpoints[c][1]); + + for (uint32_t i = 0; i < 16; i++) + if ((pPat[i] & 0x7F) == 0) + log_blk.m_weights[i] = 7 - log_blk.m_weights[i]; + } + + if (log_blk.m_weights[15] & 4) + { + for (uint32_t c = 0; c < 3; c++) + std::swap(log_blk.m_endpoints[c][2], log_blk.m_endpoints[c][3]); + + for (uint32_t i = 0; i < 16; i++) + if ((pPat[i] & 0x7F) == 1) + log_blk.m_weights[i] = 7 - log_blk.m_weights[i]; + } + + log_blk.m_mode = 9; + log_blk.m_partition_pattern = pat_index; + pack_bc6h_block(*pPacked_block, log_blk); + } + + // Tries modes 0-8, falls back to mode 9 + void bc6h_enc_block_1subset_3bit_weights(bc6h_block* pPacked_block, const half_float pEndpoints[3][2], const uint8_t* pWeights) + { + assert(g_bc6h_enc_initialized); + + for (uint32_t i = 0; i < 16; i++) + { + assert(pWeights[i] <= 7); + } + + bc6h_logical_block log_blk; + log_blk.clear(); 
+ + for (uint32_t mode_iter = 0; mode_iter <= 8; mode_iter++) + { + static const int s_mode_order[9] = { 2, 3, 4, 0, 5, 6, 7, 8, 1 }; // ordered from largest base bits to least + const uint32_t mode = s_mode_order[mode_iter]; + + const uint32_t num_base_bits = g_bc6h_mode_sig_bits[mode][0]; + const int base_bitmask = (1 << num_base_bits) - 1; + BASISU_NOTE_UNUSED(base_bitmask); + + const uint32_t num_delta_bits[3] = { g_bc6h_mode_sig_bits[mode][1], g_bc6h_mode_sig_bits[mode][2], g_bc6h_mode_sig_bits[mode][3] }; + const int delta_bitmasks[3] = { (1 << num_delta_bits[0]) - 1, (1 << num_delta_bits[1]) - 1, (1 << num_delta_bits[2]) - 1 }; + + uint32_t blog_endpoints[3][4]; + + // Convert half endpoints to blog 7-11 + for (uint32_t c = 0; c < 3; c++) + { + blog_endpoints[c][0] = half_to_blog_tab(pEndpoints[c][0], num_base_bits); + blog_endpoints[c][2] = blog_endpoints[c][0]; + assert((int)blog_endpoints[c][0] <= base_bitmask); + + blog_endpoints[c][1] = half_to_blog_tab(pEndpoints[c][1], num_base_bits); + blog_endpoints[c][3] = blog_endpoints[c][1]; + assert((int)blog_endpoints[c][1] <= base_bitmask); + } + + const uint32_t pat_index = 0; + const uint8_t* pPat = &g_bc6h_2subset_patterns[pat_index][0][0]; + + memcpy(log_blk.m_weights, pWeights, 16); + + if (log_blk.m_weights[0] & 4) + { + // Swap part 0's endpoints/weights + for (uint32_t c = 0; c < 3; c++) + std::swap(blog_endpoints[c][0], blog_endpoints[c][1]); + + for (uint32_t i = 0; i < 16; i++) + if ((pPat[i] & 0x7F) == 0) + log_blk.m_weights[i] = 7 - log_blk.m_weights[i]; + } + + if (log_blk.m_weights[15] & 4) + { + // Swap part 1's endpoints/weights + for (uint32_t c = 0; c < 3; c++) + std::swap(blog_endpoints[c][2], blog_endpoints[c][3]); + + for (uint32_t i = 0; i < 16; i++) + if ((pPat[i] & 0x7F) == 1) + log_blk.m_weights[i] = 7 - log_blk.m_weights[i]; + } + + bool failed_flag = false; + + for (uint32_t c = 0; c < 3; c++) + { + const int max_delta = (1 << (num_delta_bits[c] - 1)) - 1; + + const int min_delta = 
-(max_delta + 1); + assert((max_delta - min_delta) == delta_bitmasks[c]); + + log_blk.m_endpoints[c][0] = blog_endpoints[c][0]; + + int delta0 = (int)blog_endpoints[c][1] - (int)blog_endpoints[c][0]; + int delta1 = (int)blog_endpoints[c][2] - (int)blog_endpoints[c][0]; + int delta2 = (int)blog_endpoints[c][3] - (int)blog_endpoints[c][0]; + + if ((delta0 < min_delta) || (delta0 > max_delta) || + (delta1 < min_delta) || (delta1 > max_delta) || + (delta2 < min_delta) || (delta2 > max_delta)) + { + failed_flag = true; + break; + } + + log_blk.m_endpoints[c][1] = delta0 & delta_bitmasks[c]; + log_blk.m_endpoints[c][2] = delta1 & delta_bitmasks[c]; + log_blk.m_endpoints[c][3] = delta2 & delta_bitmasks[c]; + + if (failed_flag) + break; + } + if (failed_flag) + continue; + + log_blk.m_mode = mode; + log_blk.m_partition_pattern = pat_index; + pack_bc6h_block(*pPacked_block, log_blk); + + return; + + } // mode_iter + + bc6h_enc_block_1subset_mode9_3bit_weights(pPacked_block, pEndpoints, pWeights); + } + + // pEndpoints[subset][comp][lh_index] + void bc6h_enc_block_2subset_mode9_3bit_weights(bc6h_block* pPacked_block, uint32_t common_part_index, const half_float pEndpoints[2][3][2], const uint8_t* pWeights) + { + assert(g_bc6h_enc_initialized); + assert(common_part_index < basist::TOTAL_ASTC_BC7_COMMON_PARTITIONS2); + + for (uint32_t i = 0; i < 16; i++) + { + assert(pWeights[i] <= 7); + } + + bc6h_logical_block log_blk; + log_blk.clear(); + + // Convert half endpoints to blog6 (mode 9 doesn't use delta encoding) + for (uint32_t s = 0; s < 2; s++) + { + for (uint32_t c = 0; c < 3; c++) + { + log_blk.m_endpoints[c][0 + s * 2] = half_to_blog_tab(pEndpoints[s][c][0], 6); + log_blk.m_endpoints[c][1 + s * 2] = half_to_blog_tab(pEndpoints[s][c][1], 6); + } + } + + memcpy(log_blk.m_weights, pWeights, 16); + + //const uint32_t astc_pattern = basist::g_astc_bc7_common_partitions2[common_part_index].m_astc; + const uint32_t bc7_pattern = 
basist::g_astc_bc7_common_partitions2[common_part_index].m_bc7; + + const bool invert_flag = basist::g_astc_bc7_common_partitions2[common_part_index].m_invert; + if (invert_flag) + { + for (uint32_t c = 0; c < 3; c++) + { + std::swap(log_blk.m_endpoints[c][0], log_blk.m_endpoints[c][2]); + std::swap(log_blk.m_endpoints[c][1], log_blk.m_endpoints[c][3]); + } + } + + const uint32_t pat_index = bc7_pattern; + assert(pat_index < 32); + const uint8_t* pPat = &g_bc6h_2subset_patterns[pat_index][0][0]; + + bool swap_flags[2] = { false, false }; + for (uint32_t i = 0; i < 16; i++) + { + if ((pPat[i] & 0x80) == 0) + continue; + + if (log_blk.m_weights[i] & 4) + { + const uint32_t p = pPat[i] & 1; + swap_flags[p] = true; + } + } + + if (swap_flags[0]) + { + for (uint32_t c = 0; c < 3; c++) + std::swap(log_blk.m_endpoints[c][0], log_blk.m_endpoints[c][1]); + + for (uint32_t i = 0; i < 16; i++) + if ((pPat[i] & 0x7F) == 0) + log_blk.m_weights[i] = 7 - log_blk.m_weights[i]; + } + + if (swap_flags[1]) + { + for (uint32_t c = 0; c < 3; c++) + std::swap(log_blk.m_endpoints[c][2], log_blk.m_endpoints[c][3]); + + for (uint32_t i = 0; i < 16; i++) + if ((pPat[i] & 0x7F) == 1) + log_blk.m_weights[i] = 7 - log_blk.m_weights[i]; + } + + log_blk.m_mode = 9; + log_blk.m_partition_pattern = pat_index; + pack_bc6h_block(*pPacked_block, log_blk); + } + + void bc6h_enc_block_2subset_3bit_weights(bc6h_block* pPacked_block, uint32_t common_part_index, const half_float pEndpoints[2][3][2], const uint8_t* pWeights) + { + assert(g_bc6h_enc_initialized); + + for (uint32_t i = 0; i < 16; i++) + { + assert(pWeights[i] <= 7); + } + + bc6h_logical_block log_blk; + log_blk.clear(); + + for (uint32_t mode_iter = 0; mode_iter <= 8; mode_iter++) + { + static const int s_mode_order[9] = { 2, 3, 4, 0, 5, 6, 7, 8, 1 }; // ordered from largest base bits to least + const uint32_t mode = s_mode_order[mode_iter]; + + const uint32_t num_base_bits = g_bc6h_mode_sig_bits[mode][0]; + const int base_bitmask = (1 << 
num_base_bits) - 1; + BASISU_NOTE_UNUSED(base_bitmask); + + const uint32_t num_delta_bits[3] = { g_bc6h_mode_sig_bits[mode][1], g_bc6h_mode_sig_bits[mode][2], g_bc6h_mode_sig_bits[mode][3] }; + const int delta_bitmasks[3] = { (1 << num_delta_bits[0]) - 1, (1 << num_delta_bits[1]) - 1, (1 << num_delta_bits[2]) - 1 }; + + uint32_t blog_endpoints[3][4]; + + // Convert half endpoints to blog 7-11 + for (uint32_t s = 0; s < 2; s++) + { + for (uint32_t c = 0; c < 3; c++) + { + blog_endpoints[c][0 + s * 2] = half_to_blog_tab(pEndpoints[s][c][0], num_base_bits); + blog_endpoints[c][1 + s * 2] = half_to_blog_tab(pEndpoints[s][c][1], num_base_bits); + } + } + + memcpy(log_blk.m_weights, pWeights, 16); + + //const uint32_t astc_pattern = basist::g_astc_bc7_common_partitions2[common_part_index].m_astc; + const uint32_t bc7_pattern = basist::g_astc_bc7_common_partitions2[common_part_index].m_bc7; + + const bool invert_flag = basist::g_astc_bc7_common_partitions2[common_part_index].m_invert; + if (invert_flag) + { + for (uint32_t c = 0; c < 3; c++) + { + std::swap(blog_endpoints[c][0], blog_endpoints[c][2]); + std::swap(blog_endpoints[c][1], blog_endpoints[c][3]); + } + } + + const uint32_t pat_index = bc7_pattern; + assert(pat_index < 32); + const uint8_t* pPat = &g_bc6h_2subset_patterns[pat_index][0][0]; + + bool swap_flags[2] = { false, false }; + for (uint32_t i = 0; i < 16; i++) + { + if ((pPat[i] & 0x80) == 0) + continue; + + if (log_blk.m_weights[i] & 4) + { + const uint32_t p = pPat[i] & 1; + swap_flags[p] = true; + } + } + + if (swap_flags[0]) + { + for (uint32_t c = 0; c < 3; c++) + std::swap(blog_endpoints[c][0], blog_endpoints[c][1]); + + for (uint32_t i = 0; i < 16; i++) + if ((pPat[i] & 0x7F) == 0) + log_blk.m_weights[i] = 7 - log_blk.m_weights[i]; + } + + if (swap_flags[1]) + { + for (uint32_t c = 0; c < 3; c++) + std::swap(blog_endpoints[c][2], blog_endpoints[c][3]); + + for (uint32_t i = 0; i < 16; i++) + if ((pPat[i] & 0x7F) == 1) + log_blk.m_weights[i] = 7 - 
log_blk.m_weights[i]; + } + + // Try packing the endpoints + bool failed_flag = false; + + for (uint32_t c = 0; c < 3; c++) + { + const int max_delta = (1 << (num_delta_bits[c] - 1)) - 1; + + const int min_delta = -(max_delta + 1); + assert((max_delta - min_delta) == delta_bitmasks[c]); + + log_blk.m_endpoints[c][0] = blog_endpoints[c][0]; + + int delta0 = (int)blog_endpoints[c][1] - (int)blog_endpoints[c][0]; + int delta1 = (int)blog_endpoints[c][2] - (int)blog_endpoints[c][0]; + int delta2 = (int)blog_endpoints[c][3] - (int)blog_endpoints[c][0]; + + if ((delta0 < min_delta) || (delta0 > max_delta) || + (delta1 < min_delta) || (delta1 > max_delta) || + (delta2 < min_delta) || (delta2 > max_delta)) + { + failed_flag = true; + break; + } + + log_blk.m_endpoints[c][1] = delta0 & delta_bitmasks[c]; + log_blk.m_endpoints[c][2] = delta1 & delta_bitmasks[c]; + log_blk.m_endpoints[c][3] = delta2 & delta_bitmasks[c]; + + if (failed_flag) + break; + } + if (failed_flag) + continue; + + log_blk.m_mode = mode; + log_blk.m_partition_pattern = pat_index; + pack_bc6h_block(*pPacked_block, log_blk); + + //half_float blk[16 * 3]; + //unpack_bc6h(pPacked_block, blk, false); + + return; + } + + bc6h_enc_block_2subset_mode9_3bit_weights(pPacked_block, common_part_index, pEndpoints, pWeights); + } + + bool bc6h_enc_block_solid_color(bc6h_block* pPacked_block, const half_float pColor[3]) + { + assert(g_bc6h_enc_initialized); + + if ((pColor[0] | pColor[1] | pColor[2]) & 0x8000) + return false; + + // ASTC block unpacker won't allow Inf/NaN's to come through. 
+ //if (is_half_inf_or_nan(pColor[0]) || is_half_inf_or_nan(pColor[1]) || is_half_inf_or_nan(pColor[2])) + // return false; + + uint8_t weights[16]; + memset(weights, 0, sizeof(weights)); + + half_float endpoints[3][2]; + endpoints[0][0] = pColor[0]; + endpoints[0][1] = pColor[0]; + + endpoints[1][0] = pColor[1]; + endpoints[1][1] = pColor[1]; + + endpoints[2][0] = pColor[2]; + endpoints[2][1] = pColor[2]; + + bc6h_enc_block_1subset_4bit_weights(pPacked_block, endpoints, weights); + + return true; + } + + //-------------------------------------------------------------------------------------------------------------------------- + // basisu_astc_hdr_core.cpp + + static bool g_astc_hdr_core_initialized; + static int8_t g_astc_partition_id_to_common_bc7_pat_index[1024]; + + //-------------------------------------------------------------------------------------------------------------------------- + + void astc_hdr_core_init() + { + if (g_astc_hdr_core_initialized) + return; + + memset(g_astc_partition_id_to_common_bc7_pat_index, 0xFF, sizeof(g_astc_partition_id_to_common_bc7_pat_index)); + + for (uint32_t part_index = 0; part_index < basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2; ++part_index) + { + const uint32_t astc_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_astc; + //const uint32_t bc7_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_bc7; + + assert(astc_pattern < 1024); + g_astc_partition_id_to_common_bc7_pat_index[astc_pattern] = (int8_t)part_index; + } + + g_astc_hdr_core_initialized = true; + } + + //-------------------------------------------------------------------------------------------------------------------------- + + static inline int astc_hdr_sign_extend(int src, int num_src_bits) + { + assert(basisu::in_range(num_src_bits, 2, 31)); + + const bool negative = (src & (1 << (num_src_bits - 1))) != 0; + if (negative) + return src | ~((1 << num_src_bits) - 1); + else + return src & ((1 << num_src_bits) - 1); + } + + static 
inline void astc_hdr_pack_bit( + int& dst, int dst_bit, + int src_val, int src_bit = 0) + { + assert(dst_bit >= 0 && dst_bit <= 31); + int bit = basisu::get_bit(src_val, src_bit); + dst |= (bit << dst_bit); + } + + //-------------------------------------------------------------------------------------------------------------------------- + + void decode_mode7_to_qlog12_ise20( + const uint8_t* pEndpoints, + int e[2][3], + int* pScale) + { + assert(g_astc_hdr_core_initialized); + + for (uint32_t i = 0; i < NUM_MODE7_ENDPOINTS; i++) + { + assert(pEndpoints[i] <= 255); + } + + const int v0 = pEndpoints[0], v1 = pEndpoints[1], v2 = pEndpoints[2], v3 = pEndpoints[3]; + + // Extract mode bits and unpack to major component and mode. + const int modeval = ((v0 & 0xC0) >> 6) | ((v1 & 0x80) >> 5) | ((v2 & 0x80) >> 4); + + int majcomp, mode; + if ((modeval & 0xC) != 0xC) + { + majcomp = modeval >> 2; + mode = modeval & 3; + } + else if (modeval != 0xF) + { + majcomp = modeval & 3; + mode = 4; + } + else + { + majcomp = 0; + mode = 5; + } + + // Extract low-order bits of r, g, b, and s. + int red = v0 & 0x3f; + int green = v1 & 0x1f; + int blue = v2 & 0x1f; + int scale = v3 & 0x1f; + + // Extract high-order bits, which may be assigned depending on mode + int x0 = (v1 >> 6) & 1; + int x1 = (v1 >> 5) & 1; + int x2 = (v2 >> 6) & 1; + int x3 = (v2 >> 5) & 1; + int x4 = (v3 >> 7) & 1; + int x5 = (v3 >> 6) & 1; + int x6 = (v3 >> 5) & 1; + + // Now move the high-order xs into the right place. 
+ const int ohm = 1 << mode; + if (ohm & 0x30) green |= x0 << 6; + if (ohm & 0x3A) green |= x1 << 5; + if (ohm & 0x30) blue |= x2 << 6; + if (ohm & 0x3A) blue |= x3 << 5; + if (ohm & 0x3D) scale |= x6 << 5; + if (ohm & 0x2D) scale |= x5 << 6; + if (ohm & 0x04) scale |= x4 << 7; + if (ohm & 0x3B) red |= x4 << 6; + if (ohm & 0x04) red |= x3 << 6; + if (ohm & 0x10) red |= x5 << 7; + if (ohm & 0x0F) red |= x2 << 7; + if (ohm & 0x05) red |= x1 << 8; + if (ohm & 0x0A) red |= x0 << 8; + if (ohm & 0x05) red |= x0 << 9; + if (ohm & 0x02) red |= x6 << 9; + if (ohm & 0x01) red |= x3 << 10; + if (ohm & 0x02) red |= x5 << 10; + + // Shift the bits to the top of the 12-bit result. + static const int s_shamts[6] = { 1,1,2,3,4,5 }; + + const int shamt = s_shamts[mode]; + red <<= shamt; + green <<= shamt; + blue <<= shamt; + scale <<= shamt; + + // Minor components are stored as differences + if (mode != 5) + { + green = red - green; + blue = red - blue; + } + + // Swizzle major component into place + if (majcomp == 1) + std::swap(red, green); + + if (majcomp == 2) + std::swap(red, blue); + + // Clamp output values, set alpha to 1.0 + e[1][0] = basisu::clamp(red, 0, 0xFFF); + e[1][1] = basisu::clamp(green, 0, 0xFFF); + e[1][2] = basisu::clamp(blue, 0, 0xFFF); + + e[0][0] = basisu::clamp(red - scale, 0, 0xFFF); + e[0][1] = basisu::clamp(green - scale, 0, 0xFFF); + e[0][2] = basisu::clamp(blue - scale, 0, 0xFFF); + + if (pScale) + *pScale = scale; + } + + //-------------------------------------------------------------------------------------------------------------------------- + + bool decode_mode7_to_qlog12( + const uint8_t* pEndpoints, + int e[2][3], + int* pScale, + uint32_t ise_endpoint_range) + { + assert(g_astc_hdr_core_initialized); + + if (ise_endpoint_range == astc_helpers::BISE_256_LEVELS) + { + decode_mode7_to_qlog12_ise20(pEndpoints, e, pScale); + } + else + { + uint8_t dequantized_endpoints[NUM_MODE7_ENDPOINTS]; + + for (uint32_t i = 0; i < NUM_MODE7_ENDPOINTS; i++) + 
dequantized_endpoints[i] = astc_helpers::g_dequant_tables.get_endpoint_tab(ise_endpoint_range).m_ISE_to_val[pEndpoints[i]]; + + decode_mode7_to_qlog12_ise20(dequantized_endpoints, e, pScale); + } + + for (uint32_t i = 0; i < 2; i++) + { + if (e[i][0] > (int)MAX_QLOG12) + return false; + + if (e[i][1] > (int)MAX_QLOG12) + return false; + + if (e[i][2] > (int)MAX_QLOG12) + return false; + } + + return true; + } + + //-------------------------------------------------------------------------------------------------------------------------- + + void decode_mode11_to_qlog12_ise20( + const uint8_t* pEndpoints, + int e[2][3]) + { +#ifdef _DEBUG + for (uint32_t i = 0; i < NUM_MODE11_ENDPOINTS; i++) + { + assert(pEndpoints[i] <= 255); + } +#endif + + const uint32_t maj_comp = basisu::get_bit(pEndpoints[4], 7) | (basisu::get_bit(pEndpoints[5], 7) << 1); + + if (maj_comp == 3) + { + // Direct, qlog8 and qlog7 + e[0][0] = pEndpoints[0] << 4; + e[1][0] = pEndpoints[1] << 4; + + e[0][1] = pEndpoints[2] << 4; + e[1][1] = pEndpoints[3] << 4; + + e[0][2] = (pEndpoints[4] & 127) << 5; + e[1][2] = (pEndpoints[5] & 127) << 5; + } + else + { + int v0 = pEndpoints[0]; + int v1 = pEndpoints[1]; + int v2 = pEndpoints[2]; + int v3 = pEndpoints[3]; + int v4 = pEndpoints[4]; + int v5 = pEndpoints[5]; + + int mode = 0; + astc_hdr_pack_bit(mode, 0, v1, 7); + astc_hdr_pack_bit(mode, 1, v2, 7); + astc_hdr_pack_bit(mode, 2, v3, 7); + + int va = v0; + astc_hdr_pack_bit(va, 8, v1, 6); + + int vb0 = v2 & 63; + int vb1 = v3 & 63; + int vc = v1 & 63; + + int vd0 = v4 & 0x7F; // this takes more bits than is sometimes needed + int vd1 = v5 & 0x7F; // this takes more bits than is sometimes needed + static const int8_t dbitstab[8] = { 7,6,7,6,5,6,5,6 }; + vd0 = astc_hdr_sign_extend(vd0, dbitstab[mode]); + vd1 = astc_hdr_sign_extend(vd1, dbitstab[mode]); + + int x0 = basisu::get_bit(v2, 6); + int x1 = basisu::get_bit(v3, 6); + int x2 = basisu::get_bit(v4, 6); + int x3 = basisu::get_bit(v5, 6); + int x4 = 
basisu::get_bit(v4, 5); + int x5 = basisu::get_bit(v5, 5); + + const uint32_t ohm = 1U << mode; + if (ohm & 0xA4) va |= (x0 << 9); + if (ohm & 0x08) va |= (x2 << 9); + if (ohm & 0x50) va |= (x4 << 9); + if (ohm & 0x50) va |= (x5 << 10); + if (ohm & 0xA0) va |= (x1 << 10); + if (ohm & 0xC0) va |= (x2 << 11); + if (ohm & 0x04) vc |= (x1 << 6); + if (ohm & 0xE8) vc |= (x3 << 6); + if (ohm & 0x20) vc |= (x2 << 7); + if (ohm & 0x5B) vb0 |= (x0 << 6); + if (ohm & 0x5B) vb1 |= (x1 << 6); + if (ohm & 0x12) vb0 |= (x2 << 7); + if (ohm & 0x12) vb1 |= (x3 << 7); + + const int shamt = (mode >> 1) ^ 3; + + va = (uint32_t)va << shamt; + vb0 = (uint32_t)vb0 << shamt; + vb1 = (uint32_t)vb1 << shamt; + vc = (uint32_t)vc << shamt; + vd0 = (uint32_t)vd0 << shamt; + vd1 = (uint32_t)vd1 << shamt; + + // qlog12 + e[1][0] = basisu::clamp(va, 0, 0xFFF); + e[1][1] = basisu::clamp(va - vb0, 0, 0xFFF); + e[1][2] = basisu::clamp(va - vb1, 0, 0xFFF); + + e[0][0] = basisu::clamp(va - vc, 0, 0xFFF); + e[0][1] = basisu::clamp(va - vb0 - vc - vd0, 0, 0xFFF); + e[0][2] = basisu::clamp(va - vb1 - vc - vd1, 0, 0xFFF); + + if (maj_comp) + { + std::swap(e[0][0], e[0][maj_comp]); + std::swap(e[1][0], e[1][maj_comp]); + } + } + } + + //-------------------------------------------------------------------------------------------------------------------------- + + bool decode_mode11_to_qlog12( + const uint8_t* pEndpoints, + int e[2][3], + uint32_t ise_endpoint_range) + { + assert(g_astc_hdr_core_initialized); + assert((ise_endpoint_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (ise_endpoint_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE)); + + if (ise_endpoint_range == astc_helpers::BISE_256_LEVELS) + { + decode_mode11_to_qlog12_ise20(pEndpoints, e); + } + else + { + uint8_t dequantized_endpoints[NUM_MODE11_ENDPOINTS]; + + for (uint32_t i = 0; i < NUM_MODE11_ENDPOINTS; i++) + dequantized_endpoints[i] = 
astc_helpers::g_dequant_tables.get_endpoint_tab(ise_endpoint_range).m_ISE_to_val[pEndpoints[i]]; + + decode_mode11_to_qlog12_ise20(dequantized_endpoints, e); + } + + for (uint32_t i = 0; i < 2; i++) + { + if (e[i][0] > (int)MAX_QLOG12) + return false; + + if (e[i][1] > (int)MAX_QLOG12) + return false; + + if (e[i][2] > (int)MAX_QLOG12) + return false; + } + + return true; + } + + //-------------------------------------------------------------------------------------------------------------------------- + + bool transcode_bc6h_1subset(half_float h_e[3][2], const astc_helpers::log_astc_block& best_blk, bc6h_block& transcoded_bc6h_blk) + { + assert(g_astc_hdr_core_initialized); + assert((best_blk.m_weight_ise_range >= 1) && (best_blk.m_weight_ise_range <= 8)); + + if (best_blk.m_weight_ise_range == 5) + { + // Use 3-bit BC6H weights which are a perfect match for 3-bit ASTC weights, but encode 1-subset as 2 equal subsets + bc6h_enc_block_1subset_3bit_weights(&transcoded_bc6h_blk, h_e, best_blk.m_weights); + } + else + { + uint8_t bc6h_weights[16]; + + if (best_blk.m_weight_ise_range == 1) + { + // weight ISE 1: 3 levels + static const uint8_t s_astc1_to_bc6h_3[3] = { 0, 8, 15 }; + + for (uint32_t i = 0; i < 16; i++) + bc6h_weights[i] = s_astc1_to_bc6h_3[best_blk.m_weights[i]]; + } + else if (best_blk.m_weight_ise_range == 2) + { + // weight ISE 2: 4 levels + static const uint8_t s_astc2_to_bc6h_4[4] = { 0, 5, 10, 15 }; + + for (uint32_t i = 0; i < 16; i++) + bc6h_weights[i] = s_astc2_to_bc6h_4[best_blk.m_weights[i]]; + } + else if (best_blk.m_weight_ise_range == 3) + { + // weight ISE 3: 5 levels + static const uint8_t s_astc3_to_bc6h_4[5] = { 0, 4, 7, 11, 15 }; + + for (uint32_t i = 0; i < 16; i++) + bc6h_weights[i] = s_astc3_to_bc6h_4[best_blk.m_weights[i]]; + } + else if (best_blk.m_weight_ise_range == 4) + { + // weight ISE 4: 6 levels + static const uint8_t s_astc4_to_bc6h_4[6] = { 0, 15, 3, 12, 6, 9 }; + + for (uint32_t i = 0; i < 16; i++) + bc6h_weights[i] = 
s_astc4_to_bc6h_4[best_blk.m_weights[i]]; + } + else if (best_blk.m_weight_ise_range == 6) + { + // weight ISE 6: 10 levels + static const uint8_t s_astc6_to_bc6h_4[10] = { 0, 15, 2, 13, 3, 12, 5, 10, 6, 9 }; + + for (uint32_t i = 0; i < 16; i++) + bc6h_weights[i] = s_astc6_to_bc6h_4[best_blk.m_weights[i]]; + } + else if (best_blk.m_weight_ise_range == 7) + { + // weight ISE 7: 12 levels + static const uint8_t s_astc7_to_bc6h_4[12] = { 0, 15, 4, 11, 1, 14, 5, 10, 2, 13, 6, 9 }; + + for (uint32_t i = 0; i < 16; i++) + bc6h_weights[i] = s_astc7_to_bc6h_4[best_blk.m_weights[i]]; + } + else if (best_blk.m_weight_ise_range == 8) + { + // 16 levels + memcpy(bc6h_weights, best_blk.m_weights, 16); + } + else + { + assert(0); + return false; + } + + bc6h_enc_block_1subset_4bit_weights(&transcoded_bc6h_blk, h_e, bc6h_weights); + } + + return true; + } + + //-------------------------------------------------------------------------------------------------------------------------- + + bool transcode_bc6h_2subsets(uint32_t common_part_index, const astc_helpers::log_astc_block& best_blk, bc6h_block& transcoded_bc6h_blk) + { + assert(g_astc_hdr_core_initialized); + assert(best_blk.m_num_partitions == 2); + assert(common_part_index < basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2); + + half_float bc6h_endpoints[2][3][2]; // [subset][comp][lh_index] + + // UASTC HDR checks + // Both CEM's must be equal in 2-subset UASTC HDR. 
+ if (best_blk.m_color_endpoint_modes[0] != best_blk.m_color_endpoint_modes[1]) + return false; + if ((best_blk.m_color_endpoint_modes[0] != 7) && (best_blk.m_color_endpoint_modes[0] != 11)) + return false; + + if (best_blk.m_color_endpoint_modes[0] == 7) + { + if (!(((best_blk.m_weight_ise_range == 1) && (best_blk.m_endpoint_ise_range == 20)) || + ((best_blk.m_weight_ise_range == 2) && (best_blk.m_endpoint_ise_range == 20)) || + ((best_blk.m_weight_ise_range == 3) && (best_blk.m_endpoint_ise_range == 19)) || + ((best_blk.m_weight_ise_range == 4) && (best_blk.m_endpoint_ise_range == 17)) || + ((best_blk.m_weight_ise_range == 5) && (best_blk.m_endpoint_ise_range == 15)))) + { + return false; + } + } + else + { + if (!(((best_blk.m_weight_ise_range == 1) && (best_blk.m_endpoint_ise_range == 14)) || + ((best_blk.m_weight_ise_range == 2) && (best_blk.m_endpoint_ise_range == 12)))) + { + return false; + } + } + + for (uint32_t s = 0; s < 2; s++) + { + int e[2][3]; + if (best_blk.m_color_endpoint_modes[0] == 7) + { + bool success = decode_mode7_to_qlog12(best_blk.m_endpoints + s * NUM_MODE7_ENDPOINTS, e, nullptr, best_blk.m_endpoint_ise_range); + if (!success) + return false; + } + else + { + bool success = decode_mode11_to_qlog12(best_blk.m_endpoints + s * NUM_MODE11_ENDPOINTS, e, best_blk.m_endpoint_ise_range); + if (!success) + return false; + } + + for (uint32_t c = 0; c < 3; c++) + { + bc6h_endpoints[s][c][0] = qlog_to_half_slow(e[0][c], 12); + if (is_half_inf_or_nan(bc6h_endpoints[s][c][0])) + return false; + + bc6h_endpoints[s][c][1] = qlog_to_half_slow(e[1][c], 12); + if (is_half_inf_or_nan(bc6h_endpoints[s][c][1])) + return false; + } + } + + uint8_t bc6h_weights[16]; + if (best_blk.m_weight_ise_range == 1) + { + static const uint8_t s_astc1_to_bc6h_3[3] = { 0, 4, 7 }; + + for (uint32_t i = 0; i < 16; i++) + bc6h_weights[i] = s_astc1_to_bc6h_3[best_blk.m_weights[i]]; + } + else if (best_blk.m_weight_ise_range == 2) + { + static const uint8_t s_astc2_to_bc6h_3[4] 
= { 0, 2, 5, 7 }; + + for (uint32_t i = 0; i < 16; i++) + bc6h_weights[i] = s_astc2_to_bc6h_3[best_blk.m_weights[i]]; + } + else if (best_blk.m_weight_ise_range == 3) + { + static const uint8_t s_astc3_to_bc6h_3[5] = { 0, 2, 4, 5, 7 }; + + for (uint32_t i = 0; i < 16; i++) + bc6h_weights[i] = s_astc3_to_bc6h_3[best_blk.m_weights[i]]; + } + else if (best_blk.m_weight_ise_range == 4) + { + static const uint8_t s_astc4_to_bc6h_3[6] = { 0, 7, 1, 6, 3, 4 }; + + for (uint32_t i = 0; i < 16; i++) + bc6h_weights[i] = s_astc4_to_bc6h_3[best_blk.m_weights[i]]; + } + else if (best_blk.m_weight_ise_range == 5) + { + memcpy(bc6h_weights, best_blk.m_weights, 16); + } + else + { + assert(0); + return false; + } + + bc6h_enc_block_2subset_3bit_weights(&transcoded_bc6h_blk, common_part_index, bc6h_endpoints, bc6h_weights); + + return true; + } + + //-------------------------------------------------------------------------------------------------------------------------- + // Transcodes an UASTC HDR block to BC6H. Must have been encoded to UASTC HDR, or this fails. + bool astc_hdr_transcode_to_bc6h(const astc_blk& src_blk, bc6h_block& dst_blk) + { + assert(g_astc_hdr_core_initialized); + if (!g_astc_hdr_core_initialized) + { + assert(0); + return false; + } + + astc_helpers::log_astc_block log_blk; + + if (!astc_helpers::unpack_block(&src_blk, log_blk, 4, 4)) + { + // Failed unpacking ASTC data + return false; + } + + return astc_hdr_transcode_to_bc6h(log_blk, dst_blk); + } + + //-------------------------------------------------------------------------------------------------------------------------- + // Transcodes an UASTC HDR block to BC6H. Must have been encoded to UASTC HDR, or this fails. 
+ bool astc_hdr_transcode_to_bc6h(const astc_helpers::log_astc_block& log_blk, bc6h_block& dst_blk) + { + assert(g_astc_hdr_core_initialized); + if (!g_astc_hdr_core_initialized) + { + assert(0); + return false; + } + + if (log_blk.m_solid_color_flag_ldr) + { + // Don't support LDR solid colors. + return false; + } + + if (log_blk.m_solid_color_flag_hdr) + { + // Solid color HDR block + return bc6h_enc_block_solid_color(&dst_blk, log_blk.m_solid_color); + } + + // Only support 4x4 grid sizes + if ((log_blk.m_grid_width != 4) || (log_blk.m_grid_height != 4)) + return false; + + // Don't support dual plane encoding + if (log_blk.m_dual_plane) + return false; + + if (log_blk.m_num_partitions == 1) + { + // Handle 1 partition (or subset) + + // UASTC HDR checks + if ((log_blk.m_weight_ise_range < 1) || (log_blk.m_weight_ise_range > 8)) + return false; + + int e[2][3]; + bool success; + + if (log_blk.m_color_endpoint_modes[0] == 7) + { + if (log_blk.m_endpoint_ise_range != 20) + return false; + + success = decode_mode7_to_qlog12(log_blk.m_endpoints, e, nullptr, log_blk.m_endpoint_ise_range); + } + else if (log_blk.m_color_endpoint_modes[0] == 11) + { + // UASTC HDR checks + if (log_blk.m_weight_ise_range <= 7) + { + if (log_blk.m_endpoint_ise_range != 20) + return false; + } + else if (log_blk.m_endpoint_ise_range != 19) + { + return false; + } + + success = decode_mode11_to_qlog12(log_blk.m_endpoints, e, log_blk.m_endpoint_ise_range); + } + else + { + return false; + } + + if (!success) + return false; + + // Transform endpoints to half float + half_float h_e[3][2] = + { + { qlog_to_half_slow(e[0][0], 12), qlog_to_half_slow(e[1][0], 12) }, + { qlog_to_half_slow(e[0][1], 12), qlog_to_half_slow(e[1][1], 12) }, + { qlog_to_half_slow(e[0][2], 12), qlog_to_half_slow(e[1][2], 12) } + }; + + // Sanity check for NaN/Inf + for (uint32_t i = 0; i < 2; i++) + if (is_half_inf_or_nan(h_e[0][i]) || is_half_inf_or_nan(h_e[1][i]) || is_half_inf_or_nan(h_e[2][i])) + return false; + + 
// Transcode to bc6h + if (!transcode_bc6h_1subset(h_e, log_blk, dst_blk)) + return false; + } + else if (log_blk.m_num_partitions == 2) + { + // Handle 2 partition (or subset) + int common_bc7_pat_index = g_astc_partition_id_to_common_bc7_pat_index[log_blk.m_partition_id]; + if (common_bc7_pat_index < 0) + return false; + + assert(common_bc7_pat_index < (int)basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2); + + if (!transcode_bc6h_2subsets(common_bc7_pat_index, log_blk, dst_blk)) + return false; + } + else + { + // Only supports 1 or 2 partitions (or subsets) + return false; + } + + return true; + } +#endif // BASISD_SUPPORT_UASTC_HDR + } // namespace basist diff --git a/thirdparty/basis_universal/transcoder/basisu_transcoder.h b/thirdparty/basis_universal/transcoder/basisu_transcoder.h index 3327e8ddb732..8324e996989c 100644 --- a/thirdparty/basis_universal/transcoder/basisu_transcoder.h +++ b/thirdparty/basis_universal/transcoder/basisu_transcoder.h @@ -1,5 +1,5 @@ // basisu_transcoder.h -// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. // Important: If compiling with gcc, be sure strict aliasing is disabled: -fno-strict-aliasing // // Licensed under the Apache License, Version 2.0 (the "License"); @@ -29,6 +29,7 @@ // Set BASISU_FORCE_DEVEL_MESSAGES to 1 to enable debug printf()'s whenever an error occurs, for easier debugging during development. 
#ifndef BASISU_FORCE_DEVEL_MESSAGES + // TODO - disable before checking in #define BASISU_FORCE_DEVEL_MESSAGES 0 #endif @@ -55,7 +56,7 @@ namespace basist cTFETC2_RGBA = 1, // Opaque+alpha, ETC2_EAC_A8 block followed by a ETC1 block, alpha channel will be opaque for opaque .basis files // BC1-5, BC7 (desktop, some mobile devices) - cTFBC1_RGB = 2, // Opaque only, no punchthrough alpha support yet, transcodes alpha slice if cDecodeFlagsTranscodeAlphaDataToOpaqueFormats flag is specified + cTFBC1_RGB = 2, // Opaque only, no punchthrough alpha support yet, transcodes alpha slice if cDecodeFlagsTranscodeAlphaDataToOpaqueFormats flag is specified cTFBC3_RGBA = 3, // Opaque+alpha, BC4 followed by a BC1 block, alpha channel will be opaque for opaque .basis files cTFBC4_R = 4, // Red only, alpha slice is transcoded to output if cDecodeFlagsTranscodeAlphaDataToOpaqueFormats flag is specified cTFBC5_RG = 5, // XY: Two BC4 blocks, X=R and Y=Alpha, .basis file should have alpha data (if not Y will be all 255's) @@ -63,10 +64,11 @@ namespace basist // PVRTC1 4bpp (mobile, PowerVR devices) cTFPVRTC1_4_RGB = 8, // Opaque only, RGB or alpha if cDecodeFlagsTranscodeAlphaDataToOpaqueFormats flag is specified, nearly lowest quality of any texture format. - cTFPVRTC1_4_RGBA = 9, // Opaque+alpha, most useful for simple opacity maps. If .basis file doesn't have alpha cTFPVRTC1_4_RGB will be used instead. Lowest quality of any supported texture format. + cTFPVRTC1_4_RGBA = 9, // Opaque+alpha, most useful for simple opacity maps. If .basis file doesn't have alpha cTFPVRTC1_4_RGB will be used instead. Lowest quality of any supported texture format. // ASTC (mobile, Intel devices, hopefully all desktop GPU's one day) - cTFASTC_4x4_RGBA = 10, // Opaque+alpha, ASTC 4x4, alpha channel will be opaque for opaque .basis files. Transcoder uses RGB/RGBA/L/LA modes, void extent, and up to two ([0,47] and [0,255]) endpoint precisions. + cTFASTC_4x4_RGBA = 10, // LDR. 
Opaque+alpha, ASTC 4x4, alpha channel will be opaque for opaque .basis files. + // LDR: Transcoder uses RGB/RGBA/L/LA modes, void extent, and up to two ([0,47] and [0,255]) endpoint precisions. // ATC (mobile, Adreno devices, this is a niche format) cTFATC_RGB = 11, // Opaque, RGB or alpha if cDecodeFlagsTranscodeAlphaDataToOpaqueFormats flag is specified. ATI ATC (GL_ATC_RGB_AMD) @@ -74,8 +76,8 @@ namespace basist // FXT1 (desktop, Intel devices, this is a super obscure format) cTFFXT1_RGB = 17, // Opaque only, uses exclusively CC_MIXED blocks. Notable for having a 8x4 block size. GL_3DFX_texture_compression_FXT1 is supported on Intel integrated GPU's (such as HD 630). - // Punch-through alpha is relatively easy to support, but full alpha is harder. This format is only here for completeness so opaque-only is fine for now. - // See the BASISU_USE_ORIGINAL_3DFX_FXT1_ENCODING macro in basisu_transcoder_internal.h. + // Punch-through alpha is relatively easy to support, but full alpha is harder. This format is only here for completeness so opaque-only is fine for now. + // See the BASISU_USE_ORIGINAL_3DFX_FXT1_ENCODING macro in basisu_transcoder_internal.h. cTFPVRTC2_4_RGB = 18, // Opaque-only, almost BC1 quality, much faster to transcode and supports arbitrary texture dimensions (unlike PVRTC1 RGB). cTFPVRTC2_4_RGBA = 19, // Opaque+alpha, slower to encode than cTFPVRTC2_4_RGB. Premultiplied alpha is highly recommended, otherwise the color channel can leak into the alpha channel on transparent blocks. 
@@ -83,13 +85,22 @@ namespace basist cTFETC2_EAC_R11 = 20, // R only (ETC2 EAC R11 unsigned) cTFETC2_EAC_RG11 = 21, // RG only (ETC2 EAC RG11 unsigned), R=opaque.r, G=alpha - for tangent space normal maps + cTFBC6H = 22, // HDR, RGB only, unsigned + cTFASTC_HDR_4x4_RGBA = 23, // HDR, RGBA (currently UASTC HDR is only RGB), unsigned + // Uncompressed (raw pixel) formats + // Note these uncompressed formats (RGBA32, 565, and 4444) can only be transcoded to from LDR input files (ETC1S or UASTC LDR). cTFRGBA32 = 13, // 32bpp RGBA image stored in raster (not block) order in memory, R is first byte, A is last byte. cTFRGB565 = 14, // 16bpp RGB image stored in raster (not block) order in memory, R at bit position 11 cTFBGR565 = 15, // 16bpp RGB image stored in raster (not block) order in memory, R at bit position 0 - cTFRGBA4444 = 16, // 16bpp RGBA image stored in raster (not block) order in memory, R at bit position 12, A at bit position 0 + cTFRGBA4444 = 16, // 16bpp RGBA image stored in raster (not block) order in memory, R at bit position 12, A at bit position 0 + + // Note these uncompressed formats (HALF and 9E5) can only be transcoded to from HDR input files (UASTC HDR). + cTFRGB_HALF = 24, // 48bpp RGB half (16-bits/component, 3 components) + cTFRGBA_HALF = 25, // 64bpp RGBA half (16-bits/component, 4 components) (A will always currently 1.0, UASTC_HDR doesn't support alpha) + cTFRGB_9E5 = 26, // 32bpp RGB 9E5 (shared exponent, positive only, see GL_EXT_texture_shared_exponent) - cTFTotalTextureFormats = 22, + cTFTotalTextureFormats = 27, // Old enums for compatibility with code compiled against previous versions cTFETC1 = cTFETC1_RGB, @@ -124,6 +135,9 @@ namespace basist // Returns true if the format supports an alpha channel. bool basis_transcoder_format_has_alpha(transcoder_texture_format fmt); + // Returns true if the format is HDR. 
+ bool basis_transcoder_format_is_hdr(transcoder_texture_format fmt); + // Returns the basisu::texture_format corresponding to the specified transcoder_texture_format. basisu::texture_format basis_get_basisu_texture_format(transcoder_texture_format fmt); @@ -142,7 +156,7 @@ namespace basist // Returns the block height for the specified texture format, which is currently always 4. uint32_t basis_get_block_height(transcoder_texture_format tex_type); - // Returns true if the specified format was enabled at compile time. + // Returns true if the specified format was enabled at compile time, and is supported for the specific basis/ktx2 texture format (ETC1S, UASTC, or UASTC HDR). bool basis_is_format_supported(transcoder_texture_format tex_type, basis_tex_format fmt = basis_tex_format::cETC1S); // Validates that the output buffer is large enough to hold the entire transcoded texture. @@ -317,6 +331,42 @@ namespace basist int channel0 = -1, int channel1 = -1); }; + class basisu_lowlevel_uastc_hdr_transcoder + { + friend class basisu_transcoder; + + public: + basisu_lowlevel_uastc_hdr_transcoder(); + + bool transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt, + uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, bool has_alpha, const uint32_t orig_width, const uint32_t orig_height, uint32_t output_row_pitch_in_blocks_or_pixels = 0, + basisu_transcoder_state* pState = nullptr, uint32_t output_rows_in_pixels = 0, int channel0 = -1, int channel1 = -1, uint32_t decode_flags = 0); + + bool transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt, + uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, const basis_file_header& header, const basis_slice_desc& slice_desc, uint32_t output_row_pitch_in_blocks_or_pixels = 0, + 
basisu_transcoder_state* pState = nullptr, uint32_t output_rows_in_pixels = 0, int channel0 = -1, int channel1 = -1, uint32_t decode_flags = 0) + { + return transcode_slice(pDst_blocks, num_blocks_x, num_blocks_y, pImage_data, image_data_size, fmt, + output_block_or_pixel_stride_in_bytes, bc1_allow_threecolor_blocks, (header.m_flags & cBASISHeaderFlagHasAlphaSlices) != 0, slice_desc.m_orig_width, slice_desc.m_orig_height, output_row_pitch_in_blocks_or_pixels, + pState, output_rows_in_pixels, channel0, channel1, decode_flags); + } + + // Container independent transcoding + bool transcode_image( + transcoder_texture_format target_format, + void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels, + const uint8_t* pCompressed_data, uint32_t compressed_data_length, + uint32_t num_blocks_x, uint32_t num_blocks_y, uint32_t orig_width, uint32_t orig_height, uint32_t level_index, + uint32_t slice_offset, uint32_t slice_length, + uint32_t decode_flags = 0, + bool has_alpha = false, + bool is_video = false, + uint32_t output_row_pitch_in_blocks_or_pixels = 0, + basisu_transcoder_state* pState = nullptr, + uint32_t output_rows_in_pixels = 0, + int channel0 = -1, int channel1 = -1); + }; + struct basisu_slice_info { uint32_t m_orig_width; @@ -530,6 +580,7 @@ namespace basist private: mutable basisu_lowlevel_etc1s_transcoder m_lowlevel_etc1s_decoder; mutable basisu_lowlevel_uastc_transcoder m_lowlevel_uastc_decoder; + mutable basisu_lowlevel_uastc_hdr_transcoder m_lowlevel_uastc_hdr_decoder; bool m_ready_to_transcode; @@ -612,10 +663,12 @@ namespace basist #pragma pack(pop) const uint32_t KTX2_VK_FORMAT_UNDEFINED = 0; + const uint32_t KTX2_FORMAT_UASTC_4x4_SFLOAT_BLOCK = 1000066000; // TODO, is this correct? 
const uint32_t KTX2_KDF_DF_MODEL_UASTC = 166; + const uint32_t KTX2_KDF_DF_MODEL_UASTC_HDR = 167; const uint32_t KTX2_KDF_DF_MODEL_ETC1S = 163; const uint32_t KTX2_IMAGE_IS_P_FRAME = 2; - const uint32_t KTX2_UASTC_BLOCK_SIZE = 16; + const uint32_t KTX2_UASTC_BLOCK_SIZE = 16; // also the block size for UASTC_HDR const uint32_t KTX2_MAX_SUPPORTED_LEVEL_COUNT = 16; // this is an implementation specific constraint and can be increased // The KTX2 transfer functions supported by KTX2 @@ -800,13 +853,15 @@ namespace basist // Returns 0 or the number of layers in the texture array or texture video. Valid after init(). uint32_t get_layers() const { return m_header.m_layer_count; } - // Returns cETC1S or cUASTC4x4. Valid after init(). + // Returns cETC1S, cUASTC4x4, or cUASTC_HDR_4x4. Valid after init(). basist::basis_tex_format get_format() const { return m_format; } - + bool is_etc1s() const { return get_format() == basist::basis_tex_format::cETC1S; } bool is_uastc() const { return get_format() == basist::basis_tex_format::cUASTC4x4; } + bool is_hdr() const { return get_format() == basist::basis_tex_format::cUASTC_HDR_4x4; } + // Returns true if the ETC1S file has two planes (typically RGBA, or RRRG), or true if the UASTC file has alpha data. Valid after init(). 
uint32_t get_has_alpha() const { return m_has_alpha; } @@ -913,6 +968,7 @@ namespace basist basist::basisu_lowlevel_etc1s_transcoder m_etc1s_transcoder; basist::basisu_lowlevel_uastc_transcoder m_uastc_transcoder; + basist::basisu_lowlevel_uastc_hdr_transcoder m_uastc_hdr_transcoder; ktx2_transcoder_state m_def_transcoder_state; diff --git a/thirdparty/basis_universal/transcoder/basisu_transcoder_internal.h b/thirdparty/basis_universal/transcoder/basisu_transcoder_internal.h index 0505df6ea67e..17c9dc7c8c9d 100644 --- a/thirdparty/basis_universal/transcoder/basisu_transcoder_internal.h +++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_internal.h @@ -1,5 +1,5 @@ // basisu_transcoder_internal.h - Universal texture format transcoder library. -// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved. +// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved. // // Important: If compiling with gcc, be sure strict aliasing is disabled: -fno-strict-aliasing // @@ -20,8 +20,9 @@ #pragma warning (disable: 4127) // conditional expression is constant #endif -#define BASISD_LIB_VERSION 116 -#define BASISD_VERSION_STRING "01.16" +// v1.50: Added UASTC HDR support +#define BASISD_LIB_VERSION 150 +#define BASISD_VERSION_STRING "01.50" #ifdef _DEBUG #define BASISD_BUILD_DEBUG @@ -82,9 +83,15 @@ namespace basist cRGBA4444_ALPHA, cRGBA4444_COLOR_OPAQUE, cRGBA4444, - - cUASTC_4x4, - + cRGBA_HALF, + cRGB_HALF, + cRGB_9E5, + + cUASTC_4x4, // LDR, universal + cUASTC_HDR_4x4, // HDR, transcodes only to 4x4 HDR ASTC, BC6H, or uncompressed + cBC6H, + cASTC_HDR_4x4, + cTotalBlockFormats }; @@ -264,8 +271,8 @@ namespace basist } const basisu::uint8_vec &get_code_sizes() const { return m_code_sizes; } - const basisu::int_vec get_lookup() const { return m_lookup; } - const basisu::int16_vec get_tree() const { return m_tree; } + const basisu::int_vec &get_lookup() const { return m_lookup; } + const basisu::int16_vec &get_tree() const { return m_tree; } bool is_valid() const 
{ return m_code_sizes.size() > 0; } @@ -789,7 +796,198 @@ namespace basist }; bool basis_block_format_is_uncompressed(block_format tex_type); - + + //------------------------------------ + + typedef uint16_t half_float; + + const double MIN_DENORM_HALF_FLOAT = 0.000000059604645; // smallest positive subnormal number + const double MIN_HALF_FLOAT = 0.00006103515625; // smallest positive normal number + const double MAX_HALF_FLOAT = 65504.0; // largest normal number + + inline uint32_t get_bits(uint32_t val, int low, int high) + { + const int num_bits = (high - low) + 1; + assert((num_bits >= 1) && (num_bits <= 32)); + + val >>= low; + if (num_bits != 32) + val &= ((1u << num_bits) - 1); + + return val; + } + + inline bool is_half_inf_or_nan(half_float v) + { + return get_bits(v, 10, 14) == 31; + } + + inline bool is_half_denorm(half_float v) + { + int e = (v >> 10) & 31; + return !e; + } + + inline int get_half_exp(half_float v) + { + int e = ((v >> 10) & 31); + return e ? (e - 15) : -14; + } + + inline int get_half_mantissa(half_float v) + { + if (is_half_denorm(v)) + return v & 0x3FF; + return (v & 0x3FF) | 0x400; + } + + inline float get_half_mantissaf(half_float v) + { + return ((float)get_half_mantissa(v)) / 1024.0f; + } + + inline int get_half_sign(half_float v) + { + return v ? ((v & 0x8000) ? 
-1 : 1) : 0; + } + + inline bool half_is_signed(half_float v) + { + return (v & 0x8000) != 0; + } + +#if 0 + int hexp = get_half_exp(Cf); + float hman = get_half_mantissaf(Cf); + int hsign = get_half_sign(Cf); + float k = powf(2.0f, hexp) * hman * hsign; + if (is_half_inf_or_nan(Cf)) + k = std::numeric_limits::quiet_NaN(); +#endif + + half_float float_to_half(float val); + + inline float half_to_float(half_float hval) + { + union { float f; uint32_t u; } x = { 0 }; + + uint32_t s = ((uint32_t)hval >> 15) & 1; + uint32_t e = ((uint32_t)hval >> 10) & 0x1F; + uint32_t m = (uint32_t)hval & 0x3FF; + + if (!e) + { + if (!m) + { + // +- 0 + x.u = s << 31; + return x.f; + } + else + { + // denormalized + while (!(m & 0x00000400)) + { + m <<= 1; + --e; + } + + ++e; + m &= ~0x00000400; + } + } + else if (e == 31) + { + if (m == 0) + { + // +/- INF + x.u = (s << 31) | 0x7f800000; + return x.f; + } + else + { + // +/- NaN + x.u = (s << 31) | 0x7f800000 | (m << 13); + return x.f; + } + } + + e = e + (127 - 15); + m = m << 13; + + assert(s <= 1); + assert(m <= 0x7FFFFF); + assert(e <= 255); + + x.u = m | (e << 23) | (s << 31); + return x.f; + } + + // Originally from bc6h_enc.h + + void bc6h_enc_init(); + + const uint32_t MAX_BLOG16_VAL = 0xFFFF; + + // BC6H internals + const uint32_t NUM_BC6H_MODES = 14; + const uint32_t BC6H_LAST_MODE_INDEX = 13; + const uint32_t BC6H_FIRST_1SUBSET_MODE_INDEX = 10; // in the MS docs, this is "mode 11" (where the first mode is 1), 60 bits for endpoints (10.10, 10.10, 10.10), 63 bits for weights + const uint32_t TOTAL_BC6H_PARTITION_PATTERNS = 32; + + extern const uint8_t g_bc6h_mode_sig_bits[NUM_BC6H_MODES][4]; // base, r, g, b + + struct bc6h_bit_layout + { + int8_t m_comp; // R=0,G=1,B=2,D=3 (D=partition index) + int8_t m_index; // 0-3, 0-1 Low/High subset 1, 2-3 Low/High subset 2, -1=partition index (d) + int8_t m_last_bit; + int8_t m_first_bit; // may be -1 if a single bit, may be >m_last_bit if reversed + }; + + const uint32_t 
MAX_BC6H_LAYOUT_INDEX = 25; + extern const bc6h_bit_layout g_bc6h_bit_layouts[NUM_BC6H_MODES][MAX_BC6H_LAYOUT_INDEX]; + + extern const uint8_t g_bc6h_2subset_patterns[TOTAL_BC6H_PARTITION_PATTERNS][4][4]; // [y][x] + + extern const uint8_t g_bc6h_weight3[8]; + extern const uint8_t g_bc6h_weight4[16]; + + extern const int8_t g_bc6h_mode_lookup[32]; + + // Converts b16 to half float + inline half_float bc6h_blog16_to_half(uint32_t comp) + { + assert(comp <= 0xFFFF); + + // scale the magnitude by 31/64 + comp = (comp * 31u) >> 6u; + return (half_float)comp; + } + + const uint32_t MAX_BC6H_HALF_FLOAT_AS_UINT = 0x7BFF; + + // Inverts bc6h_blog16_to_half(). + // Returns the nearest blog16 given a half value. + inline uint32_t bc6h_half_to_blog16(half_float h) + { + assert(h <= MAX_BC6H_HALF_FLOAT_AS_UINT); + return (h * 64 + 30) / 31; + } + + struct bc6h_block + { + uint8_t m_bytes[16]; + }; + + void bc6h_enc_block_mode10(bc6h_block* pPacked_block, const half_float pEndpoints[3][2], const uint8_t* pWeights); + void bc6h_enc_block_1subset_4bit_weights(bc6h_block* pPacked_block, const half_float pEndpoints[3][2], const uint8_t* pWeights); + void bc6h_enc_block_1subset_mode9_3bit_weights(bc6h_block* pPacked_block, const half_float pEndpoints[3][2], const uint8_t* pWeights); + void bc6h_enc_block_1subset_3bit_weights(bc6h_block* pPacked_block, const half_float pEndpoints[3][2], const uint8_t* pWeights); + void bc6h_enc_block_2subset_mode9_3bit_weights(bc6h_block* pPacked_block, uint32_t common_part_index, const half_float pEndpoints[2][3][2], const uint8_t* pWeights); // pEndpoints[subset][comp][lh_index] + void bc6h_enc_block_2subset_3bit_weights(bc6h_block* pPacked_block, uint32_t common_part_index, const half_float pEndpoints[2][3][2], const uint8_t* pWeights); // pEndpoints[subset][comp][lh_index] + bool bc6h_enc_block_solid_color(bc6h_block* pPacked_block, const half_float pColor[3]); + } // namespace basist diff --git 
a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_dxt1_5.inc b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_dxt1_5.inc index 8244550959dd..205758b3d7ff 100644 --- a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_dxt1_5.inc +++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_dxt1_5.inc @@ -1,4 +1,4 @@ -// Copyright (C) 2017-2019 Binomial LLC. All Rights Reserved. +// Copyright (C) 2017-2024 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_dxt1_6.inc b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_dxt1_6.inc index fad45fe22d0a..f2d324fcc333 100644 --- a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_dxt1_6.inc +++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_dxt1_6.inc @@ -1,4 +1,4 @@ -// Copyright (C) 2017-2019 Binomial LLC. All Rights Reserved. +// Copyright (C) 2017-2024 Binomial LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/thirdparty/basis_universal/transcoder/basisu_transcoder_uastc.h b/thirdparty/basis_universal/transcoder/basisu_transcoder_uastc.h index f91314f4ff43..457bd51e3011 100644 --- a/thirdparty/basis_universal/transcoder/basisu_transcoder_uastc.h +++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_uastc.h @@ -13,6 +13,7 @@ namespace basist const uint32_t UASTC_MODE_INDEX_SOLID_COLOR = 8; const uint32_t TOTAL_ASTC_BC7_COMMON_PARTITIONS2 = 30; + const uint32_t TOTAL_ASTC_BC6H_COMMON_PARTITIONS2 = 27; // BC6H only supports only 5-bit pattern indices, BC7 supports 4-bit or 6-bit const uint32_t TOTAL_ASTC_BC7_COMMON_PARTITIONS3 = 11; const uint32_t TOTAL_BC7_3_ASTC2_COMMON_PARTITIONS = 19;