From b3e2c545938e53d15091f9eb4ea5cbe2277500b2 Mon Sep 17 00:00:00 2001
From: BlueCube3310 <53150244+BlueCube3310@users.noreply.github.com>
Date: Sat, 28 Sep 2024 12:05:45 +0200
Subject: [PATCH] BasisU: Update to 1.50.0 and add HDR support

---
 editor/import/resource_importer_texture.cpp   |    5 -
 modules/basis_universal/SCsub                 |    4 +
 .../basis_universal/image_compress_basisu.cpp |  206 +-
 .../basis_universal/image_compress_basisu.h   |    9 +
 thirdparty/README.md                          |    2 +-
 .../encoder/3rdparty/android_astc_decomp.cpp  | 2052 ++++++++++
 .../encoder/3rdparty/android_astc_decomp.h    |   45 +
 .../encoder/basisu_astc_hdr_enc.cpp           | 3310 +++++++++++++++
 .../encoder/basisu_astc_hdr_enc.h             |  224 +
 .../encoder/basisu_backend.cpp                |    2 +-
 .../basis_universal/encoder/basisu_backend.h  |    2 +-
 .../encoder/basisu_basis_file.cpp             |    2 +-
 .../encoder/basisu_basis_file.h               |    2 +-
 .../basis_universal/encoder/basisu_bc7enc.cpp |    5 +-
 .../basis_universal/encoder/basisu_bc7enc.h   |    2 +-
 .../basis_universal/encoder/basisu_comp.cpp   | 2128 ++++++++--
 .../basis_universal/encoder/basisu_comp.h     |  104 +-
 .../basis_universal/encoder/basisu_enc.cpp    | 1908 ++++++++-
 .../basis_universal/encoder/basisu_enc.h      |  653 ++-
 .../basis_universal/encoder/basisu_etc.cpp    |    2 +-
 .../basis_universal/encoder/basisu_etc.h      |    2 +-
 .../encoder/basisu_frontend.cpp               |    3 +-
 .../basis_universal/encoder/basisu_frontend.h |    2 +-
 .../encoder/basisu_gpu_texture.cpp            |  561 ++-
 .../encoder/basisu_gpu_texture.h              |   50 +-
 .../encoder/basisu_kernels_declares.h         |    2 +-
 .../encoder/basisu_kernels_imp.h              |    2 +-
 .../encoder/basisu_kernels_sse.cpp            |   18 +-
 .../basis_universal/encoder/basisu_miniz.h    |   10 +-
 .../basis_universal/encoder/basisu_opencl.cpp |    2 +-
 .../basis_universal/encoder/basisu_opencl.h   |    2 +-
 .../encoder/basisu_pvrtc1_4.cpp               |    2 +-
 .../basis_universal/encoder/basisu_pvrtc1_4.h |   13 +-
 .../encoder/basisu_resample_filters.cpp       |    2 +-
 .../encoder/basisu_resampler.cpp              |    2 +-
 .../encoder/basisu_resampler.h                |    2 +-
 .../encoder/basisu_resampler_filters.h        |    2 +-
 .../basis_universal/encoder/basisu_ssim.cpp   |    2 +-
 .../basis_universal/encoder/basisu_ssim.h     |    2 +-
 .../encoder/basisu_uastc_enc.cpp              |   21 +-
 .../encoder/basisu_uastc_enc.h                |    2 +-
 .../basis_universal/encoder/cppspmd_flow.h    |    2 +-
 .../basis_universal/encoder/cppspmd_math.h    |    4 +-
 .../encoder/cppspmd_math_declares.h           |    2 +-
 .../basis_universal/encoder/cppspmd_sse.h     |   28 +-
 .../encoder/cppspmd_type_aliases.h            |    2 +-
 .../basis_universal/encoder/pvpngreader.cpp   |   18 +-
 .../patches/external-jpgd.patch               |    0
 .../patches/external-tinyexr.patch            |   23 +
 .../patches/remove-tinydds-qoi.patch          |  446 ++
 .../basis_universal/transcoder/basisu.h       |  105 +-
 .../transcoder/basisu_astc_hdr_core.h         |  102 +
 .../transcoder/basisu_astc_helpers.h          | 3587 +++++++++++++++++
 .../transcoder/basisu_containers.h            |   62 +-
 .../transcoder/basisu_containers_impl.h       |   47 +-
 .../transcoder/basisu_file_headers.h          |    5 +-
 .../transcoder/basisu_transcoder.cpp          | 2057 +++++++++-
 .../transcoder/basisu_transcoder.h            |   80 +-
 .../transcoder/basisu_transcoder_internal.h   |  216 +-
 .../basisu_transcoder_tables_dxt1_5.inc       |    2 +-
 .../basisu_transcoder_tables_dxt1_6.inc       |    2 +-
 .../transcoder/basisu_transcoder_uastc.h      |    1 +
 62 files changed, 17244 insertions(+), 918 deletions(-)
 create mode 100644 thirdparty/basis_universal/encoder/3rdparty/android_astc_decomp.cpp
 create mode 100644 thirdparty/basis_universal/encoder/3rdparty/android_astc_decomp.h
 create mode 100644 thirdparty/basis_universal/encoder/basisu_astc_hdr_enc.cpp
 create mode 100644 thirdparty/basis_universal/encoder/basisu_astc_hdr_enc.h
 rename {modules => thirdparty}/basis_universal/patches/external-jpgd.patch (100%)
 create mode 100644 thirdparty/basis_universal/patches/external-tinyexr.patch
 create mode 100644 thirdparty/basis_universal/patches/remove-tinydds-qoi.patch
 create mode 100644 thirdparty/basis_universal/transcoder/basisu_astc_hdr_core.h
 create mode 100644 thirdparty/basis_universal/transcoder/basisu_astc_helpers.h

diff --git a/editor/import/resource_importer_texture.cpp b/editor/import/resource_importer_texture.cpp
index a205123df1ba..ff2778a64041 100644
--- a/editor/import/resource_importer_texture.cpp
+++ b/editor/import/resource_importer_texture.cpp
@@ -593,11 +593,6 @@ Error ResourceImporterTexture::import(const String &p_source_file, const String
 		}
 	}
 
-	if (compress_mode == COMPRESS_BASIS_UNIVERSAL && image->get_format() >= Image::FORMAT_RF) {
-		// Basis universal does not support float formats, fallback.
-		compress_mode = COMPRESS_VRAM_COMPRESSED;
-	}
-
 	bool detect_3d = int(p_options["detect_3d/compress_to"]) > 0;
 	bool detect_roughness = roughness == 0;
 	bool detect_normal = normal == 0;
diff --git a/modules/basis_universal/SCsub b/modules/basis_universal/SCsub
index 0142317e1ef4..986c23b7d81e 100644
--- a/modules/basis_universal/SCsub
+++ b/modules/basis_universal/SCsub
@@ -14,6 +14,8 @@ thirdparty_obj = []
 thirdparty_dir = "#thirdparty/basis_universal/"
 # Sync list with upstream CMakeLists.txt
 encoder_sources = [
+    "3rdparty/android_astc_decomp.cpp",
+    "basisu_astc_hdr_enc.cpp",
     "basisu_backend.cpp",
     "basisu_basis_file.cpp",
     "basisu_bc7enc.cpp",
@@ -45,6 +47,8 @@ else:
 if env["builtin_zstd"]:
     env_basisu.Prepend(CPPPATH=["#thirdparty/zstd"])
 
+env_basisu.Prepend(CPPPATH=["#thirdparty/tinyexr"])
+
 if env.dev_build:
     env_basisu.Append(CPPDEFINES=[("BASISU_DEVEL_MESSAGES", 1), ("BASISD_ENABLE_DEBUG_FLAGS", 1)])
 
diff --git a/modules/basis_universal/image_compress_basisu.cpp b/modules/basis_universal/image_compress_basisu.cpp
index ab20d00b5b17..d48ea363a7b5 100644
--- a/modules/basis_universal/image_compress_basisu.cpp
+++ b/modules/basis_universal/image_compress_basisu.cpp
@@ -30,6 +30,8 @@
 
 #include "image_compress_basisu.h"
 
+#include "core/os/os.h"
+#include "core/string/print_string.h"
 #include "servers/rendering_server.h"
 
 #include <transcoder/basisu_transcoder.h>
@@ -46,9 +48,48 @@ void basis_universal_init() {
 }
 
 #ifdef TOOLS_ENABLED
+template <typename T>
+inline void _basisu_pad_mipmap(const uint8_t *p_image_mip_data, Vector<uint8_t> &r_mip_data_padded, int p_next_width, int p_next_height, int p_width, int p_height, int64_t p_size) {
+	// Source mip's data interpreted as an array of T-sized pixels (one T per pixel) to help with copying pixel data.
+	const T *mip_src_data = reinterpret_cast<const T *>(p_image_mip_data);
+
+	// Resize the padded byte buffer to hold p_next_width * p_next_height pixels of type T.
+	r_mip_data_padded.resize(p_next_width * p_next_height * sizeof(T));
+	T *data_padded_ptr = reinterpret_cast<T *>(r_mip_data_padded.ptrw());
+
+	// Pad mipmap to the nearest block by smearing.
+	int x = 0, y = 0;
+	for (y = 0; y < p_height; y++) {
+		for (x = 0; x < p_width; x++) {
+			data_padded_ptr[p_next_width * y + x] = mip_src_data[p_width * y + x];
+		}
+
+		// First, smear in x.
+		for (; x < p_next_width; x++) {
+			data_padded_ptr[p_next_width * y + x] = data_padded_ptr[p_next_width * y + x - 1];
+		}
+	}
+
+	// Then, smear in y.
+	for (; y < p_next_height; y++) {
+		for (x = 0; x < p_next_width; x++) {
+			data_padded_ptr[p_next_width * y + x] = data_padded_ptr[p_next_width * y + x - p_next_width];
+		}
+	}
+}
+
 Vector<uint8_t> basis_universal_packer(const Ref<Image> &p_image, Image::UsedChannels p_channels) {
+	uint64_t start_time = OS::get_singleton()->get_ticks_msec();
+
 	Ref<Image> image = p_image->duplicate();
-	image->convert(Image::FORMAT_RGBA8);
+	bool is_hdr = false;
+
+	if (image->get_format() <= Image::FORMAT_RGB565) {
+		image->convert(Image::FORMAT_RGBA8);
+	} else if (image->get_format() <= Image::FORMAT_RGBE9995) {
+		image->convert(Image::FORMAT_RGBAF);
+		is_hdr = true;
+	}
 
 	basisu::basis_compressor_params params;
 
@@ -74,32 +115,42 @@ Vector<uint8_t> basis_universal_packer(const Ref<Image> &p_image, Image::UsedCha
 	basisu::job_pool job_pool(OS::get_singleton()->get_processor_count());
 	params.m_pJob_pool = &job_pool;
 
-	BasisDecompressFormat decompress_format = BASIS_DECOMPRESS_RG;
-	switch (p_channels) {
-		case Image::USED_CHANNELS_L: {
-			decompress_format = BASIS_DECOMPRESS_RGB;
-		} break;
-		case Image::USED_CHANNELS_LA: {
-			params.m_force_alpha = true;
-			decompress_format = BASIS_DECOMPRESS_RGBA;
-		} break;
-		case Image::USED_CHANNELS_R: {
-			decompress_format = BASIS_DECOMPRESS_R;
-		} break;
-		case Image::USED_CHANNELS_RG: {
-			params.m_force_alpha = true;
-			image->convert_rg_to_ra_rgba8();
-			decompress_format = BASIS_DECOMPRESS_RG;
-		} break;
-		case Image::USED_CHANNELS_RGB: {
-			decompress_format = BASIS_DECOMPRESS_RGB;
-		} break;
-		case Image::USED_CHANNELS_RGBA: {
-			params.m_force_alpha = true;
-			decompress_format = BASIS_DECOMPRESS_RGBA;
-		} break;
+	BasisDecompressFormat decompress_format = BASIS_DECOMPRESS_MAX;
+
+	if (is_hdr) {
+		decompress_format = BASIS_DECOMPRESS_HDR_RGB;
+		params.m_hdr = true;
+		params.m_uastc_hdr_options.set_quality_level(0);
+
+	} else {
+		switch (p_channels) {
+			case Image::USED_CHANNELS_L: {
+				decompress_format = BASIS_DECOMPRESS_RGB;
+			} break;
+			case Image::USED_CHANNELS_LA: {
+				params.m_force_alpha = true;
+				decompress_format = BASIS_DECOMPRESS_RGBA;
+			} break;
+			case Image::USED_CHANNELS_R: {
+				decompress_format = BASIS_DECOMPRESS_R;
+			} break;
+			case Image::USED_CHANNELS_RG: {
+				params.m_force_alpha = true;
+				image->convert_rg_to_ra_rgba8();
+				decompress_format = BASIS_DECOMPRESS_RG;
+			} break;
+			case Image::USED_CHANNELS_RGB: {
+				decompress_format = BASIS_DECOMPRESS_RGB;
+			} break;
+			case Image::USED_CHANNELS_RGBA: {
+				params.m_force_alpha = true;
+				decompress_format = BASIS_DECOMPRESS_RGBA;
+			} break;
+		}
 	}
 
+	ERR_FAIL_COND_V(decompress_format == BASIS_DECOMPRESS_MAX, Vector<uint8_t>());
+
 	// Copy the source image data with mipmaps into BasisU.
 	{
 		const int orig_width = image->get_width();
@@ -113,9 +164,10 @@ Vector<uint8_t> basis_universal_packer(const Ref<Image> &p_image, Image::UsedCha
 
 		Vector<uint8_t> image_data = image->get_data();
 		basisu::vector<basisu::image> basisu_mipmaps;
+		basisu::vector<basisu::imagef> basisu_mipmaps_hdr;
 
 		// Buffer for storing padded mipmap data.
-		Vector<uint32_t> mip_data_padded;
+		Vector<uint8_t> mip_data_padded;
 
 		for (int32_t i = 0; i <= image->get_mipmap_count(); i++) {
 			int64_t ofs, size;
@@ -126,31 +178,10 @@ Vector<uint8_t> basis_universal_packer(const Ref<Image> &p_image, Image::UsedCha
 
 			// Pad the mipmap's data if its resolution isn't divisible by 4.
 			if (image->has_mipmaps() && !is_res_div_4 && (width > 2 && height > 2) && (width != next_width || height != next_height)) {
-				// Source mip's data interpreted as 32-bit RGBA blocks to help with copying pixel data.
-				const uint32_t *mip_src_data = reinterpret_cast<const uint32_t *>(image_mip_data);
-
-				// Reserve space in the padded buffer.
-				mip_data_padded.resize(next_width * next_height);
-				uint32_t *data_padded_ptr = mip_data_padded.ptrw();
-
-				// Pad mipmap to the nearest block by smearing.
-				int x = 0, y = 0;
-				for (y = 0; y < height; y++) {
-					for (x = 0; x < width; x++) {
-						data_padded_ptr[next_width * y + x] = mip_src_data[width * y + x];
-					}
-
-					// First, smear in x.
-					for (; x < next_width; x++) {
-						data_padded_ptr[next_width * y + x] = data_padded_ptr[next_width * y + x - 1];
-					}
-				}
-
-				// Then, smear in y.
-				for (; y < next_height; y++) {
-					for (x = 0; x < next_width; x++) {
-						data_padded_ptr[next_width * y + x] = data_padded_ptr[next_width * y + x - next_width];
-					}
+				if (is_hdr) {
+					_basisu_pad_mipmap<BasisRGBAF>(image_mip_data, mip_data_padded, next_width, next_height, width, height, size);
+				} else {
+					_basisu_pad_mipmap<uint32_t>(image_mip_data, mip_data_padded, next_width, next_height, width, height, size);
 				}
 
 				// Override the image_mip_data pointer with our temporary Vector.
@@ -159,7 +190,7 @@ Vector<uint8_t> basis_universal_packer(const Ref<Image> &p_image, Image::UsedCha
 				// Override the mipmap's properties.
 				width = next_width;
 				height = next_height;
-				size = mip_data_padded.size() * 4;
+				size = mip_data_padded.size();
 			}
 
 			// Get the next mipmap's resolution.
@@ -167,44 +198,61 @@ Vector<uint8_t> basis_universal_packer(const Ref<Image> &p_image, Image::UsedCha
 			next_height /= 2;
 
 			// Copy the source mipmap's data to a BasisU image.
-			basisu::image basisu_image(width, height);
-			memcpy(basisu_image.get_ptr(), image_mip_data, size);
+			if (is_hdr) {
+				basisu::imagef basisu_image(width, height);
+				memcpy(reinterpret_cast<uint8_t *>(basisu_image.get_ptr()), image_mip_data, size);
+
+				if (i == 0) {
+					params.m_source_images_hdr.push_back(basisu_image);
+				} else {
+					basisu_mipmaps_hdr.push_back(basisu_image);
+				}
 
-			if (i == 0) {
-				params.m_source_images.push_back(basisu_image);
 			} else {
-				basisu_mipmaps.push_back(basisu_image);
+				basisu::image basisu_image(width, height);
+				memcpy(basisu_image.get_ptr(), image_mip_data, size);
+
+				if (i == 0) {
+					params.m_source_images.push_back(basisu_image);
+				} else {
+					basisu_mipmaps.push_back(basisu_image);
+				}
 			}
 		}
 
-		params.m_source_mipmap_images.push_back(basisu_mipmaps);
+		if (is_hdr) {
+			params.m_source_mipmap_images_hdr.push_back(basisu_mipmaps_hdr);
+		} else {
+			params.m_source_mipmap_images.push_back(basisu_mipmaps);
+		}
 	}
 
 	// Encode the image data.
-	Vector<uint8_t> basisu_data;
-
 	basisu::basis_compressor compressor;
 	compressor.init(params);
 
 	int basisu_err = compressor.process();
-	ERR_FAIL_COND_V(basisu_err != basisu::basis_compressor::cECSuccess, basisu_data);
+	ERR_FAIL_COND_V(basisu_err != basisu::basis_compressor::cECSuccess, Vector<uint8_t>());
 
-	const basisu::uint8_vec &basisu_out = compressor.get_output_basis_file();
-	basisu_data.resize(basisu_out.size() + 4);
+	const basisu::uint8_vec &basisu_encoded = compressor.get_output_basis_file();
 
-	// Copy the encoded data to the buffer.
-	{
-		uint8_t *wb = basisu_data.ptrw();
-		*(uint32_t *)wb = decompress_format;
+	Vector<uint8_t> basisu_data;
+	basisu_data.resize(basisu_encoded.size() + 4);
+	uint8_t *basisu_data_ptr = basisu_data.ptrw();
 
-		memcpy(wb + 4, basisu_out.get_ptr(), basisu_out.size());
-	}
+	// Write the 4-byte decompress format tag, then copy the encoded BasisU data right after it.
+	*(uint32_t *)basisu_data_ptr = decompress_format;
+	memcpy(basisu_data_ptr + 4, basisu_encoded.get_ptr(), basisu_encoded.size());
+
+	print_verbose(vformat("BasisU: Encoding a %dx%d image with %d mipmaps took %d ms.", p_image->get_width(), p_image->get_height(), p_image->get_mipmap_count(), OS::get_singleton()->get_ticks_msec() - start_time));
 
 	return basisu_data;
 }
 #endif // TOOLS_ENABLED
 
 Ref<Image> basis_universal_unpacker_ptr(const uint8_t *p_data, int p_size) {
+	uint64_t start_time = OS::get_singleton()->get_ticks_msec();
+
 	Ref<Image> image;
 	ERR_FAIL_NULL_V_MSG(p_data, image, "Cannot unpack invalid BasisUniversal data.");
 
@@ -320,6 +368,23 @@ Ref<Image> basis_universal_unpacker_ptr(const uint8_t *p_data, int p_size) {
 			}
 
 		} break;
+		case BASIS_DECOMPRESS_HDR_RGB: {
+			if (bptc_supported) {
+				basisu_format = basist::transcoder_texture_format::cTFBC6H;
+				image_format = Image::FORMAT_BPTC_RGBFU;
+			} else if (astc_supported) {
+				basisu_format = basist::transcoder_texture_format::cTFASTC_HDR_4x4_RGBA;
+				image_format = Image::FORMAT_ASTC_4x4_HDR;
+			} else {
+				// No supported VRAM compression formats, decompress.
+				basisu_format = basist::transcoder_texture_format::cTFRGB_9E5;
+				image_format = Image::FORMAT_RGBE9995;
+			}
+
+		} break;
+		default: {
+			ERR_FAIL_V(image);
+		} break;
 	}
 
 	src_ptr += 4;
@@ -371,6 +436,9 @@ Ref<Image> basis_universal_unpacker_ptr(const uint8_t *p_data, int p_size) {
 		}
 	}
 
+	print_verbose(vformat("BasisU: Transcoding a %dx%d image with %d mipmaps into %s took %d ms.",
+			image->get_width(), image->get_height(), image->get_mipmap_count(), Image::get_format_name(image_format), OS::get_singleton()->get_ticks_msec() - start_time));
+
 	return image;
 }
 
diff --git a/modules/basis_universal/image_compress_basisu.h b/modules/basis_universal/image_compress_basisu.h
index 5e36d448f670..81c8511f603c 100644
--- a/modules/basis_universal/image_compress_basisu.h
+++ b/modules/basis_universal/image_compress_basisu.h
@@ -39,11 +39,20 @@ enum BasisDecompressFormat {
 	BASIS_DECOMPRESS_RGBA,
 	BASIS_DECOMPRESS_RG_AS_RA,
 	BASIS_DECOMPRESS_R,
+	BASIS_DECOMPRESS_HDR_RGB,
+	BASIS_DECOMPRESS_MAX
 };
 
 void basis_universal_init();
 
 #ifdef TOOLS_ENABLED
+struct BasisRGBAF {
+	uint32_t r;
+	uint32_t g;
+	uint32_t b;
+	uint32_t a;
+};
+
 Vector<uint8_t> basis_universal_packer(const Ref<Image> &p_image, Image::UsedChannels p_channels);
 #endif
 
diff --git a/thirdparty/README.md b/thirdparty/README.md
index 58226261f4b5..c9a0cefb051d 100644
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@@ -59,7 +59,7 @@ Files extracted from upstream source:
 ## basis_universal
 
 - Upstream: https://github.com/BinomialLLC/basis_universal
-- Version: 1.16.4 (900e40fb5d2502927360fe2f31762bdbb624455f, 2023)
+- Version: 1.50.0 (051ad6d8a64bb95a79e8601c317055fd1782ad3e, 2024)
 - License: Apache 2.0
 
 Files extracted from upstream source:
diff --git a/thirdparty/basis_universal/encoder/3rdparty/android_astc_decomp.cpp b/thirdparty/basis_universal/encoder/3rdparty/android_astc_decomp.cpp
new file mode 100644
index 000000000000..5abfe2faf922
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/3rdparty/android_astc_decomp.cpp
@@ -0,0 +1,2052 @@
+// File: android_astc_decomp.cpp
+
+/*-------------------------------------------------------------------------
+ * drawElements Quality Program Tester Core
+ * ----------------------------------------
+ *
+ * Copyright 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * rg: Removed external dependencies, minor fix to decompress() so it converts non-sRGB
+ * output to 8-bits correctly. I've compared this decoder's output
+ * vs. astc-codec with random inputs.
+ * 
+ *//*!
+ * \file
+ * \brief ASTC Utilities.
+ *//*--------------------------------------------------------------------*/
+#include "android_astc_decomp.h"
+#include <assert.h>
+#include <algorithm>
+#include <fenv.h>
+#include <math.h>
+
+#define DE_LENGTH_OF_ARRAY(x) (sizeof(x)/sizeof(x[0]))
+#define DE_UNREF(x) (void)x
+
+typedef uint8_t deUint8;
+typedef int8_t deInt8;
+typedef uint32_t deUint32;
+typedef int32_t deInt32;
+typedef uint16_t deUint16;
+typedef int16_t deInt16;
+typedef int64_t deInt64;
+typedef uint64_t deUint64;
+
+#define DE_ASSERT assert
+
+#ifdef _MSC_VER
+#pragma warning (disable:4505) // unreferenced local function has been removed
+#elif defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-function"
+#endif
+
+namespace basisu_astc
+{
+    template <typename S> inline S maximum(S a, S b) { return (a > b) ? a : b; }
+    template <typename S> inline S maximum(S a, S b, S c) { return maximum(maximum(a, b), c); }
+    template <typename S> inline S maximum(S a, S b, S c, S d) { return maximum(maximum(maximum(a, b), c), d); }
+
+    static bool inBounds(int v, int l, int h)
+    {
+        return (v >= l) && (v < h);
+    }
+
+    static bool inRange(int v, int l, int h)
+    {
+        return (v >= l) && (v <= h);
+    }
+
+    template<typename T>
+    static inline T max(T a, T b)
+    {
+        return (a > b) ? a : b;
+    }
+
+    template<typename T>
+    static inline T min(T a, T b)
+    {
+        return (a < b) ? a : b;
+    }
+
+    template<typename T>
+    static inline T clamp(T a, T l, T h)
+    {
+        if (a < l)
+            return l;
+        else if (a > h)
+            return h;
+        return a;
+    }
+
+    struct UVec4
+    {
+        uint32_t m_c[4];
+
+        UVec4()
+        {
+            m_c[0] = 0;
+            m_c[1] = 0;
+            m_c[2] = 0;
+            m_c[3] = 0;
+        }
+
+        UVec4(uint32_t x, uint32_t y, uint32_t z, uint32_t w)
+        {
+            m_c[0] = x;
+            m_c[1] = y;
+            m_c[2] = z;
+            m_c[3] = w;
+        }
+
+        uint32_t x() const { return m_c[0]; }
+        uint32_t y() const { return m_c[1]; }
+        uint32_t z() const { return m_c[2]; }
+        uint32_t w() const { return m_c[3]; }
+
+        uint32_t& x() { return m_c[0]; }
+        uint32_t& y() { return m_c[1]; }
+        uint32_t& z() { return m_c[2]; }
+        uint32_t& w() { return m_c[3]; }
+
+        uint32_t operator[] (uint32_t idx) const { assert(idx < 4);  return m_c[idx]; }
+        uint32_t& operator[] (uint32_t idx) { assert(idx < 4);  return m_c[idx]; }
+    };
+
+    struct IVec4
+    {
+        int32_t m_c[4];
+
+        IVec4()
+        {
+            m_c[0] = 0;
+            m_c[1] = 0;
+            m_c[2] = 0;
+            m_c[3] = 0;
+        }
+
+        IVec4(int32_t x, int32_t y, int32_t z, int32_t w)
+        {
+            m_c[0] = x;
+            m_c[1] = y;
+            m_c[2] = z;
+            m_c[3] = w;
+        }
+
+        int32_t x() const { return m_c[0]; }
+        int32_t y() const { return m_c[1]; }
+        int32_t z() const { return m_c[2]; }
+        int32_t w() const { return m_c[3]; }
+
+        int32_t& x() { return m_c[0]; }
+        int32_t& y() { return m_c[1]; }
+        int32_t& z() { return m_c[2]; }
+        int32_t& w() { return m_c[3]; }
+
+        UVec4 asUint() const
+        {
+            return UVec4(maximum(0, m_c[0]), maximum(0, m_c[1]), maximum(0, m_c[2]), maximum(0, m_c[3]));
+        }
+
+        int32_t operator[] (uint32_t idx) const { assert(idx < 4);  return m_c[idx]; }
+        int32_t& operator[] (uint32_t idx) { assert(idx < 4);  return m_c[idx]; }
+    };
+
+    struct IVec3
+    {
+        int32_t m_c[3];
+
+        IVec3()
+        {
+            m_c[0] = 0;
+            m_c[1] = 0;
+            m_c[2] = 0;
+        }
+
+        IVec3(int32_t x, int32_t y, int32_t z)
+        {
+            m_c[0] = x;
+            m_c[1] = y;
+            m_c[2] = z;
+        }
+
+        int32_t x() const { return m_c[0]; }
+        int32_t y() const { return m_c[1]; }
+        int32_t z() const { return m_c[2]; }
+
+        int32_t& x() { return m_c[0]; }
+        int32_t& y() { return m_c[1]; }
+        int32_t& z() { return m_c[2]; }
+
+        int32_t operator[] (uint32_t idx) const { assert(idx < 3);  return m_c[idx]; }
+        int32_t& operator[] (uint32_t idx) { assert(idx < 3);  return m_c[idx]; }
+    };
+
+    static uint32_t deDivRoundUp32(uint32_t a, uint32_t b)
+    {
+        return (a + b - 1) / b;
+    }
+
+    static bool deInBounds32(uint32_t v, uint32_t l, uint32_t h)
+    {
+        return (v >= l) && (v < h);
+    }
+
+namespace astc 
+{
+
+using std::vector;
+
+namespace
+{
+
+// Common utilities
+enum
+{
+    MAX_BLOCK_WIDTH     = 12,
+    MAX_BLOCK_HEIGHT    = 12
+};
+
+inline deUint32 getBit (deUint32 src, int ndx)
+{
+    DE_ASSERT(basisu_astc::inBounds(ndx, 0, 32));
+    return (src >> ndx) & 1;
+}
+
+inline deUint32 getBits (deUint32 src, int low, int high)
+{
+    const int numBits = (high-low) + 1;
+    DE_ASSERT(basisu_astc::inRange(numBits, 1, 32));
+
+    if (numBits < 32)
+        return (deUint32)((src >> low) & ((1u<<numBits)-1));
+    else
+        return (deUint32)((src >> low) & 0xFFFFFFFFu);
+}
+
+inline bool isBitSet (deUint32 src, int ndx)
+{
+    return getBit(src, ndx) != 0;
+}
+
+inline deUint32 reverseBits (deUint32 src, int numBits)
+{
+    DE_ASSERT(basisu_astc::inRange(numBits, 0, 32));
+    
+    deUint32 result = 0;
+    for (int i = 0; i < numBits; i++)
+        result |= ((src >> i) & 1) << (numBits-1-i);
+
+    return result;
+}
+
+inline deUint32 bitReplicationScale (deUint32 src, int numSrcBits, int numDstBits)
+{
+    DE_ASSERT(numSrcBits <= numDstBits);
+    DE_ASSERT((src & ((1<<numSrcBits)-1)) == src);
+
+    deUint32 dst = 0;
+    for (int shift = numDstBits-numSrcBits; shift > -numSrcBits; shift -= numSrcBits)
+        dst |= (shift >= 0) ? (src << shift) : (src >> -shift);
+
+    return dst;
+}
+
+inline deInt32 signExtend (deInt32 src, int numSrcBits)
+{
+    DE_ASSERT(basisu_astc::inRange(numSrcBits, 2, 31));
+
+    const bool negative = (src & (1 << (numSrcBits-1))) != 0;
+    return src | (negative ? ~((1 << numSrcBits) - 1) : 0);
+}
+
+typedef uint16_t deFloat16;
+
+inline bool isFloat16InfOrNan (deFloat16 v)
+{
+    return getBits(v, 10, 14) == 31;
+}
+
+float deFloat16To32(deFloat16 val16)
+{
+    deUint32 sign;
+    deUint32 expotent;
+    deUint32 mantissa;
+
+    union
+    {
+        float       f;
+        deUint32    u;
+    } x;
+
+    x.u = 0u;
+
+    sign = ((deUint32)val16 >> 15u) & 0x00000001u;
+    expotent = ((deUint32)val16 >> 10u) & 0x0000001fu;
+    mantissa = (deUint32)val16 & 0x000003ffu;
+
+    if (expotent == 0u)
+    {
+        if (mantissa == 0u)
+        {
+            /* +/- 0 */
+            x.u = sign << 31u;
+            return x.f;
+        }
+        else
+        {
+            /* Denormalized, normalize it. */
+
+            while (!(mantissa & 0x00000400u))
+            {
+                mantissa <<= 1u;
+                expotent -= 1u;
+            }
+
+            expotent += 1u;
+            mantissa &= ~0x00000400u;
+        }
+    }
+    else if (expotent == 31u)
+    {
+        if (mantissa == 0u)
+        {
+            /* +/- InF */
+            x.u = (sign << 31u) | 0x7f800000u;
+            return x.f;
+        }
+        else
+        {
+            /* +/- NaN */
+            x.u = (sign << 31u) | 0x7f800000u | (mantissa << 13u);
+            return x.f;
+        }
+    }
+
+    expotent = expotent + (127u - 15u);
+    mantissa = mantissa << 13u;
+
+    x.u = (sign << 31u) | (expotent << 23u) | mantissa;
+    return x.f;
+}
+
+enum ISEMode
+{
+    ISEMODE_TRIT = 0,
+    ISEMODE_QUINT,
+    ISEMODE_PLAIN_BIT,
+    ISEMODE_LAST
+};
+
+struct ISEParams
+{
+    ISEMode     mode;
+    int         numBits;
+    ISEParams (ISEMode mode_, int numBits_) : mode(mode_), numBits(numBits_) {}
+};
+
+inline int computeNumRequiredBits (const ISEParams& iseParams, int numValues)
+{
+    switch (iseParams.mode)
+    {
+        case ISEMODE_TRIT:          return deDivRoundUp32(numValues*8, 5) + numValues*iseParams.numBits;
+        case ISEMODE_QUINT:         return deDivRoundUp32(numValues*7, 3) + numValues*iseParams.numBits;
+        case ISEMODE_PLAIN_BIT:     return numValues*iseParams.numBits;
+        default:
+            DE_ASSERT(false);
+            return -1;
+    }
+}
+
+ISEParams computeMaximumRangeISEParams (int numAvailableBits, int numValuesInSequence)
+{
+    int curBitsForTritMode      = 6;
+    int curBitsForQuintMode     = 5;
+    int curBitsForPlainBitMode  = 8;
+
+    while (true)
+    {
+        DE_ASSERT(curBitsForTritMode > 0 || curBitsForQuintMode > 0 || curBitsForPlainBitMode > 0);
+        const int tritRange         = (curBitsForTritMode > 0)        ? (3 << curBitsForTritMode) - 1         : -1;
+        const int quintRange        = (curBitsForQuintMode > 0)       ? (5 << curBitsForQuintMode) - 1        : -1;
+        const int plainBitRange     = (curBitsForPlainBitMode > 0)    ? (1 << curBitsForPlainBitMode) - 1     : -1;
+        const int maxRange          = basisu_astc::max(basisu_astc::max(tritRange, quintRange), plainBitRange);
+
+        if (maxRange == tritRange)
+        {
+            const ISEParams params(ISEMODE_TRIT, curBitsForTritMode);
+
+            if (computeNumRequiredBits(params, numValuesInSequence) <= numAvailableBits)
+                return ISEParams(ISEMODE_TRIT, curBitsForTritMode);
+
+            curBitsForTritMode--;
+        }
+        else if (maxRange == quintRange)
+        {
+            const ISEParams params(ISEMODE_QUINT, curBitsForQuintMode);
+
+            if (computeNumRequiredBits(params, numValuesInSequence) <= numAvailableBits)
+                return ISEParams(ISEMODE_QUINT, curBitsForQuintMode);
+
+            curBitsForQuintMode--;
+        }
+        else
+        {
+            const ISEParams params(ISEMODE_PLAIN_BIT, curBitsForPlainBitMode);
+            DE_ASSERT(maxRange == plainBitRange);
+
+            if (computeNumRequiredBits(params, numValuesInSequence) <= numAvailableBits)
+                return ISEParams(ISEMODE_PLAIN_BIT, curBitsForPlainBitMode);
+
+            curBitsForPlainBitMode--;
+        }
+    }
+}
+
+inline int computeNumColorEndpointValues (deUint32 endpointMode)
+{
+    DE_ASSERT(endpointMode < 16);
+    return (endpointMode/4 + 1) * 2;
+}
+
+// Decompression utilities
+enum DecompressResult
+{
+    DECOMPRESS_RESULT_VALID_BLOCK   = 0,    //!< Decompressed valid block
+    DECOMPRESS_RESULT_ERROR,                //!< Encountered error while decompressing, error color written
+    DECOMPRESS_RESULT_LAST
+};
+
+// Read-only view of a 16-byte (128-bit) block. Bit 0 is the least significant bit of src[0].
+class Block128
+{
+private:
+    typedef deUint64 Word;
+
+    enum
+    {
+        WORD_BYTES  = sizeof(Word),
+        WORD_BITS   = 8*WORD_BYTES,
+        NUM_WORDS   = 128 / WORD_BITS
+    };
+    //DE_STATIC_ASSERT(128 % WORD_BITS == 0);
+
+public:
+    // Packs the 16 bytes at src into m_words; byte k of each word lands in bits [8k, 8k+7].
+    Block128 (const deUint8* src)
+    {
+        for (int w = 0; w < NUM_WORDS; w++)
+        {
+            Word packed = 0;
+            for (int b = WORD_BYTES-1; b >= 0; b--)
+                packed = (packed << 8) | (Word)src[w*WORD_BYTES + b];
+            m_words[w] = packed;
+        }
+    }
+    // Returns bit ndx as 0 or 1.
+    deUint32 getBit (int ndx) const
+    {
+        DE_ASSERT(basisu_astc::inBounds(ndx, 0, 128));
+        return (deUint32)((m_words[ndx / WORD_BITS] >> (ndx % WORD_BITS)) & 1);
+    }
+    // Returns bits [low, high] (both inclusive, at most 32 of them) with bit 'low' in the LSB; 0 for the empty range high == low-1.
+    deUint32 getBits (int low, int high) const
+    {
+        DE_ASSERT(basisu_astc::inBounds(low, 0, 128));
+        DE_ASSERT(basisu_astc::inBounds(high, 0, 128));
+        DE_ASSERT(basisu_astc::inRange(high-low+1, 0, 32));
+        if (high-low+1 == 0)
+            return 0;
+
+        const int   loWord  = low / WORD_BITS;
+        const int   hiWord  = high / WORD_BITS;
+        // Keeps bits [0, high%WORD_BITS] of a word. \note "1 << n << 1" instead of "1 << (n+1)" so the shift amount never reaches WORD_BITS.
+        const Word  hiMask  = ((Word)1 << high%WORD_BITS << 1) - 1;
+        if (loWord == hiWord)
+            return (deUint32)((m_words[loWord] & hiMask) >> ((Word)low % WORD_BITS));
+        DE_ASSERT(hiWord == loWord + 1);
+        const deUint32 fromLoWord = (deUint32)(m_words[loWord] >> (low%WORD_BITS));
+        const deUint32 fromHiWord = (deUint32)((m_words[hiWord] & hiMask) << (high-low - high%WORD_BITS));
+        return fromLoWord | fromHiWord;
+    }
+    // True when bit ndx is 1.
+    bool isBitSet (int ndx) const
+    {
+        DE_ASSERT(basisu_astc::inBounds(ndx, 0, 128));
+        return getBit(ndx) != 0;
+    }
+
+private:
+    Word m_words[NUM_WORDS];
+};
+
+// Sequential reader over m_length bits of a Block128, starting at source bit startNdxInSrc and moving
+// towards higher source bits when 'forward' is set, towards lower ones otherwise.
+class BitAccessStream
+{
+public:
+    // \note Only a reference to src is kept, so src has to outlive the stream.
+    BitAccessStream (const Block128& src, int startNdxInSrc, int length, bool forward)
+        : m_src(src), m_startNdxInSrc(startNdxInSrc), m_length(length), m_forward(forward), m_ndx(0)
+    {
+    }
+
+    // Get the next num bits. Bits at positions greater than or equal to m_length are zeros.
+    // \note A call that reads anything advances the position by the full num, even if fewer bits were left; after that 0 is returned.
+    deUint32 getNext (int num)
+    {
+        if (num == 0 || m_ndx >= m_length)
+            return 0;
+
+        const int first = m_ndx;
+        const int count = basisu_astc::max(0, basisu_astc::min(m_length, first + num) - first); // Bits still inside the stream.
+        const int last  = first + count - 1;
+        m_ndx += num;
+
+        if (m_forward)
+            return m_src.getBits(m_startNdxInSrc + first, m_startNdxInSrc + last);
+        // Backward: the same stream positions lie below m_startNdxInSrc; the bits read are passed through reverseBits().
+        return reverseBits(m_src.getBits(m_startNdxInSrc - last, m_startNdxInSrc - first), count);
+    }
+
+private:
+    const Block128&     m_src;
+    const int           m_startNdxInSrc;
+    const int           m_length;
+    const bool          m_forward;
+    int                 m_ndx;              //!< Stream position of the next unread bit.
+};
+// One value produced by the ISE decoders (decodeISE and its per-block helpers).
+struct ISEDecodedResult
+{
+    deUint32 m;  //!< The numBits plain bits read for this value.
+    deUint32 tq; //!< Trit or quint value, depending on ISE mode.
+    deUint32 v;  //!< Combined value: (tq << numBits) + m in trit/quint mode, m in plain-bit mode.
+};
+
+// Data from an ASTC block's "block mode" part (i.e. bits [0,10]), as decoded by getASTCBlockMode().
+struct ASTCBlockMode
+{
+    bool        isError;            //!< Set when the block mode is a reserved (invalid) encoding.
+    // \note Following fields only relevant if !isError.
+    bool        isVoidExtent;       //!< Set when bits [0,8] equal 0x1fc.
+    // \note Following fields only relevant if !isVoidExtent.
+    bool        isDualPlane;        //!< Doubles the weight count (see computeNumWeights()).
+    int         weightGridWidth;    //!< Width of the weight grid, in weights.
+    int         weightGridHeight;   //!< Height of the weight grid, in weights.
+    ISEParams   weightISEParams;    //!< ISE mode and bit count used for the weights.
+    // Default state: an error block with placeholder values in every other field.
+    ASTCBlockMode (void)
+        : isError           (true)
+        , isVoidExtent      (true)
+        , isDualPlane       (true)
+        , weightGridWidth   (-1)
+        , weightGridHeight  (-1)
+        , weightISEParams   (ISEMODE_LAST, -1)
+    {
+    }
+};
+// Total number of weights in the block: one per weight-grid cell, doubled when isDualPlane is set.
+inline int computeNumWeights (const ASTCBlockMode& mode)
+{
+    return (mode.isDualPlane ? 2 : 1) * (mode.weightGridWidth * mode.weightGridHeight);
+}
+// The two color endpoints of one partition, as filled in by decodeColorEndpoints().
+struct ColorEndpointPair
+{
+    UVec4 e0; //!< First endpoint (RGBA).
+    UVec4 e1; //!< Second endpoint (RGBA).
+};
+// Two weights for one texel. NOTE(review): presumably one per weight plane, the second used only for dual-plane blocks - confirm at the use site.
+struct TexelWeightPair
+{
+    deUint32 w[2];
+};
+// Decodes the block-mode field of an ASTC block; only bits 0..10 of blockModeData are examined.
+// Reserved encodings return with isError set; void-extent blocks return with only isError/isVoidExtent updated.
+ASTCBlockMode getASTCBlockMode (deUint32 blockModeData)
+{
+    ASTCBlockMode result;
+    result.isError      = true; // \note Cleared right before each successful return.
+    result.isVoidExtent = getBits(blockModeData, 0, 8) == 0x1fc;
+
+    if (result.isVoidExtent)
+    {
+        // No weight grid to describe: the remaining fields keep their constructor placeholders.
+        result.isError = false;
+        return result;
+    }
+
+    const bool lowPairZero = getBits(blockModeData, 0, 1) == 0;
+
+    if ((lowPairZero && getBits(blockModeData, 6, 8) == 7) || getBits(blockModeData, 0, 3) == 0)
+        return result; // Invalid ("reserved").
+
+    const deUint32  a = getBits(blockModeData, 5, 6);
+    deUint32        r = (deUint32)-1; // \note Set in the following branches.
+
+    if (lowPairZero)
+    {
+        // r is assembled from bit 3 (MSB), bit 2 and bit 4 (LSB).
+        r = (getBit(blockModeData, 3) << 2) | (getBit(blockModeData, 2) << 1) | (getBit(blockModeData, 4) << 0);
+
+        switch (getBits(blockModeData, 7, 8))
+        {
+            case 0:
+                result.weightGridWidth  = 12;
+                result.weightGridHeight = a + 2;
+                break;
+            case 1:
+                result.weightGridWidth  = a + 2;
+                result.weightGridHeight = 12;
+                break;
+            case 2:
+                result.weightGridWidth  = a + 6;
+                result.weightGridHeight = getBits(blockModeData, 9, 10) + 6;
+                break;
+            default: // 3: bit 5 picks between 10x6 and 6x10.
+                if (isBitSet(blockModeData, 5))
+                {
+                    result.weightGridWidth  = 10;
+                    result.weightGridHeight = 6;
+                }
+                else
+                {
+                    result.weightGridWidth  = 6;
+                    result.weightGridHeight = 10;
+                }
+                break;
+        }
+    }
+    else
+    {
+        // r is assembled from bit 1 (MSB), bit 0 and bit 4 (LSB).
+        r = (getBit(blockModeData, 1) << 2) | (getBit(blockModeData, 0) << 1) | (getBit(blockModeData, 4) << 0);
+
+        const deUint32 layout = getBits(blockModeData, 2, 3);
+
+        if (layout == 3)
+        {
+            const deUint32 b = getBit(blockModeData, 7);
+
+            if (isBitSet(blockModeData, 8))
+            {
+                result.weightGridWidth  = b + 2;
+                result.weightGridHeight = a + 2;
+            }
+            else
+            {
+                result.weightGridWidth  = a + 2;
+                result.weightGridHeight = b + 6;
+            }
+        }
+        else if (layout == 2)
+        {
+            result.weightGridWidth  = a + 2;
+            result.weightGridHeight = getBits(blockModeData, 7, 8) + 8;
+        }
+        else
+        {
+            // Layout 0 adds 4 to the two-bit field in bits [7,8], layout 1 adds 8.
+            result.weightGridWidth  = getBits(blockModeData, 7, 8) + (layout == 0 ? 4 : 8);
+            result.weightGridHeight = a + 2;
+        }
+    }
+
+    // When bits [0,1] are 0 and bits [7,8] are 2, bits 9-10 hold part of the grid height (see above),
+    // so both the h flag (bit 9) and the dual-plane flag (bit 10) read as 0.
+    const bool  flagsAbsent     = lowPairZero && getBits(blockModeData, 7, 8) == 2;
+    const int   hFlag           = (!flagsAbsent && isBitSet(blockModeData, 9)) ? 1 : 0;
+    result.isDualPlane          = !flagsAbsent && isBitSet(blockModeData, 10);
+
+    // Weight ISE parameters, indexed [hFlag][r]. r is always in 2..7 here; entries 0 and 1 only pad the table.
+    static const struct { ISEMode mode; int numBits; } weightRanges[2][8] =
+    {
+        { { ISEMODE_PLAIN_BIT, 0 }, { ISEMODE_PLAIN_BIT, 0 }, { ISEMODE_PLAIN_BIT, 1 }, { ISEMODE_TRIT, 0 }, { ISEMODE_PLAIN_BIT, 2 }, { ISEMODE_QUINT, 0 }, { ISEMODE_TRIT, 1 }, { ISEMODE_PLAIN_BIT, 3 } },
+        { { ISEMODE_PLAIN_BIT, 0 }, { ISEMODE_PLAIN_BIT, 0 }, { ISEMODE_QUINT, 1 },     { ISEMODE_TRIT, 2 }, { ISEMODE_PLAIN_BIT, 4 }, { ISEMODE_QUINT, 2 }, { ISEMODE_TRIT, 3 }, { ISEMODE_PLAIN_BIT, 5 } }
+    };
+
+    DE_ASSERT(r >= 2 && r <= 7);
+    result.weightISEParams.mode     = weightRanges[hFlag][r].mode;
+    result.weightISEParams.numBits  = weightRanges[hFlag][r].numBits;
+
+    result.isError = false;
+    return result;
+}
+
+inline void setASTCErrorColorBlock (void* dst, int blockWidth, int blockHeight, bool isSRGB)
+{
+    if (isSRGB)
+    {
+        deUint8* const dstU = (deUint8*)dst;
+        for (int i = 0; i < blockWidth*blockHeight; i++)
+        {
+            dstU[4*i + 0] = 0xff;
+            dstU[4*i + 1] = 0;
+            dstU[4*i + 2] = 0xff;
+            dstU[4*i + 3] = 0xff;
+        }
+    }
+    else
+    {
+        float* const dstF = (float*)dst;
+        for (int i = 0; i < blockWidth*blockHeight; i++)
+        {
+            dstF[4*i + 0] = 1.0f;
+            dstF[4*i + 1] = 0.0f;
+            dstF[4*i + 2] = 1.0f;
+            dstF[4*i + 3] = 1.0f;
+        }
+    }
+}
+// Decodes a void-extent (single-color) block into blockWidth*blockHeight RGBA texels at dst (bytes when isSRGB, floats otherwise).
+// On an HDR block in LDR mode, inconsistent extents, or an Inf/NaN HDR component, fills dst with the error color and returns DECOMPRESS_RESULT_ERROR.
+DecompressResult decodeVoidExtentBlock (void* dst, const Block128& blockData, int blockWidth, int blockHeight, bool isSRGB, bool isLDRMode)
+{
+    // minS, maxS, minT, maxT: four consecutive 13-bit fields starting at bit 12.
+    const deUint32 extents[4] = { blockData.getBits(12, 24), blockData.getBits(25, 37), blockData.getBits(38, 50), blockData.getBits(51, 63) };
+    // The min < max requirement is skipped only when all four fields are all ones.
+    bool anyExtentGiven = false;
+    for (int i = 0; i < 4; i++)
+        anyExtentGiven = anyExtentGiven || (extents[i] != 0x1fff);
+
+    const bool extentsInvalid   = anyExtentGiven && (extents[0] >= extents[1] || extents[2] >= extents[3]);
+    const bool isHDRBlock       = blockData.isBitSet(9);
+
+    if ((isLDRMode && isHDRBlock) || extentsInvalid)
+    {
+        setASTCErrorColorBlock(dst, blockWidth, blockHeight, isSRGB);
+        return DECOMPRESS_RESULT_ERROR;
+    }
+
+    // The color itself: four 16-bit channels in bits [64,127].
+    deUint32 rgba[4];
+    for (int c = 0; c < 4; c++)
+        rgba[c] = blockData.getBits(64 + 16*c, 79 + 16*c);
+
+    const int numTexels = blockWidth * blockHeight;
+
+    if (isSRGB)
+    {
+        // Byte output keeps the top 8 bits of each channel.
+        deUint8* const dstU = (deUint8*)dst;
+        for (int i = 0; i < numTexels; i++)
+            for (int c = 0; c < 4; c++)
+                dstU[i * 4 + c] = (deUint8)((rgba[c] & 0xff00) >> 8);
+        return DECOMPRESS_RESULT_VALID_BLOCK;
+    }
+
+    float color[4];
+
+    if (isHDRBlock)
+    {
+        // Channels are half floats; Inf/NaN is reported as an error before any texel is written.
+        for (int c = 0; c < 4; c++)
+        {
+            if (isFloat16InfOrNan((deFloat16)rgba[c]))
+            {
+                setASTCErrorColorBlock(dst, blockWidth, blockHeight, isSRGB);
+                return DECOMPRESS_RESULT_ERROR;
+            }
+        }
+        for (int c = 0; c < 4; c++)
+            color[c] = deFloat16To32((deFloat16)rgba[c]);
+    }
+    else
+    {
+        // Value / 65536, except that 65535 maps to exactly 1.0.
+        for (int c = 0; c < 4; c++)
+            color[c] = (rgba[c] == 65535) ? 1.0f : ((float)rgba[c] / 65536.0f);
+    }
+
+    float* const dstF = (float*)dst;
+    for (int i = 0; i < numTexels; i++)
+        for (int c = 0; c < 4; c++)
+            dstF[i * 4 + c] = color[c];
+
+    return DECOMPRESS_RESULT_VALID_BLOCK;
+}
+// Writes one color endpoint mode per partition to endpointModesDst; multi-partition field bits past the first four (block bits 25-28) come from extraCemBitsStart upward.
+void decodeColorEndpointModes (deUint32* endpointModesDst, const Block128& blockData, int numPartitions, int extraCemBitsStart)
+{
+    if (numPartitions == 1)
+    {
+        endpointModesDst[0] = blockData.getBits(13, 16);
+        return;
+    }
+
+    const deUint32 selector = blockData.getBits(23, 24);
+    if (selector == 0)
+    {
+        const deUint32 sharedMode = blockData.getBits(25, 28);
+        for (int p = 0; p < numPartitions; p++)
+            endpointModesDst[p] = sharedMode;
+        return;
+    }
+
+    for (int p = 0; p < numPartitions; p++)
+    {
+        // The class is selector or selector-1, chosen by this partition's bit at 25+p; two more field bits give the low part.
+        const deUint32 cemClass = blockData.isBitSet(25 + p) ? selector : selector - 1;
+        deUint32 lowBits = 0;
+        for (deUint32 k = 0; k < 2; k++)
+        {
+            const deUint32 fieldNdx = numPartitions + 2*p + k; // Index within the field that starts at bit 25.
+            lowBits |= blockData.getBit(fieldNdx < 4 ? 25+fieldNdx : extraCemBitsStart+fieldNdx-4) << k;
+        }
+        endpointModesDst[p] = (cemClass << 2) | lowBits;
+    }
+}
+// Total number of endpoint values over all partitions: the per-mode overload summed over endpointModes[0..numPartitions).
+int computeNumColorEndpointValues (const deUint32* endpointModes, int numPartitions)
+{
+    int total = 0;
+
+    for (int p = numPartitions; p-- > 0;)
+        total += computeNumColorEndpointValues(endpointModes[p]);
+
+    return total;
+}
+// Decodes one ISE trit block from data: up to five values, each numBits plain bits plus one trit, written to dst[0..numValues).
+void decodeISETritBlock (ISEDecodedResult* dst, int numValues, BitAccessStream& data, int numBits)
+{
+    DE_ASSERT(basisu_astc::inRange(numValues, 1, 5));
+    // Stream layout: m0 T[1:0] m1 T[3:2] m2 T[4] m3 T[6:5] m4 T[7]. Every field is requested regardless of numValues.
+    deUint32 m[5];
+    m[0]            = data.getNext(numBits);
+    deUint32 T01    = data.getNext(2);
+    m[1]            = data.getNext(numBits);
+    deUint32 T23    = data.getNext(2);
+    m[2]            = data.getNext(numBits);
+    deUint32 T4     = data.getNext(1);
+    m[3]            = data.getNext(numBits);
+    deUint32 T56    = data.getNext(2);
+    m[4]            = data.getNext(numBits);
+    deUint32 T7     = data.getNext(1);
+    // For a partial block, the T pieces read after the last wanted value are forced to zero.
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wimplicit-fallthrough="            
+#endif  
+    switch (numValues)
+    {
+        // \note Fall-throughs.
+        case 1: T23     = 0;
+        case 2: T4      = 0;
+        case 3: T56     = 0;
+        case 4: T7      = 0;
+        case 5: break;
+        default:
+            DE_ASSERT(false);
+    }
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif 
+
+    const deUint32 T = (T7 << 7) | (T56 << 5) | (T4 << 4) | (T23 << 2) | (T01 << 0);
+    // tritsFromT[T] lists the five trits packed into the 8-bit value T.
+    static const deUint32 tritsFromT[256][5] =
+    {
+        { 0,0,0,0,0 }, { 1,0,0,0,0 }, { 2,0,0,0,0 }, { 0,0,2,0,0 }, { 0,1,0,0,0 }, { 1,1,0,0,0 }, { 2,1,0,0,0 }, { 1,0,2,0,0 }, { 0,2,0,0,0 }, { 1,2,0,0,0 }, { 2,2,0,0,0 }, { 2,0,2,0,0 }, { 0,2,2,0,0 }, { 1,2,2,0,0 }, { 2,2,2,0,0 }, { 2,0,2,0,0 },
+        { 0,0,1,0,0 }, { 1,0,1,0,0 }, { 2,0,1,0,0 }, { 0,1,2,0,0 }, { 0,1,1,0,0 }, { 1,1,1,0,0 }, { 2,1,1,0,0 }, { 1,1,2,0,0 }, { 0,2,1,0,0 }, { 1,2,1,0,0 }, { 2,2,1,0,0 }, { 2,1,2,0,0 }, { 0,0,0,2,2 }, { 1,0,0,2,2 }, { 2,0,0,2,2 }, { 0,0,2,2,2 },
+        { 0,0,0,1,0 }, { 1,0,0,1,0 }, { 2,0,0,1,0 }, { 0,0,2,1,0 }, { 0,1,0,1,0 }, { 1,1,0,1,0 }, { 2,1,0,1,0 }, { 1,0,2,1,0 }, { 0,2,0,1,0 }, { 1,2,0,1,0 }, { 2,2,0,1,0 }, { 2,0,2,1,0 }, { 0,2,2,1,0 }, { 1,2,2,1,0 }, { 2,2,2,1,0 }, { 2,0,2,1,0 },
+        { 0,0,1,1,0 }, { 1,0,1,1,0 }, { 2,0,1,1,0 }, { 0,1,2,1,0 }, { 0,1,1,1,0 }, { 1,1,1,1,0 }, { 2,1,1,1,0 }, { 1,1,2,1,0 }, { 0,2,1,1,0 }, { 1,2,1,1,0 }, { 2,2,1,1,0 }, { 2,1,2,1,0 }, { 0,1,0,2,2 }, { 1,1,0,2,2 }, { 2,1,0,2,2 }, { 1,0,2,2,2 },
+        { 0,0,0,2,0 }, { 1,0,0,2,0 }, { 2,0,0,2,0 }, { 0,0,2,2,0 }, { 0,1,0,2,0 }, { 1,1,0,2,0 }, { 2,1,0,2,0 }, { 1,0,2,2,0 }, { 0,2,0,2,0 }, { 1,2,0,2,0 }, { 2,2,0,2,0 }, { 2,0,2,2,0 }, { 0,2,2,2,0 }, { 1,2,2,2,0 }, { 2,2,2,2,0 }, { 2,0,2,2,0 },
+        { 0,0,1,2,0 }, { 1,0,1,2,0 }, { 2,0,1,2,0 }, { 0,1,2,2,0 }, { 0,1,1,2,0 }, { 1,1,1,2,0 }, { 2,1,1,2,0 }, { 1,1,2,2,0 }, { 0,2,1,2,0 }, { 1,2,1,2,0 }, { 2,2,1,2,0 }, { 2,1,2,2,0 }, { 0,2,0,2,2 }, { 1,2,0,2,2 }, { 2,2,0,2,2 }, { 2,0,2,2,2 },
+        { 0,0,0,0,2 }, { 1,0,0,0,2 }, { 2,0,0,0,2 }, { 0,0,2,0,2 }, { 0,1,0,0,2 }, { 1,1,0,0,2 }, { 2,1,0,0,2 }, { 1,0,2,0,2 }, { 0,2,0,0,2 }, { 1,2,0,0,2 }, { 2,2,0,0,2 }, { 2,0,2,0,2 }, { 0,2,2,0,2 }, { 1,2,2,0,2 }, { 2,2,2,0,2 }, { 2,0,2,0,2 },
+        { 0,0,1,0,2 }, { 1,0,1,0,2 }, { 2,0,1,0,2 }, { 0,1,2,0,2 }, { 0,1,1,0,2 }, { 1,1,1,0,2 }, { 2,1,1,0,2 }, { 1,1,2,0,2 }, { 0,2,1,0,2 }, { 1,2,1,0,2 }, { 2,2,1,0,2 }, { 2,1,2,0,2 }, { 0,2,2,2,2 }, { 1,2,2,2,2 }, { 2,2,2,2,2 }, { 2,0,2,2,2 },
+        { 0,0,0,0,1 }, { 1,0,0,0,1 }, { 2,0,0,0,1 }, { 0,0,2,0,1 }, { 0,1,0,0,1 }, { 1,1,0,0,1 }, { 2,1,0,0,1 }, { 1,0,2,0,1 }, { 0,2,0,0,1 }, { 1,2,0,0,1 }, { 2,2,0,0,1 }, { 2,0,2,0,1 }, { 0,2,2,0,1 }, { 1,2,2,0,1 }, { 2,2,2,0,1 }, { 2,0,2,0,1 },
+        { 0,0,1,0,1 }, { 1,0,1,0,1 }, { 2,0,1,0,1 }, { 0,1,2,0,1 }, { 0,1,1,0,1 }, { 1,1,1,0,1 }, { 2,1,1,0,1 }, { 1,1,2,0,1 }, { 0,2,1,0,1 }, { 1,2,1,0,1 }, { 2,2,1,0,1 }, { 2,1,2,0,1 }, { 0,0,1,2,2 }, { 1,0,1,2,2 }, { 2,0,1,2,2 }, { 0,1,2,2,2 },
+        { 0,0,0,1,1 }, { 1,0,0,1,1 }, { 2,0,0,1,1 }, { 0,0,2,1,1 }, { 0,1,0,1,1 }, { 1,1,0,1,1 }, { 2,1,0,1,1 }, { 1,0,2,1,1 }, { 0,2,0,1,1 }, { 1,2,0,1,1 }, { 2,2,0,1,1 }, { 2,0,2,1,1 }, { 0,2,2,1,1 }, { 1,2,2,1,1 }, { 2,2,2,1,1 }, { 2,0,2,1,1 },
+        { 0,0,1,1,1 }, { 1,0,1,1,1 }, { 2,0,1,1,1 }, { 0,1,2,1,1 }, { 0,1,1,1,1 }, { 1,1,1,1,1 }, { 2,1,1,1,1 }, { 1,1,2,1,1 }, { 0,2,1,1,1 }, { 1,2,1,1,1 }, { 2,2,1,1,1 }, { 2,1,2,1,1 }, { 0,1,1,2,2 }, { 1,1,1,2,2 }, { 2,1,1,2,2 }, { 1,1,2,2,2 },
+        { 0,0,0,2,1 }, { 1,0,0,2,1 }, { 2,0,0,2,1 }, { 0,0,2,2,1 }, { 0,1,0,2,1 }, { 1,1,0,2,1 }, { 2,1,0,2,1 }, { 1,0,2,2,1 }, { 0,2,0,2,1 }, { 1,2,0,2,1 }, { 2,2,0,2,1 }, { 2,0,2,2,1 }, { 0,2,2,2,1 }, { 1,2,2,2,1 }, { 2,2,2,2,1 }, { 2,0,2,2,1 },
+        { 0,0,1,2,1 }, { 1,0,1,2,1 }, { 2,0,1,2,1 }, { 0,1,2,2,1 }, { 0,1,1,2,1 }, { 1,1,1,2,1 }, { 2,1,1,2,1 }, { 1,1,2,2,1 }, { 0,2,1,2,1 }, { 1,2,1,2,1 }, { 2,2,1,2,1 }, { 2,1,2,2,1 }, { 0,2,1,2,2 }, { 1,2,1,2,2 }, { 2,2,1,2,2 }, { 2,1,2,2,2 },
+        { 0,0,0,1,2 }, { 1,0,0,1,2 }, { 2,0,0,1,2 }, { 0,0,2,1,2 }, { 0,1,0,1,2 }, { 1,1,0,1,2 }, { 2,1,0,1,2 }, { 1,0,2,1,2 }, { 0,2,0,1,2 }, { 1,2,0,1,2 }, { 2,2,0,1,2 }, { 2,0,2,1,2 }, { 0,2,2,1,2 }, { 1,2,2,1,2 }, { 2,2,2,1,2 }, { 2,0,2,1,2 },
+        { 0,0,1,1,2 }, { 1,0,1,1,2 }, { 2,0,1,1,2 }, { 0,1,2,1,2 }, { 0,1,1,1,2 }, { 1,1,1,1,2 }, { 2,1,1,1,2 }, { 1,1,2,1,2 }, { 0,2,1,1,2 }, { 1,2,1,1,2 }, { 2,2,1,1,2 }, { 2,1,2,1,2 }, { 0,2,2,2,2 }, { 1,2,2,2,2 }, { 2,2,2,2,2 }, { 2,1,2,2,2 }
+    };
+
+    const deUint32 (& trits)[5] = tritsFromT[T];
+    for (int i = 0; i < numValues; i++)
+    {
+        dst[i].m    = m[i];
+        dst[i].tq   = trits[i];
+        dst[i].v    = (trits[i] << numBits) + m[i]; // The trit sits directly above the numBits plain bits.
+    }
+}
+// Decodes one ISE quint block from data: up to three values, each numBits plain bits plus one quint, written to dst[0..numValues).
+void decodeISEQuintBlock (ISEDecodedResult* dst, int numValues, BitAccessStream& data, int numBits)
+{
+    DE_ASSERT(basisu_astc::inRange(numValues, 1, 3));
+    // Stream layout: m0 Q[2:0] m1 Q[4:3] m2 Q[6:5]. Every field is requested regardless of numValues.
+    deUint32 m[3];
+    m[0]            = data.getNext(numBits);
+    deUint32 Q012   = data.getNext(3);
+    m[1]            = data.getNext(numBits);
+    deUint32 Q34    = data.getNext(2);
+    m[2]            = data.getNext(numBits);
+    deUint32 Q56    = data.getNext(2);
+    // For a partial block, the Q pieces read after the last wanted value are forced to zero.
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wimplicit-fallthrough="            
+#endif  
+    switch (numValues)
+    {
+        // \note Fall-throughs.
+        case 1: Q34     = 0;
+        case 2: Q56     = 0;
+        case 3: break;
+        default:
+            DE_ASSERT(false);
+    }
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif 
+
+    const deUint32 Q = (Q56 << 5) | (Q34 << 3) | (Q012 << 0);
+    // NOTE(review): 256 rows are declared but only the 128 listed below are spelled out; Q is built from 2+2+3 bits, so it cannot exceed 127.
+    static const deUint32 quintsFromQ[256][3] =
+    {
+        { 0,0,0 }, { 1,0,0 }, { 2,0,0 }, { 3,0,0 }, { 4,0,0 }, { 0,4,0 }, { 4,4,0 }, { 4,4,4 }, { 0,1,0 }, { 1,1,0 }, { 2,1,0 }, { 3,1,0 }, { 4,1,0 }, { 1,4,0 }, { 4,4,1 }, { 4,4,4 },
+        { 0,2,0 }, { 1,2,0 }, { 2,2,0 }, { 3,2,0 }, { 4,2,0 }, { 2,4,0 }, { 4,4,2 }, { 4,4,4 }, { 0,3,0 }, { 1,3,0 }, { 2,3,0 }, { 3,3,0 }, { 4,3,0 }, { 3,4,0 }, { 4,4,3 }, { 4,4,4 },
+        { 0,0,1 }, { 1,0,1 }, { 2,0,1 }, { 3,0,1 }, { 4,0,1 }, { 0,4,1 }, { 4,0,4 }, { 0,4,4 }, { 0,1,1 }, { 1,1,1 }, { 2,1,1 }, { 3,1,1 }, { 4,1,1 }, { 1,4,1 }, { 4,1,4 }, { 1,4,4 },
+        { 0,2,1 }, { 1,2,1 }, { 2,2,1 }, { 3,2,1 }, { 4,2,1 }, { 2,4,1 }, { 4,2,4 }, { 2,4,4 }, { 0,3,1 }, { 1,3,1 }, { 2,3,1 }, { 3,3,1 }, { 4,3,1 }, { 3,4,1 }, { 4,3,4 }, { 3,4,4 },
+        { 0,0,2 }, { 1,0,2 }, { 2,0,2 }, { 3,0,2 }, { 4,0,2 }, { 0,4,2 }, { 2,0,4 }, { 3,0,4 }, { 0,1,2 }, { 1,1,2 }, { 2,1,2 }, { 3,1,2 }, { 4,1,2 }, { 1,4,2 }, { 2,1,4 }, { 3,1,4 },
+        { 0,2,2 }, { 1,2,2 }, { 2,2,2 }, { 3,2,2 }, { 4,2,2 }, { 2,4,2 }, { 2,2,4 }, { 3,2,4 }, { 0,3,2 }, { 1,3,2 }, { 2,3,2 }, { 3,3,2 }, { 4,3,2 }, { 3,4,2 }, { 2,3,4 }, { 3,3,4 },
+        { 0,0,3 }, { 1,0,3 }, { 2,0,3 }, { 3,0,3 }, { 4,0,3 }, { 0,4,3 }, { 0,0,4 }, { 1,0,4 }, { 0,1,3 }, { 1,1,3 }, { 2,1,3 }, { 3,1,3 }, { 4,1,3 }, { 1,4,3 }, { 0,1,4 }, { 1,1,4 },
+        { 0,2,3 }, { 1,2,3 }, { 2,2,3 }, { 3,2,3 }, { 4,2,3 }, { 2,4,3 }, { 0,2,4 }, { 1,2,4 }, { 0,3,3 }, { 1,3,3 }, { 2,3,3 }, { 3,3,3 }, { 4,3,3 }, { 3,4,3 }, { 0,3,4 }, { 1,3,4 }
+    };
+
+    const deUint32 (& quints)[3] = quintsFromQ[Q];
+    for (int i = 0; i < numValues; i++)
+    {
+        dst[i].m    = m[i];
+        dst[i].tq   = quints[i];
+        dst[i].v    = (quints[i] << numBits) + m[i]; // The quint sits directly above the numBits plain bits.
+    }
+}
+// Plain-bit ISE value: the next numBits bits of the stream are the value itself, stored in both m and v of dst[0].
+inline void decodeISEBitBlock (ISEDecodedResult* dst, BitAccessStream& data, int numBits)
+{
+    dst[0].v    = dst[0].m = data.getNext(numBits);
+    dst[0].tq   = 0; // No trit/quint in this mode; written so that no field of dst[0] is left indeterminate.
+}
+// Decodes numValues ISE values from data into dst[0..numValues), dispatching on params.mode:
+// trits come in blocks of five values, quints in blocks of three, plain bits one value at a time.
+void decodeISE (ISEDecodedResult* dst, int numValues, BitAccessStream& data, const ISEParams& params)
+{
+    const bool isTrit   = (params.mode == ISEMODE_TRIT);
+    const bool isQuint  = (params.mode == ISEMODE_QUINT);
+
+    if (!isTrit && !isQuint)
+    {
+        DE_ASSERT(params.mode == ISEMODE_PLAIN_BIT);
+        for (int i = 0; i < numValues; i++)
+            decodeISEBitBlock(&dst[i], data, params.numBits);
+        return;
+    }
+
+    const int valuesPerBlock    = isTrit ? 5 : 3;
+    const int numBlocks         = deDivRoundUp32(numValues, valuesPerBlock);
+    int       remaining         = numValues;
+
+    for (int blockNdx = 0; blockNdx < numBlocks; blockNdx++, remaining -= valuesPerBlock)
+    {
+        const int count = (blockNdx == numBlocks-1) ? remaining : valuesPerBlock; // Only the final block may be partial.
+        if (isTrit)
+            decodeISETritBlock(&dst[valuesPerBlock*blockNdx], count, data, params.numBits);
+        else
+            decodeISEQuintBlock(&dst[valuesPerBlock*blockNdx], count, data, params.numBits);
+    }
+}
+// Unquantizes numEndpoints ISE-decoded color values into dst: plain-bit values via bitReplicationScale(.., 8), trit/quint values by combining tq with the bits of m.
+void unquantizeColorEndpoints (deUint32* dst, const ISEDecodedResult* iseResults, int numEndpoints, const ISEParams& iseParams)
+{
+    const bool hasTrits     = (iseParams.mode == ISEMODE_TRIT);
+    const bool hasQuints    = (iseParams.mode == ISEMODE_QUINT);
+    if (!hasTrits && !hasQuints)
+    {
+        DE_ASSERT(iseParams.mode == ISEMODE_PLAIN_BIT);
+        for (int i = 0; i < numEndpoints; i++)
+            dst[i] = bitReplicationScale(iseResults[i].v, iseParams.numBits, 8);
+        return;
+    }
+
+    // Even cases are trits with 1..6 plain bits, odd cases are quints with 1..5 plain bits.
+    const int rangeCase = iseParams.numBits*2 - (hasTrits ? 2 : 1);
+    DE_ASSERT(basisu_astc::inRange(rangeCase, 0, 10));
+
+    static const deUint32   Ca[11]  = { 204, 113, 93, 54, 44, 26, 22, 13, 11, 6, 5 };
+    const deUint32          C       = Ca[rangeCase];
+
+    for (int i = 0; i < numEndpoints; i++)
+    {
+        const deUint32 m = iseResults[i].m;
+        const deUint32 a = getBit(m, 0), b = getBit(m, 1), c = getBit(m, 2), d = getBit(m, 3), e = getBit(m, 4), f = getBit(m, 5);
+        const deUint32 A = (a == 0) ? 0 : (1<<9)-1;
+        deUint32       B = (deUint32)-1; // \note Stays at this marker only for an out-of-range rangeCase.
+
+        switch (rangeCase)
+        {
+            case 0: case 1: B = 0; break;
+            case 2:     B = (b << 8) | (b << 4) | (b << 2) | (b << 1); break;
+            case 3:     B = (b << 8) | (b << 3) | (b << 2); break;
+            case 4:     B = (c << 8) | (b << 7) | (c << 3) | (b << 2) | (c << 1) | (b << 0); break;
+            case 5:     B = (c << 8) | (b << 7) | (c << 2) | (b << 1) | (c << 0); break;
+            case 6:     B = (d << 8) | (c << 7) | (b << 6) | (d << 2) | (c << 1) | (b << 0); break;
+            case 7:     B = (d << 8) | (c << 7) | (b << 6) | (d << 1) | (c << 0); break;
+            case 8:     B = (e << 8) | (d << 7) | (c << 6) | (b << 5) | (e << 1) | (d << 0); break;
+            case 9:     B = (e << 8) | (d << 7) | (c << 6) | (b << 5) | (e << 0); break;
+            case 10:    B = (f << 8) | (e << 7) | (d << 6) | (c << 5) | (b << 4) | (f << 0); break;
+            default:    break;
+        }
+        DE_ASSERT(B != (deUint32)-1);
+        dst[i] = (((iseResults[i].tq*C + B) ^ A) >> 2) | (A & 0x80);
+    }
+}
+// In-place bit transfer: b is shifted right by one and receives bit 7 of a; a is shifted right by one,
+// cut to six bits and sign-extended from bit 5, which leaves it in [-32, 31].
+inline void bitTransferSigned (deInt32& a, deInt32& b)
+{
+    b = (b >> 1) | (a & 0x80);
+    a = (a >> 1) & 0x3f;
+
+    if (isBitSet(a, 5))
+        a -= 0x40;
+}
+// Clamps every component of rgba to [0, 0xff] and returns the result as an unsigned vector.
+inline UVec4 clampedRGBA (const IVec4& rgba)
+{
+    const int lo = 0;
+    const int hi = 0xff;
+    return UVec4(basisu_astc::clamp(rgba.x(), lo, hi), basisu_astc::clamp(rgba.y(), lo, hi),
+                 basisu_astc::clamp(rgba.z(), lo, hi), basisu_astc::clamp(rgba.w(), lo, hi));
+}
+// Blue contraction: r and g are each replaced by (component + b) >> 1, while b and a pass through unchanged.
+inline IVec4 blueContract (int r, int g, int b, int a)
+{
+    const int contractedR = (r + b) >> 1, contractedG = (g + b) >> 1;
+    return IVec4(contractedR, contractedG, b, a);
+}
+// True for the color endpoint modes treated as HDR: 2, 3, 7, 11, 14 and 15.
+inline bool isColorEndpointModeHDR (deUint32 mode)
+{
+    switch (mode)
+    {
+        case 2: case 3: case 7: case 11: case 14: case 15:  return true;
+        default:                                            return false;
+    }
+}
+// Decodes color endpoint mode 7 from four unquantized values: a color (red/green/blue) and a scale give two endpoints with channels in [0, 0xfff].
+void decodeHDREndpointMode7 (UVec4& e0, UVec4& e1, deUint32 v0, deUint32 v1, deUint32 v2, deUint32 v3)
+{
+    const deUint32 m10      = getBit(v1, 7) | (getBit(v2, 7) << 1);
+    const deUint32 m23      = getBits(v0, 6, 7);
+    // m10 == 3 is an escape: the major component then comes from m23 (or is 0) and the sub-mode becomes 4 or 5.
+    const deUint32 majComp  = (m10 != 3)  ? m10
+                            : (m23 != 3)  ? m23
+                            :             0;
+    
+    const deUint32 mode     = (m10 != 3)  ? m23
+                            : (m23 != 3)  ? 4
+                            :             5;
+    // Base fields: the low 6 bits of v0 and the low 5 bits of v1..v3. The x bits below extend them.
+    deInt32         red     = (deInt32)getBits(v0, 0, 5);
+    deInt32         green   = (deInt32)getBits(v1, 0, 4);
+    deInt32         blue    = (deInt32)getBits(v2, 0, 4);
+    deInt32         scale   = (deInt32)getBits(v3, 0, 4);
+
+    {
+#define SHOR(DST_VAR, SHIFT, BIT_VAR) (DST_VAR) |= (BIT_VAR) << (SHIFT)
+#define ASSIGN_X_BITS(V0,S0, V1,S1, V2,S2, V3,S3, V4,S4, V5,S5, V6,S6) do { SHOR(V0,S0,x0); SHOR(V1,S1,x1); SHOR(V2,S2,x2); SHOR(V3,S3,x3); SHOR(V4,S4,x4); SHOR(V5,S5,x5); SHOR(V6,S6,x6); } while (false)
+        // Seven spare bits taken from v1..v3; each sub-mode ORs them into different positions of R, G, B or S.
+        const deUint32  x0  = getBit(v1, 6);
+        const deUint32  x1  = getBit(v1, 5);
+        const deUint32  x2  = getBit(v2, 6);
+        const deUint32  x3  = getBit(v2, 5);
+        const deUint32  x4  = getBit(v3, 7);
+        const deUint32  x5  = getBit(v3, 6);
+        const deUint32  x6  = getBit(v3, 5);
+
+        deInt32&        R   = red;
+        deInt32&        G   = green;
+        deInt32&        B   = blue;
+        deInt32&        S   = scale;
+
+        switch (mode)
+        {
+            case 0: ASSIGN_X_BITS(R,9,  R,8,  R,7,  R,10,  R,6,  S,6,   S,5); break;
+            case 1: ASSIGN_X_BITS(R,8,  G,5,  R,7,  B,5,   R,6,  R,10,  R,9); break;
+            case 2: ASSIGN_X_BITS(R,9,  R,8,  R,7,  R,6,   S,7,  S,6,   S,5); break;
+            case 3: ASSIGN_X_BITS(R,8,  G,5,  R,7,  B,5,   R,6,  S,6,   S,5); break;
+            case 4: ASSIGN_X_BITS(G,6,  G,5,  B,6,  B,5,   R,6,  R,7,   S,5); break;
+            case 5: ASSIGN_X_BITS(G,6,  G,5,  B,6,  B,5,   R,6,  S,6,   S,5); break;
+            default:
+                DE_ASSERT(false);
+        }
+#undef ASSIGN_X_BITS
+#undef SHOR
+    }
+    // All four fields are scaled up by a per-sub-mode shift.
+    static const int shiftAmounts[] = { 1, 1, 2, 3, 4, 5 };
+    DE_ASSERT(mode < DE_LENGTH_OF_ARRAY(shiftAmounts));
+
+    red     <<= shiftAmounts[mode];
+    green   <<= shiftAmounts[mode];
+    blue    <<= shiftAmounts[mode];
+    scale   <<= shiftAmounts[mode];
+    // Except in sub-mode 5, green and blue are stored as differences from red.
+    if (mode != 5)
+    {
+        green   = red - green;
+        blue    = red - blue;
+    }
+    // Move the major component out of the red slot.
+    if (majComp == 1)
+        std::swap(red, green);
+    else if (majComp == 2)
+        std::swap(red, blue);
+    // e0 is the color minus scale, e1 the color itself; alpha is fixed at 0x780 in both.
+    e0 = UVec4(basisu_astc::clamp(red   - scale,    0, 0xfff),
+        basisu_astc::clamp(green    - scale,    0, 0xfff),
+        basisu_astc::clamp(blue - scale,    0, 0xfff),
+               0x780);
+
+    e1 = UVec4(basisu_astc::clamp(red,              0, 0xfff),
+        basisu_astc::clamp(green,               0, 0xfff),
+        basisu_astc::clamp(blue,                0, 0xfff),
+               0x780);
+}
+// Decodes color endpoint mode 11 from six unquantized values into two endpoints; alpha is fixed at 0x780 in both.
+void decodeHDREndpointMode11 (UVec4& e0, UVec4& e1, deUint32 v0, deUint32 v1, deUint32 v2, deUint32 v3, deUint32 v4, deUint32 v5)
+{
+    const deUint32 major = (getBit(v5, 7) << 1) | getBit(v4, 7);
+    // major == 3 selects the direct form: the values are used as they are, only shifted up (by 4, or by 5 for the 7-bit v4/v5 fields).
+    if (major == 3)
+    {
+        e0 = UVec4(v0<<4, v2<<4, getBits(v4,0,6)<<5, 0x780);
+        e1 = UVec4(v1<<4, v3<<4, getBits(v5,0,6)<<5, 0x780);
+    }
+    else
+    {
+        const deUint32 mode = (getBit(v3, 7) << 2) | (getBit(v2, 7) << 1) | getBit(v1, 7);
+        // a is the base value; c, b0, b1, d0 and d1 are subtracted from it below. The x bits extend these fields per sub-mode.
+        deInt32 a   = (deInt32)((getBit(v1, 6) << 8) | v0);
+        deInt32 c   = (deInt32)(getBits(v1, 0, 5));
+        deInt32 b0  = (deInt32)(getBits(v2, 0, 5));
+        deInt32 b1  = (deInt32)(getBits(v3, 0, 5));
+        deInt32 d0  = (deInt32)(getBits(v4, 0, 4));
+        deInt32 d1  = (deInt32)(getBits(v5, 0, 4));
+
+        {
+#define SHOR(DST_VAR, SHIFT, BIT_VAR) (DST_VAR) |= (BIT_VAR) << (SHIFT)
+#define ASSIGN_X_BITS(V0,S0, V1,S1, V2,S2, V3,S3, V4,S4, V5,S5) do { SHOR(V0,S0,x0); SHOR(V1,S1,x1); SHOR(V2,S2,x2); SHOR(V3,S3,x3); SHOR(V4,S4,x4); SHOR(V5,S5,x5); } while (false)
+            const deUint32 x0 = getBit(v2, 6);
+            const deUint32 x1 = getBit(v3, 6);
+            const deUint32 x2 = getBit(v4, 6);
+            const deUint32 x3 = getBit(v5, 6);
+            const deUint32 x4 = getBit(v4, 5);
+            const deUint32 x5 = getBit(v5, 5);
+
+            switch (mode)
+            {
+                case 0: ASSIGN_X_BITS(b0,6,  b1,6,   d0,6,  d1,6,  d0,5,  d1,5); break;
+                case 1: ASSIGN_X_BITS(b0,6,  b1,6,   b0,7,  b1,7,  d0,5,  d1,5); break;
+                case 2: ASSIGN_X_BITS(a,9,   c,6,    d0,6,  d1,6,  d0,5,  d1,5); break;
+                case 3: ASSIGN_X_BITS(b0,6,  b1,6,   a,9,   c,6,   d0,5,  d1,5); break;
+                case 4: ASSIGN_X_BITS(b0,6,  b1,6,   b0,7,  b1,7,  a,9,   a,10); break;
+                case 5: ASSIGN_X_BITS(a,9,   a,10,   c,7,   c,6,   d0,5,  d1,5); break;
+                case 6: ASSIGN_X_BITS(b0,6,  b1,6,   a,11,  c,6,   a,9,   a,10); break;
+                case 7: ASSIGN_X_BITS(a,9,   a,10,   a,11,  c,6,   d0,5,  d1,5); break;
+                default:
+                    DE_ASSERT(false);
+            }
+#undef ASSIGN_X_BITS
+#undef SHOR
+        }
+        // d0 and d1 are signed; their width in bits depends on the sub-mode.
+        static const int numDBits[] = { 7, 6, 7, 6, 5, 6, 5, 6 };
+        DE_ASSERT(mode < DE_LENGTH_OF_ARRAY(numDBits));
+        d0 = signExtend(d0, numDBits[mode]);
+        d1 = signExtend(d1, numDBits[mode]);
+        // The shifts go through uint32_t, presumably so that a negative d0/d1 is never left-shifted as a signed value.
+        const int shiftAmount = (mode >> 1) ^ 3;
+        a   = (uint32_t)a  << shiftAmount;
+        c   = (uint32_t)c  << shiftAmount;
+        b0  = (uint32_t)b0 << shiftAmount;
+        b1  = (uint32_t)b1 << shiftAmount;
+        d0  = (uint32_t)d0 << shiftAmount;
+        d1  = (uint32_t)d1 << shiftAmount;
+
+        e0 = UVec4(basisu_astc::clamp(a-c, 0, 0xfff), basisu_astc::clamp(a-b0-c-d0, 0, 0xfff), basisu_astc::clamp(a-b1-c-d1, 0, 0xfff), 0x780);
+        e1 = UVec4(basisu_astc::clamp(a, 0, 0xfff), basisu_astc::clamp(a-b0, 0, 0xfff), basisu_astc::clamp(a-b1, 0, 0xfff), 0x780);
+        // Move the major component out of the x slot of both endpoints.
+        if (major == 1)
+        {
+            std::swap(e0.x(), e0.y());
+            std::swap(e1.x(), e1.y());
+        }
+        else if (major == 2)
+        {
+            std::swap(e0.x(), e0.z());
+            std::swap(e1.x(), e1.z());
+        }
+    }
+}
+// Decodes color endpoint mode 15: x/y/z come from the mode-11 decoder applied to v0..v5; v6In/v7In then set the w (alpha) component of both endpoints.
+void decodeHDREndpointMode15(UVec4& e0, UVec4& e1, deUint32 v0, deUint32 v1, deUint32 v2, deUint32 v3, deUint32 v4, deUint32 v5, deUint32 v6In, deUint32 v7In)
+{
+    decodeHDREndpointMode11(e0, e1, v0, v1, v2, v3, v4, v5);
+    // Bit 7 of v6In and v7In form the alpha sub-mode; their low seven bits are payload.
+    const deUint32  mode    = (getBit(v7In, 7) << 1) | getBit(v6In, 7);
+    deInt32         v6      = (deInt32)getBits(v6In, 0, 6);
+    deInt32         v7      = (deInt32)getBits(v7In, 0, 6);
+
+    if (mode == 3)
+    {
+        e0.w() = v6 << 5; // Direct form: both payloads are used as they are, shifted up by 5.
+        e1.w() = v7 << 5;
+    }
+    else
+    {
+        v6 |= (v7 << (mode+1)) & 0x780;     // The top bits of v7 extend v6 (the base value).
+        v7 &= (0x3f >> mode);               // What is left of v7 is a (6-mode)-bit delta...
+        v7 ^= 0x20 >> mode;                 // ...which these two steps sign-extend.
+        v7 -= 0x20 >> mode;
+        v6 <<= 4-mode;
+        v7 = (deInt32)((deUint32)v7 << (4-mode)); // v7 may be negative here: shift as unsigned, as decodeHDREndpointMode11 does, to avoid a signed left shift of a negative value.
+        v7 += v6;
+        v7 = basisu_astc::clamp(v7, 0, 0xfff);
+        e0.w() = v6;
+        e1.w() = v7;
+    }
+}
+
+void decodeColorEndpoints (ColorEndpointPair* dst, const deUint32* unquantizedEndpoints, const deUint32* endpointModes, int numPartitions)
+{ // For each partition p, decodes its run of unquantized values into dst[p].e0/e1 according to endpointModes[p].
+    int unquantizedNdx = 0; // read cursor into unquantizedEndpoints
+
+    for (int partitionNdx = 0; partitionNdx < numPartitions; partitionNdx++)
+    {
+        const deUint32      endpointMode    = endpointModes[partitionNdx];
+        const deUint32*     v               = &unquantizedEndpoints[unquantizedNdx];
+
+        UVec4&              e0              = dst[partitionNdx].e0;
+        UVec4&              e1              = dst[partitionNdx].e1;
+        unquantizedNdx += computeNumColorEndpointValues(endpointMode); // skip past the values this mode consumes
+
+        switch (endpointMode)
+        {
+            case 0: // v[0] / v[1] replicated to x,y,z; w fixed at 0xff
+            {
+                e0 = UVec4(v[0], v[0], v[0], 0xff);
+                e1 = UVec4(v[1], v[1], v[1], 0xff);
+                break;
+            }
+            case 1: // base L0 packed from v[0] and the top bits of v[1]; L1 = L0 + low bits of v[1], capped at 0xff
+            {
+                const deUint32 L0 = (v[0] >> 2) | (getBits(v[1], 6, 7) << 6);
+                const deUint32 L1 = basisu_astc::min(0xffu, L0 + getBits(v[1], 0, 5));
+                e0 = UVec4(L0, L0, L0, 0xff);
+                e1 = UVec4(L1, L1, L1, 0xff);
+                break;
+            }
+            case 2: // pair ordered by v[1] >= v[0]; values scaled by 16 (with +/-8 when swapped); w fixed at 0x780
+            {
+                const deUint32 v1Gr     = v[1] >= v[0];
+                const deUint32 y0       = v1Gr ? v[0]<<4 : (v[1]<<4) + 8;
+                const deUint32 y1       = v1Gr ? v[1]<<4 : (v[0]<<4) - 8; // v[0] > v[1] >= 0 here, so no unsigned underflow
+                e0 = UVec4(y0, y0, y0, 0x780);
+                e1 = UVec4(y1, y1, y1, 0x780);
+                break;
+            }
+            case 3: // bit 7 of v[0] picks one of two packings of base y0 and delta d; y1 = y0 + d capped at 0xfff
+            {
+                const bool      m   = isBitSet(v[0], 7);
+                const deUint32  y0  = m ? (getBits(v[1], 5, 7) << 9) | (getBits(v[0], 0, 6) << 2)
+                                        : (getBits(v[1], 4, 7) << 8) | (getBits(v[0], 0, 6) << 1);
+                const deUint32  d   = m ? getBits(v[1], 0, 4) << 2
+                                        : getBits(v[1], 0, 3) << 1;
+                const deUint32  y1  = basisu_astc::min(0xfffu, y0+d);
+                e0 = UVec4(y0, y0, y0, 0x780);
+                e1 = UVec4(y1, y1, y1, 0x780);
+                break;
+            }
+            case 4: // as case 0, but w comes from v[2] / v[3]
+            {
+                e0 = UVec4(v[0], v[0], v[0], v[2]);
+                e1 = UVec4(v[1], v[1], v[1], v[3]);
+                break;
+            }
+            case 5: // base + offset form (after bitTransferSigned) for the shared x/y/z value and for w; clamped by clampedRGBA
+            {
+                deInt32 v0 = (deInt32)v[0];
+                deInt32 v1 = (deInt32)v[1];
+                deInt32 v2 = (deInt32)v[2];
+                deInt32 v3 = (deInt32)v[3];
+                bitTransferSigned(v1, v0);
+                bitTransferSigned(v3, v2);
+                e0 = clampedRGBA(IVec4(v0,      v0,     v0,     v2));
+                e1 = clampedRGBA(IVec4(v0+v1,   v0+v1,  v0+v1,  v2+v3));
+                break;
+            }
+            case 6: // e1 = (v[0], v[1], v[2]); e0 is e1 multiplied by v[3] / 256; w fixed at 0xff
+                e0 = UVec4((v[0]*v[3]) >> 8,    (v[1]*v[3]) >> 8,   (v[2]*v[3]) >> 8,   0xff);
+                e1 = UVec4(v[0],                v[1],               v[2],               0xff);
+                break;
+            case 7: // handled entirely by decodeHDREndpointMode7
+                decodeHDREndpointMode7(e0, e1, v[0], v[1], v[2], v[3]);
+                break;
+            case 8: // even indices form e0, odd indices e1; if the odd sum is smaller the pair is swapped and blueContract'ed
+            {
+                if (v[1]+v[3]+v[5] >= v[0]+v[2]+v[4])
+                {
+                    e0 = UVec4(v[0], v[2], v[4], 0xff);
+                    e1 = UVec4(v[1], v[3], v[5], 0xff);
+                }
+                else
+                {
+                    e0 = blueContract(v[1], v[3], v[5], 0xff).asUint();
+                    e1 = blueContract(v[0], v[2], v[4], 0xff).asUint();
+                }
+                break;
+            }
+            case 9: // per-channel base + offset; a negative offset sum selects the swapped, blueContract'ed form
+            {
+                deInt32 v0 = (deInt32)v[0];
+                deInt32 v1 = (deInt32)v[1];
+                deInt32 v2 = (deInt32)v[2];
+                deInt32 v3 = (deInt32)v[3];
+                deInt32 v4 = (deInt32)v[4];
+                deInt32 v5 = (deInt32)v[5];
+                bitTransferSigned(v1, v0);
+                bitTransferSigned(v3, v2);
+                bitTransferSigned(v5, v4);
+                if (v1+v3+v5 >= 0)
+                {
+                    e0 = clampedRGBA(IVec4(v0,      v2,     v4,     0xff));
+                    e1 = clampedRGBA(IVec4(v0+v1,   v2+v3,  v4+v5,  0xff));
+                }
+                else
+                {
+                    e0 = clampedRGBA(blueContract(v0+v1,    v2+v3,  v4+v5,  0xff));
+                    e1 = clampedRGBA(blueContract(v0,       v2,     v4,     0xff));
+                }
+                break;
+            }
+            case 10: // as case 6, but w comes from v[4] / v[5]
+            {
+                e0 = UVec4((v[0]*v[3]) >> 8,    (v[1]*v[3]) >> 8,   (v[2]*v[3]) >> 8,   v[4]);
+                e1 = UVec4(v[0],                v[1],               v[2],               v[5]);
+                break;
+            }
+            case 11: // handled entirely by decodeHDREndpointMode11
+            {
+                decodeHDREndpointMode11(e0, e1, v[0], v[1], v[2], v[3], v[4], v[5]);
+                break;
+            }
+            case 12: // as case 8, with w taken from v[6] / v[7]
+            {
+                if (v[1] + v[3] + v[5] >= v[0] + v[2] + v[4])
+                {
+                    e0 = UVec4(v[0], v[2], v[4], v[6]);
+                    e1 = UVec4(v[1], v[3], v[5], v[7]);
+                }
+                else
+                {
+                    e0 = clampedRGBA(blueContract(v[1], v[3], v[5], v[7]));
+                    e1 = clampedRGBA(blueContract(v[0], v[2], v[4], v[6]));
+                }
+                break;
+            }
+            case 13: // as case 9, with a base + offset pair (v6, v7) for w
+            {
+                deInt32 v0 = (deInt32)v[0];
+                deInt32 v1 = (deInt32)v[1];
+                deInt32 v2 = (deInt32)v[2];
+                deInt32 v3 = (deInt32)v[3];
+                deInt32 v4 = (deInt32)v[4];
+                deInt32 v5 = (deInt32)v[5];
+                deInt32 v6 = (deInt32)v[6];
+                deInt32 v7 = (deInt32)v[7];
+                bitTransferSigned(v1, v0);
+                bitTransferSigned(v3, v2);
+                bitTransferSigned(v5, v4);
+                bitTransferSigned(v7, v6);
+                if (v1+v3+v5 >= 0) // only the x/y/z offsets take part in the ordering test
+                {
+                    e0 = clampedRGBA(IVec4(v0,      v2,     v4,     v6));
+                    e1 = clampedRGBA(IVec4(v0+v1,   v2+v3,  v4+v5,  v6+v7));
+                }
+                else
+                {
+                    e0 = clampedRGBA(blueContract(v0+v1,    v2+v3,  v4+v5,  v6+v7));
+                    e1 = clampedRGBA(blueContract(v0,       v2,     v4,     v6));
+                }
+                break;
+            }
+            case 14: // x/y/z from decodeHDREndpointMode11; w copied directly from v[6] / v[7]
+                decodeHDREndpointMode11(e0, e1, v[0], v[1], v[2], v[3], v[4], v[5]);
+                e0.w() = v[6];
+                e1.w() = v[7];
+                break;
+            case 15: // handled entirely by decodeHDREndpointMode15
+            {
+                decodeHDREndpointMode15(e0, e1, v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
+                break;
+            }
+            default: // only modes 0..15 are handled
+                DE_ASSERT(false);
+        }
+    }
+}
+
+void computeColorEndpoints (ColorEndpointPair* dst, const Block128& blockData, const deUint32* endpointModes, int numPartitions, int numColorEndpointValues, const ISEParams& iseParams, int numBitsAvailable)
+{ // Reads numColorEndpointValues ISE values from blockData, unquantizes them and decodes them into dst[0..numPartitions-1].
+    const int           colorEndpointDataStart = (numPartitions == 1) ? 17 : 29; // start bit passed to the stream: 17 for one partition, else 29
+    ISEDecodedResult    colorEndpointData[18]; // 18 entries; decompressBlock() rejects blocks that need more values
+    // NOTE(review): the meaning of the trailing 'true' passed to BitAccessStream is not visible here - confirm.
+    {
+        BitAccessStream dataStream(blockData, colorEndpointDataStart, numBitsAvailable, true);
+        decodeISE(&colorEndpointData[0], numColorEndpointValues, dataStream, iseParams);
+    }
+
+    {
+        deUint32 unquantizedEndpoints[18]; // same capacity as colorEndpointData
+        unquantizeColorEndpoints(&unquantizedEndpoints[0], &colorEndpointData[0], numColorEndpointValues, iseParams);
+        decodeColorEndpoints(dst, &unquantizedEndpoints[0], &endpointModes[0], numPartitions);
+    }
+}
+
+void unquantizeWeights (deUint32 dst[64], const ISEDecodedResult* weightGrid, const ASTCBlockMode& blockMode)
+{ // Expands the block's ISE-decoded weights into dst[0..numWeights-1]; the unused tail of the 64 entries is set to ~0u.
+    const int           numWeights  = computeNumWeights(blockMode);
+    const ISEParams&    iseParams   = blockMode.weightISEParams;
+
+    if ((iseParams.mode == ISEMODE_TRIT) || (iseParams.mode == ISEMODE_QUINT))
+    {
+        const int rangeCase = iseParams.numBits*2 + (iseParams.mode == ISEMODE_QUINT ? 1 : 0); // even = trit, odd = quint
+
+        if ((rangeCase == 0) || (rangeCase == 1)) // numBits == 0: the value is a bare trit or quint, use a lookup table
+        {
+            static const deUint32 map0[3]   = { 0, 32, 63 };
+            static const deUint32 map1[5]   = { 0, 16, 32, 47, 63 };
+            const deUint32* const map = (rangeCase == 0) ? &map0[0] : &map1[0];
+
+            for (int i = 0; i < numWeights; i++)
+            {
+                DE_ASSERT(weightGrid[i].v < (rangeCase == 0 ? 3u : 5u));
+                dst[i] = map[weightGrid[i].v];
+            }
+        }
+        else
+        {
+            DE_ASSERT(rangeCase <= 6);
+            static const deUint32   Ca[5]   = { 50, 28, 23, 13, 11 }; // multiplier for the trit/quint part, indexed by rangeCase-2
+            const deUint32          C       = Ca[rangeCase-2];
+
+            for (int weightNdx = 0; weightNdx < numWeights; weightNdx++)
+            {
+                const deUint32 a = getBit(weightGrid[weightNdx].m, 0);
+                const deUint32 b = getBit(weightGrid[weightNdx].m, 1);
+                const deUint32 c = getBit(weightGrid[weightNdx].m, 2);
+                // A is 0x7f when bit 0 of m is set (else 0); B spreads the remaining m bits according to rangeCase.
+                const deUint32 A = (a == 0) ? 0 : (1<<7)-1;
+                const deUint32 B = (rangeCase == 2) ? 0
+                                 : (rangeCase == 3) ? 0
+                                 : (rangeCase == 4) ? (b << 6) | (b << 2) | (b << 0)
+                                 : (rangeCase == 5) ? (b << 6) | (b << 1)
+                                 : (rangeCase == 6) ? (c << 6) | (b << 5) | (c << 1) |  (b << 0)
+                                 : (deUint32)-1;
+
+                dst[weightNdx] = (((weightGrid[weightNdx].tq*C + B) ^ A) >> 2) | (A & 0x20);
+            }
+        }
+    }
+    else
+    {
+        DE_ASSERT(iseParams.mode == ISEMODE_PLAIN_BIT);
+        for (int weightNdx = 0; weightNdx < numWeights; weightNdx++)
+            dst[weightNdx] = bitReplicationScale(weightGrid[weightNdx].v, iseParams.numBits, 6); // scaled to 6 bits by bitReplicationScale
+    }
+
+    for (int weightNdx = 0; weightNdx < numWeights; weightNdx++)
+        dst[weightNdx] += dst[weightNdx] > 32 ? 1 : 0; // values above 32 are bumped by one, e.g. 63 becomes 64
+
+    // Initialize nonexistent weights to poison values
+    for (int weightNdx = numWeights; weightNdx < 64; weightNdx++)
+        dst[weightNdx] = ~0u;
+}
+
+void interpolateWeights (TexelWeightPair* dst, const deUint32 (&unquantizedWeights) [64], int blockWidth, int blockHeight, const ASTCBlockMode& blockMode)
+{ // Bilinearly samples the weight grid for every texel; writes blockWidth*blockHeight entries to dst (w[0], plus w[1] if dual-plane).
+    const int       numWeightsPerTexel  = blockMode.isDualPlane ? 2 : 1;
+    const deUint32  scaleX              = (1024 + blockWidth/2) / (blockWidth-1); // NOTE(review): divides by zero if blockWidth == 1
+    const deUint32  scaleY              = (1024 + blockHeight/2) / (blockHeight-1); // NOTE(review): divides by zero if blockHeight == 1
+    DE_ASSERT(blockMode.weightGridWidth*blockMode.weightGridHeight*numWeightsPerTexel <= (int)DE_LENGTH_OF_ARRAY(unquantizedWeights));
+
+    for (int texelY = 0; texelY < blockHeight; texelY++)
+    {
+        for (int texelX = 0; texelX < blockWidth; texelX++)
+        {
+            const deUint32 gX   = (scaleX*texelX*(blockMode.weightGridWidth-1) + 32) >> 6; // grid position in 1/16 steps
+            const deUint32 gY   = (scaleY*texelY*(blockMode.weightGridHeight-1) + 32) >> 6;
+            const deUint32 jX   = gX >> 4; // integer grid cell
+            const deUint32 jY   = gY >> 4;
+            const deUint32 fX   = gX & 0xf; // fraction within the cell, 0..15
+            const deUint32 fY   = gY & 0xf;
+            const deUint32 w11  = (fX*fY + 8) >> 4; // the four tap weights below always sum to 16
+            const deUint32 w10  = fY - w11;
+            const deUint32 w01  = fX - w11;
+            const deUint32 w00  = 16 - fX - fY + w11;
+            const deUint32 i00  = jY*blockMode.weightGridWidth + jX; // indices of the four surrounding grid entries
+            const deUint32 i01  = i00 + 1;
+            const deUint32 i10  = i00 + blockMode.weightGridWidth;
+            const deUint32 i11  = i00 + blockMode.weightGridWidth + 1;
+            
+            // These addresses can be out of bounds, but respective weights will be 0 then.
+            DE_ASSERT(deInBounds32(i00, 0, blockMode.weightGridWidth*blockMode.weightGridHeight) || w00 == 0);
+            DE_ASSERT(deInBounds32(i01, 0, blockMode.weightGridWidth*blockMode.weightGridHeight) || w01 == 0);
+            DE_ASSERT(deInBounds32(i10, 0, blockMode.weightGridWidth*blockMode.weightGridHeight) || w10 == 0);
+            DE_ASSERT(deInBounds32(i11, 0, blockMode.weightGridWidth*blockMode.weightGridHeight) || w11 == 0);
+
+            for (int texelWeightNdx = 0; texelWeightNdx < numWeightsPerTexel; texelWeightNdx++)
+            {
+                // & 0x3f clamps address to bounds of unquantizedWeights
+                const deUint32 p00  = unquantizedWeights[(i00 * numWeightsPerTexel + texelWeightNdx) & 0x3f]; // dual-plane weights are interleaved
+                const deUint32 p01  = unquantizedWeights[(i01 * numWeightsPerTexel + texelWeightNdx) & 0x3f];
+                const deUint32 p10  = unquantizedWeights[(i10 * numWeightsPerTexel + texelWeightNdx) & 0x3f];
+                const deUint32 p11  = unquantizedWeights[(i11 * numWeightsPerTexel + texelWeightNdx) & 0x3f];
+                                
+                dst[texelY*blockWidth + texelX].w[texelWeightNdx] = (p00*w00 + p01*w01 + p10*w10 + p11*w11 + 8) >> 4; // rounded divide by 16
+            }
+        }
+    }
+}
+
+void computeTexelWeights (TexelWeightPair* dst, const Block128& blockData, int blockWidth, int blockHeight, const ASTCBlockMode& blockMode)
+{ // Decodes the block's weight grid, unquantizes it and interpolates it to one TexelWeightPair per texel in dst.
+    ISEDecodedResult weightGrid[64]; // 64 entries; decompressBlock() rejects blocks with numWeights > 64
+
+    { // Stream is set up at bit 127. NOTE(review): the 'false' flag presumably selects the opposite read direction - confirm.
+        BitAccessStream dataStream(blockData, 127, computeNumRequiredBits(blockMode.weightISEParams, computeNumWeights(blockMode)), false);
+        decodeISE(&weightGrid[0], computeNumWeights(blockMode), dataStream, blockMode.weightISEParams);
+    }
+
+    {
+        deUint32 unquantizedWeights[64]; // fully written by unquantizeWeights (unused entries are set to ~0u)
+        unquantizeWeights(&unquantizedWeights[0], &weightGrid[0], blockMode);
+
+        interpolateWeights(dst, unquantizedWeights, blockWidth, blockHeight, blockMode);
+    }
+}
+
+inline deUint32 hash52 (deUint32 v)
+{ // Fixed sequence of shift/xor/add/subtract mixing steps; computeTexelPartition() uses it to scramble the partition seed.
+    deUint32 p = v;
+    p ^= p >> 15;   p -= p << 17;   p += p << 7;    p += p << 4; // unsigned arithmetic, so overflow wraps
+    p ^= p >>  5;   p += p << 16;   p ^= p >> 7;    p ^= p >> 3;
+    p ^= p <<  6;   p ^= p >> 17;
+    return p;
+}
+
+int computeTexelPartition (deUint32 seedIn, deUint32 xIn, deUint32 yIn, deUint32 zIn, int numPartitions, bool smallBlock)
+{ // Returns the partition index (0..3) of texel (xIn, yIn) for the given seed and partition count; zIn must be 0.
+    DE_ASSERT(zIn == 0);
+
+    const deUint32  x       = smallBlock ? xIn << 1 : xIn; // small blocks use doubled coordinates
+    const deUint32  y       = smallBlock ? yIn << 1 : yIn;
+    const deUint32  z       = smallBlock ? zIn << 1 : zIn;
+    const deUint32  seed    = seedIn + 1024*(numPartitions-1); // partition count is folded into the seed
+    const deUint32  rnum    = hash52(seed);
+
+    deUint8         seed1   = (deUint8)( rnum                           & 0xf); // twelve 4-bit fields taken from the hash
+    deUint8         seed2   = (deUint8)((rnum >>  4)                    & 0xf);
+    deUint8         seed3   = (deUint8)((rnum >>  8)                    & 0xf);
+    deUint8         seed4   = (deUint8)((rnum >> 12)                    & 0xf);
+    deUint8         seed5   = (deUint8)((rnum >> 16)                    & 0xf);
+    deUint8         seed6   = (deUint8)((rnum >> 20)                    & 0xf);
+    deUint8         seed7   = (deUint8)((rnum >> 24)                    & 0xf);
+    deUint8         seed8   = (deUint8)((rnum >> 28)                    & 0xf);
+    deUint8         seed9   = (deUint8)((rnum >> 18)                    & 0xf); // seed9..seed12 overlap the fields above
+    deUint8         seed10  = (deUint8)((rnum >> 22)                    & 0xf);
+    deUint8         seed11  = (deUint8)((rnum >> 26)                    & 0xf);
+    deUint8         seed12  = (deUint8)(((rnum >> 30) | (rnum << 2))    & 0xf);
+
+    seed1  = (deUint8)(seed1  * seed1 ); // square each field; the maximum 15*15 = 225 still fits in 8 bits
+    seed2  = (deUint8)(seed2  * seed2 );
+    seed3  = (deUint8)(seed3  * seed3 );
+    seed4  = (deUint8)(seed4  * seed4 );
+    seed5  = (deUint8)(seed5  * seed5 );
+    seed6  = (deUint8)(seed6  * seed6 );
+    seed7  = (deUint8)(seed7  * seed7 );
+    seed8  = (deUint8)(seed8  * seed8 );
+    seed9  = (deUint8)(seed9  * seed9 );
+    seed10 = (deUint8)(seed10 * seed10);
+    seed11 = (deUint8)(seed11 * seed11);
+    seed12 = (deUint8)(seed12 * seed12);
+
+    const int shA = (seed & 2) != 0     ? 4     : 5; // shift amounts depend on seed bits 1, 0 and 4 and on numPartitions == 3
+    const int shB = numPartitions == 3  ? 6     : 5;
+    const int sh1 = (seed & 1) != 0     ? shA   : shB;
+    const int sh2 = (seed & 1) != 0     ? shB   : shA;
+    const int sh3 = (seed & 0x10) != 0  ? sh1   : sh2;
+
+    seed1  = (deUint8)(seed1  >> sh1); // odd-numbered x/y seeds use sh1, even-numbered use sh2, z seeds use sh3
+    seed2  = (deUint8)(seed2  >> sh2);
+    seed3  = (deUint8)(seed3  >> sh1);
+    seed4  = (deUint8)(seed4  >> sh2);
+    seed5  = (deUint8)(seed5  >> sh1);
+    seed6  = (deUint8)(seed6  >> sh2);
+    seed7  = (deUint8)(seed7  >> sh1);
+    seed8  = (deUint8)(seed8  >> sh2);
+    seed9  = (deUint8)(seed9  >> sh3);
+    seed10 = (deUint8)(seed10 >> sh3);
+    seed11 = (deUint8)(seed11 >> sh3);
+    seed12 = (deUint8)(seed12 >> sh3);
+
+    const int a =                         0x3f & (seed1*x + seed2*y + seed11*z + (rnum >> 14)); // one 6-bit score per partition
+    const int b =                         0x3f & (seed3*x + seed4*y + seed12*z + (rnum >> 10));
+    const int c = (numPartitions >= 3) ?  0x3f & (seed5*x + seed6*y + seed9*z  + (rnum >>  6))    : 0; // unused partitions score 0
+    const int d = (numPartitions >= 4) ?  0x3f & (seed7*x + seed8*y + seed10*z + (rnum >>  2))    : 0;
+
+    return (a >= b && a >= c && a >= d) ? 0 // highest score wins; ties go to the lower index
+         : (b >= c && b >= d)           ? 1
+         : (c >= d)                     ? 2
+         :                                3;
+}
+
+DecompressResult setTexelColors (void* dst, ColorEndpointPair* colorEndpoints, TexelWeightPair* texelWeights, int ccs, deUint32 partitionIndexSeed,
+                                 int numPartitions, int blockWidth, int blockHeight, bool isSRGB, bool isLDRMode, const deUint32* colorEndpointModes)
+{ // Blends each texel between its partition's endpoints and writes RGBA to dst: 4 x deUint8 per texel if isSRGB, else 4 x float.
+    const bool          smallBlock  = blockWidth*blockHeight < 31; // forwarded to computeTexelPartition
+    DecompressResult    result      = DECOMPRESS_RESULT_VALID_BLOCK; // downgraded to ERROR if any texel hits an HDR endpoint in LDR mode
+    bool                isHDREndpoint[4]; // only the first numPartitions entries are initialized and read
+
+    for (int i = 0; i < numPartitions; i++)
+    {
+        isHDREndpoint[i] = isColorEndpointModeHDR(colorEndpointModes[i]);
+    }
+
+    for (int texelY = 0; texelY < blockHeight; texelY++)
+    {
+        for (int texelX = 0; texelX < blockWidth; texelX++)
+        {
+            const int texelNdx = texelY * blockWidth + texelX;
+            const int colorEndpointNdx = (numPartitions == 1) ? 0 : computeTexelPartition(partitionIndexSeed, texelX, texelY, 0, numPartitions, smallBlock);
+
+            DE_ASSERT(colorEndpointNdx < numPartitions);
+            const UVec4& e0 = colorEndpoints[colorEndpointNdx].e0;
+            const UVec4& e1 = colorEndpoints[colorEndpointNdx].e1;
+            const TexelWeightPair& weight = texelWeights[texelNdx];
+
+            if (isLDRMode && isHDREndpoint[colorEndpointNdx]) // HDR endpoint in LDR mode: write magenta for this texel and report an error
+            {
+                if (isSRGB)
+                {
+                    ((deUint8*)dst)[texelNdx * 4 + 0] = 0xff;
+                    ((deUint8*)dst)[texelNdx * 4 + 1] = 0;
+                    ((deUint8*)dst)[texelNdx * 4 + 2] = 0xff;
+                    ((deUint8*)dst)[texelNdx * 4 + 3] = 0xff;
+                }
+                else
+                {
+                    ((float*)dst)[texelNdx * 4 + 0] = 1.0f;
+                    ((float*)dst)[texelNdx * 4 + 1] = 0;
+                    ((float*)dst)[texelNdx * 4 + 2] = 1.0f;
+                    ((float*)dst)[texelNdx * 4 + 3] = 1.0f;
+                }
+                result = DECOMPRESS_RESULT_ERROR;
+            }
+            else
+            {
+                for (int channelNdx = 0; channelNdx < 4; channelNdx++)
+                {
+                    if (!isHDREndpoint[colorEndpointNdx] || (channelNdx == 3 && colorEndpointModes[colorEndpointNdx] == 14)) // \note Alpha for mode 14 is treated the same as LDR.
+                    {
+                        const deUint32 c0 = (e0[channelNdx] << 8) | (isSRGB ? 0x80 : e0[channelNdx]); // widen to 16 bits: low byte is 0x80 for sRGB, else the value itself
+                        const deUint32 c1 = (e1[channelNdx] << 8) | (isSRGB ? 0x80 : e1[channelNdx]);
+                        const deUint32 w = weight.w[ccs == channelNdx ? 1 : 0]; // channel ccs uses the second weight, all others the first
+                        const deUint32 c = (c0 * (64 - w) + c1 * w + 32) / 64; // blend with weight w out of 64, rounded
+
+                        if (isSRGB)
+                            ((deUint8*)dst)[texelNdx * 4 + channelNdx] = (deUint8)((c & 0xff00) >> 8); // keep the high byte
+                        else
+                            ((float*)dst)[texelNdx * 4 + channelNdx] = (c == 65535) ? 1.0f : (float)c / 65536.0f; // 65535 maps to exactly 1.0
+                    }
+                    else
+                    {
+                        DE_ASSERT(!isSRGB);
+                        //DE_STATIC_ASSERT((basisu_astc::meta::TypesSame<deFloat16, deUint16>::Value));
+
+                        const deUint32      c0 = e0[channelNdx] << 4; // HDR endpoints are shifted up by 4 bits before blending
+                        const deUint32      c1 = e1[channelNdx] << 4;
+                        const deUint32      w = weight.w[(ccs == channelNdx) ? 1 : 0];
+                        const deUint32      c = (c0 * (64 - w) + c1 * w + 32) / 64;
+                        const deUint32      e = getBits(c, 11, 15); // upper bits become the half-float exponent field
+                        const deUint32      m = getBits(c, 0, 10); // lower bits are remapped piecewise-linearly into mt
+                        const deUint32      mt = (m < 512) ? (3 * m)
+                            : (m >= 1536) ? (5 * m - 2048)
+                            : (4 * m - 512);
+
+                        const deFloat16     cf = (deFloat16)((e << 10) + (mt >> 3)); // assemble the half-float bit pattern
+
+                        ((float*)dst)[texelNdx * 4 + channelNdx] = deFloat16To32(isFloat16InfOrNan(cf) ? 0x7bff : cf); // Inf/NaN patterns are replaced by 0x7bff
+                    }
+                
+                } // channelNdx
+            }
+        } // texelX
+    } // texelY
+
+    return result;
+}
+
+DecompressResult decompressBlock (void* dst, const Block128& blockData, int blockWidth, int blockHeight, bool isSRGB, bool isLDR)
+{ // Decodes one block into dst (deUint8 RGBA if isSRGB, float RGBA otherwise); on header errors dst is filled by setASTCErrorColorBlock.
+    DE_ASSERT(isLDR || !isSRGB); // sRGB output is only allowed together with LDR decoding
+    
+    // Decode block mode.
+    const ASTCBlockMode blockMode = getASTCBlockMode(blockData.getBits(0, 10));
+    
+    // Check for block mode errors.
+    if (blockMode.isError)
+    {
+        setASTCErrorColorBlock(dst, blockWidth, blockHeight, isSRGB);
+        return DECOMPRESS_RESULT_ERROR;
+    }
+    
+    // Separate path for void-extent.
+    if (blockMode.isVoidExtent)
+        return decodeVoidExtentBlock(dst, blockData, blockWidth, blockHeight, isSRGB, isLDR);
+    
+    // Compute weight grid values.
+    const int numWeights            = computeNumWeights(blockMode);
+    const int numWeightDataBits     = computeNumRequiredBits(blockMode.weightISEParams, numWeights);
+    const int numPartitions         = (int)blockData.getBits(11, 12) + 1; // field at bits 11..12 holds the partition count minus one
+    
+    // Check for errors in weight grid, partition and dual-plane parameters.
+    if ((numWeights > 64)                               ||
+        (numWeightDataBits > 96)                        ||
+        (numWeightDataBits < 24)                        ||
+        (blockMode.weightGridWidth > blockWidth)        ||
+        (blockMode.weightGridHeight > blockHeight)      ||
+        ((numPartitions == 4) && blockMode.isDualPlane))
+    {
+        setASTCErrorColorBlock(dst, blockWidth, blockHeight, isSRGB);
+        return DECOMPRESS_RESULT_ERROR;
+    }
+    
+    // Compute number of bits available for color endpoint data.
+    const bool  isSingleUniqueCem           = (numPartitions == 1) || (blockData.getBits(23, 24) == 0);
+
+    const int   numConfigDataBits           = ((numPartitions == 1) ? 17 : isSingleUniqueCem ? 29 : 25 + 3*numPartitions) +
+                                              (blockMode.isDualPlane ? 2 : 0);
+
+    const int   numBitsForColorEndpoints    = 128 - numWeightDataBits - numConfigDataBits; // what remains of the 128 bits after weights and config
+
+    const int   extraCemBitsStart           = 127 - numWeightDataBits - (isSingleUniqueCem      ? -1
+                                                                        : (numPartitions == 4)  ? 7
+                                                                        : (numPartitions == 3)  ? 4
+                                                                        : (numPartitions == 2)  ? 1
+                                                                        : 0);
+    
+    // Decode color endpoint modes.
+    deUint32 colorEndpointModes[4];
+    decodeColorEndpointModes(&colorEndpointModes[0], blockData, numPartitions, extraCemBitsStart);
+    const int numColorEndpointValues = computeNumColorEndpointValues(colorEndpointModes, numPartitions);
+    
+    // Check for errors in color endpoint value count.
+    if ((numColorEndpointValues > 18) || (numBitsForColorEndpoints < (int)deDivRoundUp32(13*numColorEndpointValues, 5))) // minimum budget checked is 13*N/5 bits, rounded up
+    {
+        setASTCErrorColorBlock(dst, blockWidth, blockHeight, isSRGB);
+        return DECOMPRESS_RESULT_ERROR;
+    }
+    
+    // Compute color endpoints.
+    ColorEndpointPair colorEndpoints[4];
+    computeColorEndpoints(&colorEndpoints[0], blockData, &colorEndpointModes[0], numPartitions, numColorEndpointValues,
+                          computeMaximumRangeISEParams(numBitsForColorEndpoints, numColorEndpointValues), numBitsForColorEndpoints);
+    
+    // Compute texel weights.
+    TexelWeightPair texelWeights[MAX_BLOCK_WIDTH*MAX_BLOCK_HEIGHT];
+    computeTexelWeights(&texelWeights[0], blockData, blockWidth, blockHeight, blockMode);
+    
+    // Set texel colors.
+    const int       ccs                     = blockMode.isDualPlane ? (int)blockData.getBits(extraCemBitsStart-2, extraCemBitsStart-1) : -1; // channel that uses the second weight; -1 = none
+    const deUint32  partitionIndexSeed      = (numPartitions > 1) ? blockData.getBits(13, 22) : (deUint32)-1; // not used by setTexelColors when numPartitions == 1
+
+    return setTexelColors(dst, &colorEndpoints[0], &texelWeights[0], ccs, partitionIndexSeed, numPartitions, blockWidth, blockHeight, isSRGB, isLDR, &colorEndpointModes[0]);
+}
+
+// Classifies a block from its header fields only (no texels are decoded). Returns -1 on error, 0 if LDR, 1 if HDR
+int isHDR(const Block128& blockData, int blockWidth, int blockHeight)
+{
+    // Decode block mode.
+    const ASTCBlockMode blockMode = getASTCBlockMode(blockData.getBits(0, 10));
+
+    // Check for block mode errors.
+    if (blockMode.isError)
+        return -1;
+
+    // Separate path for void-extent.
+    if (blockMode.isVoidExtent)
+    {
+        const bool isHDRBlock = blockData.isBitSet(9); // for void-extent blocks bit 9 alone decides
+        return isHDRBlock ? 1 : 0;
+    }
+
+    // Compute weight grid values.
+    const int numWeights = computeNumWeights(blockMode);
+    const int numWeightDataBits = computeNumRequiredBits(blockMode.weightISEParams, numWeights);
+    const int numPartitions = (int)blockData.getBits(11, 12) + 1;
+
+    // Check for errors in weight grid, partition and dual-plane parameters.
+    if ((numWeights > 64) ||
+        (numWeightDataBits > 96) ||
+        (numWeightDataBits < 24) ||
+        (blockMode.weightGridWidth > blockWidth) ||
+        (blockMode.weightGridHeight > blockHeight) ||
+        ((numPartitions == 4) && blockMode.isDualPlane))
+    {
+        return -1;
+    }
+
+    // Compute number of bits available for color endpoint data.
+    const bool  isSingleUniqueCem = (numPartitions == 1) || (blockData.getBits(23, 24) == 0);
+
+    const int   extraCemBitsStart = 127 - numWeightDataBits - (isSingleUniqueCem ? -1
+        : (numPartitions == 4) ? 7
+        : (numPartitions == 3) ? 4
+        : (numPartitions == 2) ? 1
+        : 0);
+
+    // Decode color endpoint modes.
+    deUint32 colorEndpointModes[4];
+    decodeColorEndpointModes(&colorEndpointModes[0], blockData, numPartitions, extraCemBitsStart);
+    
+    for (int i = 0; i < numPartitions; i++) // HDR as soon as any partition uses an HDR endpoint mode
+    {
+        if (isColorEndpointModeHDR(colorEndpointModes[i]))
+            return 1;
+    }
+
+    return 0; // note: unlike decompressBlock(), the endpoint value count / bit budget is not validated here
+}
+
+typedef uint16_t half_float;
+
+half_float float_to_half(float val, bool toward_zero)
+{ // Converts a 32-bit float to half-float bits; toward_zero truncates the mantissa instead of rounding it with lrintf().
+    union { float f; int32_t i; uint32_t u; } fi = { val }; // reinterpret the float's bits through the union
+    const int flt_m = fi.i & 0x7FFFFF, flt_e = (fi.i >> 23) & 0xFF, flt_s = (fi.i >> 31) & 0x1; // mantissa, biased exponent, sign
+    int s = flt_s, e = 0, m = 0; // zero and denormal float inputs fall through with e = m = 0 (signed zero)
+
+    // inf/NaN
+    if (flt_e == 0xff)
+    {
+        e = 31;
+        if (flt_m != 0) // NaN
+            m = 1; // the NaN payload is not preserved
+    }
+    // not zero or denormal
+    else if (flt_e != 0)
+    {
+        int new_exp = flt_e - 127; // unbiased exponent
+        if (new_exp > 15)
+            e = 31; // too large for a half: becomes infinity (m stays 0), also when toward_zero is set
+        else if (new_exp < -14) // below the smallest normal half: emit a denormal (e stays 0)
+        {
+            if (toward_zero)
+                m = (int)truncf((1 << 24) * fabsf(fi.f));
+            else
+                m = lrintf((1 << 24) * fabsf(fi.f));
+        }
+        else
+        {
+            e = new_exp + 15; // re-bias for the 5-bit half exponent
+            if (toward_zero)
+                m = (int)truncf((float)flt_m * (1.0f / (float)(1 << 13))); // drop the low 13 mantissa bits
+            else
+                m = lrintf((float)flt_m * (1.0f / (float)(1 << 13)));
+        }
+    }
+
+    assert((0 <= m) && (m <= 1024));
+    if (m == 1024) // rounding overflowed the 10-bit mantissa: carry into the exponent
+    {
+        e++;
+        m = 0;
+    }
+
+    assert((s >= 0) && (s <= 1));
+    assert((e >= 0) && (e <= 31));
+    assert((m >= 0) && (m <= 1023));
+
+    half_float result = (half_float)((s << 15) | (e << 10) | m); // 1 sign bit, 5 exponent bits, 10 mantissa bits
+    return result;
+}
+
+float half_to_float(half_float hval)
+{ // Expands half-float bits to a 32-bit float; zero, denormals, infinities and NaNs are handled explicitly.
+    union { float f; uint32_t u; } x = { 0 }; // result is assembled as bits in x.u and read back as x.f
+
+    uint32_t s = ((uint32_t)hval >> 15) & 1;
+    uint32_t e = ((uint32_t)hval >> 10) & 0x1F;
+    uint32_t m = (uint32_t)hval & 0x3FF;
+
+    if (!e)
+    {
+        if (!m)
+        {
+            // +- 0
+            x.u = s << 31;
+            return x.f;
+        }
+        else
+        {
+            // denormalized
+            while (!(m & 0x00000400)) // shift left until bit 10 (the implicit leading one) is set
+            {
+                m <<= 1;
+                --e; // e is unsigned and wraps below zero; the bias added further down brings it back into range
+            }
+
+            ++e;
+            m &= ~0x00000400; // drop the now-implicit leading one
+        }
+    }
+    else if (e == 31)
+    {
+        if (m == 0)
+        {
+            // +/- INF
+            x.u = (s << 31) | 0x7f800000;
+            return x.f;
+        }
+        else
+        {
+            // +/- NaN
+            x.u = (s << 31) | 0x7f800000 | (m << 13); // mantissa bits are carried over into the float NaN
+            return x.f;
+        }
+    }
+
+    e = e + (127 - 15); // re-bias the exponent from 15 to 127
+    m = m << 13; // widen the 10-bit mantissa to 23 bits
+
+    assert(s <= 1);
+    assert(m <= 0x7FFFFF);
+    assert(e <= 255);
+
+    x.u = m | (e << 23) | (s << 31);
+    return x.f;
+}
+
+} // anonymous
+
+// See https://registry.khronos.org/DataFormat/specs/1.3/dataformat.1.3.inline.html#_hdr_endpoint_decoding
+static void convert_to_half_prec(uint32_t n, float* pVals)
+{ // Replaces each of the n floats in pVals, in place, by its float -> half -> float round trip with rounding toward zero.
+#if 0 // disabled alternative that switches the FPU rounding mode with fesetround(FE_TOWARDZERO)
+    const int prev_dir = fesetround(FE_TOWARDZERO);
+
+    for (uint32_t i = 0; i < n; i++)
+        pVals[i] = half_to_float(float_to_half(pVals[i])); // one-argument call; float_to_half() as defined in this file takes two
+
+    fesetround(prev_dir);
+
+    for (uint32_t i = 0; i < n; i++)
+    {
+        assert(pVals[i] == half_to_float(float_to_half(pVals[i], true)));
+    }
+#else
+    // This ensures the values are rounded towards zero as half floats.
+    for (uint32_t i = 0; i < n; i++)
+    {
+        pVals[i] = half_to_float(float_to_half(pVals[i], true));
+    }
+#endif
+}
+
+bool decompress_ldr(uint8_t *pDst, const uint8_t * data, bool isSRGB, int blockWidth, int blockHeight)
+{ // Decodes one block as LDR into pDst (4 bytes per texel); returns false unless decompressBlock() reports a valid block.
+    float linear[MAX_BLOCK_WIDTH * MAX_BLOCK_HEIGHT * 4]; // scratch RGBA floats, only used when isSRGB is false
+
+    const Block128 blockData(data);
+    
+    // isSRGB is true, this writes uint8_t's. Otherwise it writes floats.
+    if (decompressBlock(isSRGB ? (void*)pDst : (void*)&linear[0], blockData, blockWidth, blockHeight, isSRGB, true) != DECOMPRESS_RESULT_VALID_BLOCK)
+    {
+        return false; // in the non-sRGB path nothing has been written to pDst at this point
+    }
+
+    if (!isSRGB)
+    {
+        // Convert the floats to 8-bits with rounding.
+        int pix = 0;
+        for (int i = 0; i < blockHeight; i++)
+        {
+            for (int j = 0; j < blockWidth; j++, pix++)
+            { // per channel: scale by 65536 with +0.5, clamp to [0, 65535], keep the high byte
+                pDst[4 * pix + 0] = (uint8_t)(basisu_astc::clamp<int>((int)(linear[pix * 4 + 0] * 65536.0f + .5f), 0, 65535) >> 8);
+                pDst[4 * pix + 1] = (uint8_t)(basisu_astc::clamp<int>((int)(linear[pix * 4 + 1] * 65536.0f + .5f), 0, 65535) >> 8);
+                pDst[4 * pix + 2] = (uint8_t)(basisu_astc::clamp<int>((int)(linear[pix * 4 + 2] * 65536.0f + .5f), 0, 65535) >> 8);
+                pDst[4 * pix + 3] = (uint8_t)(basisu_astc::clamp<int>((int)(linear[pix * 4 + 3] * 65536.0f + .5f), 0, 65535) >> 8);
+            }
+        }
+    }
+
+    return true;
+}
+
+bool decompress_hdr(float* pDstRGBA, const uint8_t* data, int blockWidth, int blockHeight)
+{ // Decodes one block as float RGBA into pDstRGBA (blockWidth*blockHeight*4 floats); returns false unless the block is valid.
+    const Block128 blockData(data);
+
+    if (decompressBlock(pDstRGBA, blockData, blockWidth, blockHeight, false, false) != DECOMPRESS_RESULT_VALID_BLOCK) // isSRGB = false, isLDR = false
+    {
+        return false; // pDstRGBA may already hold error colors written by decompressBlock()
+    }
+
+    convert_to_half_prec(blockWidth * blockHeight * 4, pDstRGBA); // snap every output value to half-float precision
+        
+    return true;
+}
+
+bool is_hdr(const uint8_t* data, int blockWidth, int blockHeight, bool &is_hdr)
+{ // Public wrapper around isHDR(): returns false if the block is invalid, otherwise stores the HDR/LDR answer in is_hdr.
+    is_hdr = false; // the output is always written, even on failure
+
+    const Block128 blockData(data);
+    
+    int status = isHDR(blockData, blockWidth, blockHeight); // -1 = error, 0 = LDR, 1 = HDR
+    if (status < 0)
+    {
+        return false;
+    }
+
+    is_hdr = (status == 1);
+
+    return true;
+}
+
+} // astc
+
+} // basisu_astc
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
diff --git a/thirdparty/basis_universal/encoder/3rdparty/android_astc_decomp.h b/thirdparty/basis_universal/encoder/3rdparty/android_astc_decomp.h
new file mode 100644
index 000000000000..ad13093a6c7d
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/3rdparty/android_astc_decomp.h
@@ -0,0 +1,45 @@
+// File: android_astc_decomp.h
+#ifndef _TCUASTCUTIL_HPP
+#define _TCUASTCUTIL_HPP
+/*-------------------------------------------------------------------------
+ * drawElements Quality Program Tester Core
+ * ----------------------------------------
+ *
+ * Copyright 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *//*!
+ * \file
+ * \brief ASTC Utilities.
+ *//*--------------------------------------------------------------------*/
+
+#include <vector>
+#include <stdint.h>
+
+namespace basisu_astc
+{
+namespace astc
+{
+
+// Unpacks a single ASTC block to pDst as 8-bit values, 4 bytes per texel. Returns false if the block did not decode as a valid block.
+// If isSRGB is true, the spec requires the decoder to scale the LDR 8-bit endpoints to 16-bit before interpolation slightly differently,
+// which will lead to different outputs. So be sure to set it correctly (ideally it should match whatever the encoder did).
+bool decompress_ldr(uint8_t* pDst, const uint8_t* data, bool isSRGB, int blockWidth, int blockHeight);
+bool decompress_hdr(float* pDstRGBA, const uint8_t* data, int blockWidth, int blockHeight); // Unpacks a single ASTC block to pDstRGBA as blockWidth * blockHeight * 4 floats; returns false if the block did not decode as a valid block.
+bool is_hdr(const uint8_t* data, int blockWidth, int blockHeight, bool& is_hdr); // Sets is_hdr to whether the block is an HDR block; returns false (with is_hdr = false) if that could not be determined.
+
+} // astc
+} // basisu_astc
+
+#endif
diff --git a/thirdparty/basis_universal/encoder/basisu_astc_hdr_enc.cpp b/thirdparty/basis_universal/encoder/basisu_astc_hdr_enc.cpp
new file mode 100644
index 000000000000..d698a7ff872b
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_astc_hdr_enc.cpp
@@ -0,0 +1,3310 @@
+// basisu_astc_hdr_enc.cpp
+#include "basisu_astc_hdr_enc.h"
+#include "../transcoder/basisu_transcoder.h"
+
+using namespace basist;
+
+namespace basisu
+{
+
+const float DEF_R_ERROR_SCALE = 2.0f; // Default for astc_hdr_codec_options::m_r_err_scale, the weight applied to the squared red error in the block error metrics.
+const float DEF_G_ERROR_SCALE = 3.0f; // Default for astc_hdr_codec_options::m_g_err_scale, the weight applied to the squared green error (blue is unweighted).
+
+static inline uint32_t get_max_qlog(uint32_t bits) // Maps a qlog bit width (7-12 or 16) to the matching MAX_QLOG* constant.
+{
+	switch (bits)
+	{
+	case 7: return MAX_QLOG7;
+	case 8: return MAX_QLOG8;
+	case 9: return MAX_QLOG9;
+	case 10: return MAX_QLOG10;
+	case 11: return MAX_QLOG11;
+	case 12: return MAX_QLOG12;
+	case 16: return MAX_QLOG16;
+	default: assert(0); break; // Unsupported width: trips the assert when asserts are enabled, otherwise falls through to return 0.
+	}
+	return 0;
+}
+
+#if 0 // Compiled out: float counterpart of get_max_qlog() that returns the MAX_QLOG*_VAL constants.
+static inline float get_max_qlog_val(uint32_t bits)
+{
+	switch (bits)
+	{
+	case 7: return MAX_QLOG7_VAL;
+	case 8: return MAX_QLOG8_VAL;
+	case 9: return MAX_QLOG9_VAL;
+	case 10: return MAX_QLOG10_VAL;
+	case 11: return MAX_QLOG11_VAL;
+	case 12: return MAX_QLOG12_VAL;
+	case 16: return MAX_QLOG16_VAL;
+	default: assert(0); break; // Unsupported width.
+	}
+	return 0;
+}
+#endif
+
+static inline int get_bit( // Returns bit number src_bit (0 = least significant) of src_val, as 0 or 1.
+	int src_val, int src_bit)
+{
+	assert(src_bit >= 0 && src_bit <= 31); // Shift amount must stay within a 32-bit int.
+	int bit = (src_val >> src_bit) & 1;
+	return bit;
+}
+
+static inline void pack_bit( // ORs bit src_bit of src_val into bit position dst_bit of dst.
+	int& dst, int dst_bit,
+	int src_val, int src_bit = 0)
+{
+	assert(dst_bit >= 0 && dst_bit <= 31);
+	int bit = get_bit(src_val, src_bit);
+	dst |= (bit << dst_bit); // OR only: a bit already set in dst is never cleared.
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+astc_hdr_codec_options::astc_hdr_codec_options() // Constructs the options with the defaults established by init().
+{
+	init();
+}
+
+void astc_hdr_codec_options::init() // Resets the options to their defaults: error weights, uber mode off, then the settings for cDefaultLevel.
+{
+	m_bc6h_err_weight = .85f;
+	m_r_err_scale = DEF_R_ERROR_SCALE;
+	m_g_err_scale = DEF_G_ERROR_SCALE;
+
+	// Disabling by default to avoid transcoding outliers (try kodim26). The quality lost is very low. TODO: Could include the uber result in the output.
+	m_allow_uber_mode = false;
+
+	// Must set best quality level first to set defaults: it assigns fields (e.g. m_mode11_direct_only, the mode 11 submode range) that the other presets leave untouched.
+	set_quality_best();
+	// set_quality_level() clamps its argument and then applies the preset for that level on top.
+	set_quality_level(cDefaultLevel);
+}
+
+void astc_hdr_codec_options::set_quality_best() // Enables every encoding option over its full range; init() also applies this first as the baseline for the other presets.
+{
+	m_mode11_direct_only = false;
+		
+	// highest achievable quality
+	m_use_solid = true;
+	// Mode 11: uber mode on, full weight ISE range, submode range -1..7.
+	m_use_mode11 = true;
+	m_mode11_uber_mode = true;
+	m_first_mode11_weight_ise_range = MODE11_FIRST_ISE_RANGE;
+	m_last_mode11_weight_ise_range = MODE11_LAST_ISE_RANGE;
+	m_first_mode11_submode = -1;
+	m_last_mode11_submode = 7;
+	// Mode 7 "part1" variant (presumably the single-partition case - confirm): full weight ISE range.
+	m_use_mode7_part1 = true;
+	m_first_mode7_part1_weight_ise_range = MODE7_PART1_FIRST_ISE_RANGE;
+	m_last_mode7_part1_weight_ise_range = MODE7_PART1_LAST_ISE_RANGE;
+	// Mode 7 "part2" variant (presumably the two-partition case - confirm): every bit of the part mask set, full weight ISE range.
+	m_use_mode7_part2 = true;
+	m_mode7_part2_part_masks = UINT32_MAX;
+	m_first_mode7_part2_weight_ise_range = MODE7_PART2_FIRST_ISE_RANGE;
+	m_last_mode7_part2_weight_ise_range = MODE7_PART2_LAST_ISE_RANGE;
+	// Mode 11 "part2" variant: every bit of the part mask set, full weight ISE range.
+	m_use_mode11_part2 = true;
+	m_mode11_part2_part_masks = UINT32_MAX;
+	m_first_mode11_part2_weight_ise_range = MODE11_PART2_FIRST_ISE_RANGE;
+	m_last_mode11_part2_weight_ise_range = MODE11_PART2_LAST_ISE_RANGE;
+
+	m_refine_weights = true;
+	// Partition estimation is switched off at this preset.
+	m_use_estimated_partitions = false;
+	m_max_estimated_partitions = 0;
+}
+
+void astc_hdr_codec_options::set_quality_normal() // Mid-quality preset: all modes stay enabled, but most weight ISE ranges collapse to their last value.
+{
+	m_use_solid = true;
+
+	// We'll allow uber mode in normal if the user allows it.
+	m_use_mode11 = true;
+	m_mode11_uber_mode = true;
+	m_first_mode11_weight_ise_range = 6;
+	m_last_mode11_weight_ise_range = MODE11_LAST_ISE_RANGE;
+	// From here on first == last, i.e. a single weight ISE range per mode.
+	m_use_mode7_part1 = true;
+	m_first_mode7_part1_weight_ise_range = MODE7_PART1_LAST_ISE_RANGE;
+	m_last_mode7_part1_weight_ise_range = MODE7_PART1_LAST_ISE_RANGE;
+
+	m_use_mode7_part2 = true;
+	m_mode7_part2_part_masks = UINT32_MAX;
+	m_first_mode7_part2_weight_ise_range = MODE7_PART2_LAST_ISE_RANGE;
+	m_last_mode7_part2_weight_ise_range = MODE7_PART2_LAST_ISE_RANGE;
+
+	m_use_mode11_part2 = true;
+	m_mode11_part2_part_masks = UINT32_MAX;
+	m_first_mode11_part2_weight_ise_range = MODE11_PART2_LAST_ISE_RANGE;
+	m_last_mode11_part2_weight_ise_range = MODE11_PART2_LAST_ISE_RANGE;
+
+	m_refine_weights = true; // Fields not assigned in this function (e.g. m_mode11_direct_only, the mode 11 submode range, partition estimation) keep their previous values.
+}
+
+void astc_hdr_codec_options::set_quality_fastest() // Fastest preset: solid blocks plus mode 11 at a single weight ISE range; mode 7, the part2 variants and weight refinement are off.
+{
+	m_use_solid = true;
+
+	m_use_mode11 = true;
+	m_mode11_uber_mode = false;
+	m_first_mode11_weight_ise_range = MODE11_LAST_ISE_RANGE; // first == last: only one weight ISE range.
+	m_last_mode11_weight_ise_range = MODE11_LAST_ISE_RANGE;
+
+	m_use_mode7_part1 = false;
+	m_use_mode7_part2 = false;
+	m_use_mode11_part2 = false;
+
+	m_refine_weights = false; // Fields not assigned in this function keep their previous values.
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+void astc_hdr_codec_options::set_quality_level(int level) // Stores the clamped level in m_level and applies the matching preset (0 = fastest ... 4 = best), with per-level overrides.
+{
+	level = clamp(level, cMinLevel, cMaxLevel); // Out-of-range requests are clamped, not rejected.
+	
+	m_level = level;
+	// The switch has no default case; presumably cMinLevel..cMaxLevel is exactly 0..4 - confirm against the header.
+	switch (level)
+	{
+	case 0:
+	{
+		set_quality_fastest();
+		break;
+	}
+	case 1:
+	{
+		set_quality_normal();
+		// Level 1 narrows the normal preset: two mode 11 weight ISE ranges, no mode 7, one estimated partition, part masks limited to bits 0 and 1.
+		m_first_mode11_weight_ise_range = MODE11_LAST_ISE_RANGE - 1;
+		m_last_mode11_weight_ise_range = MODE11_LAST_ISE_RANGE;
+
+		m_use_mode7_part1 = false;
+		m_use_mode7_part2 = false;
+
+		m_use_estimated_partitions = true;
+		m_max_estimated_partitions = 1;
+
+		m_mode11_part2_part_masks = 1 | 2;
+		m_mode7_part2_part_masks = 1 | 2;
+		break;
+	}
+	case 2:
+	{
+		set_quality_normal();
+		// Level 2: normal preset with up to two estimated partitions and part masks limited to bits 0 and 1.
+		m_use_estimated_partitions = true;
+		m_max_estimated_partitions = 2;
+
+		m_mode11_part2_part_masks = 1 | 2;
+		m_mode7_part2_part_masks = 1 | 2;
+
+		break;
+	}
+	case 3:
+	{
+		set_quality_best();
+		// Level 3: best preset, but with partition estimation (max 2) and part masks limited to bits 0..3.
+		m_use_estimated_partitions = true;
+		m_max_estimated_partitions = 2;
+
+		m_mode11_part2_part_masks = 1 | 2 | 4 | 8;
+		m_mode7_part2_part_masks = 1 | 2 | 4 | 8;
+
+		break;
+	}
+	case 4:
+	{
+		set_quality_best(); // Level 4: the best preset unchanged.
+
+		break;
+	}
+	}
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+#if 0 // Compiled out: convenience wrapper that forwards to qlog_to_half_slow() with a fixed width of 12 bits.
+static inline half_float qlog12_to_half_slow(uint32_t qlog12)
+{
+	return qlog_to_half_slow(qlog12, 12);
+}
+#endif
+
+// max usable qlog8 value is 247, 248=inf, >=249 is nan
+// max usable qlog7 value is 123, 124=inf, >=125 is nan
+
+// To go from a smaller qlog to a larger one, shift left by X bits.
+
+//const uint32_t TOTAL_USABLE_QLOG8 = 248; // 0-247 are usable, 0=0, 247=60416.0, 246=55296.0
+
+// for qlog7's shift left by 1
+//half_float g_qlog8_to_half[256];
+//float g_qlog8_to_float[256];
+
+//half_float g_qlog12_to_half[4096];
+//float g_qlog12_to_float[4096];
+
+static half_float g_qlog16_to_half[65536]; // Lookup from every qlog16 value to its half float; filled by init_qlog_tables() using qlog16_to_half_slow().
+
+inline half_float qlog_to_half(uint32_t val, uint32_t bits) // Converts a qlog value of the given bit width (5-16) to a half float via the qlog16 lookup table.
+{
+	assert((bits >= 5) && (bits <= 16));
+	assert(val < (1U << bits));
+	return g_qlog16_to_half[val << (16 - bits)]; // Left-shifting widens the value to qlog16 before the table lookup.
+}
+
+// nearest values given a positive half float value (only); indexed by the half's 15-bit pattern (0..32767) and filled by init_qlog_tables()
+static uint16_t g_half_to_qlog7[32768], g_half_to_qlog8[32768], g_half_to_qlog9[32768], g_half_to_qlog10[32768], g_half_to_qlog11[32768], g_half_to_qlog12[32768];
+
+const uint32_t HALF_TO_QLOG_TABS_BASE = 7; // Bit width of the first table; entry [bits - HALF_TO_QLOG_TABS_BASE] holds the table for that width.
+static uint16_t* g_pHalf_to_qlog_tabs[8] = // Only the first 6 of the 8 slots are initialized (widths 7..12); the remaining two are null.
+{
+	g_half_to_qlog7,
+	g_half_to_qlog8,
+
+	g_half_to_qlog9,
+	g_half_to_qlog10,
+
+	g_half_to_qlog11,
+	g_half_to_qlog12
+};
+
+static inline uint32_t half_to_qlog7_12(half_float h, uint32_t bits) // Looks up the precomputed nearest qlog value (7 to 12 bits wide) for a positive half float.
+{
+	assert((bits >= HALF_TO_QLOG_TABS_BASE) && (bits <= 12));
+	assert(h < 32768); // Sign bit must be clear: the tables only cover positive halfs.
+	// Requires the tables to have been filled by init_qlog_tables().
+	return g_pHalf_to_qlog_tabs[bits - HALF_TO_QLOG_TABS_BASE][h];
+}
+
+#if 0 // Compiled out: forward mapping kept as the reference for half_float_mantissa_to_qlog11().
+// Input is the low 11 bits of the qlog
+// Returns the 10-bit mantissa of the half float value
+static int qlog11_to_half_float_mantissa(int M)
+{
+	assert(M <= 0x7FF);
+	int Mt; // Three linear segments, split at M = 512 and M = 1536.
+	if (M < 512)
+		Mt = 3 * M;
+	else if (M >= 1536)
+		Mt = 5 * M - 2048;
+	else
+		Mt = 4 * M - 512;
+	return (Mt >> 3);
+}
+#endif
+
+// Input is the 10-bit mantissa of the half float value
+// Output is the 11-bit qlog value
+// Inverse of qlog11_to_half_float_mantissa(): each of that function's three linear segments is inverted and the candidate that lands in its own segment is returned
+static inline int half_float_mantissa_to_qlog11(int hf)
+{
+	int q0 = (hf * 8 + 2) / 3; // Candidate from the first segment (Mt = 3 * M).
+	int q1 = (hf * 8 + 2048 + 4) / 5; // Candidate from the last segment (Mt = 5 * M - 2048).
+
+	if (q0 < 512)
+		return q0;
+	else if (q1 >= 1536)
+		return q1;
+	// Otherwise the value belongs to the middle segment (Mt = 4 * M - 512).
+	int q2 = (hf * 8 + 512 + 2) / 4;
+	return q2;
+}
+
+static inline int half_to_qlog16(int hf) // Converts a half float bit pattern to qlog16: the 5 exponent bits on top of the 11-bit qlog mantissa.
+{
+	// extract 5 bits exponent, which is carried through to qlog16 unchanged
+	const int exp = (hf >> 10) & 0x1F;
+
+	// extract and invert the 10 bit mantissa to nearest qlog11 (should be lossless)
+	const int mantissa = half_float_mantissa_to_qlog11(hf & 0x3FF);
+	assert(mantissa <= 0x7FF);
+
+	// Now combine to qlog16, which is what ASTC HDR interpolates using the [0-64] weights.
+	uint32_t qlog16 = (exp << 11) | mantissa;
+
+	// should be a lossless operation (the sign bit of hf is not carried over, so this check expects a non-negative half)
+	assert(qlog16_to_half_slow(qlog16) == hf);
+
+	return qlog16;
+}
+
+static inline uint32_t quant_qlog16(uint32_t q16, uint32_t desired_bits) // Reduces a qlog16 value to desired_bits (7-12) by a rounding right shift, clamped to the largest representable value.
+{
+	assert((desired_bits >= 7) && (desired_bits <= 12));
+	assert(q16 <= 65535);
+
+	const uint32_t shift = 16 - desired_bits;
+	uint32_t e = (q16 + (1U << (shift - 1U)) - 1U) >> shift; // Bias is half a step minus one, so exact halfway values round down.
+	// The bias can carry past the top code, hence the clamp.
+	uint32_t max_val = (1U << desired_bits) - 1U;
+	e = minimum<uint32_t>(e, max_val);
+
+	return e;
+}
+
+static void compute_half_to_qlog_table(uint32_t bits, uint16_t* pTable, const basisu::vector<float> &qlog16_to_float) // Fills pTable[h], for every positive finite half h, with the qlog value of the given width whose decoded float is nearest (brute-force search).
+{
+	assert(bits >= 5 && bits <= 12);
+	const uint32_t max_val = (1 << bits) - 1;
+
+	// For all positive half-floats
+	for (uint32_t h = 0; h < 32768; h++)
+	{
+		// Skip invalid values (their table entries are not written)
+		if (is_half_inf_or_nan((half_float)h))
+			continue;
+		const float desired_val = half_to_float((half_float)h);
+
+		float best_err = 1e+30f;
+		uint32_t best_qlog = 0;
+
+		// For all possible qlog's
+		for (uint32_t i = 0; i <= max_val; i++)
+		{
+			// Skip invalid values (only NaN is skipped explicitly; an infinite v yields an infinite error and so never wins)
+			float v = qlog16_to_float[i << (16 - bits)];
+			if (std::isnan(v))
+				continue;
+
+			// Compute error
+			float err = fabs(v - desired_val);
+
+			// Find best (strict '<' keeps the lowest index on ties)
+			if (err < best_err)
+			{
+				best_err = err;
+				best_qlog = i;
+			}
+		}
+
+		pTable[h] = (uint16_t)best_qlog;
+	}
+
+#if 0 // Compiled out: experiment that counts qlog16 values where the rounding shift and a brute-force nearest search disagree.
+	uint32_t t = 0;
+
+	const uint32_t nb = 12;
+	int nb_shift = 16 - nb;
+
+	for (uint32_t q16 = 0; q16 < 65536; q16++)
+	{
+		half_float h = qlog16_to_half_slow(q16);
+		if (is_half_inf_or_nan(h))
+			continue;
+
+		int q7 = half_to_qlog7_12(h, nb);
+
+		uint32_t best_err = UINT32_MAX, best_l = 0;
+		for (int l = 0; l < (1 << nb); l++)
+		{
+			int dec_q16 = l << nb_shift;
+			int err = iabs(dec_q16 - q16);
+			if (err < best_err)
+			{
+				best_err = err;
+				best_l = l;
+			}
+		}
+
+		//int e = (q16 + 253) >> 9; // 345
+
+		int e = (q16 + (1 << (nb_shift - 1)) - 1) >> nb_shift; // 285
+		if (best_l != e)
+			//if (q7 != best_l)
+		{
+			printf("q16=%u, h=%u, q7=%u, e=%u, best_l=%u\n", q16, h, q7, e, best_l);
+			t++;
+		}
+	}
+
+	printf("Mismatches: %u\n", t);
+	exit(0);
+#endif
+}
+
+static void init_qlog_tables() // Builds g_qlog16_to_half and the six half-to-qlog tables (widths 7..12).
+{
+	basisu::vector<float> qlog16_to_float(65536); // Temporary float copy of the qlog16 table, used only while building the inverse tables.
+
+	// for all possible qlog16, compute the corresponding half float
+	for (uint32_t i = 0; i <= 65535; i++)
+	{
+		half_float h = qlog16_to_half_slow(i);
+		g_qlog16_to_half[i] = h;
+
+		qlog16_to_float[i] = half_to_float(h);
+	}
+
+	// for all possible half floats, find the nearest qlog7-12 float (widths HALF_TO_QLOG_TABS_BASE through 12)
+	for (uint32_t bits = HALF_TO_QLOG_TABS_BASE; bits <= 12; bits++)
+	{
+		compute_half_to_qlog_table(bits, g_pHalf_to_qlog_tabs[bits - HALF_TO_QLOG_TABS_BASE], qlog16_to_float);
+	}
+}
+
+// [ise_range][0] = # levels
+// [ise_range][1...] = lerp value [0,64]
+// in ASTC order
+// Table rows cover ISE weight ranges 0 to 10 (11 total); row 0 is only a placeholder, so the supported ranges are 1 to 10
+const uint32_t MIN_SUPPORTED_ISE_WEIGHT_INDEX = 1; // ISE 1=3 levels
+const uint32_t MAX_SUPPORTED_ISE_WEIGHT_INDEX = 10; // ISE 10=24 levels
+
+static const uint8_t g_ise_weight_lerps[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][32] =
+{
+	{ 0 }, // ise range=0 is invalid for 4x4 block sizes (<24 weight bits in the block)
+	{ 3, 0, 32, 64 }, // 1
+	{ 4, 0, 21, 43, 64 }, // 2
+	{ 5, 0, 16, 32, 48, 64 }, // 3
+	{ 6, 0, 64, 12, 52, 25, 39 }, // 4
+	{ 8, 0, 9, 18, 27, 37, 46, 55, 64 }, // 5
+	{ 10, 0, 64, 7, 57, 14, 50, 21, 43, 28, 36 }, // 6
+	{ 12, 0, 64, 17, 47, 5, 59, 23, 41, 11, 53, 28, 36 }, // 7
+	{ 16, 0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64 }, // 8
+	{ 20, 0, 64, 16, 48, 3, 61, 19, 45, 6, 58, 23, 41, 9, 55, 26, 38, 13, 51, 29, 35 }, // 9
+	{ 24, 0, 64, 8, 56, 16, 48, 24, 40, 2, 62, 11, 53, 19, 45, 27, 37, 5, 59, 13, 51, 22, 42, 30, 34 } // 10
+};
+
+//{ 12, 0, 64, 17, 47, 5, 59, 23, 41, 11, 53, 28, 36 }, // 7
+//static const uint8_t g_weight_order_7[12] = { 0, 4, 8, 2, 6, 10, 11, 7, 3, 9, 5, 1 };
+
+static vec3F calc_mean(uint32_t num_pixels, const vec4F* pPixels) // Returns the average of the first three components of num_pixels pixels; the fourth component is ignored.
+{
+	vec3F mean(0.0f);
+
+	for (uint32_t i = 0; i < num_pixels; i++)
+	{
+		const vec4F& p = pPixels[i];
+
+		mean[0] += p[0];
+		mean[1] += p[1];
+		mean[2] += p[2];
+	}
+	// There is no guard for num_pixels == 0; callers are expected to pass at least one pixel.
+	return mean / static_cast<float>(num_pixels);
+}
+
+static vec3F calc_rgb_pca(uint32_t num_pixels, const vec4F* pPixels, const vec3F& mean_color) // Estimates the principal axis of the pixels' RGB values around mean_color and returns it as a unit vector.
+{
+	float cov[6] = { 0, 0, 0, 0, 0, 0 }; // Upper triangle of the symmetric 3x3 covariance sums: rr, rg, rb, gg, gb, bb (not divided by num_pixels).
+
+	for (uint32_t i = 0; i < num_pixels; i++)
+	{
+		const vec4F& v = pPixels[i];
+
+		float r = v[0] - mean_color[0];
+		float g = v[1] - mean_color[1];
+		float b = v[2] - mean_color[2];
+
+		cov[0] += r * r;
+		cov[1] += r * g;
+		cov[2] += r * b;
+		cov[3] += g * g;
+		cov[4] += g * b;
+		cov[5] += b * b;
+	}
+	// Three rounds of multiplying a fixed start vector by the covariance matrix, rescaling by the largest absolute component each round.
+	float xr = .9f, xg = 1.0f, xb = .7f;
+	for (uint32_t iter = 0; iter < 3; iter++)
+	{
+		float r = xr * cov[0] + xg * cov[1] + xb * cov[2];
+		float g = xr * cov[1] + xg * cov[3] + xb * cov[4];
+		float b = xr * cov[2] + xg * cov[4] + xb * cov[5];
+
+		float m = maximumf(maximumf(fabsf(r), fabsf(g)), fabsf(b));
+
+		if (m > 1e-10f) // Skip the rescale when the product is (near) zero to avoid dividing by ~0.
+		{
+			m = 1.0f / m;
+
+			r *= m;
+			g *= m;
+			b *= m;
+		}
+
+		xr = r;
+		xg = g;
+		xb = b;
+	}
+
+	float len = xr * xr + xg * xg + xb * xb; // Squared length at this point; reused below for the reciprocal length.
+
+	vec3F axis;
+	if (len < 1e-10f)
+		axis.set(0.0f);
+	else
+	{
+		len = 1.0f / sqrtf(len);
+
+		xr *= len;
+		xg *= len;
+		xb *= len;
+
+		axis.set(xr, xg, xb, 0);
+	}
+	// Degenerate result (e.g. all pixels equal): fall back to the normalized (1, 1, 1) direction.
+	if (axis.dot(axis) < .5f)
+	{
+		axis.set(1.0f, 1.0f, 1.0f, 0.0f);
+		axis.normalize_in_place();
+	}
+
+	return axis;
+}
+
+static vec3F interp_color(const vec3F& mean, const vec3F& dir, float df, const aabb3F& colorspace_box, const aabb3F& input_box, bool* pInside = nullptr) // Returns mean + dir * df if it lies in colorspace_box (setting *pInside = true); otherwise returns a point pulled back toward mean onto input_box.
+{
+#if 0
+	assert(mean[0] >= input_box[0][0]);
+	assert(mean[1] >= input_box[0][1]);
+	assert(mean[2] >= input_box[0][2]);
+	assert(mean[0] <= input_box[1][0]);
+	assert(mean[1] <= input_box[1][1]);
+	assert(mean[2] <= input_box[1][2]);
+#endif
+
+	if (pInside)
+		*pInside = false;
+
+	vec3F k(mean + dir * df);
+	if (colorspace_box.contains(k))
+	{
+		if (pInside)
+			*pInside = true;
+
+		return k;
+	}
+
+	// starts inside
+	vec3F s(mean);
+
+	// ends outside
+	vec3F e(mean + dir * df);
+
+	// a ray guaranteed to go from the outside to inside
+	ray3F r(e, (s - e).normalize_in_place());
+	vec3F c;
+	float t = 0.0f;
+	// Note: containment was tested against colorspace_box, but the ray is intersected with input_box.
+	intersection::result res = intersection::ray_aabb(c, t, r, input_box);
+	if (res != intersection::cSuccess)
+		c = k; // No intersection reported: fall back to the unclipped point.
+
+	return c;
+}
+
+// all in Q16 space, 0-65535
+static bool compute_least_squares_endpoints_rgb( // Fits two RGB endpoints to N colors given their selectors; writes them to *pXl / *pXh and returns false if the 2x2 system is singular.
+	uint32_t N, const uint8_t* pSelectors, const vec4F* pSelector_weights,
+	vec3F* pXl, vec3F* pXh, const vec4F* pColors, const aabb3F& input_box)
+{
+	// Least squares using normal equations: http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf
+	// https://web.archive.org/web/20150319232457/http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf
+	// I did this in matrix form first, expanded out all the ops, then optimized it a bit.
+	float z00 = 0.0f, z01 = 0.0f, z10 = 0.0f, z11 = 0.0f;
+	float q00_r = 0.0f, q10_r = 0.0f, t_r = 0.0f;
+	float q00_g = 0.0f, q10_g = 0.0f, t_g = 0.0f;
+	float q00_b = 0.0f, q10_b = 0.0f, t_b = 0.0f;
+	// pSelector_weights[sel] is read as (w*w, (1-w)*w, (1-w)*(1-w), w), the layout encode_astc_hdr_init() stores in g_astc_ls_weights_ise.
+	for (uint32_t i = 0; i < N; i++)
+	{
+		const uint32_t sel = pSelectors[i];
+		z00 += pSelector_weights[sel][0];
+		z10 += pSelector_weights[sel][1];
+		z11 += pSelector_weights[sel][2];
+
+		float w = pSelector_weights[sel][3];
+		q00_r += w * pColors[i][0];
+		t_r += pColors[i][0];
+
+		q00_g += w * pColors[i][1];
+		t_g += pColors[i][1];
+
+		q00_b += w * pColors[i][2];
+		t_b += pColors[i][2];
+	}
+	// sum((1 - w) * c) is obtained as sum(c) - sum(w * c).
+	q10_r = t_r - q00_r;
+	q10_g = t_g - q00_g;
+	q10_b = t_b - q00_b;
+
+	z01 = z10;
+
+	float det = z00 * z11 - z01 * z10;
+	if (det == 0.0f)
+		return false;
+
+	det = 1.0f / det;
+	// Explicit inverse of the symmetric 2x2 matrix.
+	float iz00, iz01, iz10, iz11;
+	iz00 = z11 * det;
+	iz01 = -z01 * det;
+	iz10 = -z10 * det;
+	iz11 = z00 * det;
+
+	(*pXl)[0] = (float)(iz00 * q00_r + iz01 * q10_r);
+	(*pXh)[0] = (float)(iz10 * q00_r + iz11 * q10_r);
+
+	(*pXl)[1] = (float)(iz00 * q00_g + iz01 * q10_g);
+	(*pXh)[1] = (float)(iz10 * q00_g + iz11 * q10_g);
+
+	(*pXl)[2] = (float)(iz00 * q00_b + iz01 * q10_b);
+	(*pXh)[2] = (float)(iz10 * q00_b + iz11 * q10_b);
+	// For channels where the input box is (almost) flat, take the endpoints straight from the box instead of the fit.
+	for (uint32_t c = 0; c < 3; c++)
+	{
+		float l = (*pXl)[c], h = (*pXh)[c];
+
+		if (input_box.get_dim(c) < .0000125f)
+		{
+			l = input_box[0][c];
+			h = input_box[1][c];
+		}
+
+		(*pXl)[c] = l;
+		(*pXh)[c] = h;
+	}
+
+	vec3F mean((*pXl + *pXh) * .5f);
+	vec3F dir(*pXh - *pXl);
+
+	float ln = dir.length();
+	if (ln) // Only re-project when the two endpoints differ.
+	{
+		dir /= ln;
+
+		float ld = (*pXl - mean).dot(dir);
+		float hd = (*pXh - mean).dot(dir);
+
+		aabb3F colorspace_box(vec3F(0.0f), vec3F(MAX_QLOG16_VAL));
+
+		bool was_inside1 = false;
+		// An endpoint that falls outside [0, MAX_QLOG16_VAL]^3 is replaced by the point interp_color() returns for it.
+		vec3F l = interp_color(mean, dir, ld, colorspace_box, input_box, &was_inside1);
+		if (!was_inside1)
+			*pXl = l;
+
+		bool was_inside2 = false;
+		vec3F h = interp_color(mean, dir, hd, colorspace_box, input_box, &was_inside2);
+		if (!was_inside2)
+			*pXh = h;
+	}
+	// Final safety clamp into the valid qlog16 range.
+	pXl->clamp(0.0f, MAX_QLOG16_VAL);
+	pXh->clamp(0.0f, MAX_QLOG16_VAL);
+
+	return true;
+}
+
+static vec4F g_astc_ls_weights_ise[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][24]; // [ise_range][astc_index] -> (w*w, (1-w)*w, (1-w)*(1-w), w) with w = lerp/64; filled by encode_astc_hdr_init()
+
+static uint8_t g_map_astc_to_linear_order[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][24]; // [ise_range][astc_index] -> linear index
+static uint8_t g_map_linear_to_astc_order[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][24]; // [ise_range][linear_index] -> astc_index
+
+static void encode_astc_hdr_init() // Fills the per-ISE-range least squares weight table and the ASTC-order <-> linear-order index maps.
+{
+	// Precomputed weight constants used during least squares fit determination. For each entry: w * w, (1.0f - w) * w, (1.0f - w) * (1.0f - w), w
+	for (uint32_t range = MIN_SUPPORTED_ISE_WEIGHT_INDEX; range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX; range++)
+	{
+		const uint32_t num_levels = g_ise_weight_lerps[range][0];
+		assert((num_levels >= 3) && (num_levels <= 24));
+
+		for (uint32_t i = 0; i < num_levels; i++)
+		{
+			float w = g_ise_weight_lerps[range][1 + i] * (1.0f / 64.0f); // Normalize the [0,64] lerp value to [0,1].
+
+			g_astc_ls_weights_ise[range][i].set(w * w, (1.0f - w) * w, (1.0f - w) * (1.0f - w), w);
+		}
+	}
+	// Build the order maps by sorting the weight indices of each range by their lerp value.
+	for (uint32_t ise_range = MIN_SUPPORTED_ISE_WEIGHT_INDEX; ise_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX; ise_range++)
+	{
+		const uint32_t num_levels = g_ise_weight_lerps[ise_range][0];
+		assert((num_levels >= 3) && (num_levels <= 24));
+
+		uint32_t s[32];
+		for (uint32_t i = 0; i < num_levels; i++)
+			s[i] = (g_ise_weight_lerps[ise_range][1 + i] << 8) + i; // Sort key: lerp value in the high bits, ASTC index in the low byte.
+
+		std::sort(s, s + num_levels);
+
+		for (uint32_t i = 0; i < num_levels; i++)
+			g_map_linear_to_astc_order[ise_range][i] = (uint8_t)(s[i] & 0xFF); // Low byte recovers the ASTC index.
+
+		for (uint32_t i = 0; i < num_levels; i++)
+			g_map_astc_to_linear_order[ise_range][g_map_linear_to_astc_order[ise_range][i]] = (uint8_t)i; // Inverse permutation.
+	}
+}
+
+void interpolate_qlog12_colors( // Expands two qlog12 RGB endpoints into n interpolated colors, one per weight index in g_ise_weight_lerps order, as half floats and/or floats.
+	const int e[2][3],
+	half_float* pDecoded_half,
+	vec3F* pDecoded_float,
+	uint32_t n, uint32_t ise_weight_range)
+{
+	assert((ise_weight_range >= MIN_SUPPORTED_ISE_WEIGHT_INDEX) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
+	// Debug-only check that every endpoint component is a valid 12-bit value.
+	for (uint32_t i = 0; i < 2; i++)
+	{
+		for (uint32_t j = 0; j < 3; j++)
+		{
+			assert(in_range(e[i][j], 0, 0xFFF));
+		}
+	}
+
+	for (uint32_t i = 0; i < n; i++)
+	{
+		const int c = g_ise_weight_lerps[ise_weight_range][1 + i]; // Lerp weight in [0,64] for weight index i.
+		assert(c == (int)astc_helpers::dequant_bise_weight(i, ise_weight_range));
+
+		half_float rf, gf, bf;
+		// Per channel: widen both endpoints to qlog16 (<< 4), blend with rounding (+32, /64), then convert the qlog16 result to half.
+		{
+			uint32_t r0 = e[0][0] << 4;
+			uint32_t r1 = e[1][0] << 4;
+			int ri = (r0 * (64 - c) + r1 * c + 32) / 64;
+			rf = qlog16_to_half_slow(ri);
+		}
+
+		{
+			uint32_t g0 = e[0][1] << 4;
+			uint32_t g1 = e[1][1] << 4;
+			int gi = (g0 * (64 - c) + g1 * c + 32) / 64;
+			gf = qlog16_to_half_slow(gi);
+		}
+
+		{
+			uint32_t b0 = e[0][2] << 4;
+			uint32_t b1 = e[1][2] << 4;
+			int bi = (b0 * (64 - c) + b1 * c + 32) / 64;
+			bf = qlog16_to_half_slow(bi);
+		}
+		// Either output pointer may be null; pDecoded_half receives 3 halfs per entry.
+		if (pDecoded_half)
+		{
+			pDecoded_half[i * 3 + 0] = rf;
+			pDecoded_half[i * 3 + 1] = gf;
+			pDecoded_half[i * 3 + 2] = bf;
+		}
+
+		if (pDecoded_float)
+		{
+			pDecoded_float[i][0] = half_to_float(rf);
+			pDecoded_float[i][1] = half_to_float(gf);
+			pDecoded_float[i][2] = half_to_float(bf);
+		}
+	}
+}
+
+// Decodes mode 11 endpoints and produces the n interpolated block colors, in ASTC order, not linear order
+// return false if the ISE endpoint quantization leads to non-valid endpoints being decoded (the outputs are then left untouched)
+bool get_astc_hdr_mode_11_block_colors(
+	const uint8_t* pEndpoints,
+	half_float* pDecoded_half,
+	vec3F* pDecoded_float,
+	uint32_t n, uint32_t ise_weight_range, uint32_t ise_endpoint_range)
+{
+	assert((ise_weight_range >= 1) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
+
+	int e[2][3]; // Decoded qlog12 endpoints: [endpoint][channel].
+	if (!decode_mode11_to_qlog12(pEndpoints, e, ise_endpoint_range))
+		return false;
+
+	interpolate_qlog12_colors(e, pDecoded_half, pDecoded_float, n, ise_weight_range);
+
+	return true;
+}
+
+// Decodes mode 7 endpoints and produces the n interpolated block colors, in ASTC order, not linear order
+// return false if the ISE endpoint quantization leads to non-valid endpoints being decoded (the outputs are then left untouched)
+bool get_astc_hdr_mode_7_block_colors(
+	const uint8_t* pEndpoints,
+	half_float* pDecoded_half,
+	vec3F* pDecoded_float,
+	uint32_t n, uint32_t ise_weight_range, uint32_t ise_endpoint_range)
+{
+	assert((ise_weight_range >= 1) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
+
+	int e[2][3]; // Decoded qlog12 endpoints: [endpoint][channel].
+	if (!decode_mode7_to_qlog12(pEndpoints, e, nullptr, ise_endpoint_range)) // The optional third output of the decoder is not needed here, hence nullptr.
+		return false;
+
+	interpolate_qlog12_colors(e, pDecoded_half, pDecoded_float, n, ise_weight_range);
+
+	return true;
+}
+
+// Fast high precision piecewise linear approximation of log2(bias+x), returned as the raw bit pattern of the biased float.
+// Half may be zero, positive or denormal. No NaN/Inf/negative.
+static inline double q(half_float x)
+{
+	union { float f; int32_t i; uint32_t u; } fi;
+
+	fi.f = fast_half_to_float_pos_not_inf_or_nan(x);
+
+	assert(fi.f >= 0.0f);
+
+	fi.f += .125f; // The bias: keeps zero and tiny inputs away from the bottom of the float range.
+	// In this file the callers only use differences of q() values.
+	return (double)fi.u; // approx log2f(fi.f), need to return double for the precision
+}
+
+double eval_selectors( // For each pixel picks the allowed weight level whose decoded color has the lowest weighted error, stores it in pWeights, and returns the summed error.
+	uint32_t num_pixels,
+	uint8_t* pWeights,
+	const half_float* pBlock_pixels_half,
+	uint32_t num_weight_levels,
+	const half_float* pDecoded_half,
+	const astc_hdr_codec_options& coptions,
+	uint32_t usable_selector_bitmask)
+{
+	assert((num_pixels >= 1) && (num_pixels <= 16));
+	assert(usable_selector_bitmask);
+
+	const float R_WEIGHT = coptions.m_r_err_scale;
+	const float G_WEIGHT = coptions.m_g_err_scale;
+
+	double total_error = 0;
+
+#ifdef _DEBUG
+	for (uint32_t i = 0; i < num_weight_levels; i++)
+	{
+		assert(!is_half_inf_or_nan(pDecoded_half[i * 3 + 0]));
+		assert(!is_half_inf_or_nan(pDecoded_half[i * 3 + 1]));
+		assert(!is_half_inf_or_nan(pDecoded_half[i * 3 + 2]));
+	}
+#endif
+
+	for (uint32_t p = 0; p < num_pixels; p++)
+	{
+		const half_float* pDesired_half = &pBlock_pixels_half[p * 3]; // 3 halfs (R, G, B) per pixel.
+
+		double lowest_e = 1e+30f;
+
+		// this is an approximation of MSLE
+		for (uint32_t i = 0; i < num_weight_levels; i++)
+		{
+			if (((1 << i) & usable_selector_bitmask) == 0) // Bit i of the mask enables weight level i.
+				continue;
+
+			// compute piecewise linear approximation of log2(a+eps)-log2(b+eps), for each component, then MSLE
+			double rd = q(pDecoded_half[i * 3 + 0]) - q(pDesired_half[0]);
+			double gd = q(pDecoded_half[i * 3 + 1]) - q(pDesired_half[1]);
+			double bd = q(pDecoded_half[i * 3 + 2]) - q(pDesired_half[2]);
+
+			double e = R_WEIGHT * (rd * rd) + G_WEIGHT * (gd * gd) + bd * bd;
+
+			if (e < lowest_e) // Strict '<': the first level reaching the minimum wins ties.
+			{
+				lowest_e = e;
+				pWeights[p] = (uint8_t)i;
+			}
+		}
+		// If no level below num_weight_levels is enabled, pWeights[p] stays unwritten and the 1e+30 sentinel is added.
+		total_error += lowest_e;
+
+	} // p
+
+	return total_error;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+double compute_block_error(const half_float* pOrig_block, const half_float* pPacked_block, const astc_hdr_codec_options& coptions) // Returns the summed weighted squared q() difference between two blocks of 16 RGB half-float pixels.
+{
+	const float R_WEIGHT = coptions.m_r_err_scale;
+	const float G_WEIGHT = coptions.m_g_err_scale;
+
+	double total_error = 0;
+		
+	for (uint32_t p = 0; p < 16; p++) // Fixed at 16 pixels, 3 halfs each.
+	{
+		double rd = q(pOrig_block[p * 3 + 0]) - q(pPacked_block[p * 3 + 0]);
+		double gd = q(pOrig_block[p * 3 + 1]) - q(pPacked_block[p * 3 + 1]);
+		double bd = q(pOrig_block[p * 3 + 2]) - q(pPacked_block[p * 3 + 2]);
+
+		double e = R_WEIGHT * (rd * rd) + G_WEIGHT * (gd * gd) + bd * bd; // Same per-pixel metric as eval_selectors(); blue is unweighted.
+
+		total_error += e;
+	}
+
+	return total_error;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+static inline int compute_clamped_val(int v, int l, int h, bool& did_clamp, int& max_clamp_mag) // Clamps v to [l, h]; on a clamp sets did_clamp and raises max_clamp_mag to the clamped distance if that is larger.
+{
+	assert(l < h);
+
+	if (v < l)
+	{
+		max_clamp_mag = basisu::maximum<int>(max_clamp_mag, l - v);
+
+		v = l;
+		did_clamp = true;
+	}
+	else if (v > h)
+	{
+		max_clamp_mag = basisu::maximum<int>(max_clamp_mag, v - h);
+
+		v = h;
+		did_clamp = true;
+	}
+	// did_clamp and max_clamp_mag are only ever raised, never reset, so they accumulate across calls.
+	return v;
+}
+
+static bool pack_astc_mode11_submode(uint32_t submode, uint8_t* pEndpoints, const vec3F& low_q16, const vec3F& high_q16, int& max_clamp_mag)
+{
+	assert(submode <= 7);
+
+	const uint8_t s_b_bits[8] = { 7, 8, 6, 7,  8, 6, 7, 6 };
+	const uint8_t s_c_bits[8] = { 6, 6, 7, 7,  6, 7, 7, 7 };
+	const uint8_t s_d_bits[8] = { 7, 6, 7, 6,  5, 6, 5, 6 };
+
+	const uint32_t a_bits = 9 + (submode >> 1);
+	const uint32_t b_bits = s_b_bits[submode];
+	const uint32_t c_bits = s_c_bits[submode];
+	const uint32_t d_bits = s_d_bits[submode];
+
+	const int max_a_val = (1 << a_bits) - 1;
+	const int max_b_val = (1 << b_bits) - 1;
+	const int max_c_val = (1 << c_bits) - 1;
+
+	// The maximum usable value before it turns to NaN/Inf
+	const int max_a_qlog = get_max_qlog(a_bits);
+
+	const int min_d_val = -(1 << (d_bits - 1));
+	const int max_d_val = -min_d_val - 1;
+	assert((max_d_val - min_d_val + 1) == (1 << d_bits));
+
+	int val_q[2][3];
+
+	for (uint32_t c = 0; c < 3; c++)
+	{
+#if 1
+		// this is better
+		const half_float l = qlog16_to_half_slow((uint32_t)std::round(low_q16[c]));
+		val_q[0][c] = half_to_qlog7_12(l, a_bits);
+		
+		const half_float h = qlog16_to_half_slow((uint32_t)std::round(high_q16[c]));
+		val_q[1][c] = half_to_qlog7_12(h, a_bits);
+#else
+		val_q[0][c] = quant_qlog16((uint32_t)std::round(low_q16[c]), a_bits);
+		val_q[1][c] = quant_qlog16((uint32_t)std::round(high_q16[c]), a_bits);
+#endif
+				
+#if 1
+		if (val_q[0][c] == val_q[1][c])
+		{
+#if 0
+			if (l <= h)
+#else
+			if (low_q16[c] < high_q16[c])
+#endif
+			{
+				if (val_q[0][c])
+					val_q[0][c]--;
+
+				if (val_q[1][c] != max_a_val)
+					val_q[1][c]++;
+			}
+			else
+			{
+				if (val_q[0][c] != max_a_val)
+					val_q[0][c]++;
+
+				if (val_q[1][c])
+					val_q[1][c]--;
+			}
+		}
+#endif
+
+		val_q[0][c] = minimum<uint32_t>(val_q[0][c], max_a_qlog);
+		val_q[1][c] = minimum<uint32_t>(val_q[1][c], max_a_qlog);
+	}
+
+	int highest_q = -1, highest_val = 0, highest_comp = 0;
+
+	for (uint32_t v = 0; v < 2; v++)
+	{
+		for (uint32_t c = 0; c < 3; c++)
+		{
+			assert(val_q[v][c] >= 0 && val_q[v][c] <= max_a_val);
+
+			if (val_q[v][c] > highest_q)
+			{
+				highest_q = val_q[v][c];
+				highest_val = v;
+				highest_comp = c;
+			}
+		}
+	}
+
+	const bool had_tie = (val_q[highest_val ^ 1][highest_comp] == highest_q);
+
+	if (highest_val != 1)
+	{
+		for (uint32_t c = 0; c < 3; c++)
+		{
+			std::swap(val_q[0][c], val_q[1][c]);
+		}
+	}
+
+	if (highest_comp)
+	{
+		std::swap(val_q[0][0], val_q[0][highest_comp]);
+		std::swap(val_q[1][0], val_q[1][highest_comp]);
+	}
+
+	int orig_q[2][3];
+	memcpy(orig_q, val_q, sizeof(val_q));
+
+	// val[1][0] is now guaranteed to be highest
+	int best_va = 0, best_vb0 = 0, best_vb1 = 0, best_vc = 0, best_vd0 = 0, best_vd1 = 0;
+	int best_max_clamp_mag = 0;
+	bool best_did_clamp = false;
+	int best_q[2][3] = { { 0, 0, 0}, { 0, 0, 0 }  };
+	BASISU_NOTE_UNUSED(best_q);
+	uint32_t best_dist = UINT_MAX;
+
+	for (uint32_t pass = 0; pass < 2; pass++)
+	{
+		int trial_va = val_q[1][0];
+
+		assert(trial_va <= max_a_val);
+		assert(trial_va >= val_q[1][1]);
+		assert(trial_va >= val_q[1][2]);
+
+		assert(trial_va >= val_q[0][0]);
+		assert(trial_va >= val_q[0][1]);
+		assert(trial_va >= val_q[0][2]);
+
+		bool did_clamp = false;
+		int trial_max_clamp_mag = 0;
+
+		int trial_vb0 = compute_clamped_val(trial_va - val_q[1][1], 0, max_b_val, did_clamp, trial_max_clamp_mag);
+		int trial_vb1 = compute_clamped_val(trial_va - val_q[1][2], 0, max_b_val, did_clamp, trial_max_clamp_mag);
+		int trial_vc = compute_clamped_val(trial_va - val_q[0][0], 0, max_c_val, did_clamp, trial_max_clamp_mag);
+		int trial_vd0 = compute_clamped_val((trial_va - trial_vb0 - trial_vc) - val_q[0][1], min_d_val, max_d_val, did_clamp, trial_max_clamp_mag);
+		int trial_vd1 = compute_clamped_val((trial_va - trial_vb1 - trial_vc) - val_q[0][2], min_d_val, max_d_val, did_clamp, trial_max_clamp_mag);
+
+		if (!did_clamp)
+		{
+			// Make sure decoder gets the expected values
+			assert(trial_va == val_q[1][0]);
+			assert(trial_va - trial_vb0 == val_q[1][1]);
+			assert(trial_va - trial_vb1 == val_q[1][2]);
+
+			assert((trial_va - trial_vc) == val_q[0][0]);
+			assert((trial_va - trial_vb0 - trial_vc - trial_vd0) == val_q[0][1]);
+			assert((trial_va - trial_vb1 - trial_vc - trial_vd1) == val_q[0][2]);
+		}
+
+		const int r_e0 = clamp<int>(trial_va, 0, max_a_val);
+		const int r_e1 = clamp<int>(trial_va - trial_vb0, 0, max_a_val);
+		const int r_e2 = clamp<int>(trial_va - trial_vb1, 0, max_a_val);
+
+		const int r_f0 = clamp<int>(trial_va - trial_vc, 0, max_a_val);
+		const int r_f1 = clamp<int>(trial_va - trial_vb0 - trial_vc - trial_vd0, 0, max_a_val);
+		const int r_f2 = clamp<int>(trial_va - trial_vb1 - trial_vc - trial_vd1, 0, max_a_val);
+
+		assert(r_e0 <= max_a_qlog);
+		assert(r_e1 <= max_a_qlog);
+		assert(r_e2 <= max_a_qlog);
+
+		assert(r_f0 <= max_a_qlog);
+		assert(r_f1 <= max_a_qlog);
+		assert(r_f2 <= max_a_qlog);
+
+		if ((!did_clamp) || (!had_tie))
+		{
+			best_va = trial_va;
+			best_vb0 = trial_vb0;
+			best_vb1 = trial_vb1;
+			best_vc = trial_vc;
+			best_vd0 = trial_vd0;
+			best_vd1 = trial_vd1;
+			best_max_clamp_mag = trial_max_clamp_mag;
+			best_did_clamp = did_clamp;
+
+			best_q[1][0] = r_e0;
+			best_q[1][1] = r_e1;
+			best_q[1][2] = r_e2;
+			best_q[0][0] = r_f0;
+			best_q[0][1] = r_f1;
+			best_q[0][2] = r_f2;
+			break;
+		}
+
+		// we had a tie and it did clamp, try swapping L/H for a potential slight gain
+
+		const uint32_t r_dist1 = basisu::square<int>(r_e0 - val_q[1][0]) + basisu::square<int>(r_e1 - val_q[1][1]) + basisu::square<int>(r_e2 - val_q[1][2]);
+		const uint32_t r_dist0 = basisu::square<int>(r_f0 - val_q[0][0]) + basisu::square<int>(r_f1 - val_q[0][1]) + basisu::square<int>(r_f2 - val_q[0][2]);
+
+		const uint32_t total_dist = r_dist1 + r_dist0;
+
+		if (total_dist < best_dist)
+		{
+			best_dist = total_dist;
+
+			best_va = trial_va;
+			best_vb0 = trial_vb0;
+			best_vb1 = trial_vb1;
+			best_vc = trial_vc;
+			best_vd0 = trial_vd0;
+			best_vd1 = trial_vd1;
+			best_did_clamp = did_clamp;
+
+			best_q[1][0] = r_e0;
+			best_q[1][1] = r_e1;
+			best_q[1][2] = r_e2;
+			best_q[0][0] = r_f0;
+			best_q[0][1] = r_f1;
+			best_q[0][2] = r_f2;
+		}
+
+		for (uint32_t c = 0; c < 3; c++)
+			std::swap(val_q[0][c], val_q[1][c]);
+	}
+
+	// pack bits now
+	int v0 = 0, v1 = 0, v2 = 0, v3 = 0, v4 = 0, v5 = 0;
+
+	int x0 = 0, x1 = 0, x2 = 0, x3 = 0, x4 = 0, x5 = 0;
+	switch (submode)
+	{
+	case 0:
+		x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_vd0, 6); x3 = get_bit(best_vd1, 6); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5);
+		break;
+	case 1:
+		x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_vb0, 7); x3 = get_bit(best_vb1, 7); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5);
+		break;
+	case 2:
+		x0 = get_bit(best_va, 9); x1 = get_bit(best_vc, 6); x2 = get_bit(best_vd0, 6); x3 = get_bit(best_vd1, 6); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5);
+		break;
+	case 3:
+		x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_va, 9); x3 = get_bit(best_vc, 6); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5);
+		break;
+	case 4:
+		x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_vb0, 7); x3 = get_bit(best_vb1, 7); x4 = get_bit(best_va, 9); x5 = get_bit(best_va, 10);
+		break;
+	case 5:
+		x0 = get_bit(best_va, 9); x1 = get_bit(best_va, 10); x2 = get_bit(best_vc, 7); x3 = get_bit(best_vc, 6); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5);
+		break;
+	case 6:
+		x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_va, 11); x3 = get_bit(best_vc, 6); x4 = get_bit(best_va, 9); x5 = get_bit(best_va, 10);
+		break;
+	case 7:
+		x0 = get_bit(best_va, 9); x1 = get_bit(best_va, 10); x2 = get_bit(best_va, 11); x3 = get_bit(best_vc, 6); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5);
+		break;
+	default:
+		break;
+	}
+
+	// write mode
+	pack_bit(v1, 7, submode, 0);
+	pack_bit(v2, 7, submode, 1);
+	pack_bit(v3, 7, submode, 2);
+
+	// highest component
+	pack_bit(v4, 7, highest_comp, 0);
+	pack_bit(v5, 7, highest_comp, 1);
+
+	// write bit 8 of va
+	pack_bit(v1, 6, best_va, 8);
+
+	// extra bits
+	pack_bit(v2, 6, x0);
+	pack_bit(v3, 6, x1);
+	pack_bit(v4, 6, x2);
+	pack_bit(v5, 6, x3);
+	pack_bit(v4, 5, x4);
+	pack_bit(v5, 5, x5);
+
+	v0 = best_va & 0xFF;
+	v1 |= (best_vc & 63);
+	v2 |= (best_vb0 & 63);
+	v3 |= (best_vb1 & 63);
+	v4 |= (best_vd0 & 31);
+	v5 |= (best_vd1 & 31);
+
+	assert(in_range(v0, 0, 255) && in_range(v1, 0, 255) && in_range(v2, 0, 255) && in_range(v3, 0, 255) && in_range(v4, 0, 255) && in_range(v5, 0, 255));
+
+	pEndpoints[0] = (uint8_t)v0;
+	pEndpoints[1] = (uint8_t)v1;
+	pEndpoints[2] = (uint8_t)v2;
+	pEndpoints[3] = (uint8_t)v3;
+	pEndpoints[4] = (uint8_t)v4;
+	pEndpoints[5] = (uint8_t)v5;
+
+#ifdef _DEBUG
+	// Test for valid pack by unpacking
+	{
+		if (highest_comp)
+		{
+			std::swap(best_q[0][0], best_q[0][highest_comp]);
+			std::swap(best_q[1][0], best_q[1][highest_comp]);
+
+			std::swap(orig_q[0][0], orig_q[0][highest_comp]);
+			std::swap(orig_q[1][0], orig_q[1][highest_comp]);
+		}
+
+		int test_e[2][3];
+		decode_mode11_to_qlog12(pEndpoints, test_e, astc_helpers::BISE_256_LEVELS);
+		for (uint32_t i = 0; i < 2; i++)
+		{
+			for (uint32_t j = 0; j < 3; j++)
+			{
+				assert(best_q[i][j] == test_e[i][j] >> (12 - a_bits));
+
+				if (!best_did_clamp)
+				{
+					assert((orig_q[i][j] == test_e[i][j] >> (12 - a_bits)) ||
+						(orig_q[1 - i][j] == test_e[i][j] >> (12 - a_bits)));
+				}
+			}
+		}
+	}
+#endif
+
+	max_clamp_mag = best_max_clamp_mag;
+
+	return best_did_clamp;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+static void pack_astc_mode11_direct(uint8_t* pEndpoints, const vec3F& l_q16, const vec3F& h_q16)
+{
+	// Writes the six mode 11 "direct" endpoint bytes as (low, high) pairs, one pair per channel.
+	// Channels 0 and 1 are stored as QLOG8 codes; channel 2 is stored as a QLOG7 code with bit 7 set.
+	for (uint32_t comp = 0; comp < 3; comp++)
+	{
+		const bool is_third_comp = (comp == 2);
+
+		// TODO: This goes from QLOG16->HALF->QLOG8/7
+		const half_float lo_half = qlog16_to_half_slow(clamp((int)std::round(l_q16[comp]), 0, 65535));
+		const half_float hi_half = qlog16_to_half_slow(clamp((int)std::round(h_q16[comp]), 0, 65535));
+
+		int lo_q, hi_q;
+
+		// Look up the quantized log code for each half value, then cap it at the largest code for this width.
+		if (is_third_comp)
+		{
+			lo_q = g_half_to_qlog7[bounds_check((uint32_t)lo_half, 0U, 32768U)];
+			hi_q = g_half_to_qlog7[bounds_check((uint32_t)hi_half, 0U, 32768U)];
+
+			lo_q = minimum<uint32_t>(lo_q, MAX_QLOG7);
+			hi_q = minimum<uint32_t>(hi_q, MAX_QLOG7);
+		}
+		else
+		{
+			lo_q = g_half_to_qlog8[bounds_check((uint32_t)lo_half, 0U, 32768U)];
+			hi_q = g_half_to_qlog8[bounds_check((uint32_t)hi_half, 0U, 32768U)];
+
+			lo_q = minimum<uint32_t>(lo_q, MAX_QLOG8);
+			hi_q = minimum<uint32_t>(hi_q, MAX_QLOG8);
+		}
+
+#if 1
+		// Both endpoints landed on the same code: push them one step apart, keeping their original ordering.
+		if (lo_q == hi_q)
+		{
+			const int max_q = is_third_comp ? MAX_QLOG7 : MAX_QLOG8;
+			const bool low_is_lower = (l_q16[comp] <= h_q16[comp]);
+
+			int& q_down = low_is_lower ? lo_q : hi_q;
+			int& q_up = low_is_lower ? hi_q : lo_q;
+
+			if (q_down)
+				q_down--;
+
+			if (q_up != max_q)
+				q_up++;
+		}
+#endif
+
+		if (is_third_comp)
+		{
+			assert(lo_q <= (int)MAX_QLOG7 && hi_q <= (int)MAX_QLOG7);
+			lo_q |= 128;
+			hi_q |= 128;
+		}
+		else
+		{
+			assert(lo_q <= (int)MAX_QLOG8 && hi_q <= (int)MAX_QLOG8);
+		}
+
+		// Low value first, then high, for each channel.
+		pEndpoints[2 * comp + 0] = (uint8_t)lo_q;
+		pEndpoints[2 * comp + 1] = (uint8_t)hi_q;
+	}
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+static bool pack_astc_mode7_submode(uint32_t submode, uint8_t* pEndpoints, const vec3F& rgb_q16, float s_q16, int& max_clamp_mag, uint32_t ise_weight_range)
+{
+	assert((ise_weight_range >= 1) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
+	// Quantizes an RGB color (rgb_q16) and a scale (s_q16) and packs them into the 4 endpoint bytes of mode 7 submode 0-5.
+	assert(submode <= 5);
+	max_clamp_mag = 0;
+	// Returns true if a packed field had to be clamped to fit; max_clamp_mag then holds the largest amount clamped off.
+	static const uint8_t s_r_bits[6] = { 11, 11, 10, 9, 8, 7 };
+	static const uint8_t s_g_b_bits[6] = { 5, 6, 5, 6, 7, 7 };
+	static const uint8_t s_s_bits[6] = { 7, 5, 8, 7, 6, 7 };
+	// Per-submode field widths in bits: first component, the other two components, and the scale.
+	// The precision of the components
+	const uint32_t prec_bits = s_r_bits[submode];
+
+	int qlog[4], pack_bits[4];
+
+	for (uint32_t i = 0; i < 4; i++)
+	{
+		const float f = (i == 3) ? s_q16 : rgb_q16[i];
+
+		// The # of bits the component is packed into
+		if (i == 0)
+			pack_bits[i] = s_r_bits[submode];
+		else if (i == 3)
+			pack_bits[i] = s_s_bits[submode];
+		else
+			pack_bits[i] = s_g_b_bits[submode];
+
+#if 0
+		// this is slightly worse
+		// TODO: going from qlog16 to half loses some precision. Then going from half to qlog 7-12 will have extra error.
+		half_float h = qlog_to_half(clamp((int)std::round(f), 0, MAX_QLOG16), 16);
+		qlog[i] = half_to_qlog7_12((half_float)bounds_check((uint32_t)h, 0U, 32768U), prec_bits);
+#else
+		qlog[i] = quant_qlog16(clamp<int>((int)std::round(f), 0, MAX_QLOG16), prec_bits);
+
+		// Only bias if there are enough texel weights, 4=6 weights
+		if (ise_weight_range >= 4)
+		{
+			// Explicitly bias the high color, and the scale up, to better exploit the weights.
+			// The quantized range also then encompasses the complete input range.
+			const uint32_t max_val = (1 << prec_bits) - 1;
+			const uint32_t K = 3;
+			if (i == 3)
+			{
+				qlog[i] = minimum<uint32_t>(qlog[i] + K * 2, max_val);
+			}
+			else
+			{
+				qlog[i] = minimum<uint32_t>(qlog[i] + K, max_val);
+			}
+		}
+#endif
+
+		if (i != 3)
+			qlog[i] = minimum<uint32_t>(qlog[i], get_max_qlog(prec_bits));
+
+		// If S=0, we lose freedom for the texel weights to add any value.
+		if ((i == 3) && (qlog[i] == 0))
+			qlog[i] = 1;
+	}
+
+	uint32_t maj_index = 0;
+
+	bool did_clamp = false;
+	// Submodes 0-4: move the largest RGB component into slot 0 and turn the other two into differences from it.
+	if (submode != 5)
+	{
+		int largest_qlog = 0;
+		for (uint32_t i = 0; i < 3; i++)
+		{
+			if (qlog[i] > largest_qlog)
+			{
+				largest_qlog = qlog[i];
+				maj_index = i;
+			}
+		}
+
+		if (maj_index)
+		{
+			std::swap(qlog[0], qlog[maj_index]);
+		}
+
+		assert(qlog[0] >= qlog[1]);
+		assert(qlog[0] >= qlog[2]);
+
+		qlog[1] = qlog[0] - qlog[1];
+		qlog[2] = qlog[0] - qlog[2];
+		// Clamp the two differences and S to their field widths, recording whether and by how much anything was cut.
+		for (uint32_t i = 1; i < 4; i++)
+		{
+			const int max_val = (1 << pack_bits[i]) - 1;
+
+			if (qlog[i] > max_val)
+			{
+				max_clamp_mag = maximum<int>(max_clamp_mag, qlog[i] - max_val);
+				qlog[i] = max_val;
+				did_clamp = true;
+			}
+		}
+	}
+	// Every field must now fit in the number of bits it will be packed into.
+	for (uint32_t i = 0; i < 4; i++)
+	{
+		const int max_val = (1 << pack_bits[i]) - 1; (void)max_val;
+
+		assert(qlog[i] <= max_val);
+	}
+
+	int mode = 0;
+
+	int r = qlog[0] & 63; // 6-bits
+	int g = qlog[1] & 31; // 5-bits
+	int b = qlog[2] & 31; // 5-bits
+	int s = qlog[3] & 31; // 5-bits
+
+	int x0 = 0, x1 = 0, x2 = 0, x3 = 0, x4 = 0, x5 = 0, x6 = 0;
+
+	switch (submode)
+	{
+	case 0:
+	{
+		mode = (maj_index << 2) | 0;
+		assert((mode & 0xC) != 0xC);
+
+		x0 = get_bit(qlog[0], 9); // R9
+		x1 = get_bit(qlog[0], 8); // R8
+		x2 = get_bit(qlog[0], 7); // R7
+		x3 = get_bit(qlog[0], 10); // R10
+		x4 = get_bit(qlog[0], 6); // R6
+		x5 = get_bit(qlog[3], 6); // S6
+		x6 = get_bit(qlog[3], 5); // S5
+		break;
+	}
+	case 1:
+	{
+		mode = (maj_index << 2) | 1;
+		assert((mode & 0xC) != 0xC);
+
+		x0 = get_bit(qlog[0], 8); // R8
+		x1 = get_bit(qlog[1], 5); // G5
+		x2 = get_bit(qlog[0], 7); // R7
+		x3 = get_bit(qlog[2], 5); // B5
+		x4 = get_bit(qlog[0], 6); // R6
+		x5 = get_bit(qlog[0], 10); // R10
+		x6 = get_bit(qlog[0], 9); // R9
+		break;
+	}
+	case 2:
+	{
+		mode = (maj_index << 2) | 2;
+		assert((mode & 0xC) != 0xC);
+
+		x0 = get_bit(qlog[0], 9); // R9
+		x1 = get_bit(qlog[0], 8); // R8
+		x2 = get_bit(qlog[0], 7); // R7
+		x3 = get_bit(qlog[0], 6); // R6
+		x4 = get_bit(qlog[3], 7); // S7
+		x5 = get_bit(qlog[3], 6); // S6
+		x6 = get_bit(qlog[3], 5); // S5
+		break;
+	}
+	case 3:
+	{
+		mode = (maj_index << 2) | 3;
+		assert((mode & 0xC) != 0xC);
+
+		x0 = get_bit(qlog[0], 8); // R8
+		x1 = get_bit(qlog[1], 5); // G5
+		x2 = get_bit(qlog[0], 7); // R7
+		x3 = get_bit(qlog[2], 5); // B5
+		x4 = get_bit(qlog[0], 6); // R6
+		x5 = get_bit(qlog[3], 6); // S6
+		x6 = get_bit(qlog[3], 5); // S5
+		break;
+	}
+	case 4:
+	{
+		mode = maj_index | 0xC; // 0b1100
+		assert((mode & 0xC) == 0xC);
+		assert(mode != 0xF);
+
+		x0 = get_bit(qlog[1], 6); // G6
+		x1 = get_bit(qlog[1], 5); // G5
+		x2 = get_bit(qlog[2], 6); // B6
+		x3 = get_bit(qlog[2], 5); // B5
+		x4 = get_bit(qlog[0], 6); // R6
+		x5 = get_bit(qlog[0], 7); // R7
+		x6 = get_bit(qlog[3], 5); // S5
+		break;
+	}
+	case 5:
+	{
+		mode = 0xF;
+
+		x0 = get_bit(qlog[1], 6); // G6
+		x1 = get_bit(qlog[1], 5); // G5
+		x2 = get_bit(qlog[2], 6); // B6
+		x3 = get_bit(qlog[2], 5); // B5
+		x4 = get_bit(qlog[0], 6); // R6
+		x5 = get_bit(qlog[3], 6); // S6
+		x6 = get_bit(qlog[3], 5); // S5
+		break;
+	}
+	default:
+	{
+		assert(0);
+		break;
+	}
+	}
+	// Byte layout: the 4 mode bits sit in the top bits of bytes 0-2, x0-x6 carry the submode's extra bits, and the low bits hold r/g/b/s.
+	pEndpoints[0] = (uint8_t)((get_bit(mode, 1) << 7) | (get_bit(mode, 0) << 6) | r);
+	pEndpoints[1] = (uint8_t)((get_bit(mode, 2) << 7) | (x0 << 6) | (x1 << 5) | g);
+	pEndpoints[2] = (uint8_t)((get_bit(mode, 3) << 7) | (x2 << 6) | (x3 << 5) | b);
+	pEndpoints[3] = (uint8_t)((x4 << 7) | (x5 << 6) | (x6 << 5) | s);
+
+#ifdef _DEBUG
+	// Test for valid pack by unpacking: rebuild the expected endpoints from qlog[] and compare them with the decoder's output.
+	{
+		const int inv_shift = 12 - prec_bits;
+
+		int unpacked_e[2][3];
+		if (submode != 5)
+		{
+			unpacked_e[1][0] = left_shift32(qlog[0], inv_shift);
+			unpacked_e[1][1] = clamp(left_shift32((qlog[0] - qlog[1]), inv_shift), 0, 0xFFF);
+			unpacked_e[1][2] = clamp(left_shift32((qlog[0] - qlog[2]), inv_shift), 0, 0xFFF);
+
+			unpacked_e[0][0] = clamp(left_shift32((qlog[0] - qlog[3]), inv_shift), 0, 0xFFF);
+			unpacked_e[0][1] = clamp(left_shift32(((qlog[0] - qlog[1]) - qlog[3]), inv_shift), 0, 0xFFF);
+			unpacked_e[0][2] = clamp(left_shift32(((qlog[0] - qlog[2]) - qlog[3]), inv_shift), 0, 0xFFF);
+		}
+		else
+		{
+			unpacked_e[1][0] = left_shift32(qlog[0], inv_shift);
+			unpacked_e[1][1] = left_shift32(qlog[1], inv_shift);
+			unpacked_e[1][2] = left_shift32(qlog[2], inv_shift);
+
+			unpacked_e[0][0] = clamp(left_shift32((qlog[0] - qlog[3]), inv_shift), 0, 0xFFF);
+			unpacked_e[0][1] = clamp(left_shift32((qlog[1] - qlog[3]), inv_shift), 0, 0xFFF);
+			unpacked_e[0][2] = clamp(left_shift32((qlog[2] - qlog[3]), inv_shift), 0, 0xFFF);
+		}
+
+		if (maj_index)
+		{
+			std::swap(unpacked_e[0][0], unpacked_e[0][maj_index]);
+			std::swap(unpacked_e[1][0], unpacked_e[1][maj_index]);
+		}
+
+		int e[2][3];
+		decode_mode7_to_qlog12_ise20(pEndpoints, e, nullptr);
+
+		for (uint32_t i = 0; i < 3; i++)
+		{
+			assert(unpacked_e[0][i] == e[0][i]);
+			assert(unpacked_e[1][i] == e[1][i]);
+		}
+	}
+#endif
+
+	return did_clamp;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+static void quantize_ise_endpoints(uint32_t ise_endpoint_range, const uint8_t* pSrc_endpoints, uint8_t *pDst_endpoints, uint32_t n)
+{
+	assert((ise_endpoint_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (ise_endpoint_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE));
+
+	// With 256 endpoint levels the n source bytes are used unchanged, so they are copied verbatim.
+	if (ise_endpoint_range == astc_helpers::BISE_256_LEVELS)
+	{
+		memcpy(pDst_endpoints, pSrc_endpoints, n);
+		return;
+	}
+
+	// For any other range, each source byte is remapped through that range's m_val_to_ise table.
+	for (uint32_t idx = 0; idx < n; idx++)
+	{
+		const uint32_t src_val = pSrc_endpoints[idx];
+		assert(src_val <= 255);
+		pDst_endpoints[idx] = astc_helpers::g_dequant_tables.get_endpoint_tab(ise_endpoint_range).m_val_to_ise[src_val];
+	}
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+// Tries the mode 11 submodes from last_submode down to first_submode (-1 = direct) and keeps any that lowers cur_block_error.
+// Returns true if improved. Note this could fail to find any valid solution if use_endpoint_range!=20.
+static bool try_mode11(uint32_t num_pixels,
+	uint8_t* pEndpoints, uint8_t* pWeights, double& cur_block_error, uint32_t& submode_used,
+	vec3F& low_color_q16, const vec3F& high_color_q16,
+	half_float block_pixels_half[16][3],
+	uint32_t num_weight_levels, uint32_t ise_weight_range, const astc_hdr_codec_options& coptions, bool direct_only, uint32_t ise_endpoint_range, 
+	bool constrain_ise_weight8_selectors, 
+	int32_t first_submode, int32_t last_submode) // -1, 7
+{
+	assert((ise_weight_range >= 1) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
+	assert((num_weight_levels >= 3) && (num_weight_levels <= 32));
+	assert((num_pixels >= 1) && (num_pixels <= 16));
+
+	// Scratch storage for one candidate: its decoded palette, its packed/quantized endpoints and its weights.
+	half_float palette_half[32][3];
+	vec3F palette_float[32];
+	uint8_t packed_endpoints[NUM_MODE11_ENDPOINTS], quantized_endpoints[NUM_MODE11_ENDPOINTS], candidate_weights[16];
+
+	// direct_only narrows the search to the direct encoding alone.
+	if (direct_only)
+		first_submode = last_submode = -1;
+
+	assert(first_submode <= last_submode);
+	assert((first_submode >= -1) && (first_submode <= 7));
+	assert((last_submode >= -1) && (last_submode <= 7));
+
+	// If it had to clamp and the clamp was too high, it'll distort the endpoint colors too much, which could lead to noticeable artifacts.
+	const int MAX_CLAMP_MAG_ACCEPT_THRESH = 4;
+	const bool endpoints_are_unquantized = (ise_endpoint_range == astc_helpers::BISE_256_LEVELS);
+
+	// Selector mask handed to eval_selectors; presumably limits which weight indices may be picked - TODO confirm.
+	uint32_t usable_selector_bitmask = UINT32_MAX;
+	if ((constrain_ise_weight8_selectors) && (ise_weight_range == astc_helpers::BISE_16_LEVELS))
+		usable_selector_bitmask = (1 << 0) | (1 << 1) | (1 << 4) | (1 << 5) | (1 << 10) | (1 << 11) | (1 << 14) | (1 << 15);
+
+	bool found_better = false;
+
+	// TODO: First determine if a submode doesn't clamp first. If one is found, encode to that and we're done.
+	for (int submode = last_submode; submode >= first_submode; submode--)
+	{
+		bool did_clamp = false;
+		int max_clamp_mag = 0;
+
+		if (submode != -1)
+		{
+			did_clamp = pack_astc_mode11_submode(submode, packed_endpoints, low_color_q16, high_color_q16, max_clamp_mag);
+
+			if ((did_clamp) && (max_clamp_mag > MAX_CLAMP_MAG_ACCEPT_THRESH))
+				continue;
+		}
+		else
+		{
+			// If it had to clamp with one of the submodes, try direct which can't clamp, but has low precision.
+			pack_astc_mode11_direct(packed_endpoints, low_color_q16, high_color_q16);
+		}
+
+		// This will distort the endpoints if the ISE endpoint range isn't 256 levels (20).
+		// It could massively distort the endpoints, but still result in a valid encoding.
+		quantize_ise_endpoints(ise_endpoint_range, packed_endpoints, quantized_endpoints, NUM_MODE11_ENDPOINTS);
+
+		if (!get_astc_hdr_mode_11_block_colors(quantized_endpoints, &palette_half[0][0], palette_float, num_weight_levels, ise_weight_range, ise_endpoint_range))
+			continue;
+
+		const double candidate_error = eval_selectors(num_pixels, candidate_weights, &block_pixels_half[0][0], num_weight_levels, &palette_half[0][0], coptions, usable_selector_bitmask);
+		if (candidate_error < cur_block_error)
+		{
+			cur_block_error = candidate_error;
+			memcpy(pEndpoints, quantized_endpoints, NUM_MODE11_ENDPOINTS);
+			memcpy(pWeights, candidate_weights, num_pixels);
+			submode_used = submode + 1;
+			found_better = true;
+		}
+
+		// If it didn't clamp it was a lossless encode at this precision, so we can stop early as there's probably no use trying lower precision submodes.
+		// (Although it may be, because a lower precision pack could try nearby voxel coords.)
+		// However, at lower levels quantization may cause the decoded endpoints to be very distorted, so we need to evaluate up to direct.
+		if (endpoints_are_unquantized && !did_clamp)
+			break;
+	}
+
+	return found_better;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+static bool try_mode7(
+	uint32_t num_pixels,
+	uint8_t* pEndpoints, uint8_t* pWeights, double& cur_block_error, uint32_t& submode_used,
+	vec3F& high_color_q16, const float s_q16,
+	half_float block_pixels_half[16][3],
+	uint32_t num_weight_levels, uint32_t ise_weight_range, const astc_hdr_codec_options& coptions, 
+	uint32_t ise_endpoint_range)
+{
+	assert((ise_weight_range >= 1) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
+	assert((num_pixels >= 1) && (num_pixels <= 16));
+
+	// Scratch palette for one candidate. NOTE(review): it holds 24 entries and num_weight_levels is not
+	// checked here, so callers presumably guarantee num_weight_levels <= 24 - confirm.
+	half_float palette_half[24][3];
+	vec3F palette_float[24];
+	uint8_t packed_endpoints[NUM_MODE7_ENDPOINTS], quantized_endpoints[NUM_MODE7_ENDPOINTS], candidate_weights[16];
+
+	// A clamped pack is only accepted when no field lost more than this many quantized steps.
+	const int MAX_CLAMP_MAG_ACCEPT_THRESH = 4;
+	const bool endpoints_are_unquantized = (ise_endpoint_range == astc_helpers::BISE_256_LEVELS);
+
+	bool found_better = false;
+
+	// TODO: First determine if a submode doesn't clamp first. If one is found, encode to that and we're done.
+	for (int submode = 0; submode <= 5; submode++)
+	{
+		int max_clamp_mag = 0;
+		const bool did_clamp = pack_astc_mode7_submode(submode, packed_endpoints, high_color_q16, s_q16, max_clamp_mag, ise_weight_range);
+
+		// Submodes 0-4 are skipped when they clamped too hard; submode 5 is always evaluated.
+		if ((submode < 5) && (did_clamp) && (max_clamp_mag > MAX_CLAMP_MAG_ACCEPT_THRESH))
+			continue;
+
+		// This will distort the endpoints if the ISE endpoint range isn't 256 levels (20).
+		// It could massively distort the endpoints, but still result in a valid encoding.
+		quantize_ise_endpoints(ise_endpoint_range, packed_endpoints, quantized_endpoints, NUM_MODE7_ENDPOINTS);
+
+		if (!get_astc_hdr_mode_7_block_colors(quantized_endpoints, &palette_half[0][0], palette_float, num_weight_levels, ise_weight_range, ise_endpoint_range))
+			continue;
+
+		const double candidate_error = eval_selectors(num_pixels, candidate_weights, &block_pixels_half[0][0], num_weight_levels, &palette_half[0][0], coptions);
+		if (candidate_error < cur_block_error)
+		{
+			cur_block_error = candidate_error;
+			memcpy(pEndpoints, quantized_endpoints, NUM_MODE7_ENDPOINTS);
+			memcpy(pWeights, candidate_weights, num_pixels);
+			submode_used = submode;
+			found_better = true;
+		}
+
+		// With unquantized (256 level) endpoints, stop at the first submode that packed without clamping.
+		if (endpoints_are_unquantized && !did_clamp)
+			break;
+	}
+
+	return found_better;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+static double encode_astc_hdr_block_mode_11(
+	uint32_t num_pixels,
+	const vec4F* pBlock_pixels,
+	uint32_t ise_weight_range,
+	uint32_t& best_submode,
+	double cur_block_error,
+	uint8_t* blk_endpoints, uint8_t* blk_weights,
+	const astc_hdr_codec_options& coptions,
+	bool direct_only,
+	uint32_t ise_endpoint_range,
+	bool uber_mode,
+	bool constrain_ise_weight8_selectors,
+	int32_t first_submode, int32_t last_submode)
+{
+	assert((ise_weight_range >= 1) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
+	assert((ise_endpoint_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (ise_endpoint_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE));
+	assert((num_pixels >= 1) && (num_pixels <= 16));
+	// Searches for a mode 11 encoding of the block whose error is below cur_block_error; returns the (possibly lowered) error.
+	best_submode = 0;
+	// blk_endpoints/blk_weights are only written when a better encoding is found; best_submode is reset to 0 above regardless.
+	half_float block_pixels_half[16][3];
+	vec4F block_pixels_q16[16];
+		
+	// TODO: This is done redundantly.
+	for (uint32_t i = 0; i < num_pixels; i++)
+	{
+		block_pixels_half[i][0] = float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][0]);
+		block_pixels_q16[i][0] = (float)half_to_qlog16(block_pixels_half[i][0]);
+
+		block_pixels_half[i][1] = float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][1]);
+		block_pixels_q16[i][1] = (float)half_to_qlog16(block_pixels_half[i][1]);
+
+		block_pixels_half[i][2] = float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][2]);
+		block_pixels_q16[i][2] = (float)half_to_qlog16(block_pixels_half[i][2]);
+
+		block_pixels_q16[i][3] = 0.0f;
+	}
+
+	const uint32_t num_weight_levels = astc_helpers::get_ise_levels(ise_weight_range);
+	
+	// TODO: should match MAX_SUPPORTED_ISE_WEIGHT_INDEX
+	const uint32_t MAX_WEIGHT_LEVELS = 32;
+	(void)MAX_WEIGHT_LEVELS;
+	assert(num_weight_levels <= MAX_WEIGHT_LEVELS);
+
+	vec3F block_mean_color_q16(calc_mean(num_pixels, block_pixels_q16));
+	vec3F block_axis_q16(calc_rgb_pca(num_pixels, block_pixels_q16, block_mean_color_q16));
+
+	aabb3F color_box_q16(cInitExpand);
+	// The initial low/high colors are the pixels with the smallest and largest projection onto the PCA axis.
+	float l = 1e+30f, h = -1e+30f;
+	vec3F low_color_q16, high_color_q16;
+
+	for (uint32_t i = 0; i < num_pixels; i++)
+	{
+		color_box_q16.expand(block_pixels_q16[i]);
+
+		vec3F k(vec3F(block_pixels_q16[i]) - block_mean_color_q16);
+		float kd = k.dot(block_axis_q16);
+
+		if (kd < l)
+		{
+			l = kd;
+			low_color_q16 = block_pixels_q16[i];
+		}
+
+		if (kd > h)
+		{
+			h = kd;
+			high_color_q16 = block_pixels_q16[i];
+		}
+	}
+	// Move each endpoint 1/64th of the way toward the other one.
+	vec3F old_low_color_q16(low_color_q16), old_high_color_q16(high_color_q16);
+	for (uint32_t i = 0; i < 3; i++)
+	{
+		low_color_q16[i] = lerp<float>(old_low_color_q16[i], old_high_color_q16[i], 1.0f / 64.0f);
+		high_color_q16[i] = lerp<float>(old_low_color_q16[i], old_high_color_q16[i], 63.0f / 64.0f);
+	}
+		
+	uint8_t trial_blk_endpoints[NUM_MODE11_ENDPOINTS];
+	uint8_t trial_blk_weights[16];
+	uint32_t trial_best_submode = 0;
+	
+	clear_obj(trial_blk_endpoints);
+	clear_obj(trial_blk_weights);
+	// The first attempt starts from a huge error and writes into the trial buffers, so any lower-error candidate is kept.
+	double trial_blk_error = 1e+30f;
+
+	bool did_improve = try_mode11(num_pixels, trial_blk_endpoints, trial_blk_weights, trial_blk_error, trial_best_submode,
+		low_color_q16, high_color_q16,
+		block_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight8_selectors,
+		first_submode, last_submode);
+	
+	// If we couldn't find ANY usable solution due to endpoint quantization, just return. There's nothing we can do.
+	if (!did_improve)
+		return cur_block_error;
+
+	// Did the solution improve?
+	if (trial_blk_error < cur_block_error)
+	{
+		cur_block_error = trial_blk_error;
+		memcpy(blk_endpoints, trial_blk_endpoints, NUM_MODE11_ENDPOINTS);
+		memcpy(blk_weights, trial_blk_weights, num_pixels);
+		best_submode = trial_best_submode;
+	}
+		
+#define USE_LEAST_SQUARES (1)
+#if USE_LEAST_SQUARES
+	// least squares on the most promising trial weight indices found
+	const uint32_t NUM_LS_PASSES = 3;
+	// Each pass refits the endpoints to the current trial weights and stops as soon as a pass fails to improve the block.
+	for (uint32_t pass = 0; pass < NUM_LS_PASSES; pass++)
+	{
+		vec3F l_q16, h_q16;
+		if (!compute_least_squares_endpoints_rgb(num_pixels, trial_blk_weights, &g_astc_ls_weights_ise[ise_weight_range][0], &l_q16, &h_q16, block_pixels_q16, color_box_q16))
+			break;
+
+		bool was_improved = try_mode11(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
+			l_q16, h_q16,
+			block_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight8_selectors,
+			first_submode, last_submode);
+
+		if (!was_improved)
+			break;
+
+		// It's improved, so let's take the new weight indices.
+		memcpy(trial_blk_weights, blk_weights, num_pixels);
+
+	} // pass
+#endif
+		
+	if (uber_mode)
+	{
+		// Try varying the current best weight indices. This can be expanded/improved, but at potentially great cost.
+
+		uint8_t temp_astc_weights[16];
+		memcpy(temp_astc_weights, trial_blk_weights, num_pixels);
+
+		uint32_t min_lin_sel = 256, max_lin_sel = 0;
+		for (uint32_t i = 0; i < num_pixels; i++)
+		{
+			const uint32_t astc_sel = temp_astc_weights[i];
+
+			const uint32_t lin_sel = g_map_astc_to_linear_order[ise_weight_range][astc_sel];
+			assert(lin_sel < num_weight_levels);
+
+			min_lin_sel = minimumu(min_lin_sel, lin_sel);
+			max_lin_sel = maximumu(max_lin_sel, lin_sel);
+		}
+
+		bool was_improved = false;
+		(void)was_improved;
+		// Variation 1: raise every weight that sits at the lowest used level by one step, then refit the endpoints.
+		{
+			bool weights_changed = false;
+			uint8_t trial_weights[16];
+			for (uint32_t i = 0; i < num_pixels; i++)
+			{
+				uint32_t astc_sel = temp_astc_weights[i];
+				uint32_t lin_sel = g_map_astc_to_linear_order[ise_weight_range][astc_sel];
+
+				if ((lin_sel == min_lin_sel) && (lin_sel < (num_weight_levels - 1)))
+				{
+					lin_sel++;
+					weights_changed = true;
+				}
+
+				trial_weights[i] = g_map_linear_to_astc_order[ise_weight_range][lin_sel];
+			}
+
+			if (weights_changed)
+			{
+				vec3F l_q16, h_q16;
+				if (compute_least_squares_endpoints_rgb(num_pixels, trial_weights, &g_astc_ls_weights_ise[ise_weight_range][0], &l_q16, &h_q16, block_pixels_q16, color_box_q16))
+				{
+					if (try_mode11(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
+						l_q16, h_q16,
+						block_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight8_selectors, 
+						first_submode, last_submode))
+					{
+						was_improved = true;
+					}
+				}
+			}
+		}
+		// Variation 2: lower every weight that sits at the highest used level by one step, then refit the endpoints.
+		{
+			bool weights_changed = false;
+			uint8_t trial_weights[16];
+			for (uint32_t i = 0; i < num_pixels; i++)
+			{
+				uint32_t astc_sel = temp_astc_weights[i];
+				uint32_t lin_sel = g_map_astc_to_linear_order[ise_weight_range][astc_sel];
+
+				if ((lin_sel == max_lin_sel) && (lin_sel > 0))
+				{
+					lin_sel--;
+					weights_changed = true;
+				}
+
+				trial_weights[i] = g_map_linear_to_astc_order[ise_weight_range][lin_sel];
+			}
+
+			if (weights_changed)
+			{
+				vec3F l_q16, h_q16;
+				if (compute_least_squares_endpoints_rgb(num_pixels, trial_weights, &g_astc_ls_weights_ise[ise_weight_range][0], &l_q16, &h_q16, block_pixels_q16, color_box_q16))
+				{
+					if (try_mode11(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
+						l_q16, h_q16,
+						block_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight8_selectors,
+						first_submode, last_submode))
+					{
+						was_improved = true;
+					}
+				}
+			}
+		}
+		// Variation 3: move the highest and the lowest used levels one step inward together, then refit the endpoints.
+		{
+			bool weights_changed = false;
+			uint8_t trial_weights[16];
+			for (uint32_t i = 0; i < num_pixels; i++)
+			{
+				uint32_t astc_sel = temp_astc_weights[i];
+				uint32_t lin_sel = g_map_astc_to_linear_order[ise_weight_range][astc_sel];
+
+				if ((lin_sel == max_lin_sel) && (lin_sel > 0))
+				{
+					lin_sel--;
+					weights_changed = true;
+				}
+				else if ((lin_sel == min_lin_sel) && (lin_sel < (num_weight_levels - 1)))
+				{
+					lin_sel++;
+					weights_changed = true;
+				}
+
+				trial_weights[i] = g_map_linear_to_astc_order[ise_weight_range][lin_sel];
+			}
+
+			if (weights_changed)
+			{
+				vec3F l_q16, h_q16;
+				if (compute_least_squares_endpoints_rgb(num_pixels, trial_weights, &g_astc_ls_weights_ise[ise_weight_range][0], &l_q16, &h_q16, block_pixels_q16, color_box_q16))
+				{
+					if (try_mode11(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
+						l_q16, h_q16,
+						block_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight8_selectors,
+						first_submode, last_submode))
+					{
+						was_improved = true;
+					}
+				}
+			}
+		}
+	} // uber_mode
+
+	return cur_block_error;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+static double encode_astc_hdr_block_mode_7(
+	uint32_t num_pixels, const vec4F* pBlock_pixels,
+	uint32_t ise_weight_range,
+	uint32_t& best_submode,
+	double cur_block_error,
+	uint8_t* blk_endpoints,  //[4]
+	uint8_t* blk_weights, // [num_pixels]
+	const astc_hdr_codec_options& coptions,
+	uint32_t ise_endpoint_range)
+{
+	assert((num_pixels >= 1) && (num_pixels <= 16));
+	assert((ise_weight_range >= 1) && (ise_weight_range <= 10));
+	assert((ise_endpoint_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (ise_endpoint_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE));
+	const uint32_t num_weight_levels = astc_helpers::get_ise_levels(ise_weight_range);
+
+	const uint32_t MAX_WEIGHT_LEVELS = 24;
+	assert(num_weight_levels <= MAX_WEIGHT_LEVELS);
+	BASISU_NOTE_UNUSED(MAX_WEIGHT_LEVELS);
+
+	best_submode = 0;
+
+	half_float block_pixels_half[16][3];
+
+	vec4F block_pixels_q16[16];
+	for (uint32_t i = 0; i < num_pixels; i++)
+	{
+		block_pixels_half[i][0] = float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][0]);
+		block_pixels_q16[i][0] = (float)half_to_qlog16(block_pixels_half[i][0]);
+
+		block_pixels_half[i][1] = float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][1]);
+		block_pixels_q16[i][1] = (float)half_to_qlog16(block_pixels_half[i][1]);
+
+		block_pixels_half[i][2] = float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][2]);
+		block_pixels_q16[i][2] = (float)half_to_qlog16(block_pixels_half[i][2]);
+
+		block_pixels_q16[i][3] = 0.0f;
+	}
+
+	vec3F block_mean_color_q16(calc_mean(num_pixels, block_pixels_q16));
+
+	vec3F block_axis_q16(0.577350259f);
+
+	aabb3F color_box_q16(cInitExpand);
+
+	float l = 1e+30f, h = -1e+30f;
+	for (uint32_t i = 0; i < num_pixels; i++)
+	{
+		color_box_q16.expand(block_pixels_q16[i]);
+
+		vec3F k(vec3F(block_pixels_q16[i]) - block_mean_color_q16);
+		float kd = k.dot(block_axis_q16);
+
+		l = basisu::minimum<float>(l, kd);
+		h = basisu::maximum<float>(h, kd);
+	}
+
+	vec3F low_color_q16(interp_color(block_mean_color_q16, block_axis_q16, l, color_box_q16, color_box_q16));
+	vec3F high_color_q16(interp_color(block_mean_color_q16, block_axis_q16, h, color_box_q16, color_box_q16));
+
+	low_color_q16.clamp(0.0f, MAX_QLOG16_VAL);
+	high_color_q16.clamp(0.0f, MAX_QLOG16_VAL);
+
+	vec3F diff(high_color_q16 - low_color_q16);
+	float s_q16 = diff.dot(block_axis_q16) * block_axis_q16[0];
+
+	uint8_t trial_blk_endpoints[NUM_MODE7_ENDPOINTS];
+	uint8_t trial_blk_weights[16];
+	uint32_t trial_best_submode = 0;
+
+	clear_obj(trial_blk_endpoints);
+	clear_obj(trial_blk_weights);
+
+	double trial_blk_error = 1e+30f;
+
+	bool did_improve = try_mode7(num_pixels, trial_blk_endpoints, trial_blk_weights, trial_blk_error, trial_best_submode,
+		high_color_q16, ceilf(s_q16),
+		block_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range);
+
+	// If we couldn't find ANY usable solution due to endpoint quantization, just return. There's nothing we can do.
+	if (!did_improve)
+	{
+		return cur_block_error;
+	}
+
+	// Did the solution improve?
+	if (trial_blk_error < cur_block_error)
+	{
+		cur_block_error = trial_blk_error;
+		memcpy(blk_endpoints, trial_blk_endpoints, NUM_MODE7_ENDPOINTS);
+		memcpy(blk_weights, trial_blk_weights, num_pixels);
+		best_submode = trial_best_submode;
+	}
+
+	const float one_over_num_pixels = 1.0f / (float)num_pixels;
+
+	const uint32_t NUM_TRIALS = 2;
+	for (uint32_t trial = 0; trial < NUM_TRIALS; trial++)
+	{
+		// Given a set of selectors and S, try to compute a better high color
+		vec3F new_high_color_q16(block_mean_color_q16);
+
+		int e[2][3];
+		int cur_s = 0;
+		if (!decode_mode7_to_qlog12(trial_blk_endpoints, e, &cur_s, ise_endpoint_range))
+			break;
+
+		cur_s <<= 4;
+
+		for (uint32_t i = 0; i < num_pixels; i++)
+		{
+			uint32_t astc_sel = trial_blk_weights[i];
+			float lerp = g_ise_weight_lerps[ise_weight_range][astc_sel + 1] * (1.0f / 64.0f);
+
+			float k = (float)cur_s * (1.0f - lerp) * one_over_num_pixels;
+			new_high_color_q16[0] += k;
+			new_high_color_q16[1] += k;
+			new_high_color_q16[2] += k;
+		}
+
+		bool improved = try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
+			new_high_color_q16, (float)cur_s,
+			block_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range);
+
+		if (improved)
+		{
+			memcpy(trial_blk_endpoints, blk_endpoints, NUM_MODE7_ENDPOINTS);
+			memcpy(trial_blk_weights, blk_weights, num_pixels);
+		}
+
+		// Given a set of selectors and a high color, try to compute a better S.
+		float t = 0.0f;
+
+		for (uint32_t i = 0; i < num_pixels; i++)
+		{
+			uint32_t astc_sel = trial_blk_weights[i];
+			float lerp = g_ise_weight_lerps[ise_weight_range][astc_sel + 1] * (1.0f / 64.0f);
+
+			t += (1.0f) - lerp;
+		}
+
+		t *= one_over_num_pixels;
+
+		//int e[2][3];
+		if (!decode_mode7_to_qlog12(trial_blk_endpoints, e, nullptr, ise_endpoint_range))
+			break;
+
+		vec3F cur_h_q16((float)(e[1][0] << 4), (float)(e[1][1] << 4), (float)(e[1][2] << 4));
+
+		if (fabs(t) > .0000125f)
+		{
+			float s_r = (cur_h_q16[0] - block_mean_color_q16[0]) / t;
+			float s_g = (cur_h_q16[1] - block_mean_color_q16[1]) / t;
+			float s_b = (cur_h_q16[2] - block_mean_color_q16[2]) / t;
+
+			// TODO: gather statistics on these
+			if (try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
+				cur_h_q16, ceilf(s_r),
+				block_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range))
+			{
+				improved = true;
+			}
+
+			if (try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
+				cur_h_q16, ceilf(s_g),
+				block_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range))
+			{
+				improved = true;
+			}
+
+			if (try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
+				cur_h_q16, ceilf(s_b),
+				block_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range))
+			{
+				improved = true;
+			}
+
+			if (try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
+				cur_h_q16, ceilf((s_r + s_g + s_b) / 3.0f),
+				block_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range))
+			{
+				improved = true;
+			}
+		}
+
+		if (!improved)
+			break;
+
+		memcpy(trial_blk_endpoints, blk_endpoints, NUM_MODE7_ENDPOINTS);
+		memcpy(trial_blk_weights, blk_weights, num_pixels);
+
+	} // trial
+
+	return cur_block_error;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+static bool pack_solid(const vec4F* pBlock_linear_colors, basisu::vector<astc_hdr_pack_results>& all_results, const astc_hdr_codec_options& coptions) // Appends one solid-color candidate for the 16-texel block to all_results; returns true only when all 16 texels have equal RGB.
+{
+	float r = 0.0f, g = 0.0f, b = 0.0f;
+
+	const float LOG_BIAS = .125f; // Added before log2f() (so a 0 component stays finite) and subtracted again after averaging.
+
+	bool solid_block = true;
+	for (uint32_t i = 0; i < 16; i++) // One pass: compare every texel against texel 0 and accumulate per-channel log2 sums.
+	{
+		if ((pBlock_linear_colors[0][0] != pBlock_linear_colors[i][0]) ||
+			(pBlock_linear_colors[0][1] != pBlock_linear_colors[i][1]) ||
+			(pBlock_linear_colors[0][2] != pBlock_linear_colors[i][2]))
+		{
+			solid_block = false;
+		}
+
+		r += log2f(pBlock_linear_colors[i][0] + LOG_BIAS);
+		g += log2f(pBlock_linear_colors[i][1] + LOG_BIAS);
+		b += log2f(pBlock_linear_colors[i][2] + LOG_BIAS);
+	}
+
+	if (solid_block) // Truly solid: use texel 0's color directly and discard the log sums.
+	{
+		r = pBlock_linear_colors[0][0];
+		g = pBlock_linear_colors[0][1];
+		b = pBlock_linear_colors[0][2];
+	}
+	else // Not solid: use the per-channel mean taken in log2 space (sum / 16, then 2^x minus the bias), floored at 0.
+	{
+		r = maximum<float>(0.0f, powf(2.0f, r * (1.0f / 16.0f)) - LOG_BIAS);
+		g = maximum<float>(0.0f, powf(2.0f, g * (1.0f / 16.0f)) - LOG_BIAS);
+		b = maximum<float>(0.0f, powf(2.0f, b * (1.0f / 16.0f)) - LOG_BIAS);
+
+		// for safety
+		r = minimum<float>(r, MAX_HALF_FLOAT);
+		g = minimum<float>(g, MAX_HALF_FLOAT);
+		b = minimum<float>(b, MAX_HALF_FLOAT);
+	}
+
+	half_float rh = float_to_half_non_neg_no_nan_inf(r), gh = float_to_half_non_neg_no_nan_inf(g), bh = float_to_half_non_neg_no_nan_inf(b), ah = float_to_half_non_neg_no_nan_inf(1.0f); // Alpha is fixed at 1.0.
+
+	astc_hdr_pack_results results;
+	results.clear();
+
+	uint8_t* packed_blk = (uint8_t*)&results.m_solid_blk; // The solid block is filled in byte by byte below.
+	results.m_is_solid = true;
+
+	packed_blk[0] = 0b11111100; // Bytes 0-7 are a fixed header: 0xFC then seven 0xFF bytes. NOTE(review): presumably the ASTC HDR void-extent (constant color) header - confirm against the ASTC spec.
+	packed_blk[1] = 255;
+	packed_blk[2] = 255;
+	packed_blk[3] = 255;
+	packed_blk[4] = 255;
+	packed_blk[5] = 255;
+	packed_blk[6] = 255;
+	packed_blk[7] = 255;
+
+	packed_blk[8] = (uint8_t)rh; // Bytes 8-15: R, G, B, A as 16-bit values, low byte first.
+	packed_blk[9] = (uint8_t)(rh >> 8);
+	packed_blk[10] = (uint8_t)gh;
+	packed_blk[11] = (uint8_t)(gh >> 8);
+	packed_blk[12] = (uint8_t)bh;
+	packed_blk[13] = (uint8_t)(bh >> 8);
+	packed_blk[14] = (uint8_t)ah;
+	packed_blk[15] = (uint8_t)(ah >> 8);
+
+	results.m_best_block_error = 0; // Left at 0 when the block really is solid.
+
+	if (!solid_block) // Otherwise sum the weighted squared error of the chosen color against each source texel.
+	{
+		const float R_WEIGHT = coptions.m_r_err_scale;
+		const float G_WEIGHT = coptions.m_g_err_scale;
+
+		// This MUST match how errors are computed in eval_selectors().
+		for (uint32_t i = 0; i < 16; i++)
+		{
+			half_float dr = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][0]), dg = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][1]), db = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][2]);
+			double rd = q(rh) - q(dr); // Differences are taken after mapping both half values through q().
+			double gd = q(gh) - q(dg);
+			double bd = q(bh) - q(db);
+
+			double e = R_WEIGHT * (rd * rd) + G_WEIGHT * (gd * gd) + bd * bd; // Blue uses an implicit weight of 1.
+
+			results.m_best_block_error += e;
+		}
+	}
+
+	const half_float hc[3] = { rh, gh, bh };
+
+	bc6h_enc_block_solid_color(&results.m_bc6h_block, hc); // Fill in the BC6H block from the same half-float color.
+
+	all_results.push_back(results); // A result is appended even when the block is not solid.
+
+	return solid_block;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+static void pack_mode11( // Tries 1-partition mode 11 encodings over a span of weight ISE ranges; each usable candidate (with its BC6H transcode) is appended to all_results.
+	const vec4F* pBlock_linear_colors, // Source texels; 16 of them are passed to the encoder.
+	basisu::vector<astc_hdr_pack_results>& all_results, // Output list; only appended to here.
+	const astc_hdr_codec_options& coptions, 
+	uint32_t first_weight_ise_range, uint32_t last_weight_ise_range, bool constrain_ise_weight8_selectors) // Range bounds are inclusive; the flag is forwarded to the encoder and recorded in the result.
+{
+	uint8_t trial_endpoints[NUM_MODE11_ENDPOINTS], trial_weights[16];
+	uint32_t trial_submode11 = 0;
+
+	clear_obj(trial_endpoints);
+	clear_obj(trial_weights);
+		
+	for (uint32_t weight_ise_range = first_weight_ise_range; weight_ise_range <= last_weight_ise_range; weight_ise_range++)
+	{
+		const bool direct_only = coptions.m_mode11_direct_only;
+		
+		uint32_t endpoint_ise_range = astc_helpers::BISE_256_LEVELS; // Endpoint range depends on the weight range: BISE_192_LEVELS with 16 weight levels, BISE_256_LEVELS below that.
+		if (weight_ise_range == astc_helpers::BISE_16_LEVELS)
+			endpoint_ise_range = astc_helpers::BISE_192_LEVELS;
+		else
+		{
+			assert(weight_ise_range < astc_helpers::BISE_16_LEVELS);
+		}
+				
+		double trial_error = encode_astc_hdr_block_mode_11(16, pBlock_linear_colors, weight_ise_range, trial_submode11, 1e+30f, trial_endpoints, trial_weights, coptions, direct_only, 
+			endpoint_ise_range, coptions.m_mode11_uber_mode && (weight_ise_range >= astc_helpers::BISE_4_LEVELS) && coptions.m_allow_uber_mode, constrain_ise_weight8_selectors, coptions.m_first_mode11_submode, coptions.m_last_mode11_submode); // Uber mode is only requested for weight ranges >= BISE_4_LEVELS.
+
+		if (trial_error < 1e+30f) // Keep the candidate only if the error came back below the 1e+30f starting value.
+		{
+			astc_hdr_pack_results results;
+			results.clear();
+
+			results.m_best_block_error = trial_error;
+
+			results.m_best_submodes[0] = trial_submode11;
+			results.m_constrained_weights = constrain_ise_weight8_selectors;
+						
+			results.m_best_blk.m_num_partitions = 1;
+			results.m_best_blk.m_color_endpoint_modes[0] = 11;
+			results.m_best_blk.m_weight_ise_range = weight_ise_range;
+			results.m_best_blk.m_endpoint_ise_range = endpoint_ise_range;
+			
+			memcpy(results.m_best_blk.m_endpoints, trial_endpoints, NUM_MODE11_ENDPOINTS);
+			memcpy(results.m_best_blk.m_weights, trial_weights, 16);
+
+#ifdef _DEBUG
+			{ // Debug-only check: decode the block just built and assert that the recomputed error equals the encoder's reported error.
+				half_float block_pixels_half[16][3];
+
+				vec4F block_pixels_q16[16]; // NOTE(review): declared but never used in this debug block.
+				for (uint32_t i = 0; i < 16; i++)
+				{
+					block_pixels_half[i][0] = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][0]);
+					block_pixels_half[i][1] = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][1]);
+					block_pixels_half[i][2] = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][2]);
+				}
+				
+				half_float unpacked_astc_blk_rgba[4][4][4];
+				bool res = astc_helpers::decode_block(results.m_best_blk, unpacked_astc_blk_rgba, 4, 4, astc_helpers::cDecodeModeHDR16);
+				assert(res);
+
+				half_float unpacked_astc_blk_rgb[4][4][3]; // Drop alpha so the layout matches the RGB-only source array.
+				for (uint32_t y = 0; y < 4; y++)
+					for (uint32_t x = 0; x < 4; x++)
+						for (uint32_t c = 0; c < 3; c++)
+							unpacked_astc_blk_rgb[y][x][c] = unpacked_astc_blk_rgba[y][x][c];
+
+				double cmp_err = compute_block_error(&block_pixels_half[0][0], &unpacked_astc_blk_rgb[0][0][0], coptions);
+				assert(results.m_best_block_error == cmp_err);
+			}
+#endif
+
+			// transcode to BC6H
+			assert(results.m_best_blk.m_color_endpoint_modes[0] == 11);
+			
+			// Get qlog12 endpoints
+			int e[2][3]; // Indexed [endpoint][channel].
+			bool success = decode_mode11_to_qlog12(results.m_best_blk.m_endpoints, e, results.m_best_blk.m_endpoint_ise_range);
+			assert(success);
+			BASISU_NOTE_UNUSED(success);
+
+			// Transform endpoints to half float
+			half_float h_e[3][2] = // Transposed relative to e: indexed [channel][endpoint].
+			{
+				{ qlog_to_half(e[0][0], 12), qlog_to_half(e[1][0], 12) },
+				{ qlog_to_half(e[0][1], 12), qlog_to_half(e[1][1], 12) },
+				{ qlog_to_half(e[0][2], 12), qlog_to_half(e[1][2], 12) }
+			};
+
+			// Transcode to bc6h
+			success = transcode_bc6h_1subset(h_e, results.m_best_blk, results.m_bc6h_block); // Failure is only caught by the assert below; the result is appended regardless.
+			assert(success);
+
+			all_results.push_back(results);
+		}
+	}
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+static void pack_mode7_single_part(const vec4F* pBlock_linear_colors, basisu::vector<astc_hdr_pack_results>& all_results, const astc_hdr_codec_options& coptions) // Tries 1-partition mode 7 encodings over coptions' mode 7 weight ISE ranges (inclusive); usable candidates and their BC6H transcodes are appended to all_results.
+{
+	uint8_t trial_endpoints[NUM_MODE7_ENDPOINTS], trial_weights[16];
+	uint32_t trial_submode7 = 0;
+
+	clear_obj(trial_endpoints);
+	clear_obj(trial_weights);
+
+	for (uint32_t weight_ise_range = coptions.m_first_mode7_part1_weight_ise_range; weight_ise_range <= coptions.m_last_mode7_part1_weight_ise_range; weight_ise_range++)
+	{
+		const uint32_t ise_endpoint_range = astc_helpers::BISE_256_LEVELS; // Same endpoint range for every weight range tried here.
+
+		double trial_error = encode_astc_hdr_block_mode_7(16, pBlock_linear_colors, weight_ise_range, trial_submode7, 1e+30f, trial_endpoints, trial_weights, coptions, ise_endpoint_range);
+
+		if (trial_error < 1e+30f) // Keep the candidate only if the error came back below the 1e+30f starting value.
+		{
+			astc_hdr_pack_results results;
+			results.clear();
+
+			results.m_best_block_error = trial_error;
+
+			results.m_best_submodes[0] = trial_submode7;
+			
+			results.m_best_blk.m_num_partitions = 1;
+			results.m_best_blk.m_color_endpoint_modes[0] = 7;
+			results.m_best_blk.m_weight_ise_range = weight_ise_range;
+			results.m_best_blk.m_endpoint_ise_range = ise_endpoint_range;
+			
+			memcpy(results.m_best_blk.m_endpoints, trial_endpoints, NUM_MODE7_ENDPOINTS);
+			memcpy(results.m_best_blk.m_weights, trial_weights, 16);
+
+			// transcode to BC6H
+			assert(results.m_best_blk.m_color_endpoint_modes[0] == 7);
+			
+			// Get qlog12 endpoints
+			int e[2][3]; // Indexed [endpoint][channel].
+			if (!decode_mode7_to_qlog12(results.m_best_blk.m_endpoints, e, nullptr, results.m_best_blk.m_endpoint_ise_range))
+				continue; // Endpoints failed to decode: drop this candidate and move on to the next weight range.
+
+			// Transform endpoints to half float
+			half_float h_e[3][2] = // Transposed relative to e: indexed [channel][endpoint].
+			{
+				{ qlog_to_half(e[0][0], 12), qlog_to_half(e[1][0], 12) },
+				{ qlog_to_half(e[0][1], 12), qlog_to_half(e[1][1], 12) },
+				{ qlog_to_half(e[0][2], 12), qlog_to_half(e[1][2], 12) }
+			};
+
+			// Transcode to bc6h
+			bool status = transcode_bc6h_1subset(h_e, results.m_best_blk, results.m_bc6h_block); // Failure is only caught by the assert below; the result is appended regardless.
+			assert(status);
+			(void)status;
+
+			all_results.push_back(results);
+		}
+	}
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+static bool estimate_partition2(const vec4F* pBlock_pixels, int* pBest_parts, uint32_t num_best_parts) // Splits the 16 texels into two clusters, then writes the indices of the num_best_parts most similar common 2-subset patterns to pBest_parts (most similar first). Returns false if either cluster ends up empty.
+{
+	assert(num_best_parts <= basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2); // NOTE(review): this bound is only asserted; a larger value would make the final loop below index outside part_similarity in builds without asserts.
+
+	vec3F training_vecs[16], mean(0.0f);
+
+	for (uint32_t i = 0; i < 16; i++)
+	{
+		vec3F& v = training_vecs[i];
+
+		v[0] = (float)float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][0]); // NOTE(review): half_float is bit-shifted like an integer elsewhere in this file, so this cast appears to convert the half's bit pattern, not its numeric value - confirm that is intended.
+		v[1] = (float)float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][1]);
+		v[2] = (float)float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][2]);
+
+		mean += v;
+	}
+	mean *= (1.0f / 16.0f);
+
+	vec3F cluster_centroids[2] = { mean - vec3F(.1f), mean + vec3F(.1f) }; // Seed the two centroids just below and just above the mean.
+
+	uint32_t cluster_pixels[2][16];
+	uint32_t num_cluster_pixels[2];
+	vec3F new_cluster_means[2];
+
+	for (uint32_t s = 0; s < 4; s++) // Four rounds of: assign each texel to the nearer centroid, then recompute both centroids.
+	{
+		num_cluster_pixels[0] = 0;
+		num_cluster_pixels[1] = 0;
+
+		new_cluster_means[0].clear();
+		new_cluster_means[1].clear();
+
+		for (uint32_t i = 0; i < 16; i++)
+		{
+			float d0 = training_vecs[i].squared_distance(cluster_centroids[0]);
+			float d1 = training_vecs[i].squared_distance(cluster_centroids[1]);
+
+			if (d0 < d1) // Equal distances fall through to cluster 1.
+			{
+				cluster_pixels[0][num_cluster_pixels[0]] = i;
+				new_cluster_means[0] += training_vecs[i];
+				num_cluster_pixels[0]++;
+			}
+			else
+			{
+				cluster_pixels[1][num_cluster_pixels[1]] = i;
+				new_cluster_means[1] += training_vecs[i];
+				num_cluster_pixels[1]++;
+			}
+		}
+
+		if (!num_cluster_pixels[0] || !num_cluster_pixels[1]) // One cluster is empty: no 2-way split exists, and this also avoids dividing by zero below.
+			return false;
+
+		cluster_centroids[0] = new_cluster_means[0] / (float)num_cluster_pixels[0];
+		cluster_centroids[1] = new_cluster_means[1] / (float)num_cluster_pixels[1];
+	}
+
+	int desired_parts[4][4]; // [y][x]
+	for (uint32_t p = 0; p < 2; p++) // Every texel was assigned to exactly one cluster in the last round, so all 16 entries get written.
+	{
+		for (uint32_t i = 0; i < num_cluster_pixels[p]; i++)
+		{
+			const uint32_t pix_index = cluster_pixels[p][i];
+
+			desired_parts[pix_index >> 2][pix_index & 3] = p; // pix_index is y * 4 + x.
+		}
+	}
+
+	uint32_t part_similarity[basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2];
+
+	for (uint32_t part_index = 0; part_index < basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2; part_index++)
+	{
+		const uint32_t bc7_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_bc7;
+
+		int total_sim_non_inv = 0;
+		int total_sim_inv = 0;
+
+		for (uint32_t y = 0; y < 4; y++)
+		{
+			for (uint32_t x = 0; x < 4; x++)
+			{
+				int part = basist::g_bc7_partition2[16 * bc7_pattern + x + y * 4];
+
+				if (part == desired_parts[y][x])
+					total_sim_non_inv++;
+
+				if ((part ^ 1) == desired_parts[y][x]) // Also count matches with the two subset labels swapped.
+					total_sim_inv++;
+			}
+		}
+
+		int total_sim = maximum(total_sim_non_inv, total_sim_inv);
+
+		part_similarity[part_index] = (total_sim << 8) | part_index; // Similarity goes above the index so a plain integer sort orders by similarity first.
+
+	} // part_index;
+
+	std::sort(part_similarity, part_similarity + basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2); // Ascending, so the best matches end up last.
+
+	for (uint32_t i = 0; i < num_best_parts; i++)
+		pBest_parts[i] = part_similarity[(basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2 - 1) - i] & 0xFF; // Read from the end; the low 8 bits recover the pattern index.
+
+	return true;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+static void pack_mode7_2part(const vec4F* pBlock_linear_colors, basisu::vector<astc_hdr_pack_results>& all_results, const astc_hdr_codec_options& coptions, // Tries 2-partition mode 7 encodings; every usable candidate (with its BC6H transcode) is appended to all_results.
+	int num_estimated_partitions, const int *pEstimated_partitions, // Nonzero count: try only the listed pattern indices. Zero: try every pattern enabled in coptions.m_mode7_part2_part_masks.
+	uint32_t first_weight_ise_range, uint32_t last_weight_ise_range) // Inclusive bounds on the weight ISE range.
+{
+	assert(coptions.m_mode7_part2_part_masks);
+
+	astc_helpers::log_astc_block trial_blk; // Fields common to every candidate are set once here; the rest are filled in per candidate.
+	clear_obj(trial_blk);
+	trial_blk.m_grid_width = 4;
+	trial_blk.m_grid_height = 4;
+
+	trial_blk.m_num_partitions = 2;
+	trial_blk.m_color_endpoint_modes[0] = 7;
+	trial_blk.m_color_endpoint_modes[1] = 7;
+
+	uint32_t first_part_index = 0, last_part_index = basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2;
+		
+	if (num_estimated_partitions) // With estimates, the loop below iterates over positions in pEstimated_partitions instead of over pattern indices.
+	{
+		first_part_index = 0;
+		last_part_index = num_estimated_partitions;
+	}
+	
+	for (uint32_t part_index_iter = first_part_index; part_index_iter < last_part_index; ++part_index_iter)
+	{
+		uint32_t part_index;
+		if (num_estimated_partitions)
+		{
+			part_index = pEstimated_partitions[part_index_iter]; // The part mask is not consulted for estimated patterns.
+			assert(part_index < basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2);
+		}
+		else
+		{
+			part_index = part_index_iter;
+			if (((1U << part_index) & coptions.m_mode7_part2_part_masks) == 0) // Skip patterns whose bit is clear in the mask.
+				continue;
+		}
+								
+		const uint32_t astc_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_astc;
+		const uint32_t bc7_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_bc7;
+		const bool invert_flag = basist::g_astc_bc7_common_partitions2[part_index].m_invert;
+
+		vec4F part_pixels[2][16]; // Per-subset texel lists, gathered in raster order below.
+		uint32_t pixel_part_index[4][4]; // [y][x]
+		uint32_t num_part_pixels[2] = { 0, 0 };
+
+		// Extract each subset's texels for this partition pattern
+		for (uint32_t y = 0; y < 4; y++)
+		{
+			for (uint32_t x = 0; x < 4; x++)
+			{
+				uint32_t part = basist::g_bc7_partition2[16 * bc7_pattern + x + y * 4];
+				if (invert_flag) // Swap the subset labels when the table entry is flagged as inverted.
+					part = 1 - part;
+
+				pixel_part_index[y][x] = part;
+				part_pixels[part][num_part_pixels[part]] = pBlock_linear_colors[x + y * 4];
+
+				num_part_pixels[part]++;
+			}
+		}
+
+		trial_blk.m_partition_id = astc_pattern;
+				
+		for (uint32_t weight_ise_range = first_weight_ise_range; weight_ise_range <= last_weight_ise_range; weight_ise_range++)
+		{
+			assert(weight_ise_range <= astc_helpers::BISE_8_LEVELS);
+
+			uint32_t ise_endpoint_range = astc_helpers::BISE_256_LEVELS; // Endpoint range by weight range: 5 levels -> 192, 6 -> 128, 8 -> 80, anything else -> 256.
+			if (weight_ise_range == astc_helpers::BISE_5_LEVELS)
+				ise_endpoint_range = astc_helpers::BISE_192_LEVELS;
+			else if (weight_ise_range == astc_helpers::BISE_6_LEVELS)
+				ise_endpoint_range = astc_helpers::BISE_128_LEVELS;
+			else if (weight_ise_range == astc_helpers::BISE_8_LEVELS)
+				ise_endpoint_range = astc_helpers::BISE_80_LEVELS;
+
+			uint8_t trial_endpoints[2][NUM_MODE7_ENDPOINTS], trial_weights[2][16];
+			uint32_t trial_submode7[2];
+
+			clear_obj(trial_endpoints);
+			clear_obj(trial_weights);
+			clear_obj(trial_submode7);
+
+			double total_trial_err = 0; // Each subset is encoded independently and the two errors are summed.
+			for (uint32_t pack_part_index = 0; pack_part_index < 2; pack_part_index++)
+			{
+				total_trial_err += encode_astc_hdr_block_mode_7(
+					num_part_pixels[pack_part_index], &part_pixels[pack_part_index][0],
+					weight_ise_range, trial_submode7[pack_part_index], 1e+30f,
+					&trial_endpoints[pack_part_index][0], &trial_weights[pack_part_index][0], coptions, ise_endpoint_range);
+
+			} // pack_part_index
+
+			if (total_trial_err < 1e+30f) // Reject the pair unless the summed error is below the 1e+30f starting value.
+			{
+				trial_blk.m_weight_ise_range = weight_ise_range;
+				trial_blk.m_endpoint_ise_range = ise_endpoint_range;
+
+				for (uint32_t pack_part_index = 0; pack_part_index < 2; pack_part_index++) // Subset 0's endpoints first, then subset 1's.
+					memcpy(&trial_blk.m_endpoints[pack_part_index * NUM_MODE7_ENDPOINTS], &trial_endpoints[pack_part_index][0], NUM_MODE7_ENDPOINTS);
+
+				uint32_t src_pixel_index[2] = { 0, 0 }; // Scatter per-subset weights back to raster order, consuming them in the order the texels were gathered.
+				for (uint32_t y = 0; y < 4; y++)
+				{
+					for (uint32_t x = 0; x < 4; x++)
+					{
+						uint32_t p = pixel_part_index[y][x];
+						trial_blk.m_weights[x + y * 4] = trial_weights[p][src_pixel_index[p]++];
+					}
+				}
+								
+				astc_hdr_pack_results results;
+				results.clear();
+
+				results.m_best_block_error = total_trial_err;
+				results.m_best_submodes[0] = trial_submode7[0];
+				results.m_best_submodes[1] = trial_submode7[1];
+				results.m_best_pat_index = part_index; // Index into the common-partition table; the ASTC pattern itself is stored in m_partition_id.
+
+				results.m_best_blk = trial_blk;
+
+				bool status = transcode_bc6h_2subsets(part_index, results.m_best_blk, results.m_bc6h_block); // Failure is only caught by the assert below; the result is appended regardless.
+				assert(status);
+				BASISU_NOTE_UNUSED(status);
+
+				all_results.push_back(results);
+			}
+
+		} // weight_ise_range
+
+	} // part_index
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+static void pack_mode11_2part(const vec4F* pBlock_linear_colors, basisu::vector<astc_hdr_pack_results>& all_results, const astc_hdr_codec_options& coptions, // Tries 2-partition mode 11 encodings over coptions' mode 11 2-part weight ISE ranges; usable candidates (with BC6H transcodes) are appended to all_results.
+	int num_estimated_partitions, const int* pEstimated_partitions) // Nonzero count: try only the listed pattern indices. Zero: try every pattern enabled in coptions.m_mode11_part2_part_masks.
+{
+	assert(coptions.m_mode11_part2_part_masks);
+
+	astc_helpers::log_astc_block trial_blk; // Fields common to every candidate are set once here; the rest are filled in per candidate.
+	clear_obj(trial_blk);
+	trial_blk.m_grid_width = 4;
+	trial_blk.m_grid_height = 4;
+
+	trial_blk.m_num_partitions = 2;
+	trial_blk.m_color_endpoint_modes[0] = 11;
+	trial_blk.m_color_endpoint_modes[1] = 11;
+			
+	uint32_t first_part_index = 0, last_part_index = basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2;
+
+	if (num_estimated_partitions) // With estimates, the loop below iterates over positions in pEstimated_partitions instead of over pattern indices.
+	{
+		first_part_index = 0;
+		last_part_index = num_estimated_partitions;
+	}
+
+	for (uint32_t part_index_iter = first_part_index; part_index_iter < last_part_index; ++part_index_iter)
+	{
+		uint32_t part_index;
+		if (num_estimated_partitions)
+		{
+			part_index = pEstimated_partitions[part_index_iter]; // The part mask is not consulted for estimated patterns.
+			assert(part_index < basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2);
+		}
+		else
+		{
+			part_index = part_index_iter;
+			if (((1U << part_index) & coptions.m_mode11_part2_part_masks) == 0) // Skip patterns whose bit is clear in the mask.
+				continue;
+		}
+
+		const uint32_t astc_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_astc;
+		const uint32_t bc7_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_bc7;
+		const bool invert_flag = basist::g_astc_bc7_common_partitions2[part_index].m_invert;
+
+		vec4F part_pixels[2][16]; // Per-subset texel lists, gathered in raster order below.
+		uint32_t pixel_part_index[4][4]; // [y][x]
+		uint32_t num_part_pixels[2] = { 0, 0 };
+
+		// Extract each subset's texels for this partition pattern
+		for (uint32_t y = 0; y < 4; y++)
+		{
+			for (uint32_t x = 0; x < 4; x++)
+			{
+				uint32_t part = basist::g_bc7_partition2[16 * bc7_pattern + x + y * 4];
+				if (invert_flag) // Swap the subset labels when the table entry is flagged as inverted.
+					part = 1 - part;
+
+				pixel_part_index[y][x] = part;
+				part_pixels[part][num_part_pixels[part]] = pBlock_linear_colors[x + y * 4];
+
+				num_part_pixels[part]++;
+			}
+		}
+				
+		trial_blk.m_partition_id = astc_pattern;
+						
+		for (uint32_t weight_ise_range = coptions.m_first_mode11_part2_weight_ise_range; weight_ise_range <= coptions.m_last_mode11_part2_weight_ise_range; weight_ise_range++)
+		{
+			bool direct_only = false; // Unlike the 1-partition path, coptions.m_mode11_direct_only is not consulted here.
+			uint32_t ise_endpoint_range = astc_helpers::BISE_64_LEVELS; // 64 endpoint levels by default; 40 when 4 weight levels are used.
+			if (weight_ise_range == astc_helpers::BISE_4_LEVELS)
+				ise_endpoint_range = astc_helpers::BISE_40_LEVELS;
+
+			uint8_t trial_endpoints[2][NUM_MODE11_ENDPOINTS], trial_weights[2][16];
+			uint32_t trial_submode11[2];
+
+			clear_obj(trial_endpoints); 
+			clear_obj(trial_weights);
+			clear_obj(trial_submode11);
+
+			double total_trial_err = 0; // Each subset is encoded independently and the two errors are summed.
+			for (uint32_t pack_part_index = 0; pack_part_index < 2; pack_part_index++)
+			{
+				total_trial_err += encode_astc_hdr_block_mode_11(
+					num_part_pixels[pack_part_index], &part_pixels[pack_part_index][0],
+					weight_ise_range, trial_submode11[pack_part_index], 1e+30f,
+					&trial_endpoints[pack_part_index][0], &trial_weights[pack_part_index][0], coptions,
+					direct_only, ise_endpoint_range, coptions.m_mode11_uber_mode && (weight_ise_range >= astc_helpers::BISE_4_LEVELS) && coptions.m_allow_uber_mode, false, // The trailing false sits in the slot where pack_mode11() passes constrain_ise_weight8_selectors.
+					coptions.m_first_mode11_submode, coptions.m_last_mode11_submode);
+
+			} // pack_part_index
+
+			if (total_trial_err < 1e+30f) // Reject the pair unless the summed error is below the 1e+30f starting value.
+			{
+				trial_blk.m_weight_ise_range = weight_ise_range;
+				trial_blk.m_endpoint_ise_range = ise_endpoint_range;
+
+				for (uint32_t pack_part_index = 0; pack_part_index < 2; pack_part_index++) // Subset 0's endpoints first, then subset 1's.
+					memcpy(&trial_blk.m_endpoints[pack_part_index * NUM_MODE11_ENDPOINTS], &trial_endpoints[pack_part_index][0], NUM_MODE11_ENDPOINTS);
+
+				uint32_t src_pixel_index[2] = { 0, 0 }; // Scatter per-subset weights back to raster order, consuming them in the order the texels were gathered.
+				for (uint32_t y = 0; y < 4; y++)
+				{
+					for (uint32_t x = 0; x < 4; x++)
+					{
+						uint32_t p = pixel_part_index[y][x];
+						trial_blk.m_weights[x + y * 4] = trial_weights[p][src_pixel_index[p]++];
+					}
+				}
+								
+				astc_hdr_pack_results results;
+				results.clear();
+
+				results.m_best_block_error = total_trial_err;
+				results.m_best_submodes[0] = trial_submode11[0];
+				results.m_best_submodes[1] = trial_submode11[1];
+				results.m_best_pat_index = part_index; // Index into the common-partition table; the ASTC pattern itself is stored in m_partition_id.
+
+				results.m_best_blk = trial_blk;
+
+				bool status = transcode_bc6h_2subsets(part_index, results.m_best_blk, results.m_bc6h_block); // Failure is only caught by the assert below; the result is appended regardless.
+				assert(status);
+				BASISU_NOTE_UNUSED(status);
+
+				all_results.push_back(results);
+			}
+
+		} // weight_ise_range
+
+	} // part_index
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+bool g_astc_hdr_enc_initialized; // Set by astc_hdr_enc_init(); checked by the encode/pack entry points. NOTE(review): plain bool with no synchronization visible here - confirm init is not raced by callers.
+
+void astc_hdr_enc_init() // One-time setup of the tables the encoder relies on; later calls return immediately.
+{
+	if (g_astc_hdr_enc_initialized)
+		return;
+
+	astc_hdr_core_init();
+
+	astc_helpers::init_tables(true);
+			
+	init_qlog_tables();
+
+	encode_astc_hdr_init();
+								
+	g_astc_hdr_enc_initialized = true; // Only set once every init step above has run.
+}
+
+bool astc_hdr_enc_block( // Validates one 16-pixel (4x4) RGB block, then collects every candidate encoding enabled in coptions into all_results. Returns false if the encoder is uninitialized or the input is invalid.
+	const float* pRGBPixels, // 16 RGB triples (48 floats); each value must be finite, >= 0 and <= MAX_HALF_FLOAT.
+	const astc_hdr_codec_options& coptions,
+	basisu::vector<astc_hdr_pack_results>& all_results) // Emptied first, then filled with the candidates found.
+{
+	assert(g_astc_hdr_enc_initialized);
+	if (!g_astc_hdr_enc_initialized)
+	{
+		// astc_hdr_enc_init() MUST be called first.
+		assert(0);
+		return false;
+	}
+
+	all_results.resize(0);
+				
+	vec4F block_linear_colors[16];
+
+	// Sanity check the input block.
+	for (uint32_t i = 0; i < 16; i++)
+	{
+		for (uint32_t j = 0; j < 3; j++)
+		{
+			float v = pRGBPixels[i * 3 + j];
+
+			if (std::isinf(v) || std::isnan(v))
+			{
+				// Input pixels cannot be NaN or +-Inf.
+				assert(0);
+				return false;
+			}
+
+			if (v < 0.0f)
+			{
+				// Input pixels cannot be signed.
+				assert(0);
+				return false;
+			}
+
+			if (v > MAX_HALF_FLOAT)
+			{
+				// Too large for half float.
+				assert(0);
+				return false;
+			}
+			
+			block_linear_colors[i][j] = v;
+		}
+		
+		block_linear_colors[i][3] = 1.0f; // The fourth component is always set to 1.
+	}
+
+	assert(coptions.m_use_solid || coptions.m_use_mode11 || coptions.m_use_mode7_part2 || coptions.m_use_mode7_part1 || coptions.m_use_mode11_part2);
+					
+	bool is_solid = false; // Stays false when solid packing is disabled, so the other modes are still tried.
+	if (coptions.m_use_solid)
+		is_solid = pack_solid(block_linear_colors, all_results, coptions);
+
+	if (!is_solid) // The remaining modes are skipped once the block is known to be solid.
+	{
+		if (coptions.m_use_mode11)
+		{
+			const size_t cur_num_results = all_results.size();
+
+			pack_mode11(block_linear_colors, all_results, coptions, coptions.m_first_mode11_weight_ise_range, coptions.m_last_mode11_weight_ise_range, false);
+
+			if (coptions.m_last_mode11_weight_ise_range == astc_helpers::BISE_16_LEVELS) // Extra pass at 16 weight levels with the constrained-selector flag set.
+			{
+				pack_mode11(block_linear_colors, all_results, coptions, astc_helpers::BISE_16_LEVELS, astc_helpers::BISE_16_LEVELS, true);
+			}
+
+			// If we couldn't get any mode 11 results at all, and we were restricted to just trying weight ISE range 8 (which required endpoint quantization) then 
+			// fall back to weight ISE range 7 (which doesn't need any endpoint quantization).
+			// This is to guarantee we always get at least 1 non-solid result.
+			if (all_results.size() == cur_num_results)
+			{
+				if (coptions.m_first_mode11_weight_ise_range == astc_helpers::BISE_16_LEVELS)
+				{
+					pack_mode11(block_linear_colors, all_results, coptions, astc_helpers::BISE_12_LEVELS, astc_helpers::BISE_12_LEVELS, false);
+				}
+			}
+		}
+				
+		if (coptions.m_use_mode7_part1)
+		{
+			// Mode 7 1-subset never requires endpoint quantization, so it cannot fail to find at least one usable solution.
+			pack_mode7_single_part(block_linear_colors, all_results, coptions);
+		}
+				
+		bool have_est = false;
+		int best_parts[basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2]; // Left uninitialized unless estimation succeeds; the packers are passed a count of 0 in that case.
+
+		if ((coptions.m_use_mode7_part2) || (coptions.m_use_mode11_part2)) // The estimate is computed once and shared by both 2-subset packers.
+		{
+			if (coptions.m_use_estimated_partitions)
+				have_est = estimate_partition2(block_linear_colors, best_parts, coptions.m_max_estimated_partitions);
+		}
+
+		if (coptions.m_use_mode7_part2)
+		{
+			const size_t cur_num_results = all_results.size();
+
+			pack_mode7_2part(block_linear_colors, all_results, coptions, have_est ? coptions.m_max_estimated_partitions : 0, best_parts, 
+				coptions.m_first_mode7_part2_weight_ise_range, coptions.m_last_mode7_part2_weight_ise_range);
+
+			// If we couldn't find any packable 2-subset mode 7 results at weight levels >= 5 levels (which always requires endpoint quant), then try falling back to
+			// 4 levels (BISE_4_LEVELS) which doesn't require endpoint quantization.
+			if (all_results.size() == cur_num_results)
+			{
+				if (coptions.m_first_mode7_part2_weight_ise_range >= astc_helpers::BISE_5_LEVELS)
+				{
+					pack_mode7_2part(block_linear_colors, all_results, coptions, have_est ? coptions.m_max_estimated_partitions : 0, best_parts, 
+						astc_helpers::BISE_4_LEVELS, astc_helpers::BISE_4_LEVELS);
+				}
+			}
+		}
+		
+		if (coptions.m_use_mode11_part2)
+		{
+			// This always requires endpoint quant, so it could fail to find any usable solutions.
+			pack_mode11_2part(block_linear_colors, all_results, coptions, have_est ? coptions.m_max_estimated_partitions : 0, best_parts);
+		}
+	}
+
+	if (coptions.m_refine_weights) // Optional pass over every collected candidate, including a solid one if present.
+	{
+		// TODO: Move this above, do it once only.
+		basist::half_float rgb_pixels_half[16 * 3];
+		for (uint32_t i = 0; i < 16; i++)
+		{
+			rgb_pixels_half[i * 3 + 0] = float_to_half_non_neg_no_nan_inf(pRGBPixels[i * 3 + 0]);
+			rgb_pixels_half[i * 3 + 1] = float_to_half_non_neg_no_nan_inf(pRGBPixels[i * 3 + 1]);
+			rgb_pixels_half[i * 3 + 2] = float_to_half_non_neg_no_nan_inf(pRGBPixels[i * 3 + 2]);
+		}
+
+		for (uint32_t i = 0; i < all_results.size(); i++)
+		{
+			bool status = astc_hdr_refine_weights(rgb_pixels_half, all_results[i], coptions, coptions.m_bc6h_err_weight, &all_results[i].m_improved_via_refinement_flag); // A refinement failure is only asserted, not returned to the caller.
+			assert(status);
+			BASISU_NOTE_UNUSED(status);
+		}
+	}
+
+	return true; // Reaching here means the input was valid; all_results may hold any number of candidates.
+}
+
+bool astc_hdr_pack_results_to_block(astc_blk& dst_blk, const astc_hdr_pack_results& results) // Writes the ASTC block for one pack result into dst_blk; returns false if the encoder is uninitialized or packing fails.
+{
+	assert(g_astc_hdr_enc_initialized);
+	if (!g_astc_hdr_enc_initialized)
+		return false;
+
+	if (results.m_is_solid)
+	{
+		memcpy(&dst_blk, &results.m_solid_blk, sizeof(results.m_solid_blk)); // Solid results already carry the finished block bytes.
+	}
+	else
+	{
+		bool status = astc_helpers::pack_astc_block((astc_helpers::astc_block&)dst_blk, results.m_best_blk); // Otherwise pack the logical block description into dst_blk.
+		if (!status)
+		{
+			assert(0);
+			return false;
+		}
+	}
+
+	return true;
+}
+
+// Refines a block's chosen weight indices, balancing BC6H and ASTC HDR error.
+bool astc_hdr_refine_weights(const half_float *pSource_block, astc_hdr_pack_results& cur_results, const astc_hdr_codec_options& coptions, float bc6h_weight, bool *pImproved_flag)
+{
+	if (pImproved_flag)
+		*pImproved_flag = false;
+
+	if (cur_results.m_is_solid)
+		return true;
+
+	const uint32_t total_weights = astc_helpers::get_ise_levels(cur_results.m_best_blk.m_weight_ise_range);
+
+	assert((total_weights >= 3) && (total_weights <= 16));
+
+	double best_err[4][4];
+	uint8_t best_weight[4][4];
+	for (uint32_t y = 0; y < 4; y++)
+	{
+		for (uint32_t x = 0; x < 4; x++)
+		{
+			best_err[y][x] = 1e+30f;
+			best_weight[y][x] = 0;
+		}
+	}
+
+	astc_hdr_pack_results temp_results;
+
+	const float c_weights[3] = { coptions.m_r_err_scale, coptions.m_g_err_scale, 1.0f };
+
+	for (uint32_t weight_index = 0; weight_index < total_weights; weight_index++)
+	{
+		temp_results = cur_results;
+		for (uint32_t i = 0; i < 16; i++)
+			temp_results.m_best_blk.m_weights[i] = (uint8_t)weight_index;
+		
+		half_float unpacked_astc_blk_rgba[4][4][4];
+		bool res = astc_helpers::decode_block(temp_results.m_best_blk, unpacked_astc_blk_rgba, 4, 4, astc_helpers::cDecodeModeHDR16);
+		assert(res);
+
+		basist::bc6h_block trial_bc6h_blk;
+		res = basist::astc_hdr_transcode_to_bc6h(temp_results.m_best_blk, trial_bc6h_blk);
+		assert(res);
+				
+		half_float unpacked_bc6h_blk[4][4][3];
+		res = unpack_bc6h(&trial_bc6h_blk, unpacked_bc6h_blk, false);
+		assert(res);
+		BASISU_NOTE_UNUSED(res);
+
+		for (uint32_t y = 0; y < 4; y++)
+		{
+			for (uint32_t x = 0; x < 4; x++)
+			{
+				double total_err = 0.0f;
+
+				for (uint32_t c = 0; c < 3; c++)
+				{
+					const half_float orig_c = pSource_block[(x + y * 4) * 3 + c];
+					const double orig_c_q = q(orig_c);
+					
+					const half_float astc_c = unpacked_astc_blk_rgba[y][x][c];
+					const double astc_c_q = q(astc_c);
+					const double astc_e = square(astc_c_q - orig_c_q) * c_weights[c];
+					
+					const half_float bc6h_c = unpacked_bc6h_blk[y][x][c];
+					const double bc6h_c_q = q(bc6h_c);
+					const double bc6h_e = square(bc6h_c_q - orig_c_q) * c_weights[c];
+
+					const double overall_err = astc_e * (1.0f - bc6h_weight) + bc6h_e * bc6h_weight;
+
+					total_err += overall_err;
+
+				} //  c
+
+				if (total_err < best_err[y][x])
+				{
+					best_err[y][x] = total_err;
+					best_weight[y][x] = (uint8_t)weight_index;
+				}
+
+			} // x
+		} // y
+
+	} // weight_index
+
+	bool any_changed = false;
+	for (uint32_t i = 0; i < 16; i++)
+	{
+		if (cur_results.m_best_blk.m_weights[i] != best_weight[i >> 2][i & 3])
+		{
+			any_changed = true;
+			break;
+		}
+	}
+
+	if (any_changed)
+	{
+		memcpy(cur_results.m_best_blk.m_weights, best_weight, 16);
+
+		{
+			bool res = basist::astc_hdr_transcode_to_bc6h(cur_results.m_best_blk, cur_results.m_bc6h_block);
+			assert(res);
+			BASISU_NOTE_UNUSED(res);
+
+			half_float unpacked_astc_blk_rgba[4][4][4];
+			res = astc_helpers::decode_block(cur_results.m_best_blk, unpacked_astc_blk_rgba, 4, 4, astc_helpers::cDecodeModeHDR16);
+			assert(res);
+
+			half_float unpacked_astc_blk_rgb[4][4][3];
+			for (uint32_t y = 0; y < 4; y++)
+				for (uint32_t x = 0; x < 4; x++)
+					for (uint32_t c = 0; c < 3; c++)
+						unpacked_astc_blk_rgb[y][x][c] = unpacked_astc_blk_rgba[y][x][c];
+
+			cur_results.m_best_block_error = compute_block_error(pSource_block, &unpacked_astc_blk_rgb[0][0][0], coptions);
+		}
+
+		if (pImproved_flag)
+			*pImproved_flag = true;
+	}
+
+	return true;
+}
+
+void astc_hdr_block_stats::update(const astc_hdr_pack_results& log_blk)
+{
+	std::lock_guard<std::mutex> lck(m_mutex);
+
+	m_total_blocks++;
+
+	if (log_blk.m_improved_via_refinement_flag)
+		m_total_refined++;
+
+	if (log_blk.m_is_solid)
+	{
+		m_total_solid++;
+	}
+	else
+	{
+		int best_weight_range = log_blk.m_best_blk.m_weight_ise_range;
+
+		if (log_blk.m_best_blk.m_color_endpoint_modes[0] == 7)
+		{
+			m_mode7_submode_hist[bounds_check(log_blk.m_best_submodes[0], 0U, 6U)]++;
+
+			if (log_blk.m_best_blk.m_num_partitions == 2)
+			{
+				m_total_mode7_2part++;
+
+				m_mode7_submode_hist[bounds_check(log_blk.m_best_submodes[1], 0U, 6U)]++;
+				m_total_2part++;
+
+				m_weight_range_hist_7_2part[bounds_check(best_weight_range, 0, 11)]++;
+
+				m_part_hist[bounds_check(log_blk.m_best_pat_index, 0U, 32U)]++;
+			}
+			else
+			{
+				m_total_mode7_1part++;
+
+				m_weight_range_hist_7[bounds_check(best_weight_range, 0, 11)]++;
+			}
+		}
+		else
+		{
+			m_mode11_submode_hist[bounds_check(log_blk.m_best_submodes[0], 0U, 9U)]++;
+			if (log_blk.m_constrained_weights)
+				m_total_mode11_1part_constrained_weights++;
+
+			if (log_blk.m_best_blk.m_num_partitions == 2)
+			{
+				m_total_mode11_2part++;
+
+				m_mode11_submode_hist[bounds_check(log_blk.m_best_submodes[1], 0U, 9U)]++;
+				m_total_2part++;
+
+				m_weight_range_hist_11_2part[bounds_check(best_weight_range, 0, 11)]++;
+
+				m_part_hist[bounds_check(log_blk.m_best_pat_index, 0U, 32U)]++;
+			}
+			else
+			{
+				m_total_mode11_1part++;
+
+				m_weight_range_hist_11[bounds_check(best_weight_range, 0, 11)]++;
+			}
+		}
+	}
+}
+
+void astc_hdr_block_stats::print()
+{
+	std::lock_guard<std::mutex> lck(m_mutex);
+
+	assert(m_total_blocks);
+	if (!m_total_blocks)
+		return;
+
+	printf("\nLow-level ASTC Encoder Statistics:\n");
+	printf("Total blocks: %u\n", m_total_blocks);
+	printf("Total solid: %u %3.2f%%\n", m_total_solid, (m_total_solid * 100.0f) / m_total_blocks);
+	printf("Total refined: %u %3.2f%%\n", m_total_refined, (m_total_refined * 100.0f) / m_total_blocks);
+
+	printf("Total mode 11, 1 partition: %u %3.2f%%\n", m_total_mode11_1part, (m_total_mode11_1part * 100.0f) / m_total_blocks);
+	printf("Total mode 11, 1 partition, constrained weights: %u %3.2f%%\n", m_total_mode11_1part_constrained_weights, (m_total_mode11_1part_constrained_weights * 100.0f) / m_total_blocks);
+	printf("Total mode 11, 2 partition: %u %3.2f%%\n", m_total_mode11_2part, (m_total_mode11_2part * 100.0f) / m_total_blocks);
+
+	printf("Total mode 7, 1 partition: %u %3.2f%%\n", m_total_mode7_1part, (m_total_mode7_1part * 100.0f) / m_total_blocks);
+	printf("Total mode 7, 2 partition: %u %3.2f%%\n", m_total_mode7_2part, (m_total_mode7_2part * 100.0f) / m_total_blocks);
+
+	printf("Total 2 partitions: %u %3.2f%%\n", m_total_2part, (m_total_2part * 100.0f) / m_total_blocks);
+	printf("\n");
+
+	printf("ISE texel weight range histogram mode 11:\n");
+	for (uint32_t i = 1; i <= MODE11_LAST_ISE_RANGE; i++)
+		printf("%u %u\n", i, m_weight_range_hist_11[i]);
+	printf("\n");
+
+	printf("ISE texel weight range histogram mode 11, 2 partition:\n");
+	for (uint32_t i = 1; i <= MODE11_PART2_LAST_ISE_RANGE; i++)
+		printf("%u %u\n", i, m_weight_range_hist_11_2part[i]);
+	printf("\n");
+
+	printf("ISE texel weight range histogram mode 7:\n");
+	for (uint32_t i = 1; i <= MODE7_PART1_LAST_ISE_RANGE; i++)
+		printf("%u %u\n", i, m_weight_range_hist_7[i]);
+	printf("\n");
+
+	printf("ISE texel weight range histogram mode 7, 2 partition:\n");
+	for (uint32_t i = 1; i <= MODE7_PART2_LAST_ISE_RANGE; i++)
+		printf("%u %u\n", i, m_weight_range_hist_7_2part[i]);
+	printf("\n");
+
+	printf("Mode 11 submode histogram:\n");
+	for (uint32_t i = 0; i <= MODE11_TOTAL_SUBMODES; i++) // +1 because of the extra direct encoding
+		printf("%u %u\n", i, m_mode11_submode_hist[i]);
+	printf("\n");
+
+	printf("Mode 7 submode histogram:\n");
+	for (uint32_t i = 0; i < MODE7_TOTAL_SUBMODES; i++)
+		printf("%u %u\n", i, m_mode7_submode_hist[i]);
+	printf("\n");
+
+	printf("Partition pattern table usage histogram:\n");
+	for (uint32_t i = 0; i < basist::TOTAL_ASTC_BC7_COMMON_PARTITIONS2; i++)
+		printf("%u:%u ", i, m_part_hist[i]);
+	printf("\n\n");
+}
+
+} // namespace basisu
+
diff --git a/thirdparty/basis_universal/encoder/basisu_astc_hdr_enc.h b/thirdparty/basis_universal/encoder/basisu_astc_hdr_enc.h
new file mode 100644
index 000000000000..ee122ff7cee9
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_astc_hdr_enc.h
@@ -0,0 +1,224 @@
+// basisu_astc_hdr_enc.h
+#pragma once
+#include "basisu_enc.h"
+#include "basisu_gpu_texture.h"
+#include "../transcoder/basisu_astc_helpers.h"
+#include "../transcoder/basisu_astc_hdr_core.h"
+
+namespace basisu
+{
+	// This MUST be called before encoding any blocks.
+	void astc_hdr_enc_init();
+
+	const uint32_t MODE11_FIRST_ISE_RANGE = astc_helpers::BISE_3_LEVELS, MODE11_LAST_ISE_RANGE = astc_helpers::BISE_16_LEVELS;
+	const uint32_t MODE7_PART1_FIRST_ISE_RANGE = astc_helpers::BISE_3_LEVELS, MODE7_PART1_LAST_ISE_RANGE = astc_helpers::BISE_16_LEVELS;
+	const uint32_t MODE7_PART2_FIRST_ISE_RANGE = astc_helpers::BISE_3_LEVELS, MODE7_PART2_LAST_ISE_RANGE = astc_helpers::BISE_8_LEVELS;
+	const uint32_t MODE11_PART2_FIRST_ISE_RANGE = astc_helpers::BISE_3_LEVELS, MODE11_PART2_LAST_ISE_RANGE = astc_helpers::BISE_4_LEVELS;
+	const uint32_t MODE11_TOTAL_SUBMODES = 8; // plus an extra hidden, directly encoded submode, so really 9 (see tables 99/100 of the ASTC spec)
+	const uint32_t MODE7_TOTAL_SUBMODES = 6;
+		
+	struct astc_hdr_codec_options
+	{
+		float m_bc6h_err_weight;
+
+		bool m_use_solid;
+
+		bool m_use_mode11;
+		bool m_mode11_uber_mode;
+		uint32_t m_first_mode11_weight_ise_range;
+		uint32_t m_last_mode11_weight_ise_range;
+		bool m_mode11_direct_only;
+		int32_t m_first_mode11_submode;
+		int32_t m_last_mode11_submode;
+
+		bool m_use_mode7_part1;
+		uint32_t m_first_mode7_part1_weight_ise_range;
+		uint32_t m_last_mode7_part1_weight_ise_range;
+
+		bool m_use_mode7_part2;
+		uint32_t m_mode7_part2_part_masks;
+		uint32_t m_first_mode7_part2_weight_ise_range;
+		uint32_t m_last_mode7_part2_weight_ise_range;
+
+		bool m_use_mode11_part2;
+		uint32_t m_mode11_part2_part_masks;
+		uint32_t m_first_mode11_part2_weight_ise_range;
+		uint32_t m_last_mode11_part2_weight_ise_range;
+
+		float m_r_err_scale, m_g_err_scale;
+
+		bool m_refine_weights;
+
+		uint32_t m_level;
+
+		bool m_use_estimated_partitions;
+		uint32_t m_max_estimated_partitions;
+
+		// If true, the ASTC HDR compressor is allowed to more aggressively vary weight indices for slightly higher compression in non-fastest mode. This will hurt BC6H quality, however.
+		bool m_allow_uber_mode;
+
+		astc_hdr_codec_options();
+
+		void init();
+				
+		// TODO: set_quality_level() is preferred to configure the codec for transcoding purposes.
+		static const int cMinLevel = 0;
+		static const int cMaxLevel = 4;
+		static const int cDefaultLevel = 1;
+		void set_quality_level(int level);
+
+	private:
+		void set_quality_best();
+		void set_quality_normal();
+		void set_quality_fastest();
+	};
+
+	struct astc_hdr_pack_results
+	{
+		double m_best_block_error;
+		double m_bc6h_block_error; // note this is not used/set by the encoder, here for convenience
+
+		// Encoder results (logical ASTC block)
+		astc_helpers::log_astc_block m_best_blk;
+		
+		// For statistical use
+		uint32_t m_best_submodes[2];
+		uint32_t m_best_pat_index;
+		bool m_constrained_weights;
+
+		bool m_improved_via_refinement_flag;
+				
+		// Only valid if the block is solid
+		basist::astc_blk m_solid_blk;
+		
+		// The BC6H transcoded block
+		basist::bc6h_block m_bc6h_block;
+
+		// Solid color/void extent flag
+		bool m_is_solid;
+
+		void clear()
+		{
+			m_best_block_error = 1e+30f;
+			m_bc6h_block_error = 1e+30f;
+
+			m_best_blk.clear();
+			m_best_blk.m_grid_width = 4;
+			m_best_blk.m_grid_height = 4;
+			m_best_blk.m_endpoint_ise_range = 20; // 0-255
+
+			clear_obj(m_best_submodes);
+
+			m_best_pat_index = 0;
+			m_constrained_weights = false;
+									
+			clear_obj(m_bc6h_block);
+			
+			m_is_solid = false;
+			m_improved_via_refinement_flag = false;
+		}
+	};
+			
+	void interpolate_qlog12_colors(
+		const int e[2][3],
+		basist::half_float* pDecoded_half,
+		vec3F* pDecoded_float,
+		uint32_t n, uint32_t ise_weight_range);
+		
+	bool get_astc_hdr_mode_11_block_colors(
+		const uint8_t* pEndpoints,
+		basist::half_float* pDecoded_half,
+		vec3F* pDecoded_float,
+		uint32_t n, uint32_t ise_weight_range, uint32_t ise_endpoint_range);
+		
+	bool get_astc_hdr_mode_7_block_colors(
+		const uint8_t* pEndpoints,
+		basist::half_float* pDecoded_half,
+		vec3F* pDecoded_float,
+		uint32_t n, uint32_t ise_weight_range, uint32_t ise_endpoint_range);
+
+	double eval_selectors(
+		uint32_t num_pixels,
+		uint8_t* pWeights,
+		const basist::half_float* pBlock_pixels_half,
+		uint32_t num_weight_levels,
+		const basist::half_float* pDecoded_half,
+		const astc_hdr_codec_options& coptions,
+		uint32_t usable_selector_bitmask = UINT32_MAX);
+
+	double compute_block_error(const basist::half_float* pOrig_block, const basist::half_float* pPacked_block, const astc_hdr_codec_options& coptions);
+
+	// Encodes a 4x4 ASTC HDR block given a 4x4 array of source block pixels/texels.
+	// Supports solid color blocks, mode 11 (all submodes), mode 7/1 partition (all submodes), 
+	// and mode 7/2 partitions (all submodes) - 30 patterns, only the ones also in common with the BC6H format.
+	// The packed ASTC weight grid dimensions are currently always 4x4 texels, but may also be 3x3 in the future.
+	// This function is thread safe, i.e. it may be called from multiple encoding threads simultaneously with different blocks.
+	// 
+	// Parameters:
+	// pRGBPixels - An array of 48 (16 RGB) floats: the 4x4 block to pack
+	// coptions - Codec options
+	// all_results - The pack results for this block (passed by non-const reference; written by the encoder).
+	//   Use astc_hdr_pack_results_to_block() to turn an entry into a physical ASTC HDR block.
+	// 
+	// Requirements: 
+	// astc_hdr_enc_init() MUST have been called first to initialize the codec.
+	// Input pixels are checked and cannot be NaN's, Inf's, signed, or too large (greater than MAX_HALF_FLOAT, or 65504). 
+	// Normal values and denormals are okay.
+	bool astc_hdr_enc_block(
+		const float* pRGBPixels,
+		const astc_hdr_codec_options& coptions,
+		basisu::vector<astc_hdr_pack_results> &all_results);
+
+	bool astc_hdr_pack_results_to_block(basist::astc_blk& dst_blk, const astc_hdr_pack_results& results);
+		
+	bool astc_hdr_refine_weights(const basist::half_float* pSource_block, astc_hdr_pack_results& cur_results, const astc_hdr_codec_options& coptions, float bc6h_weight, bool* pImproved_flag);
+
+	struct astc_hdr_block_stats
+	{
+		std::mutex m_mutex;
+
+		uint32_t m_total_blocks;
+		uint32_t m_total_2part, m_total_solid;
+		uint32_t m_total_mode7_1part, m_total_mode7_2part;
+		uint32_t m_total_mode11_1part, m_total_mode11_2part;
+		uint32_t m_total_mode11_1part_constrained_weights;
+
+		uint32_t m_weight_range_hist_7[11];
+		uint32_t m_weight_range_hist_7_2part[11];
+		uint32_t m_mode7_submode_hist[6];
+
+		uint32_t m_weight_range_hist_11[11];
+		uint32_t m_weight_range_hist_11_2part[11];
+		uint32_t m_mode11_submode_hist[9];
+								
+		uint32_t m_part_hist[32];
+
+		uint32_t m_total_refined;
+								
+		astc_hdr_block_stats() { clear(); }
+
+		void clear()
+		{
+			std::lock_guard<std::mutex> lck(m_mutex);
+
+			m_total_blocks = 0;
+			m_total_mode7_1part = 0, m_total_mode7_2part = 0, m_total_mode11_1part = 0, m_total_2part = 0, m_total_solid = 0, m_total_mode11_2part = 0;
+			m_total_mode11_1part_constrained_weights = 0;
+			m_total_refined = 0;
+
+			clear_obj(m_weight_range_hist_11);
+			clear_obj(m_weight_range_hist_11_2part);
+			clear_obj(m_weight_range_hist_7);
+			clear_obj(m_weight_range_hist_7_2part);
+			clear_obj(m_mode7_submode_hist);
+			clear_obj(m_mode11_submode_hist);
+			clear_obj(m_part_hist);
+		}
+
+		void update(const astc_hdr_pack_results& log_blk);
+		
+		void print();
+	};
+		
+} // namespace basisu
+
diff --git a/thirdparty/basis_universal/encoder/basisu_backend.cpp b/thirdparty/basis_universal/encoder/basisu_backend.cpp
index abb61750a6f2..3fa3d8892fed 100644
--- a/thirdparty/basis_universal/encoder/basisu_backend.cpp
+++ b/thirdparty/basis_universal/encoder/basisu_backend.cpp
@@ -1,5 +1,5 @@
 // basisu_backend.cpp
-// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/thirdparty/basis_universal/encoder/basisu_backend.h b/thirdparty/basis_universal/encoder/basisu_backend.h
index 07778aeb9ba2..58a9a8aa0ea7 100644
--- a/thirdparty/basis_universal/encoder/basisu_backend.h
+++ b/thirdparty/basis_universal/encoder/basisu_backend.h
@@ -1,5 +1,5 @@
 // basisu_backend.h
-// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/thirdparty/basis_universal/encoder/basisu_basis_file.cpp b/thirdparty/basis_universal/encoder/basisu_basis_file.cpp
index f4c77bef23f4..77f467f67070 100644
--- a/thirdparty/basis_universal/encoder/basisu_basis_file.cpp
+++ b/thirdparty/basis_universal/encoder/basisu_basis_file.cpp
@@ -1,5 +1,5 @@
 // basisu_basis_file.cpp
-// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/thirdparty/basis_universal/encoder/basisu_basis_file.h b/thirdparty/basis_universal/encoder/basisu_basis_file.h
index 98498a012178..57448bccb198 100644
--- a/thirdparty/basis_universal/encoder/basisu_basis_file.h
+++ b/thirdparty/basis_universal/encoder/basisu_basis_file.h
@@ -1,5 +1,5 @@
 // basisu_basis_file.h
-// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/thirdparty/basis_universal/encoder/basisu_bc7enc.cpp b/thirdparty/basis_universal/encoder/basisu_bc7enc.cpp
index 22fdfa603fc4..914e7fbbb9ab 100644
--- a/thirdparty/basis_universal/encoder/basisu_bc7enc.cpp
+++ b/thirdparty/basis_universal/encoder/basisu_bc7enc.cpp
@@ -1,5 +1,5 @@
 // File: basisu_bc7enc.cpp
-// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -394,6 +394,7 @@ void bc7enc_compress_block_init()
 static void compute_least_squares_endpoints_rgba(uint32_t N, const uint8_t *pSelectors, const bc7enc_vec4F* pSelector_weights, bc7enc_vec4F* pXl, bc7enc_vec4F* pXh, const color_quad_u8 *pColors)
 {
 	// Least squares using normal equations: http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf 
+	// https://web.archive.org/web/20150319232457/http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf
 	// I did this in matrix form first, expanded out all the ops, then optimized it a bit.
 	double z00 = 0.0f, z01 = 0.0f, z10 = 0.0f, z11 = 0.0f;
 	double q00_r = 0.0f, q10_r = 0.0f, t_r = 0.0f;
@@ -1301,6 +1302,7 @@ void check_best_overall_error(const color_cell_compressor_params *pParams, color
 		for (uint32_t c = 0; c < 4; c++)
 			colors[i].m_c[c] = (uint8_t)astc_interpolate_linear(colors[0].m_c[c], colors[n - 1].m_c[c], pParams->m_pSelector_weights[i]);
 
+#ifdef _DEBUG
 	uint64_t total_err = 0;
 	for (uint32_t p = 0; p < pParams->m_num_pixels; p++)
 	{
@@ -1313,6 +1315,7 @@ void check_best_overall_error(const color_cell_compressor_params *pParams, color
 			total_err += compute_color_distance_rgb(&orig, &packed, pParams->m_perceptual, pParams->m_weights);
 	}
 	assert(total_err == pResults->m_best_overall_err);
+#endif
 	
 	// HACK HACK
 	//if (total_err != pResults->m_best_overall_err)
diff --git a/thirdparty/basis_universal/encoder/basisu_bc7enc.h b/thirdparty/basis_universal/encoder/basisu_bc7enc.h
index 8d8b7888cac7..925d6b2e8dee 100644
--- a/thirdparty/basis_universal/encoder/basisu_bc7enc.h
+++ b/thirdparty/basis_universal/encoder/basisu_bc7enc.h
@@ -1,5 +1,5 @@
 // File: basisu_bc7enc.h
-// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/thirdparty/basis_universal/encoder/basisu_comp.cpp b/thirdparty/basis_universal/encoder/basisu_comp.cpp
index 4e69e9e2eecb..81813257cd24 100644
--- a/thirdparty/basis_universal/encoder/basisu_comp.cpp
+++ b/thirdparty/basis_universal/encoder/basisu_comp.cpp
@@ -1,5 +1,5 @@
 // basisu_comp.cpp
-// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -16,6 +16,9 @@
 #include "basisu_enc.h"
 #include <unordered_set>
 #include <atomic>
+#include <map>
+
+//#define UASTC_HDR_DEBUG_SAVE_CATEGORIZED_BLOCKS
 
 // basisu_transcoder.cpp is where basisu_miniz lives now, we just need the declarations here.
 #define MINIZ_NO_ZLIB_COMPATIBLE_NAMES
@@ -23,6 +26,8 @@
 
 #include "basisu_opencl.h"
 
+#include "../transcoder/basisu_astc_hdr_core.h"
+
 #if !BASISD_SUPPORT_KTX2
 #error BASISD_SUPPORT_KTX2 must be enabled (set to 1).
 #endif
@@ -34,7 +39,7 @@
 // Set to 1 to disable the mipPadding alignment workaround (which only seems to be needed when no key-values are written at all)
 #define BASISU_DISABLE_KTX2_ALIGNMENT_WORKAROUND (0)
 
-// Set to 1 to disable writing all KTX2 key values, triggering the validator bug.
+// Set to 1 to disable writing all KTX2 key values, triggering an early validator bug.
 #define BASISU_DISABLE_KTX2_KEY_VALUES (0)
 
 using namespace buminiz;
@@ -46,27 +51,143 @@ using namespace buminiz;
 
 namespace basisu
 {
-   basis_compressor::basis_compressor() :
-	   m_pOpenCL_context(nullptr),
+	basis_compressor::basis_compressor() :
+		m_pOpenCL_context(nullptr),
 		m_basis_file_size(0),
 		m_basis_bits_per_texel(0.0f),
 		m_total_blocks(0),
 		m_any_source_image_has_alpha(false),
-	   m_opencl_failed(false)
+		m_opencl_failed(false)
 	{
 		debug_printf("basis_compressor::basis_compressor\n");
 		
 		assert(g_library_initialized);
 	}
 
-   basis_compressor::~basis_compressor()
-   {
-	   if (m_pOpenCL_context)
-	   {
-		   opencl_destroy_context(m_pOpenCL_context);
-		   m_pOpenCL_context = nullptr;
-	   }
-   }
+	basis_compressor::~basis_compressor()
+	{
+		if (m_pOpenCL_context)
+		{
+			opencl_destroy_context(m_pOpenCL_context);
+			m_pOpenCL_context = nullptr;
+		}
+	}
+
+	void basis_compressor::check_for_hdr_inputs()
+	{
+		if ((!m_params.m_source_filenames.size()) && (!m_params.m_source_images.size()))
+		{
+			if (m_params.m_source_images_hdr.size())
+			{
+				// Assume they want UASTC HDR if they've specified any HDR source images.
+				m_params.m_hdr = true;
+			}
+		}
+
+		if (!m_params.m_hdr)
+		{
+			// See if any files are .EXR or .HDR, if so switch the compressor to UASTC HDR mode.
+			for (uint32_t i = 0; i < m_params.m_source_filenames.size(); i++)
+			{
+				std::string filename;
+				string_get_filename(m_params.m_source_filenames[i].c_str(), filename);
+
+				std::string ext(string_get_extension(filename));
+				string_tolower(ext);
+
+				if ((ext == "exr") || (ext == "hdr"))
+				{
+					m_params.m_hdr = true;
+					break;
+				}
+			}
+		}
+
+		if (m_params.m_hdr)
+		{
+			if (m_params.m_source_alpha_filenames.size())
+			{
+				debug_printf("Warning: Alpha channel image filenames are not supported in UASTC HDR mode.\n");
+				m_params.m_source_alpha_filenames.clear();
+			}
+		}
+
+		if (m_params.m_hdr)
+			m_params.m_uastc = true;
+	}
+
+	bool basis_compressor::sanity_check_input_params()
+	{
+		// Check for no source filenames specified.
+		if ((m_params.m_read_source_images) && (!m_params.m_source_filenames.size()))
+		{
+			assert(0);
+			return false;
+		}
+
+		// See if they've specified any source filenames, but didn't tell us to read them.
+		if ((!m_params.m_read_source_images) && (m_params.m_source_filenames.size()))
+		{
+			assert(0);
+			return false;
+		}
+
+		// Sanity check the input image parameters.
+		if (m_params.m_read_source_images)
+		{
+			// Caller can't specify their own images if they want us to read source images from files.
+			if (m_params.m_source_images.size() || m_params.m_source_images_hdr.size())
+			{
+				assert(0);
+				return false;
+			}
+
+			if (m_params.m_source_mipmap_images.size() || m_params.m_source_mipmap_images_hdr.size())
+			{
+				assert(0);
+				return false;
+			}
+		}
+		else
+		{
+			// They didn't tell us to read any source files, so check for no LDR/HDR source images.
+			if (!m_params.m_source_images.size() && !m_params.m_source_images_hdr.size())
+			{
+				assert(0);
+				return false;
+			}
+
+			// Now we know we've been supplied LDR and/or HDR source images, check for LDR vs. HDR conflicts.
+
+			if (m_params.m_source_images.size())
+			{
+				// They've supplied LDR images, so make sure they also haven't specified HDR input images.
+				if (m_params.m_source_images_hdr.size() || m_params.m_source_mipmap_images_hdr.size())
+				{
+					assert(0);
+					return false;
+				}
+			}
+			else
+			{
+				// No LDR images, so make sure they haven't specified any LDR mipmaps.
+				if (m_params.m_source_mipmap_images.size())
+				{
+					assert(0);
+					return false;
+				}
+
+				// No LDR images, so ensure they've supplied some HDR images to process.
+				if (!m_params.m_source_images_hdr.size())
+				{
+					assert(0);
+					return false;
+				}
+			}
+		}
+
+		return true;
+	}
 
 	bool basis_compressor::init(const basis_compressor_params &params)
 	{
@@ -85,7 +206,12 @@ namespace basisu
 		}
 				
 		m_params = params;
-				
+
+		if ((m_params.m_compute_stats) && (!m_params.m_validate_output_data))
+			m_params.m_validate_output_data = true;
+
+		check_for_hdr_inputs();
+		
 		if (m_params.m_debug)
 		{
 			debug_printf("basis_compressor::init:\n");
@@ -95,8 +221,10 @@ namespace basisu
 #define PRINT_UINT_VALUE(v) debug_printf("%s: %u %u\n", BASISU_STRINGIZE2(v), static_cast<uint32_t>(m_params.v), m_params.v.was_changed());
 #define PRINT_FLOAT_VALUE(v) debug_printf("%s: %f %u\n", BASISU_STRINGIZE2(v), static_cast<float>(m_params.v), m_params.v.was_changed());
 						
-			debug_printf("Source images: %u, source filenames: %u, source alpha filenames: %i, Source mipmap images: %u\n",
-				m_params.m_source_images.size(), m_params.m_source_filenames.size(), m_params.m_source_alpha_filenames.size(), m_params.m_source_mipmap_images.size());
+			debug_printf("Source LDR images: %u, HDR images: %u, filenames: %u, alpha filenames: %i, LDR mipmap images: %u, HDR mipmap images: %u\n",
+				m_params.m_source_images.size(), m_params.m_source_images_hdr.size(),
+				m_params.m_source_filenames.size(), m_params.m_source_alpha_filenames.size(),
+				m_params.m_source_mipmap_images.size(), m_params.m_source_mipmap_images_hdr.size());
 
 			if (m_params.m_source_mipmap_images.size())
 			{
@@ -106,6 +234,15 @@ namespace basisu
 				debug_printf("\n");
 			}
 
+			if (m_params.m_source_mipmap_images_hdr.size())
+			{
+				debug_printf("m_source_mipmap_images_hdr array sizes:\n");
+				for (uint32_t i = 0; i < m_params.m_source_mipmap_images_hdr.size(); i++)
+					debug_printf("%u ", m_params.m_source_mipmap_images_hdr[i].size());
+				debug_printf("\n");
+			}
+
+			PRINT_BOOL_VALUE(m_hdr);
 			PRINT_BOOL_VALUE(m_uastc);
 			PRINT_BOOL_VALUE(m_use_opencl);
 			PRINT_BOOL_VALUE(m_y_flip);
@@ -117,7 +254,7 @@ namespace basisu
 			PRINT_BOOL_VALUE(m_no_endpoint_rdo);
 			PRINT_BOOL_VALUE(m_no_selector_rdo);
 			PRINT_BOOL_VALUE(m_read_source_images);
-			PRINT_BOOL_VALUE(m_write_output_basis_files);
+			PRINT_BOOL_VALUE(m_write_output_basis_or_ktx2_files);
 			PRINT_BOOL_VALUE(m_compute_stats);
 			PRINT_BOOL_VALUE(m_check_for_alpha);
 			PRINT_BOOL_VALUE(m_force_alpha);
@@ -146,6 +283,7 @@ namespace basisu
 			debug_printf("m_max_endpoint_clusters: %u\n", m_params.m_max_endpoint_clusters);
 			debug_printf("m_max_selector_clusters: %u\n", m_params.m_max_selector_clusters);
 			debug_printf("m_quality_level: %i\n", m_params.m_quality_level);
+			debug_printf("UASTC HDR quality level: %u\n", m_params.m_uastc_hdr_options.m_level);
 
 			debug_printf("m_tex_type: %u\n", m_params.m_tex_type);
 			debug_printf("m_userdata0: 0x%X, m_userdata1: 0x%X\n", m_params.m_userdata0, m_params.m_userdata1);
@@ -185,6 +323,9 @@ namespace basisu
 			}
 
 			PRINT_BOOL_VALUE(m_validate_output_data);
+			PRINT_BOOL_VALUE(m_hdr_ldr_srgb_to_linear_conversion);
+			debug_printf("Allow UASTC HDR uber mode: %u\n", m_params.m_uastc_hdr_options.m_allow_uber_mode);
+			PRINT_BOOL_VALUE(m_hdr_favor_astc);
 						
 #undef PRINT_BOOL_VALUE
 #undef PRINT_INT_VALUE
@@ -192,19 +333,9 @@ namespace basisu
 #undef PRINT_FLOAT_VALUE
 		}
 
-		if ((m_params.m_read_source_images) && (!m_params.m_source_filenames.size()))
-		{
-			assert(0);
+		if (!sanity_check_input_params())
 			return false;
-		}
-
-		if ((m_params.m_compute_stats) && (!m_params.m_validate_output_data))
-		{
-			m_params.m_validate_output_data = true;
-			
-			debug_printf("Note: m_compute_stats is true, so forcing m_validate_output_data to true as well\n");
-		}
-
+		
 		if ((m_params.m_use_opencl) && opencl_is_available() && !m_pOpenCL_context && !m_opencl_failed)
 		{
 			m_pOpenCL_context = opencl_create_context();
@@ -219,6 +350,9 @@ namespace basisu
 	{
 		debug_printf("basis_compressor::process\n");
 
+		if (!read_dds_source_images())
+			return cECFailedReadingSourceImages;
+
 		if (!read_source_images())
 			return cECFailedReadingSourceImages;
 
@@ -228,20 +362,38 @@ namespace basisu
 		if (m_params.m_create_ktx2_file)
 		{
 			if (!validate_ktx2_constraints())
+			{
+				error_printf("Inputs do not satisfy .KTX2 texture constraints: all source images must be the same resolution and have the same number of mipmap levels.\n");
 				return cECFailedValidating;
+			}
 		}
 
 		if (!extract_source_blocks())
 			return cECFailedFrontEnd;
 
-		if (m_params.m_uastc)
+		if (m_params.m_hdr)
+		{
+			// UASTC HDR
+			printf("Mode: UASTC HDR Level %u\n", m_params.m_uastc_hdr_options.m_level);
+
+			error_code ec = encode_slices_to_uastc_hdr();
+			if (ec != cECSuccess)
+				return ec;
+		}
+		else if (m_params.m_uastc)
 		{
+			// UASTC
+			printf("Mode: UASTC LDR Level %u\n", m_params.m_pack_uastc_flags & cPackUASTCLevelMask);
+
 			error_code ec = encode_slices_to_uastc();
 			if (ec != cECSuccess)
 				return ec;
 		}
 		else
 		{
+			// ETC1S
+			printf("Mode: ETC1S Quality %i, Level %i\n", m_params.m_quality_level, (int)m_params.m_compression_level);
+			
 			if (!process_frontend())
 				return cECFailedFrontEnd;
 
@@ -254,7 +406,7 @@ namespace basisu
 
 		if (!create_basis_file_and_transcode())
 			return cECFailedCreateBasisFile;
-		
+
 		if (m_params.m_create_ktx2_file)
 		{
 			if (!create_ktx2_file())
@@ -267,20 +419,89 @@ namespace basisu
 		return cECSuccess;
 	}
 
-	basis_compressor::error_code basis_compressor::encode_slices_to_uastc()
+	basis_compressor::error_code basis_compressor::encode_slices_to_uastc_hdr()
 	{
-		debug_printf("basis_compressor::encode_slices_to_uastc\n");
+		debug_printf("basis_compressor::encode_slices_to_uastc_hdr\n");
+		// Packs every slice in m_slice_images_hdr into UASTC HDR 4x4 blocks, then stores each slice's block bytes and CRC-16 in m_uastc_backend_output. Returns cECFailedEncodeUASTC if any block fails.
+		interval_timer tm;
+		tm.start();
 
 		m_uastc_slice_textures.resize(m_slice_descs.size());
 		for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++)
-			m_uastc_slice_textures[slice_index].init(texture_format::cUASTC4x4, m_slice_descs[slice_index].m_orig_width, m_slice_descs[slice_index].m_orig_height);
+			m_uastc_slice_textures[slice_index].init(texture_format::cUASTC_HDR_4x4, m_slice_descs[slice_index].m_orig_width, m_slice_descs[slice_index].m_orig_height);
 
-		m_uastc_backend_output.m_tex_format = basist::basis_tex_format::cUASTC4x4;
+		m_uastc_backend_output.m_tex_format = basist::basis_tex_format::cUASTC_HDR_4x4;
 		m_uastc_backend_output.m_etc1s = false;
 		m_uastc_backend_output.m_slice_desc = m_slice_descs;
 		m_uastc_backend_output.m_slice_image_data.resize(m_slice_descs.size());
 		m_uastc_backend_output.m_slice_image_crcs.resize(m_slice_descs.size());
+		// In non-perceptual mode, reset the red and green error scales to 1.0.
+		if (!m_params.m_perceptual)
+		{
+			m_params.m_uastc_hdr_options.m_r_err_scale = 1.0f;
+			m_params.m_uastc_hdr_options.m_g_err_scale = 1.0f;
+		}
+		// m_bc6h_err_weight blends BC6H error against ASTC error when a block candidate is chosen below; m_hdr_favor_astc selects the lower BC6H weight.
+		const float DEFAULT_BC6H_ERROR_WEIGHT = .85f;
+		const float LOWEST_BC6H_ERROR_WEIGHT = .1f;
+		m_params.m_uastc_hdr_options.m_bc6h_err_weight = m_params.m_hdr_favor_astc ? LOWEST_BC6H_ERROR_WEIGHT : DEFAULT_BC6H_ERROR_WEIGHT;
+		// Set by any worker job whose block fails to encode or to transcode; checked after each slice's jobs have finished.
+		std::atomic<bool> any_failures;
+		any_failures = false;
+		// Per-block encoder statistics; only updated and printed when m_params.m_debug is set.
+		astc_hdr_block_stats enc_stats;
 				
+		struct uastc_blk_desc // Debug-only key describing the encoding configuration of a block; ordered so it can key a std::map.
+		{
+			uint32_t m_solid_flag;
+			uint32_t m_num_partitions;
+			uint32_t m_cem_index;
+			uint32_t m_weight_ise_range;
+			uint32_t m_endpoint_ise_range;
+
+			bool operator< (const uastc_blk_desc& desc) const
+			{
+				if (this == &desc)
+					return false;
+				// Lexicographic compare, field by field in declaration order.
+#define COMP(XX) if (XX < desc.XX) return true; else if (XX != desc.XX) return false;
+				COMP(m_solid_flag)
+				COMP(m_num_partitions)
+				COMP(m_cem_index)
+				COMP(m_weight_ise_range)
+				COMP(m_endpoint_ise_range)
+#undef COMP
+
+				return false;
+			}
+			// Equality is derived from operator<: two descs are equal when neither orders before the other.
+			bool operator== (const uastc_blk_desc& desc) const
+			{
+				if (this == &desc)
+					return true;
+				if ((*this < desc) || (desc < *this))
+					return false;
+				return true;
+			}
+
+			bool operator!= (const uastc_blk_desc& desc) const
+			{
+				return !(*this == desc);
+			}
+		};
+		// Usage count per unique block description (plus the blocks themselves when UASTC_HDR_DEBUG_SAVE_CATEGORIZED_BLOCKS is defined).
+		struct uastc_blk_desc_stats
+		{
+			uastc_blk_desc_stats() : m_count(0) { }
+			uint32_t m_count;
+#ifdef UASTC_HDR_DEBUG_SAVE_CATEGORIZED_BLOCKS
+			basisu::vector<basist::astc_blk> m_blks;
+#endif
+		};
+		// Worker jobs update this map while holding unique_block_desc_mutex.
+		std::map<uastc_blk_desc, uastc_blk_desc_stats> unique_block_descs;
+		std::mutex unique_block_desc_mutex;
+		
 		for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++)
 		{
 			gpu_image& tex = m_uastc_slice_textures[slice_index];
@@ -290,102 +511,387 @@ namespace basisu
 			const uint32_t num_blocks_x = tex.get_blocks_x();
 			const uint32_t num_blocks_y = tex.get_blocks_y();
 			const uint32_t total_blocks = tex.get_total_blocks();
-			const image& source_image = m_slice_images[slice_index];
-			
+			const imagef& source_image = m_slice_images_hdr[slice_index];
+
 			std::atomic<uint32_t> total_blocks_processed;
 			total_blocks_processed = 0;
-
+			// Blocks are handed to the job pool in batches of up to N consecutive blocks.
 			const uint32_t N = 256;
 			for (uint32_t block_index_iter = 0; block_index_iter < total_blocks; block_index_iter += N)
 			{
 				const uint32_t first_index = block_index_iter;
 				const uint32_t last_index = minimum<uint32_t>(total_blocks, block_index_iter + N);
-
+			
 				// FIXME: This sucks, but we're having a stack size related problem with std::function with emscripten.
 #ifndef __EMSCRIPTEN__
-				m_params.m_pJob_pool->add_job([this, first_index, last_index, num_blocks_x, num_blocks_y, total_blocks, &source_image, &tex, &total_blocks_processed]
+				m_params.m_pJob_pool->add_job([this, first_index, last_index, num_blocks_x, num_blocks_y, total_blocks, &source_image, 
+					&tex, &total_blocks_processed, &any_failures, &enc_stats, &unique_block_descs, &unique_block_desc_mutex]
 					{
 #endif
 						BASISU_NOTE_UNUSED(num_blocks_y);
-						
-						uint32_t uastc_flags = m_params.m_pack_uastc_flags;
-						if ((m_params.m_rdo_uastc) && (m_params.m_rdo_uastc_favor_simpler_modes_in_rdo_mode))
-							uastc_flags |= cPackUASTCFavorSimplerModes;
+						// Candidate encodings for the current block; the vector object is reused across the blocks of this job.
+						basisu::vector<astc_hdr_pack_results> all_results;
+						all_results.reserve(256);
 
 						for (uint32_t block_index = first_index; block_index < last_index; block_index++)
 						{
 							const uint32_t block_x = block_index % num_blocks_x;
 							const uint32_t block_y = block_index / num_blocks_x;
 
-							color_rgba block_pixels[4][4];
+							vec4F block_pixels[16];
 
-							source_image.extract_block_clamped((color_rgba*)block_pixels, block_x * 4, block_y * 4, 4, 4);
+							source_image.extract_block_clamped(&block_pixels[0], block_x * 4, block_y * 4, 4, 4);
 
-							basist::uastc_block& dest_block = *(basist::uastc_block*)tex.get_block_ptr(block_x, block_y);
+							basist::astc_blk& dest_block = *(basist::astc_blk*)tex.get_block_ptr(block_x, block_y);
+							// Flatten the block's RGB (alpha is dropped) into a float array for the encoder and a half-float array for the BC6H error metric.
+							float rgb_pixels[16 * 3];
+							basist::half_float rgb_pixels_half[16 * 3];
+							for (uint32_t i = 0; i < 16; i++)
+							{
+								rgb_pixels[i * 3 + 0] = block_pixels[i][0];
+								rgb_pixels_half[i * 3 + 0] = float_to_half_non_neg_no_nan_inf(block_pixels[i][0]);
 
-							encode_uastc(&block_pixels[0][0].r, dest_block, uastc_flags);
+								rgb_pixels[i * 3 + 1] = block_pixels[i][1];
+								rgb_pixels_half[i * 3 + 1] = float_to_half_non_neg_no_nan_inf(block_pixels[i][1]);
 
-							total_blocks_processed++;
+								rgb_pixels[i * 3 + 2] = block_pixels[i][2];
+								rgb_pixels_half[i * 3 + 2] = float_to_half_non_neg_no_nan_inf(block_pixels[i][2]);
+							}
+														
+							bool status = astc_hdr_enc_block(&rgb_pixels[0], m_params.m_uastc_hdr_options, all_results);
+							if (!status)
+							{
+								any_failures = true;
+								continue;
+							}
+							// Choose the candidate with the lowest weighted sum of BC6H error and ASTC error.
+							double best_err = 1e+30f;
+							int best_result_index = -1;
+											
+							const double bc6h_err_weight = m_params.m_uastc_hdr_options.m_bc6h_err_weight;
+							const double astc_err_weight = (1.0f - bc6h_err_weight);
+										
+							for (uint32_t i = 0; i < all_results.size(); i++)
+							{
+								basist::half_float unpacked_bc6h_block[4 * 4 * 3];
+								unpack_bc6h(&all_results[i].m_bc6h_block, unpacked_bc6h_block, false);
+
+								all_results[i].m_bc6h_block_error = compute_block_error(rgb_pixels_half, unpacked_bc6h_block, m_params.m_uastc_hdr_options);
+
+								double overall_err = (all_results[i].m_bc6h_block_error * bc6h_err_weight) + (all_results[i].m_best_block_error * astc_err_weight);
+
+								if ((!i) || (overall_err < best_err))
+								{
+									best_err = overall_err;
+									best_result_index = i;
+								}
+							}
+
+							const astc_hdr_pack_results& best_results = all_results[best_result_index];
 							
-							uint32_t val = total_blocks_processed;
-							if ((val & 16383) == 16383)
+							astc_hdr_pack_results_to_block(dest_block, best_results);
+								
+							// Verify that this block is valid UASTC HDR and we can successfully transcode it to BC6H.
+							// (Well, except in fastest mode.)
+							if (m_params.m_uastc_hdr_options.m_level > 0)
 							{
-								debug_printf("basis_compressor::encode_slices_to_uastc: %3.1f%% done\n", static_cast<float>(val) * 100.0f / total_blocks);
+								basist::bc6h_block transcoded_bc6h_blk;
+								bool transcode_results = astc_hdr_transcode_to_bc6h(dest_block, transcoded_bc6h_blk);
+								assert(transcode_results);
+								if ((!transcode_results) && (!any_failures)) // NOTE(review): if any_failures is already set, a failing block skips this branch (no message, no continue) and falls through.
+								{
+									error_printf("basis_compressor::encode_slices_to_uastc_hdr: UASTC HDR block transcode check failed!\n");
+
+									any_failures = true;
+									continue;
+								}
+							}
+							// Debug mode only: tally which encoding configuration the winning candidate used.
+							if (m_params.m_debug)
+							{
+								// enc_stats has its own mutex
+								enc_stats.update(best_results);
+
+								uastc_blk_desc blk_desc;
+								clear_obj(blk_desc);
+
+								blk_desc.m_solid_flag = best_results.m_is_solid;
+								if (!blk_desc.m_solid_flag)
+								{
+									blk_desc.m_num_partitions = best_results.m_best_blk.m_num_partitions;
+									blk_desc.m_cem_index = best_results.m_best_blk.m_color_endpoint_modes[0];
+									blk_desc.m_weight_ise_range = best_results.m_best_blk.m_weight_ise_range;
+									blk_desc.m_endpoint_ise_range = best_results.m_best_blk.m_endpoint_ise_range;
+								}
+								// Insert the description (or find the existing entry) and bump its count, all under the mutex.
+								{
+									std::lock_guard<std::mutex> lck(unique_block_desc_mutex);
+																		
+									auto res = unique_block_descs.insert(std::make_pair(blk_desc, uastc_blk_desc_stats()));
+									
+									(res.first)->second.m_count++;
+#ifdef UASTC_HDR_DEBUG_SAVE_CATEGORIZED_BLOCKS
+									(res.first)->second.m_blks.push_back(dest_block);
+#endif
+								}
 							}
 
+							total_blocks_processed++;
+							// Optional progress message, emitted when the low 10 bits of the shared counter are all set (about every 1024 blocks).
+							uint32_t val = total_blocks_processed;
+							if (((val & 1023) == 1023) && m_params.m_status_output)
+							{
+								debug_printf("basis_compressor::encode_slices_to_uastc_hdr: %3.1f%% done\n", static_cast<float>(val) * 100.0f / total_blocks);
+							}
 						}
 
 #ifndef __EMSCRIPTEN__
 					});
 #endif
-
+			
 			} // block_index_iter
 
 #ifndef __EMSCRIPTEN__
 			m_params.m_pJob_pool->wait_for_all();
 #endif
 
-			if (m_params.m_rdo_uastc)
-			{
-				uastc_rdo_params rdo_params;
-				rdo_params.m_lambda = m_params.m_rdo_uastc_quality_scalar;
-				rdo_params.m_max_allowed_rms_increase_ratio = m_params.m_rdo_uastc_max_allowed_rms_increase_ratio;
-				rdo_params.m_skip_block_rms_thresh = m_params.m_rdo_uastc_skip_block_rms_thresh;
-				rdo_params.m_lz_dict_size = m_params.m_rdo_uastc_dict_size;
-				rdo_params.m_smooth_block_max_error_scale = m_params.m_rdo_uastc_max_smooth_block_error_scale;
-				rdo_params.m_max_smooth_block_std_dev = m_params.m_rdo_uastc_smooth_block_max_std_dev;
-								
-				bool status = uastc_rdo(tex.get_total_blocks(), (basist::uastc_block*)tex.get_ptr(),
-					(const color_rgba *)m_source_blocks[slice_desc.m_first_block_index].m_pixels, rdo_params, m_params.m_pack_uastc_flags, m_params.m_rdo_uastc_multithreading ? m_params.m_pJob_pool : nullptr,
-					(m_params.m_rdo_uastc_multithreading && m_params.m_pJob_pool) ? basisu::minimum<uint32_t>(4, (uint32_t)m_params.m_pJob_pool->get_total_threads()) : 0);
-				if (!status)
-				{
-					return cECFailedUASTCRDOPostProcess;
-				}
-			}
+			if (any_failures)
+				return cECFailedEncodeUASTC;
 
 			m_uastc_backend_output.m_slice_image_data[slice_index].resize(tex.get_size_in_bytes());
 			memcpy(&m_uastc_backend_output.m_slice_image_data[slice_index][0], tex.get_ptr(), tex.get_size_in_bytes());
-			
+
 			m_uastc_backend_output.m_slice_image_crcs[slice_index] = basist::crc16(tex.get_ptr(), tex.get_size_in_bytes(), 0);
-						
+
 		} // slice_index
-				
+		
+		debug_printf("basis_compressor::encode_slices_to_uastc_hdr: Total time: %3.3f secs\n", tm.get_elapsed_secs());
+		// Debug mode: dump the per-configuration usage table gathered by the worker jobs.
+		if (m_params.m_debug)
+		{
+			debug_printf("\n----- Total unique UASTC block descs: %u\n", (uint32_t)unique_block_descs.size());
+
+			uint32_t c = 0;
+			for (auto it = unique_block_descs.begin(); it != unique_block_descs.end(); ++it)
+			{
+				debug_printf("%u. Total uses: %u %3.2f%%, solid color: %u\n", c, it->second.m_count,
+					((float)it->second.m_count * 100.0f) / enc_stats.m_total_blocks, it->first.m_solid_flag);
+
+				if (!it->first.m_solid_flag)
+				{
+					debug_printf("  Num partitions: %u\n", it->first.m_num_partitions);
+					debug_printf("  CEM index: %u\n", it->first.m_cem_index);
+					debug_printf("  Weight ISE range: %u (%u levels)\n", it->first.m_weight_ise_range, astc_helpers::get_ise_levels(it->first.m_weight_ise_range));
+					debug_printf("  Endpoint ISE range: %u (%u levels)\n", it->first.m_endpoint_ise_range, astc_helpers::get_ise_levels(it->first.m_endpoint_ise_range));
+				}
+
+#ifdef UASTC_HDR_DEBUG_SAVE_CATEGORIZED_BLOCKS
+				debug_printf("  -- UASTC HDR block bytes:\n");
+				for (uint32_t j = 0; j < minimum<uint32_t>(4, it->second.m_blks.size()); j++)
+				{
+					basist::astc_blk& blk = it->second.m_blks[j];
+
+					debug_printf("    - UASTC HDR: { ");
+					for (uint32_t k = 0; k < 16; k++)
+						debug_printf("%u%s", ((const uint8_t*)&blk)[k], (k != 15) ? ", " : "");
+					debug_printf(" }\n");
+
+					basist::bc6h_block bc6h_blk;
+					bool res = astc_hdr_transcode_to_bc6h(blk, bc6h_blk);
+					assert(res);
+					if (!res)
+					{
+						error_printf("astc_hdr_transcode_to_bc6h() failed!\n");
+						return cECFailedEncodeUASTC;
+					}
+
+					debug_printf("    - BC6H: { ");
+					for (uint32_t k = 0; k < 16; k++)
+						debug_printf("%u%s", ((const uint8_t*)&bc6h_blk)[k], (k != 15) ? ", " : "");
+					debug_printf(" }\n");
+				}
+#endif
+					
+				c++;
+			}
+			printf("\n");
+			
+			enc_stats.print();
+		}
+
 		return cECSuccess;
 	}
 
-	bool basis_compressor::generate_mipmaps(const image &img, basisu::vector<image> &mips, bool has_alpha)
+	basis_compressor::error_code basis_compressor::encode_slices_to_uastc() // Encodes every slice in m_slice_images to UASTC 4x4 blocks, optionally followed by the uastc_rdo() post-process.
 	{
-		debug_printf("basis_compressor::generate_mipmaps\n");
+		debug_printf("basis_compressor::encode_slices_to_uastc\n");
 
-		interval_timer tm;
-		tm.start();
+		m_uastc_slice_textures.resize(m_slice_descs.size());
+		for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++)
+			m_uastc_slice_textures[slice_index].init(texture_format::cUASTC4x4, m_slice_descs[slice_index].m_orig_width, m_slice_descs[slice_index].m_orig_height);
+		// Describe the backend output: UASTC 4x4 (not ETC1S), with one data buffer and one CRC per slice.
-		uint32_t total_levels = 1;
-		uint32_t w = img.get_width(), h = img.get_height();
-		while (maximum<uint32_t>(w, h) > (uint32_t)m_params.m_mip_smallest_dimension)
+		m_uastc_backend_output.m_tex_format = basist::basis_tex_format::cUASTC4x4;
+		m_uastc_backend_output.m_etc1s = false;
+		m_uastc_backend_output.m_slice_desc = m_slice_descs;
+		m_uastc_backend_output.m_slice_image_data.resize(m_slice_descs.size());
+		m_uastc_backend_output.m_slice_image_crcs.resize(m_slice_descs.size());
+				
+		for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++)
 		{
-			w = maximum(w >> 1U, 1U);
+			gpu_image& tex = m_uastc_slice_textures[slice_index];
+			basisu_backend_slice_desc& slice_desc = m_slice_descs[slice_index];
+			(void)slice_desc;
+
+			const uint32_t num_blocks_x = tex.get_blocks_x();
+			const uint32_t num_blocks_y = tex.get_blocks_y();
+			const uint32_t total_blocks = tex.get_total_blocks();
+			const image& source_image = m_slice_images[slice_index];
+			
+			std::atomic<uint32_t> total_blocks_processed;
+			total_blocks_processed = 0;
+			// Blocks are handed to the job pool in batches of up to N consecutive blocks.
+			const uint32_t N = 256;
+			for (uint32_t block_index_iter = 0; block_index_iter < total_blocks; block_index_iter += N)
+			{
+				const uint32_t first_index = block_index_iter;
+				const uint32_t last_index = minimum<uint32_t>(total_blocks, block_index_iter + N);
+
+				// FIXME: This sucks, but we're having a stack size related problem with std::function with emscripten.
+#ifndef __EMSCRIPTEN__
+				m_params.m_pJob_pool->add_job([this, first_index, last_index, num_blocks_x, num_blocks_y, total_blocks, &source_image, &tex, &total_blocks_processed]
+					{
+#endif
+						BASISU_NOTE_UNUSED(num_blocks_y);
+						// In RDO mode the encoder may additionally be asked to favor simpler UASTC modes.
+						uint32_t uastc_flags = m_params.m_pack_uastc_flags;
+						if ((m_params.m_rdo_uastc) && (m_params.m_rdo_uastc_favor_simpler_modes_in_rdo_mode))
+							uastc_flags |= cPackUASTCFavorSimplerModes;
+
+						for (uint32_t block_index = first_index; block_index < last_index; block_index++)
+						{
+							const uint32_t block_x = block_index % num_blocks_x;
+							const uint32_t block_y = block_index / num_blocks_x;
+							// Fetch the 4x4 source pixels for this block (extract_block_clamped presumably clamps coordinates at the image edges).
+							color_rgba block_pixels[4][4];
+
+							source_image.extract_block_clamped((color_rgba*)block_pixels, block_x * 4, block_y * 4, 4, 4);
+
+							basist::uastc_block& dest_block = *(basist::uastc_block*)tex.get_block_ptr(block_x, block_y);
+
+							encode_uastc(&block_pixels[0][0].r, dest_block, uastc_flags);
+
+							total_blocks_processed++;
+							
+							uint32_t val = total_blocks_processed;
+							if (((val & 16383) == 16383) && m_params.m_status_output)
+							{
+								debug_printf("basis_compressor::encode_slices_to_uastc: %3.1f%% done\n", static_cast<float>(val) * 100.0f / total_blocks);
+							}
+
+						}
+
+#ifndef __EMSCRIPTEN__
+					});
+#endif
+
+			} // block_index_iter
+
+#ifndef __EMSCRIPTEN__
+			m_params.m_pJob_pool->wait_for_all();
+#endif
+			// Optional RDO post-process over all of this slice's blocks; it reads the source pixels from m_source_blocks.
+			if (m_params.m_rdo_uastc)
+			{
+				uastc_rdo_params rdo_params;
+				rdo_params.m_lambda = m_params.m_rdo_uastc_quality_scalar;
+				rdo_params.m_max_allowed_rms_increase_ratio = m_params.m_rdo_uastc_max_allowed_rms_increase_ratio;
+				rdo_params.m_skip_block_rms_thresh = m_params.m_rdo_uastc_skip_block_rms_thresh;
+				rdo_params.m_lz_dict_size = m_params.m_rdo_uastc_dict_size;
+				rdo_params.m_smooth_block_max_error_scale = m_params.m_rdo_uastc_max_smooth_block_error_scale;
+				rdo_params.m_max_smooth_block_std_dev = m_params.m_rdo_uastc_smooth_block_max_std_dev;
+								
+				bool status = uastc_rdo(tex.get_total_blocks(), (basist::uastc_block*)tex.get_ptr(),
+					(const color_rgba *)m_source_blocks[slice_desc.m_first_block_index].m_pixels, rdo_params, m_params.m_pack_uastc_flags, m_params.m_rdo_uastc_multithreading ? m_params.m_pJob_pool : nullptr,
+					(m_params.m_rdo_uastc_multithreading && m_params.m_pJob_pool) ? basisu::minimum<uint32_t>(4, (uint32_t)m_params.m_pJob_pool->get_total_threads()) : 0);
+				if (!status)
+				{
+					return cECFailedUASTCRDOPostProcess;
+				}
+			}
+			// Publish the slice's block bytes (after any RDO pass) and their CRC-16 to the backend output.
+			m_uastc_backend_output.m_slice_image_data[slice_index].resize(tex.get_size_in_bytes());
+			memcpy(&m_uastc_backend_output.m_slice_image_data[slice_index][0], tex.get_ptr(), tex.get_size_in_bytes());
+			
+			m_uastc_backend_output.m_slice_image_crcs[slice_index] = basist::crc16(tex.get_ptr(), tex.get_size_in_bytes(), 0);
+						
+		} // slice_index
+				
+		return cECSuccess;
+	}
+	// HDR overload: appends resampled mip levels of img to mips (always with the "box" filter) and runs clean_hdr_image() on each new level. Returns false if image_resample() fails.
+	bool basis_compressor::generate_mipmaps(const imagef& img, basisu::vector<imagef>& mips, bool has_alpha)
+	{
+		debug_printf("basis_compressor::generate_mipmaps\n");
+
+		interval_timer tm;
+		tm.start();
+		// Count levels: halve both dimensions until the larger one is no bigger than m_mip_smallest_dimension.
+		uint32_t total_levels = 1;
+		uint32_t w = img.get_width(), h = img.get_height();
+		while (maximum<uint32_t>(w, h) > (uint32_t)m_params.m_mip_smallest_dimension)
+		{
+			w = maximum(w >> 1U, 1U);
+			h = maximum(h >> 1U, 1U);
+			total_levels++;
+		}
+
+		for (uint32_t level = 1; level < total_levels; level++)
+		{
+			const uint32_t level_width = maximum<uint32_t>(1, img.get_width() >> level);
+			const uint32_t level_height = maximum<uint32_t>(1, img.get_height() >> level);
+			// enlarge_vector() presumably appends one element to mips and returns a pointer to it.
+			imagef& level_img = *enlarge_vector(mips, 1);
+			level_img.resize(level_width, level_height);
+
+			const imagef* pSource_image = &img;
+			// Fast mode resamples from an entry already in mips instead of from the full-size image.
+			if (m_params.m_mip_fast)
+			{
+				if (level > 1)
+					pSource_image = &mips[level - 1]; // NOTE(review): this is the previous level only if mips held exactly one entry (level 0) on entry - confirm against callers.
+			}
+
+			bool status = image_resample(*pSource_image, level_img, 
+				//m_params.m_mip_filter.c_str(), 
+				"box", // TODO: negative lobes in the filter are causing negative colors, try Mitchell
+				m_params.m_mip_scale, m_params.m_mip_wrapping, 0, has_alpha ? 4 : 3);
+			if (!status)
+			{
+				error_printf("basis_compressor::generate_mipmaps: image_resample() failed!\n");
+				return false;
+			}
+			// Sanitize the freshly generated level the same way source HDR images are sanitized.
+			clean_hdr_image(level_img);
+		}
+
+		if (m_params.m_debug)
+			debug_printf("Total mipmap generation time: %3.3f secs\n", tm.get_elapsed_secs());
+
+		return true;
+	}
+
+	bool basis_compressor::generate_mipmaps(const image &img, basisu::vector<image> &mips, bool has_alpha)
+	{
+		debug_printf("basis_compressor::generate_mipmaps\n");
+
+		interval_timer tm;
+		tm.start();
+
+		uint32_t total_levels = 1;
+		uint32_t w = img.get_width(), h = img.get_height();
+		while (maximum<uint32_t>(w, h) > (uint32_t)m_params.m_mip_smallest_dimension)
+		{
+			w = maximum(w >> 1U, 1U);
 			h = maximum(h >> 1U, 1U);
 			total_levels++;
 		}
@@ -463,17 +969,224 @@ namespace basisu
 		return true;
 	}
 
+	void basis_compressor::clean_hdr_image(imagef& src_img) // In place: scales RGB down if it exceeds basist::ASTC_HDR_MAX_VAL, runs clean_astc_hdr_pixels(), then logs the value range.
+	{
+		const uint32_t width = src_img.get_width();
+		const uint32_t height = src_img.get_height();
+		// Pass 1: find the largest RGB component. Alpha is ignored, and the 0.0 starting value means negatives never count.
+		float max_used_val = 0.0f;
+		for (uint32_t y = 0; y < height; y++)
+		{
+			for (uint32_t x = 0; x < width; x++)
+			{
+				vec4F& c = src_img(x, y);
+				for (uint32_t i = 0; i < 3; i++)
+					max_used_val = maximum(max_used_val, c[i]);
+			}
+		}
+
+		double hdr_image_scale = 1.0f;
+		if (max_used_val > basist::ASTC_HDR_MAX_VAL)
+		{
+			hdr_image_scale = max_used_val / basist::ASTC_HDR_MAX_VAL;
+
+			const double inv_hdr_image_scale = basist::ASTC_HDR_MAX_VAL / max_used_val;
+
+			for (uint32_t y = 0; y < src_img.get_height(); y++)
+			{
+				for (uint32_t x = 0; x < src_img.get_width(); x++)
+				{
+					vec4F& c = src_img(x, y);
+					// Scale RGB only (alpha is left alone) and clamp the result to the limit.
+					for (uint32_t i = 0; i < 3; i++)
+						c[i] = (float)minimum<double>(c[i] * inv_hdr_image_scale, basist::ASTC_HDR_MAX_VAL);
+				}
+			}
+
+			printf("Warning: The input HDR image's maximum used float value was %f, which is too high to encode as ASTC HDR. The image's components have been linearly scaled so the maximum used value is %f, by multiplying by %f.\n",
+				max_used_val, basist::ASTC_HDR_MAX_VAL, inv_hdr_image_scale);
+
+			printf("The decoded ASTC HDR texture will have to be scaled up by %f.\n", hdr_image_scale);
+		}
+		// Judging by the warning text below, clean_astc_hdr_pixels() returns false when it had to change pixel values.
+		// TODO: Determine a constant scale factor, apply if > MAX_HALF_FLOAT
+		if (!src_img.clean_astc_hdr_pixels(basist::ASTC_HDR_MAX_VAL))
+			printf("Warning: clean_astc_hdr_pixels() had to modify the input image to encode to ASTC HDR - see previous warning(s).\n");
+		// Final pass: gather RGB range statistics for the debug message only; the image is not modified.
+		float lowest_nonzero_val = 1e+30f;
+		float lowest_val = 1e+30f;
+		float highest_val = -1e+30f;
+
+		for (uint32_t y = 0; y < src_img.get_height(); y++)
+		{
+			for (uint32_t x = 0; x < src_img.get_width(); x++)
+			{
+				const vec4F& c = src_img(x, y);
+
+				for (uint32_t i = 0; i < 3; i++)
+				{
+					lowest_val = basisu::minimum(lowest_val, c[i]);
+
+					if (c[i] != 0.0f)
+						lowest_nonzero_val = basisu::minimum(lowest_nonzero_val, c[i]);
+
+					highest_val = basisu::maximum(highest_val, c[i]);
+				}
+			}
+		}
+		// If every component is zero, lowest_nonzero_val keeps its 1e+30 sentinel and the reported dynamic range is not meaningful.
+		debug_printf("Lowest image value: %e, lowest non-zero value: %e, highest value: %e, dynamic range: %e\n", lowest_val, lowest_nonzero_val, highest_val, highest_val / lowest_nonzero_val);
+	}
+	// Loads any .DDS source files into m_params' in-memory source image arrays and turns off m_read_source_images, so read_source_images() uses those arrays instead of the filenames. Returns false on error.
+	bool basis_compressor::read_dds_source_images()
+	{
+		debug_printf("basis_compressor::read_dds_source_images\n");
+
+		// Nothing to do if the caller doesn't want us reading source images.
+		if ((!m_params.m_read_source_images) || (!m_params.m_source_filenames.size()))
+			return true;
+
+		// Just bail if the caller has specified their own source images.
+		if (m_params.m_source_images.size() || m_params.m_source_images_hdr.size())
+			return true;
+		// Same if the caller supplied their own mipmap images.
+		if (m_params.m_source_mipmap_images.size() || m_params.m_source_mipmap_images_hdr.size())
+			return true;
+				
+		// See if any input filenames are .DDS
+		bool any_dds = false, all_dds = true;
+		for (uint32_t i = 0; i < m_params.m_source_filenames.size(); i++)
+		{
+			std::string ext(string_get_extension(m_params.m_source_filenames[i]));
+			if (strcasecmp(ext.c_str(), "dds") == 0)
+				any_dds = true;
+			else
+				all_dds = false;
+		}
+
+		// Bail if no .DDS files specified.
+		if (!any_dds)
+			return true;
+
+		// If any input is .DDS they all must be .DDS, for simplicity.
+		if (!all_dds)
+		{
+			error_printf("If any filename is DDS, all filenames must be DDS.\n");
+			return false;
+		}
+
+		// Can't jam in alpha channel images if any .DDS files specified.
+		if (m_params.m_source_alpha_filenames.size())
+		{
+			error_printf("Source alpha filenames are not supported in DDS mode.\n");
+			return false;
+		}
+
+		bool any_mipmaps = false;
+
+		// Read each .DDS texture file
+		for (uint32_t i = 0; i < m_params.m_source_filenames.size(); i++)
+		{
+			basisu::vector<image> ldr_mips;
+			basisu::vector<imagef> hdr_mips;
+			bool status = read_uncompressed_dds_file(m_params.m_source_filenames[i].c_str(), ldr_mips, hdr_mips);
+			if (!status)
+				return false;
+
+			assert(ldr_mips.size() || hdr_mips.size());
+
+			if (m_params.m_status_output)
+			{
+				printf("Read DDS file \"%s\", %s, %ux%u, %u mipmap levels\n",
+					m_params.m_source_filenames[i].c_str(),
+					ldr_mips.size() ? "LDR" : "HDR",
+					ldr_mips.size() ? ldr_mips[0].get_width() : hdr_mips[0].get_width(),
+					ldr_mips.size() ? ldr_mips[0].get_height() : hdr_mips[0].get_height(),
+					ldr_mips.size() ? ldr_mips.size() : hdr_mips.size());
+			}
+			// A file is treated as LDR whenever ldr_mips is non-empty, otherwise as HDR.
+			if (ldr_mips.size())
+			{
+				if (m_params.m_source_images_hdr.size())
+				{
+					error_printf("All DDS files must be of the same type (all LDR, or all HDR)\n");
+					return false;
+				}
+				// Level 0 becomes the source image; every image also gets a (possibly empty) mipmap entry at the same index.
+				m_params.m_source_images.push_back(ldr_mips[0]);
+				m_params.m_source_mipmap_images.resize(m_params.m_source_mipmap_images.size() + 1);
+
+				if (ldr_mips.size() > 1)
+				{
+					ldr_mips.erase(0U); // presumably removes element 0 (level 0 was stored above)
+
+					m_params.m_source_mipmap_images.back().swap(ldr_mips);
+					
+					any_mipmaps = true;
+				}
+			}
+			else
+			{
+				if (m_params.m_source_images.size())
+				{
+					error_printf("All DDS files must be of the same type (all LDR, or all HDR)\n");
+					return false;
+				}
+				// Same bookkeeping as the LDR path, using the HDR arrays.
+				m_params.m_source_images_hdr.push_back(hdr_mips[0]);
+				m_params.m_source_mipmap_images_hdr.resize(m_params.m_source_mipmap_images_hdr.size() + 1);
+
+				if (hdr_mips.size() > 1)
+				{
+					hdr_mips.erase(0U); // presumably removes element 0 (level 0 was stored above)
+
+					m_params.m_source_mipmap_images_hdr.back().swap(hdr_mips);
+					
+					any_mipmaps = true;
+				}
+				// Any HDR .DDS input switches the compressor to HDR mode (and sets m_uastc).
+				m_params.m_hdr = true;
+				m_params.m_uastc = true;
+			}
+		}
+		// Everything is now in memory: make read_source_images() use the in-memory arrays rather than the filenames.
+		m_params.m_read_source_images = false;
+		m_params.m_source_filenames.clear();
+		m_params.m_source_alpha_filenames.clear();
+		// If no file had mipmaps, drop the empty per-image placeholder entries entirely.
+		if (!any_mipmaps)
+		{
+			m_params.m_source_mipmap_images.clear();
+			m_params.m_source_mipmap_images_hdr.clear();
+		}
+		// Reached when m_hdr was already set before this call but every .DDS file turned out to be LDR.
+		if ((m_params.m_hdr) && (!m_params.m_source_images_hdr.size()))
+		{
+			error_printf("HDR mode enabled, but only LDR .DDS files were loaded. HDR mode requires half or float (HDR) .DDS inputs.\n");
+			return false;
+		}
+		
+		return true;
+	}
+
 	bool basis_compressor::read_source_images()
 	{
 		debug_printf("basis_compressor::read_source_images\n");
 
-		const uint32_t total_source_files = m_params.m_read_source_images ? (uint32_t)m_params.m_source_filenames.size() : (uint32_t)m_params.m_source_images.size();
+		const uint32_t total_source_files = m_params.m_read_source_images ? (uint32_t)m_params.m_source_filenames.size() : 
+			(m_params.m_hdr ? (uint32_t)m_params.m_source_images_hdr.size() : (uint32_t)m_params.m_source_images.size());
+
 		if (!total_source_files)
+		{
+			debug_printf("basis_compressor::read_source_images: No source images to process\n");
+
 			return false;
+		}
 
 		m_stats.resize(0);
 		m_slice_descs.resize(0);
 		m_slice_images.resize(0);
+		m_slice_images_hdr.resize(0);
 
 		m_total_blocks = 0;
 		uint32_t total_macroblocks = 0;
@@ -481,106 +1194,196 @@ namespace basisu
 		m_any_source_image_has_alpha = false;
 
 		basisu::vector<image> source_images;
+		basisu::vector<imagef> source_images_hdr;
+
 		basisu::vector<std::string> source_filenames;
 		
+		// TODO: Note HDR images don't support alpha here, currently.
+
 		// First load all source images, and determine if any have an alpha channel.
 		for (uint32_t source_file_index = 0; source_file_index < total_source_files; source_file_index++)
 		{
-			const char *pSource_filename = "";
+			const char* pSource_filename = "";
 
 			image file_image;
-			
+			imagef file_image_hdr;
+
 			if (m_params.m_read_source_images)
 			{
 				pSource_filename = m_params.m_source_filenames[source_file_index].c_str();
 
 				// Load the source image
-				if (!load_image(pSource_filename, file_image))
+				if (m_params.m_hdr)
 				{
-					error_printf("Failed reading source image: %s\n", pSource_filename);
-					return false;
+					if (!load_image_hdr(pSource_filename, file_image_hdr, m_params.m_hdr_ldr_srgb_to_linear_conversion))
+					{
+						error_printf("Failed reading source image: %s\n", pSource_filename);
+						return false;
+					}
+
+					// For now, just slam alpha to 1.0f. UASTC HDR doesn't support alpha yet.
+					for (uint32_t y = 0; y < file_image_hdr.get_height(); y++)
+						for (uint32_t x = 0; x < file_image_hdr.get_width(); x++)
+							file_image_hdr(x, y)[3] = 1.0f;
 				}
+				else
+				{
+					if (!load_image(pSource_filename, file_image))
+					{
+						error_printf("Failed reading source image: %s\n", pSource_filename);
+						return false;
+					}
+				}
+
+				const uint32_t width = m_params.m_hdr ? file_image_hdr.get_width() : file_image.get_width();
+				const uint32_t height = m_params.m_hdr ? file_image_hdr.get_height() : file_image.get_height();
 
 				if (m_params.m_status_output)
 				{
-					printf("Read source image \"%s\", %ux%u\n", pSource_filename, file_image.get_width(), file_image.get_height());
+					printf("Read source image \"%s\", %ux%u\n", pSource_filename, width, height);
 				}
 
-				// Optionally load another image and put a grayscale version of it into the alpha channel.
-				if ((source_file_index < m_params.m_source_alpha_filenames.size()) && (m_params.m_source_alpha_filenames[source_file_index].size()))
+				if (m_params.m_hdr)
 				{
-					const char *pSource_alpha_image = m_params.m_source_alpha_filenames[source_file_index].c_str();
+					clean_hdr_image(file_image_hdr);
+				}
+				else
+				{
+					// Optionally load another image and put a grayscale version of it into the alpha channel.
+					if ((source_file_index < m_params.m_source_alpha_filenames.size()) && (m_params.m_source_alpha_filenames[source_file_index].size()))
+					{
+						const char* pSource_alpha_image = m_params.m_source_alpha_filenames[source_file_index].c_str();
 
-					image alpha_data;
+						image alpha_data;
 
-					if (!load_image(pSource_alpha_image, alpha_data))
-					{
-						error_printf("Failed reading source image: %s\n", pSource_alpha_image);
-						return false;
-					}
+						if (!load_image(pSource_alpha_image, alpha_data))
+						{
+							error_printf("Failed reading source image: %s\n", pSource_alpha_image);
+							return false;
+						}
 
-					printf("Read source alpha image \"%s\", %ux%u\n", pSource_alpha_image, alpha_data.get_width(), alpha_data.get_height());
+						printf("Read source alpha image \"%s\", %ux%u\n", pSource_alpha_image, alpha_data.get_width(), alpha_data.get_height());
 
-					alpha_data.crop(file_image.get_width(), file_image.get_height());
+						alpha_data.crop(width, height);
 
-					for (uint32_t y = 0; y < file_image.get_height(); y++)
-						for (uint32_t x = 0; x < file_image.get_width(); x++)
-							file_image(x, y).a = (uint8_t)alpha_data(x, y).get_709_luma();
+						for (uint32_t y = 0; y < height; y++)
+							for (uint32_t x = 0; x < width; x++)
+								file_image(x, y).a = (uint8_t)alpha_data(x, y).get_709_luma();
+					}
 				}
 			}
 			else
 			{
-				file_image = m_params.m_source_images[source_file_index];
+				if (m_params.m_hdr)
+				{
+					file_image_hdr = m_params.m_source_images_hdr[source_file_index];
+					clean_hdr_image(file_image_hdr);
+				}
+				else
+				{
+					file_image = m_params.m_source_images[source_file_index];
+				}
 			}
 
-			if (m_params.m_renormalize)
-				file_image.renormalize_normal_map();
+			if (!m_params.m_hdr)
+			{
+				if (m_params.m_renormalize)
+					file_image.renormalize_normal_map();
+			}
 
 			bool alpha_swizzled = false;
+
 			if (m_params.m_swizzle[0] != 0 ||
 				m_params.m_swizzle[1] != 1 ||
 				m_params.m_swizzle[2] != 2 ||
 				m_params.m_swizzle[3] != 3)
 			{
-				// Used for XY normal maps in RG - puts X in color, Y in alpha
-				for (uint32_t y = 0; y < file_image.get_height(); y++)
-					for (uint32_t x = 0; x < file_image.get_width(); x++)
+				if (!m_params.m_hdr)
+				{
+					// Used for XY normal maps in RG - puts X in color, Y in alpha
+					for (uint32_t y = 0; y < file_image.get_height(); y++)
+					{
+						for (uint32_t x = 0; x < file_image.get_width(); x++)
+						{
+							const color_rgba& c = file_image(x, y);
+							file_image(x, y).set_noclamp_rgba(c[m_params.m_swizzle[0]], c[m_params.m_swizzle[1]], c[m_params.m_swizzle[2]], c[m_params.m_swizzle[3]]);
+						}
+					}
+
+					alpha_swizzled = (m_params.m_swizzle[3] != 3);
+				}
+				else
+				{
+					// Used for XY normal maps in RG - puts X in color, Y in alpha
+					for (uint32_t y = 0; y < file_image_hdr.get_height(); y++)
 					{
-						const color_rgba &c = file_image(x, y);
-						file_image(x, y).set_noclamp_rgba(c[m_params.m_swizzle[0]], c[m_params.m_swizzle[1]], c[m_params.m_swizzle[2]], c[m_params.m_swizzle[3]]);
+						for (uint32_t x = 0; x < file_image_hdr.get_width(); x++)
+						{
+							const vec4F& c = file_image_hdr(x, y);
+							
+							// For now, alpha is always 1.0f in UASTC HDR.
+							file_image_hdr(x, y).set(c[m_params.m_swizzle[0]], c[m_params.m_swizzle[1]], c[m_params.m_swizzle[2]], 1.0f); // c[m_params.m_swizzle[3]]);
+						}
 					}
-				alpha_swizzled = m_params.m_swizzle[3] != 3;
+				}
 			}
-						
+
 			bool has_alpha = false;
-			if (m_params.m_force_alpha || alpha_swizzled)
-				has_alpha = true;
-			else if (!m_params.m_check_for_alpha)
-				file_image.set_alpha(255);
-			else if (file_image.has_alpha())
-				has_alpha = true;
 
-			if (has_alpha)
-				m_any_source_image_has_alpha = true;
+			if (!m_params.m_hdr)
+			{
+				if (m_params.m_force_alpha || alpha_swizzled)
+					has_alpha = true;
+				else if (!m_params.m_check_for_alpha)
+					file_image.set_alpha(255);
+				else if (file_image.has_alpha())
+					has_alpha = true;
+
+				if (has_alpha)
+					m_any_source_image_has_alpha = true;
+			}
+
+			{
+				const uint32_t width = m_params.m_hdr ? file_image_hdr.get_width() : file_image.get_width();
+				const uint32_t height = m_params.m_hdr ? file_image_hdr.get_height() : file_image.get_height();
+
+				debug_printf("Source image index %u filename %s %ux%u has alpha: %u\n", source_file_index, pSource_filename, width, height, has_alpha);
+			}
 
-			debug_printf("Source image index %u filename %s %ux%u has alpha: %u\n", source_file_index, pSource_filename, file_image.get_width(), file_image.get_height(), has_alpha);
-												
 			if (m_params.m_y_flip)
-				file_image.flip_y();
+			{
+				if (m_params.m_hdr)
+					file_image_hdr.flip_y();
+				else
+					file_image.flip_y();
+			}
 
 #if DEBUG_EXTRACT_SINGLE_BLOCK
-			image block_image(4, 4);
 			const uint32_t block_x = 0;
 			const uint32_t block_y = 0;
-			block_image.blit(block_x * 4, block_y * 4, 4, 4, 0, 0, file_image, 0);
-			file_image = block_image;
+
+			if (m_params.m_hdr) // HDR path: the source pixels live in file_image_hdr (imagef)
+			{
+				imagef block_image(4, 4);
+				block_image.blit(block_x * 4, block_y * 4, 4, 4, 0, 0, file_image_hdr, 0); // fill the local 4x4 image that replaces file_image_hdr below
+				file_image_hdr = block_image;
+			}
+			else
+			{
+				image block_image(4, 4);
+				block_image.blit(block_x * 4, block_y * 4, 4, 4, 0, 0, file_image, 0);
+				file_image = block_image;
+			}
 #endif
 
 #if DEBUG_CROP_TEXTURE_TO_64x64
-			file_image.resize(64, 64);
+			if (m_params.m_hdr)
+				file_image_hdr.resize(64, 64);
+			else
+				file_image.resize(64, 64);
 #endif
 
-			if (m_params.m_resample_width > 0 && m_params.m_resample_height > 0)
+			if ((m_params.m_resample_width > 0) && (m_params.m_resample_height > 0))
 			{
 				int new_width = basisu::minimum<int>(m_params.m_resample_width, BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION);
 				int new_height = basisu::minimum<int>(m_params.m_resample_height, BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION);
@@ -588,129 +1391,225 @@ namespace basisu
 				debug_printf("Resampling to %ix%i\n", new_width, new_height);
 
 				// TODO: A box filter - kaiser looks too sharp on video. Let the caller control this.
-				image temp_img(new_width, new_height);
-				image_resample(file_image, temp_img, m_params.m_perceptual, "box"); // "kaiser");
-				temp_img.swap(file_image);
+				if (m_params.m_hdr)
+				{
+					imagef temp_img(new_width, new_height);
+					image_resample(file_image_hdr, temp_img, "box"); // "kaiser");
+					clean_hdr_image(temp_img);
+					temp_img.swap(file_image_hdr);
+				}
+				else
+				{
+					image temp_img(new_width, new_height);
+					image_resample(file_image, temp_img, m_params.m_perceptual, "box"); // "kaiser");
+					temp_img.swap(file_image);
+				}
 			}
 			else if (m_params.m_resample_factor > 0.0f)
 			{
-				int new_width = basisu::minimum<int>(basisu::maximum(1, (int)ceilf(file_image.get_width() * m_params.m_resample_factor)), BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION);
-				int new_height = basisu::minimum<int>(basisu::maximum(1, (int)ceilf(file_image.get_height() * m_params.m_resample_factor)), BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION);
+				// TODO: A box filter - kaiser looks too sharp on video. Let the caller control this.
+				if (m_params.m_hdr)
+				{
+					int new_width = basisu::minimum<int>(basisu::maximum(1, (int)ceilf(file_image_hdr.get_width() * m_params.m_resample_factor)), BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION);
+					int new_height = basisu::minimum<int>(basisu::maximum(1, (int)ceilf(file_image_hdr.get_height() * m_params.m_resample_factor)), BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION);
 
-				debug_printf("Resampling to %ix%i\n", new_width, new_height);
+					debug_printf("Resampling to %ix%i\n", new_width, new_height);
 
-				// TODO: A box filter - kaiser looks too sharp on video. Let the caller control this.
-				image temp_img(new_width, new_height);
-				image_resample(file_image, temp_img, m_params.m_perceptual, "box"); // "kaiser");
-				temp_img.swap(file_image);
+					imagef temp_img(new_width, new_height);
+					image_resample(file_image_hdr, temp_img, "box"); // "kaiser");
+					clean_hdr_image(temp_img);
+					temp_img.swap(file_image_hdr);
+				}
+				else
+				{
+					int new_width = basisu::minimum<int>(basisu::maximum(1, (int)ceilf(file_image.get_width() * m_params.m_resample_factor)), BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION);
+					int new_height = basisu::minimum<int>(basisu::maximum(1, (int)ceilf(file_image.get_height() * m_params.m_resample_factor)), BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION);
+
+					debug_printf("Resampling to %ix%i\n", new_width, new_height);
+
+					image temp_img(new_width, new_height);
+					image_resample(file_image, temp_img, m_params.m_perceptual, "box"); // "kaiser");
+					temp_img.swap(file_image);
+				}
 			}
 
-			if ((!file_image.get_width()) || (!file_image.get_height()))
+			const uint32_t width = m_params.m_hdr ? file_image_hdr.get_width() : file_image.get_width();
+			const uint32_t height = m_params.m_hdr ? file_image_hdr.get_height() : file_image.get_height();
+
+			if ((!width) || (!height))
 			{
 				error_printf("basis_compressor::read_source_images: Source image has a zero width and/or height!\n");
 				return false;
 			}
 
-			if ((file_image.get_width() > BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION) || (file_image.get_height() > BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION))
+			if ((width > BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION) || (height > BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION))
 			{
 				error_printf("basis_compressor::read_source_images: Source image \"%s\" is too large!\n", pSource_filename);
 				return false;
 			}
 
-			source_images.enlarge(1)->swap(file_image);
+			if (!m_params.m_hdr)
+				source_images.enlarge(1)->swap(file_image);
+			else
+				source_images_hdr.enlarge(1)->swap(file_image_hdr);
+
 			source_filenames.push_back(pSource_filename);
 		}
 
 		// Check if the caller has generated their own mipmaps. 
-		if (m_params.m_source_mipmap_images.size())
+		if (m_params.m_hdr)
 		{
-			// Make sure they've passed us enough mipmap chains.
-			if ((m_params.m_source_images.size() != m_params.m_source_mipmap_images.size()) || (total_source_files != m_params.m_source_images.size()))
+			if (m_params.m_source_mipmap_images_hdr.size())
 			{
-				error_printf("basis_compressor::read_source_images(): m_params.m_source_mipmap_images.size() must equal m_params.m_source_images.size()!\n");
-				return false;
+				// Make sure they've passed us enough mipmap chains.
+				if ((m_params.m_source_images_hdr.size() != m_params.m_source_mipmap_images_hdr.size()) || (total_source_files != m_params.m_source_images_hdr.size()))
+				{
+					error_printf("basis_compressor::read_source_images(): m_params.m_source_mipmap_images_hdr.size() must equal m_params.m_source_images_hdr.size()!\n");
+					return false;
+				}
 			}
-
-			// Check if any of the user-supplied mipmap levels has alpha.
-			// We're assuming the user has already preswizzled their mipmap source images.
-			if (!m_any_source_image_has_alpha)
+		}
+		else 
+		{
+			if (m_params.m_source_mipmap_images.size())
 			{
-				for (uint32_t source_file_index = 0; source_file_index < total_source_files; source_file_index++)
+				// Make sure they've passed us enough mipmap chains.
+				if ((m_params.m_source_images.size() != m_params.m_source_mipmap_images.size()) || (total_source_files != m_params.m_source_images.size()))
 				{
-					for (uint32_t mip_index = 0; mip_index < m_params.m_source_mipmap_images[source_file_index].size(); mip_index++)
-					{
-						const image& mip_img = m_params.m_source_mipmap_images[source_file_index][mip_index];
+					error_printf("basis_compressor::read_source_images(): m_params.m_source_mipmap_images.size() must equal m_params.m_source_images.size()!\n");
+					return false;
+				}
 
-						if (mip_img.has_alpha())
+				// Check if any of the user-supplied mipmap levels has alpha.
+				if (!m_any_source_image_has_alpha)
+				{
+					for (uint32_t source_file_index = 0; source_file_index < total_source_files; source_file_index++)
+					{
+						for (uint32_t mip_index = 0; mip_index < m_params.m_source_mipmap_images[source_file_index].size(); mip_index++)
 						{
-							m_any_source_image_has_alpha = true;
-							break;
+							const image& mip_img = m_params.m_source_mipmap_images[source_file_index][mip_index];
+
+							// Be sure to take into account any swizzling which will be applied.
+							if (mip_img.has_alpha(m_params.m_swizzle[3]))
+							{
+								m_any_source_image_has_alpha = true;
+								break;
+							}
 						}
-					}
 
-					if (m_any_source_image_has_alpha)
-						break;
+						if (m_any_source_image_has_alpha)
+							break;
+					}
 				}
 			}
 		}
 
 		debug_printf("Any source image has alpha: %u\n", m_any_source_image_has_alpha);
 
+		// Now, for each source image, create the slices corresponding to that image.
 		for (uint32_t source_file_index = 0; source_file_index < total_source_files; source_file_index++)
 		{
 			const std::string &source_filename = source_filenames[source_file_index];
-
-			// Now, for each source image, create the slices corresponding to that image.
+						
 			basisu::vector<image> slices;
+			basisu::vector<imagef> slices_hdr;
 			
 			slices.reserve(32);
+			slices_hdr.reserve(32);
 						
 			// The first (largest) mipmap level.
-			image& file_image = source_images[source_file_index];
-						
+			image *pFile_image = source_images.size() ? &source_images[source_file_index] : nullptr;
+			imagef *pFile_image_hdr = source_images_hdr.size() ? &source_images_hdr[source_file_index] : nullptr;
+									
 			// Reserve a slot for mip0.
-			slices.resize(1);
-												
-			if (m_params.m_source_mipmap_images.size())
+			if (m_params.m_hdr)
+				slices_hdr.resize(1);
+			else
+				slices.resize(1);
+			
+			if ((!m_params.m_hdr) && (m_params.m_source_mipmap_images.size()))
 			{
 				// User-provided mipmaps for each layer or image in the texture array.
 				for (uint32_t mip_index = 0; mip_index < m_params.m_source_mipmap_images[source_file_index].size(); mip_index++)
 				{
 					image& mip_img = m_params.m_source_mipmap_images[source_file_index][mip_index];
 
-					if (m_params.m_swizzle[0] != 0 ||
-						m_params.m_swizzle[1] != 1 ||
-						m_params.m_swizzle[2] != 2 ||
-						m_params.m_swizzle[3] != 3)
+					if ((m_params.m_swizzle[0] != 0) ||
+						(m_params.m_swizzle[1] != 1) ||
+						(m_params.m_swizzle[2] != 2) ||
+						(m_params.m_swizzle[3] != 3))
 					{
 						// Used for XY normal maps in RG - puts X in color, Y in alpha
 						for (uint32_t y = 0; y < mip_img.get_height(); y++)
+						{
 							for (uint32_t x = 0; x < mip_img.get_width(); x++)
 							{
-								const color_rgba &c = mip_img(x, y);
+								const color_rgba& c = mip_img(x, y);
 								mip_img(x, y).set_noclamp_rgba(c[m_params.m_swizzle[0]], c[m_params.m_swizzle[1]], c[m_params.m_swizzle[2]], c[m_params.m_swizzle[3]]);
 							}
+						}
 					}
 
 					slices.push_back(mip_img);
 				}
 			}
+			else if ((m_params.m_hdr) && (m_params.m_source_mipmap_images_hdr.size()))
+			{
+				// User-provided mipmaps for each layer or image in the texture array.
+				for (uint32_t mip_index = 0; mip_index < m_params.m_source_mipmap_images_hdr[source_file_index].size(); mip_index++)
+				{
+					imagef& mip_img = m_params.m_source_mipmap_images_hdr[source_file_index][mip_index];
+
+					if ((m_params.m_swizzle[0] != 0) ||
+						(m_params.m_swizzle[1] != 1) ||
+						(m_params.m_swizzle[2] != 2) ||
+						(m_params.m_swizzle[3] != 3))
+					{
+						// Used for XY normal maps in RG - puts X in color, Y in alpha
+						for (uint32_t y = 0; y < mip_img.get_height(); y++)
+						{
+							for (uint32_t x = 0; x < mip_img.get_width(); x++)
+							{
+								const vec4F& c = mip_img(x, y);
+
+								// For now, HDR alpha is always 1.0f.
+								mip_img(x, y).set(c[m_params.m_swizzle[0]], c[m_params.m_swizzle[1]], c[m_params.m_swizzle[2]], 1.0f); // c[m_params.m_swizzle[3]]);
+							}
+						}
+					}
+
+					clean_hdr_image(mip_img);
+
+					slices_hdr.push_back(mip_img);
+				}
+			}
 			else if (m_params.m_mip_gen)
 			{
 				// Automatically generate mipmaps.
-				if (!generate_mipmaps(file_image, slices, m_any_source_image_has_alpha))
-					return false;
+				if (m_params.m_hdr)
+				{
+					if (!generate_mipmaps(*pFile_image_hdr, slices_hdr, m_any_source_image_has_alpha))
+						return false;
+				}
+				else
+				{
+					if (!generate_mipmaps(*pFile_image, slices, m_any_source_image_has_alpha))
+						return false;
+				}
 			}
 
 			// Swap in the largest mipmap level here to avoid copying it, because generate_mips() will change the array.
 			// NOTE: file_image is now blank.
-			slices[0].swap(file_image);
+			if (m_params.m_hdr)
+				slices_hdr[0].swap(*pFile_image_hdr);
+			else
+				slices[0].swap(*pFile_image);
 
-			uint_vec mip_indices(slices.size());
-			for (uint32_t i = 0; i < slices.size(); i++)
+			uint_vec mip_indices(m_params.m_hdr ? slices_hdr.size() : slices.size());
+			for (uint32_t i = 0; i < (m_params.m_hdr ? slices_hdr.size() : slices.size()); i++)
 				mip_indices[i] = i;
 						
-			if ((m_any_source_image_has_alpha) && (!m_params.m_uastc))
+			if ((!m_params.m_hdr) && (m_any_source_image_has_alpha) && (!m_params.m_uastc))
 			{
 				// For ETC1S, if source has alpha, then even mips will have RGB, and odd mips will have alpha in RGB. 
 				basisu::vector<image> alpha_slices;
@@ -745,20 +1644,29 @@ namespace basisu
 				mip_indices.swap(new_mip_indices);
 			}
 
-			assert(slices.size() == mip_indices.size());
-						
-			for (uint32_t slice_index = 0; slice_index < slices.size(); slice_index++)
+			if (m_params.m_hdr)
+			{
+				assert(slices_hdr.size() == mip_indices.size());
+			}
+			else
+			{
+				assert(slices.size() == mip_indices.size());
+			}
+					
+			for (uint32_t slice_index = 0; slice_index < (m_params.m_hdr ? slices_hdr.size() : slices.size()); slice_index++)
 			{
-				image& slice_image = slices[slice_index];
-				const uint32_t orig_width = slice_image.get_width();
-				const uint32_t orig_height = slice_image.get_height();
+				image *pSlice_image = m_params.m_hdr ? nullptr : &slices[slice_index];
+				imagef *pSlice_image_hdr = m_params.m_hdr ? &slices_hdr[slice_index] : nullptr;
+
+				const uint32_t orig_width = m_params.m_hdr ? pSlice_image_hdr->get_width() : pSlice_image->get_width();
+				const uint32_t orig_height = m_params.m_hdr ? pSlice_image_hdr->get_height() : pSlice_image->get_height();
 
 				bool is_alpha_slice = false;
-				if (m_any_source_image_has_alpha)
+				if ((!m_params.m_hdr) && (m_any_source_image_has_alpha))
 				{
 					if (m_params.m_uastc)
 					{
-						is_alpha_slice = slice_image.has_alpha();
+						is_alpha_slice = pSlice_image->has_alpha();
 					}
 					else
 					{
@@ -767,43 +1675,69 @@ namespace basisu
 				}
 
 				// Enlarge the source image to 4x4 block boundaries, duplicating edge pixels if necessary to avoid introducing extra colors into blocks.
-				slice_image.crop_dup_borders(slice_image.get_block_width(4) * 4, slice_image.get_block_height(4) * 4);
+				if (m_params.m_hdr)
+					pSlice_image_hdr->crop_dup_borders(pSlice_image_hdr->get_block_width(4) * 4, pSlice_image_hdr->get_block_height(4) * 4);
+				else
+					pSlice_image->crop_dup_borders(pSlice_image->get_block_width(4) * 4, pSlice_image->get_block_height(4) * 4);
 
 				if (m_params.m_debug_images)
 				{
-					save_png(string_format("basis_debug_source_image_%u_slice_%u.png", source_file_index, slice_index).c_str(), slice_image);
+					if (m_params.m_hdr)
+						write_exr(string_format("basis_debug_source_image_%u_slice_%u.exr", source_file_index, slice_index).c_str(), *pSlice_image_hdr, 3, 0);
+					else
+						save_png(string_format("basis_debug_source_image_%u_slice_%u.png", source_file_index, slice_index).c_str(), *pSlice_image);
 				}
 
-				const uint32_t dest_image_index = m_slice_images.size();
+				const uint32_t dest_image_index = (m_params.m_hdr ? m_slice_images_hdr.size() : m_slice_images.size());
 
 				enlarge_vector(m_stats, 1);
-				enlarge_vector(m_slice_images, 1);
+
+				if (m_params.m_hdr)
+					enlarge_vector(m_slice_images_hdr, 1);
+				else
+					enlarge_vector(m_slice_images, 1);
+
 				enlarge_vector(m_slice_descs, 1);
-								
+
 				m_stats[dest_image_index].m_filename = source_filename.c_str();
 				m_stats[dest_image_index].m_width = orig_width;
 				m_stats[dest_image_index].m_height = orig_height;
-								
-				debug_printf("****** Slice %u: mip %u, alpha_slice: %u, filename: \"%s\", original: %ux%u actual: %ux%u\n", m_slice_descs.size() - 1, mip_indices[slice_index], is_alpha_slice, source_filename.c_str(), orig_width, orig_height, slice_image.get_width(), slice_image.get_height());
 
-				basisu_backend_slice_desc &slice_desc = m_slice_descs[dest_image_index];
+				debug_printf("****** Slice %u: mip %u, alpha_slice: %u, filename: \"%s\", original: %ux%u actual: %ux%u\n", 
+					m_slice_descs.size() - 1, mip_indices[slice_index], is_alpha_slice, source_filename.c_str(), 
+					orig_width, orig_height, 
+					m_params.m_hdr ? pSlice_image_hdr->get_width() : pSlice_image->get_width(), 
+					m_params.m_hdr ? pSlice_image_hdr->get_height() : pSlice_image->get_height());
+
+				basisu_backend_slice_desc& slice_desc = m_slice_descs[dest_image_index];
 
 				slice_desc.m_first_block_index = m_total_blocks;
 
 				slice_desc.m_orig_width = orig_width;
 				slice_desc.m_orig_height = orig_height;
 
-				slice_desc.m_width = slice_image.get_width();
-				slice_desc.m_height = slice_image.get_height();
+				if (m_params.m_hdr)
+				{
+					slice_desc.m_width = pSlice_image_hdr->get_width();
+					slice_desc.m_height = pSlice_image_hdr->get_height();
+
+					slice_desc.m_num_blocks_x = pSlice_image_hdr->get_block_width(4);
+					slice_desc.m_num_blocks_y = pSlice_image_hdr->get_block_height(4);
+				}
+				else
+				{
+					slice_desc.m_width = pSlice_image->get_width();
+					slice_desc.m_height = pSlice_image->get_height();
 
-				slice_desc.m_num_blocks_x = slice_image.get_block_width(4);
-				slice_desc.m_num_blocks_y = slice_image.get_block_height(4);
+					slice_desc.m_num_blocks_x = pSlice_image->get_block_width(4);
+					slice_desc.m_num_blocks_y = pSlice_image->get_block_height(4);
+				}
 
 				slice_desc.m_num_macroblocks_x = (slice_desc.m_num_blocks_x + 1) >> 1;
 				slice_desc.m_num_macroblocks_y = (slice_desc.m_num_blocks_y + 1) >> 1;
 
 				slice_desc.m_source_file_index = source_file_index;
-				
+
 				slice_desc.m_mip_index = mip_indices[slice_index];
 
 				slice_desc.m_alpha = is_alpha_slice;
@@ -818,8 +1752,11 @@ namespace basisu
 
 				// Finally, swap in the slice's image to avoid copying it.
 				// NOTE: slice_image is now blank.
-				m_slice_images[dest_image_index].swap(slice_image);
-			
+				if (m_params.m_hdr)
+					m_slice_images_hdr[dest_image_index].swap(*pSlice_image_hdr);
+				else
+					m_slice_images[dest_image_index].swap(*pSlice_image);
+
 			} // slice_index
 
 		} // source_file_index
@@ -855,7 +1792,7 @@ namespace basisu
 
 		if (m_params.m_status_output)
 		{
-			printf("Total basis file slices: %u\n", (uint32_t)m_slice_descs.size());
+			printf("Total slices: %u\n", (uint32_t)m_slice_descs.size());
 		}
 
 		for (uint32_t i = 0; i < m_slice_descs.size(); i++)
@@ -865,11 +1802,17 @@ namespace basisu
 			if (m_params.m_status_output)
 			{
 				printf("Slice: %u, alpha: %u, orig width/height: %ux%u, width/height: %ux%u, first_block: %u, image_index: %u, mip_level: %u, iframe: %u\n",
-					i, slice_desc.m_alpha, slice_desc.m_orig_width, slice_desc.m_orig_height, slice_desc.m_width, slice_desc.m_height, slice_desc.m_first_block_index, slice_desc.m_source_file_index, slice_desc.m_mip_index, slice_desc.m_iframe);
+					i, slice_desc.m_alpha, slice_desc.m_orig_width, slice_desc.m_orig_height, 
+					slice_desc.m_width, slice_desc.m_height, 
+					slice_desc.m_first_block_index, slice_desc.m_source_file_index, slice_desc.m_mip_index, slice_desc.m_iframe);
 			}
 
 			if (m_any_source_image_has_alpha)
 			{
+				// HDR doesn't support alpha yet
+				if (m_params.m_hdr)
+					return false;
+
 				if (!m_params.m_uastc)
 				{
 					// For ETC1S, alpha slices must be at odd slice indices.
@@ -903,6 +1846,7 @@ namespace basisu
 
 			if ((slice_desc.m_orig_width > slice_desc.m_width) || (slice_desc.m_orig_height > slice_desc.m_height))
 				return false;
+
 			if ((slice_desc.m_source_file_index == 0) && (m_params.m_tex_type == basist::cBASISTexTypeVideoFrames))
 			{
 				if (!slice_desc.m_iframe)
@@ -924,7 +1868,7 @@ namespace basisu
 				
 		uint32_t total_basis_images = 0;
 
-		for (uint32_t slice_index = 0; slice_index < m_slice_images.size(); slice_index++)
+		for (uint32_t slice_index = 0; slice_index < (m_params.m_hdr ? m_slice_images_hdr.size() : m_slice_images.size()); slice_index++)
 		{
 			const basisu_backend_slice_desc &slice_desc = m_slice_descs[slice_index];
 				
@@ -945,7 +1889,7 @@ namespace basisu
 		uint_vec image_mipmap_levels(total_basis_images);
 
 		int width = -1, height = -1;
-		for (uint32_t slice_index = 0; slice_index < m_slice_images.size(); slice_index++)
+		for (uint32_t slice_index = 0; slice_index < (m_params.m_hdr ? m_slice_images_hdr.size() : m_slice_images.size()); slice_index++)
 		{
 			const basisu_backend_slice_desc &slice_desc = m_slice_descs[slice_index];
 
@@ -982,20 +1926,52 @@ namespace basisu
 	{
 		debug_printf("basis_compressor::extract_source_blocks\n");
 
-		m_source_blocks.resize(m_total_blocks);
+		if (m_params.m_hdr)
+			m_source_blocks_hdr.resize(m_total_blocks);
+		else
+			m_source_blocks.resize(m_total_blocks);
 
-		for (uint32_t slice_index = 0; slice_index < m_slice_images.size(); slice_index++)
+		for (uint32_t slice_index = 0; slice_index < (m_params.m_hdr ? m_slice_images_hdr.size() : m_slice_images.size()); slice_index++)
 		{
 			const basisu_backend_slice_desc& slice_desc = m_slice_descs[slice_index];
 
 			const uint32_t num_blocks_x = slice_desc.m_num_blocks_x;
 			const uint32_t num_blocks_y = slice_desc.m_num_blocks_y;
 
-			const image& source_image = m_slice_images[slice_index];
+			const image *pSource_image = m_params.m_hdr ? nullptr : &m_slice_images[slice_index];
+			const imagef *pSource_image_hdr = m_params.m_hdr ? &m_slice_images_hdr[slice_index] : nullptr;
 
 			for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++)
+			{
 				for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++)
-					source_image.extract_block_clamped(m_source_blocks[slice_desc.m_first_block_index + block_x + block_y * num_blocks_x].get_ptr(), block_x * 4, block_y * 4, 4, 4);
+				{
+					if (m_params.m_hdr)
+					{
+						vec4F* pBlock = m_source_blocks_hdr[slice_desc.m_first_block_index + block_x + block_y * num_blocks_x].get_ptr();
+
+						pSource_image_hdr->extract_block_clamped(pBlock, block_x * 4, block_y * 4, 4, 4);
+
+						// Additional (technically optional) early sanity checking of the block texels.
+						for (uint32_t i = 0; i < 16; i++)
+						{
+							for (uint32_t c = 0; c < 3; c++)
+							{
+								float v = pBlock[i][c];
+
+								if (std::isnan(v) || std::isinf(v) || (v < 0.0f) || (v > basist::MAX_HALF_FLOAT))
+								{
+									error_printf("basis_compressor::extract_source_blocks: invalid float component\n");
+									return false;
+								}
+							}
+						}
+					}
+					else
+					{
+						pSource_image->extract_block_clamped(m_source_blocks[slice_desc.m_first_block_index + block_x + block_y * num_blocks_x].get_ptr(), block_x * 4, block_y * 4, 4, 4);
+					}
+				}
+			}
 		}
 
 		return true;
@@ -1304,6 +2280,8 @@ namespace basisu
 		m_output_basis_file = comp_data;
 
 		uint32_t total_orig_pixels = 0, total_texels = 0, total_orig_texels = 0;
+		(void)total_texels;
+
 		for (uint32_t i = 0; i < m_slice_descs.size(); i++)
 		{
 			const basisu_backend_slice_desc& slice_desc = m_slice_descs[i];
@@ -1335,10 +2313,21 @@ namespace basisu
 			}
 
 			m_decoded_output_textures.resize(m_slice_descs.size());
-			m_decoded_output_textures_unpacked.resize(m_slice_descs.size());
 
-			m_decoded_output_textures_bc7.resize(m_slice_descs.size());
-			m_decoded_output_textures_unpacked_bc7.resize(m_slice_descs.size());
+			if (m_params.m_hdr)
+			{
+				m_decoded_output_textures_bc6h_hdr_unpacked.resize(m_slice_descs.size());
+
+				m_decoded_output_textures_astc_hdr.resize(m_slice_descs.size());
+				m_decoded_output_textures_astc_hdr_unpacked.resize(m_slice_descs.size());
+			}
+			else
+			{
+				m_decoded_output_textures_unpacked.resize(m_slice_descs.size());
+
+				m_decoded_output_textures_bc7.resize(m_slice_descs.size());
+				m_decoded_output_textures_unpacked_bc7.resize(m_slice_descs.size());
+			}
 
 			tm.start();
 			if (m_params.m_pGlobal_codebooks)
@@ -1360,12 +2349,16 @@ namespace basisu
 
 			for (uint32_t i = 0; i < m_slice_descs.size(); i++)
 			{
+				basisu::texture_format tex_format = m_params.m_hdr ? texture_format::cBC6HUnsigned : (m_params.m_uastc ? texture_format::cUASTC4x4 : texture_format::cETC1);
+				basist::block_format format = m_params.m_hdr ? basist::block_format::cBC6H : (m_params.m_uastc ? basist::block_format::cUASTC_4x4 : basist::block_format::cETC1);
+
 				gpu_image decoded_texture;
-				decoded_texture.init(m_params.m_uastc ? texture_format::cUASTC4x4 : texture_format::cETC1, m_slice_descs[i].m_width, m_slice_descs[i].m_height);
+				decoded_texture.init(
+					tex_format, 
+					m_slice_descs[i].m_width, m_slice_descs[i].m_height);
 
 				tm.start();
-
-				basist::block_format format = m_params.m_uastc ? basist::block_format::cUASTC_4x4 : basist::block_format::cETC1;
+								
 				uint32_t bytes_per_block = m_params.m_uastc ? 16 : 8;
 
 				if (!decoder.transcode_slice(&comp_data[0], (uint32_t)comp_data.size(), i,
@@ -1391,43 +2384,87 @@ namespace basisu
 				m_decoded_output_textures[i] = decoded_texture;
 			}
 
-			double total_time_bc7 = 0;
+			double total_alt_transcode_time = 0;
+			tm.start();
 
-			if (basist::basis_is_format_supported(basist::transcoder_texture_format::cTFBC7_RGBA, basist::basis_tex_format::cUASTC4x4) &&
-				basist::basis_is_format_supported(basist::transcoder_texture_format::cTFBC7_RGBA, basist::basis_tex_format::cETC1S))
+			if (m_params.m_hdr)
 			{
+				assert(basist::basis_is_format_supported(basist::transcoder_texture_format::cTFASTC_HDR_4x4_RGBA, basist::basis_tex_format::cUASTC_HDR_4x4));
+
 				for (uint32_t i = 0; i < m_slice_descs.size(); i++)
 				{
 					gpu_image decoded_texture;
-					decoded_texture.init(texture_format::cBC7, m_slice_descs[i].m_width, m_slice_descs[i].m_height);
+					decoded_texture.init(texture_format::cASTC_HDR_4x4, m_slice_descs[i].m_width, m_slice_descs[i].m_height);
 
 					tm.start();
 
 					if (!decoder.transcode_slice(&comp_data[0], (uint32_t)comp_data.size(), i,
-						reinterpret_cast<etc_block*>(decoded_texture.get_ptr()), m_slice_descs[i].m_num_blocks_x * m_slice_descs[i].m_num_blocks_y, basist::block_format::cBC7, 16))
+						reinterpret_cast<basist::astc_blk*>(decoded_texture.get_ptr()), m_slice_descs[i].m_num_blocks_x * m_slice_descs[i].m_num_blocks_y, basist::block_format::cASTC_HDR_4x4, 16))
 					{
-						error_printf("Transcoding failed to BC7 on slice %u!\n", i);
+						error_printf("Transcoding failed to ASTC HDR on slice %u!\n", i);
 						return false;
 					}
-
-					total_time_bc7 += tm.get_elapsed_secs();
-
-					m_decoded_output_textures_bc7[i] = decoded_texture;
+										
+					m_decoded_output_textures_astc_hdr[i] = decoded_texture;
+				}
+			}
+			else
+			{
+				if (basist::basis_is_format_supported(basist::transcoder_texture_format::cTFBC7_RGBA, basist::basis_tex_format::cUASTC4x4) &&
+					basist::basis_is_format_supported(basist::transcoder_texture_format::cTFBC7_RGBA, basist::basis_tex_format::cETC1S))
+				{
+					for (uint32_t i = 0; i < m_slice_descs.size(); i++)
+					{
+						gpu_image decoded_texture;
+						decoded_texture.init(texture_format::cBC7, m_slice_descs[i].m_width, m_slice_descs[i].m_height);
+												
+						if (!decoder.transcode_slice(&comp_data[0], (uint32_t)comp_data.size(), i,
+							reinterpret_cast<etc_block*>(decoded_texture.get_ptr()), m_slice_descs[i].m_num_blocks_x * m_slice_descs[i].m_num_blocks_y, basist::block_format::cBC7, 16))
+						{
+							error_printf("Transcoding failed to BC7 on slice %u!\n", i);
+							return false;
+						}
+												
+						m_decoded_output_textures_bc7[i] = decoded_texture;
+					}
 				}
 			}
 
+			total_alt_transcode_time = tm.get_elapsed_secs();
+
 			for (uint32_t i = 0; i < m_slice_descs.size(); i++)
 			{
-				m_decoded_output_textures[i].unpack(m_decoded_output_textures_unpacked[i]);
+				if (m_params.m_hdr)
+				{
+					// BC6H
+					bool status = m_decoded_output_textures[i].unpack_hdr(m_decoded_output_textures_bc6h_hdr_unpacked[i]);
+					assert(status);
+					BASISU_NOTE_UNUSED(status);
+					
+					// ASTC HDR
+					status = m_decoded_output_textures_astc_hdr[i].unpack_hdr(m_decoded_output_textures_astc_hdr_unpacked[i]);
+					assert(status);
+				}
+				else
+				{
+					bool status = m_decoded_output_textures[i].unpack(m_decoded_output_textures_unpacked[i]);
+					assert(status);
+					BASISU_NOTE_UNUSED(status);
 
-				if (m_decoded_output_textures_bc7[i].get_pixel_width())
-					m_decoded_output_textures_bc7[i].unpack(m_decoded_output_textures_unpacked_bc7[i]);
+					if (m_decoded_output_textures_bc7[i].get_pixel_width())
+					{
+						status = m_decoded_output_textures_bc7[i].unpack(m_decoded_output_textures_unpacked_bc7[i]);
+						assert(status);
+					}
+				}
 			}
 
-			debug_printf("Transcoded to %s in %3.3fms, %f texels/sec\n", m_params.m_uastc ? "ASTC" : "ETC1", total_time_etc1s_or_astc * 1000.0f, total_orig_pixels / total_time_etc1s_or_astc);
+			debug_printf("Transcoded to %s in %3.3fms, %f texels/sec\n", 
+				m_params.m_hdr ? "BC6H" : (m_params.m_uastc ? "ASTC" : "ETC1"),
+				total_time_etc1s_or_astc * 1000.0f, total_orig_pixels / total_time_etc1s_or_astc);
 
-			if (total_time_bc7 != 0)
-				debug_printf("Transcoded to BC7 in %3.3fms, %f texels/sec\n", total_time_bc7 * 1000.0f, total_orig_pixels / total_time_bc7);
+			if (total_alt_transcode_time != 0)
+				debug_printf("Alternate transcode in %3.3fms, %f texels/sec\n", total_alt_transcode_time * 1000.0f, total_orig_pixels / total_alt_transcode_time);
 
 			for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++)
 			{
@@ -1438,17 +2475,82 @@ namespace basisu
 
 				assert(m_decoded_output_textures[slice_index].get_total_blocks() == total_blocks);
 			}
+
 		} // if (m_params.m_validate_output_data)
 				
 		return true;
 	}
 
+	bool basis_compressor::write_hdr_debug_images(const char* pBasename, const imagef& orig_hdr_img, uint32_t width, uint32_t height)
+	{
+		// Debug aid: writes several PNG previews of an HDR image, each named pBasename plus a suffix. Always returns true.
+		imagef hdr_img(orig_hdr_img); // copy image to account for 4x4 block expansion, then size the copy to width x height
+		hdr_img.resize(width, height);
+
+		image srgb_img(width, height);
+
+		for (uint32_t y = 0; y < height; y++)
+		{
+			for (uint32_t x = 0; x < width; x++)
+			{
+				vec4F p(hdr_img(x, y));
+
+				p[0] = clamp(p[0], 0.0f, 1.0f); // RGB is clipped to [0,1] first, so anything brighter than 1.0 saturates in this preview
+				p[1] = clamp(p[1], 0.0f, 1.0f);
+				p[2] = clamp(p[2], 0.0f, 1.0f);
+
+				int rc = (int)std::round(linear_to_srgb(p[0]) * 255.0f); // linear -> sRGB, then quantize to 8 bits
+				int gc = (int)std::round(linear_to_srgb(p[1]) * 255.0f);
+				int bc = (int)std::round(linear_to_srgb(p[2]) * 255.0f);
+
+				srgb_img.set_clipped(x, y, color_rgba(rc, gc, bc, 255)); // source alpha is ignored; output alpha is always 255
+			}
+		}
+
+		{
+			const std::string filename(string_format("%s_linear_clamped_to_srgb.png", pBasename));
+			save_png(filename.c_str(), srgb_img); // outcome of the write is not checked; the message below prints regardless
+			printf("Wrote .PNG file %s\n", filename.c_str());
+		}
+
+		{
+			const std::string filename(string_format("%s_compressive_tonemapped.png", pBasename));
+			image compressive_tonemapped_img;
+			
+			bool status = tonemap_image_compressive(compressive_tonemapped_img, hdr_img); // on failure only this preview is skipped; the function carries on
+			if (!status)
+			{
+				error_printf("basis_compressor::write_hdr_debug_images: tonemap_image_compressive() failed (invalid half-float input)\n");
+			}
+			else
+			{
+				save_png(filename.c_str(), compressive_tonemapped_img);
+				printf("Wrote .PNG file %s\n", filename.c_str());
+			}
+		}
+
+		image tonemapped_img;
+
+		for (int e = -5; e <= 5; e++) // 11 Reinhard previews, one per exposure scale 2^-5 .. 2^5
+		{
+			const float scale = powf(2.0f, (float)e);
+
+			tonemap_image_reinhard(tonemapped_img, hdr_img, scale);
+
+			std::string filename(string_format("%s_reinhard_tonemapped_scale_%f.png", pBasename, scale)); // %f embeds the scale with six decimals, e.g. 0.031250
+			save_png(filename.c_str(), tonemapped_img, cImageSaveIgnoreAlpha);
+			printf("Wrote .PNG file %s\n", filename.c_str());
+		}
+
+		return true; // none of the failures above are propagated to the caller
+	}
+
 	bool basis_compressor::write_output_files_and_compute_stats()
 	{
 		debug_printf("basis_compressor::write_output_files_and_compute_stats\n");
 
 		const uint8_vec& comp_data = m_params.m_create_ktx2_file ? m_output_ktx2_file : m_basis_file.get_compressed_data();
-		if (m_params.m_write_output_basis_files)
+		if (m_params.m_write_output_basis_or_ktx2_files)
 		{
 			const std::string& output_filename = m_params.m_out_filename;
 
@@ -1458,7 +2560,7 @@ namespace basisu
 				return false;
 			}
 
-			if (m_params.m_status_output)
+			//if (m_params.m_status_output)
 			{
 				printf("Wrote output .basis/.ktx2 file \"%s\"\n", output_filename.c_str());
 			}
@@ -1485,7 +2587,7 @@ namespace basisu
 			
 			m_basis_bits_per_texel = comp_size * 8.0f / total_texels;
 
-			debug_printf(".basis file size: %u, LZ compressed file size: %u, %3.2f bits/texel\n",
+			debug_printf("Output file size: %u, LZ compressed file size: %u, %3.2f bits/texel\n",
 				(uint32_t)comp_data.size(),
 				(uint32_t)comp_size,
 				m_basis_bits_per_texel);
@@ -1495,191 +2597,324 @@ namespace basisu
 		
 		if (m_params.m_validate_output_data)
 		{
-			for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++)
+			if (m_params.m_hdr)
 			{
-				const basisu_backend_slice_desc& slice_desc = m_slice_descs[slice_index];
+				if (m_params.m_print_stats)
+				{
+					printf("ASTC/BC6H half float space error metrics (a piecewise linear approximation of log2 error):\n");
+				}
 
-				if (m_params.m_compute_stats)
+				for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++)
 				{
-					if (m_params.m_print_stats)
-						printf("Slice: %u\n", slice_index);
+					const basisu_backend_slice_desc& slice_desc = m_slice_descs[slice_index];
 
-					image_stats& s = m_stats[slice_index];
+					if (m_params.m_compute_stats)
+					{
+						image_stats& s = m_stats[slice_index];
 
-					// TODO: We used to output SSIM (during heavy encoder development), but this slowed down compression too much. We'll be adding it back.
+						if (m_params.m_print_stats)
+						{
+							printf("Slice: %u\n", slice_index);
+						}
 
-					image_metrics em;
+						image_metrics im;
 
-					// ---- .basis stats
-					em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 3);
-					if (m_params.m_print_stats)
-						em.print(".basis RGB Avg:          ");
-					s.m_basis_rgb_avg_psnr = em.m_psnr;
+						if (m_params.m_print_stats)
+						{
+							printf("\nASTC channels:\n");
+							for (uint32_t i = 0; i < 3; i++)
+							{
+								im.calc_half(m_slice_images_hdr[slice_index], m_decoded_output_textures_astc_hdr_unpacked[slice_index], i, 1, true);
 
-					em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 4);
-					if (m_params.m_print_stats)
-						em.print(".basis RGBA Avg:         ");
-					s.m_basis_rgba_avg_psnr = em.m_psnr;
+								printf("%c:   ", "RGB"[i]);
+								im.print_hp();
+							}
 
-					em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 1);
-					if (m_params.m_print_stats)
-						em.print(".basis R   Avg:          ");
+							printf("BC6H channels:\n");
+							for (uint32_t i = 0; i < 3; i++)
+							{
+								im.calc_half(m_slice_images_hdr[slice_index], m_decoded_output_textures_bc6h_hdr_unpacked[slice_index], i, 1, true);
 
-					em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 1, 1);
-					if (m_params.m_print_stats)
-						em.print(".basis G   Avg:          ");
+								printf("%c:   ", "RGB"[i]);
+								im.print_hp();
+							}
+						}
 
-					em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 2, 1);
-					if (m_params.m_print_stats)
-						em.print(".basis B   Avg:          ");
+						im.calc_half(m_slice_images_hdr[slice_index], m_decoded_output_textures_astc_hdr_unpacked[slice_index], 0, 3, true);
+						s.m_basis_rgb_avg_psnr = (float)im.m_psnr;
 
-					if (m_params.m_uastc)
-					{
-						em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 3, 1);
 						if (m_params.m_print_stats)
-							em.print(".basis A   Avg:          ");
+						{
+							printf("\nASTC RGB: ");
+							im.print_hp();
+#if 0
+							// Validation
+							im.calc_half2(m_slice_images_hdr[slice_index], m_decoded_output_textures_astc_hdr_unpacked[slice_index], 0, 3, true);
+							printf("\nASTC RGB (Alt): ");
+							im.print_hp();
+#endif
+						}
 
-						s.m_basis_a_avg_psnr = em.m_psnr;
+						im.calc_half(m_slice_images_hdr[slice_index], m_decoded_output_textures_bc6h_hdr_unpacked[slice_index], 0, 3, true);
+						s.m_basis_rgb_avg_bc6h_psnr = (float)im.m_psnr;
+
+						if (m_params.m_print_stats)
+						{
+							printf("BC6H RGB: ");
+							im.print_hp();
+							printf("\n");
+						}
 					}
+					
+					if (m_params.m_debug_images)
+					{
+						std::string out_basename;
+						if (m_params.m_out_filename.size())
+							string_get_filename(m_params.m_out_filename.c_str(), out_basename);
+						else if (m_params.m_source_filenames.size())
+							string_get_filename(m_params.m_source_filenames[slice_desc.m_source_file_index].c_str(), out_basename);
 
-					em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 0);
-					if (m_params.m_print_stats)
-						em.print(".basis 709 Luma:         ");
-					s.m_basis_luma_709_psnr = static_cast<float>(em.m_psnr);
-					s.m_basis_luma_709_ssim = static_cast<float>(em.m_ssim);
+						string_remove_extension(out_basename);
+						out_basename = "basis_debug_" + out_basename + string_format("_slice_%u", slice_index);
 
-					em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 0, true, true);
-					if (m_params.m_print_stats)
-						em.print(".basis 601 Luma:         ");
-					s.m_basis_luma_601_psnr = static_cast<float>(em.m_psnr);
+						// Write BC6H .DDS file.
+						{
+							gpu_image bc6h_tex(m_decoded_output_textures[slice_index]);
+							bc6h_tex.override_dimensions(slice_desc.m_orig_width, slice_desc.m_orig_height);
+							
+							std::string filename(out_basename + "_bc6h.dds");
+							write_compressed_texture_file(filename.c_str(), bc6h_tex, true);
+							printf("Wrote .DDS file %s\n", filename.c_str());
+						}
 
-					if (m_slice_descs.size() == 1)
-					{
-						const uint32_t output_size = comp_size ? (uint32_t)comp_size : (uint32_t)comp_data.size();
-						if (m_params.m_print_stats)
+						// Write ASTC .KTX/.astc files. ("astcenc -dh input.astc output.exr" to decode the astc file.)
+						{
+							gpu_image astc_tex(m_decoded_output_textures_astc_hdr[slice_index]);
+							astc_tex.override_dimensions(slice_desc.m_orig_width, slice_desc.m_orig_height);
+							
+							std::string filename1(out_basename + "_astc.astc");
+							write_astc_file(filename1.c_str(), astc_tex.get_ptr(), 4, 4, slice_desc.m_orig_width, slice_desc.m_orig_height);
+							printf("Wrote .ASTC file %s\n", filename1.c_str());
+
+							std::string filename2(out_basename + "_astc.ktx");
+							write_compressed_texture_file(filename2.c_str(), astc_tex, true);
+							printf("Wrote .KTX file %s\n", filename2.c_str());
+						}
+
+						// Write unpacked ASTC image to .EXR
+						{
+							imagef astc_img(m_decoded_output_textures_astc_hdr_unpacked[slice_index]);
+							astc_img.resize(slice_desc.m_orig_width, slice_desc.m_orig_height);
+							
+							std::string filename(out_basename + "_unpacked_astc.exr");
+							write_exr(filename.c_str(), astc_img, 3, 0);
+							printf("Wrote .EXR file %s\n", filename.c_str());
+						}
+
+						// Write unpacked BC6H image to .EXR
 						{
-							debug_printf(".basis RGB PSNR per bit/texel*10000: %3.3f\n", 10000.0f * s.m_basis_rgb_avg_psnr / ((output_size * 8.0f) / (slice_desc.m_orig_width * slice_desc.m_orig_height)));
-							debug_printf(".basis Luma 709 PSNR per bit/texel*10000: %3.3f\n", 10000.0f * s.m_basis_luma_709_psnr / ((output_size * 8.0f) / (slice_desc.m_orig_width * slice_desc.m_orig_height)));
+							imagef bc6h_img(m_decoded_output_textures_bc6h_hdr_unpacked[slice_index]);
+							bc6h_img.resize(slice_desc.m_orig_width, slice_desc.m_orig_height);
+
+							std::string filename(out_basename + "_unpacked_bc6h.exr");
+							write_exr(filename.c_str(), bc6h_img, 3, 0);
+							printf("Wrote .EXR file %s\n", filename.c_str());
 						}
+
+						// Write tonemapped/srgb images
+						write_hdr_debug_images((out_basename + "_source").c_str(), m_slice_images_hdr[slice_index], slice_desc.m_orig_width, slice_desc.m_orig_height);
+						write_hdr_debug_images((out_basename + "_unpacked_astc").c_str(), m_decoded_output_textures_astc_hdr_unpacked[slice_index], slice_desc.m_orig_width, slice_desc.m_orig_height);
+						write_hdr_debug_images((out_basename + "_unpacked_bc6h").c_str(), m_decoded_output_textures_bc6h_hdr_unpacked[slice_index], slice_desc.m_orig_width, slice_desc.m_orig_height);
 					}
+				}
+			}
+			else
+			{
+				for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++)
+				{
+					const basisu_backend_slice_desc& slice_desc = m_slice_descs[slice_index];
 
-					if (m_decoded_output_textures_unpacked_bc7[slice_index].get_width())
+					if (m_params.m_compute_stats)
 					{
-						// ---- BC7 stats
-						em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 3);
 						if (m_params.m_print_stats)
-							em.print("BC7 RGB Avg:             ");
-						s.m_bc7_rgb_avg_psnr = em.m_psnr;
+							printf("Slice: %u\n", slice_index);
+
+						image_stats& s = m_stats[slice_index];
+												
+						image_metrics em;
+
+						// ---- .basis stats
+						em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 3);
+						if (m_params.m_print_stats)
+							em.print(".basis RGB Avg:          ");
+						s.m_basis_rgb_avg_psnr = (float)em.m_psnr;
 
-						em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 4);
+						em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 4);
 						if (m_params.m_print_stats)
-							em.print("BC7 RGBA Avg:            ");
-						s.m_bc7_rgba_avg_psnr = em.m_psnr;
+							em.print(".basis RGBA Avg:         ");
+						s.m_basis_rgba_avg_psnr = (float)em.m_psnr;
 
-						em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 1);
+						em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 1);
 						if (m_params.m_print_stats)
-							em.print("BC7 R   Avg:             ");
+							em.print(".basis R   Avg:          ");
 
-						em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 1, 1);
+						em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 1, 1);
 						if (m_params.m_print_stats)
-							em.print("BC7 G   Avg:             ");
+							em.print(".basis G   Avg:          ");
 
-						em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 2, 1);
+						em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 2, 1);
 						if (m_params.m_print_stats)
-							em.print("BC7 B   Avg:             ");
+							em.print(".basis B   Avg:          ");
 
 						if (m_params.m_uastc)
 						{
-							em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 3, 1);
+							em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 3, 1);
 							if (m_params.m_print_stats)
-								em.print("BC7 A   Avg:             ");
+								em.print(".basis A   Avg:          ");
 
-							s.m_bc7_a_avg_psnr = em.m_psnr;
+							s.m_basis_a_avg_psnr = (float)em.m_psnr;
 						}
 
-						em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 0);
+						em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 0);
 						if (m_params.m_print_stats)
-							em.print("BC7 709 Luma:            ");
-						s.m_bc7_luma_709_psnr = static_cast<float>(em.m_psnr);
-						s.m_bc7_luma_709_ssim = static_cast<float>(em.m_ssim);
+							em.print(".basis 709 Luma:         ");
+						s.m_basis_luma_709_psnr = static_cast<float>(em.m_psnr);
+						s.m_basis_luma_709_ssim = static_cast<float>(em.m_ssim);
 
-						em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 0, true, true);
+						em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 0, true, true);
 						if (m_params.m_print_stats)
-							em.print("BC7 601 Luma:            ");
-						s.m_bc7_luma_601_psnr = static_cast<float>(em.m_psnr);
-					}
+							em.print(".basis 601 Luma:         ");
+						s.m_basis_luma_601_psnr = static_cast<float>(em.m_psnr);
 
-					if (!m_params.m_uastc)
-					{
-						// ---- Nearly best possible ETC1S stats
-						em.calc(m_slice_images[slice_index], m_best_etc1s_images_unpacked[slice_index], 0, 3);
-						if (m_params.m_print_stats)
-							em.print("Unquantized ETC1S RGB Avg:     ");
-						s.m_best_etc1s_rgb_avg_psnr = static_cast<float>(em.m_psnr);
+						if (m_slice_descs.size() == 1)
+						{
+							const uint32_t output_size = comp_size ? (uint32_t)comp_size : (uint32_t)comp_data.size();
+							if (m_params.m_print_stats)
+							{
+								debug_printf(".basis RGB PSNR per bit/texel*10000: %3.3f\n", 10000.0f * s.m_basis_rgb_avg_psnr / ((output_size * 8.0f) / (slice_desc.m_orig_width * slice_desc.m_orig_height)));
+								debug_printf(".basis Luma 709 PSNR per bit/texel*10000: %3.3f\n", 10000.0f * s.m_basis_luma_709_psnr / ((output_size * 8.0f) / (slice_desc.m_orig_width * slice_desc.m_orig_height)));
+							}
+						}
 
-						em.calc(m_slice_images[slice_index], m_best_etc1s_images_unpacked[slice_index], 0, 0);
-						if (m_params.m_print_stats)
-							em.print("Unquantized ETC1S 709 Luma:    ");
-						s.m_best_etc1s_luma_709_psnr = static_cast<float>(em.m_psnr);
-						s.m_best_etc1s_luma_709_ssim = static_cast<float>(em.m_ssim);
+						if (m_decoded_output_textures_unpacked_bc7[slice_index].get_width())
+						{
+							// ---- BC7 stats
+							em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 3);
+							//if (m_params.m_print_stats)
+							//	em.print("BC7 RGB Avg:             ");
+							s.m_bc7_rgb_avg_psnr = (float)em.m_psnr;
+
+							em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 4);
+							//if (m_params.m_print_stats)
+							//	em.print("BC7 RGBA Avg:            ");
+							s.m_bc7_rgba_avg_psnr = (float)em.m_psnr;
+
+							em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 1);
+							//if (m_params.m_print_stats)
+							//	em.print("BC7 R   Avg:             ");
+
+							em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 1, 1);
+							//if (m_params.m_print_stats)
+							//	em.print("BC7 G   Avg:             ");
+
+							em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 2, 1);
+							//if (m_params.m_print_stats)
+							//	em.print("BC7 B   Avg:             ");
+
+							if (m_params.m_uastc)
+							{
+								em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 3, 1);
+								//if (m_params.m_print_stats)
+								//	em.print("BC7 A   Avg:             ");
 
-						em.calc(m_slice_images[slice_index], m_best_etc1s_images_unpacked[slice_index], 0, 0, true, true);
-						if (m_params.m_print_stats)
-							em.print("Unquantized ETC1S 601 Luma:    ");
-						s.m_best_etc1s_luma_601_psnr = static_cast<float>(em.m_psnr);
+								s.m_bc7_a_avg_psnr = (float)em.m_psnr;
+							}
+
+							em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 0);
+							//if (m_params.m_print_stats)
+							//	em.print("BC7 709 Luma:            ");
+							s.m_bc7_luma_709_psnr = static_cast<float>(em.m_psnr);
+							s.m_bc7_luma_709_ssim = static_cast<float>(em.m_ssim);
+
+							em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 0, true, true);
+							//if (m_params.m_print_stats)
+							//	em.print("BC7 601 Luma:            ");
+							s.m_bc7_luma_601_psnr = static_cast<float>(em.m_psnr);
+						}
+
+						if (!m_params.m_uastc)
+						{
+							// ---- Nearly best possible ETC1S stats
+							em.calc(m_slice_images[slice_index], m_best_etc1s_images_unpacked[slice_index], 0, 3);
+							//if (m_params.m_print_stats)
+							//	em.print("Unquantized ETC1S RGB Avg:     ");
+							s.m_best_etc1s_rgb_avg_psnr = static_cast<float>(em.m_psnr);
+
+							em.calc(m_slice_images[slice_index], m_best_etc1s_images_unpacked[slice_index], 0, 0);
+							//if (m_params.m_print_stats)
+							//	em.print("Unquantized ETC1S 709 Luma:    ");
+							s.m_best_etc1s_luma_709_psnr = static_cast<float>(em.m_psnr);
+							s.m_best_etc1s_luma_709_ssim = static_cast<float>(em.m_ssim);
+
+							em.calc(m_slice_images[slice_index], m_best_etc1s_images_unpacked[slice_index], 0, 0, true, true);
+							//if (m_params.m_print_stats)
+							//	em.print("Unquantized ETC1S 601 Luma:    ");
+							s.m_best_etc1s_luma_601_psnr = static_cast<float>(em.m_psnr);
+						}
 					}
-				}
 
-				std::string out_basename;
-				if (m_params.m_out_filename.size())
-					string_get_filename(m_params.m_out_filename.c_str(), out_basename);
-				else if (m_params.m_source_filenames.size())
-					string_get_filename(m_params.m_source_filenames[slice_desc.m_source_file_index].c_str(), out_basename);
+					std::string out_basename;
+					if (m_params.m_out_filename.size())
+						string_get_filename(m_params.m_out_filename.c_str(), out_basename);
+					else if (m_params.m_source_filenames.size())
+						string_get_filename(m_params.m_source_filenames[slice_desc.m_source_file_index].c_str(), out_basename);
 
-				string_remove_extension(out_basename);
-				out_basename = "basis_debug_" + out_basename + string_format("_slice_%u", slice_index);
+					string_remove_extension(out_basename);
+					out_basename = "basis_debug_" + out_basename + string_format("_slice_%u", slice_index);
 
-				if ((!m_params.m_uastc) && (m_frontend.get_params().m_debug_images))
-				{
-					// Write "best" ETC1S debug images
-					if (!m_params.m_uastc)
+					if ((!m_params.m_uastc) && (m_frontend.get_params().m_debug_images))
 					{
-						gpu_image best_etc1s_gpu_image(m_best_etc1s_images[slice_index]);
-						best_etc1s_gpu_image.override_dimensions(slice_desc.m_orig_width, slice_desc.m_orig_height);
-						write_compressed_texture_file((out_basename + "_best_etc1s.ktx").c_str(), best_etc1s_gpu_image);
+						// Write "best" ETC1S debug images
+						if (!m_params.m_uastc)
+						{
+							gpu_image best_etc1s_gpu_image(m_best_etc1s_images[slice_index]);
+							best_etc1s_gpu_image.override_dimensions(slice_desc.m_orig_width, slice_desc.m_orig_height);
+							write_compressed_texture_file((out_basename + "_best_etc1s.ktx").c_str(), best_etc1s_gpu_image, true);
 
-						image best_etc1s_unpacked;
-						best_etc1s_gpu_image.unpack(best_etc1s_unpacked);
-						save_png(out_basename + "_best_etc1s.png", best_etc1s_unpacked);
+							image best_etc1s_unpacked;
+							best_etc1s_gpu_image.unpack(best_etc1s_unpacked);
+							save_png(out_basename + "_best_etc1s.png", best_etc1s_unpacked);
+						}
 					}
-				}
 
-				if (m_params.m_debug_images)
-				{
-					// Write decoded ETC1S/ASTC debug images
+					if (m_params.m_debug_images)
 					{
-						gpu_image decoded_etc1s_or_astc(m_decoded_output_textures[slice_index]);
-						decoded_etc1s_or_astc.override_dimensions(slice_desc.m_orig_width, slice_desc.m_orig_height);
-						write_compressed_texture_file((out_basename + "_transcoded_etc1s_or_astc.ktx").c_str(), decoded_etc1s_or_astc);
+						// Write decoded ETC1S/ASTC debug images
+						{
+							gpu_image decoded_etc1s_or_astc(m_decoded_output_textures[slice_index]);
+							decoded_etc1s_or_astc.override_dimensions(slice_desc.m_orig_width, slice_desc.m_orig_height);
+							write_compressed_texture_file((out_basename + "_transcoded_etc1s_or_astc.ktx").c_str(), decoded_etc1s_or_astc, true);
 
-						image temp(m_decoded_output_textures_unpacked[slice_index]);
-						temp.crop(slice_desc.m_orig_width, slice_desc.m_orig_height);
-						save_png(out_basename + "_transcoded_etc1s_or_astc.png", temp);
-					}
+							image temp(m_decoded_output_textures_unpacked[slice_index]);
+							temp.crop(slice_desc.m_orig_width, slice_desc.m_orig_height);
+							save_png(out_basename + "_transcoded_etc1s_or_astc.png", temp);
+						}
 
-					// Write decoded BC7 debug images
-					if (m_decoded_output_textures_bc7[slice_index].get_pixel_width())
-					{
-						gpu_image decoded_bc7(m_decoded_output_textures_bc7[slice_index]);
-						decoded_bc7.override_dimensions(slice_desc.m_orig_width, slice_desc.m_orig_height);
-						write_compressed_texture_file((out_basename + "_transcoded_bc7.ktx").c_str(), decoded_bc7);
+						// Write decoded BC7 debug images
+						if (m_decoded_output_textures_bc7[slice_index].get_pixel_width())
+						{
+							gpu_image decoded_bc7(m_decoded_output_textures_bc7[slice_index]);
+							decoded_bc7.override_dimensions(slice_desc.m_orig_width, slice_desc.m_orig_height);
+							write_compressed_texture_file((out_basename + "_transcoded_bc7.ktx").c_str(), decoded_bc7, true);
 
-						image temp(m_decoded_output_textures_unpacked_bc7[slice_index]);
-						temp.crop(slice_desc.m_orig_width, slice_desc.m_orig_height);
-						save_png(out_basename + "_transcoded_bc7.png", temp);
+							image temp(m_decoded_output_textures_unpacked_bc7[slice_index]);
+							temp.crop(slice_desc.m_orig_width, slice_desc.m_orig_height);
+							save_png(out_basename + "_transcoded_bc7.png", temp);
+						}
 					}
 				}
-			}
+			} // if (m_params.m_hdr)
+
 		} // if (m_params.m_validate_output_data)
 				
 		return true;
@@ -1727,10 +2962,27 @@ namespace basisu
 	}
 
 	static uint8_t g_ktx2_etc1s_nonalpha_dfd[44] = { 0x2C,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x2,0x0,0x28,0x0,0xA3,0x1,0x2,0x0,0x3,0x3,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x3F,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xFF,0xFF,0xFF,0xFF };
-	static uint8_t g_ktx2_etc1s_alpha_dfd[60] = { 0x3C,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x2,0x0,0x38,0x0,0xA3,0x1,0x2,0x0,0x3,0x3,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x3F,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xFF,0xFF,0xFF,0xFF,0x40,0x0,0x3F,0xF,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xFF,0xFF,0xFF,0xFF };
+	static uint8_t g_ktx2_etc1s_alpha_dfd[60]    = { 0x3C,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x2,0x0,0x38,0x0,0xA3,0x1,0x2,0x0,0x3,0x3,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x3F,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xFF,0xFF,0xFF,0xFF,0x40,0x0,0x3F,0xF,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xFF,0xFF,0xFF,0xFF };
+	
 	static uint8_t g_ktx2_uastc_nonalpha_dfd[44] = { 0x2C,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x2,0x0,0x28,0x0,0xA6,0x1,0x2,0x0,0x3,0x3,0x0,0x0,0x10,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x7F,0x4,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xFF,0xFF,0xFF,0xFF };
-	static uint8_t g_ktx2_uastc_alpha_dfd[44] = { 0x2C,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x2,0x0,0x28,0x0,0xA6,0x1,0x2,0x0,0x3,0x3,0x0,0x0,0x10,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x7F,0x3,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xFF,0xFF,0xFF,0xFF };
-		
+	static uint8_t g_ktx2_uastc_alpha_dfd[44]    = { 0x2C,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x2,0x0,0x28,0x0,0xA6,0x1,0x2,0x0,0x3,0x3,0x0,0x0,0x10,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x7F,0x3,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xFF,0xFF,0xFF,0xFF };
+
+	// HDR TODO - what is the best Khronos DFD to use for UASTC HDR?
+	static uint8_t g_ktx2_uastc_hdr_nonalpha_dfd[44] = 
+	{
+		0x2C,0x0,0x0,0x0,		// 0 totalSize (0x2C = 44, the size of this array)
+		0x0,0x0,0x0,0x0,		// 1 descriptorType/vendorId
+		0x2,0x0,0x28,0x0,		// 2 versionNumber (2), then descriptorBlockSize (0x28 = 40), in byte order
+		0xA7,0x1,0x1,0x0,		// 3 colorModel, colorPrimaries, transferFunction, flags, in byte order (get_dfd() rewrites the transferFunction byte)
+		0x3,0x3,0x0,0x0,		// 4 texelBlockDimension0-texelBlockDimension3
+		0x10,0x0,0x0,0x0,		// 5 bytesPlane0-bytesPlane3
+		0x0,0x0,0x0,0x0,		// 6 bytesPlane4-bytesPlane7
+		0x0,0x0,0x7F,0x80,		// 7 bitOffset (16 bits), bitLength (0x7F), then channelType and Qualifier flags (0x80 = KHR_DF_SAMPLE_DATATYPE_FLOAT)
+		0x0,0x0,0x0,0x0,		// 8 samplePosition0-samplePosition3
+		0x0,0x0,0x0,0x0,		// 9 sampleLower (0.0)
+		0x00, 0x00, 0x80, 0x3F  // 10 sampleUpper (1.0f: little-endian IEEE-754 bits 0x3F800000)
+	};
+			
 	void basis_compressor::get_dfd(uint8_vec &dfd, const basist::ktx2_header &header)
 	{
 		const uint8_t* pDFD;
@@ -1738,7 +2990,12 @@ namespace basisu
 
 		if (m_params.m_uastc)
 		{
-			if (m_any_source_image_has_alpha)
+			if (m_params.m_hdr)
+			{
+				pDFD = g_ktx2_uastc_hdr_nonalpha_dfd;
+				dfd_len = sizeof(g_ktx2_uastc_hdr_nonalpha_dfd);
+			}
+			else if (m_any_source_image_has_alpha)
 			{
 				pDFD = g_ktx2_uastc_alpha_dfd;
 				dfd_len = sizeof(g_ktx2_uastc_alpha_dfd);
@@ -1772,10 +3029,18 @@ namespace basisu
 		
 		dfd_bits &= ~(0xFF << 16);
 
-		if (m_params.m_ktx2_srgb_transfer_func)
-			dfd_bits |= (basist::KTX2_KHR_DF_TRANSFER_SRGB << 16);
-		else
+		if (m_params.m_hdr)
+		{
+			// TODO: In HDR mode, always write linear for now.
 			dfd_bits |= (basist::KTX2_KHR_DF_TRANSFER_LINEAR << 16);
+		}
+		else
+		{
+			if (m_params.m_ktx2_srgb_transfer_func)
+				dfd_bits |= (basist::KTX2_KHR_DF_TRANSFER_SRGB << 16);
+			else
+				dfd_bits |= (basist::KTX2_KHR_DF_TRANSFER_LINEAR << 16);
+		}
 
 		basisu::write_le_dword(dfd.data() + 3 * sizeof(uint32_t), dfd_bits);
 
@@ -1850,7 +3115,12 @@ namespace basisu
 		header.m_pixel_width = base_width;
 		header.m_pixel_height = base_height;
 		header.m_face_count = total_faces;
-		header.m_vk_format = basist::KTX2_VK_FORMAT_UNDEFINED;
+		
+		if (m_params.m_hdr)
+			header.m_vk_format = basist::KTX2_FORMAT_UASTC_4x4_SFLOAT_BLOCK;
+		else
+			header.m_vk_format = basist::KTX2_VK_FORMAT_UNDEFINED;
+
 		header.m_type_size = 1;
 		header.m_level_count = total_levels;
 		header.m_layer_count = (total_layers > 1) ? total_layers : 0;
@@ -2061,7 +3331,8 @@ namespace basisu
 			if (bytes_needed_to_pad < 6)
 				bytes_needed_to_pad += 16;
 
-			printf("WARNING: Due to a KTX2 validator bug related to mipPadding, we must insert a dummy key into the KTX2 file of %u bytes\n", bytes_needed_to_pad);
+			// Just add the padding. It's likely not necessary anymore, but can't really hurt.
+			//printf("WARNING: Due to a KTX2 validator bug related to mipPadding, we must insert a dummy key into the KTX2 file of %u bytes\n", bytes_needed_to_pad);
 			
 			// We're not good - need to add a dummy key large enough to force file alignment so the mip level array gets aligned. 
 			// We can't just add some bytes before the mip level array because ktx2check will see that as extra data in the file that shouldn't be there in ktxValidator::validateDataSize().
@@ -2258,18 +3529,34 @@ namespace basisu
 		return result;
 	}
 
-	void* basis_compress(
-		const basisu::vector<image>& source_images,
+	static void* basis_compress(
+		const basisu::vector<image> *pSource_images,
+		const basisu::vector<imagef> *pSource_images_hdr,
 		uint32_t flags_and_quality, float uastc_rdo_quality,
 		size_t* pSize,
 		image_stats* pStats)
 	{
+		assert((pSource_images != nullptr) || (pSource_images_hdr != nullptr));
+		assert(!((pSource_images != nullptr) && (pSource_images_hdr != nullptr)));
+		
 		// Check input parameters
-		if ((!source_images.size()) || (!pSize))
+		if (pSource_images)
 		{
-			error_printf("basis_compress: Invalid parameter\n");
-			assert(0);
-			return nullptr;
+			if ((!pSource_images->size()) || (!pSize))
+			{
+				error_printf("basis_compress: Invalid parameter\n");
+				assert(0);
+				return nullptr;
+			}
+		}
+		else
+		{
+			if ((!pSource_images_hdr->size()) || (!pSize))
+			{
+				error_printf("basis_compress: Invalid parameter\n");
+				assert(0);
+				return nullptr;
+			}
 		}
 
 		*pSize = 0;
@@ -2287,40 +3574,70 @@ namespace basisu
 
 		comp_params.m_y_flip = (flags_and_quality & cFlagYFlip) != 0;
 		comp_params.m_debug = (flags_and_quality & cFlagDebug) != 0;
-
+		comp_params.m_debug_images = (flags_and_quality & cFlagDebugImages) != 0;
+		
 		// Copy the largest mipmap level
-		comp_params.m_source_images.resize(1);
-		comp_params.m_source_images[0] = source_images[0];
+		if (pSource_images)
+		{
+			comp_params.m_source_images.resize(1);
+			comp_params.m_source_images[0] = (*pSource_images)[0];
+
+			// Copy the smaller mipmap levels, if any
+			if (pSource_images->size() > 1)
+			{
+				comp_params.m_source_mipmap_images.resize(1);
+				comp_params.m_source_mipmap_images[0].resize(pSource_images->size() - 1);
 
-		// Copy the smaller mipmap levels, if any
-		if (source_images.size() > 1)
+				for (uint32_t i = 1; i < pSource_images->size(); i++)
+					comp_params.m_source_mipmap_images[0][i - 1] = (*pSource_images)[i];
+			}
+		}
+		else
 		{
-			comp_params.m_source_mipmap_images.resize(1);
-			comp_params.m_source_mipmap_images[0].resize(source_images.size() - 1);
+			comp_params.m_source_images_hdr.resize(1);
+			comp_params.m_source_images_hdr[0] = (*pSource_images_hdr)[0];
 
-			for (uint32_t i = 1; i < source_images.size(); i++)
-				comp_params.m_source_mipmap_images[0][i - 1] = source_images[i];
+			// Copy the smaller mipmap levels, if any (pSource_images is null in this branch, so every bound must come from the HDR vector)
+			if (pSource_images_hdr->size() > 1)
+			{
+				comp_params.m_source_mipmap_images_hdr.resize(1);
+				comp_params.m_source_mipmap_images_hdr[0].resize(pSource_images_hdr->size() - 1);
+
+				for (uint32_t i = 1; i < pSource_images_hdr->size(); i++)
+					comp_params.m_source_mipmap_images_hdr[0][i - 1] = (*pSource_images_hdr)[i];
+			}
 		}
 				
 		comp_params.m_multithreading = (flags_and_quality & cFlagThreaded) != 0;
 		comp_params.m_use_opencl = (flags_and_quality & cFlagUseOpenCL) != 0;
 
-		comp_params.m_write_output_basis_files = false;
+		comp_params.m_write_output_basis_or_ktx2_files = false;
 
 		comp_params.m_perceptual = (flags_and_quality & cFlagSRGB) != 0;
 		comp_params.m_mip_srgb = comp_params.m_perceptual;
 		comp_params.m_mip_gen = (flags_and_quality & (cFlagGenMipsWrap | cFlagGenMipsClamp)) != 0;
 		comp_params.m_mip_wrapping = (flags_and_quality & cFlagGenMipsWrap) != 0;
 
-		comp_params.m_uastc = (flags_and_quality & cFlagUASTC) != 0;
-		if (comp_params.m_uastc)
+		if ((pSource_images_hdr) || (flags_and_quality & cFlagHDR))
 		{
-			comp_params.m_pack_uastc_flags = flags_and_quality & cPackUASTCLevelMask;
-			comp_params.m_rdo_uastc = (flags_and_quality & cFlagUASTCRDO) != 0;
-			comp_params.m_rdo_uastc_quality_scalar = uastc_rdo_quality;
+			// In UASTC HDR mode, the compressor will jam this to true anyway.
+			// And there's no need to set UASTC LDR or ETC1S options.
+			comp_params.m_uastc = true;
 		}
 		else
-			comp_params.m_quality_level = basisu::maximum<uint32_t>(1, flags_and_quality & 255);
+		{
+			comp_params.m_uastc = (flags_and_quality & cFlagUASTC) != 0;
+			if (comp_params.m_uastc)
+			{
+				comp_params.m_pack_uastc_flags = flags_and_quality & cPackUASTCLevelMask;
+				comp_params.m_rdo_uastc = (flags_and_quality & cFlagUASTCRDO) != 0;
+				comp_params.m_rdo_uastc_quality_scalar = uastc_rdo_quality;
+			}
+			else
+			{
+				comp_params.m_quality_level = basisu::maximum<uint32_t>(1, flags_and_quality & 255);
+			}
+		}
 				
 		comp_params.m_create_ktx2_file = (flags_and_quality & cFlagKTX2) != 0;
 						
@@ -2337,6 +3654,15 @@ namespace basisu
 		comp_params.m_print_stats = (flags_and_quality & cFlagPrintStats) != 0;
 		comp_params.m_status_output = (flags_and_quality & cFlagPrintStatus) != 0;
 
+		if ((flags_and_quality & cFlagHDR) || (pSource_images_hdr))
+		{
+			comp_params.m_hdr = true;
+			comp_params.m_uastc_hdr_options.set_quality_level(flags_and_quality & cPackUASTCLevelMask);
+		}
+
+		if (flags_and_quality & cFlagHDRLDRImageSRGBToLinearConversion)
+			comp_params.m_hdr_ldr_srgb_to_linear_conversion = true;
+
 		// Create the compressor, initialize it, and process the input
 		basis_compressor comp;
 		if (!comp.init(comp_params))
@@ -2380,6 +3706,24 @@ namespace basisu
 		return pFile_data;
 	}
 
+	void* basis_compress( // LDR vector overload: thin wrapper over the pointer-based basis_compress().
+		const basisu::vector<image>& source_images,
+		uint32_t flags_and_quality, float uastc_rdo_quality,
+		size_t* pSize,
+		image_stats* pStats)
+	{
+		return basis_compress(&source_images, nullptr, flags_and_quality, uastc_rdo_quality, pSize, pStats); // nullptr: no HDR source image list
+	}
+
+	void* basis_compress( // HDR vector overload: thin wrapper over the pointer-based basis_compress().
+		const basisu::vector<imagef>& source_images_hdr,
+		uint32_t flags_and_quality,
+		size_t* pSize,
+		image_stats* pStats)
+	{
+		return basis_compress(nullptr, &source_images_hdr, flags_and_quality, 0.0f, pSize, pStats); // nullptr: no LDR image list; 0.0f is passed as uastc_rdo_quality
+	}
+
 	void* basis_compress(
 		const uint8_t* pImageRGBA, uint32_t width, uint32_t height, uint32_t pitch_in_pixels,
 		uint32_t flags_and_quality, float uastc_rdo_quality,
diff --git a/thirdparty/basis_universal/encoder/basisu_comp.h b/thirdparty/basis_universal/encoder/basisu_comp.h
index b6c9fef9e251..1cc75fc8a385 100644
--- a/thirdparty/basis_universal/encoder/basisu_comp.h
+++ b/thirdparty/basis_universal/encoder/basisu_comp.h
@@ -1,5 +1,5 @@
 // basisu_comp.h
-// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -18,9 +18,10 @@
 #include "basisu_basis_file.h"
 #include "../transcoder/basisu_transcoder.h"
 #include "basisu_uastc_enc.h"
+#include "basisu_astc_hdr_enc.h"
 
-#define BASISU_LIB_VERSION 116
-#define BASISU_LIB_VERSION_STRING "1.16"
+#define BASISU_LIB_VERSION 150
+#define BASISU_LIB_VERSION_STRING "1.50"
 
 #ifndef BASISD_SUPPORT_KTX2
 	#error BASISD_SUPPORT_KTX2 is undefined
@@ -81,6 +82,8 @@ namespace basisu
 			m_basis_luma_601_psnr = 0.0f;
 			m_basis_luma_709_ssim = 0.0f;
 
+			m_basis_rgb_avg_bc6h_psnr = 0.0f;
+
 			m_bc7_rgb_avg_psnr = 0.0f;
 			m_bc7_rgba_avg_psnr = 0.0f;
 			m_bc7_a_avg_psnr = 0.0f;
@@ -100,7 +103,7 @@ namespace basisu
 		uint32_t m_width;
 		uint32_t m_height;
 
-		// .basis compressed (ETC1S or UASTC statistics)
+		// .basis/.ktx2 compressed (LDR: ETC1S or UASTC statistics, HDR: transcoded BC6H statistics)
 		float m_basis_rgb_avg_psnr;
 		float m_basis_rgba_avg_psnr;
 		float m_basis_a_avg_psnr;
@@ -108,7 +111,10 @@ namespace basisu
 		float m_basis_luma_601_psnr;
 		float m_basis_luma_709_ssim;
 
-		// BC7 statistics
+		// UASTC HDR only.
+		float m_basis_rgb_avg_bc6h_psnr;
+
+		// LDR: BC7 statistics
 		float m_bc7_rgb_avg_psnr;
 		float m_bc7_rgba_avg_psnr;
 		float m_bc7_a_avg_psnr;
@@ -116,7 +122,7 @@ namespace basisu
 		float m_bc7_luma_601_psnr;
 		float m_bc7_luma_709_ssim;
 		
-		// Highest achievable quality ETC1S statistics
+		// LDR: Highest achievable quality ETC1S statistics
 		float m_best_etc1s_rgb_avg_psnr;
 		float m_best_etc1s_luma_709_psnr;
 		float m_best_etc1s_luma_601_psnr;
@@ -256,7 +262,7 @@ namespace basisu
 			m_no_selector_rdo.clear();
 			m_selector_rdo_thresh.clear();
 			m_read_source_images.clear();
-			m_write_output_basis_files.clear();
+			m_write_output_basis_or_ktx2_files.clear();
 			m_compression_level.clear();
 			m_compute_stats.clear();
 			m_print_stats.clear();
@@ -317,27 +323,38 @@ namespace basisu
 
 			m_validate_output_data.clear();
 
+			m_hdr_ldr_srgb_to_linear_conversion.clear();
+
+			m_hdr_favor_astc.clear();
+			
 			m_pJob_pool = nullptr;
 		}
 						
-		// True to generate UASTC .basis file data, otherwise ETC1S.
+		// True to generate UASTC .basis/.KTX2 file data, otherwise ETC1S.
 		bool_param<false> m_uastc;
 
+		// Set m_hdr to true to switch to UASTC HDR mode.
+		bool_param<false> m_hdr;
+
 		bool_param<false> m_use_opencl;
 
-		// If m_read_source_images is true, m_source_filenames (and optionally m_source_alpha_filenames) contains the filenames of PNG images to read. 
-		// Otherwise, the compressor processes the images in m_source_images.
+		// If m_read_source_images is true, m_source_filenames (and optionally m_source_alpha_filenames) contains the filenames of PNG etc. images to read. 
+		// Otherwise, the compressor processes the images in m_source_images or m_source_images_hdr.
 		basisu::vector<std::string> m_source_filenames;
 		basisu::vector<std::string> m_source_alpha_filenames;
 		
 		basisu::vector<image> m_source_images;
 		
+		basisu::vector<imagef> m_source_images_hdr;
+				
 		// Stores mipmaps starting from level 1. Level 0 is still stored in m_source_images, as usual.
 		// If m_source_mipmaps isn't empty, automatic mipmap generation isn't done. m_source_mipmaps.size() MUST equal m_source_images.size() or the compressor returns an error.
 		// The compressor applies the user-provided swizzling (in m_swizzle) to these images.
 		basisu::vector< basisu::vector<image> > m_source_mipmap_images;
+
+		basisu::vector< basisu::vector<imagef> > m_source_mipmap_images_hdr;
 						
-		// Filename of the output basis file
+		// Filename of the output basis/ktx2 file
 		std::string m_out_filename;
 
 		// The params are done this way so we can detect when the user has explictly changed them.
@@ -373,8 +390,8 @@ namespace basisu
 		// Read source images from m_source_filenames/m_source_alpha_filenames
 		bool_param<false> m_read_source_images;
 
-		// Write the output basis file to disk using m_out_filename
-		bool_param<false> m_write_output_basis_files;
+		// Write the output basis/ktx2 file to disk using m_out_filename
+		bool_param<false> m_write_output_basis_or_ktx2_files;
 								
 		// Compute and display image metrics 
 		bool_param<false> m_compute_stats;
@@ -382,15 +399,15 @@ namespace basisu
 		// Print stats to stdout, if m_compute_stats is true.
 		bool_param<true> m_print_stats;
 		
-		// Check to see if any input image has an alpha channel, if so then the output basis file will have alpha channels
+		// Check to see if any input image has an alpha channel, if so then the output basis/ktx2 file will have alpha channels
 		bool_param<true> m_check_for_alpha;
 		
-		// Always put alpha slices in the output basis file, even when the input doesn't have alpha
+		// Always put alpha slices in the output basis/ktx2 file, even when the input doesn't have alpha
 		bool_param<false> m_force_alpha; 
 		bool_param<true> m_multithreading;
 		
-		// Split the R channel to RGB and the G channel to alpha, then write a basis file with alpha channels
-		char m_swizzle[4];
+		// Split the R channel to RGB and the G channel to alpha, then write a basis/ktx2 file with alpha channels
+		uint8_t m_swizzle[4];
 
 		bool_param<false> m_renormalize;
 
@@ -448,8 +465,17 @@ namespace basisu
 		param<int> m_ktx2_zstd_supercompression_level;
 		bool_param<false> m_ktx2_srgb_transfer_func;
 
+		astc_hdr_codec_options m_uastc_hdr_options;
+
 		bool_param<false> m_validate_output_data;
 
+		// If true, LDR images (such as PNG) will be converted to normalized [0,1] linear light (via a sRGB->Linear conversion) and then processed as HDR. 
+		// Otherwise, LDR images will be processed as HDR as-is.
+		bool_param<true> m_hdr_ldr_srgb_to_linear_conversion;
+
+		// If true, ASTC HDR quality is favored more than BC6H quality. Otherwise it's a rough balance.
+		bool_param<false> m_hdr_favor_astc;
+						
 		job_pool *m_pJob_pool;
 	};
 
@@ -504,6 +530,7 @@ namespace basisu
 		opencl_context_ptr m_pOpenCL_context;
 		
 		basisu::vector<image> m_slice_images;
+		basisu::vector<imagef> m_slice_images_hdr;
 
 		basisu::vector<image_stats> m_stats;
 
@@ -515,7 +542,9 @@ namespace basisu
 		uint32_t m_total_blocks;
 		
 		basisu_frontend m_frontend;
+
 		pixel_block_vec m_source_blocks;
+		pixel_block_hdr_vec m_source_blocks_hdr;
 
 		basisu::vector<gpu_image> m_frontend_output_textures;
 
@@ -526,11 +555,17 @@ namespace basisu
 
 		basisu_file m_basis_file;
 
-		basisu::vector<gpu_image> m_decoded_output_textures;
+		basisu::vector<gpu_image> m_decoded_output_textures;			// BC6H in HDR mode
 		basisu::vector<image> m_decoded_output_textures_unpacked;
+		
 		basisu::vector<gpu_image> m_decoded_output_textures_bc7;
 		basisu::vector<image> m_decoded_output_textures_unpacked_bc7;
 
+		basisu::vector<imagef> m_decoded_output_textures_bc6h_hdr_unpacked;	// BC6H in HDR mode
+
+		basisu::vector<gpu_image> m_decoded_output_textures_astc_hdr;
+		basisu::vector<imagef> m_decoded_output_textures_astc_hdr_unpacked;
+
 		uint8_vec m_output_basis_file;
 		uint8_vec m_output_ktx2_file;
 		
@@ -541,14 +576,21 @@ namespace basisu
 
 		bool m_opencl_failed;
 
+		void check_for_hdr_inputs();
+		bool sanity_check_input_params();
+		void clean_hdr_image(imagef& src_img);
+		bool read_dds_source_images();
 		bool read_source_images();
 		bool extract_source_blocks();
 		bool process_frontend();
 		bool extract_frontend_texture_data();
 		bool process_backend();
 		bool create_basis_file_and_transcode();
+		bool write_hdr_debug_images(const char* pBasename, const imagef& img, uint32_t width, uint32_t height);
 		bool write_output_files_and_compute_stats();
+		error_code encode_slices_to_uastc_hdr();
 		error_code encode_slices_to_uastc();
+		bool generate_mipmaps(const imagef& img, basisu::vector<imagef>& mips, bool has_alpha);
 		bool generate_mipmaps(const image &img, basisu::vector<image> &mips, bool has_alpha);
 		bool validate_texture_type_constraints();
 		bool validate_ktx2_constraints();
@@ -568,7 +610,8 @@ namespace basisu
 	//   
 	// flags_and_quality: Combination of the above flags logically OR'd with the ETC1S or UASTC level, i.e. "cFlagSRGB | cFlagGenMipsClamp | cFlagThreaded | 128" or "cFlagSRGB | cFlagGenMipsClamp | cFlagUASTC | cFlagThreaded | cPackUASTCLevelDefault".
 	//	  In ETC1S mode, the lower 8-bits are the ETC1S quality level which ranges from [1,255] (higher=better quality/larger files)
-	//	  In UASTC mode, the lower 8-bits are the UASTC pack level (see cPackUASTCLevelFastest, etc.). Fastest/lowest quality is 0, so be sure to set it correctly. 
+	//	  In UASTC mode, the lower 8-bits are the UASTC LDR/HDR pack level (see cPackUASTCLevelFastest, etc.). Fastest/lowest quality is 0, so be sure to set it correctly. Valid values are [0,4] for both LDR/HDR.
+	//	  In UASTC mode, be sure to set this, otherwise it defaults to 0 (fastest/lowest quality).
 	// 
 	// uastc_rdo_quality: Float UASTC RDO quality level (0=no change, higher values lower quality but increase compressibility, initially try .5-1.5)
 	// 
@@ -594,20 +637,36 @@ namespace basisu
 		cFlagUASTCRDO = 1 << 18,		// use RDO postprocessing when generating UASTC files (must set uastc_rdo_quality to the quality scalar)
 		
 		cFlagPrintStats = 1 << 19,	// print image stats to stdout
-		cFlagPrintStatus = 1 << 20	// print status to stdout
+		cFlagPrintStatus = 1 << 20,	// print status to stdout
+		
+		cFlagHDR = 1 << 21,			// Force encoder into HDR mode, even if source image is LDR.
+		cFlagHDRLDRImageSRGBToLinearConversion = 1 << 22, // In HDR mode, convert LDR source images to linear before encoding.
+		
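+		cFlagDebugImages = 1 << 23	// enable debug images (NOTE(review): inferred from the flag name; the previous "status output" text duplicated cFlagPrintStatus - confirm)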
 	};
 
 	// This function accepts an array of source images. 
 	// If more than one image is provided, it's assumed the images form a mipmap pyramid and automatic mipmap generation is disabled.
-	// Returns a pointer to the compressed .basis or .ktx2 file data. *pSize is the size of the compressed data. The returned block must be freed using basis_free_data().
+	// Returns a pointer to the compressed .basis or .ktx2 file data. *pSize is the size of the compressed data. 
+	// Important: The returned block MUST be manually freed using basis_free_data().
 	// basisu_encoder_init() MUST be called first!
+	// LDR version. To compress the LDR source image as HDR: Use the cFlagHDR flag.
 	void* basis_compress(
 		const basisu::vector<image> &source_images,
 		uint32_t flags_and_quality, float uastc_rdo_quality,
 		size_t* pSize,
 		image_stats* pStats = nullptr);
 
-	// This function only accepts a single source image.
+	// HDR-only version.
+	// Important: The returned block MUST be manually freed using basis_free_data().
+	void* basis_compress(
+		const basisu::vector<imagef>& source_images_hdr,
+		uint32_t flags_and_quality, 
+		size_t* pSize,
+		image_stats* pStats = nullptr);
+
+	// This function only accepts a single LDR source image. It's just a wrapper for basis_compress() above.
+	// Important: The returned block MUST be manually freed using basis_free_data().
 	void* basis_compress(
 		const uint8_t* pImageRGBA, uint32_t width, uint32_t height, uint32_t pitch_in_pixels,
 		uint32_t flags_and_quality, float uastc_rdo_quality,
@@ -615,6 +674,7 @@ namespace basisu
 		image_stats* pStats = nullptr);
 
 	// Frees the dynamically allocated file data returned by basis_compress().
+	// This MUST be called on the pointer returned by basis_compress() when you're done with it.
 	void basis_free_data(void* p);
 
 	// Runs a short benchmark using synthetic image data to time OpenCL encoding vs. CPU encoding, with multithreading enabled.
diff --git a/thirdparty/basis_universal/encoder/basisu_enc.cpp b/thirdparty/basis_universal/encoder/basisu_enc.cpp
index e87dd636a2fc..fff98e830148 100644
--- a/thirdparty/basis_universal/encoder/basisu_enc.cpp
+++ b/thirdparty/basis_universal/encoder/basisu_enc.cpp
@@ -1,5 +1,5 @@
 // basisu_enc.cpp
-// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -21,10 +21,20 @@
 #include "jpgd.h"
 #include "pvpngreader.h"
 #include "basisu_opencl.h"
+#include "basisu_astc_hdr_enc.h"
 #include <vector>
 
+#ifndef TINYEXR_USE_ZFP
+#define TINYEXR_USE_ZFP (1)
+#endif
+#include <tinyexr.h>
+
+#ifndef MINIZ_HEADER_FILE_ONLY
 #define MINIZ_HEADER_FILE_ONLY
+#endif
+#ifndef MINIZ_NO_ZLIB_COMPATIBLE_NAMES
 #define MINIZ_NO_ZLIB_COMPATIBLE_NAMES
+#endif
 #include "basisu_miniz.h"
 
 #if defined(_WIN32)
@@ -165,14 +175,14 @@ namespace basisu
 
 	bool g_library_initialized;
 	std::mutex g_encoder_init_mutex;
-
+		
 	// Encoder library initialization (just call once at startup)
-	void basisu_encoder_init(bool use_opencl, bool opencl_force_serialization)
+	bool basisu_encoder_init(bool use_opencl, bool opencl_force_serialization)
 	{
 		std::lock_guard<std::mutex> lock(g_encoder_init_mutex);
 
 		if (g_library_initialized)
-			return;
+			return true;
 
 		detect_sse41();
 
@@ -189,7 +199,11 @@ namespace basisu
 
 		interval_timer::init(); // make sure interval_timer globals are initialized from main thread to avoid TSAN reports
 
+		astc_hdr_enc_init();
+		basist::bc6h_enc_init();
+
 		g_library_initialized = true;
+		return true;
 	}
 
 	void basisu_encoder_deinit()
@@ -316,6 +330,24 @@ namespace basisu
 			init();
 		return ticks * g_timer_freq;
 	}
+
+	float linear_to_srgb(float l) // Maps a linear-light value to its sRGB-encoded value; input is asserted to lie in [0,1].
+	{
+		assert(l >= 0.0f && l <= 1.0f);
+		if (l < .0031308f)
+			return saturate(l * 12.92f); // linear segment below the .0031308 threshold
+		else
+			return saturate(1.055f * powf(l, 1.0f / 2.4f) - .055f); // power-curve segment (exponent 1/2.4)
+	}
+
+	float srgb_to_linear(float s) // Maps an sRGB-encoded value to linear light; input is asserted to lie in [0,1].
+	{
+		assert(s >= 0.0f && s <= 1.0f);
+		if (s < .04045f)
+			return saturate(s * (1.0f / 12.92f)); // linear segment below the .04045 threshold
+		else
+			return saturate(powf((s + .055f) * (1.0f / 1.055f), 2.4f)); // power-curve segment (exponent 2.4)
+	}
 		
 	const uint32_t MAX_32BIT_ALLOC_SIZE = 250000000;
 		
@@ -336,7 +368,7 @@ namespace basisu
 
 		if (sizeof(void *) == sizeof(uint32_t))
 		{
-			if ((w * h * n_chans) > MAX_32BIT_ALLOC_SIZE)
+			if (((uint64_t)w * h * n_chans) > MAX_32BIT_ALLOC_SIZE)
 			{
 				error_printf("Image \"%s\" is too large (%ux%u) to process in a 32-bit build!\n", pFilename, w, h);
 
@@ -371,6 +403,11 @@ namespace basisu
 		return true;
 	}
 
+	bool load_qoi(const char* pFilename, image& img) // Stub: QOI decoding is not implemented in this function.
+	{
+		return false; // always fails; neither pFilename nor img is touched
+	}
+
 	bool load_png(const uint8_t *pBuf, size_t buf_size, image &img, const char *pFilename)
 	{
 		interval_timer tm;
@@ -433,11 +470,178 @@ namespace basisu
 			return load_png(pFilename, img);
 		if (strcasecmp(pExt, "tga") == 0)
 			return load_tga(pFilename, img);
+		if (strcasecmp(pExt, "qoi") == 0)
+			return load_qoi(pFilename, img);
 		if ( (strcasecmp(pExt, "jpg") == 0) || (strcasecmp(pExt, "jfif") == 0) || (strcasecmp(pExt, "jpeg") == 0) )
 			return load_jpg(pFilename, img);
 
 		return false;
 	}
+
+	static void convert_ldr_to_hdr_image(imagef &img, const image &ldr_img, bool ldr_srgb_to_linear) // Expands an 8-bit-per-channel image into a float image with channels scaled by 1/255.
+	{
+		img.resize(ldr_img.get_width(), ldr_img.get_height()); // output takes the source dimensions
+
+		for (uint32_t y = 0; y < ldr_img.get_height(); y++)
+		{
+			for (uint32_t x = 0; x < ldr_img.get_width(); x++)
+			{
+				const color_rgba& c = ldr_img(x, y);
+
+				vec4F& d = img(x, y);
+				if (ldr_srgb_to_linear)
+				{
+					// TODO: Multiply by 100-200 nits?
+					d[0] = srgb_to_linear(c[0] * (1.0f / 255.0f));
+					d[1] = srgb_to_linear(c[1] * (1.0f / 255.0f));
+					d[2] = srgb_to_linear(c[2] * (1.0f / 255.0f));
+				}
+				else
+				{
+					d[0] = c[0] * (1.0f / 255.0f); // no transfer-function change: values are only rescaled
+					d[1] = c[1] * (1.0f / 255.0f);
+					d[2] = c[2] * (1.0f / 255.0f);
+				}
+				d[3] = c[3] * (1.0f / 255.0f); // channel 3 is never passed through srgb_to_linear()
+			}
+		}
+	}
+
+	bool load_image_hdr(const void* pMem, size_t mem_size, imagef& img, uint32_t width, uint32_t height, hdr_image_type img_type, bool ldr_srgb_to_linear)
+	{ // Decodes the in-memory image (pMem, mem_size) into img according to img_type; returns false on a null/empty buffer, size mismatch or decode failure.
+		if ((!pMem) || (!mem_size))
+		{
+			assert(0);
+			return false;
+		}
+
+		switch (img_type)
+		{
+		case hdr_image_type::cHITRGBAHalfFloat: // raw RGBA half floats; width/height are caller-supplied
+		{
+			if (mem_size != (uint64_t)width * height * sizeof(basist::half_float) * 4) // 64-bit product: a wrapped 32-bit width*height could falsely match mem_size
+			{
+				assert(0);
+				return false;
+			}
+
+			if ((!width) || (!height))
+			{
+				assert(0);
+				return false;
+			}
+
+			const basist::half_float* pSrc_image_h = static_cast<const basist::half_float *>(pMem);
+
+			img.resize(width, height);
+			for (uint32_t y = 0; y < height; y++)
+			{
+				for (uint32_t x = 0; x < width; x++)
+				{
+					const basist::half_float* pSrc_pixel = &pSrc_image_h[x * 4];
+
+					vec4F& dst = img(x, y);
+					dst[0] = basist::half_to_float(pSrc_pixel[0]);
+					dst[1] = basist::half_to_float(pSrc_pixel[1]);
+					dst[2] = basist::half_to_float(pSrc_pixel[2]);
+					dst[3] = basist::half_to_float(pSrc_pixel[3]);
+				}
+			
+				pSrc_image_h += (width * 4); // advance to the next tightly packed source row
+			}
+
+			break;
+		}
+		case hdr_image_type::cHITRGBAFloat: // raw RGBA 32-bit floats; width/height are caller-supplied
+		{
+			if (mem_size != (uint64_t)width * height * sizeof(float) * 4) // 64-bit product, same reason as the half-float case
+			{
+				assert(0);
+				return false;
+			}
+
+			if ((!width) || (!height))
+			{
+				assert(0);
+				return false;
+			}
+
+			img.resize(width, height);
+			memcpy(img.get_ptr(), pMem, mem_size); // mem_size was verified above to equal width * height * 4 floats
+
+			break;
+		}
+		case hdr_image_type::cHITPNGImage: // PNG file data: decoded as LDR, then expanded to float
+		{
+			image ldr_img;
+			if (!load_png(static_cast<const uint8_t *>(pMem), mem_size, ldr_img))
+				return false;
+
+			convert_ldr_to_hdr_image(img, ldr_img, ldr_srgb_to_linear);
+			break;
+		}
+		case hdr_image_type::cHITEXRImage: // EXR file data
+		{
+			if (!read_exr(pMem, mem_size, img))
+				return false;
+
+			break;
+		}
+		case hdr_image_type::cHITHDRImage: // .hdr (RGBE) file data
+		{
+			uint8_vec buf(mem_size); // read_rgbe() is handed a uint8_vec here, so the input bytes are copied first
+			memcpy(buf.get_ptr(), pMem, mem_size);
+
+			rgbe_header_info hdr;
+			if (!read_rgbe(buf, img, hdr))
+				return false;
+
+			break;
+		}
+		default:
+			assert(0);
+			return false;
+		}
+
+		return true;
+	}
+	
+	bool load_image_hdr(const char* pFilename, imagef& img, bool ldr_srgb_to_linear)
+	{ // Loads pFilename into img, picking the reader from the file extension (compared case-insensitively).
+		std::string ext(string_get_extension(std::string(pFilename)));
+
+		if (ext.length() == 0)
+			return false; // no extension to dispatch on
+
+		const char* pExt = ext.c_str();
+
+		if (strcasecmp(pExt, "hdr") == 0)
+		{
+			rgbe_header_info rgbe_info; // passed to read_rgbe() but not returned to the caller
+			if (!read_rgbe(pFilename, img, rgbe_info))
+				return false;
+			return true;
+		}
+					
+		if (strcasecmp(pExt, "exr") == 0)
+		{
+			int n_chans = 0; // passed to read_exr(); not used afterwards
+			if (!read_exr(pFilename, img, n_chans))
+				return false;
+			return true;
+		}
+
+		// Try loading image as LDR, then optionally convert to linear light.
+		{
+			image ldr_img;
+			if (!load_image(pFilename, ldr_img))
+				return false;
+
+			convert_ldr_to_hdr_image(img, ldr_img, ldr_srgb_to_linear); // ldr_srgb_to_linear only matters on this LDR fallback path
+		}
+
+		return true;
+	}
 	
 	bool save_png(const char* pFilename, const image &img, uint32_t image_save_flags, uint32_t grayscale_comp)
 	{
@@ -559,6 +763,45 @@ namespace basisu
 		return true;
 	}
 
+	bool read_file_to_data(const char* pFilename, void *pData, size_t len) // Reads exactly the first len bytes of pFilename into pData; false on any failure.
+	{
+		assert(pData && len);
+		if ((!pData) || (!len))
+			return false; // same rejection in builds where assert() compiles out
+
+		FILE* pFile = nullptr;
+#ifdef _WIN32
+		fopen_s(&pFile, pFilename, "rb");
+#else
+		pFile = fopen(pFilename, "rb");
+#endif
+		if (!pFile)
+			return false;
+
+		fseek(pFile, 0, SEEK_END); // seek to the end so the tell call below yields the file size
+#ifdef _WIN32
+		int64_t filesize = _ftelli64(pFile);
+#else
+		int64_t filesize = ftello(pFile);
+#endif
+
+		if ((filesize < 0) || ((size_t)filesize < len)) // tell failed, or the file holds fewer than len bytes
+		{
+			fclose(pFile);
+			return false;
+		}
+		fseek(pFile, 0, SEEK_SET);
+				
+		if (fread(pData, 1, (size_t)len, pFile) != (size_t)len) // a short read is a failure; bytes past len are never read
+		{
+			fclose(pFile);
+			return false;
+		}
+
+		fclose(pFile);
+		return true;
+	}
+
 	bool write_data_to_file(const char* pFilename, const void* pData, size_t len)
 	{
 		FILE* pFile = nullptr;
@@ -581,25 +824,7 @@ namespace basisu
 
 		return fclose(pFile) != EOF;
 	}
-
-	float linear_to_srgb(float l)
-	{
-		assert(l >= 0.0f && l <= 1.0f);
-		if (l < .0031308f)
-			return saturate(l * 12.92f);
-		else
-			return saturate(1.055f * powf(l, 1.0f/2.4f) - .055f);
-	}
-
-	float srgb_to_linear(float s)
-	{
-		assert(s >= 0.0f && s <= 1.0f);
-		if (s < .04045f)
-			return saturate(s * (1.0f/12.92f));
-		else
-			return saturate(powf((s + .055f) * (1.0f/1.055f), 2.4f));
-	}
-
+		
 	bool image_resample(const image &src, image &dst, bool srgb,
 		const char *pFilter, float filter_scale, 
 		bool wrapping,
@@ -747,67 +972,182 @@ namespace basisu
 		return true;
 	}
 
-	void canonical_huffman_calculate_minimum_redundancy(sym_freq *A, int num_syms)
+	bool image_resample(const imagef& src, imagef& dst, // Float-image resampler: fills dst (at dst's current size) from src for components [first_comp, first_comp + num_comps).
+		const char* pFilter, float filter_scale,
+		bool wrapping,
+		uint32_t first_comp, uint32_t num_comps)
 	{
-		// See the paper "In-Place Calculation of Minimum Redundancy Codes" by Moffat and Katajainen
-		if (!num_syms)
-			return;
+		assert((first_comp + num_comps) <= 4);
 
-		if (1 == num_syms)
+		const int cMaxComps = 4;
+
+		const uint32_t src_w = src.get_width(), src_h = src.get_height();
+		const uint32_t dst_w = dst.get_width(), dst_h = dst.get_height(); // the target size is taken from dst, not passed in
+
+		if (maximum(src_w, src_h) > BASISU_RESAMPLER_MAX_DIMENSION)
 		{
-			A[0].m_key = 1;
-			return;
+			printf("Image is too large!\n");
+			return false;
 		}
-		
-		A[0].m_key += A[1].m_key;
-		
-		int s = 2, r = 0, next;
-		for (next = 1; next < (num_syms - 1); ++next)
-		{
-			if ((s >= num_syms) || (A[r].m_key < A[s].m_key))
-			{
-				A[next].m_key = A[r].m_key;
-				A[r].m_key = next;
-				++r;
-			}
-			else
-			{
-				A[next].m_key = A[s].m_key;
-				++s;
-			}
 
-			if ((s >= num_syms) || ((r < next) && A[r].m_key < A[s].m_key))
-			{
-				A[next].m_key = A[next].m_key + A[r].m_key;
-				A[r].m_key = next;
-				++r;
-			}
-			else
-			{
-				A[next].m_key = A[next].m_key + A[s].m_key;
-				++s;
-			}
-		}
-		A[num_syms - 2].m_key = 0;
+		if (!src_w || !src_h || !dst_w || !dst_h)
+			return false; // empty source or destination
 
-		for (next = num_syms - 3; next >= 0; --next)
+		if ((num_comps < 1) || (num_comps > cMaxComps))
+			return false;
+
+		if ((minimum(dst_w, dst_h) < 1) || (maximum(dst_w, dst_h) > BASISU_RESAMPLER_MAX_DIMENSION))
 		{
-			A[next].m_key = 1 + A[A[next].m_key].m_key;
+			printf("Image is too large!\n");
+			return false;
 		}
 
-		int num_avail = 1, num_used = 0, depth = 0;
-		r = num_syms - 2;
-		next = num_syms - 1;
-		while (num_avail > 0)
+		if ((src_w == dst_w) && (src_h == dst_h))
 		{
-			for ( ; (r >= 0) && ((int)A[r].m_key == depth); ++num_used, --r )
-				;
+			dst = src; // same size: whole-image copy, regardless of first_comp/num_comps
+			return true;
+		}
 
-			for ( ; num_avail > num_used; --next, --num_avail)
-				A[next].m_key = depth;
+		std::vector<float> samples[cMaxComps]; // one scanline buffer per processed component
+		Resampler* resamplers[cMaxComps]; // only the first num_comps entries are created and deleted
 
-			num_avail = 2 * num_used;
-			num_used = 0;
+		resamplers[0] = new Resampler(src_w, src_h, dst_w, dst_h,
+			wrapping ? Resampler::BOUNDARY_WRAP : Resampler::BOUNDARY_CLAMP, 1.0f, 0.0f, // no clamping
+			pFilter, nullptr, nullptr, filter_scale, filter_scale, 0, 0);
+		samples[0].resize(src_w);
+
+		for (uint32_t i = 1; i < num_comps; ++i) // remaining resamplers are handed resamplers[0]'s clist_x/clist_y instead of nullptr
+		{
+			resamplers[i] = new Resampler(src_w, src_h, dst_w, dst_h,
+				wrapping ? Resampler::BOUNDARY_WRAP : Resampler::BOUNDARY_CLAMP, 1.0f, 0.0f, // no clamping
+				pFilter, resamplers[0]->get_clist_x(), resamplers[0]->get_clist_y(), filter_scale, filter_scale, 0, 0);
+			samples[i].resize(src_w);
+		}
+
+		uint32_t dst_y = 0; // next destination row to fill
+
+		for (uint32_t src_y = 0; src_y < src_h; ++src_y)
+		{
+			const vec4F* pSrc = &src(0, src_y);
+
+			// Put source lines into resampler(s)
+			for (uint32_t x = 0; x < src_w; ++x)
+			{
+				for (uint32_t c = 0; c < num_comps; ++c)
+				{
+					const uint32_t comp_index = first_comp + c;
+					const float v = (*pSrc)[comp_index];
+
+					samples[c][x] = v;
+				}
+
+				pSrc++;
+			}
+
+			for (uint32_t c = 0; c < num_comps; ++c)
+			{
+				if (!resamplers[c]->put_line(&samples[c][0]))
+				{
+					for (uint32_t i = 0; i < num_comps; i++) // release every resampler before bailing out
+						delete resamplers[i];
+					return false;
+				}
+			}
+
+			// Now retrieve any output lines
+			for (;;)
+			{
+				uint32_t c;
+				for (c = 0; c < num_comps; ++c)
+				{
+					const uint32_t comp_index = first_comp + c;
+
+					const float* pOutput_samples = resamplers[c]->get_line();
+					if (!pOutput_samples)
+						break; // no output line available yet; c stays below num_comps
+										
+					vec4F* pDst = &dst(0, dst_y);
+
+					for (uint32_t x = 0; x < dst_w; x++)
+					{
+						(*pDst)[comp_index] = pOutput_samples[x]; // only the selected component of each dst pixel is written
+						pDst++;
+					}
+				}
+				if (c < num_comps)
+					break; // inner loop stopped early: go feed the next source row
+
+				++dst_y;
+			}
+		}
+
+		for (uint32_t i = 0; i < num_comps; ++i)
+			delete resamplers[i];
+
+		return true;
+	}
+
+	void canonical_huffman_calculate_minimum_redundancy(sym_freq *A, int num_syms) // Rewrites A[i].m_key in place across three passes; NOTE(review): presumably input keys are frequencies (sorted) and output keys are code lengths - confirm.
+	{
+		// See the paper "In-Place Calculation of Minimum Redundancy Codes" by Moffat and Katajainen
+		if (!num_syms)
+			return;
+
+		if (1 == num_syms)
+		{
+			A[0].m_key = 1; // single symbol: its key is set to 1
+			return;
+		}
+		
+		A[0].m_key += A[1].m_key;
+		
+		int s = 2, r = 0, next;
+		for (next = 1; next < (num_syms - 1); ++next) // first pass: each A[next] receives the sum of two keys chosen from positions r and s
+		{
+			if ((s >= num_syms) || (A[r].m_key < A[s].m_key))
+			{
+				A[next].m_key = A[r].m_key;
+				A[r].m_key = next; // A[r] now stores an index (next) rather than its previous key
+				++r;
+			}
+			else
+			{
+				A[next].m_key = A[s].m_key;
+				++s;
+			}
+
+			if ((s >= num_syms) || ((r < next) && A[r].m_key < A[s].m_key))
+			{
+				A[next].m_key = A[next].m_key + A[r].m_key;
+				A[r].m_key = next;
+				++r;
+			}
+			else
+			{
+				A[next].m_key = A[next].m_key + A[s].m_key;
+				++s;
+			}
+		}
+		A[num_syms - 2].m_key = 0;
+
+		for (next = num_syms - 3; next >= 0; --next) // second pass: each key becomes 1 + the key of the entry it indexes
+		{
+			A[next].m_key = 1 + A[A[next].m_key].m_key;
+		}
+
+		int num_avail = 1, num_used = 0, depth = 0;
+		r = num_syms - 2;
+		next = num_syms - 1;
+		while (num_avail > 0) // third pass: walks depth upward, writing depth into A[next] from the end of the array backwards
+		{
+			for ( ; (r >= 0) && ((int)A[r].m_key == depth); ++num_used, --r )
+				;
+
+			for ( ; num_avail > num_used; --next, --num_avail)
+				A[next].m_key = depth;
+
+			num_avail = 2 * num_used;
+			num_used = 0;
 			++depth;
 		}
 	}
@@ -1312,11 +1652,13 @@ namespace basisu
 
 		uint32_t a = max_index / num_syms, b = max_index % num_syms;
 
+		const uint32_t ofs = m_entries_picked.size();
+
 		m_entries_picked.push_back(a);
 		m_entries_picked.push_back(b);
 
 		for (uint32_t i = 0; i < num_syms; i++)
-			if ((i != b) && (i != a))
+			if ((i != m_entries_picked[ofs + 1]) && (i != m_entries_picked[ofs]))
 				m_entries_to_do.push_back(i);
 
 		for (uint32_t i = 0; i < m_entries_to_do.size(); i++)
@@ -1372,48 +1714,161 @@ namespace basisu
 		}
 		return which_side;
 	}
-
-	void image_metrics::calc(const image &a, const image &b, uint32_t first_chan, uint32_t total_chans, bool avg_comp_error, bool use_601_luma)
+	
+	void image_metrics::calc(const imagef& a, const imagef& b, uint32_t first_chan, uint32_t total_chans, bool avg_comp_error, bool log) // Float-image error metrics over the overlapping region of a and b; total_chans == 0 selects the luminance path.
 	{
 		assert((first_chan < 4U) && (first_chan + total_chans <= 4U));
 
 		const uint32_t width = basisu::minimum(a.get_width(), b.get_width());
 		const uint32_t height = basisu::minimum(a.get_height(), b.get_height());
 
-		double hist[256];
-		clear_obj(hist);
+		double max_e = -1e+30f; // running maximum of the absolute difference
+		double sum = 0.0f, sum_sqr = 0.0f; // accumulated |error| and error^2
 
+		m_has_neg = false; // set if any inspected value is negative
+		m_any_abnormal = false; // set if any inspected value is inf or NaN
+		m_hf_mag_overflow = false; // set if any inspected magnitude exceeds basist::MAX_HALF_FLOAT
+				
 		for (uint32_t y = 0; y < height; y++)
 		{
 			for (uint32_t x = 0; x < width; x++)
 			{
-				const color_rgba &ca = a(x, y), &cb = b(x, y);
-
+				const vec4F& ca = a(x, y), &cb = b(x, y);
+								
 				if (total_chans)
 				{
 					for (uint32_t c = 0; c < total_chans; c++)
-						hist[iabs(ca[first_chan + c] - cb[first_chan + c])]++;
+					{
+						float fa = ca[first_chan + c], fb = cb[first_chan + c];
+
+						if ((fabs(fa) > basist::MAX_HALF_FLOAT) || (fabs(fb) > basist::MAX_HALF_FLOAT))
+							m_hf_mag_overflow = true;
+
+						if ((fa < 0.0f) || (fb < 0.0f))
+							m_has_neg = true;
+
+						if (std::isinf(fa) || std::isinf(fb) || std::isnan(fa) || std::isnan(fb))
+							m_any_abnormal = true;
+												
+						const double delta = fabs(fa - fb);
+						max_e = basisu::maximum<double>(max_e, delta); // the maximum always uses the plain difference, even when log is set
+
+						if (log)
+						{
+							double log2_delta = log2f(basisu::maximum(0.0f, fa) + 1.0f) - log2f(basisu::maximum(0.0f, fb) + 1.0f); // negatives are clamped to 0 before log2(v + 1)
+
+							sum += fabs(log2_delta);
+							sum_sqr += log2_delta * log2_delta;
+						}
+						else
+						{
+							sum += fabs(delta);
+							sum_sqr += delta * delta;
+						}
+					}
 				}
 				else
 				{
-					if (use_601_luma)
-						hist[iabs(ca.get_601_luma() - cb.get_601_luma())]++;
+					for (uint32_t c = 0; c < 3; c++) // luminance path: the flag checks cover channels 0..2 only
+					{
+						float fa = ca[c], fb = cb[c];
+
+						if ((fabs(fa) > basist::MAX_HALF_FLOAT) || (fabs(fb) > basist::MAX_HALF_FLOAT))
+							m_hf_mag_overflow = true;
+
+						if ((fa < 0.0f) || (fb < 0.0f))
+							m_has_neg = true;
+
+						if (std::isinf(fa) || std::isinf(fb) || std::isnan(fa) || std::isnan(fb))
+							m_any_abnormal = true;
+					}
+
+					double ca_l = get_luminance(ca), cb_l = get_luminance(cb);
+					
+					double delta = fabs(ca_l - cb_l);
+					max_e = basisu::maximum(max_e, delta);
+					
+					if (log)
+					{
+						double log2_delta = log2(basisu::maximum<double>(0.0f, ca_l) + 1.0f) - log2(basisu::maximum<double>(0.0f, cb_l) + 1.0f);
+
+						sum += fabs(log2_delta);
+						sum_sqr += log2_delta * log2_delta;
+					}
 					else
-						hist[iabs(ca.get_709_luma() - cb.get_709_luma())]++;
+					{
+						sum += delta;
+						sum_sqr += delta * delta;
+					}
 				}
 			}
 		}
 
+		m_max = (double)(max_e);
+
+		double total_values = (double)width * (double)height; // one value per pixel unless avg_comp_error scales it below
+		if (avg_comp_error)
+			total_values *= (double)clamp<uint32_t>(total_chans, 1, 4); // total_chans == 0 (luminance) counts as 1
+
+		m_mean = (float)(sum / total_values);
+		m_mean_squared = (float)(sum_sqr / total_values);
+		m_rms = (float)sqrt(sum_sqr / total_values);
+		
+		const double max_val = 1.0f; // peak value used in the PSNR ratio below
+		m_psnr = m_rms ? (float)clamp<double>(log10(max_val / m_rms) * 20.0f, 0.0f, 1000.0f) : 1000.0f; // clamped to [0, 1000]; 1000 when m_rms is 0
+	}
+
+	void image_metrics::calc_half(const imagef& a, const imagef& b, uint32_t first_chan, uint32_t total_chans, bool avg_comp_error)
+	{
+		assert(total_chans);
+		assert((first_chan < 4U) && (first_chan + total_chans <= 4U));
+
+		const uint32_t width = basisu::minimum(a.get_width(), b.get_width());
+		const uint32_t height = basisu::minimum(a.get_height(), b.get_height());
+
+		m_has_neg = false;
+		m_hf_mag_overflow = false;
+		m_any_abnormal = false;
+
+		uint_vec hist(65536);
+		
+		for (uint32_t y = 0; y < height; y++)
+		{
+			for (uint32_t x = 0; x < width; x++)
+			{
+				const vec4F& ca = a(x, y), &cb = b(x, y);
+
+				for (uint32_t i = 0; i < 4; i++)
+				{
+					if ((ca[i] < 0.0f) || (cb[i] < 0.0f))
+						m_has_neg = true;
+					
+					if ((fabs(ca[i]) > basist::MAX_HALF_FLOAT) || (fabs(cb[i]) > basist::MAX_HALF_FLOAT))
+						m_hf_mag_overflow = true;
+
+					if (std::isnan(ca[i]) || std::isnan(cb[i]) || std::isinf(ca[i]) || std::isinf(cb[i]))
+						m_any_abnormal = true;
+				}
+
+				int cah[4] = { basist::float_to_half(ca[0]), basist::float_to_half(ca[1]), basist::float_to_half(ca[2]), basist::float_to_half(ca[3]) };
+				int cbh[4] = { basist::float_to_half(cb[0]), basist::float_to_half(cb[1]), basist::float_to_half(cb[2]), basist::float_to_half(cb[3]) };
+
+				for (uint32_t c = 0; c < total_chans; c++)
+					hist[iabs(cah[first_chan + c] - cbh[first_chan + c]) & 65535]++;
+
+			} // x
+		} // y
+
 		m_max = 0;
 		double sum = 0.0f, sum2 = 0.0f;
-		for (uint32_t i = 0; i < 256; i++)
+		for (uint32_t i = 0; i < 65536; i++)
 		{
 			if (hist[i])
 			{
-				m_max = basisu::maximum<float>(m_max, (float)i);
-				double v = i * hist[i];
+				m_max = basisu::maximum<double>(m_max, (double)i);
+				double v = (double)i * (double)hist[i];
 				sum += v;
-				sum2 += i * v;
+				sum2 += (double)i * v;
 			}
 		}
 
@@ -1421,63 +1876,183 @@ namespace basisu
 		if (avg_comp_error)
 			total_values *= (double)clamp<uint32_t>(total_chans, 1, 4);
 
-		m_mean = (float)clamp<double>(sum / total_values, 0.0f, 255.0);
-		m_mean_squared = (float)clamp<double>(sum2 / total_values, 0.0f, 255.0f * 255.0f);
+		const float max_val = 65535.0f;
+		m_mean = (float)clamp<double>(sum / total_values, 0.0f, max_val);
+		m_mean_squared = (float)clamp<double>(sum2 / total_values, 0.0f, max_val * max_val);
 		m_rms = (float)sqrt(m_mean_squared);
-		m_psnr = m_rms ? (float)clamp<double>(log10(255.0 / m_rms) * 20.0f, 0.0f, 100.0f) : 100.0f;
+		m_psnr = m_rms ? (float)clamp<double>(log10(max_val / m_rms) * 20.0f, 0.0f, 1000.0f) : 1000.0f;
 	}
 
-	void fill_buffer_with_random_bytes(void *pBuf, size_t size, uint32_t seed)
+	// Alt. variant, same as calc_half(), for validation.
+	void image_metrics::calc_half2(const imagef& a, const imagef& b, uint32_t first_chan, uint32_t total_chans, bool avg_comp_error)
 	{
-		rand r(seed);
+		assert(total_chans);
+		assert((first_chan < 4U) && (first_chan + total_chans <= 4U));
 
-		uint8_t *pDst = static_cast<uint8_t *>(pBuf);
+		const uint32_t width = basisu::minimum(a.get_width(), b.get_width()); // only the region covered by both images is compared
+		const uint32_t height = basisu::minimum(a.get_height(), b.get_height());
 
-		while (size >= sizeof(uint32_t))
-		{
-			*(uint32_t *)pDst = r.urand32();
-			pDst += sizeof(uint32_t);
-			size -= sizeof(uint32_t);
-		}
+		m_has_neg = false;
+		m_hf_mag_overflow = false;
+		m_any_abnormal = false;
+				
+		double sum = 0.0f, sum2 = 0.0f;
+		m_max = 0;
 
-		while (size)
+		for (uint32_t y = 0; y < height; y++)
 		{
-			*pDst++ = r.byte();
-			size--;
-		}
-	}
+			for (uint32_t x = 0; x < width; x++)
+			{
+				const vec4F& ca = a(x, y), & cb = b(x, y);
 
-	uint32_t hash_hsieh(const uint8_t *pBuf, size_t len)
-	{
-		if (!pBuf || !len) 
-			return 0;
+				for (uint32_t i = 0; i < 4; i++) // the diagnostic flags always look at all 4 channels, not just the selected ones
+				{
+					if ((ca[i] < 0.0f) || (cb[i] < 0.0f))
+						m_has_neg = true;
 
-		uint32_t h = static_cast<uint32_t>(len);
+					if ((fabs(ca[i]) > basist::MAX_HALF_FLOAT) || (fabs(cb[i]) > basist::MAX_HALF_FLOAT))
+						m_hf_mag_overflow = true;
 
-		const uint32_t bytes_left = len & 3;
-		len >>= 2;
+					if (std::isnan(ca[i]) || std::isnan(cb[i]) || std::isinf(ca[i]) || std::isinf(cb[i]))
+						m_any_abnormal = true;
+				}
 
-		while (len--)
-		{
-			const uint16_t *pWords = reinterpret_cast<const uint16_t *>(pBuf);
+				int cah[4] = { basist::float_to_half(ca[0]), basist::float_to_half(ca[1]), basist::float_to_half(ca[2]), basist::float_to_half(ca[3]) };
+				int cbh[4] = { basist::float_to_half(cb[0]), basist::float_to_half(cb[1]), basist::float_to_half(cb[2]), basist::float_to_half(cb[3]) };
 
-			h += pWords[0];
-			
-			const uint32_t t = (pWords[1] << 11) ^ h;
-			h = (h << 16) ^ t;
-			
-			pBuf += sizeof(uint32_t);
-			
-			h += h >> 11;
-		}
+				for (uint32_t c = 0; c < total_chans; c++)
+				{
+					int diff = iabs(cah[first_chan + c] - cbh[first_chan + c]);
+					if (diff)
+						m_max = std::max<double>(m_max, (double)diff);
 
-		switch (bytes_left)
-		{
-		case 1: 
-			h += *reinterpret_cast<const signed char*>(pBuf);
-			h ^= h << 10;
-			h += h >> 1;
-			break;
+					sum += diff;
+					sum2 += (double)diff * (double)diff; // squared in double: diff can reach 65535 and 65535^2 does not fit in a 32-bit int
+				}
+
+			} // x
+		} // y
+						
+		double total_values = (double)width * (double)height;
+		if (avg_comp_error)
+			total_values *= (double)clamp<uint32_t>(total_chans, 1, 4); // average per component rather than per pixel
+
+		const float max_val = 65535.0f;
+		m_mean = (float)clamp<double>(sum / total_values, 0.0f, max_val);
+		m_mean_squared = (float)clamp<double>(sum2 / total_values, 0.0f, max_val * max_val);
+		m_rms = (float)sqrt(m_mean_squared);
+		m_psnr = m_rms ? (float)clamp<double>(log10(max_val / m_rms) * 20.0f, 0.0f, 1000.0f) : 1000.0f; // zero error reports the 1000 cap
+	}
+
+	void image_metrics::calc(const image &a, const image &b, uint32_t first_chan, uint32_t total_chans, bool avg_comp_error, bool use_601_luma) // total_chans == 0 selects luma comparison
+	{
+		assert((first_chan < 4U) && (first_chan + total_chans <= 4U));
+
+		const uint32_t width = basisu::minimum(a.get_width(), b.get_width()); // only the region covered by both images is compared
+		const uint32_t height = basisu::minimum(a.get_height(), b.get_height());
+
+		double hist[256]; // hist[d] = number of samples whose absolute difference is d
+		clear_obj(hist);
+
+		m_has_neg = false; // these three flags are simply reset here; this function never sets them
+		m_any_abnormal = false;
+		m_hf_mag_overflow = false;
+
+		for (uint32_t y = 0; y < height; y++)
+		{
+			for (uint32_t x = 0; x < width; x++)
+			{
+				const color_rgba &ca = a(x, y), &cb = b(x, y);
+
+				if (total_chans)
+				{
+					for (uint32_t c = 0; c < total_chans; c++)
+						hist[iabs(ca[first_chan + c] - cb[first_chan + c])]++;
+				}
+				else
+				{
+					if (use_601_luma)
+						hist[iabs(ca.get_601_luma() - cb.get_601_luma())]++;
+					else
+						hist[iabs(ca.get_709_luma() - cb.get_709_luma())]++;
+				}
+			}
+		}
+
+		m_max = 0;
+		double sum = 0.0f, sum2 = 0.0f;
+		for (uint32_t i = 0; i < 256; i++)
+		{
+			if (hist[i])
+			{
+				m_max = basisu::maximum<double>(m_max, (double)i);
+				double v = i * hist[i]; // total absolute error contributed by difference value i
+				sum += v;
+				sum2 += i * v; // total squared error contributed by difference value i
+			}
+		}
+
+		double total_values = (double)width * (double)height;
+		if (avg_comp_error)
+			total_values *= (double)clamp<uint32_t>(total_chans, 1, 4); // the clamp keeps the luma case (total_chans == 0) at one value per pixel
+
+		m_mean = (float)clamp<double>(sum / total_values, 0.0f, 255.0);
+		m_mean_squared = (float)clamp<double>(sum2 / total_values, 0.0f, 255.0f * 255.0f);
+		m_rms = (float)sqrt(m_mean_squared);
+		m_psnr = m_rms ? (float)clamp<double>(log10(255.0 / m_rms) * 20.0f, 0.0f, 100.0f) : 100.0f; // zero error reports the 100 cap
+	}
+
+	void fill_buffer_with_random_bytes(void *pBuf, size_t size, uint32_t seed) // writes size bytes at pBuf from a generator seeded with seed
+	{
+		rand r(seed);
+		uint8_t *pDst = static_cast<uint8_t *>(pBuf);
+
+		while (size >= sizeof(uint32_t)) // whole 32-bit words first
+		{
+			const uint32_t v = r.urand32();
+			memcpy(pDst, &v, sizeof(v)); // memcpy instead of a cast store: pBuf has no alignment guarantee
+			pDst += sizeof(uint32_t);
+			size -= sizeof(uint32_t);
+		}
+
+		while (size) // then the remaining tail bytes, one at a time
+		{
+			*pDst++ = r.byte();
+			size--;
+		}
+	}
+
+	uint32_t hash_hsieh(const uint8_t *pBuf, size_t len)
+	{
+		if (!pBuf || !len) 
+			return 0;
+
+		uint32_t h = static_cast<uint32_t>(len);
+
+		const uint32_t bytes_left = len & 3;
+		len >>= 2;
+
+		while (len--)
+		{
+			const uint16_t *pWords = reinterpret_cast<const uint16_t *>(pBuf);
+
+			h += pWords[0];
+			
+			const uint32_t t = (pWords[1] << 11) ^ h;
+			h = (h << 16) ^ t;
+			
+			pBuf += sizeof(uint32_t);
+			
+			h += h >> 11;
+		}
+
+		switch (bytes_left)
+		{
+		case 1: 
+			h += *reinterpret_cast<const signed char*>(pBuf);
+			h ^= h << 10;
+			h += h >> 1;
+			break;
 		case 2: 
 			h += *reinterpret_cast<const uint16_t *>(pBuf);
 			h ^= h << 11;
@@ -1922,7 +2497,7 @@ namespace basisu
 
 				} while (pixels_remaining);
 
-				assert((pDst - &input_line_buf[0]) == width * tga_bytes_per_pixel);
+				assert((pDst - &input_line_buf[0]) == (int)(width * tga_bytes_per_pixel));
 
 				pLine_data = &input_line_buf[0];
 			}
@@ -2052,56 +2627,1059 @@ namespace basisu
 		return read_tga(&filedata[0], (uint32_t)filedata.size(), width, height, n_chans);
 	}
 
-	void image::debug_text(uint32_t x_ofs, uint32_t y_ofs, uint32_t scale_x, uint32_t scale_y, const color_rgba& fg, const color_rgba* pBG, bool alpha_only, const char* pFmt, ...)
+	static inline void hdr_convert(const color_rgba& rgbe, vec4F& c) // expands one RGBE pixel (mantissas in [0..2], exponent in [3]) into float RGB with alpha 1
 	{
-		char buf[2048];
+		if (rgbe[3] != 0)
+		{
+			float scale = ldexp(1.0f, rgbe[3] - 128 - 8); // 2^(E - 128 - 8): removes the exponent bias and scales the 8-bit mantissas
+			c.set((float)rgbe[0] * scale, (float)rgbe[1] * scale, (float)rgbe[2] * scale, 1.0f);
+		}
+		else
+		{
+			c.set(0.0f, 0.0f, 0.0f, 1.0f); // a zero exponent byte encodes black regardless of the mantissas
+		}
+	}
 
-		va_list args;
-		va_start(args, pFmt);
-#ifdef _WIN32		
-		vsprintf_s(buf, sizeof(buf), pFmt, args);
+	bool string_begins_with(const std::string& str, const char* pPhrase) // true if str starts with pPhrase; the comparison ignores case
+	{
+		const size_t str_len = str.size();
+
+		const size_t phrase_len = strlen(pPhrase);
+		assert(phrase_len); // an empty phrase is treated as a caller error
+
+		if (str_len >= phrase_len)
+		{
+#ifdef _MSC_VER
+			if (_strnicmp(pPhrase, str.c_str(), phrase_len) == 0)
 #else
-		vsnprintf(buf, sizeof(buf), pFmt, args);
+			if (strncasecmp(pPhrase, str.c_str(), phrase_len) == 0)
 #endif
-		va_end(args);
+				return true; // body shared by whichever 'if' the preprocessor kept above
+		}
 
-		const char* p = buf;
+		return false;
+	}
 
-		const uint32_t orig_x_ofs = x_ofs;
+	// Radiance RGBE (.HDR) image reading.
+	// This code tries to preserve the original logic in Radiance's ray/src/common/color.c code:
+	// https://www.radiance-online.org/cgi-bin/viewcvs.cgi/ray/src/common/color.c?revision=2.26&view=markup&sortby=log
+	// Also see: https://flipcode.com/archives/HDR_Image_Reader.shtml.
+	// https://github.com/LuminanceHDR/LuminanceHDR/blob/master/src/Libpfs/io/rgbereader.cpp.
+	// https://radsite.lbl.gov/radiance/refer/filefmts.pdf
+	// Buggy readers:
+	// stb_image.h: appears to be a clone of rgbe.c, but with goto's (doesn't support old format files, doesn't support mixture of RLE/non-RLE scanlines)
+	// http://www.graphics.cornell.edu/~bjw/rgbe.html - rgbe.c/h
+	// http://www.graphics.cornell.edu/online/formats/rgbe/ - rgbe.c/.h - buggy
+	bool read_rgbe(const uint8_vec &filedata, imagef& img, rgbe_header_info& hdr_info) // returns false on malformed or truncated input; img may be partially written in that case
+	{
+		hdr_info.clear();
 
-		while (*p)
+		const uint32_t MAX_SUPPORTED_DIM = 65536;
+
+		if (filedata.size() < 4)
+			return false;
+
+		// stb_image.h checks for the string "#?RADIANCE" or "#?RGBE" in the header.
+		// The original Radiance header code doesn't care about the specific string.
+		// opencv's reader only checks for "#?", so that's what we're going to do.
+		if ((filedata[0] != '#') || (filedata[1] != '?'))
+			return false;
+
+		//uint32_t width = 0, height = 0;
+		bool is_rgbe = false;
+		size_t cur_ofs = 0;
+
+		// Parse the lines until we encounter a blank line.
+		std::string cur_line;
+		for (; ; )
 		{
-			uint8_t c = *p++;
-			if ((c < 32) || (c > 127))
-				c = '.';
+			if (cur_ofs >= filedata.size())
+				return false;
 
-			const uint8_t* pGlpyh = &g_debug_font8x8_basic[c - 32][0];
+			const uint32_t HEADER_TOO_BIG_SIZE = 4096;
+			if (cur_ofs >= HEADER_TOO_BIG_SIZE)
+			{
+				// Header seems too large - something is likely wrong. Return failure.
+				return false;
+			}
 
-			for (uint32_t y = 0; y < 8; y++)
+			uint8_t c = filedata[cur_ofs++];
+
+			if (c == '\n')
 			{
-				uint32_t row_bits = pGlpyh[y];
-				for (uint32_t x = 0; x < 8; x++)
+				if (!cur_line.size())
+					break;
+
+				if ((cur_line[0] == '#') && (!string_begins_with(cur_line, "#?")) && (!hdr_info.m_program.size()))
 				{
-					const uint32_t q = row_bits & (1 << x);
-										
-					const color_rgba* pColor = q ? &fg : pBG;
-					if (!pColor)
-						continue;
+					cur_line.erase(0, 1);
+					while (cur_line.size() && (cur_line[0] == ' '))
+						cur_line.erase(0, 1);
 
-					if (alpha_only)
-						fill_box_alpha(x_ofs + x * scale_x, y_ofs + y * scale_y, scale_x, scale_y, *pColor);
-					else
-						fill_box(x_ofs + x * scale_x, y_ofs + y * scale_y, scale_x, scale_y, *pColor);
+					hdr_info.m_program = cur_line;
+				}
+				else if (string_begins_with(cur_line, "EXPOSURE=") && (cur_line.size() > 9))
+				{
+					hdr_info.m_exposure = atof(cur_line.c_str() + 9);
+					hdr_info.m_has_exposure = true;
+				}
+				else if (string_begins_with(cur_line, "GAMMA=") && (cur_line.size() > 6))
+				{
+					hdr_info.m_gamma = atof(cur_line.c_str() + 6); // store into m_gamma (the field write_rgbe emits for GAMMA=), not m_exposure
+					hdr_info.m_has_gamma = true;
+				}
+				else if (cur_line == "FORMAT=32-bit_rle_rgbe")
+				{
+					is_rgbe = true;
 				}
+
+				cur_line.resize(0);
 			}
+			else
+				cur_line.push_back((char)c);
+		}
 
-			x_ofs += 8 * scale_x;
-			if ((x_ofs + 8 * scale_x) > m_width)
+		if (!is_rgbe)
+			return false;
+
+		// Assume and require the final line to have the image's dimensions. We're not supporting flipping.
+		for (; ; )
+		{
+			if (cur_ofs >= filedata.size())
+				return false;
+			uint8_t c = filedata[cur_ofs++];
+			if (c == '\n')
+				break;
+			cur_line.push_back((char)c);
+		}
+
+		int comp[2] = { 1, 0 }; // y, x (major, minor)
+		int dir[2] = { -1, 1 }; // -1, 1, (major, minor), for y -1=up
+		uint32_t major_dim = 0, minor_dim = 0;
+
+		// Parse the dimension string, normally it'll be "-Y # +X #" (major, minor), rarely it differs
+		for (uint32_t d = 0; d < 2; d++) // 0=major, 1=minor
+		{
+			const bool is_neg_x = (strncmp(&cur_line[0], "-X ", 3) == 0);
+			const bool is_pos_x = (strncmp(&cur_line[0], "+X ", 3) == 0);
+			const bool is_x = is_neg_x || is_pos_x;
+
+			const bool is_neg_y = (strncmp(&cur_line[0], "-Y ", 3) == 0);
+			const bool is_pos_y = (strncmp(&cur_line[0], "+Y ", 3) == 0);
+			const bool is_y = is_neg_y || is_pos_y;
+
+			if (cur_line.size() < 3)
+				return false;
+			
+			if (!is_x && !is_y)
+				return false;
+
+			comp[d] = is_x ? 0 : 1;
+			dir[d] = (is_neg_x || is_neg_y) ? -1 : 1;
+			
+			uint32_t& dim = d ? minor_dim : major_dim;
+
+			cur_line.erase(0, 3);
+
+			while (cur_line.size())
 			{
-				x_ofs = orig_x_ofs;
-				y_ofs += 8 * scale_y;
+				char c = cur_line[0];
+				if (c != ' ')
+					break;
+				cur_line.erase(0, 1);
+			}
+
+			bool has_digits = false;
+			while (cur_line.size())
+			{
+				char c = cur_line[0];
+				cur_line.erase(0, 1);
+
+				if (c == ' ')
+					break;
+
+				if ((c < '0') || (c > '9'))
+					return false;
+
+				const uint32_t prev_dim = dim;
+				dim = dim * 10 + (c - '0');
+				if (dim < prev_dim) // a decrease means the 32-bit accumulator wrapped
+					return false;
+
+				has_digits = true;
 			}
+			if (!has_digits)
+				return false;
+
+			if ((dim < 1) || (dim > MAX_SUPPORTED_DIM))
+				return false;
 		}
-	}
-		
+				
+		// temp image: width=minor, height=major
+		img.resize(minor_dim, major_dim);
+
+		std::vector<color_rgba> temp_scanline(minor_dim);
+
+		// Read the scanlines.
+		for (uint32_t y = 0; y < major_dim; y++)
+		{
+			vec4F* pDst = &img(0, y);
+
+			if ((filedata.size() - cur_ofs) < 4)
+				return false;
+
+			// Determine if the line uses the new or old format. See the logic in color.c.
+			bool old_decrunch = false;
+			if ((minor_dim < 8) || (minor_dim > 0x7FFF))
+			{
+				// Line is too short or long; must be old format.
+				old_decrunch = true;
+			}
+			else if (filedata[cur_ofs] != 2)
+			{
+				// R is not 2, must be old format
+				old_decrunch = true;
+			}
+			else
+			{
+				// c[0]/red is 2.Check GB and E for validity.				
+				color_rgba c;
+				memcpy(&c, &filedata[cur_ofs], 4);
+
+				if ((c[1] != 2) || (c[2] & 0x80))
+				{
+					// G isn't 2, or the high bit of B is set which is impossible (image's > 0x7FFF pixels can't get here). Use old format.
+					old_decrunch = true;
+				}
+				else
+				{
+					// Check B and E. If this isn't the minor_dim in network order, something is wrong. The pixel would also be denormalized, and invalid.
+					uint32_t w = (c[2] << 8) | c[3];
+					if (w != minor_dim)
+						return false;
+
+					cur_ofs += 4;
+				}
+			}
+
+			if (old_decrunch)
+			{
+				uint32_t rshift = 0, x = 0;
+
+				while (x < minor_dim)
+				{
+					if ((filedata.size() - cur_ofs) < 4)
+						return false;
+
+					color_rgba c;
+					memcpy(&c, &filedata[cur_ofs], 4);
+					cur_ofs += 4;
+
+					if ((c[0] == 1) && (c[1] == 1) && (c[2] == 1))
+					{
+						// We'll allow RLE matches to cross scanlines, but not on the very first pixel.
+						if ((!x) && (!y))
+							return false;
+
+						const uint32_t run_len = (rshift < 32) ? ((uint32_t)c[3] << rshift) : 0; // unsigned shift, never by >= 32 bits (undefined); repeated zero-length run markers can push rshift that far
+						const vec4F run_color(pDst[-1]);
+
+						if (((rshift >= 32) && c[3]) || ((x + run_len) > minor_dim)) // a nonzero count at rshift >= 32 cannot fit any supported scanline
+							return false;
+
+						for (uint32_t i = 0; i < run_len; i++)
+							*pDst++ = run_color;
+
+						rshift += 8;
+						x += run_len;
+					}
+					else
+					{
+						rshift = 0;
+
+						hdr_convert(c, *pDst);
+						pDst++;
+						x++;
+					}
+				}
+				continue;
+			}
+
+			// New format
+			for (uint32_t s = 0; s < 4; s++)
+			{
+				uint32_t x_ofs = 0;
+				while (x_ofs < minor_dim)
+				{
+					uint32_t num_remaining = minor_dim - x_ofs;
+
+					if (cur_ofs >= filedata.size())
+						return false;
+
+					uint8_t count = filedata[cur_ofs++];
+					if (count > 128) // counts above 128 encode a run of (count - 128) copies of the next byte
+					{
+						count -= 128;
+						if (count > num_remaining)
+							return false;
+
+						if (cur_ofs >= filedata.size())
+							return false;
+						const uint8_t val = filedata[cur_ofs++];
+
+						for (uint32_t i = 0; i < count; i++)
+							temp_scanline[x_ofs + i][s] = val;
+
+						x_ofs += count;
+					}
+					else
+					{
+						if ((!count) || (count > num_remaining))
+							return false;
+
+						for (uint32_t i = 0; i < count; i++)
+						{
+							if (cur_ofs >= filedata.size())
+								return false;
+							const uint8_t val = filedata[cur_ofs++];
+
+							temp_scanline[x_ofs + i][s] = val;
+						}
+
+						x_ofs += count;
+					}
+				} // while (x_ofs < minor_dim)
+			} // c
+
+			// Convert all the RGBE pixels to float now
+			for (uint32_t x = 0; x < minor_dim; x++, pDst++)
+				hdr_convert(temp_scanline[x], *pDst);
+
+			assert((pDst - &img(0, y)) == (int)minor_dim);
+
+		} // y
+
+		// at here:
+		// img(width,height)=image pixels as read from file, x=minor axis, y=major axis
+		// width=minor axis dimension
+		// height=major axis dimension
+		// in file, pixels are emitted in minor order, then major (so major=scanlines in the file)
+		
+		imagef final_img;
+		if (comp[0] == 0) // if major axis is X
+			final_img.resize(major_dim, minor_dim);
+		else // major axis is Y, minor is X
+			final_img.resize(minor_dim, major_dim);
+
+		// TODO: optimize the identity case
+		for (uint32_t major_iter = 0; major_iter < major_dim; major_iter++)
+		{
+			for (uint32_t minor_iter = 0; minor_iter < minor_dim; minor_iter++)
+			{
+				const vec4F& p = img(minor_iter, major_iter);
+
+				uint32_t dst_x = 0, dst_y = 0;
+
+				// is the minor dim output x?
+				if (comp[1] == 0) 
+				{
+					// minor axis is x, major is y
+					
+					// is minor axis (which is output x) flipped?
+					if (dir[1] < 0)
+						dst_x = minor_dim - 1 - minor_iter;
+					else
+						dst_x = minor_iter;
+
+					// is major axis (which is output y) flipped? -1=down in raster order, 1=up
+					if (dir[0] < 0)
+						dst_y = major_iter;
+					else
+						dst_y = major_dim - 1 - major_iter;
+				}
+				else
+				{
+					// minor axis is output y, major is output x
+
+					// is minor axis (which is output y) flipped?
+					if (dir[1] < 0)
+						dst_y = minor_iter;
+					else
+						dst_y = minor_dim - 1 - minor_iter;
+
+					// is major axis (which is output x) flipped?
+					if (dir[0] < 0)
+						dst_x = major_dim - 1 - major_iter;
+					else
+						dst_x = major_iter;
+				}
+
+				final_img(dst_x, dst_y) = p;
+			}
+		}
+
+		final_img.swap(img);
+
+		return true;
+	}
+
+	bool read_rgbe(const char* pFilename, imagef& img, rgbe_header_info& hdr_info) // file convenience overload: loads the whole file, then decodes from memory
+	{
+		uint8_vec filedata;
+		if (!read_file_to_vec(pFilename, filedata))
+			return false;
+		return read_rgbe(filedata, img, hdr_info);
+	}
+
+	static uint8_vec& append_string(uint8_vec& buf, const char* pStr) // appends the characters of pStr (without the terminating NUL) to buf; returns buf
+	{
+		const size_t str_len = strlen(pStr);
+		if (!str_len)
+			return buf;
+
+		const size_t ofs = buf.size();
+		buf.resize(ofs + str_len);
+		memcpy(&buf[ofs], pStr, str_len);
+
+		return buf;
+	}
+	
+	static uint8_vec& append_string(uint8_vec& buf, const std::string& str) // std::string overload; forwards to the C-string version
+	{
+		if (!str.size())
+			return buf;
+		return append_string(buf, str.c_str()); // goes through strlen(), so anything after an embedded NUL would not be appended
+	}
+
+	static inline void float2rgbe(color_rgba &rgbe, const vec4F &c) // packs float RGB into 8-bit mantissas plus a shared biased exponent; c[3] is ignored
+	{
+		const float red = c[0], green = c[1], blue = c[2];
+		assert(red >= 0.0f && green >= 0.0f && blue >= 0.0f);
+
+		const float max_v = basisu::maximumf(basisu::maximumf(red, green), blue);
+
+		if (max_v < 1e-32f)
+			rgbe.clear(); // too small to represent: clear the pixel
+		else 
+		{
+			int e;
+			const float scale = frexp(max_v, &e) * 256.0f / max_v; // frexp gives max_v = m * 2^e with m in [0.5, 1), so scale == 256 / 2^e
+			rgbe[0] = (uint8_t)(clamp<int>((int)(red * scale), 0, 255));
+			rgbe[1] = (uint8_t)(clamp<int>((int)(green * scale), 0, 255));
+			rgbe[2] = (uint8_t)(clamp<int>((int)(blue * scale), 0, 255));
+			rgbe[3] = (uint8_t)(e + 128); // exponent stored with a bias of 128
+		}
+	}
+
+	const bool RGBE_FORCE_RAW = false; // when true, write_rgbe emits uncompressed 4-byte pixels for every image
+	const bool RGBE_FORCE_OLD_CRUNCH = false; // note most readers (particularly stb_image.h's) don't properly support this, when they should
+		
+	bool write_rgbe(uint8_vec &file_data, imagef& img, rgbe_header_info& hdr_info) // serializes img into file_data as a Radiance .HDR stream; returns false only for an empty image
+	{
+		if (!img.get_width() || !img.get_height())
+			return false;
+
+		const uint32_t width = img.get_width(), height = img.get_height();
+		
+		file_data.resize(0);
+		file_data.reserve(1024 + img.get_width() * img.get_height() * 4);
+
+		append_string(file_data, "#?RADIANCE\n");
+
+		if (hdr_info.m_has_exposure)
+			append_string(file_data, string_format("EXPOSURE=%g\n", hdr_info.m_exposure));
+
+		if (hdr_info.m_has_gamma)
+			append_string(file_data, string_format("GAMMA=%g\n", hdr_info.m_gamma));
+
+		append_string(file_data, "FORMAT=32-bit_rle_rgbe\n\n");
+		append_string(file_data, string_format("-Y %u +X %u\n", height, width));
+
+		if (((width < 8) || (width > 0x7FFF)) || (RGBE_FORCE_RAW)) // widths outside [8, 0x7FFF] cannot use the per-channel RLE scanline header
+		{
+			for (uint32_t y = 0; y < height; y++)
+			{
+				for (uint32_t x = 0; x < width; x++)
+				{
+					color_rgba rgbe;
+					float2rgbe(rgbe, img(x, y));
+					append_vector(file_data, (const uint8_t *)&rgbe, sizeof(rgbe));
+				}
+			}
+		}
+		else if (RGBE_FORCE_OLD_CRUNCH)
+		{
+			for (uint32_t y = 0; y < height; y++)
+			{
+				int prev_r = -1, prev_g = -1, prev_b = -1, prev_e = -1; // -1 never equals a byte value, so the first pixel of a row is always written literally
+				uint32_t cur_run_len = 0;
+				
+				for (uint32_t x = 0; x < width; x++)
+				{
+					color_rgba rgbe;
+					float2rgbe(rgbe, img(x, y));
+
+					if ((rgbe[0] == prev_r) && (rgbe[1] == prev_g) && (rgbe[2] == prev_b) && (rgbe[3] == prev_e))
+					{
+						if (++cur_run_len == 255)
+						{
+							// this ensures rshift stays 0, it's lame but this path is only for testing readers
+							color_rgba f(1, 1, 1, cur_run_len - 1);
+							append_vector(file_data, (const uint8_t*)&f, sizeof(f));
+							append_vector(file_data, (const uint8_t*)&rgbe, sizeof(rgbe)); 
+							cur_run_len = 0;
+						}
+					}
+					else
+					{
+						if (cur_run_len > 0)
+						{
+							color_rgba f(1, 1, 1, cur_run_len); // (1,1,1,n) is the old-format marker for "repeat the previous pixel n times"
+							append_vector(file_data, (const uint8_t*)&f, sizeof(f));
+							
+							cur_run_len = 0;
+						}
+						
+						append_vector(file_data, (const uint8_t*)&rgbe, sizeof(rgbe));
+																		
+						prev_r = rgbe[0];
+						prev_g = rgbe[1];
+						prev_b = rgbe[2];
+						prev_e = rgbe[3];
+					}
+				} // x
+
+				if (cur_run_len > 0) // flush a run still pending at the end of the row
+				{
+					color_rgba f(1, 1, 1, cur_run_len);
+					append_vector(file_data, (const uint8_t*)&f, sizeof(f));
+				}
+			} // y
+		}
+		else
+		{
+			uint8_vec temp[4]; // one byte plane per RGBE component for the current scanline
+			for (uint32_t c = 0; c < 4; c++)
+				temp[c].resize(width);
+
+			for (uint32_t y = 0; y < height; y++)
+			{
+				color_rgba rgbe(2, 2, width >> 8, width & 0xFF); // scanline header: 2, 2, then the width as a big-endian 16-bit value
+				append_vector(file_data, (const uint8_t*)&rgbe, sizeof(rgbe));
+								
+				for (uint32_t x = 0; x < width; x++)
+				{
+					float2rgbe(rgbe, img(x, y));
+
+					for (uint32_t c = 0; c < 4; c++)
+						temp[c][x] = rgbe[c];
+				}
+
+				for (uint32_t c = 0; c < 4; c++)
+				{
+					int raw_ofs = -1; // offset in file_data of the open raw packet's count byte, or -1 if none is open
+					
+					uint32_t x = 0;
+					while (x < width)
+					{
+						const uint32_t num_bytes_remaining = width - x;
+						const uint32_t max_run_len = basisu::minimum<uint32_t>(num_bytes_remaining, 127); // a run is stored as 128 + length in a single byte
+						const uint8_t cur_byte = temp[c][x];
+
+						uint32_t run_len = 1;
+						while (run_len < max_run_len)
+						{
+							if (temp[c][x + run_len] != cur_byte)
+								break;
+							run_len++;
+						}
+												
+						const uint32_t cost_to_keep_raw = ((raw_ofs != -1) ? 0 : 1) + run_len; // 0 or 1 bytes to start a raw run, then the repeated bytes issued as raw
+						const uint32_t cost_to_take_run = 2 + 1; // 2 bytes to issue the RLE, then 1 bytes to start whatever follows it (raw or RLE)
+
+						if ((run_len >= 3) && (cost_to_take_run < cost_to_keep_raw))
+						{
+							file_data.push_back((uint8_t)(128 + run_len));
+							file_data.push_back(cur_byte);
+
+							x += run_len;
+							raw_ofs = -1;
+						}
+						else
+						{
+							if (raw_ofs < 0)
+							{
+								raw_ofs = (int)file_data.size();
+								file_data.push_back(0); // placeholder count, incremented as raw bytes are appended
+							}
+
+							if (++file_data[raw_ofs] == 128) // raw packets hold at most 128 bytes; close a full one
+								raw_ofs = -1;
+
+							file_data.push_back(cur_byte);
+							
+							x++;
+						}
+					} // x
+
+				} // c
+			} // y
+		}
+
+		return true;
+	}
+
+	bool write_rgbe(const char* pFilename, imagef& img, rgbe_header_info& hdr_info) // file convenience overload: encodes to memory, then writes the buffer out
+	{
+		uint8_vec file_data;
+		if (!write_rgbe(file_data, img, hdr_info))
+			return false;
+		return write_vec_to_file(pFilename, file_data);
+	}
+		
+	bool read_exr(const char* pFilename, imagef& img, int& n_chans) // loads an .EXR file through TinyEXR as RGBA floats; n_chans is 4 on return once the load has been attempted
+	{
+		n_chans = 0;
+
+		int width = 0, height = 0;
+		float* out_rgba = nullptr;
+		const char* err = nullptr;
+		
+		int status = LoadEXRWithLayer(&out_rgba, &width, &height, pFilename, nullptr, &err);
+		n_chans = 4;
+		if (status != 0)
+		{
+			error_printf("Failed loading .EXR image \"%s\"! (TinyEXR error: %s)\n", pFilename, err ? err : "?");
+			FreeEXRErrorMessage(err);
+			free(out_rgba);
+			return false;
+		}
+
+		const uint32_t MAX_SUPPORTED_DIM = 65536;
+		if ((width < 1) || (height < 1) || (width > (int)MAX_SUPPORTED_DIM) || (height > (int)MAX_SUPPORTED_DIM))
+		{
+			error_printf("Invalid dimensions of .EXR image \"%s\"!\n", pFilename);
+			free(out_rgba);
+			return false;
+		}
+
+		img.resize(width, height);
+		
+		if (n_chans == 1) // NOTE(review): n_chans was set to 4 just above, so this branch cannot currently be taken
+		{
+			const float* pSrc = out_rgba;
+			vec4F* pDst = img.get_ptr();
+
+			for (int y = 0; y < height; y++)
+			{
+				for (int x = 0; x < width; x++)
+				{
+					(*pDst)[0] = pSrc[0];
+					(*pDst)[1] = pSrc[1];
+					(*pDst)[2] = pSrc[2];
+					(*pDst)[3] = 1.0f;
+
+					pSrc += 4;
+					++pDst;
+				}
+			}
+		}
+		else
+		{
+			memcpy(img.get_ptr(), out_rgba, sizeof(float) * 4 * img.get_total_pixels()); // bulk copy of the loaded 4-float pixels into img
+		}
+
+		free(out_rgba);
+		return true;
+	}
+
+	bool read_exr(const void* pMem, size_t mem_size, imagef& img) // decodes an in-memory .EXR through TinyEXR into img as RGBA floats; false on failure
+	{
+		float* out_rgba = nullptr;
+		int width = 0, height = 0;
+		const char* pErr = nullptr;
+		int res = LoadEXRFromMemory(&out_rgba, &width, &height, (const uint8_t*)pMem, mem_size, &pErr);
+		const int MAX_SUPPORTED_DIM = 65536; // same limit the filename overload of read_exr() enforces
+		const bool dims_ok = (width >= 1) && (height >= 1) && (width <= MAX_SUPPORTED_DIM) && (height <= MAX_SUPPORTED_DIM);
+		if ((res < 0) || !dims_ok) // reject loader failures and out-of-range dimensions before touching img
+		{
+			error_printf("Failed loading .EXR image from memory! (TinyEXR error: %s)\n", pErr ? pErr : "?");
+			FreeEXRErrorMessage(pErr);
+			free(out_rgba);
+			return false;
+		}
+		img.resize(width, height);
+		memcpy(img.get_ptr(), out_rgba, sizeof(float) * 4 * img.get_total_pixels()); // size computed as in the filename overload, avoiding the int * int product
+		free(out_rgba);
+		return true;
+	}
+
+	bool write_exr(const char* pFilename, imagef& img, uint32_t n_chans, uint32_t flags) // writes img as a 1 (Y), 3 (BGR) or 4 (ABGR) channel .EXR via TinyEXR; false on bad arguments or save failure
+	{
+		assert((n_chans == 1) || (n_chans == 3) || (n_chans == 4));
+
+		const bool linear_hint = (flags & WRITE_EXR_LINEAR_HINT) != 0, 
+			store_float = (flags & WRITE_EXR_STORE_FLOATS) != 0,
+			no_compression = (flags & WRITE_EXR_NO_COMPRESSION) != 0;
+								
+		const uint32_t width = img.get_width(), height = img.get_height();
+		assert(width && height);
+		
+		if (!width || !height || ((n_chans != 1) && (n_chans != 3) && (n_chans != 4))) // validate n_chans before it indexes the 4-entry arrays below
+			return false;
+		
+		float_vec layers[4]; // one planar float buffer per output channel
+		float* image_ptrs[4];
+		for (uint32_t c = 0; c < n_chans; c++)
+		{
+			layers[c].resize(width * height);
+			image_ptrs[c] = layers[c].get_ptr();
+		}
+
+		// ABGR
+		int chan_order[4] = { 3, 2, 1, 0 };
+
+		if (n_chans == 1)
+		{
+			// Y
+			chan_order[0] = 0;
+		}
+		else if (n_chans == 3)
+		{
+			// BGR
+			chan_order[0] = 2;
+			chan_order[1] = 1;
+			chan_order[2] = 0;
+		}
+		else if (n_chans != 4) // unreachable after the early validation above; kept as a defensive check
+		{
+			assert(0);
+			return false;
+		}
+		
+		for (uint32_t y = 0; y < height; y++)
+		{
+			for (uint32_t x = 0; x < width; x++)
+			{
+				const vec4F& p = img(x, y);
+
+				for (uint32_t c = 0; c < n_chans; c++)
+					layers[c][x + y * width] = p[chan_order[c]]; // de-interleave into per-channel planes in the output channel order
+			} // x
+		} // y
+
+		EXRHeader header;
+		InitEXRHeader(&header);
+
+		EXRImage image;
+		InitEXRImage(&image);
+
+		image.num_channels = n_chans;
+		image.images = (unsigned char**)image_ptrs;
+		image.width = width;
+		image.height = height;
+
+		header.num_channels = n_chans;
+		
+		header.channels = (EXRChannelInfo*)calloc(header.num_channels, sizeof(EXRChannelInfo));
+
+		// Must be (A)BGR order, since most of EXR viewers expect this channel order.
+		for (uint32_t i = 0; i < n_chans; i++)
+		{
+			char c = 'Y';
+			if (n_chans == 3)
+				c = "BGR"[i];
+			else if (n_chans == 4)
+				c = "ABGR"[i];
+						
+			header.channels[i].name[0] = c;
+			header.channels[i].name[1] = '\0';
+
+			header.channels[i].p_linear = linear_hint;
+		}
+		
+		header.pixel_types = (int*)calloc(header.num_channels, sizeof(int));
+		header.requested_pixel_types = (int*)calloc(header.num_channels, sizeof(int));
+		
+		if (!no_compression)
+			header.compression_type = TINYEXR_COMPRESSIONTYPE_ZIP;
+
+		for (int i = 0; i < header.num_channels; i++) 
+		{
+			// pixel type of input image
+			header.pixel_types[i] = TINYEXR_PIXELTYPE_FLOAT; 
+
+			// pixel type of output image to be stored in .EXR
+			header.requested_pixel_types[i] = store_float ? TINYEXR_PIXELTYPE_FLOAT : TINYEXR_PIXELTYPE_HALF; 
+		}
+
+		const char* pErr_msg = nullptr;
+
+		int ret = SaveEXRImageToFile(&image, &header, pFilename, &pErr_msg);
+		if (ret != TINYEXR_SUCCESS) 
+		{
+			error_printf("Save EXR err: %s\n", pErr_msg ? pErr_msg : "?"); // guard against a null message, as the read_exr() overloads do
+			FreeEXRErrorMessage(pErr_msg);
+		}
+				
+		free(header.channels);
+		free(header.pixel_types);
+		free(header.requested_pixel_types);
+
+		return (ret == TINYEXR_SUCCESS);
+	}
+
+	void image::debug_text(uint32_t x_ofs, uint32_t y_ofs, uint32_t scale_x, uint32_t scale_y, const color_rgba& fg, const color_rgba* pBG, bool alpha_only, const char* pFmt, ...) // draws printf-style text into the image with the built-in 8x8 font
+	{
+		char buf[2048]; // formatted text is limited to this buffer
+
+		va_list args;
+		va_start(args, pFmt);
+#ifdef _WIN32		
+		vsprintf_s(buf, sizeof(buf), pFmt, args);
+#else
+		vsnprintf(buf, sizeof(buf), pFmt, args);
+#endif
+		va_end(args);
+
+		const char* p = buf;
+
+		const uint32_t orig_x_ofs = x_ofs; // column to return to when the text wraps
+
+		while (*p)
+		{
+			uint8_t c = *p++;
+			if ((c < 32) || (c > 127))
+				c = '.'; // characters outside 32..127 are drawn as '.'
+
+			const uint8_t* pGlpyh = &g_debug_font8x8_basic[c - 32][0];
+
+			for (uint32_t y = 0; y < 8; y++)
+			{
+				uint32_t row_bits = pGlpyh[y];
+				for (uint32_t x = 0; x < 8; x++)
+				{
+					const uint32_t q = row_bits & (1 << x); // bit x of the row selects foreground vs background for column x
+										
+					const color_rgba* pColor = q ? &fg : pBG;
+					if (!pColor) // null pBG: background pixels are left untouched
+						continue;
+
+					if (alpha_only)
+						fill_box_alpha(x_ofs + x * scale_x, y_ofs + y * scale_y, scale_x, scale_y, *pColor);
+					else
+						fill_box(x_ofs + x * scale_x, y_ofs + y * scale_y, scale_x, scale_y, *pColor);
+				}
+			}
+
+			x_ofs += 8 * scale_x;
+			if ((x_ofs + 8 * scale_x) > m_width) // next glyph would cross the right edge: wrap to the start column, one text row down
+			{
+				x_ofs = orig_x_ofs;
+				y_ofs += 8 * scale_y;
+			}
+		}
+	}
+	
+	// Very basic global Reinhard tone mapping, output converted to sRGB with no dithering, alpha is carried through unchanged. 
+	// Only used for debugging/development.
+	void tonemap_image_reinhard(image &ldr_img, const imagef &hdr_img, float exposure) // ldr_img is resized to match hdr_img and fully overwritten
+	{
+		uint32_t width = hdr_img.get_width(), height = hdr_img.get_height();
+
+		ldr_img.resize(width, height);
+				
+		for (uint32_t y = 0; y < height; y++)
+		{
+			for (uint32_t x = 0; x < width; x++)
+			{
+				vec4F c(hdr_img(x, y));
+
+				for (uint32_t t = 0; t < 3; t++) // RGB only; alpha is not tone mapped
+				{
+					if (c[t] <= 0.0f)
+					{
+						c[t] = 0.0f;
+					}
+					else
+					{
+						c[t] *= exposure;
+						c[t] = c[t] / (1.0f + c[t]); // Reinhard operator v / (1 + v)
+					}
+				}
+
+				c.clamp(0.0f, 1.0f); // clamps every component, alpha included
+
+				c[0] = linear_to_srgb(c[0]) * 255.0f;
+				c[1] = linear_to_srgb(c[1]) * 255.0f;
+				c[2] = linear_to_srgb(c[2]) * 255.0f;
+				c[3] = c[3] * 255.0f; // alpha is scaled linearly, without the sRGB curve
+
+				color_rgba& o = ldr_img(x, y);
+				
+				o[0] = (uint8_t)std::round(c[0]);
+				o[1] = (uint8_t)std::round(c[1]);
+				o[2] = (uint8_t)std::round(c[2]);
+				o[3] = (uint8_t)std::round(c[3]);
+			}
+		}
+	}
+
+	bool tonemap_image_compressive(image& dst_img, const imagef& hdr_test_img) // maps the high bytes of the half-float RGB bit patterns onto 0..255; false if any RGB value is negative or above MAX_HALF_FLOAT
+	{
+		const uint32_t width = hdr_test_img.get_width();
+		const uint32_t height = hdr_test_img.get_height();
+
+		uint16_vec orig_half_img(width * 3 * height);
+		uint16_vec half_img(width * 3 * height);
+
+		int max_shift = 32; // smallest leading-zero count seen so far; stays 32 if every value is zero
+
+		for (uint32_t y = 0; y < height; y++)
+		{
+			for (uint32_t x = 0; x < width; x++)
+			{
+				const vec4F& p = hdr_test_img(x, y);
+
+				for (uint32_t i = 0; i < 3; i++)
+				{
+					if (p[i] < 0.0f)
+						return false;
+					if (p[i] > basist::MAX_HALF_FLOAT)
+						return false;
+
+					uint32_t h = basist::float_to_half(p[i]);
+					//uint32_t orig_h = h;
+
+					orig_half_img[(x + y * width) * 3 + i] = (uint16_t)h;
+
+					// Rotate sign bit into LSB
+					//h = rot_left16((uint16_t)h, 1);
+					//assert(rot_right16((uint16_t)h, 1) == orig_h);
+					h <<= 1;
+
+					half_img[(x + y * width) * 3 + i] = (uint16_t)h;
+
+					// Determine # of leading zero bits, ignoring the sign bit
+					if (h)
+					{
+						int lz = clz(h) - 16;
+						assert(lz >= 0 && lz <= 16);
+
+						assert((h << lz) <= 0xFFFF);
+
+						max_shift = basisu::minimum<int>(max_shift, lz);
+					}
+				} // i
+			} // x
+		} // y
+		max_shift = clamp<int>(max_shift, 0, 16); // an all-zero image leaves max_shift at 32, and shifting the 16-bit values below by 32 would be undefined
+		//printf("tonemap_image_compressive: Max leading zeros: %i\n", max_shift);
+
+		uint32_t high_hist[256]; // usage count of each high byte after normalization
+		clear_obj(high_hist);
+
+		for (uint32_t y = 0; y < height; y++)
+		{
+			for (uint32_t x = 0; x < width; x++)
+			{
+				for (uint32_t i = 0; i < 3; i++)
+				{
+					uint16_t& hf = half_img[(x + y * width) * 3 + i];
+
+					assert(((uint32_t)hf << max_shift) <= 65535);
+
+					hf <<= max_shift; // normalize so the largest value uses the top bits
+
+					uint32_t h = (uint8_t)(hf >> 8);
+					high_hist[h]++;
+				}
+			} // x
+		} // y
+
+		uint32_t total_vals_used = 0;
+		int remap_old_to_new[256]; // -1 marks a high byte that never occurs
+		for (uint32_t i = 0; i < 256; i++)
+			remap_old_to_new[i] = -1;
+
+		for (uint32_t i = 0; i < 256; i++)
+		{
+			if (high_hist[i] != 0)
+			{
+				remap_old_to_new[i] = total_vals_used; // first pass: rank among the used values
+				total_vals_used++;
+			}
+		}
+
+		assert(total_vals_used >= 1);
+
+		//printf("tonemap_image_compressive: Total used high byte values: %u, unused: %u\n", total_vals_used, 256 - total_vals_used);
+
+		bool val_used[256];
+		clear_obj(val_used);
+
+		int remap_new_to_old[256];
+		for (uint32_t i = 0; i < 256; i++)
+			remap_new_to_old[i] = -1;
+		BASISU_NOTE_UNUSED(remap_new_to_old);
+
+		int prev_c = -1;
+		BASISU_NOTE_UNUSED(prev_c);
+		for (uint32_t i = 0; i < 256; i++)
+		{
+			if (remap_old_to_new[i] >= 0)
+			{
+				int c;
+				if (total_vals_used <= 1)
+					c = remap_old_to_new[i];
+				else
+				{
+					c = (remap_old_to_new[i] * 255 + ((total_vals_used - 1) / 2)) / (total_vals_used - 1); // second pass: spread the ranks evenly over 0..255, with rounding
+
+					assert(c > prev_c);
+				}
+
+				assert(!val_used[c]);
+
+				remap_new_to_old[c] = i;
+
+				remap_old_to_new[i] = c;
+				prev_c = c;
+
+				//printf("%u ", c);
+
+				val_used[c] = true;
+			}
+		} // i
+		//printf("\n");
+
+		dst_img.resize(width, height);
+
+		for (uint32_t y = 0; y < height; y++)
+		{
+			for (uint32_t x = 0; x < width; x++)
+			{
+				for (uint32_t c = 0; c < 3; c++) // only RGB is written here; alpha keeps whatever resize() left
+				{
+					uint16_t& v16 = half_img[(x + y * width) * 3 + c];
+
+					uint32_t hb = v16 >> 8;
+					//uint32_t lb = v16 & 0xFF;
+
+					assert(remap_old_to_new[hb] != -1);
+					assert(remap_old_to_new[hb] <= 255);
+					assert(remap_new_to_old[remap_old_to_new[hb]] == (int)hb);
+
+					hb = remap_old_to_new[hb];
+
+					//v16 = (uint16_t)((hb << 8) | lb);
+
+					dst_img(x, y)[c] = (uint8_t)hb;
+				}
+			} // x
+		} // y
+
+		return true;
+	}
+					
 } // namespace basisu
diff --git a/thirdparty/basis_universal/encoder/basisu_enc.h b/thirdparty/basis_universal/encoder/basisu_enc.h
index 0efeaa461fbf..780605e7b861 100644
--- a/thirdparty/basis_universal/encoder/basisu_enc.h
+++ b/thirdparty/basis_universal/encoder/basisu_enc.h
@@ -1,5 +1,5 @@
 // basisu_enc.h
-// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -48,7 +48,8 @@ namespace basisu
 
 	// Encoder library initialization.
 	// This function MUST be called before encoding anything!
-	void basisu_encoder_init(bool use_opencl = false, bool opencl_force_serialization = false);
+	// Returns false if library initialization fails.
+	bool basisu_encoder_init(bool use_opencl = false, bool opencl_force_serialization = false);
 	void basisu_encoder_deinit();
 
 	// basisu_kernels_sse.cpp - will be a no-op and g_cpu_supports_sse41 will always be false unless compiled with BASISU_SUPPORT_SSE=1
@@ -70,6 +71,18 @@ namespace basisu
 		return (uint8_t)((i & 0xFFFFFF00U) ? (~(i >> 31)) : i);
 	}
 
+	inline int left_shift32(int val, int shift) // Left-shifts a signed value by `shift` bits; `shift` is asserted to be in [0, 31].
+	{
+		assert((shift >= 0) && (shift < 32));
+		return static_cast<int>(static_cast<uint32_t>(val) << shift); // the shift is done on the uint32_t bit pattern, then converted back to int
+	}
+
+	inline uint32_t left_shift32(uint32_t val, int shift) // Unsigned overload: returns val << shift; `shift` is asserted to be in [0, 31].
+	{
+		assert((shift >= 0) && (shift < 32));
+		return val << shift;
+	}
+
 	inline int32_t clampi(int32_t value, int32_t low, int32_t high) 
 	{ 
 		if (value < low) 
@@ -130,6 +143,31 @@ namespace basisu
 
 		return bits;
 	}
+		
+	// Half-open interval [l, h): asserts l <= v < h and returns v unchanged. The (void) casts silence unused-parameter warnings when assert() compiles away.
+	inline int bounds_check(int v, int l, int h) { (void)v; (void)l; (void)h; assert(v >= l && v < h); return v; }
+	inline uint32_t bounds_check(uint32_t v, uint32_t l, uint32_t h) { (void)v; (void)l; (void)h; assert(v >= l && v < h); return v; }
+
+	// Closed interval [l, h]: asserts l <= v <= h and returns v unchanged.
+	inline int bounds_check_incl(int v, int l, int h) { (void)v; (void)l; (void)h; assert(v >= l && v <= h); return v; }
+	inline uint32_t bounds_check_incl(uint32_t v, uint32_t l, uint32_t h) { (void)v; (void)l; (void)h; assert(v >= l && v <= h); return v; }
+
+	inline uint32_t clz(uint32_t x) // Counts the leading zero bits of x; returns 32 when x == 0.
+	{
+		if (!x)
+			return 32;
+
+		uint32_t n = 0;
+		while ((x & 0x80000000) == 0) // shift left until the top bit is set, counting the steps
+		{
+			x <<= 1u;
+			n++;
+		}
+
+		return n;
+	}
+
+	bool string_begins_with(const std::string& str, const char* pPhrase);
 				
 	// Hashing
 	
@@ -268,6 +306,7 @@ namespace basisu
 
 	public:
 		enum { num_elements = N };
+		typedef T scalar_type;
 
 		inline vec() { }
 		inline vec(eZero) { set_zero();  }
@@ -291,6 +330,7 @@ namespace basisu
 		inline bool operator<(const vec &rhs) const { for (uint32_t i = 0; i < N; i++) { if (m_v[i] < rhs.m_v[i]) return true; else if (m_v[i] != rhs.m_v[i]) return false; } return false; }
 
 		inline void set_zero() { for (uint32_t i = 0; i < N; i++) m_v[i] = 0; }
+		inline void clear() { set_zero(); }
 
 		template <uint32_t OtherN, typename OtherT>
 		inline vec &set(const vec<OtherN, OtherT> &other)
@@ -391,7 +431,7 @@ namespace basisu
 		inline T distance(const vec &other) const { return static_cast<T>(sqrt(squared_distance(other))); }
 		inline double distance_d(const vec& other) const { return sqrt(squared_distance_d(other)); }
 
-		inline vec &normalize_in_place() { T len = length(); if (len != 0.0f) *this *= (1.0f / len);	return *this; }
+		inline vec &normalize_in_place() { T len = length(); if (len != 0.0f) *this *= (1.0f / len); return *this; }
 
 		inline vec &clamp(T l, T h)
 		{
@@ -722,7 +762,7 @@ namespace basisu
 		void job_thread(uint32_t index);
 	};
 
-	// Simple 32-bit color class
+	// Simple 64-bit color class
 
 	class color_rgba_i16
 	{
@@ -1116,7 +1156,9 @@ namespace basisu
 	{
 		std::string result(s);
 		for (size_t i = 0; i < result.size(); i++)
-			result[i] = (char)tolower((int)result[i]);
+		{
+			result[i] = (char)tolower((uint8_t)(result[i]));
+		}
 		return result;
 	}
 
@@ -1408,7 +1450,7 @@ namespace basisu
 
 		size_t get_total_training_vecs() const { return m_training_vecs.size(); }
 		const array_of_weighted_training_vecs &get_training_vecs() const	{ return m_training_vecs; }
-				array_of_weighted_training_vecs &get_training_vecs()			{ return m_training_vecs; }
+			  array_of_weighted_training_vecs &get_training_vecs()			{ return m_training_vecs; }
 
 		void retrieve(basisu::vector< basisu::vector<uint32_t> > &codebook) const
 		{
@@ -1437,36 +1479,36 @@ namespace basisu
 		}
 
 		void retrieve(uint32_t max_clusters, basisu::vector<uint_vec> &codebook) const
-      {
+		{
 			uint_vec node_stack;
-         node_stack.reserve(512);
+			node_stack.reserve(512);
 
-         codebook.resize(0);
-         codebook.reserve(max_clusters);
+			codebook.resize(0);
+			codebook.reserve(max_clusters);
 			         
-         uint32_t node_index = 0;
+			uint32_t node_index = 0;
 
-         while (true)
-         {
-            const tsvq_node& cur = m_nodes[node_index];
+			while (true)
+			{
+				const tsvq_node& cur = m_nodes[node_index];
 
-            if (cur.is_leaf() || ((2 + cur.m_codebook_index) > (int)max_clusters))
-            {
-               codebook.resize(codebook.size() + 1);
-               codebook.back() = cur.m_training_vecs;
+				if (cur.is_leaf() || ((2 + cur.m_codebook_index) > (int)max_clusters))
+				{
+					codebook.resize(codebook.size() + 1);
+					codebook.back() = cur.m_training_vecs;
 										
-               if (node_stack.empty())
-                  break;
+					if (node_stack.empty())
+						break;
 
-               node_index = node_stack.back();
-               node_stack.pop_back();
-               continue;
-            }
+					node_index = node_stack.back();
+					node_stack.pop_back();
+					continue;
+				}
 				            
-            node_stack.push_back(cur.m_right_index);
-				node_index = cur.m_left_index;
-         }
-      }
+				node_stack.push_back(cur.m_right_index);
+					node_index = cur.m_left_index;
+			}
+		}
 
 		bool generate(uint32_t max_size)
 		{
@@ -2319,6 +2361,14 @@ namespace basisu
 			m_total_bits = 0;
 		}
 
+		inline void restart() // Resets the writer state: empties m_bytes and zeroes the bit buffer, its fill size and the total bit count.
+		{
+			m_bytes.resize(0);
+			m_bit_buffer = 0;
+			m_bit_buffer_size = 0;
+			m_total_bits = 0;
+		}
+
 		inline const uint8_vec &get_bytes() const { return m_bytes; }
 
 		inline uint64_t get_total_bits() const { return m_total_bits; }
@@ -2920,11 +2970,11 @@ namespace basisu
 		inline const color_rgba *get_ptr() const { return &m_pixels[0]; }
 		inline color_rgba *get_ptr() { return &m_pixels[0]; }
 
-		bool has_alpha() const
+		bool has_alpha(uint32_t channel = 3) const
 		{
 			for (uint32_t y = 0; y < m_height; ++y)
 				for (uint32_t x = 0; x < m_width; ++x)
-					if ((*this)(x, y).a < 255)
+					if ((*this)(x, y)[channel] < 255)
 						return true;
 
 			return false;
@@ -3130,6 +3180,31 @@ namespace basisu
 			return *this;
 		}
 
+		imagef& crop_dup_borders(uint32_t w, uint32_t h) // Calls crop(w, h), then fills any columns/rows beyond the original extent by replicating the nearest original pixel. Returns *this.
+		{
+			const uint32_t orig_w = m_width, orig_h = m_height; // remember the pre-crop size; m_width/m_height are re-read after crop()
+
+			crop(w, h);
+
+			if (orig_w && orig_h) // nothing to replicate from if the image was empty
+			{
+				if (m_width > orig_w)
+				{
+					for (uint32_t x = orig_w; x < m_width; x++) // new columns to the right of the original image
+						for (uint32_t y = 0; y < m_height; y++)
+							set_clipped(x, y, get_clamped(minimum(x, orig_w - 1U), minimum(y, orig_h - 1U))); // source coordinate is clamped into the original extent
+				}
+
+				if (m_height > orig_h)
+				{
+					for (uint32_t y = orig_h; y < m_height; y++) // new rows below the original image
+						for (uint32_t x = 0; x < m_width; x++)
+							set_clipped(x, y, get_clamped(minimum(x, orig_w - 1U), minimum(y, orig_h - 1U)));
+				}
+			}
+			return *this;
+		}
+
 		inline const vec4F &operator() (uint32_t x, uint32_t y) const { assert(x < m_width && y < m_height); return m_pixels[x + y * m_pitch]; }
 		inline vec4F &operator() (uint32_t x, uint32_t y) { assert(x < m_width && y < m_height); return m_pixels[x + y * m_pitch]; }
 
@@ -3213,19 +3288,128 @@ namespace basisu
 
 		inline const vec4F *get_ptr() const { return &m_pixels[0]; }
 		inline vec4F *get_ptr() { return &m_pixels[0]; }
+
+		bool clean_astc_hdr_pixels(float highest_mag) // Sanitizes every component in place: NaN/Inf/-0 become 0, negatives become 0, values above highest_mag are clamped. Returns true only if nothing had to be changed.
+		{
+			bool status = true;
+			bool nan_msg = false; // the *_msg flags ensure each warning is printed to stderr at most once per call
+			bool inf_msg = false;
+			bool neg_zero_msg = false;
+			bool neg_msg = false;
+			bool clamp_msg = false;
+
+			for (uint32_t iy = 0; iy < m_height; iy++)
+			{
+				for (uint32_t ix = 0; ix < m_width; ix++)
+				{
+					vec4F& c = (*this)(ix, iy);
+
+					for (uint32_t s = 0; s < 4; s++) // all four components are checked
+					{
+						float &p = c[s];
+						union { float f; uint32_t u; } x; x.f = p; // raw bits, used to detect the -0.0f pattern (0x80000000)
+						
+						if ((std::isnan(p)) || (std::isinf(p)) || (x.u == 0x80000000))
+						{
+							if (std::isnan(p))
+							{
+								if (!nan_msg)
+								{
+									fprintf(stderr, "One or more pixels was NaN, setting to 0.\n");
+									nan_msg = true;
+								}
+							}
+
+							if (std::isinf(p))
+							{
+								if (!inf_msg)
+								{
+									fprintf(stderr, "One or more pixels was INF, setting to 0.\n");
+									inf_msg = true;
+								}
+							}
+
+							if (x.u == 0x80000000)
+							{
+								if (!neg_zero_msg)
+								{
+									fprintf(stderr, "One or more pixels was -0, setting them to 0.\n");
+									neg_zero_msg = true;
+								}
+							}
+
+							p = 0.0f;
+							status = false; // note: a -0 component also makes the function return false
+						}
+						else
+						{
+							//const float o = p;
+							if (p < 0.0f)
+							{
+								p = 0.0f;
+
+								if (!neg_msg)
+								{
+									fprintf(stderr, "One or more pixels was negative -- setting these pixel components to 0 because ASTC HDR doesn't support signed values.\n");
+									neg_msg = true;
+								}
+								
+								status = false;
+							}
+
+							if (p > highest_mag) // upper clamp is applied after the negative clamp
+							{
+								p = highest_mag;
+								
+								if (!clamp_msg)
+								{
+									fprintf(stderr, "One or more pixels had to be clamped to %f.\n", highest_mag);
+									clamp_msg = true;
+								}
+
+								status = false;
+							}
+						}
+					}
+				}
+			}
+
+			return status;
+		}
+
+		imagef& flip_y() // Mirrors the image vertically in place and returns *this.
+		{
+			for (uint32_t y = 0; y < m_height / 2; ++y) // only the top half is visited; each row is swapped with its mirror row
+				for (uint32_t x = 0; x < m_width; ++x)
+					std::swap((*this)(x, y), (*this)(x, m_height - 1 - y));
+
+			return *this;
+		}
 						
 	private:
 		uint32_t m_width, m_height, m_pitch;  // all in pixels
 		vec4F_vec m_pixels;
 	};
 
+	// REC 709 coefficients
+	const float REC_709_R = 0.212656f, REC_709_G = 0.715158f, REC_709_B = 0.072186f;
+
+	inline float get_luminance(const vec4F &c) // Weighted sum of components 0..2 using the REC_709_* constants; component 3 is ignored.
+	{
+		return c[0] * REC_709_R + c[1] * REC_709_G + c[2] * REC_709_B;
+	}
+
+	float linear_to_srgb(float l);
+	float srgb_to_linear(float s);
+
 	// Image metrics
 		
 	class image_metrics
 	{
 	public:
 		// TODO: Add ssim
-		float m_max, m_mean, m_mean_squared, m_rms, m_psnr, m_ssim;
+		double m_max, m_mean, m_mean_squared, m_rms, m_psnr, m_ssim;
+		bool m_has_neg, m_hf_mag_overflow, m_any_abnormal;
 
 		image_metrics()
 		{
@@ -3240,10 +3424,17 @@ namespace basisu
 			m_rms = 0;
 			m_psnr = 0;
 			m_ssim = 0;
+			m_has_neg = false;
+			m_hf_mag_overflow = false;
+			m_any_abnormal = false;
 		}
 
-		void print(const char *pPrefix = nullptr)	{ printf("%sMax: %3.0f Mean: %3.3f RMS: %3.3f PSNR: %2.3f dB\n", pPrefix ? pPrefix : "", m_max, m_mean, m_rms, m_psnr);	}
+		void print(const char *pPrefix = nullptr)	{ printf("%sMax: %3.3f Mean: %3.3f RMS: %3.3f PSNR: %2.3f dB\n", pPrefix ? pPrefix : "", m_max, m_mean, m_rms, m_psnr);	}
+		void print_hp(const char* pPrefix = nullptr) { printf("%sMax: %3.6f Mean: %3.6f RMS: %3.6f PSNR: %2.6f dB, Any Neg: %u, Half float overflow: %u, Any NaN/Inf: %u\n", pPrefix ? pPrefix : "", m_max, m_mean, m_rms, m_psnr, m_has_neg, m_hf_mag_overflow, m_any_abnormal); }
 
+		void calc(const imagef& a, const imagef& b, uint32_t first_chan = 0, uint32_t total_chans = 0, bool avg_comp_error = true, bool log = false);
+		void calc_half(const imagef& a, const imagef& b, uint32_t first_chan, uint32_t total_chans, bool avg_comp_error);
+		void calc_half2(const imagef& a, const imagef& b, uint32_t first_chan, uint32_t total_chans, bool avg_comp_error);
 		void calc(const image &a, const image &b, uint32_t first_chan = 0, uint32_t total_chans = 0, bool avg_comp_error = true, bool use_601_luma = false);
 	};
 
@@ -3256,6 +3447,8 @@ namespace basisu
 	bool load_tga(const char* pFilename, image& img);
 	inline bool load_tga(const std::string &filename, image &img) { return load_tga(filename.c_str(), img); }
 
+	bool load_qoi(const char* pFilename, image& img);
+
 	bool load_jpg(const char *pFilename, image& img);
 	inline bool load_jpg(const std::string &filename, image &img) { return load_jpg(filename.c_str(), img); }
 	
@@ -3263,9 +3456,64 @@ namespace basisu
 	bool load_image(const char* pFilename, image& img);
 	inline bool load_image(const std::string &filename, image &img) { return load_image(filename.c_str(), img); }
 
+	// Supports .HDR and most (but not all) .EXR's (see TinyEXR).
+	bool load_image_hdr(const char* pFilename, imagef& img, bool ldr_srgb_to_linear = true);
+	inline bool load_image_hdr(const std::string& filename, imagef& img, bool ldr_srgb_to_linear = true) { return load_image_hdr(filename.c_str(), img, ldr_srgb_to_linear); }
+
+	enum class hdr_image_type // Passed as img_type to the memory-based load_image_hdr() overload. NOTE(review): presumably tells it how to interpret pMem (raw half/float RGBA vs. PNG/EXR/HDR file data) -- confirm against the implementation.
+	{
+		cHITRGBAHalfFloat = 0,
+		cHITRGBAFloat = 1,
+		cHITPNGImage = 2,
+		cHITEXRImage = 3,
+		cHITHDRImage = 4
+	};
+
+	bool load_image_hdr(const void* pMem, size_t mem_size, imagef& img, uint32_t width, uint32_t height, hdr_image_type img_type, bool ldr_srgb_to_linear);
+
 	uint8_t *read_tga(const uint8_t *pBuf, uint32_t buf_size, int &width, int &height, int &n_chans);
 	uint8_t *read_tga(const char *pFilename, int &width, int &height, int &n_chans);
 		
+	struct rgbe_header_info // Header fields carried alongside an image by read_rgbe()/write_rgbe().
+	{
+		std::string m_program;
+
+		// Note no validation is done, either gamma or exposure may be 0.
+		double m_gamma = 1.0; // in-class defaults match clear(), so a default-constructed header is never indeterminate
+		bool m_has_gamma = false;
+
+		double m_exposure = 1.0; // watts/steradian/m^2.
+		bool m_has_exposure = false;
+
+		void clear() // Restores every field to its default value.
+		{
+			m_program.clear();
+			m_gamma = 1.0;
+			m_has_gamma = false;
+			m_exposure = 1.0;
+			m_has_exposure = false;
+		}
+	};
+
+	bool read_rgbe(const uint8_vec& filedata, imagef& img, rgbe_header_info& hdr_info);
+	bool read_rgbe(const char* pFilename, imagef& img, rgbe_header_info &hdr_info);
+
+	bool write_rgbe(uint8_vec& file_data, imagef& img, rgbe_header_info& hdr_info);
+	bool write_rgbe(const char* pFilename, imagef& img, rgbe_header_info& hdr_info);
+
+	bool read_exr(const char* pFilename, imagef& img, int& n_chans);
+	bool read_exr(const void* pMem, size_t mem_size, imagef& img);
+	
+	enum
+	{
+		WRITE_EXR_LINEAR_HINT = 1, // hint for lossy comp. methods: exr_perceptual_treatment_t, logarithmic or linear, defaults to logarithmic
+		WRITE_EXR_STORE_FLOATS = 2, // use 32-bit floats, otherwise it uses half floats
+		WRITE_EXR_NO_COMPRESSION = 4 // no compression, otherwise it uses ZIP compression (16 scanlines per block)
+	};
+
+	// Supports 1 (Y), 3 (RGB), or 4 (RGBA) channel images.
+	bool write_exr(const char* pFilename, imagef& img, uint32_t n_chans, uint32_t flags);
+			
 	enum
 	{
 		cImageSaveGrayscale = 1,
@@ -3276,19 +3524,22 @@ namespace basisu
 	inline bool save_png(const std::string &filename, const image &img, uint32_t image_save_flags = 0, uint32_t grayscale_comp = 0) { return save_png(filename.c_str(), img, image_save_flags, grayscale_comp); }
 	
 	bool read_file_to_vec(const char* pFilename, uint8_vec& data);
-	
+	bool read_file_to_data(const char* pFilename, void *pData, size_t len);	
+
 	bool write_data_to_file(const char* pFilename, const void* pData, size_t len);
 	
 	inline bool write_vec_to_file(const char* pFilename, const uint8_vec& v) {	return v.size() ? write_data_to_file(pFilename, &v[0], v.size()) : write_data_to_file(pFilename, "", 0); }
-
-	float linear_to_srgb(float l);
-	float srgb_to_linear(float s);
-
+		
 	bool image_resample(const image &src, image &dst, bool srgb = false,
 		const char *pFilter = "lanczos4", float filter_scale = 1.0f, 
 		bool wrapping = false,
 		uint32_t first_comp = 0, uint32_t num_comps = 4);
 
+	bool image_resample(const imagef& src, imagef& dst, 
+		const char* pFilter = "lanczos4", float filter_scale = 1.0f,
+		bool wrapping = false,
+		uint32_t first_comp = 0, uint32_t num_comps = 4);
+		
 	// Timing
 			
 	typedef uint64_t timer_ticks;
@@ -3319,6 +3570,8 @@ namespace basisu
 		bool m_started, m_stopped;
 	};
 
+	inline double get_interval_timer() { return interval_timer::ticks_to_secs(interval_timer::get_ticks()); } // Current interval_timer tick count converted to seconds.
+
 	// 2D array
 
 	template<typename T>
@@ -3372,8 +3625,8 @@ namespace basisu
 		inline const T &operator[] (uint32_t i) const { return m_values[i]; }
 		inline T &operator[] (uint32_t i) { return m_values[i]; }
 				
-		inline const T &at_clamped(int x, int y) const { return (*this)(clamp<int>(x, 0, m_width), clamp<int>(y, 0, m_height)); }		
-		inline T &at_clamped(int x, int y) { return (*this)(clamp<int>(x, 0, m_width), clamp<int>(y, 0, m_height)); }
+		inline const T &at_clamped(int x, int y) const { return (*this)(clamp<int>(x, 0, m_width - 1), clamp<int>(y, 0, m_height - 1)); } // Clamps x to [0, m_width - 1] and y to [0, m_height - 1] before indexing.
+		inline T &at_clamped(int x, int y) { return (*this)(clamp<int>(x, 0, m_width - 1), clamp<int>(y, 0, m_height - 1)); } // Non-const overload with the same clamping.
 
 		void clear()
 		{
@@ -3450,7 +3703,327 @@ namespace basisu
 		}
 	};
 	typedef basisu::vector<pixel_block> pixel_block_vec;
-		
+
+	struct pixel_block_hdr // Fixed-size block of vec4F pixels stored as m_pixels[y][x].
+	{
+		vec4F m_pixels[cPixelBlockHeight][cPixelBlockWidth]; // [y][x]
+
+		inline const vec4F& operator() (uint32_t x, uint32_t y) const { assert((x < cPixelBlockWidth) && (y < cPixelBlockHeight)); return m_pixels[y][x]; }
+		inline vec4F& operator() (uint32_t x, uint32_t y) { assert((x < cPixelBlockWidth) && (y < cPixelBlockHeight)); return m_pixels[y][x]; }
+
+		inline const vec4F* get_ptr() const { return &m_pixels[0][0]; }
+		inline vec4F* get_ptr() { return &m_pixels[0][0]; }
+
+		inline void clear() { clear_obj(*this); }
+
+		inline bool operator== (const pixel_block_hdr& rhs) const // bytewise compare; rhs must be the same block type so sizeof(m_pixels) bytes are valid on both sides
+		{
+			return memcmp(m_pixels, rhs.m_pixels, sizeof(m_pixels)) == 0;
+		}
+	};
+	typedef basisu::vector<pixel_block_hdr> pixel_block_hdr_vec;
+
+	void tonemap_image_reinhard(image& ldr_img, const imagef& hdr_img, float exposure);
+	bool tonemap_image_compressive(image& dst_img, const imagef& hdr_test_img);
+	
+	// Intersection
+	enum eClear { cClear = 0 };
+	enum eInitExpand { cInitExpand = 0 };
+
+	template<typename vector_type>
+	class ray // An origin point plus a direction vector; vector_type must expose scalar_type and clear().
+	{
+	public:
+		typedef vector_type vector_t;
+		typedef typename vector_type::scalar_type scalar_type;
+
+		inline ray() { } // leaves both members default-constructed
+		inline ray(eClear) { clear(); }
+		inline ray(const vector_type& origin, const vector_type& direction) : m_origin(origin), m_direction(direction) { } // direction is stored as given (not normalized here)
+
+		inline void clear()
+		{
+			m_origin.clear();
+			m_direction.clear();
+		}
+
+		inline const vector_type& get_origin(void) const { return m_origin; }
+		inline void set_origin(const vector_type& origin) { m_origin = origin; }
+
+		inline const vector_type& get_direction(void) const { return m_direction; }
+		inline void set_direction(const vector_type& direction) { m_direction = direction; }
+
+		inline void set_endpoints(const vector_type& start, const vector_type& end) // origin = start, direction = normalized (end - start)
+		{
+			m_origin = start;
+
+			m_direction = end - start;
+			m_direction.normalize_in_place();
+		}
+
+		inline vector_type eval(scalar_type t) const // point at parameter t: origin + direction * t
+		{
+			return m_origin + m_direction * t;
+		}
+
+	private:
+		vector_type m_origin;
+		vector_type m_direction;
+	};
+
+	typedef ray<vec2F> ray2F;
+	typedef ray<vec3F> ray3F;
+
+	template<typename T>
+	class vec_interval // Axis-aligned interval [m_bounds[0], m_bounds[1]] over a vector type T exposing num_elements and scalar_type.
+	{
+	public:
+		enum { N = T::num_elements };
+		typedef typename T::scalar_type scalar_type;
+
+		inline vec_interval(const T& v) { m_bounds[0] = v; m_bounds[1] = v; }
+		inline vec_interval(const T& low, const T& high) { m_bounds[0] = low; m_bounds[1] = high; }
+
+		inline vec_interval() { }
+		inline vec_interval(eClear) { clear(); }
+		inline vec_interval(eInitExpand) { init_expand(); }
+
+		inline void clear() { m_bounds[0].clear(); m_bounds[1].clear(); }
+
+		inline void init_expand() // Sets low to +1e30 and high to -1e30 on every component (all N, not just three) so the first expand() snaps to its point.
+		{
+			for (uint32_t c = 0; c < N; c++) m_bounds[0][c] = static_cast<scalar_type>(1e+30f);
+			for (uint32_t c = 0; c < N; c++) m_bounds[1][c] = static_cast<scalar_type>(-1e+30f);
+		}
+
+		inline vec_interval expand(const T& p) // Grows the bounds to include p; returns a copy of the updated interval.
+		{
+			for (uint32_t c = 0; c < N; c++)
+			{
+				if (p[c] < m_bounds[0][c])
+					m_bounds[0][c] = p[c];
+
+				if (p[c] > m_bounds[1][c])
+					m_bounds[1][c] = p[c];
+			}
+
+			return *this;
+		}
+
+		inline const T& operator[] (uint32_t i) const { assert(i < 2); return m_bounds[i]; }
+		inline       T& operator[] (uint32_t i) { assert(i < 2); return m_bounds[i]; }
+
+		const T& get_low() const { return m_bounds[0]; }
+		T& get_low() { return m_bounds[0]; }
+
+		const T& get_high() const { return m_bounds[1]; }
+		T& get_high() { return m_bounds[1]; }
+
+		scalar_type get_dim(uint32_t axis) const { return m_bounds[1][axis] - m_bounds[0][axis]; }
+
+		bool contains(const T& p) const // True if p lies within [low, high] on every axis (bounds inclusive).
+		{
+			const T& low = get_low(); const T& high = get_high(); // both bound by reference; no copy of the high bound
+
+			for (uint32_t i = 0; i < N; i++)
+			{
+				if (p[i] < low[i])
+					return false;
+
+				if (p[i] > high[i])
+					return false;
+			}
+			return true;
+		}
+
+	private:
+		T m_bounds[2];
+	};
+
+	typedef vec_interval<vec1F> vec_interval1F;
+	typedef vec_interval<vec2F> vec_interval2F;
+	typedef vec_interval<vec3F> vec_interval3F;
+	typedef vec_interval<vec4F> vec_interval4F;
+
+	typedef vec_interval2F aabb2F;
+	typedef vec_interval3F aabb3F;
+
+	namespace intersection
+	{
+		enum result
+		{
+			cBackfacing = -1,
+			cFailure = 0,
+			cSuccess,
+			cParallel,
+			cInside,
+		};
+
+		// Returns cInside, cSuccess, or cFailure.
+		// Algorithm: Graphics Gems 1
+		template<typename vector_type, typename scalar_type, typename ray_type, typename aabb_type>
+		result ray_aabb(vector_type& coord, scalar_type& t, const ray_type& ray, const aabb_type& box) // On cSuccess, coord/t receive the hit point and ray parameter; on cInside, coord = ray origin and t = 0. box[0]/box[1] are the low/high corners.
+		{
+			enum
+			{
+				cNumDim = vector_type::num_elements,
+				cRight = 0,
+				cLeft = 1,
+				cMiddle = 2
+			};
+
+			bool inside = true;
+			int quadrant[cNumDim];
+			scalar_type candidate_plane[cNumDim]; // only written for axes where the origin lies outside the box
+
+			for (int i = 0; i < cNumDim; i++) // classify the origin against the box on each axis
+			{
+				if (ray.get_origin()[i] < box[0][i])
+				{
+					quadrant[i] = cLeft;
+					candidate_plane[i] = box[0][i];
+					inside = false;
+				}
+				else if (ray.get_origin()[i] > box[1][i])
+				{
+					quadrant[i] = cRight;
+					candidate_plane[i] = box[1][i];
+					inside = false;
+				}
+				else
+				{
+					quadrant[i] = cMiddle;
+				}
+			}
+
+			if (inside)
+			{
+				coord = ray.get_origin();
+				t = 0.0f;
+				return cInside;
+			}
+
+			scalar_type max_t[cNumDim]; // ray parameter at which each candidate plane is reached, or -1 when the axis cannot produce a hit
+			for (int i = 0; i < cNumDim; i++)
+			{
+				if ((quadrant[i] != cMiddle) && (ray.get_direction()[i] != 0.0f))
+					max_t[i] = (candidate_plane[i] - ray.get_origin()[i]) / ray.get_direction()[i];
+				else
+					max_t[i] = -1.0f;
+			}
+
+			int which_plane = 0; // the axis with the largest t determines the candidate hit plane
+			for (int i = 1; i < cNumDim; i++)
+				if (max_t[which_plane] < max_t[i])
+					which_plane = i;
+
+			if (max_t[which_plane] < 0.0f) // the candidate plane lies behind the origin (or no axis qualified)
+				return cFailure;
+
+			for (int i = 0; i < cNumDim; i++)
+			{
+				if (i != which_plane)
+				{
+					coord[i] = ray.get_origin()[i] + max_t[which_plane] * ray.get_direction()[i];
+
+					if ((coord[i] < box[0][i]) || (coord[i] > box[1][i])) // hit point on the plane falls outside the box face
+						return cFailure;
+				}
+				else
+				{
+					coord[i] = candidate_plane[i];
+				}
+
+				assert(coord[i] >= box[0][i] && coord[i] <= box[1][i]);
+			}
+
+			t = max_t[which_plane];
+			return cSuccess;
+		}
+
+		template<typename vector_type, typename scalar_type, typename ray_type, typename aabb_type>
+		result ray_aabb(bool& started_within, vector_type& coord, scalar_type& t, const ray_type& ray, const aabb_type& box) // Like the 4-argument ray_aabb(), but when the origin is inside the box it reports the exit point; started_within tells which case applied.
+		{
+			if (!box.contains(ray.get_origin()))
+			{
+				started_within = false;
+				return ray_aabb(coord, t, ray, box);
+			}
+
+			started_within = true;
+			// Step well past the box (1.5x its diagonal, computed from the two corners) and trace back toward the origin to find the exit face.
+			const scalar_type diag_dist = static_cast<scalar_type>((box[1] - box[0]).length() * 1.5f);
+			ray_type outside_ray(ray.eval(diag_dist), -ray.get_direction());
+
+			result res(ray_aabb(coord, t, outside_ray, box));
+			if (res != cSuccess)
+				return res;
+
+			t = basisu::maximum<scalar_type>(static_cast<scalar_type>(0), diag_dist - t); // convert the reversed ray's t back to the original ray's parameter
+			return cSuccess;
+		}
+
+	} // intersect
+
+	// This float->half conversion matches how "F32TO16" works on Intel GPU's.
+	// Input cannot be negative, Inf or Nan.
+	inline basist::half_float float_to_half_non_neg_no_nan_inf(float val)
+	{
+		union { float f; int32_t i; uint32_t u; } fi = { val };
+		const int flt_m = fi.i & 0x7FFFFF, flt_e = (fi.i >> 23) & 0xFF; // 23-bit mantissa field and 8-bit biased exponent field of the float
+		int e = 0, m = 0; // half exponent/mantissa fields being built; both stay 0 for zero or denormal float inputs
+
+		assert(((fi.i >> 31) == 0) && (flt_e != 0xFF)); // sign bit clear and exponent not all-ones (i.e. not Inf/NaN)
+
+		// not zero or denormal
+		if (flt_e != 0)
+		{
+			int new_exp = flt_e - 127; // unbiased exponent
+			if (new_exp > 15)
+				e = 31; // above the largest half exponent: exponent field saturates to 31 with m left at 0
+			else if (new_exp < -14)
+				m = lrintf((1 << 24) * fabsf(fi.f)); // below the smallest normal half exponent: mantissa = value scaled by 2^24, with e left at 0
+			else
+			{
+				e = new_exp + 15;
+				m = lrintf(flt_m * (1.0f / ((float)(1 << 13)))); // round the 23-bit mantissa down to 10 bits
+			}
+		}
+
+		assert((0 <= m) && (m <= 1024));
+		if (m == 1024) // rounding overflowed the 10-bit mantissa: carry into the exponent
+		{
+			e++;
+			m = 0;
+		}
+
+		assert((e >= 0) && (e <= 31));
+		assert((m >= 0) && (m <= 1023));
+
+		basist::half_float result = (basist::half_float)((e << 10) | m);
+		return result;
+	}
+
+	// Supports positive and denormals only. No NaN or Inf.
+	inline float fast_half_to_float_pos_not_inf_or_nan(basist::half_float h)
+	{
+		assert(!basist::half_is_signed(h) && !basist::is_half_inf_or_nan(h));
+
+		union fu32
+		{
+			uint32_t u;
+			float f;
+		};
+
+		static const fu32 K = { 0x77800000 }; // float bit pattern with exponent field 0xEF and zero mantissa, i.e. 2^112; used as a scale factor
+
+		fu32 o;
+		o.u = h << 13; // place the half's exponent/mantissa bits into the float's exponent/mantissa fields
+		o.f *= K.f; // the multiply rescales the exponent to the float bias
+
+		return o.f;
+	}
+				
 } // namespace basisu
 
 
diff --git a/thirdparty/basis_universal/encoder/basisu_etc.cpp b/thirdparty/basis_universal/encoder/basisu_etc.cpp
index f8bd0f12e5f0..ba1c14231d32 100644
--- a/thirdparty/basis_universal/encoder/basisu_etc.cpp
+++ b/thirdparty/basis_universal/encoder/basisu_etc.cpp
@@ -1,5 +1,5 @@
 // basis_etc.cpp
-// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/thirdparty/basis_universal/encoder/basisu_etc.h b/thirdparty/basis_universal/encoder/basisu_etc.h
index 208f2aac1b80..5c44bd481212 100644
--- a/thirdparty/basis_universal/encoder/basisu_etc.h
+++ b/thirdparty/basis_universal/encoder/basisu_etc.h
@@ -1,5 +1,5 @@
 // basis_etc.h
-// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/thirdparty/basis_universal/encoder/basisu_frontend.cpp b/thirdparty/basis_universal/encoder/basisu_frontend.cpp
index 1f30a33c7070..750f706aa538 100644
--- a/thirdparty/basis_universal/encoder/basisu_frontend.cpp
+++ b/thirdparty/basis_universal/encoder/basisu_frontend.cpp
@@ -1,5 +1,5 @@
 // basisu_frontend.cpp
-// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -2347,6 +2347,7 @@ namespace basisu
 						continue;
 
 					uint64_t overall_best_err = 0;
+					(void)overall_best_err;
 
 					uint64_t total_err[4][4][4];
 					clear_obj(total_err);
diff --git a/thirdparty/basis_universal/encoder/basisu_frontend.h b/thirdparty/basis_universal/encoder/basisu_frontend.h
index cda73f398473..69fc8d8ec589 100644
--- a/thirdparty/basis_universal/encoder/basisu_frontend.h
+++ b/thirdparty/basis_universal/encoder/basisu_frontend.h
@@ -1,5 +1,5 @@
 // basisu_frontend.h
-// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp b/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp
index dec769d5acbe..342446b8fd43 100644
--- a/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp
+++ b/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp
@@ -1,5 +1,5 @@
 // basisu_gpu_texture.cpp
-// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -15,13 +15,15 @@
 #include "basisu_gpu_texture.h"
 #include "basisu_enc.h"
 #include "basisu_pvrtc1_4.h"
-#if BASISU_USE_ASTC_DECOMPRESS
-#include "basisu_astc_decomp.h"
-#endif
+#include "3rdparty/android_astc_decomp.h"
 #include "basisu_bc7enc.h"
+#include "../transcoder/basisu_astc_hdr_core.h"
 
 namespace basisu
 {
+	//------------------------------------------------------------------------------------------------
+	// ETC2 EAC
+
 	void unpack_etc2_eac(const void *pBlock_bits, color_rgba *pPixels)
 	{
 		static_assert(sizeof(eac_a8_block) == 8, "sizeof(eac_a8_block) == 8");
@@ -56,6 +58,8 @@ namespace basisu
 		pPixels[15].a = clamp255(base + pTable[pBlock->get_selector(3, 3, selector_bits)] * mul);
 	}
 
+	//------------------------------------------------------------------------------------------------
+	// BC1
 	struct bc1_block
 	{
 		enum { cTotalEndpointBytes = 2, cTotalSelectorBytes = 4 };
@@ -274,6 +278,9 @@ namespace basisu
 		return used_punchthrough;
 	}
 
+	//------------------------------------------------------------------------------------------------
+	// BC3-5
+
 	struct bc4_block
 	{
 		enum { cBC4SelectorBits = 3, cTotalSelectorBytes = 6, cMaxSelectorValues = 8 };
@@ -372,7 +379,8 @@ namespace basisu
 		unpack_bc4(pBlock_bits, &pPixels[0].r, sizeof(color_rgba));
 		unpack_bc4((const uint8_t *)pBlock_bits + sizeof(bc4_block), &pPixels[0].g, sizeof(color_rgba));
 	}
-
+	
+	//------------------------------------------------------------------------------------------------
 	// ATC isn't officially documented, so I'm assuming these references:
 	// http://www.guildsoftware.com/papers/2012.Converting.DXTC.to.ATC.pdf
 	// https://github.com/Triang3l/S3TConv/blob/master/s3tconv_atitc.c
@@ -426,6 +434,7 @@ namespace basisu
 		}
 	}
 
+	//------------------------------------------------------------------------------------------------
 	// BC7 mode 0-7 decompression.
 	// Instead of one monster routine to unpack all the BC7 modes, we're lumping the 3 subset, 2 subset, 1 subset, and dual plane modes together into simple shared routines.
 
@@ -742,6 +751,255 @@ namespace basisu
 		return false;
 	}
 	
+	static inline int bc6h_sign_extend(int val, int bits) // Treats bit (bits - 1) of val as the sign bit and extends it upward.
+	{
+		assert((bits >= 1) && (bits < 32) && (val >= 0) && (val < (1 << bits)));
+		const int shift_amount = 32 - bits; // moves the field's top bit up to bit 31; relies on >> being arithmetic for negative ints
+		return (val << shift_amount) >> shift_amount;
+	}
+
+	static inline int bc6h_apply_delta(int base, int delta, int num_bits, int is_signed)
+	{
+		// Adds delta to base and wraps the sum to num_bits bits; signed formats then reinterpret that field as two's complement.
+		const int wrapped_sum = (base + delta) & ((1 << num_bits) - 1);
+		return is_signed ? bc6h_sign_extend(wrapped_sum, num_bits) : wrapped_sum;
+	}
+
+	static int bc6h_dequantize(int val, int bits, int is_signed)
+	{
+		// Expands a 'bits'-wide quantized endpoint to the 16-bit range used for interpolation:
+		// 0..0xFFFF for unsigned formats, -0x7FFF..0x7FFF for signed ones.
+		// (Values stored at or above the pass-through widths below are returned as-is.)
+
+		if (!is_signed)
+		{
+			// Unsigned endpoints stored with 15 or more bits are passed through unchanged.
+			if (bits >= 15)
+				return val;
+
+			// Zero and the all-ones code map exactly onto the two ends of the range.
+			if (val == 0)
+				return 0;
+			if (val == ((1 << bits) - 1))
+				return 0xFFFF;
+
+			// Everything in between is rescaled.
+			return ((val << 16) + 0x8000) >> bits;
+		}
+
+		// Signed endpoints stored with 16 or more bits are passed through unchanged.
+		if (bits >= 16)
+			return val;
+
+		// Dequantize the magnitude, then put the sign back.
+		const bool negative = (val < 0);
+		const int magnitude = negative ? -val : val;
+
+		int expanded;
+		if (magnitude == 0)
+			expanded = 0;
+		else if (magnitude >= ((1 << (bits - 1)) - 1))
+			expanded = 0x7FFF; // saturate at the top of the range
+		else
+			expanded = ((magnitude << 15) + 0x4000) >> (bits - 1);
+
+		return negative ? -expanded : expanded;
+	}
+
+	static inline int bc6h_interpolate(int a, int b, const uint8_t* pWeights, int index) // blends a and b; pWeights[index] is b's weight in 1/64ths
+	{
+		return ((64 - static_cast<int>(pWeights[index])) * a + static_cast<int>(pWeights[index]) * b + 32) >> 6;
+	}
+
+	static inline basist::half_float bc6h_convert_to_half(int val, int is_signed)
+	{
+		// Final BC6H output step: rescales the interpolated value and returns it as raw half-float bits.
+		if (is_signed)
+		{
+			// Signed: scale the magnitude by 31/32, truncating toward zero.
+			const bool negative = (val < 0);
+			const int magnitude = ((negative ? -val : val) * 31) >> 5;
+
+			// The sign is carried separately in bit 15. A magnitude that scales down
+			// to zero is emitted without the sign bit, i.e. as +0 rather than -0.
+			const int sign_bit = (negative && (magnitude != 0)) ? 0x8000 : 0;
+
+			return (basist::half_float)(sign_bit | magnitude);
+		}
+
+		// Unsigned: scale by 31/64.
+		const int scaled = (val * 31) >> 6;
+		return (basist::half_float)scaled;
+	}
+
+	static inline uint32_t bc6h_get_bits(uint32_t num_bits, uint64_t& l, uint64_t& h, uint32_t& total_bits)
+	{
+		// Pops the low num_bits bits off the 128-bit value h:l (returning at most the low 32 of them) and tallies them in total_bits.
+		assert((num_bits) && (num_bits <= 63));
+		// Build the mask in 64 bits: a 32-bit (1U << num_bits) is undefined once num_bits reaches 32.
+		const uint32_t v = (uint32_t)(l & ((1ULL << num_bits) - 1ULL));
+		// Shift the 128-bit value right, carrying the low bits of h into the top of l.
+		l >>= num_bits;
+		l |= (h << (64U - num_bits));
+		h >>= num_bits;
+
+		total_bits += num_bits;
+		assert(total_bits <= 128);
+		return v;
+	}
+
+	static inline uint32_t bc6h_reverse_bits(uint32_t v, uint32_t num_bits)
+	{
+		// Mirrors the low num_bits bits of v: bit 0 trades places with bit (num_bits - 1), and so on.
+		// Bits of v at or above num_bits do not contribute to the result.
+		uint32_t reversed = 0;
+		for (uint32_t remaining = num_bits; remaining != 0; remaining--, v >>= 1)
+			reversed = (reversed << 1) | (v & 1u);
+
+		return reversed;
+	}
+
+	static inline uint64_t bc6h_read_le_qword(const void* p)
+	{
+		const uint8_t* pBytes = static_cast<const uint8_t*>(p); // the low dword is read from offset 0, the high dword right after it
+		return (uint64_t)read_le_dword(pBytes) | ((uint64_t)read_le_dword(pBytes + sizeof(uint32_t)) << 32U);
+	}
+
+	bool unpack_bc6h(const void* pSrc_block, void* pDst_block, bool is_signed, uint32_t dest_pitch_in_halfs)
+	{
+		// Decodes one 16-byte BC6H block to 4 rows of 4 RGB texels (3 halfs each), rows dest_pitch_in_halfs apart. Returns false, with the block zeroed, if the mode bits are invalid.
+		assert(dest_pitch_in_halfs >= 4 * 3);
+		const uint32_t MAX_SUBSETS = 2, MAX_COMPS = 3;
+
+		const uint8_t* pSrc = static_cast<const uint8_t*>(pSrc_block);
+		basist::half_float* pDst = static_cast<basist::half_float*>(pDst_block);
+
+		uint64_t blo = bc6h_read_le_qword(pSrc), bhi = bc6h_read_le_qword(pSrc + sizeof(uint64_t));
+
+		// Unpack mode
+		const int mode = basist::g_bc6h_mode_lookup[blo & 31];
+		if (mode < 0)
+		{
+			for (int y = 0; y < 4; y++)
+			{
+				memset(pDst, 0, sizeof(basist::half_float) * 4 * 3); // a row is 4 texels x 3 halfs; clearing only 4 halfs left most of the block uninitialized
+				pDst += dest_pitch_in_halfs;
+			}
+			return false;
+		}
+
+		// Skip mode bits
+		uint32_t total_bits_read = 0;
+		bc6h_get_bits((mode < 2) ? 2 : 5, blo, bhi, total_bits_read);
+
+		assert(mode < (int)basist::NUM_BC6H_MODES);
+
+		const uint32_t num_subsets = (mode >= 10) ? 1 : 2;
+		const bool is_mode_9_or_10 = (mode == 9) || (mode == 10);
+
+		// Unpack endpoint components
+		int comps[MAX_SUBSETS][MAX_COMPS][2] = { { { 0 } } };		// [subset][comp][l/h]
+		int part_index = 0;
+
+		uint32_t layout_index = 0;
+		while (layout_index < basist::MAX_BC6H_LAYOUT_INDEX)
+		{
+			const basist::bc6h_bit_layout& layout = basist::g_bc6h_bit_layouts[mode][layout_index];
+
+			if (layout.m_comp < 0)
+				break;
+
+			const int subset = layout.m_index >> 1, lh_index = layout.m_index & 1;
+			assert((layout.m_comp == 3) || ((subset >= 0) && (subset < (int)MAX_SUBSETS)));
+
+			const int last_bit = layout.m_last_bit, first_bit = layout.m_first_bit;
+			assert(last_bit >= 0);
+
+			int& res = (layout.m_comp == 3) ? part_index : comps[subset][layout.m_comp][lh_index];
+
+			if (first_bit < 0)
+			{
+				res |= (bc6h_get_bits(1, blo, bhi, total_bits_read) << last_bit);
+			}
+			else
+			{
+				const int total_bits = iabs(last_bit - first_bit) + 1;
+				const int bit_shift = basisu::minimum(first_bit, last_bit);
+
+				int b = bc6h_get_bits(total_bits, blo, bhi, total_bits_read);
+
+				if (last_bit < first_bit)
+					b = bc6h_reverse_bits(b, total_bits);
+
+				res |= (b << bit_shift);
+			}
+
+			layout_index++;
+		}
+		assert(layout_index != basist::MAX_BC6H_LAYOUT_INDEX);
+
+		// Sign extend/dequantize endpoints
+		const int num_sig_bits = basist::g_bc6h_mode_sig_bits[mode][0];
+		if (is_signed)
+		{
+			for (uint32_t comp = 0; comp < 3; comp++)
+				comps[0][comp][0] = bc6h_sign_extend(comps[0][comp][0], num_sig_bits);
+		}
+
+		if (is_signed || !is_mode_9_or_10)
+		{
+			for (uint32_t subset = 0; subset < num_subsets; subset++)
+				for (uint32_t comp = 0; comp < 3; comp++)
+					for (uint32_t lh = (subset ? 0 : 1); lh < 2; lh++)
+						comps[subset][comp][lh] = bc6h_sign_extend(comps[subset][comp][lh], basist::g_bc6h_mode_sig_bits[mode][1 + comp]);
+		}
+
+		if (!is_mode_9_or_10)
+		{
+			for (uint32_t subset = 0; subset < num_subsets; subset++)
+				for (uint32_t comp = 0; comp < 3; comp++)
+					for (uint32_t lh = (subset ? 0 : 1); lh < 2; lh++)
+						comps[subset][comp][lh] = bc6h_apply_delta(comps[0][comp][0], comps[subset][comp][lh], num_sig_bits, is_signed);
+		}
+
+		for (uint32_t subset = 0; subset < num_subsets; subset++)
+			for (uint32_t comp = 0; comp < 3; comp++)
+				for (uint32_t lh = 0; lh < 2; lh++)
+					comps[subset][comp][lh] = bc6h_dequantize(comps[subset][comp][lh], num_sig_bits, is_signed);
+
+		// Now unpack weights and output texels
+		const int weight_bits = (mode >= 10) ? 4 : 3;
+		const uint8_t* pWeights = (mode >= 10) ? basist::g_bc6h_weight4 : basist::g_bc6h_weight3;
+
+		dest_pitch_in_halfs -= 4 * 3; // the texel loop already advances pDst by 12 halfs per row; keep only the remaining gap
+
+		for (uint32_t y = 0; y < 4; y++)
+		{
+			for (uint32_t x = 0; x < 4; x++)
+			{
+				int subset = (num_subsets == 1) ? ((x | y) ? 0 : 0x80) : basist::g_bc6h_2subset_patterns[part_index][y][x];
+				const int num_bits = weight_bits + ((subset & 0x80) ? -1 : 0); // bit 7 flags a texel whose weight index is stored with one bit fewer
+
+				subset &= 1;
+
+				const int weight_index = bc6h_get_bits(num_bits, blo, bhi, total_bits_read);
+
+				pDst[0] = bc6h_convert_to_half(bc6h_interpolate(comps[subset][0][0], comps[subset][0][1], pWeights, weight_index), is_signed);
+				pDst[1] = bc6h_convert_to_half(bc6h_interpolate(comps[subset][1][0], comps[subset][1][1], pWeights, weight_index), is_signed);
+				pDst[2] = bc6h_convert_to_half(bc6h_interpolate(comps[subset][2][0], comps[subset][2][1], pWeights, weight_index), is_signed);
+
+				pDst += 3;
+			}
+
+			pDst += dest_pitch_in_halfs;
+		}
+
+		assert(total_bits_read == 128);
+		return true;
+	}
+	//------------------------------------------------------------------------------------------------
+	// FXT1 (for fun, and because some modern Intel parts support it, and because a subset is like BC1)
+
 	struct fxt1_block
 	{
 		union
@@ -901,6 +1159,9 @@ namespace basisu
 		return true;
 	}
 
+	//------------------------------------------------------------------------------------------------
+	// PVRTC2 (non-interpolated, hard_flag=1 modulation=0 subset only!)
+
 	struct pvrtc2_block
 	{
 		uint8_t m_modulation[4];
@@ -1015,6 +1276,9 @@ namespace basisu
 		return true;
 	}
 
+	//------------------------------------------------------------------------------------------------
+	// ETC2 EAC R11 or RG11
+
 	struct etc2_eac_r11
 	{
 		uint64_t m_base	: 8;
@@ -1085,13 +1349,16 @@ namespace basisu
 			unpack_etc2_eac_r(pBlock, pPixels, c);
 		}
 	}
-	
+
+	//------------------------------------------------------------------------------------------------
+	// UASTC
+
 	void unpack_uastc(const void* p, color_rgba* pPixels)
 	{
 		basist::unpack_uastc(*static_cast<const basist::uastc_block*>(p), (basist::color32 *)pPixels, false);
 	}
-	
-	// Unpacks to RGBA, R, RG, or A
+			
+	// Unpacks to RGBA, R, RG, or A. LDR GPU texture formats only.
 	bool unpack_block(texture_format fmt, const void* pBlock, color_rgba* pPixels)
 	{
 		switch (fmt)
@@ -1150,14 +1417,24 @@ namespace basisu
 			unpack_etc2_eac(pBlock, pPixels);
 			break;
 		}
-		case texture_format::cASTC4x4:
+		case texture_format::cBC6HSigned:
+		case texture_format::cBC6HUnsigned:
+		case texture_format::cASTC_HDR_4x4:
+		case texture_format::cUASTC_HDR_4x4:
+		{
+			// Can't unpack HDR blocks in unpack_block() because it returns 32bpp pixel data.
+			assert(0);
+			return false;
+		}
+		case texture_format::cASTC_LDR_4x4:
 		{
-#if BASISU_USE_ASTC_DECOMPRESS
 			const bool astc_srgb = false;
-			basisu_astc::astc::decompress(reinterpret_cast<uint8_t*>(pPixels), static_cast<const uint8_t*>(pBlock), astc_srgb, 4, 4);
-#else
-			memset(pPixels, 255, 16 * sizeof(color_rgba));
-#endif
+			bool status = basisu_astc::astc::decompress_ldr(reinterpret_cast<uint8_t*>(pPixels), static_cast<const uint8_t*>(pBlock), astc_srgb, 4, 4);
+			assert(status);
+
+			if (!status)
+				return false;
+			
 			break;
 		}
 		case texture_format::cATC_RGB:
@@ -1206,6 +1483,66 @@ namespace basisu
 		return true;
 	}
 
+	bool unpack_block_hdr(texture_format fmt, const void* pBlock, vec4F* pPixels)
+	{
+		// Decodes one 4x4 HDR block (ASTC HDR, UASTC HDR or BC6H) into 16 float RGBA texels. Returns false if the format is not HDR or the block cannot be decoded.
+		switch (fmt)
+		{
+			case texture_format::cASTC_HDR_4x4:
+			case texture_format::cUASTC_HDR_4x4:
+			{
+#if 1
+				bool status = basisu_astc::astc::decompress_hdr(&pPixels[0][0], (uint8_t*)pBlock, 4, 4);
+				assert(status);
+				if (!status)
+					return false;
+#else
+				basist::half_float half_block[16][4];
+				
+				astc_helpers::log_astc_block log_blk;
+				if (!astc_helpers::unpack_block(pBlock, log_blk, 4, 4))
+					return false;
+				if (!astc_helpers::decode_block(log_blk, half_block, 4, 4, astc_helpers::cDecodeModeHDR16))
+					return false;
+
+				for (uint32_t p = 0; p < 16; p++)
+				{
+					pPixels[p][0] = basist::half_to_float(half_block[p][0]);
+					pPixels[p][1] = basist::half_to_float(half_block[p][1]);
+					pPixels[p][2] = basist::half_to_float(half_block[p][2]);
+					pPixels[p][3] = basist::half_to_float(half_block[p][3]);
+				}
+				//memset(pPixels, 0, sizeof(vec4F) * 16);
+#endif
+				return true;
+			}
+			case texture_format::cBC6HSigned:
+			case texture_format::cBC6HUnsigned:
+			{
+				// Zero-filled so the texels converted below are well defined even when the block fails to decode.
+				basist::half_float half_block[16][3] = {};
+				const bool status = unpack_bc6h(pBlock, half_block, fmt == texture_format::cBC6HSigned);
+
+				for (uint32_t p = 0; p < 16; p++)
+				{
+					pPixels[p][0] = basist::half_to_float(half_block[p][0]);
+					pPixels[p][1] = basist::half_to_float(half_block[p][1]);
+					pPixels[p][2] = basist::half_to_float(half_block[p][2]);
+					pPixels[p][3] = 1.0f;
+				}
+
+				return status; // false when unpack_bc6h rejected the block, matching the ASTC path above
+			}
+			default:
+			{
+				break;
+			}
+		}
+
+		assert(0);
+		return false;
+	}
+		
 	bool gpu_image::unpack(image& img) const
 	{
 		img.resize(get_pixel_width(), get_pixel_height());
@@ -1252,7 +1589,48 @@ namespace basisu
 
 		return success;
 	}
+
+	bool gpu_image::unpack_hdr(imagef& img) const
+	{
+		// Decodes every block of this HDR texture into img, which is resized to the texture's pixel dimensions.
+		// Returns false when the format is not HDR or when any block fails to decode (decoding still continues).
+		const bool fmt_is_hdr = (m_fmt == texture_format::cASTC_HDR_4x4) || (m_fmt == texture_format::cUASTC_HDR_4x4) ||
+			(m_fmt == texture_format::cBC6HUnsigned) || (m_fmt == texture_format::cBC6HSigned);
+		if (!fmt_is_hdr)
+		{
+			// LDR images are not handled here, at least currently. (The LDR data could be unpacked and converted to float.)
+			assert(0);
+			return false;
+		}
+
+		img.resize(get_pixel_width(), get_pixel_height());
+		img.set_all(vec4F(0.0f));
+
+		// Nothing to decode for an empty image.
+		if ((img.get_width() == 0) || (img.get_height() == 0))
+			return true;
+
+		assert((m_block_width <= cMaxBlockSize) && (m_block_height <= cMaxBlockSize));
+		// Scratch buffer that receives one decoded block at a time.
+		vec4F block_texels[cMaxBlockSize * cMaxBlockSize];
+		clear_obj(block_texels);
+
+		bool all_ok = true;
+		for (uint32_t block_y = 0; block_y < m_blocks_y; block_y++)
+		{
+			for (uint32_t block_x = 0; block_x < m_blocks_x; block_x++)
+			{
+				// A failed block is recorded, but its texels are still copied so the rest of the image is filled in.
+				all_ok = unpack_block_hdr(m_fmt, get_block_ptr(block_x, block_y), block_texels) && all_ok;
+
+				img.set_block_clipped(block_texels, block_x * m_block_width, block_y * m_block_height, m_block_width, m_block_height);
+			} // block_x
+		} // block_y
+
+		return all_ok;
+	}
 		
+	// KTX1 texture file writing
 	static const uint8_t g_ktx_file_id[12] = { 0xAB, 0x4B, 0x54, 0x58, 0x20, 0x31, 0x31, 0xBB, 0x0D, 0x0A, 0x1A, 0x0A };
 
 	// KTX/GL enums
@@ -1273,6 +1651,8 @@ namespace basisu
 		KTX_COMPRESSED_RGBA8_ETC2_EAC = 0x9278,
 		KTX_COMPRESSED_RGBA_BPTC_UNORM = 0x8E8C,
 		KTX_COMPRESSED_SRGB_ALPHA_BPTC_UNORM = 0x8E8D,
+		KTX_COMPRESSED_RGB_BPTC_SIGNED_FLOAT = 0x8E8E,
+		KTX_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT = 0x8E8F,
 		KTX_COMPRESSED_RGB_PVRTC_4BPPV1_IMG = 0x8C00,
 		KTX_COMPRESSED_RGBA_PVRTC_4BPPV1_IMG = 0x8C02,
 		KTX_COMPRESSED_RGBA_ASTC_4x4_KHR = 0x93B0,
@@ -1319,6 +1699,7 @@ namespace basisu
 		uint32_t width = 0, height = 0, total_levels = 0;
 		basisu::texture_format fmt = texture_format::cInvalidTextureFormat;
 
+		// Sanity check the input
 		if (cubemap_flag)
 		{
 			if ((gpu_images.size() % 6) != 0)
@@ -1327,7 +1708,7 @@ namespace basisu
 				return false;
 			}
 		}
-
+				
 		for (uint32_t array_index = 0; array_index < gpu_images.size(); array_index++)
 		{
 			const gpu_image_vec &levels = gpu_images[array_index];
@@ -1426,6 +1807,18 @@ namespace basisu
 			base_internal_fmt = KTX_RGBA;
 			break;
 		}
+		case texture_format::cBC6HSigned:
+		{
+			internal_fmt = KTX_COMPRESSED_RGB_BPTC_SIGNED_FLOAT;
+			base_internal_fmt = KTX_RGBA;
+			break;
+		}
+		case texture_format::cBC6HUnsigned:
+		{
+			internal_fmt = KTX_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT;
+			base_internal_fmt = KTX_RGBA;
+			break;
+		}
 		case texture_format::cBC7:
 		{
 			internal_fmt = KTX_COMPRESSED_RGBA_BPTC_UNORM;
@@ -1443,7 +1836,10 @@ namespace basisu
 			base_internal_fmt = KTX_RGBA;
 			break;
 		}
-		case texture_format::cASTC4x4:
+		// We use different enums for HDR vs. LDR ASTC, but internally they are both just ASTC.
+		case texture_format::cASTC_LDR_4x4:
+		case texture_format::cASTC_HDR_4x4:
+		case texture_format::cUASTC_HDR_4x4: // UASTC_HDR is just HDR-only ASTC
 		{
 			internal_fmt = KTX_COMPRESSED_RGBA_ASTC_4x4_KHR;
 			base_internal_fmt = KTX_RGBA;
@@ -1496,17 +1892,17 @@ namespace basisu
 			return false;
 		}
 		}
-		
+
 		ktx_header header;
 		header.clear();
 		memcpy(&header.m_identifier, g_ktx_file_id, sizeof(g_ktx_file_id));
 		header.m_endianness = KTX_ENDIAN;
-		
+
 		header.m_pixelWidth = width;
 		header.m_pixelHeight = height;
-				
+
 		header.m_glTypeSize = 1;
-		
+
 		header.m_glInternalFormat = internal_fmt;
 		header.m_glBaseInternalFormat = base_internal_fmt;
 
@@ -1517,12 +1913,12 @@ namespace basisu
 		header.m_numberOfMipmapLevels = total_levels;
 		header.m_numberOfFaces = cubemap_flag ? 6 : 1;
 
-		append_vector(ktx_data, (uint8_t *)&header, sizeof(header));
+		append_vector(ktx_data, (uint8_t*)&header, sizeof(header));
 
 		for (uint32_t level_index = 0; level_index < total_levels; level_index++)
 		{
 			uint32_t img_size = gpu_images[0][level_index].get_size_in_bytes();
-			
+
 			if ((header.m_numberOfFaces == 1) || (header.m_numberOfArrayElements > 1))
 			{
 				img_size = img_size * header.m_numberOfFaces * maximum<uint32_t>(1, header.m_numberOfArrayElements);
@@ -1531,9 +1927,10 @@ namespace basisu
 			assert(img_size && ((img_size & 3) == 0));
 
 			packed_uint<4> packed_img_size(img_size);
-			append_vector(ktx_data, (uint8_t *)&packed_img_size, sizeof(packed_img_size));
+			append_vector(ktx_data, (uint8_t*)&packed_img_size, sizeof(packed_img_size));
 
 			uint32_t bytes_written = 0;
+			(void)bytes_written;
 
 			for (uint32_t array_index = 0; array_index < maximum<uint32_t>(1, header.m_numberOfArrayElements); array_index++)
 			{
@@ -1541,11 +1938,11 @@ namespace basisu
 				{
 					const gpu_image& img = gpu_images[cubemap_flag ? (array_index * 6 + face_index) : array_index][level_index];
 
-					append_vector(ktx_data, (uint8_t *)img.get_ptr(), img.get_size_in_bytes());
-					
+					append_vector(ktx_data, (uint8_t*)img.get_ptr(), img.get_size_in_bytes());
+
 					bytes_written += img.get_size_in_bytes();
 				}
-			
+
 			} // array_index
 
 		} // level_index
@@ -1553,7 +1950,58 @@ namespace basisu
 		return true;
 	}
 
-	bool write_compressed_texture_file(const char* pFilename, const basisu::vector<gpu_image_vec>& g, bool cubemap_flag)
+	bool does_dds_support_format(texture_format fmt)
+	{
+		// True only for the BC1-BC7 block formats enumerated here; every other format reports false.
+		switch (fmt)
+		{
+		case texture_format::cBC1_NV:
+		case texture_format::cBC1_AMD:
+		case texture_format::cBC1:
+		case texture_format::cBC3:
+		case texture_format::cBC4:
+		case texture_format::cBC5:
+		case texture_format::cBC6HSigned:
+		case texture_format::cBC6HUnsigned:
+		case texture_format::cBC7:
+			return true;
+		default:
+			return false;
+		}
+	}
+
+	// Only supports the basic DirectX BC texture formats.
+	// gpu_images array is: [face/layer][mipmap level]
+	// For cubemap arrays, # of face/layers must be a multiple of 6.
+	// Accepts 2D, 2D mipmapped, 2D array, 2D array mipmapped
+	// and cubemap, cubemap mipmapped, and cubemap array mipmapped.
+	bool write_dds_file(uint8_vec &dds_data, const basisu::vector<gpu_image_vec>& gpu_images, bool cubemap_flag, bool use_srgb_format)
+	{		
+		return false; // Stub: no DDS serialization happens here, so every call fails and dds_data is left untouched.
+	}
+
+	bool write_dds_file(const char* pFilename, const basisu::vector<gpu_image_vec>& gpu_images, bool cubemap_flag, bool use_srgb_format)
+	{
+		// Serializes the texture to an in-memory DDS image first, then writes that buffer to pFilename.
+		// Returns false if either step fails.
+		uint8_vec file_bytes;
+		if (!write_dds_file(file_bytes, gpu_images, cubemap_flag, use_srgb_format))
+			return false;
+
+		// Only a failed file write is reported on stderr; a failed serialization returns silently above.
+		const bool written = write_vec_to_file(pFilename, file_bytes);
+		if (!written)
+			fprintf(stderr, "write_dds_file: Failed writing DDS file data\n");
+
+		return written;
+	}
+		
+	bool read_uncompressed_dds_file(const char* pFilename, basisu::vector<image> &ldr_mips,	basisu::vector<imagef>& hdr_mips)
+	{
+		return false; // Stub: nothing is read here, so every call fails and both mip vectors are left untouched.
+	}
+
+	bool write_compressed_texture_file(const char* pFilename, const basisu::vector<gpu_image_vec>& g, bool cubemap_flag, bool use_srgb_format)
 	{
 		std::string extension(string_tolower(string_get_extension(pFilename)));
 
@@ -1570,8 +2018,8 @@ namespace basisu
 		}
 		else if (extension == "dds")
 		{
-			// TODO
-			return false;
+			if (!write_dds_file(filedata, g, cubemap_flag, use_srgb_format))
+				return false;
 		}
 		else
 		{
@@ -1583,11 +2031,18 @@ namespace basisu
 		return basisu::write_vec_to_file(pFilename, filedata);
 	}
 
-	bool write_compressed_texture_file(const char* pFilename, const gpu_image& g)
+	bool write_compressed_texture_file(const char* pFilename, const gpu_image_vec& g, bool use_srgb_format)
+	{
+		basisu::vector<gpu_image_vec> single_layer; // g (one mip chain) becomes the only face/layer
+		single_layer.push_back(g);
+		return write_compressed_texture_file(pFilename, single_layer, /* cubemap_flag */ false, use_srgb_format);
+	}
+
+	bool write_compressed_texture_file(const char* pFilename, const gpu_image& g, bool use_srgb_format)
 	{
 		basisu::vector<gpu_image_vec> v;
 		enlarge_vector(v, 1)->push_back(g);
-		return write_compressed_texture_file(pFilename, v, false);
+		return write_compressed_texture_file(pFilename, v, false, use_srgb_format);
 	}
 
 	//const uint32_t OUT_FILE_MAGIC = 'TEXC';
@@ -1626,5 +2081,49 @@ namespace basisu
 		
 		return fclose(pFile) != EOF;
 	}
+
+	// The .astc texture format is readable using ARM's astcenc, AMD Compressonator, and other engines/tools. It oddly doesn't support mipmaps, limiting 
+	// its usefulness/relevance.
+	// https://github.com/ARM-software/astc-encoder/blob/main/Docs/FileFormat.md
+	bool write_astc_file(const char* pFilename, const void* pBlocks, uint32_t block_width, uint32_t block_height, uint32_t dim_x, uint32_t dim_y)
+	{
+		// Writes a 2D .astc file: a 16-byte header followed by the raw 16-byte blocks in pBlocks. Returns false on bad arguments or a failed write.
+		assert(pBlocks && (block_width >= 4) && (block_height >= 4) && (dim_x > 0) && (dim_y > 0));
+		// Enforce the same preconditions in release builds, and reject values that do not fit the header's
+		// 8-bit block dimension and 24-bit image dimension fields instead of silently truncating them.
+		if (!pBlocks || (block_width < 4) || (block_height < 4) || (block_width > 0xFF) || (block_height > 0xFF) ||
+			!dim_x || !dim_y || (dim_x > 0xFFFFFF) || (dim_y > 0xFFFFFF))
+			return false;
+
+		uint8_vec file_data;
+		file_data.push_back(0x13); // magic number, 4 bytes
+		file_data.push_back(0xAB);
+		file_data.push_back(0xA1);
+		file_data.push_back(0x5C);
+
+		file_data.push_back((uint8_t)block_width);
+		file_data.push_back((uint8_t)block_height);
+		file_data.push_back(1); // block depth
+		file_data.push_back((uint8_t)dim_x); // image width, 24 bits, low byte first
+		file_data.push_back((uint8_t)(dim_x >> 8));
+		file_data.push_back((uint8_t)(dim_x >> 16));
+		file_data.push_back((uint8_t)dim_y); // image height, 24 bits, low byte first
+		file_data.push_back((uint8_t)(dim_y >> 8));
+		file_data.push_back((uint8_t)(dim_y >> 16));
+		file_data.push_back((uint8_t)1); // image depth, 24 bits, low byte first
+		file_data.push_back((uint8_t)0);
+		file_data.push_back((uint8_t)0);
+
+		const uint32_t num_blocks_x = (dim_x + block_width - 1) / block_width;
+		const uint32_t num_blocks_y = (dim_y + block_height - 1) / block_height;
+		const size_t total_bytes = (size_t)num_blocks_x * num_blocks_y * 16; // widened before multiplying so the product is not computed in 32 bits
+
+		const size_t cur_size = file_data.size();
+		file_data.resize(cur_size + total_bytes);
+		memcpy(&file_data[cur_size], pBlocks, total_bytes);
+
+		return write_vec_to_file(pFilename, file_data);
+	}
+		
 } // basisu
 
diff --git a/thirdparty/basis_universal/encoder/basisu_gpu_texture.h b/thirdparty/basis_universal/encoder/basisu_gpu_texture.h
index 619926f5f95f..67c2a2bc5ec4 100644
--- a/thirdparty/basis_universal/encoder/basisu_gpu_texture.h
+++ b/thirdparty/basis_universal/encoder/basisu_gpu_texture.h
@@ -1,5 +1,5 @@
 // basisu_gpu_texture.h
-// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -48,6 +48,7 @@ namespace basisu
 		}
 
 		inline texture_format get_format() const { return m_fmt; }
+		inline bool is_hdr() const { return is_hdr_texture_format(m_fmt); }
 		
 		// Width/height in pixels
 		inline uint32_t get_pixel_width() const { return m_width; }
@@ -100,9 +101,13 @@ namespace basisu
 			m_blocks.resize(m_blocks_x * m_blocks_y * m_qwords_per_block);
 		}
 
+		// Unpacks LDR textures only.
 		bool unpack(image& img) const;
+
+		// Unpacks HDR textures only.
+		bool unpack_hdr(imagef& img) const;
 		
-		void override_dimensions(uint32_t w, uint32_t h)
+		inline void override_dimensions(uint32_t w, uint32_t h)
 		{
 			m_width = w;
 			m_height = h;
@@ -116,39 +121,50 @@ namespace basisu
 
 	typedef basisu::vector<gpu_image> gpu_image_vec;
 
-	// KTX file writing
-
+	// KTX1 file writing
 	bool create_ktx_texture_file(uint8_vec &ktx_data, const basisu::vector<gpu_image_vec>& gpu_images, bool cubemap_flag);
-		
-	bool write_compressed_texture_file(const char *pFilename, const basisu::vector<gpu_image_vec>& g, bool cubemap_flag);
 	
-	inline bool write_compressed_texture_file(const char *pFilename, const gpu_image_vec &g)
-	{
-		basisu::vector<gpu_image_vec> a;
-		a.push_back(g);
-		return write_compressed_texture_file(pFilename, a, false);
-	}
+	bool does_dds_support_format(texture_format fmt);
+	bool write_dds_file(uint8_vec& dds_data, const basisu::vector<gpu_image_vec>& gpu_images, bool cubemap_flag, bool use_srgb_format);
+	bool write_dds_file(const char* pFilename, const basisu::vector<gpu_image_vec>& gpu_images, bool cubemap_flag, bool use_srgb_format);
+
+	// Currently reads 2D 32bpp RGBA, 16-bit HALF RGBA, or 32-bit FLOAT RGBA, with or without mipmaps. No tex arrays or cubemaps, yet.
+	bool read_uncompressed_dds_file(const char* pFilename, basisu::vector<image>& ldr_mips, basisu::vector<imagef>& hdr_mips);
 
-	bool write_compressed_texture_file(const char *pFilename, const gpu_image &g);
+	// Supports DDS and KTX
+	bool write_compressed_texture_file(const char *pFilename, const basisu::vector<gpu_image_vec>& g, bool cubemap_flag, bool use_srgb_format);
+	bool write_compressed_texture_file(const char* pFilename, const gpu_image_vec& g, bool use_srgb_format);
+	bool write_compressed_texture_file(const char *pFilename, const gpu_image &g, bool use_srgb_format);
 	
 	bool write_3dfx_out_file(const char* pFilename, const gpu_image& gi);
 
 	// GPU texture block unpacking
+	// For ETC1, use in basisu_etc.h: bool unpack_etc1(const etc_block& block, color_rgba *pDst, bool preserve_alpha)
 	void unpack_etc2_eac(const void *pBlock_bits, color_rgba *pPixels);
 	bool unpack_bc1(const void *pBlock_bits, color_rgba *pPixels, bool set_alpha);
 	void unpack_bc4(const void *pBlock_bits, uint8_t *pPixels, uint32_t stride);
 	bool unpack_bc3(const void *pBlock_bits, color_rgba *pPixels);
 	void unpack_bc5(const void *pBlock_bits, color_rgba *pPixels);
 	bool unpack_bc7_mode6(const void *pBlock_bits, color_rgba *pPixels);
-	bool unpack_bc7(const void* pBlock_bits, color_rgba* pPixels);
+	bool unpack_bc7(const void* pBlock_bits, color_rgba* pPixels); // full format
+	bool unpack_bc6h(const void* pSrc_block, void* pDst_block, bool is_signed, uint32_t dest_pitch_in_halfs = 4 * 3); // full format, outputs HALF values, RGB texels only (not RGBA)
 	void unpack_atc(const void* pBlock_bits, color_rgba* pPixels);
+	// We only support CC_MIXED non-alpha blocks here because that's the only mode the transcoder uses at the moment.
 	bool unpack_fxt1(const void* p, color_rgba* pPixels);
+	// PVRTC2 is currently limited to only what our transcoder outputs (non-interpolated, hard_flag=1 modulation=0). In this mode, PVRTC2 looks much like BC1/ATC.
 	bool unpack_pvrtc2(const void* p, color_rgba* pPixels);
 	void unpack_etc2_eac_r(const void *p, color_rgba* pPixels, uint32_t c);
 	void unpack_etc2_eac_rg(const void* p, color_rgba* pPixels);
-
+	
 	// unpack_block() is primarily intended to unpack texture data created by the transcoder.
-	// For some texture formats (like ETC2 RGB, PVRTC2, FXT1) it's not a complete implementation.
+	// For some texture formats (like ETC2 RGB, PVRTC2, FXT1) it's not yet a complete implementation.
+	// Unpacks LDR texture formats only.
 	bool unpack_block(texture_format fmt, const void *pBlock, color_rgba *pPixels);
-			
+
+	// Unpacks HDR texture formats only.
+	bool unpack_block_hdr(texture_format fmt, const void* pBlock, vec4F* pPixels);
+	
+	bool write_astc_file(const char* pFilename, const void* pBlocks, uint32_t block_width, uint32_t block_height, uint32_t dim_x, uint32_t dim_y);
+							
 } // namespace basisu
+
diff --git a/thirdparty/basis_universal/encoder/basisu_kernels_declares.h b/thirdparty/basis_universal/encoder/basisu_kernels_declares.h
index b03e2ea6e85c..9b85a594ee8c 100644
--- a/thirdparty/basis_universal/encoder/basisu_kernels_declares.h
+++ b/thirdparty/basis_universal/encoder/basisu_kernels_declares.h
@@ -1,5 +1,5 @@
 // basisu_kernels_declares.h
-// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/thirdparty/basis_universal/encoder/basisu_kernels_imp.h b/thirdparty/basis_universal/encoder/basisu_kernels_imp.h
index dcf1ce069a6f..123862b1ddc4 100644
--- a/thirdparty/basis_universal/encoder/basisu_kernels_imp.h
+++ b/thirdparty/basis_universal/encoder/basisu_kernels_imp.h
@@ -1,5 +1,5 @@
 // basisu_kernels_imp.h - Do not directly include
-// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/thirdparty/basis_universal/encoder/basisu_kernels_sse.cpp b/thirdparty/basis_universal/encoder/basisu_kernels_sse.cpp
index 4f15a5a12b02..36a493d7ed8b 100644
--- a/thirdparty/basis_universal/encoder/basisu_kernels_sse.cpp
+++ b/thirdparty/basis_universal/encoder/basisu_kernels_sse.cpp
@@ -1,5 +1,5 @@
 // basisu_kernels_sse.cpp
-// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -22,22 +22,6 @@
 #include <intrin.h>
 #endif
 
-#if !defined(_MSC_VER)
-	#if __AVX__ || __AVX2__ || __AVX512F__
-		#error Please check your compiler options
-	#endif
-	
-	#if CPPSPMD_SSE2
-		#if __SSE4_1__ || __SSE3__ || __SSE4_2__ || __SSSE3__
-			#error SSE4.1/SSE3/SSE4.2/SSSE3 cannot be enabled to use this file
-		#endif
-	#else
-		#if !__SSE4_1__ || !__SSE3__ || !__SSSE3__
-			#error Please check your compiler options
-		#endif
-	#endif
-#endif
-
 #include "cppspmd_sse.h"
 
 #include "cppspmd_type_aliases.h"
diff --git a/thirdparty/basis_universal/encoder/basisu_miniz.h b/thirdparty/basis_universal/encoder/basisu_miniz.h
index 18de9972322f..dab38f9f9248 100644
--- a/thirdparty/basis_universal/encoder/basisu_miniz.h
+++ b/thirdparty/basis_universal/encoder/basisu_miniz.h
@@ -3,7 +3,7 @@
   
    Forked from the public domain/unlicense version at: https://code.google.com/archive/p/miniz/ 
    
-   Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+   Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
@@ -1973,7 +1973,7 @@ static MZ_FORCEINLINE void tdefl_find_match(tdefl_compressor *d, mz_uint lookahe
                    (TDEFL_READ_UNALIGNED_WORD(++p) == TDEFL_READ_UNALIGNED_WORD(++q)) && (TDEFL_READ_UNALIGNED_WORD(++p) == TDEFL_READ_UNALIGNED_WORD(++q)) && (--probe_len > 0) );
     if (!probe_len)
     {
-      *pMatch_dist = dist; *pMatch_len = MZ_MIN(max_match_len, TDEFL_MAX_MATCH_LEN); break;
+      *pMatch_dist = dist; *pMatch_len = MZ_MIN(max_match_len, (mz_uint)TDEFL_MAX_MATCH_LEN); break;
     }
     else if ((probe_len = ((mz_uint)(p - s) * 2) + (mz_uint)(*(const mz_uint8*)p == *(const mz_uint8*)q)) > match_len)
     {
@@ -2101,7 +2101,7 @@ static mz_bool tdefl_compress_fast(tdefl_compressor *d)
 
       total_lz_bytes += cur_match_len;
       lookahead_pos += cur_match_len;
-      dict_size = MZ_MIN(dict_size + cur_match_len, TDEFL_LZ_DICT_SIZE);
+      dict_size = MZ_MIN(dict_size + cur_match_len, (mz_uint)TDEFL_LZ_DICT_SIZE);
       cur_pos = (cur_pos + cur_match_len) & TDEFL_LZ_DICT_SIZE_MASK;
       MZ_ASSERT(lookahead_size >= cur_match_len);
       lookahead_size -= cur_match_len;
@@ -2129,7 +2129,7 @@ static mz_bool tdefl_compress_fast(tdefl_compressor *d)
       d->m_huff_count[0][lit]++;
 
       lookahead_pos++;
-      dict_size = MZ_MIN(dict_size + 1, TDEFL_LZ_DICT_SIZE);
+      dict_size = MZ_MIN(dict_size + 1, (mz_uint)TDEFL_LZ_DICT_SIZE);
       cur_pos = (cur_pos + 1) & TDEFL_LZ_DICT_SIZE_MASK;
       lookahead_size--;
 
@@ -2283,7 +2283,7 @@ static mz_bool tdefl_compress_normal(tdefl_compressor *d)
     d->m_lookahead_pos += len_to_move;
     MZ_ASSERT(d->m_lookahead_size >= len_to_move);
     d->m_lookahead_size -= len_to_move;
-    d->m_dict_size = MZ_MIN(d->m_dict_size + len_to_move, TDEFL_LZ_DICT_SIZE);
+    d->m_dict_size = MZ_MIN(d->m_dict_size + len_to_move, (mz_uint)TDEFL_LZ_DICT_SIZE);
     // Check if it's time to flush the current LZ codes to the internal output buffer.
     if ( (d->m_pLZ_code_buf > &d->m_lz_code_buf[TDEFL_LZ_CODE_BUF_SIZE - 8]) ||
          ( (d->m_total_lz_bytes > 31*1024) && (((((mz_uint)(d->m_pLZ_code_buf - d->m_lz_code_buf) * 115) >> 7) >= d->m_total_lz_bytes) || (d->m_flags & TDEFL_FORCE_ALL_RAW_BLOCKS))) )
diff --git a/thirdparty/basis_universal/encoder/basisu_opencl.cpp b/thirdparty/basis_universal/encoder/basisu_opencl.cpp
index 81e3090a2639..e0611c18eefb 100644
--- a/thirdparty/basis_universal/encoder/basisu_opencl.cpp
+++ b/thirdparty/basis_universal/encoder/basisu_opencl.cpp
@@ -1,5 +1,5 @@
 // basisu_opencl.cpp
-// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/thirdparty/basis_universal/encoder/basisu_opencl.h b/thirdparty/basis_universal/encoder/basisu_opencl.h
index 4194a0841840..2546a18dabbe 100644
--- a/thirdparty/basis_universal/encoder/basisu_opencl.h
+++ b/thirdparty/basis_universal/encoder/basisu_opencl.h
@@ -1,5 +1,5 @@
 // basisu_opencl.h
-// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 //
 // Note: Undefine or set BASISU_SUPPORT_OPENCL to 0 to completely OpenCL support.
 //
diff --git a/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.cpp b/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.cpp
index 596fc197e6d9..4bf9516f90a1 100644
--- a/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.cpp
+++ b/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.cpp
@@ -1,5 +1,5 @@
 // basisu_pvrtc1_4.cpp
-// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.h b/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.h
index db6985a439bb..a9fe6b27aa01 100644
--- a/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.h
+++ b/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.h
@@ -1,5 +1,5 @@
 // basisu_pvrtc1_4.cpp
-// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -231,7 +231,18 @@ namespace basisu
 
 		inline void set_to_black()
 		{
+#ifndef __EMSCRIPTEN__
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wclass-memaccess"            
+#endif                          
+#endif
 			memset(m_blocks.get_ptr(), 0, m_blocks.size_in_bytes());
+#ifndef __EMSCRIPTEN__
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif                
+#endif
 		}
 
 		inline bool get_block_uses_transparent_modulation(uint32_t bx, uint32_t by) const
diff --git a/thirdparty/basis_universal/encoder/basisu_resample_filters.cpp b/thirdparty/basis_universal/encoder/basisu_resample_filters.cpp
index 597cb3f6187e..46cd837376ee 100644
--- a/thirdparty/basis_universal/encoder/basisu_resample_filters.cpp
+++ b/thirdparty/basis_universal/encoder/basisu_resample_filters.cpp
@@ -1,5 +1,5 @@
 // basisu_resampler_filters.cpp
-// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/thirdparty/basis_universal/encoder/basisu_resampler.cpp b/thirdparty/basis_universal/encoder/basisu_resampler.cpp
index f4cedf0031b2..a00c63335d09 100644
--- a/thirdparty/basis_universal/encoder/basisu_resampler.cpp
+++ b/thirdparty/basis_universal/encoder/basisu_resampler.cpp
@@ -1,5 +1,5 @@
 // basisu_resampler.cpp
-// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/thirdparty/basis_universal/encoder/basisu_resampler.h b/thirdparty/basis_universal/encoder/basisu_resampler.h
index dc0978caebe7..ac1ef73d7f3e 100644
--- a/thirdparty/basis_universal/encoder/basisu_resampler.h
+++ b/thirdparty/basis_universal/encoder/basisu_resampler.h
@@ -1,5 +1,5 @@
 // basisu_resampler.h
-// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/thirdparty/basis_universal/encoder/basisu_resampler_filters.h b/thirdparty/basis_universal/encoder/basisu_resampler_filters.h
index 0ebb51c334b3..4d66ac2c7031 100644
--- a/thirdparty/basis_universal/encoder/basisu_resampler_filters.h
+++ b/thirdparty/basis_universal/encoder/basisu_resampler_filters.h
@@ -1,5 +1,5 @@
 // basisu_resampler_filters.h
-// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/thirdparty/basis_universal/encoder/basisu_ssim.cpp b/thirdparty/basis_universal/encoder/basisu_ssim.cpp
index cceb400b883b..608ce937fcdd 100644
--- a/thirdparty/basis_universal/encoder/basisu_ssim.cpp
+++ b/thirdparty/basis_universal/encoder/basisu_ssim.cpp
@@ -1,5 +1,5 @@
 // basisu_ssim.cpp
-// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/thirdparty/basis_universal/encoder/basisu_ssim.h b/thirdparty/basis_universal/encoder/basisu_ssim.h
index 986ca3bbdf50..51cd2d78fddf 100644
--- a/thirdparty/basis_universal/encoder/basisu_ssim.h
+++ b/thirdparty/basis_universal/encoder/basisu_ssim.h
@@ -1,5 +1,5 @@
 // basisu_ssim.h
-// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/thirdparty/basis_universal/encoder/basisu_uastc_enc.cpp b/thirdparty/basis_universal/encoder/basisu_uastc_enc.cpp
index 271bbc6f1daf..51f6e979d458 100644
--- a/thirdparty/basis_universal/encoder/basisu_uastc_enc.cpp
+++ b/thirdparty/basis_universal/encoder/basisu_uastc_enc.cpp
@@ -1,5 +1,5 @@
 // basisu_uastc_enc.cpp
-// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -13,11 +13,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "basisu_uastc_enc.h"
-
-#if BASISU_USE_ASTC_DECOMPRESS
-#include "basisu_astc_decomp.h"
-#endif
-
+#include "3rdparty/android_astc_decomp.h"
 #include "basisu_gpu_texture.h"
 #include "basisu_bc7enc.h"
 
@@ -384,6 +380,7 @@ namespace basisu
 		}
 
 		uint32_t total_endpoint_bits = 0;
+		(void)total_endpoint_bits;
 
 		for (uint32_t i = 0; i < total_tq_values; i++)
 		{
@@ -428,6 +425,8 @@ namespace basisu
 #endif
 
 		uint32_t total_weight_bits = 0;
+		(void)total_weight_bits;
+
 		const uint32_t plane_shift = (total_planes == 2) ? 1 : 0;
 		for (uint32_t i = 0; i < 16 * total_planes; i++)
 		{
@@ -3175,6 +3174,7 @@ namespace basisu
 		const bool favor_bc7_error = !favor_uastc_error && ((flags & cPackUASTCFavorBC7Error) != 0);
 		//const bool etc1_perceptual = true;
 		
+		// TODO: This uses 64KB of stack space!
 		uastc_encode_results results[MAX_ENCODE_RESULTS];
 						
 		level = clampi(level, cPackUASTCLevelFastest, cPackUASTCLevelVerySlow);
@@ -3567,7 +3567,6 @@ namespace basisu
 			success = basist::unpack_uastc(temp_block, (basist::color32 *)temp_block_unpacked, false);
 			VALIDATE(success);
 
-#if BASISU_USE_ASTC_DECOMPRESS
 			// Now round trip to packed ASTC and back, then decode to pixels.
 			uint32_t astc_data[4];
 			
@@ -3580,7 +3579,7 @@ namespace basisu
 			}
 
 			color_rgba decoded_astc_block[4][4];
-			success = basisu_astc::astc::decompress((uint8_t*)decoded_astc_block, (uint8_t*)&astc_data, false, 4, 4);
+			success = basisu_astc::astc::decompress_ldr((uint8_t*)decoded_astc_block, (uint8_t*)&astc_data, false, 4, 4);
 			VALIDATE(success);
 
 			for (uint32_t y = 0; y < 4; y++)
@@ -3595,7 +3594,6 @@ namespace basisu
 					VALIDATE(temp_block_unpacked[y][x].c[3] == decoded_uastc_block[y][x].a);
 				}
 			}
-#endif
 		}
 #endif
 
@@ -3789,8 +3787,9 @@ namespace basisu
 	{
 		uint64_t m_sel;
 		uint32_t m_ofs;
+		uint32_t m_pad; // avoid implicit padding for selector_bitsequence_hash
 		selector_bitsequence() { }
-		selector_bitsequence(uint32_t bit_ofs, uint64_t sel) : m_sel(sel), m_ofs(bit_ofs) { }
+		selector_bitsequence(uint32_t bit_ofs, uint64_t sel) : m_sel(sel), m_ofs(bit_ofs), m_pad(0) { }
 		bool operator== (const selector_bitsequence& other) const
 		{
 			return (m_ofs == other.m_ofs) && (m_sel == other.m_sel);
@@ -3811,7 +3810,7 @@ namespace basisu
 	{
 		std::size_t operator()(selector_bitsequence const& s) const noexcept
 		{
-			return static_cast<std::size_t>(hash_hsieh((uint8_t *)&s, sizeof(s)) ^ s.m_sel);
+			return hash_hsieh((const uint8_t*)&s, sizeof(s));
 		}
 	};
 
diff --git a/thirdparty/basis_universal/encoder/basisu_uastc_enc.h b/thirdparty/basis_universal/encoder/basisu_uastc_enc.h
index ba39a558b38b..54d39380e683 100644
--- a/thirdparty/basis_universal/encoder/basisu_uastc_enc.h
+++ b/thirdparty/basis_universal/encoder/basisu_uastc_enc.h
@@ -1,5 +1,5 @@
 // basisu_uastc_enc.h
-// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/thirdparty/basis_universal/encoder/cppspmd_flow.h b/thirdparty/basis_universal/encoder/cppspmd_flow.h
index f6930476aad1..93934173c4f0 100644
--- a/thirdparty/basis_universal/encoder/cppspmd_flow.h
+++ b/thirdparty/basis_universal/encoder/cppspmd_flow.h
@@ -1,7 +1,7 @@
 // Do not include this header directly.
 // Control flow functionality in common between all the headers.
 //
-// Copyright 2020-2021 Binomial LLC
+// Copyright 2020-2024 Binomial LLC
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/thirdparty/basis_universal/encoder/cppspmd_math.h b/thirdparty/basis_universal/encoder/cppspmd_math.h
index e7b3202b8ee2..3032df865f1f 100644
--- a/thirdparty/basis_universal/encoder/cppspmd_math.h
+++ b/thirdparty/basis_universal/encoder/cppspmd_math.h
@@ -1,6 +1,6 @@
 // Do not include this header directly.
 //
-// Copyright 2020-2021 Binomial LLC
+// Copyright 2020-2024 Binomial LLC
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -646,7 +646,7 @@ CPPSPMD_FORCE_INLINE vint spmd_kernel::count_set_bits(vint x)
 {
 	vint v = x - (VUINT_SHIFT_RIGHT(x, 1) & 0x55555555);                    
 	vint v1 = (v & 0x33333333) + (VUINT_SHIFT_RIGHT(v, 2) & 0x33333333);     
-	return VUINT_SHIFT_RIGHT(((v1 + VUINT_SHIFT_RIGHT(v1, 4) & 0xF0F0F0F) * 0x1010101), 24);
+	return VUINT_SHIFT_RIGHT(((v1 + (VUINT_SHIFT_RIGHT(v1, 4) & 0xF0F0F0F)) * 0x1010101), 24);
 }
 
 CPPSPMD_FORCE_INLINE vint cmple_epu16(const vint &a, const vint &b) 
diff --git a/thirdparty/basis_universal/encoder/cppspmd_math_declares.h b/thirdparty/basis_universal/encoder/cppspmd_math_declares.h
index cdb6447b62ef..f76c9b7e38ea 100644
--- a/thirdparty/basis_universal/encoder/cppspmd_math_declares.h
+++ b/thirdparty/basis_universal/encoder/cppspmd_math_declares.h
@@ -1,7 +1,7 @@
 // Do not include this header directly.
 // This header defines shared struct spmd_kernel helpers.
 //
-// Copyright 2020-2021 Binomial LLC
+// Copyright 2020-2024 Binomial LLC
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/thirdparty/basis_universal/encoder/cppspmd_sse.h b/thirdparty/basis_universal/encoder/cppspmd_sse.h
index 4c61bab7b1a9..79dfa1561a0d 100644
--- a/thirdparty/basis_universal/encoder/cppspmd_sse.h
+++ b/thirdparty/basis_universal/encoder/cppspmd_sse.h
@@ -450,7 +450,7 @@ struct spmd_kernel
 		CPPSPMD_FORCE_INLINE explicit operator vint() const;
 								
 	private:
-		vbool& operator=(const vbool&);
+		//vbool& operator=(const vbool&);
 	};
 
 	friend vbool operator!(const vbool& v);
@@ -481,7 +481,7 @@ struct spmd_kernel
 		CPPSPMD_FORCE_INLINE explicit vfloat(int value) : m_value(_mm_set1_ps((float)value)) { }
 
 	private:
-		vfloat& operator=(const vfloat&);
+		//vfloat& operator=(const vfloat&);
 	};
 
 	CPPSPMD_FORCE_INLINE vfloat& store(vfloat& dst, const vfloat& src)
@@ -514,7 +514,7 @@ struct spmd_kernel
 		float* m_pValue;
 
 	private:
-		float_lref& operator=(const float_lref&);
+		//float_lref& operator=(const float_lref&);
 	};
 
 	CPPSPMD_FORCE_INLINE const float_lref& store(const float_lref& dst, const vfloat& src)
@@ -561,7 +561,7 @@ struct spmd_kernel
 		float* m_pValue;
 		
 	private:
-		float_vref& operator=(const float_vref&);
+		//float_vref& operator=(const float_vref&);
 	};
 
 	// Varying ref to varying float
@@ -571,7 +571,7 @@ struct spmd_kernel
 		vfloat* m_pValue;
 		
 	private:
-		vfloat_vref& operator=(const vfloat_vref&);
+		//vfloat_vref& operator=(const vfloat_vref&);
 	};
 
 	// Varying ref to varying int
@@ -581,7 +581,7 @@ struct spmd_kernel
 		vint* m_pValue;
 		
 	private:
-		vint_vref& operator=(const vint_vref&);
+		//vint_vref& operator=(const vint_vref&);
 	};
 
 	CPPSPMD_FORCE_INLINE const float_vref& store(const float_vref& dst, const vfloat& src);
@@ -624,7 +624,7 @@ struct spmd_kernel
 		int* m_pValue;
 
 	private:
-		int_lref& operator=(const int_lref&);
+		//int_lref& operator=(const int_lref&);
 	};
 		
 	CPPSPMD_FORCE_INLINE const int_lref& store(const int_lref& dst, const vint& src)
@@ -663,7 +663,7 @@ struct spmd_kernel
 		int16_t* m_pValue;
 
 	private:
-		int16_lref& operator=(const int16_lref&);
+		//int16_lref& operator=(const int16_lref&);
 	};
 
 	CPPSPMD_FORCE_INLINE const int16_lref& store(const int16_lref& dst, const vint& src)
@@ -720,7 +720,7 @@ struct spmd_kernel
 		const int* m_pValue;
 
 	private:
-		cint_lref& operator=(const cint_lref&);
+		//cint_lref& operator=(const cint_lref&);
 	};
 
 	CPPSPMD_FORCE_INLINE vint load(const cint_lref& src)
@@ -742,7 +742,7 @@ struct spmd_kernel
 		int* m_pValue;
 
 	private:
-		int_vref& operator=(const int_vref&);
+		//int_vref& operator=(const int_vref&);
 	};
 
 	// Varying ref to constant ints
@@ -752,7 +752,7 @@ struct spmd_kernel
 		const int* m_pValue;
 
 	private:
-		cint_vref& operator=(const cint_vref&);
+		//cint_vref& operator=(const cint_vref&);
 	};
 
 	// Varying int
@@ -810,7 +810,7 @@ struct spmd_kernel
 		}
 
 	private:
-		vint& operator=(const vint&);
+		//vint& operator=(const vint&);
 	};
 
 	// Load/store linear int
@@ -1206,7 +1206,7 @@ struct spmd_kernel
 	CPPSPMD_FORCE_INLINE vint load_all(const vint_vref& src)
 	{
 		// TODO: There's surely a better way
-		__m128i k;
+		__m128i k = _mm_setzero_si128();
 
 		k = insert_x(k, ((int*)(&src.m_pValue[extract_x(src.m_vindex)]))[0]);
 		k = insert_y(k, ((int*)(&src.m_pValue[extract_y(src.m_vindex)]))[1]);
@@ -1261,7 +1261,7 @@ struct spmd_kernel
 		}
 
 	private:
-		lint& operator=(const lint&);
+		//lint& operator=(const lint&);
 	};
 
 	CPPSPMD_FORCE_INLINE lint& store_all(lint& dst, const lint& src)
diff --git a/thirdparty/basis_universal/encoder/cppspmd_type_aliases.h b/thirdparty/basis_universal/encoder/cppspmd_type_aliases.h
index 0dfb28b88f83..26004812395b 100644
--- a/thirdparty/basis_universal/encoder/cppspmd_type_aliases.h
+++ b/thirdparty/basis_universal/encoder/cppspmd_type_aliases.h
@@ -1,7 +1,7 @@
 // cppspmd_type_aliases.h
 // Do not include this file directly
 //
-// Copyright 2020-2021 Binomial LLC
+// Copyright 2020-2024 Binomial LLC
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/thirdparty/basis_universal/encoder/pvpngreader.cpp b/thirdparty/basis_universal/encoder/pvpngreader.cpp
index 46639f2796ea..6b32f66cbe23 100644
--- a/thirdparty/basis_universal/encoder/pvpngreader.cpp
+++ b/thirdparty/basis_universal/encoder/pvpngreader.cpp
@@ -163,7 +163,7 @@ class png_memory_file : public png_file
 		{
 			if ((sizeof(size_t) == sizeof(uint32_t)) && (new_size > 0x7FFFFFFFUL))
 				return 0;
-			m_buf.resize(new_size);
+			m_buf.resize((size_t)new_size);
 		}
 
 		memcpy(&m_buf[(size_t)m_ofs], pBuf, len);
@@ -178,11 +178,11 @@ class png_memory_file : public png_file
 			return 0;
 
 		uint64_t max_bytes = minimum<uint64_t>(len, m_buf.size() - m_ofs);
-		memcpy(pBuf, &m_buf[(size_t)m_ofs], max_bytes);
+		memcpy(pBuf, &m_buf[(size_t)m_ofs], (size_t)max_bytes);
 
 		m_ofs += max_bytes;
 
-		return max_bytes;
+		return (size_t)max_bytes;
 	}
 };
 
@@ -249,11 +249,11 @@ class png_readonly_memory_file : public png_file
 			return 0;
 
 		uint64_t max_bytes = minimum<uint64_t>(len, m_buf_size - m_ofs);
-		memcpy(pBuf, &m_pBuf[(size_t)m_ofs], max_bytes);
+		memcpy(pBuf, &m_pBuf[(size_t)m_ofs], (size_t)max_bytes);
 
 		m_ofs += max_bytes;
 
-		return max_bytes;
+		return (size_t)max_bytes;
 	}
 };
 
@@ -1626,8 +1626,8 @@ int png_decoder::png_decode_start()
 
 	if (m_ihdr.m_ilace_type == 1)
 	{
-		int i;
-		uint32_t total_lines, lines_processed;
+		//int i;
+		//uint32_t total_lines, lines_processed;
 
 		m_adam7_pass_size_x[0] = adam7_pass_size(m_ihdr.m_width, 0, 8);
 		m_adam7_pass_size_x[1] = adam7_pass_size(m_ihdr.m_width, 4, 8);
@@ -1651,10 +1651,12 @@ int png_decoder::png_decode_start()
 
 		m_pass_y_left = 0;
 
+#if 0
 		total_lines = lines_processed = 0;
 
 		for (i = 0; i < 7; i++)
 			total_lines += m_adam7_pass_size_y[i];
+#endif
 
 		for (; ; )
 		{
@@ -1675,7 +1677,7 @@ int png_decoder::png_decode_start()
 				}
 			}
 
-			lines_processed++;
+			//lines_processed++;
 		}
 
 		m_adam7_decoded_flag = TRUE;
diff --git a/modules/basis_universal/patches/external-jpgd.patch b/thirdparty/basis_universal/patches/external-jpgd.patch
similarity index 100%
rename from modules/basis_universal/patches/external-jpgd.patch
rename to thirdparty/basis_universal/patches/external-jpgd.patch
diff --git a/thirdparty/basis_universal/patches/external-tinyexr.patch b/thirdparty/basis_universal/patches/external-tinyexr.patch
new file mode 100644
index 000000000000..665af1330075
--- /dev/null
+++ b/thirdparty/basis_universal/patches/external-tinyexr.patch
@@ -0,0 +1,23 @@
+diff --git a/thirdparty/basis_universal/encoder/basisu_enc.cpp b/thirdparty/basis_universal/encoder/basisu_enc.cpp
+index 6c0ac0ad370..2bf486a0287 100644
+--- a/thirdparty/basis_universal/encoder/basisu_enc.cpp
++++ b/thirdparty/basis_universal/encoder/basisu_enc.cpp
+@@ -27,7 +27,7 @@
+ #ifndef TINYEXR_USE_ZFP
+ #define TINYEXR_USE_ZFP (1)
+ #endif
+-#include "3rdparty/tinyexr.h"
++#include <tinyexr.h>
+ 
+ #ifndef MINIZ_HEADER_FILE_ONLY
+ #define MINIZ_HEADER_FILE_ONLY
+@@ -3257,7 +3257,8 @@ namespace basisu
+ 		float* out_rgba = nullptr;
+ 		const char* err = nullptr;
+ 		
+-		int status = LoadEXRWithLayer(&out_rgba, &width, &height, pFilename, nullptr, &err, &n_chans);
++		int status = LoadEXRWithLayer(&out_rgba, &width, &height, pFilename, nullptr, &err);
++		n_chans = 4;
+ 		if (status != 0)
+ 		{
+ 			error_printf("Failed loading .EXR image \"%s\"! (TinyEXR error: %s)\n", pFilename, err ? err : "?");
diff --git a/thirdparty/basis_universal/patches/remove-tinydds-qoi.patch b/thirdparty/basis_universal/patches/remove-tinydds-qoi.patch
new file mode 100644
index 000000000000..a4d176602d2a
--- /dev/null
+++ b/thirdparty/basis_universal/patches/remove-tinydds-qoi.patch
@@ -0,0 +1,446 @@
+diff --git a/thirdparty/basis_universal/encoder/basisu_enc.cpp b/thirdparty/basis_universal/encoder/basisu_enc.cpp
+index 2bf486a0287..fff98e83014 100644
+--- a/thirdparty/basis_universal/encoder/basisu_enc.cpp
++++ b/thirdparty/basis_universal/encoder/basisu_enc.cpp
+@@ -37,9 +37,6 @@
+ #endif
+ #include "basisu_miniz.h"
+ 
+-#define QOI_IMPLEMENTATION
+-#include "3rdparty/qoi.h"
+-
+ #if defined(_WIN32)
+ // For QueryPerformanceCounter/QueryPerformanceFrequency
+ #define WIN32_LEAN_AND_MEAN
+@@ -408,16 +405,7 @@ namespace basisu
+ 
+ 	bool load_qoi(const char* pFilename, image& img)
+ 	{
+-		qoi_desc desc;
+-		clear_obj(desc);
+-
+-		void* p = qoi_read(pFilename, &desc, 4);
+-		if (!p)
+-			return false;
+-
+-		img.grant_ownership(static_cast<color_rgba *>(p), desc.width, desc.height);
+-
+-		return true;
++		return false;
+ 	}
+ 
+ 	bool load_png(const uint8_t *pBuf, size_t buf_size, image &img, const char *pFilename)
+diff --git a/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp b/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp
+index 000869a5337..342446b8fd4 100644
+--- a/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp
++++ b/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp
+@@ -19,9 +19,6 @@
+ #include "basisu_bc7enc.h"
+ #include "../transcoder/basisu_astc_hdr_core.h"
+ 
+-#define TINYDDS_IMPLEMENTATION
+-#include "3rdparty/tinydds.h"
+-
+ namespace basisu
+ {
+ 	//------------------------------------------------------------------------------------------------
+@@ -1979,208 +1976,8 @@ namespace basisu
+ 	// Accepts 2D, 2D mipmapped, 2D array, 2D array mipmapped
+ 	// and cubemap, cubemap mipmapped, and cubemap array mipmapped.
+ 	bool write_dds_file(uint8_vec &dds_data, const basisu::vector<gpu_image_vec>& gpu_images, bool cubemap_flag, bool use_srgb_format)
+-	{
+-		if (!gpu_images.size())
+-		{
+-			assert(0);
+-			return false;
+-		}
+-
+-		// Sanity check the input
+-		uint32_t slices = 1;
+-		if (cubemap_flag)
+-		{
+-			if ((gpu_images.size() % 6) != 0)
+-			{
+-				assert(0);
+-				return false;
+-			}
+-			slices = gpu_images.size() / 6;
+-		}
+-		else
+-		{
+-			slices = gpu_images.size();
+-		}
+-
+-		uint32_t width = 0, height = 0, total_levels = 0;
+-		basisu::texture_format fmt = texture_format::cInvalidTextureFormat;
+-
+-		// Sanity check the input for consistent # of dimensions and mip levels
+-		for (uint32_t array_index = 0; array_index < gpu_images.size(); array_index++)
+-		{
+-			const gpu_image_vec& levels = gpu_images[array_index];
+-
+-			if (!levels.size())
+-			{
+-				// Empty mip chain
+-				assert(0);
+-				return false;
+-			}
+-
+-			if (!array_index)
+-			{
+-				width = levels[0].get_pixel_width();
+-				height = levels[0].get_pixel_height();
+-				total_levels = (uint32_t)levels.size();
+-				fmt = levels[0].get_format();
+-			}
+-			else
+-			{
+-				if ((width != levels[0].get_pixel_width()) ||
+-					(height != levels[0].get_pixel_height()) ||
+-					(total_levels != levels.size()))
+-				{
+-					// All cubemap/texture array faces must be the same dimension
+-					assert(0);
+-					return false;
+-				}
+-			}
+-
+-			for (uint32_t level_index = 0; level_index < levels.size(); level_index++)
+-			{
+-				if (level_index)
+-				{
+-					if ((levels[level_index].get_pixel_width() != maximum<uint32_t>(1, levels[0].get_pixel_width() >> level_index)) ||
+-						(levels[level_index].get_pixel_height() != maximum<uint32_t>(1, levels[0].get_pixel_height() >> level_index)))
+-					{
+-						// Malformed mipmap chain
+-						assert(0);
+-						return false;
+-					}
+-				}
+-
+-				if (fmt != levels[level_index].get_format())
+-				{
+-					// All input textures must use the same GPU format
+-					assert(0);
+-					return false;
+-				}
+-			}
+-		}
+-
+-		// No mipmap levels
+-		if (!total_levels)
+-		{
+-			assert(0);
+-			return false;
+-		}
+-
+-		// Create the DDS mipmap level data
+-		uint8_vec mipmaps[32];
+-
+-		// See https://learn.microsoft.com/en-us/windows/win32/direct3ddds/dds-file-layout-for-cubic-environment-maps
+-		// DDS cubemap organization is cubemap face 0 followed by all mips, then cubemap face 1 followed by all mips, etc.
+-		// Unfortunately tinydds.h's writer doesn't handle this case correctly, so we work around it here.
+-		// This also applies with 2D texture arrays, too. RenderDoc and ddsview (DirectXTex) views each type (cubemap array and 2D texture array) correctly.
+-		// Also see "Using Texture Arrays in Direct3D 10/11":
+-		// https://learn.microsoft.com/en-us/windows/win32/direct3ddds/dx-graphics-dds-pguide
+-		for (uint32_t array_index = 0; array_index < gpu_images.size(); array_index++)
+-		{
+-			const gpu_image_vec& levels = gpu_images[array_index];
+-
+-			for (uint32_t level_index = 0; level_index < levels.size(); level_index++)
+-			{
+-				append_vector(mipmaps[0], (uint8_t*)levels[level_index].get_ptr(), levels[level_index].get_size_in_bytes());
+-
+-			} // level_index
+-		} // array_index
+-
+-#if 0
+-		// This organization, required by tinydds.h's API, is wrong.
+-		{
+-			for (uint32_t array_index = 0; array_index < gpu_images.size(); array_index++)
+-			{
+-				const gpu_image_vec& levels = gpu_images[array_index];
+-
+-				for (uint32_t level_index = 0; level_index < levels.size(); level_index++)
+-				{
+-					append_vector(mipmaps[level_index], (uint8_t*)levels[level_index].get_ptr(), levels[level_index].get_size_in_bytes());
+-
+-				} // level_index
+-			} // array_index
+-		}
+-#endif
+-		
+-		// Write DDS file using tinydds
+-		TinyDDS_WriteCallbacks cbs;
+-		cbs.error = [](void* user, char const* msg) { BASISU_NOTE_UNUSED(user);  fprintf(stderr, "tinydds: %s\n", msg); };
+-		cbs.alloc = [](void* user, size_t size) -> void* { BASISU_NOTE_UNUSED(user); return malloc(size); };
+-		cbs.free = [](void* user, void* memory) { BASISU_NOTE_UNUSED(user); free(memory); };
+-		cbs.write = [](void* user, void const* buffer, size_t byteCount) { BASISU_NOTE_UNUSED(user); uint8_vec* pVec = (uint8_vec*)user; append_vector(*pVec, (const uint8_t*)buffer, byteCount); };
+-
+-		uint32_t mipmap_sizes[32];
+-		const void* mipmap_ptrs[32];
+-		
+-		clear_obj(mipmap_sizes);
+-		clear_obj(mipmap_ptrs);
+-
+-		assert(total_levels < 32);
+-		for (uint32_t i = 0; i < total_levels; i++)
+-		{
+-			mipmap_sizes[i] = mipmaps[i].size_in_bytes();
+-			mipmap_ptrs[i] = mipmaps[i].get_ptr();
+-		}
+-
+-		// Select tinydds texture format
+-		uint32_t tinydds_fmt = 0;
+-
+-		switch (fmt)
+-		{
+-			case texture_format::cBC1_NV:
+-			case texture_format::cBC1_AMD:
+-			case texture_format::cBC1: 
+-				tinydds_fmt = use_srgb_format ? TDDS_BC1_RGBA_SRGB_BLOCK : TDDS_BC1_RGBA_UNORM_BLOCK;
+-				break;
+-			case texture_format::cBC3:
+-				tinydds_fmt = use_srgb_format ? TDDS_BC3_SRGB_BLOCK : TDDS_BC3_UNORM_BLOCK;
+-				break;
+-			case texture_format::cBC4:
+-				tinydds_fmt = TDDS_BC4_UNORM_BLOCK;
+-				break;
+-			case texture_format::cBC5:
+-				tinydds_fmt = TDDS_BC5_UNORM_BLOCK;
+-				break;
+-			case texture_format::cBC6HSigned:
+-				tinydds_fmt = TDDS_BC6H_SFLOAT_BLOCK;
+-				break;
+-			case texture_format::cBC6HUnsigned:
+-				tinydds_fmt = TDDS_BC6H_UFLOAT_BLOCK;
+-				break;
+-			case texture_format::cBC7:
+-				tinydds_fmt = use_srgb_format ? TDDS_BC7_SRGB_BLOCK : TDDS_BC7_UNORM_BLOCK;
+-				break;
+-			default:
+-			{
+-				fprintf(stderr, "Warning: Unsupported format in write_dds_file().\n");
+-				return false;
+-			}
+-		}
+-
+-		// DirectXTex's DDSView doesn't handle odd sizes textures correctly. RenderDoc loads them fine, however.
+-		// Trying to work around this here results in invalid mipmaps. 
+-		//width = (width + 3) & ~3;
+-		//height = (height + 3) & ~3;
+-
+-		bool status = TinyDDS_WriteImage(&cbs,
+-			&dds_data,
+-			width,
+-			height,
+-			1,
+-			slices,
+-			total_levels,
+-			(TinyDDS_Format)tinydds_fmt,
+-			cubemap_flag,
+-			true,
+-			mipmap_sizes,
+-			mipmap_ptrs);
+-
+-		if (!status)
+-		{
+-			fprintf(stderr, "write_dds_file: Failed creating DDS file\n");
+-			return false;
+-		}
+-								
+-		return true;
++	{		
++		return false;
+ 	}
+ 
+ 	bool write_dds_file(const char* pFilename, const basisu::vector<gpu_image_vec>& gpu_images, bool cubemap_flag, bool use_srgb_format)
+@@ -2201,188 +1998,6 @@ namespace basisu
+ 		
+ 	bool read_uncompressed_dds_file(const char* pFilename, basisu::vector<image> &ldr_mips,	basisu::vector<imagef>& hdr_mips)
+ 	{
+-		const uint32_t MAX_IMAGE_DIM = 16384;
+-
+-		TinyDDS_Callbacks cbs;
+-
+-		cbs.errorFn = [](void* user, char const* msg) { BASISU_NOTE_UNUSED(user); fprintf(stderr, "tinydds: %s\n", msg); };
+-		cbs.allocFn = [](void* user, size_t size) -> void* { BASISU_NOTE_UNUSED(user); return malloc(size); };
+-		cbs.freeFn = [](void* user, void* memory) { BASISU_NOTE_UNUSED(user); free(memory); };
+-		cbs.readFn = [](void* user, void* buffer, size_t byteCount) -> size_t { return (size_t)fread(buffer, 1, byteCount, (FILE*)user); };
+-		
+-#ifdef _MSC_VER
+-		cbs.seekFn = [](void* user, int64_t ofs) -> bool { return _fseeki64((FILE*)user, ofs, SEEK_SET) == 0; };
+-		cbs.tellFn = [](void* user) -> int64_t { return _ftelli64((FILE*)user); };
+-#else
+-		cbs.seekFn = [](void* user, int64_t ofs) -> bool { return fseek((FILE*)user, (long)ofs, SEEK_SET) == 0; };
+-		cbs.tellFn = [](void* user) -> int64_t { return (int64_t)ftell((FILE*)user); };
+-#endif
+-
+-		FILE* pFile = fopen_safe(pFilename, "rb");
+-		if (!pFile)
+-		{
+-			error_printf("Can't open .DDS file \"%s\"\n", pFilename);
+-			return false;
+-		}
+-
+-		// These are the formats AMD Compressonator supports in its UI.
+-		enum dds_fmt
+-		{
+-			cRGBA32,
+-			cRGBA_HALF,
+-			cRGBA_FLOAT
+-		};
+-
+-		bool status = false;
+-		dds_fmt fmt = cRGBA32;
+-		uint32_t width = 0, height = 0;
+-		bool hdr_flag = false;
+-		TinyDDS_Format tfmt = TDDS_UNDEFINED;
+-
+-		TinyDDS_ContextHandle ctx = TinyDDS_CreateContext(&cbs, pFile);
+-		if (!ctx)
+-			goto failure;
+-
+-		status = TinyDDS_ReadHeader(ctx);
+-		if (!status)
+-		{
+-			error_printf("Failed parsing DDS header in file \"%s\"\n", pFilename);
+-			goto failure;
+-		}
+-				
+-		if ((!TinyDDS_Is2D(ctx)) || (TinyDDS_ArraySlices(ctx) > 1) || (TinyDDS_IsCubemap(ctx)))
+-		{
+-			error_printf("Unsupported DDS texture type in file \"%s\"\n", pFilename);
+-			goto failure;
+-		}
+-
+-		width = TinyDDS_Width(ctx);
+-		height = TinyDDS_Height(ctx);
+-						
+-		if (!width || !height)
+-		{
+-			error_printf("DDS texture dimensions invalid in file \"%s\"\n", pFilename);
+-			goto failure;
+-		}
+-
+-		if ((width > MAX_IMAGE_DIM) || (height > MAX_IMAGE_DIM))
+-		{
+-			error_printf("DDS texture dimensions too large in file \"%s\"\n", pFilename);
+-			goto failure;
+-		}
+-		
+-		tfmt = TinyDDS_GetFormat(ctx);
+-		switch (tfmt)
+-		{
+-		case TDDS_R8G8B8A8_SRGB:
+-		case TDDS_R8G8B8A8_UNORM:
+-		case TDDS_B8G8R8A8_SRGB:
+-		case TDDS_B8G8R8A8_UNORM:
+-			fmt = cRGBA32;
+-			break;
+-		case TDDS_R16G16B16A16_SFLOAT:
+-			fmt = cRGBA_HALF;
+-			hdr_flag = true;
+-			break;
+-		case TDDS_R32G32B32A32_SFLOAT:
+-			fmt = cRGBA_FLOAT;
+-			hdr_flag = true;
+-			break;
+-		default:
+-			error_printf("File \"%s\" has an unsupported DDS texture format (only supports RGBA/BGRA 32bpp, RGBA HALF float, or RGBA FLOAT)\n", pFilename);
+-			goto failure;
+-		}
+-
+-		if (hdr_flag)
+-			hdr_mips.resize(TinyDDS_NumberOfMipmaps(ctx));
+-		else
+-			ldr_mips.resize(TinyDDS_NumberOfMipmaps(ctx));
+-
+-		for (uint32_t level = 0; level < TinyDDS_NumberOfMipmaps(ctx); level++)
+-		{
+-			const uint32_t level_width = TinyDDS_MipMapReduce(width, level);
+-			const uint32_t level_height = TinyDDS_MipMapReduce(height, level);
+-			const uint32_t total_level_texels = level_width * level_height;
+-
+-			const void* pImage = TinyDDS_ImageRawData(ctx, level);
+-			const uint32_t image_size = TinyDDS_ImageSize(ctx, level);
+-
+-			if (fmt == cRGBA32)
+-			{
+-				ldr_mips[level].resize(level_width, level_height);
+-
+-				if ((ldr_mips[level].get_total_pixels() * sizeof(uint32_t) != image_size))
+-				{
+-					assert(0);
+-					goto failure;
+-				}
+-
+-				memcpy(ldr_mips[level].get_ptr(), pImage, image_size);
+-								
+-				if ((tfmt == TDDS_B8G8R8A8_SRGB) || (tfmt == TDDS_B8G8R8A8_UNORM))
+-				{
+-					// Swap R and B components.
+-					uint32_t *pTexels = (uint32_t *)ldr_mips[level].get_ptr();
+-					for (uint32_t i = 0; i < total_level_texels; i++)
+-					{
+-						const uint32_t v = pTexels[i];
+-						const uint32_t r = (v >> 16) & 0xFF;
+-						const uint32_t b = v & 0xFF;
+-						pTexels[i] = r | (b << 16) | (v & 0xFF00FF00);
+-					}
+-				}
+-			}
+-			else if (fmt == cRGBA_FLOAT)
+-			{
+-				hdr_mips[level].resize(level_width, level_height);
+-
+-				if ((hdr_mips[level].get_total_pixels() * sizeof(float) * 4 != image_size))
+-				{
+-					assert(0);
+-					goto failure;
+-				}
+-
+-				memcpy(hdr_mips[level].get_ptr(), pImage, image_size);
+-			}
+-			else if (fmt == cRGBA_HALF)
+-			{
+-				hdr_mips[level].resize(level_width, level_height);
+-				
+-				if ((hdr_mips[level].get_total_pixels() * sizeof(basist::half_float) * 4 != image_size))
+-				{
+-					assert(0);
+-					goto failure;
+-				}
+-
+-				// Unpack half to float.
+-				const basist::half_float* pSrc_comps = static_cast<const basist::half_float*>(pImage);
+-				vec4F* pDst_texels = hdr_mips[level].get_ptr();
+-				
+-				for (uint32_t i = 0; i < total_level_texels; i++)
+-				{
+-					(*pDst_texels)[0] = basist::half_to_float(pSrc_comps[0]);
+-					(*pDst_texels)[1] = basist::half_to_float(pSrc_comps[1]);
+-					(*pDst_texels)[2] = basist::half_to_float(pSrc_comps[2]);
+-					(*pDst_texels)[3] = basist::half_to_float(pSrc_comps[3]);
+-
+-					pSrc_comps += 4;
+-					pDst_texels++;
+-				} // y
+-			}
+-		} // level
+-
+-		TinyDDS_DestroyContext(ctx);
+-		fclose(pFile);
+-
+-		return true;
+-
+-	failure:
+-		if (ctx)
+-			TinyDDS_DestroyContext(ctx);
+-
+-		if (pFile)
+-			fclose(pFile);
+-
+ 		return false;
+ 	}
+ 
diff --git a/thirdparty/basis_universal/transcoder/basisu.h b/thirdparty/basis_universal/transcoder/basisu.h
index 1230b59ec618..939ee79e62b9 100644
--- a/thirdparty/basis_universal/transcoder/basisu.h
+++ b/thirdparty/basis_universal/transcoder/basisu.h
@@ -1,5 +1,5 @@
 // basisu.h
-// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 // Important: If compiling with gcc, be sure strict aliasing is disabled: -fno-strict-aliasing
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
@@ -117,13 +117,26 @@ namespace basisu
 	typedef basisu::vector<uint64_t> uint64_vec;
 	typedef basisu::vector<int> int_vec;
 	typedef basisu::vector<bool> bool_vec;
+	typedef basisu::vector<float> float_vec;
 
 	void enable_debug_printf(bool enabled);
 	void debug_printf(const char *pFmt, ...);
-		
 
+#ifndef __EMSCRIPTEN__
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wclass-memaccess"            
+#endif                  
+#endif
+		
 	template <typename T> inline void clear_obj(T& obj) { memset(&obj, 0, sizeof(obj)); }
 
+#ifndef __EMSCRIPTEN__
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif                            
+#endif
+
 	template <typename T0, typename T1> inline T0 lerp(T0 a, T0 b, T1 c) { return a + (b - a) * c; }
 
 	template <typename S> inline S maximum(S a, S b) { return (a > b) ? a : b; }
@@ -162,10 +175,45 @@ namespace basisu
 	template<typename T> inline T open_range_check(T v, T minv, T maxv) { assert(v >= minv && v < maxv); BASISU_NOTE_UNUSED(minv); BASISU_NOTE_UNUSED(maxv); return v; }
 	template<typename T> inline T open_range_check(T v, T maxv) { assert(v < maxv); BASISU_NOTE_UNUSED(maxv); return v; }
 
+	// Half-open interval [l, h): l is included, h is excluded.
+	inline bool in_bounds(int v, int l, int h)
+	{
+		return (v >= l) && (v < h);
+	}
+
+	// Closed interval [l, h]: both l and h are included.
+	inline bool in_range(int v, int l, int h)
+	{
+		return (v >= l) && (v <= h);
+	}
+
 	inline uint32_t total_bits(uint32_t v) { uint32_t l = 0; for ( ; v > 0U; ++l) v >>= 1; return l; }
 
 	template<typename T> inline T saturate(T val) { return clamp(val, 0.0f, 1.0f); }
 
+	inline uint32_t get_bit(uint32_t src, int ndx) // returns bit ndx of src as 0 or 1
+	{
+		assert(in_bounds(ndx, 0, 32)); // ndx must be a bit index in 0..31
+		return (src >> ndx) & 1;
+	}
+
+	inline bool is_bit_set(uint32_t src, int ndx) // true when bit ndx of src is 1
+	{
+		return get_bit(src, ndx) != 0;
+	}
+
+	inline uint32_t get_bits(uint32_t val, int low, int high) // returns bits low..high (inclusive) of val, shifted down to bit 0
+	{
+		const int num_bits = (high - low) + 1;
+		assert(in_range(num_bits, 1, 32));
+
+		val >>= low;
+		if (num_bits != 32) // a full 32-bit field needs no mask, and 1u << 32 would be undefined
+			val &= ((1u << num_bits) - 1);
+
+		return val;
+	}
+
 	template<typename T, typename R> inline void append_vector(T &vec, const R *pObjs, size_t n) 
 	{ 
 		if (n)
@@ -267,6 +315,11 @@ namespace basisu
 		return true;
 	}
 
+	static inline uint32_t read_le_word(const uint8_t* pBytes) // reads 2 bytes as a little-endian 16-bit value
+	{
+		return (pBytes[1] << 8U) | (pBytes[0]); // pBytes[0] is the low byte, pBytes[1] the high byte
+	}
+
 	static inline uint32_t read_le_dword(const uint8_t *pBytes)
 	{
 		return (pBytes[3] << 24U) | (pBytes[2] << 16U) | (pBytes[1] << 8U) | (pBytes[0]);
@@ -303,6 +356,10 @@ namespace basisu
 			return *this;
 		}
 
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"            
+#endif  
 		inline operator uint32_t() const
 		{
 			switch (NumBytes)
@@ -354,6 +411,9 @@ namespace basisu
 				}
 			}
 		}
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
 	};
 
 	enum eZero { cZero };
@@ -402,8 +462,11 @@ namespace basisu
 		cBC3,				// DXT5 (BC4/DXT5A block followed by a BC1/DXT1 block)
 		cBC4,				// DXT5A
 		cBC5,				// 3DC/DXN (two BC4/DXT5A blocks)
+		cBC6HSigned,		// HDR
+		cBC6HUnsigned,		// HDR
 		cBC7,
-		cASTC4x4,		// LDR only
+		cASTC_LDR_4x4,		// ASTC 4x4 LDR only
+		cASTC_HDR_4x4,		// ASTC 4x4 HDR only (but may use LDR ASTC blocks internally)
 		cPVRTC1_4_RGB,
 		cPVRTC1_4_RGBA,
 		cATC_RGB,
@@ -413,17 +476,22 @@ namespace basisu
 		cETC2_R11_EAC,
 		cETC2_RG11_EAC,
 		cUASTC4x4,		
+		cUASTC_HDR_4x4,
 		cBC1_NV,
 		cBC1_AMD,
-		
+				
 		// Uncompressed/raw pixels
 		cRGBA32,
 		cRGB565,
 		cBGR565,
 		cRGBA4444,
-		cABGR4444
+		cABGR4444,
+		cRGBA_HALF,
+		cRGB_HALF,
+		cRGB_9E5
 	};
 
+	// This is bytes per block for GPU formats, or bytes per texel for uncompressed formats.
 	inline uint32_t get_bytes_per_block(texture_format fmt)
 	{
 		switch (fmt)
@@ -443,13 +511,27 @@ namespace basisu
 		case texture_format::cETC2_R11_EAC:
 			return 8;
 		case texture_format::cRGBA32:
-			return sizeof(uint32_t) * 16;
+		case texture_format::cRGB_9E5:
+			return sizeof(uint32_t);
+		case texture_format::cRGB_HALF:
+			return sizeof(uint16_t) * 3;
+		case texture_format::cRGBA_HALF:
+			return sizeof(uint16_t) * 4;
+		case texture_format::cRGB565:
+		case texture_format::cBGR565:
+		case texture_format::cRGBA4444:
+		case texture_format::cABGR4444:
+			return sizeof(uint16_t);
+
 		default:
 			break;
 		}
+		
+		// Everything else is 16 bytes/block.
 		return 16;
 	}
 
+	// This is qwords per block for GPU formats, or not valid for uncompressed formats.
 	inline uint32_t get_qwords_per_block(texture_format fmt)
 	{
 		return get_bytes_per_block(fmt) >> 3;
@@ -473,6 +555,17 @@ namespace basisu
 		BASISU_NOTE_UNUSED(fmt);
 		return 4;
 	}
+
+	inline bool is_hdr_texture_format(texture_format fmt) // true for the ASTC HDR 4x4, UASTC HDR 4x4 and BC6H (signed/unsigned) formats
+	{
+		if (fmt == texture_format::cASTC_HDR_4x4)
+			return true;
+		if (fmt == texture_format::cUASTC_HDR_4x4)
+			return true;
+		if ((fmt == texture_format::cBC6HSigned) || (fmt == texture_format::cBC6HUnsigned))
+			return true;
+		return false; // every other format, including the uncompressed cRGBA_HALF/cRGB_HALF/cRGB_9E5 entries
+	}
 							
 } // namespace basisu
 
diff --git a/thirdparty/basis_universal/transcoder/basisu_astc_hdr_core.h b/thirdparty/basis_universal/transcoder/basisu_astc_hdr_core.h
new file mode 100644
index 000000000000..82dcd2bfe196
--- /dev/null
+++ b/thirdparty/basis_universal/transcoder/basisu_astc_hdr_core.h
@@ -0,0 +1,102 @@
+// File: basisu_astc_hdr_core.h
+#pragma once
+#include "basisu_astc_helpers.h"
+
+namespace basist
+{
+	struct astc_blk
+	{
+		uint8_t m_vals[16];
+	};
+
+	// ASTC_HDR_MAX_VAL is the maximum color component value that can be encoded.
+	// If the input has values higher than this, they need to be linearly scaled so all values are between [0,ASTC_HDR_MAX_VAL], and the linear scaling inverted in the shader.
+	const float ASTC_HDR_MAX_VAL = 65216.0f; // actually MAX_QLOG12_VAL
+
+	// Maximum usable QLOG encodings, and their floating point equivalent values, that don't result in NaN/Inf's.
+	const uint32_t MAX_QLOG7 = 123;
+	//const float MAX_QLOG7_VAL = 55296.0f;
+
+	const uint32_t MAX_QLOG8 = 247;
+	//const float MAX_QLOG8_VAL = 60416.0f;
+
+	const uint32_t MAX_QLOG9 = 495;
+	//const float MAX_QLOG9_VAL = 62976.0f;
+
+	const uint32_t MAX_QLOG10 = 991;
+	//const float MAX_QLOG10_VAL = 64256.0f;
+
+	const uint32_t MAX_QLOG11 = 1983;
+	//const float MAX_QLOG11_VAL = 64896.0f;
+
+	const uint32_t MAX_QLOG12 = 3967;
+	//const float MAX_QLOG12_VAL = 65216.0f;
+
+	const uint32_t MAX_QLOG16 = 63487;
+	const float MAX_QLOG16_VAL = 65504.0f;
+
+	const uint32_t NUM_MODE11_ENDPOINTS = 6, NUM_MODE7_ENDPOINTS = 4;
+
+	// Converts a 16-bit qlog value to half_float bits. Notes:
+	// qlog16_to_half(half_to_qlog16(half_val_as_int)) == half_val_as_int (is lossless)
+	// However, this is not lossless in the general sense.
+	inline half_float qlog16_to_half_slow(uint32_t qlog16)
+	{
+		assert(qlog16 <= 0xFFFF);
+
+		int C = qlog16;
+
+		int E = (C & 0xF800) >> 11; // top 5 bits of the 16-bit input
+		int M = C & 0x7FF; // low 11 bits of the 16-bit input
+
+		int Mt; // piecewise-linear remap of M; the three segments meet at M == 512 and M == 1536
+		if (M < 512)
+			Mt = 3 * M;
+		else if (M >= 1536)
+			Mt = 5 * M - 2048; // largest value is 5 * 2047 - 2048 = 8187
+		else
+			Mt = 4 * M - 512;
+
+		int Cf = (E << 10) + (Mt >> 3); // E occupies bits 10..14; Mt >> 3 is at most 1023, so it stays within the low 10 bits
+		return (half_float)Cf;
+	}
+
+	// Converts a qlog value of the given bit width (7-16) to half_float bits. This is not lossless
+	inline half_float qlog_to_half_slow(uint32_t qlog, uint32_t bits)
+	{
+		assert((bits >= 7U) && (bits <= 16U));
+		assert(qlog < (1U << bits));
+
+		int C = qlog << (16 - bits); // left-justify the value within 16 bits; the new low bits are zero
+		return qlog16_to_half_slow(C);
+	}
+
+	void astc_hdr_core_init();
+
+	void decode_mode7_to_qlog12_ise20(
+		const uint8_t* pEndpoints,
+		int e[2][3],
+		int* pScale);
+
+	bool decode_mode7_to_qlog12(
+		const uint8_t* pEndpoints,
+		int e[2][3],
+		int* pScale,
+		uint32_t ise_endpoint_range);
+
+	void decode_mode11_to_qlog12_ise20(
+		const uint8_t* pEndpoints,
+		int e[2][3]);
+
+	bool decode_mode11_to_qlog12(
+		const uint8_t* pEndpoints,
+		int e[2][3],
+		uint32_t ise_endpoint_range);
+
+	bool transcode_bc6h_1subset(half_float h_e[3][2], const astc_helpers::log_astc_block& best_blk, bc6h_block& transcoded_bc6h_blk);
+	bool transcode_bc6h_2subsets(uint32_t common_part_index, const astc_helpers::log_astc_block& best_blk, bc6h_block& transcoded_bc6h_blk);
+
+	bool astc_hdr_transcode_to_bc6h(const astc_blk& src_blk, bc6h_block& dst_blk);
+	bool astc_hdr_transcode_to_bc6h(const astc_helpers::log_astc_block& log_blk, bc6h_block& dst_blk);
+
+} // namespace basist
diff --git a/thirdparty/basis_universal/transcoder/basisu_astc_helpers.h b/thirdparty/basis_universal/transcoder/basisu_astc_helpers.h
new file mode 100644
index 000000000000..09a234b2ae1f
--- /dev/null
+++ b/thirdparty/basis_universal/transcoder/basisu_astc_helpers.h
@@ -0,0 +1,3587 @@
+// basisu_astc_helpers.h
+// Be sure to define ASTC_HELPERS_IMPLEMENTATION somewhere to get the implementation, otherwise you only get the header.
+#pragma once
+#ifndef BASISU_ASTC_HELPERS_HEADER
+#define BASISU_ASTC_HELPERS_HEADER
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <math.h>
+#include <fenv.h>
+
+namespace astc_helpers
+{
+	const uint32_t MAX_WEIGHT_VALUE = 64; // grid texel weights must range from [0,64]
+	const uint32_t MIN_GRID_DIM = 2; // the minimum dimension of a block's weight grid
+	const uint32_t MIN_BLOCK_DIM = 4, MAX_BLOCK_DIM = 12; // the valid block dimensions in texels
+	const uint32_t MAX_GRID_WEIGHTS = 64; // a block may have a maximum of 64 weight grid values
+
+	static const uint32_t NUM_ASTC_BLOCK_SIZES = 14;
+	extern const uint8_t g_astc_block_sizes[NUM_ASTC_BLOCK_SIZES][2];
+
+	// The Color Endpoint Modes (CEM's)
+	enum cems
+	{
+		CEM_LDR_LUM_DIRECT = 0,
+		CEM_LDR_LUM_BASE_PLUS_OFS = 1,
+		CEM_HDR_LUM_LARGE_RANGE = 2,
+		CEM_HDR_LUM_SMALL_RANGE = 3,
+		CEM_LDR_LUM_ALPHA_DIRECT = 4,
+		CEM_LDR_LUM_ALPHA_BASE_PLUS_OFS = 5,
+		CEM_LDR_RGB_BASE_SCALE = 6,
+		CEM_HDR_RGB_BASE_SCALE = 7,
+		CEM_LDR_RGB_DIRECT = 8,
+		CEM_LDR_RGB_BASE_PLUS_OFFSET = 9,
+		CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A = 10,
+		CEM_HDR_RGB = 11,
+		CEM_LDR_RGBA_DIRECT = 12,
+		CEM_LDR_RGBA_BASE_PLUS_OFFSET = 13,
+		CEM_HDR_RGB_LDR_ALPHA = 14,
+		CEM_HDR_RGB_HDR_ALPHA = 15
+	};
+
+	// All Bounded Integer Sequence Coding (BISE or ISE) ranges.
+	// Weights: Ranges [0,11] are valid.
+	// Endpoints: Ranges [4,20] are valid.
+	enum bise_levels
+	{
+		BISE_2_LEVELS = 0,
+		BISE_3_LEVELS = 1,
+		BISE_4_LEVELS = 2,
+		BISE_5_LEVELS = 3,
+		BISE_6_LEVELS = 4,
+		BISE_8_LEVELS = 5,
+		BISE_10_LEVELS = 6,
+		BISE_12_LEVELS = 7,
+		BISE_16_LEVELS = 8,
+		BISE_20_LEVELS = 9,
+		BISE_24_LEVELS = 10,
+		BISE_32_LEVELS = 11,
+		BISE_40_LEVELS = 12,
+		BISE_48_LEVELS = 13,
+		BISE_64_LEVELS = 14,
+		BISE_80_LEVELS = 15,
+		BISE_96_LEVELS = 16,
+		BISE_128_LEVELS = 17,
+		BISE_160_LEVELS = 18,
+		BISE_192_LEVELS = 19,
+		BISE_256_LEVELS = 20
+	};
+
+	const uint32_t TOTAL_ISE_RANGES = 21;
+
+	// Valid endpoint ISE ranges
+	const uint32_t FIRST_VALID_ENDPOINT_ISE_RANGE = BISE_6_LEVELS; // 4
+	const uint32_t LAST_VALID_ENDPOINT_ISE_RANGE = BISE_256_LEVELS; // 20
+	const uint32_t TOTAL_ENDPOINT_ISE_RANGES = LAST_VALID_ENDPOINT_ISE_RANGE - FIRST_VALID_ENDPOINT_ISE_RANGE + 1;
+
+	// Valid weight ISE ranges
+	const uint32_t FIRST_VALID_WEIGHT_ISE_RANGE = BISE_2_LEVELS; // 0
+	const uint32_t LAST_VALID_WEIGHT_ISE_RANGE = BISE_32_LEVELS; // 11
+	const uint32_t TOTAL_WEIGHT_ISE_RANGES = LAST_VALID_WEIGHT_ISE_RANGE - FIRST_VALID_WEIGHT_ISE_RANGE + 1;
+
+	// The ISE range table.
+	extern const int8_t g_ise_range_table[TOTAL_ISE_RANGES][3]; // 0=bits (0 to 8), 1=trits (0 or 1), 2=quints (0 or 1)
+
+	// Possible Color Component Select values, used in dual plane mode. 
+	// The CCS component will be interpolated using the 2nd weight plane.
+	enum ccs
+	{
+		CCS_GBA_R = 0,
+		CCS_RBA_G = 1,
+		CCS_RGA_B = 2,
+		CCS_RGB_A = 3
+	};
+		
+	struct astc_block
+	{
+		uint32_t m_vals[4];
+	};
+
+	const uint32_t MAX_PARTITIONS = 4;				// Max # of partitions or subsets for single plane mode
+	const uint32_t MAX_DUAL_PLANE_PARTITIONS = 3;	// Max # of partitions or subsets for dual plane mode
+	const uint32_t NUM_PARTITION_PATTERNS = 1024;	// Total # of partition pattern seeds (10-bits)
+	const uint32_t MAX_ENDPOINTS = 18;				// Maximum # of endpoint values in a block
+
+	struct log_astc_block
+	{
+		bool m_error_flag;
+		
+		bool m_solid_color_flag_ldr, m_solid_color_flag_hdr;
+		uint16_t m_solid_color[4];
+
+		// Rest is only valid if !m_solid_color_flag_ldr && !m_solid_color_flag_hdr
+		uint32_t m_grid_width, m_grid_height;	// weight grid dimensions, not the dimension of the block
+		
+		bool m_dual_plane;
+
+		uint32_t m_weight_ise_range;			// 0-11
+		uint32_t m_endpoint_ise_range;			// 4-20, this is actually inferred from the size of the other config bits+weights, but this is here for checking
+
+		uint32_t m_color_component_selector;	// 0-3, 0=GBA R, 1=RBA G, 2=RGA B, 3=RGB A, only used in dual plane mode
+
+		uint32_t m_num_partitions;				// or the # of subsets, 1-4 (1-3 if dual plane mode)
+		uint32_t m_partition_id;				// 10-bits, must be 0 if m_num_partitions==1
+		
+		uint32_t m_color_endpoint_modes[MAX_PARTITIONS]; // each subset's CEM's
+		
+		// ISE weight grid values. In dual plane mode, the order is p0,p1,  p0,p1,  etc.
+		uint8_t m_weights[MAX_GRID_WEIGHTS];
+		
+		// ISE endpoint values
+		// Endpoint order examples:
+		// 1 subset LA : LL0 LH0 AL0 AH0
+		// 1 subset RGB : RL0 RH0 GL0 GH0 BL0 BH0
+		// 1 subset RGBA : RL0 RH0 GL0 GH0 BL0 BH0 AL0 AH0
+		// 2 subset LA : LL0 LH0 AL0 AH0 LL1 LH1 AL1 AH1
+		// 2 subset RGB : RL0 RH0 GL0 GH0 BL0 BH0 RL1 RH1 GL1 GH1 BL1 BH1
+		// 2 subset RGBA : RL0 RH0 GL0 GH0 BL0 BH0 AL0 AH0 RL1 RH1 GL1 GH1 BL1 BH1 AL1 AH1
+		uint8_t m_endpoints[MAX_ENDPOINTS];
+
+		void clear()
+		{
+			memset(this, 0, sizeof(*this));
+		}
+	};
+
+	// Half-open interval [l, h): asserts l <= v < h and returns v unchanged.
+	inline int bounds_check(int v, int l, int h) { (void)v; (void)l; (void)h; assert(v >= l && v < h); return v; }
+	inline uint32_t bounds_check(uint32_t v, uint32_t l, uint32_t h) { (void)v; (void)l; (void)h; assert(v >= l && v < h); return v; }
+
+	inline uint32_t get_bits(uint32_t val, int low, int high) // returns bits low..high (inclusive) of val, shifted down to bit 0
+	{
+		const int num_bits = (high - low) + 1;
+		assert((num_bits >= 1) && (num_bits <= 32));
+
+		val >>= low;
+		if (num_bits != 32) // a full 32-bit field needs no mask, and 1u << 32 would be undefined
+			val &= ((1u << num_bits) - 1);
+
+		return val;
+	}
+
+	// Returns the number of levels in the given ISE range.
+	inline uint32_t get_ise_levels(uint32_t ise_range) 
+	{ 
+		assert(ise_range < TOTAL_ISE_RANGES);
+		return (1 + 2 * g_ise_range_table[ise_range][1] + 4 * g_ise_range_table[ise_range][2]) << g_ise_range_table[ise_range][0]; // (1, 3 for a trit range, or 5 for a quint range) << bits
+	}
+
+	inline int get_ise_sequence_bits(int count, int range) // total bits needed to store count values in the given ISE range
+	{
+		// See 18.22 Data Size Determination
+		int total_bits = g_ise_range_table[range][0] * count; // plain bits: table column 0 bits for every value
+		total_bits += (g_ise_range_table[range][1] * 8 * count + 4) / 5; // trit ranges add ceil(8 * count / 5) bits; adds 0 otherwise
+		total_bits += (g_ise_range_table[range][2] * 7 * count + 2) / 3; // quint ranges add ceil(7 * count / 3) bits; adds 0 otherwise
+		return total_bits;
+	}
+		
+	inline uint32_t weight_interpolate(uint32_t l, uint32_t h, uint32_t w) // blends l (w == 0) towards h (w == 64)
+	{
+		assert(w <= MAX_WEIGHT_VALUE);
+		return (l * (64 - w) + h * w + 32) >> 6; // the two factors sum to 64; +32 rounds to nearest before the divide by 64
+	}
+
+	void encode_bise(uint32_t* pDst, const uint8_t* pSrc_vals, uint32_t bit_pos, int num_vals, int range);
+
+	// Packs a logical to physical ASTC block. Note this does not validate the block's dimensions (use is_valid_block_size()), just the grid dimensions.
+	bool pack_astc_block(astc_block &phys_block, const log_astc_block& log_block, int* pExpected_endpoint_range = nullptr);
+
+	// Pack LDR void extent (really solid color) blocks. For LDR, pass in (val | (val << 8)) for each component.
+	void pack_void_extent_ldr(astc_block& blk, uint16_t r, uint16_t g, uint16_t b, uint16_t a);
+
+	// Pack HDR void extent (16-bit values are FP16/half floats - no NaN/Inf's)
+	void pack_void_extent_hdr(astc_block& blk, uint16_t rh, uint16_t gh, uint16_t bh, uint16_t ah);
+
+	// These helpers are all quite slow, but are useful for table preparation.
+	
+	// Dequantizes ISE encoded endpoint val to [0,255]
+	uint32_t dequant_bise_endpoint(uint32_t val, uint32_t ise_range); // ISE ranges 4-11 - NOTE(review): other comments in this header give [4,20] as the valid endpoint ranges; confirm against the implementation
+		
+	// Dequantizes ISE encoded weight val to [0,64]
+	uint32_t dequant_bise_weight(uint32_t val, uint32_t ise_range); // ISE ranges 0-10 - NOTE(review): other comments in this header give [0,11] as the valid weight ranges; confirm against the implementation
+
+	uint32_t find_nearest_bise_endpoint(int v, uint32_t ise_range);
+	uint32_t find_nearest_bise_weight(int v, uint32_t ise_range);
+
+	void create_quant_tables(
+		uint8_t* pVal_to_ise,	// [0-255] or [0-64] value to nearest ISE symbol, array size is [256] or [65]
+		uint8_t* pISE_to_val,	// ASTC encoded ISE symbol to [0,255] or [0,64] value, [levels]
+		uint8_t* pISE_to_rank,	// returns the level rank index given an ISE symbol, [levels]
+		uint8_t* pRank_to_ISE,  // returns the ISE symbol given a level rank, inverse of pISE_to_rank, [levels]
+		uint32_t ise_range,		// ise range, [4,20] for endpoints, [0,11] for weights
+		bool weight_flag);		// false if block endpoints, true if weights
+
+	// True if the CEM is LDR.
+	bool is_cem_ldr(uint32_t mode);
+	inline bool is_cem_hdr(uint32_t mode) { return !is_cem_ldr(mode); }
+
+	// True if the passed in dimensions are a valid ASTC block size. There are 14 supported configs, from 4x4 (8bpp) to 12x12 (.89bpp).
+	bool is_valid_block_size(uint32_t w, uint32_t h);
+
+	bool block_has_any_hdr_cems(const log_astc_block& log_blk);
+	bool block_has_any_ldr_cems(const log_astc_block& log_blk);
+	
+	// Returns the # of endpoint values for the given CEM.
+	inline uint32_t get_num_cem_values(uint32_t cem) { assert(cem <= 15); return 2 + 2 * (cem >> 2); }
+
+	struct dequant_table
+	{
+		basisu::vector<uint8_t> m_val_to_ise;	// [0-255] or [0-64] value to nearest ISE symbol, array size is [256] or [65]
+		basisu::vector<uint8_t> m_ISE_to_val;	// ASTC encoded ISE symbol to [0,255] or [0,64] value, [levels]
+		basisu::vector<uint8_t> m_ISE_to_rank;	// returns the level rank index given an ISE symbol, [levels]
+		basisu::vector<uint8_t> m_rank_to_ISE;  // returns the ISE symbol given a level rank, inverse of pISE_to_rank, [levels]		
+
+		void init(bool weight_flag, uint32_t num_levels, bool init_rank_tabs)
+		{
+			m_val_to_ise.resize(weight_flag ? (MAX_WEIGHT_VALUE + 1) : 256);
+			m_ISE_to_val.resize(num_levels);
+			if (init_rank_tabs)
+			{
+				m_ISE_to_rank.resize(num_levels);
+				m_rank_to_ISE.resize(num_levels);
+			}
+		}
+	};
+
+	struct dequant_tables
+	{
+		dequant_table m_weights[TOTAL_WEIGHT_ISE_RANGES];
+		dequant_table m_endpoints[TOTAL_ENDPOINT_ISE_RANGES];
+
+		const dequant_table& get_weight_tab(uint32_t range) const
+		{
+			assert((range >= FIRST_VALID_WEIGHT_ISE_RANGE) && (range <= LAST_VALID_WEIGHT_ISE_RANGE));
+			return m_weights[range - FIRST_VALID_WEIGHT_ISE_RANGE];
+		}
+
+		dequant_table& get_weight_tab(uint32_t range)
+		{
+			assert((range >= FIRST_VALID_WEIGHT_ISE_RANGE) && (range <= LAST_VALID_WEIGHT_ISE_RANGE));
+			return m_weights[range - FIRST_VALID_WEIGHT_ISE_RANGE];
+		}
+
+		const dequant_table& get_endpoint_tab(uint32_t range) const
+		{
+			assert((range >= FIRST_VALID_ENDPOINT_ISE_RANGE) && (range <= LAST_VALID_ENDPOINT_ISE_RANGE));
+			return m_endpoints[range - FIRST_VALID_ENDPOINT_ISE_RANGE];
+		}
+
+		dequant_table& get_endpoint_tab(uint32_t range)
+		{
+			assert((range >= FIRST_VALID_ENDPOINT_ISE_RANGE) && (range <= LAST_VALID_ENDPOINT_ISE_RANGE));
+			return m_endpoints[range - FIRST_VALID_ENDPOINT_ISE_RANGE];
+		}
+
+		void init(bool init_rank_tabs)
+		{
+			for (uint32_t range = FIRST_VALID_WEIGHT_ISE_RANGE; range <= LAST_VALID_WEIGHT_ISE_RANGE; range++)
+			{
+				const uint32_t num_levels = get_ise_levels(range);
+				dequant_table& tab = get_weight_tab(range);
+
+				tab.init(true, num_levels, init_rank_tabs);
+
+				create_quant_tables(tab.m_val_to_ise.data(), tab.m_ISE_to_val.data(), init_rank_tabs ? tab.m_ISE_to_rank.data() : nullptr, init_rank_tabs ? tab.m_rank_to_ISE.data() : nullptr, range, true);
+			}
+
+			for (uint32_t range = FIRST_VALID_ENDPOINT_ISE_RANGE; range <= LAST_VALID_ENDPOINT_ISE_RANGE; range++)
+			{
+				const uint32_t num_levels = get_ise_levels(range);
+				dequant_table& tab = get_endpoint_tab(range);
+
+				tab.init(false, num_levels, init_rank_tabs);
+
+				create_quant_tables(tab.m_val_to_ise.data(), tab.m_ISE_to_val.data(), init_rank_tabs ? tab.m_ISE_to_rank.data() : nullptr, init_rank_tabs ? tab.m_rank_to_ISE.data() : nullptr, range, false);
+			}
+		}
+	};
+
+	extern dequant_tables g_dequant_tables;
+	void init_tables(bool init_rank_tabs);
+		
+	// Procedurally returns the texel partition/subset index given the block coordinate and config.
+	int compute_texel_partition(uint32_t seedIn, uint32_t xIn, uint32_t yIn, uint32_t zIn, int num_partitions, bool small_block);
+		
+	void blue_contract(
+		int r, int g, int b, int a,
+		int& dr, int& dg, int& db, int& da);
+
+	void bit_transfer_signed(int& a, int& b);
+
+	void decode_endpoint(uint32_t cem_index, int (*pEndpoints)[2], const uint8_t* pE);
+
+	typedef uint16_t half_float;
+	half_float float_to_half(float val, bool toward_zero);
+	float half_to_float(half_float hval);
+
+	const int MAX_RGB9E5 = 0xff80;
+	void unpack_rgb9e5(uint32_t packed, float& r, float& g, float& b);
+	uint32_t pack_rgb9e5(float r, float g, float b);
+	
+	enum decode_mode
+	{
+		cDecodeModeSRGB8 = 0,	// returns uint8_t's, not valid on HDR blocks
+		cDecodeModeLDR8 = 1,	// returns uint8_t's, not valid on HDR blocks
+		cDecodeModeHDR16 = 2,   // returns uint16_t's (half floats), valid on all LDR/HDR blocks
+		cDecodeModeRGB9E5 = 3	// returns uint32_t's, packed as RGB 9E5 (shared exponent), see https://registry.khronos.org/OpenGL/extensions/EXT/EXT_texture_shared_exponent.txt
+	};
+
+	// Decodes logical block to output pixels.
+	// pPixels must point to either 32-bit pixel values (SRGB8/LDR8/9E5) or 64-bit pixel values (HDR16)
+	bool decode_block(const log_astc_block& log_blk, void* pPixels, uint32_t blk_width, uint32_t blk_height, decode_mode dec_mode);
+
+	void decode_bise(uint32_t ise_range, uint8_t* pVals, uint32_t num_vals, const uint8_t *pBits128, uint32_t bit_ofs);
+
+	// Unpack a physical ASTC encoded GPU texture block to a logical block description.
+	bool unpack_block(const void* pASTC_block, log_astc_block& log_blk, uint32_t blk_width, uint32_t blk_height);
+					
+} // namespace astc_helpers
+
+#endif // BASISU_ASTC_HELPERS_HEADER
+
+//------------------------------------------------------------------
+
+#ifdef BASISU_ASTC_HELPERS_IMPLEMENTATION
+
+namespace astc_helpers
+{
+	template<typename T> inline T my_min(T a, T b) { return (a < b) ? a : b; }
+	template<typename T> inline T my_max(T a, T b) { return (a > b) ? a : b; }
+
+	const uint8_t g_astc_block_sizes[NUM_ASTC_BLOCK_SIZES][2] = { 
+		{ 4, 4 }, { 5, 4 }, { 5, 5 }, { 6, 5 }, 
+		{ 6, 6 }, { 8, 5 }, { 8, 6 }, { 10, 5 }, 
+		{ 10, 6 }, { 8, 8 }, { 10, 8 }, { 10, 10 }, 
+		{ 12, 10 }, { 12, 12 } 
+	};
+
+	const int8_t g_ise_range_table[TOTAL_ISE_RANGES][3] =
+	{
+		//b  t  q
+		//2  3  5	 // rng  ise_index	notes
+		{ 1, 0, 0 }, // 0..1 0
+		{ 0, 1, 0 }, // 0..2 1
+		{ 2, 0, 0 }, // 0..3 2
+		{ 0, 0, 1 }, // 0..4 3
+		{ 1, 1, 0 }, // 0..5 4			min endpoint ISE index
+		{ 3, 0, 0 }, // 0..7 5
+		{ 1, 0, 1 }, // 0..9 6
+		{ 2, 1, 0 }, // 0..11 7
+		{ 4, 0, 0 }, // 0..15 8
+		{ 2, 0, 1 }, // 0..19 9
+		{ 3, 1, 0 }, // 0..23 10
+		{ 5, 0, 0 }, // 0..31 11		max weight ISE index
+		{ 3, 0, 1 }, // 0..39 12
+		{ 4, 1, 0 }, // 0..47 13
+		{ 6, 0, 0 }, // 0..63 14
+		{ 4, 0, 1 }, // 0..79 15
+		{ 5, 1, 0 }, // 0..95 16
+		{ 7, 0, 0 }, // 0..127 17
+		{ 5, 0, 1 }, // 0..159 18
+		{ 6, 1, 0 }, // 0..191 19
+		{ 8, 0, 0 }, // 0..255 20
+	};
+		
+	static inline void astc_set_bits_1_to_9(uint32_t* pDst, uint32_t& bit_offset, uint32_t code, uint32_t codesize) // ORs a code of up to 9 bits into pDst at bit_offset, then advances bit_offset by codesize
+	{
+		uint8_t* pBuf = reinterpret_cast<uint8_t*>(pDst);
+
+		assert(codesize <= 9);
+		if (codesize)
+		{
+			uint32_t byte_bit_offset = bit_offset & 7;
+			uint32_t val = code << byte_bit_offset; // assuming code fits in codesize bits, this spans at most 9 + 7 = 16 bits, i.e. two bytes
+
+			uint32_t index = bit_offset >> 3;
+			pBuf[index] |= (uint8_t)val;
+
+			if (codesize > (8 - byte_bit_offset)) // the code does not fit in the rest of this byte, so it spills into the next one
+				pBuf[index + 1] |= (uint8_t)(val >> 8);
+
+			bit_offset += codesize;
+		}
+	}
+
+	static inline uint32_t astc_extract_bits(uint32_t bits, int low, int high) // returns bits low..high (inclusive) of bits, shifted down to bit 0
+	{
+		return (bits >> low) & ((1 << (high - low + 1)) - 1); // the mask is built from a signed 1, so this is only meant for narrow fields; the calls visible in this file extract 1 to 3 bits
+	}
+
+	// ORs the low total_bits bits of value into pOutput one byte at a time (endian safe) and advances bit_pos by total_bits.
+	static inline void astc_set_bits(uint32_t* pOutput, uint32_t& bit_pos, uint32_t value, uint32_t total_bits)
+	{
+		assert(total_bits <= 31);
+		assert(value < (1u << total_bits));
+
+		uint8_t* pBytes = reinterpret_cast<uint8_t*>(pOutput);
+
+		while (total_bits)
+		{
+			const uint32_t bits_to_write = my_min<int>(total_bits, 8 - (bit_pos & 7)); // never write past the end of the current byte
+
+			pBytes[bit_pos >> 3] |= static_cast<uint8_t>(value << (bit_pos & 7)); // bits that do not fit are dropped by the cast and written on a later iteration
+
+			bit_pos += bits_to_write;
+			total_bits -= bits_to_write;
+			value >>= bits_to_write;
+		}
+	}
+
+	static const uint8_t g_astc_quint_encode[125] =
+	{
+		0, 1, 2, 3, 4, 8, 9, 10, 11, 12, 16, 17, 18, 19, 20, 24, 25, 26, 27, 28, 5, 13, 21, 29, 6, 32, 33, 34, 35, 36, 40, 41, 42, 43, 44, 48, 49, 50, 51, 52, 56, 57,
+		58, 59, 60, 37, 45, 53, 61, 14, 64, 65, 66, 67, 68, 72, 73, 74, 75, 76, 80, 81, 82, 83, 84, 88, 89, 90, 91, 92, 69, 77, 85, 93, 22, 96, 97, 98, 99, 100, 104,
+		105, 106, 107, 108, 112, 113, 114, 115, 116, 120, 121, 122, 123, 124, 101, 109, 117, 125, 30, 102, 103, 70, 71, 38, 110, 111, 78, 79, 46, 118, 119, 86, 87, 54,
+		126, 127, 94, 95, 62, 39, 47, 55, 63, 7 /*31 - results in the same decode as 7*/
+	};
+
+	// Encodes 3 values to output, usable for any range that uses quints and bits. n is the number of plain bits per value; bit_pos advances by 7 + n * 3.
+	static inline void astc_encode_quints(uint32_t* pOutput, const uint8_t* pValues, uint32_t& bit_pos, int n)
+	{
+		// First extract the quints and the bits from the 3 input values
+		int quints = 0, bits[3];
+		const uint32_t bit_mask = (1 << n) - 1;
+		for (int i = 0; i < 3; i++)
+		{
+			static const int s_muls[3] = { 1, 5, 25 }; // base-5 digit weights
+
+			const int t = pValues[i] >> n; // quint digit: the part of the value above its n plain bits
+
+			quints += t * s_muls[i]; // accumulates the index into g_astc_quint_encode
+			bits[i] = pValues[i] & bit_mask;
+		}
+
+		// Encode the quints, by inverting the bit manipulations done by the decoder, converting 3 quints into 7-bits.
+		// See https://www.khronos.org/registry/DataFormat/specs/1.2/dataformat.1.2.html#astc-integer-sequence-encoding
+
+		assert(quints < 125);
+		const int T = g_astc_quint_encode[quints];
+
+		// Now interleave the 7 encoded quint bits with the bits to form the encoded output. See table 95-96.
+		astc_set_bits(pOutput, bit_pos, bits[0] | (astc_extract_bits(T, 0, 2) << n) | (bits[1] << (3 + n)) | (astc_extract_bits(T, 3, 4) << (3 + n * 2)) |
+			(bits[2] << (5 + n * 2)) | (astc_extract_bits(T, 5, 6) << (5 + n * 3)), 7 + n * 3);
+	}
+
+	static const uint8_t g_astc_trit_encode[243] = { 0, 1, 2, 4, 5, 6, 8, 9, 10, 16, 17, 18, 20, 21, 22, 24, 25, 26, 3, 7, 11, 19, 23, 27, 12, 13, 14, 32, 33, 34, 36, 37, 38, 40, 41, 42, 48, 49, 50, 52, 53, 54, 56, 57, 58, 35, 39,
+		43, 51, 55, 59, 44, 45, 46, 64, 65, 66, 68, 69, 70, 72, 73, 74, 80, 81, 82, 84, 85, 86, 88, 89, 90, 67, 71, 75, 83, 87, 91, 76, 77, 78, 128, 129, 130, 132, 133, 134, 136, 137, 138, 144, 145, 146, 148, 149, 150, 152, 153, 154,
+		131, 135, 139, 147, 151, 155, 140, 141, 142, 160, 161, 162, 164, 165, 166, 168, 169, 170, 176, 177, 178, 180, 181, 182, 184, 185, 186, 163, 167, 171, 179, 183, 187, 172, 173, 174, 192, 193, 194, 196, 197, 198, 200, 201, 202,
+		208, 209, 210, 212, 213, 214, 216, 217, 218, 195, 199, 203, 211, 215, 219, 204, 205, 206, 96, 97, 98, 100, 101, 102, 104, 105, 106, 112, 113, 114, 116, 117, 118, 120, 121, 122, 99, 103, 107, 115, 119, 123, 108, 109, 110, 224,
+		225, 226, 228, 229, 230, 232, 233, 234, 240, 241, 242, 244, 245, 246, 248, 249, 250, 227, 231, 235, 243, 247, 251, 236, 237, 238, 28, 29, 30, 60, 61, 62, 92, 93, 94, 156, 157, 158, 188, 189, 190, 220, 221, 222, 31, 63, 95, 159,
+		191, 223, 124, 125, 126 };
+
+	// Encodes 5 values to output, usable for any range that uses trits and bits. n is the number of plain bits per value; bit_pos advances by 8 + n * 5.
+	static void astc_encode_trits(uint32_t* pOutput, const uint8_t* pValues, uint32_t& bit_pos, int n)
+	{
+		// First extract the trits and the bits from the 5 input values
+		int trits = 0, bits[5];
+		const uint32_t bit_mask = (1 << n) - 1;
+		for (int i = 0; i < 5; i++)
+		{
+			static const int s_muls[5] = { 1, 3, 9, 27, 81 }; // base-3 digit weights
+
+			const int t = pValues[i] >> n; // trit digit: the part of the value above its n plain bits
+
+			trits += t * s_muls[i]; // accumulates the index into g_astc_trit_encode
+			bits[i] = pValues[i] & bit_mask;
+		}
+
+		// Encode the trits, by inverting the bit manipulations done by the decoder, converting 5 trits into 8-bits.
+		// See https://www.khronos.org/registry/DataFormat/specs/1.2/dataformat.1.2.html#astc-integer-sequence-encoding
+
+		assert(trits < 243);
+		const int T = g_astc_trit_encode[trits];
+
+		// Now interleave the 8 encoded trit bits with the bits to form the encoded output. See table 94.
+		astc_set_bits(pOutput, bit_pos, bits[0] | (astc_extract_bits(T, 0, 1) << n) | (bits[1] << (2 + n)), n * 2 + 2); // written in two calls: 8 + n * 5 bits can exceed the 31-bit limit asserted by astc_set_bits
+
+		astc_set_bits(pOutput, bit_pos, astc_extract_bits(T, 2, 3) | (bits[2] << 2) | (astc_extract_bits(T, 4, 4) << (2 + n)) | (bits[3] << (3 + n)) | (astc_extract_bits(T, 5, 6) << (3 + n * 2)) |
+			(bits[4] << (5 + n * 2)) | (astc_extract_bits(T, 7, 7) << (5 + n * 3)), n * 3 + 6);
+	}
+
+	// Packs num_vals values using ASTC's BISE, ORing them into pDst starting at bit_pos (bit_pos is passed by value, so the caller's copy is not advanced).
+	void encode_bise(uint32_t* pDst, const uint8_t* pSrc_vals, uint32_t bit_pos, int num_vals, int range)
+	{
+		uint32_t temp[5] = { 0 }; // scratch output; one dword larger than the four dwords merged into pDst below
+
+		const int num_bits = g_ise_range_table[range][0];
+
+		int group_size = 0;
+		if (g_ise_range_table[range][1])
+			group_size = 5; // trit range: values are packed 5 at a time
+		else if (g_ise_range_table[range][2])
+			group_size = 3; // quint range: values are packed 3 at a time
+
+#ifndef NDEBUG
+		const uint32_t num_levels = get_ise_levels(range);
+		for (int i = 0; i < num_vals; i++)
+		{
+			assert(pSrc_vals[i] < num_levels);
+		}
+#endif
+
+		if (group_size)
+		{
+			// Range has trits or quints - pack each group of 5 or 3 values 
+			const int total_groups = (group_size == 5) ? ((num_vals + 4) / 5) : ((num_vals + 2) / 3);
+
+			for (int group_index = 0; group_index < total_groups; group_index++)
+			{
+				uint8_t vals[5] = { 0 }; // a short final group is padded with zeros
+
+				const int limit = my_min(group_size, num_vals - group_index * group_size);
+				for (int i = 0; i < limit; i++)
+					vals[i] = pSrc_vals[group_index * group_size + i];
+
+				if (group_size == 5)
+					astc_encode_trits(temp, vals, bit_pos, num_bits);
+				else
+					astc_encode_quints(temp, vals, bit_pos, num_bits);
+			}
+		}
+		else
+		{
+			for (int i = 0; i < num_vals; i++)
+				astc_set_bits_1_to_9(temp, bit_pos, pSrc_vals[i], num_bits); // bits-only range: each value is written directly
+		}
+
+		// TODO: Could this write too many bits on incomplete blocks?
+		pDst[0] |= temp[0]; pDst[1] |= temp[1]; // merged with OR, so bits already set in pDst are kept
+		pDst[2] |= temp[2]; pDst[3] |= temp[3];
+	}
+
+	inline uint32_t rev_dword(uint32_t bits) // reverses the order of all 32 bits
+	{
+		uint32_t v = (bits << 16) | (bits >> 16); // swap 16-bit halves, then bytes, nibbles, bit pairs and single bits
+		v = ((v & 0x00ff00ff) << 8) | ((v & 0xff00ff00) >> 8); v = ((v & 0x0f0f0f0f) << 4) | ((v & 0xf0f0f0f0) >> 4);
+		v = ((v & 0x33333333) << 2) | ((v & 0xcccccccc) >> 2); v = ((v & 0x55555555) << 1) | ((v & 0xaaaaaaaa) >> 1);
+		return v;
+	}
+
+	static inline bool is_packable(int value, int num_bits) { assert((num_bits >= 1) && (num_bits < 31)); return (value >= 0) && (value < (1 << num_bits)); } // true when value fits in num_bits unsigned bits
+
+	static bool get_config_bits(const log_astc_block &log_block, uint32_t &config_bits) // builds the block-mode config bits from the weight grid size, weight ISE range and dual-plane flag; returns false if no table row fits
+	{
+		config_bits = 0;
+
+		const int W = log_block.m_grid_width, H = log_block.m_grid_height;
+
+		const uint32_t P = log_block.m_weight_ise_range >= 6; // high precision
+		const uint32_t Dp_P = (log_block.m_dual_plane << 1) | P; // pack dual plane+high precision bits
+		
+		// See Tables 81-82
+		// Compute p from weight range
+		uint32_t p = 2 + log_block.m_weight_ise_range - (P ? 6 : 0); // 2..7, assuming the weight range is within its documented 0-11
+		
+		// Rearrange p's bits to p0 p2 p1
+		p = (p >> 1) + ((p & 1) << 2);
+		
+		// Try encoding each row of table 82.
+
+		// W+4 H+2
+		if (is_packable(W - 4, 2) && is_packable(H - 2, 2))
+		{
+			config_bits = (Dp_P << 9) | ((W - 4) << 7) | ((H - 2) << 5) | ((p & 4) << 2) | (p & 3); // p's top bit goes to bit 4, its low two bits to bits 0..1
+			return true;
+		}
+
+		// W+8 H+2
+		if (is_packable(W - 8, 2) && is_packable(H - 2, 2))
+		{
+			config_bits = (Dp_P << 9) | ((W - 8) << 7) | ((H - 2) << 5) | ((p & 4) << 2) | 4 | (p & 3);
+			return true;
+		}
+
+		// W+2 H+8
+		if (is_packable(W - 2, 2) && is_packable(H - 8, 2))
+		{
+			config_bits = (Dp_P << 9) | ((H - 8) << 7) | ((W - 2) << 5) | ((p & 4) << 2) | 8 | (p & 3);
+			return true;
+		}
+
+		// W+2 H+6
+		if (is_packable(W - 2, 2) && is_packable(H - 6, 1))
+		{
+			config_bits = (Dp_P << 9) | ((H - 6) << 7) | ((W - 2) << 5) | ((p & 4) << 2) | 12 | (p & 3); // H - 6 is 0 or 1, so bit 8 stays clear
+			return true;
+		}
+
+		// W+2 H+2
+		if (is_packable(W - 2, 1) && is_packable(H - 2, 2))
+		{
+			config_bits = (Dp_P << 9) | ((W) << 7) | ((H - 2) << 5) | ((p & 4) << 2) | 12 | (p & 3); // W is 2 or 3 here, so W << 7 sets bit 8 and leaves (W - 2) in bit 7
+			return true;
+		}
+				
+		// 12 H+2
+		if ((W == 12) && is_packable(H - 2, 2))
+		{
+			config_bits = (Dp_P << 9) | ((H - 2) << 5) | (p << 2); // in this and the following rows all three bits of p sit at bits 2..4
+			return true;
+		}
+
+		// W+2 12
+		if ((H == 12) && is_packable(W - 2, 2))
+		{
+			config_bits = (Dp_P << 9) | (1 << 7) | ((W - 2) << 5) | (p << 2);
+			return true;
+		}
+
+		// 6 10
+		if ((W == 6) && (H == 10))
+		{
+			config_bits = (Dp_P << 9) | (3 << 7) | (p << 2);
+			return true;
+		}
+
+		// 10 6
+		if ((W == 10) && (H == 6))
+		{
+			config_bits = (Dp_P << 9) | (0b1101 << 5) | (p << 2);
+			return true;
+		}
+				
+		// W+6 H+6 (no dual plane or high prec)
+		if ((!Dp_P) && is_packable(W - 6, 2) && is_packable(H - 6, 2))
+		{
+			config_bits = ((H - 6) << 9) | 256 | ((W - 6) << 5) | (p << 2); // bits 9..10 carry H - 6 here instead of the dual plane/precision flags
+			return true;
+		}
+
+		// Failed: unsupported weight grid dimensions or config.
+		return false;
+	}
+
+	// Packs a logical ASTC block description into a physical block.
+	//
+	// phys_block is always cleared first. Solid color blocks are written through
+	// pack_void_extent_ldr()/pack_void_extent_hdr(). All other blocks are validated and then
+	// written as: config bits, partition count, (partition id + CEM info or a single CEM),
+	// endpoints packed forwards from the current bit position, and weights packed separately
+	// and OR'd in bit-reversed from the opposite end of the block.
+	//
+	// Returns false if the logical block cannot be encoded. If pExpected_endpoint_range is
+	// non-null it is set to -1 on entry; when the only problem is that
+	// log_block.m_endpoint_ise_range differs from the range computed here, it receives the
+	// computed range before false is returned.
+	bool pack_astc_block(astc_block& phys_block, const log_astc_block& log_block, int* pExpected_endpoint_range)
+	{
+		memset(&phys_block, 0, sizeof(phys_block));
+
+		if (pExpected_endpoint_range)
+			*pExpected_endpoint_range = -1;
+
+		// Asserts in debug builds, fails gracefully in release builds.
+		assert(!log_block.m_error_flag);
+		if (log_block.m_error_flag)
+			return false;
+				
+		if (log_block.m_solid_color_flag_ldr)
+		{
+			pack_void_extent_ldr(phys_block, log_block.m_solid_color[0], log_block.m_solid_color[1], log_block.m_solid_color[2], log_block.m_solid_color[3]);
+			return true;
+		}
+		else if (log_block.m_solid_color_flag_hdr)
+		{
+			pack_void_extent_hdr(phys_block, log_block.m_solid_color[0], log_block.m_solid_color[1], log_block.m_solid_color[2], log_block.m_solid_color[3]);
+			return true;
+		}
+				
+		if ((log_block.m_num_partitions < 1) || (log_block.m_num_partitions > MAX_PARTITIONS))
+			return false;
+
+		// Max usable weight range is 11
+		if (log_block.m_weight_ise_range > LAST_VALID_WEIGHT_ISE_RANGE)
+			return false;
+
+		// See 23.24 Illegal Encodings, [0,5] is the minimum ISE encoding for endpoints
+		if ((log_block.m_endpoint_ise_range < FIRST_VALID_ENDPOINT_ISE_RANGE) || (log_block.m_endpoint_ise_range > LAST_VALID_ENDPOINT_ISE_RANGE))
+			return false;
+
+		if (log_block.m_color_component_selector > 3)
+			return false;
+				
+		uint32_t config_bits = 0;
+		if (!get_config_bits(log_block, config_bits))
+			return false;
+
+		// bit_pos is passed to every forward write below and is reused afterwards as the count
+		// of config bits written so far.
+		uint32_t bit_pos = 0;
+		astc_set_bits(&phys_block.m_vals[0], bit_pos, config_bits, 11);
+
+		const uint32_t total_grid_weights = (log_block.m_dual_plane ? 2 : 1) * (log_block.m_grid_width * log_block.m_grid_height);
+		const uint32_t total_weight_bits = get_ise_sequence_bits(total_grid_weights, log_block.m_weight_ise_range);
+
+		// 18.24 Illegal Encodings
+		if ((!total_grid_weights) || (total_grid_weights > MAX_GRID_WEIGHTS) || (total_weight_bits < 24) || (total_weight_bits > 96))
+			return false;
+
+		// Bits stored just below the weight data (extra CEM bits and the dual plane selector).
+		uint32_t total_extra_bits = 0;
+
+		astc_set_bits(&phys_block.m_vals[0], bit_pos, log_block.m_num_partitions - 1, 2);
+
+		if (log_block.m_num_partitions > 1)
+		{
+			if (log_block.m_partition_id >= NUM_PARTITION_PATTERNS)
+				return false;
+
+			astc_set_bits(&phys_block.m_vals[0], bit_pos, log_block.m_partition_id, 10);
+
+			uint32_t highest_cem = 0, lowest_cem = UINT32_MAX;
+			for (uint32_t j = 0; j < log_block.m_num_partitions; j++)
+			{
+				highest_cem = my_max(highest_cem, log_block.m_color_endpoint_modes[j]);
+				lowest_cem = my_min(lowest_cem, log_block.m_color_endpoint_modes[j]);
+			}
+
+			if (highest_cem > 15)
+				return false;
+			
+			// Ensure CEM range is contiguous
+			// (the CEM classes, i.e. cem >> 2, of all partitions may differ by at most one)
+			if (((highest_cem >> 2) > (1 + (lowest_cem >> 2))))
+				return false;
+
+			// See tables 79/80
+			// All partitions sharing one CEM: low two bits are 0, the CEM follows.
+			uint32_t encoded_cem = log_block.m_color_endpoint_modes[0] << 2;
+			if (lowest_cem != highest_cem)
+			{
+				// Low two bits select the base class (as class + 1, capped at 3).
+				encoded_cem = my_min<uint32_t>(3, 1 + (lowest_cem >> 2));
+
+				// See tables at 23.11 Color Endpoint Mode
+				for (uint32_t j = 0; j < log_block.m_num_partitions; j++)
+				{
+					const int M = log_block.m_color_endpoint_modes[j] & 3;
+					
+					// C is this partition's class relative to the base class; it must be 0 or 1.
+					const int C = (log_block.m_color_endpoint_modes[j] >> 2) - ((encoded_cem & 3) - 1);
+					if ((C & 1) != C)
+						return false;
+
+					encoded_cem |= (C << (2 + j)) | (M << (2 + log_block.m_num_partitions + 2 * j));
+				}
+
+				total_extra_bits = 3 * log_block.m_num_partitions - 4;
+
+				if ((total_weight_bits + total_extra_bits) > 128)
+					return false;
+
+				// Everything above the low 6 bits of encoded_cem is stored just below the weights.
+				uint32_t cem_bit_pos = 128 - total_weight_bits - total_extra_bits;
+				astc_set_bits(&phys_block.m_vals[0], cem_bit_pos, encoded_cem >> 6, total_extra_bits);
+			}
+
+			astc_set_bits(&phys_block.m_vals[0], bit_pos, encoded_cem & 0x3f, 6);
+		}
+		else
+		{
+			if (log_block.m_partition_id)
+				return false;
+			if (log_block.m_color_endpoint_modes[0] > 15)
+				return false;
+
+			astc_set_bits(&phys_block.m_vals[0], bit_pos, log_block.m_color_endpoint_modes[0], 4);
+		}
+
+		if (log_block.m_dual_plane)
+		{
+			if (log_block.m_num_partitions > 3)
+				return false;
+
+			total_extra_bits += 2;
+			
+			// The 2-bit color component selector goes below any extra CEM bits.
+			uint32_t ccs_bit_pos = 128 - (int)total_weight_bits - (int)total_extra_bits;
+			astc_set_bits(&phys_block.m_vals[0], ccs_bit_pos, log_block.m_color_component_selector, 2);
+		}
+
+		const uint32_t total_config_bits = bit_pos + total_extra_bits;
+		const int num_remaining_bits = 128 - (int)total_config_bits - (int)total_weight_bits;
+		if (num_remaining_bits < 0)
+			return false;
+
+		// Each CEM class (cem >> 2) adds two more endpoint values on top of the minimum of two.
+		uint32_t total_cem_vals = 0;
+		for (uint32_t j = 0; j < log_block.m_num_partitions; j++)
+			total_cem_vals += 2 + 2 * (log_block.m_color_endpoint_modes[j] >> 2);
+
+		if (total_cem_vals > MAX_ENDPOINTS)
+			return false;
+
+		// Pick the largest endpoint ISE range whose encoded size fits in the remaining bits.
+		int endpoint_ise_range = -1;
+		for (int k = 20; k > 0; k--)
+		{
+			int bits = get_ise_sequence_bits(total_cem_vals, k);
+			if (bits <= num_remaining_bits)
+			{
+				endpoint_ise_range = k;
+				break;
+			}
+		}
+
+		// See 23.24 Illegal Encodings, [0,5] is the minimum ISE encoding for endpoints
+		if (endpoint_ise_range < (int)FIRST_VALID_ENDPOINT_ISE_RANGE)
+			return false;
+
+		// Ensure the caller utilized the right endpoint ISE range.
+		if ((int)log_block.m_endpoint_ise_range != endpoint_ise_range)
+		{
+			if (pExpected_endpoint_range)
+				*pExpected_endpoint_range = endpoint_ise_range;
+			return false;
+		}
+
+		// Pack endpoints forwards
+		encode_bise(&phys_block.m_vals[0], log_block.m_endpoints, bit_pos, total_cem_vals, endpoint_ise_range);
+		
+		// Pack weights backwards
+		// The weights are encoded into a scratch buffer starting at bit 0, then each dword is
+		// bit-reversed and the dword order flipped, which mirrors the whole 128-bit sequence.
+		uint32_t weight_data[4] = { 0 };
+		encode_bise(weight_data, log_block.m_weights, 0, total_grid_weights, log_block.m_weight_ise_range);
+
+		for (uint32_t i = 0; i < 4; i++)
+			phys_block.m_vals[i] |= rev_dword(weight_data[3 - i]);
+
+		return true;
+	}
+
+	// Expands a num_src_bits wide value to num_dst_bits by repeating its bit pattern from the
+	// top of the destination downwards; the last copy is truncated (shifted right) if it does
+	// not fit completely.
+	static inline uint32_t bit_replication_scale(uint32_t src, int num_src_bits, int num_dst_bits)
+	{
+		assert(num_src_bits <= num_dst_bits);
+		assert((src & ((1 << num_src_bits) - 1)) == src);
+
+		uint32_t result = 0;
+
+		int shift = num_dst_bits - num_src_bits;
+		while (shift > -num_src_bits)
+		{
+			if (shift >= 0)
+				result |= (src << shift);
+			else
+				result |= (src >> -shift);
+
+			shift -= num_src_bits;
+		}
+
+		return result;
+	}
+
+	// Converts an endpoint ISE symbol into its dequantized 8-bit value for the given endpoint
+	// ISE range.
+	//
+	// Ranges 5, 8, 11, 14, 17 and 20 are handled as plain bit fields (3 to 8 bits) and expanded
+	// with bit replication. The remaining ranges combine bits with a trit or quint and use the
+	// A/B/C/D unquantization formula below.
+	//
+	// val must be less than get_ise_levels(ise_range), and ise_range must lie within
+	// [FIRST_VALID_ENDPOINT_ISE_RANGE, LAST_VALID_ENDPOINT_ISE_RANGE]; both are only asserted.
+	uint32_t dequant_bise_endpoint(uint32_t val, uint32_t ise_range)
+	{
+		assert((ise_range >= FIRST_VALID_ENDPOINT_ISE_RANGE) && (ise_range <= LAST_VALID_ENDPOINT_ISE_RANGE));
+		assert(val < get_ise_levels(ise_range));
+
+		uint32_t u = 0;
+
+		switch (ise_range)
+		{
+		case 5:
+		{
+			u = bit_replication_scale(val, 3, 8);
+			break;
+		}
+		case 8:
+		{
+			u = bit_replication_scale(val, 4, 8);
+			break;
+		}
+		case 11:
+		{
+			u = bit_replication_scale(val, 5, 8);
+			break;
+		}
+		case 14:
+		{
+			u = bit_replication_scale(val, 6, 8);
+			break;
+		}
+		case 17:
+		{
+			u = bit_replication_scale(val, 7, 8);
+			break;
+		}
+		case 20:
+		{
+			// Already a full 8-bit value.
+			u = val;
+			break;
+		}
+		case 4:
+		case 6:
+		case 7:
+		case 9:
+		case 10:
+		case 12:
+		case 13:
+		case 15:
+		case 16:
+		case 18:
+		case 19:
+		{
+			const uint32_t num_bits = g_ise_range_table[ise_range][0];
+			const uint32_t num_trits = g_ise_range_table[ise_range][1]; BASISU_NOTE_UNUSED(num_trits);
+			const uint32_t num_quints = g_ise_range_table[ise_range][2]; BASISU_NOTE_UNUSED(num_quints);
+
+			// Row index into the parameter table: two rows per bit count (trit row, then quint
+			// row), starting at 0 for one bit plus a trit.
+			const int range_index = (num_bits * 2 + (num_quints ? 1 : 0)) - 2;
+
+			assert(range_index >= 0 && range_index <= 10);
+
+			// Split the symbol into its plain bits (low part) and its trit/quint (high part).
+			uint32_t bits = val & ((1 << num_bits) - 1);
+			uint32_t tval = val >> num_bits;
+
+			assert(tval < (num_trits ? 3U : 5U));
+
+			uint32_t a = bits & 1;
+			uint32_t b = (bits >> 1) & 1;
+			uint32_t c = (bits >> 2) & 1;
+			uint32_t d = (bits >> 3) & 1;
+			uint32_t e = (bits >> 4) & 1;
+			uint32_t f = (bits >> 5) & 1;
+
+			// A is the lowest bit replicated across 9 bits.
+			uint32_t A = a ? 511 : 0;
+			uint32_t B = 0;
+
+			// B scatters the remaining bits into a 9-bit pattern. Rows 0 and 1 have a single
+			// bit only, so B stays 0 for them.
+			switch (range_index)
+			{
+			case 2:
+			{
+				// 876543210
+				// b000b0bb0
+				B = (b << 1) | (b << 2) | (b << 4) | (b << 8);
+				break;
+			}
+			case 3:
+			{
+				// 876543210
+				// b0000bb00
+				B = (b << 2) | (b << 3) | (b << 8);
+				break;
+			}
+			case 4:
+			{
+				// 876543210
+				// cb000cbcb
+				B = b | (c << 1) | (b << 2) | (c << 3) | (b << 7) | (c << 8);
+				break;
+			}
+			case 5:
+			{
+				// 876543210
+				// cb0000cbc
+				B = c | (b << 1) | (c << 2) | (b << 7) | (c << 8);
+				break;
+			}
+			case 6:
+			{
+				// 876543210
+				// dcb000dcb
+				B = b | (c << 1) | (d << 2) | (b << 6) | (c << 7) | (d << 8);
+				break;
+			}
+			case 7:
+			{
+				// 876543210
+				// dcb0000dc
+				B = c | (d << 1) | (b << 6) | (c << 7) | (d << 8);
+				break;
+			}
+			case 8:
+			{
+				// 876543210
+				// edcb000ed
+				B = d | (e << 1) | (b << 5) | (c << 6) | (d << 7) | (e << 8);
+				break;
+			}
+			case 9:
+			{
+				// 876543210
+				// edcb0000e
+				B = e | (b << 5) | (c << 6) | (d << 7) | (e << 8);
+				break;
+			}
+			case 10:
+			{
+				// 876543210
+				// fedcb000f
+				B = f | (b << 4) | (c << 5) | (d << 6) | (e << 7) | (f << 8);
+				break;
+			}
+			default:
+				break;
+			}
+
+			// Per-row multiplier for the trit/quint value. Read-only, so it is const.
+			static const uint8_t C_vals[11] = { 204, 113, 93, 54, 44, 26, 22, 13, 11, 6, 5 };
+			uint32_t C = C_vals[range_index];
+			uint32_t D = tval;
+
+			u = D * C + B;
+			u = u ^ A;
+			u = (A & 0x80) | (u >> 2);
+
+			break;
+		}
+		default:
+		{
+			assert(0);
+			break;
+		}
+		}
+
+		return u;
+	}
+
+	// Converts a weight ISE symbol into its dequantized weight for the given weight ISE range.
+	//
+	// Bit-only ranges are expanded to 6 bits by bit replication, ranges 1 and 3 use small
+	// lookup tables, and the mixed bit + trit/quint ranges use the A/B/C/D unquantization
+	// formula. Finally any result above 32 is incremented by one (so 63 becomes 64).
+	//
+	// val must be less than get_ise_levels(ise_range); this is only asserted.
+	uint32_t dequant_bise_weight(uint32_t val, uint32_t ise_range)
+	{
+		assert(val < get_ise_levels(ise_range));
+
+		uint32_t weight = 0;
+
+		switch (ise_range)
+		{
+		case 0: // 0-1
+			weight = (val != 0) ? 63 : 0;
+			break;
+
+		case 1: // 0-2
+		{
+			const uint8_t three_levels[3] = { 0, 32, 63 };
+			weight = three_levels[val];
+			break;
+		}
+
+		case 3: // 0-4
+		{
+			const uint8_t five_levels[5] = { 0, 16, 32, 47, 63 };
+			weight = five_levels[val];
+			break;
+		}
+
+		case 2: // 0-3
+			weight = bit_replication_scale(val, 2, 6);
+			break;
+
+		case 5: // 0-7
+			weight = bit_replication_scale(val, 3, 6);
+			break;
+
+		case 8: // 0-15
+			weight = bit_replication_scale(val, 4, 6);
+			break;
+
+		case 11: // 0-31
+			weight = bit_replication_scale(val, 5, 6);
+			break;
+
+		case 4: // 0-5
+		case 6: // 0-9
+		case 7: // 0-11
+		case 9: // 0-19
+		case 10: // 0-23
+		{
+			const uint32_t bit_count = g_ise_range_table[ise_range][0];
+			const uint32_t trit_count = g_ise_range_table[ise_range][1]; BASISU_NOTE_UNUSED(trit_count);
+			const uint32_t quint_count = g_ise_range_table[ise_range][2]; BASISU_NOTE_UNUSED(quint_count);
+
+			// Row selector: two rows per bit count, the quint row after the trit row.
+			const int row = bit_count * 2 + (quint_count ? 1 : 0);
+
+			// Low part of the symbol holds the plain bits, the high part the trit/quint.
+			const uint32_t low_bits = val & ((1u << bit_count) - 1);
+			const uint32_t D = val >> bit_count;
+
+			assert(D < (trit_count ? 3U : 5U));
+
+			// See Table 103. ASTC weight unquantization parameters
+			static const uint32_t C_table[5] = { 50, 28, 23, 13, 11 };
+
+			const uint32_t a = low_bits & 1;
+			const uint32_t b = (low_bits >> 1) & 1;
+			const uint32_t c = (low_bits >> 2) & 1;
+
+			// B spreads the bits above the lowest one into a 7-bit pattern; rows with a
+			// single bit keep B at zero.
+			uint32_t B = 0;
+			switch (row)
+			{
+			case 4:
+				B = (b << 6) | (b << 2) | (b << 0);
+				break;
+			case 5:
+				B = (b << 6) | (b << 1);
+				break;
+			case 6:
+				B = (c << 6) | (b << 5) | (c << 1) | (b << 0);
+				break;
+			default:
+				break;
+			}
+
+			// A is the lowest bit replicated across 7 bits.
+			const uint32_t A = a ? 0x7F : 0;
+			const uint32_t C = C_table[row - 2];
+
+			const uint32_t T = (D * C + B) ^ A;
+			weight = (A & 0x20) | (T >> 2);
+			break;
+		}
+
+		default:
+			assert(0);
+			break;
+		}
+
+		if (weight > 32)
+			weight++;
+
+		return weight;
+	}
+
+	// Returns the ISE symbol whose dequantized endpoint value is closest to v (a [0,255]
+	// endpoint value). Every level of the range is tried in order; ties keep the lowest symbol,
+	// and the scan stops early on an exact match.
+	uint32_t find_nearest_bise_endpoint(int v, uint32_t ise_range)
+	{
+		assert(ise_range >= FIRST_VALID_ENDPOINT_ISE_RANGE && ise_range <= LAST_VALID_ENDPOINT_ISE_RANGE);
+
+		const uint32_t num_levels = get_ise_levels(ise_range);
+
+		int lowest_err = INT_MAX;
+		int nearest = 0;
+
+		uint32_t level = 0;
+		while (level < num_levels)
+		{
+			const int dequantized = dequant_bise_endpoint(level, ise_range);
+			const int err = labs(v - dequantized);
+
+			if (err < lowest_err)
+			{
+				lowest_err = err;
+				nearest = level;
+
+				// Exact hit, nothing can beat it.
+				if (lowest_err == 0)
+					break;
+			}
+
+			level++;
+		}
+
+		return nearest;
+	}
+
+	// Returns the ISE symbol whose dequantized weight is closest to v (a [0,64] weight value).
+	// Every level of the range is tried in order; ties keep the lowest symbol, and the scan
+	// stops early on an exact match.
+	uint32_t find_nearest_bise_weight(int v, uint32_t ise_range)
+	{
+		assert(ise_range >= FIRST_VALID_WEIGHT_ISE_RANGE && ise_range <= LAST_VALID_WEIGHT_ISE_RANGE);
+		assert(v <= (int)MAX_WEIGHT_VALUE);
+
+		const uint32_t num_levels = get_ise_levels(ise_range);
+
+		int lowest_err = INT_MAX;
+		int nearest = 0;
+
+		uint32_t level = 0;
+		while (level < num_levels)
+		{
+			const int dequantized = dequant_bise_weight(level, ise_range);
+			const int err = labs(v - dequantized);
+
+			if (err < lowest_err)
+			{
+				lowest_err = err;
+				nearest = level;
+
+				// Exact hit, nothing can beat it.
+				if (lowest_err == 0)
+					break;
+			}
+
+			level++;
+		}
+
+		return nearest;
+	}
+
+	// Builds the lookup tables for one ISE range. Every output pointer is optional; a null
+	// pointer simply skips that table.
+	void create_quant_tables(
+		uint8_t* pVal_to_ise,	// [0-255] or [0-64] value to nearest ISE symbol, array size is [256] or [65]
+		uint8_t* pISE_to_val,	// ASTC encoded ISE symbol to [0,255] or [0,64] value, [levels]
+		uint8_t* pISE_to_rank,	// returns the level rank index given an ISE symbol, [levels]
+		uint8_t* pRank_to_ISE,  // returns the ISE symbol given a level rank, inverse of pISE_to_rank, [levels]
+		uint32_t ise_range,		// ise range, [4,20] for endpoints, [0,11] for weights
+		bool weight_flag)		// false if block endpoints, true if weights
+	{
+		const uint32_t value_count = weight_flag ? (MAX_WEIGHT_VALUE + 1) : 256;
+
+		// Pass 1: for every representable value, find its nearest ISE symbol and record the
+		// symbol's dequantized value.
+		for (uint32_t value = 0; value < value_count; value++)
+		{
+			uint32_t symbol;
+			if (weight_flag)
+				symbol = astc_helpers::find_nearest_bise_weight(value, ise_range);
+			else
+				symbol = astc_helpers::find_nearest_bise_endpoint(value, ise_range);
+
+			if (pVal_to_ise)
+				pVal_to_ise[value] = (uint8_t)symbol;
+
+			if (pISE_to_val)
+			{
+				if (weight_flag)
+					pISE_to_val[symbol] = (uint8_t)astc_helpers::dequant_bise_weight(symbol, ise_range);
+				else
+					pISE_to_val[symbol] = (uint8_t)astc_helpers::dequant_bise_endpoint(symbol, ise_range);
+			}
+		}
+
+		// Pass 2: rank tables, only when at least one was requested.
+		if (!pISE_to_rank && !pRank_to_ISE)
+			return;
+
+		const uint32_t num_levels = get_ise_levels(ise_range);
+
+		const bool bits_only = !g_ise_range_table[ise_range][1] && !g_ise_range_table[ise_range][2];
+		if (bits_only)
+		{
+			// Without trits or quints the symbol is used directly as its rank.
+			for (uint32_t level = 0; level < num_levels; level++)
+			{
+				if (pISE_to_rank)
+					pISE_to_rank[level] = (uint8_t)level;
+
+				if (pRank_to_ISE)
+					pRank_to_ISE[level] = (uint8_t)level;
+			}
+			return;
+		}
+
+		// Range has trits or quints: sort the symbols by their dequantized value.
+		// Each key packs the dequantized value in the high 16 bits and the symbol in the low
+		// bits, so a plain integer sort orders by value first.
+		uint32_t keys[256];
+		for (uint32_t level = 0; level < num_levels; level++)
+		{
+			uint32_t dequantized;
+			if (weight_flag)
+				dequantized = astc_helpers::dequant_bise_weight(level, ise_range);
+			else
+				dequantized = astc_helpers::dequant_bise_endpoint(level, ise_range);
+
+			keys[level] = (dequantized << 16) | level;
+		}
+
+		std::sort(keys, keys + num_levels);
+
+		for (uint32_t rank = 0; rank < num_levels; rank++)
+		{
+			const uint32_t symbol = keys[rank] & 0xFF;
+
+			if (pISE_to_rank)
+				pISE_to_rank[symbol] = (uint8_t)rank;
+
+			if (pRank_to_ISE)
+				pRank_to_ISE[rank] = (uint8_t)symbol;
+		}
+	}
+
+	// Writes a 16-byte LDR void-extent (solid color) block. rh/gh/bh/ah are stored as four
+	// little-endian 16-bit values in bytes 8..15. Bytes 2..7 are left as all ones, and the
+	// first two bytes hold the fixed header (byte 1 has bit 1 cleared, unlike the HDR variant).
+	void pack_void_extent_ldr(astc_block &blk, uint16_t rh, uint16_t gh, uint16_t bh, uint16_t ah)
+	{
+		uint8_t* pBytes = (uint8_t*)&blk.m_vals[0];
+
+		// Start from all ones, then patch the header and the color.
+		memset(pBytes, 0xFF, 16);
+
+		pBytes[0] = 0b11111100;
+		pBytes[1] = 0b11111101;
+
+		const uint16_t channels[4] = { rh, gh, bh, ah };
+		for (uint32_t c = 0; c < 4; c++)
+		{
+			pBytes[8 + c * 2 + 0] = (uint8_t)channels[c];
+			pBytes[8 + c * 2 + 1] = (uint8_t)(channels[c] >> 8);
+		}
+	}
+
+	// Writes a 16-byte HDR void-extent (solid color) block.
+	// rh-ah are half-floats, stored as four little-endian 16-bit values in bytes 8..15.
+	// Byte 0 holds the fixed header value and bytes 1..7 are left as all ones.
+	void pack_void_extent_hdr(astc_block& blk, uint16_t rh, uint16_t gh, uint16_t bh, uint16_t ah) 
+	{
+		uint8_t* pBytes = (uint8_t*)&blk.m_vals[0];
+
+		// Start from all ones, then patch the header and the color.
+		memset(pBytes, 0xFF, 16);
+
+		pBytes[0] = 0b11111100;
+
+		const uint16_t channels[4] = { rh, gh, bh, ah };
+		for (uint32_t c = 0; c < 4; c++)
+		{
+			pBytes[8 + c * 2 + 0] = (uint8_t)channels[c];
+			pBytes[8 + c * 2 + 1] = (uint8_t)(channels[c] >> 8);
+		}
+	}
+		
+	// Returns true if mode is one of the LDR color endpoint modes listed below, false for
+	// every other value.
+	bool is_cem_ldr(uint32_t mode)
+	{
+		const bool is_lum_mode =
+			(mode == CEM_LDR_LUM_DIRECT) ||
+			(mode == CEM_LDR_LUM_BASE_PLUS_OFS) ||
+			(mode == CEM_LDR_LUM_ALPHA_DIRECT) ||
+			(mode == CEM_LDR_LUM_ALPHA_BASE_PLUS_OFS);
+
+		const bool is_rgb_mode =
+			(mode == CEM_LDR_RGB_BASE_SCALE) ||
+			(mode == CEM_LDR_RGB_DIRECT) ||
+			(mode == CEM_LDR_RGB_BASE_PLUS_OFFSET);
+
+		const bool is_rgba_mode =
+			(mode == CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A) ||
+			(mode == CEM_LDR_RGBA_DIRECT) ||
+			(mode == CEM_LDR_RGBA_BASE_PLUS_OFFSET);
+
+		return is_lum_mode || is_rgb_mode || is_rgba_mode;
+	}
+
+	// Returns true if w x h is one of the block sizes in the table below.
+	// Both dimensions must lie within [MIN_BLOCK_DIM, MAX_BLOCK_DIM]; this is only asserted.
+	bool is_valid_block_size(uint32_t w, uint32_t h)
+	{
+		assert((w >= MIN_BLOCK_DIM) && (w <= MAX_BLOCK_DIM));
+		assert((h >= MIN_BLOCK_DIM) && (h <= MAX_BLOCK_DIM));
+
+		// { width, height } pairs
+		static const uint8_t s_block_sizes[14][2] =
+		{
+			{ 4, 4 }, { 5, 4 },
+			{ 5, 5 },
+			{ 6, 5 }, { 6, 6 },
+			{ 8, 5 }, { 8, 6 }, { 10, 5 }, { 10, 6 },
+			{ 8, 8 }, { 10, 8 }, { 10, 10 },
+			{ 12, 10 }, { 12, 12 }
+		};
+
+		for (uint32_t i = 0; i < 14; i++)
+		{
+			if ((w == s_block_sizes[i][0]) && (h == s_block_sizes[i][1]))
+				return true;
+		}
+
+		return false;
+	}
+
+	// Returns true if at least one partition of log_blk uses a color endpoint mode for which
+	// is_cem_hdr() reports true.
+	bool block_has_any_hdr_cems(const log_astc_block& log_blk)
+	{
+		assert((log_blk.m_num_partitions >= 1) && (log_blk.m_num_partitions <= MAX_PARTITIONS));
+
+		bool found_hdr = false;
+		for (uint32_t part = 0; (part < log_blk.m_num_partitions) && !found_hdr; part++)
+			found_hdr = is_cem_hdr(log_blk.m_color_endpoint_modes[part]);
+
+		return found_hdr;
+	}
+
+	// Returns true if at least one partition of log_blk uses a color endpoint mode for which
+	// is_cem_hdr() reports false.
+	bool block_has_any_ldr_cems(const log_astc_block& log_blk)
+	{
+		assert((log_blk.m_num_partitions >= 1) && (log_blk.m_num_partitions <= MAX_PARTITIONS));
+
+		bool found_ldr = false;
+		for (uint32_t part = 0; (part < log_blk.m_num_partitions) && !found_ldr; part++)
+			found_ldr = !is_cem_hdr(log_blk.m_color_endpoint_modes[part]);
+
+		return found_ldr;
+	}
+		
+	// Global dequantization tables; filled in by init_tables().
+	dequant_tables g_dequant_tables;
+
+	// Forward declaration, the definition follows further down in this file.
+	void precompute_texel_partitions_4x4();
+
+	// Table setup entry point: initializes g_dequant_tables (init_rank_tabs is forwarded to
+	// its init()) and fills the precomputed 4x4 texel partition table.
+	void init_tables(bool init_rank_tabs)
+	{
+		g_dequant_tables.init(init_rank_tabs);
+		
+		precompute_texel_partitions_4x4();
+	}
+
+	// One bilinear sample used when upsampling a weight grid to block resolution:
+	// the coordinate of the top-left source weight plus the 2x2 blend factors.
+	struct weighted_sample
+	{
+		uint8_t m_src_x;			// X coordinate of the top-left source grid weight
+		uint8_t m_src_y;			// Y coordinate of the top-left source grid weight
+		uint8_t m_weights[2][2]; // [y][x], scaled by 16, round by adding 8
+	};
+
+	// Computes, for every texel of a block_width x block_height block, which weight grid cell
+	// it samples and the four bilinear blend factors (which sum to 16).
+	// pWeights is written in row-major texel order: index = texelX + texelY * block_width.
+	static void compute_upsample_weights(
+		int block_width, int block_height,
+		int weight_grid_width, int weight_grid_height,
+		weighted_sample* pWeights) // there will be block_width * block_height bilinear samples
+	{
+		// Fixed point step per texel along each axis.
+		const uint32_t x_step = (1024 + block_width / 2) / (block_width - 1);
+		const uint32_t y_step = (1024 + block_height / 2) / (block_height - 1);
+
+		weighted_sample* pOut = pWeights;
+
+		for (int ty = 0; ty < block_height; ty++)
+		{
+			// Grid position along Y with 4 fractional bits; depends only on the row.
+			const uint32_t grid_y = (y_step * ty * (weight_grid_height - 1) + 32) >> 6;
+			const uint32_t cell_y = grid_y >> 4;
+			const uint32_t frac_y = grid_y & 0xf;
+
+			for (int tx = 0; tx < block_width; tx++, pOut++)
+			{
+				const uint32_t grid_x = (x_step * tx * (weight_grid_width - 1) + 32) >> 6;
+				const uint32_t cell_x = grid_x >> 4;
+				const uint32_t frac_x = grid_x & 0xf;
+
+				// Bilinear factors for the bottom-right, bottom-left, top-right and top-left
+				// source weights.
+				const uint32_t bottom_right = (frac_x * frac_y + 8) >> 4;
+				const uint32_t bottom_left = frac_y - bottom_right;
+				const uint32_t top_right = frac_x - bottom_right;
+				const uint32_t top_left = 16 - frac_x - frac_y + bottom_right;
+
+				pOut->m_src_x = (uint8_t)cell_x;
+				pOut->m_src_y = (uint8_t)cell_y;
+				pOut->m_weights[0][0] = (uint8_t)top_left;
+				pOut->m_weights[0][1] = (uint8_t)top_right;
+				pOut->m_weights[1][0] = (uint8_t)bottom_left;
+				pOut->m_weights[1][1] = (uint8_t)bottom_right;
+			}
+		}
+	}
+
+	// Bilinearly upsamples a wx x wy grid of dequantized [0,64] weights to bx x by texels.
+	// When source and destination hold the same number of weights the data is copied as-is.
+	static void upsample_weight_grid(
+		uint32_t bx, uint32_t by,		// destination/to dimension
+		uint32_t wx, uint32_t wy,		// source/from dimension
+		const uint8_t* pSrc_weights,	// these are dequantized [0,64] weights, NOT ISE symbols, [wy][wx]
+		uint8_t* pDst_weights)			// [by][bx]
+	{
+		assert((bx >= 2) && (by >= 2) && (bx <= 12) && (by <= 12));
+		assert((wx >= 2) && (wy >= 2) && (wx <= bx) && (wy <= by));
+
+		const uint32_t num_src = wx * wy;
+		const uint32_t num_dst = bx * by;
+
+		if (num_src == num_dst)
+		{
+			memcpy(pDst_weights, pSrc_weights, num_src);
+			return;
+		}
+
+		weighted_sample samples[12 * 12];
+		compute_upsample_weights(bx, by, wx, wy, samples);
+
+		for (uint32_t dst_index = 0; dst_index < num_dst; dst_index++)
+		{
+			const weighted_sample& s = samples[dst_index];
+
+			assert(s.m_weights[0][0] || s.m_weights[0][1] || s.m_weights[1][0] || s.m_weights[1][1]);
+
+			const uint32_t sx = s.m_src_x, sy = s.m_src_y;
+
+			// Start at 8 so the final shift by 4 rounds to nearest.
+			uint32_t accum = 8;
+
+			for (uint32_t dy = 0; dy < 2; dy++)
+			{
+				for (uint32_t dx = 0; dx < 2; dx++)
+				{
+					const uint32_t factor = s.m_weights[dy][dx];
+
+					// Source weights with a zero factor are never read.
+					if (factor)
+						accum += pSrc_weights[bounds_check((sx + dx) + (sy + dy) * wx, 0U, num_src)] * factor;
+				}
+			}
+
+			pDst_weights[dst_index] = (uint8_t)(accum >> 4);
+		}
+	}
+
+	// 32-bit integer mixing function made of a fixed sequence of shift/xor/add/subtract steps.
+	// All arithmetic wraps modulo 2^32.
+	inline uint32_t hash52(uint32_t v)
+	{
+		uint32_t h = v;
+
+		h = h ^ (h >> 15);
+		h = h - (h << 17);
+		h = h + (h << 7);
+		h = h + (h << 4);
+		h = h ^ (h >> 5);
+		h = h + (h << 16);
+		h = h ^ (h >> 7);
+		h = h ^ (h >> 3);
+		h = h ^ (h << 6);
+		h = h ^ (h >> 17);
+
+		return h;
+	}
+
+	// Returns the partition index (0..3) that texel (xIn, yIn) is assigned to for the given
+	// partition seed and partition count.
+	//
+	// zIn must be 0 (asserted). When small_block is true the texel coordinates are doubled
+	// before use. The seed is first offset by 1024 * (num_partitions - 1) and hashed with
+	// hash52(); twelve 4-bit fields of the hash are then squared, shifted down and used as
+	// per-axis multipliers for up to four 6-bit values a..d. The texel goes to the partition
+	// with the largest value, ties going to the lowest index.
+	int compute_texel_partition(uint32_t seedIn, uint32_t xIn, uint32_t yIn, uint32_t zIn, int num_partitions, bool small_block)
+	{
+		assert(zIn == 0);
+
+		const uint32_t  x = small_block ? xIn << 1 : xIn;
+		const uint32_t  y = small_block ? yIn << 1 : yIn;
+		const uint32_t  z = small_block ? zIn << 1 : zIn;
+		const uint32_t  seed = seedIn + 1024 * (num_partitions - 1);
+		const uint32_t  rnum = hash52(seed);
+
+		// Twelve 4-bit fields of the hash. seed9..seed12 overlap the fields above, and seed12
+		// wraps around the top of the dword.
+		uint8_t         seed1 = (uint8_t)(rnum & 0xf);
+		uint8_t         seed2 = (uint8_t)((rnum >> 4) & 0xf);
+		uint8_t         seed3 = (uint8_t)((rnum >> 8) & 0xf);
+		uint8_t         seed4 = (uint8_t)((rnum >> 12) & 0xf);
+		uint8_t         seed5 = (uint8_t)((rnum >> 16) & 0xf);
+		uint8_t         seed6 = (uint8_t)((rnum >> 20) & 0xf);
+		uint8_t         seed7 = (uint8_t)((rnum >> 24) & 0xf);
+		uint8_t         seed8 = (uint8_t)((rnum >> 28) & 0xf);
+		uint8_t         seed9 = (uint8_t)((rnum >> 18) & 0xf);
+		uint8_t         seed10 = (uint8_t)((rnum >> 22) & 0xf);
+		uint8_t         seed11 = (uint8_t)((rnum >> 26) & 0xf);
+		uint8_t         seed12 = (uint8_t)(((rnum >> 30) | (rnum << 2)) & 0xf);
+
+		// Square each field (at most 15 * 15 = 225, so the result still fits in a byte).
+		seed1 = (uint8_t)(seed1 * seed1);
+		seed2 = (uint8_t)(seed2 * seed2);
+		seed3 = (uint8_t)(seed3 * seed3);
+		seed4 = (uint8_t)(seed4 * seed4);
+		seed5 = (uint8_t)(seed5 * seed5);
+		seed6 = (uint8_t)(seed6 * seed6);
+		seed7 = (uint8_t)(seed7 * seed7);
+		seed8 = (uint8_t)(seed8 * seed8);
+		seed9 = (uint8_t)(seed9 * seed9);
+		seed10 = (uint8_t)(seed10 * seed10);
+		seed11 = (uint8_t)(seed11 * seed11);
+		seed12 = (uint8_t)(seed12 * seed12);
+
+		// Shift amounts are picked from low bits of the (offset) seed and the partition count.
+		const int shA = (seed & 2) != 0 ? 4 : 5;
+		const int shB = (num_partitions == 3) ? 6 : 5;
+		const int sh1 = (seed & 1) != 0 ? shA : shB;
+		const int sh2 = (seed & 1) != 0 ? shB : shA;
+		const int sh3 = (seed & 0x10) != 0 ? sh1 : sh2;
+
+		seed1 = (uint8_t)(seed1 >> sh1);
+		seed2 = (uint8_t)(seed2 >> sh2);
+		seed3 = (uint8_t)(seed3 >> sh1);
+		seed4 = (uint8_t)(seed4 >> sh2);
+		seed5 = (uint8_t)(seed5 >> sh1);
+		seed6 = (uint8_t)(seed6 >> sh2);
+		seed7 = (uint8_t)(seed7 >> sh1);
+		seed8 = (uint8_t)(seed8 >> sh2);
+		seed9 = (uint8_t)(seed9 >> sh3);
+		seed10 = (uint8_t)(seed10 >> sh3);
+		seed11 = (uint8_t)(seed11 >> sh3);
+		seed12 = (uint8_t)(seed12 >> sh3);
+
+		// One 6-bit value per candidate partition; c and d are forced to 0 when the block has
+		// fewer than 3 or 4 partitions.
+		const int a = 0x3f & (seed1 * x + seed2 * y + seed11 * z + (rnum >> 14));
+		const int b = 0x3f & (seed3 * x + seed4 * y + seed12 * z + (rnum >> 10));
+		const int c = (num_partitions >= 3) ? 0x3f & (seed5 * x + seed6 * y + seed9 * z + (rnum >> 6)) : 0;
+		const int d = (num_partitions >= 4) ? 0x3f & (seed7 * x + seed8 * y + seed10 * z + (rnum >> 2)) : 0;
+
+		// Largest value wins; on ties the lower partition index is returned.
+		return (a >= b && a >= c && a >= d) ? 0
+			: (b >= c && b >= d) ? 1
+			: (c >= d) ? 2
+			: 3;
+	}
+
+	// Precomputed partition indices for 4x4 blocks, indexed by [seed][num_partitions - 2]
+	// (so [0] holds the 2-partition pattern and [1] the 3-partition pattern).
+	// Each texel uses 2 bits at bit position x * 2 + y * 8.
+	static uint32_t g_texel_partitions_4x4[1024][2];
+
+	// Fills g_texel_partitions_4x4 for all 1024 seeds by evaluating compute_texel_partition()
+	// on every texel of a 4x4 block (small_block = true) for 2 and 3 partitions.
+	void precompute_texel_partitions_4x4()
+	{
+		for (uint32_t p = 0; p < 1024; p++)
+		{
+			uint32_t v2 = 0, v3 = 0;
+
+			for (uint32_t y = 0; y < 4; y++)
+			{
+				for (uint32_t x = 0; x < 4; x++)
+				{
+					const uint32_t shift = x * 2 + y * 8;
+
+					// compute_texel_partition() returns a signed int and shift reaches 30, so
+					// convert to unsigned before shifting to avoid shifting into the sign bit.
+					const uint32_t part2 = (uint32_t)compute_texel_partition(p, x, y, 0, 2, true);
+					const uint32_t part3 = (uint32_t)compute_texel_partition(p, x, y, 0, 3, true);
+
+					v2 |= (part2 << shift);
+					v3 |= (part3 << shift);
+				}
+			}
+
+			g_texel_partitions_4x4[p][0] = v2;
+			g_texel_partitions_4x4[p][1] = v3;
+		}
+	}
+
+	// Looks up the partition index of texel (x, y) of a 4x4 block in the precomputed table.
+	// Only 2 and 3 partitions are stored. The first assert checks that the table has been
+	// filled in (entry [1][0] is expected to be non-zero once it has).
+	static inline int get_precompute_texel_partitions_4x4(uint32_t seed, uint32_t x, uint32_t y, uint32_t num_partitions)
+	{
+		assert(g_texel_partitions_4x4[1][0]);
+		assert(seed < 1024);
+		assert((x <= 3) && (y <= 3));
+		assert((num_partitions >= 2) && (num_partitions <= 3));
+
+		// Two bits per texel, four texels (one row) per byte.
+		const uint32_t packed = g_texel_partitions_4x4[seed][num_partitions - 2];
+		const uint32_t bit_ofs = (y << 3) + (x << 1);
+
+		return (packed >> bit_ofs) & 3;
+	}
+
+	// Blue contraction: red and green are each averaged with blue (rounding down via an
+	// arithmetic shift), while blue and alpha are passed through unchanged.
+	void blue_contract(
+		int r, int g, int b, int a, 
+		int &dr, int &dg, int &db, int &da)
+	{
+		const int red_blue_avg = (b + r) >> 1;
+		const int green_blue_avg = (b + g) >> 1;
+
+		dr = red_blue_avg;
+		dg = green_blue_avg;
+		db = b;
+		da = a;
+	}
+
+	// Moves bit 7 of a into bit 7 of b (after shifting b right by one), then turns the
+	// remaining bits 6..1 of a into a signed 6-bit value in [-32, 31].
+	inline void bit_transfer_signed(int& a, int& b)
+	{
+		const int transferred_bit = a & 0x80;
+		b = (b >> 1) | transferred_bit;
+
+		const int field = (a >> 1) & 0x3F;
+
+		// Bit 5 is the sign bit of the 6-bit field.
+		a = (field & 0x20) ? (field - 0x40) : field;
+	}
+
+	// Clamps a to the inclusive range [l, h]. The lower bound is tested first.
+	static inline int clamp(int a, int l, int h)
+	{
+		if (a < l)
+			return l;
+
+		return (a > h) ? h : a;
+	}
+
+	// Clamps a to the inclusive range [l, h]. The lower bound is tested first; a value that
+	// compares neither below l nor above h is returned unchanged.
+	static inline float clampf(float a, float l, float h)
+	{
+		if (a < l)
+			return l;
+
+		return (a > h) ? h : a;
+	}
+
+	// Interprets the low num_src_bits bits of src as a two's complement value and sign
+	// extends it to a full int. Bits of src above num_src_bits are ignored.
+	inline int sign_extend(int src, int num_src_bits)
+	{
+		assert((num_src_bits >= 2) && (num_src_bits <= 31));
+
+		const int sign_bit = 1 << (num_src_bits - 1);
+		const int field_mask = (1 << num_src_bits) - 1;
+
+		// Negative: fill everything above the field with ones. Otherwise: clear it.
+		return (src & sign_bit) ? (src | ~field_mask) : (src & field_mask);
+	}
+
+	// endpoints is [4][2]
+	void decode_endpoint(uint32_t cem_index, int (*pEndpoints)[2], const uint8_t *pE)
+	{
+		assert(cem_index <= CEM_HDR_RGB_HDR_ALPHA);
+
+		int v0 = pE[0], v1 = pE[1];
+
+		int& e0_r = pEndpoints[0][0], &e0_g = pEndpoints[1][0], &e0_b = pEndpoints[2][0], &e0_a = pEndpoints[3][0];
+		int& e1_r = pEndpoints[0][1], &e1_g = pEndpoints[1][1], &e1_b = pEndpoints[2][1], &e1_a = pEndpoints[3][1];
+
+		switch (cem_index)
+		{
+		case CEM_LDR_LUM_DIRECT:
+		{
+			e0_r = v0; e1_r = v1;
+			e0_g = v0; e1_g = v1;
+			e0_b = v0; e1_b = v1;
+			e0_a = 0xFF; e1_a = 0xFF;
+			break;
+		}
+		case CEM_LDR_LUM_BASE_PLUS_OFS:
+		{
+			int l0 = (v0 >> 2) | (v1 & 0xc0);
+			int l1 = l0 + (v1 & 0x3f);
+
+			if (l1 > 0xFF)
+				l1 = 0xFF;
+
+			e0_r = l0; e1_r = l1;
+			e0_g = l0; e1_g = l1;
+			e0_b = l0; e1_b = l1;
+			e0_a = 0xFF; e1_a = 0xFF;
+			break;
+		}
+		case CEM_LDR_LUM_ALPHA_DIRECT:
+		{
+			int v2 = pE[2], v3 = pE[3];
+
+			e0_r = v0; e1_r = v1;
+			e0_g = v0; e1_g = v1;
+			e0_b = v0; e1_b = v1;
+			e0_a = v2; e1_a = v3;
+			break;
+		}
+		case CEM_LDR_LUM_ALPHA_BASE_PLUS_OFS:
+		{
+			int v2 = pE[2], v3 = pE[3];
+
+			bit_transfer_signed(v1, v0);
+			bit_transfer_signed(v3, v2);
+
+			e0_r = v0; e1_r = v0 + v1;
+			e0_g = v0; e1_g = v0 + v1;
+			e0_b = v0; e1_b = v0 + v1;
+			e0_a = v2; e1_a = v2 + v3;
+
+			for (uint32_t c = 0; c < 4; c++)
+			{
+				pEndpoints[c][0] = clamp(pEndpoints[c][0], 0, 255);
+				pEndpoints[c][1] = clamp(pEndpoints[c][1], 0, 255);
+			}
+
+			break;
+		}
+		case CEM_LDR_RGB_BASE_SCALE:
+		{
+			int v2 = pE[2], v3 = pE[3];
+
+			e0_r = (v0 * v3) >> 8; e1_r = v0;
+			e0_g = (v1 * v3) >> 8; e1_g = v1;
+			e0_b = (v2 * v3) >> 8; e1_b = v2;
+			e0_a = 0xFF; e1_a = 0xFF;
+
+			break;
+		}
+		case CEM_LDR_RGB_DIRECT:
+		{
+			int v2 = pE[2], v3 = pE[3], v4 = pE[4], v5 = pE[5];
+
+			if ((v1 + v3 + v5) >= (v0 + v2 + v4))
+			{
+				e0_r = v0; e1_r = v1;
+				e0_g = v2; e1_g = v3;
+				e0_b = v4; e1_b = v5;
+				e0_a = 0xFF; e1_a = 0xFF;
+			}
+			else
+			{
+				blue_contract(v1, v3, v5, 0xFF, e0_r, e0_g, e0_b, e0_a);
+				blue_contract(v0, v2, v4, 0xFF, e1_r, e1_g, e1_b, e1_a);
+			}
+
+			break;
+		}
+		case CEM_LDR_RGB_BASE_PLUS_OFFSET:
+		{
+			int v2 = pE[2], v3 = pE[3], v4 = pE[4], v5 = pE[5];
+
+			bit_transfer_signed(v1, v0);
+			bit_transfer_signed(v3, v2);
+			bit_transfer_signed(v5, v4);
+
+			if ((v1 + v3 + v5) >= 0)
+			{
+				e0_r = v0; e1_r = v0 + v1;
+				e0_g = v2; e1_g = v2 + v3;
+				e0_b = v4; e1_b = v4 + v5;
+				e0_a = 0xFF; e1_a = 0xFF;
+			}
+			else
+			{
+				blue_contract(v0 + v1, v2 + v3, v4 + v5, 0xFF, e0_r, e0_g, e0_b, e0_a);
+				blue_contract(v0, v2, v4, 0xFF, e1_r, e1_g, e1_b, e1_a);
+			}
+
+			for (uint32_t c = 0; c < 4; c++)
+			{
+				pEndpoints[c][0] = clamp(pEndpoints[c][0], 0, 255);
+				pEndpoints[c][1] = clamp(pEndpoints[c][1], 0, 255);
+			}
+
+			break;
+		}
+		case CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A:
+		{
+			int v2 = pE[2], v3 = pE[3], v4 = pE[4], v5 = pE[5];
+
+			e0_r = (v0 * v3) >> 8; e1_r = v0;
+			e0_g = (v1 * v3) >> 8; e1_g = v1;
+			e0_b = (v2 * v3) >> 8; e1_b = v2;
+			e0_a = v4; e1_a = v5;
+
+			break;
+		}
+		case CEM_LDR_RGBA_DIRECT:
+		{
+			int v2 = pE[2], v3 = pE[3], v4 = pE[4], v5 = pE[5], v6 = pE[6], v7 = pE[7];
+
+			if ((v1 + v3 + v5) >= (v0 + v2 + v4))
+			{
+				e0_r = v0; e1_r = v1;
+				e0_g = v2; e1_g = v3;
+				e0_b = v4; e1_b = v5;
+				e0_a = v6; e1_a = v7;
+			}
+			else
+			{
+				blue_contract(v1, v3, v5, v7, e0_r, e0_g, e0_b, e0_a);
+				blue_contract(v0, v2, v4, v6, e1_r, e1_g, e1_b, e1_a);
+			}
+
+			break;
+		}
+		case CEM_LDR_RGBA_BASE_PLUS_OFFSET:
+		{
+			int v2 = pE[2], v3 = pE[3], v4 = pE[4], v5 = pE[5], v6 = pE[6], v7 = pE[7];
+
+			bit_transfer_signed(v1, v0);
+			bit_transfer_signed(v3, v2);
+			bit_transfer_signed(v5, v4);
+			bit_transfer_signed(v7, v6);
+
+			if ((v1 + v3 + v5) >= 0)
+			{
+				e0_r = v0; e1_r = v0 + v1;
+				e0_g = v2; e1_g = v2 + v3;
+				e0_b = v4; e1_b = v4 + v5;
+				e0_a = v6; e1_a = v6 + v7;
+			}
+			else
+			{
+				blue_contract(v0 + v1, v2 + v3, v4 + v5, v6 + v7, e0_r, e0_g, e0_b, e0_a);
+				blue_contract(v0, v2, v4, v6, e1_r, e1_g, e1_b, e1_a);
+			}
+
+			for (uint32_t c = 0; c < 4; c++)
+			{
+				pEndpoints[c][0] = clamp(pEndpoints[c][0], 0, 255);
+				pEndpoints[c][1] = clamp(pEndpoints[c][1], 0, 255);
+			}
+
+			break;
+		}
+		case CEM_HDR_LUM_LARGE_RANGE:
+		{
+			int y0, y1;
+			if (v1 >= v0)
+			{
+				y0 = (v0 << 4);
+				y1 = (v1 << 4);
+			}
+			else
+			{
+				y0 = (v1 << 4) + 8;
+				y1 = (v0 << 4) - 8;
+			}
+
+			e0_r = y0; e1_r = y1;
+			e0_g = y0; e1_g = y1;
+			e0_b = y0; e1_b = y1;
+			e0_a = 0x780; e1_a = 0x780;
+						
+			break;
+		}
+		case CEM_HDR_LUM_SMALL_RANGE:
+		{
+			int y0, y1, d;
+
+			if ((v0 & 0x80) != 0)
+			{
+				y0 = ((v1 & 0xE0) << 4) | ((v0 & 0x7F) << 2);
+				d = (v1 & 0x1F) << 2;
+			}
+			else
+			{
+				y0 = ((v1 & 0xF0) << 4) | ((v0 & 0x7F) << 1);
+				d = (v1 & 0x0F) << 1;
+			}
+						
+			y1 = y0 + d;
+			if (y1 > 0xFFF) 
+				y1 = 0xFFF;
+						
+			e0_r = y0; e1_r = y1;
+			e0_g = y0; e1_g = y1;
+			e0_b = y0; e1_b = y1;
+			e0_a = 0x780; e1_a = 0x780;
+
+			break;
+		}
+		case CEM_HDR_RGB_BASE_SCALE:
+		{
+			int v2 = pE[2], v3 = pE[3];
+						
+			int modeval = ((v0 & 0xC0) >> 6) | ((v1 & 0x80) >> 5) | ((v2 & 0x80) >> 4);
+			
+			int majcomp, mode;
+			if ((modeval & 0xC) != 0xC) 
+			{
+				majcomp = modeval >> 2; 
+				mode = modeval & 3;
+			}
+			else if (modeval != 0xF) 
+			{
+				majcomp = modeval & 3;  
+				mode = 4;
+			}
+			else 
+			{
+				majcomp = 0; 
+				mode = 5;
+			}
+
+			int red = v0 & 0x3f; 
+			int green = v1 & 0x1f;
+			int blue = v2 & 0x1f; 
+			int scale = v3 & 0x1f;
+
+			int x0 = (v1 >> 6) & 1; 
+			int x1 = (v1 >> 5) & 1; 
+			int x2 = (v2 >> 6) & 1;
+			int x3 = (v2 >> 5) & 1; 
+			int x4 = (v3 >> 7) & 1; 
+			int x5 = (v3 >> 6) & 1;
+			int x6 = (v3 >> 5) & 1;
+
+			int ohm = 1 << mode;
+			if (ohm & 0x30) green |= x0 << 6;
+			if (ohm & 0x3A) green |= x1 << 5;
+			if (ohm & 0x30) blue |= x2 << 6;
+			if (ohm & 0x3A) blue |= x3 << 5;
+			if (ohm & 0x3D) scale |= x6 << 5;
+			if (ohm & 0x2D) scale |= x5 << 6;
+			if (ohm & 0x04) scale |= x4 << 7;
+			if (ohm & 0x3B) red |= x4 << 6;
+			if (ohm & 0x04) red |= x3 << 6;
+			if (ohm & 0x10) red |= x5 << 7;
+			if (ohm & 0x0F) red |= x2 << 7;
+			if (ohm & 0x05) red |= x1 << 8;
+			if (ohm & 0x0A) red |= x0 << 8;
+			if (ohm & 0x05) red |= x0 << 9;
+			if (ohm & 0x02) red |= x6 << 9;
+			if (ohm & 0x01) red |= x3 << 10;
+			if (ohm & 0x02) red |= x5 << 10;
+
+			static const int s_shamts[6] = { 1,1,2,3,4,5 };
+			
+			const int shamt = s_shamts[mode];
+			red <<= shamt; 
+			green <<= shamt; 
+			blue <<= shamt; 
+			scale <<= shamt;
+
+			if (mode != 5) 
+			{ 
+				green = red - green; 
+				blue = red - blue; 
+			}
+
+			if (majcomp == 1) 
+				std::swap(red, green);
+
+			if (majcomp == 2) 
+				std::swap(red, blue);
+						
+			e1_r = clamp(red, 0, 0xFFF);
+			e1_g = clamp(green, 0, 0xFFF);
+			e1_b = clamp(blue, 0, 0xFFF);
+			e1_a = 0x780;
+
+			e0_r = clamp(red - scale, 0, 0xFFF);
+			e0_g = clamp(green - scale, 0, 0xFFF);
+			e0_b = clamp(blue - scale, 0, 0xFFF);
+			e0_a = 0x780;
+
+			break;
+		}
+		case CEM_HDR_RGB_HDR_ALPHA:
+		case CEM_HDR_RGB_LDR_ALPHA:
+		case CEM_HDR_RGB:
+		{
+			int v2 = pE[2], v3 = pE[3], v4 = pE[4], v5 = pE[5];
+
+			int majcomp = ((v4 & 0x80) >> 7) | ((v5 & 0x80) >> 6);
+
+			e0_a = 0x780;
+			e1_a = 0x780;
+
+			if (majcomp == 3) 
+			{
+				e0_r = v0 << 4;
+				e0_g = v2 << 4;
+				e0_b = (v4 & 0x7f) << 5;
+
+				e1_r = v1 << 4;
+				e1_g = v3 << 4;
+				e1_b = (v5 & 0x7f) << 5;
+			}
+			else
+			{
+				int mode = ((v1 & 0x80) >> 7) | ((v2 & 0x80) >> 6) | ((v3 & 0x80) >> 5);
+				int va = v0 | ((v1 & 0x40) << 2);
+				int vb0 = v2 & 0x3f;
+				int vb1 = v3 & 0x3f;
+				int vc = v1 & 0x3f;
+				int vd0 = v4 & 0x7f;
+				int vd1 = v5 & 0x7f;
+
+				static const int s_dbitstab[8] = { 7,6,7,6,5,6,5,6 };
+				vd0 = sign_extend(vd0, s_dbitstab[mode]);
+				vd1 = sign_extend(vd1, s_dbitstab[mode]);
+
+				int x0 = (v2 >> 6) & 1;
+				int x1 = (v3 >> 6) & 1;
+				int x2 = (v4 >> 6) & 1;
+				int x3 = (v5 >> 6) & 1;
+				int x4 = (v4 >> 5) & 1;
+				int x5 = (v5 >> 5) & 1;
+
+				int ohm = 1 << mode;
+				if (ohm & 0xA4) va |= x0 << 9;
+				if (ohm & 0x08) va |= x2 << 9;
+				if (ohm & 0x50) va |= x4 << 9;
+				if (ohm & 0x50) va |= x5 << 10;
+				if (ohm & 0xA0) va |= x1 << 10;
+				if (ohm & 0xC0) va |= x2 << 11;
+				if (ohm & 0x04) vc |= x1 << 6;
+				if (ohm & 0xE8) vc |= x3 << 6;
+				if (ohm & 0x20) vc |= x2 << 7;
+				if (ohm & 0x5B) vb0 |= x0 << 6;
+				if (ohm & 0x5B) vb1 |= x1 << 6;
+				if (ohm & 0x12) vb0 |= x2 << 7;
+				if (ohm & 0x12) vb1 |= x3 << 7;
+
+				int shamt = (mode >> 1) ^ 3;
+				va  = (uint32_t)va  << shamt;
+				vb0 = (uint32_t)vb0 << shamt;
+				vb1 = (uint32_t)vb1 << shamt;
+				vc  = (uint32_t)vc  << shamt;
+				vd0 = (uint32_t)vd0 << shamt;
+				vd1 = (uint32_t)vd1 << shamt;
+
+				e1_r = clamp(va, 0, 0xFFF);
+				e1_g = clamp(va - vb0, 0, 0xFFF);
+				e1_b = clamp(va - vb1, 0, 0xFFF);
+
+				e0_r = clamp(va - vc, 0, 0xFFF);
+				e0_g = clamp(va - vb0 - vc - vd0, 0, 0xFFF);
+				e0_b = clamp(va - vb1 - vc - vd1, 0, 0xFFF);
+
+				if (majcomp == 1)
+				{
+					std::swap(e0_r, e0_g);
+					std::swap(e1_r, e1_g);
+				}
+				else if (majcomp == 2)
+				{
+					std::swap(e0_r, e0_b);
+					std::swap(e1_r, e1_b);
+				}
+			}
+
+			if (cem_index == CEM_HDR_RGB_LDR_ALPHA)
+			{
+				int v6 = pE[6], v7 = pE[7];
+
+				e0_a = v6;
+				e1_a = v7;
+			}
+			else if (cem_index == CEM_HDR_RGB_HDR_ALPHA)
+			{
+				int v6 = pE[6], v7 = pE[7];
+
+				// Extract mode bits
+				int mode = ((v6 >> 7) & 1) | ((v7 >> 6) & 2);
+				v6 &= 0x7F;
+				v7 &= 0x7F;
+
+				if (mode == 3)
+				{
+					e0_a = v6 << 5;
+					e1_a = v7 << 5;
+				}
+				else
+				{
+					v6 |= (v7 << (mode + 1)) & 0x780;
+					v7 &= (0x3F >> mode);
+					v7 ^= (0x20 >> mode);
+					v7 -= (0x20 >> mode);
+					v6 <<= (4 - mode); 
+					v7 <<= (4 - mode);
+
+					v7 += v6;
+					v7 = clamp(v7, 0, 0xFFF);
+					e0_a = v6; 
+					e1_a = v7;
+				}
+			}
+
+			break;
+		}
+		default:
+		{
+			assert(0);
+			for (uint32_t c = 0; c < 4; c++)
+			{
+				pEndpoints[c][0] = 0;
+				pEndpoints[c][1] = 0;
+			}
+			break;
+		}
+		}
+	}
+		
+	static inline bool is_half_inf_or_nan(half_float v) // Inf and NaN are the half encodings whose exponent field is all ones.
+	{
+		const auto exp_field = get_bits(v, 10, 14); return exp_field == 31; // assumes get_bits(v, 10, 14) yields bits 10..14 inclusive - TODO confirm
+	}
+
+	// Converts a 32-bit float to a half float. This conversion matches how "F32TO16" works on Intel GPU's.
+	half_float float_to_half(float val, bool toward_zero)
+	{
+		union { float f; int32_t i; uint32_t u; } src = { val };
+		const int src_sign = (src.i >> 31) & 0x1;
+		const int src_exp = (src.i >> 23) & 0xFF;
+		const int src_mant = src.i & 0x7FFFFF;
+
+		// toward_zero selects truncation; otherwise lrintf() rounds according to the current rounding mode.
+		const auto to_int = [toward_zero](float x) -> int { return toward_zero ? (int)truncf(x) : (int)lrintf(x); };
+
+		// Half-float exponent and mantissa under construction. A zero or denormal float input leaves both at 0.
+		int e = 0, m = 0;
+
+		if (src_exp == 0xff)
+		{
+			// inf/NaN: saturate the exponent; every NaN collapses to mantissa 1.
+			e = 31;
+			m = (src_mant != 0) ? 1 : 0;
+		}
+		else if (src_exp != 0)
+		{
+			const int unbiased_exp = src_exp - 127;
+
+			if (unbiased_exp > 15)
+			{
+				// Too large for a finite half: the result is infinity.
+				e = 31;
+			}
+			else if (unbiased_exp >= -14)
+			{
+				// Normal half: rebias the exponent and reduce the 23-bit mantissa to 10 bits.
+				e = unbiased_exp + 15;
+				m = to_int((float)src_mant * (1.0f / (float)(1 << 13)));
+			}
+			else
+			{
+				// Below the smallest normal half: the mantissa is the magnitude expressed in units of 2^-24.
+				m = to_int((1 << 24) * fabsf(src.f));
+			}
+		}
+
+		// Rounding can carry out of the 10-bit mantissa; move the carry into the exponent.
+		assert((0 <= m) && (m <= 1024));
+		if (m == 1024) { e++; m = 0; }
+
+		assert((src_sign >= 0) && (src_sign <= 1));
+		assert((e >= 0) && (e <= 31));
+		assert((m >= 0) && (m <= 1023));
+		return (half_float)((src_sign << 15) | (e << 10) | m);
+	}
+
+	// Expands a 16-bit half float into the equivalent 32-bit float.
+	// Zeros, denormals, infinities and NaNs are handled; a NaN's mantissa bits are carried into the float's mantissa.
+	float half_to_float(half_float hval)
+	{
+		union { float f; uint32_t u; } out = { 0 };
+
+		// Split the half into its 1-bit sign, 5-bit exponent and 10-bit mantissa.
+		const uint32_t s = ((uint32_t)hval >> 15) & 1;
+		uint32_t e = ((uint32_t)hval >> 10) & 0x1F;
+		uint32_t m = (uint32_t)hval & 0x3FF;
+
+		if (e == 31)
+		{
+			// +/- INF when the mantissa is zero, +/- NaN otherwise.
+			// For INF the shifted mantissa contributes nothing, so one expression covers both cases.
+			out.u = (s << 31) | 0x7f800000 | (m << 13);
+			return out.f;
+		}
+
+		if (e == 0)
+		{
+			if (m == 0)
+			{
+				// +- 0
+				out.u = s << 31;
+				return out.f;
+			}
+
+			// denormalized: shift the mantissa up until its leading one reaches bit 10, lowering the
+			// exponent once per shift. e is unsigned and wraps below zero here; the wraparound cancels
+			// out when the bias is added further down.
+			for (; (m & 0x00000400) == 0; m <<= 1)
+			{
+				--e;
+			}
+
+			++e;
+			// Clear bit 10 so only the 10 fraction bits remain.
+			m &= ~0x00000400;
+		}
+
+		// Rebias the exponent from half (15) to float (127) and left-align the 10-bit mantissa in 23 bits.
+		e += (127 - 15);
+		m <<= 13;
+
+		// Each field must fit its slot in the float.
+		assert(s <= 1);
+		assert(m <= 0x7FFFFF);
+		assert(e <= 255);
+
+		// Assemble sign | exponent | mantissa.
+		const uint32_t float_bits = (s << 31) | (e << 23) | m;
+		out.u = float_bits;
+		return out.f;
+	}
+
+	static inline half_float qlog16_to_half(int k) // Maps a 16-bit code (5-bit exponent, 11-bit mantissa) to half-float bits.
+	{
+		assert((k >= 0) && (k <= 0xFFFF));
+
+		const int exp_bits = (k & 0xF800) >> 11;
+		const int mant = k & 0x7FF;
+
+		// Piecewise-linear remap of the 11-bit mantissa with breakpoints at 512 and 1536; the middle segment is the default.
+		int remapped = 4 * mant - 512;
+		if (mant < 512)
+			remapped = 3 * mant;
+		else if (mant >= 1536)
+			remapped = 5 * mant - 2048;
+
+		const int half_bits = (exp_bits << 10) + (remapped >> 3);
+		return (half_float)half_bits;
+	}
+
+	// See https://registry.khronos.org/OpenGL/extensions/EXT/EXT_texture_shared_exponent.txt
+	const int RGB9E5_EXPONENT_BITS = 5, RGB9E5_MANTISSA_BITS = 9, RGB9E5_EXP_BIAS = 15, RGB9E5_MAX_VALID_BIASED_EXP = 31; // field widths, exponent bias, largest biased exponent
+	const int MAX_RGB9E5_EXP = (RGB9E5_MAX_VALID_BIASED_EXP - RGB9E5_EXP_BIAS); // 31 - 15 = 16
+	const int RGB9E5_MANTISSA_VALUES = (1 << RGB9E5_MANTISSA_BITS); // 512
+	const int MAX_RGB9E5_MANTISSA = (RGB9E5_MANTISSA_VALUES - 1); // 511
+	//const int MAX_RGB9E5 = (int)(((float)MAX_RGB9E5_MANTISSA) / RGB9E5_MANTISSA_VALUES * (1 << MAX_RGB9E5_EXP)); (disabled; pack_rgb9e5() still uses MAX_RGB9E5, so it is presumably defined elsewhere - confirm)
+	const int EPSILON_RGB9E5 = (int)((1.0f / (float)RGB9E5_MANTISSA_VALUES) / (float)(1 << RGB9E5_EXP_BIAS)); // NOTE(review): the value is 2^-24 before the (int) cast, so this constant evaluates to 0
+		
+	void unpack_rgb9e5(uint32_t packed, float& r, float& g, float& b) // Decodes a packed 9:9:9:5 shared-exponent texel into r, g, b.
+	{
+		// Bits 27..31 hold the shared biased exponent; removing the bias and the mantissa width gives the scale's power of two.
+		const int shared_exp = (packed >> 27) & 31;
+		const float scale = powf(2.0f, static_cast<float>(shared_exp - RGB9E5_EXP_BIAS - RGB9E5_MANTISSA_BITS));
+
+		// Three 9-bit mantissas: red in bits 0..8, green in bits 9..17, blue in bits 18..26.
+		const int red_m = packed & 511;
+		const int green_m = (packed >> 9) & 511;
+		const int blue_m = (packed >> 18) & 511;
+
+		r = red_m * scale; g = green_m * scale; b = blue_m * scale;
+	}
+			
+	// Returns the unbiased IEEE-754 exponent of x. Not correct for the denorm and zero values, but the caller takes a max of this value with the minimum rgb9e5 exponent, which hides these problem cases.
+	static inline int floor_log2(float x) 
+	{
+		// Overlay that exposes the raw bits of the float.
+		union float_bits
+		{
+			unsigned int raw;
+			float value;
+		};
+
+		float_bits pun;
+		pun.value = x;
+		return ((pun.raw >> 23) & 0xFF) - 127; // biased exponent field minus the float exponent bias
+	}
+
+	static inline int maximumi(int a, int b) { return (b < a) ? a : b; } // larger of two ints; b when they are equal
+	static inline float maximumf(float a, float b) { return (b < a) ? a : b; } // larger of two floats; yields b whenever a > b is false (ties, or a NaN operand)
+
+	uint32_t pack_rgb9e5(float r, float g, float b) // Packs three floats into a 9:9:9:5 shared-exponent texel.
+	{
+		r = clampf(r, 0.0f, MAX_RGB9E5);
+		g = clampf(g, 0.0f, MAX_RGB9E5);
+		b = clampf(b, 0.0f, MAX_RGB9E5);
+
+		// The largest channel picks the shared exponent; the lower bound passed to maximumi() covers zero/denormal inputs.
+		const float largest = maximumf(maximumf(r, g), b);
+		int biased_exp = maximumi(-RGB9E5_EXP_BIAS - 1, floor_log2(largest)) + 1 + RGB9E5_EXP_BIAS;
+		assert((biased_exp >= 0) && (biased_exp <= RGB9E5_MAX_VALID_BIASED_EXP));
+
+		float step = powf(2.0f, (float)(biased_exp - RGB9E5_EXP_BIAS - RGB9E5_MANTISSA_BITS));
+
+		// If the largest channel rounds to one past the top mantissa value, move up one exponent (doubling the step).
+		const int largest_m = (int)floorf((largest / step) + 0.5f);
+		if (largest_m != (MAX_RGB9E5_MANTISSA + 1))
+		{
+			assert(largest_m <= MAX_RGB9E5_MANTISSA);
+		}
+		else
+		{
+			step *= 2;
+			biased_exp += 1;
+			assert(biased_exp <= RGB9E5_MAX_VALID_BIASED_EXP);
+		}
+
+		// Round each channel to the nearest multiple of the step.
+		const int rm = (int)floorf((r / step) + 0.5f), gm = (int)floorf((g / step) + 0.5f), bm = (int)floorf((b / step) + 0.5f);
+		assert((rm >= 0) && (rm <= MAX_RGB9E5_MANTISSA));
+		assert((gm >= 0) && (gm <= MAX_RGB9E5_MANTISSA));
+		assert((bm >= 0) && (bm <= MAX_RGB9E5_MANTISSA));
+
+		return rm | (gm << 9) | (bm << 18) | (biased_exp << 27);
+	}
+
+	// Counts the leading zero bits of x viewed as a 17-bit value; returns 17 when x is 0.
+	static inline int clz17(uint32_t x)
+	{
+		assert(x <= 0x1FFFF);
+
+		// Discard anything above bit 16 (only reachable when asserts are compiled out).
+		uint32_t remaining = x & 0x1FFFF;
+		if (remaining == 0)
+			return 17;
+
+		// Shift the value up until its highest set bit lands on bit 16, counting the shifts.
+		uint32_t count = 0;
+		for (; (remaining & 0x10000) == 0; remaining <<= 1u)
+			count++;
+
+		return count;
+	}
+
+	// Packs three LDR channel values (decode_block() passes interpolated values in [0, 0xFFFF]) into an RGB9E5 texel.
+	static inline uint32_t pack_rgb9e5_ldr_astc(int Cr, int Cg, int Cb)
+	{
+		// Normalising shift: leading zeros of the combined channels. OR-ing in 1 keeps the argument to clz17() non-zero.
+		int shift = clz17(Cr | Cg | Cb | 1);
+
+		// A channel of 65535 is promoted to 65536, and the shift is dropped so the value is not moved further up.
+		if (Cr == 65535) { Cr = 65536; shift = 0; }
+		if (Cg == 65535) { Cg = 65536; shift = 0; }
+		if (Cb == 65535) { Cb = 65536; shift = 0; }
+		const int red_m = ((Cr << shift) >> 8) & 0x1FF, green_m = ((Cg << shift) >> 8) & 0x1FF, blue_m = ((Cb << shift) >> 8) & 0x1FF;
+		const uint32_t shared_exp = 16 - shift;
+		return (shared_exp << 27) | (blue_m << 18) | (green_m << 9) | red_m;
+	}
+
+	// Packs three half-float bit patterns into one RGB9E5 texel.
+	static inline uint32_t pack_rgb9e5_hdr_astc(int Cr, int Cg, int Cb)
+	{
+		// Anything above 0x7c00 (the +Inf pattern) becomes 0; +Inf itself becomes 0x7bff, the largest finite half.
+		const auto sanitize = [](int h) -> int { return (h > 0x7c00) ? 0 : ((h == 0x7c00) ? 0x7bff : h); };
+		Cr = sanitize(Cr);
+		Cg = sanitize(Cg);
+		Cb = sanitize(Cb);
+
+		// Raw 5-bit exponents, plus "effective" exponents in which a zero exponent counts as 1.
+		const int exp_r = (Cr >> 10) & 0x1F, exp_g = (Cg >> 10) & 0x1F, exp_b = (Cb >> 10) & 0x1F;
+		const int eff_r = exp_r ? exp_r : 1, eff_g = exp_g ? exp_g : 1, eff_b = exp_b ? exp_b : 1;
+
+		uint32_t shift_r, shift_g, shift_b, shared_exp;
+
+		if ((exp_r | exp_g | exp_b) == 0)
+		{
+			// All three exponents are zero: bit 9 of the OR-ed channels decides both the exponent and every shift.
+			const int any_bit9 = ((Cr | Cg | Cb) & 0x200) >> 9;
+			shared_exp = shift_r = shift_g = shift_b = any_bit9;
+		}
+		else if ((exp_r >= exp_g) && (exp_r >= exp_b))
+		{
+			// Red has the largest exponent and sets the shared one; the other channels are shifted down by the difference.
+			shared_exp = eff_r + 1;
+			shift_r = 2;
+			shift_g = eff_r - eff_g + 2;
+			shift_b = eff_r - eff_b + 2;
+		}
+		else if (exp_g >= exp_b)
+		{
+			// Green has the largest exponent.
+			shared_exp = eff_g + 1;
+			shift_r = eff_g - eff_r + 2;
+			shift_g = 2;
+			shift_b = eff_g - eff_b + 2;
+		}
+		else
+		{
+			// Blue has the largest exponent.
+			shared_exp = eff_b + 1;
+			shift_r = eff_b - eff_r + 2;
+			shift_g = eff_b - eff_g + 2;
+			shift_b = 2;
+		}
+
+		// 10 stored mantissa bits, plus bit 10 set unless the exponent is zero, shifted into place and cut to 9 bits.
+		const auto mantissa9 = [](int c, int raw_exp, uint32_t shift) -> int { return (((c & 0x3FF) | (raw_exp == 0 ? 0 : 0x400)) >> shift) & 0x1FF; };
+		const int Rm = mantissa9(Cr, exp_r, shift_r), Gm = mantissa9(Cg, exp_g, shift_g), Bm = mantissa9(Cb, exp_b, shift_b);
+		return (shared_exp << 27) | (Bm << 18) | (Gm << 9) | (Rm << 0);
+	}
+		
+	// Decodes a logical ASTC block into blk_width x blk_height texels written to pPixels. Returns true on success.
+	// Important: pPixels is either 32-bit/texel (8-bit RGBA and RGB9E5 output) or 64-bit/texel (cDecodeModeHDR16: four half floats).
+	// Returns false without writing if the dequantization tables are empty. Otherwise pPixels is first filled with an error color, and false is returned for error-flagged or invalid blocks and for HDR content requested in an 8-bit mode.
+	bool decode_block(const log_astc_block& log_blk, void* pPixels, uint32_t blk_width, uint32_t blk_height, decode_mode dec_mode)
+	{
+		assert(is_valid_block_size(blk_width, blk_height));
+
+		assert(g_dequant_tables.m_endpoints[0].m_ISE_to_val.size());
+		if (!g_dequant_tables.m_endpoints[0].m_ISE_to_val.size())
+			return false;
+
+		const uint32_t num_blk_pixels = blk_width * blk_height;
+
+		// Write block error color
+		if (dec_mode == cDecodeModeHDR16)
+		{
+			// NaN's
+			memset(pPixels, 0xFF, num_blk_pixels * sizeof(half_float) * 4);
+		}
+		else if (dec_mode == cDecodeModeRGB9E5)
+		{
+			const uint32_t purple_9e5 = pack_rgb9e5(1.0f, 0.0f, 1.0f);
+
+			for (uint32_t i = 0; i < num_blk_pixels; i++)
+				((uint32_t*)pPixels)[i] = purple_9e5;
+		}
+		else
+		{
+			for (uint32_t i = 0; i < num_blk_pixels; i++)
+				((uint32_t*)pPixels)[i] = 0xFFFF00FF;
+		}
+
+		if (log_blk.m_error_flag)
+		{
+			// Should this return false? It's not an invalid logical block config, though.
+			return false;
+		}
+
+		// Handle solid color blocks
+		if (log_blk.m_solid_color_flag_ldr)
+		{
+			// LDR solid block
+			if (dec_mode == cDecodeModeHDR16)
+			{
+				// Convert LDR pixels to half-float
+				half_float h[4];
+				for (uint32_t c = 0; c < 4; c++)
+					h[c] = (log_blk.m_solid_color[c] == 0xFFFF) ? 0x3C00 : float_to_half((float)log_blk.m_solid_color[c] * (1.0f / 65536.0f), true);
+
+				for (uint32_t i = 0; i < num_blk_pixels; i++)
+					memcpy((uint16_t*)pPixels + i * 4, h, sizeof(half_float) * 4);
+			}
+			else if (dec_mode == cDecodeModeRGB9E5)
+			{
+				float r = (log_blk.m_solid_color[0] == 0xFFFF) ? 1.0f : ((float)log_blk.m_solid_color[0] * (1.0f / 65536.0f));
+				float g = (log_blk.m_solid_color[1] == 0xFFFF) ? 1.0f : ((float)log_blk.m_solid_color[1] * (1.0f / 65536.0f));
+				float b = (log_blk.m_solid_color[2] == 0xFFFF) ? 1.0f : ((float)log_blk.m_solid_color[2] * (1.0f / 65536.0f));
+
+				const uint32_t packed = pack_rgb9e5(r, g, b);
+
+				for (uint32_t i = 0; i < num_blk_pixels; i++)
+					((uint32_t*)pPixels)[i] = packed;
+			}
+			else
+			{
+				// Convert LDR pixels to 8-bits
+				for (uint32_t i = 0; i < num_blk_pixels; i++)
+					for (uint32_t c = 0; c < 4; c++)
+						((uint8_t*)pPixels)[i * 4 + c] = (log_blk.m_solid_color[c] >> 8);
+			}
+
+			return true;
+		}
+		else if (log_blk.m_solid_color_flag_hdr)
+		{
+			// HDR solid block, decode mode must be half-float or RGB9E5
+			if (dec_mode == cDecodeModeHDR16)
+			{
+				for (uint32_t i = 0; i < num_blk_pixels; i++)
+					memcpy((uint16_t*)pPixels + i * 4, log_blk.m_solid_color, sizeof(half_float) * 4);
+			}
+			else if (dec_mode == cDecodeModeRGB9E5)
+			{
+				float r = half_to_float(log_blk.m_solid_color[0]);
+				float g = half_to_float(log_blk.m_solid_color[1]);
+				float b = half_to_float(log_blk.m_solid_color[2]);
+
+				const uint32_t packed = pack_rgb9e5(r, g, b);
+
+				for (uint32_t i = 0; i < num_blk_pixels; i++)
+					((uint32_t*)pPixels)[i] = packed;
+			}
+			else
+			{
+				return false;
+			}
+
+			return true;
+		}
+
+		// Sanity check block's config
+		if ((log_blk.m_grid_width < 2) || (log_blk.m_grid_height < 2))
+			return false;
+		if ((log_blk.m_grid_width > blk_width) || (log_blk.m_grid_height > blk_height))
+			return false;
+
+		if ((log_blk.m_endpoint_ise_range < FIRST_VALID_ENDPOINT_ISE_RANGE) || (log_blk.m_endpoint_ise_range > LAST_VALID_ENDPOINT_ISE_RANGE))
+			return false;
+		if ((log_blk.m_weight_ise_range < FIRST_VALID_WEIGHT_ISE_RANGE) || (log_blk.m_weight_ise_range > LAST_VALID_WEIGHT_ISE_RANGE))
+			return false;
+		if ((log_blk.m_num_partitions < 1) || (log_blk.m_num_partitions > MAX_PARTITIONS))
+			return false;
+		if ((log_blk.m_dual_plane) && (log_blk.m_num_partitions > MAX_DUAL_PLANE_PARTITIONS))
+			return false;
+		if (log_blk.m_partition_id >= NUM_PARTITION_PATTERNS)
+			return false;
+		if ((log_blk.m_num_partitions == 1) && (log_blk.m_partition_id > 0))
+			return false;
+		if (log_blk.m_color_component_selector > 3)
+			return false;
+
+		const uint32_t total_endpoint_levels = get_ise_levels(log_blk.m_endpoint_ise_range);
+		const uint32_t total_weight_levels = get_ise_levels(log_blk.m_weight_ise_range);
+
+		bool is_ldr_endpoints[MAX_PARTITIONS];
+
+		// Check CEM's
+		uint32_t total_cem_vals = 0;
+		for (uint32_t i = 0; i < log_blk.m_num_partitions; i++)
+		{
+			if (log_blk.m_color_endpoint_modes[i] > 15)
+				return false;
+
+			total_cem_vals += get_num_cem_values(log_blk.m_color_endpoint_modes[i]);
+
+			is_ldr_endpoints[i] = is_cem_ldr(log_blk.m_color_endpoint_modes[i]);
+		}
+
+		if (total_cem_vals > MAX_ENDPOINTS)
+			return false;
+
+		const dequant_table& endpoint_dequant_tab = g_dequant_tables.get_endpoint_tab(log_blk.m_endpoint_ise_range);
+		const uint8_t* pEndpoint_dequant = endpoint_dequant_tab.m_ISE_to_val.data();
+
+		// Dequantized endpoints to [0,255]
+		uint8_t dequantized_endpoints[MAX_ENDPOINTS];
+		for (uint32_t i = 0; i < total_cem_vals; i++)
+		{
+			if (log_blk.m_endpoints[i] >= total_endpoint_levels)
+				return false;
+			dequantized_endpoints[i] = pEndpoint_dequant[log_blk.m_endpoints[i]];
+		}
+
+		// Dequantize weights to [0,64]
+		uint8_t dequantized_weights[2][12 * 12];
+
+		const dequant_table& weight_dequant_tab = g_dequant_tables.get_weight_tab(log_blk.m_weight_ise_range);
+		const uint8_t* pWeight_dequant = weight_dequant_tab.m_ISE_to_val.data();
+
+		const uint32_t total_weight_vals = (log_blk.m_dual_plane ? 2 : 1) * log_blk.m_grid_width * log_blk.m_grid_height;
+		for (uint32_t i = 0; i < total_weight_vals; i++)
+		{
+			if (log_blk.m_weights[i] >= total_weight_levels)
+				return false;
+
+			// With two planes the stored weights alternate plane 0 / plane 1, so de-interleave them here.
+			const uint32_t plane_index = log_blk.m_dual_plane ? (i & 1) : 0;
+			const uint32_t grid_index = log_blk.m_dual_plane ? (i >> 1) : i;
+
+			dequantized_weights[plane_index][grid_index] = pWeight_dequant[log_blk.m_weights[i]];
+		}
+
+		// Upsample weight grid. [0,64] weights
+		uint8_t upsampled_weights[2][12 * 12];
+
+		upsample_weight_grid(blk_width, blk_height, log_blk.m_grid_width, log_blk.m_grid_height, &dequantized_weights[0][0], &upsampled_weights[0][0]);
+		if (log_blk.m_dual_plane)
+			upsample_weight_grid(blk_width, blk_height, log_blk.m_grid_width, log_blk.m_grid_height, &dequantized_weights[1][0], &upsampled_weights[1][0]);
+
+		// Decode CEM's
+		int endpoints[4][4][2]; // [subset][comp][l/h]
+
+		uint32_t endpoint_val_index = 0;
+		for (uint32_t subset = 0; subset < log_blk.m_num_partitions; subset++)
+		{
+			const uint32_t cem_index = log_blk.m_color_endpoint_modes[subset];
+
+			decode_endpoint(cem_index, &endpoints[subset][0], &dequantized_endpoints[endpoint_val_index]);
+
+			endpoint_val_index += get_num_cem_values(cem_index);
+		}
+
+		// Decode texels
+		const bool small_block = num_blk_pixels < 31;
+		const bool use_precomputed_texel_partitions = (blk_width == 4) && (blk_height == 4) && (log_blk.m_num_partitions >= 2) && (log_blk.m_num_partitions <= 3);
+		const uint32_t ccs = log_blk.m_dual_plane ? log_blk.m_color_component_selector : UINT32_MAX; // UINT32_MAX never equals a component index, so single-plane blocks always use plane 0
+
+		bool success = true;
+
+		if (dec_mode == cDecodeModeRGB9E5)
+		{
+			// returns uint32_t's
+			for (uint32_t y = 0; y < blk_height; y++)
+			{
+				for (uint32_t x = 0; x < blk_width; x++)
+				{
+					const uint32_t pixel_index = x + y * blk_width;
+					const uint32_t subset = (log_blk.m_num_partitions > 1) ?
+						(use_precomputed_texel_partitions ? get_precompute_texel_partitions_4x4(log_blk.m_partition_id, x, y, log_blk.m_num_partitions) : compute_texel_partition(log_blk.m_partition_id, x, y, 0, log_blk.m_num_partitions, small_block))
+						: 0;
+
+					int comp[3];
+
+					for (uint32_t c = 0; c < 3; c++)
+					{
+						const uint32_t w = upsampled_weights[(c == ccs) ? 1 : 0][pixel_index];
+
+						if (is_ldr_endpoints[subset])
+						{
+							assert((endpoints[subset][c][0] >= 0) && (endpoints[subset][c][0] <= 0xFF));
+							assert((endpoints[subset][c][1] >= 0) && (endpoints[subset][c][1] <= 0xFF));
+
+							int le = endpoints[subset][c][0];
+							int he = endpoints[subset][c][1];
+
+							le = (le << 8) | le;
+							he = (he << 8) | he;
+
+							int k = weight_interpolate(le, he, w);
+							assert((k >= 0) && (k <= 0xFFFF));
+
+							comp[c] = k; // 1.0
+						}
+						else
+						{
+							assert((endpoints[subset][c][0] >= 0) && (endpoints[subset][c][0] <= 0xFFF));
+							assert((endpoints[subset][c][1] >= 0) && (endpoints[subset][c][1] <= 0xFFF));
+
+							int le = endpoints[subset][c][0] << 4;
+							int he = endpoints[subset][c][1] << 4;
+
+							int qlog16 = weight_interpolate(le, he, w);
+
+							comp[c] = qlog16_to_half(qlog16);
+
+							if (is_half_inf_or_nan((half_float)comp[c]))
+								comp[c] = 0x7BFF;
+						}
+					} // c
+
+					uint32_t packed;
+					if (is_ldr_endpoints[subset])
+						packed = pack_rgb9e5_ldr_astc(comp[0], comp[1], comp[2]);
+					else
+						packed = pack_rgb9e5_hdr_astc(comp[0], comp[1], comp[2]);
+
+					((uint32_t*)pPixels)[pixel_index] = packed;
+
+				} // x
+			} // y
+		}
+		else if (dec_mode == cDecodeModeHDR16)
+		{
+			// Note: must round towards zero when converting float to half for ASTC (18.19 Weight Application)
+
+			// returns half floats
+			for (uint32_t y = 0; y < blk_height; y++)
+			{
+				for (uint32_t x = 0; x < blk_width; x++)
+				{
+					const uint32_t pixel_index = x + y * blk_width;
+					const uint32_t subset = (log_blk.m_num_partitions > 1) ?
+						(use_precomputed_texel_partitions ? get_precompute_texel_partitions_4x4(log_blk.m_partition_id, x, y, log_blk.m_num_partitions) : compute_texel_partition(log_blk.m_partition_id, x, y, 0, log_blk.m_num_partitions, small_block))
+						: 0;
+
+					for (uint32_t c = 0; c < 4; c++)
+					{
+						const uint32_t w = upsampled_weights[(c == ccs) ? 1 : 0][pixel_index];
+
+						half_float o;
+
+						if ( (is_ldr_endpoints[subset]) ||
+							 ((log_blk.m_color_endpoint_modes[subset] == CEM_HDR_RGB_LDR_ALPHA) && (c == 3)) )
+						{
+							assert((endpoints[subset][c][0] >= 0) && (endpoints[subset][c][0] <= 0xFF));
+							assert((endpoints[subset][c][1] >= 0) && (endpoints[subset][c][1] <= 0xFF));
+
+							int le = endpoints[subset][c][0];
+							int he = endpoints[subset][c][1];
+
+							le = (le << 8) | le;
+							he = (he << 8) | he;
+
+							int k = weight_interpolate(le, he, w);
+							assert((k >= 0) && (k <= 0xFFFF));
+
+							if (k == 0xFFFF)
+								o = 0x3C00; // 1.0
+							else
+								o = float_to_half((float)k * (1.0f / 65536.0f), true);
+						}
+						else
+						{
+							assert((endpoints[subset][c][0] >= 0) && (endpoints[subset][c][0] <= 0xFFF));
+							assert((endpoints[subset][c][1] >= 0) && (endpoints[subset][c][1] <= 0xFFF));
+
+							int le = endpoints[subset][c][0] << 4;
+							int he = endpoints[subset][c][1] << 4;
+
+							int qlog16 = weight_interpolate(le, he, w);
+							o = qlog16_to_half(qlog16);
+
+							if (is_half_inf_or_nan(o))
+								o = 0x7BFF;
+						}
+
+						((half_float*)pPixels)[pixel_index * 4 + c] = o;
+					}
+
+				} // x
+			} // y
+		}
+		else
+		{
+			// returns uint8_t's
+			for (uint32_t y = 0; y < blk_height; y++)
+			{
+				for (uint32_t x = 0; x < blk_width; x++)
+				{
+					const uint32_t pixel_index = x + y * blk_width;
+
+					const uint32_t subset = (log_blk.m_num_partitions > 1) ?
+						(use_precomputed_texel_partitions ? get_precompute_texel_partitions_4x4(log_blk.m_partition_id, x, y, log_blk.m_num_partitions) : compute_texel_partition(log_blk.m_partition_id, x, y, 0, log_blk.m_num_partitions, small_block))
+						: 0;
+
+					if (!is_ldr_endpoints[subset])
+					{
+						((uint32_t*)pPixels)[pixel_index] = 0xFFFF00FF; // one uint32_t per texel in this mode; indexing by pixel_index * 4 would write past the end of the block
+						success = false;
+					}
+					else
+					{
+						for (uint32_t c = 0; c < 4; c++)
+						{
+							const uint32_t w = upsampled_weights[(c == ccs) ? 1 : 0][pixel_index];
+
+							int le = endpoints[subset][c][0];
+							int he = endpoints[subset][c][1];
+
+							// FIXME: the spec is apparently wrong? this matches ARM's and Google's decoder
+							//if ((dec_mode == cDecodeModeSRGB8) && (c <= 2))
+							// See https://github.com/ARM-software/astc-encoder/issues/447
+							if (dec_mode == cDecodeModeSRGB8)
+							{
+								le = (le << 8) | 0x80;
+								he = (he << 8) | 0x80;
+							}
+							else
+							{
+								le = (le << 8) | le;
+								he = (he << 8) | he;
+							}
+
+							uint32_t k = weight_interpolate(le, he, w);
+
+							// FIXME: This is what the spec says to do in LDR mode, but this is not what ARM's decoder does
+							// See decompress_symbolic_block(), decode_texel() and unorm16_to_sf16. 
+							// It seems to effectively divide by 65535.0 and convert to FP16, then back to float, mul by 255.0, add .5 and then convert to 8-bit.
+							((uint8_t*)pPixels)[pixel_index * 4 + c] = (uint8_t)(k >> 8);
+						}
+					}
+
+				} // x
+			} // y
+		}
+
+		return success;
+	}
+
+	//------------------------------------------------
+	// Physical to logical block decoding
+
+	// unsigned 128-bit int, with some signed helpers
+	class uint128
+	{
+		uint64_t m_lo, m_hi;
+
+	public:
+		uint128() = default;
+		inline uint128(uint64_t lo) : m_lo(lo), m_hi(0) { }
+		inline uint128(uint64_t lo, uint64_t hi) : m_lo(lo), m_hi(hi) { }
+		inline uint128(const uint128& other) : m_lo(other.m_lo), m_hi(other.m_hi) { }
+
+		inline uint128& set_signed(int64_t lo) { m_lo = lo; m_hi = (lo < 0) ? UINT64_MAX : 0; return *this; }
+		inline uint128& set(uint64_t lo) { m_lo = lo; m_hi = 0; return *this; }
+
+		inline explicit operator uint8_t () const { return (uint8_t)m_lo; }
+		inline explicit operator uint16_t () const { return (uint16_t)m_lo; }
+		inline explicit operator uint32_t () const { return (uint32_t)m_lo; }
+		inline explicit operator uint64_t () const { return m_lo; }
+
+		inline uint128& operator= (const uint128& rhs) { m_lo = rhs.m_lo; m_hi = rhs.m_hi; return *this; }
+		inline uint128& operator= (const uint64_t val) { m_lo = val; m_hi = 0; return *this; }
+
+		inline uint64_t get_low() const { return m_lo; }
+		inline uint64_t& get_low() { return m_lo; }
+
+		inline uint64_t get_high() const { return m_hi; }
+		inline uint64_t& get_high() { return m_hi; }
+
+		inline bool operator== (const uint128& rhs) const { return (m_lo == rhs.m_lo) && (m_hi == rhs.m_hi); }
+		inline bool operator!= (const uint128& rhs) const { return (m_lo != rhs.m_lo) || (m_hi != rhs.m_hi); }
+
+		inline bool operator< (const uint128& rhs) const
+		{
+			if (m_hi < rhs.m_hi)
+				return true;
+
+			if (m_hi == rhs.m_hi)
+			{
+				if (m_lo < rhs.m_lo)
+					return true;
+			}
+
+			return false;
+		}
+
+		inline bool operator> (const uint128& rhs) const { return (rhs < *this); }
+
+		inline bool operator<= (const uint128& rhs) const { return (*this == rhs) || (*this < rhs); }
+		inline bool operator>= (const uint128& rhs) const { return (*this == rhs) || (*this > rhs); }
+
+		inline bool is_zero() const { return (m_lo == 0) && (m_hi == 0); }
+		inline bool is_all_ones() const { return (m_lo == UINT64_MAX) && (m_hi == UINT64_MAX); }
+		inline bool is_non_zero() const { return (m_lo != 0) || (m_hi != 0); }
+		inline explicit operator bool() const { return is_non_zero(); }
+		inline bool is_signed() const { return ((int64_t)m_hi) < 0; }
+
+		inline bool signed_less(const uint128& rhs) const
+		{
+			const bool l_signed = is_signed(), r_signed = rhs.is_signed();
+
+			if (l_signed == r_signed)
+				return *this < rhs;
+
+			if (l_signed && !r_signed)
+				return true;
+
+			assert(!l_signed && r_signed);
+			return false;
+		}
+
+		inline bool signed_greater(const uint128& rhs) const { return rhs.signed_less(*this); }
+		inline bool signed_less_equal(const uint128& rhs) const { return !rhs.signed_less(*this); }
+		inline bool signed_greater_equal(const uint128& rhs) const { return !signed_less(rhs); }
+
+		double get_double() const
+		{
+			double res = 0;
+
+			if (m_hi)
+				res = (double)m_hi * pow(2.0f, 64.0f);
+
+			res += (double)m_lo;
+
+			return res;
+		}
+
+		double get_signed_double() const
+		{
+			if (is_signed())
+				return -(uint128(*this).abs().get_double());
+			else
+				return get_double();
+		}
+
+		inline uint128 abs() const
+		{
+			uint128 res(*this);
+			if (res.is_signed())
+				res = -res;
+			return res;
+		}
+
+		inline uint128& operator<<= (int shift)
+		{
+			assert(shift >= 0);
+			if (shift < 0)
+				return *this;
+
+			m_hi = (shift >= 64) ? ((shift >= 128) ? 0 : (m_lo << (shift - 64))) : (m_hi << shift);
+
+			if ((shift) && (shift < 64))
+				m_hi |= (m_lo >> (64 - shift));
+
+			m_lo = (shift >= 64) ? 0 : (m_lo << shift);
+
+			return *this;
+		}
+
+		inline uint128 operator<< (int shift) const { uint128 res(*this); res <<= shift; return res; }
+
+		inline uint128& operator>>= (int shift)
+		{
+			assert(shift >= 0);
+			if (shift < 0)
+				return *this;
+
+			m_lo = (shift >= 64) ? ((shift >= 128) ? 0 : (m_hi >> (shift - 64))) : (m_lo >> shift);
+
+			if ((shift) && (shift < 64))
+				m_lo |= (m_hi << (64 - shift));
+
+			m_hi = (shift >= 64) ? 0 : (m_hi >> shift);
+
+			return *this;
+		}
+
+		inline uint128 operator>> (int shift) const { uint128 res(*this); res >>= shift; return res; }
+
+		inline uint128 signed_shift_right(int shift) const
+		{
+			uint128 res(*this);
+			res >>= shift;
+
+			if (is_signed())
+			{
+				uint128 x(0U);
+				x = ~x;
+				x >>= shift;
+				res |= (~x);
+			}
+
+			return res;
+		}
+
+		inline uint128& operator |= (const uint128& rhs) { m_lo |= rhs.m_lo; m_hi |= rhs.m_hi; return *this; }
+		inline uint128 operator | (const uint128& rhs) const { uint128 res(*this); res |= rhs; return res; }
+
+		inline uint128& operator &= (const uint128& rhs) { m_lo &= rhs.m_lo; m_hi &= rhs.m_hi; return *this; }
+		inline uint128 operator & (const uint128& rhs) const { uint128 res(*this); res &= rhs;	return res; }
+
+		inline uint128& operator ^= (const uint128& rhs) { m_lo ^= rhs.m_lo; m_hi ^= rhs.m_hi; return *this; }
+		inline uint128 operator ^ (const uint128& rhs) const { uint128 res(*this); res ^= rhs;	return res; }
+
+		inline uint128 operator ~() const { return uint128(~m_lo, ~m_hi); }
+
+		inline uint128 operator -() const { uint128 res(~*this); if (++res.m_lo == 0) ++res.m_hi; return res; }
+
+		// prefix
+		inline uint128 operator ++()
+		{
+			if (++m_lo == 0)
+				++m_hi;
+			return *this;
+		}
+
+		// postfix
+		inline uint128 operator ++(int)
+		{
+			uint128 res(*this);
+			if (++m_lo == 0)
+				++m_hi;
+			return res;
+		}
+
+		// prefix
+		inline uint128 operator --()
+		{
+			const uint64_t t = m_lo;
+			if (--m_lo > t)
+				--m_hi;
+			return *this;
+		}
+
+		// postfix
+		inline uint128 operator --(int)
+		{
+			const uint64_t t = m_lo;
+			uint128 res(*this);
+			if (--m_lo > t)
+				--m_hi;
+			return res;
+		}
+
+		inline uint128& operator+= (const uint128& rhs)
+		{
+			const uint64_t t = m_lo + rhs.m_lo;
+			m_hi = m_hi + rhs.m_hi + (t < m_lo);
+			m_lo = t;
+			return *this;
+		}
+
+		inline uint128 operator+ (const uint128& rhs) const { uint128 res(*this); res += rhs; return res; }
+
+		inline uint128& operator-= (const uint128& rhs) // 128-bit subtract built from two 64-bit subtracts
+		{
+			const uint64_t lo_diff = m_lo - rhs.m_lo;
+			m_hi -= rhs.m_hi + ((lo_diff > m_lo) ? 1 : 0); // borrow when the low half wrapped
+			m_lo = lo_diff;
+			return *this;
+		}
+
+		inline uint128 operator- (const uint128& rhs) const { return uint128(*this) -= rhs; } // difference computed on a copy
+
+		// Shift-and-add multiply that consumes one multiplier bit per iteration (very slow).
+		uint128& operator*=(const uint128& rhs)
+		{
+			uint128 addend(*this), product(0U), multiplier(rhs);
+			for (; multiplier; multiplier >>= 1, addend <<= 1)
+			{
+				if ((multiplier.get_low() & 1) != 0)
+					product += addend;
+			}
+			*this = product;
+			return *this;
+		}
+
+		uint128 operator*(const uint128& rhs) const { return uint128(*this) *= rhs; } // product computed on a copy
+
+		// Restoring (bit by bit) long division, very slow. Returns dividend / divisor and stores dividend % divisor in
+		// remainder. A zero divisor asserts and returns all ones with remainder 0. remainder must not alias an input.
+		friend uint128 divide(const uint128& dividend, const uint128& divisor, uint128& remainder)
+		{
+			remainder = 0;
+			if (!divisor)
+			{
+				assert(0);
+				return ~uint128(0U);
+			}
+			uint128 quotient(0), one(1);
+			for (int i = 127; i >= 0; i--)
+			{
+				// The shift below drops the partial remainder's top bit. For divisors above 2^127 that bit can be set,
+				// in which case the true 129-bit value exceeds the divisor even if the truncated compare says otherwise.
+				const bool carry = (remainder.m_hi >> 63) != 0;
+				remainder = (remainder << 1) | ((dividend >> i) & one);
+				if (carry || (remainder >= divisor))
+				{
+					remainder -= divisor; // wraps modulo 2^128, which is exactly the reduced remainder
+					quotient |= (one << i);
+				}
+			}
+			return quotient;
+		}
+
+		uint128 operator/(const uint128& rhs) const { uint128 rem_unused; return divide(*this, rhs, rem_unused); } // quotient only; the remainder is discarded
+		uint128 operator/=(const uint128& rhs) { uint128 rem_unused; const uint128 quotient(divide(*this, rhs, rem_unused)); *this = quotient; return *this; } // note: returns a copy, not a reference
+
+		uint128 operator%(const uint128& rhs) const { uint128 rem; (void)divide(*this, rhs, rem); return rem; } // remainder only; the quotient is discarded
+		uint128 operator%=(const uint128& rhs) { uint128 rem; (void)divide(*this, rhs, rem); *this = rem; return *this; } // note: returns a copy, not a reference
+
+		void print_hex(FILE* pFile) const // writes "0x", then the high and the low half as 16 zero-padded hex digits each
+		{
+			fprintf(pFile, "0x%016llx%016llx", static_cast<unsigned long long int>(m_hi), static_cast<unsigned long long int>(m_lo));
+		}
+
+		// Appends the unsigned decimal representation of this value to res (res is not cleared first).
+		void format_unsigned(std::string& res) const
+		{
+			basisu::vector<uint8_t> digits; // collected least significant digit first
+			digits.reserve(39 + 1);
+			const uint128 ten(10);
+			uint128 remaining(*this);
+			do
+			{
+				uint128 digit;
+				remaining = divide(remaining, ten, digit);
+				digits.push_back((uint8_t)digit);
+			} while (remaining);
+			for (int i = (int)digits.size(); i-- > 0; )
+				res += ('0' + digits[i]);
+		}
+
+		// Appends the decimal representation to res; when is_signed() is true a '-' is written and the negated value is formatted.
+		void format_signed(std::string& res) const
+		{
+			if (!is_signed())
+			{
+				format_unsigned(res);
+				return;
+			}
+
+			res.push_back('-');
+			(-*this).format_unsigned(res);
+		}
+
+		void print_unsigned(FILE* pFile) const // const, like print_hex(), so const values can be printed; writes the unsigned decimal form to pFile
+		{
+			std::string str;
+			format_unsigned(str);
+			fprintf(pFile, "%s", str.c_str());
+		}
+
+		void print_signed(FILE* pFile) const // const, like print_hex(), so const values can be printed; writes the format_signed() text to pFile
+		{
+			std::string str;
+			format_signed(str);
+			fprintf(pFile, "%s", str.c_str());
+		}
+
+		// Returns this value with all 128 bits mirrored (bit 0 <-> bit 127). Built from the 64-bit halves with shifts, so it
+		// neither accesses uint64_t storage through uint32_t pointers nor depends on the host byte order.
+		uint128 get_reversed_bits() const
+		{
+			const uint32_t w0 = (uint32_t)m_lo, w1 = (uint32_t)(m_lo >> 32); // 32-bit words, least significant first
+			const uint32_t w2 = (uint32_t)m_hi, w3 = (uint32_t)(m_hi >> 32);
+
+			// Word k lands in slot 3-k with its own bits reversed.
+			const uint64_t r0 = (uint32_t)rev_dword(w3), r1 = (uint32_t)rev_dword(w2);
+			const uint64_t r2 = (uint32_t)rev_dword(w1), r3 = (uint32_t)rev_dword(w0);
+
+			// Constructor order is (low half, high half).
+			return uint128(r0 | (r1 << 32), r2 | (r3 << 32));
+		}
+
+		// Returns a copy whose 16 bytes of object representation are in the opposite order.
+		uint128 get_byteswapped() const
+		{
+			uint128 swapped;
+			const uint8_t* pFrom = (const uint8_t*)this;
+			uint8_t* pTo = (uint8_t*)&swapped;
+
+			// Read the source front to back while filling the destination back to front.
+			for (uint32_t i = 0; i < 16; i++)
+				pTo[15 - i] = pFrom[i];
+			return swapped;
+		}
+
+		// Extracts bit_len (1..64) bits starting at bit bit_ofs and returns them right-aligned.
+		inline uint64_t get_bits64(uint32_t bit_ofs, uint32_t bit_len) const
+		{
+			assert(bit_ofs < 128);
+			assert(bit_len && (bit_len <= 64) && ((bit_ofs + bit_len) <= 128));
+
+			const uint64_t low = (uint128(*this) >>= bit_ofs).get_low();
+			if (bit_len == 64)
+				return low; // the mask would be all ones, and a 64-bit shift of 1ull is not allowed
+			return low & ((1ull << bit_len) - 1);
+		}
+
+		inline uint32_t get_bits(uint32_t bit_ofs, uint32_t bit_len) const // 32-bit convenience wrapper around get_bits64()
+		{
+			assert(bit_len <= 32);
+			return static_cast<uint32_t>(get_bits64(bit_ofs, bit_len));
+		}
+
+		inline uint32_t next_bits(uint32_t& bit_ofs, uint32_t len) const // reads len (1..32) bits at bit_ofs, then advances bit_ofs past them
+		{
+			assert(len && (len <= 32));
+			const uint32_t start = bit_ofs;
+			bit_ofs = start + len;
+			return get_bits(start, len);
+		}
+
+		// Overwrites num_bits (1..64) bits starting at bit_ofs with val; val must fit in num_bits bits (asserted).
+		inline uint128& set_bits(uint64_t val, uint32_t bit_ofs, uint32_t num_bits)
+		{
+			assert(bit_ofs < 128);
+			assert(num_bits && (num_bits <= 64) && ((bit_ofs + num_bits) <= 128));
+
+			uint128 field((uint128(1) << num_bits) - 1); // num_bits ones, right-aligned
+			assert(uint128(val) <= field);
+
+			field <<= bit_ofs; // now a mask over the destination bits
+			*this &= ~field;
+			*this |= (uint128(val) << bit_ofs);
+
+			return *this;
+		}
+	};
+		
+	// Decodes a void-extent (solid color) block into log_blk. Returns false when bits 10-11, the extents or an HDR color are rejected.
+	static bool decode_void_extent(const uint128& bits, log_astc_block& log_blk)
+	{
+		if (bits.get_bits(10, 2) != 0b11)
+			return false;
+
+		// Four 13-bit extent coordinates sit in bits 12..63, in the order min_s, max_s, min_t, max_t.
+		uint32_t extents[4];
+		uint32_t bit_ofs = 12;
+		for (uint32_t i = 0; i < 4; i++)
+			extents[i] = bits.next_bits(bit_ofs, 13);
+		assert(bit_ofs == 64);
+
+		// All-ones extents are accepted as-is; otherwise each min must be strictly below its max.
+		// (Every field is 13 bits wide, so the AND equals 0x1FFF only when all four do.)
+		const bool all_ones = (extents[0] & extents[1] & extents[2] & extents[3]) == 0x1FFF;
+		if (!all_ones && ((extents[0] >= extents[1]) || (extents[2] >= extents[3])))
+			return false;
+
+		// Bit 9 is the HDR flag.
+		if (bits.get_bits(9, 1) != 0)
+			log_blk.m_solid_color_flag_hdr = true;
+		else
+			log_blk.m_solid_color_flag_ldr = true;
+
+		// The upper 64 bits hold four 16-bit color components.
+		for (uint32_t c = 0; c < 4; c++)
+			log_blk.m_solid_color[c] = (uint16_t)bits.get_bits(64 + 16 * c, 16);
+
+		if (!log_blk.m_solid_color_flag_hdr)
+			return true;
+
+		// HDR colors are rejected when is_half_inf_or_nan() flags any component.
+		for (uint32_t c = 0; c < 4; c++)
+			if (is_half_inf_or_nan(log_blk.m_solid_color[c]))
+				return false;
+		return true;
+	}
+
+	struct astc_dec_row // one row of the block-mode table consulted by decode_config()
+	{
+		int8_t Dp_ofs, P_ofs, W_ofs, W_size, H_ofs, H_size, W_bias, H_bias, p0_ofs, p1_ofs, p2_ofs; // *_ofs: bit offsets in the block (negative Dp_ofs/P_ofs = field absent); *_size: field widths in bits (0 = no field); *_bias: base grid width/height
+	};
+
+	static const astc_dec_row s_dec_rows[10] = // indexed by the row chosen in decode_config(); each row's trailing comment is its "W_bias H_bias"
+	{
+		// Dp_ofs, P_ofs, W_ofs, W_size, H_ofs, H_size, W_bias, H_bias, p0_ofs, p1_ofs, p2_ofs;
+		{  10,     9,     7,     2,      5,     2,      4,      2,      4,      0,      1      }, // 4 2
+		{  10,     9,     7,     2,      5,     2,      8,      2,      4,      0,      1      }, // 8 2 
+		{  10,     9,     5,     2,      7,     2,      2,      8,      4,      0,      1      }, // 2 8
+		{  10,     9,     5,     2,      7,     1,      2,      6,      4,      0,      1      }, // 2 6
+
+		{  10,     9,     7,     1,      5,     2,      2,      2,      4,      0,      1      }, // 2 2
+		{  10,     9,     0,     0,      5,     2,      12,     2,      4,      2,      3      }, // 12 2
+		{  10,     9,     5,     2,      0,     0,      2,     12,      4,      2,      3      }, // 2 12
+		{  10,     9,     0,     0,      0,     0,      6,     10,      4,      2,      3      }, // 6 10
+
+		{  10,     9,     0,     0,      0,     0,      10,    6,       4,      2,      3      }, // 10 6
+		{  -1,    -1,     5,     2,      9,     2,      6,     6,       4,      2,      3      }, // 6 6
+	};
+
+	static bool decode_config(const uint128& bits, log_astc_block& log_blk) // Parses the block-mode bits into log_blk (grid size, weight ISE range, dual-plane flag) or hands off to decode_void_extent(); returns false for rejected encodings.
+	{
+		// Reserved: bits 0-3 all zero
+		if (bits.get_bits(0, 4) == 0)
+			return false;
+
+		// Reserved: bits 0-1 zero with bits 6-8 all set, unless bits 2-5 are all set too (that is the void-extent pattern tested next)
+		if ((bits.get_bits(0, 2) == 0) && (bits.get_bits(6, 3) == 0b111))
+		{
+			if (bits.get_bits(2, 4) != 0b1111) 
+				return false;
+		}
+
+		// Void extent
+		if (bits.get_bits(0, 9) == 0b111111100)
+			return decode_void_extent(bits, log_blk);
+												
+		// Check rows: pick the s_dec_rows entry whose fixed bit pattern matches this block mode
+		const uint32_t x0_2 = bits.get_bits(0, 2), x2_2 = bits.get_bits(2, 2);
+		const uint32_t x5_4 = bits.get_bits(5, 4), x8_1 = bits.get_bits(8, 1);
+		const uint32_t x7_2 = bits.get_bits(7, 2);
+
+		int row_index = -1;
+		if (x0_2 == 0) // rows 5-9 are distinguished by bits 5-8
+		{
+			if (x7_2 == 0b00)
+				row_index = 5;
+			else if (x7_2 == 0b01)
+				row_index = 6;
+			else if (x5_4 == 0b1100)
+				row_index = 7;
+			else if (x5_4 == 0b1101)
+				row_index = 8;
+			else if (x7_2 == 0b10)
+				row_index = 9;
+		}
+		else // rows 0-4 are distinguished by bits 2-3, plus bit 8 when bits 2-3 are both set
+		{
+			if (x2_2 == 0b00)
+				row_index = 0;
+			else if (x2_2 == 0b01)
+				row_index = 1;
+			else if (x2_2 == 0b10)
+				row_index = 2;
+			else if ((x2_2 == 0b11) && (x8_1 == 0))
+				row_index = 3;
+			else if ((x2_2 == 0b11) && (x8_1 == 1))
+				row_index = 4;
+		}
+		if (row_index < 0)
+			return false;
+
+		const astc_dec_row& r = s_dec_rows[row_index];
+
+		bool P = false, Dp = false;
+		uint32_t W = r.W_bias, H = r.H_bias;
+
+		if (r.P_ofs >= 0) // a negative offset means the row has no such bit; the flag stays false
+			P = bits.get_bits(r.P_ofs, 1) != 0;
+
+		if (r.Dp_ofs >= 0)
+			Dp = bits.get_bits(r.Dp_ofs, 1) != 0;
+				
+		if (r.W_size) // a zero size means the dimension is fixed at its bias
+			W += bits.get_bits(r.W_ofs, r.W_size);
+
+		if (r.H_size)
+			H += bits.get_bits(r.H_ofs, r.H_size);
+
+		assert((W >= MIN_GRID_DIM) && (W <= MAX_BLOCK_DIM));
+		assert((H >= MIN_GRID_DIM) && (H <= MAX_BLOCK_DIM));
+		
+		int p0 = bits.get_bits(r.p0_ofs, 1);
+		int p1 = bits.get_bits(r.p1_ofs, 1);
+		int p2 = bits.get_bits(r.p2_ofs, 1);
+
+		uint32_t p = p0 | (p1 << 1) | (p2 << 2); // 3-bit weight range selector; values 0 and 1 are rejected
+		if (p < 2)
+			return false;
+		
+		log_blk.m_grid_width = W;
+		log_blk.m_grid_height = H;
+		
+		log_blk.m_weight_ise_range = (p - 2) + (P * BISE_10_LEVELS); // the P bit offsets the selected range by BISE_10_LEVELS
+		assert(log_blk.m_weight_ise_range <= LAST_VALID_WEIGHT_ISE_RANGE);
+
+		log_blk.m_dual_plane = Dp;
+
+		return true;
+	}
+
+	static inline uint32_t read_le_dword(const uint8_t* pBytes) // assembles 4 bytes, least significant first, into a uint32_t
+	{
+		return (uint32_t)pBytes[0] | ((uint32_t)pBytes[1] << 8U) | ((uint32_t)pBytes[2] << 16U) | ((uint32_t)pBytes[3] << 24U); // widen before shifting: the promoted int would overflow on << 24 for bytes >= 0x80
+	}
+
+	// See 18.12. Integer Sequence Encoding - tables computed by executing the decoder functions with all possible 8-bit (trit) and 7-bit (quint) inputs.
+	static const uint8_t s_trit_decode[256][5] =
+	{
+		{0,0,0,0,0},{1,0,0,0,0},{2,0,0,0,0},{0,0,2,0,0},{0,1,0,0,0},{1,1,0,0,0},{2,1,0,0,0},{1,0,2,0,0},
+		{0,2,0,0,0},{1,2,0,0,0},{2,2,0,0,0},{2,0,2,0,0},{0,2,2,0,0},{1,2,2,0,0},{2,2,2,0,0},{2,0,2,0,0},
+		{0,0,1,0,0},{1,0,1,0,0},{2,0,1,0,0},{0,1,2,0,0},{0,1,1,0,0},{1,1,1,0,0},{2,1,1,0,0},{1,1,2,0,0},
+		{0,2,1,0,0},{1,2,1,0,0},{2,2,1,0,0},{2,1,2,0,0},{0,0,0,2,2},{1,0,0,2,2},{2,0,0,2,2},{0,0,2,2,2},
+		{0,0,0,1,0},{1,0,0,1,0},{2,0,0,1,0},{0,0,2,1,0},{0,1,0,1,0},{1,1,0,1,0},{2,1,0,1,0},{1,0,2,1,0},
+		{0,2,0,1,0},{1,2,0,1,0},{2,2,0,1,0},{2,0,2,1,0},{0,2,2,1,0},{1,2,2,1,0},{2,2,2,1,0},{2,0,2,1,0},
+		{0,0,1,1,0},{1,0,1,1,0},{2,0,1,1,0},{0,1,2,1,0},{0,1,1,1,0},{1,1,1,1,0},{2,1,1,1,0},{1,1,2,1,0},
+		{0,2,1,1,0},{1,2,1,1,0},{2,2,1,1,0},{2,1,2,1,0},{0,1,0,2,2},{1,1,0,2,2},{2,1,0,2,2},{1,0,2,2,2},
+		{0,0,0,2,0},{1,0,0,2,0},{2,0,0,2,0},{0,0,2,2,0},{0,1,0,2,0},{1,1,0,2,0},{2,1,0,2,0},{1,0,2,2,0},
+		{0,2,0,2,0},{1,2,0,2,0},{2,2,0,2,0},{2,0,2,2,0},{0,2,2,2,0},{1,2,2,2,0},{2,2,2,2,0},{2,0,2,2,0},
+		{0,0,1,2,0},{1,0,1,2,0},{2,0,1,2,0},{0,1,2,2,0},{0,1,1,2,0},{1,1,1,2,0},{2,1,1,2,0},{1,1,2,2,0},
+		{0,2,1,2,0},{1,2,1,2,0},{2,2,1,2,0},{2,1,2,2,0},{0,2,0,2,2},{1,2,0,2,2},{2,2,0,2,2},{2,0,2,2,2},
+		{0,0,0,0,2},{1,0,0,0,2},{2,0,0,0,2},{0,0,2,0,2},{0,1,0,0,2},{1,1,0,0,2},{2,1,0,0,2},{1,0,2,0,2},
+		{0,2,0,0,2},{1,2,0,0,2},{2,2,0,0,2},{2,0,2,0,2},{0,2,2,0,2},{1,2,2,0,2},{2,2,2,0,2},{2,0,2,0,2},
+		{0,0,1,0,2},{1,0,1,0,2},{2,0,1,0,2},{0,1,2,0,2},{0,1,1,0,2},{1,1,1,0,2},{2,1,1,0,2},{1,1,2,0,2},
+		{0,2,1,0,2},{1,2,1,0,2},{2,2,1,0,2},{2,1,2,0,2},{0,2,2,2,2},{1,2,2,2,2},{2,2,2,2,2},{2,0,2,2,2},
+		{0,0,0,0,1},{1,0,0,0,1},{2,0,0,0,1},{0,0,2,0,1},{0,1,0,0,1},{1,1,0,0,1},{2,1,0,0,1},{1,0,2,0,1},
+		{0,2,0,0,1},{1,2,0,0,1},{2,2,0,0,1},{2,0,2,0,1},{0,2,2,0,1},{1,2,2,0,1},{2,2,2,0,1},{2,0,2,0,1},
+		{0,0,1,0,1},{1,0,1,0,1},{2,0,1,0,1},{0,1,2,0,1},{0,1,1,0,1},{1,1,1,0,1},{2,1,1,0,1},{1,1,2,0,1},
+		{0,2,1,0,1},{1,2,1,0,1},{2,2,1,0,1},{2,1,2,0,1},{0,0,1,2,2},{1,0,1,2,2},{2,0,1,2,2},{0,1,2,2,2},
+		{0,0,0,1,1},{1,0,0,1,1},{2,0,0,1,1},{0,0,2,1,1},{0,1,0,1,1},{1,1,0,1,1},{2,1,0,1,1},{1,0,2,1,1},
+		{0,2,0,1,1},{1,2,0,1,1},{2,2,0,1,1},{2,0,2,1,1},{0,2,2,1,1},{1,2,2,1,1},{2,2,2,1,1},{2,0,2,1,1},
+		{0,0,1,1,1},{1,0,1,1,1},{2,0,1,1,1},{0,1,2,1,1},{0,1,1,1,1},{1,1,1,1,1},{2,1,1,1,1},{1,1,2,1,1},
+		{0,2,1,1,1},{1,2,1,1,1},{2,2,1,1,1},{2,1,2,1,1},{0,1,1,2,2},{1,1,1,2,2},{2,1,1,2,2},{1,1,2,2,2},
+		{0,0,0,2,1},{1,0,0,2,1},{2,0,0,2,1},{0,0,2,2,1},{0,1,0,2,1},{1,1,0,2,1},{2,1,0,2,1},{1,0,2,2,1},
+		{0,2,0,2,1},{1,2,0,2,1},{2,2,0,2,1},{2,0,2,2,1},{0,2,2,2,1},{1,2,2,2,1},{2,2,2,2,1},{2,0,2,2,1},
+		{0,0,1,2,1},{1,0,1,2,1},{2,0,1,2,1},{0,1,2,2,1},{0,1,1,2,1},{1,1,1,2,1},{2,1,1,2,1},{1,1,2,2,1},
+		{0,2,1,2,1},{1,2,1,2,1},{2,2,1,2,1},{2,1,2,2,1},{0,2,1,2,2},{1,2,1,2,2},{2,2,1,2,2},{2,1,2,2,2},
+		{0,0,0,1,2},{1,0,0,1,2},{2,0,0,1,2},{0,0,2,1,2},{0,1,0,1,2},{1,1,0,1,2},{2,1,0,1,2},{1,0,2,1,2},
+		{0,2,0,1,2},{1,2,0,1,2},{2,2,0,1,2},{2,0,2,1,2},{0,2,2,1,2},{1,2,2,1,2},{2,2,2,1,2},{2,0,2,1,2},
+		{0,0,1,1,2},{1,0,1,1,2},{2,0,1,1,2},{0,1,2,1,2},{0,1,1,1,2},{1,1,1,1,2},{2,1,1,1,2},{1,1,2,1,2},
+		{0,2,1,1,2},{1,2,1,1,2},{2,2,1,1,2},{2,1,2,1,2},{0,2,2,2,2},{1,2,2,2,2},{2,2,2,2,2},{2,1,2,2,2}
+	};
+
+	static const uint8_t s_quint_decode[128][3] =
+	{
+		{0,0,0},{1,0,0},{2,0,0},{3,0,0},{4,0,0},{0,4,0},{4,4,0},{4,4,4},
+		{0,1,0},{1,1,0},{2,1,0},{3,1,0},{4,1,0},{1,4,0},{4,4,1},{4,4,4},
+		{0,2,0},{1,2,0},{2,2,0},{3,2,0},{4,2,0},{2,4,0},{4,4,2},{4,4,4},
+		{0,3,0},{1,3,0},{2,3,0},{3,3,0},{4,3,0},{3,4,0},{4,4,3},{4,4,4},
+		{0,0,1},{1,0,1},{2,0,1},{3,0,1},{4,0,1},{0,4,1},{4,0,4},{0,4,4},
+		{0,1,1},{1,1,1},{2,1,1},{3,1,1},{4,1,1},{1,4,1},{4,1,4},{1,4,4},
+		{0,2,1},{1,2,1},{2,2,1},{3,2,1},{4,2,1},{2,4,1},{4,2,4},{2,4,4},
+		{0,3,1},{1,3,1},{2,3,1},{3,3,1},{4,3,1},{3,4,1},{4,3,4},{3,4,4},
+		{0,0,2},{1,0,2},{2,0,2},{3,0,2},{4,0,2},{0,4,2},{2,0,4},{3,0,4},
+		{0,1,2},{1,1,2},{2,1,2},{3,1,2},{4,1,2},{1,4,2},{2,1,4},{3,1,4},
+		{0,2,2},{1,2,2},{2,2,2},{3,2,2},{4,2,2},{2,4,2},{2,2,4},{3,2,4},
+		{0,3,2},{1,3,2},{2,3,2},{3,3,2},{4,3,2},{3,4,2},{2,3,4},{3,3,4},
+		{0,0,3},{1,0,3},{2,0,3},{3,0,3},{4,0,3},{0,4,3},{0,0,4},{1,0,4},
+		{0,1,3},{1,1,3},{2,1,3},{3,1,3},{4,1,3},{1,4,3},{0,1,4},{1,1,4},
+		{0,2,3},{1,2,3},{2,2,3},{3,2,3},{4,2,3},{2,4,3},{0,2,4},{1,2,4},
+		{0,3,3},{1,3,3},{2,3,3},{3,3,3},{4,3,3},{3,4,3},{0,3,4},{1,3,4}
+	};
+
+	// Decodes one trit block: up to 5 values, each made of bits_per_val plain bits plus one trit. The trits are packed into
+	// a code of up to 8 bits whose slices are interleaved with the per-value bits. bit_ofs is advanced past everything read.
+	static void decode_trit_block(uint8_t* pVals, uint32_t num_vals, const uint128& bits, uint32_t& bit_ofs, uint32_t bits_per_val)
+	{
+		assert((num_vals >= 1) && (num_vals <= 5));
+		static const uint8_t s_code_chunk_bits[5] = { 2, 2, 1, 2, 1 }; // width of the code slice stored after each value
+
+		uint32_t low_bits[5] = { 0 };
+		uint32_t packed_code = 0, code_shift = 0;
+		for (uint32_t v = 0; v < num_vals; v++)
+		{
+			const uint32_t chunk_bits = s_code_chunk_bits[v];
+			low_bits[v] = bits_per_val ? bits.next_bits(bit_ofs, bits_per_val) : 0;
+			packed_code |= (bits.next_bits(bit_ofs, chunk_bits) << code_shift);
+			code_shift += chunk_bits;
+		}
+
+		for (uint32_t v = 0; v < num_vals; v++)
+			pVals[v] = (uint8_t)((s_trit_decode[packed_code][v] << bits_per_val) | low_bits[v]);
+	}
+
+	// Decodes one quint block: up to 3 values, each made of bits_per_val plain bits plus one quint. The quints are packed into
+	// a code of up to 7 bits whose slices are interleaved with the per-value bits. bit_ofs is advanced past everything read.
+	static void decode_quint_block(uint8_t* pVals, uint32_t num_vals, const uint128& bits, uint32_t& bit_ofs, uint32_t bits_per_val)
+	{
+		assert((num_vals >= 1) && (num_vals <= 3));
+		static const uint8_t s_code_chunk_bits[3] = { 3, 2, 2 }; // width of the code slice stored after each value
+
+		uint32_t low_bits[3] = { 0 };
+		uint32_t packed_code = 0, code_shift = 0;
+		for (uint32_t v = 0; v < num_vals; v++)
+		{
+			const uint32_t chunk_bits = s_code_chunk_bits[v];
+			low_bits[v] = bits_per_val ? bits.next_bits(bit_ofs, bits_per_val) : 0;
+			packed_code |= (bits.next_bits(bit_ofs, chunk_bits) << code_shift);
+			code_shift += chunk_bits;
+		}
+
+		for (uint32_t v = 0; v < num_vals; v++)
+			pVals[v] = (uint8_t)((s_quint_decode[packed_code][v] << bits_per_val) | low_bits[v]);
+	}
+
+	// Decodes num_vals ISE-coded values of range ise_range from bits, starting at bit bit_ofs, into pVals.
+	// bit_ofs is taken by value, so the caller's position is not advanced.
+	static void decode_bise(uint32_t ise_range, uint8_t* pVals, uint32_t num_vals, const uint128& bits, uint32_t bit_ofs)
+	{
+		assert(num_vals && (ise_range < TOTAL_ISE_RANGES));
+
+		// Table columns as used here: [0] = plain bits per value, [1] = nonzero when trits are used, [2] = nonzero when quints are used.
+		const uint32_t bits_per_val = g_ise_range_table[ise_range][0];
+		const bool uses_trits = g_ise_range_table[ise_range][1] != 0;
+		const bool uses_quints = g_ise_range_table[ise_range][2] != 0;
+
+		if (uses_trits || uses_quints)
+		{
+			// Trits pack 5 values per block (8 extra bits), quints 3 (7 extra bits); the final block may be partial.
+			// The trit flag is tested first, so it wins if both were ever set.
+			const uint32_t vals_per_block = uses_trits ? 5 : 3;
+
+			for (uint32_t first = 0; first < num_vals; first += vals_per_block)
+			{
+				const uint32_t count = std::min<int>(num_vals - first, vals_per_block);
+				if (uses_trits)
+					decode_trit_block(pVals + first, count, bits, bit_ofs, bits_per_val);
+				else
+					decode_quint_block(pVals + first, count, bits, bit_ofs, bits_per_val);
+			}
+
+			return;
+		}
+
+		assert(bits_per_val);
+
+		// Only bits
+		for (uint32_t i = 0; i < num_vals; i++)
+			pVals[i] = (uint8_t)bits.next_bits(bit_ofs, bits_per_val);
+	}
+
+	void decode_bise(uint32_t ise_range, uint8_t* pVals, uint32_t num_vals, const uint8_t* pBits128, uint32_t bit_ofs)
+	{
+		// Assemble the 16 little-endian bytes at pBits128 into 64-bit halves, then defer to the uint128 overload.
+		const uint64_t lo = (((uint64_t)read_le_dword(pBits128 + sizeof(uint32_t))) << 32) | (uint64_t)read_le_dword(pBits128);
+		const uint64_t hi = (((uint64_t)read_le_dword(pBits128 + sizeof(uint32_t) * 3)) << 32) | (uint64_t)read_le_dword(pBits128 + sizeof(uint32_t) * 2);
+
+		decode_bise(ise_range, pVals, num_vals, uint128(lo, hi), bit_ofs);
+	}
+		
+	// Decodes a physical ASTC block (16 bytes at pASTC_block) to a logical ASTC block. Returns false, leaving log_blk.m_error_flag set, when the block is rejected.
+	// blk_width/blk_height are only used to validate the weight grid's dimensions.
+	bool unpack_block(const void* pASTC_block, log_astc_block& log_blk, uint32_t blk_width, uint32_t blk_height)
+	{
+		assert(is_valid_block_size(blk_width, blk_height));
+				
+		const uint8_t* pS = (uint8_t*)pASTC_block;
+
+		log_blk.clear();
+		log_blk.m_error_flag = true; // stays set on every early return; cleared only on success
+		
+		const uint128 bits(
+			(uint64_t)read_le_dword(pS) | (((uint64_t)read_le_dword(pS + sizeof(uint32_t))) << 32),
+			(uint64_t)read_le_dword(pS + sizeof(uint32_t) * 2) | (((uint64_t)read_le_dword(pS + sizeof(uint32_t) * 3)) << 32));
+		// Bit-reversed copy: the weights are stored backwards from the end of the block, so they are decoded from bit 0 of this copy.
+		const uint128 rev_bits(bits.get_reversed_bits());
+				
+		if (!decode_config(bits, log_blk))
+			return false;
+
+		if (log_blk.m_solid_color_flag_hdr || log_blk.m_solid_color_flag_ldr)
+		{
+			// Void extent
+			log_blk.m_error_flag = false;
+			return true;
+		}
+
+		// Check grid dimensions
+		if ((log_blk.m_grid_width > blk_width) || (log_blk.m_grid_height > blk_height))
+			return false;
+		
+		// Now we have the grid width/height, dual plane, weight ISE range
+		
+		const uint32_t total_grid_weights = (log_blk.m_dual_plane ? 2 : 1) * (log_blk.m_grid_width * log_blk.m_grid_height);
+		const uint32_t total_weight_bits = get_ise_sequence_bits(total_grid_weights, log_blk.m_weight_ise_range);
+				
+		// 18.24 Illegal Encodings
+		if ((!total_grid_weights) || (total_grid_weights > MAX_GRID_WEIGHTS) || (total_weight_bits < 24) || (total_weight_bits > 96))
+			return false;
+		
+		const uint32_t end_of_weight_bit_ofs = 128 - total_weight_bits; // the weights occupy the top total_weight_bits bits of the block
+
+		uint32_t total_extra_bits = 0;
+
+		// Right before the weight bits, there may be extra CEM bits, then the 2 CCS bits if dual plane.
+
+		log_blk.m_num_partitions = bits.get_bits(11, 2) + 1;
+		if (log_blk.m_num_partitions == 1)
+			log_blk.m_color_endpoint_modes[0] = bits.get_bits(13, 4); // read CEM bits
+		else
+		{
+			// 2 or more partitions
+			if (log_blk.m_dual_plane && (log_blk.m_num_partitions == 4))
+				return false;
+
+			log_blk.m_partition_id = bits.get_bits(13, 10);
+
+			uint32_t cem_bits = bits.get_bits(23, 6);
+
+			if ((cem_bits & 3) == 0)
+			{
+				// All CEM's the same
+				for (uint32_t i = 0; i < log_blk.m_num_partitions; i++)
+					log_blk.m_color_endpoint_modes[i] = cem_bits >> 2;
+			}
+			else
+			{
+				// CEM's different, but within up to 2 adjacent classes
+				const uint32_t first_cem_index = ((cem_bits & 3) - 1) * 4;
+
+				total_extra_bits = 3 * log_blk.m_num_partitions - 4;
+
+				if ((total_weight_bits + total_extra_bits) > 128)
+					return false;
+
+				uint32_t cem_bit_pos = end_of_weight_bit_ofs - total_extra_bits;
+				
+				uint32_t c[4] = { 0 }, m[4] = { 0 }; // c[i]: per-partition class bit, m[i]: 2-bit mode within that class
+				
+				cem_bits >>= 2;
+				for (uint32_t i = 0; i < log_blk.m_num_partitions; i++, cem_bits >>= 1)
+					c[i] = cem_bits & 1;
+
+				switch (log_blk.m_num_partitions) // whatever is left in cem_bits starts the m[] fields; the rest comes from the extra bits below the weights
+				{
+				case 2:
+				{
+					m[0] = cem_bits & 3;
+					m[1] = bits.next_bits(cem_bit_pos, 2);
+					break;
+				}
+				case 3:
+				{
+					m[0] = cem_bits & 1;
+					m[0] |= (bits.next_bits(cem_bit_pos, 1) << 1);
+					m[1] = bits.next_bits(cem_bit_pos, 2);
+					m[2] = bits.next_bits(cem_bit_pos, 2);
+					break;
+				}
+				case 4:
+				{
+					for (uint32_t i = 0; i < 4; i++)
+						m[i] = bits.next_bits(cem_bit_pos, 2);
+					break;
+				}
+				default:
+				{
+					assert(0);
+					break;
+				}
+				}
+
+				assert(cem_bit_pos == end_of_weight_bit_ofs);
+
+				for (uint32_t i = 0; i < log_blk.m_num_partitions; i++)
+				{
+					log_blk.m_color_endpoint_modes[i] = first_cem_index + (c[i] * 4) + m[i];
+					assert(log_blk.m_color_endpoint_modes[i] <= 15);
+				}
+			}
+		}
+
+		// Now we have all the CEM indices.
+
+		if (log_blk.m_dual_plane)
+		{
+			// Read CCS bits, beneath any CEM bits
+			total_extra_bits += 2;
+
+			if (total_extra_bits > end_of_weight_bit_ofs)
+				return false;
+
+			uint32_t ccs_bit_pos = end_of_weight_bit_ofs - total_extra_bits;
+			log_blk.m_color_component_selector = bits.get_bits(ccs_bit_pos, 2);
+		}
+
+		uint32_t config_bit_pos = 11 + 2; // config+num_parts
+		if (log_blk.m_num_partitions == 1)
+			config_bit_pos += 4; // CEM bits
+		else
+			config_bit_pos += 10 + 6; // part_id+CEM bits
+
+		// config+num_parts+total_extra_bits (CEM extra+CCS)
+		uint32_t total_config_bits = config_bit_pos + total_extra_bits;
+		
+		// Compute number of remaining bits in block
+		const int num_remaining_bits = 128 - (int)total_config_bits - (int)total_weight_bits;
+		if (num_remaining_bits < 0)
+			return false;
+
+		// Compute total number of ISE encoded color endpoint mode values
+		uint32_t total_cem_vals = 0;
+		for (uint32_t j = 0; j < log_blk.m_num_partitions; j++)
+			total_cem_vals += get_num_cem_values(log_blk.m_color_endpoint_modes[j]);
+
+		if (total_cem_vals > MAX_ENDPOINTS)
+			return false;
+
+		// Infer endpoint ISE range based off the # of values we need to encode, and the # of remaining bits in the block
+		int endpoint_ise_range = -1;
+		for (int k = 20; k > 0; k--) // searched from the highest range index down; the first one that fits is taken
+		{
+			int b = get_ise_sequence_bits(total_cem_vals, k);
+			if (b <= num_remaining_bits)
+			{
+				endpoint_ise_range = k;
+				break;
+			}
+		}
+
+		// See 23.24 Illegal Encodings, [0,5] is the minimum ISE encoding for endpoints
+		if (endpoint_ise_range < (int)FIRST_VALID_ENDPOINT_ISE_RANGE)
+			return false;
+
+		log_blk.m_endpoint_ise_range = endpoint_ise_range;
+
+		// Decode endpoints forwards in block
+		decode_bise(log_blk.m_endpoint_ise_range, log_blk.m_endpoints, total_cem_vals, bits, config_bit_pos);
+
+		// Decode grid weights backwards in block
+		decode_bise(log_blk.m_weight_ise_range, log_blk.m_weights, total_grid_weights, rev_bits, 0);
+
+		log_blk.m_error_flag = false;
+
+		return true;
+	}
+		
+} // namespace astc_helpers
+
+#endif //BASISU_ASTC_HELPERS_IMPLEMENTATION
diff --git a/thirdparty/basis_universal/transcoder/basisu_containers.h b/thirdparty/basis_universal/transcoder/basisu_containers.h
index d3e14369ba07..bfc51bb499cc 100644
--- a/thirdparty/basis_universal/transcoder/basisu_containers.h
+++ b/thirdparty/basis_universal/transcoder/basisu_containers.h
@@ -188,8 +188,9 @@ namespace basisu
 
 #define BASISU_IS_SCALAR_TYPE(T) (scalar_type<T>::cFlag)
 
-#if defined(__GNUC__) && __GNUC__<5
-   #define BASISU_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__)
+#if !defined(BASISU_HAVE_STD_TRIVIALLY_COPYABLE) && defined(__GNUC__) && __GNUC__<5
+   //#define BASISU_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__)
+    #define BASISU_IS_TRIVIALLY_COPYABLE(...) __is_trivially_copyable(__VA_ARGS__)
 #else
    #define BASISU_IS_TRIVIALLY_COPYABLE(...) std::is_trivially_copyable<__VA_ARGS__>::value
 #endif
@@ -286,8 +287,19 @@ namespace basisu
 
          if (BASISU_IS_BITWISE_COPYABLE(T))
          {
+#ifndef __EMSCRIPTEN__
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wclass-memaccess"            
+#endif                  
+#endif
              if ((m_p) && (other.m_p))
                 memcpy(m_p, other.m_p, m_size * sizeof(T));
+#ifndef __EMSCRIPTEN__
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif                
+#endif
          }
          else
          {
@@ -330,8 +342,19 @@ namespace basisu
 
          if (BASISU_IS_BITWISE_COPYABLE(T))
          {
+#ifndef __EMSCRIPTEN__
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wclass-memaccess"            
+#endif         
+#endif
              if ((m_p) && (other.m_p))
                 memcpy(m_p, other.m_p, other.m_size * sizeof(T));
+#ifndef __EMSCRIPTEN__          
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif                            
+#endif
          }
          else
          {
@@ -501,7 +524,7 @@ namespace basisu
 
          if (new_capacity > m_capacity)
          {
-            if (!increase_capacity(new_capacity, false))
+            if (!increase_capacity(new_capacity, false, true))
                return false;
          }
          else if (new_capacity < m_capacity)
@@ -509,7 +532,8 @@ namespace basisu
             // Must work around the lack of a "decrease_capacity()" method.
             // This case is rare enough in practice that it's probably not worth implementing an optimized in-place resize.
             vector tmp;
-            tmp.increase_capacity(helpers::maximum(m_size, new_capacity), false);
+            if (!tmp.increase_capacity(helpers::maximum(m_size, new_capacity), false, true))
+                return false;
             tmp = *this;
             swap(tmp);
          }
@@ -750,7 +774,21 @@ namespace basisu
             }
 
             // Copy "down" the objects to preserve, filling in the empty slots.
+
+#ifndef __EMSCRIPTEN__
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wclass-memaccess"            
+#endif
+#endif
+
             memmove(pDst, pSrc, num_to_move * sizeof(T));
+
+#ifndef __EMSCRIPTEN__
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif            
+#endif
          }
          else
          {
@@ -1003,7 +1041,21 @@ namespace basisu
       inline void set_all(const T& o)
       {
          if ((sizeof(T) == 1) && (scalar_type<T>::cFlag))
+         {
+#ifndef __EMSCRIPTEN__
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wclass-memaccess"            
+#endif              
+#endif
             memset(m_p, *reinterpret_cast<const uint8_t*>(&o), m_size);
+
+#ifndef __EMSCRIPTEN__            
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif                        
+#endif
+         }
          else
          {
             T* pDst = m_p;
@@ -1029,7 +1081,7 @@ namespace basisu
       // Important: This method is used in Basis Universal. If you change how this container allocates memory, you'll need to change any users of this method.
       inline bool grant_ownership(T* p, uint32_t size, uint32_t capacity)
       {
-         // To to prevent the caller from obviously shooting themselves in the foot.
+         // To prevent the caller from obviously shooting themselves in the foot.
          if (((p + capacity) > m_p) && (p < (m_p + m_capacity)))
          {
             // Can grant ownership of a block inside the container itself!
diff --git a/thirdparty/basis_universal/transcoder/basisu_containers_impl.h b/thirdparty/basis_universal/transcoder/basisu_containers_impl.h
index d5cb61569b03..60c0b3d89f91 100644
--- a/thirdparty/basis_universal/transcoder/basisu_containers_impl.h
+++ b/thirdparty/basis_universal/transcoder/basisu_containers_impl.h
@@ -19,23 +19,30 @@ namespace basisu
       if (m_capacity >= min_new_capacity)
          return true;
 
-      size_t new_capacity = min_new_capacity;
-      if ((grow_hint) && (!helpers::is_power_of_2((uint64_t)new_capacity)))
-      {
-         new_capacity = (size_t)helpers::next_pow2((uint64_t)new_capacity);
-
-         assert(new_capacity && (new_capacity > m_capacity));
+      uint64_t new_capacity_u64 = min_new_capacity;
+      if ((grow_hint) && (!helpers::is_power_of_2(new_capacity_u64)))
+          new_capacity_u64 = helpers::next_pow2(new_capacity_u64);
 
-         if (new_capacity < min_new_capacity)
-         {
-            if (nofail)
-               return false;
-            fprintf(stderr, "vector too large\n");
-            abort();
-         }
+      size_t new_capacity = (size_t)new_capacity_u64;
+      if (new_capacity != new_capacity_u64)
+      {
+          if (nofail)
+              return false;
+          fprintf(stderr, "elemental_vector::increase_capacity: vector too large\n");
+          abort();
       }
             
-      const size_t desired_size = element_size * new_capacity;
+      const uint64_t desired_size_u64 = (uint64_t)element_size * new_capacity;
+            
+      const size_t desired_size = (size_t)desired_size_u64;
+      if (desired_size_u64 != desired_size)
+      {
+          if (nofail)
+              return false;
+          fprintf(stderr, "elemental_vector::increase_capacity: vector too large\n");
+          abort();
+      }
+
       size_t actual_size = 0;
       if (!pMover)
       {
@@ -46,11 +53,7 @@ namespace basisu
                return false;
 
             char buf[256];
-#ifdef _MSC_VER
-            sprintf_s(buf, sizeof(buf), "vector: realloc() failed allocating %u bytes", (uint32_t)desired_size);
-#else
-            sprintf(buf, "vector: realloc() failed allocating %u bytes", (uint32_t)desired_size);
-#endif
+            snprintf(buf, sizeof(buf), "elemental_vector::increase_capacity: realloc() failed allocating %zu bytes", desired_size);
             fprintf(stderr, "%s", buf);
             abort();
          }
@@ -75,11 +78,7 @@ namespace basisu
                return false;
 
             char buf[256];
-#ifdef _MSC_VER
-            sprintf_s(buf, sizeof(buf), "vector: malloc() failed allocating %u bytes", (uint32_t)desired_size);
-#else
-            sprintf(buf, "vector: malloc() failed allocating %u bytes", (uint32_t)desired_size);
-#endif
+            snprintf(buf, sizeof(buf), "elemental_vector::increase_capacity: malloc() failed allocating %zu bytes", desired_size);
             fprintf(stderr, "%s", buf);
             abort();
          }
diff --git a/thirdparty/basis_universal/transcoder/basisu_file_headers.h b/thirdparty/basis_universal/transcoder/basisu_file_headers.h
index 4316d738e6b6..d29e3feb0340 100644
--- a/thirdparty/basis_universal/transcoder/basisu_file_headers.h
+++ b/thirdparty/basis_universal/transcoder/basisu_file_headers.h
@@ -1,5 +1,5 @@
 // basis_file_headers.h
-// Copyright (C) 2019-2020 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -89,7 +89,8 @@ namespace basist
 	enum class basis_tex_format
 	{
 		cETC1S = 0,
-		cUASTC4x4 = 1
+		cUASTC4x4 = 1,
+		cUASTC_HDR_4x4 = 2
 	};
 
 	struct basis_file_header
diff --git a/thirdparty/basis_universal/transcoder/basisu_transcoder.cpp b/thirdparty/basis_universal/transcoder/basisu_transcoder.cpp
index c698861f3b9f..32018cd282d9 100644
--- a/thirdparty/basis_universal/transcoder/basisu_transcoder.cpp
+++ b/thirdparty/basis_universal/transcoder/basisu_transcoder.cpp
@@ -1,5 +1,5 @@
 // basisu_transcoder.cpp
-// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -17,6 +17,11 @@
 #include <limits.h>
 #include "basisu_containers_impl.h"
 
+#define BASISU_ASTC_HELPERS_IMPLEMENTATION
+#include "basisu_astc_helpers.h"
+
+#include "basisu_astc_hdr_core.h"
+
 #ifndef BASISD_IS_BIG_ENDIAN
 // TODO: This doesn't work on OSX. How can this be so difficult?
 //#if defined(__BIG_ENDIAN__) || defined(_BIG_ENDIAN) || defined(BIG_ENDIAN)
@@ -139,6 +144,10 @@
 	#endif
 #endif
 
+#ifndef BASISD_SUPPORT_UASTC_HDR
+	#define BASISD_SUPPORT_UASTC_HDR 1
+#endif
+
 #define BASISD_WRITE_NEW_BC7_MODE5_TABLES			0
 #define BASISD_WRITE_NEW_DXT1_TABLES				0
 #define BASISD_WRITE_NEW_ETC2_EAC_A8_TABLES		0
@@ -1908,17 +1917,24 @@ namespace basist
 	void basisu_transcoder_init()
 	{
 		if (g_transcoder_initialized)
-      {
-         BASISU_DEVEL_ERROR("basisu_transcoder::basisu_transcoder_init: Called more than once\n");      
+		{
+			BASISU_DEVEL_ERROR("basisu_transcoder::basisu_transcoder_init: Called more than once\n");      
 			return;
-      }
+		}
          
-     BASISU_DEVEL_ERROR("basisu_transcoder::basisu_transcoder_init: Initializing (this is not an error)\n");      
+		BASISU_DEVEL_ERROR("basisu_transcoder::basisu_transcoder_init: Initializing (this is not an error)\n");      
 
 #if BASISD_SUPPORT_UASTC
 		uastc_init();
 #endif
 
+#if BASISD_SUPPORT_UASTC_HDR
+		// TODO: Examine this, optimize for startup time/mem utilization.
+		astc_helpers::init_tables(false);
+
+		astc_hdr_core_init();
+#endif
+
 #if BASISD_SUPPORT_ASTC
 		transcoder_init_astc();
 #endif
@@ -2027,6 +2043,10 @@ namespace basist
 		transcoder_init_pvrtc2();
 #endif
 
+#if BASISD_SUPPORT_UASTC_HDR
+		bc6h_enc_init();
+#endif
+
 		g_transcoder_initialized = true;
 	}
 
@@ -6928,7 +6948,7 @@ namespace basist
 
 	static inline int sq(int x) { return x * x; }
 						
-	// PVRTC2 is a slightly borked format for alpha: In Non-Interpolated mode, the way AlphaB8 is exanded from 4 to 8 bits means it can never be 0. 
+	// PVRTC2 is a slightly borked format for alpha: In Non-Interpolated mode, the way AlphaB8 is expanded from 4 to 8 bits means it can never be 0. 
 	// This is actually very bad, because on 100% transparent blocks which have non-trivial color pixels, part of the color channel will leak into alpha! 
 	// And there's nothing straightforward we can do because using the other modes is too expensive/complex. I can see why Apple didn't adopt it.
 	static void convert_etc1s_to_pvrtc2_rgba(void* pDst, const endpoint* pEndpoints, const selector* pSelector, const endpoint* pEndpoint_codebook, const selector* pSelector_codebook)
@@ -7515,6 +7535,8 @@ namespace basist
 	}
 #endif // BASISD_SUPPORT_PVRTC2
 
+	//------------------------------------------------------------------------------------------------
+
 	basisu_lowlevel_etc1s_transcoder::basisu_lowlevel_etc1s_transcoder() :
 		m_pGlobal_codebook(nullptr),
 		m_selector_history_buf_size(0)
@@ -8620,7 +8642,7 @@ namespace basist
 			// Now make sure the output buffer is large enough, or we'll overwrite memory.
 			if (output_blocks_buf_size_in_blocks_or_pixels < (output_rows_in_pixels * output_row_pitch_in_blocks_or_pixels))
 			{
-				BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: output_blocks_buf_size_in_blocks_or_pixels < (output_rows_in_pixels * output_row_pitch_in_blocks_or_pixels)\n");
+				BASISU_DEVEL_ERROR("basis_validate_output_buffer_size: output_blocks_buf_size_in_blocks_or_pixels < (output_rows_in_pixels * output_row_pitch_in_blocks_or_pixels)\n");
 				return false;
 			}
 		}
@@ -8632,7 +8654,7 @@ namespace basist
 
 			if (output_blocks_buf_size_in_blocks_or_pixels < total_blocks_fxt1)
 			{
-				BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: output_blocks_buf_size_in_blocks_or_pixels < total_blocks_fxt1\n");
+				BASISU_DEVEL_ERROR("basis_validate_output_buffer_size: output_blocks_buf_size_in_blocks_or_pixels < total_blocks_fxt1\n");
 				return false;
 			}
 		}
@@ -8640,7 +8662,7 @@ namespace basist
 		{
 			if (output_blocks_buf_size_in_blocks_or_pixels < total_slice_blocks)
 			{
-				BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: output_blocks_buf_size_in_blocks_or_pixels < transcode_image\n");
+				BASISU_DEVEL_ERROR("basis_validate_output_buffer_size: output_blocks_buf_size_in_blocks_or_pixels < transcode_image\n");
 				return false;
 			}
 		}
@@ -9242,13 +9264,17 @@ namespace basist
 
 		return status;
 	}
+
+	//------------------------------------------------------------------------------------------------
 	
 	basisu_lowlevel_uastc_transcoder::basisu_lowlevel_uastc_transcoder()
 	{
 	}
 
-	bool basisu_lowlevel_uastc_transcoder::transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt,
-        uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, bool has_alpha, const uint32_t orig_width, const uint32_t orig_height, uint32_t output_row_pitch_in_blocks_or_pixels,
+	bool basisu_lowlevel_uastc_transcoder::transcode_slice(
+		void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt,
+        uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, bool has_alpha, 
+		const uint32_t orig_width, const uint32_t orig_height, uint32_t output_row_pitch_in_blocks_or_pixels,
 		basisu_transcoder_state* pState, uint32_t output_rows_in_pixels, int channel0, int channel1, uint32_t decode_flags)
 	{
 		BASISU_NOTE_UNUSED(pState);
@@ -9784,6 +9810,317 @@ namespace basist
 
 		return status;
 	}
+
+	//------------------------------------------------------------------------------------------------
+
+	basisu_lowlevel_uastc_hdr_transcoder::basisu_lowlevel_uastc_hdr_transcoder()
+	{ // Intentionally empty: the constructor body performs no work.
+	}
+
+	// Transcodes one slice of UASTC HDR 4x4 blocks into fmt: ASTC HDR (plain block copy), BC6H, or unpacked RGB_9E5 / RGBA_HALF / RGB_HALF pixels.
+	// A zero row pitch / row count falls back to num_blocks_x (orig_width / orig_height for pixel formats). Returns false on truncated input, an unhandled fmt, or a block that fails to decode.
+	bool basisu_lowlevel_uastc_hdr_transcoder::transcode_slice(
+		void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt,
+		uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, bool has_alpha, 
+		const uint32_t orig_width, const uint32_t orig_height, uint32_t output_row_pitch_in_blocks_or_pixels,
+		basisu_transcoder_state* pState, uint32_t output_rows_in_pixels, int channel0, int channel1, uint32_t decode_flags)
+	{
+		BASISU_NOTE_UNUSED(pState);
+		BASISU_NOTE_UNUSED(bc1_allow_threecolor_blocks);
+		BASISU_NOTE_UNUSED(has_alpha);
+		BASISU_NOTE_UNUSED(channel0);
+		BASISU_NOTE_UNUSED(channel1);
+		BASISU_NOTE_UNUSED(decode_flags);
+
+		assert(g_transcoder_initialized);
+		if (!g_transcoder_initialized)
+		{
+			BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_slice: Transcoder not globally initialized.\n");
+			return false;
+		}
+
+#if BASISD_SUPPORT_UASTC_HDR
+		const uint64_t total_blocks = (uint64_t)num_blocks_x * num_blocks_y;
+
+		if (!output_row_pitch_in_blocks_or_pixels)
+		{
+			if (basis_block_format_is_uncompressed(fmt))
+				output_row_pitch_in_blocks_or_pixels = orig_width;
+			else
+				output_row_pitch_in_blocks_or_pixels = num_blocks_x;
+		}
+
+		if (basis_block_format_is_uncompressed(fmt))
+		{
+			if (!output_rows_in_pixels)
+				output_rows_in_pixels = orig_height;
+		}
+		// Reject truncated input up front. The byte count is computed in 64 bits so num_blocks_x * num_blocks_y can't wrap and defeat the check.
+		const uint64_t total_expected_block_bytes = sizeof(astc_blk) * total_blocks;
+		if (image_data_size < total_expected_block_bytes)
+		{
+			BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_slice: image_data_size < total_expected_block_bytes The file is corrupted or this is a bug.\n");
+			return false;
+		}
+
+		const astc_blk* pSource_block = reinterpret_cast<const astc_blk*>(pImage_data);
+
+		bool status = false;
+
+		// TODO: Optimize pure memcpy() case.
+
+		for (uint32_t block_y = 0; block_y < num_blocks_y; ++block_y)
+		{
+			void* pDst_block = (uint8_t*)pDst_blocks + block_y * output_row_pitch_in_blocks_or_pixels * output_block_or_pixel_stride_in_bytes;
+			// pDst_block is only consumed by the block-compressed cases; the pixel-format cases address pDst_blocks directly.
+			for (uint32_t block_x = 0; block_x < num_blocks_x; ++block_x, ++pSource_block, pDst_block = (uint8_t*)pDst_block + output_block_or_pixel_stride_in_bytes)
+			{
+				switch (fmt)
+				{
+				case block_format::cUASTC_HDR_4x4:
+				case block_format::cASTC_HDR_4x4:
+				{
+					// Nothing to do, UASTC HDR is just ASTC.
+					memcpy(pDst_block, pSource_block, sizeof(astc_blk));
+					status = true;
+					break;
+				}
+				case block_format::cBC6H:
+				{
+					status = astc_hdr_transcode_to_bc6h(*pSource_block, *(bc6h_block *)pDst_block);
+					break;
+				}
+				case block_format::cRGB_9E5:
+				{
+					astc_helpers::log_astc_block log_blk;
+					status = astc_helpers::unpack_block(pSource_block, log_blk, 4, 4);
+					if (status)
+					{
+						uint32_t* pDst_pixels = reinterpret_cast<uint32_t*>(
+							static_cast<uint8_t*>(pDst_blocks) + (block_x * 4 + block_y * 4 * output_row_pitch_in_blocks_or_pixels) * sizeof(uint32_t)
+							);
+
+						uint32_t blk_texels[4][4];
+
+						status = astc_helpers::decode_block(log_blk, blk_texels, 4, 4, astc_helpers::cDecodeModeRGB9E5);
+
+						if (status)
+						{
+							const uint32_t max_x = basisu::minimum<int>(4, (int)output_row_pitch_in_blocks_or_pixels - (int)block_x * 4);
+							const uint32_t max_y = basisu::minimum<int>(4, (int)output_rows_in_pixels - (int)block_y * 4);
+							// Copy one clipped row per iteration; max_x/max_y keep edge blocks inside the row pitch and row count.
+							for (uint32_t y = 0; y < max_y; y++)
+							{
+								memcpy(pDst_pixels, &blk_texels[y][0], sizeof(uint32_t) * max_x);
+
+								pDst_pixels += output_row_pitch_in_blocks_or_pixels;
+							} // y
+						}
+					}
+
+					break;
+				}
+				case block_format::cRGBA_HALF:
+				{
+					astc_helpers::log_astc_block log_blk;
+					status = astc_helpers::unpack_block(pSource_block, log_blk, 4, 4);
+					if (status)
+					{
+						half_float* pDst_pixels = reinterpret_cast<half_float*>(
+							static_cast<uint8_t*>(pDst_blocks) + (block_x * 4 + block_y * 4 * output_row_pitch_in_blocks_or_pixels) * sizeof(half_float) * 4
+							);
+
+						half_float blk_texels[4][4][4];
+						status = astc_helpers::decode_block(log_blk, blk_texels, 4, 4, astc_helpers::cDecodeModeHDR16);
+						// blk_texels holds 4 half floats per texel; all four are copied to the RGBA destination.
+						if (status)
+						{
+							const uint32_t max_x = basisu::minimum<int>(4, (int)output_row_pitch_in_blocks_or_pixels - (int)block_x * 4);
+							const uint32_t max_y = basisu::minimum<int>(4, (int)output_rows_in_pixels - (int)block_y * 4);
+
+							for (uint32_t y = 0; y < max_y; y++)
+							{
+								for (uint32_t x = 0; x < max_x; x++)
+								{
+									pDst_pixels[0 + 4 * x] = blk_texels[y][x][0];
+									pDst_pixels[1 + 4 * x] = blk_texels[y][x][1];
+									pDst_pixels[2 + 4 * x] = blk_texels[y][x][2];
+									pDst_pixels[3 + 4 * x] = blk_texels[y][x][3];
+								} // x
+
+								pDst_pixels += output_row_pitch_in_blocks_or_pixels * 4;
+							} // y
+						}
+					}
+
+					break;
+				}
+				case block_format::cRGB_HALF:
+				{
+					astc_helpers::log_astc_block log_blk;
+					status = astc_helpers::unpack_block(pSource_block, log_blk, 4, 4);
+					if (status)
+					{
+						half_float* pDst_pixels =
+							reinterpret_cast<half_float*>(static_cast<uint8_t*>(pDst_blocks) + (block_x * 4 + block_y * 4 * output_row_pitch_in_blocks_or_pixels) * sizeof(half_float) * 3);
+						// Decode 4 channels per texel, then copy only the first three to the tightly packed 3-channel destination.
+						half_float blk_texels[4][4][4];
+						status = astc_helpers::decode_block(log_blk, blk_texels, 4, 4, astc_helpers::cDecodeModeHDR16);
+						if (status)
+						{
+							const uint32_t max_x = basisu::minimum<int>(4, (int)output_row_pitch_in_blocks_or_pixels - (int)block_x * 4);
+							const uint32_t max_y = basisu::minimum<int>(4, (int)output_rows_in_pixels - (int)block_y * 4);
+
+							for (uint32_t y = 0; y < max_y; y++)
+							{
+								for (uint32_t x = 0; x < max_x; x++)
+								{
+									pDst_pixels[0 + 3 * x] = blk_texels[y][x][0];
+									pDst_pixels[1 + 3 * x] = blk_texels[y][x][1];
+									pDst_pixels[2 + 3 * x] = blk_texels[y][x][2];
+								} // x
+
+								pDst_pixels += output_row_pitch_in_blocks_or_pixels * 3;
+							} // y
+						}
+					}
+
+					break;
+				}
+				default:
+					assert(0);
+					break;
+				}
+
+				if (!status)
+				{
+					BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_slice: Transcoder failed to unpack a UASTC HDR block - this is a bug, or the data was corrupted\n");
+					return false;
+				}
+			} // block_x
+
+		} // block_y
+
+		return true;
+#else
+		BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_slice: UASTC_HDR is unsupported\n");
+
+		BASISU_NOTE_UNUSED(orig_width);
+		BASISU_NOTE_UNUSED(orig_height);
+		BASISU_NOTE_UNUSED(output_rows_in_pixels);
+		BASISU_NOTE_UNUSED(output_row_pitch_in_blocks_or_pixels);
+		BASISU_NOTE_UNUSED(output_block_or_pixel_stride_in_bytes);
+		BASISU_NOTE_UNUSED(fmt);
+		BASISU_NOTE_UNUSED(image_data_size);
+		BASISU_NOTE_UNUSED(pImage_data);
+		BASISU_NOTE_UNUSED(num_blocks_x);
+		BASISU_NOTE_UNUSED(num_blocks_y);
+		BASISU_NOTE_UNUSED(pDst_blocks);
+
+		return false;
+#endif
+	}
+
+	// Container-independent entry point: bounds-checks the slice and the output buffer, then transcodes one UASTC HDR slice to target_format through transcode_slice(). Returns false on any failure.
+	bool basisu_lowlevel_uastc_hdr_transcoder::transcode_image(
+		transcoder_texture_format target_format,
+		void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels,
+		const uint8_t* pCompressed_data, uint32_t compressed_data_length,
+		uint32_t num_blocks_x, uint32_t num_blocks_y, uint32_t orig_width, uint32_t orig_height, uint32_t level_index,
+		uint32_t slice_offset, uint32_t slice_length,
+		uint32_t decode_flags,
+		bool has_alpha,
+		bool is_video,
+		uint32_t output_row_pitch_in_blocks_or_pixels,
+		basisu_transcoder_state* pState,
+		uint32_t output_rows_in_pixels,
+		int channel0, int channel1)
+	{
+		BASISU_NOTE_UNUSED(is_video);
+		BASISU_NOTE_UNUSED(level_index);
+		BASISU_NOTE_UNUSED(decode_flags);
+		// Widen to 64 bits so slice_offset + slice_length can't wrap around and slip past the bounds check.
+		if (((uint64_t)slice_offset + slice_length) > (uint64_t)compressed_data_length)
+		{
+			BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_image: source data buffer too small\n");
+			return false;
+		}
+
+		const uint32_t bytes_per_block_or_pixel = basis_get_bytes_per_block_or_pixel(target_format);
+		const uint32_t total_slice_blocks = num_blocks_x * num_blocks_y;
+
+		if (!basis_validate_output_buffer_size(target_format, output_blocks_buf_size_in_blocks_or_pixels, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, output_rows_in_pixels, total_slice_blocks))
+		{
+			BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_image: output buffer size too small\n");
+			return false;
+		}
+
+		bool status = false;
+		// Each supported target format maps onto exactly one block_format understood by transcode_slice().
+		switch (target_format)
+		{
+		case transcoder_texture_format::cTFASTC_HDR_4x4_RGBA:
+		{
+			status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cASTC_HDR_4x4,
+				bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, channel0, channel1);
+			if (!status)
+			{
+				BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_image: transcode_slice() to ASTC_HDR failed\n");
+			}
+			break;
+		}
+		case transcoder_texture_format::cTFBC6H:
+		{
+			status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cBC6H,
+				bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, channel0, channel1);
+			if (!status)
+			{
+				BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_image: transcode_slice() to BC6H failed\n");
+			}
+			break;
+		}
+		case transcoder_texture_format::cTFRGB_HALF:
+		{
+			status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cRGB_HALF,
+				bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels);
+			if (!status)
+			{
+				BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_image: transcode_slice() to RGB_HALF failed\n");
+			}
+			break;
+		}
+		case transcoder_texture_format::cTFRGBA_HALF:
+		{
+			status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cRGBA_HALF,
+				bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels);
+			if (!status)
+			{
+				BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_image: transcode_slice() to RGBA_HALF failed\n");
+			}
+			break;
+		}
+		case transcoder_texture_format::cTFRGB_9E5:
+		{
+			status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cRGB_9E5,
+				bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels);
+			if (!status)
+			{
+				BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_image: transcode_slice() to RGB_9E5 failed\n");
+			}
+			break;
+		}
+		default:
+		{
+			assert(0);
+			BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_image: Invalid format\n");
+			break;
+		}
+		}
+
+		return status;
+	}
+
+	//------------------------------------------------------------------------------------------------
 	
 	basisu_transcoder::basisu_transcoder() :
 		m_ready_to_transcode(false)
@@ -10390,7 +10727,7 @@ namespace basist
 		}
 		else
 		{
-			// Nothing special to do for UASTC.
+			// Nothing special to do for UASTC/UASTC HDR.
 			if (m_lowlevel_etc1s_decoder.m_local_endpoints.size())
 			{
 				m_lowlevel_etc1s_decoder.clear();
@@ -10510,7 +10847,14 @@ namespace basist
 			return false;
 		}
 				
-		if (pHeader->m_tex_format == (int)basis_tex_format::cUASTC4x4)
+		if (pHeader->m_tex_format == (int)basis_tex_format::cUASTC_HDR_4x4)
+		{
+			return m_lowlevel_uastc_hdr_decoder.transcode_slice(pOutput_blocks, slice_desc.m_num_blocks_x, slice_desc.m_num_blocks_y,
+				pDataU8 + slice_desc.m_file_ofs, slice_desc.m_file_size,
+				fmt, output_block_or_pixel_stride_in_bytes, (decode_flags & cDecodeFlagsBC1ForbidThreeColorBlocks) == 0, *pHeader, slice_desc, output_row_pitch_in_blocks_or_pixels, pState,
+				output_rows_in_pixels, channel0, channel1, decode_flags);
+		}
+		else if (pHeader->m_tex_format == (int)basis_tex_format::cUASTC4x4)
 		{
 			return m_lowlevel_uastc_decoder.transcode_slice(pOutput_blocks, slice_desc.m_num_blocks_x, slice_desc.m_num_blocks_y,
 				pDataU8 + slice_desc.m_file_ofs, slice_desc.m_file_size,
@@ -10742,7 +11086,18 @@ namespace basist
 			memset(static_cast<uint8_t*>(pOutput_blocks) + total_slice_blocks * bytes_per_block_or_pixel, 0, (output_blocks_buf_size_in_blocks_or_pixels - total_slice_blocks) * bytes_per_block_or_pixel);
 		}
 		
-		if (pHeader->m_tex_format == (int)basis_tex_format::cUASTC4x4)
+		if (pHeader->m_tex_format == (int)basis_tex_format::cUASTC_HDR_4x4)
+		{
+			const basis_slice_desc* pSlice_desc = &pSlice_descs[slice_index];
+
+			// Use the container independent image transcode method.
+			status = m_lowlevel_uastc_hdr_decoder.transcode_image(fmt,
+				pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels,
+				(const uint8_t*)pData, data_size, pSlice_desc->m_num_blocks_x, pSlice_desc->m_num_blocks_y, pSlice_desc->m_orig_width, pSlice_desc->m_orig_height, pSlice_desc->m_level_index,
+				pSlice_desc->m_file_ofs, pSlice_desc->m_file_size,
+				decode_flags, basis_file_has_alpha_slices, pHeader->m_tex_type == cBASISTexTypeVideoFrames, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels);
+		}
+		else if (pHeader->m_tex_format == (int)basis_tex_format::cUASTC4x4)
 		{
 			const basis_slice_desc* pSlice_desc = &pSlice_descs[slice_index];
 
@@ -10808,20 +11163,27 @@ namespace basist
 			return 8;
 		case transcoder_texture_format::cTFBC7_RGBA:
 		case transcoder_texture_format::cTFBC7_ALT:
+		case transcoder_texture_format::cTFBC6H:
 		case transcoder_texture_format::cTFETC2_RGBA:
 		case transcoder_texture_format::cTFBC3_RGBA:
 		case transcoder_texture_format::cTFBC5_RG:
 		case transcoder_texture_format::cTFASTC_4x4_RGBA:
+		case transcoder_texture_format::cTFASTC_HDR_4x4_RGBA:
 		case transcoder_texture_format::cTFATC_RGBA:
 		case transcoder_texture_format::cTFFXT1_RGB:
 		case transcoder_texture_format::cTFETC2_EAC_RG11:
 			return 16;
 		case transcoder_texture_format::cTFRGBA32:
+		case transcoder_texture_format::cTFRGB_9E5:
 			return sizeof(uint32_t);
 		case transcoder_texture_format::cTFRGB565:
 		case transcoder_texture_format::cTFBGR565:
 		case transcoder_texture_format::cTFRGBA4444:
 			return sizeof(uint16_t);
+		case transcoder_texture_format::cTFRGB_HALF:
+			return sizeof(half_float) * 3;
+		case transcoder_texture_format::cTFRGBA_HALF:
+			return sizeof(half_float) * 4;
 		default:
 			assert(0);
 			BASISU_DEVEL_ERROR("basis_get_basisu_texture_format: Invalid fmt\n");
@@ -10845,17 +11207,22 @@ namespace basist
 		case transcoder_texture_format::cTFBC3_RGBA: return "BC3_RGBA";
 		case transcoder_texture_format::cTFBC5_RG: return "BC5_RG";
 		case transcoder_texture_format::cTFASTC_4x4_RGBA: return "ASTC_RGBA";
+		case transcoder_texture_format::cTFASTC_HDR_4x4_RGBA: return "ASTC_HDR_RGBA";
 		case transcoder_texture_format::cTFATC_RGB: return "ATC_RGB";
 		case transcoder_texture_format::cTFATC_RGBA: return "ATC_RGBA";
 		case transcoder_texture_format::cTFRGBA32: return "RGBA32";
 		case transcoder_texture_format::cTFRGB565: return "RGB565";
 		case transcoder_texture_format::cTFBGR565: return "BGR565";
 		case transcoder_texture_format::cTFRGBA4444: return "RGBA4444";
+		case transcoder_texture_format::cTFRGBA_HALF: return "RGBA_HALF";
+		case transcoder_texture_format::cTFRGB_9E5: return "RGB_9E5";
+		case transcoder_texture_format::cTFRGB_HALF: return "RGB_HALF";
 		case transcoder_texture_format::cTFFXT1_RGB: return "FXT1_RGB";
 		case transcoder_texture_format::cTFPVRTC2_4_RGB: return "PVRTC2_4_RGB";
 		case transcoder_texture_format::cTFPVRTC2_4_RGBA: return "PVRTC2_4_RGBA";
 		case transcoder_texture_format::cTFETC2_EAC_R11: return "ETC2_EAC_R11";
 		case transcoder_texture_format::cTFETC2_EAC_RG11: return "ETC2_EAC_RG11";
+		case transcoder_texture_format::cTFBC6H: return "BC6H";
 		default:
 			assert(0);
 			BASISU_DEVEL_ERROR("basis_get_basisu_texture_format: Invalid fmt\n");
@@ -10881,7 +11248,13 @@ namespace basist
 		case block_format::cRGB565: return "RGB565";
 		case block_format::cBGR565: return "BGR565";
 		case block_format::cRGBA4444: return "RGBA4444";
+		case block_format::cRGBA_HALF: return "RGBA_HALF";
+		case block_format::cRGB_HALF: return "RGB_HALF";
+		case block_format::cRGB_9E5: return "RGB_9E5";
 		case block_format::cUASTC_4x4: return "UASTC_4x4";
+		case block_format::cUASTC_HDR_4x4: return "UASTC_HDR_4x4";
+		case block_format::cBC6H: return "BC6H";
+		case block_format::cASTC_HDR_4x4: return "ASTC_HDR_4x4";
 		case block_format::cFXT1_RGB: return "FXT1_RGB";
 		case block_format::cPVRTC2_4_RGB: return "PVRTC2_4_RGB";
 		case block_format::cPVRTC2_4_RGBA: return "PVRTC2_4_RGBA";
@@ -10914,11 +11287,13 @@ namespace basist
 
 	bool basis_transcoder_format_has_alpha(transcoder_texture_format fmt)
 	{
+		// TODO: Technically ASTC_HDR does support alpha, but UASTC_HDR doesn't yet support it. Unsure what to do here.
 		switch (fmt)
 		{
 		case transcoder_texture_format::cTFETC2_RGBA:
 		case transcoder_texture_format::cTFBC3_RGBA:
 		case transcoder_texture_format::cTFASTC_4x4_RGBA:
+		case transcoder_texture_format::cTFASTC_HDR_4x4_RGBA:
 		case transcoder_texture_format::cTFBC7_RGBA:
 		case transcoder_texture_format::cTFBC7_ALT:
 		case transcoder_texture_format::cTFPVRTC1_4_RGBA:
@@ -10926,6 +11301,23 @@ namespace basist
 		case transcoder_texture_format::cTFATC_RGBA:
 		case transcoder_texture_format::cTFRGBA32:
 		case transcoder_texture_format::cTFRGBA4444:
+		case transcoder_texture_format::cTFRGBA_HALF:
+			return true;
+		default:
+			break;
+		}
+		return false;
+	}
+
+	bool basis_transcoder_format_is_hdr(transcoder_texture_format fmt)
+	{
+		switch (fmt)
+		{
+		case transcoder_texture_format::cTFASTC_HDR_4x4_RGBA:
+		case transcoder_texture_format::cTFBC6H:
+		case transcoder_texture_format::cTFRGBA_HALF:
+		case transcoder_texture_format::cTFRGB_HALF:
+		case transcoder_texture_format::cTFRGB_9E5:
 			return true;
 		default:
 			break;
@@ -10947,13 +11339,18 @@ namespace basist
 		case transcoder_texture_format::cTFETC2_RGBA: return basisu::texture_format::cETC2_RGBA;
 		case transcoder_texture_format::cTFBC3_RGBA: return basisu::texture_format::cBC3;
 		case transcoder_texture_format::cTFBC5_RG: return basisu::texture_format::cBC5;
-		case transcoder_texture_format::cTFASTC_4x4_RGBA: return basisu::texture_format::cASTC4x4;
+		case transcoder_texture_format::cTFASTC_4x4_RGBA: return basisu::texture_format::cASTC_LDR_4x4;
+		case transcoder_texture_format::cTFASTC_HDR_4x4_RGBA: return basisu::texture_format::cASTC_HDR_4x4;
+		case transcoder_texture_format::cTFBC6H: return basisu::texture_format::cBC6HUnsigned;
 		case transcoder_texture_format::cTFATC_RGB: return basisu::texture_format::cATC_RGB;
 		case transcoder_texture_format::cTFATC_RGBA: return basisu::texture_format::cATC_RGBA_INTERPOLATED_ALPHA;
 		case transcoder_texture_format::cTFRGBA32: return basisu::texture_format::cRGBA32;
 		case transcoder_texture_format::cTFRGB565: return basisu::texture_format::cRGB565;
 		case transcoder_texture_format::cTFBGR565: return basisu::texture_format::cBGR565;
 		case transcoder_texture_format::cTFRGBA4444: return basisu::texture_format::cRGBA4444;
+		case transcoder_texture_format::cTFRGBA_HALF: return basisu::texture_format::cRGBA_HALF;
+		case transcoder_texture_format::cTFRGB_9E5: return basisu::texture_format::cRGB_9E5;
+		case transcoder_texture_format::cTFRGB_HALF: return basisu::texture_format::cRGB_HALF;
 		case transcoder_texture_format::cTFFXT1_RGB: return basisu::texture_format::cFXT1_RGB;
 		case transcoder_texture_format::cTFPVRTC2_4_RGB: return basisu::texture_format::cPVRTC2_4_RGBA;
 		case transcoder_texture_format::cTFPVRTC2_4_RGBA: return basisu::texture_format::cPVRTC2_4_RGBA;
@@ -10975,6 +11372,9 @@ namespace basist
 		case transcoder_texture_format::cTFRGB565:
 		case transcoder_texture_format::cTFBGR565:
 		case transcoder_texture_format::cTFRGBA4444:
+		case transcoder_texture_format::cTFRGB_HALF:
+		case transcoder_texture_format::cTFRGBA_HALF:
+		case transcoder_texture_format::cTFRGB_9E5:
 			return true;
 		default:
 			break;
@@ -10995,6 +11395,9 @@ namespace basist
 		case block_format::cRGBA4444_COLOR:
 		case block_format::cRGBA4444_ALPHA:
 		case block_format::cRGBA4444_COLOR_OPAQUE:
+		case block_format::cRGBA_HALF:
+		case block_format::cRGB_HALF:
+		case block_format::cRGB_9E5:
 			return true;
 		default:
 			break;
@@ -11007,11 +11410,16 @@ namespace basist
 		switch (fmt)
 		{
 		case transcoder_texture_format::cTFRGBA32:
+		case transcoder_texture_format::cTFRGB_9E5:
 			return sizeof(uint32_t); 
 		case transcoder_texture_format::cTFRGB565:
 		case transcoder_texture_format::cTFBGR565:
 		case transcoder_texture_format::cTFRGBA4444:
 			return sizeof(uint16_t);
+		case transcoder_texture_format::cTFRGB_HALF:
+			return sizeof(half_float) * 3;
+		case transcoder_texture_format::cTFRGBA_HALF:
+			return sizeof(half_float) * 4;
 		default:
 			break;
 		}
@@ -11038,8 +11446,26 @@ namespace basist
 	
 	bool basis_is_format_supported(transcoder_texture_format tex_type, basis_tex_format fmt)
 	{
-		if (fmt == basis_tex_format::cUASTC4x4)
+		if (fmt == basis_tex_format::cUASTC_HDR_4x4)
+		{
+			// UASTC HDR
+#if BASISD_SUPPORT_UASTC_HDR
+			switch (tex_type)
+			{
+			case transcoder_texture_format::cTFASTC_HDR_4x4_RGBA:
+			case transcoder_texture_format::cTFBC6H:
+			case transcoder_texture_format::cTFRGBA_HALF:
+			case transcoder_texture_format::cTFRGB_HALF:
+			case transcoder_texture_format::cTFRGB_9E5:
+				return true;
+			default:
+				break;
+			}
+#endif
+		}
+		else if (fmt == basis_tex_format::cUASTC4x4)
 		{
+			// UASTC LDR
 #if BASISD_SUPPORT_UASTC
 			switch (tex_type)
 			{
@@ -11049,6 +11475,12 @@ namespace basist
 			case transcoder_texture_format::cTFATC_RGB:
 			case transcoder_texture_format::cTFATC_RGBA:
 			case transcoder_texture_format::cTFFXT1_RGB:
+			// UASTC LDR doesn't support transcoding to HDR formats
+			case transcoder_texture_format::cTFASTC_HDR_4x4_RGBA:
+			case transcoder_texture_format::cTFBC6H:
+			case transcoder_texture_format::cTFRGBA_HALF:
+			case transcoder_texture_format::cTFRGB_HALF:
+			case transcoder_texture_format::cTFRGB_9E5:
 				return false;
 			default:
 				return true;
@@ -11057,6 +11489,7 @@ namespace basist
 		}
 		else
 		{
+			// ETC1S
 			switch (tex_type)
 			{
 				// ETC1 and uncompressed are always supported.
@@ -11812,7 +12245,7 @@ namespace basist
 	// Encodes 3 values to output, usable for any range that uses quints and bits
 	static inline void astc_encode_quints(uint32_t* pOutput, const uint8_t* pValues, int& bit_pos, int n)
 	{
-		// First extract the trits and the bits from the 5 input values
+		// First extract the quints and the bits from the 3 input values
 		int quints = 0, bits[3];
 		const uint32_t bit_mask = (1 << n) - 1;
 		for (int i = 0; i < 3; i++)
@@ -12131,11 +12564,13 @@ namespace basist
 
 			return bits & ((1U << codesize) - 1U);
 		}
-
-		uint32_t byte_bit_offset = bit_offset & 7U;
-		const uint16_t w = *(const uint16_t*)(&pBuf[bit_offset >> 3U]);
-		bit_offset += codesize;
-		return (w >> byte_bit_offset)& ((1U << codesize) - 1U);
+		else
+		{
+			uint32_t byte_bit_offset = bit_offset & 7U;
+			const uint16_t w = *(const uint16_t*)(&pBuf[bit_offset >> 3U]);
+			bit_offset += codesize;
+			return (w >> byte_bit_offset) & ((1U << codesize) - 1U);
+		}
 	}
 
 	bool unpack_uastc(const uastc_block& blk, unpacked_uastc_block& unpacked, bool blue_contract_check, bool read_hints)
@@ -12170,6 +12605,7 @@ namespace basist
 			return false;
 
 		unpacked.m_mode = mode;
+		unpacked.m_common_pattern = 0;
 
 		uint32_t bit_ofs = g_uastc_mode_huff_codes[mode][1];
 
@@ -16663,10 +17099,12 @@ namespace basist
 
 		memcpy(&m_header, pData, sizeof(m_header));
 
-		// We only support UASTC and ETC1S
-		if (m_header.m_vk_format != KTX2_VK_FORMAT_UNDEFINED)
+		// We only support UASTC LDR, UASTC HDR and ETC1S.
+		// Note the DFD's contents are what we are guided by for decoding the KTX2 file, not this format field (currently).
+		if ((m_header.m_vk_format != KTX2_VK_FORMAT_UNDEFINED) && 
+			(m_header.m_vk_format != basist::KTX2_FORMAT_UASTC_4x4_SFLOAT_BLOCK))
 		{
-			BASISU_DEVEL_ERROR("ktx2_transcoder::init: KTX2 file must be in ETC1S or UASTC format\n");
+			BASISU_DEVEL_ERROR("ktx2_transcoder::init: KTX2 file must be in ETC1S or UASTC LDR/HDR format\n");
 			return false;
 		}
 
@@ -16890,6 +17328,16 @@ namespace basist
 			// We're assuming "DATA" means RGBA so it has alpha.
 			m_has_alpha = (m_dfd_chan0 == KTX2_DF_CHANNEL_UASTC_RGBA) || (m_dfd_chan0 == KTX2_DF_CHANNEL_UASTC_RRRG);
 		}
+		else if (m_dfd_color_model == KTX2_KDF_DF_MODEL_UASTC_HDR)
+		{
+			m_format = basist::basis_tex_format::cUASTC_HDR_4x4;
+
+			m_dfd_samples = 1;
+			m_dfd_chan0 = (ktx2_df_channel_id)((sample_channel0 >> 24) & 15);
+
+			// We're assuming "DATA" means RGBA so it has alpha.
+			m_has_alpha = (m_dfd_chan0 == KTX2_DF_CHANNEL_UASTC_RGBA) || (m_dfd_chan0 == KTX2_DF_CHANNEL_UASTC_RRRG);
+		}
 		else
 		{
 			// Unsupported DFD color model.
@@ -17167,7 +17615,8 @@ namespace basist
 				return false;
 			}
 		}
-		else if (m_format == basist::basis_tex_format::cUASTC4x4)
+		else if ((m_format == basist::basis_tex_format::cUASTC4x4) ||
+			     (m_format == basist::basis_tex_format::cUASTC_HDR_4x4))
 		{
 			// Compute length and offset to uncompressed 2D UASTC texture data, given the face/layer indices.
 			assert(uncomp_level_data_size == m_levels[level_index].m_uncompressed_byte_length);
@@ -17188,14 +17637,29 @@ namespace basist
 				return false;
 			}
 
-			if (!m_uastc_transcoder.transcode_image(fmt,
-				pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels,
-				(const uint8_t*)pUncomp_level_data + uncomp_ofs, (uint32_t)total_2D_image_size, num_blocks_x, num_blocks_y, level_width, level_height, level_index,
-				0, (uint32_t)total_2D_image_size,
-				decode_flags, m_has_alpha, m_is_video, output_row_pitch_in_blocks_or_pixels, nullptr, output_rows_in_pixels, channel0, channel1))
+			if (m_format == basist::basis_tex_format::cUASTC_HDR_4x4)
 			{
-				BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: UASTC transcode_image() failed, this is either a bug or the file is corrupted/invalid\n");
-				return false;
+				if (!m_uastc_hdr_transcoder.transcode_image(fmt,
+					pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels,
+					(const uint8_t*)pUncomp_level_data + uncomp_ofs, (uint32_t)total_2D_image_size, num_blocks_x, num_blocks_y, level_width, level_height, level_index,
+					0, (uint32_t)total_2D_image_size,
+					decode_flags, m_has_alpha, m_is_video, output_row_pitch_in_blocks_or_pixels, nullptr, output_rows_in_pixels, channel0, channel1))
+				{
+					BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: UASTC HDR transcode_image() failed, this is either a bug or the file is corrupted/invalid\n");
+					return false;
+				}
+			}
+			else
+			{
+				if (!m_uastc_transcoder.transcode_image(fmt,
+					pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels,
+					(const uint8_t*)pUncomp_level_data + uncomp_ofs, (uint32_t)total_2D_image_size, num_blocks_x, num_blocks_y, level_width, level_height, level_index,
+					0, (uint32_t)total_2D_image_size,
+					decode_flags, m_has_alpha, m_is_video, output_row_pitch_in_blocks_or_pixels, nullptr, output_rows_in_pixels, channel0, channel1))
+				{
+					BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: UASTC transcode_image() failed, this is either a bug or the file is corrupted/invalid\n");
+					return false;
+				}
 			}
 		}
 		else
@@ -17476,4 +17940,1531 @@ namespace basist
 #endif
 	}
 
+	//-------------------------------
+
+#ifdef BASISD_SUPPORT_UASTC_HDR
+	// Converts a 32-bit float to half float bits. This float->half conversion matches how "F32TO16" works on Intel GPU's.
+	basist::half_float float_to_half(float val)
+	{
+		// Fetch the float's bit pattern with memcpy; reading an inactive union member is undefined behavior in C++.
+		uint32_t bits;
+		static_assert(sizeof(bits) == sizeof(val), "float_to_half expects a 32-bit float");
+		memcpy(&bits, &val, sizeof(bits));
+		const int flt_m = (int)(bits & 0x7FFFFF), flt_e = (int)((bits >> 23) & 0xFF), flt_s = (int)(bits >> 31);
+		int s = flt_s, e = 0, m = 0;
+
+		if (flt_e == 0xff)
+		{
+			// inf/NaN: every NaN is returned with mantissa 1
+			e = 31;
+			if (flt_m != 0)
+				m = 1;
+		}
+		else if (flt_e != 0) // not zero or denormal (those are returned as signed zero)
+		{
+			const int new_exp = flt_e - 127;
+			if (new_exp > 15)
+				e = 31; // too large for a half: infinity
+			else if (new_exp < -14)
+				m = lrintf((1 << 24) * fabsf(val)); // below the smallest normal half: store as a denormal
+			else
+			{
+				e = new_exp + 15;
+				m = lrintf(flt_m * (1.0f / ((float)(1 << 13)))); // round the 23-bit mantissa to 10 bits
+			}
+		}
+
+		assert((0 <= m) && (m <= 1024));
+		if (m == 1024) // rounding carried out of the mantissa: bump the exponent instead
+		{
+			e++;
+			m = 0;
+		}
+
+		assert((s >= 0) && (s <= 1));
+		assert((e >= 0) && (e <= 31));
+		assert((m >= 0) && (m <= 1023));
+		return (basist::half_float)((s << 15) | (e << 10) | m);
+	}
+		
+	//------------------------------------------------------------------------------------------------
+	// HDR support
+	// 
+	// Originally from bc6h_enc.cpp
+	// BC6H decoder fuzzed vs. DirectXTex's for unsigned/signed
+
+	const uint8_t g_bc6h_mode_sig_bits[NUM_BC6H_MODES][4] = // per mode: base bits (width of the directly stored first endpoint), then the r, g, b widths of the remaining endpoints
+	{
+		// 2 subsets
+		{ 10, 5, 5, 5, },	// 0, mode 1 in MS/D3D docs
+		{ 7, 6, 6, 6, },	// 1
+		{ 11, 5, 4, 4, },	// 2
+		{ 11, 4, 5, 4, },	// 3
+		{ 11, 4, 4, 5, },	// 4
+		{ 9, 5, 5, 5, },	// 5
+		{ 8, 6, 5, 5, },	// 6
+		{ 8, 5, 6, 5, },	// 7
+		{ 8, 5, 5, 6, },	// 8
+		{ 6, 6, 6, 6, },	// 9, endpoints not delta encoded, mode 10 in MS/D3D docs
+		// 1 subset
+		{ 10, 10, 10, 10, }, // 10, endpoints not delta encoded, mode 11 in MS/D3D docs
+		{ 11, 9, 9, 9, },	// 11
+		{ 12, 8, 8, 8, },	// 12
+		{ 16, 4, 4, 4, }	// 13, also useful for solid blocks
+	};
+
+	const int8_t g_bc6h_mode_lookup[32] = { 0, 1, 2, 10, 0, 1, 3, 11, 0, 1, 4, 12, 0, 1, 5, 13, 0, 1, 6, -1, 0, 1, 7, -1, 0, 1, 8, -1, 0, 1, 9, -1 }; // Inverse of s_mode_bits in pack_bc6h_block(): entry [code] is the mode whose 2 or 5-bit code matches the low bits of code, -1 if none does. Presumably indexed by a block's first 5 bits when decoding - confirm at the use site.
+
+	const bc6h_bit_layout g_bc6h_bit_layouts[NUM_BC6H_MODES][MAX_BC6H_LAYOUT_INDEX] =
+	{
+		// Each entry: comp_index (0-2 = endpoint component, 3 = partition pattern, -1 = end of list), subset*2+lh_index, last_bit, first_bit (-1 = only the single bit last_bit is stored; first_bit > last_bit = bits stored reversed)
+		//------------------------        mode 0: 2 subsets, Weight bits: 46 bits, Endpoint bits: 75 bits (10.555, 10.555, 10.555), delta
+		{ { 1, 2, 4, -1 }, { 2, 2, 4, -1 }, { 2, 3, 4, -1 }, { 0, 0, 9, 0 }, { 1, 0, 9, 0 }, { 2, 0, 9, 0 }, { 0, 1, 4, 0 },
+		{ 1, 3, 4, -1 }, { 1, 2, 3, 0 }, { 1, 1, 4, 0 }, { 2, 3, 0, -1 }, { 1, 3, 3, 0 }, { 2, 1, 4, 0 }, { 2, 3, 1, -1 },
+		{ 2, 2, 3, 0 }, { 0, 2, 4, 0 }, { 2, 3, 2, -1 }, { 0, 3, 4, 0 }, { 2, 3, 3, -1 }, { 3, -1, 4, 0 }, {-1, 0, 0, 0} },
+		//------------------------        mode 1: 2 subsets, Weight bits: 46 bits, Endpoint bits: 75 bits (7.666, 7.666, 7.666), delta
+		{ { 1, 2, 5, -1 },{ 1, 3, 4, -1 },{ 1, 3, 5, -1 },{ 0, 0, 6, 0 },{ 2, 3, 0, -1 },{ 2, 3, 1, -1 },{ 2, 2, 4, -1 },
+		{ 1, 0, 6, 0 },{ 2, 2, 5, -1 },{ 2, 3, 2, -1 },{ 1, 2, 4, -1 },{ 2, 0, 6, 0 },{ 2, 3, 3, -1 },{ 2, 3, 5, -1 },
+		{ 2, 3, 4, -1 },{ 0, 1, 5, 0 },{ 1, 2, 3, 0 },{ 1, 1, 5, 0 },{ 1, 3, 3, 0 },{ 2, 1, 5, 0 },{ 2, 2, 3, 0 },{ 0, 2, 5, 0 },
+		{ 0, 3, 5, 0 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} },
+		//------------------------        mode 2: 2 subsets, Weight bits: 46 bits, Endpoint bits: 72 bits (11.555, 11.444, 11.444), delta
+		{ { 0, 0, 9, 0 },{ 1, 0, 9, 0 },{ 2, 0, 9, 0 },{ 0, 1, 4, 0 },{ 0, 0, 10, -1 },{ 1, 2, 3, 0 },{ 1, 1, 3, 0 },{ 1, 0, 10, -1 },
+		{ 2, 3, 0, -1 },{ 1, 3, 3, 0 },{ 2, 1, 3, 0 },{ 2, 0, 10, -1 },{ 2, 3, 1, -1 },{ 2, 2, 3, 0 },{ 0, 2, 4, 0 },{ 2, 3, 2, -1 },
+		{ 0, 3, 4, 0 },{ 2, 3, 3, -1 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} },
+		//------------------------        mode 3: 2 subsets, Weight bits: 46 bits, Endpoint bits: 72 bits (11.444, 11.555, 11.444), delta
+		{ { 0, 0, 9, 0 },{ 1, 0, 9, 0 },{ 2, 0, 9, 0 },{ 0, 1, 3, 0 },{ 0, 0, 10, -1 },{ 1, 3, 4, -1 },{ 1, 2, 3, 0 },{ 1, 1, 4, 0 },
+		{ 1, 0, 10, -1 },{ 1, 3, 3, 0 },{ 2, 1, 3, 0 },{ 2, 0, 10, -1 },{ 2, 3, 1, -1 },{ 2, 2, 3, 0 },{ 0, 2, 3, 0 },{ 2, 3, 0, -1 },
+		{ 2, 3, 2, -1 },{ 0, 3, 3, 0 },{ 1, 2, 4, -1 },{ 2, 3, 3, -1 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} },
+		//------------------------        mode 4: 2 subsets, Weight bits: 46 bits, Endpoint bits: 72 bits (11.444, 11.444, 11.555), delta
+		{ { 0, 0, 9, 0 },{ 1, 0, 9, 0 },{ 2, 0, 9, 0 },{ 0, 1, 3, 0 },{ 0, 0, 10, -1 },{ 2, 2, 4, -1 },{ 1, 2, 3, 0 },{ 1, 1, 3, 0 },
+		{ 1, 0, 10, -1 },{ 2, 3, 0, -1 },{ 1, 3, 3, 0 },{ 2, 1, 4, 0 },{ 2, 0, 10, -1 },{ 2, 2, 3, 0 },{ 0, 2, 3, 0 },{ 2, 3, 1, -1 },
+		{ 2, 3, 2, -1 },{ 0, 3, 3, 0 },{ 2, 3, 4, -1 },{ 2, 3, 3, -1 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} },
+		//------------------------        mode 5: 2 subsets, Weight bits: 46 bits, Endpoint bits: 72 bits (9.555, 9.555, 9.555), delta
+		{ { 0, 0, 8, 0 },{ 2, 2, 4, -1 },{ 1, 0, 8, 0 },{ 1, 2, 4, -1 },{ 2, 0, 8, 0 },{ 2, 3, 4, -1 },{ 0, 1, 4, 0 },{ 1, 3, 4, -1 },
+		{ 1, 2, 3, 0 },{ 1, 1, 4, 0 },{ 2, 3, 0, -1 },{ 1, 3, 3, 0 },{ 2, 1, 4, 0 },{ 2, 3, 1, -1 },{ 2, 2, 3, 0 },{ 0, 2, 4, 0 },
+		{ 2, 3, 2, -1 },{ 0, 3, 4, 0 },{ 2, 3, 3, -1 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} },
+		//------------------------        mode 6: 2 subsets, Weight bits: 46 bits, Endpoint bits: 72 bits (8.666, 8.555, 8.555), delta
+		{ { 0, 0, 7, 0 },{ 1, 3, 4, -1 },{ 2, 2, 4, -1 },{ 1, 0, 7, 0 },{ 2, 3, 2, -1 },{ 1, 2, 4, -1 },{ 2, 0, 7, 0 },{ 2, 3, 3, -1 },
+		{ 2, 3, 4, -1 },{ 0, 1, 5, 0 },{ 1, 2, 3, 0 },{ 1, 1, 4, 0 },{ 2, 3, 0, -1 },{ 1, 3, 3, 0 },{ 2, 1, 4, 0 },{ 2, 3, 1, -1 },
+		{ 2, 2, 3, 0 },{ 0, 2, 5, 0 },{ 0, 3, 5, 0 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} },
+		//------------------------        mode 7: 2 subsets, Weight bits: 46 bits, Endpoint bits: 72 bits (8.555, 8.666, 8.555), delta
+		{ { 0, 0, 7, 0 },{ 2, 3, 0, -1 },{ 2, 2, 4, -1 },{ 1, 0, 7, 0 },{ 1, 2, 5, -1 },{ 1, 2, 4, -1 },{ 2, 0, 7, 0 },{ 1, 3, 5, -1 },
+		{ 2, 3, 4, -1 },{ 0, 1, 4, 0 },{ 1, 3, 4, -1 },{ 1, 2, 3, 0 },{ 1, 1, 5, 0 },{ 1, 3, 3, 0 },{ 2, 1, 4, 0 },{ 2, 3, 1, -1 },
+		{ 2, 2, 3, 0 },{ 0, 2, 4, 0 },{ 2, 3, 2, -1 },{ 0, 3, 4, 0 },{ 2, 3, 3, -1 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} },
+		//------------------------        mode 8: 2 subsets, Weight bits: 46 bits, Endpoint bits: 72 bits (8.555, 8.555, 8.666), delta
+		{ { 0, 0, 7, 0 },{ 2, 3, 1, -1 },{ 2, 2, 4, -1 },{ 1, 0, 7, 0 },{ 2, 2, 5, -1 },{ 1, 2, 4, -1 },{ 2, 0, 7, 0 },{ 2, 3, 5, -1 },
+		{ 2, 3, 4, -1 },{ 0, 1, 4, 0 },{ 1, 3, 4, -1 },{ 1, 2, 3, 0 },{ 1, 1, 4, 0 },{ 2, 3, 0, -1 },{ 1, 3, 3, 0 },{ 2, 1, 5, 0 },
+		{ 2, 2, 3, 0 },{ 0, 2, 4, 0 },{ 2, 3, 2, -1 },{ 0, 3, 4, 0 },{ 2, 3, 3, -1 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} },
+		//------------------------        mode 9: 2 subsets, Weight bits: 46 bits, Endpoint bits: 72 bits (6.6.6.6, 6.6.6.6, 6.6.6.6), NO delta
+		{ { 0, 0, 5, 0 },{ 1, 3, 4, -1 },{ 2, 3, 0, -1 },{ 2, 3, 1, -1 },{ 2, 2, 4, -1 },{ 1, 0, 5, 0 },{ 1, 2, 5, -1 },{ 2, 2, 5, -1 },
+		{ 2, 3, 2, -1 },{ 1, 2, 4, -1 },{ 2, 0, 5, 0 },{ 1, 3, 5, -1 },{ 2, 3, 3, -1 },{ 2, 3, 5, -1 },{ 2, 3, 4, -1 },{ 0, 1, 5, 0 },
+		{ 1, 2, 3, 0 },{ 1, 1, 5, 0 },{ 1, 3, 3, 0 },{ 2, 1, 5, 0 },{ 2, 2, 3, 0 },{ 0, 2, 5, 0 },{ 0, 3, 5, 0 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} },
+		//------------------------        mode 10: 1 subset, Weight bits: 63 bits, Endpoint bits: 60 bits (10.10, 10.10, 10.10), NO delta
+		{ { 0, 0, 9, 0 },{ 1, 0, 9, 0 },{ 2, 0, 9, 0 },{ 0, 1, 9, 0 },{ 1, 1, 9, 0 },{ 2, 1, 9, 0 }, {-1, 0, 0, 0} },
+		//------------------------        mode 11: 1 subset, Weight bits: 63 bits, Endpoint bits: 60 bits (11.9, 11.9, 11.9), delta
+		{ { 0, 0, 9, 0 },{ 1, 0, 9, 0 },{ 2, 0, 9, 0 },{ 0, 1, 8, 0 },{ 0, 0, 10, -1 },{ 1, 1, 8, 0 },{ 1, 0, 10, -1 },{ 2, 1, 8, 0 },{ 2, 0, 10, -1 }, {-1, 0, 0, 0} },
+		//------------------------        mode 12: 1 subset, Weight bits: 63 bits, Endpoint bits: 60 bits (12.8, 12.8, 12.8), delta
+		{ { 0, 0, 9, 0 },{ 1, 0, 9, 0 },{ 2, 0, 9, 0 },{ 0, 1, 7, 0 },{ 0, 0, 10, 11 },{ 1, 1, 7, 0 },{ 1, 0, 10, 11 },{ 2, 1, 7, 0 },{ 2, 0, 10, 11 }, {-1, 0, 0, 0} },
+		//------------------------        mode 13: 1 subset, Weight bits: 63 bits, Endpoint bits: 60 bits (16.4, 16.4, 16.4), delta
+		{ { 0, 0, 9, 0 },{ 1, 0, 9, 0 },{ 2, 0, 9, 0 },{ 0, 1, 3, 0 },{ 0, 0, 10, 15 },{ 1, 1, 3, 0 },{ 1, 0, 10, 15 },{ 2, 1, 3, 0 },{ 2, 0, 10, 15 }, {-1, 0, 0, 0} }
+	};
+
+	// The same as the first 32 2-subset patterns in BC7. The low bits of each entry are the texel's subset index (0 or 1).
+	// Bit 7 is a flag indicating that the weight uses 1 less bit than usual (pack_bc6h_block() subtracts it from the weight's bit count).
+	const uint8_t g_bc6h_2subset_patterns[TOTAL_BC6H_PARTITION_PATTERNS][4][4] = // [pat][y][x]
+	{
+		{ {0x80, 0, 1, 1}, { 0, 0, 1, 1 }, { 0, 0, 1, 1 }, { 0, 0, 1, 0x81 }}, { {0x80, 0, 0, 1}, {0, 0, 0, 1}, {0, 0, 0, 1}, {0, 0, 0, 0x81} },
+		{ {0x80, 1, 1, 1}, {0, 1, 1, 1}, {0, 1, 1, 1}, {0, 1, 1, 0x81} }, { {0x80, 0, 0, 1}, {0, 0, 1, 1}, {0, 0, 1, 1}, {0, 1, 1, 0x81} },
+		{ {0x80, 0, 0, 0}, {0, 0, 0, 1}, {0, 0, 0, 1}, {0, 0, 1, 0x81} }, { {0x80, 0, 1, 1}, {0, 1, 1, 1}, {0, 1, 1, 1}, {1, 1, 1, 0x81} },
+		{ {0x80, 0, 0, 1}, {0, 0, 1, 1}, {0, 1, 1, 1}, {1, 1, 1, 0x81} }, { {0x80, 0, 0, 0}, {0, 0, 0, 1}, {0, 0, 1, 1}, {0, 1, 1, 0x81} },
+		{ {0x80, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 1}, {0, 0, 1, 0x81} }, { {0x80, 0, 1, 1}, {0, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 0x81} },
+		{ {0x80, 0, 0, 0}, {0, 0, 0, 1}, {0, 1, 1, 1}, {1, 1, 1, 0x81} }, { {0x80, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 1}, {0, 1, 1, 0x81} },
+		{ {0x80, 0, 0, 1}, {0, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 0x81} }, { {0x80, 0, 0, 0}, {0, 0, 0, 0}, {1, 1, 1, 1}, {1, 1, 1, 0x81} },
+		{ {0x80, 0, 0, 0}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 0x81} }, { {0x80, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {1, 1, 1, 0x81} },
+		{ {0x80, 0, 0, 0}, {1, 0, 0, 0}, {1, 1, 1, 0}, {1, 1, 1, 0x81} }, { {0x80, 1, 0x81, 1}, {0, 0, 0, 1}, {0, 0, 0, 0}, {0, 0, 0, 0} },
+		{ {0x80, 0, 0, 0}, {0, 0, 0, 0}, {0x81, 0, 0, 0}, {1, 1, 1, 0} }, { {0x80, 1, 0x81, 1}, {0, 0, 1, 1}, {0, 0, 0, 1}, {0, 0, 0, 0} },
+		{ {0x80, 0, 0x81, 1}, {0, 0, 0, 1}, {0, 0, 0, 0}, {0, 0, 0, 0} }, { {0x80, 0, 0, 0}, {1, 0, 0, 0}, {0x81, 1, 0, 0}, {1, 1, 1, 0} },
+		{ {0x80, 0, 0, 0}, {0, 0, 0, 0}, {0x81, 0, 0, 0}, {1, 1, 0, 0} }, { {0x80, 1, 1, 1}, {0, 0, 1, 1}, {  0, 0, 1, 1}, {0, 0, 0, 0x81} },
+		{ {0x80, 0, 0x81, 1}, {0, 0, 0, 1}, {0, 0, 0, 1}, {0, 0, 0, 0} }, { {0x80, 0, 0, 0}, {1, 0, 0, 0}, {0x81, 0, 0, 0}, {1, 1, 0, 0} },
+		{ {0x80, 1, 0x81, 0}, {0, 1, 1, 0}, {0, 1, 1, 0}, {0, 1, 1, 0} }, { {0x80, 0, 0x81, 1}, {0, 1, 1, 0}, {0, 1, 1, 0}, {1, 1, 0, 0} },
+		{ {0x80, 0, 0, 1}, {0, 1, 1, 1}, {0x81, 1, 1, 0}, {1, 0, 0, 0} }, { {0x80, 0, 0, 0}, {1, 1, 1, 1}, {0x81, 1, 1, 1}, {0, 0, 0, 0} },
+		{ {0x80, 1, 0x81, 1}, {0, 0, 0, 1}, {1, 0, 0, 0}, {1, 1, 1, 0} }, { {0x80, 0, 0x81, 1}, {1, 0, 0, 1}, {1, 0, 0, 1}, {1, 1, 0, 0} }
+	};
+
+	const uint8_t g_bc6h_weight3[8] = { 0, 9, 18, 27, 37, 46, 55, 64 }; // one entry per 3-bit weight, values 0-64; presumably the interpolation factors used when decoding - not referenced by the encoder code here, confirm at the use site
+	const uint8_t g_bc6h_weight4[16] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 }; // same for 4-bit weights (presumably; confirm at the use site)
+		
+	// Unpacked description of a single BC6H block. The encoders in this file clear() one, fill in the
+	// fields and hand it to pack_bc6h_block(), which produces the packed 16 byte block.
+	struct bc6h_logical_block
+	{
+		uint32_t m_mode;				// index into the per-mode tables, must be < NUM_BC6H_MODES
+		uint32_t m_partition_pattern;	// must be 0 if 1 subset
+		uint32_t m_endpoints[3][4];		// [comp][subset*2+lh_index] - must be already properly packed
+		uint8_t m_weights[16];			// weights must be of the proper size, taking into account skipped MSB's which must be 0
+
+		// Resets every field by handing the whole object to basisu::clear_obj().
+		void clear() { basisu::clear_obj(*this); }
+	};
+
+	// Appends the num_bits-wide value val at bit_pos of the 128-bit stream l (bits 0-63) / h (bits 64-127), then advances bit_pos.
+	static inline void write_bits(uint64_t val, uint32_t num_bits, uint32_t& bit_pos, uint64_t& l, uint64_t& h)
+	{
+		assert((num_bits) && (num_bits < 64) && (bit_pos < 128));
+		assert(val < (1ULL << num_bits));
+
+		const uint32_t start_pos = bit_pos, end_pos = bit_pos + num_bits;
+
+		if (start_pos >= 64)
+			h |= (val << (start_pos - 64)); // lands entirely in the high qword
+		else
+		{
+			l |= (val << start_pos);
+			if (end_pos > 64)
+				h |= (val >> (64 - start_pos)); // the part that straddles into the high qword
+		}
+
+		bit_pos = end_pos;
+		assert(bit_pos <= 128);
+	}
+
+	// Same as write_bits(), but emits the num_bits bits of val most significant bit first.
+	static inline void write_rev_bits(uint64_t val, uint32_t num_bits, uint32_t& bit_pos, uint64_t& l, uint64_t& h)
+	{
+		assert((num_bits) && (num_bits < 64) && (bit_pos < 128));
+		assert(val < (1ULL << num_bits));
+		for (uint32_t shift = num_bits; shift-- > 0; )
+			write_bits((val >> shift) & 1, 1, bit_pos, l, h);
+	}
+
+	// Serializes a logical block into the 16 bytes of dst_blk. The bit stream is: the mode code, then the endpoint and partition
+	// fields in the order listed by g_bc6h_bit_layouts[mode], then the 16 weights. log_blk's contents are only checked by asserts.
+	static void pack_bc6h_block(bc6h_block& dst_blk, bc6h_logical_block& log_blk)
+	{
+		// Mode codes (2 bits for modes 0-1, 5 bits otherwise). Static so the table isn't rebuilt on the stack on every call.
+		static const uint8_t s_mode_bits[NUM_BC6H_MODES] = { 0b00, 0b01, 0b00010, 0b00110, 0b01010, 0b01110, 0b10010, 0b10110, 0b11010, 0b11110, 0b00011, 0b00111, 0b01011, 0b01111 };
+
+		const uint32_t mode = log_blk.m_mode;
+		assert(mode < NUM_BC6H_MODES);
+
+		uint64_t l = s_mode_bits[mode], h = 0; // low/high 64 bits of the block being built
+		uint32_t bit_pos = (mode >= 2) ? 5 : 2;
+
+		const uint32_t num_subsets = (mode >= BC6H_FIRST_1SUBSET_MODE_INDEX) ? 1 : 2;
+
+		assert(((num_subsets == 2) && (log_blk.m_partition_pattern < TOTAL_BC6H_PARTITION_PATTERNS)) ||
+			((num_subsets == 1) && (!log_blk.m_partition_pattern)));
+
+		// Sanity checks
+		for (uint32_t c = 0; c < 3; c++)
+		{
+			assert(log_blk.m_endpoints[c][0] < (1u << g_bc6h_mode_sig_bits[mode][0]));	   // 1st subset l, base bits
+			assert(log_blk.m_endpoints[c][1] < (1u << g_bc6h_mode_sig_bits[mode][c + 1])); // 1st subset h, these are deltas except for modes 9,10
+			assert(log_blk.m_endpoints[c][2] < (1u << g_bc6h_mode_sig_bits[mode][c + 1])); // 2nd subset l
+			assert(log_blk.m_endpoints[c][3] < (1u << g_bc6h_mode_sig_bits[mode][c + 1])); // 2nd subset h
+		}
+
+		// Emit each field of this mode's layout; an entry with comp == -1 ends the list.
+		for (const bc6h_bit_layout* pLayout = &g_bc6h_bit_layouts[mode][0]; pLayout->m_comp != -1; pLayout++)
+		{
+			// comp 3 selects the partition pattern, anything else an endpoint component.
+			uint32_t v = (pLayout->m_comp == 3) ? log_blk.m_partition_pattern : log_blk.m_endpoints[pLayout->m_comp][pLayout->m_index];
+
+			if (pLayout->m_first_bit == -1)
+			{
+				// Single bit field: just bit m_last_bit of the value.
+				write_bits((v >> pLayout->m_last_bit) & 1, 1, bit_pos, l, h);
+				continue;
+			}
+
+			const uint32_t total_bits = basisu::iabs(pLayout->m_last_bit - pLayout->m_first_bit) + 1;
+
+			v >>= basisu::minimum(pLayout->m_first_bit, pLayout->m_last_bit);
+			v &= ((1 << total_bits) - 1);
+
+			// A first_bit above last_bit means the field's bits are stored in reverse order.
+			if (pLayout->m_first_bit > pLayout->m_last_bit)
+				write_rev_bits(v, total_bits, bit_pos, l, h);
+			else
+				write_bits(v, total_bits, bit_pos, l, h);
+		}
+
+		// Weights: 3 bits each for 2-subset modes, 4 bits for 1-subset modes, minus one bit where noted below.
+		const uint32_t num_mode_sel_bits = (num_subsets == 1) ? 4 : 3;
+		const uint8_t* pPat = &g_bc6h_2subset_patterns[log_blk.m_partition_pattern][0][0];
+
+		for (uint32_t i = 0; i < 16; i++)
+		{
+			const uint32_t sel = log_blk.m_weights[i];
+
+			uint32_t num_bits = num_mode_sel_bits;
+			if (num_subsets == 2)
+				num_bits -= (pPat[i] >> 7); // bit 7 of the pattern entry flags a weight stored with 1 less bit
+			else if (!i)
+				num_bits--; // with 1 subset only the first weight is stored with 1 less bit
+
+			assert(sel < (1u << num_bits));
+
+			write_bits(sel, num_bits, bit_pos, l, h);
+		}
+
+		// Every mode must fill the 128 bits exactly.
+		assert(bit_pos == 128);
+
+		// Store the two halves as four little-endian dwords.
+		basisu::write_le_dword(&dst_blk.m_bytes[0], (uint32_t)l);
+		basisu::write_le_dword(&dst_blk.m_bytes[4], (uint32_t)(l >> 32u));
+		basisu::write_le_dword(&dst_blk.m_bytes[8], (uint32_t)h);
+		basisu::write_le_dword(&dst_blk.m_bytes[12], (uint32_t)(h >> 32u));
+	}
+
+#if 0 // Compiled out. Expands a bits_per_comp-wide quantized endpoint to a 16-bit value.
+	static inline uint32_t bc6h_blog_dequantize_to_blog16(uint32_t comp, uint32_t bits_per_comp)
+	{
+		int unq;
+
+		if (bits_per_comp >= 15)
+			unq = comp; // 15 bits or more: passed through unchanged
+		else if (comp == 0)
+			unq = 0; // zero stays zero
+		else if (comp == ((1u << bits_per_comp) - 1u))
+			unq = 0xFFFFu; // the all-ones code maps to the 16-bit maximum
+		else
+			unq = ((comp << 16u) + 0x8000u) >> bits_per_comp; // scale up to 16 bits, with a half-step offset added first
+
+		return unq;
+	}
+#endif // 0
+
+	// Quantizes the half float h (passed as its integer bit pattern) to a num_bits-wide endpoint value. Suboptimal, but very close.
+	static inline uint32_t bc6h_half_to_blog(half_float h, uint32_t num_bits)
+	{
+		assert(h <= MAX_BC6H_HALF_FLOAT_AS_UINT);
+		return (30 + 64 * h) / (31 << (16 - num_bits)); // i.e. (h * 64 + 30) / (31 * 2^(16 - num_bits))
+	}
+
+	// Endpoint precisions (besides 16 bits) accepted by half_to_blog_tab():
+	// 6,7,8,9,10,11,12
+	const uint32_t BC6H_BLOG_TAB_MIN = 6;
+	const uint32_t BC6H_BLOG_TAB_MAX = 12;
+	//const uint32_t BC6H_BLOG_TAB_NUM = BC6H_BLOG_TAB_MAX - BC6H_BLOG_TAB_MIN + 1;
+
+	// Quantizes the half float h to a num_bits-wide endpoint value by forwarding to bc6h_half_to_blog().
+	// Handles 16, or 6-12 bits. Others assert.
+	static inline uint32_t half_to_blog_tab(half_float h, uint32_t num_bits)
+	{
+		// The range constants are otherwise only referenced by the assert below
+		// (presumably these macros silence unused warnings when asserts are compiled out).
+		BASISU_NOTE_UNUSED(BC6H_BLOG_TAB_MIN);
+		BASISU_NOTE_UNUSED(BC6H_BLOG_TAB_MAX);
+
+		assert(h <= MAX_BC6H_HALF_FLOAT_AS_UINT);
+
+		// The 16-bit case and the 6-12 bit cases use the same arithmetic, so one call covers both.
+		assert((num_bits == 16) || ((num_bits >= BC6H_BLOG_TAB_MIN) && (num_bits <= BC6H_BLOG_TAB_MAX)));
+
+		// Note: This used to be done using a table lookup, but it required ~224KB of tables.
+		// This isn't quite as accurate, but the error is very slight (+-1 half values as ints).
+		return bc6h_half_to_blog(h, num_bits);
+	}
+
+
+	// Set once bc6h_enc_init() has run; the bc6h_enc_block_* functions assert on it.
+	bool g_bc6h_enc_initialized;
+
+	// Encoder setup: it only records that initialization happened. Calling it again changes nothing.
+	void bc6h_enc_init()
+	{
+		if (!g_bc6h_enc_initialized)
+			g_bc6h_enc_initialized = true;
+	}
+
+	// Packs a block as mode 10: 1 subset, endpoints quantized to 10 bits and stored directly (no deltas), 4-bit weights.
+	// pEndpoints is indexed [comp][lh_index]; pWeights points to 16 weights, each at most 15.
+	void bc6h_enc_block_mode10(bc6h_block* pPacked_block, const half_float pEndpoints[3][2], const uint8_t* pWeights)
+	{
+		assert(g_bc6h_enc_initialized);
+
+		for (uint32_t i = 0; i < 16; i++)
+		{
+			assert(pWeights[i] <= 15);
+		}
+
+		// The first weight is packed without its MSB. When that bit is set, all weights are inverted
+		// and the two endpoints trade places.
+		const bool flip = (pWeights[0] & 8) != 0;
+		const uint32_t low_slot = flip ? 1 : 0;
+		const uint32_t high_slot = flip ? 0 : 1;
+
+		bc6h_logical_block log_blk;
+		log_blk.clear();
+		log_blk.m_mode = BC6H_FIRST_1SUBSET_MODE_INDEX;
+
+		// Convert half endpoints to blog10 (mode 10 doesn't use delta encoding)
+		for (uint32_t c = 0; c < 3; c++)
+		{
+			log_blk.m_endpoints[c][low_slot] = half_to_blog_tab(pEndpoints[c][0], 10);
+			log_blk.m_endpoints[c][high_slot] = half_to_blog_tab(pEndpoints[c][1], 10);
+		}
+
+		// Copy the weights, inverted when flipping
+		for (uint32_t i = 0; i < 16; i++)
+		{
+			log_blk.m_weights[i] = flip ? (uint8_t)(15 - pWeights[i]) : pWeights[i];
+		}
+
+		pack_bc6h_block(*pPacked_block, log_blk);
+	}
+
+	// Packs a 1-subset block with 4-bit weights, preferring the modes with the most base endpoint bits.
+	// Tries modes 11-13 (delta endpoint) encoding, falling back to mode 10 only when necessary, 4-bit weights
+	// pEndpoints is indexed [comp][lh_index]; pWeights points to 16 weights, each at most 15.
+	void bc6h_enc_block_1subset_4bit_weights(bc6h_block* pPacked_block, const half_float pEndpoints[3][2], const uint8_t* pWeights)
+	{
+		assert(g_bc6h_enc_initialized);
+
+		for (uint32_t i = 0; i < 16; i++)
+		{
+			assert(pWeights[i] <= 15);
+		}
+
+		bc6h_logical_block log_blk;
+		log_blk.clear();
+
+		// Ensure first weight MSB is 0: when it is set the weights get inverted here and the
+		// quantized endpoints are exchanged below (the same for every mode tried).
+		const bool flip = (pWeights[0] & 8) != 0;
+		const uint32_t low_slot = flip ? 1 : 0;
+		const uint32_t high_slot = 1 - low_slot;
+
+		for (uint32_t i = 0; i < 16; i++)
+			log_blk.m_weights[i] = flip ? (uint8_t)(15 - pWeights[i]) : pWeights[i];
+
+		// Try the delta modes from BC6H_LAST_MODE_INDEX downward and take the first whose deltas fit.
+		for (uint32_t mode = BC6H_LAST_MODE_INDEX; mode > BC6H_FIRST_1SUBSET_MODE_INDEX; mode--)
+		{
+			const uint32_t num_base_bits = g_bc6h_mode_sig_bits[mode][0];
+			const uint32_t num_delta_bits = g_bc6h_mode_sig_bits[mode][1];
+			const int base_bitmask = (1 << num_base_bits) - 1;
+			const int delta_bitmask = (1 << num_delta_bits) - 1;
+			BASISU_NOTE_UNUSED(base_bitmask);
+
+			assert(num_delta_bits < num_base_bits);
+			assert((num_delta_bits == g_bc6h_mode_sig_bits[mode][2]) && (num_delta_bits == g_bc6h_mode_sig_bits[mode][3]));
+
+			// Convert half endpoints to blog 16, 12, or 11
+			uint32_t blog_endpoints[3][2];
+			for (uint32_t c = 0; c < 3; c++)
+			{
+				const uint32_t q0 = half_to_blog_tab(pEndpoints[c][0], num_base_bits);
+				assert((int)q0 <= base_bitmask);
+
+				const uint32_t q1 = half_to_blog_tab(pEndpoints[c][1], num_base_bits);
+				assert((int)q1 <= base_bitmask);
+
+				blog_endpoints[c][low_slot] = q0;
+				blog_endpoints[c][high_slot] = q1;
+			}
+
+			// Signed range representable in num_delta_bits
+			const int max_delta = (1 << (num_delta_bits - 1)) - 1;
+			const int min_delta = -(max_delta + 1);
+			assert((max_delta - min_delta) == delta_bitmask);
+
+			// The first endpoint is stored directly, the second as a delta from it masked to the delta width.
+			bool failed_flag = false;
+			for (uint32_t c = 0; c < 3; c++)
+			{
+				log_blk.m_endpoints[c][0] = blog_endpoints[c][0];
+
+				const int delta = (int)blog_endpoints[c][1] - (int)blog_endpoints[c][0];
+				if ((delta < min_delta) || (delta > max_delta))
+				{
+					failed_flag = true;
+					break;
+				}
+
+				log_blk.m_endpoints[c][1] = delta & delta_bitmask;
+			}
+
+			if (!failed_flag)
+			{
+				// All three components fit in this mode: emit the block.
+				log_blk.m_mode = mode;
+				pack_bc6h_block(*pPacked_block, log_blk);
+				return;
+			}
+		}
+
+		// Worst case fall back to mode 10, which can handle any endpoints
+		bc6h_enc_block_mode10(pPacked_block, pEndpoints, pWeights);
+	}
+
+	// Mode 9 (direct endpoint encoding), 3-bit weights, but only 1 subset.
+	// Mode 9 is a 2 subset mode, so partition pattern 0 is used and both of its subsets receive the same endpoint pair.
+	void bc6h_enc_block_1subset_mode9_3bit_weights(bc6h_block* pPacked_block, const half_float pEndpoints[3][2], const uint8_t* pWeights)
+	{
+		assert(g_bc6h_enc_initialized);
+
+		for (uint32_t i = 0; i < 16; i++)
+		{
+			assert(pWeights[i] <= 7);
+		}
+
+		const uint32_t pat_index = 0;
+		const uint8_t* pPat = &g_bc6h_2subset_patterns[pat_index][0][0];
+
+		// A subset is flipped (endpoints exchanged, weights inverted) when the MSB of its first weight is set.
+		// In pattern 0 those weights are texel 0 (subset 0) and texel 15 (subset 1).
+		const bool flip_subset[2] =
+		{
+			(pWeights[0] & 4) != 0,
+			(pWeights[15] & 4) != 0
+		};
+
+		bc6h_logical_block log_blk;
+		log_blk.clear();
+
+		log_blk.m_mode = 9;
+		log_blk.m_partition_pattern = pat_index;
+
+		// Convert half endpoints to blog6 (mode 9 doesn't use delta encoding)
+		for (uint32_t c = 0; c < 3; c++)
+		{
+			const uint32_t q_low = half_to_blog_tab(pEndpoints[c][0], 6);
+			const uint32_t q_high = half_to_blog_tab(pEndpoints[c][1], 6);
+
+			for (uint32_t s = 0; s < 2; s++)
+			{
+				log_blk.m_endpoints[c][s * 2 + 0] = flip_subset[s] ? q_high : q_low;
+				log_blk.m_endpoints[c][s * 2 + 1] = flip_subset[s] ? q_low : q_high;
+			}
+		}
+
+		// Copy the weights, inverting those that belong to a flipped subset.
+		for (uint32_t i = 0; i < 16; i++)
+		{
+			const uint32_t subset = pPat[i] & 0x7F;
+			const bool invert = ((subset == 0) && flip_subset[0]) || ((subset == 1) && flip_subset[1]);
+
+			log_blk.m_weights[i] = invert ? (uint8_t)(7 - pWeights[i]) : pWeights[i];
+		}
+
+		pack_bc6h_block(*pPacked_block, log_blk);
+	}
+
+	// Packs a 1-subset block with 3-bit weights. Only 2-subset modes have 3-bit weights, so partition pattern 0 is used with
+	// the same endpoint pair in both parts. Tries modes 0-8, falls back to mode 9
+	void bc6h_enc_block_1subset_3bit_weights(bc6h_block* pPacked_block, const half_float pEndpoints[3][2], const uint8_t* pWeights)
+	{
+		assert(g_bc6h_enc_initialized);
+
+		for (uint32_t i = 0; i < 16; i++)
+		{
+			assert(pWeights[i] <= 7);
+		}
+
+		bc6h_logical_block log_blk;
+		log_blk.clear();
+
+		const uint32_t pat_index = 0;
+		const uint8_t* pPat = &g_bc6h_2subset_patterns[pat_index][0][0];
+
+		// The weight fix-up is the same for every mode, so it is done once here instead of once per mode tried.
+		// A part is flipped when the MSB of its first weight (texel 0 for part 0, texel 15 for part 1 in pattern 0) is set.
+		const bool swap_part0 = (pWeights[0] & 4) != 0;
+		const bool swap_part1 = (pWeights[15] & 4) != 0;
+
+		memcpy(log_blk.m_weights, pWeights, 16);
+		for (uint32_t i = 0; i < 16; i++)
+		{
+			const uint32_t part = pPat[i] & 0x7F;
+			if (((part == 0) && swap_part0) || ((part == 1) && swap_part1))
+				log_blk.m_weights[i] = 7 - log_blk.m_weights[i];
+		}
+
+		static const int s_mode_order[9] = { 2, 3, 4, 0,  5, 6, 7, 8,  1 }; // ordered from largest base bits to least
+
+		for (uint32_t mode_iter = 0; mode_iter <= 8; mode_iter++)
+		{
+			const uint32_t mode = s_mode_order[mode_iter];
+
+			const uint32_t num_base_bits = g_bc6h_mode_sig_bits[mode][0];
+			const int base_bitmask = (1 << num_base_bits) - 1;
+			BASISU_NOTE_UNUSED(base_bitmask);
+
+			const uint32_t num_delta_bits[3] = { g_bc6h_mode_sig_bits[mode][1], g_bc6h_mode_sig_bits[mode][2], g_bc6h_mode_sig_bits[mode][3] };
+			const int delta_bitmasks[3] = { (1 << num_delta_bits[0]) - 1, (1 << num_delta_bits[1]) - 1, (1 << num_delta_bits[2]) - 1 };
+
+			uint32_t blog_endpoints[3][4];
+
+			// Convert half endpoints to blog 7-11
+			for (uint32_t c = 0; c < 3; c++)
+			{
+				blog_endpoints[c][0] = half_to_blog_tab(pEndpoints[c][0], num_base_bits);
+				blog_endpoints[c][2] = blog_endpoints[c][0];
+				assert((int)blog_endpoints[c][0] <= base_bitmask);
+
+				blog_endpoints[c][1] = half_to_blog_tab(pEndpoints[c][1], num_base_bits);
+				blog_endpoints[c][3] = blog_endpoints[c][1];
+				assert((int)blog_endpoints[c][1] <= base_bitmask);
+			}
+
+			if (swap_part0)
+			{
+				// Swap part 0's endpoints (its weights were inverted above)
+				for (uint32_t c = 0; c < 3; c++)
+					std::swap(blog_endpoints[c][0], blog_endpoints[c][1]);
+			}
+
+			if (swap_part1)
+			{
+				// Swap part 1's endpoints (its weights were inverted above)
+				for (uint32_t c = 0; c < 3; c++)
+					std::swap(blog_endpoints[c][2], blog_endpoints[c][3]);
+			}
+
+			// Try packing the endpoints: the first endpoint is stored directly, the other three as deltas from it.
+			bool failed_flag = false;
+			for (uint32_t c = 0; c < 3; c++)
+			{
+				const int max_delta = (1 << (num_delta_bits[c] - 1)) - 1;
+				const int min_delta = -(max_delta + 1);
+				assert((max_delta - min_delta) == delta_bitmasks[c]);
+
+				log_blk.m_endpoints[c][0] = blog_endpoints[c][0];
+
+				int delta0 = (int)blog_endpoints[c][1] - (int)blog_endpoints[c][0];
+				int delta1 = (int)blog_endpoints[c][2] - (int)blog_endpoints[c][0];
+				int delta2 = (int)blog_endpoints[c][3] - (int)blog_endpoints[c][0];
+
+				if ((delta0 < min_delta) || (delta0 > max_delta) ||
+					(delta1 < min_delta) || (delta1 > max_delta) ||
+					(delta2 < min_delta) || (delta2 > max_delta))
+				{
+					failed_flag = true;
+					break;
+				}
+
+				log_blk.m_endpoints[c][1] = delta0 & delta_bitmasks[c];
+				log_blk.m_endpoints[c][2] = delta1 & delta_bitmasks[c];
+				log_blk.m_endpoints[c][3] = delta2 & delta_bitmasks[c];
+			}
+
+			if (failed_flag)
+				continue;
+
+			log_blk.m_mode = mode;
+			log_blk.m_partition_pattern = pat_index;
+			pack_bc6h_block(*pPacked_block, log_blk);
+			return;
+		} // mode_iter
+
+		bc6h_enc_block_1subset_mode9_3bit_weights(pPacked_block, pEndpoints, pWeights);
+	}
+
+	// Packs a 2-subset block as mode 9 (endpoints quantized to 6 bits and stored directly, no deltas)
+	// with 3-bit weights.
+	// common_part_index selects an entry of g_astc_bc7_common_partitions2, which supplies the
+	// pattern index and an invert flag.
+	// pEndpoints[subset][comp][lh_index]
+	void bc6h_enc_block_2subset_mode9_3bit_weights(bc6h_block* pPacked_block, uint32_t common_part_index, const half_float pEndpoints[2][3][2], const uint8_t* pWeights)
+	{
+		assert(g_bc6h_enc_initialized);
+		assert(common_part_index < basist::TOTAL_ASTC_BC7_COMMON_PARTITIONS2);
+
+		// Weights must fit in 3 bits
+		for (uint32_t i = 0; i < 16; i++)
+		{
+			assert(pWeights[i] <= 7);
+		}
+
+		//const uint32_t astc_pattern = basist::g_astc_bc7_common_partitions2[common_part_index].m_astc;
+		const uint32_t bc7_pattern = basist::g_astc_bc7_common_partitions2[common_part_index].m_bc7;
+
+		// When set, the caller's subsets 0 and 1 correspond to the pattern's subsets 1 and 0.
+		const bool invert_flag = basist::g_astc_bc7_common_partitions2[common_part_index].m_invert;
+
+		const uint32_t pat_index = bc7_pattern;
+		assert(pat_index < 32);
+		const uint8_t* pPat = &g_bc6h_2subset_patterns[pat_index][0][0];
+
+		// Texels flagged with bit 7 in the pattern store their weight with 1 less bit. If such a weight has
+		// its MSB set, the subset it belongs to must be flipped (endpoints exchanged, weights inverted).
+		bool swap_flags[2] = { false, false };
+		for (uint32_t i = 0; i < 16; i++)
+		{
+			const bool is_flagged = (pPat[i] & 0x80) != 0;
+
+			if (is_flagged && (pWeights[i] & 4))
+			{
+				swap_flags[pPat[i] & 1] = true;
+			}
+		}
+
+		// Start from a cleared logical block
+		bc6h_logical_block log_blk;
+		log_blk.clear();
+
+		log_blk.m_mode = 9;
+		log_blk.m_partition_pattern = pat_index;
+
+		// Convert half endpoints to blog6 (mode 9 doesn't use delta encoding)
+		for (uint32_t s = 0; s < 2; s++)
+		{
+			// Pattern subset that receives the caller's subset s
+			const uint32_t dst_subset = invert_flag ? (1 - s) : s;
+
+			// A flipped subset stores its two endpoints in exchanged slots
+			uint32_t low_slot = dst_subset * 2;
+			uint32_t high_slot = low_slot + 1;
+			if (swap_flags[dst_subset])
+				std::swap(low_slot, high_slot);
+
+			for (uint32_t c = 0; c < 3; c++)
+			{
+				log_blk.m_endpoints[c][low_slot] = half_to_blog_tab(pEndpoints[s][c][0], 6);
+				log_blk.m_endpoints[c][high_slot] = half_to_blog_tab(pEndpoints[s][c][1], 6);
+			}
+		}
+
+		// Copy the weights, inverting those that belong to a flipped subset
+		for (uint32_t i = 0; i < 16; i++)
+		{
+			const uint32_t subset = pPat[i] & 0x7F;
+			const bool invert = ((subset == 0) && swap_flags[0]) || ((subset == 1) && swap_flags[1]);
+
+			uint8_t w = pWeights[i];
+			if (invert)
+				w = 7 - w;
+			log_blk.m_weights[i] = w;
+		}
+
+		// Hand the finished logical block to the bit packer
+		pack_bc6h_block(*pPacked_block, log_blk);
+	}
+
+	// Packs a 2-subset block with 3-bit weights. Tries the delta-encoded modes 0-8 from largest base bits to least and
+	// falls back to mode 9 when no mode can hold the endpoint deltas. pEndpoints[subset][comp][lh_index]
+	void bc6h_enc_block_2subset_3bit_weights(bc6h_block* pPacked_block, uint32_t common_part_index, const half_float pEndpoints[2][3][2], const uint8_t* pWeights)
+	{
+		assert(g_bc6h_enc_initialized);
+		assert(common_part_index < basist::TOTAL_ASTC_BC7_COMMON_PARTITIONS2); // the table below is indexed with it
+
+		for (uint32_t i = 0; i < 16; i++)
+		{
+			assert(pWeights[i] <= 7);
+		}
+
+		bc6h_logical_block log_blk;
+		log_blk.clear();
+
+		//const uint32_t astc_pattern = basist::g_astc_bc7_common_partitions2[common_part_index].m_astc;
+		const uint32_t bc7_pattern = basist::g_astc_bc7_common_partitions2[common_part_index].m_bc7;
+
+		// When set, the caller's subsets 0 and 1 correspond to the pattern's subsets 1 and 0.
+		const bool invert_flag = basist::g_astc_bc7_common_partitions2[common_part_index].m_invert;
+
+		const uint32_t pat_index = bc7_pattern;
+		assert(pat_index < 32);
+		const uint8_t* pPat = &g_bc6h_2subset_patterns[pat_index][0][0];
+
+		memcpy(log_blk.m_weights, pWeights, 16);
+
+		// Mode independent, so done once: a subset is flipped when its bit 7 flagged weight (stored with 1 less bit) has its MSB set.
+		bool swap_flags[2] = { false, false };
+		for (uint32_t i = 0; i < 16; i++)
+		{
+			if ((pPat[i] & 0x80) == 0)
+				continue;
+
+			if (log_blk.m_weights[i] & 4)
+			{
+				const uint32_t p = pPat[i] & 1;
+				swap_flags[p] = true;
+			}
+		}
+
+		for (uint32_t i = 0; i < 16; i++)
+		{
+			const uint32_t part = pPat[i] & 0x7F;
+			if (((part == 0) && swap_flags[0]) || ((part == 1) && swap_flags[1]))
+				log_blk.m_weights[i] = 7 - log_blk.m_weights[i];
+		}
+
+		static const int s_mode_order[9] = { 2, 3, 4, 0,  5, 6, 7, 8,  1 }; // ordered from largest base bits to least
+
+		for (uint32_t mode_iter = 0; mode_iter <= 8; mode_iter++)
+		{
+			const uint32_t mode = s_mode_order[mode_iter];
+
+			const uint32_t num_base_bits = g_bc6h_mode_sig_bits[mode][0];
+			const int base_bitmask = (1 << num_base_bits) - 1;
+			BASISU_NOTE_UNUSED(base_bitmask);
+
+			const uint32_t num_delta_bits[3] = { g_bc6h_mode_sig_bits[mode][1], g_bc6h_mode_sig_bits[mode][2], g_bc6h_mode_sig_bits[mode][3] };
+			const int delta_bitmasks[3] = { (1 << num_delta_bits[0]) - 1, (1 << num_delta_bits[1]) - 1, (1 << num_delta_bits[2]) - 1 };
+
+			uint32_t blog_endpoints[3][4];
+
+			// Convert half endpoints to blog 7-11
+			for (uint32_t s = 0; s < 2; s++)
+			{
+				for (uint32_t c = 0; c < 3; c++)
+				{
+					blog_endpoints[c][0 + s * 2] = half_to_blog_tab(pEndpoints[s][c][0], num_base_bits);
+					blog_endpoints[c][1 + s * 2] = half_to_blog_tab(pEndpoints[s][c][1], num_base_bits);
+				}
+			}
+
+			if (invert_flag)
+			{
+				for (uint32_t c = 0; c < 3; c++)
+				{
+					std::swap(blog_endpoints[c][0], blog_endpoints[c][2]);
+					std::swap(blog_endpoints[c][1], blog_endpoints[c][3]);
+				}
+			}
+
+			if (swap_flags[0])
+			{
+				for (uint32_t c = 0; c < 3; c++)
+					std::swap(blog_endpoints[c][0], blog_endpoints[c][1]);
+			}
+
+			if (swap_flags[1])
+			{
+				for (uint32_t c = 0; c < 3; c++)
+					std::swap(blog_endpoints[c][2], blog_endpoints[c][3]);
+			}
+
+			// Try packing the endpoints
+			bool failed_flag = false;
+			for (uint32_t c = 0; c < 3; c++)
+			{
+				const int max_delta = (1 << (num_delta_bits[c] - 1)) - 1;
+				const int min_delta = -(max_delta + 1);
+				assert((max_delta - min_delta) == delta_bitmasks[c]);
+
+				log_blk.m_endpoints[c][0] = blog_endpoints[c][0];
+
+				int delta0 = (int)blog_endpoints[c][1] - (int)blog_endpoints[c][0];
+				int delta1 = (int)blog_endpoints[c][2] - (int)blog_endpoints[c][0];
+				int delta2 = (int)blog_endpoints[c][3] - (int)blog_endpoints[c][0];
+
+				if ((delta0 < min_delta) || (delta0 > max_delta) ||
+					(delta1 < min_delta) || (delta1 > max_delta) ||
+					(delta2 < min_delta) || (delta2 > max_delta))
+				{
+					failed_flag = true;
+					break;
+				}
+
+				log_blk.m_endpoints[c][1] = delta0 & delta_bitmasks[c];
+				log_blk.m_endpoints[c][2] = delta1 & delta_bitmasks[c];
+				log_blk.m_endpoints[c][3] = delta2 & delta_bitmasks[c];
+			}
+			if (failed_flag)
+				continue;
+
+			log_blk.m_mode = mode;
+			log_blk.m_partition_pattern = pat_index;
+			pack_bc6h_block(*pPacked_block, log_blk);
+
+			//half_float blk[16 * 3];
+			//unpack_bc6h(pPacked_block, blk, false);
+			return;
+		}
+
+		bc6h_enc_block_2subset_mode9_3bit_weights(pPacked_block, common_part_index, pEndpoints, pWeights);
+	}
+
+	bool bc6h_enc_block_solid_color(bc6h_block* pPacked_block, const half_float pColor[3]) // Packs one constant color into *pPacked_block; returns false, writing nothing, if any component has bit 15 set.
+	{
+		assert(g_bc6h_enc_initialized);
+		// Reject the color if bit 15 (0x8000) is set in any component (presumably the half-float sign bit, i.e. a negative value - confirm).
+		if ((pColor[0] | pColor[1] | pColor[2]) & 0x8000)
+			return false;
+
+		// ASTC block unpacker won't allow Inf/NaN's to come through.
+		//if (is_half_inf_or_nan(pColor[0]) || is_half_inf_or_nan(pColor[1]) || is_half_inf_or_nan(pColor[2]))
+		//	return false;
+
+		uint8_t weights[16]; // every weight is 0
+		memset(weights, 0, sizeof(weights));
+
+		half_float endpoints[3][2]; // [component][endpoint]; both endpoints of each component are set to the same value below
+		endpoints[0][0] = pColor[0];
+		endpoints[0][1] = pColor[0];
+				
+		endpoints[1][0] = pColor[1];
+		endpoints[1][1] = pColor[1];
+
+		endpoints[2][0] = pColor[2];
+		endpoints[2][1] = pColor[2];
+				
+		bc6h_enc_block_1subset_4bit_weights(pPacked_block, endpoints, weights); // the actual packing is delegated to the 1-subset, 4-bit-weight encoder
+
+		return true;
+	}
+
+	//--------------------------------------------------------------------------------------------------------------------------
+	// basisu_astc_hdr_core.cpp
+
+	static bool g_astc_hdr_core_initialized; // Set to true at the end of astc_hdr_core_init(); asserted/checked by the decode and transcode functions below.
+	static int8_t g_astc_partition_id_to_common_bc7_pat_index[1024]; // Indexed by ASTC partition id; holds an index into basist::g_astc_bc7_common_partitions2, or -1 when the id has no entry.
+
+	//--------------------------------------------------------------------------------------------------------------------------
+
+	void astc_hdr_core_init() // Builds g_astc_partition_id_to_common_bc7_pat_index once; calls after the first return immediately.
+	{
+		if (g_astc_hdr_core_initialized)
+			return;
+		// 0xFF in every byte reads back as -1 through int8_t, the "no entry" value that lookups test for (index < 0).
+		memset(g_astc_partition_id_to_common_bc7_pat_index, 0xFF, sizeof(g_astc_partition_id_to_common_bc7_pat_index));
+		// For each entry of the common-partition table, map its ASTC pattern id back to the entry's index.
+		for (uint32_t part_index = 0; part_index < basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2; ++part_index)
+		{
+			const uint32_t astc_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_astc;
+			//const uint32_t bc7_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_bc7;
+
+			assert(astc_pattern < 1024);
+			g_astc_partition_id_to_common_bc7_pat_index[astc_pattern] = (int8_t)part_index; // NOTE(review): the int8_t cast assumes the table has at most 127 entries - confirm
+		}
+
+		g_astc_hdr_core_initialized = true;
+	}
+
+	//--------------------------------------------------------------------------------------------------------------------------
+
+	static inline int astc_hdr_sign_extend(int src, int num_src_bits) // Treats bit (num_src_bits - 1) of src as a sign bit and extends it through the higher bits of the result.
+	{
+		assert(basisu::in_range(num_src_bits, 2, 31));
+
+		const bool negative = (src & (1 << (num_src_bits - 1))) != 0;
+		if (negative)
+			return src | ~((1 << num_src_bits) - 1); // sign bit set: force every bit at or above num_src_bits to 1
+		else
+			return src & ((1 << num_src_bits) - 1); // sign bit clear: keep only the low num_src_bits bits
+	}
+
+	static inline void astc_hdr_pack_bit( // ORs one bit taken from src_val into dst; bits already set in dst are never cleared.
+		int& dst, int dst_bit, // dst: accumulator; dst_bit: destination bit position (asserted to be 0..31)
+		int src_val, int src_bit = 0) // src_bit: which bit of src_val is read through basisu::get_bit
+	{
+		assert(dst_bit >= 0 && dst_bit <= 31);
+		int bit = basisu::get_bit(src_val, src_bit); // presumably 0 or 1 - get_bit is defined elsewhere
+		dst |= (bit << dst_bit);
+	}
+
+	//--------------------------------------------------------------------------------------------------------------------------
+
+	void decode_mode7_to_qlog12_ise20( // Decodes four 8-bit mode 7 endpoint values into two 12-bit RGB endpoints.
+		const uint8_t* pEndpoints, // in: pEndpoints[0..3] are decoded (named v0..v3 below)
+		int e[2][3], // out: e[1] = decoded color, e[0] = decoded color minus scale; every component clamped to [0, 0xFFF]
+		int* pScale) // out, optional: receives the shifted scale value when non-null
+	{
+		assert(g_astc_hdr_core_initialized);
+
+		for (uint32_t i = 0; i < NUM_MODE7_ENDPOINTS; i++)
+		{
+			assert(pEndpoints[i] <= 255); // cannot fail for a uint8_t; sanity check only
+		}
+
+		const int v0 = pEndpoints[0], v1 = pEndpoints[1], v2 = pEndpoints[2], v3 = pEndpoints[3];
+
+		// Extract mode bits and unpack to major component and mode.
+		const int modeval = ((v0 & 0xC0) >> 6) | ((v1 & 0x80) >> 5) | ((v2 & 0x80) >> 4); // 4-bit value: v0 bits 7:6, v1 bit 7, v2 bit 7
+
+		int majcomp, mode;
+		if ((modeval & 0xC) != 0xC) // top two bits are not both set: they give the major component, the low two bits give mode 0..3
+		{
+			majcomp = modeval >> 2;
+			mode = modeval & 3;
+		}
+		else if (modeval != 0xF) // 1100b..1110b: mode 4, low two bits give the major component
+		{
+			majcomp = modeval & 3;
+			mode = 4;
+		}
+		else // 1111b: mode 5, major component 0
+		{
+			majcomp = 0;
+			mode = 5;
+		}
+
+		// Extract low-order bits of r, g, b, and s.
+		int red = v0 & 0x3f;
+		int green = v1 & 0x1f;
+		int blue = v2 & 0x1f;
+		int scale = v3 & 0x1f;
+
+		// Extract high-order bits, which may be assigned depending on mode
+		int x0 = (v1 >> 6) & 1;
+		int x1 = (v1 >> 5) & 1;
+		int x2 = (v2 >> 6) & 1;
+		int x3 = (v2 >> 5) & 1;
+		int x4 = (v3 >> 7) & 1;
+		int x5 = (v3 >> 6) & 1;
+		int x6 = (v3 >> 5) & 1;
+
+		// Now move the high-order xs into the right place.
+		const int ohm = 1 << mode; // one-hot mask of the mode (0..5); each test below lists the modes a bit placement applies to
+		if (ohm & 0x30) green |= x0 << 6;
+		if (ohm & 0x3A) green |= x1 << 5;
+		if (ohm & 0x30) blue |= x2 << 6;
+		if (ohm & 0x3A) blue |= x3 << 5;
+		if (ohm & 0x3D) scale |= x6 << 5;
+		if (ohm & 0x2D) scale |= x5 << 6;
+		if (ohm & 0x04) scale |= x4 << 7;
+		if (ohm & 0x3B) red |= x4 << 6;
+		if (ohm & 0x04) red |= x3 << 6;
+		if (ohm & 0x10) red |= x5 << 7;
+		if (ohm & 0x0F) red |= x2 << 7;
+		if (ohm & 0x05) red |= x1 << 8;
+		if (ohm & 0x0A) red |= x0 << 8;
+		if (ohm & 0x05) red |= x0 << 9;
+		if (ohm & 0x02) red |= x6 << 9;
+		if (ohm & 0x01) red |= x3 << 10;
+		if (ohm & 0x02) red |= x5 << 10;
+
+		// Shift the bits to the top of the 12-bit result.
+		static const int s_shamts[6] = { 1,1,2,3,4,5 }; // left-shift amount, indexed by mode
+
+		const int shamt = s_shamts[mode];
+		red <<= shamt;
+		green <<= shamt;
+		blue <<= shamt;
+		scale <<= shamt;
+
+		// Minor components are stored as differences
+		if (mode != 5) // in mode 5 green and blue are used as stored, not as offsets from red
+		{
+			green = red - green;
+			blue = red - blue;
+		}
+
+		// Swizzle major component into place
+		if (majcomp == 1)
+			std::swap(red, green);
+
+		if (majcomp == 2)
+			std::swap(red, blue);
+
+		// Clamp output values to [0, 0xFFF]. (No alpha is written by this function.)
+		e[1][0] = basisu::clamp(red, 0, 0xFFF);
+		e[1][1] = basisu::clamp(green, 0, 0xFFF);
+		e[1][2] = basisu::clamp(blue, 0, 0xFFF);
+
+		e[0][0] = basisu::clamp(red - scale, 0, 0xFFF);
+		e[0][1] = basisu::clamp(green - scale, 0, 0xFFF);
+		e[0][2] = basisu::clamp(blue - scale, 0, 0xFFF);
+
+		if (pScale)
+			*pScale = scale;
+	}
+
+	//--------------------------------------------------------------------------------------------------------------------------
+
+	bool decode_mode7_to_qlog12( // Decodes mode 7 endpoints stored at the given ISE endpoint range; returns false if any decoded component exceeds MAX_QLOG12.
+		const uint8_t* pEndpoints, // in: NUM_MODE7_ENDPOINTS values; used as table indices unless the range is BISE_256_LEVELS
+		int e[2][3], // out: filled by decode_mode7_to_qlog12_ise20(); written even when false is returned
+		int* pScale, // out, optional: passed through to decode_mode7_to_qlog12_ise20()
+		uint32_t ise_endpoint_range) // in: ISE range the endpoint values are stored in
+	{
+		assert(g_astc_hdr_core_initialized);
+
+		if (ise_endpoint_range == astc_helpers::BISE_256_LEVELS)
+		{
+			decode_mode7_to_qlog12_ise20(pEndpoints, e, pScale); // values are decoded as-is, no table lookup
+		}
+		else
+		{
+			uint8_t dequantized_endpoints[NUM_MODE7_ENDPOINTS];
+			// Map each stored value through this range's m_ISE_to_val table before decoding.
+			for (uint32_t i = 0; i < NUM_MODE7_ENDPOINTS; i++)
+				dequantized_endpoints[i] = astc_helpers::g_dequant_tables.get_endpoint_tab(ise_endpoint_range).m_ISE_to_val[pEndpoints[i]];
+
+			decode_mode7_to_qlog12_ise20(dequantized_endpoints, e, pScale);
+		}
+		// Reject the result if any of the six components is above MAX_QLOG12.
+		for (uint32_t i = 0; i < 2; i++)
+		{
+			if (e[i][0] > (int)MAX_QLOG12)
+				return false;
+
+			if (e[i][1] > (int)MAX_QLOG12)
+				return false;
+
+			if (e[i][2] > (int)MAX_QLOG12)
+				return false;
+		}
+
+		return true;
+	}
+
+	//--------------------------------------------------------------------------------------------------------------------------
+
+	void decode_mode11_to_qlog12_ise20( // Decodes six 8-bit mode 11 endpoint values into two 12-bit RGB endpoints.
+		const uint8_t* pEndpoints, // in: pEndpoints[0..5]
+		int e[2][3]) // out: e[0] and e[1], three components each
+	{
+#ifdef _DEBUG
+		for (uint32_t i = 0; i < NUM_MODE11_ENDPOINTS; i++)
+		{
+			assert(pEndpoints[i] <= 255); // cannot fail for a uint8_t; sanity check only
+		}
+#endif
+
+		const uint32_t maj_comp = basisu::get_bit(pEndpoints[4], 7) | (basisu::get_bit(pEndpoints[5], 7) << 1); // 2-bit value: bit 7 of pEndpoints[4] (low) and of pEndpoints[5] (high)
+
+		if (maj_comp == 3)
+		{
+			// Direct, qlog8 and qlog7
+			e[0][0] = pEndpoints[0] << 4;
+			e[1][0] = pEndpoints[1] << 4;
+
+			e[0][1] = pEndpoints[2] << 4;
+			e[1][1] = pEndpoints[3] << 4;
+
+			e[0][2] = (pEndpoints[4] & 127) << 5;
+			e[1][2] = (pEndpoints[5] & 127) << 5;
+		}
+		else
+		{
+			int v0 = pEndpoints[0];
+			int v1 = pEndpoints[1];
+			int v2 = pEndpoints[2];
+			int v3 = pEndpoints[3];
+			int v4 = pEndpoints[4];
+			int v5 = pEndpoints[5];
+
+			int mode = 0; // 3-bit mode assembled from bit 7 of v1, v2 and v3
+			astc_hdr_pack_bit(mode, 0, v1, 7);
+			astc_hdr_pack_bit(mode, 1, v2, 7);
+			astc_hdr_pack_bit(mode, 2, v3, 7);
+
+			int va = v0;
+			astc_hdr_pack_bit(va, 8, v1, 6);
+
+			int vb0 = v2 & 63;
+			int vb1 = v3 & 63;
+			int vc = v1 & 63;
+
+			int vd0 = v4 & 0x7F; // this takes more bits than is sometimes needed
+			int vd1 = v5 & 0x7F; // this takes more bits than is sometimes needed
+			static const int8_t dbitstab[8] = { 7,6,7,6,5,6,5,6 }; // width in bits of vd0/vd1 for each mode, used for the sign extension below
+			vd0 = astc_hdr_sign_extend(vd0, dbitstab[mode]);
+			vd1 = astc_hdr_sign_extend(vd1, dbitstab[mode]);
+
+			int x0 = basisu::get_bit(v2, 6);
+			int x1 = basisu::get_bit(v3, 6);
+			int x2 = basisu::get_bit(v4, 6);
+			int x3 = basisu::get_bit(v5, 6);
+			int x4 = basisu::get_bit(v4, 5);
+			int x5 = basisu::get_bit(v5, 5);
+
+			const uint32_t ohm = 1U << mode; // one-hot mask of the mode (0..7); each test below lists the modes a bit placement applies to
+			if (ohm & 0xA4) va |= (x0 << 9);
+			if (ohm & 0x08) va |= (x2 << 9);
+			if (ohm & 0x50) va |= (x4 << 9);
+			if (ohm & 0x50) va |= (x5 << 10);
+			if (ohm & 0xA0) va |= (x1 << 10);
+			if (ohm & 0xC0) va |= (x2 << 11);
+			if (ohm & 0x04) vc |= (x1 << 6);
+			if (ohm & 0xE8) vc |= (x3 << 6);
+			if (ohm & 0x20) vc |= (x2 << 7);
+			if (ohm & 0x5B) vb0 |= (x0 << 6);
+			if (ohm & 0x5B) vb1 |= (x1 << 6);
+			if (ohm & 0x12) vb0 |= (x2 << 7);
+			if (ohm & 0x12) vb1 |= (x3 << 7);
+
+			const int shamt = (mode >> 1) ^ 3; // 3, 2, 1 or 0 for mode pairs (0,1), (2,3), (4,5), (6,7)
+			// Shift through uint32_t: vd0/vd1 can be negative after sign extension, and the casts avoid left-shifting a negative int.
+			va  = (uint32_t)va  << shamt;
+			vb0 = (uint32_t)vb0 << shamt;
+			vb1 = (uint32_t)vb1 << shamt;
+			vc  = (uint32_t)vc  << shamt;
+			vd0 = (uint32_t)vd0 << shamt;
+			vd1 = (uint32_t)vd1 << shamt;
+
+			// qlog12
+			e[1][0] = basisu::clamp<int>(va, 0, 0xFFF);
+			e[1][1] = basisu::clamp<int>(va - vb0, 0, 0xFFF);
+			e[1][2] = basisu::clamp<int>(va - vb1, 0, 0xFFF);
+
+			e[0][0] = basisu::clamp<int>(va - vc, 0, 0xFFF);
+			e[0][1] = basisu::clamp<int>(va - vb0 - vc - vd0, 0, 0xFFF);
+			e[0][2] = basisu::clamp<int>(va - vb1 - vc - vd1, 0, 0xFFF);
+
+			if (maj_comp) // maj_comp is 1 or 2 here: move the value decoded into slot 0 to that component
+			{
+				std::swap(e[0][0], e[0][maj_comp]);
+				std::swap(e[1][0], e[1][maj_comp]);
+			}
+		}
+	}
+
+	//--------------------------------------------------------------------------------------------------------------------------
+
+	bool decode_mode11_to_qlog12( // Decodes mode 11 endpoints stored at the given ISE endpoint range; returns false if any decoded component exceeds MAX_QLOG12.
+		const uint8_t* pEndpoints, // in: NUM_MODE11_ENDPOINTS values; used as table indices unless the range is BISE_256_LEVELS
+		int e[2][3], // out: filled by decode_mode11_to_qlog12_ise20(); written even when false is returned
+		uint32_t ise_endpoint_range) // in: ISE range the endpoint values are stored in (asserted to be a valid endpoint range)
+	{
+		assert(g_astc_hdr_core_initialized);
+		assert((ise_endpoint_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (ise_endpoint_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE));
+
+		if (ise_endpoint_range == astc_helpers::BISE_256_LEVELS)
+		{
+			decode_mode11_to_qlog12_ise20(pEndpoints, e); // values are decoded as-is, no table lookup
+		}
+		else
+		{
+			uint8_t dequantized_endpoints[NUM_MODE11_ENDPOINTS];
+			// Map each stored value through this range's m_ISE_to_val table before decoding.
+			for (uint32_t i = 0; i < NUM_MODE11_ENDPOINTS; i++)
+				dequantized_endpoints[i] = astc_helpers::g_dequant_tables.get_endpoint_tab(ise_endpoint_range).m_ISE_to_val[pEndpoints[i]];
+
+			decode_mode11_to_qlog12_ise20(dequantized_endpoints, e);
+		}
+		// Reject the result if any of the six components is above MAX_QLOG12.
+		for (uint32_t i = 0; i < 2; i++)
+		{
+			if (e[i][0] > (int)MAX_QLOG12)
+				return false;
+
+			if (e[i][1] > (int)MAX_QLOG12)
+				return false;
+
+			if (e[i][2] > (int)MAX_QLOG12)
+				return false;
+		}
+
+		return true;
+	}
+
+	//--------------------------------------------------------------------------------------------------------------------------
+
+	bool transcode_bc6h_1subset(half_float h_e[3][2], const astc_helpers::log_astc_block& best_blk, bc6h_block& transcoded_bc6h_blk) // Packs a BC6H block from the half endpoints h_e and best_blk's 16 weights; returns false only for an unhandled weight ISE range.
+	{
+		assert(g_astc_hdr_core_initialized);
+		assert((best_blk.m_weight_ise_range >= 1) && (best_blk.m_weight_ise_range <= 8));
+		// Weight range 5 goes to the 3-bit-weight encoder; every other handled range is remapped to 4-bit weights (0..15).
+		if (best_blk.m_weight_ise_range == 5)
+		{
+			// Use 3-bit BC6H weights which are a perfect match for 3-bit ASTC weights, but encode 1-subset as 2 equal subsets
+			bc6h_enc_block_1subset_3bit_weights(&transcoded_bc6h_blk, h_e, best_blk.m_weights);
+		}
+		else
+		{
+			uint8_t bc6h_weights[16]; // NOTE(review): the tables below are indexed by best_blk.m_weights[i] without a bounds check - assumes each weight is below its range's level count
+
+			if (best_blk.m_weight_ise_range == 1)
+			{
+				// weight ISE 1: 3 levels
+				static const uint8_t s_astc1_to_bc6h_3[3] = { 0, 8, 15 };
+
+				for (uint32_t i = 0; i < 16; i++)
+					bc6h_weights[i] = s_astc1_to_bc6h_3[best_blk.m_weights[i]];
+			}
+			else if (best_blk.m_weight_ise_range == 2)
+			{
+				// weight ISE 2: 4 levels
+				static const uint8_t s_astc2_to_bc6h_4[4] = { 0, 5, 10, 15 };
+
+				for (uint32_t i = 0; i < 16; i++)
+					bc6h_weights[i] = s_astc2_to_bc6h_4[best_blk.m_weights[i]];
+			}
+			else if (best_blk.m_weight_ise_range == 3)
+			{
+				// weight ISE 3: 5 levels
+				static const uint8_t s_astc3_to_bc6h_4[5] = { 0, 4, 7, 11, 15 };
+
+				for (uint32_t i = 0; i < 16; i++)
+					bc6h_weights[i] = s_astc3_to_bc6h_4[best_blk.m_weights[i]];
+			}
+			else if (best_blk.m_weight_ise_range == 4)
+			{
+				// weight ISE 4: 6 levels
+				static const uint8_t s_astc4_to_bc6h_4[6] = { 0, 15, 3, 12, 6, 9 }; // not monotonic - presumably follows the order ASTC assigns to this range's weight values; confirm
+
+				for (uint32_t i = 0; i < 16; i++)
+					bc6h_weights[i] = s_astc4_to_bc6h_4[best_blk.m_weights[i]];
+			}
+			else if (best_blk.m_weight_ise_range == 6)
+			{
+				// weight ISE 6: 10 levels
+				static const uint8_t s_astc6_to_bc6h_4[10] = { 0, 15, 2, 13, 3, 12, 5, 10, 6, 9 }; // not monotonic - see the note on the 6-level table
+
+				for (uint32_t i = 0; i < 16; i++)
+					bc6h_weights[i] = s_astc6_to_bc6h_4[best_blk.m_weights[i]];
+			}
+			else if (best_blk.m_weight_ise_range == 7)
+			{
+				// weight ISE 7: 12 levels
+				static const uint8_t s_astc7_to_bc6h_4[12] = { 0, 15, 4, 11, 1, 14, 5, 10, 2, 13, 6, 9 }; // not monotonic - see the note on the 6-level table
+
+				for (uint32_t i = 0; i < 16; i++)
+					bc6h_weights[i] = s_astc7_to_bc6h_4[best_blk.m_weights[i]];
+			}
+			else if (best_blk.m_weight_ise_range == 8)
+			{
+				// 16 levels
+				memcpy(bc6h_weights, best_blk.m_weights, 16); // copied unchanged
+			}
+			else
+			{
+				assert(0);
+				return false; // reached only for a weight range outside 1..8 (range 5 is handled above)
+			}
+
+			bc6h_enc_block_1subset_4bit_weights(&transcoded_bc6h_blk, h_e, bc6h_weights);
+		}
+
+		return true;
+	}
+
+	//--------------------------------------------------------------------------------------------------------------------------
+
+	bool transcode_bc6h_2subsets(uint32_t common_part_index, const astc_helpers::log_astc_block& best_blk, bc6h_block& transcoded_bc6h_blk) // Transcodes a 2-partition block to BC6H; returns false for any configuration outside the combinations checked below.
+	{
+		assert(g_astc_hdr_core_initialized);
+		assert(best_blk.m_num_partitions == 2);
+		assert(common_part_index < basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2);
+		
+		half_float bc6h_endpoints[2][3][2]; // [subset][comp][lh_index]
+
+		// UASTC HDR checks
+		// Both CEM's must be equal in 2-subset UASTC HDR.
+		if (best_blk.m_color_endpoint_modes[0] != best_blk.m_color_endpoint_modes[1])
+			return false;
+		if ((best_blk.m_color_endpoint_modes[0] != 7) && (best_blk.m_color_endpoint_modes[0] != 11))
+			return false;
+		// Each CEM accepts only specific (weight ISE range, endpoint ISE range) pairs.
+		if (best_blk.m_color_endpoint_modes[0] == 7)
+		{
+			if (!(((best_blk.m_weight_ise_range == 1) && (best_blk.m_endpoint_ise_range == 20)) ||
+		 		  ((best_blk.m_weight_ise_range == 2) && (best_blk.m_endpoint_ise_range == 20)) ||
+				  ((best_blk.m_weight_ise_range == 3) && (best_blk.m_endpoint_ise_range == 19)) ||
+				  ((best_blk.m_weight_ise_range == 4) && (best_blk.m_endpoint_ise_range == 17)) ||
+				  ((best_blk.m_weight_ise_range == 5) && (best_blk.m_endpoint_ise_range == 15))))
+			{
+				return false;
+			}
+		}
+		else
+		{
+			if (!(((best_blk.m_weight_ise_range == 1) && (best_blk.m_endpoint_ise_range == 14)) ||
+				  ((best_blk.m_weight_ise_range == 2) && (best_blk.m_endpoint_ise_range == 12))))
+			{
+				return false;
+			}
+		}
+		// Decode each subset's endpoints to 12-bit values, then convert them to half floats.
+		for (uint32_t s = 0; s < 2; s++)
+		{
+			int e[2][3];
+			if (best_blk.m_color_endpoint_modes[0] == 7)
+			{
+				bool success = decode_mode7_to_qlog12(best_blk.m_endpoints + s * NUM_MODE7_ENDPOINTS, e, nullptr, best_blk.m_endpoint_ise_range); // subset s's values start s * NUM_MODE7_ENDPOINTS into m_endpoints
+				if (!success)
+					return false;
+			}
+			else
+			{
+				bool success = decode_mode11_to_qlog12(best_blk.m_endpoints + s * NUM_MODE11_ENDPOINTS, e, best_blk.m_endpoint_ise_range); // subset s's values start s * NUM_MODE11_ENDPOINTS into m_endpoints
+				if (!success)
+					return false;
+			}
+
+			for (uint32_t c = 0; c < 3; c++)
+			{
+				bc6h_endpoints[s][c][0] = qlog_to_half_slow(e[0][c], 12);
+				if (is_half_inf_or_nan(bc6h_endpoints[s][c][0]))
+					return false;
+
+				bc6h_endpoints[s][c][1] = qlog_to_half_slow(e[1][c], 12);
+				if (is_half_inf_or_nan(bc6h_endpoints[s][c][1]))
+					return false;
+			}
+		}
+		// Remap the ASTC weights to 3-bit weights (0..7); range 5 is copied unchanged.
+		uint8_t bc6h_weights[16]; // NOTE(review): the tables below are indexed by best_blk.m_weights[i] without a bounds check - assumes each weight is below its range's level count
+		if (best_blk.m_weight_ise_range == 1)
+		{
+			static const uint8_t s_astc1_to_bc6h_3[3] = { 0, 4, 7 };
+
+			for (uint32_t i = 0; i < 16; i++)
+				bc6h_weights[i] = s_astc1_to_bc6h_3[best_blk.m_weights[i]];
+		}
+		else if (best_blk.m_weight_ise_range == 2)
+		{
+			static const uint8_t s_astc2_to_bc6h_3[4] = { 0, 2, 5, 7 };
+
+			for (uint32_t i = 0; i < 16; i++)
+				bc6h_weights[i] = s_astc2_to_bc6h_3[best_blk.m_weights[i]];
+		}
+		else if (best_blk.m_weight_ise_range == 3)
+		{
+			static const uint8_t s_astc3_to_bc6h_3[5] = { 0, 2, 4, 5, 7 };
+
+			for (uint32_t i = 0; i < 16; i++)
+				bc6h_weights[i] = s_astc3_to_bc6h_3[best_blk.m_weights[i]];
+		}
+		else if (best_blk.m_weight_ise_range == 4)
+		{
+			static const uint8_t s_astc4_to_bc6h_3[6] = { 0, 7, 1, 6, 3, 4 }; // not monotonic - presumably follows the order ASTC assigns to this range's weight values; confirm
+
+			for (uint32_t i = 0; i < 16; i++)
+				bc6h_weights[i] = s_astc4_to_bc6h_3[best_blk.m_weights[i]];
+		}
+		else if (best_blk.m_weight_ise_range == 5)
+		{
+			memcpy(bc6h_weights, best_blk.m_weights, 16);
+		}
+		else
+		{
+			assert(0);
+			return false; // not expected: the range checks above only let weight ranges 1..5 through
+		}
+
+		bc6h_enc_block_2subset_3bit_weights(&transcoded_bc6h_blk, common_part_index, bc6h_endpoints, bc6h_weights);
+
+		return true;
+	}
+
+	//--------------------------------------------------------------------------------------------------------------------------
+	// Transcodes a packed UASTC HDR block to BC6H: unpacks src_blk as a 4x4 ASTC block, then forwards to the log_astc_block overload. The block must have been encoded as UASTC HDR, or this returns false.
+	bool astc_hdr_transcode_to_bc6h(const astc_blk& src_blk, bc6h_block& dst_blk)
+	{
+		assert(g_astc_hdr_core_initialized);
+		if (!g_astc_hdr_core_initialized) // still checked when assert() is compiled out, so an uninitialized core yields false
+		{
+			assert(0);
+			return false;
+		}
+
+		astc_helpers::log_astc_block log_blk;
+
+		if (!astc_helpers::unpack_block(&src_blk, log_blk, 4, 4))
+		{
+			// Failed unpacking ASTC data
+			return false;
+		}
+
+		return astc_hdr_transcode_to_bc6h(log_blk, dst_blk);
+	}
+
+	//--------------------------------------------------------------------------------------------------------------------------
+	// Transcodes an unpacked UASTC HDR block (log_astc_block) to BC6H. The block must have been encoded as UASTC HDR; any configuration not accepted below returns false.
+	bool astc_hdr_transcode_to_bc6h(const astc_helpers::log_astc_block& log_blk, bc6h_block& dst_blk)
+	{
+		assert(g_astc_hdr_core_initialized);
+		if (!g_astc_hdr_core_initialized) // still checked when assert() is compiled out, so an uninitialized core yields false
+		{
+			assert(0);
+			return false;
+		}
+				
+		if (log_blk.m_solid_color_flag_ldr)
+		{
+			// Don't support LDR solid colors.
+			return false;
+		}
+
+		if (log_blk.m_solid_color_flag_hdr)
+		{
+			// Solid color HDR block
+			return bc6h_enc_block_solid_color(&dst_blk, log_blk.m_solid_color); // reads the first three entries of m_solid_color
+		}
+
+		// Only support 4x4 grid sizes
+		if ((log_blk.m_grid_width != 4) || (log_blk.m_grid_height != 4))
+			return false;
+				
+		// Don't support dual plane encoding
+		if (log_blk.m_dual_plane)
+			return false;
+
+		if (log_blk.m_num_partitions == 1)
+		{
+			// Handle 1 partition (or subset)
+			
+			// UASTC HDR checks
+			if ((log_blk.m_weight_ise_range < 1) || (log_blk.m_weight_ise_range > 8))
+				return false;
+									
+			int e[2][3]; // decoded 12-bit endpoints, [endpoint][comp]
+			bool success;
+
+			if (log_blk.m_color_endpoint_modes[0] == 7)
+			{
+				if (log_blk.m_endpoint_ise_range != 20) // mode 7 is accepted only with endpoint ISE range 20
+					return false;
+
+				success = decode_mode7_to_qlog12(log_blk.m_endpoints, e, nullptr, log_blk.m_endpoint_ise_range);
+			}
+			else if (log_blk.m_color_endpoint_modes[0] == 11)
+			{
+				// UASTC HDR checks
+				if (log_blk.m_weight_ise_range <= 7) // weight ranges 1..7 require endpoint ISE range 20
+				{
+					if (log_blk.m_endpoint_ise_range != 20)
+						return false;
+				}
+				else if (log_blk.m_endpoint_ise_range != 19) // weight range 8 requires endpoint ISE range 19
+				{
+					return false;
+				}
+
+				success = decode_mode11_to_qlog12(log_blk.m_endpoints, e, log_blk.m_endpoint_ise_range);
+			}
+			else
+			{
+				return false; // only color endpoint modes 7 and 11 are handled
+			}
+
+			if (!success)
+				return false;
+
+			// Transform endpoints to half float
+			half_float h_e[3][2] = // [comp][endpoint]: transposed relative to e[endpoint][comp]
+			{
+				{ qlog_to_half_slow(e[0][0], 12), qlog_to_half_slow(e[1][0], 12) },
+				{ qlog_to_half_slow(e[0][1], 12), qlog_to_half_slow(e[1][1], 12) },
+				{ qlog_to_half_slow(e[0][2], 12), qlog_to_half_slow(e[1][2], 12) }
+			};
+
+			// Sanity check for NaN/Inf
+			for (uint32_t i = 0; i < 2; i++)
+				if (is_half_inf_or_nan(h_e[0][i]) || is_half_inf_or_nan(h_e[1][i]) || is_half_inf_or_nan(h_e[2][i]))
+					return false;
+			
+			// Transcode to bc6h
+			if (!transcode_bc6h_1subset(h_e, log_blk, dst_blk))
+				return false;
+		}
+		else if (log_blk.m_num_partitions == 2)
+		{
+			// Handle 2 partition (or subset)
+			int common_bc7_pat_index = g_astc_partition_id_to_common_bc7_pat_index[log_blk.m_partition_id]; // NOTE(review): unchecked index into a 1024-entry table - assumes m_partition_id < 1024
+			if (common_bc7_pat_index < 0) // -1: this ASTC partition id has no entry in the common-partition table
+				return false;
+
+			assert(common_bc7_pat_index < (int)basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2);
+						
+			if (!transcode_bc6h_2subsets(common_bc7_pat_index, log_blk, dst_blk))
+				return false;
+		}
+		else
+		{
+			// Only supports 1 or 2 partitions (or subsets)
+			return false;
+		}
+
+		return true;
+	}
+#endif // BASISD_SUPPORT_UASTC_HDR
+
 } // namespace basist
diff --git a/thirdparty/basis_universal/transcoder/basisu_transcoder.h b/thirdparty/basis_universal/transcoder/basisu_transcoder.h
index 3327e8ddb732..8324e996989c 100644
--- a/thirdparty/basis_universal/transcoder/basisu_transcoder.h
+++ b/thirdparty/basis_universal/transcoder/basisu_transcoder.h
@@ -1,5 +1,5 @@
 // basisu_transcoder.h
-// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 // Important: If compiling with gcc, be sure strict aliasing is disabled: -fno-strict-aliasing
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
@@ -29,6 +29,7 @@
 
 // Set BASISU_FORCE_DEVEL_MESSAGES to 1 to enable debug printf()'s whenever an error occurs, for easier debugging during development.
 #ifndef BASISU_FORCE_DEVEL_MESSAGES
+	// TODO - disable before checking in
 	#define BASISU_FORCE_DEVEL_MESSAGES 0
 #endif
 
@@ -55,7 +56,7 @@ namespace basist
 		cTFETC2_RGBA = 1,							// Opaque+alpha, ETC2_EAC_A8 block followed by a ETC1 block, alpha channel will be opaque for opaque .basis files
 
 		// BC1-5, BC7 (desktop, some mobile devices)
-		cTFBC1_RGB = 2,							// Opaque only, no punchthrough alpha support yet, transcodes alpha slice if cDecodeFlagsTranscodeAlphaDataToOpaqueFormats flag is specified
+		cTFBC1_RGB = 2,								// Opaque only, no punchthrough alpha support yet, transcodes alpha slice if cDecodeFlagsTranscodeAlphaDataToOpaqueFormats flag is specified
 		cTFBC3_RGBA = 3, 							// Opaque+alpha, BC4 followed by a BC1 block, alpha channel will be opaque for opaque .basis files
 		cTFBC4_R = 4,								// Red only, alpha slice is transcoded to output if cDecodeFlagsTranscodeAlphaDataToOpaqueFormats flag is specified
 		cTFBC5_RG = 5,								// XY: Two BC4 blocks, X=R and Y=Alpha, .basis file should have alpha data (if not Y will be all 255's)
@@ -63,10 +64,11 @@ namespace basist
 
 		// PVRTC1 4bpp (mobile, PowerVR devices)
 		cTFPVRTC1_4_RGB = 8,						// Opaque only, RGB or alpha if cDecodeFlagsTranscodeAlphaDataToOpaqueFormats flag is specified, nearly lowest quality of any texture format.
-		cTFPVRTC1_4_RGBA = 9,					// Opaque+alpha, most useful for simple opacity maps. If .basis file doesn't have alpha cTFPVRTC1_4_RGB will be used instead. Lowest quality of any supported texture format.
+		cTFPVRTC1_4_RGBA = 9,						// Opaque+alpha, most useful for simple opacity maps. If .basis file doesn't have alpha cTFPVRTC1_4_RGB will be used instead. Lowest quality of any supported texture format.
 
 		// ASTC (mobile, Intel devices, hopefully all desktop GPU's one day)
-		cTFASTC_4x4_RGBA = 10,					// Opaque+alpha, ASTC 4x4, alpha channel will be opaque for opaque .basis files. Transcoder uses RGB/RGBA/L/LA modes, void extent, and up to two ([0,47] and [0,255]) endpoint precisions.
+		cTFASTC_4x4_RGBA = 10,						// LDR. Opaque+alpha, ASTC 4x4, alpha channel will be opaque for opaque .basis files. 
+													// LDR: Transcoder uses RGB/RGBA/L/LA modes, void extent, and up to two ([0,47] and [0,255]) endpoint precisions.
 
 		// ATC (mobile, Adreno devices, this is a niche format)
 		cTFATC_RGB = 11,							// Opaque, RGB or alpha if cDecodeFlagsTranscodeAlphaDataToOpaqueFormats flag is specified. ATI ATC (GL_ATC_RGB_AMD)
@@ -74,8 +76,8 @@ namespace basist
 
 		// FXT1 (desktop, Intel devices, this is a super obscure format)
 		cTFFXT1_RGB = 17,							// Opaque only, uses exclusively CC_MIXED blocks. Notable for having a 8x4 block size. GL_3DFX_texture_compression_FXT1 is supported on Intel integrated GPU's (such as HD 630).
-														// Punch-through alpha is relatively easy to support, but full alpha is harder. This format is only here for completeness so opaque-only is fine for now.
-														// See the BASISU_USE_ORIGINAL_3DFX_FXT1_ENCODING macro in basisu_transcoder_internal.h.
+													// Punch-through alpha is relatively easy to support, but full alpha is harder. This format is only here for completeness so opaque-only is fine for now.
+													// See the BASISU_USE_ORIGINAL_3DFX_FXT1_ENCODING macro in basisu_transcoder_internal.h.
 
 		cTFPVRTC2_4_RGB = 18,					// Opaque-only, almost BC1 quality, much faster to transcode and supports arbitrary texture dimensions (unlike PVRTC1 RGB).
 		cTFPVRTC2_4_RGBA = 19,					// Opaque+alpha, slower to encode than cTFPVRTC2_4_RGB. Premultiplied alpha is highly recommended, otherwise the color channel can leak into the alpha channel on transparent blocks.
@@ -83,13 +85,22 @@ namespace basist
 		cTFETC2_EAC_R11 = 20,					// R only (ETC2 EAC R11 unsigned)
 		cTFETC2_EAC_RG11 = 21,					// RG only (ETC2 EAC RG11 unsigned), R=opaque.r, G=alpha - for tangent space normal maps
 
+		cTFBC6H = 22,							// HDR, RGB only, unsigned
+		cTFASTC_HDR_4x4_RGBA = 23,				// HDR, RGBA (currently UASTC HDR is only RGB), unsigned
+
 		// Uncompressed (raw pixel) formats
+		// Note these uncompressed formats (RGBA32, 565, and 4444) can only be transcoded to from LDR input files (ETC1S or UASTC LDR).
 		cTFRGBA32 = 13,							// 32bpp RGBA image stored in raster (not block) order in memory, R is first byte, A is last byte.
 		cTFRGB565 = 14,							// 16bpp RGB image stored in raster (not block) order in memory, R at bit position 11
 		cTFBGR565 = 15,							// 16bpp RGB image stored in raster (not block) order in memory, R at bit position 0
-		cTFRGBA4444 = 16,							// 16bpp RGBA image stored in raster (not block) order in memory, R at bit position 12, A at bit position 0
+		cTFRGBA4444 = 16,						// 16bpp RGBA image stored in raster (not block) order in memory, R at bit position 12, A at bit position 0
+		
+		// Note these uncompressed formats (HALF and 9E5) can only be transcoded to from HDR input files (UASTC HDR).
+		cTFRGB_HALF = 24,						// 48bpp RGB half (16-bits/component, 3 components)
+		cTFRGBA_HALF = 25,						// 64bpp RGBA half (16-bits/component, 4 components) (A is currently always 1.0; UASTC_HDR doesn't support alpha)
+		cTFRGB_9E5 = 26,						// 32bpp RGB 9E5 (shared exponent, positive only, see GL_EXT_texture_shared_exponent)
 
-		cTFTotalTextureFormats = 22,
+		cTFTotalTextureFormats = 27,
 
 		// Old enums for compatibility with code compiled against previous versions
 		cTFETC1 = cTFETC1_RGB,
@@ -124,6 +135,9 @@ namespace basist
 	// Returns true if the format supports an alpha channel.
 	bool basis_transcoder_format_has_alpha(transcoder_texture_format fmt);
 
+	// Returns true if the format is HDR.
+	bool basis_transcoder_format_is_hdr(transcoder_texture_format fmt);
+
 	// Returns the basisu::texture_format corresponding to the specified transcoder_texture_format.
 	basisu::texture_format basis_get_basisu_texture_format(transcoder_texture_format fmt);
 
@@ -142,7 +156,7 @@ namespace basist
 	// Returns the block height for the specified texture format, which is currently always 4.
 	uint32_t basis_get_block_height(transcoder_texture_format tex_type);
 
-	// Returns true if the specified format was enabled at compile time.
+	// Returns true if the specified format was enabled at compile time, and is supported for the specific basis/ktx2 texture format (ETC1S, UASTC, or UASTC HDR).
 	bool basis_is_format_supported(transcoder_texture_format tex_type, basis_tex_format fmt = basis_tex_format::cETC1S);
 
 	// Validates that the output buffer is large enough to hold the entire transcoded texture.
@@ -317,6 +331,42 @@ namespace basist
 			int channel0 = -1, int channel1 = -1);
 	};
 
+	class basisu_lowlevel_uastc_hdr_transcoder // Low-level UASTC HDR transcoder interface; only the forwarding transcode_slice() overload is defined inline here.
+	{
+		friend class basisu_transcoder;
+
+	public:
+		basisu_lowlevel_uastc_hdr_transcoder();
+
+		bool transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt,
+			uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, bool has_alpha, const uint32_t orig_width, const uint32_t orig_height, uint32_t output_row_pitch_in_blocks_or_pixels = 0,
+			basisu_transcoder_state* pState = nullptr, uint32_t output_rows_in_pixels = 0, int channel0 = -1, int channel1 = -1, uint32_t decode_flags = 0);
+		// Convenience overload: takes has_alpha from header.m_flags (cBASISHeaderFlagHasAlphaSlices) and the original size from slice_desc, then forwards to the overload above.
+		bool transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt,
+			uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, const basis_file_header& header, const basis_slice_desc& slice_desc, uint32_t output_row_pitch_in_blocks_or_pixels = 0,
+			basisu_transcoder_state* pState = nullptr, uint32_t output_rows_in_pixels = 0, int channel0 = -1, int channel1 = -1, uint32_t decode_flags = 0)
+		{
+			return transcode_slice(pDst_blocks, num_blocks_x, num_blocks_y, pImage_data, image_data_size, fmt,
+				output_block_or_pixel_stride_in_bytes, bc1_allow_threecolor_blocks, (header.m_flags & cBASISHeaderFlagHasAlphaSlices) != 0, slice_desc.m_orig_width, slice_desc.m_orig_height, output_row_pitch_in_blocks_or_pixels,
+				pState, output_rows_in_pixels, channel0, channel1, decode_flags);
+		}
+
+		// Container independent transcoding
+		bool transcode_image( // declared here, defined elsewhere
+			transcoder_texture_format target_format,
+			void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels,
+			const uint8_t* pCompressed_data, uint32_t compressed_data_length,
+			uint32_t num_blocks_x, uint32_t num_blocks_y, uint32_t orig_width, uint32_t orig_height, uint32_t level_index,
+			uint32_t slice_offset, uint32_t slice_length,
+			uint32_t decode_flags = 0,
+			bool has_alpha = false,
+			bool is_video = false,
+			uint32_t output_row_pitch_in_blocks_or_pixels = 0,
+			basisu_transcoder_state* pState = nullptr,
+			uint32_t output_rows_in_pixels = 0,
+			int channel0 = -1, int channel1 = -1);
+	};
+
 	struct basisu_slice_info
 	{
 		uint32_t m_orig_width;
@@ -530,6 +580,7 @@ namespace basist
 	private:
 		mutable basisu_lowlevel_etc1s_transcoder m_lowlevel_etc1s_decoder;
 		mutable basisu_lowlevel_uastc_transcoder m_lowlevel_uastc_decoder;
+		mutable basisu_lowlevel_uastc_hdr_transcoder m_lowlevel_uastc_hdr_decoder;
 
 		bool m_ready_to_transcode;
 
@@ -612,10 +663,12 @@ namespace basist
 #pragma pack(pop)
 
 	const uint32_t KTX2_VK_FORMAT_UNDEFINED = 0;
+	const uint32_t KTX2_FORMAT_UASTC_4x4_SFLOAT_BLOCK = 1000066000; // TODO, is this correct? NOTE(review): 1000066000 appears to be the value of VK_FORMAT_ASTC_4x4_SFLOAT_BLOCK in the Vulkan headers - confirm.
 	const uint32_t KTX2_KDF_DF_MODEL_UASTC = 166;
+	const uint32_t KTX2_KDF_DF_MODEL_UASTC_HDR = 167;
 	const uint32_t KTX2_KDF_DF_MODEL_ETC1S = 163;
 	const uint32_t KTX2_IMAGE_IS_P_FRAME = 2;
-	const uint32_t KTX2_UASTC_BLOCK_SIZE = 16;
+	const uint32_t KTX2_UASTC_BLOCK_SIZE = 16; // also the block size for UASTC_HDR
 	const uint32_t KTX2_MAX_SUPPORTED_LEVEL_COUNT = 16; // this is an implementation specific constraint and can be increased
 
 	// The KTX2 transfer functions supported by KTX2
@@ -800,13 +853,15 @@ namespace basist
 		// Returns 0 or the number of layers in the texture array or texture video. Valid after init().
 		uint32_t get_layers() const { return m_header.m_layer_count; }
 
-		// Returns cETC1S or cUASTC4x4. Valid after init().
+		// Returns cETC1S, cUASTC4x4, or cUASTC_HDR_4x4. Valid after init().
 		basist::basis_tex_format get_format() const { return m_format; } 
-
+				
 		bool is_etc1s() const { return get_format() == basist::basis_tex_format::cETC1S; }
 
 		bool is_uastc() const { return get_format() == basist::basis_tex_format::cUASTC4x4; }
 
+		bool is_hdr() const { return get_format() == basist::basis_tex_format::cUASTC_HDR_4x4; } // True when the format is UASTC HDR 4x4. Valid after init().
+
 		// Returns true if the ETC1S file has two planes (typically RGBA, or RRRG), or true if the UASTC file has alpha data. Valid after init().
 		uint32_t get_has_alpha() const { return m_has_alpha; }
 
@@ -913,6 +968,7 @@ namespace basist
 								
 		basist::basisu_lowlevel_etc1s_transcoder m_etc1s_transcoder;
 		basist::basisu_lowlevel_uastc_transcoder m_uastc_transcoder;
+		basist::basisu_lowlevel_uastc_hdr_transcoder m_uastc_hdr_transcoder;
 				
 		ktx2_transcoder_state m_def_transcoder_state;
 
diff --git a/thirdparty/basis_universal/transcoder/basisu_transcoder_internal.h b/thirdparty/basis_universal/transcoder/basisu_transcoder_internal.h
index 0505df6ea67e..17c9dc7c8c9d 100644
--- a/thirdparty/basis_universal/transcoder/basisu_transcoder_internal.h
+++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_internal.h
@@ -1,5 +1,5 @@
 // basisu_transcoder_internal.h - Universal texture format transcoder library.
-// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
 //
 // Important: If compiling with gcc, be sure strict aliasing is disabled: -fno-strict-aliasing
 //
@@ -20,8 +20,9 @@
 #pragma warning (disable: 4127) //  conditional expression is constant
 #endif
 
-#define BASISD_LIB_VERSION 116
-#define BASISD_VERSION_STRING "01.16"
+// v1.50: Added UASTC HDR support
+#define BASISD_LIB_VERSION 150
+#define BASISD_VERSION_STRING "01.50"
 
 #ifdef _DEBUG
 #define BASISD_BUILD_DEBUG
@@ -82,9 +83,15 @@ namespace basist
 		cRGBA4444_ALPHA,
 		cRGBA4444_COLOR_OPAQUE,
 		cRGBA4444,
-
-		cUASTC_4x4,
-						
+		cRGBA_HALF,
+		cRGB_HALF,
+		cRGB_9E5,
+
+		cUASTC_4x4,							// LDR, universal
+		cUASTC_HDR_4x4,						// HDR, transcodes only to 4x4 HDR ASTC, BC6H, or uncompressed
+		cBC6H,
+		cASTC_HDR_4x4,
+								
 		cTotalBlockFormats
 	};
 
@@ -264,8 +271,8 @@ namespace basist
 		}
 
 		const basisu::uint8_vec &get_code_sizes() const { return m_code_sizes; }
-		const basisu::int_vec get_lookup() const { return m_lookup; }
-		const basisu::int16_vec get_tree() const { return m_tree; }
+		const basisu::int_vec &get_lookup() const { return m_lookup; }
+		const basisu::int16_vec &get_tree() const { return m_tree; }
 
 		bool is_valid() const { return m_code_sizes.size() > 0; }
 
@@ -789,7 +796,198 @@ namespace basist
 	};
 
 	bool basis_block_format_is_uncompressed(block_format tex_type);
-	
+
+	//------------------------------------
+
+	typedef uint16_t half_float;
+
+	const double MIN_DENORM_HALF_FLOAT = 0.000000059604645; // smallest positive subnormal number
+	const double MIN_HALF_FLOAT = 0.00006103515625; // smallest positive normal number
+	const double MAX_HALF_FLOAT = 65504.0; // largest normal number
+
+	inline uint32_t get_bits(uint32_t val, int low, int high) // returns bits low..high (inclusive) of val, shifted down to bit 0
+	{
+		const int num_bits = (high - low) + 1;
+		assert((num_bits >= 1) && (num_bits <= 32));
+
+		val >>= low;
+		if (num_bits != 32) // a full 32-bit field needs no mask (and 1u << 32 would be undefined)
+			val &= ((1u << num_bits) - 1);
+
+		return val;
+	}
+
+	inline bool is_half_inf_or_nan(half_float v) // true when the 5-bit exponent field (bits 10-14) is all ones
+	{
+		return get_bits(v, 10, 14) == 31;
+	}
+
+	inline bool is_half_denorm(half_float v) // true when the exponent field is zero; note this also holds for +/-0
+	{
+		int e = (v >> 10) & 31;
+		return !e;
+	}
+
+	inline int get_half_exp(half_float v) // exponent field minus 15, or -14 when the exponent field is zero
+	{
+		int e = ((v >> 10) & 31);
+		return e ? (e - 15) : -14;
+	}
+
+	inline int get_half_mantissa(half_float v) // low 10 bits of v, with the implicit leading one (0x400) added unless the exponent field is zero
+	{
+		if (is_half_denorm(v))
+			return v & 0x3FF;
+		return (v & 0x3FF) | 0x400;
+	}
+
+	inline float get_half_mantissaf(half_float v) // get_half_mantissa(v) divided by 1024, as a float
+	{
+		return ((float)get_half_mantissa(v)) / 1024.0f;
+	}
+
+	inline int get_half_sign(half_float v) // 0 when all bits are zero, otherwise -1 if bit 15 is set, else 1 (so 0x8000, negative zero, yields -1)
+	{
+		return v ? ((v & 0x8000) ? -1 : 1) : 0;
+	}
+
+	inline bool half_is_signed(half_float v) // true when the sign bit (0x8000) is set
+	{
+		return (v & 0x8000) != 0;
+	}
+
+#if 0
+	int hexp = get_half_exp(Cf);
+	float hman = get_half_mantissaf(Cf);
+	int hsign = get_half_sign(Cf);
+	float k = powf(2.0f, hexp) * hman * hsign;
+	if (is_half_inf_or_nan(Cf))
+		k = std::numeric_limits<float>::quiet_NaN();
+#endif
+
+	half_float float_to_half(float val);
+
+	inline float half_to_float(half_float hval) // converts a half float bit pattern to float; zero, denormal, INF and NaN inputs each take their own path
+	{
+		union { float f; uint32_t u; } x = { 0 };
+
+		uint32_t s = ((uint32_t)hval >> 15) & 1; // sign bit
+		uint32_t e = ((uint32_t)hval >> 10) & 0x1F; // 5-bit exponent field
+		uint32_t m = (uint32_t)hval & 0x3FF; // 10-bit mantissa field
+
+		if (!e)
+		{
+			if (!m)
+			{
+				// +- 0
+				x.u = s << 31;
+				return x.f;
+			}
+			else
+			{
+				// denormalized: shift the mantissa up until bit 10 is set, decrementing e per shift (e is unsigned and wraps; adding 127 - 15 below brings it back into range)
+				while (!(m & 0x00000400))
+				{
+					m <<= 1;
+					--e;
+				}
+
+				++e;
+				m &= ~0x00000400; // clear bit 10 so only the 10 mantissa bits remain
+			}
+		}
+		else if (e == 31)
+		{
+			if (m == 0)
+			{
+				// +/- INF
+				x.u = (s << 31) | 0x7f800000;
+				return x.f;
+			}
+			else
+			{
+				// +/- NaN
+				x.u = (s << 31) | 0x7f800000 | (m << 13);
+				return x.f;
+			}
+		}
+
+		e = e + (127 - 15); // move the exponent from the half bias (15) to the float bias (127)
+		m = m << 13; // widen the 10-bit mantissa to the float's 23 bits
+
+		assert(s <= 1);
+		assert(m <= 0x7FFFFF);
+		assert(e <= 255);
+
+		x.u = m | (e << 23) | (s << 31);
+		return x.f;
+	}
+
+	// Originally from bc6h_enc.h
+
+	void bc6h_enc_init();
+
+	const uint32_t MAX_BLOG16_VAL = 0xFFFF;
+
+	// BC6H internals
+	const uint32_t NUM_BC6H_MODES = 14;
+	const uint32_t BC6H_LAST_MODE_INDEX = 13;
+	const uint32_t BC6H_FIRST_1SUBSET_MODE_INDEX = 10; // in the MS docs, this is "mode 11" (where the first mode is 1), 60 bits for endpoints (10.10, 10.10, 10.10), 63 bits for weights
+	const uint32_t TOTAL_BC6H_PARTITION_PATTERNS = 32;
+
+	extern const uint8_t g_bc6h_mode_sig_bits[NUM_BC6H_MODES][4]; // base, r, g, b
+
+	struct bc6h_bit_layout // element type of the g_bc6h_bit_layouts table
+	{
+		int8_t m_comp; // R=0,G=1,B=2,D=3 (D=partition index)
+		int8_t m_index; // 0-3, 0-1 Low/High subset 1, 2-3 Low/High subset 2, -1=partition index (d)
+		int8_t m_last_bit;
+		int8_t m_first_bit; // may be -1 if a single bit, may be >m_last_bit if reversed
+	};
+
+	const uint32_t MAX_BC6H_LAYOUT_INDEX = 25;
+	extern const bc6h_bit_layout g_bc6h_bit_layouts[NUM_BC6H_MODES][MAX_BC6H_LAYOUT_INDEX];
+
+	extern const uint8_t g_bc6h_2subset_patterns[TOTAL_BC6H_PARTITION_PATTERNS][4][4]; // [y][x]
+
+	extern const uint8_t g_bc6h_weight3[8];
+	extern const uint8_t g_bc6h_weight4[16];
+
+	extern const int8_t g_bc6h_mode_lookup[32];
+		
+	// Converts b16 (a blog16 value, at most 0xFFFF) to a half float bit pattern
+	inline half_float bc6h_blog16_to_half(uint32_t comp)
+	{
+		assert(comp <= 0xFFFF);
+
+		// scale the magnitude by 31/64 (the shift truncates toward zero)
+		comp = (comp * 31u) >> 6u;
+		return (half_float)comp;
+	}
+
+	const uint32_t MAX_BC6H_HALF_FLOAT_AS_UINT = 0x7BFF;
+
+	// Inverts bc6h_blog16_to_half().
+	// Returns the nearest blog16 given a half value, computed as ceil(h * 64 / 31); h must be <= MAX_BC6H_HALF_FLOAT_AS_UINT.
+	inline uint32_t bc6h_half_to_blog16(half_float h)
+	{
+		assert(h <= MAX_BC6H_HALF_FLOAT_AS_UINT);
+		return (h * 64 + 30) / 31;
+	}
+
+	struct bc6h_block // 16 raw bytes; the bc6h_enc_block_*() functions write their packed output into one of these
+	{
+		uint8_t m_bytes[16];
+	};
+
+	void bc6h_enc_block_mode10(bc6h_block* pPacked_block, const half_float pEndpoints[3][2], const uint8_t* pWeights);
+	void bc6h_enc_block_1subset_4bit_weights(bc6h_block* pPacked_block, const half_float pEndpoints[3][2], const uint8_t* pWeights);
+	void bc6h_enc_block_1subset_mode9_3bit_weights(bc6h_block* pPacked_block, const half_float pEndpoints[3][2], const uint8_t* pWeights);
+	void bc6h_enc_block_1subset_3bit_weights(bc6h_block* pPacked_block, const half_float pEndpoints[3][2], const uint8_t* pWeights);
+	void bc6h_enc_block_2subset_mode9_3bit_weights(bc6h_block* pPacked_block, uint32_t common_part_index, const half_float pEndpoints[2][3][2], const uint8_t* pWeights); // pEndpoints[subset][comp][lh_index]
+	void bc6h_enc_block_2subset_3bit_weights(bc6h_block* pPacked_block, uint32_t common_part_index, const half_float pEndpoints[2][3][2], const uint8_t* pWeights); // pEndpoints[subset][comp][lh_index]
+	bool bc6h_enc_block_solid_color(bc6h_block* pPacked_block, const half_float pColor[3]);
+		
 } // namespace basist
 
 
diff --git a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_dxt1_5.inc b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_dxt1_5.inc
index 8244550959dd..205758b3d7ff 100644
--- a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_dxt1_5.inc
+++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_dxt1_5.inc
@@ -1,4 +1,4 @@
-// Copyright (C) 2017-2019 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2017-2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_dxt1_6.inc b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_dxt1_6.inc
index fad45fe22d0a..f2d324fcc333 100644
--- a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_dxt1_6.inc
+++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_dxt1_6.inc
@@ -1,4 +1,4 @@
-// Copyright (C) 2017-2019 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2017-2024 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/thirdparty/basis_universal/transcoder/basisu_transcoder_uastc.h b/thirdparty/basis_universal/transcoder/basisu_transcoder_uastc.h
index f91314f4ff43..457bd51e3011 100644
--- a/thirdparty/basis_universal/transcoder/basisu_transcoder_uastc.h
+++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_uastc.h
@@ -13,6 +13,7 @@ namespace basist
 	const uint32_t UASTC_MODE_INDEX_SOLID_COLOR = 8;
 
 	const uint32_t TOTAL_ASTC_BC7_COMMON_PARTITIONS2 = 30;
+	const uint32_t TOTAL_ASTC_BC6H_COMMON_PARTITIONS2 = 27; // BC6H supports only 5-bit pattern indices, BC7 supports 4-bit or 6-bit
 	const uint32_t TOTAL_ASTC_BC7_COMMON_PARTITIONS3 = 11;
 	const uint32_t TOTAL_BC7_3_ASTC2_COMMON_PARTITIONS = 19;