From 4d5c23683a9e994f7782e8926c2573cd7f667de8 Mon Sep 17 00:00:00 2001 From: Lukas Cone Date: Wed, 30 Mar 2022 22:00:10 +0200 Subject: [PATCH] Adding API for signed BC4/BC5 variants --- ISPC Texture Compressor/main.cpp | 2 + ISPC Texture Compressor/processing.cpp | 20 +++++- ISPC Texture Compressor/processing.h | 2 + ispc_texcomp/ispc_texcomp.cpp | 11 ++++ ispc_texcomp/ispc_texcomp.h | 6 +- ispc_texcomp/kernel.ispc | 84 ++++++++++++++++++++++++++ readme.md | 3 +- 7 files changed, 123 insertions(+), 5 deletions(-) diff --git a/ISPC Texture Compressor/main.cpp b/ISPC Texture Compressor/main.cpp index 8f744cc..3acbd87 100644 --- a/ISPC Texture Compressor/main.cpp +++ b/ISPC Texture Compressor/main.cpp @@ -199,7 +199,9 @@ int WINAPI wWinMain( HINSTANCE hInstance, HINSTANCE hPrevInstance, LPWSTR lpCmdL { CDXUTComboBox *comboBox = gSampleUI.GetComboBox(IDC_PROFILE); comboBox->AddItem(L"BC4 (R)", (void*)(CompressImageBC4)); + comboBox->AddItem(L"BC4S (R)", (void*)(CompressImageBC4S)); comboBox->AddItem(L"BC5 (RG)", (void*)(CompressImageBC5)); + comboBox->AddItem(L"BC5S (RG)", (void*)(CompressImageBC5S)); comboBox->AddItem(L"BC6H veryfast", (void *)(CompressImageBC6H_veryfast)); comboBox->AddItem(L"BC6H fast", (void *)(CompressImageBC6H_fast)); comboBox->AddItem(L"BC6H basic", (void *)(CompressImageBC6H_basic)); diff --git a/ISPC Texture Compressor/processing.cpp b/ISPC Texture Compressor/processing.cpp index 3631d8e..da1b0c0 100644 --- a/ISPC Texture Compressor/processing.cpp +++ b/ISPC Texture Compressor/processing.cpp @@ -450,6 +450,8 @@ static inline DXGI_FORMAT GetNonSRGBFormat(DXGI_FORMAT f) { case DXGI_FORMAT_BC3_UNORM_SRGB: return DXGI_FORMAT_BC3_UNORM; case DXGI_FORMAT_BC4_UNORM: return DXGI_FORMAT_BC4_UNORM; case DXGI_FORMAT_BC5_UNORM: return DXGI_FORMAT_BC5_UNORM; + case DXGI_FORMAT_BC4_SNORM: return DXGI_FORMAT_BC4_SNORM; + case DXGI_FORMAT_BC5_SNORM: return DXGI_FORMAT_BC5_SNORM; case DXGI_FORMAT_BC7_UNORM_SRGB: return DXGI_FORMAT_BC7_UNORM; case 
DXGI_FORMAT_R8G8B8A8_UNORM_SRGB: return DXGI_FORMAT_R8G8B8A8_UNORM; default: assert(!"Unknown format!"); @@ -1119,10 +1121,12 @@ int GetBytesPerBlock(CompressionFunc* fn) default: case DXGI_FORMAT_BC1_UNORM_SRGB: case DXGI_FORMAT_BC4_UNORM: + case DXGI_FORMAT_BC4_SNORM: return 8; case DXGI_FORMAT_BC3_UNORM_SRGB: case DXGI_FORMAT_BC5_UNORM: + case DXGI_FORMAT_BC5_SNORM: case DXGI_FORMAT_BC7_UNORM_SRGB: case DXGI_FORMAT_BC6H_UF16: return 16; @@ -1131,12 +1135,12 @@ int GetBytesPerBlock(CompressionFunc* fn) bool IsBC4(CompressionFunc* fn) { - return fn == CompressImageBC4; + return fn == CompressImageBC4 || fn == CompressImageBC4S; } bool IsBC5(CompressionFunc* fn) { - return fn == CompressImageBC5; + return fn == CompressImageBC5 || fn == CompressImageBC5S; } bool IsBC6H(CompressionFunc* fn) @@ -1155,6 +1159,8 @@ DXGI_FORMAT GetFormatFromCompressionFunc(CompressionFunc* fn) if (fn == CompressImageBC3) return DXGI_FORMAT_BC3_UNORM_SRGB; if (fn == CompressImageBC4) return DXGI_FORMAT_BC4_UNORM; if (fn == CompressImageBC5) return DXGI_FORMAT_BC5_UNORM; + if (fn == CompressImageBC4S) return DXGI_FORMAT_BC4_SNORM; + if (fn == CompressImageBC5S) return DXGI_FORMAT_BC5_SNORM; if (IsBC6H(fn)) return DXGI_FORMAT_BC6H_UF16; @@ -1181,6 +1187,16 @@ void CompressImageBC5(const rgba_surface* input, BYTE* output) CompressBlocksBC5(input, output); } +void CompressImageBC4S(const rgba_surface* input, BYTE* output) +{ + CompressBlocksBC4S(input, output); +} + +void CompressImageBC5S(const rgba_surface* input, BYTE* output) +{ + CompressBlocksBC5S(input, output); +} + #define DECLARE_CompressImageBC6H_profile(profile) \ void CompressImageBC6H_ ## profile(const rgba_surface* input, BYTE* output) \ { \ diff --git a/ISPC Texture Compressor/processing.h b/ISPC Texture Compressor/processing.h index 317a511..6d759ed 100644 --- a/ISPC Texture Compressor/processing.h +++ b/ISPC Texture Compressor/processing.h @@ -99,6 +99,8 @@ void CompressImageBC1(const rgba_surface* input, BYTE* output); 
void CompressImageBC3(const rgba_surface* input, BYTE* output); void CompressImageBC4(const rgba_surface* input, BYTE* output); void CompressImageBC5(const rgba_surface* input, BYTE* output); +void CompressImageBC4S(const rgba_surface* input, BYTE* output); +void CompressImageBC5S(const rgba_surface* input, BYTE* output); void CompressImageBC6H_veryfast(const rgba_surface* input, BYTE* output); void CompressImageBC6H_fast(const rgba_surface* input, BYTE* output); void CompressImageBC6H_basic(const rgba_surface* input, BYTE* output); diff --git a/ispc_texcomp/ispc_texcomp.cpp b/ispc_texcomp/ispc_texcomp.cpp index 49c4c41..cc3936b 100644 --- a/ispc_texcomp/ispc_texcomp.cpp +++ b/ispc_texcomp/ispc_texcomp.cpp @@ -461,11 +461,22 @@ void CompressBlocksBC4(const rgba_surface* src, uint8_t* dst) ispc::CompressBlocksBC4_ispc((ispc::rgba_surface*)src, dst); } +void CompressBlocksBC4S(const rgba_surface* src, uint8_t* dst) +{ + ispc::CompressBlocksBC4S_ispc((ispc::rgba_surface*)src, dst); +} + void CompressBlocksBC5(const rgba_surface* src, uint8_t* dst) { ispc::CompressBlocksBC5_ispc((ispc::rgba_surface*)src, dst); } +void CompressBlocksBC5S(const rgba_surface* src, uint8_t* dst) +{ + ispc::CompressBlocksBC5S_ispc((ispc::rgba_surface*)src, dst); +} + + void CompressBlocksBC7(const rgba_surface* src, uint8_t* dst, bc7_enc_settings* settings) { ispc::CompressBlocksBC7_ispc((ispc::rgba_surface*)src, dst, (ispc::bc7_enc_settings*)settings); diff --git a/ispc_texcomp/ispc_texcomp.h b/ispc_texcomp/ispc_texcomp.h index 9418ece..95062f8 100644 --- a/ispc_texcomp/ispc_texcomp.h +++ b/ispc_texcomp/ispc_texcomp.h @@ -108,8 +108,8 @@ extern "C" void ReplicateBorders(rgba_surface* dst_slice, const rgba_surface* sr - LDR input is 32 bit/pixel (sRGB), HDR is 64 bit/pixel (half float) - for BC4 input is 8bit/pixel (R8), for BC5 input is 16bit/pixel (RG8) - dst buffer must be allocated with enough space for the compressed texture: - - 8 bytes/block for BC1/BC4/ETC1, - - 16 bytes/block for 
BC3/BC5/BC6H/BC7/ASTC + - 8 bytes/block for BC1/BC4/BC4S/ETC1, + - 16 bytes/block for BC3/BC5/BC5S/BC6H/BC7/ASTC - the blocks are stored in raster scan order (natural CPU texture layout) - use the GetProfile_* functions to select various speed/quality tradeoffs - the RGB profiles are slightly faster as they ignore the alpha channel @@ -118,7 +118,9 @@ extern "C" void ReplicateBorders(rgba_surface* dst_slice, const rgba_surface* sr extern "C" void CompressBlocksBC1(const rgba_surface* src, uint8_t* dst); extern "C" void CompressBlocksBC3(const rgba_surface* src, uint8_t* dst); extern "C" void CompressBlocksBC4(const rgba_surface* src, uint8_t* dst); +extern "C" void CompressBlocksBC4S(const rgba_surface* src, uint8_t* dst); extern "C" void CompressBlocksBC5(const rgba_surface* src, uint8_t* dst); +extern "C" void CompressBlocksBC5S(const rgba_surface* src, uint8_t* dst); extern "C" void CompressBlocksBC6H(const rgba_surface* src, uint8_t* dst, bc6h_enc_settings* settings); extern "C" void CompressBlocksBC7(const rgba_surface* src, uint8_t* dst, bc7_enc_settings* settings); extern "C" void CompressBlocksETC1(const rgba_surface* src, uint8_t* dst, etc_enc_settings* settings); diff --git a/ispc_texcomp/kernel.ispc b/ispc_texcomp/kernel.ispc index 752184e..5186810 100644 --- a/ispc_texcomp/kernel.ispc +++ b/ispc_texcomp/kernel.ispc @@ -621,6 +621,47 @@ inline void CompressBlockBC3_alpha(float block[16], uint32 data[2]) data[1] |= qblock[1]<<8; } +inline void CompressBlockBC4Signed(float block[16], uint32 data[2]) +{ + float ep[2] = { 255, 0 }; + + for (uniform int k=0; k<16; k++) + { + ep[0] = min(ep[0], block[k]); + ep[1] = max(ep[1], block[k]); + } + + if (ep[0] == ep[1]) ep[1] = ep[0]+0.1f; + + uint32 qblock[2] = { 0, 0 }; + float scale = 7f/(ep[1]-ep[0]); + + for (uniform int k=0; k<16; k++) + { + float v = block[k]; + float proj = (v-ep[0])*scale+0.5f; + + int q = clamp((int)proj, 0, 7); + + q = 7-q; + + if (q > 0) q++; + if (q==8) q = 1; + + qblock[k/8] |= q << 
((k%8)*3); + } + + // (could be improved by refinement) + + for (uniform int e=0; e<2; e++) ep[e] = clamp((int)ep[e] - 0x80, -128, 127); + + data[0] = (0xFF & (int)ep[0]) << 8; + data[0] |= (0xFF & (int)ep[1]); + data[0] |= qblock[0]<<16; + data[1] = qblock[0]>>16; + data[1] |= qblock[1]<<8; +} + inline void CompressBlockBC1(uniform rgba_surface src[], int xx, uniform int yy, uniform uint8 dst[]) { float block[48]; @@ -658,6 +699,18 @@ inline void CompressBlockBC4(uniform rgba_surface src[], int xx, uniform int yy, store_data(dst, src->width, xx, yy, data, 2); } +inline void CompressBlockBC4S(uniform rgba_surface src[], int xx, uniform int yy, uniform uint8 dst[]) +{ + float block[16]; + uint32 data[2]; + + load_block_r_8bit(block, src, xx, yy); + + CompressBlockBC4Signed(block, data); + + store_data(dst, src->width, xx, yy, data, 2); +} + inline void CompressBlockBC5(uniform rgba_surface src[], int xx, uniform int yy, uniform uint8 dst[]) { float block[32]; @@ -671,6 +724,19 @@ inline void CompressBlockBC5(uniform rgba_surface src[], int xx, uniform int yy, store_data(dst, src->width, xx, yy, data, 4); } +inline void CompressBlockBC5S(uniform rgba_surface src[], int xx, uniform int yy, uniform uint8 dst[]) +{ + float block[32]; + uint32 data[4]; + + load_block_interleaved_rg_8bit(block, src, xx, yy); + + CompressBlockBC4Signed(block, data); + CompressBlockBC4Signed(&block[16], &data[2]); + + store_data(dst, src->width, xx, yy, data, 4); +} + export void CompressBlocksBC1_ispc(uniform rgba_surface src[], uniform uint8 dst[]) { for (uniform int yy = 0; yy<src->height/4; yy++) @@ -698,6 +764,15 @@ export void CompressBlocksBC4_ispc(uniform rgba_surface src[], uniform uint8 dst } } +export void CompressBlocksBC4S_ispc(uniform rgba_surface src[], uniform uint8 dst[]) +{ + for (uniform int yy = 0; yy<src->height/4; yy++) + foreach (xx = 0 ...
src->width/4) + { + CompressBlockBC4S(src, xx, yy, dst); + } +} + export void CompressBlocksBC5_ispc(uniform rgba_surface src[], uniform uint8 dst[]) { for (uniform int yy = 0; yy<src->height/4; yy++) @@ -707,6 +782,15 @@ export void CompressBlocksBC5_ispc(uniform rgba_surface src[], uniform uint8 dst } } +export void CompressBlocksBC5S_ispc(uniform rgba_surface src[], uniform uint8 dst[]) +{ + for (uniform int yy = 0; yy<src->height/4; yy++) + foreach (xx = 0 ... src->width/4) + { + CompressBlockBC5S(src, xx, yy, dst); + } +} + /////////////////////////////////////////////////////////// // BC7 encoding diff --git a/readme.md b/readme.md index 68c002e..24da2f7 100644 --- a/readme.md +++ b/readme.md @@ -7,7 +7,8 @@ formats: * BC7 * ASTC (LDR, block sizes up to 8x8) * ETC1 -* BC1, BC3 (aka DXT1, DXT5) and BC4, BC5 (aka ATI1N, ATI2N) +* BC1, BC3 (aka DXT1, DXT5) +* BC4, BC5 (aka ATI1N, ATI2N) both UNORM and SNORM variants The library uses the [ISPC compiler](https://ispc.github.io/) to generate CPU SIMD-optimized compression algorithms. For more information, see the [Fast ISPC