Merge pull request #214 from dbPhilips/perf_enhancements

Perf enhancements
syoyo · Jan 22, 2025 · 648e47b · 648e47b
2 parents 756f7d3 + 33e0472
commit 648e47b
Show file tree

Hide file tree

Showing 2 changed files with 128 additions and 31 deletions.
diff --git a/README.md b/README.md
@@ -159,10 +159,12 @@ Include `tinyexr.h` with `TINYEXR_IMPLEMENTATION` flag (do this only for **one**
 * `TINYEXR_USE_MINIZ` Use miniz (default = 1). Please include `zlib.h` header before `tinyexr.h` if you disable miniz support(e.g. use system's zlib).
 * `TINYEXR_USE_STB_ZLIB` Use zlib from `stb_image[_write].h` instead of miniz or the system's zlib (default = 0).
 * `TINYEXR_USE_PIZ` Enable PIZ compression support (default = 1)
-* `TINYEXR_USE_ZFP` Enable ZFP compression supoort (TinyEXR extension, default = 0)
-* `TINYEXR_USE_THREAD` Enable threaded loading using C++11 thread (Requires C++11 compiler, default = 0)
+* `TINYEXR_USE_ZFP` Enable ZFP compression support (TinyEXR extension, default = 0)
+* `TINYEXR_USE_THREAD` Enable threaded loading/storing using C++11 thread (Requires C++11 compiler, default = 0)
+  * Use `TINYEXR_MAX_THREADS` over 0 to use MIN(TINYEXR_MAX_THREADS,hardware_concurrency()) in stead off hardware_concurrency(). (default = 0)
 * `TINYEXR_USE_OPENMP` Enable OpenMP threading support (default = 1 if `_OPENMP` is defined)
   * Use `TINYEXR_USE_OPENMP=0` to force disable OpenMP code path even if OpenMP is available/enabled in the compiler.
+* `TINYEXR_USE_COMPILER_FP16` Enable use of compiler provided FP16<>FP32 conversions when available (default = 0)
 
 ### Quickly reading RGB(A) EXR file.
 

diff --git a/tinyexr.h b/tinyexr.h
@@ -131,7 +131,11 @@ extern "C" {
 
 #ifndef TINYEXR_USE_THREAD
 #define TINYEXR_USE_THREAD (0)  // No threaded loading.
-// http://computation.llnl.gov/projects/floating-point-compression
+#else
+// When using threading a reduced custom upperbound can be specified by setting TINYEXR_MAX_THREADS
+#ifndef TINYEXR_MAX_THREADS // if not defined define it as 0 meaning upper limit is taken from hardware_concurrency()
+#define TINYEXR_MAX_THREADS (0)
+#endif
 #endif
 
 #ifndef TINYEXR_USE_OPENMP
@@ -142,6 +146,41 @@ extern "C" {
 #endif
 #endif
 
+#ifndef TINYEXR_USE_COMPILER_FP16
+#define TINYEXR_USE_COMPILER_FP16 (0)
+#endif
+
+#if TINYEXR_USE_COMPILER_FP16
+#ifndef _MSC_VER
+#if defined( __GNUC__ ) || defined( __clang__ )
+#if defined( __SSE2__ )
+#if ( __GNUC__ > 11 ) || ( __clang_major__ > 14 )
+#ifndef __STDC_WANT_IEC_60559_TYPES_EXT__
+#define __STDC_WANT_IEC_60559_TYPES_EXT__
+#endif
+#include <float.h>
+#include <math.h>
+#define TINYEXR_FP16_COMPILER_TYPE _Float16
+#endif
+#endif
+#if defined( __ARM_NEON__ ) || defined( __ARM_NEON )
+#define TINYEXR_FP16_COMPILER_TYPE __fp16
+#endif
+#endif
+#else
+#if (defined(_M_IX86) || defined(_M_X64)) && defined(__AVX2__)
+#include <intrin.h>
+#define TINYEXR_FP16_COMPILER_TYPE uint16_t
+#endif
+#endif
+#endif
+
+#ifdef TINYEXR_FP16_COMPILER_TYPE
+#define TINYEXR_HAS_FP16_COMPILER_TYPE (1)
+#else
+#define TINYEXR_HAS_FP16_COMPILER_TYPE (0)
+#endif
+
 #define TINYEXR_SUCCESS (0)
 #define TINYEXR_ERROR_INVALID_MAGIC_NUMBER (-1)
 #define TINYEXR_ERROR_INVALID_EXR_VERSION (-2)
@@ -771,15 +810,15 @@ static void SetWarningMessage(const std::string &msg, const char **warn) {
 
 static const int kEXRVersionSize = 8;
 
-static void cpy2(unsigned short *dst_val, const unsigned short *src_val) {
+static void inline cpy2(unsigned short *dst_val, const unsigned short *src_val) {
   unsigned char *dst = reinterpret_cast<unsigned char *>(dst_val);
   const unsigned char *src = reinterpret_cast<const unsigned char *>(src_val);
 
   dst[0] = src[0];
   dst[1] = src[1];
 }
 
-static void swap2(unsigned short *val) {
+static void inline swap2(unsigned short *val) {
 #if TINYEXR_LITTLE_ENDIAN
   (void)val;
 #else
@@ -801,7 +840,7 @@ static void swap2(unsigned short *val) {
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wunused-function"
 #endif
-static void cpy4(int *dst_val, const int *src_val) {
+static void inline cpy4(int *dst_val, const int *src_val) {
   unsigned char *dst = reinterpret_cast<unsigned char *>(dst_val);
   const unsigned char *src = reinterpret_cast<const unsigned char *>(src_val);
 
@@ -811,7 +850,7 @@ static void cpy4(int *dst_val, const int *src_val) {
   dst[3] = src[3];
 }
 
-static void cpy4(unsigned int *dst_val, const unsigned int *src_val) {
+static void inline cpy4(unsigned int *dst_val, const unsigned int *src_val) {
   unsigned char *dst = reinterpret_cast<unsigned char *>(dst_val);
   const unsigned char *src = reinterpret_cast<const unsigned char *>(src_val);
 
@@ -821,7 +860,7 @@ static void cpy4(unsigned int *dst_val, const unsigned int *src_val) {
   dst[3] = src[3];
 }
 
-static void cpy4(float *dst_val, const float *src_val) {
+static void inline cpy4(float *dst_val, const float *src_val) {
   unsigned char *dst = reinterpret_cast<unsigned char *>(dst_val);
   const unsigned char *src = reinterpret_cast<const unsigned char *>(src_val);
 
@@ -838,7 +877,7 @@ static void cpy4(float *dst_val, const float *src_val) {
 #pragma GCC diagnostic pop
 #endif
 
-static void swap4(unsigned int *val) {
+static void inline swap4(unsigned int *val) {
 #if TINYEXR_LITTLE_ENDIAN
   (void)val;
 #else
@@ -853,7 +892,7 @@ static void swap4(unsigned int *val) {
 #endif
 }
 
-static void swap4(int *val) {
+static void inline swap4(int *val) {
 #if TINYEXR_LITTLE_ENDIAN
   (void)val;
 #else
@@ -868,7 +907,7 @@ static void swap4(int *val) {
 #endif
 }
 
-static void swap4(float *val) {
+static void inline swap4(float *val) {
 #if TINYEXR_LITTLE_ENDIAN
   (void)val;
 #else
@@ -884,7 +923,7 @@ static void swap4(float *val) {
 }
 
 #if 0
-static void cpy8(tinyexr::tinyexr_uint64 *dst_val, const tinyexr::tinyexr_uint64 *src_val) {
+static void inline cpy8(tinyexr::tinyexr_uint64 *dst_val, const tinyexr::tinyexr_uint64 *src_val) {
   unsigned char *dst = reinterpret_cast<unsigned char *>(dst_val);
   const unsigned char *src = reinterpret_cast<const unsigned char *>(src_val);
 
@@ -899,7 +938,7 @@ static void cpy8(tinyexr::tinyexr_uint64 *dst_val, const tinyexr::tinyexr_uint64
 }
 #endif
 
-static void swap8(tinyexr::tinyexr_uint64 *val) {
+static void inline swap8(tinyexr::tinyexr_uint64 *val) {
 #if TINYEXR_LITTLE_ENDIAN
   (void)val;
 #else
@@ -919,6 +958,11 @@ static void swap8(tinyexr::tinyexr_uint64 *val) {
 }
 
 // https://gist.github.com/rygorous/2156668
+#if TINYEXR_HAS_FP16_COMPILER_TYPE && (TINYEXR_USE_COMPILER_FP16 > 0)
+union FP32 {
+  float f;
+};
+#else
 union FP32 {
   unsigned int u;
   float f;
@@ -934,12 +978,21 @@ union FP32 {
 #endif
   } s;
 };
+#endif
 
 #ifdef __clang__
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wpadded"
 #endif
 
+#if TINYEXR_HAS_FP16_COMPILER_TYPE && (TINYEXR_USE_COMPILER_FP16 > 0)
+union FP16 {
+  TINYEXR_FP16_COMPILER_TYPE f;
+  unsigned short u;
+};
+
+#else
+
 union FP16 {
   unsigned short u;
   struct {
@@ -954,11 +1007,32 @@ union FP16 {
 #endif
   } s;
 };
+#endif
 
 #ifdef __clang__
 #pragma clang diagnostic pop
 #endif
 
+#if TINYEXR_HAS_FP16_COMPILER_TYPE && (TINYEXR_USE_COMPILER_FP16 > 0)
+static inline FP32 half_to_float(FP16 h) {
+  FP32 o;
+#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) && defined(__AVX2__)
+   o.f =_mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(static_cast<int> (h.u))));
+#else
+   o.f = static_cast<float> (h.f);
+#endif
+  return o;
+}
+static inline FP16 float_to_half_full(FP32 f) {
+  FP16 o;
+#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) && defined(__AVX2__)
+  o.f  = static_cast<TINYEXR_FP16_COMPILER_TYPE> (_mm_cvtsi128_si32(_mm_cvtps_ph(_mm_set_ss(f.f), _MM_FROUND_CUR_DIRECTION)));
+#else
+  o.f = static_cast<TINYEXR_FP16_COMPILER_TYPE> (f.f);
+#endif
+  return o;
+}
+#else
 static FP32 half_to_float(FP16 h) {
   static const FP32 magic = {113 << 23};
   static const unsigned int shifted_exp = 0x7c00
@@ -1018,7 +1092,7 @@ static FP16 float_to_half_full(FP32 f) {
   o.s.Sign = f.s.Sign;
   return o;
 }
-
+#endif
 // NOTE: From OpenEXR code
 // #define IMF_INCREASING_Y  0
 // #define IMF_DECREASING_Y  1
@@ -4930,10 +5004,12 @@ static int DecodeTiledLevel(EXRImage* exr_image, const EXRHeader* exr_header,
   std::atomic<int> tile_count(0);
 
   int num_threads = std::max(1, int(std::thread::hardware_concurrency()));
+#if (TINYEXR_MAX_THREADS > 0)
+  num_threads = std::min(num_threads,TINYEXR_MAX_THREADS);
+#endif
   if (num_threads > int(num_tiles)) {
     num_threads = int(num_tiles);
   }
-
   for (int t = 0; t < num_threads; t++) {
     workers.emplace_back(std::thread([&]()
       {
@@ -5286,10 +5362,12 @@ static int DecodeChunk(EXRImage *exr_image, const EXRHeader *exr_header,
     std::atomic<int> y_count(0);
 
     int num_threads = std::max(1, int(std::thread::hardware_concurrency()));
+#if (TINYEXR_MAX_THREADS > 0)
+    num_threads = std::min(num_threads,TINYEXR_MAX_THREADS);
+#endif
     if (num_threads > int(num_blocks)) {
       num_threads = int(num_blocks);
     }
-
     for (int t = 0; t < num_threads; t++) {
       workers.emplace_back(std::thread([&]() {
         int y = 0;
@@ -7268,6 +7346,9 @@ static int EncodeTiledLevel(const EXRImage* level_image, const EXRHeader* exr_he
   std::atomic<int> tile_count(0);
 
   int num_threads = std::max(1, int(std::thread::hardware_concurrency()));
+#if (TINYEXR_MAX_THREADS > 0)
+  num_threads = std::min(num_threads,TINYEXR_MAX_THREADS);
+#endif
   if (num_threads > int(num_tiles)) {
     num_threads = int(num_tiles);
   }
@@ -7517,7 +7598,9 @@ static int EncodeChunk(const EXRImage* exr_image, const EXRHeader* exr_header,
     std::atomic<int> block_count(0);
 
     int num_threads = std::min(std::max(1, int(std::thread::hardware_concurrency())), num_blocks);
-
+#if (TINYEXR_MAX_THREADS > 0)
+    num_threads = std::min(num_threads,TINYEXR_MAX_THREADS);
+#endif
     for (int t = 0; t < num_threads; t++) {
       workers.emplace_back(std::thread([&]() {
         int i = 0;
@@ -9047,13 +9130,19 @@ int SaveEXRToMemory(const float *data, int width, int height, int components,
     images[3].resize(static_cast<size_t>(width * height));
 
     // Split RGB(A)RGB(A)RGB(A)... into R, G and B(and A) layers
-    for (size_t i = 0; i < static_cast<size_t>(width * height); i++) {
-      images[0][i] = data[static_cast<size_t>(components) * i + 0];
-      images[1][i] = data[static_cast<size_t>(components) * i + 1];
-      images[2][i] = data[static_cast<size_t>(components) * i + 2];
-      if (components == 4) {
-        images[3][i] = data[static_cast<size_t>(components) * i + 3];
-      }
+    if (components == 4) {
+        for (size_t i = 0; i < static_cast<size_t>(width * height); i++) {
+          images[0][i] = data[static_cast<size_t>(components) * i + 0];
+          images[1][i] = data[static_cast<size_t>(components) * i + 1];
+          images[2][i] = data[static_cast<size_t>(components) * i + 2];
+          images[3][i] = data[static_cast<size_t>(components) * i + 3];
+        }
+    } else {
+       for (size_t i = 0; i < static_cast<size_t>(width * height); i++) {
+          images[0][i] = data[static_cast<size_t>(components) * i + 0];
+          images[1][i] = data[static_cast<size_t>(components) * i + 1];
+          images[2][i] = data[static_cast<size_t>(components) * i + 2];
+        }
     }
   }
 
@@ -9198,13 +9287,19 @@ int SaveEXR(const float *data, int width, int height, int components,
     images[3].resize(pixel_count);
 
     // Split RGB(A)RGB(A)RGB(A)... into R, G and B(and A) layers
-    for (size_t i = 0; i < pixel_count; i++) {
-      images[0][i] = data[static_cast<size_t>(components) * i + 0];
-      images[1][i] = data[static_cast<size_t>(components) * i + 1];
-      images[2][i] = data[static_cast<size_t>(components) * i + 2];
-      if (components == 4) {
-        images[3][i] = data[static_cast<size_t>(components) * i + 3];
-      }
+    if (components == 4) {
+       for (size_t i = 0; i < pixel_count; i++) {
+         images[0][i] = data[static_cast<size_t>(components) * i + 0];
+         images[1][i] = data[static_cast<size_t>(components) * i + 1];
+         images[2][i] = data[static_cast<size_t>(components) * i + 2];
+         images[3][i] = data[static_cast<size_t>(components) * i + 3];
+       }
+    } else {
+       for (size_t i = 0; i < pixel_count; i++) {
+         images[0][i] = data[static_cast<size_t>(components) * i + 0];
+         images[1][i] = data[static_cast<size_t>(components) * i + 1];
+         images[2][i] = data[static_cast<size_t>(components) * i + 2];
+       }
     }
   }