Skip to content

Commit

Permalink
Merge pull request #214 from dbPhilips/perf_enhancements
Browse files Browse the repository at this point in the history
Perf enhancements
  • Loading branch information
syoyo authored Jan 22, 2025
2 parents 756f7d3 + 33e0472 commit 648e47b
Show file tree
Hide file tree
Showing 2 changed files with 128 additions and 31 deletions.
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -159,10 +159,12 @@ Include `tinyexr.h` with `TINYEXR_IMPLEMENTATION` flag (do this only for **one**
* `TINYEXR_USE_MINIZ` Use miniz (default = 1). Please include the `zlib.h` header before `tinyexr.h` if you disable miniz support (e.g. to use the system's zlib).
* `TINYEXR_USE_STB_ZLIB` Use zlib from `stb_image[_write].h` instead of miniz or the system's zlib (default = 0).
* `TINYEXR_USE_PIZ` Enable PIZ compression support (default = 1)
* `TINYEXR_USE_ZFP` Enable ZFP compression supoort (TinyEXR extension, default = 0)
* `TINYEXR_USE_THREAD` Enable threaded loading using C++11 thread (Requires C++11 compiler, default = 0)
* `TINYEXR_USE_ZFP` Enable ZFP compression support (TinyEXR extension, default = 0)
* `TINYEXR_USE_THREAD` Enable threaded loading/storing using C++11 thread (Requires C++11 compiler, default = 0)
* Set `TINYEXR_MAX_THREADS` to a value greater than 0 to use MIN(TINYEXR_MAX_THREADS, hardware_concurrency()) threads instead of hardware_concurrency(). (default = 0)
* `TINYEXR_USE_OPENMP` Enable OpenMP threading support (default = 1 if `_OPENMP` is defined)
* Use `TINYEXR_USE_OPENMP=0` to force disable OpenMP code path even if OpenMP is available/enabled in the compiler.
* `TINYEXR_USE_COMPILER_FP16` Enable use of compiler provided FP16<>FP32 conversions when available (default = 0)

### Quickly reading RGB(A) EXR file.

Expand Down
153 changes: 124 additions & 29 deletions tinyexr.h
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,11 @@ extern "C" {

// Threading configuration.
// If the includer did not define TINYEXR_USE_THREAD, it defaults to 0.
// If the includer did define it, TINYEXR_MAX_THREADS also gets a default of 0.
// Note: in the first case TINYEXR_MAX_THREADS is not defined by this block.
#ifndef TINYEXR_USE_THREAD
#define TINYEXR_USE_THREAD (0) // No threaded loading.
// http://computation.llnl.gov/projects/floating-point-compression
#else
// When threading is used, a custom upper bound on the number of worker
// threads can be specified by defining TINYEXR_MAX_THREADS.
#ifndef TINYEXR_MAX_THREADS // Not defined by the user: default to 0, meaning the upper limit is taken from hardware_concurrency().
#define TINYEXR_MAX_THREADS (0)
#endif
#endif

#ifndef TINYEXR_USE_OPENMP
Expand All @@ -142,6 +146,41 @@ extern "C" {
#endif
#endif

// Opt-in switch for compiler-provided FP16<->FP32 conversions.
// Defaults to 0 (disabled) unless the includer defines it beforehand.
#ifndef TINYEXR_USE_COMPILER_FP16
#define TINYEXR_USE_COMPILER_FP16 (0)
#endif

// When the switch is on, pick TINYEXR_FP16_COMPILER_TYPE for the current
// toolchain/target. The macro is left undefined if none of the combinations
// below matches.
#if TINYEXR_USE_COMPILER_FP16
#ifndef _MSC_VER
// Non-MSVC toolchains: only GCC-compatible compilers are considered.
#if defined( __GNUC__ ) || defined( __clang__ )
#if defined( __SSE2__ )
// SSE2 targets: _Float16 is selected only when __GNUC__ > 11 or
// __clang_major__ > 14.
#if ( __GNUC__ > 11 ) || ( __clang_major__ > 14 )
// Defined before the C headers are included; presumably this requests the
// IEC 60559 extended-type declarations from them (TODO confirm).
#ifndef __STDC_WANT_IEC_60559_TYPES_EXT__
#define __STDC_WANT_IEC_60559_TYPES_EXT__
#endif
#include <float.h>
#include <math.h>
#define TINYEXR_FP16_COMPILER_TYPE _Float16
#endif
#endif
// ARM NEON targets: use the __fp16 type.
#if defined( __ARM_NEON__ ) || defined( __ARM_NEON )
#define TINYEXR_FP16_COMPILER_TYPE __fp16
#endif
#endif
#else
// MSVC: only x86/x64 builds with __AVX2__ defined are supported. The half
// value is kept as raw 16-bit storage (uint16_t) rather than an arithmetic
// half type.
#if (defined(_M_IX86) || defined(_M_X64)) && defined(__AVX2__)
#include <intrin.h>
#define TINYEXR_FP16_COMPILER_TYPE uint16_t
#endif
#endif
#endif

// TINYEXR_HAS_FP16_COMPILER_TYPE is 1 exactly when a compiler FP16 type was
// selected above, and 0 otherwise.
#ifdef TINYEXR_FP16_COMPILER_TYPE
#define TINYEXR_HAS_FP16_COMPILER_TYPE (1)
#else
#define TINYEXR_HAS_FP16_COMPILER_TYPE (0)
#endif

#define TINYEXR_SUCCESS (0)
#define TINYEXR_ERROR_INVALID_MAGIC_NUMBER (-1)
#define TINYEXR_ERROR_INVALID_EXR_VERSION (-2)
Expand Down Expand Up @@ -771,15 +810,15 @@ static void SetWarningMessage(const std::string &msg, const char **warn) {

static const int kEXRVersionSize = 8;

static void cpy2(unsigned short *dst_val, const unsigned short *src_val) {
// Copies one 2-byte value from src_val to dst_val one byte at a time.
// Going through unsigned char pointers means neither pointer has to be
// aligned for unsigned short access.
static inline void cpy2(unsigned short *dst_val, const unsigned short *src_val) {
  const unsigned char *from = reinterpret_cast<const unsigned char *>(src_val);
  unsigned char *to = reinterpret_cast<unsigned char *>(dst_val);

  for (int k = 0; k < 2; ++k) {
    to[k] = from[k];
  }
}

static void swap2(unsigned short *val) {
static void inline swap2(unsigned short *val) {
#if TINYEXR_LITTLE_ENDIAN
(void)val;
#else
Expand All @@ -801,7 +840,7 @@ static void swap2(unsigned short *val) {
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-function"
#endif
static void cpy4(int *dst_val, const int *src_val) {
static void inline cpy4(int *dst_val, const int *src_val) {
unsigned char *dst = reinterpret_cast<unsigned char *>(dst_val);
const unsigned char *src = reinterpret_cast<const unsigned char *>(src_val);

Expand All @@ -811,7 +850,7 @@ static void cpy4(int *dst_val, const int *src_val) {
dst[3] = src[3];
}

static void cpy4(unsigned int *dst_val, const unsigned int *src_val) {
static void inline cpy4(unsigned int *dst_val, const unsigned int *src_val) {
unsigned char *dst = reinterpret_cast<unsigned char *>(dst_val);
const unsigned char *src = reinterpret_cast<const unsigned char *>(src_val);

Expand All @@ -821,7 +860,7 @@ static void cpy4(unsigned int *dst_val, const unsigned int *src_val) {
dst[3] = src[3];
}

static void cpy4(float *dst_val, const float *src_val) {
static void inline cpy4(float *dst_val, const float *src_val) {
unsigned char *dst = reinterpret_cast<unsigned char *>(dst_val);
const unsigned char *src = reinterpret_cast<const unsigned char *>(src_val);

Expand All @@ -838,7 +877,7 @@ static void cpy4(float *dst_val, const float *src_val) {
#pragma GCC diagnostic pop
#endif

static void swap4(unsigned int *val) {
static void inline swap4(unsigned int *val) {
#if TINYEXR_LITTLE_ENDIAN
(void)val;
#else
Expand All @@ -853,7 +892,7 @@ static void swap4(unsigned int *val) {
#endif
}

static void swap4(int *val) {
static void inline swap4(int *val) {
#if TINYEXR_LITTLE_ENDIAN
(void)val;
#else
Expand All @@ -868,7 +907,7 @@ static void swap4(int *val) {
#endif
}

static void swap4(float *val) {
static void inline swap4(float *val) {
#if TINYEXR_LITTLE_ENDIAN
(void)val;
#else
Expand All @@ -884,7 +923,7 @@ static void swap4(float *val) {
}

#if 0
static void cpy8(tinyexr::tinyexr_uint64 *dst_val, const tinyexr::tinyexr_uint64 *src_val) {
static void inline cpy8(tinyexr::tinyexr_uint64 *dst_val, const tinyexr::tinyexr_uint64 *src_val) {
unsigned char *dst = reinterpret_cast<unsigned char *>(dst_val);
const unsigned char *src = reinterpret_cast<const unsigned char *>(src_val);

Expand All @@ -899,7 +938,7 @@ static void cpy8(tinyexr::tinyexr_uint64 *dst_val, const tinyexr::tinyexr_uint64
}
#endif

static void swap8(tinyexr::tinyexr_uint64 *val) {
static void inline swap8(tinyexr::tinyexr_uint64 *val) {
#if TINYEXR_LITTLE_ENDIAN
(void)val;
#else
Expand All @@ -919,6 +958,11 @@ static void swap8(tinyexr::tinyexr_uint64 *val) {
}

// https://gist.github.com/rygorous/2156668
#if TINYEXR_HAS_FP16_COMPILER_TYPE && (TINYEXR_USE_COMPILER_FP16 > 0)
// Reduced FP32 wrapper for builds where a compiler FP16 type is available and
// enabled: it exposes only the float value `f`.
union FP32 {
float f;
};
#else
union FP32 {
unsigned int u;
float f;
Expand All @@ -934,12 +978,21 @@ union FP32 {
#endif
} s;
};
#endif

#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wpadded"
#endif

#if TINYEXR_HAS_FP16_COMPILER_TYPE && (TINYEXR_USE_COMPILER_FP16 > 0)
// FP16 wrapper for builds where a compiler FP16 type is available and enabled.
// `f` holds the value as TINYEXR_FP16_COMPILER_TYPE; `u` is an unsigned short
// member sharing the same storage.
union FP16 {
TINYEXR_FP16_COMPILER_TYPE f;
unsigned short u;
};

#else

union FP16 {
unsigned short u;
struct {
Expand All @@ -954,11 +1007,32 @@ union FP16 {
#endif
} s;
};
#endif

#ifdef __clang__
#pragma clang diagnostic pop
#endif

#if TINYEXR_HAS_FP16_COMPILER_TYPE && (TINYEXR_USE_COMPILER_FP16 > 0)
// Converts a half-precision value to single precision.
// On MSVC x86/x64 with __AVX2__ the raw 16 bits in h.u go through
// _mm_cvtph_ps; on every other configuration h.f is cast to float.
static inline FP32 half_to_float(FP16 h) {
  FP32 result;
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) && defined(__AVX2__)
  const __m128i half_bits = _mm_cvtsi32_si128(static_cast<int>(h.u));
  const __m128 widened = _mm_cvtph_ps(half_bits);
  result.f = _mm_cvtss_f32(widened);
#else
  result.f = static_cast<float>(h.f);
#endif
  return result;
}
// Converts a single-precision value to half precision.
// On MSVC x86/x64 with __AVX2__ the value goes through _mm_cvtps_ph with
// _MM_FROUND_CUR_DIRECTION and the low lane is stored as the half's bits;
// on every other configuration f.f is cast to the compiler FP16 type.
static inline FP16 float_to_half_full(FP32 f) {
  FP16 result;
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) && defined(__AVX2__)
  const __m128 single = _mm_set_ss(f.f);
  const __m128i packed_half = _mm_cvtps_ph(single, _MM_FROUND_CUR_DIRECTION);
  result.f = static_cast<TINYEXR_FP16_COMPILER_TYPE>(_mm_cvtsi128_si32(packed_half));
#else
  result.f = static_cast<TINYEXR_FP16_COMPILER_TYPE>(f.f);
#endif
  return result;
}
#else
static FP32 half_to_float(FP16 h) {
static const FP32 magic = {113 << 23};
static const unsigned int shifted_exp = 0x7c00
Expand Down Expand Up @@ -1018,7 +1092,7 @@ static FP16 float_to_half_full(FP32 f) {
o.s.Sign = f.s.Sign;
return o;
}

#endif
// NOTE: From OpenEXR code
// #define IMF_INCREASING_Y 0
// #define IMF_DECREASING_Y 1
Expand Down Expand Up @@ -4930,10 +5004,12 @@ static int DecodeTiledLevel(EXRImage* exr_image, const EXRHeader* exr_header,
std::atomic<int> tile_count(0);

int num_threads = std::max(1, int(std::thread::hardware_concurrency()));
#if (TINYEXR_MAX_THREADS > 0)
num_threads = std::min(num_threads,TINYEXR_MAX_THREADS);
#endif
if (num_threads > int(num_tiles)) {
num_threads = int(num_tiles);
}

for (int t = 0; t < num_threads; t++) {
workers.emplace_back(std::thread([&]()
{
Expand Down Expand Up @@ -5286,10 +5362,12 @@ static int DecodeChunk(EXRImage *exr_image, const EXRHeader *exr_header,
std::atomic<int> y_count(0);

int num_threads = std::max(1, int(std::thread::hardware_concurrency()));
#if (TINYEXR_MAX_THREADS > 0)
num_threads = std::min(num_threads,TINYEXR_MAX_THREADS);
#endif
if (num_threads > int(num_blocks)) {
num_threads = int(num_blocks);
}

for (int t = 0; t < num_threads; t++) {
workers.emplace_back(std::thread([&]() {
int y = 0;
Expand Down Expand Up @@ -7268,6 +7346,9 @@ static int EncodeTiledLevel(const EXRImage* level_image, const EXRHeader* exr_he
std::atomic<int> tile_count(0);

int num_threads = std::max(1, int(std::thread::hardware_concurrency()));
#if (TINYEXR_MAX_THREADS > 0)
num_threads = std::min(num_threads,TINYEXR_MAX_THREADS);
#endif
if (num_threads > int(num_tiles)) {
num_threads = int(num_tiles);
}
Expand Down Expand Up @@ -7517,7 +7598,9 @@ static int EncodeChunk(const EXRImage* exr_image, const EXRHeader* exr_header,
std::atomic<int> block_count(0);

int num_threads = std::min(std::max(1, int(std::thread::hardware_concurrency())), num_blocks);

#if (TINYEXR_MAX_THREADS > 0)
num_threads = std::min(num_threads,TINYEXR_MAX_THREADS);
#endif
for (int t = 0; t < num_threads; t++) {
workers.emplace_back(std::thread([&]() {
int i = 0;
Expand Down Expand Up @@ -9047,13 +9130,19 @@ int SaveEXRToMemory(const float *data, int width, int height, int components,
images[3].resize(static_cast<size_t>(width * height));

// Split RGB(A)RGB(A)RGB(A)... into R, G and B(and A) layers
for (size_t i = 0; i < static_cast<size_t>(width * height); i++) {
images[0][i] = data[static_cast<size_t>(components) * i + 0];
images[1][i] = data[static_cast<size_t>(components) * i + 1];
images[2][i] = data[static_cast<size_t>(components) * i + 2];
if (components == 4) {
images[3][i] = data[static_cast<size_t>(components) * i + 3];
}
if (components == 4) {
for (size_t i = 0; i < static_cast<size_t>(width * height); i++) {
images[0][i] = data[static_cast<size_t>(components) * i + 0];
images[1][i] = data[static_cast<size_t>(components) * i + 1];
images[2][i] = data[static_cast<size_t>(components) * i + 2];
images[3][i] = data[static_cast<size_t>(components) * i + 3];
}
} else {
for (size_t i = 0; i < static_cast<size_t>(width * height); i++) {
images[0][i] = data[static_cast<size_t>(components) * i + 0];
images[1][i] = data[static_cast<size_t>(components) * i + 1];
images[2][i] = data[static_cast<size_t>(components) * i + 2];
}
}
}

Expand Down Expand Up @@ -9198,13 +9287,19 @@ int SaveEXR(const float *data, int width, int height, int components,
images[3].resize(pixel_count);

// Split RGB(A)RGB(A)RGB(A)... into R, G and B(and A) layers
for (size_t i = 0; i < pixel_count; i++) {
images[0][i] = data[static_cast<size_t>(components) * i + 0];
images[1][i] = data[static_cast<size_t>(components) * i + 1];
images[2][i] = data[static_cast<size_t>(components) * i + 2];
if (components == 4) {
images[3][i] = data[static_cast<size_t>(components) * i + 3];
}
if (components == 4) {
for (size_t i = 0; i < pixel_count; i++) {
images[0][i] = data[static_cast<size_t>(components) * i + 0];
images[1][i] = data[static_cast<size_t>(components) * i + 1];
images[2][i] = data[static_cast<size_t>(components) * i + 2];
images[3][i] = data[static_cast<size_t>(components) * i + 3];
}
} else {
for (size_t i = 0; i < pixel_count; i++) {
images[0][i] = data[static_cast<size_t>(components) * i + 0];
images[1][i] = data[static_cast<size_t>(components) * i + 1];
images[2][i] = data[static_cast<size_t>(components) * i + 2];
}
}
}

Expand Down

0 comments on commit 648e47b

Please sign in to comment.