From fd89312853800d1e996e6556bc9f0008a0e1502d Mon Sep 17 00:00:00 2001 From: Yermalayeu Ihar Date: Sat, 28 Apr 2018 09:13:41 +0300 Subject: [PATCH] *updated Simd Library. --- src/3rd/Simd/SimdArray.h | 7 +- src/3rd/Simd/SimdAvx1.h | 14 +- src/3rd/Simd/SimdAvx1Float32.cpp | 92 ++ src/3rd/Simd/SimdAvx1Gemm32f.cpp | 627 ++++++++ src/3rd/Simd/SimdAvx1HogLite.cpp | 48 +- src/3rd/Simd/SimdAvx1Neural.cpp | 4 +- src/3rd/Simd/SimdAvx1Resizer.cpp | 135 ++ src/3rd/Simd/SimdAvx1Synet.cpp | 325 +++++ src/3rd/Simd/SimdAvx2.h | 18 +- src/3rd/Simd/SimdAvx2Float16.cpp | 61 +- src/3rd/Simd/SimdAvx2Float32.cpp | 63 +- src/3rd/Simd/SimdAvx2Gemm32f.cpp | 481 +++++++ src/3rd/Simd/SimdAvx2Hog.cpp | 12 +- src/3rd/Simd/SimdAvx2HogLite.cpp | 48 +- src/3rd/Simd/SimdAvx2Neural.cpp | 218 +-- src/3rd/Simd/SimdAvx2Resizer.cpp | 151 ++ src/3rd/Simd/SimdAvx2Statistic.cpp | 42 +- src/3rd/Simd/SimdAvx2Synet.cpp | 238 +++ src/3rd/Simd/SimdAvx512bw.h | 10 +- src/3rd/Simd/SimdAvx512bwFloat16.cpp | 63 +- src/3rd/Simd/SimdAvx512bwFloat32.cpp | 63 +- src/3rd/Simd/SimdAvx512bwHog.cpp | 12 +- src/3rd/Simd/SimdAvx512bwHogLite.cpp | 64 +- src/3rd/Simd/SimdAvx512bwReduceGray4x4.cpp | 2 +- src/3rd/Simd/SimdAvx512bwResizeBilinear.cpp | 32 +- src/3rd/Simd/SimdAvx512bwStatistic.cpp | 58 +- src/3rd/Simd/SimdAvx512f.h | 12 +- src/3rd/Simd/SimdAvx512fGemm32f.cpp | 1055 ++++++++++++++ src/3rd/Simd/SimdAvx512fNeural.cpp | 401 +++--- src/3rd/Simd/SimdAvx512fResizer.cpp | 156 ++ src/3rd/Simd/SimdAvx512fSynet.cpp | 368 +++++ src/3rd/Simd/SimdBase.h | 24 +- src/3rd/Simd/SimdBaseDetection.cpp | 10 +- src/3rd/Simd/SimdBaseFloat16.cpp | 16 +- src/3rd/Simd/SimdBaseFloat32.cpp | 18 +- src/3rd/Simd/SimdBaseGemm32f.cpp | 48 + src/3rd/Simd/SimdBaseHogLite.cpp | 28 +- src/3rd/Simd/SimdBaseNeural.cpp | 4 +- src/3rd/Simd/SimdBaseReduceGray5x5.cpp | 18 +- src/3rd/Simd/SimdBaseResizer.cpp | 257 ++++ src/3rd/Simd/SimdBaseStatistic.cpp | 24 +- src/3rd/Simd/SimdBaseSynet.cpp | 232 +++ src/3rd/Simd/SimdBaseThread.cpp | 45 + 
src/3rd/Simd/SimdBase_tinyxml2.cpp | 964 +++++++++---- src/3rd/Simd/SimdBase_tinyxml2.h | 1435 ++++++++++--------- src/3rd/Simd/SimdConst.h | 10 +- src/3rd/Simd/SimdDetection.h | 7 +- src/3rd/Simd/SimdDetection.hpp | 6 +- src/3rd/Simd/SimdEnable.h | 17 +- src/3rd/Simd/SimdGemm.h | 163 +++ src/3rd/Simd/SimdLib.cpp | 190 ++- src/3rd/Simd/SimdLib.h | 359 ++++- src/3rd/Simd/SimdLib.hpp | 62 +- src/3rd/Simd/SimdLoad.h | 16 +- src/3rd/Simd/SimdMath.h | 35 +- src/3rd/Simd/SimdMemory.h | 17 +- src/3rd/Simd/SimdNeon.h | 5 +- src/3rd/Simd/SimdNeonResizeBilinear.cpp | 171 ++- src/3rd/Simd/SimdNeonStatistic.cpp | 45 +- src/3rd/Simd/SimdNeural.hpp | 8 +- src/3rd/Simd/SimdParallel.hpp | 10 +- src/3rd/Simd/SimdPow.h | 205 +++ src/3rd/Simd/SimdResizer.h | 139 ++ src/3rd/Simd/SimdSse1.h | 12 +- src/3rd/Simd/SimdSse1Float32.cpp | 92 ++ src/3rd/Simd/SimdSse1Gemm32f.cpp | 595 ++++++++ src/3rd/Simd/SimdSse1Resizer.cpp | 118 ++ src/3rd/Simd/SimdSse1Synet.cpp | 325 +++++ src/3rd/Simd/SimdSse2.h | 6 +- src/3rd/Simd/SimdSse2Float32.cpp | 4 +- src/3rd/Simd/SimdSse2Neural.cpp | 95 +- src/3rd/Simd/SimdSse2ReduceGray2x2.cpp | 9 +- src/3rd/Simd/SimdSse2Statistic.cpp | 45 +- src/3rd/Simd/SimdSse2Synet.cpp | 91 ++ src/3rd/Simd/SimdSse3Neural.cpp | 4 +- src/3rd/Simd/SimdSse41.h | 2 +- src/3rd/Simd/SimdSse41Hog.cpp | 12 +- src/3rd/Simd/SimdSse41HogLite.cpp | 48 +- src/3rd/Simd/SimdStore.h | 12 + src/3rd/Simd/SimdVersion.h | 8 +- src/3rd/Simd/SimdView.hpp | 113 +- 81 files changed, 9526 insertions(+), 1533 deletions(-) create mode 100644 src/3rd/Simd/SimdAvx1Float32.cpp create mode 100644 src/3rd/Simd/SimdAvx1Gemm32f.cpp create mode 100644 src/3rd/Simd/SimdAvx1Resizer.cpp create mode 100644 src/3rd/Simd/SimdAvx1Synet.cpp create mode 100644 src/3rd/Simd/SimdAvx2Gemm32f.cpp create mode 100644 src/3rd/Simd/SimdAvx2Resizer.cpp create mode 100644 src/3rd/Simd/SimdAvx2Synet.cpp create mode 100644 src/3rd/Simd/SimdAvx512fGemm32f.cpp create mode 100644 src/3rd/Simd/SimdAvx512fResizer.cpp create mode 100644 
src/3rd/Simd/SimdAvx512fSynet.cpp create mode 100644 src/3rd/Simd/SimdBaseGemm32f.cpp create mode 100644 src/3rd/Simd/SimdBaseResizer.cpp create mode 100644 src/3rd/Simd/SimdBaseSynet.cpp create mode 100644 src/3rd/Simd/SimdBaseThread.cpp create mode 100644 src/3rd/Simd/SimdGemm.h create mode 100644 src/3rd/Simd/SimdPow.h create mode 100644 src/3rd/Simd/SimdResizer.h create mode 100644 src/3rd/Simd/SimdSse1Float32.cpp create mode 100644 src/3rd/Simd/SimdSse1Gemm32f.cpp create mode 100644 src/3rd/Simd/SimdSse1Resizer.cpp create mode 100644 src/3rd/Simd/SimdSse1Synet.cpp create mode 100644 src/3rd/Simd/SimdSse2Synet.cpp diff --git a/src/3rd/Simd/SimdArray.h b/src/3rd/Simd/SimdArray.h index 73476787..de2e917b 100644 --- a/src/3rd/Simd/SimdArray.h +++ b/src/3rd/Simd/SimdArray.h @@ -25,6 +25,7 @@ #define __SimdArray_h__ #include "Simd/SimdMemory.h" +#include "Simd/SimdEnable.h" namespace Simd { @@ -33,7 +34,7 @@ namespace Simd T * const data; size_t const size; - SIMD_INLINE Array(size_t size_ = 0, bool clear = false) + SIMD_INLINE Array(size_t size_ = 0, bool clear = false, size_t align = SIMD_ALIGN) : data(0) , size(0) { @@ -46,7 +47,7 @@ namespace Simd Simd::Free(data); } - SIMD_INLINE void Resize(size_t size_, bool clear = false) + SIMD_INLINE void Resize(size_t size_, bool clear = false, size_t align = SIMD_ALIGN) { if (size_ != size) { @@ -54,7 +55,7 @@ namespace Simd Simd::Free(data); *(size_t*)&size = size_; if (size_) - *(T**)&data = (T*)Simd::Allocate(size * sizeof(T)); + *(T**)&data = (T*)Simd::Allocate(size * sizeof(T), align); } if (clear) Clear(); diff --git a/src/3rd/Simd/SimdAvx1.h b/src/3rd/Simd/SimdAvx1.h index 93b96e07..33f76cd3 100644 --- a/src/3rd/Simd/SimdAvx1.h +++ b/src/3rd/Simd/SimdAvx1.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -31,7 +31,11 @@ namespace Simd #ifdef SIMD_AVX_ENABLE namespace Avx { - void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride); + void CosineDistance32f(const float * a, const float * b, size_t size, float * distance); + + void Gemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc); + + void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride); void HogLiteResizeFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight); @@ -100,6 +104,12 @@ namespace Simd void SquaredDifferenceKahanSum32f(const float * a, const float * b, size_t size, float * sum); void SvmSumLinear(const float * x, const float * svs, const float * weights, size_t length, size_t count, float * sum); + + void SynetAddBias(const float * bias, size_t count, size_t size, float * dst); + + void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst); + + void SynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst); } #endif// SIMD_AVX_ENABLE } diff --git a/src/3rd/Simd/SimdAvx1Float32.cpp b/src/3rd/Simd/SimdAvx1Float32.cpp new file mode 100644 index 00000000..78de5333 --- /dev/null +++ 
b/src/3rd/Simd/SimdAvx1Float32.cpp @@ -0,0 +1,92 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdStore.h" +#include "Simd/SimdExtract.h" + +namespace Simd +{ +#ifdef SIMD_AVX_ENABLE + namespace Avx + { + template void CosineDistance32f(const float * a, const float * b, size_t size, float * distance) + { + if (align) + assert(Aligned(a) && Aligned(b)); + + size_t partialAlignedSize = AlignLo(size, F); + size_t fullAlignedSize = AlignLo(size, DF); + size_t i = 0; + __m256 _aa[2] = { _mm256_setzero_ps(), _mm256_setzero_ps() }; + __m256 _ab[2] = { _mm256_setzero_ps(), _mm256_setzero_ps() }; + __m256 _bb[2] = { _mm256_setzero_ps(), _mm256_setzero_ps() }; + if (fullAlignedSize) + { + for (; i < fullAlignedSize; i += DF) + { + __m256 a0 = Load(a + i + 0 * F); + __m256 b0 = Load(b + i + 0 * F); + _aa[0] = _mm256_add_ps(_aa[0], _mm256_mul_ps(a0, a0)); + _ab[0] = _mm256_add_ps(_ab[0], _mm256_mul_ps(a0, b0)); + _bb[0] = _mm256_add_ps(_bb[0], _mm256_mul_ps(b0, b0)); + __m256 a1 = Load(a + i + 1 * F); + __m256 b1 = Load(b + i + 1 * F); + _aa[1] = _mm256_add_ps(_aa[1], _mm256_mul_ps(a1, a1)); + _ab[1] = _mm256_add_ps(_ab[1], _mm256_mul_ps(a1, b1)); + _bb[1] = _mm256_add_ps(_bb[1], _mm256_mul_ps(b1, b1)); + } + _aa[0] = _mm256_add_ps(_aa[0], _aa[1]); + _ab[0] = _mm256_add_ps(_ab[0], _ab[1]); + _bb[0] = _mm256_add_ps(_bb[0], _bb[1]); + } + for (; i < partialAlignedSize; i += F) + { + __m256 a0 = Load(a + i); + __m256 b0 = Load(b + i); + _aa[0] = _mm256_add_ps(_aa[0], _mm256_mul_ps(a0, a0)); + _ab[0] = _mm256_add_ps(_ab[0], _mm256_mul_ps(a0, b0)); + _bb[0] = _mm256_add_ps(_bb[0], _mm256_mul_ps(b0, b0)); + } + float aa = ExtractSum(_aa[0]), ab = ExtractSum(_ab[0]), bb = ExtractSum(_bb[0]); + for (; i < size; ++i) + { + float _a = a[i]; + float _b = b[i]; + aa += _a * _a; + ab += _a * _b; + bb += _b * _b; + } + *distance = 1.0f - ab / ::sqrt(aa*bb); + } + + void CosineDistance32f(const float * a, const float * b, size_t size, float * distance) + { + if (Aligned(a) && Aligned(b)) + CosineDistance32f(a, b, size, distance); + 
else + CosineDistance32f(a, b, size, distance); + } + } +#endif// SIMD_AVX_ENABLE +} diff --git a/src/3rd/Simd/SimdAvx1Gemm32f.cpp b/src/3rd/Simd/SimdAvx1Gemm32f.cpp new file mode 100644 index 00000000..99fa9dd1 --- /dev/null +++ b/src/3rd/Simd/SimdAvx1Gemm32f.cpp @@ -0,0 +1,627 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdStore.h" +#include "Simd/SimdGemm.h" + +namespace Simd +{ +#ifdef SIMD_AVX_ENABLE + namespace Avx + { + SIMD_INLINE void AddProduct(float * ptr, __m256 value, __m256 alpha) + { + _mm256_storeu_ps(ptr, _mm256_add_ps(_mm256_mul_ps(value, alpha), _mm256_loadu_ps(ptr))); + } + + SIMD_INLINE void AddProduct(float * ptr, __m256 value, __m256 alpha, size_t tail) + { + if (tail == F) + AddProduct(ptr, value, alpha); + else + { + float tmp[F]; + _mm256_storeu_ps(tmp, _mm256_add_ps(_mm256_mul_ps(value, alpha), _mm256_loadu_ps(ptr))); + for (size_t i = 0; i < tail; ++i) + ptr[i] = tmp[i]; + } + } + + static void Kernel4x24(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m256 c00 = _mm256_setzero_ps(); + __m256 c10 = _mm256_setzero_ps(); + __m256 c20 = _mm256_setzero_ps(); + __m256 c30 = _mm256_setzero_ps(); + __m256 c01 = _mm256_setzero_ps(); + __m256 c11 = _mm256_setzero_ps(); + __m256 c21 = _mm256_setzero_ps(); + __m256 c31 = _mm256_setzero_ps(); + __m256 c02 = _mm256_setzero_ps(); + __m256 c12 = _mm256_setzero_ps(); + __m256 c22 = _mm256_setzero_ps(); + __m256 c32 = _mm256_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + __m256 b0, b1, b2, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm256_loadu_ps(B + 0 * F); + b1 = _mm256_loadu_ps(B + 1 * F); + b2 = _mm256_loadu_ps(B + 2 * F); + a0 = _mm256_set1_ps(*A0++); + c00 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c00); + c01 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c01); + c02 = _mm256_add_ps(_mm256_mul_ps(a0, b2), c02); + a0 = _mm256_set1_ps(*A1++); + c10 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c10); + c11 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c11); + c12 = _mm256_add_ps(_mm256_mul_ps(a0, b2), c12); + a0 = _mm256_set1_ps(*A2++); + c20 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c20); + c21 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c21); + 
c22 = _mm256_add_ps(_mm256_mul_ps(a0, b2), c22); + a0 = _mm256_set1_ps(*A3++); + c30 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c30); + c31 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c31); + c32 = _mm256_add_ps(_mm256_mul_ps(a0, b2), c32); + B += ldb; + } + __m256 _alpha = _mm256_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00); + AddProduct(C + 1 * F, _alpha, c01); + AddProduct(C + 2 * F, _alpha, c02, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10); + AddProduct(C + 1 * F, _alpha, c11); + AddProduct(C + 2 * F, _alpha, c12, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20); + AddProduct(C + 1 * F, _alpha, c21); + AddProduct(C + 2 * F, _alpha, c22, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30); + AddProduct(C + 1 * F, _alpha, c31); + AddProduct(C + 2 * F, _alpha, c32, tail); + } + + static void Kernel4x16(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m256 c00 = _mm256_setzero_ps(); + __m256 c10 = _mm256_setzero_ps(); + __m256 c20 = _mm256_setzero_ps(); + __m256 c30 = _mm256_setzero_ps(); + __m256 c01 = _mm256_setzero_ps(); + __m256 c11 = _mm256_setzero_ps(); + __m256 c21 = _mm256_setzero_ps(); + __m256 c31 = _mm256_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + __m256 b0, b1, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm256_loadu_ps(B + 0 * F); + b1 = _mm256_loadu_ps(B + 1 * F); + a0 = _mm256_set1_ps(*A0++); + c00 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c00); + c01 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c01); + a0 = _mm256_set1_ps(*A1++); + c10 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c10); + c11 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c11); + a0 = _mm256_set1_ps(*A2++); + c20 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c20); + c21 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c21); + a0 = _mm256_set1_ps(*A3++); + c30 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c30); + c31 = 
_mm256_add_ps(_mm256_mul_ps(a0, b1), c31); + B += ldb; + } + __m256 _alpha = _mm256_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00); + AddProduct(C + 1 * F, _alpha, c01, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10); + AddProduct(C + 1 * F, _alpha, c11, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20); + AddProduct(C + 1 * F, _alpha, c21, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30); + AddProduct(C + 1 * F, _alpha, c31, tail); + } + + static void Kernel4x8(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m256 c0 = _mm256_setzero_ps(); + __m256 c1 = _mm256_setzero_ps(); + __m256 c2 = _mm256_setzero_ps(); + __m256 c3 = _mm256_setzero_ps(); + const float * a0 = A + lda * 0; + const float * a1 = A + lda * 1; + const float * a2 = A + lda * 2; + const float * a3 = A + lda * 3; + __m256 b0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm256_loadu_ps(B); + c0 = _mm256_add_ps(_mm256_mul_ps(b0, _mm256_set1_ps(*a0++)), c0); + c1 = _mm256_add_ps(_mm256_mul_ps(b0, _mm256_set1_ps(*a1++)), c1); + c2 = _mm256_add_ps(_mm256_mul_ps(b0, _mm256_set1_ps(*a2++)), c2); + c3 = _mm256_add_ps(_mm256_mul_ps(b0, _mm256_set1_ps(*a3++)), c3); + B += ldb; + } + __m256 _alpha = _mm256_set1_ps(alpha); + AddProduct(C + 0 * ldc, _alpha, c0, tail); + AddProduct(C + 1 * ldc, _alpha, c1, tail); + AddProduct(C + 2 * ldc, _alpha, c2, tail); + AddProduct(C + 3 * ldc, _alpha, c3, tail); + } + + static void Kernel6x16(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m256 c00 = _mm256_setzero_ps(); + __m256 c10 = _mm256_setzero_ps(); + __m256 c20 = _mm256_setzero_ps(); + __m256 c30 = _mm256_setzero_ps(); + __m256 c40 = _mm256_setzero_ps(); + __m256 c50 = _mm256_setzero_ps(); + __m256 c01 = _mm256_setzero_ps(); + __m256 c11 = _mm256_setzero_ps(); + __m256 c21 = _mm256_setzero_ps(); + __m256 c31 = _mm256_setzero_ps(); + __m256 
c41 = _mm256_setzero_ps(); + __m256 c51 = _mm256_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + const float * A4 = A + lda * 4; + const float * A5 = A + lda * 5; + __m256 b0, b1, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm256_loadu_ps(B + 0 * F); + b1 = _mm256_loadu_ps(B + 1 * F); + a0 = _mm256_set1_ps(*A0++); + c00 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c00); + c01 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c01); + a0 = _mm256_set1_ps(*A1++); + c10 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c10); + c11 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c11); + a0 = _mm256_set1_ps(*A2++); + c20 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c20); + c21 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c21); + a0 = _mm256_set1_ps(*A3++); + c30 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c30); + c31 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c31); + a0 = _mm256_set1_ps(*A4++); + c40 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c40); + c41 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c41); + a0 = _mm256_set1_ps(*A5++); + c50 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c50); + c51 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c51); + B += ldb; + } + __m256 _alpha = _mm256_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00); + AddProduct(C + 1 * F, _alpha, c01, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10); + AddProduct(C + 1 * F, _alpha, c11, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20); + AddProduct(C + 1 * F, _alpha, c21, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30); + AddProduct(C + 1 * F, _alpha, c31, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c40); + AddProduct(C + 1 * F, _alpha, c41, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c50); + AddProduct(C + 1 * F, _alpha, c51, tail); + } + + static void Kernel6x8(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m256 c00 = _mm256_setzero_ps(); + __m256 c10 = 
_mm256_setzero_ps(); + __m256 c20 = _mm256_setzero_ps(); + __m256 c30 = _mm256_setzero_ps(); + __m256 c40 = _mm256_setzero_ps(); + __m256 c50 = _mm256_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + const float * A4 = A + lda * 4; + const float * A5 = A + lda * 5; + __m256 b0, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm256_loadu_ps(B + 0 * F); + a0 = _mm256_set1_ps(*A0++); + c00 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c00); + a0 = _mm256_set1_ps(*A1++); + c10 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c10); + a0 = _mm256_set1_ps(*A2++); + c20 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c20); + a0 = _mm256_set1_ps(*A3++); + c30 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c30); + a0 = _mm256_set1_ps(*A4++); + c40 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c40); + a0 = _mm256_set1_ps(*A5++); + c50 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c50); + B += ldb; + } + __m256 _alpha = _mm256_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c40, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c50, tail); + } + + static void KernelMx24(size_t M, size_t N, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m256 c[4][3]; + const float * a[4]; + for (size_t i = 0; i < M; ++i) + { + c[i][0] = _mm256_setzero_ps(); + c[i][1] = _mm256_setzero_ps(); + c[i][2] = _mm256_setzero_ps(); + a[i] = A + lda * i; + } + __m256 b0, b1, b2, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm256_loadu_ps(B + 0 * F); + b1 = _mm256_loadu_ps(B + 1 * F); + b2 = _mm256_loadu_ps(B + 2 * F); + for (size_t i = 0; i < M; ++i) + { + a0 = _mm256_set1_ps(*a[i]++); + c[i][0] = _mm256_add_ps(_mm256_mul_ps(b0, a0), c[i][0]); + 
c[i][1] = _mm256_add_ps(_mm256_mul_ps(b1, a0), c[i][1]); + c[i][2] = _mm256_add_ps(_mm256_mul_ps(b2, a0), c[i][2]); + } + B += ldb; + } + __m256 _alpha = _mm256_set1_ps(alpha); + for (size_t i = 0; i < M; ++i) + { + AddProduct(C + 0 * F, _alpha, c[i][0]); + AddProduct(C + 1 * F, _alpha, c[i][1]); + AddProduct(C + 2 * F, _alpha, c[i][2], tail); + C += ldc; + } + } + + static void KernelMx16(size_t M, size_t N, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m256 c[6][2]; + const float * a[6]; + for (size_t i = 0; i < M; ++i) + { + c[i][0] = _mm256_setzero_ps(); + c[i][1] = _mm256_setzero_ps(); + a[i] = A + lda * i; + } + __m256 b0, b1, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm256_loadu_ps(B + 0 * F); + b1 = _mm256_loadu_ps(B + 1 * F); + for (size_t i = 0; i < M; ++i) + { + a0 = _mm256_set1_ps(*a[i]++); + c[i][0] = _mm256_add_ps(_mm256_mul_ps(b0, a0), c[i][0]); + c[i][1] = _mm256_add_ps(_mm256_mul_ps(b1, a0), c[i][1]); + } + B += ldb; + } + __m256 _alpha = _mm256_set1_ps(alpha); + for (size_t i = 0; i < M; ++i) + { + AddProduct(C + 0 * F, _alpha, c[i][0]); + AddProduct(C + 1 * F, _alpha, c[i][1], tail); + C += ldc; + } + } + + static void KernelMx8(size_t M, size_t N, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { +#ifdef SIMD_X64_ENABLE + __m256 c[6]; + const float * a[6]; +#else + __m256 c[4]; + const float * a[4]; +#endif + for (size_t i = 0; i < M; ++i) + { + c[i] = _mm256_setzero_ps(); + a[i] = A + lda * i; + } + __m256 b0, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm256_loadu_ps(B + 0 * F); + for (size_t i = 0; i < M; ++i) + { + a0 = _mm256_set1_ps(*a[i]++); + c[i] = _mm256_add_ps(_mm256_mul_ps(b0, a0), c[i]); + } + B += ldb; + } + __m256 _alpha = _mm256_set1_ps(alpha); + for (size_t i = 0; i < M; ++i) + AddProduct(C + i * ldc, _alpha, c[i], tail); + } + + SIMD_INLINE void ScaleC(float * C, 
__m256 beta) + { + _mm256_storeu_ps(C, _mm256_mul_ps(_mm256_loadu_ps(C), beta)); + } + + void GemmScaleC(size_t M, size_t N, float beta, float * C, size_t ldc) + { + if (beta == 1.0f) + return; + else if (beta == 0.0f) + { + for (size_t i = 0; i < M; ++i) + memset(C + i * ldc, 0, N * sizeof(float)); + } + else + { + size_t NQF = AlignLo(N, QF); + size_t NF = AlignLo(N, F); + __m256 _beta = _mm256_set1_ps(beta); + for (size_t i = 0; i < M; ++i) + { + size_t j = 0; + for (; j < NQF; j += QF) + { + ScaleC(C + j + F * 0, _beta); + ScaleC(C + j + F * 1, _beta); + ScaleC(C + j + F * 2, _beta); + ScaleC(C + j + F * 3, _beta); + } + for (; j < NF; j += F) + ScaleC(C + j, _beta); + for (; j < N; ++j) + C[j] *= beta; + C += ldc; + } + } + } + + void GemmPackB(const float * B, size_t ldb, size_t K, size_t N, size_t microN, float * pB) + { + for (size_t j = 0; j < N; j += microN) + { + size_t n = Simd::Min(microN, N - j); + size_t k = 0; + if (microN == 1 * F) + { + if (n == microN) + { + for (; k < K; ++k) + { + const float * b = B + k * ldb; + _mm256_storeu_ps(pB + 0 * F, _mm256_loadu_ps(b + 0 * F)); + pB += microN; + } + } + else + { + __m256 mask0 = Avx::LeftNotZero(n - 0 * F); + for (; k < K - 1; ++k) + { + const float * b = B + k * ldb; + _mm256_storeu_ps(pB + 0 * F, _mm256_and_ps(mask0, _mm256_loadu_ps(b + 0 * F))); + pB += microN; + } + } + } + else if (microN == 2 * F) + { + if (n == microN) + { + for (; k < K; ++k) + { + const float * b = B + k * ldb; + _mm256_storeu_ps(pB + 0 * F, _mm256_loadu_ps(b + 0 * F)); + _mm256_storeu_ps(pB + 1 * F, _mm256_loadu_ps(b + 1 * F)); + pB += microN; + } + } + else + { + __m256 mask0 = Avx::LeftNotZero(n - 0 * F); + __m256 mask1 = Avx::LeftNotZero(n - 1 * F); + for (; k < K - 1; ++k) + { + const float * b = B + k * ldb; + _mm256_storeu_ps(pB + 0 * F, _mm256_and_ps(mask0, _mm256_loadu_ps(b + 0 * F))); + _mm256_storeu_ps(pB + 1 * F, _mm256_and_ps(mask1, _mm256_loadu_ps(b + 1 * F))); + pB += microN; + } + } + } + else if (microN == 3 * 
F) + { + if (n == microN) + { + for (; k < K; ++k) + { + const float * b = B + k * ldb; + _mm256_storeu_ps(pB + 0 * F, _mm256_loadu_ps(b + 0 * F)); + _mm256_storeu_ps(pB + 1 * F, _mm256_loadu_ps(b + 1 * F)); + _mm256_storeu_ps(pB + 2 * F, _mm256_loadu_ps(b + 2 * F)); + pB += microN; + } + } + else + { + __m256 mask0 = Avx::LeftNotZero(n - 0 * F); + __m256 mask1 = Avx::LeftNotZero(n - 1 * F); + __m256 mask2 = Avx::LeftNotZero(n - 2 * F); + for (; k < K - 1; ++k) + { + const float * b = B + k * ldb; + _mm256_storeu_ps(pB + 0 * F, _mm256_and_ps(mask0, _mm256_loadu_ps(b + 0 * F))); + _mm256_storeu_ps(pB + 1 * F, _mm256_and_ps(mask1, _mm256_loadu_ps(b + 1 * F))); + _mm256_storeu_ps(pB + 2 * F, _mm256_and_ps(mask2, _mm256_loadu_ps(b + 2 * F))); + pB += microN; + } + } + } + for (; k < K; ++k) + { + const float * b = B + k * ldb; + size_t c = 0; + for (; c < n; ++c) + *(pB++) = *(b++); + for (; c < microN; ++c) + *(pB++) = 0; + } + B += microN; + } + } + + static void PackA(const float * src, size_t stride, size_t M, size_t K, size_t cell, float * dst) + { + size_t K4 = AlignLo(K, 4), K8 = AlignLo(K, 8); + for (size_t i = 0; i < M; i += cell) + { + size_t m = Simd::Min(cell, M - i), k = 0; + if (cell == 4 && m == 4) + { + for (; k < K8; k += 8) + { + const float * ps = src + k; + __m256 s0 = _mm256_loadu_ps(ps + 0 * K); + __m256 s1 = _mm256_loadu_ps(ps + 1 * K); + __m256 s2 = _mm256_loadu_ps(ps + 2 * K); + __m256 s3 = _mm256_loadu_ps(ps + 3 * K); + __m256 s00 = _mm256_unpacklo_ps(s0, s2); + __m256 s01 = _mm256_unpacklo_ps(s1, s3); + __m256 s10 = _mm256_unpackhi_ps(s0, s2); + __m256 s11 = _mm256_unpackhi_ps(s1, s3); + __m256 d0 = _mm256_unpacklo_ps(s00, s01); + __m256 d1 = _mm256_unpackhi_ps(s00, s01); + __m256 d2 = _mm256_unpacklo_ps(s10, s11); + __m256 d3 = _mm256_unpackhi_ps(s10, s11); + _mm256_storeu_ps(dst + 0, _mm256_permute2f128_ps(d0, d1, 0x20)); + _mm256_storeu_ps(dst + 8, _mm256_permute2f128_ps(d2, d3, 0x20)); + _mm256_storeu_ps(dst + 16, 
_mm256_permute2f128_ps(d0, d1, 0x31)); + _mm256_storeu_ps(dst + 24, _mm256_permute2f128_ps(d2, d3, 0x31)); + dst += 32; + }; + for (; k < K4; k += 4) + { + const float * ps = src + k; + __m128 s0 = _mm_loadu_ps(ps + 0 * stride); + __m128 s1 = _mm_loadu_ps(ps + 1 * stride); + __m128 s2 = _mm_loadu_ps(ps + 2 * stride); + __m128 s3 = _mm_loadu_ps(ps + 3 * stride); + __m128 s00 = _mm_unpacklo_ps(s0, s2); + __m128 s01 = _mm_unpacklo_ps(s1, s3); + __m128 s10 = _mm_unpackhi_ps(s0, s2); + __m128 s11 = _mm_unpackhi_ps(s1, s3); + _mm_storeu_ps(dst + 0, _mm_unpacklo_ps(s00, s01)); + _mm_storeu_ps(dst + 4, _mm_unpackhi_ps(s00, s01)); + _mm_storeu_ps(dst + 8, _mm_unpacklo_ps(s10, s11)); + _mm_storeu_ps(dst + 12, _mm_unpackhi_ps(s10, s11)); + dst += 16; + } + } + for (; k < K; ++k) + { + for (size_t c = 0; c < m; ++c) + *(dst++) = src[c*stride + k]; + } + src += cell * stride; + } + } + + void Gemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc) + { + const size_t CACHE_L1_SIZE = 32 * 1024; + const size_t CACHE_L2_SIZE = 256 * 1024; + const size_t CACHE_L3_SIZE = 2 * 1024 * 1024; + typedef Simd::GemmNN GemmNN; + GemmNN::Main kernelMM, kernelMT; + GemmNN::Tail kernelTM, kernelTT; + size_t microM, microN, L1, L2; +#ifdef SIMD_X64_ENABLE + if (K > 4024) + { + microM = 6; + microN = 16; + size_t tail = N - AlignLoAny(N, microN); + kernelMM = Kernel6x16; + kernelMT = tail > F ? Kernel6x16 : Kernel6x8; + kernelTM = KernelMx16; + kernelTT = tail > F ? KernelMx16 : KernelMx8; + } + else + { + microM = 4; + microN = 24; + size_t tail = N - AlignLoAny(N, microN); + kernelMM = Kernel4x24; + kernelMT = tail > DF ? Kernel4x24 : (tail > F ? Kernel4x16 : Kernel4x8); + kernelTM = KernelMx24; + kernelTT = tail > DF ? KernelMx24 : (tail > F ? 
KernelMx16 : KernelMx8); + } +#else + microM = 4; + microN = 8; + kernelMM = Kernel4x8; + kernelMT = Kernel4x8; + kernelTM = KernelMx8; + kernelTT = KernelMx8; +#endif + L1 = N > 4024 ? CACHE_L2_SIZE : CACHE_L1_SIZE; + L2 = N > 4024 ? CACHE_L3_SIZE : CACHE_L2_SIZE; + GemmNN gemmNN(M, N, K, microM, microN, L1, L2, CACHE_L3_SIZE, F, + kernelMM, kernelMT, kernelTM, kernelTT, Avx::GemmScaleC, Avx::GemmPackB, NULL); + gemmNN.Run(alpha, A, lda, B, ldb, beta, C, ldc); + } + } +#endif// SIMD_AVX_ENABLE +} diff --git a/src/3rd/Simd/SimdAvx1HogLite.cpp b/src/3rd/Simd/SimdAvx1HogLite.cpp index 00cbe0bf..1ce3a087 100644 --- a/src/3rd/Simd/SimdAvx1HogLite.cpp +++ b/src/3rd/Simd/SimdAvx1HogLite.cpp @@ -51,9 +51,9 @@ namespace Simd sums[3] = _mm256_add_ps(sums[3], _mm256_mul_ps(Load(src + 3 * step), _filter)); } - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterSize, float * dst, size_t dstStride) + template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterWidth, size_t filterHeight, float * dst, size_t dstStride) { - size_t filterStride = featureSize * filterSize; + size_t filterStride = featureSize * filterWidth; size_t alignedDstWidth = AlignLo(dstWidth, 4); for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) { @@ -63,7 +63,7 @@ namespace Simd __m256 sums[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { size_t filterCol = 0; for (; filterCol < filterStride; filterCol += F) @@ -78,7 +78,7 @@ namespace Simd __m256 sum = _mm256_setzero_ps(); const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; const float * pFilter = filter; - for 
(size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { for (size_t filterCol = 0; filterCol < filterStride; filterCol += F) ProductSum1x1(pSrc + filterCol, pFilter + filterCol, sum); @@ -91,9 +91,9 @@ namespace Simd } } - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { - size_t filterStride = featureSize * filterSize; + size_t filterStride = featureSize * filterWidth; size_t alignedDstWidth = AlignLo(dstWidth, 4); __m128 _min = _mm_set1_ps(-FLT_MAX); for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) @@ -109,7 +109,7 @@ namespace Simd __m256 sums[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { size_t filterCol = 0; for (; filterCol < filterStride; filterCol += F) @@ -127,7 +127,7 @@ namespace Simd __m256 sum = _mm256_setzero_ps(); const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { for (size_t filterCol = 0; filterCol < filterStride; filterCol += F) ProductSum1x1(pSrc + filterCol, pFilter + filterCol, sum); @@ -144,53 +144,53 @@ namespace Simd } } - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, 
size_t featureSize, const float * filter, size_t filterSize, float * dst, size_t dstStride) + template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, float * dst, size_t dstStride) { if (featureSize == 16) - Filter(src, srcStride, dstWidth, dstHeight, filter, filterSize, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, dst, dstStride); else - Filter(src, srcStride, dstWidth, dstHeight, filter, filterSize, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, dst, dstStride); } - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { if (featureSize == 16) - Filter(src, srcStride, dstWidth, dstHeight, filter, filterSize, mask, maskStride, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); else - Filter(src, srcStride, dstWidth, dstHeight, filter, filterSize, mask, maskStride, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); } public: - void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t 
filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { assert(featureSize == 8 || featureSize == 16); - assert(srcWidth >= filterSize && srcHeight >= filterSize); + assert(srcWidth >= filterWidth && srcHeight >= filterHeight); - size_t dstWidth = srcWidth - filterSize + 1; - size_t dstHeight = srcHeight - filterSize + 1; + size_t dstWidth = srcWidth - filterWidth + 1; + size_t dstHeight = srcHeight - filterHeight + 1; if (mask) { if (Aligned(src) && Aligned(srcStride) && Aligned(filter)) - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); else - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); } else { if (Aligned(src) && Aligned(srcStride) && Aligned(filter)) - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, dst, dstStride); else - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, dst, dstStride); } } }; - void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float 
* dst, size_t dstStride) { HogLiteFeatureFilter featureFilter; - featureFilter.Run(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + featureFilter.Run(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); } namespace HogLiteFeatureResizerDetail diff --git a/src/3rd/Simd/SimdAvx1Neural.cpp b/src/3rd/Simd/SimdAvx1Neural.cpp index dfd0ac55..054b15fa 100644 --- a/src/3rd/Simd/SimdAvx1Neural.cpp +++ b/src/3rd/Simd/SimdAvx1Neural.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -1764,7 +1764,7 @@ namespace Simd bool Preferable(size_t srcDepth, size_t kernelX, size_t kernelY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, size_t dstWidth, size_t dstHeight, size_t dstDepth) { - if (kernelX == kernelY && kernelX >= 2 && kernelX <= 5 && strideX*strideY*dilationX*dilationY == 1) + if (kernelX == kernelY && kernelX >= 2 && kernelX <= 5 && strideX*strideY*dilationX*dilationY == 1 && dstWidth >= F) { if (dstWidth*dstHeight*kernelX*kernelY >= 8 * 8 * 3 * 3) return true; diff --git a/src/3rd/Simd/SimdAvx1Resizer.cpp b/src/3rd/Simd/SimdAvx1Resizer.cpp new file mode 100644 index 00000000..eb58eccf --- /dev/null +++ b/src/3rd/Simd/SimdAvx1Resizer.cpp @@ -0,0 +1,135 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdResizer.h" +#include "Simd/SimdStore.h" + +namespace Simd +{ +#ifdef SIMD_AVX_ENABLE + namespace Avx + { + ResizerFloatBilinear::ResizerFloatBilinear(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, bool caffeInterp) + : Base::ResizerFloatBilinear(srcX, srcY, dstX, dstY, channels, sizeof(__m256), caffeInterp) + { + } + + void ResizerFloatBilinear::Run(const float * src, size_t srcStride, float * dst, size_t dstStride) const + { + Array32f bx[2]; + bx[0].Resize(_rs); + bx[1].Resize(_rs); + float * pbx[2] = { bx[0].data, bx[1].data }; + int32_t prev = -2; + size_t rsa = AlignLo(_rs, Avx::F); + size_t rsh = AlignLo(_rs, Sse::F); + for (size_t dy = 0; dy < _dy; dy++, dst += dstStride) + { + float fy1 = _ay[dy]; + float fy0 = 1.0f - fy1; + int32_t sy = _iy[dy]; + int32_t k = 0; + + if (sy == prev) + k = 2; + else if (sy == prev + 1) + { + Swap(pbx[0], pbx[1]); + k = 1; + } + + prev = sy; + + for (; k < 2; k++) + { + float * pb = pbx[k]; + const float * ps = src + (sy + k)*srcStride; + size_t dx = 0; + if (_cn == 1) + { + __m256 _1 = _mm256_set1_ps(1.0f); + for (; dx < rsa; dx += Avx::F) + { + __m256 s0145 = Avx::Load(ps + _ix[dx + 0], ps + _ix[dx + 1], ps + _ix[dx + 4], ps + _ix[dx + 5]); + __m256 s2367 = Avx::Load(ps + _ix[dx + 2], ps + _ix[dx + 3], ps + _ix[dx + 6], ps + _ix[dx + 7]); + __m256 fx1 = _mm256_load_ps(_ax.data + dx); + __m256 fx0 = _mm256_sub_ps(_1, fx1); + __m256 m0 = _mm256_mul_ps(fx0, _mm256_shuffle_ps(s0145, s2367, 0x88)); + __m256 m1 = _mm256_mul_ps(fx1, _mm256_shuffle_ps(s0145, s2367, 0xDD)); + _mm256_store_ps(pb + dx, _mm256_add_ps(m0, m1)); + } + for (; dx < rsh; dx += Sse::F) + { + __m128 s01 = Sse::Load(ps + _ix[dx + 0], ps + _ix[dx + 1]); + __m128 s23 = Sse::Load(ps + _ix[dx + 2], ps + _ix[dx + 3]); + __m128 fx1 = _mm_load_ps(_ax.data + dx); + __m128 fx0 = _mm_sub_ps(_mm256_castps256_ps128(_1), fx1); + __m128 m0 = _mm_mul_ps(fx0, _mm_shuffle_ps(s01, s23, 0x88)); + __m128 
m1 = _mm_mul_ps(fx1, _mm_shuffle_ps(s01, s23, 0xDD)); + _mm_store_ps(pb + dx, _mm_add_ps(m0, m1)); + } + } + for (; dx < _rs; dx++) + { + int32_t sx = _ix[dx]; + float fx = _ax[dx]; + pb[dx] = ps[sx] * (1.0f - fx) + ps[sx + _cn] * fx; + } + } + + size_t dx = 0; + __m256 _fy0 = _mm256_set1_ps(fy0); + __m256 _fy1 = _mm256_set1_ps(fy1); + for (; dx < rsa; dx += Avx::F) + { + __m256 m0 = _mm256_mul_ps(_mm256_load_ps(pbx[0] + dx), _fy0); + __m256 m1 = _mm256_mul_ps(_mm256_load_ps(pbx[1] + dx), _fy1); + _mm256_storeu_ps(dst + dx, _mm256_add_ps(m0, m1)); + } + for (; dx < rsh; dx += Sse::F) + { + __m128 m0 = _mm_mul_ps(_mm_load_ps(pbx[0] + dx), _mm256_castps256_ps128(_fy0)); + __m128 m1 = _mm_mul_ps(_mm_load_ps(pbx[1] + dx), _mm256_castps256_ps128(_fy1)); + _mm_storeu_ps(dst + dx, _mm_add_ps(m0, m1)); + } + for (; dx < _rs; dx++) + dst[dx] = pbx[0][dx] * fy0 + pbx[1][dx] * fy1; + } + } + + //--------------------------------------------------------------------- + + void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) + { + if (type == SimdResizeChannelFloat && method == SimdResizeMethodBilinear) + return new ResizerFloatBilinear(srcX, srcY, dstX, dstY, channels, false); + else if (type == SimdResizeChannelFloat && method == SimdResizeMethodCaffeInterp) + return new ResizerFloatBilinear(srcX, srcY, dstX, dstY, channels, true); + else + return Base::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); + } + } +#endif //SIMD_AVX_ENABLE +} + diff --git a/src/3rd/Simd/SimdAvx1Synet.cpp b/src/3rd/Simd/SimdAvx1Synet.cpp new file mode 100644 index 00000000..6f1401b6 --- /dev/null +++ b/src/3rd/Simd/SimdAvx1Synet.cpp @@ -0,0 +1,325 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdStore.h" +#include "Simd/SimdExtract.h" + +namespace Simd +{ +#ifdef SIMD_AVX_ENABLE + namespace Avx + { + template SIMD_INLINE void SynetAddBias(const __m256 & bias, float * dst) + { + Store(dst, _mm256_add_ps(Load(dst), bias)); + } + + template SIMD_INLINE void SynetAddBias(const float * bias, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t partial = AlignLo(size, F); + for (size_t i = 0; i < count; ++i) + { + size_t j = 0; + if (partial) + { + __m256 _bias = _mm256_set1_ps(bias[i]); + for (; j < aligned; j += QF) + { + SynetAddBias(_bias, dst + j + F * 0); + SynetAddBias(_bias, dst + j + F * 1); + SynetAddBias(_bias, dst + j + F * 2); + SynetAddBias(_bias, dst + j + F * 3); + } + for (; j < partial; j += F) + SynetAddBias(_bias, dst + j); + } + for (; j < size; ++j) + dst[j] += bias[i]; + dst += size; + } + } + + void SynetAddBias(const float * bias, size_t count, size_t size, float * dst) + { + if (Aligned(dst) && Aligned(size)) + SynetAddBias(bias, count, size, dst); + else + SynetAddBias(bias, count, size, dst); + } + + template void SynetEltwiseLayerForwardProduct(const float * src0, const float * src1, float * dst, size_t offset) + { + Store(dst + offset, _mm256_mul_ps(Load(src0 + offset), Load(src1 + offset))); + } + + template void SynetEltwiseLayerForwardProduct(float const * const * src, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t partial = AlignLo(size, F); + const float * src0 = src[0]; + const float * src1 = src[1]; + size_t j = 0; + if (partial) + { + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardProduct(src0, src1, dst, j + F * 0); + SynetEltwiseLayerForwardProduct(src0, src1, dst, j + F * 1); + SynetEltwiseLayerForwardProduct(src0, src1, dst, j + F * 2); + SynetEltwiseLayerForwardProduct(src0, src1, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardProduct(src0, 
src1, dst, j); + } + for (; j < size; ++j) + dst[j] = src0[j] * src1[j]; + for (size_t i = 2; i < count; ++i) + { + const float * srci = src[i]; + size_t j = 0; + if (partial) + { + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardProduct(dst, srci, dst, j + F * 0); + SynetEltwiseLayerForwardProduct(dst, srci, dst, j + F * 1); + SynetEltwiseLayerForwardProduct(dst, srci, dst, j + F * 2); + SynetEltwiseLayerForwardProduct(dst, srci, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardProduct(dst, srci, dst, j); + } + for (; j < size; ++j) + dst[j] *= srci[j]; + } + } + + template void SynetEltwiseLayerForwardSum(const float * src0, const __m256 & weight0, const float * src1, const __m256 & weight1, float * dst, size_t offset) + { + Store(dst + offset, _mm256_add_ps(_mm256_mul_ps(Load(src0 + offset), weight0), _mm256_mul_ps(Load(src1 + offset), weight1))); + } + + template void SynetEltwiseLayerForwardSum(const float * src, const __m256 & weight, float * dst, size_t offset) + { + Store(dst + offset, _mm256_add_ps(_mm256_mul_ps(Load(src + offset), weight), Load(dst + offset))); + } + + template void SynetEltwiseLayerForwardSum(float const * const * src, const float * weight, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t partial = AlignLo(size, F); + const float * src0 = src[0]; + const float * src1 = src[1]; + __m256 weight0 = _mm256_set1_ps(weight[0]); + __m256 weight1 = _mm256_set1_ps(weight[1]); + size_t j = 0; + if (partial) + { + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 0); + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 1); + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 2); + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j); + } + for (; j < 
size; ++j) + dst[j] = src0[j] * weight[0] + src1[j] * weight[1]; + for (size_t i = 2; i < count; ++i) + { + const float * srci = src[i]; + __m256 weighti = _mm256_set1_ps(weight[i]); + size_t j = 0; + if (partial) + { + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 0); + SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 1); + SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 2); + SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardSum(srci, weighti, dst, j); + } + for (; j < size; ++j) + dst[j] += srci[j] * weight[i]; + } + } + + template void SynetEltwiseLayerForwardMax(const float * src0, const float * src1, float * dst, size_t offset) + { + Store(dst + offset, _mm256_max_ps(Load(src0 + offset), Load(src1 + offset))); + } + + template void SynetEltwiseLayerForwardMax(float const * const * src, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t partial = AlignLo(size, F); + const float * src0 = src[0]; + const float * src1 = src[1]; + size_t j = 0; + if (partial) + { + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardMax(src0, src1, dst, j + F * 0); + SynetEltwiseLayerForwardMax(src0, src1, dst, j + F * 1); + SynetEltwiseLayerForwardMax(src0, src1, dst, j + F * 2); + SynetEltwiseLayerForwardMax(src0, src1, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardMax(src0, src1, dst, j); + } + for (; j < size; ++j) + dst[j] = Simd::Max(src0[j], src1[j]); + for (size_t i = 2; i < count; ++i) + { + const float * srci = src[i]; + size_t j = 0; + if (partial) + { + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardMax(dst, srci, dst, j + F * 0); + SynetEltwiseLayerForwardMax(dst, srci, dst, j + F * 1); + SynetEltwiseLayerForwardMax(dst, srci, dst, j + F * 2); + SynetEltwiseLayerForwardMax(dst, srci, dst, j + F * 3); + } + for (; j < partial; j += F) + 
SynetEltwiseLayerForwardMax(dst, srci, dst, j); + } + for (; j < size; ++j) + dst[j] = Simd::Max(dst[j], srci[j]); + } + } + + template void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst) + { + switch (type) + { + case SimdSynetEltwiseOperationProduct: + SynetEltwiseLayerForwardProduct(src, count, size, dst); + break; + case SimdSynetEltwiseOperationSum: + SynetEltwiseLayerForwardSum(src, weight, count, size, dst); + break; + case SimdSynetEltwiseOperationMax: + SynetEltwiseLayerForwardMax(src, count, size, dst); + break; + default: + assert(0); + } + } + + void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst) + { + assert(count >= 2); + bool aligned = Aligned(dst) && Aligned(src[0]) && Aligned(src[1]); + for (size_t i = 2; i < count; ++i) + aligned = aligned && Aligned(src[i]); + if (aligned) + SynetEltwiseLayerForward(src, weight, count, size, type, dst); + else + SynetEltwiseLayerForward(src, weight, count, size, type, dst); + } + + template SIMD_INLINE void SynetScaleLayerForward(const float * src, const __m256 & scale, const __m256 & bias, float * dst, size_t offset) + { + Store(dst + offset, _mm256_add_ps(_mm256_mul_ps(Load(src + offset), scale), bias)); + } + + template SIMD_INLINE void SynetScaleLayerForward(const float * src, const __m256 & scale, float * dst, size_t offset) + { + Store(dst + offset, _mm256_mul_ps(Load(src + offset), scale)); + } + + template SIMD_INLINE void SynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t partial = AlignLo(size, F); + if (bias) + { + for (size_t i = 0; i < count; ++i) + { + size_t j = 0; + if (partial) + { + __m256 _scale = _mm256_set1_ps(scale[i]); + __m256 _bias = _mm256_set1_ps(bias[i]); + for 
(; j < aligned; j += QF) + { + SynetScaleLayerForward(src, _scale, _bias, dst, j + F * 0); + SynetScaleLayerForward(src, _scale, _bias, dst, j + F * 1); + SynetScaleLayerForward(src, _scale, _bias, dst, j + F * 2); + SynetScaleLayerForward(src, _scale, _bias, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetScaleLayerForward(src, _scale, _bias, dst, j); + } + for (; j < size; ++j) + dst[j] = src[j] * scale[i] + bias[i]; + src += size; + dst += size; + } + } + else + { + for (size_t i = 0; i < count; ++i) + { + size_t j = 0; + if (partial) + { + __m256 _scale = _mm256_set1_ps(scale[i]); + for (; j < aligned; j += QF) + { + SynetScaleLayerForward(src, _scale, dst, j + F * 0); + SynetScaleLayerForward(src, _scale, dst, j + F * 1); + SynetScaleLayerForward(src, _scale, dst, j + F * 2); + SynetScaleLayerForward(src, _scale, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetScaleLayerForward(src, _scale, dst, j); + } + for (; j < size; ++j) + dst[j] = src[j] * scale[i]; + src += size; + dst += size; + } + } + } + + void SynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst) + { + if (Aligned(dst) && Aligned(size)) + SynetScaleLayerForward(src, scale, bias, count, size, dst); + else + SynetScaleLayerForward(src, scale, bias, count, size, dst); + } + } +#endif// SIMD_AVX_ENABLE +} diff --git a/src/3rd/Simd/SimdAvx2.h b/src/3rd/Simd/SimdAvx2.h index fdca8625..6b4dc9c2 100644 --- a/src/3rd/Simd/SimdAvx2.h +++ b/src/3rd/Simd/SimdAvx2.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -181,13 +181,19 @@ namespace Simd void SquaredDifferenceSum16f(const uint16_t * a, const uint16_t * b, size_t size, float * sum); + void CosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance); + void Float32ToUint8(const float * src, size_t size, const float * lower, const float * upper, uint8_t * dst); void Uint8ToFloat32(const uint8_t * src, size_t size, const float * lower, const float * upper, float * dst); + void CosineDistance32f(const float * a, const float * b, size_t size, float * distance); + void GaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride); + void Gemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc); + void GrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride); void GrayToBgra(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgra, size_t bgraStride, uint8_t alpha); @@ -212,7 +218,7 @@ namespace Simd void HogLiteExtractFeatures(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t cell, float * features, size_t featuresStride); - void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride); + void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride); void HogLiteResizeFeatures(const float * src, 
size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight); @@ -402,11 +408,19 @@ namespace Simd void SquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); + void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum); + void CorrelationSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum); void StretchGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride); + void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst); + + void SynetLrnLayerCrossChannels(const float * src, size_t half, size_t count, size_t size, const float * k, float * dst); + + void SynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst); + void TextureBoostedSaturatedGradient(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t saturation, uint8_t boost, uint8_t * dx, size_t dxStride, uint8_t * dy, size_t dyStride); diff --git a/src/3rd/Simd/SimdAvx2Float16.cpp b/src/3rd/Simd/SimdAvx2Float16.cpp index b4cb4b58..81365746 100644 --- a/src/3rd/Simd/SimdAvx2Float16.cpp +++ b/src/3rd/Simd/SimdAvx2Float16.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -151,6 +151,65 @@ namespace Simd else SquaredDifferenceSum16f(a, b, size, sum); } + + template void CosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance) + { + if (align) + assert(Aligned(a) && Aligned(b)); + + size_t partialAlignedSize = AlignLo(size, F); + size_t fullAlignedSize = AlignLo(size, DF); + size_t i = 0; + __m256 _aa[2] = { _mm256_setzero_ps(), _mm256_setzero_ps() }; + __m256 _ab[2] = { _mm256_setzero_ps(), _mm256_setzero_ps() }; + __m256 _bb[2] = { _mm256_setzero_ps(), _mm256_setzero_ps() }; + if (fullAlignedSize) + { + for (; i < fullAlignedSize; i += DF) + { + __m256 a0 = _mm256_cvtph_ps(Sse2::Load((__m128i*)(a + i) + 0)); + __m256 b0 = _mm256_cvtph_ps(Sse2::Load((__m128i*)(b + i) + 0)); + _aa[0] = _mm256_fmadd_ps(a0, a0, _aa[0]); + _ab[0] = _mm256_fmadd_ps(a0, b0, _ab[0]); + _bb[0] = _mm256_fmadd_ps(b0, b0, _bb[0]); + __m256 a1 = _mm256_cvtph_ps(Sse2::Load((__m128i*)(a + i) + 1)); + __m256 b1 = _mm256_cvtph_ps(Sse2::Load((__m128i*)(b + i) + 1)); + _aa[1] = _mm256_fmadd_ps(a1, a1, _aa[1]); + _ab[1] = _mm256_fmadd_ps(a1, b1, _ab[1]); + _bb[1] = _mm256_fmadd_ps(b1, b1, _bb[1]); + } + _aa[0] = _mm256_add_ps(_aa[0], _aa[1]); + _ab[0] = _mm256_add_ps(_ab[0], _ab[1]); + _bb[0] = _mm256_add_ps(_bb[0], _bb[1]); + } + for (; i < partialAlignedSize; i += F) + { + __m256 a0 = _mm256_cvtph_ps(Sse2::Load((__m128i*)(a + i) + 0)); + __m256 b0 = _mm256_cvtph_ps(Sse2::Load((__m128i*)(b + i) + 0)); + _aa[0] = _mm256_fmadd_ps(a0, a0, _aa[0]); + _ab[0] = _mm256_fmadd_ps(a0, b0, _ab[0]); + _bb[0] = _mm256_fmadd_ps(b0, b0, _bb[0]); + } + if (partialAlignedSize != size) + { + __m256 mask = RightNotZero(size - partialAlignedSize); + __m256 a0 = _mm256_and_ps(mask, _mm256_cvtph_ps(Sse2::Load((__m128i*)(a + size - F)))); + __m256 b0 = _mm256_and_ps(mask, 
_mm256_cvtph_ps(Sse2::Load((__m128i*)(b + size - F)))); + _aa[0] = _mm256_fmadd_ps(a0, a0, _aa[0]); + _ab[0] = _mm256_fmadd_ps(a0, b0, _ab[0]); + _bb[0] = _mm256_fmadd_ps(b0, b0, _bb[0]); + } + float aa = Avx::ExtractSum(_aa[0]), ab = Avx::ExtractSum(_ab[0]), bb = Avx::ExtractSum(_bb[0]); + *distance = 1.0f - ab / ::sqrt(aa*bb); + } + + void CosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance) + { + if (Aligned(a) && Aligned(b)) + CosineDistance16f(a, b, size, distance); + else + CosineDistance16f(a, b, size, distance); + } } #endif// SIMD_AVX2_ENABLE } diff --git a/src/3rd/Simd/SimdAvx2Float32.cpp b/src/3rd/Simd/SimdAvx2Float32.cpp index 242aba1f..948eb875 100644 --- a/src/3rd/Simd/SimdAvx2Float32.cpp +++ b/src/3rd/Simd/SimdAvx2Float32.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -23,6 +23,7 @@ */ #include "Simd/SimdMemory.h" #include "Simd/SimdStore.h" +#include "Simd/SimdExtract.h" namespace Simd { @@ -70,7 +71,7 @@ namespace Simd SIMD_INLINE __m256 Uint8ToFloat32(const __m128i & value, const __m256 & lower, const __m256 & boost) { - return _mm256_sub_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(value)), boost), lower); + return _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(value)), boost), lower); } template SIMD_INLINE void Uint8ToFloat32(const uint8_t * src, const __m256 & lower, const __m256 & boost, float * dst) @@ -103,6 +104,64 @@ namespace Simd else Uint8ToFloat32(src, size, lower, upper, dst); } + + template void CosineDistance32f(const float * a, const float * b, size_t size, float * distance) + { + if (align) + assert(Aligned(a) && Aligned(b)); + + size_t partialAlignedSize = AlignLo(size, F); + size_t 
fullAlignedSize = AlignLo(size, DF); + size_t i = 0; + __m256 _aa[2] = { _mm256_setzero_ps(), _mm256_setzero_ps() }; + __m256 _ab[2] = { _mm256_setzero_ps(), _mm256_setzero_ps() }; + __m256 _bb[2] = { _mm256_setzero_ps(), _mm256_setzero_ps() }; + if (fullAlignedSize) + { + for (; i < fullAlignedSize; i += DF) + { + __m256 a0 = Load(a + i + 0 * F); + __m256 b0 = Load(b + i + 0 * F); + _aa[0] = _mm256_fmadd_ps(a0, a0, _aa[0]); + _ab[0] = _mm256_fmadd_ps(a0, b0, _ab[0]); + _bb[0] = _mm256_fmadd_ps(b0, b0, _bb[0]); + __m256 a1 = Load(a + i + 1 * F); + __m256 b1 = Load(b + i + 1 * F); + _aa[1] = _mm256_fmadd_ps(a1, a1, _aa[1]); + _ab[1] = _mm256_fmadd_ps(a1, b1, _ab[1]); + _bb[1] = _mm256_fmadd_ps(b1, b1, _bb[1]); + } + _aa[0] = _mm256_add_ps(_aa[0], _aa[1]); + _ab[0] = _mm256_add_ps(_ab[0], _ab[1]); + _bb[0] = _mm256_add_ps(_bb[0], _bb[1]); + } + for (; i < partialAlignedSize; i += F) + { + __m256 a0 = Load(a + i); + __m256 b0 = Load(b + i); + _aa[0] = _mm256_fmadd_ps(a0, a0, _aa[0]); + _ab[0] = _mm256_fmadd_ps(a0, b0, _ab[0]); + _bb[0] = _mm256_fmadd_ps(b0, b0, _bb[0]); + } + float aa = Avx::ExtractSum(_aa[0]), ab = Avx::ExtractSum(_ab[0]), bb = Avx::ExtractSum(_bb[0]); + for (; i < size; ++i) + { + float _a = a[i]; + float _b = b[i]; + aa += _a * _a; + ab += _a * _b; + bb += _b * _b; + } + *distance = 1.0f - ab / ::sqrt(aa*bb); + } + + void CosineDistance32f(const float * a, const float * b, size_t size, float * distance) + { + if (Aligned(a) && Aligned(b)) + CosineDistance32f(a, b, size, distance); + else + CosineDistance32f(a, b, size, distance); + } } #endif// SIMD_AVX2_ENABLE } diff --git a/src/3rd/Simd/SimdAvx2Gemm32f.cpp b/src/3rd/Simd/SimdAvx2Gemm32f.cpp new file mode 100644 index 00000000..8aa7b2be --- /dev/null +++ b/src/3rd/Simd/SimdAvx2Gemm32f.cpp @@ -0,0 +1,481 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdStore.h" +#include "Simd/SimdGemm.h" + +namespace Simd +{ +#ifdef SIMD_AVX2_ENABLE + namespace Avx2 + { + SIMD_INLINE void AddProduct(float * ptr, __m256 value, __m256 alpha) + { + _mm256_storeu_ps(ptr, _mm256_fmadd_ps(value, alpha, _mm256_loadu_ps(ptr))); + } + + SIMD_INLINE void AddProduct(float * ptr, __m256 value, __m256 alpha, size_t tail) + { + if (tail == F) + AddProduct(ptr, value, alpha); + else + { + float tmp[F]; + _mm256_storeu_ps(tmp, _mm256_add_ps(_mm256_mul_ps(value, alpha), _mm256_loadu_ps(ptr))); + for (size_t i = 0; i < tail; ++i) + ptr[i] = tmp[i]; + } + } + + static void Kernel4x24(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m256 c00 = _mm256_setzero_ps(); + __m256 c10 = _mm256_setzero_ps(); + __m256 c20 = _mm256_setzero_ps(); + __m256 c30 = _mm256_setzero_ps(); + __m256 c01 = _mm256_setzero_ps(); + __m256 c11 = _mm256_setzero_ps(); + __m256 c21 = _mm256_setzero_ps(); + __m256 c31 = _mm256_setzero_ps(); + __m256 c02 = _mm256_setzero_ps(); + __m256 c12 = _mm256_setzero_ps(); + __m256 c22 = _mm256_setzero_ps(); + __m256 c32 = _mm256_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + __m256 b0, b1, b2, a0; + for (size_t k = 0; k < K; ++k) + { + _mm_prefetch((char*)B + 384, _MM_HINT_T0); + b0 = _mm256_loadu_ps(B + 0 * F); + b1 = _mm256_loadu_ps(B + 1 * F); + b2 = _mm256_loadu_ps(B + 2 * F); + a0 = _mm256_set1_ps(*A0++); + c00 = _mm256_fmadd_ps(a0, b0, c00); + c01 = _mm256_fmadd_ps(a0, b1, c01); + c02 = _mm256_fmadd_ps(a0, b2, c02); + a0 = _mm256_set1_ps(*A1++); + c10 = _mm256_fmadd_ps(a0, b0, c10); + c11 = _mm256_fmadd_ps(a0, b1, c11); + c12 = _mm256_fmadd_ps(a0, b2, c12); + a0 = _mm256_set1_ps(*A2++); + c20 = _mm256_fmadd_ps(a0, b0, c20); + c21 = _mm256_fmadd_ps(a0, b1, c21); + c22 = _mm256_fmadd_ps(a0, b2, c22); + a0 = _mm256_set1_ps(*A3++); + 
c30 = _mm256_fmadd_ps(a0, b0, c30); + c31 = _mm256_fmadd_ps(a0, b1, c31); + c32 = _mm256_fmadd_ps(a0, b2, c32); + B += ldb; + } + __m256 _alpha = _mm256_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00); + AddProduct(C + 1 * F, _alpha, c01); + AddProduct(C + 2 * F, _alpha, c02, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10); + AddProduct(C + 1 * F, _alpha, c11); + AddProduct(C + 2 * F, _alpha, c12, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20); + AddProduct(C + 1 * F, _alpha, c21); + AddProduct(C + 2 * F, _alpha, c22, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30); + AddProduct(C + 1 * F, _alpha, c31); + AddProduct(C + 2 * F, _alpha, c32, tail); + } + + static void Kernel4x16(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m256 c00 = _mm256_setzero_ps(); + __m256 c10 = _mm256_setzero_ps(); + __m256 c20 = _mm256_setzero_ps(); + __m256 c30 = _mm256_setzero_ps(); + __m256 c01 = _mm256_setzero_ps(); + __m256 c11 = _mm256_setzero_ps(); + __m256 c21 = _mm256_setzero_ps(); + __m256 c31 = _mm256_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + __m256 b0, b1, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm256_loadu_ps(B + 0 * F); + b1 = _mm256_loadu_ps(B + 1 * F); + a0 = _mm256_set1_ps(*A0++); + c00 = _mm256_fmadd_ps(a0, b0, c00); + c01 = _mm256_fmadd_ps(a0, b1, c01); + a0 = _mm256_set1_ps(*A1++); + c10 = _mm256_fmadd_ps(a0, b0, c10); + c11 = _mm256_fmadd_ps(a0, b1, c11); + a0 = _mm256_set1_ps(*A2++); + c20 = _mm256_fmadd_ps(a0, b0, c20); + c21 = _mm256_fmadd_ps(a0, b1, c21); + a0 = _mm256_set1_ps(*A3++); + c30 = _mm256_fmadd_ps(a0, b0, c30); + c31 = _mm256_fmadd_ps(a0, b1, c31); + B += ldb; + } + __m256 _alpha = _mm256_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00); + AddProduct(C + 1 * F, _alpha, c01, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, 
c10); + AddProduct(C + 1 * F, _alpha, c11, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20); + AddProduct(C + 1 * F, _alpha, c21, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30); + AddProduct(C + 1 * F, _alpha, c31, tail); + } + + static void Kernel4x8(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m256 c0 = _mm256_setzero_ps(); + __m256 c1 = _mm256_setzero_ps(); + __m256 c2 = _mm256_setzero_ps(); + __m256 c3 = _mm256_setzero_ps(); + const float * a0 = A + lda * 0; + const float * a1 = A + lda * 1; + const float * a2 = A + lda * 2; + const float * a3 = A + lda * 3; + __m256 b0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm256_loadu_ps(B); + c0 = _mm256_fmadd_ps(b0, _mm256_set1_ps(*a0++), c0); + c1 = _mm256_fmadd_ps(b0, _mm256_set1_ps(*a1++), c1); + c2 = _mm256_fmadd_ps(b0, _mm256_set1_ps(*a2++), c2); + c3 = _mm256_fmadd_ps(b0, _mm256_set1_ps(*a3++), c3); + B += ldb; + } + __m256 _alpha = _mm256_set1_ps(alpha); + AddProduct(C + 0 * ldc, _alpha, c0, tail); + AddProduct(C + 1 * ldc, _alpha, c1, tail); + AddProduct(C + 2 * ldc, _alpha, c2, tail); + AddProduct(C + 3 * ldc, _alpha, c3, tail); + } + + static void Kernel6x16(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m256 c00 = _mm256_setzero_ps(); + __m256 c10 = _mm256_setzero_ps(); + __m256 c20 = _mm256_setzero_ps(); + __m256 c30 = _mm256_setzero_ps(); + __m256 c40 = _mm256_setzero_ps(); + __m256 c50 = _mm256_setzero_ps(); + __m256 c01 = _mm256_setzero_ps(); + __m256 c11 = _mm256_setzero_ps(); + __m256 c21 = _mm256_setzero_ps(); + __m256 c31 = _mm256_setzero_ps(); + __m256 c41 = _mm256_setzero_ps(); + __m256 c51 = _mm256_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + const float * A4 = A + lda * 4; + const float * A5 = A + lda * 5; + __m256 b0, 
b1, a0, a1; + for (size_t k = 0; k < K; k++) + { + _mm_prefetch((char*)B + 512, _MM_HINT_T0); + b0 = _mm256_loadu_ps(B + 0 * F); + b1 = _mm256_loadu_ps(B + 1 * F); + a0 = _mm256_set1_ps(*A0++); + a1 = _mm256_set1_ps(*A1++); + c00 = _mm256_fmadd_ps(a0, b0, c00); + c01 = _mm256_fmadd_ps(a0, b1, c01); + c10 = _mm256_fmadd_ps(a1, b0, c10); + c11 = _mm256_fmadd_ps(a1, b1, c11); + a0 = _mm256_set1_ps(*A2++); + a1 = _mm256_set1_ps(*A3++); + c20 = _mm256_fmadd_ps(a0, b0, c20); + c21 = _mm256_fmadd_ps(a0, b1, c21); + c30 = _mm256_fmadd_ps(a1, b0, c30); + c31 = _mm256_fmadd_ps(a1, b1, c31); + a0 = _mm256_set1_ps(*A4++); + a1 = _mm256_set1_ps(*A5++); + c40 = _mm256_fmadd_ps(a0, b0, c40); + c41 = _mm256_fmadd_ps(a0, b1, c41); + c50 = _mm256_fmadd_ps(a1, b0, c50); + c51 = _mm256_fmadd_ps(a1, b1, c51); + B += ldb; + } + __m256 _alpha = _mm256_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00); + AddProduct(C + 1 * F, _alpha, c01, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10); + AddProduct(C + 1 * F, _alpha, c11, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20); + AddProduct(C + 1 * F, _alpha, c21, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30); + AddProduct(C + 1 * F, _alpha, c31, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c40); + AddProduct(C + 1 * F, _alpha, c41, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c50); + AddProduct(C + 1 * F, _alpha, c51, tail); + } + + static void Kernel6x8(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m256 c0 = _mm256_setzero_ps(); + __m256 c1 = _mm256_setzero_ps(); + __m256 c2 = _mm256_setzero_ps(); + __m256 c3 = _mm256_setzero_ps(); + __m256 c4 = _mm256_setzero_ps(); + __m256 c5 = _mm256_setzero_ps(); + const float * a0 = A + lda * 0; + const float * a1 = A + lda * 1; + const float * a2 = A + lda * 2; + const float * a3 = A + lda * 3; + const float * a4 = A + lda * 4; + const float * a5 = A + lda * 5; + __m256 b0; + for 
(size_t k = 0; k < K; k++) + { + b0 = _mm256_loadu_ps(B); + c0 = _mm256_fmadd_ps(b0, _mm256_set1_ps(*a0++), c0); + c1 = _mm256_fmadd_ps(b0, _mm256_set1_ps(*a1++), c1); + c2 = _mm256_fmadd_ps(b0, _mm256_set1_ps(*a2++), c2); + c3 = _mm256_fmadd_ps(b0, _mm256_set1_ps(*a3++), c3); + c4 = _mm256_fmadd_ps(b0, _mm256_set1_ps(*a4++), c4); + c5 = _mm256_fmadd_ps(b0, _mm256_set1_ps(*a5++), c5); + B += ldb; + } + __m256 _alpha = _mm256_set1_ps(alpha); + AddProduct(C + 0 * ldc, _alpha, c0, tail); + AddProduct(C + 1 * ldc, _alpha, c1, tail); + AddProduct(C + 2 * ldc, _alpha, c2, tail); + AddProduct(C + 3 * ldc, _alpha, c3, tail); + AddProduct(C + 4 * ldc, _alpha, c4, tail); + AddProduct(C + 5 * ldc, _alpha, c5, tail); + } + + static void KernelMx24(size_t M, size_t N, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m256 c[4][3]; + const float * a[4]; + for (size_t i = 0; i < M; ++i) + { + c[i][0] = _mm256_setzero_ps(); + c[i][1] = _mm256_setzero_ps(); + c[i][2] = _mm256_setzero_ps(); + a[i] = A + lda * i; + } + __m256 b0, b1, b2, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm256_loadu_ps(B + 0 * F); + b1 = _mm256_loadu_ps(B + 1 * F); + b2 = _mm256_loadu_ps(B + 2 * F); + for (size_t i = 0; i < M; ++i) + { + a0 = _mm256_set1_ps(*a[i]++); + c[i][0] = _mm256_add_ps(_mm256_mul_ps(b0, a0), c[i][0]); + c[i][1] = _mm256_add_ps(_mm256_mul_ps(b1, a0), c[i][1]); + c[i][2] = _mm256_add_ps(_mm256_mul_ps(b2, a0), c[i][2]); + } + B += ldb; + } + __m256 _alpha = _mm256_set1_ps(alpha); + for (size_t i = 0; i < M; ++i) + { + AddProduct(C + 0 * F, _alpha, c[i][0]); + AddProduct(C + 1 * F, _alpha, c[i][1]); + AddProduct(C + 2 * F, _alpha, c[i][2], tail); + C += ldc; + } + } + + static void KernelMx16(size_t M, size_t N, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m256 c[6][2]; + const float * a[6]; + for (size_t i = 0; i < M; ++i) + { + 
c[i][0] = _mm256_setzero_ps(); + c[i][1] = _mm256_setzero_ps(); + a[i] = A + lda * i; + } + __m256 b0, b1, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm256_loadu_ps(B + 0 * F); + b1 = _mm256_loadu_ps(B + 1 * F); + for (size_t i = 0; i < M; ++i) + { + a0 = _mm256_set1_ps(*a[i]++); + c[i][0] = _mm256_fmadd_ps(b0, a0, c[i][0]); + c[i][1] = _mm256_fmadd_ps(b1, a0, c[i][1]); + } + B += ldb; + } + __m256 _alpha = _mm256_set1_ps(alpha); + for (size_t i = 0; i < M; ++i) + { + AddProduct(C + 0 * F, _alpha, c[i][0]); + AddProduct(C + 1 * F, _alpha, c[i][1], tail); + C += ldc; + } + } + + static void KernelMx8(size_t M, size_t N, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m256 c[4]; + const float * a[4]; + for (size_t i = 0; i < M; ++i) + { + c[i] = _mm256_setzero_ps(); + a[i] = A + lda * i; + } + __m256 b0, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm256_loadu_ps(B + 0 * F); + for (size_t i = 0; i < M; ++i) + { + a0 = _mm256_set1_ps(*a[i]++); + c[i] = _mm256_fmadd_ps(b0, a0, c[i]); + } + B += ldb; + } + __m256 _alpha = _mm256_set1_ps(alpha); + for (size_t i = 0; i < M; ++i) + AddProduct(C + i * ldc, _alpha, c[i], tail); + } + + static void PackA(const float * src, size_t stride, size_t M, size_t K, size_t cell, float * dst) + { + size_t K4 = AlignLo(K, 4), K8 = AlignLo(K, 8); + for (size_t i = 0; i < M; i += cell) + { + size_t m = Simd::Min(cell, M - i), k = 0; + if (cell == 4 && m == 4) + { + for (; k < K8; k += 8) + { + const float * ps = src + k; + __m256 s0 = _mm256_loadu_ps(ps + 0 * K); + __m256 s1 = _mm256_loadu_ps(ps + 1 * K); + __m256 s2 = _mm256_loadu_ps(ps + 2 * K); + __m256 s3 = _mm256_loadu_ps(ps + 3 * K); + __m256 s00 = _mm256_unpacklo_ps(s0, s2); + __m256 s01 = _mm256_unpacklo_ps(s1, s3); + __m256 s10 = _mm256_unpackhi_ps(s0, s2); + __m256 s11 = _mm256_unpackhi_ps(s1, s3); + __m256 d0 = _mm256_unpacklo_ps(s00, s01); + __m256 d1 = _mm256_unpackhi_ps(s00, s01); + __m256 
d2 = _mm256_unpacklo_ps(s10, s11); + __m256 d3 = _mm256_unpackhi_ps(s10, s11); + _mm256_storeu_ps(dst + 0, _mm256_permute2f128_ps(d0, d1, 0x20)); + _mm256_storeu_ps(dst + 8, _mm256_permute2f128_ps(d2, d3, 0x20)); + _mm256_storeu_ps(dst + 16, _mm256_permute2f128_ps(d0, d1, 0x31)); + _mm256_storeu_ps(dst + 24, _mm256_permute2f128_ps(d2, d3, 0x31)); + dst += 32; + }; + for (; k < K4; k += 4) + { + const float * ps = src + k; + __m128 s0 = _mm_loadu_ps(ps + 0 * stride); + __m128 s1 = _mm_loadu_ps(ps + 1 * stride); + __m128 s2 = _mm_loadu_ps(ps + 2 * stride); + __m128 s3 = _mm_loadu_ps(ps + 3 * stride); + __m128 s00 = _mm_unpacklo_ps(s0, s2); + __m128 s01 = _mm_unpacklo_ps(s1, s3); + __m128 s10 = _mm_unpackhi_ps(s0, s2); + __m128 s11 = _mm_unpackhi_ps(s1, s3); + _mm_storeu_ps(dst + 0, _mm_unpacklo_ps(s00, s01)); + _mm_storeu_ps(dst + 4, _mm_unpackhi_ps(s00, s01)); + _mm_storeu_ps(dst + 8, _mm_unpacklo_ps(s10, s11)); + _mm_storeu_ps(dst + 12, _mm_unpackhi_ps(s10, s11)); + dst += 16; + } + } + for (; k < K; ++k) + { + for (size_t c = 0; c < m; ++c) + *(dst++) = src[c*stride + k]; + } + src += cell * stride; + } + } + + void Gemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc) + { + const size_t CACHE_L1_SIZE = 32 * 1024; + const size_t CACHE_L2_SIZE = 256 * 1024; + const size_t CACHE_L3_SIZE = 2 * 1024 * 1024; + typedef Simd::GemmNN GemmNN; + GemmNN::Main kernelMM, kernelMT; + GemmNN::Tail kernelTM, kernelTT; + size_t microM, microN, L1, L2; +#ifdef SIMD_X64_ENABLE + if (K > 4096) + { + microM = 6; + microN = 16; + size_t tail = N - AlignLoAny(N, microN); + kernelMM = Kernel6x16; + kernelMT = tail > F ? Kernel6x16 : Kernel6x8; + kernelTM = KernelMx16; + kernelTT = tail > F ? KernelMx16 : KernelMx8; + } + else + { + microM = 4; + microN = 24; + size_t tail = N - AlignLoAny(N, microN); + kernelMM = Kernel4x24; + kernelMT = tail > DF ? Kernel4x24 : (tail > F ? 
Kernel4x16 : Kernel4x8); + kernelTM = KernelMx24; + kernelTT = tail > DF ? KernelMx24 : (tail > F ? KernelMx16 : KernelMx8); + } +#else + microM = 4; + microN = 8; + kernelMM = Kernel4x8; + kernelMT = Kernel4x8; + kernelTM = KernelMx8; + kernelTT = KernelMx8; +#endif + L1 = N > 4024 ? CACHE_L2_SIZE : CACHE_L1_SIZE; + L2 = N > 4024 ? CACHE_L3_SIZE : CACHE_L2_SIZE; + GemmNN gemmNN(M, N, K, microM, microN, L1, L2, CACHE_L3_SIZE, F, + kernelMM, kernelMT, kernelTM, kernelTT, Avx::GemmScaleC, Avx::GemmPackB, NULL); + gemmNN.Run(alpha, A, lda, B, ldb, beta, C, ldc); + } + } +#endif// SIMD_AVX2_ENABLE +} diff --git a/src/3rd/Simd/SimdAvx2Hog.cpp b/src/3rd/Simd/SimdAvx2Hog.cpp index b477826e..e8d22bee 100644 --- a/src/3rd/Simd/SimdAvx2Hog.cpp +++ b/src/3rd/Simd/SimdAvx2Hog.cpp @@ -542,12 +542,12 @@ namespace Simd Avx::Store(h1[1] + i, _mm256_add_ps(Avx::Load(h1[1] + i), _mm256_unpackhi_ps(b1, b3))); } __m128 * ps = (__m128*)src; - __m128 s0 = _mm_add_ps(_mm_unpacklo_ps(ps[16], ps[17]), _mm_loadh_pi(_mm_loadl_pi(_mm_setzero_ps(), (__m64*)(h0[0] + 16)), (__m64*)(h0[1] + 16))); - __m128 s1 = _mm_add_ps(_mm_unpackhi_ps(ps[16], ps[17]), _mm_loadh_pi(_mm_loadl_pi(_mm_setzero_ps(), (__m64*)(h1[0] + 16)), (__m64*)(h1[1] + 16))); - _mm_storel_pi((__m64*)(h0[0] + 16), s0); - _mm_storeh_pi((__m64*)(h0[1] + 16), s0); - _mm_storel_pi((__m64*)(h1[0] + 16), s1); - _mm_storeh_pi((__m64*)(h1[1] + 16), s1); + __m128 s0 = _mm_add_ps(_mm_unpacklo_ps(ps[16], ps[17]), Sse::Load(h0[0] + 16, h0[1] + 16)); + __m128 s1 = _mm_add_ps(_mm_unpackhi_ps(ps[16], ps[17]), Sse::Load(h1[0] + 16, h1[1] + 16)); + Sse::StoreHalf<0>(h0[0] + 16, s0); + Sse::StoreHalf<1>(h0[1] + 16, s0); + Sse::StoreHalf<0>(h1[0] + 16, s1); + Sse::StoreHalf<1>(h1[1] + 16, s1); h0++; h1++; src += 72; diff --git a/src/3rd/Simd/SimdAvx2HogLite.cpp b/src/3rd/Simd/SimdAvx2HogLite.cpp index 6ded259c..6951cd8d 100644 --- a/src/3rd/Simd/SimdAvx2HogLite.cpp +++ b/src/3rd/Simd/SimdAvx2HogLite.cpp @@ -507,9 +507,9 @@ namespace Simd sums[3] = 
_mm256_fmadd_ps(Avx::Load(src + 3 * step), _filter, sums[3]); } - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterSize, float * dst, size_t dstStride) + template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterWidth, size_t filterHeight, float * dst, size_t dstStride) { - size_t filterStride = featureSize * filterSize; + size_t filterStride = featureSize * filterWidth; size_t alignedDstWidth = AlignLo(dstWidth, 4); size_t alignedFilterStride = AlignLo(filterStride, QF); for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) @@ -520,7 +520,7 @@ namespace Simd __m256 sums[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { size_t filterCol = 0; for (; filterCol < alignedFilterStride; filterCol += QF) @@ -537,7 +537,7 @@ namespace Simd __m256 sum = _mm256_setzero_ps(); const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { for (size_t filterCol = 0; filterCol < filterStride; filterCol += F) ProductSum1x1(pSrc + filterCol, pFilter + filterCol, sum); @@ -550,9 +550,9 @@ namespace Simd } } - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * 
mask, size_t maskStride, float * dst, size_t dstStride) { - size_t filterStride = featureSize * filterSize; + size_t filterStride = featureSize * filterWidth; size_t alignedDstWidth = AlignLo(dstWidth, 4); size_t alignedFilterStride = AlignLo(filterStride, QF); __m128 _min = _mm_set1_ps(-FLT_MAX); @@ -569,7 +569,7 @@ namespace Simd __m256 sums[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { size_t filterCol = 0; for (; filterCol < alignedFilterStride; filterCol += QF) @@ -589,7 +589,7 @@ namespace Simd __m256 sum = _mm256_setzero_ps(); const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { for (size_t filterCol = 0; filterCol < filterStride; filterCol += F) ProductSum1x1(pSrc + filterCol, pFilter + filterCol, sum); @@ -606,53 +606,53 @@ namespace Simd } } - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterSize, float * dst, size_t dstStride) + template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, float * dst, size_t dstStride) { if (featureSize == 16) - Filter(src, srcStride, dstWidth, dstHeight, filter, filterSize, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, dst, dstStride); else - Filter(src, srcStride, dstWidth, dstHeight, filter, filterSize, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, 
filterHeight, dst, dstStride); } - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { if (featureSize == 16) - Filter(src, srcStride, dstWidth, dstHeight, filter, filterSize, mask, maskStride, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); else - Filter(src, srcStride, dstWidth, dstHeight, filter, filterSize, mask, maskStride, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); } public: - void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { assert(featureSize == 8 || featureSize == 16); - assert(srcWidth >= filterSize && srcHeight >= filterSize); + assert(srcWidth >= filterWidth && srcHeight >= filterHeight); - size_t dstWidth = srcWidth - filterSize + 1; - size_t dstHeight = srcHeight - filterSize + 1; + size_t dstWidth = srcWidth - filterWidth + 1; + size_t dstHeight = srcHeight - filterHeight + 1; if (mask) { if (Aligned(src) && Aligned(srcStride) && Aligned(filter)) - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + 
Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); else - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); } else { if (Aligned(src) && Aligned(srcStride) && Aligned(filter)) - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, dst, dstStride); else - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, dst, dstStride); } } }; - void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { HogLiteFeatureFilter featureFilter; - featureFilter.Run(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + featureFilter.Run(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); } namespace HogLiteFeatureResizerDetail diff --git a/src/3rd/Simd/SimdAvx2Neural.cpp b/src/3rd/Simd/SimdAvx2Neural.cpp index e0dcdfa6..709617ce 100644 --- a/src/3rd/Simd/SimdAvx2Neural.cpp +++ b/src/3rd/Simd/SimdAvx2Neural.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). 
* -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -27,6 +27,7 @@ #include "Simd/SimdStream.h" #include "Simd/SimdBase.h" #include "Simd/SimdNeural.h" +#include "Simd/SimdPow.h" namespace Simd { @@ -332,84 +333,28 @@ namespace Simd NeuralRoughSigmoid2(src, size, slope, dst); } - class PowEstimator + template void NeuralPow(const float * src, size_t size, const float * exponent, float * dst) { - __m256i _exponent, _mantissa; - __m256 _one; - - void Init() - { - _exponent = _mm256_set1_epi32(0x7F800000); - _mantissa = _mm256_set1_epi32(0x007FFFFF); - _one = _mm256_set1_ps(1.0f); - } - - SIMD_INLINE __m256 Poly5(__m256 x, float a, float b, float c, float d, float e, float f) - { - __m256 p = _mm256_set1_ps(f); - p = _mm256_fmadd_ps(x, p, _mm256_set1_ps(e)); - p = _mm256_fmadd_ps(x, p, _mm256_set1_ps(d)); - p = _mm256_fmadd_ps(x, p, _mm256_set1_ps(c)); - p = _mm256_fmadd_ps(x, p, _mm256_set1_ps(b)); - p = _mm256_fmadd_ps(x, p, _mm256_set1_ps(a)); - return p; - } - - SIMD_INLINE __m256 Exp2(__m256 x) - { - x = _mm256_max_ps(_mm256_min_ps(x, _mm256_set1_ps(129.00000f)), _mm256_set1_ps(-126.99999f)); - __m256i ipart = _mm256_cvtps_epi32(_mm256_sub_ps(x, _mm256_set1_ps(0.5f))); - __m256 fpart = _mm256_sub_ps(x, _mm256_cvtepi32_ps(ipart)); - __m256 expipart = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_add_epi32(ipart, _mm256_set1_epi32(127)), 23)); - __m256 expfpart = Poly5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f); - return _mm256_mul_ps(expipart, expfpart); - } - - SIMD_INLINE __m256 Log2(__m256 x) - { - __m256i i = _mm256_castps_si256(x); - __m256 e = _mm256_cvtepi32_ps(_mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(i, _exponent), 23), _mm256_set1_epi32(127))); - __m256 m = 
_mm256_or_ps(_mm256_castsi256_ps(_mm256_and_si256(i, _mantissa)), _one); - __m256 p = Poly5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); - return _mm256_fmadd_ps(p, _mm256_sub_ps(m, _one), e); - } - - SIMD_INLINE __m256 Pow(__m256 basis, __m256 exponent) - { - return Exp2(_mm256_mul_ps(Log2(basis), exponent)); - } - - template void Run(const float * src, size_t size, const float * exponent, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - float e = exponent[0]; - size_t alignedSize = AlignLo(size, F); - __m256 _e = _mm256_set1_ps(e); - size_t i = 0; - for (; i < alignedSize; i += F) - Avx::Store(dst + i, Pow(Avx::Load(src + i), _e)); - for (; i < size; ++i) - dst[i] = Base::Pow(src[i], e); - } - - public: - void Run(const float * src, size_t size, const float * exponent, float * dst) - { - Init(); + if (align) + assert(Aligned(src) && Aligned(dst)); - if (Aligned(src) && Aligned(dst)) - Run(src, size, exponent, dst); - else - Run(src, size, exponent, dst); - } - }; + float e = exponent[0]; + size_t alignedSize = AlignLo(size, F); + __m256 _e = _mm256_set1_ps(e); + Pow pow; + size_t i = 0; + for (; i < alignedSize; i += F) + Avx::Store(dst + i, pow(Avx::Load(src + i), _e)); + for (; i < size; ++i) + dst[i] = Base::Pow(src[i], e); + } void NeuralPow(const float * src, size_t size, const float * exponent, float * dst) { - PowEstimator estimator; - estimator.Run(src, size, exponent, dst); + if (Aligned(src) && Aligned(dst)) + NeuralPow(src, size, exponent, dst); + else + NeuralPow(src, size, exponent, dst); } template void NeuralAddConvolutionForward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) @@ -1542,7 +1487,7 @@ namespace Simd void Kernel4x24(size_t N, size_t K, const float * a, const float * b, float * c) { - register __m256 _a, b0, b1, b2, c00, c01, c02, c10, c11, c12, c20, c21, c22, c30, c31, c32; + __m256 _a, b0, b1, b2, 
c00, c01, c02, c10, c11, c12, c20, c21, c22, c30, c31, c32; c00 = _mm256_setzero_ps(); c01 = _mm256_setzero_ps(); @@ -1607,22 +1552,45 @@ namespace Simd -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; const float * tail = (float*)mask + 24 - N + N24; - size_t i = 0; - for (; i < M4; i += 4) + if (M > N) { - size_t j = 0; - for (; j < N24; j += 24) - Kernel4x24(N, K, a + i * K, b + j * K, c + i * N + j); - if (N24 < N) - KernelMx24(N, K, a + i * K, b + j * K, c + i * N + j, tail, 4); + size_t i = 0; + for (; i < M4; i += 4) + { + size_t j = 0; + for (; j < N24; j += 24) + Kernel4x24(N, K, a + i * K, b + j * K, c + i * N + j); + if (N24 < N) + KernelMx24(N, K, a + i * K, b + j * K, c + i * N + j, tail, 4); + } + if (M4 < M) + { + size_t j = 0; + for (; j < N24; j += 24) + KernelMx24(N, K, a + i * K, b + j * K, c + i * N + j, NULL, M - M4); + if (N24 < N) + KernelMx24(N, K, a + i * K, b + j * K, c + i * N + j, tail, M - M4); + } } - if (M4 < M) + else { size_t j = 0; for (; j < N24; j += 24) - KernelMx24(N, K, a + i * K, b + j * K, c + i * N + j, NULL, M - M4); + { + size_t i = 0; + for (; i < M4; i += 4) + Kernel4x24(N, K, a + i * K, b + j * K, c + i * N + j); + if (M4 < M) + KernelMx24(N, K, a + i * K, b + j * K, c + i * N + j, NULL, M - M4); + } if (N24 < N) - KernelMx24(N, K, a + i * K, b + j * K, c + i * N + j, tail, M - M4); + { + size_t i = 0; + for (; i < M4; i += 4) + KernelMx24(N, K, a + i * K, b + j * K, c + i * N + j, tail, 4); + if (M4 < M) + KernelMx24(N, K, a + i * K, b + j * K, c + i * N + j, tail, M - M4); + } } } @@ -1667,7 +1635,6 @@ namespace Simd template void AddConvolution8x8(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, const float * weight, float * dst, size_t dstDepth) { - __m256 _weight[kernelX*kernelY]; for (size_t dstChannel = 0; dstChannel < dstDepth; ++dstChannel) { __m256 _dst[8]; @@ -1676,6 
+1643,7 @@ namespace Simd _dst[row] = Avx::Load(pdst); if (kernelY < 4) { + __m256 _weight[kernelX*kernelY]; for (size_t srcChannel = 0; srcChannel < srcDepth; ++srcChannel) { const float * psrc = src + srcWidth*srcHeight*srcChannel; @@ -1690,6 +1658,7 @@ namespace Simd } else { + __m256 _weight[kernelX]; for (size_t srcChannel = 0; srcChannel < srcDepth; ++srcChannel) { const float * psrc = src + srcWidth*srcHeight*srcChannel; @@ -1753,11 +1722,80 @@ namespace Simd } } + void AddConvolution1x1x16(const float * src, size_t srcDepth, const float * weight, float * dst, size_t dstDepth) + { + size_t dstDepth4 = dstDepth/4*4; + size_t dstChannel = 0; + for (; dstChannel < dstDepth4; dstChannel += 4) + { + __m256 dst00 = _mm256_loadu_ps(dst + 0 * F); + __m256 dst01 = _mm256_loadu_ps(dst + 1 * F); + __m256 dst10 = _mm256_loadu_ps(dst + 2 * F); + __m256 dst11 = _mm256_loadu_ps(dst + 3 * F); + __m256 dst20 = _mm256_loadu_ps(dst + 4 * F); + __m256 dst21 = _mm256_loadu_ps(dst + 5 * F); + __m256 dst30 = _mm256_loadu_ps(dst + 6 * F); + __m256 dst31 = _mm256_loadu_ps(dst + 7 * F); + const float * psrc = src; + const float * pw0 = weight; + const float * pw1 = pw0 + srcDepth; + const float * pw2 = pw1 + srcDepth; + const float * pw3 = pw2 + srcDepth; + for (size_t srcChannel = 0; srcChannel < srcDepth; ++srcChannel) + { + __m256 _weight; + __m256 src0 = _mm256_loadu_ps(psrc + 0 * F); + __m256 src1 = _mm256_loadu_ps(psrc + 1 * F); + _weight = _mm256_set1_ps(pw0[srcChannel]); + dst00 = _mm256_fmadd_ps(_weight, src0, dst00); + dst01 = _mm256_fmadd_ps(_weight, src1, dst01); + _weight = _mm256_set1_ps(pw1[srcChannel]); + dst10 = _mm256_fmadd_ps(_weight, src0, dst10); + dst11 = _mm256_fmadd_ps(_weight, src1, dst11); + _weight = _mm256_set1_ps(pw2[srcChannel]); + dst20 = _mm256_fmadd_ps(_weight, src0, dst20); + dst21 = _mm256_fmadd_ps(_weight, src1, dst21); + _weight = _mm256_set1_ps(pw3[srcChannel]); + dst30 = _mm256_fmadd_ps(_weight, src0, dst30); + dst31 = _mm256_fmadd_ps(_weight, 
src1, dst31); + psrc += 16; + } + _mm256_storeu_ps(dst + 0 * F, dst00); + _mm256_storeu_ps(dst + 1 * F, dst01); + _mm256_storeu_ps(dst + 2 * F, dst10); + _mm256_storeu_ps(dst + 3 * F, dst11); + _mm256_storeu_ps(dst + 4 * F, dst20); + _mm256_storeu_ps(dst + 5 * F, dst21); + _mm256_storeu_ps(dst + 6 * F, dst30); + _mm256_storeu_ps(dst + 7 * F, dst31); + dst += 16*4; + weight += srcDepth * 4; + } + for (; dstChannel < dstDepth; ++dstChannel) + { + __m256 dst0 = _mm256_loadu_ps(dst + 0 * F); + __m256 dst1 = _mm256_loadu_ps(dst + 1 * F); + const float * psrc = src; + for (size_t srcChannel = 0; srcChannel < srcDepth; ++srcChannel) + { + __m256 weight0 = _mm256_set1_ps(*weight++); + dst0 = _mm256_fmadd_ps(weight0, _mm256_loadu_ps(psrc + 0 * F), dst0); + dst1 = _mm256_fmadd_ps(weight0, _mm256_loadu_ps(psrc + 1 * F), dst1); + psrc += 16; + } + _mm256_storeu_ps(dst + 0 * F, dst0); + _mm256_storeu_ps(dst + 1 * F, dst1); + dst += 16; + } + } + void Execute(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, const float * weight, size_t kernelX, size_t kernelY, float * dst, size_t dstWidth, size_t dstHeight, size_t dstDepth) { assert(kernelX == kernelY); - if (kernelX == 2) + if (kernelX == 1 && dstWidth*dstHeight == 16) + AddConvolution1x1x16(src, srcDepth, weight, dst, dstDepth); + else if (kernelX == 2) AddConvolution(src, srcWidth, srcHeight, srcDepth, weight, dst, dstWidth, dstHeight, dstDepth); else if (kernelX == 3) AddConvolution(src, srcWidth, srcHeight, srcDepth, weight, dst, dstWidth, dstHeight, dstDepth); @@ -1771,9 +1809,11 @@ namespace Simd bool Preferable(size_t srcDepth, size_t kernelX, size_t kernelY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, size_t dstWidth, size_t dstHeight, size_t dstDepth) { - if (kernelX == kernelY && kernelX >= 2 && kernelX <= 5 && strideX*strideY*dilationX*dilationY == 1) + if (kernelX == kernelY && strideX*strideY*dilationX*dilationY == 1 && dstWidth >= F) { - if 
(dstWidth*dstHeight*kernelX*kernelY >= 8 * 8 * 3 * 3) + if (kernelX >= 2 && kernelX <= 5 && dstWidth*dstHeight*kernelX*kernelY >= 8 * 8 * 3 * 3) + return true; + if (kernelX == 1 && (dstWidth*dstHeight == 16))// || dstWidth * dstHeight == 64)) return true; } return false; diff --git a/src/3rd/Simd/SimdAvx2Resizer.cpp b/src/3rd/Simd/SimdAvx2Resizer.cpp new file mode 100644 index 00000000..f263b94d --- /dev/null +++ b/src/3rd/Simd/SimdAvx2Resizer.cpp @@ -0,0 +1,151 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdResizer.h" +#include "Simd/SimdStore.h" + +namespace Simd +{ +#ifdef SIMD_AVX2_ENABLE + namespace Avx2 + { + ResizerFloatBilinear::ResizerFloatBilinear(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, bool caffeInterp) + : Base::ResizerFloatBilinear(srcX, srcY, dstX, dstY, channels, sizeof(__m256), caffeInterp) + { + } + + void ResizerFloatBilinear::Run(const float * src, size_t srcStride, float * dst, size_t dstStride) const + { + Array32f bx[2]; + bx[0].Resize(_rs); + bx[1].Resize(_rs); + float * pbx[2] = { bx[0].data, bx[1].data }; + int32_t prev = -2; + size_t rsa = AlignLo(_rs, Avx::F); + size_t rsh = AlignLo(_rs, Sse::F); + for (size_t dy = 0; dy < _dy; dy++, dst += dstStride) + { + float fy1 = _ay[dy]; + float fy0 = 1.0f - fy1; + int32_t sy = _iy[dy]; + int32_t k = 0; + + if (sy == prev) + k = 2; + else if (sy == prev + 1) + { + Swap(pbx[0], pbx[1]); + k = 1; + } + + prev = sy; + + for (; k < 2; k++) + { + float * pb = pbx[k]; + const float * ps = src + (sy + k)*srcStride; + size_t dx = 0; + if (_cn == 1) + { + __m256 _1 = _mm256_set1_ps(1.0f); + for (; dx < rsa; dx += Avx::F) + { + __m256i idx = Avx2::LoadPermuted((__m256i*)(_ix.data + dx)); + __m256 s0145 = _mm256_castpd_ps(_mm256_i32gather_pd((double*)ps, _mm256_extracti128_si256(idx, 0), 4)); + __m256 s2367 = _mm256_castpd_ps(_mm256_i32gather_pd((double*)ps, _mm256_extracti128_si256(idx, 1), 4)); + __m256 fx1 = _mm256_load_ps(_ax.data + dx); + __m256 fx0 = _mm256_sub_ps(_1, fx1); + __m256 s0 = _mm256_shuffle_ps(s0145, s2367, 0x88); + __m256 s1 = _mm256_shuffle_ps(s0145, s2367, 0xDD); + _mm256_store_ps(pb + dx, _mm256_fmadd_ps(s0, fx0, _mm256_mul_ps(s1, fx1))); + } + for (; dx < rsh; dx += Sse::F) + { + __m128 s01 = Sse::Load(ps + _ix[dx + 0], ps + _ix[dx + 1]); + __m128 s23 = Sse::Load(ps + _ix[dx + 2], ps + _ix[dx + 3]); + __m128 fx1 = _mm_load_ps(_ax.data + dx); + __m128 fx0 = _mm_sub_ps(_mm256_castps256_ps128(_1), fx1); + __m128 
m0 = _mm_mul_ps(fx0, _mm_shuffle_ps(s01, s23, 0x88)); + __m128 m1 = _mm_mul_ps(fx1, _mm_shuffle_ps(s01, s23, 0xDD)); + _mm_store_ps(pb + dx, _mm_add_ps(m0, m1)); + } + } + else + { + __m256 _1 = _mm256_set1_ps(1.0f); + __m256i cn = _mm256_set1_epi32((int)_cn); + for (; dx < rsa; dx += Avx::F) + { + __m256i i0 = _mm256_load_si256((__m256i*)(_ix.data + dx)); + __m256i i1 = _mm256_add_epi32(i0, cn); + __m256 s0 = _mm256_i32gather_ps(ps, i0, 4); + __m256 s1 = _mm256_i32gather_ps(ps, i1, 4); + __m256 fx1 = _mm256_load_ps(_ax.data + dx); + __m256 fx0 = _mm256_sub_ps(_1, fx1); + _mm256_store_ps(pb + dx, _mm256_fmadd_ps(s0, fx0, _mm256_mul_ps(s1, fx1))); + } + } + for (; dx < _rs; dx++) + { + int32_t sx = _ix[dx]; + float fx = _ax[dx]; + pb[dx] = ps[sx] * (1.0f - fx) + ps[sx + _cn] * fx; + } + } + + size_t dx = 0; + __m256 _fy0 = _mm256_set1_ps(fy0); + __m256 _fy1 = _mm256_set1_ps(fy1); + for (; dx < rsa; dx += Avx::F) + { + __m256 b0 = _mm256_load_ps(pbx[0] + dx); + __m256 b1 = _mm256_load_ps(pbx[1] + dx); + _mm256_storeu_ps(dst + dx, _mm256_fmadd_ps(b0, _fy0, _mm256_mul_ps(b1, _fy1))); + } + for (; dx < rsh; dx += Sse::F) + { + __m128 m0 = _mm_mul_ps(_mm_load_ps(pbx[0] + dx), _mm256_castps256_ps128(_fy0)); + __m128 m1 = _mm_mul_ps(_mm_load_ps(pbx[1] + dx), _mm256_castps256_ps128(_fy1)); + _mm_storeu_ps(dst + dx, _mm_add_ps(m0, m1)); + } + for (; dx < _rs; dx++) + dst[dx] = pbx[0][dx] * fy0 + pbx[1][dx] * fy1; + } + } + + //--------------------------------------------------------------------- + + void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) + { + if (type == SimdResizeChannelFloat && method == SimdResizeMethodBilinear) + return new ResizerFloatBilinear(srcX, srcY, dstX, dstY, channels, false); + else if (type == SimdResizeChannelFloat && method == SimdResizeMethodCaffeInterp) + return new ResizerFloatBilinear(srcX, srcY, dstX, dstY, channels, true); + else + return 
Base::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); + } + } +#endif //SIMD_AVX2_ENABLE +} + diff --git a/src/3rd/Simd/SimdAvx2Statistic.cpp b/src/3rd/Simd/SimdAvx2Statistic.cpp index 03d7c5eb..af9209df 100644 --- a/src/3rd/Simd/SimdAvx2Statistic.cpp +++ b/src/3rd/Simd/SimdAvx2Statistic.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -538,6 +538,46 @@ namespace Simd SquareSum(src, stride, width, height, sum); } + template void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum) + { + assert(width >= A); + if (align) + assert(Aligned(src) && Aligned(stride)); + + size_t bodyWidth = AlignLo(width, A); + __m256i tailMask = SetMask(0, A - width + bodyWidth, 0xFF); + __m256i fullValueSum = _mm256_setzero_si256(); + __m256i fullSquareSum = _mm256_setzero_si256(); + for (size_t row = 0; row < height; ++row) + { + __m256i rowSquareSum = _mm256_setzero_si256(); + for (size_t col = 0; col < bodyWidth; col += A) + { + const __m256i value = Load((__m256i*)(src + col)); + fullValueSum = _mm256_add_epi64(_mm256_sad_epu8(value, K_ZERO), fullValueSum); + rowSquareSum = _mm256_add_epi32(rowSquareSum, Square(value)); + } + if (width - bodyWidth) + { + const __m256i value = _mm256_and_si256(tailMask, Load((__m256i*)(src + width - A))); + fullValueSum = _mm256_add_epi64(_mm256_sad_epu8(value, K_ZERO), fullValueSum); + rowSquareSum = _mm256_add_epi32(rowSquareSum, Square(value)); + } + fullSquareSum = _mm256_add_epi64(fullSquareSum, HorizontalSum32(rowSquareSum)); + src += stride; + } + *valueSum = ExtractSum(fullValueSum); + *squareSum = ExtractSum(fullSquareSum); + } + + void ValueSquareSum(const uint8_t * src, size_t stride, 
size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum) + { + if (Aligned(src) && Aligned(stride)) + ValueSquareSum(src, stride, width, height, valueSum, squareSum); + else + ValueSquareSum(src, stride, width, height, valueSum, squareSum); + } + SIMD_INLINE __m256i Correlation(__m256i a, __m256i b) { const __m256i lo = _mm256_madd_epi16(_mm256_unpacklo_epi8(a, _mm256_setzero_si256()), _mm256_unpacklo_epi8(b, _mm256_setzero_si256())); diff --git a/src/3rd/Simd/SimdAvx2Synet.cpp b/src/3rd/Simd/SimdAvx2Synet.cpp new file mode 100644 index 00000000..6da16eae --- /dev/null +++ b/src/3rd/Simd/SimdAvx2Synet.cpp @@ -0,0 +1,238 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdStore.h" +#include "Simd/SimdExtract.h" +#include "Simd/SimdAvx1.h" +#include "Simd/SimdArray.h" +#include "Simd/SimdPow.h" + +namespace Simd +{ +#ifdef SIMD_AVX2_ENABLE + namespace Avx2 + { + template void SynetEltwiseLayerForwardSum(const float * src0, const __m256 & weight0, const float * src1, const __m256 & weight1, float * dst, size_t offset) + { + Avx::Store(dst + offset, _mm256_fmadd_ps(Avx::Load(src0 + offset), weight0, _mm256_mul_ps(Avx::Load(src1 + offset), weight1))); + } + + template void SynetEltwiseLayerForwardSum(const float * src, const __m256 & weight, float * dst, size_t offset) + { + Avx::Store(dst + offset, _mm256_fmadd_ps(Avx::Load(src + offset), weight, Load(dst + offset))); + } + + template void SynetEltwiseLayerForwardSum(float const * const * src, const float * weight, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t partial = AlignLo(size, F); + const float * src0 = src[0]; + const float * src1 = src[1]; + __m256 weight0 = _mm256_set1_ps(weight[0]); + __m256 weight1 = _mm256_set1_ps(weight[1]); + size_t j = 0; + if (partial) + { + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 0); + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 1); + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 2); + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j); + } + for (; j < size; ++j) + dst[j] = src0[j] * weight[0] + src1[j] * weight[1]; + for (size_t i = 2; i < count; ++i) + { + const float * srci = src[i]; + __m256 weighti = _mm256_set1_ps(weight[i]); + size_t j = 0; + if (partial) + { + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 0); + SynetEltwiseLayerForwardSum(srci, weighti, dst, j + 
F * 1); + SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 2); + SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardSum(srci, weighti, dst, j); + } + for (; j < size; ++j) + dst[j] += srci[j] * weight[i]; + } + } + + void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst) + { + if (type != SimdSynetEltwiseOperationSum) + { + Avx::SynetEltwiseLayerForward(src, weight, count, size, type, dst); + return; + } + assert(count >= 2); + bool aligned = Aligned(dst) && Aligned(src[0]) && Aligned(src[1]); + for (size_t i = 2; i < count; ++i) + aligned = aligned && Aligned(src[i]); + if (aligned) + SynetEltwiseLayerForwardSum(src, weight, count, size, dst); + else + SynetEltwiseLayerForwardSum(src, weight, count, size, dst); + } + + template SIMD_INLINE void SynetLrnLayerCrossChannels(const float * src, size_t half, size_t count, size_t size, const float * k, float * dst) + { + size_t aligned = AlignLo(size, F); + Array32f sum(size, true), zero(size, true); + + for (size_t i = 0; i < half; ++i) + { + const float * pos = src + i * size; + size_t j = 0; + for (; j < aligned; j += F) + { + __m256 _pos = Avx::Load(pos + j); + Avx::Store(sum.data + j, _mm256_fmadd_ps(_pos, _pos, Avx::Load(sum.data + j))); + } + for (; j < size; ++j) + sum[j] += Simd::Square(pos[j]); + } + + __m256 k0 = _mm256_set1_ps(k[0]); + __m256 k1 = _mm256_set1_ps(k[1]); + __m256 k2 = _mm256_set1_ps(k[2]); + Avx2::Pow pow; + for (size_t i = 0; i < count; ++i) + { + const float * pos = (i < count - half) ? src + half * size : zero.data; + const float * neg = (i > half) ? 
src - (half + 1) * size : zero.data; + size_t j = 0; + for (; j < aligned; j += F) + { + __m256 _pos = Avx::Load(pos + j); + __m256 _neg = Avx::Load(neg + j); + __m256 _sum = Avx::Load(sum.data + j); + _sum = _mm256_fmadd_ps(_pos, _pos, _mm256_fnmadd_ps(_neg, _neg, _sum)); + __m256 _src = Avx::Load(src + j); + Avx::Store(sum.data + j, _sum); + Avx::Store(dst + j, _mm256_mul_ps(_src, pow(_mm256_fmadd_ps(k1, _sum, k0), k2))); + } + for (; j < size; ++j) + { + sum[j] += Simd::Square(pos[j]); + sum[j] -= Simd::Square(neg[j]); + dst[j] = src[j] * Base::Pow(k[0] + k[1] * sum[j], k[2]); + } + src += size; + dst += size; + } + } + + void SynetLrnLayerCrossChannels(const float * src, size_t half, size_t count, size_t size, const float * k, float * dst) + { + if (Aligned(src) && Aligned(dst) && Aligned(size)) + SynetLrnLayerCrossChannels(src, half, count, size, k, dst); + else + SynetLrnLayerCrossChannels(src, half, count, size, k, dst); + } + + template SIMD_INLINE void SynetScaleLayerForward(const float * src, const __m256 & scale, const __m256 & bias, float * dst, size_t offset) + { + Avx::Store(dst + offset, _mm256_fmadd_ps(Avx::Load(src + offset), scale, bias)); + } + + template SIMD_INLINE void SynetScaleLayerForward(const float * src, const __m256 & scale, float * dst, size_t offset) + { + Avx::Store(dst + offset, _mm256_mul_ps(Avx::Load(src + offset), scale)); + } + + template SIMD_INLINE void SynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t partial = AlignLo(size, F); + if (bias) + { + for (size_t i = 0; i < count; ++i) + { + size_t j = 0; + if (partial) + { + __m256 _scale = _mm256_set1_ps(scale[i]); + __m256 _bias = _mm256_set1_ps(bias[i]); + for (; j < aligned; j += QF) + { + SynetScaleLayerForward(src, _scale, _bias, dst, j + F * 0); + SynetScaleLayerForward(src, _scale, _bias, dst, j + F * 1); + SynetScaleLayerForward(src, _scale, _bias, 
dst, j + F * 2); + SynetScaleLayerForward(src, _scale, _bias, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetScaleLayerForward(src, _scale, _bias, dst, j); + } + for (; j < size; ++j) + dst[j] = src[j] * scale[i] + bias[i]; + src += size; + dst += size; + } + } + else + { + for (size_t i = 0; i < count; ++i) + { + size_t j = 0; + if (partial) + { + __m256 _scale = _mm256_set1_ps(scale[i]); + for (; j < aligned; j += QF) + { + SynetScaleLayerForward(src, _scale, dst, j + F * 0); + SynetScaleLayerForward(src, _scale, dst, j + F * 1); + SynetScaleLayerForward(src, _scale, dst, j + F * 2); + SynetScaleLayerForward(src, _scale, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetScaleLayerForward(src, _scale, dst, j); + } + for (; j < size; ++j) + dst[j] = src[j] * scale[i]; + src += size; + dst += size; + } + } + } + + void SynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst) + { + if (Aligned(dst) && Aligned(size)) + SynetScaleLayerForward(src, scale, bias, count, size, dst); + else + SynetScaleLayerForward(src, scale, bias, count, size, dst); + } + } +#endif// SIMD_AVX2_ENABLE +} diff --git a/src/3rd/Simd/SimdAvx512bw.h b/src/3rd/Simd/SimdAvx512bw.h index 45f56475..f8d843e0 100644 --- a/src/3rd/Simd/SimdAvx512bw.h +++ b/src/3rd/Simd/SimdAvx512bw.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -179,10 +179,14 @@ namespace Simd void SquaredDifferenceSum16f(const uint16_t * a, const uint16_t * b, size_t size, float * sum); + void CosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance); + void Float32ToUint8(const float * src, size_t size, const float * lower, const float * upper, uint8_t * dst); void Uint8ToFloat32(const uint8_t * src, size_t size, const float * lower, const float * upper, float * dst); + void CosineDistance32f(const float * a, const float * b, size_t size, float * distance); + void GaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride); void GrayToBgr(const uint8_t * gray, size_t width, size_t height, size_t grayStride, uint8_t * bgr, size_t bgrStride); @@ -211,7 +215,7 @@ namespace Simd void HogLiteExtractFeatures(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t cell, float * features, size_t featuresStride); - void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride); + void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride); void HogLiteResizeFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight); @@ -355,6 +359,8 @@ namespace Simd void SquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); + void ValueSquareSum(const 
uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum); + void CorrelationSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum); void StretchGray2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, diff --git a/src/3rd/Simd/SimdAvx512bwFloat16.cpp b/src/3rd/Simd/SimdAvx512bwFloat16.cpp index 3f4252bd..d8b6eac7 100644 --- a/src/3rd/Simd/SimdAvx512bwFloat16.cpp +++ b/src/3rd/Simd/SimdAvx512bwFloat16.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -202,6 +202,67 @@ namespace Simd else SquaredDifferenceSum16f(a, b, size, sum); } + + template SIMD_INLINE void CosineDistance16f(const __m512i & a, const __m512i & b, __m512 * aa, __m512 * ab, __m512 * bb) + { + __m512 a0 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(a, part)); + __m512 b0 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(b, part)); + aa[part] = _mm512_fmadd_ps(a0, a0, aa[part]); + ab[part] = _mm512_fmadd_ps(a0, b0, ab[part]); + bb[part] = _mm512_fmadd_ps(b0, b0, bb[part]); + } + + template SIMD_INLINE void CosineDistance16f2(const uint16_t * a, const uint16_t * b, __m512 * aa, __m512 * ab, __m512 * bb, __mmask32 tail = -1) + { + __m512i a0 = Load(a, tail); + __m512i b0 = Load(b, tail); + CosineDistance16f<0>(a0, b0, aa, ab, bb); + CosineDistance16f<1>(a0, b0, aa, ab, bb); + } + + template SIMD_INLINE void CosineDistance16f4(const uint16_t * a, const uint16_t * b, __m512 * aa, __m512 * ab, __m512 * bb) + { + __m512i a0 = Load(a + 00); + __m512i b0 = Load(b + 00); + CosineDistance16f<0>(a0, b0, aa, ab, bb); + CosineDistance16f<1>(a0, b0, aa, ab, bb); + __m512i a1 = Load(a + HA); + __m512i b1 = Load(b + HA); 
+ CosineDistance16f<0>(a1, b1, aa, ab, bb); + CosineDistance16f<1>(a1, b1, aa, ab, bb); + } + + template void CosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance) + { + if (align) + assert(Aligned(a) && Aligned(b)); + + size_t alignedSize = AlignLo(size, DF); + __mmask32 tailMask = TailMask32(size - alignedSize); + size_t fullAlignedSize = AlignLo(size, QF); + size_t i = 0; + __m512 _aa[2] = { _mm512_setzero_ps(), _mm512_setzero_ps() }; + __m512 _ab[2] = { _mm512_setzero_ps(), _mm512_setzero_ps() }; + __m512 _bb[2] = { _mm512_setzero_ps(), _mm512_setzero_ps() }; + for (; i < fullAlignedSize; i += QF) + CosineDistance16f4(a + i, b + i, _aa, _ab, _bb); + for (; i < alignedSize; i += DF) + CosineDistance16f2(a + i, b + i, _aa, _ab, _bb); + if (i < size) + CosineDistance16f2(a + i, b + i, _aa, _ab, _bb, tailMask); + float aa = Avx512f::ExtractSum(_mm512_add_ps(_aa[0], _aa[1])); + float ab = Avx512f::ExtractSum(_mm512_add_ps(_ab[0], _ab[1])); + float bb = Avx512f::ExtractSum(_mm512_add_ps(_bb[0], _bb[1])); + *distance = 1.0f - ab / ::sqrt(aa*bb); + } + + void CosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance) + { + if (Aligned(a) && Aligned(b)) + CosineDistance16f(a, b, size, distance); + else + CosineDistance16f(a, b, size, distance); + } } #endif// SIMD_AVX512BW_ENABLE } diff --git a/src/3rd/Simd/SimdAvx512bwFloat32.cpp b/src/3rd/Simd/SimdAvx512bwFloat32.cpp index 8cbbede8..165a1ed4 100644 --- a/src/3rd/Simd/SimdAvx512bwFloat32.cpp +++ b/src/3rd/Simd/SimdAvx512bwFloat32.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -23,6 +23,7 @@ */ #include "Simd/SimdMemory.h" #include "Simd/SimdStore.h" +#include "Simd/SimdExtract.h" namespace Simd { @@ -75,7 +76,7 @@ namespace Simd template SIMD_INLINE void Uint8ToFloat32(const __m128i & value, const __m512 & lower, const __m512 & boost, float * dst, __mmask16 tail) { - Avx512f::Store(dst, _mm512_sub_ps(_mm512_mul_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(value)), boost), lower), tail); + Avx512f::Store(dst, _mm512_add_ps(_mm512_mul_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(value)), boost), lower), tail); } template SIMD_INLINE void Uint8ToFloat32(const uint8_t * src, const __m512 & lower, const __m512 & boost, float * dst, __mmask64 srcTail, const __mmask16 * dstTails) @@ -115,6 +116,64 @@ namespace Simd else Uint8ToFloat32(src, size, lower, upper, dst); } + + template void CosineDistance32f(const float * a, const float * b, size_t size, float * distance) + { + if (align) + assert(Aligned(a) && Aligned(b)); + + size_t partialAlignedSize = AlignLo(size, F); + size_t fullAlignedSize = AlignLo(size, DF); + size_t i = 0; + __m512 _aa[2] = { _mm512_setzero_ps(), _mm512_setzero_ps() }; + __m512 _ab[2] = { _mm512_setzero_ps(), _mm512_setzero_ps() }; + __m512 _bb[2] = { _mm512_setzero_ps(), _mm512_setzero_ps() }; + if (fullAlignedSize) + { + for (; i < fullAlignedSize; i += DF) + { + __m512 a0 = Avx512f::Load(a + i + 0 * F); + __m512 b0 = Avx512f::Load(b + i + 0 * F); + _aa[0] = _mm512_fmadd_ps(a0, a0, _aa[0]); + _ab[0] = _mm512_fmadd_ps(a0, b0, _ab[0]); + _bb[0] = _mm512_fmadd_ps(b0, b0, _bb[0]); + __m512 a1 = Avx512f::Load(a + i + 1 * F); + __m512 b1 = Avx512f::Load(b + i + 1 * F); + _aa[1] = _mm512_fmadd_ps(a1, a1, _aa[1]); + _ab[1] = _mm512_fmadd_ps(a1, b1, _ab[1]); + _bb[1] = _mm512_fmadd_ps(b1, b1, _bb[1]); + } + _aa[0] = _mm512_add_ps(_aa[0], _aa[1]); + _ab[0] = 
_mm512_add_ps(_ab[0], _ab[1]); + _bb[0] = _mm512_add_ps(_bb[0], _bb[1]); + } + for (; i < partialAlignedSize; i += F) + { + __m512 a0 = Avx512f::Load(a + i); + __m512 b0 = Avx512f::Load(b + i); + _aa[0] = _mm512_fmadd_ps(a0, a0, _aa[0]); + _ab[0] = _mm512_fmadd_ps(a0, b0, _ab[0]); + _bb[0] = _mm512_fmadd_ps(b0, b0, _bb[0]); + } + float aa = Avx512f::ExtractSum(_aa[0]), ab = Avx512f::ExtractSum(_ab[0]), bb = Avx512f::ExtractSum(_bb[0]); + for (; i < size; ++i) + { + float _a = a[i]; + float _b = b[i]; + aa += _a * _a; + ab += _a * _b; + bb += _b * _b; + } + *distance = 1.0f - ab / ::sqrt(aa*bb); + } + + void CosineDistance32f(const float * a, const float * b, size_t size, float * distance) + { + if (Aligned(a) && Aligned(b)) + CosineDistance32f(a, b, size, distance); + else + CosineDistance32f(a, b, size, distance); + } } #endif// SIMD_AVX512BW_ENABLE } diff --git a/src/3rd/Simd/SimdAvx512bwHog.cpp b/src/3rd/Simd/SimdAvx512bwHog.cpp index 9817e50d..c875227f 100644 --- a/src/3rd/Simd/SimdAvx512bwHog.cpp +++ b/src/3rd/Simd/SimdAvx512bwHog.cpp @@ -517,12 +517,12 @@ namespace Simd } #else __m128 * ps = (__m128*)src; - __m128 s0 = _mm_add_ps(_mm_unpacklo_ps(ps[16], ps[17]), _mm_loadh_pi(_mm_loadl_pi(_mm_setzero_ps(), (__m64*)(h0[0] + 16)), (__m64*)(h0[1] + 16))); - __m128 s1 = _mm_add_ps(_mm_unpackhi_ps(ps[16], ps[17]), _mm_loadh_pi(_mm_loadl_pi(_mm_setzero_ps(), (__m64*)(h1[0] + 16)), (__m64*)(h1[1] + 16))); - _mm_storel_pi((__m64*)(h0[0] + 16), s0); - _mm_storeh_pi((__m64*)(h0[1] + 16), s0); - _mm_storel_pi((__m64*)(h1[0] + 16), s1); - _mm_storeh_pi((__m64*)(h1[1] + 16), s1); + __m128 s0 = _mm_add_ps(_mm_unpacklo_ps(ps[16], ps[17]), Sse::Load(h0[0] + 16, h0[1] + 16)); + __m128 s1 = _mm_add_ps(_mm_unpackhi_ps(ps[16], ps[17]), Sse::Load(h1[0] + 16, h1[1] + 16)); + Sse::StoreHalf<0>(h0[0] + 16, s0); + Sse::StoreHalf<1>(h0[1] + 16, s0); + Sse::StoreHalf<0>(h1[0] + 16, s1); + Sse::StoreHalf<1>(h1[1] + 16, s1); #endif h0++; h1++; diff --git 
a/src/3rd/Simd/SimdAvx512bwHogLite.cpp b/src/3rd/Simd/SimdAvx512bwHogLite.cpp index edb32257..00cfb3da 100644 --- a/src/3rd/Simd/SimdAvx512bwHogLite.cpp +++ b/src/3rd/Simd/SimdAvx512bwHogLite.cpp @@ -470,9 +470,9 @@ namespace Simd sums[1] = _mm512_fmadd_ps(src5, filter3, sums[1]); } - template void Filter8(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterSize, float * dst, size_t dstStride) + template void Filter8(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterWidth, size_t filterHeight, float * dst, size_t dstStride) { - size_t filterStride = 8 * filterSize; + size_t filterStride = 8 * filterWidth; size_t alignedDstWidth = AlignLo(dstWidth, 8); size_t alignedFilterStride = AlignLo(filterStride, DF); for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) @@ -483,7 +483,7 @@ namespace Simd __m512 sums[2] = { _mm512_setzero_ps(), _mm512_setzero_ps() }; const float * pSrc = src + dstRow * srcStride + dstCol * 8; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { size_t filterCol = 0; for (; filterCol < alignedFilterStride; filterCol += DF) @@ -503,7 +503,7 @@ namespace Simd __m256 sum = _mm256_setzero_ps(); const float * pSrc = src + dstRow * srcStride + dstCol * 8; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { for (size_t filterCol = 0; filterCol < filterStride; filterCol += Avx::F) ProductSum1x1(pSrc + filterCol, pFilter + filterCol, sum); @@ -516,9 +516,9 @@ namespace Simd } } - template void Filter8(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + template void Filter8(const 
float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { - size_t filterStride = 8 * filterSize; + size_t filterStride = 8 * filterWidth; size_t alignedDstWidth = AlignLo(dstWidth, 8); size_t alignedFilterStride = AlignLo(filterStride, DF); __m128 _min = _mm_set1_ps(-FLT_MAX); @@ -535,7 +535,7 @@ namespace Simd __m512 sums[2] = { _mm512_setzero_ps(), _mm512_setzero_ps() }; const float * pSrc = src + dstRow * srcStride + dstCol * 8; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { size_t filterCol = 0; for (; filterCol < alignedFilterStride; filterCol += DF) @@ -558,7 +558,7 @@ namespace Simd __m256 sum = _mm256_setzero_ps(); const float * pSrc = src + dstRow * srcStride + dstCol * 8; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { for (size_t filterCol = 0; filterCol < filterStride; filterCol += Avx::F) ProductSum1x1(pSrc + filterCol, pFilter + filterCol, sum); @@ -603,9 +603,9 @@ namespace Simd sums[3] = _mm512_fmadd_ps(src4, filter1, sums[3]); } - template void Filter16(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterSize, float * dst, size_t dstStride) + template void Filter16(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterWidth, size_t filterHeight, float * dst, size_t dstStride) { - size_t filterStride = 16 * filterSize; + size_t filterStride = 16 * filterWidth; size_t alignedDstWidth = AlignLo(dstWidth, 4); size_t alignedFilterStride = AlignLo(filterStride, DF); for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) @@ -616,7 +616,7 @@ namespace Simd __m512 sums[4] 
= { _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() }; const float * pSrc = src + dstRow * srcStride + dstCol * 16; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { size_t filterCol = 0; for (; filterCol < alignedFilterStride; filterCol += DF) @@ -638,7 +638,7 @@ namespace Simd __m512 sum = _mm512_setzero_ps(); const float * pSrc = src + dstRow * srcStride + dstCol * 16; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { for (size_t filterCol = 0; filterCol < filterStride; filterCol += F) ProductSum1x1(pSrc + filterCol, pFilter + filterCol, sum); @@ -651,9 +651,9 @@ namespace Simd } } - template void Filter16(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + template void Filter16(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { - size_t filterStride = 16 * filterSize; + size_t filterStride = 16 * filterWidth; size_t alignedDstWidth = AlignLo(dstWidth, 4); size_t alignedFilterStride = AlignLo(filterStride, DF); __m128 _min = _mm_set1_ps(-FLT_MAX); @@ -670,7 +670,7 @@ namespace Simd __m512 sums[4] = { _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() }; const float * pSrc = src + dstRow * srcStride + dstCol * 16; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { size_t filterCol = 0; for (; filterCol < alignedFilterStride; filterCol += DF) @@ -695,7 +695,7 @@ 
namespace Simd __m512 sum = _mm512_setzero_ps(); const float * pSrc = src + dstRow * srcStride + dstCol * 16; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { for (size_t filterCol = 0; filterCol < filterStride; filterCol += F) ProductSum1x1(pSrc + filterCol, pFilter + filterCol, sum); @@ -712,53 +712,53 @@ namespace Simd } } - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterSize, float * dst, size_t dstStride) + template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, float * dst, size_t dstStride) { if (featureSize == 16) - Filter16(src, srcStride, dstWidth, dstHeight, filter, filterSize, dst, dstStride); + Filter16(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, dst, dstStride); else - Filter8(src, srcStride, dstWidth, dstHeight, filter, filterSize, dst, dstStride); + Filter8(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, dst, dstStride); } - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { if (featureSize == 16) - Filter16(src, srcStride, dstWidth, dstHeight, filter, filterSize, mask, maskStride, dst, dstStride); + Filter16(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); else - Filter8(src, srcStride, 
dstWidth, dstHeight, filter, filterSize, mask, maskStride, dst, dstStride); + Filter8(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); } public: - void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { assert(featureSize == 8 || featureSize == 16); - assert(srcWidth >= filterSize && srcHeight >= filterSize); + assert(srcWidth >= filterWidth && srcHeight >= filterHeight); - size_t dstWidth = srcWidth - filterSize + 1; - size_t dstHeight = srcHeight - filterSize + 1; + size_t dstWidth = srcWidth - filterWidth + 1; + size_t dstHeight = srcHeight - filterHeight + 1; if (mask) { if (Aligned(src) && Aligned(srcStride) && Aligned(filter)) - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); else - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); } else { if (Aligned(src) && Aligned(srcStride) && Aligned(filter)) - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, dst, dstStride); else - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, dst, dstStride); + Filter(src, srcStride, 
dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, dst, dstStride); } } }; - void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { HogLiteFeatureFilter featureFilter; - featureFilter.Run(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + featureFilter.Run(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); } class HogLiteFeatureResizer diff --git a/src/3rd/Simd/SimdAvx512bwReduceGray4x4.cpp b/src/3rd/Simd/SimdAvx512bwReduceGray4x4.cpp index df7520e0..9475c008 100644 --- a/src/3rd/Simd/SimdAvx512bwReduceGray4x4.cpp +++ b/src/3rd/Simd/SimdAvx512bwReduceGray4x4.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/src/3rd/Simd/SimdAvx512bwResizeBilinear.cpp b/src/3rd/Simd/SimdAvx512bwResizeBilinear.cpp index f3fb710d..499a176d 100644 --- a/src/3rd/Simd/SimdAvx512bwResizeBilinear.cpp +++ b/src/3rd/Simd/SimdAvx512bwResizeBilinear.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -306,15 +306,34 @@ namespace Simd Store(dst, _mm512_permutexvar_epi64(K64_PERMUTE_FOR_PACK, _mm512_packus_epi16(lo, hi))); } + template SIMD_INLINE void Gather(const uint8_t * src, const int * idx, size_t size, uint8_t * dst) + { + struct Src { uint8_t channels[channelCount * 1]; }; + struct Dst { uint8_t channels[channelCount * 2]; }; + const Src * s = (const Src *)src; + Dst * d = (Dst*)dst; + for (size_t i = 0; i < size; i++) + d[i] = *(Dst *)(s + idx[i]); + } + + template <> SIMD_INLINE void Gather<2>(const uint8_t * src, const int * idx, size_t size, uint8_t * dst) + { + for (size_t i = 0; i < size; i += 16) + _mm512_storeu_si512(dst + 4*i, _mm512_i32gather_epi32(_mm512_loadu_si512(idx + i), src, 2)); + } + + template <> SIMD_INLINE void Gather<4>(const uint8_t * src, const int * idx, size_t size, uint8_t * dst) + { + for (size_t i = 0; i < size; i += 8) + _mm512_storeu_si512(dst + 8 * i, _mm512_i32gather_epi64(_mm256_loadu_si256((__m256i*)(idx + i)), src, 4)); + } + template void ResizeBilinear( const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride) { assert(dstWidth >= A); - struct One { uint8_t channels[channelCount]; }; - struct Two { uint8_t channels[channelCount * 2]; }; - size_t size = 2 * dstWidth*channelCount; size_t bufferSize = AlignHi(dstWidth, A)*channelCount * 2; size_t alignedSize = AlignHi(size, DA) - DA; @@ -350,10 +369,7 @@ namespace Simd for (; k < 2; k++) { - Two * pb = (Two *)buffer.bx[k]; - const One * psrc = (const One *)(src + (sy + k)*srcStride); - for (size_t x = 0; x < dstWidth; x++) - pb[x] = *(Two *)(psrc + buffer.ix[x]); + Gather(src + (sy + k)*srcStride, buffer.ix, dstWidth, buffer.bx[k]); uint8_t * pbx = buffer.bx[k]; for (size_t i = 0; i < bufferSize; i += step) diff --git 
a/src/3rd/Simd/SimdAvx512bwStatistic.cpp b/src/3rd/Simd/SimdAvx512bwStatistic.cpp index e5a9cf8d..27a35a7d 100644 --- a/src/3rd/Simd/SimdAvx512bwStatistic.cpp +++ b/src/3rd/Simd/SimdAvx512bwStatistic.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -555,6 +555,62 @@ namespace Simd SquareSum(src, stride, width, height, sum); } + template void ValueSquareSum(const __m512i & value, __m512i * valueSums, __m512i * squareSums) + { + valueSums[index] = _mm512_add_epi64(valueSums[index], _mm512_sad_epu8(value, K_ZERO)); + squareSums[index] = _mm512_add_epi32(squareSums[index], SquareSum(value)); + } + + template void ValueSquareSum4(const uint8_t * src, __m512i * valueSums, __m512i * squareSums) + { + ValueSquareSum<0>(Load(src + 0 * A), valueSums, squareSums); + ValueSquareSum<1>(Load(src + 1 * A), valueSums, squareSums); + ValueSquareSum<2>(Load(src + 2 * A), valueSums, squareSums); + ValueSquareSum<3>(Load(src + 3 * A), valueSums, squareSums); + } + + template void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum) + { + assert(width < 256 * 256 * F); + if (align) + assert(Aligned(src) && Aligned(stride)); + + size_t alignedWidth = Simd::AlignLo(width, A); + size_t fullAlignedWidth = Simd::AlignLo(width, QA); + __mmask64 tailMask = TailMask64(width - alignedWidth); + size_t blockSize = (256 * 256 * F) / width; + size_t blockCount = height / blockSize + 1; + __m512i valueSums[4] = { _mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512() }; + __m512i fullSquareSum = _mm512_setzero_si512(); + for (size_t block = 0; block < blockCount; ++block) + { + __m512i squareSums[4] = { 
_mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512() }; + for (size_t row = block * blockSize, endRow = Simd::Min(row + blockSize, height); row < endRow; ++row) + { + size_t col = 0; + for (; col < fullAlignedWidth; col += QA) + ValueSquareSum4(src + col, valueSums, squareSums); + for (; col < alignedWidth; col += A) + ValueSquareSum<0>(Load(src + col), valueSums, squareSums); + if (col < width) + ValueSquareSum<0>(Load(src + col, tailMask), valueSums, squareSums); + src += stride; + } + fullSquareSum = _mm512_add_epi64(fullSquareSum, HorizontalSum32( + _mm512_add_epi32(_mm512_add_epi32(squareSums[0], squareSums[1]), _mm512_add_epi32(squareSums[2], squareSums[3])))); + } + *valueSum = ExtractSum(_mm512_add_epi64(_mm512_add_epi64(valueSums[0], valueSums[1]), _mm512_add_epi64(valueSums[2], valueSums[3]))); + *squareSum = ExtractSum(fullSquareSum); + } + + void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum) + { + if (Aligned(src) && Aligned(stride)) + ValueSquareSum(src, stride, width, height, valueSum, squareSum); + else + ValueSquareSum(src, stride, width, height, valueSum, squareSum); + } + SIMD_INLINE __m512i CorrelationSum(__m512i a, __m512i b) { const __m512i lo = _mm512_madd_epi16(_mm512_unpacklo_epi8(a, _mm512_setzero_si512()), _mm512_unpacklo_epi8(b, _mm512_setzero_si512())); diff --git a/src/3rd/Simd/SimdAvx512f.h b/src/3rd/Simd/SimdAvx512f.h index 96017961..ccc05797 100644 --- a/src/3rd/Simd/SimdAvx512f.h +++ b/src/3rd/Simd/SimdAvx512f.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -31,6 +31,8 @@ namespace Simd #ifdef SIMD_AVX512F_ENABLE namespace Avx512f { + void Gemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc); + void NeuralProductSum(const float * a, const float * b, size_t size, float * sum); void NeuralAddVectorMultipliedByValue(const float * src, size_t size, const float * value, float * dst); @@ -102,6 +104,14 @@ namespace Simd void SquaredDifferenceKahanSum32f(const float * a, const float * b, size_t size, float * sum); void SvmSumLinear(const float * x, const float * svs, const float * weights, size_t length, size_t count, float * sum); + + void SynetAddBias(const float * bias, size_t count, size_t size, float * dst); + + void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst); + + void SynetLrnLayerCrossChannels(const float * src, size_t half, size_t count, size_t size, const float * k, float * dst); + + void SynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst); } #endif// SIMD_AVX512F_ENABLE } diff --git a/src/3rd/Simd/SimdAvx512fGemm32f.cpp b/src/3rd/Simd/SimdAvx512fGemm32f.cpp new file mode 100644 index 00000000..49afdc12 --- /dev/null +++ b/src/3rd/Simd/SimdAvx512fGemm32f.cpp @@ -0,0 +1,1055 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdStore.h" +#include "Simd/SimdGemm.h" + +namespace Simd +{ +#ifdef SIMD_AVX512F_ENABLE + namespace Avx512f + { + SIMD_INLINE void AddProduct(float * ptr, __m512 value, __m512 alpha) + { + _mm512_storeu_ps(ptr, _mm512_fmadd_ps(value, alpha, _mm512_loadu_ps(ptr))); + } + + SIMD_INLINE void AddProduct(float * ptr, __m512 value, __m512 alpha, __mmask16 mask) + { + _mm512_mask_storeu_ps(ptr, mask, _mm512_fmadd_ps(value, alpha, _mm512_maskz_loadu_ps(mask, ptr))); + } + + static void Kernel4x48(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, __mmask16 mask) + { + __m512 c00 = _mm512_setzero_ps(); + __m512 c10 = _mm512_setzero_ps(); + __m512 c20 = _mm512_setzero_ps(); + __m512 c30 = _mm512_setzero_ps(); + __m512 c01 = _mm512_setzero_ps(); + __m512 c11 = _mm512_setzero_ps(); + __m512 c21 = _mm512_setzero_ps(); + __m512 c31 = _mm512_setzero_ps(); + __m512 c02 = _mm512_setzero_ps(); + __m512 c12 = _mm512_setzero_ps(); + __m512 c22 = _mm512_setzero_ps(); + __m512 c32 = _mm512_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + __m512 b0, b1, b2, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm512_loadu_ps(B + 0 * F); + b1 = _mm512_loadu_ps(B + 1 * F); + b2 = _mm512_loadu_ps(B + 2 * F); + a0 = _mm512_set1_ps(*A0++); + c00 = _mm512_fmadd_ps(a0, b0, c00); + c01 = _mm512_fmadd_ps(a0, b1, c01); + c02 = _mm512_fmadd_ps(a0, b2, c02); + a0 = _mm512_set1_ps(*A1++); + c10 = _mm512_fmadd_ps(a0, b0, c10); + c11 = _mm512_fmadd_ps(a0, b1, c11); + c12 = _mm512_fmadd_ps(a0, b2, c12); + a0 = _mm512_set1_ps(*A2++); + c20 = _mm512_fmadd_ps(a0, b0, c20); + c21 = _mm512_fmadd_ps(a0, b1, c21); + c22 = _mm512_fmadd_ps(a0, b2, c22); + a0 = _mm512_set1_ps(*A3++); + c30 = _mm512_fmadd_ps(a0, b0, c30); + c31 = _mm512_fmadd_ps(a0, b1, c31); + c32 = _mm512_fmadd_ps(a0, b2, c32); + B += ldb; + } + __m512 _alpha = 
_mm512_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00); + AddProduct(C + 1 * F, _alpha, c01); + AddProduct(C + 2 * F, _alpha, c02, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10); + AddProduct(C + 1 * F, _alpha, c11); + AddProduct(C + 2 * F, _alpha, c12, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20); + AddProduct(C + 1 * F, _alpha, c21); + AddProduct(C + 2 * F, _alpha, c22, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30); + AddProduct(C + 1 * F, _alpha, c31); + AddProduct(C + 2 * F, _alpha, c32, mask); + } + + static void Kernel4x32(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, __mmask16 mask) + { + __m512 c00 = _mm512_setzero_ps(); + __m512 c10 = _mm512_setzero_ps(); + __m512 c20 = _mm512_setzero_ps(); + __m512 c30 = _mm512_setzero_ps(); + __m512 c01 = _mm512_setzero_ps(); + __m512 c11 = _mm512_setzero_ps(); + __m512 c21 = _mm512_setzero_ps(); + __m512 c31 = _mm512_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + __m512 b0, b1, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm512_loadu_ps(B + 0 * F); + b1 = _mm512_loadu_ps(B + 1 * F); + a0 = _mm512_set1_ps(*A0++); + c00 = _mm512_fmadd_ps(a0, b0, c00); + c01 = _mm512_fmadd_ps(a0, b1, c01); + a0 = _mm512_set1_ps(*A1++); + c10 = _mm512_fmadd_ps(a0, b0, c10); + c11 = _mm512_fmadd_ps(a0, b1, c11); + a0 = _mm512_set1_ps(*A2++); + c20 = _mm512_fmadd_ps(a0, b0, c20); + c21 = _mm512_fmadd_ps(a0, b1, c21); + a0 = _mm512_set1_ps(*A3++); + c30 = _mm512_fmadd_ps(a0, b0, c30); + c31 = _mm512_fmadd_ps(a0, b1, c31); + B += ldb; + } + __m512 _alpha = _mm512_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00); + AddProduct(C + 1 * F, _alpha, c01, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10); + AddProduct(C + 1 * F, _alpha, c11, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20); + AddProduct(C + 1 * F, _alpha, c21, mask); + 
C += ldc; + AddProduct(C + 0 * F, _alpha, c30); + AddProduct(C + 1 * F, _alpha, c31, mask); + } + + static void Kernel4x16(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, __mmask16 mask) + { + __m512 c0 = _mm512_setzero_ps(); + __m512 c1 = _mm512_setzero_ps(); + __m512 c2 = _mm512_setzero_ps(); + __m512 c3 = _mm512_setzero_ps(); + const float * a0 = A + lda * 0; + const float * a1 = A + lda * 1; + const float * a2 = A + lda * 2; + const float * a3 = A + lda * 3; + __m512 b0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm512_loadu_ps(B); + c0 = _mm512_fmadd_ps(b0, _mm512_set1_ps(*a0++), c0); + c1 = _mm512_fmadd_ps(b0, _mm512_set1_ps(*a1++), c1); + c2 = _mm512_fmadd_ps(b0, _mm512_set1_ps(*a2++), c2); + c3 = _mm512_fmadd_ps(b0, _mm512_set1_ps(*a3++), c3); + B += ldb; + } + __m512 _alpha = _mm512_set1_ps(alpha); + AddProduct(C + 0 * ldc, _alpha, c0, mask); + AddProduct(C + 1 * ldc, _alpha, c1, mask); + AddProduct(C + 2 * ldc, _alpha, c2, mask); + AddProduct(C + 3 * ldc, _alpha, c3, mask); + } + + static void Kernel6x32(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, __mmask16 mask) + { + __m512 c00 = _mm512_setzero_ps(); + __m512 c10 = _mm512_setzero_ps(); + __m512 c20 = _mm512_setzero_ps(); + __m512 c30 = _mm512_setzero_ps(); + __m512 c40 = _mm512_setzero_ps(); + __m512 c50 = _mm512_setzero_ps(); + __m512 c01 = _mm512_setzero_ps(); + __m512 c11 = _mm512_setzero_ps(); + __m512 c21 = _mm512_setzero_ps(); + __m512 c31 = _mm512_setzero_ps(); + __m512 c41 = _mm512_setzero_ps(); + __m512 c51 = _mm512_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + const float * A4 = A + lda * 4; + const float * A5 = A + lda * 5; + __m512 b0, b1, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm512_loadu_ps(B + 0 * F); + b1 = _mm512_loadu_ps(B + 1 * F); + a0 = 
_mm512_set1_ps(*A0++); + c00 = _mm512_fmadd_ps(a0, b0, c00); + c01 = _mm512_fmadd_ps(a0, b1, c01); + a0 = _mm512_set1_ps(*A1++); + c10 = _mm512_fmadd_ps(a0, b0, c10); + c11 = _mm512_fmadd_ps(a0, b1, c11); + a0 = _mm512_set1_ps(*A2++); + c20 = _mm512_fmadd_ps(a0, b0, c20); + c21 = _mm512_fmadd_ps(a0, b1, c21); + a0 = _mm512_set1_ps(*A3++); + c30 = _mm512_fmadd_ps(a0, b0, c30); + c31 = _mm512_fmadd_ps(a0, b1, c31); + a0 = _mm512_set1_ps(*A4++); + c40 = _mm512_fmadd_ps(a0, b0, c40); + c41 = _mm512_fmadd_ps(a0, b1, c41); + a0 = _mm512_set1_ps(*A5++); + c50 = _mm512_fmadd_ps(a0, b0, c50); + c51 = _mm512_fmadd_ps(a0, b1, c51); + B += ldb; + } + __m512 _alpha = _mm512_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00); + AddProduct(C + 1 * F, _alpha, c01, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10); + AddProduct(C + 1 * F, _alpha, c11, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20); + AddProduct(C + 1 * F, _alpha, c21, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30); + AddProduct(C + 1 * F, _alpha, c31, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c40); + AddProduct(C + 1 * F, _alpha, c41, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c50); + AddProduct(C + 1 * F, _alpha, c51, mask); + } + + static void Kernel6x16(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, __mmask16 mask) + { + __m512 c00 = _mm512_setzero_ps(); + __m512 c10 = _mm512_setzero_ps(); + __m512 c20 = _mm512_setzero_ps(); + __m512 c30 = _mm512_setzero_ps(); + __m512 c40 = _mm512_setzero_ps(); + __m512 c50 = _mm512_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + const float * A4 = A + lda * 4; + const float * A5 = A + lda * 5; + __m512 b0, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm512_loadu_ps(B + 0 * F); + a0 = _mm512_set1_ps(*A0++); + c00 = _mm512_fmadd_ps(a0, b0, c00); + a0 = _mm512_set1_ps(*A1++); 
+ c10 = _mm512_fmadd_ps(a0, b0, c10); + a0 = _mm512_set1_ps(*A2++); + c20 = _mm512_fmadd_ps(a0, b0, c20); + a0 = _mm512_set1_ps(*A3++); + c30 = _mm512_fmadd_ps(a0, b0, c30); + a0 = _mm512_set1_ps(*A4++); + c40 = _mm512_fmadd_ps(a0, b0, c40); + a0 = _mm512_set1_ps(*A5++); + c50 = _mm512_fmadd_ps(a0, b0, c50); + B += ldb; + } + __m512 _alpha = _mm512_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c40, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c50, mask); + } + + static void Kernel8x48(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, __mmask16 mask) + { + __m512 c00 = _mm512_setzero_ps(); + __m512 c01 = _mm512_setzero_ps(); + __m512 c02 = _mm512_setzero_ps(); + __m512 c10 = _mm512_setzero_ps(); + __m512 c11 = _mm512_setzero_ps(); + __m512 c12 = _mm512_setzero_ps(); + __m512 c20 = _mm512_setzero_ps(); + __m512 c21 = _mm512_setzero_ps(); + __m512 c22 = _mm512_setzero_ps(); + __m512 c30 = _mm512_setzero_ps(); + __m512 c31 = _mm512_setzero_ps(); + __m512 c32 = _mm512_setzero_ps(); + __m512 c40 = _mm512_setzero_ps(); + __m512 c41 = _mm512_setzero_ps(); + __m512 c42 = _mm512_setzero_ps(); + __m512 c50 = _mm512_setzero_ps(); + __m512 c51 = _mm512_setzero_ps(); + __m512 c52 = _mm512_setzero_ps(); + __m512 c60 = _mm512_setzero_ps(); + __m512 c61 = _mm512_setzero_ps(); + __m512 c62 = _mm512_setzero_ps(); + __m512 c70 = _mm512_setzero_ps(); + __m512 c71 = _mm512_setzero_ps(); + __m512 c72 = _mm512_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + const float * A4 = A + lda * 4; + const float * A5 = A + lda * 5; + const float * A6 = A + lda * 6; + const float * A7 = A + lda * 7; + __m512 b0, b1, 
b2, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm512_loadu_ps(B + 0 * F); + b1 = _mm512_loadu_ps(B + 1 * F); + b2 = _mm512_loadu_ps(B + 2 * F); + a0 = _mm512_set1_ps(*A0++); + c00 = _mm512_fmadd_ps(a0, b0, c00); + c01 = _mm512_fmadd_ps(a0, b1, c01); + c02 = _mm512_fmadd_ps(a0, b2, c02); + a0 = _mm512_set1_ps(*A1++); + c10 = _mm512_fmadd_ps(a0, b0, c10); + c11 = _mm512_fmadd_ps(a0, b1, c11); + c12 = _mm512_fmadd_ps(a0, b2, c12); + a0 = _mm512_set1_ps(*A2++); + c20 = _mm512_fmadd_ps(a0, b0, c20); + c21 = _mm512_fmadd_ps(a0, b1, c21); + c22 = _mm512_fmadd_ps(a0, b2, c22); + a0 = _mm512_set1_ps(*A3++); + c30 = _mm512_fmadd_ps(a0, b0, c30); + c31 = _mm512_fmadd_ps(a0, b1, c31); + c32 = _mm512_fmadd_ps(a0, b2, c32); + a0 = _mm512_set1_ps(*A4++); + c40 = _mm512_fmadd_ps(a0, b0, c40); + c41 = _mm512_fmadd_ps(a0, b1, c41); + c42 = _mm512_fmadd_ps(a0, b2, c42); + a0 = _mm512_set1_ps(*A5++); + c50 = _mm512_fmadd_ps(a0, b0, c50); + c51 = _mm512_fmadd_ps(a0, b1, c51); + c52 = _mm512_fmadd_ps(a0, b2, c52); + a0 = _mm512_set1_ps(*A6++); + c60 = _mm512_fmadd_ps(a0, b0, c60); + c61 = _mm512_fmadd_ps(a0, b1, c61); + c62 = _mm512_fmadd_ps(a0, b2, c62); + a0 = _mm512_set1_ps(*A7++); + c70 = _mm512_fmadd_ps(a0, b0, c70); + c71 = _mm512_fmadd_ps(a0, b1, c71); + c72 = _mm512_fmadd_ps(a0, b2, c72); + B += ldb; + } + __m512 _alpha = _mm512_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00); + AddProduct(C + 1 * F, _alpha, c01); + AddProduct(C + 2 * F, _alpha, c02, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10); + AddProduct(C + 1 * F, _alpha, c11); + AddProduct(C + 2 * F, _alpha, c12, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20); + AddProduct(C + 1 * F, _alpha, c21); + AddProduct(C + 2 * F, _alpha, c22, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30); + AddProduct(C + 1 * F, _alpha, c31); + AddProduct(C + 2 * F, _alpha, c32, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c40); + AddProduct(C + 1 * F, _alpha, c41); + AddProduct(C + 2 * F, _alpha, c42, 
mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c50); + AddProduct(C + 1 * F, _alpha, c51); + AddProduct(C + 2 * F, _alpha, c52, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c60); + AddProduct(C + 1 * F, _alpha, c61); + AddProduct(C + 2 * F, _alpha, c62, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c70); + AddProduct(C + 1 * F, _alpha, c71); + AddProduct(C + 2 * F, _alpha, c72, mask); + } + + static void Kernel8x32(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, __mmask16 mask) + { + __m512 c00 = _mm512_setzero_ps(); + __m512 c01 = _mm512_setzero_ps(); + __m512 c10 = _mm512_setzero_ps(); + __m512 c11 = _mm512_setzero_ps(); + __m512 c20 = _mm512_setzero_ps(); + __m512 c21 = _mm512_setzero_ps(); + __m512 c30 = _mm512_setzero_ps(); + __m512 c31 = _mm512_setzero_ps(); + __m512 c40 = _mm512_setzero_ps(); + __m512 c41 = _mm512_setzero_ps(); + __m512 c50 = _mm512_setzero_ps(); + __m512 c51 = _mm512_setzero_ps(); + __m512 c60 = _mm512_setzero_ps(); + __m512 c61 = _mm512_setzero_ps(); + __m512 c70 = _mm512_setzero_ps(); + __m512 c71 = _mm512_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + const float * A4 = A + lda * 4; + const float * A5 = A + lda * 5; + const float * A6 = A + lda * 6; + const float * A7 = A + lda * 7; + __m512 b0, b1, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm512_loadu_ps(B + 0 * F); + b1 = _mm512_loadu_ps(B + 1 * F); + a0 = _mm512_set1_ps(*A0++); + c00 = _mm512_fmadd_ps(a0, b0, c00); + c01 = _mm512_fmadd_ps(a0, b1, c01); + a0 = _mm512_set1_ps(*A1++); + c10 = _mm512_fmadd_ps(a0, b0, c10); + c11 = _mm512_fmadd_ps(a0, b1, c11); + a0 = _mm512_set1_ps(*A2++); + c20 = _mm512_fmadd_ps(a0, b0, c20); + c21 = _mm512_fmadd_ps(a0, b1, c21); + a0 = _mm512_set1_ps(*A3++); + c30 = _mm512_fmadd_ps(a0, b0, c30); + c31 = _mm512_fmadd_ps(a0, b1, c31); + a0 = _mm512_set1_ps(*A4++); + c40 = 
_mm512_fmadd_ps(a0, b0, c40); + c41 = _mm512_fmadd_ps(a0, b1, c41); + a0 = _mm512_set1_ps(*A5++); + c50 = _mm512_fmadd_ps(a0, b0, c50); + c51 = _mm512_fmadd_ps(a0, b1, c51); + a0 = _mm512_set1_ps(*A6++); + c60 = _mm512_fmadd_ps(a0, b0, c60); + c61 = _mm512_fmadd_ps(a0, b1, c61); + a0 = _mm512_set1_ps(*A7++); + c70 = _mm512_fmadd_ps(a0, b0, c70); + c71 = _mm512_fmadd_ps(a0, b1, c71); + B += ldb; + } + __m512 _alpha = _mm512_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00); + AddProduct(C + 1 * F, _alpha, c01, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10); + AddProduct(C + 1 * F, _alpha, c11, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20); + AddProduct(C + 1 * F, _alpha, c21, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30); + AddProduct(C + 1 * F, _alpha, c31, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c40); + AddProduct(C + 1 * F, _alpha, c41, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c50); + AddProduct(C + 1 * F, _alpha, c51, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c60); + AddProduct(C + 1 * F, _alpha, c61, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c70); + AddProduct(C + 1 * F, _alpha, c71, mask); + } + + static void Kernel8x16(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, __mmask16 mask) + { + __m512 c00 = _mm512_setzero_ps(); + __m512 c10 = _mm512_setzero_ps(); + __m512 c20 = _mm512_setzero_ps(); + __m512 c30 = _mm512_setzero_ps(); + __m512 c40 = _mm512_setzero_ps(); + __m512 c50 = _mm512_setzero_ps(); + __m512 c60 = _mm512_setzero_ps(); + __m512 c70 = _mm512_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + const float * A4 = A + lda * 4; + const float * A5 = A + lda * 5; + const float * A6 = A + lda * 6; + const float * A7 = A + lda * 7; + __m512 b0, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm512_loadu_ps(B + 0 * F); + a0 = 
_mm512_set1_ps(*A0++); + c00 = _mm512_fmadd_ps(a0, b0, c00); + a0 = _mm512_set1_ps(*A1++); + c10 = _mm512_fmadd_ps(a0, b0, c10); + a0 = _mm512_set1_ps(*A2++); + c20 = _mm512_fmadd_ps(a0, b0, c20); + a0 = _mm512_set1_ps(*A3++); + c30 = _mm512_fmadd_ps(a0, b0, c30); + a0 = _mm512_set1_ps(*A4++); + c40 = _mm512_fmadd_ps(a0, b0, c40); + a0 = _mm512_set1_ps(*A5++); + c50 = _mm512_fmadd_ps(a0, b0, c50); + a0 = _mm512_set1_ps(*A6++); + c60 = _mm512_fmadd_ps(a0, b0, c60); + a0 = _mm512_set1_ps(*A7++); + c70 = _mm512_fmadd_ps(a0, b0, c70); + B += ldb; + } + __m512 _alpha = _mm512_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c40, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c50, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c60, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c70, mask); + } + + static void Kernel12x32(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, __mmask16 mask) + { + __m512 c00 = _mm512_setzero_ps(); + __m512 c10 = _mm512_setzero_ps(); + __m512 c20 = _mm512_setzero_ps(); + __m512 c30 = _mm512_setzero_ps(); + __m512 c40 = _mm512_setzero_ps(); + __m512 c50 = _mm512_setzero_ps(); + __m512 c01 = _mm512_setzero_ps(); + __m512 c11 = _mm512_setzero_ps(); + __m512 c21 = _mm512_setzero_ps(); + __m512 c31 = _mm512_setzero_ps(); + __m512 c41 = _mm512_setzero_ps(); + __m512 c51 = _mm512_setzero_ps(); + __m512 c60 = _mm512_setzero_ps(); + __m512 c70 = _mm512_setzero_ps(); + __m512 c80 = _mm512_setzero_ps(); + __m512 c90 = _mm512_setzero_ps(); + __m512 cA0 = _mm512_setzero_ps(); + __m512 cB0 = _mm512_setzero_ps(); + __m512 c61 = _mm512_setzero_ps(); + __m512 c71 = _mm512_setzero_ps(); + __m512 c81 = _mm512_setzero_ps(); + __m512 c91 = _mm512_setzero_ps(); + __m512 cA1 = 
_mm512_setzero_ps(); + __m512 cB1 = _mm512_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + const float * A4 = A + lda * 4; + const float * A5 = A + lda * 5; + const float * A6 = A + lda * 6; + const float * A7 = A + lda * 7; + const float * A8 = A + lda * 8; + const float * A9 = A + lda * 9; + const float * AA = A + lda * 10; + const float * AB = A + lda * 11; + __m512 b0, b1, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm512_loadu_ps(B + 0 * F); + b1 = _mm512_loadu_ps(B + 1 * F); + a0 = _mm512_set1_ps(*A0++); + c00 = _mm512_fmadd_ps(a0, b0, c00); + c01 = _mm512_fmadd_ps(a0, b1, c01); + a0 = _mm512_set1_ps(*A1++); + c10 = _mm512_fmadd_ps(a0, b0, c10); + c11 = _mm512_fmadd_ps(a0, b1, c11); + a0 = _mm512_set1_ps(*A2++); + c20 = _mm512_fmadd_ps(a0, b0, c20); + c21 = _mm512_fmadd_ps(a0, b1, c21); + a0 = _mm512_set1_ps(*A3++); + c30 = _mm512_fmadd_ps(a0, b0, c30); + c31 = _mm512_fmadd_ps(a0, b1, c31); + a0 = _mm512_set1_ps(*A4++); + c40 = _mm512_fmadd_ps(a0, b0, c40); + c41 = _mm512_fmadd_ps(a0, b1, c41); + a0 = _mm512_set1_ps(*A5++); + c50 = _mm512_fmadd_ps(a0, b0, c50); + c51 = _mm512_fmadd_ps(a0, b1, c51); + a0 = _mm512_set1_ps(*A6++); + c60 = _mm512_fmadd_ps(a0, b0, c60); + c61 = _mm512_fmadd_ps(a0, b1, c61); + a0 = _mm512_set1_ps(*A7++); + c70 = _mm512_fmadd_ps(a0, b0, c70); + c71 = _mm512_fmadd_ps(a0, b1, c71); + a0 = _mm512_set1_ps(*A8++); + c80 = _mm512_fmadd_ps(a0, b0, c80); + c81 = _mm512_fmadd_ps(a0, b1, c81); + a0 = _mm512_set1_ps(*A9++); + c90 = _mm512_fmadd_ps(a0, b0, c90); + c91 = _mm512_fmadd_ps(a0, b1, c91); + a0 = _mm512_set1_ps(*AA++); + cA0 = _mm512_fmadd_ps(a0, b0, cA0); + cA1 = _mm512_fmadd_ps(a0, b1, cA1); + a0 = _mm512_set1_ps(*AB++); + cB0 = _mm512_fmadd_ps(a0, b0, cB0); + cB1 = _mm512_fmadd_ps(a0, b1, cB1); + B += ldb; + } + __m512 _alpha = _mm512_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00); + AddProduct(C + 1 * F, _alpha, c01, 
mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10); + AddProduct(C + 1 * F, _alpha, c11, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20); + AddProduct(C + 1 * F, _alpha, c21, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30); + AddProduct(C + 1 * F, _alpha, c31, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c40); + AddProduct(C + 1 * F, _alpha, c41, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c50); + AddProduct(C + 1 * F, _alpha, c51, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c60); + AddProduct(C + 1 * F, _alpha, c61, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c70); + AddProduct(C + 1 * F, _alpha, c71, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c80); + AddProduct(C + 1 * F, _alpha, c81, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c90); + AddProduct(C + 1 * F, _alpha, c91, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, cA0); + AddProduct(C + 1 * F, _alpha, cA1, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, cB0); + AddProduct(C + 1 * F, _alpha, cB1, mask); + } + + static void Kernel12x16(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, __mmask16 mask) + { + __m512 c00 = _mm512_setzero_ps(); + __m512 c10 = _mm512_setzero_ps(); + __m512 c20 = _mm512_setzero_ps(); + __m512 c30 = _mm512_setzero_ps(); + __m512 c40 = _mm512_setzero_ps(); + __m512 c50 = _mm512_setzero_ps(); + __m512 c60 = _mm512_setzero_ps(); + __m512 c70 = _mm512_setzero_ps(); + __m512 c80 = _mm512_setzero_ps(); + __m512 c90 = _mm512_setzero_ps(); + __m512 cA0 = _mm512_setzero_ps(); + __m512 cB0 = _mm512_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + const float * A4 = A + lda * 4; + const float * A5 = A + lda * 5; + const float * A6 = A + lda * 6; + const float * A7 = A + lda * 7; + const float * A8 = A + lda * 8; + const float * A9 = A + lda * 9; + const float * AA = A + lda * 
10; + const float * AB = A + lda * 11; + __m512 b0, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm512_loadu_ps(B + 0 * F); + a0 = _mm512_set1_ps(*A0++); + c00 = _mm512_fmadd_ps(a0, b0, c00); + a0 = _mm512_set1_ps(*A1++); + c10 = _mm512_fmadd_ps(a0, b0, c10); + a0 = _mm512_set1_ps(*A2++); + c20 = _mm512_fmadd_ps(a0, b0, c20); + a0 = _mm512_set1_ps(*A3++); + c30 = _mm512_fmadd_ps(a0, b0, c30); + a0 = _mm512_set1_ps(*A4++); + c40 = _mm512_fmadd_ps(a0, b0, c40); + a0 = _mm512_set1_ps(*A5++); + c50 = _mm512_fmadd_ps(a0, b0, c50); + a0 = _mm512_set1_ps(*A6++); + c60 = _mm512_fmadd_ps(a0, b0, c60); + a0 = _mm512_set1_ps(*A7++); + c70 = _mm512_fmadd_ps(a0, b0, c70); + a0 = _mm512_set1_ps(*A8++); + c80 = _mm512_fmadd_ps(a0, b0, c80); + a0 = _mm512_set1_ps(*A9++); + c90 = _mm512_fmadd_ps(a0, b0, c90); + a0 = _mm512_set1_ps(*AA++); + cA0 = _mm512_fmadd_ps(a0, b0, cA0); + a0 = _mm512_set1_ps(*AB++); + cB0 = _mm512_fmadd_ps(a0, b0, cB0); + B += ldb; + } + __m512 _alpha = _mm512_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c40, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c50, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c60, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c70, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c80, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c90, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, cA0, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, cB0, mask); + } + + static void KernelMx48(size_t M, size_t N, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, __mmask16 mask) + { +#if SIMD_ZMM_COUNT == 32 + __m512 c[8][3]; + const float * a[8]; +#else + __m512 c[4][3]; + const float * a[4]; +#endif + for (size_t i = 0; i < M; ++i) + { + 
c[i][0] = _mm512_setzero_ps(); + c[i][1] = _mm512_setzero_ps(); + c[i][2] = _mm512_setzero_ps(); + a[i] = A + lda * i; + } + __m512 b0, b1, b2, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm512_loadu_ps(B + 0 * F); + b1 = _mm512_loadu_ps(B + 1 * F); + b2 = _mm512_loadu_ps(B + 2 * F); + for (size_t i = 0; i < M; ++i) + { + a0 = _mm512_set1_ps(*a[i]++); + c[i][0] = _mm512_add_ps(_mm512_mul_ps(b0, a0), c[i][0]); + c[i][1] = _mm512_add_ps(_mm512_mul_ps(b1, a0), c[i][1]); + c[i][2] = _mm512_add_ps(_mm512_mul_ps(b2, a0), c[i][2]); + } + B += ldb; + } + __m512 _alpha = _mm512_set1_ps(alpha); + for (size_t i = 0; i < M; ++i) + { + AddProduct(C + 0 * F, _alpha, c[i][0]); + AddProduct(C + 1 * F, _alpha, c[i][1]); + AddProduct(C + 2 * F, _alpha, c[i][2], mask); + C += ldc; + } + } + + static void KernelMx32(size_t M, size_t N, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, __mmask16 mask) + { +#if SIMD_ZMM_COUNT == 32 + __m512 c[12][2]; + const float * a[12]; +#else + __m512 c[6][2]; + const float * a[6]; +#endif + for (size_t i = 0; i < M; ++i) + { + c[i][0] = _mm512_setzero_ps(); + c[i][1] = _mm512_setzero_ps(); + a[i] = A + lda * i; + } + __m512 b0, b1, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm512_loadu_ps(B + 0 * F); + b1 = _mm512_loadu_ps(B + 1 * F); + for (size_t i = 0; i < M; ++i) + { + a0 = _mm512_set1_ps(*a[i]++); + c[i][0] = _mm512_fmadd_ps(b0, a0, c[i][0]); + c[i][1] = _mm512_fmadd_ps(b1, a0, c[i][1]); + } + B += ldb; + } + __m512 _alpha = _mm512_set1_ps(alpha); + for (size_t i = 0; i < M; ++i) + { + AddProduct(C + 0 * F, _alpha, c[i][0]); + AddProduct(C + 1 * F, _alpha, c[i][1], mask); + C += ldc; + } + } + + static void KernelMx16(size_t M, size_t N, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, __mmask16 mask) + { +#if SIMD_ZMM_COUNT == 32 + __m512 c[12]; + const float * a[12]; +#elif SIMD_ZMM_COUNT == 16 + __m512 c[6]; + const float 
* a[6]; +#else + __m512 c[4]; + const float * a[4]; +#endif + for (size_t i = 0; i < M; ++i) + { + c[i] = _mm512_setzero_ps(); + a[i] = A + lda * i; + } + __m512 b0, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm512_loadu_ps(B + 0 * F); + for (size_t i = 0; i < M; ++i) + { + a0 = _mm512_set1_ps(*a[i]++); + c[i] = _mm512_fmadd_ps(b0, a0, c[i]); + } + B += ldb; + } + __m512 _alpha = _mm512_set1_ps(alpha); + for (size_t i = 0; i < M; ++i) + AddProduct(C + i * ldc, _alpha, c[i], mask); + } + + SIMD_INLINE void ScaleC(float * ptr, __m512 value, __mmask16 mask = -1) + { + _mm512_mask_storeu_ps(ptr, mask, _mm512_mul_ps(_mm512_maskz_loadu_ps(mask, ptr), value)); + } + + static void ScaleC(size_t M, size_t N, float value, float * C, size_t ldc) + { + size_t NQF = AlignLo(N, QF); + size_t NF = AlignLo(N, F); + __m512 _value = _mm512_set1_ps(value); + __mmask16 tail = TailMask16(N - NF); + for (size_t i = 0; i < M; ++i) + { + size_t j = 0; + for (; j < NQF; j += QF) + { + ScaleC(C + j + F * 0, _value); + ScaleC(C + j + F * 1, _value); + ScaleC(C + j + F * 2, _value); + ScaleC(C + j + F * 3, _value); + } + for (; j < NF; j += F) + ScaleC(C + j, _value); + if(j < N) + ScaleC(C + j, _value, tail); + C += ldc; + } + } + + static void PackA(const float * src, size_t stride, size_t M, size_t K, size_t cell, float * dst) + { + size_t K4 = AlignLo(K, 4), K8 = AlignLo(K, 8); + for (size_t i = 0; i < M; i += cell) + { + size_t m = Simd::Min(cell, M - i), k = 0; + if (cell == 4 && m == 4) + { + for (; k < K8; k += 8) + { + const float * ps = src + k; + __m256 s0 = _mm256_loadu_ps(ps + 0 * K); + __m256 s1 = _mm256_loadu_ps(ps + 1 * K); + __m256 s2 = _mm256_loadu_ps(ps + 2 * K); + __m256 s3 = _mm256_loadu_ps(ps + 3 * K); + __m256 s00 = _mm256_unpacklo_ps(s0, s2); + __m256 s01 = _mm256_unpacklo_ps(s1, s3); + __m256 s10 = _mm256_unpackhi_ps(s0, s2); + __m256 s11 = _mm256_unpackhi_ps(s1, s3); + __m256 d0 = _mm256_unpacklo_ps(s00, s01); + __m256 d1 = _mm256_unpackhi_ps(s00, s01); + 
__m256 d2 = _mm256_unpacklo_ps(s10, s11); + __m256 d3 = _mm256_unpackhi_ps(s10, s11); + _mm256_storeu_ps(dst + 0, _mm256_permute2f128_ps(d0, d1, 0x20)); + _mm256_storeu_ps(dst + 8, _mm256_permute2f128_ps(d2, d3, 0x20)); + _mm256_storeu_ps(dst + 16, _mm256_permute2f128_ps(d0, d1, 0x31)); + _mm256_storeu_ps(dst + 24, _mm256_permute2f128_ps(d2, d3, 0x31)); + dst += 32; + }; + for (; k < K4; k += 4) + { + const float * ps = src + k; + __m128 s0 = _mm_loadu_ps(ps + 0 * stride); + __m128 s1 = _mm_loadu_ps(ps + 1 * stride); + __m128 s2 = _mm_loadu_ps(ps + 2 * stride); + __m128 s3 = _mm_loadu_ps(ps + 3 * stride); + __m128 s00 = _mm_unpacklo_ps(s0, s2); + __m128 s01 = _mm_unpacklo_ps(s1, s3); + __m128 s10 = _mm_unpackhi_ps(s0, s2); + __m128 s11 = _mm_unpackhi_ps(s1, s3); + _mm_storeu_ps(dst + 0, _mm_unpacklo_ps(s00, s01)); + _mm_storeu_ps(dst + 4, _mm_unpackhi_ps(s00, s01)); + _mm_storeu_ps(dst + 8, _mm_unpacklo_ps(s10, s11)); + _mm_storeu_ps(dst + 12, _mm_unpackhi_ps(s10, s11)); + dst += 16; + } + } + for (; k < K; ++k) + { + for (size_t c = 0; c < m; ++c) + *(dst++) = src[c*stride + k]; + } + src += cell * stride; + } + } + + static void PackB(const float * B, size_t ldb, size_t K, size_t N, size_t microN, float * pB) + { + for (size_t j = 0; j < N; j += microN) + { + size_t n = Simd::Min(microN, N - j); + if (microN == 1 * F) + { + __mmask16 mask0 = TailMask16(n - 0 * F); + for (size_t k = 0; k < K; ++k) + { + const float * b = B + k * ldb; + _mm512_storeu_ps(pB + 0 * F, _mm512_maskz_loadu_ps(mask0, b + 0 * F)); + pB += microN; + } + } + else if (microN == 2 * F) + { + __mmask16 mask0 = TailMask16(n - 0 * F); + __mmask16 mask1 = TailMask16(n - 1 * F); + for (size_t k = 0; k < K; ++k) + { + const float * b = B + k * ldb; + _mm512_storeu_ps(pB + 0 * F, _mm512_maskz_loadu_ps(mask0, b + 0 * F)); + _mm512_storeu_ps(pB + 1 * F, _mm512_maskz_loadu_ps(mask1, b + 1 * F)); + pB += microN; + } + } + else if (microN == 3 * F) + { + __mmask16 mask0 = TailMask16(n - 0 * F); + 
__mmask16 mask1 = TailMask16(n - 1 * F); + __mmask16 mask2 = TailMask16(n - 2 * F); + for (size_t k = 0; k < K; ++k) + { + const float * b = B + k * ldb; + _mm512_storeu_ps(pB + 0 * F, _mm512_maskz_loadu_ps(mask0, b + 0 * F)); + _mm512_storeu_ps(pB + 1 * F, _mm512_maskz_loadu_ps(mask1, b + 1 * F)); + _mm512_storeu_ps(pB + 2 * F, _mm512_maskz_loadu_ps(mask2, b + 2 * F)); + pB += microN; + } + } + else + { + for (size_t k = 0; k < K; ++k) + { + const float * b = B + k * ldb; + size_t c = 0; + for (; c < n; ++c) + *(pB++) = *(b++); + for (; c < microN; ++c) + *(pB++) = 0; + } + } + B += microN; + } + } + + void Gemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc) + { + const size_t CACHE_L1_SIZE = 32 * 1024; + const size_t CACHE_L2_SIZE = 256 * 1024; + const size_t CACHE_L3_SIZE = 2 * 1024 * 1024; + typedef Simd::GemmNN GemmNN; + GemmNN::Main kernelMM, kernelMT; + GemmNN::Tail kernelTM, kernelTT; + size_t microM, microN; +#if SIMD_ZMM_COUNT == 32 + if (K > 4024 && false) + { + microM = 12; + microN = 32; + size_t tail = N - AlignLoAny(N, microN); + kernelMM = Kernel12x32; + kernelMT = tail > F ? Kernel12x32 : Kernel12x16; + kernelTM = KernelMx32; + kernelTT = tail > F ? KernelMx32 : KernelMx16; + } + else + { + microM = 8; + microN = 48; + size_t tail = N - AlignLoAny(N, microN); + kernelMM = Kernel8x48; + kernelMT = tail > DF ? Kernel8x48 : (tail > F ? Kernel8x32 : Kernel8x16); + kernelTM = KernelMx48; + kernelTT = tail > DF ? KernelMx48 : (tail > F ? KernelMx32 : KernelMx16); + } +#elif SIMD_ZMM_COUNT == 16 + if (K > 4024) + { + microM = 6; + microN = 32; + size_t tail = N - AlignLoAny(N, microN); + kernelMM = Kernel6x32; + kernelMT = tail > F ? Kernel6x32 : Kernel6x16; + kernelTM = KernelMx32; + kernelTT = tail > F ? 
KernelMx32 : KernelMx16; + } + else + { + microM = 4; + microN = 48; + size_t tail = N - AlignLoAny(N, microN); + kernelMM = Kernel4x48; + kernelMT = tail > DF ? Kernel4x48 : (tail > F ? Kernel4x32 : Kernel4x16); + kernelTM = KernelMx48; + kernelTT = tail > DF ? KernelMx48 : (tail > F ? KernelMx32 : KernelMx16); + } +#else + microM = 4; + microN = 16; + kernelMM = Kernel4x16; + kernelMT = Kernel4x16; + kernelTM = KernelMx16; + kernelTT = KernelMx16; +#endif + GemmNN gemmNN(M, N, K, microM, microN, CACHE_L2_SIZE, CACHE_L3_SIZE, CACHE_L3_SIZE, F, + kernelMM, kernelMT, kernelTM, kernelTT, Avx512f::ScaleC, Avx512f::PackB, TailMask16); + gemmNN.Run(alpha, A, lda, B, ldb, beta, C, ldc); + } + } +#endif// SIMD_AVX512F_ENABLE +} diff --git a/src/3rd/Simd/SimdAvx512fNeural.cpp b/src/3rd/Simd/SimdAvx512fNeural.cpp index 670d6a20..8bb8db3e 100644 --- a/src/3rd/Simd/SimdAvx512fNeural.cpp +++ b/src/3rd/Simd/SimdAvx512fNeural.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -27,6 +27,7 @@ #include "Simd/SimdStream.h" #include "Simd/SimdNeural.h" #include "Simd/SimdAvx2.h" +#include "Simd/SimdPow.h" namespace Simd { @@ -621,90 +622,35 @@ namespace Simd NeuralDerivativeRelu(src, size, slope, dst); } - class PowEstimator + template void NeuralPow(const float * src, size_t size, const float * exponent, float * dst) { - __m512i _exponent, _mantissa, _127; - __m512 _1_0, _0_5; - - void Init() - { - _exponent = _mm512_set1_epi32(0x7F800000); - _mantissa = _mm512_set1_epi32(0x007FFFFF); - _127 = _mm512_set1_epi32(127); - _1_0 = _mm512_set1_ps(1.0f); - _0_5 = _mm512_set1_ps(0.5f); - } - - SIMD_INLINE __m512 Poly5(__m512 x, float a, float b, float c, float d, float e, float f) - { - __m512 p = _mm512_set1_ps(f); - p = _mm512_fmadd_ps(x, p, _mm512_set1_ps(e)); - p = _mm512_fmadd_ps(x, p, _mm512_set1_ps(d)); - p = _mm512_fmadd_ps(x, p, _mm512_set1_ps(c)); - p = _mm512_fmadd_ps(x, p, _mm512_set1_ps(b)); - p = _mm512_fmadd_ps(x, p, _mm512_set1_ps(a)); - return p; - } - - SIMD_INLINE __m512 Exp2(__m512 x) - { - x = _mm512_max_ps(_mm512_min_ps(x, _mm512_set1_ps(129.00000f)), _mm512_set1_ps(-126.99999f)); - __m512i ipart = _mm512_cvtps_epi32(_mm512_sub_ps(x, _0_5)); - __m512 fpart = _mm512_sub_ps(x, _mm512_cvtepi32_ps(ipart)); - __m512 expipart = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_add_epi32(ipart, _mm512_set1_epi32(127)), 23)); - __m512 expfpart = Poly5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f); - return _mm512_mul_ps(expipart, expfpart); - } - - SIMD_INLINE __m512 Log2(__m512 x) - { - __m512i i = _mm512_castps_si512(x); - __m512 e = _mm512_cvtepi32_ps(_mm512_sub_epi32(_mm512_srli_epi32(_mm512_and_si512(i, _exponent), 23), _127)); - __m512 m = _mm512_or_ps(_mm512_castsi512_ps(_mm512_and_si512(i, _mantissa)), _1_0); - __m512 p 
= Poly5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); - return _mm512_fmadd_ps(p, _mm512_sub_ps(m, _1_0), e); - } - - SIMD_INLINE __m512 Pow(__m512 basis, __m512 exponent) - { - return Exp2(_mm512_mul_ps(Log2(basis), exponent)); - } - - template void Run(const float * src, size_t size, const float * exponent, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - float e = exponent[0]; - size_t alignedSize = AlignLo(size, F); - __m512 _e = _mm512_set1_ps(e); - size_t i = 0; - for (; i < alignedSize; i += F) - Store(dst + i, Pow(Load(src + i), _e)); - for (; i < size; ++i) - dst[i] = Base::Pow(src[i], e); - } + if (align) + assert(Aligned(src) && Aligned(dst)); - public: - void Run(const float * src, size_t size, const float * exponent, float * dst) + float e = exponent[0]; + size_t aligned = AlignLo(size, F); + __m512 _e = _mm512_set1_ps(e); + Pow pow; + size_t i = 0; + for (; i < aligned; i += F) + Avx512f::Store(dst + i, pow(Avx512f::Load(src + i), _e)); + if (i < size) { - Init(); - - if (Aligned(src) && Aligned(dst)) - Run(src, size, exponent, dst); - else - Run(src, size, exponent, dst); + __mmask16 tail = TailMask16(size - i); + Avx512f::Store(dst + i, pow(Avx512f::Load(src + i, tail), _e), tail); } - }; + } void NeuralPow(const float * src, size_t size, const float * exponent, float * dst) { #if defined(_MSC_VER) && _MSC_VER <= 1912 Avx2::NeuralPow(src, size, exponent, dst); -#else - PowEstimator estimator; - estimator.Run(src, size, exponent, dst); -#endif +#else + if (Aligned(src) && Aligned(dst)) + NeuralPow(src, size, exponent, dst); + else + NeuralPow(src, size, exponent, dst); +#endif } template SIMD_INLINE void NeuralUpdateWeights(const float * x, const __m512 & a, const __m512 & b, float * d, float * w, __mmask16 m) @@ -1996,37 +1942,48 @@ namespace Simd _mm_storeu_ps(dst, _mm_add_ps(_mm_loadu_ps(dst), sum128)); } - template static SIMD_INLINE void Kernel4x4x16(const __m512 * a, size_t K, 
const float * b, __m512 * sums) + template static SIMD_INLINE void Kernel6x4x16(const __m512 * a, size_t K, const float * b, __m512 * sums) { - __m512 b0 = Load(b + 0 * K); - sums[0x0] = _mm512_fmadd_ps(a[0], b0, sums[0x0]); - sums[0x4] = _mm512_fmadd_ps(a[1], b0, sums[0x4]); - sums[0x8] = _mm512_fmadd_ps(a[2], b0, sums[0x8]); - sums[0xC] = _mm512_fmadd_ps(a[3], b0, sums[0xC]); - __m512 b1 = Load(b + 1 * K); - sums[0x1] = _mm512_fmadd_ps(a[0], b1, sums[0x1]); - sums[0x5] = _mm512_fmadd_ps(a[1], b1, sums[0x5]); - sums[0x9] = _mm512_fmadd_ps(a[2], b1, sums[0x9]); - sums[0xD] = _mm512_fmadd_ps(a[3], b1, sums[0xD]); - __m512 b2 = Load(b + 2 * K); - sums[0x2] = _mm512_fmadd_ps(a[0], b2, sums[0x2]); - sums[0x6] = _mm512_fmadd_ps(a[1], b2, sums[0x6]); - sums[0xA] = _mm512_fmadd_ps(a[2], b2, sums[0xA]); - sums[0xE] = _mm512_fmadd_ps(a[3], b2, sums[0xE]); - __m512 b3 = Load(b + 3 * K); - sums[0x3] = _mm512_fmadd_ps(a[0], b3, sums[0x3]); - sums[0x7] = _mm512_fmadd_ps(a[1], b3, sums[0x7]); - sums[0xB] = _mm512_fmadd_ps(a[2], b3, sums[0xB]); - sums[0xF] = _mm512_fmadd_ps(a[3], b3, sums[0xF]); + __m512 _b; + _b = Load(b + 0 * K); + sums[0x00] = _mm512_fmadd_ps(a[0], _b, sums[0x00]); + sums[0x04] = _mm512_fmadd_ps(a[1], _b, sums[0x04]); + sums[0x08] = _mm512_fmadd_ps(a[2], _b, sums[0x08]); + sums[0x0C] = _mm512_fmadd_ps(a[3], _b, sums[0x0C]); + sums[0x10] = _mm512_fmadd_ps(a[4], _b, sums[0x10]); + sums[0x14] = _mm512_fmadd_ps(a[5], _b, sums[0x14]); + _b = Load(b + 1 * K); + sums[0x01] = _mm512_fmadd_ps(a[0], _b, sums[0x01]); + sums[0x05] = _mm512_fmadd_ps(a[1], _b, sums[0x05]); + sums[0x09] = _mm512_fmadd_ps(a[2], _b, sums[0x09]); + sums[0x0D] = _mm512_fmadd_ps(a[3], _b, sums[0x0D]); + sums[0x11] = _mm512_fmadd_ps(a[4], _b, sums[0x11]); + sums[0x15] = _mm512_fmadd_ps(a[5], _b, sums[0x15]); + _b = Load(b + 2 * K); + sums[0x02] = _mm512_fmadd_ps(a[0], _b, sums[0x02]); + sums[0x06] = _mm512_fmadd_ps(a[1], _b, sums[0x06]); + sums[0x0A] = _mm512_fmadd_ps(a[2], _b, sums[0x0A]); + 
sums[0x0E] = _mm512_fmadd_ps(a[3], _b, sums[0x0E]); + sums[0x12] = _mm512_fmadd_ps(a[4], _b, sums[0x12]); + sums[0x16] = _mm512_fmadd_ps(a[5], _b, sums[0x16]); + _b = Load(b + 3 * K); + sums[0x03] = _mm512_fmadd_ps(a[0], _b, sums[0x03]); + sums[0x07] = _mm512_fmadd_ps(a[1], _b, sums[0x07]); + sums[0x0B] = _mm512_fmadd_ps(a[2], _b, sums[0x0B]); + sums[0x0F] = _mm512_fmadd_ps(a[3], _b, sums[0x0F]); + sums[0x13] = _mm512_fmadd_ps(a[4], _b, sums[0x13]); + sums[0x17] = _mm512_fmadd_ps(a[5], _b, sums[0x17]); } - template static SIMD_INLINE void Kernel4x1x16(const __m512 * a, const float * b, __m512 * sums) + template static SIMD_INLINE void Kernel6x1x16(const __m512 * a, const float * b, __m512 * sums) { __m512 b0 = Load(b); sums[0] = _mm512_fmadd_ps(a[0], b0, sums[0]); sums[1] = _mm512_fmadd_ps(a[1], b0, sums[1]); sums[2] = _mm512_fmadd_ps(a[2], b0, sums[2]); sums[3] = _mm512_fmadd_ps(a[3], b0, sums[3]); + sums[4] = _mm512_fmadd_ps(a[4], b0, sums[4]); + sums[5] = _mm512_fmadd_ps(a[5], b0, sums[5]); } template static SIMD_INLINE void Kernel3x4x16(const __m512 * a, size_t K, const float * b, __m512 * sums) @@ -2058,12 +2015,14 @@ namespace Simd sums[0x2] = _mm512_fmadd_ps(a[2], _b, sums[0x2]); } - template static SIMD_INLINE void Load4(const float * p, __m512 * a, size_t step, __mmask16 tail = -1) + template static SIMD_INLINE void Load6(const float * p, __m512 * a, size_t step, __mmask16 tail = -1) { a[0] = Load(p + 0 * step, tail); a[1] = Load(p + 1 * step, tail); a[2] = Load(p + 2 * step, tail); a[3] = Load(p + 3 * step, tail); + a[4] = Load(p + 4 * step, tail); + a[5] = Load(p + 5 * step, tail); } template static SIMD_INLINE void Load3(const float * p, __m512 * a, size_t step, __mmask16 tail = -1) @@ -2076,22 +2035,24 @@ namespace Simd template void Execute(size_t M, size_t N, size_t K, const float * a, const float * b, float * c) { size_t M3 = M / 3 * 3; - size_t M4 = Simd::AlignLo(M, 4); + size_t M6 = M / 6 * 6; size_t N4 = Simd::AlignLo(N, 4); size_t K16 = 
Simd::AlignLo(K, 16); __mmask16 tailMask = TailMask16(K - K16); size_t i = 0; #if SIMD_ZMM_COUNT == 32 - for (; i < M4; i += 4) + for (; i < M6; i += 6) { const float * pa = a + i*K; float * pc = c + i*N; size_t j = 0; - register __m512 _a[4]; + __m512 _a[6]; for (; j < N4; j += 4) { const float * pb = b + j*K; - register __m512 sums[16] = { + __m512 sums[24] = { + _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), + _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), @@ -2099,38 +2060,44 @@ namespace Simd size_t k = 0; for (; k < K16; k += 16) { - Load4(pa + k, _a, K); - Kernel4x4x16(_a, K, pb + k, sums); + Load6(pa + k, _a, K); + Kernel6x4x16(_a, K, pb + k, sums); } if (k < K) { - Load4(pa + k, _a, K, tailMask); - Kernel4x4x16(_a, K, pb + k, sums); + Load6(pa + k, _a, K, tailMask); + Kernel6x4x16(_a, K, pb + k, sums); } - Add4ExtractedSums(sums + 0x0, pc + 0 * N + j); - Add4ExtractedSums(sums + 0x4, pc + 1 * N + j); - Add4ExtractedSums(sums + 0x8, pc + 2 * N + j); - Add4ExtractedSums(sums + 0xC, pc + 3 * N + j); + Add4ExtractedSums(sums + 0x00, pc + 0 * N + j); + Add4ExtractedSums(sums + 0x04, pc + 1 * N + j); + Add4ExtractedSums(sums + 0x08, pc + 2 * N + j); + Add4ExtractedSums(sums + 0x0C, pc + 3 * N + j); + Add4ExtractedSums(sums + 0x10, pc + 4 * N + j); + Add4ExtractedSums(sums + 0x14, pc + 5 * N + j); } for (; j < N; ++j) { const float * pb = b + j*K; - register __m512 sums[4] = { _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() }; + __m512 sums[6] = { + _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), + _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() }; size_t k = 0; for (; k < K16; k 
+= 16) { - Load4(pa + k, _a, K); - Kernel4x1x16(_a, pb + k, sums); + Load6(pa + k, _a, K); + Kernel6x1x16(_a, pb + k, sums); } if (k < K) { - Load4(pa + k, _a, K, tailMask); - Kernel4x1x16(_a, pb + k, sums); + Load6(pa + k, _a, K, tailMask); + Kernel6x1x16(_a, pb + k, sums); } pc[0 * N + j] += ExtractSum(sums[0]); pc[1 * N + j] += ExtractSum(sums[1]); pc[2 * N + j] += ExtractSum(sums[2]); pc[3 * N + j] += ExtractSum(sums[3]); + pc[4 * N + j] += ExtractSum(sums[4]); + pc[5 * N + j] += ExtractSum(sums[5]); } } #endif @@ -2139,11 +2106,11 @@ namespace Simd const float * pa = a + i*K; float * pc = c + i*N; size_t j = 0; - register __m512 _a[3]; + __m512 _a[3]; for (; j < N4; j += 4) { const float * pb = b + j*K; - register __m512 sums[12] = { + __m512 sums[12] = { _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() }; @@ -2165,7 +2132,7 @@ namespace Simd for (; j < N; ++j) { const float * pb = b + j*K; - register __m512 sums[3] = { _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() }; + __m512 sums[3] = { _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() }; size_t k = 0; for (; k < K16; k += 16) { @@ -2190,16 +2157,16 @@ namespace Simd for (; j < N4; j += 4) { const float * pb = b + j*K; - register __m512 sums[4] = { _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() }; + __m512 sums[4] = { _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() }; size_t k = 0; for (; k < K16; k += 16) { - register __m512 _a = Load(pa + k); + __m512 _a = Load(pa + k); Kernel1x4x16(_a, K, pb + k, sums); } if (k < K) { - register __m512 _a = Load(pa + k, tailMask); + __m512 _a = Load(pa + k, tailMask); Kernel1x4x16(_a, K, pb + k, sums); } Add4ExtractedSums(sums + 0, pc + j); @@ -2207,16 +2174,16 @@ namespace 
Simd for (; j < N; ++j) { const float * pb = b + j*K; - register __m512 sum = _mm512_setzero_ps(); + __m512 sum = _mm512_setzero_ps(); size_t k = 0; for (; k < K16; k += 16) { - register __m512 _a = Load(pa + k); + __m512 _a = Load(pa + k); Kernel1x1x16(_a, pb + k, sum); } if (k < K) { - register __m512 _a = Load(pa + k, tailMask); + __m512 _a = Load(pa + k, tailMask); Kernel1x1x16(_a, pb + k, sum); } pc[j] += ExtractSum(sum); @@ -2380,7 +2347,7 @@ namespace Simd } src = tmp; } - if (cell == 32) + if (cell == 48) { for (size_t j = 0; j < N; j += cell) { @@ -2389,17 +2356,18 @@ namespace Simd { for (size_t k = 0; k < K; ++k) { - const float * psrc = src + k*N; - Store(dst + 0, Load(psrc + 0)); - Store(dst + F, Load(psrc + F)); - dst += 32; + const float * psrc = src + k * N; + Store(dst + 0 * F, Load(psrc + 0 * F)); + Store(dst + 1 * F, Load(psrc + 1 * F)); + Store(dst + 2 * F, Load(psrc + 2 * F)); + dst += 48; } } else { for (size_t k = 0; k < K; ++k) { - const float * psrc = src + k*N; + const float * psrc = src + k * N; size_t c = 0; for (; c < n; ++c) *(dst++) = *(psrc++); @@ -2535,104 +2503,140 @@ namespace Simd } } - template SIMD_INLINE void AddSums32(const __m512 * sums, size_t size, float * dst, size_t stride, const __mmask16 * tails) + template SIMD_INLINE void KernelMx48(size_t N, size_t K, const float * a, const float * b, float * c, size_t m, const __mmask16 * tails) { - for (size_t i = 0; i < size; ++i, dst += stride) - { - AddSum(sums[i + 0], dst + 00, tails[0]); - AddSum(sums[i + 4], dst + 16, tails[1]); - } - } - - template SIMD_INLINE void KernelMx32(size_t N, size_t K, const float * a, const float * b, float * c, size_t m, const __mmask16 * tails) - { - __m512 sums[8] = { _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), + __m512 sums[12] = { + _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), + _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), 
_mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() }; for (size_t k = 0; k < K; ++k) { __m512 b0 = Load(b + 00); __m512 b1 = Load(b + 16); + __m512 b2 = Load(b + 32); for (size_t s = 0; s < m; ++s) { __m512 a0 = _mm512_set1_ps(a[s]); sums[s + 0] = _mm512_fmadd_ps(b0, a0, sums[s + 0]); sums[s + 4] = _mm512_fmadd_ps(b1, a0, sums[s + 4]); + sums[s + 8] = _mm512_fmadd_ps(b2, a0, sums[s + 8]); } - b += 32; + b += 48; a += m; } - AddSums32(sums, m, c, N, tails); + for (size_t i = 0; i < m; ++i, c += N) + { + AddSum(sums[i + 0], c + 00, tails[0]); + AddSum(sums[i + 4], c + 16, tails[1]); + AddSum(sums[i + 8], c + 32, tails[2]); + } } - void Kernel4x32(size_t N, size_t K, const float * a, const float * b, float * c) + void Kernel4x48(size_t N, size_t K, const float * a, const float * b, float * c) { - register __m512 _a, b0, b1, c00, c01, c10, c11, c20, c21, c30, c31; + __m512 _a, b0, b1, b2, c00, c01, c02, c10, c11, c12, c20, c21, c22, c30, c31, c32; c00 = _mm512_setzero_ps(); c01 = _mm512_setzero_ps(); + c02 = _mm512_setzero_ps(); c10 = _mm512_setzero_ps(); c11 = _mm512_setzero_ps(); + c12 = _mm512_setzero_ps(); c20 = _mm512_setzero_ps(); c21 = _mm512_setzero_ps(); + c22 = _mm512_setzero_ps(); c30 = _mm512_setzero_ps(); c31 = _mm512_setzero_ps(); + c32 = _mm512_setzero_ps(); for (size_t k = 0; k < K; ++k) { b0 = _mm512_loadu_ps(b + 0 * F); b1 = _mm512_loadu_ps(b + 1 * F); + b2 = _mm512_loadu_ps(b + 2 * F); _a = _mm512_set1_ps(a[0]); c00 = _mm512_fmadd_ps(b0, _a, c00); c01 = _mm512_fmadd_ps(b1, _a, c01); + c02 = _mm512_fmadd_ps(b2, _a, c02); _a = _mm512_set1_ps(a[1]); c10 = _mm512_fmadd_ps(b0, _a, c10); c11 = _mm512_fmadd_ps(b1, _a, c11); + c12 = _mm512_fmadd_ps(b2, _a, c12); _a = _mm512_set1_ps(a[2]); c20 = _mm512_fmadd_ps(b0, _a, c20); c21 = _mm512_fmadd_ps(b1, _a, c21); + c22 = _mm512_fmadd_ps(b2, _a, c22); _a = _mm512_set1_ps(a[3]); c30 = _mm512_fmadd_ps(b0, _a, c30); c31 = _mm512_fmadd_ps(b1, _a, c31); - b += 32; + c32 = 
_mm512_fmadd_ps(b2, _a, c32); + b += 48; a += 4; } AddSum(c00, c + 0 * F); AddSum(c01, c + 1 * F); + AddSum(c02, c + 2 * F); c += N; AddSum(c10, c + 0 * F); AddSum(c11, c + 1 * F); + AddSum(c12, c + 2 * F); c += N; AddSum(c20, c + 0 * F); AddSum(c21, c + 1 * F); + AddSum(c22, c + 2 * F); c += N; AddSum(c30, c + 0 * F); AddSum(c31, c + 1 * F); + AddSum(c32, c + 2 * F); } - template void Execute4x32(size_t M, size_t N, size_t K, const float * a, const float * b, float * c) + template void Execute4x48(size_t M, size_t N, size_t K, const float * a, const float * b, float * c) { size_t M4 = Simd::AlignLo(M, 4); - size_t N32 = Simd::AlignLo(N, 32); - __mmask16 tailMasks[2]; - for (size_t i = 0; i < 2; ++i) - tailMasks[i] = TailMask16(N - N32 - F*i); - size_t i = 0; - for (; i < M4; i += 4) + size_t N48 = N/48*48; + __mmask16 tailMasks[3]; + for (size_t i = 0; i < 3; ++i) + tailMasks[i] = TailMask16(N - N48 - F*i); + if (M > N) { - size_t j = 0; - for (; j < N32; j += 32) - Kernel4x32(N, K, a + i * K, b + j * K, c + i * N + j); - if (j < N) - KernelMx32(N, K, a + i*K, b + j*K, c + i*N + j, 4, tailMasks); + size_t i = 0; + for (; i < M4; i += 4) + { + size_t j = 0; + for (; j < N48; j += 48) + Kernel4x48(N, K, a + i * K, b + j * K, c + i * N + j); + if (j < N) + KernelMx48(N, K, a + i * K, b + j * K, c + i * N + j, 4, tailMasks); + } + if (i < M) + { + size_t j = 0; + for (; j < N48; j += 48) + KernelMx48(N, K, a + i * K, b + j * K, c + i * N + j, M - M4, tailMasks); + if (j < N) + KernelMx48(N, K, a + i * K, b + j * K, c + i * N + j, M - M4, tailMasks); + } } - if (i < M) + else { size_t j = 0; - for (; j < N32; j += 32) - KernelMx32(N, K, a + i*K, b + j*K, c + i*N + j, M - M4, tailMasks); - if (j < N) - KernelMx32(N, K, a + i*K, b + j*K, c + i*N + j, M - M4, tailMasks); + for (; j < N48; j += 48) + { + size_t i = 0; + for (; i < M4; i += 4) + Kernel4x48(N, K, a + i * K, b + j * K, c + i * N + j); + if (M4 < M) + KernelMx48(N, K, a + i * K, b + j * K, c + i * N + j, M - 
M4, tailMasks); + } + if (N48 < N) + { + size_t i = 0; + for (; i < M4; i += 4) + KernelMx48(N, K, a + i * K, b + j * K, c + i * N + j, 4, tailMasks); + if (M4 < M) + KernelMx48(N, K, a + i * K, b + j * K, c + i * N + j, M - M4, tailMasks); + } } } @@ -2642,8 +2646,8 @@ namespace Simd { if (cellB == 16) Execute4x16(M, N, K, a, b, c); - if (cellB == 32) - Execute4x32(M, N, K, a, b, c); + if (cellB == 48) + Execute4x48(M, N, K, a, b, c); } } } @@ -2812,11 +2816,64 @@ namespace Simd } } + void AddConvolution1x1x16(const float * src, size_t srcDepth, const float * weight, float * dst, size_t dstDepth) + { + size_t dstDepth4 = dstDepth / 4 * 4; + size_t dstChannel = 0; + for (; dstChannel < dstDepth4; dstChannel += 4) + { + __m512 dst00 = _mm512_loadu_ps(dst + 0 * F); + __m512 dst10 = _mm512_loadu_ps(dst + 1 * F); + __m512 dst20 = _mm512_loadu_ps(dst + 2 * F); + __m512 dst30 = _mm512_loadu_ps(dst + 3 * F); + const float * psrc = src; + const float * pw0 = weight; + const float * pw1 = pw0 + srcDepth; + const float * pw2 = pw1 + srcDepth; + const float * pw3 = pw2 + srcDepth; + for (size_t srcChannel = 0; srcChannel < srcDepth; ++srcChannel) + { + __m512 _weight; + __m512 src0 = _mm512_loadu_ps(psrc + 0 * F); + _weight = _mm512_set1_ps(pw0[srcChannel]); + dst00 = _mm512_fmadd_ps(_weight, src0, dst00); + _weight = _mm512_set1_ps(pw1[srcChannel]); + dst10 = _mm512_fmadd_ps(_weight, src0, dst10); + _weight = _mm512_set1_ps(pw2[srcChannel]); + dst20 = _mm512_fmadd_ps(_weight, src0, dst20); + _weight = _mm512_set1_ps(pw3[srcChannel]); + dst30 = _mm512_fmadd_ps(_weight, src0, dst30); + psrc += 16; + } + _mm512_storeu_ps(dst + 0 * F, dst00); + _mm512_storeu_ps(dst + 1 * F, dst10); + _mm512_storeu_ps(dst + 2 * F, dst20); + _mm512_storeu_ps(dst + 3 * F, dst30); + dst += 16 * 4; + weight += srcDepth * 4; + } + for (; dstChannel < dstDepth; ++dstChannel) + { + __m512 dst0 = _mm512_loadu_ps(dst + 0 * F); + const float * psrc = src; + for (size_t srcChannel = 0; srcChannel < 
srcDepth; ++srcChannel) + { + __m512 weight0 = _mm512_set1_ps(*weight++); + dst0 = _mm512_fmadd_ps(weight0, _mm512_loadu_ps(psrc + 0 * F), dst0); + psrc += 16; + } + _mm512_storeu_ps(dst + 0 * F, dst0); + dst += 16; + } + } + void Execute(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, const float * weight, size_t kernelX, size_t kernelY, float * dst, size_t dstWidth, size_t dstHeight, size_t dstDepth) { assert(kernelX == kernelY); - if (kernelX == 2) + if (kernelX == 1 && dstWidth*dstHeight == 16) + AddConvolution1x1x16(src, srcDepth, weight, dst, dstDepth); + else if (kernelX == 2) AddConvolution(src, srcWidth, srcHeight, srcDepth, weight, dst, dstWidth, dstHeight, dstDepth); else if (kernelX == 3) AddConvolution(src, srcWidth, srcHeight, srcDepth, weight, dst, dstWidth, dstHeight, dstDepth); @@ -2830,9 +2887,11 @@ namespace Simd bool Preferable(size_t srcDepth, size_t kernelX, size_t kernelY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, size_t dstWidth, size_t dstHeight, size_t dstDepth) { - if (kernelX == kernelY && kernelX >= 2 && kernelX <= 5 && strideX*strideY*dilationX*dilationY == 1) + if (kernelX == kernelY && strideX*strideY*dilationX*dilationY == 1) { - if (dstWidth*dstHeight*kernelX*kernelY >= 8 * 8 * 3 * 3) + if (kernelX >= 2 && kernelX <= 5 && dstWidth*dstHeight*kernelX*kernelY >= 8 * 8 * 3 * 3) + return true; + if (kernelX == 1 && (dstWidth*dstHeight == 16))// || dstWidth * dstHeight == 64)) return true; } return false; @@ -2888,9 +2947,9 @@ namespace Simd break; case Ver1: cellA = 4; - cellB = 32; + cellB = 48; sizeA = M*K; - strideB = Simd::AlignHi(N, cellB); + strideB = (N + cellB - 1)/cellB*cellB; sizeB = strideB*K; if (kernelX*kernelY > 1) sizeT = sizeB; diff --git a/src/3rd/Simd/SimdAvx512fResizer.cpp b/src/3rd/Simd/SimdAvx512fResizer.cpp new file mode 100644 index 00000000..b4a30135 --- /dev/null +++ b/src/3rd/Simd/SimdAvx512fResizer.cpp @@ -0,0 +1,156 @@ +/* +* Simd Library 
(http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdResizer.h" +#include "Simd/SimdStore.h" + +namespace Simd +{ +#ifdef SIMD_AVX512F_ENABLE + namespace Avx512f + { + const __m512i K64_PERMUTE_FOR_PACK = SIMD_MM512_SETR_EPI64(0, 2, 4, 6, 1, 3, 5, 7); + + ResizerFloatBilinear::ResizerFloatBilinear(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, bool caffeInterp) + : Base::ResizerFloatBilinear(srcX, srcY, dstX, dstY, channels, sizeof(__m512), caffeInterp) + { + } + + void ResizerFloatBilinear::Run(const float * src, size_t srcStride, float * dst, size_t dstStride) const + { + Array32f bx[2]; + bx[0].Resize(_rs); + bx[1].Resize(_rs); + float * pbx[2] = { bx[0].data, bx[1].data }; + int32_t prev = -2; + size_t rsa = AlignLo(_rs, Avx512f::F); + __mmask16 tail = TailMask16(_rs - rsa); + for (size_t dy = 0; dy < _dy; dy++, dst += dstStride) + { + float fy1 = _ay[dy]; + float fy0 = 1.0f - fy1; + int32_t sy = _iy[dy]; + int32_t k = 0; + + if (sy == prev) + k = 2; + else if (sy == prev + 1) + { + Swap(pbx[0], pbx[1]); + k = 1; + } + + prev = sy; + + for (; k < 2; k++) + { + float * pb = pbx[k]; + const float * ps = src + (sy + k)*srcStride; + size_t dx = 0; + if (_cn == 1) + { + __m512 _1 = _mm512_set1_ps(1.0f); + for (; dx < rsa; dx += Avx512f::F) + { + __m512i idx = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_PACK, _mm512_load_si512(_ix.data + dx)); + __m512 sp0 = _mm512_castpd_ps(_mm512_i32gather_pd(_mm512_extracti64x4_epi64(idx, 0), (double*)ps, 4)); + __m512 sp1 = _mm512_castpd_ps(_mm512_i32gather_pd(_mm512_extracti64x4_epi64(idx, 1), (double*)ps, 4)); + __m512 fx1 = _mm512_load_ps(_ax.data + dx); + __m512 fx0 = _mm512_sub_ps(_1, fx1); + __m512 s0 = _mm512_shuffle_ps(sp0, sp1, 0x88); + __m512 s1 = _mm512_shuffle_ps(sp0, sp1, 0xDD); + _mm512_store_ps(pb + dx, _mm512_fmadd_ps(s0, fx0, _mm512_mul_ps(s1, fx1))); + } + if (dx < _rs) + { + __m512i idx = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_PACK, _mm512_maskz_loadu_epi32(tail, _ix.data + dx)); + __m512 
sp0 = _mm512_castpd_ps(_mm512_i32gather_pd(_mm512_extracti64x4_epi64(idx, 0), (double*)ps, 4)); + __m512 sp1 = _mm512_castpd_ps(_mm512_i32gather_pd(_mm512_extracti64x4_epi64(idx, 1), (double*)ps, 4)); + __m512 fx1 = _mm512_maskz_loadu_ps(tail, _ax.data + dx); + __m512 fx0 = _mm512_sub_ps(_1, fx1); + __m512 s0 = _mm512_shuffle_ps(sp0, sp1, 0x88); + __m512 s1 = _mm512_shuffle_ps(sp0, sp1, 0xDD); + _mm512_mask_store_ps(pb + dx, tail, _mm512_fmadd_ps(s0, fx0, _mm512_mul_ps(s1, fx1))); + } + } + else + { + __m512 _1 = _mm512_set1_ps(1.0f); + __m512i cn = _mm512_set1_epi32((int)_cn); + for (; dx < rsa; dx += Avx512f::F) + { + __m512i i0 = _mm512_load_si512(_ix.data + dx); + __m512i i1 = _mm512_add_epi32(i0, cn); + __m512 s0 = _mm512_i32gather_ps(i0, ps, 4); + __m512 s1 = _mm512_i32gather_ps(i1, ps, 4); + __m512 fx1 = _mm512_load_ps(_ax.data + dx); + __m512 fx0 = _mm512_sub_ps(_1, fx1); + _mm512_store_ps(pb + dx, _mm512_fmadd_ps(s0, fx0, _mm512_mul_ps(s1, fx1))); + } + if (dx < _rs) + { + __m512i i0 = _mm512_maskz_loadu_epi32(tail, _ix.data + dx); + __m512i i1 = _mm512_add_epi32(i0, cn); + __m512 s0 = _mm512_i32gather_ps(i0, ps, 4); + __m512 s1 = _mm512_i32gather_ps(i1, ps, 4); + __m512 fx1 = _mm512_maskz_loadu_ps(tail, _ax.data + dx); + __m512 fx0 = _mm512_sub_ps(_1, fx1); + _mm512_mask_store_ps(pb + dx, tail, _mm512_fmadd_ps(s0, fx0, _mm512_mul_ps(s1, fx1))); + } + } + } + + size_t dx = 0; + __m512 _fy0 = _mm512_set1_ps(fy0); + __m512 _fy1 = _mm512_set1_ps(fy1); + for (; dx < rsa; dx += Avx512f::F) + { + __m512 b0 = Load(pbx[0] + dx); + __m512 b1 = Load(pbx[1] + dx); + Store(dst + dx, _mm512_fmadd_ps(b0, _fy0, _mm512_mul_ps(b1, _fy1))); + } + if (dx < _rs) + { + __m512 b0 = Load(pbx[0] + dx, tail); + __m512 b1 = Load(pbx[1] + dx, tail); + Store(dst + dx, _mm512_fmadd_ps(b0, _fy0, _mm512_mul_ps(b1, _fy1)), tail); + } + } + } + + //--------------------------------------------------------------------- + + void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t 
dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) + { + if (type == SimdResizeChannelFloat && method == SimdResizeMethodBilinear) + return new ResizerFloatBilinear(srcX, srcY, dstX, dstY, channels, false); + else if (type == SimdResizeChannelFloat && method == SimdResizeMethodCaffeInterp) + return new ResizerFloatBilinear(srcX, srcY, dstX, dstY, channels, true); + else + return Base::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); + } + } +#endif //SIMD_AVX512f_ENABLE +} + diff --git a/src/3rd/Simd/SimdAvx512fSynet.cpp b/src/3rd/Simd/SimdAvx512fSynet.cpp new file mode 100644 index 00000000..a3756f81 --- /dev/null +++ b/src/3rd/Simd/SimdAvx512fSynet.cpp @@ -0,0 +1,368 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdStore.h" +#include "Simd/SimdExtract.h" +#include "Simd/SimdPow.h" +#include "Simd/SimdAvx2.h" +#include "Simd/SimdArray.h" + +namespace Simd +{ +#ifdef SIMD_AVX512F_ENABLE + namespace Avx512f + { + template SIMD_INLINE void SynetAddBias(const __m512 & bias, float * dst, __mmask16 tail = -1) + { + Store(dst, _mm512_add_ps((Load(dst, tail)), bias), tail); + } + + template SIMD_INLINE void SynetAddBias(const float * bias, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t partial = AlignLo(size, F); + __mmask16 tail = __mmask16(-1) >> (F + partial - size); + for (size_t i = 0; i < count; ++i) + { + size_t j = 0; + __m512 _bias = _mm512_set1_ps(bias[i]); + for (; j < aligned; j += QF) + { + SynetAddBias(_bias, dst + j + F * 0); + SynetAddBias(_bias, dst + j + F * 1); + SynetAddBias(_bias, dst + j + F * 2); + SynetAddBias(_bias, dst + j + F * 3); + } + for (; j < partial; j += F) + SynetAddBias(_bias, dst + j); + if(j < size) + SynetAddBias(_bias, dst + j, tail); + dst += size; + } + } + + void SynetAddBias(const float * bias, size_t count, size_t size, float * dst) + { + if (Aligned(dst) && Aligned(size)) + SynetAddBias(bias, count, size, dst); + else + SynetAddBias(bias, count, size, dst); + } + + template void SynetEltwiseLayerForwardProduct(const float * src0, const float * src1, float * dst, size_t offset, __mmask16 tail = -1) + { + Store(dst + offset, _mm512_mul_ps((Load(src0 + offset, tail)), (Load(src1 + offset, tail))), tail); + } + + template void SynetEltwiseLayerForwardProduct(float const * const * src, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t partial = AlignLo(size, F); + __mmask16 tail = __mmask16(-1) >> (F + partial - size); + const float * src0 = src[0]; + const float * src1 = src[1]; + size_t j = 0; + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardProduct(src0, src1, dst, j + F * 0); + 
SynetEltwiseLayerForwardProduct(src0, src1, dst, j + F * 1); + SynetEltwiseLayerForwardProduct(src0, src1, dst, j + F * 2); + SynetEltwiseLayerForwardProduct(src0, src1, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardProduct(src0, src1, dst, j); + if (j < size) + SynetEltwiseLayerForwardProduct(src0, src1, dst, j, tail); + for (size_t i = 2; i < count; ++i) + { + const float * srci = src[i]; + for (j = 0; j < aligned; j += QF) + { + SynetEltwiseLayerForwardProduct(dst, srci, dst, j + F * 0); + SynetEltwiseLayerForwardProduct(dst, srci, dst, j + F * 1); + SynetEltwiseLayerForwardProduct(dst, srci, dst, j + F * 2); + SynetEltwiseLayerForwardProduct(dst, srci, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardProduct(dst, srci, dst, j); + if (j < size) + SynetEltwiseLayerForwardProduct(dst, srci, dst, j, tail); + } + } + + template void SynetEltwiseLayerForwardSum(const float * src0, const __m512 & weight0, const float * src1, const __m512 & weight1, float * dst, size_t offset, __mmask16 tail = -1) + { + Store(dst + offset, _mm512_fmadd_ps((Load(src0 + offset, tail)), weight0, _mm512_mul_ps((Load(src1 + offset, tail)), weight1)), tail); + } + + template void SynetEltwiseLayerForwardSum(const float * src, const __m512 & weight, float * dst, size_t offset, __mmask16 tail = -1) + { + Store(dst + offset, _mm512_fmadd_ps((Load(src + offset, tail)), weight, (Load(dst + offset, tail))), tail); + } + + template void SynetEltwiseLayerForwardSum(float const * const * src, const float * weight, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t partial = AlignLo(size, F); + __mmask16 tail = __mmask16(-1) >> (F + partial - size); + const float * src0 = src[0]; + const float * src1 = src[1]; + __m512 weight0 = _mm512_set1_ps(weight[0]); + __m512 weight1 = _mm512_set1_ps(weight[1]); + size_t j = 0; + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardSum(src0, weight0, src1, 
weight1, dst, j + F * 0); + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 1); + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 2); + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j); + if (j < size) + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j, tail); + for (size_t i = 2; i < count; ++i) + { + const float * srci = src[i]; + __m512 weighti = _mm512_set1_ps(weight[i]); + for (j = 0; j < aligned; j += QF) + { + SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 0); + SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 1); + SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 2); + SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardSum(srci, weighti, dst, j); + if (j < size) + SynetEltwiseLayerForwardSum(srci, weighti, dst, j, tail); + } + } + + template void SynetEltwiseLayerForwardMax(const float * src0, const float * src1, float * dst, size_t offset, __mmask16 tail = -1) + { + Store(dst + offset, _mm512_max_ps((Load(src0 + offset, tail)), (Load(src1 + offset, tail))), tail); + } + + template void SynetEltwiseLayerForwardMax(float const * const * src, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t partial = AlignLo(size, F); + __mmask16 tail = __mmask16(-1) >> (F + partial - size); + const float * src0 = src[0]; + const float * src1 = src[1]; + size_t j = 0; + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardMax(src0, src1, dst, j + F * 0); + SynetEltwiseLayerForwardMax(src0, src1, dst, j + F * 1); + SynetEltwiseLayerForwardMax(src0, src1, dst, j + F * 2); + SynetEltwiseLayerForwardMax(src0, src1, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardMax(src0, src1, dst, j); + if(j < size) + 
SynetEltwiseLayerForwardMax(src0, src1, dst, j, tail); + for (size_t i = 2; i < count; ++i) + { + const float * srci = src[i]; + for (j = 0; j < aligned; j += QF) + { + SynetEltwiseLayerForwardMax(dst, srci, dst, j + F * 0); + SynetEltwiseLayerForwardMax(dst, srci, dst, j + F * 1); + SynetEltwiseLayerForwardMax(dst, srci, dst, j + F * 2); + SynetEltwiseLayerForwardMax(dst, srci, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardMax(dst, srci, dst, j); + if (j < size) + SynetEltwiseLayerForwardMax(dst, srci, dst, j, tail); + } + } + + template void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst) + { + switch (type) + { + case SimdSynetEltwiseOperationProduct: + SynetEltwiseLayerForwardProduct(src, count, size, dst); + break; + case SimdSynetEltwiseOperationSum: + SynetEltwiseLayerForwardSum(src, weight, count, size, dst); + break; + case SimdSynetEltwiseOperationMax: + SynetEltwiseLayerForwardMax(src, count, size, dst); + break; + default: + assert(0); + } + } + + void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst) + { + assert(count >= 2); + bool aligned = Aligned(dst) && Aligned(src[0]) && Aligned(src[1]); + for (size_t i = 2; i < count; ++i) + aligned = aligned && Aligned(src[i]); + if (aligned) + SynetEltwiseLayerForward(src, weight, count, size, type, dst); + else + SynetEltwiseLayerForward(src, weight, count, size, type, dst); + } + + template SIMD_INLINE void SynetLrnLayerCrossChannels(const float * src, size_t half, size_t count, size_t size, const float * k, float * dst) + { + size_t aligned = AlignLo(size, F); + __mmask16 tail = TailMask16(size - aligned); + Array32f sum(size, true), zero(size, true); + + for (size_t i = 0; i < half; ++i) + { + const float * pos = src + i * size; + size_t j = 0; + for (; j < aligned; j += F) + { 
+ __m512 _pos = Avx512f::Load(pos + j); + Avx512f::Store(sum.data + j, _mm512_fmadd_ps(_pos, _pos, Avx512f::Load(sum.data + j))); + } + if (j < size) + { + __m512 _pos = Avx512f::Load(pos + j, tail); + __m512 _sum = Avx512f::Load(sum.data + j, tail); + Avx512f::Store(sum.data + j, _mm512_fmadd_ps(_pos, _pos, _sum), tail); + } + } + + __m512 k0 = _mm512_set1_ps(k[0]); + __m512 k1 = _mm512_set1_ps(k[1]); + __m512 k2 = _mm512_set1_ps(k[2]); + Avx512f::Pow pow; + for (size_t i = 0; i < count; ++i) + { + const float * pos = (i < count - half) ? src + half * size : zero.data; + const float * neg = (i > half) ? src - (half + 1) * size : zero.data; + size_t j = 0; + for (; j < aligned; j += F) + { + __m512 _pos = Avx512f::Load(pos + j); + __m512 _neg = Avx512f::Load(neg + j); + __m512 _sum = Avx512f::Load(sum.data + j); + _sum = _mm512_fmadd_ps(_pos, _pos, _mm512_fnmadd_ps(_neg, _neg, _sum)); + __m512 _src = Avx512f::Load(src + j); + Avx512f::Store(sum.data + j, _sum); + Avx512f::Store(dst + j, _mm512_mul_ps(_src, pow(_mm512_fmadd_ps(k1, _sum, k0), k2))); + } + if (j < size) + { + __m512 _pos = Avx512f::Load(pos + j, tail); + __m512 _neg = Avx512f::Load(neg + j, tail); + __m512 _sum = Avx512f::Load(sum.data + j, tail); + _sum = _mm512_fmadd_ps(_pos, _pos, _mm512_fnmadd_ps(_neg, _neg, _sum)); + __m512 _src = Avx512f::Load(src + j, tail); + Avx512f::Store(sum.data + j, _sum, tail); + Avx512f::Store(dst + j, _mm512_mul_ps(_src, pow(_mm512_fmadd_ps(k1, _sum, k0), k2)), tail); + } + src += size; + dst += size; + } + } + + void SynetLrnLayerCrossChannels(const float * src, size_t half, size_t count, size_t size, const float * k, float * dst) + { + if (Aligned(src) && Aligned(dst) && Aligned(size)) + SynetLrnLayerCrossChannels(src, half, count, size, k, dst); + else + SynetLrnLayerCrossChannels(src, half, count, size, k, dst); + } + + template SIMD_INLINE void SynetScaleLayerForward(const float * src, const __m512 & scale, const __m512 & bias, float * dst, size_t offset, 
__mmask16 tail = -1) + { + Store(dst + offset, _mm512_fmadd_ps((Load(src + offset, tail)), scale, bias), tail); + } + + template SIMD_INLINE void SynetScaleLayerForward(const float * src, const __m512 & scale, float * dst, size_t offset, __mmask16 tail = -1) + { + Store(dst + offset, _mm512_mul_ps((Load(src + offset, tail)), scale), tail); + } + + template SIMD_INLINE void SynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t partial = AlignLo(size, F); + __mmask16 tail = __mmask16(-1) >> (F + partial - size); + if (bias) + { + for (size_t i = 0; i < count; ++i) + { + size_t j = 0; + __m512 _scale = _mm512_set1_ps(scale[i]); + __m512 _bias = _mm512_set1_ps(bias[i]); + for (; j < aligned; j += QF) + { + SynetScaleLayerForward(src, _scale, _bias, dst, j + F * 0); + SynetScaleLayerForward(src, _scale, _bias, dst, j + F * 1); + SynetScaleLayerForward(src, _scale, _bias, dst, j + F * 2); + SynetScaleLayerForward(src, _scale, _bias, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetScaleLayerForward(src, _scale, _bias, dst, j); + if (j < size) + SynetScaleLayerForward(src, _scale, _bias, dst, j, tail); + src += size; + dst += size; + } + } + else + { + for (size_t i = 0; i < count; ++i) + { + size_t j = 0; + __m512 _scale = _mm512_set1_ps(scale[i]); + for (; j < aligned; j += QF) + { + SynetScaleLayerForward(src, _scale, dst, j + F * 0); + SynetScaleLayerForward(src, _scale, dst, j + F * 1); + SynetScaleLayerForward(src, _scale, dst, j + F * 2); + SynetScaleLayerForward(src, _scale, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetScaleLayerForward(src, _scale, dst, j); + if (j < size) + SynetScaleLayerForward(src, _scale, dst, j, tail); + src += size; + dst += size; + } + } + } + + void SynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst) + { + if (Aligned(dst) && 
Aligned(size)) + SynetScaleLayerForward(src, scale, bias, count, size, dst); + else + SynetScaleLayerForward(src, scale, bias, count, size, dst); + } + } +#endif// SIMD_AVX512F_ENABLE +} diff --git a/src/3rd/Simd/SimdBase.h b/src/3rd/Simd/SimdBase.h index 6c4a6646..3750e7ed 100644 --- a/src/3rd/Simd/SimdBase.h +++ b/src/3rd/Simd/SimdBase.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar, +* Copyright (c) 2011-2018 Yermalayeu Ihar, * 2014-2016 Antonenka Mikhail. * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -31,6 +31,10 @@ namespace Simd { namespace Base { + size_t GetThreadNumber(); + + void SetThreadNumber(size_t threadNumber); + uint32_t Crc32c(const void * src, size_t size); void AbsDifferenceSum(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, @@ -227,13 +231,19 @@ namespace Simd void SquaredDifferenceSum16f(const uint16_t * a, const uint16_t * b, size_t size, float * sum); + void CosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance); + void Float32ToUint8(const float * src, size_t size, const float * lower, const float * upper, uint8_t * dst); void Uint8ToFloat32(const uint8_t * src, size_t size, const float * lower, const float * upper, float * dst); + void CosineDistance32f(const float * a, const float * b, size_t size, float * distance); + void GaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride); + void Gemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc); + void GrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride); void GrayToBgra(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgra, size_t bgraStride, uint8_t alpha); @@ 
-269,7 +279,7 @@ namespace Simd void HogLiteExtractFeatures(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t cell, float * features, size_t featuresStride); - void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride); + void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride); void HogLiteResizeFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight); @@ -495,6 +505,8 @@ namespace Simd void ValueSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); void SquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); + + void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum); void CorrelationSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum); @@ -503,6 +515,14 @@ namespace Simd void SvmSumLinear(const float * x, const float * svs, const float * weights, size_t length, size_t count, float * sum); + void SynetAddBias(const float * bias, size_t count, size_t size, float * dst); + + void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst); + + void SynetLrnLayerCrossChannels(const float * src, size_t half, size_t count, size_t size, const float * k, float * dst); + + void SynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t 
size, float * dst); + void TextureBoostedSaturatedGradient(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t saturation, uint8_t boost, uint8_t * dx, size_t dxStride, uint8_t * dy, size_t dyStride); diff --git a/src/3rd/Simd/SimdBaseDetection.cpp b/src/3rd/Simd/SimdBaseDetection.cpp index 37fbbbe5..e7a1830e 100644 --- a/src/3rd/Simd/SimdBaseDetection.cpp +++ b/src/3rd/Simd/SimdBaseDetection.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -94,7 +94,8 @@ namespace Simd tinyxml2::XMLNode * child = parent->FirstChild(); if (child == NULL) SIMD_EX("Invalid node!"); - std::stringstream ss(tinyxml2::XMLUtil::SkipWhiteSpace(child->Value())); + int curLineNum = 0; + std::stringstream ss(tinyxml2::XMLUtil::SkipWhiteSpace(child->Value(), &curLineNum)); std::vector values; while (!ss.eof()) { @@ -900,10 +901,5 @@ namespace Simd Rect(left, top, right, bottom), Image(hid.sum.width - 1, hid.sum.height - 1, dstStride, Image::Gray8, dst).Ref()); } - - void DetectionFree(void * ptr) - { - delete (Deletable*)ptr; - } } } diff --git a/src/3rd/Simd/SimdBaseFloat16.cpp b/src/3rd/Simd/SimdBaseFloat16.cpp index 0c1d268b..1ae91895 100644 --- a/src/3rd/Simd/SimdBaseFloat16.cpp +++ b/src/3rd/Simd/SimdBaseFloat16.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -146,5 +146,19 @@ namespace Simd sums[0] += SquaredDifference16f(a[i], b[i]); *sum = sums[0] + sums[1] + sums[2] + sums[3]; } + + void CosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance) + { + float aa = 0, ab = 0, bb = 0; + for (size_t i = 0; i < size; ++i) + { + float _a = Float16ToFloat32(a[i]); + float _b = Float16ToFloat32(b[i]); + aa += _a * _a; + ab += _a * _b; + bb += _b * _b; + } + *distance = 1.0f - ab / ::sqrt(aa*bb); + } } } diff --git a/src/3rd/Simd/SimdBaseFloat32.cpp b/src/3rd/Simd/SimdBaseFloat32.cpp index 127825fa..23f19a2a 100644 --- a/src/3rd/Simd/SimdBaseFloat32.cpp +++ b/src/3rd/Simd/SimdBaseFloat32.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -51,7 +51,7 @@ namespace Simd SIMD_INLINE float Uint8ToFloat32(int value, float lower, float boost) { - return value*boost - lower; + return value*boost + lower; } void Uint8ToFloat32(const uint8_t * src, size_t size, const float * lower, const float * upper, float * dst) @@ -69,5 +69,19 @@ namespace Simd for (; i < size; ++i) dst[i] = Uint8ToFloat32(src[i], _lower, boost); } + + void CosineDistance32f(const float * a, const float * b, size_t size, float * distance) + { + float aa = 0, ab = 0, bb = 0; + for (size_t i = 0; i < size; ++i) + { + float _a = a[i]; + float _b = b[i]; + aa += _a * _a; + ab += _a * _b; + bb += _b * _b; + } + *distance = 1.0f - ab / ::sqrt(aa*bb); + } } } diff --git a/src/3rd/Simd/SimdBaseGemm32f.cpp b/src/3rd/Simd/SimdBaseGemm32f.cpp new file mode 100644 index 00000000..9425bb7d --- /dev/null +++ 
b/src/3rd/Simd/SimdBaseGemm32f.cpp @@ -0,0 +1,48 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdDefs.h" + +namespace Simd +{ + namespace Base + { + void Gemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc) + { + float b = beta[0]; + for (size_t i = 0; i < M; ++i) + { + float * pC = C + i * ldc; + for (size_t j = 0; j < N; ++j) + pC[j] = b * pC[j]; + for (size_t k = 0; k < K; ++k) + { + const float * pB = B + k * ldb; + float a = alpha[0] * A[i*lda + k]; + for (size_t j = 0; j < N; ++j) + pC[j] = a * pB[j] + pC[j]; + } + } + } + } +} diff --git a/src/3rd/Simd/SimdBaseHogLite.cpp b/src/3rd/Simd/SimdBaseHogLite.cpp index 78119de6..bd9570ee 100644 --- a/src/3rd/Simd/SimdBaseHogLite.cpp +++ b/src/3rd/Simd/SimdBaseHogLite.cpp @@ -214,9 +214,9 @@ namespace Simd class HogLiteFeatureFilter { - void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterSize, float * dst, size_t dstStride) + void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, float * dst, size_t dstStride) { - size_t filterStride = featureSize*filterSize; + size_t filterStride = featureSize*filterWidth; for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) { for (size_t dstCol = 0; dstCol < dstWidth; ++dstCol) @@ -224,7 +224,7 @@ namespace Simd float sum = 0; const float * pSrc = src + dstRow*srcStride + dstCol*featureSize; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { for (size_t filterCol = 0; filterCol < filterStride; ++filterCol) sum += pSrc[filterCol] * pFilter[filterCol]; @@ -237,9 +237,9 @@ namespace Simd } } - void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterSize, 
const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { - size_t filterStride = featureSize*filterSize; + size_t filterStride = featureSize*filterWidth; for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) { for (size_t dstCol = 0; dstCol < dstWidth; ++dstCol) @@ -249,7 +249,7 @@ namespace Simd float sum = 0; const float * pSrc = src + dstRow*srcStride + dstCol*featureSize; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { for (size_t filterCol = 0; filterCol < filterStride; ++filterCol) sum += pSrc[filterCol] * pFilter[filterCol]; @@ -266,24 +266,24 @@ namespace Simd } } public: - void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { assert(featureSize == 8 || featureSize == 16); - assert(srcWidth >= filterSize && srcHeight >= filterSize); + assert(srcWidth >= filterWidth && srcHeight >= filterHeight); - size_t dstWidth = srcWidth - filterSize + 1; - size_t dstHeight = srcHeight - filterSize + 1; + size_t dstWidth = srcWidth - filterWidth + 1; + size_t dstHeight = srcHeight - filterHeight + 1; if (mask) - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, 
filterHeight, mask, maskStride, dst, dstStride); else - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, dst, dstStride); } }; - void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { HogLiteFeatureFilter featureFilter; - featureFilter.Run(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + featureFilter.Run(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); } class HogLiteFeatureResizer diff --git a/src/3rd/Simd/SimdBaseNeural.cpp b/src/3rd/Simd/SimdBaseNeural.cpp index c5fc008b..5b03f0cd 100644 --- a/src/3rd/Simd/SimdBaseNeural.cpp +++ b/src/3rd/Simd/SimdBaseNeural.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "Simd/SimdMath.h" #include "Simd/SimdMemory.h" +#include "Simd/SimdPow.h" namespace Simd { diff --git a/src/3rd/Simd/SimdBaseReduceGray5x5.cpp b/src/3rd/Simd/SimdBaseReduceGray5x5.cpp index 2d16c539..a95c56d5 100644 --- a/src/3rd/Simd/SimdBaseReduceGray5x5.cpp +++ b/src/3rd/Simd/SimdBaseReduceGray5x5.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -208,9 +208,9 @@ namespace Simd ++sx; dx = dy; - register unsigned short * p_isc0 = buffer.isc0; - register unsigned short * p_isc1 = buffer.isc1; - register unsigned short * p_iscp = buffer.iscp; + unsigned short * p_isc0 = buffer.isc0; + unsigned short * p_isc1 = buffer.isc1; + unsigned short * p_iscp = buffer.iscp; // Main entries in row for (evenX = false, srcx = 1, dstx = 0; srcx < (srcWidth - 1); srcx += 2, ++sx) @@ -219,7 +219,7 @@ namespace Simd p_isc1++; p_iscp++; - register unsigned short icurrent = (unsigned short)(*sx); + unsigned short icurrent = (unsigned short)(*sx); isrp = icurrent * 4; icurrent = (unsigned short)(*(++sx)); @@ -239,7 +239,7 @@ namespace Simd //doing the last operation due to even number of operations in previous cycle if (!(srcWidth & 1)) { - register unsigned short icurrent = (unsigned short)(*sx); + unsigned short icurrent = (unsigned short)(*sx); isrp = icurrent * 4; ++dstx; evenX = !evenX; @@ -281,11 +281,11 @@ namespace Simd ++sx; // Main entries in odd-numbered row - register unsigned short * p_iscp = buffer.iscp; + unsigned short * p_iscp = buffer.iscp; for (evenX = false, srcx = 1, dstx = 0; srcx < (srcWidth - 1); srcx += 2, ++sx) { - register unsigned short icurrent = (unsigned short)(*sx); + unsigned short icurrent = (unsigned short)(*sx); isrp = icurrent * 4; p_iscp++; @@ -301,7 +301,7 @@ 
namespace Simd //doing the last operation due to even number of operations in previous cycle if (!(srcWidth & 1)) { - register unsigned short icurrent = (unsigned short)(*sx); + unsigned short icurrent = (unsigned short)(*sx); isrp = icurrent * 4; ++dstx; evenX = !evenX; diff --git a/src/3rd/Simd/SimdBaseResizer.cpp b/src/3rd/Simd/SimdBaseResizer.cpp new file mode 100644 index 00000000..1eba12f0 --- /dev/null +++ b/src/3rd/Simd/SimdBaseResizer.cpp @@ -0,0 +1,257 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdResizer.h" + +namespace Simd +{ + namespace Base + { + ResizerByteBilinear::ResizerByteBilinear(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels) + : Resizer(SimdResizeChannelByte, SimdResizeMethodBilinear) + , _sx(srcX), _sy(srcY), _dx(dstX), _dy(dstY), _cn(channels) + { + _ay.Resize(_dy); + _iy.Resize(_dy); + EstimateIndexAlpha(_sy, _dy, _iy.data, _ay.data, 1); + + _rs = _dx * _cn; + _ax.Resize(_rs); + _ix.Resize(_rs); + EstimateIndexAlpha(_sx, _dx, _ix.data, _ax.data, _cn); + } + + void ResizerByteBilinear::EstimateIndexAlpha(size_t srcSize, size_t dstSize, int32_t * indices, int32_t * alphas, size_t channels) + { + float scale = (float)srcSize / dstSize; + + for (size_t i = 0; i < dstSize; ++i) + { + float alpha = (float)((i + 0.5f)*scale - 0.5f); + ptrdiff_t index = (ptrdiff_t)::floor(alpha); + alpha -= index; + + if (index < 0) + { + index = 0; + alpha = 0; + } + + if (index >(ptrdiff_t)srcSize - 2) + { + index = srcSize - 2; + alpha = 1; + } + + for (size_t c = 0; c < channels; c++) + { + size_t offset = i * channels + c; + indices[offset] = (int32_t)(channels*index + c); + alphas[offset] = (int32_t)(alpha * FRACTION_RANGE + 0.5f); + } + } + } + + void ResizerByteBilinear::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) const + { + Array32i bx[2]; + bx[0].Resize(_rs); + bx[1].Resize(_rs); + int32_t * pbx[2] = { bx[0].data, bx[1].data }; + int32_t prev = -2; + for (size_t dy = 0; dy < _dy; dy++, dst += dstStride) + { + int32_t fy = _ay[dy]; + int32_t sy = _iy[dy]; + int32_t k = 0; + + if (sy == prev) + k = 2; + else if (sy == prev + 1) + { + Swap(pbx[0], pbx[1]); + k = 1; + } + + prev = sy; + + for (; k < 2; k++) + { + int32_t * pb = pbx[k]; + const uint8_t * ps = src + (sy + k)*srcStride; + for (size_t dx = 0; dx < _rs; dx++) + { + int32_t sx = _ix[dx]; + int32_t fx = _ax[dx]; + int32_t t = ps[sx]; + pb[dx] = (t << LINEAR_SHIFT) + (ps[sx + _cn] - t)*fx; + } + 
} + + if (fy == 0) + for (size_t dx = 0; dx < _rs; dx++) + dst[dx] = ((pbx[0][dx] << LINEAR_SHIFT) + BILINEAR_ROUND_TERM) >> BILINEAR_SHIFT; + else if (fy == FRACTION_RANGE) + for (size_t dx = 0; dx < _rs; dx++) + dst[dx] = ((pbx[1][dx] << LINEAR_SHIFT) + BILINEAR_ROUND_TERM) >> BILINEAR_SHIFT; + else + { + for (size_t dx = 0; dx < _rs; dx++) + { + int32_t t = pbx[0][dx]; + dst[dx] = ((t << LINEAR_SHIFT) + (pbx[1][dx] - t)*fy + BILINEAR_ROUND_TERM) >> BILINEAR_SHIFT; + } + } + } + } + + //--------------------------------------------------------------------- + + ResizerFloatBilinear::ResizerFloatBilinear(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, size_t align, bool caffeInterp) + : Resizer(SimdResizeChannelFloat, SimdResizeMethodBilinear) + , _sx(srcX), _sy(srcY), _dx(dstX), _dy(dstY), _cn(channels) + { + _ay.Resize(_dy, false, align); + _iy.Resize(_dy, false, align); + EstimateIndexAlpha(_sy, _dy, _iy.data, _ay.data, 1, caffeInterp); + + _rs = _dx * _cn; + _ax.Resize(_rs, false, align); + _ix.Resize(_rs, false, align); + EstimateIndexAlpha(_sx, _dx, _ix.data, _ax.data, _cn, caffeInterp); + } + + void ResizerFloatBilinear::EstimateIndexAlpha(size_t srcSize, size_t dstSize, int32_t * indices, float * alphas, size_t channels, bool caffeInterp) + { + if (caffeInterp) + { + float scale = dstSize > 1 ? 
float(srcSize - 1) / float(dstSize - 1) : 0.0f; + for (size_t i = 0; i < dstSize; ++i) + { + float alpha = float(i)*scale; + ptrdiff_t index = (ptrdiff_t)::floor(alpha); + alpha -= index; + if (index > (ptrdiff_t)srcSize - 2) + { + index = srcSize - 2; + alpha = 1; + } + for (size_t c = 0; c < channels; c++) + { + size_t offset = i * channels + c; + indices[offset] = (int32_t)(channels*index + c); + alphas[offset] = alpha; + } + } + } + else + { + float scale = (float)srcSize / dstSize; + for (size_t i = 0; i < dstSize; ++i) + { + float alpha = (float)((i + 0.5f)*scale - 0.5f); + ptrdiff_t index = (ptrdiff_t)::floor(alpha); + alpha -= index; + if (index < 0) + { + index = 0; + alpha = 0; + } + if (index >(ptrdiff_t)srcSize - 2) + { + index = srcSize - 2; + alpha = 1; + } + for (size_t c = 0; c < channels; c++) + { + size_t offset = i * channels + c; + indices[offset] = (int32_t)(channels*index + c); + alphas[offset] = alpha; + } + } + } + } + + void ResizerFloatBilinear::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) const + { + Run((const float*)src, srcStride / sizeof(float), (float*)dst, dstStride / sizeof(float)); + } + + void ResizerFloatBilinear::Run(const float * src, size_t srcStride, float * dst, size_t dstStride) const + { + Array32f bx[2]; + bx[0].Resize(_rs); + bx[1].Resize(_rs); + float * pbx[2] = { bx[0].data, bx[1].data }; + int32_t prev = -2; + for (size_t dy = 0; dy < _dy; dy++, dst += dstStride) + { + float fy1 = _ay[dy]; + float fy0 = 1.0f - fy1; + int32_t sy = _iy[dy]; + int32_t k = 0; + + if (sy == prev) + k = 2; + else if (sy == prev + 1) + { + Swap(pbx[0], pbx[1]); + k = 1; + } + + prev = sy; + + for (; k < 2; k++) + { + float * pb = pbx[k]; + const float * ps = src + (sy + k)*srcStride; + for (size_t dx = 0; dx < _rs; dx++) + { + int32_t sx = _ix[dx]; + float fx = _ax[dx]; + pb[dx] = ps[sx]*(1.0f - fx) + ps[sx + _cn]*fx; + } + } + + for (size_t dx = 0; dx < _rs; dx++) + dst[dx] = pbx[0][dx]*fy0 + pbx[1][dx]*fy1; 
+ } + } + + //--------------------------------------------------------------------- + + void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) + { + if (type == SimdResizeChannelByte && method == SimdResizeMethodBilinear) + return new ResizerByteBilinear(srcX, srcY, dstX, dstY, channels); + else if (type == SimdResizeChannelFloat && method == SimdResizeMethodBilinear) + return new ResizerFloatBilinear(srcX, srcY, dstX, dstY, channels, sizeof(void*), false); + else if (type == SimdResizeChannelFloat && method == SimdResizeMethodCaffeInterp) + return new ResizerFloatBilinear(srcX, srcY, dstX, dstY, channels, sizeof(void*), true); + else + return NULL; + } + } +} + diff --git a/src/3rd/Simd/SimdBaseStatistic.cpp b/src/3rd/Simd/SimdBaseStatistic.cpp index c95634cd..c897f09b 100644 --- a/src/3rd/Simd/SimdBaseStatistic.cpp +++ b/src/3rd/Simd/SimdBaseStatistic.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -220,6 +220,28 @@ namespace Simd src += stride; } } + + void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum) + { + assert(width < 0x10000); + + *valueSum = 0; + *squareSum = 0; + for (size_t row = 0; row < height; ++row) + { + int rowValueSum = 0; + int rowSquareSum = 0; + for (size_t col = 0; col < width; ++col) + { + int value = src[col]; + rowValueSum += value; + rowSquareSum += Square(value); + } + *valueSum += rowValueSum; + *squareSum += rowSquareSum; + src += stride; + } + } void CorrelationSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum) { diff --git a/src/3rd/Simd/SimdBaseSynet.cpp b/src/3rd/Simd/SimdBaseSynet.cpp new file mode 100644 index 00000000..76507206 --- /dev/null +++ b/src/3rd/Simd/SimdBaseSynet.cpp @@ -0,0 +1,232 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ +#include "Simd/SimdArray.h" +#include "Simd/SimdPow.h" + +namespace Simd +{ + namespace Base + { + void SynetAddBias(const float * bias, size_t count, size_t size, float * dst) + { + size_t aligned = Simd::AlignLo(size, 4); + for (size_t i = 0; i < count; ++i) + { + float value = bias[i]; + size_t j = 0; + for (; j < aligned; j += 4) + { + dst[j + 0] += value; + dst[j + 1] += value; + dst[j + 2] += value; + dst[j + 3] += value; + } + for (; j < size; ++j) + dst[j] += value; + dst += size; + } + } + + void SynetEltwiseLayerForwardProduct(float const * const * src, size_t count, size_t size, float * dst) + { + size_t aligned = Simd::AlignLo(size, 4); + const float * src0 = src[0]; + const float * src1 = src[1]; + size_t j = 0; + for (; j < aligned; j += 4) + { + dst[j + 0] = src0[j + 0] * src1[j + 0]; + dst[j + 1] = src0[j + 1] * src1[j + 1]; + dst[j + 2] = src0[j + 2] * src1[j + 2]; + dst[j + 3] = src0[j + 3] * src1[j + 3]; + } + for (; j < size; ++j) + dst[j] = src0[j] * src1[j]; + for (size_t i = 2; i < count; ++i) + { + const float * srci = src[i]; + for (j = 0; j < aligned; j += 4) + { + dst[j + 0] *= srci[j + 0]; + dst[j + 1] *= srci[j + 1]; + dst[j + 2] *= srci[j + 2]; + dst[j + 3] *= srci[j + 3]; + } + for (; j < size; ++j) + dst[j] *= srci[j]; + } + } + + void SynetEltwiseLayerForwardSum(float const * const * src, const float * weight, size_t count, size_t size, float * dst) + { + size_t aligned = Simd::AlignLo(size, 4); + const float * src0 = src[0]; + const float * src1 = src[1]; + float weight0 = weight[0], weight1 = weight[1]; + size_t j = 0; + for (; j < aligned; j += 4) + { + dst[j + 0] = src0[j + 0] * weight0 + src1[j + 0] * weight1; + dst[j + 1] = src0[j + 1] * weight0 + src1[j + 
1] * weight1; + dst[j + 2] = src0[j + 2] * weight0 + src1[j + 2] * weight1; + dst[j + 3] = src0[j + 3] * weight0 + src1[j + 3] * weight1; + } + for (; j < size; ++j) + dst[j] = src0[j] * weight0 + src1[j] * weight1; + for (size_t i = 2; i < count; ++i) + { + const float * srci = src[i]; + float weighti = weight[i]; + for (j = 0; j < aligned; j += 4) + { + dst[j + 0] += srci[j + 0] * weighti; + dst[j + 1] += srci[j + 1] * weighti; + dst[j + 2] += srci[j + 2] * weighti; + dst[j + 3] += srci[j + 3] * weighti; + } + for (; j < size; ++j) + dst[j] += srci[j] * weighti; + } + } + + void SynetEltwiseLayerForwardMax(float const * const * src, size_t count, size_t size, float * dst) + { + size_t aligned = Simd::AlignLo(size, 4); + const float * src0 = src[0]; + const float * src1 = src[1]; + size_t j = 0; + for (; j < aligned; j += 4) + { + dst[j + 0] = Simd::Max(src0[j + 0], src1[j + 0]); + dst[j + 1] = Simd::Max(src0[j + 1], src1[j + 1]); + dst[j + 2] = Simd::Max(src0[j + 2], src1[j + 2]); + dst[j + 3] = Simd::Max(src0[j + 3], src1[j + 3]); + } + for (; j < size; ++j) + dst[j] = Simd::Max(src0[j], src1[j]); + for (size_t i = 2; i < count; ++i) + { + const float * srci = src[i]; + for (j = 0; j < aligned; j += 4) + { + dst[j + 0] = Simd::Max(dst[j + 0], srci[j + 0]); + dst[j + 1] = Simd::Max(dst[j + 1], srci[j + 1]); + dst[j + 2] = Simd::Max(dst[j + 2], srci[j + 2]); + dst[j + 3] = Simd::Max(dst[j + 3], srci[j + 3]); + } + for (; j < size; ++j) + dst[j] = Simd::Max(dst[j], srci[j]); + } + } + + void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst) + { + switch (type) + { + case SimdSynetEltwiseOperationProduct: + SynetEltwiseLayerForwardProduct(src, count, size, dst); + break; + case SimdSynetEltwiseOperationSum: + SynetEltwiseLayerForwardSum(src, weight, count, size, dst); + break; + case SimdSynetEltwiseOperationMax: + SynetEltwiseLayerForwardMax(src, count, size, dst); + 
break; + default: + assert(0); + } + } + + void SynetLrnLayerCrossChannels(const float * src, size_t half, size_t count, size_t size, const float * k, float * dst) + { + float k0 = k[0], k1 = k[1], k2 = k[2]; + Array32f sum(size, true), zero(size, true); + + for (size_t i = 0; i < half; ++i) + { + const float * pos = src + i * size; + for (size_t j = 0; j < size; ++j) + sum[j] += Simd::Square(pos[j]); + } + + for (size_t i = 0; i < count; ++i) + { + const float * pos = (i < count - half) ? src + half * size : zero.data; + const float * neg = (i > half) ? src - (half + 1) * size : zero.data; + for (size_t j = 0; j < size; ++j) + { + sum[j] += Simd::Square(pos[j]); + sum[j] -= Simd::Square(neg[j]); + dst[j] = src[j] * Pow(k0 + k1 * sum[j], k2); + } + src += size; + dst += size; + } + } + + void SynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst) + { + size_t aligned = Simd::AlignLo(size, 4); + if (bias) + { + for (size_t i = 0; i < count; ++i) + { + float s = scale[i]; + float b = bias[i]; + size_t j = 0; + for (; j < aligned; j += 4) + { + dst[j + 0] = src[j + 0] * s + b; + dst[j + 1] = src[j + 1] * s + b; + dst[j + 2] = src[j + 2] * s + b; + dst[j + 3] = src[j + 3] * s + b; + } + for (; j < size; ++j) + dst[j] = src[j] * s + b; + src += size; + dst += size; + } + } + else + { + for (size_t i = 0; i < count; ++i) + { + float s = scale[i]; + size_t j = 0; + for (; j < aligned; j += 4) + { + dst[j + 0] = src[j + 0] * s; + dst[j + 1] = src[j + 1] * s; + dst[j + 2] = src[j + 2] * s; + dst[j + 3] = src[j + 3] * s; + } + for (; j < size; ++j) + dst[j] = src[j] * s; + src += size; + dst += size; + } + } + } + } +} diff --git a/src/3rd/Simd/SimdBaseThread.cpp b/src/3rd/Simd/SimdBaseThread.cpp new file mode 100644 index 00000000..fb000a65 --- /dev/null +++ b/src/3rd/Simd/SimdBaseThread.cpp @@ -0,0 +1,45 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). 
+* +* Copyright (c) 2011-2018 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ +#include "Simd/SimdMath.h" +#include "Simd/SimdBase.h" + +#include + +namespace Simd +{ + namespace Base + { + size_t g_threadNumber = 1; + + size_t GetThreadNumber() + { + return g_threadNumber; + } + + void SetThreadNumber(size_t threadNumber) + { + g_threadNumber = Simd::RestrictRange(threadNumber, 1, std::thread::hardware_concurrency()); + } + } +} diff --git a/src/3rd/Simd/SimdBase_tinyxml2.cpp b/src/3rd/Simd/SimdBase_tinyxml2.cpp index 2f9e5d0a..4541dcb4 100644 --- a/src/3rd/Simd/SimdBase_tinyxml2.cpp +++ b/src/3rd/Simd/SimdBase_tinyxml2.cpp @@ -21,15 +21,86 @@ must not be misrepresented as being the original software. distribution. */ -#include "SimdBase_tinyxml2.h" +#include "Simd/SimdBase_tinyxml2.h" #include // yes, this one new style header, is in the Android SDK. 
-#if defined(ANDROID_NDK) || defined(__QNXNTO__) +#if defined(ANDROID_NDK) || defined(__BORLANDC__) || defined(__QNXNTO__) # include +# include #else # include +# include #endif +#if defined(_MSC_VER) && (_MSC_VER >= 1400 ) && (!defined WINCE) +// Microsoft Visual Studio, version 2005 and higher. Not WinCE. +/*int _snprintf_s( +char *buffer, +size_t sizeOfBuffer, +size_t count, +const char *format [, +argument] ... +);*/ +static inline int TIXML_SNPRINTF(char* buffer, size_t size, const char* format, ...) +{ + va_list va; + va_start(va, format); + int result = vsnprintf_s(buffer, size, _TRUNCATE, format, va); + va_end(va); + return result; +} + +static inline int TIXML_VSNPRINTF(char* buffer, size_t size, const char* format, va_list va) +{ + int result = vsnprintf_s(buffer, size, _TRUNCATE, format, va); + return result; +} + +#define TIXML_VSCPRINTF _vscprintf +#define TIXML_SSCANF sscanf_s +#elif defined _MSC_VER +// Microsoft Visual Studio 2003 and earlier or WinCE +#define TIXML_SNPRINTF _snprintf +#define TIXML_VSNPRINTF _vsnprintf +#define TIXML_SSCANF sscanf +#if (_MSC_VER < 1400 ) && (!defined WINCE) +// Microsoft Visual Studio 2003 and not WinCE. +#define TIXML_VSCPRINTF _vscprintf // VS2003's C runtime has this, but VC6 C runtime or WinCE SDK doesn't have. +#else +// Microsoft Visual Studio 2003 and earlier or WinCE. +static inline int TIXML_VSCPRINTF(const char* format, va_list va) +{ + int len = 512; + for (;;) { + len = len * 2; + char* str = new char[len](); + const int required = _vsnprintf(str, len, format, va); + delete[] str; + if (required != -1) { + TIXMLASSERT(required >= 0); + len = required; + break; + } + } + TIXMLASSERT(len >= 0); + return len; +} +#endif +#else +// GCC version 3 and higher +//#warning( "Using sn* functions." 
) +#define TIXML_SNPRINTF snprintf +#define TIXML_VSNPRINTF vsnprintf +static inline int TIXML_VSCPRINTF(const char* format, va_list va) +{ + int len = vsnprintf(0, 0, format, va); + TIXMLASSERT(len >= 0); + return len; +} +#define TIXML_SSCANF sscanf +#endif + + static const char LINE_FEED = (char)0x0a; // all line endings are normalized to LF static const char LF = LINE_FEED; static const char CARRIAGE_RETURN = (char)0x0d; // CR gets filtered out @@ -57,10 +128,10 @@ namespace tinyxml2 static const int NUM_ENTITIES = 5; static const Entity entities[NUM_ENTITIES] = { { "quot", 4, DOUBLE_QUOTE }, - { "amp", 3, '&' }, - { "apos", 4, SINGLE_QUOTE }, - { "lt", 2, '<' }, - { "gt", 2, '>' } + { "amp", 3, '&' }, + { "apos", 4, SINGLE_QUOTE }, + { "lt", 2, '<' }, + { "gt", 2, '>' } }; @@ -78,6 +149,7 @@ namespace tinyxml2 // This in effect implements the assignment operator by "moving" // ownership (as in auto_ptr). + TIXMLASSERT(other != 0); TIXMLASSERT(other->_flags == 0); TIXMLASSERT(other->_start == 0); TIXMLASSERT(other->_end == 0); @@ -93,6 +165,7 @@ namespace tinyxml2 _end = 0; } + void StrPair::Reset() { if (_flags & NEEDS_DELETE) { @@ -106,8 +179,10 @@ namespace tinyxml2 void StrPair::SetStr(const char* str, int flags) { + TIXMLASSERT(str); Reset(); size_t len = strlen(str); + TIXMLASSERT(_start == 0); _start = new char[len + 1]; memcpy(_start, str, len + 1); _end = _start + len; @@ -115,9 +190,11 @@ namespace tinyxml2 } - char* StrPair::ParseText(char* p, const char* endTag, int strFlags) + char* StrPair::ParseText(char* p, const char* endTag, int strFlags, int* curLineNumPtr) { + TIXMLASSERT(p); TIXMLASSERT(endTag && *endTag); + TIXMLASSERT(curLineNumPtr); char* start = p; char endChar = *endTag; @@ -129,7 +206,11 @@ namespace tinyxml2 Set(start, p, strFlags); return p + length; } + else if (*p == '\n') { + ++(*curLineNumPtr); + } ++p; + TIXMLASSERT(p); } return 0; } @@ -160,15 +241,15 @@ namespace tinyxml2 // Adjusting _start would cause undefined behavior on 
delete[] TIXMLASSERT((_flags & NEEDS_DELETE) == 0); // Trim leading space. - _start = XMLUtil::SkipWhiteSpace(_start); + _start = XMLUtil::SkipWhiteSpace(_start, 0); if (*_start) { - char* p = _start; // the read pointer + const char* p = _start; // the read pointer char* q = _start; // the write pointer while (*p) { if (XMLUtil::IsWhiteSpace(*p)) { - p = XMLUtil::SkipWhiteSpace(p); + p = XMLUtil::SkipWhiteSpace(p, 0); if (*p == 0) { break; // don't write to q; this trims the trailing space. } @@ -193,7 +274,7 @@ namespace tinyxml2 _flags ^= NEEDS_FLUSH; if (_flags) { - char* p = _start; // the read pointer + const char* p = _start; // the read pointer char* q = _start; // the write pointer while (p < _end) { @@ -207,7 +288,8 @@ namespace tinyxml2 else { ++p; } - *q++ = LF; + *q = LF; + ++q; } else if ((_flags & NEEDS_NEWLINE_NORMALIZATION) && *p == LF) { if (*(p + 1) == CR) { @@ -216,7 +298,8 @@ namespace tinyxml2 else { ++p; } - *q++ = LF; + *q = LF; + ++q; } else if ((_flags & NEEDS_ENTITY_PROCESSING) && *p == '&') { // Entities handled by tinyXML2: @@ -243,8 +326,8 @@ namespace tinyxml2 } } else { - int i = 0; - for (; i < NUM_ENTITIES; ++i) { + bool entityFound = false; + for (int i = 0; i < NUM_ENTITIES; ++i) { const Entity& entity = entities[i]; if (strncmp(p + 1, entity.pattern, entity.length) == 0 && *(p + entity.length + 1) == ';') { @@ -252,10 +335,11 @@ namespace tinyxml2 *q = entity.value; ++q; p += entity.length + 2; + entityFound = true; break; } } - if (i == NUM_ENTITIES) { + if (!entityFound) { // fixme: treat as error? ++p; ++q; @@ -272,7 +356,7 @@ namespace tinyxml2 } // The loop below has plenty going on, and this // is a less useful mode. Break it out. 
- if (_flags & COLLAPSE_WHITESPACE) { + if (_flags & NEEDS_WHITESPACE_COLLAPSING) { CollapseWhitespace(); } _flags = (_flags & NEEDS_DELETE); @@ -286,6 +370,19 @@ namespace tinyxml2 // --------- XMLUtil ----------- // + const char* XMLUtil::writeBoolTrue = "true"; + const char* XMLUtil::writeBoolFalse = "false"; + + void XMLUtil::SetBoolSerialization(const char* writeTrue, const char* writeFalse) + { + static const char* defTrue = "true"; + static const char* defFalse = "false"; + + writeBoolTrue = (writeTrue) ? writeTrue : defTrue; + writeBoolFalse = (writeFalse) ? writeFalse : defFalse; + } + + const char* XMLUtil::ReadBOM(const char* p, bool* bom) { TIXMLASSERT(p); @@ -323,26 +420,30 @@ namespace tinyxml2 *length = 4; } else { - *length = 0; // This code won't covert this correctly anyway. + *length = 0; // This code won't convert this correctly anyway. return; } output += *length; - // Scary scary fall throughs. + // Scary scary fall throughs are annotated with carefully designed comments + // to suppress compiler warnings such as -Wimplicit-fallthrough in gcc switch (*length) { case 4: --output; *output = (char)((input | BYTE_MARK) & BYTE_MASK); input >>= 6; + //fall through case 3: --output; *output = (char)((input | BYTE_MARK) & BYTE_MASK); input >>= 6; + //fall through case 2: --output; *output = (char)((input | BYTE_MARK) & BYTE_MASK); input >>= 6; + //fall through case 1: --output; *output = (char)(input | FIRST_BYTE_MARK[*length]); @@ -397,8 +498,8 @@ namespace tinyxml2 else { return 0; } + TIXMLASSERT(digit < 16); TIXMLASSERT(digit == 0 || mult <= UINT_MAX / digit); - TIXMLASSERT(digit >= 0 && digit < 16); const unsigned int digitScaled = mult * digit; TIXMLASSERT(ucs <= ULONG_MAX - digitScaled); ucs += digitScaled; @@ -427,6 +528,7 @@ namespace tinyxml2 while (*q != '#') { if (*q >= '0' && *q <= '9') { const unsigned int digit = *q - '0'; + TIXMLASSERT(digit < 10); TIXMLASSERT(digit == 0 || mult <= UINT_MAX / digit); const unsigned int digitScaled = 
mult * digit; TIXMLASSERT(ucs <= ULONG_MAX - digitScaled); @@ -462,12 +564,12 @@ namespace tinyxml2 void XMLUtil::ToStr(bool v, char* buffer, int bufferSize) { - TIXML_SNPRINTF(buffer, bufferSize, "%d", v ? 1 : 0); + TIXML_SNPRINTF(buffer, bufferSize, "%s", v ? writeBoolTrue : writeBoolFalse); } /* - ToStr() of a number is a very tricky topic. - https://github.com/leethomason/tinyxml2/issues/106 + ToStr() of a number is a very tricky topic. + https://github.com/leethomason/tinyxml2/issues/106 */ void XMLUtil::ToStr(float v, char* buffer, int bufferSize) { @@ -481,6 +583,13 @@ namespace tinyxml2 } + void XMLUtil::ToStr(int64_t v, char* buffer, int bufferSize) + { + // horrible syntax trick to make the compiler happy about %lld + TIXML_SNPRINTF(buffer, bufferSize, "%lld", (long long)v); + } + + bool XMLUtil::ToInt(const char* str, int* value) { if (TIXML_SSCANF(str, "%d", value) == 1) { @@ -524,6 +633,7 @@ namespace tinyxml2 return false; } + bool XMLUtil::ToDouble(const char* str, double* value) { if (TIXML_SSCANF(str, "%lf", value) == 1) { @@ -533,72 +643,78 @@ namespace tinyxml2 } + bool XMLUtil::ToInt64(const char* str, int64_t* value) + { + long long v = 0; // horrible syntax trick to make the compiler happy about %lld + if (TIXML_SSCANF(str, "%lld", &v) == 1) { + *value = (int64_t)v; + return true; + } + return false; + } + + char* XMLDocument::Identify(char* p, XMLNode** node) { TIXMLASSERT(node); TIXMLASSERT(p); char* const start = p; - p = XMLUtil::SkipWhiteSpace(p); + int const startLine = _parseCurLineNum; + p = XMLUtil::SkipWhiteSpace(p, &_parseCurLineNum); if (!*p) { *node = 0; TIXMLASSERT(p); return p; } - // What is this thing? 
- // These strings define the matching patters: + // These strings define the matching patterns: static const char* xmlHeader = { "_memPool = &_commentPool; + returnNode = CreateUnlinkedNode(_commentPool); + returnNode->_parseLineNum = _parseCurLineNum; p += xmlHeaderLen; } else if (XMLUtil::StringEqual(p, commentHeader, commentHeaderLen)) { - TIXMLASSERT(sizeof(XMLComment) == _commentPool.ItemSize()); - returnNode = new (_commentPool.Alloc()) XMLComment(this); - returnNode->_memPool = &_commentPool; + returnNode = CreateUnlinkedNode(_commentPool); + returnNode->_parseLineNum = _parseCurLineNum; p += commentHeaderLen; } else if (XMLUtil::StringEqual(p, cdataHeader, cdataHeaderLen)) { - TIXMLASSERT(sizeof(XMLText) == _textPool.ItemSize()); - XMLText* text = new (_textPool.Alloc()) XMLText(this); + XMLText* text = CreateUnlinkedNode(_textPool); returnNode = text; - returnNode->_memPool = &_textPool; + returnNode->_parseLineNum = _parseCurLineNum; p += cdataHeaderLen; text->SetCData(true); } else if (XMLUtil::StringEqual(p, dtdHeader, dtdHeaderLen)) { - TIXMLASSERT(sizeof(XMLUnknown) == _commentPool.ItemSize()); - returnNode = new (_commentPool.Alloc()) XMLUnknown(this); - returnNode->_memPool = &_commentPool; + returnNode = CreateUnlinkedNode(_commentPool); + returnNode->_parseLineNum = _parseCurLineNum; p += dtdHeaderLen; } else if (XMLUtil::StringEqual(p, elementHeader, elementHeaderLen)) { - TIXMLASSERT(sizeof(XMLElement) == _elementPool.ItemSize()); - returnNode = new (_elementPool.Alloc()) XMLElement(this); - returnNode->_memPool = &_elementPool; + returnNode = CreateUnlinkedNode(_elementPool); + returnNode->_parseLineNum = _parseCurLineNum; p += elementHeaderLen; } else { - TIXMLASSERT(sizeof(XMLText) == _textPool.ItemSize()); - returnNode = new (_textPool.Alloc()) XMLText(this); - returnNode->_memPool = &_textPool; + returnNode = CreateUnlinkedNode(_textPool); + returnNode->_parseLineNum = _parseCurLineNum; // Report line of first non-whitespace character p = 
start; // Back it up, all the text counts. + _parseCurLineNum = startLine; } TIXMLASSERT(returnNode); @@ -627,8 +743,11 @@ namespace tinyxml2 XMLNode::XMLNode(XMLDocument* doc) : _document(doc), _parent(0), + _value(), + _parseLineNum(0), _firstChild(0), _lastChild(0), _prev(0), _next(0), + _userData(0), _memPool(0) { } @@ -644,6 +763,9 @@ namespace tinyxml2 const char* XMLNode::Value() const { + // Edge case: XMLDocuments don't have a Value. Return null. + if (this->ToDocument()) + return 0; return _value.GetStr(); } @@ -657,15 +779,24 @@ namespace tinyxml2 } } + XMLNode* XMLNode::DeepClone(XMLDocument* target) const + { + XMLNode* clone = this->ShallowClone(target); + if (!clone) return 0; + + for (const XMLNode* child = this->FirstChild(); child; child = child->NextSibling()) { + XMLNode* childClone = child->DeepClone(target); + TIXMLASSERT(childClone); + clone->InsertEndChild(childClone); + } + return clone; + } void XMLNode::DeleteChildren() { while (_firstChild) { - TIXMLASSERT(_firstChild->_document == _document); - XMLNode* node = _firstChild; - Unlink(node); - - DeleteNode(node); + TIXMLASSERT(_lastChild); + DeleteChild(_firstChild); } _firstChild = _lastChild = 0; } @@ -675,6 +806,7 @@ namespace tinyxml2 { TIXMLASSERT(child); TIXMLASSERT(child->_document == _document); + TIXMLASSERT(child->_parent == this); if (child == _firstChild) { _firstChild = _firstChild->_next; } @@ -688,6 +820,8 @@ namespace tinyxml2 if (child->_next) { child->_next->_prev = child->_prev; } + child->_next = 0; + child->_prev = 0; child->_parent = 0; } @@ -697,6 +831,10 @@ namespace tinyxml2 TIXMLASSERT(node); TIXMLASSERT(node->_document == _document); TIXMLASSERT(node->_parent == this); + Unlink(node); + TIXMLASSERT(node->_prev == 0); + TIXMLASSERT(node->_next == 0); + TIXMLASSERT(node->_parent == 0); DeleteNode(node); } @@ -776,6 +914,13 @@ namespace tinyxml2 TIXMLASSERT(false); return 0; } + if (afterThis == addThis) { + // Current state: BeforeThis -> AddThis -> OneAfterAddThis 
+ // Now AddThis must disappear from it's location and then + // reappear between BeforeThis and OneAfterAddThis. + // So just leave it where it is. + return addThis; + } if (afterThis->_next == 0) { // The last node or the only node. @@ -793,40 +938,35 @@ namespace tinyxml2 - const XMLElement* XMLNode::FirstChildElement(const char* value) const + const XMLElement* XMLNode::FirstChildElement(const char* name) const { - for (XMLNode* node = _firstChild; node; node = node->_next) { - XMLElement* element = node->ToElement(); + for (const XMLNode* node = _firstChild; node; node = node->_next) { + const XMLElement* element = node->ToElementWithName(name); if (element) { - if (!value || XMLUtil::StringEqual(element->Name(), value)) { - return element; - } + return element; } } return 0; } - const XMLElement* XMLNode::LastChildElement(const char* value) const + const XMLElement* XMLNode::LastChildElement(const char* name) const { - for (XMLNode* node = _lastChild; node; node = node->_prev) { - XMLElement* element = node->ToElement(); + for (const XMLNode* node = _lastChild; node; node = node->_prev) { + const XMLElement* element = node->ToElementWithName(name); if (element) { - if (!value || XMLUtil::StringEqual(element->Name(), value)) { - return element; - } + return element; } } return 0; } - const XMLElement* XMLNode::NextSiblingElement(const char* value) const + const XMLElement* XMLNode::NextSiblingElement(const char* name) const { - for (XMLNode* node = this->_next; node; node = node->_next) { - const XMLElement* element = node->ToElement(); - if (element - && (!value || XMLUtil::StringEqual(value, node->Value()))) { + for (const XMLNode* node = _next; node; node = node->_next) { + const XMLElement* element = node->ToElementWithName(name); + if (element) { return element; } } @@ -834,12 +974,11 @@ namespace tinyxml2 } - const XMLElement* XMLNode::PreviousSiblingElement(const char* value) const + const XMLElement* XMLNode::PreviousSiblingElement(const char* name) 
const { - for (XMLNode* node = _prev; node; node = node->_prev) { - const XMLElement* element = node->ToElement(); - if (element - && (!value || XMLUtil::StringEqual(value, node->Value()))) { + for (const XMLNode* node = _prev; node; node = node->_prev) { + const XMLElement* element = node->ToElementWithName(name); + if (element) { return element; } } @@ -847,7 +986,7 @@ namespace tinyxml2 } - char* XMLNode::ParseDeep(char* p, StrPair* parentEnd) + char* XMLNode::ParseDeep(char* p, StrPair* parentEndTag, int* curLineNumPtr) { // This is a recursive method, but thinking about it "at the current level" // it is a pretty simple flat list: @@ -870,26 +1009,50 @@ namespace tinyxml2 XMLNode* node = 0; p = _document->Identify(p, &node); + TIXMLASSERT(p); if (node == 0) { break; } + int initialLineNum = node->_parseLineNum; + StrPair endTag; - p = node->ParseDeep(p, &endTag); + p = node->ParseDeep(p, &endTag, curLineNumPtr); if (!p) { DeleteNode(node); if (!_document->Error()) { - _document->SetError(XML_ERROR_PARSING, 0, 0); + _document->SetError(XML_ERROR_PARSING, initialLineNum, 0); } break; } + XMLDeclaration* decl = node->ToDeclaration(); + if (decl) { + // Declarations are only allowed at document level + bool wellLocated = (ToDocument() != 0); + if (wellLocated) { + // Multiple declarations are allowed but all declarations + // must occur before anything else + for (const XMLNode* existingNode = _document->FirstChild(); existingNode; existingNode = existingNode->NextSibling()) { + if (!existingNode->ToDeclaration()) { + wellLocated = false; + break; + } + } + } + if (!wellLocated) { + _document->SetError(XML_ERROR_PARSING_DECLARATION, initialLineNum, "XMLDeclaration value=%s", decl->Value()); + DeleteNode(node); + break; + } + } + XMLElement* ele = node->ToElement(); if (ele) { // We read the end tag. Return it to the parent. 
if (ele->ClosingType() == XMLElement::CLOSING) { - if (parentEnd) { - ele->_value.TransferTo(parentEnd); + if (parentEndTag) { + ele->_value.TransferTo(parentEndTag); } node->_memPool->SetTracked(); // created and then immediately deleted. DeleteNode(node); @@ -908,12 +1071,12 @@ namespace tinyxml2 if (ele->ClosingType() != XMLElement::OPEN) { mismatch = true; } - else if (!XMLUtil::StringEqual(endTag.GetStr(), node->Value())) { + else if (!XMLUtil::StringEqual(endTag.GetStr(), ele->Name())) { mismatch = true; } } if (mismatch) { - _document->SetError(XML_ERROR_MISMATCHED_ELEMENT, node->Value(), 0); + _document->SetError(XML_ERROR_MISMATCHED_ELEMENT, initialLineNum, "XMLElement name=%s", ele->Name()); DeleteNode(node); break; } @@ -923,11 +1086,16 @@ namespace tinyxml2 return 0; } - void XMLNode::DeleteNode(XMLNode* node) + /*static*/ void XMLNode::DeleteNode(XMLNode* node) { if (node == 0) { return; } + TIXMLASSERT(node->_document); + if (!node->ToDocument()) { + node->_document->MarkInUse(node); + } + MemPool* pool = node->_memPool; node->~XMLNode(); pool->Free(node); @@ -938,35 +1106,52 @@ namespace tinyxml2 TIXMLASSERT(insertThis); TIXMLASSERT(insertThis->_document == _document); - if (insertThis->_parent) + if (insertThis->_parent) { insertThis->_parent->Unlink(insertThis); - else + } + else { + insertThis->_document->MarkInUse(insertThis); insertThis->_memPool->SetTracked(); + } + } + + const XMLElement* XMLNode::ToElementWithName(const char* name) const + { + const XMLElement* element = this->ToElement(); + if (element == 0) { + return 0; + } + if (name == 0) { + return element; + } + if (XMLUtil::StringEqual(element->Name(), name)) { + return element; + } + return 0; } // --------- XMLText ---------- // - char* XMLText::ParseDeep(char* p, StrPair*) + char* XMLText::ParseDeep(char* p, StrPair*, int* curLineNumPtr) { - const char* start = p; if (this->CData()) { - p = _value.ParseText(p, "]]>", StrPair::NEEDS_NEWLINE_NORMALIZATION); + p = _value.ParseText(p, 
"]]>", StrPair::NEEDS_NEWLINE_NORMALIZATION, curLineNumPtr); if (!p) { - _document->SetError(XML_ERROR_PARSING_CDATA, start, 0); + _document->SetError(XML_ERROR_PARSING_CDATA, _parseLineNum, 0); } return p; } else { int flags = _document->ProcessEntities() ? StrPair::TEXT_ELEMENT : StrPair::TEXT_ELEMENT_LEAVE_ENTITIES; if (_document->WhitespaceMode() == COLLAPSE_WHITESPACE) { - flags |= StrPair::COLLAPSE_WHITESPACE; + flags |= StrPair::NEEDS_WHITESPACE_COLLAPSING; } - p = _value.ParseText(p, "<", flags); + p = _value.ParseText(p, "<", flags, curLineNumPtr); if (p && *p) { return p - 1; } if (!p) { - _document->SetError(XML_ERROR_PARSING_TEXT, start, 0); + _document->SetError(XML_ERROR_PARSING_TEXT, _parseLineNum, 0); } } return 0; @@ -986,6 +1171,7 @@ namespace tinyxml2 bool XMLText::ShallowEqual(const XMLNode* compare) const { + TIXMLASSERT(compare); const XMLText* text = compare->ToText(); return (text && XMLUtil::StringEqual(text->Value(), Value())); } @@ -1010,13 +1196,12 @@ namespace tinyxml2 } - char* XMLComment::ParseDeep(char* p, StrPair*) + char* XMLComment::ParseDeep(char* p, StrPair*, int* curLineNumPtr) { // Comment parses as text. - const char* start = p; - p = _value.ParseText(p, "-->", StrPair::COMMENT); + p = _value.ParseText(p, "-->", StrPair::COMMENT, curLineNumPtr); if (p == 0) { - _document->SetError(XML_ERROR_PARSING_COMMENT, start, 0); + _document->SetError(XML_ERROR_PARSING_COMMENT, _parseLineNum, 0); } return p; } @@ -1060,13 +1245,12 @@ namespace tinyxml2 } - char* XMLDeclaration::ParseDeep(char* p, StrPair*) + char* XMLDeclaration::ParseDeep(char* p, StrPair*, int* curLineNumPtr) { // Declaration parses as text. 
- const char* start = p; - p = _value.ParseText(p, "?>", StrPair::NEEDS_NEWLINE_NORMALIZATION); + p = _value.ParseText(p, "?>", StrPair::NEEDS_NEWLINE_NORMALIZATION, curLineNumPtr); if (p == 0) { - _document->SetError(XML_ERROR_PARSING_DECLARATION, start, 0); + _document->SetError(XML_ERROR_PARSING_DECLARATION, _parseLineNum, 0); } return p; } @@ -1109,14 +1293,12 @@ namespace tinyxml2 } - char* XMLUnknown::ParseDeep(char* p, StrPair*) + char* XMLUnknown::ParseDeep(char* p, StrPair*, int* curLineNumPtr) { // Unknown parses as text. - const char* start = p; - - p = _value.ParseText(p, ">", StrPair::NEEDS_NEWLINE_NORMALIZATION); + p = _value.ParseText(p, ">", StrPair::NEEDS_NEWLINE_NORMALIZATION, curLineNumPtr); if (!p) { - _document->SetError(XML_ERROR_PARSING_UNKNOWN, start, 0); + _document->SetError(XML_ERROR_PARSING_UNKNOWN, _parseLineNum, 0); } return p; } @@ -1158,7 +1340,7 @@ namespace tinyxml2 return _value.GetStr(); } - char* XMLAttribute::ParseDeep(char* p, bool processEntities) + char* XMLAttribute::ParseDeep(char* p, bool processEntities, int* curLineNumPtr) { // Parse using the name rules: bug fix, was using ParseText before p = _name.ParseName(p); @@ -1167,13 +1349,13 @@ namespace tinyxml2 } // Skip white space before = - p = XMLUtil::SkipWhiteSpace(p); + p = XMLUtil::SkipWhiteSpace(p, curLineNumPtr); if (*p != '=') { return 0; } ++p; // move up to opening quote - p = XMLUtil::SkipWhiteSpace(p); + p = XMLUtil::SkipWhiteSpace(p, curLineNumPtr); if (*p != '\"' && *p != '\'') { return 0; } @@ -1181,7 +1363,7 @@ namespace tinyxml2 char endTag[2] = { *p, 0 }; ++p; // move past opening quote - p = _value.ParseText(p, endTag, processEntities ? StrPair::ATTRIBUTE_VALUE : StrPair::ATTRIBUTE_VALUE_LEAVE_ENTITIES); + p = _value.ParseText(p, endTag, processEntities ? 
StrPair::ATTRIBUTE_VALUE : StrPair::ATTRIBUTE_VALUE_LEAVE_ENTITIES, curLineNumPtr); return p; } @@ -1195,7 +1377,7 @@ namespace tinyxml2 XMLError XMLAttribute::QueryIntValue(int* value) const { if (XMLUtil::ToInt(Value(), value)) { - return XML_NO_ERROR; + return XML_SUCCESS; } return XML_WRONG_ATTRIBUTE_TYPE; } @@ -1204,7 +1386,16 @@ namespace tinyxml2 XMLError XMLAttribute::QueryUnsignedValue(unsigned int* value) const { if (XMLUtil::ToUnsigned(Value(), value)) { - return XML_NO_ERROR; + return XML_SUCCESS; + } + return XML_WRONG_ATTRIBUTE_TYPE; + } + + + XMLError XMLAttribute::QueryInt64Value(int64_t* value) const + { + if (XMLUtil::ToInt64(Value(), value)) { + return XML_SUCCESS; } return XML_WRONG_ATTRIBUTE_TYPE; } @@ -1213,7 +1404,7 @@ namespace tinyxml2 XMLError XMLAttribute::QueryBoolValue(bool* value) const { if (XMLUtil::ToBool(Value(), value)) { - return XML_NO_ERROR; + return XML_SUCCESS; } return XML_WRONG_ATTRIBUTE_TYPE; } @@ -1222,7 +1413,7 @@ namespace tinyxml2 XMLError XMLAttribute::QueryFloatValue(float* value) const { if (XMLUtil::ToFloat(Value(), value)) { - return XML_NO_ERROR; + return XML_SUCCESS; } return XML_WRONG_ATTRIBUTE_TYPE; } @@ -1231,7 +1422,7 @@ namespace tinyxml2 XMLError XMLAttribute::QueryDoubleValue(double* value) const { if (XMLUtil::ToDouble(Value(), value)) { - return XML_NO_ERROR; + return XML_SUCCESS; } return XML_WRONG_ATTRIBUTE_TYPE; } @@ -1259,6 +1450,15 @@ namespace tinyxml2 } + void XMLAttribute::SetAttribute(int64_t v) + { + char buf[BUF_SIZE]; + XMLUtil::ToStr(v, buf, BUF_SIZE); + _value.SetStr(buf); + } + + + void XMLAttribute::SetAttribute(bool v) { char buf[BUF_SIZE]; @@ -1283,7 +1483,7 @@ namespace tinyxml2 // --------- XMLElement ---------- // XMLElement::XMLElement(XMLDocument* doc) : XMLNode(doc), - _closingType(0), + _closingType(OPEN), _rootAttribute(0) { } @@ -1322,6 +1522,47 @@ namespace tinyxml2 return 0; } + int XMLElement::IntAttribute(const char* name, int defaultValue) const + { + int i = 
defaultValue; + QueryIntAttribute(name, &i); + return i; + } + + unsigned XMLElement::UnsignedAttribute(const char* name, unsigned defaultValue) const + { + unsigned i = defaultValue; + QueryUnsignedAttribute(name, &i); + return i; + } + + int64_t XMLElement::Int64Attribute(const char* name, int64_t defaultValue) const + { + int64_t i = defaultValue; + QueryInt64Attribute(name, &i); + return i; + } + + bool XMLElement::BoolAttribute(const char* name, bool defaultValue) const + { + bool b = defaultValue; + QueryBoolAttribute(name, &b); + return b; + } + + double XMLElement::DoubleAttribute(const char* name, double defaultValue) const + { + double d = defaultValue; + QueryDoubleAttribute(name, &d); + return d; + } + + float XMLElement::FloatAttribute(const char* name, float defaultValue) const + { + float f = defaultValue; + QueryFloatAttribute(name, &f); + return f; + } const char* XMLElement::GetText() const { @@ -1359,6 +1600,14 @@ namespace tinyxml2 } + void XMLElement::SetText(int64_t v) + { + char buf[BUF_SIZE]; + XMLUtil::ToStr(v, buf, BUF_SIZE); + SetText(buf); + } + + void XMLElement::SetText(bool v) { char buf[BUF_SIZE]; @@ -1409,6 +1658,19 @@ namespace tinyxml2 } + XMLError XMLElement::QueryInt64Text(int64_t* ival) const + { + if (FirstChild() && FirstChild()->ToText()) { + const char* t = FirstChild()->Value(); + if (XMLUtil::ToInt64(t, ival)) { + return XML_SUCCESS; + } + return XML_CAN_NOT_CONVERT_TEXT; + } + return XML_NO_TEXT_NODE; + } + + XMLError XMLElement::QueryBoolText(bool* bval) const { if (FirstChild() && FirstChild()->ToText()) { @@ -1447,6 +1709,47 @@ namespace tinyxml2 return XML_NO_TEXT_NODE; } + int XMLElement::IntText(int defaultValue) const + { + int i = defaultValue; + QueryIntText(&i); + return i; + } + + unsigned XMLElement::UnsignedText(unsigned defaultValue) const + { + unsigned i = defaultValue; + QueryUnsignedText(&i); + return i; + } + + int64_t XMLElement::Int64Text(int64_t defaultValue) const + { + int64_t i = defaultValue; + 
QueryInt64Text(&i); + return i; + } + + bool XMLElement::BoolText(bool defaultValue) const + { + bool b = defaultValue; + QueryBoolText(&b); + return b; + } + + double XMLElement::DoubleText(double defaultValue) const + { + double d = defaultValue; + QueryDoubleText(&d); + return d; + } + + float XMLElement::FloatText(float defaultValue) const + { + float f = defaultValue; + QueryFloatText(&f); + return f; + } XMLAttribute* XMLElement::FindOrCreateAttribute(const char* name) @@ -1461,17 +1764,17 @@ namespace tinyxml2 } } if (!attrib) { - TIXMLASSERT(sizeof(XMLAttribute) == _document->_attributePool.ItemSize()); - attrib = new (_document->_attributePool.Alloc()) XMLAttribute(); - attrib->_memPool = &_document->_attributePool; + attrib = CreateAttribute(); + TIXMLASSERT(attrib); if (last) { + TIXMLASSERT(last->_next == 0); last->_next = attrib; } else { + TIXMLASSERT(_rootAttribute == 0); _rootAttribute = attrib; } attrib->SetName(name); - attrib->_memPool->SetTracked(); // always created and linked. } return attrib; } @@ -1496,30 +1799,30 @@ namespace tinyxml2 } - char* XMLElement::ParseAttributes(char* p) + char* XMLElement::ParseAttributes(char* p, int* curLineNumPtr) { - const char* start = p; XMLAttribute* prevAttribute = 0; // Read the attributes. while (p) { - p = XMLUtil::SkipWhiteSpace(p); + p = XMLUtil::SkipWhiteSpace(p, curLineNumPtr); if (!(*p)) { - _document->SetError(XML_ERROR_PARSING_ELEMENT, start, Name()); + _document->SetError(XML_ERROR_PARSING_ELEMENT, _parseLineNum, "XMLElement name=%s", Name()); return 0; } // attribute. 
if (XMLUtil::IsNameStartChar(*p)) { - TIXMLASSERT(sizeof(XMLAttribute) == _document->_attributePool.ItemSize()); - XMLAttribute* attrib = new (_document->_attributePool.Alloc()) XMLAttribute(); - attrib->_memPool = &_document->_attributePool; - attrib->_memPool->SetTracked(); + XMLAttribute* attrib = CreateAttribute(); + TIXMLASSERT(attrib); + attrib->_parseLineNum = _document->_parseCurLineNum; - p = attrib->ParseDeep(p, _document->ProcessEntities()); + int attrLineNum = attrib->_parseLineNum; + + p = attrib->ParseDeep(p, _document->ProcessEntities(), curLineNumPtr); if (!p || Attribute(attrib->Name())) { DeleteAttribute(attrib); - _document->SetError(XML_ERROR_PARSING_ATTRIBUTE, start, p); + _document->SetError(XML_ERROR_PARSING_ATTRIBUTE, attrLineNum, "XMLElement name=%s", Name()); return 0; } // There is a minor bug here: if the attribute in the source xml @@ -1528,25 +1831,27 @@ namespace tinyxml2 // avoids re-scanning the attribute list. Preferring performance for // now, may reconsider in the future. if (prevAttribute) { + TIXMLASSERT(prevAttribute->_next == 0); prevAttribute->_next = attrib; } else { + TIXMLASSERT(_rootAttribute == 0); _rootAttribute = attrib; } prevAttribute = attrib; } // end of the tag - else if (*p == '/' && *(p + 1) == '>') { - _closingType = CLOSED; - return p + 2; // done; sealed element. - } - // end of the tag else if (*p == '>') { ++p; break; } + // end of the tag + else if (*p == '/' && *(p + 1) == '>') { + _closingType = CLOSED; + return p + 2; // done; sealed element. 
+ } else { - _document->SetError(XML_ERROR_PARSING_ELEMENT, start, p); + _document->SetError(XML_ERROR_PARSING_ELEMENT, _parseLineNum, 0); return 0; } } @@ -1563,14 +1868,24 @@ namespace tinyxml2 pool->Free(attribute); } + XMLAttribute* XMLElement::CreateAttribute() + { + TIXMLASSERT(sizeof(XMLAttribute) == _document->_attributePool.ItemSize()); + XMLAttribute* attrib = new (_document->_attributePool.Alloc()) XMLAttribute(); + TIXMLASSERT(attrib); + attrib->_memPool = &_document->_attributePool; + attrib->_memPool->SetTracked(); + return attrib; + } + // // // foobar // - char* XMLElement::ParseDeep(char* p, StrPair* strPair) + char* XMLElement::ParseDeep(char* p, StrPair* parentEndTag, int* curLineNumPtr) { // Read the element name. - p = XMLUtil::SkipWhiteSpace(p); + p = XMLUtil::SkipWhiteSpace(p, curLineNumPtr); // The closing element is the form. It is // parsed just like a regular element then deleted from @@ -1585,12 +1900,12 @@ namespace tinyxml2 return 0; } - p = ParseAttributes(p); - if (!p || !*p || _closingType) { + p = ParseAttributes(p, curLineNumPtr); + if (!p || !*p || _closingType != OPEN) { return p; } - p = XMLNode::ParseDeep(p, strPair); + p = XMLNode::ParseDeep(p, parentEndTag, curLineNumPtr); return p; } @@ -1613,7 +1928,7 @@ namespace tinyxml2 { TIXMLASSERT(compare); const XMLElement* other = compare->ToElement(); - if (other && XMLUtil::StringEqual(other->Value(), Value())) { + if (other && XMLUtil::StringEqual(other->Name(), Name())) { const XMLAttribute* a = FirstAttribute(); const XMLAttribute* b = other->FirstAttribute(); @@ -1659,10 +1974,10 @@ namespace tinyxml2 "XML_ERROR_FILE_NOT_FOUND", "XML_ERROR_FILE_COULD_NOT_BE_OPENED", "XML_ERROR_FILE_READ_ERROR", - "XML_ERROR_ELEMENT_MISMATCH", + "UNUSED_XML_ERROR_ELEMENT_MISMATCH", "XML_ERROR_PARSING_ELEMENT", "XML_ERROR_PARSING_ATTRIBUTE", - "XML_ERROR_IDENTIFYING_TAG", + "UNUSED_XML_ERROR_IDENTIFYING_TAG", "XML_ERROR_PARSING_TEXT", "XML_ERROR_PARSING_CDATA", "XML_ERROR_PARSING_COMMENT", @@ 
-1676,17 +1991,24 @@ namespace tinyxml2 }; - XMLDocument::XMLDocument(bool processEntities, Whitespace whitespace) : + XMLDocument::XMLDocument(bool processEntities, Whitespace whitespaceMode) : XMLNode(0), _writeBOM(false), _processEntities(processEntities), - _errorID(XML_NO_ERROR), - _whitespace(whitespace), - _errorStr1(0), - _errorStr2(0), - _charBuffer(0) + _errorID(XML_SUCCESS), + _whitespaceMode(whitespaceMode), + _errorStr(), + _errorLineNum(0), + _charBuffer(0), + _parseCurLineNum(0), + _unlinked(), + _elementPool(), + _attributePool(), + _textPool(), + _commentPool() { - _document = this; // avoid warning about 'this' in initializer list + // avoid VC++ C4355 warning about 'this' in initializer list (C4355 is off by default in VS2012+) + _document = this; } @@ -1696,16 +2018,30 @@ namespace tinyxml2 } + void XMLDocument::MarkInUse(XMLNode* node) + { + TIXMLASSERT(node); + TIXMLASSERT(node->_parent == 0); + + for (int i = 0; i < _unlinked.Size(); ++i) { + if (node == _unlinked[i]) { + _unlinked.SwapRemove(i); + break; + } + } + } + void XMLDocument::Clear() { DeleteChildren(); + while (_unlinked.Size()) { + DeleteNode(_unlinked[0]); // Will remove from _unlinked as part of delete. + } -#ifdef DEBUG +#ifdef TINYXML2_DEBUG const bool hadError = Error(); #endif - _errorID = XML_NO_ERROR; - _errorStr1 = 0; - _errorStr2 = 0; + ClearError(); delete[] _charBuffer; _charBuffer = 0; @@ -1717,7 +2053,7 @@ namespace tinyxml2 _attributePool.Trace("attribute"); #endif -#ifdef DEBUG +#ifdef TINYXML2_DEBUG if (!hadError) { TIXMLASSERT(_elementPool.CurrentAllocs() == _elementPool.Untracked()); TIXMLASSERT(_attributePool.CurrentAllocs() == _attributePool.Untracked()); @@ -1728,11 +2064,22 @@ namespace tinyxml2 } + void XMLDocument::DeepCopy(XMLDocument* target) const + { + TIXMLASSERT(target); + if (target == this) { + return; // technically success - a no-op. 
+ } + + target->Clear(); + for (const XMLNode* node = this->FirstChild(); node; node = node->NextSibling()) { + target->InsertEndChild(node->DeepClone(target)); + } + } + XMLElement* XMLDocument::NewElement(const char* name) { - TIXMLASSERT(sizeof(XMLElement) == _elementPool.ItemSize()); - XMLElement* ele = new (_elementPool.Alloc()) XMLElement(this); - ele->_memPool = &_elementPool; + XMLElement* ele = CreateUnlinkedNode(_elementPool); ele->SetName(name); return ele; } @@ -1740,9 +2087,7 @@ namespace tinyxml2 XMLComment* XMLDocument::NewComment(const char* str) { - TIXMLASSERT(sizeof(XMLComment) == _commentPool.ItemSize()); - XMLComment* comment = new (_commentPool.Alloc()) XMLComment(this); - comment->_memPool = &_commentPool; + XMLComment* comment = CreateUnlinkedNode(_commentPool); comment->SetValue(str); return comment; } @@ -1750,9 +2095,7 @@ namespace tinyxml2 XMLText* XMLDocument::NewText(const char* str) { - TIXMLASSERT(sizeof(XMLText) == _textPool.ItemSize()); - XMLText* text = new (_textPool.Alloc()) XMLText(this); - text->_memPool = &_textPool; + XMLText* text = CreateUnlinkedNode(_textPool); text->SetValue(str); return text; } @@ -1760,9 +2103,7 @@ namespace tinyxml2 XMLDeclaration* XMLDocument::NewDeclaration(const char* str) { - TIXMLASSERT(sizeof(XMLDeclaration) == _commentPool.ItemSize()); - XMLDeclaration* dec = new (_commentPool.Alloc()) XMLDeclaration(this); - dec->_memPool = &_commentPool; + XMLDeclaration* dec = CreateUnlinkedNode(_commentPool); dec->SetValue(str ? 
str : "xml version=\"1.0\" encoding=\"UTF-8\""); return dec; } @@ -1770,9 +2111,7 @@ namespace tinyxml2 XMLUnknown* XMLDocument::NewUnknown(const char* str) { - TIXMLASSERT(sizeof(XMLUnknown) == _commentPool.ItemSize()); - XMLUnknown* unk = new (_commentPool.Alloc()) XMLUnknown(this); - unk->_memPool = &_commentPool; + XMLUnknown* unk = CreateUnlinkedNode(_commentPool); unk->SetValue(str); return unk; } @@ -1816,7 +2155,7 @@ namespace tinyxml2 Clear(); FILE* fp = callfopen(filename, "rb"); if (!fp) { - SetError(XML_ERROR_FILE_NOT_FOUND, filename, 0); + SetError(XML_ERROR_FILE_NOT_FOUND, 0, "filename=%s", filename ? filename : ""); return _errorID; } LoadFile(fp); @@ -1824,6 +2163,28 @@ namespace tinyxml2 return _errorID; } + // This is likely overengineered template art to have a check that unsigned long value incremented + // by one still fits into size_t. If size_t type is larger than unsigned long type + // (x86_64-w64-mingw32 target) then the check is redundant and gcc and clang emit + // -Wtype-limits warning. This piece makes the compiler select code with a check when a check + // is useful and code with no check when a check is redundant depending on how size_t and unsigned long + // types sizes relate to each other. 
+ template + = sizeof(size_t))> + struct LongFitsIntoSizeTMinusOne { + static bool Fits(unsigned long value) + { + return value < (size_t)-1; + } + }; + + template <> + struct LongFitsIntoSizeTMinusOne { + static bool Fits(unsigned long) + { + return true; + } + }; XMLError XMLDocument::LoadFile(FILE* fp) { @@ -1842,13 +2203,21 @@ namespace tinyxml2 SetError(XML_ERROR_FILE_READ_ERROR, 0, 0); return _errorID; } + TIXMLASSERT(filelength >= 0); - const size_t size = filelength; - if (size == 0) { + if (!LongFitsIntoSizeTMinusOne<>::Fits(filelength)) { + // Cannot handle files which won't fit in buffer together with null terminator + SetError(XML_ERROR_FILE_READ_ERROR, 0, 0); + return _errorID; + } + + if (filelength == 0) { SetError(XML_ERROR_EMPTY_DOCUMENT, 0, 0); return _errorID; } + const size_t size = filelength; + TIXMLASSERT(_charBuffer == 0); _charBuffer = new char[size + 1]; size_t read = fread(_charBuffer, 1, size, fp); if (read != size) { @@ -1867,7 +2236,7 @@ namespace tinyxml2 { FILE* fp = callfopen(filename, "w"); if (!fp) { - SetError(XML_ERROR_FILE_COULD_NOT_BE_OPENED, filename, 0); + SetError(XML_ERROR_FILE_COULD_NOT_BE_OPENED, 0, "filename=%s", filename ? filename : ""); return _errorID; } SaveFile(fp, compact); @@ -1878,6 +2247,9 @@ namespace tinyxml2 XMLError XMLDocument::SaveFile(FILE* fp, bool compact) { + // Clear any error from the last save, otherwise it will get reported + // for *this* call. 
+ ClearError(); XMLPrinter stream(fp, compact); Print(&stream); return _errorID; @@ -1895,6 +2267,7 @@ namespace tinyxml2 if (len == (size_t)(-1)) { len = strlen(p); } + TIXMLASSERT(_charBuffer == 0); _charBuffer = new char[len + 1]; memcpy(_charBuffer, p, len); _charBuffer[len] = 0; @@ -1916,78 +2289,103 @@ namespace tinyxml2 void XMLDocument::Print(XMLPrinter* streamer) const { - XMLPrinter stdStreamer(stdout); - if (!streamer) { - streamer = &stdStreamer; + if (streamer) { + Accept(streamer); + } + else { + XMLPrinter stdoutStreamer(stdout); + Accept(&stdoutStreamer); } - Accept(streamer); } - void XMLDocument::SetError(XMLError error, const char* str1, const char* str2) + void XMLDocument::SetError(XMLError error, int lineNum, const char* format, ...) { TIXMLASSERT(error >= 0 && error < XML_ERROR_COUNT); _errorID = error; - _errorStr1 = str1; - _errorStr2 = str2; + _errorLineNum = lineNum; + _errorStr.Reset(); + + size_t BUFFER_SIZE = 1000; + char* buffer = new char[BUFFER_SIZE]; + + TIXML_SNPRINTF(buffer, BUFFER_SIZE, "Error=%s ErrorID=%d (0x%x) Line number=%d", ErrorIDToName(error), int(error), int(error), lineNum); + + if (format) { + size_t len = strlen(buffer); + TIXML_SNPRINTF(buffer + len, BUFFER_SIZE - len, ": "); + len = strlen(buffer); + + va_list va; + va_start(va, format); + TIXML_VSNPRINTF(buffer + len, BUFFER_SIZE - len, format, va); + va_end(va); + } + _errorStr.SetStr(buffer); + delete[] buffer; } - const char* XMLDocument::ErrorName() const + + /*static*/ const char* XMLDocument::ErrorIDToName(XMLError errorID) { - TIXMLASSERT(_errorID >= 0 && _errorID < XML_ERROR_COUNT); - return _errorNames[_errorID]; + TIXMLASSERT(errorID >= 0 && errorID < XML_ERROR_COUNT); + const char* errorName = _errorNames[errorID]; + TIXMLASSERT(errorName && errorName[0]); + return errorName; } - void XMLDocument::PrintError() const + const char* XMLDocument::ErrorStr() const { - if (Error()) { - static const int LEN = 20; - char buf1[LEN] = { 0 }; - char buf2[LEN] = { 
0 }; + return _errorStr.Empty() ? "" : _errorStr.GetStr(); + } - if (_errorStr1) { - TIXML_SNPRINTF(buf1, LEN, "%s", _errorStr1); - } - if (_errorStr2) { - TIXML_SNPRINTF(buf2, LEN, "%s", _errorStr2); - } - printf("XMLDocument error id=%d '%s' str1=%s str2=%s\n", - _errorID, ErrorName(), buf1, buf2); - } + void XMLDocument::PrintError() const + { + printf("%s\n", ErrorStr()); + } + + const char* XMLDocument::ErrorName() const + { + return ErrorIDToName(_errorID); } void XMLDocument::Parse() { TIXMLASSERT(NoChildren()); // Clear() must have been called previously TIXMLASSERT(_charBuffer); + _parseCurLineNum = 1; + _parseLineNum = 1; char* p = _charBuffer; - p = XMLUtil::SkipWhiteSpace(p); + p = XMLUtil::SkipWhiteSpace(p, &_parseCurLineNum); p = const_cast(XMLUtil::ReadBOM(p, &_writeBOM)); if (!*p) { SetError(XML_ERROR_EMPTY_DOCUMENT, 0, 0); return; } - ParseDeep(p, 0); + ParseDeep(p, 0, &_parseCurLineNum); } XMLPrinter::XMLPrinter(FILE* file, bool compact, int depth) : _elementJustOpened(false), + _stack(), _firstElement(true), _fp(file), _depth(depth), _textDepth(-1), _processEntities(true), - _compactMode(compact) + _compactMode(compact), + _buffer() { - for (int i = 0; i < ENTITY_RANGE; ++i) { + for (int i = 0; i= 1400 ) -#if defined(WINCE) - int len = 512; - do { - len = len * 2; - char* str = new char[len](); - len = _vsnprintf(str, len, format, va); - delete[] str; - } while (len < 0); -#else - int len = _vscprintf(format, va); -#endif -#else - int len = vsnprintf(0, 0, format, va); -#endif + const int len = TIXML_VSCPRINTF(format, va); // Close out and re-start the va-args va_end(va); + TIXMLASSERT(len >= 0); va_start(va, format); TIXMLASSERT(_buffer.Size() > 0 && _buffer[_buffer.Size() - 1] == 0); char* p = _buffer.PushArr(len) - 1; // back up over the null terminator. 
-#if defined(_MSC_VER) && (_MSC_VER >= 1400 ) -#if defined(WINCE) - _vsnprintf(p, len + 1, format, va); -#else - vsnprintf_s(p, len + 1, _TRUNCATE, format, va); -#endif -#else - vsnprintf(p, len + 1, format, va); -#endif + TIXML_VSNPRINTF(p, len + 1, format, va); } va_end(va); } + void XMLPrinter::Write(const char* data, size_t size) + { + if (_fp) { + fwrite(data, sizeof(char), size, _fp); + } + else { + char* p = _buffer.PushArr(static_cast(size)) - 1; // back up over the null terminator. + memcpy(p, data, size); + p[size] = 0; + } + } + + + void XMLPrinter::Putc(char ch) + { + if (_fp) { + fputc(ch, _fp); + } + else { + char* p = _buffer.PushArr(sizeof(char)) - 1; // back up over the null terminator. + p[0] = ch; + p[1] = 0; + } + } + + void XMLPrinter::PrintSpace(int depth) { - for (int i = 0; i < depth; ++i) { - Print(" "); + for (int i = 0; i 0 && *q < ENTITY_RANGE) { // Check for entities. If one is found, flush @@ -2062,25 +2466,39 @@ namespace tinyxml2 // entity, and keep looking. if (flag[(unsigned char)(*q)]) { while (p < q) { - Print("%c", *p); - ++p; + const size_t delta = q - p; + const int toPrint = (INT_MAX < delta) ? INT_MAX : (int)delta; + Write(p, toPrint); + p += toPrint; } - for (int i = 0; i < NUM_ENTITIES; ++i) { + bool entityPatternPrinted = false; + for (int i = 0; i 0)) { - Print("%s", p); + TIXMLASSERT(p <= q); + if (!_processEntities || (p < q)) { + const size_t delta = q - p; + const int toPrint = (INT_MAX < delta) ? 
INT_MAX : (int)delta; + Write(p, toPrint); } } @@ -2089,7 +2507,7 @@ namespace tinyxml2 { if (writeBOM) { static const unsigned char bom[] = { TIXML_UTF_LEAD_0, TIXML_UTF_LEAD_1, TIXML_UTF_LEAD_2, 0 }; - Print("%s", bom); + Write(reinterpret_cast< const char* >(bom)); } if (writeDec) { PushDeclaration("xml version=\"1.0\""); @@ -2103,13 +2521,15 @@ namespace tinyxml2 _stack.Push(name); if (_textDepth < 0 && !_firstElement && !compactMode) { - Print("\n"); + Putc('\n'); } if (!compactMode) { PrintSpace(_depth); } - Print("<%s", name); + Write("<"); + Write(name); + _elementJustOpened = true; _firstElement = false; ++_depth; @@ -2119,9 +2539,11 @@ namespace tinyxml2 void XMLPrinter::PushAttribute(const char* name, const char* value) { TIXMLASSERT(_elementJustOpened); - Print(" %s=\"", name); + Putc(' '); + Write(name); + Write("=\""); PrintString(value, false); - Print("\""); + Putc('\"'); } @@ -2141,6 +2563,14 @@ namespace tinyxml2 } + void XMLPrinter::PushAttribute(const char* name, int64_t v) + { + char buf[BUF_SIZE]; + XMLUtil::ToStr(v, buf, BUF_SIZE); + PushAttribute(name, buf); + } + + void XMLPrinter::PushAttribute(const char* name, bool v) { char buf[BUF_SIZE]; @@ -2163,21 +2593,23 @@ namespace tinyxml2 const char* name = _stack.Pop(); if (_elementJustOpened) { - Print("/>"); + Write("/>"); } else { if (_textDepth < 0 && !compactMode) { - Print("\n"); + Putc('\n'); PrintSpace(_depth); } - Print("", name); + Write(""); } if (_textDepth == _depth) { _textDepth = -1; } if (_depth == 0 && !compactMode) { - Print("\n"); + Putc('\n'); } _elementJustOpened = false; } @@ -2189,7 +2621,7 @@ namespace tinyxml2 return; } _elementJustOpened = false; - Print(">"); + Putc('>'); } @@ -2199,15 +2631,22 @@ namespace tinyxml2 SealElementIfJustOpened(); if (cdata) { - Print(""); + Write(""); } else { PrintString(text, true); } } + void XMLPrinter::PushText(int64_t value) + { + char buf[BUF_SIZE]; + XMLUtil::ToStr(value, buf, BUF_SIZE); + PushText(buf, false); + } + void 
XMLPrinter::PushText(int value) { char buf[BUF_SIZE]; @@ -2252,11 +2691,14 @@ namespace tinyxml2 { SealElementIfJustOpened(); if (_textDepth < 0 && !_firstElement && !_compactMode) { - Print("\n"); + Putc('\n'); PrintSpace(_depth); } _firstElement = false; - Print("", comment); + + Write(""); } @@ -2264,11 +2706,14 @@ namespace tinyxml2 { SealElementIfJustOpened(); if (_textDepth < 0 && !_firstElement && !_compactMode) { - Print("\n"); + Putc('\n'); PrintSpace(_depth); } _firstElement = false; - Print("", value); + + Write(""); } @@ -2276,11 +2721,14 @@ namespace tinyxml2 { SealElementIfJustOpened(); if (_textDepth < 0 && !_firstElement && !_compactMode) { - Print("\n"); + Putc('\n'); PrintSpace(_depth); } _firstElement = false; - Print("", value); + + Write("'); } @@ -2296,8 +2744,11 @@ namespace tinyxml2 bool XMLPrinter::VisitEnter(const XMLElement& element, const XMLAttribute* attribute) { - const XMLElement* parentElem = element.Parent()->ToElement(); - bool compactMode = parentElem ? CompactMode(*parentElem) : _compactMode; + const XMLElement* parentElem = 0; + if (element.Parent()) { + parentElem = element.Parent()->ToElement(); + } + const bool compactMode = parentElem ? CompactMode(*parentElem) : _compactMode; OpenElement(element.Name(), compactMode); while (attribute) { PushAttribute(attribute->Name(), attribute->Value()); @@ -2341,3 +2792,4 @@ namespace tinyxml2 } } // namespace tinyxml2 + diff --git a/src/3rd/Simd/SimdBase_tinyxml2.h b/src/3rd/Simd/SimdBase_tinyxml2.h index bda9804e..ac732fe1 100644 --- a/src/3rd/Simd/SimdBase_tinyxml2.h +++ b/src/3rd/Simd/SimdBase_tinyxml2.h @@ -24,38 +24,38 @@ distribution. 
#ifndef TINYXML2_INCLUDED #define TINYXML2_INCLUDED -#include "Simd/SimdConfig.h" - #if defined(ANDROID_NDK) || defined(__BORLANDC__) || defined(__QNXNTO__) # include # include # include # include # include -# include +# if defined(__PS3__) +# include +# endif #else # include # include # include # include # include -# include #endif +#include /* - TODO: intern strings instead of allocation. +TODO: intern strings instead of allocation. */ /* - gcc: - g++ -Wall -DDEBUG tinyxml2.cpp xmltest.cpp -o gccxmltest.exe +gcc: +g++ -Wall -DTINYXML2_DEBUG tinyxml2.cpp xmltest.cpp -o gccxmltest.exe - Formatting, Artistic Style: - AStyle.exe --style=1tbs --indent-switches --break-closing-brackets --indent-preprocessor tinyxml2.cpp tinyxml2.h +Formatting, Artistic Style: +AStyle.exe --style=1tbs --indent-switches --break-closing-brackets --indent-preprocessor tinyxml2.cpp tinyxml2.h */ -#if defined( _DEBUG ) || defined( DEBUG ) || defined (__DEBUG__) -# ifndef DEBUG -# define DEBUG +#if defined( _DEBUG ) || defined (__DEBUG__) +# ifndef TINYXML2_DEBUG +# define TINYXML2_DEBUG # endif #endif @@ -72,15 +72,17 @@ distribution. # else # define TINYXML2_LIB # endif +#elif __GNUC__ >= 4 +# define TINYXML2_LIB __attribute__((visibility("default"))) #else # define TINYXML2_LIB #endif -#if defined(DEBUG) +#if defined(TINYXML2_DEBUG) # if defined(_MSC_VER) # // "(void)0," is for suppressing C4127 warning in "assert(false)", "assert(true)" and the like -# define TIXMLASSERT( x ) if ( !((void)0,(x))) { __debugbreak(); } //if ( !(x)) WinDebugBreak() +# define TIXMLASSERT( x ) if ( !((void)0,(x))) { __debugbreak(); } # elif defined (ANDROID_NDK) # include # define TIXMLASSERT( x ) if ( !(x)) { __android_log_assert( "assert", "grinliz", "ASSERT in '%s' at %d.", __FILE__, __LINE__ ); } @@ -88,46 +90,22 @@ distribution. 
# include # define TIXMLASSERT assert # endif -# else -# define TIXMLASSERT( x ) {} -#endif - - -#if defined(_MSC_VER) && (_MSC_VER >= 1400 ) && (!defined WINCE) -// Microsoft visual studio, version 2005 and higher. -/*int _snprintf_s( - char *buffer, - size_t sizeOfBuffer, - size_t count, - const char *format [, - argument] ... -);*/ -inline int TIXML_SNPRINTF(char* buffer, size_t size, const char* format, ...) -{ - va_list va; - va_start(va, format); - int result = vsnprintf_s(buffer, size, _TRUNCATE, format, va); - va_end(va); - return result; -} -#define TIXML_SSCANF sscanf_s -#elif defined WINCE -#define TIXML_SNPRINTF _snprintf -#define TIXML_SSCANF sscanf #else -// GCC version 3 and higher -//#warning( "Using sn* functions." ) -#define TIXML_SNPRINTF snprintf -#define TIXML_SSCANF sscanf +# define TIXMLASSERT( x ) {} #endif + /* Versioning, past 1.0.14: - http://semver.org/ +http://semver.org/ */ -static const int TIXML2_MAJOR_VERSION = 3; -static const int TIXML2_MINOR_VERSION = 0; +static const int TIXML2_MAJOR_VERSION = 6; +static const int TIXML2_MINOR_VERSION = 1; static const int TIXML2_PATCH_VERSION = 0; +#define TINYXML2_MAJOR_VERSION 6 +#define TINYXML2_MINOR_VERSION 1 +#define TINYXML2_PATCH_VERSION 0 + namespace tinyxml2 { class XMLDocument; @@ -140,10 +118,10 @@ namespace tinyxml2 class XMLPrinter; /* - A class that wraps strings. Normally stores the start and end - pointers into the XML file itself, and will apply normalization - and entity translation if actually read. Can also store (and memory - manage) a traditional char[] + A class that wraps strings. Normally stores the start and end + pointers into the XML file itself, and will apply normalization + and entity translation if actually read. 
Can also store (and memory + manage) a traditional char[] */ class StrPair { @@ -151,7 +129,7 @@ namespace tinyxml2 enum { NEEDS_ENTITY_PROCESSING = 0x01, NEEDS_NEWLINE_NORMALIZATION = 0x02, - COLLAPSE_WHITESPACE = 0x04, + NEEDS_WHITESPACE_COLLAPSING = 0x04, TEXT_ELEMENT = NEEDS_ENTITY_PROCESSING | NEEDS_NEWLINE_NORMALIZATION, TEXT_ELEMENT_LEAVE_ENTITIES = NEEDS_NEWLINE_NORMALIZATION, @@ -165,6 +143,8 @@ namespace tinyxml2 ~StrPair(); void Set(char* start, char* end, int flags) { + TIXMLASSERT(start); + TIXMLASSERT(end); Reset(); _start = start; _end = end; @@ -184,13 +164,13 @@ namespace tinyxml2 void SetStr(const char* str, int flags = 0); - char* ParseText(char* in, const char* endTag, int strFlags); + char* ParseText(char* in, const char* endTag, int strFlags, int* curLineNumPtr); char* ParseName(char* in); void TransferTo(StrPair* other); + void Reset(); private: - void Reset(); void CollapseWhitespace(); enum { @@ -198,7 +178,6 @@ namespace tinyxml2 NEEDS_DELETE = 0x200 }; - // After parsing, if *_end != 0, it can be set to zero. int _flags; char* _start; char* _end; @@ -209,18 +188,19 @@ namespace tinyxml2 /* - A dynamic array of Plain Old Data. Doesn't support constructors, etc. - Has a small initial memory pool, so that low or no usage will not - cause a call to new/delete + A dynamic array of Plain Old Data. Doesn't support constructors, etc. 
+ Has a small initial memory pool, so that low or no usage will not + cause a call to new/delete */ - template + template class DynArray { public: - DynArray() { - _mem = _pool; - _allocated = INIT; - _size = 0; + DynArray() : + _mem(_pool), + _allocated(INITIAL_SIZE), + _size(0) + { } ~DynArray() { @@ -236,7 +216,8 @@ namespace tinyxml2 void Push(T t) { TIXMLASSERT(_size < INT_MAX); EnsureCapacity(_size + 1); - _mem[_size++] = t; + _mem[_size] = t; + ++_size; } T* PushArr(int count) { @@ -250,7 +231,8 @@ namespace tinyxml2 T Pop() { TIXMLASSERT(_size > 0); - return _mem[--_size]; + --_size; + return _mem[_size]; } void PopArr(int count) { @@ -283,14 +265,24 @@ namespace tinyxml2 } int Capacity() const { + TIXMLASSERT(_allocated >= INITIAL_SIZE); return _allocated; } + void SwapRemove(int i) { + TIXMLASSERT(i >= 0 && i < _size); + TIXMLASSERT(_size > 0); + _mem[i] = _mem[_size - 1]; + --_size; + } + const T* Mem() const { + TIXMLASSERT(_mem); return _mem; } T* Mem() { + TIXMLASSERT(_mem); return _mem; } @@ -304,6 +296,7 @@ namespace tinyxml2 TIXMLASSERT(cap <= INT_MAX / 2); int newAllocated = cap * 2; T* newMem = new T[newAllocated]; + TIXMLASSERT(newAllocated >= _size); memcpy(newMem, _mem, sizeof(T)*_size); // warning: not using constructors, only works for PODs if (_mem != _pool) { delete[] _mem; @@ -314,15 +307,15 @@ namespace tinyxml2 } T* _mem; - T _pool[INIT]; + T _pool[INITIAL_SIZE]; int _allocated; // objects allocated int _size; // number objects in use }; /* - Parent virtual class of a pool for fast allocation - and deallocation of objects. + Parent virtual class of a pool for fast allocation + and deallocation of objects. */ class MemPool { @@ -339,13 +332,13 @@ namespace tinyxml2 /* - Template child class to create pools of the correct type. + Template child class to create pools of the correct type. 
*/ - template< int SIZE > + template< int ITEM_SIZE > class MemPoolT : public MemPool { public: - MemPoolT() : _root(0), _currentAllocs(0), _nAllocs(0), _maxAllocs(0), _nUntracked(0) {} + MemPoolT() : _blockPtrs(), _root(0), _currentAllocs(0), _nAllocs(0), _maxAllocs(0), _nUntracked(0) {} ~MemPoolT() { Clear(); } @@ -353,8 +346,8 @@ namespace tinyxml2 void Clear() { // Delete the blocks. while (!_blockPtrs.Empty()) { - Block* b = _blockPtrs.Pop(); - delete b; + Block* lastBlock = _blockPtrs.Pop(); + delete lastBlock; } _root = 0; _currentAllocs = 0; @@ -364,7 +357,7 @@ namespace tinyxml2 } virtual int ItemSize() const { - return SIZE; + return ITEM_SIZE; } int CurrentAllocs() const { return _currentAllocs; @@ -376,21 +369,23 @@ namespace tinyxml2 Block* block = new Block(); _blockPtrs.Push(block); - for (int i = 0; i < COUNT - 1; ++i) { - block->chunk[i].next = &block->chunk[i + 1]; + Item* blockItems = block->items; + for (int i = 0; i < ITEMS_PER_BLOCK - 1; ++i) { + blockItems[i].next = &(blockItems[i + 1]); } - block->chunk[COUNT - 1].next = 0; - _root = block->chunk; + blockItems[ITEMS_PER_BLOCK - 1].next = 0; + _root = blockItems; } - void* result = _root; + Item* const result = _root; + TIXMLASSERT(result != 0); _root = _root->next; ++_currentAllocs; if (_currentAllocs > _maxAllocs) { _maxAllocs = _currentAllocs; } - _nAllocs++; - _nUntracked++; + ++_nAllocs; + ++_nUntracked; return result; } @@ -399,20 +394,21 @@ namespace tinyxml2 return; } --_currentAllocs; - Chunk* chunk = static_cast(mem); -#ifdef DEBUG - memset(chunk, 0xfe, sizeof(Chunk)); + Item* item = static_cast(mem); +#ifdef TINYXML2_DEBUG + memset(item, 0xfe, sizeof(*item)); #endif - chunk->next = _root; - _root = chunk; + item->next = _root; + _root = item; } void Trace(const char* name) { printf("Mempool %s watermark=%d [%dk] current=%d size=%d nAlloc=%d blocks=%d\n", - name, _maxAllocs, _maxAllocs*SIZE / 1024, _currentAllocs, SIZE, _nAllocs, _blockPtrs.Size()); + name, _maxAllocs, _maxAllocs * 
ITEM_SIZE / 1024, _currentAllocs, + ITEM_SIZE, _nAllocs, _blockPtrs.Size()); } void SetTracked() { - _nUntracked--; + --_nUntracked; } int Untracked() const { @@ -428,21 +424,23 @@ namespace tinyxml2 // 16k: 5200 // 32k: 4300 // 64k: 4000 21000 - enum { COUNT = (4 * 1024) / SIZE }; // Some compilers do not accept to use COUNT in private part if COUNT is private + // Declared public because some compilers do not accept to use ITEMS_PER_BLOCK + // in private part if ITEMS_PER_BLOCK is private + enum { ITEMS_PER_BLOCK = (4 * 1024) / ITEM_SIZE }; private: MemPoolT(const MemPoolT&); // not supported void operator=(const MemPoolT&); // not supported - union Chunk { - Chunk* next; - char mem[SIZE]; + union Item { + Item* next; + char itemData[ITEM_SIZE]; }; struct Block { - Chunk chunk[COUNT]; + Item items[ITEMS_PER_BLOCK]; }; DynArray< Block*, 10 > _blockPtrs; - Chunk* _root; + Item* _root; int _currentAllocs; int _nAllocs; @@ -453,23 +451,23 @@ namespace tinyxml2 /** - Implements the interface to the "Visitor pattern" (see the Accept() method.) - If you call the Accept() method, it requires being passed a XMLVisitor - class to handle callbacks. For nodes that contain other nodes (Document, Element) - you will get called with a VisitEnter/VisitExit pair. Nodes that are always leafs - are simply called with Visit(). + Implements the interface to the "Visitor pattern" (see the Accept() method.) + If you call the Accept() method, it requires being passed a XMLVisitor + class to handle callbacks. For nodes that contain other nodes (Document, Element) + you will get called with a VisitEnter/VisitExit pair. Nodes that are always leafs + are simply called with Visit(). - If you return 'true' from a Visit method, recursive parsing will continue. If you return - false, no children of this node or its siblings will be visited. + If you return 'true' from a Visit method, recursive parsing will continue. 
If you return + false, no children of this node or its siblings will be visited. - All flavors of Visit methods have a default implementation that returns 'true' (continue - visiting). You need to only override methods that are interesting to you. + All flavors of Visit methods have a default implementation that returns 'true' (continue + visiting). You need to only override methods that are interesting to you. - Generally Accept() is called on the XMLDocument, although all nodes support visiting. + Generally Accept() is called on the XMLDocument, although all nodes support visiting. - You should never change the document from a callback. + You should never change the document from a callback. - @sa XMLNode::Accept() + @sa XMLNode::Accept() */ class TINYXML2_LIB XMLVisitor { @@ -515,16 +513,15 @@ namespace tinyxml2 // WARNING: must match XMLDocument::_errorNames[] enum XMLError { XML_SUCCESS = 0, - XML_NO_ERROR = 0, XML_NO_ATTRIBUTE, XML_WRONG_ATTRIBUTE_TYPE, XML_ERROR_FILE_NOT_FOUND, XML_ERROR_FILE_COULD_NOT_BE_OPENED, XML_ERROR_FILE_READ_ERROR, - XML_ERROR_ELEMENT_MISMATCH, + UNUSED_XML_ERROR_ELEMENT_MISMATCH, // remove at next major version XML_ERROR_PARSING_ELEMENT, XML_ERROR_PARSING_ATTRIBUTE, - XML_ERROR_IDENTIFYING_TAG, + UNUSED_XML_ERROR_IDENTIFYING_TAG, // remove at next major version XML_ERROR_PARSING_TEXT, XML_ERROR_PARSING_CDATA, XML_ERROR_PARSING_COMMENT, @@ -541,21 +538,25 @@ namespace tinyxml2 /* - Utility functionality. + Utility functionality. 
*/ - class XMLUtil + class TINYXML2_LIB XMLUtil { public: - static const char* SkipWhiteSpace(const char* p) { + static const char* SkipWhiteSpace(const char* p, int* curLineNumPtr) { TIXMLASSERT(p); + while (IsWhiteSpace(*p)) { + if (curLineNumPtr && *p == '\n') { + ++(*curLineNumPtr); + } ++p; } TIXMLASSERT(p); return p; } - static char* SkipWhiteSpace(char* p) { - return const_cast(SkipWhiteSpace(const_cast(p))); + static char* SkipWhiteSpace(char* p, int* curLineNumPtr) { + return const_cast(SkipWhiteSpace(const_cast(p), curLineNumPtr)); } // Anything in the high order range of UTF-8 is assumed to not be whitespace. This isn't @@ -586,19 +587,13 @@ namespace tinyxml2 if (p == q) { return true; } - int n = 0; - while (*p && *q && *p == *q && n < nChar) { - ++p; - ++q; - ++n; - } - if ((n == nChar) || (*p == 0 && *q == 0)) { - return true; - } - return false; + TIXMLASSERT(p); + TIXMLASSERT(q); + TIXMLASSERT(nChar >= 0); + return strncmp(p, q, nChar) == 0; } - inline static bool IsUTF8Continuation(const char p) { + inline static bool IsUTF8Continuation(char p) { return (p & 0x80) != 0; } @@ -614,6 +609,7 @@ namespace tinyxml2 static void ToStr(bool v, char* buffer, int bufferSize); static void ToStr(float v, char* buffer, int bufferSize); static void ToStr(double v, char* buffer, int bufferSize); + static void ToStr(int64_t v, char* buffer, int bufferSize); // converts strings to primitive types static bool ToInt(const char* str, int* value); @@ -621,33 +617,45 @@ namespace tinyxml2 static bool ToBool(const char* str, bool* value); static bool ToFloat(const char* str, float* value); static bool ToDouble(const char* str, double* value); - }; + static bool ToInt64(const char* str, int64_t* value); + // Changes what is serialized for a boolean value. + // Default to "true" and "false". Shouldn't be changed + // unless you have a special testing or compatibility need. + // Be careful: static, global, & not thread safe. 
+ // Be sure to set static const memory as parameters. + static void SetBoolSerialization(const char* writeTrue, const char* writeFalse); - /** XMLNode is a base class for every object that is in the - XML Document Object Model (DOM), except XMLAttributes. - Nodes have siblings, a parent, and children which can - be navigated. A node is always in a XMLDocument. - The type of a XMLNode can be queried, and it can - be cast to its more defined type. - - A XMLDocument allocates memory for all its Nodes. - When the XMLDocument gets deleted, all its Nodes - will also be deleted. - - @verbatim - A Document can contain: Element (container or leaf) - Comment (leaf) - Unknown (leaf) - Declaration( leaf ) + private: + static const char* writeBoolTrue; + static const char* writeBoolFalse; + }; - An Element can contain: Element (container or leaf) - Text (leaf) - Attributes (not on tree) - Comment (leaf) - Unknown (leaf) - @endverbatim + /** XMLNode is a base class for every object that is in the + XML Document Object Model (DOM), except XMLAttributes. + Nodes have siblings, a parent, and children which can + be navigated. A node is always in a XMLDocument. + The type of a XMLNode can be queried, and it can + be cast to its more defined type. + + A XMLDocument allocates memory for all its Nodes. + When the XMLDocument gets deleted, all its Nodes + will also be deleted. + + @verbatim + A Document can contain: Element (container or leaf) + Comment (leaf) + Unknown (leaf) + Declaration( leaf ) + + An Element can contain: Element (container or leaf) + Text (leaf) + Attributes (not on tree) + Comment (leaf) + Unknown (leaf) + + @endverbatim */ class TINYXML2_LIB XMLNode { @@ -657,10 +665,12 @@ namespace tinyxml2 /// Get the XMLDocument that owns this XMLNode. const XMLDocument* GetDocument() const { + TIXMLASSERT(_document); return _document; } /// Get the XMLDocument that owns this XMLNode. 
XMLDocument* GetDocument() { + TIXMLASSERT(_document); return _document; } @@ -709,21 +719,24 @@ namespace tinyxml2 } /** The meaning of 'value' changes for the specific type. - @verbatim - Document: empty - Element: name of the element - Comment: the comment text - Unknown: the tag contents - Text: the text string - @endverbatim + @verbatim + Document: empty (NULL is returned, not an empty string) + Element: name of the element + Comment: the comment text + Unknown: the tag contents + Text: the text string + @endverbatim */ const char* Value() const; /** Set the Value of an XML node. - @sa Value() + @sa Value() */ void SetValue(const char* val, bool staticMem = false); + /// Gets the line number the node is in, if the document was parsed from a file. + int GetLineNum() const { return _parseLineNum; } + /// Get the parent of this node on the DOM. const XMLNode* Parent() const { return _parent; @@ -748,12 +761,12 @@ namespace tinyxml2 } /** Get the first child element, or optionally the first child - element with the specified name. + element with the specified name. */ - const XMLElement* FirstChildElement(const char* value = 0) const; + const XMLElement* FirstChildElement(const char* name = 0) const; - XMLElement* FirstChildElement(const char* value = 0) { - return const_cast(const_cast(this)->FirstChildElement(value)); + XMLElement* FirstChildElement(const char* name = 0) { + return const_cast(const_cast(this)->FirstChildElement(name)); } /// Get the last child node, or null if none exists. @@ -762,16 +775,16 @@ namespace tinyxml2 } XMLNode* LastChild() { - return const_cast(const_cast(this)->LastChild()); + return _lastChild; } /** Get the last child element or optionally the last child - element with the specified name. + element with the specified name. 
*/ - const XMLElement* LastChildElement(const char* value = 0) const; + const XMLElement* LastChildElement(const char* name = 0) const; - XMLElement* LastChildElement(const char* value = 0) { - return const_cast(const_cast(this)->LastChildElement(value)); + XMLElement* LastChildElement(const char* name = 0) { + return const_cast(const_cast(this)->LastChildElement(name)); } /// Get the previous (left) sibling node of this node. @@ -784,10 +797,10 @@ namespace tinyxml2 } /// Get the previous (left) sibling element of this node, with an optionally supplied name. - const XMLElement* PreviousSiblingElement(const char* value = 0) const; + const XMLElement* PreviousSiblingElement(const char* name = 0) const; - XMLElement* PreviousSiblingElement(const char* value = 0) { - return const_cast(const_cast(this)->PreviousSiblingElement(value)); + XMLElement* PreviousSiblingElement(const char* name = 0) { + return const_cast(const_cast(this)->PreviousSiblingElement(name)); } /// Get the next (right) sibling node of this node. @@ -800,18 +813,18 @@ namespace tinyxml2 } /// Get the next (right) sibling element of this node, with an optionally supplied name. - const XMLElement* NextSiblingElement(const char* value = 0) const; + const XMLElement* NextSiblingElement(const char* name = 0) const; - XMLElement* NextSiblingElement(const char* value = 0) { - return const_cast(const_cast(this)->NextSiblingElement(value)); + XMLElement* NextSiblingElement(const char* name = 0) { + return const_cast(const_cast(this)->NextSiblingElement(name)); } /** - Add a child node as the last (right) child. - If the child node is already part of the document, - it is moved from its old location to the new location. - Returns the addThis argument or 0 if the node does not - belong to the same document. + Add a child node as the last (right) child. + If the child node is already part of the document, + it is moved from its old location to the new location. 
+ Returns the addThis argument or 0 if the node does not + belong to the same document. */ XMLNode* InsertEndChild(XMLNode* addThis); @@ -819,86 +832,115 @@ namespace tinyxml2 return InsertEndChild(addThis); } /** - Add a child node as the first (left) child. - If the child node is already part of the document, - it is moved from its old location to the new location. - Returns the addThis argument or 0 if the node does not - belong to the same document. + Add a child node as the first (left) child. + If the child node is already part of the document, + it is moved from its old location to the new location. + Returns the addThis argument or 0 if the node does not + belong to the same document. */ XMLNode* InsertFirstChild(XMLNode* addThis); /** - Add a node after the specified child node. - If the child node is already part of the document, - it is moved from its old location to the new location. - Returns the addThis argument or 0 if the afterThis node - is not a child of this node, or if the node does not - belong to the same document. + Add a node after the specified child node. + If the child node is already part of the document, + it is moved from its old location to the new location. + Returns the addThis argument or 0 if the afterThis node + is not a child of this node, or if the node does not + belong to the same document. */ XMLNode* InsertAfterChild(XMLNode* afterThis, XMLNode* addThis); /** - Delete all the children of this node. + Delete all the children of this node. */ void DeleteChildren(); /** - Delete a child of this node. + Delete a child of this node. */ void DeleteChild(XMLNode* node); /** - Make a copy of this node, but not its children. - You may pass in a Document pointer that will be - the owner of the new Node. If the 'document' is - null, then the node returned will be allocated - from the current Document. (this->GetDocument()) + Make a copy of this node, but not its children. 
+ You may pass in a Document pointer that will be + the owner of the new Node. If the 'document' is + null, then the node returned will be allocated + from the current Document. (this->GetDocument()) - Note: if called on a XMLDocument, this will return null. + Note: if called on a XMLDocument, this will return null. */ virtual XMLNode* ShallowClone(XMLDocument* document) const = 0; /** - Test if 2 nodes are the same, but don't test children. - The 2 nodes do not need to be in the same Document. + Make a copy of this node and all its children. + + If the 'target' is null, then the nodes will + be allocated in the current document. If 'target' + is specified, the memory will be allocated is the + specified XMLDocument. - Note: if called on a XMLDocument, this will return false. + NOTE: This is probably not the correct tool to + copy a document, since XMLDocuments can have multiple + top level XMLNodes. You probably want to use + XMLDocument::DeepCopy() + */ + XMLNode* DeepClone(XMLDocument* target) const; + + /** + Test if 2 nodes are the same, but don't test children. + The 2 nodes do not need to be in the same Document. + + Note: if called on a XMLDocument, this will return false. */ virtual bool ShallowEqual(const XMLNode* compare) const = 0; /** Accept a hierarchical visit of the nodes in the TinyXML-2 DOM. Every node in the - XML tree will be conditionally visited and the host will be called back - via the XMLVisitor interface. + XML tree will be conditionally visited and the host will be called back + via the XMLVisitor interface. - This is essentially a SAX interface for TinyXML-2. (Note however it doesn't re-parse - the XML for the callbacks, so the performance of TinyXML-2 is unchanged by using this - interface versus any other.) + This is essentially a SAX interface for TinyXML-2. (Note however it doesn't re-parse + the XML for the callbacks, so the performance of TinyXML-2 is unchanged by using this + interface versus any other.) 
- The interface has been based on ideas from: + The interface has been based on ideas from: - - http://www.saxproject.org/ - - http://c2.com/cgi/wiki?HierarchicalVisitorPattern + - http://www.saxproject.org/ + - http://c2.com/cgi/wiki?HierarchicalVisitorPattern - Which are both good references for "visiting". + Which are both good references for "visiting". - An example of using Accept(): - @verbatim - XMLPrinter printer; - tinyxmlDoc.Accept( &printer ); - const char* xmlcstr = printer.CStr(); - @endverbatim + An example of using Accept(): + @verbatim + XMLPrinter printer; + tinyxmlDoc.Accept( &printer ); + const char* xmlcstr = printer.CStr(); + @endverbatim */ virtual bool Accept(XMLVisitor* visitor) const = 0; - // internal - virtual char* ParseDeep(char*, StrPair*); + /** + Set user data into the XMLNode. TinyXML-2 in + no way processes or interprets user data. + It is initially 0. + */ + void SetUserData(void* userData) { _userData = userData; } + + /** + Get user data set into the XMLNode. TinyXML-2 in + no way processes or interprets user data. + It is initially 0. + */ + void* GetUserData() const { return _userData; } protected: XMLNode(XMLDocument*); virtual ~XMLNode(); + virtual char* ParseDeep(char* p, StrPair* parentEndTag, int* curLineNumPtr); + XMLDocument* _document; XMLNode* _parent; mutable StrPair _value; + int _parseLineNum; XMLNode* _firstChild; XMLNode* _lastChild; @@ -906,11 +948,14 @@ namespace tinyxml2 XMLNode* _prev; XMLNode* _next; + void* _userData; + private: - MemPool* _memPool; + MemPool * _memPool; void Unlink(XMLNode* child); static void DeleteNode(XMLNode* node); void InsertChildPreamble(XMLNode* insertThis) const; + const XMLElement* ToElementWithName(const char* name) const; XMLNode(const XMLNode&); // not supported XMLNode& operator=(const XMLNode&); // not supported @@ -919,19 +964,18 @@ namespace tinyxml2 /** XML text. 
- Note that a text node can have child element nodes, for example: - @verbatim - This is bold - @endverbatim + Note that a text node can have child element nodes, for example: + @verbatim + This is bold + @endverbatim - A text node can have 2 ways to output the next. "normal" output - and CDATA. It will default to the mode it was parsed from the XML file and - you generally want to leave it alone, but you can change the output mode with - SetCData() and query it with CData(). + A text node can have 2 ways to output the next. "normal" output + and CDATA. It will default to the mode it was parsed from the XML file and + you generally want to leave it alone, but you can change the output mode with + SetCData() and query it with CData(). */ class TINYXML2_LIB XMLText : public XMLNode { - friend class XMLBase; friend class XMLDocument; public: virtual bool Accept(XMLVisitor* visitor) const; @@ -952,7 +996,6 @@ namespace tinyxml2 return _isCData; } - char* ParseDeep(char*, StrPair* endTag); virtual XMLNode* ShallowClone(XMLDocument* document) const; virtual bool ShallowEqual(const XMLNode* compare) const; @@ -960,6 +1003,8 @@ namespace tinyxml2 XMLText(XMLDocument* doc) : XMLNode(doc), _isCData(false) {} virtual ~XMLText() {} + char* ParseDeep(char* p, StrPair* parentEndTag, int* curLineNumPtr); + private: bool _isCData; @@ -982,7 +1027,6 @@ namespace tinyxml2 virtual bool Accept(XMLVisitor* visitor) const; - char* ParseDeep(char*, StrPair* endTag); virtual XMLNode* ShallowClone(XMLDocument* document) const; virtual bool ShallowEqual(const XMLNode* compare) const; @@ -990,6 +1034,8 @@ namespace tinyxml2 XMLComment(XMLDocument* doc); virtual ~XMLComment(); + char* ParseDeep(char* p, StrPair* parentEndTag, int* curLineNumPtr); + private: XMLComment(const XMLComment&); // not supported XMLComment& operator=(const XMLComment&); // not supported @@ -997,15 +1043,15 @@ namespace tinyxml2 /** In correct XML the declaration is the first entry in the file. 
- @verbatim - - @endverbatim + @verbatim + + @endverbatim - TinyXML-2 will happily read or write files without a declaration, - however. + TinyXML-2 will happily read or write files without a declaration, + however. - The text of the declaration isn't interpreted. It is parsed - and written as a string. + The text of the declaration isn't interpreted. It is parsed + and written as a string. */ class TINYXML2_LIB XMLDeclaration : public XMLNode { @@ -1020,7 +1066,6 @@ namespace tinyxml2 virtual bool Accept(XMLVisitor* visitor) const; - char* ParseDeep(char*, StrPair* endTag); virtual XMLNode* ShallowClone(XMLDocument* document) const; virtual bool ShallowEqual(const XMLNode* compare) const; @@ -1028,6 +1073,8 @@ namespace tinyxml2 XMLDeclaration(XMLDocument* doc); virtual ~XMLDeclaration(); + char* ParseDeep(char* p, StrPair* parentEndTag, int* curLineNumPtr); + private: XMLDeclaration(const XMLDeclaration&); // not supported XMLDeclaration& operator=(const XMLDeclaration&); // not supported @@ -1035,11 +1082,11 @@ namespace tinyxml2 /** Any tag that TinyXML-2 doesn't recognize is saved as an - unknown. It is a tag of text, but should not be modified. - It will be written back to the XML, unchanged, when the file - is saved. + unknown. It is a tag of text, but should not be modified. + It will be written back to the XML, unchanged, when the file + is saved. - DTD tags get thrown into XMLUnknowns. + DTD tags get thrown into XMLUnknowns. 
*/ class TINYXML2_LIB XMLUnknown : public XMLNode { @@ -1054,7 +1101,6 @@ namespace tinyxml2 virtual bool Accept(XMLVisitor* visitor) const; - char* ParseDeep(char*, StrPair* endTag); virtual XMLNode* ShallowClone(XMLDocument* document) const; virtual bool ShallowEqual(const XMLNode* compare) const; @@ -1062,6 +1108,8 @@ namespace tinyxml2 XMLUnknown(XMLDocument* doc); virtual ~XMLUnknown(); + char* ParseDeep(char* p, StrPair* parentEndTag, int* curLineNumPtr); + private: XMLUnknown(const XMLUnknown&); // not supported XMLUnknown& operator=(const XMLUnknown&); // not supported @@ -1070,10 +1118,10 @@ namespace tinyxml2 /** An attribute is a name-value pair. Elements have an arbitrary - number of attributes, each with a unique name. + number of attributes, each with a unique name. - @note The attributes are not XMLNodes. You may only query the - Next() attribute in a list. + @note The attributes are not XMLNodes. You may only query the + Next() attribute in a list. */ class TINYXML2_LIB XMLAttribute { @@ -1085,20 +1133,30 @@ namespace tinyxml2 /// The value of the attribute. const char* Value() const; + /// Gets the line number the attribute is in, if the document was parsed from a file. + int GetLineNum() const { return _parseLineNum; } + /// The next attribute in the list. const XMLAttribute* Next() const { return _next; } /** IntValue interprets the attribute as an integer, and returns the value. - If the value isn't an integer, 0 will be returned. There is no error checking; - use QueryIntValue() if you need error checking. + If the value isn't an integer, 0 will be returned. There is no error checking; + use QueryIntValue() if you need error checking. */ - int IntValue() const { + int IntValue() const { int i = 0; QueryIntValue(&i); return i; } + + int64_t Int64Value() const { + int64_t i = 0; + QueryInt64Value(&i); + return i; + } + /// Query as an unsigned integer. 
See IntValue() unsigned UnsignedValue() const { unsigned i = 0; @@ -1125,13 +1183,15 @@ namespace tinyxml2 } /** QueryIntValue interprets the attribute as an integer, and returns the value - in the provided parameter. The function will return XML_NO_ERROR on success, - and XML_WRONG_ATTRIBUTE_TYPE if the conversion is not successful. + in the provided parameter. The function will return XML_SUCCESS on success, + and XML_WRONG_ATTRIBUTE_TYPE if the conversion is not successful. */ XMLError QueryIntValue(int* value) const; /// See QueryIntValue XMLError QueryUnsignedValue(unsigned int* value) const; /// See QueryIntValue + XMLError QueryInt64Value(int64_t* value) const; + /// See QueryIntValue XMLError QueryBoolValue(bool* value) const; /// See QueryIntValue XMLError QueryDoubleValue(double* value) const; @@ -1145,6 +1205,8 @@ namespace tinyxml2 /// Set the attribute to value. void SetAttribute(unsigned value); /// Set the attribute to value. + void SetAttribute(int64_t value); + /// Set the attribute to value. void SetAttribute(bool value); /// Set the attribute to value. void SetAttribute(double value); @@ -1154,29 +1216,29 @@ namespace tinyxml2 private: enum { BUF_SIZE = 200 }; - XMLAttribute() : _next(0), _memPool(0) {} + XMLAttribute() : _name(), _value(), _parseLineNum(0), _next(0), _memPool(0) {} virtual ~XMLAttribute() {} XMLAttribute(const XMLAttribute&); // not supported void operator=(const XMLAttribute&); // not supported void SetName(const char* name); - char* ParseDeep(char* p, bool processEntities); + char* ParseDeep(char* p, bool processEntities, int* curLineNumPtr); mutable StrPair _name; mutable StrPair _value; + int _parseLineNum; XMLAttribute* _next; MemPool* _memPool; }; /** The element is a container class. It has a value, the element name, - and can contain other elements, text, comments, and unknowns. - Elements also contain an arbitrary number of attributes. + and can contain other elements, text, comments, and unknowns. 
+ Elements also contain an arbitrary number of attributes. */ class TINYXML2_LIB XMLElement : public XMLNode { - friend class XMLBase; friend class XMLDocument; public: /// Get the name of an element (which is the Value() of the node.) @@ -1197,77 +1259,60 @@ namespace tinyxml2 virtual bool Accept(XMLVisitor* visitor) const; /** Given an attribute name, Attribute() returns the value - for the attribute of that name, or null if none - exists. For example: + for the attribute of that name, or null if none + exists. For example: - @verbatim - const char* value = ele->Attribute( "foo" ); - @endverbatim + @verbatim + const char* value = ele->Attribute( "foo" ); + @endverbatim - The 'value' parameter is normally null. However, if specified, - the attribute will only be returned if the 'name' and 'value' - match. This allow you to write code: + The 'value' parameter is normally null. However, if specified, + the attribute will only be returned if the 'name' and 'value' + match. This allow you to write code: - @verbatim - if ( ele->Attribute( "foo", "bar" ) ) callFooIsBar(); - @endverbatim + @verbatim + if ( ele->Attribute( "foo", "bar" ) ) callFooIsBar(); + @endverbatim - rather than: - @verbatim - if ( ele->Attribute( "foo" ) ) { - if ( strcmp( ele->Attribute( "foo" ), "bar" ) == 0 ) callFooIsBar(); - } - @endverbatim + rather than: + @verbatim + if ( ele->Attribute( "foo" ) ) { + if ( strcmp( ele->Attribute( "foo" ), "bar" ) == 0 ) callFooIsBar(); + } + @endverbatim */ const char* Attribute(const char* name, const char* value = 0) const; /** Given an attribute name, IntAttribute() returns the value - of the attribute interpreted as an integer. 0 will be - returned if there is an error. For a method with error - checking, see QueryIntAttribute() + of the attribute interpreted as an integer. The default + value will be returned if the attribute isn't present, + or if there is an error. (For a method with error + checking, see QueryIntAttribute()). 
*/ - int IntAttribute(const char* name) const { - int i = 0; - QueryIntAttribute(name, &i); - return i; - } + int IntAttribute(const char* name, int defaultValue = 0) const; /// See IntAttribute() - unsigned UnsignedAttribute(const char* name) const { - unsigned i = 0; - QueryUnsignedAttribute(name, &i); - return i; - } + unsigned UnsignedAttribute(const char* name, unsigned defaultValue = 0) const; /// See IntAttribute() - bool BoolAttribute(const char* name) const { - bool b = false; - QueryBoolAttribute(name, &b); - return b; - } + int64_t Int64Attribute(const char* name, int64_t defaultValue = 0) const; /// See IntAttribute() - double DoubleAttribute(const char* name) const { - double d = 0; - QueryDoubleAttribute(name, &d); - return d; - } + bool BoolAttribute(const char* name, bool defaultValue = false) const; /// See IntAttribute() - float FloatAttribute(const char* name) const { - float f = 0; - QueryFloatAttribute(name, &f); - return f; - } + double DoubleAttribute(const char* name, double defaultValue = 0) const; + /// See IntAttribute() + float FloatAttribute(const char* name, float defaultValue = 0) const; /** Given an attribute name, QueryIntAttribute() returns - XML_NO_ERROR, XML_WRONG_ATTRIBUTE_TYPE if the conversion - can't be performed, or XML_NO_ATTRIBUTE if the attribute - doesn't exist. If successful, the result of the conversion - will be written to 'value'. If not successful, nothing will - be written to 'value'. This allows you to provide default - value: - - @verbatim - int value = 10; - QueryIntAttribute( "foo", &value ); // if "foo" isn't found, value will still be 10 - @endverbatim + XML_SUCCESS, XML_WRONG_ATTRIBUTE_TYPE if the conversion + can't be performed, or XML_NO_ATTRIBUTE if the attribute + doesn't exist. If successful, the result of the conversion + will be written to 'value'. If not successful, nothing will + be written to 'value'. 
This allows you to provide default + value: + + @verbatim + int value = 10; + QueryIntAttribute( "foo", &value ); // if "foo" isn't found, value will still be 10 + @endverbatim */ XMLError QueryIntAttribute(const char* name, int* value) const { const XMLAttribute* a = FindAttribute(name); @@ -1276,6 +1321,7 @@ namespace tinyxml2 } return a->QueryIntValue(value); } + /// See QueryIntAttribute() XMLError QueryUnsignedAttribute(const char* name, unsigned int* value) const { const XMLAttribute* a = FindAttribute(name); @@ -1284,6 +1330,16 @@ namespace tinyxml2 } return a->QueryUnsignedValue(value); } + + /// See QueryIntAttribute() + XMLError QueryInt64Attribute(const char* name, int64_t* value) const { + const XMLAttribute* a = FindAttribute(name); + if (!a) { + return XML_NO_ATTRIBUTE; + } + return a->QueryInt64Value(value); + } + /// See QueryIntAttribute() XMLError QueryBoolAttribute(const char* name, bool* value) const { const XMLAttribute* a = FindAttribute(name); @@ -1309,23 +1365,34 @@ namespace tinyxml2 return a->QueryFloatValue(value); } + /// See QueryIntAttribute() + XMLError QueryStringAttribute(const char* name, const char** value) const { + const XMLAttribute* a = FindAttribute(name); + if (!a) { + return XML_NO_ATTRIBUTE; + } + *value = a->Value(); + return XML_SUCCESS; + } + + /** Given an attribute name, QueryAttribute() returns - XML_NO_ERROR, XML_WRONG_ATTRIBUTE_TYPE if the conversion - can't be performed, or XML_NO_ATTRIBUTE if the attribute - doesn't exist. It is overloaded for the primitive types, - and is a generally more convenient replacement of - QueryIntAttribute() and related functions. - - If successful, the result of the conversion - will be written to 'value'. If not successful, nothing will - be written to 'value'. 
This allows you to provide default - value: - - @verbatim - int value = 10; - QueryAttribute( "foo", &value ); // if "foo" isn't found, value will still be 10 - @endverbatim + XML_SUCCESS, XML_WRONG_ATTRIBUTE_TYPE if the conversion + can't be performed, or XML_NO_ATTRIBUTE if the attribute + doesn't exist. It is overloaded for the primitive types, + and is a generally more convenient replacement of + QueryIntAttribute() and related functions. + + If successful, the result of the conversion + will be written to 'value'. If not successful, nothing will + be written to 'value'. This allows you to provide default + value: + + @verbatim + int value = 10; + QueryAttribute( "foo", &value ); // if "foo" isn't found, value will still be 10 + @endverbatim */ int QueryAttribute(const char* name, int* value) const { return QueryIntAttribute(name, value); @@ -1335,6 +1402,10 @@ namespace tinyxml2 return QueryUnsignedAttribute(name, value); } + int QueryAttribute(const char* name, int64_t* value) const { + return QueryInt64Attribute(name, value); + } + int QueryAttribute(const char* name, bool* value) const { return QueryBoolAttribute(name, value); } @@ -1362,6 +1433,13 @@ namespace tinyxml2 XMLAttribute* a = FindOrCreateAttribute(name); a->SetAttribute(value); } + + /// Sets the named attribute to value. + void SetAttribute(const char* name, int64_t value) { + XMLAttribute* a = FindOrCreateAttribute(name); + a->SetAttribute(value); + } + /// Sets the named attribute to value. void SetAttribute(const char* name, bool value) { XMLAttribute* a = FindOrCreateAttribute(name); @@ -1379,7 +1457,7 @@ namespace tinyxml2 } /** - Delete an attribute. + Delete an attribute. */ void DeleteAttribute(const char* name); @@ -1391,130 +1469,149 @@ namespace tinyxml2 const XMLAttribute* FindAttribute(const char* name) const; /** Convenience function for easy access to the text inside an element. 
Although easy - and concise, GetText() is limited compared to getting the XMLText child - and accessing it directly. - - If the first child of 'this' is a XMLText, the GetText() - returns the character string of the Text node, else null is returned. - - This is a convenient method for getting the text of simple contained text: - @verbatim - This is text - const char* str = fooElement->GetText(); - @endverbatim - - 'str' will be a pointer to "This is text". - - Note that this function can be misleading. If the element foo was created from - this XML: - @verbatim - This is text - @endverbatim - - then the value of str would be null. The first child node isn't a text node, it is - another element. From this XML: - @verbatim - This is text - @endverbatim - GetText() will return "This is ". + and concise, GetText() is limited compared to getting the XMLText child + and accessing it directly. + + If the first child of 'this' is a XMLText, the GetText() + returns the character string of the Text node, else null is returned. + + This is a convenient method for getting the text of simple contained text: + @verbatim + This is text + const char* str = fooElement->GetText(); + @endverbatim + + 'str' will be a pointer to "This is text". + + Note that this function can be misleading. If the element foo was created from + this XML: + @verbatim + This is text + @endverbatim + + then the value of str would be null. The first child node isn't a text node, it is + another element. From this XML: + @verbatim + This is text + @endverbatim + GetText() will return "This is ". */ const char* GetText() const; /** Convenience function for easy access to the text inside an element. Although easy - and concise, SetText() is limited compared to creating an XMLText child - and mutating it directly. - - If the first child of 'this' is a XMLText, SetText() sets its value to - the given string, otherwise it will create a first child that is an XMLText. 
- - This is a convenient method for setting the text of simple contained text: - @verbatim - This is text - fooElement->SetText( "Hullaballoo!" ); - Hullaballoo! - @endverbatim - - Note that this function can be misleading. If the element foo was created from - this XML: - @verbatim - This is text - @endverbatim - - then it will not change "This is text", but rather prefix it with a text element: - @verbatim - Hullaballoo!This is text - @endverbatim - - For this XML: - @verbatim - - @endverbatim - SetText() will generate - @verbatim - Hullaballoo! - @endverbatim + and concise, SetText() is limited compared to creating an XMLText child + and mutating it directly. + + If the first child of 'this' is a XMLText, SetText() sets its value to + the given string, otherwise it will create a first child that is an XMLText. + + This is a convenient method for setting the text of simple contained text: + @verbatim + This is text + fooElement->SetText( "Hullaballoo!" ); + Hullaballoo! + @endverbatim + + Note that this function can be misleading. If the element foo was created from + this XML: + @verbatim + This is text + @endverbatim + + then it will not change "This is text", but rather prefix it with a text element: + @verbatim + Hullaballoo!This is text + @endverbatim + + For this XML: + @verbatim + + @endverbatim + SetText() will generate + @verbatim + Hullaballoo! + @endverbatim */ void SetText(const char* inText); - /// Convenience method for setting text inside and element. See SetText() for important limitations. + /// Convenience method for setting text inside an element. See SetText() for important limitations. void SetText(int value); - /// Convenience method for setting text inside and element. See SetText() for important limitations. + /// Convenience method for setting text inside an element. See SetText() for important limitations. void SetText(unsigned value); - /// Convenience method for setting text inside and element. See SetText() for important limitations. 
+ /// Convenience method for setting text inside an element. See SetText() for important limitations. + void SetText(int64_t value); + /// Convenience method for setting text inside an element. See SetText() for important limitations. void SetText(bool value); - /// Convenience method for setting text inside and element. See SetText() for important limitations. + /// Convenience method for setting text inside an element. See SetText() for important limitations. void SetText(double value); - /// Convenience method for setting text inside and element. See SetText() for important limitations. + /// Convenience method for setting text inside an element. See SetText() for important limitations. void SetText(float value); /** - Convenience method to query the value of a child text node. This is probably best - shown by example. Given you have a document is this form: - @verbatim - - 1 - 1.4 - - @endverbatim - - The QueryIntText() and similar functions provide a safe and easier way to get to the - "value" of x and y. - - @verbatim - int x = 0; - float y = 0; // types of x and y are contrived for example - const XMLElement* xElement = pointElement->FirstChildElement( "x" ); - const XMLElement* yElement = pointElement->FirstChildElement( "y" ); - xElement->QueryIntText( &x ); - yElement->QueryFloatText( &y ); - @endverbatim - - @returns XML_SUCCESS (0) on success, XML_CAN_NOT_CONVERT_TEXT if the text cannot be converted - to the requested type, and XML_NO_TEXT_NODE if there is no child text to query. + Convenience method to query the value of a child text node. This is probably best + shown by example. Given you have a document is this form: + @verbatim + + 1 + 1.4 + + @endverbatim + + The QueryIntText() and similar functions provide a safe and easier way to get to the + "value" of x and y. 
+ + @verbatim + int x = 0; + float y = 0; // types of x and y are contrived for example + const XMLElement* xElement = pointElement->FirstChildElement( "x" ); + const XMLElement* yElement = pointElement->FirstChildElement( "y" ); + xElement->QueryIntText( &x ); + yElement->QueryFloatText( &y ); + @endverbatim + + @returns XML_SUCCESS (0) on success, XML_CAN_NOT_CONVERT_TEXT if the text cannot be converted + to the requested type, and XML_NO_TEXT_NODE if there is no child text to query. */ XMLError QueryIntText(int* ival) const; /// See QueryIntText() XMLError QueryUnsignedText(unsigned* uval) const; /// See QueryIntText() + XMLError QueryInt64Text(int64_t* uval) const; + /// See QueryIntText() XMLError QueryBoolText(bool* bval) const; /// See QueryIntText() XMLError QueryDoubleText(double* dval) const; /// See QueryIntText() XMLError QueryFloatText(float* fval) const; + int IntText(int defaultValue = 0) const; + + /// See QueryIntText() + unsigned UnsignedText(unsigned defaultValue = 0) const; + /// See QueryIntText() + int64_t Int64Text(int64_t defaultValue = 0) const; + /// See QueryIntText() + bool BoolText(bool defaultValue = false) const; + /// See QueryIntText() + double DoubleText(double defaultValue = 0) const; + /// See QueryIntText() + float FloatText(float defaultValue = 0) const; + // internal: - enum { + enum ElementClosingType { OPEN, // CLOSED, // CLOSING // }; - int ClosingType() const { + ElementClosingType ClosingType() const { return _closingType; } - char* ParseDeep(char* p, StrPair* endTag); virtual XMLNode* ShallowClone(XMLDocument* document) const; virtual bool ShallowEqual(const XMLNode* compare) const; + protected: + char* ParseDeep(char* p, StrPair* parentEndTag, int* curLineNumPtr); + private: XMLElement(XMLDocument* doc); virtual ~XMLElement(); @@ -1526,11 +1623,12 @@ namespace tinyxml2 } XMLAttribute* FindOrCreateAttribute(const char* name); //void LinkAttribute( XMLAttribute* attrib ); - char* ParseAttributes(char* p); + char* 
ParseAttributes(char* p, int* curLineNumPtr); static void DeleteAttribute(XMLAttribute* attribute); + XMLAttribute* CreateAttribute(); enum { BUF_SIZE = 200 }; - int _closingType; + ElementClosingType _closingType; // The attribute list is ordered; there is no 'lastAttribute' // because the list needs to be scanned for dupes before adding // a new attribute. @@ -1545,70 +1643,79 @@ namespace tinyxml2 /** A Document binds together all the functionality. - It can be saved, loaded, and printed to the screen. - All Nodes are connected and allocated to a Document. - If the Document is deleted, all its Nodes are also deleted. + It can be saved, loaded, and printed to the screen. + All Nodes are connected and allocated to a Document. + If the Document is deleted, all its Nodes are also deleted. */ class TINYXML2_LIB XMLDocument : public XMLNode { friend class XMLElement; + // Gives access to SetError, but over-access for everything else. + // Wishing C++ had "internal" scope. + friend class XMLNode; + friend class XMLText; + friend class XMLComment; + friend class XMLDeclaration; + friend class XMLUnknown; public: /// constructor - XMLDocument(bool processEntities = true, Whitespace = PRESERVE_WHITESPACE); + XMLDocument(bool processEntities = true, Whitespace whitespaceMode = PRESERVE_WHITESPACE); ~XMLDocument(); virtual XMLDocument* ToDocument() { + TIXMLASSERT(this == _document); return this; } virtual const XMLDocument* ToDocument() const { + TIXMLASSERT(this == _document); return this; } /** - Parse an XML file from a character string. - Returns XML_NO_ERROR (0) on success, or - an errorID. - - You may optionally pass in the 'nBytes', which is - the number of bytes which will be parsed. If not - specified, TinyXML-2 will assume 'xml' points to a - null terminated string. + Parse an XML file from a character string. + Returns XML_SUCCESS (0) on success, or + an errorID. + + You may optionally pass in the 'nBytes', which is + the number of bytes which will be parsed. 
If not + specified, TinyXML-2 will assume 'xml' points to a + null terminated string. */ XMLError Parse(const char* xml, size_t nBytes = (size_t)(-1)); /** - Load an XML file from disk. - Returns XML_NO_ERROR (0) on success, or - an errorID. + Load an XML file from disk. + Returns XML_SUCCESS (0) on success, or + an errorID. */ XMLError LoadFile(const char* filename); /** - Load an XML file from disk. You are responsible - for providing and closing the FILE*. + Load an XML file from disk. You are responsible + for providing and closing the FILE*. - NOTE: The file should be opened as binary ("rb") - not text in order for TinyXML-2 to correctly - do newline normalization. + NOTE: The file should be opened as binary ("rb") + not text in order for TinyXML-2 to correctly + do newline normalization. - Returns XML_NO_ERROR (0) on success, or - an errorID. + Returns XML_SUCCESS (0) on success, or + an errorID. */ XMLError LoadFile(FILE*); /** - Save the XML file to disk. - Returns XML_NO_ERROR (0) on success, or - an errorID. + Save the XML file to disk. + Returns XML_SUCCESS (0) on success, or + an errorID. */ XMLError SaveFile(const char* filename, bool compact = false); /** - Save the XML file to disk. You are responsible - for providing and closing the FILE*. + Save the XML file to disk. You are responsible + for providing and closing the FILE*. - Returns XML_NO_ERROR (0) on success, or - an errorID. + Returns XML_SUCCESS (0) on success, or + an errorID. */ XMLError SaveFile(FILE* fp, bool compact = false); @@ -1616,11 +1723,11 @@ namespace tinyxml2 return _processEntities; } Whitespace WhitespaceMode() const { - return _whitespace; + return _whitespaceMode; } /** - Returns true if this document has a leading Byte Order Mark of UTF8. + Returns true if this document has a leading Byte Order Mark of UTF8. */ bool HasBOM() const { return _writeBOM; @@ -1632,7 +1739,7 @@ namespace tinyxml2 } /** Return the root element of DOM. Equivalent to FirstChildElement(). 
- To get the first node, use FirstChild(). + To get the first node, use FirstChild(). */ XMLElement* RootElement() { return FirstChildElement(); @@ -1642,94 +1749,112 @@ namespace tinyxml2 } /** Print the Document. If the Printer is not provided, it will - print to stdout. If you provide Printer, this can print to a file: - @verbatim - XMLPrinter printer( fp ); - doc.Print( &printer ); - @endverbatim - - Or you can use a printer to print to memory: - @verbatim - XMLPrinter printer; - doc.Print( &printer ); - // printer.CStr() has a const char* to the XML - @endverbatim + print to stdout. If you provide Printer, this can print to a file: + @verbatim + XMLPrinter printer( fp ); + doc.Print( &printer ); + @endverbatim + + Or you can use a printer to print to memory: + @verbatim + XMLPrinter printer; + doc.Print( &printer ); + // printer.CStr() has a const char* to the XML + @endverbatim */ void Print(XMLPrinter* streamer = 0) const; virtual bool Accept(XMLVisitor* visitor) const; /** - Create a new Element associated with - this Document. The memory for the Element - is managed by the Document. + Create a new Element associated with + this Document. The memory for the Element + is managed by the Document. */ XMLElement* NewElement(const char* name); /** - Create a new Comment associated with - this Document. The memory for the Comment - is managed by the Document. + Create a new Comment associated with + this Document. The memory for the Comment + is managed by the Document. */ XMLComment* NewComment(const char* comment); /** - Create a new Text associated with - this Document. The memory for the Text - is managed by the Document. + Create a new Text associated with + this Document. The memory for the Text + is managed by the Document. */ XMLText* NewText(const char* text); /** - Create a new Declaration associated with - this Document. The memory for the object - is managed by the Document. 
- - If the 'text' param is null, the standard - declaration is used.: - @verbatim - - @endverbatim + Create a new Declaration associated with + this Document. The memory for the object + is managed by the Document. + + If the 'text' param is null, the standard + declaration is used.: + @verbatim + + @endverbatim */ XMLDeclaration* NewDeclaration(const char* text = 0); /** - Create a new Unknown associated with - this Document. The memory for the object - is managed by the Document. + Create a new Unknown associated with + this Document. The memory for the object + is managed by the Document. */ XMLUnknown* NewUnknown(const char* text); /** - Delete a node associated with this document. - It will be unlinked from the DOM. + Delete a node associated with this document. + It will be unlinked from the DOM. */ void DeleteNode(XMLNode* node); - void SetError(XMLError error, const char* str1, const char* str2); + void ClearError() { + SetError(XML_SUCCESS, 0, 0); + } /// Return true if there was an error parsing the document. bool Error() const { - return _errorID != XML_NO_ERROR; + return _errorID != XML_SUCCESS; } /// Return the errorID. XMLError ErrorID() const { return _errorID; } const char* ErrorName() const; + static const char* ErrorIDToName(XMLError errorID); - /// Return a possibly helpful diagnostic location or string. - const char* GetErrorStr1() const { - return _errorStr1; - } - /// Return a possibly helpful secondary diagnostic location or string. - const char* GetErrorStr2() const { - return _errorStr2; - } - /// If there is an error, print it to stdout. + /** Returns a "long form" error description. A hopefully helpful + diagnostic with location, line number, and/or additional info. + */ + const char* ErrorStr() const; + + /// A (trivial) utility function that prints the ErrorStr() to stdout. void PrintError() const; + /// Return the line where the error occured, or zero if unknown. 
+ int ErrorLineNum() const + { + return _errorLineNum; + } + /// Clear the document, resetting it to the initial state. void Clear(); + /** + Copies this document to a target document. + The target will be completely cleared before the copy. + If you want to copy a sub-tree, see XMLNode::DeepClone(). + + NOTE: that the 'target' must be non-null. + */ + void DeepCopy(XMLDocument* target) const; + // internal char* Identify(char* p, XMLNode** node); + // internal + void MarkInUse(XMLNode*); + virtual XMLNode* ShallowClone(XMLDocument* /*document*/) const { return 0; } @@ -1741,13 +1866,21 @@ namespace tinyxml2 XMLDocument(const XMLDocument&); // not supported void operator=(const XMLDocument&); // not supported - bool _writeBOM; - bool _processEntities; - XMLError _errorID; - Whitespace _whitespace; - const char* _errorStr1; - const char* _errorStr2; - char* _charBuffer; + bool _writeBOM; + bool _processEntities; + XMLError _errorID; + Whitespace _whitespaceMode; + mutable StrPair _errorStr; + int _errorLineNum; + char* _charBuffer; + int _parseCurLineNum; + // Memory tracking does add some overhead. + // However, the code assumes that you don't + // have a bunch of unlinked nodes around. + // Therefore it takes less memory to track + // in the document vs. a linked list in the XMLNode, + // and the performance is the same. + DynArray _unlinked; MemPoolT< sizeof(XMLElement) > _elementPool; MemPoolT< sizeof(XMLAttribute) > _attributePool; @@ -1757,78 +1890,92 @@ namespace tinyxml2 static const char* _errorNames[XML_ERROR_COUNT]; void Parse(); - }; + void SetError(XMLError error, int lineNum, const char* format, ...); - /** - A XMLHandle is a class that wraps a node pointer with null checks; this is - an incredibly useful thing. Note that XMLHandle is not part of the TinyXML-2 - DOM structure. It is a separate utility class. 
+ template + NodeType* CreateUnlinkedNode(MemPoolT& pool); + }; - Take an example: - @verbatim - - - - - - - @endverbatim + template + inline NodeType* XMLDocument::CreateUnlinkedNode(MemPoolT& pool) + { + TIXMLASSERT(sizeof(NodeType) == PoolElementSize); + TIXMLASSERT(sizeof(NodeType) == pool.ItemSize()); + NodeType* returnNode = new (pool.Alloc()) NodeType(this); + TIXMLASSERT(returnNode); + returnNode->_memPool = &pool; - Assuming you want the value of "attributeB" in the 2nd "Child" element, it's very - easy to write a *lot* of code that looks like: + _unlinked.Push(returnNode); + return returnNode; + } - @verbatim - XMLElement* root = document.FirstChildElement( "Document" ); - if ( root ) - { - XMLElement* element = root->FirstChildElement( "Element" ); - if ( element ) - { - XMLElement* child = element->FirstChildElement( "Child" ); - if ( child ) - { - XMLElement* child2 = child->NextSiblingElement( "Child" ); - if ( child2 ) - { - // Finally do something useful. - @endverbatim + /** + A XMLHandle is a class that wraps a node pointer with null checks; this is + an incredibly useful thing. Note that XMLHandle is not part of the TinyXML-2 + DOM structure. It is a separate utility class. + + Take an example: + @verbatim + + + + + + + @endverbatim + + Assuming you want the value of "attributeB" in the 2nd "Child" element, it's very + easy to write a *lot* of code that looks like: + + @verbatim + XMLElement* root = document.FirstChildElement( "Document" ); + if ( root ) + { + XMLElement* element = root->FirstChildElement( "Element" ); + if ( element ) + { + XMLElement* child = element->FirstChildElement( "Child" ); + if ( child ) + { + XMLElement* child2 = child->NextSiblingElement( "Child" ); + if ( child2 ) + { + // Finally do something useful. + @endverbatim - And that doesn't even cover "else" cases. XMLHandle addresses the verbosity - of such code. 
A XMLHandle checks for null pointers so it is perfectly safe - and correct to use: + And that doesn't even cover "else" cases. XMLHandle addresses the verbosity + of such code. A XMLHandle checks for null pointers so it is perfectly safe + and correct to use: - @verbatim - XMLHandle docHandle( &document ); - XMLElement* child2 = docHandle.FirstChildElement( "Document" ).FirstChildElement( "Element" ).FirstChildElement().NextSiblingElement(); - if ( child2 ) - { - // do something useful - @endverbatim + @verbatim + XMLHandle docHandle( &document ); + XMLElement* child2 = docHandle.FirstChildElement( "Document" ).FirstChildElement( "Element" ).FirstChildElement().NextSiblingElement(); + if ( child2 ) + { + // do something useful + @endverbatim - Which is MUCH more concise and useful. + Which is MUCH more concise and useful. - It is also safe to copy handles - internally they are nothing more than node pointers. - @verbatim - XMLHandle handleCopy = handle; - @endverbatim + It is also safe to copy handles - internally they are nothing more than node pointers. + @verbatim + XMLHandle handleCopy = handle; + @endverbatim - See also XMLConstHandle, which is the same as XMLHandle, but operates on const objects. + See also XMLConstHandle, which is the same as XMLHandle, but operates on const objects. */ class TINYXML2_LIB XMLHandle { public: /// Create a handle from any node (at any depth of the tree.) This can be a null pointer. - XMLHandle(XMLNode* node) { - _node = node; + XMLHandle(XMLNode* node) : _node(node) { } /// Create a handle from a node. - XMLHandle(XMLNode& node) { - _node = &node; + XMLHandle(XMLNode& node) : _node(&node) { } /// Copy constructor - XMLHandle(const XMLHandle& ref) { - _node = ref._node; + XMLHandle(const XMLHandle& ref) : _node(ref._node) { } /// Assignment XMLHandle& operator=(const XMLHandle& ref) { @@ -1841,32 +1988,32 @@ namespace tinyxml2 return XMLHandle(_node ? _node->FirstChild() : 0); } /// Get the first child element of this handle. 
- XMLHandle FirstChildElement(const char* value = 0) { - return XMLHandle(_node ? _node->FirstChildElement(value) : 0); + XMLHandle FirstChildElement(const char* name = 0) { + return XMLHandle(_node ? _node->FirstChildElement(name) : 0); } /// Get the last child of this handle. XMLHandle LastChild() { return XMLHandle(_node ? _node->LastChild() : 0); } /// Get the last child element of this handle. - XMLHandle LastChildElement(const char* _value = 0) { - return XMLHandle(_node ? _node->LastChildElement(_value) : 0); + XMLHandle LastChildElement(const char* name = 0) { + return XMLHandle(_node ? _node->LastChildElement(name) : 0); } /// Get the previous sibling of this handle. XMLHandle PreviousSibling() { return XMLHandle(_node ? _node->PreviousSibling() : 0); } /// Get the previous sibling element of this handle. - XMLHandle PreviousSiblingElement(const char* _value = 0) { - return XMLHandle(_node ? _node->PreviousSiblingElement(_value) : 0); + XMLHandle PreviousSiblingElement(const char* name = 0) { + return XMLHandle(_node ? _node->PreviousSiblingElement(name) : 0); } /// Get the next sibling of this handle. XMLHandle NextSibling() { return XMLHandle(_node ? _node->NextSibling() : 0); } /// Get the next sibling element of this handle. - XMLHandle NextSiblingElement(const char* _value = 0) { - return XMLHandle(_node ? _node->NextSiblingElement(_value) : 0); + XMLHandle NextSiblingElement(const char* name = 0) { + return XMLHandle(_node ? _node->NextSiblingElement(name) : 0); } /// Safe cast to XMLNode. This can return null. @@ -1875,41 +2022,38 @@ namespace tinyxml2 } /// Safe cast to XMLElement. This can return null. XMLElement* ToElement() { - return ((_node == 0) ? 0 : _node->ToElement()); + return (_node ? _node->ToElement() : 0); } /// Safe cast to XMLText. This can return null. XMLText* ToText() { - return ((_node == 0) ? 0 : _node->ToText()); + return (_node ? _node->ToText() : 0); } /// Safe cast to XMLUnknown. This can return null. 
XMLUnknown* ToUnknown() { - return ((_node == 0) ? 0 : _node->ToUnknown()); + return (_node ? _node->ToUnknown() : 0); } /// Safe cast to XMLDeclaration. This can return null. XMLDeclaration* ToDeclaration() { - return ((_node == 0) ? 0 : _node->ToDeclaration()); + return (_node ? _node->ToDeclaration() : 0); } private: - XMLNode* _node; + XMLNode * _node; }; /** - A variant of the XMLHandle class for working with const XMLNodes and Documents. It is the - same in all regards, except for the 'const' qualifiers. See XMLHandle for API. + A variant of the XMLHandle class for working with const XMLNodes and Documents. It is the + same in all regards, except for the 'const' qualifiers. See XMLHandle for API. */ class TINYXML2_LIB XMLConstHandle { public: - XMLConstHandle(const XMLNode* node) { - _node = node; + XMLConstHandle(const XMLNode* node) : _node(node) { } - XMLConstHandle(const XMLNode& node) { - _node = &node; + XMLConstHandle(const XMLNode& node) : _node(&node) { } - XMLConstHandle(const XMLConstHandle& ref) { - _node = ref._node; + XMLConstHandle(const XMLConstHandle& ref) : _node(ref._node) { } XMLConstHandle& operator=(const XMLConstHandle& ref) { @@ -1920,26 +2064,26 @@ namespace tinyxml2 const XMLConstHandle FirstChild() const { return XMLConstHandle(_node ? _node->FirstChild() : 0); } - const XMLConstHandle FirstChildElement(const char* value = 0) const { - return XMLConstHandle(_node ? _node->FirstChildElement(value) : 0); + const XMLConstHandle FirstChildElement(const char* name = 0) const { + return XMLConstHandle(_node ? _node->FirstChildElement(name) : 0); } const XMLConstHandle LastChild() const { return XMLConstHandle(_node ? _node->LastChild() : 0); } - const XMLConstHandle LastChildElement(const char* _value = 0) const { - return XMLConstHandle(_node ? _node->LastChildElement(_value) : 0); + const XMLConstHandle LastChildElement(const char* name = 0) const { + return XMLConstHandle(_node ? 
_node->LastChildElement(name) : 0); } const XMLConstHandle PreviousSibling() const { return XMLConstHandle(_node ? _node->PreviousSibling() : 0); } - const XMLConstHandle PreviousSiblingElement(const char* _value = 0) const { - return XMLConstHandle(_node ? _node->PreviousSiblingElement(_value) : 0); + const XMLConstHandle PreviousSiblingElement(const char* name = 0) const { + return XMLConstHandle(_node ? _node->PreviousSiblingElement(name) : 0); } const XMLConstHandle NextSibling() const { return XMLConstHandle(_node ? _node->NextSibling() : 0); } - const XMLConstHandle NextSiblingElement(const char* _value = 0) const { - return XMLConstHandle(_node ? _node->NextSiblingElement(_value) : 0); + const XMLConstHandle NextSiblingElement(const char* name = 0) const { + return XMLConstHandle(_node ? _node->NextSiblingElement(name) : 0); } @@ -1947,16 +2091,16 @@ namespace tinyxml2 return _node; } const XMLElement* ToElement() const { - return ((_node == 0) ? 0 : _node->ToElement()); + return (_node ? _node->ToElement() : 0); } const XMLText* ToText() const { - return ((_node == 0) ? 0 : _node->ToText()); + return (_node ? _node->ToText() : 0); } const XMLUnknown* ToUnknown() const { - return ((_node == 0) ? 0 : _node->ToUnknown()); + return (_node ? _node->ToUnknown() : 0); } const XMLDeclaration* ToDeclaration() const { - return ((_node == 0) ? 0 : _node->ToDeclaration()); + return (_node ? _node->ToDeclaration() : 0); } private: @@ -1965,55 +2109,55 @@ namespace tinyxml2 /** - Printing functionality. The XMLPrinter gives you more - options than the XMLDocument::Print() method. - - It can: - -# Print to memory. - -# Print to a file you provide. - -# Print XML without a XMLDocument. - - Print to Memory - - @verbatim - XMLPrinter printer; - doc.Print( &printer ); - SomeFunction( printer.CStr() ); - @endverbatim - - Print to a File - - You provide the file pointer. 
- @verbatim - XMLPrinter printer( fp ); - doc.Print( &printer ); - @endverbatim - - Print without a XMLDocument - - When loading, an XML parser is very useful. However, sometimes - when saving, it just gets in the way. The code is often set up - for streaming, and constructing the DOM is just overhead. - - The Printer supports the streaming case. The following code - prints out a trivially simple XML file without ever creating - an XML document. - - @verbatim - XMLPrinter printer( fp ); - printer.OpenElement( "foo" ); - printer.PushAttribute( "foo", "bar" ); - printer.CloseElement(); - @endverbatim + Printing functionality. The XMLPrinter gives you more + options than the XMLDocument::Print() method. + + It can: + -# Print to memory. + -# Print to a file you provide. + -# Print XML without a XMLDocument. + + Print to Memory + + @verbatim + XMLPrinter printer; + doc.Print( &printer ); + SomeFunction( printer.CStr() ); + @endverbatim + + Print to a File + + You provide the file pointer. + @verbatim + XMLPrinter printer( fp ); + doc.Print( &printer ); + @endverbatim + + Print without a XMLDocument + + When loading, an XML parser is very useful. However, sometimes + when saving, it just gets in the way. The code is often set up + for streaming, and constructing the DOM is just overhead. + + The Printer supports the streaming case. The following code + prints out a trivially simple XML file without ever creating + an XML document. + + @verbatim + XMLPrinter printer( fp ); + printer.OpenElement( "foo" ); + printer.PushAttribute( "foo", "bar" ); + printer.CloseElement(); + @endverbatim */ class TINYXML2_LIB XMLPrinter : public XMLVisitor { public: /** Construct the printer. If the FILE* is specified, - this will print to the FILE. Else it will print - to memory, and the result is available in CStr(). - If 'compact' is set to true, then output is created - with only required whitespace and newlines. + this will print to the FILE. 
Else it will print + to memory, and the result is available in CStr(). + If 'compact' is set to true, then output is created + with only required whitespace and newlines. */ XMLPrinter(FILE* file = 0, bool compact = false, int depth = 0); virtual ~XMLPrinter() {} @@ -2021,13 +2165,14 @@ namespace tinyxml2 /** If streaming, write the BOM and declaration. */ void PushHeader(bool writeBOM, bool writeDeclaration); /** If streaming, start writing an element. - The element must be closed with CloseElement() + The element must be closed with CloseElement() */ void OpenElement(const char* name, bool compactMode = false); /// If streaming, add an attribute to an open element. void PushAttribute(const char* name, const char* value); void PushAttribute(const char* name, int value); void PushAttribute(const char* name, unsigned value); + void PushAttribute(const char* name, int64_t value); void PushAttribute(const char* name, bool value); void PushAttribute(const char* name, double value); /// If streaming, close the Element. @@ -2039,6 +2184,8 @@ namespace tinyxml2 void PushText(int value); /// Add a text node from an unsigned. void PushText(unsigned value); + /// Add a text node from an unsigned. + void PushText(int64_t value); /// Add a text node from a bool. void PushText(bool value); /// Add a text node from a float. @@ -2066,37 +2213,41 @@ namespace tinyxml2 virtual bool Visit(const XMLUnknown& unknown); /** - If in print to memory mode, return a pointer to - the XML file in memory. + If in print to memory mode, return a pointer to + the XML file in memory. */ const char* CStr() const { return _buffer.Mem(); } /** - If in print to memory mode, return the size - of the XML file in memory. (Note the size returned - includes the terminating null.) + If in print to memory mode, return the size + of the XML file in memory. (Note the size returned + includes the terminating null.) 
*/ int CStrSize() const { return _buffer.Size(); } /** - If in print to memory mode, reset the buffer to the - beginning. + If in print to memory mode, reset the buffer to the + beginning. */ void ClearBuffer() { _buffer.Clear(); _buffer.Push(0); + _firstElement = true; } protected: virtual bool CompactMode(const XMLElement&) { return _compactMode; } /** Prints out the space before an element. You may override to change - the space and tabs used. A PrintSpace() override should call Print(). + the space and tabs used. A PrintSpace() override should call Print(). */ virtual void PrintSpace(int depth); void Print(const char* format, ...); + void Write(const char* data, size_t size); + inline void Write(const char* data) { Write(data, strlen(data)); } + void Putc(char ch); void SealElementIfJustOpened(); bool _elementJustOpened; @@ -2120,6 +2271,10 @@ namespace tinyxml2 bool _restrictedEntityFlag[ENTITY_RANGE]; DynArray< char, 20 > _buffer; + + // Prohibit cloning, intentionally not implemented + XMLPrinter(const XMLPrinter&); + XMLPrinter& operator=(const XMLPrinter&); }; diff --git a/src/3rd/Simd/SimdConst.h b/src/3rd/Simd/SimdConst.h index 97e2f5e7..0caa63f8 100644 --- a/src/3rd/Simd/SimdConst.h +++ b/src/3rd/Simd/SimdConst.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar, +* Copyright (c) 2011-2018 Yermalayeu Ihar, * 2014-2015 Antonenka Mikhail. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -90,7 +90,7 @@ namespace Simd namespace Sse2 { using namespace Sse; -#if defined(_MSC_VER) && _MSC_VER >= 1800 && _MSC_VER < 1900 // Visual Studio 2013 compiler bug +#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug using Sse::F; using Sse::DF; using Sse::QF; @@ -165,7 +165,7 @@ namespace Simd namespace Sse3 { using namespace Sse2; -#if defined(_MSC_VER) && _MSC_VER >= 1800 && _MSC_VER < 1900 // Visual Studio 2013 compiler bug +#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug using Sse::F; using Sse::DF; using Sse::QF; @@ -212,7 +212,7 @@ namespace Simd namespace Sse41 { using namespace Ssse3; -#if defined(_MSC_VER) && _MSC_VER >= 1800 && _MSC_VER < 1900 // Visual Studio 2013 compiler bug +#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug using Sse::F; using Sse::DF; using Sse::QF; @@ -241,7 +241,7 @@ namespace Simd namespace Avx2 { using namespace Avx; -#if defined(_MSC_VER) && _MSC_VER >= 1800 && _MSC_VER < 1900 // Visual Studio 2013 compiler bug +#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug using Avx::F; using Avx::DF; using Avx::QF; diff --git a/src/3rd/Simd/SimdDetection.h b/src/3rd/Simd/SimdDetection.h index 7bf8dd90..cc96135f 100644 --- a/src/3rd/Simd/SimdDetection.h +++ b/src/3rd/Simd/SimdDetection.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -40,11 +40,6 @@ namespace Simd typedef Simd::Point Size; typedef Simd::Rectangle Rect; - struct Deletable - { - virtual ~Deletable() {} - }; - struct Data : public Deletable { struct DTreeNode diff --git a/src/3rd/Simd/SimdDetection.hpp b/src/3rd/Simd/SimdDetection.hpp index 9fe896ee..7f53c463 100644 --- a/src/3rd/Simd/SimdDetection.hpp +++ b/src/3rd/Simd/SimdDetection.hpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -206,7 +206,7 @@ namespace Simd ~Detection() { for (size_t i = 0; i < _data.size(); ++i) - ::SimdDetectionFree(_data[i].handle); + ::SimdRelease(_data[i].handle); } /*! @@ -383,7 +383,7 @@ namespace Simd ~Level() { for (size_t i = 0; i < hids.size(); ++i) - ::SimdDetectionFree(hids[i].handle); + ::SimdRelease(hids[i].handle); } }; typedef std::unique_ptr LevelPtr; diff --git a/src/3rd/Simd/SimdEnable.h b/src/3rd/Simd/SimdEnable.h index b1e96d15..14139149 100644 --- a/src/3rd/Simd/SimdEnable.h +++ b/src/3rd/Simd/SimdEnable.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -172,6 +172,21 @@ namespace Simd } const bool Enable = SupportedByCPU() && SupportedByOS(); + + const unsigned int SCR_FTZ = 1 << 15; + + SIMD_INLINE SimdBool GetFlushToZero() + { + return _mm_getcsr() | SCR_FTZ ? 
SimdTrue : SimdFalse; + } + + SIMD_INLINE void SetFlushToZero(SimdBool value) + { + if (value) + _mm_setcsr(_mm_getcsr() | SCR_FTZ); + else + _mm_setcsr(_mm_getcsr() & ~SCR_FTZ); + } } #endif diff --git a/src/3rd/Simd/SimdGemm.h b/src/3rd/Simd/SimdGemm.h new file mode 100644 index 00000000..a71e7489 --- /dev/null +++ b/src/3rd/Simd/SimdGemm.h @@ -0,0 +1,163 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#ifndef __SimdGemm_h__ +#define __SimdGemm_h__ + +#include "Simd/SimdArray.h" +#include "Simd/SimdBase.h" +#include "Simd/SimdParallel.hpp" + +namespace Simd +{ + template class GemmNN + { + public: + typedef void(*Main)(size_t K, T alpha, const T * A, size_t lda, const T * B, size_t ldb, T * C, size_t ldc, TM tail); + typedef void(*Tail)(size_t M, size_t N, size_t K, T alpha, const T * A, size_t lda, const T * B, size_t ldb, T * C, size_t ldc, TM tail); + typedef void(*ScaleC)(size_t M, size_t N, T beta, T * C, size_t ldc); + typedef void(*PackB)(const T * B, size_t ldb, size_t K, size_t N, size_t microN, T * pB); + typedef TM(*TailMask)(ptrdiff_t tail); + + GemmNN(size_t M, size_t N, size_t K, size_t microM, size_t microN, size_t L1, size_t L2, size_t L3, size_t F, + Main kernelMM, Main kernelMT, Tail kernelTM, Tail kernelTT, ScaleC scaleC, PackB packB, TailMask tailMask) + : _M(M) + , _N(N) + , _K(K) + , _microM(microM) + , _microN(microN) + , _F(F) + , _threadNumber(Base::GetThreadNumber()) + , _kernelMM(kernelMM) + , _kernelMT(kernelMT) + , _kernelTM(kernelTM) + , _kernelTT(kernelTT) + , _scaleC(scaleC) + , _packB(packB) + { + + _macroK = L1 / sizeof(T) / _microN; + _macroM = AlignLoAny(L2 / sizeof(T) / _macroK, _microM); + _macroN = AlignLoAny(L3 / sizeof(T) / _macroK, _microN); + if (_N * _M * _K < 256 * 256 * 256 * 2) + _threadNumber = 1; + _pA.resize(_threadNumber); + _pB.resize(_threadNumber); + for (size_t t = 0; t < _threadNumber; ++t) + { + _pA[t].Resize(_macroM * _macroK); + _pB[t].Resize(_macroN * _macroK); + } + size_t NF = AlignLo(_N, _F); + if (tailMask) + { + _main = TM(-1); + _tail = NF == _N ? TM(-1) : tailMask(_N - NF); + } + else + { + _main = TM(_F); + _tail = NF == _N ? 
TM(_F) : TM(_N - NF); + } + } + + void Run(const T * alpha, const T * A, size_t lda, const T * B, size_t ldb, const T * beta, T * C, size_t ldc) + { + Simd::Parallel(0, _N, [&](size_t thread, size_t begin, size_t end) + { + ThreadKernel(end - begin, *alpha, A, lda, B + begin, ldb, *beta, C + begin, ldc, thread); + }, _threadNumber, _microN); + } + + private: + + void ThreadKernel(size_t N, T alpha, const T * A, size_t lda, const T * B, size_t ldb, T beta, T * C, size_t ldc, size_t thread) + { + for (size_t j = 0; j < N; j += _macroN) + { + size_t macroN = Simd::Min(N, j + _macroN) - j; + for (size_t k = 0; k < _K; k += _macroK) + { + size_t macroK = Simd::Min(_K, k + _macroK) - k; + //PackA(A + i * lda, lda, macroM, K, _microM, _A.data); + for (size_t i = 0; i < _M; i += _macroM) + { + size_t macroM = Simd::Min(_M, i + _macroM) - i; + if (k == 0) + _scaleC(macroM, macroN, beta, C + i * ldc + j, ldc); + MacroKernel(macroM, macroN, macroK, alpha, A + i * lda + k, lda, B + k * ldb + j, ldb, beta, C + i * ldc + j, ldc, i == 0, thread); + } + } + } + } + + void MacroKernel(size_t M, size_t N, size_t K, T alpha, const T * A, size_t lda, const T * B, size_t ldb, T beta, T * C, size_t ldc, bool packB, size_t thread) + { + size_t MA = AlignLoAny(M, _microM); + size_t NA = AlignLoAny(N, _microN); + size_t j = 0; + for (; j < NA; j += _microN) + { + T * pB = _pB[thread].data + j * _macroK; + if (packB) + _packB(B + j, ldb, K, _microN, _microN, pB); + size_t i = 0; + for (; i < MA; i += _microM) + _kernelMM(K, alpha, A + i * lda, lda, pB, _microN, C + i * ldc + j, ldc, _main); + if (i < M) + _kernelTM(M - i, _microN, K, alpha, A + i * lda, lda, pB, _microN, C + i * ldc + j, ldc, _main); + } + if (j < N) + { + T * pB = _pB[thread].data + j * _macroK; + if (packB) + _packB(B + j, ldb, K, N - j, _microN, pB); + size_t i = 0; + for (; i < MA; i += _microM) + _kernelMT(K, alpha, A + i * lda, lda, pB, _microN, C + i * ldc + j, ldc, _tail); + if (i < M) + _kernelTT(M - i, NA - j, K, 
alpha, A + i * lda, lda, pB, _microN, C + i * ldc + j, ldc, _tail); + } + } + + typedef std::vector> Arrays; + + Arrays _pA, _pB; + size_t _M, _N, _K, _microM, _microN, _macroM, _macroN, _macroK, _F, _threadNumber; + TM _main, _tail; + Main _kernelMM, _kernelMT; + Tail _kernelTM, _kernelTT; + ScaleC _scaleC; + PackB _packB; + }; + +#ifdef SIMD_AVX_ENABLE + namespace Avx + { + void GemmScaleC(size_t M, size_t N, float beta, float * C, size_t ldc); + + void GemmPackB(const float * B, size_t ldb, size_t K, size_t N, size_t microN, float * pB); + } +#endif//SIMD_AVX_ENABLE +} +#endif//__SimdGemm_h__ diff --git a/src/3rd/Simd/SimdLib.cpp b/src/3rd/Simd/SimdLib.cpp index c7cc2667..0862d1b4 100644 --- a/src/3rd/Simd/SimdLib.cpp +++ b/src/3rd/Simd/SimdLib.cpp @@ -1,8 +1,9 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar, -* 2014-2016 Antonenka Mikhail. +* Copyright (c) 2011-2018 Yermalayeu Ihar, +* 2014-2018 Antonenka Mikhail, +* 2018-2018 Radchenko Andrey. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -48,10 +49,11 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD dwReasonForCall, LPVOID lpReserved) #include "Simd/SimdMemory.h" #include "Simd/SimdEnable.h" -#include "Simd/SimdVersion.h" #include "Simd/SimdConst.h" #include "Simd/SimdLog.h" +#include "Simd/SimdResizer.h" + #include "Simd/SimdBase.h" #include "Simd/SimdSse1.h" #include "Simd/SimdSse2.h" @@ -68,6 +70,15 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD dwReasonForCall, LPVOID lpReserved) #include "Simd/SimdNeon.h" #include "Simd/SimdMsa.h" +#if !defined(SIMD_VERSION) +#include "Simd/SimdVersion.h" +#endif + +SIMD_API const char * SimdVersion() +{ + return SIMD_VERSION; +} + using namespace Simd; SIMD_API int SimdCpuInfo() @@ -118,11 +129,6 @@ SIMD_API int SimdCpuInfo() return info; } -SIMD_API const char * SimdVersion() -{ - return SIMD_VERSION; -} - SIMD_API void * SimdAllocate(size_t size, size_t align) { return Allocate(size, align); @@ -143,6 +149,39 @@ SIMD_API size_t SimdAlignment() return Simd::ALIGNMENT; } +SIMD_API void SimdRelease(void * context) +{ + delete (Deletable*)context; +} + +SIMD_API size_t SimdGetThreadNumber() +{ + return Base::GetThreadNumber(); +} + +SIMD_API void SimdSetThreadNumber(size_t threadNumber) +{ + Base::SetThreadNumber(threadNumber); +} + +SIMD_API SimdBool SimdGetFlushToZero() +{ +#ifdef SIMD_SSE_ENABLE + if (Sse::Enable) + return Sse::GetFlushToZero(); + else +#endif + return SimdFalse; +} + +SIMD_API void SimdSetFlushToZero(SimdBool value) +{ +#ifdef SIMD_SSE_ENABLE + if (Sse::Enable) + Sse::SetFlushToZero(value); +#endif +} + SIMD_API uint32_t SimdCrc32c(const void * src, size_t size) { #ifdef SIMD_SSE42_ENABLE @@ -1600,11 +1639,6 @@ SIMD_API void SimdDetectionLbpDetect16ii(const void * hid, const uint8_t * mask, Base::DetectionLbpDetect16ii(hid, mask, maskStride, left, top, right, bottom, dst, 
dstStride); } -SIMD_API void SimdDetectionFree(void * ptr) -{ - Base::DetectionFree(ptr); -} - SIMD_API void SimdEdgeBackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height, uint8_t * background, size_t backgroundStride) { @@ -1943,6 +1977,21 @@ SIMD_API void SimdSquaredDifferenceSum16f(const uint16_t * a, const uint16_t * b Base::SquaredDifferenceSum16f(a, b, size, sum); } +SIMD_API void SimdCosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance) +{ +#ifdef SIMD_AVX512BW_ENABLE + if (Avx512bw::Enable) + Avx512bw::CosineDistance16f(a, b, size, distance); + else +#endif +#ifdef SIMD_AVX2_ENABLE + if (Avx2::Enable && size >= Avx2::F) + Avx2::CosineDistance16f(a, b, size, distance); + else +#endif + Base::CosineDistance16f(a, b, size, distance); +} + SIMD_API void SimdFloat32ToUint8(const float * src, size_t size, const float * lower, const float * upper, uint8_t * dst) { #ifdef SIMD_AVX512BW_ENABLE @@ -1993,6 +2042,14 @@ SIMD_API void SimdUint8ToFloat32(const uint8_t * src, size_t size, const float * Base::Uint8ToFloat32(src, size, lower, upper, dst); } +typedef void(*SimdCosineDistance32fPtr) (const float * a, const float * b, size_t size, float * distance); +SimdCosineDistance32fPtr simdCosineDistance32f = SIMD_FUNC4(CosineDistance32f, SIMD_AVX512BW_FUNC, SIMD_AVX2_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC); + +SIMD_API void SimdCosineDistance32f(const float * a, const float * b, size_t size, float * distance) +{ + simdCosineDistance32f(a, b, size, distance); +} + SIMD_API void SimdGaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride) { @@ -2029,6 +2086,14 @@ SIMD_API void SimdGaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t Base::GaussianBlur3x3(src, srcStride, width, height, channelCount, dst, dstStride); } +typedef void(*SimdGemm32fNNPtr) (size_t M, size_t N, size_t K, const float * alpha, const float * 
A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc); +SimdGemm32fNNPtr simdGemm32fNN = SIMD_FUNC4(Gemm32fNN, SIMD_AVX512F_FUNC, SIMD_AVX2_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC); + +SIMD_API void SimdGemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc) +{ + simdGemm32fNN(M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); +} + SIMD_API void SimdGrayToBgr(const uint8_t * gray, size_t width, size_t height, size_t grayStride, uint8_t * bgr, size_t bgrStride) { #ifdef SIMD_AVX512BW_ENABLE @@ -2338,29 +2403,29 @@ SIMD_API void SimdHogLiteExtractFeatures(const uint8_t * src, size_t srcStride, Base::HogLiteExtractFeatures(src, srcStride, width, height, cell, features, featuresStride); } -SIMD_API void SimdHogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) +SIMD_API void SimdHogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { #ifdef SIMD_AVX512BW_ENABLE if (Avx512bw::Enable) - Avx512bw::HogLiteFilterFeatures(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + Avx512bw::HogLiteFilterFeatures(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); else #endif #ifdef SIMD_AVX2_ENABLE if (Avx2::Enable) - Avx2::HogLiteFilterFeatures(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + Avx2::HogLiteFilterFeatures(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterWidth, filterHeight, mask, 
maskStride, dst, dstStride); else #endif #ifdef SIMD_AVX_ENABLE if (Avx::Enable) - Avx::HogLiteFilterFeatures(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + Avx::HogLiteFilterFeatures(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); else #endif #ifdef SIMD_SSE41_ENABLE if (Sse41::Enable) - Sse41::HogLiteFilterFeatures(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + Sse41::HogLiteFilterFeatures(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); else #endif - Base::HogLiteFilterFeatures(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + Base::HogLiteFilterFeatures(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); } SIMD_API void SimdHogLiteResizeFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight) @@ -3894,6 +3959,36 @@ SIMD_API void SimdResizeBilinear(const uint8_t *src, size_t srcWidth, size_t src Base::ResizeBilinear(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); } +SIMD_API void * SimdResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) +{ +#ifdef SIMD_AVX512F_ENABLE + if (Avx512f::Enable) + return Avx512f::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); + else +#endif +#ifdef SIMD_AVX2_ENABLE + if (Avx2::Enable) + return Avx2::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); + else +#endif +#ifdef SIMD_AVX_ENABLE + if (Avx::Enable) + return Avx::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); + else +#endif 
+#ifdef SIMD_SSE_ENABLE + if (Sse::Enable) + return Sse::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); + else +#endif + return Base::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); +} + +SIMD_API void SimdResizerRun(const void * resizer, const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) +{ + ((const Resizer*)resizer)->Run(src, srcStride, dst, dstStride); +} + SIMD_API void SimdSegmentationChangeIndex(uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t oldIndex, uint8_t newIndex) { #ifdef SIMD_AVX512BW_ENABLE @@ -4672,6 +4767,31 @@ SIMD_API void SimdSquareSum(const uint8_t * src, size_t stride, size_t width, si Base::SquareSum(src, stride, width, height, sum); } +SIMD_API void SimdValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum) +{ +#ifdef SIMD_AVX512BW_ENABLE + if (Avx512bw::Enable) + Avx512bw::ValueSquareSum(src, stride, width, height, valueSum, squareSum); + else +#endif +#ifdef SIMD_AVX2_ENABLE + if(Avx2::Enable && width >= Avx2::A) + Avx2::ValueSquareSum(src, stride, width, height, valueSum, squareSum); + else +#endif +#ifdef SIMD_SSE2_ENABLE + if(Sse2::Enable && width >= Sse2::A) + Sse2::ValueSquareSum(src, stride, width, height, valueSum, squareSum); + else +#endif +#ifdef SIMD_NEON_ENABLE + if (Neon::Enable && width >= Neon::A) + Neon::ValueSquareSum(src, stride, width, height, valueSum, squareSum); + else +#endif + Base::ValueSquareSum(src, stride, width, height, valueSum, squareSum); +} + SIMD_API void SimdCorrelationSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum) { #ifdef SIMD_AVX512BW_ENABLE @@ -4763,6 +4883,38 @@ SIMD_API void SimdSvmSumLinear(const float * x, const float * svs, const float * Base::SvmSumLinear(x, svs, weights, length, count, sum); } +typedef void(*SimdSynetAddBiasPtr) (const float * bias, size_t count, size_t size, float * dst); 
+volatile SimdSynetAddBiasPtr simdSynetAddBias = SIMD_FUNC3(SynetAddBias, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC); + +SIMD_API void SimdSynetAddBias(const float * bias, size_t count, size_t size, float * dst) +{ + simdSynetAddBias(bias, count, size, dst); +} + +typedef void(*SimdSynetEltwiseLayerForwardPtr) (float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst); +volatile SimdSynetEltwiseLayerForwardPtr simdSynetEltwiseLayerForward = SIMD_FUNC4(SynetEltwiseLayerForward, SIMD_AVX512F_FUNC, SIMD_AVX2_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC); + +SIMD_API void SimdSynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst) +{ + simdSynetEltwiseLayerForward(src, weight, count, size, type, dst); +} + +typedef void(*SimdSynetLrnLayerCrossChannelsPtr) (const float * src, size_t half, size_t count, size_t size, const float * k, float * dst); +volatile SimdSynetLrnLayerCrossChannelsPtr simdSynetLrnLayerCrossChannels = SIMD_FUNC3(SynetLrnLayerCrossChannels, SIMD_AVX512F_FUNC, SIMD_AVX2_FUNC, SIMD_SSE2_FUNC); + +SIMD_API void SimdSynetLrnLayerCrossChannels(const float * src, size_t half, size_t count, size_t size, const float * k, float * dst) +{ + simdSynetLrnLayerCrossChannels(src, half, count, size, k, dst); +} + +typedef void(*SimdSynetScaleLayerForwardPtr) (const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst); +volatile SimdSynetScaleLayerForwardPtr simdSynetScaleLayerForward = SIMD_FUNC4(SynetScaleLayerForward, SIMD_AVX512F_FUNC, SIMD_AVX2_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC); + +SIMD_API void SimdSynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst) +{ + simdSynetScaleLayerForward(src, scale, bias, count, size, dst); +} + SIMD_API void SimdTextureBoostedSaturatedGradient(const uint8_t * src, 
size_t srcStride, size_t width, size_t height, uint8_t saturation, uint8_t boost, uint8_t * dx, size_t dxStride, uint8_t * dy, size_t dyStride) { diff --git a/src/3rd/Simd/SimdLib.h b/src/3rd/Simd/SimdLib.h index 30b1bdf6..c584894a 100644 --- a/src/3rd/Simd/SimdLib.h +++ b/src/3rd/Simd/SimdLib.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar, +* Copyright (c) 2011-2018 Yermalayeu Ihar, * 2014-2016 Antonenka Mikhail. * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -66,6 +66,15 @@ typedef signed __int64 int64_t; typedef unsigned __int64 uint64_t; #endif +/*! @ingroup c_types + Describes boolean type. +*/ +typedef enum +{ + SimdFalse = 0, /*!< False value. */ + SimdTrue = 1, /*!< True value. */ +} SimdBool; + /*! @ingroup c_types Describes types of SIMD extensions which supported by current CPU and Simd Library (see function ::SimdCpuInfo). */ @@ -324,6 +333,58 @@ extern "C" */ SIMD_API size_t SimdAlignment(); + /*! @ingroup memory + + \fn void SimdRelease(void * context); + + \short Releases context created with using of Simd Library API. + + \note This function releases a context created by functions ::SimdDetectionLoadA and ::SimdDetectionInit. + + \param [in] context - a context to be released. + */ + SIMD_API void SimdRelease(void * context); + + /*! @ingroup thread + + \fn size_t SimdGetThreadNumber(); + + \short Gets number of threads used by Simd Library to parallelize some algorithms. + + \return current thread number. + */ + SIMD_API size_t SimdGetThreadNumber(); + + /*! @ingroup thread + + \fn void SimdSetThreadNumber(size_t threadNumber); + + \short Sets number of threads used by Simd Library to parallelize some algorithms. + + \param [in] threadNumber - a number of threads. + */ + SIMD_API void SimdSetThreadNumber(size_t threadNumber); + + /*! @ingroup cpu_flags + + \fn SimdBool SimdGetFlushToZero(); + + \short Gets current CPU Flush-To-Zero (FTZ) flag. 
It is used in order to process subnormal numbers. + + \return current FTZ flag. + */ + SIMD_API SimdBool SimdGetFlushToZero(); + + /*! @ingroup cpu_flags + + \fn void SimdSetFlushToZero(SimdBool value); + + \short Sets current CPU Flush-To-Zero (FTZ) flag. It is used in order to process subnormal numbers. + + \param [in] value - a value of Flush-To-Zero (FTZ) flag. + */ + SIMD_API void SimdSetFlushToZero(SimdBool value); + /*! @ingroup hash \fn uint32_t SimdCrc32c(const void * src, size_t size); @@ -1591,7 +1652,7 @@ extern "C" \param [in] path - a path to cascade. \return a pointer to loaded cascade. On error it returns NULL. - This pointer is used in functions ::SimdDetectionInfo and ::SimdDetectionInit, and must be released with using function ::SimdDetectionFree. + This pointer is used in functions ::SimdDetectionInfo and ::SimdDetectionInit, and must be released with using of function ::SimdRelease. */ SIMD_API void * SimdDetectionLoadA(const char * path); @@ -1635,7 +1696,7 @@ extern "C" \return a pointer to hidden cascade. On error it returns NULL. This pointer is used in functions ::SimdDetectionPrepare, ::SimdDetectionHaarDetect32fp, ::SimdDetectionHaarDetect32fi, ::SimdDetectionLbpDetect32fp, ::SimdDetectionLbpDetect32fi, ::SimdDetectionLbpDetect16ip and ::SimdDetectionLbpDetect16ii. - It must be released with using function ::SimdDetectionFree. + It must be released with using of function ::SimdRelease. */ SIMD_API void * SimdDetectionInit(const void * data, uint8_t * sum, size_t sumStride, size_t width, size_t height, uint8_t * sqsum, size_t sqsumStride, uint8_t * tilted, size_t tiltedStride, int throughColumn, int int16); @@ -1799,17 +1860,6 @@ extern "C" SIMD_API void SimdDetectionLbpDetect16ii(const void * hid, const uint8_t * mask, size_t maskStride, ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride); - /*! 
@ingroup object_detection - - \fn void SimdDetectionFree(void * ptr); - - \short Frees pointers which was received with using of functions ::SimdDetectionLoadA and ::SimdDetectionInit. - - \note This function is used for implementation of Simd::Detection. - - \param [in] ptr - a pointer which was received with using of functions ::SimdDetectionLoadA and ::SimdDetectionInit. - */ SIMD_API void SimdDetectionFree(void * ptr); - /*! @ingroup edge_background \fn void SimdEdgeBackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height, uint8_t * background, size_t backgroundStride); @@ -2153,6 +2203,26 @@ extern "C" */ SIMD_API void SimdSquaredDifferenceSum16f(const uint16_t * a, const uint16_t * b, size_t size, float * sum); + /*! @ingroup float16 + + \fn void SimdCosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance); + + \short Calculates cosine distance of two 16-bit float arrays. + + All arrays must have the same size. + + Algorithm description: + \verbatim + distance = 1 - Sum(a[i]*b[i])/Sqrt(Sum(a[i]*a[i])*Sum(b[i]*b[i])); + \endverbatim + + \param [in] a - a pointer to the first 16-bit float array. + \param [in] b - a pointer to the second 16-bit float array. + \param [in] size - a size of arrays. + \param [out] distance - a pointer to 32-bit float with cosine distance. + */ + SIMD_API void SimdCosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance); + /*! @ingroup other_conversion \fn void SimdFloat32ToUint8(const float * src, size_t size, const float * lower, const float * upper, uint8_t * dst); @@ -2191,6 +2261,26 @@ extern "C" */ SIMD_API void SimdUint8ToFloat32(const uint8_t * src, size_t size, const float * lower, const float * upper, float * dst); + /*! @ingroup correlation + + \fn void SimdCosineDistance32f(const float * a, const float * b, size_t size, float * distance); + + \short Calculates cosine distance of two 32-bit float arrays. 
+ + All arrays must have the same size. + + Algorithm description: + \verbatim + distance = 1 - Sum(a[i]*b[i])/Sqrt(Sum(a[i]*a[i])*Sum(b[i]*b[i])); + \endverbatim + + \param [in] a - a pointer to the first 32-bit float array. + \param [in] b - a pointer to the second 32-bit float array. + \param [in] size - a size of arrays. + \param [out] distance - a pointer to 32-bit float with cosine distance. + */ + SIMD_API void SimdCosineDistance32f(const float * a, const float * b, size_t size, float * distance); + /*! @ingroup other_filter \fn void SimdGaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride); @@ -2219,6 +2309,30 @@ extern "C" SIMD_API void SimdGaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride); + /*! @ingroup matrix + + \fn void SimdGemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc); + + \short Performs general matrix multiplication (for 32-bit float numbers). + + C(M, N) = alpha*A(M, K)*B(K, N) + beta*C(M, N); + + \note This function supports multithreading (See functions ::SimdGetThreadNumber and ::SimdSetThreadNumber). + + \param [in] M - a height of A and C matrices. + \param [in] N - a width of B and C matrices. + \param [in] K - a width of A and height of B matrices. + \param [in] alpha - a pointer to multiplier of the first term. + \param [in] A - a pointer to input A matrix. + \param [in] lda - a leading dimension of A matrix. + \param [in] B - a pointer to input B matrix. + \param [in] ldb - a leading dimension of B matrix. + \param [in] beta - a pointer to multiplier of the second term. + \param [out] C - a pointer to output C matrix. + \param [in] ldc - a leading dimension of C matrix.
+ */ + SIMD_API void SimdGemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc); + /*! @ingroup gray_conversion \fn void SimdGrayToBgr(const uint8_t * gray, size_t width, size_t height, size_t grayStride, uint8_t * bgr, size_t bgrStride); @@ -2521,7 +2635,7 @@ extern "C" /*! @ingroup hog - \fn void SimdHogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride); + \fn void SimdHogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride); \short Applies filter to lite HOG features. @@ -2531,8 +2645,8 @@ extern "C" \verbatim if(mask[x, y]) sum = 0; - for(dy = 0; dy < filterSize; dy++) - for(dx = 0; dx < filterSize*featureSize; dx++) + for(dy = 0; dy < filterHeight; dy++) + for(dx = 0; dx < filterWidth*featureSize; dx++) sum += src[x*featureSize + dx, y + dy]*filter[dx, dy]; dst[x, y] = sum; else @@ -2546,15 +2660,16 @@ extern "C" \param [in] featureSize - a size of cell with features. It must be 8 or 16. \param [in] filter - a pointer to the 32-bit float array with filter values. Array must have size equal to filterSize*filterSize*featureSize. - \param [in] filterSize - a size (width and height) of used filter. - \param [in] mask - a pointer to the 32-bit integer array with mask (0 or -1). + \param [in] filterWidth - a width of used filter. + \param [in] filterHeight - a height of used filter. + \param [in] mask - a pointer to the 32-bit integer array with mask (0 or -1). Pointer can be null otherwise the array must have size greater then (srcHeight - filterSize)*(srcWidth - filterSize). 
A function ::SimdHogLiteCreateMask is usefull in order to create this mask. \param [in] maskStride - a row size of mask array. \param [out] dst - a pointer to output buffer with result of filtration. Array must have size greater then (srcHeight - filterSize)*(srcWidth - filterSize). \param [in] dstStride - a row size of the output buffer with result of filtration. */ - SIMD_API void SimdHogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride); + SIMD_API void SimdHogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride); /*! @ingroup hog @@ -4126,6 +4241,61 @@ extern "C" SIMD_API void SimdResizeBilinear(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount); + /*! @ingroup resizing + Describes resized image channel types. + */ + typedef enum + { + /*! 8-bit integer channel type. */ + SimdResizeChannelByte, + /*! 32-bit float channel type. */ + SimdResizeChannelFloat, + } SimdResizeChannelType; + + /*! @ingroup resizing + Describes methods used in order to resize image. + */ + typedef enum + { + /*! Bilinear method. */ + SimdResizeMethodBilinear, + /*! caffe::interp compatible method. */ + SimdResizeMethodCaffeInterp, + } SimdResizeMethodType; + + /*! @ingroup resizing + + \fn void * SimdResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method); + + \short Creates resize context. + + \param [in] srcX - a width of the input image. + \param [in] srcY - a height of the input image.
+ \param [in] dstX - a width of the output image. + \param [in] dstY - a height of the output image. + \param [in] channels - a channel number of input and output image. + \param [in] type - a type of input and output image channel. + \param [in] method - a method used in order to resize image. + \return a pointer to resize context. On error it returns NULL. + This pointer is used in functions ::SimdResizerRun. + It must be released with using of function ::SimdRelease. + */ + SIMD_API void * SimdResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method); + + /*! @ingroup resizing + + \fn void SimdResizerRun(const void * resizer, const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride); + + \short Performs image resizing. + + \param [in] resizer - a resize context. It must be created by function ::SimdResizerInit and released by function ::SimdRelease. + \param [in] src - a pointer to pixels data of the original input image. + \param [in] srcStride - a row size (in bytes) of the input image. + \param [out] dst - a pointer to pixels data of the resized output image. + \param [in] dstStride - a row size (in bytes) of the output image. + */ + SIMD_API void SimdResizerRun(const void * resizer, const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride); + /*! @ingroup segmentation \fn void SimdSegmentationChangeIndex(uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t oldIndex, uint8_t newIndex); @@ -4768,8 +4938,26 @@ extern "C" \param [in] height - an image height. \param [out] sum - the result sum. */ + SIMD_API void SimdSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); + + /*! 
@ingroup other_statistic + + \fn void SimdValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum); + + \short Gets sum and squared sum of value of pixels for gray 8-bit image. + \note This function has a C++ wrappers: Simd::ValueSquareSum(const View& src, uint64_t & valueSum, uint64_t & squareSum). + + \param [in] src - a pointer to pixels data of the image. + \param [in] stride - a row size of the image. + \param [in] width - an image width. + \param [in] height - an image height. + \param [out] valueSum - the result value sum. + \param [out] squareSum - the result square sum. + */ + SIMD_API void SimdValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum); + /*! @ingroup other_statistic \fn void SimdCorrelationSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum); @@ -4840,6 +5028,137 @@ extern "C" */ SIMD_API void SimdSvmSumLinear(const float * x, const float * svs, const float * weights, size_t length, size_t count, float * sum); + /*! @ingroup synet + + \fn void SimdSynetAddBias(const float * bias, size_t count, size_t size, float * dst); + + \short Adds a bias to given vector. + + Algorithm's details: + \verbatim + for(i = 0; i < count; ++i) + for(j = 0; j < size; ++j) + dst[i*size + j] += bias[i]; + \endverbatim + + \note This function is used in Synet Framework. + + \param [in] bias - a pointer to the 32-bit float array with bias coefficients. + \param [in] count - a size of bias array. + \param [in] size - an internal size of bias addition. + \param [in, out] dst - a pointer to cumulative 32-bit float array. The size of the array must be equal to count*size. + */ + SIMD_API void SimdSynetAddBias(const float * bias, size_t count, size_t size, float * dst); + + /*! @ingroup synet + Describes operation type used in function ::SimdSynetEltwiseLayerForward. 
+ */ + typedef enum + { + SimdSynetEltwiseOperationProduct, /*!< Product. */ + SimdSynetEltwiseOperationSum, /*!< Weighted sum. */ + SimdSynetEltwiseOperationMax, /*!< Maximum. */ + } SimdSynetEltwiseOperationType; + + /*! @ingroup synet + + \fn void SimdSynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst); + + \short This function is used for forward propagation of EltwiseLayer. + + Algorithm's details for ::SimdSynetEltwiseOperationProduct: + \verbatim + for(j = 0; j < size; ++j) + dst[j] = 1; + for(i = 0; i < count; ++i) + for(j = 0; j < size; ++j) + dst[j] *= src[i][j]; + \endverbatim + + Algorithm's details for ::SimdSynetEltwiseOperationSum: + \verbatim + for(j = 0; j < size; ++j) + dst[j] = 0; + for(i = 0; i < count; ++i) + for(j = 0; j < size; ++j) + dst[j] += src[i][j]*weight[i]; + \endverbatim + + Algorithm's details for ::SimdSynetEltwiseOperationMax: + \verbatim + for(j = 0; j < size; ++j) + dst[j] = -FLT_MAX; + for(i = 0; i < count; ++i) + for(j = 0; j < size; ++j) + dst[j] = max(dst[j], src[i][j]); + \endverbatim + + \note This function is used in Synet Framework. + + \param [in] src - a pointer to pointers to the input 32-bit float arrays. + \param [in] weight - a pointer to the 32-bit float array with sum coefficients. It is needed only for ::SimdSynetEltwiseOperationSum operation type otherwise it can be NULL. + \param [in] count - a count of input arrays. Must be at least 2. + \param [in] size - a size of the input and output arrays. + \param [in] type - a type of operation (see ::SimdSynetEltwiseOperationType). + \param [out] dst - a pointer to the output 32-bit float array. + */ + SIMD_API void SimdSynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst); + + /*!
@ingroup synet + + \fn void SimdSynetLrnLayerCrossChannels(const float * src, size_t half, size_t count, size_t size, const float * k, float * dst); + + \short This function is used for forward propagation of LrnLayer (cross channels normalization). + + Algorithm's details: + \verbatim + for(i = 0; i < count; ++i) + for(j = 0; j < size; ++j) + { + lo = Max(0, i - half); + ln = Min(count, i + half + 1); + sum = 0; + for(l = lo; l < ln; ++l) + sum += Square(src[l*size + j]); + dst[i*size + j] = src[i*size + j]*Pow(k[0] + sum*k[1], k[2]); + } + \endverbatim + + \note This function is used in Synet Framework. + + \param [in] src - a pointer to the input 32-bit float array. The size of the array must be equal to count*size. + \param [in] half - a local normalization half size. + \param [in] count - a channels count. + \param [in] size - an internal size of the operation. + \param [in] k - a pointer to the 32-bit float array with 3 coefficients (see algorithm details). + \param [out] dst - a pointer to the output 32-bit float array. The size of the array must be equal to count*size. + */ + SIMD_API void SimdSynetLrnLayerCrossChannels(const float * src, size_t half, size_t count, size_t size, const float * k, float * dst); + + /*! @ingroup synet + + \fn void SimdSynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst); + + \short This function is used for forward propagation of ScaleLayer. + + Algorithm's details: + \verbatim + for(i = 0; i < count; ++i) + for(j = 0; j < size; ++j) + dst[i*size + j] = src[i*size + j]*scale[i] + (bias ? bias[i] : 0); + \endverbatim + + \note This function is used in Synet Framework. + + \param [in] src - a pointer to the input 32-bit float array. The size of the array must be equal to count*size. + \param [in] scale - a pointer to the 32-bit float array with scale coefficients. + \param [in] bias - a pointer to the 32-bit float array with bias coefficients. Can be NULL. 
+ \param [in] count - a size of scale and bias arrays. + \param [in] size - an internal size of the operation. + \param [out] dst - a pointer to the output 32-bit float array. The size of the array must be equal to count*size. + */ + SIMD_API void SimdSynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst); + /*! @ingroup texture_estimation \fn void SimdTextureBoostedSaturatedGradient(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t saturation, uint8_t boost, uint8_t * dx, size_t dxStride, uint8_t * dy, size_t dyStride); diff --git a/src/3rd/Simd/SimdLib.hpp b/src/3rd/Simd/SimdLib.hpp index 4896360f..74bba30f 100644 --- a/src/3rd/Simd/SimdLib.hpp +++ b/src/3rd/Simd/SimdLib.hpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar, +* Copyright (c) 2011-2018 Yermalayeu Ihar, * 2014-2016 Antonenka Mikhail. * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -2592,9 +2592,9 @@ namespace Simd \note This function is a C++ wrapper for function ::SimdResizeBilinear. \param [in] src - an original input image. - \param [out] dst - a reduced output image. + \param [out] dst - a resized output image. */ - template class A> SIMD_INLINE void ResizeBilinear(const View& src, View& dst) + template class A> SIMD_INLINE void ResizeBilinear(const View & src, View & dst) { assert(src.format == dst.format && src.ChannelSize() == 1); @@ -2609,6 +2609,43 @@ namespace Simd } } + /*! @ingroup resizing + + \fn void ResizeAreaGray(const View & src, View & dst) + + \short Performs resizing of input image with using area interpolation. + + All images must have the same format (8-bit gray). + + \param [in] src - an original input image. + \param [out] dst - a resized output image. 
+ */ + template class A> SIMD_INLINE void ResizeAreaGray(const View & src, View & dst) + { + assert(src.format == dst.format && src.format == View::Gray8); + + if (EqualSize(src, dst)) + { + Copy(src, dst); + } + else + { + size_t level = 0; + for (; (dst.width << (level + 1)) < (size_t)src.width; level++); + Point size = src.Size() << level; + if (level) + { + Pyramid pyramid(size, level + 1); + Simd::ResizeBilinear(src, pyramid[0]); + for (size_t i = 0; i < level; ++i) + Simd::ReduceGray(pyramid.At(i), pyramid.At(i + 1), ::SimdReduce2x2); + Simd::Copy(pyramid[level], dst); + } + else + Simd::ResizeBilinear(src, dst); + } + } + /*! @ingroup segmentation \fn void SegmentationChangeIndex(View & mask, uint8_t oldIndex, uint8_t newIndex) @@ -3227,6 +3264,25 @@ namespace Simd SimdSquareSum(src.data, src.stride, src.width, src.height, &sum); } + + /*! @ingroup other_statistic + + \fn void ValueSquareSum(const View& src, uint64_t & valueSum, uint64_t & squareSum) + + \short Gets sum and sum of squared value of pixels for gray 8-bit image. + + \note This function is a C++ wrapper for function ::SimdValueSquareSum. + + \param [in] src - an input image. + \param [out] valueSum - a result value sum. + \param [out] squareSum - a result square sum. + */ + template class A> SIMD_INLINE void ValueSquareSum(const View& src, uint64_t & valueSum, uint64_t & squareSum) + { + assert(src.format == View::Gray8); + + SimdValueSquareSum(src.data, src.stride, src.width, src.height, &valueSum, &squareSum); + } /*! @ingroup other_statistic diff --git a/src/3rd/Simd/SimdLoad.h b/src/3rd/Simd/SimdLoad.h index 3ef69919..c173d816 100644 --- a/src/3rd/Simd/SimdLoad.h +++ b/src/3rd/Simd/SimdLoad.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -42,6 +42,11 @@ namespace Simd { return _mm_load_ps(p); } + + SIMD_INLINE __m128 Load(const float * p0, const float * p1) + { + return _mm_loadh_pi(_mm_loadl_pi(_mm_undefined_ps(), (__m64*)p0), (__m64*)p1); + } } #endif//SIMD_SSE_ENABLE @@ -148,7 +153,7 @@ namespace Simd #ifdef SIMD_SSE3_ENABLE namespace Sse3 { -#if defined(_MSC_VER) && _MSC_VER >= 1800 && _MSC_VER < 1900 // Visual Studio 2013 compiler bug +#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug using Sse::Load; using Sse2::Load; #endif @@ -158,7 +163,7 @@ namespace Simd #ifdef SIMD_SSE41_ENABLE namespace Sse41 { -#if defined(_MSC_VER) && _MSC_VER >= 1800 && _MSC_VER < 1900 // Visual Studio 2013 compiler bug +#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug using Sse::Load; using Sse2::Load; #endif @@ -184,6 +189,11 @@ namespace Simd { return _mm256_insertf128_ps(_mm256_castps128_ps256(Sse::Load(p0)), Sse::Load(p1), 1); } + + SIMD_INLINE __m256 Load(const float * p0, const float * p1, const float * p2, const float * p3) + { + return _mm256_insertf128_ps(_mm256_castps128_ps256(Sse::Load(p0, p1)), Sse::Load(p2, p3), 1); + } } #endif//SIMD_AVX_ENABLE diff --git a/src/3rd/Simd/SimdMath.h b/src/3rd/Simd/SimdMath.h index b26fef98..e5cd3e99 100644 --- a/src/3rd/Simd/SimdMath.h +++ b/src/3rd/Simd/SimdMath.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -260,15 +260,6 @@ namespace Simd gradient[offset] += d*d; weight[offset] -= alpha * d / ::sqrt(gradient[offset] + epsilon); } - - SIMD_INLINE float Pow(float basis, float exponent) - { -#if defined(__GNUC__) && defined(SIMD_X86_ENABLE) - -#else - return ::expf(::logf(basis)*exponent); -#endif - } } #ifdef SIMD_SSE_ENABLE @@ -296,16 +287,16 @@ namespace Simd return _mm_or_ps(_mm_and_ps(mask, positive), _mm_andnot_ps(mask, negative)); } - SIMD_INLINE __m128 RightNotZero(size_t count) + SIMD_INLINE __m128 RightNotZero(ptrdiff_t count) { const int32_t mask[DF] = { 0, 0, 0, 0, -1, -1, -1, -1 }; - return _mm_loadu_ps((float*)(mask + count)); + return _mm_loadu_ps((float*)(mask + Simd::RestrictRange(count, 0, F))); } - SIMD_INLINE __m128 LeftNotZero(size_t count) + SIMD_INLINE __m128 LeftNotZero(ptrdiff_t count) { const int32_t mask[DF] = { -1, -1, -1, -1, 0, 0, 0, 0 }; - return _mm_loadu_ps((float*)(mask + 4 - count)); + return _mm_loadu_ps((float*)(mask + F - Simd::RestrictRange(count, 0, F))); } template SIMD_INLINE __m128 Masked(const __m128 & value, const __m128 & mask); @@ -497,7 +488,7 @@ namespace Simd #ifdef SIMD_SSE3_ENABLE namespace Sse3 { -#if defined(_MSC_VER) && _MSC_VER >= 1800 && _MSC_VER < 1900 // Visual Studio 2013 compiler bug +#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug using Sse::RightNotZero; #endif } @@ -530,7 +521,7 @@ namespace Simd #ifdef SIMD_SSE41_ENABLE namespace Sse41 { -#if defined(_MSC_VER) && _MSC_VER >= 1800 && _MSC_VER < 1900 // Visual Studio 2013 compiler bug +#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug using Sse::RightNotZero; #endif @@ -583,10 +574,16 @@ namespace Simd return _mm256_mul_ps(_mm256_rsqrt_ps(_mm256_max_ps(value, _mm256_set1_ps(0.00000001f))), 
value); } - SIMD_INLINE __m256 RightNotZero(size_t count) + SIMD_INLINE __m256 RightNotZero(ptrdiff_t count) { const int32_t mask[DF] = { 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1 }; - return _mm256_loadu_ps((float*)(mask + count)); + return _mm256_loadu_ps((float*)(mask + Simd::RestrictRange(count, 0, F))); + } + + SIMD_INLINE __m256 LeftNotZero(ptrdiff_t count) + { + const int32_t mask[DF] = { -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0 }; + return _mm256_loadu_ps((float*)(mask + F - Simd::RestrictRange(count, 0, F))); } SIMD_INLINE __m256 PermutedHorizontalAdd(__m256 a, __m256 b) @@ -618,7 +615,7 @@ namespace Simd #ifdef SIMD_AVX2_ENABLE namespace Avx2 { -#if defined(_MSC_VER) && _MSC_VER >= 1800 && _MSC_VER < 1900 // Visual Studio 2013 compiler bug +#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug using Avx::RightNotZero; #endif diff --git a/src/3rd/Simd/SimdMemory.h b/src/3rd/Simd/SimdMemory.h index 1530ad56..a4b49cd8 100644 --- a/src/3rd/Simd/SimdMemory.h +++ b/src/3rd/Simd/SimdMemory.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * 2016-2016 Sintegrial Technologies. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -30,6 +30,16 @@ namespace Simd { + SIMD_INLINE size_t AlignHiAny(size_t size, size_t align) + { + return (size + align - 1) / align * align; + } + + SIMD_INLINE size_t AlignLoAny(size_t size, size_t align) + { + return size / align * align; + } + SIMD_INLINE size_t AlignHi(size_t size, size_t align) { return (size + align - 1) & ~(align - 1); @@ -91,6 +101,11 @@ namespace Simd #endif } + struct Deletable + { + virtual ~Deletable() {} + }; + #ifdef SIMD_SSE_ENABLE namespace Sse { diff --git a/src/3rd/Simd/SimdNeon.h b/src/3rd/Simd/SimdNeon.h index b6d6828e..ea5e901a 100644 --- a/src/3rd/Simd/SimdNeon.h +++ b/src/3rd/Simd/SimdNeon.h @@ -1,7 +1,8 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar, +* 2018-2018 Radchenko Andrey. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -407,6 +408,8 @@ namespace Simd void ValueSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); void SquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); + + void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum); void CorrelationSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum); diff --git a/src/3rd/Simd/SimdNeonResizeBilinear.cpp b/src/3rd/Simd/SimdNeonResizeBilinear.cpp index 8fc9512d..7b67a4b1 100644 --- a/src/3rd/Simd/SimdNeonResizeBilinear.cpp +++ b/src/3rd/Simd/SimdNeonResizeBilinear.cpp @@ -1,7 +1,8 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar, +* 2018-2018 Radchenko Andrey. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -58,6 +59,40 @@ namespace Simd private: void *_p; }; + + struct Index + { + int src, dst; + uint8_t shuffle[Simd::Neon::A]; + }; + + struct BufferG + { + BufferG(size_t width, size_t blocks, size_t height) + { + _p = Simd::Allocate(3 * width + sizeof(int) * 2 * height + blocks * sizeof(Index) + 2 * A); + bx[0] = (uint8_t*)_p; + bx[1] = bx[0] + width + A; + ax = bx[1] + width + A; + ix = (Index*)(ax + width); + iy = (int*)(ix + blocks); + ay = iy + height; + } + + ~BufferG() + { + Free(_p); + } + + uint8_t * bx[2]; + uint8_t * ax; + Index * ix; + int * ay; + int * iy; + private: + void *_p; + }; + } template void EstimateAlphaIndexX(size_t srcSize, size_t dstSize, int * indexes, uint8_t * alphas) @@ -91,6 +126,55 @@ namespace Simd } } + void EstimateAlphaIndexX(int srcSize, int dstSize, Index * indexes, uint8_t * alphas, size_t & blockCount) + { + float scale = (float)srcSize / dstSize; + int block = 0; + indexes[0].src = 0; + indexes[0].dst = 0; + for (int dstIndex = 0; dstIndex < dstSize; ++dstIndex) + { + float alpha = (float)((dstIndex + 0.5)*scale - 0.5); + int srcIndex = (int)::floor(alpha); + alpha -= srcIndex; + + if (srcIndex < 0) + { + srcIndex = 0; + alpha = 0; + } + + if (srcIndex > srcSize - 2) + { + srcIndex = srcSize - 2; + alpha = 1; + } + + int dst = 2 * dstIndex - indexes[block].dst; + int src = srcIndex - indexes[block].src; + if (src >= A - 1 || dst >= A) + { + block++; + indexes[block].src = Simd::Min(srcIndex, srcSize - (int)A); + indexes[block].dst = 2 * dstIndex; + dst = 0; + src = srcIndex - indexes[block].src; + } + indexes[block].shuffle[dst] = src; + indexes[block].shuffle[dst + 1] = src + 1; + + alphas[1] = (uint8_t)(alpha * Base::FRACTION_RANGE + 0.5); + alphas[0] = (uint8_t)(Base::FRACTION_RANGE - alphas[1]); + alphas += 2; + } + blockCount = block + 1; + } + + SIMD_INLINE 
size_t BlockCountMax(size_t src, size_t dst) + { + return (size_t)Simd::Max(::ceil(float(src) / (A - 1)), ::ceil(float(dst) / HA)); + } + template void InterpolateX(const uint8_t * alpha, uint8_t * buffer); template <> SIMD_INLINE void InterpolateX<1>(const uint8_t * alpha, uint8_t * buffer) @@ -172,7 +256,7 @@ namespace Simd size_t size = 2 * dstWidth*channelCount; size_t bufferSize = AlignHi(dstWidth, A)*channelCount * 2; size_t alignedSize = AlignHi(size, DA) - DA; - const size_t step = A*channelCount; + const size_t step = A * channelCount; Buffer buffer(bufferSize, dstWidth, dstHeight); @@ -221,6 +305,84 @@ namespace Simd } } + SIMD_INLINE void LoadGray(const uint8_t * src, const Index & index, uint8_t * dst) + { + + uint8x16_t _src = vld1q_u8(src + index.src); + uint8x16_t _shuffle = vld1q_u8(index.shuffle); + + uint8x8x2_t src1; + src1.val[0] = vget_low_u8(_src); + src1.val[1] = vget_high_u8(_src); + + uint8x8_t dstLow = vtbl2_u8(src1, vget_low_u8(_shuffle)); + uint8x8_t dstHigh = vtbl2_u8(src1, vget_high_u8(_shuffle)); + + uint8x16_t _dst = vcombine_u8(dstLow, dstHigh); + + vst1q_u8(dst + index.dst, _dst); + + } + + void ResizeBilinearGray( + const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, + uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride) + { + assert(dstWidth >= A); + + size_t bufferWidth = AlignHi(dstWidth, A) * 2; + size_t blockCount = BlockCountMax(srcWidth, dstWidth); + size_t size = 2 * dstWidth; + size_t alignedSize = AlignHi(size, DA) - DA; + const size_t step = A; + + BufferG buffer(bufferWidth, blockCount, dstHeight); + + Base::EstimateAlphaIndex(srcHeight, dstHeight, buffer.iy, buffer.ay, 1); + + EstimateAlphaIndexX((int)srcWidth, (int)dstWidth, buffer.ix, buffer.ax, blockCount); + + ptrdiff_t previous = -2; + + uint16x8_t a[2]; + + for (size_t yDst = 0; yDst < dstHeight; yDst++, dst += dstStride) + { + a[0] = vdupq_n_u16(Base::FRACTION_RANGE - buffer.ay[yDst]); + a[1] = vdupq_n_u16(buffer.ay[yDst]); + 
+ ptrdiff_t sy = buffer.iy[yDst]; + int k = 0; + + if (sy == previous) + k = 2; + else if (sy == previous + 1) + { + Swap(buffer.bx[0], buffer.bx[1]); + k = 1; + } + + previous = sy; + + for (; k < 2; k++) + { + const uint8_t * psrc = src + (sy + k)*srcStride; + uint8_t * pdst = buffer.bx[k]; + for (size_t i = 0; i < blockCount; ++i) + LoadGray(psrc, buffer.ix[i], pdst); + + uint8_t * pbx = buffer.bx[k]; + for (size_t i = 0; i < bufferWidth; i += step) + InterpolateX<1>(buffer.ax + i, pbx + i); + } + + for (size_t ib = 0, id = 0; ib < alignedSize; ib += DA, id += A) + InterpolateY(buffer.bx[0] + ib, buffer.bx[1] + ib, a, dst + id); + size_t i = size - DA; + InterpolateY(buffer.bx[0] + i, buffer.bx[1] + i, a, dst + i / 2); + } + } + void ResizeBilinear( const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount) @@ -228,7 +390,10 @@ namespace Simd switch (channelCount) { case 1: - ResizeBilinear<1>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); + if (srcWidth >= A && srcWidth < 4 * dstWidth) + ResizeBilinearGray(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); + else + ResizeBilinear<1>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); break; case 2: ResizeBilinear<2>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); diff --git a/src/3rd/Simd/SimdNeonStatistic.cpp b/src/3rd/Simd/SimdNeonStatistic.cpp index ff38b6fc..466590bb 100644 --- a/src/3rd/Simd/SimdNeonStatistic.cpp +++ b/src/3rd/Simd/SimdNeonStatistic.cpp @@ -1,7 +1,8 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar, +* 2018-2018 Radchenko Andrey. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -554,6 +555,48 @@ namespace Simd SquareSum(src, stride, width, height, sum); } + template void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum) + { + assert(width >= A); + if (align) + assert(Aligned(src) && Aligned(stride)); + + size_t alignedWidth = Simd::AlignLo(width, A); + uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + alignedWidth); + uint64x2_t fullValueSum = K64_0000000000000000; + uint64x2_t fullSquareSum = K64_0000000000000000; + for (size_t row = 0; row < height; ++row) + { + uint32x4_t rowValueSum = K32_00000000; + uint32x4_t rowSquareSum = K32_00000000; + for (size_t col = 0; col < alignedWidth; col += A) + { + uint8x16_t _src = Load(src + col); + rowValueSum = vpadalq_u16(rowValueSum, vpaddlq_u8(_src)); + rowSquareSum = vaddq_u32(rowSquareSum, Square(_src)); + } + if (alignedWidth != width) + { + uint8x16_t _src = vandq_u8(Load(src + width - A), tailMask); + rowValueSum = vpadalq_u16(rowValueSum, vpaddlq_u8(_src)); + rowSquareSum = vaddq_u32(rowSquareSum, Square(_src)); + } + fullValueSum = vaddq_u64(fullValueSum, vpaddlq_u32(rowValueSum)); + fullSquareSum = vaddq_u64(fullSquareSum, vpaddlq_u32(rowSquareSum)); + src += stride; + } + *valueSum = ExtractSum64u(fullValueSum); + *squareSum = ExtractSum64u(fullSquareSum); + } + + void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum) + { + if (Aligned(src) && Aligned(stride)) + ValueSquareSum(src, stride, width, height, valueSum, squareSum); + else + ValueSquareSum(src, stride, width, height, valueSum, squareSum); + } + SIMD_INLINE uint32x4_t Correlation(const uint8x16_t & a, const uint8x16_t & b) { uint16x8_t lo = vmull_u8(Half<0>(a), Half<0>(b)); diff --git a/src/3rd/Simd/SimdNeural.hpp 
b/src/3rd/Simd/SimdNeural.hpp index e9c7ba04..3ac72df1 100644 --- a/src/3rd/Simd/SimdNeural.hpp +++ b/src/3rd/Simd/SimdNeural.hpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -1282,7 +1282,7 @@ namespace Simd */ class DropoutLayer : public Layer { - const size_t RANDOM_SIZE = 256; + static size_t SIMD_INLINE RandomSize() { return 256; } public: /*! \short Creates new DropoutLayer class. @@ -1344,7 +1344,7 @@ namespace Simd _specific.resize(number); if (train) { - _mask.resize(_src.Volume()*(1 + RANDOM_SIZE)); + _mask.resize(_src.Volume()*(1 + RandomSize())); for (size_t i = 0; i < _mask.size(); ++i) _mask[i] = Detail::RandomUniform(0.0f, 1.0f) <= _rate ? 1.0f : 0.0f; } @@ -1362,7 +1362,7 @@ namespace Simd const float * Mask() { - size_t start = Detail::RandomUniform(0, int(RANDOM_SIZE*_src.Volume())); + size_t start = Detail::RandomUniform(0, int(RandomSize()*_src.Volume())); return _mask.data() + start; } }; diff --git a/src/3rd/Simd/SimdParallel.hpp b/src/3rd/Simd/SimdParallel.hpp index 34464d2e..142c2a4a 100644 --- a/src/3rd/Simd/SimdParallel.hpp +++ b/src/3rd/Simd/SimdParallel.hpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -26,21 +26,21 @@ #include #include +#include namespace Simd { - template inline void Parallel(size_t begin, size_t end, const Function & function, size_t threadNumber, size_t blockStepMin = 1) + template inline void Parallel(size_t begin, size_t end, const Function & function, size_t threadNumber, size_t blockAlign = 1) { threadNumber = std::min(threadNumber, std::thread::hardware_concurrency()); - if (threadNumber <= 1) + if (threadNumber <= 1 || size_t(blockAlign*1.5) >= (end - begin)) function(0, begin, end); else { std::vector> futures; size_t blockSize = (end - begin + threadNumber - 1) / threadNumber; - if (blockStepMin > 1) - blockSize += blockSize%blockStepMin; + blockSize = (blockSize + blockAlign - 1) / blockAlign * blockAlign; size_t blockBegin = begin; size_t blockEnd = blockBegin + blockSize; diff --git a/src/3rd/Simd/SimdPow.h b/src/3rd/Simd/SimdPow.h new file mode 100644 index 00000000..6fcb4d04 --- /dev/null +++ b/src/3rd/Simd/SimdPow.h @@ -0,0 +1,205 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ +#ifndef __SimdPow_h__ +#define __SimdPow_h__ + +#include "Simd/SimdMath.h" + +namespace Simd +{ + namespace Base + { + SIMD_INLINE float Pow(float basis, float exponent) + { + return ::expf(::logf(basis)*exponent); + } + } + +#ifdef SIMD_SSE2_ENABLE + namespace Sse2 + { + class Pow + { + __m128i _exponent, _mantissa; + __m128 _one; + + SIMD_INLINE __m128 Poly5(__m128 x, float a, float b, float c, float d, float e, float f) const + { + __m128 p = _mm_set1_ps(f); + p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(e)); + p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(d)); + p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(c)); + p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(b)); + p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(a)); + return p; + } + + SIMD_INLINE __m128 Exp2(__m128 x) const + { + x = _mm_max_ps(_mm_min_ps(x, _mm_set1_ps(129.00000f)), _mm_set1_ps(-126.99999f)); + __m128i ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f))); + __m128 fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart)); + __m128 expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23)); + __m128 expfpart = Poly5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f); + return _mm_mul_ps(expipart, expfpart); + } + + SIMD_INLINE __m128 Log2(__m128 x) const + { + __m128i i = _mm_castps_si128(x); + __m128 e = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, _exponent), 23), _mm_set1_epi32(127))); + __m128 m = 
_mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, _mantissa)), _one); + __m128 p = Poly5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); + return _mm_add_ps(_mm_mul_ps(p, _mm_sub_ps(m, _one)), e); + } + + public: + + SIMD_INLINE Pow() + { + _exponent = _mm_set1_epi32(0x7F800000); + _mantissa = _mm_set1_epi32(0x007FFFFF); + _one = _mm_set1_ps(1.0f); + } + + SIMD_INLINE __m128 operator() (__m128 basis, __m128 exponent) const + { + return Exp2(_mm_mul_ps(Log2(basis), exponent)); + } + }; + } +#endif //SIMD_SSE2_ENABLE + +#ifdef SIMD_AVX2_ENABLE + namespace Avx2 + { + class Pow + { + __m256i _exponent, _mantissa; + __m256 _one; + + SIMD_INLINE __m256 Poly5(__m256 x, float a, float b, float c, float d, float e, float f) const + { + __m256 p = _mm256_set1_ps(f); + p = _mm256_fmadd_ps(x, p, _mm256_set1_ps(e)); + p = _mm256_fmadd_ps(x, p, _mm256_set1_ps(d)); + p = _mm256_fmadd_ps(x, p, _mm256_set1_ps(c)); + p = _mm256_fmadd_ps(x, p, _mm256_set1_ps(b)); + p = _mm256_fmadd_ps(x, p, _mm256_set1_ps(a)); + return p; + } + + SIMD_INLINE __m256 Exp2(__m256 x) const + { + x = _mm256_max_ps(_mm256_min_ps(x, _mm256_set1_ps(129.00000f)), _mm256_set1_ps(-126.99999f)); + __m256i ipart = _mm256_cvtps_epi32(_mm256_sub_ps(x, _mm256_set1_ps(0.5f))); + __m256 fpart = _mm256_sub_ps(x, _mm256_cvtepi32_ps(ipart)); + __m256 expipart = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_add_epi32(ipart, _mm256_set1_epi32(127)), 23)); + __m256 expfpart = Poly5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f); + return _mm256_mul_ps(expipart, expfpart); + } + + SIMD_INLINE __m256 Log2(__m256 x) const + { + __m256i i = _mm256_castps_si256(x); + __m256 e = _mm256_cvtepi32_ps(_mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(i, _exponent), 23), _mm256_set1_epi32(127))); + __m256 m = _mm256_or_ps(_mm256_castsi256_ps(_mm256_and_si256(i, _mantissa)), _one); + __m256 p = Poly5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 
3.1821337e-1f, -3.4436006e-2f); + return _mm256_fmadd_ps(p, _mm256_sub_ps(m, _one), e); + } + + public: + + SIMD_INLINE Pow() + { + _exponent = _mm256_set1_epi32(0x7F800000); + _mantissa = _mm256_set1_epi32(0x007FFFFF); + _one = _mm256_set1_ps(1.0f); + } + + SIMD_INLINE __m256 operator()(__m256 basis, __m256 exponent) const + { + return Exp2(_mm256_mul_ps(Log2(basis), exponent)); + } + }; + } +#endif //SIMD_AVX2_ENABLE + +#ifdef SIMD_AVX512F_ENABLE + namespace Avx512f + { + class Pow + { + __m512i _exponent, _mantissa; + __m512 _one; + + SIMD_INLINE __m512 Poly5(__m512 x, float a, float b, float c, float d, float e, float f) const + { + __m512 p = _mm512_set1_ps(f); + p = _mm512_fmadd_ps(x, p, _mm512_set1_ps(e)); + p = _mm512_fmadd_ps(x, p, _mm512_set1_ps(d)); + p = _mm512_fmadd_ps(x, p, _mm512_set1_ps(c)); + p = _mm512_fmadd_ps(x, p, _mm512_set1_ps(b)); + p = _mm512_fmadd_ps(x, p, _mm512_set1_ps(a)); + return p; + } + + SIMD_INLINE __m512 Exp2(__m512 x) const + { + x = _mm512_max_ps(_mm512_min_ps(x, _mm512_set1_ps(129.00000f)), _mm512_set1_ps(-126.99999f)); + __m512i ipart = _mm512_cvtps_epi32(_mm512_sub_ps(x, _mm512_set1_ps(0.5f))); + __m512 fpart = _mm512_sub_ps(x, _mm512_cvtepi32_ps(ipart)); + __m512 expipart = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_add_epi32(ipart, _mm512_set1_epi32(127)), 23)); + __m512 expfpart = Poly5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f); + return _mm512_mul_ps(expipart, expfpart); + } + + SIMD_INLINE __m512 Log2(__m512 x) const + { + __m512i i = _mm512_castps_si512(x); + __m512 e = _mm512_cvtepi32_ps(_mm512_sub_epi32(_mm512_srli_epi32(_mm512_and_si512(i, _exponent), 23), _mm512_set1_epi32(127))); + __m512 m = _mm512_or_ps(_mm512_castsi512_ps(_mm512_and_si512(i, _mantissa)), _one); + __m512 p = Poly5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); + return _mm512_fmadd_ps(p, _mm512_sub_ps(m, _one), e); + } + + public: + + SIMD_INLINE Pow() 
+ { + _exponent = _mm512_set1_epi32(0x7F800000); + _mantissa = _mm512_set1_epi32(0x007FFFFF); + _one = _mm512_set1_ps(1.0f); + } + + SIMD_INLINE __m512 operator()(__m512 basis, __m512 exponent) const + { + return Exp2(_mm512_mul_ps(Log2(basis), exponent)); + } + }; + } +#endif //SIMD_AVX512F_ENABLE +} + +#endif//__SimdPow_h__ diff --git a/src/3rd/Simd/SimdResizer.h b/src/3rd/Simd/SimdResizer.h new file mode 100644 index 00000000..3d709314 --- /dev/null +++ b/src/3rd/Simd/SimdResizer.h @@ -0,0 +1,139 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#ifndef __SimdResizer_h__ +#define __SimdResizer_h__ + +#include "Simd/SimdArray.h" + +namespace Simd +{ + class Resizer : Deletable + { + SimdResizeChannelType _type; + SimdResizeMethodType _method; + + public: + Resizer(SimdResizeChannelType type, SimdResizeMethodType method) + : _type(type) + , _method(method) + { + } + + SimdResizeChannelType Type() const { return _type; } + SimdResizeMethodType Method() const { return _method; } + + virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) const = 0; + }; + + namespace Base + { + class ResizerByteBilinear : Resizer + { + size_t _sx, _sy, _dx, _dy, _cn, _rs; + Array32i _ax, _ix, _ay, _iy; + public: + ResizerByteBilinear(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels); + + static void EstimateIndexAlpha(size_t srcSize, size_t dstSize, int32_t * indices, int32_t * alphas, size_t channels); + + virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) const; + }; + + class ResizerFloatBilinear : Resizer + { + protected: + size_t _sx, _sy, _dx, _dy, _cn, _rs; + Array32i _ix, _iy; + Array32f _ax, _ay; + + virtual void Run(const float * src, size_t srcStride, float * dst, size_t dstStride) const; + + public: + ResizerFloatBilinear(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, size_t align, bool caffeInterp); + + virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) const; + + static void EstimateIndexAlpha(size_t srcSize, size_t dstSize, int32_t * indices, float * alphas, size_t channels, bool caffeInterp); + }; + + void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method); + } + +#ifdef SIMD_SSE_ENABLE + namespace Sse + { + class ResizerFloatBilinear : Base::ResizerFloatBilinear + { + virtual void Run(const float * src, size_t srcStride, float * dst, size_t dstStride) const; + public: + 
ResizerFloatBilinear(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, bool caffeInterp); + }; + + void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method); + } +#endif //SIMD_SSE_ENABLE + +#ifdef SIMD_AVX_ENABLE + namespace Avx + { + class ResizerFloatBilinear : Base::ResizerFloatBilinear + { + virtual void Run(const float * src, size_t srcStride, float * dst, size_t dstStride) const; + public: + ResizerFloatBilinear(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, bool caffeInterp); + }; + + void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method); + } +#endif //SIMD_AVX_ENABLE + +#ifdef SIMD_AVX2_ENABLE + namespace Avx2 + { + class ResizerFloatBilinear : Base::ResizerFloatBilinear + { + virtual void Run(const float * src, size_t srcStride, float * dst, size_t dstStride) const; + public: + ResizerFloatBilinear(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, bool caffeInterp); + }; + + void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method); + } +#endif //SIMD_AVX2_ENABLE + +#ifdef SIMD_AVX512F_ENABLE + namespace Avx512f + { + class ResizerFloatBilinear : Base::ResizerFloatBilinear + { + virtual void Run(const float * src, size_t srcStride, float * dst, size_t dstStride) const; + public: + ResizerFloatBilinear(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, bool caffeInterp); + }; + + void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method); + } +#endif //SIMD_AVX512F_ENABLE +} +#endif//__SimdResizer_h__ diff --git a/src/3rd/Simd/SimdSse1.h b/src/3rd/Simd/SimdSse1.h index d216df1a..3edf8be5 100644 --- a/src/3rd/Simd/SimdSse1.h +++ 
b/src/3rd/Simd/SimdSse1.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -31,6 +31,10 @@ namespace Simd #ifdef SIMD_SSE_ENABLE namespace Sse { + void CosineDistance32f(const float * a, const float * b, size_t size, float * distance); + + void Gemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc); + void HogDeinterleave(const float * src, size_t srcStride, size_t width, size_t height, size_t count, float ** dst, size_t dstStride); void HogFilterSeparable(const float * src, size_t srcStride, size_t width, size_t height, const float * rowFilter, size_t rowSize, const float * colFilter, size_t colSize, float * dst, size_t dstStride, int add); @@ -96,6 +100,12 @@ namespace Simd void SquaredDifferenceKahanSum32f(const float * a, const float * b, size_t size, float * sum); void SvmSumLinear(const float * x, const float * svs, const float * weights, size_t length, size_t count, float * sum); + + void SynetAddBias(const float * bias, size_t count, size_t size, float * dst); + + void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst); + + void SynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst); } #endif// SIMD_SSE_ENABLE } diff --git a/src/3rd/Simd/SimdSse1Float32.cpp b/src/3rd/Simd/SimdSse1Float32.cpp new file mode 100644 index 00000000..f2c3370a --- /dev/null +++ b/src/3rd/Simd/SimdSse1Float32.cpp @@ -0,0 +1,92 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdStore.h" +#include "Simd/SimdExtract.h" + +namespace Simd +{ +#ifdef SIMD_SSE_ENABLE + namespace Sse + { + template void CosineDistance32f(const float * a, const float * b, size_t size, float * distance) + { + if (align) + assert(Aligned(a) && Aligned(b)); + + size_t partialAlignedSize = AlignLo(size, F); + size_t fullAlignedSize = AlignLo(size, DF); + size_t i = 0; + __m128 _aa[2] = { _mm_setzero_ps(), _mm_setzero_ps() }; + __m128 _ab[2] = { _mm_setzero_ps(), _mm_setzero_ps() }; + __m128 _bb[2] = { _mm_setzero_ps(), _mm_setzero_ps() }; + if (fullAlignedSize) + { + for (; i < fullAlignedSize; i += DF) + { + __m128 a0 = Load(a + i + 0 * F); + __m128 b0 = Load(b + i + 0 * F); + _aa[0] = _mm_add_ps(_aa[0], _mm_mul_ps(a0, a0)); + _ab[0] = _mm_add_ps(_ab[0], _mm_mul_ps(a0, b0)); + _bb[0] = _mm_add_ps(_bb[0], _mm_mul_ps(b0, b0)); + __m128 a1 = Load(a + i + 1 * F); + __m128 b1 = Load(b + i + 1 * F); + _aa[1] = _mm_add_ps(_aa[1], _mm_mul_ps(a1, a1)); + _ab[1] = _mm_add_ps(_ab[1], _mm_mul_ps(a1, b1)); + _bb[1] = _mm_add_ps(_bb[1], _mm_mul_ps(b1, b1)); + } + _aa[0] = _mm_add_ps(_aa[0], _aa[1]); + _ab[0] = _mm_add_ps(_ab[0], _ab[1]); + _bb[0] = _mm_add_ps(_bb[0], _bb[1]); + } + for (; i < partialAlignedSize; i += F) + { + __m128 a0 = Load(a + i); + __m128 b0 = Load(b + i); + _aa[0] = _mm_add_ps(_aa[0], _mm_mul_ps(a0, a0)); + _ab[0] = _mm_add_ps(_ab[0], _mm_mul_ps(a0, b0)); + _bb[0] = _mm_add_ps(_bb[0], _mm_mul_ps(b0, b0)); + } + float aa = ExtractSum(_aa[0]), ab = ExtractSum(_ab[0]), bb = ExtractSum(_bb[0]); + for (; i < size; ++i) + { + float _a = a[i]; + float _b = b[i]; + aa += _a * _a; + ab += _a * _b; + bb += _b * _b; + } + *distance = 1.0f - ab / ::sqrt(aa*bb); + } + + void CosineDistance32f(const float * a, const float * b, size_t size, float * distance) + { + if (Aligned(a) && Aligned(b)) + CosineDistance32f(a, b, size, distance); + else + CosineDistance32f(a, b, size, distance); + } + } +#endif// SIMD_SSE_ENABLE 
+} diff --git a/src/3rd/Simd/SimdSse1Gemm32f.cpp b/src/3rd/Simd/SimdSse1Gemm32f.cpp new file mode 100644 index 00000000..7d480aff --- /dev/null +++ b/src/3rd/Simd/SimdSse1Gemm32f.cpp @@ -0,0 +1,595 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdStore.h" +#include "Simd/SimdGemm.h" + +namespace Simd +{ +#ifdef SIMD_SSE_ENABLE + namespace Sse + { + SIMD_INLINE void AddProduct(float * ptr, __m128 value, __m128 alpha) + { + _mm_storeu_ps(ptr, _mm_add_ps(_mm_mul_ps(value, alpha), _mm_loadu_ps(ptr))); + } + + SIMD_INLINE void AddProduct(float * ptr, __m128 value, __m128 alpha, size_t tail) + { + if (tail == F) + AddProduct(ptr, value, alpha); + else + { + float tmp[F]; + _mm_storeu_ps(tmp, _mm_add_ps(_mm_mul_ps(value, alpha), _mm_loadu_ps(ptr))); + for (size_t i = 0; i < tail; ++i) + ptr[i] = tmp[i]; + } + } + + static void Kernel4x12(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m128 c00 = _mm_setzero_ps(); + __m128 c10 = _mm_setzero_ps(); + __m128 c20 = _mm_setzero_ps(); + __m128 c30 = _mm_setzero_ps(); + __m128 c01 = _mm_setzero_ps(); + __m128 c11 = _mm_setzero_ps(); + __m128 c21 = _mm_setzero_ps(); + __m128 c31 = _mm_setzero_ps(); + __m128 c02 = _mm_setzero_ps(); + __m128 c12 = _mm_setzero_ps(); + __m128 c22 = _mm_setzero_ps(); + __m128 c32 = _mm_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + __m128 b0, b1, b2, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm_loadu_ps(B + 0 * F); + b1 = _mm_loadu_ps(B + 1 * F); + b2 = _mm_loadu_ps(B + 2 * F); + a0 = _mm_set1_ps(*A0++); + c00 = _mm_add_ps(_mm_mul_ps(a0, b0), c00); + c01 = _mm_add_ps(_mm_mul_ps(a0, b1), c01); + c02 = _mm_add_ps(_mm_mul_ps(a0, b2), c02); + a0 = _mm_set1_ps(*A1++); + c10 = _mm_add_ps(_mm_mul_ps(a0, b0), c10); + c11 = _mm_add_ps(_mm_mul_ps(a0, b1), c11); + c12 = _mm_add_ps(_mm_mul_ps(a0, b2), c12); + a0 = _mm_set1_ps(*A2++); + c20 = _mm_add_ps(_mm_mul_ps(a0, b0), c20); + c21 = _mm_add_ps(_mm_mul_ps(a0, b1), c21); + c22 = _mm_add_ps(_mm_mul_ps(a0, b2), c22); + a0 = _mm_set1_ps(*A3++); + c30 = _mm_add_ps(_mm_mul_ps(a0, b0), c30); + c31 = 
_mm_add_ps(_mm_mul_ps(a0, b1), c31); + c32 = _mm_add_ps(_mm_mul_ps(a0, b2), c32); + B += ldb; + } + __m128 _alpha = _mm_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00); + AddProduct(C + 1 * F, _alpha, c01); + AddProduct(C + 2 * F, _alpha, c02, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10); + AddProduct(C + 1 * F, _alpha, c11); + AddProduct(C + 2 * F, _alpha, c12, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20); + AddProduct(C + 1 * F, _alpha, c21); + AddProduct(C + 2 * F, _alpha, c22, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30); + AddProduct(C + 1 * F, _alpha, c31); + AddProduct(C + 2 * F, _alpha, c32, tail); + } + + static void Kernel4x8(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m128 c00 = _mm_setzero_ps(); + __m128 c10 = _mm_setzero_ps(); + __m128 c20 = _mm_setzero_ps(); + __m128 c30 = _mm_setzero_ps(); + __m128 c01 = _mm_setzero_ps(); + __m128 c11 = _mm_setzero_ps(); + __m128 c21 = _mm_setzero_ps(); + __m128 c31 = _mm_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + __m128 b0, b1, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm_loadu_ps(B + 0 * F); + b1 = _mm_loadu_ps(B + 1 * F); + a0 = _mm_set1_ps(*A0++); + c00 = _mm_add_ps(_mm_mul_ps(a0, b0), c00); + c01 = _mm_add_ps(_mm_mul_ps(a0, b1), c01); + a0 = _mm_set1_ps(*A1++); + c10 = _mm_add_ps(_mm_mul_ps(a0, b0), c10); + c11 = _mm_add_ps(_mm_mul_ps(a0, b1), c11); + a0 = _mm_set1_ps(*A2++); + c20 = _mm_add_ps(_mm_mul_ps(a0, b0), c20); + c21 = _mm_add_ps(_mm_mul_ps(a0, b1), c21); + a0 = _mm_set1_ps(*A3++); + c30 = _mm_add_ps(_mm_mul_ps(a0, b0), c30); + c31 = _mm_add_ps(_mm_mul_ps(a0, b1), c31); + B += ldb; + } + __m128 _alpha = _mm_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00); + AddProduct(C + 1 * F, _alpha, c01, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10); + AddProduct(C + 1 * 
F, _alpha, c11, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20); + AddProduct(C + 1 * F, _alpha, c21, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30); + AddProduct(C + 1 * F, _alpha, c31, tail); + } + + static void Kernel4x4(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m128 c0 = _mm_setzero_ps(); + __m128 c1 = _mm_setzero_ps(); + __m128 c2 = _mm_setzero_ps(); + __m128 c3 = _mm_setzero_ps(); + const float * a0 = A + lda * 0; + const float * a1 = A + lda * 1; + const float * a2 = A + lda * 2; + const float * a3 = A + lda * 3; + __m128 b0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm_loadu_ps(B); + c0 = _mm_add_ps(_mm_mul_ps(b0, _mm_set1_ps(*a0++)), c0); + c1 = _mm_add_ps(_mm_mul_ps(b0, _mm_set1_ps(*a1++)), c1); + c2 = _mm_add_ps(_mm_mul_ps(b0, _mm_set1_ps(*a2++)), c2); + c3 = _mm_add_ps(_mm_mul_ps(b0, _mm_set1_ps(*a3++)), c3); + B += ldb; + } + __m128 _alpha = _mm_set1_ps(alpha); + AddProduct(C + 0 * ldc, _alpha, c0, tail); + AddProduct(C + 1 * ldc, _alpha, c1, tail); + AddProduct(C + 2 * ldc, _alpha, c2, tail); + AddProduct(C + 3 * ldc, _alpha, c3, tail); + } + + static void Kernel6x8(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m128 c00 = _mm_setzero_ps(); + __m128 c10 = _mm_setzero_ps(); + __m128 c20 = _mm_setzero_ps(); + __m128 c30 = _mm_setzero_ps(); + __m128 c40 = _mm_setzero_ps(); + __m128 c50 = _mm_setzero_ps(); + __m128 c01 = _mm_setzero_ps(); + __m128 c11 = _mm_setzero_ps(); + __m128 c21 = _mm_setzero_ps(); + __m128 c31 = _mm_setzero_ps(); + __m128 c41 = _mm_setzero_ps(); + __m128 c51 = _mm_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + const float * A4 = A + lda * 4; + const float * A5 = A + lda * 5; + __m128 b0, b1, a0; + for (size_t k = 0; k < K; k++) + { + b0 = 
_mm_loadu_ps(B + 0 * F); + b1 = _mm_loadu_ps(B + 1 * F); + a0 = _mm_set1_ps(*A0++); + c00 = _mm_add_ps(_mm_mul_ps(a0, b0), c00); + c01 = _mm_add_ps(_mm_mul_ps(a0, b1), c01); + a0 = _mm_set1_ps(*A1++); + c10 = _mm_add_ps(_mm_mul_ps(a0, b0), c10); + c11 = _mm_add_ps(_mm_mul_ps(a0, b1), c11); + a0 = _mm_set1_ps(*A2++); + c20 = _mm_add_ps(_mm_mul_ps(a0, b0), c20); + c21 = _mm_add_ps(_mm_mul_ps(a0, b1), c21); + a0 = _mm_set1_ps(*A3++); + c30 = _mm_add_ps(_mm_mul_ps(a0, b0), c30); + c31 = _mm_add_ps(_mm_mul_ps(a0, b1), c31); + a0 = _mm_set1_ps(*A4++); + c40 = _mm_add_ps(_mm_mul_ps(a0, b0), c40); + c41 = _mm_add_ps(_mm_mul_ps(a0, b1), c41); + a0 = _mm_set1_ps(*A5++); + c50 = _mm_add_ps(_mm_mul_ps(a0, b0), c50); + c51 = _mm_add_ps(_mm_mul_ps(a0, b1), c51); + B += ldb; + } + __m128 _alpha = _mm_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00); + AddProduct(C + 1 * F, _alpha, c01, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10); + AddProduct(C + 1 * F, _alpha, c11, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20); + AddProduct(C + 1 * F, _alpha, c21, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30); + AddProduct(C + 1 * F, _alpha, c31, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c40); + AddProduct(C + 1 * F, _alpha, c41, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c50); + AddProduct(C + 1 * F, _alpha, c51, tail); + } + + static void Kernel6x4(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m128 c0 = _mm_setzero_ps(); + __m128 c1 = _mm_setzero_ps(); + __m128 c2 = _mm_setzero_ps(); + __m128 c3 = _mm_setzero_ps(); + __m128 c4 = _mm_setzero_ps(); + __m128 c5 = _mm_setzero_ps(); + const float * a0 = A + lda * 0; + const float * a1 = A + lda * 1; + const float * a2 = A + lda * 2; + const float * a3 = A + lda * 3; + const float * a4 = A + lda * 4; + const float * a5 = A + lda * 5; + __m128 b0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm_loadu_ps(B); + c0 = 
_mm_add_ps(_mm_mul_ps(b0, _mm_set1_ps(*a0++)), c0); + c1 = _mm_add_ps(_mm_mul_ps(b0, _mm_set1_ps(*a1++)), c1); + c2 = _mm_add_ps(_mm_mul_ps(b0, _mm_set1_ps(*a2++)), c2); + c3 = _mm_add_ps(_mm_mul_ps(b0, _mm_set1_ps(*a3++)), c3); + c4 = _mm_add_ps(_mm_mul_ps(b0, _mm_set1_ps(*a4++)), c4); + c5 = _mm_add_ps(_mm_mul_ps(b0, _mm_set1_ps(*a5++)), c5); + B += ldb; + } + __m128 _alpha = _mm_set1_ps(alpha); + AddProduct(C + 0 * ldc, _alpha, c0, tail); + AddProduct(C + 1 * ldc, _alpha, c1, tail); + AddProduct(C + 2 * ldc, _alpha, c2, tail); + AddProduct(C + 3 * ldc, _alpha, c3, tail); + AddProduct(C + 4 * ldc, _alpha, c4, tail); + AddProduct(C + 5 * ldc, _alpha, c5, tail); + } + + static void KernelMx12(size_t M, size_t N, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m128 c[4][3]; + const float * a[4]; + for (size_t i = 0; i < M; ++i) + { + c[i][0] = _mm_setzero_ps(); + c[i][1] = _mm_setzero_ps(); + c[i][2] = _mm_setzero_ps(); + a[i] = A + lda * i; + } + __m128 b0, b1, b2, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm_loadu_ps(B + 0 * F); + b1 = _mm_loadu_ps(B + 1 * F); + b2 = _mm_loadu_ps(B + 2 * F); + for (size_t i = 0; i < M; ++i) + { + a0 = _mm_set1_ps(*a[i]++); + c[i][0] = _mm_add_ps(_mm_mul_ps(b0, a0), c[i][0]); + c[i][1] = _mm_add_ps(_mm_mul_ps(b1, a0), c[i][1]); + c[i][2] = _mm_add_ps(_mm_mul_ps(b2, a0), c[i][2]); + } + B += ldb; + } + __m128 _alpha = _mm_set1_ps(alpha); + for (size_t i = 0; i < M; ++i) + { + AddProduct(C + 0 * F, _alpha, c[i][0]); + AddProduct(C + 1 * F, _alpha, c[i][1]); + AddProduct(C + 2 * F, _alpha, c[i][2], tail); + C += ldc; + } + } + + static void KernelMx8(size_t M, size_t N, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m128 c[6][2]; + const float * a[6]; + for (size_t i = 0; i < M; ++i) + { + c[i][0] = _mm_setzero_ps(); + c[i][1] = _mm_setzero_ps(); + a[i] = A + lda * i; + } + 
__m128 b0, b1, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm_loadu_ps(B + 0 * F); + b1 = _mm_loadu_ps(B + 1 * F); + for (size_t i = 0; i < M; ++i) + { + a0 = _mm_set1_ps(*a[i]++); + c[i][0] = _mm_add_ps(_mm_mul_ps(b0, a0), c[i][0]); + c[i][1] = _mm_add_ps(_mm_mul_ps(b1, a0), c[i][1]); + } + B += ldb; + } + __m128 _alpha = _mm_set1_ps(alpha); + for (size_t i = 0; i < M; ++i) + { + AddProduct(C + 0 * F, _alpha, c[i][0]); + AddProduct(C + 1 * F, _alpha, c[i][1], tail); + C += ldc; + } + } + + static void KernelMx4(size_t M, size_t N, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { +#ifdef SIMD_X64_ENABLE + __m128 c[6]; + const float * a[6]; +#else + __m128 c[4]; + const float * a[4]; +#endif + for (size_t i = 0; i < M; ++i) + { + c[i] = _mm_setzero_ps(); + a[i] = A + lda * i; + } + __m128 b0, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm_loadu_ps(B + 0 * F); + for (size_t i = 0; i < M; ++i) + { + a0 = _mm_set1_ps(*a[i]++); + c[i] = _mm_add_ps(_mm_mul_ps(b0, a0), c[i]); + } + B += ldb; + } + __m128 _alpha = _mm_set1_ps(alpha); + for (size_t i = 0; i < M; ++i) + AddProduct(C + i * ldc, _alpha, c[i], tail); + } + + SIMD_INLINE void ScaleC(float * C, __m128 beta) + { + _mm_storeu_ps(C, _mm_mul_ps(_mm_loadu_ps(C), beta)); + } + + void ScaleC(size_t M, size_t N, float beta, float * C, size_t ldc) + { + if (beta == 1.0f) + return; + else if (beta == 0.0f) + { + for (size_t i = 0; i < M; ++i) + memset(C + i * ldc, 0, N * sizeof(float)); + } + else + { + size_t NQF = AlignLo(N, QF); + size_t NF = AlignLo(N, F); + __m128 _beta = _mm_set1_ps(beta); + for (size_t i = 0; i < M; ++i) + { + size_t j = 0; + for (; j < NQF; j += QF) + { + ScaleC(C + j + F * 0, _beta); + ScaleC(C + j + F * 1, _beta); + ScaleC(C + j + F * 2, _beta); + ScaleC(C + j + F * 3, _beta); + } + for (; j < NF; j += F) + ScaleC(C + j, _beta); + for (; j < N; ++j) + C[j] *= beta; + C += ldc; + } + } + } + + static void PackA(const 
float * src, size_t stride, size_t M, size_t K, size_t cell, float * dst) + { + for (size_t i = 0; i < M; i += cell) + { + size_t m = Simd::Min(cell, M - i), k = 0; + if (cell == 4 && m == 4) + { + size_t K4 = AlignLo(K, 4); + for (; k < K4; k += 4) + { + const float * ps = src + k; + __m128 s0 = _mm_loadu_ps(ps + 0 * stride); + __m128 s1 = _mm_loadu_ps(ps + 1 * stride); + __m128 s2 = _mm_loadu_ps(ps + 2 * stride); + __m128 s3 = _mm_loadu_ps(ps + 3 * stride); + __m128 s00 = _mm_unpacklo_ps(s0, s2); + __m128 s01 = _mm_unpacklo_ps(s1, s3); + __m128 s10 = _mm_unpackhi_ps(s0, s2); + __m128 s11 = _mm_unpackhi_ps(s1, s3); + _mm_storeu_ps(dst + 0, _mm_unpacklo_ps(s00, s01)); + _mm_storeu_ps(dst + 4, _mm_unpackhi_ps(s00, s01)); + _mm_storeu_ps(dst + 8, _mm_unpacklo_ps(s10, s11)); + _mm_storeu_ps(dst + 12, _mm_unpackhi_ps(s10, s11)); + dst += 16; + } + } + for (; k < K; ++k) + { + for (size_t c = 0; c < m; ++c) + *(dst++) = src[c*stride + k]; + } + src += cell * stride; + } + } + + static void PackB(const float * B, size_t ldb, size_t K, size_t N, size_t microN, float * pB) + { + for (size_t j = 0; j < N; j += microN) + { + size_t n = Simd::Min(microN, N - j); + size_t k = 0; + if (microN == 1 * F) + { + if (n == microN) + { + for (; k < K; ++k) + { + const float * b = B + k * ldb; + _mm_storeu_ps(pB + 0 * F, _mm_loadu_ps(b + 0 * F)); + pB += microN; + } + } + else + { + __m128 mask0 = Sse::LeftNotZero(n - 0 * F); + for (; k < K - 1; ++k) + { + const float * b = B + k * ldb; + _mm_storeu_ps(pB + 0 * F, _mm_and_ps(mask0, _mm_loadu_ps(b + 0 * F))); + pB += microN; + } + } + } + else if (microN == 2 * F) + { + if (n == microN) + { + for (; k < K; ++k) + { + const float * b = B + k * ldb; + _mm_storeu_ps(pB + 0 * F, _mm_loadu_ps(b + 0 * F)); + _mm_storeu_ps(pB + 1 * F, _mm_loadu_ps(b + 1 * F)); + pB += microN; + } + } + else + { + __m128 mask0 = Sse::LeftNotZero(n - 0 * F); + __m128 mask1 = Sse::LeftNotZero(n - 1 * F); + for (; k < K - 1; ++k) + { + const float * b = B + k * 
ldb; + _mm_storeu_ps(pB + 0 * F, _mm_and_ps(mask0, _mm_loadu_ps(b + 0 * F))); + _mm_storeu_ps(pB + 1 * F, _mm_and_ps(mask1, _mm_loadu_ps(b + 1 * F))); + pB += microN; + } + } + } + else if (microN == 3 * F) + { + if (n == microN) + { + for (; k < K; ++k) + { + const float * b = B + k * ldb; + _mm_storeu_ps(pB + 0 * F, _mm_loadu_ps(b + 0 * F)); + _mm_storeu_ps(pB + 1 * F, _mm_loadu_ps(b + 1 * F)); + _mm_storeu_ps(pB + 2 * F, _mm_loadu_ps(b + 2 * F)); + pB += microN; + } + } + else + { + __m128 mask0 = Sse::LeftNotZero(n - 0 * F); + __m128 mask1 = Sse::LeftNotZero(n - 1 * F); + __m128 mask2 = Sse::LeftNotZero(n - 2 * F); + for (; k < K - 1; ++k) + { + const float * b = B + k * ldb; + _mm_storeu_ps(pB + 0 * F, _mm_and_ps(mask0, _mm_loadu_ps(b + 0 * F))); + _mm_storeu_ps(pB + 1 * F, _mm_and_ps(mask1, _mm_loadu_ps(b + 1 * F))); + _mm_storeu_ps(pB + 2 * F, _mm_and_ps(mask2, _mm_loadu_ps(b + 2 * F))); + pB += microN; + } + } + } + for (; k < K; ++k) + { + const float * b = B + k * ldb; + size_t c = 0; + for (; c < n; ++c) + *(pB++) = *(b++); + for (; c < microN; ++c) + *(pB++) = 0; + } + B += microN; + } + } + + void Gemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc) + { + const size_t CACHE_L1_SIZE = 32 * 1024; + const size_t CACHE_L2_SIZE = 256 * 1024; + const size_t CACHE_L3_SIZE = 2 * 1024 * 1024; + typedef Simd::GemmNN GemmNN; + GemmNN::Main kernelMM, kernelMT; + GemmNN::Tail kernelTM, kernelTT; + size_t microM, microN, L1, L2; +#ifdef SIMD_X64_ENABLE + if (K > 4024) + { + microM = 6; + microN = 8; + size_t tail = N - AlignLoAny(N, microN); + kernelMM = Kernel6x8; + kernelMT = tail > F ? Kernel6x8 : Kernel6x4; + kernelTM = KernelMx8; + kernelTT = tail > F ? KernelMx8 : KernelMx4; + } + else + { + microM = 4; + microN = 12; + size_t tail = N - AlignLoAny(N, microN); + kernelMM = Kernel4x12; + kernelMT = tail > DF ? Kernel4x12 : (tail > F ? 
Kernel4x8 : Kernel4x4); + kernelTM = KernelMx12; + kernelTT = tail > DF ? KernelMx12 : (tail > F ? KernelMx8 : KernelMx4); + } +#else + microM = 4; + microN = 4; + kernelMM = Kernel4x4; + kernelMT = Kernel4x4; + kernelTM = KernelMx4; + kernelTT = KernelMx4; +#endif + L1 = N > 4024 ? CACHE_L2_SIZE : CACHE_L1_SIZE; + L2 = N > 4024 ? CACHE_L3_SIZE : CACHE_L2_SIZE; + GemmNN gemmNN(M, N, K, microM, microN, L1, L2, CACHE_L3_SIZE, F, + kernelMM, kernelMT, kernelTM, kernelTT, ScaleC, PackB, NULL); + gemmNN.Run(alpha, A, lda, B, ldb, beta, C, ldc); + } + } +#endif// SIMD_SSE_ENABLE +} diff --git a/src/3rd/Simd/SimdSse1Resizer.cpp b/src/3rd/Simd/SimdSse1Resizer.cpp new file mode 100644 index 00000000..c93f57ad --- /dev/null +++ b/src/3rd/Simd/SimdSse1Resizer.cpp @@ -0,0 +1,118 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
*/
#include "Simd/SimdMemory.h"
#include "Simd/SimdResizer.h"
#include "Simd/SimdStore.h"

namespace Simd
{
#ifdef SIMD_SSE_ENABLE
    namespace Sse
    {
        // SSE specialization of the bilinear float resizer.
        // The base-class constructor receives sizeof(__m128) as the row-buffer
        // alignment so the aligned _mm_load_ps/_mm_store_ps below are valid.
        ResizerFloatBilinear::ResizerFloatBilinear(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, bool caffeInterp)
            : Base::ResizerFloatBilinear(srcX, srcY, dstX, dstY, channels, sizeof(__m128), caffeInterp)
        {
        }

        // Resizes one float image plane (or interleaved channels) row by row:
        // horizontally interpolates up to two source rows into bx[] buffers,
        // then blends them vertically into dst. Buffers are reused between
        // consecutive dst rows that map to the same / adjacent source rows.
        void ResizerFloatBilinear::Run(const float * src, size_t srcStride, float * dst, size_t dstStride) const
        {
            Array32f bx[2];
            bx[0].Resize(_rs);
            bx[1].Resize(_rs);
            float * pbx[2] = { bx[0].data, bx[1].data };
            int32_t prev = -2;   // -2 guarantees no reuse on the first row
            size_t rsa = AlignLo(_rs, Sse::F);
            for (size_t dy = 0; dy < _dy; dy++, dst += dstStride)
            {
                float fy1 = _ay[dy];
                float fy0 = 1.0f - fy1;
                int32_t sy = _iy[dy];
                int32_t k = 0;

                if (sy == prev)
                    k = 2;                   // both buffered rows still valid
                else if (sy == prev + 1)
                {
                    Swap(pbx[0], pbx[1]);    // reuse old second row as new first row
                    k = 1;
                }

                prev = sy;

                // (Re)compute the horizontally interpolated source rows we lack.
                for (; k < 2; k++)
                {
                    float * pb = pbx[k];
                    const float * ps = src + (sy + k)*srcStride;
                    size_t dx = 0;
                    if (_cn == 1)
                    {
                        __m128 _1 = _mm_set1_ps(1.0f);
                        for (; dx < rsa; dx += Sse::F)
                        {
                            // Gather 4 (left,right) pixel pairs, then blend
                            // lefts (0x88) and rights (0xDD) with the x weights.
                            __m128 s01 = Sse::Load(ps + _ix[dx + 0], ps + _ix[dx + 1]);
                            __m128 s23 = Sse::Load(ps + _ix[dx + 2], ps + _ix[dx + 3]);
                            __m128 fx1 = _mm_load_ps(_ax.data + dx);
                            __m128 fx0 = _mm_sub_ps(_1, fx1);
                            __m128 m0 = _mm_mul_ps(fx0, _mm_shuffle_ps(s01, s23, 0x88));
                            __m128 m1 = _mm_mul_ps(fx1, _mm_shuffle_ps(s01, s23, 0xDD));
                            _mm_store_ps(pb + dx, _mm_add_ps(m0, m1));
                        }
                    }
                    for (; dx < _rs; dx++)
                    {
                        int32_t sx = _ix[dx];
                        float fx = _ax[dx];
                        pb[dx] = ps[sx] * (1.0f - fx) + ps[sx + _cn] * fx;
                    }
                }

                // Vertical blend of the two buffered rows into the output row.
                size_t dx = 0;
                __m128 _fy0 = _mm_set1_ps(fy0);
                __m128 _fy1 = _mm_set1_ps(fy1);
                for (; dx < rsa; dx += Sse::F)
                {
                    __m128 m0 = _mm_mul_ps(_mm_load_ps(pbx[0] + dx), _fy0);
                    __m128 m1 = _mm_mul_ps(_mm_load_ps(pbx[1] + dx), _fy1);
                    _mm_storeu_ps(dst + dx, _mm_add_ps(m0, m1));
                }
                for (; dx < _rs; dx++)
                    dst[dx] = pbx[0][dx] * fy0 + pbx[1][dx] * fy1;
            }
        }

        //---------------------------------------------------------------------

        // Factory: returns the SSE resizer for float bilinear / Caffe-interp
        // requests, otherwise falls back to the scalar Base implementation.
        void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method)
        {
            if (type == SimdResizeChannelFloat && method == SimdResizeMethodBilinear)
                return new ResizerFloatBilinear(srcX, srcY, dstX, dstY, channels, false);
            else if (type == SimdResizeChannelFloat && method == SimdResizeMethodCaffeInterp)
                return new ResizerFloatBilinear(srcX, srcY, dstX, dstY, channels, true);
            else
                return Base::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method);
        }
    }
#endif //SIMD_SSE_ENABLE
}

diff --git a/src/3rd/Simd/SimdSse1Synet.cpp b/src/3rd/Simd/SimdSse1Synet.cpp
new file mode 100644
index 00000000..3a8efdc8
--- /dev/null
+++ b/src/3rd/Simd/SimdSse1Synet.cpp
@@ -0,0 +1,325 @@
/*
* Simd Library (http://ermig1979.github.io/Simd).
*
* Copyright (c) 2011-2018 Yermalayeu Ihar.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdStore.h" +#include "Simd/SimdExtract.h" + +namespace Simd +{ +#ifdef SIMD_SSE_ENABLE + namespace Sse + { + template SIMD_INLINE void SynetAddBias(const __m128 & bias, float * dst) + { + Store(dst, _mm_add_ps(Load(dst), bias)); + } + + template SIMD_INLINE void SynetAddBias(const float * bias, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t partial = AlignLo(size, F); + for (size_t i = 0; i < count; ++i) + { + size_t j = 0; + if (partial) + { + __m128 _bias = _mm_set1_ps(bias[i]); + for (; j < aligned; j += QF) + { + SynetAddBias(_bias, dst + j + F * 0); + SynetAddBias(_bias, dst + j + F * 1); + SynetAddBias(_bias, dst + j + F * 2); + SynetAddBias(_bias, dst + j + F * 3); + } + for (; j < partial; j += F) + SynetAddBias(_bias, dst + j); + } + for (; j < size; ++j) + dst[j] += bias[i]; + dst += size; + } + } + + void SynetAddBias(const float * bias, size_t count, size_t size, float * dst) + { + if (Aligned(dst) && Aligned(size)) + SynetAddBias(bias, count, size, dst); + else + SynetAddBias(bias, count, size, dst); + } + + template void SynetEltwiseLayerForwardProduct(const float * src0, const float * src1, float * dst, size_t offset) + { + Store(dst + offset, _mm_mul_ps(Load(src0 + offset), Load(src1 + offset))); + } + + template void SynetEltwiseLayerForwardProduct(float const * const * src, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t partial = AlignLo(size, F); + const float * src0 = src[0]; + const float * src1 = src[1]; + size_t j = 0; + if (partial) + { + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardProduct(src0, src1, dst, j + F * 0); + 
SynetEltwiseLayerForwardProduct(src0, src1, dst, j + F * 1); + SynetEltwiseLayerForwardProduct(src0, src1, dst, j + F * 2); + SynetEltwiseLayerForwardProduct(src0, src1, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardProduct(src0, src1, dst, j); + } + for (; j < size; ++j) + dst[j] = src0[j] * src1[j]; + for (size_t i = 2; i < count; ++i) + { + const float * srci = src[i]; + size_t j = 0; + if (partial) + { + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardProduct(dst, srci, dst, j + F * 0); + SynetEltwiseLayerForwardProduct(dst, srci, dst, j + F * 1); + SynetEltwiseLayerForwardProduct(dst, srci, dst, j + F * 2); + SynetEltwiseLayerForwardProduct(dst, srci, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardProduct(dst, srci, dst, j); + } + for (; j < size; ++j) + dst[j] *= srci[j]; + } + } + + template void SynetEltwiseLayerForwardSum(const float * src0, const __m128 & weight0, const float * src1, const __m128 & weight1, float * dst, size_t offset) + { + Store(dst + offset, _mm_add_ps(_mm_mul_ps(Load(src0 + offset), weight0), _mm_mul_ps(Load(src1 + offset), weight1))); + } + + template void SynetEltwiseLayerForwardSum(const float * src, const __m128 & weight, float * dst, size_t offset) + { + Store(dst + offset, _mm_add_ps(_mm_mul_ps(Load(src + offset), weight), Load(dst + offset))); + } + + template void SynetEltwiseLayerForwardSum(float const * const * src, const float * weight, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t partial = AlignLo(size, F); + const float * src0 = src[0]; + const float * src1 = src[1]; + __m128 weight0 = _mm_set1_ps(weight[0]); + __m128 weight1 = _mm_set1_ps(weight[1]); + size_t j = 0; + if (partial) + { + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 0); + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 1); + SynetEltwiseLayerForwardSum(src0, 
weight0, src1, weight1, dst, j + F * 2); + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j); + } + for (; j < size; ++j) + dst[j] = src0[j] * weight[0] + src1[j] * weight[1]; + for (size_t i = 2; i < count; ++i) + { + const float * srci = src[i]; + __m128 weighti = _mm_set1_ps(weight[i]); + size_t j = 0; + if (partial) + { + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 0); + SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 1); + SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 2); + SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardSum(srci, weighti, dst, j); + } + for (; j < size; ++j) + dst[j] += srci[j] * weight[i]; + } + } + + template void SynetEltwiseLayerForwardMax(const float * src0, const float * src1, float * dst, size_t offset) + { + Store(dst + offset, _mm_max_ps(Load(src0 + offset), Load(src1 + offset))); + } + + template void SynetEltwiseLayerForwardMax(float const * const * src, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t partial = AlignLo(size, F); + const float * src0 = src[0]; + const float * src1 = src[1]; + size_t j = 0; + if (partial) + { + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardMax(src0, src1, dst, j + F * 0); + SynetEltwiseLayerForwardMax(src0, src1, dst, j + F * 1); + SynetEltwiseLayerForwardMax(src0, src1, dst, j + F * 2); + SynetEltwiseLayerForwardMax(src0, src1, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardMax(src0, src1, dst, j); + } + for (; j < size; ++j) + dst[j] = Simd::Max(src0[j], src1[j]); + for (size_t i = 2; i < count; ++i) + { + const float * srci = src[i]; + size_t j = 0; + if (partial) + { + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardMax(dst, srci, dst, j + F 
* 0); + SynetEltwiseLayerForwardMax(dst, srci, dst, j + F * 1); + SynetEltwiseLayerForwardMax(dst, srci, dst, j + F * 2); + SynetEltwiseLayerForwardMax(dst, srci, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardMax(dst, srci, dst, j); + } + for (; j < size; ++j) + dst[j] = Simd::Max(dst[j], srci[j]); + } + } + + template void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst) + { + switch (type) + { + case SimdSynetEltwiseOperationProduct: + SynetEltwiseLayerForwardProduct(src, count, size, dst); + break; + case SimdSynetEltwiseOperationSum: + SynetEltwiseLayerForwardSum(src, weight, count, size, dst); + break; + case SimdSynetEltwiseOperationMax: + SynetEltwiseLayerForwardMax(src, count, size, dst); + break; + default: + assert(0); + } + } + + void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst) + { + assert(count >= 2); + bool aligned = Aligned(dst) && Aligned(src[0]) && Aligned(src[1]); + for (size_t i = 2; i < count; ++i) + aligned = aligned && Aligned(src[i]); + if (aligned) + SynetEltwiseLayerForward(src, weight, count, size, type, dst); + else + SynetEltwiseLayerForward(src, weight, count, size, type, dst); + } + + template SIMD_INLINE void SynetScaleLayerForward(const float * src, const __m128 & scale, const __m128 & bias, float * dst, size_t offset) + { + Store(dst + offset, _mm_add_ps(_mm_mul_ps(Load(src + offset), scale), bias)); + } + + template SIMD_INLINE void SynetScaleLayerForward(const float * src, const __m128 & scale, float * dst, size_t offset) + { + Store(dst + offset, _mm_mul_ps(Load(src + offset), scale)); + } + + template SIMD_INLINE void SynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t 
partial = AlignLo(size, F); + if (bias) + { + for (size_t i = 0; i < count; ++i) + { + size_t j = 0; + if (partial) + { + __m128 _scale = _mm_set1_ps(scale[i]); + __m128 _bias = _mm_set1_ps(bias[i]); + for (; j < aligned; j += QF) + { + SynetScaleLayerForward(src, _scale, _bias, dst, j + F * 0); + SynetScaleLayerForward(src, _scale, _bias, dst, j + F * 1); + SynetScaleLayerForward(src, _scale, _bias, dst, j + F * 2); + SynetScaleLayerForward(src, _scale, _bias, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetScaleLayerForward(src, _scale, _bias, dst, j); + } + for (; j < size; ++j) + dst[j] = src[j] * scale[i] + bias[i]; + src += size; + dst += size; + } + } + else + { + for (size_t i = 0; i < count; ++i) + { + size_t j = 0; + if (partial) + { + __m128 _scale = _mm_set1_ps(scale[i]); + for (; j < aligned; j += QF) + { + SynetScaleLayerForward(src, _scale, dst, j + F * 0); + SynetScaleLayerForward(src, _scale, dst, j + F * 1); + SynetScaleLayerForward(src, _scale, dst, j + F * 2); + SynetScaleLayerForward(src, _scale, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetScaleLayerForward(src, _scale, dst, j); + } + for (; j < size; ++j) + dst[j] = src[j] * scale[i]; + src += size; + dst += size; + } + } + } + + void SynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst) + { + if (Aligned(dst) && Aligned(size)) + SynetScaleLayerForward(src, scale, bias, count, size, dst); + else + SynetScaleLayerForward(src, scale, bias, count, size, dst); + } + } +#endif// SIMD_SSE_ENABLE +} diff --git a/src/3rd/Simd/SimdSse2.h b/src/3rd/Simd/SimdSse2.h index 812010e0..6801ef5d 100644 --- a/src/3rd/Simd/SimdSse2.h +++ b/src/3rd/Simd/SimdSse2.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -277,12 +277,16 @@ namespace Simd void ValueSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); void SquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); + + void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum); void CorrelationSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum); void StretchGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride); + void SynetLrnLayerCrossChannels(const float * src, size_t half, size_t count, size_t size, const float * k, float * dst); + void TextureBoostedSaturatedGradient(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t saturation, uint8_t boost, uint8_t * dx, size_t dxStride, uint8_t * dy, size_t dyStride); diff --git a/src/3rd/Simd/SimdSse2Float32.cpp b/src/3rd/Simd/SimdSse2Float32.cpp index 4418419f..6338efed 100644 --- a/src/3rd/Simd/SimdSse2Float32.cpp +++ b/src/3rd/Simd/SimdSse2Float32.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -70,7 +70,7 @@ namespace Simd SIMD_INLINE __m128 Uint8ToFloat32(const __m128i & value, const __m128 & lower, const __m128 & boost) { - return _mm_sub_ps(_mm_mul_ps(_mm_cvtepi32_ps(value), boost), lower); + return _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(value), boost), lower); } template SIMD_INLINE void Uint8ToFloat32(const uint8_t * src, const __m128 & lower, const __m128 & boost, float * dst) diff --git a/src/3rd/Simd/SimdSse2Neural.cpp b/src/3rd/Simd/SimdSse2Neural.cpp index abde119c..fe625263 100644 --- a/src/3rd/Simd/SimdSse2Neural.cpp +++ b/src/3rd/Simd/SimdSse2Neural.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -25,6 +25,7 @@ #include "Simd/SimdExtract.h" #include "Simd/SimdStore.h" #include "Simd/SimdStream.h" +#include "Simd/SimdPow.h" namespace Simd { @@ -99,84 +100,28 @@ namespace Simd NeuralConvert(src, srcStride, width, height, dst, dstStride); } - class PowEstimator + template void NeuralPow(const float * src, size_t size, const float * exponent, float * dst) { - __m128i _exponent, _mantissa; - __m128 _one; - - void Init() - { - _exponent = _mm_set1_epi32(0x7F800000); - _mantissa = _mm_set1_epi32(0x007FFFFF); - _one = _mm_set1_ps(1.0f); - } - - SIMD_INLINE __m128 Poly5(__m128 x, float a, float b, float c, float d, float e, float f) - { - __m128 p = _mm_set1_ps(f); - p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(e)); - p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(d)); - p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(c)); - p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(b)); - p = _mm_add_ps(_mm_mul_ps(x, p), 
_mm_set1_ps(a)); - return p; - } - - SIMD_INLINE __m128 Exp2(__m128 x) - { - x = _mm_max_ps(_mm_min_ps(x, _mm_set1_ps(129.00000f)), _mm_set1_ps(-126.99999f)); - __m128i ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f))); - __m128 fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart)); - __m128 expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23)); - __m128 expfpart = Poly5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f); - return _mm_mul_ps(expipart, expfpart); - } - - SIMD_INLINE __m128 Log2(__m128 x) - { - __m128i i = _mm_castps_si128(x); - __m128 e = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, _exponent), 23), _mm_set1_epi32(127))); - __m128 m = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, _mantissa)), _one); - __m128 p = Poly5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); - return _mm_add_ps(_mm_mul_ps(p, _mm_sub_ps(m, _one)), e); - } - - SIMD_INLINE __m128 Pow(__m128 basis, __m128 exponent) - { - return Exp2(_mm_mul_ps(Log2(basis), exponent)); - } - - template void Run(const float * src, size_t size, const float * exponent, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - float e = exponent[0]; - size_t alignedSize = AlignLo(size, F); - __m128 _e = _mm_set1_ps(e); - size_t i = 0; - for (; i < alignedSize; i += F) - Sse::Store(dst + i, Pow(Sse::Load(src + i), _e)); - for (; i < size; ++i) - dst[i] = Base::Pow(src[i], e); - } - - public: - void Run(const float * src, size_t size, const float * exponent, float * dst) - { - Init(); - - if (Aligned(src) && Aligned(dst)) - Run(src, size, exponent, dst); - else - Run(src, size, exponent, dst); - } - }; + if (align) + assert(Aligned(src) && Aligned(dst)); + + float e = exponent[0]; + size_t alignedSize = AlignLo(size, F); + __m128 _e = _mm_set1_ps(e); + Pow pow; + size_t i = 0; + for (; i < alignedSize; i += F) + Sse::Store(dst + i, pow(Sse::Load(src + i), 
_e)); + for (; i < size; ++i) + dst[i] = Base::Pow(src[i], e); + } void NeuralPow(const float * src, size_t size, const float * exponent, float * dst) { - PowEstimator estimator; - estimator.Run(src, size, exponent, dst); + if (Aligned(src) && Aligned(dst)) + NeuralPow(src, size, exponent, dst); + else + NeuralPow(src, size, exponent, dst); } class ExpEstimator diff --git a/src/3rd/Simd/SimdSse2ReduceGray2x2.cpp b/src/3rd/Simd/SimdSse2ReduceGray2x2.cpp index a913fa22..15b262ce 100644 --- a/src/3rd/Simd/SimdSse2ReduceGray2x2.cpp +++ b/src/3rd/Simd/SimdSse2ReduceGray2x2.cpp @@ -1,7 +1,8 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar, +* 2018-2018 Kirill Matsaberydze. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -77,9 +78,9 @@ namespace Simd { dstOffset = dstWidth - A - (evenWidth != srcWidth ? 1 : 0); srcOffset = evenWidth - DA; - Store((__m128i*)(dst + dstOffset), Average8( - Load((__m128i*)(src0 + srcOffset)), Load((__m128i*)(src0 + srcOffset + A)), - Load((__m128i*)(src1 + srcOffset)), Load((__m128i*)(src1 + srcOffset + A)))); + Store((__m128i*)(dst + dstOffset), Average8( + Load((__m128i*)(src0 + srcOffset)), Load((__m128i*)(src0 + srcOffset + A)), + Load((__m128i*)(src1 + srcOffset)), Load((__m128i*)(src1 + srcOffset + A)))); if (evenWidth != srcWidth) { dst[dstWidth - 1] = Base::Average(src0[evenWidth], src1[evenWidth]); diff --git a/src/3rd/Simd/SimdSse2Statistic.cpp b/src/3rd/Simd/SimdSse2Statistic.cpp index 4f3ba118..57ef1950 100644 --- a/src/3rd/Simd/SimdSse2Statistic.cpp +++ b/src/3rd/Simd/SimdSse2Statistic.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -521,10 +521,49 @@ namespace Simd else SquareSum(src, stride, width, height, sum); } + + template void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum) + { + assert(width >= A); + if (align) + assert(Aligned(src) && Aligned(stride)); + + size_t bodyWidth = AlignLo(width, A); + __m128i tailMask = ShiftLeft(K_INV_ZERO, A - width + bodyWidth); + __m128i fullValueSum = _mm_setzero_si128(); + __m128i fullSquareSum = _mm_setzero_si128(); + for (size_t row = 0; row < height; ++row) + { + __m128i rowSquareSum = _mm_setzero_si128(); + for (size_t col = 0; col < bodyWidth; col += A) + { + const __m128i value = Load((__m128i*)(src + col)); + fullValueSum = _mm_add_epi64(_mm_sad_epu8(value, K_ZERO), fullValueSum); + rowSquareSum = _mm_add_epi32(rowSquareSum, Square(value)); + } + if (width - bodyWidth) + { + const __m128i value = _mm_and_si128(tailMask, Load((__m128i*)(src + width - A))); + fullValueSum = _mm_add_epi64(_mm_sad_epu8(value, K_ZERO), fullValueSum); + rowSquareSum = _mm_add_epi32(rowSquareSum, Square(value)); + } + fullSquareSum = _mm_add_epi64(fullSquareSum, HorizontalSum32(rowSquareSum)); + src += stride; + } + *valueSum = ExtractInt64Sum(fullValueSum); + *squareSum = ExtractInt64Sum(fullSquareSum); + } + + void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum) + { + if (Aligned(src) && Aligned(stride)) + ValueSquareSum(src, stride, width, height, valueSum, squareSum); + else + ValueSquareSum(src, stride, width, height, valueSum, squareSum); + } SIMD_INLINE __m128i Correlation(__m128i a, __m128i b) - { - const __m128i lo = _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()), _mm_unpacklo_epi8(b, _mm_setzero_si128())); + { const __m128i lo = 
_mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()), _mm_unpacklo_epi8(b, _mm_setzero_si128())); const __m128i hi = _mm_madd_epi16(_mm_unpackhi_epi8(a, _mm_setzero_si128()), _mm_unpackhi_epi8(b, _mm_setzero_si128())); return _mm_add_epi32(lo, hi); } diff --git a/src/3rd/Simd/SimdSse2Synet.cpp b/src/3rd/Simd/SimdSse2Synet.cpp new file mode 100644 index 00000000..d0e1ae88 --- /dev/null +++ b/src/3rd/Simd/SimdSse2Synet.cpp @@ -0,0 +1,91 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdStore.h" +#include "Simd/SimdArray.h" +#include "Simd/SimdPow.h" + +namespace Simd +{ +#ifdef SIMD_SSE2_ENABLE + namespace Sse2 + { + template SIMD_INLINE void SynetLrnLayerCrossChannels(const float * src, size_t half, size_t count, size_t size, const float * k, float * dst) + { + size_t aligned = AlignLo(size, F); + Array32f sum(size, true), zero(size, true); + + for (size_t i = 0; i < half; ++i) + { + const float * pos = src + i * size; + size_t j = 0; + for (; j < aligned; j += F) + { + __m128 _pos = Sse::Load(pos + j); + Sse::Store(sum.data + j, _mm_add_ps(Sse::Load(sum.data + j), _mm_mul_ps(_pos, _pos))); + } + for (; j < size; ++j) + sum[j] += Simd::Square(pos[j]); + } + + __m128 k0 = _mm_set1_ps(k[0]); + __m128 k1 = _mm_set1_ps(k[1]); + __m128 k2 = _mm_set1_ps(k[2]); + Sse2::Pow pow; + for (size_t i = 0; i < count; ++i) + { + const float * pos = (i < count - half) ? src + half * size : zero.data; + const float * neg = (i > half) ? 
src - (half + 1) * size : zero.data; + size_t j = 0; + for (; j < aligned; j += F) + { + __m128 _pos = Sse::Load(pos + j); + __m128 _neg = Sse::Load(neg + j); + __m128 _sum = Sse::Load(sum.data + j); + _sum = _mm_add_ps(_sum, _mm_sub_ps(_mm_mul_ps(_pos, _pos), _mm_mul_ps(_neg, _neg))); + __m128 _src = Sse::Load(src + j); + Sse::Store(sum.data + j, _sum); + Sse::Store(dst + j, _mm_mul_ps(_src, pow(_mm_add_ps(k0, _mm_mul_ps(k1, _sum)), k2))); + } + for (; j < size; ++j) + { + sum[j] += Simd::Square(pos[j]); + sum[j] -= Simd::Square(neg[j]); + dst[j] = src[j] * Base::Pow(k[0] + k[1] * sum[j], k[2]); + } + src += size; + dst += size; + } + } + + void SynetLrnLayerCrossChannels(const float * src, size_t half, size_t count, size_t size, const float * k, float * dst) + { + if (Aligned(src) && Aligned(dst) && Aligned(size)) + SynetLrnLayerCrossChannels(src, half, count, size, k, dst); + else + SynetLrnLayerCrossChannels(src, half, count, size, k, dst); + } + } +#endif// SIMD_SSE2_ENABLE +} diff --git a/src/3rd/Simd/SimdSse3Neural.cpp b/src/3rd/Simd/SimdSse3Neural.cpp index 78f9c3c5..886468db 100644 --- a/src/3rd/Simd/SimdSse3Neural.cpp +++ b/src/3rd/Simd/SimdSse3Neural.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -912,7 +912,7 @@ namespace Simd bool Preferable(size_t srcDepth, size_t kernelX, size_t kernelY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, size_t dstWidth, size_t dstHeight, size_t dstDepth) { - if (kernelX == kernelY && kernelX >= 2 && kernelX <= 5 && strideX*strideY*dilationX*dilationY == 1) + if (kernelX == kernelY && kernelX >= 2 && kernelX <= 5 && strideX*strideY*dilationX*dilationY == 1 && dstWidth >= F) { if (dstWidth*dstHeight*kernelX*kernelY >= 8 * 8 * 5 * 5) return true; diff --git a/src/3rd/Simd/SimdSse41.h b/src/3rd/Simd/SimdSse41.h index c51f90b5..0fddd817 100644 --- a/src/3rd/Simd/SimdSse41.h +++ b/src/3rd/Simd/SimdSse41.h @@ -55,7 +55,7 @@ namespace Simd void HogLiteExtractFeatures(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t cell, float * features, size_t featuresStride); - void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride); + void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride); void HogLiteResizeFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight); diff --git a/src/3rd/Simd/SimdSse41Hog.cpp b/src/3rd/Simd/SimdSse41Hog.cpp index be0e5499..d7d841e5 100644 --- a/src/3rd/Simd/SimdSse41Hog.cpp +++ b/src/3rd/Simd/SimdSse41Hog.cpp @@ -434,12 +434,12 @@ namespace Simd _mm_storeu_ps(h1[0] + i, _mm_add_ps(_mm_loadu_ps(h1[0] + i), _mm_unpacklo_ps(s10, 
s11))); _mm_storeu_ps(h1[1] + i, _mm_add_ps(_mm_loadu_ps(h1[1] + i), _mm_unpackhi_ps(s10, s11))); } - __m128 s0 = _mm_add_ps(_mm_unpacklo_ps(ps[16], ps[17]), _mm_loadh_pi(_mm_loadl_pi(_mm_setzero_ps(), (__m64*)(h0[0] + 16)), (__m64*)(h0[1] + 16))); - __m128 s1 = _mm_add_ps(_mm_unpackhi_ps(ps[16], ps[17]), _mm_loadh_pi(_mm_loadl_pi(_mm_setzero_ps(), (__m64*)(h1[0] + 16)), (__m64*)(h1[1] + 16))); - _mm_storel_pi((__m64*)(h0[0] + 16), s0); - _mm_storeh_pi((__m64*)(h0[1] + 16), s0); - _mm_storel_pi((__m64*)(h1[0] + 16), s1); - _mm_storeh_pi((__m64*)(h1[1] + 16), s1); + __m128 s0 = _mm_add_ps(_mm_unpacklo_ps(ps[16], ps[17]), Sse::Load(h0[0] + 16, h0[1] + 16)); + __m128 s1 = _mm_add_ps(_mm_unpackhi_ps(ps[16], ps[17]), Sse::Load(h1[0] + 16, h1[1] + 16)); + Sse::StoreHalf<0>(h0[0] + 16, s0); + Sse::StoreHalf<1>(h0[1] + 16, s0); + Sse::StoreHalf<0>(h1[0] + 16, s1); + Sse::StoreHalf<1>(h1[1] + 16, s1); h0++; h1++; src += 4 * Q2; diff --git a/src/3rd/Simd/SimdSse41HogLite.cpp b/src/3rd/Simd/SimdSse41HogLite.cpp index aca011bc..0d249610 100644 --- a/src/3rd/Simd/SimdSse41HogLite.cpp +++ b/src/3rd/Simd/SimdSse41HogLite.cpp @@ -358,9 +358,9 @@ namespace Simd sums[3] = _mm_add_ps(sums[3], _mm_mul_ps(Load(src + 3 * step), _filter)); } - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterSize, float * dst, size_t dstStride) + template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterWidth, size_t filterHeight, float * dst, size_t dstStride) { - size_t filterStride = featureSize * filterSize; + size_t filterStride = featureSize * filterWidth; size_t alignedDstWidth = AlignLo(dstWidth, 4); for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) { @@ -370,7 +370,7 @@ namespace Simd __m128 sums[4] = { _mm_setzero_ps(), _mm_setzero_ps(), _mm_setzero_ps(), _mm_setzero_ps() }; const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; 
const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { size_t filterCol = 0; for (; filterCol < filterStride; filterCol += F) @@ -385,7 +385,7 @@ namespace Simd __m128 sum = _mm_setzero_ps(); const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { for (size_t filterCol = 0; filterCol < filterStride; filterCol += F) ProductSum1x1(pSrc + filterCol, pFilter + filterCol, sum); @@ -398,9 +398,9 @@ namespace Simd } } - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { - size_t filterStride = featureSize * filterSize; + size_t filterStride = featureSize * filterWidth; size_t alignedDstWidth = AlignLo(dstWidth, 4); __m128 _min = _mm_set1_ps(-FLT_MAX); for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) @@ -416,7 +416,7 @@ namespace Simd __m128 sums[4] = { _mm_setzero_ps(), _mm_setzero_ps(), _mm_setzero_ps(), _mm_setzero_ps() }; const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { size_t filterCol = 0; for (; filterCol < filterStride; filterCol += F) @@ -434,7 +434,7 @@ namespace Simd __m128 sum = _mm_setzero_ps(); const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; const float * pFilter = filter; - 
for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { for (size_t filterCol = 0; filterCol < filterStride; filterCol += F) ProductSum1x1(pSrc + filterCol, pFilter + filterCol, sum); @@ -451,53 +451,53 @@ namespace Simd } } - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterSize, float * dst, size_t dstStride) + template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, float * dst, size_t dstStride) { if (featureSize == 16) - Filter(src, srcStride, dstWidth, dstHeight, filter, filterSize, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, dst, dstStride); else - Filter(src, srcStride, dstWidth, dstHeight, filter, filterSize, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, dst, dstStride); } - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { if (featureSize == 16) - Filter(src, srcStride, dstWidth, dstHeight, filter, filterSize, mask, maskStride, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); else - Filter(src, srcStride, dstWidth, dstHeight, filter, filterSize, mask, maskStride, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, 
filterHeight, mask, maskStride, dst, dstStride); } public: - void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { assert(featureSize == 8 || featureSize == 16); - assert(srcWidth >= filterSize && srcHeight >= filterSize); + assert(srcWidth >= filterWidth && srcHeight >= filterHeight); - size_t dstWidth = srcWidth - filterSize + 1; - size_t dstHeight = srcHeight - filterSize + 1; + size_t dstWidth = srcWidth - filterWidth + 1; + size_t dstHeight = srcHeight - filterHeight + 1; if (mask) { if (Aligned(src) && Aligned(srcStride) && Aligned(filter)) - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); else - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); } else { if (Aligned(src) && Aligned(srcStride) && Aligned(filter)) - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, dst, dstStride); else - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, dst, dstStride); } } }; - void HogLiteFilterFeatures(const float * src, size_t 
srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { HogLiteFeatureFilter featureFilter; - featureFilter.Run(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + featureFilter.Run(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); } namespace HogLiteFeatureResizerDetail diff --git a/src/3rd/Simd/SimdStore.h b/src/3rd/Simd/SimdStore.h index 0fa43918..1d188986 100644 --- a/src/3rd/Simd/SimdStore.h +++ b/src/3rd/Simd/SimdStore.h @@ -45,6 +45,18 @@ namespace Simd { _mm_store_ps(p, a); } + + template SIMD_INLINE void StoreHalf(float * p, __m128 a); + + template <> SIMD_INLINE void StoreHalf<0>(float * p, __m128 a) + { + _mm_storel_pi((__m64*)p, a); + } + + template <> SIMD_INLINE void StoreHalf<1>(float * p, __m128 a) + { + _mm_storeh_pi((__m64*)p, a); + } } #endif//SIMD_SSE_ENABLE diff --git a/src/3rd/Simd/SimdVersion.h b/src/3rd/Simd/SimdVersion.h index 686d4e81..dc3163f8 100644 --- a/src/3rd/Simd/SimdVersion.h +++ b/src/3rd/Simd/SimdVersion.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -26,13 +26,15 @@ * File name : SimdVersion.h * Description : This file contains information about current version. * -* Do not change this file because the file is auto generated by script. 
+* Do not change this file because the file is auto generated by script: +* 'prj/cmd/GetVersion.cmd' for Microsoft Visual Studio or +* 'prj/sh/GetVersion.sh' for CMake. */ #ifndef __SimdVersion_h__ #define __SimdVersion_h__ -#define SIMD_VERSION "4.1.60.1349" +#define SIMD_VERSION "4.1.64.1404" #endif//__SimdVersion_h__ diff --git a/src/3rd/Simd/SimdView.hpp b/src/3rd/Simd/SimdView.hpp index 2abc5791..8fcdb628 100644 --- a/src/3rd/Simd/SimdView.hpp +++ b/src/3rd/Simd/SimdView.hpp @@ -1,7 +1,8 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar, +* 2018-2018 Dmitry Fedorov. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -202,6 +203,28 @@ namespace Simd operator cv::Mat() const; #endif + +#ifdef SIMD_TENSORFLOW_ENABLE + /*! + Creates an Tensorflow Tensor which references this image. + + \note You have to define SIMD_TENSORFLOW_ENABLE in order to use this functionality. + + \return an Tensorflow Tensor which references to this image. + */ + void ToTFTensor(tensorflow::Tensor & tensor, float shift = 0, float scale = 1) const; + + + /*! + Creates an Tensorflow Tensor which references this image. + + \note You have to define SIMD_TENSORFLOW_ENABLE in order to use this functionality. + + \return an Tensorflow Tensor which references to this image. + */ + void ToTFTensor(tensorflow::Tensor & tensor, int batchIndex, float shift = 0, float scale = 0) const; +#endif + /*! Gets a copy of current image view. 
@@ -626,6 +649,92 @@ namespace Simd } #endif +#ifdef SIMD_TENSORFLOW_ENABLE + template class A> SIMD_INLINE void View::ToTFTensor( tensorflow::Tensor & tensor, float shift, float scale) const + { + auto mapped = tensor.tensor(); + + if (format == View::Bgr24) + { + for (size_t row = 0; row < height; ++row) + { + const uint8_t * bgr = data + row*stride; + for (size_t col = 0; col < width; ++col, bgr += 3) + { + mapped(row, col, 0) = (bgr[0] + shift) * scale; + mapped(row, col, 1) = (bgr[1] + shift) * scale; + mapped(row, col, 2) = (bgr[2] + shift) * scale; + } + } + } else if (format == View::Bgra32) + { + + for (size_t row = 0; row < height; ++row) + { + const uint8_t * bgra = data + row*stride; + for (size_t col = 0; col < width; ++col, bgra += 4) + { + mapped(row, col, 0) = (bgra[0] + shift) * scale; + mapped(row, col, 1) = (bgra[1] + shift) * scale; + mapped(row, col, 2) = (bgra[2] + shift) * scale; + } + } + } else if (format == View::Gray8) + { + for (size_t row = 0; row < height; ++row) + { + const uint8_t * gray = data + row*stride; + for (size_t col = 0; col < width; ++col) + { + mapped(row, col, 0) = (gray[0] + shift) * scale; + } + } + } + } + + template class A> SIMD_INLINE void View::ToTFTensor( tensorflow::Tensor & tensor, int batchIndex, float shift, float scale) const + { + auto mapped = tensor.tensor(); + + if (format == View::Bgr24) + { + for (size_t row = 0; row < height; ++row) + { + const uint8_t * bgr = data + row*stride; + for (size_t col = 0; col < width; ++col, bgr += 3) + { + mapped(batchIndex, row, col, 0) = ((float)bgr[0] + shift) * scale; + mapped(batchIndex, row, col, 1) = ((float)bgr[1] + shift) * scale; + mapped(batchIndex, row, col, 2) = ((float)bgr[2] + shift) * scale; + } + } + } else if (format == View::Bgra32) + { + + for (size_t row = 0; row < height; ++row) + { + const uint8_t * bgra = data + row*stride; + for (size_t col = 0; col < width; ++col, bgra += 4) + { + mapped(batchIndex, row, col, 0) = ((float)bgra[0] + shift) * 
scale; + mapped(batchIndex, row, col, 1) = ((float)bgra[1] + shift) * scale; + mapped(batchIndex, row, col, 2) = ((float)bgra[2] + shift) * scale; + } + } + } else if (format == View::Gray8) + { + for (size_t row = 0; row < height; ++row) + { + const uint8_t * gray = data + row*stride; + for (size_t col = 0; col < width; ++col) + { + mapped(batchIndex, row, col, 0) = ((float)gray[0] + shift) * scale; + } + } + } + } +#endif + template class A> SIMD_INLINE View::View(size_t w, size_t h, ptrdiff_t s, Format f, void * d) : width(w) , height(h) @@ -1039,7 +1148,7 @@ namespace Simd if (!(format == View::Gray8 || format == View::Bgr24 || format == View::Bgra32)) return false; - std::ofstream ofs(path.c_str(), std::ifstream::binary); + std::ofstream ofs(path.c_str(), std::ofstream::binary); if (ofs.is_open()) { if (format == View::Gray8)