From fd89312853800d1e996e6556bc9f0008a0e1502d Mon Sep 17 00:00:00 2001 From: Yermalayeu Ihar Date: Sat, 28 Apr 2018 09:13:41 +0300 Subject: [PATCH] *updated Simd Library. --- src/3rd/Simd/SimdArray.h | 7 +- src/3rd/Simd/SimdAvx1.h | 14 +- src/3rd/Simd/SimdAvx1Float32.cpp | 92 ++ src/3rd/Simd/SimdAvx1Gemm32f.cpp | 627 ++++++++ src/3rd/Simd/SimdAvx1HogLite.cpp | 48 +- src/3rd/Simd/SimdAvx1Neural.cpp | 4 +- src/3rd/Simd/SimdAvx1Resizer.cpp | 135 ++ src/3rd/Simd/SimdAvx1Synet.cpp | 325 +++++ src/3rd/Simd/SimdAvx2.h | 18 +- src/3rd/Simd/SimdAvx2Float16.cpp | 61 +- src/3rd/Simd/SimdAvx2Float32.cpp | 63 +- src/3rd/Simd/SimdAvx2Gemm32f.cpp | 481 +++++++ src/3rd/Simd/SimdAvx2Hog.cpp | 12 +- src/3rd/Simd/SimdAvx2HogLite.cpp | 48 +- src/3rd/Simd/SimdAvx2Neural.cpp | 218 +-- src/3rd/Simd/SimdAvx2Resizer.cpp | 151 ++ src/3rd/Simd/SimdAvx2Statistic.cpp | 42 +- src/3rd/Simd/SimdAvx2Synet.cpp | 238 +++ src/3rd/Simd/SimdAvx512bw.h | 10 +- src/3rd/Simd/SimdAvx512bwFloat16.cpp | 63 +- src/3rd/Simd/SimdAvx512bwFloat32.cpp | 63 +- src/3rd/Simd/SimdAvx512bwHog.cpp | 12 +- src/3rd/Simd/SimdAvx512bwHogLite.cpp | 64 +- src/3rd/Simd/SimdAvx512bwReduceGray4x4.cpp | 2 +- src/3rd/Simd/SimdAvx512bwResizeBilinear.cpp | 32 +- src/3rd/Simd/SimdAvx512bwStatistic.cpp | 58 +- src/3rd/Simd/SimdAvx512f.h | 12 +- src/3rd/Simd/SimdAvx512fGemm32f.cpp | 1055 ++++++++++++++ src/3rd/Simd/SimdAvx512fNeural.cpp | 401 +++--- src/3rd/Simd/SimdAvx512fResizer.cpp | 156 ++ src/3rd/Simd/SimdAvx512fSynet.cpp | 368 +++++ src/3rd/Simd/SimdBase.h | 24 +- src/3rd/Simd/SimdBaseDetection.cpp | 10 +- src/3rd/Simd/SimdBaseFloat16.cpp | 16 +- src/3rd/Simd/SimdBaseFloat32.cpp | 18 +- src/3rd/Simd/SimdBaseGemm32f.cpp | 48 + src/3rd/Simd/SimdBaseHogLite.cpp | 28 +- src/3rd/Simd/SimdBaseNeural.cpp | 4 +- src/3rd/Simd/SimdBaseReduceGray5x5.cpp | 18 +- src/3rd/Simd/SimdBaseResizer.cpp | 257 ++++ src/3rd/Simd/SimdBaseStatistic.cpp | 24 +- src/3rd/Simd/SimdBaseSynet.cpp | 232 +++ src/3rd/Simd/SimdBaseThread.cpp | 45 + 
src/3rd/Simd/SimdBase_tinyxml2.cpp | 964 +++++++++---- src/3rd/Simd/SimdBase_tinyxml2.h | 1435 ++++++++++--------- src/3rd/Simd/SimdConst.h | 10 +- src/3rd/Simd/SimdDetection.h | 7 +- src/3rd/Simd/SimdDetection.hpp | 6 +- src/3rd/Simd/SimdEnable.h | 17 +- src/3rd/Simd/SimdGemm.h | 163 +++ src/3rd/Simd/SimdLib.cpp | 190 ++- src/3rd/Simd/SimdLib.h | 359 ++++- src/3rd/Simd/SimdLib.hpp | 62 +- src/3rd/Simd/SimdLoad.h | 16 +- src/3rd/Simd/SimdMath.h | 35 +- src/3rd/Simd/SimdMemory.h | 17 +- src/3rd/Simd/SimdNeon.h | 5 +- src/3rd/Simd/SimdNeonResizeBilinear.cpp | 171 ++- src/3rd/Simd/SimdNeonStatistic.cpp | 45 +- src/3rd/Simd/SimdNeural.hpp | 8 +- src/3rd/Simd/SimdParallel.hpp | 10 +- src/3rd/Simd/SimdPow.h | 205 +++ src/3rd/Simd/SimdResizer.h | 139 ++ src/3rd/Simd/SimdSse1.h | 12 +- src/3rd/Simd/SimdSse1Float32.cpp | 92 ++ src/3rd/Simd/SimdSse1Gemm32f.cpp | 595 ++++++++ src/3rd/Simd/SimdSse1Resizer.cpp | 118 ++ src/3rd/Simd/SimdSse1Synet.cpp | 325 +++++ src/3rd/Simd/SimdSse2.h | 6 +- src/3rd/Simd/SimdSse2Float32.cpp | 4 +- src/3rd/Simd/SimdSse2Neural.cpp | 95 +- src/3rd/Simd/SimdSse2ReduceGray2x2.cpp | 9 +- src/3rd/Simd/SimdSse2Statistic.cpp | 45 +- src/3rd/Simd/SimdSse2Synet.cpp | 91 ++ src/3rd/Simd/SimdSse3Neural.cpp | 4 +- src/3rd/Simd/SimdSse41.h | 2 +- src/3rd/Simd/SimdSse41Hog.cpp | 12 +- src/3rd/Simd/SimdSse41HogLite.cpp | 48 +- src/3rd/Simd/SimdStore.h | 12 + src/3rd/Simd/SimdVersion.h | 8 +- src/3rd/Simd/SimdView.hpp | 113 +- 81 files changed, 9526 insertions(+), 1533 deletions(-) create mode 100644 src/3rd/Simd/SimdAvx1Float32.cpp create mode 100644 src/3rd/Simd/SimdAvx1Gemm32f.cpp create mode 100644 src/3rd/Simd/SimdAvx1Resizer.cpp create mode 100644 src/3rd/Simd/SimdAvx1Synet.cpp create mode 100644 src/3rd/Simd/SimdAvx2Gemm32f.cpp create mode 100644 src/3rd/Simd/SimdAvx2Resizer.cpp create mode 100644 src/3rd/Simd/SimdAvx2Synet.cpp create mode 100644 src/3rd/Simd/SimdAvx512fGemm32f.cpp create mode 100644 src/3rd/Simd/SimdAvx512fResizer.cpp create mode 100644 
src/3rd/Simd/SimdAvx512fSynet.cpp create mode 100644 src/3rd/Simd/SimdBaseGemm32f.cpp create mode 100644 src/3rd/Simd/SimdBaseResizer.cpp create mode 100644 src/3rd/Simd/SimdBaseSynet.cpp create mode 100644 src/3rd/Simd/SimdBaseThread.cpp create mode 100644 src/3rd/Simd/SimdGemm.h create mode 100644 src/3rd/Simd/SimdPow.h create mode 100644 src/3rd/Simd/SimdResizer.h create mode 100644 src/3rd/Simd/SimdSse1Float32.cpp create mode 100644 src/3rd/Simd/SimdSse1Gemm32f.cpp create mode 100644 src/3rd/Simd/SimdSse1Resizer.cpp create mode 100644 src/3rd/Simd/SimdSse1Synet.cpp create mode 100644 src/3rd/Simd/SimdSse2Synet.cpp diff --git a/src/3rd/Simd/SimdArray.h b/src/3rd/Simd/SimdArray.h index 73476787..de2e917b 100644 --- a/src/3rd/Simd/SimdArray.h +++ b/src/3rd/Simd/SimdArray.h @@ -25,6 +25,7 @@ #define __SimdArray_h__ #include "Simd/SimdMemory.h" +#include "Simd/SimdEnable.h" namespace Simd { @@ -33,7 +34,7 @@ namespace Simd T * const data; size_t const size; - SIMD_INLINE Array(size_t size_ = 0, bool clear = false) + SIMD_INLINE Array(size_t size_ = 0, bool clear = false, size_t align = SIMD_ALIGN) : data(0) , size(0) { @@ -46,7 +47,7 @@ namespace Simd Simd::Free(data); } - SIMD_INLINE void Resize(size_t size_, bool clear = false) + SIMD_INLINE void Resize(size_t size_, bool clear = false, size_t align = SIMD_ALIGN) { if (size_ != size) { @@ -54,7 +55,7 @@ namespace Simd Simd::Free(data); *(size_t*)&size = size_; if (size_) - *(T**)&data = (T*)Simd::Allocate(size * sizeof(T)); + *(T**)&data = (T*)Simd::Allocate(size * sizeof(T), align); } if (clear) Clear(); diff --git a/src/3rd/Simd/SimdAvx1.h b/src/3rd/Simd/SimdAvx1.h index 93b96e07..33f76cd3 100644 --- a/src/3rd/Simd/SimdAvx1.h +++ b/src/3rd/Simd/SimdAvx1.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -31,7 +31,11 @@ namespace Simd #ifdef SIMD_AVX_ENABLE namespace Avx { - void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride); + void CosineDistance32f(const float * a, const float * b, size_t size, float * distance); + + void Gemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc); + + void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride); void HogLiteResizeFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight); @@ -100,6 +104,12 @@ namespace Simd void SquaredDifferenceKahanSum32f(const float * a, const float * b, size_t size, float * sum); void SvmSumLinear(const float * x, const float * svs, const float * weights, size_t length, size_t count, float * sum); + + void SynetAddBias(const float * bias, size_t count, size_t size, float * dst); + + void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst); + + void SynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst); } #endif// SIMD_AVX_ENABLE } diff --git a/src/3rd/Simd/SimdAvx1Float32.cpp b/src/3rd/Simd/SimdAvx1Float32.cpp new file mode 100644 index 00000000..78de5333 --- /dev/null +++ 
b/src/3rd/Simd/SimdAvx1Float32.cpp @@ -0,0 +1,92 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdStore.h" +#include "Simd/SimdExtract.h" + +namespace Simd +{ +#ifdef SIMD_AVX_ENABLE + namespace Avx + { + template void CosineDistance32f(const float * a, const float * b, size_t size, float * distance) + { + if (align) + assert(Aligned(a) && Aligned(b)); + + size_t partialAlignedSize = AlignLo(size, F); + size_t fullAlignedSize = AlignLo(size, DF); + size_t i = 0; + __m256 _aa[2] = { _mm256_setzero_ps(), _mm256_setzero_ps() }; + __m256 _ab[2] = { _mm256_setzero_ps(), _mm256_setzero_ps() }; + __m256 _bb[2] = { _mm256_setzero_ps(), _mm256_setzero_ps() }; + if (fullAlignedSize) + { + for (; i < fullAlignedSize; i += DF) + { + __m256 a0 = Load(a + i + 0 * F); + __m256 b0 = Load(b + i + 0 * F); + _aa[0] = _mm256_add_ps(_aa[0], _mm256_mul_ps(a0, a0)); + _ab[0] = _mm256_add_ps(_ab[0], _mm256_mul_ps(a0, b0)); + _bb[0] = _mm256_add_ps(_bb[0], _mm256_mul_ps(b0, b0)); + __m256 a1 = Load(a + i + 1 * F); + __m256 b1 = Load(b + i + 1 * F); + _aa[1] = _mm256_add_ps(_aa[1], _mm256_mul_ps(a1, a1)); + _ab[1] = _mm256_add_ps(_ab[1], _mm256_mul_ps(a1, b1)); + _bb[1] = _mm256_add_ps(_bb[1], _mm256_mul_ps(b1, b1)); + } + _aa[0] = _mm256_add_ps(_aa[0], _aa[1]); + _ab[0] = _mm256_add_ps(_ab[0], _ab[1]); + _bb[0] = _mm256_add_ps(_bb[0], _bb[1]); + } + for (; i < partialAlignedSize; i += F) + { + __m256 a0 = Load(a + i); + __m256 b0 = Load(b + i); + _aa[0] = _mm256_add_ps(_aa[0], _mm256_mul_ps(a0, a0)); + _ab[0] = _mm256_add_ps(_ab[0], _mm256_mul_ps(a0, b0)); + _bb[0] = _mm256_add_ps(_bb[0], _mm256_mul_ps(b0, b0)); + } + float aa = ExtractSum(_aa[0]), ab = ExtractSum(_ab[0]), bb = ExtractSum(_bb[0]); + for (; i < size; ++i) + { + float _a = a[i]; + float _b = b[i]; + aa += _a * _a; + ab += _a * _b; + bb += _b * _b; + } + *distance = 1.0f - ab / ::sqrt(aa*bb); + } + + void CosineDistance32f(const float * a, const float * b, size_t size, float * distance) + { + if (Aligned(a) && Aligned(b)) + CosineDistance32f(a, b, size, distance); + 
else + CosineDistance32f(a, b, size, distance); + } + } +#endif// SIMD_AVX_ENABLE +} diff --git a/src/3rd/Simd/SimdAvx1Gemm32f.cpp b/src/3rd/Simd/SimdAvx1Gemm32f.cpp new file mode 100644 index 00000000..99fa9dd1 --- /dev/null +++ b/src/3rd/Simd/SimdAvx1Gemm32f.cpp @@ -0,0 +1,627 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdStore.h" +#include "Simd/SimdGemm.h" + +namespace Simd +{ +#ifdef SIMD_AVX_ENABLE + namespace Avx + { + SIMD_INLINE void AddProduct(float * ptr, __m256 value, __m256 alpha) + { + _mm256_storeu_ps(ptr, _mm256_add_ps(_mm256_mul_ps(value, alpha), _mm256_loadu_ps(ptr))); + } + + SIMD_INLINE void AddProduct(float * ptr, __m256 value, __m256 alpha, size_t tail) + { + if (tail == F) + AddProduct(ptr, value, alpha); + else + { + float tmp[F]; + _mm256_storeu_ps(tmp, _mm256_add_ps(_mm256_mul_ps(value, alpha), _mm256_loadu_ps(ptr))); + for (size_t i = 0; i < tail; ++i) + ptr[i] = tmp[i]; + } + } + + static void Kernel4x24(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m256 c00 = _mm256_setzero_ps(); + __m256 c10 = _mm256_setzero_ps(); + __m256 c20 = _mm256_setzero_ps(); + __m256 c30 = _mm256_setzero_ps(); + __m256 c01 = _mm256_setzero_ps(); + __m256 c11 = _mm256_setzero_ps(); + __m256 c21 = _mm256_setzero_ps(); + __m256 c31 = _mm256_setzero_ps(); + __m256 c02 = _mm256_setzero_ps(); + __m256 c12 = _mm256_setzero_ps(); + __m256 c22 = _mm256_setzero_ps(); + __m256 c32 = _mm256_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + __m256 b0, b1, b2, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm256_loadu_ps(B + 0 * F); + b1 = _mm256_loadu_ps(B + 1 * F); + b2 = _mm256_loadu_ps(B + 2 * F); + a0 = _mm256_set1_ps(*A0++); + c00 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c00); + c01 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c01); + c02 = _mm256_add_ps(_mm256_mul_ps(a0, b2), c02); + a0 = _mm256_set1_ps(*A1++); + c10 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c10); + c11 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c11); + c12 = _mm256_add_ps(_mm256_mul_ps(a0, b2), c12); + a0 = _mm256_set1_ps(*A2++); + c20 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c20); + c21 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c21); + 
c22 = _mm256_add_ps(_mm256_mul_ps(a0, b2), c22); + a0 = _mm256_set1_ps(*A3++); + c30 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c30); + c31 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c31); + c32 = _mm256_add_ps(_mm256_mul_ps(a0, b2), c32); + B += ldb; + } + __m256 _alpha = _mm256_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00); + AddProduct(C + 1 * F, _alpha, c01); + AddProduct(C + 2 * F, _alpha, c02, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10); + AddProduct(C + 1 * F, _alpha, c11); + AddProduct(C + 2 * F, _alpha, c12, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20); + AddProduct(C + 1 * F, _alpha, c21); + AddProduct(C + 2 * F, _alpha, c22, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30); + AddProduct(C + 1 * F, _alpha, c31); + AddProduct(C + 2 * F, _alpha, c32, tail); + } + + static void Kernel4x16(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m256 c00 = _mm256_setzero_ps(); + __m256 c10 = _mm256_setzero_ps(); + __m256 c20 = _mm256_setzero_ps(); + __m256 c30 = _mm256_setzero_ps(); + __m256 c01 = _mm256_setzero_ps(); + __m256 c11 = _mm256_setzero_ps(); + __m256 c21 = _mm256_setzero_ps(); + __m256 c31 = _mm256_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + __m256 b0, b1, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm256_loadu_ps(B + 0 * F); + b1 = _mm256_loadu_ps(B + 1 * F); + a0 = _mm256_set1_ps(*A0++); + c00 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c00); + c01 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c01); + a0 = _mm256_set1_ps(*A1++); + c10 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c10); + c11 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c11); + a0 = _mm256_set1_ps(*A2++); + c20 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c20); + c21 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c21); + a0 = _mm256_set1_ps(*A3++); + c30 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c30); + c31 = 
_mm256_add_ps(_mm256_mul_ps(a0, b1), c31); + B += ldb; + } + __m256 _alpha = _mm256_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00); + AddProduct(C + 1 * F, _alpha, c01, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10); + AddProduct(C + 1 * F, _alpha, c11, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20); + AddProduct(C + 1 * F, _alpha, c21, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30); + AddProduct(C + 1 * F, _alpha, c31, tail); + } + + static void Kernel4x8(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m256 c0 = _mm256_setzero_ps(); + __m256 c1 = _mm256_setzero_ps(); + __m256 c2 = _mm256_setzero_ps(); + __m256 c3 = _mm256_setzero_ps(); + const float * a0 = A + lda * 0; + const float * a1 = A + lda * 1; + const float * a2 = A + lda * 2; + const float * a3 = A + lda * 3; + __m256 b0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm256_loadu_ps(B); + c0 = _mm256_add_ps(_mm256_mul_ps(b0, _mm256_set1_ps(*a0++)), c0); + c1 = _mm256_add_ps(_mm256_mul_ps(b0, _mm256_set1_ps(*a1++)), c1); + c2 = _mm256_add_ps(_mm256_mul_ps(b0, _mm256_set1_ps(*a2++)), c2); + c3 = _mm256_add_ps(_mm256_mul_ps(b0, _mm256_set1_ps(*a3++)), c3); + B += ldb; + } + __m256 _alpha = _mm256_set1_ps(alpha); + AddProduct(C + 0 * ldc, _alpha, c0, tail); + AddProduct(C + 1 * ldc, _alpha, c1, tail); + AddProduct(C + 2 * ldc, _alpha, c2, tail); + AddProduct(C + 3 * ldc, _alpha, c3, tail); + } + + static void Kernel6x16(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m256 c00 = _mm256_setzero_ps(); + __m256 c10 = _mm256_setzero_ps(); + __m256 c20 = _mm256_setzero_ps(); + __m256 c30 = _mm256_setzero_ps(); + __m256 c40 = _mm256_setzero_ps(); + __m256 c50 = _mm256_setzero_ps(); + __m256 c01 = _mm256_setzero_ps(); + __m256 c11 = _mm256_setzero_ps(); + __m256 c21 = _mm256_setzero_ps(); + __m256 c31 = _mm256_setzero_ps(); + __m256 
c41 = _mm256_setzero_ps(); + __m256 c51 = _mm256_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + const float * A4 = A + lda * 4; + const float * A5 = A + lda * 5; + __m256 b0, b1, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm256_loadu_ps(B + 0 * F); + b1 = _mm256_loadu_ps(B + 1 * F); + a0 = _mm256_set1_ps(*A0++); + c00 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c00); + c01 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c01); + a0 = _mm256_set1_ps(*A1++); + c10 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c10); + c11 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c11); + a0 = _mm256_set1_ps(*A2++); + c20 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c20); + c21 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c21); + a0 = _mm256_set1_ps(*A3++); + c30 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c30); + c31 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c31); + a0 = _mm256_set1_ps(*A4++); + c40 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c40); + c41 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c41); + a0 = _mm256_set1_ps(*A5++); + c50 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c50); + c51 = _mm256_add_ps(_mm256_mul_ps(a0, b1), c51); + B += ldb; + } + __m256 _alpha = _mm256_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00); + AddProduct(C + 1 * F, _alpha, c01, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10); + AddProduct(C + 1 * F, _alpha, c11, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20); + AddProduct(C + 1 * F, _alpha, c21, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30); + AddProduct(C + 1 * F, _alpha, c31, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c40); + AddProduct(C + 1 * F, _alpha, c41, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c50); + AddProduct(C + 1 * F, _alpha, c51, tail); + } + + static void Kernel6x8(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m256 c00 = _mm256_setzero_ps(); + __m256 c10 = 
_mm256_setzero_ps(); + __m256 c20 = _mm256_setzero_ps(); + __m256 c30 = _mm256_setzero_ps(); + __m256 c40 = _mm256_setzero_ps(); + __m256 c50 = _mm256_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + const float * A4 = A + lda * 4; + const float * A5 = A + lda * 5; + __m256 b0, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm256_loadu_ps(B + 0 * F); + a0 = _mm256_set1_ps(*A0++); + c00 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c00); + a0 = _mm256_set1_ps(*A1++); + c10 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c10); + a0 = _mm256_set1_ps(*A2++); + c20 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c20); + a0 = _mm256_set1_ps(*A3++); + c30 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c30); + a0 = _mm256_set1_ps(*A4++); + c40 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c40); + a0 = _mm256_set1_ps(*A5++); + c50 = _mm256_add_ps(_mm256_mul_ps(a0, b0), c50); + B += ldb; + } + __m256 _alpha = _mm256_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c40, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c50, tail); + } + + static void KernelMx24(size_t M, size_t N, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m256 c[4][3]; + const float * a[4]; + for (size_t i = 0; i < M; ++i) + { + c[i][0] = _mm256_setzero_ps(); + c[i][1] = _mm256_setzero_ps(); + c[i][2] = _mm256_setzero_ps(); + a[i] = A + lda * i; + } + __m256 b0, b1, b2, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm256_loadu_ps(B + 0 * F); + b1 = _mm256_loadu_ps(B + 1 * F); + b2 = _mm256_loadu_ps(B + 2 * F); + for (size_t i = 0; i < M; ++i) + { + a0 = _mm256_set1_ps(*a[i]++); + c[i][0] = _mm256_add_ps(_mm256_mul_ps(b0, a0), c[i][0]); + 
c[i][1] = _mm256_add_ps(_mm256_mul_ps(b1, a0), c[i][1]); + c[i][2] = _mm256_add_ps(_mm256_mul_ps(b2, a0), c[i][2]); + } + B += ldb; + } + __m256 _alpha = _mm256_set1_ps(alpha); + for (size_t i = 0; i < M; ++i) + { + AddProduct(C + 0 * F, _alpha, c[i][0]); + AddProduct(C + 1 * F, _alpha, c[i][1]); + AddProduct(C + 2 * F, _alpha, c[i][2], tail); + C += ldc; + } + } + + static void KernelMx16(size_t M, size_t N, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m256 c[6][2]; + const float * a[6]; + for (size_t i = 0; i < M; ++i) + { + c[i][0] = _mm256_setzero_ps(); + c[i][1] = _mm256_setzero_ps(); + a[i] = A + lda * i; + } + __m256 b0, b1, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm256_loadu_ps(B + 0 * F); + b1 = _mm256_loadu_ps(B + 1 * F); + for (size_t i = 0; i < M; ++i) + { + a0 = _mm256_set1_ps(*a[i]++); + c[i][0] = _mm256_add_ps(_mm256_mul_ps(b0, a0), c[i][0]); + c[i][1] = _mm256_add_ps(_mm256_mul_ps(b1, a0), c[i][1]); + } + B += ldb; + } + __m256 _alpha = _mm256_set1_ps(alpha); + for (size_t i = 0; i < M; ++i) + { + AddProduct(C + 0 * F, _alpha, c[i][0]); + AddProduct(C + 1 * F, _alpha, c[i][1], tail); + C += ldc; + } + } + + static void KernelMx8(size_t M, size_t N, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { +#ifdef SIMD_X64_ENABLE + __m256 c[6]; + const float * a[6]; +#else + __m256 c[4]; + const float * a[4]; +#endif + for (size_t i = 0; i < M; ++i) + { + c[i] = _mm256_setzero_ps(); + a[i] = A + lda * i; + } + __m256 b0, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm256_loadu_ps(B + 0 * F); + for (size_t i = 0; i < M; ++i) + { + a0 = _mm256_set1_ps(*a[i]++); + c[i] = _mm256_add_ps(_mm256_mul_ps(b0, a0), c[i]); + } + B += ldb; + } + __m256 _alpha = _mm256_set1_ps(alpha); + for (size_t i = 0; i < M; ++i) + AddProduct(C + i * ldc, _alpha, c[i], tail); + } + + SIMD_INLINE void ScaleC(float * C, 
__m256 beta) + { + _mm256_storeu_ps(C, _mm256_mul_ps(_mm256_loadu_ps(C), beta)); + } + + void GemmScaleC(size_t M, size_t N, float beta, float * C, size_t ldc) + { + if (beta == 1.0f) + return; + else if (beta == 0.0f) + { + for (size_t i = 0; i < M; ++i) + memset(C + i * ldc, 0, N * sizeof(float)); + } + else + { + size_t NQF = AlignLo(N, QF); + size_t NF = AlignLo(N, F); + __m256 _beta = _mm256_set1_ps(beta); + for (size_t i = 0; i < M; ++i) + { + size_t j = 0; + for (; j < NQF; j += QF) + { + ScaleC(C + j + F * 0, _beta); + ScaleC(C + j + F * 1, _beta); + ScaleC(C + j + F * 2, _beta); + ScaleC(C + j + F * 3, _beta); + } + for (; j < NF; j += F) + ScaleC(C + j, _beta); + for (; j < N; ++j) + C[j] *= beta; + C += ldc; + } + } + } + + void GemmPackB(const float * B, size_t ldb, size_t K, size_t N, size_t microN, float * pB) + { + for (size_t j = 0; j < N; j += microN) + { + size_t n = Simd::Min(microN, N - j); + size_t k = 0; + if (microN == 1 * F) + { + if (n == microN) + { + for (; k < K; ++k) + { + const float * b = B + k * ldb; + _mm256_storeu_ps(pB + 0 * F, _mm256_loadu_ps(b + 0 * F)); + pB += microN; + } + } + else + { + __m256 mask0 = Avx::LeftNotZero(n - 0 * F); + for (; k < K - 1; ++k) + { + const float * b = B + k * ldb; + _mm256_storeu_ps(pB + 0 * F, _mm256_and_ps(mask0, _mm256_loadu_ps(b + 0 * F))); + pB += microN; + } + } + } + else if (microN == 2 * F) + { + if (n == microN) + { + for (; k < K; ++k) + { + const float * b = B + k * ldb; + _mm256_storeu_ps(pB + 0 * F, _mm256_loadu_ps(b + 0 * F)); + _mm256_storeu_ps(pB + 1 * F, _mm256_loadu_ps(b + 1 * F)); + pB += microN; + } + } + else + { + __m256 mask0 = Avx::LeftNotZero(n - 0 * F); + __m256 mask1 = Avx::LeftNotZero(n - 1 * F); + for (; k < K - 1; ++k) + { + const float * b = B + k * ldb; + _mm256_storeu_ps(pB + 0 * F, _mm256_and_ps(mask0, _mm256_loadu_ps(b + 0 * F))); + _mm256_storeu_ps(pB + 1 * F, _mm256_and_ps(mask1, _mm256_loadu_ps(b + 1 * F))); + pB += microN; + } + } + } + else if (microN == 3 * 
F) + { + if (n == microN) + { + for (; k < K; ++k) + { + const float * b = B + k * ldb; + _mm256_storeu_ps(pB + 0 * F, _mm256_loadu_ps(b + 0 * F)); + _mm256_storeu_ps(pB + 1 * F, _mm256_loadu_ps(b + 1 * F)); + _mm256_storeu_ps(pB + 2 * F, _mm256_loadu_ps(b + 2 * F)); + pB += microN; + } + } + else + { + __m256 mask0 = Avx::LeftNotZero(n - 0 * F); + __m256 mask1 = Avx::LeftNotZero(n - 1 * F); + __m256 mask2 = Avx::LeftNotZero(n - 2 * F); + for (; k < K - 1; ++k) + { + const float * b = B + k * ldb; + _mm256_storeu_ps(pB + 0 * F, _mm256_and_ps(mask0, _mm256_loadu_ps(b + 0 * F))); + _mm256_storeu_ps(pB + 1 * F, _mm256_and_ps(mask1, _mm256_loadu_ps(b + 1 * F))); + _mm256_storeu_ps(pB + 2 * F, _mm256_and_ps(mask2, _mm256_loadu_ps(b + 2 * F))); + pB += microN; + } + } + } + for (; k < K; ++k) + { + const float * b = B + k * ldb; + size_t c = 0; + for (; c < n; ++c) + *(pB++) = *(b++); + for (; c < microN; ++c) + *(pB++) = 0; + } + B += microN; + } + } + + static void PackA(const float * src, size_t stride, size_t M, size_t K, size_t cell, float * dst) + { + size_t K4 = AlignLo(K, 4), K8 = AlignLo(K, 8); + for (size_t i = 0; i < M; i += cell) + { + size_t m = Simd::Min(cell, M - i), k = 0; + if (cell == 4 && m == 4) + { + for (; k < K8; k += 8) + { + const float * ps = src + k; + __m256 s0 = _mm256_loadu_ps(ps + 0 * K); + __m256 s1 = _mm256_loadu_ps(ps + 1 * K); + __m256 s2 = _mm256_loadu_ps(ps + 2 * K); + __m256 s3 = _mm256_loadu_ps(ps + 3 * K); + __m256 s00 = _mm256_unpacklo_ps(s0, s2); + __m256 s01 = _mm256_unpacklo_ps(s1, s3); + __m256 s10 = _mm256_unpackhi_ps(s0, s2); + __m256 s11 = _mm256_unpackhi_ps(s1, s3); + __m256 d0 = _mm256_unpacklo_ps(s00, s01); + __m256 d1 = _mm256_unpackhi_ps(s00, s01); + __m256 d2 = _mm256_unpacklo_ps(s10, s11); + __m256 d3 = _mm256_unpackhi_ps(s10, s11); + _mm256_storeu_ps(dst + 0, _mm256_permute2f128_ps(d0, d1, 0x20)); + _mm256_storeu_ps(dst + 8, _mm256_permute2f128_ps(d2, d3, 0x20)); + _mm256_storeu_ps(dst + 16, 
_mm256_permute2f128_ps(d0, d1, 0x31)); + _mm256_storeu_ps(dst + 24, _mm256_permute2f128_ps(d2, d3, 0x31)); + dst += 32; + }; + for (; k < K4; k += 4) + { + const float * ps = src + k; + __m128 s0 = _mm_loadu_ps(ps + 0 * stride); + __m128 s1 = _mm_loadu_ps(ps + 1 * stride); + __m128 s2 = _mm_loadu_ps(ps + 2 * stride); + __m128 s3 = _mm_loadu_ps(ps + 3 * stride); + __m128 s00 = _mm_unpacklo_ps(s0, s2); + __m128 s01 = _mm_unpacklo_ps(s1, s3); + __m128 s10 = _mm_unpackhi_ps(s0, s2); + __m128 s11 = _mm_unpackhi_ps(s1, s3); + _mm_storeu_ps(dst + 0, _mm_unpacklo_ps(s00, s01)); + _mm_storeu_ps(dst + 4, _mm_unpackhi_ps(s00, s01)); + _mm_storeu_ps(dst + 8, _mm_unpacklo_ps(s10, s11)); + _mm_storeu_ps(dst + 12, _mm_unpackhi_ps(s10, s11)); + dst += 16; + } + } + for (; k < K; ++k) + { + for (size_t c = 0; c < m; ++c) + *(dst++) = src[c*stride + k]; + } + src += cell * stride; + } + } + + void Gemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc) + { + const size_t CACHE_L1_SIZE = 32 * 1024; + const size_t CACHE_L2_SIZE = 256 * 1024; + const size_t CACHE_L3_SIZE = 2 * 1024 * 1024; + typedef Simd::GemmNN GemmNN; + GemmNN::Main kernelMM, kernelMT; + GemmNN::Tail kernelTM, kernelTT; + size_t microM, microN, L1, L2; +#ifdef SIMD_X64_ENABLE + if (K > 4024) + { + microM = 6; + microN = 16; + size_t tail = N - AlignLoAny(N, microN); + kernelMM = Kernel6x16; + kernelMT = tail > F ? Kernel6x16 : Kernel6x8; + kernelTM = KernelMx16; + kernelTT = tail > F ? KernelMx16 : KernelMx8; + } + else + { + microM = 4; + microN = 24; + size_t tail = N - AlignLoAny(N, microN); + kernelMM = Kernel4x24; + kernelMT = tail > DF ? Kernel4x24 : (tail > F ? Kernel4x16 : Kernel4x8); + kernelTM = KernelMx24; + kernelTT = tail > DF ? KernelMx24 : (tail > F ? 
KernelMx16 : KernelMx8); + } +#else + microM = 4; + microN = 8; + kernelMM = Kernel4x8; + kernelMT = Kernel4x8; + kernelTM = KernelMx8; + kernelTT = KernelMx8; +#endif + L1 = N > 4024 ? CACHE_L2_SIZE : CACHE_L1_SIZE; + L2 = N > 4024 ? CACHE_L3_SIZE : CACHE_L2_SIZE; + GemmNN gemmNN(M, N, K, microM, microN, L1, L2, CACHE_L3_SIZE, F, + kernelMM, kernelMT, kernelTM, kernelTT, Avx::GemmScaleC, Avx::GemmPackB, NULL); + gemmNN.Run(alpha, A, lda, B, ldb, beta, C, ldc); + } + } +#endif// SIMD_AVX_ENABLE +} diff --git a/src/3rd/Simd/SimdAvx1HogLite.cpp b/src/3rd/Simd/SimdAvx1HogLite.cpp index 00cbe0bf..1ce3a087 100644 --- a/src/3rd/Simd/SimdAvx1HogLite.cpp +++ b/src/3rd/Simd/SimdAvx1HogLite.cpp @@ -51,9 +51,9 @@ namespace Simd sums[3] = _mm256_add_ps(sums[3], _mm256_mul_ps(Load(src + 3 * step), _filter)); } - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterSize, float * dst, size_t dstStride) + template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterWidth, size_t filterHeight, float * dst, size_t dstStride) { - size_t filterStride = featureSize * filterSize; + size_t filterStride = featureSize * filterWidth; size_t alignedDstWidth = AlignLo(dstWidth, 4); for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) { @@ -63,7 +63,7 @@ namespace Simd __m256 sums[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { size_t filterCol = 0; for (; filterCol < filterStride; filterCol += F) @@ -78,7 +78,7 @@ namespace Simd __m256 sum = _mm256_setzero_ps(); const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; const float * pFilter = filter; - for 
(size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { for (size_t filterCol = 0; filterCol < filterStride; filterCol += F) ProductSum1x1(pSrc + filterCol, pFilter + filterCol, sum); @@ -91,9 +91,9 @@ namespace Simd } } - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { - size_t filterStride = featureSize * filterSize; + size_t filterStride = featureSize * filterWidth; size_t alignedDstWidth = AlignLo(dstWidth, 4); __m128 _min = _mm_set1_ps(-FLT_MAX); for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) @@ -109,7 +109,7 @@ namespace Simd __m256 sums[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { size_t filterCol = 0; for (; filterCol < filterStride; filterCol += F) @@ -127,7 +127,7 @@ namespace Simd __m256 sum = _mm256_setzero_ps(); const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { for (size_t filterCol = 0; filterCol < filterStride; filterCol += F) ProductSum1x1(pSrc + filterCol, pFilter + filterCol, sum); @@ -144,53 +144,53 @@ namespace Simd } } - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, 
size_t featureSize, const float * filter, size_t filterSize, float * dst, size_t dstStride) + template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, float * dst, size_t dstStride) { if (featureSize == 16) - Filter(src, srcStride, dstWidth, dstHeight, filter, filterSize, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, dst, dstStride); else - Filter(src, srcStride, dstWidth, dstHeight, filter, filterSize, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, dst, dstStride); } - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { if (featureSize == 16) - Filter(src, srcStride, dstWidth, dstHeight, filter, filterSize, mask, maskStride, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); else - Filter(src, srcStride, dstWidth, dstHeight, filter, filterSize, mask, maskStride, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); } public: - void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t 
filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { assert(featureSize == 8 || featureSize == 16); - assert(srcWidth >= filterSize && srcHeight >= filterSize); + assert(srcWidth >= filterWidth && srcHeight >= filterHeight); - size_t dstWidth = srcWidth - filterSize + 1; - size_t dstHeight = srcHeight - filterSize + 1; + size_t dstWidth = srcWidth - filterWidth + 1; + size_t dstHeight = srcHeight - filterHeight + 1; if (mask) { if (Aligned(src) && Aligned(srcStride) && Aligned(filter)) - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); else - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); } else { if (Aligned(src) && Aligned(srcStride) && Aligned(filter)) - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, dst, dstStride); else - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, dst, dstStride); } } }; - void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float 
* dst, size_t dstStride) { HogLiteFeatureFilter featureFilter; - featureFilter.Run(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + featureFilter.Run(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); } namespace HogLiteFeatureResizerDetail diff --git a/src/3rd/Simd/SimdAvx1Neural.cpp b/src/3rd/Simd/SimdAvx1Neural.cpp index dfd0ac55..054b15fa 100644 --- a/src/3rd/Simd/SimdAvx1Neural.cpp +++ b/src/3rd/Simd/SimdAvx1Neural.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -1764,7 +1764,7 @@ namespace Simd bool Preferable(size_t srcDepth, size_t kernelX, size_t kernelY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, size_t dstWidth, size_t dstHeight, size_t dstDepth) { - if (kernelX == kernelY && kernelX >= 2 && kernelX <= 5 && strideX*strideY*dilationX*dilationY == 1) + if (kernelX == kernelY && kernelX >= 2 && kernelX <= 5 && strideX*strideY*dilationX*dilationY == 1 && dstWidth >= F) { if (dstWidth*dstHeight*kernelX*kernelY >= 8 * 8 * 3 * 3) return true; diff --git a/src/3rd/Simd/SimdAvx1Resizer.cpp b/src/3rd/Simd/SimdAvx1Resizer.cpp new file mode 100644 index 00000000..eb58eccf --- /dev/null +++ b/src/3rd/Simd/SimdAvx1Resizer.cpp @@ -0,0 +1,135 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdResizer.h" +#include "Simd/SimdStore.h" + +namespace Simd +{ +#ifdef SIMD_AVX_ENABLE + namespace Avx + { + ResizerFloatBilinear::ResizerFloatBilinear(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, bool caffeInterp) + : Base::ResizerFloatBilinear(srcX, srcY, dstX, dstY, channels, sizeof(__m256), caffeInterp) + { + } + + void ResizerFloatBilinear::Run(const float * src, size_t srcStride, float * dst, size_t dstStride) const + { + Array32f bx[2]; + bx[0].Resize(_rs); + bx[1].Resize(_rs); + float * pbx[2] = { bx[0].data, bx[1].data }; + int32_t prev = -2; + size_t rsa = AlignLo(_rs, Avx::F); + size_t rsh = AlignLo(_rs, Sse::F); + for (size_t dy = 0; dy < _dy; dy++, dst += dstStride) + { + float fy1 = _ay[dy]; + float fy0 = 1.0f - fy1; + int32_t sy = _iy[dy]; + int32_t k = 0; + + if (sy == prev) + k = 2; + else if (sy == prev + 1) + { + Swap(pbx[0], pbx[1]); + k = 1; + } + + prev = sy; + + for (; k < 2; k++) + { + float * pb = pbx[k]; + const float * ps = src + (sy + k)*srcStride; + size_t dx = 0; + if (_cn == 1) + { + __m256 _1 = _mm256_set1_ps(1.0f); + for (; dx < rsa; dx += Avx::F) + { + __m256 s0145 = Avx::Load(ps + _ix[dx + 0], ps + _ix[dx + 1], ps + _ix[dx + 4], ps + _ix[dx + 5]); + __m256 s2367 = Avx::Load(ps + _ix[dx + 2], ps + _ix[dx + 3], ps + _ix[dx + 6], ps + _ix[dx + 7]); + __m256 fx1 = _mm256_load_ps(_ax.data + dx); + __m256 fx0 = _mm256_sub_ps(_1, fx1); + __m256 m0 = _mm256_mul_ps(fx0, _mm256_shuffle_ps(s0145, s2367, 0x88)); + __m256 m1 = _mm256_mul_ps(fx1, _mm256_shuffle_ps(s0145, s2367, 0xDD)); + _mm256_store_ps(pb + dx, _mm256_add_ps(m0, m1)); + } + for (; dx < rsh; dx += Sse::F) + { + __m128 s01 = Sse::Load(ps + _ix[dx + 0], ps + _ix[dx + 1]); + __m128 s23 = Sse::Load(ps + _ix[dx + 2], ps + _ix[dx + 3]); + __m128 fx1 = _mm_load_ps(_ax.data + dx); + __m128 fx0 = _mm_sub_ps(_mm256_castps256_ps128(_1), fx1); + __m128 m0 = _mm_mul_ps(fx0, _mm_shuffle_ps(s01, s23, 0x88)); + __m128 
m1 = _mm_mul_ps(fx1, _mm_shuffle_ps(s01, s23, 0xDD)); + _mm_store_ps(pb + dx, _mm_add_ps(m0, m1)); + } + } + for (; dx < _rs; dx++) + { + int32_t sx = _ix[dx]; + float fx = _ax[dx]; + pb[dx] = ps[sx] * (1.0f - fx) + ps[sx + _cn] * fx; + } + } + + size_t dx = 0; + __m256 _fy0 = _mm256_set1_ps(fy0); + __m256 _fy1 = _mm256_set1_ps(fy1); + for (; dx < rsa; dx += Avx::F) + { + __m256 m0 = _mm256_mul_ps(_mm256_load_ps(pbx[0] + dx), _fy0); + __m256 m1 = _mm256_mul_ps(_mm256_load_ps(pbx[1] + dx), _fy1); + _mm256_storeu_ps(dst + dx, _mm256_add_ps(m0, m1)); + } + for (; dx < rsh; dx += Sse::F) + { + __m128 m0 = _mm_mul_ps(_mm_load_ps(pbx[0] + dx), _mm256_castps256_ps128(_fy0)); + __m128 m1 = _mm_mul_ps(_mm_load_ps(pbx[1] + dx), _mm256_castps256_ps128(_fy1)); + _mm_storeu_ps(dst + dx, _mm_add_ps(m0, m1)); + } + for (; dx < _rs; dx++) + dst[dx] = pbx[0][dx] * fy0 + pbx[1][dx] * fy1; + } + } + + //--------------------------------------------------------------------- + + void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) + { + if (type == SimdResizeChannelFloat && method == SimdResizeMethodBilinear) + return new ResizerFloatBilinear(srcX, srcY, dstX, dstY, channels, false); + else if (type == SimdResizeChannelFloat && method == SimdResizeMethodCaffeInterp) + return new ResizerFloatBilinear(srcX, srcY, dstX, dstY, channels, true); + else + return Base::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); + } + } +#endif //SIMD_AVX_ENABLE +} + diff --git a/src/3rd/Simd/SimdAvx1Synet.cpp b/src/3rd/Simd/SimdAvx1Synet.cpp new file mode 100644 index 00000000..6f1401b6 --- /dev/null +++ b/src/3rd/Simd/SimdAvx1Synet.cpp @@ -0,0 +1,325 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdStore.h" +#include "Simd/SimdExtract.h" + +namespace Simd +{ +#ifdef SIMD_AVX_ENABLE + namespace Avx + { + template SIMD_INLINE void SynetAddBias(const __m256 & bias, float * dst) + { + Store(dst, _mm256_add_ps(Load(dst), bias)); + } + + template SIMD_INLINE void SynetAddBias(const float * bias, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t partial = AlignLo(size, F); + for (size_t i = 0; i < count; ++i) + { + size_t j = 0; + if (partial) + { + __m256 _bias = _mm256_set1_ps(bias[i]); + for (; j < aligned; j += QF) + { + SynetAddBias(_bias, dst + j + F * 0); + SynetAddBias(_bias, dst + j + F * 1); + SynetAddBias(_bias, dst + j + F * 2); + SynetAddBias(_bias, dst + j + F * 3); + } + for (; j < partial; j += F) + SynetAddBias(_bias, dst + j); + } + for (; j < size; ++j) + dst[j] += bias[i]; + dst += size; + } + } + + void SynetAddBias(const float * bias, size_t count, size_t size, float * dst) + { + if (Aligned(dst) && Aligned(size)) + SynetAddBias(bias, count, size, dst); + else + SynetAddBias(bias, count, size, dst); + } + + template void SynetEltwiseLayerForwardProduct(const float * src0, const float * src1, float * dst, size_t offset) + { + Store(dst + offset, _mm256_mul_ps(Load(src0 + offset), Load(src1 + offset))); + } + + template void SynetEltwiseLayerForwardProduct(float const * const * src, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t partial = AlignLo(size, F); + const float * src0 = src[0]; + const float * src1 = src[1]; + size_t j = 0; + if (partial) + { + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardProduct(src0, src1, dst, j + F * 0); + SynetEltwiseLayerForwardProduct(src0, src1, dst, j + F * 1); + SynetEltwiseLayerForwardProduct(src0, src1, dst, j + F * 2); + SynetEltwiseLayerForwardProduct(src0, src1, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardProduct(src0, 
src1, dst, j); + } + for (; j < size; ++j) + dst[j] = src0[j] * src1[j]; + for (size_t i = 2; i < count; ++i) + { + const float * srci = src[i]; + size_t j = 0; + if (partial) + { + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardProduct(dst, srci, dst, j + F * 0); + SynetEltwiseLayerForwardProduct(dst, srci, dst, j + F * 1); + SynetEltwiseLayerForwardProduct(dst, srci, dst, j + F * 2); + SynetEltwiseLayerForwardProduct(dst, srci, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardProduct(dst, srci, dst, j); + } + for (; j < size; ++j) + dst[j] *= srci[j]; + } + } + + template void SynetEltwiseLayerForwardSum(const float * src0, const __m256 & weight0, const float * src1, const __m256 & weight1, float * dst, size_t offset) + { + Store(dst + offset, _mm256_add_ps(_mm256_mul_ps(Load(src0 + offset), weight0), _mm256_mul_ps(Load(src1 + offset), weight1))); + } + + template void SynetEltwiseLayerForwardSum(const float * src, const __m256 & weight, float * dst, size_t offset) + { + Store(dst + offset, _mm256_add_ps(_mm256_mul_ps(Load(src + offset), weight), Load(dst + offset))); + } + + template void SynetEltwiseLayerForwardSum(float const * const * src, const float * weight, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t partial = AlignLo(size, F); + const float * src0 = src[0]; + const float * src1 = src[1]; + __m256 weight0 = _mm256_set1_ps(weight[0]); + __m256 weight1 = _mm256_set1_ps(weight[1]); + size_t j = 0; + if (partial) + { + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 0); + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 1); + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 2); + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j); + } + for (; j < 
size; ++j) + dst[j] = src0[j] * weight[0] + src1[j] * weight[1]; + for (size_t i = 2; i < count; ++i) + { + const float * srci = src[i]; + __m256 weighti = _mm256_set1_ps(weight[i]); + size_t j = 0; + if (partial) + { + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 0); + SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 1); + SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 2); + SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardSum(srci, weighti, dst, j); + } + for (; j < size; ++j) + dst[j] += srci[j] * weight[i]; + } + } + + template void SynetEltwiseLayerForwardMax(const float * src0, const float * src1, float * dst, size_t offset) + { + Store(dst + offset, _mm256_max_ps(Load(src0 + offset), Load(src1 + offset))); + } + + template void SynetEltwiseLayerForwardMax(float const * const * src, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t partial = AlignLo(size, F); + const float * src0 = src[0]; + const float * src1 = src[1]; + size_t j = 0; + if (partial) + { + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardMax(src0, src1, dst, j + F * 0); + SynetEltwiseLayerForwardMax(src0, src1, dst, j + F * 1); + SynetEltwiseLayerForwardMax(src0, src1, dst, j + F * 2); + SynetEltwiseLayerForwardMax(src0, src1, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardMax(src0, src1, dst, j); + } + for (; j < size; ++j) + dst[j] = Simd::Max(src0[j], src1[j]); + for (size_t i = 2; i < count; ++i) + { + const float * srci = src[i]; + size_t j = 0; + if (partial) + { + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardMax(dst, srci, dst, j + F * 0); + SynetEltwiseLayerForwardMax(dst, srci, dst, j + F * 1); + SynetEltwiseLayerForwardMax(dst, srci, dst, j + F * 2); + SynetEltwiseLayerForwardMax(dst, srci, dst, j + F * 3); + } + for (; j < partial; j += F) + 
SynetEltwiseLayerForwardMax(dst, srci, dst, j); + } + for (; j < size; ++j) + dst[j] = Simd::Max(dst[j], srci[j]); + } + } + + template void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst) + { + switch (type) + { + case SimdSynetEltwiseOperationProduct: + SynetEltwiseLayerForwardProduct(src, count, size, dst); + break; + case SimdSynetEltwiseOperationSum: + SynetEltwiseLayerForwardSum(src, weight, count, size, dst); + break; + case SimdSynetEltwiseOperationMax: + SynetEltwiseLayerForwardMax(src, count, size, dst); + break; + default: + assert(0); + } + } + + void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst) + { + assert(count >= 2); + bool aligned = Aligned(dst) && Aligned(src[0]) && Aligned(src[1]); + for (size_t i = 2; i < count; ++i) + aligned = aligned && Aligned(src[i]); + if (aligned) + SynetEltwiseLayerForward(src, weight, count, size, type, dst); + else + SynetEltwiseLayerForward(src, weight, count, size, type, dst); + } + + template SIMD_INLINE void SynetScaleLayerForward(const float * src, const __m256 & scale, const __m256 & bias, float * dst, size_t offset) + { + Store(dst + offset, _mm256_add_ps(_mm256_mul_ps(Load(src + offset), scale), bias)); + } + + template SIMD_INLINE void SynetScaleLayerForward(const float * src, const __m256 & scale, float * dst, size_t offset) + { + Store(dst + offset, _mm256_mul_ps(Load(src + offset), scale)); + } + + template SIMD_INLINE void SynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t partial = AlignLo(size, F); + if (bias) + { + for (size_t i = 0; i < count; ++i) + { + size_t j = 0; + if (partial) + { + __m256 _scale = _mm256_set1_ps(scale[i]); + __m256 _bias = _mm256_set1_ps(bias[i]); + for 
(; j < aligned; j += QF) + { + SynetScaleLayerForward(src, _scale, _bias, dst, j + F * 0); + SynetScaleLayerForward(src, _scale, _bias, dst, j + F * 1); + SynetScaleLayerForward(src, _scale, _bias, dst, j + F * 2); + SynetScaleLayerForward(src, _scale, _bias, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetScaleLayerForward(src, _scale, _bias, dst, j); + } + for (; j < size; ++j) + dst[j] = src[j] * scale[i] + bias[i]; + src += size; + dst += size; + } + } + else + { + for (size_t i = 0; i < count; ++i) + { + size_t j = 0; + if (partial) + { + __m256 _scale = _mm256_set1_ps(scale[i]); + for (; j < aligned; j += QF) + { + SynetScaleLayerForward(src, _scale, dst, j + F * 0); + SynetScaleLayerForward(src, _scale, dst, j + F * 1); + SynetScaleLayerForward(src, _scale, dst, j + F * 2); + SynetScaleLayerForward(src, _scale, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetScaleLayerForward(src, _scale, dst, j); + } + for (; j < size; ++j) + dst[j] = src[j] * scale[i]; + src += size; + dst += size; + } + } + } + + void SynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst) + { + if (Aligned(dst) && Aligned(size)) + SynetScaleLayerForward(src, scale, bias, count, size, dst); + else + SynetScaleLayerForward(src, scale, bias, count, size, dst); + } + } +#endif// SIMD_AVX_ENABLE +} diff --git a/src/3rd/Simd/SimdAvx2.h b/src/3rd/Simd/SimdAvx2.h index fdca8625..6b4dc9c2 100644 --- a/src/3rd/Simd/SimdAvx2.h +++ b/src/3rd/Simd/SimdAvx2.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -181,13 +181,19 @@ namespace Simd void SquaredDifferenceSum16f(const uint16_t * a, const uint16_t * b, size_t size, float * sum); + void CosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance); + void Float32ToUint8(const float * src, size_t size, const float * lower, const float * upper, uint8_t * dst); void Uint8ToFloat32(const uint8_t * src, size_t size, const float * lower, const float * upper, float * dst); + void CosineDistance32f(const float * a, const float * b, size_t size, float * distance); + void GaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride); + void Gemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc); + void GrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride); void GrayToBgra(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgra, size_t bgraStride, uint8_t alpha); @@ -212,7 +218,7 @@ namespace Simd void HogLiteExtractFeatures(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t cell, float * features, size_t featuresStride); - void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride); + void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride); void HogLiteResizeFeatures(const float * src, 
size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight); @@ -402,11 +408,19 @@ namespace Simd void SquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); + void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum); + void CorrelationSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum); void StretchGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride); + void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst); + + void SynetLrnLayerCrossChannels(const float * src, size_t half, size_t count, size_t size, const float * k, float * dst); + + void SynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst); + void TextureBoostedSaturatedGradient(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t saturation, uint8_t boost, uint8_t * dx, size_t dxStride, uint8_t * dy, size_t dyStride); diff --git a/src/3rd/Simd/SimdAvx2Float16.cpp b/src/3rd/Simd/SimdAvx2Float16.cpp index b4cb4b58..81365746 100644 --- a/src/3rd/Simd/SimdAvx2Float16.cpp +++ b/src/3rd/Simd/SimdAvx2Float16.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -151,6 +151,65 @@ namespace Simd else SquaredDifferenceSum16f(a, b, size, sum); } + + template void CosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance) + { + if (align) + assert(Aligned(a) && Aligned(b)); + + size_t partialAlignedSize = AlignLo(size, F); + size_t fullAlignedSize = AlignLo(size, DF); + size_t i = 0; + __m256 _aa[2] = { _mm256_setzero_ps(), _mm256_setzero_ps() }; + __m256 _ab[2] = { _mm256_setzero_ps(), _mm256_setzero_ps() }; + __m256 _bb[2] = { _mm256_setzero_ps(), _mm256_setzero_ps() }; + if (fullAlignedSize) + { + for (; i < fullAlignedSize; i += DF) + { + __m256 a0 = _mm256_cvtph_ps(Sse2::Load((__m128i*)(a + i) + 0)); + __m256 b0 = _mm256_cvtph_ps(Sse2::Load((__m128i*)(b + i) + 0)); + _aa[0] = _mm256_fmadd_ps(a0, a0, _aa[0]); + _ab[0] = _mm256_fmadd_ps(a0, b0, _ab[0]); + _bb[0] = _mm256_fmadd_ps(b0, b0, _bb[0]); + __m256 a1 = _mm256_cvtph_ps(Sse2::Load((__m128i*)(a + i) + 1)); + __m256 b1 = _mm256_cvtph_ps(Sse2::Load((__m128i*)(b + i) + 1)); + _aa[1] = _mm256_fmadd_ps(a1, a1, _aa[1]); + _ab[1] = _mm256_fmadd_ps(a1, b1, _ab[1]); + _bb[1] = _mm256_fmadd_ps(b1, b1, _bb[1]); + } + _aa[0] = _mm256_add_ps(_aa[0], _aa[1]); + _ab[0] = _mm256_add_ps(_ab[0], _ab[1]); + _bb[0] = _mm256_add_ps(_bb[0], _bb[1]); + } + for (; i < partialAlignedSize; i += F) + { + __m256 a0 = _mm256_cvtph_ps(Sse2::Load((__m128i*)(a + i) + 0)); + __m256 b0 = _mm256_cvtph_ps(Sse2::Load((__m128i*)(b + i) + 0)); + _aa[0] = _mm256_fmadd_ps(a0, a0, _aa[0]); + _ab[0] = _mm256_fmadd_ps(a0, b0, _ab[0]); + _bb[0] = _mm256_fmadd_ps(b0, b0, _bb[0]); + } + if (partialAlignedSize != size) + { + __m256 mask = RightNotZero(size - partialAlignedSize); + __m256 a0 = _mm256_and_ps(mask, _mm256_cvtph_ps(Sse2::Load((__m128i*)(a + size - F)))); + __m256 b0 = _mm256_and_ps(mask, 
_mm256_cvtph_ps(Sse2::Load((__m128i*)(b + size - F)))); + _aa[0] = _mm256_fmadd_ps(a0, a0, _aa[0]); + _ab[0] = _mm256_fmadd_ps(a0, b0, _ab[0]); + _bb[0] = _mm256_fmadd_ps(b0, b0, _bb[0]); + } + float aa = Avx::ExtractSum(_aa[0]), ab = Avx::ExtractSum(_ab[0]), bb = Avx::ExtractSum(_bb[0]); + *distance = 1.0f - ab / ::sqrt(aa*bb); + } + + void CosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance) + { + if (Aligned(a) && Aligned(b)) + CosineDistance16f(a, b, size, distance); + else + CosineDistance16f(a, b, size, distance); + } } #endif// SIMD_AVX2_ENABLE } diff --git a/src/3rd/Simd/SimdAvx2Float32.cpp b/src/3rd/Simd/SimdAvx2Float32.cpp index 242aba1f..948eb875 100644 --- a/src/3rd/Simd/SimdAvx2Float32.cpp +++ b/src/3rd/Simd/SimdAvx2Float32.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -23,6 +23,7 @@ */ #include "Simd/SimdMemory.h" #include "Simd/SimdStore.h" +#include "Simd/SimdExtract.h" namespace Simd { @@ -70,7 +71,7 @@ namespace Simd SIMD_INLINE __m256 Uint8ToFloat32(const __m128i & value, const __m256 & lower, const __m256 & boost) { - return _mm256_sub_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(value)), boost), lower); + return _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(value)), boost), lower); } template SIMD_INLINE void Uint8ToFloat32(const uint8_t * src, const __m256 & lower, const __m256 & boost, float * dst) @@ -103,6 +104,64 @@ namespace Simd else Uint8ToFloat32(src, size, lower, upper, dst); } + + template void CosineDistance32f(const float * a, const float * b, size_t size, float * distance) + { + if (align) + assert(Aligned(a) && Aligned(b)); + + size_t partialAlignedSize = AlignLo(size, F); + size_t 
fullAlignedSize = AlignLo(size, DF); + size_t i = 0; + __m256 _aa[2] = { _mm256_setzero_ps(), _mm256_setzero_ps() }; + __m256 _ab[2] = { _mm256_setzero_ps(), _mm256_setzero_ps() }; + __m256 _bb[2] = { _mm256_setzero_ps(), _mm256_setzero_ps() }; + if (fullAlignedSize) + { + for (; i < fullAlignedSize; i += DF) + { + __m256 a0 = Load(a + i + 0 * F); + __m256 b0 = Load(b + i + 0 * F); + _aa[0] = _mm256_fmadd_ps(a0, a0, _aa[0]); + _ab[0] = _mm256_fmadd_ps(a0, b0, _ab[0]); + _bb[0] = _mm256_fmadd_ps(b0, b0, _bb[0]); + __m256 a1 = Load(a + i + 1 * F); + __m256 b1 = Load(b + i + 1 * F); + _aa[1] = _mm256_fmadd_ps(a1, a1, _aa[1]); + _ab[1] = _mm256_fmadd_ps(a1, b1, _ab[1]); + _bb[1] = _mm256_fmadd_ps(b1, b1, _bb[1]); + } + _aa[0] = _mm256_add_ps(_aa[0], _aa[1]); + _ab[0] = _mm256_add_ps(_ab[0], _ab[1]); + _bb[0] = _mm256_add_ps(_bb[0], _bb[1]); + } + for (; i < partialAlignedSize; i += F) + { + __m256 a0 = Load(a + i); + __m256 b0 = Load(b + i); + _aa[0] = _mm256_fmadd_ps(a0, a0, _aa[0]); + _ab[0] = _mm256_fmadd_ps(a0, b0, _ab[0]); + _bb[0] = _mm256_fmadd_ps(b0, b0, _bb[0]); + } + float aa = Avx::ExtractSum(_aa[0]), ab = Avx::ExtractSum(_ab[0]), bb = Avx::ExtractSum(_bb[0]); + for (; i < size; ++i) + { + float _a = a[i]; + float _b = b[i]; + aa += _a * _a; + ab += _a * _b; + bb += _b * _b; + } + *distance = 1.0f - ab / ::sqrt(aa*bb); + } + + void CosineDistance32f(const float * a, const float * b, size_t size, float * distance) + { + if (Aligned(a) && Aligned(b)) + CosineDistance32f(a, b, size, distance); + else + CosineDistance32f(a, b, size, distance); + } } #endif// SIMD_AVX2_ENABLE } diff --git a/src/3rd/Simd/SimdAvx2Gemm32f.cpp b/src/3rd/Simd/SimdAvx2Gemm32f.cpp new file mode 100644 index 00000000..8aa7b2be --- /dev/null +++ b/src/3rd/Simd/SimdAvx2Gemm32f.cpp @@ -0,0 +1,481 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdStore.h" +#include "Simd/SimdGemm.h" + +namespace Simd +{ +#ifdef SIMD_AVX2_ENABLE + namespace Avx2 + { + SIMD_INLINE void AddProduct(float * ptr, __m256 value, __m256 alpha) + { + _mm256_storeu_ps(ptr, _mm256_fmadd_ps(value, alpha, _mm256_loadu_ps(ptr))); + } + + SIMD_INLINE void AddProduct(float * ptr, __m256 value, __m256 alpha, size_t tail) + { + if (tail == F) + AddProduct(ptr, value, alpha); + else + { + float tmp[F]; + _mm256_storeu_ps(tmp, _mm256_add_ps(_mm256_mul_ps(value, alpha), _mm256_loadu_ps(ptr))); + for (size_t i = 0; i < tail; ++i) + ptr[i] = tmp[i]; + } + } + + static void Kernel4x24(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m256 c00 = _mm256_setzero_ps(); + __m256 c10 = _mm256_setzero_ps(); + __m256 c20 = _mm256_setzero_ps(); + __m256 c30 = _mm256_setzero_ps(); + __m256 c01 = _mm256_setzero_ps(); + __m256 c11 = _mm256_setzero_ps(); + __m256 c21 = _mm256_setzero_ps(); + __m256 c31 = _mm256_setzero_ps(); + __m256 c02 = _mm256_setzero_ps(); + __m256 c12 = _mm256_setzero_ps(); + __m256 c22 = _mm256_setzero_ps(); + __m256 c32 = _mm256_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + __m256 b0, b1, b2, a0; + for (size_t k = 0; k < K; ++k) + { + _mm_prefetch((char*)B + 384, _MM_HINT_T0); + b0 = _mm256_loadu_ps(B + 0 * F); + b1 = _mm256_loadu_ps(B + 1 * F); + b2 = _mm256_loadu_ps(B + 2 * F); + a0 = _mm256_set1_ps(*A0++); + c00 = _mm256_fmadd_ps(a0, b0, c00); + c01 = _mm256_fmadd_ps(a0, b1, c01); + c02 = _mm256_fmadd_ps(a0, b2, c02); + a0 = _mm256_set1_ps(*A1++); + c10 = _mm256_fmadd_ps(a0, b0, c10); + c11 = _mm256_fmadd_ps(a0, b1, c11); + c12 = _mm256_fmadd_ps(a0, b2, c12); + a0 = _mm256_set1_ps(*A2++); + c20 = _mm256_fmadd_ps(a0, b0, c20); + c21 = _mm256_fmadd_ps(a0, b1, c21); + c22 = _mm256_fmadd_ps(a0, b2, c22); + a0 = _mm256_set1_ps(*A3++); + 
c30 = _mm256_fmadd_ps(a0, b0, c30); + c31 = _mm256_fmadd_ps(a0, b1, c31); + c32 = _mm256_fmadd_ps(a0, b2, c32); + B += ldb; + } + __m256 _alpha = _mm256_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00); + AddProduct(C + 1 * F, _alpha, c01); + AddProduct(C + 2 * F, _alpha, c02, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10); + AddProduct(C + 1 * F, _alpha, c11); + AddProduct(C + 2 * F, _alpha, c12, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20); + AddProduct(C + 1 * F, _alpha, c21); + AddProduct(C + 2 * F, _alpha, c22, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30); + AddProduct(C + 1 * F, _alpha, c31); + AddProduct(C + 2 * F, _alpha, c32, tail); + } + + static void Kernel4x16(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m256 c00 = _mm256_setzero_ps(); + __m256 c10 = _mm256_setzero_ps(); + __m256 c20 = _mm256_setzero_ps(); + __m256 c30 = _mm256_setzero_ps(); + __m256 c01 = _mm256_setzero_ps(); + __m256 c11 = _mm256_setzero_ps(); + __m256 c21 = _mm256_setzero_ps(); + __m256 c31 = _mm256_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + __m256 b0, b1, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm256_loadu_ps(B + 0 * F); + b1 = _mm256_loadu_ps(B + 1 * F); + a0 = _mm256_set1_ps(*A0++); + c00 = _mm256_fmadd_ps(a0, b0, c00); + c01 = _mm256_fmadd_ps(a0, b1, c01); + a0 = _mm256_set1_ps(*A1++); + c10 = _mm256_fmadd_ps(a0, b0, c10); + c11 = _mm256_fmadd_ps(a0, b1, c11); + a0 = _mm256_set1_ps(*A2++); + c20 = _mm256_fmadd_ps(a0, b0, c20); + c21 = _mm256_fmadd_ps(a0, b1, c21); + a0 = _mm256_set1_ps(*A3++); + c30 = _mm256_fmadd_ps(a0, b0, c30); + c31 = _mm256_fmadd_ps(a0, b1, c31); + B += ldb; + } + __m256 _alpha = _mm256_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00); + AddProduct(C + 1 * F, _alpha, c01, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, 
c10); + AddProduct(C + 1 * F, _alpha, c11, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20); + AddProduct(C + 1 * F, _alpha, c21, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30); + AddProduct(C + 1 * F, _alpha, c31, tail); + } + + static void Kernel4x8(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m256 c0 = _mm256_setzero_ps(); + __m256 c1 = _mm256_setzero_ps(); + __m256 c2 = _mm256_setzero_ps(); + __m256 c3 = _mm256_setzero_ps(); + const float * a0 = A + lda * 0; + const float * a1 = A + lda * 1; + const float * a2 = A + lda * 2; + const float * a3 = A + lda * 3; + __m256 b0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm256_loadu_ps(B); + c0 = _mm256_fmadd_ps(b0, _mm256_set1_ps(*a0++), c0); + c1 = _mm256_fmadd_ps(b0, _mm256_set1_ps(*a1++), c1); + c2 = _mm256_fmadd_ps(b0, _mm256_set1_ps(*a2++), c2); + c3 = _mm256_fmadd_ps(b0, _mm256_set1_ps(*a3++), c3); + B += ldb; + } + __m256 _alpha = _mm256_set1_ps(alpha); + AddProduct(C + 0 * ldc, _alpha, c0, tail); + AddProduct(C + 1 * ldc, _alpha, c1, tail); + AddProduct(C + 2 * ldc, _alpha, c2, tail); + AddProduct(C + 3 * ldc, _alpha, c3, tail); + } + + static void Kernel6x16(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m256 c00 = _mm256_setzero_ps(); + __m256 c10 = _mm256_setzero_ps(); + __m256 c20 = _mm256_setzero_ps(); + __m256 c30 = _mm256_setzero_ps(); + __m256 c40 = _mm256_setzero_ps(); + __m256 c50 = _mm256_setzero_ps(); + __m256 c01 = _mm256_setzero_ps(); + __m256 c11 = _mm256_setzero_ps(); + __m256 c21 = _mm256_setzero_ps(); + __m256 c31 = _mm256_setzero_ps(); + __m256 c41 = _mm256_setzero_ps(); + __m256 c51 = _mm256_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + const float * A4 = A + lda * 4; + const float * A5 = A + lda * 5; + __m256 b0, 
b1, a0, a1; + for (size_t k = 0; k < K; k++) + { + _mm_prefetch((char*)B + 512, _MM_HINT_T0); + b0 = _mm256_loadu_ps(B + 0 * F); + b1 = _mm256_loadu_ps(B + 1 * F); + a0 = _mm256_set1_ps(*A0++); + a1 = _mm256_set1_ps(*A1++); + c00 = _mm256_fmadd_ps(a0, b0, c00); + c01 = _mm256_fmadd_ps(a0, b1, c01); + c10 = _mm256_fmadd_ps(a1, b0, c10); + c11 = _mm256_fmadd_ps(a1, b1, c11); + a0 = _mm256_set1_ps(*A2++); + a1 = _mm256_set1_ps(*A3++); + c20 = _mm256_fmadd_ps(a0, b0, c20); + c21 = _mm256_fmadd_ps(a0, b1, c21); + c30 = _mm256_fmadd_ps(a1, b0, c30); + c31 = _mm256_fmadd_ps(a1, b1, c31); + a0 = _mm256_set1_ps(*A4++); + a1 = _mm256_set1_ps(*A5++); + c40 = _mm256_fmadd_ps(a0, b0, c40); + c41 = _mm256_fmadd_ps(a0, b1, c41); + c50 = _mm256_fmadd_ps(a1, b0, c50); + c51 = _mm256_fmadd_ps(a1, b1, c51); + B += ldb; + } + __m256 _alpha = _mm256_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00); + AddProduct(C + 1 * F, _alpha, c01, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10); + AddProduct(C + 1 * F, _alpha, c11, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20); + AddProduct(C + 1 * F, _alpha, c21, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30); + AddProduct(C + 1 * F, _alpha, c31, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c40); + AddProduct(C + 1 * F, _alpha, c41, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c50); + AddProduct(C + 1 * F, _alpha, c51, tail); + } + + static void Kernel6x8(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m256 c0 = _mm256_setzero_ps(); + __m256 c1 = _mm256_setzero_ps(); + __m256 c2 = _mm256_setzero_ps(); + __m256 c3 = _mm256_setzero_ps(); + __m256 c4 = _mm256_setzero_ps(); + __m256 c5 = _mm256_setzero_ps(); + const float * a0 = A + lda * 0; + const float * a1 = A + lda * 1; + const float * a2 = A + lda * 2; + const float * a3 = A + lda * 3; + const float * a4 = A + lda * 4; + const float * a5 = A + lda * 5; + __m256 b0; + for 
(size_t k = 0; k < K; k++) + { + b0 = _mm256_loadu_ps(B); + c0 = _mm256_fmadd_ps(b0, _mm256_set1_ps(*a0++), c0); + c1 = _mm256_fmadd_ps(b0, _mm256_set1_ps(*a1++), c1); + c2 = _mm256_fmadd_ps(b0, _mm256_set1_ps(*a2++), c2); + c3 = _mm256_fmadd_ps(b0, _mm256_set1_ps(*a3++), c3); + c4 = _mm256_fmadd_ps(b0, _mm256_set1_ps(*a4++), c4); + c5 = _mm256_fmadd_ps(b0, _mm256_set1_ps(*a5++), c5); + B += ldb; + } + __m256 _alpha = _mm256_set1_ps(alpha); + AddProduct(C + 0 * ldc, _alpha, c0, tail); + AddProduct(C + 1 * ldc, _alpha, c1, tail); + AddProduct(C + 2 * ldc, _alpha, c2, tail); + AddProduct(C + 3 * ldc, _alpha, c3, tail); + AddProduct(C + 4 * ldc, _alpha, c4, tail); + AddProduct(C + 5 * ldc, _alpha, c5, tail); + } + + static void KernelMx24(size_t M, size_t N, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m256 c[4][3]; + const float * a[4]; + for (size_t i = 0; i < M; ++i) + { + c[i][0] = _mm256_setzero_ps(); + c[i][1] = _mm256_setzero_ps(); + c[i][2] = _mm256_setzero_ps(); + a[i] = A + lda * i; + } + __m256 b0, b1, b2, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm256_loadu_ps(B + 0 * F); + b1 = _mm256_loadu_ps(B + 1 * F); + b2 = _mm256_loadu_ps(B + 2 * F); + for (size_t i = 0; i < M; ++i) + { + a0 = _mm256_set1_ps(*a[i]++); + c[i][0] = _mm256_add_ps(_mm256_mul_ps(b0, a0), c[i][0]); + c[i][1] = _mm256_add_ps(_mm256_mul_ps(b1, a0), c[i][1]); + c[i][2] = _mm256_add_ps(_mm256_mul_ps(b2, a0), c[i][2]); + } + B += ldb; + } + __m256 _alpha = _mm256_set1_ps(alpha); + for (size_t i = 0; i < M; ++i) + { + AddProduct(C + 0 * F, _alpha, c[i][0]); + AddProduct(C + 1 * F, _alpha, c[i][1]); + AddProduct(C + 2 * F, _alpha, c[i][2], tail); + C += ldc; + } + } + + static void KernelMx16(size_t M, size_t N, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m256 c[6][2]; + const float * a[6]; + for (size_t i = 0; i < M; ++i) + { + 
c[i][0] = _mm256_setzero_ps(); + c[i][1] = _mm256_setzero_ps(); + a[i] = A + lda * i; + } + __m256 b0, b1, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm256_loadu_ps(B + 0 * F); + b1 = _mm256_loadu_ps(B + 1 * F); + for (size_t i = 0; i < M; ++i) + { + a0 = _mm256_set1_ps(*a[i]++); + c[i][0] = _mm256_fmadd_ps(b0, a0, c[i][0]); + c[i][1] = _mm256_fmadd_ps(b1, a0, c[i][1]); + } + B += ldb; + } + __m256 _alpha = _mm256_set1_ps(alpha); + for (size_t i = 0; i < M; ++i) + { + AddProduct(C + 0 * F, _alpha, c[i][0]); + AddProduct(C + 1 * F, _alpha, c[i][1], tail); + C += ldc; + } + } + + static void KernelMx8(size_t M, size_t N, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m256 c[4]; + const float * a[4]; + for (size_t i = 0; i < M; ++i) + { + c[i] = _mm256_setzero_ps(); + a[i] = A + lda * i; + } + __m256 b0, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm256_loadu_ps(B + 0 * F); + for (size_t i = 0; i < M; ++i) + { + a0 = _mm256_set1_ps(*a[i]++); + c[i] = _mm256_fmadd_ps(b0, a0, c[i]); + } + B += ldb; + } + __m256 _alpha = _mm256_set1_ps(alpha); + for (size_t i = 0; i < M; ++i) + AddProduct(C + i * ldc, _alpha, c[i], tail); + } + + static void PackA(const float * src, size_t stride, size_t M, size_t K, size_t cell, float * dst) + { + size_t K4 = AlignLo(K, 4), K8 = AlignLo(K, 8); + for (size_t i = 0; i < M; i += cell) + { + size_t m = Simd::Min(cell, M - i), k = 0; + if (cell == 4 && m == 4) + { + for (; k < K8; k += 8) + { + const float * ps = src + k; + __m256 s0 = _mm256_loadu_ps(ps + 0 * K); + __m256 s1 = _mm256_loadu_ps(ps + 1 * K); + __m256 s2 = _mm256_loadu_ps(ps + 2 * K); + __m256 s3 = _mm256_loadu_ps(ps + 3 * K); + __m256 s00 = _mm256_unpacklo_ps(s0, s2); + __m256 s01 = _mm256_unpacklo_ps(s1, s3); + __m256 s10 = _mm256_unpackhi_ps(s0, s2); + __m256 s11 = _mm256_unpackhi_ps(s1, s3); + __m256 d0 = _mm256_unpacklo_ps(s00, s01); + __m256 d1 = _mm256_unpackhi_ps(s00, s01); + __m256 
d2 = _mm256_unpacklo_ps(s10, s11); + __m256 d3 = _mm256_unpackhi_ps(s10, s11); + _mm256_storeu_ps(dst + 0, _mm256_permute2f128_ps(d0, d1, 0x20)); + _mm256_storeu_ps(dst + 8, _mm256_permute2f128_ps(d2, d3, 0x20)); + _mm256_storeu_ps(dst + 16, _mm256_permute2f128_ps(d0, d1, 0x31)); + _mm256_storeu_ps(dst + 24, _mm256_permute2f128_ps(d2, d3, 0x31)); + dst += 32; + }; + for (; k < K4; k += 4) + { + const float * ps = src + k; + __m128 s0 = _mm_loadu_ps(ps + 0 * stride); + __m128 s1 = _mm_loadu_ps(ps + 1 * stride); + __m128 s2 = _mm_loadu_ps(ps + 2 * stride); + __m128 s3 = _mm_loadu_ps(ps + 3 * stride); + __m128 s00 = _mm_unpacklo_ps(s0, s2); + __m128 s01 = _mm_unpacklo_ps(s1, s3); + __m128 s10 = _mm_unpackhi_ps(s0, s2); + __m128 s11 = _mm_unpackhi_ps(s1, s3); + _mm_storeu_ps(dst + 0, _mm_unpacklo_ps(s00, s01)); + _mm_storeu_ps(dst + 4, _mm_unpackhi_ps(s00, s01)); + _mm_storeu_ps(dst + 8, _mm_unpacklo_ps(s10, s11)); + _mm_storeu_ps(dst + 12, _mm_unpackhi_ps(s10, s11)); + dst += 16; + } + } + for (; k < K; ++k) + { + for (size_t c = 0; c < m; ++c) + *(dst++) = src[c*stride + k]; + } + src += cell * stride; + } + } + + void Gemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc) + { + const size_t CACHE_L1_SIZE = 32 * 1024; + const size_t CACHE_L2_SIZE = 256 * 1024; + const size_t CACHE_L3_SIZE = 2 * 1024 * 1024; + typedef Simd::GemmNN GemmNN; + GemmNN::Main kernelMM, kernelMT; + GemmNN::Tail kernelTM, kernelTT; + size_t microM, microN, L1, L2; +#ifdef SIMD_X64_ENABLE + if (K > 4096) + { + microM = 6; + microN = 16; + size_t tail = N - AlignLoAny(N, microN); + kernelMM = Kernel6x16; + kernelMT = tail > F ? Kernel6x16 : Kernel6x8; + kernelTM = KernelMx16; + kernelTT = tail > F ? KernelMx16 : KernelMx8; + } + else + { + microM = 4; + microN = 24; + size_t tail = N - AlignLoAny(N, microN); + kernelMM = Kernel4x24; + kernelMT = tail > DF ? Kernel4x24 : (tail > F ? 
Kernel4x16 : Kernel4x8); + kernelTM = KernelMx24; + kernelTT = tail > DF ? KernelMx24 : (tail > F ? KernelMx16 : KernelMx8); + } +#else + microM = 4; + microN = 8; + kernelMM = Kernel4x8; + kernelMT = Kernel4x8; + kernelTM = KernelMx8; + kernelTT = KernelMx8; +#endif + L1 = N > 4024 ? CACHE_L2_SIZE : CACHE_L1_SIZE; + L2 = N > 4024 ? CACHE_L3_SIZE : CACHE_L2_SIZE; + GemmNN gemmNN(M, N, K, microM, microN, L1, L2, CACHE_L3_SIZE, F, + kernelMM, kernelMT, kernelTM, kernelTT, Avx::GemmScaleC, Avx::GemmPackB, NULL); + gemmNN.Run(alpha, A, lda, B, ldb, beta, C, ldc); + } + } +#endif// SIMD_AVX2_ENABLE +} diff --git a/src/3rd/Simd/SimdAvx2Hog.cpp b/src/3rd/Simd/SimdAvx2Hog.cpp index b477826e..e8d22bee 100644 --- a/src/3rd/Simd/SimdAvx2Hog.cpp +++ b/src/3rd/Simd/SimdAvx2Hog.cpp @@ -542,12 +542,12 @@ namespace Simd Avx::Store(h1[1] + i, _mm256_add_ps(Avx::Load(h1[1] + i), _mm256_unpackhi_ps(b1, b3))); } __m128 * ps = (__m128*)src; - __m128 s0 = _mm_add_ps(_mm_unpacklo_ps(ps[16], ps[17]), _mm_loadh_pi(_mm_loadl_pi(_mm_setzero_ps(), (__m64*)(h0[0] + 16)), (__m64*)(h0[1] + 16))); - __m128 s1 = _mm_add_ps(_mm_unpackhi_ps(ps[16], ps[17]), _mm_loadh_pi(_mm_loadl_pi(_mm_setzero_ps(), (__m64*)(h1[0] + 16)), (__m64*)(h1[1] + 16))); - _mm_storel_pi((__m64*)(h0[0] + 16), s0); - _mm_storeh_pi((__m64*)(h0[1] + 16), s0); - _mm_storel_pi((__m64*)(h1[0] + 16), s1); - _mm_storeh_pi((__m64*)(h1[1] + 16), s1); + __m128 s0 = _mm_add_ps(_mm_unpacklo_ps(ps[16], ps[17]), Sse::Load(h0[0] + 16, h0[1] + 16)); + __m128 s1 = _mm_add_ps(_mm_unpackhi_ps(ps[16], ps[17]), Sse::Load(h1[0] + 16, h1[1] + 16)); + Sse::StoreHalf<0>(h0[0] + 16, s0); + Sse::StoreHalf<1>(h0[1] + 16, s0); + Sse::StoreHalf<0>(h1[0] + 16, s1); + Sse::StoreHalf<1>(h1[1] + 16, s1); h0++; h1++; src += 72; diff --git a/src/3rd/Simd/SimdAvx2HogLite.cpp b/src/3rd/Simd/SimdAvx2HogLite.cpp index 6ded259c..6951cd8d 100644 --- a/src/3rd/Simd/SimdAvx2HogLite.cpp +++ b/src/3rd/Simd/SimdAvx2HogLite.cpp @@ -507,9 +507,9 @@ namespace Simd sums[3] = 
_mm256_fmadd_ps(Avx::Load(src + 3 * step), _filter, sums[3]); } - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterSize, float * dst, size_t dstStride) + template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterWidth, size_t filterHeight, float * dst, size_t dstStride) { - size_t filterStride = featureSize * filterSize; + size_t filterStride = featureSize * filterWidth; size_t alignedDstWidth = AlignLo(dstWidth, 4); size_t alignedFilterStride = AlignLo(filterStride, QF); for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) @@ -520,7 +520,7 @@ namespace Simd __m256 sums[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { size_t filterCol = 0; for (; filterCol < alignedFilterStride; filterCol += QF) @@ -537,7 +537,7 @@ namespace Simd __m256 sum = _mm256_setzero_ps(); const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { for (size_t filterCol = 0; filterCol < filterStride; filterCol += F) ProductSum1x1(pSrc + filterCol, pFilter + filterCol, sum); @@ -550,9 +550,9 @@ namespace Simd } } - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * 
mask, size_t maskStride, float * dst, size_t dstStride) { - size_t filterStride = featureSize * filterSize; + size_t filterStride = featureSize * filterWidth; size_t alignedDstWidth = AlignLo(dstWidth, 4); size_t alignedFilterStride = AlignLo(filterStride, QF); __m128 _min = _mm_set1_ps(-FLT_MAX); @@ -569,7 +569,7 @@ namespace Simd __m256 sums[4] = { _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps(), _mm256_setzero_ps() }; const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { size_t filterCol = 0; for (; filterCol < alignedFilterStride; filterCol += QF) @@ -589,7 +589,7 @@ namespace Simd __m256 sum = _mm256_setzero_ps(); const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { for (size_t filterCol = 0; filterCol < filterStride; filterCol += F) ProductSum1x1(pSrc + filterCol, pFilter + filterCol, sum); @@ -606,53 +606,53 @@ namespace Simd } } - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterSize, float * dst, size_t dstStride) + template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, float * dst, size_t dstStride) { if (featureSize == 16) - Filter(src, srcStride, dstWidth, dstHeight, filter, filterSize, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, dst, dstStride); else - Filter(src, srcStride, dstWidth, dstHeight, filter, filterSize, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, 
filterHeight, dst, dstStride); } - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { if (featureSize == 16) - Filter(src, srcStride, dstWidth, dstHeight, filter, filterSize, mask, maskStride, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); else - Filter(src, srcStride, dstWidth, dstHeight, filter, filterSize, mask, maskStride, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); } public: - void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { assert(featureSize == 8 || featureSize == 16); - assert(srcWidth >= filterSize && srcHeight >= filterSize); + assert(srcWidth >= filterWidth && srcHeight >= filterHeight); - size_t dstWidth = srcWidth - filterSize + 1; - size_t dstHeight = srcHeight - filterSize + 1; + size_t dstWidth = srcWidth - filterWidth + 1; + size_t dstHeight = srcHeight - filterHeight + 1; if (mask) { if (Aligned(src) && Aligned(srcStride) && Aligned(filter)) - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + 
Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); else - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); } else { if (Aligned(src) && Aligned(srcStride) && Aligned(filter)) - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, dst, dstStride); else - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, dst, dstStride); } } }; - void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { HogLiteFeatureFilter featureFilter; - featureFilter.Run(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + featureFilter.Run(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); } namespace HogLiteFeatureResizerDetail diff --git a/src/3rd/Simd/SimdAvx2Neural.cpp b/src/3rd/Simd/SimdAvx2Neural.cpp index e0dcdfa6..709617ce 100644 --- a/src/3rd/Simd/SimdAvx2Neural.cpp +++ b/src/3rd/Simd/SimdAvx2Neural.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). 
* -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -27,6 +27,7 @@ #include "Simd/SimdStream.h" #include "Simd/SimdBase.h" #include "Simd/SimdNeural.h" +#include "Simd/SimdPow.h" namespace Simd { @@ -332,84 +333,28 @@ namespace Simd NeuralRoughSigmoid2(src, size, slope, dst); } - class PowEstimator + template void NeuralPow(const float * src, size_t size, const float * exponent, float * dst) { - __m256i _exponent, _mantissa; - __m256 _one; - - void Init() - { - _exponent = _mm256_set1_epi32(0x7F800000); - _mantissa = _mm256_set1_epi32(0x007FFFFF); - _one = _mm256_set1_ps(1.0f); - } - - SIMD_INLINE __m256 Poly5(__m256 x, float a, float b, float c, float d, float e, float f) - { - __m256 p = _mm256_set1_ps(f); - p = _mm256_fmadd_ps(x, p, _mm256_set1_ps(e)); - p = _mm256_fmadd_ps(x, p, _mm256_set1_ps(d)); - p = _mm256_fmadd_ps(x, p, _mm256_set1_ps(c)); - p = _mm256_fmadd_ps(x, p, _mm256_set1_ps(b)); - p = _mm256_fmadd_ps(x, p, _mm256_set1_ps(a)); - return p; - } - - SIMD_INLINE __m256 Exp2(__m256 x) - { - x = _mm256_max_ps(_mm256_min_ps(x, _mm256_set1_ps(129.00000f)), _mm256_set1_ps(-126.99999f)); - __m256i ipart = _mm256_cvtps_epi32(_mm256_sub_ps(x, _mm256_set1_ps(0.5f))); - __m256 fpart = _mm256_sub_ps(x, _mm256_cvtepi32_ps(ipart)); - __m256 expipart = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_add_epi32(ipart, _mm256_set1_epi32(127)), 23)); - __m256 expfpart = Poly5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f); - return _mm256_mul_ps(expipart, expfpart); - } - - SIMD_INLINE __m256 Log2(__m256 x) - { - __m256i i = _mm256_castps_si256(x); - __m256 e = _mm256_cvtepi32_ps(_mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(i, _exponent), 23), _mm256_set1_epi32(127))); - __m256 m = 
_mm256_or_ps(_mm256_castsi256_ps(_mm256_and_si256(i, _mantissa)), _one); - __m256 p = Poly5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); - return _mm256_fmadd_ps(p, _mm256_sub_ps(m, _one), e); - } - - SIMD_INLINE __m256 Pow(__m256 basis, __m256 exponent) - { - return Exp2(_mm256_mul_ps(Log2(basis), exponent)); - } - - template void Run(const float * src, size_t size, const float * exponent, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - float e = exponent[0]; - size_t alignedSize = AlignLo(size, F); - __m256 _e = _mm256_set1_ps(e); - size_t i = 0; - for (; i < alignedSize; i += F) - Avx::Store(dst + i, Pow(Avx::Load(src + i), _e)); - for (; i < size; ++i) - dst[i] = Base::Pow(src[i], e); - } - - public: - void Run(const float * src, size_t size, const float * exponent, float * dst) - { - Init(); + if (align) + assert(Aligned(src) && Aligned(dst)); - if (Aligned(src) && Aligned(dst)) - Run(src, size, exponent, dst); - else - Run(src, size, exponent, dst); - } - }; + float e = exponent[0]; + size_t alignedSize = AlignLo(size, F); + __m256 _e = _mm256_set1_ps(e); + Pow pow; + size_t i = 0; + for (; i < alignedSize; i += F) + Avx::Store(dst + i, pow(Avx::Load(src + i), _e)); + for (; i < size; ++i) + dst[i] = Base::Pow(src[i], e); + } void NeuralPow(const float * src, size_t size, const float * exponent, float * dst) { - PowEstimator estimator; - estimator.Run(src, size, exponent, dst); + if (Aligned(src) && Aligned(dst)) + NeuralPow(src, size, exponent, dst); + else + NeuralPow(src, size, exponent, dst); } template void NeuralAddConvolutionForward(const float * src, size_t srcStride, size_t width, size_t height, const float * weights, float * dst, size_t dstStride) @@ -1542,7 +1487,7 @@ namespace Simd void Kernel4x24(size_t N, size_t K, const float * a, const float * b, float * c) { - register __m256 _a, b0, b1, b2, c00, c01, c02, c10, c11, c12, c20, c21, c22, c30, c31, c32; + __m256 _a, b0, b1, b2, 
c00, c01, c02, c10, c11, c12, c20, c21, c22, c30, c31, c32; c00 = _mm256_setzero_ps(); c01 = _mm256_setzero_ps(); @@ -1607,22 +1552,45 @@ namespace Simd -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; const float * tail = (float*)mask + 24 - N + N24; - size_t i = 0; - for (; i < M4; i += 4) + if (M > N) { - size_t j = 0; - for (; j < N24; j += 24) - Kernel4x24(N, K, a + i * K, b + j * K, c + i * N + j); - if (N24 < N) - KernelMx24(N, K, a + i * K, b + j * K, c + i * N + j, tail, 4); + size_t i = 0; + for (; i < M4; i += 4) + { + size_t j = 0; + for (; j < N24; j += 24) + Kernel4x24(N, K, a + i * K, b + j * K, c + i * N + j); + if (N24 < N) + KernelMx24(N, K, a + i * K, b + j * K, c + i * N + j, tail, 4); + } + if (M4 < M) + { + size_t j = 0; + for (; j < N24; j += 24) + KernelMx24(N, K, a + i * K, b + j * K, c + i * N + j, NULL, M - M4); + if (N24 < N) + KernelMx24(N, K, a + i * K, b + j * K, c + i * N + j, tail, M - M4); + } } - if (M4 < M) + else { size_t j = 0; for (; j < N24; j += 24) - KernelMx24(N, K, a + i * K, b + j * K, c + i * N + j, NULL, M - M4); + { + size_t i = 0; + for (; i < M4; i += 4) + Kernel4x24(N, K, a + i * K, b + j * K, c + i * N + j); + if (M4 < M) + KernelMx24(N, K, a + i * K, b + j * K, c + i * N + j, NULL, M - M4); + } if (N24 < N) - KernelMx24(N, K, a + i * K, b + j * K, c + i * N + j, tail, M - M4); + { + size_t i = 0; + for (; i < M4; i += 4) + KernelMx24(N, K, a + i * K, b + j * K, c + i * N + j, tail, 4); + if (M4 < M) + KernelMx24(N, K, a + i * K, b + j * K, c + i * N + j, tail, M - M4); + } } } @@ -1667,7 +1635,6 @@ namespace Simd template void AddConvolution8x8(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, const float * weight, float * dst, size_t dstDepth) { - __m256 _weight[kernelX*kernelY]; for (size_t dstChannel = 0; dstChannel < dstDepth; ++dstChannel) { __m256 _dst[8]; @@ -1676,6 
+1643,7 @@ namespace Simd _dst[row] = Avx::Load(pdst); if (kernelY < 4) { + __m256 _weight[kernelX*kernelY]; for (size_t srcChannel = 0; srcChannel < srcDepth; ++srcChannel) { const float * psrc = src + srcWidth*srcHeight*srcChannel; @@ -1690,6 +1658,7 @@ namespace Simd } else { + __m256 _weight[kernelX]; for (size_t srcChannel = 0; srcChannel < srcDepth; ++srcChannel) { const float * psrc = src + srcWidth*srcHeight*srcChannel; @@ -1753,11 +1722,80 @@ namespace Simd } } + void AddConvolution1x1x16(const float * src, size_t srcDepth, const float * weight, float * dst, size_t dstDepth) + { + size_t dstDepth4 = dstDepth/4*4; + size_t dstChannel = 0; + for (; dstChannel < dstDepth4; dstChannel += 4) + { + __m256 dst00 = _mm256_loadu_ps(dst + 0 * F); + __m256 dst01 = _mm256_loadu_ps(dst + 1 * F); + __m256 dst10 = _mm256_loadu_ps(dst + 2 * F); + __m256 dst11 = _mm256_loadu_ps(dst + 3 * F); + __m256 dst20 = _mm256_loadu_ps(dst + 4 * F); + __m256 dst21 = _mm256_loadu_ps(dst + 5 * F); + __m256 dst30 = _mm256_loadu_ps(dst + 6 * F); + __m256 dst31 = _mm256_loadu_ps(dst + 7 * F); + const float * psrc = src; + const float * pw0 = weight; + const float * pw1 = pw0 + srcDepth; + const float * pw2 = pw1 + srcDepth; + const float * pw3 = pw2 + srcDepth; + for (size_t srcChannel = 0; srcChannel < srcDepth; ++srcChannel) + { + __m256 _weight; + __m256 src0 = _mm256_loadu_ps(psrc + 0 * F); + __m256 src1 = _mm256_loadu_ps(psrc + 1 * F); + _weight = _mm256_set1_ps(pw0[srcChannel]); + dst00 = _mm256_fmadd_ps(_weight, src0, dst00); + dst01 = _mm256_fmadd_ps(_weight, src1, dst01); + _weight = _mm256_set1_ps(pw1[srcChannel]); + dst10 = _mm256_fmadd_ps(_weight, src0, dst10); + dst11 = _mm256_fmadd_ps(_weight, src1, dst11); + _weight = _mm256_set1_ps(pw2[srcChannel]); + dst20 = _mm256_fmadd_ps(_weight, src0, dst20); + dst21 = _mm256_fmadd_ps(_weight, src1, dst21); + _weight = _mm256_set1_ps(pw3[srcChannel]); + dst30 = _mm256_fmadd_ps(_weight, src0, dst30); + dst31 = _mm256_fmadd_ps(_weight, 
src1, dst31); + psrc += 16; + } + _mm256_storeu_ps(dst + 0 * F, dst00); + _mm256_storeu_ps(dst + 1 * F, dst01); + _mm256_storeu_ps(dst + 2 * F, dst10); + _mm256_storeu_ps(dst + 3 * F, dst11); + _mm256_storeu_ps(dst + 4 * F, dst20); + _mm256_storeu_ps(dst + 5 * F, dst21); + _mm256_storeu_ps(dst + 6 * F, dst30); + _mm256_storeu_ps(dst + 7 * F, dst31); + dst += 16*4; + weight += srcDepth * 4; + } + for (; dstChannel < dstDepth; ++dstChannel) + { + __m256 dst0 = _mm256_loadu_ps(dst + 0 * F); + __m256 dst1 = _mm256_loadu_ps(dst + 1 * F); + const float * psrc = src; + for (size_t srcChannel = 0; srcChannel < srcDepth; ++srcChannel) + { + __m256 weight0 = _mm256_set1_ps(*weight++); + dst0 = _mm256_fmadd_ps(weight0, _mm256_loadu_ps(psrc + 0 * F), dst0); + dst1 = _mm256_fmadd_ps(weight0, _mm256_loadu_ps(psrc + 1 * F), dst1); + psrc += 16; + } + _mm256_storeu_ps(dst + 0 * F, dst0); + _mm256_storeu_ps(dst + 1 * F, dst1); + dst += 16; + } + } + void Execute(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, const float * weight, size_t kernelX, size_t kernelY, float * dst, size_t dstWidth, size_t dstHeight, size_t dstDepth) { assert(kernelX == kernelY); - if (kernelX == 2) + if (kernelX == 1 && dstWidth*dstHeight == 16) + AddConvolution1x1x16(src, srcDepth, weight, dst, dstDepth); + else if (kernelX == 2) AddConvolution(src, srcWidth, srcHeight, srcDepth, weight, dst, dstWidth, dstHeight, dstDepth); else if (kernelX == 3) AddConvolution(src, srcWidth, srcHeight, srcDepth, weight, dst, dstWidth, dstHeight, dstDepth); @@ -1771,9 +1809,11 @@ namespace Simd bool Preferable(size_t srcDepth, size_t kernelX, size_t kernelY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, size_t dstWidth, size_t dstHeight, size_t dstDepth) { - if (kernelX == kernelY && kernelX >= 2 && kernelX <= 5 && strideX*strideY*dilationX*dilationY == 1) + if (kernelX == kernelY && strideX*strideY*dilationX*dilationY == 1 && dstWidth >= F) { - if 
(dstWidth*dstHeight*kernelX*kernelY >= 8 * 8 * 3 * 3) + if (kernelX >= 2 && kernelX <= 5 && dstWidth*dstHeight*kernelX*kernelY >= 8 * 8 * 3 * 3) + return true; + if (kernelX == 1 && (dstWidth*dstHeight == 16))// || dstWidth * dstHeight == 64)) return true; } return false; diff --git a/src/3rd/Simd/SimdAvx2Resizer.cpp b/src/3rd/Simd/SimdAvx2Resizer.cpp new file mode 100644 index 00000000..f263b94d --- /dev/null +++ b/src/3rd/Simd/SimdAvx2Resizer.cpp @@ -0,0 +1,151 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdResizer.h" +#include "Simd/SimdStore.h" + +namespace Simd +{ +#ifdef SIMD_AVX2_ENABLE + namespace Avx2 + { + ResizerFloatBilinear::ResizerFloatBilinear(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, bool caffeInterp) + : Base::ResizerFloatBilinear(srcX, srcY, dstX, dstY, channels, sizeof(__m256), caffeInterp) + { + } + + void ResizerFloatBilinear::Run(const float * src, size_t srcStride, float * dst, size_t dstStride) const + { + Array32f bx[2]; + bx[0].Resize(_rs); + bx[1].Resize(_rs); + float * pbx[2] = { bx[0].data, bx[1].data }; + int32_t prev = -2; + size_t rsa = AlignLo(_rs, Avx::F); + size_t rsh = AlignLo(_rs, Sse::F); + for (size_t dy = 0; dy < _dy; dy++, dst += dstStride) + { + float fy1 = _ay[dy]; + float fy0 = 1.0f - fy1; + int32_t sy = _iy[dy]; + int32_t k = 0; + + if (sy == prev) + k = 2; + else if (sy == prev + 1) + { + Swap(pbx[0], pbx[1]); + k = 1; + } + + prev = sy; + + for (; k < 2; k++) + { + float * pb = pbx[k]; + const float * ps = src + (sy + k)*srcStride; + size_t dx = 0; + if (_cn == 1) + { + __m256 _1 = _mm256_set1_ps(1.0f); + for (; dx < rsa; dx += Avx::F) + { + __m256i idx = Avx2::LoadPermuted((__m256i*)(_ix.data + dx)); + __m256 s0145 = _mm256_castpd_ps(_mm256_i32gather_pd((double*)ps, _mm256_extracti128_si256(idx, 0), 4)); + __m256 s2367 = _mm256_castpd_ps(_mm256_i32gather_pd((double*)ps, _mm256_extracti128_si256(idx, 1), 4)); + __m256 fx1 = _mm256_load_ps(_ax.data + dx); + __m256 fx0 = _mm256_sub_ps(_1, fx1); + __m256 s0 = _mm256_shuffle_ps(s0145, s2367, 0x88); + __m256 s1 = _mm256_shuffle_ps(s0145, s2367, 0xDD); + _mm256_store_ps(pb + dx, _mm256_fmadd_ps(s0, fx0, _mm256_mul_ps(s1, fx1))); + } + for (; dx < rsh; dx += Sse::F) + { + __m128 s01 = Sse::Load(ps + _ix[dx + 0], ps + _ix[dx + 1]); + __m128 s23 = Sse::Load(ps + _ix[dx + 2], ps + _ix[dx + 3]); + __m128 fx1 = _mm_load_ps(_ax.data + dx); + __m128 fx0 = _mm_sub_ps(_mm256_castps256_ps128(_1), fx1); + __m128 
m0 = _mm_mul_ps(fx0, _mm_shuffle_ps(s01, s23, 0x88)); + __m128 m1 = _mm_mul_ps(fx1, _mm_shuffle_ps(s01, s23, 0xDD)); + _mm_store_ps(pb + dx, _mm_add_ps(m0, m1)); + } + } + else + { + __m256 _1 = _mm256_set1_ps(1.0f); + __m256i cn = _mm256_set1_epi32((int)_cn); + for (; dx < rsa; dx += Avx::F) + { + __m256i i0 = _mm256_load_si256((__m256i*)(_ix.data + dx)); + __m256i i1 = _mm256_add_epi32(i0, cn); + __m256 s0 = _mm256_i32gather_ps(ps, i0, 4); + __m256 s1 = _mm256_i32gather_ps(ps, i1, 4); + __m256 fx1 = _mm256_load_ps(_ax.data + dx); + __m256 fx0 = _mm256_sub_ps(_1, fx1); + _mm256_store_ps(pb + dx, _mm256_fmadd_ps(s0, fx0, _mm256_mul_ps(s1, fx1))); + } + } + for (; dx < _rs; dx++) + { + int32_t sx = _ix[dx]; + float fx = _ax[dx]; + pb[dx] = ps[sx] * (1.0f - fx) + ps[sx + _cn] * fx; + } + } + + size_t dx = 0; + __m256 _fy0 = _mm256_set1_ps(fy0); + __m256 _fy1 = _mm256_set1_ps(fy1); + for (; dx < rsa; dx += Avx::F) + { + __m256 b0 = _mm256_load_ps(pbx[0] + dx); + __m256 b1 = _mm256_load_ps(pbx[1] + dx); + _mm256_storeu_ps(dst + dx, _mm256_fmadd_ps(b0, _fy0, _mm256_mul_ps(b1, _fy1))); + } + for (; dx < rsh; dx += Sse::F) + { + __m128 m0 = _mm_mul_ps(_mm_load_ps(pbx[0] + dx), _mm256_castps256_ps128(_fy0)); + __m128 m1 = _mm_mul_ps(_mm_load_ps(pbx[1] + dx), _mm256_castps256_ps128(_fy1)); + _mm_storeu_ps(dst + dx, _mm_add_ps(m0, m1)); + } + for (; dx < _rs; dx++) + dst[dx] = pbx[0][dx] * fy0 + pbx[1][dx] * fy1; + } + } + + //--------------------------------------------------------------------- + + void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) + { + if (type == SimdResizeChannelFloat && method == SimdResizeMethodBilinear) + return new ResizerFloatBilinear(srcX, srcY, dstX, dstY, channels, false); + else if (type == SimdResizeChannelFloat && method == SimdResizeMethodCaffeInterp) + return new ResizerFloatBilinear(srcX, srcY, dstX, dstY, channels, true); + else + return 
Base::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); + } + } +#endif //SIMD_AVX2_ENABLE +} + diff --git a/src/3rd/Simd/SimdAvx2Statistic.cpp b/src/3rd/Simd/SimdAvx2Statistic.cpp index 03d7c5eb..af9209df 100644 --- a/src/3rd/Simd/SimdAvx2Statistic.cpp +++ b/src/3rd/Simd/SimdAvx2Statistic.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -538,6 +538,46 @@ namespace Simd SquareSum(src, stride, width, height, sum); } + template void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum) + { + assert(width >= A); + if (align) + assert(Aligned(src) && Aligned(stride)); + + size_t bodyWidth = AlignLo(width, A); + __m256i tailMask = SetMask(0, A - width + bodyWidth, 0xFF); + __m256i fullValueSum = _mm256_setzero_si256(); + __m256i fullSquareSum = _mm256_setzero_si256(); + for (size_t row = 0; row < height; ++row) + { + __m256i rowSquareSum = _mm256_setzero_si256(); + for (size_t col = 0; col < bodyWidth; col += A) + { + const __m256i value = Load((__m256i*)(src + col)); + fullValueSum = _mm256_add_epi64(_mm256_sad_epu8(value, K_ZERO), fullValueSum); + rowSquareSum = _mm256_add_epi32(rowSquareSum, Square(value)); + } + if (width - bodyWidth) + { + const __m256i value = _mm256_and_si256(tailMask, Load((__m256i*)(src + width - A))); + fullValueSum = _mm256_add_epi64(_mm256_sad_epu8(value, K_ZERO), fullValueSum); + rowSquareSum = _mm256_add_epi32(rowSquareSum, Square(value)); + } + fullSquareSum = _mm256_add_epi64(fullSquareSum, HorizontalSum32(rowSquareSum)); + src += stride; + } + *valueSum = ExtractSum(fullValueSum); + *squareSum = ExtractSum(fullSquareSum); + } + + void ValueSquareSum(const uint8_t * src, size_t stride, 
size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum) + { + if (Aligned(src) && Aligned(stride)) + ValueSquareSum(src, stride, width, height, valueSum, squareSum); + else + ValueSquareSum(src, stride, width, height, valueSum, squareSum); + } + SIMD_INLINE __m256i Correlation(__m256i a, __m256i b) { const __m256i lo = _mm256_madd_epi16(_mm256_unpacklo_epi8(a, _mm256_setzero_si256()), _mm256_unpacklo_epi8(b, _mm256_setzero_si256())); diff --git a/src/3rd/Simd/SimdAvx2Synet.cpp b/src/3rd/Simd/SimdAvx2Synet.cpp new file mode 100644 index 00000000..6da16eae --- /dev/null +++ b/src/3rd/Simd/SimdAvx2Synet.cpp @@ -0,0 +1,238 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdStore.h" +#include "Simd/SimdExtract.h" +#include "Simd/SimdAvx1.h" +#include "Simd/SimdArray.h" +#include "Simd/SimdPow.h" + +namespace Simd +{ +#ifdef SIMD_AVX2_ENABLE + namespace Avx2 + { + template void SynetEltwiseLayerForwardSum(const float * src0, const __m256 & weight0, const float * src1, const __m256 & weight1, float * dst, size_t offset) + { + Avx::Store(dst + offset, _mm256_fmadd_ps(Avx::Load(src0 + offset), weight0, _mm256_mul_ps(Avx::Load(src1 + offset), weight1))); + } + + template void SynetEltwiseLayerForwardSum(const float * src, const __m256 & weight, float * dst, size_t offset) + { + Avx::Store(dst + offset, _mm256_fmadd_ps(Avx::Load(src + offset), weight, Load(dst + offset))); + } + + template void SynetEltwiseLayerForwardSum(float const * const * src, const float * weight, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t partial = AlignLo(size, F); + const float * src0 = src[0]; + const float * src1 = src[1]; + __m256 weight0 = _mm256_set1_ps(weight[0]); + __m256 weight1 = _mm256_set1_ps(weight[1]); + size_t j = 0; + if (partial) + { + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 0); + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 1); + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 2); + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j); + } + for (; j < size; ++j) + dst[j] = src0[j] * weight[0] + src1[j] * weight[1]; + for (size_t i = 2; i < count; ++i) + { + const float * srci = src[i]; + __m256 weighti = _mm256_set1_ps(weight[i]); + size_t j = 0; + if (partial) + { + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 0); + SynetEltwiseLayerForwardSum(srci, weighti, dst, j + 
F * 1); + SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 2); + SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardSum(srci, weighti, dst, j); + } + for (; j < size; ++j) + dst[j] += srci[j] * weight[i]; + } + } + + void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst) + { + if (type != SimdSynetEltwiseOperationSum) + { + Avx::SynetEltwiseLayerForward(src, weight, count, size, type, dst); + return; + } + assert(count >= 2); + bool aligned = Aligned(dst) && Aligned(src[0]) && Aligned(src[1]); + for (size_t i = 2; i < count; ++i) + aligned = aligned && Aligned(src[i]); + if (aligned) + SynetEltwiseLayerForwardSum(src, weight, count, size, dst); + else + SynetEltwiseLayerForwardSum(src, weight, count, size, dst); + } + + template SIMD_INLINE void SynetLrnLayerCrossChannels(const float * src, size_t half, size_t count, size_t size, const float * k, float * dst) + { + size_t aligned = AlignLo(size, F); + Array32f sum(size, true), zero(size, true); + + for (size_t i = 0; i < half; ++i) + { + const float * pos = src + i * size; + size_t j = 0; + for (; j < aligned; j += F) + { + __m256 _pos = Avx::Load(pos + j); + Avx::Store(sum.data + j, _mm256_fmadd_ps(_pos, _pos, Avx::Load(sum.data + j))); + } + for (; j < size; ++j) + sum[j] += Simd::Square(pos[j]); + } + + __m256 k0 = _mm256_set1_ps(k[0]); + __m256 k1 = _mm256_set1_ps(k[1]); + __m256 k2 = _mm256_set1_ps(k[2]); + Avx2::Pow pow; + for (size_t i = 0; i < count; ++i) + { + const float * pos = (i < count - half) ? src + half * size : zero.data; + const float * neg = (i > half) ? 
src - (half + 1) * size : zero.data; + size_t j = 0; + for (; j < aligned; j += F) + { + __m256 _pos = Avx::Load(pos + j); + __m256 _neg = Avx::Load(neg + j); + __m256 _sum = Avx::Load(sum.data + j); + _sum = _mm256_fmadd_ps(_pos, _pos, _mm256_fnmadd_ps(_neg, _neg, _sum)); + __m256 _src = Avx::Load(src + j); + Avx::Store(sum.data + j, _sum); + Avx::Store(dst + j, _mm256_mul_ps(_src, pow(_mm256_fmadd_ps(k1, _sum, k0), k2))); + } + for (; j < size; ++j) + { + sum[j] += Simd::Square(pos[j]); + sum[j] -= Simd::Square(neg[j]); + dst[j] = src[j] * Base::Pow(k[0] + k[1] * sum[j], k[2]); + } + src += size; + dst += size; + } + } + + void SynetLrnLayerCrossChannels(const float * src, size_t half, size_t count, size_t size, const float * k, float * dst) + { + if (Aligned(src) && Aligned(dst) && Aligned(size)) + SynetLrnLayerCrossChannels(src, half, count, size, k, dst); + else + SynetLrnLayerCrossChannels(src, half, count, size, k, dst); + } + + template SIMD_INLINE void SynetScaleLayerForward(const float * src, const __m256 & scale, const __m256 & bias, float * dst, size_t offset) + { + Avx::Store(dst + offset, _mm256_fmadd_ps(Avx::Load(src + offset), scale, bias)); + } + + template SIMD_INLINE void SynetScaleLayerForward(const float * src, const __m256 & scale, float * dst, size_t offset) + { + Avx::Store(dst + offset, _mm256_mul_ps(Avx::Load(src + offset), scale)); + } + + template SIMD_INLINE void SynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t partial = AlignLo(size, F); + if (bias) + { + for (size_t i = 0; i < count; ++i) + { + size_t j = 0; + if (partial) + { + __m256 _scale = _mm256_set1_ps(scale[i]); + __m256 _bias = _mm256_set1_ps(bias[i]); + for (; j < aligned; j += QF) + { + SynetScaleLayerForward(src, _scale, _bias, dst, j + F * 0); + SynetScaleLayerForward(src, _scale, _bias, dst, j + F * 1); + SynetScaleLayerForward(src, _scale, _bias, 
dst, j + F * 2); + SynetScaleLayerForward(src, _scale, _bias, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetScaleLayerForward(src, _scale, _bias, dst, j); + } + for (; j < size; ++j) + dst[j] = src[j] * scale[i] + bias[i]; + src += size; + dst += size; + } + } + else + { + for (size_t i = 0; i < count; ++i) + { + size_t j = 0; + if (partial) + { + __m256 _scale = _mm256_set1_ps(scale[i]); + for (; j < aligned; j += QF) + { + SynetScaleLayerForward(src, _scale, dst, j + F * 0); + SynetScaleLayerForward(src, _scale, dst, j + F * 1); + SynetScaleLayerForward(src, _scale, dst, j + F * 2); + SynetScaleLayerForward(src, _scale, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetScaleLayerForward(src, _scale, dst, j); + } + for (; j < size; ++j) + dst[j] = src[j] * scale[i]; + src += size; + dst += size; + } + } + } + + void SynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst) + { + if (Aligned(dst) && Aligned(size)) + SynetScaleLayerForward(src, scale, bias, count, size, dst); + else + SynetScaleLayerForward(src, scale, bias, count, size, dst); + } + } +#endif// SIMD_AVX2_ENABLE +} diff --git a/src/3rd/Simd/SimdAvx512bw.h b/src/3rd/Simd/SimdAvx512bw.h index 45f56475..f8d843e0 100644 --- a/src/3rd/Simd/SimdAvx512bw.h +++ b/src/3rd/Simd/SimdAvx512bw.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -179,10 +179,14 @@ namespace Simd void SquaredDifferenceSum16f(const uint16_t * a, const uint16_t * b, size_t size, float * sum); + void CosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance); + void Float32ToUint8(const float * src, size_t size, const float * lower, const float * upper, uint8_t * dst); void Uint8ToFloat32(const uint8_t * src, size_t size, const float * lower, const float * upper, float * dst); + void CosineDistance32f(const float * a, const float * b, size_t size, float * distance); + void GaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride); void GrayToBgr(const uint8_t * gray, size_t width, size_t height, size_t grayStride, uint8_t * bgr, size_t bgrStride); @@ -211,7 +215,7 @@ namespace Simd void HogLiteExtractFeatures(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t cell, float * features, size_t featuresStride); - void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride); + void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride); void HogLiteResizeFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight); @@ -355,6 +359,8 @@ namespace Simd void SquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); + void ValueSquareSum(const 
uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum); + void CorrelationSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum); void StretchGray2x2(const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, diff --git a/src/3rd/Simd/SimdAvx512bwFloat16.cpp b/src/3rd/Simd/SimdAvx512bwFloat16.cpp index 3f4252bd..d8b6eac7 100644 --- a/src/3rd/Simd/SimdAvx512bwFloat16.cpp +++ b/src/3rd/Simd/SimdAvx512bwFloat16.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -202,6 +202,67 @@ namespace Simd else SquaredDifferenceSum16f(a, b, size, sum); } + + template SIMD_INLINE void CosineDistance16f(const __m512i & a, const __m512i & b, __m512 * aa, __m512 * ab, __m512 * bb) + { + __m512 a0 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(a, part)); + __m512 b0 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(b, part)); + aa[part] = _mm512_fmadd_ps(a0, a0, aa[part]); + ab[part] = _mm512_fmadd_ps(a0, b0, ab[part]); + bb[part] = _mm512_fmadd_ps(b0, b0, bb[part]); + } + + template SIMD_INLINE void CosineDistance16f2(const uint16_t * a, const uint16_t * b, __m512 * aa, __m512 * ab, __m512 * bb, __mmask32 tail = -1) + { + __m512i a0 = Load(a, tail); + __m512i b0 = Load(b, tail); + CosineDistance16f<0>(a0, b0, aa, ab, bb); + CosineDistance16f<1>(a0, b0, aa, ab, bb); + } + + template SIMD_INLINE void CosineDistance16f4(const uint16_t * a, const uint16_t * b, __m512 * aa, __m512 * ab, __m512 * bb) + { + __m512i a0 = Load(a + 00); + __m512i b0 = Load(b + 00); + CosineDistance16f<0>(a0, b0, aa, ab, bb); + CosineDistance16f<1>(a0, b0, aa, ab, bb); + __m512i a1 = Load(a + HA); + __m512i b1 = Load(b + HA); 
+ CosineDistance16f<0>(a1, b1, aa, ab, bb); + CosineDistance16f<1>(a1, b1, aa, ab, bb); + } + + template void CosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance) + { + if (align) + assert(Aligned(a) && Aligned(b)); + + size_t alignedSize = AlignLo(size, DF); + __mmask32 tailMask = TailMask32(size - alignedSize); + size_t fullAlignedSize = AlignLo(size, QF); + size_t i = 0; + __m512 _aa[2] = { _mm512_setzero_ps(), _mm512_setzero_ps() }; + __m512 _ab[2] = { _mm512_setzero_ps(), _mm512_setzero_ps() }; + __m512 _bb[2] = { _mm512_setzero_ps(), _mm512_setzero_ps() }; + for (; i < fullAlignedSize; i += QF) + CosineDistance16f4(a + i, b + i, _aa, _ab, _bb); + for (; i < alignedSize; i += DF) + CosineDistance16f2(a + i, b + i, _aa, _ab, _bb); + if (i < size) + CosineDistance16f2(a + i, b + i, _aa, _ab, _bb, tailMask); + float aa = Avx512f::ExtractSum(_mm512_add_ps(_aa[0], _aa[1])); + float ab = Avx512f::ExtractSum(_mm512_add_ps(_ab[0], _ab[1])); + float bb = Avx512f::ExtractSum(_mm512_add_ps(_bb[0], _bb[1])); + *distance = 1.0f - ab / ::sqrt(aa*bb); + } + + void CosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance) + { + if (Aligned(a) && Aligned(b)) + CosineDistance16f(a, b, size, distance); + else + CosineDistance16f(a, b, size, distance); + } } #endif// SIMD_AVX512BW_ENABLE } diff --git a/src/3rd/Simd/SimdAvx512bwFloat32.cpp b/src/3rd/Simd/SimdAvx512bwFloat32.cpp index 8cbbede8..165a1ed4 100644 --- a/src/3rd/Simd/SimdAvx512bwFloat32.cpp +++ b/src/3rd/Simd/SimdAvx512bwFloat32.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -23,6 +23,7 @@ */ #include "Simd/SimdMemory.h" #include "Simd/SimdStore.h" +#include "Simd/SimdExtract.h" namespace Simd { @@ -75,7 +76,7 @@ namespace Simd template SIMD_INLINE void Uint8ToFloat32(const __m128i & value, const __m512 & lower, const __m512 & boost, float * dst, __mmask16 tail) { - Avx512f::Store(dst, _mm512_sub_ps(_mm512_mul_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(value)), boost), lower), tail); + Avx512f::Store(dst, _mm512_add_ps(_mm512_mul_ps(_mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(value)), boost), lower), tail); } template SIMD_INLINE void Uint8ToFloat32(const uint8_t * src, const __m512 & lower, const __m512 & boost, float * dst, __mmask64 srcTail, const __mmask16 * dstTails) @@ -115,6 +116,64 @@ namespace Simd else Uint8ToFloat32(src, size, lower, upper, dst); } + + template void CosineDistance32f(const float * a, const float * b, size_t size, float * distance) + { + if (align) + assert(Aligned(a) && Aligned(b)); + + size_t partialAlignedSize = AlignLo(size, F); + size_t fullAlignedSize = AlignLo(size, DF); + size_t i = 0; + __m512 _aa[2] = { _mm512_setzero_ps(), _mm512_setzero_ps() }; + __m512 _ab[2] = { _mm512_setzero_ps(), _mm512_setzero_ps() }; + __m512 _bb[2] = { _mm512_setzero_ps(), _mm512_setzero_ps() }; + if (fullAlignedSize) + { + for (; i < fullAlignedSize; i += DF) + { + __m512 a0 = Avx512f::Load(a + i + 0 * F); + __m512 b0 = Avx512f::Load(b + i + 0 * F); + _aa[0] = _mm512_fmadd_ps(a0, a0, _aa[0]); + _ab[0] = _mm512_fmadd_ps(a0, b0, _ab[0]); + _bb[0] = _mm512_fmadd_ps(b0, b0, _bb[0]); + __m512 a1 = Avx512f::Load(a + i + 1 * F); + __m512 b1 = Avx512f::Load(b + i + 1 * F); + _aa[1] = _mm512_fmadd_ps(a1, a1, _aa[1]); + _ab[1] = _mm512_fmadd_ps(a1, b1, _ab[1]); + _bb[1] = _mm512_fmadd_ps(b1, b1, _bb[1]); + } + _aa[0] = _mm512_add_ps(_aa[0], _aa[1]); + _ab[0] = 
_mm512_add_ps(_ab[0], _ab[1]); + _bb[0] = _mm512_add_ps(_bb[0], _bb[1]); + } + for (; i < partialAlignedSize; i += F) + { + __m512 a0 = Avx512f::Load(a + i); + __m512 b0 = Avx512f::Load(b + i); + _aa[0] = _mm512_fmadd_ps(a0, a0, _aa[0]); + _ab[0] = _mm512_fmadd_ps(a0, b0, _ab[0]); + _bb[0] = _mm512_fmadd_ps(b0, b0, _bb[0]); + } + float aa = Avx512f::ExtractSum(_aa[0]), ab = Avx512f::ExtractSum(_ab[0]), bb = Avx512f::ExtractSum(_bb[0]); + for (; i < size; ++i) + { + float _a = a[i]; + float _b = b[i]; + aa += _a * _a; + ab += _a * _b; + bb += _b * _b; + } + *distance = 1.0f - ab / ::sqrt(aa*bb); + } + + void CosineDistance32f(const float * a, const float * b, size_t size, float * distance) + { + if (Aligned(a) && Aligned(b)) + CosineDistance32f(a, b, size, distance); + else + CosineDistance32f(a, b, size, distance); + } } #endif// SIMD_AVX512BW_ENABLE } diff --git a/src/3rd/Simd/SimdAvx512bwHog.cpp b/src/3rd/Simd/SimdAvx512bwHog.cpp index 9817e50d..c875227f 100644 --- a/src/3rd/Simd/SimdAvx512bwHog.cpp +++ b/src/3rd/Simd/SimdAvx512bwHog.cpp @@ -517,12 +517,12 @@ namespace Simd } #else __m128 * ps = (__m128*)src; - __m128 s0 = _mm_add_ps(_mm_unpacklo_ps(ps[16], ps[17]), _mm_loadh_pi(_mm_loadl_pi(_mm_setzero_ps(), (__m64*)(h0[0] + 16)), (__m64*)(h0[1] + 16))); - __m128 s1 = _mm_add_ps(_mm_unpackhi_ps(ps[16], ps[17]), _mm_loadh_pi(_mm_loadl_pi(_mm_setzero_ps(), (__m64*)(h1[0] + 16)), (__m64*)(h1[1] + 16))); - _mm_storel_pi((__m64*)(h0[0] + 16), s0); - _mm_storeh_pi((__m64*)(h0[1] + 16), s0); - _mm_storel_pi((__m64*)(h1[0] + 16), s1); - _mm_storeh_pi((__m64*)(h1[1] + 16), s1); + __m128 s0 = _mm_add_ps(_mm_unpacklo_ps(ps[16], ps[17]), Sse::Load(h0[0] + 16, h0[1] + 16)); + __m128 s1 = _mm_add_ps(_mm_unpackhi_ps(ps[16], ps[17]), Sse::Load(h1[0] + 16, h1[1] + 16)); + Sse::StoreHalf<0>(h0[0] + 16, s0); + Sse::StoreHalf<1>(h0[1] + 16, s0); + Sse::StoreHalf<0>(h1[0] + 16, s1); + Sse::StoreHalf<1>(h1[1] + 16, s1); #endif h0++; h1++; diff --git 
a/src/3rd/Simd/SimdAvx512bwHogLite.cpp b/src/3rd/Simd/SimdAvx512bwHogLite.cpp index edb32257..00cfb3da 100644 --- a/src/3rd/Simd/SimdAvx512bwHogLite.cpp +++ b/src/3rd/Simd/SimdAvx512bwHogLite.cpp @@ -470,9 +470,9 @@ namespace Simd sums[1] = _mm512_fmadd_ps(src5, filter3, sums[1]); } - template void Filter8(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterSize, float * dst, size_t dstStride) + template void Filter8(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterWidth, size_t filterHeight, float * dst, size_t dstStride) { - size_t filterStride = 8 * filterSize; + size_t filterStride = 8 * filterWidth; size_t alignedDstWidth = AlignLo(dstWidth, 8); size_t alignedFilterStride = AlignLo(filterStride, DF); for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) @@ -483,7 +483,7 @@ namespace Simd __m512 sums[2] = { _mm512_setzero_ps(), _mm512_setzero_ps() }; const float * pSrc = src + dstRow * srcStride + dstCol * 8; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { size_t filterCol = 0; for (; filterCol < alignedFilterStride; filterCol += DF) @@ -503,7 +503,7 @@ namespace Simd __m256 sum = _mm256_setzero_ps(); const float * pSrc = src + dstRow * srcStride + dstCol * 8; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { for (size_t filterCol = 0; filterCol < filterStride; filterCol += Avx::F) ProductSum1x1(pSrc + filterCol, pFilter + filterCol, sum); @@ -516,9 +516,9 @@ namespace Simd } } - template void Filter8(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + template void Filter8(const 
float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { - size_t filterStride = 8 * filterSize; + size_t filterStride = 8 * filterWidth; size_t alignedDstWidth = AlignLo(dstWidth, 8); size_t alignedFilterStride = AlignLo(filterStride, DF); __m128 _min = _mm_set1_ps(-FLT_MAX); @@ -535,7 +535,7 @@ namespace Simd __m512 sums[2] = { _mm512_setzero_ps(), _mm512_setzero_ps() }; const float * pSrc = src + dstRow * srcStride + dstCol * 8; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { size_t filterCol = 0; for (; filterCol < alignedFilterStride; filterCol += DF) @@ -558,7 +558,7 @@ namespace Simd __m256 sum = _mm256_setzero_ps(); const float * pSrc = src + dstRow * srcStride + dstCol * 8; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { for (size_t filterCol = 0; filterCol < filterStride; filterCol += Avx::F) ProductSum1x1(pSrc + filterCol, pFilter + filterCol, sum); @@ -603,9 +603,9 @@ namespace Simd sums[3] = _mm512_fmadd_ps(src4, filter1, sums[3]); } - template void Filter16(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterSize, float * dst, size_t dstStride) + template void Filter16(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterWidth, size_t filterHeight, float * dst, size_t dstStride) { - size_t filterStride = 16 * filterSize; + size_t filterStride = 16 * filterWidth; size_t alignedDstWidth = AlignLo(dstWidth, 4); size_t alignedFilterStride = AlignLo(filterStride, DF); for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) @@ -616,7 +616,7 @@ namespace Simd __m512 sums[4] 
= { _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() }; const float * pSrc = src + dstRow * srcStride + dstCol * 16; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { size_t filterCol = 0; for (; filterCol < alignedFilterStride; filterCol += DF) @@ -638,7 +638,7 @@ namespace Simd __m512 sum = _mm512_setzero_ps(); const float * pSrc = src + dstRow * srcStride + dstCol * 16; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { for (size_t filterCol = 0; filterCol < filterStride; filterCol += F) ProductSum1x1(pSrc + filterCol, pFilter + filterCol, sum); @@ -651,9 +651,9 @@ namespace Simd } } - template void Filter16(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + template void Filter16(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { - size_t filterStride = 16 * filterSize; + size_t filterStride = 16 * filterWidth; size_t alignedDstWidth = AlignLo(dstWidth, 4); size_t alignedFilterStride = AlignLo(filterStride, DF); __m128 _min = _mm_set1_ps(-FLT_MAX); @@ -670,7 +670,7 @@ namespace Simd __m512 sums[4] = { _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() }; const float * pSrc = src + dstRow * srcStride + dstCol * 16; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { size_t filterCol = 0; for (; filterCol < alignedFilterStride; filterCol += DF) @@ -695,7 +695,7 @@ 
namespace Simd __m512 sum = _mm512_setzero_ps(); const float * pSrc = src + dstRow * srcStride + dstCol * 16; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { for (size_t filterCol = 0; filterCol < filterStride; filterCol += F) ProductSum1x1(pSrc + filterCol, pFilter + filterCol, sum); @@ -712,53 +712,53 @@ namespace Simd } } - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterSize, float * dst, size_t dstStride) + template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, float * dst, size_t dstStride) { if (featureSize == 16) - Filter16(src, srcStride, dstWidth, dstHeight, filter, filterSize, dst, dstStride); + Filter16(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, dst, dstStride); else - Filter8(src, srcStride, dstWidth, dstHeight, filter, filterSize, dst, dstStride); + Filter8(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, dst, dstStride); } - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { if (featureSize == 16) - Filter16(src, srcStride, dstWidth, dstHeight, filter, filterSize, mask, maskStride, dst, dstStride); + Filter16(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); else - Filter8(src, srcStride, 
dstWidth, dstHeight, filter, filterSize, mask, maskStride, dst, dstStride); + Filter8(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); } public: - void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { assert(featureSize == 8 || featureSize == 16); - assert(srcWidth >= filterSize && srcHeight >= filterSize); + assert(srcWidth >= filterWidth && srcHeight >= filterHeight); - size_t dstWidth = srcWidth - filterSize + 1; - size_t dstHeight = srcHeight - filterSize + 1; + size_t dstWidth = srcWidth - filterWidth + 1; + size_t dstHeight = srcHeight - filterHeight + 1; if (mask) { if (Aligned(src) && Aligned(srcStride) && Aligned(filter)) - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); else - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); } else { if (Aligned(src) && Aligned(srcStride) && Aligned(filter)) - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, dst, dstStride); else - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, dst, dstStride); + Filter(src, srcStride, 
dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, dst, dstStride); } } }; - void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { HogLiteFeatureFilter featureFilter; - featureFilter.Run(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + featureFilter.Run(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); } class HogLiteFeatureResizer diff --git a/src/3rd/Simd/SimdAvx512bwReduceGray4x4.cpp b/src/3rd/Simd/SimdAvx512bwReduceGray4x4.cpp index df7520e0..9475c008 100644 --- a/src/3rd/Simd/SimdAvx512bwReduceGray4x4.cpp +++ b/src/3rd/Simd/SimdAvx512bwReduceGray4x4.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/src/3rd/Simd/SimdAvx512bwResizeBilinear.cpp b/src/3rd/Simd/SimdAvx512bwResizeBilinear.cpp index f3fb710d..499a176d 100644 --- a/src/3rd/Simd/SimdAvx512bwResizeBilinear.cpp +++ b/src/3rd/Simd/SimdAvx512bwResizeBilinear.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -306,15 +306,34 @@ namespace Simd Store(dst, _mm512_permutexvar_epi64(K64_PERMUTE_FOR_PACK, _mm512_packus_epi16(lo, hi))); } + template SIMD_INLINE void Gather(const uint8_t * src, const int * idx, size_t size, uint8_t * dst) + { + struct Src { uint8_t channels[channelCount * 1]; }; + struct Dst { uint8_t channels[channelCount * 2]; }; + const Src * s = (const Src *)src; + Dst * d = (Dst*)dst; + for (size_t i = 0; i < size; i++) + d[i] = *(Dst *)(s + idx[i]); + } + + template <> SIMD_INLINE void Gather<2>(const uint8_t * src, const int * idx, size_t size, uint8_t * dst) + { + for (size_t i = 0; i < size; i += 16) + _mm512_storeu_si512(dst + 4*i, _mm512_i32gather_epi32(_mm512_loadu_si512(idx + i), src, 2)); + } + + template <> SIMD_INLINE void Gather<4>(const uint8_t * src, const int * idx, size_t size, uint8_t * dst) + { + for (size_t i = 0; i < size; i += 8) + _mm512_storeu_si512(dst + 8 * i, _mm512_i32gather_epi64(_mm256_loadu_si256((__m256i*)(idx + i)), src, 4)); + } + template void ResizeBilinear( const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride) { assert(dstWidth >= A); - struct One { uint8_t channels[channelCount]; }; - struct Two { uint8_t channels[channelCount * 2]; }; - size_t size = 2 * dstWidth*channelCount; size_t bufferSize = AlignHi(dstWidth, A)*channelCount * 2; size_t alignedSize = AlignHi(size, DA) - DA; @@ -350,10 +369,7 @@ namespace Simd for (; k < 2; k++) { - Two * pb = (Two *)buffer.bx[k]; - const One * psrc = (const One *)(src + (sy + k)*srcStride); - for (size_t x = 0; x < dstWidth; x++) - pb[x] = *(Two *)(psrc + buffer.ix[x]); + Gather(src + (sy + k)*srcStride, buffer.ix, dstWidth, buffer.bx[k]); uint8_t * pbx = buffer.bx[k]; for (size_t i = 0; i < bufferSize; i += step) diff --git 
a/src/3rd/Simd/SimdAvx512bwStatistic.cpp b/src/3rd/Simd/SimdAvx512bwStatistic.cpp index e5a9cf8d..27a35a7d 100644 --- a/src/3rd/Simd/SimdAvx512bwStatistic.cpp +++ b/src/3rd/Simd/SimdAvx512bwStatistic.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -555,6 +555,62 @@ namespace Simd SquareSum(src, stride, width, height, sum); } + template void ValueSquareSum(const __m512i & value, __m512i * valueSums, __m512i * squareSums) + { + valueSums[index] = _mm512_add_epi64(valueSums[index], _mm512_sad_epu8(value, K_ZERO)); + squareSums[index] = _mm512_add_epi32(squareSums[index], SquareSum(value)); + } + + template void ValueSquareSum4(const uint8_t * src, __m512i * valueSums, __m512i * squareSums) + { + ValueSquareSum<0>(Load(src + 0 * A), valueSums, squareSums); + ValueSquareSum<1>(Load(src + 1 * A), valueSums, squareSums); + ValueSquareSum<2>(Load(src + 2 * A), valueSums, squareSums); + ValueSquareSum<3>(Load(src + 3 * A), valueSums, squareSums); + } + + template void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum) + { + assert(width < 256 * 256 * F); + if (align) + assert(Aligned(src) && Aligned(stride)); + + size_t alignedWidth = Simd::AlignLo(width, A); + size_t fullAlignedWidth = Simd::AlignLo(width, QA); + __mmask64 tailMask = TailMask64(width - alignedWidth); + size_t blockSize = (256 * 256 * F) / width; + size_t blockCount = height / blockSize + 1; + __m512i valueSums[4] = { _mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512() }; + __m512i fullSquareSum = _mm512_setzero_si512(); + for (size_t block = 0; block < blockCount; ++block) + { + __m512i squareSums[4] = { 
_mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512(), _mm512_setzero_si512() }; + for (size_t row = block * blockSize, endRow = Simd::Min(row + blockSize, height); row < endRow; ++row) + { + size_t col = 0; + for (; col < fullAlignedWidth; col += QA) + ValueSquareSum4(src + col, valueSums, squareSums); + for (; col < alignedWidth; col += A) + ValueSquareSum<0>(Load(src + col), valueSums, squareSums); + if (col < width) + ValueSquareSum<0>(Load(src + col, tailMask), valueSums, squareSums); + src += stride; + } + fullSquareSum = _mm512_add_epi64(fullSquareSum, HorizontalSum32( + _mm512_add_epi32(_mm512_add_epi32(squareSums[0], squareSums[1]), _mm512_add_epi32(squareSums[2], squareSums[3])))); + } + *valueSum = ExtractSum(_mm512_add_epi64(_mm512_add_epi64(valueSums[0], valueSums[1]), _mm512_add_epi64(valueSums[2], valueSums[3]))); + *squareSum = ExtractSum(fullSquareSum); + } + + void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum) + { + if (Aligned(src) && Aligned(stride)) + ValueSquareSum(src, stride, width, height, valueSum, squareSum); + else + ValueSquareSum(src, stride, width, height, valueSum, squareSum); + } + SIMD_INLINE __m512i CorrelationSum(__m512i a, __m512i b) { const __m512i lo = _mm512_madd_epi16(_mm512_unpacklo_epi8(a, _mm512_setzero_si512()), _mm512_unpacklo_epi8(b, _mm512_setzero_si512())); diff --git a/src/3rd/Simd/SimdAvx512f.h b/src/3rd/Simd/SimdAvx512f.h index 96017961..ccc05797 100644 --- a/src/3rd/Simd/SimdAvx512f.h +++ b/src/3rd/Simd/SimdAvx512f.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -31,6 +31,8 @@ namespace Simd #ifdef SIMD_AVX512F_ENABLE namespace Avx512f { + void Gemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc); + void NeuralProductSum(const float * a, const float * b, size_t size, float * sum); void NeuralAddVectorMultipliedByValue(const float * src, size_t size, const float * value, float * dst); @@ -102,6 +104,14 @@ namespace Simd void SquaredDifferenceKahanSum32f(const float * a, const float * b, size_t size, float * sum); void SvmSumLinear(const float * x, const float * svs, const float * weights, size_t length, size_t count, float * sum); + + void SynetAddBias(const float * bias, size_t count, size_t size, float * dst); + + void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst); + + void SynetLrnLayerCrossChannels(const float * src, size_t half, size_t count, size_t size, const float * k, float * dst); + + void SynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst); } #endif// SIMD_AVX512F_ENABLE } diff --git a/src/3rd/Simd/SimdAvx512fGemm32f.cpp b/src/3rd/Simd/SimdAvx512fGemm32f.cpp new file mode 100644 index 00000000..49afdc12 --- /dev/null +++ b/src/3rd/Simd/SimdAvx512fGemm32f.cpp @@ -0,0 +1,1055 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdStore.h" +#include "Simd/SimdGemm.h" + +namespace Simd +{ +#ifdef SIMD_AVX512F_ENABLE + namespace Avx512f + { + SIMD_INLINE void AddProduct(float * ptr, __m512 value, __m512 alpha) + { + _mm512_storeu_ps(ptr, _mm512_fmadd_ps(value, alpha, _mm512_loadu_ps(ptr))); + } + + SIMD_INLINE void AddProduct(float * ptr, __m512 value, __m512 alpha, __mmask16 mask) + { + _mm512_mask_storeu_ps(ptr, mask, _mm512_fmadd_ps(value, alpha, _mm512_maskz_loadu_ps(mask, ptr))); + } + + static void Kernel4x48(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, __mmask16 mask) + { + __m512 c00 = _mm512_setzero_ps(); + __m512 c10 = _mm512_setzero_ps(); + __m512 c20 = _mm512_setzero_ps(); + __m512 c30 = _mm512_setzero_ps(); + __m512 c01 = _mm512_setzero_ps(); + __m512 c11 = _mm512_setzero_ps(); + __m512 c21 = _mm512_setzero_ps(); + __m512 c31 = _mm512_setzero_ps(); + __m512 c02 = _mm512_setzero_ps(); + __m512 c12 = _mm512_setzero_ps(); + __m512 c22 = _mm512_setzero_ps(); + __m512 c32 = _mm512_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + __m512 b0, b1, b2, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm512_loadu_ps(B + 0 * F); + b1 = _mm512_loadu_ps(B + 1 * F); + b2 = _mm512_loadu_ps(B + 2 * F); + a0 = _mm512_set1_ps(*A0++); + c00 = _mm512_fmadd_ps(a0, b0, c00); + c01 = _mm512_fmadd_ps(a0, b1, c01); + c02 = _mm512_fmadd_ps(a0, b2, c02); + a0 = _mm512_set1_ps(*A1++); + c10 = _mm512_fmadd_ps(a0, b0, c10); + c11 = _mm512_fmadd_ps(a0, b1, c11); + c12 = _mm512_fmadd_ps(a0, b2, c12); + a0 = _mm512_set1_ps(*A2++); + c20 = _mm512_fmadd_ps(a0, b0, c20); + c21 = _mm512_fmadd_ps(a0, b1, c21); + c22 = _mm512_fmadd_ps(a0, b2, c22); + a0 = _mm512_set1_ps(*A3++); + c30 = _mm512_fmadd_ps(a0, b0, c30); + c31 = _mm512_fmadd_ps(a0, b1, c31); + c32 = _mm512_fmadd_ps(a0, b2, c32); + B += ldb; + } + __m512 _alpha = 
_mm512_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00); + AddProduct(C + 1 * F, _alpha, c01); + AddProduct(C + 2 * F, _alpha, c02, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10); + AddProduct(C + 1 * F, _alpha, c11); + AddProduct(C + 2 * F, _alpha, c12, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20); + AddProduct(C + 1 * F, _alpha, c21); + AddProduct(C + 2 * F, _alpha, c22, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30); + AddProduct(C + 1 * F, _alpha, c31); + AddProduct(C + 2 * F, _alpha, c32, mask); + } + + static void Kernel4x32(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, __mmask16 mask) + { + __m512 c00 = _mm512_setzero_ps(); + __m512 c10 = _mm512_setzero_ps(); + __m512 c20 = _mm512_setzero_ps(); + __m512 c30 = _mm512_setzero_ps(); + __m512 c01 = _mm512_setzero_ps(); + __m512 c11 = _mm512_setzero_ps(); + __m512 c21 = _mm512_setzero_ps(); + __m512 c31 = _mm512_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + __m512 b0, b1, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm512_loadu_ps(B + 0 * F); + b1 = _mm512_loadu_ps(B + 1 * F); + a0 = _mm512_set1_ps(*A0++); + c00 = _mm512_fmadd_ps(a0, b0, c00); + c01 = _mm512_fmadd_ps(a0, b1, c01); + a0 = _mm512_set1_ps(*A1++); + c10 = _mm512_fmadd_ps(a0, b0, c10); + c11 = _mm512_fmadd_ps(a0, b1, c11); + a0 = _mm512_set1_ps(*A2++); + c20 = _mm512_fmadd_ps(a0, b0, c20); + c21 = _mm512_fmadd_ps(a0, b1, c21); + a0 = _mm512_set1_ps(*A3++); + c30 = _mm512_fmadd_ps(a0, b0, c30); + c31 = _mm512_fmadd_ps(a0, b1, c31); + B += ldb; + } + __m512 _alpha = _mm512_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00); + AddProduct(C + 1 * F, _alpha, c01, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10); + AddProduct(C + 1 * F, _alpha, c11, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20); + AddProduct(C + 1 * F, _alpha, c21, mask); + 
C += ldc; + AddProduct(C + 0 * F, _alpha, c30); + AddProduct(C + 1 * F, _alpha, c31, mask); + } + + static void Kernel4x16(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, __mmask16 mask) + { + __m512 c0 = _mm512_setzero_ps(); + __m512 c1 = _mm512_setzero_ps(); + __m512 c2 = _mm512_setzero_ps(); + __m512 c3 = _mm512_setzero_ps(); + const float * a0 = A + lda * 0; + const float * a1 = A + lda * 1; + const float * a2 = A + lda * 2; + const float * a3 = A + lda * 3; + __m512 b0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm512_loadu_ps(B); + c0 = _mm512_fmadd_ps(b0, _mm512_set1_ps(*a0++), c0); + c1 = _mm512_fmadd_ps(b0, _mm512_set1_ps(*a1++), c1); + c2 = _mm512_fmadd_ps(b0, _mm512_set1_ps(*a2++), c2); + c3 = _mm512_fmadd_ps(b0, _mm512_set1_ps(*a3++), c3); + B += ldb; + } + __m512 _alpha = _mm512_set1_ps(alpha); + AddProduct(C + 0 * ldc, _alpha, c0, mask); + AddProduct(C + 1 * ldc, _alpha, c1, mask); + AddProduct(C + 2 * ldc, _alpha, c2, mask); + AddProduct(C + 3 * ldc, _alpha, c3, mask); + } + + static void Kernel6x32(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, __mmask16 mask) + { + __m512 c00 = _mm512_setzero_ps(); + __m512 c10 = _mm512_setzero_ps(); + __m512 c20 = _mm512_setzero_ps(); + __m512 c30 = _mm512_setzero_ps(); + __m512 c40 = _mm512_setzero_ps(); + __m512 c50 = _mm512_setzero_ps(); + __m512 c01 = _mm512_setzero_ps(); + __m512 c11 = _mm512_setzero_ps(); + __m512 c21 = _mm512_setzero_ps(); + __m512 c31 = _mm512_setzero_ps(); + __m512 c41 = _mm512_setzero_ps(); + __m512 c51 = _mm512_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + const float * A4 = A + lda * 4; + const float * A5 = A + lda * 5; + __m512 b0, b1, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm512_loadu_ps(B + 0 * F); + b1 = _mm512_loadu_ps(B + 1 * F); + a0 = 
_mm512_set1_ps(*A0++); + c00 = _mm512_fmadd_ps(a0, b0, c00); + c01 = _mm512_fmadd_ps(a0, b1, c01); + a0 = _mm512_set1_ps(*A1++); + c10 = _mm512_fmadd_ps(a0, b0, c10); + c11 = _mm512_fmadd_ps(a0, b1, c11); + a0 = _mm512_set1_ps(*A2++); + c20 = _mm512_fmadd_ps(a0, b0, c20); + c21 = _mm512_fmadd_ps(a0, b1, c21); + a0 = _mm512_set1_ps(*A3++); + c30 = _mm512_fmadd_ps(a0, b0, c30); + c31 = _mm512_fmadd_ps(a0, b1, c31); + a0 = _mm512_set1_ps(*A4++); + c40 = _mm512_fmadd_ps(a0, b0, c40); + c41 = _mm512_fmadd_ps(a0, b1, c41); + a0 = _mm512_set1_ps(*A5++); + c50 = _mm512_fmadd_ps(a0, b0, c50); + c51 = _mm512_fmadd_ps(a0, b1, c51); + B += ldb; + } + __m512 _alpha = _mm512_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00); + AddProduct(C + 1 * F, _alpha, c01, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10); + AddProduct(C + 1 * F, _alpha, c11, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20); + AddProduct(C + 1 * F, _alpha, c21, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30); + AddProduct(C + 1 * F, _alpha, c31, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c40); + AddProduct(C + 1 * F, _alpha, c41, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c50); + AddProduct(C + 1 * F, _alpha, c51, mask); + } + + static void Kernel6x16(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, __mmask16 mask) + { + __m512 c00 = _mm512_setzero_ps(); + __m512 c10 = _mm512_setzero_ps(); + __m512 c20 = _mm512_setzero_ps(); + __m512 c30 = _mm512_setzero_ps(); + __m512 c40 = _mm512_setzero_ps(); + __m512 c50 = _mm512_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + const float * A4 = A + lda * 4; + const float * A5 = A + lda * 5; + __m512 b0, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm512_loadu_ps(B + 0 * F); + a0 = _mm512_set1_ps(*A0++); + c00 = _mm512_fmadd_ps(a0, b0, c00); + a0 = _mm512_set1_ps(*A1++); 
+ c10 = _mm512_fmadd_ps(a0, b0, c10); + a0 = _mm512_set1_ps(*A2++); + c20 = _mm512_fmadd_ps(a0, b0, c20); + a0 = _mm512_set1_ps(*A3++); + c30 = _mm512_fmadd_ps(a0, b0, c30); + a0 = _mm512_set1_ps(*A4++); + c40 = _mm512_fmadd_ps(a0, b0, c40); + a0 = _mm512_set1_ps(*A5++); + c50 = _mm512_fmadd_ps(a0, b0, c50); + B += ldb; + } + __m512 _alpha = _mm512_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c40, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c50, mask); + } + + static void Kernel8x48(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, __mmask16 mask) + { + __m512 c00 = _mm512_setzero_ps(); + __m512 c01 = _mm512_setzero_ps(); + __m512 c02 = _mm512_setzero_ps(); + __m512 c10 = _mm512_setzero_ps(); + __m512 c11 = _mm512_setzero_ps(); + __m512 c12 = _mm512_setzero_ps(); + __m512 c20 = _mm512_setzero_ps(); + __m512 c21 = _mm512_setzero_ps(); + __m512 c22 = _mm512_setzero_ps(); + __m512 c30 = _mm512_setzero_ps(); + __m512 c31 = _mm512_setzero_ps(); + __m512 c32 = _mm512_setzero_ps(); + __m512 c40 = _mm512_setzero_ps(); + __m512 c41 = _mm512_setzero_ps(); + __m512 c42 = _mm512_setzero_ps(); + __m512 c50 = _mm512_setzero_ps(); + __m512 c51 = _mm512_setzero_ps(); + __m512 c52 = _mm512_setzero_ps(); + __m512 c60 = _mm512_setzero_ps(); + __m512 c61 = _mm512_setzero_ps(); + __m512 c62 = _mm512_setzero_ps(); + __m512 c70 = _mm512_setzero_ps(); + __m512 c71 = _mm512_setzero_ps(); + __m512 c72 = _mm512_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + const float * A4 = A + lda * 4; + const float * A5 = A + lda * 5; + const float * A6 = A + lda * 6; + const float * A7 = A + lda * 7; + __m512 b0, b1, 
b2, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm512_loadu_ps(B + 0 * F); + b1 = _mm512_loadu_ps(B + 1 * F); + b2 = _mm512_loadu_ps(B + 2 * F); + a0 = _mm512_set1_ps(*A0++); + c00 = _mm512_fmadd_ps(a0, b0, c00); + c01 = _mm512_fmadd_ps(a0, b1, c01); + c02 = _mm512_fmadd_ps(a0, b2, c02); + a0 = _mm512_set1_ps(*A1++); + c10 = _mm512_fmadd_ps(a0, b0, c10); + c11 = _mm512_fmadd_ps(a0, b1, c11); + c12 = _mm512_fmadd_ps(a0, b2, c12); + a0 = _mm512_set1_ps(*A2++); + c20 = _mm512_fmadd_ps(a0, b0, c20); + c21 = _mm512_fmadd_ps(a0, b1, c21); + c22 = _mm512_fmadd_ps(a0, b2, c22); + a0 = _mm512_set1_ps(*A3++); + c30 = _mm512_fmadd_ps(a0, b0, c30); + c31 = _mm512_fmadd_ps(a0, b1, c31); + c32 = _mm512_fmadd_ps(a0, b2, c32); + a0 = _mm512_set1_ps(*A4++); + c40 = _mm512_fmadd_ps(a0, b0, c40); + c41 = _mm512_fmadd_ps(a0, b1, c41); + c42 = _mm512_fmadd_ps(a0, b2, c42); + a0 = _mm512_set1_ps(*A5++); + c50 = _mm512_fmadd_ps(a0, b0, c50); + c51 = _mm512_fmadd_ps(a0, b1, c51); + c52 = _mm512_fmadd_ps(a0, b2, c52); + a0 = _mm512_set1_ps(*A6++); + c60 = _mm512_fmadd_ps(a0, b0, c60); + c61 = _mm512_fmadd_ps(a0, b1, c61); + c62 = _mm512_fmadd_ps(a0, b2, c62); + a0 = _mm512_set1_ps(*A7++); + c70 = _mm512_fmadd_ps(a0, b0, c70); + c71 = _mm512_fmadd_ps(a0, b1, c71); + c72 = _mm512_fmadd_ps(a0, b2, c72); + B += ldb; + } + __m512 _alpha = _mm512_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00); + AddProduct(C + 1 * F, _alpha, c01); + AddProduct(C + 2 * F, _alpha, c02, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10); + AddProduct(C + 1 * F, _alpha, c11); + AddProduct(C + 2 * F, _alpha, c12, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20); + AddProduct(C + 1 * F, _alpha, c21); + AddProduct(C + 2 * F, _alpha, c22, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30); + AddProduct(C + 1 * F, _alpha, c31); + AddProduct(C + 2 * F, _alpha, c32, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c40); + AddProduct(C + 1 * F, _alpha, c41); + AddProduct(C + 2 * F, _alpha, c42, 
mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c50); + AddProduct(C + 1 * F, _alpha, c51); + AddProduct(C + 2 * F, _alpha, c52, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c60); + AddProduct(C + 1 * F, _alpha, c61); + AddProduct(C + 2 * F, _alpha, c62, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c70); + AddProduct(C + 1 * F, _alpha, c71); + AddProduct(C + 2 * F, _alpha, c72, mask); + } + + static void Kernel8x32(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, __mmask16 mask) + { + __m512 c00 = _mm512_setzero_ps(); + __m512 c01 = _mm512_setzero_ps(); + __m512 c10 = _mm512_setzero_ps(); + __m512 c11 = _mm512_setzero_ps(); + __m512 c20 = _mm512_setzero_ps(); + __m512 c21 = _mm512_setzero_ps(); + __m512 c30 = _mm512_setzero_ps(); + __m512 c31 = _mm512_setzero_ps(); + __m512 c40 = _mm512_setzero_ps(); + __m512 c41 = _mm512_setzero_ps(); + __m512 c50 = _mm512_setzero_ps(); + __m512 c51 = _mm512_setzero_ps(); + __m512 c60 = _mm512_setzero_ps(); + __m512 c61 = _mm512_setzero_ps(); + __m512 c70 = _mm512_setzero_ps(); + __m512 c71 = _mm512_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + const float * A4 = A + lda * 4; + const float * A5 = A + lda * 5; + const float * A6 = A + lda * 6; + const float * A7 = A + lda * 7; + __m512 b0, b1, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm512_loadu_ps(B + 0 * F); + b1 = _mm512_loadu_ps(B + 1 * F); + a0 = _mm512_set1_ps(*A0++); + c00 = _mm512_fmadd_ps(a0, b0, c00); + c01 = _mm512_fmadd_ps(a0, b1, c01); + a0 = _mm512_set1_ps(*A1++); + c10 = _mm512_fmadd_ps(a0, b0, c10); + c11 = _mm512_fmadd_ps(a0, b1, c11); + a0 = _mm512_set1_ps(*A2++); + c20 = _mm512_fmadd_ps(a0, b0, c20); + c21 = _mm512_fmadd_ps(a0, b1, c21); + a0 = _mm512_set1_ps(*A3++); + c30 = _mm512_fmadd_ps(a0, b0, c30); + c31 = _mm512_fmadd_ps(a0, b1, c31); + a0 = _mm512_set1_ps(*A4++); + c40 = 
_mm512_fmadd_ps(a0, b0, c40); + c41 = _mm512_fmadd_ps(a0, b1, c41); + a0 = _mm512_set1_ps(*A5++); + c50 = _mm512_fmadd_ps(a0, b0, c50); + c51 = _mm512_fmadd_ps(a0, b1, c51); + a0 = _mm512_set1_ps(*A6++); + c60 = _mm512_fmadd_ps(a0, b0, c60); + c61 = _mm512_fmadd_ps(a0, b1, c61); + a0 = _mm512_set1_ps(*A7++); + c70 = _mm512_fmadd_ps(a0, b0, c70); + c71 = _mm512_fmadd_ps(a0, b1, c71); + B += ldb; + } + __m512 _alpha = _mm512_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00); + AddProduct(C + 1 * F, _alpha, c01, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10); + AddProduct(C + 1 * F, _alpha, c11, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20); + AddProduct(C + 1 * F, _alpha, c21, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30); + AddProduct(C + 1 * F, _alpha, c31, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c40); + AddProduct(C + 1 * F, _alpha, c41, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c50); + AddProduct(C + 1 * F, _alpha, c51, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c60); + AddProduct(C + 1 * F, _alpha, c61, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c70); + AddProduct(C + 1 * F, _alpha, c71, mask); + } + + static void Kernel8x16(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, __mmask16 mask) + { + __m512 c00 = _mm512_setzero_ps(); + __m512 c10 = _mm512_setzero_ps(); + __m512 c20 = _mm512_setzero_ps(); + __m512 c30 = _mm512_setzero_ps(); + __m512 c40 = _mm512_setzero_ps(); + __m512 c50 = _mm512_setzero_ps(); + __m512 c60 = _mm512_setzero_ps(); + __m512 c70 = _mm512_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + const float * A4 = A + lda * 4; + const float * A5 = A + lda * 5; + const float * A6 = A + lda * 6; + const float * A7 = A + lda * 7; + __m512 b0, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm512_loadu_ps(B + 0 * F); + a0 = 
_mm512_set1_ps(*A0++); + c00 = _mm512_fmadd_ps(a0, b0, c00); + a0 = _mm512_set1_ps(*A1++); + c10 = _mm512_fmadd_ps(a0, b0, c10); + a0 = _mm512_set1_ps(*A2++); + c20 = _mm512_fmadd_ps(a0, b0, c20); + a0 = _mm512_set1_ps(*A3++); + c30 = _mm512_fmadd_ps(a0, b0, c30); + a0 = _mm512_set1_ps(*A4++); + c40 = _mm512_fmadd_ps(a0, b0, c40); + a0 = _mm512_set1_ps(*A5++); + c50 = _mm512_fmadd_ps(a0, b0, c50); + a0 = _mm512_set1_ps(*A6++); + c60 = _mm512_fmadd_ps(a0, b0, c60); + a0 = _mm512_set1_ps(*A7++); + c70 = _mm512_fmadd_ps(a0, b0, c70); + B += ldb; + } + __m512 _alpha = _mm512_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c40, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c50, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c60, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c70, mask); + } + + static void Kernel12x32(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, __mmask16 mask) + { + __m512 c00 = _mm512_setzero_ps(); + __m512 c10 = _mm512_setzero_ps(); + __m512 c20 = _mm512_setzero_ps(); + __m512 c30 = _mm512_setzero_ps(); + __m512 c40 = _mm512_setzero_ps(); + __m512 c50 = _mm512_setzero_ps(); + __m512 c01 = _mm512_setzero_ps(); + __m512 c11 = _mm512_setzero_ps(); + __m512 c21 = _mm512_setzero_ps(); + __m512 c31 = _mm512_setzero_ps(); + __m512 c41 = _mm512_setzero_ps(); + __m512 c51 = _mm512_setzero_ps(); + __m512 c60 = _mm512_setzero_ps(); + __m512 c70 = _mm512_setzero_ps(); + __m512 c80 = _mm512_setzero_ps(); + __m512 c90 = _mm512_setzero_ps(); + __m512 cA0 = _mm512_setzero_ps(); + __m512 cB0 = _mm512_setzero_ps(); + __m512 c61 = _mm512_setzero_ps(); + __m512 c71 = _mm512_setzero_ps(); + __m512 c81 = _mm512_setzero_ps(); + __m512 c91 = _mm512_setzero_ps(); + __m512 cA1 = 
_mm512_setzero_ps(); + __m512 cB1 = _mm512_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + const float * A4 = A + lda * 4; + const float * A5 = A + lda * 5; + const float * A6 = A + lda * 6; + const float * A7 = A + lda * 7; + const float * A8 = A + lda * 8; + const float * A9 = A + lda * 9; + const float * AA = A + lda * 10; + const float * AB = A + lda * 11; + __m512 b0, b1, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm512_loadu_ps(B + 0 * F); + b1 = _mm512_loadu_ps(B + 1 * F); + a0 = _mm512_set1_ps(*A0++); + c00 = _mm512_fmadd_ps(a0, b0, c00); + c01 = _mm512_fmadd_ps(a0, b1, c01); + a0 = _mm512_set1_ps(*A1++); + c10 = _mm512_fmadd_ps(a0, b0, c10); + c11 = _mm512_fmadd_ps(a0, b1, c11); + a0 = _mm512_set1_ps(*A2++); + c20 = _mm512_fmadd_ps(a0, b0, c20); + c21 = _mm512_fmadd_ps(a0, b1, c21); + a0 = _mm512_set1_ps(*A3++); + c30 = _mm512_fmadd_ps(a0, b0, c30); + c31 = _mm512_fmadd_ps(a0, b1, c31); + a0 = _mm512_set1_ps(*A4++); + c40 = _mm512_fmadd_ps(a0, b0, c40); + c41 = _mm512_fmadd_ps(a0, b1, c41); + a0 = _mm512_set1_ps(*A5++); + c50 = _mm512_fmadd_ps(a0, b0, c50); + c51 = _mm512_fmadd_ps(a0, b1, c51); + a0 = _mm512_set1_ps(*A6++); + c60 = _mm512_fmadd_ps(a0, b0, c60); + c61 = _mm512_fmadd_ps(a0, b1, c61); + a0 = _mm512_set1_ps(*A7++); + c70 = _mm512_fmadd_ps(a0, b0, c70); + c71 = _mm512_fmadd_ps(a0, b1, c71); + a0 = _mm512_set1_ps(*A8++); + c80 = _mm512_fmadd_ps(a0, b0, c80); + c81 = _mm512_fmadd_ps(a0, b1, c81); + a0 = _mm512_set1_ps(*A9++); + c90 = _mm512_fmadd_ps(a0, b0, c90); + c91 = _mm512_fmadd_ps(a0, b1, c91); + a0 = _mm512_set1_ps(*AA++); + cA0 = _mm512_fmadd_ps(a0, b0, cA0); + cA1 = _mm512_fmadd_ps(a0, b1, cA1); + a0 = _mm512_set1_ps(*AB++); + cB0 = _mm512_fmadd_ps(a0, b0, cB0); + cB1 = _mm512_fmadd_ps(a0, b1, cB1); + B += ldb; + } + __m512 _alpha = _mm512_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00); + AddProduct(C + 1 * F, _alpha, c01, 
mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10); + AddProduct(C + 1 * F, _alpha, c11, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20); + AddProduct(C + 1 * F, _alpha, c21, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30); + AddProduct(C + 1 * F, _alpha, c31, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c40); + AddProduct(C + 1 * F, _alpha, c41, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c50); + AddProduct(C + 1 * F, _alpha, c51, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c60); + AddProduct(C + 1 * F, _alpha, c61, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c70); + AddProduct(C + 1 * F, _alpha, c71, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c80); + AddProduct(C + 1 * F, _alpha, c81, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c90); + AddProduct(C + 1 * F, _alpha, c91, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, cA0); + AddProduct(C + 1 * F, _alpha, cA1, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, cB0); + AddProduct(C + 1 * F, _alpha, cB1, mask); + } + + static void Kernel12x16(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, __mmask16 mask) + { + __m512 c00 = _mm512_setzero_ps(); + __m512 c10 = _mm512_setzero_ps(); + __m512 c20 = _mm512_setzero_ps(); + __m512 c30 = _mm512_setzero_ps(); + __m512 c40 = _mm512_setzero_ps(); + __m512 c50 = _mm512_setzero_ps(); + __m512 c60 = _mm512_setzero_ps(); + __m512 c70 = _mm512_setzero_ps(); + __m512 c80 = _mm512_setzero_ps(); + __m512 c90 = _mm512_setzero_ps(); + __m512 cA0 = _mm512_setzero_ps(); + __m512 cB0 = _mm512_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + const float * A4 = A + lda * 4; + const float * A5 = A + lda * 5; + const float * A6 = A + lda * 6; + const float * A7 = A + lda * 7; + const float * A8 = A + lda * 8; + const float * A9 = A + lda * 9; + const float * AA = A + lda * 
10; + const float * AB = A + lda * 11; + __m512 b0, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm512_loadu_ps(B + 0 * F); + a0 = _mm512_set1_ps(*A0++); + c00 = _mm512_fmadd_ps(a0, b0, c00); + a0 = _mm512_set1_ps(*A1++); + c10 = _mm512_fmadd_ps(a0, b0, c10); + a0 = _mm512_set1_ps(*A2++); + c20 = _mm512_fmadd_ps(a0, b0, c20); + a0 = _mm512_set1_ps(*A3++); + c30 = _mm512_fmadd_ps(a0, b0, c30); + a0 = _mm512_set1_ps(*A4++); + c40 = _mm512_fmadd_ps(a0, b0, c40); + a0 = _mm512_set1_ps(*A5++); + c50 = _mm512_fmadd_ps(a0, b0, c50); + a0 = _mm512_set1_ps(*A6++); + c60 = _mm512_fmadd_ps(a0, b0, c60); + a0 = _mm512_set1_ps(*A7++); + c70 = _mm512_fmadd_ps(a0, b0, c70); + a0 = _mm512_set1_ps(*A8++); + c80 = _mm512_fmadd_ps(a0, b0, c80); + a0 = _mm512_set1_ps(*A9++); + c90 = _mm512_fmadd_ps(a0, b0, c90); + a0 = _mm512_set1_ps(*AA++); + cA0 = _mm512_fmadd_ps(a0, b0, cA0); + a0 = _mm512_set1_ps(*AB++); + cB0 = _mm512_fmadd_ps(a0, b0, cB0); + B += ldb; + } + __m512 _alpha = _mm512_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c40, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c50, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c60, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c70, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c80, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, c90, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, cA0, mask); + C += ldc; + AddProduct(C + 0 * F, _alpha, cB0, mask); + } + + static void KernelMx48(size_t M, size_t N, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, __mmask16 mask) + { +#if SIMD_ZMM_COUNT == 32 + __m512 c[8][3]; + const float * a[8]; +#else + __m512 c[4][3]; + const float * a[4]; +#endif + for (size_t i = 0; i < M; ++i) + { + 
c[i][0] = _mm512_setzero_ps(); + c[i][1] = _mm512_setzero_ps(); + c[i][2] = _mm512_setzero_ps(); + a[i] = A + lda * i; + } + __m512 b0, b1, b2, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm512_loadu_ps(B + 0 * F); + b1 = _mm512_loadu_ps(B + 1 * F); + b2 = _mm512_loadu_ps(B + 2 * F); + for (size_t i = 0; i < M; ++i) + { + a0 = _mm512_set1_ps(*a[i]++); + c[i][0] = _mm512_add_ps(_mm512_mul_ps(b0, a0), c[i][0]); + c[i][1] = _mm512_add_ps(_mm512_mul_ps(b1, a0), c[i][1]); + c[i][2] = _mm512_add_ps(_mm512_mul_ps(b2, a0), c[i][2]); + } + B += ldb; + } + __m512 _alpha = _mm512_set1_ps(alpha); + for (size_t i = 0; i < M; ++i) + { + AddProduct(C + 0 * F, _alpha, c[i][0]); + AddProduct(C + 1 * F, _alpha, c[i][1]); + AddProduct(C + 2 * F, _alpha, c[i][2], mask); + C += ldc; + } + } + + static void KernelMx32(size_t M, size_t N, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, __mmask16 mask) + { +#if SIMD_ZMM_COUNT == 32 + __m512 c[12][2]; + const float * a[12]; +#else + __m512 c[6][2]; + const float * a[6]; +#endif + for (size_t i = 0; i < M; ++i) + { + c[i][0] = _mm512_setzero_ps(); + c[i][1] = _mm512_setzero_ps(); + a[i] = A + lda * i; + } + __m512 b0, b1, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm512_loadu_ps(B + 0 * F); + b1 = _mm512_loadu_ps(B + 1 * F); + for (size_t i = 0; i < M; ++i) + { + a0 = _mm512_set1_ps(*a[i]++); + c[i][0] = _mm512_fmadd_ps(b0, a0, c[i][0]); + c[i][1] = _mm512_fmadd_ps(b1, a0, c[i][1]); + } + B += ldb; + } + __m512 _alpha = _mm512_set1_ps(alpha); + for (size_t i = 0; i < M; ++i) + { + AddProduct(C + 0 * F, _alpha, c[i][0]); + AddProduct(C + 1 * F, _alpha, c[i][1], mask); + C += ldc; + } + } + + static void KernelMx16(size_t M, size_t N, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, __mmask16 mask) + { +#if SIMD_ZMM_COUNT == 32 + __m512 c[12]; + const float * a[12]; +#elif SIMD_ZMM_COUNT == 16 + __m512 c[6]; + const float 
* a[6]; +#else + __m512 c[4]; + const float * a[4]; +#endif + for (size_t i = 0; i < M; ++i) + { + c[i] = _mm512_setzero_ps(); + a[i] = A + lda * i; + } + __m512 b0, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm512_loadu_ps(B + 0 * F); + for (size_t i = 0; i < M; ++i) + { + a0 = _mm512_set1_ps(*a[i]++); + c[i] = _mm512_fmadd_ps(b0, a0, c[i]); + } + B += ldb; + } + __m512 _alpha = _mm512_set1_ps(alpha); + for (size_t i = 0; i < M; ++i) + AddProduct(C + i * ldc, _alpha, c[i], mask); + } + + SIMD_INLINE void ScaleC(float * ptr, __m512 value, __mmask16 mask = -1) + { + _mm512_mask_storeu_ps(ptr, mask, _mm512_mul_ps(_mm512_maskz_loadu_ps(mask, ptr), value)); + } + + static void ScaleC(size_t M, size_t N, float value, float * C, size_t ldc) + { + size_t NQF = AlignLo(N, QF); + size_t NF = AlignLo(N, F); + __m512 _value = _mm512_set1_ps(value); + __mmask16 tail = TailMask16(N - NF); + for (size_t i = 0; i < M; ++i) + { + size_t j = 0; + for (; j < NQF; j += QF) + { + ScaleC(C + j + F * 0, _value); + ScaleC(C + j + F * 1, _value); + ScaleC(C + j + F * 2, _value); + ScaleC(C + j + F * 3, _value); + } + for (; j < NF; j += F) + ScaleC(C + j, _value); + if(j < N) + ScaleC(C + j, _value, tail); + C += ldc; + } + } + + static void PackA(const float * src, size_t stride, size_t M, size_t K, size_t cell, float * dst) + { + size_t K4 = AlignLo(K, 4), K8 = AlignLo(K, 8); + for (size_t i = 0; i < M; i += cell) + { + size_t m = Simd::Min(cell, M - i), k = 0; + if (cell == 4 && m == 4) + { + for (; k < K8; k += 8) + { + const float * ps = src + k; + __m256 s0 = _mm256_loadu_ps(ps + 0 * K); + __m256 s1 = _mm256_loadu_ps(ps + 1 * K); + __m256 s2 = _mm256_loadu_ps(ps + 2 * K); + __m256 s3 = _mm256_loadu_ps(ps + 3 * K); + __m256 s00 = _mm256_unpacklo_ps(s0, s2); + __m256 s01 = _mm256_unpacklo_ps(s1, s3); + __m256 s10 = _mm256_unpackhi_ps(s0, s2); + __m256 s11 = _mm256_unpackhi_ps(s1, s3); + __m256 d0 = _mm256_unpacklo_ps(s00, s01); + __m256 d1 = _mm256_unpackhi_ps(s00, s01); + 
__m256 d2 = _mm256_unpacklo_ps(s10, s11); + __m256 d3 = _mm256_unpackhi_ps(s10, s11); + _mm256_storeu_ps(dst + 0, _mm256_permute2f128_ps(d0, d1, 0x20)); + _mm256_storeu_ps(dst + 8, _mm256_permute2f128_ps(d2, d3, 0x20)); + _mm256_storeu_ps(dst + 16, _mm256_permute2f128_ps(d0, d1, 0x31)); + _mm256_storeu_ps(dst + 24, _mm256_permute2f128_ps(d2, d3, 0x31)); + dst += 32; + }; + for (; k < K4; k += 4) + { + const float * ps = src + k; + __m128 s0 = _mm_loadu_ps(ps + 0 * stride); + __m128 s1 = _mm_loadu_ps(ps + 1 * stride); + __m128 s2 = _mm_loadu_ps(ps + 2 * stride); + __m128 s3 = _mm_loadu_ps(ps + 3 * stride); + __m128 s00 = _mm_unpacklo_ps(s0, s2); + __m128 s01 = _mm_unpacklo_ps(s1, s3); + __m128 s10 = _mm_unpackhi_ps(s0, s2); + __m128 s11 = _mm_unpackhi_ps(s1, s3); + _mm_storeu_ps(dst + 0, _mm_unpacklo_ps(s00, s01)); + _mm_storeu_ps(dst + 4, _mm_unpackhi_ps(s00, s01)); + _mm_storeu_ps(dst + 8, _mm_unpacklo_ps(s10, s11)); + _mm_storeu_ps(dst + 12, _mm_unpackhi_ps(s10, s11)); + dst += 16; + } + } + for (; k < K; ++k) + { + for (size_t c = 0; c < m; ++c) + *(dst++) = src[c*stride + k]; + } + src += cell * stride; + } + } + + static void PackB(const float * B, size_t ldb, size_t K, size_t N, size_t microN, float * pB) + { + for (size_t j = 0; j < N; j += microN) + { + size_t n = Simd::Min(microN, N - j); + if (microN == 1 * F) + { + __mmask16 mask0 = TailMask16(n - 0 * F); + for (size_t k = 0; k < K; ++k) + { + const float * b = B + k * ldb; + _mm512_storeu_ps(pB + 0 * F, _mm512_maskz_loadu_ps(mask0, b + 0 * F)); + pB += microN; + } + } + else if (microN == 2 * F) + { + __mmask16 mask0 = TailMask16(n - 0 * F); + __mmask16 mask1 = TailMask16(n - 1 * F); + for (size_t k = 0; k < K; ++k) + { + const float * b = B + k * ldb; + _mm512_storeu_ps(pB + 0 * F, _mm512_maskz_loadu_ps(mask0, b + 0 * F)); + _mm512_storeu_ps(pB + 1 * F, _mm512_maskz_loadu_ps(mask1, b + 1 * F)); + pB += microN; + } + } + else if (microN == 3 * F) + { + __mmask16 mask0 = TailMask16(n - 0 * F); + 
__mmask16 mask1 = TailMask16(n - 1 * F); + __mmask16 mask2 = TailMask16(n - 2 * F); + for (size_t k = 0; k < K; ++k) + { + const float * b = B + k * ldb; + _mm512_storeu_ps(pB + 0 * F, _mm512_maskz_loadu_ps(mask0, b + 0 * F)); + _mm512_storeu_ps(pB + 1 * F, _mm512_maskz_loadu_ps(mask1, b + 1 * F)); + _mm512_storeu_ps(pB + 2 * F, _mm512_maskz_loadu_ps(mask2, b + 2 * F)); + pB += microN; + } + } + else + { + for (size_t k = 0; k < K; ++k) + { + const float * b = B + k * ldb; + size_t c = 0; + for (; c < n; ++c) + *(pB++) = *(b++); + for (; c < microN; ++c) + *(pB++) = 0; + } + } + B += microN; + } + } + + void Gemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc) + { + const size_t CACHE_L1_SIZE = 32 * 1024; + const size_t CACHE_L2_SIZE = 256 * 1024; + const size_t CACHE_L3_SIZE = 2 * 1024 * 1024; + typedef Simd::GemmNN GemmNN; + GemmNN::Main kernelMM, kernelMT; + GemmNN::Tail kernelTM, kernelTT; + size_t microM, microN; +#if SIMD_ZMM_COUNT == 32 + if (K > 4024 && false) + { + microM = 12; + microN = 32; + size_t tail = N - AlignLoAny(N, microN); + kernelMM = Kernel12x32; + kernelMT = tail > F ? Kernel12x32 : Kernel12x16; + kernelTM = KernelMx32; + kernelTT = tail > F ? KernelMx32 : KernelMx16; + } + else + { + microM = 8; + microN = 48; + size_t tail = N - AlignLoAny(N, microN); + kernelMM = Kernel8x48; + kernelMT = tail > DF ? Kernel8x48 : (tail > F ? Kernel8x32 : Kernel8x16); + kernelTM = KernelMx48; + kernelTT = tail > DF ? KernelMx48 : (tail > F ? KernelMx32 : KernelMx16); + } +#elif SIMD_ZMM_COUNT == 16 + if (K > 4024) + { + microM = 6; + microN = 32; + size_t tail = N - AlignLoAny(N, microN); + kernelMM = Kernel6x32; + kernelMT = tail > F ? Kernel6x32 : Kernel6x16; + kernelTM = KernelMx32; + kernelTT = tail > F ? 
KernelMx32 : KernelMx16; + } + else + { + microM = 4; + microN = 48; + size_t tail = N - AlignLoAny(N, microN); + kernelMM = Kernel4x48; + kernelMT = tail > DF ? Kernel4x48 : (tail > F ? Kernel4x32 : Kernel4x16); + kernelTM = KernelMx48; + kernelTT = tail > DF ? KernelMx48 : (tail > F ? KernelMx32 : KernelMx16); + } +#else + microM = 4; + microN = 16; + kernelMM = Kernel4x16; + kernelMT = Kernel4x16; + kernelTM = KernelMx16; + kernelTT = KernelMx16; +#endif + GemmNN gemmNN(M, N, K, microM, microN, CACHE_L2_SIZE, CACHE_L3_SIZE, CACHE_L3_SIZE, F, + kernelMM, kernelMT, kernelTM, kernelTT, Avx512f::ScaleC, Avx512f::PackB, TailMask16); + gemmNN.Run(alpha, A, lda, B, ldb, beta, C, ldc); + } + } +#endif// SIMD_AVX512F_ENABLE +} diff --git a/src/3rd/Simd/SimdAvx512fNeural.cpp b/src/3rd/Simd/SimdAvx512fNeural.cpp index 670d6a20..8bb8db3e 100644 --- a/src/3rd/Simd/SimdAvx512fNeural.cpp +++ b/src/3rd/Simd/SimdAvx512fNeural.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -27,6 +27,7 @@ #include "Simd/SimdStream.h" #include "Simd/SimdNeural.h" #include "Simd/SimdAvx2.h" +#include "Simd/SimdPow.h" namespace Simd { @@ -621,90 +622,35 @@ namespace Simd NeuralDerivativeRelu(src, size, slope, dst); } - class PowEstimator + template void NeuralPow(const float * src, size_t size, const float * exponent, float * dst) { - __m512i _exponent, _mantissa, _127; - __m512 _1_0, _0_5; - - void Init() - { - _exponent = _mm512_set1_epi32(0x7F800000); - _mantissa = _mm512_set1_epi32(0x007FFFFF); - _127 = _mm512_set1_epi32(127); - _1_0 = _mm512_set1_ps(1.0f); - _0_5 = _mm512_set1_ps(0.5f); - } - - SIMD_INLINE __m512 Poly5(__m512 x, float a, float b, float c, float d, float e, float f) - { - __m512 p = _mm512_set1_ps(f); - p = _mm512_fmadd_ps(x, p, _mm512_set1_ps(e)); - p = _mm512_fmadd_ps(x, p, _mm512_set1_ps(d)); - p = _mm512_fmadd_ps(x, p, _mm512_set1_ps(c)); - p = _mm512_fmadd_ps(x, p, _mm512_set1_ps(b)); - p = _mm512_fmadd_ps(x, p, _mm512_set1_ps(a)); - return p; - } - - SIMD_INLINE __m512 Exp2(__m512 x) - { - x = _mm512_max_ps(_mm512_min_ps(x, _mm512_set1_ps(129.00000f)), _mm512_set1_ps(-126.99999f)); - __m512i ipart = _mm512_cvtps_epi32(_mm512_sub_ps(x, _0_5)); - __m512 fpart = _mm512_sub_ps(x, _mm512_cvtepi32_ps(ipart)); - __m512 expipart = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_add_epi32(ipart, _mm512_set1_epi32(127)), 23)); - __m512 expfpart = Poly5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f); - return _mm512_mul_ps(expipart, expfpart); - } - - SIMD_INLINE __m512 Log2(__m512 x) - { - __m512i i = _mm512_castps_si512(x); - __m512 e = _mm512_cvtepi32_ps(_mm512_sub_epi32(_mm512_srli_epi32(_mm512_and_si512(i, _exponent), 23), _127)); - __m512 m = _mm512_or_ps(_mm512_castsi512_ps(_mm512_and_si512(i, _mantissa)), _1_0); - __m512 p 
= Poly5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); - return _mm512_fmadd_ps(p, _mm512_sub_ps(m, _1_0), e); - } - - SIMD_INLINE __m512 Pow(__m512 basis, __m512 exponent) - { - return Exp2(_mm512_mul_ps(Log2(basis), exponent)); - } - - template void Run(const float * src, size_t size, const float * exponent, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - float e = exponent[0]; - size_t alignedSize = AlignLo(size, F); - __m512 _e = _mm512_set1_ps(e); - size_t i = 0; - for (; i < alignedSize; i += F) - Store(dst + i, Pow(Load(src + i), _e)); - for (; i < size; ++i) - dst[i] = Base::Pow(src[i], e); - } + if (align) + assert(Aligned(src) && Aligned(dst)); - public: - void Run(const float * src, size_t size, const float * exponent, float * dst) + float e = exponent[0]; + size_t aligned = AlignLo(size, F); + __m512 _e = _mm512_set1_ps(e); + Pow pow; + size_t i = 0; + for (; i < aligned; i += F) + Avx512f::Store(dst + i, pow(Avx512f::Load(src + i), _e)); + if (i < size) { - Init(); - - if (Aligned(src) && Aligned(dst)) - Run(src, size, exponent, dst); - else - Run(src, size, exponent, dst); + __mmask16 tail = TailMask16(size - i); + Avx512f::Store(dst + i, pow(Avx512f::Load(src + i, tail), _e), tail); } - }; + } void NeuralPow(const float * src, size_t size, const float * exponent, float * dst) { #if defined(_MSC_VER) && _MSC_VER <= 1912 Avx2::NeuralPow(src, size, exponent, dst); -#else - PowEstimator estimator; - estimator.Run(src, size, exponent, dst); -#endif +#else + if (Aligned(src) && Aligned(dst)) + NeuralPow(src, size, exponent, dst); + else + NeuralPow(src, size, exponent, dst); +#endif } template SIMD_INLINE void NeuralUpdateWeights(const float * x, const __m512 & a, const __m512 & b, float * d, float * w, __mmask16 m) @@ -1996,37 +1942,48 @@ namespace Simd _mm_storeu_ps(dst, _mm_add_ps(_mm_loadu_ps(dst), sum128)); } - template static SIMD_INLINE void Kernel4x4x16(const __m512 * a, size_t K, 
const float * b, __m512 * sums) + template static SIMD_INLINE void Kernel6x4x16(const __m512 * a, size_t K, const float * b, __m512 * sums) { - __m512 b0 = Load(b + 0 * K); - sums[0x0] = _mm512_fmadd_ps(a[0], b0, sums[0x0]); - sums[0x4] = _mm512_fmadd_ps(a[1], b0, sums[0x4]); - sums[0x8] = _mm512_fmadd_ps(a[2], b0, sums[0x8]); - sums[0xC] = _mm512_fmadd_ps(a[3], b0, sums[0xC]); - __m512 b1 = Load(b + 1 * K); - sums[0x1] = _mm512_fmadd_ps(a[0], b1, sums[0x1]); - sums[0x5] = _mm512_fmadd_ps(a[1], b1, sums[0x5]); - sums[0x9] = _mm512_fmadd_ps(a[2], b1, sums[0x9]); - sums[0xD] = _mm512_fmadd_ps(a[3], b1, sums[0xD]); - __m512 b2 = Load(b + 2 * K); - sums[0x2] = _mm512_fmadd_ps(a[0], b2, sums[0x2]); - sums[0x6] = _mm512_fmadd_ps(a[1], b2, sums[0x6]); - sums[0xA] = _mm512_fmadd_ps(a[2], b2, sums[0xA]); - sums[0xE] = _mm512_fmadd_ps(a[3], b2, sums[0xE]); - __m512 b3 = Load(b + 3 * K); - sums[0x3] = _mm512_fmadd_ps(a[0], b3, sums[0x3]); - sums[0x7] = _mm512_fmadd_ps(a[1], b3, sums[0x7]); - sums[0xB] = _mm512_fmadd_ps(a[2], b3, sums[0xB]); - sums[0xF] = _mm512_fmadd_ps(a[3], b3, sums[0xF]); + __m512 _b; + _b = Load(b + 0 * K); + sums[0x00] = _mm512_fmadd_ps(a[0], _b, sums[0x00]); + sums[0x04] = _mm512_fmadd_ps(a[1], _b, sums[0x04]); + sums[0x08] = _mm512_fmadd_ps(a[2], _b, sums[0x08]); + sums[0x0C] = _mm512_fmadd_ps(a[3], _b, sums[0x0C]); + sums[0x10] = _mm512_fmadd_ps(a[4], _b, sums[0x10]); + sums[0x14] = _mm512_fmadd_ps(a[5], _b, sums[0x14]); + _b = Load(b + 1 * K); + sums[0x01] = _mm512_fmadd_ps(a[0], _b, sums[0x01]); + sums[0x05] = _mm512_fmadd_ps(a[1], _b, sums[0x05]); + sums[0x09] = _mm512_fmadd_ps(a[2], _b, sums[0x09]); + sums[0x0D] = _mm512_fmadd_ps(a[3], _b, sums[0x0D]); + sums[0x11] = _mm512_fmadd_ps(a[4], _b, sums[0x11]); + sums[0x15] = _mm512_fmadd_ps(a[5], _b, sums[0x15]); + _b = Load(b + 2 * K); + sums[0x02] = _mm512_fmadd_ps(a[0], _b, sums[0x02]); + sums[0x06] = _mm512_fmadd_ps(a[1], _b, sums[0x06]); + sums[0x0A] = _mm512_fmadd_ps(a[2], _b, sums[0x0A]); + 
sums[0x0E] = _mm512_fmadd_ps(a[3], _b, sums[0x0E]); + sums[0x12] = _mm512_fmadd_ps(a[4], _b, sums[0x12]); + sums[0x16] = _mm512_fmadd_ps(a[5], _b, sums[0x16]); + _b = Load(b + 3 * K); + sums[0x03] = _mm512_fmadd_ps(a[0], _b, sums[0x03]); + sums[0x07] = _mm512_fmadd_ps(a[1], _b, sums[0x07]); + sums[0x0B] = _mm512_fmadd_ps(a[2], _b, sums[0x0B]); + sums[0x0F] = _mm512_fmadd_ps(a[3], _b, sums[0x0F]); + sums[0x13] = _mm512_fmadd_ps(a[4], _b, sums[0x13]); + sums[0x17] = _mm512_fmadd_ps(a[5], _b, sums[0x17]); } - template static SIMD_INLINE void Kernel4x1x16(const __m512 * a, const float * b, __m512 * sums) + template static SIMD_INLINE void Kernel6x1x16(const __m512 * a, const float * b, __m512 * sums) { __m512 b0 = Load(b); sums[0] = _mm512_fmadd_ps(a[0], b0, sums[0]); sums[1] = _mm512_fmadd_ps(a[1], b0, sums[1]); sums[2] = _mm512_fmadd_ps(a[2], b0, sums[2]); sums[3] = _mm512_fmadd_ps(a[3], b0, sums[3]); + sums[4] = _mm512_fmadd_ps(a[4], b0, sums[4]); + sums[5] = _mm512_fmadd_ps(a[5], b0, sums[5]); } template static SIMD_INLINE void Kernel3x4x16(const __m512 * a, size_t K, const float * b, __m512 * sums) @@ -2058,12 +2015,14 @@ namespace Simd sums[0x2] = _mm512_fmadd_ps(a[2], _b, sums[0x2]); } - template static SIMD_INLINE void Load4(const float * p, __m512 * a, size_t step, __mmask16 tail = -1) + template static SIMD_INLINE void Load6(const float * p, __m512 * a, size_t step, __mmask16 tail = -1) { a[0] = Load(p + 0 * step, tail); a[1] = Load(p + 1 * step, tail); a[2] = Load(p + 2 * step, tail); a[3] = Load(p + 3 * step, tail); + a[4] = Load(p + 4 * step, tail); + a[5] = Load(p + 5 * step, tail); } template static SIMD_INLINE void Load3(const float * p, __m512 * a, size_t step, __mmask16 tail = -1) @@ -2076,22 +2035,24 @@ namespace Simd template void Execute(size_t M, size_t N, size_t K, const float * a, const float * b, float * c) { size_t M3 = M / 3 * 3; - size_t M4 = Simd::AlignLo(M, 4); + size_t M6 = M / 6 * 6; size_t N4 = Simd::AlignLo(N, 4); size_t K16 = 
Simd::AlignLo(K, 16); __mmask16 tailMask = TailMask16(K - K16); size_t i = 0; #if SIMD_ZMM_COUNT == 32 - for (; i < M4; i += 4) + for (; i < M6; i += 6) { const float * pa = a + i*K; float * pc = c + i*N; size_t j = 0; - register __m512 _a[4]; + __m512 _a[6]; for (; j < N4; j += 4) { const float * pb = b + j*K; - register __m512 sums[16] = { + __m512 sums[24] = { + _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), + _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), @@ -2099,38 +2060,44 @@ namespace Simd size_t k = 0; for (; k < K16; k += 16) { - Load4(pa + k, _a, K); - Kernel4x4x16(_a, K, pb + k, sums); + Load6(pa + k, _a, K); + Kernel6x4x16(_a, K, pb + k, sums); } if (k < K) { - Load4(pa + k, _a, K, tailMask); - Kernel4x4x16(_a, K, pb + k, sums); + Load6(pa + k, _a, K, tailMask); + Kernel6x4x16(_a, K, pb + k, sums); } - Add4ExtractedSums(sums + 0x0, pc + 0 * N + j); - Add4ExtractedSums(sums + 0x4, pc + 1 * N + j); - Add4ExtractedSums(sums + 0x8, pc + 2 * N + j); - Add4ExtractedSums(sums + 0xC, pc + 3 * N + j); + Add4ExtractedSums(sums + 0x00, pc + 0 * N + j); + Add4ExtractedSums(sums + 0x04, pc + 1 * N + j); + Add4ExtractedSums(sums + 0x08, pc + 2 * N + j); + Add4ExtractedSums(sums + 0x0C, pc + 3 * N + j); + Add4ExtractedSums(sums + 0x10, pc + 4 * N + j); + Add4ExtractedSums(sums + 0x14, pc + 5 * N + j); } for (; j < N; ++j) { const float * pb = b + j*K; - register __m512 sums[4] = { _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() }; + __m512 sums[6] = { + _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), + _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() }; size_t k = 0; for (; k < K16; k 
+= 16) { - Load4(pa + k, _a, K); - Kernel4x1x16(_a, pb + k, sums); + Load6(pa + k, _a, K); + Kernel6x1x16(_a, pb + k, sums); } if (k < K) { - Load4(pa + k, _a, K, tailMask); - Kernel4x1x16(_a, pb + k, sums); + Load6(pa + k, _a, K, tailMask); + Kernel6x1x16(_a, pb + k, sums); } pc[0 * N + j] += ExtractSum(sums[0]); pc[1 * N + j] += ExtractSum(sums[1]); pc[2 * N + j] += ExtractSum(sums[2]); pc[3 * N + j] += ExtractSum(sums[3]); + pc[4 * N + j] += ExtractSum(sums[4]); + pc[5 * N + j] += ExtractSum(sums[5]); } } #endif @@ -2139,11 +2106,11 @@ namespace Simd const float * pa = a + i*K; float * pc = c + i*N; size_t j = 0; - register __m512 _a[3]; + __m512 _a[3]; for (; j < N4; j += 4) { const float * pb = b + j*K; - register __m512 sums[12] = { + __m512 sums[12] = { _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() }; @@ -2165,7 +2132,7 @@ namespace Simd for (; j < N; ++j) { const float * pb = b + j*K; - register __m512 sums[3] = { _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() }; + __m512 sums[3] = { _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() }; size_t k = 0; for (; k < K16; k += 16) { @@ -2190,16 +2157,16 @@ namespace Simd for (; j < N4; j += 4) { const float * pb = b + j*K; - register __m512 sums[4] = { _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() }; + __m512 sums[4] = { _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() }; size_t k = 0; for (; k < K16; k += 16) { - register __m512 _a = Load(pa + k); + __m512 _a = Load(pa + k); Kernel1x4x16(_a, K, pb + k, sums); } if (k < K) { - register __m512 _a = Load(pa + k, tailMask); + __m512 _a = Load(pa + k, tailMask); Kernel1x4x16(_a, K, pb + k, sums); } Add4ExtractedSums(sums + 0, pc + j); @@ -2207,16 +2174,16 @@ namespace 
Simd for (; j < N; ++j) { const float * pb = b + j*K; - register __m512 sum = _mm512_setzero_ps(); + __m512 sum = _mm512_setzero_ps(); size_t k = 0; for (; k < K16; k += 16) { - register __m512 _a = Load(pa + k); + __m512 _a = Load(pa + k); Kernel1x1x16(_a, pb + k, sum); } if (k < K) { - register __m512 _a = Load(pa + k, tailMask); + __m512 _a = Load(pa + k, tailMask); Kernel1x1x16(_a, pb + k, sum); } pc[j] += ExtractSum(sum); @@ -2380,7 +2347,7 @@ namespace Simd } src = tmp; } - if (cell == 32) + if (cell == 48) { for (size_t j = 0; j < N; j += cell) { @@ -2389,17 +2356,18 @@ namespace Simd { for (size_t k = 0; k < K; ++k) { - const float * psrc = src + k*N; - Store(dst + 0, Load(psrc + 0)); - Store(dst + F, Load(psrc + F)); - dst += 32; + const float * psrc = src + k * N; + Store(dst + 0 * F, Load(psrc + 0 * F)); + Store(dst + 1 * F, Load(psrc + 1 * F)); + Store(dst + 2 * F, Load(psrc + 2 * F)); + dst += 48; } } else { for (size_t k = 0; k < K; ++k) { - const float * psrc = src + k*N; + const float * psrc = src + k * N; size_t c = 0; for (; c < n; ++c) *(dst++) = *(psrc++); @@ -2535,104 +2503,140 @@ namespace Simd } } - template SIMD_INLINE void AddSums32(const __m512 * sums, size_t size, float * dst, size_t stride, const __mmask16 * tails) + template SIMD_INLINE void KernelMx48(size_t N, size_t K, const float * a, const float * b, float * c, size_t m, const __mmask16 * tails) { - for (size_t i = 0; i < size; ++i, dst += stride) - { - AddSum(sums[i + 0], dst + 00, tails[0]); - AddSum(sums[i + 4], dst + 16, tails[1]); - } - } - - template SIMD_INLINE void KernelMx32(size_t N, size_t K, const float * a, const float * b, float * c, size_t m, const __mmask16 * tails) - { - __m512 sums[8] = { _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), + __m512 sums[12] = { + _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), + _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), 
_mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps(), _mm512_setzero_ps() }; for (size_t k = 0; k < K; ++k) { __m512 b0 = Load(b + 00); __m512 b1 = Load(b + 16); + __m512 b2 = Load(b + 32); for (size_t s = 0; s < m; ++s) { __m512 a0 = _mm512_set1_ps(a[s]); sums[s + 0] = _mm512_fmadd_ps(b0, a0, sums[s + 0]); sums[s + 4] = _mm512_fmadd_ps(b1, a0, sums[s + 4]); + sums[s + 8] = _mm512_fmadd_ps(b2, a0, sums[s + 8]); } - b += 32; + b += 48; a += m; } - AddSums32(sums, m, c, N, tails); + for (size_t i = 0; i < m; ++i, c += N) + { + AddSum(sums[i + 0], c + 00, tails[0]); + AddSum(sums[i + 4], c + 16, tails[1]); + AddSum(sums[i + 8], c + 32, tails[2]); + } } - void Kernel4x32(size_t N, size_t K, const float * a, const float * b, float * c) + void Kernel4x48(size_t N, size_t K, const float * a, const float * b, float * c) { - register __m512 _a, b0, b1, c00, c01, c10, c11, c20, c21, c30, c31; + __m512 _a, b0, b1, b2, c00, c01, c02, c10, c11, c12, c20, c21, c22, c30, c31, c32; c00 = _mm512_setzero_ps(); c01 = _mm512_setzero_ps(); + c02 = _mm512_setzero_ps(); c10 = _mm512_setzero_ps(); c11 = _mm512_setzero_ps(); + c12 = _mm512_setzero_ps(); c20 = _mm512_setzero_ps(); c21 = _mm512_setzero_ps(); + c22 = _mm512_setzero_ps(); c30 = _mm512_setzero_ps(); c31 = _mm512_setzero_ps(); + c32 = _mm512_setzero_ps(); for (size_t k = 0; k < K; ++k) { b0 = _mm512_loadu_ps(b + 0 * F); b1 = _mm512_loadu_ps(b + 1 * F); + b2 = _mm512_loadu_ps(b + 2 * F); _a = _mm512_set1_ps(a[0]); c00 = _mm512_fmadd_ps(b0, _a, c00); c01 = _mm512_fmadd_ps(b1, _a, c01); + c02 = _mm512_fmadd_ps(b2, _a, c02); _a = _mm512_set1_ps(a[1]); c10 = _mm512_fmadd_ps(b0, _a, c10); c11 = _mm512_fmadd_ps(b1, _a, c11); + c12 = _mm512_fmadd_ps(b2, _a, c12); _a = _mm512_set1_ps(a[2]); c20 = _mm512_fmadd_ps(b0, _a, c20); c21 = _mm512_fmadd_ps(b1, _a, c21); + c22 = _mm512_fmadd_ps(b2, _a, c22); _a = _mm512_set1_ps(a[3]); c30 = _mm512_fmadd_ps(b0, _a, c30); c31 = _mm512_fmadd_ps(b1, _a, c31); - b += 32; + c32 = 
_mm512_fmadd_ps(b2, _a, c32); + b += 48; a += 4; } AddSum(c00, c + 0 * F); AddSum(c01, c + 1 * F); + AddSum(c02, c + 2 * F); c += N; AddSum(c10, c + 0 * F); AddSum(c11, c + 1 * F); + AddSum(c12, c + 2 * F); c += N; AddSum(c20, c + 0 * F); AddSum(c21, c + 1 * F); + AddSum(c22, c + 2 * F); c += N; AddSum(c30, c + 0 * F); AddSum(c31, c + 1 * F); + AddSum(c32, c + 2 * F); } - template void Execute4x32(size_t M, size_t N, size_t K, const float * a, const float * b, float * c) + template void Execute4x48(size_t M, size_t N, size_t K, const float * a, const float * b, float * c) { size_t M4 = Simd::AlignLo(M, 4); - size_t N32 = Simd::AlignLo(N, 32); - __mmask16 tailMasks[2]; - for (size_t i = 0; i < 2; ++i) - tailMasks[i] = TailMask16(N - N32 - F*i); - size_t i = 0; - for (; i < M4; i += 4) + size_t N48 = N/48*48; + __mmask16 tailMasks[3]; + for (size_t i = 0; i < 3; ++i) + tailMasks[i] = TailMask16(N - N48 - F*i); + if (M > N) { - size_t j = 0; - for (; j < N32; j += 32) - Kernel4x32(N, K, a + i * K, b + j * K, c + i * N + j); - if (j < N) - KernelMx32(N, K, a + i*K, b + j*K, c + i*N + j, 4, tailMasks); + size_t i = 0; + for (; i < M4; i += 4) + { + size_t j = 0; + for (; j < N48; j += 48) + Kernel4x48(N, K, a + i * K, b + j * K, c + i * N + j); + if (j < N) + KernelMx48(N, K, a + i * K, b + j * K, c + i * N + j, 4, tailMasks); + } + if (i < M) + { + size_t j = 0; + for (; j < N48; j += 48) + KernelMx48(N, K, a + i * K, b + j * K, c + i * N + j, M - M4, tailMasks); + if (j < N) + KernelMx48(N, K, a + i * K, b + j * K, c + i * N + j, M - M4, tailMasks); + } } - if (i < M) + else { size_t j = 0; - for (; j < N32; j += 32) - KernelMx32(N, K, a + i*K, b + j*K, c + i*N + j, M - M4, tailMasks); - if (j < N) - KernelMx32(N, K, a + i*K, b + j*K, c + i*N + j, M - M4, tailMasks); + for (; j < N48; j += 48) + { + size_t i = 0; + for (; i < M4; i += 4) + Kernel4x48(N, K, a + i * K, b + j * K, c + i * N + j); + if (M4 < M) + KernelMx48(N, K, a + i * K, b + j * K, c + i * N + j, M - 
M4, tailMasks); + } + if (N48 < N) + { + size_t i = 0; + for (; i < M4; i += 4) + KernelMx48(N, K, a + i * K, b + j * K, c + i * N + j, 4, tailMasks); + if (M4 < M) + KernelMx48(N, K, a + i * K, b + j * K, c + i * N + j, M - M4, tailMasks); + } } } @@ -2642,8 +2646,8 @@ namespace Simd { if (cellB == 16) Execute4x16(M, N, K, a, b, c); - if (cellB == 32) - Execute4x32(M, N, K, a, b, c); + if (cellB == 48) + Execute4x48(M, N, K, a, b, c); } } } @@ -2812,11 +2816,64 @@ namespace Simd } } + void AddConvolution1x1x16(const float * src, size_t srcDepth, const float * weight, float * dst, size_t dstDepth) + { + size_t dstDepth4 = dstDepth / 4 * 4; + size_t dstChannel = 0; + for (; dstChannel < dstDepth4; dstChannel += 4) + { + __m512 dst00 = _mm512_loadu_ps(dst + 0 * F); + __m512 dst10 = _mm512_loadu_ps(dst + 1 * F); + __m512 dst20 = _mm512_loadu_ps(dst + 2 * F); + __m512 dst30 = _mm512_loadu_ps(dst + 3 * F); + const float * psrc = src; + const float * pw0 = weight; + const float * pw1 = pw0 + srcDepth; + const float * pw2 = pw1 + srcDepth; + const float * pw3 = pw2 + srcDepth; + for (size_t srcChannel = 0; srcChannel < srcDepth; ++srcChannel) + { + __m512 _weight; + __m512 src0 = _mm512_loadu_ps(psrc + 0 * F); + _weight = _mm512_set1_ps(pw0[srcChannel]); + dst00 = _mm512_fmadd_ps(_weight, src0, dst00); + _weight = _mm512_set1_ps(pw1[srcChannel]); + dst10 = _mm512_fmadd_ps(_weight, src0, dst10); + _weight = _mm512_set1_ps(pw2[srcChannel]); + dst20 = _mm512_fmadd_ps(_weight, src0, dst20); + _weight = _mm512_set1_ps(pw3[srcChannel]); + dst30 = _mm512_fmadd_ps(_weight, src0, dst30); + psrc += 16; + } + _mm512_storeu_ps(dst + 0 * F, dst00); + _mm512_storeu_ps(dst + 1 * F, dst10); + _mm512_storeu_ps(dst + 2 * F, dst20); + _mm512_storeu_ps(dst + 3 * F, dst30); + dst += 16 * 4; + weight += srcDepth * 4; + } + for (; dstChannel < dstDepth; ++dstChannel) + { + __m512 dst0 = _mm512_loadu_ps(dst + 0 * F); + const float * psrc = src; + for (size_t srcChannel = 0; srcChannel < 
srcDepth; ++srcChannel) + { + __m512 weight0 = _mm512_set1_ps(*weight++); + dst0 = _mm512_fmadd_ps(weight0, _mm512_loadu_ps(psrc + 0 * F), dst0); + psrc += 16; + } + _mm512_storeu_ps(dst + 0 * F, dst0); + dst += 16; + } + } + void Execute(const float * src, size_t srcWidth, size_t srcHeight, size_t srcDepth, const float * weight, size_t kernelX, size_t kernelY, float * dst, size_t dstWidth, size_t dstHeight, size_t dstDepth) { assert(kernelX == kernelY); - if (kernelX == 2) + if (kernelX == 1 && dstWidth*dstHeight == 16) + AddConvolution1x1x16(src, srcDepth, weight, dst, dstDepth); + else if (kernelX == 2) AddConvolution(src, srcWidth, srcHeight, srcDepth, weight, dst, dstWidth, dstHeight, dstDepth); else if (kernelX == 3) AddConvolution(src, srcWidth, srcHeight, srcDepth, weight, dst, dstWidth, dstHeight, dstDepth); @@ -2830,9 +2887,11 @@ namespace Simd bool Preferable(size_t srcDepth, size_t kernelX, size_t kernelY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, size_t dstWidth, size_t dstHeight, size_t dstDepth) { - if (kernelX == kernelY && kernelX >= 2 && kernelX <= 5 && strideX*strideY*dilationX*dilationY == 1) + if (kernelX == kernelY && strideX*strideY*dilationX*dilationY == 1) { - if (dstWidth*dstHeight*kernelX*kernelY >= 8 * 8 * 3 * 3) + if (kernelX >= 2 && kernelX <= 5 && dstWidth*dstHeight*kernelX*kernelY >= 8 * 8 * 3 * 3) + return true; + if (kernelX == 1 && (dstWidth*dstHeight == 16))// || dstWidth * dstHeight == 64)) return true; } return false; @@ -2888,9 +2947,9 @@ namespace Simd break; case Ver1: cellA = 4; - cellB = 32; + cellB = 48; sizeA = M*K; - strideB = Simd::AlignHi(N, cellB); + strideB = (N + cellB - 1)/cellB*cellB; sizeB = strideB*K; if (kernelX*kernelY > 1) sizeT = sizeB; diff --git a/src/3rd/Simd/SimdAvx512fResizer.cpp b/src/3rd/Simd/SimdAvx512fResizer.cpp new file mode 100644 index 00000000..b4a30135 --- /dev/null +++ b/src/3rd/Simd/SimdAvx512fResizer.cpp @@ -0,0 +1,156 @@ +/* +* Simd Library 
(http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdResizer.h" +#include "Simd/SimdStore.h" + +namespace Simd +{ +#ifdef SIMD_AVX512F_ENABLE + namespace Avx512f + { + const __m512i K64_PERMUTE_FOR_PACK = SIMD_MM512_SETR_EPI64(0, 2, 4, 6, 1, 3, 5, 7); + + ResizerFloatBilinear::ResizerFloatBilinear(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, bool caffeInterp) + : Base::ResizerFloatBilinear(srcX, srcY, dstX, dstY, channels, sizeof(__m512), caffeInterp) + { + } + + void ResizerFloatBilinear::Run(const float * src, size_t srcStride, float * dst, size_t dstStride) const + { + Array32f bx[2]; + bx[0].Resize(_rs); + bx[1].Resize(_rs); + float * pbx[2] = { bx[0].data, bx[1].data }; + int32_t prev = -2; + size_t rsa = AlignLo(_rs, Avx512f::F); + __mmask16 tail = TailMask16(_rs - rsa); + for (size_t dy = 0; dy < _dy; dy++, dst += dstStride) + { + float fy1 = _ay[dy]; + float fy0 = 1.0f - fy1; + int32_t sy = _iy[dy]; + int32_t k = 0; + + if (sy == prev) + k = 2; + else if (sy == prev + 1) + { + Swap(pbx[0], pbx[1]); + k = 1; + } + + prev = sy; + + for (; k < 2; k++) + { + float * pb = pbx[k]; + const float * ps = src + (sy + k)*srcStride; + size_t dx = 0; + if (_cn == 1) + { + __m512 _1 = _mm512_set1_ps(1.0f); + for (; dx < rsa; dx += Avx512f::F) + { + __m512i idx = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_PACK, _mm512_load_si512(_ix.data + dx)); + __m512 sp0 = _mm512_castpd_ps(_mm512_i32gather_pd(_mm512_extracti64x4_epi64(idx, 0), (double*)ps, 4)); + __m512 sp1 = _mm512_castpd_ps(_mm512_i32gather_pd(_mm512_extracti64x4_epi64(idx, 1), (double*)ps, 4)); + __m512 fx1 = _mm512_load_ps(_ax.data + dx); + __m512 fx0 = _mm512_sub_ps(_1, fx1); + __m512 s0 = _mm512_shuffle_ps(sp0, sp1, 0x88); + __m512 s1 = _mm512_shuffle_ps(sp0, sp1, 0xDD); + _mm512_store_ps(pb + dx, _mm512_fmadd_ps(s0, fx0, _mm512_mul_ps(s1, fx1))); + } + if (dx < _rs) + { + __m512i idx = _mm512_permutexvar_epi64(K64_PERMUTE_FOR_PACK, _mm512_maskz_loadu_epi32(tail, _ix.data + dx)); + __m512 
sp0 = _mm512_castpd_ps(_mm512_i32gather_pd(_mm512_extracti64x4_epi64(idx, 0), (double*)ps, 4)); + __m512 sp1 = _mm512_castpd_ps(_mm512_i32gather_pd(_mm512_extracti64x4_epi64(idx, 1), (double*)ps, 4)); + __m512 fx1 = _mm512_maskz_loadu_ps(tail, _ax.data + dx); + __m512 fx0 = _mm512_sub_ps(_1, fx1); + __m512 s0 = _mm512_shuffle_ps(sp0, sp1, 0x88); + __m512 s1 = _mm512_shuffle_ps(sp0, sp1, 0xDD); + _mm512_mask_store_ps(pb + dx, tail, _mm512_fmadd_ps(s0, fx0, _mm512_mul_ps(s1, fx1))); + } + } + else + { + __m512 _1 = _mm512_set1_ps(1.0f); + __m512i cn = _mm512_set1_epi32((int)_cn); + for (; dx < rsa; dx += Avx512f::F) + { + __m512i i0 = _mm512_load_si512(_ix.data + dx); + __m512i i1 = _mm512_add_epi32(i0, cn); + __m512 s0 = _mm512_i32gather_ps(i0, ps, 4); + __m512 s1 = _mm512_i32gather_ps(i1, ps, 4); + __m512 fx1 = _mm512_load_ps(_ax.data + dx); + __m512 fx0 = _mm512_sub_ps(_1, fx1); + _mm512_store_ps(pb + dx, _mm512_fmadd_ps(s0, fx0, _mm512_mul_ps(s1, fx1))); + } + if (dx < _rs) + { + __m512i i0 = _mm512_maskz_loadu_epi32(tail, _ix.data + dx); + __m512i i1 = _mm512_add_epi32(i0, cn); + __m512 s0 = _mm512_i32gather_ps(i0, ps, 4); + __m512 s1 = _mm512_i32gather_ps(i1, ps, 4); + __m512 fx1 = _mm512_maskz_loadu_ps(tail, _ax.data + dx); + __m512 fx0 = _mm512_sub_ps(_1, fx1); + _mm512_mask_store_ps(pb + dx, tail, _mm512_fmadd_ps(s0, fx0, _mm512_mul_ps(s1, fx1))); + } + } + } + + size_t dx = 0; + __m512 _fy0 = _mm512_set1_ps(fy0); + __m512 _fy1 = _mm512_set1_ps(fy1); + for (; dx < rsa; dx += Avx512f::F) + { + __m512 b0 = Load(pbx[0] + dx); + __m512 b1 = Load(pbx[1] + dx); + Store(dst + dx, _mm512_fmadd_ps(b0, _fy0, _mm512_mul_ps(b1, _fy1))); + } + if (dx < _rs) + { + __m512 b0 = Load(pbx[0] + dx, tail); + __m512 b1 = Load(pbx[1] + dx, tail); + Store(dst + dx, _mm512_fmadd_ps(b0, _fy0, _mm512_mul_ps(b1, _fy1)), tail); + } + } + } + + //--------------------------------------------------------------------- + + void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t 
dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) + { + if (type == SimdResizeChannelFloat && method == SimdResizeMethodBilinear) + return new ResizerFloatBilinear(srcX, srcY, dstX, dstY, channels, false); + else if (type == SimdResizeChannelFloat && method == SimdResizeMethodCaffeInterp) + return new ResizerFloatBilinear(srcX, srcY, dstX, dstY, channels, true); + else + return Base::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); + } + } +#endif //SIMD_AVX512f_ENABLE +} + diff --git a/src/3rd/Simd/SimdAvx512fSynet.cpp b/src/3rd/Simd/SimdAvx512fSynet.cpp new file mode 100644 index 00000000..a3756f81 --- /dev/null +++ b/src/3rd/Simd/SimdAvx512fSynet.cpp @@ -0,0 +1,368 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdStore.h" +#include "Simd/SimdExtract.h" +#include "Simd/SimdPow.h" +#include "Simd/SimdAvx2.h" +#include "Simd/SimdArray.h" + +namespace Simd +{ +#ifdef SIMD_AVX512F_ENABLE + namespace Avx512f + { + template SIMD_INLINE void SynetAddBias(const __m512 & bias, float * dst, __mmask16 tail = -1) + { + Store(dst, _mm512_add_ps((Load(dst, tail)), bias), tail); + } + + template SIMD_INLINE void SynetAddBias(const float * bias, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t partial = AlignLo(size, F); + __mmask16 tail = __mmask16(-1) >> (F + partial - size); + for (size_t i = 0; i < count; ++i) + { + size_t j = 0; + __m512 _bias = _mm512_set1_ps(bias[i]); + for (; j < aligned; j += QF) + { + SynetAddBias(_bias, dst + j + F * 0); + SynetAddBias(_bias, dst + j + F * 1); + SynetAddBias(_bias, dst + j + F * 2); + SynetAddBias(_bias, dst + j + F * 3); + } + for (; j < partial; j += F) + SynetAddBias(_bias, dst + j); + if(j < size) + SynetAddBias(_bias, dst + j, tail); + dst += size; + } + } + + void SynetAddBias(const float * bias, size_t count, size_t size, float * dst) + { + if (Aligned(dst) && Aligned(size)) + SynetAddBias(bias, count, size, dst); + else + SynetAddBias(bias, count, size, dst); + } + + template void SynetEltwiseLayerForwardProduct(const float * src0, const float * src1, float * dst, size_t offset, __mmask16 tail = -1) + { + Store(dst + offset, _mm512_mul_ps((Load(src0 + offset, tail)), (Load(src1 + offset, tail))), tail); + } + + template void SynetEltwiseLayerForwardProduct(float const * const * src, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t partial = AlignLo(size, F); + __mmask16 tail = __mmask16(-1) >> (F + partial - size); + const float * src0 = src[0]; + const float * src1 = src[1]; + size_t j = 0; + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardProduct(src0, src1, dst, j + F * 0); + 
SynetEltwiseLayerForwardProduct(src0, src1, dst, j + F * 1); + SynetEltwiseLayerForwardProduct(src0, src1, dst, j + F * 2); + SynetEltwiseLayerForwardProduct(src0, src1, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardProduct(src0, src1, dst, j); + if (j < size) + SynetEltwiseLayerForwardProduct(src0, src1, dst, j, tail); + for (size_t i = 2; i < count; ++i) + { + const float * srci = src[i]; + for (j = 0; j < aligned; j += QF) + { + SynetEltwiseLayerForwardProduct(dst, srci, dst, j + F * 0); + SynetEltwiseLayerForwardProduct(dst, srci, dst, j + F * 1); + SynetEltwiseLayerForwardProduct(dst, srci, dst, j + F * 2); + SynetEltwiseLayerForwardProduct(dst, srci, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardProduct(dst, srci, dst, j); + if (j < size) + SynetEltwiseLayerForwardProduct(dst, srci, dst, j, tail); + } + } + + template void SynetEltwiseLayerForwardSum(const float * src0, const __m512 & weight0, const float * src1, const __m512 & weight1, float * dst, size_t offset, __mmask16 tail = -1) + { + Store(dst + offset, _mm512_fmadd_ps((Load(src0 + offset, tail)), weight0, _mm512_mul_ps((Load(src1 + offset, tail)), weight1)), tail); + } + + template void SynetEltwiseLayerForwardSum(const float * src, const __m512 & weight, float * dst, size_t offset, __mmask16 tail = -1) + { + Store(dst + offset, _mm512_fmadd_ps((Load(src + offset, tail)), weight, (Load(dst + offset, tail))), tail); + } + + template void SynetEltwiseLayerForwardSum(float const * const * src, const float * weight, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t partial = AlignLo(size, F); + __mmask16 tail = __mmask16(-1) >> (F + partial - size); + const float * src0 = src[0]; + const float * src1 = src[1]; + __m512 weight0 = _mm512_set1_ps(weight[0]); + __m512 weight1 = _mm512_set1_ps(weight[1]); + size_t j = 0; + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardSum(src0, weight0, src1, 
weight1, dst, j + F * 0); + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 1); + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 2); + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j); + if (j < size) + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j, tail); + for (size_t i = 2; i < count; ++i) + { + const float * srci = src[i]; + __m512 weighti = _mm512_set1_ps(weight[i]); + for (j = 0; j < aligned; j += QF) + { + SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 0); + SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 1); + SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 2); + SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardSum(srci, weighti, dst, j); + if (j < size) + SynetEltwiseLayerForwardSum(srci, weighti, dst, j, tail); + } + } + + template void SynetEltwiseLayerForwardMax(const float * src0, const float * src1, float * dst, size_t offset, __mmask16 tail = -1) + { + Store(dst + offset, _mm512_max_ps((Load(src0 + offset, tail)), (Load(src1 + offset, tail))), tail); + } + + template void SynetEltwiseLayerForwardMax(float const * const * src, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t partial = AlignLo(size, F); + __mmask16 tail = __mmask16(-1) >> (F + partial - size); + const float * src0 = src[0]; + const float * src1 = src[1]; + size_t j = 0; + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardMax(src0, src1, dst, j + F * 0); + SynetEltwiseLayerForwardMax(src0, src1, dst, j + F * 1); + SynetEltwiseLayerForwardMax(src0, src1, dst, j + F * 2); + SynetEltwiseLayerForwardMax(src0, src1, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardMax(src0, src1, dst, j); + if(j < size) + 
SynetEltwiseLayerForwardMax(src0, src1, dst, j, tail); + for (size_t i = 2; i < count; ++i) + { + const float * srci = src[i]; + for (j = 0; j < aligned; j += QF) + { + SynetEltwiseLayerForwardMax(dst, srci, dst, j + F * 0); + SynetEltwiseLayerForwardMax(dst, srci, dst, j + F * 1); + SynetEltwiseLayerForwardMax(dst, srci, dst, j + F * 2); + SynetEltwiseLayerForwardMax(dst, srci, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardMax(dst, srci, dst, j); + if (j < size) + SynetEltwiseLayerForwardMax(dst, srci, dst, j, tail); + } + } + + template void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst) + { + switch (type) + { + case SimdSynetEltwiseOperationProduct: + SynetEltwiseLayerForwardProduct(src, count, size, dst); + break; + case SimdSynetEltwiseOperationSum: + SynetEltwiseLayerForwardSum(src, weight, count, size, dst); + break; + case SimdSynetEltwiseOperationMax: + SynetEltwiseLayerForwardMax(src, count, size, dst); + break; + default: + assert(0); + } + } + + void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst) + { + assert(count >= 2); + bool aligned = Aligned(dst) && Aligned(src[0]) && Aligned(src[1]); + for (size_t i = 2; i < count; ++i) + aligned = aligned && Aligned(src[i]); + if (aligned) + SynetEltwiseLayerForward(src, weight, count, size, type, dst); + else + SynetEltwiseLayerForward(src, weight, count, size, type, dst); + } + + template SIMD_INLINE void SynetLrnLayerCrossChannels(const float * src, size_t half, size_t count, size_t size, const float * k, float * dst) + { + size_t aligned = AlignLo(size, F); + __mmask16 tail = TailMask16(size - aligned); + Array32f sum(size, true), zero(size, true); + + for (size_t i = 0; i < half; ++i) + { + const float * pos = src + i * size; + size_t j = 0; + for (; j < aligned; j += F) + { 
+ __m512 _pos = Avx512f::Load(pos + j); + Avx512f::Store(sum.data + j, _mm512_fmadd_ps(_pos, _pos, Avx512f::Load(sum.data + j))); + } + if (j < size) + { + __m512 _pos = Avx512f::Load(pos + j, tail); + __m512 _sum = Avx512f::Load(sum.data + j, tail); + Avx512f::Store(sum.data + j, _mm512_fmadd_ps(_pos, _pos, _sum), tail); + } + } + + __m512 k0 = _mm512_set1_ps(k[0]); + __m512 k1 = _mm512_set1_ps(k[1]); + __m512 k2 = _mm512_set1_ps(k[2]); + Avx512f::Pow pow; + for (size_t i = 0; i < count; ++i) + { + const float * pos = (i < count - half) ? src + half * size : zero.data; + const float * neg = (i > half) ? src - (half + 1) * size : zero.data; + size_t j = 0; + for (; j < aligned; j += F) + { + __m512 _pos = Avx512f::Load(pos + j); + __m512 _neg = Avx512f::Load(neg + j); + __m512 _sum = Avx512f::Load(sum.data + j); + _sum = _mm512_fmadd_ps(_pos, _pos, _mm512_fnmadd_ps(_neg, _neg, _sum)); + __m512 _src = Avx512f::Load(src + j); + Avx512f::Store(sum.data + j, _sum); + Avx512f::Store(dst + j, _mm512_mul_ps(_src, pow(_mm512_fmadd_ps(k1, _sum, k0), k2))); + } + if (j < size) + { + __m512 _pos = Avx512f::Load(pos + j, tail); + __m512 _neg = Avx512f::Load(neg + j, tail); + __m512 _sum = Avx512f::Load(sum.data + j, tail); + _sum = _mm512_fmadd_ps(_pos, _pos, _mm512_fnmadd_ps(_neg, _neg, _sum)); + __m512 _src = Avx512f::Load(src + j, tail); + Avx512f::Store(sum.data + j, _sum, tail); + Avx512f::Store(dst + j, _mm512_mul_ps(_src, pow(_mm512_fmadd_ps(k1, _sum, k0), k2)), tail); + } + src += size; + dst += size; + } + } + + void SynetLrnLayerCrossChannels(const float * src, size_t half, size_t count, size_t size, const float * k, float * dst) + { + if (Aligned(src) && Aligned(dst) && Aligned(size)) + SynetLrnLayerCrossChannels(src, half, count, size, k, dst); + else + SynetLrnLayerCrossChannels(src, half, count, size, k, dst); + } + + template SIMD_INLINE void SynetScaleLayerForward(const float * src, const __m512 & scale, const __m512 & bias, float * dst, size_t offset, 
__mmask16 tail = -1) + { + Store(dst + offset, _mm512_fmadd_ps((Load(src + offset, tail)), scale, bias), tail); + } + + template SIMD_INLINE void SynetScaleLayerForward(const float * src, const __m512 & scale, float * dst, size_t offset, __mmask16 tail = -1) + { + Store(dst + offset, _mm512_mul_ps((Load(src + offset, tail)), scale), tail); + } + + template SIMD_INLINE void SynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t partial = AlignLo(size, F); + __mmask16 tail = __mmask16(-1) >> (F + partial - size); + if (bias) + { + for (size_t i = 0; i < count; ++i) + { + size_t j = 0; + __m512 _scale = _mm512_set1_ps(scale[i]); + __m512 _bias = _mm512_set1_ps(bias[i]); + for (; j < aligned; j += QF) + { + SynetScaleLayerForward(src, _scale, _bias, dst, j + F * 0); + SynetScaleLayerForward(src, _scale, _bias, dst, j + F * 1); + SynetScaleLayerForward(src, _scale, _bias, dst, j + F * 2); + SynetScaleLayerForward(src, _scale, _bias, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetScaleLayerForward(src, _scale, _bias, dst, j); + if (j < size) + SynetScaleLayerForward(src, _scale, _bias, dst, j, tail); + src += size; + dst += size; + } + } + else + { + for (size_t i = 0; i < count; ++i) + { + size_t j = 0; + __m512 _scale = _mm512_set1_ps(scale[i]); + for (; j < aligned; j += QF) + { + SynetScaleLayerForward(src, _scale, dst, j + F * 0); + SynetScaleLayerForward(src, _scale, dst, j + F * 1); + SynetScaleLayerForward(src, _scale, dst, j + F * 2); + SynetScaleLayerForward(src, _scale, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetScaleLayerForward(src, _scale, dst, j); + if (j < size) + SynetScaleLayerForward(src, _scale, dst, j, tail); + src += size; + dst += size; + } + } + } + + void SynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst) + { + if (Aligned(dst) && 
Aligned(size)) + SynetScaleLayerForward(src, scale, bias, count, size, dst); + else + SynetScaleLayerForward(src, scale, bias, count, size, dst); + } + } +#endif// SIMD_AVX512F_ENABLE +} diff --git a/src/3rd/Simd/SimdBase.h b/src/3rd/Simd/SimdBase.h index 6c4a6646..3750e7ed 100644 --- a/src/3rd/Simd/SimdBase.h +++ b/src/3rd/Simd/SimdBase.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar, +* Copyright (c) 2011-2018 Yermalayeu Ihar, * 2014-2016 Antonenka Mikhail. * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -31,6 +31,10 @@ namespace Simd { namespace Base { + size_t GetThreadNumber(); + + void SetThreadNumber(size_t threadNumber); + uint32_t Crc32c(const void * src, size_t size); void AbsDifferenceSum(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, @@ -227,13 +231,19 @@ namespace Simd void SquaredDifferenceSum16f(const uint16_t * a, const uint16_t * b, size_t size, float * sum); + void CosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance); + void Float32ToUint8(const float * src, size_t size, const float * lower, const float * upper, uint8_t * dst); void Uint8ToFloat32(const uint8_t * src, size_t size, const float * lower, const float * upper, float * dst); + void CosineDistance32f(const float * a, const float * b, size_t size, float * distance); + void GaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride); + void Gemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc); + void GrayToBgr(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgr, size_t bgrStride); void GrayToBgra(const uint8_t *gray, size_t width, size_t height, size_t grayStride, uint8_t *bgra, size_t bgraStride, uint8_t alpha); @@ 
-269,7 +279,7 @@ namespace Simd void HogLiteExtractFeatures(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t cell, float * features, size_t featuresStride); - void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride); + void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride); void HogLiteResizeFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight); @@ -495,6 +505,8 @@ namespace Simd void ValueSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); void SquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); + + void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum); void CorrelationSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum); @@ -503,6 +515,14 @@ namespace Simd void SvmSumLinear(const float * x, const float * svs, const float * weights, size_t length, size_t count, float * sum); + void SynetAddBias(const float * bias, size_t count, size_t size, float * dst); + + void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst); + + void SynetLrnLayerCrossChannels(const float * src, size_t half, size_t count, size_t size, const float * k, float * dst); + + void SynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t 
size, float * dst); + void TextureBoostedSaturatedGradient(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t saturation, uint8_t boost, uint8_t * dx, size_t dxStride, uint8_t * dy, size_t dyStride); diff --git a/src/3rd/Simd/SimdBaseDetection.cpp b/src/3rd/Simd/SimdBaseDetection.cpp index 37fbbbe5..e7a1830e 100644 --- a/src/3rd/Simd/SimdBaseDetection.cpp +++ b/src/3rd/Simd/SimdBaseDetection.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -94,7 +94,8 @@ namespace Simd tinyxml2::XMLNode * child = parent->FirstChild(); if (child == NULL) SIMD_EX("Invalid node!"); - std::stringstream ss(tinyxml2::XMLUtil::SkipWhiteSpace(child->Value())); + int curLineNum = 0; + std::stringstream ss(tinyxml2::XMLUtil::SkipWhiteSpace(child->Value(), &curLineNum)); std::vector values; while (!ss.eof()) { @@ -900,10 +901,5 @@ namespace Simd Rect(left, top, right, bottom), Image(hid.sum.width - 1, hid.sum.height - 1, dstStride, Image::Gray8, dst).Ref()); } - - void DetectionFree(void * ptr) - { - delete (Deletable*)ptr; - } } } diff --git a/src/3rd/Simd/SimdBaseFloat16.cpp b/src/3rd/Simd/SimdBaseFloat16.cpp index 0c1d268b..1ae91895 100644 --- a/src/3rd/Simd/SimdBaseFloat16.cpp +++ b/src/3rd/Simd/SimdBaseFloat16.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -146,5 +146,19 @@ namespace Simd sums[0] += SquaredDifference16f(a[i], b[i]); *sum = sums[0] + sums[1] + sums[2] + sums[3]; } + + void CosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance) + { + float aa = 0, ab = 0, bb = 0; + for (size_t i = 0; i < size; ++i) + { + float _a = Float16ToFloat32(a[i]); + float _b = Float16ToFloat32(b[i]); + aa += _a * _a; + ab += _a * _b; + bb += _b * _b; + } + *distance = 1.0f - ab / ::sqrt(aa*bb); + } } } diff --git a/src/3rd/Simd/SimdBaseFloat32.cpp b/src/3rd/Simd/SimdBaseFloat32.cpp index 127825fa..23f19a2a 100644 --- a/src/3rd/Simd/SimdBaseFloat32.cpp +++ b/src/3rd/Simd/SimdBaseFloat32.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -51,7 +51,7 @@ namespace Simd SIMD_INLINE float Uint8ToFloat32(int value, float lower, float boost) { - return value*boost - lower; + return value*boost + lower; } void Uint8ToFloat32(const uint8_t * src, size_t size, const float * lower, const float * upper, float * dst) @@ -69,5 +69,19 @@ namespace Simd for (; i < size; ++i) dst[i] = Uint8ToFloat32(src[i], _lower, boost); } + + void CosineDistance32f(const float * a, const float * b, size_t size, float * distance) + { + float aa = 0, ab = 0, bb = 0; + for (size_t i = 0; i < size; ++i) + { + float _a = a[i]; + float _b = b[i]; + aa += _a * _a; + ab += _a * _b; + bb += _b * _b; + } + *distance = 1.0f - ab / ::sqrt(aa*bb); + } } } diff --git a/src/3rd/Simd/SimdBaseGemm32f.cpp b/src/3rd/Simd/SimdBaseGemm32f.cpp new file mode 100644 index 00000000..9425bb7d --- /dev/null +++ 
b/src/3rd/Simd/SimdBaseGemm32f.cpp @@ -0,0 +1,48 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdDefs.h" + +namespace Simd +{ + namespace Base + { + void Gemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc) + { + float b = beta[0]; + for (size_t i = 0; i < M; ++i) + { + float * pC = C + i * ldc; + for (size_t j = 0; j < N; ++j) + pC[j] = b * pC[j]; + for (size_t k = 0; k < K; ++k) + { + const float * pB = B + k * ldb; + float a = alpha[0] * A[i*lda + k]; + for (size_t j = 0; j < N; ++j) + pC[j] = a * pB[j] + pC[j]; + } + } + } + } +} diff --git a/src/3rd/Simd/SimdBaseHogLite.cpp b/src/3rd/Simd/SimdBaseHogLite.cpp index 78119de6..bd9570ee 100644 --- a/src/3rd/Simd/SimdBaseHogLite.cpp +++ b/src/3rd/Simd/SimdBaseHogLite.cpp @@ -214,9 +214,9 @@ namespace Simd class HogLiteFeatureFilter { - void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterSize, float * dst, size_t dstStride) + void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, float * dst, size_t dstStride) { - size_t filterStride = featureSize*filterSize; + size_t filterStride = featureSize*filterWidth; for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) { for (size_t dstCol = 0; dstCol < dstWidth; ++dstCol) @@ -224,7 +224,7 @@ namespace Simd float sum = 0; const float * pSrc = src + dstRow*srcStride + dstCol*featureSize; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { for (size_t filterCol = 0; filterCol < filterStride; ++filterCol) sum += pSrc[filterCol] * pFilter[filterCol]; @@ -237,9 +237,9 @@ namespace Simd } } - void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterSize, 
const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { - size_t filterStride = featureSize*filterSize; + size_t filterStride = featureSize*filterWidth; for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) { for (size_t dstCol = 0; dstCol < dstWidth; ++dstCol) @@ -249,7 +249,7 @@ namespace Simd float sum = 0; const float * pSrc = src + dstRow*srcStride + dstCol*featureSize; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { for (size_t filterCol = 0; filterCol < filterStride; ++filterCol) sum += pSrc[filterCol] * pFilter[filterCol]; @@ -266,24 +266,24 @@ namespace Simd } } public: - void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { assert(featureSize == 8 || featureSize == 16); - assert(srcWidth >= filterSize && srcHeight >= filterSize); + assert(srcWidth >= filterWidth && srcHeight >= filterHeight); - size_t dstWidth = srcWidth - filterSize + 1; - size_t dstHeight = srcHeight - filterSize + 1; + size_t dstWidth = srcWidth - filterWidth + 1; + size_t dstHeight = srcHeight - filterHeight + 1; if (mask) - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, 
filterHeight, mask, maskStride, dst, dstStride); else - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, dst, dstStride); } }; - void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { HogLiteFeatureFilter featureFilter; - featureFilter.Run(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + featureFilter.Run(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); } class HogLiteFeatureResizer diff --git a/src/3rd/Simd/SimdBaseNeural.cpp b/src/3rd/Simd/SimdBaseNeural.cpp index c5fc008b..5b03f0cd 100644 --- a/src/3rd/Simd/SimdBaseNeural.cpp +++ b/src/3rd/Simd/SimdBaseNeural.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "Simd/SimdMath.h" #include "Simd/SimdMemory.h" +#include "Simd/SimdPow.h" namespace Simd { diff --git a/src/3rd/Simd/SimdBaseReduceGray5x5.cpp b/src/3rd/Simd/SimdBaseReduceGray5x5.cpp index 2d16c539..a95c56d5 100644 --- a/src/3rd/Simd/SimdBaseReduceGray5x5.cpp +++ b/src/3rd/Simd/SimdBaseReduceGray5x5.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -208,9 +208,9 @@ namespace Simd ++sx; dx = dy; - register unsigned short * p_isc0 = buffer.isc0; - register unsigned short * p_isc1 = buffer.isc1; - register unsigned short * p_iscp = buffer.iscp; + unsigned short * p_isc0 = buffer.isc0; + unsigned short * p_isc1 = buffer.isc1; + unsigned short * p_iscp = buffer.iscp; // Main entries in row for (evenX = false, srcx = 1, dstx = 0; srcx < (srcWidth - 1); srcx += 2, ++sx) @@ -219,7 +219,7 @@ namespace Simd p_isc1++; p_iscp++; - register unsigned short icurrent = (unsigned short)(*sx); + unsigned short icurrent = (unsigned short)(*sx); isrp = icurrent * 4; icurrent = (unsigned short)(*(++sx)); @@ -239,7 +239,7 @@ namespace Simd //doing the last operation due to even number of operations in previous cycle if (!(srcWidth & 1)) { - register unsigned short icurrent = (unsigned short)(*sx); + unsigned short icurrent = (unsigned short)(*sx); isrp = icurrent * 4; ++dstx; evenX = !evenX; @@ -281,11 +281,11 @@ namespace Simd ++sx; // Main entries in odd-numbered row - register unsigned short * p_iscp = buffer.iscp; + unsigned short * p_iscp = buffer.iscp; for (evenX = false, srcx = 1, dstx = 0; srcx < (srcWidth - 1); srcx += 2, ++sx) { - register unsigned short icurrent = (unsigned short)(*sx); + unsigned short icurrent = (unsigned short)(*sx); isrp = icurrent * 4; p_iscp++; @@ -301,7 +301,7 @@ 
namespace Simd //doing the last operation due to even number of operations in previous cycle if (!(srcWidth & 1)) { - register unsigned short icurrent = (unsigned short)(*sx); + unsigned short icurrent = (unsigned short)(*sx); isrp = icurrent * 4; ++dstx; evenX = !evenX; diff --git a/src/3rd/Simd/SimdBaseResizer.cpp b/src/3rd/Simd/SimdBaseResizer.cpp new file mode 100644 index 00000000..1eba12f0 --- /dev/null +++ b/src/3rd/Simd/SimdBaseResizer.cpp @@ -0,0 +1,257 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdResizer.h" + +namespace Simd +{ + namespace Base + { + ResizerByteBilinear::ResizerByteBilinear(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels) + : Resizer(SimdResizeChannelByte, SimdResizeMethodBilinear) + , _sx(srcX), _sy(srcY), _dx(dstX), _dy(dstY), _cn(channels) + { + _ay.Resize(_dy); + _iy.Resize(_dy); + EstimateIndexAlpha(_sy, _dy, _iy.data, _ay.data, 1); + + _rs = _dx * _cn; + _ax.Resize(_rs); + _ix.Resize(_rs); + EstimateIndexAlpha(_sx, _dx, _ix.data, _ax.data, _cn); + } + + void ResizerByteBilinear::EstimateIndexAlpha(size_t srcSize, size_t dstSize, int32_t * indices, int32_t * alphas, size_t channels) + { + float scale = (float)srcSize / dstSize; + + for (size_t i = 0; i < dstSize; ++i) + { + float alpha = (float)((i + 0.5f)*scale - 0.5f); + ptrdiff_t index = (ptrdiff_t)::floor(alpha); + alpha -= index; + + if (index < 0) + { + index = 0; + alpha = 0; + } + + if (index >(ptrdiff_t)srcSize - 2) + { + index = srcSize - 2; + alpha = 1; + } + + for (size_t c = 0; c < channels; c++) + { + size_t offset = i * channels + c; + indices[offset] = (int32_t)(channels*index + c); + alphas[offset] = (int32_t)(alpha * FRACTION_RANGE + 0.5f); + } + } + } + + void ResizerByteBilinear::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) const + { + Array32i bx[2]; + bx[0].Resize(_rs); + bx[1].Resize(_rs); + int32_t * pbx[2] = { bx[0].data, bx[1].data }; + int32_t prev = -2; + for (size_t dy = 0; dy < _dy; dy++, dst += dstStride) + { + int32_t fy = _ay[dy]; + int32_t sy = _iy[dy]; + int32_t k = 0; + + if (sy == prev) + k = 2; + else if (sy == prev + 1) + { + Swap(pbx[0], pbx[1]); + k = 1; + } + + prev = sy; + + for (; k < 2; k++) + { + int32_t * pb = pbx[k]; + const uint8_t * ps = src + (sy + k)*srcStride; + for (size_t dx = 0; dx < _rs; dx++) + { + int32_t sx = _ix[dx]; + int32_t fx = _ax[dx]; + int32_t t = ps[sx]; + pb[dx] = (t << LINEAR_SHIFT) + (ps[sx + _cn] - t)*fx; + } + 
} + + if (fy == 0) + for (size_t dx = 0; dx < _rs; dx++) + dst[dx] = ((pbx[0][dx] << LINEAR_SHIFT) + BILINEAR_ROUND_TERM) >> BILINEAR_SHIFT; + else if (fy == FRACTION_RANGE) + for (size_t dx = 0; dx < _rs; dx++) + dst[dx] = ((pbx[1][dx] << LINEAR_SHIFT) + BILINEAR_ROUND_TERM) >> BILINEAR_SHIFT; + else + { + for (size_t dx = 0; dx < _rs; dx++) + { + int32_t t = pbx[0][dx]; + dst[dx] = ((t << LINEAR_SHIFT) + (pbx[1][dx] - t)*fy + BILINEAR_ROUND_TERM) >> BILINEAR_SHIFT; + } + } + } + } + + //--------------------------------------------------------------------- + + ResizerFloatBilinear::ResizerFloatBilinear(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, size_t align, bool caffeInterp) + : Resizer(SimdResizeChannelFloat, SimdResizeMethodBilinear) + , _sx(srcX), _sy(srcY), _dx(dstX), _dy(dstY), _cn(channels) + { + _ay.Resize(_dy, false, align); + _iy.Resize(_dy, false, align); + EstimateIndexAlpha(_sy, _dy, _iy.data, _ay.data, 1, caffeInterp); + + _rs = _dx * _cn; + _ax.Resize(_rs, false, align); + _ix.Resize(_rs, false, align); + EstimateIndexAlpha(_sx, _dx, _ix.data, _ax.data, _cn, caffeInterp); + } + + void ResizerFloatBilinear::EstimateIndexAlpha(size_t srcSize, size_t dstSize, int32_t * indices, float * alphas, size_t channels, bool caffeInterp) + { + if (caffeInterp) + { + float scale = dstSize > 1 ? 
float(srcSize - 1) / float(dstSize - 1) : 0.0f; + for (size_t i = 0; i < dstSize; ++i) + { + float alpha = float(i)*scale; + ptrdiff_t index = (ptrdiff_t)::floor(alpha); + alpha -= index; + if (index > (ptrdiff_t)srcSize - 2) + { + index = srcSize - 2; + alpha = 1; + } + for (size_t c = 0; c < channels; c++) + { + size_t offset = i * channels + c; + indices[offset] = (int32_t)(channels*index + c); + alphas[offset] = alpha; + } + } + } + else + { + float scale = (float)srcSize / dstSize; + for (size_t i = 0; i < dstSize; ++i) + { + float alpha = (float)((i + 0.5f)*scale - 0.5f); + ptrdiff_t index = (ptrdiff_t)::floor(alpha); + alpha -= index; + if (index < 0) + { + index = 0; + alpha = 0; + } + if (index >(ptrdiff_t)srcSize - 2) + { + index = srcSize - 2; + alpha = 1; + } + for (size_t c = 0; c < channels; c++) + { + size_t offset = i * channels + c; + indices[offset] = (int32_t)(channels*index + c); + alphas[offset] = alpha; + } + } + } + } + + void ResizerFloatBilinear::Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) const + { + Run((const float*)src, srcStride / sizeof(float), (float*)dst, dstStride / sizeof(float)); + } + + void ResizerFloatBilinear::Run(const float * src, size_t srcStride, float * dst, size_t dstStride) const + { + Array32f bx[2]; + bx[0].Resize(_rs); + bx[1].Resize(_rs); + float * pbx[2] = { bx[0].data, bx[1].data }; + int32_t prev = -2; + for (size_t dy = 0; dy < _dy; dy++, dst += dstStride) + { + float fy1 = _ay[dy]; + float fy0 = 1.0f - fy1; + int32_t sy = _iy[dy]; + int32_t k = 0; + + if (sy == prev) + k = 2; + else if (sy == prev + 1) + { + Swap(pbx[0], pbx[1]); + k = 1; + } + + prev = sy; + + for (; k < 2; k++) + { + float * pb = pbx[k]; + const float * ps = src + (sy + k)*srcStride; + for (size_t dx = 0; dx < _rs; dx++) + { + int32_t sx = _ix[dx]; + float fx = _ax[dx]; + pb[dx] = ps[sx]*(1.0f - fx) + ps[sx + _cn]*fx; + } + } + + for (size_t dx = 0; dx < _rs; dx++) + dst[dx] = pbx[0][dx]*fy0 + pbx[1][dx]*fy1; 
+ } + } + + //--------------------------------------------------------------------- + + void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) + { + if (type == SimdResizeChannelByte && method == SimdResizeMethodBilinear) + return new ResizerByteBilinear(srcX, srcY, dstX, dstY, channels); + else if (type == SimdResizeChannelFloat && method == SimdResizeMethodBilinear) + return new ResizerFloatBilinear(srcX, srcY, dstX, dstY, channels, sizeof(void*), false); + else if (type == SimdResizeChannelFloat && method == SimdResizeMethodCaffeInterp) + return new ResizerFloatBilinear(srcX, srcY, dstX, dstY, channels, sizeof(void*), true); + else + return NULL; + } + } +} + diff --git a/src/3rd/Simd/SimdBaseStatistic.cpp b/src/3rd/Simd/SimdBaseStatistic.cpp index c95634cd..c897f09b 100644 --- a/src/3rd/Simd/SimdBaseStatistic.cpp +++ b/src/3rd/Simd/SimdBaseStatistic.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -220,6 +220,28 @@ namespace Simd src += stride; } } + + void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum) + { + assert(width < 0x10000); + + *valueSum = 0; + *squareSum = 0; + for (size_t row = 0; row < height; ++row) + { + int rowValueSum = 0; + int rowSquareSum = 0; + for (size_t col = 0; col < width; ++col) + { + int value = src[col]; + rowValueSum += value; + rowSquareSum += Square(value); + } + *valueSum += rowValueSum; + *squareSum += rowSquareSum; + src += stride; + } + } void CorrelationSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum) { diff --git a/src/3rd/Simd/SimdBaseSynet.cpp b/src/3rd/Simd/SimdBaseSynet.cpp new file mode 100644 index 00000000..76507206 --- /dev/null +++ b/src/3rd/Simd/SimdBaseSynet.cpp @@ -0,0 +1,232 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ +#include "Simd/SimdArray.h" +#include "Simd/SimdPow.h" + +namespace Simd +{ + namespace Base + { + void SynetAddBias(const float * bias, size_t count, size_t size, float * dst) + { + size_t aligned = Simd::AlignLo(size, 4); + for (size_t i = 0; i < count; ++i) + { + float value = bias[i]; + size_t j = 0; + for (; j < aligned; j += 4) + { + dst[j + 0] += value; + dst[j + 1] += value; + dst[j + 2] += value; + dst[j + 3] += value; + } + for (; j < size; ++j) + dst[j] += value; + dst += size; + } + } + + void SynetEltwiseLayerForwardProduct(float const * const * src, size_t count, size_t size, float * dst) + { + size_t aligned = Simd::AlignLo(size, 4); + const float * src0 = src[0]; + const float * src1 = src[1]; + size_t j = 0; + for (; j < aligned; j += 4) + { + dst[j + 0] = src0[j + 0] * src1[j + 0]; + dst[j + 1] = src0[j + 1] * src1[j + 1]; + dst[j + 2] = src0[j + 2] * src1[j + 2]; + dst[j + 3] = src0[j + 3] * src1[j + 3]; + } + for (; j < size; ++j) + dst[j] = src0[j] * src1[j]; + for (size_t i = 2; i < count; ++i) + { + const float * srci = src[i]; + for (j = 0; j < aligned; j += 4) + { + dst[j + 0] *= srci[j + 0]; + dst[j + 1] *= srci[j + 1]; + dst[j + 2] *= srci[j + 2]; + dst[j + 3] *= srci[j + 3]; + } + for (; j < size; ++j) + dst[j] *= srci[j]; + } + } + + void SynetEltwiseLayerForwardSum(float const * const * src, const float * weight, size_t count, size_t size, float * dst) + { + size_t aligned = Simd::AlignLo(size, 4); + const float * src0 = src[0]; + const float * src1 = src[1]; + float weight0 = weight[0], weight1 = weight[1]; + size_t j = 0; + for (; j < aligned; j += 4) + { + dst[j + 0] = src0[j + 0] * weight0 + src1[j + 0] * weight1; + dst[j + 1] = src0[j + 1] * weight0 + src1[j + 
1] * weight1; + dst[j + 2] = src0[j + 2] * weight0 + src1[j + 2] * weight1; + dst[j + 3] = src0[j + 3] * weight0 + src1[j + 3] * weight1; + } + for (; j < size; ++j) + dst[j] = src0[j] * weight0 + src1[j] * weight1; + for (size_t i = 2; i < count; ++i) + { + const float * srci = src[i]; + float weighti = weight[i]; + for (j = 0; j < aligned; j += 4) + { + dst[j + 0] += srci[j + 0] * weighti; + dst[j + 1] += srci[j + 1] * weighti; + dst[j + 2] += srci[j + 2] * weighti; + dst[j + 3] += srci[j + 3] * weighti; + } + for (; j < size; ++j) + dst[j] += srci[j] * weighti; + } + } + + void SynetEltwiseLayerForwardMax(float const * const * src, size_t count, size_t size, float * dst) + { + size_t aligned = Simd::AlignLo(size, 4); + const float * src0 = src[0]; + const float * src1 = src[1]; + size_t j = 0; + for (; j < aligned; j += 4) + { + dst[j + 0] = Simd::Max(src0[j + 0], src1[j + 0]); + dst[j + 1] = Simd::Max(src0[j + 1], src1[j + 1]); + dst[j + 2] = Simd::Max(src0[j + 2], src1[j + 2]); + dst[j + 3] = Simd::Max(src0[j + 3], src1[j + 3]); + } + for (; j < size; ++j) + dst[j] = Simd::Max(src0[j], src1[j]); + for (size_t i = 2; i < count; ++i) + { + const float * srci = src[i]; + for (j = 0; j < aligned; j += 4) + { + dst[j + 0] = Simd::Max(dst[j + 0], srci[j + 0]); + dst[j + 1] = Simd::Max(dst[j + 1], srci[j + 1]); + dst[j + 2] = Simd::Max(dst[j + 2], srci[j + 2]); + dst[j + 3] = Simd::Max(dst[j + 3], srci[j + 3]); + } + for (; j < size; ++j) + dst[j] = Simd::Max(dst[j], srci[j]); + } + } + + void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst) + { + switch (type) + { + case SimdSynetEltwiseOperationProduct: + SynetEltwiseLayerForwardProduct(src, count, size, dst); + break; + case SimdSynetEltwiseOperationSum: + SynetEltwiseLayerForwardSum(src, weight, count, size, dst); + break; + case SimdSynetEltwiseOperationMax: + SynetEltwiseLayerForwardMax(src, count, size, dst); + 
break; + default: + assert(0); + } + } + + void SynetLrnLayerCrossChannels(const float * src, size_t half, size_t count, size_t size, const float * k, float * dst) + { + float k0 = k[0], k1 = k[1], k2 = k[2]; + Array32f sum(size, true), zero(size, true); + + for (size_t i = 0; i < half; ++i) + { + const float * pos = src + i * size; + for (size_t j = 0; j < size; ++j) + sum[j] += Simd::Square(pos[j]); + } + + for (size_t i = 0; i < count; ++i) + { + const float * pos = (i < count - half) ? src + half * size : zero.data; + const float * neg = (i > half) ? src - (half + 1) * size : zero.data; + for (size_t j = 0; j < size; ++j) + { + sum[j] += Simd::Square(pos[j]); + sum[j] -= Simd::Square(neg[j]); + dst[j] = src[j] * Pow(k0 + k1 * sum[j], k2); + } + src += size; + dst += size; + } + } + + void SynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst) + { + size_t aligned = Simd::AlignLo(size, 4); + if (bias) + { + for (size_t i = 0; i < count; ++i) + { + float s = scale[i]; + float b = bias[i]; + size_t j = 0; + for (; j < aligned; j += 4) + { + dst[j + 0] = src[j + 0] * s + b; + dst[j + 1] = src[j + 1] * s + b; + dst[j + 2] = src[j + 2] * s + b; + dst[j + 3] = src[j + 3] * s + b; + } + for (; j < size; ++j) + dst[j] = src[j] * s + b; + src += size; + dst += size; + } + } + else + { + for (size_t i = 0; i < count; ++i) + { + float s = scale[i]; + size_t j = 0; + for (; j < aligned; j += 4) + { + dst[j + 0] = src[j + 0] * s; + dst[j + 1] = src[j + 1] * s; + dst[j + 2] = src[j + 2] * s; + dst[j + 3] = src[j + 3] * s; + } + for (; j < size; ++j) + dst[j] = src[j] * s; + src += size; + dst += size; + } + } + } + } +} diff --git a/src/3rd/Simd/SimdBaseThread.cpp b/src/3rd/Simd/SimdBaseThread.cpp new file mode 100644 index 00000000..fb000a65 --- /dev/null +++ b/src/3rd/Simd/SimdBaseThread.cpp @@ -0,0 +1,45 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). 
+* +* Copyright (c) 2011-2018 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ +#include "Simd/SimdMath.h" +#include "Simd/SimdBase.h" + +#include + +namespace Simd +{ + namespace Base + { + size_t g_threadNumber = 1; + + size_t GetThreadNumber() + { + return g_threadNumber; + } + + void SetThreadNumber(size_t threadNumber) + { + g_threadNumber = Simd::RestrictRange(threadNumber, 1, std::thread::hardware_concurrency()); + } + } +} diff --git a/src/3rd/Simd/SimdBase_tinyxml2.cpp b/src/3rd/Simd/SimdBase_tinyxml2.cpp index 2f9e5d0a..4541dcb4 100644 --- a/src/3rd/Simd/SimdBase_tinyxml2.cpp +++ b/src/3rd/Simd/SimdBase_tinyxml2.cpp @@ -21,15 +21,86 @@ must not be misrepresented as being the original software. distribution. */ -#include "SimdBase_tinyxml2.h" +#include "Simd/SimdBase_tinyxml2.h" #include // yes, this one new style header, is in the Android SDK. 
-#if defined(ANDROID_NDK) || defined(__QNXNTO__) +#if defined(ANDROID_NDK) || defined(__BORLANDC__) || defined(__QNXNTO__) # include +# include #else # include +# include #endif +#if defined(_MSC_VER) && (_MSC_VER >= 1400 ) && (!defined WINCE) +// Microsoft Visual Studio, version 2005 and higher. Not WinCE. +/*int _snprintf_s( +char *buffer, +size_t sizeOfBuffer, +size_t count, +const char *format [, +argument] ... +);*/ +static inline int TIXML_SNPRINTF(char* buffer, size_t size, const char* format, ...) +{ + va_list va; + va_start(va, format); + int result = vsnprintf_s(buffer, size, _TRUNCATE, format, va); + va_end(va); + return result; +} + +static inline int TIXML_VSNPRINTF(char* buffer, size_t size, const char* format, va_list va) +{ + int result = vsnprintf_s(buffer, size, _TRUNCATE, format, va); + return result; +} + +#define TIXML_VSCPRINTF _vscprintf +#define TIXML_SSCANF sscanf_s +#elif defined _MSC_VER +// Microsoft Visual Studio 2003 and earlier or WinCE +#define TIXML_SNPRINTF _snprintf +#define TIXML_VSNPRINTF _vsnprintf +#define TIXML_SSCANF sscanf +#if (_MSC_VER < 1400 ) && (!defined WINCE) +// Microsoft Visual Studio 2003 and not WinCE. +#define TIXML_VSCPRINTF _vscprintf // VS2003's C runtime has this, but VC6 C runtime or WinCE SDK doesn't have. +#else +// Microsoft Visual Studio 2003 and earlier or WinCE. +static inline int TIXML_VSCPRINTF(const char* format, va_list va) +{ + int len = 512; + for (;;) { + len = len * 2; + char* str = new char[len](); + const int required = _vsnprintf(str, len, format, va); + delete[] str; + if (required != -1) { + TIXMLASSERT(required >= 0); + len = required; + break; + } + } + TIXMLASSERT(len >= 0); + return len; +} +#endif +#else +// GCC version 3 and higher +//#warning( "Using sn* functions." 
) +#define TIXML_SNPRINTF snprintf +#define TIXML_VSNPRINTF vsnprintf +static inline int TIXML_VSCPRINTF(const char* format, va_list va) +{ + int len = vsnprintf(0, 0, format, va); + TIXMLASSERT(len >= 0); + return len; +} +#define TIXML_SSCANF sscanf +#endif + + static const char LINE_FEED = (char)0x0a; // all line endings are normalized to LF static const char LF = LINE_FEED; static const char CARRIAGE_RETURN = (char)0x0d; // CR gets filtered out @@ -57,10 +128,10 @@ namespace tinyxml2 static const int NUM_ENTITIES = 5; static const Entity entities[NUM_ENTITIES] = { { "quot", 4, DOUBLE_QUOTE }, - { "amp", 3, '&' }, - { "apos", 4, SINGLE_QUOTE }, - { "lt", 2, '<' }, - { "gt", 2, '>' } + { "amp", 3, '&' }, + { "apos", 4, SINGLE_QUOTE }, + { "lt", 2, '<' }, + { "gt", 2, '>' } }; @@ -78,6 +149,7 @@ namespace tinyxml2 // This in effect implements the assignment operator by "moving" // ownership (as in auto_ptr). + TIXMLASSERT(other != 0); TIXMLASSERT(other->_flags == 0); TIXMLASSERT(other->_start == 0); TIXMLASSERT(other->_end == 0); @@ -93,6 +165,7 @@ namespace tinyxml2 _end = 0; } + void StrPair::Reset() { if (_flags & NEEDS_DELETE) { @@ -106,8 +179,10 @@ namespace tinyxml2 void StrPair::SetStr(const char* str, int flags) { + TIXMLASSERT(str); Reset(); size_t len = strlen(str); + TIXMLASSERT(_start == 0); _start = new char[len + 1]; memcpy(_start, str, len + 1); _end = _start + len; @@ -115,9 +190,11 @@ namespace tinyxml2 } - char* StrPair::ParseText(char* p, const char* endTag, int strFlags) + char* StrPair::ParseText(char* p, const char* endTag, int strFlags, int* curLineNumPtr) { + TIXMLASSERT(p); TIXMLASSERT(endTag && *endTag); + TIXMLASSERT(curLineNumPtr); char* start = p; char endChar = *endTag; @@ -129,7 +206,11 @@ namespace tinyxml2 Set(start, p, strFlags); return p + length; } + else if (*p == '\n') { + ++(*curLineNumPtr); + } ++p; + TIXMLASSERT(p); } return 0; } @@ -160,15 +241,15 @@ namespace tinyxml2 // Adjusting _start would cause undefined behavior on 
delete[] TIXMLASSERT((_flags & NEEDS_DELETE) == 0); // Trim leading space. - _start = XMLUtil::SkipWhiteSpace(_start); + _start = XMLUtil::SkipWhiteSpace(_start, 0); if (*_start) { - char* p = _start; // the read pointer + const char* p = _start; // the read pointer char* q = _start; // the write pointer while (*p) { if (XMLUtil::IsWhiteSpace(*p)) { - p = XMLUtil::SkipWhiteSpace(p); + p = XMLUtil::SkipWhiteSpace(p, 0); if (*p == 0) { break; // don't write to q; this trims the trailing space. } @@ -193,7 +274,7 @@ namespace tinyxml2 _flags ^= NEEDS_FLUSH; if (_flags) { - char* p = _start; // the read pointer + const char* p = _start; // the read pointer char* q = _start; // the write pointer while (p < _end) { @@ -207,7 +288,8 @@ namespace tinyxml2 else { ++p; } - *q++ = LF; + *q = LF; + ++q; } else if ((_flags & NEEDS_NEWLINE_NORMALIZATION) && *p == LF) { if (*(p + 1) == CR) { @@ -216,7 +298,8 @@ namespace tinyxml2 else { ++p; } - *q++ = LF; + *q = LF; + ++q; } else if ((_flags & NEEDS_ENTITY_PROCESSING) && *p == '&') { // Entities handled by tinyXML2: @@ -243,8 +326,8 @@ namespace tinyxml2 } } else { - int i = 0; - for (; i < NUM_ENTITIES; ++i) { + bool entityFound = false; + for (int i = 0; i < NUM_ENTITIES; ++i) { const Entity& entity = entities[i]; if (strncmp(p + 1, entity.pattern, entity.length) == 0 && *(p + entity.length + 1) == ';') { @@ -252,10 +335,11 @@ namespace tinyxml2 *q = entity.value; ++q; p += entity.length + 2; + entityFound = true; break; } } - if (i == NUM_ENTITIES) { + if (!entityFound) { // fixme: treat as error? ++p; ++q; @@ -272,7 +356,7 @@ namespace tinyxml2 } // The loop below has plenty going on, and this // is a less useful mode. Break it out. 
- if (_flags & COLLAPSE_WHITESPACE) { + if (_flags & NEEDS_WHITESPACE_COLLAPSING) { CollapseWhitespace(); } _flags = (_flags & NEEDS_DELETE); @@ -286,6 +370,19 @@ namespace tinyxml2 // --------- XMLUtil ----------- // + const char* XMLUtil::writeBoolTrue = "true"; + const char* XMLUtil::writeBoolFalse = "false"; + + void XMLUtil::SetBoolSerialization(const char* writeTrue, const char* writeFalse) + { + static const char* defTrue = "true"; + static const char* defFalse = "false"; + + writeBoolTrue = (writeTrue) ? writeTrue : defTrue; + writeBoolFalse = (writeFalse) ? writeFalse : defFalse; + } + + const char* XMLUtil::ReadBOM(const char* p, bool* bom) { TIXMLASSERT(p); @@ -323,26 +420,30 @@ namespace tinyxml2 *length = 4; } else { - *length = 0; // This code won't covert this correctly anyway. + *length = 0; // This code won't convert this correctly anyway. return; } output += *length; - // Scary scary fall throughs. + // Scary scary fall throughs are annotated with carefully designed comments + // to suppress compiler warnings such as -Wimplicit-fallthrough in gcc switch (*length) { case 4: --output; *output = (char)((input | BYTE_MARK) & BYTE_MASK); input >>= 6; + //fall through case 3: --output; *output = (char)((input | BYTE_MARK) & BYTE_MASK); input >>= 6; + //fall through case 2: --output; *output = (char)((input | BYTE_MARK) & BYTE_MASK); input >>= 6; + //fall through case 1: --output; *output = (char)(input | FIRST_BYTE_MARK[*length]); @@ -397,8 +498,8 @@ namespace tinyxml2 else { return 0; } + TIXMLASSERT(digit < 16); TIXMLASSERT(digit == 0 || mult <= UINT_MAX / digit); - TIXMLASSERT(digit >= 0 && digit < 16); const unsigned int digitScaled = mult * digit; TIXMLASSERT(ucs <= ULONG_MAX - digitScaled); ucs += digitScaled; @@ -427,6 +528,7 @@ namespace tinyxml2 while (*q != '#') { if (*q >= '0' && *q <= '9') { const unsigned int digit = *q - '0'; + TIXMLASSERT(digit < 10); TIXMLASSERT(digit == 0 || mult <= UINT_MAX / digit); const unsigned int digitScaled = 
mult * digit; TIXMLASSERT(ucs <= ULONG_MAX - digitScaled); @@ -462,12 +564,12 @@ namespace tinyxml2 void XMLUtil::ToStr(bool v, char* buffer, int bufferSize) { - TIXML_SNPRINTF(buffer, bufferSize, "%d", v ? 1 : 0); + TIXML_SNPRINTF(buffer, bufferSize, "%s", v ? writeBoolTrue : writeBoolFalse); } /* - ToStr() of a number is a very tricky topic. - https://github.com/leethomason/tinyxml2/issues/106 + ToStr() of a number is a very tricky topic. + https://github.com/leethomason/tinyxml2/issues/106 */ void XMLUtil::ToStr(float v, char* buffer, int bufferSize) { @@ -481,6 +583,13 @@ namespace tinyxml2 } + void XMLUtil::ToStr(int64_t v, char* buffer, int bufferSize) + { + // horrible syntax trick to make the compiler happy about %lld + TIXML_SNPRINTF(buffer, bufferSize, "%lld", (long long)v); + } + + bool XMLUtil::ToInt(const char* str, int* value) { if (TIXML_SSCANF(str, "%d", value) == 1) { @@ -524,6 +633,7 @@ namespace tinyxml2 return false; } + bool XMLUtil::ToDouble(const char* str, double* value) { if (TIXML_SSCANF(str, "%lf", value) == 1) { @@ -533,72 +643,78 @@ namespace tinyxml2 } + bool XMLUtil::ToInt64(const char* str, int64_t* value) + { + long long v = 0; // horrible syntax trick to make the compiler happy about %lld + if (TIXML_SSCANF(str, "%lld", &v) == 1) { + *value = (int64_t)v; + return true; + } + return false; + } + + char* XMLDocument::Identify(char* p, XMLNode** node) { TIXMLASSERT(node); TIXMLASSERT(p); char* const start = p; - p = XMLUtil::SkipWhiteSpace(p); + int const startLine = _parseCurLineNum; + p = XMLUtil::SkipWhiteSpace(p, &_parseCurLineNum); if (!*p) { *node = 0; TIXMLASSERT(p); return p; } - // What is this thing? 
- // These strings define the matching patters: + // These strings define the matching patterns: static const char* xmlHeader = { "_memPool = &_commentPool; + returnNode = CreateUnlinkedNode(_commentPool); + returnNode->_parseLineNum = _parseCurLineNum; p += xmlHeaderLen; } else if (XMLUtil::StringEqual(p, commentHeader, commentHeaderLen)) { - TIXMLASSERT(sizeof(XMLComment) == _commentPool.ItemSize()); - returnNode = new (_commentPool.Alloc()) XMLComment(this); - returnNode->_memPool = &_commentPool; + returnNode = CreateUnlinkedNode(_commentPool); + returnNode->_parseLineNum = _parseCurLineNum; p += commentHeaderLen; } else if (XMLUtil::StringEqual(p, cdataHeader, cdataHeaderLen)) { - TIXMLASSERT(sizeof(XMLText) == _textPool.ItemSize()); - XMLText* text = new (_textPool.Alloc()) XMLText(this); + XMLText* text = CreateUnlinkedNode(_textPool); returnNode = text; - returnNode->_memPool = &_textPool; + returnNode->_parseLineNum = _parseCurLineNum; p += cdataHeaderLen; text->SetCData(true); } else if (XMLUtil::StringEqual(p, dtdHeader, dtdHeaderLen)) { - TIXMLASSERT(sizeof(XMLUnknown) == _commentPool.ItemSize()); - returnNode = new (_commentPool.Alloc()) XMLUnknown(this); - returnNode->_memPool = &_commentPool; + returnNode = CreateUnlinkedNode(_commentPool); + returnNode->_parseLineNum = _parseCurLineNum; p += dtdHeaderLen; } else if (XMLUtil::StringEqual(p, elementHeader, elementHeaderLen)) { - TIXMLASSERT(sizeof(XMLElement) == _elementPool.ItemSize()); - returnNode = new (_elementPool.Alloc()) XMLElement(this); - returnNode->_memPool = &_elementPool; + returnNode = CreateUnlinkedNode(_elementPool); + returnNode->_parseLineNum = _parseCurLineNum; p += elementHeaderLen; } else { - TIXMLASSERT(sizeof(XMLText) == _textPool.ItemSize()); - returnNode = new (_textPool.Alloc()) XMLText(this); - returnNode->_memPool = &_textPool; + returnNode = CreateUnlinkedNode(_textPool); + returnNode->_parseLineNum = _parseCurLineNum; // Report line of first non-whitespace character p = 
start; // Back it up, all the text counts. + _parseCurLineNum = startLine; } TIXMLASSERT(returnNode); @@ -627,8 +743,11 @@ namespace tinyxml2 XMLNode::XMLNode(XMLDocument* doc) : _document(doc), _parent(0), + _value(), + _parseLineNum(0), _firstChild(0), _lastChild(0), _prev(0), _next(0), + _userData(0), _memPool(0) { } @@ -644,6 +763,9 @@ namespace tinyxml2 const char* XMLNode::Value() const { + // Edge case: XMLDocuments don't have a Value. Return null. + if (this->ToDocument()) + return 0; return _value.GetStr(); } @@ -657,15 +779,24 @@ namespace tinyxml2 } } + XMLNode* XMLNode::DeepClone(XMLDocument* target) const + { + XMLNode* clone = this->ShallowClone(target); + if (!clone) return 0; + + for (const XMLNode* child = this->FirstChild(); child; child = child->NextSibling()) { + XMLNode* childClone = child->DeepClone(target); + TIXMLASSERT(childClone); + clone->InsertEndChild(childClone); + } + return clone; + } void XMLNode::DeleteChildren() { while (_firstChild) { - TIXMLASSERT(_firstChild->_document == _document); - XMLNode* node = _firstChild; - Unlink(node); - - DeleteNode(node); + TIXMLASSERT(_lastChild); + DeleteChild(_firstChild); } _firstChild = _lastChild = 0; } @@ -675,6 +806,7 @@ namespace tinyxml2 { TIXMLASSERT(child); TIXMLASSERT(child->_document == _document); + TIXMLASSERT(child->_parent == this); if (child == _firstChild) { _firstChild = _firstChild->_next; } @@ -688,6 +820,8 @@ namespace tinyxml2 if (child->_next) { child->_next->_prev = child->_prev; } + child->_next = 0; + child->_prev = 0; child->_parent = 0; } @@ -697,6 +831,10 @@ namespace tinyxml2 TIXMLASSERT(node); TIXMLASSERT(node->_document == _document); TIXMLASSERT(node->_parent == this); + Unlink(node); + TIXMLASSERT(node->_prev == 0); + TIXMLASSERT(node->_next == 0); + TIXMLASSERT(node->_parent == 0); DeleteNode(node); } @@ -776,6 +914,13 @@ namespace tinyxml2 TIXMLASSERT(false); return 0; } + if (afterThis == addThis) { + // Current state: BeforeThis -> AddThis -> OneAfterAddThis 
+ // Now AddThis must disappear from it's location and then + // reappear between BeforeThis and OneAfterAddThis. + // So just leave it where it is. + return addThis; + } if (afterThis->_next == 0) { // The last node or the only node. @@ -793,40 +938,35 @@ namespace tinyxml2 - const XMLElement* XMLNode::FirstChildElement(const char* value) const + const XMLElement* XMLNode::FirstChildElement(const char* name) const { - for (XMLNode* node = _firstChild; node; node = node->_next) { - XMLElement* element = node->ToElement(); + for (const XMLNode* node = _firstChild; node; node = node->_next) { + const XMLElement* element = node->ToElementWithName(name); if (element) { - if (!value || XMLUtil::StringEqual(element->Name(), value)) { - return element; - } + return element; } } return 0; } - const XMLElement* XMLNode::LastChildElement(const char* value) const + const XMLElement* XMLNode::LastChildElement(const char* name) const { - for (XMLNode* node = _lastChild; node; node = node->_prev) { - XMLElement* element = node->ToElement(); + for (const XMLNode* node = _lastChild; node; node = node->_prev) { + const XMLElement* element = node->ToElementWithName(name); if (element) { - if (!value || XMLUtil::StringEqual(element->Name(), value)) { - return element; - } + return element; } } return 0; } - const XMLElement* XMLNode::NextSiblingElement(const char* value) const + const XMLElement* XMLNode::NextSiblingElement(const char* name) const { - for (XMLNode* node = this->_next; node; node = node->_next) { - const XMLElement* element = node->ToElement(); - if (element - && (!value || XMLUtil::StringEqual(value, node->Value()))) { + for (const XMLNode* node = _next; node; node = node->_next) { + const XMLElement* element = node->ToElementWithName(name); + if (element) { return element; } } @@ -834,12 +974,11 @@ namespace tinyxml2 } - const XMLElement* XMLNode::PreviousSiblingElement(const char* value) const + const XMLElement* XMLNode::PreviousSiblingElement(const char* name) 
const { - for (XMLNode* node = _prev; node; node = node->_prev) { - const XMLElement* element = node->ToElement(); - if (element - && (!value || XMLUtil::StringEqual(value, node->Value()))) { + for (const XMLNode* node = _prev; node; node = node->_prev) { + const XMLElement* element = node->ToElementWithName(name); + if (element) { return element; } } @@ -847,7 +986,7 @@ namespace tinyxml2 } - char* XMLNode::ParseDeep(char* p, StrPair* parentEnd) + char* XMLNode::ParseDeep(char* p, StrPair* parentEndTag, int* curLineNumPtr) { // This is a recursive method, but thinking about it "at the current level" // it is a pretty simple flat list: @@ -870,26 +1009,50 @@ namespace tinyxml2 XMLNode* node = 0; p = _document->Identify(p, &node); + TIXMLASSERT(p); if (node == 0) { break; } + int initialLineNum = node->_parseLineNum; + StrPair endTag; - p = node->ParseDeep(p, &endTag); + p = node->ParseDeep(p, &endTag, curLineNumPtr); if (!p) { DeleteNode(node); if (!_document->Error()) { - _document->SetError(XML_ERROR_PARSING, 0, 0); + _document->SetError(XML_ERROR_PARSING, initialLineNum, 0); } break; } + XMLDeclaration* decl = node->ToDeclaration(); + if (decl) { + // Declarations are only allowed at document level + bool wellLocated = (ToDocument() != 0); + if (wellLocated) { + // Multiple declarations are allowed but all declarations + // must occur before anything else + for (const XMLNode* existingNode = _document->FirstChild(); existingNode; existingNode = existingNode->NextSibling()) { + if (!existingNode->ToDeclaration()) { + wellLocated = false; + break; + } + } + } + if (!wellLocated) { + _document->SetError(XML_ERROR_PARSING_DECLARATION, initialLineNum, "XMLDeclaration value=%s", decl->Value()); + DeleteNode(node); + break; + } + } + XMLElement* ele = node->ToElement(); if (ele) { // We read the end tag. Return it to the parent. 
if (ele->ClosingType() == XMLElement::CLOSING) { - if (parentEnd) { - ele->_value.TransferTo(parentEnd); + if (parentEndTag) { + ele->_value.TransferTo(parentEndTag); } node->_memPool->SetTracked(); // created and then immediately deleted. DeleteNode(node); @@ -908,12 +1071,12 @@ namespace tinyxml2 if (ele->ClosingType() != XMLElement::OPEN) { mismatch = true; } - else if (!XMLUtil::StringEqual(endTag.GetStr(), node->Value())) { + else if (!XMLUtil::StringEqual(endTag.GetStr(), ele->Name())) { mismatch = true; } } if (mismatch) { - _document->SetError(XML_ERROR_MISMATCHED_ELEMENT, node->Value(), 0); + _document->SetError(XML_ERROR_MISMATCHED_ELEMENT, initialLineNum, "XMLElement name=%s", ele->Name()); DeleteNode(node); break; } @@ -923,11 +1086,16 @@ namespace tinyxml2 return 0; } - void XMLNode::DeleteNode(XMLNode* node) + /*static*/ void XMLNode::DeleteNode(XMLNode* node) { if (node == 0) { return; } + TIXMLASSERT(node->_document); + if (!node->ToDocument()) { + node->_document->MarkInUse(node); + } + MemPool* pool = node->_memPool; node->~XMLNode(); pool->Free(node); @@ -938,35 +1106,52 @@ namespace tinyxml2 TIXMLASSERT(insertThis); TIXMLASSERT(insertThis->_document == _document); - if (insertThis->_parent) + if (insertThis->_parent) { insertThis->_parent->Unlink(insertThis); - else + } + else { + insertThis->_document->MarkInUse(insertThis); insertThis->_memPool->SetTracked(); + } + } + + const XMLElement* XMLNode::ToElementWithName(const char* name) const + { + const XMLElement* element = this->ToElement(); + if (element == 0) { + return 0; + } + if (name == 0) { + return element; + } + if (XMLUtil::StringEqual(element->Name(), name)) { + return element; + } + return 0; } // --------- XMLText ---------- // - char* XMLText::ParseDeep(char* p, StrPair*) + char* XMLText::ParseDeep(char* p, StrPair*, int* curLineNumPtr) { - const char* start = p; if (this->CData()) { - p = _value.ParseText(p, "]]>", StrPair::NEEDS_NEWLINE_NORMALIZATION); + p = _value.ParseText(p, 
"]]>", StrPair::NEEDS_NEWLINE_NORMALIZATION, curLineNumPtr); if (!p) { - _document->SetError(XML_ERROR_PARSING_CDATA, start, 0); + _document->SetError(XML_ERROR_PARSING_CDATA, _parseLineNum, 0); } return p; } else { int flags = _document->ProcessEntities() ? StrPair::TEXT_ELEMENT : StrPair::TEXT_ELEMENT_LEAVE_ENTITIES; if (_document->WhitespaceMode() == COLLAPSE_WHITESPACE) { - flags |= StrPair::COLLAPSE_WHITESPACE; + flags |= StrPair::NEEDS_WHITESPACE_COLLAPSING; } - p = _value.ParseText(p, "<", flags); + p = _value.ParseText(p, "<", flags, curLineNumPtr); if (p && *p) { return p - 1; } if (!p) { - _document->SetError(XML_ERROR_PARSING_TEXT, start, 0); + _document->SetError(XML_ERROR_PARSING_TEXT, _parseLineNum, 0); } } return 0; @@ -986,6 +1171,7 @@ namespace tinyxml2 bool XMLText::ShallowEqual(const XMLNode* compare) const { + TIXMLASSERT(compare); const XMLText* text = compare->ToText(); return (text && XMLUtil::StringEqual(text->Value(), Value())); } @@ -1010,13 +1196,12 @@ namespace tinyxml2 } - char* XMLComment::ParseDeep(char* p, StrPair*) + char* XMLComment::ParseDeep(char* p, StrPair*, int* curLineNumPtr) { // Comment parses as text. - const char* start = p; - p = _value.ParseText(p, "-->", StrPair::COMMENT); + p = _value.ParseText(p, "-->", StrPair::COMMENT, curLineNumPtr); if (p == 0) { - _document->SetError(XML_ERROR_PARSING_COMMENT, start, 0); + _document->SetError(XML_ERROR_PARSING_COMMENT, _parseLineNum, 0); } return p; } @@ -1060,13 +1245,12 @@ namespace tinyxml2 } - char* XMLDeclaration::ParseDeep(char* p, StrPair*) + char* XMLDeclaration::ParseDeep(char* p, StrPair*, int* curLineNumPtr) { // Declaration parses as text. 
- const char* start = p; - p = _value.ParseText(p, "?>", StrPair::NEEDS_NEWLINE_NORMALIZATION); + p = _value.ParseText(p, "?>", StrPair::NEEDS_NEWLINE_NORMALIZATION, curLineNumPtr); if (p == 0) { - _document->SetError(XML_ERROR_PARSING_DECLARATION, start, 0); + _document->SetError(XML_ERROR_PARSING_DECLARATION, _parseLineNum, 0); } return p; } @@ -1109,14 +1293,12 @@ namespace tinyxml2 } - char* XMLUnknown::ParseDeep(char* p, StrPair*) + char* XMLUnknown::ParseDeep(char* p, StrPair*, int* curLineNumPtr) { // Unknown parses as text. - const char* start = p; - - p = _value.ParseText(p, ">", StrPair::NEEDS_NEWLINE_NORMALIZATION); + p = _value.ParseText(p, ">", StrPair::NEEDS_NEWLINE_NORMALIZATION, curLineNumPtr); if (!p) { - _document->SetError(XML_ERROR_PARSING_UNKNOWN, start, 0); + _document->SetError(XML_ERROR_PARSING_UNKNOWN, _parseLineNum, 0); } return p; } @@ -1158,7 +1340,7 @@ namespace tinyxml2 return _value.GetStr(); } - char* XMLAttribute::ParseDeep(char* p, bool processEntities) + char* XMLAttribute::ParseDeep(char* p, bool processEntities, int* curLineNumPtr) { // Parse using the name rules: bug fix, was using ParseText before p = _name.ParseName(p); @@ -1167,13 +1349,13 @@ namespace tinyxml2 } // Skip white space before = - p = XMLUtil::SkipWhiteSpace(p); + p = XMLUtil::SkipWhiteSpace(p, curLineNumPtr); if (*p != '=') { return 0; } ++p; // move up to opening quote - p = XMLUtil::SkipWhiteSpace(p); + p = XMLUtil::SkipWhiteSpace(p, curLineNumPtr); if (*p != '\"' && *p != '\'') { return 0; } @@ -1181,7 +1363,7 @@ namespace tinyxml2 char endTag[2] = { *p, 0 }; ++p; // move past opening quote - p = _value.ParseText(p, endTag, processEntities ? StrPair::ATTRIBUTE_VALUE : StrPair::ATTRIBUTE_VALUE_LEAVE_ENTITIES); + p = _value.ParseText(p, endTag, processEntities ? 
StrPair::ATTRIBUTE_VALUE : StrPair::ATTRIBUTE_VALUE_LEAVE_ENTITIES, curLineNumPtr); return p; } @@ -1195,7 +1377,7 @@ namespace tinyxml2 XMLError XMLAttribute::QueryIntValue(int* value) const { if (XMLUtil::ToInt(Value(), value)) { - return XML_NO_ERROR; + return XML_SUCCESS; } return XML_WRONG_ATTRIBUTE_TYPE; } @@ -1204,7 +1386,16 @@ namespace tinyxml2 XMLError XMLAttribute::QueryUnsignedValue(unsigned int* value) const { if (XMLUtil::ToUnsigned(Value(), value)) { - return XML_NO_ERROR; + return XML_SUCCESS; + } + return XML_WRONG_ATTRIBUTE_TYPE; + } + + + XMLError XMLAttribute::QueryInt64Value(int64_t* value) const + { + if (XMLUtil::ToInt64(Value(), value)) { + return XML_SUCCESS; } return XML_WRONG_ATTRIBUTE_TYPE; } @@ -1213,7 +1404,7 @@ namespace tinyxml2 XMLError XMLAttribute::QueryBoolValue(bool* value) const { if (XMLUtil::ToBool(Value(), value)) { - return XML_NO_ERROR; + return XML_SUCCESS; } return XML_WRONG_ATTRIBUTE_TYPE; } @@ -1222,7 +1413,7 @@ namespace tinyxml2 XMLError XMLAttribute::QueryFloatValue(float* value) const { if (XMLUtil::ToFloat(Value(), value)) { - return XML_NO_ERROR; + return XML_SUCCESS; } return XML_WRONG_ATTRIBUTE_TYPE; } @@ -1231,7 +1422,7 @@ namespace tinyxml2 XMLError XMLAttribute::QueryDoubleValue(double* value) const { if (XMLUtil::ToDouble(Value(), value)) { - return XML_NO_ERROR; + return XML_SUCCESS; } return XML_WRONG_ATTRIBUTE_TYPE; } @@ -1259,6 +1450,15 @@ namespace tinyxml2 } + void XMLAttribute::SetAttribute(int64_t v) + { + char buf[BUF_SIZE]; + XMLUtil::ToStr(v, buf, BUF_SIZE); + _value.SetStr(buf); + } + + + void XMLAttribute::SetAttribute(bool v) { char buf[BUF_SIZE]; @@ -1283,7 +1483,7 @@ namespace tinyxml2 // --------- XMLElement ---------- // XMLElement::XMLElement(XMLDocument* doc) : XMLNode(doc), - _closingType(0), + _closingType(OPEN), _rootAttribute(0) { } @@ -1322,6 +1522,47 @@ namespace tinyxml2 return 0; } + int XMLElement::IntAttribute(const char* name, int defaultValue) const + { + int i = 
defaultValue; + QueryIntAttribute(name, &i); + return i; + } + + unsigned XMLElement::UnsignedAttribute(const char* name, unsigned defaultValue) const + { + unsigned i = defaultValue; + QueryUnsignedAttribute(name, &i); + return i; + } + + int64_t XMLElement::Int64Attribute(const char* name, int64_t defaultValue) const + { + int64_t i = defaultValue; + QueryInt64Attribute(name, &i); + return i; + } + + bool XMLElement::BoolAttribute(const char* name, bool defaultValue) const + { + bool b = defaultValue; + QueryBoolAttribute(name, &b); + return b; + } + + double XMLElement::DoubleAttribute(const char* name, double defaultValue) const + { + double d = defaultValue; + QueryDoubleAttribute(name, &d); + return d; + } + + float XMLElement::FloatAttribute(const char* name, float defaultValue) const + { + float f = defaultValue; + QueryFloatAttribute(name, &f); + return f; + } const char* XMLElement::GetText() const { @@ -1359,6 +1600,14 @@ namespace tinyxml2 } + void XMLElement::SetText(int64_t v) + { + char buf[BUF_SIZE]; + XMLUtil::ToStr(v, buf, BUF_SIZE); + SetText(buf); + } + + void XMLElement::SetText(bool v) { char buf[BUF_SIZE]; @@ -1409,6 +1658,19 @@ namespace tinyxml2 } + XMLError XMLElement::QueryInt64Text(int64_t* ival) const + { + if (FirstChild() && FirstChild()->ToText()) { + const char* t = FirstChild()->Value(); + if (XMLUtil::ToInt64(t, ival)) { + return XML_SUCCESS; + } + return XML_CAN_NOT_CONVERT_TEXT; + } + return XML_NO_TEXT_NODE; + } + + XMLError XMLElement::QueryBoolText(bool* bval) const { if (FirstChild() && FirstChild()->ToText()) { @@ -1447,6 +1709,47 @@ namespace tinyxml2 return XML_NO_TEXT_NODE; } + int XMLElement::IntText(int defaultValue) const + { + int i = defaultValue; + QueryIntText(&i); + return i; + } + + unsigned XMLElement::UnsignedText(unsigned defaultValue) const + { + unsigned i = defaultValue; + QueryUnsignedText(&i); + return i; + } + + int64_t XMLElement::Int64Text(int64_t defaultValue) const + { + int64_t i = defaultValue; + 
QueryInt64Text(&i); + return i; + } + + bool XMLElement::BoolText(bool defaultValue) const + { + bool b = defaultValue; + QueryBoolText(&b); + return b; + } + + double XMLElement::DoubleText(double defaultValue) const + { + double d = defaultValue; + QueryDoubleText(&d); + return d; + } + + float XMLElement::FloatText(float defaultValue) const + { + float f = defaultValue; + QueryFloatText(&f); + return f; + } XMLAttribute* XMLElement::FindOrCreateAttribute(const char* name) @@ -1461,17 +1764,17 @@ namespace tinyxml2 } } if (!attrib) { - TIXMLASSERT(sizeof(XMLAttribute) == _document->_attributePool.ItemSize()); - attrib = new (_document->_attributePool.Alloc()) XMLAttribute(); - attrib->_memPool = &_document->_attributePool; + attrib = CreateAttribute(); + TIXMLASSERT(attrib); if (last) { + TIXMLASSERT(last->_next == 0); last->_next = attrib; } else { + TIXMLASSERT(_rootAttribute == 0); _rootAttribute = attrib; } attrib->SetName(name); - attrib->_memPool->SetTracked(); // always created and linked. } return attrib; } @@ -1496,30 +1799,30 @@ namespace tinyxml2 } - char* XMLElement::ParseAttributes(char* p) + char* XMLElement::ParseAttributes(char* p, int* curLineNumPtr) { - const char* start = p; XMLAttribute* prevAttribute = 0; // Read the attributes. while (p) { - p = XMLUtil::SkipWhiteSpace(p); + p = XMLUtil::SkipWhiteSpace(p, curLineNumPtr); if (!(*p)) { - _document->SetError(XML_ERROR_PARSING_ELEMENT, start, Name()); + _document->SetError(XML_ERROR_PARSING_ELEMENT, _parseLineNum, "XMLElement name=%s", Name()); return 0; } // attribute. 
if (XMLUtil::IsNameStartChar(*p)) { - TIXMLASSERT(sizeof(XMLAttribute) == _document->_attributePool.ItemSize()); - XMLAttribute* attrib = new (_document->_attributePool.Alloc()) XMLAttribute(); - attrib->_memPool = &_document->_attributePool; - attrib->_memPool->SetTracked(); + XMLAttribute* attrib = CreateAttribute(); + TIXMLASSERT(attrib); + attrib->_parseLineNum = _document->_parseCurLineNum; - p = attrib->ParseDeep(p, _document->ProcessEntities()); + int attrLineNum = attrib->_parseLineNum; + + p = attrib->ParseDeep(p, _document->ProcessEntities(), curLineNumPtr); if (!p || Attribute(attrib->Name())) { DeleteAttribute(attrib); - _document->SetError(XML_ERROR_PARSING_ATTRIBUTE, start, p); + _document->SetError(XML_ERROR_PARSING_ATTRIBUTE, attrLineNum, "XMLElement name=%s", Name()); return 0; } // There is a minor bug here: if the attribute in the source xml @@ -1528,25 +1831,27 @@ namespace tinyxml2 // avoids re-scanning the attribute list. Preferring performance for // now, may reconsider in the future. if (prevAttribute) { + TIXMLASSERT(prevAttribute->_next == 0); prevAttribute->_next = attrib; } else { + TIXMLASSERT(_rootAttribute == 0); _rootAttribute = attrib; } prevAttribute = attrib; } // end of the tag - else if (*p == '/' && *(p + 1) == '>') { - _closingType = CLOSED; - return p + 2; // done; sealed element. - } - // end of the tag else if (*p == '>') { ++p; break; } + // end of the tag + else if (*p == '/' && *(p + 1) == '>') { + _closingType = CLOSED; + return p + 2; // done; sealed element. 
+ } else { - _document->SetError(XML_ERROR_PARSING_ELEMENT, start, p); + _document->SetError(XML_ERROR_PARSING_ELEMENT, _parseLineNum, 0); return 0; } } @@ -1563,14 +1868,24 @@ namespace tinyxml2 pool->Free(attribute); } + XMLAttribute* XMLElement::CreateAttribute() + { + TIXMLASSERT(sizeof(XMLAttribute) == _document->_attributePool.ItemSize()); + XMLAttribute* attrib = new (_document->_attributePool.Alloc()) XMLAttribute(); + TIXMLASSERT(attrib); + attrib->_memPool = &_document->_attributePool; + attrib->_memPool->SetTracked(); + return attrib; + } + // // // foobar // - char* XMLElement::ParseDeep(char* p, StrPair* strPair) + char* XMLElement::ParseDeep(char* p, StrPair* parentEndTag, int* curLineNumPtr) { // Read the element name. - p = XMLUtil::SkipWhiteSpace(p); + p = XMLUtil::SkipWhiteSpace(p, curLineNumPtr); // The closing element is the form. It is // parsed just like a regular element then deleted from @@ -1585,12 +1900,12 @@ namespace tinyxml2 return 0; } - p = ParseAttributes(p); - if (!p || !*p || _closingType) { + p = ParseAttributes(p, curLineNumPtr); + if (!p || !*p || _closingType != OPEN) { return p; } - p = XMLNode::ParseDeep(p, strPair); + p = XMLNode::ParseDeep(p, parentEndTag, curLineNumPtr); return p; } @@ -1613,7 +1928,7 @@ namespace tinyxml2 { TIXMLASSERT(compare); const XMLElement* other = compare->ToElement(); - if (other && XMLUtil::StringEqual(other->Value(), Value())) { + if (other && XMLUtil::StringEqual(other->Name(), Name())) { const XMLAttribute* a = FirstAttribute(); const XMLAttribute* b = other->FirstAttribute(); @@ -1659,10 +1974,10 @@ namespace tinyxml2 "XML_ERROR_FILE_NOT_FOUND", "XML_ERROR_FILE_COULD_NOT_BE_OPENED", "XML_ERROR_FILE_READ_ERROR", - "XML_ERROR_ELEMENT_MISMATCH", + "UNUSED_XML_ERROR_ELEMENT_MISMATCH", "XML_ERROR_PARSING_ELEMENT", "XML_ERROR_PARSING_ATTRIBUTE", - "XML_ERROR_IDENTIFYING_TAG", + "UNUSED_XML_ERROR_IDENTIFYING_TAG", "XML_ERROR_PARSING_TEXT", "XML_ERROR_PARSING_CDATA", "XML_ERROR_PARSING_COMMENT", @@ 
-1676,17 +1991,24 @@ namespace tinyxml2 }; - XMLDocument::XMLDocument(bool processEntities, Whitespace whitespace) : + XMLDocument::XMLDocument(bool processEntities, Whitespace whitespaceMode) : XMLNode(0), _writeBOM(false), _processEntities(processEntities), - _errorID(XML_NO_ERROR), - _whitespace(whitespace), - _errorStr1(0), - _errorStr2(0), - _charBuffer(0) + _errorID(XML_SUCCESS), + _whitespaceMode(whitespaceMode), + _errorStr(), + _errorLineNum(0), + _charBuffer(0), + _parseCurLineNum(0), + _unlinked(), + _elementPool(), + _attributePool(), + _textPool(), + _commentPool() { - _document = this; // avoid warning about 'this' in initializer list + // avoid VC++ C4355 warning about 'this' in initializer list (C4355 is off by default in VS2012+) + _document = this; } @@ -1696,16 +2018,30 @@ namespace tinyxml2 } + void XMLDocument::MarkInUse(XMLNode* node) + { + TIXMLASSERT(node); + TIXMLASSERT(node->_parent == 0); + + for (int i = 0; i < _unlinked.Size(); ++i) { + if (node == _unlinked[i]) { + _unlinked.SwapRemove(i); + break; + } + } + } + void XMLDocument::Clear() { DeleteChildren(); + while (_unlinked.Size()) { + DeleteNode(_unlinked[0]); // Will remove from _unlinked as part of delete. + } -#ifdef DEBUG +#ifdef TINYXML2_DEBUG const bool hadError = Error(); #endif - _errorID = XML_NO_ERROR; - _errorStr1 = 0; - _errorStr2 = 0; + ClearError(); delete[] _charBuffer; _charBuffer = 0; @@ -1717,7 +2053,7 @@ namespace tinyxml2 _attributePool.Trace("attribute"); #endif -#ifdef DEBUG +#ifdef TINYXML2_DEBUG if (!hadError) { TIXMLASSERT(_elementPool.CurrentAllocs() == _elementPool.Untracked()); TIXMLASSERT(_attributePool.CurrentAllocs() == _attributePool.Untracked()); @@ -1728,11 +2064,22 @@ namespace tinyxml2 } + void XMLDocument::DeepCopy(XMLDocument* target) const + { + TIXMLASSERT(target); + if (target == this) { + return; // technically success - a no-op. 
+ } + + target->Clear(); + for (const XMLNode* node = this->FirstChild(); node; node = node->NextSibling()) { + target->InsertEndChild(node->DeepClone(target)); + } + } + XMLElement* XMLDocument::NewElement(const char* name) { - TIXMLASSERT(sizeof(XMLElement) == _elementPool.ItemSize()); - XMLElement* ele = new (_elementPool.Alloc()) XMLElement(this); - ele->_memPool = &_elementPool; + XMLElement* ele = CreateUnlinkedNode(_elementPool); ele->SetName(name); return ele; } @@ -1740,9 +2087,7 @@ namespace tinyxml2 XMLComment* XMLDocument::NewComment(const char* str) { - TIXMLASSERT(sizeof(XMLComment) == _commentPool.ItemSize()); - XMLComment* comment = new (_commentPool.Alloc()) XMLComment(this); - comment->_memPool = &_commentPool; + XMLComment* comment = CreateUnlinkedNode(_commentPool); comment->SetValue(str); return comment; } @@ -1750,9 +2095,7 @@ namespace tinyxml2 XMLText* XMLDocument::NewText(const char* str) { - TIXMLASSERT(sizeof(XMLText) == _textPool.ItemSize()); - XMLText* text = new (_textPool.Alloc()) XMLText(this); - text->_memPool = &_textPool; + XMLText* text = CreateUnlinkedNode(_textPool); text->SetValue(str); return text; } @@ -1760,9 +2103,7 @@ namespace tinyxml2 XMLDeclaration* XMLDocument::NewDeclaration(const char* str) { - TIXMLASSERT(sizeof(XMLDeclaration) == _commentPool.ItemSize()); - XMLDeclaration* dec = new (_commentPool.Alloc()) XMLDeclaration(this); - dec->_memPool = &_commentPool; + XMLDeclaration* dec = CreateUnlinkedNode(_commentPool); dec->SetValue(str ? 
str : "xml version=\"1.0\" encoding=\"UTF-8\""); return dec; } @@ -1770,9 +2111,7 @@ namespace tinyxml2 XMLUnknown* XMLDocument::NewUnknown(const char* str) { - TIXMLASSERT(sizeof(XMLUnknown) == _commentPool.ItemSize()); - XMLUnknown* unk = new (_commentPool.Alloc()) XMLUnknown(this); - unk->_memPool = &_commentPool; + XMLUnknown* unk = CreateUnlinkedNode(_commentPool); unk->SetValue(str); return unk; } @@ -1816,7 +2155,7 @@ namespace tinyxml2 Clear(); FILE* fp = callfopen(filename, "rb"); if (!fp) { - SetError(XML_ERROR_FILE_NOT_FOUND, filename, 0); + SetError(XML_ERROR_FILE_NOT_FOUND, 0, "filename=%s", filename ? filename : ""); return _errorID; } LoadFile(fp); @@ -1824,6 +2163,28 @@ namespace tinyxml2 return _errorID; } + // This is likely overengineered template art to have a check that unsigned long value incremented + // by one still fits into size_t. If size_t type is larger than unsigned long type + // (x86_64-w64-mingw32 target) then the check is redundant and gcc and clang emit + // -Wtype-limits warning. This piece makes the compiler select code with a check when a check + // is useful and code with no check when a check is redundant depending on how size_t and unsigned long + // types sizes relate to each other. 
+ template + = sizeof(size_t))> + struct LongFitsIntoSizeTMinusOne { + static bool Fits(unsigned long value) + { + return value < (size_t)-1; + } + }; + + template <> + struct LongFitsIntoSizeTMinusOne { + static bool Fits(unsigned long) + { + return true; + } + }; XMLError XMLDocument::LoadFile(FILE* fp) { @@ -1842,13 +2203,21 @@ namespace tinyxml2 SetError(XML_ERROR_FILE_READ_ERROR, 0, 0); return _errorID; } + TIXMLASSERT(filelength >= 0); - const size_t size = filelength; - if (size == 0) { + if (!LongFitsIntoSizeTMinusOne<>::Fits(filelength)) { + // Cannot handle files which won't fit in buffer together with null terminator + SetError(XML_ERROR_FILE_READ_ERROR, 0, 0); + return _errorID; + } + + if (filelength == 0) { SetError(XML_ERROR_EMPTY_DOCUMENT, 0, 0); return _errorID; } + const size_t size = filelength; + TIXMLASSERT(_charBuffer == 0); _charBuffer = new char[size + 1]; size_t read = fread(_charBuffer, 1, size, fp); if (read != size) { @@ -1867,7 +2236,7 @@ namespace tinyxml2 { FILE* fp = callfopen(filename, "w"); if (!fp) { - SetError(XML_ERROR_FILE_COULD_NOT_BE_OPENED, filename, 0); + SetError(XML_ERROR_FILE_COULD_NOT_BE_OPENED, 0, "filename=%s", filename ? filename : ""); return _errorID; } SaveFile(fp, compact); @@ -1878,6 +2247,9 @@ namespace tinyxml2 XMLError XMLDocument::SaveFile(FILE* fp, bool compact) { + // Clear any error from the last save, otherwise it will get reported + // for *this* call. 
+ ClearError(); XMLPrinter stream(fp, compact); Print(&stream); return _errorID; @@ -1895,6 +2267,7 @@ namespace tinyxml2 if (len == (size_t)(-1)) { len = strlen(p); } + TIXMLASSERT(_charBuffer == 0); _charBuffer = new char[len + 1]; memcpy(_charBuffer, p, len); _charBuffer[len] = 0; @@ -1916,78 +2289,103 @@ namespace tinyxml2 void XMLDocument::Print(XMLPrinter* streamer) const { - XMLPrinter stdStreamer(stdout); - if (!streamer) { - streamer = &stdStreamer; + if (streamer) { + Accept(streamer); + } + else { + XMLPrinter stdoutStreamer(stdout); + Accept(&stdoutStreamer); } - Accept(streamer); } - void XMLDocument::SetError(XMLError error, const char* str1, const char* str2) + void XMLDocument::SetError(XMLError error, int lineNum, const char* format, ...) { TIXMLASSERT(error >= 0 && error < XML_ERROR_COUNT); _errorID = error; - _errorStr1 = str1; - _errorStr2 = str2; + _errorLineNum = lineNum; + _errorStr.Reset(); + + size_t BUFFER_SIZE = 1000; + char* buffer = new char[BUFFER_SIZE]; + + TIXML_SNPRINTF(buffer, BUFFER_SIZE, "Error=%s ErrorID=%d (0x%x) Line number=%d", ErrorIDToName(error), int(error), int(error), lineNum); + + if (format) { + size_t len = strlen(buffer); + TIXML_SNPRINTF(buffer + len, BUFFER_SIZE - len, ": "); + len = strlen(buffer); + + va_list va; + va_start(va, format); + TIXML_VSNPRINTF(buffer + len, BUFFER_SIZE - len, format, va); + va_end(va); + } + _errorStr.SetStr(buffer); + delete[] buffer; } - const char* XMLDocument::ErrorName() const + + /*static*/ const char* XMLDocument::ErrorIDToName(XMLError errorID) { - TIXMLASSERT(_errorID >= 0 && _errorID < XML_ERROR_COUNT); - return _errorNames[_errorID]; + TIXMLASSERT(errorID >= 0 && errorID < XML_ERROR_COUNT); + const char* errorName = _errorNames[errorID]; + TIXMLASSERT(errorName && errorName[0]); + return errorName; } - void XMLDocument::PrintError() const + const char* XMLDocument::ErrorStr() const { - if (Error()) { - static const int LEN = 20; - char buf1[LEN] = { 0 }; - char buf2[LEN] = { 
0 }; + return _errorStr.Empty() ? "" : _errorStr.GetStr(); + } - if (_errorStr1) { - TIXML_SNPRINTF(buf1, LEN, "%s", _errorStr1); - } - if (_errorStr2) { - TIXML_SNPRINTF(buf2, LEN, "%s", _errorStr2); - } - printf("XMLDocument error id=%d '%s' str1=%s str2=%s\n", - _errorID, ErrorName(), buf1, buf2); - } + void XMLDocument::PrintError() const + { + printf("%s\n", ErrorStr()); + } + + const char* XMLDocument::ErrorName() const + { + return ErrorIDToName(_errorID); } void XMLDocument::Parse() { TIXMLASSERT(NoChildren()); // Clear() must have been called previously TIXMLASSERT(_charBuffer); + _parseCurLineNum = 1; + _parseLineNum = 1; char* p = _charBuffer; - p = XMLUtil::SkipWhiteSpace(p); + p = XMLUtil::SkipWhiteSpace(p, &_parseCurLineNum); p = const_cast(XMLUtil::ReadBOM(p, &_writeBOM)); if (!*p) { SetError(XML_ERROR_EMPTY_DOCUMENT, 0, 0); return; } - ParseDeep(p, 0); + ParseDeep(p, 0, &_parseCurLineNum); } XMLPrinter::XMLPrinter(FILE* file, bool compact, int depth) : _elementJustOpened(false), + _stack(), _firstElement(true), _fp(file), _depth(depth), _textDepth(-1), _processEntities(true), - _compactMode(compact) + _compactMode(compact), + _buffer() { - for (int i = 0; i < ENTITY_RANGE; ++i) { + for (int i = 0; i= 1400 ) -#if defined(WINCE) - int len = 512; - do { - len = len * 2; - char* str = new char[len](); - len = _vsnprintf(str, len, format, va); - delete[] str; - } while (len < 0); -#else - int len = _vscprintf(format, va); -#endif -#else - int len = vsnprintf(0, 0, format, va); -#endif + const int len = TIXML_VSCPRINTF(format, va); // Close out and re-start the va-args va_end(va); + TIXMLASSERT(len >= 0); va_start(va, format); TIXMLASSERT(_buffer.Size() > 0 && _buffer[_buffer.Size() - 1] == 0); char* p = _buffer.PushArr(len) - 1; // back up over the null terminator. 
-#if defined(_MSC_VER) && (_MSC_VER >= 1400 ) -#if defined(WINCE) - _vsnprintf(p, len + 1, format, va); -#else - vsnprintf_s(p, len + 1, _TRUNCATE, format, va); -#endif -#else - vsnprintf(p, len + 1, format, va); -#endif + TIXML_VSNPRINTF(p, len + 1, format, va); } va_end(va); } + void XMLPrinter::Write(const char* data, size_t size) + { + if (_fp) { + fwrite(data, sizeof(char), size, _fp); + } + else { + char* p = _buffer.PushArr(static_cast(size)) - 1; // back up over the null terminator. + memcpy(p, data, size); + p[size] = 0; + } + } + + + void XMLPrinter::Putc(char ch) + { + if (_fp) { + fputc(ch, _fp); + } + else { + char* p = _buffer.PushArr(sizeof(char)) - 1; // back up over the null terminator. + p[0] = ch; + p[1] = 0; + } + } + + void XMLPrinter::PrintSpace(int depth) { - for (int i = 0; i < depth; ++i) { - Print(" "); + for (int i = 0; i 0 && *q < ENTITY_RANGE) { // Check for entities. If one is found, flush @@ -2062,25 +2466,39 @@ namespace tinyxml2 // entity, and keep looking. if (flag[(unsigned char)(*q)]) { while (p < q) { - Print("%c", *p); - ++p; + const size_t delta = q - p; + const int toPrint = (INT_MAX < delta) ? INT_MAX : (int)delta; + Write(p, toPrint); + p += toPrint; } - for (int i = 0; i < NUM_ENTITIES; ++i) { + bool entityPatternPrinted = false; + for (int i = 0; i 0)) { - Print("%s", p); + TIXMLASSERT(p <= q); + if (!_processEntities || (p < q)) { + const size_t delta = q - p; + const int toPrint = (INT_MAX < delta) ? 
INT_MAX : (int)delta; + Write(p, toPrint); } } @@ -2089,7 +2507,7 @@ namespace tinyxml2 { if (writeBOM) { static const unsigned char bom[] = { TIXML_UTF_LEAD_0, TIXML_UTF_LEAD_1, TIXML_UTF_LEAD_2, 0 }; - Print("%s", bom); + Write(reinterpret_cast< const char* >(bom)); } if (writeDec) { PushDeclaration("xml version=\"1.0\""); @@ -2103,13 +2521,15 @@ namespace tinyxml2 _stack.Push(name); if (_textDepth < 0 && !_firstElement && !compactMode) { - Print("\n"); + Putc('\n'); } if (!compactMode) { PrintSpace(_depth); } - Print("<%s", name); + Write("<"); + Write(name); + _elementJustOpened = true; _firstElement = false; ++_depth; @@ -2119,9 +2539,11 @@ namespace tinyxml2 void XMLPrinter::PushAttribute(const char* name, const char* value) { TIXMLASSERT(_elementJustOpened); - Print(" %s=\"", name); + Putc(' '); + Write(name); + Write("=\""); PrintString(value, false); - Print("\""); + Putc('\"'); } @@ -2141,6 +2563,14 @@ namespace tinyxml2 } + void XMLPrinter::PushAttribute(const char* name, int64_t v) + { + char buf[BUF_SIZE]; + XMLUtil::ToStr(v, buf, BUF_SIZE); + PushAttribute(name, buf); + } + + void XMLPrinter::PushAttribute(const char* name, bool v) { char buf[BUF_SIZE]; @@ -2163,21 +2593,23 @@ namespace tinyxml2 const char* name = _stack.Pop(); if (_elementJustOpened) { - Print("/>"); + Write("/>"); } else { if (_textDepth < 0 && !compactMode) { - Print("\n"); + Putc('\n'); PrintSpace(_depth); } - Print("", name); + Write(""); } if (_textDepth == _depth) { _textDepth = -1; } if (_depth == 0 && !compactMode) { - Print("\n"); + Putc('\n'); } _elementJustOpened = false; } @@ -2189,7 +2621,7 @@ namespace tinyxml2 return; } _elementJustOpened = false; - Print(">"); + Putc('>'); } @@ -2199,15 +2631,22 @@ namespace tinyxml2 SealElementIfJustOpened(); if (cdata) { - Print(""); + Write(""); } else { PrintString(text, true); } } + void XMLPrinter::PushText(int64_t value) + { + char buf[BUF_SIZE]; + XMLUtil::ToStr(value, buf, BUF_SIZE); + PushText(buf, false); + } + void 
XMLPrinter::PushText(int value) { char buf[BUF_SIZE]; @@ -2252,11 +2691,14 @@ namespace tinyxml2 { SealElementIfJustOpened(); if (_textDepth < 0 && !_firstElement && !_compactMode) { - Print("\n"); + Putc('\n'); PrintSpace(_depth); } _firstElement = false; - Print("", comment); + + Write(""); } @@ -2264,11 +2706,14 @@ namespace tinyxml2 { SealElementIfJustOpened(); if (_textDepth < 0 && !_firstElement && !_compactMode) { - Print("\n"); + Putc('\n'); PrintSpace(_depth); } _firstElement = false; - Print("", value); + + Write(""); } @@ -2276,11 +2721,14 @@ namespace tinyxml2 { SealElementIfJustOpened(); if (_textDepth < 0 && !_firstElement && !_compactMode) { - Print("\n"); + Putc('\n'); PrintSpace(_depth); } _firstElement = false; - Print("", value); + + Write("'); } @@ -2296,8 +2744,11 @@ namespace tinyxml2 bool XMLPrinter::VisitEnter(const XMLElement& element, const XMLAttribute* attribute) { - const XMLElement* parentElem = element.Parent()->ToElement(); - bool compactMode = parentElem ? CompactMode(*parentElem) : _compactMode; + const XMLElement* parentElem = 0; + if (element.Parent()) { + parentElem = element.Parent()->ToElement(); + } + const bool compactMode = parentElem ? CompactMode(*parentElem) : _compactMode; OpenElement(element.Name(), compactMode); while (attribute) { PushAttribute(attribute->Name(), attribute->Value()); @@ -2341,3 +2792,4 @@ namespace tinyxml2 } } // namespace tinyxml2 + diff --git a/src/3rd/Simd/SimdBase_tinyxml2.h b/src/3rd/Simd/SimdBase_tinyxml2.h index bda9804e..ac732fe1 100644 --- a/src/3rd/Simd/SimdBase_tinyxml2.h +++ b/src/3rd/Simd/SimdBase_tinyxml2.h @@ -24,38 +24,38 @@ distribution. 
#ifndef TINYXML2_INCLUDED #define TINYXML2_INCLUDED -#include "Simd/SimdConfig.h" - #if defined(ANDROID_NDK) || defined(__BORLANDC__) || defined(__QNXNTO__) # include # include # include # include # include -# include +# if defined(__PS3__) +# include +# endif #else # include # include # include # include # include -# include #endif +#include /* - TODO: intern strings instead of allocation. +TODO: intern strings instead of allocation. */ /* - gcc: - g++ -Wall -DDEBUG tinyxml2.cpp xmltest.cpp -o gccxmltest.exe +gcc: +g++ -Wall -DTINYXML2_DEBUG tinyxml2.cpp xmltest.cpp -o gccxmltest.exe - Formatting, Artistic Style: - AStyle.exe --style=1tbs --indent-switches --break-closing-brackets --indent-preprocessor tinyxml2.cpp tinyxml2.h +Formatting, Artistic Style: +AStyle.exe --style=1tbs --indent-switches --break-closing-brackets --indent-preprocessor tinyxml2.cpp tinyxml2.h */ -#if defined( _DEBUG ) || defined( DEBUG ) || defined (__DEBUG__) -# ifndef DEBUG -# define DEBUG +#if defined( _DEBUG ) || defined (__DEBUG__) +# ifndef TINYXML2_DEBUG +# define TINYXML2_DEBUG # endif #endif @@ -72,15 +72,17 @@ distribution. # else # define TINYXML2_LIB # endif +#elif __GNUC__ >= 4 +# define TINYXML2_LIB __attribute__((visibility("default"))) #else # define TINYXML2_LIB #endif -#if defined(DEBUG) +#if defined(TINYXML2_DEBUG) # if defined(_MSC_VER) # // "(void)0," is for suppressing C4127 warning in "assert(false)", "assert(true)" and the like -# define TIXMLASSERT( x ) if ( !((void)0,(x))) { __debugbreak(); } //if ( !(x)) WinDebugBreak() +# define TIXMLASSERT( x ) if ( !((void)0,(x))) { __debugbreak(); } # elif defined (ANDROID_NDK) # include # define TIXMLASSERT( x ) if ( !(x)) { __android_log_assert( "assert", "grinliz", "ASSERT in '%s' at %d.", __FILE__, __LINE__ ); } @@ -88,46 +90,22 @@ distribution. 
# include # define TIXMLASSERT assert # endif -# else -# define TIXMLASSERT( x ) {} -#endif - - -#if defined(_MSC_VER) && (_MSC_VER >= 1400 ) && (!defined WINCE) -// Microsoft visual studio, version 2005 and higher. -/*int _snprintf_s( - char *buffer, - size_t sizeOfBuffer, - size_t count, - const char *format [, - argument] ... -);*/ -inline int TIXML_SNPRINTF(char* buffer, size_t size, const char* format, ...) -{ - va_list va; - va_start(va, format); - int result = vsnprintf_s(buffer, size, _TRUNCATE, format, va); - va_end(va); - return result; -} -#define TIXML_SSCANF sscanf_s -#elif defined WINCE -#define TIXML_SNPRINTF _snprintf -#define TIXML_SSCANF sscanf #else -// GCC version 3 and higher -//#warning( "Using sn* functions." ) -#define TIXML_SNPRINTF snprintf -#define TIXML_SSCANF sscanf +# define TIXMLASSERT( x ) {} #endif + /* Versioning, past 1.0.14: - http://semver.org/ +http://semver.org/ */ -static const int TIXML2_MAJOR_VERSION = 3; -static const int TIXML2_MINOR_VERSION = 0; +static const int TIXML2_MAJOR_VERSION = 6; +static const int TIXML2_MINOR_VERSION = 1; static const int TIXML2_PATCH_VERSION = 0; +#define TINYXML2_MAJOR_VERSION 6 +#define TINYXML2_MINOR_VERSION 1 +#define TINYXML2_PATCH_VERSION 0 + namespace tinyxml2 { class XMLDocument; @@ -140,10 +118,10 @@ namespace tinyxml2 class XMLPrinter; /* - A class that wraps strings. Normally stores the start and end - pointers into the XML file itself, and will apply normalization - and entity translation if actually read. Can also store (and memory - manage) a traditional char[] + A class that wraps strings. Normally stores the start and end + pointers into the XML file itself, and will apply normalization + and entity translation if actually read. 
Can also store (and memory + manage) a traditional char[] */ class StrPair { @@ -151,7 +129,7 @@ namespace tinyxml2 enum { NEEDS_ENTITY_PROCESSING = 0x01, NEEDS_NEWLINE_NORMALIZATION = 0x02, - COLLAPSE_WHITESPACE = 0x04, + NEEDS_WHITESPACE_COLLAPSING = 0x04, TEXT_ELEMENT = NEEDS_ENTITY_PROCESSING | NEEDS_NEWLINE_NORMALIZATION, TEXT_ELEMENT_LEAVE_ENTITIES = NEEDS_NEWLINE_NORMALIZATION, @@ -165,6 +143,8 @@ namespace tinyxml2 ~StrPair(); void Set(char* start, char* end, int flags) { + TIXMLASSERT(start); + TIXMLASSERT(end); Reset(); _start = start; _end = end; @@ -184,13 +164,13 @@ namespace tinyxml2 void SetStr(const char* str, int flags = 0); - char* ParseText(char* in, const char* endTag, int strFlags); + char* ParseText(char* in, const char* endTag, int strFlags, int* curLineNumPtr); char* ParseName(char* in); void TransferTo(StrPair* other); + void Reset(); private: - void Reset(); void CollapseWhitespace(); enum { @@ -198,7 +178,6 @@ namespace tinyxml2 NEEDS_DELETE = 0x200 }; - // After parsing, if *_end != 0, it can be set to zero. int _flags; char* _start; char* _end; @@ -209,18 +188,19 @@ namespace tinyxml2 /* - A dynamic array of Plain Old Data. Doesn't support constructors, etc. - Has a small initial memory pool, so that low or no usage will not - cause a call to new/delete + A dynamic array of Plain Old Data. Doesn't support constructors, etc. 
+ Has a small initial memory pool, so that low or no usage will not + cause a call to new/delete */ - template + template class DynArray { public: - DynArray() { - _mem = _pool; - _allocated = INIT; - _size = 0; + DynArray() : + _mem(_pool), + _allocated(INITIAL_SIZE), + _size(0) + { } ~DynArray() { @@ -236,7 +216,8 @@ namespace tinyxml2 void Push(T t) { TIXMLASSERT(_size < INT_MAX); EnsureCapacity(_size + 1); - _mem[_size++] = t; + _mem[_size] = t; + ++_size; } T* PushArr(int count) { @@ -250,7 +231,8 @@ namespace tinyxml2 T Pop() { TIXMLASSERT(_size > 0); - return _mem[--_size]; + --_size; + return _mem[_size]; } void PopArr(int count) { @@ -283,14 +265,24 @@ namespace tinyxml2 } int Capacity() const { + TIXMLASSERT(_allocated >= INITIAL_SIZE); return _allocated; } + void SwapRemove(int i) { + TIXMLASSERT(i >= 0 && i < _size); + TIXMLASSERT(_size > 0); + _mem[i] = _mem[_size - 1]; + --_size; + } + const T* Mem() const { + TIXMLASSERT(_mem); return _mem; } T* Mem() { + TIXMLASSERT(_mem); return _mem; } @@ -304,6 +296,7 @@ namespace tinyxml2 TIXMLASSERT(cap <= INT_MAX / 2); int newAllocated = cap * 2; T* newMem = new T[newAllocated]; + TIXMLASSERT(newAllocated >= _size); memcpy(newMem, _mem, sizeof(T)*_size); // warning: not using constructors, only works for PODs if (_mem != _pool) { delete[] _mem; @@ -314,15 +307,15 @@ namespace tinyxml2 } T* _mem; - T _pool[INIT]; + T _pool[INITIAL_SIZE]; int _allocated; // objects allocated int _size; // number objects in use }; /* - Parent virtual class of a pool for fast allocation - and deallocation of objects. + Parent virtual class of a pool for fast allocation + and deallocation of objects. */ class MemPool { @@ -339,13 +332,13 @@ namespace tinyxml2 /* - Template child class to create pools of the correct type. + Template child class to create pools of the correct type. 
*/ - template< int SIZE > + template< int ITEM_SIZE > class MemPoolT : public MemPool { public: - MemPoolT() : _root(0), _currentAllocs(0), _nAllocs(0), _maxAllocs(0), _nUntracked(0) {} + MemPoolT() : _blockPtrs(), _root(0), _currentAllocs(0), _nAllocs(0), _maxAllocs(0), _nUntracked(0) {} ~MemPoolT() { Clear(); } @@ -353,8 +346,8 @@ namespace tinyxml2 void Clear() { // Delete the blocks. while (!_blockPtrs.Empty()) { - Block* b = _blockPtrs.Pop(); - delete b; + Block* lastBlock = _blockPtrs.Pop(); + delete lastBlock; } _root = 0; _currentAllocs = 0; @@ -364,7 +357,7 @@ namespace tinyxml2 } virtual int ItemSize() const { - return SIZE; + return ITEM_SIZE; } int CurrentAllocs() const { return _currentAllocs; @@ -376,21 +369,23 @@ namespace tinyxml2 Block* block = new Block(); _blockPtrs.Push(block); - for (int i = 0; i < COUNT - 1; ++i) { - block->chunk[i].next = &block->chunk[i + 1]; + Item* blockItems = block->items; + for (int i = 0; i < ITEMS_PER_BLOCK - 1; ++i) { + blockItems[i].next = &(blockItems[i + 1]); } - block->chunk[COUNT - 1].next = 0; - _root = block->chunk; + blockItems[ITEMS_PER_BLOCK - 1].next = 0; + _root = blockItems; } - void* result = _root; + Item* const result = _root; + TIXMLASSERT(result != 0); _root = _root->next; ++_currentAllocs; if (_currentAllocs > _maxAllocs) { _maxAllocs = _currentAllocs; } - _nAllocs++; - _nUntracked++; + ++_nAllocs; + ++_nUntracked; return result; } @@ -399,20 +394,21 @@ namespace tinyxml2 return; } --_currentAllocs; - Chunk* chunk = static_cast(mem); -#ifdef DEBUG - memset(chunk, 0xfe, sizeof(Chunk)); + Item* item = static_cast(mem); +#ifdef TINYXML2_DEBUG + memset(item, 0xfe, sizeof(*item)); #endif - chunk->next = _root; - _root = chunk; + item->next = _root; + _root = item; } void Trace(const char* name) { printf("Mempool %s watermark=%d [%dk] current=%d size=%d nAlloc=%d blocks=%d\n", - name, _maxAllocs, _maxAllocs*SIZE / 1024, _currentAllocs, SIZE, _nAllocs, _blockPtrs.Size()); + name, _maxAllocs, _maxAllocs * 
ITEM_SIZE / 1024, _currentAllocs, + ITEM_SIZE, _nAllocs, _blockPtrs.Size()); } void SetTracked() { - _nUntracked--; + --_nUntracked; } int Untracked() const { @@ -428,21 +424,23 @@ namespace tinyxml2 // 16k: 5200 // 32k: 4300 // 64k: 4000 21000 - enum { COUNT = (4 * 1024) / SIZE }; // Some compilers do not accept to use COUNT in private part if COUNT is private + // Declared public because some compilers do not accept to use ITEMS_PER_BLOCK + // in private part if ITEMS_PER_BLOCK is private + enum { ITEMS_PER_BLOCK = (4 * 1024) / ITEM_SIZE }; private: MemPoolT(const MemPoolT&); // not supported void operator=(const MemPoolT&); // not supported - union Chunk { - Chunk* next; - char mem[SIZE]; + union Item { + Item* next; + char itemData[ITEM_SIZE]; }; struct Block { - Chunk chunk[COUNT]; + Item items[ITEMS_PER_BLOCK]; }; DynArray< Block*, 10 > _blockPtrs; - Chunk* _root; + Item* _root; int _currentAllocs; int _nAllocs; @@ -453,23 +451,23 @@ namespace tinyxml2 /** - Implements the interface to the "Visitor pattern" (see the Accept() method.) - If you call the Accept() method, it requires being passed a XMLVisitor - class to handle callbacks. For nodes that contain other nodes (Document, Element) - you will get called with a VisitEnter/VisitExit pair. Nodes that are always leafs - are simply called with Visit(). + Implements the interface to the "Visitor pattern" (see the Accept() method.) + If you call the Accept() method, it requires being passed a XMLVisitor + class to handle callbacks. For nodes that contain other nodes (Document, Element) + you will get called with a VisitEnter/VisitExit pair. Nodes that are always leafs + are simply called with Visit(). - If you return 'true' from a Visit method, recursive parsing will continue. If you return - false, no children of this node or its siblings will be visited. + If you return 'true' from a Visit method, recursive parsing will continue. 
If you return + false, no children of this node or its siblings will be visited. - All flavors of Visit methods have a default implementation that returns 'true' (continue - visiting). You need to only override methods that are interesting to you. + All flavors of Visit methods have a default implementation that returns 'true' (continue + visiting). You need to only override methods that are interesting to you. - Generally Accept() is called on the XMLDocument, although all nodes support visiting. + Generally Accept() is called on the XMLDocument, although all nodes support visiting. - You should never change the document from a callback. + You should never change the document from a callback. - @sa XMLNode::Accept() + @sa XMLNode::Accept() */ class TINYXML2_LIB XMLVisitor { @@ -515,16 +513,15 @@ namespace tinyxml2 // WARNING: must match XMLDocument::_errorNames[] enum XMLError { XML_SUCCESS = 0, - XML_NO_ERROR = 0, XML_NO_ATTRIBUTE, XML_WRONG_ATTRIBUTE_TYPE, XML_ERROR_FILE_NOT_FOUND, XML_ERROR_FILE_COULD_NOT_BE_OPENED, XML_ERROR_FILE_READ_ERROR, - XML_ERROR_ELEMENT_MISMATCH, + UNUSED_XML_ERROR_ELEMENT_MISMATCH, // remove at next major version XML_ERROR_PARSING_ELEMENT, XML_ERROR_PARSING_ATTRIBUTE, - XML_ERROR_IDENTIFYING_TAG, + UNUSED_XML_ERROR_IDENTIFYING_TAG, // remove at next major version XML_ERROR_PARSING_TEXT, XML_ERROR_PARSING_CDATA, XML_ERROR_PARSING_COMMENT, @@ -541,21 +538,25 @@ namespace tinyxml2 /* - Utility functionality. + Utility functionality. 
*/ - class XMLUtil + class TINYXML2_LIB XMLUtil { public: - static const char* SkipWhiteSpace(const char* p) { + static const char* SkipWhiteSpace(const char* p, int* curLineNumPtr) { TIXMLASSERT(p); + while (IsWhiteSpace(*p)) { + if (curLineNumPtr && *p == '\n') { + ++(*curLineNumPtr); + } ++p; } TIXMLASSERT(p); return p; } - static char* SkipWhiteSpace(char* p) { - return const_cast(SkipWhiteSpace(const_cast(p))); + static char* SkipWhiteSpace(char* p, int* curLineNumPtr) { + return const_cast(SkipWhiteSpace(const_cast(p), curLineNumPtr)); } // Anything in the high order range of UTF-8 is assumed to not be whitespace. This isn't @@ -586,19 +587,13 @@ namespace tinyxml2 if (p == q) { return true; } - int n = 0; - while (*p && *q && *p == *q && n < nChar) { - ++p; - ++q; - ++n; - } - if ((n == nChar) || (*p == 0 && *q == 0)) { - return true; - } - return false; + TIXMLASSERT(p); + TIXMLASSERT(q); + TIXMLASSERT(nChar >= 0); + return strncmp(p, q, nChar) == 0; } - inline static bool IsUTF8Continuation(const char p) { + inline static bool IsUTF8Continuation(char p) { return (p & 0x80) != 0; } @@ -614,6 +609,7 @@ namespace tinyxml2 static void ToStr(bool v, char* buffer, int bufferSize); static void ToStr(float v, char* buffer, int bufferSize); static void ToStr(double v, char* buffer, int bufferSize); + static void ToStr(int64_t v, char* buffer, int bufferSize); // converts strings to primitive types static bool ToInt(const char* str, int* value); @@ -621,33 +617,45 @@ namespace tinyxml2 static bool ToBool(const char* str, bool* value); static bool ToFloat(const char* str, float* value); static bool ToDouble(const char* str, double* value); - }; + static bool ToInt64(const char* str, int64_t* value); + // Changes what is serialized for a boolean value. + // Default to "true" and "false". Shouldn't be changed + // unless you have a special testing or compatibility need. + // Be careful: static, global, & not thread safe. 
+ // Be sure to set static const memory as parameters. + static void SetBoolSerialization(const char* writeTrue, const char* writeFalse); - /** XMLNode is a base class for every object that is in the - XML Document Object Model (DOM), except XMLAttributes. - Nodes have siblings, a parent, and children which can - be navigated. A node is always in a XMLDocument. - The type of a XMLNode can be queried, and it can - be cast to its more defined type. - - A XMLDocument allocates memory for all its Nodes. - When the XMLDocument gets deleted, all its Nodes - will also be deleted. - - @verbatim - A Document can contain: Element (container or leaf) - Comment (leaf) - Unknown (leaf) - Declaration( leaf ) + private: + static const char* writeBoolTrue; + static const char* writeBoolFalse; + }; - An Element can contain: Element (container or leaf) - Text (leaf) - Attributes (not on tree) - Comment (leaf) - Unknown (leaf) - @endverbatim + /** XMLNode is a base class for every object that is in the + XML Document Object Model (DOM), except XMLAttributes. + Nodes have siblings, a parent, and children which can + be navigated. A node is always in a XMLDocument. + The type of a XMLNode can be queried, and it can + be cast to its more defined type. + + A XMLDocument allocates memory for all its Nodes. + When the XMLDocument gets deleted, all its Nodes + will also be deleted. + + @verbatim + A Document can contain: Element (container or leaf) + Comment (leaf) + Unknown (leaf) + Declaration( leaf ) + + An Element can contain: Element (container or leaf) + Text (leaf) + Attributes (not on tree) + Comment (leaf) + Unknown (leaf) + + @endverbatim */ class TINYXML2_LIB XMLNode { @@ -657,10 +665,12 @@ namespace tinyxml2 /// Get the XMLDocument that owns this XMLNode. const XMLDocument* GetDocument() const { + TIXMLASSERT(_document); return _document; } /// Get the XMLDocument that owns this XMLNode. 
XMLDocument* GetDocument() { + TIXMLASSERT(_document); return _document; } @@ -709,21 +719,24 @@ namespace tinyxml2 } /** The meaning of 'value' changes for the specific type. - @verbatim - Document: empty - Element: name of the element - Comment: the comment text - Unknown: the tag contents - Text: the text string - @endverbatim + @verbatim + Document: empty (NULL is returned, not an empty string) + Element: name of the element + Comment: the comment text + Unknown: the tag contents + Text: the text string + @endverbatim */ const char* Value() const; /** Set the Value of an XML node. - @sa Value() + @sa Value() */ void SetValue(const char* val, bool staticMem = false); + /// Gets the line number the node is in, if the document was parsed from a file. + int GetLineNum() const { return _parseLineNum; } + /// Get the parent of this node on the DOM. const XMLNode* Parent() const { return _parent; @@ -748,12 +761,12 @@ namespace tinyxml2 } /** Get the first child element, or optionally the first child - element with the specified name. + element with the specified name. */ - const XMLElement* FirstChildElement(const char* value = 0) const; + const XMLElement* FirstChildElement(const char* name = 0) const; - XMLElement* FirstChildElement(const char* value = 0) { - return const_cast(const_cast(this)->FirstChildElement(value)); + XMLElement* FirstChildElement(const char* name = 0) { + return const_cast(const_cast(this)->FirstChildElement(name)); } /// Get the last child node, or null if none exists. @@ -762,16 +775,16 @@ namespace tinyxml2 } XMLNode* LastChild() { - return const_cast(const_cast(this)->LastChild()); + return _lastChild; } /** Get the last child element or optionally the last child - element with the specified name. + element with the specified name. 
*/ - const XMLElement* LastChildElement(const char* value = 0) const; + const XMLElement* LastChildElement(const char* name = 0) const; - XMLElement* LastChildElement(const char* value = 0) { - return const_cast(const_cast(this)->LastChildElement(value)); + XMLElement* LastChildElement(const char* name = 0) { + return const_cast(const_cast(this)->LastChildElement(name)); } /// Get the previous (left) sibling node of this node. @@ -784,10 +797,10 @@ namespace tinyxml2 } /// Get the previous (left) sibling element of this node, with an optionally supplied name. - const XMLElement* PreviousSiblingElement(const char* value = 0) const; + const XMLElement* PreviousSiblingElement(const char* name = 0) const; - XMLElement* PreviousSiblingElement(const char* value = 0) { - return const_cast(const_cast(this)->PreviousSiblingElement(value)); + XMLElement* PreviousSiblingElement(const char* name = 0) { + return const_cast(const_cast(this)->PreviousSiblingElement(name)); } /// Get the next (right) sibling node of this node. @@ -800,18 +813,18 @@ namespace tinyxml2 } /// Get the next (right) sibling element of this node, with an optionally supplied name. - const XMLElement* NextSiblingElement(const char* value = 0) const; + const XMLElement* NextSiblingElement(const char* name = 0) const; - XMLElement* NextSiblingElement(const char* value = 0) { - return const_cast(const_cast(this)->NextSiblingElement(value)); + XMLElement* NextSiblingElement(const char* name = 0) { + return const_cast(const_cast(this)->NextSiblingElement(name)); } /** - Add a child node as the last (right) child. - If the child node is already part of the document, - it is moved from its old location to the new location. - Returns the addThis argument or 0 if the node does not - belong to the same document. + Add a child node as the last (right) child. + If the child node is already part of the document, + it is moved from its old location to the new location. 
+ Returns the addThis argument or 0 if the node does not + belong to the same document. */ XMLNode* InsertEndChild(XMLNode* addThis); @@ -819,86 +832,115 @@ namespace tinyxml2 return InsertEndChild(addThis); } /** - Add a child node as the first (left) child. - If the child node is already part of the document, - it is moved from its old location to the new location. - Returns the addThis argument or 0 if the node does not - belong to the same document. + Add a child node as the first (left) child. + If the child node is already part of the document, + it is moved from its old location to the new location. + Returns the addThis argument or 0 if the node does not + belong to the same document. */ XMLNode* InsertFirstChild(XMLNode* addThis); /** - Add a node after the specified child node. - If the child node is already part of the document, - it is moved from its old location to the new location. - Returns the addThis argument or 0 if the afterThis node - is not a child of this node, or if the node does not - belong to the same document. + Add a node after the specified child node. + If the child node is already part of the document, + it is moved from its old location to the new location. + Returns the addThis argument or 0 if the afterThis node + is not a child of this node, or if the node does not + belong to the same document. */ XMLNode* InsertAfterChild(XMLNode* afterThis, XMLNode* addThis); /** - Delete all the children of this node. + Delete all the children of this node. */ void DeleteChildren(); /** - Delete a child of this node. + Delete a child of this node. */ void DeleteChild(XMLNode* node); /** - Make a copy of this node, but not its children. - You may pass in a Document pointer that will be - the owner of the new Node. If the 'document' is - null, then the node returned will be allocated - from the current Document. (this->GetDocument()) + Make a copy of this node, but not its children. 
+ You may pass in a Document pointer that will be + the owner of the new Node. If the 'document' is + null, then the node returned will be allocated + from the current Document. (this->GetDocument()) - Note: if called on a XMLDocument, this will return null. + Note: if called on a XMLDocument, this will return null. */ virtual XMLNode* ShallowClone(XMLDocument* document) const = 0; /** - Test if 2 nodes are the same, but don't test children. - The 2 nodes do not need to be in the same Document. + Make a copy of this node and all its children. + + If the 'target' is null, then the nodes will + be allocated in the current document. If 'target' + is specified, the memory will be allocated is the + specified XMLDocument. - Note: if called on a XMLDocument, this will return false. + NOTE: This is probably not the correct tool to + copy a document, since XMLDocuments can have multiple + top level XMLNodes. You probably want to use + XMLDocument::DeepCopy() + */ + XMLNode* DeepClone(XMLDocument* target) const; + + /** + Test if 2 nodes are the same, but don't test children. + The 2 nodes do not need to be in the same Document. + + Note: if called on a XMLDocument, this will return false. */ virtual bool ShallowEqual(const XMLNode* compare) const = 0; /** Accept a hierarchical visit of the nodes in the TinyXML-2 DOM. Every node in the - XML tree will be conditionally visited and the host will be called back - via the XMLVisitor interface. + XML tree will be conditionally visited and the host will be called back + via the XMLVisitor interface. - This is essentially a SAX interface for TinyXML-2. (Note however it doesn't re-parse - the XML for the callbacks, so the performance of TinyXML-2 is unchanged by using this - interface versus any other.) + This is essentially a SAX interface for TinyXML-2. (Note however it doesn't re-parse + the XML for the callbacks, so the performance of TinyXML-2 is unchanged by using this + interface versus any other.) 
- The interface has been based on ideas from: + The interface has been based on ideas from: - - http://www.saxproject.org/ - - http://c2.com/cgi/wiki?HierarchicalVisitorPattern + - http://www.saxproject.org/ + - http://c2.com/cgi/wiki?HierarchicalVisitorPattern - Which are both good references for "visiting". + Which are both good references for "visiting". - An example of using Accept(): - @verbatim - XMLPrinter printer; - tinyxmlDoc.Accept( &printer ); - const char* xmlcstr = printer.CStr(); - @endverbatim + An example of using Accept(): + @verbatim + XMLPrinter printer; + tinyxmlDoc.Accept( &printer ); + const char* xmlcstr = printer.CStr(); + @endverbatim */ virtual bool Accept(XMLVisitor* visitor) const = 0; - // internal - virtual char* ParseDeep(char*, StrPair*); + /** + Set user data into the XMLNode. TinyXML-2 in + no way processes or interprets user data. + It is initially 0. + */ + void SetUserData(void* userData) { _userData = userData; } + + /** + Get user data set into the XMLNode. TinyXML-2 in + no way processes or interprets user data. + It is initially 0. + */ + void* GetUserData() const { return _userData; } protected: XMLNode(XMLDocument*); virtual ~XMLNode(); + virtual char* ParseDeep(char* p, StrPair* parentEndTag, int* curLineNumPtr); + XMLDocument* _document; XMLNode* _parent; mutable StrPair _value; + int _parseLineNum; XMLNode* _firstChild; XMLNode* _lastChild; @@ -906,11 +948,14 @@ namespace tinyxml2 XMLNode* _prev; XMLNode* _next; + void* _userData; + private: - MemPool* _memPool; + MemPool * _memPool; void Unlink(XMLNode* child); static void DeleteNode(XMLNode* node); void InsertChildPreamble(XMLNode* insertThis) const; + const XMLElement* ToElementWithName(const char* name) const; XMLNode(const XMLNode&); // not supported XMLNode& operator=(const XMLNode&); // not supported @@ -919,19 +964,18 @@ namespace tinyxml2 /** XML text. 
- Note that a text node can have child element nodes, for example: - @verbatim - This is bold - @endverbatim + Note that a text node can have child element nodes, for example: + @verbatim + This is bold + @endverbatim - A text node can have 2 ways to output the next. "normal" output - and CDATA. It will default to the mode it was parsed from the XML file and - you generally want to leave it alone, but you can change the output mode with - SetCData() and query it with CData(). + A text node can have 2 ways to output the next. "normal" output + and CDATA. It will default to the mode it was parsed from the XML file and + you generally want to leave it alone, but you can change the output mode with + SetCData() and query it with CData(). */ class TINYXML2_LIB XMLText : public XMLNode { - friend class XMLBase; friend class XMLDocument; public: virtual bool Accept(XMLVisitor* visitor) const; @@ -952,7 +996,6 @@ namespace tinyxml2 return _isCData; } - char* ParseDeep(char*, StrPair* endTag); virtual XMLNode* ShallowClone(XMLDocument* document) const; virtual bool ShallowEqual(const XMLNode* compare) const; @@ -960,6 +1003,8 @@ namespace tinyxml2 XMLText(XMLDocument* doc) : XMLNode(doc), _isCData(false) {} virtual ~XMLText() {} + char* ParseDeep(char* p, StrPair* parentEndTag, int* curLineNumPtr); + private: bool _isCData; @@ -982,7 +1027,6 @@ namespace tinyxml2 virtual bool Accept(XMLVisitor* visitor) const; - char* ParseDeep(char*, StrPair* endTag); virtual XMLNode* ShallowClone(XMLDocument* document) const; virtual bool ShallowEqual(const XMLNode* compare) const; @@ -990,6 +1034,8 @@ namespace tinyxml2 XMLComment(XMLDocument* doc); virtual ~XMLComment(); + char* ParseDeep(char* p, StrPair* parentEndTag, int* curLineNumPtr); + private: XMLComment(const XMLComment&); // not supported XMLComment& operator=(const XMLComment&); // not supported @@ -997,15 +1043,15 @@ namespace tinyxml2 /** In correct XML the declaration is the first entry in the file. 
- @verbatim - - @endverbatim + @verbatim + + @endverbatim - TinyXML-2 will happily read or write files without a declaration, - however. + TinyXML-2 will happily read or write files without a declaration, + however. - The text of the declaration isn't interpreted. It is parsed - and written as a string. + The text of the declaration isn't interpreted. It is parsed + and written as a string. */ class TINYXML2_LIB XMLDeclaration : public XMLNode { @@ -1020,7 +1066,6 @@ namespace tinyxml2 virtual bool Accept(XMLVisitor* visitor) const; - char* ParseDeep(char*, StrPair* endTag); virtual XMLNode* ShallowClone(XMLDocument* document) const; virtual bool ShallowEqual(const XMLNode* compare) const; @@ -1028,6 +1073,8 @@ namespace tinyxml2 XMLDeclaration(XMLDocument* doc); virtual ~XMLDeclaration(); + char* ParseDeep(char* p, StrPair* parentEndTag, int* curLineNumPtr); + private: XMLDeclaration(const XMLDeclaration&); // not supported XMLDeclaration& operator=(const XMLDeclaration&); // not supported @@ -1035,11 +1082,11 @@ namespace tinyxml2 /** Any tag that TinyXML-2 doesn't recognize is saved as an - unknown. It is a tag of text, but should not be modified. - It will be written back to the XML, unchanged, when the file - is saved. + unknown. It is a tag of text, but should not be modified. + It will be written back to the XML, unchanged, when the file + is saved. - DTD tags get thrown into XMLUnknowns. + DTD tags get thrown into XMLUnknowns. 
*/ class TINYXML2_LIB XMLUnknown : public XMLNode { @@ -1054,7 +1101,6 @@ namespace tinyxml2 virtual bool Accept(XMLVisitor* visitor) const; - char* ParseDeep(char*, StrPair* endTag); virtual XMLNode* ShallowClone(XMLDocument* document) const; virtual bool ShallowEqual(const XMLNode* compare) const; @@ -1062,6 +1108,8 @@ namespace tinyxml2 XMLUnknown(XMLDocument* doc); virtual ~XMLUnknown(); + char* ParseDeep(char* p, StrPair* parentEndTag, int* curLineNumPtr); + private: XMLUnknown(const XMLUnknown&); // not supported XMLUnknown& operator=(const XMLUnknown&); // not supported @@ -1070,10 +1118,10 @@ namespace tinyxml2 /** An attribute is a name-value pair. Elements have an arbitrary - number of attributes, each with a unique name. + number of attributes, each with a unique name. - @note The attributes are not XMLNodes. You may only query the - Next() attribute in a list. + @note The attributes are not XMLNodes. You may only query the + Next() attribute in a list. */ class TINYXML2_LIB XMLAttribute { @@ -1085,20 +1133,30 @@ namespace tinyxml2 /// The value of the attribute. const char* Value() const; + /// Gets the line number the attribute is in, if the document was parsed from a file. + int GetLineNum() const { return _parseLineNum; } + /// The next attribute in the list. const XMLAttribute* Next() const { return _next; } /** IntValue interprets the attribute as an integer, and returns the value. - If the value isn't an integer, 0 will be returned. There is no error checking; - use QueryIntValue() if you need error checking. + If the value isn't an integer, 0 will be returned. There is no error checking; + use QueryIntValue() if you need error checking. */ - int IntValue() const { + int IntValue() const { int i = 0; QueryIntValue(&i); return i; } + + int64_t Int64Value() const { + int64_t i = 0; + QueryInt64Value(&i); + return i; + } + /// Query as an unsigned integer. 
See IntValue() unsigned UnsignedValue() const { unsigned i = 0; @@ -1125,13 +1183,15 @@ namespace tinyxml2 } /** QueryIntValue interprets the attribute as an integer, and returns the value - in the provided parameter. The function will return XML_NO_ERROR on success, - and XML_WRONG_ATTRIBUTE_TYPE if the conversion is not successful. + in the provided parameter. The function will return XML_SUCCESS on success, + and XML_WRONG_ATTRIBUTE_TYPE if the conversion is not successful. */ XMLError QueryIntValue(int* value) const; /// See QueryIntValue XMLError QueryUnsignedValue(unsigned int* value) const; /// See QueryIntValue + XMLError QueryInt64Value(int64_t* value) const; + /// See QueryIntValue XMLError QueryBoolValue(bool* value) const; /// See QueryIntValue XMLError QueryDoubleValue(double* value) const; @@ -1145,6 +1205,8 @@ namespace tinyxml2 /// Set the attribute to value. void SetAttribute(unsigned value); /// Set the attribute to value. + void SetAttribute(int64_t value); + /// Set the attribute to value. void SetAttribute(bool value); /// Set the attribute to value. void SetAttribute(double value); @@ -1154,29 +1216,29 @@ namespace tinyxml2 private: enum { BUF_SIZE = 200 }; - XMLAttribute() : _next(0), _memPool(0) {} + XMLAttribute() : _name(), _value(), _parseLineNum(0), _next(0), _memPool(0) {} virtual ~XMLAttribute() {} XMLAttribute(const XMLAttribute&); // not supported void operator=(const XMLAttribute&); // not supported void SetName(const char* name); - char* ParseDeep(char* p, bool processEntities); + char* ParseDeep(char* p, bool processEntities, int* curLineNumPtr); mutable StrPair _name; mutable StrPair _value; + int _parseLineNum; XMLAttribute* _next; MemPool* _memPool; }; /** The element is a container class. It has a value, the element name, - and can contain other elements, text, comments, and unknowns. - Elements also contain an arbitrary number of attributes. + and can contain other elements, text, comments, and unknowns. 
+ Elements also contain an arbitrary number of attributes. */ class TINYXML2_LIB XMLElement : public XMLNode { - friend class XMLBase; friend class XMLDocument; public: /// Get the name of an element (which is the Value() of the node.) @@ -1197,77 +1259,60 @@ namespace tinyxml2 virtual bool Accept(XMLVisitor* visitor) const; /** Given an attribute name, Attribute() returns the value - for the attribute of that name, or null if none - exists. For example: + for the attribute of that name, or null if none + exists. For example: - @verbatim - const char* value = ele->Attribute( "foo" ); - @endverbatim + @verbatim + const char* value = ele->Attribute( "foo" ); + @endverbatim - The 'value' parameter is normally null. However, if specified, - the attribute will only be returned if the 'name' and 'value' - match. This allow you to write code: + The 'value' parameter is normally null. However, if specified, + the attribute will only be returned if the 'name' and 'value' + match. This allow you to write code: - @verbatim - if ( ele->Attribute( "foo", "bar" ) ) callFooIsBar(); - @endverbatim + @verbatim + if ( ele->Attribute( "foo", "bar" ) ) callFooIsBar(); + @endverbatim - rather than: - @verbatim - if ( ele->Attribute( "foo" ) ) { - if ( strcmp( ele->Attribute( "foo" ), "bar" ) == 0 ) callFooIsBar(); - } - @endverbatim + rather than: + @verbatim + if ( ele->Attribute( "foo" ) ) { + if ( strcmp( ele->Attribute( "foo" ), "bar" ) == 0 ) callFooIsBar(); + } + @endverbatim */ const char* Attribute(const char* name, const char* value = 0) const; /** Given an attribute name, IntAttribute() returns the value - of the attribute interpreted as an integer. 0 will be - returned if there is an error. For a method with error - checking, see QueryIntAttribute() + of the attribute interpreted as an integer. The default + value will be returned if the attribute isn't present, + or if there is an error. (For a method with error + checking, see QueryIntAttribute()). 
*/ - int IntAttribute(const char* name) const { - int i = 0; - QueryIntAttribute(name, &i); - return i; - } + int IntAttribute(const char* name, int defaultValue = 0) const; /// See IntAttribute() - unsigned UnsignedAttribute(const char* name) const { - unsigned i = 0; - QueryUnsignedAttribute(name, &i); - return i; - } + unsigned UnsignedAttribute(const char* name, unsigned defaultValue = 0) const; /// See IntAttribute() - bool BoolAttribute(const char* name) const { - bool b = false; - QueryBoolAttribute(name, &b); - return b; - } + int64_t Int64Attribute(const char* name, int64_t defaultValue = 0) const; /// See IntAttribute() - double DoubleAttribute(const char* name) const { - double d = 0; - QueryDoubleAttribute(name, &d); - return d; - } + bool BoolAttribute(const char* name, bool defaultValue = false) const; /// See IntAttribute() - float FloatAttribute(const char* name) const { - float f = 0; - QueryFloatAttribute(name, &f); - return f; - } + double DoubleAttribute(const char* name, double defaultValue = 0) const; + /// See IntAttribute() + float FloatAttribute(const char* name, float defaultValue = 0) const; /** Given an attribute name, QueryIntAttribute() returns - XML_NO_ERROR, XML_WRONG_ATTRIBUTE_TYPE if the conversion - can't be performed, or XML_NO_ATTRIBUTE if the attribute - doesn't exist. If successful, the result of the conversion - will be written to 'value'. If not successful, nothing will - be written to 'value'. This allows you to provide default - value: - - @verbatim - int value = 10; - QueryIntAttribute( "foo", &value ); // if "foo" isn't found, value will still be 10 - @endverbatim + XML_SUCCESS, XML_WRONG_ATTRIBUTE_TYPE if the conversion + can't be performed, or XML_NO_ATTRIBUTE if the attribute + doesn't exist. If successful, the result of the conversion + will be written to 'value'. If not successful, nothing will + be written to 'value'. 
This allows you to provide default + value: + + @verbatim + int value = 10; + QueryIntAttribute( "foo", &value ); // if "foo" isn't found, value will still be 10 + @endverbatim */ XMLError QueryIntAttribute(const char* name, int* value) const { const XMLAttribute* a = FindAttribute(name); @@ -1276,6 +1321,7 @@ namespace tinyxml2 } return a->QueryIntValue(value); } + /// See QueryIntAttribute() XMLError QueryUnsignedAttribute(const char* name, unsigned int* value) const { const XMLAttribute* a = FindAttribute(name); @@ -1284,6 +1330,16 @@ namespace tinyxml2 } return a->QueryUnsignedValue(value); } + + /// See QueryIntAttribute() + XMLError QueryInt64Attribute(const char* name, int64_t* value) const { + const XMLAttribute* a = FindAttribute(name); + if (!a) { + return XML_NO_ATTRIBUTE; + } + return a->QueryInt64Value(value); + } + /// See QueryIntAttribute() XMLError QueryBoolAttribute(const char* name, bool* value) const { const XMLAttribute* a = FindAttribute(name); @@ -1309,23 +1365,34 @@ namespace tinyxml2 return a->QueryFloatValue(value); } + /// See QueryIntAttribute() + XMLError QueryStringAttribute(const char* name, const char** value) const { + const XMLAttribute* a = FindAttribute(name); + if (!a) { + return XML_NO_ATTRIBUTE; + } + *value = a->Value(); + return XML_SUCCESS; + } + + /** Given an attribute name, QueryAttribute() returns - XML_NO_ERROR, XML_WRONG_ATTRIBUTE_TYPE if the conversion - can't be performed, or XML_NO_ATTRIBUTE if the attribute - doesn't exist. It is overloaded for the primitive types, - and is a generally more convenient replacement of - QueryIntAttribute() and related functions. - - If successful, the result of the conversion - will be written to 'value'. If not successful, nothing will - be written to 'value'. 
This allows you to provide default - value: - - @verbatim - int value = 10; - QueryAttribute( "foo", &value ); // if "foo" isn't found, value will still be 10 - @endverbatim + XML_SUCCESS, XML_WRONG_ATTRIBUTE_TYPE if the conversion + can't be performed, or XML_NO_ATTRIBUTE if the attribute + doesn't exist. It is overloaded for the primitive types, + and is a generally more convenient replacement of + QueryIntAttribute() and related functions. + + If successful, the result of the conversion + will be written to 'value'. If not successful, nothing will + be written to 'value'. This allows you to provide default + value: + + @verbatim + int value = 10; + QueryAttribute( "foo", &value ); // if "foo" isn't found, value will still be 10 + @endverbatim */ int QueryAttribute(const char* name, int* value) const { return QueryIntAttribute(name, value); @@ -1335,6 +1402,10 @@ namespace tinyxml2 return QueryUnsignedAttribute(name, value); } + int QueryAttribute(const char* name, int64_t* value) const { + return QueryInt64Attribute(name, value); + } + int QueryAttribute(const char* name, bool* value) const { return QueryBoolAttribute(name, value); } @@ -1362,6 +1433,13 @@ namespace tinyxml2 XMLAttribute* a = FindOrCreateAttribute(name); a->SetAttribute(value); } + + /// Sets the named attribute to value. + void SetAttribute(const char* name, int64_t value) { + XMLAttribute* a = FindOrCreateAttribute(name); + a->SetAttribute(value); + } + /// Sets the named attribute to value. void SetAttribute(const char* name, bool value) { XMLAttribute* a = FindOrCreateAttribute(name); @@ -1379,7 +1457,7 @@ namespace tinyxml2 } /** - Delete an attribute. + Delete an attribute. */ void DeleteAttribute(const char* name); @@ -1391,130 +1469,149 @@ namespace tinyxml2 const XMLAttribute* FindAttribute(const char* name) const; /** Convenience function for easy access to the text inside an element. 
Although easy - and concise, GetText() is limited compared to getting the XMLText child - and accessing it directly. - - If the first child of 'this' is a XMLText, the GetText() - returns the character string of the Text node, else null is returned. - - This is a convenient method for getting the text of simple contained text: - @verbatim - This is text - const char* str = fooElement->GetText(); - @endverbatim - - 'str' will be a pointer to "This is text". - - Note that this function can be misleading. If the element foo was created from - this XML: - @verbatim - This is text - @endverbatim - - then the value of str would be null. The first child node isn't a text node, it is - another element. From this XML: - @verbatim - This is text - @endverbatim - GetText() will return "This is ". + and concise, GetText() is limited compared to getting the XMLText child + and accessing it directly. + + If the first child of 'this' is a XMLText, the GetText() + returns the character string of the Text node, else null is returned. + + This is a convenient method for getting the text of simple contained text: + @verbatim + This is text + const char* str = fooElement->GetText(); + @endverbatim + + 'str' will be a pointer to "This is text". + + Note that this function can be misleading. If the element foo was created from + this XML: + @verbatim + This is text + @endverbatim + + then the value of str would be null. The first child node isn't a text node, it is + another element. From this XML: + @verbatim + This is text + @endverbatim + GetText() will return "This is ". */ const char* GetText() const; /** Convenience function for easy access to the text inside an element. Although easy - and concise, SetText() is limited compared to creating an XMLText child - and mutating it directly. - - If the first child of 'this' is a XMLText, SetText() sets its value to - the given string, otherwise it will create a first child that is an XMLText. 
- - This is a convenient method for setting the text of simple contained text: - @verbatim - This is text - fooElement->SetText( "Hullaballoo!" ); - Hullaballoo! - @endverbatim - - Note that this function can be misleading. If the element foo was created from - this XML: - @verbatim - This is text - @endverbatim - - then it will not change "This is text", but rather prefix it with a text element: - @verbatim - Hullaballoo!This is text - @endverbatim - - For this XML: - @verbatim - - @endverbatim - SetText() will generate - @verbatim - Hullaballoo! - @endverbatim + and concise, SetText() is limited compared to creating an XMLText child + and mutating it directly. + + If the first child of 'this' is a XMLText, SetText() sets its value to + the given string, otherwise it will create a first child that is an XMLText. + + This is a convenient method for setting the text of simple contained text: + @verbatim + This is text + fooElement->SetText( "Hullaballoo!" ); + Hullaballoo! + @endverbatim + + Note that this function can be misleading. If the element foo was created from + this XML: + @verbatim + This is text + @endverbatim + + then it will not change "This is text", but rather prefix it with a text element: + @verbatim + Hullaballoo!This is text + @endverbatim + + For this XML: + @verbatim + + @endverbatim + SetText() will generate + @verbatim + Hullaballoo! + @endverbatim */ void SetText(const char* inText); - /// Convenience method for setting text inside and element. See SetText() for important limitations. + /// Convenience method for setting text inside an element. See SetText() for important limitations. void SetText(int value); - /// Convenience method for setting text inside and element. See SetText() for important limitations. + /// Convenience method for setting text inside an element. See SetText() for important limitations. void SetText(unsigned value); - /// Convenience method for setting text inside and element. See SetText() for important limitations. 
+ /// Convenience method for setting text inside an element. See SetText() for important limitations. + void SetText(int64_t value); + /// Convenience method for setting text inside an element. See SetText() for important limitations. void SetText(bool value); - /// Convenience method for setting text inside and element. See SetText() for important limitations. + /// Convenience method for setting text inside an element. See SetText() for important limitations. void SetText(double value); - /// Convenience method for setting text inside and element. See SetText() for important limitations. + /// Convenience method for setting text inside an element. See SetText() for important limitations. void SetText(float value); /** - Convenience method to query the value of a child text node. This is probably best - shown by example. Given you have a document is this form: - @verbatim - - 1 - 1.4 - - @endverbatim - - The QueryIntText() and similar functions provide a safe and easier way to get to the - "value" of x and y. - - @verbatim - int x = 0; - float y = 0; // types of x and y are contrived for example - const XMLElement* xElement = pointElement->FirstChildElement( "x" ); - const XMLElement* yElement = pointElement->FirstChildElement( "y" ); - xElement->QueryIntText( &x ); - yElement->QueryFloatText( &y ); - @endverbatim - - @returns XML_SUCCESS (0) on success, XML_CAN_NOT_CONVERT_TEXT if the text cannot be converted - to the requested type, and XML_NO_TEXT_NODE if there is no child text to query. + Convenience method to query the value of a child text node. This is probably best + shown by example. Given you have a document is this form: + @verbatim + + 1 + 1.4 + + @endverbatim + + The QueryIntText() and similar functions provide a safe and easier way to get to the + "value" of x and y. 
+ + @verbatim + int x = 0; + float y = 0; // types of x and y are contrived for example + const XMLElement* xElement = pointElement->FirstChildElement( "x" ); + const XMLElement* yElement = pointElement->FirstChildElement( "y" ); + xElement->QueryIntText( &x ); + yElement->QueryFloatText( &y ); + @endverbatim + + @returns XML_SUCCESS (0) on success, XML_CAN_NOT_CONVERT_TEXT if the text cannot be converted + to the requested type, and XML_NO_TEXT_NODE if there is no child text to query. */ XMLError QueryIntText(int* ival) const; /// See QueryIntText() XMLError QueryUnsignedText(unsigned* uval) const; /// See QueryIntText() + XMLError QueryInt64Text(int64_t* uval) const; + /// See QueryIntText() XMLError QueryBoolText(bool* bval) const; /// See QueryIntText() XMLError QueryDoubleText(double* dval) const; /// See QueryIntText() XMLError QueryFloatText(float* fval) const; + int IntText(int defaultValue = 0) const; + + /// See QueryIntText() + unsigned UnsignedText(unsigned defaultValue = 0) const; + /// See QueryIntText() + int64_t Int64Text(int64_t defaultValue = 0) const; + /// See QueryIntText() + bool BoolText(bool defaultValue = false) const; + /// See QueryIntText() + double DoubleText(double defaultValue = 0) const; + /// See QueryIntText() + float FloatText(float defaultValue = 0) const; + // internal: - enum { + enum ElementClosingType { OPEN, // CLOSED, // CLOSING // }; - int ClosingType() const { + ElementClosingType ClosingType() const { return _closingType; } - char* ParseDeep(char* p, StrPair* endTag); virtual XMLNode* ShallowClone(XMLDocument* document) const; virtual bool ShallowEqual(const XMLNode* compare) const; + protected: + char* ParseDeep(char* p, StrPair* parentEndTag, int* curLineNumPtr); + private: XMLElement(XMLDocument* doc); virtual ~XMLElement(); @@ -1526,11 +1623,12 @@ namespace tinyxml2 } XMLAttribute* FindOrCreateAttribute(const char* name); //void LinkAttribute( XMLAttribute* attrib ); - char* ParseAttributes(char* p); + char* 
ParseAttributes(char* p, int* curLineNumPtr); static void DeleteAttribute(XMLAttribute* attribute); + XMLAttribute* CreateAttribute(); enum { BUF_SIZE = 200 }; - int _closingType; + ElementClosingType _closingType; // The attribute list is ordered; there is no 'lastAttribute' // because the list needs to be scanned for dupes before adding // a new attribute. @@ -1545,70 +1643,79 @@ namespace tinyxml2 /** A Document binds together all the functionality. - It can be saved, loaded, and printed to the screen. - All Nodes are connected and allocated to a Document. - If the Document is deleted, all its Nodes are also deleted. + It can be saved, loaded, and printed to the screen. + All Nodes are connected and allocated to a Document. + If the Document is deleted, all its Nodes are also deleted. */ class TINYXML2_LIB XMLDocument : public XMLNode { friend class XMLElement; + // Gives access to SetError, but over-access for everything else. + // Wishing C++ had "internal" scope. + friend class XMLNode; + friend class XMLText; + friend class XMLComment; + friend class XMLDeclaration; + friend class XMLUnknown; public: /// constructor - XMLDocument(bool processEntities = true, Whitespace = PRESERVE_WHITESPACE); + XMLDocument(bool processEntities = true, Whitespace whitespaceMode = PRESERVE_WHITESPACE); ~XMLDocument(); virtual XMLDocument* ToDocument() { + TIXMLASSERT(this == _document); return this; } virtual const XMLDocument* ToDocument() const { + TIXMLASSERT(this == _document); return this; } /** - Parse an XML file from a character string. - Returns XML_NO_ERROR (0) on success, or - an errorID. - - You may optionally pass in the 'nBytes', which is - the number of bytes which will be parsed. If not - specified, TinyXML-2 will assume 'xml' points to a - null terminated string. + Parse an XML file from a character string. + Returns XML_SUCCESS (0) on success, or + an errorID. + + You may optionally pass in the 'nBytes', which is + the number of bytes which will be parsed. 
If not + specified, TinyXML-2 will assume 'xml' points to a + null terminated string. */ XMLError Parse(const char* xml, size_t nBytes = (size_t)(-1)); /** - Load an XML file from disk. - Returns XML_NO_ERROR (0) on success, or - an errorID. + Load an XML file from disk. + Returns XML_SUCCESS (0) on success, or + an errorID. */ XMLError LoadFile(const char* filename); /** - Load an XML file from disk. You are responsible - for providing and closing the FILE*. + Load an XML file from disk. You are responsible + for providing and closing the FILE*. - NOTE: The file should be opened as binary ("rb") - not text in order for TinyXML-2 to correctly - do newline normalization. + NOTE: The file should be opened as binary ("rb") + not text in order for TinyXML-2 to correctly + do newline normalization. - Returns XML_NO_ERROR (0) on success, or - an errorID. + Returns XML_SUCCESS (0) on success, or + an errorID. */ XMLError LoadFile(FILE*); /** - Save the XML file to disk. - Returns XML_NO_ERROR (0) on success, or - an errorID. + Save the XML file to disk. + Returns XML_SUCCESS (0) on success, or + an errorID. */ XMLError SaveFile(const char* filename, bool compact = false); /** - Save the XML file to disk. You are responsible - for providing and closing the FILE*. + Save the XML file to disk. You are responsible + for providing and closing the FILE*. - Returns XML_NO_ERROR (0) on success, or - an errorID. + Returns XML_SUCCESS (0) on success, or + an errorID. */ XMLError SaveFile(FILE* fp, bool compact = false); @@ -1616,11 +1723,11 @@ namespace tinyxml2 return _processEntities; } Whitespace WhitespaceMode() const { - return _whitespace; + return _whitespaceMode; } /** - Returns true if this document has a leading Byte Order Mark of UTF8. + Returns true if this document has a leading Byte Order Mark of UTF8. */ bool HasBOM() const { return _writeBOM; @@ -1632,7 +1739,7 @@ namespace tinyxml2 } /** Return the root element of DOM. Equivalent to FirstChildElement(). 
- To get the first node, use FirstChild(). + To get the first node, use FirstChild(). */ XMLElement* RootElement() { return FirstChildElement(); @@ -1642,94 +1749,112 @@ namespace tinyxml2 } /** Print the Document. If the Printer is not provided, it will - print to stdout. If you provide Printer, this can print to a file: - @verbatim - XMLPrinter printer( fp ); - doc.Print( &printer ); - @endverbatim - - Or you can use a printer to print to memory: - @verbatim - XMLPrinter printer; - doc.Print( &printer ); - // printer.CStr() has a const char* to the XML - @endverbatim + print to stdout. If you provide Printer, this can print to a file: + @verbatim + XMLPrinter printer( fp ); + doc.Print( &printer ); + @endverbatim + + Or you can use a printer to print to memory: + @verbatim + XMLPrinter printer; + doc.Print( &printer ); + // printer.CStr() has a const char* to the XML + @endverbatim */ void Print(XMLPrinter* streamer = 0) const; virtual bool Accept(XMLVisitor* visitor) const; /** - Create a new Element associated with - this Document. The memory for the Element - is managed by the Document. + Create a new Element associated with + this Document. The memory for the Element + is managed by the Document. */ XMLElement* NewElement(const char* name); /** - Create a new Comment associated with - this Document. The memory for the Comment - is managed by the Document. + Create a new Comment associated with + this Document. The memory for the Comment + is managed by the Document. */ XMLComment* NewComment(const char* comment); /** - Create a new Text associated with - this Document. The memory for the Text - is managed by the Document. + Create a new Text associated with + this Document. The memory for the Text + is managed by the Document. */ XMLText* NewText(const char* text); /** - Create a new Declaration associated with - this Document. The memory for the object - is managed by the Document. 
- - If the 'text' param is null, the standard - declaration is used.: - @verbatim - - @endverbatim + Create a new Declaration associated with + this Document. The memory for the object + is managed by the Document. + + If the 'text' param is null, the standard + declaration is used.: + @verbatim + + @endverbatim */ XMLDeclaration* NewDeclaration(const char* text = 0); /** - Create a new Unknown associated with - this Document. The memory for the object - is managed by the Document. + Create a new Unknown associated with + this Document. The memory for the object + is managed by the Document. */ XMLUnknown* NewUnknown(const char* text); /** - Delete a node associated with this document. - It will be unlinked from the DOM. + Delete a node associated with this document. + It will be unlinked from the DOM. */ void DeleteNode(XMLNode* node); - void SetError(XMLError error, const char* str1, const char* str2); + void ClearError() { + SetError(XML_SUCCESS, 0, 0); + } /// Return true if there was an error parsing the document. bool Error() const { - return _errorID != XML_NO_ERROR; + return _errorID != XML_SUCCESS; } /// Return the errorID. XMLError ErrorID() const { return _errorID; } const char* ErrorName() const; + static const char* ErrorIDToName(XMLError errorID); - /// Return a possibly helpful diagnostic location or string. - const char* GetErrorStr1() const { - return _errorStr1; - } - /// Return a possibly helpful secondary diagnostic location or string. - const char* GetErrorStr2() const { - return _errorStr2; - } - /// If there is an error, print it to stdout. + /** Returns a "long form" error description. A hopefully helpful + diagnostic with location, line number, and/or additional info. + */ + const char* ErrorStr() const; + + /// A (trivial) utility function that prints the ErrorStr() to stdout. void PrintError() const; + /// Return the line where the error occured, or zero if unknown. 
+ int ErrorLineNum() const + { + return _errorLineNum; + } + /// Clear the document, resetting it to the initial state. void Clear(); + /** + Copies this document to a target document. + The target will be completely cleared before the copy. + If you want to copy a sub-tree, see XMLNode::DeepClone(). + + NOTE: that the 'target' must be non-null. + */ + void DeepCopy(XMLDocument* target) const; + // internal char* Identify(char* p, XMLNode** node); + // internal + void MarkInUse(XMLNode*); + virtual XMLNode* ShallowClone(XMLDocument* /*document*/) const { return 0; } @@ -1741,13 +1866,21 @@ namespace tinyxml2 XMLDocument(const XMLDocument&); // not supported void operator=(const XMLDocument&); // not supported - bool _writeBOM; - bool _processEntities; - XMLError _errorID; - Whitespace _whitespace; - const char* _errorStr1; - const char* _errorStr2; - char* _charBuffer; + bool _writeBOM; + bool _processEntities; + XMLError _errorID; + Whitespace _whitespaceMode; + mutable StrPair _errorStr; + int _errorLineNum; + char* _charBuffer; + int _parseCurLineNum; + // Memory tracking does add some overhead. + // However, the code assumes that you don't + // have a bunch of unlinked nodes around. + // Therefore it takes less memory to track + // in the document vs. a linked list in the XMLNode, + // and the performance is the same. + DynArray _unlinked; MemPoolT< sizeof(XMLElement) > _elementPool; MemPoolT< sizeof(XMLAttribute) > _attributePool; @@ -1757,78 +1890,92 @@ namespace tinyxml2 static const char* _errorNames[XML_ERROR_COUNT]; void Parse(); - }; + void SetError(XMLError error, int lineNum, const char* format, ...); - /** - A XMLHandle is a class that wraps a node pointer with null checks; this is - an incredibly useful thing. Note that XMLHandle is not part of the TinyXML-2 - DOM structure. It is a separate utility class. 
+ template + NodeType* CreateUnlinkedNode(MemPoolT& pool); + }; - Take an example: - @verbatim - - - - - - - @endverbatim + template + inline NodeType* XMLDocument::CreateUnlinkedNode(MemPoolT& pool) + { + TIXMLASSERT(sizeof(NodeType) == PoolElementSize); + TIXMLASSERT(sizeof(NodeType) == pool.ItemSize()); + NodeType* returnNode = new (pool.Alloc()) NodeType(this); + TIXMLASSERT(returnNode); + returnNode->_memPool = &pool; - Assuming you want the value of "attributeB" in the 2nd "Child" element, it's very - easy to write a *lot* of code that looks like: + _unlinked.Push(returnNode); + return returnNode; + } - @verbatim - XMLElement* root = document.FirstChildElement( "Document" ); - if ( root ) - { - XMLElement* element = root->FirstChildElement( "Element" ); - if ( element ) - { - XMLElement* child = element->FirstChildElement( "Child" ); - if ( child ) - { - XMLElement* child2 = child->NextSiblingElement( "Child" ); - if ( child2 ) - { - // Finally do something useful. - @endverbatim + /** + A XMLHandle is a class that wraps a node pointer with null checks; this is + an incredibly useful thing. Note that XMLHandle is not part of the TinyXML-2 + DOM structure. It is a separate utility class. + + Take an example: + @verbatim + + + + + + + @endverbatim + + Assuming you want the value of "attributeB" in the 2nd "Child" element, it's very + easy to write a *lot* of code that looks like: + + @verbatim + XMLElement* root = document.FirstChildElement( "Document" ); + if ( root ) + { + XMLElement* element = root->FirstChildElement( "Element" ); + if ( element ) + { + XMLElement* child = element->FirstChildElement( "Child" ); + if ( child ) + { + XMLElement* child2 = child->NextSiblingElement( "Child" ); + if ( child2 ) + { + // Finally do something useful. + @endverbatim - And that doesn't even cover "else" cases. XMLHandle addresses the verbosity - of such code. 
A XMLHandle checks for null pointers so it is perfectly safe - and correct to use: + And that doesn't even cover "else" cases. XMLHandle addresses the verbosity + of such code. A XMLHandle checks for null pointers so it is perfectly safe + and correct to use: - @verbatim - XMLHandle docHandle( &document ); - XMLElement* child2 = docHandle.FirstChildElement( "Document" ).FirstChildElement( "Element" ).FirstChildElement().NextSiblingElement(); - if ( child2 ) - { - // do something useful - @endverbatim + @verbatim + XMLHandle docHandle( &document ); + XMLElement* child2 = docHandle.FirstChildElement( "Document" ).FirstChildElement( "Element" ).FirstChildElement().NextSiblingElement(); + if ( child2 ) + { + // do something useful + @endverbatim - Which is MUCH more concise and useful. + Which is MUCH more concise and useful. - It is also safe to copy handles - internally they are nothing more than node pointers. - @verbatim - XMLHandle handleCopy = handle; - @endverbatim + It is also safe to copy handles - internally they are nothing more than node pointers. + @verbatim + XMLHandle handleCopy = handle; + @endverbatim - See also XMLConstHandle, which is the same as XMLHandle, but operates on const objects. + See also XMLConstHandle, which is the same as XMLHandle, but operates on const objects. */ class TINYXML2_LIB XMLHandle { public: /// Create a handle from any node (at any depth of the tree.) This can be a null pointer. - XMLHandle(XMLNode* node) { - _node = node; + XMLHandle(XMLNode* node) : _node(node) { } /// Create a handle from a node. - XMLHandle(XMLNode& node) { - _node = &node; + XMLHandle(XMLNode& node) : _node(&node) { } /// Copy constructor - XMLHandle(const XMLHandle& ref) { - _node = ref._node; + XMLHandle(const XMLHandle& ref) : _node(ref._node) { } /// Assignment XMLHandle& operator=(const XMLHandle& ref) { @@ -1841,32 +1988,32 @@ namespace tinyxml2 return XMLHandle(_node ? _node->FirstChild() : 0); } /// Get the first child element of this handle. 
- XMLHandle FirstChildElement(const char* value = 0) { - return XMLHandle(_node ? _node->FirstChildElement(value) : 0); + XMLHandle FirstChildElement(const char* name = 0) { + return XMLHandle(_node ? _node->FirstChildElement(name) : 0); } /// Get the last child of this handle. XMLHandle LastChild() { return XMLHandle(_node ? _node->LastChild() : 0); } /// Get the last child element of this handle. - XMLHandle LastChildElement(const char* _value = 0) { - return XMLHandle(_node ? _node->LastChildElement(_value) : 0); + XMLHandle LastChildElement(const char* name = 0) { + return XMLHandle(_node ? _node->LastChildElement(name) : 0); } /// Get the previous sibling of this handle. XMLHandle PreviousSibling() { return XMLHandle(_node ? _node->PreviousSibling() : 0); } /// Get the previous sibling element of this handle. - XMLHandle PreviousSiblingElement(const char* _value = 0) { - return XMLHandle(_node ? _node->PreviousSiblingElement(_value) : 0); + XMLHandle PreviousSiblingElement(const char* name = 0) { + return XMLHandle(_node ? _node->PreviousSiblingElement(name) : 0); } /// Get the next sibling of this handle. XMLHandle NextSibling() { return XMLHandle(_node ? _node->NextSibling() : 0); } /// Get the next sibling element of this handle. - XMLHandle NextSiblingElement(const char* _value = 0) { - return XMLHandle(_node ? _node->NextSiblingElement(_value) : 0); + XMLHandle NextSiblingElement(const char* name = 0) { + return XMLHandle(_node ? _node->NextSiblingElement(name) : 0); } /// Safe cast to XMLNode. This can return null. @@ -1875,41 +2022,38 @@ namespace tinyxml2 } /// Safe cast to XMLElement. This can return null. XMLElement* ToElement() { - return ((_node == 0) ? 0 : _node->ToElement()); + return (_node ? _node->ToElement() : 0); } /// Safe cast to XMLText. This can return null. XMLText* ToText() { - return ((_node == 0) ? 0 : _node->ToText()); + return (_node ? _node->ToText() : 0); } /// Safe cast to XMLUnknown. This can return null. 
XMLUnknown* ToUnknown() { - return ((_node == 0) ? 0 : _node->ToUnknown()); + return (_node ? _node->ToUnknown() : 0); } /// Safe cast to XMLDeclaration. This can return null. XMLDeclaration* ToDeclaration() { - return ((_node == 0) ? 0 : _node->ToDeclaration()); + return (_node ? _node->ToDeclaration() : 0); } private: - XMLNode* _node; + XMLNode * _node; }; /** - A variant of the XMLHandle class for working with const XMLNodes and Documents. It is the - same in all regards, except for the 'const' qualifiers. See XMLHandle for API. + A variant of the XMLHandle class for working with const XMLNodes and Documents. It is the + same in all regards, except for the 'const' qualifiers. See XMLHandle for API. */ class TINYXML2_LIB XMLConstHandle { public: - XMLConstHandle(const XMLNode* node) { - _node = node; + XMLConstHandle(const XMLNode* node) : _node(node) { } - XMLConstHandle(const XMLNode& node) { - _node = &node; + XMLConstHandle(const XMLNode& node) : _node(&node) { } - XMLConstHandle(const XMLConstHandle& ref) { - _node = ref._node; + XMLConstHandle(const XMLConstHandle& ref) : _node(ref._node) { } XMLConstHandle& operator=(const XMLConstHandle& ref) { @@ -1920,26 +2064,26 @@ namespace tinyxml2 const XMLConstHandle FirstChild() const { return XMLConstHandle(_node ? _node->FirstChild() : 0); } - const XMLConstHandle FirstChildElement(const char* value = 0) const { - return XMLConstHandle(_node ? _node->FirstChildElement(value) : 0); + const XMLConstHandle FirstChildElement(const char* name = 0) const { + return XMLConstHandle(_node ? _node->FirstChildElement(name) : 0); } const XMLConstHandle LastChild() const { return XMLConstHandle(_node ? _node->LastChild() : 0); } - const XMLConstHandle LastChildElement(const char* _value = 0) const { - return XMLConstHandle(_node ? _node->LastChildElement(_value) : 0); + const XMLConstHandle LastChildElement(const char* name = 0) const { + return XMLConstHandle(_node ? 
_node->LastChildElement(name) : 0); } const XMLConstHandle PreviousSibling() const { return XMLConstHandle(_node ? _node->PreviousSibling() : 0); } - const XMLConstHandle PreviousSiblingElement(const char* _value = 0) const { - return XMLConstHandle(_node ? _node->PreviousSiblingElement(_value) : 0); + const XMLConstHandle PreviousSiblingElement(const char* name = 0) const { + return XMLConstHandle(_node ? _node->PreviousSiblingElement(name) : 0); } const XMLConstHandle NextSibling() const { return XMLConstHandle(_node ? _node->NextSibling() : 0); } - const XMLConstHandle NextSiblingElement(const char* _value = 0) const { - return XMLConstHandle(_node ? _node->NextSiblingElement(_value) : 0); + const XMLConstHandle NextSiblingElement(const char* name = 0) const { + return XMLConstHandle(_node ? _node->NextSiblingElement(name) : 0); } @@ -1947,16 +2091,16 @@ namespace tinyxml2 return _node; } const XMLElement* ToElement() const { - return ((_node == 0) ? 0 : _node->ToElement()); + return (_node ? _node->ToElement() : 0); } const XMLText* ToText() const { - return ((_node == 0) ? 0 : _node->ToText()); + return (_node ? _node->ToText() : 0); } const XMLUnknown* ToUnknown() const { - return ((_node == 0) ? 0 : _node->ToUnknown()); + return (_node ? _node->ToUnknown() : 0); } const XMLDeclaration* ToDeclaration() const { - return ((_node == 0) ? 0 : _node->ToDeclaration()); + return (_node ? _node->ToDeclaration() : 0); } private: @@ -1965,55 +2109,55 @@ namespace tinyxml2 /** - Printing functionality. The XMLPrinter gives you more - options than the XMLDocument::Print() method. - - It can: - -# Print to memory. - -# Print to a file you provide. - -# Print XML without a XMLDocument. - - Print to Memory - - @verbatim - XMLPrinter printer; - doc.Print( &printer ); - SomeFunction( printer.CStr() ); - @endverbatim - - Print to a File - - You provide the file pointer. 
- @verbatim - XMLPrinter printer( fp ); - doc.Print( &printer ); - @endverbatim - - Print without a XMLDocument - - When loading, an XML parser is very useful. However, sometimes - when saving, it just gets in the way. The code is often set up - for streaming, and constructing the DOM is just overhead. - - The Printer supports the streaming case. The following code - prints out a trivially simple XML file without ever creating - an XML document. - - @verbatim - XMLPrinter printer( fp ); - printer.OpenElement( "foo" ); - printer.PushAttribute( "foo", "bar" ); - printer.CloseElement(); - @endverbatim + Printing functionality. The XMLPrinter gives you more + options than the XMLDocument::Print() method. + + It can: + -# Print to memory. + -# Print to a file you provide. + -# Print XML without a XMLDocument. + + Print to Memory + + @verbatim + XMLPrinter printer; + doc.Print( &printer ); + SomeFunction( printer.CStr() ); + @endverbatim + + Print to a File + + You provide the file pointer. + @verbatim + XMLPrinter printer( fp ); + doc.Print( &printer ); + @endverbatim + + Print without a XMLDocument + + When loading, an XML parser is very useful. However, sometimes + when saving, it just gets in the way. The code is often set up + for streaming, and constructing the DOM is just overhead. + + The Printer supports the streaming case. The following code + prints out a trivially simple XML file without ever creating + an XML document. + + @verbatim + XMLPrinter printer( fp ); + printer.OpenElement( "foo" ); + printer.PushAttribute( "foo", "bar" ); + printer.CloseElement(); + @endverbatim */ class TINYXML2_LIB XMLPrinter : public XMLVisitor { public: /** Construct the printer. If the FILE* is specified, - this will print to the FILE. Else it will print - to memory, and the result is available in CStr(). - If 'compact' is set to true, then output is created - with only required whitespace and newlines. + this will print to the FILE. 
Else it will print + to memory, and the result is available in CStr(). + If 'compact' is set to true, then output is created + with only required whitespace and newlines. */ XMLPrinter(FILE* file = 0, bool compact = false, int depth = 0); virtual ~XMLPrinter() {} @@ -2021,13 +2165,14 @@ namespace tinyxml2 /** If streaming, write the BOM and declaration. */ void PushHeader(bool writeBOM, bool writeDeclaration); /** If streaming, start writing an element. - The element must be closed with CloseElement() + The element must be closed with CloseElement() */ void OpenElement(const char* name, bool compactMode = false); /// If streaming, add an attribute to an open element. void PushAttribute(const char* name, const char* value); void PushAttribute(const char* name, int value); void PushAttribute(const char* name, unsigned value); + void PushAttribute(const char* name, int64_t value); void PushAttribute(const char* name, bool value); void PushAttribute(const char* name, double value); /// If streaming, close the Element. @@ -2039,6 +2184,8 @@ namespace tinyxml2 void PushText(int value); /// Add a text node from an unsigned. void PushText(unsigned value); + /// Add a text node from an unsigned. + void PushText(int64_t value); /// Add a text node from a bool. void PushText(bool value); /// Add a text node from a float. @@ -2066,37 +2213,41 @@ namespace tinyxml2 virtual bool Visit(const XMLUnknown& unknown); /** - If in print to memory mode, return a pointer to - the XML file in memory. + If in print to memory mode, return a pointer to + the XML file in memory. */ const char* CStr() const { return _buffer.Mem(); } /** - If in print to memory mode, return the size - of the XML file in memory. (Note the size returned - includes the terminating null.) + If in print to memory mode, return the size + of the XML file in memory. (Note the size returned + includes the terminating null.) 
*/ int CStrSize() const { return _buffer.Size(); } /** - If in print to memory mode, reset the buffer to the - beginning. + If in print to memory mode, reset the buffer to the + beginning. */ void ClearBuffer() { _buffer.Clear(); _buffer.Push(0); + _firstElement = true; } protected: virtual bool CompactMode(const XMLElement&) { return _compactMode; } /** Prints out the space before an element. You may override to change - the space and tabs used. A PrintSpace() override should call Print(). + the space and tabs used. A PrintSpace() override should call Print(). */ virtual void PrintSpace(int depth); void Print(const char* format, ...); + void Write(const char* data, size_t size); + inline void Write(const char* data) { Write(data, strlen(data)); } + void Putc(char ch); void SealElementIfJustOpened(); bool _elementJustOpened; @@ -2120,6 +2271,10 @@ namespace tinyxml2 bool _restrictedEntityFlag[ENTITY_RANGE]; DynArray< char, 20 > _buffer; + + // Prohibit cloning, intentionally not implemented + XMLPrinter(const XMLPrinter&); + XMLPrinter& operator=(const XMLPrinter&); }; diff --git a/src/3rd/Simd/SimdConst.h b/src/3rd/Simd/SimdConst.h index 97e2f5e7..0caa63f8 100644 --- a/src/3rd/Simd/SimdConst.h +++ b/src/3rd/Simd/SimdConst.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar, +* Copyright (c) 2011-2018 Yermalayeu Ihar, * 2014-2015 Antonenka Mikhail. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -90,7 +90,7 @@ namespace Simd namespace Sse2 { using namespace Sse; -#if defined(_MSC_VER) && _MSC_VER >= 1800 && _MSC_VER < 1900 // Visual Studio 2013 compiler bug +#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug using Sse::F; using Sse::DF; using Sse::QF; @@ -165,7 +165,7 @@ namespace Simd namespace Sse3 { using namespace Sse2; -#if defined(_MSC_VER) && _MSC_VER >= 1800 && _MSC_VER < 1900 // Visual Studio 2013 compiler bug +#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug using Sse::F; using Sse::DF; using Sse::QF; @@ -212,7 +212,7 @@ namespace Simd namespace Sse41 { using namespace Ssse3; -#if defined(_MSC_VER) && _MSC_VER >= 1800 && _MSC_VER < 1900 // Visual Studio 2013 compiler bug +#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug using Sse::F; using Sse::DF; using Sse::QF; @@ -241,7 +241,7 @@ namespace Simd namespace Avx2 { using namespace Avx; -#if defined(_MSC_VER) && _MSC_VER >= 1800 && _MSC_VER < 1900 // Visual Studio 2013 compiler bug +#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug using Avx::F; using Avx::DF; using Avx::QF; diff --git a/src/3rd/Simd/SimdDetection.h b/src/3rd/Simd/SimdDetection.h index 7bf8dd90..cc96135f 100644 --- a/src/3rd/Simd/SimdDetection.h +++ b/src/3rd/Simd/SimdDetection.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -40,11 +40,6 @@ namespace Simd typedef Simd::Point Size; typedef Simd::Rectangle Rect; - struct Deletable - { - virtual ~Deletable() {} - }; - struct Data : public Deletable { struct DTreeNode diff --git a/src/3rd/Simd/SimdDetection.hpp b/src/3rd/Simd/SimdDetection.hpp index 9fe896ee..7f53c463 100644 --- a/src/3rd/Simd/SimdDetection.hpp +++ b/src/3rd/Simd/SimdDetection.hpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -206,7 +206,7 @@ namespace Simd ~Detection() { for (size_t i = 0; i < _data.size(); ++i) - ::SimdDetectionFree(_data[i].handle); + ::SimdRelease(_data[i].handle); } /*! @@ -383,7 +383,7 @@ namespace Simd ~Level() { for (size_t i = 0; i < hids.size(); ++i) - ::SimdDetectionFree(hids[i].handle); + ::SimdRelease(hids[i].handle); } }; typedef std::unique_ptr LevelPtr; diff --git a/src/3rd/Simd/SimdEnable.h b/src/3rd/Simd/SimdEnable.h index b1e96d15..14139149 100644 --- a/src/3rd/Simd/SimdEnable.h +++ b/src/3rd/Simd/SimdEnable.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -172,6 +172,21 @@ namespace Simd } const bool Enable = SupportedByCPU() && SupportedByOS(); + + const unsigned int SCR_FTZ = 1 << 15; + + SIMD_INLINE SimdBool GetFlushToZero() + { + return _mm_getcsr() | SCR_FTZ ? 
SimdTrue : SimdFalse; + } + + SIMD_INLINE void SetFlushToZero(SimdBool value) + { + if (value) + _mm_setcsr(_mm_getcsr() | SCR_FTZ); + else + _mm_setcsr(_mm_getcsr() & ~SCR_FTZ); + } } #endif diff --git a/src/3rd/Simd/SimdGemm.h b/src/3rd/Simd/SimdGemm.h new file mode 100644 index 00000000..a71e7489 --- /dev/null +++ b/src/3rd/Simd/SimdGemm.h @@ -0,0 +1,163 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#ifndef __SimdGemm_h__ +#define __SimdGemm_h__ + +#include "Simd/SimdArray.h" +#include "Simd/SimdBase.h" +#include "Simd/SimdParallel.hpp" + +namespace Simd +{ + template class GemmNN + { + public: + typedef void(*Main)(size_t K, T alpha, const T * A, size_t lda, const T * B, size_t ldb, T * C, size_t ldc, TM tail); + typedef void(*Tail)(size_t M, size_t N, size_t K, T alpha, const T * A, size_t lda, const T * B, size_t ldb, T * C, size_t ldc, TM tail); + typedef void(*ScaleC)(size_t M, size_t N, T beta, T * C, size_t ldc); + typedef void(*PackB)(const T * B, size_t ldb, size_t K, size_t N, size_t microN, T * pB); + typedef TM(*TailMask)(ptrdiff_t tail); + + GemmNN(size_t M, size_t N, size_t K, size_t microM, size_t microN, size_t L1, size_t L2, size_t L3, size_t F, + Main kernelMM, Main kernelMT, Tail kernelTM, Tail kernelTT, ScaleC scaleC, PackB packB, TailMask tailMask) + : _M(M) + , _N(N) + , _K(K) + , _microM(microM) + , _microN(microN) + , _F(F) + , _threadNumber(Base::GetThreadNumber()) + , _kernelMM(kernelMM) + , _kernelMT(kernelMT) + , _kernelTM(kernelTM) + , _kernelTT(kernelTT) + , _scaleC(scaleC) + , _packB(packB) + { + + _macroK = L1 / sizeof(T) / _microN; + _macroM = AlignLoAny(L2 / sizeof(T) / _macroK, _microM); + _macroN = AlignLoAny(L3 / sizeof(T) / _macroK, _microN); + if (_N * _M * _K < 256 * 256 * 256 * 2) + _threadNumber = 1; + _pA.resize(_threadNumber); + _pB.resize(_threadNumber); + for (size_t t = 0; t < _threadNumber; ++t) + { + _pA[t].Resize(_macroM * _macroK); + _pB[t].Resize(_macroN * _macroK); + } + size_t NF = AlignLo(_N, _F); + if (tailMask) + { + _main = TM(-1); + _tail = NF == _N ? TM(-1) : tailMask(_N - NF); + } + else + { + _main = TM(_F); + _tail = NF == _N ? 
TM(_F) : TM(_N - NF); + } + } + + void Run(const T * alpha, const T * A, size_t lda, const T * B, size_t ldb, const T * beta, T * C, size_t ldc) + { + Simd::Parallel(0, _N, [&](size_t thread, size_t begin, size_t end) + { + ThreadKernel(end - begin, *alpha, A, lda, B + begin, ldb, *beta, C + begin, ldc, thread); + }, _threadNumber, _microN); + } + + private: + + void ThreadKernel(size_t N, T alpha, const T * A, size_t lda, const T * B, size_t ldb, T beta, T * C, size_t ldc, size_t thread) + { + for (size_t j = 0; j < N; j += _macroN) + { + size_t macroN = Simd::Min(N, j + _macroN) - j; + for (size_t k = 0; k < _K; k += _macroK) + { + size_t macroK = Simd::Min(_K, k + _macroK) - k; + //PackA(A + i * lda, lda, macroM, K, _microM, _A.data); + for (size_t i = 0; i < _M; i += _macroM) + { + size_t macroM = Simd::Min(_M, i + _macroM) - i; + if (k == 0) + _scaleC(macroM, macroN, beta, C + i * ldc + j, ldc); + MacroKernel(macroM, macroN, macroK, alpha, A + i * lda + k, lda, B + k * ldb + j, ldb, beta, C + i * ldc + j, ldc, i == 0, thread); + } + } + } + } + + void MacroKernel(size_t M, size_t N, size_t K, T alpha, const T * A, size_t lda, const T * B, size_t ldb, T beta, T * C, size_t ldc, bool packB, size_t thread) + { + size_t MA = AlignLoAny(M, _microM); + size_t NA = AlignLoAny(N, _microN); + size_t j = 0; + for (; j < NA; j += _microN) + { + T * pB = _pB[thread].data + j * _macroK; + if (packB) + _packB(B + j, ldb, K, _microN, _microN, pB); + size_t i = 0; + for (; i < MA; i += _microM) + _kernelMM(K, alpha, A + i * lda, lda, pB, _microN, C + i * ldc + j, ldc, _main); + if (i < M) + _kernelTM(M - i, _microN, K, alpha, A + i * lda, lda, pB, _microN, C + i * ldc + j, ldc, _main); + } + if (j < N) + { + T * pB = _pB[thread].data + j * _macroK; + if (packB) + _packB(B + j, ldb, K, N - j, _microN, pB); + size_t i = 0; + for (; i < MA; i += _microM) + _kernelMT(K, alpha, A + i * lda, lda, pB, _microN, C + i * ldc + j, ldc, _tail); + if (i < M) + _kernelTT(M - i, NA - j, K, 
alpha, A + i * lda, lda, pB, _microN, C + i * ldc + j, ldc, _tail); + } + } + + typedef std::vector> Arrays; + + Arrays _pA, _pB; + size_t _M, _N, _K, _microM, _microN, _macroM, _macroN, _macroK, _F, _threadNumber; + TM _main, _tail; + Main _kernelMM, _kernelMT; + Tail _kernelTM, _kernelTT; + ScaleC _scaleC; + PackB _packB; + }; + +#ifdef SIMD_AVX_ENABLE + namespace Avx + { + void GemmScaleC(size_t M, size_t N, float beta, float * C, size_t ldc); + + void GemmPackB(const float * B, size_t ldb, size_t K, size_t N, size_t microN, float * pB); + } +#endif//SIMD_AVX_ENABLE +} +#endif//__SimdGemm_h__ diff --git a/src/3rd/Simd/SimdLib.cpp b/src/3rd/Simd/SimdLib.cpp index c7cc2667..0862d1b4 100644 --- a/src/3rd/Simd/SimdLib.cpp +++ b/src/3rd/Simd/SimdLib.cpp @@ -1,8 +1,9 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar, -* 2014-2016 Antonenka Mikhail. +* Copyright (c) 2011-2018 Yermalayeu Ihar, +* 2014-2018 Antonenka Mikhail, +* 2018-2018 Radchenko Andrey. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -48,10 +49,11 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD dwReasonForCall, LPVOID lpReserved) #include "Simd/SimdMemory.h" #include "Simd/SimdEnable.h" -#include "Simd/SimdVersion.h" #include "Simd/SimdConst.h" #include "Simd/SimdLog.h" +#include "Simd/SimdResizer.h" + #include "Simd/SimdBase.h" #include "Simd/SimdSse1.h" #include "Simd/SimdSse2.h" @@ -68,6 +70,15 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD dwReasonForCall, LPVOID lpReserved) #include "Simd/SimdNeon.h" #include "Simd/SimdMsa.h" +#if !defined(SIMD_VERSION) +#include "Simd/SimdVersion.h" +#endif + +SIMD_API const char * SimdVersion() +{ + return SIMD_VERSION; +} + using namespace Simd; SIMD_API int SimdCpuInfo() @@ -118,11 +129,6 @@ SIMD_API int SimdCpuInfo() return info; } -SIMD_API const char * SimdVersion() -{ - return SIMD_VERSION; -} - SIMD_API void * SimdAllocate(size_t size, size_t align) { return Allocate(size, align); @@ -143,6 +149,39 @@ SIMD_API size_t SimdAlignment() return Simd::ALIGNMENT; } +SIMD_API void SimdRelease(void * context) +{ + delete (Deletable*)context; +} + +SIMD_API size_t SimdGetThreadNumber() +{ + return Base::GetThreadNumber(); +} + +SIMD_API void SimdSetThreadNumber(size_t threadNumber) +{ + Base::SetThreadNumber(threadNumber); +} + +SIMD_API SimdBool SimdGetFlushToZero() +{ +#ifdef SIMD_SSE_ENABLE + if (Sse::Enable) + return Sse::GetFlushToZero(); + else +#endif + return SimdFalse; +} + +SIMD_API void SimdSetFlushToZero(SimdBool value) +{ +#ifdef SIMD_SSE_ENABLE + if (Sse::Enable) + Sse::SetFlushToZero(value); +#endif +} + SIMD_API uint32_t SimdCrc32c(const void * src, size_t size) { #ifdef SIMD_SSE42_ENABLE @@ -1600,11 +1639,6 @@ SIMD_API void SimdDetectionLbpDetect16ii(const void * hid, const uint8_t * mask, Base::DetectionLbpDetect16ii(hid, mask, maskStride, left, top, right, bottom, dst, 
dstStride); } -SIMD_API void SimdDetectionFree(void * ptr) -{ - Base::DetectionFree(ptr); -} - SIMD_API void SimdEdgeBackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height, uint8_t * background, size_t backgroundStride) { @@ -1943,6 +1977,21 @@ SIMD_API void SimdSquaredDifferenceSum16f(const uint16_t * a, const uint16_t * b Base::SquaredDifferenceSum16f(a, b, size, sum); } +SIMD_API void SimdCosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance) +{ +#ifdef SIMD_AVX512BW_ENABLE + if (Avx512bw::Enable) + Avx512bw::CosineDistance16f(a, b, size, distance); + else +#endif +#ifdef SIMD_AVX2_ENABLE + if (Avx2::Enable && size >= Avx2::F) + Avx2::CosineDistance16f(a, b, size, distance); + else +#endif + Base::CosineDistance16f(a, b, size, distance); +} + SIMD_API void SimdFloat32ToUint8(const float * src, size_t size, const float * lower, const float * upper, uint8_t * dst) { #ifdef SIMD_AVX512BW_ENABLE @@ -1993,6 +2042,14 @@ SIMD_API void SimdUint8ToFloat32(const uint8_t * src, size_t size, const float * Base::Uint8ToFloat32(src, size, lower, upper, dst); } +typedef void(*SimdCosineDistance32fPtr) (const float * a, const float * b, size_t size, float * distance); +SimdCosineDistance32fPtr simdCosineDistance32f = SIMD_FUNC4(CosineDistance32f, SIMD_AVX512BW_FUNC, SIMD_AVX2_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC); + +SIMD_API void SimdCosineDistance32f(const float * a, const float * b, size_t size, float * distance) +{ + simdCosineDistance32f(a, b, size, distance); +} + SIMD_API void SimdGaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride) { @@ -2029,6 +2086,14 @@ SIMD_API void SimdGaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t Base::GaussianBlur3x3(src, srcStride, width, height, channelCount, dst, dstStride); } +typedef void(*SimdGemm32fNNPtr) (size_t M, size_t N, size_t K, const float * alpha, const float * 
A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc); +SimdGemm32fNNPtr simdGemm32fNN = SIMD_FUNC4(Gemm32fNN, SIMD_AVX512F_FUNC, SIMD_AVX2_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC); + +SIMD_API void SimdGemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc) +{ + simdGemm32fNN(M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); +} + SIMD_API void SimdGrayToBgr(const uint8_t * gray, size_t width, size_t height, size_t grayStride, uint8_t * bgr, size_t bgrStride) { #ifdef SIMD_AVX512BW_ENABLE @@ -2338,29 +2403,29 @@ SIMD_API void SimdHogLiteExtractFeatures(const uint8_t * src, size_t srcStride, Base::HogLiteExtractFeatures(src, srcStride, width, height, cell, features, featuresStride); } -SIMD_API void SimdHogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) +SIMD_API void SimdHogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { #ifdef SIMD_AVX512BW_ENABLE if (Avx512bw::Enable) - Avx512bw::HogLiteFilterFeatures(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + Avx512bw::HogLiteFilterFeatures(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); else #endif #ifdef SIMD_AVX2_ENABLE if (Avx2::Enable) - Avx2::HogLiteFilterFeatures(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + Avx2::HogLiteFilterFeatures(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterWidth, filterHeight, mask, 
maskStride, dst, dstStride); else #endif #ifdef SIMD_AVX_ENABLE if (Avx::Enable) - Avx::HogLiteFilterFeatures(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + Avx::HogLiteFilterFeatures(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); else #endif #ifdef SIMD_SSE41_ENABLE if (Sse41::Enable) - Sse41::HogLiteFilterFeatures(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + Sse41::HogLiteFilterFeatures(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); else #endif - Base::HogLiteFilterFeatures(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + Base::HogLiteFilterFeatures(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); } SIMD_API void SimdHogLiteResizeFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight) @@ -3894,6 +3959,36 @@ SIMD_API void SimdResizeBilinear(const uint8_t *src, size_t srcWidth, size_t src Base::ResizeBilinear(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride, channelCount); } +SIMD_API void * SimdResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method) +{ +#ifdef SIMD_AVX512F_ENABLE + if (Avx512f::Enable) + return Avx512f::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); + else +#endif +#ifdef SIMD_AVX2_ENABLE + if (Avx2::Enable) + return Avx2::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); + else +#endif +#ifdef SIMD_AVX_ENABLE + if (Avx::Enable) + return Avx::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); + else +#endif 
+#ifdef SIMD_SSE_ENABLE + if (Sse::Enable) + return Sse::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); + else +#endif + return Base::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method); +} + +SIMD_API void SimdResizerRun(const void * resizer, const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) +{ + ((const Resizer*)resizer)->Run(src, srcStride, dst, dstStride); +} + SIMD_API void SimdSegmentationChangeIndex(uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t oldIndex, uint8_t newIndex) { #ifdef SIMD_AVX512BW_ENABLE @@ -4672,6 +4767,31 @@ SIMD_API void SimdSquareSum(const uint8_t * src, size_t stride, size_t width, si Base::SquareSum(src, stride, width, height, sum); } +SIMD_API void SimdValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum) +{ +#ifdef SIMD_AVX512BW_ENABLE + if (Avx512bw::Enable) + Avx512bw::ValueSquareSum(src, stride, width, height, valueSum, squareSum); + else +#endif +#ifdef SIMD_AVX2_ENABLE + if(Avx2::Enable && width >= Avx2::A) + Avx2::ValueSquareSum(src, stride, width, height, valueSum, squareSum); + else +#endif +#ifdef SIMD_SSE2_ENABLE + if(Sse2::Enable && width >= Sse2::A) + Sse2::ValueSquareSum(src, stride, width, height, valueSum, squareSum); + else +#endif +#ifdef SIMD_NEON_ENABLE + if (Neon::Enable && width >= Neon::A) + Neon::ValueSquareSum(src, stride, width, height, valueSum, squareSum); + else +#endif + Base::ValueSquareSum(src, stride, width, height, valueSum, squareSum); +} + SIMD_API void SimdCorrelationSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum) { #ifdef SIMD_AVX512BW_ENABLE @@ -4763,6 +4883,38 @@ SIMD_API void SimdSvmSumLinear(const float * x, const float * svs, const float * Base::SvmSumLinear(x, svs, weights, length, count, sum); } +typedef void(*SimdSynetAddBiasPtr) (const float * bias, size_t count, size_t size, float * dst); 
+volatile SimdSynetAddBiasPtr simdSynetAddBias = SIMD_FUNC3(SynetAddBias, SIMD_AVX512F_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC); + +SIMD_API void SimdSynetAddBias(const float * bias, size_t count, size_t size, float * dst) +{ + simdSynetAddBias(bias, count, size, dst); +} + +typedef void(*SimdSynetEltwiseLayerForwardPtr) (float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst); +volatile SimdSynetEltwiseLayerForwardPtr simdSynetEltwiseLayerForward = SIMD_FUNC4(SynetEltwiseLayerForward, SIMD_AVX512F_FUNC, SIMD_AVX2_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC); + +SIMD_API void SimdSynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst) +{ + simdSynetEltwiseLayerForward(src, weight, count, size, type, dst); +} + +typedef void(*SimdSynetLrnLayerCrossChannelsPtr) (const float * src, size_t half, size_t count, size_t size, const float * k, float * dst); +volatile SimdSynetLrnLayerCrossChannelsPtr simdSynetLrnLayerCrossChannels = SIMD_FUNC3(SynetLrnLayerCrossChannels, SIMD_AVX512F_FUNC, SIMD_AVX2_FUNC, SIMD_SSE2_FUNC); + +SIMD_API void SimdSynetLrnLayerCrossChannels(const float * src, size_t half, size_t count, size_t size, const float * k, float * dst) +{ + simdSynetLrnLayerCrossChannels(src, half, count, size, k, dst); +} + +typedef void(*SimdSynetScaleLayerForwardPtr) (const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst); +volatile SimdSynetScaleLayerForwardPtr simdSynetScaleLayerForward = SIMD_FUNC4(SynetScaleLayerForward, SIMD_AVX512F_FUNC, SIMD_AVX2_FUNC, SIMD_AVX_FUNC, SIMD_SSE_FUNC); + +SIMD_API void SimdSynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst) +{ + simdSynetScaleLayerForward(src, scale, bias, count, size, dst); +} + SIMD_API void SimdTextureBoostedSaturatedGradient(const uint8_t * src, 
size_t srcStride, size_t width, size_t height, uint8_t saturation, uint8_t boost, uint8_t * dx, size_t dxStride, uint8_t * dy, size_t dyStride) { diff --git a/src/3rd/Simd/SimdLib.h b/src/3rd/Simd/SimdLib.h index 30b1bdf6..c584894a 100644 --- a/src/3rd/Simd/SimdLib.h +++ b/src/3rd/Simd/SimdLib.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar, +* Copyright (c) 2011-2018 Yermalayeu Ihar, * 2014-2016 Antonenka Mikhail. * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -66,6 +66,15 @@ typedef signed __int64 int64_t; typedef unsigned __int64 uint64_t; #endif +/*! @ingroup c_types + Describes boolean type. +*/ +typedef enum +{ + SimdFalse = 0, /*!< False value. */ + SimdTrue = 1, /*!< True value. */ +} SimdBool; + /*! @ingroup c_types Describes types of SIMD extensions which supported by current CPU and Simd Library (see function ::SimdCpuInfo). */ @@ -324,6 +333,58 @@ extern "C" */ SIMD_API size_t SimdAlignment(); + /*! @ingroup memory + + \fn void SimdRelease(void * context); + + \short Releases context created with using of Simd Library API. + + \note This function releases a context created by functions ::SimdDetectionLoadA and ::SimdDetectionInit. + + \param [in] context - a context to be released. + */ + SIMD_API void SimdRelease(void * context); + + /*! @ingroup thread + + \fn size_t SimdGetThreadNumber(); + + \short Gets number of threads used by Simd Library to parallelize some algorithms. + + \return current thread number. + */ + SIMD_API size_t SimdGetThreadNumber(); + + /*! @ingroup thread + + \fn void SimdSetThreadNumber(size_t threadNumber); + + \short Sets number of threads used by Simd Library to parallelize some algorithms. + + \param [in] threadNumber - a number of threads. + */ + SIMD_API void SimdSetThreadNumber(size_t threadNumber); + + /*! @ingroup cpu_flags + + \fn SimdBool SimdGetFlushToZero(); + + \short Gets current CPU Flush-To-Zero (FTZ) flag. 
It is used in order to process subnormal numbers. + + \return current FTZ flag. + */ + SIMD_API SimdBool SimdGetFlushToZero(); + + /*! @ingroup cpu_flags + + \fn void SimdSetFlushToZero(SimdBool value); + + \short Sets current CPU Flush-To-Zero (FTZ) flag. It is used in order to process subnormal numbers. + + \param [in] value - a value of Flush-To-Zero (FTZ) flag. + */ + SIMD_API void SimdSetFlushToZero(SimdBool value); + /*! @ingroup hash \fn uint32_t SimdCrc32c(const void * src, size_t size); @@ -1591,7 +1652,7 @@ extern "C" \param [in] path - a path to cascade. \return a pointer to loaded cascade. On error it returns NULL. - This pointer is used in functions ::SimdDetectionInfo and ::SimdDetectionInit, and must be released with using function ::SimdDetectionFree. + This pointer is used in functions ::SimdDetectionInfo and ::SimdDetectionInit, and must be released with using of function ::SimdRelease. */ SIMD_API void * SimdDetectionLoadA(const char * path); @@ -1635,7 +1696,7 @@ extern "C" \return a pointer to hidden cascade. On error it returns NULL. This pointer is used in functions ::SimdDetectionPrepare, ::SimdDetectionHaarDetect32fp, ::SimdDetectionHaarDetect32fi, ::SimdDetectionLbpDetect32fp, ::SimdDetectionLbpDetect32fi, ::SimdDetectionLbpDetect16ip and ::SimdDetectionLbpDetect16ii. - It must be released with using function ::SimdDetectionFree. + It must be released with using of function ::SimdRelease. */ SIMD_API void * SimdDetectionInit(const void * data, uint8_t * sum, size_t sumStride, size_t width, size_t height, uint8_t * sqsum, size_t sqsumStride, uint8_t * tilted, size_t tiltedStride, int throughColumn, int int16); @@ -1799,17 +1860,6 @@ extern "C" SIMD_API void SimdDetectionLbpDetect16ii(const void * hid, const uint8_t * mask, size_t maskStride, ptrdiff_t left, ptrdiff_t top, ptrdiff_t right, ptrdiff_t bottom, uint8_t * dst, size_t dstStride); - /*! 
@ingroup object_detection - - \fn void SimdDetectionFree(void * ptr); - - \short Frees pointers which was received with using of functions ::SimdDetectionLoadA and ::SimdDetectionInit. - - \note This function is used for implementation of Simd::Detection. - - \param [in] ptr - a pointer which was received with using of functions ::SimdDetectionLoadA and ::SimdDetectionInit. - */ SIMD_API void SimdDetectionFree(void * ptr); - /*! @ingroup edge_background \fn void SimdEdgeBackgroundGrowRangeSlow(const uint8_t * value, size_t valueStride, size_t width, size_t height, uint8_t * background, size_t backgroundStride); @@ -2153,6 +2203,26 @@ extern "C" */ SIMD_API void SimdSquaredDifferenceSum16f(const uint16_t * a, const uint16_t * b, size_t size, float * sum); + /*! @ingroup float16 + + \fn void SimdCosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance); + + \short Calculates cosine distance of two 16-bit float arrays. + + All arrays must have the same size. + + Algorithm description: + \verbatim + distance = 1 - Sum(a[i]*b[i])/Sqrt(Sum(a[i]*a[i])*Sum(b[i]*b[i])); + \endverbatim + + \param [in] a - a pointer to the first 16-bit float array. + \param [in] b - a pointer to the second 16-bit float array. + \param [in] size - a size of arrays. + \param [out] distance - a pointer to 32-bit float with cosine distance. + */ + SIMD_API void SimdCosineDistance16f(const uint16_t * a, const uint16_t * b, size_t size, float * distance); + /*! @ingroup other_conversion \fn void SimdFloat32ToUint8(const float * src, size_t size, const float * lower, const float * upper, uint8_t * dst); @@ -2191,6 +2261,26 @@ extern "C" */ SIMD_API void SimdUint8ToFloat32(const uint8_t * src, size_t size, const float * lower, const float * upper, float * dst); + /*! @ingroup correlation + + \fn void SimdCosineDistance32f(const float * a, const float * b, size_t size, float * distance); + + \short Calculates cosine distance of two 32-bit float arrays. 
+ + All arrays must have the same size. + + Algorithm description: + \verbatim + distance = 1 - Sum(a[i]*b[i])/Sqrt(Sum(a[i]*a[i])*Sum(b[i]*b[i])); + \endverbatim + + \param [in] a - a pointer to the first 32-bit float array. + \param [in] b - a pointer to the second 32-bit float array. + \param [in] size - a size of arrays. + \param [out] distance - a pointer to 32-bit float with cosine distance. + */ + SIMD_API void SimdCosineDistance32f(const float * a, const float * b, size_t size, float * distance); + /*! @ingroup other_filter \fn void SimdGaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride); @@ -2219,6 +2309,30 @@ extern "C" SIMD_API void SimdGaussianBlur3x3(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t channelCount, uint8_t * dst, size_t dstStride); + /*! @ingroup matrix + + \fn void SimdGemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc); + + \short Performs general matrix multiplication (for 32-bit float numbers). + + C(M, N) = alpha*A(M, K)*B(K, N) + beta*C(M, N); + + \note This function supports multithreading (See functions ::SimdGetThreadNumber and ::SimdSetThreadNumber). + + \param [in] M - a height of A and C matrices. + \param [in] N - a width of B and C matrices. + \param [in] K - a width of A and height of B matrices. + \param [in] alpha - a pointer to multiplier of the first term. + \param [in] A - a pointer to input A matrix. + \param [in] lda - a leading dimension of A matrix. + \param [in] B - a pointer to input B matrix. + \param [in] ldb - a leading dimension of B matrix. + \param [in] beta - a pointer to multiplier of the second term. + \param [out] C - a pointer to output C matrix. + \param [in] ldc - a leading dimension of C matrix.
+ */ + SIMD_API void SimdGemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc); + /*! @ingroup gray_conversion \fn void SimdGrayToBgr(const uint8_t * gray, size_t width, size_t height, size_t grayStride, uint8_t * bgr, size_t bgrStride); @@ -2521,7 +2635,7 @@ extern "C" /*! @ingroup hog - \fn void SimdHogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride); + \fn void SimdHogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride); \short Applies filter to lite HOG features. @@ -2531,8 +2645,8 @@ extern "C" \verbatim if(mask[x, y]) sum = 0; - for(dy = 0; dy < filterSize; dy++) - for(dx = 0; dx < filterSize*featureSize; dx++) + for(dy = 0; dy < filterHeight; dy++) + for(dx = 0; dx < filterWidth*featureSize; dx++) sum += src[x*featureSize + dx, y + dy]*filter[dx, dy]; dst[x, y] = sum; else @@ -2546,15 +2660,16 @@ extern "C" \param [in] featureSize - a size of cell with features. It must be 8 or 16. \param [in] filter - a pointer to the 32-bit float array with filter values. Array must have size equal to filterSize*filterSize*featureSize. - \param [in] filterSize - a size (width and height) of used filter. - \param [in] mask - a pointer to the 32-bit integer array with mask (0 or -1). + \param [in] filterWidth - a width of used filter. + \param [in] filterHeight - a height of used filter. + \param [in] mask - a pointer to the 32-bit integer array with mask (0 or -1). Pointer can be null otherwise the array must have size greater then (srcHeight - filterSize)*(srcWidth - filterSize). 
A function ::SimdHogLiteCreateMask is usefull in order to create this mask. \param [in] maskStride - a row size of mask array. \param [out] dst - a pointer to output buffer with result of filtration. Array must have size greater then (srcHeight - filterSize)*(srcWidth - filterSize). \param [in] dstStride - a row size of the output buffer with result of filtration. */ - SIMD_API void SimdHogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride); + SIMD_API void SimdHogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride); /*! @ingroup hog @@ -4126,6 +4241,61 @@ extern "C" SIMD_API void SimdResizeBilinear(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount); + /*! @ingroup resizing + Describes resized image channel types. + */ + typedef enum + { + /*! 8-bit integer channel type. */ + SimdResizeChannelByte, + /*! 32-bit float channel type. */ + SimdResizeChannelFloat, + } SimdResizeChannelType; + + /*! @ingroup resizing + Describes methods used in order to resize image. + */ + typedef enum + { + /*! Bilinear method. */ + SimdResizeMethodBilinear, + /*! caffe::interp compatible method. */ + SimdResizeMethodCaffeInterp, + } SimdResizeMethodType; + + /*! @ingroup resizing + + \fn void * SimdResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method); + + \short Creates resize context. + + \param [in] srcX - a width of the input image. + \param [in] srcY - a height of the input image.
+ \param [in] dstX - a width of the output image. + \param [in] dstY - a height of the output image. + \param [in] channels - a channel number of input and output image. + \param [in] type - a type of input and output image channel. + \param [in] method - a method used in order to resize image. + \return a pointer to resize context. On error it returns NULL. + This pointer is used in functions ::SimdResizerRun. + It must be released with using of function ::SimdRelease. + */ + SIMD_API void * SimdResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method); + + /*! @ingroup resizing + + \fn void SimdResizerRun(const void * resizer, const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride); + + \short Performs image resizing. + + \param [in] resizer - a resize context. It must be created by function ::SimdResizerInit and released by function ::SimdRelease. + \param [in] src - a pointer to pixels data of the original input image. + \param [in] srcStride - a row size (in bytes) of the input image. + \param [out] dst - a pointer to pixels data of the resized output image. + \param [in] dstStride - a row size (in bytes) of the output image. + */ + SIMD_API void SimdResizerRun(const void * resizer, const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride); + /*! @ingroup segmentation \fn void SimdSegmentationChangeIndex(uint8_t * mask, size_t stride, size_t width, size_t height, uint8_t oldIndex, uint8_t newIndex); @@ -4768,8 +4938,26 @@ extern "C" \param [in] height - an image height. \param [out] sum - the result sum. */ + SIMD_API void SimdSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); + + /*! 
@ingroup other_statistic + + \fn void SimdValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum); + + \short Gets sum and squared sum of value of pixels for gray 8-bit image. + \note This function has a C++ wrappers: Simd::ValueSquareSum(const View& src, uint64_t & valueSum, uint64_t & squareSum). + + \param [in] src - a pointer to pixels data of the image. + \param [in] stride - a row size of the image. + \param [in] width - an image width. + \param [in] height - an image height. + \param [out] valueSum - the result value sum. + \param [out] squareSum - the result square sum. + */ + SIMD_API void SimdValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum); + /*! @ingroup other_statistic \fn void SimdCorrelationSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum); @@ -4840,6 +5028,137 @@ extern "C" */ SIMD_API void SimdSvmSumLinear(const float * x, const float * svs, const float * weights, size_t length, size_t count, float * sum); + /*! @ingroup synet + + \fn void SimdSynetAddBias(const float * bias, size_t count, size_t size, float * dst); + + \short Adds a bias to given vector. + + Algorithm's details: + \verbatim + for(i = 0; i < count; ++i) + for(j = 0; j < size; ++j) + dst[i*size + j] += bias[i]; + \endverbatim + + \note This function is used in Synet Framework. + + \param [in] bias - a pointer to the 32-bit float array with bias coefficients. + \param [in] count - a size of bias array. + \param [in] size - an internal size of bias addition. + \param [in, out] dst - a pointer to cumulative 32-bit float array. The size of the array must be equal to count*size. + */ + SIMD_API void SimdSynetAddBias(const float * bias, size_t count, size_t size, float * dst); + + /*! @ingroup synet + Describes operation type used in function ::SimdSynetEltwiseLayerForward. 
+ */ + typedef enum + { + SimdSynetEltwiseOperationProduct, /*!< Product. */ + SimdSynetEltwiseOperationSum, /*!< Weighted sum. */ + SimdSynetEltwiseOperationMax, /*!< Maximum. */ + } SimdSynetEltwiseOperationType; + + /*! @ingroup synet + + \fn void SimdSynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst); + + \short This function is used for forward propagation of EltwiseLayer. + + Algorithm's details for ::SimdSynetEltwiseOperationProduct: + \verbatim + for(j = 0; j < size; ++j) + dst[j] = 1; + for(i = 0; i < count; ++i) + for(j = 0; j < size; ++j) + dst[j] *= src[i][j]; + \endverbatim + + Algorithm's details for ::SimdSynetEltwiseOperationSum: + \verbatim + for(j = 0; j < size; ++j) + dst[j] = 0; + for(i = 0; i < count; ++i) + for(j = 0; j < size; ++j) + dst[j] += src[i][j]*weight[i]; + \endverbatim + + Algorithm's details for ::SimdSynetEltwiseOperationMax: + \verbatim + for(j = 0; j < size; ++j) + dst[j] = -FLT_MAX; + for(i = 0; i < count; ++i) + for(j = 0; j < size; ++j) + dst[j] = max(dst[j], src[i][j]); + \endverbatim + + \note This function is used in Synet Framework. + + \param [in] src - a pointer to pointers to the input 32-bit float arrays. + \param [in] weight - a pointer to the 32-bit float array with sum coefficients. It is needed only for ::SimdSynetEltwiseOperationSum operation type otherwise it can be NULL. + \param [in] count - a count of input arrays. Must be at least 2. + \param [in] size - a size of the input and output arrays. + \param [in] type - a type of operation (see ::SimdSynetEltwiseOperationType). + \param [out] dst - a pointer to the output 32-bit float array. + */ + SIMD_API void SimdSynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst); + + /*!
@ingroup synet + + \fn void SimdSynetLrnLayerCrossChannels(const float * src, size_t half, size_t count, size_t size, const float * k, float * dst); + + \short This function is used for forward propagation of LrnLayer (cross channels normalization). + + Algorithm's details: + \verbatim + for(i = 0; i < count; ++i) + for(j = 0; j < size; ++j) + { + lo = Max(0, i - half); + ln = Min(count, i + half + 1); + sum = 0; + for(l = lo; l < ln; ++l) + sum += Square(src[l*size + j]); + dst[i*size + j] = src[i*size + j]*Pow(k[0] + sum*k[1], k[2]); + } + \endverbatim + + \note This function is used in Synet Framework. + + \param [in] src - a pointer to the input 32-bit float array. The size of the array must be equal to count*size. + \param [in] half - a local normalization half size. + \param [in] count - a channels count. + \param [in] size - an internal size of the operation. + \param [in] k - a pointer to the 32-bit float array with 3 coefficients (see algorithm details). + \param [out] dst - a pointer to the output 32-bit float array. The size of the array must be equal to count*size. + */ + SIMD_API void SimdSynetLrnLayerCrossChannels(const float * src, size_t half, size_t count, size_t size, const float * k, float * dst); + + /*! @ingroup synet + + \fn void SimdSynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst); + + \short This function is used for forward propagation of ScaleLayer. + + Algorithm's details: + \verbatim + for(i = 0; i < count; ++i) + for(j = 0; j < size; ++j) + dst[i*size + j] = src[i*size + j]*scale[i] + (bias ? bias[i] : 0); + \endverbatim + + \note This function is used in Synet Framework. + + \param [in] src - a pointer to the input 32-bit float array. The size of the array must be equal to count*size. + \param [in] scale - a pointer to the 32-bit float array with scale coefficients. + \param [in] bias - a pointer to the 32-bit float array with bias coefficients. Can be NULL. 
+ \param [in] count - a size of scale and bias arrays. + \param [in] size - an internal size of the operation. + \param [out] dst - a pointer to the output 32-bit float array. The size of the array must be equal to count*size. + */ + SIMD_API void SimdSynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst); + /*! @ingroup texture_estimation \fn void SimdTextureBoostedSaturatedGradient(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t saturation, uint8_t boost, uint8_t * dx, size_t dxStride, uint8_t * dy, size_t dyStride); diff --git a/src/3rd/Simd/SimdLib.hpp b/src/3rd/Simd/SimdLib.hpp index 4896360f..74bba30f 100644 --- a/src/3rd/Simd/SimdLib.hpp +++ b/src/3rd/Simd/SimdLib.hpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar, +* Copyright (c) 2011-2018 Yermalayeu Ihar, * 2014-2016 Antonenka Mikhail. * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -2592,9 +2592,9 @@ namespace Simd \note This function is a C++ wrapper for function ::SimdResizeBilinear. \param [in] src - an original input image. - \param [out] dst - a reduced output image. + \param [out] dst - a resized output image. */ - template class A> SIMD_INLINE void ResizeBilinear(const View& src, View& dst) + template class A> SIMD_INLINE void ResizeBilinear(const View & src, View & dst) { assert(src.format == dst.format && src.ChannelSize() == 1); @@ -2609,6 +2609,43 @@ namespace Simd } } + /*! @ingroup resizing + + \fn void ResizeAreaGray(const View & src, View & dst) + + \short Performs resizing of input image with using area interpolation. + + All images must have the same format (8-bit gray). + + \param [in] src - an original input image. + \param [out] dst - a resized output image. 
+ */ + template class A> SIMD_INLINE void ResizeAreaGray(const View & src, View & dst) + { + assert(src.format == dst.format && src.format == View::Gray8); + + if (EqualSize(src, dst)) + { + Copy(src, dst); + } + else + { + size_t level = 0; + for (; (dst.width << (level + 1)) < (size_t)src.width; level++); + Point size = src.Size() << level; + if (level) + { + Pyramid pyramid(size, level + 1); + Simd::ResizeBilinear(src, pyramid[0]); + for (size_t i = 0; i < level; ++i) + Simd::ReduceGray(pyramid.At(i), pyramid.At(i + 1), ::SimdReduce2x2); + Simd::Copy(pyramid[level], dst); + } + else + Simd::ResizeBilinear(src, dst); + } + } + /*! @ingroup segmentation \fn void SegmentationChangeIndex(View & mask, uint8_t oldIndex, uint8_t newIndex) @@ -3227,6 +3264,25 @@ namespace Simd SimdSquareSum(src.data, src.stride, src.width, src.height, &sum); } + + /*! @ingroup other_statistic + + \fn void ValueSquareSum(const View& src, uint64_t & valueSum, uint64_t & squareSum) + + \short Gets sum and sum of squared value of pixels for gray 8-bit image. + + \note This function is a C++ wrapper for function ::SimdValueSquareSum. + + \param [in] src - an input image. + \param [out] valueSum - a result value sum. + \param [out] squareSum - a result square sum. + */ + template class A> SIMD_INLINE void ValueSquareSum(const View& src, uint64_t & valueSum, uint64_t & squareSum) + { + assert(src.format == View::Gray8); + + SimdValueSquareSum(src.data, src.stride, src.width, src.height, &valueSum, &squareSum); + } /*! @ingroup other_statistic diff --git a/src/3rd/Simd/SimdLoad.h b/src/3rd/Simd/SimdLoad.h index 3ef69919..c173d816 100644 --- a/src/3rd/Simd/SimdLoad.h +++ b/src/3rd/Simd/SimdLoad.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -42,6 +42,11 @@ namespace Simd { return _mm_load_ps(p); } + + SIMD_INLINE __m128 Load(const float * p0, const float * p1) + { + return _mm_loadh_pi(_mm_loadl_pi(_mm_undefined_ps(), (__m64*)p0), (__m64*)p1); + } } #endif//SIMD_SSE_ENABLE @@ -148,7 +153,7 @@ namespace Simd #ifdef SIMD_SSE3_ENABLE namespace Sse3 { -#if defined(_MSC_VER) && _MSC_VER >= 1800 && _MSC_VER < 1900 // Visual Studio 2013 compiler bug +#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug using Sse::Load; using Sse2::Load; #endif @@ -158,7 +163,7 @@ namespace Simd #ifdef SIMD_SSE41_ENABLE namespace Sse41 { -#if defined(_MSC_VER) && _MSC_VER >= 1800 && _MSC_VER < 1900 // Visual Studio 2013 compiler bug +#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug using Sse::Load; using Sse2::Load; #endif @@ -184,6 +189,11 @@ namespace Simd { return _mm256_insertf128_ps(_mm256_castps128_ps256(Sse::Load(p0)), Sse::Load(p1), 1); } + + SIMD_INLINE __m256 Load(const float * p0, const float * p1, const float * p2, const float * p3) + { + return _mm256_insertf128_ps(_mm256_castps128_ps256(Sse::Load(p0, p1)), Sse::Load(p2, p3), 1); + } } #endif//SIMD_AVX_ENABLE diff --git a/src/3rd/Simd/SimdMath.h b/src/3rd/Simd/SimdMath.h index b26fef98..e5cd3e99 100644 --- a/src/3rd/Simd/SimdMath.h +++ b/src/3rd/Simd/SimdMath.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -260,15 +260,6 @@ namespace Simd gradient[offset] += d*d; weight[offset] -= alpha * d / ::sqrt(gradient[offset] + epsilon); } - - SIMD_INLINE float Pow(float basis, float exponent) - { -#if defined(__GNUC__) && defined(SIMD_X86_ENABLE) - -#else - return ::expf(::logf(basis)*exponent); -#endif - } } #ifdef SIMD_SSE_ENABLE @@ -296,16 +287,16 @@ namespace Simd return _mm_or_ps(_mm_and_ps(mask, positive), _mm_andnot_ps(mask, negative)); } - SIMD_INLINE __m128 RightNotZero(size_t count) + SIMD_INLINE __m128 RightNotZero(ptrdiff_t count) { const int32_t mask[DF] = { 0, 0, 0, 0, -1, -1, -1, -1 }; - return _mm_loadu_ps((float*)(mask + count)); + return _mm_loadu_ps((float*)(mask + Simd::RestrictRange(count, 0, F))); } - SIMD_INLINE __m128 LeftNotZero(size_t count) + SIMD_INLINE __m128 LeftNotZero(ptrdiff_t count) { const int32_t mask[DF] = { -1, -1, -1, -1, 0, 0, 0, 0 }; - return _mm_loadu_ps((float*)(mask + 4 - count)); + return _mm_loadu_ps((float*)(mask + F - Simd::RestrictRange(count, 0, F))); } template SIMD_INLINE __m128 Masked(const __m128 & value, const __m128 & mask); @@ -497,7 +488,7 @@ namespace Simd #ifdef SIMD_SSE3_ENABLE namespace Sse3 { -#if defined(_MSC_VER) && _MSC_VER >= 1800 && _MSC_VER < 1900 // Visual Studio 2013 compiler bug +#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug using Sse::RightNotZero; #endif } @@ -530,7 +521,7 @@ namespace Simd #ifdef SIMD_SSE41_ENABLE namespace Sse41 { -#if defined(_MSC_VER) && _MSC_VER >= 1800 && _MSC_VER < 1900 // Visual Studio 2013 compiler bug +#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug using Sse::RightNotZero; #endif @@ -583,10 +574,16 @@ namespace Simd return _mm256_mul_ps(_mm256_rsqrt_ps(_mm256_max_ps(value, _mm256_set1_ps(0.00000001f))), 
value); } - SIMD_INLINE __m256 RightNotZero(size_t count) + SIMD_INLINE __m256 RightNotZero(ptrdiff_t count) { const int32_t mask[DF] = { 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1 }; - return _mm256_loadu_ps((float*)(mask + count)); + return _mm256_loadu_ps((float*)(mask + Simd::RestrictRange(count, 0, F))); + } + + SIMD_INLINE __m256 LeftNotZero(ptrdiff_t count) + { + const int32_t mask[DF] = { -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0 }; + return _mm256_loadu_ps((float*)(mask + F - Simd::RestrictRange(count, 0, F))); } SIMD_INLINE __m256 PermutedHorizontalAdd(__m256 a, __m256 b) @@ -618,7 +615,7 @@ namespace Simd #ifdef SIMD_AVX2_ENABLE namespace Avx2 { -#if defined(_MSC_VER) && _MSC_VER >= 1800 && _MSC_VER < 1900 // Visual Studio 2013 compiler bug +#if defined(_MSC_VER) && _MSC_VER >= 1700 && _MSC_VER < 1900 // Visual Studio 2012/2013 compiler bug using Avx::RightNotZero; #endif diff --git a/src/3rd/Simd/SimdMemory.h b/src/3rd/Simd/SimdMemory.h index 1530ad56..a4b49cd8 100644 --- a/src/3rd/Simd/SimdMemory.h +++ b/src/3rd/Simd/SimdMemory.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * 2016-2016 Sintegrial Technologies. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -30,6 +30,16 @@ namespace Simd { + SIMD_INLINE size_t AlignHiAny(size_t size, size_t align) + { + return (size + align - 1) / align * align; + } + + SIMD_INLINE size_t AlignLoAny(size_t size, size_t align) + { + return size / align * align; + } + SIMD_INLINE size_t AlignHi(size_t size, size_t align) { return (size + align - 1) & ~(align - 1); @@ -91,6 +101,11 @@ namespace Simd #endif } + struct Deletable + { + virtual ~Deletable() {} + }; + #ifdef SIMD_SSE_ENABLE namespace Sse { diff --git a/src/3rd/Simd/SimdNeon.h b/src/3rd/Simd/SimdNeon.h index b6d6828e..ea5e901a 100644 --- a/src/3rd/Simd/SimdNeon.h +++ b/src/3rd/Simd/SimdNeon.h @@ -1,7 +1,8 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar, +* 2018-2018 Radchenko Andrey. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -407,6 +408,8 @@ namespace Simd void ValueSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); void SquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); + + void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum); void CorrelationSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum); diff --git a/src/3rd/Simd/SimdNeonResizeBilinear.cpp b/src/3rd/Simd/SimdNeonResizeBilinear.cpp index 8fc9512d..7b67a4b1 100644 --- a/src/3rd/Simd/SimdNeonResizeBilinear.cpp +++ b/src/3rd/Simd/SimdNeonResizeBilinear.cpp @@ -1,7 +1,8 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar, +* 2018-2018 Radchenko Andrey. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -58,6 +59,40 @@ namespace Simd private: void *_p; }; + + struct Index + { + int src, dst; + uint8_t shuffle[Simd::Neon::A]; + }; + + struct BufferG + { + BufferG(size_t width, size_t blocks, size_t height) + { + _p = Simd::Allocate(3 * width + sizeof(int) * 2 * height + blocks * sizeof(Index) + 2 * A); + bx[0] = (uint8_t*)_p; + bx[1] = bx[0] + width + A; + ax = bx[1] + width + A; + ix = (Index*)(ax + width); + iy = (int*)(ix + blocks); + ay = iy + height; + } + + ~BufferG() + { + Free(_p); + } + + uint8_t * bx[2]; + uint8_t * ax; + Index * ix; + int * ay; + int * iy; + private: + void *_p; + }; + } template void EstimateAlphaIndexX(size_t srcSize, size_t dstSize, int * indexes, uint8_t * alphas) @@ -91,6 +126,55 @@ namespace Simd } } + void EstimateAlphaIndexX(int srcSize, int dstSize, Index * indexes, uint8_t * alphas, size_t & blockCount) + { + float scale = (float)srcSize / dstSize; + int block = 0; + indexes[0].src = 0; + indexes[0].dst = 0; + for (int dstIndex = 0; dstIndex < dstSize; ++dstIndex) + { + float alpha = (float)((dstIndex + 0.5)*scale - 0.5); + int srcIndex = (int)::floor(alpha); + alpha -= srcIndex; + + if (srcIndex < 0) + { + srcIndex = 0; + alpha = 0; + } + + if (srcIndex > srcSize - 2) + { + srcIndex = srcSize - 2; + alpha = 1; + } + + int dst = 2 * dstIndex - indexes[block].dst; + int src = srcIndex - indexes[block].src; + if (src >= A - 1 || dst >= A) + { + block++; + indexes[block].src = Simd::Min(srcIndex, srcSize - (int)A); + indexes[block].dst = 2 * dstIndex; + dst = 0; + src = srcIndex - indexes[block].src; + } + indexes[block].shuffle[dst] = src; + indexes[block].shuffle[dst + 1] = src + 1; + + alphas[1] = (uint8_t)(alpha * Base::FRACTION_RANGE + 0.5); + alphas[0] = (uint8_t)(Base::FRACTION_RANGE - alphas[1]); + alphas += 2; + } + blockCount = block + 1; + } + + SIMD_INLINE 
size_t BlockCountMax(size_t src, size_t dst) + { + return (size_t)Simd::Max(::ceil(float(src) / (A - 1)), ::ceil(float(dst) / HA)); + } + template void InterpolateX(const uint8_t * alpha, uint8_t * buffer); template <> SIMD_INLINE void InterpolateX<1>(const uint8_t * alpha, uint8_t * buffer) @@ -172,7 +256,7 @@ namespace Simd size_t size = 2 * dstWidth*channelCount; size_t bufferSize = AlignHi(dstWidth, A)*channelCount * 2; size_t alignedSize = AlignHi(size, DA) - DA; - const size_t step = A*channelCount; + const size_t step = A * channelCount; Buffer buffer(bufferSize, dstWidth, dstHeight); @@ -221,6 +305,84 @@ namespace Simd } } + SIMD_INLINE void LoadGray(const uint8_t * src, const Index & index, uint8_t * dst) + { + + uint8x16_t _src = vld1q_u8(src + index.src); + uint8x16_t _shuffle = vld1q_u8(index.shuffle); + + uint8x8x2_t src1; + src1.val[0] = vget_low_u8(_src); + src1.val[1] = vget_high_u8(_src); + + uint8x8_t dstLow = vtbl2_u8(src1, vget_low_u8(_shuffle)); + uint8x8_t dstHigh = vtbl2_u8(src1, vget_high_u8(_shuffle)); + + uint8x16_t _dst = vcombine_u8(dstLow, dstHigh); + + vst1q_u8(dst + index.dst, _dst); + + } + + void ResizeBilinearGray( + const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, + uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride) + { + assert(dstWidth >= A); + + size_t bufferWidth = AlignHi(dstWidth, A) * 2; + size_t blockCount = BlockCountMax(srcWidth, dstWidth); + size_t size = 2 * dstWidth; + size_t alignedSize = AlignHi(size, DA) - DA; + const size_t step = A; + + BufferG buffer(bufferWidth, blockCount, dstHeight); + + Base::EstimateAlphaIndex(srcHeight, dstHeight, buffer.iy, buffer.ay, 1); + + EstimateAlphaIndexX((int)srcWidth, (int)dstWidth, buffer.ix, buffer.ax, blockCount); + + ptrdiff_t previous = -2; + + uint16x8_t a[2]; + + for (size_t yDst = 0; yDst < dstHeight; yDst++, dst += dstStride) + { + a[0] = vdupq_n_u16(Base::FRACTION_RANGE - buffer.ay[yDst]); + a[1] = vdupq_n_u16(buffer.ay[yDst]); + 
+ ptrdiff_t sy = buffer.iy[yDst]; + int k = 0; + + if (sy == previous) + k = 2; + else if (sy == previous + 1) + { + Swap(buffer.bx[0], buffer.bx[1]); + k = 1; + } + + previous = sy; + + for (; k < 2; k++) + { + const uint8_t * psrc = src + (sy + k)*srcStride; + uint8_t * pdst = buffer.bx[k]; + for (size_t i = 0; i < blockCount; ++i) + LoadGray(psrc, buffer.ix[i], pdst); + + uint8_t * pbx = buffer.bx[k]; + for (size_t i = 0; i < bufferWidth; i += step) + InterpolateX<1>(buffer.ax + i, pbx + i); + } + + for (size_t ib = 0, id = 0; ib < alignedSize; ib += DA, id += A) + InterpolateY(buffer.bx[0] + ib, buffer.bx[1] + ib, a, dst + id); + size_t i = size - DA; + InterpolateY(buffer.bx[0] + i, buffer.bx[1] + i, a, dst + i / 2); + } + } + void ResizeBilinear( const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride, size_t channelCount) @@ -228,7 +390,10 @@ namespace Simd switch (channelCount) { case 1: - ResizeBilinear<1>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); + if (srcWidth >= A && srcWidth < 4 * dstWidth) + ResizeBilinearGray(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); + else + ResizeBilinear<1>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); break; case 2: ResizeBilinear<2>(src, srcWidth, srcHeight, srcStride, dst, dstWidth, dstHeight, dstStride); diff --git a/src/3rd/Simd/SimdNeonStatistic.cpp b/src/3rd/Simd/SimdNeonStatistic.cpp index ff38b6fc..466590bb 100644 --- a/src/3rd/Simd/SimdNeonStatistic.cpp +++ b/src/3rd/Simd/SimdNeonStatistic.cpp @@ -1,7 +1,8 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar, +* 2018-2018 Radchenko Andrey. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -554,6 +555,48 @@ namespace Simd SquareSum(src, stride, width, height, sum); } + template void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum) + { + assert(width >= A); + if (align) + assert(Aligned(src) && Aligned(stride)); + + size_t alignedWidth = Simd::AlignLo(width, A); + uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + alignedWidth); + uint64x2_t fullValueSum = K64_0000000000000000; + uint64x2_t fullSquareSum = K64_0000000000000000; + for (size_t row = 0; row < height; ++row) + { + uint32x4_t rowValueSum = K32_00000000; + uint32x4_t rowSquareSum = K32_00000000; + for (size_t col = 0; col < alignedWidth; col += A) + { + uint8x16_t _src = Load(src + col); + rowValueSum = vpadalq_u16(rowValueSum, vpaddlq_u8(_src)); + rowSquareSum = vaddq_u32(rowSquareSum, Square(_src)); + } + if (alignedWidth != width) + { + uint8x16_t _src = vandq_u8(Load(src + width - A), tailMask); + rowValueSum = vpadalq_u16(rowValueSum, vpaddlq_u8(_src)); + rowSquareSum = vaddq_u32(rowSquareSum, Square(_src)); + } + fullValueSum = vaddq_u64(fullValueSum, vpaddlq_u32(rowValueSum)); + fullSquareSum = vaddq_u64(fullSquareSum, vpaddlq_u32(rowSquareSum)); + src += stride; + } + *valueSum = ExtractSum64u(fullValueSum); + *squareSum = ExtractSum64u(fullSquareSum); + } + + void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum) + { + if (Aligned(src) && Aligned(stride)) + ValueSquareSum(src, stride, width, height, valueSum, squareSum); + else + ValueSquareSum(src, stride, width, height, valueSum, squareSum); + } + SIMD_INLINE uint32x4_t Correlation(const uint8x16_t & a, const uint8x16_t & b) { uint16x8_t lo = vmull_u8(Half<0>(a), Half<0>(b)); diff --git a/src/3rd/Simd/SimdNeural.hpp 
b/src/3rd/Simd/SimdNeural.hpp index e9c7ba04..3ac72df1 100644 --- a/src/3rd/Simd/SimdNeural.hpp +++ b/src/3rd/Simd/SimdNeural.hpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -1282,7 +1282,7 @@ namespace Simd */ class DropoutLayer : public Layer { - const size_t RANDOM_SIZE = 256; + static size_t SIMD_INLINE RandomSize() { return 256; } public: /*! \short Creates new DropoutLayer class. @@ -1344,7 +1344,7 @@ namespace Simd _specific.resize(number); if (train) { - _mask.resize(_src.Volume()*(1 + RANDOM_SIZE)); + _mask.resize(_src.Volume()*(1 + RandomSize())); for (size_t i = 0; i < _mask.size(); ++i) _mask[i] = Detail::RandomUniform(0.0f, 1.0f) <= _rate ? 1.0f : 0.0f; } @@ -1362,7 +1362,7 @@ namespace Simd const float * Mask() { - size_t start = Detail::RandomUniform(0, int(RANDOM_SIZE*_src.Volume())); + size_t start = Detail::RandomUniform(0, int(RandomSize()*_src.Volume())); return _mask.data() + start; } }; diff --git a/src/3rd/Simd/SimdParallel.hpp b/src/3rd/Simd/SimdParallel.hpp index 34464d2e..142c2a4a 100644 --- a/src/3rd/Simd/SimdParallel.hpp +++ b/src/3rd/Simd/SimdParallel.hpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -26,21 +26,21 @@ #include #include +#include namespace Simd { - template inline void Parallel(size_t begin, size_t end, const Function & function, size_t threadNumber, size_t blockStepMin = 1) + template inline void Parallel(size_t begin, size_t end, const Function & function, size_t threadNumber, size_t blockAlign = 1) { threadNumber = std::min(threadNumber, std::thread::hardware_concurrency()); - if (threadNumber <= 1) + if (threadNumber <= 1 || size_t(blockAlign*1.5) >= (end - begin)) function(0, begin, end); else { std::vector> futures; size_t blockSize = (end - begin + threadNumber - 1) / threadNumber; - if (blockStepMin > 1) - blockSize += blockSize%blockStepMin; + blockSize = (blockSize + blockAlign - 1) / blockAlign * blockAlign; size_t blockBegin = begin; size_t blockEnd = blockBegin + blockSize; diff --git a/src/3rd/Simd/SimdPow.h b/src/3rd/Simd/SimdPow.h new file mode 100644 index 00000000..6fcb4d04 --- /dev/null +++ b/src/3rd/Simd/SimdPow.h @@ -0,0 +1,205 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ +#ifndef __SimdPow_h__ +#define __SimdPow_h__ + +#include "Simd/SimdMath.h" + +namespace Simd +{ + namespace Base + { + SIMD_INLINE float Pow(float basis, float exponent) + { + return ::expf(::logf(basis)*exponent); + } + } + +#ifdef SIMD_SSE2_ENABLE + namespace Sse2 + { + class Pow + { + __m128i _exponent, _mantissa; + __m128 _one; + + SIMD_INLINE __m128 Poly5(__m128 x, float a, float b, float c, float d, float e, float f) const + { + __m128 p = _mm_set1_ps(f); + p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(e)); + p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(d)); + p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(c)); + p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(b)); + p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(a)); + return p; + } + + SIMD_INLINE __m128 Exp2(__m128 x) const + { + x = _mm_max_ps(_mm_min_ps(x, _mm_set1_ps(129.00000f)), _mm_set1_ps(-126.99999f)); + __m128i ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f))); + __m128 fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart)); + __m128 expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23)); + __m128 expfpart = Poly5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f); + return _mm_mul_ps(expipart, expfpart); + } + + SIMD_INLINE __m128 Log2(__m128 x) const + { + __m128i i = _mm_castps_si128(x); + __m128 e = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, _exponent), 23), _mm_set1_epi32(127))); + __m128 m = 
_mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, _mantissa)), _one); + __m128 p = Poly5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); + return _mm_add_ps(_mm_mul_ps(p, _mm_sub_ps(m, _one)), e); + } + + public: + + SIMD_INLINE Pow() + { + _exponent = _mm_set1_epi32(0x7F800000); + _mantissa = _mm_set1_epi32(0x007FFFFF); + _one = _mm_set1_ps(1.0f); + } + + SIMD_INLINE __m128 operator() (__m128 basis, __m128 exponent) const + { + return Exp2(_mm_mul_ps(Log2(basis), exponent)); + } + }; + } +#endif //SIMD_SSE2_ENABLE + +#ifdef SIMD_AVX2_ENABLE + namespace Avx2 + { + class Pow + { + __m256i _exponent, _mantissa; + __m256 _one; + + SIMD_INLINE __m256 Poly5(__m256 x, float a, float b, float c, float d, float e, float f) const + { + __m256 p = _mm256_set1_ps(f); + p = _mm256_fmadd_ps(x, p, _mm256_set1_ps(e)); + p = _mm256_fmadd_ps(x, p, _mm256_set1_ps(d)); + p = _mm256_fmadd_ps(x, p, _mm256_set1_ps(c)); + p = _mm256_fmadd_ps(x, p, _mm256_set1_ps(b)); + p = _mm256_fmadd_ps(x, p, _mm256_set1_ps(a)); + return p; + } + + SIMD_INLINE __m256 Exp2(__m256 x) const + { + x = _mm256_max_ps(_mm256_min_ps(x, _mm256_set1_ps(129.00000f)), _mm256_set1_ps(-126.99999f)); + __m256i ipart = _mm256_cvtps_epi32(_mm256_sub_ps(x, _mm256_set1_ps(0.5f))); + __m256 fpart = _mm256_sub_ps(x, _mm256_cvtepi32_ps(ipart)); + __m256 expipart = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_add_epi32(ipart, _mm256_set1_epi32(127)), 23)); + __m256 expfpart = Poly5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f); + return _mm256_mul_ps(expipart, expfpart); + } + + SIMD_INLINE __m256 Log2(__m256 x) const + { + __m256i i = _mm256_castps_si256(x); + __m256 e = _mm256_cvtepi32_ps(_mm256_sub_epi32(_mm256_srli_epi32(_mm256_and_si256(i, _exponent), 23), _mm256_set1_epi32(127))); + __m256 m = _mm256_or_ps(_mm256_castsi256_ps(_mm256_and_si256(i, _mantissa)), _one); + __m256 p = Poly5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 
3.1821337e-1f, -3.4436006e-2f); + return _mm256_fmadd_ps(p, _mm256_sub_ps(m, _one), e); + } + + public: + + SIMD_INLINE Pow() + { + _exponent = _mm256_set1_epi32(0x7F800000); + _mantissa = _mm256_set1_epi32(0x007FFFFF); + _one = _mm256_set1_ps(1.0f); + } + + SIMD_INLINE __m256 operator()(__m256 basis, __m256 exponent) const + { + return Exp2(_mm256_mul_ps(Log2(basis), exponent)); + } + }; + } +#endif //SIMD_AVX2_ENABLE + +#ifdef SIMD_AVX512F_ENABLE + namespace Avx512f + { + class Pow + { + __m512i _exponent, _mantissa; + __m512 _one; + + SIMD_INLINE __m512 Poly5(__m512 x, float a, float b, float c, float d, float e, float f) const + { + __m512 p = _mm512_set1_ps(f); + p = _mm512_fmadd_ps(x, p, _mm512_set1_ps(e)); + p = _mm512_fmadd_ps(x, p, _mm512_set1_ps(d)); + p = _mm512_fmadd_ps(x, p, _mm512_set1_ps(c)); + p = _mm512_fmadd_ps(x, p, _mm512_set1_ps(b)); + p = _mm512_fmadd_ps(x, p, _mm512_set1_ps(a)); + return p; + } + + SIMD_INLINE __m512 Exp2(__m512 x) const + { + x = _mm512_max_ps(_mm512_min_ps(x, _mm512_set1_ps(129.00000f)), _mm512_set1_ps(-126.99999f)); + __m512i ipart = _mm512_cvtps_epi32(_mm512_sub_ps(x, _mm512_set1_ps(0.5f))); + __m512 fpart = _mm512_sub_ps(x, _mm512_cvtepi32_ps(ipart)); + __m512 expipart = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_add_epi32(ipart, _mm512_set1_epi32(127)), 23)); + __m512 expfpart = Poly5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f); + return _mm512_mul_ps(expipart, expfpart); + } + + SIMD_INLINE __m512 Log2(__m512 x) const + { + __m512i i = _mm512_castps_si512(x); + __m512 e = _mm512_cvtepi32_ps(_mm512_sub_epi32(_mm512_srli_epi32(_mm512_and_si512(i, _exponent), 23), _mm512_set1_epi32(127))); + __m512 m = _mm512_or_ps(_mm512_castsi512_ps(_mm512_and_si512(i, _mantissa)), _one); + __m512 p = Poly5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); + return _mm512_fmadd_ps(p, _mm512_sub_ps(m, _one), e); + } + + public: + + SIMD_INLINE Pow() 
+ { + _exponent = _mm512_set1_epi32(0x7F800000); + _mantissa = _mm512_set1_epi32(0x007FFFFF); + _one = _mm512_set1_ps(1.0f); + } + + SIMD_INLINE __m512 operator()(__m512 basis, __m512 exponent) const + { + return Exp2(_mm512_mul_ps(Log2(basis), exponent)); + } + }; + } +#endif //SIMD_AVX512F_ENABLE +} + +#endif//__SimdPow_h__ diff --git a/src/3rd/Simd/SimdResizer.h b/src/3rd/Simd/SimdResizer.h new file mode 100644 index 00000000..3d709314 --- /dev/null +++ b/src/3rd/Simd/SimdResizer.h @@ -0,0 +1,139 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#ifndef __SimdResizer_h__ +#define __SimdResizer_h__ + +#include "Simd/SimdArray.h" + +namespace Simd +{ + class Resizer : Deletable + { + SimdResizeChannelType _type; + SimdResizeMethodType _method; + + public: + Resizer(SimdResizeChannelType type, SimdResizeMethodType method) + : _type(type) + , _method(method) + { + } + + SimdResizeChannelType Type() const { return _type; } + SimdResizeMethodType Method() const { return _method; } + + virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) const = 0; + }; + + namespace Base + { + class ResizerByteBilinear : Resizer + { + size_t _sx, _sy, _dx, _dy, _cn, _rs; + Array32i _ax, _ix, _ay, _iy; + public: + ResizerByteBilinear(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels); + + static void EstimateIndexAlpha(size_t srcSize, size_t dstSize, int32_t * indices, int32_t * alphas, size_t channels); + + virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) const; + }; + + class ResizerFloatBilinear : Resizer + { + protected: + size_t _sx, _sy, _dx, _dy, _cn, _rs; + Array32i _ix, _iy; + Array32f _ax, _ay; + + virtual void Run(const float * src, size_t srcStride, float * dst, size_t dstStride) const; + + public: + ResizerFloatBilinear(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, size_t align, bool caffeInterp); + + virtual void Run(const uint8_t * src, size_t srcStride, uint8_t * dst, size_t dstStride) const; + + static void EstimateIndexAlpha(size_t srcSize, size_t dstSize, int32_t * indices, float * alphas, size_t channels, bool caffeInterp); + }; + + void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method); + } + +#ifdef SIMD_SSE_ENABLE + namespace Sse + { + class ResizerFloatBilinear : Base::ResizerFloatBilinear + { + virtual void Run(const float * src, size_t srcStride, float * dst, size_t dstStride) const; + public: + 
ResizerFloatBilinear(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, bool caffeInterp); + }; + + void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method); + } +#endif //SIMD_SSE_ENABLE + +#ifdef SIMD_AVX_ENABLE + namespace Avx + { + class ResizerFloatBilinear : Base::ResizerFloatBilinear + { + virtual void Run(const float * src, size_t srcStride, float * dst, size_t dstStride) const; + public: + ResizerFloatBilinear(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, bool caffeInterp); + }; + + void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method); + } +#endif //SIMD_AVX_ENABLE + +#ifdef SIMD_AVX2_ENABLE + namespace Avx2 + { + class ResizerFloatBilinear : Base::ResizerFloatBilinear + { + virtual void Run(const float * src, size_t srcStride, float * dst, size_t dstStride) const; + public: + ResizerFloatBilinear(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, bool caffeInterp); + }; + + void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method); + } +#endif //SIMD_AVX2_ENABLE + +#ifdef SIMD_AVX512F_ENABLE + namespace Avx512f + { + class ResizerFloatBilinear : Base::ResizerFloatBilinear + { + virtual void Run(const float * src, size_t srcStride, float * dst, size_t dstStride) const; + public: + ResizerFloatBilinear(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, bool caffeInterp); + }; + + void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method); + } +#endif //SIMD_AVX512F_ENABLE +} +#endif//__SimdResizer_h__ diff --git a/src/3rd/Simd/SimdSse1.h b/src/3rd/Simd/SimdSse1.h index d216df1a..3edf8be5 100644 --- a/src/3rd/Simd/SimdSse1.h +++ 
b/src/3rd/Simd/SimdSse1.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -31,6 +31,10 @@ namespace Simd #ifdef SIMD_SSE_ENABLE namespace Sse { + void CosineDistance32f(const float * a, const float * b, size_t size, float * distance); + + void Gemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc); + void HogDeinterleave(const float * src, size_t srcStride, size_t width, size_t height, size_t count, float ** dst, size_t dstStride); void HogFilterSeparable(const float * src, size_t srcStride, size_t width, size_t height, const float * rowFilter, size_t rowSize, const float * colFilter, size_t colSize, float * dst, size_t dstStride, int add); @@ -96,6 +100,12 @@ namespace Simd void SquaredDifferenceKahanSum32f(const float * a, const float * b, size_t size, float * sum); void SvmSumLinear(const float * x, const float * svs, const float * weights, size_t length, size_t count, float * sum); + + void SynetAddBias(const float * bias, size_t count, size_t size, float * dst); + + void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst); + + void SynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst); } #endif// SIMD_SSE_ENABLE } diff --git a/src/3rd/Simd/SimdSse1Float32.cpp b/src/3rd/Simd/SimdSse1Float32.cpp new file mode 100644 index 00000000..f2c3370a --- /dev/null +++ b/src/3rd/Simd/SimdSse1Float32.cpp @@ -0,0 +1,92 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdStore.h" +#include "Simd/SimdExtract.h" + +namespace Simd +{ +#ifdef SIMD_SSE_ENABLE + namespace Sse + { + template void CosineDistance32f(const float * a, const float * b, size_t size, float * distance) + { + if (align) + assert(Aligned(a) && Aligned(b)); + + size_t partialAlignedSize = AlignLo(size, F); + size_t fullAlignedSize = AlignLo(size, DF); + size_t i = 0; + __m128 _aa[2] = { _mm_setzero_ps(), _mm_setzero_ps() }; + __m128 _ab[2] = { _mm_setzero_ps(), _mm_setzero_ps() }; + __m128 _bb[2] = { _mm_setzero_ps(), _mm_setzero_ps() }; + if (fullAlignedSize) + { + for (; i < fullAlignedSize; i += DF) + { + __m128 a0 = Load(a + i + 0 * F); + __m128 b0 = Load(b + i + 0 * F); + _aa[0] = _mm_add_ps(_aa[0], _mm_mul_ps(a0, a0)); + _ab[0] = _mm_add_ps(_ab[0], _mm_mul_ps(a0, b0)); + _bb[0] = _mm_add_ps(_bb[0], _mm_mul_ps(b0, b0)); + __m128 a1 = Load(a + i + 1 * F); + __m128 b1 = Load(b + i + 1 * F); + _aa[1] = _mm_add_ps(_aa[1], _mm_mul_ps(a1, a1)); + _ab[1] = _mm_add_ps(_ab[1], _mm_mul_ps(a1, b1)); + _bb[1] = _mm_add_ps(_bb[1], _mm_mul_ps(b1, b1)); + } + _aa[0] = _mm_add_ps(_aa[0], _aa[1]); + _ab[0] = _mm_add_ps(_ab[0], _ab[1]); + _bb[0] = _mm_add_ps(_bb[0], _bb[1]); + } + for (; i < partialAlignedSize; i += F) + { + __m128 a0 = Load(a + i); + __m128 b0 = Load(b + i); + _aa[0] = _mm_add_ps(_aa[0], _mm_mul_ps(a0, a0)); + _ab[0] = _mm_add_ps(_ab[0], _mm_mul_ps(a0, b0)); + _bb[0] = _mm_add_ps(_bb[0], _mm_mul_ps(b0, b0)); + } + float aa = ExtractSum(_aa[0]), ab = ExtractSum(_ab[0]), bb = ExtractSum(_bb[0]); + for (; i < size; ++i) + { + float _a = a[i]; + float _b = b[i]; + aa += _a * _a; + ab += _a * _b; + bb += _b * _b; + } + *distance = 1.0f - ab / ::sqrt(aa*bb); + } + + void CosineDistance32f(const float * a, const float * b, size_t size, float * distance) + { + if (Aligned(a) && Aligned(b)) + CosineDistance32f(a, b, size, distance); + else + CosineDistance32f(a, b, size, distance); + } + } +#endif// SIMD_SSE_ENABLE 
+} diff --git a/src/3rd/Simd/SimdSse1Gemm32f.cpp b/src/3rd/Simd/SimdSse1Gemm32f.cpp new file mode 100644 index 00000000..7d480aff --- /dev/null +++ b/src/3rd/Simd/SimdSse1Gemm32f.cpp @@ -0,0 +1,595 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdStore.h" +#include "Simd/SimdGemm.h" + +namespace Simd +{ +#ifdef SIMD_SSE_ENABLE + namespace Sse + { + SIMD_INLINE void AddProduct(float * ptr, __m128 value, __m128 alpha) + { + _mm_storeu_ps(ptr, _mm_add_ps(_mm_mul_ps(value, alpha), _mm_loadu_ps(ptr))); + } + + SIMD_INLINE void AddProduct(float * ptr, __m128 value, __m128 alpha, size_t tail) + { + if (tail == F) + AddProduct(ptr, value, alpha); + else + { + float tmp[F]; + _mm_storeu_ps(tmp, _mm_add_ps(_mm_mul_ps(value, alpha), _mm_loadu_ps(ptr))); + for (size_t i = 0; i < tail; ++i) + ptr[i] = tmp[i]; + } + } + + static void Kernel4x12(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m128 c00 = _mm_setzero_ps(); + __m128 c10 = _mm_setzero_ps(); + __m128 c20 = _mm_setzero_ps(); + __m128 c30 = _mm_setzero_ps(); + __m128 c01 = _mm_setzero_ps(); + __m128 c11 = _mm_setzero_ps(); + __m128 c21 = _mm_setzero_ps(); + __m128 c31 = _mm_setzero_ps(); + __m128 c02 = _mm_setzero_ps(); + __m128 c12 = _mm_setzero_ps(); + __m128 c22 = _mm_setzero_ps(); + __m128 c32 = _mm_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + __m128 b0, b1, b2, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm_loadu_ps(B + 0 * F); + b1 = _mm_loadu_ps(B + 1 * F); + b2 = _mm_loadu_ps(B + 2 * F); + a0 = _mm_set1_ps(*A0++); + c00 = _mm_add_ps(_mm_mul_ps(a0, b0), c00); + c01 = _mm_add_ps(_mm_mul_ps(a0, b1), c01); + c02 = _mm_add_ps(_mm_mul_ps(a0, b2), c02); + a0 = _mm_set1_ps(*A1++); + c10 = _mm_add_ps(_mm_mul_ps(a0, b0), c10); + c11 = _mm_add_ps(_mm_mul_ps(a0, b1), c11); + c12 = _mm_add_ps(_mm_mul_ps(a0, b2), c12); + a0 = _mm_set1_ps(*A2++); + c20 = _mm_add_ps(_mm_mul_ps(a0, b0), c20); + c21 = _mm_add_ps(_mm_mul_ps(a0, b1), c21); + c22 = _mm_add_ps(_mm_mul_ps(a0, b2), c22); + a0 = _mm_set1_ps(*A3++); + c30 = _mm_add_ps(_mm_mul_ps(a0, b0), c30); + c31 = 
_mm_add_ps(_mm_mul_ps(a0, b1), c31); + c32 = _mm_add_ps(_mm_mul_ps(a0, b2), c32); + B += ldb; + } + __m128 _alpha = _mm_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00); + AddProduct(C + 1 * F, _alpha, c01); + AddProduct(C + 2 * F, _alpha, c02, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10); + AddProduct(C + 1 * F, _alpha, c11); + AddProduct(C + 2 * F, _alpha, c12, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20); + AddProduct(C + 1 * F, _alpha, c21); + AddProduct(C + 2 * F, _alpha, c22, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30); + AddProduct(C + 1 * F, _alpha, c31); + AddProduct(C + 2 * F, _alpha, c32, tail); + } + + static void Kernel4x8(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m128 c00 = _mm_setzero_ps(); + __m128 c10 = _mm_setzero_ps(); + __m128 c20 = _mm_setzero_ps(); + __m128 c30 = _mm_setzero_ps(); + __m128 c01 = _mm_setzero_ps(); + __m128 c11 = _mm_setzero_ps(); + __m128 c21 = _mm_setzero_ps(); + __m128 c31 = _mm_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + __m128 b0, b1, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm_loadu_ps(B + 0 * F); + b1 = _mm_loadu_ps(B + 1 * F); + a0 = _mm_set1_ps(*A0++); + c00 = _mm_add_ps(_mm_mul_ps(a0, b0), c00); + c01 = _mm_add_ps(_mm_mul_ps(a0, b1), c01); + a0 = _mm_set1_ps(*A1++); + c10 = _mm_add_ps(_mm_mul_ps(a0, b0), c10); + c11 = _mm_add_ps(_mm_mul_ps(a0, b1), c11); + a0 = _mm_set1_ps(*A2++); + c20 = _mm_add_ps(_mm_mul_ps(a0, b0), c20); + c21 = _mm_add_ps(_mm_mul_ps(a0, b1), c21); + a0 = _mm_set1_ps(*A3++); + c30 = _mm_add_ps(_mm_mul_ps(a0, b0), c30); + c31 = _mm_add_ps(_mm_mul_ps(a0, b1), c31); + B += ldb; + } + __m128 _alpha = _mm_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00); + AddProduct(C + 1 * F, _alpha, c01, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10); + AddProduct(C + 1 * 
F, _alpha, c11, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20); + AddProduct(C + 1 * F, _alpha, c21, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30); + AddProduct(C + 1 * F, _alpha, c31, tail); + } + + static void Kernel4x4(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m128 c0 = _mm_setzero_ps(); + __m128 c1 = _mm_setzero_ps(); + __m128 c2 = _mm_setzero_ps(); + __m128 c3 = _mm_setzero_ps(); + const float * a0 = A + lda * 0; + const float * a1 = A + lda * 1; + const float * a2 = A + lda * 2; + const float * a3 = A + lda * 3; + __m128 b0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm_loadu_ps(B); + c0 = _mm_add_ps(_mm_mul_ps(b0, _mm_set1_ps(*a0++)), c0); + c1 = _mm_add_ps(_mm_mul_ps(b0, _mm_set1_ps(*a1++)), c1); + c2 = _mm_add_ps(_mm_mul_ps(b0, _mm_set1_ps(*a2++)), c2); + c3 = _mm_add_ps(_mm_mul_ps(b0, _mm_set1_ps(*a3++)), c3); + B += ldb; + } + __m128 _alpha = _mm_set1_ps(alpha); + AddProduct(C + 0 * ldc, _alpha, c0, tail); + AddProduct(C + 1 * ldc, _alpha, c1, tail); + AddProduct(C + 2 * ldc, _alpha, c2, tail); + AddProduct(C + 3 * ldc, _alpha, c3, tail); + } + + static void Kernel6x8(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m128 c00 = _mm_setzero_ps(); + __m128 c10 = _mm_setzero_ps(); + __m128 c20 = _mm_setzero_ps(); + __m128 c30 = _mm_setzero_ps(); + __m128 c40 = _mm_setzero_ps(); + __m128 c50 = _mm_setzero_ps(); + __m128 c01 = _mm_setzero_ps(); + __m128 c11 = _mm_setzero_ps(); + __m128 c21 = _mm_setzero_ps(); + __m128 c31 = _mm_setzero_ps(); + __m128 c41 = _mm_setzero_ps(); + __m128 c51 = _mm_setzero_ps(); + const float * A0 = A + lda * 0; + const float * A1 = A + lda * 1; + const float * A2 = A + lda * 2; + const float * A3 = A + lda * 3; + const float * A4 = A + lda * 4; + const float * A5 = A + lda * 5; + __m128 b0, b1, a0; + for (size_t k = 0; k < K; k++) + { + b0 = 
_mm_loadu_ps(B + 0 * F); + b1 = _mm_loadu_ps(B + 1 * F); + a0 = _mm_set1_ps(*A0++); + c00 = _mm_add_ps(_mm_mul_ps(a0, b0), c00); + c01 = _mm_add_ps(_mm_mul_ps(a0, b1), c01); + a0 = _mm_set1_ps(*A1++); + c10 = _mm_add_ps(_mm_mul_ps(a0, b0), c10); + c11 = _mm_add_ps(_mm_mul_ps(a0, b1), c11); + a0 = _mm_set1_ps(*A2++); + c20 = _mm_add_ps(_mm_mul_ps(a0, b0), c20); + c21 = _mm_add_ps(_mm_mul_ps(a0, b1), c21); + a0 = _mm_set1_ps(*A3++); + c30 = _mm_add_ps(_mm_mul_ps(a0, b0), c30); + c31 = _mm_add_ps(_mm_mul_ps(a0, b1), c31); + a0 = _mm_set1_ps(*A4++); + c40 = _mm_add_ps(_mm_mul_ps(a0, b0), c40); + c41 = _mm_add_ps(_mm_mul_ps(a0, b1), c41); + a0 = _mm_set1_ps(*A5++); + c50 = _mm_add_ps(_mm_mul_ps(a0, b0), c50); + c51 = _mm_add_ps(_mm_mul_ps(a0, b1), c51); + B += ldb; + } + __m128 _alpha = _mm_set1_ps(alpha); + AddProduct(C + 0 * F, _alpha, c00); + AddProduct(C + 1 * F, _alpha, c01, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c10); + AddProduct(C + 1 * F, _alpha, c11, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c20); + AddProduct(C + 1 * F, _alpha, c21, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c30); + AddProduct(C + 1 * F, _alpha, c31, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c40); + AddProduct(C + 1 * F, _alpha, c41, tail); + C += ldc; + AddProduct(C + 0 * F, _alpha, c50); + AddProduct(C + 1 * F, _alpha, c51, tail); + } + + static void Kernel6x4(size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m128 c0 = _mm_setzero_ps(); + __m128 c1 = _mm_setzero_ps(); + __m128 c2 = _mm_setzero_ps(); + __m128 c3 = _mm_setzero_ps(); + __m128 c4 = _mm_setzero_ps(); + __m128 c5 = _mm_setzero_ps(); + const float * a0 = A + lda * 0; + const float * a1 = A + lda * 1; + const float * a2 = A + lda * 2; + const float * a3 = A + lda * 3; + const float * a4 = A + lda * 4; + const float * a5 = A + lda * 5; + __m128 b0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm_loadu_ps(B); + c0 = 
_mm_add_ps(_mm_mul_ps(b0, _mm_set1_ps(*a0++)), c0); + c1 = _mm_add_ps(_mm_mul_ps(b0, _mm_set1_ps(*a1++)), c1); + c2 = _mm_add_ps(_mm_mul_ps(b0, _mm_set1_ps(*a2++)), c2); + c3 = _mm_add_ps(_mm_mul_ps(b0, _mm_set1_ps(*a3++)), c3); + c4 = _mm_add_ps(_mm_mul_ps(b0, _mm_set1_ps(*a4++)), c4); + c5 = _mm_add_ps(_mm_mul_ps(b0, _mm_set1_ps(*a5++)), c5); + B += ldb; + } + __m128 _alpha = _mm_set1_ps(alpha); + AddProduct(C + 0 * ldc, _alpha, c0, tail); + AddProduct(C + 1 * ldc, _alpha, c1, tail); + AddProduct(C + 2 * ldc, _alpha, c2, tail); + AddProduct(C + 3 * ldc, _alpha, c3, tail); + AddProduct(C + 4 * ldc, _alpha, c4, tail); + AddProduct(C + 5 * ldc, _alpha, c5, tail); + } + + static void KernelMx12(size_t M, size_t N, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m128 c[4][3]; + const float * a[4]; + for (size_t i = 0; i < M; ++i) + { + c[i][0] = _mm_setzero_ps(); + c[i][1] = _mm_setzero_ps(); + c[i][2] = _mm_setzero_ps(); + a[i] = A + lda * i; + } + __m128 b0, b1, b2, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm_loadu_ps(B + 0 * F); + b1 = _mm_loadu_ps(B + 1 * F); + b2 = _mm_loadu_ps(B + 2 * F); + for (size_t i = 0; i < M; ++i) + { + a0 = _mm_set1_ps(*a[i]++); + c[i][0] = _mm_add_ps(_mm_mul_ps(b0, a0), c[i][0]); + c[i][1] = _mm_add_ps(_mm_mul_ps(b1, a0), c[i][1]); + c[i][2] = _mm_add_ps(_mm_mul_ps(b2, a0), c[i][2]); + } + B += ldb; + } + __m128 _alpha = _mm_set1_ps(alpha); + for (size_t i = 0; i < M; ++i) + { + AddProduct(C + 0 * F, _alpha, c[i][0]); + AddProduct(C + 1 * F, _alpha, c[i][1]); + AddProduct(C + 2 * F, _alpha, c[i][2], tail); + C += ldc; + } + } + + static void KernelMx8(size_t M, size_t N, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { + __m128 c[6][2]; + const float * a[6]; + for (size_t i = 0; i < M; ++i) + { + c[i][0] = _mm_setzero_ps(); + c[i][1] = _mm_setzero_ps(); + a[i] = A + lda * i; + } + 
__m128 b0, b1, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm_loadu_ps(B + 0 * F); + b1 = _mm_loadu_ps(B + 1 * F); + for (size_t i = 0; i < M; ++i) + { + a0 = _mm_set1_ps(*a[i]++); + c[i][0] = _mm_add_ps(_mm_mul_ps(b0, a0), c[i][0]); + c[i][1] = _mm_add_ps(_mm_mul_ps(b1, a0), c[i][1]); + } + B += ldb; + } + __m128 _alpha = _mm_set1_ps(alpha); + for (size_t i = 0; i < M; ++i) + { + AddProduct(C + 0 * F, _alpha, c[i][0]); + AddProduct(C + 1 * F, _alpha, c[i][1], tail); + C += ldc; + } + } + + static void KernelMx4(size_t M, size_t N, size_t K, float alpha, const float * A, size_t lda, const float * B, size_t ldb, float * C, size_t ldc, size_t tail) + { +#ifdef SIMD_X64_ENABLE + __m128 c[6]; + const float * a[6]; +#else + __m128 c[4]; + const float * a[4]; +#endif + for (size_t i = 0; i < M; ++i) + { + c[i] = _mm_setzero_ps(); + a[i] = A + lda * i; + } + __m128 b0, a0; + for (size_t k = 0; k < K; k++) + { + b0 = _mm_loadu_ps(B + 0 * F); + for (size_t i = 0; i < M; ++i) + { + a0 = _mm_set1_ps(*a[i]++); + c[i] = _mm_add_ps(_mm_mul_ps(b0, a0), c[i]); + } + B += ldb; + } + __m128 _alpha = _mm_set1_ps(alpha); + for (size_t i = 0; i < M; ++i) + AddProduct(C + i * ldc, _alpha, c[i], tail); + } + + SIMD_INLINE void ScaleC(float * C, __m128 beta) + { + _mm_storeu_ps(C, _mm_mul_ps(_mm_loadu_ps(C), beta)); + } + + void ScaleC(size_t M, size_t N, float beta, float * C, size_t ldc) + { + if (beta == 1.0f) + return; + else if (beta == 0.0f) + { + for (size_t i = 0; i < M; ++i) + memset(C + i * ldc, 0, N * sizeof(float)); + } + else + { + size_t NQF = AlignLo(N, QF); + size_t NF = AlignLo(N, F); + __m128 _beta = _mm_set1_ps(beta); + for (size_t i = 0; i < M; ++i) + { + size_t j = 0; + for (; j < NQF; j += QF) + { + ScaleC(C + j + F * 0, _beta); + ScaleC(C + j + F * 1, _beta); + ScaleC(C + j + F * 2, _beta); + ScaleC(C + j + F * 3, _beta); + } + for (; j < NF; j += F) + ScaleC(C + j, _beta); + for (; j < N; ++j) + C[j] *= beta; + C += ldc; + } + } + } + + static void PackA(const 
float * src, size_t stride, size_t M, size_t K, size_t cell, float * dst) + { + for (size_t i = 0; i < M; i += cell) + { + size_t m = Simd::Min(cell, M - i), k = 0; + if (cell == 4 && m == 4) + { + size_t K4 = AlignLo(K, 4); + for (; k < K4; k += 4) + { + const float * ps = src + k; + __m128 s0 = _mm_loadu_ps(ps + 0 * stride); + __m128 s1 = _mm_loadu_ps(ps + 1 * stride); + __m128 s2 = _mm_loadu_ps(ps + 2 * stride); + __m128 s3 = _mm_loadu_ps(ps + 3 * stride); + __m128 s00 = _mm_unpacklo_ps(s0, s2); + __m128 s01 = _mm_unpacklo_ps(s1, s3); + __m128 s10 = _mm_unpackhi_ps(s0, s2); + __m128 s11 = _mm_unpackhi_ps(s1, s3); + _mm_storeu_ps(dst + 0, _mm_unpacklo_ps(s00, s01)); + _mm_storeu_ps(dst + 4, _mm_unpackhi_ps(s00, s01)); + _mm_storeu_ps(dst + 8, _mm_unpacklo_ps(s10, s11)); + _mm_storeu_ps(dst + 12, _mm_unpackhi_ps(s10, s11)); + dst += 16; + } + } + for (; k < K; ++k) + { + for (size_t c = 0; c < m; ++c) + *(dst++) = src[c*stride + k]; + } + src += cell * stride; + } + } + + static void PackB(const float * B, size_t ldb, size_t K, size_t N, size_t microN, float * pB) + { + for (size_t j = 0; j < N; j += microN) + { + size_t n = Simd::Min(microN, N - j); + size_t k = 0; + if (microN == 1 * F) + { + if (n == microN) + { + for (; k < K; ++k) + { + const float * b = B + k * ldb; + _mm_storeu_ps(pB + 0 * F, _mm_loadu_ps(b + 0 * F)); + pB += microN; + } + } + else + { + __m128 mask0 = Sse::LeftNotZero(n - 0 * F); + for (; k < K - 1; ++k) + { + const float * b = B + k * ldb; + _mm_storeu_ps(pB + 0 * F, _mm_and_ps(mask0, _mm_loadu_ps(b + 0 * F))); + pB += microN; + } + } + } + else if (microN == 2 * F) + { + if (n == microN) + { + for (; k < K; ++k) + { + const float * b = B + k * ldb; + _mm_storeu_ps(pB + 0 * F, _mm_loadu_ps(b + 0 * F)); + _mm_storeu_ps(pB + 1 * F, _mm_loadu_ps(b + 1 * F)); + pB += microN; + } + } + else + { + __m128 mask0 = Sse::LeftNotZero(n - 0 * F); + __m128 mask1 = Sse::LeftNotZero(n - 1 * F); + for (; k < K - 1; ++k) + { + const float * b = B + k * 
ldb; + _mm_storeu_ps(pB + 0 * F, _mm_and_ps(mask0, _mm_loadu_ps(b + 0 * F))); + _mm_storeu_ps(pB + 1 * F, _mm_and_ps(mask1, _mm_loadu_ps(b + 1 * F))); + pB += microN; + } + } + } + else if (microN == 3 * F) + { + if (n == microN) + { + for (; k < K; ++k) + { + const float * b = B + k * ldb; + _mm_storeu_ps(pB + 0 * F, _mm_loadu_ps(b + 0 * F)); + _mm_storeu_ps(pB + 1 * F, _mm_loadu_ps(b + 1 * F)); + _mm_storeu_ps(pB + 2 * F, _mm_loadu_ps(b + 2 * F)); + pB += microN; + } + } + else + { + __m128 mask0 = Sse::LeftNotZero(n - 0 * F); + __m128 mask1 = Sse::LeftNotZero(n - 1 * F); + __m128 mask2 = Sse::LeftNotZero(n - 2 * F); + for (; k < K - 1; ++k) + { + const float * b = B + k * ldb; + _mm_storeu_ps(pB + 0 * F, _mm_and_ps(mask0, _mm_loadu_ps(b + 0 * F))); + _mm_storeu_ps(pB + 1 * F, _mm_and_ps(mask1, _mm_loadu_ps(b + 1 * F))); + _mm_storeu_ps(pB + 2 * F, _mm_and_ps(mask2, _mm_loadu_ps(b + 2 * F))); + pB += microN; + } + } + } + for (; k < K; ++k) + { + const float * b = B + k * ldb; + size_t c = 0; + for (; c < n; ++c) + *(pB++) = *(b++); + for (; c < microN; ++c) + *(pB++) = 0; + } + B += microN; + } + } + + void Gemm32fNN(size_t M, size_t N, size_t K, const float * alpha, const float * A, size_t lda, const float * B, size_t ldb, const float * beta, float * C, size_t ldc) + { + const size_t CACHE_L1_SIZE = 32 * 1024; + const size_t CACHE_L2_SIZE = 256 * 1024; + const size_t CACHE_L3_SIZE = 2 * 1024 * 1024; + typedef Simd::GemmNN GemmNN; + GemmNN::Main kernelMM, kernelMT; + GemmNN::Tail kernelTM, kernelTT; + size_t microM, microN, L1, L2; +#ifdef SIMD_X64_ENABLE + if (K > 4024) + { + microM = 6; + microN = 8; + size_t tail = N - AlignLoAny(N, microN); + kernelMM = Kernel6x8; + kernelMT = tail > F ? Kernel6x8 : Kernel6x4; + kernelTM = KernelMx8; + kernelTT = tail > F ? KernelMx8 : KernelMx4; + } + else + { + microM = 4; + microN = 12; + size_t tail = N - AlignLoAny(N, microN); + kernelMM = Kernel4x12; + kernelMT = tail > DF ? Kernel4x12 : (tail > F ? 
Kernel4x8 : Kernel4x4); + kernelTM = KernelMx12; + kernelTT = tail > DF ? KernelMx12 : (tail > F ? KernelMx8 : KernelMx4); + } +#else + microM = 4; + microN = 4; + kernelMM = Kernel4x4; + kernelMT = Kernel4x4; + kernelTM = KernelMx4; + kernelTT = KernelMx4; +#endif + L1 = N > 4024 ? CACHE_L2_SIZE : CACHE_L1_SIZE; + L2 = N > 4024 ? CACHE_L3_SIZE : CACHE_L2_SIZE; + GemmNN gemmNN(M, N, K, microM, microN, L1, L2, CACHE_L3_SIZE, F, + kernelMM, kernelMT, kernelTM, kernelTT, ScaleC, PackB, NULL); + gemmNN.Run(alpha, A, lda, B, ldb, beta, C, ldc); + } + } +#endif// SIMD_SSE_ENABLE +} diff --git a/src/3rd/Simd/SimdSse1Resizer.cpp b/src/3rd/Simd/SimdSse1Resizer.cpp new file mode 100644 index 00000000..c93f57ad --- /dev/null +++ b/src/3rd/Simd/SimdSse1Resizer.cpp @@ -0,0 +1,118 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
*/
#include "Simd/SimdMemory.h"
#include "Simd/SimdResizer.h"
#include "Simd/SimdStore.h"

namespace Simd
{
#ifdef SIMD_SSE_ENABLE
    namespace Sse
    {
        // SSE specialization of the bilinear float resizer.
        // The base-class constructor receives sizeof(__m128) as the row-buffer
        // alignment so the aligned _mm_load_ps/_mm_store_ps below are valid.
        ResizerFloatBilinear::ResizerFloatBilinear(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, bool caffeInterp)
            : Base::ResizerFloatBilinear(srcX, srcY, dstX, dstY, channels, sizeof(__m128), caffeInterp)
        {
        }

        // Resizes one float image plane (or interleaved channels) row by row:
        // horizontally interpolates up to two source rows into bx[] buffers,
        // then blends them vertically into dst. Buffers are reused between
        // consecutive dst rows that map to the same / adjacent source rows.
        void ResizerFloatBilinear::Run(const float * src, size_t srcStride, float * dst, size_t dstStride) const
        {
            Array32f bx[2];
            bx[0].Resize(_rs);
            bx[1].Resize(_rs);
            float * pbx[2] = { bx[0].data, bx[1].data };
            int32_t prev = -2;   // -2 guarantees no reuse on the first row
            size_t rsa = AlignLo(_rs, Sse::F);
            for (size_t dy = 0; dy < _dy; dy++, dst += dstStride)
            {
                float fy1 = _ay[dy];
                float fy0 = 1.0f - fy1;
                int32_t sy = _iy[dy];
                int32_t k = 0;

                if (sy == prev)
                    k = 2;                   // both buffered rows still valid
                else if (sy == prev + 1)
                {
                    Swap(pbx[0], pbx[1]);    // reuse old second row as new first row
                    k = 1;
                }

                prev = sy;

                // (Re)compute the horizontally interpolated source rows we lack.
                for (; k < 2; k++)
                {
                    float * pb = pbx[k];
                    const float * ps = src + (sy + k)*srcStride;
                    size_t dx = 0;
                    if (_cn == 1)
                    {
                        __m128 _1 = _mm_set1_ps(1.0f);
                        for (; dx < rsa; dx += Sse::F)
                        {
                            // Gather 4 (left,right) pixel pairs, then blend
                            // lefts (0x88) and rights (0xDD) with the x weights.
                            __m128 s01 = Sse::Load(ps + _ix[dx + 0], ps + _ix[dx + 1]);
                            __m128 s23 = Sse::Load(ps + _ix[dx + 2], ps + _ix[dx + 3]);
                            __m128 fx1 = _mm_load_ps(_ax.data + dx);
                            __m128 fx0 = _mm_sub_ps(_1, fx1);
                            __m128 m0 = _mm_mul_ps(fx0, _mm_shuffle_ps(s01, s23, 0x88));
                            __m128 m1 = _mm_mul_ps(fx1, _mm_shuffle_ps(s01, s23, 0xDD));
                            _mm_store_ps(pb + dx, _mm_add_ps(m0, m1));
                        }
                    }
                    for (; dx < _rs; dx++)
                    {
                        int32_t sx = _ix[dx];
                        float fx = _ax[dx];
                        pb[dx] = ps[sx] * (1.0f - fx) + ps[sx + _cn] * fx;
                    }
                }

                // Vertical blend of the two buffered rows into the output row.
                size_t dx = 0;
                __m128 _fy0 = _mm_set1_ps(fy0);
                __m128 _fy1 = _mm_set1_ps(fy1);
                for (; dx < rsa; dx += Sse::F)
                {
                    __m128 m0 = _mm_mul_ps(_mm_load_ps(pbx[0] + dx), _fy0);
                    __m128 m1 = _mm_mul_ps(_mm_load_ps(pbx[1] + dx), _fy1);
                    _mm_storeu_ps(dst + dx, _mm_add_ps(m0, m1));
                }
                for (; dx < _rs; dx++)
                    dst[dx] = pbx[0][dx] * fy0 + pbx[1][dx] * fy1;
            }
        }

        //---------------------------------------------------------------------

        // Factory: returns the SSE resizer for float bilinear / Caffe-interp
        // requests, otherwise falls back to the scalar Base implementation.
        void * ResizerInit(size_t srcX, size_t srcY, size_t dstX, size_t dstY, size_t channels, SimdResizeChannelType type, SimdResizeMethodType method)
        {
            if (type == SimdResizeChannelFloat && method == SimdResizeMethodBilinear)
                return new ResizerFloatBilinear(srcX, srcY, dstX, dstY, channels, false);
            else if (type == SimdResizeChannelFloat && method == SimdResizeMethodCaffeInterp)
                return new ResizerFloatBilinear(srcX, srcY, dstX, dstY, channels, true);
            else
                return Base::ResizerInit(srcX, srcY, dstX, dstY, channels, type, method);
        }
    }
#endif //SIMD_SSE_ENABLE
}

diff --git a/src/3rd/Simd/SimdSse1Synet.cpp b/src/3rd/Simd/SimdSse1Synet.cpp
new file mode 100644
index 00000000..3a8efdc8
--- /dev/null
+++ b/src/3rd/Simd/SimdSse1Synet.cpp
@@ -0,0 +1,325 @@
/*
* Simd Library (http://ermig1979.github.io/Simd).
*
* Copyright (c) 2011-2018 Yermalayeu Ihar.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdStore.h" +#include "Simd/SimdExtract.h" + +namespace Simd +{ +#ifdef SIMD_SSE_ENABLE + namespace Sse + { + template SIMD_INLINE void SynetAddBias(const __m128 & bias, float * dst) + { + Store(dst, _mm_add_ps(Load(dst), bias)); + } + + template SIMD_INLINE void SynetAddBias(const float * bias, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t partial = AlignLo(size, F); + for (size_t i = 0; i < count; ++i) + { + size_t j = 0; + if (partial) + { + __m128 _bias = _mm_set1_ps(bias[i]); + for (; j < aligned; j += QF) + { + SynetAddBias(_bias, dst + j + F * 0); + SynetAddBias(_bias, dst + j + F * 1); + SynetAddBias(_bias, dst + j + F * 2); + SynetAddBias(_bias, dst + j + F * 3); + } + for (; j < partial; j += F) + SynetAddBias(_bias, dst + j); + } + for (; j < size; ++j) + dst[j] += bias[i]; + dst += size; + } + } + + void SynetAddBias(const float * bias, size_t count, size_t size, float * dst) + { + if (Aligned(dst) && Aligned(size)) + SynetAddBias(bias, count, size, dst); + else + SynetAddBias(bias, count, size, dst); + } + + template void SynetEltwiseLayerForwardProduct(const float * src0, const float * src1, float * dst, size_t offset) + { + Store(dst + offset, _mm_mul_ps(Load(src0 + offset), Load(src1 + offset))); + } + + template void SynetEltwiseLayerForwardProduct(float const * const * src, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t partial = AlignLo(size, F); + const float * src0 = src[0]; + const float * src1 = src[1]; + size_t j = 0; + if (partial) + { + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardProduct(src0, src1, dst, j + F * 0); + 
SynetEltwiseLayerForwardProduct(src0, src1, dst, j + F * 1); + SynetEltwiseLayerForwardProduct(src0, src1, dst, j + F * 2); + SynetEltwiseLayerForwardProduct(src0, src1, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardProduct(src0, src1, dst, j); + } + for (; j < size; ++j) + dst[j] = src0[j] * src1[j]; + for (size_t i = 2; i < count; ++i) + { + const float * srci = src[i]; + size_t j = 0; + if (partial) + { + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardProduct(dst, srci, dst, j + F * 0); + SynetEltwiseLayerForwardProduct(dst, srci, dst, j + F * 1); + SynetEltwiseLayerForwardProduct(dst, srci, dst, j + F * 2); + SynetEltwiseLayerForwardProduct(dst, srci, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardProduct(dst, srci, dst, j); + } + for (; j < size; ++j) + dst[j] *= srci[j]; + } + } + + template void SynetEltwiseLayerForwardSum(const float * src0, const __m128 & weight0, const float * src1, const __m128 & weight1, float * dst, size_t offset) + { + Store(dst + offset, _mm_add_ps(_mm_mul_ps(Load(src0 + offset), weight0), _mm_mul_ps(Load(src1 + offset), weight1))); + } + + template void SynetEltwiseLayerForwardSum(const float * src, const __m128 & weight, float * dst, size_t offset) + { + Store(dst + offset, _mm_add_ps(_mm_mul_ps(Load(src + offset), weight), Load(dst + offset))); + } + + template void SynetEltwiseLayerForwardSum(float const * const * src, const float * weight, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t partial = AlignLo(size, F); + const float * src0 = src[0]; + const float * src1 = src[1]; + __m128 weight0 = _mm_set1_ps(weight[0]); + __m128 weight1 = _mm_set1_ps(weight[1]); + size_t j = 0; + if (partial) + { + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 0); + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 1); + SynetEltwiseLayerForwardSum(src0, 
weight0, src1, weight1, dst, j + F * 2); + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardSum(src0, weight0, src1, weight1, dst, j); + } + for (; j < size; ++j) + dst[j] = src0[j] * weight[0] + src1[j] * weight[1]; + for (size_t i = 2; i < count; ++i) + { + const float * srci = src[i]; + __m128 weighti = _mm_set1_ps(weight[i]); + size_t j = 0; + if (partial) + { + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 0); + SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 1); + SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 2); + SynetEltwiseLayerForwardSum(srci, weighti, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardSum(srci, weighti, dst, j); + } + for (; j < size; ++j) + dst[j] += srci[j] * weight[i]; + } + } + + template void SynetEltwiseLayerForwardMax(const float * src0, const float * src1, float * dst, size_t offset) + { + Store(dst + offset, _mm_max_ps(Load(src0 + offset), Load(src1 + offset))); + } + + template void SynetEltwiseLayerForwardMax(float const * const * src, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t partial = AlignLo(size, F); + const float * src0 = src[0]; + const float * src1 = src[1]; + size_t j = 0; + if (partial) + { + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardMax(src0, src1, dst, j + F * 0); + SynetEltwiseLayerForwardMax(src0, src1, dst, j + F * 1); + SynetEltwiseLayerForwardMax(src0, src1, dst, j + F * 2); + SynetEltwiseLayerForwardMax(src0, src1, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardMax(src0, src1, dst, j); + } + for (; j < size; ++j) + dst[j] = Simd::Max(src0[j], src1[j]); + for (size_t i = 2; i < count; ++i) + { + const float * srci = src[i]; + size_t j = 0; + if (partial) + { + for (; j < aligned; j += QF) + { + SynetEltwiseLayerForwardMax(dst, srci, dst, j + F 
* 0); + SynetEltwiseLayerForwardMax(dst, srci, dst, j + F * 1); + SynetEltwiseLayerForwardMax(dst, srci, dst, j + F * 2); + SynetEltwiseLayerForwardMax(dst, srci, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetEltwiseLayerForwardMax(dst, srci, dst, j); + } + for (; j < size; ++j) + dst[j] = Simd::Max(dst[j], srci[j]); + } + } + + template void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst) + { + switch (type) + { + case SimdSynetEltwiseOperationProduct: + SynetEltwiseLayerForwardProduct(src, count, size, dst); + break; + case SimdSynetEltwiseOperationSum: + SynetEltwiseLayerForwardSum(src, weight, count, size, dst); + break; + case SimdSynetEltwiseOperationMax: + SynetEltwiseLayerForwardMax(src, count, size, dst); + break; + default: + assert(0); + } + } + + void SynetEltwiseLayerForward(float const * const * src, const float * weight, size_t count, size_t size, SimdSynetEltwiseOperationType type, float * dst) + { + assert(count >= 2); + bool aligned = Aligned(dst) && Aligned(src[0]) && Aligned(src[1]); + for (size_t i = 2; i < count; ++i) + aligned = aligned && Aligned(src[i]); + if (aligned) + SynetEltwiseLayerForward(src, weight, count, size, type, dst); + else + SynetEltwiseLayerForward(src, weight, count, size, type, dst); + } + + template SIMD_INLINE void SynetScaleLayerForward(const float * src, const __m128 & scale, const __m128 & bias, float * dst, size_t offset) + { + Store(dst + offset, _mm_add_ps(_mm_mul_ps(Load(src + offset), scale), bias)); + } + + template SIMD_INLINE void SynetScaleLayerForward(const float * src, const __m128 & scale, float * dst, size_t offset) + { + Store(dst + offset, _mm_mul_ps(Load(src + offset), scale)); + } + + template SIMD_INLINE void SynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst) + { + size_t aligned = AlignLo(size, QF); + size_t 
partial = AlignLo(size, F); + if (bias) + { + for (size_t i = 0; i < count; ++i) + { + size_t j = 0; + if (partial) + { + __m128 _scale = _mm_set1_ps(scale[i]); + __m128 _bias = _mm_set1_ps(bias[i]); + for (; j < aligned; j += QF) + { + SynetScaleLayerForward(src, _scale, _bias, dst, j + F * 0); + SynetScaleLayerForward(src, _scale, _bias, dst, j + F * 1); + SynetScaleLayerForward(src, _scale, _bias, dst, j + F * 2); + SynetScaleLayerForward(src, _scale, _bias, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetScaleLayerForward(src, _scale, _bias, dst, j); + } + for (; j < size; ++j) + dst[j] = src[j] * scale[i] + bias[i]; + src += size; + dst += size; + } + } + else + { + for (size_t i = 0; i < count; ++i) + { + size_t j = 0; + if (partial) + { + __m128 _scale = _mm_set1_ps(scale[i]); + for (; j < aligned; j += QF) + { + SynetScaleLayerForward(src, _scale, dst, j + F * 0); + SynetScaleLayerForward(src, _scale, dst, j + F * 1); + SynetScaleLayerForward(src, _scale, dst, j + F * 2); + SynetScaleLayerForward(src, _scale, dst, j + F * 3); + } + for (; j < partial; j += F) + SynetScaleLayerForward(src, _scale, dst, j); + } + for (; j < size; ++j) + dst[j] = src[j] * scale[i]; + src += size; + dst += size; + } + } + } + + void SynetScaleLayerForward(const float * src, const float * scale, const float * bias, size_t count, size_t size, float * dst) + { + if (Aligned(dst) && Aligned(size)) + SynetScaleLayerForward(src, scale, bias, count, size, dst); + else + SynetScaleLayerForward(src, scale, bias, count, size, dst); + } + } +#endif// SIMD_SSE_ENABLE +} diff --git a/src/3rd/Simd/SimdSse2.h b/src/3rd/Simd/SimdSse2.h index 812010e0..6801ef5d 100644 --- a/src/3rd/Simd/SimdSse2.h +++ b/src/3rd/Simd/SimdSse2.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -277,12 +277,16 @@ namespace Simd void ValueSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); void SquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum); + + void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum); void CorrelationSum(const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride, size_t width, size_t height, uint64_t * sum); void StretchGray2x2(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride); + void SynetLrnLayerCrossChannels(const float * src, size_t half, size_t count, size_t size, const float * k, float * dst); + void TextureBoostedSaturatedGradient(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t saturation, uint8_t boost, uint8_t * dx, size_t dxStride, uint8_t * dy, size_t dyStride); diff --git a/src/3rd/Simd/SimdSse2Float32.cpp b/src/3rd/Simd/SimdSse2Float32.cpp index 4418419f..6338efed 100644 --- a/src/3rd/Simd/SimdSse2Float32.cpp +++ b/src/3rd/Simd/SimdSse2Float32.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -70,7 +70,7 @@ namespace Simd SIMD_INLINE __m128 Uint8ToFloat32(const __m128i & value, const __m128 & lower, const __m128 & boost) { - return _mm_sub_ps(_mm_mul_ps(_mm_cvtepi32_ps(value), boost), lower); + return _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(value), boost), lower); } template SIMD_INLINE void Uint8ToFloat32(const uint8_t * src, const __m128 & lower, const __m128 & boost, float * dst) diff --git a/src/3rd/Simd/SimdSse2Neural.cpp b/src/3rd/Simd/SimdSse2Neural.cpp index abde119c..fe625263 100644 --- a/src/3rd/Simd/SimdSse2Neural.cpp +++ b/src/3rd/Simd/SimdSse2Neural.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -25,6 +25,7 @@ #include "Simd/SimdExtract.h" #include "Simd/SimdStore.h" #include "Simd/SimdStream.h" +#include "Simd/SimdPow.h" namespace Simd { @@ -99,84 +100,28 @@ namespace Simd NeuralConvert(src, srcStride, width, height, dst, dstStride); } - class PowEstimator + template void NeuralPow(const float * src, size_t size, const float * exponent, float * dst) { - __m128i _exponent, _mantissa; - __m128 _one; - - void Init() - { - _exponent = _mm_set1_epi32(0x7F800000); - _mantissa = _mm_set1_epi32(0x007FFFFF); - _one = _mm_set1_ps(1.0f); - } - - SIMD_INLINE __m128 Poly5(__m128 x, float a, float b, float c, float d, float e, float f) - { - __m128 p = _mm_set1_ps(f); - p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(e)); - p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(d)); - p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(c)); - p = _mm_add_ps(_mm_mul_ps(x, p), _mm_set1_ps(b)); - p = _mm_add_ps(_mm_mul_ps(x, p), 
_mm_set1_ps(a)); - return p; - } - - SIMD_INLINE __m128 Exp2(__m128 x) - { - x = _mm_max_ps(_mm_min_ps(x, _mm_set1_ps(129.00000f)), _mm_set1_ps(-126.99999f)); - __m128i ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f))); - __m128 fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart)); - __m128 expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23)); - __m128 expfpart = Poly5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f); - return _mm_mul_ps(expipart, expfpart); - } - - SIMD_INLINE __m128 Log2(__m128 x) - { - __m128i i = _mm_castps_si128(x); - __m128 e = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, _exponent), 23), _mm_set1_epi32(127))); - __m128 m = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, _mantissa)), _one); - __m128 p = Poly5(m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f); - return _mm_add_ps(_mm_mul_ps(p, _mm_sub_ps(m, _one)), e); - } - - SIMD_INLINE __m128 Pow(__m128 basis, __m128 exponent) - { - return Exp2(_mm_mul_ps(Log2(basis), exponent)); - } - - template void Run(const float * src, size_t size, const float * exponent, float * dst) - { - if (align) - assert(Aligned(src) && Aligned(dst)); - - float e = exponent[0]; - size_t alignedSize = AlignLo(size, F); - __m128 _e = _mm_set1_ps(e); - size_t i = 0; - for (; i < alignedSize; i += F) - Sse::Store(dst + i, Pow(Sse::Load(src + i), _e)); - for (; i < size; ++i) - dst[i] = Base::Pow(src[i], e); - } - - public: - void Run(const float * src, size_t size, const float * exponent, float * dst) - { - Init(); - - if (Aligned(src) && Aligned(dst)) - Run(src, size, exponent, dst); - else - Run(src, size, exponent, dst); - } - }; + if (align) + assert(Aligned(src) && Aligned(dst)); + + float e = exponent[0]; + size_t alignedSize = AlignLo(size, F); + __m128 _e = _mm_set1_ps(e); + Pow pow; + size_t i = 0; + for (; i < alignedSize; i += F) + Sse::Store(dst + i, pow(Sse::Load(src + i), 
_e)); + for (; i < size; ++i) + dst[i] = Base::Pow(src[i], e); + } void NeuralPow(const float * src, size_t size, const float * exponent, float * dst) { - PowEstimator estimator; - estimator.Run(src, size, exponent, dst); + if (Aligned(src) && Aligned(dst)) + NeuralPow(src, size, exponent, dst); + else + NeuralPow(src, size, exponent, dst); } class ExpEstimator diff --git a/src/3rd/Simd/SimdSse2ReduceGray2x2.cpp b/src/3rd/Simd/SimdSse2ReduceGray2x2.cpp index a913fa22..15b262ce 100644 --- a/src/3rd/Simd/SimdSse2ReduceGray2x2.cpp +++ b/src/3rd/Simd/SimdSse2ReduceGray2x2.cpp @@ -1,7 +1,8 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar, +* 2018-2018 Kirill Matsaberydze. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -77,9 +78,9 @@ namespace Simd { dstOffset = dstWidth - A - (evenWidth != srcWidth ? 1 : 0); srcOffset = evenWidth - DA; - Store((__m128i*)(dst + dstOffset), Average8( - Load((__m128i*)(src0 + srcOffset)), Load((__m128i*)(src0 + srcOffset + A)), - Load((__m128i*)(src1 + srcOffset)), Load((__m128i*)(src1 + srcOffset + A)))); + Store((__m128i*)(dst + dstOffset), Average8( + Load((__m128i*)(src0 + srcOffset)), Load((__m128i*)(src0 + srcOffset + A)), + Load((__m128i*)(src1 + srcOffset)), Load((__m128i*)(src1 + srcOffset + A)))); if (evenWidth != srcWidth) { dst[dstWidth - 1] = Base::Average(src0[evenWidth], src1[evenWidth]); diff --git a/src/3rd/Simd/SimdSse2Statistic.cpp b/src/3rd/Simd/SimdSse2Statistic.cpp index 4f3ba118..57ef1950 100644 --- a/src/3rd/Simd/SimdSse2Statistic.cpp +++ b/src/3rd/Simd/SimdSse2Statistic.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -521,10 +521,49 @@ namespace Simd else SquareSum(src, stride, width, height, sum); } + + template void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum) + { + assert(width >= A); + if (align) + assert(Aligned(src) && Aligned(stride)); + + size_t bodyWidth = AlignLo(width, A); + __m128i tailMask = ShiftLeft(K_INV_ZERO, A - width + bodyWidth); + __m128i fullValueSum = _mm_setzero_si128(); + __m128i fullSquareSum = _mm_setzero_si128(); + for (size_t row = 0; row < height; ++row) + { + __m128i rowSquareSum = _mm_setzero_si128(); + for (size_t col = 0; col < bodyWidth; col += A) + { + const __m128i value = Load((__m128i*)(src + col)); + fullValueSum = _mm_add_epi64(_mm_sad_epu8(value, K_ZERO), fullValueSum); + rowSquareSum = _mm_add_epi32(rowSquareSum, Square(value)); + } + if (width - bodyWidth) + { + const __m128i value = _mm_and_si128(tailMask, Load((__m128i*)(src + width - A))); + fullValueSum = _mm_add_epi64(_mm_sad_epu8(value, K_ZERO), fullValueSum); + rowSquareSum = _mm_add_epi32(rowSquareSum, Square(value)); + } + fullSquareSum = _mm_add_epi64(fullSquareSum, HorizontalSum32(rowSquareSum)); + src += stride; + } + *valueSum = ExtractInt64Sum(fullValueSum); + *squareSum = ExtractInt64Sum(fullSquareSum); + } + + void ValueSquareSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * valueSum, uint64_t * squareSum) + { + if (Aligned(src) && Aligned(stride)) + ValueSquareSum(src, stride, width, height, valueSum, squareSum); + else + ValueSquareSum(src, stride, width, height, valueSum, squareSum); + } SIMD_INLINE __m128i Correlation(__m128i a, __m128i b) - { - const __m128i lo = _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()), _mm_unpacklo_epi8(b, _mm_setzero_si128())); + { const __m128i lo = 
_mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()), _mm_unpacklo_epi8(b, _mm_setzero_si128())); const __m128i hi = _mm_madd_epi16(_mm_unpackhi_epi8(a, _mm_setzero_si128()), _mm_unpackhi_epi8(b, _mm_setzero_si128())); return _mm_add_epi32(lo, hi); } diff --git a/src/3rd/Simd/SimdSse2Synet.cpp b/src/3rd/Simd/SimdSse2Synet.cpp new file mode 100644 index 00000000..d0e1ae88 --- /dev/null +++ b/src/3rd/Simd/SimdSse2Synet.cpp @@ -0,0 +1,91 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2018 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+*/ +#include "Simd/SimdMemory.h" +#include "Simd/SimdStore.h" +#include "Simd/SimdArray.h" +#include "Simd/SimdPow.h" + +namespace Simd +{ +#ifdef SIMD_SSE2_ENABLE + namespace Sse2 + { + template SIMD_INLINE void SynetLrnLayerCrossChannels(const float * src, size_t half, size_t count, size_t size, const float * k, float * dst) + { + size_t aligned = AlignLo(size, F); + Array32f sum(size, true), zero(size, true); + + for (size_t i = 0; i < half; ++i) + { + const float * pos = src + i * size; + size_t j = 0; + for (; j < aligned; j += F) + { + __m128 _pos = Sse::Load(pos + j); + Sse::Store(sum.data + j, _mm_add_ps(Sse::Load(sum.data + j), _mm_mul_ps(_pos, _pos))); + } + for (; j < size; ++j) + sum[j] += Simd::Square(pos[j]); + } + + __m128 k0 = _mm_set1_ps(k[0]); + __m128 k1 = _mm_set1_ps(k[1]); + __m128 k2 = _mm_set1_ps(k[2]); + Sse2::Pow pow; + for (size_t i = 0; i < count; ++i) + { + const float * pos = (i < count - half) ? src + half * size : zero.data; + const float * neg = (i > half) ? 
src - (half + 1) * size : zero.data; + size_t j = 0; + for (; j < aligned; j += F) + { + __m128 _pos = Sse::Load(pos + j); + __m128 _neg = Sse::Load(neg + j); + __m128 _sum = Sse::Load(sum.data + j); + _sum = _mm_add_ps(_sum, _mm_sub_ps(_mm_mul_ps(_pos, _pos), _mm_mul_ps(_neg, _neg))); + __m128 _src = Sse::Load(src + j); + Sse::Store(sum.data + j, _sum); + Sse::Store(dst + j, _mm_mul_ps(_src, pow(_mm_add_ps(k0, _mm_mul_ps(k1, _sum)), k2))); + } + for (; j < size; ++j) + { + sum[j] += Simd::Square(pos[j]); + sum[j] -= Simd::Square(neg[j]); + dst[j] = src[j] * Base::Pow(k[0] + k[1] * sum[j], k[2]); + } + src += size; + dst += size; + } + } + + void SynetLrnLayerCrossChannels(const float * src, size_t half, size_t count, size_t size, const float * k, float * dst) + { + if (Aligned(src) && Aligned(dst) && Aligned(size)) + SynetLrnLayerCrossChannels(src, half, count, size, k, dst); + else + SynetLrnLayerCrossChannels(src, half, count, size, k, dst); + } + } +#endif// SIMD_SSE2_ENABLE +} diff --git a/src/3rd/Simd/SimdSse3Neural.cpp b/src/3rd/Simd/SimdSse3Neural.cpp index 78f9c3c5..886468db 100644 --- a/src/3rd/Simd/SimdSse3Neural.cpp +++ b/src/3rd/Simd/SimdSse3Neural.cpp @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -912,7 +912,7 @@ namespace Simd bool Preferable(size_t srcDepth, size_t kernelX, size_t kernelY, size_t strideX, size_t strideY, size_t dilationX, size_t dilationY, size_t dstWidth, size_t dstHeight, size_t dstDepth) { - if (kernelX == kernelY && kernelX >= 2 && kernelX <= 5 && strideX*strideY*dilationX*dilationY == 1) + if (kernelX == kernelY && kernelX >= 2 && kernelX <= 5 && strideX*strideY*dilationX*dilationY == 1 && dstWidth >= F) { if (dstWidth*dstHeight*kernelX*kernelY >= 8 * 8 * 5 * 5) return true; diff --git a/src/3rd/Simd/SimdSse41.h b/src/3rd/Simd/SimdSse41.h index c51f90b5..0fddd817 100644 --- a/src/3rd/Simd/SimdSse41.h +++ b/src/3rd/Simd/SimdSse41.h @@ -55,7 +55,7 @@ namespace Simd void HogLiteExtractFeatures(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t cell, float * features, size_t featuresStride); - void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride); + void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride); void HogLiteResizeFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, float * dst, size_t dstStride, size_t dstWidth, size_t dstHeight); diff --git a/src/3rd/Simd/SimdSse41Hog.cpp b/src/3rd/Simd/SimdSse41Hog.cpp index be0e5499..d7d841e5 100644 --- a/src/3rd/Simd/SimdSse41Hog.cpp +++ b/src/3rd/Simd/SimdSse41Hog.cpp @@ -434,12 +434,12 @@ namespace Simd _mm_storeu_ps(h1[0] + i, _mm_add_ps(_mm_loadu_ps(h1[0] + i), _mm_unpacklo_ps(s10, 
s11))); _mm_storeu_ps(h1[1] + i, _mm_add_ps(_mm_loadu_ps(h1[1] + i), _mm_unpackhi_ps(s10, s11))); } - __m128 s0 = _mm_add_ps(_mm_unpacklo_ps(ps[16], ps[17]), _mm_loadh_pi(_mm_loadl_pi(_mm_setzero_ps(), (__m64*)(h0[0] + 16)), (__m64*)(h0[1] + 16))); - __m128 s1 = _mm_add_ps(_mm_unpackhi_ps(ps[16], ps[17]), _mm_loadh_pi(_mm_loadl_pi(_mm_setzero_ps(), (__m64*)(h1[0] + 16)), (__m64*)(h1[1] + 16))); - _mm_storel_pi((__m64*)(h0[0] + 16), s0); - _mm_storeh_pi((__m64*)(h0[1] + 16), s0); - _mm_storel_pi((__m64*)(h1[0] + 16), s1); - _mm_storeh_pi((__m64*)(h1[1] + 16), s1); + __m128 s0 = _mm_add_ps(_mm_unpacklo_ps(ps[16], ps[17]), Sse::Load(h0[0] + 16, h0[1] + 16)); + __m128 s1 = _mm_add_ps(_mm_unpackhi_ps(ps[16], ps[17]), Sse::Load(h1[0] + 16, h1[1] + 16)); + Sse::StoreHalf<0>(h0[0] + 16, s0); + Sse::StoreHalf<1>(h0[1] + 16, s0); + Sse::StoreHalf<0>(h1[0] + 16, s1); + Sse::StoreHalf<1>(h1[1] + 16, s1); h0++; h1++; src += 4 * Q2; diff --git a/src/3rd/Simd/SimdSse41HogLite.cpp b/src/3rd/Simd/SimdSse41HogLite.cpp index aca011bc..0d249610 100644 --- a/src/3rd/Simd/SimdSse41HogLite.cpp +++ b/src/3rd/Simd/SimdSse41HogLite.cpp @@ -358,9 +358,9 @@ namespace Simd sums[3] = _mm_add_ps(sums[3], _mm_mul_ps(Load(src + 3 * step), _filter)); } - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterSize, float * dst, size_t dstStride) + template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterWidth, size_t filterHeight, float * dst, size_t dstStride) { - size_t filterStride = featureSize * filterSize; + size_t filterStride = featureSize * filterWidth; size_t alignedDstWidth = AlignLo(dstWidth, 4); for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) { @@ -370,7 +370,7 @@ namespace Simd __m128 sums[4] = { _mm_setzero_ps(), _mm_setzero_ps(), _mm_setzero_ps(), _mm_setzero_ps() }; const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; 
const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { size_t filterCol = 0; for (; filterCol < filterStride; filterCol += F) @@ -385,7 +385,7 @@ namespace Simd __m128 sum = _mm_setzero_ps(); const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { for (size_t filterCol = 0; filterCol < filterStride; filterCol += F) ProductSum1x1(pSrc + filterCol, pFilter + filterCol, sum); @@ -398,9 +398,9 @@ namespace Simd } } - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { - size_t filterStride = featureSize * filterSize; + size_t filterStride = featureSize * filterWidth; size_t alignedDstWidth = AlignLo(dstWidth, 4); __m128 _min = _mm_set1_ps(-FLT_MAX); for (size_t dstRow = 0; dstRow < dstHeight; ++dstRow) @@ -416,7 +416,7 @@ namespace Simd __m128 sums[4] = { _mm_setzero_ps(), _mm_setzero_ps(), _mm_setzero_ps(), _mm_setzero_ps() }; const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; const float * pFilter = filter; - for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { size_t filterCol = 0; for (; filterCol < filterStride; filterCol += F) @@ -434,7 +434,7 @@ namespace Simd __m128 sum = _mm_setzero_ps(); const float * pSrc = src + dstRow * srcStride + dstCol * featureSize; const float * pFilter = filter; - 
for (size_t filterRow = 0; filterRow < filterSize; ++filterRow) + for (size_t filterRow = 0; filterRow < filterHeight; ++filterRow) { for (size_t filterCol = 0; filterCol < filterStride; filterCol += F) ProductSum1x1(pSrc + filterCol, pFilter + filterCol, sum); @@ -451,53 +451,53 @@ namespace Simd } } - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterSize, float * dst, size_t dstStride) + template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, float * dst, size_t dstStride) { if (featureSize == 16) - Filter(src, srcStride, dstWidth, dstHeight, filter, filterSize, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, dst, dstStride); else - Filter(src, srcStride, dstWidth, dstHeight, filter, filterSize, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, dst, dstStride); } - template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + template void Filter(const float * src, size_t srcStride, size_t dstWidth, size_t dstHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { if (featureSize == 16) - Filter(src, srcStride, dstWidth, dstHeight, filter, filterSize, mask, maskStride, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); else - Filter(src, srcStride, dstWidth, dstHeight, filter, filterSize, mask, maskStride, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, filter, filterWidth, 
filterHeight, mask, maskStride, dst, dstStride); } public: - void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + void Run(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { assert(featureSize == 8 || featureSize == 16); - assert(srcWidth >= filterSize && srcHeight >= filterSize); + assert(srcWidth >= filterWidth && srcHeight >= filterHeight); - size_t dstWidth = srcWidth - filterSize + 1; - size_t dstHeight = srcHeight - filterSize + 1; + size_t dstWidth = srcWidth - filterWidth + 1; + size_t dstHeight = srcHeight - filterHeight + 1; if (mask) { if (Aligned(src) && Aligned(srcStride) && Aligned(filter)) - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); else - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); } else { if (Aligned(src) && Aligned(srcStride) && Aligned(filter)) - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, dst, dstStride); else - Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterSize, dst, dstStride); + Filter(src, srcStride, dstWidth, dstHeight, featureSize, filter, filterWidth, filterHeight, dst, dstStride); } } }; - void HogLiteFilterFeatures(const float * src, size_t 
srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterSize, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) + void HogLiteFilterFeatures(const float * src, size_t srcStride, size_t srcWidth, size_t srcHeight, size_t featureSize, const float * filter, size_t filterWidth, size_t filterHeight, const uint32_t * mask, size_t maskStride, float * dst, size_t dstStride) { HogLiteFeatureFilter featureFilter; - featureFilter.Run(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterSize, mask, maskStride, dst, dstStride); + featureFilter.Run(src, srcStride, srcWidth, srcHeight, featureSize, filter, filterWidth, filterHeight, mask, maskStride, dst, dstStride); } namespace HogLiteFeatureResizerDetail diff --git a/src/3rd/Simd/SimdStore.h b/src/3rd/Simd/SimdStore.h index 0fa43918..1d188986 100644 --- a/src/3rd/Simd/SimdStore.h +++ b/src/3rd/Simd/SimdStore.h @@ -45,6 +45,18 @@ namespace Simd { _mm_store_ps(p, a); } + + template SIMD_INLINE void StoreHalf(float * p, __m128 a); + + template <> SIMD_INLINE void StoreHalf<0>(float * p, __m128 a) + { + _mm_storel_pi((__m64*)p, a); + } + + template <> SIMD_INLINE void StoreHalf<1>(float * p, __m128 a) + { + _mm_storeh_pi((__m64*)p, a); + } } #endif//SIMD_SSE_ENABLE diff --git a/src/3rd/Simd/SimdVersion.h b/src/3rd/Simd/SimdVersion.h index 686d4e81..dc3163f8 100644 --- a/src/3rd/Simd/SimdVersion.h +++ b/src/3rd/Simd/SimdVersion.h @@ -1,7 +1,7 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -26,13 +26,15 @@ * File name : SimdVersion.h * Description : This file contains information about current version. * -* Do not change this file because the file is auto generated by script. 
+* Do not change this file because the file is auto generated by script: +* 'prj/cmd/GetVersion.cmd' for Microsoft Visual Studio or +* 'prj/sh/GetVersion.sh' for CMake. */ #ifndef __SimdVersion_h__ #define __SimdVersion_h__ -#define SIMD_VERSION "4.1.60.1349" +#define SIMD_VERSION "4.1.64.1404" #endif//__SimdVersion_h__ diff --git a/src/3rd/Simd/SimdView.hpp b/src/3rd/Simd/SimdView.hpp index 2abc5791..8fcdb628 100644 --- a/src/3rd/Simd/SimdView.hpp +++ b/src/3rd/Simd/SimdView.hpp @@ -1,7 +1,8 @@ /* * Simd Library (http://ermig1979.github.io/Simd). * -* Copyright (c) 2011-2017 Yermalayeu Ihar. +* Copyright (c) 2011-2018 Yermalayeu Ihar, +* 2018-2018 Dmitry Fedorov. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -202,6 +203,28 @@ namespace Simd operator cv::Mat() const; #endif + +#ifdef SIMD_TENSORFLOW_ENABLE + /*! + Creates an Tensorflow Tensor which references this image. + + \note You have to define SIMD_TENSORFLOW_ENABLE in order to use this functionality. + + \return an Tensorflow Tensor which references to this image. + */ + void ToTFTensor(tensorflow::Tensor & tensor, float shift = 0, float scale = 1) const; + + + /*! + Creates an Tensorflow Tensor which references this image. + + \note You have to define SIMD_TENSORFLOW_ENABLE in order to use this functionality. + + \return an Tensorflow Tensor which references to this image. + */ + void ToTFTensor(tensorflow::Tensor & tensor, int batchIndex, float shift = 0, float scale = 0) const; +#endif + /*! Gets a copy of current image view. 
@@ -626,6 +649,92 @@ namespace Simd } #endif +#ifdef SIMD_TENSORFLOW_ENABLE + template class A> SIMD_INLINE void View::ToTFTensor( tensorflow::Tensor & tensor, float shift, float scale) const + { + auto mapped = tensor.tensor(); + + if (format == View::Bgr24) + { + for (size_t row = 0; row < height; ++row) + { + const uint8_t * bgr = data + row*stride; + for (size_t col = 0; col < width; ++col, bgr += 3) + { + mapped(row, col, 0) = (bgr[0] + shift) * scale; + mapped(row, col, 1) = (bgr[1] + shift) * scale; + mapped(row, col, 2) = (bgr[2] + shift) * scale; + } + } + } else if (format == View::Bgra32) + { + + for (size_t row = 0; row < height; ++row) + { + const uint8_t * bgra = data + row*stride; + for (size_t col = 0; col < width; ++col, bgra += 4) + { + mapped(row, col, 0) = (bgra[0] + shift) * scale; + mapped(row, col, 1) = (bgra[1] + shift) * scale; + mapped(row, col, 2) = (bgra[2] + shift) * scale; + } + } + } else if (format == View::Gray8) + { + for (size_t row = 0; row < height; ++row) + { + const uint8_t * gray = data + row*stride; + for (size_t col = 0; col < width; ++col) + { + mapped(row, col, 0) = (gray[0] + shift) * scale; + } + } + } + } + + template class A> SIMD_INLINE void View::ToTFTensor( tensorflow::Tensor & tensor, int batchIndex, float shift, float scale) const + { + auto mapped = tensor.tensor(); + + if (format == View::Bgr24) + { + for (size_t row = 0; row < height; ++row) + { + const uint8_t * bgr = data + row*stride; + for (size_t col = 0; col < width; ++col, bgr += 3) + { + mapped(batchIndex, row, col, 0) = ((float)bgr[0] + shift) * scale; + mapped(batchIndex, row, col, 1) = ((float)bgr[1] + shift) * scale; + mapped(batchIndex, row, col, 2) = ((float)bgr[2] + shift) * scale; + } + } + } else if (format == View::Bgra32) + { + + for (size_t row = 0; row < height; ++row) + { + const uint8_t * bgra = data + row*stride; + for (size_t col = 0; col < width; ++col, bgra += 4) + { + mapped(batchIndex, row, col, 0) = ((float)bgra[0] + shift) * 
scale; + mapped(batchIndex, row, col, 1) = ((float)bgra[1] + shift) * scale; + mapped(batchIndex, row, col, 2) = ((float)bgra[2] + shift) * scale; + } + } + } else if (format == View::Gray8) + { + for (size_t row = 0; row < height; ++row) + { + const uint8_t * gray = data + row*stride; + for (size_t col = 0; col < width; ++col) + { + mapped(batchIndex, row, col, 0) = ((float)gray[0] + shift) * scale; + } + } + } + } +#endif + template class A> SIMD_INLINE View::View(size_t w, size_t h, ptrdiff_t s, Format f, void * d) : width(w) , height(h) @@ -1039,7 +1148,7 @@ namespace Simd if (!(format == View::Gray8 || format == View::Bgr24 || format == View::Bgra32)) return false; - std::ofstream ofs(path.c_str(), std::ifstream::binary); + std::ofstream ofs(path.c_str(), std::ofstream::binary); if (ofs.is_open()) { if (format == View::Gray8)