Skip to content

Commit

Permalink
+add SSE4.1 optimizations of function SynetTiledScale2D32f.
Browse files Browse the repository at this point in the history
  • Loading branch information
ermig1979 committed Dec 19, 2024
1 parent 12ad5a8 commit d546bb9
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 7 deletions.
2 changes: 1 addition & 1 deletion docs/2025.html
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ <h4>Algorithms</h4>
<h5>New features</h5>
<ul>
<li>Parameter add in function SimdSynetMergedConvolution16bInit.</li>
<li>Base implementation of function SynetTiledScale2D32f.</li>
<li>Base implementation, SSE4.1 optimizations of function SynetTiledScale2D32f.</li>
</ul>
<h5>Improving</h5>
<ul>
Expand Down
2 changes: 1 addition & 1 deletion src/Simd/SimdLib.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6054,7 +6054,7 @@ SIMD_API void SimdSynetTiledScale2D32f(const float* src, size_t channels, size_t
SIMD_EMPTY();
#if defined(SIMD_SYNET_ENABLE)
typedef void(*SimdSynetTiledScale2D32fPtr) (const float* src, size_t channels, size_t height, size_t width, SimdTensorFormatType format, const float* ver, const float* hor, float* dst);
const static SimdSynetTiledScale2D32fPtr simdSynetTiledScale2D32f = SIMD_FUNC0(SynetTiledScale2D32f);// , SIMD_AVX512BW_FUNC, SIMD_AVX2_FUNC, SIMD_SSE41_FUNC);
const static SimdSynetTiledScale2D32fPtr simdSynetTiledScale2D32f = SIMD_FUNC1(SynetTiledScale2D32f, SIMD_SSE41_FUNC);// , SIMD_AVX512BW_FUNC, SIMD_AVX2_FUNC);

simdSynetTiledScale2D32f(src, channels, height, width, format, ver, hor, dst);
#else
Expand Down
2 changes: 2 additions & 0 deletions src/Simd/SimdSse41.h
Original file line number Diff line number Diff line change
Expand Up @@ -524,6 +524,8 @@ namespace Simd

void SynetScaleLayerForward(const float* src, const float* scale, const float* bias, size_t channels, size_t height, size_t width, float* dst, SimdTensorFormatType format, SimdSynetCompatibilityType compatibility);

void SynetTiledScale2D32f(const float* src, size_t channels, size_t height, size_t width, SimdTensorFormatType format, const float* ver, const float* hor, float* dst);

void TextureBoostedSaturatedGradient(const uint8_t* src, size_t srcStride, size_t width, size_t height,
uint8_t saturation, uint8_t boost, uint8_t* dx, size_t dxStride, uint8_t* dy, size_t dyStride);

Expand Down
45 changes: 45 additions & 0 deletions src/Simd/SimdSse41Synet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,51 @@ namespace Simd
else
assert(0);
}

//-------------------------------------------------------------------------------------------------

void SynetTiledScale2D32f(const float* src, size_t channels, size_t height, size_t width, SimdTensorFormatType format, const float* ver, const float* hor, float* dst)
{
if (format == SimdTensorFormatNchw)
{
size_t widthF = AlignLo(width, F);
for (size_t c = 0; c < channels; ++c)
{
for (size_t y = 0; y < height; ++y)
{
__m128 _hor = _mm_set1_ps(hor[y]);
size_t x = 0;
for (; x < widthF; x += F)
_mm_storeu_ps(dst + x, _mm_mul_ps(_mm_loadu_ps(src + x), _mm_mul_ps(_mm_loadu_ps(ver + x), _hor)));
for (; x < width; x += 1)
_mm_store_ss(dst + x, _mm_mul_ss(_mm_load_ss(src + x), _mm_mul_ss(_mm_load_ss(ver + x), _hor)));
src += width, dst += width;
}
hor += height;
ver += width;
}
}
else if (format == SimdTensorFormatNhwc)
{
size_t channelsF = AlignLo(channels, F);
for (size_t y = 0; y < height; ++y)
{
const float* pVer = ver;
for (size_t x = 0; x < width; ++x)
{
size_t c = 0;
for (; c < channelsF; c += F)
_mm_storeu_ps(dst + c, _mm_mul_ps(_mm_loadu_ps(src + c), _mm_mul_ps(_mm_loadu_ps(pVer + c), _mm_loadu_ps(hor + c))));
for (; c < channels; c += 1)
_mm_store_ss(dst + c, _mm_mul_ss(_mm_load_ss(src + c), _mm_mul_ss(_mm_load_ss(pVer + c), _mm_load_ss(hor + c))));
src += channels, dst += channels, pVer += channels;
}
hor += channels;
}
}
else
assert(0);
}
}
#endif
}
10 changes: 5 additions & 5 deletions src/Test/TestSynet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -431,11 +431,11 @@ namespace Test
if (TestBase())
result = result && SynetTiledScale2D32fAutoTest(FUNC_TS2D32F(Simd::Base::SynetTiledScale2D32f), FUNC_TS2D32F(SimdSynetTiledScale2D32f));

//#ifdef SIMD_SSE41_ENABLE
// if (Simd::Sse41::Enable && TestSse41())
// result = result && SynetTiledScale2D32fAutoTest(FUNC_TS2D32F(Simd::Sse41::SynetTiledScale2D32f), FUNC_TS2D32F(SimdSynetTiledScale2D32f));
//#endif
//
#ifdef SIMD_SSE41_ENABLE
if (Simd::Sse41::Enable && TestSse41())
result = result && SynetTiledScale2D32fAutoTest(FUNC_TS2D32F(Simd::Sse41::SynetTiledScale2D32f), FUNC_TS2D32F(SimdSynetTiledScale2D32f));
#endif

//#ifdef SIMD_AVX2_ENABLE
// if (Simd::Avx2::Enable && TestAvx2())
// result = result && SynetTiledScale2D32fAutoTest(FUNC_TS2D32F(Simd::Avx2::SynetTiledScale2D32f), FUNC_TS2D32F(SimdSynetTiledScale2D32f));
Expand Down

0 comments on commit d546bb9

Please sign in to comment.