Fine-tune AVX-optimized routine
In some instances, the AVX-optimized routines were slower than compiler-generated output. This commit addresses several of those cases by tuning the code for better branch prediction and applying appropriate loop unrolling.
parent
ca5043288c
commit
a07ae169a3
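For context (not part of the commit), the routine being tuned computes a running peak: the largest absolute sample value in the buffer, never smaller than the peak the caller passes in. A minimal scalar sketch of that contract, with a hypothetical helper name:

#include <math.h>
#include <stdint.h>

/* Scalar reference for what x86_sse_avx_compute_peak returns:
 * the largest |sample| seen, seeded with the incoming peak. */
static float
compute_peak_scalar_ref(const float *src, uint32_t nframes, float current)
{
    while (nframes-- > 0) {
        current = fmaxf(current, fabsf(*src++));
    }
    return current;
}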
@@ -28,6 +28,14 @@
 
 #define IS_ALIGNED_TO(ptr, bytes) (((uintptr_t)ptr) % (bytes) == 0)
 
+#if defined(__GNUC__)
+#define IS_NOT_ALIGNED_TO(ptr, bytes) \
+    __builtin_expect(!!(reinterpret_cast<intptr_t>(ptr) % (bytes)), 0)
+#else
+#define IS_NOT_ALIGNED_TO(ptr, bytes) \
+    (!!(reinterpret_cast<intptr_t>(ptr) % (bytes)))
+#endif
+
 #ifdef __cplusplus
 #define C_FUNC extern "C"
 #else
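The hunk above adds an IS_NOT_ALIGNED_TO macro that, on GCC/Clang, wraps the alignment test in __builtin_expect(..., 0), telling the compiler the misaligned case is unlikely so the aligned path is laid out as the fall-through branch. A small self-contained usage sketch (the macro is repeated for completeness; the surrounding function is hypothetical, not from the commit):

#include <stdint.h>

#if defined(__GNUC__)
#define IS_NOT_ALIGNED_TO(ptr, bytes) \
    __builtin_expect(!!(reinterpret_cast<intptr_t>(ptr) % (bytes)), 0)
#else
#define IS_NOT_ALIGNED_TO(ptr, bytes) \
    (!!(reinterpret_cast<intptr_t>(ptr) % (bytes)))
#endif

/* Illustration: peel unaligned leading elements one at a time, then fall into
 * a body that may assume 32-byte alignment. The expect-hint keeps the rarely
 * taken fixup loop off the hot path. */
static float
sum_with_alignment_peel(const float *src, uint32_t n)
{
    float acc = 0.0f;
    while (IS_NOT_ALIGNED_TO(src, 32) && (n > 0)) {
        acc += *src++;
        --n;
    }
    while (n-- > 0) {   // stand-in for the aligned SIMD body
        acc += *src++;
    }
    return acc;
}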
@@ -67,82 +75,104 @@ x86_sse_avx_mix_buffers_no_gain_aligned(float *dst, const float *src, uint32_t n
 C_FUNC float
 x86_sse_avx_compute_peak(const float *src, uint32_t nframes, float current)
 {
-    const __m256 ABS_MASK = _mm256_set1_ps(-0.0F);
-
     // If src is null then skip processing
     if ((src == nullptr) || (nframes == 0))
     {
         return current;
     }
 
+    // Broadcast mask to compute absolute value
+    const uint32_t f32_nan = UINT32_C(0x7FFFFFFF);
+    const __m256 ABS_MASK =
+        _mm256_broadcast_ss(reinterpret_cast<const float *>(&f32_nan));
+
     // Broadcast the current max value to all elements of the YMM register
-    __m256 vcurrent = _mm256_broadcast_ss(&current);
+    __m256 vmax = _mm256_set1_ps(current);
 
     // Compute single min/max of unaligned portion until alignment is reached
-    while ((((intptr_t)src) % 32 != 0) && nframes > 0) {
+    while (IS_NOT_ALIGNED_TO(src, sizeof(__m256)) && (nframes > 0))
+    {
         __m256 vsrc;
 
-        vsrc = _mm256_setzero_ps();
-        vsrc = _mm256_castps128_ps256(_mm_load_ss(src));
-        vsrc = _mm256_andnot_ps(ABS_MASK, vsrc);
-        vcurrent = _mm256_max_ps(vcurrent, vsrc);
+        vsrc = _mm256_broadcast_ss(src);
+        vsrc = _mm256_and_ps(ABS_MASK, vsrc);
+        vmax = _mm256_max_ps(vmax, vsrc);
 
         ++src;
         --nframes;
     }
 
-    // Process the aligned portion 16 samples at a time
-    while (nframes >= 16) {
-#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
-        _mm_prefetch(((char *)src + (16 * sizeof(float))), _mm_hint(0));
+    // Process the aligned portion 32 samples at a time
+    while (nframes >= 32)
+    {
+#ifdef _WIN32
+        _mm_prefetch(reinterpret_cast<char const *>(src + 32), _mm_hint(0));
 #else
-        __builtin_prefetch(src + (16 * sizeof(float)), 0, 0);
+        __builtin_prefetch(reinterpret_cast<void const *>(src + 32), 0, 0);
 #endif
-        __m256 vsrc1, vsrc2;
-        vsrc1 = _mm256_load_ps(src + 0);
-        vsrc2 = _mm256_load_ps(src + 8);
+        __m256 t0 = _mm256_load_ps(src + 0);
+        __m256 t1 = _mm256_load_ps(src + 8);
+        __m256 t2 = _mm256_load_ps(src + 16);
+        __m256 t3 = _mm256_load_ps(src + 24);
 
-        vsrc1 = _mm256_andnot_ps(ABS_MASK, vsrc1);
-        vsrc2 = _mm256_andnot_ps(ABS_MASK, vsrc2);
+        t0 = _mm256_and_ps(ABS_MASK, t0);
+        t1 = _mm256_and_ps(ABS_MASK, t1);
+        t2 = _mm256_and_ps(ABS_MASK, t2);
+        t3 = _mm256_and_ps(ABS_MASK, t3);
 
-        vcurrent = _mm256_max_ps(vcurrent, vsrc1);
-        vcurrent = _mm256_max_ps(vcurrent, vsrc2);
+        vmax = _mm256_max_ps(vmax, t0);
+        vmax = _mm256_max_ps(vmax, t1);
+        vmax = _mm256_max_ps(vmax, t2);
+        vmax = _mm256_max_ps(vmax, t3);
 
-        src += 16;
-        nframes -= 16;
+        src += 32;
+        nframes -= 32;
     }
 
     // Process the remaining samples 8 at a time
-    while (nframes >= 8) {
+    while (nframes >= 8)
+    {
         __m256 vsrc;
 
         vsrc = _mm256_load_ps(src);
-        vsrc = _mm256_andnot_ps(ABS_MASK, vsrc);
-        vcurrent = _mm256_max_ps(vcurrent, vsrc);
+        vsrc = _mm256_and_ps(ABS_MASK, vsrc);
+        vmax = _mm256_max_ps(vmax, vsrc);
 
         src += 8;
         nframes -= 8;
     }
 
     // If there are still some left 4 to 8 samples, process them below
-    while (nframes > 0) {
+    while (nframes > 0)
+    {
         __m256 vsrc;
 
-        vsrc = _mm256_setzero_ps();
-        vsrc = _mm256_castps128_ps256(_mm_load_ss(src));
-        vsrc = _mm256_andnot_ps(ABS_MASK, vsrc);
-        vcurrent = _mm256_max_ps(vcurrent, vsrc);
+        vsrc = _mm256_broadcast_ss(src);
+        vsrc = _mm256_and_ps(ABS_MASK, vsrc);
+        vmax = _mm256_max_ps(vmax, vsrc);
 
         ++src;
         --nframes;
     }
 
     // Get the current max from YMM register
-    vcurrent = avx_getmax_ps(vcurrent);
+    vmax = _mm256_max_ps(vmax, _mm256_permute2f128_ps(vmax, vmax, 1));
+    vmax = _mm256_max_ps(vmax, _mm256_permute_ps(vmax, _MM_SHUFFLE(0, 0, 3, 2)));
+    vmax = _mm256_max_ps(vmax, _mm256_permute_ps(vmax, _MM_SHUFFLE(0, 0, 0, 1)));
 
-    // zero upper 128 bit of 256 bit ymm register to avoid penalties using non-AVX instructions
-    _mm256_zeroupper();
+    // zero upper 128 bit of 256 bit ymm register to avoid penalties using non-AVX
+    // instructions.
+
+    // _mm256_zeroupper();
+    // This is probably not needed in 2021 as compilers will insert them
+    // automatically. See stackoverflow reference:
+    // https://stackoverflow.com/questions/68736527/do-i-need-to-use-mm256-zeroupper-in-2021
 
 #if defined(__GNUC__) && (__GNUC__ < 5)
-    return *((float *)&vcurrent);
+    return *((float *)&vmax);
 #elif defined(__GNUC__) && (__GNUC__ < 8)
-    return vcurrent[0];
+    return vmax[0];
 #else
-    return _mm256_cvtss_f32 (vcurrent);
+    return _mm256_cvtss_f32(vmax);
 #endif
 }
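Two details of the rewritten body, shown as illustrative sketches rather than code from the commit. First, the absolute value: and-ing a float with the bit pattern 0x7FFFFFFF clears the IEEE-754 sign bit, which is what the new _mm256_and_ps(ABS_MASK, ...) lines do for eight samples at once (the old code achieved the same result with _mm256_andnot_ps and a -0.0F mask):

#include <immintrin.h>
#include <stdint.h>

/* Illustrative only: |x| for 8 packed floats by masking off the sign bit. */
static inline __m256 abs_ps(__m256 v)
{
    const uint32_t sign_clear = UINT32_C(0x7FFFFFFF);
    const __m256 abs_mask =
        _mm256_broadcast_ss(reinterpret_cast<const float *>(&sign_clear));
    return _mm256_and_ps(abs_mask, v);
}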
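Second, the final reduction: instead of calling avx_getmax_ps, the peak is folded down in-register with three max/permute steps, each halving the number of distinct candidates until lane 0 holds the overall maximum. A self-contained sketch with a hypothetical helper name:

#include <immintrin.h>

static inline float hmax_ps(__m256 v)
{
    v = _mm256_max_ps(v, _mm256_permute2f128_ps(v, v, 1));               // max across the two 128-bit halves
    v = _mm256_max_ps(v, _mm256_permute_ps(v, _MM_SHUFFLE(0, 0, 3, 2))); // max across 64-bit pairs within each lane
    v = _mm256_max_ps(v, _mm256_permute_ps(v, _MM_SHUFFLE(0, 0, 0, 1))); // max of the remaining pair
    return _mm256_cvtss_f32(v);                                          // lane 0 now holds the maximum
}

The commented-out _mm256_zeroupper() reflects the note in the diff: current compilers emit vzeroupper automatically at the AVX/SSE boundary, per the linked Stack Overflow discussion.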