diff --git a/libs/ardour/sse_functions_avx_linux.cc b/libs/ardour/sse_functions_avx_linux.cc
index b5991a0552..3b8a444e26 100644
--- a/libs/ardour/sse_functions_avx_linux.cc
+++ b/libs/ardour/sse_functions_avx_linux.cc
@@ -28,6 +28,14 @@
 
 #define IS_ALIGNED_TO(ptr, bytes) (((uintptr_t)ptr) % (bytes) == 0)
 
+#if defined(__GNUC__)
+#define IS_NOT_ALIGNED_TO(ptr, bytes) \
+	__builtin_expect(!!(reinterpret_cast<uintptr_t>(ptr) % (bytes)), 0)
+#else
+#define IS_NOT_ALIGNED_TO(ptr, bytes) \
+	(!!(reinterpret_cast<uintptr_t>(ptr) % (bytes)))
+#endif
+
 #ifdef __cplusplus
 #define C_FUNC extern "C"
 #else
@@ -67,82 +75,104 @@ x86_sse_avx_mix_buffers_no_gain_aligned(float *dst, const float *src, uint32_t n
 C_FUNC float
 x86_sse_avx_compute_peak(const float *src, uint32_t nframes, float current)
 {
-	const __m256 ABS_MASK = _mm256_set1_ps(-0.0F);
+	// If src is null then skip processing
+	if ((src == nullptr) || (nframes == 0))
+	{
+		return current;
+	}
+
+	// Broadcast mask to compute absolute value
+	const uint32_t f32_nan = UINT32_C(0x7FFFFFFF);
+	const __m256 ABS_MASK =
+	    _mm256_broadcast_ss(reinterpret_cast<const float *>(&f32_nan));
 
 	// Broadcast the current max value to all elements of the YMM register
-	__m256 vcurrent = _mm256_broadcast_ss(&current);
+	__m256 vmax = _mm256_set1_ps(current);
 
 	// Compute single min/max of unaligned portion until alignment is reached
-	while ((((intptr_t)src) % 32 != 0) && nframes > 0) {
+	while (IS_NOT_ALIGNED_TO(src, sizeof(__m256)) && (nframes > 0))
+	{
 		__m256 vsrc;
 
-		vsrc = _mm256_setzero_ps();
-		vsrc = _mm256_castps128_ps256(_mm_load_ss(src));
-		vsrc = _mm256_andnot_ps(ABS_MASK, vsrc);
-		vcurrent = _mm256_max_ps(vcurrent, vsrc);
+		vsrc = _mm256_broadcast_ss(src);
+		vsrc = _mm256_and_ps(ABS_MASK, vsrc);
+		vmax = _mm256_max_ps(vmax, vsrc);
 
 		++src;
 		--nframes;
 	}
 
-	// Process the aligned portion 16 samples at a time
-	while (nframes >= 16) {
-#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
-		_mm_prefetch(((char *)src + (16 * sizeof(float))), _mm_hint(0));
+	// Process the aligned portion 32 samples at a time
+	while (nframes >= 32)
+	{
+#ifdef _WIN32
+		_mm_prefetch(reinterpret_cast<const char *>(src + 32), _mm_hint(0));
 #else
-		__builtin_prefetch(src + (16 * sizeof(float)), 0, 0);
+		__builtin_prefetch(reinterpret_cast<const void *>(src + 32), 0, 0);
 #endif
 
-		__m256 vsrc1, vsrc2;
-		vsrc1 = _mm256_load_ps(src + 0);
-		vsrc2 = _mm256_load_ps(src + 8);
+		__m256 t0 = _mm256_load_ps(src + 0);
+		__m256 t1 = _mm256_load_ps(src + 8);
+		__m256 t2 = _mm256_load_ps(src + 16);
+		__m256 t3 = _mm256_load_ps(src + 24);
 
-		vsrc1 = _mm256_andnot_ps(ABS_MASK, vsrc1);
-		vsrc2 = _mm256_andnot_ps(ABS_MASK, vsrc2);
+		t0 = _mm256_and_ps(ABS_MASK, t0);
+		t1 = _mm256_and_ps(ABS_MASK, t1);
+		t2 = _mm256_and_ps(ABS_MASK, t2);
+		t3 = _mm256_and_ps(ABS_MASK, t3);
 
-		vcurrent = _mm256_max_ps(vcurrent, vsrc1);
-		vcurrent = _mm256_max_ps(vcurrent, vsrc2);
+		vmax = _mm256_max_ps(vmax, t0);
+		vmax = _mm256_max_ps(vmax, t1);
+		vmax = _mm256_max_ps(vmax, t2);
+		vmax = _mm256_max_ps(vmax, t3);
 
-		src += 16;
-		nframes -= 16;
+		src += 32;
+		nframes -= 32;
 	}
 
 	// Process the remaining samples 8 at a time
-	while (nframes >= 8) {
+	while (nframes >= 8)
+	{
 		__m256 vsrc;
 
 		vsrc = _mm256_load_ps(src);
-		vsrc = _mm256_andnot_ps(ABS_MASK, vsrc);
-		vcurrent = _mm256_max_ps(vcurrent, vsrc);
+		vsrc = _mm256_and_ps(ABS_MASK, vsrc);
+		vmax = _mm256_max_ps(vmax, vsrc);
 
 		src += 8;
 		nframes -= 8;
 	}
 
 	// If there are still some left 4 to 8 samples, process them below
-	while (nframes > 0) {
+	while (nframes > 0)
+	{
 		__m256 vsrc;
 
-		vsrc = _mm256_setzero_ps();
-		vsrc = _mm256_castps128_ps256(_mm_load_ss(src));
-		vsrc = _mm256_andnot_ps(ABS_MASK, vsrc);
-		vcurrent = _mm256_max_ps(vcurrent, vsrc);
+		vsrc = _mm256_broadcast_ss(src);
+		vsrc = _mm256_and_ps(ABS_MASK, vsrc);
+		vmax = _mm256_max_ps(vmax, vsrc);
 
 		++src;
 		--nframes;
 	}
 
-	// Get the current max from YMM register
-	vcurrent = avx_getmax_ps(vcurrent);
+	vmax = _mm256_max_ps(vmax, _mm256_permute2f128_ps(vmax, vmax, 1));
+	vmax = _mm256_max_ps(vmax, _mm256_permute_ps(vmax, _MM_SHUFFLE(0, 0, 3, 2)));
+	vmax = _mm256_max_ps(vmax, _mm256_permute_ps(vmax, _MM_SHUFFLE(0, 0, 0, 1)));
 
-	// zero upper 128 bit of 256 bit ymm register to avoid penalties using non-AVX instructions
-	_mm256_zeroupper();
+	// zero upper 128 bit of 256 bit ymm register to avoid penalties using non-AVX
+	// instructions.
+
+	// _mm256_zeroupper();
+	// This is probably not needed in 2021 as compilers will insert them
+	// automatically. See stackoverflow reference:
+	// https://stackoverflow.com/questions/68736527/do-i-need-to-use-mm256-zeroupper-in-2021
 
 #if defined(__GNUC__) && (__GNUC__ < 5)
-	return *((float *)&vcurrent);
+	return *((float *)&vmax);
 #elif defined(__GNUC__) && (__GNUC__ < 8)
-	return vcurrent[0];
+	return vmax[0];
 #else
-	return _mm256_cvtss_f32 (vcurrent);
+	return _mm256_cvtss_f32(vmax);
 #endif
 }
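
Note (reviewer sketch, not part of the patch): the three _mm256_max_ps/_mm256_permute* lines above inline the horizontal max reduction that the removed avx_getmax_ps() helper used to perform. Below is a minimal standalone test of that reduction idiom, assuming an AVX-capable toolchain (e.g. g++ -mavx -O2); the hmax_ps() helper name and the test harness are illustrative, not code from the patch.

#include <immintrin.h>
#include <algorithm>
#include <cstdio>

// Horizontal max of the 8 floats in a YMM register, mirroring the patch's reduction.
static float hmax_ps(__m256 v)
{
	// Fold the upper 128-bit lane onto the lower lane.
	v = _mm256_max_ps(v, _mm256_permute2f128_ps(v, v, 1));
	// Reduce within the lower lane: pairs first, then the final pair.
	v = _mm256_max_ps(v, _mm256_permute_ps(v, _MM_SHUFFLE(0, 0, 3, 2)));
	v = _mm256_max_ps(v, _mm256_permute_ps(v, _MM_SHUFFLE(0, 0, 0, 1)));
	// Element 0 now holds the maximum of all eight inputs.
	return _mm256_cvtss_f32(v);
}

int main()
{
	const float data[8] = { 0.1f, 0.7f, 0.3f, 0.9f, 0.2f, 0.5f, 0.8f, 0.4f };
	const float expected = *std::max_element(data, data + 8);
	const float got = hmax_ps(_mm256_loadu_ps(data));
	std::printf("hmax = %f, expected = %f\n", got, expected);
	return (got == expected) ? 0 : 1;
}

The first step swaps the two 128-bit lanes and takes the element-wise max, so each lane ends up holding the cross-lane maxima; the two in-lane permutes then fold four values down to one, and _mm256_cvtss_f32 extracts element 0 as the scalar peak.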