From 03abc1076eaef4057c089eda28e69e1d26bf7455 Mon Sep 17 00:00:00 2001
From: Ayan Shafqat
Date: Sun, 9 Aug 2020 16:10:40 -0400
Subject: [PATCH] Adding AVX optimized routines for Linux

This commit adds AVX optimized routines for the following procedures:

*_compute_peak
*_find_peaks
*_apply_gain_to_buffer
*_mix_buffers_with_gain
*_mix_buffers_no_gain

The AVX optimized routines have the prefix: x86_sse_avx_

Note: mix_buffers_with_gain and mix_buffers_no_gain may prefer the SSE
implementation over AVX if source and destination pointers are aligned
to 16 byte boundaries but not to 32. Therefore, it is optimal if _all_
audio buffers are allocated on 32 byte boundaries to take full
advantage of the AVX ISA extension.
---
 libs/ardour/ardour/mix.h               |   2 +-
 libs/ardour/globals.cc                 |  10 +-
 libs/ardour/sse_functions_avx_linux.cc | 839 ++++++++++++++++++++++++-
 3 files changed, 824 insertions(+), 27 deletions(-)

diff --git a/libs/ardour/ardour/mix.h b/libs/ardour/ardour/mix.h
index 40f6348c5e..a22d9587fb 100644
--- a/libs/ardour/ardour/mix.h
+++ b/libs/ardour/ardour/mix.h
@@ -40,10 +40,10 @@ extern "C" {
 	LIBARDOUR_API void x86_sse_avx_mix_buffers_with_gain(float * dst, const float * src, uint32_t nframes, float gain);
 	LIBARDOUR_API void x86_sse_avx_mix_buffers_no_gain (float * dst, const float * src, uint32_t nframes);
 	LIBARDOUR_API void x86_sse_avx_copy_vector (float * dst, const float * src, uint32_t nframes);
+	LIBARDOUR_API void x86_sse_avx_find_peaks (const float * buf, uint32_t nsamples, float *min, float *max);
 }
 
 LIBARDOUR_API void x86_sse_find_peaks (const float * buf, uint32_t nsamples, float *min, float *max);
-LIBARDOUR_API void x86_sse_avx_find_peaks (const float * buf, uint32_t nsamples, float *min, float *max);
 
 /* debug wrappers for SSE functions */
 
diff --git a/libs/ardour/globals.cc b/libs/ardour/globals.cc
index 07eced83c3..b70e51fc4c 100644
--- a/libs/ardour/globals.cc
+++ b/libs/ardour/globals.cc
@@ -189,14 +189,8 @@ setup_hardware_optimization (bool try_optimization)
 
 #if defined(ARCH_X86) && defined(BUILD_SSE_OPTIMIZATIONS)
 
-#ifdef PLATFORM_WINDOWS
-	/* We have AVX-optimized code for Windows */
-	if (fpu->has_avx ())
-#else
-	/* AVX code doesn't compile on Linux yet */
-	if (false)
-#endif
-	{
+	/* We have AVX-optimized code for Windows and Linux */
+	if (fpu->has_avx ()) {
 		info << "Using AVX optimized routines" << endmsg;
 
 		// AVX SET
diff --git a/libs/ardour/sse_functions_avx_linux.cc b/libs/ardour/sse_functions_avx_linux.cc
index ec591642e7..6d49e46a78 100644
--- a/libs/ardour/sse_functions_avx_linux.cc
+++ b/libs/ardour/sse_functions_avx_linux.cc
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2015 Paul Davis
+ * Copyright (C) 2020 Ayan Shafqat
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -18,38 +19,840 @@
 
 #include "ardour/mix.h"
 
-float
-x86_sse_avx_compute_peak (const float * buf, uint32_t nsamples, float current)
+#include <immintrin.h>
+#include <stdint.h>
+
+#ifndef __AVX__
+#error "__AVX__ must be enabled for this module to work"
+#endif
+
+#define IS_ALIGNED_TO(ptr, bytes) (((uintptr_t)ptr) % (bytes) == 0)
+
+#ifdef __cplusplus
+#define C_FUNC extern "C"
+#else
+#define C_FUNC
+#endif
+
+/**
+ * Local functions
+ */
+
+static inline __m256 avx_getmax_ps(__m256 vmax);
+static inline __m256 avx_getmin_ps(__m256 vmin);
+
+static void
+x86_sse_avx_mix_buffers_with_gain_unaligned(float *dst, const float *src, uint32_t nframes, float gain);
+
+static void
+x86_sse_avx_mix_buffers_with_gain_aligned(float *dst, const float *src, uint32_t nframes, float gain);
+
+static void
+x86_sse_avx_mix_buffers_no_gain_unaligned(float *dst, const float *src, uint32_t nframes);
+
+static void
+x86_sse_avx_mix_buffers_no_gain_aligned(float *dst, const float *src, uint32_t nframes);
+
+/**
+ * Module implementation
+ */
+
+/**
+ * @brief x86-64 AVX optimized routine for compute peak procedure
+ * @param src Pointer to source buffer
+ * @param nframes Number of frames to process
+ * @param current Current peak value
+ * @return float New peak value
+ */
+C_FUNC float
+x86_sse_avx_compute_peak(const float *src, uint32_t nframes, float current)
 {
-	return default_compute_peak (buf, nsamples, current);
+	const __m256 ABS_MASK = _mm256_set1_ps(-0.0F);
+
+	// Broadcast the current max value to all elements of the YMM register
+	__m256 vcurrent = _mm256_broadcast_ss(&current);
+
+	// Process the unaligned portion one sample at a time until alignment is reached
+	while ((((intptr_t)src) % 32 != 0) && nframes > 0) {
+		__m256 vsrc;
+
+		vsrc = _mm256_setzero_ps();
+		vsrc = _mm256_castps128_ps256(_mm_load_ss(src));
+		vsrc = _mm256_andnot_ps(ABS_MASK, vsrc);
+		vcurrent = _mm256_max_ps(vcurrent, vsrc);
+
+		++src;
+		--nframes;
+	}
+
+	// Process the aligned portion 16 samples at a time
+	while (nframes >= 16) {
+#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
+		_mm_prefetch(((char *)src + (16 * sizeof(float))), _mm_hint(0));
+#else
+		__builtin_prefetch(src + (16 * sizeof(float)), 0, 0);
+#endif
+		__m256 vsrc1, vsrc2;
+		vsrc1 = _mm256_load_ps(src + 0);
+		vsrc2 = _mm256_load_ps(src + 8);
+
+		vsrc1 = _mm256_andnot_ps(ABS_MASK, vsrc1);
+		vsrc2 = _mm256_andnot_ps(ABS_MASK, vsrc2);
+
+		vcurrent = _mm256_max_ps(vcurrent, vsrc1);
+		vcurrent = _mm256_max_ps(vcurrent, vsrc2);
+
+		src += 16;
+		nframes -= 16;
+	}
+
+	// Process the remaining samples 8 at a time
+	while (nframes >= 8) {
+		__m256 vsrc;
+
+		vsrc = _mm256_load_ps(src);
+		vsrc = _mm256_andnot_ps(ABS_MASK, vsrc);
+		vcurrent = _mm256_max_ps(vcurrent, vsrc);
+
+		src += 8;
+		nframes -= 8;
+	}
+
+	// Process any remaining samples (fewer than 8), one at a time
+	while (nframes > 0) {
+		__m256 vsrc;
+
+		vsrc = _mm256_setzero_ps();
+		vsrc = _mm256_castps128_ps256(_mm_load_ss(src));
+		vsrc = _mm256_andnot_ps(ABS_MASK, vsrc);
+		vcurrent = _mm256_max_ps(vcurrent, vsrc);
+
+		++src;
+		--nframes;
+	}
+
+	// Get the current max from YMM register
+	vcurrent = avx_getmax_ps(vcurrent);
+
+	// zero upper 128 bit of 256 bit ymm register to avoid penalties using non-AVX instructions
+	_mm256_zeroupper();
+
+	return _mm256_cvtss_f32(vcurrent);
 }
 
-void
-x86_sse_avx_apply_gain_to_buffer (float * buf, uint32_t nframes, float gain)
+/**
+ * @brief x86-64 AVX optimized routine for find peak procedure
+ * @param src Pointer to source buffer
+ * @param nframes Number of frames to process
+ * @param[in,out] minf Current minimum value, updated
+ * @param[in,out] maxf Current maximum value, updated
+ */
+C_FUNC void
+x86_sse_avx_find_peaks(const float *src, uint32_t nframes, float *minf, float *maxf)
 {
-	default_apply_gain_to_buffer (buf, nframes, gain);
+	// Broadcast the current min and max values to all elements of the YMM register
+	__m256 vmin = _mm256_broadcast_ss(minf);
+	__m256 vmax = _mm256_broadcast_ss(maxf);
+
+	// Compute single min/max of unaligned portion until alignment is reached
+	while ((((intptr_t)src) % 32 != 0) && nframes > 0) {
+		__m256 vsrc;
+
+		vsrc = _mm256_broadcast_ss(src);
+		vmax = _mm256_max_ps(vmax, vsrc);
+		vmin = _mm256_min_ps(vmin, vsrc);
+
+		++src;
+		--nframes;
+	}
+
+	// Process the remaining samples 16
at a time + while (nframes >= 16) + { +#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW) + _mm_prefetch(((char *)src + 64), _mm_hint(0)); +#else + __builtin_prefetch(src + 64, 0, 0); +#endif + + __m256 vsrc1, vsrc2; + vsrc1 = _mm256_load_ps(src + 0); + vsrc2 = _mm256_load_ps(src + 8); + + vmax = _mm256_max_ps(vmax, vsrc1); + vmin = _mm256_min_ps(vmin, vsrc1); + + vmax = _mm256_max_ps(vmax, vsrc2); + vmin = _mm256_min_ps(vmin, vsrc2); + + src += 16; + nframes -= 16; + } + + // Process the remaining samples 8 at a time + while (nframes >= 8) { + __m256 vsrc; + + vsrc = _mm256_load_ps(src); + vmax = _mm256_max_ps(vmax, vsrc); + vmin = _mm256_min_ps(vmin, vsrc); + + src += 8; + nframes -= 8; + } + + // If there are still some left 4 to 8 samples, process them one at a time. + while (nframes > 0) { + __m256 vsrc; + + vsrc = _mm256_broadcast_ss(src); + vmax = _mm256_max_ps(vmax, vsrc); + vmin = _mm256_min_ps(vmin, vsrc); + + ++src; + --nframes; + } + + // Get min and max of the YMM registers + vmin = avx_getmin_ps(vmin); + vmax = avx_getmax_ps(vmax); + + // There's a penalty going away from AVX mode to SSE mode. This can + // be avoided by ensuring to the CPU that rest of the routine is no + // longer interested in the upper portion of the YMM register. + + // zero upper 128 bit of 256 bit ymm register to avoid penalties using non-AVX instructions + _mm256_zeroupper(); + + _mm_store_ss(minf, _mm256_castps256_ps128(vmin)); + _mm_store_ss(maxf, _mm256_castps256_ps128(vmax)); } -void -x86_sse_avx_mix_buffers_with_gain (float * dst, const float * src, uint32_t nframes, float gain) +/** + * @brief x86-64 AVX optimized routine for apply gain routine + * @param[in,out] dst Pointer to the destination buffer, which gets updated + * @param nframes Number of frames (or samples) to process + * @param gain Gain to apply + */ +C_FUNC void +x86_sse_avx_apply_gain_to_buffer(float *dst, uint32_t nframes, float gain) { - default_mix_buffers_with_gain (dst, src, nframes, gain); + // Load gain vector to all elements of YMM register + __m256 vgain = _mm256_set1_ps(gain); + + if (nframes) { + __m128 g0 = _mm256_castps256_ps128(vgain); + // Here comes the horror, poor-man's loop unrolling + switch (((intptr_t)dst) % 32) { + case 0: + default: + // Buffer is aligned, skip to the next section of aligned + break; + case 4: + _mm_store_ss(dst, _mm_mul_ss(g0, _mm_load_ss(dst))); + ++dst; + --nframes; + case 8: + _mm_store_ss(dst, _mm_mul_ss(g0, _mm_load_ss(dst))); + ++dst; + --nframes; + case 12: + _mm_store_ss(dst, _mm_mul_ss(g0, _mm_load_ss(dst))); + ++dst; + --nframes; + case 16: + // This is a special case where pointer is 16 byte aligned + // for a XMM load/store operation. 
+ _mm_store_ps(dst, _mm_mul_ps(g0, _mm_load_ps(dst))); + dst += 4; + nframes -= 4; + break; + case 20: + _mm_store_ss(dst, _mm_mul_ss(g0, _mm_load_ss(dst))); + ++dst; + --nframes; + case 24: + _mm_store_ss(dst, _mm_mul_ss(g0, _mm_load_ss(dst))); + ++dst; + --nframes; + case 28: + _mm_store_ss(dst, _mm_mul_ss(g0, _mm_load_ss(dst))); + ++dst; + --nframes; + } + } else { + return; + } + + // Process the remaining samples 16 at a time + while (nframes >= 16) + { +#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW) + _mm_prefetch(((char *)dst + (16 * sizeof(float))), _mm_hint(0)); +#else + __builtin_prefetch(dst + (16 * sizeof(float)), 0, 0); +#endif + __m256 d0, d1; + d0 = _mm256_load_ps(dst + 0 ); + d1 = _mm256_load_ps(dst + 8 ); + + d0 = _mm256_mul_ps(vgain, d0); + d1 = _mm256_mul_ps(vgain, d1); + + _mm256_store_ps(dst + 0 , d0); + _mm256_store_ps(dst + 8 , d1); + + dst += 16; + nframes -= 16; + } + + // Process the remaining samples 8 at a time + while (nframes >= 8) { + _mm256_store_ps(dst, _mm256_mul_ps(vgain, _mm256_load_ps(dst))); + dst += 8; + nframes -= 8; + } + + + // There's a penalty going away from AVX mode to SSE mode. This can + // be avoided by ensuring to the CPU that rest of the routine is no + // longer interested in the upper portion of the YMM register. + + _mm256_zeroupper(); // zeros the upper portion of YMM register + + // Process the remaining samples + do { + __m128 g0 = _mm256_castps256_ps128(vgain); + while (nframes > 0) { + _mm_store_ss(dst, _mm_mul_ss(g0, _mm_load_ss(dst))); + ++dst; + --nframes; + } + } while (0); } -void -x86_sse_avx_mix_buffers_no_gain (float * dst, const float * src, uint32_t nframes) +/** + * @brief x86-64 AVX optimized routine for mixing buffer with gain. + * + * This function may choose SSE over AVX if the pointers are aligned + * to 16 byte boundary instead of 32 byte boundary to reduce time to + * process. + * + * @param[in,out] dst Pointer to destination buffer, which gets updated + * @param[in] src Pointer to source buffer (not updated) + * @param nframes Number of samples to process + * @param gain Gain to apply + */ +C_FUNC void +x86_sse_avx_mix_buffers_with_gain(float *dst, const float *src, uint32_t nframes, float gain) { - default_mix_buffers_no_gain (dst, src, nframes); + if (IS_ALIGNED_TO(dst, 32) && IS_ALIGNED_TO(src, 32)) { + // Pointers are both aligned to 32 bit boundaries, this can be processed with AVX + x86_sse_avx_mix_buffers_with_gain_aligned(dst, src, nframes, gain); + } else if (IS_ALIGNED_TO(dst, 16) && IS_ALIGNED_TO(src, 16)) { + // This can still be processed with SSE + x86_sse_mix_buffers_with_gain(dst, src, nframes, gain); + } else { + // Pointers are unaligned, so process them with unaligned load/store AVX + x86_sse_avx_mix_buffers_with_gain_unaligned(dst, src, nframes, gain); + } } -void -x86_sse_avx_copy_vector (float * dst, const float * src, uint32_t nframes) +/** + * @brief x86-64 AVX optimized routine for mixing buffer with no gain. + * + * This function may choose SSE over AVX if the pointers are aligned + * to 16 byte boundary instead of 32 byte boundary to reduce time to + * process. 
+ * + * @param[in,out] dst Pointer to destination buffer, which gets updated + * @param[in] src Pointer to source buffer (not updated) + * @param nframes Number of samples to process + */ +C_FUNC void +x86_sse_avx_mix_buffers_no_gain(float *dst, const float *src, uint32_t nframes) { - default_copy_vector (dst, src, nframes); + if (IS_ALIGNED_TO(dst, 32) && IS_ALIGNED_TO(src, 32)) { + // Pointers are both aligned to 32 bit boundaries, this can be processed with AVX + x86_sse_avx_mix_buffers_no_gain_aligned(dst, src, nframes); + } else if (IS_ALIGNED_TO(dst, 16) && IS_ALIGNED_TO(src, 16)) { + // This can still be processed with SSE + x86_sse_mix_buffers_no_gain(dst, src, nframes); + } else { + // Pointers are unaligned, so process them with unaligned load/store AVX + x86_sse_avx_mix_buffers_no_gain_unaligned(dst, src, nframes); + } } -void -x86_sse_avx_find_peaks (const float * buf, uint32_t nsamples, float *min, float *max) +/** + * @brief Copy vector from one location to another + * + * This has not been hand optimized for AVX with the rationale that standard + * C library implementation will provide faster memory copy operation. It will + * be redundant to implement memcpy for floats. + * + * @param[out] dst Pointer to destination buffer + * @param[in] src Pointer to source buffer + * @param nframes Number of samples to copy + */ +C_FUNC void +x86_sse_avx_copy_vector(float *dst, const float *src, uint32_t nframes) { - default_find_peaks (buf, nsamples, min, max); + (void) memcpy(dst, src, nframes * sizeof(float)); +} + +/** + * Local helper functions + */ + +/** + * @brief Helper routine for mixing buffers with gain for unaligned buffers + * + * @details This routine executes the following expression below per element: + * + * dst = dst + (gain * src) + * + * @param[in,out] dst Pointer to destination buffer, which gets updated + * @param[in] src Pointer to source buffer (not updated) + * @param nframes Number of samples to process + * @param gain Gain to apply + */ +static void +x86_sse_avx_mix_buffers_with_gain_unaligned(float *dst, const float *src, uint32_t nframes, float gain) +{ + // Load gain vector to all elements of YMM register + __m256 vgain = _mm256_set1_ps(gain); + + // Process the remaining samples 16 at a time + while (nframes >= 16) + { +#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW) + _mm_prefetch(((char *)dst + (16 * sizeof(float))), _mm_hint(0)); + _mm_prefetch(((char *)src + (16 * sizeof(float))), _mm_hint(0)); +#else + __builtin_prefetch(src + (16 * sizeof(float)), 0, 0); + __builtin_prefetch(dst + (16 * sizeof(float)), 0, 0); +#endif + __m256 s0, s1; + __m256 d0, d1; + + // Load sources + s0 = _mm256_loadu_ps(src + 0); + s1 = _mm256_loadu_ps(src + 8); + + // Load destinations + d0 = _mm256_loadu_ps(dst + 0); + d1 = _mm256_loadu_ps(dst + 8); + + // src = src * gain + s0 = _mm256_mul_ps(vgain, s0); + s1 = _mm256_mul_ps(vgain, s1); + + // dst = dst + src + d0 = _mm256_add_ps(d0, s0); + d1 = _mm256_add_ps(d1, s1); + + // Store result + _mm256_storeu_ps(dst + 0, d0); + _mm256_storeu_ps(dst + 8, d1); + + // Update pointers and counters + src += 16; + dst += 16; + nframes -= 16; + } + + // Process the remaining samples 8 at a time + while (nframes >= 8) { + __m256 s0, d0; + // Load sources + s0 = _mm256_loadu_ps(src); + // Load destinations + d0 = _mm256_loadu_ps(dst); + // src = src * gain + s0 = _mm256_mul_ps(vgain, s0); + // dst = dst + src + d0 = _mm256_add_ps(d0, s0); + // Store result + _mm256_storeu_ps(dst, d0); + // Update pointers and counters + src+= 8; + 
dst += 8; + nframes -= 8; + } + + + // There's a penalty going away from AVX mode to SSE mode. This can + // be avoided by ensuring the CPU that rest of the routine is no + // longer interested in the upper portion of the YMM register. + + _mm256_zeroupper(); // zeros the upper portion of YMM register + + // Process the remaining samples + do { + __m128 g0 = _mm_set_ss(gain); + while (nframes > 0) { + __m128 s0, d0; + s0 = _mm_load_ss(src); + d0 = _mm_load_ss(dst); + s0 = _mm_mul_ss(g0, s0); + d0 = _mm_add_ss(d0, s0); + _mm_store_ss(dst, d0); + ++src; + ++dst; + --nframes; + } + } while (0); +} + +/** + * @brief Helper routine for mixing buffers with gain for aligned buffers + * + * @details This routine executes the following expression below per element: + * + * dst = dst + (gain * src) + * + * @param[in,out] dst Pointer to destination buffer, which gets updated + * @param[in] src Pointer to source buffer (not updated) + * @param nframes Number of samples to process + * @param gain Gain to apply + */ +static void +x86_sse_avx_mix_buffers_with_gain_aligned(float *dst, const float *src, uint32_t nframes, float gain) +{ + // Load gain vector to all elements of YMM register + __m256 vgain = _mm256_set1_ps(gain); + + // Process the remaining samples 16 at a time + while (nframes >= 16) + { +#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW) + _mm_prefetch(((char *)dst + (16 * sizeof(float))), _mm_hint(0)); + _mm_prefetch(((char *)src + (16 * sizeof(float))), _mm_hint(0)); +#else + __builtin_prefetch(src + (16 * sizeof(float)), 0, 0); + __builtin_prefetch(dst + (16 * sizeof(float)), 0, 0); +#endif + __m256 s0, s1; + __m256 d0, d1; + + // Load sources + s0 = _mm256_load_ps(src + 0); + s1 = _mm256_load_ps(src + 8); + + // Load destinations + d0 = _mm256_load_ps(dst + 0); + d1 = _mm256_load_ps(dst + 8); + + // src = src * gain + s0 = _mm256_mul_ps(vgain, s0); + s1 = _mm256_mul_ps(vgain, s1); + + // dst = dst + src + d0 = _mm256_add_ps(d0, s0); + d1 = _mm256_add_ps(d1, s1); + + // Store result + _mm256_store_ps(dst + 0, d0); + _mm256_store_ps(dst + 8, d1); + + // Update pointers and counters + src += 16; + dst += 16; + nframes -= 16; + } + + // Process the remaining samples 8 at a time + while (nframes >= 8) { + __m256 s0, d0; + // Load sources + s0 = _mm256_load_ps(src + 0 ); + // Load destinations + d0 = _mm256_load_ps(dst + 0 ); + // src = src * gain + s0 = _mm256_mul_ps(vgain, s0); + // dst = dst + src + d0 = _mm256_add_ps(d0, s0); + // Store result + _mm256_store_ps(dst, d0); + // Update pointers and counters + src += 8; + dst += 8; + nframes -= 8; + } + + + // There's a penalty going from AVX mode to SSE mode. This can + // be avoided by ensuring the CPU that rest of the routine is no + // longer interested in the upper portion of the YMM register. + + _mm256_zeroupper(); // zeros the upper portion of YMM register + + // Process the remaining samples, one sample at a time. 
+ do { + __m128 g0 = _mm256_castps256_ps128(vgain); // use the same register + while (nframes > 0) { + __m128 s0, d0; + s0 = _mm_load_ss(src); + d0 = _mm_load_ss(dst); + s0 = _mm_mul_ss(g0, s0); + d0 = _mm_add_ss(d0, s0); + _mm_store_ss(dst, d0); + ++src; + ++dst; + --nframes; + } + } while (0); +} + +/** + * @brief Helper routine for mixing buffers with no gain for aligned buffers + * + * @details This routine executes the following expression below per element: + * + * dst = dst + src + * + * @param[in,out] dst Pointer to destination buffer, which gets updated + * @param[in] src Pointer to source buffer (not updated) + * @param nframes Number of samples to process + */ +static void +x86_sse_avx_mix_buffers_no_gain_unaligned(float *dst, const float *src, uint32_t nframes) +{ + // Process the remaining samples 16 at a time + while (nframes >= 16) + { +#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW) + _mm_prefetch(((char *)dst + (16 * sizeof(float))), _mm_hint(0)); + _mm_prefetch(((char *)src + (16 * sizeof(float))), _mm_hint(0)); +#else + __builtin_prefetch(src + (16 * sizeof(float)), 0, 0); + __builtin_prefetch(dst + (16 * sizeof(float)), 0, 0); +#endif + __m256 s0, s1; + __m256 d0, d1; + + // Load sources + s0 = _mm256_loadu_ps(src + 0); + s1 = _mm256_loadu_ps(src + 8); + + // Load destinations + d0 = _mm256_loadu_ps(dst + 0); + d1 = _mm256_loadu_ps(dst + 8); + + // dst = dst + src + d0 = _mm256_add_ps(d0, s0); + d1 = _mm256_add_ps(d1, s1); + + // Store result + _mm256_storeu_ps(dst + 0, d0); + _mm256_storeu_ps(dst + 8, d1); + + // Update pointers and counters + src += 16; + dst += 16; + nframes -= 16; + } + + // Process the remaining samples 8 at a time + while (nframes >= 8) { + __m256 s0, d0; + // Load sources + s0 = _mm256_loadu_ps(src); + // Load destinations + d0 = _mm256_loadu_ps(dst); + // dst = dst + src + d0 = _mm256_add_ps(d0, s0); + // Store result + _mm256_storeu_ps(dst, d0); + // Update pointers and counters + src+= 8; + dst += 8; + nframes -= 8; + } + + // There's a penalty going away from AVX mode to SSE mode. This can + // be avoided by ensuring the CPU that rest of the routine is no + // longer interested in the upper portion of the YMM register. 
+ + _mm256_zeroupper(); // zeros the upper portion of YMM register + + // Process the remaining samples + do { + while (nframes > 0) { + __m128 s0, d0; + s0 = _mm_load_ss(src); + d0 = _mm_load_ss(dst); + d0 = _mm_add_ss(d0, s0); + _mm_store_ss(dst, d0); + ++src; + ++dst; + --nframes; + } + } while (0); + +} + +/** + * @brief Helper routine for mixing buffers with no gain for unaligned buffers + * + * @details This routine executes the following expression below per element: + * + * dst = dst + src + * + * @param[in,out] dst Pointer to destination buffer, which gets updated + * @param[in] src Pointer to source buffer (not updated) + * @param nframes Number of samples to process + */ +static void +x86_sse_avx_mix_buffers_no_gain_aligned(float *dst, const float *src, uint32_t nframes) +{ + // Process the aligned portion 32 samples at a time + while (nframes >= 32) + { +#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW) + _mm_prefetch(((char *)dst + (32 * sizeof(float))), _mm_hint(0)); + _mm_prefetch(((char *)src + (32 * sizeof(float))), _mm_hint(0)); +#else + __builtin_prefetch(src + (32 * sizeof(float)), 0, 0); + __builtin_prefetch(dst + (32 * sizeof(float)), 0, 0); +#endif + __m256 s0, s1, s2, s3; + __m256 d0, d1, d2, d3; + + // Load sources + s0 = _mm256_load_ps(src + 0 ); + s1 = _mm256_load_ps(src + 8 ); + s2 = _mm256_load_ps(src + 16); + s3 = _mm256_load_ps(src + 24); + + // Load destinations + d0 = _mm256_load_ps(dst + 0 ); + d1 = _mm256_load_ps(dst + 8 ); + d2 = _mm256_load_ps(dst + 16); + d3 = _mm256_load_ps(dst + 24); + + // dst = dst + src + d0 = _mm256_add_ps(d0, s0); + d1 = _mm256_add_ps(d1, s1); + d2 = _mm256_add_ps(d2, s2); + d3 = _mm256_add_ps(d3, s3); + + // Store result + _mm256_store_ps(dst + 0 , d0); + _mm256_store_ps(dst + 8 , d1); + _mm256_store_ps(dst + 16, d2); + _mm256_store_ps(dst + 24, d3); + + // Update pointers and counters + src += 32; + dst += 32; + nframes -= 32; + } + + // Process the remaining samples 16 at a time + while (nframes >= 16) + { +#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW) + _mm_prefetch(((char *)dst + (16 * sizeof(float))), _mm_hint(0)); + _mm_prefetch(((char *)src + (16 * sizeof(float))), _mm_hint(0)); +#else + __builtin_prefetch(src + (16 * sizeof(float)), 0, 0); + __builtin_prefetch(dst + (16 * sizeof(float)), 0, 0); +#endif + __m256 s0, s1; + __m256 d0, d1; + + // Load sources + s0 = _mm256_load_ps(src + 0); + s1 = _mm256_load_ps(src + 8); + + // Load destinations + d0 = _mm256_load_ps(dst + 0); + d1 = _mm256_load_ps(dst + 8); + + // dst = dst + src + d0 = _mm256_add_ps(d0, s0); + d1 = _mm256_add_ps(d1, s1); + + // Store result + _mm256_store_ps(dst + 0, d0); + _mm256_store_ps(dst + 8, d1); + + // Update pointers and counters + src += 16; + dst += 16; + nframes -= 16; + } + + // Process the remaining samples 8 at a time + while (nframes >= 8) { + __m256 s0, d0; + // Load sources + s0 = _mm256_load_ps(src + 0 ); + // Load destinations + d0 = _mm256_load_ps(dst + 0 ); + // dst = dst + src + d0 = _mm256_add_ps(d0, s0); + // Store result + _mm256_store_ps(dst, d0); + // Update pointers and counters + src += 8; + dst += 8; + nframes -= 8; + } + + // There's a penalty going from AVX mode to SSE mode. This can + // be avoided by ensuring the CPU that rest of the routine is no + // longer interested in the upper portion of the YMM register. 
+ + _mm256_zeroupper(); // zeros the upper portion of YMM register + + // Process the remaining samples + do { + while (nframes > 0) { + __m128 s0, d0; + s0 = _mm_load_ss(src); + d0 = _mm_load_ss(dst); + d0 = _mm_add_ss(d0, s0); + _mm_store_ss(dst, d0); + ++src; + ++dst; + --nframes; + } + } while (0); +} + +/** + * @brief Get the maximum value of packed float register + * @param vmax Packed float 8x register + * @return __m256 Maximum value in p[0] + */ +static inline __m256 avx_getmax_ps(__m256 vmax) +{ + __m256 tmp; + tmp = _mm256_shuffle_ps(vmax, vmax, _MM_SHUFFLE(2, 3, 0, 1)); + vmax = _mm256_max_ps(tmp, vmax); + tmp = _mm256_shuffle_ps(vmax, vmax, _MM_SHUFFLE(1, 0, 3, 2)); + vmax = _mm256_max_ps(tmp, vmax); + tmp = _mm256_permute2f128_ps(vmax, vmax, 1); + vmax = _mm256_max_ps(tmp, vmax); + return vmax; +} + +/** + * @brief Get the minimum value of packed float register + * @param vmax Packed float 8x register + * @return __m256 Minimum value in p[0] + */ +static inline __m256 avx_getmin_ps(__m256 vmin) +{ + __m256 tmp; + tmp = _mm256_shuffle_ps(vmin, vmin, _MM_SHUFFLE(2, 3, 0, 1)); + vmin = _mm256_min_ps(tmp, vmin); + tmp = _mm256_shuffle_ps(vmin, vmin, _MM_SHUFFLE(1, 0, 3, 2)); + vmin = _mm256_min_ps(tmp, vmin); + tmp = _mm256_permute2f128_ps(vmin, vmin, 1); + vmin = _mm256_min_ps(tmp, vmin); + return vmin; }
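Note for reviewers (not part of the patch): the commit message above recommends allocating audio buffers on 32 byte boundaries so that the fully aligned AVX paths are taken. A minimal sketch of such an allocation, assuming only POSIX and the declarations that ardour/mix.h already exports; the helper below (example_mix) is illustrative and not part of Ardour:

	#include <stdint.h>      /* uint32_t */
	#include <stdlib.h>      /* posix_memalign, free */
	#include <string.h>      /* memset */

	#include "ardour/mix.h"  /* x86_sse_avx_mix_buffers_with_gain */

	/* Allocate two 32-byte-aligned buffers and mix one into the other,
	 * so the dispatcher selects the fully aligned AVX helper. */
	static void
	example_mix (uint32_t nframes)
	{
		void *dst = NULL;
		void *src = NULL;

		if (posix_memalign (&dst, 32, nframes * sizeof (float)) != 0) {
			return;
		}
		if (posix_memalign (&src, 32, nframes * sizeof (float)) != 0) {
			free (dst);
			return;
		}

		/* fill with real audio data in practice */
		memset (dst, 0, nframes * sizeof (float));
		memset (src, 0, nframes * sizeof (float));

		x86_sse_avx_mix_buffers_with_gain ((float *) dst, (float *) src, nframes, 0.5f);

		free (dst);
		free (src);
	}

Buffers that are only 16 byte aligned still work; as described in the dispatcher comments above, they simply fall back to the existing x86_sse_mix_buffers_with_gain path.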
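Note for reviewers (not part of the patch): a quick scalar cross-check can be useful when testing the new compute_peak path. The helper below is hypothetical and assumes only the x86_sse_avx_compute_peak declaration from ardour/mix.h:

	#include <assert.h>
	#include <math.h>
	#include <stdint.h>

	#include "ardour/mix.h"  /* x86_sse_avx_compute_peak */

	/* Compare the AVX peak against a plain scalar loop over |buf[i]|. */
	static void
	check_compute_peak (const float *buf, uint32_t nframes)
	{
		float scalar = 0.0f;

		for (uint32_t i = 0; i < nframes; ++i) {
			const float a = fabsf (buf[i]);
			if (a > scalar) {
				scalar = a;
			}
		}

		/* Both versions apply the same max-of-absolute-value operation,
		 * so the results should agree to within float equality. */
		assert (fabsf (x86_sse_avx_compute_peak (buf, nframes, 0.0f) - scalar) <= 1e-7f);
	}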