Adding AVX optimized routines for Linux
This commit adds AVX optimized routines for the following procedures below: *_compute_peak *_find_peaks *_apply_gain_to_buffer *_mix_buffers_with_gain *_mix_buffers_no_gain AVX optimized routine has the prefix of: x86_sse_avx_ Note: mix_buffer_with_gain and mix_buffers_no_gain may prefer SSE implementaion over AVX if source and destination pointers are aligned to 16 byte boundaries. Therefore, it will be optimal if _all_ audio buffers are allocated to 32 byte boundaries to take full advantage of AVX ISA extension.
This commit is contained in:
parent
0ddaf3fe68
commit
03abc1076e
|
@ -40,10 +40,10 @@ extern "C" {
|
|||
LIBARDOUR_API void x86_sse_avx_mix_buffers_with_gain(float * dst, const float * src, uint32_t nframes, float gain);
|
||||
LIBARDOUR_API void x86_sse_avx_mix_buffers_no_gain (float * dst, const float * src, uint32_t nframes);
|
||||
LIBARDOUR_API void x86_sse_avx_copy_vector (float * dst, const float * src, uint32_t nframes);
|
||||
LIBARDOUR_API void x86_sse_avx_find_peaks (const float * buf, uint32_t nsamples, float *min, float *max);
|
||||
}
|
||||
|
||||
LIBARDOUR_API void x86_sse_find_peaks (const float * buf, uint32_t nsamples, float *min, float *max);
|
||||
LIBARDOUR_API void x86_sse_avx_find_peaks (const float * buf, uint32_t nsamples, float *min, float *max);
|
||||
|
||||
/* debug wrappers for SSE functions */
|
||||
|
||||
|
|
|
@ -189,14 +189,8 @@ setup_hardware_optimization (bool try_optimization)
|
|||
|
||||
#if defined(ARCH_X86) && defined(BUILD_SSE_OPTIMIZATIONS)
|
||||
|
||||
#ifdef PLATFORM_WINDOWS
|
||||
/* We have AVX-optimized code for Windows */
|
||||
if (fpu->has_avx ())
|
||||
#else
|
||||
/* AVX code doesn't compile on Linux yet */
|
||||
if (false)
|
||||
#endif
|
||||
{
|
||||
/* We have AVX-optimized code for Windows and Linux */
|
||||
if (fpu->has_avx ()) {
|
||||
info << "Using AVX optimized routines" << endmsg;
|
||||
|
||||
// AVX SET
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/*
|
||||
* Copyright (C) 2015 Paul Davis <paul@linuxaudiosystems.com>
|
||||
* Copyright (C) 2020 Ayan Shafqat <ayan.x.shafqat@gmail.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
|
@ -18,38 +19,840 @@
|
|||
|
||||
#include "ardour/mix.h"
|
||||
|
||||
float
|
||||
x86_sse_avx_compute_peak (const float * buf, uint32_t nsamples, float current)
|
||||
#include <immintrin.h>
|
||||
#include <xmmintrin.h>
|
||||
|
||||
#ifndef __AVX__
|
||||
#error "__AVX__ must be enabled for this module to work"
|
||||
#endif
|
||||
|
||||
#define IS_ALIGNED_TO(ptr, bytes) (((uintptr_t)ptr) % (bytes) == 0)
|
||||
|
||||
#ifdef __cplusplus
|
||||
#define C_FUNC extern "C"
|
||||
#else
|
||||
#define C_FUNC
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Local functions
|
||||
*/
|
||||
|
||||
static inline __m256 avx_getmax_ps(__m256 vmax);
|
||||
static inline __m256 avx_getmin_ps(__m256 vmin);
|
||||
|
||||
static void
|
||||
x86_sse_avx_mix_buffers_with_gain_unaligned(float *dst, const float *src, uint32_t nframes, float gain);
|
||||
|
||||
static void
|
||||
x86_sse_avx_mix_buffers_with_gain_aligned(float *dst, const float *src, uint32_t nframes, float gain);
|
||||
|
||||
static void
|
||||
x86_sse_avx_mix_buffers_no_gain_unaligned(float *dst, const float *src, uint32_t nframes);
|
||||
|
||||
static void
|
||||
x86_sse_avx_mix_buffers_no_gain_aligned(float *dst, const float *src, uint32_t nframes);
|
||||
|
||||
/**
|
||||
* Module implementation
|
||||
*/
|
||||
|
||||
/**
|
||||
* @brief x86-64 AVX optimized routine for compute peak procedure
|
||||
* @param src Pointer to source buffer
|
||||
* @param nframes Number of frames to process
|
||||
* @param current Current peak value
|
||||
* @return float New peak value
|
||||
*/
|
||||
C_FUNC float
|
||||
x86_sse_avx_compute_peak(const float *src, uint32_t nframes, float current)
|
||||
{
|
||||
return default_compute_peak (buf, nsamples, current);
|
||||
const __m256 ABS_MASK = _mm256_set1_ps(-0.0F);
|
||||
|
||||
// Broadcast the current max value to all elements of the YMM register
|
||||
__m256 vcurrent = _mm256_broadcast_ss(¤t);
|
||||
|
||||
// Compute single min/max of unaligned portion until alignment is reached
|
||||
while ((((intptr_t)src) % 32 != 0) && nframes > 0) {
|
||||
__m256 vsrc;
|
||||
|
||||
vsrc = _mm256_setzero_ps();
|
||||
vsrc = _mm256_castps128_ps256(_mm_load_ss(src));
|
||||
vsrc = _mm256_andnot_ps(ABS_MASK, vsrc);
|
||||
vcurrent = _mm256_max_ps(vcurrent, vsrc);
|
||||
|
||||
++src;
|
||||
--nframes;
|
||||
}
|
||||
|
||||
// Process the aligned portion 16 samples at a time
|
||||
while (nframes >= 16) {
|
||||
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
|
||||
_mm_prefetch(((char *)src + (16 * sizeof(float))), _mm_hint(0));
|
||||
#else
|
||||
__builtin_prefetch(src + (16 * sizeof(float)), 0, 0);
|
||||
#endif
|
||||
__m256 vsrc1, vsrc2;
|
||||
vsrc1 = _mm256_load_ps(src + 0);
|
||||
vsrc2 = _mm256_load_ps(src + 8);
|
||||
|
||||
vsrc1 = _mm256_andnot_ps(ABS_MASK, vsrc1);
|
||||
vsrc2 = _mm256_andnot_ps(ABS_MASK, vsrc2);
|
||||
|
||||
vcurrent = _mm256_max_ps(vcurrent, vsrc1);
|
||||
vcurrent = _mm256_max_ps(vcurrent, vsrc2);
|
||||
|
||||
src += 16;
|
||||
nframes -= 16;
|
||||
}
|
||||
|
||||
// Process the remaining samples 8 at a time
|
||||
while (nframes >= 8) {
|
||||
__m256 vsrc;
|
||||
|
||||
vsrc = _mm256_load_ps(src);
|
||||
vsrc = _mm256_andnot_ps(ABS_MASK, vsrc);
|
||||
vcurrent = _mm256_max_ps(vcurrent, vsrc);
|
||||
|
||||
src += 8;
|
||||
nframes -= 8;
|
||||
}
|
||||
|
||||
// If there are still some left 4 to 8 samples, process them below
|
||||
while (nframes > 0) {
|
||||
__m256 vsrc;
|
||||
|
||||
vsrc = _mm256_setzero_ps();
|
||||
vsrc = _mm256_castps128_ps256(_mm_load_ss(src));
|
||||
vsrc = _mm256_andnot_ps(ABS_MASK, vsrc);
|
||||
vcurrent = _mm256_max_ps(vcurrent, vsrc);
|
||||
|
||||
++src;
|
||||
--nframes;
|
||||
}
|
||||
|
||||
// Get the current max from YMM register
|
||||
vcurrent = avx_getmax_ps(vcurrent);
|
||||
|
||||
// zero upper 128 bit of 256 bit ymm register to avoid penalties using non-AVX instructions
|
||||
_mm256_zeroupper();
|
||||
|
||||
return _mm256_cvtss_f32(vcurrent);
|
||||
}
|
||||
|
||||
void
|
||||
x86_sse_avx_apply_gain_to_buffer (float * buf, uint32_t nframes, float gain)
|
||||
/**
|
||||
* @brief x86-64 AVX optimized routine for find peak procedure
|
||||
* @param src Pointer to source buffer
|
||||
* @param nframes Number of frames to process
|
||||
* @param[in,out] minf Current minimum value, updated
|
||||
* @param[in,out] maxf Current maximum value, updated
|
||||
*/
|
||||
C_FUNC void
|
||||
x86_sse_avx_find_peaks(const float *src, uint32_t nframes, float *minf, float *maxf)
|
||||
{
|
||||
default_apply_gain_to_buffer (buf, nframes, gain);
|
||||
// Broadcast the current min and max values to all elements of the YMM register
|
||||
__m256 vmin = _mm256_broadcast_ss(minf);
|
||||
__m256 vmax = _mm256_broadcast_ss(maxf);
|
||||
|
||||
// Compute single min/max of unaligned portion until alignment is reached
|
||||
while ((((intptr_t)src) % 32 != 0) && nframes > 0) {
|
||||
__m256 vsrc;
|
||||
|
||||
vsrc = _mm256_broadcast_ss(src);
|
||||
vmax = _mm256_max_ps(vmax, vsrc);
|
||||
vmin = _mm256_min_ps(vmin, vsrc);
|
||||
|
||||
++src;
|
||||
--nframes;
|
||||
}
|
||||
|
||||
// Process the remaining samples 16 at a time
|
||||
while (nframes >= 16)
|
||||
{
|
||||
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
|
||||
_mm_prefetch(((char *)src + 64), _mm_hint(0));
|
||||
#else
|
||||
__builtin_prefetch(src + 64, 0, 0);
|
||||
#endif
|
||||
|
||||
__m256 vsrc1, vsrc2;
|
||||
vsrc1 = _mm256_load_ps(src + 0);
|
||||
vsrc2 = _mm256_load_ps(src + 8);
|
||||
|
||||
vmax = _mm256_max_ps(vmax, vsrc1);
|
||||
vmin = _mm256_min_ps(vmin, vsrc1);
|
||||
|
||||
vmax = _mm256_max_ps(vmax, vsrc2);
|
||||
vmin = _mm256_min_ps(vmin, vsrc2);
|
||||
|
||||
src += 16;
|
||||
nframes -= 16;
|
||||
}
|
||||
|
||||
// Process the remaining samples 8 at a time
|
||||
while (nframes >= 8) {
|
||||
__m256 vsrc;
|
||||
|
||||
vsrc = _mm256_load_ps(src);
|
||||
vmax = _mm256_max_ps(vmax, vsrc);
|
||||
vmin = _mm256_min_ps(vmin, vsrc);
|
||||
|
||||
src += 8;
|
||||
nframes -= 8;
|
||||
}
|
||||
|
||||
// If there are still some left 4 to 8 samples, process them one at a time.
|
||||
while (nframes > 0) {
|
||||
__m256 vsrc;
|
||||
|
||||
vsrc = _mm256_broadcast_ss(src);
|
||||
vmax = _mm256_max_ps(vmax, vsrc);
|
||||
vmin = _mm256_min_ps(vmin, vsrc);
|
||||
|
||||
++src;
|
||||
--nframes;
|
||||
}
|
||||
|
||||
// Get min and max of the YMM registers
|
||||
vmin = avx_getmin_ps(vmin);
|
||||
vmax = avx_getmax_ps(vmax);
|
||||
|
||||
// There's a penalty going away from AVX mode to SSE mode. This can
|
||||
// be avoided by ensuring to the CPU that rest of the routine is no
|
||||
// longer interested in the upper portion of the YMM register.
|
||||
|
||||
// zero upper 128 bit of 256 bit ymm register to avoid penalties using non-AVX instructions
|
||||
_mm256_zeroupper();
|
||||
|
||||
_mm_store_ss(minf, _mm256_castps256_ps128(vmin));
|
||||
_mm_store_ss(maxf, _mm256_castps256_ps128(vmax));
|
||||
}
|
||||
|
||||
void
|
||||
x86_sse_avx_mix_buffers_with_gain (float * dst, const float * src, uint32_t nframes, float gain)
|
||||
/**
|
||||
* @brief x86-64 AVX optimized routine for apply gain routine
|
||||
* @param[in,out] dst Pointer to the destination buffer, which gets updated
|
||||
* @param nframes Number of frames (or samples) to process
|
||||
* @param gain Gain to apply
|
||||
*/
|
||||
C_FUNC void
|
||||
x86_sse_avx_apply_gain_to_buffer(float *dst, uint32_t nframes, float gain)
|
||||
{
|
||||
default_mix_buffers_with_gain (dst, src, nframes, gain);
|
||||
// Load gain vector to all elements of YMM register
|
||||
__m256 vgain = _mm256_set1_ps(gain);
|
||||
|
||||
if (nframes) {
|
||||
__m128 g0 = _mm256_castps256_ps128(vgain);
|
||||
// Here comes the horror, poor-man's loop unrolling
|
||||
switch (((intptr_t)dst) % 32) {
|
||||
case 0:
|
||||
default:
|
||||
// Buffer is aligned, skip to the next section of aligned
|
||||
break;
|
||||
case 4:
|
||||
_mm_store_ss(dst, _mm_mul_ss(g0, _mm_load_ss(dst)));
|
||||
++dst;
|
||||
--nframes;
|
||||
case 8:
|
||||
_mm_store_ss(dst, _mm_mul_ss(g0, _mm_load_ss(dst)));
|
||||
++dst;
|
||||
--nframes;
|
||||
case 12:
|
||||
_mm_store_ss(dst, _mm_mul_ss(g0, _mm_load_ss(dst)));
|
||||
++dst;
|
||||
--nframes;
|
||||
case 16:
|
||||
// This is a special case where pointer is 16 byte aligned
|
||||
// for a XMM load/store operation.
|
||||
_mm_store_ps(dst, _mm_mul_ps(g0, _mm_load_ps(dst)));
|
||||
dst += 4;
|
||||
nframes -= 4;
|
||||
break;
|
||||
case 20:
|
||||
_mm_store_ss(dst, _mm_mul_ss(g0, _mm_load_ss(dst)));
|
||||
++dst;
|
||||
--nframes;
|
||||
case 24:
|
||||
_mm_store_ss(dst, _mm_mul_ss(g0, _mm_load_ss(dst)));
|
||||
++dst;
|
||||
--nframes;
|
||||
case 28:
|
||||
_mm_store_ss(dst, _mm_mul_ss(g0, _mm_load_ss(dst)));
|
||||
++dst;
|
||||
--nframes;
|
||||
}
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
|
||||
// Process the remaining samples 16 at a time
|
||||
while (nframes >= 16)
|
||||
{
|
||||
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
|
||||
_mm_prefetch(((char *)dst + (16 * sizeof(float))), _mm_hint(0));
|
||||
#else
|
||||
__builtin_prefetch(dst + (16 * sizeof(float)), 0, 0);
|
||||
#endif
|
||||
__m256 d0, d1;
|
||||
d0 = _mm256_load_ps(dst + 0 );
|
||||
d1 = _mm256_load_ps(dst + 8 );
|
||||
|
||||
d0 = _mm256_mul_ps(vgain, d0);
|
||||
d1 = _mm256_mul_ps(vgain, d1);
|
||||
|
||||
_mm256_store_ps(dst + 0 , d0);
|
||||
_mm256_store_ps(dst + 8 , d1);
|
||||
|
||||
dst += 16;
|
||||
nframes -= 16;
|
||||
}
|
||||
|
||||
// Process the remaining samples 8 at a time
|
||||
while (nframes >= 8) {
|
||||
_mm256_store_ps(dst, _mm256_mul_ps(vgain, _mm256_load_ps(dst)));
|
||||
dst += 8;
|
||||
nframes -= 8;
|
||||
}
|
||||
|
||||
|
||||
// There's a penalty going away from AVX mode to SSE mode. This can
|
||||
// be avoided by ensuring to the CPU that rest of the routine is no
|
||||
// longer interested in the upper portion of the YMM register.
|
||||
|
||||
_mm256_zeroupper(); // zeros the upper portion of YMM register
|
||||
|
||||
// Process the remaining samples
|
||||
do {
|
||||
__m128 g0 = _mm256_castps256_ps128(vgain);
|
||||
while (nframes > 0) {
|
||||
_mm_store_ss(dst, _mm_mul_ss(g0, _mm_load_ss(dst)));
|
||||
++dst;
|
||||
--nframes;
|
||||
}
|
||||
} while (0);
|
||||
}
|
||||
|
||||
void
|
||||
x86_sse_avx_mix_buffers_no_gain (float * dst, const float * src, uint32_t nframes)
|
||||
/**
|
||||
* @brief x86-64 AVX optimized routine for mixing buffer with gain.
|
||||
*
|
||||
* This function may choose SSE over AVX if the pointers are aligned
|
||||
* to 16 byte boundary instead of 32 byte boundary to reduce time to
|
||||
* process.
|
||||
*
|
||||
* @param[in,out] dst Pointer to destination buffer, which gets updated
|
||||
* @param[in] src Pointer to source buffer (not updated)
|
||||
* @param nframes Number of samples to process
|
||||
* @param gain Gain to apply
|
||||
*/
|
||||
C_FUNC void
|
||||
x86_sse_avx_mix_buffers_with_gain(float *dst, const float *src, uint32_t nframes, float gain)
|
||||
{
|
||||
default_mix_buffers_no_gain (dst, src, nframes);
|
||||
if (IS_ALIGNED_TO(dst, 32) && IS_ALIGNED_TO(src, 32)) {
|
||||
// Pointers are both aligned to 32 bit boundaries, this can be processed with AVX
|
||||
x86_sse_avx_mix_buffers_with_gain_aligned(dst, src, nframes, gain);
|
||||
} else if (IS_ALIGNED_TO(dst, 16) && IS_ALIGNED_TO(src, 16)) {
|
||||
// This can still be processed with SSE
|
||||
x86_sse_mix_buffers_with_gain(dst, src, nframes, gain);
|
||||
} else {
|
||||
// Pointers are unaligned, so process them with unaligned load/store AVX
|
||||
x86_sse_avx_mix_buffers_with_gain_unaligned(dst, src, nframes, gain);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
x86_sse_avx_copy_vector (float * dst, const float * src, uint32_t nframes)
|
||||
/**
|
||||
* @brief x86-64 AVX optimized routine for mixing buffer with no gain.
|
||||
*
|
||||
* This function may choose SSE over AVX if the pointers are aligned
|
||||
* to 16 byte boundary instead of 32 byte boundary to reduce time to
|
||||
* process.
|
||||
*
|
||||
* @param[in,out] dst Pointer to destination buffer, which gets updated
|
||||
* @param[in] src Pointer to source buffer (not updated)
|
||||
* @param nframes Number of samples to process
|
||||
*/
|
||||
C_FUNC void
|
||||
x86_sse_avx_mix_buffers_no_gain(float *dst, const float *src, uint32_t nframes)
|
||||
{
|
||||
default_copy_vector (dst, src, nframes);
|
||||
if (IS_ALIGNED_TO(dst, 32) && IS_ALIGNED_TO(src, 32)) {
|
||||
// Pointers are both aligned to 32 bit boundaries, this can be processed with AVX
|
||||
x86_sse_avx_mix_buffers_no_gain_aligned(dst, src, nframes);
|
||||
} else if (IS_ALIGNED_TO(dst, 16) && IS_ALIGNED_TO(src, 16)) {
|
||||
// This can still be processed with SSE
|
||||
x86_sse_mix_buffers_no_gain(dst, src, nframes);
|
||||
} else {
|
||||
// Pointers are unaligned, so process them with unaligned load/store AVX
|
||||
x86_sse_avx_mix_buffers_no_gain_unaligned(dst, src, nframes);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
x86_sse_avx_find_peaks (const float * buf, uint32_t nsamples, float *min, float *max)
|
||||
/**
|
||||
* @brief Copy vector from one location to another
|
||||
*
|
||||
* This has not been hand optimized for AVX with the rationale that standard
|
||||
* C library implementation will provide faster memory copy operation. It will
|
||||
* be redundant to implement memcpy for floats.
|
||||
*
|
||||
* @param[out] dst Pointer to destination buffer
|
||||
* @param[in] src Pointer to source buffer
|
||||
* @param nframes Number of samples to copy
|
||||
*/
|
||||
C_FUNC void
|
||||
x86_sse_avx_copy_vector(float *dst, const float *src, uint32_t nframes)
|
||||
{
|
||||
default_find_peaks (buf, nsamples, min, max);
|
||||
(void) memcpy(dst, src, nframes * sizeof(float));
|
||||
}
|
||||
|
||||
/**
|
||||
* Local helper functions
|
||||
*/
|
||||
|
||||
/**
|
||||
* @brief Helper routine for mixing buffers with gain for unaligned buffers
|
||||
*
|
||||
* @details This routine executes the following expression below per element:
|
||||
*
|
||||
* dst = dst + (gain * src)
|
||||
*
|
||||
* @param[in,out] dst Pointer to destination buffer, which gets updated
|
||||
* @param[in] src Pointer to source buffer (not updated)
|
||||
* @param nframes Number of samples to process
|
||||
* @param gain Gain to apply
|
||||
*/
|
||||
static void
|
||||
x86_sse_avx_mix_buffers_with_gain_unaligned(float *dst, const float *src, uint32_t nframes, float gain)
|
||||
{
|
||||
// Load gain vector to all elements of YMM register
|
||||
__m256 vgain = _mm256_set1_ps(gain);
|
||||
|
||||
// Process the remaining samples 16 at a time
|
||||
while (nframes >= 16)
|
||||
{
|
||||
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
|
||||
_mm_prefetch(((char *)dst + (16 * sizeof(float))), _mm_hint(0));
|
||||
_mm_prefetch(((char *)src + (16 * sizeof(float))), _mm_hint(0));
|
||||
#else
|
||||
__builtin_prefetch(src + (16 * sizeof(float)), 0, 0);
|
||||
__builtin_prefetch(dst + (16 * sizeof(float)), 0, 0);
|
||||
#endif
|
||||
__m256 s0, s1;
|
||||
__m256 d0, d1;
|
||||
|
||||
// Load sources
|
||||
s0 = _mm256_loadu_ps(src + 0);
|
||||
s1 = _mm256_loadu_ps(src + 8);
|
||||
|
||||
// Load destinations
|
||||
d0 = _mm256_loadu_ps(dst + 0);
|
||||
d1 = _mm256_loadu_ps(dst + 8);
|
||||
|
||||
// src = src * gain
|
||||
s0 = _mm256_mul_ps(vgain, s0);
|
||||
s1 = _mm256_mul_ps(vgain, s1);
|
||||
|
||||
// dst = dst + src
|
||||
d0 = _mm256_add_ps(d0, s0);
|
||||
d1 = _mm256_add_ps(d1, s1);
|
||||
|
||||
// Store result
|
||||
_mm256_storeu_ps(dst + 0, d0);
|
||||
_mm256_storeu_ps(dst + 8, d1);
|
||||
|
||||
// Update pointers and counters
|
||||
src += 16;
|
||||
dst += 16;
|
||||
nframes -= 16;
|
||||
}
|
||||
|
||||
// Process the remaining samples 8 at a time
|
||||
while (nframes >= 8) {
|
||||
__m256 s0, d0;
|
||||
// Load sources
|
||||
s0 = _mm256_loadu_ps(src);
|
||||
// Load destinations
|
||||
d0 = _mm256_loadu_ps(dst);
|
||||
// src = src * gain
|
||||
s0 = _mm256_mul_ps(vgain, s0);
|
||||
// dst = dst + src
|
||||
d0 = _mm256_add_ps(d0, s0);
|
||||
// Store result
|
||||
_mm256_storeu_ps(dst, d0);
|
||||
// Update pointers and counters
|
||||
src+= 8;
|
||||
dst += 8;
|
||||
nframes -= 8;
|
||||
}
|
||||
|
||||
|
||||
// There's a penalty going away from AVX mode to SSE mode. This can
|
||||
// be avoided by ensuring the CPU that rest of the routine is no
|
||||
// longer interested in the upper portion of the YMM register.
|
||||
|
||||
_mm256_zeroupper(); // zeros the upper portion of YMM register
|
||||
|
||||
// Process the remaining samples
|
||||
do {
|
||||
__m128 g0 = _mm_set_ss(gain);
|
||||
while (nframes > 0) {
|
||||
__m128 s0, d0;
|
||||
s0 = _mm_load_ss(src);
|
||||
d0 = _mm_load_ss(dst);
|
||||
s0 = _mm_mul_ss(g0, s0);
|
||||
d0 = _mm_add_ss(d0, s0);
|
||||
_mm_store_ss(dst, d0);
|
||||
++src;
|
||||
++dst;
|
||||
--nframes;
|
||||
}
|
||||
} while (0);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Helper routine for mixing buffers with gain for aligned buffers
|
||||
*
|
||||
* @details This routine executes the following expression below per element:
|
||||
*
|
||||
* dst = dst + (gain * src)
|
||||
*
|
||||
* @param[in,out] dst Pointer to destination buffer, which gets updated
|
||||
* @param[in] src Pointer to source buffer (not updated)
|
||||
* @param nframes Number of samples to process
|
||||
* @param gain Gain to apply
|
||||
*/
|
||||
static void
|
||||
x86_sse_avx_mix_buffers_with_gain_aligned(float *dst, const float *src, uint32_t nframes, float gain)
|
||||
{
|
||||
// Load gain vector to all elements of YMM register
|
||||
__m256 vgain = _mm256_set1_ps(gain);
|
||||
|
||||
// Process the remaining samples 16 at a time
|
||||
while (nframes >= 16)
|
||||
{
|
||||
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
|
||||
_mm_prefetch(((char *)dst + (16 * sizeof(float))), _mm_hint(0));
|
||||
_mm_prefetch(((char *)src + (16 * sizeof(float))), _mm_hint(0));
|
||||
#else
|
||||
__builtin_prefetch(src + (16 * sizeof(float)), 0, 0);
|
||||
__builtin_prefetch(dst + (16 * sizeof(float)), 0, 0);
|
||||
#endif
|
||||
__m256 s0, s1;
|
||||
__m256 d0, d1;
|
||||
|
||||
// Load sources
|
||||
s0 = _mm256_load_ps(src + 0);
|
||||
s1 = _mm256_load_ps(src + 8);
|
||||
|
||||
// Load destinations
|
||||
d0 = _mm256_load_ps(dst + 0);
|
||||
d1 = _mm256_load_ps(dst + 8);
|
||||
|
||||
// src = src * gain
|
||||
s0 = _mm256_mul_ps(vgain, s0);
|
||||
s1 = _mm256_mul_ps(vgain, s1);
|
||||
|
||||
// dst = dst + src
|
||||
d0 = _mm256_add_ps(d0, s0);
|
||||
d1 = _mm256_add_ps(d1, s1);
|
||||
|
||||
// Store result
|
||||
_mm256_store_ps(dst + 0, d0);
|
||||
_mm256_store_ps(dst + 8, d1);
|
||||
|
||||
// Update pointers and counters
|
||||
src += 16;
|
||||
dst += 16;
|
||||
nframes -= 16;
|
||||
}
|
||||
|
||||
// Process the remaining samples 8 at a time
|
||||
while (nframes >= 8) {
|
||||
__m256 s0, d0;
|
||||
// Load sources
|
||||
s0 = _mm256_load_ps(src + 0 );
|
||||
// Load destinations
|
||||
d0 = _mm256_load_ps(dst + 0 );
|
||||
// src = src * gain
|
||||
s0 = _mm256_mul_ps(vgain, s0);
|
||||
// dst = dst + src
|
||||
d0 = _mm256_add_ps(d0, s0);
|
||||
// Store result
|
||||
_mm256_store_ps(dst, d0);
|
||||
// Update pointers and counters
|
||||
src += 8;
|
||||
dst += 8;
|
||||
nframes -= 8;
|
||||
}
|
||||
|
||||
|
||||
// There's a penalty going from AVX mode to SSE mode. This can
|
||||
// be avoided by ensuring the CPU that rest of the routine is no
|
||||
// longer interested in the upper portion of the YMM register.
|
||||
|
||||
_mm256_zeroupper(); // zeros the upper portion of YMM register
|
||||
|
||||
// Process the remaining samples, one sample at a time.
|
||||
do {
|
||||
__m128 g0 = _mm256_castps256_ps128(vgain); // use the same register
|
||||
while (nframes > 0) {
|
||||
__m128 s0, d0;
|
||||
s0 = _mm_load_ss(src);
|
||||
d0 = _mm_load_ss(dst);
|
||||
s0 = _mm_mul_ss(g0, s0);
|
||||
d0 = _mm_add_ss(d0, s0);
|
||||
_mm_store_ss(dst, d0);
|
||||
++src;
|
||||
++dst;
|
||||
--nframes;
|
||||
}
|
||||
} while (0);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Helper routine for mixing buffers with no gain for aligned buffers
|
||||
*
|
||||
* @details This routine executes the following expression below per element:
|
||||
*
|
||||
* dst = dst + src
|
||||
*
|
||||
* @param[in,out] dst Pointer to destination buffer, which gets updated
|
||||
* @param[in] src Pointer to source buffer (not updated)
|
||||
* @param nframes Number of samples to process
|
||||
*/
|
||||
static void
|
||||
x86_sse_avx_mix_buffers_no_gain_unaligned(float *dst, const float *src, uint32_t nframes)
|
||||
{
|
||||
// Process the remaining samples 16 at a time
|
||||
while (nframes >= 16)
|
||||
{
|
||||
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
|
||||
_mm_prefetch(((char *)dst + (16 * sizeof(float))), _mm_hint(0));
|
||||
_mm_prefetch(((char *)src + (16 * sizeof(float))), _mm_hint(0));
|
||||
#else
|
||||
__builtin_prefetch(src + (16 * sizeof(float)), 0, 0);
|
||||
__builtin_prefetch(dst + (16 * sizeof(float)), 0, 0);
|
||||
#endif
|
||||
__m256 s0, s1;
|
||||
__m256 d0, d1;
|
||||
|
||||
// Load sources
|
||||
s0 = _mm256_loadu_ps(src + 0);
|
||||
s1 = _mm256_loadu_ps(src + 8);
|
||||
|
||||
// Load destinations
|
||||
d0 = _mm256_loadu_ps(dst + 0);
|
||||
d1 = _mm256_loadu_ps(dst + 8);
|
||||
|
||||
// dst = dst + src
|
||||
d0 = _mm256_add_ps(d0, s0);
|
||||
d1 = _mm256_add_ps(d1, s1);
|
||||
|
||||
// Store result
|
||||
_mm256_storeu_ps(dst + 0, d0);
|
||||
_mm256_storeu_ps(dst + 8, d1);
|
||||
|
||||
// Update pointers and counters
|
||||
src += 16;
|
||||
dst += 16;
|
||||
nframes -= 16;
|
||||
}
|
||||
|
||||
// Process the remaining samples 8 at a time
|
||||
while (nframes >= 8) {
|
||||
__m256 s0, d0;
|
||||
// Load sources
|
||||
s0 = _mm256_loadu_ps(src);
|
||||
// Load destinations
|
||||
d0 = _mm256_loadu_ps(dst);
|
||||
// dst = dst + src
|
||||
d0 = _mm256_add_ps(d0, s0);
|
||||
// Store result
|
||||
_mm256_storeu_ps(dst, d0);
|
||||
// Update pointers and counters
|
||||
src+= 8;
|
||||
dst += 8;
|
||||
nframes -= 8;
|
||||
}
|
||||
|
||||
// There's a penalty going away from AVX mode to SSE mode. This can
|
||||
// be avoided by ensuring the CPU that rest of the routine is no
|
||||
// longer interested in the upper portion of the YMM register.
|
||||
|
||||
_mm256_zeroupper(); // zeros the upper portion of YMM register
|
||||
|
||||
// Process the remaining samples
|
||||
do {
|
||||
while (nframes > 0) {
|
||||
__m128 s0, d0;
|
||||
s0 = _mm_load_ss(src);
|
||||
d0 = _mm_load_ss(dst);
|
||||
d0 = _mm_add_ss(d0, s0);
|
||||
_mm_store_ss(dst, d0);
|
||||
++src;
|
||||
++dst;
|
||||
--nframes;
|
||||
}
|
||||
} while (0);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Helper routine for mixing buffers with no gain for unaligned buffers
|
||||
*
|
||||
* @details This routine executes the following expression below per element:
|
||||
*
|
||||
* dst = dst + src
|
||||
*
|
||||
* @param[in,out] dst Pointer to destination buffer, which gets updated
|
||||
* @param[in] src Pointer to source buffer (not updated)
|
||||
* @param nframes Number of samples to process
|
||||
*/
|
||||
static void
|
||||
x86_sse_avx_mix_buffers_no_gain_aligned(float *dst, const float *src, uint32_t nframes)
|
||||
{
|
||||
// Process the aligned portion 32 samples at a time
|
||||
while (nframes >= 32)
|
||||
{
|
||||
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
|
||||
_mm_prefetch(((char *)dst + (32 * sizeof(float))), _mm_hint(0));
|
||||
_mm_prefetch(((char *)src + (32 * sizeof(float))), _mm_hint(0));
|
||||
#else
|
||||
__builtin_prefetch(src + (32 * sizeof(float)), 0, 0);
|
||||
__builtin_prefetch(dst + (32 * sizeof(float)), 0, 0);
|
||||
#endif
|
||||
__m256 s0, s1, s2, s3;
|
||||
__m256 d0, d1, d2, d3;
|
||||
|
||||
// Load sources
|
||||
s0 = _mm256_load_ps(src + 0 );
|
||||
s1 = _mm256_load_ps(src + 8 );
|
||||
s2 = _mm256_load_ps(src + 16);
|
||||
s3 = _mm256_load_ps(src + 24);
|
||||
|
||||
// Load destinations
|
||||
d0 = _mm256_load_ps(dst + 0 );
|
||||
d1 = _mm256_load_ps(dst + 8 );
|
||||
d2 = _mm256_load_ps(dst + 16);
|
||||
d3 = _mm256_load_ps(dst + 24);
|
||||
|
||||
// dst = dst + src
|
||||
d0 = _mm256_add_ps(d0, s0);
|
||||
d1 = _mm256_add_ps(d1, s1);
|
||||
d2 = _mm256_add_ps(d2, s2);
|
||||
d3 = _mm256_add_ps(d3, s3);
|
||||
|
||||
// Store result
|
||||
_mm256_store_ps(dst + 0 , d0);
|
||||
_mm256_store_ps(dst + 8 , d1);
|
||||
_mm256_store_ps(dst + 16, d2);
|
||||
_mm256_store_ps(dst + 24, d3);
|
||||
|
||||
// Update pointers and counters
|
||||
src += 32;
|
||||
dst += 32;
|
||||
nframes -= 32;
|
||||
}
|
||||
|
||||
// Process the remaining samples 16 at a time
|
||||
while (nframes >= 16)
|
||||
{
|
||||
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
|
||||
_mm_prefetch(((char *)dst + (16 * sizeof(float))), _mm_hint(0));
|
||||
_mm_prefetch(((char *)src + (16 * sizeof(float))), _mm_hint(0));
|
||||
#else
|
||||
__builtin_prefetch(src + (16 * sizeof(float)), 0, 0);
|
||||
__builtin_prefetch(dst + (16 * sizeof(float)), 0, 0);
|
||||
#endif
|
||||
__m256 s0, s1;
|
||||
__m256 d0, d1;
|
||||
|
||||
// Load sources
|
||||
s0 = _mm256_load_ps(src + 0);
|
||||
s1 = _mm256_load_ps(src + 8);
|
||||
|
||||
// Load destinations
|
||||
d0 = _mm256_load_ps(dst + 0);
|
||||
d1 = _mm256_load_ps(dst + 8);
|
||||
|
||||
// dst = dst + src
|
||||
d0 = _mm256_add_ps(d0, s0);
|
||||
d1 = _mm256_add_ps(d1, s1);
|
||||
|
||||
// Store result
|
||||
_mm256_store_ps(dst + 0, d0);
|
||||
_mm256_store_ps(dst + 8, d1);
|
||||
|
||||
// Update pointers and counters
|
||||
src += 16;
|
||||
dst += 16;
|
||||
nframes -= 16;
|
||||
}
|
||||
|
||||
// Process the remaining samples 8 at a time
|
||||
while (nframes >= 8) {
|
||||
__m256 s0, d0;
|
||||
// Load sources
|
||||
s0 = _mm256_load_ps(src + 0 );
|
||||
// Load destinations
|
||||
d0 = _mm256_load_ps(dst + 0 );
|
||||
// dst = dst + src
|
||||
d0 = _mm256_add_ps(d0, s0);
|
||||
// Store result
|
||||
_mm256_store_ps(dst, d0);
|
||||
// Update pointers and counters
|
||||
src += 8;
|
||||
dst += 8;
|
||||
nframes -= 8;
|
||||
}
|
||||
|
||||
// There's a penalty going from AVX mode to SSE mode. This can
|
||||
// be avoided by ensuring the CPU that rest of the routine is no
|
||||
// longer interested in the upper portion of the YMM register.
|
||||
|
||||
_mm256_zeroupper(); // zeros the upper portion of YMM register
|
||||
|
||||
// Process the remaining samples
|
||||
do {
|
||||
while (nframes > 0) {
|
||||
__m128 s0, d0;
|
||||
s0 = _mm_load_ss(src);
|
||||
d0 = _mm_load_ss(dst);
|
||||
d0 = _mm_add_ss(d0, s0);
|
||||
_mm_store_ss(dst, d0);
|
||||
++src;
|
||||
++dst;
|
||||
--nframes;
|
||||
}
|
||||
} while (0);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Get the maximum value of packed float register
|
||||
* @param vmax Packed float 8x register
|
||||
* @return __m256 Maximum value in p[0]
|
||||
*/
|
||||
static inline __m256 avx_getmax_ps(__m256 vmax)
|
||||
{
|
||||
__m256 tmp;
|
||||
tmp = _mm256_shuffle_ps(vmax, vmax, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
vmax = _mm256_max_ps(tmp, vmax);
|
||||
tmp = _mm256_shuffle_ps(vmax, vmax, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
vmax = _mm256_max_ps(tmp, vmax);
|
||||
tmp = _mm256_permute2f128_ps(vmax, vmax, 1);
|
||||
vmax = _mm256_max_ps(tmp, vmax);
|
||||
return vmax;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Get the minimum value of packed float register
|
||||
* @param vmax Packed float 8x register
|
||||
* @return __m256 Minimum value in p[0]
|
||||
*/
|
||||
static inline __m256 avx_getmin_ps(__m256 vmin)
|
||||
{
|
||||
__m256 tmp;
|
||||
tmp = _mm256_shuffle_ps(vmin, vmin, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
vmin = _mm256_min_ps(tmp, vmin);
|
||||
tmp = _mm256_shuffle_ps(vmin, vmin, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
vmin = _mm256_min_ps(tmp, vmin);
|
||||
tmp = _mm256_permute2f128_ps(vmin, vmin, 1);
|
||||
vmin = _mm256_min_ps(tmp, vmin);
|
||||
return vmin;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user