ardour/libs/ardour/x86_functions_avx512f.cc

/*
* Copyright (C) 2023 Ayan Shafqat <ayan.x.shafqat@gmail.com>
* Copyright (C) 2024 Robin Gareus <robin@gareus.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#ifdef FPU_AVX512F_SUPPORT
#include "ardour/mix.h"
#include <immintrin.h>
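// Evaluates to true when "ptr" is aligned to a "bytes"-byte boundary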
#define IS_ALIGNED_TO(ptr, bytes) \
(reinterpret_cast<uintptr_t>(ptr) % (bytes) == 0)
#ifndef __AVX512F__
#error "__AVX512F__ must be enabled for this module to work"
#endif
/**
* @brief x86-64 AVX-512F optimized routine for the compute-peak procedure
* @param src Pointer to the source buffer
* @param nframes Number of frames to process
* @param current Current peak value
* @return New peak value
*/
float
x86_avx512f_compute_peak(const float *src, uint32_t nframes, float current)
{
// Convert to signed integer to prevent any arithmetic overflow errors
int32_t frames = static_cast<int32_t>(nframes);
// Broadcast the current peak value to all elements of the ZMM register
__m512 zmax = _mm512_set1_ps(current);
// Process 1/4/8 samples at a time until the source pointer reaches 64-byte alignment
while (frames > 0) {
if (IS_ALIGNED_TO(src, sizeof(__m512))) {
break;
}
if (frames >= 8 && IS_ALIGNED_TO(src, sizeof(__m256))) {
// Use a broadcast load so that every lane of the ZMM register holds valid
// data (a plain cast would leave the upper lanes undefined)
__m512 x = _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castps_pd(_mm256_load_ps(src))));
x = _mm512_abs_ps(x);
zmax = _mm512_max_ps(zmax, x);
src += 8;
frames -= 8;
continue;
}
if (frames >= 4 && IS_ALIGNED_TO(src, sizeof(__m128))) {
__m512 x = _mm512_broadcast_f32x4(_mm_load_ps(src));
x = _mm512_abs_ps(x);
zmax = _mm512_max_ps(zmax, x);
src += 4;
frames -= 4;
continue;
}
// Pointers are aligned to float boundaries (4 bytes)
__m512 x = _mm512_broadcastss_ps(_mm_load_ss(src));
x = _mm512_abs_ps(x);
zmax = _mm512_max_ps(zmax, x);
++src;
--frames;
}
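// At this point either frames == 0 or src is 64-byte aligned, so the aligned
// ZMM loads below are safe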
while (frames >= 256) {
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
_mm_prefetch(reinterpret_cast<char const *>(src + 256), _MM_HINT_NTA);
#else
__builtin_prefetch(reinterpret_cast<void const *>(src + 256), 0, 0);
#endif
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
__m512 x4 = _mm512_load_ps(src + 64);
__m512 x5 = _mm512_load_ps(src + 80);
__m512 x6 = _mm512_load_ps(src + 96);
__m512 x7 = _mm512_load_ps(src + 112);
__m512 x8 = _mm512_load_ps(src + 128);
__m512 x9 = _mm512_load_ps(src + 144);
__m512 x10 = _mm512_load_ps(src + 160);
__m512 x11 = _mm512_load_ps(src + 176);
__m512 x12 = _mm512_load_ps(src + 192);
__m512 x13 = _mm512_load_ps(src + 208);
__m512 x14 = _mm512_load_ps(src + 224);
__m512 x15 = _mm512_load_ps(src + 240);
x0 = _mm512_abs_ps(x0);
x1 = _mm512_abs_ps(x1);
x2 = _mm512_abs_ps(x2);
x3 = _mm512_abs_ps(x3);
x4 = _mm512_abs_ps(x4);
x5 = _mm512_abs_ps(x5);
x6 = _mm512_abs_ps(x6);
x7 = _mm512_abs_ps(x7);
x8 = _mm512_abs_ps(x8);
x9 = _mm512_abs_ps(x9);
x10 = _mm512_abs_ps(x10);
x11 = _mm512_abs_ps(x11);
x12 = _mm512_abs_ps(x12);
x13 = _mm512_abs_ps(x13);
x14 = _mm512_abs_ps(x14);
x15 = _mm512_abs_ps(x15);
zmax = _mm512_max_ps(zmax, x0);
zmax = _mm512_max_ps(zmax, x1);
zmax = _mm512_max_ps(zmax, x2);
zmax = _mm512_max_ps(zmax, x3);
zmax = _mm512_max_ps(zmax, x4);
zmax = _mm512_max_ps(zmax, x5);
zmax = _mm512_max_ps(zmax, x6);
zmax = _mm512_max_ps(zmax, x7);
zmax = _mm512_max_ps(zmax, x8);
zmax = _mm512_max_ps(zmax, x9);
zmax = _mm512_max_ps(zmax, x10);
zmax = _mm512_max_ps(zmax, x11);
zmax = _mm512_max_ps(zmax, x12);
zmax = _mm512_max_ps(zmax, x13);
zmax = _mm512_max_ps(zmax, x14);
zmax = _mm512_max_ps(zmax, x15);
src += 256;
frames -= 256;
}
while (frames >= 128) {
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
_mm_prefetch(reinterpret_cast<char const *>(src + 128), _MM_HINT_NTA);
#else
__builtin_prefetch(reinterpret_cast<void const *>(src + 128), 0, 0);
#endif
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
__m512 x4 = _mm512_load_ps(src + 64);
__m512 x5 = _mm512_load_ps(src + 80);
__m512 x6 = _mm512_load_ps(src + 96);
__m512 x7 = _mm512_load_ps(src + 112);
x0 = _mm512_abs_ps(x0);
x1 = _mm512_abs_ps(x1);
x2 = _mm512_abs_ps(x2);
x3 = _mm512_abs_ps(x3);
x4 = _mm512_abs_ps(x4);
x5 = _mm512_abs_ps(x5);
x6 = _mm512_abs_ps(x6);
x7 = _mm512_abs_ps(x7);
zmax = _mm512_max_ps(zmax, x0);
zmax = _mm512_max_ps(zmax, x1);
zmax = _mm512_max_ps(zmax, x2);
zmax = _mm512_max_ps(zmax, x3);
zmax = _mm512_max_ps(zmax, x4);
zmax = _mm512_max_ps(zmax, x5);
zmax = _mm512_max_ps(zmax, x6);
zmax = _mm512_max_ps(zmax, x7);
src += 128;
frames -= 128;
}
while (frames >= 64) {
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
_mm_prefetch(reinterpret_cast<char const *>(src + 64), _MM_HINT_NTA);
#else
__builtin_prefetch(reinterpret_cast<void const *>(src + 64), 0, 0);
#endif
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
x0 = _mm512_abs_ps(x0);
x1 = _mm512_abs_ps(x1);
x2 = _mm512_abs_ps(x2);
x3 = _mm512_abs_ps(x3);
zmax = _mm512_max_ps(zmax, x0);
zmax = _mm512_max_ps(zmax, x1);
zmax = _mm512_max_ps(zmax, x2);
zmax = _mm512_max_ps(zmax, x3);
src += 64;
frames -= 64;
}
// Process the remaining samples 16 at a time
while (frames >= 16) {
__m512 x = _mm512_load_ps(src);
x = _mm512_abs_ps(x);
zmax = _mm512_max_ps(zmax, x);
src += 16;
frames -= 16;
}
// Process the remaining samples 8 at a time
while (frames >= 8) {
__m512 x = _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castps_pd(_mm256_load_ps(src))));
x = _mm512_abs_ps(x);
zmax = _mm512_max_ps(zmax, x);
src += 8;
frames -= 8;
}
// Process the remaining samples 4 at a time
while (frames >= 4) {
__m512 x = _mm512_broadcast_f32x4(_mm_load_ps(src));
x = _mm512_abs_ps(x);
zmax = _mm512_max_ps(zmax, x);
src += 4;
frames -= 4;
}
// If there are still 1-3 samples left, process them one at a time
while (frames > 0) {
__m512 x = _mm512_broadcastss_ps(_mm_load_ss(src));
x = _mm512_abs_ps(x);
zmax = _mm512_max_ps(zmax, x);
++src;
--frames;
}
// Horizontally reduce the ZMM register to the final peak value
current = _mm512_reduce_max_ps(zmax);
// There is a penalty when transitioning from AVX to SSE code. Avoid it by
// telling the CPU that the rest of the program no longer needs the upper
// portion of the YMM/ZMM registers.
_mm256_zeroupper(); // zero the upper halves of the YMM registers
return current;
}
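/*
 * Illustrative usage sketch (buffer and count names are hypothetical, not part
 * of this file): given a float-aligned buffer "buf" holding "n" samples,
 *
 *   float peak = x86_avx512f_compute_peak(buf, n, 0.0f);
 *
 * returns max(0.0f, |buf[0]|, ..., |buf[n-1]|).
 */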
/**
* @brief x86-64 AVX-512F optimized routine for the find-peaks procedure
* @param src Pointer to the source buffer
* @param nframes Number of frames to process
* @param[in,out] minf Current minimum value, updated in place
* @param[in,out] maxf Current maximum value, updated in place
*/
void
x86_avx512f_find_peaks(const float *src, uint32_t nframes, float *minf, float *maxf)
{
// Convert to signed integer to prevent any arithmetic overflow errors
int32_t frames = static_cast<int32_t>(nframes);
// Broadcast the current min and max values to all elements of the ZMM register
__m512 zmin = _mm512_set1_ps(*minf);
__m512 zmax = _mm512_set1_ps(*maxf);
// Process 1/4/8 samples at a time until the source pointer reaches 64-byte alignment
while (frames > 0) {
if (IS_ALIGNED_TO(src, sizeof(__m512))) {
break;
}
if (frames >= 8 && IS_ALIGNED_TO(src, sizeof(__m256))) {
// Use a broadcast load so that every lane of the ZMM register holds valid
// data (a plain cast would leave the upper lanes undefined and could
// corrupt the min/max accumulators)
__m512 x = _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castps_pd(_mm256_load_ps(src))));
zmin = _mm512_min_ps(zmin, x);
zmax = _mm512_max_ps(zmax, x);
src += 8;
frames -= 8;
continue;
}
if (frames >= 4 && IS_ALIGNED_TO(src, sizeof(__m128))) {
__m512 x = _mm512_broadcast_f32x4(_mm_load_ps(src));
zmin = _mm512_min_ps(zmin, x);
zmax = _mm512_max_ps(zmax, x);
src += 4;
frames -= 4;
continue;
}
// Pointers are aligned to float boundaries (4 bytes)
__m512 x = _mm512_broadcastss_ps(_mm_load_ss(src));
zmin = _mm512_min_ps(zmin, x);
zmax = _mm512_max_ps(zmax, x);
++src;
--frames;
}
while (frames >= 256) {
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
_mm_prefetch(reinterpret_cast<char const *>(src + 256), _MM_HINT_NTA);
#else
__builtin_prefetch(reinterpret_cast<void const *>(src + 256), 0, 0);
#endif
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
__m512 x4 = _mm512_load_ps(src + 64);
__m512 x5 = _mm512_load_ps(src + 80);
__m512 x6 = _mm512_load_ps(src + 96);
__m512 x7 = _mm512_load_ps(src + 112);
__m512 x8 = _mm512_load_ps(src + 128);
__m512 x9 = _mm512_load_ps(src + 144);
__m512 x10 = _mm512_load_ps(src + 160);
__m512 x11 = _mm512_load_ps(src + 176);
__m512 x12 = _mm512_load_ps(src + 192);
__m512 x13 = _mm512_load_ps(src + 208);
__m512 x14 = _mm512_load_ps(src + 224);
__m512 x15 = _mm512_load_ps(src + 240);
zmin = _mm512_min_ps(zmin, x0);
zmin = _mm512_min_ps(zmin, x1);
zmin = _mm512_min_ps(zmin, x2);
zmin = _mm512_min_ps(zmin, x3);
zmin = _mm512_min_ps(zmin, x4);
zmin = _mm512_min_ps(zmin, x5);
zmin = _mm512_min_ps(zmin, x6);
zmin = _mm512_min_ps(zmin, x7);
zmin = _mm512_min_ps(zmin, x8);
zmin = _mm512_min_ps(zmin, x9);
zmin = _mm512_min_ps(zmin, x10);
zmin = _mm512_min_ps(zmin, x11);
zmin = _mm512_min_ps(zmin, x12);
zmin = _mm512_min_ps(zmin, x13);
zmin = _mm512_min_ps(zmin, x14);
zmin = _mm512_min_ps(zmin, x15);
zmax = _mm512_max_ps(zmax, x0);
zmax = _mm512_max_ps(zmax, x1);
zmax = _mm512_max_ps(zmax, x2);
zmax = _mm512_max_ps(zmax, x3);
zmax = _mm512_max_ps(zmax, x4);
zmax = _mm512_max_ps(zmax, x5);
zmax = _mm512_max_ps(zmax, x6);
zmax = _mm512_max_ps(zmax, x7);
zmax = _mm512_max_ps(zmax, x8);
zmax = _mm512_max_ps(zmax, x9);
zmax = _mm512_max_ps(zmax, x10);
zmax = _mm512_max_ps(zmax, x11);
zmax = _mm512_max_ps(zmax, x12);
zmax = _mm512_max_ps(zmax, x13);
zmax = _mm512_max_ps(zmax, x14);
zmax = _mm512_max_ps(zmax, x15);
src += 256;
frames -= 256;
}
while (frames >= 128) {
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
_mm_prefetch(reinterpret_cast<char const *>(src + 128), _MM_HINT_NTA);
#else
__builtin_prefetch(reinterpret_cast<void const *>(src + 128), 0, 0);
#endif
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
__m512 x4 = _mm512_load_ps(src + 64);
__m512 x5 = _mm512_load_ps(src + 80);
__m512 x6 = _mm512_load_ps(src + 96);
__m512 x7 = _mm512_load_ps(src + 112);
zmin = _mm512_min_ps(zmin, x0);
zmin = _mm512_min_ps(zmin, x1);
zmin = _mm512_min_ps(zmin, x2);
zmin = _mm512_min_ps(zmin, x3);
zmin = _mm512_min_ps(zmin, x4);
zmin = _mm512_min_ps(zmin, x5);
zmin = _mm512_min_ps(zmin, x6);
zmin = _mm512_min_ps(zmin, x7);
zmax = _mm512_max_ps(zmax, x0);
zmax = _mm512_max_ps(zmax, x1);
zmax = _mm512_max_ps(zmax, x2);
zmax = _mm512_max_ps(zmax, x3);
zmax = _mm512_max_ps(zmax, x4);
zmax = _mm512_max_ps(zmax, x5);
zmax = _mm512_max_ps(zmax, x6);
zmax = _mm512_max_ps(zmax, x7);
src += 128;
frames -= 128;
}
while (frames >= 64) {
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
_mm_prefetch(reinterpret_cast<char const *>(src + 64), _MM_HINT_NTA);
#else
__builtin_prefetch(reinterpret_cast<void const *>(src + 64), 0, 0);
#endif
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
zmin = _mm512_min_ps(zmin, x0);
zmin = _mm512_min_ps(zmin, x1);
zmin = _mm512_min_ps(zmin, x2);
zmin = _mm512_min_ps(zmin, x3);
zmax = _mm512_max_ps(zmax, x0);
zmax = _mm512_max_ps(zmax, x1);
zmax = _mm512_max_ps(zmax, x2);
zmax = _mm512_max_ps(zmax, x3);
src += 64;
frames -= 64;
}
// Process the remaining samples 16 at a time
while (frames >= 16) {
__m512 x = _mm512_load_ps(src);
zmin = _mm512_min_ps(zmin, x);
zmax = _mm512_max_ps(zmax, x);
src += 16;
frames -= 16;
}
// Process the remaining samples 8 at a time
while (frames >= 8) {
__m512 x = _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castps_pd(_mm256_load_ps(src))));
zmin = _mm512_min_ps(zmin, x);
zmax = _mm512_max_ps(zmax, x);
src += 8;
frames -= 8;
}
// Process the remaining samples 4 at a time
while (frames >= 4) {
__m512 x = _mm512_broadcast_f32x4(_mm_load_ps(src));
zmin = _mm512_min_ps(zmin, x);
zmax = _mm512_max_ps(zmax, x);
src += 4;
frames -= 4;
}
// If there are still 1-3 samples left, process them one at a time
while (frames > 0) {
__m512 x = _mm512_broadcastss_ps(_mm_load_ss(src));
zmin = _mm512_min_ps(zmin, x);
zmax = _mm512_max_ps(zmax, x);
++src;
--frames;
}
// Horizontally reduce the ZMM registers to the final min and max values
*minf = _mm512_reduce_min_ps(zmin);
*maxf = _mm512_reduce_max_ps(zmax);
// There is a penalty when transitioning from AVX to SSE code. Avoid it by
// telling the CPU that the rest of the program no longer needs the upper
// portion of the YMM/ZMM registers.
_mm256_zeroupper(); // zero the upper halves of the YMM registers
}
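/*
 * Illustrative usage sketch (variable names are hypothetical): minf and maxf
 * are running accumulators and must be initialized by the caller, e.g.
 *
 *   float lo = buf[0], hi = buf[0];
 *   x86_avx512f_find_peaks(buf, n, &lo, &hi);
 *
 * leaves lo = min(lo, buf[0..n-1]) and hi = max(hi, buf[0..n-1]).
 */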
/**
* @brief x86-64 AVX-512F optimized routine to apply gain to a buffer
* @param[in,out] dst Pointer to the destination buffer, which is updated in place
* @param nframes Number of frames (or samples) to process
* @param gain Gain to apply
*/
void
x86_avx512f_apply_gain_to_buffer(float *dst, uint32_t nframes, float gain)
{
// Convert to signed integer to prevent any arithmetic overflow errors
int32_t frames = static_cast<int32_t>(nframes);
// Broadcast the gain to all elements of the ZMM register, and take XMM and
// YMM views of it; these alias the same register and serve the SSE, AVX,
// and AVX-512 code paths respectively
__m512 zgain = _mm512_set1_ps(gain);
__m256 ygain = _mm512_castps512_ps256(zgain);
__m128 xgain = _mm512_castps512_ps128(zgain);
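// Process samples one/4/8 at a time until dst reaches 64-byte alignment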
while (frames > 0) {
if (IS_ALIGNED_TO(dst, sizeof(__m512))) {
break;
}
if (frames >= 8 && IS_ALIGNED_TO(dst, sizeof(__m256))) {
__m256 x = _mm256_load_ps(dst);
__m256 y = _mm256_mul_ps(ygain, x);
_mm256_store_ps(dst, y);
dst += 8;
frames -= 8;
continue;
}
if (frames >= 4 && IS_ALIGNED_TO(dst, sizeof(__m128))) {
__m128 x = _mm_load_ps(dst);
__m128 y = _mm_mul_ps(xgain, x);
_mm_store_ps(dst, y);
dst += 4;
frames -= 4;
continue;
}
// Pointers are aligned to float boundaries (4 bytes)
__m128 x = _mm_load_ss(dst);
__m128 y = _mm_mul_ss(xgain, x);
_mm_store_ss(dst, y);
++dst;
--frames;
}
// Process the remaining samples 128 at a time
while (frames >= 128) {
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
_mm_prefetch(reinterpret_cast<char const *>(dst + 128), _MM_HINT_NTA);
#else
__builtin_prefetch(reinterpret_cast<void const *>(dst + 128), 0, 0);
#endif
__m512 x0 = _mm512_load_ps(dst + 0);
__m512 x1 = _mm512_load_ps(dst + 16);
__m512 x2 = _mm512_load_ps(dst + 32);
__m512 x3 = _mm512_load_ps(dst + 48);
__m512 x4 = _mm512_load_ps(dst + 64);
__m512 x5 = _mm512_load_ps(dst + 80);
__m512 x6 = _mm512_load_ps(dst + 96);
__m512 x7 = _mm512_load_ps(dst + 112);
__m512 y0 = _mm512_mul_ps(zgain, x0);
__m512 y1 = _mm512_mul_ps(zgain, x1);
__m512 y2 = _mm512_mul_ps(zgain, x2);
__m512 y3 = _mm512_mul_ps(zgain, x3);
__m512 y4 = _mm512_mul_ps(zgain, x4);
__m512 y5 = _mm512_mul_ps(zgain, x5);
__m512 y6 = _mm512_mul_ps(zgain, x6);
__m512 y7 = _mm512_mul_ps(zgain, x7);
_mm512_store_ps(dst + 0, y0);
_mm512_store_ps(dst + 16, y1);
_mm512_store_ps(dst + 32, y2);
_mm512_store_ps(dst + 48, y3);
_mm512_store_ps(dst + 64, y4);
_mm512_store_ps(dst + 80, y5);
_mm512_store_ps(dst + 96, y6);
_mm512_store_ps(dst + 112, y7);
dst += 128;
frames -= 128;
}
// Process the remaining samples 16 at a time
while (frames >= 16) {
__m512 x = _mm512_load_ps(dst);
__m512 y = _mm512_mul_ps(zgain, x);
_mm512_store_ps(dst, y);
dst += 16;
frames -= 16;
}
// Process remaining samples x8
while (frames >= 8) {
__m256 x = _mm256_load_ps(dst);
__m256 y = _mm256_mul_ps(ygain, x);
_mm256_store_ps(dst, y);
dst += 8;
frames -= 8;
}
// Process remaining samples x4
while (frames >= 4) {
__m128 x = _mm_load_ps(dst);
__m128 y = _mm_mul_ps(xgain, x);
_mm_store_ps(dst, y);
dst += 4;
frames -= 4;
}
// Process remaining samples
while (frames > 0) {
__m128 x = _mm_load_ss(dst);
__m128 y = _mm_mul_ss(xgain, x);
_mm_store_ss(dst, y);
++dst;
--frames;
}
// There is a penalty when transitioning from AVX to SSE code. Avoid it by
// telling the CPU that the rest of the program no longer needs the upper
// portion of the YMM/ZMM registers.
_mm256_zeroupper(); // zero the upper halves of the YMM registers
}
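/*
 * Illustrative usage sketch (names are hypothetical): scale a buffer in place,
 * i.e. buf[i] *= gain, for example attenuating by about 6 dB:
 *
 *   x86_avx512f_apply_gain_to_buffer(buf, n, 0.5f);
 */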
/**
* @brief x86-64 AVX-512F optimized routine for mixing a buffer with gain
* @param[in,out] dst Pointer to the destination buffer, which is updated in place
* @param[in] src Pointer to the source buffer (not modified)
* @param nframes Number of samples to process
* @param gain Gain to apply to the source
*/
void
x86_avx512f_mix_buffers_with_gain(float *dst, const float *src, uint32_t nframes, float gain)
{
// Convert to signed integer to prevent any arithmetic overflow errors
int32_t frames = static_cast<int32_t>(nframes);
// Broadcast the gain to all elements of the ZMM register, and take XMM and
// YMM views of it; these alias the same register and serve the SSE, AVX,
// and AVX-512 code paths respectively
__m512 zgain = _mm512_set1_ps(gain);
__m256 ygain = _mm512_castps512_ps256(zgain);
__m128 xgain = _mm512_castps512_ps128(zgain);
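// Process samples one/4/8 at a time until both src and dst reach 64-byte
// alignment (or the buffer is exhausted)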
while (frames > 0)
{
if (IS_ALIGNED_TO(src, sizeof(__m512)) &&
IS_ALIGNED_TO(dst, sizeof(__m512))) {
break;
}
if (frames >= 8 &&
IS_ALIGNED_TO(src, sizeof(__m256)) &&
IS_ALIGNED_TO(dst, sizeof(__m256))) {
__m256 x = _mm256_load_ps(src);
__m256 y = _mm256_load_ps(dst);
y = _mm256_fmadd_ps(ygain, x, y);
_mm256_store_ps(dst, y);
src += 8;
dst += 8;
frames -= 8;
continue;
}
if (frames >= 4 &&
IS_ALIGNED_TO(src, sizeof(__m128)) &&
IS_ALIGNED_TO(dst, sizeof(__m128))) {
__m128 x = _mm_load_ps(src);
__m128 y = _mm_load_ps(dst);
y = _mm_fmadd_ps(xgain, x, y);
_mm_store_ps(dst, y);
src += 4;
dst += 4;
frames -= 4;
continue;
}
// Pointers are aligned to float boundaries (4 bytes)
__m128 x = _mm_load_ss(src);
__m128 y = _mm_load_ss(dst);
y = _mm_fmadd_ss(xgain, x, y);
_mm_store_ss(dst, y);
++src;
++dst;
--frames;
}
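// At this point either frames == 0 or both src and dst are 64-byte aligned,
// so the aligned ZMM loads and stores below are safe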
// Process the remaining samples 128 at a time
while (frames >= 128) {
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
_mm_prefetch(reinterpret_cast<char const *>(src + 128), _MM_HINT_NTA);
_mm_prefetch(reinterpret_cast<char const *>(dst + 128), _MM_HINT_NTA);
#else
__builtin_prefetch(reinterpret_cast<void const *>(src + 128), 0, 0);
__builtin_prefetch(reinterpret_cast<void const *>(dst + 128), 0, 0);
#endif
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
__m512 x4 = _mm512_load_ps(src + 64);
__m512 x5 = _mm512_load_ps(src + 80);
__m512 x6 = _mm512_load_ps(src + 96);
__m512 x7 = _mm512_load_ps(src + 112);
__m512 y0 = _mm512_load_ps(dst + 0);
__m512 y1 = _mm512_load_ps(dst + 16);
__m512 y2 = _mm512_load_ps(dst + 32);
__m512 y3 = _mm512_load_ps(dst + 48);
__m512 y4 = _mm512_load_ps(dst + 64);
__m512 y5 = _mm512_load_ps(dst + 80);
__m512 y6 = _mm512_load_ps(dst + 96);
__m512 y7 = _mm512_load_ps(dst + 112);
y0 = _mm512_fmadd_ps(zgain, x0, y0);
y1 = _mm512_fmadd_ps(zgain, x1, y1);
y2 = _mm512_fmadd_ps(zgain, x2, y2);
y3 = _mm512_fmadd_ps(zgain, x3, y3);
y4 = _mm512_fmadd_ps(zgain, x4, y4);
y5 = _mm512_fmadd_ps(zgain, x5, y5);
y6 = _mm512_fmadd_ps(zgain, x6, y6);
y7 = _mm512_fmadd_ps(zgain, x7, y7);
_mm512_store_ps(dst + 0, y0);
_mm512_store_ps(dst + 16, y1);
_mm512_store_ps(dst + 32, y2);
_mm512_store_ps(dst + 48, y3);
_mm512_store_ps(dst + 64, y4);
_mm512_store_ps(dst + 80, y5);
_mm512_store_ps(dst + 96, y6);
_mm512_store_ps(dst + 112, y7);
src += 128;
dst += 128;
frames -= 128;
}
// Process the remaining samples 16 at a time
while (frames >= 16) {
__m512 x = _mm512_load_ps(src);
__m512 y = _mm512_load_ps(dst);
y = _mm512_fmadd_ps(zgain, x, y);
_mm512_store_ps(dst, y);
src += 16;
dst += 16;
frames -= 16;
}
// Process remaining samples x8
while (frames >= 8) {
__m256 x = _mm256_load_ps(src);
__m256 y = _mm256_load_ps(dst);
y = _mm256_fmadd_ps(ygain, x, y);
_mm256_store_ps(dst, y);
src += 8;
dst += 8;
frames -= 8;
}
// Process remaining samples x4
while (frames >= 4) {
__m128 x = _mm_load_ps(src);
__m128 y = _mm_load_ps(dst);
y = _mm_fmadd_ps(xgain, x, y);
_mm_store_ps(dst, y);
src += 4;
dst += 4;
frames -= 4;
}
// Process remaining samples
while (frames > 0) {
__m128 x = _mm_load_ss(src);
__m128 y = _mm_load_ss(dst);
y = _mm_fmadd_ss(xgain, x, y);
_mm_store_ss(dst, y);
++src;
++dst;
--frames;
}
// There is a penalty when transitioning from AVX to SSE code. Avoid it by
// telling the CPU that the rest of the program no longer needs the upper
// portion of the YMM/ZMM registers.
_mm256_zeroupper(); // zero the upper halves of the YMM registers
}
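/*
 * Illustrative usage sketch (names are hypothetical): accumulate a scaled
 * source into a destination, i.e. out[i] += 0.5f * in[i]:
 *
 *   x86_avx512f_mix_buffers_with_gain(out, in, n, 0.5f);
 */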
/**
* @brief x86-64 AVX-512F optimized routine for mixing a buffer without gain
* @param[in,out] dst Pointer to the destination buffer, which is updated in place
* @param[in] src Pointer to the source buffer (not modified)
* @param nframes Number of samples to process
*/
void
x86_avx512f_mix_buffers_no_gain(float *dst, const float *src, uint32_t nframes)
{
// Convert to signed integer to prevent any arithmetic overflow errors
int32_t frames = static_cast<int32_t>(nframes);
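// Process samples one/4/8 at a time until both src and dst reach 64-byte
// alignment (or the buffer is exhausted)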
while (frames > 0)
{
if (IS_ALIGNED_TO(src, sizeof(__m512)) &&
IS_ALIGNED_TO(dst, sizeof(__m512))) {
break;
}
if (frames >= 8 &&
IS_ALIGNED_TO(src, sizeof(__m256)) &&
IS_ALIGNED_TO(dst, sizeof(__m256))) {
__m256 x = _mm256_load_ps(src);
__m256 y = _mm256_load_ps(dst);
y = _mm256_add_ps(x, y);
_mm256_store_ps(dst, y);
src += 8;
dst += 8;
frames -= 8;
continue;
}
if (frames >= 4 &&
IS_ALIGNED_TO(src, sizeof(__m128)) &&
IS_ALIGNED_TO(dst, sizeof(__m128))) {
__m128 x = _mm_load_ps(src);
__m128 y = _mm_load_ps(dst);
y = _mm_add_ps(x, y);
_mm_store_ps(dst, y);
src += 4;
dst += 4;
frames -= 4;
continue;
}
// Pointers are aligned to float boundaries (4 bytes)
__m128 x = _mm_load_ss(src);
__m128 y = _mm_load_ss(dst);
y = _mm_add_ss(x, y);
_mm_store_ss(dst, y);
++src;
++dst;
--frames;
}
// Process the remaining samples 128 at a time
while (frames >= 128) {
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
_mm_prefetch(reinterpret_cast<char const *>(src + 128), _MM_HINT_NTA);
_mm_prefetch(reinterpret_cast<char const *>(dst + 128), _MM_HINT_NTA);
#else
__builtin_prefetch(reinterpret_cast<void const *>(src + 128), 0, 0);
__builtin_prefetch(reinterpret_cast<void const *>(dst + 128), 0, 0);
#endif
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
__m512 x4 = _mm512_load_ps(src + 64);
__m512 x5 = _mm512_load_ps(src + 80);
__m512 x6 = _mm512_load_ps(src + 96);
__m512 x7 = _mm512_load_ps(src + 112);
__m512 y0 = _mm512_load_ps(dst + 0);
__m512 y1 = _mm512_load_ps(dst + 16);
__m512 y2 = _mm512_load_ps(dst + 32);
__m512 y3 = _mm512_load_ps(dst + 48);
__m512 y4 = _mm512_load_ps(dst + 64);
__m512 y5 = _mm512_load_ps(dst + 80);
__m512 y6 = _mm512_load_ps(dst + 96);
__m512 y7 = _mm512_load_ps(dst + 112);
y0 = _mm512_add_ps(x0, y0);
y1 = _mm512_add_ps(x1, y1);
y2 = _mm512_add_ps(x2, y2);
y3 = _mm512_add_ps(x3, y3);
y4 = _mm512_add_ps(x4, y4);
y5 = _mm512_add_ps(x5, y5);
y6 = _mm512_add_ps(x6, y6);
y7 = _mm512_add_ps(x7, y7);
_mm512_store_ps(dst + 0, y0);
_mm512_store_ps(dst + 16, y1);
_mm512_store_ps(dst + 32, y2);
_mm512_store_ps(dst + 48, y3);
_mm512_store_ps(dst + 64, y4);
_mm512_store_ps(dst + 80, y5);
_mm512_store_ps(dst + 96, y6);
_mm512_store_ps(dst + 112, y7);
src += 128;
dst += 128;
frames -= 128;
}
// Process the remaining samples 16 at a time
while (frames >= 16) {
__m512 x = _mm512_load_ps(src);
__m512 y = _mm512_load_ps(dst);
y = _mm512_add_ps(x, y);
_mm512_store_ps(dst, y);
src += 16;
dst += 16;
frames -= 16;
}
// Process remaining samples x8
while (frames >= 8) {
__m256 x = _mm256_load_ps(src);
__m256 y = _mm256_load_ps(dst);
y = _mm256_add_ps(x, y);
_mm256_store_ps(dst, y);
src += 8;
dst += 8;
frames -= 8;
}
// Process remaining samples x4
while (frames >= 4) {
__m128 x = _mm_load_ps(src);
__m128 y = _mm_load_ps(dst);
y = _mm_add_ps(x, y);
_mm_store_ps(dst, y);
src += 4;
dst += 4;
frames -= 4;
}
// Process remaining samples
while (frames > 0) {
__m128 x = _mm_load_ss(src);
__m128 y = _mm_load_ss(dst);
y = _mm_add_ss(x, y);
_mm_store_ss(dst, y);
++src;
++dst;
--frames;
}
// There is a penalty when transitioning from AVX to SSE code. Avoid it by
// telling the CPU that the rest of the program no longer needs the upper
// portion of the YMM/ZMM registers.
_mm256_zeroupper(); // zero the upper halves of the YMM registers
}
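/*
 * Illustrative usage sketch (names are hypothetical): plain accumulation,
 * out[i] += in[i]:
 *
 *   x86_avx512f_mix_buffers_no_gain(out, in, n);
 */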
/**
* @brief Copy a vector from one location to another
* @param[out] dst Pointer to the destination buffer
* @param[in] src Pointer to the source buffer
* @param nframes Number of samples to copy
*/
void
x86_avx512f_copy_vector(float *dst, const float *src, uint32_t nframes)
{
// Convert to signed integer to prevent any arithmetic overflow errors
int32_t frames = static_cast<int32_t>(nframes);
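// Process samples one/4/8 at a time until both src and dst reach 64-byte
// alignment (or the buffer is exhausted)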
while (frames > 0) {
if (IS_ALIGNED_TO(src, sizeof(__m512)) &&
IS_ALIGNED_TO(dst, sizeof(__m512))) {
break;
}
if (frames >= 8 &&
IS_ALIGNED_TO(src, sizeof(__m256)) &&
IS_ALIGNED_TO(dst, sizeof(__m256))) {
__m256 x = _mm256_load_ps(src);
_mm256_store_ps(dst, x);
src += 8;
dst += 8;
frames -= 8;
continue;
}
if (frames >= 4 &&
IS_ALIGNED_TO(src, sizeof(__m128)) &&
IS_ALIGNED_TO(dst, sizeof(__m128))) {
__m128 x = _mm_load_ps(src);
_mm_store_ps(dst, x);
src += 4;
dst += 4;
frames -= 4;
continue;
}
// Pointers are aligned to float boundaries (4 bytes)
__m128 x = _mm_load_ss(src);
_mm_store_ss(dst, x);
++src;
++dst;
--frames;
}
// Process 256 samples at a time
while (frames >= 256) {
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
_mm_prefetch(reinterpret_cast<char const *>(src + 256), _MM_HINT_NTA);
_mm_prefetch(reinterpret_cast<char const *>(dst + 256), _MM_HINT_NTA);
#else
__builtin_prefetch(reinterpret_cast<void const *>(src + 256), 0, 0);
__builtin_prefetch(reinterpret_cast<void const *>(dst + 256), 0, 0);
#endif
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
__m512 x4 = _mm512_load_ps(src + 64);
__m512 x5 = _mm512_load_ps(src + 80);
__m512 x6 = _mm512_load_ps(src + 96);
__m512 x7 = _mm512_load_ps(src + 112);
__m512 x8 = _mm512_load_ps(src + 128);
__m512 x9 = _mm512_load_ps(src + 144);
__m512 x10 = _mm512_load_ps(src + 160);
__m512 x11 = _mm512_load_ps(src + 176);
__m512 x12 = _mm512_load_ps(src + 192);
__m512 x13 = _mm512_load_ps(src + 208);
__m512 x14 = _mm512_load_ps(src + 224);
__m512 x15 = _mm512_load_ps(src + 240);
_mm512_store_ps(dst + 0, x0);
_mm512_store_ps(dst + 16, x1);
_mm512_store_ps(dst + 32, x2);
_mm512_store_ps(dst + 48, x3);
_mm512_store_ps(dst + 64, x4);
_mm512_store_ps(dst + 80, x5);
_mm512_store_ps(dst + 96, x6);
_mm512_store_ps(dst + 112, x7);
_mm512_store_ps(dst + 128, x8);
_mm512_store_ps(dst + 144, x9);
_mm512_store_ps(dst + 160, x10);
_mm512_store_ps(dst + 176, x11);
_mm512_store_ps(dst + 192, x12);
_mm512_store_ps(dst + 208, x13);
_mm512_store_ps(dst + 224, x14);
_mm512_store_ps(dst + 240, x15);
src += 256;
dst += 256;
frames -= 256;
}
// Process remaining samples 64 at a time
while (frames >= 64) {
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
_mm_prefetch(reinterpret_cast<char const *>(src + 64), _MM_HINT_NTA);
_mm_prefetch(reinterpret_cast<char const *>(dst + 64), _MM_HINT_NTA);
#else
__builtin_prefetch(reinterpret_cast<void const *>(src + 64), 0, 0);
__builtin_prefetch(reinterpret_cast<void const *>(dst + 64), 0, 0);
#endif
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
_mm512_store_ps(dst + 0, x0);
_mm512_store_ps(dst + 16, x1);
_mm512_store_ps(dst + 32, x2);
_mm512_store_ps(dst + 48, x3);
src += 64;
dst += 64;
frames -= 64;
}
// Process remaining samples 16 at a time
while (frames >= 16) {
__m512 x = _mm512_load_ps(src);
_mm512_store_ps(dst, x);
src += 16;
dst += 16;
frames -= 16;
}
// Process remaining samples x8
while (frames >= 8) {
__m256 x = _mm256_load_ps(src);
_mm256_store_ps(dst, x);
src += 8;
dst += 8;
frames -= 8;
}
// Process remaining samples x4
while (frames >= 4) {
__m128 x = _mm_load_ps(src);
_mm_store_ps(dst, x);
src += 4;
dst += 4;
frames -= 4;
}
// Process remaining samples
while (frames > 0) {
__m128 x = _mm_load_ss(src);
_mm_store_ss(dst, x);
++src;
++dst;
--frames;
}
// There is a penalty when transitioning from AVX to SSE code. Avoid it by
// telling the CPU that the rest of the program no longer needs the upper
// portion of the YMM/ZMM registers.
_mm256_zeroupper(); // zero the upper halves of the YMM registers
}
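/*
 * Illustrative usage sketch (names are hypothetical): straight copy,
 * out[i] = in[i], equivalent to memcpy on a float buffer:
 *
 *   x86_avx512f_copy_vector(out, in, n);
 */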
#endif // FPU_AVX512F_SUPPORT