ardour/libs/ardour/x86_functions_avx512f.cc

/*
* Copyright (C) 2023 Ayan Shafqat <ayan.x.shafqat@gmail.com>
* Copyright (C) 2024 Robin Gareus <robin@gareus.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#ifdef FPU_AVX512F_SUPPORT
#include "ardour/mix.h"
#include <immintrin.h>
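// Evaluates to true when "ptr" is aligned to a "bytes"-byte boundary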
#define IS_ALIGNED_TO(ptr, bytes) \
(reinterpret_cast<uintptr_t>(ptr) % (bytes) == 0)
#ifndef __AVX512F__
#error "__AVX512F__ must be enabled for this module to work"
#endif
/**
* @brief x86-64 AVX-512F optimized routine for the compute-peak procedure
* @param src Pointer to the source buffer
* @param nframes Number of frames to process
* @param current Current peak value
* @return New peak value
*/
float
x86_avx512f_compute_peak(const float *src, uint32_t nframes, float current)
{
// Convert to signed integer to prevent any arithmetic overflow errors
int32_t frames = static_cast<int32_t>(nframes);
// Broadcast the current peak value to all elements of the ZMM register
__m512 zmax = _mm512_set1_ps(current);
// Process 1/4/8 samples at a time until the source pointer reaches 64-byte alignment
while (frames > 0) {
if (IS_ALIGNED_TO(src, sizeof(__m512))) {
break;
}
if (frames >= 8 && IS_ALIGNED_TO(src, sizeof(__m256))) {
// Use a broadcast load so that every lane of the ZMM register holds valid
// data (a plain cast would leave the upper lanes undefined)
__m512 x = _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castps_pd(_mm256_load_ps(src))));
x = _mm512_abs_ps(x);
zmax = _mm512_max_ps(zmax, x);
src += 8;
frames -= 8;
continue;
}
if (frames >= 4 && IS_ALIGNED_TO(src, sizeof(__m128))) {
__m512 x = _mm512_broadcast_f32x4(_mm_load_ps(src));
x = _mm512_abs_ps(x);
zmax = _mm512_max_ps(zmax, x);
src += 4;
frames -= 4;
continue;
}
// Pointers are aligned to float boundaries (4 bytes)
__m512 x = _mm512_broadcastss_ps(_mm_load_ss(src));
x = _mm512_abs_ps(x);
zmax = _mm512_max_ps(zmax, x);
++src;
--frames;
}
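// At this point either frames == 0 or src is 64-byte aligned, so the aligned
// ZMM loads below are safe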
while (frames >= 256) {
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
_mm_prefetch(reinterpret_cast<char const *>(src + 256), _MM_HINT_NTA);
#else
__builtin_prefetch(reinterpret_cast<void const *>(src + 256), 0, 0);
#endif
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
__m512 x4 = _mm512_load_ps(src + 64);
__m512 x5 = _mm512_load_ps(src + 80);
__m512 x6 = _mm512_load_ps(src + 96);
__m512 x7 = _mm512_load_ps(src + 112);
__m512 x8 = _mm512_load_ps(src + 128);
__m512 x9 = _mm512_load_ps(src + 144);
__m512 x10 = _mm512_load_ps(src + 160);
__m512 x11 = _mm512_load_ps(src + 176);
__m512 x12 = _mm512_load_ps(src + 192);
__m512 x13 = _mm512_load_ps(src + 208);
__m512 x14 = _mm512_load_ps(src + 224);
__m512 x15 = _mm512_load_ps(src + 240);
x0 = _mm512_abs_ps(x0);
x1 = _mm512_abs_ps(x1);
x2 = _mm512_abs_ps(x2);
x3 = _mm512_abs_ps(x3);
x4 = _mm512_abs_ps(x4);
x5 = _mm512_abs_ps(x5);
x6 = _mm512_abs_ps(x6);
x7 = _mm512_abs_ps(x7);
x8 = _mm512_abs_ps(x8);
x9 = _mm512_abs_ps(x9);
x10 = _mm512_abs_ps(x10);
x11 = _mm512_abs_ps(x11);
x12 = _mm512_abs_ps(x12);
x13 = _mm512_abs_ps(x13);
x14 = _mm512_abs_ps(x14);
x15 = _mm512_abs_ps(x15);
zmax = _mm512_max_ps(zmax, x0);
zmax = _mm512_max_ps(zmax, x1);
zmax = _mm512_max_ps(zmax, x2);
zmax = _mm512_max_ps(zmax, x3);
zmax = _mm512_max_ps(zmax, x4);
zmax = _mm512_max_ps(zmax, x5);
zmax = _mm512_max_ps(zmax, x6);
zmax = _mm512_max_ps(zmax, x7);
zmax = _mm512_max_ps(zmax, x8);
zmax = _mm512_max_ps(zmax, x9);
zmax = _mm512_max_ps(zmax, x10);
zmax = _mm512_max_ps(zmax, x11);
zmax = _mm512_max_ps(zmax, x12);
zmax = _mm512_max_ps(zmax, x13);
zmax = _mm512_max_ps(zmax, x14);
zmax = _mm512_max_ps(zmax, x15);
src += 256;
frames -= 256;
}
while (frames >= 128) {
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
_mm_prefetch(reinterpret_cast<char const *>(src + 128), _MM_HINT_NTA);
#else
__builtin_prefetch(reinterpret_cast<void const *>(src + 128), 0, 0);
#endif
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
__m512 x4 = _mm512_load_ps(src + 64);
__m512 x5 = _mm512_load_ps(src + 80);
__m512 x6 = _mm512_load_ps(src + 96);
__m512 x7 = _mm512_load_ps(src + 112);
x0 = _mm512_abs_ps(x0);
x1 = _mm512_abs_ps(x1);
x2 = _mm512_abs_ps(x2);
x3 = _mm512_abs_ps(x3);
x4 = _mm512_abs_ps(x4);
x5 = _mm512_abs_ps(x5);
x6 = _mm512_abs_ps(x6);
x7 = _mm512_abs_ps(x7);
zmax = _mm512_max_ps(zmax, x0);
zmax = _mm512_max_ps(zmax, x1);
zmax = _mm512_max_ps(zmax, x2);
zmax = _mm512_max_ps(zmax, x3);
zmax = _mm512_max_ps(zmax, x4);
zmax = _mm512_max_ps(zmax, x5);
zmax = _mm512_max_ps(zmax, x6);
zmax = _mm512_max_ps(zmax, x7);
src += 128;
frames -= 128;
}
while (frames >= 64) {
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
_mm_prefetch(reinterpret_cast<char const *>(src + 64), _MM_HINT_NTA);
#else
__builtin_prefetch(reinterpret_cast<void const *>(src + 64), 0, 0);
#endif
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
x0 = _mm512_abs_ps(x0);
x1 = _mm512_abs_ps(x1);
x2 = _mm512_abs_ps(x2);
x3 = _mm512_abs_ps(x3);
zmax = _mm512_max_ps(zmax, x0);
zmax = _mm512_max_ps(zmax, x1);
zmax = _mm512_max_ps(zmax, x2);
zmax = _mm512_max_ps(zmax, x3);
src += 64;
frames -= 64;
}
// Process the remaining samples 16 at a time
while (frames >= 16) {
__m512 x = _mm512_load_ps(src);
x = _mm512_abs_ps(x);
zmax = _mm512_max_ps(zmax, x);
src += 16;
frames -= 16;
}
// Process the remaining samples 8 at a time
while (frames >= 8) {
__m512 x = _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castps_pd(_mm256_load_ps(src))));
x = _mm512_abs_ps(x);
zmax = _mm512_max_ps(zmax, x);
src += 8;
frames -= 8;
}
// Process the remaining samples 4 at a time
while (frames >= 4) {
__m512 x = _mm512_broadcast_f32x4(_mm_load_ps(src));
x = _mm512_abs_ps(x);
zmax = _mm512_max_ps(zmax, x);
src += 4;
frames -= 4;
}
// If there are still 1-3 samples left, process them one at a time
while (frames > 0) {
__m512 x = _mm512_broadcastss_ps(_mm_load_ss(src));
x = _mm512_abs_ps(x);
zmax = _mm512_max_ps(zmax, x);
++src;
--frames;
}
// Horizontally reduce the ZMM register to the final peak value
current = _mm512_reduce_max_ps(zmax);
// There is a penalty when transitioning from AVX to SSE code. Avoid it by
// telling the CPU that the rest of the program no longer needs the upper
// portion of the YMM/ZMM registers.
_mm256_zeroupper(); // zero the upper halves of the YMM registers
return current;
}
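/*
 * Illustrative usage sketch (buffer and count names are hypothetical, not part
 * of this file): given a float-aligned buffer "buf" holding "n" samples,
 *
 *   float peak = x86_avx512f_compute_peak(buf, n, 0.0f);
 *
 * returns max(0.0f, |buf[0]|, ..., |buf[n-1]|).
 */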
/**
* @brief x86-64 AVX-512F optimized routine for the find-peaks procedure
* @param src Pointer to the source buffer
* @param nframes Number of frames to process
* @param[in,out] minf Current minimum value, updated in place
* @param[in,out] maxf Current maximum value, updated in place
*/
void
x86_avx512f_find_peaks(const float *src, uint32_t nframes, float *minf, float *maxf)
{
// Convert to signed integer to prevent any arithmetic overflow errors
int32_t frames = static_cast<int32_t>(nframes);
// Broadcast the current min and max values to all elements of the ZMM register
__m512 zmin = _mm512_set1_ps(*minf);
__m512 zmax = _mm512_set1_ps(*maxf);
// Process 1/4/8 samples at a time until the source pointer reaches 64-byte alignment
while (frames > 0) {
if (IS_ALIGNED_TO(src, sizeof(__m512))) {
break;
}
if (frames >= 8 && IS_ALIGNED_TO(src, sizeof(__m256))) {
// Use a broadcast load so that every lane of the ZMM register holds valid
// data (a plain cast would leave the upper lanes undefined and could
// corrupt the min/max accumulators)
__m512 x = _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castps_pd(_mm256_load_ps(src))));
zmin = _mm512_min_ps(zmin, x);
zmax = _mm512_max_ps(zmax, x);
src += 8;
frames -= 8;
continue;
}
if (frames >= 4 && IS_ALIGNED_TO(src, sizeof(__m128))) {
__m512 x = _mm512_broadcast_f32x4(_mm_load_ps(src));
zmin = _mm512_min_ps(zmin, x);
zmax = _mm512_max_ps(zmax, x);
src += 4;
frames -= 4;
continue;
}
// Pointers are aligned to float boundaries (4 bytes)
__m512 x = _mm512_broadcastss_ps(_mm_load_ss(src));
zmin = _mm512_min_ps(zmin, x);
zmax = _mm512_max_ps(zmax, x);
++src;
--frames;
}
while (frames >= 256) {
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
_mm_prefetch(reinterpret_cast<char const *>(src + 256), _MM_HINT_NTA);
#else
__builtin_prefetch(reinterpret_cast<void const *>(src + 256), 0, 0);
#endif
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
__m512 x4 = _mm512_load_ps(src + 64);
__m512 x5 = _mm512_load_ps(src + 80);
__m512 x6 = _mm512_load_ps(src + 96);
__m512 x7 = _mm512_load_ps(src + 112);
__m512 x8 = _mm512_load_ps(src + 128);
__m512 x9 = _mm512_load_ps(src + 144);
__m512 x10 = _mm512_load_ps(src + 160);
__m512 x11 = _mm512_load_ps(src + 176);
__m512 x12 = _mm512_load_ps(src + 192);
__m512 x13 = _mm512_load_ps(src + 208);
__m512 x14 = _mm512_load_ps(src + 224);
__m512 x15 = _mm512_load_ps(src + 240);
zmin = _mm512_min_ps(zmin, x0);
zmin = _mm512_min_ps(zmin, x1);
zmin = _mm512_min_ps(zmin, x2);
zmin = _mm512_min_ps(zmin, x3);
zmin = _mm512_min_ps(zmin, x4);
zmin = _mm512_min_ps(zmin, x5);
zmin = _mm512_min_ps(zmin, x6);
zmin = _mm512_min_ps(zmin, x7);
zmin = _mm512_min_ps(zmin, x8);
zmin = _mm512_min_ps(zmin, x9);
zmin = _mm512_min_ps(zmin, x10);
zmin = _mm512_min_ps(zmin, x11);
zmin = _mm512_min_ps(zmin, x12);
zmin = _mm512_min_ps(zmin, x13);
zmin = _mm512_min_ps(zmin, x14);
zmin = _mm512_min_ps(zmin, x15);
zmax = _mm512_max_ps(zmax, x0);
zmax = _mm512_max_ps(zmax, x1);
zmax = _mm512_max_ps(zmax, x2);
zmax = _mm512_max_ps(zmax, x3);
zmax = _mm512_max_ps(zmax, x4);
zmax = _mm512_max_ps(zmax, x5);
zmax = _mm512_max_ps(zmax, x6);
zmax = _mm512_max_ps(zmax, x7);
zmax = _mm512_max_ps(zmax, x8);
zmax = _mm512_max_ps(zmax, x9);
zmax = _mm512_max_ps(zmax, x10);
zmax = _mm512_max_ps(zmax, x11);
zmax = _mm512_max_ps(zmax, x12);
zmax = _mm512_max_ps(zmax, x13);
zmax = _mm512_max_ps(zmax, x14);
zmax = _mm512_max_ps(zmax, x15);
src += 256;
frames -= 256;
}
while (frames >= 128) {
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
_mm_prefetch(reinterpret_cast<char const *>(src + 128), _MM_HINT_NTA);
#else
__builtin_prefetch(reinterpret_cast<void const *>(src + 128), 0, 0);
#endif
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
__m512 x4 = _mm512_load_ps(src + 64);
__m512 x5 = _mm512_load_ps(src + 80);
__m512 x6 = _mm512_load_ps(src + 96);
__m512 x7 = _mm512_load_ps(src + 112);
zmin = _mm512_min_ps(zmin, x0);
zmin = _mm512_min_ps(zmin, x1);
zmin = _mm512_min_ps(zmin, x2);
zmin = _mm512_min_ps(zmin, x3);
zmin = _mm512_min_ps(zmin, x4);
zmin = _mm512_min_ps(zmin, x5);
zmin = _mm512_min_ps(zmin, x6);
zmin = _mm512_min_ps(zmin, x7);
zmax = _mm512_max_ps(zmax, x0);
zmax = _mm512_max_ps(zmax, x1);
zmax = _mm512_max_ps(zmax, x2);
zmax = _mm512_max_ps(zmax, x3);
zmax = _mm512_max_ps(zmax, x4);
zmax = _mm512_max_ps(zmax, x5);
zmax = _mm512_max_ps(zmax, x6);
zmax = _mm512_max_ps(zmax, x7);
src += 128;
frames -= 128;
}
while (frames >= 64) {
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
_mm_prefetch(reinterpret_cast<char const *>(src + 64), _MM_HINT_NTA);
#else
__builtin_prefetch(reinterpret_cast<void const *>(src + 64), 0, 0);
#endif
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
zmin = _mm512_min_ps(zmin, x0);
zmin = _mm512_min_ps(zmin, x1);
zmin = _mm512_min_ps(zmin, x2);
zmin = _mm512_min_ps(zmin, x3);
zmax = _mm512_max_ps(zmax, x0);
zmax = _mm512_max_ps(zmax, x1);
zmax = _mm512_max_ps(zmax, x2);
zmax = _mm512_max_ps(zmax, x3);
src += 64;
frames -= 64;
}
// Process the remaining samples 16 at a time
while (frames >= 16) {
__m512 x = _mm512_load_ps(src);
zmin = _mm512_min_ps(zmin, x);
zmax = _mm512_max_ps(zmax, x);
src += 16;
frames -= 16;
}
// Process the remaining samples 8 at a time
while (frames >= 8) {
__m512 x = _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_castps_pd(_mm256_load_ps(src))));
zmin = _mm512_min_ps(zmin, x);
zmax = _mm512_max_ps(zmax, x);
src += 8;
frames -= 8;
}
// Process the remaining samples 4 at a time
while (frames >= 4) {
__m512 x = _mm512_broadcast_f32x4(_mm_load_ps(src));
zmin = _mm512_min_ps(zmin, x);
zmax = _mm512_max_ps(zmax, x);
src += 4;
frames -= 4;
}
// If there are still 1-3 samples left, process them one at a time
while (frames > 0) {
__m512 x = _mm512_broadcastss_ps(_mm_load_ss(src));
zmin = _mm512_min_ps(zmin, x);
zmax = _mm512_max_ps(zmax, x);
++src;
--frames;
}
// Horizontally reduce the ZMM registers to the final min and max values
*minf = _mm512_reduce_min_ps(zmin);
*maxf = _mm512_reduce_max_ps(zmax);
// There is a penalty when transitioning from AVX to SSE code. Avoid it by
// telling the CPU that the rest of the program no longer needs the upper
// portion of the YMM/ZMM registers.
_mm256_zeroupper(); // zero the upper halves of the YMM registers
}
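/*
 * Illustrative usage sketch (variable names are hypothetical): minf and maxf
 * are running accumulators and must be initialized by the caller, e.g.
 *
 *   float lo = buf[0], hi = buf[0];
 *   x86_avx512f_find_peaks(buf, n, &lo, &hi);
 *
 * leaves lo = min(lo, buf[0..n-1]) and hi = max(hi, buf[0..n-1]).
 */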
/**
* @brief x86-64 AVX-512F optimized routine to apply gain to a buffer
* @param[in,out] dst Pointer to the destination buffer, which is updated in place
* @param nframes Number of frames (or samples) to process
* @param gain Gain to apply
*/
void
x86_avx512f_apply_gain_to_buffer(float *dst, uint32_t nframes, float gain)
{
// Convert to signed integer to prevent any arithmetic overflow errors
int32_t frames = static_cast<int32_t>(nframes);
// Broadcast the gain to all elements of the ZMM register, and take XMM and
// YMM views of it; these alias the same register and serve the SSE, AVX,
// and AVX-512 code paths respectively
__m512 zgain = _mm512_set1_ps(gain);
__m256 ygain = _mm512_castps512_ps256(zgain);
__m128 xgain = _mm512_castps512_ps128(zgain);
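// Process samples one/4/8 at a time until dst reaches 64-byte alignment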
while (frames > 0) {
if (IS_ALIGNED_TO(dst, sizeof(__m512))) {
break;
}
if (frames >= 8 && IS_ALIGNED_TO(dst, sizeof(__m256))) {
__m256 x = _mm256_load_ps(dst);
__m256 y = _mm256_mul_ps(ygain, x);
_mm256_store_ps(dst, y);
dst += 8;
frames -= 8;
continue;
}
if (frames >= 4 && IS_ALIGNED_TO(dst, sizeof(__m128))) {
__m128 x = _mm_load_ps(dst);
__m128 y = _mm_mul_ps(xgain, x);
_mm_store_ps(dst, y);
dst += 4;
frames -= 4;
continue;
}
// Pointers are aligned to float boundaries (4 bytes)
__m128 x = _mm_load_ss(dst);
__m128 y = _mm_mul_ss(xgain, x);
_mm_store_ss(dst, y);
++dst;
--frames;
}
// Process the remaining samples 128 at a time
while (frames >= 128) {
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
_mm_prefetch(reinterpret_cast<char const *>(dst + 128), _MM_HINT_NTA);
#else
__builtin_prefetch(reinterpret_cast<void const *>(dst + 128), 0, 0);
#endif
__m512 x0 = _mm512_load_ps(dst + 0);
__m512 x1 = _mm512_load_ps(dst + 16);
__m512 x2 = _mm512_load_ps(dst + 32);
__m512 x3 = _mm512_load_ps(dst + 48);
__m512 x4 = _mm512_load_ps(dst + 64);
__m512 x5 = _mm512_load_ps(dst + 80);
__m512 x6 = _mm512_load_ps(dst + 96);
__m512 x7 = _mm512_load_ps(dst + 112);
__m512 y0 = _mm512_mul_ps(zgain, x0);
__m512 y1 = _mm512_mul_ps(zgain, x1);
__m512 y2 = _mm512_mul_ps(zgain, x2);
__m512 y3 = _mm512_mul_ps(zgain, x3);
__m512 y4 = _mm512_mul_ps(zgain, x4);
__m512 y5 = _mm512_mul_ps(zgain, x5);
__m512 y6 = _mm512_mul_ps(zgain, x6);
__m512 y7 = _mm512_mul_ps(zgain, x7);
_mm512_store_ps(dst + 0, y0);
_mm512_store_ps(dst + 16, y1);
_mm512_store_ps(dst + 32, y2);
_mm512_store_ps(dst + 48, y3);
_mm512_store_ps(dst + 64, y4);
_mm512_store_ps(dst + 80, y5);
_mm512_store_ps(dst + 96, y6);
_mm512_store_ps(dst + 112, y7);
dst += 128;
frames -= 128;
}
// Process the remaining samples 16 at a time
while (frames >= 16) {
__m512 x = _mm512_load_ps(dst);
__m512 y = _mm512_mul_ps(zgain, x);
_mm512_store_ps(dst, y);
dst += 16;
frames -= 16;
}
// Process remaining samples x8
while (frames >= 8) {
__m256 x = _mm256_load_ps(dst);
__m256 y = _mm256_mul_ps(ygain, x);
_mm256_store_ps(dst, y);
dst += 8;
frames -= 8;
}
// Process remaining samples x4
while (frames >= 4) {
__m128 x = _mm_load_ps(dst);
__m128 y = _mm_mul_ps(xgain, x);
_mm_store_ps(dst, y);
dst += 4;
frames -= 4;
}
// Process remaining samples
while (frames > 0) {
__m128 x = _mm_load_ss(dst);
__m128 y = _mm_mul_ss(xgain, x);
_mm_store_ss(dst, y);
++dst;
--frames;
}
// There is a penalty when transitioning from AVX to SSE code. Avoid it by
// telling the CPU that the rest of the program no longer needs the upper
// portion of the YMM/ZMM registers.
_mm256_zeroupper(); // zero the upper halves of the YMM registers
}
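/*
 * Illustrative usage sketch (names are hypothetical): scale a buffer in place,
 * i.e. buf[i] *= gain, for example attenuating by about 6 dB:
 *
 *   x86_avx512f_apply_gain_to_buffer(buf, n, 0.5f);
 */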
/**
* @brief x86-64 AVX-512F optimized routine for mixing a buffer with gain
* @param[in,out] dst Pointer to the destination buffer, which is updated in place
* @param[in] src Pointer to the source buffer (not modified)
* @param nframes Number of samples to process
* @param gain Gain to apply to the source
*/
void
x86_avx512f_mix_buffers_with_gain(float *dst, const float *src, uint32_t nframes, float gain)
{
// Convert to signed integer to prevent any arithmetic overflow errors
int32_t frames = static_cast<int32_t>(nframes);
// Broadcast the gain to all elements of the ZMM register, and take XMM and
// YMM views of it; these alias the same register and serve the SSE, AVX,
// and AVX-512 code paths respectively
__m512 zgain = _mm512_set1_ps(gain);
__m256 ygain = _mm512_castps512_ps256(zgain);
__m128 xgain = _mm512_castps512_ps128(zgain);
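// Process samples one/4/8 at a time until both src and dst reach 64-byte
// alignment (or the buffer is exhausted)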
while (frames > 0)
{
if (IS_ALIGNED_TO(src, sizeof(__m512)) &&
IS_ALIGNED_TO(dst, sizeof(__m512))) {
break;
}
if (frames >= 8 &&
IS_ALIGNED_TO(src, sizeof(__m256)) &&
IS_ALIGNED_TO(dst, sizeof(__m256))) {
__m256 x = _mm256_load_ps(src);
__m256 y = _mm256_load_ps(dst);
y = _mm256_fmadd_ps(ygain, x, y);
_mm256_store_ps(dst, y);
src += 8;
dst += 8;
frames -= 8;
continue;
}
if (frames >= 4 &&
IS_ALIGNED_TO(src, sizeof(__m128)) &&
IS_ALIGNED_TO(dst, sizeof(__m128))) {
__m128 x = _mm_load_ps(src);
__m128 y = _mm_load_ps(dst);
y = _mm_fmadd_ps(xgain, x, y);
_mm_store_ps(dst, y);
src += 4;
dst += 4;
frames -= 4;
continue;
}
// Pointers are aligned to float boundaries (4 bytes)
__m128 x = _mm_load_ss(src);
__m128 y = _mm_load_ss(dst);
y = _mm_fmadd_ss(xgain, x, y);
_mm_store_ss(dst, y);
++src;
++dst;
--frames;
}
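// At this point either frames == 0 or both src and dst are 64-byte aligned,
// so the aligned ZMM loads and stores below are safe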
// Process the remaining samples 128 at a time
while (frames >= 128) {
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
_mm_prefetch(reinterpret_cast<char const *>(src + 128), _MM_HINT_NTA);
_mm_prefetch(reinterpret_cast<char const *>(dst + 128), _MM_HINT_NTA);
#else
__builtin_prefetch(reinterpret_cast<void const *>(src + 128), 0, 0);
__builtin_prefetch(reinterpret_cast<void const *>(dst + 128), 0, 0);
#endif
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
__m512 x4 = _mm512_load_ps(src + 64);
__m512 x5 = _mm512_load_ps(src + 80);
__m512 x6 = _mm512_load_ps(src + 96);
__m512 x7 = _mm512_load_ps(src + 112);
__m512 y0 = _mm512_load_ps(dst + 0);
__m512 y1 = _mm512_load_ps(dst + 16);
__m512 y2 = _mm512_load_ps(dst + 32);
__m512 y3 = _mm512_load_ps(dst + 48);
__m512 y4 = _mm512_load_ps(dst + 64);
__m512 y5 = _mm512_load_ps(dst + 80);
__m512 y6 = _mm512_load_ps(dst + 96);
__m512 y7 = _mm512_load_ps(dst + 112);
y0 = _mm512_fmadd_ps(zgain, x0, y0);
y1 = _mm512_fmadd_ps(zgain, x1, y1);
y2 = _mm512_fmadd_ps(zgain, x2, y2);
y3 = _mm512_fmadd_ps(zgain, x3, y3);
y4 = _mm512_fmadd_ps(zgain, x4, y4);
y5 = _mm512_fmadd_ps(zgain, x5, y5);
y6 = _mm512_fmadd_ps(zgain, x6, y6);
y7 = _mm512_fmadd_ps(zgain, x7, y7);
_mm512_store_ps(dst + 0, y0);
_mm512_store_ps(dst + 16, y1);
_mm512_store_ps(dst + 32, y2);
_mm512_store_ps(dst + 48, y3);
_mm512_store_ps(dst + 64, y4);
_mm512_store_ps(dst + 80, y5);
_mm512_store_ps(dst + 96, y6);
_mm512_store_ps(dst + 112, y7);
src += 128;
dst += 128;
frames -= 128;
}
// Process the remaining samples 16 at a time
while (frames >= 16) {
__m512 x = _mm512_load_ps(src);
__m512 y = _mm512_load_ps(dst);
y = _mm512_fmadd_ps(zgain, x, y);
_mm512_store_ps(dst, y);
src += 16;
dst += 16;
frames -= 16;
}
// Process remaining samples x8
while (frames >= 8) {
__m256 x = _mm256_load_ps(src);
__m256 y = _mm256_load_ps(dst);
y = _mm256_fmadd_ps(ygain, x, y);
_mm256_store_ps(dst, y);
src += 8;
dst += 8;
frames -= 8;
}
// Process remaining samples x4
while (frames >= 4) {
__m128 x = _mm_load_ps(src);
__m128 y = _mm_load_ps(dst);
y = _mm_fmadd_ps(xgain, x, y);
_mm_store_ps(dst, y);
src += 4;
dst += 4;
frames -= 4;
}
// Process remaining samples
while (frames > 0) {
__m128 x = _mm_load_ss(src);
__m128 y = _mm_load_ss(dst);
y = _mm_fmadd_ss(xgain, x, y);
_mm_store_ss(dst, y);
++src;
++dst;
--frames;
}
// There is a penalty when transitioning from AVX to SSE code. Avoid it by
// telling the CPU that the rest of the program no longer needs the upper
// portion of the YMM/ZMM registers.
_mm256_zeroupper(); // zero the upper halves of the YMM registers
}
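/*
 * Illustrative usage sketch (names are hypothetical): accumulate a scaled
 * source into a destination, i.e. out[i] += 0.5f * in[i]:
 *
 *   x86_avx512f_mix_buffers_with_gain(out, in, n, 0.5f);
 */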
/**
* @brief x86-64 AVX-512F optimized routine for mixing a buffer without gain
* @param[in,out] dst Pointer to the destination buffer, which is updated in place
* @param[in] src Pointer to the source buffer (not modified)
* @param nframes Number of samples to process
*/
void
x86_avx512f_mix_buffers_no_gain(float *dst, const float *src, uint32_t nframes)
{
// Convert to signed integer to prevent any arithmetic overflow errors
int32_t frames = static_cast<int32_t>(nframes);
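// Process samples one/4/8 at a time until both src and dst reach 64-byte
// alignment (or the buffer is exhausted)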
while (frames > 0)
{
if (IS_ALIGNED_TO(src, sizeof(__m512)) &&
IS_ALIGNED_TO(dst, sizeof(__m512))) {
break;
}
if (frames >= 8 &&
IS_ALIGNED_TO(src, sizeof(__m256)) &&
IS_ALIGNED_TO(dst, sizeof(__m256))) {
__m256 x = _mm256_load_ps(src);
__m256 y = _mm256_load_ps(dst);
y = _mm256_add_ps(x, y);
_mm256_store_ps(dst, y);
src += 8;
dst += 8;
frames -= 8;
continue;
}
if (frames >= 4 &&
IS_ALIGNED_TO(src, sizeof(__m128)) &&
IS_ALIGNED_TO(dst, sizeof(__m128))) {
__m128 x = _mm_load_ps(src);
__m128 y = _mm_load_ps(dst);
y = _mm_add_ps(x, y);
_mm_store_ps(dst, y);
src += 4;
dst += 4;
frames -= 4;
continue;
}
// Pointers are aligned to float boundaries (4 bytes)
__m128 x = _mm_load_ss(src);
__m128 y = _mm_load_ss(dst);
y = _mm_add_ss(x, y);
_mm_store_ss(dst, y);
++src;
++dst;
--frames;
}
// Process the remaining samples 128 at a time
while (frames >= 128) {
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
_mm_prefetch(reinterpret_cast<char const *>(src + 128), _MM_HINT_NTA);
_mm_prefetch(reinterpret_cast<char const *>(dst + 128), _MM_HINT_NTA);
#else
__builtin_prefetch(reinterpret_cast<void const *>(src + 128), 0, 0);
__builtin_prefetch(reinterpret_cast<void const *>(dst + 128), 0, 0);
#endif
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
__m512 x4 = _mm512_load_ps(src + 64);
__m512 x5 = _mm512_load_ps(src + 80);
__m512 x6 = _mm512_load_ps(src + 96);
__m512 x7 = _mm512_load_ps(src + 112);
__m512 y0 = _mm512_load_ps(dst + 0);
__m512 y1 = _mm512_load_ps(dst + 16);
__m512 y2 = _mm512_load_ps(dst + 32);
__m512 y3 = _mm512_load_ps(dst + 48);
__m512 y4 = _mm512_load_ps(dst + 64);
__m512 y5 = _mm512_load_ps(dst + 80);
__m512 y6 = _mm512_load_ps(dst + 96);
__m512 y7 = _mm512_load_ps(dst + 112);
y0 = _mm512_add_ps(x0, y0);
y1 = _mm512_add_ps(x1, y1);
y2 = _mm512_add_ps(x2, y2);
y3 = _mm512_add_ps(x3, y3);
y4 = _mm512_add_ps(x4, y4);
y5 = _mm512_add_ps(x5, y5);
y6 = _mm512_add_ps(x6, y6);
y7 = _mm512_add_ps(x7, y7);
_mm512_store_ps(dst + 0, y0);
_mm512_store_ps(dst + 16, y1);
_mm512_store_ps(dst + 32, y2);
_mm512_store_ps(dst + 48, y3);
_mm512_store_ps(dst + 64, y4);
_mm512_store_ps(dst + 80, y5);
_mm512_store_ps(dst + 96, y6);
_mm512_store_ps(dst + 112, y7);
src += 128;
dst += 128;
frames -= 128;
}
// Process the remaining samples 16 at a time
while (frames >= 16) {
__m512 x = _mm512_load_ps(src);
__m512 y = _mm512_load_ps(dst);
y = _mm512_add_ps(x, y);
_mm512_store_ps(dst, y);
src += 16;
dst += 16;
frames -= 16;
}
// Process remaining samples x8
while (frames >= 8) {
__m256 x = _mm256_load_ps(src);
__m256 y = _mm256_load_ps(dst);
y = _mm256_add_ps(x, y);
_mm256_store_ps(dst, y);
src += 8;
dst += 8;
frames -= 8;
}
// Process remaining samples x4
while (frames >= 4) {
__m128 x = _mm_load_ps(src);
__m128 y = _mm_load_ps(dst);
y = _mm_add_ps(x, y);
_mm_store_ps(dst, y);
src += 4;
dst += 4;
frames -= 4;
}
// Process remaining samples
while (frames > 0) {
__m128 x = _mm_load_ss(src);
__m128 y = _mm_load_ss(dst);
y = _mm_add_ss(x, y);
_mm_store_ss(dst, y);
++src;
++dst;
--frames;
}
// There is a penalty when transitioning from AVX to SSE code. Avoid it by
// telling the CPU that the rest of the program no longer needs the upper
// portion of the YMM/ZMM registers.
_mm256_zeroupper(); // zero the upper halves of the YMM registers
}
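/*
 * Illustrative usage sketch (names are hypothetical): plain accumulation,
 * out[i] += in[i]:
 *
 *   x86_avx512f_mix_buffers_no_gain(out, in, n);
 */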
/**
* @brief Copy a vector from one location to another
* @param[out] dst Pointer to the destination buffer
* @param[in] src Pointer to the source buffer
* @param nframes Number of samples to copy
*/
void
x86_avx512f_copy_vector(float *dst, const float *src, uint32_t nframes)
{
// Convert to signed integer to prevent any arithmetic overflow errors
int32_t frames = static_cast<int32_t>(nframes);
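// Process samples one/4/8 at a time until both src and dst reach 64-byte
// alignment (or the buffer is exhausted)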
while (frames > 0) {
if (IS_ALIGNED_TO(src, sizeof(__m512)) &&
IS_ALIGNED_TO(dst, sizeof(__m512))) {
break;
}
if (frames >= 8 &&
IS_ALIGNED_TO(src, sizeof(__m256)) &&
IS_ALIGNED_TO(dst, sizeof(__m256))) {
__m256 x = _mm256_load_ps(src);
_mm256_store_ps(dst, x);
src += 8;
dst += 8;
frames -= 8;
continue;
}
if (frames >= 4 &&
IS_ALIGNED_TO(src, sizeof(__m128)) &&
IS_ALIGNED_TO(dst, sizeof(__m128))) {
__m128 x = _mm_load_ps(src);
_mm_store_ps(dst, x);
src += 4;
dst += 4;
frames -= 4;
continue;
}
// Pointers are aligned to float boundaries (4 bytes)
__m128 x = _mm_load_ss(src);
_mm_store_ss(dst, x);
++src;
++dst;
--frames;
}
// Process 256 samples at a time
while (frames >= 256) {
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
_mm_prefetch(reinterpret_cast<char const *>(src + 256), _MM_HINT_NTA);
_mm_prefetch(reinterpret_cast<char const *>(dst + 256), _MM_HINT_NTA);
#else
__builtin_prefetch(reinterpret_cast<void const *>(src + 256), 0, 0);
__builtin_prefetch(reinterpret_cast<void const *>(dst + 256), 0, 0);
#endif
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
__m512 x4 = _mm512_load_ps(src + 64);
__m512 x5 = _mm512_load_ps(src + 80);
__m512 x6 = _mm512_load_ps(src + 96);
__m512 x7 = _mm512_load_ps(src + 112);
__m512 x8 = _mm512_load_ps(src + 128);
__m512 x9 = _mm512_load_ps(src + 144);
__m512 x10 = _mm512_load_ps(src + 160);
__m512 x11 = _mm512_load_ps(src + 176);
__m512 x12 = _mm512_load_ps(src + 192);
__m512 x13 = _mm512_load_ps(src + 208);
__m512 x14 = _mm512_load_ps(src + 224);
__m512 x15 = _mm512_load_ps(src + 240);
_mm512_store_ps(dst + 0, x0);
_mm512_store_ps(dst + 16, x1);
_mm512_store_ps(dst + 32, x2);
_mm512_store_ps(dst + 48, x3);
_mm512_store_ps(dst + 64, x4);
_mm512_store_ps(dst + 80, x5);
_mm512_store_ps(dst + 96, x6);
_mm512_store_ps(dst + 112, x7);
_mm512_store_ps(dst + 128, x8);
_mm512_store_ps(dst + 144, x9);
_mm512_store_ps(dst + 160, x10);
_mm512_store_ps(dst + 176, x11);
_mm512_store_ps(dst + 192, x12);
_mm512_store_ps(dst + 208, x13);
_mm512_store_ps(dst + 224, x14);
_mm512_store_ps(dst + 240, x15);
src += 256;
dst += 256;
frames -= 256;
}
// Process remaining samples 64 at a time
while (frames >= 64) {
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
_mm_prefetch(reinterpret_cast<char const *>(src + 64), _MM_HINT_NTA);
_mm_prefetch(reinterpret_cast<char const *>(dst + 64), _MM_HINT_NTA);
#else
__builtin_prefetch(reinterpret_cast<void const *>(src + 64), 0, 0);
__builtin_prefetch(reinterpret_cast<void const *>(dst + 64), 0, 0);
#endif
__m512 x0 = _mm512_load_ps(src + 0);
__m512 x1 = _mm512_load_ps(src + 16);
__m512 x2 = _mm512_load_ps(src + 32);
__m512 x3 = _mm512_load_ps(src + 48);
_mm512_store_ps(dst + 0, x0);
_mm512_store_ps(dst + 16, x1);
_mm512_store_ps(dst + 32, x2);
_mm512_store_ps(dst + 48, x3);
src += 64;
dst += 64;
frames -= 64;
}
// Process remaining samples 16 at a time
while (frames >= 16) {
__m512 x = _mm512_load_ps(src);
_mm512_store_ps(dst, x);
src += 16;
dst += 16;
frames -= 16;
}
// Process remaining samples x8
while (frames >= 8) {
__m256 x = _mm256_load_ps(src);
_mm256_store_ps(dst, x);
src += 8;
dst += 8;
frames -= 8;
}
// Process remaining samples x4
while (frames >= 4) {
__m128 x = _mm_load_ps(src);
_mm_store_ps(dst, x);
src += 4;
dst += 4;
frames -= 4;
}
// Process remaining samples
while (frames > 0) {
__m128 x = _mm_load_ss(src);
_mm_store_ss(dst, x);
++src;
++dst;
--frames;
}
// There is a penalty when transitioning from AVX to SSE code. Avoid it by
// telling the CPU that the rest of the program no longer needs the upper
// portion of the YMM/ZMM registers.
_mm256_zeroupper(); // zero the upper halves of the YMM registers
}
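/*
 * Illustrative usage sketch (names are hypothetical): straight copy,
 * out[i] = in[i], equivalent to memcpy on a float buffer:
 *
 *   x86_avx512f_copy_vector(out, in, n);
 */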
#endif // FPU_AVX512F_SUPPORT