From 03abc1076eaef4057c089eda28e69e1d26bf7455 Mon Sep 17 00:00:00 2001
From: Ayan Shafqat
Date: Sun, 9 Aug 2020 16:10:40 -0400
Subject: [PATCH] Adding AVX optimized routines for Linux

This commit adds AVX optimized routines for the following procedures:

*_compute_peak
*_find_peaks
*_apply_gain_to_buffer
*_mix_buffers_with_gain
*_mix_buffers_no_gain

The AVX optimized routines have the prefix: x86_sse_avx_

Note: mix_buffers_with_gain and mix_buffers_no_gain may prefer the SSE
implementation over AVX if source and destination pointers are aligned
to 16 byte boundaries but not to 32. Therefore, it is optimal if _all_
audio buffers are allocated on 32 byte boundaries to take full
advantage of the AVX ISA extension.
---
 libs/ardour/ardour/mix.h               |   2 +-
 libs/ardour/globals.cc                 |  10 +-
 libs/ardour/sse_functions_avx_linux.cc | 839 ++++++++++++++++++++++++-
 3 files changed, 824 insertions(+), 27 deletions(-)

diff --git a/libs/ardour/ardour/mix.h b/libs/ardour/ardour/mix.h
index 40f6348c5e..a22d9587fb 100644
--- a/libs/ardour/ardour/mix.h
+++ b/libs/ardour/ardour/mix.h
@@ -40,10 +40,10 @@ extern "C" {
 	LIBARDOUR_API void x86_sse_avx_mix_buffers_with_gain(float * dst, const float * src, uint32_t nframes, float gain);
 	LIBARDOUR_API void x86_sse_avx_mix_buffers_no_gain (float * dst, const float * src, uint32_t nframes);
 	LIBARDOUR_API void x86_sse_avx_copy_vector (float * dst, const float * src, uint32_t nframes);
+	LIBARDOUR_API void x86_sse_avx_find_peaks (const float * buf, uint32_t nsamples, float *min, float *max);
 }
 
 LIBARDOUR_API void x86_sse_find_peaks (const float * buf, uint32_t nsamples, float *min, float *max);
-LIBARDOUR_API void x86_sse_avx_find_peaks (const float * buf, uint32_t nsamples, float *min, float *max);
 
 /* debug wrappers for SSE functions */
 
diff --git a/libs/ardour/globals.cc b/libs/ardour/globals.cc
index 07eced83c3..b70e51fc4c 100644
--- a/libs/ardour/globals.cc
+++ b/libs/ardour/globals.cc
@@ -189,14 +189,8 @@ setup_hardware_optimization (bool try_optimization)
 
 #if defined(ARCH_X86) && defined(BUILD_SSE_OPTIMIZATIONS)
 
-#ifdef PLATFORM_WINDOWS
-	/* We have AVX-optimized code for Windows */
-	if (fpu->has_avx ())
-#else
-	/* AVX code doesn't compile on Linux yet */
-	if (false)
-#endif
-	{
+	/* We have AVX-optimized code for Windows and Linux */
+	if (fpu->has_avx ()) {
 		info << "Using AVX optimized routines" << endmsg;
 
 		// AVX SET
diff --git a/libs/ardour/sse_functions_avx_linux.cc b/libs/ardour/sse_functions_avx_linux.cc
index ec591642e7..6d49e46a78 100644
--- a/libs/ardour/sse_functions_avx_linux.cc
+++ b/libs/ardour/sse_functions_avx_linux.cc
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2015 Paul Davis
+ * Copyright (C) 2020 Ayan Shafqat
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -18,38 +19,840 @@
 
 #include "ardour/mix.h"
 
-float
-x86_sse_avx_compute_peak (const float * buf, uint32_t nsamples, float current)
+#include <immintrin.h>
+#include <stdint.h>
+
+#ifndef __AVX__
+#error "__AVX__ must be enabled for this module to work"
+#endif
+
+#define IS_ALIGNED_TO(ptr, bytes) (((uintptr_t)ptr) % (bytes) == 0)
+
+#ifdef __cplusplus
+#define C_FUNC extern "C"
+#else
+#define C_FUNC
+#endif
+
+/**
+ * Local functions
+ */
+
+static inline __m256 avx_getmax_ps(__m256 vmax);
+static inline __m256 avx_getmin_ps(__m256 vmin);
+
+static void
+x86_sse_avx_mix_buffers_with_gain_unaligned(float *dst, const float *src, uint32_t nframes, float gain);
+
+static void
+x86_sse_avx_mix_buffers_with_gain_aligned(float *dst, const float *src, uint32_t nframes, float gain);
+
+static void
+x86_sse_avx_mix_buffers_no_gain_unaligned(float *dst, const float *src, uint32_t nframes);
+
+static void
+x86_sse_avx_mix_buffers_no_gain_aligned(float *dst, const float *src, uint32_t nframes);
+
+/**
+ * Module implementation
+ */
+
+/**
+ * @brief x86-64 AVX optimized routine for compute peak procedure
+ * @param src Pointer to source buffer
+ * @param nframes Number of frames to process
+ * @param current Current peak value
+ * @return float New peak value
+ */
+C_FUNC float
+x86_sse_avx_compute_peak(const float *src, uint32_t nframes, float current)
 {
-	return default_compute_peak (buf, nsamples, current);
+	const __m256 ABS_MASK = _mm256_set1_ps(-0.0F);
+
+	// Broadcast the current max value to all elements of the YMM register
+	__m256 vcurrent = _mm256_broadcast_ss(&current);
+
+	// Process the unaligned portion one sample at a time until alignment is reached
+	while ((((intptr_t)src) % 32 != 0) && nframes > 0) {
+		__m256 vsrc;
+
+		vsrc = _mm256_setzero_ps();
+		vsrc = _mm256_castps128_ps256(_mm_load_ss(src));
+		vsrc = _mm256_andnot_ps(ABS_MASK, vsrc);
+		vcurrent = _mm256_max_ps(vcurrent, vsrc);
+
+		++src;
+		--nframes;
+	}
+
+	// Process the aligned portion 16 samples at a time
+	while (nframes >= 16) {
+#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
+		_mm_prefetch(((char *)src + (16 * sizeof(float))), _mm_hint(0));
+#else
+		__builtin_prefetch(src + (16 * sizeof(float)), 0, 0);
+#endif
+		__m256 vsrc1, vsrc2;
+		vsrc1 = _mm256_load_ps(src + 0);
+		vsrc2 = _mm256_load_ps(src + 8);
+
+		vsrc1 = _mm256_andnot_ps(ABS_MASK, vsrc1);
+		vsrc2 = _mm256_andnot_ps(ABS_MASK, vsrc2);
+
+		vcurrent = _mm256_max_ps(vcurrent, vsrc1);
+		vcurrent = _mm256_max_ps(vcurrent, vsrc2);
+
+		src += 16;
+		nframes -= 16;
+	}
+
+	// Process the remaining samples 8 at a time
+	while (nframes >= 8) {
+		__m256 vsrc;
+
+		vsrc = _mm256_load_ps(src);
+		vsrc = _mm256_andnot_ps(ABS_MASK, vsrc);
+		vcurrent = _mm256_max_ps(vcurrent, vsrc);
+
+		src += 8;
+		nframes -= 8;
+	}
+
+	// Process any remaining samples (fewer than 8), one at a time
+	while (nframes > 0) {
+		__m256 vsrc;
+
+		vsrc = _mm256_setzero_ps();
+		vsrc = _mm256_castps128_ps256(_mm_load_ss(src));
+		vsrc = _mm256_andnot_ps(ABS_MASK, vsrc);
+		vcurrent = _mm256_max_ps(vcurrent, vsrc);
+
+		++src;
+		--nframes;
+	}
+
+	// Get the current max from YMM register
+	vcurrent = avx_getmax_ps(vcurrent);
+
+	// zero upper 128 bit of 256 bit ymm register to avoid penalties using non-AVX instructions
+	_mm256_zeroupper();
+
+	return _mm256_cvtss_f32(vcurrent);
 }
 
-void
-x86_sse_avx_apply_gain_to_buffer (float * buf, uint32_t nframes, float gain)
+/**
+ * @brief x86-64 AVX optimized routine for find peak procedure
+ * @param src Pointer to source buffer
+ * @param nframes Number of frames to process
+ * @param[in,out] minf Current minimum value, updated
+ * @param[in,out] maxf Current maximum value, updated
+ */
+C_FUNC void
+x86_sse_avx_find_peaks(const float *src, uint32_t nframes, float *minf, float *maxf)
 {
-	default_apply_gain_to_buffer (buf, nframes, gain);
+	// Broadcast the current min and max values to all elements of the YMM register
+	__m256 vmin = _mm256_broadcast_ss(minf);
+	__m256 vmax = _mm256_broadcast_ss(maxf);
+
+	// Compute single min/max of unaligned portion until alignment is reached
+	while ((((intptr_t)src) % 32 != 0) && nframes > 0) {
+		__m256 vsrc;
+
+		vsrc = _mm256_broadcast_ss(src);
+		vmax = _mm256_max_ps(vmax, vsrc);
+		vmin = _mm256_min_ps(vmin, vsrc);
+
+		++src;
+		--nframes;
+	}
+
+	// Process the remaining samples 16
at a time + while (nframes >= 16) + { +#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW) + _mm_prefetch(((char *)src + 64), _mm_hint(0)); +#else + __builtin_prefetch(src + 64, 0, 0); +#endif + + __m256 vsrc1, vsrc2; + vsrc1 = _mm256_load_ps(src + 0); + vsrc2 = _mm256_load_ps(src + 8); + + vmax = _mm256_max_ps(vmax, vsrc1); + vmin = _mm256_min_ps(vmin, vsrc1); + + vmax = _mm256_max_ps(vmax, vsrc2); + vmin = _mm256_min_ps(vmin, vsrc2); + + src += 16; + nframes -= 16; + } + + // Process the remaining samples 8 at a time + while (nframes >= 8) { + __m256 vsrc; + + vsrc = _mm256_load_ps(src); + vmax = _mm256_max_ps(vmax, vsrc); + vmin = _mm256_min_ps(vmin, vsrc); + + src += 8; + nframes -= 8; + } + + // If there are still some left 4 to 8 samples, process them one at a time. + while (nframes > 0) { + __m256 vsrc; + + vsrc = _mm256_broadcast_ss(src); + vmax = _mm256_max_ps(vmax, vsrc); + vmin = _mm256_min_ps(vmin, vsrc); + + ++src; + --nframes; + } + + // Get min and max of the YMM registers + vmin = avx_getmin_ps(vmin); + vmax = avx_getmax_ps(vmax); + + // There's a penalty going away from AVX mode to SSE mode. This can + // be avoided by ensuring to the CPU that rest of the routine is no + // longer interested in the upper portion of the YMM register. + + // zero upper 128 bit of 256 bit ymm register to avoid penalties using non-AVX instructions + _mm256_zeroupper(); + + _mm_store_ss(minf, _mm256_castps256_ps128(vmin)); + _mm_store_ss(maxf, _mm256_castps256_ps128(vmax)); } -void -x86_sse_avx_mix_buffers_with_gain (float * dst, const float * src, uint32_t nframes, float gain) +/** + * @brief x86-64 AVX optimized routine for apply gain routine + * @param[in,out] dst Pointer to the destination buffer, which gets updated + * @param nframes Number of frames (or samples) to process + * @param gain Gain to apply + */ +C_FUNC void +x86_sse_avx_apply_gain_to_buffer(float *dst, uint32_t nframes, float gain) { - default_mix_buffers_with_gain (dst, src, nframes, gain); + // Load gain vector to all elements of YMM register + __m256 vgain = _mm256_set1_ps(gain); + + if (nframes) { + __m128 g0 = _mm256_castps256_ps128(vgain); + // Here comes the horror, poor-man's loop unrolling + switch (((intptr_t)dst) % 32) { + case 0: + default: + // Buffer is aligned, skip to the next section of aligned + break; + case 4: + _mm_store_ss(dst, _mm_mul_ss(g0, _mm_load_ss(dst))); + ++dst; + --nframes; + case 8: + _mm_store_ss(dst, _mm_mul_ss(g0, _mm_load_ss(dst))); + ++dst; + --nframes; + case 12: + _mm_store_ss(dst, _mm_mul_ss(g0, _mm_load_ss(dst))); + ++dst; + --nframes; + case 16: + // This is a special case where pointer is 16 byte aligned + // for a XMM load/store operation. 
+ _mm_store_ps(dst, _mm_mul_ps(g0, _mm_load_ps(dst))); + dst += 4; + nframes -= 4; + break; + case 20: + _mm_store_ss(dst, _mm_mul_ss(g0, _mm_load_ss(dst))); + ++dst; + --nframes; + case 24: + _mm_store_ss(dst, _mm_mul_ss(g0, _mm_load_ss(dst))); + ++dst; + --nframes; + case 28: + _mm_store_ss(dst, _mm_mul_ss(g0, _mm_load_ss(dst))); + ++dst; + --nframes; + } + } else { + return; + } + + // Process the remaining samples 16 at a time + while (nframes >= 16) + { +#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW) + _mm_prefetch(((char *)dst + (16 * sizeof(float))), _mm_hint(0)); +#else + __builtin_prefetch(dst + (16 * sizeof(float)), 0, 0); +#endif + __m256 d0, d1; + d0 = _mm256_load_ps(dst + 0 ); + d1 = _mm256_load_ps(dst + 8 ); + + d0 = _mm256_mul_ps(vgain, d0); + d1 = _mm256_mul_ps(vgain, d1); + + _mm256_store_ps(dst + 0 , d0); + _mm256_store_ps(dst + 8 , d1); + + dst += 16; + nframes -= 16; + } + + // Process the remaining samples 8 at a time + while (nframes >= 8) { + _mm256_store_ps(dst, _mm256_mul_ps(vgain, _mm256_load_ps(dst))); + dst += 8; + nframes -= 8; + } + + + // There's a penalty going away from AVX mode to SSE mode. This can + // be avoided by ensuring to the CPU that rest of the routine is no + // longer interested in the upper portion of the YMM register. + + _mm256_zeroupper(); // zeros the upper portion of YMM register + + // Process the remaining samples + do { + __m128 g0 = _mm256_castps256_ps128(vgain); + while (nframes > 0) { + _mm_store_ss(dst, _mm_mul_ss(g0, _mm_load_ss(dst))); + ++dst; + --nframes; + } + } while (0); } -void -x86_sse_avx_mix_buffers_no_gain (float * dst, const float * src, uint32_t nframes) +/** + * @brief x86-64 AVX optimized routine for mixing buffer with gain. + * + * This function may choose SSE over AVX if the pointers are aligned + * to 16 byte boundary instead of 32 byte boundary to reduce time to + * process. + * + * @param[in,out] dst Pointer to destination buffer, which gets updated + * @param[in] src Pointer to source buffer (not updated) + * @param nframes Number of samples to process + * @param gain Gain to apply + */ +C_FUNC void +x86_sse_avx_mix_buffers_with_gain(float *dst, const float *src, uint32_t nframes, float gain) { - default_mix_buffers_no_gain (dst, src, nframes); + if (IS_ALIGNED_TO(dst, 32) && IS_ALIGNED_TO(src, 32)) { + // Pointers are both aligned to 32 bit boundaries, this can be processed with AVX + x86_sse_avx_mix_buffers_with_gain_aligned(dst, src, nframes, gain); + } else if (IS_ALIGNED_TO(dst, 16) && IS_ALIGNED_TO(src, 16)) { + // This can still be processed with SSE + x86_sse_mix_buffers_with_gain(dst, src, nframes, gain); + } else { + // Pointers are unaligned, so process them with unaligned load/store AVX + x86_sse_avx_mix_buffers_with_gain_unaligned(dst, src, nframes, gain); + } } -void -x86_sse_avx_copy_vector (float * dst, const float * src, uint32_t nframes) +/** + * @brief x86-64 AVX optimized routine for mixing buffer with no gain. + * + * This function may choose SSE over AVX if the pointers are aligned + * to 16 byte boundary instead of 32 byte boundary to reduce time to + * process. 
+ * + * @param[in,out] dst Pointer to destination buffer, which gets updated + * @param[in] src Pointer to source buffer (not updated) + * @param nframes Number of samples to process + */ +C_FUNC void +x86_sse_avx_mix_buffers_no_gain(float *dst, const float *src, uint32_t nframes) { - default_copy_vector (dst, src, nframes); + if (IS_ALIGNED_TO(dst, 32) && IS_ALIGNED_TO(src, 32)) { + // Pointers are both aligned to 32 bit boundaries, this can be processed with AVX + x86_sse_avx_mix_buffers_no_gain_aligned(dst, src, nframes); + } else if (IS_ALIGNED_TO(dst, 16) && IS_ALIGNED_TO(src, 16)) { + // This can still be processed with SSE + x86_sse_mix_buffers_no_gain(dst, src, nframes); + } else { + // Pointers are unaligned, so process them with unaligned load/store AVX + x86_sse_avx_mix_buffers_no_gain_unaligned(dst, src, nframes); + } } -void -x86_sse_avx_find_peaks (const float * buf, uint32_t nsamples, float *min, float *max) +/** + * @brief Copy vector from one location to another + * + * This has not been hand optimized for AVX with the rationale that standard + * C library implementation will provide faster memory copy operation. It will + * be redundant to implement memcpy for floats. + * + * @param[out] dst Pointer to destination buffer + * @param[in] src Pointer to source buffer + * @param nframes Number of samples to copy + */ +C_FUNC void +x86_sse_avx_copy_vector(float *dst, const float *src, uint32_t nframes) { - default_find_peaks (buf, nsamples, min, max); + (void) memcpy(dst, src, nframes * sizeof(float)); +} + +/** + * Local helper functions + */ + +/** + * @brief Helper routine for mixing buffers with gain for unaligned buffers + * + * @details This routine executes the following expression below per element: + * + * dst = dst + (gain * src) + * + * @param[in,out] dst Pointer to destination buffer, which gets updated + * @param[in] src Pointer to source buffer (not updated) + * @param nframes Number of samples to process + * @param gain Gain to apply + */ +static void +x86_sse_avx_mix_buffers_with_gain_unaligned(float *dst, const float *src, uint32_t nframes, float gain) +{ + // Load gain vector to all elements of YMM register + __m256 vgain = _mm256_set1_ps(gain); + + // Process the remaining samples 16 at a time + while (nframes >= 16) + { +#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW) + _mm_prefetch(((char *)dst + (16 * sizeof(float))), _mm_hint(0)); + _mm_prefetch(((char *)src + (16 * sizeof(float))), _mm_hint(0)); +#else + __builtin_prefetch(src + (16 * sizeof(float)), 0, 0); + __builtin_prefetch(dst + (16 * sizeof(float)), 0, 0); +#endif + __m256 s0, s1; + __m256 d0, d1; + + // Load sources + s0 = _mm256_loadu_ps(src + 0); + s1 = _mm256_loadu_ps(src + 8); + + // Load destinations + d0 = _mm256_loadu_ps(dst + 0); + d1 = _mm256_loadu_ps(dst + 8); + + // src = src * gain + s0 = _mm256_mul_ps(vgain, s0); + s1 = _mm256_mul_ps(vgain, s1); + + // dst = dst + src + d0 = _mm256_add_ps(d0, s0); + d1 = _mm256_add_ps(d1, s1); + + // Store result + _mm256_storeu_ps(dst + 0, d0); + _mm256_storeu_ps(dst + 8, d1); + + // Update pointers and counters + src += 16; + dst += 16; + nframes -= 16; + } + + // Process the remaining samples 8 at a time + while (nframes >= 8) { + __m256 s0, d0; + // Load sources + s0 = _mm256_loadu_ps(src); + // Load destinations + d0 = _mm256_loadu_ps(dst); + // src = src * gain + s0 = _mm256_mul_ps(vgain, s0); + // dst = dst + src + d0 = _mm256_add_ps(d0, s0); + // Store result + _mm256_storeu_ps(dst, d0); + // Update pointers and counters + src+= 8; + 
dst += 8; + nframes -= 8; + } + + + // There's a penalty going away from AVX mode to SSE mode. This can + // be avoided by ensuring the CPU that rest of the routine is no + // longer interested in the upper portion of the YMM register. + + _mm256_zeroupper(); // zeros the upper portion of YMM register + + // Process the remaining samples + do { + __m128 g0 = _mm_set_ss(gain); + while (nframes > 0) { + __m128 s0, d0; + s0 = _mm_load_ss(src); + d0 = _mm_load_ss(dst); + s0 = _mm_mul_ss(g0, s0); + d0 = _mm_add_ss(d0, s0); + _mm_store_ss(dst, d0); + ++src; + ++dst; + --nframes; + } + } while (0); +} + +/** + * @brief Helper routine for mixing buffers with gain for aligned buffers + * + * @details This routine executes the following expression below per element: + * + * dst = dst + (gain * src) + * + * @param[in,out] dst Pointer to destination buffer, which gets updated + * @param[in] src Pointer to source buffer (not updated) + * @param nframes Number of samples to process + * @param gain Gain to apply + */ +static void +x86_sse_avx_mix_buffers_with_gain_aligned(float *dst, const float *src, uint32_t nframes, float gain) +{ + // Load gain vector to all elements of YMM register + __m256 vgain = _mm256_set1_ps(gain); + + // Process the remaining samples 16 at a time + while (nframes >= 16) + { +#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW) + _mm_prefetch(((char *)dst + (16 * sizeof(float))), _mm_hint(0)); + _mm_prefetch(((char *)src + (16 * sizeof(float))), _mm_hint(0)); +#else + __builtin_prefetch(src + (16 * sizeof(float)), 0, 0); + __builtin_prefetch(dst + (16 * sizeof(float)), 0, 0); +#endif + __m256 s0, s1; + __m256 d0, d1; + + // Load sources + s0 = _mm256_load_ps(src + 0); + s1 = _mm256_load_ps(src + 8); + + // Load destinations + d0 = _mm256_load_ps(dst + 0); + d1 = _mm256_load_ps(dst + 8); + + // src = src * gain + s0 = _mm256_mul_ps(vgain, s0); + s1 = _mm256_mul_ps(vgain, s1); + + // dst = dst + src + d0 = _mm256_add_ps(d0, s0); + d1 = _mm256_add_ps(d1, s1); + + // Store result + _mm256_store_ps(dst + 0, d0); + _mm256_store_ps(dst + 8, d1); + + // Update pointers and counters + src += 16; + dst += 16; + nframes -= 16; + } + + // Process the remaining samples 8 at a time + while (nframes >= 8) { + __m256 s0, d0; + // Load sources + s0 = _mm256_load_ps(src + 0 ); + // Load destinations + d0 = _mm256_load_ps(dst + 0 ); + // src = src * gain + s0 = _mm256_mul_ps(vgain, s0); + // dst = dst + src + d0 = _mm256_add_ps(d0, s0); + // Store result + _mm256_store_ps(dst, d0); + // Update pointers and counters + src += 8; + dst += 8; + nframes -= 8; + } + + + // There's a penalty going from AVX mode to SSE mode. This can + // be avoided by ensuring the CPU that rest of the routine is no + // longer interested in the upper portion of the YMM register. + + _mm256_zeroupper(); // zeros the upper portion of YMM register + + // Process the remaining samples, one sample at a time. 
+ do { + __m128 g0 = _mm256_castps256_ps128(vgain); // use the same register + while (nframes > 0) { + __m128 s0, d0; + s0 = _mm_load_ss(src); + d0 = _mm_load_ss(dst); + s0 = _mm_mul_ss(g0, s0); + d0 = _mm_add_ss(d0, s0); + _mm_store_ss(dst, d0); + ++src; + ++dst; + --nframes; + } + } while (0); +} + +/** + * @brief Helper routine for mixing buffers with no gain for aligned buffers + * + * @details This routine executes the following expression below per element: + * + * dst = dst + src + * + * @param[in,out] dst Pointer to destination buffer, which gets updated + * @param[in] src Pointer to source buffer (not updated) + * @param nframes Number of samples to process + */ +static void +x86_sse_avx_mix_buffers_no_gain_unaligned(float *dst, const float *src, uint32_t nframes) +{ + // Process the remaining samples 16 at a time + while (nframes >= 16) + { +#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW) + _mm_prefetch(((char *)dst + (16 * sizeof(float))), _mm_hint(0)); + _mm_prefetch(((char *)src + (16 * sizeof(float))), _mm_hint(0)); +#else + __builtin_prefetch(src + (16 * sizeof(float)), 0, 0); + __builtin_prefetch(dst + (16 * sizeof(float)), 0, 0); +#endif + __m256 s0, s1; + __m256 d0, d1; + + // Load sources + s0 = _mm256_loadu_ps(src + 0); + s1 = _mm256_loadu_ps(src + 8); + + // Load destinations + d0 = _mm256_loadu_ps(dst + 0); + d1 = _mm256_loadu_ps(dst + 8); + + // dst = dst + src + d0 = _mm256_add_ps(d0, s0); + d1 = _mm256_add_ps(d1, s1); + + // Store result + _mm256_storeu_ps(dst + 0, d0); + _mm256_storeu_ps(dst + 8, d1); + + // Update pointers and counters + src += 16; + dst += 16; + nframes -= 16; + } + + // Process the remaining samples 8 at a time + while (nframes >= 8) { + __m256 s0, d0; + // Load sources + s0 = _mm256_loadu_ps(src); + // Load destinations + d0 = _mm256_loadu_ps(dst); + // dst = dst + src + d0 = _mm256_add_ps(d0, s0); + // Store result + _mm256_storeu_ps(dst, d0); + // Update pointers and counters + src+= 8; + dst += 8; + nframes -= 8; + } + + // There's a penalty going away from AVX mode to SSE mode. This can + // be avoided by ensuring the CPU that rest of the routine is no + // longer interested in the upper portion of the YMM register. 
+ + _mm256_zeroupper(); // zeros the upper portion of YMM register + + // Process the remaining samples + do { + while (nframes > 0) { + __m128 s0, d0; + s0 = _mm_load_ss(src); + d0 = _mm_load_ss(dst); + d0 = _mm_add_ss(d0, s0); + _mm_store_ss(dst, d0); + ++src; + ++dst; + --nframes; + } + } while (0); + +} + +/** + * @brief Helper routine for mixing buffers with no gain for unaligned buffers + * + * @details This routine executes the following expression below per element: + * + * dst = dst + src + * + * @param[in,out] dst Pointer to destination buffer, which gets updated + * @param[in] src Pointer to source buffer (not updated) + * @param nframes Number of samples to process + */ +static void +x86_sse_avx_mix_buffers_no_gain_aligned(float *dst, const float *src, uint32_t nframes) +{ + // Process the aligned portion 32 samples at a time + while (nframes >= 32) + { +#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW) + _mm_prefetch(((char *)dst + (32 * sizeof(float))), _mm_hint(0)); + _mm_prefetch(((char *)src + (32 * sizeof(float))), _mm_hint(0)); +#else + __builtin_prefetch(src + (32 * sizeof(float)), 0, 0); + __builtin_prefetch(dst + (32 * sizeof(float)), 0, 0); +#endif + __m256 s0, s1, s2, s3; + __m256 d0, d1, d2, d3; + + // Load sources + s0 = _mm256_load_ps(src + 0 ); + s1 = _mm256_load_ps(src + 8 ); + s2 = _mm256_load_ps(src + 16); + s3 = _mm256_load_ps(src + 24); + + // Load destinations + d0 = _mm256_load_ps(dst + 0 ); + d1 = _mm256_load_ps(dst + 8 ); + d2 = _mm256_load_ps(dst + 16); + d3 = _mm256_load_ps(dst + 24); + + // dst = dst + src + d0 = _mm256_add_ps(d0, s0); + d1 = _mm256_add_ps(d1, s1); + d2 = _mm256_add_ps(d2, s2); + d3 = _mm256_add_ps(d3, s3); + + // Store result + _mm256_store_ps(dst + 0 , d0); + _mm256_store_ps(dst + 8 , d1); + _mm256_store_ps(dst + 16, d2); + _mm256_store_ps(dst + 24, d3); + + // Update pointers and counters + src += 32; + dst += 32; + nframes -= 32; + } + + // Process the remaining samples 16 at a time + while (nframes >= 16) + { +#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW) + _mm_prefetch(((char *)dst + (16 * sizeof(float))), _mm_hint(0)); + _mm_prefetch(((char *)src + (16 * sizeof(float))), _mm_hint(0)); +#else + __builtin_prefetch(src + (16 * sizeof(float)), 0, 0); + __builtin_prefetch(dst + (16 * sizeof(float)), 0, 0); +#endif + __m256 s0, s1; + __m256 d0, d1; + + // Load sources + s0 = _mm256_load_ps(src + 0); + s1 = _mm256_load_ps(src + 8); + + // Load destinations + d0 = _mm256_load_ps(dst + 0); + d1 = _mm256_load_ps(dst + 8); + + // dst = dst + src + d0 = _mm256_add_ps(d0, s0); + d1 = _mm256_add_ps(d1, s1); + + // Store result + _mm256_store_ps(dst + 0, d0); + _mm256_store_ps(dst + 8, d1); + + // Update pointers and counters + src += 16; + dst += 16; + nframes -= 16; + } + + // Process the remaining samples 8 at a time + while (nframes >= 8) { + __m256 s0, d0; + // Load sources + s0 = _mm256_load_ps(src + 0 ); + // Load destinations + d0 = _mm256_load_ps(dst + 0 ); + // dst = dst + src + d0 = _mm256_add_ps(d0, s0); + // Store result + _mm256_store_ps(dst, d0); + // Update pointers and counters + src += 8; + dst += 8; + nframes -= 8; + } + + // There's a penalty going from AVX mode to SSE mode. This can + // be avoided by ensuring the CPU that rest of the routine is no + // longer interested in the upper portion of the YMM register. 
+ + _mm256_zeroupper(); // zeros the upper portion of YMM register + + // Process the remaining samples + do { + while (nframes > 0) { + __m128 s0, d0; + s0 = _mm_load_ss(src); + d0 = _mm_load_ss(dst); + d0 = _mm_add_ss(d0, s0); + _mm_store_ss(dst, d0); + ++src; + ++dst; + --nframes; + } + } while (0); +} + +/** + * @brief Get the maximum value of packed float register + * @param vmax Packed float 8x register + * @return __m256 Maximum value in p[0] + */ +static inline __m256 avx_getmax_ps(__m256 vmax) +{ + __m256 tmp; + tmp = _mm256_shuffle_ps(vmax, vmax, _MM_SHUFFLE(2, 3, 0, 1)); + vmax = _mm256_max_ps(tmp, vmax); + tmp = _mm256_shuffle_ps(vmax, vmax, _MM_SHUFFLE(1, 0, 3, 2)); + vmax = _mm256_max_ps(tmp, vmax); + tmp = _mm256_permute2f128_ps(vmax, vmax, 1); + vmax = _mm256_max_ps(tmp, vmax); + return vmax; +} + +/** + * @brief Get the minimum value of packed float register + * @param vmax Packed float 8x register + * @return __m256 Minimum value in p[0] + */ +static inline __m256 avx_getmin_ps(__m256 vmin) +{ + __m256 tmp; + tmp = _mm256_shuffle_ps(vmin, vmin, _MM_SHUFFLE(2, 3, 0, 1)); + vmin = _mm256_min_ps(tmp, vmin); + tmp = _mm256_shuffle_ps(vmin, vmin, _MM_SHUFFLE(1, 0, 3, 2)); + vmin = _mm256_min_ps(tmp, vmin); + tmp = _mm256_permute2f128_ps(vmin, vmin, 1); + vmin = _mm256_min_ps(tmp, vmin); + return vmin; }
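Note for reviewers (not part of the patch): the commit message above recommends allocating audio buffers on 32 byte boundaries so that the fully aligned AVX paths are taken. A minimal sketch of such an allocation, assuming only POSIX and the declarations that ardour/mix.h already exports; the helper below (example_mix) is illustrative and not part of Ardour:

	#include <stdint.h>      /* uint32_t */
	#include <stdlib.h>      /* posix_memalign, free */
	#include <string.h>      /* memset */

	#include "ardour/mix.h"  /* x86_sse_avx_mix_buffers_with_gain */

	/* Allocate two 32-byte-aligned buffers and mix one into the other,
	 * so the dispatcher selects the fully aligned AVX helper. */
	static void
	example_mix (uint32_t nframes)
	{
		void *dst = NULL;
		void *src = NULL;

		if (posix_memalign (&dst, 32, nframes * sizeof (float)) != 0) {
			return;
		}
		if (posix_memalign (&src, 32, nframes * sizeof (float)) != 0) {
			free (dst);
			return;
		}

		/* fill with real audio data in practice */
		memset (dst, 0, nframes * sizeof (float));
		memset (src, 0, nframes * sizeof (float));

		x86_sse_avx_mix_buffers_with_gain ((float *) dst, (float *) src, nframes, 0.5f);

		free (dst);
		free (src);
	}

Buffers that are only 16 byte aligned still work; as described in the dispatcher comments above, they simply fall back to the existing x86_sse_mix_buffers_with_gain path.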
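Note for reviewers (not part of the patch): a quick scalar cross-check can be useful when testing the new compute_peak path. The helper below is hypothetical and assumes only the x86_sse_avx_compute_peak declaration from ardour/mix.h:

	#include <assert.h>
	#include <math.h>
	#include <stdint.h>

	#include "ardour/mix.h"  /* x86_sse_avx_compute_peak */

	/* Compare the AVX peak against a plain scalar loop over |buf[i]|. */
	static void
	check_compute_peak (const float *buf, uint32_t nframes)
	{
		float scalar = 0.0f;

		for (uint32_t i = 0; i < nframes; ++i) {
			const float a = fabsf (buf[i]);
			if (a > scalar) {
				scalar = a;
			}
		}

		/* Both versions apply the same max-of-absolute-value operation,
		 * so the results should agree to within float equality. */
		assert (fabsf (x86_sse_avx_compute_peak (buf, nframes, 0.0f) - scalar) <= 1e-7f);
	}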