From 407882d23de0c14a55c58c19475e9c2deb1dd095 Mon Sep 17 00:00:00 2001
From: Ayan Shafqat
Date: Fri, 1 Jan 2021 14:01:37 -0500
Subject: [PATCH] Add support for Intel/AMD's FMA extension

By supporting the FMA extension, fewer instructions are needed for the
multiply-accumulate operations used to mix channels. Since this extension
has been around since mid-2012, most computers in use today have it
available.
---
 libs/ardour/ardour/mix.h         |   3 +
 libs/ardour/globals.cc           |  15 +++-
 libs/ardour/wscript              |   7 +-
 libs/ardour/x86_functions_fma.cc | 136 +++++++++++++++++++++++++++++++
 libs/pbd/fpu.cc                  |   7 +-
 libs/pbd/pbd/fpu.h               |   2 +
 wscript                          |   2 +
 7 files changed, 167 insertions(+), 5 deletions(-)
 create mode 100644 libs/ardour/x86_functions_fma.cc

diff --git a/libs/ardour/ardour/mix.h b/libs/ardour/ardour/mix.h
index c154442daf..d472873f30 100644
--- a/libs/ardour/ardour/mix.h
+++ b/libs/ardour/ardour/mix.h
@@ -45,6 +45,9 @@ extern "C" {
 #endif
 }
 
+/* FMA functions */
+LIBARDOUR_API void x86_fma_mix_buffers_with_gain (float * dst, const float * src, uint32_t nframes, float gain);
+
 LIBARDOUR_API void x86_sse_find_peaks (const float * buf, uint32_t nsamples, float *min, float *max);
 #ifdef PLATFORM_WINDOWS
 LIBARDOUR_API void x86_sse_avx_find_peaks (const float * buf, uint32_t nsamples, float *min, float *max);
diff --git a/libs/ardour/globals.cc b/libs/ardour/globals.cc
index 03d73b7b2d..6e18156731 100644
--- a/libs/ardour/globals.cc
+++ b/libs/ardour/globals.cc
@@ -188,7 +188,20 @@ setup_hardware_optimization (bool try_optimization)
 #if defined(ARCH_X86) && defined(BUILD_SSE_OPTIMIZATIONS)
 
 	/* We have AVX-optimized code for Windows and Linux */
-	if (fpu->has_avx ()) {
+	if (fpu->has_fma ()) {
+		info << "Using AVX and FMA optimized routines" << endmsg;
+
+		// FMA SET (Shares a lot with AVX)
+		compute_peak          = x86_sse_avx_compute_peak;
+		find_peaks            = x86_sse_avx_find_peaks;
+		apply_gain_to_buffer  = x86_sse_avx_apply_gain_to_buffer;
+		mix_buffers_with_gain = x86_fma_mix_buffers_with_gain;
+		mix_buffers_no_gain   = x86_sse_avx_mix_buffers_no_gain;
+		copy_vector           = x86_sse_avx_copy_vector;
+
+		generic_mix_functions = false;
+
+	} else if (fpu->has_avx ()) {
 		info << "Using AVX optimized routines" << endmsg;
 
 		// AVX SET
diff --git a/libs/ardour/wscript b/libs/ardour/wscript
index 5c4b383b9c..206652a2b2 100644
--- a/libs/ardour/wscript
+++ b/libs/ardour/wscript
@@ -475,10 +475,10 @@ def build(bld):
     if Options.options.fpu_optimization:
         if (bld.env['build_target'] == 'i386' or bld.env['build_target'] == 'i686'):
            obj.source += [ 'sse_functions_xmm.cc', 'sse_functions.s', ]
-            avx_sources = [ 'sse_functions_avx_linux.cc' ]
+            avx_sources = [ 'sse_functions_avx_linux.cc', 'x86_functions_fma.cc' ]
         elif bld.env['build_target'] == 'x86_64':
             obj.source += [ 'sse_functions_xmm.cc', 'sse_functions_64bit.s', ]
-            avx_sources = [ 'sse_functions_avx_linux.cc' ]
+            avx_sources = [ 'sse_functions_avx_linux.cc', 'x86_functions_fma.cc' ]
         elif bld.env['build_target'] == 'mingw':
             # usability of the 64 bit windows assembler depends on the compiler target,
             # not the build host, which in turn can only be inferred from the name
@@ -486,7 +486,7 @@
             if re.search ('x86_64-w64', str(bld.env['CC'])):
                 obj.source += [ 'sse_functions_xmm.cc' ]
                 obj.source += [ 'sse_functions_64bit_win.s', 'sse_avx_functions_64bit_win.s' ]
-                avx_sources = [ 'sse_functions_avx.cc' ]
+                avx_sources = [ 'sse_functions_avx.cc', 'x86_functions_fma.cc' ]
         elif bld.env['build_target'] == 'aarch64':
             obj.source += ['arm_neon_functions.cc']
             obj.defines += [ 'ARM_NEON_SUPPORT' ]
@@ -512,6 +512,7 @@
             # compile it with -mavx flag - append avx flag to the existing
             avx_cxxflags = list(bld.env['CXXFLAGS'])
             avx_cxxflags.append (bld.env['compiler_flags_dict']['avx'])
+            avx_cxxflags.append (bld.env['compiler_flags_dict']['fma'])
             avx_cxxflags.append (bld.env['compiler_flags_dict']['pic'])
             bld(features = 'cxx cxxstlib asm',
                 source = avx_sources,
diff --git a/libs/ardour/x86_functions_fma.cc b/libs/ardour/x86_functions_fma.cc
new file mode 100644
index 0000000000..8ad8ed0226
--- /dev/null
+++ b/libs/ardour/x86_functions_fma.cc
@@ -0,0 +1,136 @@
+/*
+ * Copyright (C) 2020 Ayan Shafqat
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "ardour/mix.h"
+
+#include <immintrin.h>
+#include <cstdint>
+
+#define IS_ALIGNED_TO(ptr, bytes) (((uintptr_t)ptr) % (bytes) == 0)
+
+/**
+ * @brief x86-64 AVX/FMA optimized routine for mixing a buffer with gain.
+ *
+ * @param[in,out] dst Pointer to destination buffer, which gets updated
+ * @param[in] src Pointer to source buffer (not updated)
+ * @param nframes Number of samples to process
+ * @param gain Gain to apply
+ */
+void
+x86_fma_mix_buffers_with_gain(
+	float *dst,
+	const float *src,
+	uint32_t nframes,
+	float gain)
+{
+	// While the buffers aren't aligned, process one sample at a time
+	do {
+		__m128 g0 = _mm_set_ss(gain); // Should be a no-op
+
+		while (!(IS_ALIGNED_TO(src, sizeof(__m256)) &&
+		         IS_ALIGNED_TO(dst, sizeof(__m256))) &&
+		       (nframes > 0)) {
+
+			__m128 x0 = _mm_load_ss(src);
+			__m128 y0 = _mm_load_ss(dst);
+			__m128 z0 = _mm_fmadd_ss(x0, g0, y0);
+			_mm_store_ss(dst, z0);
+
+			++dst;
+			++src;
+			--nframes;
+		}
+	} while (0);
+
+
+	// Use AVX registers to process 16 samples per iteration
+	do {
+		__m256 g0 = _mm256_set1_ps(gain);
+
+		while (nframes >= 16) {
+#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
+			_mm_prefetch(((char *)dst + (16 * sizeof(float))), _mm_hint(0));
+			_mm_prefetch(((char *)src + (16 * sizeof(float))), _mm_hint(0));
+#else
+			__builtin_prefetch(src + (16 * sizeof(float)), 0, 0);
+			__builtin_prefetch(dst + (16 * sizeof(float)), 0, 0);
+#endif
+			__m256 s0, s1;
+			__m256 d0, d1;
+
+			// Load sources
+			s0 = _mm256_load_ps(src + 0);
+			s1 = _mm256_load_ps(src + 8);
+
+			// Load destinations
+			d0 = _mm256_load_ps(dst + 0);
+			d1 = _mm256_load_ps(dst + 8);
+
+			// dst = dst + (src * gain)
+			d0 = _mm256_fmadd_ps(g0, s0, d0);
+			d1 = _mm256_fmadd_ps(g0, s1, d1);
+
+			// Store result
+			_mm256_store_ps(dst + 0, d0);
+			_mm256_store_ps(dst + 8, d1);
+
+			// Update pointers and counters
+			src += 16;
+			dst += 16;
+			nframes -= 16;
+		}
+
+		// Process the remaining samples 8 at a time
+		while (nframes >= 8) {
+			__m256 s0, d0;
+			// Load source
+			s0 = _mm256_load_ps(src + 0);
+			// Load destination
+			d0 = _mm256_load_ps(dst + 0);
+			// dst = dst + (src * gain)
+			d0 = _mm256_fmadd_ps(g0, s0, d0);
+			// Store result
+			_mm256_store_ps(dst, d0);
+			// Update pointers and counters
+			src += 8;
+			dst += 8;
+			nframes -= 8;
+		}
+	} while (0);
+
+	// There is a penalty when transitioning from AVX code to SSE code.
+	// It can be avoided by telling the CPU that the rest of the routine
+	// no longer needs the upper portion of the YMM registers.
+
+	_mm256_zeroupper(); // zeros the upper portion of the YMM registers
+
+	// Process the remaining samples, one sample at a time.
+	do {
+		__m128 g0 = _mm_set_ss(gain); // Should be a no-op
+
+		while (nframes > 0) {
+			__m128 x0 = _mm_load_ss(src);
+			__m128 y0 = _mm_load_ss(dst);
+			__m128 z0 = _mm_fmadd_ss(x0, g0, y0);
+			_mm_store_ss(dst, z0);
+			++dst;
+			++src;
+			--nframes;
+		}
+	} while (0);
+}
diff --git a/libs/pbd/fpu.cc b/libs/pbd/fpu.cc
index 3ac5601339..c179def222 100644
--- a/libs/pbd/fpu.cc
+++ b/libs/pbd/fpu.cc
@@ -211,7 +211,12 @@ FPU::FPU ()
 	    (cpu_info[2] & (1<<28) /* AVX */) &&
 	    ((_xgetbv (_XCR_XFEATURE_ENABLED_MASK) & 0x6) == 0x6)) { /* OS really supports XSAVE */
 		info << _("AVX-capable processor") << endmsg;
-		_flags = Flags (_flags | (HasAVX) );
+		_flags = Flags (_flags | (HasAVX));
+	}
+
+	if ((_flags & HasAVX) && (cpu_info[2] & (1<<12) /* FMA */)) {
+		info << _("AVX with FMA capable processor") << endmsg;
+		_flags = Flags (_flags | (HasFMA));
 	}
 
 	if (cpu_info[3] & (1<<25)) {
diff --git a/libs/pbd/pbd/fpu.h b/libs/pbd/pbd/fpu.h
index b0979daa9e..7d5e21d696 100644
--- a/libs/pbd/pbd/fpu.h
+++ b/libs/pbd/pbd/fpu.h
@@ -33,6 +33,7 @@ class LIBPBD_API FPU {
 		HasSSE2 = 0x8,
 		HasAVX = 0x10,
 		HasNEON = 0x20,
+		HasFMA = 0x40,
 	};
 
   public:
@@ -46,6 +47,7 @@
 	bool has_sse () const { return _flags & HasSSE; }
 	bool has_sse2 () const { return _flags & HasSSE2; }
 	bool has_avx () const { return _flags & HasAVX; }
+	bool has_fma () const { return _flags & HasFMA; }
 	bool has_neon () const { return _flags & HasNEON; }
 
   private:
diff --git a/wscript b/wscript
index c5d1014158..80d9aa00b8 100644
--- a/wscript
+++ b/wscript
@@ -88,6 +88,8 @@ compiler_flags_dictionaries= {
         'attasm': '-masm=att',
         # Flags to make AVX instructions/intrinsics available
         'avx': '-mavx',
+        # Flags to make FMA instructions/intrinsics available
+        'fma': '-mfma',
         # Flags to make ARM/NEON instructions/intrinsics available
         'neon': '-mfpu=neon',
         # Flags to generate position independent code, when needed to build a shared object
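
Note (reviewer illustration, not part of the patch): every SIMD path added
above has to reproduce the plain scalar semantics of mix_buffers_with_gain,
i.e. dst[i] += src[i] * gain for each sample; the point of building with
-mfma is that this multiply-add maps to a single fused instruction. A
minimal scalar sketch of that reference behaviour follows (the function
name here is hypothetical and does not exist in Ardour):

#include <stdint.h>

/* Scalar reference for what x86_fma_mix_buffers_with_gain computes with
 * AVX/FMA intrinsics, eight or sixteen samples per iteration. */
static void
scalar_mix_buffers_with_gain (float *dst, const float *src, uint32_t nframes, float gain)
{
	for (uint32_t i = 0; i < nframes; ++i) {
		dst[i] += src[i] * gain; /* one fused multiply-add per sample with FMA */
	}
}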