Add support for Intel/AMD's FMA extension
By supporting the FMA extension, the number of instructions needed for the multiply–accumulate used to mix channels is reduced. Since this extension has been around since mid-2012, most computers have this instruction set available.
This commit is contained in:
parent
f188a1ad10
commit
407882d23d
@ -45,6 +45,9 @@ extern "C" {
|
||||
#endif
|
||||
}
|
||||
|
||||
/* FMA functions */
|
||||
LIBARDOUR_API void x86_fma_mix_buffers_with_gain (float * dst, const float * src, uint32_t nframes, float gain);
|
||||
|
||||
LIBARDOUR_API void x86_sse_find_peaks (const float * buf, uint32_t nsamples, float *min, float *max);
|
||||
#ifdef PLATFORM_WINDOWS
|
||||
LIBARDOUR_API void x86_sse_avx_find_peaks (const float * buf, uint32_t nsamples, float *min, float *max);
|
||||
|
@ -188,7 +188,20 @@ setup_hardware_optimization (bool try_optimization)
|
||||
#if defined(ARCH_X86) && defined(BUILD_SSE_OPTIMIZATIONS)
|
||||
|
||||
/* We have AVX-optimized code for Windows and Linux */
|
||||
if (fpu->has_avx ()) {
|
||||
if (fpu->has_fma ()) {
|
||||
info << "Using AVX and FMA optimized routines" << endmsg;
|
||||
|
||||
// FMA SET (Shares a lot with AVX)
|
||||
compute_peak = x86_sse_avx_compute_peak;
|
||||
find_peaks = x86_sse_avx_find_peaks;
|
||||
apply_gain_to_buffer = x86_sse_avx_apply_gain_to_buffer;
|
||||
mix_buffers_with_gain = x86_fma_mix_buffers_with_gain;
|
||||
mix_buffers_no_gain = x86_sse_avx_mix_buffers_no_gain;
|
||||
copy_vector = x86_sse_avx_copy_vector;
|
||||
|
||||
generic_mix_functions = false;
|
||||
|
||||
} else if (fpu->has_avx ()) {
|
||||
info << "Using AVX optimized routines" << endmsg;
|
||||
|
||||
// AVX SET
|
||||
|
@ -475,10 +475,10 @@ def build(bld):
|
||||
if Options.options.fpu_optimization:
|
||||
if (bld.env['build_target'] == 'i386' or bld.env['build_target'] == 'i686'):
|
||||
obj.source += [ 'sse_functions_xmm.cc', 'sse_functions.s', ]
|
||||
avx_sources = [ 'sse_functions_avx_linux.cc' ]
|
||||
avx_sources = [ 'sse_functions_avx_linux.cc', 'x86_functions_fma.cc' ]
|
||||
elif bld.env['build_target'] == 'x86_64':
|
||||
obj.source += [ 'sse_functions_xmm.cc', 'sse_functions_64bit.s', ]
|
||||
avx_sources = [ 'sse_functions_avx_linux.cc' ]
|
||||
avx_sources = [ 'sse_functions_avx_linux.cc', 'x86_functions_fma.cc' ]
|
||||
elif bld.env['build_target'] == 'mingw':
|
||||
# usability of the 64 bit windows assembler depends on the compiler target,
|
||||
# not the build host, which in turn can only be inferred from the name
|
||||
@ -486,7 +486,7 @@ def build(bld):
|
||||
if re.search ('x86_64-w64', str(bld.env['CC'])):
|
||||
obj.source += [ 'sse_functions_xmm.cc' ]
|
||||
obj.source += [ 'sse_functions_64bit_win.s', 'sse_avx_functions_64bit_win.s' ]
|
||||
avx_sources = [ 'sse_functions_avx.cc' ]
|
||||
avx_sources = [ 'sse_functions_avx.cc', 'x86_functions_fma.cc' ]
|
||||
elif bld.env['build_target'] == 'aarch64':
|
||||
obj.source += ['arm_neon_functions.cc']
|
||||
obj.defines += [ 'ARM_NEON_SUPPORT' ]
|
||||
@ -512,6 +512,7 @@ def build(bld):
|
||||
# compile it with -mavx flag - append avx flag to the existing
|
||||
avx_cxxflags = list(bld.env['CXXFLAGS'])
|
||||
avx_cxxflags.append (bld.env['compiler_flags_dict']['avx'])
|
||||
avx_cxxflags.append (bld.env['compiler_flags_dict']['fma'])
|
||||
avx_cxxflags.append (bld.env['compiler_flags_dict']['pic'])
|
||||
bld(features = 'cxx cxxstlib asm',
|
||||
source = avx_sources,
|
||||
|
136
libs/ardour/x86_functions_fma.cc
Normal file
136
libs/ardour/x86_functions_fma.cc
Normal file
@ -0,0 +1,136 @@
|
||||
/*
|
||||
* Copyright (C) 2020 Ayan Shafqat <ayan.x.shafqat@gmail.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along
|
||||
* with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*/
|
||||
|
||||
#include "ardour/mix.h"
|
||||
|
||||
#include <immintrin.h>
|
||||
#include <xmmintrin.h>
|
||||
|
||||
#define IS_ALIGNED_TO(ptr, bytes) (((uintptr_t)ptr) % (bytes) == 0)
|
||||
|
||||
/**
|
||||
* @brief x86-64 AVX/FMA optimized routine for mixing buffer with gain.
|
||||
*
|
||||
* @param[in,out] dst Pointer to destination buffer, which gets updated
|
||||
* @param[in] src Pointer to source buffer (not updated)
|
||||
* @param nframes Number of samples to process
|
||||
* @param gain Gain to apply
|
||||
*/
|
||||
void
|
||||
x86_fma_mix_buffers_with_gain(
|
||||
float *dst,
|
||||
const float *src,
|
||||
uint32_t nframes,
|
||||
float gain)
|
||||
{
|
||||
// While buffers aren't aligned, then process one sample at a time
|
||||
do {
|
||||
__m128 g0 = _mm_set_ss(gain); // Should be a no-op
|
||||
|
||||
while (!(IS_ALIGNED_TO(src, sizeof(__m256)) &&
|
||||
IS_ALIGNED_TO(dst, sizeof(__m256))) &&
|
||||
(nframes > 0)) {
|
||||
|
||||
__m128 x0 = _mm_load_ss(src);
|
||||
__m128 y0 = _mm_load_ss(dst);
|
||||
__m128 z0 = _mm_fmadd_ss(x0, g0, y0);
|
||||
_mm_store_ss(dst, z0);
|
||||
|
||||
++dst;
|
||||
++src;
|
||||
--nframes;
|
||||
}
|
||||
} while (0);
|
||||
|
||||
|
||||
// Use AVX registers to process 16 samples in parallel
|
||||
do {
|
||||
__m256 g0 = _mm256_set1_ps(gain);
|
||||
|
||||
while (nframes >= 16) {
|
||||
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
|
||||
_mm_prefetch(((char *)dst + (16 * sizeof(float))), _mm_hint(0));
|
||||
_mm_prefetch(((char *)src + (16 * sizeof(float))), _mm_hint(0));
|
||||
#else
|
||||
__builtin_prefetch(src + (16 * sizeof(float)), 0, 0);
|
||||
__builtin_prefetch(dst + (16 * sizeof(float)), 0, 0);
|
||||
#endif
|
||||
__m256 s0, s1;
|
||||
__m256 d0, d1;
|
||||
|
||||
// Load sources
|
||||
s0 = _mm256_load_ps(src + 0);
|
||||
s1 = _mm256_load_ps(src + 8);
|
||||
|
||||
// Load destinations
|
||||
d0 = _mm256_load_ps(dst + 0);
|
||||
d1 = _mm256_load_ps(dst + 8);
|
||||
|
||||
// dst = dst + (src * gain)
|
||||
d0 = _mm256_fmadd_ps(g0, d0, s0);
|
||||
d1 = _mm256_fmadd_ps(g0, d1, s1);
|
||||
|
||||
// Store result
|
||||
_mm256_store_ps(dst + 0, d0);
|
||||
_mm256_store_ps(dst + 8, d1);
|
||||
|
||||
// Update pointers and counters
|
||||
src += 16;
|
||||
dst += 16;
|
||||
nframes -= 16;
|
||||
}
|
||||
|
||||
// Process the remaining samples 8 at a time
|
||||
while (nframes >= 8) {
|
||||
__m256 s0, d0;
|
||||
// Load sources
|
||||
s0 = _mm256_load_ps(src + 0 );
|
||||
// Load destinations
|
||||
d0 = _mm256_load_ps(dst + 0 );
|
||||
// dst = dst + (src * gain)
|
||||
d0 = _mm256_fmadd_ps(g0, d0, s0);
|
||||
// Store result
|
||||
_mm256_store_ps(dst, d0);
|
||||
// Update pointers and counters
|
||||
src += 8;
|
||||
dst += 8;
|
||||
nframes -= 8;
|
||||
}
|
||||
} while (0);
|
||||
|
||||
// There's a penalty going from AVX mode to SSE mode. This can
|
||||
// be avoided by ensuring the CPU that rest of the routine is no
|
||||
// longer interested in the upper portion of the YMM register.
|
||||
|
||||
_mm256_zeroupper(); // zeros the upper portion of YMM register
|
||||
|
||||
// Process the remaining samples, one sample at a time.
|
||||
do {
|
||||
__m128 g0 = _mm_set_ss(gain); // Should be a no-op
|
||||
|
||||
while (nframes > 0) {
|
||||
__m128 x0 = _mm_load_ss(src);
|
||||
__m128 y0 = _mm_load_ss(dst);
|
||||
__m128 z0 = _mm_fmadd_ss(x0, g0, y0);
|
||||
_mm_store_ss(dst, z0);
|
||||
++dst;
|
||||
++src;
|
||||
--nframes;
|
||||
}
|
||||
} while (0);
|
||||
}
|
@ -211,7 +211,12 @@ FPU::FPU ()
|
||||
(cpu_info[2] & (1<<28) /* AVX */) &&
|
||||
((_xgetbv (_XCR_XFEATURE_ENABLED_MASK) & 0x6) == 0x6)) { /* OS really supports XSAVE */
|
||||
info << _("AVX-capable processor") << endmsg;
|
||||
_flags = Flags (_flags | (HasAVX) );
|
||||
_flags = Flags (_flags | (HasAVX));
|
||||
}
|
||||
|
||||
if (cpu_info[2] & (1<<12) /* FMA */) {
|
||||
info << _("AVX with FMA capable processor") << endmsg;
|
||||
_flags = Flags (_flags | (HasFMA));
|
||||
}
|
||||
|
||||
if (cpu_info[3] & (1<<25)) {
|
||||
|
@ -33,6 +33,7 @@ class LIBPBD_API FPU {
|
||||
HasSSE2 = 0x8,
|
||||
HasAVX = 0x10,
|
||||
HasNEON = 0x20,
|
||||
HasFMA = 0x40,
|
||||
};
|
||||
|
||||
public:
|
||||
@ -46,6 +47,7 @@ class LIBPBD_API FPU {
|
||||
bool has_sse () const { return _flags & HasSSE; }
|
||||
bool has_sse2 () const { return _flags & HasSSE2; }
|
||||
bool has_avx () const { return _flags & HasAVX; }
|
||||
bool has_fma() const { return _flags & HasFMA; }
|
||||
bool has_neon () const { return _flags & HasNEON; }
|
||||
|
||||
private:
|
||||
|
2
wscript
2
wscript
@ -88,6 +88,8 @@ compiler_flags_dictionaries= {
|
||||
'attasm': '-masm=att',
|
||||
# Flags to make AVX instructions/intrinsics available
|
||||
'avx': '-mavx',
|
||||
# Flags to make FMA instructions/intrinsics available
|
||||
'fma': '-mfma',
|
||||
# Flags to make ARM/NEON instructions/intrinsics available
|
||||
'neon': '-mfpu=neon',
|
||||
# Flags to generate position independent code, when needed to build a shared object
|
||||
|
Loading…
Reference in New Issue
Block a user