diff --git a/libs/ardour/ardour/mix.h b/libs/ardour/ardour/mix.h index c154442daf..d472873f30 100644 --- a/libs/ardour/ardour/mix.h +++ b/libs/ardour/ardour/mix.h @@ -45,6 +45,9 @@ extern "C" { #endif } +/* FMA functions */ +LIBARDOUR_API void x86_fma_mix_buffers_with_gain (float * dst, const float * src, uint32_t nframes, float gain); + LIBARDOUR_API void x86_sse_find_peaks (const float * buf, uint32_t nsamples, float *min, float *max); #ifdef PLATFORM_WINDOWS LIBARDOUR_API void x86_sse_avx_find_peaks (const float * buf, uint32_t nsamples, float *min, float *max); diff --git a/libs/ardour/globals.cc b/libs/ardour/globals.cc index 03d73b7b2d..6e18156731 100644 --- a/libs/ardour/globals.cc +++ b/libs/ardour/globals.cc @@ -188,7 +188,20 @@ setup_hardware_optimization (bool try_optimization) #if defined(ARCH_X86) && defined(BUILD_SSE_OPTIMIZATIONS) /* We have AVX-optimized code for Windows and Linux */ - if (fpu->has_avx ()) { + if (fpu->has_fma ()) { + info << "Using AVX and FMA optimized routines" << endmsg; + + // FMA SET (Shares a lot with AVX) + compute_peak = x86_sse_avx_compute_peak; + find_peaks = x86_sse_avx_find_peaks; + apply_gain_to_buffer = x86_sse_avx_apply_gain_to_buffer; + mix_buffers_with_gain = x86_fma_mix_buffers_with_gain; + mix_buffers_no_gain = x86_sse_avx_mix_buffers_no_gain; + copy_vector = x86_sse_avx_copy_vector; + + generic_mix_functions = false; + + } else if (fpu->has_avx ()) { info << "Using AVX optimized routines" << endmsg; // AVX SET diff --git a/libs/ardour/wscript b/libs/ardour/wscript index 5c4b383b9c..206652a2b2 100644 --- a/libs/ardour/wscript +++ b/libs/ardour/wscript @@ -475,10 +475,10 @@ def build(bld): if Options.options.fpu_optimization: if (bld.env['build_target'] == 'i386' or bld.env['build_target'] == 'i686'): obj.source += [ 'sse_functions_xmm.cc', 'sse_functions.s', ] - avx_sources = [ 'sse_functions_avx_linux.cc' ] + avx_sources = [ 'sse_functions_avx_linux.cc', 'x86_functions_fma.cc' ] elif bld.env['build_target'] == 
'x86_64': obj.source += [ 'sse_functions_xmm.cc', 'sse_functions_64bit.s', ] - avx_sources = [ 'sse_functions_avx_linux.cc' ] + avx_sources = [ 'sse_functions_avx_linux.cc', 'x86_functions_fma.cc' ] elif bld.env['build_target'] == 'mingw': # usability of the 64 bit windows assembler depends on the compiler target, # not the build host, which in turn can only be inferred from the name @@ -486,7 +486,7 @@ def build(bld): if re.search ('x86_64-w64', str(bld.env['CC'])): obj.source += [ 'sse_functions_xmm.cc' ] obj.source += [ 'sse_functions_64bit_win.s', 'sse_avx_functions_64bit_win.s' ] - avx_sources = [ 'sse_functions_avx.cc' ] + avx_sources = [ 'sse_functions_avx.cc', 'x86_functions_fma.cc' ] elif bld.env['build_target'] == 'aarch64': obj.source += ['arm_neon_functions.cc'] obj.defines += [ 'ARM_NEON_SUPPORT' ] @@ -512,6 +512,7 @@ def build(bld): # compile it with -mavx flag - append avx flag to the existing avx_cxxflags = list(bld.env['CXXFLAGS']) avx_cxxflags.append (bld.env['compiler_flags_dict']['avx']) + avx_cxxflags.append (bld.env['compiler_flags_dict']['fma']) avx_cxxflags.append (bld.env['compiler_flags_dict']['pic']) bld(features = 'cxx cxxstlib asm', source = avx_sources, diff --git a/libs/ardour/x86_functions_fma.cc b/libs/ardour/x86_functions_fma.cc new file mode 100644 index 0000000000..8ad8ed0226 --- /dev/null +++ b/libs/ardour/x86_functions_fma.cc @@ -0,0 +1,136 @@ +/* + * Copyright (C) 2020 Ayan Shafqat + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
#if defined(__has_include)
# if __has_include("ardour/mix.h")
#  include "ardour/mix.h"
# endif
#endif

#include <immintrin.h>
#include <stdint.h>

#define IS_ALIGNED_TO(ptr, bytes) (((uintptr_t)(ptr)) % (bytes) == 0)

/**
 * @brief x86-64 AVX/FMA optimized routine for mixing a buffer with gain.
 *
 * Computes dst[i] += src[i] * gain for nframes samples.
 *
 * @param[in,out] dst Pointer to destination buffer, which gets updated
 * @param[in] src Pointer to source buffer (not updated)
 * @param nframes Number of samples to process
 * @param gain Gain to apply
 */
#if defined(__GNUC__) && !defined(__FMA__)
/* No-op when the TU is already built with -mavx -mfma (as the wscript does),
 * but keeps the intrinsics usable if those flags are ever missed. */
__attribute__ ((target ("avx,fma")))
#endif
void
x86_fma_mix_buffers_with_gain(
	float *dst,
	const float *src,
	uint32_t nframes,
	float gain)
{
	/* Scalar prologue: process one sample at a time until both pointers
	 * are 32-byte aligned (or we run out of frames). If src and dst have
	 * different alignment offsets this drains everything scalar — slow,
	 * but correct, and the aligned vector loads below stay safe. */
	{
		const __m128 g0 = _mm_set_ss(gain); // Should be a no-op

		while (!(IS_ALIGNED_TO(src, sizeof(__m256)) &&
		         IS_ALIGNED_TO(dst, sizeof(__m256))) &&
		       (nframes > 0)) {

			__m128 x0 = _mm_load_ss(src);
			__m128 y0 = _mm_load_ss(dst);
			/* fmadd(a, b, c) == (a * b) + c  =>  src * gain + dst */
			__m128 z0 = _mm_fmadd_ss(x0, g0, y0);
			_mm_store_ss(dst, z0);

			++dst;
			++src;
			--nframes;
		}
	}

	/* Main loop: use two YMM registers to process 16 samples per iteration */
	{
		const __m256 g0 = _mm256_set1_ps(gain);

		while (nframes >= 16) {
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
			/* _MM_HINT_NTA is defined by both MSVC and MinGW headers */
			_mm_prefetch(((char *)dst + (16 * sizeof(float))), _MM_HINT_NTA);
			_mm_prefetch(((char *)src + (16 * sizeof(float))), _MM_HINT_NTA);
#else
			/* NOTE: pointer arithmetic on float* — the offset is in
			 * floats, so prefetch 16 floats (64 bytes) ahead, matching
			 * the MSVC branch above. */
			__builtin_prefetch(src + 16, 0, 0);
			__builtin_prefetch(dst + 16, 0, 0);
#endif
			// Load sources
			__m256 s0 = _mm256_load_ps(src + 0);
			__m256 s1 = _mm256_load_ps(src + 8);

			// Load destinations
			__m256 d0 = _mm256_load_ps(dst + 0);
			__m256 d1 = _mm256_load_ps(dst + 8);

			/* dst = dst + (src * gain).
			 * fmadd(a, b, c) computes (a * b) + c, so gain and src are
			 * the multiplicands and dst is the addend — NOT
			 * fmadd(g0, d0, s0), which would compute gain*dst + src. */
			d0 = _mm256_fmadd_ps(g0, s0, d0);
			d1 = _mm256_fmadd_ps(g0, s1, d1);

			// Store result
			_mm256_store_ps(dst + 0, d0);
			_mm256_store_ps(dst + 8, d1);

			// Update pointers and counters
			src += 16;
			dst += 16;
			nframes -= 16;
		}

		// Process the remaining samples 8 at a time
		while (nframes >= 8) {
			__m256 s0 = _mm256_load_ps(src);
			__m256 d0 = _mm256_load_ps(dst);
			// dst = dst + (src * gain)
			d0 = _mm256_fmadd_ps(g0, s0, d0);
			_mm256_store_ps(dst, d0);

			src += 8;
			dst += 8;
			nframes -= 8;
		}
	}

	/* There's a penalty going from AVX mode to SSE mode. This can be
	 * avoided by telling the CPU that the rest of the routine is no
	 * longer interested in the upper portion of the YMM registers. */
	_mm256_zeroupper();

	// Scalar epilogue: process the remaining (< 8) samples one at a time.
	{
		const __m128 g0 = _mm_set_ss(gain); // Should be a no-op

		while (nframes > 0) {
			__m128 x0 = _mm_load_ss(src);
			__m128 y0 = _mm_load_ss(dst);
			__m128 z0 = _mm_fmadd_ss(x0, g0, y0);
			_mm_store_ss(dst, z0);
			++dst;
			++src;
			--nframes;
		}
	}
}
() const { return _flags & HasSSE; } bool has_sse2 () const { return _flags & HasSSE2; } bool has_avx () const { return _flags & HasAVX; } + bool has_fma() const { return _flags & HasFMA; } bool has_neon () const { return _flags & HasNEON; } private: diff --git a/wscript b/wscript index c5d1014158..80d9aa00b8 100644 --- a/wscript +++ b/wscript @@ -88,6 +88,8 @@ compiler_flags_dictionaries= { 'attasm': '-masm=att', # Flags to make AVX instructions/intrinsics available 'avx': '-mavx', + # Flags to make FMA instructions/intrinsics available + 'fma': '-mfma', # Flags to make ARM/NEON instructions/intrinsics available 'neon': '-mfpu=neon', # Flags to generate position independent code, when needed to build a shared object