Add support for Intel/AMD's FMA extension

By supporting the FMA extension, the number of instructions needed
for the multiply-accumulate used to mix channels is reduced. Since
this extension has been around since mid-2012, an increasing number
of computers have this instruction set available.
Ayan Shafqat 2021-01-01 14:01:37 -05:00 committed by Robin Gareus
parent f188a1ad10
commit 407882d23d
Signed by: rgareus
GPG Key ID: A090BCE02CF57F04
7 changed files with 167 additions and 5 deletions
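
As a point of reference (not part of the commit, and the function name below is hypothetical), this is the scalar form of the multiply-accumulate that the new routine vectorizes; built with -mfma, a compiler can contract the multiply and add in the loop body into a single vfmadd instruction.

#include <stdint.h>

/* Reference sketch only -- not part of this commit. */
static void
mix_buffers_with_gain_ref (float *dst, const float *src, uint32_t nframes, float gain)
{
        /* dst = dst + (src * gain), one sample at a time */
        for (uint32_t i = 0; i < nframes; ++i) {
                dst[i] += src[i] * gain;
        }
}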

@@ -45,6 +45,9 @@ extern "C" {
#endif
}
/* FMA functions */
LIBARDOUR_API void x86_fma_mix_buffers_with_gain (float * dst, const float * src, uint32_t nframes, float gain);
LIBARDOUR_API void x86_sse_find_peaks (const float * buf, uint32_t nsamples, float *min, float *max);
#ifdef PLATFORM_WINDOWS
LIBARDOUR_API void x86_sse_avx_find_peaks (const float * buf, uint32_t nsamples, float *min, float *max);

@@ -188,7 +188,20 @@ setup_hardware_optimization (bool try_optimization)
#if defined(ARCH_X86) && defined(BUILD_SSE_OPTIMIZATIONS)
/* We have AVX-optimized code for Windows and Linux */
if (fpu->has_avx ()) {
if (fpu->has_fma ()) {
info << "Using AVX and FMA optimized routines" << endmsg;
// FMA SET (Shares a lot with AVX)
compute_peak = x86_sse_avx_compute_peak;
find_peaks = x86_sse_avx_find_peaks;
apply_gain_to_buffer = x86_sse_avx_apply_gain_to_buffer;
mix_buffers_with_gain = x86_fma_mix_buffers_with_gain;
mix_buffers_no_gain = x86_sse_avx_mix_buffers_no_gain;
copy_vector = x86_sse_avx_copy_vector;
generic_mix_functions = false;
} else if (fpu->has_avx ()) {
info << "Using AVX optimized routines" << endmsg;
// AVX SET

@@ -475,10 +475,10 @@ def build(bld):
if Options.options.fpu_optimization:
if (bld.env['build_target'] == 'i386' or bld.env['build_target'] == 'i686'):
obj.source += [ 'sse_functions_xmm.cc', 'sse_functions.s', ]
avx_sources = [ 'sse_functions_avx_linux.cc' ]
avx_sources = [ 'sse_functions_avx_linux.cc', 'x86_functions_fma.cc' ]
elif bld.env['build_target'] == 'x86_64':
obj.source += [ 'sse_functions_xmm.cc', 'sse_functions_64bit.s', ]
avx_sources = [ 'sse_functions_avx_linux.cc' ]
avx_sources = [ 'sse_functions_avx_linux.cc', 'x86_functions_fma.cc' ]
elif bld.env['build_target'] == 'mingw':
# usability of the 64 bit windows assembler depends on the compiler target,
# not the build host, which in turn can only be inferred from the name
@@ -486,7 +486,7 @@ def build(bld):
if re.search ('x86_64-w64', str(bld.env['CC'])):
obj.source += [ 'sse_functions_xmm.cc' ]
obj.source += [ 'sse_functions_64bit_win.s', 'sse_avx_functions_64bit_win.s' ]
avx_sources = [ 'sse_functions_avx.cc' ]
avx_sources = [ 'sse_functions_avx.cc', 'x86_functions_fma.cc' ]
elif bld.env['build_target'] == 'aarch64':
obj.source += ['arm_neon_functions.cc']
obj.defines += [ 'ARM_NEON_SUPPORT' ]
@@ -512,6 +512,7 @@ def build(bld):
# compile it with -mavx flag - append avx flag to the existing
avx_cxxflags = list(bld.env['CXXFLAGS'])
avx_cxxflags.append (bld.env['compiler_flags_dict']['avx'])
avx_cxxflags.append (bld.env['compiler_flags_dict']['fma'])
avx_cxxflags.append (bld.env['compiler_flags_dict']['pic'])
bld(features = 'cxx cxxstlib asm',
source = avx_sources,

@@ -0,0 +1,136 @@
/*
* Copyright (C) 2020 Ayan Shafqat <ayan.x.shafqat@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "ardour/mix.h"
#include <immintrin.h>
#include <xmmintrin.h>
#define IS_ALIGNED_TO(ptr, bytes) (((uintptr_t)ptr) % (bytes) == 0)
/**
* @brief x86-64 AVX/FMA optimized routine for mixing a buffer with gain.
*
* @param[in,out] dst Pointer to destination buffer, which gets updated
* @param[in] src Pointer to source buffer (not updated)
* @param nframes Number of samples to process
* @param gain Gain to apply
*/
void
x86_fma_mix_buffers_with_gain(
float *dst,
const float *src,
uint32_t nframes,
float gain)
{
// While the buffers aren't 32-byte aligned, process one sample at a time
do {
__m128 g0 = _mm_set_ss(gain); // Should be a no-op
while (!(IS_ALIGNED_TO(src, sizeof(__m256)) &&
IS_ALIGNED_TO(dst, sizeof(__m256))) &&
(nframes > 0)) {
__m128 x0 = _mm_load_ss(src);
__m128 y0 = _mm_load_ss(dst);
__m128 z0 = _mm_fmadd_ss(x0, g0, y0);
_mm_store_ss(dst, z0);
++dst;
++src;
--nframes;
}
} while (0);
// Use AVX registers to process 16 samples in parallel
do {
__m256 g0 = _mm256_set1_ps(gain);
while (nframes >= 16) {
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
_mm_prefetch(((char *)dst + (16 * sizeof(float))), _mm_hint(0));
_mm_prefetch(((char *)src + (16 * sizeof(float))), _mm_hint(0));
#else
__builtin_prefetch(src + (16 * sizeof(float)), 0, 0);
__builtin_prefetch(dst + (16 * sizeof(float)), 0, 0);
#endif
__m256 s0, s1;
__m256 d0, d1;
// Load sources
s0 = _mm256_load_ps(src + 0);
s1 = _mm256_load_ps(src + 8);
// Load destinations
d0 = _mm256_load_ps(dst + 0);
d1 = _mm256_load_ps(dst + 8);
// dst = dst + (src * gain)
d0 = _mm256_fmadd_ps(g0, s0, d0);
d1 = _mm256_fmadd_ps(g0, s1, d1);
// Store result
_mm256_store_ps(dst + 0, d0);
_mm256_store_ps(dst + 8, d1);
// Update pointers and counters
src += 16;
dst += 16;
nframes -= 16;
}
// Process the remaining samples 8 at a time
while (nframes >= 8) {
__m256 s0, d0;
// Load sources
s0 = _mm256_load_ps(src + 0 );
// Load destinations
d0 = _mm256_load_ps(dst + 0 );
// dst = dst + (src * gain)
d0 = _mm256_fmadd_ps(g0, s0, d0);
// Store result
_mm256_store_ps(dst, d0);
// Update pointers and counters
src += 8;
dst += 8;
nframes -= 8;
}
} while (0);
// There's a penalty when transitioning from AVX code to SSE code. This
// can be avoided by assuring the CPU that the rest of the routine is no
// longer interested in the upper portion of the YMM registers.
_mm256_zeroupper(); // zeros the upper portion of the YMM registers
// Process the remaining samples, one sample at a time.
do {
__m128 g0 = _mm_set_ss(gain); // Should be a no-op
while (nframes > 0) {
__m128 x0 = _mm_load_ss(src);
__m128 y0 = _mm_load_ss(dst);
__m128 z0 = _mm_fmadd_ss(x0, g0, y0);
_mm_store_ss(dst, z0);
++dst;
++src;
--nframes;
}
} while (0);
}
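
For reference, and not part of the commit: _mm256_fmadd_ps(a, b, c) computes (a * b) + c per lane, which is why the gain vector multiplies the source operand. A minimal standalone sketch of the core step (the function name is hypothetical; it assumes a CPU with AVX and FMA and a build with -mavx -mfma):

#include <immintrin.h>

/* Illustration only: mix 8 samples with one fused multiply-add. */
static void
mix8_with_gain (float *dst, const float *src, float gain)
{
        __m256 g = _mm256_set1_ps (gain);
        __m256 s = _mm256_loadu_ps (src); /* unaligned loads, for simplicity */
        __m256 d = _mm256_loadu_ps (dst);
        d = _mm256_fmadd_ps (g, s, d);    /* d = (gain * src) + dst */
        _mm256_storeu_ps (dst, d);
}

The committed routine adds what this sketch omits: alignment handling, prefetching, and _mm256_zeroupper() before returning to SSE code.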

@@ -211,7 +211,12 @@ FPU::FPU ()
(cpu_info[2] & (1<<28) /* AVX */) &&
((_xgetbv (_XCR_XFEATURE_ENABLED_MASK) & 0x6) == 0x6)) { /* OS really supports XSAVE */
info << _("AVX-capable processor") << endmsg;
_flags = Flags (_flags | (HasAVX) );
_flags = Flags (_flags | (HasAVX));
}
if (cpu_info[2] & (1<<12) /* FMA */) {
info << _("AVX with FMA capable processor") << endmsg;
_flags = Flags (_flags | (HasFMA));
}
if (cpu_info[3] & (1<<25)) {
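
The new check above reads bit 12 of ECX from CPUID leaf 1, the FMA feature flag. As a standalone cross-check (not part of the commit), GCC and Clang expose the same information through compiler builtins:

#include <stdio.h>

/* Illustration only: query the same CPUID feature bit via compiler
 * builtins (GCC/Clang on x86). */
int
main (void)
{
        __builtin_cpu_init ();
        printf ("FMA: %s\n", __builtin_cpu_supports ("fma") ? "yes" : "no");
        return 0;
}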

@@ -33,6 +33,7 @@ class LIBPBD_API FPU {
HasSSE2 = 0x8,
HasAVX = 0x10,
HasNEON = 0x20,
HasFMA = 0x40,
};
public:
@@ -46,6 +47,7 @@ class LIBPBD_API FPU {
bool has_sse () const { return _flags & HasSSE; }
bool has_sse2 () const { return _flags & HasSSE2; }
bool has_avx () const { return _flags & HasAVX; }
bool has_fma () const { return _flags & HasFMA; }
bool has_neon () const { return _flags & HasNEON; }
private:

@@ -88,6 +88,8 @@ compiler_flags_dictionaries= {
'attasm': '-masm=att',
# Flags to make AVX instructions/intrinsics available
'avx': '-mavx',
# Flags to make FMA instructions/intrinsics available
'fma': '-mfma',
# Flags to make ARM/NEON instructions/intrinsics available
'neon': '-mfpu=neon',
# Flags to generate position independent code, when needed to build a shared object
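
A quick way to confirm the new 'fma' entry actually reaches the compiler (again, not part of the commit): GCC and Clang define the __FMA__ macro when -mfma is in effect, so a guard like the following fails the build if the flag is dropped.

/* Illustration only: compile-time guard for translation units that use
 * FMA intrinsics (GCC/Clang define __FMA__ when -mfma is in effect). */
#ifndef __FMA__
#error "This file must be compiled with -mfma (or an equivalent -march setting)"
#endif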