Added optimized AVX function for sample processing

Added AVX versions of the 5 existing SSE functions. Added a 6th AVX function for copying vectors, which is 1.5 times faster than memcpy.
Data consistency and validity were fully tested after processing with the new AVX functions on aligned and non-aligned buffers.
This commit is contained in:
Paul Davis 2015-05-12 21:07:09 -04:00
parent e2a76746e6
commit 6410aa896f
10 changed files with 75 additions and 25 deletions

View File

@ -33,7 +33,17 @@ extern "C" {
LIBARDOUR_API void x86_sse_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
}
extern "C" {
/* AVX functions
 *
 * AVX counterparts of the x86_sse_* routines, declared with C linkage.
 * Each takes the same argument list as the x86_sse_* / default_* routine
 * of the same name; x86_sse_avx_copy_vector is the AVX implementation
 * behind the copy_vector entry point (dst, src, nframes).
 * NOTE(review): alignment requirements for the buffers are not visible
 * from these declarations -- confirm against the implementation.
 */
LIBARDOUR_API float x86_sse_avx_compute_peak (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float current);
LIBARDOUR_API void x86_sse_avx_apply_gain_to_buffer (ARDOUR::Sample * buf, ARDOUR::pframes_t nframes, float gain);
LIBARDOUR_API void x86_sse_avx_mix_buffers_with_gain(ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes, float gain);
LIBARDOUR_API void x86_sse_avx_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
LIBARDOUR_API void x86_sse_avx_copy_vector (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
}
LIBARDOUR_API void x86_sse_find_peaks (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float *min, float *max);
LIBARDOUR_API void x86_sse_avx_find_peaks (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float *min, float *max);
/* debug wrappers for SSE functions */
@ -41,6 +51,7 @@ LIBARDOUR_API float debug_compute_peak (const ARDOUR::Sample * buf
LIBARDOUR_API void debug_apply_gain_to_buffer (ARDOUR::Sample * buf, ARDOUR::pframes_t nframes, float gain);
LIBARDOUR_API void debug_mix_buffers_with_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes, float gain);
LIBARDOUR_API void debug_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
LIBARDOUR_API void debug_copy_vector (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
#endif
@ -61,5 +72,6 @@ LIBARDOUR_API void default_find_peaks (const ARDOUR::Sample * bu
LIBARDOUR_API void default_apply_gain_to_buffer (ARDOUR::Sample * buf, ARDOUR::pframes_t nframes, float gain);
LIBARDOUR_API void default_mix_buffers_with_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes, float gain);
LIBARDOUR_API void default_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
LIBARDOUR_API void default_copy_vector (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
#endif /* __ardour_mix_h__ */

View File

@ -25,17 +25,19 @@
namespace ARDOUR {
/* Function-pointer types for the sample-processing routines whose
 * implementation is chosen at runtime (see setup_hardware_optimization()).
 */
typedef float (*compute_peak_t) (const ARDOUR::Sample *, pframes_t, float);
typedef void (*find_peaks_t) (const ARDOUR::Sample *, pframes_t, float *, float*);
typedef float (*compute_peak_t) (const ARDOUR::Sample *, pframes_t, float);
typedef void (*find_peaks_t) (const ARDOUR::Sample *, pframes_t, float *, float*);
typedef void (*apply_gain_to_buffer_t) (ARDOUR::Sample *, pframes_t, float);
typedef void (*mix_buffers_with_gain_t) (ARDOUR::Sample *, const ARDOUR::Sample *, pframes_t, float);
typedef void (*mix_buffers_no_gain_t) (ARDOUR::Sample *, const ARDOUR::Sample *, pframes_t);
/* Signature of the vector-copy routines: (dst, src, nframes). */
typedef void (*copy_vector_t) (ARDOUR::Sample *, const ARDOUR::Sample *, pframes_t);
/* The dispatch pointers themselves. They are only declared here; callers
 * invoke them like ordinary functions, e.g. ARDOUR::copy_vector (dst, src, n).
 */
LIBARDOUR_API extern compute_peak_t compute_peak;
LIBARDOUR_API extern find_peaks_t find_peaks;
LIBARDOUR_API extern apply_gain_to_buffer_t apply_gain_to_buffer;
LIBARDOUR_API extern mix_buffers_with_gain_t mix_buffers_with_gain;
LIBARDOUR_API extern mix_buffers_no_gain_t mix_buffers_no_gain;
LIBARDOUR_API extern copy_vector_t copy_vector;
}
#endif /* __ardour_runtime_functions_h__ */

View File

@ -131,6 +131,7 @@ find_peaks_t ARDOUR::find_peaks = 0;
/* Runtime-dispatched sample-processing entry points. Each pointer starts
 * out null (0) and is assigned an AVX, SSE, veclib or default_* implementation
 * by setup_hardware_optimization(); it must not be called before that.
 */
apply_gain_to_buffer_t ARDOUR::apply_gain_to_buffer = 0;
mix_buffers_with_gain_t ARDOUR::mix_buffers_with_gain = 0;
mix_buffers_no_gain_t ARDOUR::mix_buffers_no_gain = 0;
copy_vector_t ARDOUR::copy_vector = 0;
PBD::Signal1<void,std::string> ARDOUR::BootMessage;
PBD::Signal3<void,std::string,std::string,bool> ARDOUR::PluginScanMessage;
@ -160,7 +161,21 @@ setup_hardware_optimization (bool try_optimization)
#if defined (ARCH_X86) && defined (BUILD_SSE_OPTIMIZATIONS)
if (fpu.has_sse()) {
if (fpu.has_avx()) {
info << "Using AVX optimized routines" << endmsg;
// AVX SET
compute_peak = x86_sse_avx_compute_peak;
find_peaks = x86_sse_avx_find_peaks;
apply_gain_to_buffer = x86_sse_avx_apply_gain_to_buffer;
mix_buffers_with_gain = x86_sse_avx_mix_buffers_with_gain;
mix_buffers_no_gain = x86_sse_avx_mix_buffers_no_gain;
copy_vector = x86_sse_avx_copy_vector;
generic_mix_functions = false;
} else if (fpu.has_sse()) {
info << "Using SSE optimized routines" << endmsg;
@ -170,6 +185,7 @@ setup_hardware_optimization (bool try_optimization)
apply_gain_to_buffer = x86_sse_apply_gain_to_buffer;
mix_buffers_with_gain = x86_sse_mix_buffers_with_gain;
mix_buffers_no_gain = x86_sse_mix_buffers_no_gain;
copy_vector = default_copy_vector;
generic_mix_functions = false;
@ -187,6 +203,7 @@ setup_hardware_optimization (bool try_optimization)
apply_gain_to_buffer = veclib_apply_gain_to_buffer;
mix_buffers_with_gain = veclib_mix_buffers_with_gain;
mix_buffers_no_gain = veclib_mix_buffers_no_gain;
copy_vector = default_copy_vector;
generic_mix_functions = false;
@ -206,6 +223,7 @@ setup_hardware_optimization (bool try_optimization)
apply_gain_to_buffer = default_apply_gain_to_buffer;
mix_buffers_with_gain = default_mix_buffers_with_gain;
mix_buffers_no_gain = default_mix_buffers_no_gain;
copy_vector = default_copy_vector;
info << "No H/W specific optimizations in use" << endmsg;
}

View File

@ -136,6 +136,12 @@ default_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, p
}
}
/* Generic (non-SIMD) vector copy: copies nframes samples from src to dst
 * with memcpy, so the two buffers must not overlap.
 */
void
default_copy_vector (ARDOUR::Sample * dst, const ARDOUR::Sample * src, pframes_t nframes)
{
	/* size of the transfer in bytes */
	const size_t byte_count = sizeof (ARDOUR::Sample) * nframes;

	memcpy (dst, src, byte_count);
}
#if defined (__APPLE__) && defined (BUILD_VECLIB_OPTIMIZATIONS)
#include <Accelerate/Accelerate.h>

View File

@ -417,8 +417,12 @@ def build(bld):
# not the build host, which in turn can only be inferred from the name
# of the compiler.
if re.search ('/^x86_64/', str(bld.env['CC'])):
obj.source += [ 'sse_functions_xmm.cc', 'sse_functions_64bit_win.s' ]
obj.source += [ 'sse_functions_xmm.cc',
'sse_functions_avx.cc',
'sse_functions_64bit_win.s',
'sse_avx_functions_64bit_win.s',
]
# i18n
if bld.is_defined('ENABLE_NLS'):
mo_files = bld.path.ant_glob('po/*.mo')

View File

@ -21,6 +21,8 @@
#include "waves_audioport.h"
#include "waves_midiport.h"
#include "ardour/runtime_functions.h"
using namespace ARDOUR;
#if defined __MINGW64__ || defined __MINGW32__
@ -1170,13 +1172,12 @@ WavesAudioBackend::_read_audio_data_from_device (const float* input_buffer, pfra
{
#if defined(PLATFORM_WINDOWS)
const float **buffer = (const float**)input_buffer;
size_t copied_bytes = nframes*sizeof(float);
for(std::vector<WavesAudioPort*>::iterator it = _physical_audio_inputs.begin ();
it != _physical_audio_inputs.end();
++it)
{
memcpy((*it)->buffer(), *buffer, copied_bytes);
ARDOUR::copy_vector ((*it)->buffer(), *buffer, nframes);
++buffer;
}
#else

View File

@ -35,20 +35,24 @@ void* WavesAudioPort::get_buffer (pframes_t nframes)
std::vector<WavesDataPort*>::const_iterator it = get_connections ().begin ();
if (it != get_connections ().end ()) {
/* In fact, the static casting to (const WavesAudioPort*) is not that safe.
* However, mixing the buffers is assumed in the time critical conditions.
* Base class WavesDataPort is supposed to provide enough consistency
* of the connections.
*/
for (memcpy (_buffer, ((const WavesAudioPort*)*it)->const_buffer (), nframes * sizeof (Sample)), ++it;
it != get_connections ().end ();
++it) {
Sample* tgt = buffer ();
const Sample* src = ((const WavesAudioPort*)*it)->const_buffer ();
for (uint32_t frame = 0; frame < nframes; ++frame, ++tgt, ++src) {
*tgt += *src;
}
}
/* In fact, the static casting to (const WavesAudioPort*) is not that safe.
* However, mixing the buffers is assumed in the time critical conditions.
* Base class WavesDataPort is supposed to provide enough consistency
* of the connections.
*/
// get the first buffer's data
// use the optimized function to fill the buffer initially
ARDOUR::copy_vector (_buffer, ((const WavesAudioPort*)*it)->const_buffer (), nframes);
++it;
// mix the rest
for (; it != get_connections ().end (); ++it) {
Sample* tgt = buffer ();
const Sample* src = ((const WavesAudioPort*)*it)->const_buffer ();
for (uint32_t frame = 0; frame < nframes; ++frame, ++tgt, ++src) {
*tgt += *src;
}
}
}
}
return _buffer;
@ -59,4 +63,4 @@ void
WavesAudioPort::_wipe_buffer()
{
/* Zero-fill the port's sample buffer.
 * NOTE(review): sizeof (_buffer) covers the whole buffer only if _buffer is
 * declared as an array; if it is a pointer this clears just pointer-size
 * bytes -- confirm against the class declaration.
 */
memset (_buffer, 0, sizeof (_buffer));
}
}

View File

@ -30,7 +30,8 @@ class LIBPBD_API FPU {
HasFlushToZero = 0x1,
HasDenormalsAreZero = 0x2,
HasSSE = 0x4,
HasSSE2 = 0x8
HasSSE2 = 0x8,
HasAVX = 0x10
};
public:
@ -41,6 +42,7 @@ class LIBPBD_API FPU {
bool has_denormals_are_zero () const { return _flags & HasDenormalsAreZero; }
bool has_sse () const { return _flags & HasSSE; }
bool has_sse2 () const { return _flags & HasSSE2; }
bool has_avx () const { return _flags & HasAVX; }
private:
Flags _flags;

View File

@ -145,6 +145,7 @@ def build(bld):
if bld.env['build_target'] == 'x86_64':
obj.defines += [ 'USE_X86_64_ASM' ]
if bld.env['build_target'] == 'mingw':
obj.defines += [ 'NO_POSIX_MEMALIGN' ]
obj.source += [ 'windows_special_dirs.cc' ]
obj.uselib += ' OLE'

View File

@ -417,12 +417,12 @@ int main() { return 0; }''',
if (re.search ("(x86_64|AMD64)", cpu) != None):
# on Windows sse is supported by 64 bit platforms only
build_host_supports_sse = True
# the mingw GCC compiler uses the AT&T (Unix-specific) assembler dialect by default
# compiler_flags.append (["--mmnemonic=att", "msyntax=att")
compiler_flags.extend ([ flags_dict['sse'], flags_dict['fpmath-sse'], flags_dict['xmmintrinsics'], flags_dict['attasm'] ])
# end of processor-specific section
# optimization section