Added optimized AVX function for sample processing
Added AVX versions of the 5 existing SSE functions. Added a 6th AVX function to copy vectors, which is 1.5 times faster than memcpy. Data consistency and validity were fully tested after processing with the new AVX functions on aligned and non-aligned buffers.
This commit is contained in:
parent
e2a76746e6
commit
6410aa896f
@ -33,7 +33,17 @@ extern "C" {
|
||||
LIBARDOUR_API void x86_sse_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
|
||||
}
|
||||
|
||||
extern "C" {
|
||||
/* AVX functions */
|
||||
LIBARDOUR_API float x86_sse_avx_compute_peak (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float current);
|
||||
LIBARDOUR_API void x86_sse_avx_apply_gain_to_buffer (ARDOUR::Sample * buf, ARDOUR::pframes_t nframes, float gain);
|
||||
LIBARDOUR_API void x86_sse_avx_mix_buffers_with_gain(ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes, float gain);
|
||||
LIBARDOUR_API void x86_sse_avx_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
|
||||
LIBARDOUR_API void x86_sse_avx_copy_vector (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
|
||||
}
|
||||
|
||||
LIBARDOUR_API void x86_sse_find_peaks (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float *min, float *max);
|
||||
LIBARDOUR_API void x86_sse_avx_find_peaks (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float *min, float *max);
|
||||
|
||||
/* debug wrappers for SSE functions */
|
||||
|
||||
@ -41,6 +51,7 @@ LIBARDOUR_API float debug_compute_peak (const ARDOUR::Sample * buf
|
||||
LIBARDOUR_API void debug_apply_gain_to_buffer (ARDOUR::Sample * buf, ARDOUR::pframes_t nframes, float gain);
|
||||
LIBARDOUR_API void debug_mix_buffers_with_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes, float gain);
|
||||
LIBARDOUR_API void debug_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
|
||||
LIBARDOUR_API void debug_copy_vector (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
|
||||
|
||||
#endif
|
||||
|
||||
@ -61,5 +72,6 @@ LIBARDOUR_API void default_find_peaks (const ARDOUR::Sample * bu
|
||||
LIBARDOUR_API void default_apply_gain_to_buffer (ARDOUR::Sample * buf, ARDOUR::pframes_t nframes, float gain);
|
||||
LIBARDOUR_API void default_mix_buffers_with_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes, float gain);
|
||||
LIBARDOUR_API void default_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
|
||||
LIBARDOUR_API void default_copy_vector (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
|
||||
|
||||
#endif /* __ardour_mix_h__ */
|
||||
|
@ -25,17 +25,19 @@
|
||||
|
||||
namespace ARDOUR {
|
||||
|
||||
typedef float (*compute_peak_t) (const ARDOUR::Sample *, pframes_t, float);
|
||||
typedef void (*find_peaks_t) (const ARDOUR::Sample *, pframes_t, float *, float*);
|
||||
typedef float (*compute_peak_t) (const ARDOUR::Sample *, pframes_t, float);
|
||||
typedef void (*find_peaks_t) (const ARDOUR::Sample *, pframes_t, float *, float*);
|
||||
typedef void (*apply_gain_to_buffer_t) (ARDOUR::Sample *, pframes_t, float);
|
||||
typedef void (*mix_buffers_with_gain_t) (ARDOUR::Sample *, const ARDOUR::Sample *, pframes_t, float);
|
||||
typedef void (*mix_buffers_no_gain_t) (ARDOUR::Sample *, const ARDOUR::Sample *, pframes_t);
|
||||
typedef void (*copy_vector_t) (ARDOUR::Sample *, const ARDOUR::Sample *, pframes_t);
|
||||
|
||||
LIBARDOUR_API extern compute_peak_t compute_peak;
|
||||
LIBARDOUR_API extern find_peaks_t find_peaks;
|
||||
LIBARDOUR_API extern apply_gain_to_buffer_t apply_gain_to_buffer;
|
||||
LIBARDOUR_API extern mix_buffers_with_gain_t mix_buffers_with_gain;
|
||||
LIBARDOUR_API extern mix_buffers_no_gain_t mix_buffers_no_gain;
|
||||
LIBARDOUR_API extern copy_vector_t copy_vector;
|
||||
}
|
||||
|
||||
#endif /* __ardour_runtime_functions_h__ */
|
||||
|
@ -131,6 +131,7 @@ find_peaks_t ARDOUR::find_peaks = 0;
|
||||
apply_gain_to_buffer_t ARDOUR::apply_gain_to_buffer = 0;
|
||||
mix_buffers_with_gain_t ARDOUR::mix_buffers_with_gain = 0;
|
||||
mix_buffers_no_gain_t ARDOUR::mix_buffers_no_gain = 0;
|
||||
copy_vector_t ARDOUR::copy_vector = 0;
|
||||
|
||||
PBD::Signal1<void,std::string> ARDOUR::BootMessage;
|
||||
PBD::Signal3<void,std::string,std::string,bool> ARDOUR::PluginScanMessage;
|
||||
@ -160,7 +161,21 @@ setup_hardware_optimization (bool try_optimization)
|
||||
|
||||
#if defined (ARCH_X86) && defined (BUILD_SSE_OPTIMIZATIONS)
|
||||
|
||||
if (fpu.has_sse()) {
|
||||
if (fpu.has_avx()) {
|
||||
|
||||
info << "Using AVX optimized routines" << endmsg;
|
||||
|
||||
// AVX SET
|
||||
compute_peak = x86_sse_avx_compute_peak;
|
||||
find_peaks = x86_sse_avx_find_peaks;
|
||||
apply_gain_to_buffer = x86_sse_avx_apply_gain_to_buffer;
|
||||
mix_buffers_with_gain = x86_sse_avx_mix_buffers_with_gain;
|
||||
mix_buffers_no_gain = x86_sse_avx_mix_buffers_no_gain;
|
||||
copy_vector = x86_sse_avx_copy_vector;
|
||||
|
||||
generic_mix_functions = false;
|
||||
|
||||
} else if (fpu.has_sse()) {
|
||||
|
||||
info << "Using SSE optimized routines" << endmsg;
|
||||
|
||||
@ -170,6 +185,7 @@ setup_hardware_optimization (bool try_optimization)
|
||||
apply_gain_to_buffer = x86_sse_apply_gain_to_buffer;
|
||||
mix_buffers_with_gain = x86_sse_mix_buffers_with_gain;
|
||||
mix_buffers_no_gain = x86_sse_mix_buffers_no_gain;
|
||||
copy_vector = default_copy_vector;
|
||||
|
||||
generic_mix_functions = false;
|
||||
|
||||
@ -187,6 +203,7 @@ setup_hardware_optimization (bool try_optimization)
|
||||
apply_gain_to_buffer = veclib_apply_gain_to_buffer;
|
||||
mix_buffers_with_gain = veclib_mix_buffers_with_gain;
|
||||
mix_buffers_no_gain = veclib_mix_buffers_no_gain;
|
||||
copy_vector = default_copy_vector;
|
||||
|
||||
generic_mix_functions = false;
|
||||
|
||||
@ -206,6 +223,7 @@ setup_hardware_optimization (bool try_optimization)
|
||||
apply_gain_to_buffer = default_apply_gain_to_buffer;
|
||||
mix_buffers_with_gain = default_mix_buffers_with_gain;
|
||||
mix_buffers_no_gain = default_mix_buffers_no_gain;
|
||||
copy_vector = default_copy_vector;
|
||||
|
||||
info << "No H/W specific optimizations in use" << endmsg;
|
||||
}
|
||||
|
@ -136,6 +136,12 @@ default_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, p
|
||||
}
|
||||
}
|
||||
|
||||
/* Generic copy_vector implementation: copies nframes samples from src to dst
 * with a plain memcpy. Because memcpy is used, the source and destination
 * ranges must not overlap.
 */
void
|
||||
default_copy_vector (ARDOUR::Sample * dst, const ARDOUR::Sample * src, pframes_t nframes)
|
||||
{
|
||||
memcpy(dst, src, nframes*sizeof(ARDOUR::Sample));
|
||||
}
|
||||
|
||||
#if defined (__APPLE__) && defined (BUILD_VECLIB_OPTIMIZATIONS)
|
||||
#include <Accelerate/Accelerate.h>
|
||||
|
||||
|
@ -417,8 +417,12 @@ def build(bld):
|
||||
# not the build host, which in turn can only be inferred from the name
|
||||
# of the compiler.
|
||||
if re.search ('/^x86_64/', str(bld.env['CC'])):
|
||||
obj.source += [ 'sse_functions_xmm.cc', 'sse_functions_64bit_win.s' ]
|
||||
|
||||
obj.source += [ 'sse_functions_xmm.cc',
|
||||
'sse_functions_avx.cc',
|
||||
'sse_functions_64bit_win.s',
|
||||
'sse_avx_functions_64bit_win.s',
|
||||
]
|
||||
|
||||
# i18n
|
||||
if bld.is_defined('ENABLE_NLS'):
|
||||
mo_files = bld.path.ant_glob('po/*.mo')
|
||||
|
@ -21,6 +21,8 @@
|
||||
#include "waves_audioport.h"
|
||||
#include "waves_midiport.h"
|
||||
|
||||
#include "ardour/runtime_functions.h"
|
||||
|
||||
using namespace ARDOUR;
|
||||
|
||||
#if defined __MINGW64__ || defined __MINGW32__
|
||||
@ -1170,13 +1172,12 @@ WavesAudioBackend::_read_audio_data_from_device (const float* input_buffer, pfra
|
||||
{
|
||||
#if defined(PLATFORM_WINDOWS)
|
||||
const float **buffer = (const float**)input_buffer;
|
||||
size_t copied_bytes = nframes*sizeof(float);
|
||||
|
||||
for(std::vector<WavesAudioPort*>::iterator it = _physical_audio_inputs.begin ();
|
||||
it != _physical_audio_inputs.end();
|
||||
++it)
|
||||
{
|
||||
memcpy((*it)->buffer(), *buffer, copied_bytes);
|
||||
ARDOUR::copy_vector ((*it)->buffer(), *buffer, nframes);
|
||||
++buffer;
|
||||
}
|
||||
#else
|
||||
|
@ -35,20 +35,24 @@ void* WavesAudioPort::get_buffer (pframes_t nframes)
|
||||
std::vector<WavesDataPort*>::const_iterator it = get_connections ().begin ();
|
||||
|
||||
if (it != get_connections ().end ()) {
|
||||
/* In fact, the static casting to (const WavesAudioPort*) is not that safe.
|
||||
* However, mixing the buffers is assumed in the time critical conditions.
|
||||
* The base class WavesDataPort is supposed to provide enough consistency
|
||||
* of the connections.
|
||||
*/
|
||||
for (memcpy (_buffer, ((const WavesAudioPort*)*it)->const_buffer (), nframes * sizeof (Sample)), ++it;
|
||||
it != get_connections ().end ();
|
||||
++it) {
|
||||
Sample* tgt = buffer ();
|
||||
const Sample* src = ((const WavesAudioPort*)*it)->const_buffer ();
|
||||
for (uint32_t frame = 0; frame < nframes; ++frame, ++tgt, ++src) {
|
||||
*tgt += *src;
|
||||
}
|
||||
}
|
||||
/* In fact, the static casting to (const WavesAudioPort*) is not that safe.
|
||||
* However, mixing the buffers is assumed in the time critical conditions.
|
||||
* The base class WavesDataPort is supposed to provide enough consistency
|
||||
* of the connections.
|
||||
*/
|
||||
// get first buffer data
|
||||
// use the optimized function to fill the buffer initially
|
||||
ARDOUR::copy_vector (_buffer, ((const WavesAudioPort*)*it)->const_buffer (), nframes);
|
||||
++it;
|
||||
|
||||
// mix the rest
|
||||
for (; it != get_connections ().end (); ++it) {
|
||||
Sample* tgt = buffer ();
|
||||
const Sample* src = ((const WavesAudioPort*)*it)->const_buffer ();
|
||||
for (uint32_t frame = 0; frame < nframes; ++frame, ++tgt, ++src) {
|
||||
*tgt += *src;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return _buffer;
|
||||
@ -59,4 +63,4 @@ void
|
||||
WavesAudioPort::_wipe_buffer()
|
||||
{
|
||||
memset (_buffer, 0, sizeof (_buffer));
|
||||
}
|
||||
}
|
||||
|
@ -30,7 +30,8 @@ class LIBPBD_API FPU {
|
||||
HasFlushToZero = 0x1,
|
||||
HasDenormalsAreZero = 0x2,
|
||||
HasSSE = 0x4,
|
||||
HasSSE2 = 0x8
|
||||
HasSSE2 = 0x8,
|
||||
HasAVX = 0x10
|
||||
};
|
||||
|
||||
public:
|
||||
@ -41,6 +42,7 @@ class LIBPBD_API FPU {
|
||||
bool has_denormals_are_zero () const { return _flags & HasDenormalsAreZero; }
|
||||
bool has_sse () const { return _flags & HasSSE; }
|
||||
bool has_sse2 () const { return _flags & HasSSE2; }
|
||||
bool has_avx () const { return _flags & HasAVX; }
|
||||
|
||||
private:
|
||||
Flags _flags;
|
||||
|
@ -145,6 +145,7 @@ def build(bld):
|
||||
if bld.env['build_target'] == 'x86_64':
|
||||
obj.defines += [ 'USE_X86_64_ASM' ]
|
||||
if bld.env['build_target'] == 'mingw':
|
||||
obj.defines += [ 'NO_POSIX_MEMALIGN' ]
|
||||
obj.source += [ 'windows_special_dirs.cc' ]
|
||||
obj.uselib += ' OLE'
|
||||
|
||||
|
4
wscript
4
wscript
@ -417,12 +417,12 @@ int main() { return 0; }''',
|
||||
if (re.search ("(x86_64|AMD64)", cpu) != None):
|
||||
# on Windows sse is supported by 64 bit platforms only
|
||||
build_host_supports_sse = True
|
||||
|
||||
|
||||
# the mingw GCC compiler uses the AT&T (Unix-specific) assembler dialect by default
|
||||
# compiler_flags.append (["--mmnemonic=att", "msyntax=att")
|
||||
|
||||
compiler_flags.extend ([ flags_dict['sse'], flags_dict['fpmath-sse'], flags_dict['xmmintrinsics'], flags_dict['attasm'] ])
|
||||
|
||||
|
||||
# end of processor-specific section
|
||||
|
||||
# optimization section
|
||||
|
Loading…
Reference in New Issue
Block a user