Added optimized AVX function for sample processing

Added AVX versions of the 5 existing SSE functions. Added a 6th AVX function to copy vectors, which is 1.5 times faster than memcpy. Data consistency and validity were fully tested after processing with the new AVX functions on aligned and non-aligned buffers.
2015-05-12 21:07:09 -04:00 · 2015-05-12 21:07:09 -04:00 · 6410aa896f
commit 6410aa896f
parent e2a76746e6
10 changed files with 75 additions and 25 deletions
--- a/libs/ardour/ardour/mix.h
+++ b/libs/ardour/ardour/mix.h
@ -33,7 +33,17 @@ extern "C" {
 	LIBARDOUR_API void  x86_sse_mix_buffers_no_gain  (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
 }

+extern "C" {
+/* AVX functions */
+	LIBARDOUR_API float x86_sse_avx_compute_peak         (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float current);
+	LIBARDOUR_API void  x86_sse_avx_apply_gain_to_buffer (ARDOUR::Sample * buf, ARDOUR::pframes_t nframes, float gain);
+	LIBARDOUR_API void  x86_sse_avx_mix_buffers_with_gain(ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes, float gain);
+	LIBARDOUR_API void  x86_sse_avx_mix_buffers_no_gain  (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
+	LIBARDOUR_API void  x86_sse_avx_copy_vector          (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
+}
+
 LIBARDOUR_API void  x86_sse_find_peaks               (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float *min, float *max);
+LIBARDOUR_API void  x86_sse_avx_find_peaks               (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float *min, float *max);

 /* debug wrappers for SSE functions */

@ -41,6 +51,7 @@ LIBARDOUR_API float debug_compute_peak               (const ARDOUR::Sample * buf
 LIBARDOUR_API void  debug_apply_gain_to_buffer       (ARDOUR::Sample * buf, ARDOUR::pframes_t nframes, float gain);
 LIBARDOUR_API void  debug_mix_buffers_with_gain      (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes, float gain);
 LIBARDOUR_API void  debug_mix_buffers_no_gain        (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
+LIBARDOUR_API void  debug_copy_vector                (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);

 #endif

@ -61,5 +72,6 @@ LIBARDOUR_API void  default_find_peaks                (const ARDOUR::Sample * bu
 LIBARDOUR_API void  default_apply_gain_to_buffer      (ARDOUR::Sample * buf, ARDOUR::pframes_t nframes, float gain);
 LIBARDOUR_API void  default_mix_buffers_with_gain     (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes, float gain);
 LIBARDOUR_API void  default_mix_buffers_no_gain       (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
+LIBARDOUR_API void  default_copy_vector				  (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);

 #endif /* __ardour_mix_h__ */
--- a/libs/ardour/ardour/runtime_functions.h
+++ b/libs/ardour/ardour/runtime_functions.h
@ -25,17 +25,19 @@

 namespace ARDOUR {

-	typedef float (*compute_peak_t)			(const ARDOUR::Sample *, pframes_t, float);
-	typedef void  (*find_peaks_t)                   (const ARDOUR::Sample *, pframes_t, float *, float*);
+	typedef float (*compute_peak_t)			    (const ARDOUR::Sample *, pframes_t, float);
+	typedef void  (*find_peaks_t)               (const ARDOUR::Sample *, pframes_t, float *, float*);
 	typedef void  (*apply_gain_to_buffer_t)		(ARDOUR::Sample *, pframes_t, float);
 	typedef void  (*mix_buffers_with_gain_t)	(ARDOUR::Sample *, const ARDOUR::Sample *, pframes_t, float);
 	typedef void  (*mix_buffers_no_gain_t)		(ARDOUR::Sample *, const ARDOUR::Sample *, pframes_t);
+	typedef void  (*copy_vector_t)			    (ARDOUR::Sample *, const ARDOUR::Sample *, pframes_t);

 	LIBARDOUR_API extern compute_peak_t		compute_peak;
 	LIBARDOUR_API extern find_peaks_t               find_peaks;
 	LIBARDOUR_API extern apply_gain_to_buffer_t	apply_gain_to_buffer;
 	LIBARDOUR_API extern mix_buffers_with_gain_t	mix_buffers_with_gain;
 	LIBARDOUR_API extern mix_buffers_no_gain_t	mix_buffers_no_gain;
+	LIBARDOUR_API extern copy_vector_t			copy_vector;
 }

 #endif /* __ardour_runtime_functions_h__ */
--- a/libs/ardour/globals.cc
+++ b/libs/ardour/globals.cc
@ -131,6 +131,7 @@ find_peaks_t            ARDOUR::find_peaks = 0;
 apply_gain_to_buffer_t  ARDOUR::apply_gain_to_buffer = 0;
 mix_buffers_with_gain_t ARDOUR::mix_buffers_with_gain = 0;
 mix_buffers_no_gain_t   ARDOUR::mix_buffers_no_gain = 0;
+copy_vector_t			ARDOUR::copy_vector = 0;

 PBD::Signal1<void,std::string> ARDOUR::BootMessage;
 PBD::Signal3<void,std::string,std::string,bool> ARDOUR::PluginScanMessage;
@ -160,7 +161,21 @@ setup_hardware_optimization (bool try_optimization)

 #if defined (ARCH_X86) && defined (BUILD_SSE_OPTIMIZATIONS)

-		if (fpu.has_sse()) {
+		if (fpu.has_avx()) {
+
+			info << "Using AVX optimized routines" << endmsg;
+
+			// AVX SET
+			compute_peak          = x86_sse_avx_compute_peak;
+			find_peaks            = x86_sse_avx_find_peaks;
+			apply_gain_to_buffer  = x86_sse_avx_apply_gain_to_buffer;
+			mix_buffers_with_gain = x86_sse_avx_mix_buffers_with_gain;
+			mix_buffers_no_gain   = x86_sse_avx_mix_buffers_no_gain;
+			copy_vector           = x86_sse_avx_copy_vector;
+
+			generic_mix_functions = false;
+
+		} else if (fpu.has_sse()) {

 			info << "Using SSE optimized routines" << endmsg;

@ -170,6 +185,7 @@ setup_hardware_optimization (bool try_optimization)
 			apply_gain_to_buffer  = x86_sse_apply_gain_to_buffer;
 			mix_buffers_with_gain = x86_sse_mix_buffers_with_gain;
 			mix_buffers_no_gain   = x86_sse_mix_buffers_no_gain;
+			copy_vector           = default_copy_vector;

 			generic_mix_functions = false;

@ -187,6 +203,7 @@ setup_hardware_optimization (bool try_optimization)
 			apply_gain_to_buffer   = veclib_apply_gain_to_buffer;
 			mix_buffers_with_gain  = veclib_mix_buffers_with_gain;
 			mix_buffers_no_gain    = veclib_mix_buffers_no_gain;
+			copy_vector            = default_copy_vector;

 			generic_mix_functions = false;

@ -206,6 +223,7 @@ setup_hardware_optimization (bool try_optimization)
 		apply_gain_to_buffer  = default_apply_gain_to_buffer;
 		mix_buffers_with_gain = default_mix_buffers_with_gain;
 		mix_buffers_no_gain   = default_mix_buffers_no_gain;
+		copy_vector           = default_copy_vector;

 		info << "No H/W specific optimizations in use" << endmsg;
 	}
--- a/libs/ardour/mix.cc
+++ b/libs/ardour/mix.cc
@ -136,6 +136,12 @@ default_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, p
 	}
 }

+void
+default_copy_vector (ARDOUR::Sample * dst, const ARDOUR::Sample * src, pframes_t nframes)
+{
+	memcpy(dst, src, nframes*sizeof(ARDOUR::Sample));
+}
+
 #if defined (__APPLE__) && defined (BUILD_VECLIB_OPTIMIZATIONS)
 #include <Accelerate/Accelerate.h>

--- a/libs/ardour/wscript
+++ b/libs/ardour/wscript
@ -417,8 +417,12 @@ def build(bld):
 	        # not the build host, which in turn can only be inferred from the name
 	        # of the compiler. 
 	        if re.search ('/^x86_64/', str(bld.env['CC'])):
-		        obj.source += [ 'sse_functions_xmm.cc', 'sse_functions_64bit_win.s' ]
-
+		        obj.source += [ 'sse_functions_xmm.cc',
+		                        'sse_functions_avx.cc',
+		                        'sse_functions_64bit_win.s',
+		                        'sse_avx_functions_64bit_win.s',
+	                              ]
+        
    # i18n
    if bld.is_defined('ENABLE_NLS'):
        mo_files = bld.path.ant_glob('po/*.mo')
--- a/libs/backends/wavesaudio/waves_audiobackend.cc
+++ b/libs/backends/wavesaudio/waves_audiobackend.cc
@ -21,6 +21,8 @@
 #include "waves_audioport.h"
 #include "waves_midiport.h"

+#include "ardour/runtime_functions.h"
+
 using namespace ARDOUR;

 #if defined __MINGW64__ || defined __MINGW32__
@ -1170,13 +1172,12 @@ WavesAudioBackend::_read_audio_data_from_device (const float* input_buffer, pfra
 {
 #if defined(PLATFORM_WINDOWS)
    const float **buffer = (const float**)input_buffer;
-    size_t copied_bytes = nframes*sizeof(float);

    for(std::vector<WavesAudioPort*>::iterator it = _physical_audio_inputs.begin ();
        it != _physical_audio_inputs.end();
        ++it)
    {
-        memcpy((*it)->buffer(), *buffer, copied_bytes);
+		ARDOUR::copy_vector ((*it)->buffer(), *buffer, nframes);
        ++buffer;
    }
 #else
--- a/libs/backends/wavesaudio/waves_audioport.cc
+++ b/libs/backends/wavesaudio/waves_audioport.cc
@ -35,20 +35,24 @@ void* WavesAudioPort::get_buffer (pframes_t nframes)
        std::vector<WavesDataPort*>::const_iterator it = get_connections ().begin ();
        
        if (it != get_connections ().end ()) {
-            /* In fact, the static casting to (const WavesAudioPort*) is not that safe.
-             * However, mixing the buffers is assumed in the time critical conditions.
-             * Base class WavesDataPort takes is supposed to provide enough consistentcy
-             * of the connections.
-             */
-            for (memcpy (_buffer, ((const WavesAudioPort*)*it)->const_buffer (), nframes * sizeof (Sample)), ++it;
-				 it != get_connections ().end ();
-				 ++it) {
-                Sample* tgt = buffer ();
-                const Sample* src = ((const WavesAudioPort*)*it)->const_buffer ();
-                for (uint32_t frame = 0; frame < nframes; ++frame, ++tgt, ++src)    {
-                    *tgt += *src;
-                }
-            }
+	        /* In fact, the static casting to (const WavesAudioPort*) is not that safe.
+	         * However, mixing the buffers is assumed in the time critical conditions.
+	         * Base class WavesDataPort is supposed to provide enough consistency
+	         * of the connections.
+	         */
+	        // get first buffer data
+	        // use the optimized function to fill the buffer initially
+	        ARDOUR::copy_vector (_buffer, ((const WavesAudioPort*)*it)->const_buffer (), nframes);
+	        ++it;
+	        
+	        // mix the rest
+	        for (; it != get_connections ().end (); ++it) {
+		        Sample* tgt = buffer ();
+		        const Sample* src = ((const WavesAudioPort*)*it)->const_buffer ();
+		        for (uint32_t frame = 0; frame < nframes; ++frame, ++tgt, ++src)    {
+			        *tgt += *src;
+		        }
+	        }
        }
    }
    return _buffer;
@ -59,4 +63,4 @@ void
 WavesAudioPort::_wipe_buffer()
 {
 	memset (_buffer, 0, sizeof (_buffer));
-}
+}
--- a/libs/pbd/pbd/fpu.h
+++ b/libs/pbd/pbd/fpu.h
@ -30,7 +30,8 @@ class LIBPBD_API FPU {
 		HasFlushToZero = 0x1,
 		HasDenormalsAreZero = 0x2,
 		HasSSE = 0x4,
-		HasSSE2 = 0x8
+		HasSSE2 = 0x8,
+		HasAVX = 0x10
 	};

  public:
@ -41,6 +42,7 @@ class LIBPBD_API FPU {
 	bool has_denormals_are_zero () const { return _flags & HasDenormalsAreZero; }
 	bool has_sse () const { return _flags & HasSSE; }
 	bool has_sse2 () const { return _flags & HasSSE2; }
+	bool has_avx () const { return _flags & HasAVX; }
 	
  private:
 	Flags _flags;
--- a/libs/pbd/wscript
+++ b/libs/pbd/wscript
@ -145,6 +145,7 @@ def build(bld):
    if bld.env['build_target'] == 'x86_64':
        obj.defines += [ 'USE_X86_64_ASM' ]
    if bld.env['build_target'] == 'mingw':
+        obj.defines += [ 'NO_POSIX_MEMALIGN' ]
        obj.source += [ 'windows_special_dirs.cc' ]
        obj.uselib += ' OLE'

--- a/4
+++ b/4
@ -417,12 +417,12 @@ int main() { return 0; }''',
            if (re.search ("(x86_64|AMD64)", cpu) != None):
                # on Windows sse is supported by 64 bit platforms only
                build_host_supports_sse = True
-                
+
                # mingw GCC compiler to uses at&t (Unix specific) assembler dialect by default
                # compiler_flags.append (["--mmnemonic=att", "msyntax=att")
                
                compiler_flags.extend ([ flags_dict['sse'], flags_dict['fpmath-sse'], flags_dict['xmmintrinsics'], flags_dict['attasm'] ])
-                
+
    # end of processor-specific section

    # optimization section