diff --git a/libs/ardour/ardour/mix.h b/libs/ardour/ardour/mix.h index c154442daf..d472873f30 100644 --- a/libs/ardour/ardour/mix.h +++ b/libs/ardour/ardour/mix.h @@ -45,6 +45,9 @@ extern "C" { #endif } +/* FMA functions */ +LIBARDOUR_API void x86_fma_mix_buffers_with_gain (float * dst, const float * src, uint32_t nframes, float gain); + LIBARDOUR_API void x86_sse_find_peaks (const float * buf, uint32_t nsamples, float *min, float *max); #ifdef PLATFORM_WINDOWS LIBARDOUR_API void x86_sse_avx_find_peaks (const float * buf, uint32_t nsamples, float *min, float *max); diff --git a/libs/ardour/globals.cc b/libs/ardour/globals.cc index 03d73b7b2d..6e18156731 100644 --- a/libs/ardour/globals.cc +++ b/libs/ardour/globals.cc @@ -188,7 +188,20 @@ setup_hardware_optimization (bool try_optimization) #if defined(ARCH_X86) && defined(BUILD_SSE_OPTIMIZATIONS) /* We have AVX-optimized code for Windows and Linux */ - if (fpu->has_avx ()) { + if (fpu->has_fma ()) { + info << "Using AVX and FMA optimized routines" << endmsg; + + // FMA SET (Shares a lot with AVX) + compute_peak = x86_sse_avx_compute_peak; + find_peaks = x86_sse_avx_find_peaks; + apply_gain_to_buffer = x86_sse_avx_apply_gain_to_buffer; + mix_buffers_with_gain = x86_fma_mix_buffers_with_gain; + mix_buffers_no_gain = x86_sse_avx_mix_buffers_no_gain; + copy_vector = x86_sse_avx_copy_vector; + + generic_mix_functions = false; + + } else if (fpu->has_avx ()) { info << "Using AVX optimized routines" << endmsg; // AVX SET diff --git a/libs/ardour/wscript b/libs/ardour/wscript index 5c4b383b9c..206652a2b2 100644 --- a/libs/ardour/wscript +++ b/libs/ardour/wscript @@ -475,10 +475,10 @@ def build(bld): if Options.options.fpu_optimization: if (bld.env['build_target'] == 'i386' or bld.env['build_target'] == 'i686'): obj.source += [ 'sse_functions_xmm.cc', 'sse_functions.s', ] - avx_sources = [ 'sse_functions_avx_linux.cc' ] + avx_sources = [ 'sse_functions_avx_linux.cc', 'x86_functions_fma.cc' ] elif bld.env['build_target'] == 
'x86_64': obj.source += [ 'sse_functions_xmm.cc', 'sse_functions_64bit.s', ] - avx_sources = [ 'sse_functions_avx_linux.cc' ] + avx_sources = [ 'sse_functions_avx_linux.cc', 'x86_functions_fma.cc' ] elif bld.env['build_target'] == 'mingw': # usability of the 64 bit windows assembler depends on the compiler target, # not the build host, which in turn can only be inferred from the name @@ -486,7 +486,7 @@ def build(bld): if re.search ('x86_64-w64', str(bld.env['CC'])): obj.source += [ 'sse_functions_xmm.cc' ] obj.source += [ 'sse_functions_64bit_win.s', 'sse_avx_functions_64bit_win.s' ] - avx_sources = [ 'sse_functions_avx.cc' ] + avx_sources = [ 'sse_functions_avx.cc', 'x86_functions_fma.cc' ] elif bld.env['build_target'] == 'aarch64': obj.source += ['arm_neon_functions.cc'] obj.defines += [ 'ARM_NEON_SUPPORT' ] @@ -512,6 +512,7 @@ def build(bld): # compile it with -mavx flag - append avx flag to the existing avx_cxxflags = list(bld.env['CXXFLAGS']) avx_cxxflags.append (bld.env['compiler_flags_dict']['avx']) + avx_cxxflags.append (bld.env['compiler_flags_dict']['fma']) avx_cxxflags.append (bld.env['compiler_flags_dict']['pic']) bld(features = 'cxx cxxstlib asm', source = avx_sources, diff --git a/libs/ardour/x86_functions_fma.cc b/libs/ardour/x86_functions_fma.cc new file mode 100644 index 0000000000..8ad8ed0226 --- /dev/null +++ b/libs/ardour/x86_functions_fma.cc @@ -0,0 +1,136 @@ +/* + * Copyright (C) 2020 Ayan Shafqat + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
#if defined(__has_include)
# if __has_include("ardour/mix.h")
#  include "ardour/mix.h"
# endif
#endif

#include <immintrin.h>
#include <stdint.h>

#define IS_ALIGNED_TO(ptr, bytes) (((uintptr_t)(ptr)) % (bytes) == 0)

/**
 * @brief x86-64 AVX/FMA optimized routine for mixing a buffer with gain.
 *
 * Computes dst[i] += src[i] * gain for nframes samples.
 *
 * @param[in,out] dst Pointer to destination buffer, which gets updated
 * @param[in] src Pointer to source buffer (not updated)
 * @param nframes Number of samples to process
 * @param gain Gain to apply
 */
#if defined(__GNUC__) && !defined(__FMA__)
/* No-op when the TU is already built with -mavx -mfma (as the wscript does),
 * but keeps the intrinsics usable if those flags are ever missed. */
__attribute__ ((target ("avx,fma")))
#endif
void
x86_fma_mix_buffers_with_gain(
	float *dst,
	const float *src,
	uint32_t nframes,
	float gain)
{
	/* Scalar prologue: process one sample at a time until both pointers
	 * are 32-byte aligned (or we run out of frames). If src and dst have
	 * different alignment offsets this drains everything scalar — slow,
	 * but correct, and the aligned vector loads below stay safe. */
	{
		const __m128 g0 = _mm_set_ss(gain); // Should be a no-op

		while (!(IS_ALIGNED_TO(src, sizeof(__m256)) &&
		         IS_ALIGNED_TO(dst, sizeof(__m256))) &&
		       (nframes > 0)) {

			__m128 x0 = _mm_load_ss(src);
			__m128 y0 = _mm_load_ss(dst);
			/* fmadd(a, b, c) == (a * b) + c  =>  src * gain + dst */
			__m128 z0 = _mm_fmadd_ss(x0, g0, y0);
			_mm_store_ss(dst, z0);

			++dst;
			++src;
			--nframes;
		}
	}

	/* Main loop: use two YMM registers to process 16 samples per iteration */
	{
		const __m256 g0 = _mm256_set1_ps(gain);

		while (nframes >= 16) {
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
			/* _MM_HINT_NTA is defined by both MSVC and MinGW headers */
			_mm_prefetch(((char *)dst + (16 * sizeof(float))), _MM_HINT_NTA);
			_mm_prefetch(((char *)src + (16 * sizeof(float))), _MM_HINT_NTA);
#else
			/* NOTE: pointer arithmetic on float* — the offset is in
			 * floats, so prefetch 16 floats (64 bytes) ahead, matching
			 * the MSVC branch above. */
			__builtin_prefetch(src + 16, 0, 0);
			__builtin_prefetch(dst + 16, 0, 0);
#endif
			// Load sources
			__m256 s0 = _mm256_load_ps(src + 0);
			__m256 s1 = _mm256_load_ps(src + 8);

			// Load destinations
			__m256 d0 = _mm256_load_ps(dst + 0);
			__m256 d1 = _mm256_load_ps(dst + 8);

			/* dst = dst + (src * gain).
			 * fmadd(a, b, c) computes (a * b) + c, so gain and src are
			 * the multiplicands and dst is the addend — NOT
			 * fmadd(g0, d0, s0), which would compute gain*dst + src. */
			d0 = _mm256_fmadd_ps(g0, s0, d0);
			d1 = _mm256_fmadd_ps(g0, s1, d1);

			// Store result
			_mm256_store_ps(dst + 0, d0);
			_mm256_store_ps(dst + 8, d1);

			// Update pointers and counters
			src += 16;
			dst += 16;
			nframes -= 16;
		}

		// Process the remaining samples 8 at a time
		while (nframes >= 8) {
			__m256 s0 = _mm256_load_ps(src);
			__m256 d0 = _mm256_load_ps(dst);
			// dst = dst + (src * gain)
			d0 = _mm256_fmadd_ps(g0, s0, d0);
			_mm256_store_ps(dst, d0);

			src += 8;
			dst += 8;
			nframes -= 8;
		}
	}

	/* There's a penalty going from AVX mode to SSE mode. This can be
	 * avoided by telling the CPU that the rest of the routine is no
	 * longer interested in the upper portion of the YMM registers. */
	_mm256_zeroupper();

	// Scalar epilogue: process the remaining (< 8) samples one at a time.
	{
		const __m128 g0 = _mm_set_ss(gain); // Should be a no-op

		while (nframes > 0) {
			__m128 x0 = _mm_load_ss(src);
			__m128 y0 = _mm_load_ss(dst);
			__m128 z0 = _mm_fmadd_ss(x0, g0, y0);
			_mm_store_ss(dst, z0);
			++dst;
			++src;
			--nframes;
		}
	}
}
() const { return _flags & HasSSE; } bool has_sse2 () const { return _flags & HasSSE2; } bool has_avx () const { return _flags & HasAVX; } + bool has_fma() const { return _flags & HasFMA; } bool has_neon () const { return _flags & HasNEON; } private: diff --git a/wscript b/wscript index c5d1014158..80d9aa00b8 100644 --- a/wscript +++ b/wscript @@ -88,6 +88,8 @@ compiler_flags_dictionaries= { 'attasm': '-masm=att', # Flags to make AVX instructions/intrinsics available 'avx': '-mavx', + # Flags to make FMA instructions/intrinsics available + 'fma': '-mfma', # Flags to make ARM/NEON instructions/intrinsics available 'neon': '-mfpu=neon', # Flags to generate position independent code, when needed to build a shared object