livetrax/libs/soundtouch/mmx_win.cpp

////////////////////////////////////////////////////////////////////////////////
///
/// Win32 version of the MMX optimized routines. All MMX optimized functions
/// have been gathered into this single source code file, regardless of their
/// class or original source code file, in order to ease porting the library
/// to other compiler and processor platforms.
///
/// This file is to be compiled on the Windows platform with the Microsoft Visual
/// C++ compiler. Please see 'mmx_gcc.cpp' for the gcc compiler version for all
/// GNU platforms.
///
/// Author        : Copyright (c) Olli Parviainen
/// Author e-mail : oparviai @ iki.fi
/// SoundTouch WWW: http://www.iki.fi/oparviai/soundtouch
///
////////////////////////////////////////////////////////////////////////////////
//
// Last changed  : $Date$
// File revision : $Revision$
//
// $Id$
//
////////////////////////////////////////////////////////////////////////////////
//
// License :
//
//  SoundTouch audio processing library
//  Copyright (c) Olli Parviainen
//
//  This library is free software; you can redistribute it and/or
//  modify it under the terms of the GNU Lesser General Public
//  License as published by the Free Software Foundation; either
//  version 2.1 of the License, or (at your option) any later version.
//
//  This library is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
//  Lesser General Public License for more details.
//
//  You should have received a copy of the GNU Lesser General Public
//  License along with this library; if not, write to the Free Software
//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//
////////////////////////////////////////////////////////////////////////////////

#include "STTypes.h"

#ifndef WIN32
#error "wrong platform - this source code file is exclusively for Win32 platform"
#endif

using namespace soundtouch;

#ifdef ALLOW_MMX
// MMX routines available only with integer sample type    

//////////////////////////////////////////////////////////////////////////////
//
// implementation of MMX optimized functions of class 'TDStretchMMX'
//
//////////////////////////////////////////////////////////////////////////////

#include "TDStretch.h"
#include <limits.h>

// these are declared in 'TDStretch.cpp'
// NOTE(review): 'scanOffsets' is not referenced by any routine visible in this
// file - confirm whether this declaration is still needed here.
extern int scanOffsets[4][24];

// Calculates cross correlation of two buffers
//
// MMX implementation. Each round of the assembly below reads 32 bytes (16 shorts)
// from both 'pV1' and 'pV2', multiplies the 16-bit values pairwise and sums adjacent
// products (pmaddwd), arithmetically right-shifts the 32-bit partial sums by
// 'overlapDividerBits' and adds them to a running total kept in mm0.
// 'overlapLength / 8' rounds are executed in all: 'overlapLength / 8 - 1' of them
// inside loop1 plus one final round after it. The two 32-bit halves of mm0 are then
// added together and that sum is returned.
//
// NOTE(review): loop1 is entered without testing ecx first, so the counter
// 'overlapLength / 8 - 1' must be non-zero on entry (i.e. overlapLength >= 16);
// with a smaller value the 'dec ecx / jnz' pair would not stop where intended.
// Presumably the class guarantees this - confirm against the code that sets
// 'overlapLength'.
//
// The routine does not execute EMMS itself; see clearCrossCorrState().
long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const
{
    long corr;
    // Stack copies of the members so the inline assembly can name them directly.
    uint local_overlapLength = overlapLength;
    uint local_overlapDividerBits = overlapDividerBits;

    _asm 
    {
        ; Calculate cross-correlation between the tempOffset and tmpbid_buffer.
        ;
        ; Process 4 parallel batches of 2 * stereo samples each during one 
        ; round to improve CPU-level parallellization.
        ;
        ; load address of sloped pV2 buffer to eax
        ; load address of mixing point of the sample data buffer to ebx
        ; load counter to ecx = overlapLength / 8 - 1
        ; empty the mm0 
        ;
        ; prepare to the first round by loading 
        ; load mm1 = eax[0]
        ; load mm2 = eax[1];

        // Register roles as set up below: eax = pV1, ebx = pV2, ecx = loop counter,
        // mm0 = accumulator, mm5 = shift count.
        mov         eax, dword ptr pV1
        mov         ebx, dword ptr pV2

        movq        mm1, qword ptr [eax]
        mov         ecx, local_overlapLength

        movq        mm2, qword ptr [eax+8]
        shr         ecx, 3

        pxor        mm0, mm0
        sub         ecx, 1
        
        movd        mm5, local_overlapDividerBits

    loop1:
        ; multiply-add mm1 = mm1 * ebx[0]
        ; multiply-add mm2 = mm2 * ebx[1]
        ;
        ; add mm2 += mm1
        ; mm2 >>= mm5 (=overlapDividerBits)
        ; add mm0 += mm2
        ;
        ; load mm3 = eax[2]
        ; multiply-add mm3 = mm3 * ebx[2]
        ;
        ; load mm4 = eax[3]
        ; multiply-add mm4 = mm4 * ebx[3]
        ;
        ; add mm3 += mm4
        ; mm3 >>= mm5 (=overlapDividerBits)
        ; add mm0 += mm3
        ;
        ; add eax += 4;
        ; add ebx += 4
        ; load mm1 = eax[0] (~eax[4])
        ; load mm2 = eax[1] (~eax[5])
        ;
        ; loop

        // The loads of mm1/mm2 near the end of this round fetch the first two
        // quadwords of the NEXT round, which is why the last round is handled
        // separately after the loop without those loads.
        pmaddwd     mm1, qword ptr [ebx]
        movq        mm3, qword ptr [eax+16]

        pmaddwd     mm2, qword ptr [ebx+8]
        movq        mm4, qword ptr [eax+24]

        pmaddwd     mm3, qword ptr [ebx+16]
        paddd       mm2, mm1

        pmaddwd     mm4, qword ptr [ebx+24]
        movq        mm1, qword ptr [eax+32]

        psrad       mm2, mm5
        add         eax, 32

        paddd       mm3, mm4
        paddd       mm0, mm2

        movq        mm2, qword ptr [eax+8]
        psrad       mm3, mm5

        add         ebx, 32
        paddd       mm0, mm3

        dec         ecx
        jnz         loop1

        ; Finalize the last partial loop:

        movq        mm3, qword ptr [eax+16]
        pmaddwd     mm1, qword ptr [ebx]

        movq        mm4, qword ptr [eax+24]
        pmaddwd     mm2, qword ptr [ebx+8]

        pmaddwd     mm3, qword ptr [ebx+16]
        paddd       mm2, mm1

        pmaddwd     mm4, qword ptr [ebx+24]
        psrad       mm2, mm5

        paddd       mm3, mm4
        paddd       mm0, mm2

        psrad       mm3, mm5
        paddd       mm0, mm3

        ; copy hi-dword of mm0 to lo-dword of mm1, then sum mmo+mm1
        ; and finally store the result into the variable "corr"

        movq        mm1, mm0
        psrlq       mm1, 32
        paddd       mm0, mm1
        movd        corr, mm0
    }
    return corr;
    
    // Note: Warning about the missing EMMS instruction is harmless
    // as it'll be called elsewhere.
}


// Executes the EMMS instruction. calcCrossCorrStereo() uses the MMX registers
// without issuing EMMS itself, so this is the place where that is done.
void TDStretchMMX::clearCrossCorrState()
{
    _asm
    {
        emms
    }
}


// MMX-optimized version of the function overlapStereo
//
// Cross-fades the contents of 'pMidBuffer' into 'input' and writes the result to
// 'output'. For the k:th stereo frame (k = 0, 1, 2, ...) both channels are computed as
//
//     (mid * (overlapLength - k) + input * k) >> overlapDividerBits
//
// with the result saturated to 16 bits by packssdw. The weights live in mm6/mm7 as
// 16-bit words and are stepped by the constant in mm5 (-2 for the 'mid' weight,
// +2 for the 'input' weight) after each use.
//
// Each pass of loop1 handles four stereo frames (16 bytes from each source buffer,
// 16 bytes written) and the loop runs 'overlapLength / 4' times.
//
// NOTE(review): loop1 is entered without testing ecx, so 'overlapLength / 4' must be
// non-zero; presumably guaranteed by the class - confirm. Because the weights are
// 16-bit words, overlapLength is also assumed to fit in 16 bits - confirm.
void TDStretchMMX::overlapStereo(short *output, const short *input) const
{
    // Stack copies of the members so the inline assembly can name them directly.
    short *local_midBuffer = pMidBuffer;
    uint local_overlapLength = overlapLength;
    uint local_overlapDividerBits = overlapDividerBits;

    _asm 
    {
        ; load sliding mixing value counter to mm6 and mm7
        ; load counter value to ecx = overlapLength / 4
        ; load divider-shifter value to esi
        ; load mixing value adder to mm5
        ; load address of midBuffer to eax
        ; load address of inputBuffer added with ovlOffset to ebx
        ; load address of end of the outputBuffer to edx

        // NOTE: the trailing comments on the next two instructions say "ecx", but
        // the registers actually loaded are eax and edi.
        mov         eax, local_overlapLength        ; ecx = 0x0000 OVL_
        mov         edi, 0x0002fffe     ; ecx = 0x0002 fffe

        mov            esi, local_overlapDividerBits
        movd        mm6, eax            ; mm6 = 0x0000 0000 0000 OVL_

        mov         ecx, eax;
        sub         eax, 1

        punpckldq   mm6, mm6            ; mm6 = 0x0000 OVL_ 0000 OVL_
        mov         edx, output

        or          eax, 0x00010000     ; eax = 0x0001 overlapLength-1
        mov         ebx, dword ptr input

        // The "01ff" in the comments below is the example value overlapLength-1
        // for an overlapLength of 0x200.
        movd        mm5, edi            ; mm5 = 0x0000 0000 0002 fffe
        movd        mm7, eax            ; mm7 = 0x0000 0000 0001 01ff

        mov         eax, dword ptr local_midBuffer
        punpckldq   mm5, mm5            ; mm5 = 0x0002 fffe 0002 fffe

        // NOTE: a right shift by 2 yields overlapLength / 4 (as the header comment
        // of this block says), not "/ 2" as the trailing comment states.
        shr         ecx, 2              ; ecx = overlapLength / 2
        punpckldq   mm7, mm7            ; mm7 = 0x0001 01ff 0001 01ff

    loop1:
        ; Process two parallel batches of 2+2 stereo samples during each round 
        ; to improve CPU-level parallellization.
        ;
        ; Load [eax] into mm0 and mm1
        ; Load [ebx] into mm3
        ; unpack words of mm0, mm1 and mm3 into mm0 and mm1
        ; multiply-add mm0*mm6 and mm1*mm7, store results into mm0 and mm1
        ; divide mm0 and mm1 by 512 (=right-shift by overlapDividerBits)
        ; pack the result into mm0 and store into [edx]
        ;
        ; Load [eax+8] into mm2 and mm3
        ; Load [ebx+8] into mm4
        ; unpack words of mm2, mm3 and mm4 into mm2 and mm3
        ; multiply-add mm2*mm6 and mm3*mm7, store results into mm2 and mm3
        ; divide mm2 and mm3 by 512 (=right-shift by overlapDividerBits)
        ; pack the result into mm2 and store into [edx+8]

                
        // edx is advanced at the top of the round; the stores at the bottom
        // therefore use negative offsets ([edx-16], [edx-8]).
        movq        mm0, qword ptr [eax]    ; mm0 = m1l m1r m0l m0r
        add         edx, 16

        movq        mm3, qword ptr [ebx]    ; mm3 = i1l i1r i0l i0r
        movq        mm1, mm0                ; mm1 = m1l m1r m0l m0r

        movq        mm2, qword ptr [eax+8]  ; mm2 = m3l m3r m2l m2r
        punpcklwd   mm0, mm3                ; mm0 = i0l m0l i0r m0r

        movq        mm4, qword ptr [ebx+8]  ; mm4 = i3l i3r i2l i2r
        punpckhwd   mm1, mm3                ; mm1 = i1l m1l i1r m1r

        movq        mm3, mm2                ; mm3 = m3l m3r m2l m2r
        punpcklwd   mm2, mm4                ; mm2 = i2l m2l i2r m2r

        pmaddwd     mm0, mm6                ; mm0 = i0l*m63+m0l*m62 i0r*m61+m0r*m60
        punpckhwd   mm3, mm4                ; mm3 = i3l m3l i3r m3r

        movd        mm4, esi                ; mm4 = overlapDividerBits

        // First weight step of the round: mm6/mm7 move on to the weights for the
        // third and fourth frame.
        pmaddwd     mm1, mm7                ; mm1 = i1l*m73+m1l*m72 i1r*m71+m1r*m70
        paddw       mm6, mm5

        paddw       mm7, mm5
        psrad       mm0, mm4                ; mmo >>= overlapDividerBits

        pmaddwd     mm2, mm6                ; mm2 = i2l*m63+m2l*m62 i2r*m61+m2r*m60
        psrad       mm1, mm4                ; mm1 >>= overlapDividerBits

        pmaddwd     mm3, mm7                ; mm3 = i3l*m73+m3l*m72 i3r*m71+m3r*m70
        psrad       mm2, mm4                ; mm2 >>= overlapDividerBits

        packssdw    mm0, mm1                ; mm0 = mm1h mm1l mm0h mm0l
        psrad       mm3, mm4                ; mm3 >>= overlapDividerBits

        // Second weight step of the round: prepares mm6/mm7 for the next round.
        add         eax, 16
        paddw       mm6, mm5

        packssdw    mm2, mm3                ; mm2 = mm2h mm2l mm3h mm3l
        paddw       mm7, mm5

        movq        qword ptr [edx-16], mm0
        add         ebx, 16

        movq        qword ptr [edx-8], mm2
        dec         ecx
    
        jnz         loop1

        emms
    }
}


//////////////////////////////////////////////////////////////////////////////
//
// implementation of MMX optimized functions of class 'FIRFilter'
//
//////////////////////////////////////////////////////////////////////////////

#include "FIRFilter.h"


// Constructs the filter with no coefficient storage allocated yet.
//
// Both the raw allocation pointer and the aligned pointer derived from it start out
// as NULL, so that neither is ever read while indeterminate: evaluateFilterStereo()
// copies 'filterCoeffsAlign' into a local before it checks the filter length.
FIRFilterMMX::FIRFilterMMX() : FIRFilter()
{
    filterCoeffsUnalign = NULL;
    filterCoeffsAlign = NULL;
}


// Releases the coefficient storage. Only 'filterCoeffsUnalign' is deleted, as it is
// the pointer that was returned by new[]; 'filterCoeffsAlign' merely points into the
// same allocation. delete[] on a NULL pointer is a no-op, so this is safe even if
// setCoefficients() was never called.
FIRFilterMMX::~FIRFilterMMX()
{
    delete[] filterCoeffsUnalign;
}


// (overloaded) Calculates filter coefficients for MMX routine
//
// Forwards the arguments to FIRFilter::setCoefficients() and then builds a second,
// 16-byte aligned copy of the coefficients in the layout that
// evaluateFilterStereo() loads: every group of four coefficients c0 c1 c2 c3 is
// expanded to the eight shorts
//
//     c0 c2 c0 c2   c1 c3 c1 c3
//
// coeffs           : source coefficients; 'coeffs[i + 3]' is read for every i that is
//                    a multiple of four and smaller than 'length'.
// newLength        : number of coefficients; determines the size of the allocation.
// uResultDivFactor : passed through to the base class unchanged.
void FIRFilterMMX::setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor)
{
    uint i;

    // The loop below is bounded by the 'length' member (presumably updated by this
    // base-class call - confirm in FIRFilter), not directly by 'newLength'.
    FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);

    // Drop the old storage and clear both pointers BEFORE allocating again. If the
    // allocation below throws, the members must not keep pointing at freed memory,
    // otherwise the destructor would delete the same block a second time.
    delete[] filterCoeffsUnalign;
    filterCoeffsUnalign = NULL;
    filterCoeffsAlign = NULL;

    // Ensure that filter coeffs array is aligned to 16-byte boundary. The 8 extra
    // shorts (16 bytes) leave room for rounding the address up by at most 15 bytes.
    // The pointer-to-uint cast is only lossless on a 32-bit target, which is what
    // this Win32 inline-assembly file is built for.
    filterCoeffsUnalign = new short[2 * newLength + 8];
    filterCoeffsAlign = (short *)(((uint)filterCoeffsUnalign + 15) & -16);

    // rearrange the filter coefficients for mmx routines 
    for (i = 0; i < length; i += 4) 
    {
        const short *in = coeffs + i;
        short *out = filterCoeffsAlign + 2 * i;

        // even-numbered taps of the group, stored twice
        out[0] = in[0];
        out[1] = in[2];
        out[2] = in[0];
        out[3] = in[2];

        // odd-numbered taps of the group, stored twice
        out[4] = in[1];
        out[5] = in[3];
        out[6] = in[1];
        out[7] = in[3];
    }
}


// mmx-optimized version of the filter routine for stereo sound
//
// Runs the FIR filter over interleaved 16-bit stereo data. Every pass of the outer
// loop (loop1) produces two stereo output frames (8 bytes) and advances the source
// pointer by two frames; the inner loop (loop2) runs 'lengthDiv8' times and consumes
// 32 bytes of the rearranged coefficient table built by setCoefficients() per pass.
// The accumulated sums are arithmetically right-shifted by 'resultDivFactor' and
// packed to 16 bits with saturation before being stored.
//
// dest       : receives the filtered frames.
// src        : source frames.
// numSamples : value the outer-loop count '(numSamples - length) / 2' is derived from.
//
// Returns '(numSamples & 0xfffffffe) - length', or 0 when there is nothing to do.
//
// Both assembly loops decrement their counter before testing it, so entering either
// of them with a zero (or, for the outer loop, negative) count would make it run for
// billions of passes and write far beyond 'dest'. The checks ahead of the assembly
// block rule those cases out.
uint FIRFilterMMX::evaluateFilterStereo(short *dest, const short *src, const uint numSamples) const
{
    // Create stack copies of the needed member variables for asm routines :
    uint local_length = length;
    uint local_lengthDiv8 = lengthDiv8;
    uint local_resultDivider = resultDivFactor;
    short *local_filterCoeffs = (short*)filterCoeffsAlign;

    if (local_length < 2) return 0;

    // loop2 counter would start at zero
    if (local_lengthDiv8 == 0) return 0;

    // loop1 counter '(numSamples - length) / 2' would start at zero or below
    if (numSamples < local_length || numSamples - local_length < 2) return 0;

    _asm 
    {
        ; Load (num_samples-aa_filter_length)/2 to edi as a i
        ; Load a pointer to samples to esi
        ; Load a pointer to destination to edx

        mov         edi, numSamples
        mov         esi, dword ptr src
        sub         edi, local_length
        mov         edx, dword ptr dest
        sar         edi, 1

        ; Load filter length/8 to ecx
        ; Load pointer to samples from esi to ebx
        ; Load counter from edi to ecx
        ; Load [ebx] to mm3
        ; Load pointer to filter coefficients to eax
loop1:
        // mm0 and mm7 are the accumulators for the two output frames of this pass.
        mov         ebx, esi
        pxor        mm0, mm0

        mov         ecx, local_lengthDiv8
        pxor        mm7, mm7

        movq        mm1, [ebx]              ; mm1 = l1 r1 l0 r0
        mov         eax, local_filterCoeffs
loop2:

        movq        mm2, [ebx+8]            ; mm2 = l3 r3 l2 r2
        movq        mm4, mm1                ; mm4 = l1 r1 l0 r0

        movq        mm3, [ebx+16]           ; mm3 = l5 r5 l4 r4
        punpckhwd   mm1, mm2                ; mm1 = l3 l1 r3 r1

        movq        mm6, mm2                ; mm6 = l3 r3 l2 r2
        punpcklwd   mm4, mm2                ; mm4 = l2 l0 r2 r0

        movq        mm2, qword ptr [eax]    ; mm2 = f2 f0 f2 f0
        movq        mm5, mm1                ; mm5 = l3 l1 r3 r1

        punpcklwd   mm6, mm3                ; mm6 = l4 l2 r4 r2
        pmaddwd     mm4, mm2                ; mm4 = l2*f2+l0*f0 r2*f2+r0*f0

        pmaddwd     mm5, mm2                ; mm5 = l3*f2+l1*f0 r3*f2+l1*f0
        movq        mm2, qword ptr [eax+8]  ; mm2 = f3 f1 f3 f1

        paddd       mm0, mm4                ; mm0 += s02*f02
        movq        mm4, mm3                ; mm4 = l1 r1 l0 r0

        pmaddwd     mm1, mm2                ; mm1 = l3*f3+l1*f1 r3*f3+l1*f1
        paddd       mm7, mm5                ; mm7 += s13*f02

        pmaddwd     mm6, mm2                ; mm6 = l4*f3+l2*f1 r4*f3+f4*f1
        movq        mm2, [ebx+24]           ; mm2 = l3 r3 l2 r2

        // mm1 is reloaded here with the data that the next pass of loop2 starts from.
        paddd       mm0, mm1                ; mm0 += s31*f31
        movq        mm1, [ebx+32]           ; mm1 = l5 r5 l4 r4

        paddd       mm7, mm6                ; mm7 += s42*f31
        punpckhwd   mm3, mm2                ; mm3 = l3 l1 r3 r1

        movq        mm6, mm2                ; mm6 = l3 r3 l2 r2
        punpcklwd   mm4, mm2                ; mm4 = l2 l0 r2 r0

        movq        mm2, qword ptr [eax+16] ; mm2 = f2 f0 f2 f0
        movq        mm5, mm3                ; mm5 = l3 l1 r3 r1

        punpcklwd   mm6, mm1                ; mm6 = l4 l2 r4 r2
        add         eax, 32

        pmaddwd     mm4, mm2                ; mm4 = l2*f2+l0*f0 r2*f2+r0*f0
        add         ebx, 32

        // eax has already been advanced by 32, hence the [eax-8] addressing.
        pmaddwd     mm5, mm2                ; mm5 = l3*f2+l1*f0 r3*f2+l1*f0
        movq        mm2, qword ptr [eax-8]  ; mm2 = f3 f1 f3 f1

        paddd       mm0, mm4                ; mm0 += s02*f02
        pmaddwd     mm3, mm2                ; mm3 = l3*f3+l1*f1 r3*f3+l1*f1

        paddd       mm7, mm5                ; mm7 += s13*f02
        pmaddwd     mm6, mm2                ; mm6 = l4*f3+l2*f1 r4*f3+f4*f1

        paddd       mm0, mm3                ; mm0 += s31*f31
        paddd       mm7, mm6                ; mm7 += s42*f31

        dec         ecx
        jnz         loop2

        ; Divide mm0 and mm7 by 8192 (= right-shift by 13),
        ; pack and store to [edx]
        // The shift count actually used is 'resultDivFactor'; 13 in the comment
        // above is an example value.
        movd        mm4, local_resultDivider;

        psrad       mm0, mm4                ; divider the result

        add         edx, 8
        psrad       mm7, mm4                ; divider the result

        add         esi, 8
        packssdw    mm0, mm7

        movq        qword ptr [edx-8], mm0
        dec         edi

        jnz         loop1

        emms
    }
    return (numSamples & 0xfffffffe) - local_length;
}

#endif  // ALLOW_MMX
Updated to soundtouch-1.3 (plus modifications) git-svn-id: svn://localhost/trunk/ardour2@13 d708f5d6-7413-0410-9779-e7cbd77b26cf 2005-05-18 12:08:37 -04:00			`////////////////////////////////////////////////////////////////////////////////`
			`///`
			`/// Win32 version of the MMX optimized routines. All MMX optimized functions`
			`/// have been gathered into this single source code file, regardless to their`
			`/// class or original source code file, in order to ease porting the library`
			`/// to other compiler and processor platforms.`
			`///`
			`/// This file is to be compiled in Windows platform with Microsoft Visual C++`
			`/// Compiler. Please see 'mmx_gcc.cpp' for the gcc compiler version for all`
			`/// GNU platforms.`
			`///`
			`/// Author : Copyright (c) Olli Parviainen`
			`/// Author e-mail : oparviai @ iki.fi`
			`/// SoundTouch WWW: http://www.iki.fi/oparviai/soundtouch`
			`///`
			`////////////////////////////////////////////////////////////////////////////////`
			`//`
			`// Last changed : $Date$`
			`// File revision : $Revision$`
			`//`
			`// $Id$`
			`//`
			`////////////////////////////////////////////////////////////////////////////////`
			`//`
			`// License :`
			`//`
			`// SoundTouch audio processing library`
			`// Copyright (c) Olli Parviainen`
			`//`
			`// This library is free software; you can redistribute it and/or`
			`// modify it under the terms of the GNU Lesser General Public`
			`// License as published by the Free Software Foundation; either`
			`// version 2.1 of the License, or (at your option) any later version.`
			`//`
			`// This library is distributed in the hope that it will be useful,`
			`// but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`// Lesser General Public License for more details.`
			`//`
			`// You should have received a copy of the GNU Lesser General Public`
			`// License along with this library; if not, write to the Free Software`
			`// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA`
			`//`
			`////////////////////////////////////////////////////////////////////////////////`

			`#include "STTypes.h"`

			`#ifndef WIN32`
			`#error "wrong platform - this source code file is exclusively for Win32 platform"`
			`#endif`

			`using namespace soundtouch;`

			`#ifdef ALLOW_MMX`
			`// MMX routines available only with integer sample type`

			`//////////////////////////////////////////////////////////////////////////////`
			`//`
			`// implementation of MMX optimized functions of class 'TDStretchMMX'`
			`//`
			`//////////////////////////////////////////////////////////////////////////////`

			`#include "TDStretch.h"`
			`#include <limits.h>`

			`// these are declared in 'TDStretch.cpp'`
			`extern int scanOffsets[4][24];`

			`// Calculates cross correlation of two buffers`
			`long TDStretchMMX::calcCrossCorrStereo(const short pV1, const short pV2) const`
			`{`
			`long corr;`
			`uint local_overlapLength = overlapLength;`
			`uint local_overlapDividerBits = overlapDividerBits;`

			`_asm`
			`{`
			`; Calculate cross-correlation between the tempOffset and tmpbid_buffer.`
			`;`
			`; Process 4 parallel batches of 2 * stereo samples each during one`
			`; round to improve CPU-level parallellization.`
			`;`
			`; load address of sloped pV2 buffer to eax`
			`; load address of mixing point of the sample data buffer to ebx`
			`; load counter to ecx = overlapLength / 8 - 1`
			`; empty the mm0`
			`;`
			`; prepare to the first round by loading`
			`; load mm1 = eax[0]`
			`; load mm2 = eax[1];`

			`mov eax, dword ptr pV1`
			`mov ebx, dword ptr pV2`

			`movq mm1, qword ptr [eax]`
			`mov ecx, local_overlapLength`

			`movq mm2, qword ptr [eax+8]`
			`shr ecx, 3`

			`pxor mm0, mm0`
			`sub ecx, 1`

			`movd mm5, local_overlapDividerBits`

			`loop1:`
			`; multiply-add mm1 = mm1 * ebx[0]`
			`; multiply-add mm2 = mm2 * ebx[1]`
			`;`
			`; add mm2 += mm1`
			`; mm2 >>= mm5 (=overlapDividerBits)`
			`; add mm0 += mm2`
			`;`
			`; load mm3 = eax[2]`
			`; multiply-add mm3 = mm3 * ebx[2]`
			`;`
			`; load mm4 = eax[3]`
			`; multiply-add mm4 = mm4 * ebx[3]`
			`;`
			`; add mm3 += mm4`
			`; mm3 >>= mm5 (=overlapDividerBits)`
			`; add mm0 += mm3`
			`;`
			`; add eax += 4;`
			`; add ebx += 4`
			`; load mm1 = eax[0] (~eax[4])`
			`; load mm2 = eax[1] (~eax[5])`
			`;`
			`; loop`

			`pmaddwd mm1, qword ptr [ebx]`
			`movq mm3, qword ptr [eax+16]`

			`pmaddwd mm2, qword ptr [ebx+8]`
			`movq mm4, qword ptr [eax+24]`

			`pmaddwd mm3, qword ptr [ebx+16]`
			`paddd mm2, mm1`

			`pmaddwd mm4, qword ptr [ebx+24]`
			`movq mm1, qword ptr [eax+32]`

			`psrad mm2, mm5`
			`add eax, 32`

			`paddd mm3, mm4`
			`paddd mm0, mm2`

			`movq mm2, qword ptr [eax+8]`
			`psrad mm3, mm5`

			`add ebx, 32`
			`paddd mm0, mm3`

			`dec ecx`
			`jnz loop1`

			`; Finalize the last partial loop:`

			`movq mm3, qword ptr [eax+16]`
			`pmaddwd mm1, qword ptr [ebx]`

			`movq mm4, qword ptr [eax+24]`
			`pmaddwd mm2, qword ptr [ebx+8]`

			`pmaddwd mm3, qword ptr [ebx+16]`
			`paddd mm2, mm1`

			`pmaddwd mm4, qword ptr [ebx+24]`
			`psrad mm2, mm5`

			`paddd mm3, mm4`
			`paddd mm0, mm2`

			`psrad mm3, mm5`
			`paddd mm0, mm3`

			`; copy hi-dword of mm0 to lo-dword of mm1, then sum mmo+mm1`
			`; and finally store the result into the variable "corr"`

			`movq mm1, mm0`
			`psrlq mm1, 32`
			`paddd mm0, mm1`
			`movd corr, mm0`
			`}`
			`return corr;`

			`// Note: Warning about the missing EMMS instruction is harmless`
			`// as it'll be called elsewhere.`
			`}`



			`void TDStretchMMX::clearCrossCorrState()`
			`{`
			`_asm EMMS;`
			`}`





			`// MMX-optimized version of the function overlapStereo`
			`void TDStretchMMX::overlapStereo(short output, const short input) const`
			`{`
			`short *local_midBuffer = pMidBuffer;`
			`uint local_overlapLength = overlapLength;`
			`uint local_overlapDividerBits = overlapDividerBits;`

			`_asm`
			`{`
			`; load sliding mixing value counter to mm6 and mm7`
			`; load counter value to ecx = overlapLength / 4`
			`; load divider-shifter value to esi`
			`; load mixing value adder to mm5`
			`; load address of midBuffer to eax`
			`; load address of inputBuffer added with ovlOffset to ebx`
			`; load address of end of the outputBuffer to edx`

			`mov eax, local_overlapLength ; ecx = 0x0000 OVL_`
			`mov edi, 0x0002fffe ; ecx = 0x0002 fffe`

			`mov esi, local_overlapDividerBits`
			`movd mm6, eax ; mm6 = 0x0000 0000 0000 OVL_`

			`mov ecx, eax;`
			`sub eax, 1`

			`punpckldq mm6, mm6 ; mm6 = 0x0000 OVL_ 0000 OVL_`
			`mov edx, output`

			`or eax, 0x00010000 ; eax = 0x0001 overlapLength-1`
			`mov ebx, dword ptr input`

			`movd mm5, edi ; mm5 = 0x0000 0000 0002 fffe`
			`movd mm7, eax ; mm7 = 0x0000 0000 0001 01ff`

			`mov eax, dword ptr local_midBuffer`
			`punpckldq mm5, mm5 ; mm5 = 0x0002 fffe 0002 fffe`

			`shr ecx, 2 ; ecx = overlapLength / 2`
			`punpckldq mm7, mm7 ; mm7 = 0x0001 01ff 0001 01ff`

			`loop1:`
			`; Process two parallel batches of 2+2 stereo samples during each round`
			`; to improve CPU-level parallellization.`
			`;`
			`; Load [eax] into mm0 and mm1`
			`; Load [ebx] into mm3`
			`; unpack words of mm0, mm1 and mm3 into mm0 and mm1`
			`; multiply-add mm0mm6 and mm1mm7, store results into mm0 and mm1`
			`; divide mm0 and mm1 by 512 (=right-shift by overlapDividerBits)`
			`; pack the result into mm0 and store into [edx]`
			`;`
			`; Load [eax+8] into mm2 and mm3`
			`; Load [ebx+8] into mm4`
			`; unpack words of mm2, mm3 and mm4 into mm2 and mm3`
			`; multiply-add mm2mm6 and mm3mm7, store results into mm2 and mm3`
			`; divide mm2 and mm3 by 512 (=right-shift by overlapDividerBits)`
			`; pack the result into mm2 and store into [edx+8]`


			`movq mm0, qword ptr [eax] ; mm0 = m1l m1r m0l m0r`
			`add edx, 16`

			`movq mm3, qword ptr [ebx] ; mm3 = i1l i1r i0l i0r`
			`movq mm1, mm0 ; mm1 = m1l m1r m0l m0r`

			`movq mm2, qword ptr [eax+8] ; mm2 = m3l m3r m2l m2r`
			`punpcklwd mm0, mm3 ; mm0 = i0l m0l i0r m0r`

			`movq mm4, qword ptr [ebx+8] ; mm4 = i3l i3r i2l i2r`
			`punpckhwd mm1, mm3 ; mm1 = i1l m1l i1r m1r`

			`movq mm3, mm2 ; mm3 = m3l m3r m2l m2r`
			`punpcklwd mm2, mm4 ; mm2 = i2l m2l i2r m2r`

			`pmaddwd mm0, mm6 ; mm0 = i0lm63+m0lm62 i0rm61+m0rm60`
			`punpckhwd mm3, mm4 ; mm3 = i3l m3l i3r m3r`

			`movd mm4, esi ; mm4 = overlapDividerBits`

			`pmaddwd mm1, mm7 ; mm1 = i1lm73+m1lm72 i1rm71+m1rm70`
			`paddw mm6, mm5`

			`paddw mm7, mm5`
			`psrad mm0, mm4 ; mmo >>= overlapDividerBits`

			`pmaddwd mm2, mm6 ; mm2 = i2lm63+m2lm62 i2rm61+m2rm60`
			`psrad mm1, mm4 ; mm1 >>= overlapDividerBits`

			`pmaddwd mm3, mm7 ; mm3 = i3lm73+m3lm72 i3rm71+m3rm70`
			`psrad mm2, mm4 ; mm2 >>= overlapDividerBits`

			`packssdw mm0, mm1 ; mm0 = mm1h mm1l mm0h mm0l`
			`psrad mm3, mm4 ; mm3 >>= overlapDividerBits`

			`add eax, 16`
			`paddw mm6, mm5`

			`packssdw mm2, mm3 ; mm2 = mm2h mm2l mm3h mm3l`
			`paddw mm7, mm5`

			`movq qword ptr [edx-16], mm0`
			`add ebx, 16`

			`movq qword ptr [edx-8], mm2`
			`dec ecx`

			`jnz loop1`

			`emms`
			`}`
			`}`


			`//////////////////////////////////////////////////////////////////////////////`
			`//`
			`// implementation of MMX optimized functions of class 'FIRFilter'`
			`//`
			`//////////////////////////////////////////////////////////////////////////////`

			`#include "FIRFilter.h"`


			`FIRFilterMMX::FIRFilterMMX() : FIRFilter()`
			`{`
			`filterCoeffsUnalign = NULL;`
			`}`


			`FIRFilterMMX::~FIRFilterMMX()`
			`{`
			`delete[] filterCoeffsUnalign;`
			`}`


			`// (overloaded) Calculates filter coefficients for MMX routine`
			`void FIRFilterMMX::setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor)`
			`{`
			`uint i;`
			`FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);`

			`// Ensure that filter coeffs array is aligned to 16-byte boundary`
			`delete[] filterCoeffsUnalign;`
			`filterCoeffsUnalign = new short[2 * newLength + 8];`
			`filterCoeffsAlign = (short *)(((uint)filterCoeffsUnalign + 15) & -16);`

			`// rearrange the filter coefficients for mmx routines`
			`for (i = 0;i < length; i += 4)`
			`{`
			`filterCoeffsAlign[2 * i + 0] = coeffs[i + 0];`
			`filterCoeffsAlign[2 * i + 1] = coeffs[i + 2];`
			`filterCoeffsAlign[2 * i + 2] = coeffs[i + 0];`
			`filterCoeffsAlign[2 * i + 3] = coeffs[i + 2];`

			`filterCoeffsAlign[2 * i + 4] = coeffs[i + 1];`
			`filterCoeffsAlign[2 * i + 5] = coeffs[i + 3];`
			`filterCoeffsAlign[2 * i + 6] = coeffs[i + 1];`
			`filterCoeffsAlign[2 * i + 7] = coeffs[i + 3];`
			`}`
			`}`



			`// mmx-optimized version of the filter routine for stereo sound`
			`uint FIRFilterMMX::evaluateFilterStereo(short dest, const short src, const uint numSamples) const`
			`{`
			`// Create stack copies of the needed member variables for asm routines :`
			`uint local_length = length;`
			`uint local_lengthDiv8 = lengthDiv8;`
			`uint local_resultDivider = resultDivFactor;`
			`short local_filterCoeffs = (short)filterCoeffsAlign;`

			`if (local_length < 2) return 0;`

			`_asm`
			`{`
			`; Load (num_samples-aa_filter_length)/2 to edi as a i`
			`; Load a pointer to samples to esi`
			`; Load a pointer to destination to edx`

			`mov edi, numSamples`
			`mov esi, dword ptr src`
			`sub edi, local_length`
			`mov edx, dword ptr dest`
			`sar edi, 1`

			`; Load filter length/8 to ecx`
			`; Load pointer to samples from esi to ebx`
			`; Load counter from edi to ecx`
			`; Load [ebx] to mm3`
			`; Load pointer to filter coefficients to eax`
			`loop1:`
			`mov ebx, esi`
			`pxor mm0, mm0`

			`mov ecx, local_lengthDiv8`
			`pxor mm7, mm7`

			`movq mm1, [ebx] ; mm1 = l1 r1 l0 r0`
			`mov eax, local_filterCoeffs`
			`loop2:`

			`movq mm2, [ebx+8] ; mm2 = l3 r3 l2 r2`
			`movq mm4, mm1 ; mm4 = l1 r1 l0 r0`

			`movq mm3, [ebx+16] ; mm3 = l5 r5 l4 r4`
			`punpckhwd mm1, mm2 ; mm1 = l3 l1 r3 r1`

			`movq mm6, mm2 ; mm6 = l3 r3 l2 r2`
			`punpcklwd mm4, mm2 ; mm4 = l2 l0 r2 r0`

			`movq mm2, qword ptr [eax] ; mm2 = f2 f0 f2 f0`
			`movq mm5, mm1 ; mm5 = l3 l1 r3 r1`

			`punpcklwd mm6, mm3 ; mm6 = l4 l2 r4 r2`
			`pmaddwd mm4, mm2 ; mm4 = l2f2+l0f0 r2f2+r0f0`

			`pmaddwd mm5, mm2 ; mm5 = l3f2+l1f0 r3f2+l1f0`
			`movq mm2, qword ptr [eax+8] ; mm2 = f3 f1 f3 f1`

			`paddd mm0, mm4 ; mm0 += s02*f02`
			`movq mm4, mm3 ; mm4 = l1 r1 l0 r0`

			`pmaddwd mm1, mm2 ; mm1 = l3f3+l1f1 r3f3+l1f1`
			`paddd mm7, mm5 ; mm7 += s13*f02`

			`pmaddwd mm6, mm2 ; mm6 = l4f3+l2f1 r4f3+f4f1`
			`movq mm2, [ebx+24] ; mm2 = l3 r3 l2 r2`

			`paddd mm0, mm1 ; mm0 += s31*f31`
			`movq mm1, [ebx+32] ; mm1 = l5 r5 l4 r4`

			`paddd mm7, mm6 ; mm7 += s42*f31`
			`punpckhwd mm3, mm2 ; mm3 = l3 l1 r3 r1`

			`movq mm6, mm2 ; mm6 = l3 r3 l2 r2`
			`punpcklwd mm4, mm2 ; mm4 = l2 l0 r2 r0`

			`movq mm2, qword ptr [eax+16] ; mm2 = f2 f0 f2 f0`
			`movq mm5, mm3 ; mm5 = l3 l1 r3 r1`

			`punpcklwd mm6, mm1 ; mm6 = l4 l2 r4 r2`
			`add eax, 32`

			`pmaddwd mm4, mm2 ; mm4 = l2f2+l0f0 r2f2+r0f0`
			`add ebx, 32`

			`pmaddwd mm5, mm2 ; mm5 = l3f2+l1f0 r3f2+l1f0`
			`movq mm2, qword ptr [eax-8] ; mm2 = f3 f1 f3 f1`

			`paddd mm0, mm4 ; mm0 += s02*f02`
			`pmaddwd mm3, mm2 ; mm3 = l3f3+l1f1 r3f3+l1f1`

			`paddd mm7, mm5 ; mm7 += s13*f02`
			`pmaddwd mm6, mm2 ; mm6 = l4f3+l2f1 r4f3+f4f1`

			`paddd mm0, mm3 ; mm0 += s31*f31`
			`paddd mm7, mm6 ; mm7 += s42*f31`

			`dec ecx`
			`jnz loop2`

			`; Divide mm0 and mm7 by 8192 (= right-shift by 13),`
			`; pack and store to [edx]`
			`movd mm4, local_resultDivider;`

			`psrad mm0, mm4 ; divider the result`

			`add edx, 8`
			`psrad mm7, mm4 ; divider the result`

			`add esi, 8`
			`packssdw mm0, mm7`

			`movq qword ptr [edx-8], mm0`
			`dec edi`

			`jnz loop1`

			`emms`
			`}`
			`return (numSamples & 0xfffffffe) - local_length;`
			`}`

			`#endif // ALLOW_MMX`