fa7c141813
git-svn-id: svn://localhost/trunk/ardour2@13 d708f5d6-7413-0410-9779-e7cbd77b26cf
488 lines
16 KiB
C++
488 lines
16 KiB
C++
////////////////////////////////////////////////////////////////////////////////
|
|
///
|
|
/// Win32 version of the MMX optimized routines. All MMX optimized functions
|
|
/// have been gathered into this single source code file, regardless to their
|
|
/// class or original source code file, in order to ease porting the library
|
|
/// to other compiler and processor platforms.
|
|
///
|
|
/// This file is to be compiled in Windows platform with Microsoft Visual C++
|
|
/// Compiler. Please see 'mmx_gcc.cpp' for the gcc compiler version for all
|
|
/// GNU platforms.
|
|
///
|
|
/// Author : Copyright (c) Olli Parviainen
|
|
/// Author e-mail : oparviai @ iki.fi
|
|
/// SoundTouch WWW: http://www.iki.fi/oparviai/soundtouch
|
|
///
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
//
|
|
// Last changed : $Date$
|
|
// File revision : $Revision$
|
|
//
|
|
// $Id$
|
|
//
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
//
|
|
// License :
|
|
//
|
|
// SoundTouch audio processing library
|
|
// Copyright (c) Olli Parviainen
|
|
//
|
|
// This library is free software; you can redistribute it and/or
|
|
// modify it under the terms of the GNU Lesser General Public
|
|
// License as published by the Free Software Foundation; either
|
|
// version 2.1 of the License, or (at your option) any later version.
|
|
//
|
|
// This library is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
// Lesser General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU Lesser General Public
|
|
// License along with this library; if not, write to the Free Software
|
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
//
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
#include "STTypes.h"
|
|
|
|
#ifndef WIN32
|
|
#error "wrong platform - this source code file is exclusively for Win32 platform"
|
|
#endif
|
|
|
|
using namespace soundtouch;
|
|
|
|
#ifdef ALLOW_MMX
|
|
// MMX routines available only with integer sample type
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
//
|
|
// implementation of MMX optimized functions of class 'TDStretchMMX'
|
|
//
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
#include "TDStretch.h"
|
|
#include <limits.h>
|
|
|
|
// these are declared in 'TDStretch.cpp'
|
|
extern int scanOffsets[4][24];
|
|
|
|
// Calculates cross correlation of two buffers
|
|
long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const
|
|
{
|
|
long corr;
|
|
uint local_overlapLength = overlapLength;
|
|
uint local_overlapDividerBits = overlapDividerBits;
|
|
|
|
_asm
|
|
{
|
|
; Calculate cross-correlation between the tempOffset and tmpbid_buffer.
|
|
;
|
|
; Process 4 parallel batches of 2 * stereo samples each during one
|
|
; round to improve CPU-level parallellization.
|
|
;
|
|
; load address of sloped pV2 buffer to eax
|
|
; load address of mixing point of the sample data buffer to ebx
|
|
; load counter to ecx = overlapLength / 8 - 1
|
|
; empty the mm0
|
|
;
|
|
; prepare to the first round by loading
|
|
; load mm1 = eax[0]
|
|
; load mm2 = eax[1];
|
|
|
|
mov eax, dword ptr pV1
|
|
mov ebx, dword ptr pV2
|
|
|
|
movq mm1, qword ptr [eax]
|
|
mov ecx, local_overlapLength
|
|
|
|
movq mm2, qword ptr [eax+8]
|
|
shr ecx, 3
|
|
|
|
pxor mm0, mm0
|
|
sub ecx, 1
|
|
|
|
movd mm5, local_overlapDividerBits
|
|
|
|
loop1:
|
|
; multiply-add mm1 = mm1 * ebx[0]
|
|
; multiply-add mm2 = mm2 * ebx[1]
|
|
;
|
|
; add mm2 += mm1
|
|
; mm2 >>= mm5 (=overlapDividerBits)
|
|
; add mm0 += mm2
|
|
;
|
|
; load mm3 = eax[2]
|
|
; multiply-add mm3 = mm3 * ebx[2]
|
|
;
|
|
; load mm4 = eax[3]
|
|
; multiply-add mm4 = mm4 * ebx[3]
|
|
;
|
|
; add mm3 += mm4
|
|
; mm3 >>= mm5 (=overlapDividerBits)
|
|
; add mm0 += mm3
|
|
;
|
|
; add eax += 4;
|
|
; add ebx += 4
|
|
; load mm1 = eax[0] (~eax[4])
|
|
; load mm2 = eax[1] (~eax[5])
|
|
;
|
|
; loop
|
|
|
|
pmaddwd mm1, qword ptr [ebx]
|
|
movq mm3, qword ptr [eax+16]
|
|
|
|
pmaddwd mm2, qword ptr [ebx+8]
|
|
movq mm4, qword ptr [eax+24]
|
|
|
|
pmaddwd mm3, qword ptr [ebx+16]
|
|
paddd mm2, mm1
|
|
|
|
pmaddwd mm4, qword ptr [ebx+24]
|
|
movq mm1, qword ptr [eax+32]
|
|
|
|
psrad mm2, mm5
|
|
add eax, 32
|
|
|
|
paddd mm3, mm4
|
|
paddd mm0, mm2
|
|
|
|
movq mm2, qword ptr [eax+8]
|
|
psrad mm3, mm5
|
|
|
|
add ebx, 32
|
|
paddd mm0, mm3
|
|
|
|
dec ecx
|
|
jnz loop1
|
|
|
|
; Finalize the last partial loop:
|
|
|
|
movq mm3, qword ptr [eax+16]
|
|
pmaddwd mm1, qword ptr [ebx]
|
|
|
|
movq mm4, qword ptr [eax+24]
|
|
pmaddwd mm2, qword ptr [ebx+8]
|
|
|
|
pmaddwd mm3, qword ptr [ebx+16]
|
|
paddd mm2, mm1
|
|
|
|
pmaddwd mm4, qword ptr [ebx+24]
|
|
psrad mm2, mm5
|
|
|
|
paddd mm3, mm4
|
|
paddd mm0, mm2
|
|
|
|
psrad mm3, mm5
|
|
paddd mm0, mm3
|
|
|
|
; copy hi-dword of mm0 to lo-dword of mm1, then sum mmo+mm1
|
|
; and finally store the result into the variable "corr"
|
|
|
|
movq mm1, mm0
|
|
psrlq mm1, 32
|
|
paddd mm0, mm1
|
|
movd corr, mm0
|
|
}
|
|
return corr;
|
|
|
|
// Note: Warning about the missing EMMS instruction is harmless
|
|
// as it'll be called elsewhere.
|
|
}
|
|
|
|
|
|
|
|
void TDStretchMMX::clearCrossCorrState()
|
|
{
|
|
_asm EMMS;
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// MMX-optimized version of the function overlapStereo
|
|
void TDStretchMMX::overlapStereo(short *output, const short *input) const
|
|
{
|
|
short *local_midBuffer = pMidBuffer;
|
|
uint local_overlapLength = overlapLength;
|
|
uint local_overlapDividerBits = overlapDividerBits;
|
|
|
|
_asm
|
|
{
|
|
; load sliding mixing value counter to mm6 and mm7
|
|
; load counter value to ecx = overlapLength / 4
|
|
; load divider-shifter value to esi
|
|
; load mixing value adder to mm5
|
|
; load address of midBuffer to eax
|
|
; load address of inputBuffer added with ovlOffset to ebx
|
|
; load address of end of the outputBuffer to edx
|
|
|
|
mov eax, local_overlapLength ; ecx = 0x0000 OVL_
|
|
mov edi, 0x0002fffe ; ecx = 0x0002 fffe
|
|
|
|
mov esi, local_overlapDividerBits
|
|
movd mm6, eax ; mm6 = 0x0000 0000 0000 OVL_
|
|
|
|
mov ecx, eax;
|
|
sub eax, 1
|
|
|
|
punpckldq mm6, mm6 ; mm6 = 0x0000 OVL_ 0000 OVL_
|
|
mov edx, output
|
|
|
|
or eax, 0x00010000 ; eax = 0x0001 overlapLength-1
|
|
mov ebx, dword ptr input
|
|
|
|
movd mm5, edi ; mm5 = 0x0000 0000 0002 fffe
|
|
movd mm7, eax ; mm7 = 0x0000 0000 0001 01ff
|
|
|
|
mov eax, dword ptr local_midBuffer
|
|
punpckldq mm5, mm5 ; mm5 = 0x0002 fffe 0002 fffe
|
|
|
|
shr ecx, 2 ; ecx = overlapLength / 2
|
|
punpckldq mm7, mm7 ; mm7 = 0x0001 01ff 0001 01ff
|
|
|
|
loop1:
|
|
; Process two parallel batches of 2+2 stereo samples during each round
|
|
; to improve CPU-level parallellization.
|
|
;
|
|
; Load [eax] into mm0 and mm1
|
|
; Load [ebx] into mm3
|
|
; unpack words of mm0, mm1 and mm3 into mm0 and mm1
|
|
; multiply-add mm0*mm6 and mm1*mm7, store results into mm0 and mm1
|
|
; divide mm0 and mm1 by 512 (=right-shift by overlapDividerBits)
|
|
; pack the result into mm0 and store into [edx]
|
|
;
|
|
; Load [eax+8] into mm2 and mm3
|
|
; Load [ebx+8] into mm4
|
|
; unpack words of mm2, mm3 and mm4 into mm2 and mm3
|
|
; multiply-add mm2*mm6 and mm3*mm7, store results into mm2 and mm3
|
|
; divide mm2 and mm3 by 512 (=right-shift by overlapDividerBits)
|
|
; pack the result into mm2 and store into [edx+8]
|
|
|
|
|
|
movq mm0, qword ptr [eax] ; mm0 = m1l m1r m0l m0r
|
|
add edx, 16
|
|
|
|
movq mm3, qword ptr [ebx] ; mm3 = i1l i1r i0l i0r
|
|
movq mm1, mm0 ; mm1 = m1l m1r m0l m0r
|
|
|
|
movq mm2, qword ptr [eax+8] ; mm2 = m3l m3r m2l m2r
|
|
punpcklwd mm0, mm3 ; mm0 = i0l m0l i0r m0r
|
|
|
|
movq mm4, qword ptr [ebx+8] ; mm4 = i3l i3r i2l i2r
|
|
punpckhwd mm1, mm3 ; mm1 = i1l m1l i1r m1r
|
|
|
|
movq mm3, mm2 ; mm3 = m3l m3r m2l m2r
|
|
punpcklwd mm2, mm4 ; mm2 = i2l m2l i2r m2r
|
|
|
|
pmaddwd mm0, mm6 ; mm0 = i0l*m63+m0l*m62 i0r*m61+m0r*m60
|
|
punpckhwd mm3, mm4 ; mm3 = i3l m3l i3r m3r
|
|
|
|
movd mm4, esi ; mm4 = overlapDividerBits
|
|
|
|
pmaddwd mm1, mm7 ; mm1 = i1l*m73+m1l*m72 i1r*m71+m1r*m70
|
|
paddw mm6, mm5
|
|
|
|
paddw mm7, mm5
|
|
psrad mm0, mm4 ; mmo >>= overlapDividerBits
|
|
|
|
pmaddwd mm2, mm6 ; mm2 = i2l*m63+m2l*m62 i2r*m61+m2r*m60
|
|
psrad mm1, mm4 ; mm1 >>= overlapDividerBits
|
|
|
|
pmaddwd mm3, mm7 ; mm3 = i3l*m73+m3l*m72 i3r*m71+m3r*m70
|
|
psrad mm2, mm4 ; mm2 >>= overlapDividerBits
|
|
|
|
packssdw mm0, mm1 ; mm0 = mm1h mm1l mm0h mm0l
|
|
psrad mm3, mm4 ; mm3 >>= overlapDividerBits
|
|
|
|
add eax, 16
|
|
paddw mm6, mm5
|
|
|
|
packssdw mm2, mm3 ; mm2 = mm2h mm2l mm3h mm3l
|
|
paddw mm7, mm5
|
|
|
|
movq qword ptr [edx-16], mm0
|
|
add ebx, 16
|
|
|
|
movq qword ptr [edx-8], mm2
|
|
dec ecx
|
|
|
|
jnz loop1
|
|
|
|
emms
|
|
}
|
|
}
|
|
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
//
|
|
// implementation of MMX optimized functions of class 'FIRFilter'
|
|
//
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
#include "FIRFilter.h"
|
|
|
|
|
|
FIRFilterMMX::FIRFilterMMX() : FIRFilter()
|
|
{
|
|
filterCoeffsUnalign = NULL;
|
|
}
|
|
|
|
|
|
FIRFilterMMX::~FIRFilterMMX()
|
|
{
|
|
delete[] filterCoeffsUnalign;
|
|
}
|
|
|
|
|
|
// (overloaded) Calculates filter coefficients for MMX routine
|
|
void FIRFilterMMX::setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor)
|
|
{
|
|
uint i;
|
|
FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);
|
|
|
|
// Ensure that filter coeffs array is aligned to 16-byte boundary
|
|
delete[] filterCoeffsUnalign;
|
|
filterCoeffsUnalign = new short[2 * newLength + 8];
|
|
filterCoeffsAlign = (short *)(((uint)filterCoeffsUnalign + 15) & -16);
|
|
|
|
// rearrange the filter coefficients for mmx routines
|
|
for (i = 0;i < length; i += 4)
|
|
{
|
|
filterCoeffsAlign[2 * i + 0] = coeffs[i + 0];
|
|
filterCoeffsAlign[2 * i + 1] = coeffs[i + 2];
|
|
filterCoeffsAlign[2 * i + 2] = coeffs[i + 0];
|
|
filterCoeffsAlign[2 * i + 3] = coeffs[i + 2];
|
|
|
|
filterCoeffsAlign[2 * i + 4] = coeffs[i + 1];
|
|
filterCoeffsAlign[2 * i + 5] = coeffs[i + 3];
|
|
filterCoeffsAlign[2 * i + 6] = coeffs[i + 1];
|
|
filterCoeffsAlign[2 * i + 7] = coeffs[i + 3];
|
|
}
|
|
}
|
|
|
|
|
|
|
|
// mmx-optimized version of the filter routine for stereo sound
|
|
uint FIRFilterMMX::evaluateFilterStereo(short *dest, const short *src, const uint numSamples) const
|
|
{
|
|
// Create stack copies of the needed member variables for asm routines :
|
|
uint local_length = length;
|
|
uint local_lengthDiv8 = lengthDiv8;
|
|
uint local_resultDivider = resultDivFactor;
|
|
short *local_filterCoeffs = (short*)filterCoeffsAlign;
|
|
|
|
if (local_length < 2) return 0;
|
|
|
|
_asm
|
|
{
|
|
; Load (num_samples-aa_filter_length)/2 to edi as a i
|
|
; Load a pointer to samples to esi
|
|
; Load a pointer to destination to edx
|
|
|
|
mov edi, numSamples
|
|
mov esi, dword ptr src
|
|
sub edi, local_length
|
|
mov edx, dword ptr dest
|
|
sar edi, 1
|
|
|
|
; Load filter length/8 to ecx
|
|
; Load pointer to samples from esi to ebx
|
|
; Load counter from edi to ecx
|
|
; Load [ebx] to mm3
|
|
; Load pointer to filter coefficients to eax
|
|
loop1:
|
|
mov ebx, esi
|
|
pxor mm0, mm0
|
|
|
|
mov ecx, local_lengthDiv8
|
|
pxor mm7, mm7
|
|
|
|
movq mm1, [ebx] ; mm1 = l1 r1 l0 r0
|
|
mov eax, local_filterCoeffs
|
|
loop2:
|
|
|
|
movq mm2, [ebx+8] ; mm2 = l3 r3 l2 r2
|
|
movq mm4, mm1 ; mm4 = l1 r1 l0 r0
|
|
|
|
movq mm3, [ebx+16] ; mm3 = l5 r5 l4 r4
|
|
punpckhwd mm1, mm2 ; mm1 = l3 l1 r3 r1
|
|
|
|
movq mm6, mm2 ; mm6 = l3 r3 l2 r2
|
|
punpcklwd mm4, mm2 ; mm4 = l2 l0 r2 r0
|
|
|
|
movq mm2, qword ptr [eax] ; mm2 = f2 f0 f2 f0
|
|
movq mm5, mm1 ; mm5 = l3 l1 r3 r1
|
|
|
|
punpcklwd mm6, mm3 ; mm6 = l4 l2 r4 r2
|
|
pmaddwd mm4, mm2 ; mm4 = l2*f2+l0*f0 r2*f2+r0*f0
|
|
|
|
pmaddwd mm5, mm2 ; mm5 = l3*f2+l1*f0 r3*f2+l1*f0
|
|
movq mm2, qword ptr [eax+8] ; mm2 = f3 f1 f3 f1
|
|
|
|
paddd mm0, mm4 ; mm0 += s02*f02
|
|
movq mm4, mm3 ; mm4 = l1 r1 l0 r0
|
|
|
|
pmaddwd mm1, mm2 ; mm1 = l3*f3+l1*f1 r3*f3+l1*f1
|
|
paddd mm7, mm5 ; mm7 += s13*f02
|
|
|
|
pmaddwd mm6, mm2 ; mm6 = l4*f3+l2*f1 r4*f3+f4*f1
|
|
movq mm2, [ebx+24] ; mm2 = l3 r3 l2 r2
|
|
|
|
paddd mm0, mm1 ; mm0 += s31*f31
|
|
movq mm1, [ebx+32] ; mm1 = l5 r5 l4 r4
|
|
|
|
paddd mm7, mm6 ; mm7 += s42*f31
|
|
punpckhwd mm3, mm2 ; mm3 = l3 l1 r3 r1
|
|
|
|
movq mm6, mm2 ; mm6 = l3 r3 l2 r2
|
|
punpcklwd mm4, mm2 ; mm4 = l2 l0 r2 r0
|
|
|
|
movq mm2, qword ptr [eax+16] ; mm2 = f2 f0 f2 f0
|
|
movq mm5, mm3 ; mm5 = l3 l1 r3 r1
|
|
|
|
punpcklwd mm6, mm1 ; mm6 = l4 l2 r4 r2
|
|
add eax, 32
|
|
|
|
pmaddwd mm4, mm2 ; mm4 = l2*f2+l0*f0 r2*f2+r0*f0
|
|
add ebx, 32
|
|
|
|
pmaddwd mm5, mm2 ; mm5 = l3*f2+l1*f0 r3*f2+l1*f0
|
|
movq mm2, qword ptr [eax-8] ; mm2 = f3 f1 f3 f1
|
|
|
|
paddd mm0, mm4 ; mm0 += s02*f02
|
|
pmaddwd mm3, mm2 ; mm3 = l3*f3+l1*f1 r3*f3+l1*f1
|
|
|
|
paddd mm7, mm5 ; mm7 += s13*f02
|
|
pmaddwd mm6, mm2 ; mm6 = l4*f3+l2*f1 r4*f3+f4*f1
|
|
|
|
paddd mm0, mm3 ; mm0 += s31*f31
|
|
paddd mm7, mm6 ; mm7 += s42*f31
|
|
|
|
dec ecx
|
|
jnz loop2
|
|
|
|
; Divide mm0 and mm7 by 8192 (= right-shift by 13),
|
|
; pack and store to [edx]
|
|
movd mm4, local_resultDivider;
|
|
|
|
psrad mm0, mm4 ; divider the result
|
|
|
|
add edx, 8
|
|
psrad mm7, mm4 ; divider the result
|
|
|
|
add esi, 8
|
|
packssdw mm0, mm7
|
|
|
|
movq qword ptr [edx-8], mm0
|
|
dec edi
|
|
|
|
jnz loop1
|
|
|
|
emms
|
|
}
|
|
return (numSamples & 0xfffffffe) - local_length;
|
|
}
|
|
|
|
#endif // ALLOW_MMX
|