[Summary] Added SSE sound processing functions support for Windows. Version 1.

Conflicts:
	wscript
This commit is contained in:
Greg Zharun 2015-04-08 16:29:33 +03:00 committed by Paul Davis
parent 70338bfbd6
commit 8af992c449
7 changed files with 765 additions and 77 deletions

View File

@ -32,7 +32,7 @@ using namespace ARDOUR;
// Debug wrappers
float
debug_compute_peak (ARDOUR::Sample *buf, pframes_t nsamples, float current)
debug_compute_peak (const ARDOUR::Sample *buf, pframes_t nsamples, float current)
{
if ( ((intptr_t)buf % 16) != 0) {
std::cerr << "compute_peak(): buffer unaligned!" << std::endl;
@ -52,7 +52,7 @@ debug_apply_gain_to_buffer (ARDOUR::Sample *buf, pframes_t nframes, float gain)
}
void
debug_mix_buffers_with_gain (ARDOUR::Sample *dst, ARDOUR::Sample *src, pframes_t nframes, float gain)
debug_mix_buffers_with_gain (ARDOUR::Sample *dst, const ARDOUR::Sample *src, pframes_t nframes, float gain)
{
if ( ((intptr_t)dst & 15) != 0) {
std::cerr << "mix_buffers_with_gain(): dst unaligned!" << std::endl;
@ -67,7 +67,7 @@ debug_mix_buffers_with_gain (ARDOUR::Sample *dst, ARDOUR::Sample *src, pframes_t
}
void
debug_mix_buffers_no_gain (ARDOUR::Sample *dst, ARDOUR::Sample *src, pframes_t nframes)
debug_mix_buffers_no_gain (ARDOUR::Sample *dst, const ARDOUR::Sample *src, pframes_t nframes)
{
if ( ((intptr_t)dst & 15) != 0) {
std::cerr << "mix_buffers_no_gain(): dst unaligned!" << std::endl;

View File

@ -0,0 +1,679 @@
/*
Copyright (C) 2005-2006 Paul Davis, John Rigg
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
Author: Sampo Savolainen
64-bit conversion: John Rigg
$Id$
*/
#; Microsoft version of SSE sample processing functions
#; void x86_sse_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain);
#;
#; Mixes src into dst with a gain applied:
#;     dst[i] += src[i] * gain      for every i in [0, nframes)
#;
#; Arguments, Microsoft x64 calling convention:
#;     %rcx    float *dst
#;     %rdx    float *src
#;     %r8d    unsigned int nframes (only the low 32 bits are meaningful)
#;     %xmm3   float gain
#;
#; They are moved into the registers used by the System V AMD64 (Linux)
#; version of this routine so the algorithm below stays identical:
#;     %rdi    float *dst
#;     %rsi    float *src
#;     %rdx    unsigned int nframes
#;     %xmm0   float gain
#;
#; Strategy: only bits 2-3 of each pointer are examined (mask 12), i.e. the
#; code assumes both pointers are at least 4-byte aligned. If dst and src
#; have the same offset inside a 16-byte line, 1-3 frames are processed one
#; at a time until both are 16-byte aligned, then 4 frames per iteration
#; with movaps, then a scalar tail. If the offsets differ, everything is
#; processed one frame at a time.
#;
#; Clobbers %rax, %xmm0, %xmm1 and the flags; %rbx, %rcx, %rdx, %rdi and
#; %rsi are saved on entry and restored before returning.
	.globl x86_sse_mix_buffers_with_gain
	.def x86_sse_mix_buffers_with_gain; .scl 2; .type 32;
	.endef
x86_sse_mix_buffers_with_gain:
	pushq %rbp
	movq %rsp, %rbp
	#; save the registers
	pushq %rbx #; must be preserved
	pushq %rcx
	pushq %rdx
	pushq %rdi #; must be preserved
	pushq %rsi #; must be preserved
	#; to keep algorithms universal - move input params into Linux specific registers
	movq %rcx, %rdi
	movq %rdx, %rsi
	#; nframes is a 32 bit value and the upper half of %r8 is undefined on
	#; entry, so zero-extend it instead of copying all 64 bits
	movl %r8d, %edx
	movss %xmm3, %xmm0
	#; if nframes == 0, go to end
	cmp $0, %rdx
	je .MBWG_END
	#; Check for alignment
	movq %rdi, %rax
	andq $12, %rax #; mask alignment offset
	movq %rsi, %rbx
	andq $12, %rbx #; mask alignment offset
	cmp %rax, %rbx
	jne .MBWG_NONALIGN #; offsets differ: process everything one frame at a time
	#; both buffers share the same offset; if it is zero they are already aligned
	cmp $0, %rbx
	jz .MBWG_SSE
	#; Pre-loop, we need to run 1-3 frames "manually" without
	#; SSE packed instructions
.MBWG_PRELOOP:
	#; gain is already in %xmm0
	movss (%rsi), %xmm1
	mulss %xmm0, %xmm1
	addss (%rdi), %xmm1
	movss %xmm1, (%rdi)
	addq $4, %rdi #; dst++
	addq $4, %rsi #; src++
	decq %rdx #; nframes--
	jz .MBWG_END
	addq $4, %rbx
	cmp $16, %rbx #; test if we've reached 16 byte alignment
	jne .MBWG_PRELOOP
.MBWG_SSE:
	cmp $4, %rdx #; we know it's not zero, but if it's not >=4, then
	jnge .MBWG_NONALIGN #; we jump straight to the "normal" code
	#; gain is already in %xmm0; broadcast it to all four lanes
	shufps $0x00, %xmm0, %xmm0
.MBWG_SSELOOP:
	movaps (%rsi), %xmm1 #; source => xmm1
	mulps %xmm0, %xmm1 #; apply gain to source
	addps (%rdi), %xmm1 #; mix with destination
	movaps %xmm1, (%rdi) #; copy result to destination
	addq $16, %rdi #; dst+=4
	addq $16, %rsi #; src+=4
	subq $4, %rdx #; nframes-=4
	cmp $4, %rdx
	jge .MBWG_SSELOOP
	cmp $0, %rdx
	je .MBWG_END
	#; if there are remaining frames, the nonalign code will do nicely
	#; for the rest 1-3 frames.
.MBWG_NONALIGN:
	#; not aligned!
	#; gain is already in (the low lane of) %xmm0
.MBWG_NONALIGNLOOP:
	movss (%rsi), %xmm1
	mulss %xmm0, %xmm1
	addss (%rdi), %xmm1
	movss %xmm1, (%rdi)
	addq $4, %rdi
	addq $4, %rsi
	decq %rdx
	jnz .MBWG_NONALIGNLOOP
.MBWG_END:
	popq %rsi
	popq %rdi
	popq %rdx
	popq %rcx
	popq %rbx
	#; return
	leave
	ret
#; void x86_sse_mix_buffers_no_gain (float *dst, float *src, unsigned int nframes);
#;
#; Mixes src into dst without gain:
#;     dst[i] += src[i]      for every i in [0, nframes)
#;
#; Arguments, Microsoft x64 calling convention:
#;     %rcx    float *dst
#;     %rdx    float *src
#;     %r8d    unsigned int nframes (only the low 32 bits are meaningful)
#;
#; They are moved into the registers used by the System V AMD64 (Linux)
#; version of this routine so the algorithm below stays identical:
#;     %rdi    float *dst
#;     %rsi    float *src
#;     %rdx    unsigned int nframes
#;
#; Strategy: only bits 2-3 of each pointer are examined (mask 12), i.e. the
#; code assumes both pointers are at least 4-byte aligned. If both share
#; the same offset inside a 16-byte line, a scalar pre-loop runs until they
#; are 16-byte aligned, then 4 frames per iteration are mixed with movaps,
#; then a scalar tail. Otherwise all frames are mixed one at a time.
#;
#; Clobbers %rax, %xmm0 and the flags; %rbx, %rcx, %rdx, %rdi and %rsi are
#; saved on entry and restored before returning.
	.globl x86_sse_mix_buffers_no_gain
	.def x86_sse_mix_buffers_no_gain; .scl 2; .type 32;
	.endef
x86_sse_mix_buffers_no_gain:
	pushq %rbp
	movq %rsp, %rbp
	#; save the registers
	pushq %rbx #; must be preserved
	pushq %rcx
	pushq %rdx
	pushq %rdi #; must be preserved
	pushq %rsi #; must be preserved
	#; to keep algorithms universal - move input params into Linux specific registers
	movq %rcx, %rdi
	movq %rdx, %rsi
	#; nframes is a 32 bit value and the upper half of %r8 is undefined on
	#; entry, so zero-extend it instead of copying all 64 bits
	movl %r8d, %edx
	#; the real function
	#; if nframes == 0, go to end
	cmp $0, %rdx
	je .MBNG_END
	#; Check for alignment
	movq %rdi, %rax
	andq $12, %rax #; mask alignment offset
	movq %rsi, %rbx
	andq $12, %rbx #; mask alignment offset
	cmp %rax, %rbx
	jne .MBNG_NONALIGN #; offsets differ: process everything one frame at a time
	cmp $0, %rbx
	je .MBNG_SSE #; offset is zero: both buffers are already 16 byte aligned
	#; Pre-loop, we need to run 1-3 frames "manually" without
	#; SSE packed instructions
.MBNG_PRELOOP:
	movss (%rsi), %xmm0
	addss (%rdi), %xmm0
	movss %xmm0, (%rdi)
	addq $4, %rdi #; dst++
	addq $4, %rsi #; src++
	decq %rdx #; nframes--
	jz .MBNG_END
	addq $4, %rbx
	cmp $16, %rbx #; test if we've reached 16 byte alignment
	jne .MBNG_PRELOOP
.MBNG_SSE:
	cmp $4, %rdx #; if there are frames left, but less than 4
	jnge .MBNG_NONALIGN #; we can't run SSE
.MBNG_SSELOOP:
	movaps (%rsi), %xmm0 #; source => xmm0
	addps (%rdi), %xmm0 #; mix with destination
	movaps %xmm0, (%rdi) #; copy result to destination
	addq $16, %rdi #; dst+=4
	addq $16, %rsi #; src+=4
	subq $4, %rdx #; nframes-=4
	cmp $4, %rdx
	jge .MBNG_SSELOOP
	cmp $0, %rdx
	je .MBNG_END
	#; if there are remaining frames, the nonalign code will do nicely
	#; for the rest 1-3 frames.
.MBNG_NONALIGN:
	#; not aligned! one frame per iteration
	movss (%rsi), %xmm0 #; src => xmm0
	addss (%rdi), %xmm0 #; xmm0 += dst
	movss %xmm0, (%rdi) #; xmm0 => dst
	addq $4, %rdi
	addq $4, %rsi
	decq %rdx
	jnz .MBNG_NONALIGN
.MBNG_END:
	popq %rsi
	popq %rdi
	popq %rdx
	popq %rcx
	popq %rbx
	#; return
	leave
	ret
#; void x86_sse_apply_gain_to_buffer (float *buf, unsigned int nframes, float gain);
#;
#; Scales a buffer in place:
#;     buf[i] *= gain      for every i in [0, nframes)
#;
#; Arguments, Microsoft x64 calling convention:
#;     %rcx    float *buf
#;     %edx    unsigned int nframes (only the low 32 bits are meaningful)
#;     %xmm2   float gain
#;
#; They are moved into the registers used by the System V AMD64 (Linux)
#; version of this routine so the algorithm below stays identical:
#;     %rdi    float *buf
#;     %rsi    unsigned int nframes
#;     %xmm0   float gain
#; %xmm1 is used as scratch for the value(s) currently being scaled.
#;
#; Strategy: only bits 2-3 of buf are examined (mask 12), i.e. the code
#; assumes buf is at least 4-byte aligned. A scalar pre-loop runs until buf
#; is 16-byte aligned, then 4 frames per iteration are scaled with movaps,
#; then a scalar post-loop handles the remaining 0-3 frames.
#;
#; Clobbers %rax, %rdx, %xmm0, %xmm1 and the flags; %rcx, %rdi and %rsi are
#; saved on entry and restored before returning.
	.globl x86_sse_apply_gain_to_buffer
	.def x86_sse_apply_gain_to_buffer; .scl 2; .type 32;
	.endef
x86_sse_apply_gain_to_buffer:
	pushq %rbp
	movq %rsp, %rbp
	#; save the registers
	pushq %rcx
	pushq %rdi #; must be preserved
	pushq %rsi #; must be preserved
	#; to keep algorithms universal - move input params into Linux specific registers
	movq %rcx, %rdi
	#; nframes is a 32 bit value and the upper half of %rdx is undefined on
	#; entry, so zero-extend it instead of copying all 64 bits
	movl %edx, %esi
	movss %xmm2, %xmm0
	#; the real function
	#; if nframes == 0, go to end
	movq %rsi, %rcx #; nframes
	cmp $0, %rcx
	je .AG_END
	#; set up the gain buffer (gain is already in %xmm0): broadcast to all 4 lanes
	shufps $0x00, %xmm0, %xmm0
	#; Check for alignment
	movq %rdi, %rdx #; buf => %rdx
	andq $12, %rdx #; mask bits 2 & 3, result = 0, 4, 8 or 12
	jz .AG_SSE #; if buffer IS aligned
	#; PRE-LOOP
	#; we iterate 1-3 times, scaling one float at a time,
	#; so we reach a 16 byte aligned "buf" (=%rdi) value
.AGLP_START:
	#; Load next value from the buffer into %xmm1
	movss (%rdi), %xmm1
	mulss %xmm0, %xmm1
	movss %xmm1, (%rdi)
	#; increment buffer, decrement counter
	addq $4, %rdi #; buf++;
	decq %rcx #; nframes--
	jz .AG_END #; if we run out of frames, we go to the end
	addq $4, %rdx #; one non-aligned frame less
	cmp $16, %rdx
	jne .AGLP_START #; if more non-aligned frames exist, we do a do-over
.AG_SSE:
	#; We have reached the 16 byte aligned "buf" ("rdi") value
	#; Figure out how many loops we should do
	movq %rcx, %rax #; copy remaining nframes to %rax for division
	shr $2,%rax #; unsigned divide by 4
	#; %rax = SSE iterations
	cmp $0, %rax
	je .AGPOST_START #; fewer than 4 (but at least 1) frames left
.AGLP_SSE:
	movaps (%rdi), %xmm1
	mulps %xmm0, %xmm1
	movaps %xmm1, (%rdi)
	addq $16, %rdi #; buf + 4
	subq $4, %rcx #; nframes-=4
	decq %rax
	jnz .AGLP_SSE
	#; Next we need to post-process all remaining frames
	#; the remaining frame count is in %rcx
	andq $3, %rcx #; nframes % 4
	jz .AG_END
.AGPOST_START:
	movss (%rdi), %xmm1
	mulss %xmm0, %xmm1
	movss %xmm1, (%rdi)
	#; increment buffer, decrement counter
	addq $4, %rdi #; buf++;
	decq %rcx #; nframes--
	jnz .AGPOST_START #; loop while frames remain
.AG_END:
	popq %rsi
	popq %rdi
	popq %rcx
	#; return
	leave
	ret
#; end proc
#; x86_sse_apply_gain_vector(float *buf, float *gain_vector, unsigned int nframes)
#;
#; Scales a buffer in place by a per-frame gain:
#;     buf[i] *= gain_vector[i]      for every i in [0, nframes)
#;
#; Arguments, Microsoft x64 calling convention:
#;     %rcx    float *buf
#;     %rdx    float *gain_vector
#;     %r8d    unsigned int nframes (only the low 32 bits are meaningful)
#;
#; They are moved into the registers used by the System V AMD64 (Linux)
#; version of this routine so the algorithm below stays identical:
#;     %rdi    float *buf
#;     %rsi    float *gain_vector
#;     %rdx    unsigned int nframes
#;
#; Strategy: only bits 2-3 of each pointer are examined (mask 12), i.e. the
#; code assumes both pointers are at least 4-byte aligned. If both share
#; the same offset inside a 16-byte line, a scalar loop runs until they are
#; 16-byte aligned, then 4 frames per iteration are processed with movaps,
#; then a scalar loop handles the remaining 0-3 frames. If the offsets
#; differ, all frames are processed one at a time.
#;
#; Clobbers %rax, %xmm0, %xmm1 and the flags; %rbx, %rcx, %rdx, %rdi and
#; %rsi are saved on entry and restored before returning.
	.globl x86_sse_apply_gain_vector
	.def x86_sse_apply_gain_vector; .scl 2; .type 32;
	.endef
x86_sse_apply_gain_vector:
	pushq %rbp
	movq %rsp, %rbp
	#; save the registers
	pushq %rbx #; must be preserved
	pushq %rcx
	pushq %rdx
	pushq %rdi #; must be preserved
	pushq %rsi #; must be preserved
	#; to keep algorithms universal - move input params into Linux specific registers
	movq %rcx, %rdi
	movq %rdx, %rsi
	#; nframes is a 32 bit value and the upper half of %r8 is undefined on
	#; entry, so zero-extend it instead of copying all 64 bits
	movl %r8d, %edx
	#; if nframes == 0 go to end
	cmp $0, %rdx
	je .AGA_END
	#; Check alignment
	movq %rdi, %rax
	andq $12, %rax
	movq %rsi, %rbx
	andq $12, %rbx
	cmp %rax,%rbx
	jne .AGA_ENDLOOP #; offsets differ: process everything one frame at a time
	cmp $0, %rax
	jz .AGA_SSE #; if buffers are aligned, jump to the SSE loop
	#; Buffers aren't 16 byte aligned, but they are unaligned by the same amount
.AGA_ALIGNLOOP:
	movss (%rdi), %xmm0 #; buf => xmm0
	movss (%rsi), %xmm1 #; gain value => xmm1
	mulss %xmm1, %xmm0 #; xmm1 * xmm0 => xmm0
	movss %xmm0, (%rdi) #; signal with gain => buf
	decq %rdx
	jz .AGA_END
	addq $4, %rdi #; buf++
	addq $4, %rsi #; gain_vector++
	addq $4, %rax
	cmp $16, %rax
	jne .AGA_ALIGNLOOP
	#; There are frames left for sure, as that is checked in the beginning
	#; and within the previous loop. BUT, there might be less than 4 frames
	#; to process
.AGA_SSE:
	movq %rdx, %rax #; nframes => %rax
	shr $2, %rax #; unsigned divide by 4
	cmp $0, %rax
	je .AGA_ENDLOOP
.AGA_SSELOOP:
	movaps (%rdi), %xmm0
	movaps (%rsi), %xmm1
	mulps %xmm1, %xmm0
	movaps %xmm0, (%rdi)
	addq $16, %rdi
	addq $16, %rsi
	decq %rax
	jnz .AGA_SSELOOP
	#; %rdx was not decremented inside the SSE loop, so its low two bits
	#; still hold the number of frames that did not fit a group of four
	andq $3, %rdx #; Remaining frames are nframes & 3
	jz .AGA_END
	#; Inside this loop, we know there are frames left to process
	#; but because either there are < 4 frames left, or the buffers
	#; are not aligned, we can't use the parallel SSE ops
.AGA_ENDLOOP:
	movss (%rdi), %xmm0 #; buf => xmm0
	movss (%rsi), %xmm1 #; gain value => xmm1
	mulss %xmm1, %xmm0 #; xmm1 * xmm0 => xmm0
	movss %xmm0, (%rdi) #; signal with gain => buf
	addq $4,%rdi
	addq $4,%rsi
	decq %rdx #; nframes--
	jnz .AGA_ENDLOOP
.AGA_END:
	popq %rsi
	popq %rdi
	popq %rdx
	popq %rcx
	popq %rbx
	leave
	ret
#; end proc
#; float x86_sse_compute_peak(float *buf, unsigned int nframes, float current);
#;
#; Returns max (current, |buf[0]|, ..., |buf[nframes-1]|) in %xmm0.
#; When nframes == 0 the value of "current" is returned unchanged.
#;
#; Arguments, Microsoft x64 calling convention:
#;     %rcx    float *buf
#;     %edx    unsigned int nframes (only the low 32 bits are meaningful)
#;     %xmm2   float current
#;
#; They are moved into the registers used by the System V AMD64 (Linux)
#; version of this routine so the algorithm below stays identical:
#;     %rdi    float *buf
#;     %rsi    unsigned int nframes
#;     %xmm0   float current
#; %xmm1 is scratch for the value(s) being examined; %xmm2 is reused for
#; the 0x7fffffff "abs" mask once "current" has been copied to %xmm0.
#;
#; Strategy: only bits 2-3 of buf are examined (mask 12), i.e. the code
#; assumes buf is at least 4-byte aligned. A scalar pre-loop runs until buf
#; is 16-byte aligned, then 4 frames per iteration are folded in with
#; maxps, the four lanes are reduced to one maximum, and a scalar
#; post-loop handles the remaining 0-3 frames.
#;
#; Clobbers %rax, %rdx, %xmm0, %xmm1, %xmm2 and the flags; %rcx, %rdi and
#; %rsi are saved on entry and restored before returning.
	.globl x86_sse_compute_peak
	.def x86_sse_compute_peak; .scl 2; .type 32;
	.endef
x86_sse_compute_peak:
	pushq %rbp
	movq %rsp, %rbp
	#; save registers
	pushq %rcx
	pushq %rdi #; must be preserved
	pushq %rsi #; must be preserved
	#; to keep algorithms universal - move input params into Linux specific registers
	movq %rcx, %rdi
	#; nframes is a 32 bit value and the upper half of %rdx is undefined on
	#; entry, so zero-extend it instead of copying all 64 bits
	movl %edx, %esi
	movss %xmm2, %xmm0
	#; if nframes == 0, go to end
	movq %rsi, %rcx #; nframes
	cmp $0, %rcx
	je .CP_END
	#; create the "abs" mask in %xmm2: 0x7fffffff clears the sign bit
	pushq $2147483647
	movss (%rsp), %xmm2
	addq $8, %rsp
	shufps $0x00, %xmm2, %xmm2 #; broadcast the mask to all 4 lanes
	#; Check for alignment
	movq %rdi, %rdx #; buf => %rdx
	andq $12, %rdx #; mask bits 2 & 3, result = 0, 4, 8 or 12
	jz .CP_SSE #; if buffer IS aligned
	#; PRE-LOOP
	#; we iterate 1-3 times, comparing one float at a time,
	#; so we reach a 16 byte aligned "buf" (=%rdi) value
.LP_START:
	#; Load next value from the buffer
	movss (%rdi), %xmm1
	andps %xmm2, %xmm1 #; fabs
	maxss %xmm1, %xmm0
	#; increment buffer, decrement counter
	addq $4, %rdi #; buf++;
	decq %rcx #; nframes--
	jz .CP_END #; if we run out of frames, we go to the end
	addq $4, %rdx #; one non-aligned frame less
	cmp $16, %rdx
	jne .LP_START #; if more non-aligned frames exist, we do a do-over
.CP_SSE:
	#; We have reached the 16 byte aligned "buf" ("rdi") value
	#; Figure out how many loops we should do
	movq %rcx, %rax #; copy remaining nframes to %rax for division
	shr $2,%rax #; unsigned divide by 4
	jz .POST_START #; fewer than 4 (but at least 1) frames left
	#; %rax = SSE iterations
	#; current maximum is at %xmm0, but we need to ..
	shufps $0x00, %xmm0, %xmm0 #; shuffle "current" to all 4 FP's
	#;prefetcht0 16(%rdi)
.LP_SSE:
	movaps (%rdi), %xmm1
	andps %xmm2, %xmm1 #; fabs on all 4 lanes
	maxps %xmm1, %xmm0
	addq $16, %rdi
	#; %rcx is intentionally left untouched here: the number of frames
	#; remaining after this loop is recovered below as %rcx & 3
	decq %rax
	jnz .LP_SSE
	#; Calculate the maximum value contained in the 4 FP's in %xmm0
	movaps %xmm0, %xmm1
	shufps $0x4e, %xmm1, %xmm1 #; shuffle left & right pairs (1234 => 3412)
	maxps %xmm1, %xmm0 #; maximums of the two pairs
	movaps %xmm0, %xmm1
	shufps $0xb1, %xmm1, %xmm1 #; shuffle the floats inside the two pairs (1234 => 2143)
	maxps %xmm1, %xmm0
	#; now every float in %xmm0 is the same value, current maximum value
	#; Next we need to post-process all remaining frames
	#; the remaining frame count is in the low two bits of %rcx
	#; if no remaining frames, jump to the end
	andq $3, %rcx #; nframes % 4
	jz .CP_END
.POST_START:
	movss (%rdi), %xmm1
	andps %xmm2, %xmm1 #; fabs
	maxss %xmm1, %xmm0
	addq $4, %rdi #; buf++;
	decq %rcx #; nframes--;
	jnz .POST_START
.CP_END:
	#; restore registers
	popq %rsi
	popq %rdi
	popq %rcx
	#; return value is in xmm0
	#; return
	leave
	ret
#; end proc

View File

@ -408,6 +408,13 @@ def build(bld):
obj.source += [ 'sse_functions_xmm.cc', 'sse_functions.s' ]
elif bld.env['build_target'] == 'x86_64':
obj.source += [ 'sse_functions_xmm.cc', 'sse_functions_64bit.s' ]
if bld.env['build_target'] == 'mingw':
import platform as PLATFORM
u = PLATFORM.uname ()
cpu = u[4]
if re.search ("(x86_64|AMD64)", cpu) != None:
obj.source += [ 'sse_functions_xmm.cc', 'sse_functions_64bit_win.s' ]
# i18n
if bld.is_defined('ENABLE_NLS'):

View File

@ -16,7 +16,7 @@
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#ifndef COMPILER_MSVC
#if !(defined (COMPILER_MSVC) || defined (COMPILER_MINGW))
#include "libpbd-config.h"
#define _XOPEN_SOURCE 600
@ -39,10 +39,6 @@ FPU::FPU ()
_flags = Flags (0);
#if defined(__MINGW64__) // Vkamyshniy: under __MINGW64__ the assembler code below is not compiled
return;
#endif
#if !( (defined __x86_64__) || (defined __i386__) ) // !ARCH_X86
return;
#else

View File

@ -1,10 +1,14 @@
#ifdef COMPILER_MSVC // Added by JE - 05-12-2009. Inline assembler instructions
// have been changed to Intel format and (in the case of
// cpuid) was replaced by the equivalent VC++ system call).
// Added by JE - 05-12-2009. Inline assembler instructions
// have been changed to Intel format and (in the case of
// cpuid) was replaced by the equivalent VC++ system call).
#if defined (COMPILER_MSVC) || defined (COMPILER_MINGW)
#define _XOPEN_SOURCE 600
#include <cstdlib>
#include <stdint.h>
#include <intrin.h> // Added by JE - 05-12-2009
#include <assert.h>
#include <pbd/fpu.h>
#include <pbd/error.h>
@ -20,47 +24,19 @@ FPU::FPU ()
_flags = (Flags)0;
#ifndef ARCH_X86
return;
#else
#ifndef USE_X86_64_ASM
int cpuInfo[4];
return;
#endif
// Get the CPU flags using the Microsoft __cpuid intrinsic.
// It works for both 64 and 32 bit systems, so there is
// no need to use assembler to read the registers; this function does that for us
int cpuInfo[4];
__cpuid (cpuInfo, 1);
cpuflags = cpuInfo[3];
/*
__asm { // This is how the original section would look if converted to Intel syntax.
// However, I have grave doubts about whether it's doing the right thing.
// It seems as if the intention was to retrieve feature information from
// the processor. However, feature information is returned in the ebx register
// (if you believe Wikipedia) or in edx (if you believe Microsoft). Unfortunately,
// both registers get ignored in the original code!! Confused?? Join the club!!
mov eax, 1
push ebx
cpuid
mov edx, 0
pop ebx
mov cpuflags, ecx // This can't be right, surely???
}; */
#else
// Note that this syntax is currently still in AT&T format !
asm volatile (
"pushq %%rbx\n"
"movq $1, %%rax\n"
"cpuid\n"
"movq %%rdx, %0\n"
"popq %%rbx\n"
: "=r" (cpuflags)
:
: "%rax", "%rcx", "%rdx", "memory"
);
#endif /* USE_X86_64_ASM */
if (cpuflags & (1<<25)) {
_flags = Flags (_flags | (HasSSE|HasFlushToZero));
_flags = Flags (_flags | (HasSSE|HasFlushToZero) );
}
if (cpuflags & (1<<26)) {
@ -68,32 +44,46 @@ int cpuInfo[4];
}
if (cpuflags & (1 << 24)) {
bool aligned_malloc = false; // Added by JE - 05-12-2009
char* fxbuf = 0;
// This section changed by JE - 05-12-2009
#ifdef NO_POSIX_MEMALIGN
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW) // All of these support '_aligned_malloc()'
fxbuf = (char *) _aligned_malloc(512, 16); // (note that they all need at least MSVC runtime 7.0)
aligned_malloc = true;
#else
fxbuf = (char *) malloc(512);
#endif
#else
fxbuf = posix_memalign ((void**)&fxbuf, 16, 512);
#endif
char** fxbuf = 0;
// allocate aligned buffer
fxbuf = (char **) malloc (sizeof (char *));
assert (fxbuf);
*fxbuf = (char *) malloc (512);
assert (*fxbuf);
// Verify that fxbuf is correctly aligned
unsigned long buf_addr = (unsigned long)(void*)fxbuf;
unsigned long long buf_addr = (unsigned long long)(void*)fxbuf;
if ((0 == buf_addr) || (buf_addr % 16))
error << _("cannot allocate 16 byte aligned buffer for h/w feature detection") << endmsg;
else
{
memset(fxbuf, 0, 512); // Initialize the buffer !!! Added by JE - 12-12-2009
memset(*fxbuf, 0, 512); // Initialize the buffer !!! Added by JE - 12-12-2009
#if defined (COMPILER_MINGW)
asm volatile (
"fxsave (%0)"
:
: "r" (*fxbuf)
: "memory"
);
/*
asm( ".intel_syntax noprefix\n" );
asm volatile (
"mov eax, fxbuf\n"
"fxsave [eax]\n"
);
asm( ".att_syntax prefix\n" );
*/
#elif defined (COMPILER_MSVC)
__asm {
mov eax, fxbuf
fxsave [eax]
};
#endif
// fxbuf is a char**; the FXSAVE image was written to *fxbuf, so read the MXCSR mask from byte 28 of that image (not from fxbuf + 28 pointers)
uint32_t mxcsr_mask = *((uint32_t*) &((*fxbuf)[28]));
/* if the mask is zero, set its default value (from intel specs) */
@ -106,13 +96,10 @@ int cpuInfo[4];
_flags = Flags (_flags | HasDenormalsAreZero);
}
if (aligned_malloc)
_aligned_free (fxbuf);
else
free (fxbuf);
free (*fxbuf);
free (fxbuf);
}
}
#endif // ARCH_X86
}
FPU::~FPU ()

View File

@ -48,7 +48,6 @@ libpbd_sources = [
'error.cc',
'ffs.cc',
'file_utils.cc',
'fpu.cc',
'glib_semaphore.cc',
'id.cc',
'locale_guard.cc',
@ -145,8 +144,18 @@ def build(bld):
if bld.env['build_target'] == 'x86_64':
obj.defines += [ 'USE_X86_64_ASM' ]
if bld.env['build_target'] == 'mingw':
import re
import platform as PLATFORM
u = PLATFORM.uname ()
cpu = u[4]
if re.search ("(x86_64|AMD64)", cpu) != None:
obj.defines += [ 'USE_X86_64_ASM' ]
obj.defines += ['NO_POSIX_MEMALIGN' ]
obj.source += [ 'windows_special_dirs.cc' ]
obj.source += [ 'msvc/fpu.cc' ]
obj.uselib += ' OLE'
else:
obj.source += [ 'fpu.cc' ]
if bld.env['BUILD_TESTS'] and bld.is_defined('HAVE_CPPUNIT'):
# Unit tests

28
wscript
View File

@ -71,6 +71,8 @@ compiler_flags_dictionaries= {
'ultra-strict' : ['-Wredundant-decls', '-Wstrict-prototypes', '-Wmissing-prototypes'],
# Flag to turn on C99 compliance by itself
'c99': '-std=c99',
# Flag to enable AT&T assembler syntax
'attasm': 'asm=att',
},
'msvc' : {
'debuggable' : ['/DDEBUG', '/Od', '/Zi', '/MDd', '/Gd', '/EHsc'],
@ -370,17 +372,15 @@ int main() { return 0; }''',
c_flags.append("-Qunused-arguments")
cxx_flags.append("-Qunused-arguments")
if ((re.search ("i[0-9]86", cpu) != None) or (re.search ("x86_64", cpu) != None)) and conf.env['build_target'] != 'none':
if (re.search ("(i[0-9]86|x86_64|AMD64)", cpu) != None) and conf.env['build_target'] != 'none':
#
# ARCH_X86 means anything in the x86 family from i386 to x86_64
# the compile-time presence of the macro _LP64 is used to
# distingush 32 and 64 bit assembler
#
if (re.search ("(i[0-9]86|x86_64)", cpu) != None):
compiler_flags.append ("-DARCH_X86")
compiler_flags.append ("-DARCH_X86")
if platform == 'linux' :
@ -405,9 +405,19 @@ int main() { return 0; }''',
elif cpu == "i686":
compiler_flags.append ("-march=i686")
if ((conf.env['build_target'] == 'i686') or (conf.env['build_target'] == 'x86_64')) and build_host_supports_sse:
if not is_clang and ((conf.env['build_target'] == 'i686') or (conf.env['build_target'] == 'x86_64')) and build_host_supports_sse:
compiler_flags.extend ([ flags_dict['sse'], flags_dict['fpmath-sse'], flags_dict['xmmintrinsics'] ])
if (conf.env['build_target'] == 'mingw'):
if (re.search ("(x86_64|AMD64)", cpu) != None):
# on Windows sse is supported by 64 bit platforms only
build_host_supports_sse = True
# the mingw GCC compiler uses the AT&T (Unix specific) assembler dialect by default
# compiler_flags.append (["--mmnemonic=att", "msyntax=att")
# add the SSE flags plus the AT&T assembler-dialect flag (the dictionary is named flags_dict, not flags_dicts)
compiler_flags.extend ([ flags_dict['sse'], flags_dict['fpmath-sse'], flags_dict['xmmintrinsics'], flags_dict['attasm'] ])
# end of processor-specific section
# optimization section
@ -415,7 +425,7 @@ int main() { return 0; }''',
if sys.platform == 'darwin':
compiler_flags.append("-DBUILD_VECLIB_OPTIMIZATIONS");
conf.env.append_value('LINKFLAGS_OSX', ['-framework', 'Accelerate'])
elif conf.env['build_target'] == 'i686' or conf.env['build_target'] == 'x86_64':
elif conf.env['build_target'] == 'i686' or conf.env['build_target'] == 'x86_64' or (conf.env['build_target'] == 'mingw' and build_host_supports_sse):
compiler_flags.append ("-DBUILD_SSE_OPTIMIZATIONS")
if not build_host_supports_sse:
print("\nWarning: you are building Ardour with SSE support even though your system does not support these instructions. (This may not be an error, especially if you are a package maintainer)")
@ -859,7 +869,7 @@ def configure(conf):
autowaf.check_pkg(conf, 'rubberband', uselib_store='RUBBERBAND', mandatory=True)
if Options.options.dist_target == 'mingw':
Options.options.fpu_optimization = False
Options.options.fpu_optimization = True
conf.env.append_value('CFLAGS', '-DPLATFORM_WINDOWS')
conf.env.append_value('CFLAGS', '-DCOMPILER_MINGW')
conf.env.append_value('CXXFLAGS', '-DPLATFORM_WINDOWS')