From 8af992c449b895ec8be638049fd2510388f23ddd Mon Sep 17 00:00:00 2001 From: Greg Zharun Date: Wed, 8 Apr 2015 16:29:33 +0300 Subject: [PATCH] [Summary] Added SSE sound processing functions support for Windows. Version 1. Conflicts: wscript --- libs/ardour/mix.cc | 6 +- libs/ardour/sse_functions_64bit_win.s | 679 ++++++++++++++++++++++++++ libs/ardour/wscript | 7 + libs/pbd/fpu.cc | 6 +- libs/pbd/msvc/fpu.cc | 105 ++-- libs/pbd/wscript | 11 +- wscript | 28 +- 7 files changed, 765 insertions(+), 77 deletions(-) create mode 100644 libs/ardour/sse_functions_64bit_win.s diff --git a/libs/ardour/mix.cc b/libs/ardour/mix.cc index 220cd0660c..adae68ae7f 100644 --- a/libs/ardour/mix.cc +++ b/libs/ardour/mix.cc @@ -32,7 +32,7 @@ using namespace ARDOUR; // Debug wrappers float -debug_compute_peak (ARDOUR::Sample *buf, pframes_t nsamples, float current) +debug_compute_peak (const ARDOUR::Sample *buf, pframes_t nsamples, float current) { if ( ((intptr_t)buf % 16) != 0) { std::cerr << "compute_peak(): buffer unaligned!" << std::endl; @@ -52,7 +52,7 @@ debug_apply_gain_to_buffer (ARDOUR::Sample *buf, pframes_t nframes, float gain) } void -debug_mix_buffers_with_gain (ARDOUR::Sample *dst, ARDOUR::Sample *src, pframes_t nframes, float gain) +debug_mix_buffers_with_gain (ARDOUR::Sample *dst, const ARDOUR::Sample *src, pframes_t nframes, float gain) { if ( ((intptr_t)dst & 15) != 0) { std::cerr << "mix_buffers_with_gain(): dst unaligned!" << std::endl; @@ -67,7 +67,7 @@ debug_mix_buffers_with_gain (ARDOUR::Sample *dst, ARDOUR::Sample *src, pframes_t } void -debug_mix_buffers_no_gain (ARDOUR::Sample *dst, ARDOUR::Sample *src, pframes_t nframes) +debug_mix_buffers_no_gain (ARDOUR::Sample *dst, const ARDOUR::Sample *src, pframes_t nframes) { if ( ((intptr_t)dst & 15) != 0) { std::cerr << "mix_buffers_no_gain(): dst unaligned!" << std::endl; diff --git a/libs/ardour/sse_functions_64bit_win.s b/libs/ardour/sse_functions_64bit_win.s new file mode 100644 index 0000000000..7a50c9aef5 --- /dev/null +++ b/libs/ardour/sse_functions_64bit_win.s @@ -0,0 +1,679 @@ +/* + Copyright (C) 2005-2006 Paul Davis, John Rigg + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ + Author: Sampo Savolainen + 64-bit conversion: John Rigg + + $Id$ +*/ + +#; Microsoft version of SSE sample processing functions + +#; void x86_sse_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain); + +.globl x86_sse_mix_buffers_with_gain + .def x86_sse_mix_buffers_with_gain; .scl 2; .type 32; +.endef + +x86_sse_mix_buffers_with_gain: + +#; due to Microsoft calling convention +#; %rcx float *dst +#; %rdx float *src +#; %r8 unsigned int nframes +#; %xmm3 float gain + +#; due to System V AMD64 (Linux) calling convention +#; %rdi float *dst +#; %rsi float *src +#; %rdx unsigned int nframes +#; %xmm0 float gain + + pushq %rbp + movq %rsp, %rbp + + #; save the registers + pushq %rbx #; must be preserved + pushq %rcx + pushq %rdx + pushq %rdi #; must be preserved + pushq %rsi #; must be preserved + + #; to keep algorithms universal - move input params into Linux specific registers + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movss %xmm3, %xmm0 + + #; if nframes == 0, go to end + cmp $0, %rdx + je .MBWG_END + + #; Check for alignment + + movq %rdi, %rax + andq $12, %rax #; mask alignment offset + + movq %rsi, %rbx + andq $12, %rbx #; mask alignment offset + + cmp %rax, %rbx + jne .MBWG_NONALIGN #; if not aligned, calculate manually + + #; if we are aligned + cmp $0, %rbx + jz .MBWG_SSE + + #; Pre-loop, we need to run 1-3 frames "manually" without + #; SSE instructions + +.MBWG_PRELOOP: + + #; gain is already in %xmm0 + movss (%rsi), %xmm1 + mulss %xmm0, %xmm1 + addss (%rdi), %xmm1 + movss %xmm1, (%rdi) + + addq $4, %rdi #; dst++ + addq $4, %rsi #; src++ + decq %rdx #; nframes-- + jz .MBWG_END + + addq $4, %rbx + + cmp $16, %rbx #; test if we've reached 16 byte alignment + jne .MBWG_PRELOOP + + +.MBWG_SSE: + + cmp $4, %rdx #; we know it's not zero, but if it's not >=4, then + jnge .MBWG_NONALIGN #; we jump straight to the "normal" code + + #; gain is already in %xmm0 + shufps $0x00, %xmm0, %xmm0 + + +.MBWG_SSELOOP: + + movaps (%rsi), %xmm1 #; source => xmm0 + mulps %xmm0, %xmm1 #; apply gain to source + addps (%rdi), %xmm1 #; mix with destination + movaps %xmm1, (%rdi) #; copy result to destination + + addq $16, %rdi #; dst+=4 + addq $16, %rsi #; src+=4 + + subq $4, %rdx #; nframes-=4 + cmp $4, %rdx + jge .MBWG_SSELOOP + + cmp $0, %rdx + je .MBWG_END + + #; if there are remaining frames, the nonalign code will do nicely + #; for the rest 1-3 frames. + +.MBWG_NONALIGN: + #; not aligned! 
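+	#; Scalar fallback: taken when dst and src are not offset identically from a
+	#; 16-byte boundary, and for the 1-3 frames left over after the SSE loop.
+	#; Each pass below computes dst[i] += src[i] * gain with movss/mulss/addss,
+	#; one float at a time.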
+ + #; gain is already in %xmm0 + +.MBWG_NONALIGNLOOP: + + movss (%rsi), %xmm1 + mulss %xmm0, %xmm1 + addss (%rdi), %xmm1 + movss %xmm1, (%rdi) + + addq $4, %rdi + addq $4, %rsi + + decq %rdx + jnz .MBWG_NONALIGNLOOP + +.MBWG_END: + + popq %rsi + popq %rdi + popq %rdx + popq %rcx + popq %rbx + + #; return + leave + ret + + +#; void x86_sse_mix_buffers_no_gain (float *dst, float *src, unsigned int nframes); + +.globl x86_sse_mix_buffers_no_gain + .def x86_sse_mix_buffers_no_gain; .scl 2; .type 32; +.endef + +x86_sse_mix_buffers_no_gain: + +#; due to Microsoft calling convention +#; %rcx float *dst +#; %rdx float *src +#; %r8 unsigned int nframes + +#; due to System V AMD64 (Linux) calling convention +#; %rdi float *dst +#; %rsi float *src +#; %rdx unsigned int nframes + + pushq %rbp + movq %rsp, %rbp + + #; save the registers + pushq %rbx #; must be preserved + pushq %rcx + pushq %rdx + pushq %rdi #; must be preserved + pushq %rsi #; must be preserved + + #; to keep algorithms universal - move input params into Linux specific registers + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + + #; the real function + + #; if nframes == 0, go to end + cmp $0, %r8 + je .MBNG_END + + #; Check for alignment + + movq %rdi, %rax + andq $12, %rax #; mask alignment offset + + movq %rsi, %rbx + andq $12, %rbx #; mask alignment offset + + cmp %rax, %rbx + jne .MBNG_NONALIGN #; if not aligned, calculate manually + + cmp $0, %rbx + je .MBNG_SSE + + #; Pre-loop, we need to run 1-3 frames "manually" without + #; SSE instructions + +.MBNG_PRELOOP: + + movss (%rsi), %xmm0 + addss (%rdi), %xmm0 + movss %xmm0, (%rdi) + + addq $4, %rdi #; dst++ + addq $4, %rsi #; src++ + decq %rdx #; nframes-- + jz .MBNG_END + addq $4, %rbx + + cmp $16, %rbx #; test if we've reached 16 byte alignment + jne .MBNG_PRELOOP + +.MBNG_SSE: + + cmp $4, %rdx #; if there are frames left, but less than 4 + jnge .MBNG_NONALIGN #; we can't run SSE + +.MBNG_SSELOOP: + + movaps (%rsi), %xmm0 #; source => xmm0 + addps (%rdi), %xmm0 #; mix with destination + movaps %xmm0, (%rdi) #; copy result to destination + + addq $16, %rdi #; dst+=4 + addq $16, %rsi #; src+=4 + + subq $4, %rdx #; nframes-=4 + cmp $4, %rdx + jge .MBNG_SSELOOP + + cmp $0, %rdx + je .MBNG_END + + #; if there are remaining frames, the nonalign code will do nicely + #; for the rest 1-3 frames. + +.MBNG_NONALIGN: + #; not aligned! 
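+	#; Scalar fallback for the no-gain mix: each pass computes dst[i] += src[i]
+	#; one float at a time; the aligned SSE loop above does four per iteration
+	#; with movaps/addps.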
+ + movss (%rsi), %xmm0 #; src => xmm0 + addss (%rdi), %xmm0 #; xmm0 += dst + movss %xmm0, (%rdi) #; xmm0 => dst + + addq $4, %rdi + addq $4, %rsi + + decq %rdx + jnz .MBNG_NONALIGN + +.MBNG_END: + + popq %rsi + popq %rdi + popq %rdx + popq %rcx + popq %rbx + + #; return + leave + ret + + +#; void x86_sse_apply_gain_to_buffer (float *buf, unsigned int nframes, float gain); + +.globl x86_sse_apply_gain_to_buffer + .def x86_sse_apply_gain_to_buffer; .scl 2; .type 32; +.endef + +x86_sse_apply_gain_to_buffer: + +#; due to Microsoft calling convention +#; %rcx float *buf 32(%rbp) +#; %rdx unsigned int nframes +#; %xmm2 float gain +#; %xmm1 float buf[0] + +#; due to System V AMD64 (Linux) calling convention +#; %rdi float *buf 32(%rbp) +#; %rsi unsigned int nframes +#; %xmm0 float gain +#; %xmm1 float buf[0] + + pushq %rbp + movq %rsp, %rbp + + #; save the registers + pushq %rcx + pushq %rdi #; must be preserved + pushq %rsi #; must be preserved + + #; to keep algorithms universal - move input params into Linux specific registers + movq %rcx, %rdi + movq %rdx, %rsi + movss %xmm2, %xmm0 + + #; the real function + + #; if nframes == 0, go to end + movq %rsi, %rcx #; nframes + cmp $0, %rcx + je .AG_END + + #; set up the gain buffer (gain is already in %xmm0) + shufps $0x00, %xmm0, %xmm0 + + #; Check for alignment + + movq %rdi, %rdx #; buf => %rdx + andq $12, %rdx #; mask bits 1 & 2, result = 0, 4, 8 or 12 + jz .AG_SSE #; if buffer IS aligned + + #; PRE-LOOP + #; we iterate 1-3 times, doing normal x87 float comparison + #; so we reach a 16 byte aligned "buf" (=%rdi) value + +.AGLP_START: + + #; Load next value from the buffer into %xmm1 + movss (%rdi), %xmm1 + mulss %xmm0, %xmm1 + movss %xmm1, (%rdi) + + #; increment buffer, decrement counter + addq $4, %rdi #; buf++; + + decq %rcx #; nframes-- + jz .AG_END #; if we run out of frames, we go to the end + + addq $4, %rdx #; one non-aligned byte less + cmp $16, %rdx + jne .AGLP_START #; if more non-aligned frames exist, we do a do-over + +.AG_SSE: + + #; We have reached the 16 byte aligned "buf" ("rdi") value + + #; Figure out how many loops we should do + movq %rcx, %rax #; copy remaining nframes to %rax for division + + shr $2,%rax #; unsigned divide by 4 + + #; %rax = SSE iterations + cmp $0, %rax + je .AGPOST_START + +.AGLP_SSE: + + movaps (%rdi), %xmm1 + mulps %xmm0, %xmm1 + movaps %xmm1, (%rdi) + + addq $16, %rdi #; buf + 4 + subq $4, %rcx #; nframes-=4 + + decq %rax + jnz .AGLP_SSE + + #; Next we need to post-process all remaining frames + #; the remaining frame count is in %rcx + + andq $3, %rcx #; nframes % 4 + jz .AG_END + +.AGPOST_START: + + movss (%rdi), %xmm1 + mulss %xmm0, %xmm1 + movss %xmm1, (%rdi) + + #; increment buffer, decrement counter + addq $4, %rdi #; buf++; + + decq %rcx #; nframes-- + jnz .AGPOST_START #; if we run out of frames, we go to the end + +.AG_END: + + popq %rsi + popq %rdi + popq %rcx + + #; return + leave + ret + +#; end proc + + +#; x86_sse_apply_gain_vector(float *buf, float *gain_vector, unsigned int nframes) + +.globl x86_sse_apply_gain_vector + .def x86_sse_apply_gain_vector; .scl 2; .type 32; +.endef + + +x86_sse_apply_gain_vector: + +#; due to Microsoft calling convention +#; %rcx float *buf +#; %rdx float *gain_vector +#; %r8 unsigned int nframes + +#; due to System V AMD64 (Linux) calling convention +#; %rdi float *buf +#; %rsi float *gain_vector +#; %rdx unsigned int nframes + + pushq %rbp + movq %rsp, %rbp + + #; save the registers + pushq %rbx #; must be preserved + pushq %rcx + pushq %rdx + pushq %rdi 
#; must be preserved + pushq %rsi #; must be preserved + + #; to keep algorithms universal - move input params into Linux specific registers + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + + #; if nframes == 0 go to end + cmp $0, %rdx + je .AGA_END + + #; Check alignment + movq %rdi, %rax + andq $12, %rax + + movq %rsi, %rbx + andq $12, %rbx + + cmp %rax,%rbx + jne .AGA_ENDLOOP + + cmp $0, %rax + jz .AGA_SSE #; if buffers are aligned, jump to the SSE loop + +#; Buffers aren't 16 byte aligned, but they are unaligned by the same amount +.AGA_ALIGNLOOP: + + movss (%rdi), %xmm0 #; buf => xmm0 + movss (%rsi), %xmm1 #; gain value => xmm1 + mulss %xmm1, %xmm0 #; xmm1 * xmm0 => xmm0 + movss %xmm0, (%rdi) #; signal with gain => buf + + decq %rdx + jz .AGA_END + + addq $4, %rdi #; buf++ + addq $4, %rsi #; gab++ + + addq $4, %rax + cmp $16, %rax + jne .AGA_ALIGNLOOP + +#; There are frames left for sure, as that is checked in the beginning +#; and within the previous loop. BUT, there might be less than 4 frames +#; to process + +.AGA_SSE: + movq %rdx, %rax #; nframes => %rax + shr $2, %rax #; unsigned divide by 4 + + cmp $0, %rax + je .AGA_ENDLOOP + +.AGA_SSELOOP: + movaps (%rdi), %xmm0 + movaps (%rsi), %xmm1 + mulps %xmm1, %xmm0 + movaps %xmm0, (%rdi) + + addq $16, %rdi + addq $16, %rsi + + decq %rax + jnz .AGA_SSELOOP + + andq $3, %rdx #; Remaining frames are nframes & 3 + jz .AGA_END + + +#; Inside this loop, we know there are frames left to process +#; but because either there are < 4 frames left, or the buffers +#; are not aligned, we can't use the parallel SSE ops +.AGA_ENDLOOP: + movss (%rdi), %xmm0 #; buf => xmm0 + movss (%rsi), %xmm1 #; gain value => xmm1 + mulss %xmm1, %xmm0 #; xmm1 * xmm0 => xmm0 + movss %xmm0, (%rdi) #; signal with gain => buf + + addq $4,%rdi + addq $4,%rsi + decq %rdx #; nframes-- + jnz .AGA_ENDLOOP + +.AGA_END: + + popq %rsi + popq %rdi + popq %rdx + popq %rcx + popq %rbx + + leave + ret + +#; end proc + + +#; float x86_sse_compute_peak(float *buf, long nframes, float current); + +.globl x86_sse_compute_peak + .def x86_sse_compute_peak; .scl 2; .type 32; +.endef + + +x86_sse_compute_peak: + +#; due to Microsoft calling convention +#; %rcx float* buf 32(%rbp) +#; %rdx unsigned int nframes +#; %xmm2 float current +#; %xmm1 float buf[0] + +#; due to System V AMD64 (Linux) calling convention +#; %rdi float* buf 32(%rbp) +#; %rsi unsigned int nframes +#; %xmm0 float current +#; %xmm1 float buf[0] + + pushq %rbp + movq %rsp, %rbp + + #; save registers + pushq %rcx + pushq %rdi #; must be preserved + pushq %rsi #; must be preserved + + #; to keep algorithms universal - move input params into Linux specific registers + movq %rcx, %rdi + movq %rdx, %rsi + movss %xmm2, %xmm0 + + #; if nframes == 0, go to end + movq %rsi, %rcx #; nframes + cmp $0, %rcx + je .CP_END + + #; create the "abs" mask in %xmm2 + pushq $2147483647 + movss (%rsp), %xmm2 + addq $8, %rsp + shufps $0x00, %xmm2, %xmm2 + + #; Check for alignment + + #;movq 8(%rbp), %rdi #; buf + movq %rdi, %rdx #; buf => %rdx + andq $12, %rdx #; mask bits 1 & 2, result = 0, 4, 8 or 12 + jz .CP_SSE #; if buffer IS aligned + + #; PRE-LOOP + #; we iterate 1-3 times, doing normal x87 float comparison + #; so we reach a 16 byte aligned "buf" (=%rdi) value + +.LP_START: + + #; Load next value from the buffer + movss (%rdi), %xmm1 + andps %xmm2, %xmm1 + maxss %xmm1, %xmm0 + + #; increment buffer, decrement counter + addq $4, %rdi #; buf++; + + decq %rcx #; nframes-- + jz .CP_END #; if we run out of frames, we go to the end + + 
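+	#; (Each pre-loop pass computes current = max (current, fabsf (buf[i])):
+	#; andps with the 0x7fffffff mask held in %xmm2 clears the sign bit and
+	#; maxss keeps the larger value in %xmm0.)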
addq $4, %rdx #; one non-aligned byte less + cmp $16, %rdx + jne .LP_START #; if more non-aligned frames exist, we do a do-over + +.CP_SSE: + + #; We have reached the 16 byte aligned "buf" ("rdi") value + + #; Figure out how many loops we should do + movq %rcx, %rax #; copy remaining nframes to %rax for division + + shr $2,%rax #; unsigned divide by 4 + jz .POST_START + + #; %rax = SSE iterations + + #; current maximum is at %xmm0, but we need to .. + shufps $0x00, %xmm0, %xmm0 #; shuffle "current" to all 4 FP's + + #;prefetcht0 16(%rdi) + +.LP_SSE: + + movaps (%rdi), %xmm1 + andps %xmm2, %xmm1 + maxps %xmm1, %xmm0 + + addq $16, %rdi + + subq $4, %rdx #; nframes-=4 + + decq %rax + jnz .LP_SSE + + #; Calculate the maximum value contained in the 4 FP's in %xmm0 + movaps %xmm0, %xmm1 + shufps $0x4e, %xmm1, %xmm1 #; shuffle left & right pairs (1234 => 3412) + maxps %xmm1, %xmm0 #; maximums of the two pairs + movaps %xmm0, %xmm1 + shufps $0xb1, %xmm1, %xmm1 #; shuffle the floats inside the two pairs (1234 => 2143) + maxps %xmm1, %xmm0 + + #; now every float in %xmm0 is the same value, current maximum value + + #; Next we need to post-process all remaining frames + #; the remaining frame count is in %rcx + + #; if no remaining frames, jump to the end + + andq $3, %rcx #; nframes % 4 + jz .CP_END + +.POST_START: + + movss (%rdi), %xmm1 + andps %xmm2, %xmm1 + maxss %xmm1, %xmm0 + + addq $4, %rdi #; buf++; + + decq %rcx #; nframes--; + jnz .POST_START + +.CP_END: + + #; restore registers + popq %rsi + popq %rdi + popq %rcx + + #; return value is in xmm0 + + #; return + leave + ret + +#; end proc \ No newline at end of file diff --git a/libs/ardour/wscript b/libs/ardour/wscript index 41d0edc3dd..6026816bcb 100644 --- a/libs/ardour/wscript +++ b/libs/ardour/wscript @@ -408,6 +408,13 @@ def build(bld): obj.source += [ 'sse_functions_xmm.cc', 'sse_functions.s' ] elif bld.env['build_target'] == 'x86_64': obj.source += [ 'sse_functions_xmm.cc', 'sse_functions_64bit.s' ] + + if bld.env['build_target'] == 'mingw': + import platform as PLATFORM + u = PLATFORM.uname () + cpu = u[4] + if re.search ("(x86_64|AMD64)", cpu) != None: + obj.source += [ 'sse_functions_xmm.cc', 'sse_functions_64bit_win.s' ] # i18n if bld.is_defined('ENABLE_NLS'): diff --git a/libs/pbd/fpu.cc b/libs/pbd/fpu.cc index b12d341366..0998f43bdc 100644 --- a/libs/pbd/fpu.cc +++ b/libs/pbd/fpu.cc @@ -16,7 +16,7 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#ifndef COMPILER_MSVC +#if !(defined (COMPILER_MSVC) || defined (COMPILER_MINGW)) #include "libpbd-config.h" #define _XOPEN_SOURCE 600 @@ -39,10 +39,6 @@ FPU::FPU () _flags = Flags (0); -#if defined(__MINGW64__) // Vkamyshniy: under __MINGW64__ the assembler code below is not compiled - return; -#endif - #if !( (defined __x86_64__) || (defined __i386__) ) // !ARCH_X86 return; #else diff --git a/libs/pbd/msvc/fpu.cc b/libs/pbd/msvc/fpu.cc index 6997405928..2ade2ad511 100644 --- a/libs/pbd/msvc/fpu.cc +++ b/libs/pbd/msvc/fpu.cc @@ -1,10 +1,14 @@ -#ifdef COMPILER_MSVC // Added by JE - 05-12-2009. Inline assembler instructions - // have been changed to Intel format and (in the case of - // cpuid) was replaced by the equivalent VC++ system call). +// Added by JE - 05-12-2009. Inline assembler instructions +// have been changed to Intel format and (in the case of +// cpuid) was replaced by the equivalent VC++ system call). 
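+//
+// Rough sketch (not part of the original code) of what the detection below
+// amounts to, using the CPUID leaf-1 EDX feature bits that FPU::FPU() tests:
+//
+//     int info[4];                      // EAX, EBX, ECX, EDX
+//     __cpuid (info, 1);                // MSVC/MinGW intrinsic
+//     bool sse  = (info[3] & (1 << 25)) != 0;
+//     bool sse2 = (info[3] & (1 << 26)) != 0;
+//     bool fxsr = (info[3] & (1 << 24)) != 0;  // fxsave usable -> MXCSR mask readable
+//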
+
+#if defined (COMPILER_MSVC) || defined (COMPILER_MINGW)
+
 #define _XOPEN_SOURCE 600
 #include 
 #include 
 #include  // Added by JE - 05-12-2009
+#include 
 #include 
 #include 
@@ -20,47 +24,19 @@ FPU::FPU ()
 	_flags = (Flags)0;
-#ifndef ARCH_X86
-	return;
-
-#else
-
 #ifndef USE_X86_64_ASM
-int cpuInfo[4];
+	return;
+#endif
+
+	// Get the CPU flags using the Microsoft __cpuid intrinsic.
+	// It works for both 64-bit and 32-bit targets, so there is no need to read
+	// the feature register with inline assembler; the intrinsic does it for us.
+	int cpuInfo[4];
 	__cpuid (cpuInfo, 1);
 	cpuflags = cpuInfo[3];
-/*
-	__asm {  // This is how the original section would look if converted to Intel syntax.
-	         // However, I have grave doubts about whether it's doing the right thing.
-	         // It seems as if the intention was to retrieve feature information from
-	         // the processor. However, feature information is returned in the ebx register
-	         // (if you believe Wikipedia) or in edx (if you believe Microsoft). Unfortunately,
-	         // both registers get ignored in the original code!! Confused?? Join the club!!
-		mov eax, 1
-		push ebx
-		cpuid
-		mov edx, 0
-		pop ebx
-		mov cpuflags, ecx // This can't be right, surely???
-	}; */
-#else
-// Note that this syntax is currently still in AT&T format !
-	asm volatile (
-		"pushq %%rbx\n"
-		"movq $1, %%rax\n"
-		"cpuid\n"
-		"movq %%rdx, %0\n"
-		"popq %%rbx\n"
-		: "=r" (cpuflags)
-		:
-		: "%rax", "%rcx", "%rdx", "memory"
-		);
-
-#endif /* USE_X86_64_ASM */
 	if (cpuflags & (1<<25)) {
-		_flags = Flags (_flags | (HasSSE|HasFlushToZero));
+		_flags = Flags (_flags | (HasSSE|HasFlushToZero) );
 	}
 	if (cpuflags & (1<<26)) {
@@ -68,32 +44,46 @@ int cpuInfo[4];
 	if (cpuflags & (1 << 24)) {
-		bool aligned_malloc = false; // Added by JE - 05-12-2009
-		char* fxbuf = 0;
-// This section changed by JE - 05-12-2009
-#ifdef NO_POSIX_MEMALIGN
-#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW) // All of these support '_aligned_malloc()'
-		fxbuf = (char *) _aligned_malloc(512, 16); // (note that they all need at least MSVC runtime 7.0)
-		aligned_malloc = true;
-#else
-		fxbuf = (char *) malloc(512);
-#endif
-#else
-		fxbuf = posix_memalign ((void**)&fxbuf, 16, 512);
-#endif
+		char** fxbuf = 0;
+
+		// allocate the buffer used for fxsave (its alignment is verified below)
+		fxbuf = (char **) malloc (sizeof (char *));
+		assert (fxbuf);
+		*fxbuf = (char *) malloc (512);
+		assert (*fxbuf);
+
 		// Verify that fxbuf is correctly aligned
-		unsigned long buf_addr = (unsigned long)(void*)fxbuf;
+		unsigned long long buf_addr = (unsigned long long)(void*)*fxbuf;
 		if ((0 == buf_addr) || (buf_addr % 16))
 			error << _("cannot allocate 16 byte aligned buffer for h/w feature detection") << endmsg;
 		else
 		{
-			memset(fxbuf, 0, 512); // Initialize the buffer !!!
Added by JE - 12-12-2009 +#if defined (COMPILER_MINGW) + asm volatile ( + "fxsave (%0)" + : + : "r" (*fxbuf) + : "memory" + ); +/* + asm( ".intel_syntax noprefix\n" ); + + asm volatile ( + "mov eax, fxbuf\n" + "fxsave [eax]\n" + ); + + asm( ".att_syntax prefix\n" ); +*/ + +#elif defined (COMPILER_MSVC) __asm { mov eax, fxbuf fxsave [eax] }; - +#endif uint32_t mxcsr_mask = *((uint32_t*) &fxbuf[28]); /* if the mask is zero, set its default value (from intel specs) */ @@ -106,13 +96,10 @@ int cpuInfo[4]; _flags = Flags (_flags | HasDenormalsAreZero); } - if (aligned_malloc) - _aligned_free (fxbuf); - else - free (fxbuf); + free (*fxbuf); + free (fxbuf); } } -#endif // ARCH_X86 } FPU::~FPU () diff --git a/libs/pbd/wscript b/libs/pbd/wscript index 8f947fbb26..e20131b068 100644 --- a/libs/pbd/wscript +++ b/libs/pbd/wscript @@ -48,7 +48,6 @@ libpbd_sources = [ 'error.cc', 'ffs.cc', 'file_utils.cc', - 'fpu.cc', 'glib_semaphore.cc', 'id.cc', 'locale_guard.cc', @@ -145,8 +144,18 @@ def build(bld): if bld.env['build_target'] == 'x86_64': obj.defines += [ 'USE_X86_64_ASM' ] if bld.env['build_target'] == 'mingw': + import re + import platform as PLATFORM + u = PLATFORM.uname () + cpu = u[4] + if re.search ("(x86_64|AMD64)", cpu) != None: + obj.defines += [ 'USE_X86_64_ASM' ] + obj.defines += ['NO_POSIX_MEMALIGN' ] obj.source += [ 'windows_special_dirs.cc' ] + obj.source += [ 'msvc/fpu.cc' ] obj.uselib += ' OLE' + else: + obj.source += [ 'fpu.cc' ] if bld.env['BUILD_TESTS'] and bld.is_defined('HAVE_CPPUNIT'): # Unit tests diff --git a/wscript b/wscript index bb3157434d..61e2abd4d1 100644 --- a/wscript +++ b/wscript @@ -71,6 +71,8 @@ compiler_flags_dictionaries= { 'ultra-strict' : ['-Wredundant-decls', '-Wstrict-prototypes', '-Wmissing-prototypes'], # Flag to turn on C99 compliance by itself 'c99': '-std=c99', + # Flag to enable AT&T assembler syntax + 'attasm': 'asm=att', }, 'msvc' : { 'debuggable' : ['/DDEBUG', '/Od', '/Zi', '/MDd', '/Gd', '/EHsc'], @@ -370,17 +372,15 @@ int main() { return 0; }''', c_flags.append("-Qunused-arguments") cxx_flags.append("-Qunused-arguments") - if ((re.search ("i[0-9]86", cpu) != None) or (re.search ("x86_64", cpu) != None)) and conf.env['build_target'] != 'none': - + if (re.search ("(i[0-9]86|x86_64|AMD64)", cpu) != None) and conf.env['build_target'] != 'none': # # ARCH_X86 means anything in the x86 family from i386 to x86_64 # the compile-time presence of the macro _LP64 is used to # distingush 32 and 64 bit assembler # - - if (re.search ("(i[0-9]86|x86_64)", cpu) != None): - compiler_flags.append ("-DARCH_X86") + + compiler_flags.append ("-DARCH_X86") if platform == 'linux' : @@ -405,9 +405,19 @@ int main() { return 0; }''', elif cpu == "i686": compiler_flags.append ("-march=i686") - if ((conf.env['build_target'] == 'i686') or (conf.env['build_target'] == 'x86_64')) and build_host_supports_sse: + if not is_clang and ((conf.env['build_target'] == 'i686') or (conf.env['build_target'] == 'x86_64')) and build_host_supports_sse: compiler_flags.extend ([ flags_dict['sse'], flags_dict['fpmath-sse'], flags_dict['xmmintrinsics'] ]) - + + if (conf.env['build_target'] == 'mingw'): + if (re.search ("(x86_64|AMD64)", cpu) != None): + # on Windows sse is supported by 64 bit platforms only + build_host_supports_sse = True + + # mingw GCC compiler to uses at&t (Unix specific) assembler dialect by default + # compiler_flags.append (["--mmnemonic=att", "msyntax=att") + + compiler_flags.extend ([ flags_dict['sse'], flags_dict['fpmath-sse'], flags_dict['xmmintrinsics'], 
flags_dict['attasm'] ])
+
 # end of processor-specific section
 # optimization section
@@ -415,7 +425,7 @@ int main() { return 0; }''',
     if sys.platform == 'darwin':
         compiler_flags.append("-DBUILD_VECLIB_OPTIMIZATIONS");
         conf.env.append_value('LINKFLAGS_OSX', ['-framework', 'Accelerate'])
-    elif conf.env['build_target'] == 'i686' or conf.env['build_target'] == 'x86_64':
+    elif conf.env['build_target'] == 'i686' or conf.env['build_target'] == 'x86_64' or (conf.env['build_target'] == 'mingw' and build_host_supports_sse):
         compiler_flags.append ("-DBUILD_SSE_OPTIMIZATIONS")
         if not build_host_supports_sse:
             print("\nWarning: you are building Ardour with SSE support even though your system does not support these instructions. (This may not be an error, especially if you are a package maintainer)")
@@ -859,7 +869,7 @@ def configure(conf):
         autowaf.check_pkg(conf, 'rubberband', uselib_store='RUBBERBAND', mandatory=True)
     if Options.options.dist_target == 'mingw':
-        Options.options.fpu_optimization = False
+        Options.options.fpu_optimization = True
        conf.env.append_value('CFLAGS', '-DPLATFORM_WINDOWS')
        conf.env.append_value('CFLAGS', '-DCOMPILER_MINGW')
        conf.env.append_value('CXXFLAGS', '-DPLATFORM_WINDOWS')
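For reference, the scalar sketch below (illustrative only; the *_ref names are not part of the patch) shows what each of the five routines in sse_functions_64bit_win.s computes. The assembly processes four floats per iteration with movaps/mulps/addps/maxps once both pointers reach 16-byte alignment and falls back to the equivalent per-sample form otherwise; the only difference from sse_functions_64bit.s is the shim at the top of each function that moves the Microsoft x64 arguments (RCX, RDX, R8, XMM2/XMM3) into the System V registers (RDI, RSI, RDX, XMM0) that the shared loop bodies expect.

// Illustrative scalar equivalents (not part of the patch) of the routines in
// sse_functions_64bit_win.s; the *_ref names are hypothetical.
#include <algorithm>
#include <cmath>

void mix_buffers_with_gain_ref (float* dst, const float* src, unsigned int nframes, float gain)
{
	for (unsigned int i = 0; i < nframes; ++i)
		dst[i] += src[i] * gain;        // x86_sse_mix_buffers_with_gain
}

void mix_buffers_no_gain_ref (float* dst, const float* src, unsigned int nframes)
{
	for (unsigned int i = 0; i < nframes; ++i)
		dst[i] += src[i];               // x86_sse_mix_buffers_no_gain
}

void apply_gain_to_buffer_ref (float* buf, unsigned int nframes, float gain)
{
	for (unsigned int i = 0; i < nframes; ++i)
		buf[i] *= gain;                 // x86_sse_apply_gain_to_buffer
}

void apply_gain_vector_ref (float* buf, const float* gain_vector, unsigned int nframes)
{
	for (unsigned int i = 0; i < nframes; ++i)
		buf[i] *= gain_vector[i];       // x86_sse_apply_gain_vector
}

float compute_peak_ref (const float* buf, unsigned int nframes, float current)
{
	for (unsigned int i = 0; i < nframes; ++i)
		current = std::max (current, std::fabs (buf[i]));  // x86_sse_compute_peak
	return current;
}

These mirror Ardour's generic (non-SSE) sample-processing paths, which the runtime FPU check falls back to when SSE is unavailable, so they can also serve as a quick cross-check when debugging the assembly versions.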