From 8bb54f41c66ff7db27dff8a06cc28e04bcd0c263 Mon Sep 17 00:00:00 2001
From: Paul Davis
Date: Tue, 12 May 2015 21:27:27 -0400
Subject: [PATCH] add windows-only AVX functions

---
 libs/ardour/sse_avx_functions_64bit_win.s | 587 ++++++++++++++++++++++
 1 file changed, 587 insertions(+)
 create mode 100644 libs/ardour/sse_avx_functions_64bit_win.s

diff --git a/libs/ardour/sse_avx_functions_64bit_win.s b/libs/ardour/sse_avx_functions_64bit_win.s
new file mode 100644
index 0000000000..22829db98a
--- /dev/null
+++ b/libs/ardour/sse_avx_functions_64bit_win.s
@@ -0,0 +1,587 @@
+/*
+        Copyright (C) 2005-2006 Paul Davis, John Rigg
+
+        This program is free software; you can redistribute it and/or modify
+        it under the terms of the GNU General Public License as published by
+        the Free Software Foundation; either version 2 of the License, or
+        (at your option) any later version.
+
+        This program is distributed in the hope that it will be useful,
+        but WITHOUT ANY WARRANTY; without even the implied warranty of
+        MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+        GNU General Public License for more details.
+
+        You should have received a copy of the GNU General Public License
+        along with this program; if not, write to the Free Software
+        Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+        Author: Sampo Savolainen
+        64-bit conversion: John Rigg
+
+        $Id$
+*/
+
+#; Microsoft version of AVX sample processing functions
+
+#; void x86_sse_avx_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain);
+
+.globl x86_sse_avx_mix_buffers_with_gain
+        .def x86_sse_avx_mix_buffers_with_gain; .scl 2; .type 32;
+.endef
+
+x86_sse_avx_mix_buffers_with_gain:
+
+#; due to Microsoft calling convention
+#; %rcx float *dst
+#; %rdx float *src
+#; %r8 unsigned int nframes
+#; %xmm3 float gain
+
+        pushq %rbp
+        movq %rsp, %rbp
+
+        #; save the registers
+        pushq %rbx #; must be preserved
+
+        #; move the gain to %xmm0 for convenience
+        movss %xmm3, %xmm0
+
+        #; if nframes == 0, go to end
+        cmp $0, %r8
+        je .MBWG_END
+
+        #; Check for alignment
+
+        movq %rcx, %rax
+        andq $28, %rax #; mask alignment offset
+
+        movq %rdx, %rbx
+        andq $28, %rbx #; mask alignment offset
+
+        cmp %rax, %rbx
+        jne .MBWG_NONALIGN #; if the buffers are not aligned relative to each other, calculate manually
+
+        #; if we are aligned
+        cmp $0, %rbx
+        jz .MBWG_AVX
+
+        #; Pre-loop: we need to run 1-7 frames "manually", without
+        #; AVX instructions
+
+.MBWG_PRELOOP:
+
+        #; gain is already in %xmm0
+        movss (%rdx), %xmm1
+        mulss %xmm0, %xmm1
+        addss (%rcx), %xmm1
+        movss %xmm1, (%rcx)
+
+        addq $4, %rcx #; dst++
+        addq $4, %rdx #; src++
+        decq %r8 #; nframes--
+        jz .MBWG_END
+
+        addq $4, %rbx
+
+        cmp $32, %rbx #; test if we've reached 32 byte alignment
+        jne .MBWG_PRELOOP
+
+.MBWG_AVX:
+
+        cmp $8, %r8 #; we know it's not zero, but if it's not >= 8
+        jl .MBWG_NONALIGN #; we jump straight to the "normal" code
+
+        #; set up the gain buffer (gain is already in %xmm0)
+        vshufps $0x00, %ymm0, %ymm0, %ymm0 #; spread the single float value across the lower 128 bits of ymm0
+        vperm2f128 $0x00, %ymm0, %ymm0, %ymm0 #; copy the lower 128 bits of ymm0 into the upper 128 bits
+
+.MBWG_AVXLOOP:
+
+        vmovaps (%rdx), %ymm1 #; source => ymm1
+        vmulps %ymm0, %ymm1, %ymm2 #; apply gain to source
+        vaddps (%rcx), %ymm2, %ymm1 #; mix with destination
+        vmovaps %ymm1, (%rcx) #; copy result to destination
+
+        addq $32, %rcx #; dst+=8
+        addq $32, %rdx #; src+=8
+
+        subq $8, %r8 #; nframes-=8
+        cmp $8, %r8
+        jge .MBWG_AVXLOOP
+
+        #; zero the upper 128 bits of all ymm registers to proceed with SSE operations without penalties
+        vzeroupper
+
+        cmp $0, %r8
+        je .MBWG_END
+
+        #; if there are remaining frames, the non-aligned code will do nicely
+        #; for the remaining 1-7 frames.
+
+.MBWG_NONALIGN:
+        #; not aligned!
+
+        #; gain is already in %xmm0
+
+.MBWG_NONALIGNLOOP:
+
+        movss (%rdx), %xmm1
+        mulss %xmm0, %xmm1
+        addss (%rcx), %xmm1
+        movss %xmm1, (%rcx)
+
+        addq $4, %rcx
+        addq $4, %rdx
+
+        decq %r8
+        jnz .MBWG_NONALIGNLOOP
+
+.MBWG_END:
+
+        popq %rbx
+
+        #; return
+        leave
+        ret
+
+
+#; void x86_sse_avx_mix_buffers_no_gain (float *dst, float *src, unsigned int nframes);
+
+.globl x86_sse_avx_mix_buffers_no_gain
+        .def x86_sse_avx_mix_buffers_no_gain; .scl 2; .type 32;
+.endef
+
+x86_sse_avx_mix_buffers_no_gain:
+
+#; due to Microsoft calling convention
+#; %rcx float *dst
+#; %rdx float *src
+#; %r8 unsigned int nframes
+
+        pushq %rbp
+        movq %rsp, %rbp
+
+        #; save the registers
+        pushq %rbx #; must be preserved
+
+        #; the real function
+
+        #; if nframes == 0, go to end
+        cmp $0, %r8
+        je .MBNG_END
+
+        #; Check for alignment
+
+        movq %rcx, %rax
+        andq $28, %rax #; mask alignment offset
+
+        movq %rdx, %rbx
+        andq $28, %rbx #; mask alignment offset
+
+        cmp %rax, %rbx
+        jne .MBNG_NONALIGN #; if the buffers are not aligned relative to each other, calculate manually
+
+        cmp $0, %rbx
+        je .MBNG_AVX #; aligned at 32 bytes, proceed to AVX
+
+        #; Pre-loop: we need to run 1-7 frames "manually", without
+        #; AVX instructions
+
+.MBNG_PRELOOP:
+
+        movss (%rdx), %xmm0
+        addss (%rcx), %xmm0
+        movss %xmm0, (%rcx)
+
+        addq $4, %rcx #; dst++
+        addq $4, %rdx #; src++
+
+        decq %r8 #; nframes--
+        jz .MBNG_END
+
+        addq $4, %rbx #; one non-aligned frame less
+
+        cmp $32, %rbx #; test if we've reached 32 byte alignment
+        jne .MBNG_PRELOOP
+
+.MBNG_AVX:
+
+        cmp $8, %r8 #; if there are frames left, but fewer than 8
+        jl .MBNG_NONALIGN #; we can't run AVX
+
+.MBNG_AVXLOOP:
+
+        vmovaps (%rdx), %ymm0 #; source => ymm0
+        vaddps (%rcx), %ymm0, %ymm1 #; mix with destination
+        vmovaps %ymm1, (%rcx) #; copy result to destination
+
+        addq $32, %rcx #; dst+=8
+        addq $32, %rdx #; src+=8
+
+        subq $8, %r8 #; nframes-=8
+        cmp $8, %r8
+        jge .MBNG_AVXLOOP
+
+        #; zero the upper 128 bits of all ymm registers to proceed with SSE operations without penalties
+        vzeroupper
+
+        cmp $0, %r8
+        je .MBNG_END
+
+        #; if there are remaining frames, the non-aligned code will do nicely
+        #; for the remaining 1-7 frames.
+
+.MBNG_NONALIGN:
+        #; not aligned!
+        #;
+
+        movss (%rdx), %xmm0 #; src => xmm0
+        addss (%rcx), %xmm0 #; xmm0 += dst
+        movss %xmm0, (%rcx) #; xmm0 => dst
+
+        addq $4, %rcx
+        addq $4, %rdx
+
+        decq %r8
+        jnz .MBNG_NONALIGN
+
+.MBNG_END:
+
+        popq %rbx
+
+        #; return
+        leave
+        ret
+
+
+#; void x86_sse_avx_copy_vector (float *dst, float *src, unsigned int nframes);
+
+.globl x86_sse_avx_copy_vector
+        .def x86_sse_avx_copy_vector; .scl 2; .type 32;
+.endef
+
+x86_sse_avx_copy_vector:
+
+#; due to Microsoft calling convention
+#; %rcx float *dst
+#; %rdx float *src
+#; %r8 unsigned int nframes
+
+        pushq %rbp
+        movq %rsp, %rbp
+
+        #; save the registers
+        pushq %rbx #; must be preserved
+
+        #; the real function
+
+        #; if nframes == 0, go to end
+        cmp $0, %r8
+        je .CB_END
+
+        #; Check for alignment
+
+        movq %rcx, %rax
+        andq $28, %rax #; mask alignment offset
+
+        movq %rdx, %rbx
+        andq $28, %rbx #; mask alignment offset
+
+        cmp %rax, %rbx
+        jne .CB_NONALIGN #; if the buffers are not aligned relative to each other, calculate manually
+
+        cmp $0, %rbx
+        je .CB_AVX #; aligned at 32 bytes, proceed to AVX
+
+        #; Pre-loop: we need to run 1-7 frames "manually", without
+        #; AVX instructions
+
+.CB_PRELOOP:
+
+        movss (%rdx), %xmm0
+        movss %xmm0, (%rcx)
+
+        addq $4, %rcx #; dst++
+        addq $4, %rdx #; src++
+
+        decq %r8 #; nframes--
+        jz .CB_END
+
+        addq $4, %rbx #; one non-aligned frame less
+
+        cmp $32, %rbx #; test if we've reached 32 byte alignment
+        jne .CB_PRELOOP
+
+.CB_AVX:
+
+        cmp $8, %r8 #; if there are frames left, but fewer than 8
+        jl .CB_NONALIGN #; we can't run AVX
+
+.CB_AVXLOOP:
+
+        vmovaps (%rdx), %ymm0 #; source => ymm0
+        vmovaps %ymm0, (%rcx) #; copy result to destination
+
+        addq $32, %rcx #; dst+=8
+        addq $32, %rdx #; src+=8
+
+        subq $8, %r8 #; nframes-=8
+        cmp $8, %r8
+        jge .CB_AVXLOOP
+
+        #; zero the upper 128 bits of all ymm registers to proceed with SSE operations without penalties
+        vzeroupper
+
+        cmp $0, %r8
+        je .CB_END
+
+        #; if there are remaining frames, the non-aligned code will do nicely
+        #; for the remaining 1-7 frames.
+
+.CB_NONALIGN:
+        #; not aligned!
+        #;
+
+        movss (%rdx), %xmm0 #; src => xmm0
+        movss %xmm0, (%rcx) #; xmm0 => dst
+
+        addq $4, %rcx
+        addq $4, %rdx
+
+        decq %r8
+        jnz .CB_NONALIGN
+
+.CB_END:
+
+        popq %rbx
+
+        #; return
+        leave
+        ret
+
+
+#; void x86_sse_avx_apply_gain_to_buffer (float *buf, unsigned int nframes, float gain);
+
+.globl x86_sse_avx_apply_gain_to_buffer
+        .def x86_sse_avx_apply_gain_to_buffer; .scl 2; .type 32;
+.endef
+
+x86_sse_avx_apply_gain_to_buffer:
+
+#; due to Microsoft calling convention
+#; %rcx float *buf
+#; %rdx unsigned int nframes
+#; %xmm2 float gain
+
+        pushq %rbp
+        movq %rsp, %rbp
+
+        #; move the gain to %xmm0 for convenience
+        movss %xmm2, %xmm0
+
+        #; the real function
+
+        #; if nframes == 0, go to end
+        cmp $0, %rdx
+        je .AG_END
+
+        #; Check for alignment
+
+        movq %rcx, %r8 #; buf => %r8
+        andq $28, %r8 #; check alignment with mask 11100
+        jz .AG_AVX #; if buffer IS aligned
+
+        #; PRE-LOOP
+        #; we iterate 1-7 times, doing scalar multiplications,
+        #; so we reach a 32 byte aligned "buf" (=%rcx) value
+
+.AGLP_START:
+
+        #; Load next value from the buffer into %xmm1
+        movss (%rcx), %xmm1
+        mulss %xmm0, %xmm1
+        movss %xmm1, (%rcx)
+
+        #; increment buffer, decrement counter
+        addq $4, %rcx #; buf++;
+
+        decq %rdx #; nframes--
+        jz .AG_END #; if we run out of frames, we go to the end
+
+        addq $4, %r8 #; one non-aligned frame less
+        cmp $32, %r8
+        jne .AGLP_START #; if more non-aligned frames exist, we do a do-over
+
+.AG_AVX:
+
+        #; We have reached the 32 byte aligned "buf" (%rcx) value
+        #; use AVX instructions
+
+        #; Figure out how many loops we should do
+        movq %rdx, %rax #; copy remaining nframes to %rax for division
+
+        shr $3, %rax #; unsigned divide by 8
+
+        #; %rax = AVX iterations
+        cmp $0, %rax
+        je .AGPOST_START
+
+        #; set up the gain buffer (gain is already in %xmm0)
+        vshufps $0x00, %ymm0, %ymm0, %ymm0 #; spread the single float value across the lower 128 bits of ymm0
+        vperm2f128 $0x00, %ymm0, %ymm0, %ymm0 #; copy the lower 128 bits of ymm0 into the upper 128 bits
+
+.AGLP_AVX:
+
+        vmovaps (%rcx), %ymm1
+        vmulps %ymm0, %ymm1, %ymm2
+        vmovaps %ymm2, (%rcx)
+
+        addq $32, %rcx #; buf+=8
+        subq $8, %rdx #; nframes-=8
+
+        decq %rax
+        jnz .AGLP_AVX
+
+        #; zero the upper 128 bits of all ymm registers to proceed with SSE operations without penalties
+        vzeroupper
+
+        #; Next we need to post-process all remaining frames
+        #; the remaining frame count is in %rdx
+        cmpq $0, %rdx
+        jz .AG_END
+
+.AGPOST_START:
+
+        movss (%rcx), %xmm1
+        mulss %xmm0, %xmm1
+        movss %xmm1, (%rcx)
+
+        #; increment buffer, decrement counter
+        addq $4, %rcx #; buf++;
+
+        decq %rdx #; nframes--
+        jnz .AGPOST_START #; if there are frames left, loop again
+
+.AG_END:
+
+        #; return
+        leave
+        ret
+
+#; end proc
+
+
+#; float x86_sse_avx_compute_peak(float *buf, long nframes, float current);
+
+.globl x86_sse_avx_compute_peak
+        .def x86_sse_avx_compute_peak; .scl 2; .type 32;
+.endef
+
+x86_sse_avx_compute_peak:
+
+#; due to Microsoft calling convention
+#; %rcx float* buf
+#; %rdx unsigned int nframes
+#; %xmm2 float current
+
+        pushq %rbp
+        movq %rsp, %rbp
+
+        #; move current max to %xmm0 for convenience
+        movss %xmm2, %xmm0
+
+        #; if nframes == 0, go to end
+        cmp $0, %rdx
+        je .CP_END
+
+        #; Check for alignment
+        movq %rcx, %r8 #; buf => %r8
+        andq $28, %r8 #; check alignment with mask 11100
+        jz .CP_AVX #; if buffer IS aligned
+
+        #; PRE-LOOP
+        #; we iterate 1-7 times, doing scalar float comparisons,
+        #; so we reach a 32 byte aligned "buf" (=%rcx) value
+
+.LP_START:
+
+        #; Load next value from the buffer
+        movss (%rcx), %xmm1
+        maxss %xmm1, %xmm0
+
+        #; increment buffer, decrement counter
+        addq $4, %rcx #; buf++;
+
+        decq %rdx #; nframes--
+        jz .CP_END #; if we run out of frames, we go to the end
+
+        addq $4, %r8 #; one non-aligned frame less
+        cmp $32, %r8
+        jne .LP_START #; if more non-aligned frames exist, we do a do-over
+
+.CP_AVX:
+
+        #; We have reached the 32 byte aligned "buf" (%rcx) value
+
+        #; Figure out how many loops we should do
+        movq %rdx, %rax #; copy remaining nframes to %rax for division
+
+        shr $3, %rax #; unsigned divide by 8
+        jz .POST_START
+
+        #; %rax = AVX iterations
+
+        #; current maximum is in %xmm0, but we need to broadcast it to the whole ymm0 register
+        vshufps $0x00, %ymm0, %ymm0, %ymm0 #; spread the single float value across the lower 128 bits of ymm0
+        vperm2f128 $0x00, %ymm0, %ymm0, %ymm0 #; copy the lower 128 bits of ymm0 into the upper 128 bits
+
+.LP_AVX:
+
+        vmovaps (%rcx), %ymm1
+        vmaxps %ymm1, %ymm0, %ymm0
+
+        addq $32, %rcx #; buf+=8
+        subq $8, %rdx #; nframes-=8
+
+        decq %rax
+        jnz .LP_AVX
+
+        #; Calculate the maximum value contained in the 8 floats in %ymm0
+        vshufps $0x4e, %ymm0, %ymm0, %ymm1 #; shuffle left & right pairs (1234 => 3412) in each 128 bit half
+        vmaxps %ymm1, %ymm0, %ymm0 #; pairwise maximums; at most 4 distinct values remain
+        vshufps $0xb1, %ymm0, %ymm0, %ymm1 #; shuffle the floats inside pairs (1234 => 2143) in each 128 bit half
+        vmaxps %ymm1, %ymm0, %ymm0 #; pairwise maximums; at most 2 distinct values remain
+        vperm2f128 $0x01, %ymm0, %ymm0, %ymm1 #; swap the 128 bit halves
+        vmaxps %ymm1, %ymm0, %ymm0 #; now all 8 elements hold the overall maximum
+
+        #; now every float in %ymm0 is the current maximum value
+
+        #; Next we need to post-process all remaining frames
+        #; the remaining frame count is in %rdx
+
+        #; zero the upper 128 bits of all ymm registers to proceed with SSE operations without penalties
+        vzeroupper
+
+        #; if no remaining frames, jump to the end
+        cmp $0, %rdx
+        je .CP_END
+
+.POST_START:
+
+        movss (%rcx), %xmm1
+        maxss %xmm1, %xmm0
+
+        addq $4, %rcx #; buf++;
+
+        decq %rdx #; nframes--;
+        jnz .POST_START
+
+.CP_END:
+
+        #; return value is in xmm0
+
+        #; return
+        leave
+        ret
+
+#; end proc
\ No newline at end of file
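
For reference (not part of the patch): each of the five routines above is a straight data-parallel loop, wrapped in a scalar pre-loop that walks up to 32-byte alignment, an AVX body that processes 8 floats per iteration, and a scalar tail for the leftovers. The C sketch below is a minimal scalar statement of their intended semantics only; it ignores alignment and vectorization, and the "_ref" suffix is illustrative rather than anything that exists in the Ardour source. Note that, as in the assembly, compute_peak keeps a plain running maximum and takes no absolute value.

#include <stdint.h>

/* dst[i] += src[i] * gain */
void x86_sse_avx_mix_buffers_with_gain_ref (float *dst, const float *src,
                                            uint32_t nframes, float gain)
{
        for (uint32_t i = 0; i < nframes; ++i) {
                dst[i] += src[i] * gain;
        }
}

/* dst[i] += src[i] */
void x86_sse_avx_mix_buffers_no_gain_ref (float *dst, const float *src,
                                          uint32_t nframes)
{
        for (uint32_t i = 0; i < nframes; ++i) {
                dst[i] += src[i];
        }
}

/* dst[i] = src[i] */
void x86_sse_avx_copy_vector_ref (float *dst, const float *src, uint32_t nframes)
{
        for (uint32_t i = 0; i < nframes; ++i) {
                dst[i] = src[i];
        }
}

/* buf[i] *= gain, in place */
void x86_sse_avx_apply_gain_to_buffer_ref (float *buf, uint32_t nframes, float gain)
{
        for (uint32_t i = 0; i < nframes; ++i) {
                buf[i] *= gain;
        }
}

/* running maximum of buf[], seeded with the caller's current peak;
   no fabs() here, matching the maxss/vmaxps-only assembly above */
float x86_sse_avx_compute_peak_ref (const float *buf, uint32_t nframes, float current)
{
        for (uint32_t i = 0; i < nframes; ++i) {
                if (buf[i] > current) {
                        current = buf[i];
                }
        }
        return current;
}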