add windows-only AVX functions
This commit is contained in:
parent
b938129589
commit
8bb54f41c6
587
libs/ardour/sse_avx_functions_64bit_win.s
Normal file
587
libs/ardour/sse_avx_functions_64bit_win.s
Normal file
@ -0,0 +1,587 @@
|
||||
/*
|
||||
Copyright (C) 2005-2006 Paul Davis, John Rigg
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
|
||||
Author: Sampo Savolainen
|
||||
64-bit conversion: John Rigg
|
||||
|
||||
$Id$
|
||||
*/
|
||||
|
||||
#; Microsoft version of AVX sample processing functions
|
||||
|
||||
#; void x86_sse_avx_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain);
#;
#; Computes dst[i] += src[i] * gain for every i in [0, nframes).
#;
#; ABI: Microsoft x64
#;   %rcx   float *dst
#;   %rdx   float *src
#;   %r8d   unsigned int nframes
#;   %xmm3  float gain
#; Modifies: %rax, %rcx, %rdx, %r8, %xmm0-%xmm2 (%ymm0-%ymm2 on the AVX path), flags.
#; %rbx is used as a counter and restored before returning.
#;
#; The 256-bit loop uses vmovaps, which needs 32-byte aligned addresses, so it
#; is only entered when dst and src have the same offset inside a 32-byte
#; block. Otherwise every frame is handled by the scalar loop. Only address
#; bits 2-4 are inspected, i.e. the pointers are assumed to be 4-byte aligned.

.globl x86_sse_avx_mix_buffers_with_gain
.def x86_sse_avx_mix_buffers_with_gain; .scl 2; .type 32;
.endef

x86_sse_avx_mix_buffers_with_gain:

	pushq	%rbp
	movq	%rsp, %rbp
	pushq	%rbx				#; callee-saved, must be preserved

	#; nframes is a 32-bit argument; the upper half of %r8 is not defined by
	#; the calling convention, so zero-extend before using the full register.
	movl	%r8d, %r8d

	movss	%xmm3, %xmm0			#; keep gain in %xmm0 for the whole function

	testq	%r8, %r8			#; nothing to do for nframes == 0
	jz	.MBWG_END

	#; Compare the offsets of dst and src inside a 32-byte block.
	movq	%rcx, %rax
	andq	$28, %rax			#; dst offset (multiple of 4, 0..28)

	movq	%rdx, %rbx
	andq	$28, %rbx			#; src offset (multiple of 4, 0..28)

	cmpq	%rax, %rbx
	jne	.MBWG_NONALIGN			#; different offsets: scalar code only

	testq	%rbx, %rbx
	jz	.MBWG_AVX			#; already on a 32-byte boundary

	#; Pre-loop: 1-7 scalar frames until both pointers reach a 32-byte boundary.

.MBWG_PRELOOP:

	movss	(%rdx), %xmm1			#; xmm1 = *src
	mulss	%xmm0, %xmm1			#; xmm1 *= gain
	addss	(%rcx), %xmm1			#; xmm1 += *dst
	movss	%xmm1, (%rcx)			#; *dst = xmm1

	addq	$4, %rcx			#; dst++
	addq	$4, %rdx			#; src++
	decq	%r8				#; nframes--
	jz	.MBWG_END

	addq	$4, %rbx			#; offset inside the 32-byte block
	cmpq	$32, %rbx			#; reached the next boundary?
	jne	.MBWG_PRELOOP

.MBWG_AVX:

	#; %r8 is non-zero here; fewer than 8 frames cannot fill a ymm register.
	cmpq	$8, %r8
	jb	.MBWG_NONALIGN

	#; Broadcast gain to all 8 lanes of %ymm0.
	vshufps	$0x00, %ymm0, %ymm0, %ymm0	#; element 0 -> every element of each 128-bit lane
	vperm2f128	$0x00, %ymm0, %ymm0, %ymm0	#; low 128-bit lane -> both lanes

.MBWG_AVXLOOP:

	vmovaps	(%rdx), %ymm1			#; ymm1 = 8 floats from src
	vmulps	%ymm0, %ymm1, %ymm2		#; ymm2 = src * gain
	vaddps	(%rcx), %ymm2, %ymm1		#; ymm1 = dst + src * gain
	vmovaps	%ymm1, (%rcx)			#; store 8 floats to dst

	addq	$32, %rcx			#; dst += 8
	addq	$32, %rdx			#; src += 8

	subq	$8, %r8				#; nframes -= 8
	cmpq	$8, %r8
	jae	.MBWG_AVXLOOP

	#; Clear the upper ymm halves before going back to legacy SSE code.
	#; The low 128 bits (and therefore gain in %xmm0) are unaffected.
	vzeroupper

	testq	%r8, %r8
	jz	.MBWG_END

	#; 1-7 frames are left; the scalar loop below finishes them.

.MBWG_NONALIGN:

	#; Scalar path: gain is in %xmm0, %r8 is non-zero on entry.

.MBWG_NONALIGNLOOP:

	movss	(%rdx), %xmm1
	mulss	%xmm0, %xmm1
	addss	(%rcx), %xmm1
	movss	%xmm1, (%rcx)

	addq	$4, %rcx
	addq	$4, %rdx

	decq	%r8
	jnz	.MBWG_NONALIGNLOOP

.MBWG_END:

	popq	%rbx

	#; return
	leave
	ret
|
||||
|
||||
|
||||
#; void x86_sse_avx_mix_buffers_no_gain (float *dst, float *src, unsigned int nframes);
#;
#; Computes dst[i] += src[i] for every i in [0, nframes).
#;
#; ABI: Microsoft x64
#;   %rcx   float *dst
#;   %rdx   float *src
#;   %r8d   unsigned int nframes
#; Modifies: %rax, %rcx, %rdx, %r8, %xmm0 (%ymm0-%ymm1 on the AVX path), flags.
#; %rbx is used as a counter and restored before returning.
#;
#; The 256-bit loop uses vmovaps (32-byte aligned access), so it is only
#; entered when dst and src have the same offset inside a 32-byte block.
#; Only address bits 2-4 are inspected, i.e. the pointers are assumed to be
#; 4-byte aligned.

.globl x86_sse_avx_mix_buffers_no_gain
.def x86_sse_avx_mix_buffers_no_gain; .scl 2; .type 32;
.endef

x86_sse_avx_mix_buffers_no_gain:

	pushq	%rbp
	movq	%rsp, %rbp
	pushq	%rbx				#; callee-saved, must be preserved

	#; nframes is a 32-bit argument; the upper half of %r8 is not defined by
	#; the calling convention, so zero-extend before using the full register.
	movl	%r8d, %r8d

	testq	%r8, %r8			#; nothing to do for nframes == 0
	jz	.MBNG_END

	#; Compare the offsets of dst and src inside a 32-byte block.
	movq	%rcx, %rax
	andq	$28, %rax			#; dst offset (multiple of 4, 0..28)

	movq	%rdx, %rbx
	andq	$28, %rbx			#; src offset (multiple of 4, 0..28)

	cmpq	%rax, %rbx
	jne	.MBNG_NONALIGN			#; different offsets: scalar code only

	testq	%rbx, %rbx
	jz	.MBNG_AVX			#; already on a 32-byte boundary

	#; Pre-loop: 1-7 scalar frames until both pointers reach a 32-byte boundary.

.MBNG_PRELOOP:

	movss	(%rdx), %xmm0			#; xmm0 = *src
	addss	(%rcx), %xmm0			#; xmm0 += *dst
	movss	%xmm0, (%rcx)			#; *dst = xmm0

	addq	$4, %rcx			#; dst++
	addq	$4, %rdx			#; src++

	decq	%r8				#; nframes--
	jz	.MBNG_END

	addq	$4, %rbx			#; offset inside the 32-byte block
	cmpq	$32, %rbx			#; reached the next boundary?
	jne	.MBNG_PRELOOP

.MBNG_AVX:

	#; %r8 is non-zero here; fewer than 8 frames cannot fill a ymm register.
	cmpq	$8, %r8
	jb	.MBNG_NONALIGN

.MBNG_AVXLOOP:

	vmovaps	(%rdx), %ymm0			#; ymm0 = 8 floats from src
	vaddps	(%rcx), %ymm0, %ymm1		#; ymm1 = dst + src
	vmovaps	%ymm1, (%rcx)			#; store 8 floats to dst

	addq	$32, %rcx			#; dst += 8
	addq	$32, %rdx			#; src += 8

	subq	$8, %r8				#; nframes -= 8
	cmpq	$8, %r8
	jae	.MBNG_AVXLOOP

	#; Clear the upper ymm halves before going back to legacy SSE code.
	vzeroupper

	testq	%r8, %r8
	jz	.MBNG_END

	#; 1-7 frames are left; the scalar loop below finishes them.

.MBNG_NONALIGN:

	#; Scalar path, %r8 is non-zero on entry.
	movss	(%rdx), %xmm0			#; xmm0 = *src
	addss	(%rcx), %xmm0			#; xmm0 += *dst
	movss	%xmm0, (%rcx)			#; *dst = xmm0

	addq	$4, %rcx
	addq	$4, %rdx

	decq	%r8
	jnz	.MBNG_NONALIGN

.MBNG_END:

	popq	%rbx

	#; return
	leave
	ret
|
||||
|
||||
|
||||
#; void x86_sse_avx_copy_vector (float *dst, float *src, unsigned int nframes);
#;
#; Copies nframes floats from src to dst, front to back.
#;
#; ABI: Microsoft x64
#;   %rcx   float *dst
#;   %rdx   float *src
#;   %r8d   unsigned int nframes
#; Modifies: %rax, %rcx, %rdx, %r8, %xmm0 (%ymm0 on the AVX path), flags.
#; %rbx is used as a counter and restored before returning.
#;
#; The 256-bit loop uses vmovaps (32-byte aligned access), so it is only
#; entered when dst and src have the same offset inside a 32-byte block.
#; Only address bits 2-4 are inspected, i.e. the pointers are assumed to be
#; 4-byte aligned.

.globl x86_sse_avx_copy_vector
.def x86_sse_avx_copy_vector; .scl 2; .type 32;
.endef

x86_sse_avx_copy_vector:

	pushq	%rbp
	movq	%rsp, %rbp
	pushq	%rbx				#; callee-saved, must be preserved

	#; nframes is a 32-bit argument; the upper half of %r8 is not defined by
	#; the calling convention, so zero-extend before using the full register.
	movl	%r8d, %r8d

	testq	%r8, %r8			#; nothing to do for nframes == 0
	jz	.CB_END

	#; Compare the offsets of dst and src inside a 32-byte block.
	movq	%rcx, %rax
	andq	$28, %rax			#; dst offset (multiple of 4, 0..28)

	movq	%rdx, %rbx
	andq	$28, %rbx			#; src offset (multiple of 4, 0..28)

	cmpq	%rax, %rbx
	jne	.CB_NONALIGN			#; different offsets: scalar code only

	testq	%rbx, %rbx
	jz	.CB_AVX				#; already on a 32-byte boundary

	#; Pre-loop: 1-7 scalar frames until both pointers reach a 32-byte boundary.

.CB_PRELOOP:

	movss	(%rdx), %xmm0			#; xmm0 = *src
	movss	%xmm0, (%rcx)			#; *dst = xmm0

	addq	$4, %rcx			#; dst++
	addq	$4, %rdx			#; src++

	decq	%r8				#; nframes--
	jz	.CB_END

	addq	$4, %rbx			#; offset inside the 32-byte block
	cmpq	$32, %rbx			#; reached the next boundary?
	jne	.CB_PRELOOP

.CB_AVX:

	#; %r8 is non-zero here; fewer than 8 frames cannot fill a ymm register.
	cmpq	$8, %r8
	jb	.CB_NONALIGN

.CB_AVXLOOP:

	vmovaps	(%rdx), %ymm0			#; ymm0 = 8 floats from src
	vmovaps	%ymm0, (%rcx)			#; store 8 floats to dst

	addq	$32, %rcx			#; dst += 8
	addq	$32, %rdx			#; src += 8

	subq	$8, %r8				#; nframes -= 8
	cmpq	$8, %r8
	jae	.CB_AVXLOOP

	#; Clear the upper ymm halves before going back to legacy SSE code.
	vzeroupper

	testq	%r8, %r8
	jz	.CB_END

	#; 1-7 frames are left; the scalar loop below finishes them.

.CB_NONALIGN:

	#; Scalar path, %r8 is non-zero on entry.
	movss	(%rdx), %xmm0			#; xmm0 = *src
	movss	%xmm0, (%rcx)			#; *dst = xmm0

	addq	$4, %rcx
	addq	$4, %rdx

	decq	%r8
	jnz	.CB_NONALIGN

.CB_END:

	popq	%rbx

	#; return
	leave
	ret
|
||||
|
||||
|
||||
#; void x86_sse_avx_apply_gain_to_buffer (float *buf, unsigned int nframes, float gain);
#;
#; Computes buf[i] *= gain for every i in [0, nframes).
#;
#; ABI: Microsoft x64
#;   %rcx   float *buf
#;   %edx   unsigned int nframes
#;   %xmm2  float gain (third argument slot)
#; Modifies: %rax, %rcx, %rdx, %r8, %xmm0-%xmm1 (%ymm0-%ymm2 on the AVX path), flags.
#;
#; Only address bits 2-4 of buf are inspected, i.e. buf is assumed to be
#; 4-byte aligned.

.globl x86_sse_avx_apply_gain_to_buffer
.def x86_sse_avx_apply_gain_to_buffer; .scl 2; .type 32;
.endef

x86_sse_avx_apply_gain_to_buffer:

	pushq	%rbp
	movq	%rsp, %rbp

	#; nframes is a 32-bit argument; the upper half of %rdx is not defined by
	#; the calling convention, so zero-extend before using the full register.
	movl	%edx, %edx

	movss	%xmm2, %xmm0			#; keep gain in %xmm0 for the whole function

	testq	%rdx, %rdx			#; nothing to do for nframes == 0
	jz	.AG_END

	#; Offset of buf inside a 32-byte block (multiple of 4, 0..28).
	movq	%rcx, %r8
	andq	$28, %r8
	jz	.AG_AVX				#; already on a 32-byte boundary

	#; PRE-LOOP
	#; 1-7 scalar SSE iterations until buf reaches a 32-byte boundary.

.AGLP_START:

	movss	(%rcx), %xmm1			#; xmm1 = *buf
	mulss	%xmm0, %xmm1			#; xmm1 *= gain
	movss	%xmm1, (%rcx)			#; *buf = xmm1

	addq	$4, %rcx			#; buf++

	decq	%rdx				#; nframes--
	jz	.AG_END				#; ran out of frames

	addq	$4, %r8				#; offset inside the 32-byte block
	#; The AVX loop below uses vmovaps on ymm registers, which requires a
	#; 32-byte boundary; stopping at 16 would leave buf misaligned for it.
	cmpq	$32, %r8
	jne	.AGLP_START

.AG_AVX:

	#; buf (%rcx) is 32-byte aligned here and %rdx is non-zero.

	movq	%rdx, %rax
	shrq	$3, %rax			#; %rax = number of 8-frame AVX iterations
	jz	.AGPOST_START			#; fewer than 8 frames: scalar tail only

	#; Broadcast gain to all 8 lanes of %ymm0.
	vshufps	$0x00, %ymm0, %ymm0, %ymm0	#; element 0 -> every element of each 128-bit lane
	vperm2f128	$0x00, %ymm0, %ymm0, %ymm0	#; low 128-bit lane -> both lanes

.AGLP_AVX:

	vmovaps	(%rcx), %ymm1			#; ymm1 = 8 floats from buf
	vmulps	%ymm0, %ymm1, %ymm2		#; ymm2 = buf * gain
	vmovaps	%ymm2, (%rcx)			#; store 8 floats back

	addq	$32, %rcx			#; buf += 8
	subq	$8, %rdx			#; nframes -= 8

	decq	%rax
	jnz	.AGLP_AVX

	#; Clear the upper ymm halves before going back to legacy SSE code.
	#; The low 128 bits (and therefore gain in %xmm0) are unaffected.
	vzeroupper

	#; The remaining frame count (0-7) is in %rdx.
	testq	%rdx, %rdx
	jz	.AG_END

.AGPOST_START:

	#; Scalar tail, %rdx is non-zero on entry.
	movss	(%rcx), %xmm1
	mulss	%xmm0, %xmm1
	movss	%xmm1, (%rcx)

	addq	$4, %rcx			#; buf++

	decq	%rdx				#; nframes--
	jnz	.AGPOST_START

.AG_END:

	#; return
	leave
	ret

#; end proc
|
||||
|
||||
|
||||
#; float x86_sse_avx_compute_peak(float *buf, long nframes, float current);
#;
#; Returns the maximum of `current` and buf[0 .. nframes-1].
#; The samples are compared as signed values; no absolute value is taken.
#; NOTE(review): if callers expect the peak of |sample|, confirm that they
#; rectify the data themselves.
#;
#; ABI: Microsoft x64
#;   %rcx   float *buf
#;   %edx   nframes (32 bits wide, used as an unsigned count)
#;   %xmm2  float current (third argument slot)
#; Returns: %xmm0
#; Modifies: %rax, %rcx, %rdx, %r8, %xmm0-%xmm1 (%ymm0-%ymm1 on the AVX path), flags.
#;
#; Only address bits 2-4 of buf are inspected, i.e. buf is assumed to be
#; 4-byte aligned.

.globl x86_sse_avx_compute_peak
.def x86_sse_avx_compute_peak; .scl 2; .type 32;
.endef

x86_sse_avx_compute_peak:

	pushq	%rbp
	movq	%rsp, %rbp

	#; nframes is a 32-bit argument; the upper half of %rdx is not defined by
	#; the calling convention, so zero-extend before using the full register.
	movl	%edx, %edx

	#; The running maximum lives in %xmm0, which is also the return register,
	#; so nframes == 0 simply returns `current`.
	movss	%xmm2, %xmm0

	testq	%rdx, %rdx
	jz	.CP_END

	#; Offset of buf inside a 32-byte block (multiple of 4, 0..28).
	movq	%rcx, %r8
	andq	$28, %r8
	jz	.CP_AVX				#; already on a 32-byte boundary

	#; PRE-LOOP
	#; 1-7 scalar SSE iterations until buf reaches a 32-byte boundary.

.LP_START:

	movss	(%rcx), %xmm1			#; xmm1 = *buf
	maxss	%xmm1, %xmm0			#; xmm0 = max(xmm0, xmm1)

	addq	$4, %rcx			#; buf++

	decq	%rdx				#; nframes--
	jz	.CP_END				#; ran out of frames

	addq	$4, %r8				#; offset inside the 32-byte block
	cmpq	$32, %r8			#; reached the next boundary?
	jne	.LP_START

.CP_AVX:

	#; buf (%rcx) is 32-byte aligned here and %rdx is non-zero.

	movq	%rdx, %rax
	shrq	$3, %rax			#; %rax = number of 8-frame AVX iterations
	jz	.POST_START			#; fewer than 8 frames: scalar tail only

	#; Broadcast the running maximum to all 8 lanes of %ymm0.
	vshufps	$0x00, %ymm0, %ymm0, %ymm0	#; element 0 -> every element of each 128-bit lane
	vperm2f128	$0x00, %ymm0, %ymm0, %ymm0	#; low 128-bit lane -> both lanes

.LP_AVX:

	vmovaps	(%rcx), %ymm1			#; ymm1 = 8 floats from buf
	vmaxps	%ymm1, %ymm0, %ymm0		#; lane-wise running maximum

	addq	$32, %rcx			#; buf += 8
	subq	$8, %rdx			#; nframes -= 8

	decq	%rax
	jnz	.LP_AVX

	#; Horizontal reduction of the 8 lane maxima in %ymm0.
	vshufps	$0x4e, %ymm0, %ymm0, %ymm1	#; swap pairs inside each 128-bit lane (1234 => 3412)
	vmaxps	%ymm1, %ymm0, %ymm0		#; at most 4 distinct values left
	vshufps	$0xb1, %ymm0, %ymm0, %ymm1	#; swap neighbours inside each pair (1234 => 2143)
	vmaxps	%ymm1, %ymm0, %ymm0		#; at most 2 distinct values left (one per lane)
	vperm2f128	$0x01, %ymm0, %ymm0, %ymm1	#; swap the 128-bit lanes
	vmaxps	%ymm1, %ymm0, %ymm0		#; every element now holds the overall maximum

	#; Clear the upper ymm halves before going back to legacy SSE code.
	#; The low 128 bits (and therefore the maximum in %xmm0) are unaffected.
	vzeroupper

	#; The remaining frame count (0-7) is in %rdx.
	testq	%rdx, %rdx
	jz	.CP_END

.POST_START:

	#; Scalar tail, %rdx is non-zero on entry.
	movss	(%rcx), %xmm1
	maxss	%xmm1, %xmm0

	addq	$4, %rcx			#; buf++

	decq	%rdx				#; nframes--
	jnz	.POST_START

.CP_END:

	#; return value is in %xmm0

	leave
	ret

#; end proc
|
Loading…
Reference in New Issue
Block a user