
Removing _mm256_zeroupper()

This is probably not needed in 2021, as compilers insert vzeroupper
automatically. See the Stack Overflow reference:

https://stackoverflow.com/a/68738289
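
A minimal sketch of why the explicit call is redundant (hypothetical
function, not part of this change): when a translation unit is built with
-mavx, modern GCC and Clang emit vzeroupper on their own at function exits
and before calls that may execute legacy-SSE code.

#include <immintrin.h>

// Hypothetical example: horizontal sum of 8 floats with AVX.
// Modern GCC/Clang insert vzeroupper before this function returns,
// so no explicit _mm256_zeroupper() is needed.
float avx_sum8(const float *p)
{
	__m256 v  = _mm256_loadu_ps(p);
	__m128 lo = _mm256_castps256_ps128(v);        // lower 128 bits
	__m128 hi = _mm256_extractf128_ps(v, 1);      // upper 128 bits
	__m128 s  = _mm_add_ps(lo, hi);               // 4 partial sums
	s = _mm_add_ps(s, _mm_movehl_ps(s, s));       // 2 partial sums
	s = _mm_add_ss(s, _mm_shuffle_ps(s, s, 0x55));
	return _mm_cvtss_f32(s);
}

Building this with gcc -O2 -mavx and inspecting the assembly should show a
compiler-generated vzeroupper just before ret.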
Ayan Shafqat 2022-01-02 16:35:34 -05:00 committed by Robin Gareus
parent 5fc3ae79ae
commit 25fac546d5
Signed by: rgareus
GPG Key ID: A090BCE02CF57F04


@@ -157,14 +157,6 @@ x86_sse_avx_compute_peak(const float *src, uint32_t nframes, float current)
vmax = avx_getmax_ps(vmax);
// zero the upper 128 bits of the 256-bit YMM register to avoid penalties
// when using non-AVX instructions.
// _mm256_zeroupper();
// This is probably not needed in 2021, as compilers insert vzeroupper
// automatically. See the Stack Overflow reference:
// https://stackoverflow.com/questions/68736527/do-i-need-to-use-mm256-zeroupper-in-2021
#if defined(__GNUC__) && (__GNUC__ < 5)
return *((float *)&vmax);
#elif defined(__GNUC__) && (__GNUC__ < 8)
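
The surviving preprocessor branches above appear to work around older GCC
releases where the scalar-extraction intrinsics were missing or unreliable.
As an aside, a hypothetical helper (not from this commit) shows how the same
extraction can be written on any compiler with AVX support:

#include <immintrin.h>

// Hypothetical helper: read the lowest float of a 256-bit register
// through the low 128-bit lane, without a pointer cast.
static inline float ymm_first_lane(__m256 v)
{
	return _mm_cvtss_f32(_mm256_castps256_ps128(v));
}

Both _mm256_castps256_ps128 and _mm_cvtss_f32 generate no instructions here;
the value already sits in the low lane of the register.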
@@ -255,13 +247,6 @@ x86_sse_avx_find_peaks(const float *src, uint32_t nframes, float *minf, float *m
vmin = avx_getmin_ps(vmin);
vmax = avx_getmax_ps(vmax);
// There's a penalty going from AVX mode to SSE mode. This can
// be avoided by assuring the CPU that the rest of the routine is no
// longer interested in the upper portion of the YMM register.
// zero the upper 128 bits of the 256-bit YMM register to avoid penalties when using non-AVX instructions
_mm256_zeroupper();
_mm_store_ss(minf, _mm256_castps256_ps128(vmin));
_mm_store_ss(maxf, _mm256_castps256_ps128(vmax));
}
@@ -318,13 +303,6 @@ x86_sse_avx_apply_gain_to_buffer(float *dst, uint32_t nframes, float gain)
frames -= 8;
}
// There's a penalty going from AVX mode to SSE mode. This can
// be avoided by assuring the CPU that the rest of the routine is no
// longer interested in the upper portion of the YMM register.
_mm256_zeroupper(); // zeros the upper portion of the YMM register
// Process the remaining samples
do {
__m128 g0 = _mm256_castps256_ps128(vgain);
@@ -486,13 +464,6 @@ x86_sse_avx_mix_buffers_with_gain_unaligned(float *dst, const float *src, uint32
nframes -= 8;
}
// There's a penalty going from AVX mode to SSE mode. This can
// be avoided by assuring the CPU that the rest of the routine is no
// longer interested in the upper portion of the YMM register.
_mm256_zeroupper(); // zeros the upper portion of the YMM register
// Process the remaining samples
do {
__m128 g0 = _mm_set_ss(gain);
@@ -586,13 +557,6 @@ x86_sse_avx_mix_buffers_with_gain_aligned(float *dst, const float *src, uint32_t
nframes -= 8;
}
// There's a penalty going from AVX mode to SSE mode. This can
// be avoided by assuring the CPU that the rest of the routine is no
// longer interested in the upper portion of the YMM register.
_mm256_zeroupper(); // zeros the upper portion of the YMM register
// Process the remaining samples, one sample at a time.
do {
__m128 g0 = _mm256_castps256_ps128(vgain); // use the same register
@@ -676,12 +640,6 @@ x86_sse_avx_mix_buffers_no_gain_unaligned(float *dst, const float *src, uint32_t
nframes -= 8;
}
// There's a penalty going from AVX mode to SSE mode. This can
// be avoided by assuring the CPU that the rest of the routine is no
// longer interested in the upper portion of the YMM register.
_mm256_zeroupper(); // zeros the upper portion of the YMM register
// Process the remaining samples
do {
while (nframes > 0) {
@@ -807,12 +765,6 @@ x86_sse_avx_mix_buffers_no_gain_aligned(float *dst, const float *src, uint32_t n
nframes -= 8;
}
// There's a penalty going from AVX mode to SSE mode. This can
// be avoided by assuring the CPU that the rest of the routine is no
// longer interested in the upper portion of the YMM register.
_mm256_zeroupper(); // zeros the upper portion of the YMM register
// Process the remaining samples
do {
while (nframes > 0) {
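
After the change, each remainder loop still reuses the low 128 bits of the
YMM gain register via _mm256_castps256_ps128, and the AVX-to-SSE transition
is left for the compiler to manage. A minimal sketch of that pattern
(hypothetical function name; assumes the file is compiled with -mavx, so the
tail uses VEX-encoded instructions and no transition penalty arises):

#include <immintrin.h>
#include <stdint.h>

// Hypothetical sketch of the post-commit pattern: AVX main loop,
// scalar tail reusing the low lane, and no explicit zeroupper --
// the compiler emits vzeroupper at the function's exit.
static void apply_gain_sketch(float *dst, uint32_t nframes, float gain)
{
	__m256 vgain = _mm256_set1_ps(gain);
	while (nframes >= 8) {
		_mm256_storeu_ps(dst, _mm256_mul_ps(_mm256_loadu_ps(dst), vgain));
		dst += 8;
		nframes -= 8;
	}
	__m128 g0 = _mm256_castps256_ps128(vgain); // low 128 bits of the gain
	while (nframes > 0) {
		_mm_store_ss(dst, _mm_mul_ss(_mm_load_ss(dst), g0));
		++dst;
		--nframes;
	}
}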