[FFmpeg-cvslog] opus_pvq_search: only use rsqrtps approximation on CPUs with avx
Ivan Kalvachev
ikalvachev at gmail.com
Fri Aug 18 20:47:19 EEST 2017
Please revert this.
Also, in future, please send a patch to the maillist so I could take a
look on it.
On 8/18/17, Rostislav Pehlivanov <git at videolan.org> wrote:
> ffmpeg | branch: master | Rostislav Pehlivanov <atomnuker at gmail.com> | Fri
> Aug 18 17:28:40 2017 +0100| [f386dd70acdc81d42d6bcb885d2889634cdf45b7] |
> committer: Rostislav Pehlivanov
>
> opus_pvq_search: only use rsqrtps approximation on CPUs with avx
You got the commit message in reverse.
"rsqrtps" is the instruction that uses approximation and is faster.
AVX CPUs are "allegedly" the ones that have faster divps,
so the speed hit is not so bad.
You want to use divps on new cpu and rsqrtps on old ones,
not the reverse.
> Makes the search produce idential results with the C version.
>
> Signed-off-by: Rostislav Pehlivanov <atomnuker at gmail.com>
>
>> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=f386dd70acdc81d42d6bcb885d2889634cdf45b7
> ---
>
> libavcodec/x86/opus_pvq_search.asm | 18 ++++++------------
> 1 file changed, 6 insertions(+), 12 deletions(-)
>
> diff --git a/libavcodec/x86/opus_pvq_search.asm
> b/libavcodec/x86/opus_pvq_search.asm
> index beb6cbcc9b..2f4864c95c 100644
> --- a/libavcodec/x86/opus_pvq_search.asm
> +++ b/libavcodec/x86/opus_pvq_search.asm
> @@ -28,12 +28,6 @@
> ALIGNMODE p6
> %endif
>
> -; Use float op that give half precision but execute for around 3 cycles.
> -; On Skylake & Ryzen the division is much faster (around 11c/3),
> -; that makes the full precision code about 2% slower.
> -; Opus also does use rsqrt approximation in their intrinsics code.
> -%define USE_APPROXIMATION 1
> -
> SECTION_RODATA 64
>
> const_float_abs_mask: times 8 dd 0x7fffffff
> @@ -102,17 +96,17 @@ align 16
> movaps m4, [tmpY + r4] ; y[i]
> movaps m5, [tmpX + r4] ; X[i]
>
> - %if USE_APPROXIMATION == 1
> +%if !cpuflag(avx) ; for crappy ancient CPUs that have slow packed divs but
These changes are not necessary and
they do obfuscate the code.
The define name also documents what the different code path does.
Using just "not AVX" doesn't explain why
we use one code path and not the other.
> fast 1/sqrt
> xorps m0, m0
> cmpps m0, m0, m5, 4 ; m0 = (X[i] != 0.0)
> - %endif
> +%endif
>
> addps m4, m6 ; m4 = Syy_new = y[i] + Syy_norm
> addps m5, m7 ; m5 = Sxy_new = X[i] + Sxy_norm
>
> - %if USE_APPROXIMATION == 1
> +%if !cpuflag(avx)
> andps m5, m0 ; if(X[i] == 0) Sxy_new = 0; Prevent
> aproximation error from setting pulses in array padding.
> - %endif
> +%endif
>
> %else
> movaps m5, [tmpY + r4] ; m5 = y[i]
> @@ -125,7 +119,7 @@ align 16
> andps m5, m0 ; (0<y)?m5:0
> %endif
>
> -%if USE_APPROXIMATION == 1
> +%if !cpuflag(avx)
> rsqrtps m4, m4
> mulps m5, m4 ; m5 = p = Sxy_new*approx(1/sqrt(Syy) )
> %else
> @@ -261,7 +255,7 @@ align 16
> jz %%zero_input ; if (Sx==0) goto zero_input
>
> cvtsi2ss xm0, dword Kd ; m0 = K
> -%if USE_APPROXIMATION == 1
> +%if !cpuflag(avx)
> rcpss xm1, xm1 ; m1 = approx(1/Sx)
> mulss xm0, xm1 ; m0 = K*(1/Sx)
> %else
>
The whole patch could be replaced with one liner:
@@ -391,5 +391,7 @@ PVQ_FAST_SEARCH
INIT_XMM sse4
PVQ_FAST_SEARCH
+%define USE_APPROXIMATION 0
+
INIT_XMM avx
PVQ_FAST_SEARCH
Best Regards
Ivan Kalvachev
More information about the ffmpeg-cvslog
mailing list