[FFmpeg-devel] [PATCH] lpc: rewrite lpc_compute_autocorr in external asm
James Almer
jamrial at gmail.com
Sun May 26 03:39:24 EEST 2024
On 5/25/2024 5:57 PM, Lynne via ffmpeg-devel wrote:
> The inline asm function had issues running under checkasm.
> So I came to finish what I started, and wrote the last part
> of LPC computation in assembly.
>
> autocorr_10_c: 135525.8
> autocorr_10_sse2: 50729.8
> autocorr_10_fma3: 19007.8
> autocorr_30_c: 390100.8
> autocorr_30_sse2: 142478.8
> autocorr_30_fma3: 50559.8
> autocorr_32_c: 407058.3
> autocorr_32_sse2: 151633.3
> autocorr_32_fma3: 50517.3
> ---
> libavcodec/x86/lpc.asm | 91 +++++++++++++++++++++++++++++++++++++++
> libavcodec/x86/lpc_init.c | 87 ++++---------------------------------
> 2 files changed, 100 insertions(+), 78 deletions(-)
>
> diff --git a/libavcodec/x86/lpc.asm b/libavcodec/x86/lpc.asm
> index a585c17ef5..790841b7f4 100644
> --- a/libavcodec/x86/lpc.asm
> +++ b/libavcodec/x86/lpc.asm
> @@ -32,6 +32,8 @@ dec_tab_sse2: times 2 dq -2.0
> dec_tab_scalar: times 2 dq -1.0
> seq_tab_sse2: dq 1.0, 0.0
>
> +autoc_init_tab: times 4 dq 1.0
There's one_tab already, so no need to add this.
> +
> SECTION .text
>
> %macro APPLY_WELCH_FN 0
> @@ -261,3 +263,92 @@ APPLY_WELCH_FN
> INIT_YMM avx2
> APPLY_WELCH_FN
> %endif
> +
> +%macro COMPUTE_AUTOCORR_FN 0
> +cglobal lpc_compute_autocorr, 4, 7, 8, data, len, lag, autoc, lag_p, data_l, len_p
> +
> + shl lagd, 3
> + shl lenq, 3
> + xor lag_pq, lag_pq
> +
> +.lag_l:
> + movaps m8, [autoc_init_tab]
> +
> + mov len_pq, lag_pq
> +
> + lea data_lq, [lag_pq + mmsize - 8]
> + neg data_lq ; -j - mmsize
> + add data_lq, dataq ; data[-j - mmsize]
> +.len_l:
> + ; We waste the upper value here on SSE2,
> + ; but we use it on AVX.
> + movupd xm0, [dataq + len_pq] ; data[i]
> + movupd m1, [data_lq + len_pq] ; data[i - j]
> +
> +%if cpuflag(avx)
> + vbroadcastsd m0, xm0
> + vperm2f128 m1, m1, m1, 0x01
You can do

    vpermpd m1, [data_lq + len_pq], q0123

which saves you the movupd + vperm2f128 + shufpd for fma3.
> +%endif
> +
> + shufpd m0, m0, m0, 1100b
> + shufpd m1, m1, m1, 0101b
> +
> +%if cpuflag(fma3)
> + fmaddpd m8, m0, m1, m8 ; sum += data[i]*data[i-j]
So I found out this instruction is what's killing performance for me. If
I remove it and use mulpd + addpd like in sse2, I see the same boost
you got.
Why? I have no idea. Other functions using this same instruction, like
ff_vector_dmac_scalar_fma3, don't see any performance hit.
> +%else
> + mulpd m0, m1
> + addpd m8, m0 ; sum += data[i]*data[i-j]
> +%endif
> +
> + add len_pq, 8
> + cmp len_pq, lenq
> + jl .len_l
> +
> + movups [autocq + lag_pq], m8 ; autoc[j] = sum
> + add lag_pq, mmsize
> + cmp lag_pq, lagq
> + jl .lag_l
> +
> + ; The tail computation is guaranteed never to happen
> + ; as long as we're doing multiples of 4, rather than 2.
> + ; It is trivial to convert this to avx if ever needed.
> +%if !cpuflag(avx)
> + jg .end
> + ; If lag_p == lag fallthrough
> +
> +.tail:
> + movaps xm2, [autoc_init_tab]
> +
> + mov len_pq, lag_pq
> + sub len_pq, mmsize
> +
> + lea data_lq, [lag_pq]
> + neg data_lq ; -j
> + add data_lq, dataq ; data[-j]
> +
> +.tail_l:
> + movupd xm0, [dataq + len_pq]
> + movupd xm1, [data_lq + len_pq]
> +
> + mulpd xm0, xm1
> + addpd xm2, xm0 ; sum += data[i]*data[i-j]
> +
> + add len_pq, mmsize
> + cmp len_pq, lenq
> + jl .tail_l
> +
> + shufpd xm1, xm2, xm2, 01b
> + addpd xm2, xm1
> +
> + movhpd [autocq + lag_pq], xm2
> +%endif
> +
> +.end:
> + RET
> +
> +%endmacro
> +
> +INIT_XMM sse2
> +COMPUTE_AUTOCORR_FN
> +INIT_YMM fma3
> +COMPUTE_AUTOCORR_FN
> diff --git a/libavcodec/x86/lpc_init.c b/libavcodec/x86/lpc_init.c
> index f2fca53799..96469fae40 100644
> --- a/libavcodec/x86/lpc_init.c
> +++ b/libavcodec/x86/lpc_init.c
> @@ -28,89 +28,20 @@ void ff_lpc_apply_welch_window_sse2(const int32_t *data, ptrdiff_t len,
> double *w_data);
> void ff_lpc_apply_welch_window_avx2(const int32_t *data, ptrdiff_t len,
> double *w_data);
> -
> -DECLARE_ASM_CONST(16, double, pd_1)[2] = { 1.0, 1.0 };
> -
> -#if HAVE_SSE2_INLINE
> -
> -static void lpc_compute_autocorr_sse2(const double *data, ptrdiff_t len, int lag,
> - double *autoc)
> -{
> - int j;
> -
> - if((x86_reg)data & 15)
> - data++;
> -
> - for(j=0; j<lag; j+=2){
> - x86_reg i = -len*sizeof(double);
> - if(j == lag-2) {
> - __asm__ volatile(
> - "movsd "MANGLE(pd_1)", %%xmm0 \n\t"
> - "movsd "MANGLE(pd_1)", %%xmm1 \n\t"
> - "movsd "MANGLE(pd_1)", %%xmm2 \n\t"
> - "1: \n\t"
> - "movapd (%2,%0), %%xmm3 \n\t"
> - "movupd -8(%3,%0), %%xmm4 \n\t"
> - "movapd (%3,%0), %%xmm5 \n\t"
> - "mulpd %%xmm3, %%xmm4 \n\t"
> - "mulpd %%xmm3, %%xmm5 \n\t"
> - "mulpd -16(%3,%0), %%xmm3 \n\t"
> - "addpd %%xmm4, %%xmm1 \n\t"
> - "addpd %%xmm5, %%xmm0 \n\t"
> - "addpd %%xmm3, %%xmm2 \n\t"
> - "add $16, %0 \n\t"
> - "jl 1b \n\t"
> - "movhlps %%xmm0, %%xmm3 \n\t"
> - "movhlps %%xmm1, %%xmm4 \n\t"
> - "movhlps %%xmm2, %%xmm5 \n\t"
> - "addsd %%xmm3, %%xmm0 \n\t"
> - "addsd %%xmm4, %%xmm1 \n\t"
> - "addsd %%xmm5, %%xmm2 \n\t"
> - "movsd %%xmm0, (%1) \n\t"
> - "movsd %%xmm1, 8(%1) \n\t"
> - "movsd %%xmm2, 16(%1) \n\t"
> - :"+&r"(i)
> - :"r"(autoc+j), "r"(data+len), "r"(data+len-j)
> - NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
> - :"memory"
> - );
> - } else {
> - __asm__ volatile(
> - "movsd "MANGLE(pd_1)", %%xmm0 \n\t"
> - "movsd "MANGLE(pd_1)", %%xmm1 \n\t"
> - "1: \n\t"
> - "movapd (%3,%0), %%xmm3 \n\t"
> - "movupd -8(%4,%0), %%xmm4 \n\t"
> - "mulpd %%xmm3, %%xmm4 \n\t"
> - "mulpd (%4,%0), %%xmm3 \n\t"
> - "addpd %%xmm4, %%xmm1 \n\t"
> - "addpd %%xmm3, %%xmm0 \n\t"
> - "add $16, %0 \n\t"
> - "jl 1b \n\t"
> - "movhlps %%xmm0, %%xmm3 \n\t"
> - "movhlps %%xmm1, %%xmm4 \n\t"
> - "addsd %%xmm3, %%xmm0 \n\t"
> - "addsd %%xmm4, %%xmm1 \n\t"
> - "movsd %%xmm0, %1 \n\t"
> - "movsd %%xmm1, %2 \n\t"
> - :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
> - :"r"(data+len), "r"(data+len-j)
> - NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
> - );
> - }
> - }
> -}
> -
> -#endif /* HAVE_SSE2_INLINE */
> +void ff_lpc_compute_autocorr_sse2(const double *data, ptrdiff_t len, int lag,
> + double *autoc);
> +void ff_lpc_compute_autocorr_fma3(const double *data, ptrdiff_t len, int lag,
> + double *autoc);
>
> av_cold void ff_lpc_init_x86(LPCContext *c)
> {
> int cpu_flags = av_get_cpu_flags();
>
> -#if HAVE_SSE2_INLINE
> - if (INLINE_SSE2_SLOW(cpu_flags))
> - c->lpc_compute_autocorr = lpc_compute_autocorr_sse2;
> -#endif
> + if (EXTERNAL_SSE2(cpu_flags))
> + c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2;
> +
> + if (EXTERNAL_FMA3(cpu_flags))
> + c->lpc_compute_autocorr = ff_lpc_compute_autocorr_fma3;
>
> if (EXTERNAL_SSE2(cpu_flags))
> c->lpc_apply_welch_window = ff_lpc_apply_welch_window_sse2;
More information about the ffmpeg-devel
mailing list