[FFmpeg-devel] [PATCH v2] lpc: rewrite lpc_compute_autocorr in external asm
James Almer
jamrial at gmail.com
Sun May 26 05:16:40 EEST 2024
On 5/25/2024 10:51 PM, James Almer wrote:
> On 5/25/2024 10:42 PM, Lynne via ffmpeg-devel wrote:
>> The inline asm function had issues running under checkasm.
>> So I came to finish what I started, and wrote the last part
>> of LPC computation in assembly.
>> ---
>> libavcodec/x86/lpc.asm | 91 +++++++++++++++++++++++++++++++++++++++
>> libavcodec/x86/lpc_init.c | 87 ++++---------------------------------
>> 2 files changed, 100 insertions(+), 78 deletions(-)
>>
>> diff --git a/libavcodec/x86/lpc.asm b/libavcodec/x86/lpc.asm
>> index a585c17ef5..9c359ae480 100644
>> --- a/libavcodec/x86/lpc.asm
>> +++ b/libavcodec/x86/lpc.asm
>> @@ -261,3 +261,94 @@ APPLY_WELCH_FN
>> INIT_YMM avx2
>> APPLY_WELCH_FN
>> %endif
>> +
>> +%macro COMPUTE_AUTOCORR_FN 0
>> +cglobal lpc_compute_autocorr, 4, 7, 3, data, len, lag, autoc, lag_p,
>> data_l, len_p
>> + shl lagd, 3
>> + shl lenq, 3
>> + xor lag_pq, lag_pq
>> +
>> +.lag_l:
>> + movaps m2, [one_tab]
>
> Super nit: movapd
>
>> +
>> + mov len_pq, lag_pq
>> +
>> + lea data_lq, [lag_pq + mmsize - 8]
>> + neg data_lq ; -j - mmsize
>> + add data_lq, dataq ; data[-j - mmsize]
>> +.len_l:
>> +
>> +%if mmsize == 32
>> + vbroadcastsd m0, [dataq + len_pq]
>> + vpermpd m1, [data_lq + len_pq], q0123
>> +%else
>> + movupd m1, [data_lq + len_pq] ; data[i - j]
>> + movsd xm0, [dataq + len_pq] ; data[i]
>> + shufpd m1, m1, m1, 01b
I just realized you're shuffling the values inside the .len_l loop when
you could do it right before you store the sum.
Something like:
[...]
.len_l:
%if mmsize == 16
movsd m0, [dataq + len_pq] ; data[i]
shufpd m0, m0, m0, 0
movupd m1, [data_lq + len_pq] ; data[i - j]
mulpd m0, m1
%else
vbroadcastsd m0, [dataq + len_pq]
mulpd m0, [data_lq + len_pq] ; data[i - j]
%endif
addpd m2, m0 ; sum += data[i]*data[i-j]
add len_pq, 8
cmp len_pq, lenq
jl .len_l
shufpd m2, m2, m2, 0101b
%if mmsize == 32
vextractf128 [autocq + lag_pq], m2, 1
movupd [autocq + lag_pq + 16], xm2 ; autoc[j] = sum
%else
movupd [autocq + lag_pq], m2 ; autoc[j] = sum
%endif
add lag_pq, mmsize
cmp lag_pq, lagq
jl .lag_l
[...]
And by using vextractf128 here instead of vpermpd you can keep the
function as avx instead of avx2, unless a vpermpd + single 256-bit store
is faster than shufpd + two stores (vextractf128 + movu 128-bit), which I
assume it won't be because of cross-lane shuffling.
>> +%endif
>> +
>> + shufpd m0, m0, m0, 1100b
>
> This is not needed for mmsize == 32. The broadcast set every qword to
> the value movsd loaded.
>
>> +
>> + ; fmadd actually hurts performance in this case due to
>> + ; the earlier loads + shuffles
>> + mulpd m0, m1
>> + addpd m2, m0 ; sum += data[i]*data[i-j]
>> +
>> + add len_pq, 8
>> + cmp len_pq, lenq
>> + jl .len_l
>> +
>> + movupd [autocq + lag_pq], m2 ; autoc[j] = sum
>> + add lag_pq, mmsize
>> + cmp lag_pq, lagq
>> + jl .lag_l
>> +
>> + ; The tail computation is guaranteed never to happen
>> + ; as long as we're doing multiples of 4, rather than 2.
>> +%if mmsize != 32
>> + jg .end
>> + ; If lag_p == lag fallthrough
>> +
>> +.tail:
>> + movaps m2, [one_tab]
>> +
>> + mov len_pq, lag_pq
>> + sub len_pq, mmsize
>> +
>> + lea data_lq, [lag_pq]
>> + neg data_lq ; -j
>> + add data_lq, dataq ; data[-j]
>> +
>> +.tail_l:
>> + movupd m0, [dataq + len_pq]
>> + movupd m1, [data_lq + len_pq]
>> +
>> + mulpd m0, m1
>> + addpd m2, m0 ; sum += data[i]*data[i-j]
>> +
>> + add len_pq, mmsize
>> + cmp len_pq, lenq
>> + jl .tail_l
>> +
>> + shufpd m1, m2, m2, 01b
>> + addpd m2, m1
>> +
>> + ; Leave this here just in case its ever needed
>> +%if mmsize == 32
>> + vperm2f128 m1, m2, m2, 0x01
>> + addpd xm2, xm1
>> + movupd [autocq + lag_pq], xm2
>> +%else
>> + movhpd [autocq + lag_pq], xm2
>> +%endif
>> +
>> +.end:
>> +%endif
>> +
>> + RET
>> +%endmacro
>> +
>> +INIT_XMM sse2
>> +COMPUTE_AUTOCORR_FN
>> +INIT_YMM avx
>
> vpermpd is avx2, so it needs to be that.
>
>> +COMPUTE_AUTOCORR_FN
>> diff --git a/libavcodec/x86/lpc_init.c b/libavcodec/x86/lpc_init.c
>> index f2fca53799..bb174be53e 100644
>> --- a/libavcodec/x86/lpc_init.c
>> +++ b/libavcodec/x86/lpc_init.c
>> @@ -28,89 +28,20 @@ void ff_lpc_apply_welch_window_sse2(const int32_t
>> *data, ptrdiff_t len,
>> double *w_data);
>> void ff_lpc_apply_welch_window_avx2(const int32_t *data, ptrdiff_t len,
>> double *w_data);
>> -
>> -DECLARE_ASM_CONST(16, double, pd_1)[2] = { 1.0, 1.0 };
>> -
>> -#if HAVE_SSE2_INLINE
>> -
>> -static void lpc_compute_autocorr_sse2(const double *data, ptrdiff_t
>> len, int lag,
>> - double *autoc)
>> -{
>> - int j;
>> -
>> - if((x86_reg)data & 15)
>> - data++;
>> -
>> - for(j=0; j<lag; j+=2){
>> - x86_reg i = -len*sizeof(double);
>> - if(j == lag-2) {
>> - __asm__ volatile(
>> - "movsd "MANGLE(pd_1)", %%xmm0 \n\t"
>> - "movsd "MANGLE(pd_1)", %%xmm1 \n\t"
>> - "movsd "MANGLE(pd_1)", %%xmm2 \n\t"
>> - "1: \n\t"
>> - "movapd (%2,%0), %%xmm3 \n\t"
>> - "movupd -8(%3,%0), %%xmm4 \n\t"
>> - "movapd (%3,%0), %%xmm5 \n\t"
>> - "mulpd %%xmm3, %%xmm4 \n\t"
>> - "mulpd %%xmm3, %%xmm5 \n\t"
>> - "mulpd -16(%3,%0), %%xmm3 \n\t"
>> - "addpd %%xmm4, %%xmm1 \n\t"
>> - "addpd %%xmm5, %%xmm0 \n\t"
>> - "addpd %%xmm3, %%xmm2 \n\t"
>> - "add $16, %0 \n\t"
>> - "jl 1b \n\t"
>> - "movhlps %%xmm0, %%xmm3 \n\t"
>> - "movhlps %%xmm1, %%xmm4 \n\t"
>> - "movhlps %%xmm2, %%xmm5 \n\t"
>> - "addsd %%xmm3, %%xmm0 \n\t"
>> - "addsd %%xmm4, %%xmm1 \n\t"
>> - "addsd %%xmm5, %%xmm2 \n\t"
>> - "movsd %%xmm0, (%1) \n\t"
>> - "movsd %%xmm1, 8(%1) \n\t"
>> - "movsd %%xmm2, 16(%1) \n\t"
>> - :"+&r"(i)
>> - :"r"(autoc+j), "r"(data+len), "r"(data+len-j)
>> - NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
>> - :"memory"
>> - );
>> - } else {
>> - __asm__ volatile(
>> - "movsd "MANGLE(pd_1)", %%xmm0 \n\t"
>> - "movsd "MANGLE(pd_1)", %%xmm1 \n\t"
>> - "1: \n\t"
>> - "movapd (%3,%0), %%xmm3 \n\t"
>> - "movupd -8(%4,%0), %%xmm4 \n\t"
>> - "mulpd %%xmm3, %%xmm4 \n\t"
>> - "mulpd (%4,%0), %%xmm3 \n\t"
>> - "addpd %%xmm4, %%xmm1 \n\t"
>> - "addpd %%xmm3, %%xmm0 \n\t"
>> - "add $16, %0 \n\t"
>> - "jl 1b \n\t"
>> - "movhlps %%xmm0, %%xmm3 \n\t"
>> - "movhlps %%xmm1, %%xmm4 \n\t"
>> - "addsd %%xmm3, %%xmm0 \n\t"
>> - "addsd %%xmm4, %%xmm1 \n\t"
>> - "movsd %%xmm0, %1 \n\t"
>> - "movsd %%xmm1, %2 \n\t"
>> - :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
>> - :"r"(data+len), "r"(data+len-j)
>> - NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
>> - );
>> - }
>> - }
>> -}
>> -
>> -#endif /* HAVE_SSE2_INLINE */
>> +void ff_lpc_compute_autocorr_sse2(const double *data, ptrdiff_t len,
>> int lag,
>> + double *autoc);
>> +void ff_lpc_compute_autocorr_avx(const double *data, ptrdiff_t len,
>> int lag,
>> + double *autoc);
>> av_cold void ff_lpc_init_x86(LPCContext *c)
>> {
>> int cpu_flags = av_get_cpu_flags();
>> -#if HAVE_SSE2_INLINE
>> - if (INLINE_SSE2_SLOW(cpu_flags))
>> - c->lpc_compute_autocorr = lpc_compute_autocorr_sse2;
>> -#endif
>> + if (EXTERNAL_SSE2(cpu_flags))
>> + c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2;
>
> Place this with ff_lpc_apply_welch_window_sse2 below.
>
>> +
>> + if (EXTERNAL_AVX_FAST(cpu_flags))
>> + c->lpc_compute_autocorr = ff_lpc_compute_autocorr_avx;
>> if (EXTERNAL_SSE2(cpu_flags))
>> c->lpc_apply_welch_window = ff_lpc_apply_welch_window_sse2;
More information about the ffmpeg-devel
mailing list