[FFmpeg-devel] [PATCH 7/8] lavc/flacenc: add AVX2 version of the 32-bit LPC encoder

Mon Nov 27 01:42:00 EET 2017

On 11/26/2017 8:13 PM, Rostislav Pehlivanov wrote:
> On 26 November 2017 at 22:51, James Darnley <james.darnley at gmail.com> wrote:
> 
>> When compared to the SSE4.2 version, runtime is reduced by 1 to 26%.  The
>> function itself is around 2 times faster.
>> ---
>>  libavcodec/x86/flac_dsp_gpl.asm | 56 +++++++++++++++++++++++++++++++----------
>>  libavcodec/x86/flacdsp_init.c   |  5 +++-
>>  2 files changed, 47 insertions(+), 14 deletions(-)
>>
>> diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
>> index 91989ce560..749e66dec8 100644
>> --- a/libavcodec/x86/flac_dsp_gpl.asm
>> +++ b/libavcodec/x86/flac_dsp_gpl.asm
>> @@ -22,11 +22,11 @@
>>
>>  %include "libavutil/x86/x86util.asm"
>>
>> -SECTION_RODATA
>> +SECTION_RODATA 32
>>
>> -pd_0_int_min: times  2 dd 0, -2147483648
>> -pq_int_min:   times  2 dq -2147483648
>> -pq_int_max:   times  2 dq  2147483647
>> +pd_0_int_min: times  4 dd 0, -2147483648
>> +pq_int_min:   times  4 dq -2147483648
>> +pq_int_max:   times  4 dq  2147483647
>>
>>  SECTION .text
>>
>> @@ -123,7 +123,10 @@ RET
>>  %endmacro
>>
>>  %macro PMINSQ 3
>> -    pcmpgtq %3, %2, %1
>> +    mova    %3, %2
>> +    ; We cannot use the 3-operand format because the memory location cannot be
>> +    ; the second operand, only the third.
>> +    pcmpgtq %3, %1
>>
> 
> I don't get it, how did it work before then?
> 
> 
>>      pand    %1, %3
>>      pandn   %3, %2
>>      por     %1, %3
>> @@ -177,11 +180,11 @@ lea    resq,   [resq+orderq*4]
>>  lea    smpq,   [smpq+orderq*4]
>>  lea    coefsq, [coefsq+orderq*4]
>>  sub    length,  orderd
>> -movd   m3,      r5m
>> +movd   xm3,     r5m
>>  neg    orderq
>>
>>  movu   m4,     [pd_0_int_min] ; load 1 bit
>> -psrad  m4,      m3            ; turn that into shift+1 bits
>> +psrad  m4,      xm3           ; turn that into shift+1 bits
>>  pslld  m4,      1             ; reduce that
>>  mova  [rsp],    m4            ; save sign extend mask
>>
>> @@ -197,8 +200,20 @@ mova  [rsp],    m4            ; save sign extend mask
>>      xor  negj, negj
>>
>>      .looporder1:
>> +%if cpuflag(avx)
>> +        vbroadcastss m2, [coefsq+posj*4]
>> +%else
>>          movd   m2,  [coefsq+posj*4] ; c = coefs[j]
>>          SPLATD m2
>> +%endif
>> +%if cpuflag(avx)
>> +        vpmuldq  m1, m2, [smpq+negj*4-4]
>> +        vpmuldq  m5, m2, [smpq+negj*4-4+mmsize]
>> +        vpmuldq  m7, m2, [smpq+negj*4-4+mmsize*2]
>> +        vpaddq   m0, m1
>> +        vpaddq   m4, m5
>> +        vpaddq   m6, m7
>>
> 
> Why force VEX encoding for these instructions, on avx no less?

It's avx2 and using ymm regs, not avx.