[FFmpeg-devel] [PATCH] avutil/x86/float_dsp: add fma3 for scalarproduct

James Almer jamrial at gmail.com
Thu Jan 21 05:33:32 EET 2021


On 1/20/2021 5:30 PM, Paul B Mahol wrote:
> Signed-off-by: Paul B Mahol <onemda at gmail.com>
> ---
>   libavutil/x86/float_dsp.asm    | 112 +++++++++++++++++++++++++++++++++
>   libavutil/x86/float_dsp_init.c |   2 +
>   2 files changed, 114 insertions(+)
> 
> diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm
> index 517fd63638..f7497df34e 100644
> --- a/libavutil/x86/float_dsp.asm
> +++ b/libavutil/x86/float_dsp.asm
> @@ -463,6 +463,118 @@ cglobal scalarproduct_float, 3,3,2, v1, v2, offset
>   %endif
>       RET
>   
> +INIT_YMM fma3
> +cglobal scalarproduct_float, 3,5,8, v1, v2, size, len, offset
> +    xor   offsetq, offsetq
> +    xorps      m0, m0
> +    shl     sized, 2
> +    mov      lenq, sizeq
> +    cmp      lenq, 32
> +    jl   .l16
> +    cmp      lenq, 64
> +    jl   .l32
> +    cmp      lenq, 128
> +    jl   .l64
> +    and    lenq, ~127
> +    xorps    m1, m1
> +    xorps    m2, m2
> +    xorps    m3, m3
> +.loop128:
> +    movups   m4, [v1q+offsetq]
> +    movups   m5, [v1q+offsetq + 32]
> +    movups   m6, [v1q+offsetq + 64]
> +    movups   m7, [v1q+offsetq + 96]
> +    fmaddps  m0, m4, [v2q+offsetq     ], m0
> +    fmaddps  m1, m5, [v2q+offsetq + 32], m1
> +    fmaddps  m2, m6, [v2q+offsetq + 64], m2
> +    fmaddps  m3, m7, [v2q+offsetq + 96], m3

You could use mmsize for the offsets instead of hard-coded byte counts. That
will make it easier to adapt this function to eventually use AVX-512.

> +    add   offsetq, 128
> +    cmp   offsetq, lenq
> +    jl .loop128
> +    addps    m0, m1
> +    addps    m2, m3
> +    addps    m0, m2

Do only

     addps    m0, m2
     addps    m1, m3

here. There is no need to combine all four registers into m0 if you end up
jumping to .l64, since m1 is still used there as an accumulator.

> +    mov      lenq, sizeq
> +    and      lenq, 127
> +    cmp      lenq, 64
> +    jge .l64

Then add

     addps    m0, m1

after this line, since .l32 and .l16 use only m0.

> +    cmp      lenq, 32
> +    jge .l32
> +    cmp      lenq, 16
> +    jge .l16

Move the next two instructions before this line. If you jump to .l16 like
this, the vextractf128 will not be executed and the final sum will be
missing the upper 128 bits of m0.

> +    vextractf128 xmm2, m0, 1
> +    addps    xmm0, xmm2
> +    movhlps  xmm1, xmm0
> +    addps    xmm0, xmm1
> +    movss    xmm1, xmm0
> +    shufps   xmm0, xmm0, 1
> +    addss    xmm0, xmm1
> +    RET
> +.l64:
> +    and    lenq, ~63
> +    add    lenq, offsetq
> +    xorps    m1, m1
> +.loop64:
> +    movups   m4, [v1q+offsetq]
> +    movups   m5, [v1q+offsetq + 32]
> +    fmaddps  m0, m4, [v2q+offsetq], m0
> +    fmaddps  m1, m5, [v2q+offsetq + 32], m1
> +    add   offsetq, 64
> +    cmp   offsetq, lenq
> +    jl .loop64
> +    addps    m0, m1
> +    mov      lenq, sizeq
> +    and      lenq, 63
> +    cmp      lenq, 32
> +    jge .l32
> +    cmp      lenq, 16
> +    jge .l16

Ditto.

> +    vextractf128 xmm2, m0, 1
> +    addps    xmm0, xmm2
> +    movhlps  xmm1, xmm0
> +    addps    xmm0, xmm1
> +    movss    xmm1, xmm0
> +    shufps   xmm0, xmm0, 1
> +    addss    xmm0, xmm1
> +    RET
> +.l32:
> +    and    lenq, ~31
> +    add    lenq, offsetq
> +.loop32:
> +    movups   m4, [v1q+offsetq]
> +    fmaddps  m0, m4, [v2q+offsetq], m0
> +    add   offsetq, 32
> +    cmp   offsetq, lenq
> +    jl .loop32
> +    vextractf128 xmm2, m0, 1
> +    addps    xmm0, xmm2
> +    mov      lenq, sizeq
> +    and      lenq, 31
> +    cmp      lenq, 16
> +    jge .l16

You got it right here.

> +    movhlps  xmm1, xmm0
> +    addps    xmm0, xmm1
> +    movss    xmm1, xmm0
> +    shufps   xmm0, xmm0, 1
> +    addss    xmm0, xmm1
> +    RET
> +.l16:
> +    and    lenq, ~15
> +    add    lenq, offsetq
> +.loop16:
> +    movaps   xmm1, [v1q+offsetq]
> +    mulps    xmm1, [v2q+offsetq]
> +    addps    xmm0, xmm1
> +    add   offsetq, 16
> +    cmp   offsetq, lenq
> +    jl .loop16
> +    movhlps  xmm1, xmm0
> +    addps    xmm0, xmm1
> +    movss    xmm1, xmm0
> +    shufps   xmm0, xmm0, 1
> +    addss    xmm0, xmm1
> +    RET

%if ARCH_X86_64 == 0
     movss     r0m,  xm0
     fld dword r0m
%endif

Add the above before every RET in the function, since on x86-32 a float
return value must be placed in st0 rather than xmm0.

> +
>   ;-----------------------------------------------------------------------------
>   ; void ff_butterflies_float(float *src0, float *src1, int len);
>   ;-----------------------------------------------------------------------------
> diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c
> index 8826e4e2c9..67bfbe18d0 100644
> --- a/libavutil/x86/float_dsp_init.c
> +++ b/libavutil/x86/float_dsp_init.c
> @@ -76,6 +76,7 @@ void ff_vector_fmul_reverse_avx2(float *dst, const float *src0,
>                                    const float *src1, int len);
>   
>   float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
> +float ff_scalarproduct_float_fma3(const float *v1, const float *v2, int order);
>   
>   void ff_butterflies_float_sse(float *av_restrict src0, float *av_restrict src1, int len);
>   
> @@ -117,5 +118,6 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
>           fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_fma3;
>           fdsp->vector_fmul_add    = ff_vector_fmul_add_fma3;
>           fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_fma3;
> +        fdsp->scalarproduct_float = ff_scalarproduct_float_fma3;
>       }
>   }
> 



More information about the ffmpeg-devel mailing list