[FFmpeg-devel] [PATCH 2/2] avfilter/x86/vf_gblur: add postscale SIMD

James Almer jamrial at gmail.com
Sun Feb 14 03:36:02 EET 2021


On 2/13/2021 8:10 AM, Paul B Mahol wrote:
> Signed-off-by: Paul B Mahol <onemda at gmail.com>
> ---
>   libavfilter/x86/vf_gblur.asm    | 46 +++++++++++++++++++++++++++++++++
>   libavfilter/x86/vf_gblur_init.c | 11 ++++++--
>   2 files changed, 55 insertions(+), 2 deletions(-)
> 
> diff --git a/libavfilter/x86/vf_gblur.asm b/libavfilter/x86/vf_gblur.asm
> index a25b1659f5..8fea6d2a61 100644
> --- a/libavfilter/x86/vf_gblur.asm
> +++ b/libavfilter/x86/vf_gblur.asm
> @@ -183,3 +183,49 @@ HORIZ_SLICE
>   INIT_XMM avx2
>   HORIZ_SLICE
>   %endif
> +
> +%macro POSTSCALE_SLICE 0
> +%if UNIX64
> +cglobal postscale_slice, 2, 6, 4, ptr, length, postscale, min, max, x

On UNIX64 the float arguments are passed in xmm registers, so only the 
GPRs need names:

cglobal postscale_slice, 2, 3, 4, ptr, length, x

> +%else
> +cglobal postscale_slice, 5, 6, 4, ptr, length, postscale, min, max, x
> +%endif
> +    shl lengthd, 2
> +%if WIN64
> +    SWAP 0, 2
> +    SWAP 1, 3
> +    SWAP 2, 4
> +%endif
> +    shufps   xm0, xm0, 0
> +    shufps   xm1, xm1, 0
> +    shufps   xm2, xm2, 0
> +%if cpuflag(avx2)
> +    vinsertf128  m0, m0, xm0, 1
> +    vinsertf128  m1, m1, xm1, 1
> +    vinsertf128  m2, m2, xm2, 1

You can use vbroadcastss ymm, xmm with AVX2, which combines both the 
shufps and vinsertf128 into one instruction.

As is, this function is base AVX. So if you can't measure any 
performance gain with vbroadcastss, then just mark the function as AVX.

> +%endif
> +    xor      xq, xq
> +
> +    .loop:
> +    movu          m3, [ptrq + xq]
> +    mulps         m3, m0

AVX can use unaligned memory operands, so just do

mulps m3, m0, [ptrq + xq]

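But keep the explicit movu + mulps for the SSE version, otherwise x86inc 
will expand the three-operand form into a mova followed by a mulps with 
a memory operand, which requires the memory to be aligned.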

> +    maxps         m3, m1
> +    minps         m3, m2
> +    movu   [ptrq+xq], m3
> +
> +    add xq, mmsize
> +    cmp xd, lengthd

Can't you use the neg trick (advance ptr by length, negate length, and 
count it up towards zero)? It should let you reuse length instead of x.

> +    jl .loop
> +
> +    RET
> +%endmacro
> +
> +%if ARCH_X86_64

Nothing in this function seems to require x86_64.

> +INIT_XMM sse4

No instruction is SSE4 here. It's all base SSE.

> +POSTSCALE_SLICE
> +
> +%if HAVE_AVX_EXTERNAL

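Wrong check. The function below is declared as avx2, so this should be 
HAVE_AVX2_EXTERNAL (HAVE_AVX_EXTERNAL only fits if the function ends up 
being marked as AVX).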

> +INIT_YMM avx2
> +POSTSCALE_SLICE
> +%endif
> +%endif
> diff --git a/libavfilter/x86/vf_gblur_init.c b/libavfilter/x86/vf_gblur_init.c
> index e63e59fe23..7a9b40b0ad 100644
> --- a/libavfilter/x86/vf_gblur_init.c
> +++ b/libavfilter/x86/vf_gblur_init.c
> @@ -27,14 +27,21 @@
>   void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps, float nu, float bscale);
>   void ff_horiz_slice_avx2(float *ptr, int width, int height, int steps, float nu, float bscale);
>   
> +void ff_postscale_slice_sse4(float *ptr, int length, float postscale, float min, float max);
> +void ff_postscale_slice_avx2(float *ptr, int length, float postscale, float min, float max);
> +
>   av_cold void ff_gblur_init_x86(GBlurContext *s)
>   {
>   #if ARCH_X86_64
>       int cpu_flags = av_get_cpu_flags();
>   
> -    if (EXTERNAL_SSE4(cpu_flags))
> +    if (EXTERNAL_SSE4(cpu_flags)) {
>           s->horiz_slice = ff_horiz_slice_sse4;
> -    if (EXTERNAL_AVX2(cpu_flags))
> +        s->postscale_slice = ff_postscale_slice_sse4;
> +    }
> +    if (EXTERNAL_AVX2(cpu_flags)) {
>           s->horiz_slice = ff_horiz_slice_avx2;
> +        s->postscale_slice = ff_postscale_slice_avx2;

Needs to be EXTERNAL_AVX2_FAST. You're using ymm regs, unlike in 
ff_horiz_slice_avx2.

> +    }
>   #endif
>   }
> 



More information about the ffmpeg-devel mailing list