[FFmpeg-devel] [PATCH 2/2] libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes.

Tue Dec 14 19:07:30 EET 2021

On 12/14/2021 12:23 PM, Alan Kelly wrote:
> Patch has been rebased from latest commits.
> These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available.
> ---
>   libswscale/swscale_internal.h |   2 +
>   libswscale/utils.c            |  37 +++++++++++
>   libswscale/x86/Makefile       |   1 +
>   libswscale/x86/scale_avx2.asm | 112 ++++++++++++++++++++++++++++++++++
>   libswscale/x86/swscale.c      |  19 ++++++
>   tests/checkasm/sw_scale.c     |  20 ++++--
>   6 files changed, 186 insertions(+), 5 deletions(-)
>   create mode 100644 libswscale/x86/scale_avx2.asm
> 
> diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
> index 708facba67..64aa0b9804 100644
> --- a/libswscale/swscale_internal.h
> +++ b/libswscale/swscale_internal.h
> @@ -1105,4 +1105,6 @@ void ff_sws_slice_worker(void *priv, int jobnr, int threadnr,
>   //number of extra lines to process
>   #define MAX_LINES_AHEAD 4
>   
> +//shuffle filter and filterPos for hyScale and hcScale filters in avx2
> +void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int filterSize, int16_t *filter, int dstW);
>   #endif /* SWSCALE_SWSCALE_INTERNAL_H */
> diff --git a/libswscale/utils.c b/libswscale/utils.c
> index ae92ac9fbc..d4a72d3ce1 100644
> --- a/libswscale/utils.c
> +++ b/libswscale/utils.c
> @@ -278,6 +278,41 @@ static const FormatEntry format_entries[] = {
>       [AV_PIX_FMT_P416LE]      = { 1, 0 },
>   };
>   
> +void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int filterSize, int16_t *filter, int dstW){
> +#if ARCH_X86_64
> +    int i, j, k, l;
> +    int cpu_flags = av_get_cpu_flags();
> +    if (EXTERNAL_AVX2_FAST(cpu_flags)){
> +        if ((c->srcBpc == 8) && (c->dstBpc <= 14)){
> +            if (dstW % 16 == 0){
> +                if (filter != NULL){
> +                    for (i = 0; i < dstW; i += 8){
> +                        FFSWAP(int, filterPos[i + 2], filterPos[i+4]);
> +                        FFSWAP(int, filterPos[i + 3], filterPos[i+5]);
> +                    }
> +                    if (filterSize > 4){
> +                        int16_t *tmp2 = av_malloc(dstW * filterSize * 2);
> +                        memcpy(tmp2, filter, dstW * filterSize * 2);
> +                        for (i = 0; i < dstW; i += 16){//pixel
> +                            for (k = 0; k < filterSize / 4; ++k){//fcoeff
> +                                for (j = 0; j < 16; ++j){//inner pixel
> +                                    for (l = 0; l < 4; ++l){//coeff
> +                                        int from = i * filterSize + j * filterSize + k * 4 + l;
> +                                        int to = (i) * filterSize + j * 4 + l + k * 64;
> +                                        filter[to] = tmp2[from];
> +                                    }
> +                                }
> +                            }
> +                        }
> +                        av_free(tmp2);
> +                    }
> +                }
> +            }
> +        }
> +    }
> +#endif
> +}
> +
>   int sws_isSupportedInput(enum AVPixelFormat pix_fmt)
>   {
>       return (unsigned)pix_fmt < FF_ARRAY_ELEMS(format_entries) ?
> @@ -1801,6 +1836,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
>                              get_local_pos(c, 0, 0, 0),
>                              get_local_pos(c, 0, 0, 0))) < 0)
>                   goto fail;
> +            ff_shuffle_filter_coefficients(c, c->hLumFilterPos, c->hLumFilterSize, c->hLumFilter, dstW);
>               if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos,
>                              &c->hChrFilterSize, c->chrXInc,
>                              c->chrSrcW, c->chrDstW, filterAlign, 1 << 14,
> @@ -1810,6 +1846,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
>                              get_local_pos(c, c->chrSrcHSubSample, c->src_h_chr_pos, 0),
>                              get_local_pos(c, c->chrDstHSubSample, c->dst_h_chr_pos, 0))) < 0)
>                   goto fail;
> +            ff_shuffle_filter_coefficients(c, c->hChrFilterPos, c->hChrFilterSize, c->hChrFilter, c->chrDstW);
>           }
>       } // initialize horizontal stuff
>   
> diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
> index bfe383364e..68391494be 100644
> --- a/libswscale/x86/Makefile
> +++ b/libswscale/x86/Makefile
> @@ -11,6 +11,7 @@ OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
>   X86ASM-OBJS                     += x86/input.o                          \
>                                      x86/output.o                         \
>                                      x86/scale.o                          \
> +                                   x86/scale_avx2.o                          \
>                                      x86/rgb_2_rgb.o                      \
>                                      x86/yuv_2_rgb.o                      \
>                                      x86/yuv2yuvX.o                       \
> diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm
> new file mode 100644
> index 0000000000..d90fd2d791
> --- /dev/null
> +++ b/libswscale/x86/scale_avx2.asm
> @@ -0,0 +1,112 @@
> +;******************************************************************************
> +;* x86-optimized horizontal line scaling functions
> +;* Copyright 2020 Google LLC
> +;* Copyright (c) 2011 Ronald S. Bultje <rsbultje at gmail.com>
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA

SECTION_RODATA 32

That way you can use mova to load these two.

> +
> +swizzle: dd 0, 4, 1, 5, 2, 6, 3, 7
> +four: times 8 dd 4
> +
> +SECTION .text
> +
> +;-----------------------------------------------------------------------------
> +; horizontal line scaling
> +;
> +; void hscale8to15_<filterSize>_<opt>
> +;                   (SwsContext *c, int16_t *dst,
> +;                    int dstW, const uint8_t *src,
> +;                    const int16_t *filter,
> +;                    const int32_t *filterPos, int filterSize);
> +;
> +; Scale one horizontal line. Input is 8-bit width Filter is 14 bits. Output is
> +; 15 bits (in int16_t). Each output pixel is generated from $filterSize input
> +; pixels, the position of the first pixel is given in filterPos[nOutputPixel].
> +;-----------------------------------------------------------------------------
> +
> +%macro SCALE_FUNC 1
> +cglobal hscale8to15_%1, 7, 9, 15, pos0, dst, w, srcmem, filter, fltpos, fltsize, count, inner

This needs to be 7, 9, 16, since you're using m15.

> +  pxor m0, m0
> +  movu m15, [swizzle]
> +  mov countq, $0
> +%ifidn %1, X4
> +  movu m14, [four]
> +  movsxd fltsizeq, fltsized

Using fltsized on the shr below should implicitly clear the higher 32 
bits, so you can skip this.
What you need to add however is a movsxd wq, wd (on all cases, not just 
X4), otherwise checkasm fails on Windows.

FATE passes with the above fixes for me.

That aside, did you give my suggestion for an AVX2SLOW cpuflag that's 
set on Haswell and Zen < 3 a try?

> +  shr fltsizeq, 2
> +%endif
> +.loop:
> +  movu m1, [fltposq]
> +  movu m2, [fltposq+32]
> +%ifidn %1, X4
> +  pxor m9, m9
> +  pxor m10, m10
> +  pxor m11, m11
> +  pxor m12, m12
> +  mov innerq, $0
> +.innerloop:
> +%endif
> +  vpcmpeqd  m13, m13
> +  vpgatherdd m3,[srcmemq + m1], m13
> +  vpcmpeqd  m13, m13
> +  vpgatherdd m4,[srcmemq + m2], m13
> +  vpunpcklbw m5, m3, m0
> +  vpunpckhbw m6, m3, m0
> +  vpunpcklbw m7, m4, m0
> +  vpunpckhbw m8, m4, m0
> +  vpmaddwd m5, m5, [filterq]
> +  vpmaddwd m6, m6, [filterq + 32]
> +  vpmaddwd m7, m7, [filterq + 64]
> +  vpmaddwd m8, m8, [filterq + 96]
> +  add filterq, $80
> +%ifidn %1, X4
> +  paddd m9, m5
> +  paddd m10, m6
> +  paddd m11, m7
> +  paddd m12, m8
> +  paddd m1, m14
> +  paddd m2, m14
> +  add innerq, $1
> +  cmp innerq, fltsizeq
> +  jl .innerloop
> +  vphaddd m5, m9, m10
> +  vphaddd m6, m11, m12
> +%else
> +  vphaddd m5, m5, m6
> +  vphaddd m6, m7, m8
> +%endif
> +  vpsrad  m5, 7
> +  vpsrad  m6, 7
> +  vpackssdw m5, m5, m6
> +  vpermd m5, m15, m5
> +  vmovdqu [dstq + countq * 2], m5
> +  add fltposq, $40
> +  add countq, $10
> +  cmp countq, wq
> +  jl .loop
> +REP_RET
> +%endmacro
> +
> +%if ARCH_X86_64
> +INIT_YMM avx2
> +SCALE_FUNC 4
> +SCALE_FUNC X4
> +%endif
> diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
> index 0848a31461..164b06d6ba 100644
> --- a/libswscale/x86/swscale.c
> +++ b/libswscale/x86/swscale.c
> @@ -276,6 +276,9 @@ SCALE_FUNCS_SSE(sse2);
>   SCALE_FUNCS_SSE(ssse3);
>   SCALE_FUNCS_SSE(sse4);
>   
> +SCALE_FUNC(4, 8, 15, avx2);
> +SCALE_FUNC(X4, 8, 15, avx2);
> +
>   #define VSCALEX_FUNC(size, opt) \
>   void ff_yuv2planeX_ ## size ## _ ## opt(const int16_t *filter, int filterSize, \
>                                           const int16_t **src, uint8_t *dest, int dstW, \
> @@ -568,6 +571,22 @@ switch(c->dstBpc){ \
>       }
>   
>   #if ARCH_X86_64
> +#define ASSIGN_AVX2_SCALE_FUNC(hscalefn, filtersize) \
> +    switch (filtersize) { \
> +    case 4:  hscalefn = ff_hscale8to15_4_avx2; break; \
> +    default:  hscalefn = ff_hscale8to15_X4_avx2; break; \
> +             break; \
> +    }
> +
> +    if (EXTERNAL_AVX2_FAST(cpu_flags)){
> +      if ((c->srcBpc == 8) && (c->dstBpc <= 14)){
> +        if(c->chrDstW % 16 == 0)
> +          ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize);
> +        if(c->dstW % 16 == 0)
> +          ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize);
> +      }
> +    }
> +
>       if (EXTERNAL_AVX2_FAST(cpu_flags)) {
>           switch (c->dstFormat) {
>           case AV_PIX_FMT_NV12:
> diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
> index 1e7ffe0fff..011cb46428 100644
> --- a/tests/checkasm/sw_scale.c
> +++ b/tests/checkasm/sw_scale.c
> @@ -134,13 +134,13 @@ static void check_yuv2yuvX(void)
>   }
>   
>   #undef SRC_PIXELS
> -#define SRC_PIXELS 128
> +#define SRC_PIXELS 512
>   
>   static void check_hscale(void)
>   {
>   #define MAX_FILTER_WIDTH 40
> -#define FILTER_SIZES 5
> -    static const int filter_sizes[FILTER_SIZES] = { 4, 8, 16, 32, 40 };
> +#define FILTER_SIZES 6
> +    static const int filter_sizes[FILTER_SIZES] = { 4, 8, 12, 16, 32, 40 };
>   
>   #define HSCALE_PAIRS 2
>       static const int hscale_pairs[HSCALE_PAIRS][2] = {
> @@ -159,6 +159,8 @@ static void check_hscale(void)
>       // padded
>       LOCAL_ALIGNED_32(int16_t, filter, [SRC_PIXELS * MAX_FILTER_WIDTH + MAX_FILTER_WIDTH]);
>       LOCAL_ALIGNED_32(int32_t, filterPos, [SRC_PIXELS]);
> +    LOCAL_ALIGNED_32(int16_t, filterAvx2, [SRC_PIXELS * MAX_FILTER_WIDTH + MAX_FILTER_WIDTH]);
> +    LOCAL_ALIGNED_32(int32_t, filterPosAvx, [SRC_PIXELS]);
>   
>       // The dst parameter here is either int16_t or int32_t but we use void* to
>       // just cover both cases.
> @@ -166,6 +168,8 @@ static void check_hscale(void)
>                         const uint8_t *src, const int16_t *filter,
>                         const int32_t *filterPos, int filterSize);
>   
> +    int cpu_flags = av_get_cpu_flags();
> +
>       ctx = sws_alloc_context();
>       if (sws_init_context(ctx, NULL, NULL) < 0)
>           fail();
> @@ -179,9 +183,11 @@ static void check_hscale(void)
>               ctx->srcBpc = hscale_pairs[hpi][0];
>               ctx->dstBpc = hscale_pairs[hpi][1];
>               ctx->hLumFilterSize = ctx->hChrFilterSize = width;
> +            ctx->dstW = ctx->chrDstW = SRC_PIXELS;
>   
>               for (i = 0; i < SRC_PIXELS; i++) {
>                   filterPos[i] = i;
> +                filterPosAvx[i] = i;
>   
>                   // These filter cofficients are chosen to try break two corner
>                   // cases, namely:
> @@ -210,16 +216,20 @@ static void check_hscale(void)
>                   filter[SRC_PIXELS * width + i] = rnd();
>               }
>               ff_sws_init_scale(ctx);
> +            memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * MAX_FILTER_WIDTH + MAX_FILTER_WIDTH));
> +            if (cpu_flags & AV_CPU_FLAG_AVX2){
> +                ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, filterAvx2, SRC_PIXELS);
> +            }
>   
>               if (check_func(ctx->hcScale, "hscale_%d_to_%d_width%d", ctx->srcBpc, ctx->dstBpc + 1, width)) {
>                   memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0]));
>                   memset(dst1, 0, SRC_PIXELS * sizeof(dst1[0]));
>   
>                   call_ref(NULL, dst0, SRC_PIXELS, src, filter, filterPos, width);
> -                call_new(NULL, dst1, SRC_PIXELS, src, filter, filterPos, width);
> +                call_new(NULL, dst1, SRC_PIXELS, src, filterAvx2, filterPosAvx, width);
>                   if (memcmp(dst0, dst1, SRC_PIXELS * sizeof(dst0[0])))
>                       fail();
> -                bench_new(NULL, dst0, SRC_PIXELS, src, filter, filterPos, width);
> +                bench_new(NULL, dst0, SRC_PIXELS, src, filter, filterPosAvx, width);
>               }
>           }
>       }