[FFmpeg-devel] [PATCH v2 2/3] swscale/x86/output: add AVX2 version of yuv2nv12cX

Fri Apr 24 20:46:32 EEST 2020

On 4/24/2020 1:31 PM, Nelson Gomez wrote:
> From: Nelson Gomez <nelson.gomez at microsoft.com>
> 
> 256 bits is just wide enough to fit all the operands needed to vectorize
> the software implementation, but AVX2 is needed for a couple of
> instructions like cross-lane permutation.
> 
> Output is bit-for-bit identical to C.
> 
> Signed-off-by: Nelson Gomez <nelson.gomez at microsoft.com>
> ---
>  libswscale/x86/output.asm | 124 +++++++++++++++++++++++++++++++++++++-
>  libswscale/x86/swscale.c  |  24 ++++++++
>  2 files changed, 147 insertions(+), 1 deletion(-)
> 
> diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm
> index db3e9934f8..7478e12403 100644
> --- a/libswscale/x86/output.asm
> +++ b/libswscale/x86/output.asm
> @@ -2,6 +2,7 @@
>  ;* x86-optimized vertical line scaling functions
>  ;* Copyright (c) 2011 Ronald S. Bultje <rsbultje at gmail.com>
>  ;*                    Kieran Kunhya <kieran at kunhya.com>
> +;*           (c) 2020 Nelson Gomez <nelson.gomez at microsoft.com>
>  ;*
>  ;* This file is part of FFmpeg.
>  ;*
> @@ -22,7 +23,7 @@
>  
>  %include "libavutil/x86/x86util.asm"
>  
> -SECTION_RODATA
> +SECTION_RODATA 32
>  
>  minshort:      times 8 dw 0x8000
>  yuv2yuvX_16_start:  times 4 dd 0x4000 - 0x40000000
> @@ -34,9 +35,20 @@ pd_4:          times 4 dd 4
>  pd_4min0x40000:times 4 dd 4 - (0x40000)
>  pw_16:         times 8 dw 16
>  pw_32:         times 8 dw 32
> +pd_255:        times 8 dd 255
>  pw_512:        times 8 dw 512
>  pw_1024:       times 8 dw 1024
>  
> +yuv2nv12_shuffle_mask: times 2 db 0,  4,  8, 12, \
> +                                 -1, -1, -1, -1, \
> +                                 -1, -1, -1, -1, \
> +                                 -1, -1, -1, -1
> +yuv2nv21_shuffle_mask: times 2 db 4,  0, 12,  8, \
> +                                 -1, -1, -1, -1, \
> +                                 -1, -1, -1, -1, \
> +                                 -1, -1, -1, -1
> +yuv2nv12_permute_mask: dd 0, 4, 1, 2, 3, 5, 6, 7
> +
>  SECTION .text
>  
>  ;-----------------------------------------------------------------------------
> @@ -423,3 +435,113 @@ yuv2plane1_fn  9, 5, 3
>  yuv2plane1_fn 10, 5, 3
>  yuv2plane1_fn 16, 5, 3
>  %endif
> +
> +%undef movsx
> +
> +;-----------------------------------------------------------------------------
> +; AVX2 yuv2nv12cX implementation
> +;
> +; void ff_yuv2nv12cX_avx2(enum AVPixelFormat format, const uint8_t *dither,
> +;                         const int16_t *filter, int filterSize,
> +;                         const int16_t **u, const int16_t **v,
> +;                         uint8_t *dst, int dstWidth)
> +;
> +; void ff_yuv2nv21cX_avx2(enum AVPixelFormat format, const uint8_t *dither,
> +;                         const int16_t *filter, int filterSize,
> +;                         const int16_t **u, const int16_t **v,
> +;                         uint8_t *dst, int dstWidth)
> +;-----------------------------------------------------------------------------
> +
> +%macro yuv2nv12cX_fn 1
> +cglobal %1cX, 8, 11, 13, tmp1, dither, filter, filterSize, u, v, dst, dstWidth
> +
> +    mov tmp1q, qword [ditherq]
> +    movq xm0, tmp1q
> +    ror tmp1q, 24
> +    movq xm1, tmp1q
> +
> +    pmovzxbd m0, xm0
> +    pslld m0, m0, 12                        ; ditherLo
> +    pmovzxbd m1, xm1
> +    pslld m1, m1, 12                        ; ditherHi
> +
> +    pxor m9, m9                             ; uint8_min dwords
> +    mova m10, [pd_255]                      ; uint8_max dwords
> +    mova m11, [%1_shuffle_mask]             ; shuffle_mask
> +    mova m12, [yuv2nv12_permute_mask]       ; permute mask
> +
> +    DEFINE_ARGS tmp1, tmp2, filter, filterSize, u, v, dst, dstWidth
> +
> +    xor r8q, r8q
> +
> +nv12_outer_%1:
> +    mova m2, m0                             ; resultLo
> +    mova m3, m1                             ; resultHi
> +    xor r9q, r9q
> +
> +nv12_inner_%1:
> +    movsx r10d, word [filterq + (2 * r9q)]
> +    movd xm4, r10d
> +    vpbroadcastd m4, xm4                    ; filter
> +
> +    mov tmp1q, [uq + (gprsize * r9q)]
> +    mova xm7, oword [tmp1q + 2 * r8q]
> +
> +    mov tmp2q, [vq + (gprsize * r9q)]
> +    mova xm8, oword [tmp2q + 2 * r8q]
> +
> +    punpcklwd xm5, xm7, xm8
> +    pmovsxwd m5, xm5                        ; multiplicandsLo
> +    punpckhwd xm6, xm7, xm8
> +    pmovsxwd m6, xm6                        ; multiplicandsHi
> +
> +    pmulld m7, m5, m4                       ; mulResultLo
> +    pmulld m8, m6, m4                       ; mulResultHi
> +    paddd m2, m2, m7                        ; resultLo += mulResultLo
> +    paddd m3, m3, m8                        ; resultHi += mulResultHi
> +
> +    inc r9d
> +    cmp r9d, filterSized
> +    jl nv12_inner_%1
> +    ; end of inner loop
> +
> +    psrad m2, m2, 19
> +    psrad m3, m3, 19
> +
> +    ; Vectorized av_clip_uint8
> +    pmaxsd m2, m2, m9
> +    pmaxsd m3, m3, m9
> +    pminsd m2, m2, m10
> +    pminsd m3, m3, m10
> +
> +    ; At this point we have clamped uint8s arranged in this order:
> +    ;     m2: u1  0  0  0  v1  0  0  0  [...]
> +    ;     m3: u5  0  0  0  v5  0  0  0  [...]
> +    ;
> +    ; First, we shuffle the bytes to make the bytes semi-contiguous.
> +    ; AVX2 pshufb cannot move bytes across 128-bit lanes, so we'll end up with:
> +    ;     m2: u1  v1  u2  v2  [12 zero bytes]  u3  v3  u4  v4  [12 zero bytes]
> +    ;     m3: u5  v5  u6  v6  [12 zero bytes]  u7  v7  u8  v8  [12 zero bytes]
> +    pshufb m2, m2, m11
> +    pshufb m3, m3, m11
> +
> +    ; To fix the cross-lane shuffling issue, we'll then use cross-lane
> +    ; permutation to combine the two segments
> +    vpermd m2, m12, m2
> +    vpermd m3, m12, m3
> +
> +    ; Now we have the final results in the lower 8 bytes of each register
> +    movq [dstq], xm2
> +    movq [dstq + 8], xm3
> +
> +    add r8d, 8
> +    add dstq, 16
> +
> +    cmp r8d, dstWidthd
> +    jl nv12_outer_%1
> +    RET
> +%endmacro
> +
> +INIT_YMM avx2
> +yuv2nv12cX_fn yuv2nv12
> +yuv2nv12cX_fn yuv2nv21

Wrap this entire chunk in an %if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
preprocessor check (leave the constants as they are).
AVX2 may not be supported by the assembler (as is the case with some
old yasm versions), and as Michael noticed in the first version, these
functions will not work on x86_32, since you're using more registers
than it supports.

> diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
> index 61110839ee..d0930b9656 100644
> --- a/libswscale/x86/swscale.c
> +++ b/libswscale/x86/swscale.c
> @@ -380,6 +380,15 @@ INPUT_FUNCS(sse2);
>  INPUT_FUNCS(ssse3);
>  INPUT_FUNCS(avx);
>  
> +#define YUV2NV_DECL(fmt, opt) \
> +void ff_yuv2 ## fmt ## cX_ ## opt(enum AVPixelFormat format, const uint8_t *dither, \
> +                                  const int16_t *filter, int filterSize, \
> +                                  const int16_t **u, const int16_t **v, \
> +                                  uint8_t *dst, int dstWidth)
> +
> +YUV2NV_DECL(nv12, avx2);
> +YUV2NV_DECL(nv21, avx2);
> +
>  av_cold void ff_sws_init_swscale_x86(SwsContext *c)
>  {
>      int cpu_flags = av_get_cpu_flags();
> @@ -580,4 +589,19 @@ switch(c->dstBpc){ \
>              break;
>          }
>      }
> +
> +    if (EXTERNAL_AVX2_FAST(cpu_flags)) {

For the same reasons as above, make this

if (ARCH_X86_64 && EXTERNAL_AVX2_FAST(cpu_flags))

> +        switch (c->dstFormat) {
> +        case AV_PIX_FMT_NV12:
> +        case AV_PIX_FMT_NV24:
> +            c->yuv2nv12cX = ff_yuv2nv12cX_avx2;
> +            break;
> +        case AV_PIX_FMT_NV21:
> +        case AV_PIX_FMT_NV42:
> +            c->yuv2nv12cX = ff_yuv2nv21cX_avx2;
> +            break;
> +        default:
> +            break;
> +        }
> +    }
>  }

No more comments from me, but you may want to wait a bit for other
reviews before resending this patch.