[FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions

Andreas Rheinhardt andreas.rheinhardt at outlook.com
Thu Jun 6 17:48:44 EEST 2024


James Almer:
> And remove shuffle_bytes_2103_mmxext.
> 
> shuffle_bytes_0321_c: 28.1
> shuffle_bytes_0321_sse2: 13.6
> shuffle_bytes_0321_ssse3: 9.6
> shuffle_bytes_0321_avx2: 7.1
> shuffle_bytes_1230_c: 52.6
> shuffle_bytes_1230_sse2: 12.1
> shuffle_bytes_1230_ssse3: 8.6
> shuffle_bytes_1230_avx2: 6.6
> shuffle_bytes_2103_c: 29.1
> shuffle_bytes_2103_mmxext: 29.3 // removed
> shuffle_bytes_2103_sse2: 12.5
> shuffle_bytes_2103_ssse3: 8.6
> shuffle_bytes_2103_avx2: 7.1
> shuffle_bytes_3012_c: 52.1
> shuffle_bytes_3012_sse2: 12.1
> shuffle_bytes_3012_ssse3: 8.6
> shuffle_bytes_3012_avx2: 7.1
> shuffle_bytes_3210_c: 50.6
> shuffle_bytes_3210_sse2: 14.6
> shuffle_bytes_3210_ssse3: 8.6
> shuffle_bytes_3210_avx2: 7.1
> 
> Signed-off-by: James Almer <jamrial at gmail.com>
> ---
>  libswscale/x86/rgb2rgb.c     | 14 ++++--
>  libswscale/x86/rgb_2_rgb.asm | 83 +++++++++++++++++++++++++-----------
>  2 files changed, 69 insertions(+), 28 deletions(-)
> 
> diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
> index 21ccfafe51..9f6c8efc72 100644
> --- a/libswscale/x86/rgb2rgb.c
> +++ b/libswscale/x86/rgb2rgb.c
> @@ -116,7 +116,11 @@ DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset);
>  
>  #endif /* HAVE_INLINE_ASM */
>  
> -void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_2103_sse2(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_0321_sse2(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_1230_sse2(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_3012_sse2(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_3210_sse2(const uint8_t *src, uint8_t *dst, int src_size);
>  void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
>  void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
>  void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
> @@ -154,10 +158,12 @@ av_cold void rgb2rgb_init_x86(void)
>          rgb2rgb_init_avx();
>  #endif /* HAVE_INLINE_ASM */
>  
> -    if (EXTERNAL_MMXEXT(cpu_flags)) {
> -        shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext;
> -    }
>      if (EXTERNAL_SSE2(cpu_flags)) {
> +        shuffle_bytes_2103 = ff_shuffle_bytes_2103_sse2;
> +        shuffle_bytes_0321 = ff_shuffle_bytes_0321_sse2;
> +        shuffle_bytes_1230 = ff_shuffle_bytes_1230_sse2;
> +        shuffle_bytes_3012 = ff_shuffle_bytes_3012_sse2;
> +        shuffle_bytes_3210 = ff_shuffle_bytes_3210_sse2;
>  #if ARCH_X86_64
>          uyvytoyuv422 = ff_uyvytoyuv422_sse2;
>  #endif
> diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
> index 0bf1278718..9fc1974389 100644
> --- a/libswscale/x86/rgb_2_rgb.asm
> +++ b/libswscale/x86/rgb_2_rgb.asm
> @@ -25,7 +25,6 @@
>  
>  SECTION_RODATA
>  
> -pb_mask_shuffle2103_mmx times 8 dw 255
>  pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15
>  pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
>  pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
> @@ -50,11 +49,50 @@ SECTION .text
>  ;------------------------------------------------------------------------------
>  ; shuffle_bytes_2103_mmext (const uint8_t *src, uint8_t *dst, int src_size)
>  ;------------------------------------------------------------------------------
> -INIT_MMX mmxext
> -cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
> -    mova   m6, [pb_mask_shuffle2103_mmx]
> -    mova   m7, m6
> -    psllq  m7, 8
> +
> +%macro SHUFFLE2103_SSE2 0
> +    pshuflw   m1, m0, 0xb1
> +    pshufhw   m1, m1, 0xb1
> +
> +    pand     m0, m3
> +    pand     m1, m2
> +%endmacro
> +
> +%macro SHUFFLE0321_SSE2 0
> +    pshuflw   m1, m0, 0xb1
> +    pshufhw   m1, m1, 0xb1
> +
> +    pand     m0, m2
> +    pand     m1, m3
> +%endmacro
> +
> +%macro SHUFFLE1230_SSE2 0
> +    pslld    m1, m0, 24
> +    psrld    m0, 8
> +%endmacro
> +
> +%macro SHUFFLE3012_SSE2 0
> +    pslld    m1, m0, 8
> +    psrld    m0, 24
> +%endmacro
> +
> +%macro SHUFFLE3210_SSE2 0
> +    pshuflw   m1, m0, 0xb1
> +    pshufhw   m1, m1, 0xb1
> +
> +    psrlw     m0, m1, 8
> +    psllw     m1, 8
> +%endmacro
> +
> +; %1-4 index shuffle
> +; %5 load mask
> +%macro SHUFFLE_BYTES_SSE2 5
> +cglobal shuffle_bytes_%1%2%3%4, 3, 5, 4, src, dst, w, tmp, x
> +%if %5
> +    pcmpeqw        m2, m2
> +    psllw          m3, m2, 8 ; (word) { 0xff00 } x4
> +    psrlw          m2, 8     ; (word) { 0x00ff } x4
> +%endif
>  
>      movsxdifnidn wq, wd
>      mov xq, wq
> @@ -68,13 +106,13 @@ cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
>      je .loop_simd
>  
>  .loop_scalar:
> -   mov          tmpb, [srcq + wq + 2]
> +   mov          tmpb, [srcq + wq + %1]
>     mov [dstq+wq + 0], tmpb
> -   mov          tmpb, [srcq + wq + 1]
> +   mov          tmpb, [srcq + wq + %2]
>     mov [dstq+wq + 1], tmpb
> -   mov          tmpb, [srcq + wq + 0]
> +   mov          tmpb, [srcq + wq + %3]
>     mov [dstq+wq + 2], tmpb
> -   mov          tmpb, [srcq + wq + 3]
> +   mov          tmpb, [srcq + wq + %4]
>     mov [dstq+wq + 3], tmpb
>     add            wq, 4
>     sub            xq, 4
> @@ -86,29 +124,26 @@ jge .end
>  
>  .loop_simd:
>      movu     m0, [srcq+wq]
> -    movu     m1, [srcq+wq+8]
> -
> -    pshufw   m3, m0, 177
> -    pshufw   m5, m1, 177
> -
> -    pand     m0, m7
> -    pand     m3, m6
>  
> -    pand     m1, m7
> -    pand     m5, m6
> +    SHUFFLE%1%2%3%4_SSE2
>  
> -    por      m0, m3
> -    por      m1, m5
> +    por      m0, m1
>  
>      movu      [dstq+wq], m0
> -    movu  [dstq+wq + 8], m1
>  
> -    add              wq, mmsize*2
> +    add              wq, mmsize
>      jl .loop_simd
>  
>  .end:
> -    emms
>      RET
> +%endmacro
> +
> +INIT_XMM sse2
> +SHUFFLE_BYTES_SSE2 2, 1, 0, 3, 1
> +SHUFFLE_BYTES_SSE2 0, 3, 2, 1, 1
> +SHUFFLE_BYTES_SSE2 1, 2, 3, 0, 0
> +SHUFFLE_BYTES_SSE2 3, 0, 1, 2, 0
> +SHUFFLE_BYTES_SSE2 3, 2, 1, 0, 0
>  
>  ;------------------------------------------------------------------------------
>  ; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)

How old are the youngest processors that have SSE2 but lack SSSE3?
According to Wikipedia, nearly 15 years. That makes me believe that the
SSE2 versions are not worth it (how many of these CPUs will run a new
FFmpeg anyway?).

- Andreas



More information about the ffmpeg-devel mailing list