[FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add SSE2 shuffle_bytes functions
Andreas Rheinhardt
andreas.rheinhardt at outlook.com
Thu Jun 6 17:48:44 EEST 2024
James Almer:
> And remove shuffle_bytes_2103_mmxext.
>
> shuffle_bytes_0321_c: 28.1
> shuffle_bytes_0321_sse2: 13.6
> shuffle_bytes_0321_ssse3: 9.6
> shuffle_bytes_0321_avx2: 7.1
> shuffle_bytes_1230_c: 52.6
> shuffle_bytes_1230_sse2: 12.1
> shuffle_bytes_1230_ssse3: 8.6
> shuffle_bytes_1230_avx2: 6.6
> shuffle_bytes_2103_c: 29.1
> shuffle_bytes_2103_mmxext: 29.3 // removed
> shuffle_bytes_2103_sse2: 12.5
> shuffle_bytes_2103_ssse3: 8.6
> shuffle_bytes_2103_avx2: 7.1
> shuffle_bytes_3012_c: 52.1
> shuffle_bytes_3012_sse2: 12.1
> shuffle_bytes_3012_ssse3: 8.6
> shuffle_bytes_3012_avx2: 7.1
> shuffle_bytes_3210_c: 50.6
> shuffle_bytes_3210_sse2: 14.6
> shuffle_bytes_3210_ssse3: 8.6
> shuffle_bytes_3210_avx2: 7.1
>
> Signed-off-by: James Almer <jamrial at gmail.com>
> ---
> libswscale/x86/rgb2rgb.c | 14 ++++--
> libswscale/x86/rgb_2_rgb.asm | 83 +++++++++++++++++++++++++-----------
> 2 files changed, 69 insertions(+), 28 deletions(-)
>
> diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
> index 21ccfafe51..9f6c8efc72 100644
> --- a/libswscale/x86/rgb2rgb.c
> +++ b/libswscale/x86/rgb2rgb.c
> @@ -116,7 +116,11 @@ DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset);
>
> #endif /* HAVE_INLINE_ASM */
>
> -void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_2103_sse2(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_0321_sse2(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_1230_sse2(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_3012_sse2(const uint8_t *src, uint8_t *dst, int src_size);
> +void ff_shuffle_bytes_3210_sse2(const uint8_t *src, uint8_t *dst, int src_size);
> void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
> void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
> void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
> @@ -154,10 +158,12 @@ av_cold void rgb2rgb_init_x86(void)
> rgb2rgb_init_avx();
> #endif /* HAVE_INLINE_ASM */
>
> - if (EXTERNAL_MMXEXT(cpu_flags)) {
> - shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext;
> - }
> if (EXTERNAL_SSE2(cpu_flags)) {
> + shuffle_bytes_2103 = ff_shuffle_bytes_2103_sse2;
> + shuffle_bytes_0321 = ff_shuffle_bytes_0321_sse2;
> + shuffle_bytes_1230 = ff_shuffle_bytes_1230_sse2;
> + shuffle_bytes_3012 = ff_shuffle_bytes_3012_sse2;
> + shuffle_bytes_3210 = ff_shuffle_bytes_3210_sse2;
> #if ARCH_X86_64
> uyvytoyuv422 = ff_uyvytoyuv422_sse2;
> #endif
> diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
> index 0bf1278718..9fc1974389 100644
> --- a/libswscale/x86/rgb_2_rgb.asm
> +++ b/libswscale/x86/rgb_2_rgb.asm
> @@ -25,7 +25,6 @@
>
> SECTION_RODATA
>
> -pb_mask_shuffle2103_mmx times 8 dw 255
> pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15
> pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
> pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
> @@ -50,11 +49,50 @@ SECTION .text
> ;------------------------------------------------------------------------------
> ; shuffle_bytes_2103_mmext (const uint8_t *src, uint8_t *dst, int src_size)
> ;------------------------------------------------------------------------------
> -INIT_MMX mmxext
> -cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
> - mova m6, [pb_mask_shuffle2103_mmx]
> - mova m7, m6
> - psllq m7, 8
> +
> +%macro SHUFFLE2103_SSE2 0
> + pshuflw m1, m0, 0xb1
> + pshufhw m1, m1, 0xb1
> +
> + pand m0, m3
> + pand m1, m2
> +%endmacro
> +
> +%macro SHUFFLE0321_SSE2 0
> + pshuflw m1, m0, 0xb1
> + pshufhw m1, m1, 0xb1
> +
> + pand m0, m2
> + pand m1, m3
> +%endmacro
> +
> +%macro SHUFFLE1230_SSE2 0
> + pslld m1, m0, 24
> + psrld m0, 8
> +%endmacro
> +
> +%macro SHUFFLE3012_SSE2 0
> + pslld m1, m0, 8
> + psrld m0, 24
> +%endmacro
> +
> +%macro SHUFFLE3210_SSE2 0
> + pshuflw m1, m0, 0xb1
> + pshufhw m1, m1, 0xb1
> +
> + psrlw m0, m1, 8
> + psllw m1, 8
> +%endmacro
> +
> +; %1-4 index shuffle
> +; %5 load mask
> +%macro SHUFFLE_BYTES_SSE2 5
> +cglobal shuffle_bytes_%1%2%3%4, 3, 5, 4, src, dst, w, tmp, x
> +%if %5
> + pcmpeqw m2, m2
> + psllw m3, m2, 8 ; (word) { 0xff00 } x4
> + psrlw m2, 8 ; (word) { 0x00ff } x4
> +%endif
>
> movsxdifnidn wq, wd
> mov xq, wq
> @@ -68,13 +106,13 @@ cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
> je .loop_simd
>
> .loop_scalar:
> - mov tmpb, [srcq + wq + 2]
> + mov tmpb, [srcq + wq + %1]
> mov [dstq+wq + 0], tmpb
> - mov tmpb, [srcq + wq + 1]
> + mov tmpb, [srcq + wq + %2]
> mov [dstq+wq + 1], tmpb
> - mov tmpb, [srcq + wq + 0]
> + mov tmpb, [srcq + wq + %3]
> mov [dstq+wq + 2], tmpb
> - mov tmpb, [srcq + wq + 3]
> + mov tmpb, [srcq + wq + %4]
> mov [dstq+wq + 3], tmpb
> add wq, 4
> sub xq, 4
> @@ -86,29 +124,26 @@ jge .end
>
> .loop_simd:
> movu m0, [srcq+wq]
> - movu m1, [srcq+wq+8]
> -
> - pshufw m3, m0, 177
> - pshufw m5, m1, 177
> -
> - pand m0, m7
> - pand m3, m6
>
> - pand m1, m7
> - pand m5, m6
> + SHUFFLE%1%2%3%4_SSE2
>
> - por m0, m3
> - por m1, m5
> + por m0, m1
>
> movu [dstq+wq], m0
> - movu [dstq+wq + 8], m1
>
> - add wq, mmsize*2
> + add wq, mmsize
> jl .loop_simd
>
> .end:
> - emms
> RET
> +%endmacro
> +
> +INIT_XMM sse2
> +SHUFFLE_BYTES_SSE2 2, 1, 0, 3, 1
> +SHUFFLE_BYTES_SSE2 0, 3, 2, 1, 1
> +SHUFFLE_BYTES_SSE2 1, 2, 3, 0, 0
> +SHUFFLE_BYTES_SSE2 3, 0, 1, 2, 0
> +SHUFFLE_BYTES_SSE2 3, 2, 1, 0, 0
>
> ;------------------------------------------------------------------------------
> ; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)
How old are the youngest processors with SSE2 but without SSSE3?
According to Wikipedia, nearly 15 years, which makes me believe that the
SSE2 versions are not worth it (how many of these CPUs will run a new
FFmpeg anyway?).
- Andreas
More information about the ffmpeg-devel mailing list