[FFmpeg-devel] [PATCH] swscale/x86/output.asm: add x86-optimized planar gbr yuv2anyX functions
Mark Reid
mindmark at gmail.com
Wed Nov 3 20:29:15 EET 2021
On Sun, Oct 24, 2021 at 9:10 PM <mindmark at gmail.com> wrote:
> From: Mark Reid <mindmark at gmail.com>
>
> yuv2gbrp_full_X_4_512_c: 12096.6
> yuv2gbrp_full_X_4_512_sse2: 10782.6
> yuv2gbrp_full_X_4_512_sse4: 5143.6
> yuv2gbrp_full_X_4_512_avx2: 3000.1
> yuv2gbrap_full_X_4_512_c: 15463.1
> yuv2gbrap_full_X_4_512_sse2: 14296.6
> yuv2gbrap_full_X_4_512_sse4: 6319.1
> yuv2gbrap_full_X_4_512_avx2: 3554.1
> yuv2gbrp9be_full_X_4_512_c: 14281.6
> yuv2gbrp9be_full_X_4_512_sse2: 11206.1
> yuv2gbrp9be_full_X_4_512_sse4: 5033.6
> yuv2gbrp9be_full_X_4_512_avx2: 3012.6
> yuv2gbrp9le_full_X_4_512_c: 12688.6
> yuv2gbrp9le_full_X_4_512_sse2: 10914.1
> yuv2gbrp9le_full_X_4_512_sse4: 5144.6
> yuv2gbrp9le_full_X_4_512_avx2: 3014.6
> yuv2gbrp10be_full_X_4_512_c: 14257.6
> yuv2gbrp10be_full_X_4_512_sse2: 11089.6
> yuv2gbrp10be_full_X_4_512_sse4: 5039.1
> yuv2gbrp10be_full_X_4_512_avx2: 3001.1
> yuv2gbrp10le_full_X_4_512_c: 12098.6
> yuv2gbrp10le_full_X_4_512_sse2: 10884.1
> yuv2gbrp10le_full_X_4_512_sse4: 5138.1
> yuv2gbrp10le_full_X_4_512_avx2: 2999.6
> yuv2gbrap10be_full_X_4_512_c: 18549.6
> yuv2gbrap10be_full_X_4_512_sse2: 14538.6
> yuv2gbrap10be_full_X_4_512_sse4: 6292.6
> yuv2gbrap10be_full_X_4_512_avx2: 3583.6
> yuv2gbrap10le_full_X_4_512_c: 16631.1
> yuv2gbrap10le_full_X_4_512_sse2: 14190.6
> yuv2gbrap10le_full_X_4_512_sse4: 6348.1
> yuv2gbrap10le_full_X_4_512_avx2: 3554.6
> yuv2gbrp12be_full_X_4_512_c: 13555.1
> yuv2gbrp12be_full_X_4_512_sse2: 10952.1
> yuv2gbrp12be_full_X_4_512_sse4: 5137.6
> yuv2gbrp12be_full_X_4_512_avx2: 3009.6
> yuv2gbrp12le_full_X_4_512_c: 12082.6
> yuv2gbrp12le_full_X_4_512_sse2: 10891.1
> yuv2gbrp12le_full_X_4_512_sse4: 5184.1
> yuv2gbrp12le_full_X_4_512_avx2: 3011.1
> yuv2gbrap12be_full_X_4_512_c: 18689.6
> yuv2gbrap12be_full_X_4_512_sse2: 14522.6
> yuv2gbrap12be_full_X_4_512_sse4: 6237.6
> yuv2gbrap12be_full_X_4_512_avx2: 3585.6
> yuv2gbrap12le_full_X_4_512_c: 16760.6
> yuv2gbrap12le_full_X_4_512_sse2: 14202.1
> yuv2gbrap12le_full_X_4_512_sse4: 6252.1
> yuv2gbrap12le_full_X_4_512_avx2: 3591.1
> yuv2gbrp14be_full_X_4_512_c: 13555.6
> yuv2gbrp14be_full_X_4_512_sse2: 10949.1
> yuv2gbrp14be_full_X_4_512_sse4: 5185.1
> yuv2gbrp14be_full_X_4_512_avx2: 3012.1
> yuv2gbrp14le_full_X_4_512_c: 12068.1
> yuv2gbrp14le_full_X_4_512_sse2: 10883.6
> yuv2gbrp14le_full_X_4_512_sse4: 5145.1
> yuv2gbrp14le_full_X_4_512_avx2: 3007.1
> yuv2gbrp16be_full_X_4_512_c: 12383.6
> yuv2gbrp16be_full_X_4_512_sse2: 8230.6
> yuv2gbrp16be_full_X_4_512_sse4: 4765.6
> yuv2gbrp16be_full_X_4_512_avx2: 2742.6
> yuv2gbrp16le_full_X_4_512_c: 10906.1
> yuv2gbrp16le_full_X_4_512_sse2: 28732.1
> yuv2gbrp16le_full_X_4_512_sse4: 4709.6
> yuv2gbrp16le_full_X_4_512_avx2: 2753.1
> yuv2gbrap16be_full_X_4_512_c: 15472.6
> yuv2gbrap16be_full_X_4_512_sse2: 11021.6
> yuv2gbrap16be_full_X_4_512_sse4: 5487.6
> yuv2gbrap16be_full_X_4_512_avx2: 3143.6
> yuv2gbrap16le_full_X_4_512_c: 13668.6
> yuv2gbrap16le_full_X_4_512_sse2: 10562.1
> yuv2gbrap16le_full_X_4_512_sse4: 5506.6
> yuv2gbrap16le_full_X_4_512_avx2: 3149.6
> yuv2gbrpf32be_full_X_4_512_c: 15471.1
> yuv2gbrpf32be_full_X_4_512_sse2: 8524.6
> yuv2gbrpf32be_full_X_4_512_sse4: 4559.1
> yuv2gbrpf32be_full_X_4_512_avx2: 2388.1
> yuv2gbrpf32le_full_X_4_512_c: 14247.6
> yuv2gbrpf32le_full_X_4_512_sse2: 7600.6
> yuv2gbrpf32le_full_X_4_512_sse4: 4385.6
> yuv2gbrpf32le_full_X_4_512_avx2: 2258.6
> yuv2gbrapf32be_full_X_4_512_c: 18412.1
> yuv2gbrapf32be_full_X_4_512_sse2: 11353.6
> yuv2gbrapf32be_full_X_4_512_sse4: 5807.1
> yuv2gbrapf32be_full_X_4_512_avx2: 2928.1
> yuv2gbrapf32le_full_X_4_512_c: 16485.1
> yuv2gbrapf32le_full_X_4_512_sse2: 10202.1
> yuv2gbrapf32le_full_X_4_512_sse4: 5571.6
> yuv2gbrapf32le_full_X_4_512_avx2: 2847.6
>
>
> ---
> libswscale/x86/output.asm | 440 +++++++++++++++++++++++++++++++++++++-
> libswscale/x86/swscale.c | 99 +++++++++
> tests/checkasm/Makefile | 2 +-
> tests/checkasm/checkasm.c | 1 +
> tests/checkasm/checkasm.h | 1 +
> tests/checkasm/sw_gbrp.c | 198 +++++++++++++++++
> tests/fate/checkasm.mak | 1 +
> 7 files changed, 740 insertions(+), 2 deletions(-)
> create mode 100644 tests/checkasm/sw_gbrp.c
>
> diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm
> index 52cf9f2c2e..e80b6256b4 100644
> --- a/libswscale/x86/output.asm
> +++ b/libswscale/x86/output.asm
> @@ -38,7 +38,49 @@ pw_32: times 8 dw 32
> pd_255: times 8 dd 255
> pw_512: times 8 dw 512
> pw_1024: times 8 dw 1024
> -
> +pd_65535_invf: times 8 dd 0x37800080 ; float32 bit pattern of ~1.0/65535.0
> +pd_yuv2gbrp16_start: times 8 dd -0x40000000 ; -(1 << 30)
> +pd_yuv2gbrp_y_start: times 8 dd (1 << 9)
> +pd_yuv2gbrp_uv_start: times 8 dd ((1 << 9) - (128 << 19))
> +pd_yuv2gbrp_a_start: times 8 dd (1 << 18)
> +pd_yuv2gbrp16_offset: times 8 dd 0x10000 ;(1 << 16)
> +pd_yuv2gbrp16_round13: times 8 dd 0x02000 ;(1 << 13)
> +pd_yuv2gbrp16_a_offset: times 8 dd 0x20002000 ; (1 << 29) + (1 << 13)
> +pd_yuv2gbrp16_upper30: times 8 dd 0x3FFFFFFF ;(1<<30) - 1, mask loaded by CLIPP2 with suffix 30
> +pd_yuv2gbrp16_upper27: times 8 dd 0x07FFFFFF ;(1<<27) - 1, mask loaded by CLIPP2 with suffix 27
> +pd_yuv2gbrp16_upperC: times 8 dd 0xC0000000 ; 3 << 30 (only the top two bits set)
> +pb_lo_pack_shuffle8: db 0, 4, 8, 12, \
> + -1, -1, -1, -1, \
> + -1, -1, -1, -1, \
> + -1, -1, -1, -1 ; byte 0 of each dword -> bytes 0..3 (presumably a pshufb mask, -1 = zero; confirm at use site)
> +pb_hi_pack_shuffle8: db -1, -1, -1, -1, \
> + 0, 4, 8, 12, \
> + -1, -1, -1, -1, \
> + -1, -1, -1, -1 ; byte 0 of each dword -> bytes 4..7
> +pb_lo_pack_shuffle16le: db 0, 1, 4, 5, \
> + 8, 9, 12, 13, \
> + -1, -1, -1, -1, \
> + -1, -1, -1, -1 ; low word of each dword -> bytes 0..7, byte order kept
> +pb_lo_pack_shuffle16be: db 1, 0, 5, 4, \
> + 9, 8, 13, 12, \
> + -1, -1, -1, -1, \
> + -1, -1, -1, -1 ; low word of each dword -> bytes 0..7, bytes swapped within each word
> +pb_hi_pack_shuffle16le: db -1, -1, -1, -1, \
> + -1, -1, -1, -1, \
> + 0, 1, 4, 5, \
> + 8, 9, 12, 13 ; low word of each dword -> bytes 8..15, byte order kept
> +pb_hi_pack_shuffle16be: db -1, -1, -1, -1, \
> + -1, -1, -1, -1, \
> + 1, 0, 5, 4, \
> + 9, 8, 13, 12 ; low word of each dword -> bytes 8..15, bytes swapped within each word
> +pb_shuffle32be db 3, 2, 1, 0, \
> + 7, 6, 5, 4, \
> + 11, 10, 9, 8, \
> + 15, 14, 13, 12, \
> + 3, 2, 1, 0, \
> + 7, 6, 5, 4, \
> + 11, 10, 9, 8, \
> + 15, 14, 13, 12 ; reverses the bytes of every dword; 32 entries, same pattern for both 16-byte halves
> yuv2nv12_shuffle_mask: times 2 db 0, 4, 8, 12, \
> -1, -1, -1, -1, \
> -1, -1, -1, -1, \
> @@ -549,3 +591,399 @@ yuv2nv12cX_fn yuv2nv12
> yuv2nv12cX_fn yuv2nv21
> %endif
> %endif ; ARCH_X86_64
> +
>
> +;-----------------------------------------------------------------------------
> +; planar gbr yuv2anyX functions
> +; void ff_yuv2<gbr_format>_full_X_<opt>(SwsContext *c, const int16_t *lumFilter,
> +; const int16_t **lumSrcx, int lumFilterSize,
> +; const int16_t *chrFilter, const int16_t **chrUSrcx,
> +; const int16_t **chrVSrcx, int chrFilterSize,
> +; const int16_t **alpSrcx, uint8_t **dest,
> +; int dstW, int y)
>
> +;-----------------------------------------------------------------------------
> +
> +%if ARCH_X86_64
> +; Partial layout of the C SwsContext: everything before the yuv2rgb
> +; coefficients is skipped as opaque padding, so only the six fields read by
> +; this file get a named offset.
> +; NOTE: the padding size is a hard-coded byte offset and must stay equal to
> +; offsetof(SwsContext, yuv2rgb_y_offset) on the C side.
> +struc SwsContext
> + .padding: resb 40292 ; offsetof(SwsContext, yuv2rgb_y_offset)
> + .yuv2rgb_y_offset: resd 1
> + .yuv2rgb_y_coeff: resd 1
> + .yuv2rgb_v2r_coeff: resd 1
> + .yuv2rgb_v2g_coeff: resd 1
> + .yuv2rgb_u2g_coeff: resd 1
> + .yuv2rgb_u2b_coeff: resd 1
> +endstruc
> +
> +%define R m0 ; alias for vector register m0 (presumably the red plane accumulator - confirm in the function body)
> +%define G m1 ; alias for m1 (presumably green)
> +%define B m2 ; alias for m2 (presumably blue)
> +%define A m3 ; alias for m3 (presumably alpha)
> +
> +%define Y m4 ; alias for m4; note that CLIPP2 overwrites m4 as scratch
> +%define U m5 ; alias for m5; note that CLIPP2 overwrites m5 as scratch
> +%define V m6 ; alias for m6; note that CLIPP2 overwrites m6 as scratch
> +
> +; Clip each signed 32-bit lane of a vector to an unsigned power of two range.
> +; Vector form of av_clip_uintp2(a, p). Clobbers m4, m5 and m6 (the Y/U/V aliases).
> +; 1 - dest (vector register, clipped in place)
> +; 2 - bit position to clip at, given as the suffix of a pd_yuv2gbrp16_upper<N> mask constant (e.g. 30, 27)
> +%macro CLIPP2 2
> + ; m4 = saturated result: (~a) >> 31 & ((1<<p) - 1); i.e. 0 for negative a, the full mask otherwise
> + pcmpeqb m4, m4 ; m4 = all ones
> + pxor m4, %1 ; m4 = ~a
> + psrad m4, 31 ; arithmetic shift: broadcast the sign bit of ~a
> + movu m5, [pd_yuv2gbrp16_upper%2] ; m5 = (1<<p) - 1
> + pand m4, m5
> +
> + ; m5 = per-lane "already in range" mask: (a & ~((1<<p) - 1)) == 0
> + pandn m5, %1 ; m5 = ~mask & a
> + pxor m6, m6 ; m6 = 0
> + pcmpeqd m5, m6 ; all ones where no bit above the mask is set
> +%if cpuflag(avx2)
> + vpblendvb %1, m4, %1, m5 ; keep a where in range, otherwise take the saturated value in m4
> +%else
> + pxor %1, m4 ; bitwise select without a blend instruction:
> + pand %1, m5 ; ((a ^ m4) & m5) ^ m4 yields a where m5 is set,
> + pxor %1, m4 ; and m4 where m5 is clear
> +%endif
> +%endmacro
> +
> +; Load packed signed 16-bit values and sign-extend them to 32-bit lanes,
> +; filling one vector register: 8 dwords for AVX2, 4 dwords for SSE2/SSE4.
> +; 1 - dest (register number, expanded as m%1 / xm%1)
> +; 2 - source (memory operand)
> +%macro LOAD16 2
> + %if cpuflag(avx2)
> + movu xm%1, %2 ; 16 bytes = 8 words
> + %else
> + movsd xm%1, %2 ; 8 bytes = 4 words in the low qword
> + %endif
> +
> + %if cpuflag(sse4)
> + ; Unprefixed mnemonic on purpose: an explicit "v" prefix would force a
> + ; VEX encoding even in the plain SSE4 build. The AVX2 build relies on
> + ; the x86inc instruction wrappers to pick the VEX form.
> + pmovsxwd m%1, xm%1
> + %else
> + punpcklwd xm%1, xm%1 ; duplicate every word into both halves of its dword
> + psrad xm%1, 16 ; sign extend
> + %endif
> +%endmacro
The SSE4 path shouldn't use a VEX-prefixed instruction. I also think I have
a way to do the SSE2 load with fewer instructions.
I'll submit a new version of this patch.
More information about the ffmpeg-devel
mailing list