[FFmpeg-devel] [PATCH] Unrolls main loop of yuv2yuvX_sse3 and general code tidying for ~20% speedup
Michael Niedermayer
michael at niedermayer.cc
Wed Sep 16 01:39:44 EEST 2020
On Tue, Sep 15, 2020 at 06:11:58PM +0200, Alan Kelly wrote:
> ---
> libswscale/x86/swscale.c | 138 ++++++++++++++++++++-------------------
> 1 file changed, 72 insertions(+), 66 deletions(-)
>
> diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
> index 3160fedf04..e47fee2bbd 100644
> --- a/libswscale/x86/swscale.c
> +++ b/libswscale/x86/swscale.c
> @@ -201,76 +201,82 @@ static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
> const int16_t **src, uint8_t *dest, int dstW,
> const uint8_t *dither, int offset)
> {
> - if(((uintptr_t)dest) & 15){
> + if(((uintptr_t)dest) & 31){
> yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset);
> return;
> }
> filterSize--;
> -#define MAIN_FUNCTION \
> - "pxor %%xmm0, %%xmm0 \n\t" \
> - "punpcklbw %%xmm0, %%xmm3 \n\t" \
> - "movd %4, %%xmm1 \n\t" \
> - "punpcklwd %%xmm1, %%xmm1 \n\t" \
> - "punpckldq %%xmm1, %%xmm1 \n\t" \
> - "punpcklqdq %%xmm1, %%xmm1 \n\t" \
> - "psllw $3, %%xmm1 \n\t" \
> - "paddw %%xmm1, %%xmm3 \n\t" \
> - "psraw $4, %%xmm3 \n\t" \
> - "movdqa %%xmm3, %%xmm4 \n\t" \
> - "movdqa %%xmm3, %%xmm7 \n\t" \
> - "movl %3, %%ecx \n\t" \
> - "mov %0, %%"FF_REG_d" \n\t"\
> - "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
> - ".p2align 4 \n\t" /* FIXME Unroll? */\
> - "1: \n\t"\
> - "movddup 8(%%"FF_REG_d"), %%xmm0 \n\t" /* filterCoeff */\
> - "movdqa (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */\
> - "movdqa 16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */\
> - "add $16, %%"FF_REG_d" \n\t"\
> - "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
> - "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
> - "pmulhw %%xmm0, %%xmm2 \n\t"\
> - "pmulhw %%xmm0, %%xmm5 \n\t"\
> - "paddw %%xmm2, %%xmm3 \n\t"\
> - "paddw %%xmm5, %%xmm4 \n\t"\
> - " jnz 1b \n\t"\
> - "psraw $3, %%xmm3 \n\t"\
> - "psraw $3, %%xmm4 \n\t"\
> - "packuswb %%xmm4, %%xmm3 \n\t"\
> - "movntdq %%xmm3, (%1, %%"FF_REG_c") \n\t"\
> - "add $16, %%"FF_REG_c" \n\t"\
> - "cmp %2, %%"FF_REG_c" \n\t"\
> - "movdqa %%xmm7, %%xmm3 \n\t" \
> - "movdqa %%xmm7, %%xmm4 \n\t" \
> - "mov %0, %%"FF_REG_d" \n\t"\
> - "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
> - "jb 1b \n\t"
> -
> - if (offset) {
> - __asm__ volatile(
> - "movq %5, %%xmm3 \n\t"
> - "movdqa %%xmm3, %%xmm4 \n\t"
> - "psrlq $24, %%xmm3 \n\t"
> - "psllq $40, %%xmm4 \n\t"
> - "por %%xmm4, %%xmm3 \n\t"
> - MAIN_FUNCTION
> - :: "g" (filter),
> - "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
> - "m"(filterSize), "m"(((uint64_t *) dither)[0])
> - : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,)
> - "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
> - );
> - } else {
> - __asm__ volatile(
> - "movq %5, %%xmm3 \n\t"
> - MAIN_FUNCTION
> - :: "g" (filter),
> - "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
> - "m"(filterSize), "m"(((uint64_t *) dither)[0])
> - : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,)
> - "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
> - );
> - }
> + __asm__ volatile(
> + "vmovq %5, %%xmm3 \n\t"
> + "cmpl $0, %3 \n\t"
> + "jz 2f \n\t"
> +
> + "# offset != 0 path. \n\t"
> + "vpsrlq $24, %%xmm3, %%xmm5 \n\t"
> + "vpsllq $40, %%xmm3, %%xmm3 \n\t"
> + "vpor %%xmm3, %%xmm5, %%xmm3 \n\t"
> +
> + "2: \n\t"
> + "vpxor %%xmm0, %%xmm0, %%xmm0 \n\t"
> + "mov (%0), %%"FF_REG_S" \n\t"
> + "vpunpcklbw %%xmm0, %%xmm3, %%xmm3 \n\t"
> + "vpbroadcastw %4, %%xmm1 \n\t"
> + "vpsllw $3, %%xmm1, %%xmm1 \n\t"
> + "mov %0, %%"FF_REG_d" \n\t"
> + "vpaddw %%xmm1, %%xmm3, %%xmm3 \n\t"
> + "vpsraw $4, %%xmm3, %%xmm3 \n\t"
> + "vmovdqa %%xmm3, %%xmm4 \n\t"
> + "vmovdqa %%xmm3, %%xmm7 \n\t"
> + "vmovdqa %%xmm3, %%xmm9 \n\t"
> + "vmovdqa %%xmm3, %%xmm10 \n\t"
> + "movl %3, %%ecx \n\t"
> +
> + ".p2align 4 \n\t"
> + "1: \n\t"
> + "vpbroadcastq 8(%%"FF_REG_d"), %%xmm0 \n\t" /* filterCoeff */
> + "vmovdqa (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */
> + "vmovdqa 16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */
> + "vmovdqa 32(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm11 \n\t" /* srcData */
> + "vmovdqa 48(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm12 \n\t" /* srcData */
> + "add $16, %%"FF_REG_d" \n\t"
> + "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"
> + "vpmulhw %%xmm0, %%xmm2, %%xmm2 \n\t"
> + "vpmulhw %%xmm0, %%xmm5, %%xmm5 \n\t"
> + "vpmulhw %%xmm0, %%xmm11, %%xmm11 \n\t"
> + "vpmulhw %%xmm0, %%xmm12, %%xmm12 \n\t"
> + "vpaddw %%xmm2, %%xmm3, %%xmm3 \n\t"
> + "vpaddw %%xmm5, %%xmm4, %%xmm4 \n\t"
> + "vpaddw %%xmm11, %%xmm9, %%xmm9 \n\t"
> + "vpaddw %%xmm12, %%xmm10, %%xmm10 \n\t"
> + "test %%"FF_REG_S", %%"FF_REG_S" \n\t"
> + "jnz 1b \n\t"
> +
> + "vpsraw $3, %%xmm3, %%xmm3 \n\t"
> + "vpsraw $3, %%xmm4, %%xmm4 \n\t"
> + "vpsraw $3, %%xmm9, %%xmm9 \n\t"
> + "vpsraw $3, %%xmm10, %%xmm10 \n\t"
> + "vpackuswb %%xmm4, %%xmm3, %%xmm3 \n\t"
> + "vpackuswb %%xmm10, %%xmm9, %%xmm9 \n\t"
> + "mov (%0), %%"FF_REG_S" \n\t"
> + "vmovntdq %%xmm3, (%1, %%"FF_REG_c")\n\t"
> + "vmovntdq %%xmm9, 16(%1, %%"FF_REG_c")\n\t"
> + "add $32, %%"FF_REG_c" \n\t"
> + "vmovdqa %%xmm7, %%xmm3 \n\t"
> + "vmovdqa %%xmm7, %%xmm4 \n\t"
> + "vmovdqa %%xmm7, %%xmm9 \n\t"
> + "vmovdqa %%xmm7, %%xmm10 \n\t"
> + "mov %0, %%"FF_REG_d" \n\t"
> + "cmp %2, %%"FF_REG_c" \n\t"
> + "jb 1b \n\t"
> +
> + :
> + : "r" (filter),
> + "r" (dest-offset), "r" ((int64_t)(dstW+offset)), "m" (offset),
> + "m"(filterSize), "m"(((uint64_t *) dither)[0])
> + : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" ,
> + "%xmm5" , "%xmm7" , "%xmm9", "%xmm10" , "%xmm11" , "%xmm12" ,)
> + "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
> + );
> }
> #endif
>
This breaks the build on x86-32. Also, new asm should probably be written in nasm/yasm rather than as GCC inline asm.
thx
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
In fact, the RIAA has been known to suggest that students drop out
of college or go to community college in order to be able to afford
settlements. -- The RIAA
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 195 bytes
Desc: not available
URL: <https://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20200916/bda9d0c6/attachment.sig>
More information about the ffmpeg-devel mailing list