[FFmpeg-devel] [PATCH] Unroll the main loop of yuv2yuvX_sse3 and tidy the code for a ~20% speedup
Alan Kelly
alankelly at google.com
Tue Sep 15 19:11:58 EEST 2020
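
Rewrite the inner loop of yuv2yuvX_sse3 to produce 32 output pixels per
iteration instead of 16, using four parallel accumulators, and merge the
offset and non-offset setup paths into a single asm block selected by a
runtime branch. The new code uses VEX three-operand forms (including the
AVX2 broadcasts vpbroadcastw and vpbroadcastq) and streams results with
non-temporal stores; destinations that are not 32-byte aligned fall back
to yuv2yuvX_mmxext as before. The measured speedup is roughly 20%.

For reference, the operation being vectorized is, in outline, the
following scalar loop (a paraphrase of libswscale's generic C path, not
code from this patch; av_clip_uint8() is FFmpeg's byte-clamping helper):

    static void yuv2yuvX_ref(const int16_t *filter, int filterSize,
                             const int16_t **src, uint8_t *dest, int dstW,
                             const uint8_t *dither, int offset)
    {
        for (int i = 0; i < dstW; i++) {
            int val = dither[(i + offset) & 7] << 12;
            for (int j = 0; j < filterSize; j++)
                val += src[j][i] * filter[j];   /* accumulate filter taps */
            dest[i] = av_clip_uint8(val >> 19); /* clamp to 0..255 */
        }
    }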
---
libswscale/x86/swscale.c | 138 ++++++++++++++++++++-------------------
1 file changed, 72 insertions(+), 66 deletions(-)
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 3160fedf04..e47fee2bbd 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -201,76 +201,82 @@ static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset)
{
- if(((uintptr_t)dest) & 15){
+ if(((uintptr_t)dest) & 31){
yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset);
return;
}
filterSize--;
-#define MAIN_FUNCTION \
- "pxor %%xmm0, %%xmm0 \n\t" \
- "punpcklbw %%xmm0, %%xmm3 \n\t" \
- "movd %4, %%xmm1 \n\t" \
- "punpcklwd %%xmm1, %%xmm1 \n\t" \
- "punpckldq %%xmm1, %%xmm1 \n\t" \
- "punpcklqdq %%xmm1, %%xmm1 \n\t" \
- "psllw $3, %%xmm1 \n\t" \
- "paddw %%xmm1, %%xmm3 \n\t" \
- "psraw $4, %%xmm3 \n\t" \
- "movdqa %%xmm3, %%xmm4 \n\t" \
- "movdqa %%xmm3, %%xmm7 \n\t" \
- "movl %3, %%ecx \n\t" \
- "mov %0, %%"FF_REG_d" \n\t"\
- "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
- ".p2align 4 \n\t" /* FIXME Unroll? */\
- "1: \n\t"\
- "movddup 8(%%"FF_REG_d"), %%xmm0 \n\t" /* filterCoeff */\
- "movdqa (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */\
- "movdqa 16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */\
- "add $16, %%"FF_REG_d" \n\t"\
- "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
- "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
- "pmulhw %%xmm0, %%xmm2 \n\t"\
- "pmulhw %%xmm0, %%xmm5 \n\t"\
- "paddw %%xmm2, %%xmm3 \n\t"\
- "paddw %%xmm5, %%xmm4 \n\t"\
- " jnz 1b \n\t"\
- "psraw $3, %%xmm3 \n\t"\
- "psraw $3, %%xmm4 \n\t"\
- "packuswb %%xmm4, %%xmm3 \n\t"\
- "movntdq %%xmm3, (%1, %%"FF_REG_c") \n\t"\
- "add $16, %%"FF_REG_c" \n\t"\
- "cmp %2, %%"FF_REG_c" \n\t"\
- "movdqa %%xmm7, %%xmm3 \n\t" \
- "movdqa %%xmm7, %%xmm4 \n\t" \
- "mov %0, %%"FF_REG_d" \n\t"\
- "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
- "jb 1b \n\t"
-
- if (offset) {
- __asm__ volatile(
- "movq %5, %%xmm3 \n\t"
- "movdqa %%xmm3, %%xmm4 \n\t"
- "psrlq $24, %%xmm3 \n\t"
- "psllq $40, %%xmm4 \n\t"
- "por %%xmm4, %%xmm3 \n\t"
- MAIN_FUNCTION
- :: "g" (filter),
- "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
- "m"(filterSize), "m"(((uint64_t *) dither)[0])
- : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,)
- "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
- );
- } else {
- __asm__ volatile(
- "movq %5, %%xmm3 \n\t"
- MAIN_FUNCTION
- :: "g" (filter),
- "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
- "m"(filterSize), "m"(((uint64_t *) dither)[0])
- : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,)
- "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
- );
- }
+ __asm__ volatile(
+ "vmovq %5, %%xmm3 \n\t"
+ "cmpl $0, %3 \n\t"
+ "jz 2f \n\t"
+
+ "# offset != 0 path. \n\t"
+ "vpsrlq $24, %%xmm3, %%xmm5 \n\t"
+ "vpsllq $40, %%xmm3, %%xmm3 \n\t"
+ "vpor %%xmm3, %%xmm5, %%xmm3 \n\t"
+
+ "2: \n\t"
+ "vpxor %%xmm0, %%xmm0, %%xmm0 \n\t"
+ "mov (%0), %%"FF_REG_S" \n\t"
+ "vpunpcklbw %%xmm0, %%xmm3, %%xmm3 \n\t"
+ "vpbroadcastw %4, %%xmm1 \n\t"
+ "vpsllw $3, %%xmm1, %%xmm1 \n\t"
+ "mov %0, %%"FF_REG_d" \n\t"
+ "vpaddw %%xmm1, %%xmm3, %%xmm3 \n\t"
+ "vpsraw $4, %%xmm3, %%xmm3 \n\t"
+ "vmovdqa %%xmm3, %%xmm4 \n\t"
+ "vmovdqa %%xmm3, %%xmm7 \n\t"
+ "vmovdqa %%xmm3, %%xmm9 \n\t"
+ "vmovdqa %%xmm3, %%xmm10 \n\t"
+ "movl %3, %%ecx \n\t"
+
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "vpbroadcastq 8(%%"FF_REG_d"), %%xmm0 \n\t" /* filterCoeff */
+ "vmovdqa (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */
+ "vmovdqa 16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */
+ "vmovdqa 32(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm11 \n\t" /* srcData */
+ "vmovdqa 48(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm12 \n\t" /* srcData */
+ "add $16, %%"FF_REG_d" \n\t"
+ "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"
+ "vpmulhw %%xmm0, %%xmm2, %%xmm2 \n\t"
+ "vpmulhw %%xmm0, %%xmm5, %%xmm5 \n\t"
+ "vpmulhw %%xmm0, %%xmm11, %%xmm11 \n\t"
+ "vpmulhw %%xmm0, %%xmm12, %%xmm12 \n\t"
+ "vpaddw %%xmm2, %%xmm3, %%xmm3 \n\t"
+ "vpaddw %%xmm5, %%xmm4, %%xmm4 \n\t"
+ "vpaddw %%xmm11, %%xmm9, %%xmm9 \n\t"
+ "vpaddw %%xmm12, %%xmm10, %%xmm10 \n\t"
+ "test %%"FF_REG_S", %%"FF_REG_S" \n\t"
+ "jnz 1b \n\t"
+
+ "vpsraw $3, %%xmm3, %%xmm3 \n\t"
+ "vpsraw $3, %%xmm4, %%xmm4 \n\t"
+ "vpsraw $3, %%xmm9, %%xmm9 \n\t"
+ "vpsraw $3, %%xmm10, %%xmm10 \n\t"
+ "vpackuswb %%xmm4, %%xmm3, %%xmm3 \n\t"
+ "vpackuswb %%xmm10, %%xmm9, %%xmm9 \n\t"
+ "mov (%0), %%"FF_REG_S" \n\t"
+ "vmovntdq %%xmm3, (%1, %%"FF_REG_c")\n\t"
+ "vmovntdq %%xmm9, 16(%1, %%"FF_REG_c")\n\t"
+ "add $32, %%"FF_REG_c" \n\t"
+ "vmovdqa %%xmm7, %%xmm3 \n\t"
+ "vmovdqa %%xmm7, %%xmm4 \n\t"
+ "vmovdqa %%xmm7, %%xmm9 \n\t"
+ "vmovdqa %%xmm7, %%xmm10 \n\t"
+ "mov %0, %%"FF_REG_d" \n\t"
+ "cmp %2, %%"FF_REG_c" \n\t"
+ "jb 1b \n\t"
+
+ :
+ : "r" (filter),
+ "r" (dest-offset), "r" ((int64_t)(dstW+offset)), "m" (offset),
+ "m"(filterSize), "m"(((uint64_t *) dither)[0])
+ : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" ,
+ "%xmm5" , "%xmm7" , "%xmm9", "%xmm10" , "%xmm11" , "%xmm12" ,)
+ "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
+ );
}
#endif
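
The core of the unrolled loop, rendered as SSE2 intrinsics for
illustration (a sketch of the same technique, not code from this patch;
accumulate_tap and its parameter names are hypothetical):

    #include <emmintrin.h>

    /* One filter tap of the unrolled loop: four 8-pixel vectors are
     * multiplied by the broadcast coefficient and added into four
     * independent accumulators. pmulhw keeps the signed high half of
     * the 16x16-bit product, i.e. (a * b) >> 16. */
    static inline void accumulate_tap(const int16_t *srcline, int x,
                                      __m128i coeff,
                                      __m128i *acc0, __m128i *acc1,
                                      __m128i *acc2, __m128i *acc3)
    {
        __m128i s0 = _mm_load_si128((const __m128i *)(srcline + x));
        __m128i s1 = _mm_load_si128((const __m128i *)(srcline + x + 8));
        __m128i s2 = _mm_load_si128((const __m128i *)(srcline + x + 16));
        __m128i s3 = _mm_load_si128((const __m128i *)(srcline + x + 24));
        *acc0 = _mm_add_epi16(*acc0, _mm_mulhi_epi16(s0, coeff));
        *acc1 = _mm_add_epi16(*acc1, _mm_mulhi_epi16(s1, coeff));
        *acc2 = _mm_add_epi16(*acc2, _mm_mulhi_epi16(s2, coeff));
        *acc3 = _mm_add_epi16(*acc3, _mm_mulhi_epi16(s3, coeff));
    }

Four independent accumulators give the out-of-order core separate
dependency chains to hide the pmulhw latency, and the two vmovntdq
stores per iteration write a full 32-byte block without polluting the
cache; the tightened "& 31" check guarantees both stores in each block
are aligned, with yuv2yuvX_mmxext covering everything else.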
--
2.28.0.618.gf4bc123cb7-goog