[FFmpeg-devel] [PATCH] Unroll the main loop of yuv2yuvX_sse3 and tidy the code for a ~20% speedup

Alan Kelly alankelly at google.com
Tue Sep 15 19:11:58 EEST 2020
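
Process 32 output pixels per loop iteration instead of 16 by keeping two
extra pairs of accumulators, and widen the dest alignment check from 16 to
32 bytes to match the wider stores. Note: the rewritten loop uses the
VEX-encoded AVX2 broadcasts vpbroadcastw/vpbroadcastq, so despite the
_sse3 name this path must only be selected on AVX2-capable CPUs; and, as
with the old 16-wide loop, full vectors are stored, so the tail may now
write up to 31 bytes past dstW when dstW+offset is not a multiple of 32.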


---
 libswscale/x86/swscale.c | 138 ++++++++++++++++++++-------------------
 1 file changed, 72 insertions(+), 66 deletions(-)
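
Note for reviewers (not part of the commit message): a scalar sketch of
what the unrolled loop computes, modeled on yuv2planeX_8_c() in
libswscale/output.c; the declarations and the av_clip_uint8() helper are
as in FFmpeg:

    for (int i = 0; i < dstW; i++) {
        int val = dither[(i + offset) & 7] << 12;
        int j;
        for (j = 0; j < filterSize; j++)
            val += src[j][i] * filter[j];
        dest[i] = av_clip_uint8(val >> 19); /* clip to the 0..255 output range */
    }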

diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 3160fedf04..e47fee2bbd 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -201,76 +201,82 @@ static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
                            const int16_t **src, uint8_t *dest, int dstW,
                            const uint8_t *dither, int offset)
 {
-    if(((uintptr_t)dest) & 15){
+    if (((uintptr_t)dest) & 31) { /* the unrolled loop stores 32 bytes per iteration */
         yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset);
         return;
     }
     filterSize--;
-#define MAIN_FUNCTION \
-        "pxor       %%xmm0, %%xmm0 \n\t" \
-        "punpcklbw  %%xmm0, %%xmm3 \n\t" \
-        "movd           %4, %%xmm1 \n\t" \
-        "punpcklwd  %%xmm1, %%xmm1 \n\t" \
-        "punpckldq  %%xmm1, %%xmm1 \n\t" \
-        "punpcklqdq %%xmm1, %%xmm1 \n\t" \
-        "psllw          $3, %%xmm1 \n\t" \
-        "paddw      %%xmm1, %%xmm3 \n\t" \
-        "psraw          $4, %%xmm3 \n\t" \
-        "movdqa     %%xmm3, %%xmm4 \n\t" \
-        "movdqa     %%xmm3, %%xmm7 \n\t" \
-        "movl           %3, %%ecx  \n\t" \
-        "mov                                 %0, %%"FF_REG_d"        \n\t"\
-        "mov                        (%%"FF_REG_d"), %%"FF_REG_S"     \n\t"\
-        ".p2align                             4             \n\t" /* FIXME Unroll? */\
-        "1:                                                 \n\t"\
-        "movddup                  8(%%"FF_REG_d"), %%xmm0   \n\t" /* filterCoeff */\
-        "movdqa              (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */\
-        "movdqa            16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */\
-        "add                                $16, %%"FF_REG_d"        \n\t"\
-        "mov                        (%%"FF_REG_d"), %%"FF_REG_S"     \n\t"\
-        "test                         %%"FF_REG_S", %%"FF_REG_S"     \n\t"\
-        "pmulhw                           %%xmm0, %%xmm2      \n\t"\
-        "pmulhw                           %%xmm0, %%xmm5      \n\t"\
-        "paddw                            %%xmm2, %%xmm3      \n\t"\
-        "paddw                            %%xmm5, %%xmm4      \n\t"\
-        " jnz                                1b             \n\t"\
-        "psraw                               $3, %%xmm3      \n\t"\
-        "psraw                               $3, %%xmm4      \n\t"\
-        "packuswb                         %%xmm4, %%xmm3      \n\t"\
-        "movntdq                          %%xmm3, (%1, %%"FF_REG_c") \n\t"\
-        "add                         $16, %%"FF_REG_c"        \n\t"\
-        "cmp                          %2, %%"FF_REG_c"        \n\t"\
-        "movdqa                   %%xmm7, %%xmm3            \n\t" \
-        "movdqa                   %%xmm7, %%xmm4            \n\t" \
-        "mov                                 %0, %%"FF_REG_d"        \n\t"\
-        "mov                        (%%"FF_REG_d"), %%"FF_REG_S"     \n\t"\
-        "jb                                  1b             \n\t"
-
-    if (offset) {
-        __asm__ volatile(
-            "movq          %5, %%xmm3  \n\t"
-            "movdqa    %%xmm3, %%xmm4  \n\t"
-            "psrlq        $24, %%xmm3  \n\t"
-            "psllq        $40, %%xmm4  \n\t"
-            "por       %%xmm4, %%xmm3  \n\t"
-            MAIN_FUNCTION
-              :: "g" (filter),
-              "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
-              "m"(filterSize), "m"(((uint64_t *) dither)[0])
-              : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,)
-                "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
-              );
-    } else {
-        __asm__ volatile(
-            "movq          %5, %%xmm3   \n\t"
-            MAIN_FUNCTION
-              :: "g" (filter),
-              "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
-              "m"(filterSize), "m"(((uint64_t *) dither)[0])
-              : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,)
-                "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
-              );
-    }
+    __asm__ volatile(
+        "vmovq                    %5, %%xmm3            \n\t"
+        "cmpl                     $0, %3                \n\t"
+        "jz                       2f                    \n\t"
+
+        "# offset != 0 path.                            \n\t"
+        "vpsrlq                  $24, %%xmm3, %%xmm5    \n\t"
+        "vpsllq                  $40, %%xmm3, %%xmm3    \n\t"
+        "vpor                 %%xmm3, %%xmm5, %%xmm3    \n\t"
+
+        "2:                                             \n\t"
+        "vpxor                %%xmm0, %%xmm0, %%xmm0    \n\t"
+        "mov                    (%0), %%"FF_REG_S"      \n\t"
+        "vpunpcklbw           %%xmm0, %%xmm3, %%xmm3    \n\t"
+        "vpbroadcastw             %4, %%xmm1            \n\t"
+        "vpsllw                   $3, %%xmm1, %%xmm1    \n\t"
+        "mov                      %0, %%"FF_REG_d"      \n\t"
+        "vpaddw               %%xmm1, %%xmm3, %%xmm3    \n\t"
+        "vpsraw                   $4, %%xmm3, %%xmm3    \n\t"
+        "vmovdqa              %%xmm3, %%xmm4            \n\t"
+        "vmovdqa              %%xmm3, %%xmm7            \n\t"
+        "vmovdqa              %%xmm3, %%xmm9            \n\t"
+        "vmovdqa              %%xmm3, %%xmm10            \n\t"
+        "movl                     %3, %%ecx             \n\t"
+
+        ".p2align                  4                    \n\t"
+        "1:                                             \n\t"
+        "vpbroadcastq 8(%%"FF_REG_d"), %%xmm0           \n\t" /* filterCoeff */
+        "vmovdqa       (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */
+        "vmovdqa     16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */
+        "vmovdqa     32(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm11 \n\t" /* srcData */
+        "vmovdqa     48(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm12 \n\t" /* srcData */
+        "add                     $16, %%"FF_REG_d"      \n\t"
+        "mov          (%%"FF_REG_d"), %%"FF_REG_S"      \n\t"
+        "vpmulhw              %%xmm0, %%xmm2, %%xmm2    \n\t"
+        "vpmulhw              %%xmm0, %%xmm5, %%xmm5    \n\t"
+        "vpmulhw              %%xmm0, %%xmm11, %%xmm11    \n\t"
+        "vpmulhw              %%xmm0, %%xmm12, %%xmm12    \n\t"
+        "vpaddw               %%xmm2, %%xmm3, %%xmm3    \n\t"
+        "vpaddw               %%xmm5, %%xmm4, %%xmm4    \n\t"
+        "vpaddw               %%xmm11, %%xmm9, %%xmm9    \n\t"
+        "vpaddw               %%xmm12, %%xmm10, %%xmm10    \n\t"
+        "test           %%"FF_REG_S", %%"FF_REG_S"      \n\t"
+        "jnz                      1b                    \n\t"
+
+        "vpsraw                   $3, %%xmm3, %%xmm3    \n\t"
+        "vpsraw                   $3, %%xmm4, %%xmm4    \n\t"
+        "vpsraw                   $3, %%xmm9, %%xmm9    \n\t"
+        "vpsraw                   $3, %%xmm10, %%xmm10    \n\t"
+        "vpackuswb            %%xmm4, %%xmm3, %%xmm3    \n\t"
+        "vpackuswb            %%xmm10, %%xmm9, %%xmm9    \n\t"
+        "mov                    (%0), %%"FF_REG_S"      \n\t"
+        "vmovntdq              %%xmm3, (%1, %%"FF_REG_c")\n\t"
+        "vmovntdq              %%xmm9, 16(%1, %%"FF_REG_c")\n\t"
+        "add                     $32, %%"FF_REG_c"      \n\t"
+        "vmovdqa              %%xmm7, %%xmm3            \n\t"
+        "vmovdqa              %%xmm7, %%xmm4            \n\t"
+        "vmovdqa              %%xmm7, %%xmm9            \n\t"
+        "vmovdqa              %%xmm7, %%xmm10            \n\t"
+        "mov                      %0, %%"FF_REG_d"      \n\t"
+        "cmp                      %2, %%"FF_REG_c"      \n\t"
+        "jb                       1b                    \n\t"
+
+        :
+        : "r" (filter),
+          "r" (dest-offset), "r" ((int64_t)(dstW+offset)), "m" (offset),
+          "m"(filterSize), "m"(((uint64_t *) dither)[0])
+        : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" ,
+                       "%xmm5" , "%xmm7" , "%xmm9", "%xmm10" , "%xmm11" , "%xmm12" ,)
+          "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
+    );
 }
 #endif
 
-- 
2.28.0.618.gf4bc123cb7-goog