[FFmpeg-devel] [PATCH] Unrolls main loop of yuv2yuvX_sse3 and general code tidying for ~20% speedup

Michael Niedermayer michael at niedermayer.cc
Wed Sep 16 01:39:44 EEST 2020


On Tue, Sep 15, 2020 at 06:11:58PM +0200, Alan Kelly wrote:
> ---
>  libswscale/x86/swscale.c | 138 ++++++++++++++++++++-------------------
>  1 file changed, 72 insertions(+), 66 deletions(-)
> 
> diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
> index 3160fedf04..e47fee2bbd 100644
> --- a/libswscale/x86/swscale.c
> +++ b/libswscale/x86/swscale.c
> @@ -201,76 +201,82 @@ static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
>                             const int16_t **src, uint8_t *dest, int dstW,
>                             const uint8_t *dither, int offset)
>  {
> -    if(((uintptr_t)dest) & 15){
> +    if(((uintptr_t)dest) & 31){
>          yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset);
>          return;
>      }
>      filterSize--;
> -#define MAIN_FUNCTION \
> -        "pxor       %%xmm0, %%xmm0 \n\t" \
> -        "punpcklbw  %%xmm0, %%xmm3 \n\t" \
> -        "movd           %4, %%xmm1 \n\t" \
> -        "punpcklwd  %%xmm1, %%xmm1 \n\t" \
> -        "punpckldq  %%xmm1, %%xmm1 \n\t" \
> -        "punpcklqdq %%xmm1, %%xmm1 \n\t" \
> -        "psllw          $3, %%xmm1 \n\t" \
> -        "paddw      %%xmm1, %%xmm3 \n\t" \
> -        "psraw          $4, %%xmm3 \n\t" \
> -        "movdqa     %%xmm3, %%xmm4 \n\t" \
> -        "movdqa     %%xmm3, %%xmm7 \n\t" \
> -        "movl           %3, %%ecx  \n\t" \
> -        "mov                                 %0, %%"FF_REG_d"        \n\t"\
> -        "mov                        (%%"FF_REG_d"), %%"FF_REG_S"     \n\t"\
> -        ".p2align                             4             \n\t" /* FIXME Unroll? */\
> -        "1:                                                 \n\t"\
> -        "movddup                  8(%%"FF_REG_d"), %%xmm0   \n\t" /* filterCoeff */\
> -        "movdqa              (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */\
> -        "movdqa            16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */\
> -        "add                                $16, %%"FF_REG_d"        \n\t"\
> -        "mov                        (%%"FF_REG_d"), %%"FF_REG_S"     \n\t"\
> -        "test                         %%"FF_REG_S", %%"FF_REG_S"     \n\t"\
> -        "pmulhw                           %%xmm0, %%xmm2      \n\t"\
> -        "pmulhw                           %%xmm0, %%xmm5      \n\t"\
> -        "paddw                            %%xmm2, %%xmm3      \n\t"\
> -        "paddw                            %%xmm5, %%xmm4      \n\t"\
> -        " jnz                                1b             \n\t"\
> -        "psraw                               $3, %%xmm3      \n\t"\
> -        "psraw                               $3, %%xmm4      \n\t"\
> -        "packuswb                         %%xmm4, %%xmm3      \n\t"\
> -        "movntdq                          %%xmm3, (%1, %%"FF_REG_c") \n\t"\
> -        "add                         $16, %%"FF_REG_c"        \n\t"\
> -        "cmp                          %2, %%"FF_REG_c"        \n\t"\
> -        "movdqa                   %%xmm7, %%xmm3            \n\t" \
> -        "movdqa                   %%xmm7, %%xmm4            \n\t" \
> -        "mov                                 %0, %%"FF_REG_d"        \n\t"\
> -        "mov                        (%%"FF_REG_d"), %%"FF_REG_S"     \n\t"\
> -        "jb                                  1b             \n\t"
> -
> -    if (offset) {
> -        __asm__ volatile(
> -            "movq          %5, %%xmm3  \n\t"
> -            "movdqa    %%xmm3, %%xmm4  \n\t"
> -            "psrlq        $24, %%xmm3  \n\t"
> -            "psllq        $40, %%xmm4  \n\t"
> -            "por       %%xmm4, %%xmm3  \n\t"
> -            MAIN_FUNCTION
> -              :: "g" (filter),
> -              "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
> -              "m"(filterSize), "m"(((uint64_t *) dither)[0])
> -              : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,)
> -                "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
> -              );
> -    } else {
> -        __asm__ volatile(
> -            "movq          %5, %%xmm3   \n\t"
> -            MAIN_FUNCTION
> -              :: "g" (filter),
> -              "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
> -              "m"(filterSize), "m"(((uint64_t *) dither)[0])
> -              : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,)
> -                "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
> -              );
> -    }
> +    __asm__ volatile(
> +        "vmovq                    %5, %%xmm3            \n\t"
> +        "cmpl                     $0, %3                \n\t"
> +        "jz                       2f                    \n\t"
> +
> +        "# offset != 0 path.                            \n\t"
> +        "vpsrlq                  $24, %%xmm3, %%xmm5    \n\t"
> +        "vpsllq                  $40, %%xmm3, %%xmm3    \n\t"
> +        "vpor                 %%xmm3, %%xmm5, %%xmm3    \n\t"
> +
> +        "2:                                             \n\t"
> +        "vpxor                %%xmm0, %%xmm0, %%xmm0    \n\t"
> +        "mov                    (%0), %%"FF_REG_S"      \n\t"
> +        "vpunpcklbw           %%xmm0, %%xmm3, %%xmm3    \n\t"
> +        "vpbroadcastw             %4, %%xmm1            \n\t"
> +        "vpsllw                   $3, %%xmm1, %%xmm1    \n\t"
> +        "mov                      %0, %%"FF_REG_d"      \n\t"
> +        "vpaddw               %%xmm1, %%xmm3, %%xmm3    \n\t"
> +        "vpsraw                   $4, %%xmm3, %%xmm3    \n\t"
> +        "vmovdqa              %%xmm3, %%xmm4            \n\t"
> +        "vmovdqa              %%xmm3, %%xmm7            \n\t"
> +        "vmovdqa              %%xmm3, %%xmm9            \n\t"
> +        "vmovdqa              %%xmm3, %%xmm10            \n\t"
> +        "movl                     %3, %%ecx             \n\t"
> +
> +        ".p2align                  4                    \n\t"
> +        "1:                                             \n\t"
> +        "vpbroadcastq 8(%%"FF_REG_d"), %%xmm0           \n\t" /* filterCoeff */
> +        "vmovdqa       (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */
> +        "vmovdqa     16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */
> +        "vmovdqa     32(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm11 \n\t" /* srcData */
> +        "vmovdqa     48(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm12 \n\t" /* srcData */
> +        "add                     $16, %%"FF_REG_d"      \n\t"
> +        "mov          (%%"FF_REG_d"), %%"FF_REG_S"      \n\t"
> +        "vpmulhw              %%xmm0, %%xmm2, %%xmm2    \n\t"
> +        "vpmulhw              %%xmm0, %%xmm5, %%xmm5    \n\t"
> +        "vpmulhw              %%xmm0, %%xmm11, %%xmm11    \n\t"
> +        "vpmulhw              %%xmm0, %%xmm12, %%xmm12    \n\t"
> +        "vpaddw               %%xmm2, %%xmm3, %%xmm3    \n\t"
> +        "vpaddw               %%xmm5, %%xmm4, %%xmm4    \n\t"
> +        "vpaddw               %%xmm11, %%xmm9, %%xmm9    \n\t"
> +        "vpaddw               %%xmm12, %%xmm10, %%xmm10    \n\t"
> +        "test           %%"FF_REG_S", %%"FF_REG_S"      \n\t"
> +        "jnz                      1b                    \n\t"
> +
> +        "vpsraw                   $3, %%xmm3, %%xmm3    \n\t"
> +        "vpsraw                   $3, %%xmm4, %%xmm4    \n\t"
> +        "vpsraw                   $3, %%xmm9, %%xmm9    \n\t"
> +        "vpsraw                   $3, %%xmm10, %%xmm10    \n\t"
> +        "vpackuswb            %%xmm4, %%xmm3, %%xmm3    \n\t"
> +        "vpackuswb            %%xmm10, %%xmm9, %%xmm9    \n\t"
> +        "mov                    (%0), %%"FF_REG_S"      \n\t"
> +        "vmovntdq              %%xmm3, (%1, %%"FF_REG_c")\n\t"
> +        "vmovntdq              %%xmm9, 16(%1, %%"FF_REG_c")\n\t"
> +        "add                     $32, %%"FF_REG_c"      \n\t"
> +        "vmovdqa              %%xmm7, %%xmm3            \n\t"
> +        "vmovdqa              %%xmm7, %%xmm4            \n\t"
> +        "vmovdqa              %%xmm7, %%xmm9            \n\t"
> +        "vmovdqa              %%xmm7, %%xmm10            \n\t"
> +        "mov                      %0, %%"FF_REG_d"      \n\t"
> +        "cmp                      %2, %%"FF_REG_c"      \n\t"
> +        "jb                       1b                    \n\t"
> +
> +        :
> +        : "r" (filter),
> +          "r" (dest-offset), "r" ((int64_t)(dstW+offset)), "m" (offset),
> +          "m"(filterSize), "m"(((uint64_t *) dither)[0])
> +        : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" ,
> +                       "%xmm5" , "%xmm7" , "%xmm9", "%xmm10" , "%xmm11" , "%xmm12" ,)
> +          "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
> +    );
>  }
>  #endif
>  

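For reference, the scalar path this has to match (yuv2planeX_8_c in
libswscale/output.c) computes roughly the following; the code below is a
paraphrase for illustration, not a verbatim copy:

    #include <stdint.h>
    #include "libavutil/common.h"                       /* av_clip_uint8() */

    static void yuv2planeX_8_ref(const int16_t *filter, int filterSize,
                                 const int16_t **src, uint8_t *dest,
                                 int dstW, const uint8_t *dither, int offset)
    {
        for (int i = 0; i < dstW; i++) {
            int val = dither[(i + offset) & 7] << 12;   /* dither in Q12 */
            for (int j = 0; j < filterSize; j++)
                val += src[j][i] * filter[j];           /* accumulate taps */
            dest[i] = av_clip_uint8(val >> 19);         /* Q19 -> 8 bit */
        }
    }

The asm gets the same result with pmulhw, i.e. (src * coeff) >> 16 per tap,
the dither preloaded into the accumulators, and the final psraw $3 supplying
the remaining shift (16 + 3 = 19); packuswb does the clipping.
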
this breaks the build on x86-32 (xmm9-xmm12 do not exist in 32-bit mode);
also, new asm should probably be written for nasm/yasm rather than as gcc
inline asm
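
For illustration, a rough, untested sketch of what this could look like in
x86inc.asm syntax. The labels, register counts and the single-column loop
are illustrative only, not a working port: the unrolling, dither setup and
alignment fallback are omitted, and arguments are assumed pointer-sized.
It walks the same NULL-terminated {src pointer, coefficient} pair list
(16-byte stride, coefficient pre-splatted per 16-bit lane) as the inline
asm does:

    %include "libavutil/x86/x86util.asm"

    SECTION .text

    INIT_XMM sse3
    cglobal yuv2yuvX, 3, 6, 4, filter, dest, dstW
        xor          r3, r3               ; output offset (pixels == bytes)
    .column:
        pxor         m0, m0               ; word accumulator, pixels 0..7
        pxor         m1, m1               ; word accumulator, pixels 8..15
        mov          r4, filterq          ; rewind the tap list
    .tap:
        mov          r5, [r4]             ; this tap's source line; NULL ends it
        test         r5, r5
        jz           .store
        movddup      m2, [r4 + 8]         ; coefficient, splatted per word
        movdqa       m3, [r5 + r3*2]      ; 8 int16_t input samples
        pmulhw       m3, m2               ; (src * coeff) >> 16
        paddw        m0, m3
        movdqa       m3, [r5 + r3*2 + 16] ; next 8 samples
        pmulhw       m3, m2
        paddw        m1, m3
        add          r4, 16               ; next {src, coeff} pair
        jmp          .tap
    .store:
        psraw        m0, 3                ; remaining shift: 16 + 3 = 19 total
        psraw        m1, 3
        packuswb     m0, m1               ; saturate words to unsigned bytes
        movntdq      [destq + r3], m0
        add          r3, 16
        cmp          r3, dstWq
        jb           .column
        RET

cglobal makes the GPR/XMM budget explicit (6 and 4 here); on x86-32 only
m0-m7 exist, so an unrolled version would keep its extra accumulators under
an ARCH_X86_64 guard.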

thx

[...]

-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

In fact, the RIAA has been known to suggest that students drop out
of college or go to community college in order to be able to afford
settlements. -- The RIAA