[FFmpeg-devel] [PATCH] Make swscale x86 ASM depend less on registers been preserved across blocks

Michael Niedermayer michaelni at gmx.at
Wed Sep 17 22:31:27 CEST 2014


On Wed, Sep 17, 2014 at 09:21:08PM +0200, Vitor Sessak wrote:
> Hi!
> 
> I'm not sure we can count on registers been preserved across ASM
> blocks. Moreover, this causes problems with utils that instrument the
> code by inserting function calls between lines (like ThreadSanitizer).
> 
> The attached patch fix one instance of this problem in swscale.
> 
> -Vitor

>  swscale.c |   82 +++++++++++++++++++++++++++++++-------------------------------
>  1 file changed, 42 insertions(+), 40 deletions(-)
> 7fe9f7a0c1a7c1ed843b891f5f810fbd94a2e03f  0001-swscale-x86-do-not-expect-registers-to-be-preserved-.patch
> From f1c59628b2baeb9994bed8947c0a2c228610bf4f Mon Sep 17 00:00:00 2001
> From: Vitor Sessak <vsessak at google.com>
> Date: Wed, 17 Sep 2014 21:10:16 +0200
> Subject: [PATCH] swscale/x86: do not expect registers to be preserved across
>  inline ASM blocks
> 
> ---
>  libswscale/x86/swscale.c | 82 +++++++++++++++++++++++++-----------------------
>  1 file changed, 42 insertions(+), 40 deletions(-)
> 
> diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
> index c4c0e28..38157c8 100644
> --- a/libswscale/x86/swscale.c
> +++ b/libswscale/x86/swscale.c
> @@ -205,36 +205,19 @@ static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
>          yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset);
>          return;
>      }
> -    if (offset) {
> -        __asm__ volatile("movq       (%0), %%xmm3\n\t"
> -                         "movdqa    %%xmm3, %%xmm4\n\t"
> -                         "psrlq       $24, %%xmm3\n\t"
> -                         "psllq       $40, %%xmm4\n\t"
> -                         "por       %%xmm4, %%xmm3\n\t"
> -                         :: "r"(dither)
> -                         );
> -    } else {
> -        __asm__ volatile("movq       (%0), %%xmm3\n\t"
> -                         :: "r"(dither)
> -                         );
> -    }

> -    filterSize--;

not sure i missed it but it seems this is lost in teh new code


> -    __asm__ volatile(
> -        "pxor      %%xmm0, %%xmm0\n\t"
> -        "punpcklbw %%xmm0, %%xmm3\n\t"
> -        "movd          %0, %%xmm1\n\t"
> -        "punpcklwd %%xmm1, %%xmm1\n\t"
> -        "punpckldq %%xmm1, %%xmm1\n\t"
> -        "punpcklqdq %%xmm1, %%xmm1\n\t"
> -        "psllw         $3, %%xmm1\n\t"
> -        "paddw     %%xmm1, %%xmm3\n\t"
> -        "psraw         $4, %%xmm3\n\t"
> -        ::"m"(filterSize)
> -     );
> -    __asm__ volatile(
> -        "movdqa    %%xmm3, %%xmm4\n\t"
> -        "movdqa    %%xmm3, %%xmm7\n\t"
> -        "movl %3, %%ecx\n\t"
> +#define MAIN_FUNCTION \
> +        "pxor       %%xmm0, %%xmm0 \n\t" \
> +        "punpcklbw  %%xmm0, %%xmm3 \n\t" \
> +        "movd           %4, %%xmm1 \n\t" \
> +        "punpcklwd  %%xmm1, %%xmm1 \n\t" \
> +        "punpckldq  %%xmm1, %%xmm1 \n\t" \
> +        "punpcklqdq %%xmm1, %%xmm1 \n\t" \
> +        "psllw          $3, %%xmm1 \n\t" \
> +        "paddw      %%xmm1, %%xmm3 \n\t" \
> +        "psraw          $4, %%xmm3 \n\t" \
> +        "movdqa     %%xmm3, %%xmm4 \n\t" \
> +        "movdqa     %%xmm3, %%xmm7 \n\t" \
> +        "movl           %3, %%ecx  \n\t" \
>          "mov                                 %0, %%"REG_d"  \n\t"\
>          "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
>          ".p2align                             4             \n\t" /* FIXME Unroll? */\
> @@ -252,20 +235,39 @@ static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
>          " jnz                                1b             \n\t"\
>          "psraw                               $3, %%xmm3      \n\t"\
>          "psraw                               $3, %%xmm4      \n\t"\
> -        "packuswb                         %%xmm4, %%xmm3      \n\t"
> -        "movntdq                          %%xmm3, (%1, %%"REG_c")\n\t"
> +        "packuswb                         %%xmm4, %%xmm3      \n\t"\
> +        "movntdq                          %%xmm3, (%1, %%"REG_c")\n\t"\
>          "add                         $16, %%"REG_c"         \n\t"\
>          "cmp                          %2, %%"REG_c"         \n\t"\
> -        "movdqa    %%xmm7, %%xmm3\n\t"
> -        "movdqa    %%xmm7, %%xmm4\n\t"
> +        "movdqa                   %%xmm7, %%xmm3            \n\t" \
> +        "movdqa                   %%xmm7, %%xmm4            \n\t" \
>          "mov                                 %0, %%"REG_d"  \n\t"\
>          "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
> -        "jb                                  1b             \n\t"\
> -        :: "g" (filter),
> -           "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
> -        : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,)
> -         "%"REG_d, "%"REG_S, "%"REG_c
> -    );
> +        "jb                                  1b             \n\t"
> +
> +    if (offset) {
> +        __asm__ volatile(
> +            "movq          %5, %%xmm3  \n\t"
> +            "movdqa    %%xmm3, %%xmm4  \n\t"
> +            "psrlq        $24, %%xmm3  \n\t"
> +            "psllq        $40, %%xmm4  \n\t"
> +            "por       %%xmm4, %%xmm3  \n\t"
> +            MAIN_FUNCTION
> +	      :: "g" (filter),
> +	      "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
> +	      "m"(filterSize), "m"(((uint64_t *) dither)[0])
> +	      : "%"REG_d, "%"REG_S, "%"REG_c
> +	      );

you lost the XMM_CLOBBERS()



> +    } else {
> +        __asm__ volatile(
> +            "movq          %5, %%xmm3   \n\t"
> +            MAIN_FUNCTION
> +	      :: "g" (filter),
> +	      "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
> +	      "m"(filterSize), "m"(((uint64_t *) dither)[0])
> +	      : "%"REG_d, "%"REG_S, "%"REG_c
> +	      );

tabs

[...]

-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Everything should be made as simple as possible, but not simpler.
-- Albert Einstein
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 181 bytes
Desc: Digital signature
URL: <https://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20140917/753dbb00/attachment.asc>


More information about the ffmpeg-devel mailing list