[FFmpeg-devel] [PATCH] x86: vc1dsp: Convert vc1_inv_trans_*_dc to NASM format

James Almer jamrial at gmail.com
Sun Jan 31 22:18:53 CET 2016


On 1/31/2016 4:48 PM, Timothy Gu wrote:
> ---
>  libavcodec/x86/vc1dsp.asm    | 104 ++++++++++++++++++++++
>  libavcodec/x86/vc1dsp_init.c |  13 +++
>  libavcodec/x86/vc1dsp_mmx.c  | 207 -------------------------------------------
>  3 files changed, 117 insertions(+), 207 deletions(-)
> 
> diff --git a/libavcodec/x86/vc1dsp.asm b/libavcodec/x86/vc1dsp.asm
> index 6415a83..f922927 100644
> --- a/libavcodec/x86/vc1dsp.asm
> +++ b/libavcodec/x86/vc1dsp.asm
> @@ -395,3 +395,107 @@ cglobal vc1_put_ver_16b_shift2, 4,7,0, dst, src, stride
>          jnz         .loop
>      REP_RET
>  %endif ; HAVE_MMX_INLINE
> +
> +%macro INV_TRANS_INIT 0
> +    movsxdifnidn linesizeq, linesized

Maybe change the prototype so linesize is ptrdiff_t? Then the sign
extension above would no longer be needed.

> +    movd       m0, blockd
> +    SPLATW     m0, m0
> +    pxor       m1, m1
> +    psubw      m1, m0
> +    packuswb   m0, m0
> +    packuswb   m1, m1
> +%endmacro
> +
> +%macro INV_TRANS_PROCESS 1
> +    mov%1                  m2, [destq+linesizeq*0]
> +    mov%1                  m3, [destq+linesizeq*1]
> +    mov%1                  m4, [destq+linesizeq*2]
> +    mov%1                  m5, [destq+linesize3q]
> +    paddusb                m2, m0
> +    paddusb                m3, m0
> +    paddusb                m4, m0
> +    paddusb                m5, m0
> +    psubusb                m2, m1
> +    psubusb                m3, m1
> +    psubusb                m4, m1
> +    psubusb                m5, m1
> +    mov%1 [linesizeq*0+destq], m2
> +    mov%1 [linesizeq*1+destq], m3
> +    mov%1 [linesizeq*2+destq], m4
> +    mov%1 [linesize3q +destq], m5
> +%endmacro
> +
> +; ff_vc1_inv_trans_?x?_dc_mmxext(uint8_t *dest, int linesize, int16_t *block)
> +INIT_MMX mmxext
> +cglobal vc1_inv_trans_4x4_dc, 3,4,0, dest, linesize, block
> +    movsx         r3d, WORD [blockq]

Can this value be negative? You're using it as an operand of a
native-size lea after movsx only sign-extended it to 32 bits, which
means that on x86_64 the upper 32 bits of the register will be zeroed
instead of holding the sign.

If it can, you'll have to use blockq/r3q everywhere; if it can't,
use movzx and shr instead.

> +    mov        blockd, r3d             ; dc
> +    shl        blockd, 4               ; 16 * dc
> +    lea        blockd, [blockq+r3+4]   ; 17 * dc + 4
> +    sar        blockd, 3               ; >> 3
> +    mov           r3d, blockd          ; dc
> +    shl        blockd, 4               ; 16 * dc
> +    lea        blockd, [blockq+r3+64]  ; 17 * dc + 64
> +    sar        blockd, 7               ; >> 7



More information about the ffmpeg-devel mailing list