[FFmpeg-devel] [PATCH] vp9/x86: 16x16 iadst_idct, idct_iadst and iadst_iadst (ssse3).

Clément Bœsch u at pkh.me
Wed Jan 15 15:23:27 CET 2014


On Tue, Jan 14, 2014 at 10:15:55PM -0500, Ronald S. Bultje wrote:
> Sample timings on ped1080p.webm:
> iadst_idct:  4672 -> 1175 cycles
> idct_iadst:  4736 -> 1263 cycles
> iadst_iadst: 4924 -> 1438 cycles
> Total decoding time changed from 6.565s to 6.413s.
> ---
>  libavcodec/x86/vp9dsp_init.c |  25 +++-
>  libavcodec/x86/vp9itxfm.asm  | 323 ++++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 338 insertions(+), 10 deletions(-)
> 
[...]
> +INIT_XMM ssse3
> +cglobal vp9_idct_iadst_16x16_add, 3, 5, 16, 512, dst, stride, block, eob

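Here and in the following functions, shouldn't the argument count passed to
cglobal be 4 instead of 3, since four arguments (dst, stride, block, eob) are
named?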

Also, unless you plan to add specific code to each of them, you could create a
macro for all the combinations you added (the following code is basically
duplicated 3x with very small changes).

That would also make it easier to add an AVX version later, by the way.

> +    ; potential eob checks go here
> +
> +    DEFINE_ARGS dst, stride, block, cnt, dst_bak
> +    mov               cntd, 2
> +.loop1_full:
> +    VP9_IDCT16_1D   blockq, 1
> +    add             blockq, 16
> +    add                rsp, 256
> +    dec               cntd
> +    jg .loop1_full
> +    sub             blockq, 32
> +    sub                rsp, 512
> +
> +    mov               cntd, 2
> +    mov           dst_bakq, dstq
> +.loop2_full:
> +    VP9_IADST16_1D     rsp, 2
> +    lea               dstq, [dst_bakq+8]
> +    add                rsp, 16
> +    dec               cntd
> +    jg .loop2_full
> +    sub                rsp, 32
> +
> +    ; at the end of the loop, m0 should still be zero
> +    ; use that to zero out block coefficients
> +    ZERO_BLOCK      blockq, 32, 16, m0
> +    RET
> +
> +INIT_XMM ssse3
> +cglobal vp9_iadst_idct_16x16_add, 3, 5, 16, 512, dst, stride, block, eob
> +    ; potential eob checks go here
> +
> +    DEFINE_ARGS dst, stride, block, cnt, dst_bak
> +    mov               cntd, 2
> +.loop1_full:
> +    VP9_IADST16_1D  blockq, 1
> +    add             blockq, 16
> +    add                rsp, 256
> +    dec               cntd
> +    jg .loop1_full
> +    sub             blockq, 32
> +    sub                rsp, 512
> +
> +    mov               cntd, 2
> +    mov           dst_bakq, dstq
> +.loop2_full:
> +    VP9_IDCT16_1D      rsp, 2
> +    lea               dstq, [dst_bakq+8]
> +    add                rsp, 16
> +    dec               cntd
> +    jg .loop2_full
> +    sub                rsp, 32
> +
> +    ; at the end of the loop, m0 should still be zero
> +    ; use that to zero out block coefficients
> +    ZERO_BLOCK      blockq, 32, 16, m0
> +    RET
> +
> +INIT_XMM ssse3
> +cglobal vp9_iadst_iadst_16x16_add, 3, 5, 16, 512, dst, stride, block, eob
> +    ; potential eob checks go here
> +
> +    DEFINE_ARGS dst, stride, block, cnt, dst_bak
> +    mov               cntd, 2
> +.loop1_full:
> +    VP9_IADST16_1D  blockq, 1
> +    add             blockq, 16
> +    add                rsp, 256
> +    dec               cntd
> +    jg .loop1_full
> +    sub             blockq, 32
> +    sub                rsp, 512
> +
> +    mov               cntd, 2
> +    mov           dst_bakq, dstq
> +.loop2_full:
> +    VP9_IADST16_1D     rsp, 2
> +    lea               dstq, [dst_bakq+8]
> +    add                rsp, 16
> +    dec               cntd
> +    jg .loop2_full
> +    sub                rsp, 32
> +
> +    ; at the end of the loop, m0 should still be zero
> +    ; use that to zero out block coefficients
> +    ZERO_BLOCK      blockq, 32, 16, m0
> +    RET
> +
> +;---------------------------------------------------------------------------------------------
>  ; void vp9_idct_idct_32x32_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
>  ;---------------------------------------------------------------------------------------------
>  

LGTM otherwise

-- 
Clément B.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 490 bytes
Desc: not available
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20140115/08dc9776/attachment.asc>


More information about the ffmpeg-devel mailing list