[FFmpeg-devel] [PATCH] Dsputilize some functions from APE decode 2/2 - SSE2

Tue Jul 8 00:07:49 CEST 2008

On Mon, Jul 07, 2008 at 09:21:07PM +0300, Kostya wrote:
> $subj
> 
> It makes APE with insane compression to decode only
> in 0.5 realtime instead of 0.11 realtime :).
> 
> Sorry for the implementation, I mostly translate
> from asm instead of writing code.

> Index: libavcodec/i386/dsputil_mmx.c
> ===================================================================
> --- libavcodec/i386/dsputil_mmx.c	(revision 14100)
> +++ libavcodec/i386/dsputil_mmx.c	(working copy)
> @@ -2061,6 +2061,70 @@
>  extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
>                            int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
>  
> +
> +static void add_int16_sse2(int16_t * v1, int16_t * v2, int order)
> +{
> +    order <<= 1;
> +    asm volatile(
> +        "1:                     \n\t"
> +        "movdqa  (%0),   %%xmm0 \n\t"
> +        "movdqu  (%1),   %%xmm1 \n\t"
> +        "paddw   %%xmm1, %%xmm0 \n\t"

Maybe the following is faster? (ignore if not)
"movdqu  (%1),   %%xmm0 \n\t"
"paddw   (%0),   %%xmm0 \n\t"


> +        "movdqa  %%xmm0, (%0)   \n\t"

> +        "add     $16,    %0     \n\t"
> +        "add     $16,    %1     \n\t"
> +        "sub     $16,    %2     \n\t"
> +        "jnz     1b             \n\t"

Two of the three add/sub instructions are unneeded; see my commit a moment ago to float_to_int16.

Also, the loop will likely benefit significantly from being unrolled once.


> +        : "+r"(v1), "+r"(v2), "+r"(order)
> +    );
> +}
> +
> +static void sub_int16_sse2(int16_t * v1, int16_t * v2, int order)
> +{
> +    order <<= 1;
> +    asm volatile(
> +        "1:                     \n\t"
> +        "movdqa  (%0),   %%xmm0 \n\t"
> +        "movdqu  (%1),   %%xmm1 \n\t"
> +        "psubw   %%xmm1, %%xmm0 \n\t"
> +        "movdqa  %%xmm0, (%0)   \n\t"
> +        "add     $16,    %0     \n\t"
> +        "add     $16,    %1     \n\t"
> +        "sub     $16,    %2     \n\t"
> +        "jnz     1b             \n\t"
> +        : "+r"(v1), "+r"(v2), "+r"(order)
> +    );
> +}
> +
> +static int32_t scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
> +{
> +    int res = 0;
> +    DECLARE_ALIGNED_16(int64_t, sh);
> +
> +    sh = shift;
> +    order <<= 1;
> +    asm volatile(
> +        "pxor    %%xmm7, %%xmm7        \n\t"
> +        "1:                            \n\t"
> +        "movdqu  (%0),   %%xmm0        \n\t"
> +        "pmaddwd (%1),   %%xmm0        \n\t"
> +        "movhlps %%xmm0, %%xmm2        \n\t"
> +        "paddd   %%xmm2, %%xmm0        \n\t"
> +        "psrad   %4,     %%xmm0        \n\t"
> +        "pshuflw $0x4E,  %%xmm0,%%xmm2 \n\t"
> +        "paddd   %%xmm2, %%xmm0        \n\t"
> +        "paddd   %%xmm0, %%xmm7        \n\t"

I think I've already said that values should be accumulated vertically
before horizontally.
The movhlps, psrad, and pshuflw should be after the loop.


> +        "add     $16,    %0            \n\t"
> +        "add     $16,    %1            \n\t"
> +        "sub     $16,    %3            \n\t"
> +        "jnz     1b                    \n\t"
> +        "movd    %%xmm7, %2            \n\t"
> +        : "+r"(v1), "+r"(v2), "=r"(res), "+r"(order)

> +        : "m"(sh)

This operand (sh) should be in a register.


[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Observe your enemies, for they first find out your faults. -- Antisthenes
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080708/396375fa/attachment.pgp>