[FFmpeg-devel] [PATCH] Dsputilize some functions from APE decode 2/2 - SSE2
Michael Niedermayer
michaelni
Tue Jul 8 00:07:49 CEST 2008
On Mon, Jul 07, 2008 at 09:21:07PM +0300, Kostya wrote:
> $subj
>
> It makes APE with insane compression to decode only
> in 0.5 realtime instead of 0.11 realtime :).
>
> Sorry for the implementation, I mostly translate
> from asm instead of writing code.
> Index: libavcodec/i386/dsputil_mmx.c
> ===================================================================
> --- libavcodec/i386/dsputil_mmx.c (revision 14100)
> +++ libavcodec/i386/dsputil_mmx.c (working copy)
> @@ -2061,6 +2061,70 @@
> extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
> int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
>
> +
> +static void add_int16_sse2(int16_t * v1, int16_t * v2, int order)
> +{
> + order <<= 1;
> + asm volatile(
> + "1: \n\t"
> + "movdqa (%0), %%xmm0 \n\t"
> + "movdqu (%1), %%xmm1 \n\t"
> + "paddw %%xmm1, %%xmm0 \n\t"
Maybe the following is faster? (ignore if not)
"movdqu (%1), %%xmm0 \n\t"
"paddw (%0), %%xmm0 \n\t"
> + "movdqa %%xmm0, (%0) \n\t"
> + "add $16, %0 \n\t"
> + "add $16, %1 \n\t"
> + "sub $16, %2 \n\t"
> + "jnz 1b \n\t"
Two of the three add/sub instructions are unneeded; see my commit a moment ago to float_to_int16.
Also, the loop will likely benefit significantly from being unrolled once.
> + : "+r"(v1), "+r"(v2), "+r"(order)
> + );
> +}
> +
> +static void sub_int16_sse2(int16_t * v1, int16_t * v2, int order)
> +{
> + order <<= 1;
> + asm volatile(
> + "1: \n\t"
> + "movdqa (%0), %%xmm0 \n\t"
> + "movdqu (%1), %%xmm1 \n\t"
> + "psubw %%xmm1, %%xmm0 \n\t"
> + "movdqa %%xmm0, (%0) \n\t"
> + "add $16, %0 \n\t"
> + "add $16, %1 \n\t"
> + "sub $16, %2 \n\t"
> + "jnz 1b \n\t"
> + : "+r"(v1), "+r"(v2), "+r"(order)
> + );
> +}
> +
> +static int32_t scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
> +{
> + int res = 0;
> + DECLARE_ALIGNED_16(int64_t, sh);
> +
> + sh = shift;
> + order <<= 1;
> + asm volatile(
> + "pxor %%xmm7, %%xmm7 \n\t"
> + "1: \n\t"
> + "movdqu (%0), %%xmm0 \n\t"
> + "pmaddwd (%1), %%xmm0 \n\t"
> + "movhlps %%xmm0, %%xmm2 \n\t"
> + "paddd %%xmm2, %%xmm0 \n\t"
> + "psrad %4, %%xmm0 \n\t"
> + "pshuflw $0x4E, %%xmm0,%%xmm2 \n\t"
> + "paddd %%xmm2, %%xmm0 \n\t"
> + "paddd %%xmm0, %%xmm7 \n\t"
I think I've already said that values should be accumulated vertically
before horizontally:
the movhlps, psrad, and pshuflw should be after the loop.
> + "add $16, %0 \n\t"
> + "add $16, %1 \n\t"
> + "sub $16, %3 \n\t"
> + "jnz 1b \n\t"
> + "movd %%xmm7, %2 \n\t"
> + : "+r"(v1), "+r"(v2), "=r"(res), "+r"(order)
> + : "m"(sh)
This operand should be in a register rather than in memory.
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
Observe your enemies, for they first find out your faults. -- Antisthenes
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080708/396375fa/attachment.pgp>
More information about the ffmpeg-devel
mailing list