[FFmpeg-devel] [RFC][PATCH] DSPUtilize some functions from APE decoder
Michael Niedermayer
michaelni
Thu Jul 3 14:09:52 CEST 2008
On Wed, Jul 02, 2008 at 04:26:25PM +0300, Kostya wrote:
> I'm not satisfied with the decoding speed of APE decoder,
> so I've decided to finally dsputilize functions marked as such.
>
> Altivec version is in development.
> Index: libavcodec/i386/dsputil_mmx.c
> ===================================================================
> --- libavcodec/i386/dsputil_mmx.c (revision 14044)
> +++ libavcodec/i386/dsputil_mmx.c (working copy)
> @@ -2061,6 +2061,66 @@
> extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
> int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
>
> +
> +static void vector_int16_add_sse(int16_t * v1, int16_t * v2, int order)
> +{
> + int i;
> + for(i = 0; i < order; i += 8){
> + asm volatile(
> + "movdqa (%0), %%xmm0 \n\t"
> + "movdqu (%1), %%xmm1 \n\t"
> + "paddw %%xmm1, %%xmm0 \n\t"
> + "movdqa %%xmm0, (%0) \n\t"
> + : "+r"(v1), "+r"(v2)
> + );
> + v1 += 8;
> + v2 += 8;
> + }
> +}
The patch contains tabs; also, the loop should be written in asm, not in C.
[...]
> +static int32_t vector_int16_scalarproduct_sse(int16_t * v1, int16_t * v2, int order)
> +{
> + int i;
> + int res = 0, *resp=&res;
> +
> + asm volatile("pxor %xmm7, %xmm7 \n\t");
> +
> + for(i = 0; i < order; i += 8){
> + asm volatile(
> + "movdqu (%0), %%xmm0 \n\t"
> + "movdqa (%1), %%xmm1 \n\t"
> + "pmaddwd %%xmm1, %%xmm0 \n\t"
> + "movhlps %%xmm0, %%xmm2 \n\t"
> +
> + "paddd %%xmm2, %%xmm0 \n\t"
> + "pshufd $0x01, %%xmm0,%%xmm2 \n\t"
> + "paddd %%xmm2, %%xmm0 \n\t"
> + "paddd %%xmm0, %%xmm7 \n\t"
> + : "+r"(v1), "+r"(v2)
> + );
> + v1 += 8;
> + v2 += 8;
"1: \n\t"
"movdqu (%0, %2), %%xmm0 \n\t"
"movdqa (%1, %2), %%xmm1 \n\t"
"pmaddwd %%xmm1 , %%xmm0 \n\t"
"paddd %%xmm0 , %%xmm2 \n\t"
"add $16 , %2 \n\t"
"jnc 1b \n\t"
"movhlps %%xmm2, %%xmm0 \n\t"
"paddd %%xmm2, %%xmm0 \n\t"
"pshufd $0x01, %%xmm0,%%xmm2 \n\t"
"paddd %%xmm2, %%xmm0 \n\t"
(or faster horizontal combining code)
[...]
> +static int32_t vector_int16_scalarproduct_c(int16_t * v1, int16_t * v2, int order)
> +{
> + int res = 0;
> +
> + while (order--)
> + res += *v1++ * *v2++;
> +
> + return res;
> +}
duplicate of dot_product() from acelp_math.c
[...]
>
> + /* ape functions */
> + /* Add second vector values to the first one. v1 is aligned, v2 is not. */
> + void (*vector_int16_add)(int16_t *v1, int16_t *v2, int len);
> + /* Add second vector values to the first one. v1 is aligned, v2 is not. */
> + void (*vector_int16_sub)(int16_t *v1, int16_t *v2, int len);
Names should be in line with the existing add_bytes / diff_bytes (I am of course
fine with changing the existing names if it helps clarity).
> + /* Calculate scalar product of two vectors. v1 is unaligned, v2 is aligned. */
> + int32_t (*vector_int16_scalarproduct)(int16_t *v1, int16_t *v2, int len);
These comments should be doxygen comments, and
the exact alignment requirements and len%8 requirements should be
documented.
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
Complexity theory is the science of finding the exact solution to an
approximation. Benchmarking OTOH is finding an approximation of the exact
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080703/def7969a/attachment.pgp>
More information about the ffmpeg-devel
mailing list