[FFmpeg-devel] [PATCH] Dsputilize some functions from APE decode 2/2 - SSE2
Kostya
kostya.shishkov
Thu Jul 10 10:16:01 CEST 2008
On Tue, Jul 08, 2008 at 02:18:25PM +0200, Michael Niedermayer wrote:
> On Tue, Jul 08, 2008 at 01:48:21PM +0300, Kostya wrote:
> > On Tue, Jul 08, 2008 at 12:07:49AM +0200, Michael Niedermayer wrote:
> > > On Mon, Jul 07, 2008 at 09:21:07PM +0300, Kostya wrote:
> [...]
> > > also the loop will likely significantly benefit from being unrolled once
> >
> > len is declared as multiple of 8, and loop handles 8 elements
>
> is it ever used with %16 != 0 len ?
No, it isn't. Fixed that and unrolled the loops.
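For illustration, here is a minimal scalar C sketch of what unrolling the 8-element loop once amounts to, under the assumption discussed above that the length is always a multiple of 16. The function name is made up for this sketch; it is not the DSP code itself, only the shape of the iteration.

    #include <stdint.h>

    /* Scalar sketch of the once-unrolled loop: each pass consumes 16
     * int16_t values, which is safe because callers only pass lengths
     * that are multiples of 16. */
    static void add_int16_unrolled_ref(int16_t *v1, const int16_t *v2, int len)
    {
        int i, j;
        for (i = 0; i < len; i += 16) {
            for (j = 0; j < 8; j++)        /* first group of 8  */
                v1[i + j] += v2[i + j];
            for (j = 8; j < 16; j++)       /* second group of 8 */
                v1[i + j] += v2[i + j];
        }
    }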
> [...]
> > > > + "add $16, %0 \n\t"
> > > > + "add $16, %1 \n\t"
> > > > + "sub $16, %3 \n\t"
> > > > + "jnz 1b \n\t"
> > > > + "movd %%xmm7, %2 \n\t"
> > > > + : "+r"(v1), "+r"(v2), "=r"(res), "+r"(order)
> > >
> > > > + : "m"(sh)
> > >
> > > should be in a register
> >
> > why? psrad takes either (x)mm register, immediate value or memory
> > for input.
>
> memory is likely slower than a register
It may be, but I don't think explicitly loading that value into an XMM
register would be better, and psrad does not accept a general-purpose
register as its count operand.
And it is only executed once, after the loop.
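For reference, a minimal sketch (in SSE2 intrinsics, with a made-up function name) of the register-form alternative: psrad with an XMM count register corresponds to _mm_sra_epi32(), and the count can be loaded once with movd (_mm_cvtsi32_si128) instead of being read from an aligned memory slot.

    #include <emmintrin.h>

    /* Hypothetical illustration only, not the submitted patch: shift the
     * four packed dwords in `sum' right arithmetically by `shift' using
     * the register form of psrad. */
    static int shift_sum_sketch(__m128i sum, int shift)
    {
        __m128i cnt = _mm_cvtsi32_si128(shift);   /* movd: count into an xmm reg */
        sum = _mm_sra_epi32(sum, cnt);            /* psrad xmm, xmm              */
        return _mm_cvtsi128_si32(sum);            /* movd: low dword back out    */
    }

Whether that is measurably faster than the memory form for a single shift outside the loop is exactly the kind of thing that would need a benchmark.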
> [...]
> > +static void add_int16_sse2(int16_t * v1, int16_t * v2, int order)
> > +{
> > + x86_reg o = order - 8;
> > + asm volatile(
> > + "1: \n\t"
> > + "movdqu (%1,%2,2), %%xmm0 \n\t"
> > + "paddw (%0,%2,2), %%xmm0 \n\t"
> > + "movdqa %%xmm0, (%0,%2,2) \n\t"
> > + "sub $8, %2 \n\t"
> > + "jge 1b \n\t"
> > + : "+r"(v1), "+r"(v2), "+r"(o)
> > + );
>
> accessing arrays from end to start is likely slower than start to end
> (if you want it i want to see benchmarks of it against the equivalent
> forward code)
> also (a,b) might be faster than (a,b,2)
> a add %2, %2 before the loop would avoid it
done
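The patch below implements this by moving both pointers to the end of the arrays and walking a negative byte offset up to zero, so memory is still traversed front to back and the loop test is a plain sign check (add + js). A scalar C sketch of that indexing scheme, with an illustrative function name:

    #include <stdint.h>

    /* Scalar view of the negative-offset addressing used in the SSE2
     * code below: pointers are advanced past the data, the byte offset
     * starts at -(order * 2) and counts up to 0. */
    static void add_int16_c_ref(int16_t *v1, const int16_t *v2, int order)
    {
        long o = -(long)order * 2;               /* negative byte offset        */
        char *p1 = (char *)(v1 + order);         /* both pointers at the end    */
        const char *p2 = (const char *)(v2 + order);
        do {
            *(int16_t *)(p1 + o) += *(const int16_t *)(p2 + o);
            o += 2;                              /* move forward through memory */
        } while (o < 0);
    }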
> [...]
> --
> Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
>
> Complexity theory is the science of finding the exact solution to an
> approximation. Benchmarking OTOH is finding an approximation of the exact
-------------- next part --------------
Index: libavcodec/i386/dsputil_mmx.c
===================================================================
--- libavcodec/i386/dsputil_mmx.c (revision 14100)
+++ libavcodec/i386/dsputil_mmx.c (working copy)
@@ -2061,6 +2061,82 @@
extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
+
+static void add_int16_sse2(int16_t * v1, int16_t * v2, int order)
+{
+ x86_reg o = -(order << 1);
+ v1 += order;
+ v2 += order;
+ asm volatile(
+ "1: \n\t"
+ "movdqu (%1,%2), %%xmm0 \n\t"
+ "paddw (%0,%2), %%xmm0 \n\t"
+ "movdqa %%xmm0, (%0,%2) \n\t"
+ "add $16, %2 \n\t"
+ "movdqu (%1,%2), %%xmm0 \n\t"
+ "paddw (%0,%2), %%xmm0 \n\t"
+ "movdqa %%xmm0, (%0,%2) \n\t"
+ "add $16, %2 \n\t"
+ "js 1b \n\t"
+ : "+r"(v1), "+r"(v2), "+r"(o)
+ );
+}
+
+static void sub_int16_sse2(int16_t * v1, int16_t * v2, int order)
+{
+ x86_reg o = -(order << 1);
+ v1 += order;
+ v2 += order;
+ asm volatile(
+ "1: \n\t"
+ "movdqa (%0,%2), %%xmm0 \n\t"
+ "movdqu (%1,%2), %%xmm1 \n\t"
+ "psubw %%xmm1, %%xmm0 \n\t"
+ "movdqa %%xmm0, (%0,%2) \n\t"
+ "add $16, %2 \n\t"
+ "movdqa (%0,%2), %%xmm0 \n\t"
+ "movdqu (%1,%2), %%xmm1 \n\t"
+ "psubw %%xmm1, %%xmm0 \n\t"
+ "movdqa %%xmm0, (%0,%2) \n\t"
+ "add $16, %2 \n\t"
+ "js 1b \n\t"
+ : "+r"(v1), "+r"(v2), "+r"(o)
+ );
+}
+
+static int32_t scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
+{
+ int res = 0;
+ DECLARE_ALIGNED_16(int64_t, sh);
+ x86_reg o = -(order << 1);
+
+ v1 += order;
+ v2 += order;
+ sh = shift;
+ asm volatile(
+ "pxor %%xmm7, %%xmm7 \n\t"
+ "1: \n\t"
+ "movdqu (%0,%3), %%xmm0 \n\t"
+ "pmaddwd (%1,%3), %%xmm0 \n\t"
+ "paddd %%xmm0, %%xmm7 \n\t"
+ "add $16, %3 \n\t"
+ "movdqu (%0,%3), %%xmm0 \n\t"
+ "pmaddwd (%1,%3), %%xmm0 \n\t"
+ "paddd %%xmm0, %%xmm7 \n\t"
+ "add $16, %3 \n\t"
+ "js 1b \n\t"
+ "movhlps %%xmm7, %%xmm2 \n\t"
+ "paddd %%xmm2, %%xmm7 \n\t"
+ "psrad %4, %%xmm7 \n\t"
+ "pshuflw $0x4E, %%xmm7,%%xmm2 \n\t"
+ "paddd %%xmm2, %%xmm7 \n\t"
+ "movd %%xmm7, %2 \n\t"
+ : "+r"(v1), "+r"(v2), "=r"(res), "+r"(o)
+ : "m"(sh)
+ );
+ return res;
+}
+
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
mm_flags = mm_support();
@@ -2429,6 +2505,11 @@
}
if(mm_flags & MM_3DNOW)
c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
+ if(mm_flags & MM_SSE2){
+ c->add_int16 = add_int16_sse2;
+ c->sub_int16 = sub_int16_sse2;
+ c->scalarproduct_int16 = scalarproduct_int16_sse2;
+ }
}
if (ENABLE_ENCODERS)
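For readers who find the inline asm hard to follow, here is a rough intrinsics sketch of scalarproduct_int16_sse2 from the patch above. It is an illustration under the same assumptions as the asm (order processed 8 elements at a time, v2 16-byte aligned since it is the memory operand of pmaddwd, v1 possibly unaligned); the function name is made up and the loop is not unrolled, so it is a readability aid rather than a drop-in replacement.

    #include <emmintrin.h>
    #include <stdint.h>

    /* Rough intrinsics equivalent of scalarproduct_int16_sse2, for
     * illustration only.  Assumes order % 8 == 0 and a 16-byte aligned v2. */
    static int32_t scalarproduct_int16_sketch(const int16_t *v1, const int16_t *v2,
                                              int order, int shift)
    {
        __m128i sum = _mm_setzero_si128();                          /* pxor           */
        int i;
        for (i = 0; i < order; i += 8) {
            __m128i a = _mm_loadu_si128((const __m128i *)(v1 + i)); /* movdqu         */
            __m128i b = _mm_load_si128 ((const __m128i *)(v2 + i)); /* aligned load   */
            sum = _mm_add_epi32(sum, _mm_madd_epi16(a, b));         /* pmaddwd+paddd  */
        }
        /* Horizontal reduction of the four partial dword sums, mirroring
         * movhlps + paddd, psrad, pshuflw + paddd, movd in the asm. */
        sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));    /* fold high half in     */
        sum = _mm_sra_epi32(sum, _mm_cvtsi32_si128(shift));  /* psrad by shift        */
        sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));    /* fold remaining dword  */
        return _mm_cvtsi128_si32(sum);                       /* movd                  */
    }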