[Ffmpeg-devel] [PATCH] fix mpeg4 lowres chroma bug and increase h264/mpeg4 MC speed

Thu Feb 8 11:48:53 CET 2007

On Thu, 8 Feb 2007, Oleg Metelitsa wrote:
> Hello Trent,
>
> What  about  to  change  two  MMX  instruction to one SSE command as I
> proposed here:
> http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2007-February/052187.html
>
> As a result we will have
>
> #define H264_CHROMA_OP2(S,D,T)   "pinsrw $1, 2+" #S ", " #D " \n\t"
>
> instead of
>
> >> +#define H264_CHROMA_OP2(S,D,T) "movd 2+" #S ", " #T "\n\t"\
> >> +                               "punpcklwd " #T ", " #D "\n\t"

Well, because the pinsrw version is slower (about 4.76%).  This isn't that
surprising really.  On an AthlonXP processor (what I'm using), movd and
punpcklwd are directpatch instructions, while pinsrw instruction is a
much slower vectorpath instruction.

Also, doing N-bit reads which are not aligned to an N-bit boundary are
slower.  It is not always obvious how many bits a read is, for example
"punpcklwd mem32, mmxreg" is a 32-bit read while "punpckhwd mem32, mmxreg"
is a 64-bit read.  I suspect that pinsrw is 64-bit, does anyone know for
sure?

There are also partial memory stalls when you do 32-bite writes (with movd
to dst) and 64-bits reads (with pinsrw from dst) to the same location.
Using movd will do a 32-bit read and so avoid this.

Anyway, there is an obvious way to make it faster that we both missed the
first time:

#define H264_CHROMA_OP2(S,D,T) "punpcklwd 2+" #S ", " #D "\n\t"

This is about 4.38% faster than the my first patch, and 17.4% faster than
the original code.

> >> I benchmarked my version, by measuring put_h264_chroma_mc2_mmx2() from
> >> start to finish with rdtsc, as 12.8% faster than before.
>
> Speed was increased because memory-to-cache operation is faster during
> reading  comparing  to  writing. So, you are making fast cache preload
> and  it  speeds up the code. Is the patch faster if you test it during
> decode operation but not alone?

The speed increase is mainly from the changing "x*y" into "xtimesy[x][y]",
which changes an "imull %ebx, %edx" into "movzbl xtimesy(%ebx,%edx,8),
%ecx".  The latter really is much faster (really, it is! benchmark it (I
did)).

I've attached a new version, with the faster asm code.  I also got rid of
the bogus gas warning about a missing operand, from the "2+%0" offsetting a
memory reference construct.  In this case it was a minor tweak to the asm
code to avoid the warning and still generate the exact same code.
-------------- next part --------------
Index: i386/dsputil_h264_template_mmx.c
===================================================================

--- i386/dsputil_h264_template_mmx.c	(revision 7881)
+++ i386/dsputil_h264_template_mmx.c	(working copy)
@@ -265,7 +265,7 @@
 #ifdef H264_CHROMA_MC2_TMPL
 static void H264_CHROMA_MC2_TMPL(uint8_t *dst/*align 2*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
 {
-    int CD=((1<<16)-1)*x*y + 8*y;
+    int CD=((1<<16)-1)*xtimesy[x][y] + 8*y;
     int AB=((8<<16)-8)*x + 64 - CD;
     int i;
 
@@ -310,15 +310,13 @@
 
         asm volatile(
             /* dst[0,1] = pack((mm1 + 32) >> 6) */
-            "paddw %1, %%mm1\n\t"
+            "paddw %2, %%mm1\n\t"
             "psrlw $6, %%mm1\n\t"
             "packssdw %%mm7, %%mm1\n\t"
             "packuswb %%mm7, %%mm1\n\t"
-            /* writes garbage to the right of dst.
-             * ok because partitions are processed from left to right. */
-            H264_CHROMA_OP4(%0, %%mm1, %%mm3)
+            H264_CHROMA_OP2(%0, %1, %%mm1, %%mm3)
             "movd %%mm1, %0\n\t"
-            : "=m" (dst[0]) : "m" (ff_pw_32));
+            : "=m" (dst[0]), "=m" (dst[2]) : "m" (ff_pw_32));
         dst += stride;
     }
 }
Index: i386/h264dsp_mmx.c
===================================================================
--- i386/h264dsp_mmx.c	(revision 7881)
+++ i386/h264dsp_mmx.c	(working copy)
@@ -1372,8 +1372,19 @@
 H264_MC(avg_, 16,mmx2)
 
 
+static const uint8_t xtimesy[8][8] = { 
+	{ 0, 0, 0, 0, 0, 0, 0, 0},
+	{ 0, 1, 2, 3, 4, 5, 6, 7},
+	{ 0, 2, 4, 6, 8,10,12,14},
+	{ 0, 3, 6, 9,12,15,18,21},
+	{ 0, 4, 8,12,16,20,24,28},
+	{ 0, 5,10,15,20,25,30,35},
+	{ 0, 6,12,18,24,30,36,42},
+	{ 0, 7,14,21,28,35,42,49} };
+
 #define H264_CHROMA_OP(S,D)
 #define H264_CHROMA_OP4(S,D,T)
+#define H264_CHROMA_OP2(S,S2,D,T) "punpcklwd " #S2 ", " #D "\n\t"
 #define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_mmx
 #define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_mmx
 #define H264_CHROMA_MC2_TMPL put_h264_chroma_mc2_mmx2
@@ -1381,6 +1392,7 @@
 #include "dsputil_h264_template_mmx.c"
 #undef H264_CHROMA_OP
 #undef H264_CHROMA_OP4
+#undef H264_CHROMA_OP2
 #undef H264_CHROMA_MC8_TMPL
 #undef H264_CHROMA_MC4_TMPL
 #undef H264_CHROMA_MC2_TMPL
@@ -1389,6 +1401,10 @@
 #define H264_CHROMA_OP(S,D) "pavgb " #S ", " #D " \n\t"
 #define H264_CHROMA_OP4(S,D,T) "movd  " #S ", " #T " \n\t"\
                                "pavgb " #T ", " #D " \n\t"
+#define H264_CHROMA_OP2(S,S2,D,T) "movd  " #S ", " #T "\n\t"\
+                                  "pavgb " #T ", " #D "\n\t"\
+                                  "psrld $16, " #T "\n\t"\
+                                  "punpcklwd " #T ", " #D "\n\t"
 #define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_mmx2
 #define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_mmx2
 #define H264_CHROMA_MC2_TMPL avg_h264_chroma_mc2_mmx2
@@ -1396,6 +1412,7 @@
 #include "dsputil_h264_template_mmx.c"
 #undef H264_CHROMA_OP
 #undef H264_CHROMA_OP4
+#undef H264_CHROMA_OP2
 #undef H264_CHROMA_MC8_TMPL
 #undef H264_CHROMA_MC4_TMPL
 #undef H264_CHROMA_MC2_TMPL