[Ffmpeg-devel] [PATCH] fix mpeg4 lowres chroma bug and increase h264/mpeg4 MC speed

Trent Piepho xyzzy
Thu Feb 8 09:05:23 CET 2007


This patch fixes the bugs with put_h264_chroma_mc2_mmx2() and
avg_h264_chroma_mc2_mmx2().  It also makes them faster, not slower.

put_h264_chroma_mc2_mmx2() writes two bytes of zeros after the two bytes it
is supposed to write.  This is only mostly ok if you process the entire
image one row at a time from left to right.  Other than the two bytes
written past the end of the image; hopefully nothing important was there!
In mpeg4 lowres mode, the image is processed two rows at a time and this
doesn't work.

avg_h264_chroma_mc2_mmx2() will average two extra bytes with zero after the
two bytes it was supposed to average.  This will never work correctly.  I
think that avg_h264_chroma_mc2_mmx2() is simply not used normally, as it
clearly hasn't been tested.

I've fixed these problems by still writing four bytes instead of two, but
instead of writing extra zeros, the extra two bytes are written with the
values they already had.

I tried a version that did a 16-bit write, but it was slower.

I benchmarked my version, by measuring put_h264_chroma_mc2_mmx2() from
start to finish with rdtsc, as 12.8% faster than before.  I didn't measure
avg_h264_chroma_mc2_mmx2() since I'm not sure what kind of input is needed
to trigger it, but it should see a speed increase at least as good as the
put_ version.
-------------- next part --------------
Index: i386/dsputil_h264_template_mmx.c
===================================================================
--- i386/dsputil_h264_template_mmx.c	(revision 7876)
+++ i386/dsputil_h264_template_mmx.c	(working copy)
@@ -265,7 +265,7 @@
 #ifdef H264_CHROMA_MC2_TMPL
 static void H264_CHROMA_MC2_TMPL(uint8_t *dst/*align 2*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
 {
-    int CD=((1<<16)-1)*x*y + 8*y;
+    int CD=((1<<16)-1)*xtimesy[x][y] + 8*y;
     int AB=((8<<16)-8)*x + 64 - CD;
     int i;
 
@@ -314,9 +314,7 @@
             "psrlw $6, %%mm1\n\t"
             "packssdw %%mm7, %%mm1\n\t"
             "packuswb %%mm7, %%mm1\n\t"
-            /* writes garbage to the right of dst.
-             * ok because partitions are processed from left to right. */
-            H264_CHROMA_OP4(%0, %%mm1, %%mm3)
+            H264_CHROMA_OP2(%0, %%mm1, %%mm3)
             "movd %%mm1, %0\n\t"
             : "=m" (dst[0]) : "m" (ff_pw_32));
         dst += stride;
Index: i386/h264dsp_mmx.c
===================================================================
--- i386/h264dsp_mmx.c	(revision 7876)
+++ i386/h264dsp_mmx.c	(working copy)
@@ -1372,8 +1372,20 @@
 H264_MC(avg_, 16,mmx2)
 
 
+static const uint8_t xtimesy[8][8] = { 
+	{ 0, 0, 0, 0, 0, 0, 0, 0},
+	{ 0, 1, 2, 3, 4, 5, 6, 7},
+	{ 0, 2, 4, 6, 8,10,12,14},
+	{ 0, 3, 6, 9,12,15,18,21},
+	{ 0, 4, 8,12,16,20,24,28},
+	{ 0, 5,10,15,20,25,30,35},
+	{ 0, 6,12,18,24,30,36,42},
+	{ 0, 7,14,21,28,35,42,49} };
+
 #define H264_CHROMA_OP(S,D)
 #define H264_CHROMA_OP4(S,D,T)
+#define H264_CHROMA_OP2(S,D,T) "movd 2+" #S ", " #T "\n\t"\
+                               "punpcklwd " #T ", " #D "\n\t"
 #define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_mmx
 #define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_mmx
 #define H264_CHROMA_MC2_TMPL put_h264_chroma_mc2_mmx2
@@ -1381,6 +1393,7 @@
 #include "dsputil_h264_template_mmx.c"
 #undef H264_CHROMA_OP
 #undef H264_CHROMA_OP4
+#undef H264_CHROMA_OP2
 #undef H264_CHROMA_MC8_TMPL
 #undef H264_CHROMA_MC4_TMPL
 #undef H264_CHROMA_MC2_TMPL
@@ -1389,6 +1402,10 @@
 #define H264_CHROMA_OP(S,D) "pavgb " #S ", " #D " \n\t"
 #define H264_CHROMA_OP4(S,D,T) "movd  " #S ", " #T " \n\t"\
                                "pavgb " #T ", " #D " \n\t"
+#define H264_CHROMA_OP2(S,D,T) "movd  " #S ", " #T " \n\t"\
+                               "pavgb " #T ", " #D " \n\t"\
+                               "psrld $16, " #T "\n\t"\
+                               "punpcklwd " #T ", " #D "\n\t"
 #define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_mmx2
 #define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_mmx2
 #define H264_CHROMA_MC2_TMPL avg_h264_chroma_mc2_mmx2
@@ -1396,6 +1413,7 @@
 #include "dsputil_h264_template_mmx.c"
 #undef H264_CHROMA_OP
 #undef H264_CHROMA_OP4
+#undef H264_CHROMA_OP2
 #undef H264_CHROMA_MC8_TMPL
 #undef H264_CHROMA_MC4_TMPL
 #undef H264_CHROMA_MC2_TMPL



More information about the ffmpeg-devel mailing list