[FFmpeg-devel] [PATCH] Make VP3/Theora Decoder Much Faster

Mon Dec 7 17:28:24 CET 2009

On Mon, 7 Dec 2009, Mike Melanson wrote:

> I'm a little surprised to realize that this functionality doesn't already 
> exist (been a long time since I wrote the decoder). The original VP3 decoder 
> had IDCTs for 1- and 3-element fragments in addition to the full flavor IDCT. 
> I think perhaps I tried to bring them over but someone convinced me that 
> those other cases don't occur often enough to make it worthwhile. Have you 
> found a lot of fragments with 1-3 non-zero coeffs?

I've never examined a Theora bitstream, and I'm not about to start now.
However, if Theora doesn't have lots of DC-only blocks, it's either very 
different from every other inter-predicted DCT codec out there, or you're 
encoding at a ridiculously high bitrate.
I don't remember why I never committed such a change to mpegvideo, but 
it wasn't because it didn't help. Maybe this isn't bitexact and I never 
bothered to figure out why?

--Loren Merritt
-------------- next part --------------
Index: i386/dsputil_mmx.c
===================================================================

--- i386/dsputil_mmx.c	(revision 11552)
+++ i386/dsputil_mmx.c	(working copy)
@@ -37,6 +37,8 @@
 
 extern void ff_idct_xvid_mmx(short *block);
 extern void ff_idct_xvid_mmx2(short *block);
+extern void ff_xvid_idct_dc_add(uint8_t *dst, int stride, DCTELEM *block); /* DC-only add, C version; this patch defines it in h264idct.c */
+extern void ff_xvid_idct_dc_put(uint8_t *dst, int stride, DCTELEM *block); /* DC-only put, C version; this patch defines it in h264idct.c */
 
 int mm_flags; /* multimedia extension flags */
 
@@ -3190,6 +3192,8 @@
                 c->idct_add= ff_simple_idct_add_mmx;
                 c->idct    = ff_simple_idct_mmx;
                 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
+                if(mm_flags & MM_MMXEXT)
+                    c->idct_dc_add= ff_simple_idct_dc_add_mmx2;
 #ifdef CONFIG_GPL
             }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
                 if(mm_flags & MM_MMXEXT){
@@ -3226,11 +3230,14 @@
                     c->idct_put= ff_idct_xvid_mmx2_put;
                     c->idct_add= ff_idct_xvid_mmx2_add;
                     c->idct    = ff_idct_xvid_mmx2;
+                    c->idct_dc_add= ff_xvid_idct_dc_add_mmx2;
                 }else{
                     c->idct_put= ff_idct_xvid_mmx_put;
                     c->idct_add= ff_idct_xvid_mmx_add;
                     c->idct    = ff_idct_xvid_mmx;
+                    c->idct_dc_add= ff_xvid_idct_dc_add;
                 }
+                c->idct_dc_put= ff_xvid_idct_dc_put;
             }
         }
 
Index: i386/h264dsp_mmx.c
===================================================================
--- i386/h264dsp_mmx.c	(revision 11552)
+++ i386/h264dsp_mmx.c	(working copy)
@@ -253,9 +253,8 @@
     );
 }
 
-static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
+static inline void idct8_dc_add_mmx2(uint8_t *dst, int stride, int dc)
 {
-    int dc = (block[0] + 32) >> 6;
     int y;
     asm volatile(
         "movd          %0, %%mm0 \n\t"
@@ -264,7 +263,7 @@
         "psubw      %%mm0, %%mm1 \n\t"
         "packuswb   %%mm0, %%mm0 \n\t"
         "packuswb   %%mm1, %%mm1 \n\t"
-        ::"r"(dc)
+        ::"g"(dc)
     );
     for(y=2; y--; dst += 4*stride){
     asm volatile(
@@ -292,7 +291,22 @@
     }
 }
 
+static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) /* H.264 argument order (dst, block, stride); thin wrapper over idct8_dc_add_mmx2 */
+{
+    idct8_dc_add_mmx2(dst, stride, (block[0] + 32) >> 6); /* DC = block[0] with +32 bias, then arithmetic shift right by 6 (round-to-nearest divide by 64) */
+}
 
+static void ff_simple_idct_dc_add_mmx2(uint8_t *dst, int stride, DCTELEM *block) /* idct_dc_add-style argument order (dst, stride, block); only block[0] is read */
+{
+    idct8_dc_add_mmx2(dst, stride, (block[0]*16383 + (1<<16)) >> 17); /* scale by 16383, add 1<<16 bias, shift right by 17; presumably mirrors the simple IDCT's DC scaling - TODO confirm against simple_idct */
+}
+
+static void ff_xvid_idct_dc_add_mmx2(uint8_t *dst, int stride, DCTELEM *block) /* idct_dc_add-style argument order (dst, stride, block); only block[0] is read */
+{
+    idct8_dc_add_mmx2(dst, stride, (block[0] + 4) >> 3); /* DC = block[0] with +4 bias, then shift right by 3; NOTE(review): assumed to match the xvid IDCT's DC output - confirm */
+}
+
+
 /***********************************/
 /* deblocking */
 
Index: mpegvideo.c
===================================================================
--- mpegvideo.c	(revision 11552)
+++ mpegvideo.c	(working copy)
@@ -1792,7 +1792,10 @@
                            DCTELEM *block, int i, uint8_t *dest, int line_size, int qscale)
 {
     s->dct_unquantize_intra(s, block, i, qscale);
-    s->dsp.idct_put (dest, line_size, block);
+    if (s->block_last_index[i] > 0)
+        s->dsp.idct_put(dest, line_size, block);
+    else
+        s->dsp.idct_dc_put(dest, line_size, block);
 }
 
 /* add block[] to dest[] */
@@ -1800,7 +1803,10 @@
                            DCTELEM *block, int i, uint8_t *dest, int line_size)
 {
     if (s->block_last_index[i] >= 0) {
-        s->dsp.idct_add (dest, line_size, block);
+        if (s->block_last_index[i] > 0)
+            s->dsp.idct_add(dest, line_size, block);
+        else
+            s->dsp.idct_dc_add(dest, line_size, block);
     }
 }
 
@@ -1810,7 +1816,10 @@
     if (s->block_last_index[i] >= 0) {
         s->dct_unquantize_inter(s, block, i, qscale);
 
-        s->dsp.idct_add (dest, line_size, block);
+        if (s->block_last_index[i] > 0)
+            s->dsp.idct_add(dest, line_size, block);
+        else
+            s->dsp.idct_dc_add(dest, line_size, block);
     }
 }
 
Index: simple_idct.h
===================================================================
--- simple_idct.h	(revision 11552)
+++ simple_idct.h	(working copy)
@@ -44,4 +44,8 @@
 void ff_simple_idct48_add(uint8_t *dest, int line_size, DCTELEM *block);
 void ff_simple_idct44_add(uint8_t *dest, int line_size, DCTELEM *block);
 
+void ff_simple_idct_dc_add(uint8_t *dest, int line_size, DCTELEM *block); /* DC-only counterpart of an idct_add; this patch defines it in h264idct.c */
+void ff_simple_idct_dc_put(uint8_t *dest, int line_size, DCTELEM *block); /* DC-only counterpart of an idct_put; this patch defines it in h264idct.c */
+void ff_simple_idct_dc_put_mmx2(uint8_t *dest, int line_size, DCTELEM *block); /* NOTE(review): no definition of this symbol appears in this patch; the MMX2 helper it adds is the static ff_simple_idct_dc_add_mmx2 - confirm intent */
+
 #endif /* FFMPEG_SIMPLE_IDCT_H */
Index: dsputil.c
===================================================================
--- dsputil.c	(revision 11552)
+++ dsputil.c	(working copy)
@@ -4028,6 +4028,8 @@
             c->idct_put= ff_simple_idct_put;
             c->idct_add= ff_simple_idct_add;
             c->idct    = ff_simple_idct;
+            c->idct_dc_put= ff_simple_idct_dc_put;
+            c->idct_dc_add= ff_simple_idct_dc_add;
             c->idct_permutation_type= FF_NO_IDCT_PERM;
         }
     }
@@ -4298,6 +4300,11 @@
             c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
     }
 
+    if(!c->idct_dc_put)
+        c->idct_dc_put= c->idct_put;
+    if(!c->idct_dc_add)
+        c->idct_dc_add= c->idct_add;
+
     switch(c->idct_permutation_type){
     case FF_NO_IDCT_PERM:
         for(i=0; i<64; i++)
Index: dsputil.h
===================================================================
--- dsputil.h	(revision 11552)
+++ dsputil.h	(working copy)
@@ -364,6 +364,10 @@
      */
     void (*idct_add)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
 
+    /* same arguments as idct_put/idct_add, but the implementation may assume every AC coefficient is zero, i.e. only block[0] is significant */
+    void (*idct_dc_put)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/); /* if left NULL, dsputil init points it at idct_put */
+    void (*idct_dc_add)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/); /* if left NULL, dsputil init points it at idct_add */
+
     /**
      * idct input permutation.
      * several optimized IDCTs need a permutated input (relative to the normal order of the reference
Index: h264idct.c
===================================================================
--- h264idct.c	(revision 11552)
+++ h264idct.c	(working copy)
@@ -154,10 +154,9 @@
     }
 }
 
-void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride){
+static inline void idct8_dc_add(uint8_t *dst, long stride, long dc){
     int i, j;
     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
-    int dc = (block[0] + 32) >> 6;
     for( j = 0; j < 8; j++ )
     {
         for( i = 0; i < 8; i++ )
@@ -165,3 +164,31 @@
         dst += stride;
     }
 }
+
+static inline void idct8_dc_put(uint8_t *dst, long stride, long dc){ /* write one constant byte, cm[dc], to 8 rows of 8 pixels starting at dst */
+    int i;
+    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* table pointer biased by MAX_NEG_CROP; presumably a clamp-to-0..255 lookup that tolerates negative dc - confirm against ff_cropTbl */
+    uint64_t row = cm[dc] * 0x0101010101010101ULL; /* replicate the looked-up byte into all 8 bytes of a 64-bit word */
+    for(i=0; i<8; i++)
+        *(uint64_t*)(dst+i*stride) = row; /* one 8-byte store per row; NOTE(review): accesses uint8_t storage through a uint64_t pointer, so it relies on dst and stride keeping each row 8-byte aligned - confirm alignment/aliasing assumptions */
+}
+
+void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride){ /* H.264 argument order (dst, block, stride); thin wrapper over idct8_dc_add */
+    idct8_dc_add(dst, stride, (block[0] + 32) >> 6); /* DC = block[0] with +32 bias, then shift right by 6 (round-to-nearest divide by 64) */
+}
+
+void ff_simple_idct_dc_add(uint8_t *dst, int stride, DCTELEM *block){ /* DC-only add for the simple IDCT: only block[0] is read */
+    idct8_dc_add(dst, stride, (block[0]*16383 + (1<<16)) >> 17); /* scale by 16383, add 1<<16 bias, shift right by 17; presumably mirrors the simple IDCT's DC scaling - TODO confirm */
+}
+
+void ff_simple_idct_dc_put(uint8_t *dst, int stride, DCTELEM *block){ /* DC-only put for the simple IDCT: fills the block via idct8_dc_put */
+    idct8_dc_put(dst, stride, (block[0]*16383 + (1<<16)) >> 17); /* same DC expression as ff_simple_idct_dc_add, so put and add agree on the DC value */
+}
+
+void ff_xvid_idct_dc_add(uint8_t *dst, int stride, DCTELEM *block){ /* DC-only add for the xvid IDCT: only block[0] is read */
+    idct8_dc_add(dst, stride, (block[0] + 4) >> 3); /* DC = block[0] with +4 bias, then shift right by 3; NOTE(review): assumed to match the xvid IDCT's DC output - confirm */
+}
+
+void ff_xvid_idct_dc_put(uint8_t *dst, int stride, DCTELEM *block){ /* DC-only put for the xvid IDCT: fills the block via idct8_dc_put */
+    idct8_dc_put(dst, stride, (block[0] + 4) >> 3); /* same DC expression as ff_xvid_idct_dc_add, so put and add agree on the DC value */
+}