[Ffmpeg-devel] [PATCH] qpel approximation

Loren Merritt lorenm
Sat Feb 18 02:59:08 CET 2006


This patch adds approximate bilinear quarter-pel (qpel) motion compensation
(mc) functions. They are used for B-frames only, and only when
-lavdopts fast is set.
Comments welcome: does it look OK? Should it have a separate option?

16x16 mc costs, averaged over all subpel positions:
mpeg4 qpel:   1443 cycles
h264 qpel:    1148 cycles
bilinear qpel: 174 cycles
mpeg4 hpel:     99 cycles

total speedup: ~12% in mpeg4, ~5% in h264.

--Loren Merritt
-------------- next part --------------
Index: dsputil.c
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/dsputil.c,v
retrieving revision 1.134
diff -u -r1.134 dsputil.c
--- libavcodec/dsputil.c	10 Feb 2006 06:55:24 -0000	1.134
+++ libavcodec/dsputil.c	18 Feb 2006 01:51:53 -0000
@@ -4047,6 +4047,9 @@
     c->try_8x8basis= try_8x8basis_c;
     c->add_8x8basis= add_8x8basis_c;
 
+    memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
+    memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
+
 #ifdef HAVE_MMX
     dsputil_init_mmx(c, avctx);
 #endif
@@ -4072,6 +4075,13 @@
     dsputil_init_sh4(c,avctx);
 #endif
 
+    for(i=0; i<64; i++){
+        if(!c->put_2tap_qpel_pixels_tab[0][i])
+            c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
+        if(!c->avg_2tap_qpel_pixels_tab[0][i])
+            c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
+    }
+
     switch(c->idct_permutation_type){
     case FF_NO_IDCT_PERM:
         for(i=0; i<64; i++)
Index: dsputil.h
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/dsputil.h,v
retrieving revision 1.128
diff -u -r1.128 dsputil.h
--- libavcodec/dsputil.h	10 Feb 2006 06:55:24 -0000	1.128
+++ libavcodec/dsputil.h	18 Feb 2006 01:51:54 -0000
@@ -258,6 +258,9 @@
     qpel_mc_func put_h264_qpel_pixels_tab[4][16];
     qpel_mc_func avg_h264_qpel_pixels_tab[4][16];
 
+    qpel_mc_func put_2tap_qpel_pixels_tab[4][16];
+    qpel_mc_func avg_2tap_qpel_pixels_tab[4][16];
+
     h264_weight_func weight_h264_pixels_tab[10];
     h264_biweight_func biweight_h264_pixels_tab[10];
 
Index: h263dec.c
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/h263dec.c,v
retrieving revision 1.174
diff -u -r1.174 h263dec.c
--- libavcodec/h263dec.c	12 Jan 2006 22:43:15 -0000	1.174
+++ libavcodec/h263dec.c	18 Feb 2006 01:51:54 -0000
@@ -689,6 +689,17 @@
             s->next_p_frame_damaged=0;
     }
 
+    if((s->avctx->flags2 & CODEC_FLAG2_FAST) && s->pict_type==B_TYPE){ /* fast flag: approximate 2-tap qpel tables, B-frames only */
+        s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
+        s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
+    }else if((!s->no_rounding) || s->pict_type==B_TYPE){ /* same condition under which the replaced MPV_decode_mb code picked put_qpel_pixels_tab */
+        s->me.qpel_put= s->dsp.put_qpel_pixels_tab;
+        s->me.qpel_avg= s->dsp.avg_qpel_pixels_tab;
+    }else{ /* no_rounding set on a non-B picture: the replaced code used the no_rnd put table here */
+        s->me.qpel_put= s->dsp.put_no_rnd_qpel_pixels_tab;
+        s->me.qpel_avg= s->dsp.avg_qpel_pixels_tab;
+    }
+
     if(MPV_frame_start(s, avctx) < 0)
         return -1;
 
Index: h264.c
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/h264.c,v
retrieving revision 1.190
diff -u -r1.190 h264.c
--- libavcodec/h264.c	16 Feb 2006 01:16:12 -0000	1.190
+++ libavcodec/h264.c	18 Feb 2006 01:51:55 -0000
@@ -3463,8 +3463,8 @@
             }
         }else if(s->codec_id == CODEC_ID_H264){
             hl_motion(h, dest_y, dest_cb, dest_cr,
-                      s->dsp.put_h264_qpel_pixels_tab, s->dsp.put_h264_chroma_pixels_tab,
-                      s->dsp.avg_h264_qpel_pixels_tab, s->dsp.avg_h264_chroma_pixels_tab,
+                      h->s.me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
+                      h->s.me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
         }
 
@@ -4499,6 +4499,14 @@
                );
     }
 
+    if((s->avctx->flags2 & CODEC_FLAG2_FAST) && s->pict_type==B_TYPE){
+        h->s.me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
+        h->s.me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
+    }else{
+        h->s.me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
+        h->s.me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
+    }
+
     return 0;
 }
 
Index: mpegvideo.c
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/mpegvideo.c,v
retrieving revision 1.510
diff -u -r1.510 mpegvideo.c
--- libavcodec/mpegvideo.c	6 Feb 2006 11:21:26 -0000	1.510
+++ libavcodec/mpegvideo.c	18 Feb 2006 01:51:58 -0000
@@ -3907,17 +3907,16 @@
                         MPV_motion_lowres(s, dest_y, dest_cb, dest_cr, 1, s->next_picture.data, op_pix);
                     }
                 }else{
+                    op_qpix= s->me.qpel_put;
                     if ((!s->no_rounding) || s->pict_type==B_TYPE){
                         op_pix = s->dsp.put_pixels_tab;
-                        op_qpix= s->dsp.put_qpel_pixels_tab;
                     }else{
                         op_pix = s->dsp.put_no_rnd_pixels_tab;
-                        op_qpix= s->dsp.put_no_rnd_qpel_pixels_tab;
                     }
                     if (s->mv_dir & MV_DIR_FORWARD) {
                         MPV_motion(s, dest_y, dest_cb, dest_cr, 0, s->last_picture.data, op_pix, op_qpix);
                         op_pix = s->dsp.avg_pixels_tab;
-                        op_qpix= s->dsp.avg_qpel_pixels_tab;
+                        op_qpix= s->me.qpel_avg;
                     }
                     if (s->mv_dir & MV_DIR_BACKWARD) {
                         MPV_motion(s, dest_y, dest_cb, dest_cr, 1, s->next_picture.data, op_pix, op_qpix);
Index: i386/dsputil_mmx.c
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/i386/dsputil_mmx.c,v
retrieving revision 1.111
diff -u -r1.111 dsputil_mmx.c
--- libavcodec/i386/dsputil_mmx.c	10 Feb 2006 06:55:25 -0000	1.111
+++ libavcodec/i386/dsputil_mmx.c	18 Feb 2006 01:51:58 -0000
@@ -2403,6 +2403,51 @@
     c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
     c->avg_ ## postfix1 = avg_ ## postfix2;
 
+/***********************************/
+/* bilinear qpel: not compliant to any spec, only for -lavdopts fast; generates the mcXY functions (X = horizontal, Y = vertical quarter-pel offset) from the pixels _x2/_y2/_xy2 routines and the 2tap _l3 helpers */
+
+#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL) /* defines mcXY as a direct call to OPNAME pixels SIZE HPEL with h = SIZE */\
+static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
+}
+#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2) /* defines mcXY as a call to the _l3 helper: base pointer src+S0, extra sample offsets S1 and S2 */\
+static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
+}
+
+#define QPEL_2TAP(OPNAME, SIZE, MMX) /* instantiates all 16 mcXY functions for one OPNAME / SIZE / cpu-suffix combination */\
+static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX = /* mc00 is an alias of the existing qpel mc00 function */\
+                          OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
+QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
+QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
+QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx) /* NOTE(review): suffix is hard-coded to _mmx here rather than pasted from MMX -- confirm this is intended */\
+QPEL_2TAP_XY(OPNAME, SIZE, MMX, 21, _x2_ ## MMX) /* mc21 calls the same routine as mc20 */\
+QPEL_2TAP_XY(OPNAME, SIZE, MMX, 12, _y2_ ## MMX) /* mc12 calls the same routine as mc02 */\
+static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* mc32: the _y2 routine applied one byte to the right */\
+    OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
+}\
+static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){ /* mc23: the _x2 routine applied one row down */\
+    OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
+}\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0,         1,       0)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1,        -1,       0)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0,         stride,  0) /* 'stride' here and below expands inside the generated function, so it names that function's stride parameter */\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride,   -stride,  0)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0,         stride,  1)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1,         stride, -1)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride,   -stride,  1)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
+
+QPEL_2TAP(put_, 16, mmx2) /* put/avg x 16/8 for the mmx2 suffix */
+QPEL_2TAP(avg_, 16, mmx2)
+QPEL_2TAP(put_,  8, mmx2)
+QPEL_2TAP(avg_,  8, mmx2)
+QPEL_2TAP(put_, 16, 3dnow) /* put/avg x 16/8 for the 3dnow suffix */
+QPEL_2TAP(avg_, 16, 3dnow)
+QPEL_2TAP(put_,  8, 3dnow)
+QPEL_2TAP(avg_,  8, 3dnow)
+
+
 static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
     long i=0;
 
@@ -2829,6 +2874,11 @@
             dspfunc(avg_h264_qpel, 0, 16);
             dspfunc(avg_h264_qpel, 1, 8);
             dspfunc(avg_h264_qpel, 2, 4);
+
+            dspfunc(put_2tap_qpel, 0, 16);
+            dspfunc(put_2tap_qpel, 1, 8);
+            dspfunc(avg_2tap_qpel, 0, 16);
+            dspfunc(avg_2tap_qpel, 1, 8);
 #undef dspfunc
 
             c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2;
@@ -2943,6 +2993,11 @@
             dspfunc(avg_h264_qpel, 1, 8);
             dspfunc(avg_h264_qpel, 2, 4);
 
+            dspfunc(put_2tap_qpel, 0, 16);
+            dspfunc(put_2tap_qpel, 1, 8);
+            dspfunc(avg_2tap_qpel, 0, 16);
+            dspfunc(avg_2tap_qpel, 1, 8);
+
             c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow;
             c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
         }
Index: i386/dsputil_mmx_avg.h
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/i386/dsputil_mmx_avg.h,v
retrieving revision 1.29
diff -u -r1.29 dsputil_mmx_avg.h
--- libavcodec/i386/dsputil_mmx_avg.h	12 Jan 2006 22:43:17 -0000	1.29
+++ libavcodec/i386/dsputil_mmx_avg.h	18 Feb 2006 01:51:59 -0000
@@ -818,3 +818,86 @@
     DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
 }
 
+static void DEF(put_2tap_qpel16_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2) /* writes rows of 16 bytes to dst, each PAVGB(PAVGB(src+off1, src+off2), src); src and dst both step by stride */
+{
+    asm volatile(
+        "1:                    \n\t" /* top of the per-row loop; there is no check before the first pass, so this assumes h > 0 */
+        "movq   (%1,%2), %%mm0 \n\t" /* mm0 = bytes 0-7 at src+off1 */
+        "movq  8(%1,%2), %%mm1 \n\t" /* mm1 = bytes 8-15 at src+off1 */
+        PAVGB"  (%1,%3), %%mm0 \n\t" /* combine with the bytes at src+off2; PAVGB is defined outside this hunk, presumably a packed byte average */
+        PAVGB" 8(%1,%3), %%mm1 \n\t"
+        PAVGB"  (%1),    %%mm0 \n\t" /* combine with the bytes at src itself, applied last */
+        PAVGB" 8(%1),    %%mm1 \n\t"
+        "movq  %%mm0,  (%1,%4) \n\t" /* %4 holds dst-src, so src+%4 is the current dst row */
+        "movq  %%mm1, 8(%1,%4) \n\t"
+        "add   %5, %1          \n\t" /* advance src by stride; the dst address follows because it is derived from src */
+        "decl  %0              \n\t" /* one row done */
+        "jnz   1b              \n\t" /* repeat until h reaches 0 */
+        :"+g"(h), "+r"(src) /* %0 = h, %1 = src; both are read and modified */
+        :"r"((long)off1), "r"((long)off2), /* %2 = off1, %3 = off2 */
+         "r"((long)(dst-src)), "r"((long)stride) /* %4 = dst-src, %5 = stride */
+        :"memory" /* NOTE(review): mm0/mm1 are written but not listed as clobbers */
+    );
+}
+
+static void DEF(avg_2tap_qpel16_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2) /* as the put variant for 16-byte rows, but the result is additionally PAVGB-combined with the bytes already in dst before being stored */
+{
+    asm volatile(
+        "1:                    \n\t" /* top of the per-row loop; there is no check before the first pass, so this assumes h > 0 */
+        "movq   (%1,%2), %%mm0 \n\t" /* mm0 = bytes 0-7 at src+off1 */
+        "movq  8(%1,%2), %%mm1 \n\t" /* mm1 = bytes 8-15 at src+off1 */
+        PAVGB"  (%1,%3), %%mm0 \n\t" /* combine with the bytes at src+off2; PAVGB is defined outside this hunk, presumably a packed byte average */
+        PAVGB" 8(%1,%3), %%mm1 \n\t"
+        PAVGB"  (%1),    %%mm0 \n\t" /* combine with the bytes at src itself */
+        PAVGB" 8(%1),    %%mm1 \n\t"
+        PAVGB"  (%1,%4), %%mm0 \n\t" /* combine with the existing dst row (src + (dst-src)) */
+        PAVGB" 8(%1,%4), %%mm1 \n\t"
+        "movq  %%mm0,  (%1,%4) \n\t" /* store back to the same dst row */
+        "movq  %%mm1, 8(%1,%4) \n\t"
+        "add   %5, %1          \n\t" /* advance src by stride; the dst address follows because it is derived from src */
+        "decl  %0              \n\t" /* one row done */
+        "jnz   1b              \n\t" /* repeat until h reaches 0 */
+        :"+g"(h), "+r"(src) /* %0 = h, %1 = src; both are read and modified */
+        :"r"((long)off1), "r"((long)off2), /* %2 = off1, %3 = off2 */
+         "r"((long)(dst-src)), "r"((long)stride) /* %4 = dst-src, %5 = stride */
+        :"memory" /* NOTE(review): mm0/mm1 are written but not listed as clobbers */
+    );
+}
+
+static void DEF(put_2tap_qpel8_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2) /* writes rows of 8 bytes to dst, each PAVGB(PAVGB(src+off1, src+off2), src); src and dst both step by stride */
+{
+    asm volatile(
+        "1:                    \n\t" /* top of the per-row loop; there is no check before the first pass, so this assumes h > 0 */
+        "movq   (%1,%2), %%mm0 \n\t" /* mm0 = 8 bytes at src+off1 */
+        PAVGB"  (%1,%3), %%mm0 \n\t" /* combine with the bytes at src+off2; PAVGB is defined outside this hunk, presumably a packed byte average */
+        PAVGB"  (%1),    %%mm0 \n\t" /* combine with the bytes at src itself, applied last */
+        "movq  %%mm0,  (%1,%4) \n\t" /* %4 holds dst-src, so src+%4 is the current dst row */
+        "add   %5, %1          \n\t" /* advance src by stride; the dst address follows because it is derived from src */
+        "decl  %0              \n\t" /* one row done */
+        "jnz   1b              \n\t" /* repeat until h reaches 0 */
+        :"+g"(h), "+r"(src) /* %0 = h, %1 = src; both are read and modified */
+        :"r"((long)off1), "r"((long)off2), /* %2 = off1, %3 = off2 */
+         "r"((long)(dst-src)), "r"((long)stride) /* %4 = dst-src, %5 = stride */
+        :"memory" /* NOTE(review): mm0 is written but not listed as a clobber */
+    );
+}
+
+static void DEF(avg_2tap_qpel8_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2) /* as the put variant for 8-byte rows, but the result is additionally PAVGB-combined with the bytes already in dst before being stored */
+{
+    asm volatile(
+        "1:                    \n\t" /* top of the per-row loop; there is no check before the first pass, so this assumes h > 0 */
+        "movq   (%1,%2), %%mm0 \n\t" /* mm0 = 8 bytes at src+off1 */
+        PAVGB"  (%1,%3), %%mm0 \n\t" /* combine with the bytes at src+off2; PAVGB is defined outside this hunk, presumably a packed byte average */
+        PAVGB"  (%1),    %%mm0 \n\t" /* combine with the bytes at src itself */
+        PAVGB"  (%1,%4), %%mm0 \n\t" /* combine with the existing dst row (src + (dst-src)) */
+        "movq  %%mm0,  (%1,%4) \n\t" /* store back to the same dst row */
+        "add   %5, %1          \n\t" /* advance src by stride; the dst address follows because it is derived from src */
+        "decl  %0              \n\t" /* one row done */
+        "jnz   1b              \n\t" /* repeat until h reaches 0 */
+        :"+g"(h), "+r"(src) /* %0 = h, %1 = src; both are read and modified */
+        :"r"((long)off1), "r"((long)off2), /* %2 = off1, %3 = off2 */
+         "r"((long)(dst-src)), "r"((long)stride) /* %4 = dst-src, %5 = stride */
+        :"memory" /* NOTE(review): mm0 is written but not listed as a clobber */
+    );
+}
+



More information about the ffmpeg-devel mailing list