[Ffmpeg-devel] [PATCH] qpel approximation
Loren Merritt
lorenm
Sat Feb 18 02:59:08 CET 2006
This patch adds approximate bilinear qpel mc functions, enabled for
B-frames only, with -lavdopts fast.
Comments welcome: does it look OK, and should it have a separate option?
16x16 mc costs, averaged over all subpel positions:
mpeg4 qpel: 1443 cycles
h264 qpel: 1148 cycles
bilinear qpel: 174 cycles
mpeg4 hpel: 99 cycles
total speedup: ~12% in mpeg4, ~5% in h264.
--Loren Merritt
-------------- next part --------------
Index: dsputil.c
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/dsputil.c,v
retrieving revision 1.134
diff -u -r1.134 dsputil.c
--- libavcodec/dsputil.c 10 Feb 2006 06:55:24 -0000 1.134
+++ libavcodec/dsputil.c 18 Feb 2006 01:51:53 -0000
@@ -4047,6 +4047,9 @@
c->try_8x8basis= try_8x8basis_c;
c->add_8x8basis= add_8x8basis_c;
+ memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
+ memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
+
#ifdef HAVE_MMX
dsputil_init_mmx(c, avctx);
#endif
@@ -4072,6 +4075,13 @@
dsputil_init_sh4(c,avctx);
#endif
+ for(i=0; i<64; i++){
+ if(!c->put_2tap_qpel_pixels_tab[0][i])
+ c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
+ if(!c->avg_2tap_qpel_pixels_tab[0][i])
+ c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
+ }
+
switch(c->idct_permutation_type){
case FF_NO_IDCT_PERM:
for(i=0; i<64; i++)
Index: dsputil.h
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/dsputil.h,v
retrieving revision 1.128
diff -u -r1.128 dsputil.h
--- libavcodec/dsputil.h 10 Feb 2006 06:55:24 -0000 1.128
+++ libavcodec/dsputil.h 18 Feb 2006 01:51:54 -0000
@@ -258,6 +258,9 @@
qpel_mc_func put_h264_qpel_pixels_tab[4][16];
qpel_mc_func avg_h264_qpel_pixels_tab[4][16];
+ qpel_mc_func put_2tap_qpel_pixels_tab[4][16];
+ qpel_mc_func avg_2tap_qpel_pixels_tab[4][16];
+
h264_weight_func weight_h264_pixels_tab[10];
h264_biweight_func biweight_h264_pixels_tab[10];
Index: h263dec.c
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/h263dec.c,v
retrieving revision 1.174
diff -u -r1.174 h263dec.c
--- libavcodec/h263dec.c 12 Jan 2006 22:43:15 -0000 1.174
+++ libavcodec/h263dec.c 18 Feb 2006 01:51:54 -0000
@@ -689,6 +689,17 @@
s->next_p_frame_damaged=0;
}
+ if((s->avctx->flags2 & CODEC_FLAG2_FAST) && s->pict_type==B_TYPE){ /* fast flag + B-frame: use the approximate 2-tap (bilinear) qpel tables */
+ s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
+ s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
+ }else if((!s->no_rounding) || s->pict_type==B_TYPE){ /* rounding case: same condition under which the mpegvideo.c code removed by this patch chose put_qpel_pixels_tab */
+ s->me.qpel_put= s->dsp.put_qpel_pixels_tab;
+ s->me.qpel_avg= s->dsp.avg_qpel_pixels_tab;
+ }else{ /* no_rounding set and not a B-frame: the removed mpegvideo.c code chose the no_rnd put table here */
+ s->me.qpel_put= s->dsp.put_no_rnd_qpel_pixels_tab;
+ s->me.qpel_avg= s->dsp.avg_qpel_pixels_tab;
+ }
+
if(MPV_frame_start(s, avctx) < 0)
return -1;
Index: h264.c
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/h264.c,v
retrieving revision 1.190
diff -u -r1.190 h264.c
--- libavcodec/h264.c 16 Feb 2006 01:16:12 -0000 1.190
+++ libavcodec/h264.c 18 Feb 2006 01:51:55 -0000
@@ -3463,8 +3463,8 @@
}
}else if(s->codec_id == CODEC_ID_H264){
hl_motion(h, dest_y, dest_cb, dest_cr,
- s->dsp.put_h264_qpel_pixels_tab, s->dsp.put_h264_chroma_pixels_tab,
- s->dsp.avg_h264_qpel_pixels_tab, s->dsp.avg_h264_chroma_pixels_tab,
+ h->s.me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
+ h->s.me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
}
@@ -4499,6 +4499,14 @@
);
}
+ if((s->avctx->flags2 & CODEC_FLAG2_FAST) && s->pict_type==B_TYPE){
+ h->s.me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
+ h->s.me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
+ }else{
+ h->s.me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
+ h->s.me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
+ }
+
return 0;
}
Index: mpegvideo.c
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/mpegvideo.c,v
retrieving revision 1.510
diff -u -r1.510 mpegvideo.c
--- libavcodec/mpegvideo.c 6 Feb 2006 11:21:26 -0000 1.510
+++ libavcodec/mpegvideo.c 18 Feb 2006 01:51:58 -0000
@@ -3907,17 +3907,16 @@
MPV_motion_lowres(s, dest_y, dest_cb, dest_cr, 1, s->next_picture.data, op_pix);
}
}else{
+ op_qpix= s->me.qpel_put;
if ((!s->no_rounding) || s->pict_type==B_TYPE){
op_pix = s->dsp.put_pixels_tab;
- op_qpix= s->dsp.put_qpel_pixels_tab;
}else{
op_pix = s->dsp.put_no_rnd_pixels_tab;
- op_qpix= s->dsp.put_no_rnd_qpel_pixels_tab;
}
if (s->mv_dir & MV_DIR_FORWARD) {
MPV_motion(s, dest_y, dest_cb, dest_cr, 0, s->last_picture.data, op_pix, op_qpix);
op_pix = s->dsp.avg_pixels_tab;
- op_qpix= s->dsp.avg_qpel_pixels_tab;
+ op_qpix= s->me.qpel_avg;
}
if (s->mv_dir & MV_DIR_BACKWARD) {
MPV_motion(s, dest_y, dest_cb, dest_cr, 1, s->next_picture.data, op_pix, op_qpix);
Index: i386/dsputil_mmx.c
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/i386/dsputil_mmx.c,v
retrieving revision 1.111
diff -u -r1.111 dsputil_mmx.c
--- libavcodec/i386/dsputil_mmx.c 10 Feb 2006 06:55:25 -0000 1.111
+++ libavcodec/i386/dsputil_mmx.c 18 Feb 2006 01:51:58 -0000
@@ -2403,6 +2403,51 @@
c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
c->avg_ ## postfix1 = avg_ ## postfix2;
+/***********************************/
+/* bilinear qpel: not compliant to any spec, only for -lavdopts fast */
+
+#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL) /* defines OPNAME##2tap_qpel##SIZE##_mc##XY##_##MMX as a thin wrapper around the OPNAME##pixels##SIZE##HPEL function */\
+static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE); /* SIZE is also passed as the last argument (presumably the block height - confirm against the pixels functions) */\
+}
+#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2) /* defines OPNAME##2tap_qpel##SIZE##_mc##XY##_##MMX by forwarding to the matching _l3_ helper */\
+static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2); /* S0 shifts the source base pointer; S1 and S2 go in as the last two arguments (presumably off1/off2 of the DEF(..._l3) helpers - confirm) */\
+}
+
+#define QPEL_2TAP(OPNAME, SIZE, MMX) /* instantiates the 2tap_qpel mcXY functions for one OPNAME/SIZE/MMX combination */\
+static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX = /* mc00 is an alias of the existing qpel mc00 function */\
+ OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
+QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX) /* mc20 -> pixels _x2 function */\
+QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX) /* mc02 -> pixels _y2 function */\
+QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx) /* mc22 -> pixels _xy2 function; note the fixed _mmx suffix, not MMX */\
+QPEL_2TAP_XY(OPNAME, SIZE, MMX, 21, _x2_ ## MMX) /* mc21 reuses the same _x2 function as mc20 */\
+QPEL_2TAP_XY(OPNAME, SIZE, MMX, 12, _y2_ ## MMX) /* mc12 reuses the same _y2 function as mc02 */\
+static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE); /* mc32: _y2 function applied one byte to the right */\
+}\
+static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE); /* mc23: _x2 function applied one row down */\
+}\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0) /* mc10: base src, offsets +1 and 0 */\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0) /* mc30: base src+1, offsets -1 and 0 */\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0) /* mc01: base src, offsets +stride and 0 */\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0) /* mc03: base src+stride, offsets -stride and 0 */\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1) /* mc11: base src, offsets +stride and +1 */\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1) /* mc31: base src+1, offsets +stride and -1 */\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1) /* mc13: base src+stride, offsets -stride and +1 */\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1) /* mc33: base src+stride+1, offsets -stride and -1 */\
+
+QPEL_2TAP(put_, 16, mmx2)
+QPEL_2TAP(avg_, 16, mmx2)
+QPEL_2TAP(put_, 8, mmx2)
+QPEL_2TAP(avg_, 8, mmx2)
+QPEL_2TAP(put_, 16, 3dnow)
+QPEL_2TAP(avg_, 16, 3dnow)
+QPEL_2TAP(put_, 8, 3dnow)
+QPEL_2TAP(avg_, 8, 3dnow)
+
+
static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
long i=0;
@@ -2829,6 +2874,11 @@
dspfunc(avg_h264_qpel, 0, 16);
dspfunc(avg_h264_qpel, 1, 8);
dspfunc(avg_h264_qpel, 2, 4);
+
+ dspfunc(put_2tap_qpel, 0, 16);
+ dspfunc(put_2tap_qpel, 1, 8);
+ dspfunc(avg_2tap_qpel, 0, 16);
+ dspfunc(avg_2tap_qpel, 1, 8);
#undef dspfunc
c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2;
@@ -2943,6 +2993,11 @@
dspfunc(avg_h264_qpel, 1, 8);
dspfunc(avg_h264_qpel, 2, 4);
+ dspfunc(put_2tap_qpel, 0, 16);
+ dspfunc(put_2tap_qpel, 1, 8);
+ dspfunc(avg_2tap_qpel, 0, 16);
+ dspfunc(avg_2tap_qpel, 1, 8);
+
c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow;
c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
}
Index: i386/dsputil_mmx_avg.h
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/i386/dsputil_mmx_avg.h,v
retrieving revision 1.29
diff -u -r1.29 dsputil_mmx_avg.h
--- libavcodec/i386/dsputil_mmx_avg.h 12 Jan 2006 22:43:17 -0000 1.29
+++ libavcodec/i386/dsputil_mmx_avg.h 18 Feb 2006 01:51:59 -0000
@@ -818,3 +818,86 @@
DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
}
+static void DEF(put_2tap_qpel16_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2) /* per row, 16 bytes: dst = PAVGB(PAVGB(src[off1], src[off2]), src); PAVGB is a macro defined outside this hunk (presumably a packed byte average - confirm) */
+{
+ asm volatile(
+ "1: \n\t" /* loop head; operands: %0=h, %1=src, %2=off1, %3=off2, %4=dst-src, %5=stride */
+ "movq (%1,%2), %%mm0 \n\t" /* mm0 = bytes 0..7 at src+off1 */
+ "movq 8(%1,%2), %%mm1 \n\t" /* mm1 = bytes 8..15 at src+off1 */
+ PAVGB" (%1,%3), %%mm0 \n\t" /* combine with src+off2 */
+ PAVGB" 8(%1,%3), %%mm1 \n\t"
+ PAVGB" (%1), %%mm0 \n\t" /* combine the result with src itself */
+ PAVGB" 8(%1), %%mm1 \n\t"
+ "movq %%mm0, (%1,%4) \n\t" /* src + (dst-src) addresses dst, so only src needs advancing */
+ "movq %%mm1, 8(%1,%4) \n\t"
+ "add %5, %1 \n\t" /* next row */
+ "decl %0 \n\t" /* row counter is tested after the body has run */
+ "jnz 1b \n\t"
+ :"+g"(h), "+r"(src)
+ :"r"((long)off1), "r"((long)off2),
+ "r"((long)(dst-src)), "r"((long)stride)
+ :"memory"
+ );
+}
+
+static void DEF(avg_2tap_qpel16_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2) /* per row, 16 bytes: dst = PAVGB(PAVGB(PAVGB(src[off1], src[off2]), src), dst); PAVGB is a macro defined outside this hunk (presumably a packed byte average - confirm) */
+{
+ asm volatile(
+ "1: \n\t" /* loop head; operands: %0=h, %1=src, %2=off1, %3=off2, %4=dst-src, %5=stride */
+ "movq (%1,%2), %%mm0 \n\t" /* mm0 = bytes 0..7 at src+off1 */
+ "movq 8(%1,%2), %%mm1 \n\t" /* mm1 = bytes 8..15 at src+off1 */
+ PAVGB" (%1,%3), %%mm0 \n\t" /* combine with src+off2 */
+ PAVGB" 8(%1,%3), %%mm1 \n\t"
+ PAVGB" (%1), %%mm0 \n\t" /* combine the result with src itself */
+ PAVGB" 8(%1), %%mm1 \n\t"
+ PAVGB" (%1,%4), %%mm0 \n\t" /* combine with the bytes already in dst (src + (dst-src)) */
+ PAVGB" 8(%1,%4), %%mm1 \n\t"
+ "movq %%mm0, (%1,%4) \n\t" /* write back to dst */
+ "movq %%mm1, 8(%1,%4) \n\t"
+ "add %5, %1 \n\t" /* next row */
+ "decl %0 \n\t" /* row counter is tested after the body has run */
+ "jnz 1b \n\t"
+ :"+g"(h), "+r"(src)
+ :"r"((long)off1), "r"((long)off2),
+ "r"((long)(dst-src)), "r"((long)stride)
+ :"memory"
+ );
+}
+
+static void DEF(put_2tap_qpel8_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2) /* per row, 8 bytes: dst = PAVGB(PAVGB(src[off1], src[off2]), src); PAVGB is a macro defined outside this hunk (presumably a packed byte average - confirm) */
+{
+ asm volatile(
+ "1: \n\t" /* loop head; operands: %0=h, %1=src, %2=off1, %3=off2, %4=dst-src, %5=stride */
+ "movq (%1,%2), %%mm0 \n\t" /* mm0 = 8 bytes at src+off1 */
+ PAVGB" (%1,%3), %%mm0 \n\t" /* combine with src+off2 */
+ PAVGB" (%1), %%mm0 \n\t" /* combine the result with src itself */
+ "movq %%mm0, (%1,%4) \n\t" /* src + (dst-src) addresses dst */
+ "add %5, %1 \n\t" /* next row */
+ "decl %0 \n\t" /* row counter is tested after the body has run */
+ "jnz 1b \n\t"
+ :"+g"(h), "+r"(src)
+ :"r"((long)off1), "r"((long)off2),
+ "r"((long)(dst-src)), "r"((long)stride)
+ :"memory"
+ );
+}
+
+static void DEF(avg_2tap_qpel8_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2) /* per row, 8 bytes: dst = PAVGB(PAVGB(PAVGB(src[off1], src[off2]), src), dst); PAVGB is a macro defined outside this hunk (presumably a packed byte average - confirm) */
+{
+ asm volatile(
+ "1: \n\t" /* loop head; operands: %0=h, %1=src, %2=off1, %3=off2, %4=dst-src, %5=stride */
+ "movq (%1,%2), %%mm0 \n\t" /* mm0 = 8 bytes at src+off1 */
+ PAVGB" (%1,%3), %%mm0 \n\t" /* combine with src+off2 */
+ PAVGB" (%1), %%mm0 \n\t" /* combine the result with src itself */
+ PAVGB" (%1,%4), %%mm0 \n\t" /* combine with the bytes already in dst (src + (dst-src)) */
+ "movq %%mm0, (%1,%4) \n\t" /* write back to dst */
+ "add %5, %1 \n\t" /* next row */
+ "decl %0 \n\t" /* row counter is tested after the body has run */
+ "jnz 1b \n\t"
+ :"+g"(h), "+r"(src)
+ :"r"((long)off1), "r"((long)off2),
+ "r"((long)(dst-src)), "r"((long)stride)
+ :"memory"
+ );
+}
+
More information about the ffmpeg-devel
mailing list