[Ffmpeg-devel] [PATCH] snow cached halfpel

Loren Merritt lorenm
Mon Mar 13 08:02:48 CET 2006


Note: this patch is not ready for inclusion; I am just posting it as-is
for anyone who cares.

Stores halfpel interpolated frames, so that each call to hpel mc is just a 
copy, and each qpel mc is just a pavgb. This makes encoding with obme+qpel 
9% faster, and obme+hpel 6% faster. But due to the extra branch in 
pred_block, decoding is 1% slower.

I expect that even better encoding speed gains can be had using the same 
method in mpeg4+qpel and snow+epzs, but that will involve much more 
invasive changes.

--Loren Merritt
-------------- next part --------------
Index: snow.c
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/snow.c,v
retrieving revision 1.91
diff -u -r1.91 snow.c
--- snow.c	13 Mar 2006 01:27:13 -0000	1.91
+++ snow.c	13 Mar 2006 06:29:00 -0000
@@ -445,7 +445,7 @@
     AVFrame new_picture;
     AVFrame input_picture;              ///< new_picture with the internal linesizes
     AVFrame current_picture;
-    AVFrame last_picture;
+    Picture last_picture;
     AVFrame mconly_picture;
 //     uint8_t q_context[16];
     uint8_t header_state[32];
@@ -2526,7 +2526,31 @@
         assert(tab_index>=0 && tab_index<4 || b_w==32);
         if((dx&3) || (dy&3) || !(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h) || (b_w&(b_w-1)))
             mc_block(dst, src, tmp, stride, b_w, b_h, dx, dy);
-        else if(b_w==32){
+        else if(s->last_picture.interpolated[0] && plane_index==0 && src!=tmp+MB_SIZE){
+            const uint8_t *hpels[4] = {
+                s->last_picture.data[0],
+                s->last_picture.interpolated[0],
+                s->last_picture.interpolated[1],
+                s->last_picture.interpolated[2]};
+            static const int h0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
+            static const int h1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
+            int offset = (sx+2) + (sy+2)*stride;
+            uint8_t *src0 = hpels[h0[dy+(dx>>2)]] + offset + (dy>=12)*stride;
+            uint8_t *src1 = hpels[h1[dy+(dx>>2)]] + offset + (dx>=12);
+            int qpel = (dx|dy)&7;
+            if(b_w==32){
+                s->dsp.put_pixels_tab[0][0](dst, src0, stride, b_h);
+                s->dsp.put_pixels_tab[0][0](dst+16, src0+16, stride, b_h);
+                if(qpel){
+                    s->dsp.avg_pixels_tab[0][0](dst, src1, stride, b_h);
+                    s->dsp.avg_pixels_tab[0][0](dst+16, src1+16, stride, b_h);
+                }
+            }else{
+                s->dsp.put_pixels_tab[tab_index][0](dst, src0, stride, b_h);
+                if(qpel)
+                    s->dsp.avg_pixels_tab[tab_index][0](dst, src1, stride, b_h);
+            }
+        }else if(b_w==32){
             int y;
             for(y=0; y<b_h; y+=16){
                 s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + y*stride, src + 2 + (y+2)*stride,stride);
@@ -3839,7 +3863,7 @@
     s->dsp.put_no_rnd_pixels_tab[1][dy/4+dx/8]=\
         mc_block_hpel ## dx ## dy ## 8;
 
-    mcfh(0, 0)
+//  mcfh(0, 0)
     mcfh(8, 0)
     mcfh(0, 8)
     mcfh(8, 8)
@@ -4012,6 +4036,19 @@
 
     s->avctx->get_buffer(s->avctx, &s->input_picture);
 
+    //FIXME hpel caching would help with any ME method, but requires more changes in motion_est.c
+    if(s->avctx->me_method == ME_ITER && !(avctx->flags&CODEC_FLAG_EMU_EDGE))
+    {
+        int size, offset, i;
+        int w= avctx->width;
+        int h= avctx->height;
+        avcodec_align_dimensions(avctx, &w, &h);
+        w+= EDGE_WIDTH*2;
+        h+= EDGE_WIDTH*2;
+        for(i=0; i<3; i++)
+            s->last_picture.interpolated[i]= av_malloc(w*h) + (w+1)*EDGE_WIDTH;
+    }
+
     return 0;
 }
 
@@ -4026,10 +4063,26 @@
         draw_edges(s->current_picture.data[2], s->current_picture.linesize[2], w>>1, h>>1, EDGE_WIDTH/2);
     }
 
-    tmp= s->last_picture;
-    s->last_picture= s->current_picture;
+    tmp= *(AVFrame*)&s->last_picture;
+    *(AVFrame*)&s->last_picture= s->current_picture;
     s->current_picture= tmp;
 
+    if(s->last_picture.data[0] && s->last_picture.interpolated[0]){
+        int x,y,i;
+        int stride = s->last_picture.linesize[0];
+        uint8_t **dst = s->last_picture.interpolated;
+        assert(EDGE_WIDTH >= 12);
+        for(y=-8; y<h+4; y+=8)
+            for(x=-8; x<w+4; x+=8){
+                uint8_t *src= s->last_picture.data[0]+x+y*stride;
+                s->dsp.put_h264_qpel_pixels_tab[1][ 2](dst[0]+x+y*stride, src, stride);
+                s->dsp.put_h264_qpel_pixels_tab[1][ 8](dst[1]+x+y*stride, src, stride);
+                s->dsp.put_h264_qpel_pixels_tab[1][10](dst[2]+x+y*stride, src, stride);
+            }
+        for(i=0; i<3; i++)
+            draw_edges(dst[i]-3-3*stride, stride, w+6, h+6, EDGE_WIDTH-3);
+    }
+
     s->current_picture.reference= 1;
     if(s->avctx->get_buffer(s->avctx, &s->current_picture) < 0){
         av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
@@ -4221,7 +4274,7 @@
     }
 
     if(s->last_picture.data[0])
-        avctx->release_buffer(avctx, &s->last_picture);
+        avctx->release_buffer(avctx, (AVFrame*)&s->last_picture);
 
     s->current_picture.coded_picture_number = avctx->frame_number;
     s->current_picture.pict_type = pict->pict_type;
@@ -4264,6 +4317,13 @@
             }
         }
     }
+
+    if(s->last_picture.interpolated[0] && s->last_picture.linesize[0])
+    {
+        int i;
+        for(i=0; i<3; i++)
+            av_free(s->last_picture.interpolated[i] - (s->last_picture.linesize[0]+1)*EDGE_WIDTH);
+    }
 }
 
 static int encode_end(AVCodecContext *avctx)
@@ -4429,7 +4489,7 @@
     emms_c();
 
     if(s->last_picture.data[0])
-        avctx->release_buffer(avctx, &s->last_picture);
+        avctx->release_buffer(avctx, (AVFrame*)&s->last_picture);
 
 if(!(s->avctx->debug&2048))
     *picture= s->current_picture;



More information about the ffmpeg-devel mailing list