[FFmpeg-devel] snow.c optimisations

Thu Dec 29 04:07:09 CET 2011

> > > > > Perhaps the y*stride could be factored out in the
> > > > > ff_snow_pred_block() func, because it is very redundant?
> > > > > (the same goes for the y*src_stride in ff_snow_inner_add_yblock())
> >
> > With this diff, the factorisation in ff_snow_pred_block() is much more
> > apparent :)
[...]
> >  void ff_snow_pred_block(SnowContext *s, uint8_t *dst, uint8_t *tmp, int
> > stride,
> > int sx, int sy, int b_w, int b_h, BlockNode *block, int plane_index, int w,
> > int
> > h){
> > +
> >      if(block->type & BLOCK_INTRA){
> >          int x, y;
> >          const unsigned color  = block->color[plane_index];
> >          const unsigned color4 = color*0x01010101;
> >          if(b_w==32){
> > -            for(y=0; y < b_h; y++){
> > -                *(uint32_t*)&dst[0 + y*stride]= color4;
> > -                *(uint32_t*)&dst[4 + y*stride]= color4;
> > -                *(uint32_t*)&dst[8 + y*stride]= color4;
> > -                *(uint32_t*)&dst[12+ y*stride]= color4;
> > -                *(uint32_t*)&dst[16+ y*stride]= color4;
> > -                *(uint32_t*)&dst[20+ y*stride]= color4;
> > -                *(uint32_t*)&dst[24+ y*stride]= color4;
> > -                *(uint32_t*)&dst[28+ y*stride]= color4;
> > +            for(y=0; y < b_h; y++, dst += stride){
> > +                memset(dst,color4, 32);
> >              }
[...]
>
> And this portion in the non-BLOCK_INTRA case can certainly be factorised too:
>
>    int a_1=src[x + (HTAPS_MAX/2-4)*stride];
>    int a0= src[x + (HTAPS_MAX/2-3)*stride];
>    int a1= src[x + (HTAPS_MAX/2-2)*stride];
>    int a2= src[x + (HTAPS_MAX/2-1)*stride];
>    int a3= src[x + (HTAPS_MAX/2+0)*stride];
>    int a4= src[x + (HTAPS_MAX/2+1)*stride];
>    int a5= src[x + (HTAPS_MAX/2+2)*stride];
>    int a6= src[x + (HTAPS_MAX/2+3)*stride];
>
> The various (HTAPS_MAX/2 +/- something)*stride offsets were recomputed for
> each pixel => they only need to be computed once per line (b_w times fewer
> recomputations)

So, this finally gives this diff:

diff --git a/libavcodec/snow.c b/libavcodec/snow.c
index 0ce9b28..4aae985 100644
--- a/libavcodec/snow.c
+++ b/libavcodec/snow.c
@@ -190,16 +190,26 @@ static void mc_block(Plane *p, uint8_t *dst, const uint8_t
*src, int stride, int
     tmp2= tmp2t[1];

     if(b&2){
+
+        int  s_1 = (HTAPS_MAX/2-4)*stride;
+        int  s0 = (HTAPS_MAX/2-3)*stride;
+        int  s1 = (HTAPS_MAX/2-2)*stride;
+        int  s2 = (HTAPS_MAX/2-1)*stride;
+        int  s3 = (HTAPS_MAX/2-0)*stride;
+        int  s4 = (HTAPS_MAX/2+1)*stride;
+        int  s5 = (HTAPS_MAX/2+2)*stride;
+        int  s6 = (HTAPS_MAX/2+3)*stride;
+
         for(y=0; y < b_h; y++){
             for(x=0; x < b_w+1; x++){
-                int a_1=src[x + (HTAPS_MAX/2-4)*stride];
-                int a0= src[x + (HTAPS_MAX/2-3)*stride];
-                int a1= src[x + (HTAPS_MAX/2-2)*stride];
-                int a2= src[x + (HTAPS_MAX/2-1)*stride];
-                int a3= src[x + (HTAPS_MAX/2+0)*stride];
-                int a4= src[x + (HTAPS_MAX/2+1)*stride];
-                int a5= src[x + (HTAPS_MAX/2+2)*stride];
-                int a6= src[x + (HTAPS_MAX/2+3)*stride];
+                int a_1=src[x + s_1];
+                int a0= src[x + s0];
+                int a1= src[x + s1];
+                int a2= src[x + s2];
+                int a3= src[x + s3];
+                int a4= src[x + s4];
+                int a5= src[x + s5];
+                int a6= src[x + s6];
                 int am=0;
                 if(!p || p->fast_mc)
                     am= (20*(a2+a3) - 5*(a1+a4) + (a0+a5) + 16)>>5;
@@ -218,16 +228,26 @@ static void mc_block(Plane *p, uint8_t *dst, const uint8_t
*src, int stride, int
     tmp2= tmp2t[2];
     tmpI= tmpIt;
     if(b&4){
+
+        int  s_1 = (HTAPS_MAX/2-4)*64;
+        int  s0 = (HTAPS_MAX/2-3)*64;
+        int  s1 = (HTAPS_MAX/2-2)*64;
+        int  s2 = (HTAPS_MAX/2-1)*64;
+        int  s3 = (HTAPS_MAX/2-0)*64;
+        int  s4 = (HTAPS_MAX/2+1)*64;
+        int  s5 = (HTAPS_MAX/2+2)*64;
+        int  s6 = (HTAPS_MAX/2+3)*64;
+
         for(y=0; y < b_h; y++){
             for(x=0; x < b_w; x++){
-                int a_1=tmpI[x + (HTAPS_MAX/2-4)*64];
-                int a0= tmpI[x + (HTAPS_MAX/2-3)*64];
-                int a1= tmpI[x + (HTAPS_MAX/2-2)*64];
-                int a2= tmpI[x + (HTAPS_MAX/2-1)*64];
-                int a3= tmpI[x + (HTAPS_MAX/2+0)*64];
-                int a4= tmpI[x + (HTAPS_MAX/2+1)*64];
-                int a5= tmpI[x + (HTAPS_MAX/2+2)*64];
-                int a6= tmpI[x + (HTAPS_MAX/2+3)*64];
+                int a_1=tmpI[x + s_1];
+                int a0= tmpI[x + s0];
+                int a1= tmpI[x + s1];
+                int a2= tmpI[x + s2];
+                int a3= tmpI[x + s3];
+                int a4= tmpI[x + s4];
+                int a5= tmpI[x + s5];
+                int a6= tmpI[x + s6];
                 int am=0;
                 if(!p || p->fast_mc)
                     am= (20*(a2+a3) - 5*(a1+a4) + (a0+a5) + 512)>>10;
@@ -288,42 +308,35 @@ static void mc_block(Plane *p, uint8_t *dst, const uint8_t
*src, int stride, int
 }

 void ff_snow_pred_block(SnowContext *s, uint8_t *dst, uint8_t *tmp, int stride,
int sx, int sy, int b_w, int b_h, BlockNode *block, int plane_index, int w, int
h){
+
     if(block->type & BLOCK_INTRA){
         int x, y;
         const unsigned color  = block->color[plane_index];
         const unsigned color4 = color*0x01010101;
         if(b_w==32){
-            for(y=0; y < b_h; y++){
-                *(uint32_t*)&dst[0 + y*stride]= color4;
-                *(uint32_t*)&dst[4 + y*stride]= color4;
-                *(uint32_t*)&dst[8 + y*stride]= color4;
-                *(uint32_t*)&dst[12+ y*stride]= color4;
-                *(uint32_t*)&dst[16+ y*stride]= color4;
-                *(uint32_t*)&dst[20+ y*stride]= color4;
-                *(uint32_t*)&dst[24+ y*stride]= color4;
-                *(uint32_t*)&dst[28+ y*stride]= color4;
+            for(y=0; y < b_h; y++, dst += stride){
+                memset(dst,color4, 32);
             }
         }else if(b_w==16){
-            for(y=0; y < b_h; y++){
-                *(uint32_t*)&dst[0 + y*stride]= color4;
-                *(uint32_t*)&dst[4 + y*stride]= color4;
-                *(uint32_t*)&dst[8 + y*stride]= color4;
-                *(uint32_t*)&dst[12+ y*stride]= color4;
+            for(y=0; y < b_h; y++, dst += stride){
+                memset(dst,color4, 16);
             }
         }else if(b_w==8){
-            for(y=0; y < b_h; y++){
-                *(uint32_t*)&dst[0 + y*stride]= color4;
-                *(uint32_t*)&dst[4 + y*stride]= color4;
+            for(y=0; y < b_h; y++, dst += stride){
+                memset(dst,color4, 8);
             }
         }else if(b_w==4){
-            for(y=0; y < b_h; y++){
-                *(uint32_t*)&dst[0 + y*stride]= color4;
+            for(y=0; y < b_h; y++, dst += stride){
+                *(uint32_t*)dst= color4;
             }
         }else{
-            for(y=0; y < b_h; y++){
+            for(y=0; y < b_h; y++, dst += stride){
+/*
                 for(x=0; x < b_w; x++){
-                    dst[x + y*stride]= color;
+                    dst[x]= color;
                 }
+*/
+                memset(dst, color, b_w);
             }
         }
     }else{


@+
Yannoo