[FFmpeg-devel] snow.c optimisations

Thu Dec 29 03:36:45 CET 2011

> > > > Perhaps the y*stride could be factored out in the
> > > > ff_snow_pred_block() function, because it is very redundant?
> > > > (The same goes for the y*src_stride in ff_snow_inner_add_yblock().)
>
> With this diff, the factorisation in ff_snow_pred_block() is much more
> apparent :)
>
>
> diff --git a/libavcodec/snow.c b/libavcodec/snow.c
> index 0ce9b28..5b256d4 100644
> --- a/libavcodec/snow.c
> +++ b/libavcodec/snow.c
> @@ -288,41 +288,31 @@ static void mc_block(Plane *p, uint8_t *dst, const uint8_t *src, int stride, int
>  }
>
>  void ff_snow_pred_block(SnowContext *s, uint8_t *dst, uint8_t *tmp, int stride, int sx, int sy, int b_w, int b_h, BlockNode *block, int plane_index, int w, int h){
> +
>      if(block->type & BLOCK_INTRA){
>          int x, y;
>          const unsigned color  = block->color[plane_index];
>          const unsigned color4 = color*0x01010101;
>          if(b_w==32){
> -            for(y=0; y < b_h; y++){
> -                *(uint32_t*)&dst[0 + y*stride]= color4;
> -                *(uint32_t*)&dst[4 + y*stride]= color4;
> -                *(uint32_t*)&dst[8 + y*stride]= color4;
> -                *(uint32_t*)&dst[12+ y*stride]= color4;
> -                *(uint32_t*)&dst[16+ y*stride]= color4;
> -                *(uint32_t*)&dst[20+ y*stride]= color4;
> -                *(uint32_t*)&dst[24+ y*stride]= color4;
> -                *(uint32_t*)&dst[28+ y*stride]= color4;
> +            for(y=0; y < b_h; y++, dst += stride){
> +                memset(dst,color4, 32);
>              }
>          }else if(b_w==16){
> -            for(y=0; y < b_h; y++){
> -                *(uint32_t*)&dst[0 + y*stride]= color4;
> -                *(uint32_t*)&dst[4 + y*stride]= color4;
> -                *(uint32_t*)&dst[8 + y*stride]= color4;
> -                *(uint32_t*)&dst[12+ y*stride]= color4;
> +            for(y=0; y < b_h; y++, dst += stride){
> +                memset(dst,color4, 16);
>              }
>          }else if(b_w==8){
> -            for(y=0; y < b_h; y++){
> -                *(uint32_t*)&dst[0 + y*stride]= color4;
> -                *(uint32_t*)&dst[4 + y*stride]= color4;
> +            for(y=0; y < b_h; y++, dst += stride){
> +                memset(dst,color4, 8);
>              }
>          }else if(b_w==4){
> -            for(y=0; y < b_h; y++){
> -                *(uint32_t*)&dst[0 + y*stride]= color4;
> +            for(y=0; y < b_h; y++, dst += stride){
> +                *(uint32_t*)dst= color4;
>              }
>          }else{
> -            for(y=0; y < b_h; y++){
> +            for(y=0; y < b_h; y++, dst += stride){
>                  for(x=0; x < b_w; x++){
> -                    dst[x + y*stride]= color;
> +                    dst[x]= color;
>                  }
>              }
>          }

And this portion of the non-BLOCK_INTRA path can certainly be factorised too:

   int a_1=src[x + (HTAPS_MAX/2-4)*stride];
   int a0= src[x + (HTAPS_MAX/2-3)*stride];
   int a1= src[x + (HTAPS_MAX/2-2)*stride];
   int a2= src[x + (HTAPS_MAX/2-1)*stride];
   int a3= src[x + (HTAPS_MAX/2+0)*stride];
   int a4= src[x + (HTAPS_MAX/2+1)*stride];
   int a5= src[x + (HTAPS_MAX/2+2)*stride];
   int a6= src[x + (HTAPS_MAX/2+3)*stride];

The various (HTAPS_MAX/2 +/- something)*stride offsets are recomputed for every pixel
=> they could be computed only once per line instead (which makes this part b_w times cheaper)

@
Yannoo