[FFmpeg-devel] snow.c optimisations
yann.lepetitcorps at free.fr
yann.lepetitcorps at free.fr
Thu Dec 29 04:07:09 CET 2011
> > > > > Perhaps the y*stride computation could be factored out in the
> > > > > ff_snow_pred_block() function, because it is very redundant?
> > > > > (The same applies to y*src_stride in ff_snow_inner_add_yblock().)
> >
> > With this diff, the factorisation in ff_snow_pred_block() is much more
> > apparent :)
[...]
> > void ff_snow_pred_block(SnowContext *s, uint8_t *dst, uint8_t *tmp, int
> > stride,
> > int sx, int sy, int b_w, int b_h, BlockNode *block, int plane_index, int w,
> > int
> > h){
> > +
> > if(block->type & BLOCK_INTRA){
> > int x, y;
> > const unsigned color = block->color[plane_index];
> > const unsigned color4 = color*0x01010101;
> > if(b_w==32){
> > - for(y=0; y < b_h; y++){
> > - *(uint32_t*)&dst[0 + y*stride]= color4;
> > - *(uint32_t*)&dst[4 + y*stride]= color4;
> > - *(uint32_t*)&dst[8 + y*stride]= color4;
> > - *(uint32_t*)&dst[12+ y*stride]= color4;
> > - *(uint32_t*)&dst[16+ y*stride]= color4;
> > - *(uint32_t*)&dst[20+ y*stride]= color4;
> > - *(uint32_t*)&dst[24+ y*stride]= color4;
> > - *(uint32_t*)&dst[28+ y*stride]= color4;
> > + for(y=0; y < b_h; y++, dst += stride){
> > + memset(dst,color4, 32);
> > }
[...]
>
> And this portion of the non-BLOCK_INTRA path can certainly be factorised too:
>
> int a_1=src[x + (HTAPS_MAX/2-4)*stride];
> int a0= src[x + (HTAPS_MAX/2-3)*stride];
> int a1= src[x + (HTAPS_MAX/2-2)*stride];
> int a2= src[x + (HTAPS_MAX/2-1)*stride];
> int a3= src[x + (HTAPS_MAX/2+0)*stride];
> int a4= src[x + (HTAPS_MAX/2+1)*stride];
> int a5= src[x + (HTAPS_MAX/2+2)*stride];
> int a6= src[x + (HTAPS_MAX/2+3)*stride];
>
> The various (HTAPS_MAX/2 +/- something)*stride offsets are recomputed for each pixel
> => they only need to be computed once per line (about b_w times less work for these offsets)
So, this finally gives this diff:
diff --git a/libavcodec/snow.c b/libavcodec/snow.c
index 0ce9b28..4aae985 100644
--- a/libavcodec/snow.c
+++ b/libavcodec/snow.c
@@ -190,16 +190,26 @@ static void mc_block(Plane *p, uint8_t *dst, const uint8_t
*src, int stride, int
tmp2= tmp2t[1];
if(b&2){
+
+ int s_1 = (HTAPS_MAX/2-4)*stride;
+ int s0 = (HTAPS_MAX/2-3)*stride;
+ int s1 = (HTAPS_MAX/2-2)*stride;
+ int s2 = (HTAPS_MAX/2-1)*stride;
+ int s3 = (HTAPS_MAX/2-0)*stride;
+ int s4 = (HTAPS_MAX/2+1)*stride;
+ int s5 = (HTAPS_MAX/2+2)*stride;
+ int s6 = (HTAPS_MAX/2+3)*stride;
+
for(y=0; y < b_h; y++){
for(x=0; x < b_w+1; x++){
- int a_1=src[x + (HTAPS_MAX/2-4)*stride];
- int a0= src[x + (HTAPS_MAX/2-3)*stride];
- int a1= src[x + (HTAPS_MAX/2-2)*stride];
- int a2= src[x + (HTAPS_MAX/2-1)*stride];
- int a3= src[x + (HTAPS_MAX/2+0)*stride];
- int a4= src[x + (HTAPS_MAX/2+1)*stride];
- int a5= src[x + (HTAPS_MAX/2+2)*stride];
- int a6= src[x + (HTAPS_MAX/2+3)*stride];
+ int a_1=src[x + s_1];
+ int a0= src[x + s0];
+ int a1= src[x + s1];
+ int a2= src[x + s2];
+ int a3= src[x + s3];
+ int a4= src[x + s4];
+ int a5= src[x + s5];
+ int a6= src[x + s6];
int am=0;
if(!p || p->fast_mc)
am= (20*(a2+a3) - 5*(a1+a4) + (a0+a5) + 16)>>5;
@@ -218,16 +228,26 @@ static void mc_block(Plane *p, uint8_t *dst, const uint8_t
*src, int stride, int
tmp2= tmp2t[2];
tmpI= tmpIt;
if(b&4){
+
+ int s_1 = (HTAPS_MAX/2-4)*64;
+ int s0 = (HTAPS_MAX/2-3)*64;
+ int s1 = (HTAPS_MAX/2-2)*64;
+ int s2 = (HTAPS_MAX/2-1)*64;
+ int s3 = (HTAPS_MAX/2-0)*64;
+ int s4 = (HTAPS_MAX/2+1)*64;
+ int s5 = (HTAPS_MAX/2+2)*64;
+ int s6 = (HTAPS_MAX/2+3)*64;
+
for(y=0; y < b_h; y++){
for(x=0; x < b_w; x++){
- int a_1=tmpI[x + (HTAPS_MAX/2-4)*64];
- int a0= tmpI[x + (HTAPS_MAX/2-3)*64];
- int a1= tmpI[x + (HTAPS_MAX/2-2)*64];
- int a2= tmpI[x + (HTAPS_MAX/2-1)*64];
- int a3= tmpI[x + (HTAPS_MAX/2+0)*64];
- int a4= tmpI[x + (HTAPS_MAX/2+1)*64];
- int a5= tmpI[x + (HTAPS_MAX/2+2)*64];
- int a6= tmpI[x + (HTAPS_MAX/2+3)*64];
+ int a_1=tmpI[x + s_1];
+ int a0= tmpI[x + s0];
+ int a1= tmpI[x + s1];
+ int a2= tmpI[x + s2];
+ int a3= tmpI[x + s3];
+ int a4= tmpI[x + s4];
+ int a5= tmpI[x + s5];
+ int a6= tmpI[x + s6];
int am=0;
if(!p || p->fast_mc)
am= (20*(a2+a3) - 5*(a1+a4) + (a0+a5) + 512)>>10;
@@ -288,42 +308,35 @@ static void mc_block(Plane *p, uint8_t *dst, const uint8_t
*src, int stride, int
}
void ff_snow_pred_block(SnowContext *s, uint8_t *dst, uint8_t *tmp, int stride,
int sx, int sy, int b_w, int b_h, BlockNode *block, int plane_index, int w, int
h){
+
if(block->type & BLOCK_INTRA){
int x, y;
const unsigned color = block->color[plane_index];
const unsigned color4 = color*0x01010101;
if(b_w==32){
- for(y=0; y < b_h; y++){
- *(uint32_t*)&dst[0 + y*stride]= color4;
- *(uint32_t*)&dst[4 + y*stride]= color4;
- *(uint32_t*)&dst[8 + y*stride]= color4;
- *(uint32_t*)&dst[12+ y*stride]= color4;
- *(uint32_t*)&dst[16+ y*stride]= color4;
- *(uint32_t*)&dst[20+ y*stride]= color4;
- *(uint32_t*)&dst[24+ y*stride]= color4;
- *(uint32_t*)&dst[28+ y*stride]= color4;
+ for(y=0; y < b_h; y++, dst += stride){
+ memset(dst,color4, 32);
}
}else if(b_w==16){
- for(y=0; y < b_h; y++){
- *(uint32_t*)&dst[0 + y*stride]= color4;
- *(uint32_t*)&dst[4 + y*stride]= color4;
- *(uint32_t*)&dst[8 + y*stride]= color4;
- *(uint32_t*)&dst[12+ y*stride]= color4;
+ for(y=0; y < b_h; y++, dst += stride){
+ memset(dst,color4, 16);
}
}else if(b_w==8){
- for(y=0; y < b_h; y++){
- *(uint32_t*)&dst[0 + y*stride]= color4;
- *(uint32_t*)&dst[4 + y*stride]= color4;
+ for(y=0; y < b_h; y++, dst += stride){
+ memset(dst,color4, 8);
}
}else if(b_w==4){
- for(y=0; y < b_h; y++){
- *(uint32_t*)&dst[0 + y*stride]= color4;
+ for(y=0; y < b_h; y++, dst += stride){
+ *(uint32_t*)dst= color4;
}
}else{
- for(y=0; y < b_h; y++){
+ for(y=0; y < b_h; y++, dst += stride){
+/*
for(x=0; x < b_w; x++){
- dst[x + y*stride]= color;
+ dst[x]= color;
}
+*/
+ memset(dst, color, b_w);
}
}
}else{
@+
Yannoo
More information about the ffmpeg-devel
mailing list