Index: vf_eq2.c =================================================================== RCS file: /cvsroot/mplayer/main/libmpcodecs/vf_eq2.c,v retrieving revision 1.3 diff -d -u -r1.3 vf_eq2.c --- vf_eq2.c 3 Nov 2002 17:20:58 -0000 1.3 +++ vf_eq2.c 1 Dec 2002 21:41:39 -0000 @@ -70,7 +70,9 @@ eq2->lut[i] = 255; } else { - eq2->lut[i] = (unsigned char) (256.0 * v); + /* we divided by 255.0 so now we also multiply by 255.0, not + by 256.0. "+ 0.5" ensures proper rounding */ + eq2->lut[i] = (unsigned char) (255.0 * v + 0.5); } } } @@ -78,18 +80,46 @@ /* could inline this */ static -void process (unsigned char *dst, int dstride, unsigned char *src, int sstride, +/* src stride and dst stride are identical in this filter - we need only one + and thus save a register */ +void process (unsigned char *dst, unsigned char *src, int stride, int w, int h, unsigned char lut[256]) { int i, j; - for (j = 0; j < h; j++) { - for (i = 0; i < w; i++) { - *(dst++) = lut[*(src++)]; + /* counting backward to avoid (slower) comparison other than + (faster) index == 0 in loop where index variable is used + within the loop body and thus the compiler can't reverse it. + Technically we only need counting down for i, not for j, but + it looks more even that way */ + for (j = h - 1; j >= 0; j--) { + /* Here follows the inner loop where most time is spent */ + for (i = w - 1; i >= 0; i--) { + /* this line below is much more efficient on x86 architecture - + only one index variable needs to be updated within the inner + loop instead of three. + We rely on mov{b,zbl} (reg1,reg2),mem and similar instructions + which do the addition, pointer resolution and fetching in 1 + instruction.
Thus the inner loop has just 5 instructions now + (the following example comes from -march=pentium3 and I use + the variable names for the registers which hold them): + + movzbl (i,src),%eax + movzbl (%eax,lut),%eax + movb %al,(i,dst) + decl i + jns beginning-of-inner-loop + + by the way, the lines are now processed from right to left - + that shouldn't be a problem though. + */ + *(dst + i)= lut[*(src + i)]; } - - src += sstride - w; - dst += dstride - w; + /* since instructions that include addition of three values don't exist + on x86, we don't use j in the inner loop and instead increment src + and dst in the outer loop */ + src += stride; + dst += stride; } } @@ -117,7 +147,9 @@ dst->planes[2] = src->planes[2]; process ( - dst->planes[0], dst->stride[0], src->planes[0], src->stride[0], + /* We don't pass two stride values since they're identical + anyway */ + dst->planes[0], src->planes[0], src->stride[0], src->w, src->h, eq2->lut );