Index: vf_eq2.c
===================================================================
RCS file: /cvsroot/mplayer/main/libmpcodecs/vf_eq2.c,v
retrieving revision 1.3
diff -d -u -r1.3 vf_eq2.c
--- vf_eq2.c	3 Nov 2002 17:20:58 -0000	1.3
+++ vf_eq2.c	1 Dec 2002 21:41:39 -0000
@@ -70,7 +70,9 @@
         eq2->lut[i] = 255;
       }
       else {
-        eq2->lut[i] = (unsigned char) (256.0 * v);
+        /* we divided by 255.0 so now we also multiply by 255.0, not
+           by 256.0. "+ 0.5" ensures proper rounding */
+        eq2->lut[i] = (unsigned char) (255.0 * v + 0.5);
       }
     }
   }
@@ -78,18 +80,46 @@
 
 /* could inline this */
 static
-void process (unsigned char *dst, int dstride, unsigned char *src, int sstride,
+/* src stride and dst stride are identical in this filter - we need only one
+   and thus save a register */
+void process (unsigned char *dst, unsigned char *src, int stride,
   int w, int h, unsigned char lut[256])
 {
   int i, j;
 
-  for (j = 0; j < h; j++) {
-    for (i = 0; i < w; i++) {
-      *(dst++) = lut[*(src++)];
+  /* Count backward so the loop test is a (faster) comparison with
+     zero rather than a (slower) comparison with another value. The
+     index is used inside the loop body, so the compiler cannot
+     reverse the loop itself. Strictly, only i needs to count down,
+     not j, but doing both keeps the two loops looking alike. */
+  for (j = h - 1; j >= 0; j--) {
+    /* Here follows the inner loop where most time is spent */
+    for (i = w - 1; i >= 0; i--) {
+      /* the line below is much more efficient on the x86 architecture -
+         only one index variable needs to be updated within the inner
+         loop instead of three.
+         We rely on mov{b,zbl} (reg1,reg2),mem and similar instructions
+         which do the addition, pointer resolution and fetching in 1
+         instruction. Thus the inner loop has just 5 instructions now
+         (the following example comes from -march=pentium3 and I use
+          the variable names for the registers which hold them):
+       
+         movzbl (i,src),%eax
+         movzbl (%eax,lut),%eax
+         movb %al,(i,dst)
+         decl i
+         jns beginning-of-inner-loop
+       
+         By the way, each line is now processed from right to left -
+         that shouldn't be a problem though.
+      */
+      *(dst + i)= lut[*(src + i)];
     }
-
-    src += sstride - w;
-    dst += dstride - w;
+    /* x86 has no addressing mode that adds three variable values, so
+       j is kept out of the inner loop; src and dst are instead advanced
+       by one stride per row in the outer loop */
+    src += stride;
+    dst += stride;
   }
 }
 
@@ -117,7 +147,9 @@
   dst->planes[2] = src->planes[2];
 
   process (
-    dst->planes[0], dst->stride[0], src->planes[0], src->stride[0],
+    /* Only src->stride[0] is passed: process() assumes dst uses the
+       same stride. NOTE(review): confirm dst->stride[0] always matches */
+    dst->planes[0], src->planes[0], src->stride[0],
     src->w, src->h, eq2->lut
   );