[MPlayer-dev-eng] Improved remove-logo filter

Fri Sep 15 03:38:32 CEST 2006

On Thu, 14 Sep 2006, Uoti Urpala wrote:
> On Thu, 2006-09-14 at 00:53 -0700, Trent Piepho wrote:
> > BTW, mplayer does require gcc 3/4, the vf_yadif.c filter that was just
> > added uses the symbolic names for asm operands extension, which was added
> > in gcc 3.1.
>
> Actually not, it has code to disable asm under older gcc versions:
>
> +#if __GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ == 0)
> +#undef HAVE_MMX
> +#endif

I suppose it does work under gcc 2.95 then, for loose definitions of
"works".  Somehow, I doubt Rich would have been satisfied if I had "fixed"
compilation on gcc 2.95 in the same manner.

Being able to use symbolic operand names sure would be nice, as renumbering
all the %N values every time you change the asm block's parameters is a huge PITA.

On Thu, 14 Sep 2006, Zuxy Meng wrote:
> MOVD mmreg, r32 is slow on AMD CPUs, maybe use "m" instead of "r" for
> sum will be faster?

I changed the asm block to:

#define MASKSUM8(data, mask) ({ uint32_t _sum; \
        asm("pandn %2, %1\n\t" "psadbw %2, %1" \
            : "=y" (_sum) : "0" (*(uint64_t*)mask), "ym" (*(uint64_t*)data)); \
        _sum; })

Gcc will produce this code for the relevant parts:
        movd    %mm0, 28(%esp)  # _sum,
        movl    28(%esp), %esi  #,
        addl    %esi, %edi      #, accumulator

In this case, I don't think gcc is using "movd mmreg, mem ; movl mem, reg"
because it's faster, but because gcc isn't smart enough to try "movd mmreg,
reg".

It does benchmark slightly faster: the MMX code is about 2.6% faster, and the
filter is overall about 1.9% faster.

Of course, wouldn't it be even faster to put accumulator into an MMX
register, and then just use paddd %[sum], %[accumulator]?  That avoids the
movd/movl entirely.  paddd isn't much slower than addl, is it?

I tried that, telling gcc to input/output accumulator in an mmx register,
and doing the add myself with paddd.  For some reason, gcc thinks it needs
to save the mmx register to memory and then load it again.  So, it ends up
being slower.

So I tried writing the inner loop (over one line) in asm so accumulator
would be kept in mm1 for the whole loop.  Gcc still spills and loads
accumulator for no reason on each outer loop (for each line).  This ended
up being about the same speed.

So then I wrote both loops in asm!  Now the MMX part is 6.4% faster and the
filter is overall 3.8% faster.

It was a pain for such a small improvement.  And who knows what the
difference is for other CPUs?
-------------- next part --------------
Index: libmpcodecs/vf_remove_logo.c
===================================================================

--- libmpcodecs/vf_remove_logo.c	(revision 19836)
+++ libmpcodecs/vf_remove_logo.c	(working copy)
@@ -1,5 +1,6 @@
 /*
 Copyright 2005 Robert Edele.
+Copyright 2006 Trent Piepho.
 
 e-mail: yartrebo at earthlink.net
 
@@ -27,7 +28,7 @@
 
 /**
  * \file vf_remove_logo.c
- * 
+ *
  * \brief Advanced blur-based logo removing filter.
 
  *     Hello and welcome. This code implements a filter to remove annoying TV
@@ -92,6 +93,7 @@
 
 #include "config.h"
 #include "mp_msg.h"
+#include "cpudetect.h"
 #include "libvo/fastmemcpy.h"
 
 #include "img_format.h"
@@ -139,6 +141,13 @@
 
 } pgm_structure;
 
+typedef struct {
+  int x, y;
+  int width, height;
+  unsigned int count;
+  unsigned char *data;
+} pixel_mask;
+
 /**
  * \brief Stores persistant variables.
  *
@@ -147,11 +156,9 @@
  */
 typedef struct
 {
-  unsigned int fmt; /* Not exactly sure of the use for this. It came with the example filter I used as a basis for this, and it looks like a lot of stuff will break if I remove it. */
-  int max_mask_size; /* The largest possible mask size that will be needed with the given filter and corresponding half_size_filter. The half_size_filter can have a larger requirment in some rare (but not degenerate) cases. */
-  int * * * mask; /* Stores our collection of masks. The first * is for an array of masks, the second for the y axis, and the third for the x axis. */
-  pgm_structure * filter; /* Stores the full-size filter image. This is used to tell what pixels are in the logo or not in the luma plane. */
-  pgm_structure * half_size_filter; /* Stores a 50% width and 50% height filter image. This is used to tell what pixels are in the logo or not in the chroma planes. */
+  pixel_mask * const * masks;
+  pixel_mask * const * chroma_masks;
+  int width, height;  /* Width and hieght of the logo image, should match imput images. */
   /* These 8 variables store the bounding rectangles that the logo resides in. */
   int bounding_rectangle_posx1;
   int bounding_rectangle_posy1;
@@ -199,50 +206,50 @@
   int x; /* Temporary variables to run  */
   int y; /* through each row or column. */
   int start_x;
-  int start_y; 
+  int start_y;
   int end_x = filter->width - 1;
   int end_y = filter->height - 1;
   int did_we_find_a_logo_pixel = 0;
 
-  /* Let's find the top bound first. */
+  /* Let's find the left bound first. */
   for (start_x = 0; start_x < filter->width && !did_we_find_a_logo_pixel; start_x++)
   {
-    for (y = 0; y < filter->height; y++)
+    for (y = 0; y < filter->height && !did_we_find_a_logo_pixel; y++)
     {
-      did_we_find_a_logo_pixel |= test_filter(filter, start_x, y);
+      did_we_find_a_logo_pixel = test_filter(filter, start_x, y);
     }
   }
   start_x--;
 
-  /* Now the bottom bound. */
+  /* Top bound. */
   did_we_find_a_logo_pixel = 0;
-  for (end_x = filter->width - 1; end_x > start_x && !did_we_find_a_logo_pixel; end_x--)
+  for (start_y = 0; start_y < filter->height && !did_we_find_a_logo_pixel; start_y++)
   {
-    for (y = 0; y < filter->height; y++)
+    for (x = start_x; x < filter->width && !did_we_find_a_logo_pixel; x++)
     {
-      did_we_find_a_logo_pixel |= test_filter(filter, end_x, y);
+      did_we_find_a_logo_pixel = test_filter(filter, x, start_y);
     }
   }
-  end_x++;
+  start_y--;
 
-  /* Left bound. */
+  /* Now the right bound. */
   did_we_find_a_logo_pixel = 0;
-  for (start_y = 0; start_y < filter->height && !did_we_find_a_logo_pixel; start_y++)
+  for (end_x = filter->width - 1; end_x > start_x && !did_we_find_a_logo_pixel; end_x--)
   {
-    for (x = 0; x < filter->width; x++)
+    for (y = start_y; y < filter->height && !did_we_find_a_logo_pixel; y++)
     {
-      did_we_find_a_logo_pixel |= test_filter(filter, x, start_y);
+      did_we_find_a_logo_pixel = test_filter(filter, end_x, y);
     }
   }
-  start_y--;
+  end_x++;
 
-  /* Right bound. */
+  /* Bottom bound. */
   did_we_find_a_logo_pixel = 0;
   for (end_y = filter->height - 1; end_y > start_y && !did_we_find_a_logo_pixel; end_y--)
   {
-    for (x = 0; x < filter->width; x++)
+    for (x = start_x; x < end_x && !did_we_find_a_logo_pixel; x++)
     {
-      did_we_find_a_logo_pixel |= test_filter(filter, x, end_y);
+      did_we_find_a_logo_pixel = test_filter(filter, x, end_y);
     }
   }
   end_y++;
@@ -258,89 +265,68 @@
 /**
  * \brief Free mask memory.
  *
- * \param vf Data structure which stores our persistant data, and is to be freed.
+ * \param mask Pointer to masks to destroy.
  *
+ * \param max_mask_size How many masks to destroy.
+ *
  * We call this function when our filter is done. It will free the memory
  * allocated to the masks and leave the variables in a safe state.
  */
-static void destroy_masks(vf_instance_t * vf)
+static void destroy_masks(unsigned char * * mask, int max_mask_size)
 {
   int a, b;
 
-  /* Load values from the vf->priv struct for faster dereferencing. */
-  int * * * mask = ((vf_priv_s *)vf->priv)->mask;
-  int max_mask_size = ((vf_priv_s *)vf->priv)->max_mask_size;
-
   if (mask == NULL)
     return; /* Nothing allocated, so return before we segfault. */
 
   /* Free all allocated memory. */
   for (a = 0; a <= max_mask_size; a++) /* Loop through each mask. */
   {
-    for (b = -a; b <= a; b++) /* Loop through each scanline in a mask. */
-    {
-      free(mask[a][b + a]); /* Free a scanline. */
-    }
     free(mask[a]); /* Free a mask. */
   }
   free(mask); /* Free the array of pointers pointing to the masks. */
 
-  /* Set the pointer to NULL, so that any duplicate calls to this function will not cause a crash. */
-  ((vf_priv_s *)vf->priv)->mask = NULL;
-
   return;
 }
 
 /**
  * \brief Set up our array of masks.
  *
- * \param vf Where our filter stores persistance data, like these masks.
+ * \param max_mask_size This tells us how many masks we'll need to generate.
  *
+ * \return The masks generated
+ *
  * This creates an array of progressively larger masks and calculates their
  * values. The values will not change during program execution once this function
  * is done.
  */
-static void initialize_masks(vf_instance_t * vf)
+static unsigned char * * initialize_masks(const int max_mask_size)
 {
   int a, b, c;
+  unsigned char * maskp;
+  unsigned char * * mask = safe_malloc(sizeof(*mask) * (max_mask_size + 1));
 
-  /* Load values from the vf->priv struct for faster dereferencing. */
-  int * * * mask = ((vf_priv_s *)vf->priv)->mask;
-  int max_mask_size = ((vf_priv_s *)vf->priv)->max_mask_size; /* This tells us how many masks we'll need to generate. */
-
   /* Create a circular mask for each size up to max_mask_size. When the filter is applied, the mask size is
      determined on a pixel by pixel basis, with pixels nearer the edge of the logo getting smaller mask sizes. */
-  mask = (int * * *) safe_malloc(sizeof(int * *) * (max_mask_size + 1));
   for (a = 0; a <= max_mask_size; a++)
   {
-    mask[a] = (int * *) safe_malloc(sizeof(int *) * ((a * 2) + 1));
+    maskp = mask[a] = safe_malloc(sizeof(*maskp) * ((a * 2) + 1) * ((a * 2) + 1));
     for (b = -a; b <= a; b++)
-    {
-      mask[a][b + a] = (int *) safe_malloc(sizeof(int) * ((a * 2) + 1));
-      for (c = -a; c <= a; c++)
-      {
-        if ((b * b) + (c * c) <= (a * a)) /* Circular 0/1 mask. */
-          mask[a][b + a][c + a] = 1;
-        else
-          mask[a][b + a][c + a] = 0; 
-      }
-    }
+      for (c = -a; c <= a; c++, maskp++)
+        *maskp = (b * b) + (c * c) <= (a * a) ? 0xff : 0; /* Circular 0/1 mask. */
   }
 
-  /* Store values back to vf->priv so they aren't lost after the function returns. */
-  ((vf_priv_s *)vf->priv)->mask = mask;
-
-  return;
+  return mask;
 }
 
 /**
  * \brief Pre-processes an image to give distance information.
  *
- * \param vf Data structure that holds persistant information. All it is used for
-             in this function is to store the calculated max_mask_size variable.
  * \param mask This image will be converted from a greyscale image into a
  *             distance image.
  *
+ * \return The max mask size, could be put in the vf max_mask_size field
+ *
  * This function takes a greyscale image (pgm_structure * mask) and converts it
  * in place into a distance image. A distance image is zero for pixels ourside of
  * the logo and is the manhattan distance (|dx| + |dy|) for pixels inside of the
@@ -348,7 +334,7 @@
  * to implement than a proper pythagorean distance since I'm using a modified
  * erosion algorithm to compute the distances.
  */
-static void convert_mask_to_strength_mask(vf_instance_t * vf, pgm_structure * mask)
+static int convert_mask_to_strength_mask(pgm_structure * mask)
 {
   int x, y; /* Used by our for loops to go through every single pixel in the picture one at a time. */
   int has_anything_changed = 1; /* Used by the main while() loop to know if anything changed on the last erosion. */
@@ -356,7 +342,7 @@
                            and to get us max_mask_size later on. */
   int max_mask_size; /* This will record how large a mask the pixel that is the furthest from the edge of the logo
                            (and thus the neediest) is. */
-  char * current_pixel = mask->pixel; /* This stores the actual pixel data. */
+  unsigned char * current_pixel = mask->pixel; /* This stores the actual pixel data. */
 
   /* First pass, set all non-zero values to 1. After this loop finishes, the data should be considered numeric
      data for the filter, not color data. */
@@ -364,42 +350,53 @@
     if(*current_pixel) *current_pixel = 1;
 
   /* Second pass and future passes. For each pass, if a pixel is itself the same value as the current pass,
-     and its four neighbors are too, then it is incremented. If no pixels are incremented by the end of the pass,
+     and its four neighbors are too, then it is incremented. If any pixels are incremented by the end of the pass,
      then we go again. Edge pixels are counted as always excluded (this should be true anyway for any sane mask,
      but if it isn't this will ensure that we eventually exit). */
+  /* Excluding edge pixels does not work correctly.  A pixel that is part of the logo on the image edge will
+     always have a mask value of 1, even though it may be farther than that from a non-logo pixel.  This
+     fails when the blur operation tries to find non-logo pixels to blur within the mask size of 1, as there
+     will not be any.  To fix this pixels off the edge of the image are considered to have a value of infinity,
+     and edge pixels are processed.  A logo which covers the entire image (useless) would loop forver*/
   while (has_anything_changed)
   {
     current_pass++;
+    /* { char fn[16]; sprintf(fn, "tmp%02d.pgm", current_pass; write_pgm(fn, mask); } */
+    if (current_pass >= 255)
+    {
+	mp_msg(MSGT_VFILTER, MSGL_ERR, "Overflow calculating strength mask!\n");
+	break;
+    }
     current_pixel = mask->pixel;
 
     has_anything_changed = 0; /* If this doesn't get set by the end of this pass, then we're done. */
 
-    for (y = 1; y < mask->height - 1; y++)
+    for (y = 0; y < mask->height; y++)
     {
-      for (x = 1; x < mask->width - 1; x++)
+      for (x = 0; x < mask->width; x++)
       {
         /* Apply the in-place erosion transform. It is based on the following two premises: 1 - Any pixel that fails 1 erosion
            will fail all future erosions. 2 - Only pixels having survived all erosions up to the present will be >= to
            current_pass. It doesn't matter if it survived the current pass, failed it, or hasn't been tested yet. */
         if (*current_pixel >= current_pass && /* By using >= instead of ==, we allow the algorithm to work in place. */
-            *(current_pixel + 1) >= current_pass &&
-            *(current_pixel - 1) >= current_pass &&
-            *(current_pixel + mask->width) >= current_pass &&
-            *(current_pixel - mask->width) >= current_pass)
-         {
-           (*current_pixel)++; /* Increment the value since it still has not been eroded, as evidenced by the if statement
-                                  that just evaluated to true. */
-           has_anything_changed = 1;
-         }
+            (x == mask->width - 1 || *(current_pixel + 1) >= current_pass) &&
+            (x == 0 || *(current_pixel - 1) >= current_pass) &&
+            (y == mask->height - 1 || *(current_pixel + mask->width) >= current_pass) &&
+            (y == 0 || *(current_pixel - mask->width) >= current_pass))
+        {
+          (*current_pixel)++; /* Increment the value since it still has not been eroded, as evidenced by the if statement
+                                 that just evaluated to true. */
+          has_anything_changed = 1;
+        }
         current_pixel++;
       }
     }
   }
 
   /* Apply the fudge factor, which will increase the size of the mask a little to reduce jitter at the cost of more blur. */
-  for (y = 1; y < mask->height - 1; y++)
+  for (y = 0; y < mask->height; y++)
   {
-   for (x = 1; x < mask->width - 1; x++)
+   for (x = 0; x < mask->width; x++)
     {
       mask->pixel[(y * mask->width) + x] = apply_mask_fudge_factor(mask->pixel[(y * mask->width) + x]);
     }
@@ -408,78 +405,221 @@
   max_mask_size = current_pass + 1; /* As a side-effect, we now know the maximum mask size, which we'll use to generate our masks. */
   max_mask_size = apply_mask_fudge_factor(max_mask_size); /* Apply the fudge factor to this number too, since we must
                                                              ensure that enough masks are generated. */
-  ((vf_priv_s *)vf->priv)->max_mask_size = max_mask_size; /* Commit the newly calculated max_mask_size to the vf->priv struct. */
-
-  return;
+  return max_mask_size;
 }
 
 /**
- * \brief Our blurring function.
+ * \brief Compute masks for each pixel
  *
- * \param vf Stores persistant data. In this function we are interested in the
- *           array of masks.
- * \param value_out The properly blurred and delogoed pixel is outputted here.
- * \param logo_mask Tells us which pixels are in the logo and which aren't.
- * \param image The image that is having its logo removed.
- * \param x x-coordinate of the pixel to blur.
- * \param y y-coordinate of the pixel to blur.
- * \param plane 0 = luma, 1 = blue chroma, 2 = red chroma (YUV).
+ * \param mask_size Image created by convert_mask_to_strength_mask with mask size for each pixel
  *
- * This function is the core of the filter. It takes a pixel that is inside the
- * logo and blurs it. It does so by finding the average of all the pixels within
- * the mask and outside of the logo.
+ * \param base_masks Base circular masks used
+ *
+ * \return An array of pixel_mask pointers with each pixel's mask
+ *
+ * For every pixel in the image bounding box, creates a custom mask.  Starts with the base
+ * circular mask, clips it to the image boundary, and then removes all pixels in the logo.
+ * A minimum bounding box for this new mask is found, as well as the number of pixels in
+ * the mask.  If MMX2 is being used, the mask is padded to a multiple of 8 pixels wide.
  */
-static void get_blur(const vf_instance_t * const vf, unsigned int * const value_out, const pgm_structure * const logo_mask,
-              const mp_image_t * const image, const int x, const int y, const int plane)
+static unsigned char onepixelmask = 0xff;
+static pixel_mask * * calculate_masks(const pgm_structure * const mask_size,
+                                      unsigned char * const * const base_masks)
 {
-  int mask_size; /* Mask size tells how large a circle to use. The radius is about (slightly larger than) mask size. */
-  /* Get values from vf->priv for faster dereferencing. */
-  int * * * mask = ((vf_priv_s *)vf->priv)->mask;
+  int x, y, i, j;
+  const unsigned char *size;
+  unsigned char *in_mask, *p;
+  pixel_mask * * masks = safe_malloc(sizeof(*masks) * mask_size->width * mask_size->height);
+  pixel_mask * * mp;
 
-  int start_posx, start_posy, end_posx, end_posy;
-  int i, j;
-  unsigned int accumulator = 0, divisor = 0;
-  const unsigned char * mask_read_position; /* What pixel we are reading out of the circular blur mask. */
-  const unsigned char * logo_mask_read_position; /* What pixel we are reading out of the filter image. */
+  size = mask_size->pixel;
+  for (y = 0, mp = masks; y < mask_size->height; y++)
+  {
+    for (x = 0; x < mask_size->width; x++, size++, mp++)
+    {
+      int t, l, b, r;
+      pixel_mask *mask;
+      if(! *size) {
+	*mp = NULL;
+	continue;
+      }
 
-  /* Prepare our bounding rectangle and clip it if need be. */
-  mask_size = test_filter(logo_mask, x, y);
-  start_posx = max(0, x - mask_size);
-  start_posy = max(0, y - mask_size);
-  end_posx = min(image->width - 1, x + mask_size);
-  end_posy = min(image->height - 1, y + mask_size);
+      in_mask = safe_malloc((*size*2+1) * (*size*2+1));
+      mask = safe_malloc(sizeof(pixel_mask));
+      mask->count = 0;
 
-  mask_read_position = image->planes[plane] + (image->stride[plane] * start_posy) + start_posx;
-  logo_mask_read_position = logo_mask->pixel + (start_posy * logo_mask->width) + start_posx;
+      /* Start with a copy of the base mask */
+      memcpy(in_mask, base_masks[*size], sizeof(*in_mask) * (*size*2+1) * (*size*2+1));
 
-  for (j = start_posy; j <= end_posy; j++)
-  {
-    for (i = start_posx; i <= end_posx; i++)
-    {
-      if (!(*logo_mask_read_position) && mask[mask_size][i - start_posx][j - start_posy])
-      { /* Check to see if this pixel is in the logo or not. Only use the pixel if it is not. */
-        accumulator += *mask_read_position;
-        divisor++;
+      l = t = *size; r = b = -*size;
+      for (j = -*size, p = in_mask; j <= *size; j++)
+        for (i = -*size; i <= *size; i++, p++)
+	  /* Clip mask to image boundary & non-logo part of image */
+	  if (x+i < 0 || x+i >= mask_size->width || y+j < 0 || y+j >= mask_size->height ||
+	      test_filter(mask_size, x+i, y+j))
+	  {
+	    *p = 0;
+	  }
+	  else if (*p)
+	  {
+	    /* Pixel is in mask, adjust bounding box and count */
+	    mask->count++;
+	    if(t > j) t = j;
+	    if(b < j) b = j;
+	    if(l > i) l = i;
+	    if(r < i) r = i;
+	  }
+
+      if (!mask->count) {
+	mp_msg(MSGT_VFILTER, MSGL_ERR, "Pixel (%d,%d) has no data in mask (size %d)\n", x, y, *size);
+	free(mask);
+	free(in_mask);
+	*mp = NULL;
+	continue;
       }
 
-      mask_read_position++;
-      logo_mask_read_position++;
+      mask->x = x + l; mask->y = y + t;
+      mask->width = r - l + 1; mask->height = b - t + 1;
+      if (mask->width==1 && mask->height==1)
+      {
+	/* Mask is just one pixel, re-use the same data pointer as all other one pixel masks. */
+	mask->data = &onepixelmask;
+      }
+      else
+      {
+	int pad = mask->width;
+#ifdef HAVE_MMX2
+	if (gCpuCaps.hasMMX2 && mask->width > 4)
+	  mask->width = (mask->width + 7) & ~7; /* Round up to a multiple of 8 */
+#endif
+	pad = mask->width - pad; 		/* How many bytes were added to to round up */
+
+	mask->data = safe_malloc(mask->width * mask->height);
+	for (j = t, p = mask->data; j <= b; j++)
+	{
+	  for (i = l; i <= r; i++, p++)
+	    *p = in_mask[(j + *size) * (*size*2+1) + (i + *size)];
+	  for (i = 0; i < pad; i++, p++)
+	    *p = 0;
+	}
+      }
+      free(in_mask);
+      mp_msg(MSGT_VFILTER, MSGL_DBG2, "mask (%d,%d) origin (%d,%d) size %d bounds (%dx%d) count %d\n",
+             x, y, mask->x, mask->y, *size, mask->width, mask->height, mask->count);
+      *mp = mask;
     }
-
-    mask_read_position += (image->stride[plane] - ((end_posx + 1) - start_posx));
-    logo_mask_read_position += (logo_mask->width - ((end_posx + 1) - start_posx));
   }
 
-  if (divisor == 0) /* This means that not a single pixel is outside of the logo, so we have no data. */
-  { /* We should put some eye catching value here, to indicate the flaw to the user. */
-    *value_out = 255;
+  return masks;
+}
+
+/**
+ * \brief Our blurring function.
+ *
+ * \param logo_mask Tells us which pixels to use.
+ * \param image Pointer to the start of the visible image data to be de-logoed.
+ * \param stride The stride of the input image.
+ *
+ * \return The properly blurred and delogoed pixel.
+ *
+ * This function is the core of the filter.  It takes a pixel that is inside the logo and
+ * blurs it.  It does so by finding the average of all the pixels within the mask.  There
+ * is a specific mask for each pixel, which is all pixels in a circular mask centered on
+ * the pixel to be blurred that are not part of the logo.
+ *
+ * We switch on the width of the mask so we can have width specific unrolled inner loops
+ * for masks that are 1, 2, 3, or 4 pixels wide.  Generally, about 52% of the masks will
+ * have a width of 4 pixels or less.  The 48% of the masks wider than 4 pixels take up
+ * about 85% of the total CPU time (with the custom loops for masks <= 4 pixels wide,
+ * which speed up those mask sizes considerably).  For the wider masks, there is an MMX2
+ * asm version that is about double the speed of the C version.  The MMX2 version requires
+ * that the masks be padded to a multiple of 8 pixels wide.  For masks <= 4 pixels wide,
+ * the MMX2 version is not faster, partially due to the need for the extra padding.
+ *
+ * Selection of the MMX/non-MMX code is done here.  If MMX and non-MMX versions of
+ * get_blur() were selected with a function pointer, it would avoid the repeated gCpuCaps
+ * checks but duplicate the rest of the get_blur() code in the MMX vs non-MMX version.  It
+ * also ends up being slightly slower, because the function pointer prevents gcc from
+ * inlining get_blur() (which is only called from one place).
+ */
+static unsigned int get_blur(const pixel_mask * const logo_mask,
+                             const unsigned char * image, int stride)
+{
+  int i, j;
+  const unsigned char * mask = logo_mask->data;
+  unsigned int accumulator = logo_mask->count/2; /* Adjust for average rounding error */
+
+  image += logo_mask->y * stride + logo_mask->x;
+  switch(logo_mask->width) {
+    case 1:
+      for (j = 0; j < logo_mask->height; j++, mask++, image += stride)
+	accumulator += *image & *mask;
+      break;
+    case 2:
+      for (j = 0; j < logo_mask->height; j++, mask +=2, image += stride)
+      {
+	accumulator += image[0] & mask[0];
+	accumulator += image[1] & mask[1];
+      }
+      break;
+    case 3:
+      for (j = 0; j < logo_mask->height; j++, mask += 3, image += stride)
+      {
+	accumulator += image[0] & mask[0];
+	accumulator += image[1] & mask[1];
+	accumulator += image[2] & mask[2];
+      }
+      break;
+    case 4:
+      for (j = 0; j < logo_mask->height; j++, mask += 4, image += stride)
+      {
+	accumulator += image[0] & mask[0];
+	accumulator += image[1] & mask[1];
+	accumulator += image[2] & mask[2];
+	accumulator += image[3] & mask[3];
+      }
+      break;
+    default:
+#ifdef HAVE_MMX2
+#if (__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 1)
+#define MMX_CLOBBER : "mm0", "mm1"
+#else
+#define MMX_CLOBBER
+#endif
+      if (gCpuCaps.hasMMX2)
+      {
+	uint32_t sum, i = 0, j = logo_mask->height;
+	asm("movd %0, %%mm1\n\t"
+	    ".p2align 4,,7\n"
+	    "1:\n\t"
+            "movq (%3,%1), %%mm0\n\t"
+	    "pandn (%4,%1), %%mm0\n\t"
+	    "psadbw (%4,%1), %%mm0\n\t"
+	    "add $8, %1\n\t"
+	    "paddd %%mm0, %%mm1\n\t"
+	    "cmp %10, %1\n\t"
+	    "jl 1b\n\t"
+	    "addl %11, %4\n\t"
+	    "addl %10, %3\n\t"
+	    "xorl %1, %1\n\t"
+	    "decl %2\n\t"
+	    "jnz 1b\n\t"
+	    "movd %%mm1, %0"
+	    : "=m" (accumulator), "=r" (i), "=g" (j), "=r" (mask), "=r" (image)
+	    : "m" (accumulator), "1" (i), "2" (j), "3" (mask), "4" (image),
+	      "g" (logo_mask->width), "g" (stride) MMX_CLOBBER);
+      }
+      else
+#endif /* HAVE_MMX2 */
+      {
+        stride -= logo_mask->width; /* go from _end_ of one line to _start_ of next line */
+	for (j = 0; j < logo_mask->height; j++, image += stride)
+	  for (i = 0; i < logo_mask->width; i++, mask++, image++)
+	    if (*mask) accumulator += *image;
+      }
   }
-  else /* Else we need to normalise the data using the divisor. */
-  {
-    *value_out = (accumulator + (divisor / 2)) / divisor; /* Divide, taking into account average rounding error. */
-  }
 
-  return;
+  return accumulator / logo_mask->count;
 }
 
 /**
@@ -538,8 +678,8 @@
   FILE * input;
   int pnm_number;
   pgm_structure * new_pgm = (pgm_structure *) safe_malloc (sizeof(pgm_structure));
-  char * write_position;
-  char * end_position;
+  unsigned char * write_position;
+  unsigned char * end_position;
   int image_size; /* width * height */
 
   if((input = fopen(file_name, "rb")) == NULL) REMOVE_LOGO_LOAD_PGM_ERROR_MESSAGE("[vf]remove-logo: Unable to open file. File not found or insufficient permissions.\n");
@@ -573,16 +713,18 @@
     }
   }
 
+  fclose(input);
+
   return new_pgm;
 }
 
 /**
  * \brief Generates a scaled down image with half width, height, and intensity.
  *
- * \param vf Our struct for persistant data. In this case, it is used to update
- *           mask_max_size with the larger of the old or new value.
  * \param input_image The image from which the new half-sized one will be based.
  *
+ * \param max_mask_size In case this image needs larger masks, this must be updated.
+ *
  * \return The newly allocated and shrunken image.
  *
  * This function not only scales down an image, but halves the value in each pixel
@@ -593,95 +735,39 @@
  * rounding error will only cause a minor amount of excess blur in the chroma
  * planes.
  */
-static pgm_structure * generate_half_size_image(vf_instance_t * vf, pgm_structure * input_image)
+static pgm_structure * generate_half_size_image(const pgm_structure * const input_image,
+                                         int * const max_mask_size)
 {
-  int x, y;
-  pgm_structure * new_pgm = (pgm_structure *) safe_malloc (sizeof(pgm_structure));
-  int has_anything_changed = 1;
-  int current_pass;
-  int max_mask_size;
-  char * current_pixel;
+  int x, y, max;
+  pgm_structure * new_pgm = safe_malloc (sizeof(pgm_structure));
+  unsigned char * current_pixel;
 
   new_pgm->width = input_image->width / 2;
   new_pgm->height = input_image->height / 2;
   new_pgm->pixel = (unsigned char *) safe_malloc (sizeof(unsigned char) * new_pgm->width * new_pgm->height);
 
+  memset(new_pgm->pixel, 0, new_pgm->width * new_pgm->height);
   /* Copy over the image data, using the average of 4 pixels for to calculate each downsampled pixel. */
+  current_pixel = new_pgm->pixel;
   for (y = 0; y < new_pgm->height; y++)
     for (x = 0; x < new_pgm->width; x++)
     {
       /* Set the pixel if there exists a non-zero value in the source pixels, else clear it. */
-      new_pgm->pixel[(y * new_pgm->width) + x] = input_image->pixel[((y << 1) * input_image->width) + (x << 1)] ||
-                                                 input_image->pixel[((y << 1) * input_image->width) + (x << 1) + 1] ||
-                                                 input_image->pixel[(((y << 1) + 1) * input_image->width) + (x << 1)] ||
-                                                 input_image->pixel[(((y << 1) + 1) * input_image->width) + (x << 1) + 1];
-      new_pgm->pixel[(y * new_pgm->width) + x] = min(1, new_pgm->pixel[(y * new_pgm->width) + x]);
+      if (test_filter(input_image, (x<<1),   (y<<1)  ) ||
+          test_filter(input_image, (x<<1)+1, (y<<1)  ) ||
+          test_filter(input_image, (x<<1),   (y<<1)+1) ||
+          test_filter(input_image, (x<<1)+1, (y<<1)+1)) *current_pixel = 1;
+      current_pixel++;
     }
 
   /* Now we need to recalculate the numbers for the smaller size. Just using the old_value / 2 can cause subtle
      and fairly rare, but very nasty, bugs. */
+  max = convert_mask_to_strength_mask(new_pgm);
+  if(max > *max_mask_size) *max_mask_size = max;
 
-  current_pixel = new_pgm->pixel;
-  /* First pass, set all non-zero values to 1. */
-  for (x = 0; x < new_pgm->height * new_pgm->width; x++, current_pixel++)
-    if(*current_pixel) *current_pixel = 1;
-
-  /* Second pass and future passes. For each pass, if a pixel is itself the same value as the current pass,
-     and its four neighbors are too, then it is incremented. If no pixels are incremented by the end of the pass,
-     then we go again. Edge pixels are counted as always excluded (this should be true anyway for any sane mask,
-     but if it isn't this will ensure that we eventually exit). */
-  current_pass = 0;
-  while (has_anything_changed)
-  {
-    current_pass++;
-
-    has_anything_changed = 0; /* If this doesn't get set by the end of this pass, then we're done. */
-
-    for (y = 1; y < new_pgm->height - 1; y++)
-    {
-      for (x = 1; x < new_pgm->width - 1; x++)
-      {
-        if (new_pgm->pixel[(y * new_pgm->width) + x] >= current_pass && /* By using >= instead of ==, we allow the algorithm to work in place. */
-            new_pgm->pixel[(y * new_pgm->width) + (x + 1)] >= current_pass &&
-            new_pgm->pixel[(y * new_pgm->width) + (x - 1)] >= current_pass &&
-            new_pgm->pixel[((y + 1) * new_pgm->width) + x] >= current_pass &&
-            new_pgm->pixel[((y - 1) * new_pgm->width) + x] >= current_pass)
-         {
-           new_pgm->pixel[(y * new_pgm->width) + x]++; /* Increment the value since it still has not been eroded,
-                                                    as evidenced by the if statement that just evaluated to true. */
-           has_anything_changed = 1;
-         }
-      }
-    }
-  }
-
-  for (y = 1; y < new_pgm->height - 1; y++)
-  {
-   for (x = 1; x < new_pgm->width - 1; x++)
-    {
-      new_pgm->pixel[(y * new_pgm->width) + x] = apply_mask_fudge_factor(new_pgm->pixel[(y * new_pgm->width) + x]);
-    }
-  }
-
-  max_mask_size = current_pass + 1; /* As a side-effect, we now know the maximum mask size, which we'll use to generate our masks. */
-  max_mask_size = apply_mask_fudge_factor(max_mask_size);
-  /* Commit the newly calculated max_mask_size to the vf->priv struct. */
-  ((vf_priv_s *)vf->priv)->max_mask_size = max(max_mask_size, ((vf_priv_s *)vf->priv)->max_mask_size);
-
   return new_pgm;
 }
 
-/**
- * \brief Checks if YV12 is supported by the next filter.
- */
-static unsigned int find_best(struct vf_instance_s* vf){
-  int is_format_okay = vf->next->query_format(vf->next, IMGFMT_YV12);
-  if ((is_format_okay & VFCAP_CSP_SUPPORTED_BY_HW) || (is_format_okay & VFCAP_CSP_SUPPORTED))
-    return IMGFMT_YV12;
-  else
-    return 0;
-}
-
 //===========================================================================//
 
 /**
@@ -689,27 +775,30 @@
  */
 static int config(struct vf_instance_s* vf, int width, int height, int d_width, int d_height, unsigned int flags, unsigned int outfmt)
 {
-  if(!(((vf_priv_s *)vf->priv)->fmt=find_best(vf)))
+  const vf_priv_s * const state = (vf_priv_s *)vf->priv;
+
+  /* Check to make sure that the filter image and the video stream are the same size. */
+  if (state->width != width || state->height != height) {
+    mp_msg(MSGT_VFILTER, MSGL_ERR, "Filter image and video stream are not of the same size. (Filter: %d x %d, Stream: %d x %d)\n",
+	   state->width, state->height, width, height);
     return 0;
-  else
-    return vf_next_config(vf,width,height,d_width,d_height,flags,((vf_priv_s *)vf->priv)->fmt);
+  }
+
+  if (outfmt != IMGFMT_YV12)
+  {
+    mp_msg(MSGT_VFILTER, MSGL_ERR, "Filter only support YV12 format\n");
+    return 0;
+  }
+  return vf_next_config(vf,width,height,d_width,d_height,flags,IMGFMT_YV12);
 }
 
 /**
  * \brief Removes the logo from a plane (either luma or chroma).
  *
- * \param vf Not needed by this function, but needed by the blur function.
- * \param source The image to have it's logo removed.
- * \param destination Where the output image will be stored.
- * \param source_stride How far apart (in memory) two consecutive lines are.
- * \param destination Same as source_stride, but for the destination image.
- * \param width Width of the image. This is the same for source and destination.
- * \param height Height of the image. This is the same for source and destination.
- * \param is_image_direct If the image is direct, then source and destination are
- *        the same and we can save a lot of time by not copying pixels that
- *        haven't changed.
- * \param filter The image that stores the distance to the edge of the logo for
- *        each pixel.
+ * \param source_image The image to have its logo removed.
+ * \param plane The plane of the image to process.
+ * \param dest_image Where the output image will be stored.
+ * \param mask Stores the masks for each pixel in the logo, needed by the blur operation.
  * \param logo_start_x Smallest x-coordinate that contains at least 1 logo pixel.
  * \param logo_start_y Smallest y-coordinate that contains at least 1 logo pixel.
  * \param logo_end_x Largest x-coordinate that contains at least 1 logo pixel.
@@ -719,42 +808,79 @@
  * to the output without change, and pixels inside the logo have the de-blurring
  * function applied.
  */
-static void convert_yv12(const vf_instance_t * const vf, const char * const source, const int source_stride,
-                         const mp_image_t * const source_image, const int width, const int height,
-                         char * const destination, const int destination_stride, int is_image_direct, pgm_structure * filter,
-                         const int plane, const int logo_start_x, const int logo_start_y, const int logo_end_x, const int logo_end_y)
+static void convert_yv12(const mp_image_t * const source_image, const int plane,
+                         mp_image_t * const dest_image, pixel_mask * const * mask,
+                         const int logo_start_x, const int logo_start_y, const int logo_end_x, const int logo_end_y)
 {
-  int y;
-  int x;
+  unsigned char * dest = dest_image->planes[plane];
+  unsigned char * source = source_image->planes[plane];
+  int dstride = dest_image->stride[plane], sstride = source_image->stride[plane], mstride;
+  int width, height;
+  int x, y;
 
-  /* These pointers point to where we are getting our pixel data (inside mpi) and where we are storing it (inside dmpi). */
-  const unsigned char * source_line;
-  unsigned char * destination_line;
+  if (plane == 0)
+  {
+    width = source_image->width; height = source_image->height;
+    mstride = source_image->w;
+    x = dest_image->x; y = dest_image->y;  /* start of visible image?? */
+  }
+  else
+  {
+    width = source_image->chroma_width; height = source_image->chroma_height;
+    mstride = source_image->w/2;
+    x = dest_image->x/2; y = dest_image->y/2;  /* start of visible image?? */
+  }
 
-  if (!is_image_direct)
-    memcpy_pic(destination, source, width, height, destination_stride, source_stride);
+  if (!(source_image->flags & MP_IMGFLAG_DIRECT))
+    memcpy_pic(dest, source, width, height, dstride, sstride);
 
-  for (y = logo_start_y; y <= logo_end_y; y++)
-  {
-    source_line = (const unsigned char *) source + (source_stride * y);
-    destination_line = (unsigned char *) destination + (destination_stride * y);
+  /* width and height of the logo bounding box, which is the size of our loop */
+  width = logo_end_x - logo_start_x + 1;
+  height = logo_end_y - logo_start_y + 1;
 
-    for (x = logo_start_x; x <= logo_end_x; x++)
-    {
-      unsigned int output;
+  /* Adjust pointers to start of logo.
+     Keep in mind that dest has an additional offset to the start of the visible image. */
+  mask += logo_start_y * mstride + logo_start_x;
+  dest += (logo_start_y + y) * dstride + logo_start_x + x;
+  /* This is just adjusted to start of visible image, not the logo. */
+  source += y * source_image->stride[plane] + x;
 
-      if (filter->pixel[(y * filter->width) + x]) /* Only process if we are in the logo. */
-      {
-        get_blur(vf, &output, filter, source_image, x, y, plane);
-        destination_line[x] = output;
-      }
-      else /* Else just copy the data. */
-        if (!is_image_direct)
-          destination_line[x] = source_line[x];
-    }
-  }
+  /* Adjust strides to account for the pixels we already copied.  We want to go from the
+     _end_ of the logo on one line to the _start_ of the logo on the next. */
+  dstride -= width; mstride -= width;
+
+  for (y = height; y > 0; y--, dest += dstride, mask += mstride)
+    for (x = width; x > 0; x--, dest++, mask++)
+      if (*mask)
+	*dest = get_blur(*mask, source, sstride);
+
+#ifdef HAVE_MMX2
+  if (gCpuCaps.hasMMX2)
+    asm volatile("emms");
+#endif
 }
 
+#if 0
+/* This doesn't work */
+static void get_image(struct vf_instance_s* vf, mp_image_t *mpi)
+{
+  const vf_priv_s * const state = (vf_priv_s *)vf->priv;
+  if (mpi->flags&MP_IMGFLAG_PRESERVE) return; // What is this for?
+  if (mpi->imgfmt != IMGFMT_YV12) return;
+  if (mpi->width != state->width || mpi->height != state->height) return;
+
+  vf->dmpi = vf_get_image(vf->next, mpi->imgfmt, mpi->type, mpi->flags, mpi->w, mpi->h);
+  mpi->planes[0]=vf->dmpi->planes[0];
+  mpi->planes[1]=vf->dmpi->planes[1];
+  mpi->planes[2]=vf->dmpi->planes[2];
+  mpi->stride[0]=vf->dmpi->stride[0];
+  mpi->stride[1]=vf->dmpi->stride[1];
+  mpi->stride[2]=vf->dmpi->stride[2];
+  mpi->width = vf->dmpi->width;
+  mpi->flags|=MP_IMGFLAG_DIRECT;
+}
+#endif
+
 /**
  * \brief Process a frame.
  *
@@ -768,46 +894,26 @@
  * filter, has the logo removed by the filter, and is then sent to the next
  * filter.
  */
-static int put_image(struct vf_instance_s* vf, mp_image_t *mpi, double pts){
-    mp_image_t *dmpi;
-    
-    dmpi=vf_get_image(vf->next,((vf_priv_s *)vf->priv)->fmt,
-	MP_IMGTYPE_TEMP, MP_IMGFLAG_ACCEPT_STRIDE,
-	mpi->w, mpi->h);
+static int put_image(struct vf_instance_s* vf, mp_image_t *mpi, double pts)
+{
+  const vf_priv_s * state = (vf_priv_s *)vf->priv;
+  mp_image_t * dmpi;
 
-    /* Check to make sure that the filter image and the video stream are the same size. */
-    if ((((vf_priv_s *)vf->priv)->filter->width != mpi->w) || (((vf_priv_s *)vf->priv)->filter->height != mpi->h))
-    {
-      mp_msg(MSGT_VFILTER,MSGL_ERR, "Filter image and video stream are not of the same size. (Filter: %d x %d, Stream: %d x %d)\n",
-             ((vf_priv_s *)vf->priv)->filter->width, ((vf_priv_s *)vf->priv)->filter->height, mpi->w, mpi->h);
-      return 0;
-    }
+  dmpi = vf_get_image(vf->next, mpi->imgfmt, MP_IMGTYPE_TEMP, MP_IMGFLAG_ACCEPT_STRIDE, mpi->width, mpi->height);
 
-    switch(dmpi->imgfmt){
-    case IMGFMT_YV12:
-          convert_yv12(vf, mpi->planes[0],  mpi->stride[0], mpi, mpi->w, mpi->h,
-                          dmpi->planes[0], dmpi->stride[0],
-                          mpi->flags & MP_IMGFLAG_DIRECT, ((vf_priv_s *)vf->priv)->filter, 0,
-                          ((vf_priv_s *)vf->priv)->bounding_rectangle_posx1, ((vf_priv_s *)vf->priv)->bounding_rectangle_posy1,
-                          ((vf_priv_s *)vf->priv)->bounding_rectangle_posx2, ((vf_priv_s *)vf->priv)->bounding_rectangle_posy2);
-          convert_yv12(vf, mpi->planes[1],  mpi->stride[1], mpi, mpi->w / 2, mpi->h / 2,
-                          dmpi->planes[1], dmpi->stride[1],
-                          mpi->flags & MP_IMGFLAG_DIRECT, ((vf_priv_s *)vf->priv)->half_size_filter, 1,
-                          ((vf_priv_s *)vf->priv)->bounding_rectangle_half_size_posx1, ((vf_priv_s *)vf->priv)->bounding_rectangle_half_size_posy1,
-                          ((vf_priv_s *)vf->priv)->bounding_rectangle_half_size_posx2, ((vf_priv_s *)vf->priv)->bounding_rectangle_half_size_posy2);
-          convert_yv12(vf, mpi->planes[2],  mpi->stride[2], mpi, mpi->w / 2, mpi->h / 2,
-                          dmpi->planes[2], dmpi->stride[2],
-                          mpi->flags & MP_IMGFLAG_DIRECT, ((vf_priv_s *)vf->priv)->half_size_filter, 2,
-                          ((vf_priv_s *)vf->priv)->bounding_rectangle_half_size_posx1, ((vf_priv_s *)vf->priv)->bounding_rectangle_half_size_posy1,
-                          ((vf_priv_s *)vf->priv)->bounding_rectangle_half_size_posx2, ((vf_priv_s *)vf->priv)->bounding_rectangle_half_size_posy2);
-          break;
+  convert_yv12(mpi, 0, dmpi, state->masks,
+	       state->bounding_rectangle_posx1, state->bounding_rectangle_posy1,
+	       state->bounding_rectangle_posx2, state->bounding_rectangle_posy2);
+  convert_yv12(mpi, 1, dmpi, state->chroma_masks,
+	       state->bounding_rectangle_half_size_posx1, state->bounding_rectangle_half_size_posy1,
+	       state->bounding_rectangle_half_size_posx2, state->bounding_rectangle_half_size_posy2);
+  convert_yv12(mpi, 2, dmpi, state->chroma_masks,
+	       state->bounding_rectangle_half_size_posx1, state->bounding_rectangle_half_size_posy1,
+	       state->bounding_rectangle_half_size_posx2, state->bounding_rectangle_half_size_posy2);
 
-    default:
-	mp_msg(MSGT_VFILTER,MSGL_ERR,"Unhandled format: 0x%X\n",dmpi->imgfmt);
-	return 0;
-    }
+  vf_clone_mpi_attributes(dmpi, mpi);
 
-    return vf_next_put_image(vf,dmpi, pts);
+  return vf_next_put_image(vf, dmpi, pts);
 }
 
 //===========================================================================//
@@ -823,7 +929,48 @@
     return 0;
 }
 
+/* Useful for debugging */
+/*
+static void write_pgm(const char *fn, pgm_structure *pgm)
+{
+  FILE *f = fopen(fn, "wb");
+  fprintf(f, "P5\n%d %d 255\n", pgm->width, pgm->height);
+  fwrite(pgm->pixel, pgm->width, pgm->height, f);
+  fclose(f);
+}
+*/
+
 /**
+ * \brief Frees memory that our filter allocated.
+ *
+ * This is called at exit-time.
+ */
+static void uninit(vf_instance_t * vf)
+{
+  vf_priv_s * state = (vf_priv_s *)vf->priv;
+  int i;
+
+  /* Destroy our masks. */
+  for (i = 0; i < state->width * state->height; i++)
+    if (state->masks[i])
+    {
+      if(state->masks[i]->data != &onepixelmask) free(state->masks[i]->data);
+      free(state->masks[i]);
+    }
+  for (i = 0; i < (state->width>>1) * (state->height>>1); i++)
+    if (state->chroma_masks[i])
+    {
+      if(state->chroma_masks[i]->data != &onepixelmask) free(state->chroma_masks[i]->data);
+      free(state->chroma_masks[i]);
+    }
+
+  /* Destroy our private structure that had been used to store those masks and images. */
+  free(state);
+
+  return;
+}
+
+/**
  * \brief Initializes our filter.
  *
  * \param args The arguments passed in from the command line go here. This
@@ -834,73 +981,73 @@
  */
 static int open(vf_instance_t * vf, char * args)
 {
-  vf->priv = safe_malloc(sizeof(vf_priv_s));
+  pgm_structure * filter;
+  pgm_structure * half_filter;
+  unsigned char * * masks;
+  int max_mask_size;
+  vf_priv_s * state = safe_malloc(sizeof(vf_priv_s));
+  vf->priv = (void*)state;
 
   /* Load our filter image. */
   if (args)
-    ((vf_priv_s *)vf->priv)->filter = load_pgm(args);
+    filter = load_pgm(args);
   else
   {
     mp_msg(MSGT_VFILTER, MSGL_ERR, "[vf]remove_logo usage: remove_logo=/path/to/filter_image_file.pgm\n");
-    free(vf->priv);
+    free(state);
     return 0;
   }
 
-  if (((vf_priv_s *)vf->priv)->filter == NULL)
+  if (filter == NULL)
   {
     /* Error message was displayed by load_pgm(). */
-    free(vf->priv);
+    free(state);
     return 0;
   }
 
+  state->width = filter->width; state->height = filter->height;
+
+  max_mask_size = convert_mask_to_strength_mask(filter);
+
   /* Create the scaled down filter image for the chroma planes. */
-  convert_mask_to_strength_mask(vf, ((vf_priv_s *)vf->priv)->filter);
-  ((vf_priv_s *)vf->priv)->half_size_filter = generate_half_size_image(vf, ((vf_priv_s *)vf->priv)->filter);
+  half_filter = generate_half_size_image(filter, &max_mask_size);
 
   /* Now that we know how many masks we need (the info is in vf), we can generate the masks. */
-  initialize_masks(vf);
+  masks = initialize_masks(max_mask_size);
 
+  /* Create masks for each pixel in the logo. */
+  state->masks = calculate_masks(filter, masks);
+  state->chroma_masks = calculate_masks(half_filter, masks);
+
+  destroy_masks(masks, max_mask_size);
+
   /* Calculate our bounding rectangles, which determine in what region the logo resides for faster processing. */
-  calculate_bounding_rectangle(&((vf_priv_s *)vf->priv)->bounding_rectangle_posx1, &((vf_priv_s *)vf->priv)->bounding_rectangle_posy1,
-                               &((vf_priv_s *)vf->priv)->bounding_rectangle_posx2, &((vf_priv_s *)vf->priv)->bounding_rectangle_posy2,
-                                ((vf_priv_s *)vf->priv)->filter);
-  calculate_bounding_rectangle(&((vf_priv_s *)vf->priv)->bounding_rectangle_half_size_posx1,
-                               &((vf_priv_s *)vf->priv)->bounding_rectangle_half_size_posy1,
-                               &((vf_priv_s *)vf->priv)->bounding_rectangle_half_size_posx2,
-                               &((vf_priv_s *)vf->priv)->bounding_rectangle_half_size_posy2,
-                                ((vf_priv_s *)vf->priv)->half_size_filter);
+  calculate_bounding_rectangle(&state->bounding_rectangle_posx1, &state->bounding_rectangle_posy1,
+                               &state->bounding_rectangle_posx2, &state->bounding_rectangle_posy2,
+                               filter);
+  calculate_bounding_rectangle(&state->bounding_rectangle_half_size_posx1,
+                               &state->bounding_rectangle_half_size_posy1,
+                               &state->bounding_rectangle_half_size_posx2,
+                               &state->bounding_rectangle_half_size_posy2,
+                               half_filter);
 
+  destroy_pgm(filter);
+  destroy_pgm(half_filter);
+
   vf->config=config;
   vf->put_image=put_image;
   vf->query_format=query_format;
+  vf->uninit=uninit;
   return 1;
 }
 
 /**
- * \brief Frees memory that our filter allocated.
- *
- * This is called at exit-time.
- */
-void uninit(vf_instance_t * vf)
-{
-  /* Destroy our masks and images. */
-  destroy_pgm(((vf_priv_s *)vf->priv)->filter);
-  destroy_pgm(((vf_priv_s *)vf->priv)->half_size_filter);
-  destroy_masks(vf);
-
-  /* Destroy our private structure that had been used to store those masks and images. */
-  free(vf->priv);
-
-  return;
-}
-
-/**
  * \brief Meta data about our filter.
  */
 vf_info_t vf_info_remove_logo = {
     "Removes a tv logo based on a mask image.",
     "remove-logo",
-    "Robert Edele",
+    "Robert Edele & Trent Piepho",
     "",
     open,
     NULL