[FFmpeg-devel] [PATCH] avfilter/dctdnoiz: rewrite [f/i]dct

Mon Aug 4 01:14:37 CEST 2014

On Mon, Aug 04, 2014 at 12:44:48AM +0200, Michael Niedermayer wrote:
> On Sun, Aug 03, 2014 at 10:27:21PM +0200, Clément Bœsch wrote:
> > This removes the avcodec dependency and make the code almost twice as
> > fast. More to come.
> > 
> > The DCT factorization is based on "Fast and numerically stable
> > algorithms for discrete cosine transforms" from Gerlind Plonkaa &
> > Manfred Tasche (DOI: 10.1016/j.laa.2004.07.015).
> > ---
> >  configure                 |   2 -
> >  libavfilter/vf_dctdnoiz.c | 328 +++++++++++++++++++++++++++++++++-------------
> >  2 files changed, 240 insertions(+), 90 deletions(-)
> > 
> > diff --git a/configure b/configure
> > index 9c3af50..6196b2a 100755
> > --- a/configure
> > +++ b/configure
> > @@ -2526,8 +2526,6 @@ boxblur_filter_deps="gpl"
> >  bs2b_filter_deps="libbs2b"
> >  colormatrix_filter_deps="gpl"
> >  cropdetect_filter_deps="gpl"
> > -dctdnoiz_filter_deps="avcodec"
> > -dctdnoiz_filter_select="dct"
> >  delogo_filter_deps="gpl"
> >  deshake_filter_deps="avcodec"
> >  deshake_filter_select="me_cmp"
> > diff --git a/libavfilter/vf_dctdnoiz.c b/libavfilter/vf_dctdnoiz.c
> > index 71b8536..6d24934 100644
> > --- a/libavfilter/vf_dctdnoiz.c
> > +++ b/libavfilter/vf_dctdnoiz.c
> > @@ -1,5 +1,5 @@
> >  /*
> > - * Copyright (c) 2013 Clément Bœsch
> > + * Copyright (c) 2013-2014 Clément Bœsch
> >   *
> >   * This file is part of FFmpeg.
> >   *
> > @@ -23,7 +23,6 @@
> >   * @see http://www.ipol.im/pub/art/2011/ys-dct/
> >   */
> >  
> > -#include "libavcodec/avfft.h"
> >  #include "libavutil/eval.h"
> >  #include "libavutil/opt.h"
> >  #include "drawutils.h"
> > @@ -35,7 +34,7 @@
> >  static const char *const var_names[] = { "c", NULL };
> >  enum { VAR_C, VAR_VARS_NB };
> >  
> > -typedef struct {
> > +typedef struct DCTdnoizContext {
> >      const AVClass *class;
> >  
> >      /* coefficient factor expression */
> > @@ -52,8 +51,9 @@ typedef struct {
> >      int p_linesize;             // line sizes for color and weights
> >      int overlap;                // number of block overlapping pixels
> >      int step;                   // block step increment (BSIZE - overlap)
> > -    DCTContext *dct, *idct;     // DCT and inverse DCT contexts
> > -    float *block, *tmp_block;   // two BSIZE x BSIZE block buffers
> > +    void (*filter_freq_func)(struct DCTdnoizContext *s,
> > +                             const float *src, int src_linesize,
> > +                             float *dst, int dst_linesize);
> >  } DCTdnoizContext;
> >  
> >  #define OFFSET(x) offsetof(DCTdnoizContext, x)
> > @@ -69,66 +69,245 @@ static const AVOption dctdnoiz_options[] = {
> >  
> >  AVFILTER_DEFINE_CLASS(dctdnoiz);
> >  
> > -static float *dct_block(DCTdnoizContext *ctx, const float *src, int src_linesize)
> > +static void av_always_inline fdct16_1d(float *dst, const float *src,
> > +                                       int dst_stridea, int dst_strideb,
> > +                                       int src_stridea, int src_strideb)
> >  {
> > -    int x, y;
> > -    float *column;
> > -
> > -    for (y = 0; y < BSIZE; y++) {
> > -        float *line = ctx->block;
> > +    int i;
> >  
> > -        memcpy(line, src, BSIZE * sizeof(*line));
> > -        src += src_linesize;
> > -        av_dct_calc(ctx->dct, line);
> > -
> > -        column = ctx->tmp_block + y;
> > -        column[0] = line[0] * (1. / sqrt(BSIZE));
> > -        column += BSIZE;
> > -        for (x = 1; x < BSIZE; x++) {
> > -            *column = line[x] * sqrt(2. / BSIZE);
> > -            column += BSIZE;
> > -        }
> > +    for (i = 0; i < BSIZE; i++) {
> > +        const float x0_0 = src[ 0*src_stridea] + src[15*src_stridea];
> > +        const float x0_1 = src[ 1*src_stridea] + src[14*src_stridea];
> > +        const float x0_2 = src[ 2*src_stridea] + src[13*src_stridea];
> > +        const float x0_3 = src[ 3*src_stridea] + src[12*src_stridea];
> > +        const float x0_4 = src[ 4*src_stridea] + src[11*src_stridea];
> > +        const float x0_5 = src[ 5*src_stridea] + src[10*src_stridea];
> > +        const float x0_6 = src[ 6*src_stridea] + src[ 9*src_stridea];
> > +        const float x0_7 = src[ 7*src_stridea] + src[ 8*src_stridea];
> > +        const float x0_8 = src[ 0*src_stridea] - src[15*src_stridea];
> > +        const float x0_9 = src[ 1*src_stridea] - src[14*src_stridea];
> > +        const float x0_a = src[ 2*src_stridea] - src[13*src_stridea];
> > +        const float x0_b = src[ 3*src_stridea] - src[12*src_stridea];
> > +        const float x0_c = src[ 4*src_stridea] - src[11*src_stridea];
> > +        const float x0_d = src[ 5*src_stridea] - src[10*src_stridea];
> > +        const float x0_e = src[ 6*src_stridea] - src[ 9*src_stridea];
> > +        const float x0_f = src[ 7*src_stridea] - src[ 8*src_stridea];
> > +        const float x2_0 = x0_0 + x0_7;
> > +        const float x2_1 = x0_1 + x0_6;
> > +        const float x2_2 = x0_2 + x0_5;
> > +        const float x2_3 = x0_3 + x0_4;
> > +        const float x2_4 = x0_0 - x0_7;
> > +        const float x2_5 = x0_1 - x0_6;
> > +        const float x2_6 = x0_2 - x0_5;
> > +        const float x2_7 = x0_3 - x0_4;
> > +        const float x4_0 = x2_0 + x2_3;
> > +        const float x4_1 = x2_1 + x2_2;
> > +        const float x4_2 = x2_0 - x2_3;
> > +        const float x4_3 = x2_1 - x2_2;
> > +        const float x5_0 = x4_0 + x4_1;
> > +        const float x5_1 = x4_0 - x4_1;
> > +        const float x5_2 =  1.306562964876380*x4_2 + 0.541196100146197*x4_3;
> > +        const float x5_3 =  0.541196100146197*x4_2 - 1.306562964876380*x4_3;
> > +        const float x6_0 =  1.387039845322150*x2_4 + 0.275899379282943*x2_7;
> > +        const float x6_1 =  1.175875602419360*x2_5 + 0.785694958387102*x2_6;
> > +        const float x6_2 = -0.785694958387102*x2_5 + 1.175875602419360*x2_6;
> > +        const float x6_3 =  0.275899379282943*x2_4 - 1.387039845322150*x2_7;
> > +        const float x7_0 = x6_0 + x6_1;
> > +        const float x7_1 = x6_0 - x6_1;
> > +        const float x7_2 = x6_2 + x6_3;
> > +        const float x7_3 = x6_2 - x6_3;
> > +        const float x3_5 =  0.707106781186547*x7_1 - 0.707106781186547*x7_3;
> > +        const float x3_6 =  0.707106781186547*x7_1 + 0.707106781186547*x7_3;
> > +        const float x8_0 =  1.407403737526380*x0_8 + 0.138617169199091*x0_f;
> > +        const float x8_1 =  1.353318001174350*x0_9 + 0.410524527522357*x0_e;
> > +        const float x8_2 =  1.247225012986670*x0_a + 0.666655658477747*x0_d;
> > +        const float x8_3 =  1.093201867001760*x0_b + 0.897167586342636*x0_c;
> > +        const float x8_4 = -0.897167586342636*x0_b + 1.093201867001760*x0_c;
> > +        const float x8_5 =  0.666655658477747*x0_a - 1.247225012986670*x0_d;
> > +        const float x8_6 = -0.410524527522357*x0_9 + 1.353318001174350*x0_e;
> > +        const float x8_7 =  0.138617169199091*x0_8 - 1.407403737526380*x0_f;
> > +        const float xa_0 = x8_0 + x8_3;
> > +        const float xa_1 = x8_1 + x8_2;
> > +        const float xa_2 = x8_0 - x8_3;
> > +        const float xa_3 = x8_1 - x8_2;
> > +        const float xb_0 = xa_0 + xa_1;
> > +        const float xb_1 = xa_0 - xa_1;
> > +        const float xb_2 = 1.306562964876380*xa_2 + 0.541196100146197*xa_3;
> > +        const float xb_3 = 0.541196100146197*xa_2 - 1.306562964876380*xa_3;
> > +        const float xc_0 = x8_4 + x8_7;
> > +        const float xc_1 = x8_5 + x8_6;
> > +        const float xc_2 = x8_4 - x8_7;
> > +        const float xc_3 = x8_5 - x8_6;
> > +        const float xd_0 = xc_0 + xc_1;
> > +        const float xd_1 = xc_0 - xc_1;
> > +        const float xd_2 = 1.306562964876380*xc_2 + 0.541196100146197*xc_3;
> > +        const float xd_3 = 0.541196100146197*xc_2 - 1.306562964876380*xc_3;
> > +        const float x1_9 = 0.707106781186547*xb_2 - 0.707106781186547*xd_3;
> > +        const float x1_a = 0.707106781186547*xb_2 + 0.707106781186547*xd_3;
> > +        const float x1_b = 0.707106781186547*xb_1 + 0.707106781186547*xd_1;
> > +        const float x1_c = 0.707106781186547*xb_1 - 0.707106781186547*xd_1;
> > +        const float x1_d = 0.707106781186547*xb_3 - 0.707106781186547*xd_2;
> > +        const float x1_e = 0.707106781186547*xb_3 + 0.707106781186547*xd_2;
> > +        dst[ 0*dst_stridea] = 0.25*x5_0;
> > +        dst[ 1*dst_stridea] = 0.25*xb_0;
> > +        dst[ 2*dst_stridea] = 0.25*x7_0;
> > +        dst[ 3*dst_stridea] = 0.25*x1_9;
> > +        dst[ 4*dst_stridea] = 0.25*x5_2;
> > +        dst[ 5*dst_stridea] = 0.25*x1_a;
> > +        dst[ 6*dst_stridea] = 0.25*x3_5;
> > +        dst[ 7*dst_stridea] = 0.25*x1_b;
> > +        dst[ 8*dst_stridea] = 0.25*x5_1;
> > +        dst[ 9*dst_stridea] = 0.25*x1_c;
> > +        dst[10*dst_stridea] = 0.25*x3_6;
> > +        dst[11*dst_stridea] = 0.25*x1_d;
> > +        dst[12*dst_stridea] = 0.25*x5_3;
> > +        dst[13*dst_stridea] = 0.25*x1_e;
> > +        dst[14*dst_stridea] = 0.25*x7_2;
> > +        dst[15*dst_stridea] = 0.25*xd_0;
> 
> many of these multiplies look like they can be merged into other
> multiplies
> 
> for example see:
> 
> 
> const float xd_2 = 1.306562964876380*xc_2 + 0.541196100146197*xc_3;
> const float xb_3 = 0.541196100146197*xa_2 - 1.306562964876380*xa_3;
> const float x1_d = 0.707106781186547*xb_3 - 0.707106781186547*xd_2;
> const float x1_e = 0.707106781186547*xb_3 + 0.707106781186547*xd_2;
> dst[11*dst_stridea] = 0.25*x1_d;
> dst[13*dst_stridea] = 0.25*x1_e;
> 
> vs.
> 
> const float xd_2 = (0.25*0.707106781186547*1.306562964876380)*xc_2 + (0.25*0.707106781186547*0.541196100146197)*xc_3;
> const float xb_3 = (0.25*0.707106781186547*0.541196100146197)*xa_2 - (0.25*0.707106781186547*1.306562964876380)*xa_3;
> dst[11*dst_stridea] = xb_3 - xd_2;
> dst[13*dst_stridea] = xb_3 + xd_2;

also more generally
if you have 2 stages of butterflies each with 4 multiplies and 2 adds
in each butterfly

a----\-/--\---/----------a'
      X    \ /
b----/-\----------\---/--b'
           / \     \ /
c----\-/--/---\----------c'
      X            / \
d----/-\----------/---\--d'


of additions
the first stage can scale their output arbitrarily for free by
changing the respective coefficients
the second stage can use any scaled input for free by adjusting their
coefficients similarly, this gives you 4 free parameters in the
example above which can be
choosen so as to make some coefficients trivial like +-1.0
this also works accorss 2D (I)DCTs or with other things before or
after the (i)dct which can absorb such rescaling


[...]

-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Rewriting code that is poorly written but fully understood is good.
Rewriting code that one doesnt understand is a sign that one is less smart
then the original author, trying to rewrite it will not make it better.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 181 bytes
Desc: Digital signature
URL: <https://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20140804/465671fe/attachment.asc>