[FFmpeg-devel] [PATCH] avfilter/dctdnoiz: rewrite [f/i]dct

Mon Aug 4 17:08:21 CEST 2014

On Mon, Aug 04, 2014 at 01:14:37AM +0200, Michael Niedermayer wrote:
> On Mon, Aug 04, 2014 at 12:44:48AM +0200, Michael Niedermayer wrote:
> > On Sun, Aug 03, 2014 at 10:27:21PM +0200, Clément Bœsch wrote:
> > > This removes the avcodec dependency and make the code almost twice as
> > > fast. More to come.
> > > 
> > > The DCT factorization is based on "Fast and numerically stable
> > > algorithms for discrete cosine transforms" from Gerlind Plonkaa &
> > > Manfred Tasche (DOI: 10.1016/j.laa.2004.07.015).
> > > ---
> > >  configure                 |   2 -
> > >  libavfilter/vf_dctdnoiz.c | 328 +++++++++++++++++++++++++++++++++-------------
> > >  2 files changed, 240 insertions(+), 90 deletions(-)
> > > 
> > > diff --git a/configure b/configure
> > > index 9c3af50..6196b2a 100755
> > > --- a/configure
> > > +++ b/configure
> > > @@ -2526,8 +2526,6 @@ boxblur_filter_deps="gpl"
> > >  bs2b_filter_deps="libbs2b"
> > >  colormatrix_filter_deps="gpl"
> > >  cropdetect_filter_deps="gpl"
> > > -dctdnoiz_filter_deps="avcodec"
> > > -dctdnoiz_filter_select="dct"
> > >  delogo_filter_deps="gpl"
> > >  deshake_filter_deps="avcodec"
> > >  deshake_filter_select="me_cmp"
> > > diff --git a/libavfilter/vf_dctdnoiz.c b/libavfilter/vf_dctdnoiz.c
> > > index 71b8536..6d24934 100644
> > > --- a/libavfilter/vf_dctdnoiz.c
> > > +++ b/libavfilter/vf_dctdnoiz.c
> > > @@ -1,5 +1,5 @@
> > >  /*
> > > - * Copyright (c) 2013 Clément Bœsch
> > > + * Copyright (c) 2013-2014 Clément Bœsch
> > >   *
> > >   * This file is part of FFmpeg.
> > >   *
> > > @@ -23,7 +23,6 @@
> > >   * @see http://www.ipol.im/pub/art/2011/ys-dct/
> > >   */
> > >  
> > > -#include "libavcodec/avfft.h"
> > >  #include "libavutil/eval.h"
> > >  #include "libavutil/opt.h"
> > >  #include "drawutils.h"
> > > @@ -35,7 +34,7 @@
> > >  static const char *const var_names[] = { "c", NULL };
> > >  enum { VAR_C, VAR_VARS_NB };
> > >  
> > > -typedef struct {
> > > +typedef struct DCTdnoizContext {
> > >      const AVClass *class;
> > >  
> > >      /* coefficient factor expression */
> > > @@ -52,8 +51,9 @@ typedef struct {
> > >      int p_linesize;             // line sizes for color and weights
> > >      int overlap;                // number of block overlapping pixels
> > >      int step;                   // block step increment (BSIZE - overlap)
> > > -    DCTContext *dct, *idct;     // DCT and inverse DCT contexts
> > > -    float *block, *tmp_block;   // two BSIZE x BSIZE block buffers
> > > +    void (*filter_freq_func)(struct DCTdnoizContext *s,
> > > +                             const float *src, int src_linesize,
> > > +                             float *dst, int dst_linesize);
> > >  } DCTdnoizContext;
> > >  
> > >  #define OFFSET(x) offsetof(DCTdnoizContext, x)
> > > @@ -69,66 +69,245 @@ static const AVOption dctdnoiz_options[] = {
> > >  
> > >  AVFILTER_DEFINE_CLASS(dctdnoiz);
> > >  
> > > -static float *dct_block(DCTdnoizContext *ctx, const float *src, int src_linesize)
> > > +static void av_always_inline fdct16_1d(float *dst, const float *src,
> > > +                                       int dst_stridea, int dst_strideb,
> > > +                                       int src_stridea, int src_strideb)
> > >  {
> > > -    int x, y;
> > > -    float *column;
> > > -
> > > -    for (y = 0; y < BSIZE; y++) {
> > > -        float *line = ctx->block;
> > > +    int i;
> > >  
> > > -        memcpy(line, src, BSIZE * sizeof(*line));
> > > -        src += src_linesize;
> > > -        av_dct_calc(ctx->dct, line);
> > > -
> > > -        column = ctx->tmp_block + y;
> > > -        column[0] = line[0] * (1. / sqrt(BSIZE));
> > > -        column += BSIZE;
> > > -        for (x = 1; x < BSIZE; x++) {
> > > -            *column = line[x] * sqrt(2. / BSIZE);
> > > -            column += BSIZE;
> > > -        }
> > > +    for (i = 0; i < BSIZE; i++) {
> > > +        const float x0_0 = src[ 0*src_stridea] + src[15*src_stridea];
> > > +        const float x0_1 = src[ 1*src_stridea] + src[14*src_stridea];
> > > +        const float x0_2 = src[ 2*src_stridea] + src[13*src_stridea];
> > > +        const float x0_3 = src[ 3*src_stridea] + src[12*src_stridea];
> > > +        const float x0_4 = src[ 4*src_stridea] + src[11*src_stridea];
> > > +        const float x0_5 = src[ 5*src_stridea] + src[10*src_stridea];
> > > +        const float x0_6 = src[ 6*src_stridea] + src[ 9*src_stridea];
> > > +        const float x0_7 = src[ 7*src_stridea] + src[ 8*src_stridea];
> > > +        const float x0_8 = src[ 0*src_stridea] - src[15*src_stridea];
> > > +        const float x0_9 = src[ 1*src_stridea] - src[14*src_stridea];
> > > +        const float x0_a = src[ 2*src_stridea] - src[13*src_stridea];
> > > +        const float x0_b = src[ 3*src_stridea] - src[12*src_stridea];
> > > +        const float x0_c = src[ 4*src_stridea] - src[11*src_stridea];
> > > +        const float x0_d = src[ 5*src_stridea] - src[10*src_stridea];
> > > +        const float x0_e = src[ 6*src_stridea] - src[ 9*src_stridea];
> > > +        const float x0_f = src[ 7*src_stridea] - src[ 8*src_stridea];
> > > +        const float x2_0 = x0_0 + x0_7;
> > > +        const float x2_1 = x0_1 + x0_6;
> > > +        const float x2_2 = x0_2 + x0_5;
> > > +        const float x2_3 = x0_3 + x0_4;
> > > +        const float x2_4 = x0_0 - x0_7;
> > > +        const float x2_5 = x0_1 - x0_6;
> > > +        const float x2_6 = x0_2 - x0_5;
> > > +        const float x2_7 = x0_3 - x0_4;
> > > +        const float x4_0 = x2_0 + x2_3;
> > > +        const float x4_1 = x2_1 + x2_2;
> > > +        const float x4_2 = x2_0 - x2_3;
> > > +        const float x4_3 = x2_1 - x2_2;
> > > +        const float x5_0 = x4_0 + x4_1;
> > > +        const float x5_1 = x4_0 - x4_1;
> > > +        const float x5_2 =  1.306562964876380*x4_2 + 0.541196100146197*x4_3;
> > > +        const float x5_3 =  0.541196100146197*x4_2 - 1.306562964876380*x4_3;
> > > +        const float x6_0 =  1.387039845322150*x2_4 + 0.275899379282943*x2_7;
> > > +        const float x6_1 =  1.175875602419360*x2_5 + 0.785694958387102*x2_6;
> > > +        const float x6_2 = -0.785694958387102*x2_5 + 1.175875602419360*x2_6;
> > > +        const float x6_3 =  0.275899379282943*x2_4 - 1.387039845322150*x2_7;
> > > +        const float x7_0 = x6_0 + x6_1;
> > > +        const float x7_1 = x6_0 - x6_1;
> > > +        const float x7_2 = x6_2 + x6_3;
> > > +        const float x7_3 = x6_2 - x6_3;
> > > +        const float x3_5 =  0.707106781186547*x7_1 - 0.707106781186547*x7_3;
> > > +        const float x3_6 =  0.707106781186547*x7_1 + 0.707106781186547*x7_3;
> > > +        const float x8_0 =  1.407403737526380*x0_8 + 0.138617169199091*x0_f;
> > > +        const float x8_1 =  1.353318001174350*x0_9 + 0.410524527522357*x0_e;
> > > +        const float x8_2 =  1.247225012986670*x0_a + 0.666655658477747*x0_d;
> > > +        const float x8_3 =  1.093201867001760*x0_b + 0.897167586342636*x0_c;
> > > +        const float x8_4 = -0.897167586342636*x0_b + 1.093201867001760*x0_c;
> > > +        const float x8_5 =  0.666655658477747*x0_a - 1.247225012986670*x0_d;
> > > +        const float x8_6 = -0.410524527522357*x0_9 + 1.353318001174350*x0_e;
> > > +        const float x8_7 =  0.138617169199091*x0_8 - 1.407403737526380*x0_f;
> > > +        const float xa_0 = x8_0 + x8_3;
> > > +        const float xa_1 = x8_1 + x8_2;
> > > +        const float xa_2 = x8_0 - x8_3;
> > > +        const float xa_3 = x8_1 - x8_2;
> > > +        const float xb_0 = xa_0 + xa_1;
> > > +        const float xb_1 = xa_0 - xa_1;
> > > +        const float xb_2 = 1.306562964876380*xa_2 + 0.541196100146197*xa_3;
> > > +        const float xb_3 = 0.541196100146197*xa_2 - 1.306562964876380*xa_3;
> > > +        const float xc_0 = x8_4 + x8_7;
> > > +        const float xc_1 = x8_5 + x8_6;
> > > +        const float xc_2 = x8_4 - x8_7;
> > > +        const float xc_3 = x8_5 - x8_6;
> > > +        const float xd_0 = xc_0 + xc_1;
> > > +        const float xd_1 = xc_0 - xc_1;
> > > +        const float xd_2 = 1.306562964876380*xc_2 + 0.541196100146197*xc_3;
> > > +        const float xd_3 = 0.541196100146197*xc_2 - 1.306562964876380*xc_3;
> > > +        const float x1_9 = 0.707106781186547*xb_2 - 0.707106781186547*xd_3;
> > > +        const float x1_a = 0.707106781186547*xb_2 + 0.707106781186547*xd_3;
> > > +        const float x1_b = 0.707106781186547*xb_1 + 0.707106781186547*xd_1;
> > > +        const float x1_c = 0.707106781186547*xb_1 - 0.707106781186547*xd_1;
> > > +        const float x1_d = 0.707106781186547*xb_3 - 0.707106781186547*xd_2;
> > > +        const float x1_e = 0.707106781186547*xb_3 + 0.707106781186547*xd_2;
> > > +        dst[ 0*dst_stridea] = 0.25*x5_0;
> > > +        dst[ 1*dst_stridea] = 0.25*xb_0;
> > > +        dst[ 2*dst_stridea] = 0.25*x7_0;
> > > +        dst[ 3*dst_stridea] = 0.25*x1_9;
> > > +        dst[ 4*dst_stridea] = 0.25*x5_2;
> > > +        dst[ 5*dst_stridea] = 0.25*x1_a;
> > > +        dst[ 6*dst_stridea] = 0.25*x3_5;
> > > +        dst[ 7*dst_stridea] = 0.25*x1_b;
> > > +        dst[ 8*dst_stridea] = 0.25*x5_1;
> > > +        dst[ 9*dst_stridea] = 0.25*x1_c;
> > > +        dst[10*dst_stridea] = 0.25*x3_6;
> > > +        dst[11*dst_stridea] = 0.25*x1_d;
> > > +        dst[12*dst_stridea] = 0.25*x5_3;
> > > +        dst[13*dst_stridea] = 0.25*x1_e;
> > > +        dst[14*dst_stridea] = 0.25*x7_2;
> > > +        dst[15*dst_stridea] = 0.25*xd_0;
> > 
> > many of these multiplies look like they can be merged into other
> > multiplies
> > 
> > for example see:
> > 
> > 
> > const float xd_2 = 1.306562964876380*xc_2 + 0.541196100146197*xc_3;
> > const float xb_3 = 0.541196100146197*xa_2 - 1.306562964876380*xa_3;
> > const float x1_d = 0.707106781186547*xb_3 - 0.707106781186547*xd_2;
> > const float x1_e = 0.707106781186547*xb_3 + 0.707106781186547*xd_2;
> > dst[11*dst_stridea] = 0.25*x1_d;
> > dst[13*dst_stridea] = 0.25*x1_e;
> > 
> > vs.
> > 
> > const float xd_2 = (0.25*0.707106781186547*1.306562964876380)*xc_2 + (0.25*0.707106781186547*0.541196100146197)*xc_3;
> > const float xb_3 = (0.25*0.707106781186547*0.541196100146197)*xa_2 - (0.25*0.707106781186547*1.306562964876380)*xa_3;
> > dst[11*dst_stridea] = xb_3 - xd_2;
> > dst[13*dst_stridea] = xb_3 + xd_2;
> 
> also more generally
> if you have 2 stages of butterflies each with 4 multiplies and 2 adds
> in each butterfly
> 
> a----\-/--\---/----------a'
>       X    \ /
> b----/-\----------\---/--b'
>            / \     \ /
> c----\-/--/---\----------c'
>       X            / \
> d----/-\----------/---\--d'
> 
> 
> of additions
> the first stage can scale their output arbitrarily for free by
> changing the respective coefficients
> the second stage can use any scaled input for free by adjusting their
> coefficients similarly, this gives you 4 free parameters in the
> example above which can be
> choosen so as to make some coefficients trivial like +-1.0
> this also works accorss 2D (I)DCTs or with other things before or
> after the (i)dct which can absorb such rescaling

also i suggest that the patch is applied before time is
spend optimizing it further,
also the moving around of multiplies can probably affect numerical
stability if overdone

[...]

-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Good people do not need laws to tell them to act responsibly, while bad
people will find a way around the laws. -- Plato
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 181 bytes
Desc: Digital signature
URL: <https://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20140804/2ad0da32/attachment.asc>