[FFmpeg-devel] [FFMpeg-Devel] [PATCH 1/7] Moved postprocessing routines from postprocess.c to a separate file

Fri Mar 13 19:15:08 CET 2015

This is a somewhat large patchset, mostly because I've tried to break each
patch down into the smallest set of changes I could. On my machine (x86_64
Linux, with AVX2), everything compiles after each patch. Whenever I configure
ffmpeg, libpostproc is always disabled and I have to manually change config.mak
to fix this. I'm not sure if this is an issue on my end or just the default
setting.

This is ultimately a qualifying task for the Google Summer of Code, so I know I
still have a lot of things to work on; for now I just want to make sure I'm
following the proper coding conventions.

This moves the C functions that process blocks horizontally into a separate
file, so that none of the postprocessing algorithms are in the main
postprocess.c file.
---
 libpostproc/postprocess.c   | 352 +----------------------------------------
 libpostproc/postprocess_c.c | 373 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 374 insertions(+), 351 deletions(-)
 create mode 100644 libpostproc/postprocess_c.c

diff --git a/libpostproc/postprocess.c b/libpostproc/postprocess.c
index 9d89782..86c0520 100644
--- a/libpostproc/postprocess.c
+++ b/libpostproc/postprocess.c
@@ -199,357 +199,7 @@ static inline void prefetcht2(const void *p)
 }
 #endif
 
-/* The horizontal functions exist only in C because the MMX
- * code is faster with vertical filters and transposing. */
-
-/**
- * Check if the given 8x8 Block is mostly "flat"
- */
-static inline int isHorizDC_C(const uint8_t src[], int stride, const PPContext *c)
-{
-    int numEq= 0;
-    int y;
-    const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
-    const int dcThreshold= dcOffset*2 + 1;
-
-    for(y=0; y<BLOCK_SIZE; y++){
-        numEq += ((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold;
-        numEq += ((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold;
-        numEq += ((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold;
-        numEq += ((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold;
-        numEq += ((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold;
-        numEq += ((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold;
-        numEq += ((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold;
-        src+= stride;
-    }
-    return numEq > c->ppMode.flatnessThreshold;
-}
-
-/**
- * Check if the middle 8x8 Block in the given 8x16 block is flat
- */
-static inline int isVertDC_C(const uint8_t src[], int stride, const PPContext *c)
-{
-    int numEq= 0;
-    int y;
-    const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
-    const int dcThreshold= dcOffset*2 + 1;
-
-    src+= stride*4; // src points to begin of the 8x8 Block
-    for(y=0; y<BLOCK_SIZE-1; y++){
-        numEq += ((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold;
-        numEq += ((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold;
-        numEq += ((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold;
-        numEq += ((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold;
-        numEq += ((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold;
-        numEq += ((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold;
-        numEq += ((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold;
-        numEq += ((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold;
-        src+= stride;
-    }
-    return numEq > c->ppMode.flatnessThreshold;
-}
-
-static inline int isHorizMinMaxOk_C(const uint8_t src[], int stride, int QP)
-{
-    int i;
-    for(i=0; i<2; i++){
-        if((unsigned)(src[0] - src[5] + 2*QP) > 4*QP) return 0;
-        src += stride;
-        if((unsigned)(src[2] - src[7] + 2*QP) > 4*QP) return 0;
-        src += stride;
-        if((unsigned)(src[4] - src[1] + 2*QP) > 4*QP) return 0;
-        src += stride;
-        if((unsigned)(src[6] - src[3] + 2*QP) > 4*QP) return 0;
-        src += stride;
-    }
-    return 1;
-}
-
-static inline int isVertMinMaxOk_C(const uint8_t src[], int stride, int QP)
-{
-    int x;
-    src+= stride*4;
-    for(x=0; x<BLOCK_SIZE; x+=4){
-        if((unsigned)(src[  x + 0*stride] - src[  x + 5*stride] + 2*QP) > 4*QP) return 0;
-        if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
-        if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
-        if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
-    }
-    return 1;
-}
-
-static inline int horizClassify_C(const uint8_t src[], int stride, const PPContext *c)
-{
-    if( isHorizDC_C(src, stride, c) ){
-        return isHorizMinMaxOk_C(src, stride, c->QP);
-    }else{
-        return 2;
-    }
-}
-
-static inline int vertClassify_C(const uint8_t src[], int stride, const PPContext *c)
-{
-    if( isVertDC_C(src, stride, c) ){
-        return isVertMinMaxOk_C(src, stride, c->QP);
-    }else{
-        return 2;
-    }
-}
-
-static inline void doHorizDefFilter_C(uint8_t dst[], int stride, const PPContext *c)
-{
-    int y;
-    for(y=0; y<BLOCK_SIZE; y++){
-        const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
-
-        if(FFABS(middleEnergy) < 8*c->QP){
-            const int q=(dst[3] - dst[4])/2;
-            const int leftEnergy=  5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
-            const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
-
-            int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
-            d= FFMAX(d, 0);
-
-            d= (5*d + 32) >> 6;
-            d*= FFSIGN(-middleEnergy);
-
-            if(q>0)
-            {
-                d = FFMAX(d, 0);
-                d = FFMIN(d, q);
-            }
-            else
-            {
-                d = FFMIN(d, 0);
-                d = FFMAX(d, q);
-            }
-
-            dst[3]-= d;
-            dst[4]+= d;
-        }
-        dst+= stride;
-    }
-}
-
-/**
- * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
- * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
- */
-static inline void doHorizLowPass_C(uint8_t dst[], int stride, const PPContext *c)
-{
-    int y;
-    for(y=0; y<BLOCK_SIZE; y++){
-        const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
-        const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
-
-        int sums[10];
-        sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
-        sums[1] = sums[0] - first  + dst[3];
-        sums[2] = sums[1] - first  + dst[4];
-        sums[3] = sums[2] - first  + dst[5];
-        sums[4] = sums[3] - first  + dst[6];
-        sums[5] = sums[4] - dst[0] + dst[7];
-        sums[6] = sums[5] - dst[1] + last;
-        sums[7] = sums[6] - dst[2] + last;
-        sums[8] = sums[7] - dst[3] + last;
-        sums[9] = sums[8] - dst[4] + last;
-
-        dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
-        dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
-        dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
-        dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
-        dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
-        dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
-        dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
-        dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
-
-        dst+= stride;
-    }
-}
-
-/**
- * Experimental Filter 1 (Horizontal)
- * will not damage linear gradients
- * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
- * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
- * MMX2 version does correct clipping C version does not
- * not identical with the vertical one
- */
-static inline void horizX1Filter(uint8_t *src, int stride, int QP)
-{
-    int y;
-    static uint64_t lut[256];
-    if(!lut[255])
-    {
-        int i;
-        for(i=0; i<256; i++)
-        {
-            int v= i < 128 ? 2*i : 2*(i-256);
-/*
-//Simulate 112242211 9-Tap filter
-            uint64_t a= (v/16)  & 0xFF;
-            uint64_t b= (v/8)   & 0xFF;
-            uint64_t c= (v/4)   & 0xFF;
-            uint64_t d= (3*v/8) & 0xFF;
-*/
-//Simulate piecewise linear interpolation
-            uint64_t a= (v/16)   & 0xFF;
-            uint64_t b= (v*3/16) & 0xFF;
-            uint64_t c= (v*5/16) & 0xFF;
-            uint64_t d= (7*v/16) & 0xFF;
-            uint64_t A= (0x100 - a)&0xFF;
-            uint64_t B= (0x100 - b)&0xFF;
-            uint64_t C= (0x100 - c)&0xFF;
-            uint64_t D= (0x100 - c)&0xFF;
-
-            lut[i]   = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
-                       (D<<24) | (C<<16) | (B<<8)  | (A);
-            //lut[i] = (v<<32) | (v<<24);
-        }
-    }
-
-    for(y=0; y<BLOCK_SIZE; y++){
-        int a= src[1] - src[2];
-        int b= src[3] - src[4];
-        int c= src[5] - src[6];
-
-        int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0);
-
-        if(d < QP){
-            int v = d * FFSIGN(-b);
-
-            src[1] +=v/8;
-            src[2] +=v/4;
-            src[3] +=3*v/8;
-            src[4] -=3*v/8;
-            src[5] -=v/4;
-            src[6] -=v/8;
-        }
-        src+=stride;
-    }
-}
-
-/**
- * accurate deblock filter
- */
-static av_always_inline void do_a_deblock_C(uint8_t *src, int step,
-                                            int stride, const PPContext *c, int mode)
-{
-    int y;
-    const int QP= c->QP;
-    const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
-    const int dcThreshold= dcOffset*2 + 1;
-//START_TIMER
-    src+= step*4; // src points to begin of the 8x8 Block
-    for(y=0; y<8; y++){
-        int numEq= 0;
-
-        numEq += ((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold;
-        numEq += ((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold;
-        numEq += ((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold;
-        numEq += ((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold;
-        numEq += ((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold;
-        numEq += ((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold;
-        numEq += ((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold;
-        numEq += ((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold;
-        numEq += ((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold;
-        if(numEq > c->ppMode.flatnessThreshold){
-            int min, max, x;
-
-            if(src[0] > src[step]){
-                max= src[0];
-                min= src[step];
-            }else{
-                max= src[step];
-                min= src[0];
-            }
-            for(x=2; x<8; x+=2){
-                if(src[x*step] > src[(x+1)*step]){
-                        if(src[x    *step] > max) max= src[ x   *step];
-                        if(src[(x+1)*step] < min) min= src[(x+1)*step];
-                }else{
-                        if(src[(x+1)*step] > max) max= src[(x+1)*step];
-                        if(src[ x   *step] < min) min= src[ x   *step];
-                }
-            }
-            if(max-min < 2*QP){
-                const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
-                const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
-
-                int sums[10];
-                sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
-                sums[1] = sums[0] - first       + src[3*step];
-                sums[2] = sums[1] - first       + src[4*step];
-                sums[3] = sums[2] - first       + src[5*step];
-                sums[4] = sums[3] - first       + src[6*step];
-                sums[5] = sums[4] - src[0*step] + src[7*step];
-                sums[6] = sums[5] - src[1*step] + last;
-                sums[7] = sums[6] - src[2*step] + last;
-                sums[8] = sums[7] - src[3*step] + last;
-                sums[9] = sums[8] - src[4*step] + last;
-
-                if (mode & VISUALIZE) {
-                    src[0*step] =
-                    src[1*step] =
-                    src[2*step] =
-                    src[3*step] =
-                    src[4*step] =
-                    src[5*step] =
-                    src[6*step] =
-                    src[7*step] = 128;
-                }
-                src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
-                src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
-                src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
-                src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
-                src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
-                src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
-                src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
-                src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
-            }
-        }else{
-            const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
-
-            if(FFABS(middleEnergy) < 8*QP){
-                const int q=(src[3*step] - src[4*step])/2;
-                const int leftEnergy=  5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
-                const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
-
-                int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
-                d= FFMAX(d, 0);
-
-                d= (5*d + 32) >> 6;
-                d*= FFSIGN(-middleEnergy);
-
-                if(q>0){
-                    d = FFMAX(d, 0);
-                    d = FFMIN(d, q);
-                }else{
-                    d = FFMIN(d, 0);
-                    d = FFMAX(d, q);
-                }
-
-                if ((mode & VISUALIZE) && d) {
-                    d= (d < 0) ? 32 : -32;
-                    src[3*step]= av_clip_uint8(src[3*step] - d);
-                    src[4*step]= av_clip_uint8(src[4*step] + d);
-                    d = 0;
-                }
-
-                src[3*step]-= d;
-                src[4*step]+= d;
-            }
-        }
-
-        src += stride;
-    }
-/*if(step==16){
-    STOP_TIMER("step16")
-}else{
-    STOP_TIMER("stepX")
-}*/
-}
+#include "postprocess_c.c"
 
 //Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one
 //Plain C versions
diff --git a/libpostproc/postprocess_c.c b/libpostproc/postprocess_c.c
new file mode 100644
index 0000000..bf22e95
--- /dev/null
+++ b/libpostproc/postprocess_c.c
@@ -0,0 +1,373 @@
+/******************************************************************************
+* C implementation of postprocessing routines
+* Copyright (C) 2001-2002 Michael Niedermayer (michaelni at gmx.at)
+* Copyright (c) 2015 Tucker DiNapoli
+*
+* This file is part of FFmpeg.
+*
+* FFmpeg is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public
+* License as published by the Free Software Foundation; either
+* version 2.1 of the License, or (at your option) any later version.
+*
+* FFmpeg is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+* Lesser General Public License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public
+* License along with FFmpeg; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+******************************************************************************/
+
+/* The horizontal functions exist only in C because the MMX
+ * code is faster with vertical filters and transposing. */
+
+/**
+ * Check if the given 8x8 Block is mostly "flat"
+ */
+static inline int isHorizDC_C(const uint8_t src[], int stride, const PPContext *c)
+{
+    int numEq= 0;
+    int y;
+    const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
+    const int dcThreshold= dcOffset*2 + 1;
+
+    for(y=0; y<BLOCK_SIZE; y++){
+        numEq += ((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold;
+        numEq += ((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold;
+        numEq += ((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold;
+        numEq += ((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold;
+        numEq += ((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold;
+        numEq += ((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold;
+        numEq += ((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold;
+        src+= stride;
+    }
+    return numEq > c->ppMode.flatnessThreshold;
+}
+
+/**
+ * Check if the middle 8x8 Block in the given 8x16 block is flat
+ */
+static inline int isVertDC_C(const uint8_t src[], int stride, const PPContext *c)
+{
+    int numEq= 0;
+    int y;
+    const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
+    const int dcThreshold= dcOffset*2 + 1;
+
+    src+= stride*4; // src points to begin of the 8x8 Block
+    for(y=0; y<BLOCK_SIZE-1; y++){
+        numEq += ((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold;
+        numEq += ((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold;
+        numEq += ((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold;
+        numEq += ((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold;
+        numEq += ((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold;
+        numEq += ((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold;
+        numEq += ((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold;
+        numEq += ((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold;
+        src+= stride;
+    }
+    return numEq > c->ppMode.flatnessThreshold;
+}
+
+static inline int isHorizMinMaxOk_C(const uint8_t src[], int stride, int QP)
+{
+    int i;
+    for(i=0; i<2; i++){
+        if((unsigned)(src[0] - src[5] + 2*QP) > 4*QP) return 0;
+        src += stride;
+        if((unsigned)(src[2] - src[7] + 2*QP) > 4*QP) return 0;
+        src += stride;
+        if((unsigned)(src[4] - src[1] + 2*QP) > 4*QP) return 0;
+        src += stride;
+        if((unsigned)(src[6] - src[3] + 2*QP) > 4*QP) return 0;
+        src += stride;
+    }
+    return 1;
+}
+
+static inline int isVertMinMaxOk_C(const uint8_t src[], int stride, int QP)
+{
+    int x;
+    src+= stride*4;
+    for(x=0; x<BLOCK_SIZE; x+=4){
+        if((unsigned)(src[  x + 0*stride] - src[  x + 5*stride] + 2*QP) > 4*QP) return 0;
+        if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
+        if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
+        if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
+    }
+    return 1;
+}
+
+static inline int horizClassify_C(const uint8_t src[], int stride, const PPContext *c)
+{
+    if( isHorizDC_C(src, stride, c) ){
+        return isHorizMinMaxOk_C(src, stride, c->QP);
+    }else{
+        return 2;
+    }
+}
+
+static inline int vertClassify_C(const uint8_t src[], int stride, const PPContext *c)
+{
+    if( isVertDC_C(src, stride, c) ){
+        return isVertMinMaxOk_C(src, stride, c->QP);
+    }else{
+        return 2;
+    }
+}
+
+static inline void doHorizDefFilter_C(uint8_t dst[], int stride, const PPContext *c)
+{
+    int y;
+    for(y=0; y<BLOCK_SIZE; y++){
+        const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
+
+        if(FFABS(middleEnergy) < 8*c->QP){
+            const int q=(dst[3] - dst[4])/2;
+            const int leftEnergy=  5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
+            const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
+
+            int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
+            d= FFMAX(d, 0);
+
+            d= (5*d + 32) >> 6;
+            d*= FFSIGN(-middleEnergy);
+
+            if(q>0)
+            {
+                d = FFMAX(d, 0);
+                d = FFMIN(d, q);
+            }
+            else
+            {
+                d = FFMIN(d, 0);
+                d = FFMAX(d, q);
+            }
+
+            dst[3]-= d;
+            dst[4]+= d;
+        }
+        dst+= stride;
+    }
+}
+
+/**
+ * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
+ * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
+ */
+static inline void doHorizLowPass_C(uint8_t dst[], int stride, const PPContext *c)
+{
+    int y;
+    for(y=0; y<BLOCK_SIZE; y++){
+        const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
+        const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
+
+        int sums[10];
+        sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
+        sums[1] = sums[0] - first  + dst[3];
+        sums[2] = sums[1] - first  + dst[4];
+        sums[3] = sums[2] - first  + dst[5];
+        sums[4] = sums[3] - first  + dst[6];
+        sums[5] = sums[4] - dst[0] + dst[7];
+        sums[6] = sums[5] - dst[1] + last;
+        sums[7] = sums[6] - dst[2] + last;
+        sums[8] = sums[7] - dst[3] + last;
+        sums[9] = sums[8] - dst[4] + last;
+
+        dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
+        dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
+        dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
+        dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
+        dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
+        dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
+        dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
+        dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
+
+        dst+= stride;
+    }
+}
+
+/**
+ * Experimental Filter 1 (Horizontal)
+ * will not damage linear gradients
+ * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
+ * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
+ * MMX2 version does correct clipping C version does not
+ * not identical with the vertical one
+ */
+static inline void horizX1Filter(uint8_t *src, int stride, int QP)
+{
+    int y;
+    static uint64_t lut[256];
+    if(!lut[255])
+    {
+        int i;
+        for(i=0; i<256; i++)
+        {
+            int v= i < 128 ? 2*i : 2*(i-256);
+/*
+//Simulate 112242211 9-Tap filter
+            uint64_t a= (v/16)  & 0xFF;
+            uint64_t b= (v/8)   & 0xFF;
+            uint64_t c= (v/4)   & 0xFF;
+            uint64_t d= (3*v/8) & 0xFF;
+*/
+//Simulate piecewise linear interpolation
+            uint64_t a= (v/16)   & 0xFF;
+            uint64_t b= (v*3/16) & 0xFF;
+            uint64_t c= (v*5/16) & 0xFF;
+            uint64_t d= (7*v/16) & 0xFF;
+            uint64_t A= (0x100 - a)&0xFF;
+            uint64_t B= (0x100 - b)&0xFF;
+            uint64_t C= (0x100 - c)&0xFF;
+            uint64_t D= (0x100 - c)&0xFF;
+
+            lut[i]   = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
+                       (D<<24) | (C<<16) | (B<<8)  | (A);
+            //lut[i] = (v<<32) | (v<<24);
+        }
+    }
+
+    for(y=0; y<BLOCK_SIZE; y++){
+        int a= src[1] - src[2];
+        int b= src[3] - src[4];
+        int c= src[5] - src[6];
+
+        int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0);
+
+        if(d < QP){
+            int v = d * FFSIGN(-b);
+
+            src[1] +=v/8;
+            src[2] +=v/4;
+            src[3] +=3*v/8;
+            src[4] -=3*v/8;
+            src[5] -=v/4;
+            src[6] -=v/8;
+        }
+        src+=stride;
+    }
+}
+
+/**
+ * accurate deblock filter
+ */
+static av_always_inline void do_a_deblock_C(uint8_t *src, int step,
+                                            int stride, const PPContext *c, int mode)
+{
+    int y;
+    const int QP= c->QP;
+    const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
+    const int dcThreshold= dcOffset*2 + 1;
+//START_TIMER
+    src+= step*4; // src points to begin of the 8x8 Block
+    for(y=0; y<8; y++){
+        int numEq= 0;
+
+        numEq += ((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold;
+        numEq += ((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold;
+        numEq += ((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold;
+        numEq += ((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold;
+        numEq += ((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold;
+        numEq += ((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold;
+        numEq += ((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold;
+        numEq += ((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold;
+        numEq += ((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold;
+        if(numEq > c->ppMode.flatnessThreshold){
+            int min, max, x;
+
+            if(src[0] > src[step]){
+                max= src[0];
+                min= src[step];
+            }else{
+                max= src[step];
+                min= src[0];
+            }
+            for(x=2; x<8; x+=2){
+                if(src[x*step] > src[(x+1)*step]){
+                        if(src[x    *step] > max) max= src[ x   *step];
+                        if(src[(x+1)*step] < min) min= src[(x+1)*step];
+                }else{
+                        if(src[(x+1)*step] > max) max= src[(x+1)*step];
+                        if(src[ x   *step] < min) min= src[ x   *step];
+                }
+            }
+            if(max-min < 2*QP){
+                const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
+                const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
+
+                int sums[10];
+                sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
+                sums[1] = sums[0] - first       + src[3*step];
+                sums[2] = sums[1] - first       + src[4*step];
+                sums[3] = sums[2] - first       + src[5*step];
+                sums[4] = sums[3] - first       + src[6*step];
+                sums[5] = sums[4] - src[0*step] + src[7*step];
+                sums[6] = sums[5] - src[1*step] + last;
+                sums[7] = sums[6] - src[2*step] + last;
+                sums[8] = sums[7] - src[3*step] + last;
+                sums[9] = sums[8] - src[4*step] + last;
+
+                if (mode & VISUALIZE) {
+                    src[0*step] =
+                    src[1*step] =
+                    src[2*step] =
+                    src[3*step] =
+                    src[4*step] =
+                    src[5*step] =
+                    src[6*step] =
+                    src[7*step] = 128;
+                }
+                src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
+                src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
+                src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
+                src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
+                src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
+                src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
+                src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
+                src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
+            }
+        }else{
+            const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
+
+            if(FFABS(middleEnergy) < 8*QP){
+                const int q=(src[3*step] - src[4*step])/2;
+                const int leftEnergy=  5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
+                const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
+
+                int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
+                d= FFMAX(d, 0);
+
+                d= (5*d + 32) >> 6;
+                d*= FFSIGN(-middleEnergy);
+
+                if(q>0){
+                    d = FFMAX(d, 0);
+                    d = FFMIN(d, q);
+                }else{
+                    d = FFMIN(d, 0);
+                    d = FFMAX(d, q);
+                }
+
+                if ((mode & VISUALIZE) && d) {
+                    d= (d < 0) ? 32 : -32;
+                    src[3*step]= av_clip_uint8(src[3*step] - d);
+                    src[4*step]= av_clip_uint8(src[4*step] + d);
+                    d = 0;
+                }
+
+                src[3*step]-= d;
+                src[4*step]+= d;
+            }
+        }
+
+        src += stride;
+    }
+/*if(step==16){
+    STOP_TIMER("step16")
+}else{
+    STOP_TIMER("stepX")
+}*/
+}
-- 
2.2.1