[FFmpeg-devel] [FFMpeg-Devel] Ideas for changes to libpostproc

Tucker DiNapoli t.dinapoli42 at gmail.com
Wed Mar 18 01:39:02 CET 2015


This isn't really a patch, but it's easiest to express my ideas in the form of
code. As a patch it creates a single file which is mostly composed of a rewrite
of the main postprocessing loop. I've tried to express most of my ideas in
the form of changes to the code, but in cases where that would be too much
work, or wouldn't make sense in this file I've written my ideas in comments.

I'm mostly looking for opinions/criticisms on my ideas, not necessarily the
code itself. I'm fully willing to change code, but I'm more interested in
whether or not my ideas make sense.

Updating libpostproc is something I plan to do for the Google Summer of Code,
so I can't make all the changes I'd like now. I need to have some sort of
qualification task complete within the next week. I've submitted some patches to
the mailing list already, and those are more along the lines of what I want to do
right now; this code is more of an idea for work to do over the summer.

Tucker DiNapoli

---
 libpostproc/postprocess_main.c | 606 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 606 insertions(+)
diff --git a/libpostproc/postprocess_main.c b/libpostproc/postprocess_main.c
new file mode 100644
index 0000000..9ca62a2


--- /dev/null
+++ b/libpostproc/postprocess_main.c
@@ -0,0 +1,606 @@
+#include "postprocess_internal.h"
+//the assembler versions are named x264_name_instruction_set
+#if HAVE_AVX2
+#define RENAME(name) x264_##name##_avx2
+#elif HAVE_SSE2
+#define RENAME(name) x264_##name##_sse2
+#elif HAVE_MMX2
+#define RENAME(name) x264_##name##_mmx2
+#else
+#define RENAME(name) name##_C
+#endif
+//Prototypes for the per-ISA implementations.
+//fixed: removed the duplicated vertClassify/doVertLowPass declarations and
+//repaired the two malformed declarations below.
+int  RENAME(vertClassify)(const uint8_t src[], int stride, PPContext *c);
+void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c);
+void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co);
+void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c);
+void RENAME(dering)(uint8_t src[], int stride, PPContext *c);
+void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride);
+void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride);
+void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp);
+void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2);
+void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp);
+void RENAME(deInterlaceMedian)(uint8_t src[], int stride);
+void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2,
+                        const uint8_t *src, int srcStride);
+void RENAME(transpose2)(uint8_t *dst, int dstStride, const uint8_t *src);
+//fixed: declaration was truncated (ended in ",;"); parameters reconstructed
+//from the call site in postProcess() -- confirm against the definition
+void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
+                              uint8_t *tempBlurred, uint32_t *tempBlurredPast,
+                              const int *maxNoise);
+//fixed: stray "{;" replaced with ";"
+void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, const PPContext *c, int mode);
+void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride,
+                         int width, int height, const QP_STORE_T QPs[],
+                         int QPStride, int isColor, PPContext *c);
+void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t src[], int srcStride,
+                       int levelFix, int64_t *packedOffsetAndScale);
+void RENAME(duplicate)(uint8_t src[], int stride);
+#if ARCH_X86 && HAVE_INLINE_ASM
+/*
+ * Cache-prefetch hint wrappers.  On x86 with inline asm these emit the
+ * actual prefetch instructions; elsewhere they fall back to
+ * __builtin_prefetch (or to no-ops on old compilers).  All of them are
+ * hints only -- they never fault, even on invalid addresses.
+ */
+
+//prefetch into a non-temporal (streaming) cache location: data is read once
+static inline void prefetchnta(const void *p)
+{
+    __asm__ volatile(   "prefetchnta (%0)\n\t"
+        : : "r" (p)
+    );
+}
+
+//prefetch into all cache levels
+static inline void prefetcht0(const void *p)
+{
+    __asm__ volatile(   "prefetcht0 (%0)\n\t"
+        : : "r" (p)
+    );
+}
+
+//prefetch into L2 and higher
+static inline void prefetcht1(const void *p)
+{
+    __asm__ volatile(   "prefetcht1 (%0)\n\t"
+        : : "r" (p)
+    );
+}
+
+//prefetch into L3 and higher
+static inline void prefetcht2(const void *p)
+{
+    __asm__ volatile(   "prefetcht2 (%0)\n\t"
+        : : "r" (p)
+    );
+}
+#else
+//judging by the gcc manuals this is a conservative estimate for when
+// __builtin_prefetch was added
+// __builtin_prefetch(addr, rw, locality): rw=0 means read,
+// locality 0..3 maps nta/t2/t1/t0 roughly onto temporal locality hints
+#if AV_GCC_VERSION_AT_LEAST(3,3)
+#define prefetchnta(p) __builtin_prefetch(p,0,0)
+#define prefetcht0(p) __builtin_prefetch(p,0,1)
+#define prefetcht1(p) __builtin_prefetch(p,0,2)
+#define prefetcht2(p) __builtin_prefetch(p,0,3)
+#else
+//no prefetch support at all: expand to nothing
+#define prefetchnta(p)
+#define prefetcht0(p)
+#define prefetcht1(p)
+#define prefetcht2(p)
+#endif
+#endif
+
+/*void deInterlaceInterpolateLinear(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
+                                  int width, int height, PPContext *c){
+    //assume the 1st line has been dealt with
+    static const copy_ahead = 5;
+    int aligned_height = (height - 16) % c->block_height;
+    int aligned_width = width % c->block_width;
+    int x,y;
+    const int mode= isColor ? c->ppMode.chromMode : c->ppMode.lumMode;
+    for(y=0; y < aligned_height; y+=c->block_height){
+        const uint8_t *src_block = src+(y * src_stride);
+        uint8_t *dst_block = dst+(y * dst_stride);
+        //((x>>2)&6) = x/4 - (x % 64)
+        prefetchnta(src_block + (((x>>2)&6) + 5) * (srcStride + 32));
+        prefetchnta(src_block + (((x>>2)&6) + 6) * (srcStride + 32));
+        prefetcht0(dst_block + (((x>>2)&6) + 5) * (dstStride + 32));
+        prefetcht0(dst_block + (((x>>2)&6) + 6) * (dstStride + 32));
+        RENAME(blockCopy)(dst_block*/
+
+//Broadcast each QP byte in src to a full quadword in dst so the SIMD
+//filters can compare whole registers at once.
+//fixed: added the missing "void" return type, added the missing ";" after
+//the SSSE3 shuffle mask, and made the asm operands reference the pointed-to
+//data rather than the pointer variables themselves.
+static inline void duplicate_byte_to_quadword(int *src, int *dst){
+/* c version, showing what's going on
+   for(i=0; i <c.block_width/8; i++){
+      int tempQP;
+      tempQP = c.QP[i]<<8;
+      tempQP |= (tempQP<<16);
+      c.pQPb[i] = (tempQP | (uint64_t)tempQP << 32);
+   }
+*/
+/*
+  The majority of assembler code is in separate files, but since this
+  is such a small amount the cost of a function call would outweigh
+  the benefits of using assembly, so it's either inline asm or using C
+*/
+
+#if HAVE_AVX2
+    DECLARE_ALIGNED(32, uint64_t, shuffle_mask)[4] = {0x0000000000000000,
+                                                      0x0808080808080808,
+                                                      0x0000000000000000,
+                                                      0x0808080808080808};
+    __asm__ volatile(
+                     "vmovdqa %1, %%ymm0\n\t"
+                     "vmovdqa %2, %%ymm1\n\t"
+                     //fixed: AT&T operand order is (control, source, dest);
+                     //ymm1 holds the shuffle mask, ymm0 holds the data
+                     "vpshufb %%ymm1, %%ymm0, %%ymm0\n\t"
+                     "vmovdqa %%ymm0, %0\n"
+                     : "=m" (*dst)
+                     : "m"(*src), "m"(shuffle_mask)
+                     : "ymm0","ymm1");
+#elif HAVE_SSE2
+#if HAVE_SSSE3
+    DECLARE_ALIGNED(32, uint64_t, shuffle_mask)[2] = {0x0000000000000000,
+                                                      0x0808080808080808};
+    __asm__ volatile(
+                     "movdqa %1, %%xmm0\n\t"
+                     "movdqa %2, %%xmm1\n\t"
+                     "pshufb %%xmm1, %%xmm0\n\t"
+                     "movdqa %%xmm0, %0\n"
+                     : "=m" (*dst)
+                     : "m"(*src), "m"(shuffle_mask)
+                     : "xmm0","xmm1");
+#else
+    __asm__ volatile(
+                     "movdqa %1, %%xmm0\n\t"
+                     //fixed: the next two instructions were missing their
+                     //"\n\t" separators, producing invalid assembly
+                     "pshuflw $0, %%xmm0, %%xmm0\n\t"
+                     "shufpd $0, %%xmm0, %%xmm0\n\t"
+                     "packuswb %%xmm0, %%xmm0\n\t"
+                     "movdqa %%xmm0, %0\n"
+                     : "=m"(*dst)
+                     : "m"(*src)
+                     : "xmm0");
+#endif
+#elif HAVE_MMX
+    __asm__ volatile(
+                     "movd %1, %%mm0\n\t"
+                     "packuswb %%mm0, %%mm0\n\t"
+                     "packuswb %%mm0, %%mm0\n\t"
+                     "packuswb %%mm0, %%mm0\n\t"
+                     //fixed: the result was built in mm0 but stored from mm7
+                     "movq %%mm0, %0\n\t"
+                     : "=m" (*dst)
+                     : "r" (*src)
+                     : "mm0");
+#endif
+    return;
+}
+/*
+  Changes to structs:
+  add level_fix field to PPMode
+  add block_width and block_height to PPContext
+
+  pQPb, pQPb2, packedYOffset, packedYScale, mmxDcOffset, mmxDcThreshold
+  will probably need to be changed to support sse/avx/variable block sizes
+*/
+/**
+ * Filter array of bytes (Y or U or V values)
+ */
+/**
+ * Filter array of bytes (Y or U or V values).
+ * src/dst are planes of width x height bytes with the given strides; QPs is
+ * the quantizer table (QPStride entries per macroblock row) steering the
+ * deblock/dering filters; isColor selects the chroma mode bits and chroma
+ * subsampling shifts.  The PPContext is copied to the stack and written back
+ * at the end.
+ * NOTE(review): RFC code -- the TODO comments below mark planned loop-bound
+ * changes (aligned_width/aligned_height) that are not made yet.
+ */
+static void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
+                        const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2)
+{
+    DECLARE_ALIGNED(8, PPContext, c)= *c2; //copy to stack for faster access
+    int x,y;
+
+    const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode;
+
+    int black=0, white=255; // blackest black and whitest white in the picture
+    int QPCorrecture= 256*256;
+
+    int copyAhead;//this may need to be changed I'm not sure
+#if ARCH_X86
+    int i, j;
+#endif
+
+    const int qpHShift= isColor ? 4-c.hChromaSubSample : 4;
+    const int qpVShift= isColor ? 4-c.vChromaSubSample : 4;
+
+    //FIXME remove
+    uint64_t * const yHistogram= c.yHistogram;
+    uint8_t * const tempSrc= srcStride > 0 ? c.tempSrc : c.tempSrc - 23*srcStride;
+    uint8_t * const tempDst= (dstStride > 0 ? c.tempDst : c.tempDst - 23*dstStride) + 32;
+
+    const int aligned_height = height - (height % c.block_height);
+    const int aligned_width = width - (width % c.block_width);
+    //actual filter may take 2 or 4 args, so can't specify the actual function type
+    //alternatively redefine all deinterlace filters to take 4 args
+    void(*deinterlace_filter)();
+    //const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4;
+
+    if (mode & VISUALIZE){
+        //fixed: TEMPLATE_PP_MMX is never defined in this file (it uses the
+        //HAVE_* macros); keep the same intent -- warn whenever SIMD is on
+        if(!(mode & (V_A_DEBLOCK | H_A_DEBLOCK)) || HAVE_MMX) {
+            av_log(c2, AV_LOG_WARNING, "Visualization is currently only supported with the accurate deblock filter without SIMD\n");
+        }
+    }
+//This may need to be changed, depending on how dc_offset/threshold work,
+//in order to support more blocks at once/support sse/avx
+#if ARCH_X86
+    for(i=0; i<57; i++){
+        int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1;
+        int threshold= offset*2 + 1;
+//        for(j = 0;j < c.block_width/8; j++){
+        c.mmxDcOffset[i]= 0x7F - offset;
+        c.mmxDcThreshold[i]= 0x7F - threshold;
+        c.mmxDcOffset[i]*= 0x0101010101010101LL;
+        c.mmxDcThreshold[i]*= 0x0101010101010101LL;
+//        }
+    }
+#endif
+
+    //fixed: the deinterlace filters are only declared through RENAME(), so
+    //select them the same way (the bare names would be undefined symbols)
+    if(mode & CUBIC_IPOL_DEINT_FILTER){
+        copyAhead = 8;
+        deinterlace_filter = RENAME(deInterlaceInterpolateCubic);
+    } else if(mode & LINEAR_BLEND_DEINT_FILTER){
+        copyAhead = 6;
+        deinterlace_filter = RENAME(deInterlaceBlendLinear);
+    } else if(mode & FFMPEG_DEINT_FILTER){
+        copyAhead = 6;
+        deinterlace_filter = RENAME(deInterlaceFF);
+    } else if(mode & LOWPASS5_DEINT_FILTER){
+        copyAhead = 6;
+        deinterlace_filter = RENAME(deInterlaceL5);
+    } else if(mode & LINEAR_IPOL_DEINT_FILTER){
+        copyAhead = 5;
+        deinterlace_filter = RENAME(deInterlaceInterpolateLinear);
+    } else if(mode & MEDIAN_DEINT_FILTER){
+        copyAhead = 5;
+        deinterlace_filter = RENAME(deInterlaceMedian); //fixed: was the undeclared MedianDeinterlace
+    } else {
+        //TODO: no prototype for a dummy (no-op) filter exists yet -- add one above
+        deinterlace_filter = RENAME(deInterlaceDummy);
+        if(mode & V_A_DEBLOCK){
+            copyAhead = 5;
+        } else if(mode & V_DEBLOCK){
+            copyAhead = 6;
+        } else if(mode & V_X1_FILTER){
+            copyAhead = 3; //fixed: was "==" (a comparison), leaving copyAhead uninitialized
+        } else if(mode & DERING){
+            copyAhead = 1;
+        } else {
+            copyAhead = 0;
+        }
+    }
+    if(!isColor){
+        uint64_t sum = 0;
+        int i;
+        uint64_t maxClipped;
+        uint64_t clipped;
+        double scale;
+
+        c.frameNum++;
+        // first frame is fscked so we ignore it
+        if(c.frameNum == 1) yHistogram[0]= width*(uint64_t)height/64*15/256;
+
+        for(i=0; i<256; i++){
+            sum+= yHistogram[i];
+        }
+
+        /* We always get a completely black picture first. */
+        maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold);
+
+        clipped= sum;
+        for(black=255; black>0; black--){
+            if(clipped < maxClipped) break;
+            clipped-= yHistogram[black];
+        }
+
+        clipped= sum;
+        for(white=0; white<256; white++){
+            if(clipped < maxClipped) break;
+            clipped-= yHistogram[white];
+        }
+
+        scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black);
+
+#if ARCH_X86
+        //fixed: packedYScale/packedYOffset are arrays in this branch; the
+        //original combined the scalar element with the bare array name.
+        //Build element 0, then replicate it into the remaining elements.
+        c.packedYScale[0]= (uint16_t)(scale*256.0 + 0.5);
+        c.packedYOffset[0]= (((black*c.packedYScale[0])>>8) - c.ppMode.minAllowedY) & 0xFFFF;
+
+        c.packedYOffset[0]|= c.packedYOffset[0]<<32;
+        c.packedYOffset[0]|= c.packedYOffset[0]<<16;
+
+        c.packedYScale[0]|= c.packedYScale[0]<<32;
+        c.packedYScale[0]|= c.packedYScale[0]<<16;
+
+        c.packedYScale[1] = c.packedYScale[2] = c.packedYScale[3] = c.packedYScale[0];
+        c.packedYOffset[1] = c.packedYOffset[2] = c.packedYOffset[3] = c.packedYOffset[0];
+#else
+        c.packedYScale= (uint16_t)(scale*1024.0 + 0.5);
+        c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF;
+
+        c.packedYOffset|= c.packedYOffset<<32;
+        c.packedYOffset|= c.packedYOffset<<16;
+
+        c.packedYScale|= c.packedYScale<<32;
+        c.packedYScale|= c.packedYScale<<16;
+#endif
+
+        if(c.ppMode.level_fix)        QPCorrecture= (int)(scale*256*256 + 0.5); //fixed: PPMode -> ppMode
+        else                        QPCorrecture= 256*256;
+    } else {
+#if ARCH_X86
+        //fixed: a brace-enclosed list cannot be assigned to an array after
+        //its declaration; fill the elements instead
+        for(i=0; i<4; i++){
+            c.packedYScale[i]= 0x0100010001000100LL;
+            c.packedYOffset[i]= 0;
+        }
+#else
+        c.packedYScale= 0x0100010001000100LL;
+        c.packedYOffset= 0;
+#endif
+        QPCorrecture= 256*256;
+    }
+
+    /* copy & deinterlace first row of blocks */
+    y = -c.block_height; //fixed: bare block_height is undeclared
+    {
+        const uint8_t *srcBlock= &(src[y*srcStride]);
+        uint8_t *dstBlock= tempDst + dstStride;
+//change width to aligned_width and add another loop after this to deal with
+//extra unaligned blocks
+        for(x=0; x<width; x += c.block_width){
+            //(x>>2)&6 == x/4 - x%64;
+            prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
+            prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
+            prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
+            prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
+
+            RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
+                              srcBlock + srcStride*8, srcStride, c.ppMode.level_fix, &c.packedYOffset); //fixed: PPMode -> ppMode
+
+            RENAME(duplicate)(dstBlock + dstStride*8, dstStride);
+//Requires possible extra args and an explicit function call, but even so this should
+//still be faster than all the else/if checks that were here before
+            deinterlace_filter(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
+
+            dstBlock += c.block_width;
+            srcBlock += c.block_width;
+        }
+        if(width==FFABS(dstStride))
+            linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride);
+        else{
+            int i;
+            for(i=0; i<copyAhead; i++){
+                memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width);
+            }
+        }
+    }
+
+//change height to aligned height, and add extra loop after to deal with
+//extra unaligned blocks and temporary buffer stuff
+    for(y=0; y<height; y+=c.block_height){
+        //1% speedup if these are here instead of the inner loop
+        const uint8_t *srcBlock= &(src[y*srcStride]);
+        uint8_t *dstBlock= &(dst[y*dstStride]);
+#if ARCH_X86
+        uint8_t *tempBlock1= c.tempBlocks;
+        uint8_t *tempBlock2= c.tempBlocks + 8;
+#endif
+        const int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride];
+        int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*FFABS(QPStride)];
+        int QP=0;
+        //TODO: move this out of the inner loop, need to change loop bounds first
+        /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
+           if not than use a temporary buffer */
+        if(y+15 >= height){
+            int i;
+            /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
+               blockcopy to dst later */
+            linecpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
+                    FFMAX(height-y-copyAhead, 0), srcStride);
+
+            /* duplicate last line of src to fill the void up to line (copyAhead+7) */
+            for(i=FFMAX(height-y, 8); i<copyAhead+8; i++)
+                    memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), FFABS(srcStride));
+
+            /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
+            linecpy(tempDst, dstBlock - dstStride, FFMIN(height-y+1, copyAhead+1), dstStride);
+
+            /* duplicate last line of dst to fill the void up to line (copyAhead) */
+            for(i=height-y+1; i<=copyAhead; i++)
+                    memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), FFABS(dstStride));
+
+            dstBlock= tempDst + dstStride;
+            srcBlock= tempSrc;
+        }
+
+        // From this point on it is guaranteed that we can read and write 16 lines downward
+        // finish 1 block before the next otherwise we might have a problem
+        // with the L1 Cache of the P4 ... or only a few blocks at a time or something
+
+        //change width to aligned width
+        for(x=0; x<width; x+=c.block_width){
+            const int stride= dstStride;
+            int i; //fixed: i is used below regardless of ARCH_X86
+#if ARCH_X86
+            uint8_t *tmpXchg;
+#endif
+//change QP/nonBQP to an array/pointer to allow processing multiple blocks at a time
+//maybe change them to 64 bit types, unless it'd cause too much of a slowdown on 32 bit
+//platforms
+            for(i=0;i<c.block_width/8;i++){
+                //fixed: each sub-block is 8 pixels wide, so the horizontal
+                //offset advances by 8 per iteration (was i*c.block_width,
+                //which skipped whole blocks)
+                int block_x = x + (i*8);
+                if(isColor){
+                    QP = QPptr[block_x>>qpHShift];
+                    c.nonBQP[2*i]= nonBQPptr[block_x>>qpHShift];
+                }else{
+                    QP= QPptr[block_x>>4];
+                    QP= (QP* QPCorrecture + 256*128)>>16;
+                    c.nonBQP[2*i]= nonBQPptr[block_x>>4];
+                    c.nonBQP[2*i]= (c.nonBQP[2*i]* QPCorrecture + 256*128)>>16;
+                    yHistogram[ srcBlock[srcStride*12 + 4] ]++;
+                }
+                //NOTE(review): the 2*i spacing in QP/nonBQP looks intended to
+                //leave room for the quadword expansion below -- confirm
+                c.QP[2*i]= QP;
+            }
+#if ARCH_X86
+            duplicate_byte_to_quadword(c.QP, c.QP);
+#endif
+
+            prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
+            prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
+            prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
+            prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
+            RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
+                              srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
+
+
+            deinterlace_filter(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
+
+            /* only deblock if we have 2 blocks */
+            //This check can be eliminated when the loop bounds are changed
+            if(y + 8 < height){
+                if(mode & V_X1_FILTER)
+                    RENAME(vertX1Filter)(dstBlock, stride, &c);
+                else if(mode & V_DEBLOCK){
+                    //Not sure how to convert this to simd, I was thinking vertClassify
+                    //would return a mask classifying multiple blocks, but even if it
+                    //does I'm not sure how to run the filters
+
+                    //I guess I could test the mask, and if it's not uniform
+                    //run both filters and choose which one to use for each block
+                    //based on the mask
+                    const int t= RENAME(vertClassify)(dstBlock, stride, &c);
+
+                    if(t==1)
+                        RENAME(doVertLowPass)(dstBlock, stride, &c);
+                    else if(t==2)
+                        RENAME(doVertDefFilter)(dstBlock, stride, &c);
+                }else if(mode & V_A_DEBLOCK){
+                    RENAME(do_a_deblock)(dstBlock, stride, 1, &c, mode);
+                }
+            }
+            //IDEA: do the first block outside of the main loop, so we can
+            //get rid of this test
+            /* check if we have a previous block to deblock it with dstBlock */
+            if(x - 8 >= 0){
+#if ARCH_X86
+//Transpose the block(s) filter them and transpose them back
+            RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
+            /* check if we have a previous block to deblock it with dstBlock */
+                if(mode & H_X1_FILTER)
+                        RENAME(vertX1Filter)(tempBlock1, 16, &c);
+                else if(mode & H_DEBLOCK){
+//START_TIMER
+                    const int t= RENAME(vertClassify)(tempBlock1, 16, &c);
+//STOP_TIMER("dc & minmax")
+                    if(t==1)
+                        RENAME(doVertLowPass)(tempBlock1, 16, &c);
+                    else if(t==2)
+                        RENAME(doVertDefFilter)(tempBlock1, 16, &c);
+                }else if(mode & H_A_DEBLOCK){
+                        RENAME(do_a_deblock)(tempBlock1, 16, 1, &c, mode);
+                }
+
+                RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
+#elif HAVE_ALTIVEC
+//Leave altivec stuff as is
+                if(mode & H_X1_FILTER)
+                    horizX1Filter(dstBlock-4, stride, QP);
+                else if(mode & H_DEBLOCK){
+                    DECLARE_ALIGNED(16, unsigned char, tempBlock)[272];
+                    int t;
+                    transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride);
+
+                    t = vertClassify_altivec(tempBlock-48, 16, &c);
+                    if(t==1) {
+                        doVertLowPass_altivec(tempBlock-48, 16, &c);
+                        transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
+                    }
+                    else if(t==2) {
+                        doVertDefFilter_altivec(tempBlock-48, 16, &c);
+                        transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
+                    }
+                }else if(mode & H_A_DEBLOCK){
+                    RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c, mode);
+                }
+#else
+//no need to transpose for C code
+                if(mode & H_X1_FILTER)
+                    horizX1Filter(dstBlock-4, stride, QP);
+                else if(mode & H_DEBLOCK){
+                    const int t= RENAME(horizClassify)(dstBlock-4, stride, &c);
+
+                    if(t==1)
+                        RENAME(doHorizLowPass)(dstBlock-4, stride, &c);
+                    else if(t==2)
+                        RENAME(doHorizDefFilter)(dstBlock-4, stride, &c);
+                }else if(mode & H_A_DEBLOCK){
+                    RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c, mode);
+                }
+#endif //ARCH_X86
+                if(mode & DERING){
+                //FIXME filter first line
+                    if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c);
+                }
+
+                if(mode & TEMP_NOISE_FILTER)
+                {
+                    RENAME(tempNoiseReducer)(dstBlock-8, stride,
+                            c.tempBlurred[isColor] + y*dstStride + x,
+                            c.tempBlurredPast[isColor] + (y>>3)*256 + (x>>3) + 256,
+                            c.ppMode.maxTmpNoise);
+                }
+            }
+
+            dstBlock+=c.block_width;
+            srcBlock+=c.block_width;
+
+#if ARCH_X86
+            tmpXchg= tempBlock1;
+            tempBlock1= tempBlock2;
+            tempBlock2 = tmpXchg;
+#endif
+        }
+//Code to clean up non-aligned blocks goes here
+
+        /* last-column cleanup: dstBlock now points one block past the row */
+        if(mode & DERING){
+            if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c);
+        }
+
+        if((mode & TEMP_NOISE_FILTER)){
+            RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
+                    c.tempBlurred[isColor] + y*dstStride + x,
+                    c.tempBlurredPast[isColor] + (y>>3)*256 + (x>>3) + 256,
+                    c.ppMode.maxTmpNoise);
+        }
+
+        //Again this test can be eliminated when we change the loop parameters
+        /* did we use a tmp buffer for the last lines*/
+        if(y+15 >= height){
+            uint8_t *dstBlock= &(dst[y*dstStride]);
+            if(width==FFABS(dstStride))
+                linecpy(dstBlock, tempDst + dstStride, height-y, dstStride);
+            else{
+                int i;
+                for(i=0; i<height-y; i++){
+                    memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width);
+                }
+            }
+        }
+    }
+//Code to deal with excess lines goes here
+
+//fixed: TEMPLATE_PP_* macros are never defined in this file; use the same
+//HAVE_* macros as the RENAME selection above
+#if   HAVE_AMD3DNOW
+    __asm__ volatile("femms");
+#elif HAVE_MMX
+    __asm__ volatile("emms");
+#endif
+
+#ifdef DEBUG_BRIGHTNESS
+    if(!isColor){
+        int max=1;
+        int i;
+        for(i=0; i<256; i++)
+            if(yHistogram[i] > max) max=yHistogram[i];
+
+        for(i=1; i<256; i++){
+            int x;
+            int start=yHistogram[i-1]/(max/256+1);
+            int end=yHistogram[i]/(max/256+1);
+            int inc= end > start ? 1 : -1;
+            for(x=start; x!=end+inc; x+=inc)
+                dst[ i*dstStride + x]+=128;
+        }
+
+        for(i=0; i<100; i+=2){
+            dst[ (white)*dstStride + i]+=128;
+            dst[ (black)*dstStride + i]+=128;
+        }
+    }
+#endif
+
+    *c2= c; //copy local context back
+
+}
+#undef RENAME


More information about the ffmpeg-devel mailing list